[SM4] try arm64 gcmSm4Enc gcmSm4Dec

Emman 2022-01-18 15:58:14 +08:00
parent 067a12cb20
commit 129803a389
3 changed files with 653 additions and 2 deletions


@@ -463,3 +463,648 @@ dataBail:
#undef autLen
#undef H0
#undef pTblSave
// func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4Enc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define rk R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define H0 R9
#define H1 R10
#define pTblSave R11
#define rkSave R12
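// The two macros below fold one 16-byte block into the GHASH state with a
// Karatsuba-style multiply: VPMULL/VPMULL2 give the low/high halves of the
// product with the preloaded power of H in T1, VPMULL against T2 gives the
// middle term, and reduce() (defined earlier in this file) folds the result
// back modulo the GHASH polynomial in POLY. mulRoundSingle additionally
// byte-swaps its input with VREV64 first.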
#define mulRoundSingleWithoutRev(X) \
VEOR ACC0.B16, X.B16, X.B16 \
VEXT $8, X.B16, X.B16, T0.B16 \
VEOR X.B16, T0.B16, T0.B16 \
VPMULL X.D1, T1.D1, ACC1.Q1 \
VPMULL2 X.D2, T1.D2, ACC0.Q1 \
VPMULL T0.D1, T2.D1, ACCM.Q1 \
reduce() \
#define mulRoundSingle(X) \
VREV64 X.B16, X.B16 \
mulRoundSingleWithoutRev(X) \
MOVD productTable+0(FP), pTbl
MOVD dst+8(FP), dstPtr
MOVD src_base+32(FP), srcPtr
MOVD src_len+40(FP), srcPtrLen
MOVD ctr+56(FP), ctrPtr
MOVD T+64(FP), tPtr
MOVD rk_base+72(FP), rk
MOVD $0xC2, H1
LSL $56, H1
MOVD $1, H0
VMOV H1, POLY.D[0]
VMOV H0, POLY.D[1]
VEOR ZERO.B16, ZERO.B16, ZERO.B16
MOVD pTbl, pTblSave
MOVD rk, rkSave
// Current tag, after AAD
VLD1 (tPtr), [ACC0.B16]
VEOR ACC1.B16, ACC1.B16, ACC1.B16
VEOR ACCM.B16, ACCM.B16, ACCM.B16
// Prepare initial counter, and the increment vector
VLD1 (ctrPtr), [CTR.B16]
VEOR INC.B16, INC.B16, INC.B16
MOVD $1, H0
VMOV H0, INC.S[3]
VREV32 CTR.B16, CTR.B16
VADD CTR.S4, INC.S4, CTR.S4
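// The counter words are byte-swapped (VREV32) so the 32-bit counter lane can
// be bumped with a plain VADD of INC (which holds 1 in lane S[3]); each
// block's counter is swapped back to GCM byte order just before encryption.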
// Skip to <8 blocks loop
CMP $128, srcPtrLen
LOAD_SM4_AESNI_CONSTS()
BLT encNibblesLoop
// There are at least 8 blocks to encrypt
encOctetsLoop:
SUB $128, srcPtrLen
// Prepare 8 counters
VMOV CTR.B16, B0.B16
VADD B0.S4, INC.S4, B1.S4
VREV32 B0.B16, B0.B16
VADD B1.S4, INC.S4, B2.S4
VREV32 B1.B16, B1.B16
VADD B2.S4, INC.S4, B3.S4
VREV32 B2.B16, B2.B16
VADD B3.S4, INC.S4, B4.S4
VREV32 B3.B16, B3.B16
VADD B4.S4, INC.S4, B5.S4
VREV32 B4.B16, B4.B16
VADD B5.S4, INC.S4, B6.S4
VREV32 B5.B16, B5.B16
VADD B6.S4, INC.S4, B7.S4
VREV32 B6.B16, B6.B16
VADD B7.S4, INC.S4, CTR.S4
VREV32 B7.B16, B7.B16
// encrypt the first 4 counter blocks
EOR R13, R13
MOVD rkSave, rk
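// R13 counts groups of 4 SM4_ROUNDs; the loop runs 8 times, i.e. all 32 SM4
// rounds, encrypting counter blocks B0-B3 together. The second loop below
// does the same for B4-B7 with R13 running on to 16.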
encOctetsEnc4Blocks1:
SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
BNE encOctetsEnc4Blocks1
VREV32 B0.B16, B0.B16
VREV32 B1.B16, B1.B16
VREV32 B2.B16, B2.B16
VREV32 B3.B16, B3.B16
// encrypt the second 4 counter blocks
MOVD rkSave, rk
encOctetsEnc4Blocks2:
SM4_ROUND(rk, K0, K1, K2, B4, B5, B6, B7)
SM4_ROUND(rk, K0, K1, K2, B5, B6, B7, B4)
SM4_ROUND(rk, K0, K1, K2, B6, B7, B4, B5)
SM4_ROUND(rk, K0, K1, K2, B7, B4, B5, B6)
ADD $1, R13
CMP $16, R13
BNE encOctetsEnc4Blocks2
VREV32 B4.B16, B4.B16
VREV32 B5.B16, B5.B16
VREV32 B6.B16, B6.B16
VREV32 B7.B16, B7.B16
// XOR plaintext and store ciphertext
VLD1.P 32(srcPtr), [T1.B16, T2.B16]
VEOR B0.B16, T1.B16, B0.B16
VEOR B1.B16, T2.B16, B1.B16
VST1.P [B0.B16, B1.B16], 32(dstPtr)
VLD1.P 32(srcPtr), [T1.B16, T2.B16]
VEOR B2.B16, T1.B16, B2.B16
VEOR B3.B16, T2.B16, B3.B16
VST1.P [B2.B16, B3.B16], 32(dstPtr)
VLD1.P 32(srcPtr), [T1.B16, T2.B16]
VEOR B4.B16, T1.B16, B4.B16
VEOR B5.B16, T2.B16, B5.B16
VST1.P [B4.B16, B5.B16], 32(dstPtr)
VLD1.P 32(srcPtr), [T1.B16, T2.B16]
VEOR B6.B16, T1.B16, B6.B16
VEOR B7.B16, T2.B16, B7.B16
VST1.P [B6.B16, B7.B16], 32(dstPtr)
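// Fold the 8 ciphertext blocks into GHASH: the running tag in ACC0 is XORed
// into the first block, each block is multiplied by the matching power of H
// (loaded pairwise from pTbl together with its Karatsuba precomputation),
// and a single reduction is performed at the end.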
VLD1.P 32(pTbl), [T1.B16, T2.B16]
VREV64 B0.B16, B0.B16
VEOR ACC0.B16, B0.B16, B0.B16
VEXT $8, B0.B16, B0.B16, T0.B16
VEOR B0.B16, T0.B16, T0.B16
VPMULL B0.D1, T1.D1, ACC1.Q1
VPMULL2 B0.D2, T1.D2, ACC0.Q1
VPMULL T0.D1, T2.D1, ACCM.Q1
mulRound(B1)
mulRound(B2)
mulRound(B3)
mulRound(B4)
mulRound(B5)
mulRound(B6)
mulRound(B7)
MOVD pTblSave, pTbl
reduce()
CMP $128, srcPtrLen
BGE encOctetsLoop
encNibblesLoop:
CBZ srcPtrLen, encDone
ADD $14*16, pTbl
// Preload H and its Karatsuba precomp
VLD1.P (pTbl), [T1.B16, T2.B16]
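// Assuming the layout implied by the 8-block loop above (H^8 stored first
// down to H^1 last), the $14*16 offset selects the H^1 pair used for
// single-block multiplies.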
CMP $64, srcPtrLen
BLT encStartSingles
SUB $64, srcPtrLen
// Prepare 4 counters
VMOV CTR.B16, B0.B16
VADD B0.S4, INC.S4, B1.S4
VREV32 B0.B16, B0.B16
VADD B1.S4, INC.S4, B2.S4
VREV32 B1.B16, B1.B16
VADD B2.S4, INC.S4, B3.S4
VREV32 B2.B16, B2.B16
VADD B3.S4, INC.S4, CTR.S4 // advance CTR past the 4 counters consumed here so encStartSingles does not reuse them
VREV32 B3.B16, B3.B16
// encrypt the 4 counter blocks
EOR R13, R13
MOVD rkSave, rk
encNibblesEnc4Blocks:
SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
BNE encNibblesEnc4Blocks
VREV32 B0.B16, B0.B16
VREV32 B1.B16, B1.B16
VREV32 B2.B16, B2.B16
VREV32 B3.B16, B3.B16
// XOR plaintext and store ciphertext
VLD1.P 32(srcPtr), [K1.B16, K2.B16]
VEOR B0.B16, K1.B16, B0.B16
VEOR B1.B16, K2.B16, B1.B16
VST1.P [B0.B16, B1.B16], 32(dstPtr)
VLD1.P 32(srcPtr), [K1.B16, K2.B16]
VEOR B2.B16, K1.B16, B2.B16
VEOR B3.B16, K2.B16, B3.B16
VST1.P [B2.B16, B3.B16], 32(dstPtr)
mulRoundSingle(B0)
mulRoundSingle(B1)
mulRoundSingle(B2)
mulRoundSingle(B3)
encStartSingles:
CBZ srcPtrLen, encDone
// Prepare 4 counters
VMOV CTR.B16, B0.B16
VADD B0.S4, INC.S4, B1.S4
VREV32 B0.B16, B0.B16
VADD B1.S4, INC.S4, B2.S4
VREV32 B1.B16, B1.B16
VADD B2.S4, INC.S4, B3.S4
VREV32 B2.B16, B2.B16
VADD B3.S4, INC.S4, B4.S4
VREV32 B3.B16, B3.B16
// encrypt the 4 counter blocks
EOR R13, R13
MOVD rkSave, rk
encSinglesEnc4Blocks:
SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
BNE encSinglesEnc4Blocks
VREV32 B0.B16, B0.B16
VREV32 B1.B16, B1.B16
VREV32 B2.B16, B2.B16
VREV32 B3.B16, B3.B16
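// All four keystream blocks are ready; they are consumed one 16-byte block
// at a time below, branching to encTail as soon as fewer than 16 bytes remain.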
VMOV B0.B16, K0.B16
CMP $16, srcPtrLen
BLT encTail
SUB $16, srcPtrLen
VLD1.P 16(srcPtr), [K1.B16]
VEOR K0.B16, K1.B16, K0.B16
VST1.P [K0.B16], 16(dstPtr)
mulRoundSingle(K0)
VMOV B1.B16, K0.B16
CMP $16, srcPtrLen
BLT encTail
SUB $16, srcPtrLen
VLD1.P 16(srcPtr), [K1.B16]
VEOR K0.B16, K1.B16, K0.B16
VST1.P [K0.B16], 16(dstPtr)
mulRoundSingle(K0)
VMOV B2.B16, K0.B16
CMP $16, srcPtrLen
BLT encTail
SUB $16, srcPtrLen
VLD1.P 16(srcPtr), [K1.B16]
VEOR K0.B16, K1.B16, K0.B16
VST1.P [K0.B16], 16(dstPtr)
mulRoundSingle(K0)
VMOV B3.B16, K0.B16
CMP $16, srcPtrLen
BLT encTail
SUB $16, srcPtrLen
VLD1.P 16(srcPtr), [K1.B16]
VEOR K0.B16, K1.B16, K0.B16
VST1.P [K0.B16], 16(dstPtr)
mulRoundSingle(K0)
encTail:
CBZ srcPtrLen, encDone
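// Final 1-15 bytes: K0 holds the next unused keystream block. The remaining
// plaintext bytes are gathered into T0 (8/4/2/1 bytes at a time, driven by
// the bits of srcPtrLen) while T3 accumulates an all-ones mask of the same
// width, so the XOR below yields the partial ciphertext and the VAND limits
// the GHASH input to the valid bytes.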
VEOR T0.B16, T0.B16, T0.B16
VEOR T3.B16, T3.B16, T3.B16
MOVD $0, H1
SUB $1, H1
ADD srcPtrLen, srcPtr
TBZ $3, srcPtrLen, ld4
MOVD.W -8(srcPtr), H0
VMOV H0, T0.D[0]
VMOV H1, T3.D[0]
ld4:
TBZ $2, srcPtrLen, ld2
MOVW.W -4(srcPtr), H0
VEXT $12, T0.B16, ZERO.B16, T0.B16
VEXT $12, T3.B16, ZERO.B16, T3.B16
VMOV H0, T0.S[0]
VMOV H1, T3.S[0]
ld2:
TBZ $1, srcPtrLen, ld1
MOVH.W -2(srcPtr), H0
VEXT $14, T0.B16, ZERO.B16, T0.B16
VEXT $14, T3.B16, ZERO.B16, T3.B16
VMOV H0, T0.H[0]
VMOV H1, T3.H[0]
ld1:
TBZ $0, srcPtrLen, ld0
MOVB.W -1(srcPtr), H0
VEXT $15, T0.B16, ZERO.B16, T0.B16
VEXT $15, T3.B16, ZERO.B16, T3.B16
VMOV H0, T0.B[0]
VMOV H1, T3.B[0]
ld0:
MOVD ZR, srcPtrLen
VEOR T0.B16, K0.B16, K0.B16
VAND T3.B16, K0.B16, K0.B16
VST1.P [K0.B16], 16(dstPtr)
mulRoundSingle(K0)
encDone:
VST1 [ACC0.B16], (tPtr)
RET
// func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4Dec(SB),NOSPLIT,$0
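// The register aliases and the mulRoundSingle* macros defined for gcmSm4Enc
// above are reused here.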
MOVD productTable+0(FP), pTbl
MOVD dst+8(FP), dstPtr
MOVD src_base+32(FP), srcPtr
MOVD src_len+40(FP), srcPtrLen
MOVD ctr+56(FP), ctrPtr
MOVD T+64(FP), tPtr
MOVD rk_base+72(FP), rk
MOVD $0xC2, H1
LSL $56, H1
MOVD $1, H0
VMOV H1, POLY.D[0]
VMOV H0, POLY.D[1]
VEOR ZERO.B16, ZERO.B16, ZERO.B16
MOVD pTbl, pTblSave
MOVD rk, rkSave
// Current tag, after AAD
VLD1 (tPtr), [ACC0.B16]
VEOR ACC1.B16, ACC1.B16, ACC1.B16
VEOR ACCM.B16, ACCM.B16, ACCM.B16
// Prepare initial counter, and the increment vector
VLD1 (ctrPtr), [CTR.B16]
VEOR INC.B16, INC.B16, INC.B16
MOVD $1, H0
VMOV H0, INC.S[3]
VREV32 CTR.B16, CTR.B16
VADD CTR.S4, INC.S4, CTR.S4
// Skip to <8 blocks loop
CMP $128, srcPtrLen
LOAD_SM4_AESNI_CONSTS()
BLT decNibblesLoop
// There are at least 8 blocks to decrypt
decOctetsLoop:
SUB $128, srcPtrLen
VMOV CTR.B16, B0.B16
VADD B0.S4, INC.S4, B1.S4
VREV32 B0.B16, B0.B16
VADD B1.S4, INC.S4, B2.S4
VREV32 B1.B16, B1.B16
VADD B2.S4, INC.S4, B3.S4
VREV32 B2.B16, B2.B16
VADD B3.S4, INC.S4, B4.S4
VREV32 B3.B16, B3.B16
VADD B4.S4, INC.S4, B5.S4
VREV32 B4.B16, B4.B16
VADD B5.S4, INC.S4, B6.S4
VREV32 B5.B16, B5.B16
VADD B6.S4, INC.S4, B7.S4
VREV32 B6.B16, B6.B16
VADD B7.S4, INC.S4, CTR.S4
VREV32 B7.B16, B7.B16
// encrypt the first 4 counter blocks
EOR R13, R13
MOVD rkSave, rk
decOctetsEnc4Blocks1:
SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
BNE decOctetsEnc4Blocks1
VREV32 B0.B16, T1.B16
VREV32 B1.B16, T2.B16
VREV32 B2.B16, B2.B16
VREV32 B3.B16, B3.B16
// encrypt the second 4 counter blocks
MOVD rkSave, rk
decOctetsEnc4Blocks2:
SM4_ROUND(rk, K0, K1, K2, B4, B5, B6, B7)
SM4_ROUND(rk, K0, K1, K2, B5, B6, B7, B4)
SM4_ROUND(rk, K0, K1, K2, B6, B7, B4, B5)
SM4_ROUND(rk, K0, K1, K2, B7, B4, B5, B6)
ADD $1, R13
CMP $16, R13
BNE decOctetsEnc4Blocks2
VREV32 B4.B16, B4.B16
VREV32 B5.B16, B5.B16
VREV32 B6.B16, B6.B16
VREV32 B7.B16, B7.B16
VLD1.P 32(srcPtr), [B0.B16, B1.B16]
VEOR B0.B16, T1.B16, T1.B16
VEOR B1.B16, T2.B16, T2.B16
VST1.P [T1.B16, T2.B16], 32(dstPtr)
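// For decryption GHASH runs over the ciphertext: the keystream was parked in
// T1/T2 by the VREV32 moves above, so the ciphertext just loaded stays in
// B0/B1 for the multiplies below. The same pattern repeats for the remaining
// block pairs.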
VLD1.P 32(pTbl), [T1.B16, T2.B16]
VREV64 B0.B16, B0.B16
VEOR ACC0.B16, B0.B16, B0.B16
VEXT $8, B0.B16, B0.B16, T0.B16
VEOR B0.B16, T0.B16, T0.B16
VPMULL B0.D1, T1.D1, ACC1.Q1
VPMULL2 B0.D2, T1.D2, ACC0.Q1
VPMULL T0.D1, T2.D1, ACCM.Q1
mulRound(B1)
VLD1.P 32(srcPtr), [B0.B16, B1.B16]
VEOR B2.B16, B0.B16, T1.B16
VEOR B3.B16, B1.B16, T2.B16
VST1.P [T1.B16, T2.B16], 32(dstPtr)
mulRound(B0)
mulRound(B1)
VLD1.P 32(srcPtr), [B0.B16, B1.B16]
VEOR B4.B16, B0.B16, T1.B16
VEOR B5.B16, B1.B16, T2.B16
VST1.P [T1.B16, T2.B16], 32(dstPtr)
mulRound(B0)
mulRound(B1)
VLD1.P 32(srcPtr), [B0.B16, B1.B16]
VEOR B6.B16, B0.B16, T1.B16
VEOR B7.B16, B1.B16, T2.B16
VST1.P [T1.B16, T2.B16], 32(dstPtr)
mulRound(B0)
mulRound(B1)
MOVD pTblSave, pTbl
reduce()
CMP $128, srcPtrLen
BGE decOctetsLoop
decNibblesLoop:
CBZ srcPtrLen, decDone
ADD $14*16, pTbl
// Preload H and its Karatsuba precomp
VLD1.P (pTbl), [T1.B16, T2.B16]
CMP $64, srcPtrLen
BLT decStartSingles
SUB $64, srcPtrLen
// Prepare 4 counters
VMOV CTR.B16, B0.B16
VADD B0.S4, INC.S4, B1.S4
VREV32 B0.B16, B0.B16
VADD B1.S4, INC.S4, B2.S4
VREV32 B1.B16, B1.B16
VADD B2.S4, INC.S4, B3.S4
VREV32 B2.B16, B2.B16
VADD B3.S4, INC.S4, CTR.S4 // advance CTR past the 4 counters consumed here so decStartSingles does not reuse them
VREV32 B3.B16, B3.B16
// encrypt the 4 counter blocks
EOR R13, R13
MOVD rkSave, rk
decNibblesEnc4Blocks:
SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
BNE decNibblesEnc4Blocks
VREV32 B0.B16, B0.B16
VREV32 B1.B16, B1.B16
VREV32 B2.B16, B2.B16
VREV32 B3.B16, B3.B16
// XOR ciphertext and store plaintext
VLD1.P 32(srcPtr), [K1.B16, K2.B16]
VREV64 K1.B16, B4.B16
VREV64 K2.B16, B5.B16
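// B4/B5 (and B6/B7 below) keep byte-reversed copies of the ciphertext so
// GHASH can still be updated after the plaintext overwrites K1/K2.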
VEOR B0.B16, K1.B16, B0.B16
VEOR B1.B16, K2.B16, B1.B16
VST1.P [B0.B16, B1.B16], 32(dstPtr)
VLD1.P 32(srcPtr), [K1.B16, K2.B16]
VREV64 K1.B16, B6.B16
VREV64 K2.B16, B7.B16
VEOR B2.B16, K1.B16, B2.B16
VEOR B3.B16, K2.B16, B3.B16
VST1.P [B2.B16, B3.B16], 32(dstPtr)
mulRoundSingleWithoutRev(B4)
mulRoundSingleWithoutRev(B5)
mulRoundSingleWithoutRev(B6)
mulRoundSingleWithoutRev(B7)
decStartSingles:
CBZ srcPtrLen, decDone
// Prepare 4 counters
VMOV CTR.B16, B0.B16
VADD B0.S4, INC.S4, B1.S4
VREV32 B0.B16, B0.B16
VADD B1.S4, INC.S4, B2.S4
VREV32 B1.B16, B1.B16
VADD B2.S4, INC.S4, B3.S4
VREV32 B2.B16, B2.B16
VADD B3.S4, INC.S4, B4.S4
VREV32 B3.B16, B3.B16
// encrypt the 4 counter blocks
EOR R13, R13
MOVD rkSave, rk
decSinglesEnc4Blocks:
SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
BNE decSinglesEnc4Blocks
VREV32 B0.B16, B0.B16
VREV32 B1.B16, B1.B16
VREV32 B2.B16, B2.B16
VREV32 B3.B16, B3.B16
VMOV B0.B16, K0.B16
CMP $16, srcPtrLen
BLT decTail
SUB $16, srcPtrLen
VLD1.P 16(srcPtr), [K1.B16]
VREV64 K1.B16, B5.B16
VEOR K0.B16, K1.B16, K0.B16
VST1.P [K0.B16], 16(dstPtr)
mulRoundSingleWithoutRev(B5)
VMOV B1.B16, K0.B16
CMP $16, srcPtrLen
BLT decTail
SUB $16, srcPtrLen
VLD1.P 16(srcPtr), [K1.B16]
VREV64 K1.B16, B5.B16
VEOR K0.B16, K1.B16, K0.B16
VST1.P [K0.B16], 16(dstPtr)
mulRoundSingleWithoutRev(B5)
VMOV B2.B16, K0.B16
CMP $16, srcPtrLen
BLT decTail
SUB $16, srcPtrLen
VLD1.P 16(srcPtr), [K1.B16]
VREV64 K1.B16, B5.B16
VEOR K0.B16, K1.B16, K0.B16
VST1.P [K0.B16], 16(dstPtr)
mulRoundSingleWithoutRev(B5)
VMOV B3.B16, K0.B16
CMP $16, srcPtrLen
BLT decTail
SUB $16, srcPtrLen
VLD1.P 16(srcPtr), [K1.B16]
VREV64 K1.B16, B5.B16
VEOR K0.B16, K1.B16, K0.B16
VST1.P [K0.B16], 16(dstPtr)
mulRoundSingleWithoutRev(B5)
decTail:
CBZ srcPtrLen, decDone
// Assuming it is safe to load a full block past the end of src due to the presence of the tag
VLD1 (srcPtr), [B5.B16]
VEOR B5.B16, K0.B16, B0.B16
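// B0 now holds the decrypted block; it is written out 8/4/2/1 bytes at a time
// according to the bits of srcPtrLen, while T3 builds a mask so that only the
// valid ciphertext bytes in B5 enter the final GHASH multiply.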
VEOR T3.B16, T3.B16, T3.B16
MOVD $0, H1
SUB $1, H1
TBZ $3, srcPtrLen, decLd4
VMOV B0.D[0], H0
MOVD.P H0, 8(dstPtr)
VMOV H1, T3.D[0]
VEXT $8, ZERO.B16, B0.B16, B0.B16
decLd4:
TBZ $2, srcPtrLen, decLd2
VMOV B0.S[0], H0
MOVW.P H0, 4(dstPtr)
VEXT $12, T3.B16, ZERO.B16, T3.B16
VMOV H1, T3.S[0]
VEXT $4, ZERO.B16, B0.B16, B0.B16
decLd2:
TBZ $1, srcPtrLen, decLd1
VMOV B0.H[0], H0
MOVH.P H0, 2(dstPtr)
VEXT $14, T3.B16, ZERO.B16, T3.B16
VMOV H1, T3.H[0]
VEXT $2, ZERO.B16, B0.B16, B0.B16
decLd1:
TBZ $0, srcPtrLen, decLd0
VMOV B0.B[0], H0
MOVB.P H0, 1(dstPtr)
VEXT $15, T3.B16, ZERO.B16, T3.B16
VMOV H1, T3.B[0]
decLd0:
VAND T3.B16, B5.B16, B5.B16
VREV64 B5.B16, B5.B16
VEOR ACC0.B16, B5.B16, B5.B16
VEXT $8, B5.B16, B5.B16, T0.B16
VEOR B5.B16, T0.B16, T0.B16
VPMULL B5.D1, T1.D1, ACC1.Q1
VPMULL2 B5.D2, T1.D2, ACC0.Q1
VPMULL T0.D1, T2.D1, ACCM.Q1
reduce()
decDone:
VST1 [ACC0.B16], (tPtr)
RET


@@ -1,5 +1,5 @@
//go:build amd64 || arm64
// +build amd64 arm64

package sm4


@@ -23,6 +23,12 @@ var _ gcmAble = (*sm4CipherGCM)(nil)
//go:noescape
func gcmSm4Init(productTable *[256]byte, rk []uint32)
//go:noescape
func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
//go:noescape
func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
//go:noescape
func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)