[SM4] try arm64 gcmSm4Enc gcmSm4Dec
This commit is contained in:
parent 067a12cb20
commit 129803a389

sm4/gcm_arm64.s: 645 lines

@@ -463,3 +463,648 @@ dataBail:
#undef autLen
#undef H0
#undef pTblSave

// func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4Enc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define rk R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define H0 R9
#define H1 R10
#define pTblSave R11
#define rkSave R12
#define mulRoundSingleWithoutRev(X) \
    VEOR ACC0.B16, X.B16, X.B16 \
    VEXT $8, X.B16, X.B16, T0.B16 \
    VEOR X.B16, T0.B16, T0.B16 \
    VPMULL X.D1, T1.D1, ACC1.Q1 \
    VPMULL2 X.D2, T1.D2, ACC0.Q1 \
    VPMULL T0.D1, T2.D1, ACCM.Q1 \
    reduce() \

#define mulRoundSingle(X) \
    VREV64 X.B16, X.B16 \
    mulRoundSingleWithoutRev(X) \
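
    // Arguments are read straight from the Go ABI0 frame: productTable at +0,
    // the dst slice base at +8, src base/len at +32/+40, ctr at +56, T at +64
    // and the rk slice base at +72. dst's length is not loaded, presumably
    // because the caller guarantees len(dst) >= len(src).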
    MOVD productTable+0(FP), pTbl
    MOVD dst+8(FP), dstPtr
    MOVD src_base+32(FP), srcPtr
    MOVD src_len+40(FP), srcPtrLen
    MOVD ctr+56(FP), ctrPtr
    MOVD T+64(FP), tPtr
    MOVD rk_base+72(FP), rk

    MOVD $0xC2, H1
    LSL $56, H1
    MOVD $1, H0
    VMOV H1, POLY.D[0]
    VMOV H0, POLY.D[1]
    VEOR ZERO.B16, ZERO.B16, ZERO.B16

    MOVD pTbl, pTblSave
    MOVD rk, rkSave
    // Current tag, after AAD
    VLD1 (tPtr), [ACC0.B16]
    VEOR ACC1.B16, ACC1.B16, ACC1.B16
    VEOR ACCM.B16, ACCM.B16, ACCM.B16
    // Prepare initial counter, and the increment vector
    VLD1 (ctrPtr), [CTR.B16]
    VEOR INC.B16, INC.B16, INC.B16
    MOVD $1, H0
    VMOV H0, INC.S[3]
    VREV32 CTR.B16, CTR.B16
    VADD CTR.S4, INC.S4, CTR.S4
    // Skip to <8 blocks loop
    CMP $128, srcPtrLen

    LOAD_SM4_AESNI_CONSTS()

    BLT encNibblesLoop
    // There are at least 8 blocks to encrypt
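
    // The counter block is kept byte-swapped (VREV32) so that the 32-bit
    // big-endian GCM counter in its last word can be advanced with a plain
    // vector add of INC = {0, 0, 0, 1}; each block is swapped back to
    // big-endian form just before it is run through SM4.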
encOctetsLoop:
    SUB $128, srcPtrLen
    // Prepare 8 counters
    VMOV CTR.B16, B0.B16
    VADD B0.S4, INC.S4, B1.S4
    VREV32 B0.B16, B0.B16
    VADD B1.S4, INC.S4, B2.S4
    VREV32 B1.B16, B1.B16
    VADD B2.S4, INC.S4, B3.S4
    VREV32 B2.B16, B2.B16
    VADD B3.S4, INC.S4, B4.S4
    VREV32 B3.B16, B3.B16
    VADD B4.S4, INC.S4, B5.S4
    VREV32 B4.B16, B4.B16
    VADD B5.S4, INC.S4, B6.S4
    VREV32 B5.B16, B5.B16
    VADD B6.S4, INC.S4, B7.S4
    VREV32 B6.B16, B6.B16
    VADD B7.S4, INC.S4, CTR.S4
    VREV32 B7.B16, B7.B16

    // Encrypt the first 4 counter blocks
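    // Each SM4_ROUND consumes one round key; four calls per iteration and
    // eight iterations of R13 give every block the full 32 SM4 rounds. The
    // second loop below keeps counting R13 from 8 to 16 to apply the same
    // 32 rounds to B4-B7.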
    EOR R13, R13
    MOVD rkSave, rk

encOctetsEnc4Blocks1:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
    BNE encOctetsEnc4Blocks1
    VREV32 B0.B16, B0.B16
    VREV32 B1.B16, B1.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16

    // Encrypt the second 4 counter blocks
    MOVD rkSave, rk

encOctetsEnc4Blocks2:
    SM4_ROUND(rk, K0, K1, K2, B4, B5, B6, B7)
    SM4_ROUND(rk, K0, K1, K2, B5, B6, B7, B4)
    SM4_ROUND(rk, K0, K1, K2, B6, B7, B4, B5)
    SM4_ROUND(rk, K0, K1, K2, B7, B4, B5, B6)

    ADD $1, R13
    CMP $16, R13
    BNE encOctetsEnc4Blocks2
    VREV32 B4.B16, B4.B16
    VREV32 B5.B16, B5.B16
    VREV32 B6.B16, B6.B16
    VREV32 B7.B16, B7.B16

    // XOR plaintext and store ciphertext
    VLD1.P 32(srcPtr), [T1.B16, T2.B16]
    VEOR B0.B16, T1.B16, B0.B16
    VEOR B1.B16, T2.B16, B1.B16
    VST1.P [B0.B16, B1.B16], 32(dstPtr)
    VLD1.P 32(srcPtr), [T1.B16, T2.B16]
    VEOR B2.B16, T1.B16, B2.B16
    VEOR B3.B16, T2.B16, B3.B16
    VST1.P [B2.B16, B3.B16], 32(dstPtr)
    VLD1.P 32(srcPtr), [T1.B16, T2.B16]
    VEOR B4.B16, T1.B16, B4.B16
    VEOR B5.B16, T2.B16, B5.B16
    VST1.P [B4.B16, B5.B16], 32(dstPtr)
    VLD1.P 32(srcPtr), [T1.B16, T2.B16]
    VEOR B6.B16, T1.B16, B6.B16
    VEOR B7.B16, T2.B16, B7.B16
    VST1.P [B6.B16, B7.B16], 32(dstPtr)
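
    // GHASH the eight ciphertext blocks: the running tag in ACC0 is folded into
    // the first block, and each mulRound pulls the next precomputed power of H
    // (plus its Karatsuba fold) from productTable, accumulating unreduced
    // carry-less products in ACC0/ACC1/ACCM; reduce() then reduces once modulo
    // the GCM polynomial. The result is equivalent to the serial
    // Y = (Y xor C[i]) * H applied block by block.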
    VLD1.P 32(pTbl), [T1.B16, T2.B16]
    VREV64 B0.B16, B0.B16
    VEOR ACC0.B16, B0.B16, B0.B16
    VEXT $8, B0.B16, B0.B16, T0.B16
    VEOR B0.B16, T0.B16, T0.B16
    VPMULL B0.D1, T1.D1, ACC1.Q1
    VPMULL2 B0.D2, T1.D2, ACC0.Q1
    VPMULL T0.D1, T2.D1, ACCM.Q1

    mulRound(B1)
    mulRound(B2)
    mulRound(B3)
    mulRound(B4)
    mulRound(B5)
    mulRound(B6)
    mulRound(B7)
    MOVD pTblSave, pTbl
    reduce()

    CMP $128, srcPtrLen
    BGE encOctetsLoop
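
    // Fewer than 128 bytes remain: one 4-block pass handles 64..127 bytes,
    // encStartSingles encrypts the remaining whole blocks one at a time, and
    // encTail builds the final partial block.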
encNibblesLoop:
    CBZ srcPtrLen, encDone
    ADD $14*16, pTbl
    // Preload H and its Karatsuba precomp
    VLD1.P (pTbl), [T1.B16, T2.B16]

    CMP $64, srcPtrLen
    BLT encStartSingles
    SUB $64, srcPtrLen

    // Prepare 4 counters
    VMOV CTR.B16, B0.B16
    VADD B0.S4, INC.S4, B1.S4
    VREV32 B0.B16, B0.B16
    VADD B1.S4, INC.S4, B2.S4
    VREV32 B1.B16, B1.B16
    VADD B2.S4, INC.S4, B3.S4
    VREV32 B2.B16, B2.B16
    VADD B3.S4, INC.S4, B4.S4
    VREV32 B3.B16, B3.B16

    // Encrypt the 4 counter blocks
    EOR R13, R13
    MOVD rkSave, rk

encNibblesEnc4Blocks:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
    BNE encNibblesEnc4Blocks
    VREV32 B0.B16, B0.B16
    VREV32 B1.B16, B1.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16

    // XOR plaintext and store ciphertext
    VLD1.P 32(srcPtr), [K1.B16, K2.B16]
    VEOR B0.B16, K1.B16, B0.B16
    VEOR B1.B16, K2.B16, B1.B16
    VST1.P [B0.B16, B1.B16], 32(dstPtr)
    VLD1.P 32(srcPtr), [K1.B16, K2.B16]
    VEOR B2.B16, K1.B16, B2.B16
    VEOR B3.B16, K2.B16, B3.B16
    VST1.P [B2.B16, B3.B16], 32(dstPtr)

    mulRoundSingle(B0)
    mulRoundSingle(B1)
    mulRoundSingle(B2)
    mulRoundSingle(B3)

encStartSingles:
    CBZ srcPtrLen, encDone

    // Prepare 4 counters
    VMOV CTR.B16, B0.B16
    VADD B0.S4, INC.S4, B1.S4
    VREV32 B0.B16, B0.B16
    VADD B1.S4, INC.S4, B2.S4
    VREV32 B1.B16, B1.B16
    VADD B2.S4, INC.S4, B3.S4
    VREV32 B2.B16, B2.B16
    VADD B3.S4, INC.S4, B4.S4
    VREV32 B3.B16, B3.B16

    // Encrypt the 4 counter blocks
    EOR R13, R13
    MOVD rkSave, rk

encSinglesEnc4Blocks:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
    BNE encSinglesEnc4Blocks
    VREV32 B0.B16, B0.B16
    VREV32 B1.B16, B1.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16

    VMOV B0.B16, K0.B16
    CMP $16, srcPtrLen
    BLT encTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingle(K0)

    VMOV B1.B16, K0.B16
    CMP $16, srcPtrLen
    BLT encTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingle(K0)

    VMOV B2.B16, K0.B16
    CMP $16, srcPtrLen
    BLT encTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingle(K0)

    VMOV B3.B16, K0.B16
    CMP $16, srcPtrLen
    BLT encTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingle(K0)
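
    // encTail assembles the final partial block: the remaining 1..15 source
    // bytes are gathered into T0 (tested bit by bit with TBZ on srcPtrLen)
    // while T3 collects a matching byte mask; the block is then XORed with the
    // keystream in K0, masked, written out and folded into GHASH.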
encTail:
    CBZ srcPtrLen, encDone
    VEOR T0.B16, T0.B16, T0.B16
    VEOR T3.B16, T3.B16, T3.B16
    MOVD $0, H1
    SUB $1, H1
    ADD srcPtrLen, srcPtr

    TBZ $3, srcPtrLen, ld4
    MOVD.W -8(srcPtr), H0
    VMOV H0, T0.D[0]
    VMOV H1, T3.D[0]
ld4:
    TBZ $2, srcPtrLen, ld2
    MOVW.W -4(srcPtr), H0
    VEXT $12, T0.B16, ZERO.B16, T0.B16
    VEXT $12, T3.B16, ZERO.B16, T3.B16
    VMOV H0, T0.S[0]
    VMOV H1, T3.S[0]
ld2:
    TBZ $1, srcPtrLen, ld1
    MOVH.W -2(srcPtr), H0
    VEXT $14, T0.B16, ZERO.B16, T0.B16
    VEXT $14, T3.B16, ZERO.B16, T3.B16
    VMOV H0, T0.H[0]
    VMOV H1, T3.H[0]
ld1:
    TBZ $0, srcPtrLen, ld0
    MOVB.W -1(srcPtr), H0
    VEXT $15, T0.B16, ZERO.B16, T0.B16
    VEXT $15, T3.B16, ZERO.B16, T3.B16
    VMOV H0, T0.B[0]
    VMOV H1, T3.B[0]
ld0:
    MOVD ZR, srcPtrLen
    VEOR T0.B16, K0.B16, K0.B16
    VAND T3.B16, K0.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingle(K0)

encDone:
    VST1 [ACC0.B16], (tPtr)
    RET

// func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4Dec(SB),NOSPLIT,$0
    MOVD productTable+0(FP), pTbl
    MOVD dst+8(FP), dstPtr
    MOVD src_base+32(FP), srcPtr
    MOVD src_len+40(FP), srcPtrLen
    MOVD ctr+56(FP), ctrPtr
    MOVD T+64(FP), tPtr
    MOVD rk_base+72(FP), rk

    MOVD $0xC2, H1
    LSL $56, H1
    MOVD $1, H0
    VMOV H1, POLY.D[0]
    VMOV H0, POLY.D[1]
    VEOR ZERO.B16, ZERO.B16, ZERO.B16

    MOVD pTbl, pTblSave
    MOVD rk, rkSave
    // Current tag, after AAD
    VLD1 (tPtr), [ACC0.B16]
    VEOR ACC1.B16, ACC1.B16, ACC1.B16
    VEOR ACCM.B16, ACCM.B16, ACCM.B16
    // Prepare initial counter, and the increment vector
    VLD1 (ctrPtr), [CTR.B16]
    VEOR INC.B16, INC.B16, INC.B16
    MOVD $1, H0
    VMOV H0, INC.S[3]
    VREV32 CTR.B16, CTR.B16
    VADD CTR.S4, INC.S4, CTR.S4

    // Skip to <8 blocks loop
    CMP $128, srcPtrLen

    LOAD_SM4_AESNI_CONSTS()

    BLT decNibblesLoop
    // There are at least 8 blocks to decrypt

decOctetsLoop:
    SUB $128, srcPtrLen

    VMOV CTR.B16, B0.B16
    VADD B0.S4, INC.S4, B1.S4
    VREV32 B0.B16, B0.B16
    VADD B1.S4, INC.S4, B2.S4
    VREV32 B1.B16, B1.B16
    VADD B2.S4, INC.S4, B3.S4
    VREV32 B2.B16, B2.B16
    VADD B3.S4, INC.S4, B4.S4
    VREV32 B3.B16, B3.B16
    VADD B4.S4, INC.S4, B5.S4
    VREV32 B4.B16, B4.B16
    VADD B5.S4, INC.S4, B6.S4
    VREV32 B5.B16, B5.B16
    VADD B6.S4, INC.S4, B7.S4
    VREV32 B6.B16, B6.B16
    VADD B7.S4, INC.S4, CTR.S4
    VREV32 B7.B16, B7.B16

    // Encrypt the first 4 counter blocks
    EOR R13, R13
    MOVD rkSave, rk

decOctetsEnc4Blocks1:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
    BNE decOctetsEnc4Blocks1
    VREV32 B0.B16, T1.B16
    VREV32 B1.B16, T2.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16

    // Encrypt the second 4 counter blocks
    MOVD rkSave, rk

decOctetsEnc4Blocks2:
    SM4_ROUND(rk, K0, K1, K2, B4, B5, B6, B7)
    SM4_ROUND(rk, K0, K1, K2, B5, B6, B7, B4)
    SM4_ROUND(rk, K0, K1, K2, B6, B7, B4, B5)
    SM4_ROUND(rk, K0, K1, K2, B7, B4, B5, B6)

    ADD $1, R13
    CMP $16, R13
    BNE decOctetsEnc4Blocks2
    VREV32 B4.B16, B4.B16
    VREV32 B5.B16, B5.B16
    VREV32 B6.B16, B6.B16
    VREV32 B7.B16, B7.B16
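
    // Note the GHASH ordering for decryption: the ciphertext blocks loaded from
    // srcPtr are hashed as-is, before the keystream is XORed in, so the tag is
    // always computed over ciphertext, matching the encrypt path.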
    VLD1.P 32(srcPtr), [B0.B16, B1.B16]
    VEOR B0.B16, T1.B16, T1.B16
    VEOR B1.B16, T2.B16, T2.B16
    VST1.P [T1.B16, T2.B16], 32(dstPtr)

    VLD1.P 32(pTbl), [T1.B16, T2.B16]
    VREV64 B0.B16, B0.B16
    VEOR ACC0.B16, B0.B16, B0.B16
    VEXT $8, B0.B16, B0.B16, T0.B16
    VEOR B0.B16, T0.B16, T0.B16
    VPMULL B0.D1, T1.D1, ACC1.Q1
    VPMULL2 B0.D2, T1.D2, ACC0.Q1
    VPMULL T0.D1, T2.D1, ACCM.Q1
    mulRound(B1)

    VLD1.P 32(srcPtr), [B0.B16, B1.B16]
    VEOR B2.B16, B0.B16, T1.B16
    VEOR B3.B16, B1.B16, T2.B16
    VST1.P [T1.B16, T2.B16], 32(dstPtr)
    mulRound(B0)
    mulRound(B1)

    VLD1.P 32(srcPtr), [B0.B16, B1.B16]
    VEOR B4.B16, B0.B16, T1.B16
    VEOR B5.B16, B1.B16, T2.B16
    VST1.P [T1.B16, T2.B16], 32(dstPtr)
    mulRound(B0)
    mulRound(B1)

    VLD1.P 32(srcPtr), [B0.B16, B1.B16]
    VEOR B6.B16, B0.B16, T1.B16
    VEOR B7.B16, B1.B16, T2.B16
    VST1.P [T1.B16, T2.B16], 32(dstPtr)
    mulRound(B0)
    mulRound(B1)

    MOVD pTblSave, pTbl
    reduce()

    CMP $128, srcPtrLen
    BGE decOctetsLoop

decNibblesLoop:
    CBZ srcPtrLen, decDone
    ADD $14*16, pTbl
    // Preload H and its Karatsuba precomp
    VLD1.P (pTbl), [T1.B16, T2.B16]
    CMP $64, srcPtrLen
    BLT decStartSingles
    SUB $64, srcPtrLen

    // Prepare 4 counters
    VMOV CTR.B16, B0.B16
    VADD B0.S4, INC.S4, B1.S4
    VREV32 B0.B16, B0.B16
    VADD B1.S4, INC.S4, B2.S4
    VREV32 B1.B16, B1.B16
    VADD B2.S4, INC.S4, B3.S4
    VREV32 B2.B16, B2.B16
    VADD B3.S4, INC.S4, B4.S4
    VREV32 B3.B16, B3.B16

    // Encrypt the 4 counter blocks
    EOR R13, R13
    MOVD rkSave, rk

decNibblesEnc4Blocks:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
    BNE decNibblesEnc4Blocks
    VREV32 B0.B16, B0.B16
    VREV32 B1.B16, B1.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16

    // XOR ciphertext and store plaintext
    VLD1.P 32(srcPtr), [K1.B16, K2.B16]
    VREV64 K1.B16, B4.B16
    VREV64 K2.B16, B5.B16
    VEOR B0.B16, K1.B16, B0.B16
    VEOR B1.B16, K2.B16, B1.B16
    VST1.P [B0.B16, B1.B16], 32(dstPtr)
    VLD1.P 32(srcPtr), [K1.B16, K2.B16]
    VREV64 K1.B16, B6.B16
    VREV64 K2.B16, B7.B16
    VEOR B2.B16, K1.B16, B2.B16
    VEOR B3.B16, K2.B16, B3.B16
    VST1.P [B2.B16, B3.B16], 32(dstPtr)
    mulRoundSingleWithoutRev(B4)
    mulRoundSingleWithoutRev(B5)
    mulRoundSingleWithoutRev(B6)
    mulRoundSingleWithoutRev(B7)

decStartSingles:
    CBZ srcPtrLen, decDone

    // Prepare 4 counters
    VMOV CTR.B16, B0.B16
    VADD B0.S4, INC.S4, B1.S4
    VREV32 B0.B16, B0.B16
    VADD B1.S4, INC.S4, B2.S4
    VREV32 B1.B16, B1.B16
    VADD B2.S4, INC.S4, B3.S4
    VREV32 B2.B16, B2.B16
    VADD B3.S4, INC.S4, B4.S4
    VREV32 B3.B16, B3.B16

    // Encrypt the 4 counter blocks
    EOR R13, R13
    MOVD rkSave, rk

decSinglesEnc4Blocks:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
    BNE decSinglesEnc4Blocks
    VREV32 B0.B16, B0.B16
    VREV32 B1.B16, B1.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16

    VMOV B0.B16, K0.B16
    CMP $16, srcPtrLen
    BLT decTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VREV64 K1.B16, B5.B16
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingleWithoutRev(B5)

    VMOV B1.B16, K0.B16
    CMP $16, srcPtrLen
    BLT decTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VREV64 K1.B16, B5.B16
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingleWithoutRev(B5)

    VMOV B2.B16, K0.B16
    CMP $16, srcPtrLen
    BLT decTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VREV64 K1.B16, B5.B16
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingleWithoutRev(B5)

    VMOV B3.B16, K0.B16
    CMP $16, srcPtrLen
    BLT decTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VREV64 K1.B16, B5.B16
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingleWithoutRev(B5)

decTail:
    CBZ srcPtrLen, decDone
    // Assuming it is safe to load past the end of src due to the presence of the tag
    VLD1 (srcPtr), [B5.B16]

    VEOR B5.B16, K0.B16, B0.B16

    VEOR T3.B16, T3.B16, T3.B16
    MOVD $0, H1
    SUB $1, H1

    TBZ $3, srcPtrLen, decLd4
    VMOV B0.D[0], H0
    MOVD.P H0, 8(dstPtr)
    VMOV H1, T3.D[0]
    VEXT $8, ZERO.B16, B0.B16, B0.B16

decLd4:
    TBZ $2, srcPtrLen, decLd2
    VMOV B0.S[0], H0
    MOVW.P H0, 4(dstPtr)
    VEXT $12, T3.B16, ZERO.B16, T3.B16
    VMOV H1, T3.S[0]
    VEXT $4, ZERO.B16, B0.B16, B0.B16
decLd2:
    TBZ $1, srcPtrLen, decLd1
    VMOV B0.H[0], H0
    MOVH.P H0, 2(dstPtr)
    VEXT $14, T3.B16, ZERO.B16, T3.B16
    VMOV H1, T3.H[0]
    VEXT $2, ZERO.B16, B0.B16, B0.B16
decLd1:
    TBZ $0, srcPtrLen, decLd0
    VMOV B0.B[0], H0
    MOVB.P H0, 1(dstPtr)
    VEXT $15, T3.B16, ZERO.B16, T3.B16
    VMOV H1, T3.B[0]
decLd0:

    VAND T3.B16, B5.B16, B5.B16
    VREV64 B5.B16, B5.B16

    VEOR ACC0.B16, B5.B16, B5.B16
    VEXT $8, B5.B16, B5.B16, T0.B16
    VEOR B5.B16, T0.B16, T0.B16
    VPMULL B5.D1, T1.D1, ACC1.Q1
    VPMULL2 B5.D2, T1.D2, ACC0.Q1
    VPMULL T0.D1, T2.D1, ACCM.Q1
    reduce()

decDone:
    VST1 [ACC0.B16], (tPtr)
    RET

@@ -1,5 +1,5 @@
-//go:build amd64
-// +build amd64
+//go:build amd64 || arm64
+// +build amd64 arm64

 package sm4

@@ -23,6 +23,12 @@ var _ gcmAble = (*sm4CipherGCM)(nil)

//go:noescape
func gcmSm4Init(productTable *[256]byte, rk []uint32)

//go:noescape
func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)

//go:noescape
func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)

//go:noescape
func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)
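
For orientation only: the sketch below is hypothetical and not part of this commit (the real wiring lives in the package's gcmAble Seal/Open code). It shows one plausible way a Seal-style caller could drive the three primitives, assuming counter holds the CTR block to start from and tagMask is E(K, J0); "encoding/binary" is assumed to be imported.

// Hypothetical sketch in package sm4; error handling, nonce derivation and
// dst sizing are omitted.
func sealSketch(productTable *[256]byte, rk []uint32, counter, tagMask *[16]byte,
	dst, plaintext, aad []byte) []byte {
	var tag [16]byte
	gcmSm4Data(productTable, aad, &tag)                        // GHASH the additional data
	gcmSm4Enc(productTable, dst, plaintext, counter, &tag, rk) // CTR-encrypt and GHASH the ciphertext
	var lenBlock [16]byte                                      // len(AAD) || len(C), in bits
	binary.BigEndian.PutUint64(lenBlock[:8], uint64(len(aad))*8)
	binary.BigEndian.PutUint64(lenBlock[8:], uint64(len(plaintext))*8)
	gcmSm4Data(productTable, lenBlock[:], &tag)                // GHASH the length block
	for i := range tag {
		tag[i] ^= tagMask[i] // final tag = GHASH xor E(K, J0), per the GCM spec
	}
	return append(dst[:len(plaintext)], tag[:]...)
}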