diff --git a/sm4/gcm_arm64.s b/sm4/gcm_arm64.s
index 2915314..00ef81d 100644
--- a/sm4/gcm_arm64.s
+++ b/sm4/gcm_arm64.s
@@ -252,7 +252,7 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
 	SM4_TAO_L1(x, y, z); \
 	VEOR x.B16, t0.B16, t0.B16
 
-// func gcmSm4Init(productTable *[256]byte, rk []uint32)
+// func gcmSm4Init(productTable *[256]byte, rk []uint32, inst int)
 TEXT ·gcmSm4Init(SB),NOSPLIT,$0
 #define pTbl R0
 #define RK R1
@@ -260,6 +260,7 @@ TEXT ·gcmSm4Init(SB),NOSPLIT,$0
 	MOVD productTable+0(FP), pTbl
 	MOVD rk+8(FP), RK
+	MOVD inst+16(FP), R5
 
 	MOVD $0xC2, I
 	LSL $56, I
@@ -269,6 +270,9 @@ TEXT ·gcmSm4Init(SB),NOSPLIT,$0
 	VEOR ZERO.B16, ZERO.B16, ZERO.B16
 
 	// Encrypt block 0 with the SM4 keys to generate the hash key H
+	CMP $1, R5
+	BEQ sm4InitSM4E
+
 	LOAD_SM4_AESNI_CONSTS()
 	VEOR B0.B16, B0.B16, B0.B16
 	VEOR B1.B16, B1.B16, B1.B16
@@ -290,7 +294,22 @@ sm4InitEncLoop:
 	VMOV B1.S[0], B0.S[3]
 	VMOV B2.S[0], B0.S[0]
 	VMOV B3.S[0], B0.S[1]
-
+	B sm4InitEncDone
+sm4InitSM4E:
+	VEOR B0.B16, B0.B16, B0.B16
+	VLD1.P 64(RK), [T0.S4, T1.S4, T2.S4, T3.S4]
+	WORD $0x6085c0ce //SM4E V0.4S, V11.4S
+	WORD $0x8085c0ce //SM4E V0.4S, V12.4S
+	WORD $0xa085c0ce //SM4E V0.4S, V13.4S
+	WORD $0xc085c0ce //SM4E V0.4S, V14.4S
+	VLD1.P 64(RK), [T0.S4, T1.S4, T2.S4, T3.S4]
+	WORD $0x6085c0ce //SM4E V0.4S, V11.4S
+	WORD $0x8085c0ce //SM4E V0.4S, V12.4S
+	WORD $0xa085c0ce //SM4E V0.4S, V13.4S
+	WORD $0xc085c0ce //SM4E V0.4S, V14.4S
+	VREV32 B0.B16, B0.B16
+	VREV64 B0.B16, B0.B16
+sm4InitEncDone:
 	// Multiply by 2 modulo P
 	VMOV B0.D[0], I
 	ASR $63, I
@@ -547,6 +566,7 @@ TEXT ·gcmSm4Enc(SB),NOSPLIT,$0
 	VMOV H0, INC.S[3]
 	VREV32 CTR.B16, CTR.B16
 	VADD CTR.S4, INC.S4, CTR.S4
+
 	// Skip to <8 blocks loop
 	CMP $128, srcPtrLen
 
@@ -587,7 +607,7 @@ encOctetsEnc4Blocks1:
 	VREV32 B2.B16, B2.B16
 	VREV32 B3.B16, B3.B16
 	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
-	// encryption first 4 blocks
+	// encryption second 4 blocks
 	PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
 	MOVD rkSave, rk
 
@@ -880,7 +900,7 @@ decOctetsEnc4Blocks1:
 	VREV32 B3.B16, B3.B16
 	TRANSPOSE_MATRIX(T1, T2, B2, B3, K0)
 
-	// encryption first 4 blocks
+	// encryption second 4 blocks
 	PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
 	MOVD rkSave, rk
 
diff --git a/sm4/gcm_sm4ni_arm64.s b/sm4/gcm_sm4ni_arm64.s
new file mode 100644
index 0000000..ee40b10
--- /dev/null
+++ b/sm4/gcm_sm4ni_arm64.s
@@ -0,0 +1,610 @@
+#include "textflag.h"
+
+#define B0 V0
+#define B1 V1
+#define B2 V2
+#define B3 V3
+#define B4 V4
+#define B5 V5
+#define B6 V6
+#define B7 V7
+
+#define ACC0 V8
+#define ACC1 V9
+#define ACCM V10
+
+#define T0 V11
+#define T1 V12
+#define T2 V13
+#define T3 V14
+
+#define POLY V15
+#define ZERO V16
+#define INC V17
+#define CTR V18
+
+#define K0 V19
+#define K1 V20
+#define K2 V21
+#define K3 V22
+#define K4 V23
+#define K5 V24
+#define K6 V25
+#define K7 V26
+
+#define reduce() \
+	VEOR ACC0.B16, ACCM.B16, ACCM.B16 \
+	VEOR ACC1.B16, ACCM.B16, ACCM.B16 \
+	VEXT $8, ZERO.B16, ACCM.B16, T0.B16 \
+	VEXT $8, ACCM.B16, ZERO.B16, ACCM.B16 \
+	VEOR ACCM.B16, ACC0.B16, ACC0.B16 \
+	VEOR T0.B16, ACC1.B16, ACC1.B16 \
+	VPMULL POLY.D1, ACC0.D1, T0.Q1 \
+	VEXT $8, ACC0.B16, ACC0.B16, ACC0.B16 \
+	VEOR T0.B16, ACC0.B16, ACC0.B16 \
+	VPMULL POLY.D1, ACC0.D1, T0.Q1 \
+	VEOR T0.B16, ACC1.B16, ACC1.B16 \
+	VEXT $8, ACC1.B16, ACC1.B16, ACC1.B16 \
+	VEOR ACC1.B16, ACC0.B16, ACC0.B16 \
+
+#define mulRound(X) \
+	VLD1.P 32(pTbl), [T1.B16, T2.B16] \
+	VREV64 X.B16, X.B16 \
+	VEXT $8, X.B16, X.B16, T0.B16 \
+	VEOR X.B16, T0.B16, T0.B16 \
+	VPMULL X.D1, T1.D1, T3.Q1 \
+	VEOR T3.B16, ACC1.B16, ACC1.B16 \
+	VPMULL2 X.D2, T1.D2, T3.Q1 \
+	VEOR T3.B16, ACC0.B16, ACC0.B16 \
+	VPMULL T0.D1, T2.D1, T3.Q1 \
+	VEOR T3.B16, ACCM.B16, ACCM.B16
+
+// func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
+TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0
+#define pTbl R0
+#define dstPtr R1
+#define ctrPtr R2
+#define srcPtr R3
+#define rk R4
+#define tPtr R5
+#define srcPtrLen R6
+#define aluCTR R7
+#define aluTMP R8
+#define H0 R9
+#define H1 R10
+#define pTblSave R11
+	MOVD productTable+0(FP), pTbl
+	MOVD dst+8(FP), dstPtr
+	MOVD src_base+32(FP), srcPtr
+	MOVD src_len+40(FP), srcPtrLen
+	MOVD ctr+56(FP), ctrPtr
+	MOVD T+64(FP), tPtr
+	MOVD rk_base+72(FP), rk
+
+	MOVD $0xC2, H1
+	LSL $56, H1
+	MOVD $1, H0
+	VMOV H1, POLY.D[0]
+	VMOV H0, POLY.D[1]
+	VEOR ZERO.B16, ZERO.B16, ZERO.B16
+
+	MOVD pTbl, pTblSave
+	// Current tag, after AAD
+	VLD1 (tPtr), [ACC0.B16]
+	VEOR ACC1.B16, ACC1.B16, ACC1.B16
+	VEOR ACCM.B16, ACCM.B16, ACCM.B16
+	// Prepare initial counter, and the increment vector
+	VLD1 (ctrPtr), [CTR.B16]
+	VEOR INC.B16, INC.B16, INC.B16
+	MOVD $1, H0
+	VMOV H0, INC.S[3]
+	VREV32 CTR.B16, CTR.B16
+	VADD CTR.S4, INC.S4, CTR.S4
+
+	// Skip to <8 blocks loop
+	CMP $128, srcPtrLen
+
+	MOVD rk, H0
+	// For SM4 round keys are stored in: K0 .. K7
+	VLD1.P 64(H0), [K0.S4, K1.S4, K2.S4, K3.S4]
+	VLD1.P 64(H0), [K4.S4, K5.S4, K6.S4, K7.S4]
+
+	BLT startSingles
+octetsLoop:
+	SUB $128, srcPtrLen
+	// Prepare 8 counters
+	VMOV CTR.B16, B0.B16
+	VADD B0.S4, INC.S4, B1.S4
+	VADD B1.S4, INC.S4, B2.S4
+	VADD B2.S4, INC.S4, B3.S4
+	VADD B3.S4, INC.S4, B4.S4
+	VADD B4.S4, INC.S4, B5.S4
+	VADD B5.S4, INC.S4, B6.S4
+	VADD B6.S4, INC.S4, B7.S4
+	VADD B7.S4, INC.S4, CTR.S4
+
+	WORD $0x6086c0ce //SM4E V0.4S, V19.4S
+	WORD $0x8086c0ce //SM4E V0.4S, V20.4S
+	WORD $0xa086c0ce //SM4E V0.4S, V21.4S
+	WORD $0xc086c0ce //SM4E V0.4S, V22.4S
+	WORD $0xe086c0ce //SM4E V0.4S, V23.4S
+	WORD $0x0087c0ce //SM4E V0.4S, V24.4S
+	WORD $0x2087c0ce //SM4E V0.4S, V25.4S
+	WORD $0x4087c0ce //SM4E V0.4S, V26.4S
+	WORD $0x6186c0ce //SM4E V1.4S, V19.4S
+	WORD $0x8186c0ce //SM4E V1.4S, V20.4S
+	WORD $0xa186c0ce //SM4E V1.4S, V21.4S
+	WORD $0xc186c0ce //SM4E V1.4S, V22.4S
+	WORD $0xe186c0ce //SM4E V1.4S, V23.4S
+	WORD $0x0187c0ce //SM4E V1.4S, V24.4S
+	WORD $0x2187c0ce //SM4E V1.4S, V25.4S
+	WORD $0x4187c0ce //SM4E V1.4S, V26.4S
+	WORD $0x6286c0ce //SM4E V2.4S, V19.4S
+	WORD $0x8286c0ce //SM4E V2.4S, V20.4S
+	WORD $0xa286c0ce //SM4E V2.4S, V21.4S
+	WORD $0xc286c0ce //SM4E V2.4S, V22.4S
+	WORD $0xe286c0ce //SM4E V2.4S, V23.4S
+	WORD $0x0287c0ce //SM4E V2.4S, V24.4S
+	WORD $0x2287c0ce //SM4E V2.4S, V25.4S
+	WORD $0x4287c0ce //SM4E V2.4S, V26.4S
+	WORD $0x6386c0ce //SM4E V3.4S, V19.4S
+	WORD $0x8386c0ce //SM4E V3.4S, V20.4S
+	WORD $0xa386c0ce //SM4E V3.4S, V21.4S
+	WORD $0xc386c0ce //SM4E V3.4S, V22.4S
+	WORD $0xe386c0ce //SM4E V3.4S, V23.4S
+	WORD $0x0387c0ce //SM4E V3.4S, V24.4S
+	WORD $0x2387c0ce //SM4E V3.4S, V25.4S
+	WORD $0x4387c0ce //SM4E V3.4S, V26.4S
+	WORD $0x6486c0ce //SM4E V4.4S, V19.4S
+	WORD $0x8486c0ce //SM4E V4.4S, V20.4S
+	WORD $0xa486c0ce //SM4E V4.4S, V21.4S
+	WORD $0xc486c0ce //SM4E V4.4S, V22.4S
+	WORD $0xe486c0ce //SM4E V4.4S, V23.4S
+	WORD $0x0487c0ce //SM4E V4.4S, V24.4S
+	WORD $0x2487c0ce //SM4E V4.4S, V25.4S
+	WORD $0x4487c0ce //SM4E V4.4S, V26.4S
+	WORD $0x6586c0ce //SM4E V5.4S, V19.4S
+	WORD $0x8586c0ce //SM4E V5.4S, V20.4S
+	WORD $0xa586c0ce //SM4E V5.4S, V21.4S
+	WORD $0xc586c0ce //SM4E V5.4S, V22.4S
+	WORD $0xe586c0ce //SM4E V5.4S, V23.4S
+	WORD $0x0587c0ce //SM4E V5.4S, V24.4S
+	WORD $0x2587c0ce //SM4E V5.4S, V25.4S
+	WORD $0x4587c0ce //SM4E V5.4S, V26.4S
+	WORD $0x6686c0ce //SM4E V6.4S, V19.4S
+	WORD $0x8686c0ce //SM4E V6.4S, V20.4S
+	WORD $0xa686c0ce //SM4E V6.4S, V21.4S
+	WORD $0xc686c0ce //SM4E V6.4S, V22.4S
+	WORD $0xe686c0ce //SM4E V6.4S, V23.4S
+	WORD $0x0687c0ce //SM4E V6.4S, V24.4S
+	WORD $0x2687c0ce //SM4E V6.4S, V25.4S
+	WORD $0x4687c0ce //SM4E V6.4S, V26.4S
+	WORD $0x6786c0ce //SM4E V7.4S, V19.4S
+	WORD $0x8786c0ce //SM4E V7.4S, V20.4S
+	WORD $0xa786c0ce //SM4E V7.4S, V21.4S
+	WORD $0xc786c0ce //SM4E V7.4S, V22.4S
+	WORD $0xe786c0ce //SM4E V7.4S, V23.4S
+	WORD $0x0787c0ce //SM4E V7.4S, V24.4S
+	WORD $0x2787c0ce //SM4E V7.4S, V25.4S
+	WORD $0x4787c0ce //SM4E V7.4S, V26.4S
+	VREV32 B0.B16, B0.B16
+	VREV32 B1.B16, B1.B16
+	VREV32 B2.B16, B2.B16
+	VREV32 B3.B16, B3.B16
+	VREV32 B4.B16, B4.B16
+	VREV32 B5.B16, B5.B16
+	VREV32 B6.B16, B6.B16
+	VREV32 B7.B16, B7.B16
+
+	// XOR plaintext and store ciphertext
+	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
+	VEOR B0.B16, T1.B16, B0.B16
+	VEOR B1.B16, T2.B16, B1.B16
+	VST1.P [B0.B16, B1.B16], 32(dstPtr)
+	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
+	VEOR B2.B16, T1.B16, B2.B16
+	VEOR B3.B16, T2.B16, B3.B16
+	VST1.P [B2.B16, B3.B16], 32(dstPtr)
+	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
+	VEOR B4.B16, T1.B16, B4.B16
+	VEOR B5.B16, T2.B16, B5.B16
+	VST1.P [B4.B16, B5.B16], 32(dstPtr)
+	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
+	VEOR B6.B16, T1.B16, B6.B16
+	VEOR B7.B16, T2.B16, B7.B16
+	VST1.P [B6.B16, B7.B16], 32(dstPtr)
+
+	VLD1.P 32(pTbl), [T1.B16, T2.B16]
+	VREV64 B0.B16, B0.B16
+	VEOR ACC0.B16, B0.B16, B0.B16
+	VEXT $8, B0.B16, B0.B16, T0.B16
+	VEOR B0.B16, T0.B16, T0.B16
+	VPMULL B0.D1, T1.D1, ACC1.Q1
+	VPMULL2 B0.D2, T1.D2, ACC0.Q1
+	VPMULL T0.D1, T2.D1, ACCM.Q1
+
+	mulRound(B1)
+	mulRound(B2)
+	mulRound(B3)
+	mulRound(B4)
+	mulRound(B5)
+	mulRound(B6)
+	mulRound(B7)
+	MOVD pTblSave, pTbl
+	reduce()
+
+	CMP $128, srcPtrLen
+	BGE octetsLoop
+
+startSingles:
+	CBZ srcPtrLen, done
+	ADD $14*16, pTbl
+	// Preload H and its Karatsuba precomp
+	VLD1.P (pTbl), [T1.B16, T2.B16]
+
+singlesLoop:
+	CMP $16, srcPtrLen
+	BLT tail
+	SUB $16, srcPtrLen
+
+	VMOV CTR.B16, B0.B16
+	VADD CTR.S4, INC.S4, CTR.S4
+	// SM4 8 rounds
+	WORD $0x6086c0ce //SM4E V0.4S, V19.4S
+	WORD $0x8086c0ce //SM4E V0.4S, V20.4S
+	WORD $0xa086c0ce //SM4E V0.4S, V21.4S
+	WORD $0xc086c0ce //SM4E V0.4S, V22.4S
+	WORD $0xe086c0ce //SM4E V0.4S, V23.4S
+	WORD $0x0087c0ce //SM4E V0.4S, V24.4S
+	WORD $0x2087c0ce //SM4E V0.4S, V25.4S
+	WORD $0x4087c0ce //SM4E V0.4S, V26.4S
+	VREV32 B0.B16, B0.B16
+
+singlesLast:
+	VLD1.P 16(srcPtr), [T0.B16]
+	VEOR T0.B16, B0.B16, B0.B16
+
+encReduce:
+	VST1.P [B0.B16], 16(dstPtr)
+
+	VREV64 B0.B16, B0.B16
+	VEOR ACC0.B16, B0.B16, B0.B16
+
+	VEXT $8, B0.B16, B0.B16, T0.B16
+	VEOR B0.B16, T0.B16, T0.B16
+	VPMULL B0.D1, T1.D1, ACC1.Q1
+	VPMULL2 B0.D2, T1.D2, ACC0.Q1
+	VPMULL T0.D1, T2.D1, ACCM.Q1
+
+	reduce()
+
+	B singlesLoop
+tail:
+	CBZ srcPtrLen, done
+
+	VEOR T0.B16, T0.B16, T0.B16
+	VEOR T3.B16, T3.B16, T3.B16
+	MOVD $0, H1
+	SUB $1, H1
+	ADD srcPtrLen, srcPtr
+
+	TBZ $3, srcPtrLen, ld4
+	MOVD.W -8(srcPtr), H0
+	VMOV H0, T0.D[0]
+	VMOV H1, T3.D[0]
+
+ld4:
+	TBZ $2, srcPtrLen, ld2
+	MOVW.W -4(srcPtr), H0
+	VEXT $12, T0.B16, ZERO.B16, T0.B16
+	VEXT $12, T3.B16, ZERO.B16, T3.B16
+	VMOV H0, T0.S[0]
+	VMOV H1, T3.S[0]
+ld2:
+	TBZ $1, srcPtrLen, ld1
+	MOVH.W -2(srcPtr), H0
+	VEXT $14, T0.B16, ZERO.B16, T0.B16
+	VEXT $14, T3.B16, ZERO.B16, T3.B16
+	VMOV H0, T0.H[0]
+	VMOV H1, T3.H[0]
+ld1:
+	TBZ $0, srcPtrLen, ld0
+	MOVB.W -1(srcPtr), H0
+	VEXT $15, T0.B16, ZERO.B16, T0.B16
+	VEXT $15, T3.B16, ZERO.B16, T3.B16
+	VMOV H0, T0.B[0]
+	VMOV H1, T3.B[0]
+ld0:
+	MOVD ZR, srcPtrLen
+	VMOV CTR.B16, B0.B16
+	// SM4 8 rounds
+	WORD $0x6086c0ce //SM4E V0.4S, V19.4S
+	WORD $0x8086c0ce //SM4E V0.4S, V20.4S
+	WORD $0xa086c0ce //SM4E V0.4S, V21.4S
+	WORD $0xc086c0ce //SM4E V0.4S, V22.4S
+	WORD $0xe086c0ce //SM4E V0.4S, V23.4S
+	WORD $0x0087c0ce //SM4E V0.4S, V24.4S
+	WORD $0x2087c0ce //SM4E V0.4S, V25.4S
+	WORD $0x4087c0ce //SM4E V0.4S, V26.4S
+	VREV32 B0.B16, B0.B16
+
+tailLast:
+	VEOR T0.B16, B0.B16, B0.B16
+	VAND T3.B16, B0.B16, B0.B16
+	B encReduce
+
+done:
+	VST1 [ACC0.B16], (tPtr)
+	RET
+
+// func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
+TEXT ·gcmSm4niDec(SB),NOSPLIT,$0
+	MOVD productTable+0(FP), pTbl
+	MOVD dst+8(FP), dstPtr
+	MOVD src_base+32(FP), srcPtr
+	MOVD src_len+40(FP), srcPtrLen
+	MOVD ctr+56(FP), ctrPtr
+	MOVD T+64(FP), tPtr
+	MOVD rk_base+72(FP), rk
+
+	MOVD $0xC2, H1
+	LSL $56, H1
+	MOVD $1, H0
+	VMOV H1, POLY.D[0]
+	VMOV H0, POLY.D[1]
+	VEOR ZERO.B16, ZERO.B16, ZERO.B16
+
+	MOVD pTbl, pTblSave
+	MOVD rk, rkSave
+	// Current tag, after AAD
+	VLD1 (tPtr), [ACC0.B16]
+	VEOR ACC1.B16, ACC1.B16, ACC1.B16
+	VEOR ACCM.B16, ACCM.B16, ACCM.B16
+	// Prepare initial counter, and the increment vector
+	VLD1 (ctrPtr), [CTR.B16]
+	VEOR INC.B16, INC.B16, INC.B16
+	MOVD $1, H0
+	VMOV H0, INC.S[3]
+	VREV32 CTR.B16, CTR.B16
+	VADD CTR.S4, INC.S4, CTR.S4
+
+	// Skip to <8 blocks loop
+	CMP $128, srcPtrLen
+
+	MOVD rk, H0
+	// For SM4 round keys are stored in: K0 .. K7
+	VLD1.P 64(H0), [K0.S4, K1.S4, K2.S4, K3.S4]
+	VLD1.P 64(H0), [K4.S4, K5.S4, K6.S4, K7.S4]
+
+	BLT startSingles
+octetsLoop:
+	SUB $128, srcPtrLen
+
+	VMOV CTR.B16, B0.B16
+	VADD B0.S4, INC.S4, B1.S4
+	VADD B1.S4, INC.S4, B2.S4
+	VADD B2.S4, INC.S4, B3.S4
+	VADD B3.S4, INC.S4, B4.S4
+	VADD B4.S4, INC.S4, B5.S4
+	VADD B5.S4, INC.S4, B6.S4
+	VADD B6.S4, INC.S4, B7.S4
+	VADD B7.S4, INC.S4, CTR.S4
+
+	WORD $0x6086c0ce //SM4E V0.4S, V19.4S
+	WORD $0x8086c0ce //SM4E V0.4S, V20.4S
+	WORD $0xa086c0ce //SM4E V0.4S, V21.4S
+	WORD $0xc086c0ce //SM4E V0.4S, V22.4S
+	WORD $0xe086c0ce //SM4E V0.4S, V23.4S
+	WORD $0x0087c0ce //SM4E V0.4S, V24.4S
+	WORD $0x2087c0ce //SM4E V0.4S, V25.4S
+	WORD $0x4087c0ce //SM4E V0.4S, V26.4S
+	WORD $0x6186c0ce //SM4E V1.4S, V19.4S
+	WORD $0x8186c0ce //SM4E V1.4S, V20.4S
+	WORD $0xa186c0ce //SM4E V1.4S, V21.4S
+	WORD $0xc186c0ce //SM4E V1.4S, V22.4S
+	WORD $0xe186c0ce //SM4E V1.4S, V23.4S
+	WORD $0x0187c0ce //SM4E V1.4S, V24.4S
+	WORD $0x2187c0ce //SM4E V1.4S, V25.4S
+	WORD $0x4187c0ce //SM4E V1.4S, V26.4S
+	WORD $0x6286c0ce //SM4E V2.4S, V19.4S
+	WORD $0x8286c0ce //SM4E V2.4S, V20.4S
+	WORD $0xa286c0ce //SM4E V2.4S, V21.4S
+	WORD $0xc286c0ce //SM4E V2.4S, V22.4S
+	WORD $0xe286c0ce //SM4E V2.4S, V23.4S
+	WORD $0x0287c0ce //SM4E V2.4S, V24.4S
+	WORD $0x2287c0ce //SM4E V2.4S, V25.4S
+	WORD $0x4287c0ce //SM4E V2.4S, V26.4S
+	WORD $0x6386c0ce //SM4E V3.4S, V19.4S
+	WORD $0x8386c0ce //SM4E V3.4S, V20.4S
+	WORD $0xa386c0ce //SM4E V3.4S, V21.4S
+	WORD $0xc386c0ce //SM4E V3.4S, V22.4S
+	WORD $0xe386c0ce //SM4E V3.4S, V23.4S
+	WORD $0x0387c0ce //SM4E V3.4S, V24.4S
+	WORD $0x2387c0ce //SM4E V3.4S, V25.4S
+	WORD $0x4387c0ce //SM4E V3.4S, V26.4S
+	WORD $0x6486c0ce //SM4E V4.4S, V19.4S
+	WORD $0x8486c0ce //SM4E V4.4S, V20.4S
+	WORD $0xa486c0ce //SM4E V4.4S, V21.4S
+	WORD $0xc486c0ce //SM4E V4.4S, V22.4S
+	WORD $0xe486c0ce //SM4E V4.4S, V23.4S
+	WORD $0x0487c0ce //SM4E V4.4S, V24.4S
+	WORD $0x2487c0ce //SM4E V4.4S, V25.4S
+	WORD $0x4487c0ce //SM4E V4.4S, V26.4S
+	WORD $0x6586c0ce //SM4E V5.4S, V19.4S
+	WORD $0x8586c0ce //SM4E V5.4S, V20.4S
+	WORD $0xa586c0ce //SM4E V5.4S, V21.4S
+	WORD $0xc586c0ce //SM4E V5.4S, V22.4S
+	WORD $0xe586c0ce //SM4E V5.4S, V23.4S
+	WORD $0x0587c0ce //SM4E V5.4S, V24.4S
+	WORD $0x2587c0ce //SM4E V5.4S, V25.4S
+	WORD $0x4587c0ce //SM4E V5.4S, V26.4S
+	WORD $0x6686c0ce //SM4E V6.4S, V19.4S
+	WORD $0x8686c0ce //SM4E V6.4S, V20.4S
+	WORD $0xa686c0ce //SM4E V6.4S, V21.4S
+	WORD $0xc686c0ce //SM4E V6.4S, V22.4S
+	WORD $0xe686c0ce //SM4E V6.4S, V23.4S
+	WORD $0x0687c0ce //SM4E V6.4S, V24.4S
+	WORD $0x2687c0ce //SM4E V6.4S, V25.4S
+	WORD $0x4687c0ce //SM4E V6.4S, V26.4S
+	WORD $0x6786c0ce //SM4E V7.4S, V19.4S
+	WORD $0x8786c0ce //SM4E V7.4S, V20.4S
+	WORD $0xa786c0ce //SM4E V7.4S, V21.4S
+	WORD $0xc786c0ce //SM4E V7.4S, V22.4S
+	WORD $0xe786c0ce //SM4E V7.4S, V23.4S
+	WORD $0x0787c0ce //SM4E V7.4S, V24.4S
+	WORD $0x2787c0ce //SM4E V7.4S, V25.4S
+	WORD $0x4787c0ce //SM4E V7.4S, V26.4S
+	VREV32 B0.B16, T1.B16
+	VREV32 B1.B16, T2.B16
+	VREV32 B2.B16, B2.B16
+	VREV32 B3.B16, B3.B16
+	VREV32 B4.B16, B4.B16
+	VREV32 B5.B16, B5.B16
+	VREV32 B6.B16, B6.B16
+	VREV32 B7.B16, B7.B16
+
+	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
+	VEOR B0.B16, T1.B16, T1.B16
+	VEOR B1.B16, T2.B16, T2.B16
+	VST1.P [T1.B16, T2.B16], 32(dstPtr)
+
+	VLD1.P 32(pTbl), [T1.B16, T2.B16]
+	VREV64 B0.B16, B0.B16
+	VEOR ACC0.B16, B0.B16, B0.B16
+	VEXT $8, B0.B16, B0.B16, T0.B16
+	VEOR B0.B16, T0.B16, T0.B16
+	VPMULL B0.D1, T1.D1, ACC1.Q1
+	VPMULL2 B0.D2, T1.D2, ACC0.Q1
+	VPMULL T0.D1, T2.D1, ACCM.Q1
+	mulRound(B1)
+
+	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
+	VEOR B2.B16, B0.B16, T1.B16
+	VEOR B3.B16, B1.B16, T2.B16
+	VST1.P [T1.B16, T2.B16], 32(dstPtr)
+	mulRound(B0)
+	mulRound(B1)
+
+	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
+	VEOR B4.B16, B0.B16, T1.B16
+	VEOR B5.B16, B1.B16, T2.B16
+	VST1.P [T1.B16, T2.B16], 32(dstPtr)
+	mulRound(B0)
+	mulRound(B1)
+
+	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
+	VEOR B6.B16, B0.B16, T1.B16
+	VEOR B7.B16, B1.B16, T2.B16
+	VST1.P [T1.B16, T2.B16], 32(dstPtr)
+	mulRound(B0)
+	mulRound(B1)
+
+	MOVD pTblSave, pTbl
+	reduce()
+
+	CMP $128, srcPtrLen
+	BGE octetsLoop
+
+startSingles:
+	CBZ srcPtrLen, done
+	ADD $14*16, pTbl
+	// Preload H and its Karatsuba precomp
+	VLD1.P (pTbl), [T1.B16, T2.B16]
+
+singlesLoop:
+	CMP $16, srcPtrLen
+	BLT tail
+	SUB $16, srcPtrLen
+
+	VMOV CTR.B16, B0.B16
+	VADD CTR.S4, INC.S4, CTR.S4
+	// SM4 8 rounds
+	WORD $0x6086c0ce //SM4E V0.4S, V19.4S
+	WORD $0x8086c0ce //SM4E V0.4S, V20.4S
+	WORD $0xa086c0ce //SM4E V0.4S, V21.4S
+	WORD $0xc086c0ce //SM4E V0.4S, V22.4S
+	WORD $0xe086c0ce //SM4E V0.4S, V23.4S
+	WORD $0x0087c0ce //SM4E V0.4S, V24.4S
+	WORD $0x2087c0ce //SM4E V0.4S, V25.4S
+	WORD $0x4087c0ce //SM4E V0.4S, V26.4S
+	VREV32 B0.B16, B0.B16
+
+singlesLast:
+	VLD1.P 16(srcPtr), [T0.B16]
+	VEOR T0.B16, B0.B16, B0.B16
+	VST1.P [B0.B16], 16(dstPtr)
+
+	VREV64 T0.B16, B5.B16
+	VEOR ACC0.B16, B5.B16, B5.B16
+	VEXT $8, B5.B16, B5.B16, T0.B16
+	VEOR B5.B16, T0.B16, T0.B16
+	VPMULL B5.D1, T1.D1, ACC1.Q1
+	VPMULL2 B5.D2, T1.D2, ACC0.Q1
+	VPMULL T0.D1, T2.D1, ACCM.Q1
+	reduce()
+
+	B singlesLoop
+tail:
+	CBZ srcPtrLen, done
+	VMOV CTR.B16, B0.B16
+	VADD CTR.S4, INC.S4, CTR.S4
+	// SM4 8 rounds
+	WORD $0x6086c0ce //SM4E V0.4S, V19.4S
+	WORD $0x8086c0ce //SM4E V0.4S, V20.4S
+	WORD $0xa086c0ce //SM4E V0.4S, V21.4S
+	WORD $0xc086c0ce //SM4E V0.4S, V22.4S
+	WORD $0xe086c0ce //SM4E V0.4S, V23.4S
+	WORD $0x0087c0ce //SM4E V0.4S, V24.4S
+	WORD $0x2087c0ce //SM4E V0.4S, V25.4S
+	WORD $0x4087c0ce //SM4E V0.4S, V26.4S
+	VREV32 B0.B16, B0.B16
+tailLast:
+	// Assuming it is safe to load past dstPtr due to the presence of the tag
+	VLD1 (srcPtr), [B5.B16]
+
+	VEOR B5.B16, B0.B16, B0.B16
+
+	VEOR T3.B16, T3.B16, T3.B16
+	MOVD $0, H1
+	SUB $1, H1
+
+	TBZ $3, srcPtrLen, ld4
+	VMOV B0.D[0], H0
+	MOVD.P H0, 8(dstPtr)
+	VMOV H1, T3.D[0]
+	VEXT $8, ZERO.B16, B0.B16, B0.B16
+ld4:
+	TBZ $2, srcPtrLen, ld2
+	VMOV B0.S[0], H0
+	MOVW.P H0, 4(dstPtr)
+	VEXT $12, T3.B16, ZERO.B16, T3.B16
+	VMOV H1, T3.S[0]
+	VEXT $4, ZERO.B16, B0.B16, B0.B16
+ld2:
+	TBZ $1, srcPtrLen, ld1
+	VMOV B0.H[0], H0
+	MOVH.P H0, 2(dstPtr)
+	VEXT $14, T3.B16, ZERO.B16, T3.B16
+	VMOV H1, T3.H[0]
+	VEXT $2, ZERO.B16, B0.B16, B0.B16
+ld1:
+	TBZ $0, srcPtrLen, ld0
+	VMOV B0.B[0], H0
+	MOVB.P H0, 1(dstPtr)
+	VEXT $15, T3.B16, ZERO.B16, T3.B16
+	VMOV H1, T3.B[0]
+ld0:
+
+	VAND T3.B16, B5.B16, B5.B16
+	VREV64 B5.B16, B5.B16
+
+	VEOR ACC0.B16, B5.B16, B5.B16
+	VEXT $8, B5.B16, B5.B16, T0.B16
+	VEOR B5.B16, T0.B16, T0.B16
+	VPMULL B5.D1, T1.D1, ACC1.Q1
+	VPMULL2 B5.D2, T1.D2, ACC0.Q1
+	VPMULL T0.D1, T2.D1, ACCM.Q1
+	reduce()
+done:
+	VST1 [ACC0.B16], (tPtr)
+
+	RET
diff --git a/sm4/sm4_gcm_asm.go b/sm4/sm4_gcm_asm.go
index 515f754..ccba10b 100644
--- a/sm4/sm4_gcm_asm.go
+++ b/sm4/sm4_gcm_asm.go
@@ -21,7 +21,7 @@ type sm4CipherGCM struct {
 var _ gcmAble = (*sm4CipherGCM)(nil)
 
 //go:noescape
-func gcmSm4Init(productTable *[256]byte, rk []uint32)
+func gcmSm4Init(productTable *[256]byte, rk []uint32, inst int)
 
 //go:noescape
 func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
@@ -29,6 +29,12 @@ func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []
 //go:noescape
 func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
 
+//go:noescape
+func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
+
+//go:noescape
+func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
+
 //go:noescape
 func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)
 
@@ -40,6 +46,30 @@ type gcmAsm struct {
 	bytesProductTable [256]byte
 }
 
+func gcmSm4InitInst(productTable *[256]byte, rk []uint32) {
+	if supportSM4 {
+		gcmSm4Init(productTable, rk, 1)
+	} else {
+		gcmSm4Init(productTable, rk, 0)
+	}
+}
+
+func gcmSm4EncInst(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) {
+	if supportSM4 {
+		gcmSm4niEnc(productTable, dst, src, ctr, T, rk)
+	} else {
+		gcmSm4Enc(productTable, dst, src, ctr, T, rk)
+	}
+}
+
+func gcmSm4DecInst(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) {
+	if supportSM4 {
+		gcmSm4niDec(productTable, dst, src, ctr, T, rk)
+	} else {
+		gcmSm4Dec(productTable, dst, src, ctr, T, rk)
+	}
+}
+
 // NewGCM returns the SM4 cipher wrapped in Galois Counter Mode. This is only
 // called by crypto/cipher.NewGCM via the gcmAble interface.
 func (c *sm4CipherGCM) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
@@ -47,7 +77,7 @@ func (c *sm4CipherGCM) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
 	g.cipher = &c.sm4CipherAsm
 	g.nonceSize = nonceSize
 	g.tagSize = tagSize
-	gcmSm4Init(&g.bytesProductTable, g.cipher.enc)
+	gcmSm4InitInst(&g.bytesProductTable, g.cipher.enc)
 	return g, nil
 }
 
@@ -92,7 +122,7 @@ func (g *gcmAsm) Seal(dst, nonce, plaintext, data []byte) []byte {
 	}
 
 	if len(plaintext) > 0 {
-		gcmSm4Enc(&g.bytesProductTable, out, plaintext, &counter, &tagOut, g.cipher.enc)
+		gcmSm4EncInst(&g.bytesProductTable, out, plaintext, &counter, &tagOut, g.cipher.enc)
 	}
 	gcmSm4Finish(&g.bytesProductTable, &tagMask, &tagOut, uint64(len(plaintext)), uint64(len(data)))
 	copy(out[len(plaintext):], tagOut[:])
@@ -145,7 +175,7 @@ func (g *gcmAsm) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) {
 		panic("cipher: invalid buffer overlap")
 	}
 	if len(ciphertext) > 0 {
-		gcmSm4Dec(&g.bytesProductTable, out, ciphertext, &counter, &expectedTag, g.cipher.enc)
+		gcmSm4DecInst(&g.bytesProductTable, out, ciphertext, &counter, &expectedTag, g.cipher.enc)
 	}
 	gcmSm4Finish(&g.bytesProductTable, &tagMask, &expectedTag, uint64(len(ciphertext)), uint64(len(data)))
 
diff --git a/sm4/sm4_gcm_test.go b/sm4/sm4_gcm_test.go
index 219d8ac..cc5b6bc 100644
--- a/sm4/sm4_gcm_test.go
+++ b/sm4/sm4_gcm_test.go
@@ -12,11 +12,11 @@ import (
 func genPrecomputeTable() *gcmAsm {
 	key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10}
 	c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, 4, 64}
-	expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0])
+	expandKey(key, c.enc, c.dec)
 	c1 := &sm4CipherGCM{c}
 	g := &gcmAsm{}
 	g.cipher = &c1.sm4CipherAsm
-	gcmSm4Init(&g.bytesProductTable, g.cipher.enc)
+	gcmSm4InitInst(&g.bytesProductTable, g.cipher.enc)
 	return g
 }
 
@@ -146,12 +146,12 @@ func TestBothDataPlaintext(t *testing.T) {
 func createGcm() *gcmAsm {
 	key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10}
 	c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, 4, 64}
-	expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0])
+	expandKey(key, c.enc, c.dec)
 	c1 := &sm4CipherGCM{c}
 	g := &gcmAsm{}
 	g.cipher = &c1.sm4CipherAsm
 	g.tagSize = 16
-	gcmSm4Init(&g.bytesProductTable, g.cipher.enc)
+	gcmSm4InitInst(&g.bytesProductTable, g.cipher.enc)
 	return g
 }
 
@@ -214,7 +214,7 @@ func TestGcmSm4Enc(t *testing.T) {
 		out2 := make([]byte, len(test.plaintext)+gcm.tagSize)
 		gcmSm4Data(&gcm.bytesProductTable, []byte("emmansun"), &tagOut2)
-		gcmSm4Enc(&gcm.bytesProductTable, out2, []byte(test.plaintext), &counter2, &tagOut2, gcm.cipher.enc)
+		gcmSm4EncInst(&gcm.bytesProductTable, out2, []byte(test.plaintext), &counter2, &tagOut2, gcm.cipher.enc)
 		if hex.EncodeToString(out1) != hex.EncodeToString(out2) {
 			t.Errorf("#%d: out expected %s, got %s", i, hex.EncodeToString(out1), hex.EncodeToString(out2))
 		}
@@ -244,7 +244,7 @@ func TestGcmSm4Dec(t *testing.T) {
 		out2 := make([]byte, len(test.plaintext)+gcm.tagSize)
 		gcmSm4Data(&gcm.bytesProductTable, []byte("emmansun"), &tagOut2)
-		gcmSm4Dec(&gcm.bytesProductTable, out2, out1, &counter2, &tagOut2, gcm.cipher.enc)
+		gcmSm4DecInst(&gcm.bytesProductTable, out2, out1, &counter2, &tagOut2, gcm.cipher.enc)
 		if hex.EncodeToString([]byte(test.plaintext)) != hex.EncodeToString(out2[:len(test.plaintext)]) {
 			t.Errorf("#%d: out expected %s, got %s", i, hex.EncodeToString([]byte(test.plaintext)),
 				hex.EncodeToString(out2[:len(test.plaintext)]))
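
Note on the raw `WORD` constants: the assembler used here has no `SM4E` mnemonic, so the new file emits the instruction as literal 32-bit words. The values in the listing line up with the Armv8 SM4 extension encoding `SM4E <Vd>.4S, <Vn>.4S = 0xCEC08400 | Rn<<5 | Rd`, written with their four bytes reversed. The sketch below is not part of the patch and does not claim which byte order the toolchain expects; it only reproduces the constants as they appear above so they can be cross-checked against the comments.

```go
package main

import (
	"fmt"
	"math/bits"
)

// sm4eWord derives the SM4E Vd.4S, Vn.4S encoding and returns it with the
// bytes reversed, matching the hex constants written in gcm_sm4ni_arm64.s.
func sm4eWord(vd, vn uint32) uint32 {
	enc := uint32(0xCEC08400) | vn<<5 | vd // Armv8 SM4E bit layout
	return bits.ReverseBytes32(enc)
}

func main() {
	fmt.Printf("%#x\n", sm4eWord(0, 19)) // 0x6086c0ce, listed as SM4E V0.4S, V19.4S
	fmt.Printf("%#x\n", sm4eWord(5, 24)) // 0x587c0ce,  listed as SM4E V5.4S, V24.4S
}
```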
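The counter handling in both new routines byte-reverses each 32-bit lane of the counter block (`VREV32`) once up front, so stepping the counter is a plain integer `VADD` with `INC`, whose only non-zero lane is `S[3] = 1`. That is the vector form of GCM's inc32: only the last 32 bits of the counter block are incremented, big-endian, wrapping without carrying into the rest. A minimal scalar sketch of the same operation (not taken from the patch):

```go
package main

import (
	"encoding/binary"
	"fmt"
)

// inc32 increments the last 32 bits of a GCM counter block, big-endian,
// wrapping mod 2^32 just like the lane-wise VADD in the assembly.
func inc32(ctr *[16]byte) {
	n := binary.BigEndian.Uint32(ctr[12:])
	binary.BigEndian.PutUint32(ctr[12:], n+1)
}

func main() {
	var ctr [16]byte
	ctr[15] = 0xff
	inc32(&ctr)
	fmt.Printf("%x\n", ctr[12:]) // 00000100
}
```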
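The Go wrappers `gcmSm4InitInst`, `gcmSm4EncInst` and `gcmSm4DecInst` switch on a package-level `supportSM4` flag that this diff does not define. On arm64 such a flag would typically be populated from the CPU feature bits exposed by golang.org/x/sys/cpu; the snippet below is a hypothetical sketch of that wiring, shown only to make the dispatch concrete, not the package's actual declaration.

```go
package sm4

import "golang.org/x/sys/cpu"

// supportSM4 reports whether the CPU implements the Armv8 SM4 instructions
// (SM4E/SM4EKEY); when true, the *Inst wrappers route to gcmSm4niEnc/Dec.
var supportSM4 = cpu.ARM64.HasSM4
```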
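Callers do not change with this patch: the SM4E-backed GCM is still reached through the standard crypto/cipher API, because `cipher.NewGCM` detects the `gcmAble` implementation and calls the `NewGCM` method updated above. A usage sketch, assuming the module path `github.com/emmansun/gmsm` (inferred from the test data, not stated in the diff):

```go
package main

import (
	"crypto/cipher"
	"crypto/rand"
	"fmt"
	"io"

	"github.com/emmansun/gmsm/sm4"
)

func main() {
	key := make([]byte, 16)
	if _, err := io.ReadFull(rand.Reader, key); err != nil {
		panic(err)
	}
	block, err := sm4.NewCipher(key)
	if err != nil {
		panic(err)
	}
	// Routes through sm4CipherGCM.NewGCM (and thus the SM4E path when
	// the CPU supports it) via the gcmAble fast path.
	aead, err := cipher.NewGCM(block)
	if err != nil {
		panic(err)
	}
	nonce := make([]byte, aead.NonceSize())
	if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
		panic(err)
	}
	ct := aead.Seal(nil, nonce, []byte("hello sm4-gcm"), nil)
	pt, err := aead.Open(nil, nonce, ct, nil)
	fmt.Println(string(pt), err)
}
```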