Merge pull request #52 from emmansun/sm3_sm4_ni

Sm3 sm4 ni
commit 9a2d7123f8 by Sun Yimin, 2022-05-01 16:19:35 +08:00 (committed by GitHub)
15 changed files with 1533 additions and 70 deletions

.github/workflows/sm3_sm4_ni.ci.yml (new file)

@@ -0,0 +1,39 @@
name: ci
on:
push:
branches: [ sm3_sm4_ni ]
pull_request:
branches: [ sm3_sm4_ni ]
jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
goVer: ['1.15', '1.16', '1.17']
steps:
- name: Checkout Repo
uses: actions/checkout@v2
- name: Set up Go
uses: actions/setup-go@v2
with:
go-version: ${{ matrix.goVer }}
- name: Setup Environment
run: |
echo "GOPATH=$(go env GOPATH)" >> $GITHUB_ENV
echo "$(go env GOPATH)/bin" >> $GITHUB_PATH
- name: Module cache
uses: actions/cache@v2.1.7
env:
cache-name: go-mod-cache
with:
path: ~/go/pkg/mod
key: ${{ runner.os }}-${{ env.cache-name }}-${{ hashFiles('**/go.sum') }}
- name: Test
run: go test -v ./...


@@ -58,18 +58,18 @@ func sm3tt2b(Vd, Vn, Vm, imm2 byte) uint32 {
// Used v5 as temp register
func roundA(buf *bytes.Buffer, i, t, st1, st2, w, wt byte) {
-fmt.Fprintf(buf, "\tWORD 0x%08x //SM3SS1 V%d.4S, V%d.4S, V%d.4S, V%d.4S\n", sm3ss1(5, st1, t, st2), 5, st1, t, st2)
+fmt.Fprintf(buf, "\tWORD $0x%08x //SM3SS1 V%d.4S, V%d.4S, V%d.4S, V%d.4S\n", sm3ss1(5, st1, t, st2), 5, st1, t, st2)
fmt.Fprintf(buf, "\tVSHL $1, V%d.S4, V%d.S4\n", t, t)
-fmt.Fprintf(buf, "\tWORD 0x%08x //SM3TT1A V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt1a(st1, 5, wt, i), st1, 5, wt, i)
-fmt.Fprintf(buf, "\tWORD 0x%08x //SM3TT2A V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt2a(st2, 5, w, i), st2, 5, w, i)
+fmt.Fprintf(buf, "\tWORD $0x%08x //SM3TT1A V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt1a(st1, 5, wt, i), st1, 5, wt, i)
+fmt.Fprintf(buf, "\tWORD $0x%08x //SM3TT2A V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt2a(st2, 5, w, i), st2, 5, w, i)
}
// Used v5 as temp register
func roundB(buf *bytes.Buffer, i, t, st1, st2, w, wt byte) {
fmt.Fprintf(buf, "\tWORD 0x%08x //SM3SS1 V%d.4S, V%d.4S, V%d.4S, V%d.4S\n", sm3ss1(5, st1, t, st2), 5, st1, t, st2)
fmt.Fprintf(buf, "\tWORD $0x%08x //SM3SS1 V%d.4S, V%d.4S, V%d.4S, V%d.4S\n", sm3ss1(5, st1, t, st2), 5, st1, t, st2)
fmt.Fprintf(buf, "\tVSHL $1, V%d.S4, V%d.S4\n", t, t)
fmt.Fprintf(buf, "\tWORD 0x%08x //SM3TT1B V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt1b(st1, 5, wt, i), st1, 5, wt, i)
fmt.Fprintf(buf, "\tWORD 0x%08x //SM3TT2B V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt2b(st2, 5, w, i), st2, 5, w, i)
fmt.Fprintf(buf, "\tWORD $0x%08x //SM3TT1B V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt1b(st1, 5, wt, i), st1, 5, wt, i)
fmt.Fprintf(buf, "\tWORD $0x%08x //SM3TT2B V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt2b(st2, 5, w, i), st2, 5, w, i)
}
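For reference, the encoder helpers called above (sm3ss1, sm3tt1a, and so on) are outside this hunk; they follow the same pattern as the SM4 encoders added later in this PR: assemble the Armv8.2 SM3 instruction word, then byte-swap it so the assembler's little-endian WORD directive emits it correctly. A sketch of sm3ss1 under that assumption (it reproduces the 0x05254bce-style constants in the generated file below; assumes import "math/bits"):

	// SM3SS1 <Vd>.4S, <Vn>.4S, <Vm>.4S, <Va>.4S
	// Base opcode 0xce400000 with Vm at bit 16, Va at bit 10, Vn at bit 5, Vd at bit 0;
	// bits.ReverseBytes32 yields the little-endian form the WORD directive expects.
	func sm3ss1(Vd, Vn, Vm, Va byte) uint32 {
		inst := uint32(0xce400000) | uint32(Vm&0x1f)<<16 | uint32(Va&0x1f)<<10 |
			uint32(Vn&0x1f)<<5 | uint32(Vd&0x1f)
		return bits.ReverseBytes32(inst)
	}

For example, sm3ss1(5, 8, 11, 9) yields 0x05254bce, matching the WORD for SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S in sm3blockni_arm64.s.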
// Compress 4 words and generate 4 words, use v6, v7, v10 as temp registers
@@ -82,12 +82,12 @@ func roundB(buf *bytes.Buffer, i, t, st1, st2, w, wt byte) {
// st1, st2, sm3 state
func qroundA(buf *bytes.Buffer, t, st1, st2, s0, s1, s2, s3, s4 byte) {
fmt.Fprintf(buf, "\t// Extension\n")
-fmt.Fprintf(buf, "\tVEXT 3, V%d, V%d, V%d\n", s2, s1, s4)
-fmt.Fprintf(buf, "\tVEXT 3, V%d, V%d, V%d\n", s1, s0, 6)
-fmt.Fprintf(buf, "\tVEXT 2, V%d, V%d, V%d\n", s3, s2, 7)
-fmt.Fprintf(buf, "\tWORD 0x%08x //SM3PARTW1 V%d.4S, V%d.4S, V%d.4S\n", sm3partw1(s4, s0, s3), s4, s0, s3)
-fmt.Fprintf(buf, "\tWORD 0x%08x //SM3PARTW2 V%d.4S, V%d.4S, V%d.4S\n", sm3partw2(s4, 7, 6), s4, 7, 6)
-fmt.Fprintf(buf, "\tVEOR V%d, V%d, V10\n", s1, s0)
+fmt.Fprintf(buf, "\tVEXT $3, V%d.B16, V%d.B16, V%d.B16\n", s2, s1, s4)
+fmt.Fprintf(buf, "\tVEXT $3, V%d.B16, V%d.B16, V%d.B16\n", s1, s0, 6)
+fmt.Fprintf(buf, "\tVEXT $2, V%d.B16, V%d.B16, V%d.B16\n", s3, s2, 7)
+fmt.Fprintf(buf, "\tWORD $0x%08x //SM3PARTW1 V%d.4S, V%d.4S, V%d.4S\n", sm3partw1(s4, s0, s3), s4, s0, s3)
+fmt.Fprintf(buf, "\tWORD $0x%08x //SM3PARTW2 V%d.4S, V%d.4S, V%d.4S\n", sm3partw2(s4, 7, 6), s4, 7, 6)
+fmt.Fprintf(buf, "\tVEOR V%d.B16, V%d.B16, V10.B16\n", s1, s0)
fmt.Fprintf(buf, "\t// Compression\n")
roundA(buf, 0, t, st1, st2, s0, 10)
roundA(buf, 1, t, st1, st2, s0, 10)
@@ -100,13 +100,13 @@ func qroundA(buf *bytes.Buffer, t, st1, st2, s0, s1, s2, s3, s4 byte) {
func qroundB(buf *bytes.Buffer, t, st1, st2, s0, s1, s2, s3, s4 byte) {
if s4 != 0xff {
fmt.Fprintf(buf, "\t// Extension\n")
-fmt.Fprintf(buf, "\tVEXT 3, V%d, V%d, V%d\n", s2, s1, s4)
-fmt.Fprintf(buf, "\tVEXT 3, V%d, V%d, V%d\n", s1, s0, 6)
-fmt.Fprintf(buf, "\tVEXT 2, V%d, V%d, V%d\n", s3, s2, 7)
-fmt.Fprintf(buf, "\tWORD 0x%08x //SM3PARTW1 V%d.4S, V%d.4S, V%d.4S\n", sm3partw1(s4, s0, s3), s4, s0, s3)
-fmt.Fprintf(buf, "\tWORD 0x%08x //SM3PARTW2 V%d.4S, V%d.4S, V%d.4S\n", sm3partw2(s4, 7, 6), s4, 7, 6)
+fmt.Fprintf(buf, "\tVEXT $3, V%d.B16, V%d.B16, V%d.B16\n", s2, s1, s4)
+fmt.Fprintf(buf, "\tVEXT $3, V%d.B16, V%d.B16, V%d.B16\n", s1, s0, 6)
+fmt.Fprintf(buf, "\tVEXT $2, V%d.B16, V%d.B16, V%d.B16\n", s3, s2, 7)
+fmt.Fprintf(buf, "\tWORD $0x%08x //SM3PARTW1 V%d.4S, V%d.4S, V%d.4S\n", sm3partw1(s4, s0, s3), s4, s0, s3)
+fmt.Fprintf(buf, "\tWORD $0x%08x //SM3PARTW2 V%d.4S, V%d.4S, V%d.4S\n", sm3partw2(s4, 7, 6), s4, 7, 6)
}
fmt.Fprintf(buf, "\tVEOR V%d, V%d, V10\n", s1, s0)
fmt.Fprintf(buf, "\tVEOR V%d.B16, V%d.B16, V10.B16\n", s1, s0)
fmt.Fprintf(buf, "\t// Compression\n")
roundB(buf, 0, t, st1, st2, s0, 10)
roundB(buf, 1, t, st1, st2, s0, 10)
@@ -165,8 +165,8 @@ blockloop:
fmt.Fprint(buf, `
SUB $64, R3, R3 // message length - 64bytes, then compare with 64bytes
-VEOR V8.S4, V15.S4, V8.S4
-VEOR V9.S4, V16.S4, V9.S4
+VEOR V8.B16, V15.B16, V8.B16
+VEOR V9.B16, V16.B16, V9.B16
CBNZ R3, blockloop
sm3ret:


@@ -19,10 +19,10 @@ func blockARM64(dig *digest, p []byte)
func blockSM3NI(h []uint32, p []byte, t []uint32)
func block(dig *digest, p []byte) {
-//if !useSM3NI {
+if !useSM3NI {
blockARM64(dig, p)
-//} else {
-// h := dig.h[:]
-// blockSM3NI(h, p, t)
-//}
+} else {
+h := dig.h[:]
+blockSM3NI(h, p, t)
+}
}
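Note that useSM3NI itself is not part of this hunk; presumably it is a CPU feature gate along these lines (an assumption, mirroring how supportsAES is declared in the sm4 package via golang.org/x/sys/cpu):

	// hypothetical declaration; the real flag may live in another sm3 arm64 file
	import "golang.org/x/sys/cpu"

	var useSM3NI = cpu.ARM64.HasSM3 // true only when the CPU exposes the SM3 instructions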

sm3/sm3blockni_arm64.s (new file)

@@ -0,0 +1,416 @@
// Generated by gen_sm3block_ni.go. DO NOT EDIT.
#include "textflag.h"
// func blockSM3NI(h []uint32, p []byte, t []uint32)
TEXT ·blockSM3NI(SB), 0, $0
MOVD h_base+0(FP), R0 // Hash value first address
MOVD p_base+24(FP), R1 // message first address
MOVD p_len+32(FP), R3 // message length
MOVD t_base+48(FP), R2 // t constants first address
VLD1 (R0), [V8.S4, V9.S4] // load h(a,b,c,d,e,f,g,h)
LDPW (0*8)(R2), (R5, R6) // load t constants
blockloop:
VLD1.P 64(R1), [V0.B16, V1.B16, V2.B16, V3.B16] // load 64bytes message
VMOV V8.B16, V15.B16 // backup: V8 h(dcba)
VMOV V9.B16, V16.B16 // backup: V9 h(hgfe)
VREV32 V0.B16, V0.B16 // prepare for using message in Byte format
VREV32 V1.B16, V1.B16
VREV32 V2.B16, V2.B16
VREV32 V3.B16, V3.B16
// first 16 rounds
VMOV R5, V11.S[3]
// Extension
VEXT $3, V2.B16, V1.B16, V4.B16
VEXT $3, V1.B16, V0.B16, V6.B16
VEXT $2, V3.B16, V2.B16, V7.B16
WORD $0x04c063ce //SM3PARTW1 V4.4S, V0.4S, V3.4S
WORD $0xe4c466ce //SM3PARTW2 V4.4S, V7.4S, V6.4S
VEOR V1.B16, V0.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0
WORD $0xa98840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1
WORD $0xa99840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2
WORD $0xa9a840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3
WORD $0xa9b840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 3
// Extension
VEXT $3, V3.B16, V2.B16, V0.B16
VEXT $3, V2.B16, V1.B16, V6.B16
VEXT $2, V4.B16, V3.B16, V7.B16
WORD $0x20c064ce //SM3PARTW1 V0.4S, V1.4S, V4.4S
WORD $0xe0c466ce //SM3PARTW2 V0.4S, V7.4S, V6.4S
VEOR V2.B16, V1.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0
WORD $0xa98841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1
WORD $0xa99841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2
WORD $0xa9a841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3
WORD $0xa9b841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 3
// Extension
VEXT $3, V4.B16, V3.B16, V1.B16
VEXT $3, V3.B16, V2.B16, V6.B16
VEXT $2, V0.B16, V4.B16, V7.B16
WORD $0x41c060ce //SM3PARTW1 V1.4S, V2.4S, V0.4S
WORD $0xe1c466ce //SM3PARTW2 V1.4S, V7.4S, V6.4S
VEOR V3.B16, V2.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0
WORD $0xa98842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1
WORD $0xa99842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2
WORD $0xa9a842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3
WORD $0xa9b842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 3
// Extension
VEXT $3, V0.B16, V4.B16, V2.B16
VEXT $3, V4.B16, V3.B16, V6.B16
VEXT $2, V1.B16, V0.B16, V7.B16
WORD $0x62c061ce //SM3PARTW1 V2.4S, V3.4S, V1.4S
WORD $0xe2c466ce //SM3PARTW2 V2.4S, V7.4S, V6.4S
VEOR V4.B16, V3.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0
WORD $0xa98843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1
WORD $0xa99843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2
WORD $0xa9a843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3
WORD $0xa9b843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 3
// second 48 rounds
VMOV R6, V11.S[3]
// Extension
VEXT $3, V1.B16, V0.B16, V3.B16
VEXT $3, V0.B16, V4.B16, V6.B16
VEXT $2, V2.B16, V1.B16, V7.B16
WORD $0x83c062ce //SM3PARTW1 V3.4S, V4.4S, V2.4S
WORD $0xe3c466ce //SM3PARTW2 V3.4S, V7.4S, V6.4S
VEOR V0.B16, V4.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 3
// Extension
VEXT $3, V2.B16, V1.B16, V4.B16
VEXT $3, V1.B16, V0.B16, V6.B16
VEXT $2, V3.B16, V2.B16, V7.B16
WORD $0x04c063ce //SM3PARTW1 V4.4S, V0.4S, V3.4S
WORD $0xe4c466ce //SM3PARTW2 V4.4S, V7.4S, V6.4S
VEOR V1.B16, V0.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 3
// Extension
VEXT $3, V3.B16, V2.B16, V0.B16
VEXT $3, V2.B16, V1.B16, V6.B16
VEXT $2, V4.B16, V3.B16, V7.B16
WORD $0x20c064ce //SM3PARTW1 V0.4S, V1.4S, V4.4S
WORD $0xe0c466ce //SM3PARTW2 V0.4S, V7.4S, V6.4S
VEOR V2.B16, V1.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 3
// Extension
VEXT $3, V4.B16, V3.B16, V1.B16
VEXT $3, V3.B16, V2.B16, V6.B16
VEXT $2, V0.B16, V4.B16, V7.B16
WORD $0x41c060ce //SM3PARTW1 V1.4S, V2.4S, V0.4S
WORD $0xe1c466ce //SM3PARTW2 V1.4S, V7.4S, V6.4S
VEOR V3.B16, V2.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 3
// Extension
VEXT $3, V0.B16, V4.B16, V2.B16
VEXT $3, V4.B16, V3.B16, V6.B16
VEXT $2, V1.B16, V0.B16, V7.B16
WORD $0x62c061ce //SM3PARTW1 V2.4S, V3.4S, V1.4S
WORD $0xe2c466ce //SM3PARTW2 V2.4S, V7.4S, V6.4S
VEOR V4.B16, V3.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 3
// Extension
VEXT $3, V1.B16, V0.B16, V3.B16
VEXT $3, V0.B16, V4.B16, V6.B16
VEXT $2, V2.B16, V1.B16, V7.B16
WORD $0x83c062ce //SM3PARTW1 V3.4S, V4.4S, V2.4S
WORD $0xe3c466ce //SM3PARTW2 V3.4S, V7.4S, V6.4S
VEOR V0.B16, V4.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 3
// Extension
VEXT $3, V2.B16, V1.B16, V4.B16
VEXT $3, V1.B16, V0.B16, V6.B16
VEXT $2, V3.B16, V2.B16, V7.B16
WORD $0x04c063ce //SM3PARTW1 V4.4S, V0.4S, V3.4S
WORD $0xe4c466ce //SM3PARTW2 V4.4S, V7.4S, V6.4S
VEOR V1.B16, V0.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 3
// Extension
VEXT $3, V3.B16, V2.B16, V0.B16
VEXT $3, V2.B16, V1.B16, V6.B16
VEXT $2, V4.B16, V3.B16, V7.B16
WORD $0x20c064ce //SM3PARTW1 V0.4S, V1.4S, V4.4S
WORD $0xe0c466ce //SM3PARTW2 V0.4S, V7.4S, V6.4S
VEOR V2.B16, V1.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 3
// Extension
VEXT $3, V4.B16, V3.B16, V1.B16
VEXT $3, V3.B16, V2.B16, V6.B16
VEXT $2, V0.B16, V4.B16, V7.B16
WORD $0x41c060ce //SM3PARTW1 V1.4S, V2.4S, V0.4S
WORD $0xe1c466ce //SM3PARTW2 V1.4S, V7.4S, V6.4S
VEOR V3.B16, V2.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 3
VEOR V4.B16, V3.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 3
VEOR V0.B16, V4.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 3
VEOR V1.B16, V0.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 3
SUB $64, R3, R3 // message length - 64bytes, then compare with 64bytes
VEOR V8.B16, V15.B16, V8.B16
VEOR V9.B16, V16.B16, V9.B16
CBNZ R3, blockloop
sm3ret:
VST1 [V8.S4, V9.S4], (R0) // store hash value H
RET
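The two VEORs before CBNZ implement SM3's feed-forward: unlike SHA-2, which adds the saved state back, SM3 defines V(i+1) = CF(V(i), B(i)) XOR V(i), so the compressed state in V8/V9 is XORed with the copy saved in V15/V16 at loop entry. In Go terms, roughly:

	// conceptual feed-forward per 64-byte block (a sketch, not code from this PR);
	// h holds the compressed state, saved the state captured at block entry
	for i := range h {
		h[i] ^= saved[i]
	}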


@@ -290,7 +290,7 @@ GLOBL fk_mask<>(SB), RODATA, $16
AVX_SM4_TAO_L1(x, y); \
VPXOR x, t0, t0
-// func expandKeyAsm(key *byte, ck, enc, dec *uint32)
+// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
MOVQ key+0(FP), AX
MOVQ ck+8(FP), BX
@@ -321,7 +321,7 @@ loop:
expand_end:
RET
-// func encryptBlocksAsm(xk *uint32, dst, src []byte)
+// func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
MOVQ xk+0(FP), AX
MOVQ dst+8(FP), BX
@@ -497,7 +497,7 @@ avx2_sm4_done:
VZEROUPPER
RET
-// func encryptBlockAsm(xk *uint32, dst, src *byte)
+// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
MOVQ xk+0(FP), AX
MOVQ dst+8(FP), BX


@@ -164,13 +164,44 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
VMOV R0, R24_MASK.D[0] \
VMOV R1, R24_MASK.D[1]
-// func expandKeyAsm(key *byte, ck, enc, dec *uint32)
#define SM4EKEY_EXPORT_KEYS() \
VMOV V9.S[3], V10.S[0] \
VMOV V9.S[2], V10.S[1] \
VMOV V9.S[1], V10.S[2] \
VMOV V9.S[0], V10.S[3] \
VMOV V8.S[3], V11.S[0] \
VMOV V8.S[2], V11.S[1] \
VMOV V8.S[1], V11.S[2] \
VMOV V8.S[0], V11.S[3] \
VST1.P [V8.S4, V9.S4], 32(R10) \
VST1 [V10.S4, V11.S4], (R11) \
SUB $32, R11, R11
#define SM4E_ROUND() \
VLD1.P 16(R10), [V8.B16] \
VREV32 V8.B16, V8.B16 \
WORD $0x0884c0ce \
WORD $0x2884c0ce \
WORD $0x4884c0ce \
WORD $0x6884c0ce \
WORD $0x8884c0ce \
WORD $0xa884c0ce \
WORD $0xc884c0ce \
WORD $0xe884c0ce \
VREV32 V8.B16, V8.B16 \
VST1.P [V8.B16], 16(R9)
+// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
MOVD key+0(FP), R8
MOVD ck+8(FP), R9
MOVD enc+16(FP), R10
MOVD dec+24(FP), R11
MOVD inst+32(FP), R12
CMP $1, R12
BEQ sm4ekey
load_global_data_1()
VLD1 (R8), [t0.B16]
@@ -193,14 +224,46 @@ ksLoop:
ADD $16, R0
CMP $128, R0
BNE ksLoop
RET
-// func encryptBlocksAsm(xk *uint32, dst, src []byte)
sm4ekey:
LDP fk_mask<>(SB), (R0, R1)
VMOV R0, FK_MASK.D[0]
VMOV R1, FK_MASK.D[1]
VLD1 (R8), [V9.B16]
VREV32 V9.B16, V9.B16
VEOR FK_MASK.B16, V9.B16, V9.B16
ADD $96, R11
VLD1.P 64(R9), [V0.S4, V1.S4, V2.S4, V3.S4]
WORD $0x28c960ce //SM4EKEY V8.4S, V9.4S, V0.4S
WORD $0x09c961ce //SM4EKEY V9.4S, V8.4S, V1.4S
SM4EKEY_EXPORT_KEYS()
WORD $0x28c962ce //SM4EKEY V8.4S, V9.4S, V2.4S
WORD $0x09c963ce //SM4EKEY V9.4S, V8.4S, V3.4S
SM4EKEY_EXPORT_KEYS()
VLD1.P 64(R9), [V0.S4, V1.S4, V2.S4, V3.S4]
WORD $0x28c960ce //SM4EKEY V8.4S, V9.4S, V0.4S
WORD $0x09c961ce //SM4EKEY V9.4S, V8.4S, V1.4S
SM4EKEY_EXPORT_KEYS()
WORD $0x28c962ce //SM4EKEY V8.4S, V9.4S, V2.4S
WORD $0x09c963ce //SM4EKEY V9.4S, V8.4S, V3.4S
SM4EKEY_EXPORT_KEYS()
RET
+// func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
MOVD xk+0(FP), R8
MOVD dst+8(FP), R9
MOVD src+32(FP), R10
MOVD src_len+40(FP), R12
MOVD inst+56(FP), R11
CMP $1, R11
BEQ sm4niblocks
VLD1 (R10), [V5.S4, V6.S4, V7.S4, V8.S4]
VMOV V5.S[0], t0.S[0]
@@ -271,15 +334,26 @@ encryptBlocksLoop:
VMOV t1.S[3], V8.S[2]
VMOV t0.S[3], V8.S[3]
VST1 [V8.B16], (R9)
RET
sm4niblocks:
VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
VLD1.P 64(R8), [V4.S4, V5.S4, V6.S4, V7.S4]
sm4niblockloop:
SM4E_ROUND()
SUB $16, R12, R12 // message length - 16bytes, then compare with 16bytes
CBNZ R12, sm4niblockloop
RET
-// func encryptBlockAsm(xk *uint32, dst, src *byte)
+// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
MOVD xk+0(FP), R8
MOVD dst+8(FP), R9
MOVD src+16(FP), R10
MOVD inst+24(FP), R11
CMP $1, R11
BEQ sm4niblock
VLD1 (R10), [t0.S4]
VREV32 t0.B16, t0.B16
@@ -312,5 +386,21 @@ encryptBlockLoop:
VMOV t1.S[0], V8.S[2]
VMOV t0.S[0], V8.S[3]
VST1 [V8.B16], (R9)
RET
sm4niblock:
VLD1 (R10), [V8.B16]
VREV32 V8.B16, V8.B16
VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
WORD $0x0884c0ce //SM4E V8.4S, V0.4S
WORD $0x2884c0ce //SM4E V8.4S, V1.4S
WORD $0x4884c0ce //SM4E V8.4S, V2.4S
WORD $0x6884c0ce //SM4E V8.4S, V3.4S
VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
WORD $0x0884c0ce //SM4E V8.4S, V0.4S
WORD $0x2884c0ce //SM4E V8.4S, V1.4S
WORD $0x4884c0ce //SM4E V8.4S, V2.4S
WORD $0x6884c0ce //SM4E V8.4S, V3.4S
VREV32 V8.B16, V8.B16
VST1 [V8.B16], (R9)
RET
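A note on SM4EKEY_EXPORT_KEYS above: it stores each fresh pair of key vectors forward through R10 (enc) and word-reversed backward through R11 (dec), because SM4 decryption is the same transform with the 32 round keys consumed in reverse order. Conceptually:

	// sketch of the enc/dec key relationship SM4EKEY_EXPORT_KEYS materializes
	for i := 0; i < 32; i++ {
		dec[i] = enc[31-i]
	}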


@@ -15,14 +15,19 @@ var supportsAES = cpu.X86.HasAES || cpu.ARM64.HasAES
var supportsGFMUL = cpu.X86.HasPCLMULQDQ || cpu.ARM64.HasPMULL
var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
//go:noescape
-func encryptBlocksAsm(xk *uint32, dst, src []byte)
+const (
+INST_AES int = iota
+INST_SM4
+)
//go:noescape
-func encryptBlockAsm(xk *uint32, dst, src *byte)
+func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
//go:noescape
-func expandKeyAsm(key *byte, ck, enc, dec *uint32)
+func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
+//go:noescape
+func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
type sm4CipherAsm struct {
sm4Cipher
@@ -30,24 +35,66 @@
blocksSize int
}
type sm4CipherNI struct {
sm4Cipher
}
func newCipherNI(key []byte) (cipher.Block, error) {
c := &sm4CipherNI{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}}
expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0], INST_SM4)
if supportsGFMUL {
return &sm4CipherNIGCM{c}, nil
}
return c, nil
}
func (c *sm4CipherNI) Encrypt(dst, src []byte) {
if len(src) < BlockSize {
panic("sm4: input not full block")
}
if len(dst) < BlockSize {
panic("sm4: output not full block")
}
if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
panic("sm4: invalid buffer overlap")
}
encryptBlockAsm(&c.enc[0], &dst[0], &src[0], INST_SM4)
}
func (c *sm4CipherNI) Decrypt(dst, src []byte) {
if len(src) < BlockSize {
panic("sm4: input not full block")
}
if len(dst) < BlockSize {
panic("sm4: output not full block")
}
if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
panic("sm4: invalid buffer overlap")
}
encryptBlockAsm(&c.dec[0], &dst[0], &src[0], INST_SM4)
}
func newCipher(key []byte) (cipher.Block, error) {
if supportSM4 {
return newCipherNI(key)
}
if !supportsAES {
return newCipherGeneric(key)
}
blocks := 4
if useAVX2 {
blocks = 8
}
-c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, blocks, blocks * BlockSize}
-expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0])
-if supportsAES && supportsGFMUL {
+c := &sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, blocks, blocks * BlockSize}
+expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0], INST_AES)
+if supportsGFMUL {
return &sm4CipherGCM{c}, nil
}
-return &c, nil
+return c, nil
}
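newCipher now selects among three paths: the SM4-NI cipher when supportSM4 is set, the AES-NI-based assembly otherwise, and the generic Go implementation as a last resort. A minimal caller's view, assuming the package's exported NewCipher wraps newCipher as elsewhere in gmsm:

	package main

	import (
		"fmt"

		"github.com/emmansun/gmsm/sm4"
	)

	func main() {
		key := []byte("0123456789abcdef") // SM4 uses a 16-byte key
		block, err := sm4.NewCipher(key)  // dispatches to NI, AES-NI, or generic
		if err != nil {
			panic(err)
		}
		dst := make([]byte, sm4.BlockSize)
		block.Encrypt(dst, []byte("exampleplaintext")) // exactly one 16-byte block
		fmt.Printf("%x\n", dst)
	}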
func (c *sm4CipherAsm) BlockSize() int { return BlockSize }
func (c *sm4CipherAsm) Concurrency() int { return c.batchBlocks }
func (c *sm4CipherAsm) Encrypt(dst, src []byte) {
@@ -60,7 +107,7 @@ func (c *sm4CipherAsm) Encrypt(dst, src []byte) {
if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
panic("sm4: invalid buffer overlap")
}
-encryptBlockAsm(&c.enc[0], &dst[0], &src[0])
+encryptBlockAsm(&c.enc[0], &dst[0], &src[0], INST_AES)
}
func (c *sm4CipherAsm) EncryptBlocks(dst, src []byte) {
@@ -73,7 +120,7 @@ func (c *sm4CipherAsm) EncryptBlocks(dst, src []byte) {
if subtle.InexactOverlap(dst[:c.blocksSize], src[:c.blocksSize]) {
panic("sm4: invalid buffer overlap")
}
-encryptBlocksAsm(&c.enc[0], dst, src)
+encryptBlocksAsm(&c.enc[0], dst, src, INST_AES)
}
func (c *sm4CipherAsm) Decrypt(dst, src []byte) {
@@ -86,7 +133,7 @@ func (c *sm4CipherAsm) Decrypt(dst, src []byte) {
if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
panic("sm4: invalid buffer overlap")
}
-encryptBlockAsm(&c.dec[0], &dst[0], &src[0])
+encryptBlockAsm(&c.dec[0], &dst[0], &src[0], INST_AES)
}
func (c *sm4CipherAsm) DecryptBlocks(dst, src []byte) {
@@ -99,14 +146,16 @@ func (c *sm4CipherAsm) DecryptBlocks(dst, src []byte) {
if subtle.InexactOverlap(dst[:c.blocksSize], src[:c.blocksSize]) {
panic("sm4: invalid buffer overlap")
}
-encryptBlocksAsm(&c.dec[0], dst, src)
+encryptBlocksAsm(&c.dec[0], dst, src, INST_AES)
}
// expandKey is used by BenchmarkExpand to ensure that the asm implementation
// of key expansion is used for the benchmark when it is available.
func expandKey(key []byte, enc, dec []uint32) {
-if supportsAES {
-expandKeyAsm(&key[0], &ck[0], &enc[0], &dec[0])
+if supportSM4 {
+expandKeyAsm(&key[0], &ck[0], &enc[0], &dec[0], INST_SM4)
+} else if supportsAES {
+expandKeyAsm(&key[0], &ck[0], &enc[0], &dec[0], INST_AES)
} else {
expandKeyGo(key, enc, dec)
}


@@ -34,7 +34,7 @@ func TestExpandKey(t *testing.T) {
}
io.ReadFull(rand.Reader, key)
expandKeyGo(key, encRes1, decRes1)
-expandKeyAsm(&key[0], &ck[0], &encRes2[0], &decRes2[0])
+expandKey(key, encRes2, decRes2)
if !reflect.DeepEqual(encRes1, encRes2) {
t.Errorf("expected=%v, result=%v\n", encRes1, encRes2)
}


@@ -2201,3 +2201,11 @@ avx2GcmSm4DecDone:
VMOVDQU ACC0, (tPtr)
VZEROUPPER
RET
// func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0
RET
// func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niDec(SB),NOSPLIT,$0
RET


@@ -252,7 +252,7 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
SM4_TAO_L1(x, y, z); \
VEOR x.B16, t0.B16, t0.B16
-// func gcmSm4Init(productTable *[256]byte, rk []uint32)
+// func gcmSm4Init(productTable *[256]byte, rk []uint32, inst int)
TEXT ·gcmSm4Init(SB),NOSPLIT,$0
#define pTbl R0
#define RK R1
@@ -260,6 +260,7 @@ TEXT ·gcmSm4Init(SB),NOSPLIT,$0
MOVD productTable+0(FP), pTbl
MOVD rk+8(FP), RK
MOVD inst+16(FP), R5
MOVD $0xC2, I
LSL $56, I
@@ -269,6 +270,9 @@ TEXT ·gcmSm4Init(SB),NOSPLIT,$0
VEOR ZERO.B16, ZERO.B16, ZERO.B16
// Encrypt block 0 with the SM4 keys to generate the hash key H
CMP $1, R5
BEQ sm4InitSM4E
LOAD_SM4_AESNI_CONSTS()
VEOR B0.B16, B0.B16, B0.B16
VEOR B1.B16, B1.B16, B1.B16
@@ -290,7 +294,22 @@ sm4InitEncLoop:
VMOV B1.S[0], B0.S[3]
VMOV B2.S[0], B0.S[0]
VMOV B3.S[0], B0.S[1]
B sm4InitEncDone
sm4InitSM4E:
VEOR B0.B16, B0.B16, B0.B16
VLD1.P 64(RK), [T0.S4, T1.S4, T2.S4, T3.S4]
WORD $0x6085c0ce //SM4E V0.4S, V11.4S
WORD $0x8085c0ce //SM4E V0.4S, V12.4S
WORD $0xa085c0ce //SM4E V0.4S, V13.4S
WORD $0xc085c0ce //SM4E V0.4S, V14.4S
VLD1.P 64(RK), [T0.S4, T1.S4, T2.S4, T3.S4]
WORD $0x6085c0ce //SM4E V0.4S, V11.4S
WORD $0x8085c0ce //SM4E V0.4S, V12.4S
WORD $0xa085c0ce //SM4E V0.4S, V13.4S
WORD $0xc085c0ce //SM4E V0.4S, V14.4S
VREV32 B0.B16, B0.B16
VREV64 B0.B16, B0.B16
sm4InitEncDone:
// Multiply by 2 modulo P
VMOV B0.D[0], I
ASR $63, I
@@ -547,6 +566,7 @@ TEXT ·gcmSm4Enc(SB),NOSPLIT,$0
VMOV H0, INC.S[3]
VREV32 CTR.B16, CTR.B16
VADD CTR.S4, INC.S4, CTR.S4
// Skip to <8 blocks loop
CMP $128, srcPtrLen
@@ -587,7 +607,7 @@ encOctetsEnc4Blocks1:
VREV32 B2.B16, B2.B16
VREV32 B3.B16, B3.B16
TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
-// encryption first 4 blocks
+// encryption second 4 blocks
PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
MOVD rkSave, rk
@@ -880,7 +900,7 @@ decOctetsEnc4Blocks1:
VREV32 B3.B16, B3.B16
TRANSPOSE_MATRIX(T1, T2, B2, B3, K0)
-// encryption first 4 blocks
+// encryption second 4 blocks
PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
MOVD rkSave, rk

sm4/gcm_sm4ni_arm64.s (new file)

@@ -0,0 +1,525 @@
#include "textflag.h"
#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7
#define ACC0 V8
#define ACC1 V9
#define ACCM V10
#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14
#define POLY V15
#define ZERO V16
#define INC V17
#define CTR V18
#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22
#define K4 V23
#define K5 V24
#define K6 V25
#define K7 V26
#define reduce() \
VEOR ACC0.B16, ACCM.B16, ACCM.B16 \
VEOR ACC1.B16, ACCM.B16, ACCM.B16 \
VEXT $8, ZERO.B16, ACCM.B16, T0.B16 \
VEXT $8, ACCM.B16, ZERO.B16, ACCM.B16 \
VEOR ACCM.B16, ACC0.B16, ACC0.B16 \
VEOR T0.B16, ACC1.B16, ACC1.B16 \
VPMULL POLY.D1, ACC0.D1, T0.Q1 \
VEXT $8, ACC0.B16, ACC0.B16, ACC0.B16 \
VEOR T0.B16, ACC0.B16, ACC0.B16 \
VPMULL POLY.D1, ACC0.D1, T0.Q1 \
VEOR T0.B16, ACC1.B16, ACC1.B16 \
VEXT $8, ACC1.B16, ACC1.B16, ACC1.B16 \
VEOR ACC1.B16, ACC0.B16, ACC0.B16
#define mulRound(X) \
VLD1.P 32(pTbl), [T1.B16, T2.B16] \
VREV64 X.B16, X.B16 \
VEXT $8, X.B16, X.B16, T0.B16 \
VEOR X.B16, T0.B16, T0.B16 \
VPMULL X.D1, T1.D1, T3.Q1 \
VEOR T3.B16, ACC1.B16, ACC1.B16 \
VPMULL2 X.D2, T1.D2, T3.Q1 \
VEOR T3.B16, ACC0.B16, ACC0.B16 \
VPMULL T0.D1, T2.D1, T3.Q1 \
VEOR T3.B16, ACCM.B16, ACCM.B16
#define sm4eEnc1block() \
WORD $0x6086c0ce \ //SM4E V0.4S, V19.4S
WORD $0x8086c0ce \ //SM4E V0.4S, V20.4S
WORD $0xa086c0ce \ //SM4E V0.4S, V21.4S
WORD $0xc086c0ce \ //SM4E V0.4S, V22.4S
WORD $0xe086c0ce \ //SM4E V0.4S, V23.4S
WORD $0x0087c0ce \ //SM4E V0.4S, V24.4S
WORD $0x2087c0ce \ //SM4E V0.4S, V25.4S
WORD $0x4087c0ce //SM4E V0.4S, V26.4S
#define sm4eEnc8blocks() \
sm4eEnc1block() \
WORD $0x6186c0ce \ //SM4E V1.4S, V19.4S
WORD $0x8186c0ce \ //SM4E V1.4S, V20.4S
WORD $0xa186c0ce \ //SM4E V1.4S, V21.4S
WORD $0xc186c0ce \ //SM4E V1.4S, V22.4S
WORD $0xe186c0ce \ //SM4E V1.4S, V23.4S
WORD $0x0187c0ce \ //SM4E V1.4S, V24.4S
WORD $0x2187c0ce \ //SM4E V1.4S, V25.4S
WORD $0x4187c0ce \ //SM4E V1.4S, V26.4S
WORD $0x6286c0ce \ //SM4E V2.4S, V19.4S
WORD $0x8286c0ce \ //SM4E V2.4S, V20.4S
WORD $0xa286c0ce \ //SM4E V2.4S, V21.4S
WORD $0xc286c0ce \ //SM4E V2.4S, V22.4S
WORD $0xe286c0ce \ //SM4E V2.4S, V23.4S
WORD $0x0287c0ce \ //SM4E V2.4S, V24.4S
WORD $0x2287c0ce \ //SM4E V2.4S, V25.4S
WORD $0x4287c0ce \ //SM4E V2.4S, V26.4S
WORD $0x6386c0ce \ //SM4E V3.4S, V19.4S
WORD $0x8386c0ce \ //SM4E V3.4S, V20.4S
WORD $0xa386c0ce \ //SM4E V3.4S, V21.4S
WORD $0xc386c0ce \ //SM4E V3.4S, V22.4S
WORD $0xe386c0ce \ //SM4E V3.4S, V23.4S
WORD $0x0387c0ce \ //SM4E V3.4S, V24.4S
WORD $0x2387c0ce \ //SM4E V3.4S, V25.4S
WORD $0x4387c0ce \ //SM4E V3.4S, V26.4S
WORD $0x6486c0ce \ //SM4E V4.4S, V19.4S
WORD $0x8486c0ce \ //SM4E V4.4S, V20.4S
WORD $0xa486c0ce \ //SM4E V4.4S, V21.4S
WORD $0xc486c0ce \ //SM4E V4.4S, V22.4S
WORD $0xe486c0ce \ //SM4E V4.4S, V23.4S
WORD $0x0487c0ce \ //SM4E V4.4S, V24.4S
WORD $0x2487c0ce \ //SM4E V4.4S, V25.4S
WORD $0x4487c0ce \ //SM4E V4.4S, V26.4S
WORD $0x6586c0ce \ //SM4E V5.4S, V19.4S
WORD $0x8586c0ce \ //SM4E V5.4S, V20.4S
WORD $0xa586c0ce \ //SM4E V5.4S, V21.4S
WORD $0xc586c0ce \ //SM4E V5.4S, V22.4S
WORD $0xe586c0ce \ //SM4E V5.4S, V23.4S
WORD $0x0587c0ce \ //SM4E V5.4S, V24.4S
WORD $0x2587c0ce \ //SM4E V5.4S, V25.4S
WORD $0x4587c0ce \ //SM4E V5.4S, V26.4S
WORD $0x6686c0ce \ //SM4E V6.4S, V19.4S
WORD $0x8686c0ce \ //SM4E V6.4S, V20.4S
WORD $0xa686c0ce \ //SM4E V6.4S, V21.4S
WORD $0xc686c0ce \ //SM4E V6.4S, V22.4S
WORD $0xe686c0ce \ //SM4E V6.4S, V23.4S
WORD $0x0687c0ce \ //SM4E V6.4S, V24.4S
WORD $0x2687c0ce \ //SM4E V6.4S, V25.4S
WORD $0x4687c0ce \ //SM4E V6.4S, V26.4S
WORD $0x6786c0ce \ //SM4E V7.4S, V19.4S
WORD $0x8786c0ce \ //SM4E V7.4S, V20.4S
WORD $0xa786c0ce \ //SM4E V7.4S, V21.4S
WORD $0xc786c0ce \ //SM4E V7.4S, V22.4S
WORD $0xe786c0ce \ //SM4E V7.4S, V23.4S
WORD $0x0787c0ce \ //SM4E V7.4S, V24.4S
WORD $0x2787c0ce \ //SM4E V7.4S, V25.4S
WORD $0x4787c0ce //SM4E V7.4S, V26.4S
// func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define rk R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define H0 R9
#define H1 R10
#define pTblSave R11
#define rkSave R12
MOVD productTable+0(FP), pTbl
MOVD dst+8(FP), dstPtr
MOVD src_base+32(FP), srcPtr
MOVD src_len+40(FP), srcPtrLen
MOVD ctr+56(FP), ctrPtr
MOVD T+64(FP), tPtr
MOVD rk_base+72(FP), rk
MOVD $0xC2, H1
LSL $56, H1
MOVD $1, H0
VMOV H1, POLY.D[0]
VMOV H0, POLY.D[1]
VEOR ZERO.B16, ZERO.B16, ZERO.B16
MOVD pTbl, pTblSave
// Current tag, after AAD
VLD1 (tPtr), [ACC0.B16]
VEOR ACC1.B16, ACC1.B16, ACC1.B16
VEOR ACCM.B16, ACCM.B16, ACCM.B16
// Prepare initial counter, and the increment vector
VLD1 (ctrPtr), [CTR.B16]
VEOR INC.B16, INC.B16, INC.B16
MOVD $1, H0
VMOV H0, INC.S[3]
VREV32 CTR.B16, CTR.B16
VADD CTR.S4, INC.S4, CTR.S4
// Skip to <8 blocks loop
CMP $128, srcPtrLen
MOVD rk, H0
// For SM4 round keys are stored in: K0 .. K7
VLD1.P 64(H0), [K0.S4, K1.S4, K2.S4, K3.S4]
VLD1.P 64(H0), [K4.S4, K5.S4, K6.S4, K7.S4]
BLT startSingles
octetsLoop:
SUB $128, srcPtrLen
// Prepare 8 counters
VMOV CTR.B16, B0.B16
VADD B0.S4, INC.S4, B1.S4
VADD B1.S4, INC.S4, B2.S4
VADD B2.S4, INC.S4, B3.S4
VADD B3.S4, INC.S4, B4.S4
VADD B4.S4, INC.S4, B5.S4
VADD B5.S4, INC.S4, B6.S4
VADD B6.S4, INC.S4, B7.S4
VADD B7.S4, INC.S4, CTR.S4
sm4eEnc8blocks()
VREV32 B0.B16, B0.B16
VREV32 B1.B16, B1.B16
VREV32 B2.B16, B2.B16
VREV32 B3.B16, B3.B16
VREV32 B4.B16, B4.B16
VREV32 B5.B16, B5.B16
VREV32 B6.B16, B6.B16
VREV32 B7.B16, B7.B16
// XOR plaintext and store ciphertext
VLD1.P 32(srcPtr), [T1.B16, T2.B16]
VEOR B0.B16, T1.B16, B0.B16
VEOR B1.B16, T2.B16, B1.B16
VST1.P [B0.B16, B1.B16], 32(dstPtr)
VLD1.P 32(srcPtr), [T1.B16, T2.B16]
VEOR B2.B16, T1.B16, B2.B16
VEOR B3.B16, T2.B16, B3.B16
VST1.P [B2.B16, B3.B16], 32(dstPtr)
VLD1.P 32(srcPtr), [T1.B16, T2.B16]
VEOR B4.B16, T1.B16, B4.B16
VEOR B5.B16, T2.B16, B5.B16
VST1.P [B4.B16, B5.B16], 32(dstPtr)
VLD1.P 32(srcPtr), [T1.B16, T2.B16]
VEOR B6.B16, T1.B16, B6.B16
VEOR B7.B16, T2.B16, B7.B16
VST1.P [B6.B16, B7.B16], 32(dstPtr)
VLD1.P 32(pTbl), [T1.B16, T2.B16]
VREV64 B0.B16, B0.B16
VEOR ACC0.B16, B0.B16, B0.B16
VEXT $8, B0.B16, B0.B16, T0.B16
VEOR B0.B16, T0.B16, T0.B16
VPMULL B0.D1, T1.D1, ACC1.Q1
VPMULL2 B0.D2, T1.D2, ACC0.Q1
VPMULL T0.D1, T2.D1, ACCM.Q1
mulRound(B1)
mulRound(B2)
mulRound(B3)
mulRound(B4)
mulRound(B5)
mulRound(B6)
mulRound(B7)
MOVD pTblSave, pTbl
reduce()
CMP $128, srcPtrLen
BGE octetsLoop
startSingles:
CBZ srcPtrLen, done
ADD $14*16, pTbl
// Preload H and its Karatsuba precomp
VLD1.P (pTbl), [T1.B16, T2.B16]
singlesLoop:
CMP $16, srcPtrLen
BLT tail
SUB $16, srcPtrLen
VMOV CTR.B16, B0.B16
VADD CTR.S4, INC.S4, CTR.S4
sm4eEnc1block()
VREV32 B0.B16, B0.B16
singlesLast:
VLD1.P 16(srcPtr), [T0.B16]
VEOR T0.B16, B0.B16, B0.B16
encReduce:
VST1.P [B0.B16], 16(dstPtr)
VREV64 B0.B16, B0.B16
VEOR ACC0.B16, B0.B16, B0.B16
VEXT $8, B0.B16, B0.B16, T0.B16
VEOR B0.B16, T0.B16, T0.B16
VPMULL B0.D1, T1.D1, ACC1.Q1
VPMULL2 B0.D2, T1.D2, ACC0.Q1
VPMULL T0.D1, T2.D1, ACCM.Q1
reduce()
B singlesLoop
tail:
CBZ srcPtrLen, done
VEOR T0.B16, T0.B16, T0.B16
VEOR T3.B16, T3.B16, T3.B16
MOVD $0, H1
SUB $1, H1
ADD srcPtrLen, srcPtr
TBZ $3, srcPtrLen, ld4
MOVD.W -8(srcPtr), H0
VMOV H0, T0.D[0]
VMOV H1, T3.D[0]
ld4:
TBZ $2, srcPtrLen, ld2
MOVW.W -4(srcPtr), H0
VEXT $12, T0.B16, ZERO.B16, T0.B16
VEXT $12, T3.B16, ZERO.B16, T3.B16
VMOV H0, T0.S[0]
VMOV H1, T3.S[0]
ld2:
TBZ $1, srcPtrLen, ld1
MOVH.W -2(srcPtr), H0
VEXT $14, T0.B16, ZERO.B16, T0.B16
VEXT $14, T3.B16, ZERO.B16, T3.B16
VMOV H0, T0.H[0]
VMOV H1, T3.H[0]
ld1:
TBZ $0, srcPtrLen, ld0
MOVB.W -1(srcPtr), H0
VEXT $15, T0.B16, ZERO.B16, T0.B16
VEXT $15, T3.B16, ZERO.B16, T3.B16
VMOV H0, T0.B[0]
VMOV H1, T3.B[0]
ld0:
MOVD ZR, srcPtrLen
VMOV CTR.B16, B0.B16
sm4eEnc1block()
VREV32 B0.B16, B0.B16
tailLast:
VEOR T0.B16, B0.B16, B0.B16
VAND T3.B16, B0.B16, B0.B16
B encReduce
done:
VST1 [ACC0.B16], (tPtr)
RET
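The counter setup above follows GCM: VREV32 brings the counter into big-endian lane order and the VADD with INC (which holds 1 in its last word) bumps only the final 32-bit word. A conceptual equivalent (assumes import "encoding/binary"):

	// gcmInc32 is a sketch of the VREV32/VADD counter bump: GCM increments
	// only the last 32-bit word of the 16-byte counter block, big-endian.
	func gcmInc32(counter *[16]byte) {
		n := binary.BigEndian.Uint32(counter[12:])
		binary.BigEndian.PutUint32(counter[12:], n+1)
	}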
// func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niDec(SB),NOSPLIT,$0
MOVD productTable+0(FP), pTbl
MOVD dst+8(FP), dstPtr
MOVD src_base+32(FP), srcPtr
MOVD src_len+40(FP), srcPtrLen
MOVD ctr+56(FP), ctrPtr
MOVD T+64(FP), tPtr
MOVD rk_base+72(FP), rk
MOVD $0xC2, H1
LSL $56, H1
MOVD $1, H0
VMOV H1, POLY.D[0]
VMOV H0, POLY.D[1]
VEOR ZERO.B16, ZERO.B16, ZERO.B16
MOVD pTbl, pTblSave
MOVD rk, rkSave
// Current tag, after AAD
VLD1 (tPtr), [ACC0.B16]
VEOR ACC1.B16, ACC1.B16, ACC1.B16
VEOR ACCM.B16, ACCM.B16, ACCM.B16
// Prepare initial counter, and the increment vector
VLD1 (ctrPtr), [CTR.B16]
VEOR INC.B16, INC.B16, INC.B16
MOVD $1, H0
VMOV H0, INC.S[3]
VREV32 CTR.B16, CTR.B16
VADD CTR.S4, INC.S4, CTR.S4
// Skip to <8 blocks loop
CMP $128, srcPtrLen
MOVD rk, H0
// For SM4 round keys are stored in: K0 .. K7
VLD1.P 64(H0), [K0.S4, K1.S4, K2.S4, K3.S4]
VLD1.P 64(H0), [K4.S4, K5.S4, K6.S4, K7.S4]
BLT startSingles
octetsLoop:
SUB $128, srcPtrLen
VMOV CTR.B16, B0.B16
VADD B0.S4, INC.S4, B1.S4
VADD B1.S4, INC.S4, B2.S4
VADD B2.S4, INC.S4, B3.S4
VADD B3.S4, INC.S4, B4.S4
VADD B4.S4, INC.S4, B5.S4
VADD B5.S4, INC.S4, B6.S4
VADD B6.S4, INC.S4, B7.S4
VADD B7.S4, INC.S4, CTR.S4
sm4eEnc8blocks()
VREV32 B0.B16, T1.B16
VREV32 B1.B16, T2.B16
VREV32 B2.B16, B2.B16
VREV32 B3.B16, B3.B16
VREV32 B4.B16, B4.B16
VREV32 B5.B16, B5.B16
VREV32 B6.B16, B6.B16
VREV32 B7.B16, B7.B16
VLD1.P 32(srcPtr), [B0.B16, B1.B16]
VEOR B0.B16, T1.B16, T1.B16
VEOR B1.B16, T2.B16, T2.B16
VST1.P [T1.B16, T2.B16], 32(dstPtr)
VLD1.P 32(pTbl), [T1.B16, T2.B16]
VREV64 B0.B16, B0.B16
VEOR ACC0.B16, B0.B16, B0.B16
VEXT $8, B0.B16, B0.B16, T0.B16
VEOR B0.B16, T0.B16, T0.B16
VPMULL B0.D1, T1.D1, ACC1.Q1
VPMULL2 B0.D2, T1.D2, ACC0.Q1
VPMULL T0.D1, T2.D1, ACCM.Q1
mulRound(B1)
VLD1.P 32(srcPtr), [B0.B16, B1.B16]
VEOR B2.B16, B0.B16, T1.B16
VEOR B3.B16, B1.B16, T2.B16
VST1.P [T1.B16, T2.B16], 32(dstPtr)
mulRound(B0)
mulRound(B1)
VLD1.P 32(srcPtr), [B0.B16, B1.B16]
VEOR B4.B16, B0.B16, T1.B16
VEOR B5.B16, B1.B16, T2.B16
VST1.P [T1.B16, T2.B16], 32(dstPtr)
mulRound(B0)
mulRound(B1)
VLD1.P 32(srcPtr), [B0.B16, B1.B16]
VEOR B6.B16, B0.B16, T1.B16
VEOR B7.B16, B1.B16, T2.B16
VST1.P [T1.B16, T2.B16], 32(dstPtr)
mulRound(B0)
mulRound(B1)
MOVD pTblSave, pTbl
reduce()
CMP $128, srcPtrLen
BGE octetsLoop
startSingles:
CBZ srcPtrLen, done
ADD $14*16, pTbl
// Preload H and its Karatsuba precomp
VLD1.P (pTbl), [T1.B16, T2.B16]
singlesLoop:
CMP $16, srcPtrLen
BLT tail
SUB $16, srcPtrLen
VLD1.P 16(srcPtr), [T0.B16]
VREV64 T0.B16, B5.B16
VMOV CTR.B16, B0.B16
VADD CTR.S4, INC.S4, CTR.S4
sm4eEnc1block()
VREV32 B0.B16, B0.B16
singlesLast:
VEOR T0.B16, B0.B16, B0.B16
VST1.P [B0.B16], 16(dstPtr)
VEOR ACC0.B16, B5.B16, B5.B16
VEXT $8, B5.B16, B5.B16, T0.B16
VEOR B5.B16, T0.B16, T0.B16
VPMULL B5.D1, T1.D1, ACC1.Q1
VPMULL2 B5.D2, T1.D2, ACC0.Q1
VPMULL T0.D1, T2.D1, ACCM.Q1
reduce()
B singlesLoop
tail:
CBZ srcPtrLen, done
VMOV CTR.B16, B0.B16
VADD CTR.S4, INC.S4, CTR.S4
sm4eEnc1block()
VREV32 B0.B16, B0.B16
tailLast:
// Assuming it is safe to load past dstPtr due to the presence of the tag
// B5 stored last ciphertext
VLD1 (srcPtr), [B5.B16]
VEOR B5.B16, B0.B16, B0.B16
VEOR T3.B16, T3.B16, T3.B16
MOVD $0, H1
SUB $1, H1
TBZ $3, srcPtrLen, ld4 // Test if srcPtrLen < 8, if yes, goto ld4
VMOV B0.D[0], H0
MOVD.P H0, 8(dstPtr)
VMOV H1, T3.D[0]
VEXT $8, ZERO.B16, B0.B16, B0.B16
ld4:
TBZ $2, srcPtrLen, ld2 // Test if srcPtrLen < 4, if yes, goto ld2
VMOV B0.S[0], H0
MOVW.P H0, 4(dstPtr)
VEXT $12, T3.B16, ZERO.B16, T3.B16
VMOV H1, T3.S[0]
VEXT $4, ZERO.B16, B0.B16, B0.B16
ld2:
TBZ $1, srcPtrLen, ld1 // Test if srcPtrLen < 2, if yes, goto ld1
VMOV B0.H[0], H0
MOVH.P H0, 2(dstPtr)
VEXT $14, T3.B16, ZERO.B16, T3.B16
VMOV H1, T3.H[0]
VEXT $2, ZERO.B16, B0.B16, B0.B16
ld1:
TBZ $0, srcPtrLen, ld0 // Test if srcPtrLen < 1, if yes, goto ld0
VMOV B0.B[0], H0
MOVB.P H0, 1(dstPtr)
VEXT $15, T3.B16, ZERO.B16, T3.B16
VMOV H1, T3.B[0]
ld0:
VAND T3.B16, B5.B16, B5.B16
VREV64 B5.B16, B5.B16
VEOR ACC0.B16, B5.B16, B5.B16
VEXT $8, B5.B16, B5.B16, T0.B16
VEOR B5.B16, T0.B16, T0.B16
VPMULL B5.D1, T1.D1, ACC1.Q1
VPMULL2 B5.D2, T1.D2, ACC0.Q1
VPMULL T0.D1, T2.D1, ACCM.Q1
reduce()
done:
VST1 [ACC0.B16], (tPtr)
RET

sm4/gen_arm64_ni.go (new file)

@@ -0,0 +1,137 @@
// Not used yet!!!
// go run gen_arm64_ni.go
//go:build ignore
// +build ignore
package main
import (
"bytes"
"fmt"
"log"
"math/bits"
"os"
)
//SM4E <Vd>.4S, <Vn>.4S
func sm4e(Vd, Vn byte) uint32 {
inst := uint32(0xcec08400) | uint32(Vd&0x1f) | uint32(Vn&0x1f)<<5
return bits.ReverseBytes32(inst)
}
//SM4EKEY <Vd>.4S, <Vn>.4S, <Vm>.4S
func sm4ekey(Vd, Vn, Vm byte) uint32 {
inst := uint32(0xce60c800) | uint32(Vd&0x1f) | uint32(Vn&0x1f)<<5 | (uint32(Vm&0x1f) << 16)
return bits.ReverseBytes32(inst)
}
func sm4ekeyRound(buf *bytes.Buffer, d, n, m byte) {
fmt.Fprintf(buf, "\tWORD $0x%08x //SM4EKEY V%d.4S, V%d.4S, V%d.4S\n", sm4ekey(d, n, m), d, n, m)
}
func sm4eRound(buf *bytes.Buffer, d, n byte) {
fmt.Fprintf(buf, "\tWORD $0x%08x //SM4E V%d.4S, V%d.4S\n", sm4e(d, n), d, n)
}
func main() {
buf := new(bytes.Buffer)
fmt.Fprint(buf, `
// Generated by gen_arm64_ni.go. DO NOT EDIT.
#include "textflag.h"
// func expandKeySM4E(key *byte, fk, ck, enc *uint32)
TEXT ·expandKeySM4E(SB),NOSPLIT,$0
MOVD key+0(FP), R8
MOVD fk+8(FP), R9
MOVD ck+16(FP), R10
MOVD enc+24(FP), R11
VLD1 (R8), [V9.B16]
VREV32 V9.B16, V9.B16
VLD1 (R9), [V8.S4]
VEOR V9, V8, V9
VLD1.P 64(R10), [V0.S4, V1.S4, V2.S4, V3.S4]
`[1:])
sm4ekeyRound(buf, 8, 9, 0)
sm4ekeyRound(buf, 9, 8, 1)
fmt.Fprintf(buf, "\tVST1.P [V8.S4, V9.S4], 32(R11)\n")
sm4ekeyRound(buf, 8, 9, 2)
sm4ekeyRound(buf, 9, 8, 3)
fmt.Fprintf(buf, "\tVST1.P [V8.S4, V9.S4], 32(R11)\n")
fmt.Fprintf(buf, "\tVLD1.P 64(R10), [V0.S4, V1.S4, V2.S4, V3.S4]\n")
sm4ekeyRound(buf, 8, 9, 0)
sm4ekeyRound(buf, 9, 8, 1)
fmt.Fprintf(buf, "\tVST1.P [V8.S4, V9.S4], 32(R11)\n")
sm4ekeyRound(buf, 8, 9, 2)
sm4ekeyRound(buf, 9, 8, 3)
fmt.Fprintf(buf, `
VST1.P [V8.S4, V9.S4], 32(R11)
RET
`[1:])
fmt.Fprint(buf, `
// func encryptBlockSM4E(xk *uint32, dst, src *byte)
TEXT ·encryptBlockSM4E(SB),NOSPLIT,$0
MOVD xk+0(FP), R8
MOVD dst+8(FP), R9
MOVD src+16(FP), R10
VLD1 (R10), [V8.B16]
VREV32 V8.B16, V8.B16
VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
`[1:])
sm4eRound(buf, 8, 0)
sm4eRound(buf, 8, 1)
sm4eRound(buf, 8, 2)
sm4eRound(buf, 8, 3)
fmt.Fprintf(buf, "\tVLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]\n")
sm4eRound(buf, 8, 0)
sm4eRound(buf, 8, 1)
sm4eRound(buf, 8, 2)
sm4eRound(buf, 8, 3)
fmt.Fprintf(buf, `
VREV32 V8.B16, V8.B16
VST1 [V8.B16], (R9)
RET
`[1:])
fmt.Fprint(buf, `
// func encryptBlocksSM4E(xk *uint32, dst, src *byte)
TEXT ·encryptBlocksSM4E(SB),NOSPLIT,$0
MOVD xk+0(FP), R8
MOVD dst+8(FP), R9
MOVD src+16(FP), R10
VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
VLD1.P 64(R8), [V4.S4, V5.S4, V6.S4, V7.S4]
`[1:])
for i := 0; i < 4; i++ {
fmt.Fprintf(buf, "\tVLD1.P 16(R10), [V8.B16]\n")
fmt.Fprintf(buf, "\tVREV32 V8.B16, V8.B16\n")
sm4eRound(buf, 8, 0)
sm4eRound(buf, 8, 1)
sm4eRound(buf, 8, 2)
sm4eRound(buf, 8, 3)
sm4eRound(buf, 8, 4)
sm4eRound(buf, 8, 5)
sm4eRound(buf, 8, 6)
sm4eRound(buf, 8, 7)
fmt.Fprintf(buf, "\tVREV32 V8.B16, V8.B16\n")
fmt.Fprintf(buf, "\tVST1.P [V8.B16], 16(R9)\n\n")
}
fmt.Fprintf(buf, `
RET
`[1:])
src := buf.Bytes()
// fmt.Println(string(src))
err := os.WriteFile("sm4e_arm64.s", src, 0644)
if err != nil {
log.Fatal(err)
}
}
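As a sanity check, these encoders reproduce the WORD constants hard-coded in the hand-written assembly earlier in this PR; a hypothetical test (not part of the commit, needs import "testing", and the file's build-ignore tag would have to be lifted to run it):

	func TestInstEncodings(t *testing.T) {
		if got := sm4e(8, 0); got != 0x0884c0ce {
			t.Errorf("SM4E V8.4S, V0.4S: got %#08x, want 0x0884c0ce", got)
		}
		if got := sm4ekey(8, 9, 0); got != 0x28c960ce {
			t.Errorf("SM4EKEY V8.4S, V9.4S, V0.4S: got %#08x, want 0x28c960ce", got)
		}
	}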


@@ -12,16 +12,16 @@ import (
// sm4CipherGCM implements crypto/cipher.gcmAble so that crypto/cipher.NewGCM
// will use the optimised implementation in this file when possible. Instances
-// of this type only exist when hasGCMAsm returns true.
+// of this type only exist when hasGCMAsm and hasAES returns true.
type sm4CipherGCM struct {
-sm4CipherAsm
+*sm4CipherAsm
}
// Assert that sm4CipherGCM implements the gcmAble interface.
var _ gcmAble = (*sm4CipherGCM)(nil)
//go:noescape
-func gcmSm4Init(productTable *[256]byte, rk []uint32)
+func gcmSm4Init(productTable *[256]byte, rk []uint32, inst int)
//go:noescape
func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
@@ -35,6 +35,33 @@ func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)
//go:noescape
func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
// gcmSm4InitInst is used for test
func gcmSm4InitInst(productTable *[256]byte, rk []uint32) {
if supportSM4 {
gcmSm4Init(productTable, rk, INST_SM4)
} else {
gcmSm4Init(productTable, rk, INST_AES)
}
}
// gcmSm4EncInst is used for test
func gcmSm4EncInst(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) {
if supportSM4 {
gcmSm4niEnc(productTable, dst, src, ctr, T, rk)
} else {
gcmSm4Enc(productTable, dst, src, ctr, T, rk)
}
}
// gcmSm4DecInst is used for test
func gcmSm4DecInst(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) {
if supportSM4 {
gcmSm4niDec(productTable, dst, src, ctr, T, rk)
} else {
gcmSm4Dec(productTable, dst, src, ctr, T, rk)
}
}
type gcmAsm struct {
gcm
bytesProductTable [256]byte
@@ -44,10 +71,10 @@ type gcmAsm struct {
// called by crypto/cipher.NewGCM via the gcmAble interface.
func (c *sm4CipherGCM) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
g := &gcmAsm{}
-g.cipher = &c.sm4CipherAsm
+g.cipher = c.sm4CipherAsm
g.nonceSize = nonceSize
g.tagSize = tagSize
-gcmSm4Init(&g.bytesProductTable, g.cipher.enc)
+gcmSm4Init(&g.bytesProductTable, g.cipher.enc, INST_AES)
return g, nil
}


@@ -11,12 +11,12 @@ import (
func genPrecomputeTable() *gcmAsm {
key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10}
-c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, 4, 64}
-expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0])
+c := &sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, 4, 64}
+expandKey(key, c.enc, c.dec)
c1 := &sm4CipherGCM{c}
g := &gcmAsm{}
-g.cipher = &c1.sm4CipherAsm
-gcmSm4Init(&g.bytesProductTable, g.cipher.enc)
+g.cipher = c1.sm4CipherAsm
+gcmSm4InitInst(&g.bytesProductTable, g.cipher.enc)
return g
}
@@ -145,13 +145,13 @@ func TestBothDataPlaintext(t *testing.T) {
func createGcm() *gcmAsm {
key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10}
-c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, 4, 64}
-expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0])
+c := &sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, 4, 64}
+expandKey(key, c.enc, c.dec)
c1 := &sm4CipherGCM{c}
g := &gcmAsm{}
-g.cipher = &c1.sm4CipherAsm
+g.cipher = c1.sm4CipherAsm
g.tagSize = 16
-gcmSm4Init(&g.bytesProductTable, g.cipher.enc)
+gcmSm4InitInst(&g.bytesProductTable, g.cipher.enc)
return g
}
@@ -214,7 +214,7 @@ func TestGcmSm4Enc(t *testing.T) {
out2 := make([]byte, len(test.plaintext)+gcm.tagSize)
gcmSm4Data(&gcm.bytesProductTable, []byte("emmansun"), &tagOut2)
-gcmSm4Enc(&gcm.bytesProductTable, out2, []byte(test.plaintext), &counter2, &tagOut2, gcm.cipher.enc)
+gcmSm4EncInst(&gcm.bytesProductTable, out2, []byte(test.plaintext), &counter2, &tagOut2, gcm.cipher.enc)
if hex.EncodeToString(out1) != hex.EncodeToString(out2) {
t.Errorf("#%d: out expected %s, got %s", i, hex.EncodeToString(out1), hex.EncodeToString(out2))
}
@@ -244,7 +244,7 @@ func TestGcmSm4Dec(t *testing.T) {
out2 := make([]byte, len(test.plaintext)+gcm.tagSize)
gcmSm4Data(&gcm.bytesProductTable, []byte("emmansun"), &tagOut2)
-gcmSm4Dec(&gcm.bytesProductTable, out2, out1, &counter2, &tagOut2, gcm.cipher.enc)
+gcmSm4DecInst(&gcm.bytesProductTable, out2, out1, &counter2, &tagOut2, gcm.cipher.enc)
if hex.EncodeToString([]byte(test.plaintext)) != hex.EncodeToString(out2[:len(test.plaintext)]) {
t.Errorf("#%d: out expected %s, got %s", i, hex.EncodeToString([]byte(test.plaintext)), hex.EncodeToString(out2[:len(test.plaintext)]))

sm4/sm4ni_gcm_asm.go (new file)

@@ -0,0 +1,152 @@
//go:build amd64 || arm64
// +build amd64 arm64
package sm4
import (
"crypto/cipher"
goSubtle "crypto/subtle"
"github.com/emmansun/gmsm/internal/subtle"
)
//go:noescape
func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
//go:noescape
func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
// sm4CipherNIGCM implements crypto/cipher.gcmAble so that crypto/cipher.NewGCM
// will use the optimised implementation in this file when possible. Instances
// of this type only exist when hasGCMAsm and hasSM4 returns true.
type sm4CipherNIGCM struct {
*sm4CipherNI
}
// Assert that sm4CipherNIGCM implements the gcmAble interface.
var _ gcmAble = (*sm4CipherNIGCM)(nil)
type gcmNI struct {
cipher *sm4CipherNI
nonceSize int
tagSize int
bytesProductTable [256]byte
}
func (g *gcmNI) NonceSize() int {
return g.nonceSize
}
func (g *gcmNI) Overhead() int {
return g.tagSize
}
// NewGCM returns the SM4 cipher wrapped in Galois Counter Mode. This is only
// called by crypto/cipher.NewGCM via the gcmAble interface.
func (c *sm4CipherNIGCM) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
g := &gcmNI{}
g.cipher = c.sm4CipherNI
g.nonceSize = nonceSize
g.tagSize = tagSize
gcmSm4Init(&g.bytesProductTable, g.cipher.enc, INST_SM4)
return g, nil
}
// Seal encrypts and authenticates plaintext. See the cipher.AEAD interface for
// details.
func (g *gcmNI) Seal(dst, nonce, plaintext, data []byte) []byte {
if len(nonce) != g.nonceSize {
panic("cipher: incorrect nonce length given to GCM")
}
if uint64(len(plaintext)) > ((1<<32)-2)*BlockSize {
panic("cipher: message too large for GCM")
}
var counter, tagMask [gcmBlockSize]byte
if len(nonce) == gcmStandardNonceSize {
// Init counter to nonce||1
copy(counter[:], nonce)
counter[gcmBlockSize-1] = 1
} else {
// Otherwise counter = GHASH(nonce)
gcmSm4Data(&g.bytesProductTable, nonce, &counter)
gcmSm4Finish(&g.bytesProductTable, &tagMask, &counter, uint64(len(nonce)), uint64(0))
}
g.cipher.Encrypt(tagMask[:], counter[:])
var tagOut [gcmTagSize]byte
gcmSm4Data(&g.bytesProductTable, data, &tagOut)
ret, out := subtle.SliceForAppend(dst, len(plaintext)+g.tagSize)
if subtle.InexactOverlap(out[:len(plaintext)], plaintext) {
panic("cipher: invalid buffer overlap")
}
if len(plaintext) > 0 {
gcmSm4niEnc(&g.bytesProductTable, out, plaintext, &counter, &tagOut, g.cipher.enc)
}
gcmSm4Finish(&g.bytesProductTable, &tagMask, &tagOut, uint64(len(plaintext)), uint64(len(data)))
copy(out[len(plaintext):], tagOut[:])
return ret
}
// Open authenticates and decrypts ciphertext. See the cipher.AEAD interface
// for details.
func (g *gcmNI) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) {
if len(nonce) != g.nonceSize {
panic("cipher: incorrect nonce length given to GCM")
}
// Sanity check to prevent the authentication from always succeeding if an implementation
// leaves tagSize uninitialized, for example.
if g.tagSize < gcmMinimumTagSize {
panic("cipher: incorrect GCM tag size")
}
if len(ciphertext) < g.tagSize {
return nil, errOpen
}
if uint64(len(ciphertext)) > ((1<<32)-2)*uint64(BlockSize)+uint64(g.tagSize) {
return nil, errOpen
}
tag := ciphertext[len(ciphertext)-g.tagSize:]
ciphertext = ciphertext[:len(ciphertext)-g.tagSize]
// See GCM spec, section 7.1.
var counter, tagMask [gcmBlockSize]byte
if len(nonce) == gcmStandardNonceSize {
// Init counter to nonce||1
copy(counter[:], nonce)
counter[gcmBlockSize-1] = 1
} else {
// Otherwise counter = GHASH(nonce)
gcmSm4Data(&g.bytesProductTable, nonce, &counter)
gcmSm4Finish(&g.bytesProductTable, &tagMask, &counter, uint64(len(nonce)), uint64(0))
}
g.cipher.Encrypt(tagMask[:], counter[:])
var expectedTag [gcmTagSize]byte
gcmSm4Data(&g.bytesProductTable, data, &expectedTag)
ret, out := subtle.SliceForAppend(dst, len(ciphertext))
if subtle.InexactOverlap(out, ciphertext) {
panic("cipher: invalid buffer overlap")
}
if len(ciphertext) > 0 {
gcmSm4niDec(&g.bytesProductTable, out, ciphertext, &counter, &expectedTag, g.cipher.enc)
}
gcmSm4Finish(&g.bytesProductTable, &tagMask, &expectedTag, uint64(len(ciphertext)), uint64(len(data)))
if goSubtle.ConstantTimeCompare(expectedTag[:g.tagSize], tag) != 1 {
for i := range out {
out[i] = 0
}
return nil, errOpen
}
return ret, nil
}
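End to end, callers reach this NI path transparently through crypto/cipher; a usage sketch:

	package main

	import (
		"crypto/cipher"
		"crypto/rand"
		"fmt"

		"github.com/emmansun/gmsm/sm4"
	)

	func main() {
		key := make([]byte, 16)
		rand.Read(key)
		block, _ := sm4.NewCipher(key)  // sm4CipherNIGCM when SM4 NI and PMULL exist
		aead, _ := cipher.NewGCM(block) // NewGCM discovers the gcmAble fast path
		nonce := make([]byte, aead.NonceSize())
		rand.Read(nonce)
		ct := aead.Seal(nil, nonce, []byte("hello sm4-gcm"), nil)
		pt, err := aead.Open(nil, nonce, ct, nil)
		fmt.Println(string(pt), err)
	}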