From 8a61c0ad3fee9db37313d6f0fdc4efa055d5c34b Mon Sep 17 00:00:00 2001
From: emmansun <emman.sun@foxmail.com>
Date: Sun, 2 Jan 2022 11:22:24 +0800
Subject: [PATCH] support gcm arm64 & cbc/ctr use asm cipher

---
 sm4/{cbc_amd64.go => cbc_cipher_asm.go} |   3 +
 sm4/cipher_asm.go                       |   8 +-
 sm4/{ctr_amd64.go => ctr_cipher_asm.go} |   3 +
 sm4/gcm_arm64.s                         | 325 ++++++++++++++++++++++++
 sm4/{gcm_amd64.go => gcm_cipher_asm.go} |   3 +
 sm4/sm4_gcm.go                          |   4 +-
 6 files changed, 340 insertions(+), 6 deletions(-)
 rename sm4/{cbc_amd64.go => cbc_cipher_asm.go} (94%)
 rename sm4/{ctr_amd64.go => ctr_cipher_asm.go} (93%)
 create mode 100644 sm4/gcm_arm64.s
 rename sm4/{gcm_amd64.go => gcm_cipher_asm.go} (96%)

diff --git a/sm4/cbc_amd64.go b/sm4/cbc_cipher_asm.go
similarity index 94%
rename from sm4/cbc_amd64.go
rename to sm4/cbc_cipher_asm.go
index e7f8509..889eed4 100644
--- a/sm4/cbc_amd64.go
+++ b/sm4/cbc_cipher_asm.go
@@ -1,3 +1,6 @@
+//go:build amd64 || arm64
+// +build amd64 arm64
+
 package sm4
 
 import (
diff --git a/sm4/cipher_asm.go b/sm4/cipher_asm.go
index b8c7945..98cb55f 100644
--- a/sm4/cipher_asm.go
+++ b/sm4/cipher_asm.go
@@ -12,7 +12,7 @@ import (
 
 var supportSM4 = cpu.ARM64.HasSM4
 var supportsAES = cpu.X86.HasAES || cpu.ARM64.HasAES
-var supportsGFMUL = cpu.X86.HasPCLMULQDQ // || cpu.ARM64.HasPMULL
+var supportsGFMUL = cpu.X86.HasPCLMULQDQ || cpu.ARM64.HasPMULL
 
 //go:noescape
 func encryptBlocksAsm(xk *uint32, dst, src *byte)
@@ -33,9 +33,9 @@ func newCipher(key []byte) (cipher.Block, error) {
 	}
 	c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}}
 	expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0])
-	//if supportsAES && supportsGFMUL {
-	//	return &sm4CipherGCM{c}, nil
-	//}
+	if supportsAES && supportsGFMUL {
+		return &sm4CipherGCM{c}, nil
+	}
 	return &c, nil
 }
 
diff --git a/sm4/ctr_amd64.go b/sm4/ctr_cipher_asm.go
similarity index 93%
rename from sm4/ctr_amd64.go
rename to sm4/ctr_cipher_asm.go
index 1d5782e..57703a3 100644
--- a/sm4/ctr_amd64.go
+++ b/sm4/ctr_cipher_asm.go
@@ -1,3 +1,6 @@
+//go:build amd64 || arm64
+// +build amd64 arm64
+
 package sm4
 
 import (
diff --git a/sm4/gcm_arm64.s b/sm4/gcm_arm64.s
new file mode 100644
index 0000000..bbc1165
--- /dev/null
+++ b/sm4/gcm_arm64.s
@@ -0,0 +1,325 @@
+#include "textflag.h"
+
+#define B0 V0
+#define B1 V1
+#define B2 V2
+#define B3 V3
+#define B4 V4
+#define B5 V5
+#define B6 V6
+#define B7 V7
+
+#define ACC0 V8
+#define ACC1 V9
+#define ACCM V10
+
+#define T0 V11
+#define T1 V12
+#define T2 V13
+#define T3 V14
+
+#define POLY V15
+#define ZERO V16
+#define INC V17
+#define CTR V18
+
+#define K0 V19
+#define K1 V20
+#define K2 V21
+#define K3 V22
+#define K4 V23
+#define K5 V24
+#define K6 V25
+#define K7 V26
+#define K8 V27
+#define K9 V28
+#define K10 V29
+#define K11 V30
+#define KLAST V31
+
+#define reduce() \
+	VEOR	ACC0.B16, ACCM.B16, ACCM.B16     \
+	VEOR	ACC1.B16, ACCM.B16, ACCM.B16     \
+	VEXT	$8, ZERO.B16, ACCM.B16, T0.B16   \
+	VEXT	$8, ACCM.B16, ZERO.B16, ACCM.B16 \
+	VEOR	ACCM.B16, ACC0.B16, ACC0.B16     \
+	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
+	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
+	VEXT	$8, ACC0.B16, ACC0.B16, ACC0.B16 \
+	VEOR	T0.B16, ACC0.B16, ACC0.B16       \
+	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
+	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
+	VEXT	$8, ACC1.B16, ACC1.B16, ACC1.B16 \
+	VEOR	ACC1.B16, ACC0.B16, ACC0.B16     \
+
+// func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
+TEXT ·gcmSm4Finish(SB),NOSPLIT,$0    
+#define pTbl R0
+#define tMsk R1
+#define tPtr R2
+#define plen R3
+#define dlen R4
+
+	MOVD	$0xC2, R1
+	LSL	$56, R1
+	MOVD	$1, R0
+	VMOV	R1, POLY.D[0]
+	VMOV	R0, POLY.D[1]
+	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
+
+	MOVD	productTable+0(FP), pTbl
+	MOVD	tagMask+8(FP), tMsk
+	MOVD	T+16(FP), tPtr
+	MOVD	pLen+24(FP), plen
+	MOVD	dLen+32(FP), dlen
+
+	VLD1	(tPtr), [ACC0.B16]
+	VLD1	(tMsk), [B1.B16]
+
+	LSL	$3, plen
+	LSL	$3, dlen
+
+	VMOV	dlen, B0.D[0]
+	VMOV	plen, B0.D[1]
+
+	ADD	$14*16, pTbl
+	VLD1.P	(pTbl), [T1.B16, T2.B16]
+
+	VEOR	ACC0.B16, B0.B16, B0.B16
+
+	VEXT	$8, B0.B16, B0.B16, T0.B16
+	VEOR	B0.B16, T0.B16, T0.B16
+	VPMULL	B0.D1, T1.D1, ACC1.Q1
+	VPMULL2	B0.D2, T1.D2, ACC0.Q1
+	VPMULL	T0.D1, T2.D1, ACCM.Q1
+
+	reduce()
+
+	VREV64	ACC0.B16, ACC0.B16
+	VEOR	B1.B16, ACC0.B16, ACC0.B16
+
+	VST1	[ACC0.B16], (tPtr)
+	RET
+#undef pTbl
+#undef tMsk
+#undef tPtr
+#undef plen
+#undef dlen
+
+// func precomputeTableAsm(productTable *[256]byte, src *[16]byte)
+TEXT ·precomputeTableAsm(SB),NOSPLIT,$0
+#define pTbl R0
+#define SRC R1
+#define I R3
+
+	MOVD productTable+0(FP), pTbl
+	MOVD src+8(FP), SRC
+
+	MOVD	$0xC2, I
+	LSL	$56, I
+	VMOV	I, POLY.D[0]
+	MOVD	$1, I
+	VMOV	I, POLY.D[1]
+	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
+
+	VLD1 (SRC), [B0.B16]
+	VREV64	B0.B16, B0.B16
+
+	// Multiply by 2 modulo P
+	VMOV	B0.D[0], I
+	ASR	$63, I
+	VMOV	I, T1.D[0]
+	VMOV	I, T1.D[1]
+	VAND	POLY.B16, T1.B16, T1.B16
+	VUSHR	$63, B0.D2, T2.D2
+	VEXT	$8, ZERO.B16, T2.B16, T2.B16
+	VSHL	$1, B0.D2, B0.D2
+	VEOR	T1.B16, B0.B16, B0.B16
+	VEOR	T2.B16, B0.B16, B0.B16 // Can avoid this when VSLI is available
+
+	// Karatsuba pre-computation
+	VEXT	$8, B0.B16, B0.B16, B1.B16
+	VEOR	B0.B16, B1.B16, B1.B16
+
+	ADD	$14*16, pTbl
+	VST1	[B0.B16, B1.B16], (pTbl)
+	SUB	$2*16, pTbl
+
+	VMOV	B0.B16, B2.B16
+	VMOV	B1.B16, B3.B16
+
+	MOVD	$7, I
+
+initLoop:
+	// Compute powers of H
+	SUBS	$1, I
+
+	VPMULL	B0.D1, B2.D1, T1.Q1
+	VPMULL2	B0.D2, B2.D2, T0.Q1
+	VPMULL	B1.D1, B3.D1, T2.Q1
+	VEOR	T0.B16, T2.B16, T2.B16
+	VEOR	T1.B16, T2.B16, T2.B16
+	VEXT	$8, ZERO.B16, T2.B16, T3.B16
+	VEXT	$8, T2.B16, ZERO.B16, T2.B16
+	VEOR	T2.B16, T0.B16, T0.B16
+	VEOR	T3.B16, T1.B16, T1.B16
+	VPMULL	POLY.D1, T0.D1, T2.Q1
+	VEXT	$8, T0.B16, T0.B16, T0.B16
+	VEOR	T2.B16, T0.B16, T0.B16
+	VPMULL	POLY.D1, T0.D1, T2.Q1
+	VEXT	$8, T0.B16, T0.B16, T0.B16
+	VEOR	T2.B16, T0.B16, T0.B16
+	VEOR	T1.B16, T0.B16, B2.B16
+	VMOV	B2.B16, B3.B16
+	VEXT	$8, B2.B16, B2.B16, B2.B16
+	VEOR	B2.B16, B3.B16, B3.B16
+
+	VST1	[B2.B16, B3.B16], (pTbl)
+	SUB	$2*16, pTbl
+
+	BNE	initLoop
+	RET
+#undef I
+#undef NR
+#undef KS
+#undef pTbl	
+
+// func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)
+TEXT ·gcmSm4Data(SB),NOSPLIT,$0
+#define pTbl R0
+#define aut R1
+#define tPtr R2
+#define autLen R3
+#define H0 R4
+#define pTblSave R5
+
+#define mulRound(X) \
+	VLD1.P	32(pTbl), [T1.B16, T2.B16] \
+	VREV64	X.B16, X.B16               \
+	VEXT	$8, X.B16, X.B16, T0.B16   \
+	VEOR	X.B16, T0.B16, T0.B16      \
+	VPMULL	X.D1, T1.D1, T3.Q1         \
+	VEOR	T3.B16, ACC1.B16, ACC1.B16 \
+	VPMULL2	X.D2, T1.D2, T3.Q1         \
+	VEOR	T3.B16, ACC0.B16, ACC0.B16 \
+	VPMULL	T0.D1, T2.D1, T3.Q1        \
+	VEOR	T3.B16, ACCM.B16, ACCM.B16
+
+	MOVD	productTable+0(FP), pTbl
+	MOVD	data_base+8(FP), aut
+	MOVD	data_len+16(FP), autLen
+	MOVD	T+32(FP), tPtr
+
+	VEOR	ACC0.B16, ACC0.B16, ACC0.B16
+	CBZ	autLen, dataBail
+
+	MOVD	$0xC2, H0
+	LSL	$56, H0
+	VMOV	H0, POLY.D[0]
+	MOVD	$1, H0
+	VMOV	H0, POLY.D[1]
+	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
+	MOVD	pTbl, pTblSave
+
+	CMP	$13, autLen
+	BEQ	dataTLS
+	CMP	$128, autLen
+	BLT	startSinglesLoop
+	B	octetsLoop
+
+dataTLS:
+	ADD	$14*16, pTbl
+	VLD1.P	(pTbl), [T1.B16, T2.B16]
+	VEOR	B0.B16, B0.B16, B0.B16
+
+	MOVD	(aut), H0
+	VMOV	H0, B0.D[0]
+	MOVW	8(aut), H0
+	VMOV	H0, B0.S[2]
+	MOVB	12(aut), H0
+	VMOV	H0, B0.B[12]
+
+	MOVD	$0, autLen
+	B	dataMul
+
+octetsLoop:
+		CMP	$128, autLen
+		BLT	startSinglesLoop
+		SUB	$128, autLen
+
+		VLD1.P	32(aut), [B0.B16, B1.B16]
+
+		VLD1.P	32(pTbl), [T1.B16, T2.B16]
+		VREV64	B0.B16, B0.B16
+		VEOR	ACC0.B16, B0.B16, B0.B16
+		VEXT	$8, B0.B16, B0.B16, T0.B16
+		VEOR	B0.B16, T0.B16, T0.B16
+		VPMULL	B0.D1, T1.D1, ACC1.Q1
+		VPMULL2	B0.D2, T1.D2, ACC0.Q1
+		VPMULL	T0.D1, T2.D1, ACCM.Q1
+
+		mulRound(B1)
+		VLD1.P  32(aut), [B2.B16, B3.B16]
+		mulRound(B2)
+		mulRound(B3)
+		VLD1.P  32(aut), [B4.B16, B5.B16]
+		mulRound(B4)
+		mulRound(B5)
+		VLD1.P  32(aut), [B6.B16, B7.B16]
+		mulRound(B6)
+		mulRound(B7)
+
+		MOVD	pTblSave, pTbl
+		reduce()
+	B	octetsLoop
+
+startSinglesLoop:
+
+	ADD	$14*16, pTbl
+	VLD1.P	(pTbl), [T1.B16, T2.B16]
+
+singlesLoop:
+
+		CMP	$16, autLen
+		BLT	dataEnd
+		SUB	$16, autLen
+
+		VLD1.P	16(aut), [B0.B16]
+dataMul:
+		VREV64	B0.B16, B0.B16
+		VEOR	ACC0.B16, B0.B16, B0.B16
+
+		VEXT	$8, B0.B16, B0.B16, T0.B16
+		VEOR	B0.B16, T0.B16, T0.B16
+		VPMULL	B0.D1, T1.D1, ACC1.Q1
+		VPMULL2	B0.D2, T1.D2, ACC0.Q1
+		VPMULL	T0.D1, T2.D1, ACCM.Q1
+
+		reduce()
+
+	B	singlesLoop
+
+dataEnd:
+
+	CBZ	autLen, dataBail
+	VEOR	B0.B16, B0.B16, B0.B16
+	ADD	autLen, aut
+
+dataLoadLoop:
+		MOVB.W	-1(aut), H0
+		VEXT	$15, B0.B16, ZERO.B16, B0.B16
+		VMOV	H0, B0.B[0]
+		SUBS	$1, autLen
+		BNE	dataLoadLoop
+	B	dataMul
+
+dataBail:
+	VST1	[ACC0.B16], (tPtr)
+	RET
+
+#undef pTbl
+#undef aut
+#undef tPtr
+#undef autLen
+#undef H0
+#undef pTblSave
diff --git a/sm4/gcm_amd64.go b/sm4/gcm_cipher_asm.go
similarity index 96%
rename from sm4/gcm_amd64.go
rename to sm4/gcm_cipher_asm.go
index 662aa18..5845243 100644
--- a/sm4/gcm_amd64.go
+++ b/sm4/gcm_cipher_asm.go
@@ -1,3 +1,6 @@
+//go:build amd64 || arm64
+// +build amd64 arm64
+
 package sm4
 
 import (
diff --git a/sm4/sm4_gcm.go b/sm4/sm4_gcm.go
index 8cf74d3..2e277ab 100644
--- a/sm4/sm4_gcm.go
+++ b/sm4/sm4_gcm.go
@@ -1,5 +1,5 @@
-//go:build amd64
-// +build amd64
+//go:build amd64 || arm64
+// +build amd64 arm64
 
 package sm4