sm4: Support AES-NI + AVX #135

Sun Yimin 2023-07-03 12:00:27 +08:00 committed by GitHub
parent 1e5ba7f5a1
commit d3e6412258
6 changed files with 904 additions and 85 deletions

View File

@@ -9,6 +9,22 @@ import (
"github.com/emmansun/gmsm/sm4"
)
func benchmarkEBCEncrypt1K(b *testing.B, block cipher.Block) {
buf := make([]byte, 1024)
b.SetBytes(int64(len(buf)))
ecb := smcipher.NewECBEncrypter(block)
for i := 0; i < b.N; i++ {
ecb.CryptBlocks(buf, buf)
}
}
func BenchmarkSM4EBCEncrypt1K(b *testing.B) {
var key [16]byte
c, _ := sm4.NewCipher(key[:])
benchmarkEBCEncrypt1K(b, c)
}
func benchmarkCBCEncrypt1K(b *testing.B, block cipher.Block) {
buf := make([]byte, 1024)
b.SetBytes(int64(len(buf)))
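
The new benchmarks measure ECB throughput over 1 KiB buffers through smcipher.NewECBEncrypter. As a point of reference, a minimal ECB encrypter over a cipher.Block follows the pattern sketched below; this is an illustration of the mode the benchmark exercises, not the gmsm smcipher implementation.

package ecbsketch

import "crypto/cipher"

// ecbEncrypter encrypts every block independently with the underlying
// cipher.Block; no chaining and no IV (that is what makes it ECB).
type ecbEncrypter struct{ b cipher.Block }

// NewECBEncrypter wraps a cipher.Block as a cipher.BlockMode (illustrative).
func NewECBEncrypter(b cipher.Block) cipher.BlockMode { return &ecbEncrypter{b: b} }

func (x *ecbEncrypter) BlockSize() int { return x.b.BlockSize() }

func (x *ecbEncrypter) CryptBlocks(dst, src []byte) {
	bs := x.b.BlockSize()
	if len(src)%bs != 0 {
		panic("ecb: input not a multiple of the block size")
	}
	for len(src) > 0 {
		x.b.Encrypt(dst[:bs], src[:bs])
		src, dst = src[bs:], dst[bs:]
	}
}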

View File

@@ -213,7 +213,7 @@
#define ROUND_AND_SCHED_N_0_0(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
; \ // ############################# RND N + 0 ############################//
-RORXL $(-12), a, y0; \ // y0 = a <<< 12
+RORXL $(-12), a, y0; \ // y0 = a <<< 12, RORXL is BMI2 instr
MOVL e, y1; \
ADDL $const, y1; \
VPALIGNR $12, XDWORD0, XDWORD1, XTMP0; \ // XTMP0 = W[-13] = {w6,w5,w4,w3}
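
The RORXL $(-12), a, y0 form depends on the BMI2 RORX instruction, hence the added comment and the HasBMI2 requirement on the AVX2 path. Rotating right by (-12) & 31 = 20 bits is the same as rotating left by 12, which is the "a <<< 12" in the comment. A scalar Go equivalent, for illustration:

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	a := uint32(0x12345678)
	// RORX rotates right; the immediate is taken mod 32, so -12 becomes 20,
	// and rotating right by 20 equals rotating left by 12 ("a <<< 12").
	y0 := bits.RotateLeft32(a, 12)
	fmt.Printf("%#x\n", y0) // 0x45678123
}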

View File

@@ -165,6 +165,9 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
CMPB ·useAVX2(SB), $1
JE avx2
CMPB ·useAVX(SB), $1
JE avx
non_avx2_start:
PINSRD $0, 0(DX), t0
PINSRD $1, 16(DX), t0
@@ -238,6 +241,51 @@ loop:
done_sm4:
RET
avx:
VMOVDQU 0(DX), XWORD0
VMOVDQU 16(DX), XWORD1
VMOVDQU 32(DX), XWORD2
VMOVDQU 48(DX), XWORD3
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
XORL CX, CX
avx_loop:
AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3)
AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0)
AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1)
AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2)
ADDL $16, CX
CMPL CX, $4*32
JB avx_loop
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
VMOVDQU XWORD0, 0(BX)
VMOVDQU XWORD1, 16(BX)
VMOVDQU XWORD2, 32(BX)
VMOVDQU XWORD3, 48(BX)
RET
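
Before the round loop, the four 16-byte input blocks are byte-swapped via flip_mask and transposed so that each XMM register holds the same word position from all four blocks; one broadcast round key then feeds four blocks per AVX_SM4_ROUND, and the transpose plus bswap_mask shuffle at the end restore the normal block layout. The transpose itself is a plain 4x4 transpose of 32-bit words, roughly this in Go (illustration only):

package sm4sketch

// transpose4x4 gathers word i of every block into out[i], the layout
// TRANSPOSE_MATRIX produces before (and undoes after) the vectorized rounds.
func transpose4x4(in [4][4]uint32) (out [4][4]uint32) {
	for i := 0; i < 4; i++ {
		for j := 0; j < 4; j++ {
			out[i][j] = in[j][i]
		}
	}
	return
}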
avx2:
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
CMPQ DI, $64
@@ -304,7 +352,7 @@ avx2_4blocks:
XORL CX, CX
-avx_loop:
+avx2_4blocks_loop:
AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3)
AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0)
AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1)
@@ -312,7 +360,7 @@ avx_loop:
ADDL $16, CX
CMPL CX, $4*32
-JB avx_loop
+JB avx2_4blocks_loop
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)

View File

@@ -85,9 +85,20 @@ done_sm4:
#define XDWORD2 Y6
#define XDWORD3 Y7
#define XWTMP0 X0
#define XWTMP1 X1
#define XWTMP2 X2
#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7
#define NIBBLE_MASK Y3
#define X_NIBBLE_MASK X3
#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13 // mask to convert LE -> BE
#define XDWORD Y8
#define YDWORD Y9
@@ -113,6 +124,24 @@ done_sm4:
AVX2_SM4_TAO_L1(x, y, XDWTMP0, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK); \
VPXOR x, t0, t0
// SM4 round function, AVX version, handle 128 bits
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - index: round key index immediate number
// - x: 128 bits temp register
// - y: 128 bits temp register
// - t0: 128 bits register for data as result
// - t1: 128 bits register for data
// - t2: 128 bits register for data
// - t3: 128 bits register for data
#define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(AX)(CX*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, XWTMP0); \
VPXOR x, t0, t0
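
The scalar operation this macro vectorizes is the standard SM4 round t0 ^= L(tau(t1 ^ t2 ^ t3 ^ rk)), where tau is the byte-wise S-box substitution and L is the encryption linear transform (tao_l1 in the assembly combines the two). A reference sketch in Go, with the 256-byte S-box table omitted:

package sm4sketch

import "math/bits"

// sbox is the standard SM4 S-box; the 256-byte table is omitted here.
var sbox [256]byte

// tau substitutes every byte of the word through the S-box.
func tau(x uint32) uint32 {
	return uint32(sbox[x>>24])<<24 |
		uint32(sbox[x>>16&0xff])<<16 |
		uint32(sbox[x>>8&0xff])<<8 |
		uint32(sbox[x&0xff])
}

// l is the encryption linear transform; tao_l1 is tau followed by l.
func l(b uint32) uint32 {
	return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
		bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
}

// round is the scalar form of AVX_SM4_ROUND: t0 ^= tao_l1(t1 ^ t2 ^ t3 ^ rk).
// The macro performs the same update on four blocks at once.
func round(t0, t1, t2, t3, rk uint32) uint32 {
	return t0 ^ l(tau(t1^t2^t3^rk))
}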
// func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
MOVQ xk+0(FP), AX
@@ -123,6 +152,9 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
CMPB ·useAVX2(SB), $1
JE avx2
CMPB ·useAVX(SB), $1
JE avx
non_avx2_start:
PINSRD $0, 0(DX), t0
PINSRD $1, 16(DX), t0
@@ -180,6 +212,56 @@ loop:
done_sm4:
RET
avx:
VMOVDQU 0(DX), XWORD0
VMOVDQU 16(DX), XWORD1
VMOVDQU 32(DX), XWORD2
VMOVDQU 48(DX), XWORD3
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
XORL CX, CX
avx_loop:
AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3)
AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0)
AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1)
AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2)
ADDL $16, CX
CMPL CX, $4*32
JB avx_loop
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
VPXOR 0(SI), XWORD0, XWORD0
VPXOR 16(SI), XWORD1, XWORD1
VPXOR 32(SI), XWORD2, XWORD2
VPXOR 48(SI), XWORD3, XWORD3
VMOVDQU XWORD0, 0(BX)
VMOVDQU XWORD1, 16(BX)
VMOVDQU XWORD2, 32(BX)
VMOVDQU XWORD3, 48(BX)
RET
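
Compared with the encryption path, the decryption path adds the VPXOR 0(SI) through 48(SI) sequence, which applies the CBC chaining step: each decrypted block is XORed with the preceding ciphertext block, the first one with the IV (P_i = D_K(C_i) ^ C_{i-1}). A scalar sketch of that step; decrypt4 is a hypothetical stand-in for the four-block SM4 decryption done by the rounds above:

package sm4sketch

// cbcChain4 shows the chaining applied after one batch of four 16-byte
// blocks is decrypted: XOR each decrypted block with the previous
// ciphertext block, the first one with the IV.
func cbcChain4(decrypt4 func(dst, src []byte), dst, src, iv []byte) {
	const bs = 16
	decrypt4(dst[:4*bs], src[:4*bs]) // raw block decryption, not yet chained
	prev := iv
	for i := 0; i < 4*bs; i += bs {
		for j := 0; j < bs; j++ {
			dst[i+j] ^= prev[j]
		}
		prev = src[i : i+bs]
	}
}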
avx2:
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK

View File

@@ -14,7 +14,8 @@ import (
var supportSM4 = cpu.ARM64.HasSM4 && os.Getenv("DISABLE_SM4NI") != "1"
var supportsAES = cpu.X86.HasAES || cpu.ARM64.HasAES
var supportsGFMUL = cpu.X86.HasPCLMULQDQ || cpu.ARM64.HasPMULL
var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
var useAVX = cpu.X86.HasAVX
const (
INST_AES int = iota
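
The assembly entry points consult ·useAVX2 and ·useAVX in that order before falling back to the SSE code, so the effective dispatch is the switch below. The cpu package is golang.org/x/sys/cpu for illustration; gmsm feeds the flags through these package-level variables read by the assembly.

package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

// pickPath mirrors the checks at the top of encryptBlocksAsm and
// decryptBlocksChain: AVX2 first (BMI2 is needed for RORX), then AVX,
// then the plain SSE fallback.
func pickPath() string {
	switch {
	case cpu.X86.HasAVX2 && cpu.X86.HasBMI2:
		return "avx2"
	case cpu.X86.HasAVX:
		return "avx"
	default:
		return "non_avx2_start"
	}
}

func main() { fmt.Println(pickPath()) }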

File diff suppressed because it is too large