sm4: Support AES-NI + AVX #135

parent 1e5ba7f5a1
commit d3e6412258
@@ -9,6 +9,22 @@ import (
 	"github.com/emmansun/gmsm/sm4"
 )
 
+func benchmarkECBEncrypt1K(b *testing.B, block cipher.Block) {
+	buf := make([]byte, 1024)
+	b.SetBytes(int64(len(buf)))
+
+	ecb := smcipher.NewECBEncrypter(block)
+	for i := 0; i < b.N; i++ {
+		ecb.CryptBlocks(buf, buf)
+	}
+}
+
+func BenchmarkSM4ECBEncrypt1K(b *testing.B) {
+	var key [16]byte
+	c, _ := sm4.NewCipher(key[:])
+	benchmarkECBEncrypt1K(b, c)
+}
+
 func benchmarkCBCEncrypt1K(b *testing.B, block cipher.Block) {
 	buf := make([]byte, 1024)
 	b.SetBytes(int64(len(buf)))
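For context, a minimal usage sketch of the ECB wrapper the new benchmark exercises. This is illustrative only: the smcipher alias is assumed to point at the repo's cipher package (the diff does not show the test file's import block), and the zero key is a placeholder.

package main

import (
	smcipher "github.com/emmansun/gmsm/cipher" // assumed import path behind the smcipher alias
	"github.com/emmansun/gmsm/sm4"
)

func main() {
	var key [16]byte // SM4 uses a 128-bit key
	block, _ := sm4.NewCipher(key[:])

	buf := make([]byte, 1024) // length must be a multiple of the 16-byte block size
	ecb := smcipher.NewECBEncrypter(block)
	ecb.CryptBlocks(buf, buf) // in-place encryption, as in the benchmark above
}

The benchmark itself can then be run with the standard tooling, e.g. go test -bench ECBEncrypt1K.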
@@ -213,7 +213,7 @@
 
 #define ROUND_AND_SCHED_N_0_0(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
 	; \ // ############################# RND N + 0 ############################//
-	RORXL $(-12), a, y0; \ // y0 = a <<< 12
+	RORXL $(-12), a, y0; \ // y0 = a <<< 12, RORXL is BMI2 instr
 	MOVL e, y1; \
 	ADDL $const, y1; \
 	VPALIGNR $12, XDWORD0, XDWORD1, XTMP0; \ // XTMP0 = W[-13] = {w6,w5,w4,w3}
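The negative rotate count is the usual trick for expressing a left rotation with RORX, which only rotates right: the immediate is taken mod 32, so $(-12) encodes a right rotation by 20, which equals a left rotation by 12. A quick sanity check in Go (illustrative only, not part of the commit):

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	a := uint32(0x12345678)
	left := bits.RotateLeft32(a, 12)   // a <<< 12
	right := bits.RotateLeft32(a, -20) // a >>> 20, what RORXL $(-12) encodes
	fmt.Println(left == right)         // true
}

RORX is a BMI2 instruction (hence the new comment), and unlike ROL/ROR it leaves the flags register untouched.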
@@ -165,6 +165,9 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
 	CMPB ·useAVX2(SB), $1
 	JE avx2
 
+	CMPB ·useAVX(SB), $1
+	JE avx
+
non_avx2_start:
 	PINSRD $0, 0(DX), t0
 	PINSRD $1, 16(DX), t0
@@ -238,6 +241,51 @@ loop:
done_sm4:
 	RET
+
+avx:
+	VMOVDQU 0(DX), XWORD0
+	VMOVDQU 16(DX), XWORD1
+	VMOVDQU 32(DX), XWORD2
+	VMOVDQU 48(DX), XWORD3
+
+	VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
+	VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK
+
+	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
+	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
+	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
+	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
+
+	// Transpose matrix 4 x 4 32bits word
+	TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
+
+	XORL CX, CX
+
+avx_loop:
+	AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3)
+	AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0)
+	AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1)
+	AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2)
+
+	ADDL $16, CX
+	CMPL CX, $4*32
+	JB avx_loop
+
+	// Transpose matrix 4 x 4 32bits word
+	TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
+
+	VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK
+	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
+	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
+	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
+	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
+
+	VMOVDQU XWORD0, 0(BX)
+	VMOVDQU XWORD1, 16(BX)
+	VMOVDQU XWORD2, 32(BX)
+	VMOVDQU XWORD3, 48(BX)
+
+	RET
 
avx2:
 	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
 	CMPQ DI, $64
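The shape of the new avx path: load four 16-byte blocks, byte-swap them into big-endian words, transpose the 4x4 word matrix so that register i holds word i of all four blocks, run the 32 rounds (8 loop iterations of 4 macro calls each, with CX stepping 16 bytes, i.e. four round keys, per iteration), then transpose and byte-swap back before storing. A plain-Go sketch of the transpose, on the assumption that it matches what TRANSPOSE_MATRIX does (the macro body is not shown in this diff):

// transpose4x4 swaps m[i][j] with m[j][i]; afterwards row i holds
// word i of each of the four blocks, so one XMM register can process
// the same word position of all four blocks at once.
func transpose4x4(m *[4][4]uint32) {
	for i := 0; i < 4; i++ {
		for j := i + 1; j < 4; j++ {
			m[i][j], m[j][i] = m[j][i], m[i][j]
		}
	}
}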
@@ -304,7 +352,7 @@ avx2_4blocks:
 
 	XORL CX, CX
 
-avx_loop:
+avx2_4blocks_loop:
 	AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3)
 	AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0)
 	AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1)
@@ -312,7 +360,7 @@ avx_loop:
 
 	ADDL $16, CX
 	CMPL CX, $4*32
-	JB avx_loop
+	JB avx2_4blocks_loop
 
 	// Transpose matrix 4 x 4 32bits word
 	TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
@@ -85,9 +85,20 @@ done_sm4:
 #define XDWORD2 Y6
 #define XDWORD3 Y7
+
+#define XWTMP0 X0
+#define XWTMP1 X1
+#define XWTMP2 X2
+
+#define XWORD0 X4
+#define XWORD1 X5
+#define XWORD2 X6
+#define XWORD3 X7
+
 #define NIBBLE_MASK Y3
 #define X_NIBBLE_MASK X3
 
 #define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
+#define X_BYTE_FLIP_MASK X13 // mask to convert LE -> BE
 
 #define XDWORD Y8
 #define YDWORD Y9
@@ -113,6 +124,24 @@ done_sm4:
 	AVX2_SM4_TAO_L1(x, y, XDWTMP0, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK); \
 	VPXOR x, t0, t0
 
+// SM4 round function, AVX version, handle 128 bits
+// t0 ^= tao_l1(t1^t2^t3^xk)
+// parameters:
+// - index: round key index immediate number
+// - x: 128 bits temp register
+// - y: 128 bits temp register
+// - t0: 128 bits register for data as result
+// - t1: 128 bits register for data
+// - t2: 128 bits register for data
+// - t3: 128 bits register for data
+#define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
+	VPBROADCASTD (index * 4)(AX)(CX*1), x; \
+	VPXOR t1, x, x; \
+	VPXOR t2, x, x; \
+	VPXOR t3, x, x; \
+	AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, XWTMP0); \
+	VPXOR x, t0, t0
+
 // func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
 	MOVQ xk+0(FP), AX
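For reference, a scalar Go sketch of the round this macro vectorizes, following its own comment t0 ^= tao_l1(t1^t2^t3^xk). The S-box values are elided, and tauL1 is a hypothetical name for the tao_l1 step, i.e. the standard SM4 T-transform: byte-wise S-box followed by the linear rotation mix.

package sm4ref

import "math/bits"

// sbox is the standard SM4 S-box; the 256 byte values are elided here.
var sbox [256]byte

// tauL1 applies the S-box to each byte, then the SM4 linear transform
// L(b) = b ^ (b <<< 2) ^ (b <<< 10) ^ (b <<< 18) ^ (b <<< 24).
func tauL1(x uint32) uint32 {
	b := uint32(sbox[x>>24])<<24 | uint32(sbox[x>>16&0xff])<<16 |
		uint32(sbox[x>>8&0xff])<<8 | uint32(sbox[x&0xff])
	return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
		bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
}

// round mirrors AVX_SM4_ROUND for one 32-bit lane: xk is the round key
// that VPBROADCASTD splats from (index*4)(AX)(CX*1).
func round(xk, t0, t1, t2, t3 uint32) uint32 {
	return t0 ^ tauL1(t1^t2^t3^xk)
}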
@@ -123,6 +152,9 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
 	CMPB ·useAVX2(SB), $1
 	JE avx2
 
+	CMPB ·useAVX(SB), $1
+	JE avx
+
non_avx2_start:
 	PINSRD $0, 0(DX), t0
 	PINSRD $1, 16(DX), t0
@@ -180,6 +212,56 @@ loop:
done_sm4:
 	RET
+
+avx:
+	VMOVDQU 0(DX), XWORD0
+	VMOVDQU 16(DX), XWORD1
+	VMOVDQU 32(DX), XWORD2
+	VMOVDQU 48(DX), XWORD3
+
+	VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
+	VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK
+
+	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
+	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
+	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
+	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
+
+	// Transpose matrix 4 x 4 32bits word
+	TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
+
+	XORL CX, CX
+
+avx_loop:
+	AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3)
+	AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0)
+	AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1)
+	AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2)
+
+	ADDL $16, CX
+	CMPL CX, $4*32
+	JB avx_loop
+
+	// Transpose matrix 4 x 4 32bits word
+	TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
+
+	VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK
+	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
+	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
+	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
+	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
+
+	VPXOR 0(SI), XWORD0, XWORD0
+	VPXOR 16(SI), XWORD1, XWORD1
+	VPXOR 32(SI), XWORD2, XWORD2
+	VPXOR 48(SI), XWORD3, XWORD3
+
+	VMOVDQU XWORD0, 0(BX)
+	VMOVDQU XWORD1, 16(BX)
+	VMOVDQU XWORD2, 32(BX)
+	VMOVDQU XWORD3, 48(BX)
+
+	RET
 
avx2:
 	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
 
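The four VPXOR n(SI) instructions are the CBC chaining step that distinguishes this decrypt path from the encrypt one: after the block decryption, each 16-byte block is XORed with the previous ciphertext block (the IV for the first block), which is what SI appears to point at here. A minimal sketch of that step with a hypothetical helper, not taken from the diff:

// cbcChainXor finishes CBC decryption for one 16-byte block:
// dst[i] = decrypted[i] ^ prev[i], where prev is the preceding
// ciphertext block, or the IV for the first block.
func cbcChainXor(dst, decrypted, prev []byte) {
	for i := 0; i < 16; i++ {
		dst[i] = decrypted[i] ^ prev[i]
	}
}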
@@ -14,7 +14,8 @@ import (
 var supportSM4 = cpu.ARM64.HasSM4 && os.Getenv("DISABLE_SM4NI") != "1"
 var supportsAES = cpu.X86.HasAES || cpu.ARM64.HasAES
 var supportsGFMUL = cpu.X86.HasPCLMULQDQ || cpu.ARM64.HasPMULL
-var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
+var useAVX2 = cpu.X86.HasAVX512
+var useAVX = cpu.X86.HasAVX
 
 const (
 	INST_AES int = iota
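Note that useAVX2 is now gated on AVX-512 rather than AVX2+BMI2 as before, and a separate useAVX flag is introduced. The assembly entry points check useAVX2 first, then useAVX, then fall through to the SSE code at non_avx2_start. A self-contained sketch of that dispatch order; the boolean values here are placeholders for the cpu-feature flags set above:

package main

// Placeholders for the feature flags detected via the cpu package.
var (
	useAVX2 = false // cpu.X86.HasAVX512 after this commit
	useAVX  = true  // cpu.X86.HasAVX
)

// pickPath mirrors the branch order in encryptBlocksAsm and
// decryptBlocksChain: AVX2 first, then AVX, then the SSE fallback.
func pickPath() string {
	switch {
	case useAVX2:
		return "avx2"
	case useAVX:
		return "avx"
	default:
		return "non_avx2_start (SSE)"
	}
}

func main() { println(pickPath()) }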
sm4/gcm_amd64.s (834 lines changed): diff suppressed because it is too large.