sm4: fix AVX version use AVX2 inst. issue
commit fc2f105dd2
parent fc287b6e96
@@ -264,6 +264,25 @@ GLOBL fk_mask<>(SB), 8, $16
 	VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w27, w15, w7, w27, w19, w11, w3]    r3 = [w15, w11, w7, w3]
 	VPUNPCKLQDQ r2, tmp2, r2    // r2 = [w30, w22, w14, w6, w26, w18, w10, w2]    r2 = [w14, w10, w6, w2]
 
+// SM4 round function, AVX version, handle 128 bits
+// t0 ^= tao_l1(t1^t2^t3^xk)
+// parameters:
+// - index: round key index immediate number
+// - x: 128 bits temp register
+// - y: 128 bits temp register
+// - t0: 128 bits register for data as result
+// - t1: 128 bits register for data
+// - t2: 128 bits register for data
+// - t3: 128 bits register for data
+#define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \
+	MOVL (index * 4)(RK)(IND*1), x;            \
+	VPSHUFD $0, x, x;                          \
+	VPXOR t1, x, x;                            \
+	VPXOR t2, x, x;                            \
+	VPXOR t3, x, x;                            \
+	AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp);  \
+	VPXOR x, t0, t0
+
 // SM4 sbox function, AVX2 version
 // parameters:
 // - x: 256 bits register as sbox input/output data
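
For reference, the recurrence these macros vectorize, t0 ^= tao_l1(t1^t2^t3^xk), is one SM4 round: an S-box substitution on every byte, then a linear rotation mix. A minimal scalar sketch in Go of the same step, assuming the standard SM4 definition (the 256 S-box constants from GB/T 32907-2016 are elided; this is not gmsm's actual Go code):

package sm4sketch

import "math/bits"

// sbox is SM4's byte substitution table; the 256 constant values
// from the SM4 standard are elided in this sketch.
var sbox [256]byte

// tau applies the S-box to each byte of a 32-bit word.
func tau(a uint32) uint32 {
	return uint32(sbox[a>>24])<<24 | uint32(sbox[a>>16&0xff])<<16 |
		uint32(sbox[a>>8&0xff])<<8 | uint32(sbox[a&0xff])
}

// taoL1 is the encryption T transform: tau, then the linear mix
// L1(b) = b ^ (b <<< 2) ^ (b <<< 10) ^ (b <<< 18) ^ (b <<< 24).
func taoL1(a uint32) uint32 {
	b := tau(a)
	return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
		bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
}

// round is t0 ^= tao_l1(t1^t2^t3^xk) for one block; the macros above
// compute the same thing for 4 (xmm) or 8 (ymm) blocks at once.
func round(t0, t1, t2, t3, xk uint32) uint32 {
	return t0 ^ taoL1(t1^t2^t3^xk)
}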
@@ -321,3 +340,40 @@ GLOBL fk_mask<>(SB), 8, $16
 	VPSHUFB z, x, z;  \
 	VPXOR y, x, x;    \
 	VPXOR x, z, x
+
+// SM4 round function, AVX2 version, handle 256 bits
+// t0 ^= tao_l1(t1^t2^t3^xk)
+// parameters:
+// - index: round key index immediate number
+// - x: 256 bits temp register, MUST use XDWORD!
+// - y: 256 bits temp register, MUST use YDWORD!
+// - t0: 256 bits register for data as result
+// - t1: 256 bits register for data
+// - t2: 256 bits register for data
+// - t3: 256 bits register for data
+#define AVX2_SM4_ROUND(index, RK, IND, x, y, xw, yw, tmp, t0, t1, t2, t3) \
+	VPBROADCASTD (index * 4)(RK)(IND*1), x;                          \
+	VPXOR t1, x, x;                                                  \
+	VPXOR t2, x, x;                                                  \
+	VPXOR t3, x, x;                                                  \
+	AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK);  \
+	VPXOR x, t0, t0
+
+// SM4 round function, AVX version, handle 128 bits
+// t0 ^= tao_l1(t1^t2^t3^xk)
+// parameters:
+// - index: round key index immediate number
+// - x: 128 bits temp register
+// - y: 128 bits temp register
+// - t0: 128 bits register for data as result
+// - t1: 128 bits register for data
+// - t2: 128 bits register for data
+// - t3: 128 bits register for data
+#define AVX2_SM4_ROUND_4BLOCKS(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \
+	VPBROADCASTD (index * 4)(RK)(IND*1), x;    \
+	VPSHUFD $0, x, x;                          \
+	VPXOR t1, x, x;                            \
+	VPXOR t2, x, x;                            \
+	VPXOR t3, x, x;                            \
+	AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp);  \
+	VPXOR x, t0, t0
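
The actual fix is visible in the macros above: VPBROADCASTD is an AVX2 instruction, so the plain-AVX AVX_SM4_ROUND now loads the round key with MOVL and splats it with VPSHUFD $0, while the single-instruction VPBROADCASTD splat survives only in the AVX2-gated macros. A hedged Go sketch of the usual runtime gating for such kernels (the function names and gate variables here are illustrative, not gmsm's actual ones):

package sm4sketch

import "golang.org/x/sys/cpu"

var (
	useAVX2 = cpu.X86.HasAVX2 // hypothetical gates for this sketch
	useAVX  = cpu.X86.HasAVX
)

// Stand-ins for the assembly bodies; names are illustrative only.
func encryptBlocksAVX2(rk []uint32, dst, src []byte)    {}
func encryptBlocksAVX(rk []uint32, dst, src []byte)     {}
func encryptBlocksGeneric(rk []uint32, dst, src []byte) {}

// encryptBlocks picks a kernel by CPU feature. A body assembled with
// AVX2 instructions (VPBROADCASTD included) must only be reachable
// from the AVX2 branch; running it on an AVX-only CPU faults with an
// illegal instruction, which is the class of bug this commit fixes.
func encryptBlocks(rk []uint32, dst, src []byte) {
	switch {
	case useAVX2:
		encryptBlocksAVX2(rk, dst, src)
	case useAVX:
		encryptBlocksAVX(rk, dst, src) // MOVL + VPSHUFD $0 splat only
	default:
		encryptBlocksGeneric(rk, dst, src)
	}
}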
@@ -88,42 +88,6 @@
 #define XWORD X8
 #define YWORD X9
 
-// SM4 round function, AVX2 version, handle 256 bits
-// t0 ^= tao_l1(t1^t2^t3^xk)
-// parameters:
-// - index: round key index immediate number
-// - x: 256 bits temp register
-// - y: 256 bits temp register
-// - t0: 256 bits register for data as result
-// - t1: 256 bits register for data
-// - t2: 256 bits register for data
-// - t3: 256 bits register for data
-#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
-	VPBROADCASTD (index * 4)(AX)(CX*1), x;                                  \
-	VPXOR t1, x, x;                                                         \
-	VPXOR t2, x, x;                                                         \
-	VPXOR t3, x, x;                                                         \
-	AVX2_SM4_TAO_L1(x, y, XDWTMP0, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK); \
-	VPXOR x, t0, t0
-
-// SM4 round function, AVX version, handle 128 bits
-// t0 ^= tao_l1(t1^t2^t3^xk)
-// parameters:
-// - index: round key index immediate number
-// - x: 128 bits temp register
-// - y: 128 bits temp register
-// - t0: 128 bits register for data as result
-// - t1: 128 bits register for data
-// - t2: 128 bits register for data
-// - t3: 128 bits register for data
-#define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
-	VPBROADCASTD (index * 4)(AX)(CX*1), x;        \
-	VPXOR t1, x, x;                               \
-	VPXOR t2, x, x;                               \
-	VPXOR t3, x, x;                               \
-	AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, XWTMP0);  \
-	VPXOR x, t0, t0
-
 // func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
 TEXT ·expandKeyAsm(SB),NOSPLIT,$0
 	MOVQ key+0(FP), AX
@@ -225,10 +189,10 @@ avx:
 	XORL CX, CX
 
 avx_loop:
-	AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3)
-	AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0)
-	AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1)
-	AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2)
+	AVX_SM4_ROUND(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3)
+	AVX_SM4_ROUND(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0)
+	AVX_SM4_ROUND(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1)
+	AVX_SM4_ROUND(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2)
 
 	ADDL $16, CX
 	CMPL CX, $4*32
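
The loop structure itself is unchanged: four rounds per iteration with the destination register rotating through XWORD0..XWORD3, and CX advancing 16 bytes (four 32-bit round keys) per pass until all 4*32 bytes of round keys are consumed. A scalar Go sketch of the same schedule, assuming the round helper from the earlier sketch (again illustrative, not gmsm's code):

package sm4sketch

// cryptBlock runs the 32 SM4 rounds the way the assembly loop does:
// four rounds per iteration, rotating which state word is the
// destination, consuming rk[i..i+3] per pass (ADDL $16, CX) until
// all 32 round keys are used (CMPL CX, $4*32).
func cryptBlock(rk []uint32, t0, t1, t2, t3 uint32) (uint32, uint32, uint32, uint32) {
	for i := 0; i < 32; i += 4 {
		t0 = round(t0, t1, t2, t3, rk[i+0])
		t1 = round(t1, t2, t3, t0, rk[i+1])
		t2 = round(t2, t3, t0, t1, rk[i+2])
		t3 = round(t3, t0, t1, t2, rk[i+3])
	}
	// SM4 then emits the state in reversed word order, which the
	// surrounding assembly (not shown in this hunk) takes care of.
	return t0, t1, t2, t3
}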
@@ -274,10 +238,10 @@ avx2_8blocks:
 	XORL CX, CX
 
avx2_loop:
-	AVX2_SM4_ROUND(0, XDWORD, YDWORD, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
-	AVX2_SM4_ROUND(1, XDWORD, YDWORD, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
-	AVX2_SM4_ROUND(2, XDWORD, YDWORD, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
-	AVX2_SM4_ROUND(3, XDWORD, YDWORD, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
+	AVX2_SM4_ROUND(0, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
+	AVX2_SM4_ROUND(1, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
+	AVX2_SM4_ROUND(2, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
+	AVX2_SM4_ROUND(3, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
 
 	ADDL $16, CX
 	CMPL CX, $4*32
@@ -317,10 +281,10 @@ avx2_4blocks:
 	XORL CX, CX
 
avx2_4blocks_loop:
-	AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3)
-	AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0)
-	AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1)
-	AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2)
+	AVX2_SM4_ROUND_4BLOCKS(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3)
+	AVX2_SM4_ROUND_4BLOCKS(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0)
+	AVX2_SM4_ROUND_4BLOCKS(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1)
+	AVX2_SM4_ROUND_4BLOCKS(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2)
 
 	ADDL $16, CX
 	CMPL CX, $4*32
@@ -106,42 +106,6 @@ done_sm4:
 #define XWORD X8
 #define YWORD X9
 
-// SM4 round function, AVX2 version, handle 256 bits
-// t0 ^= tao_l1(t1^t2^t3^xk)
-// parameters:
-// - index: round key index immediate number
-// - x: 256 bits temp register
-// - y: 256 bits temp register
-// - t0: 256 bits register for data as result
-// - t1: 256 bits register for data
-// - t2: 256 bits register for data
-// - t3: 256 bits register for data
-#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
-	VPBROADCASTD (index * 4)(AX)(CX*1), x;                                  \
-	VPXOR t1, x, x;                                                         \
-	VPXOR t2, x, x;                                                         \
-	VPXOR t3, x, x;                                                         \
-	AVX2_SM4_TAO_L1(x, y, XDWTMP0, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK); \
-	VPXOR x, t0, t0
-
-// SM4 round function, AVX version, handle 128 bits
-// t0 ^= tao_l1(t1^t2^t3^xk)
-// parameters:
-// - index: round key index immediate number
-// - x: 128 bits temp register
-// - y: 128 bits temp register
-// - t0: 128 bits register for data as result
-// - t1: 128 bits register for data
-// - t2: 128 bits register for data
-// - t3: 128 bits register for data
-#define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
-	VPBROADCASTD (index * 4)(AX)(CX*1), x;        \
-	VPXOR t1, x, x;                               \
-	VPXOR t2, x, x;                               \
-	VPXOR t3, x, x;                               \
-	AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, XWTMP0);  \
-	VPXOR x, t0, t0
-
 // func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
 TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
 	MOVQ xk+0(FP), AX
@@ -217,10 +181,10 @@ avx:
 	XORL CX, CX
 
avx_loop:
-	AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3)
-	AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0)
-	AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1)
-	AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2)
+	AVX_SM4_ROUND(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3)
+	AVX_SM4_ROUND(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0)
+	AVX_SM4_ROUND(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1)
+	AVX_SM4_ROUND(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2)
 
 	ADDL $16, CX
 	CMPL CX, $4*32
@@ -269,10 +233,10 @@ avx2_8blocks:
 	XORL CX, CX
 
avx2_loop:
-	AVX2_SM4_ROUND(0, XDWORD, YDWORD, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
-	AVX2_SM4_ROUND(1, XDWORD, YDWORD, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
-	AVX2_SM4_ROUND(2, XDWORD, YDWORD, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
-	AVX2_SM4_ROUND(3, XDWORD, YDWORD, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
+	AVX2_SM4_ROUND(0, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
+	AVX2_SM4_ROUND(1, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
+	AVX2_SM4_ROUND(2, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
+	AVX2_SM4_ROUND(3, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
 
 	ADDL $16, CX
 	CMPL CX, $4*32
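
The 4-block and 8-block kernels above all work on transposed state: after loading four (or eight) 16-byte blocks, the VPUNPCK sequence from the first hunk leaves register i holding word i of every block, so each VPXOR in a round touches the same operand across all blocks at once. A plain-Go sketch of that layout change (a hypothetical helper, mirroring only the 4x4 case):

package sm4sketch

// transpose4x4 mirrors the effect of the VPUNPCK transpose: on input
// r[b] holds the 4 state words of block b; on output r[i] holds word
// i of all 4 blocks, e.g. r0 = [w12, w8, w4, w0] in the macro comments.
func transpose4x4(r *[4][4]uint32) {
	var t [4][4]uint32
	for i := 0; i < 4; i++ {
		for j := 0; j < 4; j++ {
			t[i][j] = r[j][i]
		}
	}
	*r = t
}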
@@ -263,22 +263,6 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
 	PSHUFB BSWAP, t1; \
 	PSHUFB BSWAP, t0
 
-#define AVX2_SM4_ROUND(index, RK, IND, x, y, xw, yw, tmp, t0, t1, t2, t3) \
-	VPBROADCASTD (index * 4)(RK)(IND*1), x;                          \
-	VPXOR t1, x, x;                                                  \
-	VPXOR t2, x, x;                                                  \
-	VPXOR t3, x, x;                                                  \
-	AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK);  \
-	VPXOR x, t0, t0
-
-#define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \
-	VPBROADCASTD (index * 4)(RK)(IND*1), x;    \
-	VPXOR t1, x, x;                            \
-	VPXOR t2, x, x;                            \
-	VPXOR t3, x, x;                            \
-	AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp);  \
-	VPXOR x, t0, t0
-
 // func gcmSm4Init(productTable *[256]byte, rk []uint32)
 TEXT ·gcmSm4Init(SB),NOSPLIT,$0
 #define dst DI
@@ -1614,10 +1598,10 @@ avx2GcmSm4EncNibbles:
 	VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
 
avx2GcmSm4Enc4Loop2:
-	AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
-	AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
-	AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
-	AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
+	AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
+	AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
+	AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
+	AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
 
 	ADDL $16, BX
 	CMPL BX, $4*32
@@ -1676,10 +1660,10 @@ avx2GcmSm4EncSingles:
 	VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
 
avx2GcmSm4Enc4Loop1:
-	AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
-	AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
-	AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
-	AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
+	AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
+	AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
+	AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
+	AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
 
 	ADDL $16, BX
 	CMPL BX, $4*32
@@ -2472,10 +2456,10 @@ avx2GcmSm4DecNibbles:
 	VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
 
avx2GcmSm4Dec4Loop2:
-	AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
-	AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
-	AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
-	AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
+	AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
+	AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
+	AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
+	AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
 
 	ADDL $16, BX
 	CMPL BX, $4*32
@@ -2538,10 +2522,10 @@ avx2GcmSm4DecSingles:
 	VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
 
avx2GcmSm4Dec4Loop1:
-	AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
-	AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
-	AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
-	AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
+	AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
+	AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
+	AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
+	AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
 
 	ADDL $16, BX
 	CMPL BX, $4*32
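
Note that all four GCM loops (avx2GcmSm4Enc4Loop2/1 and avx2GcmSm4Dec4Loop2/1) switch to AVX2_SM4_ROUND_4BLOCKS rather than to the repaired AVX_SM4_ROUND: judging by their avx2-prefixed labels, these paths only execute when AVX2 is available, so the single-instruction VPBROADCASTD splat can stay there; the MOVL plus VPSHUFD $0 pair is required only on the plain-AVX paths.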