mirror of https://github.com/emmansun/gmsm.git
synced 2025-04-26 04:06:18 +08:00
sm4: fix AVX version use AVX2 inst. issue
parent fc287b6e96
commit fc2f105dd2
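What this commit fixes, as reflected in the hunks below: the AVX (non-AVX2) code path loaded each round key with VPBROADCASTD, which is an AVX2 instruction, so the "AVX" routines could fault on CPUs that support AVX but not AVX2. The shared AVX_SM4_ROUND macro now broadcasts the round key with MOVL plus VPSHUFD $0 (both AVX-legal), the round macros move into the common macro file and take the round-key base register (RK), index register (IND), and scratch registers as explicit parameters, and the 128-bit variant that legitimately keeps VPBROADCASTD is renamed AVX2_SM4_ROUND_4BLOCKS.

A minimal, hypothetical Go sketch (not code from this repo; the asm routines here receive an inst selector from their Go callers) of the CPU-feature gating that makes the distinction matter:

	package main

	import (
		"fmt"

		"golang.org/x/sys/cpu"
	)

	func main() {
		// VPBROADCASTD requires AVX2; a CPU can report AVX without
		// AVX2, and executing an AVX2 instruction there raises an
		// illegal-instruction fault. Asm paths are selected like this.
		switch {
		case cpu.X86.HasAVX2:
			fmt.Println("AVX2 path: VPBROADCASTD is available")
		case cpu.X86.HasAVX:
			fmt.Println("AVX path: broadcast via MOVL + VPSHUFD $0 instead")
		default:
			fmt.Println("fallback path")
		}
	}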
@@ -264,6 +264,25 @@ GLOBL fk_mask<>(SB), 8, $16
 	VPUNPCKHQDQ r2, tmp2, r3;  \ // r3 = [w31, w27, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3]
 	VPUNPCKLQDQ r2, tmp2, r2     // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2]
 
+// SM4 round function, AVX version, handle 128 bits
+// t0 ^= tao_l1(t1^t2^t3^xk)
+// parameters:
+// - index: round key index immediate number
+// - x: 128 bits temp register
+// - y: 128 bits temp register
+// - t0: 128 bits register for data as result
+// - t1: 128 bits register for data
+// - t2: 128 bits register for data
+// - t3: 128 bits register for data
+#define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3)  \
+	MOVL (index * 4)(RK)(IND*1), x;            \
+	VPSHUFD $0, x, x;                          \
+	VPXOR t1, x, x;                            \
+	VPXOR t2, x, x;                            \
+	VPXOR t3, x, x;                            \
+	AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp);  \
+	VPXOR x, t0, t0
+
 // SM4 sbox function, AVX2 version
 // parameters:
 // - x: 256 bits register as sbox input/output data
@@ -321,3 +340,40 @@ GLOBL fk_mask<>(SB), 8, $16
 	VPSHUFB z, x, z;  \
 	VPXOR y, x, x;    \
 	VPXOR x, z, x
+
+// SM4 round function, AVX2 version, handle 256 bits
+// t0 ^= tao_l1(t1^t2^t3^xk)
+// parameters:
+// - index: round key index immediate number
+// - x: 256 bits temp register, MUST use XDWORD!
+// - y: 256 bits temp register, MUST use YDWORD!
+// - t0: 256 bits register for data as result
+// - t1: 256 bits register for data
+// - t2: 256 bits register for data
+// - t3: 256 bits register for data
+#define AVX2_SM4_ROUND(index, RK, IND, x, y, xw, yw, tmp, t0, t1, t2, t3)  \
+	VPBROADCASTD (index * 4)(RK)(IND*1), x;                          \
+	VPXOR t1, x, x;                                                  \
+	VPXOR t2, x, x;                                                  \
+	VPXOR t3, x, x;                                                  \
+	AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK);  \
+	VPXOR x, t0, t0
+
+// SM4 round function, AVX2 version, handle 128 bits
+// t0 ^= tao_l1(t1^t2^t3^xk)
+// parameters:
+// - index: round key index immediate number
+// - x: 128 bits temp register
+// - y: 128 bits temp register
+// - t0: 128 bits register for data as result
+// - t1: 128 bits register for data
+// - t2: 128 bits register for data
+// - t3: 128 bits register for data
+#define AVX2_SM4_ROUND_4BLOCKS(index, RK, IND, x, y, tmp, t0, t1, t2, t3)  \
+	VPBROADCASTD (index * 4)(RK)(IND*1), x;    \
+	VPSHUFD $0, x, x;                          \
+	VPXOR t1, x, x;                            \
+	VPXOR t2, x, x;                            \
+	VPXOR t3, x, x;                            \
+	AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp);  \
+	VPXOR x, t0, t0
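For orientation, every *_SM4_ROUND macro computes the same per-lane update, t0 ^= tao_l1(t1^t2^t3^xk); they differ only in vector width and in how the round key is broadcast. Below is a scalar Go sketch under the standard SM4 definitions (illustrative names, not from this repo; the 256-byte sbox table is elided and passed in as a parameter, and the vector macros realize the same substitution with VPSHUFB-based nibble lookups):

	package sm4sketch

	import "math/bits"

	// taoL1 is the SM4 T transform: byte-wise sbox substitution
	// followed by the linear transform
	// L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24).
	func taoL1(b uint32, sbox *[256]byte) uint32 {
		b = uint32(sbox[b>>24])<<24 |
			uint32(sbox[(b>>16)&0xff])<<16 |
			uint32(sbox[(b>>8)&0xff])<<8 |
			uint32(sbox[b&0xff])
		return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
			bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
	}

	// sm4Round is the scalar equivalent of one *_SM4_ROUND invocation
	// for a single 32-bit word lane.
	func sm4Round(t0, t1, t2, t3, rk uint32, sbox *[256]byte) uint32 {
		return t0 ^ taoL1(t1^t2^t3^rk, sbox)
	}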
@@ -88,42 +88,6 @@
 #define XWORD X8
 #define YWORD X9
 
-// SM4 round function, AVX2 version, handle 256 bits
-// t0 ^= tao_l1(t1^t2^t3^xk)
-// parameters:
-// - index: round key index immediate number
-// - x: 256 bits temp register, MUST use XDWORD!
-// - y: 256 bits temp register, MUST use YDWORD!
-// - t0: 256 bits register for data as result
-// - t1: 256 bits register for data
-// - t2: 256 bits register for data
-// - t3: 256 bits register for data
-#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3)  \
-	VPBROADCASTD (index * 4)(AX)(CX*1), x;                                     \
-	VPXOR t1, x, x;                                                            \
-	VPXOR t2, x, x;                                                            \
-	VPXOR t3, x, x;                                                            \
-	AVX2_SM4_TAO_L1(x, y, XDWTMP0, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK);  \
-	VPXOR x, t0, t0
-
-// SM4 round function, AVX version, handle 128 bits
-// t0 ^= tao_l1(t1^t2^t3^xk)
-// parameters:
-// - index: round key index immediate number
-// - x: 128 bits temp register
-// - y: 128 bits temp register
-// - t0: 128 bits register for data as result
-// - t1: 128 bits register for data
-// - t2: 128 bits register for data
-// - t3: 128 bits register for data
-#define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3)  \
-	VPBROADCASTD (index * 4)(AX)(CX*1), x;        \
-	VPXOR t1, x, x;                               \
-	VPXOR t2, x, x;                               \
-	VPXOR t3, x, x;                               \
-	AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, XWTMP0);  \
-	VPXOR x, t0, t0
-
 // func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
 TEXT ·expandKeyAsm(SB),NOSPLIT,$0
 	MOVQ key+0(FP), AX
@@ -225,10 +189,10 @@ avx:
 	XORL CX, CX
 
 avx_loop:
-	AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3)
-	AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0)
-	AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1)
-	AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2)
+	AVX_SM4_ROUND(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3)
+	AVX_SM4_ROUND(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0)
+	AVX_SM4_ROUND(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1)
+	AVX_SM4_ROUND(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2)
 
 	ADDL $16, CX
 	CMPL CX, $4*32
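A note on the unchanged loop bounds: each avx_loop iteration executes four rounds and each round key is 4 bytes, so CX advances by 4 × 4 = 16 per pass and the loop exits once CX reaches 4*32 = 128, i.e. after all 32 SM4 rounds.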
@@ -274,10 +238,10 @@ avx2_8blocks:
 	XORL CX, CX
 
 avx2_loop:
-	AVX2_SM4_ROUND(0, XDWORD, YDWORD, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
-	AVX2_SM4_ROUND(1, XDWORD, YDWORD, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
-	AVX2_SM4_ROUND(2, XDWORD, YDWORD, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
-	AVX2_SM4_ROUND(3, XDWORD, YDWORD, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
+	AVX2_SM4_ROUND(0, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
+	AVX2_SM4_ROUND(1, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
+	AVX2_SM4_ROUND(2, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
+	AVX2_SM4_ROUND(3, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
 
 	ADDL $16, CX
 	CMPL CX, $4*32
@@ -317,10 +281,10 @@ avx2_4blocks:
 	XORL CX, CX
 
 avx2_4blocks_loop:
-	AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3)
-	AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0)
-	AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1)
-	AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2)
+	AVX2_SM4_ROUND_4BLOCKS(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3)
+	AVX2_SM4_ROUND_4BLOCKS(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0)
+	AVX2_SM4_ROUND_4BLOCKS(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1)
+	AVX2_SM4_ROUND_4BLOCKS(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2)
 
 	ADDL $16, CX
 	CMPL CX, $4*32
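The pattern repeats at every call site below: the old macros hard-coded AX as the round-key base and CX as the byte offset, while the shared macros take RK, IND, and the temp registers explicitly, so each file now passes its own registers (AX/CX here, rk/BX in the GCM code).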
@@ -106,42 +106,6 @@ done_sm4:
 #define XWORD X8
 #define YWORD X9
 
-// SM4 round function, AVX2 version, handle 256 bits
-// t0 ^= tao_l1(t1^t2^t3^xk)
-// parameters:
-// - index: round key index immediate number
-// - x: 256 bits temp register
-// - y: 256 bits temp register
-// - t0: 256 bits register for data as result
-// - t1: 256 bits register for data
-// - t2: 256 bits register for data
-// - t3: 256 bits register for data
-#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3)  \
-	VPBROADCASTD (index * 4)(AX)(CX*1), x;                                     \
-	VPXOR t1, x, x;                                                            \
-	VPXOR t2, x, x;                                                            \
-	VPXOR t3, x, x;                                                            \
-	AVX2_SM4_TAO_L1(x, y, XDWTMP0, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK);  \
-	VPXOR x, t0, t0
-
-// SM4 round function, AVX version, handle 128 bits
-// t0 ^= tao_l1(t1^t2^t3^xk)
-// parameters:
-// - index: round key index immediate number
-// - x: 128 bits temp register
-// - y: 128 bits temp register
-// - t0: 128 bits register for data as result
-// - t1: 128 bits register for data
-// - t2: 128 bits register for data
-// - t3: 128 bits register for data
-#define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3)  \
-	VPBROADCASTD (index * 4)(AX)(CX*1), x;        \
-	VPXOR t1, x, x;                               \
-	VPXOR t2, x, x;                               \
-	VPXOR t3, x, x;                               \
-	AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, XWTMP0);  \
-	VPXOR x, t0, t0
-
 // func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
 TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
 	MOVQ xk+0(FP), AX
@@ -217,10 +181,10 @@ avx:
 	XORL CX, CX
 
 avx_loop:
-	AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3)
-	AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0)
-	AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1)
-	AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2)
+	AVX_SM4_ROUND(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3)
+	AVX_SM4_ROUND(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0)
+	AVX_SM4_ROUND(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1)
+	AVX_SM4_ROUND(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2)
 
 	ADDL $16, CX
 	CMPL CX, $4*32
@@ -269,10 +233,10 @@ avx2_8blocks:
 	XORL CX, CX
 
 avx2_loop:
-	AVX2_SM4_ROUND(0, XDWORD, YDWORD, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
-	AVX2_SM4_ROUND(1, XDWORD, YDWORD, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
-	AVX2_SM4_ROUND(2, XDWORD, YDWORD, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
-	AVX2_SM4_ROUND(3, XDWORD, YDWORD, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
+	AVX2_SM4_ROUND(0, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
+	AVX2_SM4_ROUND(1, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
+	AVX2_SM4_ROUND(2, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
+	AVX2_SM4_ROUND(3, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
 
 	ADDL $16, CX
 	CMPL CX, $4*32
@@ -263,22 +263,6 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
 	PSHUFB BSWAP, t1;  \
 	PSHUFB BSWAP, t0
 
-#define AVX2_SM4_ROUND(index, RK, IND, x, y, xw, yw, tmp, t0, t1, t2, t3)  \
-	VPBROADCASTD (index * 4)(RK)(IND*1), x;                          \
-	VPXOR t1, x, x;                                                  \
-	VPXOR t2, x, x;                                                  \
-	VPXOR t3, x, x;                                                  \
-	AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK);  \
-	VPXOR x, t0, t0
-
-#define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3)  \
-	VPBROADCASTD (index * 4)(RK)(IND*1), x;    \
-	VPXOR t1, x, x;                            \
-	VPXOR t2, x, x;                            \
-	VPXOR t3, x, x;                            \
-	AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp);  \
-	VPXOR x, t0, t0
-
 // func gcmSm4Init(productTable *[256]byte, rk []uint32)
 TEXT ·gcmSm4Init(SB),NOSPLIT,$0
 #define dst DI
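This removed block is where the title's mixup lived: the local AVX_SM4_ROUND above loads its round key with VPBROADCASTD even though it is the macro for the AVX path. Its shared replacement uses MOVL + VPSHUFD $0, and the 4-block loops below, which as their labels suggest run on the AVX2 path, switch to the accurately named AVX2_SM4_ROUND_4BLOCKS.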
@@ -1614,10 +1598,10 @@ avx2GcmSm4EncNibbles:
 	VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
 
avx2GcmSm4Enc4Loop2:
-	AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
-	AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
-	AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
-	AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
+	AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
+	AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
+	AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
+	AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
 
 	ADDL $16, BX
 	CMPL BX, $4*32
@@ -1676,10 +1660,10 @@ avx2GcmSm4EncSingles:
 	VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
 
avx2GcmSm4Enc4Loop1:
-	AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
-	AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
-	AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
-	AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
+	AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
+	AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
+	AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
+	AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
 
 	ADDL $16, BX
 	CMPL BX, $4*32
@@ -2472,10 +2456,10 @@ avx2GcmSm4DecNibbles:
 	VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
 
avx2GcmSm4Dec4Loop2:
-	AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
-	AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
-	AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
-	AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
+	AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
+	AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
+	AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
+	AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
 
 	ADDL $16, BX
 	CMPL BX, $4*32
@@ -2538,10 +2522,10 @@ avx2GcmSm4DecSingles:
 	VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
 
avx2GcmSm4Dec4Loop1:
-	AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
-	AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
-	AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
-	AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
+	AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
+	AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
+	AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
+	AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
 
 	ADDL $16, BX
 	CMPL BX, $4*32
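Net effect in the GCM code: all four 4-block loops (Enc/Dec, Nibbles/Singles) keep identical arguments and register usage and only swap AVX_SM4_ROUND for AVX2_SM4_ROUND_4BLOCKS, making the AVX2 requirement explicit in the macro name.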