Mirror of https://github.com/emmansun/gmsm.git (synced 2025-04-26 04:06:18 +08:00)

sm4: code format

This commit is contained in:
parent 9bd64f94d3
commit 5243d0fe91
@@ -20,7 +20,7 @@ DATA inverse_shift_rows<>+0x10(SB)/8, $0x0B0E0104070A0D00
 DATA inverse_shift_rows<>+0x18(SB)/8, $0x0306090C0F020508
 GLOBL inverse_shift_rows<>(SB), 8, $32

-// Affine transform 1 (low and high hibbles)
+// Affine transform 1 (low and high nibbles)
 DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
 DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
 DATA m1_low<>+0x10(SB)/8, $0x0A7FC3B6D5A01C69
@@ -33,7 +33,7 @@ DATA m1_high<>+0x10(SB)/8, $0xC35BF46CAF379800
 DATA m1_high<>+0x18(SB)/8, $0x68F05FC7049C33AB
 GLOBL m1_high<>(SB), 8, $32

-// Affine transform 2 (low and high hibbles)
+// Affine transform 2 (low and high nibbles)
 DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
 DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
 DATA m2_low<>+0x10(SB)/8, $0x9A950A05FEF16E61
@@ -270,10 +270,10 @@ GLOBL fk_mask<>(SB), 8, $16
 SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
 MOVOU (7*16)(RK), rk128; \
 SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
 SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
 PSHUFB bswap_mask<>(SB), t3; \
 PSHUFB bswap_mask<>(SB), t2; \
 PSHUFB bswap_mask<>(SB), t1; \
 PSHUFB bswap_mask<>(SB), t0

 #define SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
@@ -306,33 +306,33 @@ GLOBL fk_mask<>(SB), 8, $16
 SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7)

 #define SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
 SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
 SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \
 MOVOU (0*16)(RK), rk128; \
 SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
 MOVOU (1*16)(RK), rk128; \
 SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
 MOVOU (2*16)(RK), rk128; \
 SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
 MOVOU (3*16)(RK), rk128; \
 SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
 MOVOU (4*16)(RK), rk128; \
 SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
 MOVOU (5*16)(RK), rk128; \
 SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
 MOVOU (6*16)(RK), rk128; \
 SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
 MOVOU (7*16)(RK), rk128; \
 SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
 SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
 SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \
 PSHUFB bswap_mask<>(SB), t3; \
 PSHUFB bswap_mask<>(SB), t2; \
 PSHUFB bswap_mask<>(SB), t1; \
 PSHUFB bswap_mask<>(SB), t0; \
 PSHUFB bswap_mask<>(SB), t7; \
 PSHUFB bswap_mask<>(SB), t6; \
 PSHUFB bswap_mask<>(SB), t5; \
 PSHUFB bswap_mask<>(SB), t4

 // SM4 sbox function, AVX version
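Aside: the 4-block and 8-block macros above work on transposed state, so that after SSE_TRANSPOSE_MATRIX each register holds the same 32-bit word position from all four blocks and the rounds run on every block at once. A minimal scalar Go sketch of that layout change (illustrative only, no SIMD):

package main

import "fmt"

// transpose4x4 mirrors the effect of SSE_TRANSPOSE_MATRIX: blocks[i] holds the
// four words of block i; after the transpose, words[j] holds word j of all
// four blocks, ready for word-parallel rounds.
func transpose4x4(blocks [4][4]uint32) (words [4][4]uint32) {
	for i := 0; i < 4; i++ {
		for j := 0; j < 4; j++ {
			words[j][i] = blocks[i][j]
		}
	}
	return
}

func main() {
	in := [4][4]uint32{{0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}, {12, 13, 14, 15}}
	fmt.Println(transpose4x4(in)) // [[0 4 8 12] [1 5 9 13] [2 6 10 14] [3 7 11 15]]
}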
@@ -366,7 +366,7 @@ GLOBL fk_mask<>(SB), 8, $16
 // - y: 128 bits temp register
 // - tmp: 128 bits temp register
 #define AVX_SM4_TAO_L1(x, y, tmp) \
 AVX_SM4_SBOX(x, y, tmp); \
 VPSHUFB r08_mask<>(SB), x, y; \ // y = x <<< 8
 VPSHUFB r08_mask<>(SB), y, tmp; \ // tmp = x <<< 16
 VPXOR x, y, y; \ // y = x ^ (x <<< 8)
@@ -408,7 +408,7 @@ GLOBL fk_mask<>(SB), 8, $16
 // - t3: 128 bits register for data
 #define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \
 MOVL (index * 4)(RK)(IND*1), x; \
-VPSHUFD $0, x, x; \
+VPSHUFD $0, x, x; \ // Use VBROADCASTSS ?
 VPXOR t1, x, x; \
 VPXOR t2, x, x; \
 VPXOR t3, x, x; \
@@ -434,15 +434,15 @@ GLOBL fk_mask<>(SB), 8, $16
 SM4_ONE_ROUND_AVX(x, y, z, t3, t0, t1, t2); \

 #define AVX_SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \
 VPSHUFB flip_mask<>(SB), t0, t0 \
 VPSHUFB flip_mask<>(SB), t1, t1 \
 VPSHUFB flip_mask<>(SB), t2, t2 \
 VPSHUFB flip_mask<>(SB), t3, t3 \
 ; \
 AVX_SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3)

 #define AVX_SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3) \
 TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
 VMOVDQU (0*16)(RK), rk128; \
 SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
 VMOVDQU (1*16)(RK), rk128; \
@@ -493,31 +493,31 @@ GLOBL fk_mask<>(SB), 8, $16
 VPSHUFB flip_mask<>(SB), t5, t5 \
 VPSHUFB flip_mask<>(SB), t6, t6 \
 VPSHUFB flip_mask<>(SB), t7, t7 \
 ; \
 AVX_SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7)

 #define AVX_SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
 TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
 TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \
 VMOVDQU (0*16)(RK), rk128; \
 SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
 VMOVDQU (1*16)(RK), rk128; \
 SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
 VMOVDQU (2*16)(RK), rk128; \
 SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
 VMOVDQU (3*16)(RK), rk128; \
 SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
 VMOVDQU (4*16)(RK), rk128; \
 SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
 VMOVDQU (5*16)(RK), rk128; \
 SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
 VMOVDQU (6*16)(RK), rk128; \
 SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
 VMOVDQU (7*16)(RK), rk128; \
 SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
 ; \ // Transpose matrix 4 x 4 32bits word
 TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
 TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \
 VPSHUFB bswap_mask<>(SB), t0, t0 \
 VPSHUFB bswap_mask<>(SB), t1, t1 \
 VPSHUFB bswap_mask<>(SB), t2, t2 \
@@ -538,24 +538,24 @@ GLOBL fk_mask<>(SB), 8, $16
 // - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier.
 #define AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
 VPAND yNibbleMask, x, z; \
 VMOVDQU m1_low<>(SB), y; \
 VPSHUFB z, y, y; \
 VPSRLQ $4, x, x; \
 VPAND yNibbleMask, x, x; \
 VMOVDQU m1_high<>(SB), z; \
 VPSHUFB x, z, x; \
 VPXOR y, x, x; \
 VPSHUFB inverse_shift_rows<>(SB), x, x; \
 VEXTRACTI128 $1, x, yw \
 VAESENCLAST xNibbleMask, xw, xw; \
 VAESENCLAST xNibbleMask, yw, yw; \
 VINSERTI128 $1, yw, x, x; \
 VPANDN yNibbleMask, x, z; \
 VMOVDQU m2_low<>(SB), y; \
 VPSHUFB z, y, y; \
 VPSRLQ $4, x, x; \
 VPAND yNibbleMask, x, x; \
 VMOVDQU m2_high<>(SB), z; \
 VPSHUFB x, z, x; \
 VPXOR y, x, x

@@ -570,11 +570,11 @@ GLOBL fk_mask<>(SB), 8, $16
 // - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier.
 #define AVX2_SM4_TAO_L1(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
 AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask); \
 VPSHUFB r08_mask<>(SB), x, y; \ // y = x <<< 8
 VPSHUFB r08_mask<>(SB), y, z; \ // z = x <<< 16
 VPXOR x, y, y; \ // y = x ^ (x <<< 8)
 VPXOR z, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16)
 VPSHUFB r08_mask<>(SB), z, z; \ // z = x <<< 24
 VPXOR x, z, x; \ // x = x ^ (x <<< 24)
 VPSLLD $2, y, z; \
 VPSRLD $30, y, y; \
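Aside: per the inline comments, the shuffle/shift sequence in the TAO_L1 macros composes x <<< 8, x <<< 16, x <<< 24 and a final 2-bit rotate (VPSLLD/VPSRLD) into SM4's linear transform L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24), applied to the S-box output. A scalar Go sketch for comparison (not part of the patch):

package main

import (
	"fmt"
	"math/bits"
)

// l1 is the per-word linear transform that the vector code builds from byte
// shuffles plus a final 2-bit rotate.
func l1(b uint32) uint32 {
	return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
		bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
}

func main() {
	fmt.Printf("%08x\n", l1(0x01234567))
}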
@@ -611,10 +611,10 @@ GLOBL fk_mask<>(SB), 8, $16
 // - t3: 256 bits register for data
 #define AVX2_SM4_ROUND2(index, RK, x, y, xw, yw, tmp, t0, t1, t2, t3) \
 VPBROADCASTD (index * 4)(RK), x; \
 VPXOR t1, x, x; \
 VPXOR t2, x, x; \
 VPXOR t3, x, x; \
 AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
 VPXOR x, t0, t0

 // SM4 round function, AVX version, handle 128 bits
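Aside: AVX2_SM4_ROUND2 spells out the round structure: broadcast one 32-bit round key, XOR it with three of the four state words, run the tau/L1 transform, and fold the result into the fourth word; AVX2_SM4_16BLOCKS further down then rotates which registers play which role from round to round. A structural Go sketch of that schedule, with the per-word transform passed in as a parameter (hypothetical helper, only the rotation pattern is taken from the macros):

package main

import "fmt"

// rounds applies 32 SM4-style rounds to one block of four words. t abstracts
// the per-round transform (S-box plus linear layer) that the assembly
// implements with AVX2_SM4_TAO_L1; rk is the round-key schedule.
func rounds(x *[4]uint32, rk *[32]uint32, t func(uint32) uint32) {
	for i := 0; i < 32; i++ {
		// Round i updates word i%4, matching how the macros rotate
		// t0..t3 (and t4..t7) between successive round invocations.
		x[i%4] ^= t(x[(i+1)%4] ^ x[(i+2)%4] ^ x[(i+3)%4] ^ rk[i])
	}
}

func main() {
	var st [4]uint32
	var rk [32]uint32
	rounds(&st, &rk, func(w uint32) uint32 { return w }) // identity transform, just to exercise the loop
	fmt.Println(st)
}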
@@ -675,6 +675,8 @@ GLOBL fk_mask<>(SB), 8, $16
 // - index: round key index immediate number
 // - x: 256 bits temp register, MUST use XDWORD!
 // - y: 256 bits temp register, MUST use YDWORD!
+// - tmp: 256 bits temp register
+// - tmp1: 256 bits temp register
 // - t0: 256 bits register for data as result
 // - t1: 256 bits register for data
 // - t2: 256 bits register for data
@@ -694,16 +696,16 @@ GLOBL fk_mask<>(SB), 8, $16
 VPXOR x, t4, t4; \

 #define AVX2_SM4_16BLOCKS(RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7) \
 AVX2_SM4_16BLOCKS_ROUND(0, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
 AVX2_SM4_16BLOCKS_ROUND(1, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
 AVX2_SM4_16BLOCKS_ROUND(2, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
 AVX2_SM4_16BLOCKS_ROUND(3, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
 AVX2_SM4_16BLOCKS_ROUND(4, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
 AVX2_SM4_16BLOCKS_ROUND(5, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
 AVX2_SM4_16BLOCKS_ROUND(6, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
 AVX2_SM4_16BLOCKS_ROUND(7, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
 AVX2_SM4_16BLOCKS_ROUND(8, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
 AVX2_SM4_16BLOCKS_ROUND(9, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
 AVX2_SM4_16BLOCKS_ROUND(10, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
 AVX2_SM4_16BLOCKS_ROUND(11, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
 AVX2_SM4_16BLOCKS_ROUND(12, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
@@ -11,7 +11,6 @@
 #define t3 X5

 #define XTMP6 X6
-#define IV X8

 #include "aesni_macros_amd64.s"

@@ -27,15 +26,14 @@ TEXT ·encryptBlocksChain(SB),NOSPLIT,$0
 MOVQ src_len+40(FP), ptxLen
 MOVQ iv+56(FP), SI

-MOVUPS (SI), IV
+MOVOU (SI), t0

 loopSrc:
 CMPQ ptxLen, $16
 JB done_sm4
 SUBQ $16, ptxLen

-MOVOU (ptx), t0
-PXOR IV, t0
+PXOR (ptx), t0

 PSHUFB flip_mask<>(SB), t0
 PSHUFD $1, t0, t1
@@ -60,7 +58,6 @@ loopRound:
 PALIGNR $4, t1, t0
 PSHUFB flip_mask<>(SB), t0

-MOVOU t0, IV
 MOVOU t0, (ctx)

 LEAQ 16(ptx), ptx
@@ -69,7 +66,7 @@ loopRound:
 JMP loopSrc

 done_sm4:
-MOVUPS IV, (SI)
+MOVOU t0, (SI)
 RET

 #undef ctx
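Aside: the encryptBlocksChain changes drop the dedicated IV register; the chaining value now stays in t0, the plaintext block is XORed straight from memory (PXOR (ptx), t0), and the final chaining value is written back through SI (MOVOU t0, (SI)). A rough Go sketch of the same CBC flow, with a generic cipher.Block standing in for the inlined SM4 rounds (names are illustrative, not this package's API):

package sketch

import "crypto/cipher"

// encryptBlocksChain mirrors the assembly's CBC loop: keep the chaining value
// in one buffer, XOR each plaintext block into it, encrypt in place, emit the
// ciphertext, and finally write the last block back as the new IV.
func encryptBlocksChain(b cipher.Block, dst, src, iv []byte) {
	chain := make([]byte, 16)
	copy(chain, iv) // MOVOU (SI), t0
	for len(src) >= 16 {
		for i := 0; i < 16; i++ { // PXOR (ptx), t0
			chain[i] ^= src[i]
		}
		b.Encrypt(chain, chain) // the unrolled rounds in loopRound
		copy(dst[:16], chain)   // MOVOU t0, (ctx)
		src, dst = src[16:], dst[16:]
	}
	copy(iv, chain) // done_sm4: MOVOU t0, (SI)
}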