sm4: use package-level symbols instead of file-local ones for shared variables

Sun Yimin 2024-11-11 17:40:41 +08:00 committed by GitHub
parent b721bed0cc
commit aa82b5836b
GPG Key ID: B5690EEEBB952194
13 changed files with 247 additions and 266 deletions

View File

@ -1,62 +1,3 @@
// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), 8, $16
// shuffle byte and word order
DATA bswap_mask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA bswap_mask<>+0x08(SB)/8, $0x0001020304050607
GLOBL bswap_mask<>(SB), 8, $16
//nibble mask
DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL nibble_mask<>(SB), 8, $16
// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
DATA inverse_shift_rows<>+0x10(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x18(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), 8, $32
// Affine transform 1 (low and high nibbles)
DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
DATA m1_low<>+0x10(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1_low<>+0x18(SB)/8, $0x3045F98CEF9A2653
GLOBL m1_low<>(SB), 8, $32
DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
DATA m1_high<>+0x10(SB)/8, $0xC35BF46CAF379800
DATA m1_high<>+0x18(SB)/8, $0x68F05FC7049C33AB
GLOBL m1_high<>(SB), 8, $32
// Affine transform 2 (low and high nibbles)
DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
DATA m2_low<>+0x10(SB)/8, $0x9A950A05FEF16E61
DATA m2_low<>+0x18(SB)/8, $0x0E019E916A65FAF5
GLOBL m2_low<>(SB), 8, $32
DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400
DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5
DATA m2_high<>+0x10(SB)/8, $0x892D69CD44E0A400
DATA m2_high<>+0x18(SB)/8, $0x2C88CC68E14501A5
GLOBL m2_high<>(SB), 8, $32
// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA r08_mask<>+0x10(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), 8, $32
DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), 8, $16
// Transpose matrix with PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions. // Transpose matrix with PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions.
// input: from high to low // input: from high to low
// r0 = [w3, w2, w1, w0] // r0 = [w3, w2, w1, w0]
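
For reference, the transpose macros described above reorder four 128-bit registers so that corresponding 32-bit words of four blocks land in the same register. A minimal scalar sketch in Go of the same 4x4 word transpose (package and function names are illustrative, not part of this change):

package sm4sketch

// transpose4x4 swaps rows and columns of a 4x4 matrix of 32-bit words,
// the same reordering that the PUNPCKLDQ/PUNPCKHDQ/PUNPCKLQDQ/PUNPCKHQDQ
// sequence performs on four 128-bit registers.
func transpose4x4(m *[4][4]uint32) {
	for i := 0; i < 4; i++ {
		for j := i + 1; j < 4; j++ {
			m[i][j], m[j][i] = m[j][i], m[i][j]
		}
	}
}
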
@ -110,26 +51,26 @@ GLOBL fk_mask<>(SB), 8, $16
#define SM4_SBOX(x, y, z) \ #define SM4_SBOX(x, y, z) \
; \ //############################# inner affine ############################// ; \ //############################# inner affine ############################//
MOVOU x, z; \ MOVOU x, z; \
PAND nibble_mask<>(SB), z; \ //y = _mm_and_si128(x, c0f); PAND ·nibble_mask(SB), z; \ //y = _mm_and_si128(x, c0f);
MOVOU m1_low<>(SB), y; \ MOVOU ·m1_low(SB), y; \
PSHUFB z, y; \ //y = _mm_shuffle_epi8(m1l, y); PSHUFB z, y; \ //y = _mm_shuffle_epi8(m1l, y);
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4); PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f); PAND ·nibble_mask(SB), x; \ //x = _mm_and_si128(x, c0f);
MOVOU m1_high<>(SB), z; \ MOVOU ·m1_high(SB), z; \
PSHUFB x, z; \ //x = _mm_shuffle_epi8(m1h, x); PSHUFB x, z; \ //x = _mm_shuffle_epi8(m1h, x);
MOVOU z, x; \ //x = _mm_shuffle_epi8(m1h, x); MOVOU z, x; \ //x = _mm_shuffle_epi8(m1h, x);
PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y; PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y;
; \ // inverse ShiftRows ; \ // inverse ShiftRows
PSHUFB inverse_shift_rows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr); PSHUFB ·inverse_shift_rows(SB), x; \ //x = _mm_shuffle_epi8(x, shr);
AESENCLAST nibble_mask<>(SB), x; \ // AESNI instruction AESENCLAST ·nibble_mask(SB), x; \ // AESNI instruction
; \ //############################# outer affine ############################// ; \ //############################# outer affine ############################//
MOVOU x, z; \ MOVOU x, z; \
PANDN nibble_mask<>(SB), z; \ //z = _mm_andnot_si128(x, c0f); PANDN ·nibble_mask(SB), z; \ //z = _mm_andnot_si128(x, c0f);
MOVOU m2_low<>(SB), y; \ MOVOU ·m2_low(SB), y; \
PSHUFB z, y; \ //y = _mm_shuffle_epi8(m2l, z) PSHUFB z, y; \ //y = _mm_shuffle_epi8(m2l, z)
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4); PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f); PAND ·nibble_mask(SB), x; \ //x = _mm_and_si128(x, c0f);
MOVOU m2_high<>(SB), z; \ MOVOU ·m2_high(SB), z; \
PSHUFB x, z; \ PSHUFB x, z; \
MOVOU z, x; \ //x = _mm_shuffle_epi8(m2h, x) MOVOU z, x; \ //x = _mm_shuffle_epi8(m2h, x)
PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y; PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y;
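
The PSHUFB lookups in SM4_SBOX split each byte into its low and high nibble, index two 16-entry tables (m1_low/m1_high for the inner affine, m2_low/m2_high for the outer one) and XOR the results. A scalar sketch of that per-byte pattern in Go (names are illustrative; the table contents are the DATA values above):

package sm4sketch

// affineNibbles is the scalar shape of the PAND/PSRLQ/PSHUFB/PXOR steps:
// each byte's low nibble indexes one 16-entry table, its high nibble
// indexes another, and the two lookups are XORed together.
func affineNibbles(dst, src []byte, lo, hi *[16]byte) {
	for i, b := range src {
		dst[i] = lo[b&0x0f] ^ hi[b>>4]
	}
}
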
@ -143,12 +84,12 @@ GLOBL fk_mask<>(SB), 8, $16
SM4_SBOX(x, y, z); \ SM4_SBOX(x, y, z); \
; \ //#################### 4 parallel L1 linear transforms ##################// ; \ //#################### 4 parallel L1 linear transforms ##################//
MOVOU x, y; \ MOVOU x, y; \
PSHUFB r08_mask<>(SB), y; \ //y = x <<< 8 PSHUFB ·r08_mask(SB), y; \ //y = x <<< 8
MOVOU y, z; \ MOVOU y, z; \
PSHUFB r08_mask<>(SB), z; \ //z = x <<< 16 PSHUFB ·r08_mask(SB), z; \ //z = x <<< 16
PXOR x, y; \ //y = x ^ (x <<< 8) PXOR x, y; \ //y = x ^ (x <<< 8)
PXOR z, y; \ //y = x ^ (x <<< 8) ^ (x <<< 16) PXOR z, y; \ //y = x ^ (x <<< 8) ^ (x <<< 16)
PSHUFB r08_mask<>(SB), z; \ //z = x <<< 24 PSHUFB ·r08_mask(SB), z; \ //z = x <<< 24
PXOR z, x; \ //x = x ^ (x <<< 24) PXOR z, x; \ //x = x ^ (x <<< 24)
MOVOU y, z; \ MOVOU y, z; \
PSLLL $2, z; \ PSLLL $2, z; \
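
The shuffle-and-shift chain in this hunk builds the SM4 L1 linear transform out of byte rotations and 32-bit shifts. A scalar Go sketch of the transform it computes (function name is illustrative):

package sm4sketch

import "math/bits"

// l1 is the SM4 L1 linear transform:
// L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24).
// The macro derives it as (B ^ B<<<24) ^ ((B ^ B<<<8 ^ B<<<16) <<< 2),
// which is why only rotate-by-8 shuffles and a rotate-by-2 are needed.
func l1(b uint32) uint32 {
	return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
		bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
}
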
@ -214,7 +155,7 @@ GLOBL fk_mask<>(SB), 8, $16
// Requires: SSSE3 // Requires: SSSE3
#define SM4_SINGLE_BLOCK(RK, rk128, x, y, z, t0, t1, t2, t3) \ #define SM4_SINGLE_BLOCK(RK, rk128, x, y, z, t0, t1, t2, t3) \
PSHUFB flip_mask<>(SB), t0; \ PSHUFB ·flip_mask(SB), t0; \
PSHUFD $1, t0, t1; \ PSHUFD $1, t0, t1; \
PSHUFD $2, t0, t2; \ PSHUFD $2, t0, t2; \
PSHUFD $3, t0, t3; \ PSHUFD $3, t0, t3; \
@ -238,13 +179,13 @@ GLOBL fk_mask<>(SB), 8, $16
PALIGNR $4, t3, t2; \ PALIGNR $4, t3, t2; \
PALIGNR $4, t2, t1; \ PALIGNR $4, t2, t1; \
PALIGNR $4, t1, t0; \ PALIGNR $4, t1, t0; \
PSHUFB flip_mask<>(SB), t0 PSHUFB ·flip_mask(SB), t0
#define SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \ #define SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \
PSHUFB flip_mask<>(SB), t0; \ PSHUFB ·flip_mask(SB), t0; \
PSHUFB flip_mask<>(SB), t1; \ PSHUFB ·flip_mask(SB), t1; \
PSHUFB flip_mask<>(SB), t2; \ PSHUFB ·flip_mask(SB), t2; \
PSHUFB flip_mask<>(SB), t3; \ PSHUFB ·flip_mask(SB), t3; \
SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3) SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3)
#define SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3) \ #define SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3) \
@ -266,10 +207,10 @@ GLOBL fk_mask<>(SB), 8, $16
MOVOU (7*16)(RK), rk128; \ MOVOU (7*16)(RK), rk128; \
SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \ SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \ SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
PSHUFB bswap_mask<>(SB), t3; \ PSHUFB ·bswap_mask(SB), t3; \
PSHUFB bswap_mask<>(SB), t2; \ PSHUFB ·bswap_mask(SB), t2; \
PSHUFB bswap_mask<>(SB), t1; \ PSHUFB ·bswap_mask(SB), t1; \
PSHUFB bswap_mask<>(SB), t0 PSHUFB ·bswap_mask(SB), t0
#define SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \ #define SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
PSHUFD $0, rk128, x; \ PSHUFD $0, rk128, x; \
@ -290,14 +231,14 @@ GLOBL fk_mask<>(SB), 8, $16
SM4_ONE_ROUND_SSE(x, y, z, t7, t4, t5, t6); \ SM4_ONE_ROUND_SSE(x, y, z, t7, t4, t5, t6); \
#define SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \ #define SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
PSHUFB flip_mask<>(SB), t0; \ PSHUFB ·flip_mask(SB), t0; \
PSHUFB flip_mask<>(SB), t1; \ PSHUFB ·flip_mask(SB), t1; \
PSHUFB flip_mask<>(SB), t2; \ PSHUFB ·flip_mask(SB), t2; \
PSHUFB flip_mask<>(SB), t3; \ PSHUFB ·flip_mask(SB), t3; \
PSHUFB flip_mask<>(SB), t4; \ PSHUFB ·flip_mask(SB), t4; \
PSHUFB flip_mask<>(SB), t5; \ PSHUFB ·flip_mask(SB), t5; \
PSHUFB flip_mask<>(SB), t6; \ PSHUFB ·flip_mask(SB), t6; \
PSHUFB flip_mask<>(SB), t7; \ PSHUFB ·flip_mask(SB), t7; \
SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7)
#define SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \ #define SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
@ -321,14 +262,14 @@ GLOBL fk_mask<>(SB), 8, $16
SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \ SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \ SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \ SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \
PSHUFB bswap_mask<>(SB), t3; \ PSHUFB ·bswap_mask(SB), t3; \
PSHUFB bswap_mask<>(SB), t2; \ PSHUFB ·bswap_mask(SB), t2; \
PSHUFB bswap_mask<>(SB), t1; \ PSHUFB ·bswap_mask(SB), t1; \
PSHUFB bswap_mask<>(SB), t0; \ PSHUFB ·bswap_mask(SB), t0; \
PSHUFB bswap_mask<>(SB), t7; \ PSHUFB ·bswap_mask(SB), t7; \
PSHUFB bswap_mask<>(SB), t6; \ PSHUFB ·bswap_mask(SB), t6; \
PSHUFB bswap_mask<>(SB), t5; \ PSHUFB ·bswap_mask(SB), t5; \
PSHUFB bswap_mask<>(SB), t4 PSHUFB ·bswap_mask(SB), t4
// SM4 sbox function, AVX version // SM4 sbox function, AVX version
// parameters: // parameters:
@ -336,22 +277,22 @@ GLOBL fk_mask<>(SB), 8, $16
// - y: 128 bits temp register // - y: 128 bits temp register
// - tmp: 128 bits temp register // - tmp: 128 bits temp register
#define AVX_SM4_SBOX(x, y, tmp) \ #define AVX_SM4_SBOX(x, y, tmp) \
VPAND nibble_mask<>(SB), x, tmp; \ VPAND ·nibble_mask(SB), x, tmp; \
VMOVDQU m1_low<>(SB), y; \ VMOVDQU ·m1_low(SB), y; \
VPSHUFB tmp, y, y; \ VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \ VPSRLQ $4, x, x; \
VPAND nibble_mask<>(SB), x, x; \ VPAND ·nibble_mask(SB), x, x; \
VMOVDQU m1_high<>(SB), tmp; \ VMOVDQU ·m1_high(SB), tmp; \
VPSHUFB x, tmp, x; \ VPSHUFB x, tmp, x; \
VPXOR y, x, x; \ VPXOR y, x, x; \
VPSHUFB inverse_shift_rows<>(SB), x, x; \ VPSHUFB ·inverse_shift_rows(SB), x, x; \
VAESENCLAST nibble_mask<>(SB), x, x; \ VAESENCLAST ·nibble_mask(SB), x, x; \
VPANDN nibble_mask<>(SB), x, tmp; \ VPANDN ·nibble_mask(SB), x, tmp; \
VMOVDQU m2_low<>(SB), y; \ VMOVDQU ·m2_low(SB), y; \
VPSHUFB tmp, y, y; \ VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \ VPSRLQ $4, x, x; \
VPAND nibble_mask<>(SB), x, x; \ VPAND ·nibble_mask(SB), x, x; \
VMOVDQU m2_high<>(SB), tmp; \ VMOVDQU ·m2_high(SB), tmp; \
VPSHUFB x, tmp, x; \ VPSHUFB x, tmp, x; \
VPXOR y, x, x VPXOR y, x, x
@ -362,11 +303,11 @@ GLOBL fk_mask<>(SB), 8, $16
// - tmp: 128 bits temp register // - tmp: 128 bits temp register
#define AVX_SM4_TAO_L1(x, y, tmp) \ #define AVX_SM4_TAO_L1(x, y, tmp) \
AVX_SM4_SBOX(x, y, tmp); \ AVX_SM4_SBOX(x, y, tmp); \
VPSHUFB r08_mask<>(SB), x, y; \ // y = x <<< 8 VPSHUFB ·r08_mask(SB), x, y; \ // y = x <<< 8
VPSHUFB r08_mask<>(SB), y, tmp; \ // tmp = x <<< 16 VPSHUFB ·r08_mask(SB), y, tmp; \ // tmp = x <<< 16
VPXOR x, y, y; \ // y = x ^ (x <<< 8) VPXOR x, y, y; \ // y = x ^ (x <<< 8)
VPXOR tmp, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16) VPXOR tmp, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16)
VPSHUFB r08_mask<>(SB), tmp, tmp; \ // tmp = x <<< 24 VPSHUFB ·r08_mask(SB), tmp, tmp; \ // tmp = x <<< 24
VPXOR x, tmp, x; \ // x = x ^ (x <<< 24) VPXOR x, tmp, x; \ // x = x ^ (x <<< 24)
VPSLLD $2, y, tmp; \ VPSLLD $2, y, tmp; \
VPSRLD $30, y, y; \ VPSRLD $30, y, y; \
@ -429,10 +370,10 @@ GLOBL fk_mask<>(SB), 8, $16
SM4_ONE_ROUND_AVX(x, y, z, t3, t0, t1, t2); \ SM4_ONE_ROUND_AVX(x, y, z, t3, t0, t1, t2); \
#define AVX_SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \ #define AVX_SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \
VPSHUFB flip_mask<>(SB), t0, t0 \ VPSHUFB ·flip_mask(SB), t0, t0 \
VPSHUFB flip_mask<>(SB), t1, t1 \ VPSHUFB ·flip_mask(SB), t1, t1 \
VPSHUFB flip_mask<>(SB), t2, t2 \ VPSHUFB ·flip_mask(SB), t2, t2 \
VPSHUFB flip_mask<>(SB), t3, t3 \ VPSHUFB ·flip_mask(SB), t3, t3 \
; \ ; \
AVX_SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3) AVX_SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3)
@ -456,10 +397,10 @@ GLOBL fk_mask<>(SB), 8, $16
SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \ SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
; \ // Transpose matrix 4 x 4 32bits word ; \ // Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \ TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
VPSHUFB bswap_mask<>(SB), t0, t0 \ VPSHUFB ·bswap_mask(SB), t0, t0 \
VPSHUFB bswap_mask<>(SB), t1, t1 \ VPSHUFB ·bswap_mask(SB), t1, t1 \
VPSHUFB bswap_mask<>(SB), t2, t2 \ VPSHUFB ·bswap_mask(SB), t2, t2 \
VPSHUFB bswap_mask<>(SB), t3, t3 \ VPSHUFB ·bswap_mask(SB), t3, t3 \
#define SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \ #define SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
VPSHUFD $0, rk128, x; \ VPSHUFD $0, rk128, x; \
@ -480,14 +421,14 @@ GLOBL fk_mask<>(SB), 8, $16
SM4_ONE_ROUND_AVX(x, y, z, t7, t4, t5, t6); \ SM4_ONE_ROUND_AVX(x, y, z, t7, t4, t5, t6); \
#define AVX_SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \ #define AVX_SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
VPSHUFB flip_mask<>(SB), t0, t0 \ VPSHUFB ·flip_mask(SB), t0, t0 \
VPSHUFB flip_mask<>(SB), t1, t1 \ VPSHUFB ·flip_mask(SB), t1, t1 \
VPSHUFB flip_mask<>(SB), t2, t2 \ VPSHUFB ·flip_mask(SB), t2, t2 \
VPSHUFB flip_mask<>(SB), t3, t3 \ VPSHUFB ·flip_mask(SB), t3, t3 \
VPSHUFB flip_mask<>(SB), t4, t4 \ VPSHUFB ·flip_mask(SB), t4, t4 \
VPSHUFB flip_mask<>(SB), t5, t5 \ VPSHUFB ·flip_mask(SB), t5, t5 \
VPSHUFB flip_mask<>(SB), t6, t6 \ VPSHUFB ·flip_mask(SB), t6, t6 \
VPSHUFB flip_mask<>(SB), t7, t7 \ VPSHUFB ·flip_mask(SB), t7, t7 \
; \ ; \
AVX_SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) AVX_SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7)
@ -513,14 +454,14 @@ GLOBL fk_mask<>(SB), 8, $16
; \ // Transpose matrix 4 x 4 32bits word ; \ // Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \ TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \ TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \
VPSHUFB bswap_mask<>(SB), t0, t0 \ VPSHUFB ·bswap_mask(SB), t0, t0 \
VPSHUFB bswap_mask<>(SB), t1, t1 \ VPSHUFB ·bswap_mask(SB), t1, t1 \
VPSHUFB bswap_mask<>(SB), t2, t2 \ VPSHUFB ·bswap_mask(SB), t2, t2 \
VPSHUFB bswap_mask<>(SB), t3, t3 \ VPSHUFB ·bswap_mask(SB), t3, t3 \
VPSHUFB bswap_mask<>(SB), t4, t4 \ VPSHUFB ·bswap_mask(SB), t4, t4 \
VPSHUFB bswap_mask<>(SB), t5, t5 \ VPSHUFB ·bswap_mask(SB), t5, t5 \
VPSHUFB bswap_mask<>(SB), t6, t6 \ VPSHUFB ·bswap_mask(SB), t6, t6 \
VPSHUFB bswap_mask<>(SB), t7, t7 \ VPSHUFB ·bswap_mask(SB), t7, t7 \
// SM4 sbox function, AVX2 version // SM4 sbox function, AVX2 version
// parameters: // parameters:
@ -533,24 +474,24 @@ GLOBL fk_mask<>(SB), 8, $16
// - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier. // - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier.
#define AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \ #define AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
VPAND yNibbleMask, x, z; \ VPAND yNibbleMask, x, z; \
VMOVDQU m1_low<>(SB), y; \ VMOVDQU ·m1_low(SB), y; \
VPSHUFB z, y, y; \ VPSHUFB z, y, y; \
VPSRLQ $4, x, x; \ VPSRLQ $4, x, x; \
VPAND yNibbleMask, x, x; \ VPAND yNibbleMask, x, x; \
VMOVDQU m1_high<>(SB), z; \ VMOVDQU ·m1_high(SB), z; \
VPSHUFB x, z, x; \ VPSHUFB x, z, x; \
VPXOR y, x, x; \ VPXOR y, x, x; \
VPSHUFB inverse_shift_rows<>(SB), x, x; \ VPSHUFB ·inverse_shift_rows(SB), x, x; \
VEXTRACTI128 $1, x, yw \ VEXTRACTI128 $1, x, yw \
VAESENCLAST xNibbleMask, xw, xw; \ VAESENCLAST xNibbleMask, xw, xw; \
VAESENCLAST xNibbleMask, yw, yw; \ VAESENCLAST xNibbleMask, yw, yw; \
VINSERTI128 $1, yw, x, x; \ VINSERTI128 $1, yw, x, x; \
VPANDN yNibbleMask, x, z; \ VPANDN yNibbleMask, x, z; \
VMOVDQU m2_low<>(SB), y; \ VMOVDQU ·m2_low(SB), y; \
VPSHUFB z, y, y; \ VPSHUFB z, y, y; \
VPSRLQ $4, x, x; \ VPSRLQ $4, x, x; \
VPAND yNibbleMask, x, x; \ VPAND yNibbleMask, x, x; \
VMOVDQU m2_high<>(SB), z; \ VMOVDQU ·m2_high(SB), z; \
VPSHUFB x, z, x; \ VPSHUFB x, z, x; \
VPXOR y, x, x VPXOR y, x, x
@ -565,11 +506,11 @@ GLOBL fk_mask<>(SB), 8, $16
// - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier. // - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier.
#define AVX2_SM4_TAO_L1(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \ #define AVX2_SM4_TAO_L1(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask); \ AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask); \
VPSHUFB r08_mask<>(SB), x, y; \ // y = x <<< 8 VPSHUFB ·r08_mask(SB), x, y; \ // y = x <<< 8
VPSHUFB r08_mask<>(SB), y, z; \ // z = x <<< 16 VPSHUFB ·r08_mask(SB), y, z; \ // z = x <<< 16
VPXOR x, y, y; \ // y = x ^ (x <<< 8) VPXOR x, y, y; \ // y = x ^ (x <<< 8)
VPXOR z, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16) VPXOR z, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16)
VPSHUFB r08_mask<>(SB), z, z; \ // z = x <<< 24 VPSHUFB ·r08_mask(SB), z, z; \ // z = x <<< 24
VPXOR x, z, x; \ // x = x ^ (x <<< 24) VPXOR x, z, x; \ // x = x ^ (x <<< 24)
VPSLLD $2, y, z; \ VPSLLD $2, y, z; \
VPSRLD $30, y, y; \ VPSRLD $30, y, y; \

View File

@ -1,37 +1,9 @@
// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), (16+8), $16
// Affine transform 1 & 2 (low and high nibbles)
DATA m1_2<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1_2<>+0x08(SB)/8, $0x3045F98CEF9A2653
DATA m1_2<>+0x10(SB)/8, $0xC35BF46CAF379800
DATA m1_2<>+0x18(SB)/8, $0x68F05FC7049C33AB
DATA m1_2<>+0x20(SB)/8, $0x9A950A05FEF16E61
DATA m1_2<>+0x28(SB)/8, $0x0E019E916A65FAF5
DATA m1_2<>+0x30(SB)/8, $0x892D69CD44E0A400
DATA m1_2<>+0x38(SB)/8, $0x2C88CC68E14501A5
GLOBL m1_2<>(SB), (16+8), $64
// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), (16+8), $16
DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), (16+8), $16
#define LOAD_SM4_AESNI_CONSTS() \ #define LOAD_SM4_AESNI_CONSTS() \
MOVW $0x0F0F0F0F, R20 \ MOVW $0x0F0F0F0F, R20 \
VDUP R20, NIBBLE_MASK.S4 \ VDUP R20, NIBBLE_MASK.S4 \
MOVD $m1_2<>(SB), R20 \ MOVD $·rcon(SB), R20 \
VLD1 (R20), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \ VLD1.P 64(R20), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \
MOVD $inverse_shift_rows<>(SB), R20 \ VLD1 (R20), [R08_MASK.B16, INVERSE_SHIFT_ROWS.B16]
VLD1 (R20), [INVERSE_SHIFT_ROWS.B16] \
MOVD $r08_mask<>(SB), R20 \
VLD1 (R20), [R08_MASK.B16] \
// input: from high to low // input: from high to low
// t0 = t0.S3, t0.S2, t0.S1, t0.S0 // t0 = t0.S3, t0.S2, t0.S1, t0.S0

View File

@ -13,6 +13,61 @@
#define XTMP6 X10 #define XTMP6 X10
#define XTMP7 X11 #define XTMP7 X11
// shuffle byte order from LE to BE
DATA ·flip_mask+0x00(SB)/8, $0x0405060700010203
DATA ·flip_mask+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL ·flip_mask(SB), RODATA, $16
// shuffle byte and word order
DATA ·bswap_mask+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA ·bswap_mask+0x08(SB)/8, $0x0001020304050607
GLOBL ·bswap_mask(SB), RODATA, $16
//nibble mask
DATA ·nibble_mask+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA ·nibble_mask+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL ·nibble_mask(SB), RODATA, $16
// inverse shift rows
DATA ·inverse_shift_rows+0x00(SB)/8, $0x0B0E0104070A0D00
DATA ·inverse_shift_rows+0x08(SB)/8, $0x0306090C0F020508
DATA ·inverse_shift_rows+0x10(SB)/8, $0x0B0E0104070A0D00
DATA ·inverse_shift_rows+0x18(SB)/8, $0x0306090C0F020508
GLOBL ·inverse_shift_rows(SB), RODATA, $32
// Affine transform 1 (low and high nibbles)
DATA ·m1_low+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA ·m1_low+0x08(SB)/8, $0x3045F98CEF9A2653
DATA ·m1_low+0x10(SB)/8, $0x0A7FC3B6D5A01C69
DATA ·m1_low+0x18(SB)/8, $0x3045F98CEF9A2653
GLOBL ·m1_low(SB), RODATA, $32
DATA ·m1_high+0x00(SB)/8, $0xC35BF46CAF379800
DATA ·m1_high+0x08(SB)/8, $0x68F05FC7049C33AB
DATA ·m1_high+0x10(SB)/8, $0xC35BF46CAF379800
DATA ·m1_high+0x18(SB)/8, $0x68F05FC7049C33AB
GLOBL ·m1_high(SB), RODATA, $32
// Affine transform 2 (low and high nibbles)
DATA ·m2_low+0x00(SB)/8, $0x9A950A05FEF16E61
DATA ·m2_low+0x08(SB)/8, $0x0E019E916A65FAF5
DATA ·m2_low+0x10(SB)/8, $0x9A950A05FEF16E61
DATA ·m2_low+0x18(SB)/8, $0x0E019E916A65FAF5
GLOBL ·m2_low(SB), RODATA, $32
DATA ·m2_high+0x00(SB)/8, $0x892D69CD44E0A400
DATA ·m2_high+0x08(SB)/8, $0x2C88CC68E14501A5
DATA ·m2_high+0x10(SB)/8, $0x892D69CD44E0A400
DATA ·m2_high+0x18(SB)/8, $0x2C88CC68E14501A5
GLOBL ·m2_high(SB), RODATA, $32
// left rotations of 32-bit words by 8-bit increments
DATA ·r08_mask+0x00(SB)/8, $0x0605040702010003
DATA ·r08_mask+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA ·r08_mask+0x10(SB)/8, $0x0605040702010003
DATA ·r08_mask+0x18(SB)/8, $0x0E0D0C0F0A09080B
GLOBL ·r08_mask(SB), RODATA, $32
#include "aesni_macros_amd64.s" #include "aesni_macros_amd64.s"
// SM4 TAO L2 function, used for key expand // SM4 TAO L2 function, used for key expand
@ -105,8 +160,8 @@ TEXT ·expandKeyAsm(SB),NOSPLIT,$0
MOVQ dec+24(FP), DI MOVQ dec+24(FP), DI
MOVUPS 0(AX), t0 MOVUPS 0(AX), t0
PSHUFB flip_mask<>(SB), t0 PSHUFB ·flip_mask(SB), t0
PXOR fk_mask<>(SB), t0 PXOR ·fk(SB), t0
PSHUFD $1, t0, t1 PSHUFD $1, t0, t1
PSHUFD $2, t0, t2 PSHUFD $2, t0, t2
PSHUFD $3, t0, t3 PSHUFD $3, t0, t3
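
This is the start of the key schedule: the 128-bit user key is byte-swapped to big-endian words and XORed with the SM4 system parameter FK before the round-key loop. A scalar sketch of that step in Go (the FK words are the ones stored in the fk data; the helper name is illustrative):

package sm4sketch

import "encoding/binary"

// fk holds the SM4 system parameter FK0..FK3, the same words packed into
// the fk_mask/·rcon data.
var fk = [4]uint32{0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc}

// initKeyWords mirrors the PSHUFB flip_mask + PXOR fk step: load the
// 16-byte key as big-endian words and XOR in FK before key expansion.
func initKeyWords(key []byte) (k [4]uint32) {
	for i := 0; i < 4; i++ {
		k[i] = binary.BigEndian.Uint32(key[4*i:]) ^ fk[i]
	}
	return
}
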
@ -225,7 +280,7 @@ avx_done_sm4:
RET RET
avx2: avx2:
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
CMPQ DI, $256 CMPQ DI, $256
JEQ avx2_16blocks JEQ avx2_16blocks
@ -235,7 +290,7 @@ avx2_8blocks:
VMOVDQU 32(DX), XDWORD1 VMOVDQU 32(DX), XDWORD1
VMOVDQU 64(DX), XDWORD2 VMOVDQU 64(DX), XDWORD2
VMOVDQU 96(DX), XDWORD3 VMOVDQU 96(DX), XDWORD3
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK VBROADCASTI128 ·flip_mask(SB), BYTE_FLIP_MASK
// Apply Byte Flip Mask: LE -> BE // Apply Byte Flip Mask: LE -> BE
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0 VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
@ -251,7 +306,7 @@ avx2_8blocks:
// Transpose matrix 4 x 4 32bits word // Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2) TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK VBROADCASTI128 ·bswap_mask(SB), BYTE_FLIP_MASK
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0 VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1 VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2 VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
@ -275,7 +330,7 @@ avx2_16blocks:
VMOVDQU 192(DX), XDWORD6 VMOVDQU 192(DX), XDWORD6
VMOVDQU 224(DX), XDWORD7 VMOVDQU 224(DX), XDWORD7
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK VBROADCASTI128 ·flip_mask(SB), BYTE_FLIP_MASK
// Apply Byte Flip Mask: LE -> BE // Apply Byte Flip Mask: LE -> BE
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0 VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
@ -297,7 +352,7 @@ avx2_16blocks:
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2) TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2) TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)
VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK VBROADCASTI128 ·bswap_mask(SB), BYTE_FLIP_MASK
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0 VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1 VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2 VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
@ -328,7 +383,7 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
MOVQ src+16(FP), DX MOVQ src+16(FP), DX
MOVUPS (DX), t0 MOVUPS (DX), t0
PSHUFB flip_mask<>(SB), t0 PSHUFB ·flip_mask(SB), t0
PSHUFD $1, t0, t1 PSHUFD $1, t0, t1
PSHUFD $2, t0, t2 PSHUFD $2, t0, t2
PSHUFD $3, t0, t3 PSHUFD $3, t0, t3
@ -353,7 +408,7 @@ loop:
PUNPCKLLQ t2, t3 PUNPCKLLQ t2, t3
PUNPCKLLQ t0, t1 PUNPCKLLQ t0, t1
PUNPCKLQDQ t1, t3 PUNPCKLQDQ t1, t3
PSHUFB flip_mask<>(SB), t3 PSHUFB ·flip_mask(SB), t3
MOVUPS t3, (BX) MOVUPS t3, (BX)
done_sm4: done_sm4:
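
As a reading aid for the single-block path, here is a sketch in Go of the data flow encryptBlockAsm follows: four big-endian words in, 32 rounds, and the last four state words stored in reverse order. The round transform T (S-box layer followed by the L1 mix above) is passed in as a function; names are illustrative:

package sm4sketch

import "encoding/binary"

// encryptBlockWith runs the SM4 state recurrence
// X[i+4] = X[i] ^ T(X[i+1] ^ X[i+2] ^ X[i+3] ^ rk[i])
// for 32 rounds and writes X[35..32] as the output block.
func encryptBlockWith(t func(uint32) uint32, rk *[32]uint32, dst, src []byte) {
	var x [36]uint32
	for i := 0; i < 4; i++ {
		x[i] = binary.BigEndian.Uint32(src[4*i:])
	}
	for i := 0; i < 32; i++ {
		x[i+4] = x[i] ^ t(x[i+1]^x[i+2]^x[i+3]^rk[i])
	}
	for i := 0; i < 4; i++ {
		binary.BigEndian.PutUint32(dst[4*i:], x[35-i])
	}
}
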

View File

@ -20,10 +20,26 @@
#define M2H V23 #define M2H V23
#define R08_MASK V24 #define R08_MASK V24
#define INVERSE_SHIFT_ROWS V25 #define INVERSE_SHIFT_ROWS V25
#define NIBBLE_MASK V26 #define FK_MASK V26
#define FK_MASK V27 #define NIBBLE_MASK V27
#define ZERO V28 #define ZERO V28
DATA ·rcon+0x00(SB)/8, $0x0A7FC3B6D5A01C69 // m1l
DATA ·rcon+0x08(SB)/8, $0x3045F98CEF9A2653
DATA ·rcon+0x10(SB)/8, $0xC35BF46CAF379800 // m1h
DATA ·rcon+0x18(SB)/8, $0x68F05FC7049C33AB
DATA ·rcon+0x20(SB)/8, $0x9A950A05FEF16E61 // m2l
DATA ·rcon+0x28(SB)/8, $0x0E019E916A65FAF5
DATA ·rcon+0x30(SB)/8, $0x892D69CD44E0A400 // m2h
DATA ·rcon+0x38(SB)/8, $0x2C88CC68E14501A5
DATA ·rcon+0x40(SB)/8, $0x0605040702010003 // left rotations of 32-bit words by 8-bit increments
DATA ·rcon+0x48(SB)/8, $0x0E0D0C0F0A09080B
DATA ·rcon+0x50(SB)/8, $0x0B0E0104070A0D00 // inverse shift rows
DATA ·rcon+0x58(SB)/8, $0x0306090C0F020508
DATA ·rcon+0x60(SB)/8, $0x56aa3350a3b1bac6 // fk
DATA ·rcon+0x68(SB)/8, $0xb27022dc677d9197
GLOBL ·rcon(SB), RODATA, $112
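
The arm64 constants are now packed into a single package-level ·rcon table so that one base register and two sequential VLD1 loads can fetch them. A sketch of the byte offsets inside that table as Go constants (constant names are illustrative; the layout is the DATA block above):

package sm4sketch

// Byte offsets inside the packed ·rcon table: four 16-byte affine tables,
// the rotate-by-8 shuffle mask, the inverse ShiftRows mask, and FK.
const (
	rconM1Low       = 0x00
	rconM1High      = 0x10
	rconM2Low       = 0x20
	rconM2High      = 0x30
	rconR08Mask     = 0x40
	rconInvShiftRow = 0x50
	rconFK          = 0x60
	rconSize        = 0x70 // 112 bytes, i.e. the $112 in the GLOBL
)
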
#include "aesni_macros_arm64.s" #include "aesni_macros_arm64.s"
#define SM4_TAO_L2(x, y) \ #define SM4_TAO_L2(x, y) \
@ -49,14 +65,11 @@
MOVW.P R2, -4(R11) MOVW.P R2, -4(R11)
#define LOAD_SM4KEY_AESNI_CONSTS() \ #define LOAD_SM4KEY_AESNI_CONSTS() \
MOVW $0x0F0F0F0F, R0 \ MOVW $0x0F0F0F0F, R0 \
VDUP R0, NIBBLE_MASK.S4 \ VDUP R0, NIBBLE_MASK.S4 \
MOVD $m1_2<>(SB), R0 \ MOVD $·rcon(SB), R0 \
VLD1 (R0), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \ VLD1.P 64(R0), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \
MOVD $fk_mask<>(SB), R0 \ VLD1 (R0), [R08_MASK.B16, INVERSE_SHIFT_ROWS.B16, FK.B16]
VLD1 (R0), [FK_MASK.B16] \
MOVD $inverse_shift_rows<>(SB), R0 \
VLD1 (R0), [INVERSE_SHIFT_ROWS.B16]
#define SM4EKEY_EXPORT_KEYS() \ #define SM4EKEY_EXPORT_KEYS() \
VREV64 V8.S4, V11.S4 \ VREV64 V8.S4, V11.S4 \

View File

@ -360,9 +360,9 @@ avxCbcSm4Done:
RET RET
avx2Start: avx2Start:
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK VBROADCASTI128 ·flip_mask(SB), BYTE_FLIP_MASK
VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK VBROADCASTI128 ·bswap_mask(SB), BSWAP_MASK
VMOVDQU -16(DX), X15 VMOVDQU -16(DX), X15

View File

@ -16,15 +16,15 @@
#define t7 V13 #define t7 V13
#define IV V18 #define IV V18
#define LAST_BLOCK V15
#define ZERO V16 #define ZERO V16
#define NIBBLE_MASK V20 #define M1L V20
#define INVERSE_SHIFT_ROWS V21 #define M1H V21
#define M1L V22 #define M2L V22
#define M1H V23 #define M2H V23
#define M2L V24 #define R08_MASK V24
#define M2H V25 #define INVERSE_SHIFT_ROWS V25
#define R08_MASK V26 #define NIBBLE_MASK V26
#define FK_MASK V27
#include "aesni_macros_arm64.s" #include "aesni_macros_arm64.s"
@ -49,7 +49,7 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
ADD srcPtr, srcPtrLen, R10 ADD srcPtr, srcPtrLen, R10
SUB $16, R10, R10 SUB $16, R10, R10
VLD1 (R10), [V15.S4] VLD1 (R10), [LAST_BLOCK.S4]
cbcSm4Octets: cbcSm4Octets:
CMP $128, srcPtrLen CMP $128, srcPtrLen
@ -293,5 +293,5 @@ cbc4BlocksLoop48:
VST1 [t0.S4, t1.S4, t2.S4], (dstPtr) VST1 [t0.S4, t1.S4, t2.S4], (dstPtr)
cbcSm4Done: cbcSm4Done:
VST1 [V15.S4], (R6) VST1 [LAST_BLOCK.S4], (R6)
RET RET

View File

@ -219,9 +219,9 @@ avxEcbSm4Done:
RET RET
avx2_start: avx2_start:
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK VBROADCASTI128 ·flip_mask(SB), BYTE_FLIP_MASK
VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK VBROADCASTI128 ·bswap_mask(SB), BSWAP_MASK
avx2_16blocks: avx2_16blocks:
CMPQ DI, $256 CMPQ DI, $256

View File

@ -8,15 +8,6 @@
#define t1 V3 #define t1 V3
#define t2 V4 #define t2 V4
#define t3 V5 #define t3 V5
#define ZERO V16
#define NIBBLE_MASK V20
#define INVERSE_SHIFT_ROWS V21
#define M1L V22
#define M1H V23
#define M2L V24
#define M2H V25
#define R08_MASK V26
#define FK_MASK V27
#define XTMP6 V6 #define XTMP6 V6
#define XTMP7 V7 #define XTMP7 V7
#define t4 V10 #define t4 V10
@ -24,6 +15,15 @@
#define t6 V12 #define t6 V12
#define t7 V13 #define t7 V13
#define ZERO V16
#define M1L V20
#define M1H V21
#define M2L V22
#define M2H V23
#define R08_MASK V24
#define INVERSE_SHIFT_ROWS V25
#define NIBBLE_MASK V26
#include "aesni_macros_arm64.s" #include "aesni_macros_arm64.s"
// func encryptSm4Ecb(xk *uint32, dst, src []byte) // func encryptSm4Ecb(xk *uint32, dst, src []byte)

View File

@ -95,7 +95,7 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
MOVOU (tPtr), ACC0 MOVOU (tPtr), ACC0
MOVOU (tMsk), T2 MOVOU (tMsk), T2
MOVOU bswap_mask<>(SB), BSWAP MOVOU ·bswap_mask(SB), BSWAP
MOVOU gcmPoly<>(SB), POLY MOVOU gcmPoly<>(SB), POLY
SHLQ $3, plen SHLQ $3, plen
@ -279,7 +279,7 @@ TEXT ·gcmSm4Data(SB),NOSPLIT,$0
PXOR ACC0, ACC0 PXOR ACC0, ACC0
// MOVOU (tPtr), ACC0 // originally we passed in tag initial value // MOVOU (tPtr), ACC0 // originally we passed in tag initial value
MOVOU bswap_mask<>(SB), BSWAP MOVOU ·bswap_mask(SB), BSWAP
MOVOU gcmPoly<>(SB), POLY MOVOU gcmPoly<>(SB), POLY
TESTQ autLen, autLen TESTQ autLen, autLen
@ -527,14 +527,14 @@ TEXT ·gcmSm4Enc(SB),0,$256-96
CMPB ·useAVX(SB), $1 CMPB ·useAVX(SB), $1
JE avxGcmSm4Enc JE avxGcmSm4Enc
MOVOU bswap_mask<>(SB), BSWAP MOVOU ·bswap_mask(SB), BSWAP
MOVOU gcmPoly<>(SB), POLY MOVOU gcmPoly<>(SB), POLY
MOVOU (tPtr), ACC0 MOVOU (tPtr), ACC0
PXOR ACC1, ACC1 PXOR ACC1, ACC1
PXOR ACCM, ACCM PXOR ACCM, ACCM
MOVOU (ctrPtr), T0 MOVOU (ctrPtr), T0
PSHUFB flip_mask<>(SB), T0 PSHUFB ·flip_mask(SB), T0
PEXTRD $3, T0, aluCTR PEXTRD $3, T0, aluCTR
MOVOU T0, (8*16 + 0*16)(SP) MOVOU T0, (8*16 + 0*16)(SP)
@ -870,14 +870,14 @@ gcmSm4EncDone:
RET RET
avxGcmSm4Enc: avxGcmSm4Enc:
VMOVDQU bswap_mask<>(SB), BSWAP VMOVDQU ·bswap_mask(SB), BSWAP
VMOVDQU gcmPoly<>(SB), POLY VMOVDQU gcmPoly<>(SB), POLY
VMOVDQU (tPtr), ACC0 VMOVDQU (tPtr), ACC0
VPXOR ACC1, ACC1, ACC1 VPXOR ACC1, ACC1, ACC1
VPXOR ACCM, ACCM, ACCM VPXOR ACCM, ACCM, ACCM
VMOVDQU (ctrPtr), T0 VMOVDQU (ctrPtr), T0
VPSHUFB flip_mask<>(SB), T0, T0 VPSHUFB ·flip_mask(SB), T0, T0
VPEXTRD $3, T0, aluCTR VPEXTRD $3, T0, aluCTR
VMOVDQU T0, (8*16 + 0*16)(SP) VMOVDQU T0, (8*16 + 0*16)(SP)
@ -1198,14 +1198,14 @@ avxGcmSm4EncDone:
RET RET
avx2GcmSm4Enc: avx2GcmSm4Enc:
VMOVDQU bswap_mask<>(SB), BSWAP VMOVDQU ·bswap_mask(SB), BSWAP
VMOVDQU gcmPoly<>(SB), POLY VMOVDQU gcmPoly<>(SB), POLY
VMOVDQU (tPtr), ACC0 VMOVDQU (tPtr), ACC0
VPXOR ACC1, ACC1, ACC1 VPXOR ACC1, ACC1, ACC1
VPXOR ACCM, ACCM, ACCM VPXOR ACCM, ACCM, ACCM
VMOVDQU (ctrPtr), T0 VMOVDQU (ctrPtr), T0
VPSHUFB flip_mask<>(SB), T0, T0 VPSHUFB ·flip_mask(SB), T0, T0
VPEXTRD $3, T0, aluCTR VPEXTRD $3, T0, aluCTR
VINSERTI128 $1, T0, Y11, Y11 VINSERTI128 $1, T0, Y11, Y11
@ -1228,7 +1228,7 @@ avx2GcmSm4Enc:
increment(6) increment(6)
increment(7) increment(7)
VBROADCASTI128 bswap_mask<>(SB), DWBSWAP VBROADCASTI128 ·bswap_mask(SB), DWBSWAP
// load 8 ctrs for encryption // load 8 ctrs for encryption
VMOVDQU (4*32 + 0*32)(SP), DWB0 VMOVDQU (4*32 + 0*32)(SP), DWB0
VMOVDQU (4*32 + 1*32)(SP), DWB1 VMOVDQU (4*32 + 1*32)(SP), DWB1
@ -1239,7 +1239,7 @@ avx2GcmSm4Enc:
// Transpose matrix 4 x 4 32bits word // Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD) TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
increment(1) increment(1)
AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP0, DWB0, DWB1, DWB2, DWB3) AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP0, DWB0, DWB1, DWB2, DWB3)
increment(2) increment(2)
@ -1613,14 +1613,14 @@ TEXT ·gcmSm4Dec(SB),0,$128-96
CMPB ·useAVX(SB), $1 CMPB ·useAVX(SB), $1
JE avxGcmSm4Dec JE avxGcmSm4Dec
MOVOU bswap_mask<>(SB), BSWAP MOVOU ·bswap_mask(SB), BSWAP
MOVOU gcmPoly<>(SB), POLY MOVOU gcmPoly<>(SB), POLY
MOVOU (tPtr), ACC0 MOVOU (tPtr), ACC0
PXOR ACC1, ACC1 PXOR ACC1, ACC1
PXOR ACCM, ACCM PXOR ACCM, ACCM
MOVOU (ctrPtr), T0 MOVOU (ctrPtr), T0
PSHUFB flip_mask<>(SB), T0 PSHUFB ·flip_mask(SB), T0
PEXTRD $3, T0, aluCTR PEXTRD $3, T0, aluCTR
MOVOU T0, (0*16)(SP) MOVOU T0, (0*16)(SP)
@ -1841,14 +1841,14 @@ gcmSm4DecDone:
RET RET
avxGcmSm4Dec: avxGcmSm4Dec:
VMOVDQU bswap_mask<>(SB), BSWAP VMOVDQU ·bswap_mask(SB), BSWAP
VMOVDQU gcmPoly<>(SB), POLY VMOVDQU gcmPoly<>(SB), POLY
VMOVDQU (tPtr), ACC0 VMOVDQU (tPtr), ACC0
VPXOR ACC1, ACC1, ACC1 VPXOR ACC1, ACC1, ACC1
VPXOR ACCM, ACCM, ACCM VPXOR ACCM, ACCM, ACCM
VMOVDQU (ctrPtr), T0 VMOVDQU (ctrPtr), T0
VPSHUFB flip_mask<>(SB), T0, T0 VPSHUFB ·flip_mask(SB), T0, T0
VPEXTRD $3, T0, aluCTR VPEXTRD $3, T0, aluCTR
VMOVDQU T0, (0*16)(SP) VMOVDQU T0, (0*16)(SP)
@ -2065,14 +2065,14 @@ avxGcmSm4DecDone:
RET RET
avx2GcmSm4Dec: avx2GcmSm4Dec:
VMOVDQU bswap_mask<>(SB), BSWAP VMOVDQU ·bswap_mask(SB), BSWAP
VMOVDQU gcmPoly<>(SB), POLY VMOVDQU gcmPoly<>(SB), POLY
VMOVDQU (tPtr), ACC0 VMOVDQU (tPtr), ACC0
VPXOR ACC1, ACC1, ACC1 VPXOR ACC1, ACC1, ACC1
VPXOR ACCM, ACCM, ACCM VPXOR ACCM, ACCM, ACCM
VMOVDQU (ctrPtr), T0 VMOVDQU (ctrPtr), T0
VPSHUFB flip_mask<>(SB), T0, T0 VPSHUFB ·flip_mask(SB), T0, T0
VPEXTRD $3, T0, aluCTR VPEXTRD $3, T0, aluCTR
VINSERTI128 $1, T0, Y11, Y11 VINSERTI128 $1, T0, Y11, Y11
@ -2094,8 +2094,8 @@ avx2GcmSm4Dec:
increment(6) increment(6)
increment(7) increment(7)
VBROADCASTI128 bswap_mask<>(SB), DWBSWAP VBROADCASTI128 ·bswap_mask(SB), DWBSWAP
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
avx2GcmSm4DecOctetsLoop: avx2GcmSm4DecOctetsLoop:
CMPQ ptxLen, $128 CMPQ ptxLen, $128

View File

@ -29,13 +29,14 @@
#define K1 V20 #define K1 V20
#define K2 V21 #define K2 V21
#define K3 V22 #define K3 V22
#define NIBBLE_MASK V23
#define INVERSE_SHIFT_ROWS V24 #define M1L V23
#define M1L V25 #define M1H V24
#define M1H V26 #define M2L V25
#define M2L V27 #define M2H V26
#define M2H V28 #define R08_MASK V27
#define R08_MASK V29 #define INVERSE_SHIFT_ROWS V28
#define NIBBLE_MASK V29
#define reduce() \ #define reduce() \
VEOR ACC0.B16, ACCM.B16, ACCM.B16 \ VEOR ACC0.B16, ACCM.B16, ACCM.B16 \

View File

@ -329,7 +329,7 @@ GLOBL gbGcmPoly<>(SB), (NOPTR+RODATA), $16
VPXOR (32*7)(SP), Y7, Y7 VPXOR (32*7)(SP), Y7, Y7
#define avx2LE2BE8Blocks \ #define avx2LE2BE8Blocks \
VBROADCASTI128 flip_mask<>(SB), Y11; \ VBROADCASTI128 ·flip_mask(SB), Y11; \
VPSHUFB Y11, Y0, Y0; \ VPSHUFB Y11, Y0, Y0; \
VPSHUFB Y11, Y1, Y1; \ VPSHUFB Y11, Y1, Y1; \
VPSHUFB Y11, Y2, Y2; \ VPSHUFB Y11, Y2, Y2; \
@ -589,8 +589,8 @@ avxXtsSm4EncDone:
avx2XtsSm4Enc: avx2XtsSm4Enc:
VMOVDQU gcmPoly<>(SB), POLY VMOVDQU gcmPoly<>(SB), POLY
VMOVDQU (0*16)(BX), TW VMOVDQU (0*16)(BX), TW
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
VBROADCASTI128 bswap_mask<>(SB), DWBSWAP VBROADCASTI128 ·bswap_mask(SB), DWBSWAP
avx2XtsSm4Enc16Blocks: avx2XtsSm4Enc16Blocks:
CMPQ DI, $256 CMPQ DI, $256
@ -735,7 +735,7 @@ TEXT ·encryptSm4XtsGB(SB),0,$256-64
JE avxXtsSm4Enc JE avxXtsSm4Enc
MOVOU gbGcmPoly<>(SB), POLY MOVOU gbGcmPoly<>(SB), POLY
MOVOU bswap_mask<>(SB), BSWAP MOVOU ·bswap_mask(SB), BSWAP
MOVOU (0*16)(BX), TW MOVOU (0*16)(BX), TW
xtsSm4EncOctets: xtsSm4EncOctets:
@ -834,7 +834,7 @@ xtsSm4EncDone:
avxXtsSm4Enc: avxXtsSm4Enc:
VMOVDQU gbGcmPoly<>(SB), POLY VMOVDQU gbGcmPoly<>(SB), POLY
VMOVDQU bswap_mask<>(SB), BSWAP VMOVDQU ·bswap_mask(SB), BSWAP
VMOVDQU (0*16)(BX), TW VMOVDQU (0*16)(BX), TW
avxXtsSm4EncOctets: avxXtsSm4EncOctets:
@ -934,8 +934,8 @@ avxXtsSm4EncDone:
avx2XtsSm4Enc: avx2XtsSm4Enc:
VMOVDQU gbGcmPoly<>(SB), POLY VMOVDQU gbGcmPoly<>(SB), POLY
VMOVDQU (0*16)(BX), TW VMOVDQU (0*16)(BX), TW
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
VBROADCASTI128 bswap_mask<>(SB), DWBSWAP VBROADCASTI128 ·bswap_mask(SB), DWBSWAP
avx2XtsSm4Enc16Blocks: avx2XtsSm4Enc16Blocks:
CMPQ DI, $256 CMPQ DI, $256
@ -1327,8 +1327,8 @@ avxXtsSm4DecDone:
avx2XtsSm4Dec: avx2XtsSm4Dec:
VMOVDQU gcmPoly<>(SB), POLY VMOVDQU gcmPoly<>(SB), POLY
VMOVDQU (0*16)(BX), TW VMOVDQU (0*16)(BX), TW
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
VBROADCASTI128 bswap_mask<>(SB), DWBSWAP VBROADCASTI128 ·bswap_mask(SB), DWBSWAP
avx2XtsSm4Dec16Blocks: avx2XtsSm4Dec16Blocks:
CMPQ DI, $256 CMPQ DI, $256
@ -1498,7 +1498,7 @@ TEXT ·decryptSm4XtsGB(SB),0,$256-64
JE avxXtsSm4Dec JE avxXtsSm4Dec
MOVOU gbGcmPoly<>(SB), POLY MOVOU gbGcmPoly<>(SB), POLY
MOVOU bswap_mask<>(SB), BSWAP MOVOU ·bswap_mask(SB), BSWAP
MOVOU (0*16)(BX), TW MOVOU (0*16)(BX), TW
xtsSm4DecOctets: xtsSm4DecOctets:
@ -1622,7 +1622,7 @@ xtsSm4DecDone:
avxXtsSm4Dec: avxXtsSm4Dec:
VMOVDQU gbGcmPoly<>(SB), POLY VMOVDQU gbGcmPoly<>(SB), POLY
VMOVDQU bswap_mask<>(SB), BSWAP VMOVDQU ·bswap_mask(SB), BSWAP
VMOVDQU (0*16)(BX), TW VMOVDQU (0*16)(BX), TW
avxXtsSm4DecOctets: avxXtsSm4DecOctets:
@ -1747,8 +1747,8 @@ avxXtsSm4DecDone:
avx2XtsSm4Dec: avx2XtsSm4Dec:
VMOVDQU gbGcmPoly<>(SB), POLY VMOVDQU gbGcmPoly<>(SB), POLY
VMOVDQU (0*16)(BX), TW VMOVDQU (0*16)(BX), TW
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
VBROADCASTI128 bswap_mask<>(SB), DWBSWAP VBROADCASTI128 ·bswap_mask(SB), DWBSWAP
avx2XtsSm4Dec16Blocks: avx2XtsSm4Dec16Blocks:
CMPQ DI, $256 CMPQ DI, $256

View File

@ -29,13 +29,13 @@
#define K2 V21 #define K2 V21
#define K3 V22 #define K3 V22
#define NIBBLE_MASK V23 #define M1L V23
#define INVERSE_SHIFT_ROWS V24 #define M1H V24
#define M1L V25 #define M2L V25
#define M1H V26 #define M2H V26
#define M2L V27 #define R08_MASK V27
#define M2H V28 #define INVERSE_SHIFT_ROWS V28
#define R08_MASK V29 #define NIBBLE_MASK V29
#include "aesni_macros_arm64.s" #include "aesni_macros_arm64.s"
#include "xts_macros_arm64.s" #include "xts_macros_arm64.s"

View File

@ -28,7 +28,6 @@ DATA rcon<>+0x90(SB)/8, $0x00ff00ff00ff00ff // S1
DATA rcon<>+0x98(SB)/8, $0x00ff00ff00ff00ff DATA rcon<>+0x98(SB)/8, $0x00ff00ff00ff00ff
GLOBL rcon<>(SB), RODATA, $160 GLOBL rcon<>(SB), RODATA, $160
#define M1L V20 #define M1L V20
#define M1H V21 #define M1H V21
#define M2L V22 #define M2L V22