sm4: use package level instead of local for shared variables

Sun Yimin 2024-11-11 17:40:41 +08:00 committed by GitHub
parent b721bed0cc
commit aa82b5836b
13 changed files with 247 additions and 266 deletions
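
The change is the same in all thirteen files: constant tables that were declared as file-local assembly symbols (the name<>(SB) form, visible only inside the .s file that defines them, so every file had to carry its own copy of the DATA/GLOBL blocks) are now declared once as package-level symbols and referenced from any assembly file in the package as ·name(SB). A minimal sketch of the pattern, using the flip_mask table from this diff; the textflag.h include and the X0 register in the last line are illustrative additions, not part of the commit:

#include "textflag.h"

// Before: file-local symbol, re-declared in each .s file that used it.
// (The flag value 8 is the numeric form of RODATA.)
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), 8, $16

// After: one package-level definition shared by the whole package.
DATA ·flip_mask+0x00(SB)/8, $0x0405060700010203
DATA ·flip_mask+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL ·flip_mask(SB), RODATA, $16

// Call sites only change the symbol prefix, for example:
PSHUFB ·flip_mask(SB), X0

A package-level data symbol defined only in assembly may additionally need either the NOPTR flag or a matching Go declaration to satisfy the linker's type-information check; which approach the package relies on is outside the hunks shown here. On arm64 the commit goes a step further and also merges the per-file tables into a single package-level ·rcon table that is loaded with one VLD1.P/VLD1 pair.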

@ -1,62 +1,3 @@
// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), 8, $16
// shuffle byte and word order
DATA bswap_mask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA bswap_mask<>+0x08(SB)/8, $0x0001020304050607
GLOBL bswap_mask<>(SB), 8, $16
//nibble mask
DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL nibble_mask<>(SB), 8, $16
// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
DATA inverse_shift_rows<>+0x10(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x18(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), 8, $32
// Affine transform 1 (low and high nibbles)
DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
DATA m1_low<>+0x10(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1_low<>+0x18(SB)/8, $0x3045F98CEF9A2653
GLOBL m1_low<>(SB), 8, $32
DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
DATA m1_high<>+0x10(SB)/8, $0xC35BF46CAF379800
DATA m1_high<>+0x18(SB)/8, $0x68F05FC7049C33AB
GLOBL m1_high<>(SB), 8, $32
// Affine transform 2 (low and high nibbles)
DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
DATA m2_low<>+0x10(SB)/8, $0x9A950A05FEF16E61
DATA m2_low<>+0x18(SB)/8, $0x0E019E916A65FAF5
GLOBL m2_low<>(SB), 8, $32
DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400
DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5
DATA m2_high<>+0x10(SB)/8, $0x892D69CD44E0A400
DATA m2_high<>+0x18(SB)/8, $0x2C88CC68E14501A5
GLOBL m2_high<>(SB), 8, $32
// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA r08_mask<>+0x10(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), 8, $32
DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), 8, $16
// Transpose matrix with PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions.
// input: from high to low
// r0 = [w3, w2, w1, w0]
@ -110,26 +51,26 @@ GLOBL fk_mask<>(SB), 8, $16
#define SM4_SBOX(x, y, z) \
; \ //############################# inner affine ############################//
MOVOU x, z; \
PAND nibble_mask<>(SB), z; \ //y = _mm_and_si128(x, c0f);
MOVOU m1_low<>(SB), y; \
PAND ·nibble_mask(SB), z; \ //y = _mm_and_si128(x, c0f);
MOVOU ·m1_low(SB), y; \
PSHUFB z, y; \ //y = _mm_shuffle_epi8(m1l, y);
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
MOVOU m1_high<>(SB), z; \
PAND ·nibble_mask(SB), x; \ //x = _mm_and_si128(x, c0f);
MOVOU ·m1_high(SB), z; \
PSHUFB x, z; \ //x = _mm_shuffle_epi8(m1h, x);
MOVOU z, x; \ //x = _mm_shuffle_epi8(m1h, x);
PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y;
; \ // inverse ShiftRows
PSHUFB inverse_shift_rows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr);
AESENCLAST nibble_mask<>(SB), x; \ // AESNI instruction
PSHUFB ·inverse_shift_rows(SB), x; \ //x = _mm_shuffle_epi8(x, shr);
AESENCLAST ·nibble_mask(SB), x; \ // AESNI instruction
; \ //############################# outer affine ############################//
MOVOU x, z; \
PANDN nibble_mask<>(SB), z; \ //z = _mm_andnot_si128(x, c0f);
MOVOU m2_low<>(SB), y; \
PANDN ·nibble_mask(SB), z; \ //z = _mm_andnot_si128(x, c0f);
MOVOU ·m2_low(SB), y; \
PSHUFB z, y; \ //y = _mm_shuffle_epi8(m2l, z)
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
MOVOU m2_high<>(SB), z; \
PAND ·nibble_mask(SB), x; \ //x = _mm_and_si128(x, c0f);
MOVOU ·m2_high(SB), z; \
PSHUFB x, z; \
MOVOU z, x; \ //x = _mm_shuffle_epi8(m2h, x)
PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y;
@ -143,12 +84,12 @@ GLOBL fk_mask<>(SB), 8, $16
SM4_SBOX(x, y, z); \
; \ //#################### 4 parallel L1 linear transforms ##################//
MOVOU x, y; \
PSHUFB r08_mask<>(SB), y; \ //y = x <<< 8
PSHUFB ·r08_mask(SB), y; \ //y = x <<< 8
MOVOU y, z; \
PSHUFB r08_mask<>(SB), z; \ //z = x <<< 16
PSHUFB ·r08_mask(SB), z; \ //z = x <<< 16
PXOR x, y; \ //y = x ^ (x <<< 8)
PXOR z, y; \ //y = x ^ (x <<< 8) ^ (x <<< 16)
PSHUFB r08_mask<>(SB), z; \ //z = x <<< 24
PSHUFB ·r08_mask(SB), z; \ //z = x <<< 24
PXOR z, x; \ //x = x ^ (x <<< 24)
MOVOU y, z; \
PSLLL $2, z; \
@ -214,7 +155,7 @@ GLOBL fk_mask<>(SB), 8, $16
// Requires: SSSE3
#define SM4_SINGLE_BLOCK(RK, rk128, x, y, z, t0, t1, t2, t3) \
PSHUFB flip_mask<>(SB), t0; \
PSHUFB ·flip_mask(SB), t0; \
PSHUFD $1, t0, t1; \
PSHUFD $2, t0, t2; \
PSHUFD $3, t0, t3; \
@ -238,13 +179,13 @@ GLOBL fk_mask<>(SB), 8, $16
PALIGNR $4, t3, t2; \
PALIGNR $4, t2, t1; \
PALIGNR $4, t1, t0; \
PSHUFB flip_mask<>(SB), t0
PSHUFB ·flip_mask(SB), t0
#define SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \
PSHUFB flip_mask<>(SB), t0; \
PSHUFB flip_mask<>(SB), t1; \
PSHUFB flip_mask<>(SB), t2; \
PSHUFB flip_mask<>(SB), t3; \
PSHUFB ·flip_mask(SB), t0; \
PSHUFB ·flip_mask(SB), t1; \
PSHUFB ·flip_mask(SB), t2; \
PSHUFB ·flip_mask(SB), t3; \
SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3)
#define SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3) \
@ -266,10 +207,10 @@ GLOBL fk_mask<>(SB), 8, $16
MOVOU (7*16)(RK), rk128; \
SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
PSHUFB bswap_mask<>(SB), t3; \
PSHUFB bswap_mask<>(SB), t2; \
PSHUFB bswap_mask<>(SB), t1; \
PSHUFB bswap_mask<>(SB), t0
PSHUFB ·bswap_mask(SB), t3; \
PSHUFB ·bswap_mask(SB), t2; \
PSHUFB ·bswap_mask(SB), t1; \
PSHUFB ·bswap_mask(SB), t0
#define SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
PSHUFD $0, rk128, x; \
@ -290,14 +231,14 @@ GLOBL fk_mask<>(SB), 8, $16
SM4_ONE_ROUND_SSE(x, y, z, t7, t4, t5, t6); \
#define SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
PSHUFB flip_mask<>(SB), t0; \
PSHUFB flip_mask<>(SB), t1; \
PSHUFB flip_mask<>(SB), t2; \
PSHUFB flip_mask<>(SB), t3; \
PSHUFB flip_mask<>(SB), t4; \
PSHUFB flip_mask<>(SB), t5; \
PSHUFB flip_mask<>(SB), t6; \
PSHUFB flip_mask<>(SB), t7; \
PSHUFB ·flip_mask(SB), t0; \
PSHUFB ·flip_mask(SB), t1; \
PSHUFB ·flip_mask(SB), t2; \
PSHUFB ·flip_mask(SB), t3; \
PSHUFB ·flip_mask(SB), t4; \
PSHUFB ·flip_mask(SB), t5; \
PSHUFB ·flip_mask(SB), t6; \
PSHUFB ·flip_mask(SB), t7; \
SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7)
#define SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
@ -321,14 +262,14 @@ GLOBL fk_mask<>(SB), 8, $16
SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \
PSHUFB bswap_mask<>(SB), t3; \
PSHUFB bswap_mask<>(SB), t2; \
PSHUFB bswap_mask<>(SB), t1; \
PSHUFB bswap_mask<>(SB), t0; \
PSHUFB bswap_mask<>(SB), t7; \
PSHUFB bswap_mask<>(SB), t6; \
PSHUFB bswap_mask<>(SB), t5; \
PSHUFB bswap_mask<>(SB), t4
PSHUFB ·bswap_mask(SB), t3; \
PSHUFB ·bswap_mask(SB), t2; \
PSHUFB ·bswap_mask(SB), t1; \
PSHUFB ·bswap_mask(SB), t0; \
PSHUFB ·bswap_mask(SB), t7; \
PSHUFB ·bswap_mask(SB), t6; \
PSHUFB ·bswap_mask(SB), t5; \
PSHUFB ·bswap_mask(SB), t4
// SM4 sbox function, AVX version
// parameters:
@ -336,22 +277,22 @@ GLOBL fk_mask<>(SB), 8, $16
// - y: 128 bits temp register
// - tmp: 128 bits temp register
#define AVX_SM4_SBOX(x, y, tmp) \
VPAND nibble_mask<>(SB), x, tmp; \
VMOVDQU m1_low<>(SB), y; \
VPAND ·nibble_mask(SB), x, tmp; \
VMOVDQU ·m1_low(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND nibble_mask<>(SB), x, x; \
VMOVDQU m1_high<>(SB), tmp; \
VPAND ·nibble_mask(SB), x, x; \
VMOVDQU ·m1_high(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x; \
VPSHUFB inverse_shift_rows<>(SB), x, x; \
VAESENCLAST nibble_mask<>(SB), x, x; \
VPANDN nibble_mask<>(SB), x, tmp; \
VMOVDQU m2_low<>(SB), y; \
VPSHUFB ·inverse_shift_rows(SB), x, x; \
VAESENCLAST ·nibble_mask(SB), x, x; \
VPANDN ·nibble_mask(SB), x, tmp; \
VMOVDQU ·m2_low(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND nibble_mask<>(SB), x, x; \
VMOVDQU m2_high<>(SB), tmp; \
VPAND ·nibble_mask(SB), x, x; \
VMOVDQU ·m2_high(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x
@ -362,11 +303,11 @@ GLOBL fk_mask<>(SB), 8, $16
// - tmp: 128 bits temp register
#define AVX_SM4_TAO_L1(x, y, tmp) \
AVX_SM4_SBOX(x, y, tmp); \
VPSHUFB r08_mask<>(SB), x, y; \ // y = x <<< 8
VPSHUFB r08_mask<>(SB), y, tmp; \ // tmp = x <<< 16
VPSHUFB ·r08_mask(SB), x, y; \ // y = x <<< 8
VPSHUFB ·r08_mask(SB), y, tmp; \ // tmp = x <<< 16
VPXOR x, y, y; \ // y = x ^ (x <<< 8)
VPXOR tmp, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16)
VPSHUFB r08_mask<>(SB), tmp, tmp; \ // tmp = x <<< 24
VPSHUFB ·r08_mask(SB), tmp, tmp; \ // tmp = x <<< 24
VPXOR x, tmp, x; \ // x = x ^ (x <<< 24)
VPSLLD $2, y, tmp; \
VPSRLD $30, y, y; \
@ -429,10 +370,10 @@ GLOBL fk_mask<>(SB), 8, $16
SM4_ONE_ROUND_AVX(x, y, z, t3, t0, t1, t2); \
#define AVX_SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \
VPSHUFB flip_mask<>(SB), t0, t0 \
VPSHUFB flip_mask<>(SB), t1, t1 \
VPSHUFB flip_mask<>(SB), t2, t2 \
VPSHUFB flip_mask<>(SB), t3, t3 \
VPSHUFB ·flip_mask(SB), t0, t0 \
VPSHUFB ·flip_mask(SB), t1, t1 \
VPSHUFB ·flip_mask(SB), t2, t2 \
VPSHUFB ·flip_mask(SB), t3, t3 \
; \
AVX_SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3)
@ -456,10 +397,10 @@ GLOBL fk_mask<>(SB), 8, $16
SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
; \ // Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
VPSHUFB bswap_mask<>(SB), t0, t0 \
VPSHUFB bswap_mask<>(SB), t1, t1 \
VPSHUFB bswap_mask<>(SB), t2, t2 \
VPSHUFB bswap_mask<>(SB), t3, t3 \
VPSHUFB ·bswap_mask(SB), t0, t0 \
VPSHUFB ·bswap_mask(SB), t1, t1 \
VPSHUFB ·bswap_mask(SB), t2, t2 \
VPSHUFB ·bswap_mask(SB), t3, t3 \
#define SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
VPSHUFD $0, rk128, x; \
@ -480,14 +421,14 @@ GLOBL fk_mask<>(SB), 8, $16
SM4_ONE_ROUND_AVX(x, y, z, t7, t4, t5, t6); \
#define AVX_SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
VPSHUFB flip_mask<>(SB), t0, t0 \
VPSHUFB flip_mask<>(SB), t1, t1 \
VPSHUFB flip_mask<>(SB), t2, t2 \
VPSHUFB flip_mask<>(SB), t3, t3 \
VPSHUFB flip_mask<>(SB), t4, t4 \
VPSHUFB flip_mask<>(SB), t5, t5 \
VPSHUFB flip_mask<>(SB), t6, t6 \
VPSHUFB flip_mask<>(SB), t7, t7 \
VPSHUFB ·flip_mask(SB), t0, t0 \
VPSHUFB ·flip_mask(SB), t1, t1 \
VPSHUFB ·flip_mask(SB), t2, t2 \
VPSHUFB ·flip_mask(SB), t3, t3 \
VPSHUFB ·flip_mask(SB), t4, t4 \
VPSHUFB ·flip_mask(SB), t5, t5 \
VPSHUFB ·flip_mask(SB), t6, t6 \
VPSHUFB ·flip_mask(SB), t7, t7 \
; \
AVX_SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7)
@ -513,14 +454,14 @@ GLOBL fk_mask<>(SB), 8, $16
; \ // Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \
VPSHUFB bswap_mask<>(SB), t0, t0 \
VPSHUFB bswap_mask<>(SB), t1, t1 \
VPSHUFB bswap_mask<>(SB), t2, t2 \
VPSHUFB bswap_mask<>(SB), t3, t3 \
VPSHUFB bswap_mask<>(SB), t4, t4 \
VPSHUFB bswap_mask<>(SB), t5, t5 \
VPSHUFB bswap_mask<>(SB), t6, t6 \
VPSHUFB bswap_mask<>(SB), t7, t7 \
VPSHUFB ·bswap_mask(SB), t0, t0 \
VPSHUFB ·bswap_mask(SB), t1, t1 \
VPSHUFB ·bswap_mask(SB), t2, t2 \
VPSHUFB ·bswap_mask(SB), t3, t3 \
VPSHUFB ·bswap_mask(SB), t4, t4 \
VPSHUFB ·bswap_mask(SB), t5, t5 \
VPSHUFB ·bswap_mask(SB), t6, t6 \
VPSHUFB ·bswap_mask(SB), t7, t7 \
// SM4 sbox function, AVX2 version
// parameters:
@ -533,24 +474,24 @@ GLOBL fk_mask<>(SB), 8, $16
// - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier.
#define AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
VPAND yNibbleMask, x, z; \
VMOVDQU m1_low<>(SB), y; \
VMOVDQU ·m1_low(SB), y; \
VPSHUFB z, y, y; \
VPSRLQ $4, x, x; \
VPAND yNibbleMask, x, x; \
VMOVDQU m1_high<>(SB), z; \
VMOVDQU ·m1_high(SB), z; \
VPSHUFB x, z, x; \
VPXOR y, x, x; \
VPSHUFB inverse_shift_rows<>(SB), x, x; \
VPSHUFB ·inverse_shift_rows(SB), x, x; \
VEXTRACTI128 $1, x, yw \
VAESENCLAST xNibbleMask, xw, xw; \
VAESENCLAST xNibbleMask, yw, yw; \
VINSERTI128 $1, yw, x, x; \
VPANDN yNibbleMask, x, z; \
VMOVDQU m2_low<>(SB), y; \
VMOVDQU ·m2_low(SB), y; \
VPSHUFB z, y, y; \
VPSRLQ $4, x, x; \
VPAND yNibbleMask, x, x; \
VMOVDQU m2_high<>(SB), z; \
VMOVDQU ·m2_high(SB), z; \
VPSHUFB x, z, x; \
VPXOR y, x, x
@ -565,11 +506,11 @@ GLOBL fk_mask<>(SB), 8, $16
// - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier.
#define AVX2_SM4_TAO_L1(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask); \
VPSHUFB r08_mask<>(SB), x, y; \ // y = x <<< 8
VPSHUFB r08_mask<>(SB), y, z; \ // z = x <<< 16
VPSHUFB ·r08_mask(SB), x, y; \ // y = x <<< 8
VPSHUFB ·r08_mask(SB), y, z; \ // z = x <<< 16
VPXOR x, y, y; \ // y = x ^ (x <<< 8)
VPXOR z, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16)
VPSHUFB r08_mask<>(SB), z, z; \ // z = x <<< 24
VPSHUFB ·r08_mask(SB), z, z; \ // z = x <<< 24
VPXOR x, z, x; \ // x = x ^ (x <<< 24)
VPSLLD $2, y, z; \
VPSRLD $30, y, y; \

@ -1,37 +1,9 @@
// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), (16+8), $16
// Affine transform 1 & 2 (low and high nibbles)
DATA m1_2<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1_2<>+0x08(SB)/8, $0x3045F98CEF9A2653
DATA m1_2<>+0x10(SB)/8, $0xC35BF46CAF379800
DATA m1_2<>+0x18(SB)/8, $0x68F05FC7049C33AB
DATA m1_2<>+0x20(SB)/8, $0x9A950A05FEF16E61
DATA m1_2<>+0x28(SB)/8, $0x0E019E916A65FAF5
DATA m1_2<>+0x30(SB)/8, $0x892D69CD44E0A400
DATA m1_2<>+0x38(SB)/8, $0x2C88CC68E14501A5
GLOBL m1_2<>(SB), (16+8), $64
// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), (16+8), $16
DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), (16+8), $16
#define LOAD_SM4_AESNI_CONSTS() \
MOVW $0x0F0F0F0F, R20 \
VDUP R20, NIBBLE_MASK.S4 \
MOVD $m1_2<>(SB), R20 \
VLD1 (R20), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \
MOVD $inverse_shift_rows<>(SB), R20 \
VLD1 (R20), [INVERSE_SHIFT_ROWS.B16] \
MOVD $r08_mask<>(SB), R20 \
VLD1 (R20), [R08_MASK.B16] \
MOVW $0x0F0F0F0F, R20 \
VDUP R20, NIBBLE_MASK.S4 \
MOVD $·rcon(SB), R20 \
VLD1.P 64(R20), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \
VLD1 (R20), [R08_MASK.B16, INVERSE_SHIFT_ROWS.B16]
// input: from high to low
// t0 = t0.S3, t0.S2, t0.S1, t0.S0

@ -13,6 +13,61 @@
#define XTMP6 X10
#define XTMP7 X11
// shuffle byte order from LE to BE
DATA ·flip_mask+0x00(SB)/8, $0x0405060700010203
DATA ·flip_mask+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL ·flip_mask(SB), RODATA, $16
// shuffle byte and word order
DATA ·bswap_mask+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA ·bswap_mask+0x08(SB)/8, $0x0001020304050607
GLOBL ·bswap_mask(SB), RODATA, $16
//nibble mask
DATA ·nibble_mask+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA ·nibble_mask+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL ·nibble_mask(SB), RODATA, $16
// inverse shift rows
DATA ·inverse_shift_rows+0x00(SB)/8, $0x0B0E0104070A0D00
DATA ·inverse_shift_rows+0x08(SB)/8, $0x0306090C0F020508
DATA ·inverse_shift_rows+0x10(SB)/8, $0x0B0E0104070A0D00
DATA ·inverse_shift_rows+0x18(SB)/8, $0x0306090C0F020508
GLOBL ·inverse_shift_rows(SB), RODATA, $32
// Affine transform 1 (low and high nibbles)
DATA ·m1_low+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA ·m1_low+0x08(SB)/8, $0x3045F98CEF9A2653
DATA ·m1_low+0x10(SB)/8, $0x0A7FC3B6D5A01C69
DATA ·m1_low+0x18(SB)/8, $0x3045F98CEF9A2653
GLOBL ·m1_low(SB), RODATA, $32
DATA ·m1_high+0x00(SB)/8, $0xC35BF46CAF379800
DATA ·m1_high+0x08(SB)/8, $0x68F05FC7049C33AB
DATA ·m1_high+0x10(SB)/8, $0xC35BF46CAF379800
DATA ·m1_high+0x18(SB)/8, $0x68F05FC7049C33AB
GLOBL ·m1_high(SB), RODATA, $32
// Affine transform 2 (low and high nibbles)
DATA ·m2_low+0x00(SB)/8, $0x9A950A05FEF16E61
DATA ·m2_low+0x08(SB)/8, $0x0E019E916A65FAF5
DATA ·m2_low+0x10(SB)/8, $0x9A950A05FEF16E61
DATA ·m2_low+0x18(SB)/8, $0x0E019E916A65FAF5
GLOBL ·m2_low(SB), RODATA, $32
DATA ·m2_high+0x00(SB)/8, $0x892D69CD44E0A400
DATA ·m2_high+0x08(SB)/8, $0x2C88CC68E14501A5
DATA ·m2_high+0x10(SB)/8, $0x892D69CD44E0A400
DATA ·m2_high+0x18(SB)/8, $0x2C88CC68E14501A5
GLOBL ·m2_high(SB), RODATA, $32
// left rotations of 32-bit words by 8-bit increments
DATA ·r08_mask+0x00(SB)/8, $0x0605040702010003
DATA ·r08_mask+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA ·r08_mask+0x10(SB)/8, $0x0605040702010003
DATA ·r08_mask+0x18(SB)/8, $0x0E0D0C0F0A09080B
GLOBL ·r08_mask(SB), RODATA, $32
#include "aesni_macros_amd64.s"
// SM4 TAO L2 function, used for key expand
@ -105,8 +160,8 @@ TEXT ·expandKeyAsm(SB),NOSPLIT,$0
MOVQ dec+24(FP), DI
MOVUPS 0(AX), t0
PSHUFB flip_mask<>(SB), t0
PXOR fk_mask<>(SB), t0
PSHUFB ·flip_mask(SB), t0
PXOR ·fk(SB), t0
PSHUFD $1, t0, t1
PSHUFD $2, t0, t2
PSHUFD $3, t0, t3
@ -225,7 +280,7 @@ avx_done_sm4:
RET
avx2:
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
CMPQ DI, $256
JEQ avx2_16blocks
@ -235,7 +290,7 @@ avx2_8blocks:
VMOVDQU 32(DX), XDWORD1
VMOVDQU 64(DX), XDWORD2
VMOVDQU 96(DX), XDWORD3
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
VBROADCASTI128 ·flip_mask(SB), BYTE_FLIP_MASK
// Apply Byte Flip Mask: LE -> BE
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
@ -251,7 +306,7 @@ avx2_8blocks:
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
VBROADCASTI128 ·bswap_mask(SB), BYTE_FLIP_MASK
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
@ -275,7 +330,7 @@ avx2_16blocks:
VMOVDQU 192(DX), XDWORD6
VMOVDQU 224(DX), XDWORD7
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
VBROADCASTI128 ·flip_mask(SB), BYTE_FLIP_MASK
// Apply Byte Flip Mask: LE -> BE
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
@ -297,7 +352,7 @@ avx2_16blocks:
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)
VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
VBROADCASTI128 ·bswap_mask(SB), BYTE_FLIP_MASK
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
@ -328,7 +383,7 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
MOVQ src+16(FP), DX
MOVUPS (DX), t0
PSHUFB flip_mask<>(SB), t0
PSHUFB ·flip_mask(SB), t0
PSHUFD $1, t0, t1
PSHUFD $2, t0, t2
PSHUFD $3, t0, t3
@ -353,7 +408,7 @@ loop:
PUNPCKLLQ t2, t3
PUNPCKLLQ t0, t1
PUNPCKLQDQ t1, t3
PSHUFB flip_mask<>(SB), t3
PSHUFB ·flip_mask(SB), t3
MOVUPS t3, (BX)
done_sm4:

@ -20,10 +20,26 @@
#define M2H V23
#define R08_MASK V24
#define INVERSE_SHIFT_ROWS V25
#define NIBBLE_MASK V26
#define FK_MASK V27
#define FK_MASK V26
#define NIBBLE_MASK V27
#define ZERO V28
DATA ·rcon+0x00(SB)/8, $0x0A7FC3B6D5A01C69 // m1l
DATA ·rcon+0x08(SB)/8, $0x3045F98CEF9A2653
DATA ·rcon+0x10(SB)/8, $0xC35BF46CAF379800 // m1h
DATA ·rcon+0x18(SB)/8, $0x68F05FC7049C33AB
DATA ·rcon+0x20(SB)/8, $0x9A950A05FEF16E61 // m2l
DATA ·rcon+0x28(SB)/8, $0x0E019E916A65FAF5
DATA ·rcon+0x30(SB)/8, $0x892D69CD44E0A400 // m2h
DATA ·rcon+0x38(SB)/8, $0x2C88CC68E14501A5
DATA ·rcon+0x40(SB)/8, $0x0605040702010003 // left rotations of 32-bit words by 8-bit increments
DATA ·rcon+0x48(SB)/8, $0x0E0D0C0F0A09080B
DATA ·rcon+0x50(SB)/8, $0x0B0E0104070A0D00 // inverse shift rows
DATA ·rcon+0x58(SB)/8, $0x0306090C0F020508
DATA ·rcon+0x60(SB)/8, $0x56aa3350a3b1bac6 // fk
DATA ·rcon+0x68(SB)/8, $0xb27022dc677d9197
GLOBL ·rcon(SB), RODATA, $112
#include "aesni_macros_arm64.s"
#define SM4_TAO_L2(x, y) \
@ -49,14 +65,11 @@
MOVW.P R2, -4(R11)
#define LOAD_SM4KEY_AESNI_CONSTS() \
MOVW $0x0F0F0F0F, R0 \
VDUP R0, NIBBLE_MASK.S4 \
MOVD $m1_2<>(SB), R0 \
VLD1 (R0), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \
MOVD $fk_mask<>(SB), R0 \
VLD1 (R0), [FK_MASK.B16] \
MOVD $inverse_shift_rows<>(SB), R0 \
VLD1 (R0), [INVERSE_SHIFT_ROWS.B16]
MOVW $0x0F0F0F0F, R0 \
VDUP R0, NIBBLE_MASK.S4 \
MOVD $·rcon(SB), R0 \
VLD1.P 64(R0), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \
VLD1 (R0), [R08_MASK.B16, INVERSE_SHIFT_ROWS.B16, FK.B16]
#define SM4EKEY_EXPORT_KEYS() \
VREV64 V8.S4, V11.S4 \

@ -360,9 +360,9 @@ avxCbcSm4Done:
RET
avx2Start:
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK
VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
VBROADCASTI128 ·flip_mask(SB), BYTE_FLIP_MASK
VBROADCASTI128 ·bswap_mask(SB), BSWAP_MASK
VMOVDQU -16(DX), X15

@ -16,15 +16,15 @@
#define t7 V13
#define IV V18
#define LAST_BLOCK V15
#define ZERO V16
#define NIBBLE_MASK V20
#define INVERSE_SHIFT_ROWS V21
#define M1L V22
#define M1H V23
#define M2L V24
#define M2H V25
#define R08_MASK V26
#define FK_MASK V27
#define M1L V20
#define M1H V21
#define M2L V22
#define M2H V23
#define R08_MASK V24
#define INVERSE_SHIFT_ROWS V25
#define NIBBLE_MASK V26
#include "aesni_macros_arm64.s"
@ -49,7 +49,7 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
ADD srcPtr, srcPtrLen, R10
SUB $16, R10, R10
VLD1 (R10), [V15.S4]
VLD1 (R10), [LAST_BLOCK.S4]
cbcSm4Octets:
CMP $128, srcPtrLen
@ -293,5 +293,5 @@ cbc4BlocksLoop48:
VST1 [t0.S4, t1.S4, t2.S4], (dstPtr)
cbcSm4Done:
VST1 [V15.S4], (R6)
VST1 [LAST_BLOCK.S4], (R6)
RET

@ -219,9 +219,9 @@ avxEcbSm4Done:
RET
avx2_start:
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK
VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
VBROADCASTI128 ·flip_mask(SB), BYTE_FLIP_MASK
VBROADCASTI128 ·bswap_mask(SB), BSWAP_MASK
avx2_16blocks:
CMPQ DI, $256

@ -8,15 +8,6 @@
#define t1 V3
#define t2 V4
#define t3 V5
#define ZERO V16
#define NIBBLE_MASK V20
#define INVERSE_SHIFT_ROWS V21
#define M1L V22
#define M1H V23
#define M2L V24
#define M2H V25
#define R08_MASK V26
#define FK_MASK V27
#define XTMP6 V6
#define XTMP7 V7
#define t4 V10
@ -24,6 +15,15 @@
#define t6 V12
#define t7 V13
#define ZERO V16
#define M1L V20
#define M1H V21
#define M2L V22
#define M2H V23
#define R08_MASK V24
#define INVERSE_SHIFT_ROWS V25
#define NIBBLE_MASK V26
#include "aesni_macros_arm64.s"
// func encryptSm4Ecb(xk *uint32, dst, src []byte)

@ -95,7 +95,7 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
MOVOU (tPtr), ACC0
MOVOU (tMsk), T2
MOVOU bswap_mask<>(SB), BSWAP
MOVOU ·bswap_mask(SB), BSWAP
MOVOU gcmPoly<>(SB), POLY
SHLQ $3, plen
@ -279,7 +279,7 @@ TEXT ·gcmSm4Data(SB),NOSPLIT,$0
PXOR ACC0, ACC0
// MOVOU (tPtr), ACC0 // originally we passed in tag initial value
MOVOU bswap_mask<>(SB), BSWAP
MOVOU ·bswap_mask(SB), BSWAP
MOVOU gcmPoly<>(SB), POLY
TESTQ autLen, autLen
@ -527,14 +527,14 @@ TEXT ·gcmSm4Enc(SB),0,$256-96
CMPB ·useAVX(SB), $1
JE avxGcmSm4Enc
MOVOU bswap_mask<>(SB), BSWAP
MOVOU ·bswap_mask(SB), BSWAP
MOVOU gcmPoly<>(SB), POLY
MOVOU (tPtr), ACC0
PXOR ACC1, ACC1
PXOR ACCM, ACCM
MOVOU (ctrPtr), T0
PSHUFB flip_mask<>(SB), T0
PSHUFB ·flip_mask(SB), T0
PEXTRD $3, T0, aluCTR
MOVOU T0, (8*16 + 0*16)(SP)
@ -870,14 +870,14 @@ gcmSm4EncDone:
RET
avxGcmSm4Enc:
VMOVDQU bswap_mask<>(SB), BSWAP
VMOVDQU ·bswap_mask(SB), BSWAP
VMOVDQU gcmPoly<>(SB), POLY
VMOVDQU (tPtr), ACC0
VPXOR ACC1, ACC1, ACC1
VPXOR ACCM, ACCM, ACCM
VMOVDQU (ctrPtr), T0
VPSHUFB flip_mask<>(SB), T0, T0
VPSHUFB ·flip_mask(SB), T0, T0
VPEXTRD $3, T0, aluCTR
VMOVDQU T0, (8*16 + 0*16)(SP)
@ -1198,14 +1198,14 @@ avxGcmSm4EncDone:
RET
avx2GcmSm4Enc:
VMOVDQU bswap_mask<>(SB), BSWAP
VMOVDQU ·bswap_mask(SB), BSWAP
VMOVDQU gcmPoly<>(SB), POLY
VMOVDQU (tPtr), ACC0
VPXOR ACC1, ACC1, ACC1
VPXOR ACCM, ACCM, ACCM
VMOVDQU (ctrPtr), T0
VPSHUFB flip_mask<>(SB), T0, T0
VPSHUFB ·flip_mask(SB), T0, T0
VPEXTRD $3, T0, aluCTR
VINSERTI128 $1, T0, Y11, Y11
@ -1228,7 +1228,7 @@ avx2GcmSm4Enc:
increment(6)
increment(7)
VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
VBROADCASTI128 ·bswap_mask(SB), DWBSWAP
// load 8 ctrs for encryption
VMOVDQU (4*32 + 0*32)(SP), DWB0
VMOVDQU (4*32 + 1*32)(SP), DWB1
@ -1239,7 +1239,7 @@ avx2GcmSm4Enc:
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
increment(1)
AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP0, DWB0, DWB1, DWB2, DWB3)
increment(2)
@ -1613,14 +1613,14 @@ TEXT ·gcmSm4Dec(SB),0,$128-96
CMPB ·useAVX(SB), $1
JE avxGcmSm4Dec
MOVOU bswap_mask<>(SB), BSWAP
MOVOU ·bswap_mask(SB), BSWAP
MOVOU gcmPoly<>(SB), POLY
MOVOU (tPtr), ACC0
PXOR ACC1, ACC1
PXOR ACCM, ACCM
MOVOU (ctrPtr), T0
PSHUFB flip_mask<>(SB), T0
PSHUFB ·flip_mask(SB), T0
PEXTRD $3, T0, aluCTR
MOVOU T0, (0*16)(SP)
@ -1841,14 +1841,14 @@ gcmSm4DecDone:
RET
avxGcmSm4Dec:
VMOVDQU bswap_mask<>(SB), BSWAP
VMOVDQU ·bswap_mask(SB), BSWAP
VMOVDQU gcmPoly<>(SB), POLY
VMOVDQU (tPtr), ACC0
VPXOR ACC1, ACC1, ACC1
VPXOR ACCM, ACCM, ACCM
VMOVDQU (ctrPtr), T0
VPSHUFB flip_mask<>(SB), T0, T0
VPSHUFB ·flip_mask(SB), T0, T0
VPEXTRD $3, T0, aluCTR
VMOVDQU T0, (0*16)(SP)
@ -2065,14 +2065,14 @@ avxGcmSm4DecDone:
RET
avx2GcmSm4Dec:
VMOVDQU bswap_mask<>(SB), BSWAP
VMOVDQU ·bswap_mask(SB), BSWAP
VMOVDQU gcmPoly<>(SB), POLY
VMOVDQU (tPtr), ACC0
VPXOR ACC1, ACC1, ACC1
VPXOR ACCM, ACCM, ACCM
VMOVDQU (ctrPtr), T0
VPSHUFB flip_mask<>(SB), T0, T0
VPSHUFB ·flip_mask(SB), T0, T0
VPEXTRD $3, T0, aluCTR
VINSERTI128 $1, T0, Y11, Y11
@ -2094,8 +2094,8 @@ avx2GcmSm4Dec:
increment(6)
increment(7)
VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
VBROADCASTI128 ·bswap_mask(SB), DWBSWAP
VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
avx2GcmSm4DecOctetsLoop:
CMPQ ptxLen, $128

@ -29,13 +29,14 @@
#define K1 V20
#define K2 V21
#define K3 V22
#define NIBBLE_MASK V23
#define INVERSE_SHIFT_ROWS V24
#define M1L V25
#define M1H V26
#define M2L V27
#define M2H V28
#define R08_MASK V29
#define M1L V23
#define M1H V24
#define M2L V25
#define M2H V26
#define R08_MASK V27
#define INVERSE_SHIFT_ROWS V28
#define NIBBLE_MASK V29
#define reduce() \
VEOR ACC0.B16, ACCM.B16, ACCM.B16 \

@ -329,7 +329,7 @@ GLOBL gbGcmPoly<>(SB), (NOPTR+RODATA), $16
VPXOR (32*7)(SP), Y7, Y7
#define avx2LE2BE8Blocks \
VBROADCASTI128 flip_mask<>(SB), Y11; \
VBROADCASTI128 ·flip_mask(SB), Y11; \
VPSHUFB Y11, Y0, Y0; \
VPSHUFB Y11, Y1, Y1; \
VPSHUFB Y11, Y2, Y2; \
@ -589,8 +589,8 @@ avxXtsSm4EncDone:
avx2XtsSm4Enc:
VMOVDQU gcmPoly<>(SB), POLY
VMOVDQU (0*16)(BX), TW
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
VBROADCASTI128 ·bswap_mask(SB), DWBSWAP
avx2XtsSm4Enc16Blocks:
CMPQ DI, $256
@ -735,7 +735,7 @@ TEXT ·encryptSm4XtsGB(SB),0,$256-64
JE avxXtsSm4Enc
MOVOU gbGcmPoly<>(SB), POLY
MOVOU bswap_mask<>(SB), BSWAP
MOVOU ·bswap_mask(SB), BSWAP
MOVOU (0*16)(BX), TW
xtsSm4EncOctets:
@ -834,7 +834,7 @@ xtsSm4EncDone:
avxXtsSm4Enc:
VMOVDQU gbGcmPoly<>(SB), POLY
VMOVDQU bswap_mask<>(SB), BSWAP
VMOVDQU ·bswap_mask(SB), BSWAP
VMOVDQU (0*16)(BX), TW
avxXtsSm4EncOctets:
@ -934,8 +934,8 @@ avxXtsSm4EncDone:
avx2XtsSm4Enc:
VMOVDQU gbGcmPoly<>(SB), POLY
VMOVDQU (0*16)(BX), TW
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
VBROADCASTI128 ·bswap_mask(SB), DWBSWAP
avx2XtsSm4Enc16Blocks:
CMPQ DI, $256
@ -1327,8 +1327,8 @@ avxXtsSm4DecDone:
avx2XtsSm4Dec:
VMOVDQU gcmPoly<>(SB), POLY
VMOVDQU (0*16)(BX), TW
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
VBROADCASTI128 ·bswap_mask(SB), DWBSWAP
avx2XtsSm4Dec16Blocks:
CMPQ DI, $256
@ -1498,7 +1498,7 @@ TEXT ·decryptSm4XtsGB(SB),0,$256-64
JE avxXtsSm4Dec
MOVOU gbGcmPoly<>(SB), POLY
MOVOU bswap_mask<>(SB), BSWAP
MOVOU ·bswap_mask(SB), BSWAP
MOVOU (0*16)(BX), TW
xtsSm4DecOctets:
@ -1622,7 +1622,7 @@ xtsSm4DecDone:
avxXtsSm4Dec:
VMOVDQU gbGcmPoly<>(SB), POLY
VMOVDQU bswap_mask<>(SB), BSWAP
VMOVDQU ·bswap_mask(SB), BSWAP
VMOVDQU (0*16)(BX), TW
avxXtsSm4DecOctets:
@ -1747,8 +1747,8 @@ avxXtsSm4DecDone:
avx2XtsSm4Dec:
VMOVDQU gbGcmPoly<>(SB), POLY
VMOVDQU (0*16)(BX), TW
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
VBROADCASTI128 ·bswap_mask(SB), DWBSWAP
avx2XtsSm4Dec16Blocks:
CMPQ DI, $256

@ -29,13 +29,13 @@
#define K2 V21
#define K3 V22
#define NIBBLE_MASK V23
#define INVERSE_SHIFT_ROWS V24
#define M1L V25
#define M1H V26
#define M2L V27
#define M2H V28
#define R08_MASK V29
#define M1L V23
#define M1H V24
#define M2L V25
#define M2H V26
#define R08_MASK V27
#define INVERSE_SHIFT_ROWS V28
#define NIBBLE_MASK V29
#include "aesni_macros_arm64.s"
#include "xts_macros_arm64.s"

@ -28,7 +28,6 @@ DATA rcon<>+0x90(SB)/8, $0x00ff00ff00ff00ff // S1
DATA rcon<>+0x98(SB)/8, $0x00ff00ff00ff00ff
GLOBL rcon<>(SB), RODATA, $160
#define M1L V20
#define M1H V21
#define M2L V22