Mirror of https://github.com/emmansun/gmsm.git, synced 2025-04-26 04:06:18 +08:00

sm4: use package level instead of local for shared variables

parent: b721bed0cc
commit: aa82b5836b
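The change is mechanical but wide-ranging: each shared constant table moves from a file-local assembly symbol (the `<>` suffix limits a symbol to the .s file that declares it, so every file needed its own copy) to a package-level symbol (the leading middle dot `·` makes one definition visible to all assembly files in the package). A minimal before/after sketch using the flip_mask table from this diff; RODATA is the symbolic name from textflag.h for the numeric GLOBL flag 8 the old code used:

    // Before: file-local; every .s file that needs the mask repeats these lines.
    DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
    DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
    GLOBL flip_mask<>(SB), 8, $16

    // After: package-level; defined once, referenced anywhere as ·flip_mask(SB).
    DATA ·flip_mask+0x00(SB)/8, $0x0405060700010203
    DATA ·flip_mask+0x08(SB)/8, $0x0c0d0e0f08090a0b
    GLOBL ·flip_mask(SB), RODATA, $16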
@@ -1,62 +1,3 @@
-// shuffle byte order from LE to BE
-DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
-DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
-GLOBL flip_mask<>(SB), 8, $16
-
-// shuffle byte and word order
-DATA bswap_mask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
-DATA bswap_mask<>+0x08(SB)/8, $0x0001020304050607
-GLOBL bswap_mask<>(SB), 8, $16
-
-//nibble mask
-DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
-DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
-GLOBL nibble_mask<>(SB), 8, $16
-
-// inverse shift rows
-DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
-DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
-DATA inverse_shift_rows<>+0x10(SB)/8, $0x0B0E0104070A0D00
-DATA inverse_shift_rows<>+0x18(SB)/8, $0x0306090C0F020508
-GLOBL inverse_shift_rows<>(SB), 8, $32
-
-// Affine transform 1 (low and high nibbles)
-DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
-DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
-DATA m1_low<>+0x10(SB)/8, $0x0A7FC3B6D5A01C69
-DATA m1_low<>+0x18(SB)/8, $0x3045F98CEF9A2653
-GLOBL m1_low<>(SB), 8, $32
-
-DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
-DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
-DATA m1_high<>+0x10(SB)/8, $0xC35BF46CAF379800
-DATA m1_high<>+0x18(SB)/8, $0x68F05FC7049C33AB
-GLOBL m1_high<>(SB), 8, $32
-
-// Affine transform 2 (low and high nibbles)
-DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
-DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
-DATA m2_low<>+0x10(SB)/8, $0x9A950A05FEF16E61
-DATA m2_low<>+0x18(SB)/8, $0x0E019E916A65FAF5
-GLOBL m2_low<>(SB), 8, $32
-
-DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400
-DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5
-DATA m2_high<>+0x10(SB)/8, $0x892D69CD44E0A400
-DATA m2_high<>+0x18(SB)/8, $0x2C88CC68E14501A5
-GLOBL m2_high<>(SB), 8, $32
-
-// left rotations of 32-bit words by 8-bit increments
-DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
-DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
-DATA r08_mask<>+0x10(SB)/8, $0x0605040702010003
-DATA r08_mask<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
-GLOBL r08_mask<>(SB), 8, $32
-
-DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
-DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
-GLOBL fk_mask<>(SB), 8, $16
-
 // Transpose matrix with PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions.
 // input: from high to low
 // r0 = [w3, w2, w1, w0]
@@ -110,26 +51,26 @@ GLOBL fk_mask<>(SB), 8, $16
 #define SM4_SBOX(x, y, z) \
 ; \ //############################# inner affine ############################//
 MOVOU x, z; \
-PAND nibble_mask<>(SB), z; \ //y = _mm_and_si128(x, c0f);
-MOVOU m1_low<>(SB), y; \
+PAND ·nibble_mask(SB), z; \ //y = _mm_and_si128(x, c0f);
+MOVOU ·m1_low(SB), y; \
 PSHUFB z, y; \ //y = _mm_shuffle_epi8(m1l, y);
 PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
-PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
-MOVOU m1_high<>(SB), z; \
+PAND ·nibble_mask(SB), x; \ //x = _mm_and_si128(x, c0f);
+MOVOU ·m1_high(SB), z; \
 PSHUFB x, z; \ //x = _mm_shuffle_epi8(m1h, x);
 MOVOU z, x; \ //x = _mm_shuffle_epi8(m1h, x);
 PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y;
 ; \ // inverse ShiftRows
-PSHUFB inverse_shift_rows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr);
-AESENCLAST nibble_mask<>(SB), x; \ // AESNI instruction
+PSHUFB ·inverse_shift_rows(SB), x; \ //x = _mm_shuffle_epi8(x, shr);
+AESENCLAST ·nibble_mask(SB), x; \ // AESNI instruction
 ; \ //############################# outer affine ############################//
 MOVOU x, z; \
-PANDN nibble_mask<>(SB), z; \ //z = _mm_andnot_si128(x, c0f);
-MOVOU m2_low<>(SB), y; \
+PANDN ·nibble_mask(SB), z; \ //z = _mm_andnot_si128(x, c0f);
+MOVOU ·m2_low(SB), y; \
 PSHUFB z, y; \ //y = _mm_shuffle_epi8(m2l, z)
 PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
-PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
-MOVOU m2_high<>(SB), z; \
+PAND ·nibble_mask(SB), x; \ //x = _mm_and_si128(x, c0f);
+MOVOU ·m2_high(SB), z; \
 PSHUFB x, z; \
 MOVOU z, x; \ //x = _mm_shuffle_epi8(m2h, x)
 PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y;
@@ -143,12 +84,12 @@ GLOBL fk_mask<>(SB), 8, $16
 SM4_SBOX(x, y, z); \
 ; \ //#################### 4 parallel L1 linear transforms ##################//
 MOVOU x, y; \
-PSHUFB r08_mask<>(SB), y; \ //y = x <<< 8
+PSHUFB ·r08_mask(SB), y; \ //y = x <<< 8
 MOVOU y, z; \
-PSHUFB r08_mask<>(SB), z; \ //z = x <<< 16
+PSHUFB ·r08_mask(SB), z; \ //z = x <<< 16
 PXOR x, y; \ //y = x ^ (x <<< 8)
 PXOR z, y; \ //y = x ^ (x <<< 8) ^ (x <<< 16)
-PSHUFB r08_mask<>(SB), z; \ //z = x <<< 24
+PSHUFB ·r08_mask(SB), z; \ //z = x <<< 24
 PXOR z, x; \ //x = x ^ (x <<< 24)
 MOVOU y, z; \
 PSLLL $2, z; \
@@ -214,7 +155,7 @@ GLOBL fk_mask<>(SB), 8, $16
 
 // Requires: SSSE3
 #define SM4_SINGLE_BLOCK(RK, rk128, x, y, z, t0, t1, t2, t3) \
-PSHUFB flip_mask<>(SB), t0; \
+PSHUFB ·flip_mask(SB), t0; \
 PSHUFD $1, t0, t1; \
 PSHUFD $2, t0, t2; \
 PSHUFD $3, t0, t3; \
@@ -238,13 +179,13 @@ GLOBL fk_mask<>(SB), 8, $16
 PALIGNR $4, t3, t2; \
 PALIGNR $4, t2, t1; \
 PALIGNR $4, t1, t0; \
-PSHUFB flip_mask<>(SB), t0
+PSHUFB ·flip_mask(SB), t0
 
 #define SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \
-PSHUFB flip_mask<>(SB), t0; \
-PSHUFB flip_mask<>(SB), t1; \
-PSHUFB flip_mask<>(SB), t2; \
-PSHUFB flip_mask<>(SB), t3; \
+PSHUFB ·flip_mask(SB), t0; \
+PSHUFB ·flip_mask(SB), t1; \
+PSHUFB ·flip_mask(SB), t2; \
+PSHUFB ·flip_mask(SB), t3; \
 SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3)
 
 #define SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3) \
@@ -266,10 +207,10 @@ GLOBL fk_mask<>(SB), 8, $16
 MOVOU (7*16)(RK), rk128; \
 SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
 SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
-PSHUFB bswap_mask<>(SB), t3; \
-PSHUFB bswap_mask<>(SB), t2; \
-PSHUFB bswap_mask<>(SB), t1; \
-PSHUFB bswap_mask<>(SB), t0
+PSHUFB ·bswap_mask(SB), t3; \
+PSHUFB ·bswap_mask(SB), t2; \
+PSHUFB ·bswap_mask(SB), t1; \
+PSHUFB ·bswap_mask(SB), t0
 
 #define SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
 PSHUFD $0, rk128, x; \
@@ -290,14 +231,14 @@ GLOBL fk_mask<>(SB), 8, $16
 SM4_ONE_ROUND_SSE(x, y, z, t7, t4, t5, t6); \
 
 #define SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
-PSHUFB flip_mask<>(SB), t0; \
-PSHUFB flip_mask<>(SB), t1; \
-PSHUFB flip_mask<>(SB), t2; \
-PSHUFB flip_mask<>(SB), t3; \
-PSHUFB flip_mask<>(SB), t4; \
-PSHUFB flip_mask<>(SB), t5; \
-PSHUFB flip_mask<>(SB), t6; \
-PSHUFB flip_mask<>(SB), t7; \
+PSHUFB ·flip_mask(SB), t0; \
+PSHUFB ·flip_mask(SB), t1; \
+PSHUFB ·flip_mask(SB), t2; \
+PSHUFB ·flip_mask(SB), t3; \
+PSHUFB ·flip_mask(SB), t4; \
+PSHUFB ·flip_mask(SB), t5; \
+PSHUFB ·flip_mask(SB), t6; \
+PSHUFB ·flip_mask(SB), t7; \
 SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7)
 
 #define SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
@@ -321,14 +262,14 @@ GLOBL fk_mask<>(SB), 8, $16
 SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
 SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
 SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \
-PSHUFB bswap_mask<>(SB), t3; \
-PSHUFB bswap_mask<>(SB), t2; \
-PSHUFB bswap_mask<>(SB), t1; \
-PSHUFB bswap_mask<>(SB), t0; \
-PSHUFB bswap_mask<>(SB), t7; \
-PSHUFB bswap_mask<>(SB), t6; \
-PSHUFB bswap_mask<>(SB), t5; \
-PSHUFB bswap_mask<>(SB), t4
+PSHUFB ·bswap_mask(SB), t3; \
+PSHUFB ·bswap_mask(SB), t2; \
+PSHUFB ·bswap_mask(SB), t1; \
+PSHUFB ·bswap_mask(SB), t0; \
+PSHUFB ·bswap_mask(SB), t7; \
+PSHUFB ·bswap_mask(SB), t6; \
+PSHUFB ·bswap_mask(SB), t5; \
+PSHUFB ·bswap_mask(SB), t4
 
 // SM4 sbox function, AVX version
 // parameters:
@@ -336,22 +277,22 @@ GLOBL fk_mask<>(SB), 8, $16
 // - y: 128 bits temp register
 // - tmp: 128 bits temp register
 #define AVX_SM4_SBOX(x, y, tmp) \
-VPAND nibble_mask<>(SB), x, tmp; \
-VMOVDQU m1_low<>(SB), y; \
+VPAND ·nibble_mask(SB), x, tmp; \
+VMOVDQU ·m1_low(SB), y; \
 VPSHUFB tmp, y, y; \
 VPSRLQ $4, x, x; \
-VPAND nibble_mask<>(SB), x, x; \
-VMOVDQU m1_high<>(SB), tmp; \
+VPAND ·nibble_mask(SB), x, x; \
+VMOVDQU ·m1_high(SB), tmp; \
 VPSHUFB x, tmp, x; \
 VPXOR y, x, x; \
-VPSHUFB inverse_shift_rows<>(SB), x, x; \
-VAESENCLAST nibble_mask<>(SB), x, x; \
-VPANDN nibble_mask<>(SB), x, tmp; \
-VMOVDQU m2_low<>(SB), y; \
+VPSHUFB ·inverse_shift_rows(SB), x, x; \
+VAESENCLAST ·nibble_mask(SB), x, x; \
+VPANDN ·nibble_mask(SB), x, tmp; \
+VMOVDQU ·m2_low(SB), y; \
 VPSHUFB tmp, y, y; \
 VPSRLQ $4, x, x; \
-VPAND nibble_mask<>(SB), x, x; \
-VMOVDQU m2_high<>(SB), tmp; \
+VPAND ·nibble_mask(SB), x, x; \
+VMOVDQU ·m2_high(SB), tmp; \
 VPSHUFB x, tmp, x; \
 VPXOR y, x, x
 
@@ -362,11 +303,11 @@ GLOBL fk_mask<>(SB), 8, $16
 // - tmp: 128 bits temp register
 #define AVX_SM4_TAO_L1(x, y, tmp) \
 AVX_SM4_SBOX(x, y, tmp); \
-VPSHUFB r08_mask<>(SB), x, y; \ // y = x <<< 8
-VPSHUFB r08_mask<>(SB), y, tmp; \ // tmp = x <<< 16
+VPSHUFB ·r08_mask(SB), x, y; \ // y = x <<< 8
+VPSHUFB ·r08_mask(SB), y, tmp; \ // tmp = x <<< 16
 VPXOR x, y, y; \ // y = x ^ (x <<< 8)
 VPXOR tmp, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16)
-VPSHUFB r08_mask<>(SB), tmp, tmp; \ // tmp = x <<< 24
+VPSHUFB ·r08_mask(SB), tmp, tmp; \ // tmp = x <<< 24
 VPXOR x, tmp, x; \ // x = x ^ (x <<< 24)
 VPSLLD $2, y, tmp; \
 VPSRLD $30, y, y; \
@@ -429,10 +370,10 @@ GLOBL fk_mask<>(SB), 8, $16
 SM4_ONE_ROUND_AVX(x, y, z, t3, t0, t1, t2); \
 
 #define AVX_SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \
-VPSHUFB flip_mask<>(SB), t0, t0 \
-VPSHUFB flip_mask<>(SB), t1, t1 \
-VPSHUFB flip_mask<>(SB), t2, t2 \
-VPSHUFB flip_mask<>(SB), t3, t3 \
+VPSHUFB ·flip_mask(SB), t0, t0 \
+VPSHUFB ·flip_mask(SB), t1, t1 \
+VPSHUFB ·flip_mask(SB), t2, t2 \
+VPSHUFB ·flip_mask(SB), t3, t3 \
 ; \
 AVX_SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3)
 
@@ -456,10 +397,10 @@ GLOBL fk_mask<>(SB), 8, $16
 SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
 ; \ // Transpose matrix 4 x 4 32bits word
 TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
-VPSHUFB bswap_mask<>(SB), t0, t0 \
-VPSHUFB bswap_mask<>(SB), t1, t1 \
-VPSHUFB bswap_mask<>(SB), t2, t2 \
-VPSHUFB bswap_mask<>(SB), t3, t3 \
+VPSHUFB ·bswap_mask(SB), t0, t0 \
+VPSHUFB ·bswap_mask(SB), t1, t1 \
+VPSHUFB ·bswap_mask(SB), t2, t2 \
+VPSHUFB ·bswap_mask(SB), t3, t3 \
 
 #define SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
 VPSHUFD $0, rk128, x; \
@@ -480,14 +421,14 @@ GLOBL fk_mask<>(SB), 8, $16
 SM4_ONE_ROUND_AVX(x, y, z, t7, t4, t5, t6); \
 
 #define AVX_SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
-VPSHUFB flip_mask<>(SB), t0, t0 \
-VPSHUFB flip_mask<>(SB), t1, t1 \
-VPSHUFB flip_mask<>(SB), t2, t2 \
-VPSHUFB flip_mask<>(SB), t3, t3 \
-VPSHUFB flip_mask<>(SB), t4, t4 \
-VPSHUFB flip_mask<>(SB), t5, t5 \
-VPSHUFB flip_mask<>(SB), t6, t6 \
-VPSHUFB flip_mask<>(SB), t7, t7 \
+VPSHUFB ·flip_mask(SB), t0, t0 \
+VPSHUFB ·flip_mask(SB), t1, t1 \
+VPSHUFB ·flip_mask(SB), t2, t2 \
+VPSHUFB ·flip_mask(SB), t3, t3 \
+VPSHUFB ·flip_mask(SB), t4, t4 \
+VPSHUFB ·flip_mask(SB), t5, t5 \
+VPSHUFB ·flip_mask(SB), t6, t6 \
+VPSHUFB ·flip_mask(SB), t7, t7 \
 ; \
 AVX_SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7)
 
@@ -513,14 +454,14 @@ GLOBL fk_mask<>(SB), 8, $16
 ; \ // Transpose matrix 4 x 4 32bits word
 TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
 TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \
-VPSHUFB bswap_mask<>(SB), t0, t0 \
-VPSHUFB bswap_mask<>(SB), t1, t1 \
-VPSHUFB bswap_mask<>(SB), t2, t2 \
-VPSHUFB bswap_mask<>(SB), t3, t3 \
-VPSHUFB bswap_mask<>(SB), t4, t4 \
-VPSHUFB bswap_mask<>(SB), t5, t5 \
-VPSHUFB bswap_mask<>(SB), t6, t6 \
-VPSHUFB bswap_mask<>(SB), t7, t7 \
+VPSHUFB ·bswap_mask(SB), t0, t0 \
+VPSHUFB ·bswap_mask(SB), t1, t1 \
+VPSHUFB ·bswap_mask(SB), t2, t2 \
+VPSHUFB ·bswap_mask(SB), t3, t3 \
+VPSHUFB ·bswap_mask(SB), t4, t4 \
+VPSHUFB ·bswap_mask(SB), t5, t5 \
+VPSHUFB ·bswap_mask(SB), t6, t6 \
+VPSHUFB ·bswap_mask(SB), t7, t7 \
 
 // SM4 sbox function, AVX2 version
 // parameters:
@@ -533,24 +474,24 @@ GLOBL fk_mask<>(SB), 8, $16
 // - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier.
 #define AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
 VPAND yNibbleMask, x, z; \
-VMOVDQU m1_low<>(SB), y; \
+VMOVDQU ·m1_low(SB), y; \
 VPSHUFB z, y, y; \
 VPSRLQ $4, x, x; \
 VPAND yNibbleMask, x, x; \
-VMOVDQU m1_high<>(SB), z; \
+VMOVDQU ·m1_high(SB), z; \
 VPSHUFB x, z, x; \
 VPXOR y, x, x; \
-VPSHUFB inverse_shift_rows<>(SB), x, x; \
+VPSHUFB ·inverse_shift_rows(SB), x, x; \
 VEXTRACTI128 $1, x, yw \
 VAESENCLAST xNibbleMask, xw, xw; \
 VAESENCLAST xNibbleMask, yw, yw; \
 VINSERTI128 $1, yw, x, x; \
 VPANDN yNibbleMask, x, z; \
-VMOVDQU m2_low<>(SB), y; \
+VMOVDQU ·m2_low(SB), y; \
 VPSHUFB z, y, y; \
 VPSRLQ $4, x, x; \
 VPAND yNibbleMask, x, x; \
-VMOVDQU m2_high<>(SB), z; \
+VMOVDQU ·m2_high(SB), z; \
 VPSHUFB x, z, x; \
 VPXOR y, x, x
 
@@ -565,11 +506,11 @@ GLOBL fk_mask<>(SB), 8, $16
 // - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier.
 #define AVX2_SM4_TAO_L1(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
 AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask); \
-VPSHUFB r08_mask<>(SB), x, y; \ // y = x <<< 8
-VPSHUFB r08_mask<>(SB), y, z; \ // z = x <<< 16
+VPSHUFB ·r08_mask(SB), x, y; \ // y = x <<< 8
+VPSHUFB ·r08_mask(SB), y, z; \ // z = x <<< 16
 VPXOR x, y, y; \ // y = x ^ (x <<< 8)
 VPXOR z, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16)
-VPSHUFB r08_mask<>(SB), z, z; \ // z = x <<< 24
+VPSHUFB ·r08_mask(SB), z, z; \ // z = x <<< 24
 VPXOR x, z, x; \ // x = x ^ (x <<< 24)
 VPSLLD $2, y, z; \
 VPSRLD $30, y, y; \
@@ -1,37 +1,9 @@
-// inverse shift rows
-DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
-DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
-GLOBL inverse_shift_rows<>(SB), (16+8), $16
-
-// Affine transform 1 & 2 (low and high nibbles)
-DATA m1_2<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
-DATA m1_2<>+0x08(SB)/8, $0x3045F98CEF9A2653
-DATA m1_2<>+0x10(SB)/8, $0xC35BF46CAF379800
-DATA m1_2<>+0x18(SB)/8, $0x68F05FC7049C33AB
-DATA m1_2<>+0x20(SB)/8, $0x9A950A05FEF16E61
-DATA m1_2<>+0x28(SB)/8, $0x0E019E916A65FAF5
-DATA m1_2<>+0x30(SB)/8, $0x892D69CD44E0A400
-DATA m1_2<>+0x38(SB)/8, $0x2C88CC68E14501A5
-GLOBL m1_2<>(SB), (16+8), $64
-
-// left rotations of 32-bit words by 8-bit increments
-DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
-DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
-GLOBL r08_mask<>(SB), (16+8), $16
-
-DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
-DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
-GLOBL fk_mask<>(SB), (16+8), $16
-
 #define LOAD_SM4_AESNI_CONSTS() \
 MOVW $0x0F0F0F0F, R20 \
 VDUP R20, NIBBLE_MASK.S4 \
-MOVD $m1_2<>(SB), R20 \
-VLD1 (R20), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \
-MOVD $inverse_shift_rows<>(SB), R20 \
-VLD1 (R20), [INVERSE_SHIFT_ROWS.B16] \
-MOVD $r08_mask<>(SB), R20 \
-VLD1 (R20), [R08_MASK.B16] \
+MOVD $·rcon(SB), R20 \
+VLD1.P 64(R20), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \
+VLD1 (R20), [R08_MASK.B16, INVERSE_SHIFT_ROWS.B16]
 
 // input: from high to low
 // t0 = t0.S3, t0.S2, t0.S1, t0.S0
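The rewritten LOAD_SM4_AESNI_CONSTS above also collapses three separate address loads into a single walk over one table. An annotated sketch of the new sequence (comments are mine; the offsets refer to the ·rcon table defined later in this diff):

    MOVD $·rcon(SB), R20                                  // R20 -> start of the shared constant table
    VLD1.P 64(R20), [M1L.B16, M1H.B16, M2L.B16, M2H.B16]  // load rcon+0x00..0x3f, then post-increment R20 by 64
    VLD1 (R20), [R08_MASK.B16, INVERSE_SHIFT_ROWS.B16]    // load rcon+0x40..0x5f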
@@ -13,6 +13,61 @@
 #define XTMP6 X10
 #define XTMP7 X11
 
+// shuffle byte order from LE to BE
+DATA ·flip_mask+0x00(SB)/8, $0x0405060700010203
+DATA ·flip_mask+0x08(SB)/8, $0x0c0d0e0f08090a0b
+GLOBL ·flip_mask(SB), RODATA, $16
+
+// shuffle byte and word order
+DATA ·bswap_mask+0x00(SB)/8, $0x08090a0b0c0d0e0f
+DATA ·bswap_mask+0x08(SB)/8, $0x0001020304050607
+GLOBL ·bswap_mask(SB), RODATA, $16
+
+//nibble mask
+DATA ·nibble_mask+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
+DATA ·nibble_mask+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
+GLOBL ·nibble_mask(SB), RODATA, $16
+
+// inverse shift rows
+DATA ·inverse_shift_rows+0x00(SB)/8, $0x0B0E0104070A0D00
+DATA ·inverse_shift_rows+0x08(SB)/8, $0x0306090C0F020508
+DATA ·inverse_shift_rows+0x10(SB)/8, $0x0B0E0104070A0D00
+DATA ·inverse_shift_rows+0x18(SB)/8, $0x0306090C0F020508
+GLOBL ·inverse_shift_rows(SB), RODATA, $32
+
+// Affine transform 1 (low and high nibbles)
+DATA ·m1_low+0x00(SB)/8, $0x0A7FC3B6D5A01C69
+DATA ·m1_low+0x08(SB)/8, $0x3045F98CEF9A2653
+DATA ·m1_low+0x10(SB)/8, $0x0A7FC3B6D5A01C69
+DATA ·m1_low+0x18(SB)/8, $0x3045F98CEF9A2653
+GLOBL ·m1_low(SB), RODATA, $32
+
+DATA ·m1_high+0x00(SB)/8, $0xC35BF46CAF379800
+DATA ·m1_high+0x08(SB)/8, $0x68F05FC7049C33AB
+DATA ·m1_high+0x10(SB)/8, $0xC35BF46CAF379800
+DATA ·m1_high+0x18(SB)/8, $0x68F05FC7049C33AB
+GLOBL ·m1_high(SB), RODATA, $32
+
+// Affine transform 2 (low and high nibbles)
+DATA ·m2_low+0x00(SB)/8, $0x9A950A05FEF16E61
+DATA ·m2_low+0x08(SB)/8, $0x0E019E916A65FAF5
+DATA ·m2_low+0x10(SB)/8, $0x9A950A05FEF16E61
+DATA ·m2_low+0x18(SB)/8, $0x0E019E916A65FAF5
+GLOBL ·m2_low(SB), RODATA, $32
+
+DATA ·m2_high+0x00(SB)/8, $0x892D69CD44E0A400
+DATA ·m2_high+0x08(SB)/8, $0x2C88CC68E14501A5
+DATA ·m2_high+0x10(SB)/8, $0x892D69CD44E0A400
+DATA ·m2_high+0x18(SB)/8, $0x2C88CC68E14501A5
+GLOBL ·m2_high(SB), RODATA, $32
+
+// left rotations of 32-bit words by 8-bit increments
+DATA ·r08_mask+0x00(SB)/8, $0x0605040702010003
+DATA ·r08_mask+0x08(SB)/8, $0x0E0D0C0F0A09080B
+DATA ·r08_mask+0x10(SB)/8, $0x0605040702010003
+DATA ·r08_mask+0x18(SB)/8, $0x0E0D0C0F0A09080B
+GLOBL ·r08_mask(SB), RODATA, $32
+
 #include "aesni_macros_amd64.s"
 
 // SM4 TAO L2 function, used for key expand
@@ -105,8 +160,8 @@ TEXT ·expandKeyAsm(SB),NOSPLIT,$0
 MOVQ dec+24(FP), DI
 
 MOVUPS 0(AX), t0
-PSHUFB flip_mask<>(SB), t0
-PXOR fk_mask<>(SB), t0
+PSHUFB ·flip_mask(SB), t0
+PXOR ·fk(SB), t0
 PSHUFD $1, t0, t1
 PSHUFD $2, t0, t2
 PSHUFD $3, t0, t3
@@ -225,7 +280,7 @@ avx_done_sm4:
 RET
 
 avx2:
-VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
+VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
 
 CMPQ DI, $256
 JEQ avx2_16blocks
@@ -235,7 +290,7 @@ avx2_8blocks:
 VMOVDQU 32(DX), XDWORD1
 VMOVDQU 64(DX), XDWORD2
 VMOVDQU 96(DX), XDWORD3
-VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
+VBROADCASTI128 ·flip_mask(SB), BYTE_FLIP_MASK
 
 // Apply Byte Flip Mask: LE -> BE
 VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
@@ -251,7 +306,7 @@ avx2_8blocks:
 // Transpose matrix 4 x 4 32bits word
 TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
 
-VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
+VBROADCASTI128 ·bswap_mask(SB), BYTE_FLIP_MASK
 VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
 VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
 VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
@@ -275,7 +330,7 @@ avx2_16blocks:
 VMOVDQU 192(DX), XDWORD6
 VMOVDQU 224(DX), XDWORD7
 
-VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
+VBROADCASTI128 ·flip_mask(SB), BYTE_FLIP_MASK
 
 // Apply Byte Flip Mask: LE -> BE
 VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
@@ -297,7 +352,7 @@ avx2_16blocks:
 TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
 TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)
 
-VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
+VBROADCASTI128 ·bswap_mask(SB), BYTE_FLIP_MASK
 VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
 VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
 VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
@@ -328,7 +383,7 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
 MOVQ src+16(FP), DX
 
 MOVUPS (DX), t0
-PSHUFB flip_mask<>(SB), t0
+PSHUFB ·flip_mask(SB), t0
 PSHUFD $1, t0, t1
 PSHUFD $2, t0, t2
 PSHUFD $3, t0, t3
@@ -353,7 +408,7 @@ loop:
 PUNPCKLLQ t2, t3
 PUNPCKLLQ t0, t1
 PUNPCKLQDQ t1, t3
-PSHUFB flip_mask<>(SB), t3
+PSHUFB ·flip_mask(SB), t3
 MOVUPS t3, (BX)
 
 done_sm4:
@@ -20,10 +20,26 @@
 #define M2H V23
 #define R08_MASK V24
 #define INVERSE_SHIFT_ROWS V25
-#define NIBBLE_MASK V26
-#define FK_MASK V27
+#define FK_MASK V26
+#define NIBBLE_MASK V27
 #define ZERO V28
 
+DATA ·rcon+0x00(SB)/8, $0x0A7FC3B6D5A01C69 // m1l
+DATA ·rcon+0x08(SB)/8, $0x3045F98CEF9A2653
+DATA ·rcon+0x10(SB)/8, $0xC35BF46CAF379800 // m1h
+DATA ·rcon+0x18(SB)/8, $0x68F05FC7049C33AB
+DATA ·rcon+0x20(SB)/8, $0x9A950A05FEF16E61 // m2l
+DATA ·rcon+0x28(SB)/8, $0x0E019E916A65FAF5
+DATA ·rcon+0x30(SB)/8, $0x892D69CD44E0A400 // m2h
+DATA ·rcon+0x38(SB)/8, $0x2C88CC68E14501A5
+DATA ·rcon+0x40(SB)/8, $0x0605040702010003 // left rotations of 32-bit words by 8-bit increments
+DATA ·rcon+0x48(SB)/8, $0x0E0D0C0F0A09080B
+DATA ·rcon+0x50(SB)/8, $0x0B0E0104070A0D00 // inverse shift rows
+DATA ·rcon+0x58(SB)/8, $0x0306090C0F020508
+DATA ·rcon+0x60(SB)/8, $0x56aa3350a3b1bac6 // fk
+DATA ·rcon+0x68(SB)/8, $0xb27022dc677d9197
+GLOBL ·rcon(SB), RODATA, $112
+
 #include "aesni_macros_arm64.s"
 
 #define SM4_TAO_L2(x, y) \
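Because fk sits at offset 0x60 of the consolidated ·rcon table above, the key-expansion loader in the next hunk can extend the same two-instruction walk by one more destination register. A sketch, assuming the FK_MASK define from this file is the intended target (the extracted diff text below abbreviates it):

    MOVD $·rcon(SB), R0                                             // base of the shared table
    VLD1.P 64(R0), [M1L.B16, M1H.B16, M2L.B16, M2H.B16]             // rcon+0x00..0x3f, R0 += 64
    VLD1 (R0), [R08_MASK.B16, INVERSE_SHIFT_ROWS.B16, FK_MASK.B16]  // rcon+0x40..0x6f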
@@ -49,14 +65,11 @@
 MOVW.P R2, -4(R11)
 
 #define LOAD_SM4KEY_AESNI_CONSTS() \
 MOVW $0x0F0F0F0F, R0 \
 VDUP R0, NIBBLE_MASK.S4 \
-MOVD $m1_2<>(SB), R0 \
-VLD1 (R0), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \
-MOVD $fk_mask<>(SB), R0 \
-VLD1 (R0), [FK_MASK.B16] \
-MOVD $inverse_shift_rows<>(SB), R0 \
-VLD1 (R0), [INVERSE_SHIFT_ROWS.B16]
+MOVD $·rcon(SB), R0 \
+VLD1.P 64(R0), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \
+VLD1 (R0), [R08_MASK.B16, INVERSE_SHIFT_ROWS.B16, FK_MASK.B16]
 
 #define SM4EKEY_EXPORT_KEYS() \
 VREV64 V8.S4, V11.S4 \
@@ -360,9 +360,9 @@ avxCbcSm4Done:
 RET
 
 avx2Start:
-VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
-VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
-VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK
+VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
+VBROADCASTI128 ·flip_mask(SB), BYTE_FLIP_MASK
+VBROADCASTI128 ·bswap_mask(SB), BSWAP_MASK
 
 VMOVDQU -16(DX), X15
 
@@ -16,15 +16,15 @@
 #define t7 V13
 #define IV V18
 
+#define LAST_BLOCK V15
 #define ZERO V16
-#define NIBBLE_MASK V20
-#define INVERSE_SHIFT_ROWS V21
-#define M1L V22
-#define M1H V23
-#define M2L V24
-#define M2H V25
-#define R08_MASK V26
-#define FK_MASK V27
+#define M1L V20
+#define M1H V21
+#define M2L V22
+#define M2H V23
+#define R08_MASK V24
+#define INVERSE_SHIFT_ROWS V25
+#define NIBBLE_MASK V26
 
 #include "aesni_macros_arm64.s"
 
@@ -49,7 +49,7 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
 
 ADD srcPtr, srcPtrLen, R10
 SUB $16, R10, R10
-VLD1 (R10), [V15.S4]
+VLD1 (R10), [LAST_BLOCK.S4]
 
 cbcSm4Octets:
 CMP $128, srcPtrLen
@@ -293,5 +293,5 @@ cbc4BlocksLoop48:
 VST1 [t0.S4, t1.S4, t2.S4], (dstPtr)
 
 cbcSm4Done:
-VST1 [V15.S4], (R6)
+VST1 [LAST_BLOCK.S4], (R6)
 RET
@@ -219,9 +219,9 @@ avxEcbSm4Done:
 RET
 
 avx2_start:
-VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
-VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
-VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK
+VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
+VBROADCASTI128 ·flip_mask(SB), BYTE_FLIP_MASK
+VBROADCASTI128 ·bswap_mask(SB), BSWAP_MASK
 
 avx2_16blocks:
 CMPQ DI, $256
|
@ -8,15 +8,6 @@
|
|||||||
#define t1 V3
|
#define t1 V3
|
||||||
#define t2 V4
|
#define t2 V4
|
||||||
#define t3 V5
|
#define t3 V5
|
||||||
#define ZERO V16
|
|
||||||
#define NIBBLE_MASK V20
|
|
||||||
#define INVERSE_SHIFT_ROWS V21
|
|
||||||
#define M1L V22
|
|
||||||
#define M1H V23
|
|
||||||
#define M2L V24
|
|
||||||
#define M2H V25
|
|
||||||
#define R08_MASK V26
|
|
||||||
#define FK_MASK V27
|
|
||||||
#define XTMP6 V6
|
#define XTMP6 V6
|
||||||
#define XTMP7 V7
|
#define XTMP7 V7
|
||||||
#define t4 V10
|
#define t4 V10
|
||||||
@@ -24,6 +15,15 @@
 #define t6 V12
 #define t7 V13
 
+#define ZERO V16
+#define M1L V20
+#define M1H V21
+#define M2L V22
+#define M2H V23
+#define R08_MASK V24
+#define INVERSE_SHIFT_ROWS V25
+#define NIBBLE_MASK V26
+
 #include "aesni_macros_arm64.s"
 
 // func encryptSm4Ecb(xk *uint32, dst, src []byte)
@@ -95,7 +95,7 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
 MOVOU (tPtr), ACC0
 MOVOU (tMsk), T2
 
-MOVOU bswap_mask<>(SB), BSWAP
+MOVOU ·bswap_mask(SB), BSWAP
 MOVOU gcmPoly<>(SB), POLY
 
 SHLQ $3, plen
@@ -279,7 +279,7 @@ TEXT ·gcmSm4Data(SB),NOSPLIT,$0
 
 PXOR ACC0, ACC0
 // MOVOU (tPtr), ACC0 // originally we passed in tag initial value
-MOVOU bswap_mask<>(SB), BSWAP
+MOVOU ·bswap_mask(SB), BSWAP
 MOVOU gcmPoly<>(SB), POLY
 
 TESTQ autLen, autLen
@@ -527,14 +527,14 @@ TEXT ·gcmSm4Enc(SB),0,$256-96
 CMPB ·useAVX(SB), $1
 JE avxGcmSm4Enc
 
-MOVOU bswap_mask<>(SB), BSWAP
+MOVOU ·bswap_mask(SB), BSWAP
 MOVOU gcmPoly<>(SB), POLY
 
 MOVOU (tPtr), ACC0
 PXOR ACC1, ACC1
 PXOR ACCM, ACCM
 MOVOU (ctrPtr), T0
-PSHUFB flip_mask<>(SB), T0
+PSHUFB ·flip_mask(SB), T0
 PEXTRD $3, T0, aluCTR
 
 MOVOU T0, (8*16 + 0*16)(SP)
@@ -870,14 +870,14 @@ gcmSm4EncDone:
 RET
 
 avxGcmSm4Enc:
-VMOVDQU bswap_mask<>(SB), BSWAP
+VMOVDQU ·bswap_mask(SB), BSWAP
 VMOVDQU gcmPoly<>(SB), POLY
 
 VMOVDQU (tPtr), ACC0
 VPXOR ACC1, ACC1, ACC1
 VPXOR ACCM, ACCM, ACCM
 VMOVDQU (ctrPtr), T0
-VPSHUFB flip_mask<>(SB), T0, T0
+VPSHUFB ·flip_mask(SB), T0, T0
 VPEXTRD $3, T0, aluCTR
 
 VMOVDQU T0, (8*16 + 0*16)(SP)
@@ -1198,14 +1198,14 @@ avxGcmSm4EncDone:
 RET
 
 avx2GcmSm4Enc:
-VMOVDQU bswap_mask<>(SB), BSWAP
+VMOVDQU ·bswap_mask(SB), BSWAP
 VMOVDQU gcmPoly<>(SB), POLY
 
 VMOVDQU (tPtr), ACC0
 VPXOR ACC1, ACC1, ACC1
 VPXOR ACCM, ACCM, ACCM
 VMOVDQU (ctrPtr), T0
-VPSHUFB flip_mask<>(SB), T0, T0
+VPSHUFB ·flip_mask(SB), T0, T0
 VPEXTRD $3, T0, aluCTR
 
 VINSERTI128 $1, T0, Y11, Y11
@@ -1228,7 +1228,7 @@ avx2GcmSm4Enc:
 increment(6)
 increment(7)
 
-VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
+VBROADCASTI128 ·bswap_mask(SB), DWBSWAP
 // load 8 ctrs for encryption
 VMOVDQU (4*32 + 0*32)(SP), DWB0
 VMOVDQU (4*32 + 1*32)(SP), DWB1
@@ -1239,7 +1239,7 @@ avx2GcmSm4Enc:
 // Transpose matrix 4 x 4 32bits word
 TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)
 
-VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
+VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
 increment(1)
 AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP0, DWB0, DWB1, DWB2, DWB3)
 increment(2)
@@ -1613,14 +1613,14 @@ TEXT ·gcmSm4Dec(SB),0,$128-96
 CMPB ·useAVX(SB), $1
 JE avxGcmSm4Dec
 
-MOVOU bswap_mask<>(SB), BSWAP
+MOVOU ·bswap_mask(SB), BSWAP
 MOVOU gcmPoly<>(SB), POLY
 
 MOVOU (tPtr), ACC0
 PXOR ACC1, ACC1
 PXOR ACCM, ACCM
 MOVOU (ctrPtr), T0
-PSHUFB flip_mask<>(SB), T0
+PSHUFB ·flip_mask(SB), T0
 PEXTRD $3, T0, aluCTR
 
 MOVOU T0, (0*16)(SP)
@@ -1841,14 +1841,14 @@ gcmSm4DecDone:
 RET
 
 avxGcmSm4Dec:
-VMOVDQU bswap_mask<>(SB), BSWAP
+VMOVDQU ·bswap_mask(SB), BSWAP
 VMOVDQU gcmPoly<>(SB), POLY
 
 VMOVDQU (tPtr), ACC0
 VPXOR ACC1, ACC1, ACC1
 VPXOR ACCM, ACCM, ACCM
 VMOVDQU (ctrPtr), T0
-VPSHUFB flip_mask<>(SB), T0, T0
+VPSHUFB ·flip_mask(SB), T0, T0
 VPEXTRD $3, T0, aluCTR
 
 VMOVDQU T0, (0*16)(SP)
@@ -2065,14 +2065,14 @@ avxGcmSm4DecDone:
 RET
 
 avx2GcmSm4Dec:
-VMOVDQU bswap_mask<>(SB), BSWAP
+VMOVDQU ·bswap_mask(SB), BSWAP
 VMOVDQU gcmPoly<>(SB), POLY
 
 VMOVDQU (tPtr), ACC0
 VPXOR ACC1, ACC1, ACC1
 VPXOR ACCM, ACCM, ACCM
 VMOVDQU (ctrPtr), T0
-VPSHUFB flip_mask<>(SB), T0, T0
+VPSHUFB ·flip_mask(SB), T0, T0
 VPEXTRD $3, T0, aluCTR
 
 VINSERTI128 $1, T0, Y11, Y11
@@ -2094,8 +2094,8 @@ avx2GcmSm4Dec:
 increment(6)
 increment(7)
 
-VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
-VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
+VBROADCASTI128 ·bswap_mask(SB), DWBSWAP
+VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
 
 avx2GcmSm4DecOctetsLoop:
 CMPQ ptxLen, $128
@@ -29,13 +29,14 @@
 #define K1 V20
 #define K2 V21
 #define K3 V22
-#define NIBBLE_MASK V23
-#define INVERSE_SHIFT_ROWS V24
-#define M1L V25
-#define M1H V26
-#define M2L V27
-#define M2H V28
-#define R08_MASK V29
+#define M1L V23
+#define M1H V24
+#define M2L V25
+#define M2H V26
+#define R08_MASK V27
+#define INVERSE_SHIFT_ROWS V28
+#define NIBBLE_MASK V29
+
 
 #define reduce() \
 VEOR ACC0.B16, ACCM.B16, ACCM.B16 \
@@ -329,7 +329,7 @@ GLOBL gbGcmPoly<>(SB), (NOPTR+RODATA), $16
 VPXOR (32*7)(SP), Y7, Y7
 
 #define avx2LE2BE8Blocks \
-VBROADCASTI128 flip_mask<>(SB), Y11; \
+VBROADCASTI128 ·flip_mask(SB), Y11; \
 VPSHUFB Y11, Y0, Y0; \
 VPSHUFB Y11, Y1, Y1; \
 VPSHUFB Y11, Y2, Y2; \
@@ -589,8 +589,8 @@ avxXtsSm4EncDone:
 avx2XtsSm4Enc:
 VMOVDQU gcmPoly<>(SB), POLY
 VMOVDQU (0*16)(BX), TW
-VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
-VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
+VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
+VBROADCASTI128 ·bswap_mask(SB), DWBSWAP
 
 avx2XtsSm4Enc16Blocks:
 CMPQ DI, $256
@@ -735,7 +735,7 @@ TEXT ·encryptSm4XtsGB(SB),0,$256-64
 JE avxXtsSm4Enc
 
 MOVOU gbGcmPoly<>(SB), POLY
-MOVOU bswap_mask<>(SB), BSWAP
+MOVOU ·bswap_mask(SB), BSWAP
 MOVOU (0*16)(BX), TW
 
 xtsSm4EncOctets:
@@ -834,7 +834,7 @@ xtsSm4EncDone:
 
 avxXtsSm4Enc:
 VMOVDQU gbGcmPoly<>(SB), POLY
-VMOVDQU bswap_mask<>(SB), BSWAP
+VMOVDQU ·bswap_mask(SB), BSWAP
 VMOVDQU (0*16)(BX), TW
 
 avxXtsSm4EncOctets:
@@ -934,8 +934,8 @@ avxXtsSm4EncDone:
 avx2XtsSm4Enc:
 VMOVDQU gbGcmPoly<>(SB), POLY
 VMOVDQU (0*16)(BX), TW
-VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
-VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
+VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
+VBROADCASTI128 ·bswap_mask(SB), DWBSWAP
 
 avx2XtsSm4Enc16Blocks:
 CMPQ DI, $256
@@ -1327,8 +1327,8 @@ avxXtsSm4DecDone:
 avx2XtsSm4Dec:
 VMOVDQU gcmPoly<>(SB), POLY
 VMOVDQU (0*16)(BX), TW
-VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
-VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
+VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
+VBROADCASTI128 ·bswap_mask(SB), DWBSWAP
 
 avx2XtsSm4Dec16Blocks:
 CMPQ DI, $256
@@ -1498,7 +1498,7 @@ TEXT ·decryptSm4XtsGB(SB),0,$256-64
 JE avxXtsSm4Dec
 
 MOVOU gbGcmPoly<>(SB), POLY
-MOVOU bswap_mask<>(SB), BSWAP
+MOVOU ·bswap_mask(SB), BSWAP
 MOVOU (0*16)(BX), TW
 
 xtsSm4DecOctets:
@@ -1622,7 +1622,7 @@ xtsSm4DecDone:
 
 avxXtsSm4Dec:
 VMOVDQU gbGcmPoly<>(SB), POLY
-VMOVDQU bswap_mask<>(SB), BSWAP
+VMOVDQU ·bswap_mask(SB), BSWAP
 VMOVDQU (0*16)(BX), TW
 
 avxXtsSm4DecOctets:
@@ -1747,8 +1747,8 @@ avxXtsSm4DecDone:
 avx2XtsSm4Dec:
 VMOVDQU gbGcmPoly<>(SB), POLY
 VMOVDQU (0*16)(BX), TW
-VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
-VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
+VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
+VBROADCASTI128 ·bswap_mask(SB), DWBSWAP
 
 avx2XtsSm4Dec16Blocks:
 CMPQ DI, $256
|
@ -29,13 +29,13 @@
|
|||||||
#define K2 V21
|
#define K2 V21
|
||||||
#define K3 V22
|
#define K3 V22
|
||||||
|
|
||||||
#define NIBBLE_MASK V23
|
#define M1L V23
|
||||||
#define INVERSE_SHIFT_ROWS V24
|
#define M1H V24
|
||||||
#define M1L V25
|
#define M2L V25
|
||||||
#define M1H V26
|
#define M2H V26
|
||||||
#define M2L V27
|
#define R08_MASK V27
|
||||||
#define M2H V28
|
#define INVERSE_SHIFT_ROWS V28
|
||||||
#define R08_MASK V29
|
#define NIBBLE_MASK V29
|
||||||
|
|
||||||
#include "aesni_macros_arm64.s"
|
#include "aesni_macros_arm64.s"
|
||||||
#include "xts_macros_arm64.s"
|
#include "xts_macros_arm64.s"
|
||||||
|
@ -28,7 +28,6 @@ DATA rcon<>+0x90(SB)/8, $0x00ff00ff00ff00ff // S1
|
|||||||
DATA rcon<>+0x98(SB)/8, $0x00ff00ff00ff00ff
|
DATA rcon<>+0x98(SB)/8, $0x00ff00ff00ff00ff
|
||||||
GLOBL rcon<>(SB), RODATA, $160
|
GLOBL rcon<>(SB), RODATA, $160
|
||||||
|
|
||||||
|
|
||||||
#define M1L V20
|
#define M1L V20
|
||||||
#define M1H V21
|
#define M1H V21
|
||||||
#define M2L V22
|
#define M2L V22
|
||||||