From 8ddf1bc68fa1d0de95628b6d4436dd254e65f564 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Thu, 21 Jul 2022 13:41:56 +0800 Subject: [PATCH] sm4: reduce duplicated asm code --- sm4/aesni_amd64.h | 255 +++++++++++++++++++++++++++++++++++++++ sm4/aesni_arm64.h | 111 +++++++++++++++++ sm4/asm_amd64.s | 211 ++------------------------------- sm4/asm_arm64.s | 80 +------------ sm4/gcm_amd64.s | 296 ++++------------------------------------------ sm4/gcm_arm64.s | 108 +---------------- 6 files changed, 406 insertions(+), 655 deletions(-) create mode 100644 sm4/aesni_amd64.h create mode 100644 sm4/aesni_arm64.h diff --git a/sm4/aesni_amd64.h b/sm4/aesni_amd64.h new file mode 100644 index 0000000..f79fda1 --- /dev/null +++ b/sm4/aesni_amd64.h @@ -0,0 +1,255 @@ +// shuffle byte order from LE to BE +DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203 +DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b +GLOBL flip_mask<>(SB), RODATA, $16 + +// shuffle byte and word order +DATA bswap_mask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f +DATA bswap_mask<>+0x08(SB)/8, $0x0001020304050607 +GLOBL bswap_mask<>(SB), RODATA, $16 + +//nibble mask +DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F +DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F +GLOBL nibble_mask<>(SB), RODATA, $16 + +// inverse shift rows +DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00 +DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508 +GLOBL inverse_shift_rows<>(SB), RODATA, $16 + +// Affine transform 1 (low and high nibbles) +DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69 +DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653 +GLOBL m1_low<>(SB), RODATA, $16 + +DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800 +DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB +GLOBL m1_high<>(SB), RODATA, $16 + +// Affine transform 2 (low and high nibbles) +DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61 +DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5 +GLOBL m2_low<>(SB), RODATA, $16 + +DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400 +DATA 
m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5 +GLOBL m2_high<>(SB), RODATA, $16 + +// left rotations of 32-bit words by 8-bit increments +DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003 +DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B +GLOBL r08_mask<>(SB), RODATA, $16 + +DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302 +DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A +GLOBL r16_mask<>(SB), RODATA, $16 + +DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201 +DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09 +GLOBL r24_mask<>(SB), RODATA, $16 + +DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6 +DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197 +GLOBL fk_mask<>(SB), RODATA, $16 + +// MOVOU r0, tmp2; +// PUNPCKHDQ r1, tmp2; +// PUNPCKLDQ r1, r0; +// MOVOU r2, tmp1; +// PUNPCKLDQ r3, tmp1; +// PUNPCKHDQ r3, r2; +// MOVOU r0, r1; +// PUNPCKHQDQ tmp1, r1; +// PUNPCKLQDQ tmp1, r0; +// MOVOU tmp2, r3; +// PUNPCKHQDQ r2, r3; +// PUNPCKLQDQ r2, tmp2; +// MOVOU tmp2, r2 +#define SSE_TRANSPOSE_MATRIX(r, r0, r1, r2, r3, tmp1, tmp2) \ + PEXTRD $2, r0, r; \ + PINSRD $0, r, tmp2; \ + PEXTRD $2, r1, r; \ + PINSRD $1, r, tmp2; \ + ; \ + PEXTRD $3, r0, r; \ + PINSRD $2, r, tmp2; \ + PEXTRD $3, r1, r; \ + PINSRD $3, r, tmp2; \ // tmp2 = [w7, w3, w6, w2] + ; \ + PEXTRD $1, r0, r; \ + PINSRD $2, r, r0; \ + PEXTRD $0, r1, r; \ + PINSRD $1, r, r0; \ + PEXTRD $1, r1, r; \ + PINSRD $3, r, r0; \ // r0 = [w5, w1, w4, w0] + ; \ + PEXTRD $0, r2, r; \ + PINSRD $0, r, tmp1; \ + PEXTRD $0, r3, r; \ + PINSRD $1, r, tmp1; \ + PEXTRD $1, r2, r; \ + PINSRD $2, r, tmp1; \ + PEXTRD $1, r3, r; \ + PINSRD $3, r, tmp1; \ // tmp1 = [w13, w9, w12, w8] + ; \ + PEXTRD $2, r2, r; \ + PINSRD $0, r, r2; \ + PEXTRD $2, r3, r; \ + PINSRD $1, r, r2; \ + PEXTRD $3, r2, r; \ + PINSRD $2, r, r2; \ + PEXTRD $3, r3, r; \ + PINSRD $3, r, r2; \ // r2 = [w15, w11, w14, w10] + ; \ + MOVOU r0, r1; \ + PEXTRQ $1, r1, r; \ + PINSRQ $0, r, r1; \ + PEXTRQ $1, tmp1, r; \ + PINSRQ $1, r, r1; \ // r1 = [w13, w9, w5, w1] + ; \ + PEXTRQ $0, tmp1, r; \ 
+ PINSRQ $1, r, r0; \ // r0 = [w12, w8, w4, w0] + ; \ + MOVOU tmp2, r3; \ + PEXTRQ $1, r3, r; \ + PINSRQ $0, r, r3; \ + PEXTRQ $1, r2, r; \ + PINSRQ $1, r, r3; \ // r3 = [w15, w11, w7, w3] + ; \ + PEXTRQ $0, r2, r; \ + PINSRQ $1, r, r2; \ + PEXTRQ $0, tmp2, r; \ + PINSRQ $0, r, r2 + +#define SM4_SBOX(x, y, z) \ + ; \ //############################# inner affine ############################// + MOVOU x, z; \ + PAND nibble_mask<>(SB), z; \ //y = _mm_and_si128(x, c0f); + MOVOU m1_low<>(SB), y; \ + PSHUFB z, y; \ //y = _mm_shuffle_epi8(m1l, y); + PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4); + PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f); + MOVOU m1_high<>(SB), z; \ + PSHUFB x, z; \ //x = _mm_shuffle_epi8(m1h, x); + MOVOU z, x; \ //x = _mm_shuffle_epi8(m1h, x); + PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y; + ; \ // inverse ShiftRows + PSHUFB inverse_shift_rows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr); + AESENCLAST nibble_mask<>(SB), x; \ // AESNI instruction + ; \ //############################# outer affine ############################// + MOVOU x, z; \ + PANDN nibble_mask<>(SB), z; \ //z = _mm_andnot_si128(x, c0f); + MOVOU m2_low<>(SB), y; \ + PSHUFB z, y; \ //y = _mm_shuffle_epi8(m2l, z) + PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4); + PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f); + MOVOU m2_high<>(SB), z; \ + PSHUFB x, z; \ + MOVOU z, x; \ //x = _mm_shuffle_epi8(m2h, x) + PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y; + +#define SM4_TAO_L1(x, y, z) \ + SM4_SBOX(x, y, z); \ + ; \ //#################### 4 parallel L1 linear transforms ##################// + MOVOU x, y; \ + PSHUFB r08_mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08) + PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08) + MOVOU x, z; \ + PSHUFB r16_mask<>(SB), z; \ + PXOR z, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16) + MOVOU y, z; \ + PSLLL $2, z; \ + PSRLL $30, y; \ + POR z, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30); + MOVOU x, z; \ + 
PSHUFB r24_mask<>(SB), z; \ + PXOR y, x; \ //x = x xor y + PXOR z, x //x = x xor y xor _mm_shuffle_epi8(x, r24); + +#define AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp) \ + VPAND X_NIBBLE_MASK, x, tmp; \ + VMOVDQU m1_low<>(SB), y; \ + VPSHUFB tmp, y, y; \ + VPSRLQ $4, x, x; \ + VPAND X_NIBBLE_MASK, x, x; \ + VMOVDQU m1_high<>(SB), tmp; \ + VPSHUFB x, tmp, x; \ + VPXOR y, x, x; \ + VMOVDQU inverse_shift_rows<>(SB), tmp; \ + VPSHUFB tmp, x, x; \ + VAESENCLAST X_NIBBLE_MASK, x, x; \ + VPANDN X_NIBBLE_MASK, x, tmp; \ + VMOVDQU m2_low<>(SB), y; \ + VPSHUFB tmp, y, y; \ + VPSRLQ $4, x, x; \ + VPAND X_NIBBLE_MASK, x, x; \ + VMOVDQU m2_high<>(SB), tmp; \ + VPSHUFB x, tmp, x; \ + VPXOR y, x, x + +#define AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp) \ + AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp); \ + VMOVDQU r08_mask<>(SB), tmp; \ + VPSHUFB tmp, x, y; \ + VPXOR x, y, y; \ + VMOVDQU r16_mask<>(SB), tmp; \ + VPSHUFB tmp, x, tmp; \ + VPXOR tmp, y, y; \ + VPSLLD $2, y, tmp; \ + VPSRLD $30, y, y; \ + VPXOR tmp, y, y; \ + VMOVDQU r24_mask<>(SB), tmp; \ + VPSHUFB tmp, x, tmp; \ + VPXOR y, x, x; \ + VPXOR x, tmp, x + +#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \ + VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2] + VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0] + VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16] tmp1 = [w13, w9, w12, w8] + VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w23, w30, w22, w27, w19, w26, w18] r2 = [w15, w11, w14, w10] + VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, w5, w25, w17, w9, w1] r1 = [w13, w9, w5, w1] + VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0] r0 = [w12, w8, w4, w0] + VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w23, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3] + VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2] + +#define AVX2_SM4_SBOX(x, y, xw, yw, 
xNibbleMask, yNibbleMask, tmp) \ + VPAND yNibbleMask, x, tmp; \ + VBROADCASTI128 m1_low<>(SB), y; \ + VPSHUFB tmp, y, y; \ + VPSRLQ $4, x, x; \ + VPAND yNibbleMask, x, x; \ + VBROADCASTI128 m1_high<>(SB), tmp; \ + VPSHUFB x, tmp, x; \ + VPXOR y, x, x; \ + VBROADCASTI128 inverse_shift_rows<>(SB), tmp; \ + VPSHUFB tmp, x, x; \ + VEXTRACTI128 $1, x, yw; \ + VAESENCLAST xNibbleMask, xw, xw; \ + VAESENCLAST xNibbleMask, yw, yw; \ + VINSERTI128 $1, yw, x, x; \ + VPANDN yNibbleMask, x, tmp; \ + VBROADCASTI128 m2_low<>(SB), y; \ + VPSHUFB tmp, y, y; \ + VPSRLQ $4, x, x; \ + VPAND yNibbleMask, x, x; \ + VBROADCASTI128 m2_high<>(SB), tmp; \ + VPSHUFB x, tmp, x; \ + VPXOR y, x, x + +#define AVX2_SM4_TAO_L1(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp) \ + AVX2_SM4_SBOX(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp); \ + VBROADCASTI128 r08_mask<>(SB), tmp; \ + VPSHUFB tmp, x, y; \ + VPXOR x, y, y; \ + VBROADCASTI128 r16_mask<>(SB), tmp; \ + VPSHUFB tmp, x, tmp; \ + VPXOR tmp, y, y; \ + VPSLLD $2, y, tmp; \ + VPSRLD $30, y, y; \ + VPXOR tmp, y, y; \ + VBROADCASTI128 r24_mask<>(SB), tmp; \ + VPSHUFB tmp, x, tmp; \ + VPXOR y, x, x; \ + VPXOR x, tmp, x diff --git a/sm4/aesni_arm64.h b/sm4/aesni_arm64.h new file mode 100644 index 0000000..9dbb2ea --- /dev/null +++ b/sm4/aesni_arm64.h @@ -0,0 +1,111 @@ +//nibble mask +DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F +DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F +GLOBL nibble_mask<>(SB), (NOPTR+RODATA), $16 + +// inverse shift rows +DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00 +DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508 +GLOBL inverse_shift_rows<>(SB), (NOPTR+RODATA), $16 + +// Affine transform 1 (low and high nibbles) +DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69 +DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653 +GLOBL m1_low<>(SB), (NOPTR+RODATA), $16 + +DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800 +DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB +GLOBL m1_high<>(SB), (NOPTR+RODATA), $16 + +// Affine 
transform 2 (low and high nibbles) +DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61 +DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5 +GLOBL m2_low<>(SB), (NOPTR+RODATA), $16 + +DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400 +DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5 +GLOBL m2_high<>(SB), (NOPTR+RODATA), $16 + +// left rotations of 32-bit words by 8-bit increments +DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003 +DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B +GLOBL r08_mask<>(SB), (NOPTR+RODATA), $16 + +DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302 +DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A +GLOBL r16_mask<>(SB), (NOPTR+RODATA), $16 + +DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201 +DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09 +GLOBL r24_mask<>(SB), (NOPTR+RODATA), $16 + +DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6 +DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197 +GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16 + +#define PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \ + VMOV t0.B16, K.B16 \ + VMOV t1.S[0], t0.S[1] \ + VMOV t2.S[0], t0.S[2] \ + VMOV t3.S[0], t0.S[3] \ + VMOV K.S[1], t1.S[0] \ + VMOV K.S[2], t2.S[0] \ + VMOV K.S[3], t3.S[0] \ + VMOV t1.D[1], K.D[1] \ + VMOV t2.S[1], t1.S[2] \ + VMOV t3.S[1], t1.S[3] \ + VMOV K.S[2], t2.S[1] \ + VMOV K.S[3], t3.S[1] \ + VMOV t2.S[3], K.S[3] \ + VMOV t3.S[2], t2.S[3] \ + VMOV K.S[3], t3.S[2] + +#define TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \ + VMOV t0.B16, K.B16 \ + VMOV t3.S[0], t0.S[0] \ + VMOV t2.S[0], t0.S[1] \ + VMOV t1.S[0], t0.S[2] \ + VMOV K.S[0], t0.S[3] \ + VMOV t3.S[1], t1.S[0] \ + VMOV t3.S[2], t2.S[0] \ + VMOV t3.S[3], t3.S[0] \ + VMOV t2.S[3], t3.S[1] \ + VMOV t1.S[3], t3.S[2] \ + VMOV K.S[3], t3.S[3] \ + VMOV K.S[2], t2.S[3] \ + VMOV K.S[1], t1.S[3] \ + VMOV t1.B16, K.B16 \ + VMOV t2.S[1], t1.S[1] \ + VMOV K.S[1], t1.S[2] \ + VMOV t2.S[2], t2.S[1] \ + VMOV K.S[2], t2.S[2] + +#define SM4_SBOX(x, y, z) \ + ; \ + VAND x.B16, NIBBLE_MASK.B16, z.B16; \ + VTBL z.B16, [M1L.B16], y.B16; \ + VUSHR $4, x.D2, 
x.D2; \ + VAND x.B16, NIBBLE_MASK.B16, z.B16; \ + VTBL z.B16, [M1H.B16], z.B16; \ + VEOR y.B16, z.B16, x.B16; \ + VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16; \ + AESE ZERO.B16, x.B16; \ + VAND x.B16, NIBBLE_MASK.B16, z.B16; \ + VTBL z.B16, [M2L.B16], y.B16; \ + VUSHR $4, x.D2, x.D2; \ + VAND x.B16, NIBBLE_MASK.B16, z.B16; \ + VTBL z.B16, [M2H.B16], z.B16; \ + VEOR y.B16, z.B16, x.B16 + +#define SM4_TAO_L1(x, y, z) \ + SM4_SBOX(x, y, z); \ + VTBL R08_MASK.B16, [x.B16], y.B16; \ + VEOR y.B16, x.B16, y.B16; \ + VTBL R16_MASK.B16, [x.B16], z.B16; \ + VEOR z.B16, y.B16, y.B16; \ + VSHL $2, y.S4, z.S4; \ + VUSHR $30, y.S4, y.S4; \ + VORR y.B16, z.B16, y.B16; \ + VTBL R24_MASK.B16, [x.B16], z.B16; \ + VEOR z.B16, x.B16, x.B16; \ + VEOR y.B16, x.B16, x.B16 diff --git a/sm4/asm_amd64.s b/sm4/asm_amd64.s index 94b6908..b6c49c8 100644 --- a/sm4/asm_amd64.s +++ b/sm4/asm_amd64.s @@ -14,108 +14,10 @@ #define XTMP6 X6 #define XTMP7 X7 -// shuffle byte order from LE to BE -DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203 -DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b -GLOBL flip_mask<>(SB), RODATA, $16 - -// shuffle byte and word order -DATA bswap_mask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f -DATA bswap_mask<>+0x08(SB)/8, $0x0001020304050607 -GLOBL bswap_mask<>(SB), RODATA, $16 - -//nibble mask -DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F -DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F -GLOBL nibble_mask<>(SB), RODATA, $16 - -// inverse shift rows -DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00 -DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508 -GLOBL inverse_shift_rows<>(SB), RODATA, $16 - -// Affine transform 1 (low and high hibbles) -DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69 -DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653 -GLOBL m1_low<>(SB), RODATA, $16 - -DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800 -DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB -GLOBL m1_high<>(SB), RODATA, $16 - -// Affine transform 2 (low and high hibbles) -DATA 
m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61 -DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5 -GLOBL m2_low<>(SB), RODATA, $16 - -DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400 -DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5 -GLOBL m2_high<>(SB), RODATA, $16 - -// left rotations of 32-bit words by 8-bit increments -DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003 -DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B -GLOBL r08_mask<>(SB), RODATA, $16 - -DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302 -DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A -GLOBL r16_mask<>(SB), RODATA, $16 - -DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201 -DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09 -GLOBL r24_mask<>(SB), RODATA, $16 - -DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6 -DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197 -GLOBL fk_mask<>(SB), RODATA, $16 - -#define SM4_SBOX(x, y) \ - ; \ //############################# inner affine ############################// - MOVOU x, XTMP6; \ - PAND nibble_mask<>(SB), XTMP6; \ //y = _mm_and_si128(x, c0f); - MOVOU m1_low<>(SB), y; \ - PSHUFB XTMP6, y; \ //y = _mm_shuffle_epi8(m1l, y); - PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4); - PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f); - MOVOU m1_high<>(SB), XTMP6; \ - PSHUFB x, XTMP6; \ //x = _mm_shuffle_epi8(m1h, x); - MOVOU XTMP6, x; \ //x = _mm_shuffle_epi8(m1h, x); - PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y; - ; \ // inverse ShiftRows - PSHUFB inverse_shift_rows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr); - AESENCLAST nibble_mask<>(SB), x; \ // AESNI instruction - ; \ //############################# outer affine ############################// - MOVOU x, XTMP6; \ - PANDN nibble_mask<>(SB), XTMP6; \ //XTMP6 = _mm_andnot_si128(x, c0f); - MOVOU m2_low<>(SB), y; \ - PSHUFB XTMP6, y; \ //y = _mm_shuffle_epi8(m2l, XTMP6) - PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4); - PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f); - MOVOU m2_high<>(SB), XTMP6; \ - PSHUFB x, XTMP6; \ - MOVOU XTMP6, 
x; \ //x = _mm_shuffle_epi8(m2h, x) - PXOR y, x; \ //x = _mm_shuffle_epi8(m2h, x) ^ y; - -#define SM4_TAO_L1(x, y) \ - SM4_SBOX(x, y); \ - ; \ //#################### 4 parallel L1 linear transforms ##################// - MOVOU x, y; \ - PSHUFB r08_mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08) - PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08) - MOVOU x, XTMP6; \ - PSHUFB r16_mask<>(SB), XTMP6; \ - PXOR XTMP6, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16) - MOVOU y, XTMP6; \ - PSLLL $2, XTMP6; \ - PSRLL $30, y; \ - POR XTMP6, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30); - MOVOU x, XTMP7; \ - PSHUFB r24_mask<>(SB), XTMP7; \ - PXOR y, x; \ //x = x xor y - PXOR XTMP7, x //x = x xor y xor _mm_shuffle_epi8(x, r24); +#include "aesni_amd64.h" #define SM4_TAO_L2(x, y) \ - SM4_SBOX(x, y); \ + SM4_SBOX(x, y, XTMP6); \ ; \ //#################### 4 parallel L2 linear transforms ##################// MOVOU x, y; \ MOVOU x, XTMP6; \ @@ -135,7 +37,7 @@ GLOBL fk_mask<>(SB), RODATA, $16 PXOR t1, x; \ PXOR t2, x; \ PXOR t3, x; \ - SM4_TAO_L1(x, y); \ + SM4_TAO_L1(x, y, XTMP6); \ PXOR x, t0 #define SM4_SINGLE_ROUND(index, x, y, t0, t1, t2, t3) \ @@ -143,7 +45,7 @@ GLOBL fk_mask<>(SB), RODATA, $16 PXOR t1, x; \ PXOR t2, x; \ PXOR t3, x; \ - SM4_TAO_L1(x, y); \ + SM4_TAO_L1(x, y, XTMP6); \ PXOR x, t0 #define SM4_EXPANDKEY_ROUND(index, x, y, t0, t1, t2, t3) \ @@ -187,110 +89,20 @@ GLOBL fk_mask<>(SB), RODATA, $16 #define XWORD X8 #define YWORD X9 -#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \ - VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2] - VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0] - VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16] tmp1 = [w13, w9, w12, w8] - VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w27, w30, w22, w27, w19, w26, w18] r2 = [w15, w11, w14, w10] - VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, 
w5, w25, w17, w9, w1] r1 = [w13, w9, w5, w1] - VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0] r0 = [w12, w8, w4, w0] - VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w27, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3] - VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2] - -// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html -#define AVX2_SM4_SBOX(x, y) \ - VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK; \ - VPAND NIBBLE_MASK, x, XDWTMP1; \ - VBROADCASTI128 m1_low<>(SB), y; \ - VPSHUFB XDWTMP1, y, y; \ - VPSRLQ $4, x, x; \ - VPAND NIBBLE_MASK, x, x; \ - VBROADCASTI128 m1_high<>(SB), XDWTMP1; \ - VPSHUFB x, XDWTMP1, x; \ - VPXOR y, x, x; \ - VBROADCASTI128 inverse_shift_rows<>(SB), XDWTMP1;\ - VPSHUFB XDWTMP1, x, x; \ - VEXTRACTI128 $1, x, YWORD \ - VAESENCLAST X_NIBBLE_MASK, XWORD, XWORD; \ - VAESENCLAST X_NIBBLE_MASK, YWORD, YWORD; \ - VINSERTI128 $1, YWORD, x, x; \ - VPANDN NIBBLE_MASK, x, XDWTMP1; \ - VBROADCASTI128 m2_low<>(SB), y; \ - VPSHUFB XDWTMP1, y, y; \ - VPSRLQ $4, x, x; \ - VPAND NIBBLE_MASK, x, x; \ - VBROADCASTI128 m2_high<>(SB), XDWTMP1; \ - VPSHUFB x, XDWTMP1, x; \ - VPXOR y, x, x - -#define AVX2_SM4_TAO_L1(x, y) \ - AVX2_SM4_SBOX(x, y); \ - VBROADCASTI128 r08_mask<>(SB), XDWTMP0; \ - VPSHUFB XDWTMP0, x, y; \ - VPXOR x, y, y; \ - VBROADCASTI128 r16_mask<>(SB), XDWTMP0; \ - VPSHUFB XDWTMP0, x, XDWTMP0; \ - VPXOR XDWTMP0, y, y; \ - VPSLLD $2, y, XDWTMP1; \ - VPSRLD $30, y, y; \ - VPXOR XDWTMP1, y, y; \ - VBROADCASTI128 r24_mask<>(SB), XDWTMP0; \ - VPSHUFB XDWTMP0, x, XDWTMP0; \ - VPXOR y, x, x; \ - VPXOR x, XDWTMP0, x - -#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \ - VPBROADCASTD (index * 4)(AX)(CX*1), x; \ - VPXOR t1, x, x; \ - VPXOR t2, x, x; \ - VPXOR t3, x, x; \ - AVX2_SM4_TAO_L1(x, y); \ +#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \ + VPBROADCASTD (index * 4)(AX)(CX*1), x; \ + VPXOR t1, x, x; \ + VPXOR t2, x, x; \ + VPXOR t3, x, x; \ + 
AVX2_SM4_TAO_L1(x, y, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK, XDWTMP0); \ VPXOR x, t0, t0 -#define AVX_SM4_SBOX(x, y) \ - VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK; \ - VPAND X_NIBBLE_MASK, x, XWTMP1; \ - VMOVDQU m1_low<>(SB), y; \ - VPSHUFB XWTMP1, y, y; \ - VPSRLQ $4, x, x; \ - VPAND X_NIBBLE_MASK, x, x; \ - VMOVDQU m1_high<>(SB), XWTMP1; \ - VPSHUFB x, XWTMP1, x; \ - VPXOR y, x, x; \ - VMOVDQU inverse_shift_rows<>(SB), XWTMP1; \ - VPSHUFB XWTMP1, x, x; \ - VAESENCLAST X_NIBBLE_MASK, x, x; \ - VPANDN X_NIBBLE_MASK, x, XWTMP1; \ - VMOVDQU m2_low<>(SB), y; \ - VPSHUFB XWTMP1, y, y; \ - VPSRLQ $4, x, x; \ - VPAND X_NIBBLE_MASK, x, x; \ - VMOVDQU m2_high<>(SB), XWTMP1; \ - VPSHUFB x, XWTMP1, x; \ - VPXOR y, x, x - -#define AVX_SM4_TAO_L1(x, y) \ - AVX_SM4_SBOX(x, y); \ - VMOVDQU r08_mask<>(SB), XWTMP0; \ - VPSHUFB XWTMP0, x, y; \ - VPXOR x, y, y; \ - VMOVDQU r16_mask<>(SB), XWTMP0; \ - VPSHUFB XWTMP0, x, XWTMP0; \ - VPXOR XWTMP0, y, y; \ - VPSLLD $2, y, XWTMP1; \ - VPSRLD $30, y, y; \ - VPXOR XWTMP1, y, y; \ - VMOVDQU r24_mask<>(SB), XWTMP0; \ - VPSHUFB XWTMP0, x, XWTMP0; \ - VPXOR y, x, x; \ - VPXOR x, XWTMP0, x - #define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3) \ VPBROADCASTD (index * 4)(AX)(CX*1), x; \ VPXOR t1, x, x; \ VPXOR t2, x, x; \ VPXOR t3, x, x; \ - AVX_SM4_TAO_L1(x, y); \ + AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, XWTMP0); \ VPXOR x, t0, t0 // func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int) @@ -408,6 +220,7 @@ done_sm4: RET avx2: + VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK CMPQ DI, $64 JBE avx2_4blocks diff --git a/sm4/asm_arm64.s b/sm4/asm_arm64.s index fd8c20b..3b62fdb 100644 --- a/sm4/asm_arm64.s +++ b/sm4/asm_arm64.s @@ -23,84 +23,10 @@ #define XTMP6 V6 #define XTMP7 V7 -//nibble mask -DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F -DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F -GLOBL nibble_mask<>(SB), (NOPTR+RODATA), $16 - -// inverse shift rows -DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00 -DATA 
inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508 -GLOBL inverse_shift_rows<>(SB), (NOPTR+RODATA), $16 - -// Affine transform 1 (low and high hibbles) -DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69 -DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653 -GLOBL m1_low<>(SB), (NOPTR+RODATA), $16 - -DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800 -DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB -GLOBL m1_high<>(SB), (NOPTR+RODATA), $16 - -// Affine transform 2 (low and high hibbles) -DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61 -DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5 -GLOBL m2_low<>(SB), (NOPTR+RODATA), $16 - -DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400 -DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5 -GLOBL m2_high<>(SB), (NOPTR+RODATA), $16 - -// left rotations of 32-bit words by 8-bit increments -DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003 -DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B -GLOBL r08_mask<>(SB), (NOPTR+RODATA), $16 - -DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302 -DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A -GLOBL r16_mask<>(SB), (NOPTR+RODATA), $16 - -DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201 -DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09 -GLOBL r24_mask<>(SB), (NOPTR+RODATA), $16 - -DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6 -DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197 -GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16 - -#define SM4_SBOX(x, y) \ - ; \ //############################# inner affine ############################// - VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \ - VTBL XTMP7.B16, [M1L.B16], y.B16; \ - VUSHR $4, x.D2, x.D2; \ - VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \ - VTBL XTMP7.B16, [M1H.B16], XTMP7.B16; \ - VEOR y.B16, XTMP7.B16, x.B16; \ - VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16; \ - AESE ZERO.B16, x.B16; \ - VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \ - VTBL XTMP7.B16, [M2L.B16], y.B16; \ - VUSHR $4, x.D2, x.D2; \ - VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \ - VTBL XTMP7.B16, [M2H.B16], XTMP7.B16; \ - VEOR y.B16, 
XTMP7.B16, x.B16 - -#define SM4_TAO_L1(x, y) \ - SM4_SBOX(x, y); \ - ; \ //#################### 4 parallel L1 linear transforms ##################// - VTBL R08_MASK.B16, [x.B16], y.B16; \ - VEOR y.B16, x.B16, y.B16; \ - VTBL R16_MASK.B16, [x.B16], XTMP7.B16; \ - VEOR XTMP7.B16, y.B16, y.B16; \ - VSHL $2, y.S4, XTMP7.S4; \ - VUSHR $30, y.S4, y.S4; \ - VORR y.B16, XTMP7.B16, y.B16; \ - VTBL R24_MASK.B16, [x.B16], XTMP7.B16; \ - VEOR XTMP7.B16, x.B16, x.B16; \ - VEOR y.B16, x.B16, x.B16 +#include "aesni_arm64.h" #define SM4_TAO_L2(x, y) \ - SM4_SBOX(x, y); \ + SM4_SBOX(x, y, XTMP6); \ ; \ //#################### 4 parallel L2 linear transforms ##################// VSHL $13, x.S4, XTMP6.S4; \ VUSHR $19, x.S4, y.S4; \ @@ -117,7 +43,7 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16 VEOR t1.B16, x.B16, x.B16; \ VEOR t2.B16, x.B16, x.B16; \ VEOR t3.B16, x.B16, x.B16; \ - SM4_TAO_L1(x, y); \ + SM4_TAO_L1(x, y, XTMP6); \ VEOR x.B16, t0.B16, t0.B16 #define SM4_EXPANDKEY_ROUND(x, y, t0, t1, t2, t3) \ diff --git a/sm4/gcm_amd64.s b/sm4/gcm_amd64.s index 10ccd3f..7fd847d 100644 --- a/sm4/gcm_amd64.s +++ b/sm4/gcm_amd64.s @@ -41,44 +41,6 @@ #define NIBBLE_MASK Y11 #define X_NIBBLE_MASK X11 -// shuffle byte order from LE to BE -DATA flipMask<>+0x00(SB)/8, $0x0405060700010203 -DATA flipMask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b - -//nibble mask -DATA nibbleMask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F -DATA nibbleMask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F - -// inverse shift rows -DATA inverseShiftRows<>+0x00(SB)/8, $0x0B0E0104070A0D00 -DATA inverseShiftRows<>+0x08(SB)/8, $0x0306090C0F020508 - -// Affine transform 1 (low and high hibbles) -DATA m1Low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69 -DATA m1Low<>+0x08(SB)/8, $0x3045F98CEF9A2653 - -DATA m1High<>+0x00(SB)/8, $0xC35BF46CAF379800 -DATA m1High<>+0x08(SB)/8, $0x68F05FC7049C33AB - -// Affine transform 2 (low and high hibbles) -DATA m2Low<>+0x00(SB)/8, $0x9A950A05FEF16E61 -DATA m2Low<>+0x08(SB)/8, $0x0E019E916A65FAF5 - -DATA m2High<>+0x00(SB)/8, 
$0x892D69CD44E0A400 -DATA m2High<>+0x08(SB)/8, $0x2C88CC68E14501A5 - -// left rotations of 32-bit words by 8-bit increments -DATA r08Mask<>+0x00(SB)/8, $0x0605040702010003 -DATA r08Mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B - -DATA r16Mask<>+0x00(SB)/8, $0x0504070601000302 -DATA r16Mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A - -DATA r24Mask<>+0x00(SB)/8, $0x0407060500030201 -DATA r24Mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09 - -DATA fkMask<>+0x00(SB)/8, $0x56aa3350a3b1bac6 -DATA fkMask<>+0x08(SB)/8, $0xb27022dc677d9197 DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607 @@ -117,21 +79,12 @@ DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff -GLOBL flipMask<>(SB), (NOPTR+RODATA), $16 -GLOBL nibbleMask<>(SB), (NOPTR+RODATA), $16 -GLOBL inverseShiftRows<>(SB), (NOPTR+RODATA), $16 -GLOBL m1Low<>(SB), (NOPTR+RODATA), $16 -GLOBL m1High<>(SB), (NOPTR+RODATA), $16 -GLOBL m2Low<>(SB), (NOPTR+RODATA), $16 -GLOBL m2High<>(SB), (NOPTR+RODATA), $16 -GLOBL r08Mask<>(SB), (NOPTR+RODATA), $16 -GLOBL r16Mask<>(SB), (NOPTR+RODATA), $16 -GLOBL r24Mask<>(SB), (NOPTR+RODATA), $16 -GLOBL fkMask<>(SB), (NOPTR+RODATA), $16 GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16 GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16 GLOBL andMask<>(SB), (NOPTR+RODATA), $240 +#include "aesni_amd64.h" + // func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64) TEXT ·gcmSm4Finish(SB),NOSPLIT,$0 #define pTbl DI @@ -202,51 +155,6 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0 #undef plen #undef dlen -#define SM4_SBOX(x, y, z) \ - ; \ //############################# inner affine ############################// - MOVOU x, z; \ - PAND nibbleMask<>(SB), z; \ //y = _mm_and_si128(x, c0f); - MOVOU m1Low<>(SB), y; \ - PSHUFB z, y; \ //y = _mm_shuffle_epi8(m1l, y); - PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4); - PAND nibbleMask<>(SB), x; \ //x = _mm_and_si128(x, c0f); - MOVOU m1High<>(SB), z; 
\ - PSHUFB x, z; \ //x = _mm_shuffle_epi8(m1h, x); - MOVOU z, x; \ //x = _mm_shuffle_epi8(m1h, x); - PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y; - ; \ // inverse ShiftRows - PSHUFB inverseShiftRows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr); - AESENCLAST nibbleMask<>(SB), x; \ // AESNI instruction - ; \ //############################# outer affine ############################// - MOVOU x, z; \ - PANDN nibbleMask<>(SB), z; \ //z = _mm_andnot_si128(x, c0f); - MOVOU m2Low<>(SB), y; \ - PSHUFB z, y; \ //y = _mm_shuffle_epi8(m2l, z) - PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4); - PAND nibbleMask<>(SB), x; \ //x = _mm_and_si128(x, c0f); - MOVOU m2High<>(SB), z; \ - PSHUFB x, z; \ - MOVOU z, x; \ //x = _mm_shuffle_epi8(m2h, x) - PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y; - -#define SM4_TAO_L1(x, y, z) \ - SM4_SBOX(x, y, z); \ - ; \ //#################### 4 parallel L1 linear transforms ##################// - MOVOU x, y; \ - PSHUFB r08Mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08) - PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08) - MOVOU x, z; \ - PSHUFB r16Mask<>(SB), z; \ - PXOR z, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16) - MOVOU y, z; \ - PSLLL $2, z; \ - PSRLL $30, y; \ - POR z, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30); - MOVOU x, z; \ - PSHUFB r24Mask<>(SB), z; \ - PXOR y, x; \ //x = x xor y - PXOR z, x //x = x xor y xor _mm_shuffle_epi8(x, r24); - #define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \ PINSRD $0, (index * 4)(RK)(IND*1), x; \ PXOR t1, x; \ @@ -264,80 +172,11 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0 SM4_TAO_L1(x, y, z); \ PXOR x, t0 -// MOVOU r0, tmp2; -// PUNPCKHDQ r1, tmp2; -// PUNPCKLDQ r1, r0; -// MOVOU r2, tmp1; -// PUNPCKLDQ r3, tmp1; -// PUNPCKHDQ r3, r2; -// MOVOU r0, r1; -// PUNPCKHQDQ tmp1, r1; -// PUNPCKLQDQ tmp1, r0; -// MOVOU tmp2, r3; -// PUNPCKHQDQ r2, r3; -// PUNPCKLQDQ r2, tmp2; -// MOVOU tmp2, r2 -#define SSE_TRANSPOSE_MATRIX(r, r0, r1, r2, r3, tmp1, tmp2) \ - PEXTRD $2, 
r0, r; \ - PINSRD $0, r, tmp2; \ - PEXTRD $2, r1, r; \ - PINSRD $1, r, tmp2; \ - ; \ - PEXTRD $3, r0, r; \ - PINSRD $2, r, tmp2; \ - PEXTRD $3, r1, r; \ - PINSRD $3, r, tmp2; \ // tmp2 = [w7, w3, w6, w2] - ; \ - PEXTRD $1, r0, r; \ - PINSRD $2, r, r0; \ - PEXTRD $0, r1, r; \ - PINSRD $1, r, r0; \ - PEXTRD $1, r1, r; \ - PINSRD $3, r, r0; \ // r0 = [w5, w1, w4, w0] - ; \ - PEXTRD $0, r2, r; \ - PINSRD $0, r, tmp1; \ - PEXTRD $0, r3, r; \ - PINSRD $1, r, tmp1; \ - PEXTRD $1, r2, r; \ - PINSRD $2, r, tmp1; \ - PEXTRD $1, r3, r; \ - PINSRD $3, r, tmp1; \ // tmp1 = [w13, w9, w12, w8] - ; \ - PEXTRD $2, r2, r; \ - PINSRD $0, r, r2; \ - PEXTRD $2, r3, r; \ - PINSRD $1, r, r2; \ - PEXTRD $3, r2, r; \ - PINSRD $2, r, r2; \ - PEXTRD $3, r3, r; \ - PINSRD $3, r, r2; \ // r2 = [w15, w11, w14, w10] - ; \ - MOVOU r0, r1; \ - PEXTRQ $1, r1, r; \ - PINSRQ $0, r, r1; \ - PEXTRQ $1, tmp1, r; \ - PINSRQ $1, r, r1; \ // r1 = [w13, w9, w5, w1] - ; \ - PEXTRQ $0, tmp1, r; \ - PINSRQ $1, r, r0; \ // r0 = [w12, w8, w4, w0] - ; \ - MOVOU tmp2, r3; \ - PEXTRQ $1, r3, r; \ - PINSRQ $0, r, r3; \ - PEXTRQ $1, r2, r; \ - PINSRQ $1, r, r3; \ // r3 = [w15, w11, w7, w3] - ; \ - PEXTRQ $0, r2, r; \ - PINSRQ $1, r, r2; \ - PEXTRQ $0, tmp2, r; \ - PINSRQ $0, r, r2 - #define SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3) \ - PSHUFB flipMask<>(SB), t0; \ - PSHUFB flipMask<>(SB), t1; \ - PSHUFB flipMask<>(SB), t2; \ - PSHUFB flipMask<>(SB), t3; \ + PSHUFB flip_mask<>(SB), t0; \ + PSHUFB flip_mask<>(SB), t1; \ + PSHUFB flip_mask<>(SB), t2; \ + PSHUFB flip_mask<>(SB), t3; \ SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y); \ XORL IND, IND; \ SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ @@ -385,107 +224,20 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0 PSHUFB BSWAP, t1; \ PSHUFB BSWAP, t0 -#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \ - VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2] - VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 
= [w5, w1, w4, w0] - VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16] tmp1 = [w13, w9, w12, w8] - VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w27, w30, w22, w27, w19, w26, w18] r2 = [w15, w11, w14, w10] - VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, w5, w25, w17, w9, w1] r1 = [w13, w9, w5, w1] - VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0] r0 = [w12, w8, w4, w0] - VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w27, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3] - VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2] - -#define AVX2_SM4_SBOX(x, y, xw, yw, tmp) \ - VPAND NIBBLE_MASK, x, tmp; \ - VBROADCASTI128 m1Low<>(SB), y; \ - VPSHUFB tmp, y, y; \ - VPSRLQ $4, x, x; \ - VPAND NIBBLE_MASK, x, x; \ - VBROADCASTI128 m1High<>(SB), tmp; \ - VPSHUFB x, tmp, x; \ - VPXOR y, x, x; \ - VBROADCASTI128 inverseShiftRows<>(SB), tmp; \ - VPSHUFB tmp, x, x; \ - VEXTRACTI128 $1, x, yw \ - VAESENCLAST X_NIBBLE_MASK, xw, xw; \ - VAESENCLAST X_NIBBLE_MASK, yw, yw; \ - VINSERTI128 $1, yw, x, x; \ - VPANDN NIBBLE_MASK, x, tmp; \ - VBROADCASTI128 m2Low<>(SB), y; \ - VPSHUFB tmp, y, y; \ - VPSRLQ $4, x, x; \ - VPAND NIBBLE_MASK, x, x; \ - VBROADCASTI128 m2High<>(SB), tmp; \ - VPSHUFB x, tmp, x; \ - VPXOR y, x, x - -#define AVX2_SM4_TAO_L1(x, y, xw, yw, tmp) \ - AVX2_SM4_SBOX(x, y, xw, yw, tmp); \ - VBROADCASTI128 r08Mask<>(SB), tmp; \ - VPSHUFB tmp, x, y; \ - VPXOR x, y, y; \ - VBROADCASTI128 r16Mask<>(SB), tmp; \ - VPSHUFB tmp, x, tmp; \ - VPXOR tmp, y, y; \ - VPSLLD $2, y, tmp; \ - VPSRLD $30, y, y; \ - VPXOR tmp, y, y; \ - VBROADCASTI128 r24Mask<>(SB), tmp; \ - VPSHUFB tmp, x, tmp; \ - VPXOR y, x, x; \ - VPXOR x, tmp, x - #define AVX2_SM4_ROUND(index, RK, IND, x, y, xw, yw, tmp, t0, t1, t2, t3) \ - VPBROADCASTD (index * 4)(RK)(IND*1), x; \ - VPXOR t1, x, x; \ - VPXOR t2, x, x; \ - VPXOR t3, x, x; \ - AVX2_SM4_TAO_L1(x, y, xw, yw, tmp); \ + VPBROADCASTD (index * 4)(RK)(IND*1), x; \ + 
VPXOR t1, x, x; \ + VPXOR t2, x, x; \ + VPXOR t3, x, x; \ + AVX2_SM4_TAO_L1(x, y, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK, tmp); \ VPXOR x, t0, t0 -#define AVX_SM4_SBOX(x, y, tmp) \ - VPAND X_NIBBLE_MASK, x, tmp; \ - VMOVDQU m1Low<>(SB), y; \ - VPSHUFB tmp, y, y; \ - VPSRLQ $4, x, x; \ - VPAND X_NIBBLE_MASK, x, x; \ - VMOVDQU m1High<>(SB), tmp; \ - VPSHUFB x, tmp, x; \ - VPXOR y, x, x; \ - VMOVDQU inverseShiftRows<>(SB), tmp; \ - VPSHUFB tmp, x, x; \ - VAESENCLAST X_NIBBLE_MASK, x, x; \ - VPANDN X_NIBBLE_MASK, x, tmp; \ - VMOVDQU m2Low<>(SB), y; \ - VPSHUFB tmp, y, y; \ - VPSRLQ $4, x, x; \ - VPAND X_NIBBLE_MASK, x, x; \ - VMOVDQU m2High<>(SB), tmp; \ - VPSHUFB x, tmp, x; \ - VPXOR y, x, x - -#define AVX_SM4_TAO_L1(x, y, tmp) \ - AVX_SM4_SBOX(x, y, tmp); \ - VMOVDQU r08Mask<>(SB), tmp; \ - VPSHUFB tmp, x, y; \ - VPXOR x, y, y; \ - VMOVDQU r16Mask<>(SB), tmp; \ - VPSHUFB tmp, x, tmp; \ - VPXOR tmp, y, y; \ - VPSLLD $2, y, tmp; \ - VPSRLD $30, y, y; \ - VPXOR tmp, y, y; \ - VMOVDQU r24Mask<>(SB), tmp; \ - VPSHUFB tmp, x, tmp; \ - VPXOR y, x, x; \ - VPXOR x, tmp, x - #define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \ VPBROADCASTD (index * 4)(RK)(IND*1), x; \ VPXOR t1, x, x; \ VPXOR t2, x, x; \ VPXOR t3, x, x; \ - AVX_SM4_TAO_L1(x, y, tmp); \ + AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp); \ VPXOR x, t0, t0 // func gcmSm4Init(productTable *[256]byte, rk []uint32) @@ -1206,7 +958,7 @@ avx2GcmSm4Enc: VMOVDQU (4*32 + 2*32)(SP), DWB2 VMOVDQU (4*32 + 3*32)(SP), DWB3 - VBROADCASTI128 flipMask<>(SB), XDWTMP0 + VBROADCASTI128 flip_mask<>(SB), XDWTMP0 // Apply Byte Flip Mask: LE -> BE VPSHUFB XDWTMP0, DWB0, DWB0 VPSHUFB XDWTMP0, DWB1, DWB1 @@ -1216,7 +968,7 @@ avx2GcmSm4Enc: // Transpose matrix 4 x 4 32bits word TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1) XORL BX, BX - VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK + VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK avx2GcmSm4Enc8Loop1: AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, 
DWB3) @@ -1289,7 +1041,7 @@ avx2GcmSm4EncOctetsLoop: VMOVDQU (4*32 + 2*32)(SP), DWB2 VMOVDQU (4*32 + 3*32)(SP), DWB3 - VBROADCASTI128 flipMask<>(SB), XDWTMP0 + VBROADCASTI128 flip_mask<>(SB), XDWTMP0 // Apply Byte Flip Mask: LE -> BE VPSHUFB XDWTMP0, DWB0, DWB0 VPSHUFB XDWTMP0, DWB1, DWB1 @@ -1311,7 +1063,7 @@ avx2GcmSm4EncOctetsLoop: // Transpose matrix 4 x 4 32bits word TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1) XORL BX, BX - VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK + VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK avx2GcmSm4Enc8Loop2: AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3) @@ -1430,7 +1182,7 @@ avx2GcmSm4EncOctetsEnd: SUBQ $4, aluCTR avx2GcmSm4EncNibbles: - VMOVDQU flipMask<>(SB), B7 + VMOVDQU flip_mask<>(SB), B7 CMPQ ptxLen, $64 JBE avx2GcmSm4EncSingles SUBQ $64, ptxLen @@ -1447,7 +1199,7 @@ avx2GcmSm4EncNibbles: TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1) XORL BX, BX - VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK + VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK avx2GcmSm4Enc4Loop2: AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3) @@ -1509,7 +1261,7 @@ avx2GcmSm4EncSingles: TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1) XORL BX, BX - VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK + VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK avx2GcmSm4Enc4Loop1: AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3) @@ -1937,7 +1689,7 @@ avx2GcmSm4DecOctetsLoop: VMOVDQU (2*32)(SP), DWB2 VMOVDQU (3*32)(SP), DWB3 - VBROADCASTI128 flipMask<>(SB), XDWTMP0 + VBROADCASTI128 flip_mask<>(SB), XDWTMP0 // Apply Byte Flip Mask: LE -> BE VPSHUFB XDWTMP0, DWB0, DWB0 VPSHUFB XDWTMP0, DWB1, DWB1 @@ -1962,7 +1714,7 @@ avx2GcmSm4DecOctetsLoop: // Transpose matrix 4 x 4 32bits word TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1) XORL BX, BX - VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK + VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK avx2GcmSm4Dec8Loop2: AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3) @@ -2047,7 
+1799,7 @@ avx2GcmSm4DecEndOctets: SUBQ $4, aluCTR avx2GcmSm4DecNibbles: - VMOVDQU flipMask<>(SB), B7 // DO NOT CHANGE B7 + VMOVDQU flip_mask<>(SB), B7 // DO NOT CHANGE B7 CMPQ ptxLen, $64 JBE avx2GcmSm4DecSingles SUBQ $64, ptxLen @@ -2064,7 +1816,7 @@ avx2GcmSm4DecNibbles: TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1) XORL BX, BX - VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK + VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK avx2GcmSm4Dec4Loop2: AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3) @@ -2130,7 +1882,7 @@ avx2GcmSm4DecSingles: TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1) XORL BX, BX - VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK + VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK avx2GcmSm4Dec4Loop1: AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3) diff --git a/sm4/gcm_arm64.s b/sm4/gcm_arm64.s index d27aba4..7d6f384 100644 --- a/sm4/gcm_arm64.s +++ b/sm4/gcm_arm64.s @@ -3,47 +3,6 @@ #include "textflag.h" -//nibble mask -DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F -DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F -GLOBL nibble_mask<>(SB), (NOPTR+RODATA), $16 - -// inverse shift rows -DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00 -DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508 -GLOBL inverse_shift_rows<>(SB), (NOPTR+RODATA), $16 - -// Affine transform 1 (low and high hibbles) -DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69 -DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653 -GLOBL m1_low<>(SB), (NOPTR+RODATA), $16 - -DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800 -DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB -GLOBL m1_high<>(SB), (NOPTR+RODATA), $16 - -// Affine transform 2 (low and high hibbles) -DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61 -DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5 -GLOBL m2_low<>(SB), (NOPTR+RODATA), $16 - -DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400 -DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5 -GLOBL m2_high<>(SB), (NOPTR+RODATA), $16 - -// left rotations of 32-bit words by 8-bit increments -DATA r08_mask<>+0x00(SB)/8, 
$0x0605040702010003 -DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B -GLOBL r08_mask<>(SB), (NOPTR+RODATA), $16 - -DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302 -DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A -GLOBL r16_mask<>(SB), (NOPTR+RODATA), $16 - -DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201 -DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09 -GLOBL r24_mask<>(SB), (NOPTR+RODATA), $16 - #define B0 V0 #define B1 V1 #define B2 V2 @@ -150,42 +109,7 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0 #undef plen #undef dlen -#define PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \ - VMOV t0.B16, K.B16 \ - VMOV t1.S[0], t0.S[1] \ - VMOV t2.S[0], t0.S[2] \ - VMOV t3.S[0], t0.S[3] \ - VMOV K.S[1], t1.S[0] \ - VMOV K.S[2], t2.S[0] \ - VMOV K.S[3], t3.S[0] \ - VMOV t1.D[1], K.D[1] \ - VMOV t2.S[1], t1.S[2] \ - VMOV t3.S[1], t1.S[3] \ - VMOV K.S[2], t2.S[1] \ - VMOV K.S[3], t3.S[1] \ - VMOV t2.S[3], K.S[3] \ - VMOV t3.S[2], t2.S[3] \ - VMOV K.S[3], t3.S[2] - -#define TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \ - VMOV t0.B16, K.B16 \ - VMOV t3.S[0], t0.S[0] \ - VMOV t2.S[0], t0.S[1] \ - VMOV t1.S[0], t0.S[2] \ - VMOV K0.S[0], t0.S[3] \ - VMOV t3.S[1], t1.S[0] \ - VMOV t3.S[2], t2.S[0] \ - VMOV t3.S[3], t3.S[0] \ - VMOV t2.S[3], t3.S[1] \ - VMOV t1.S[3], t3.S[2] \ - VMOV K.S[3], t3.S[3] \ - VMOV K.S[2], t2.S[3] \ - VMOV K.S[1], t1.S[3] \ - VMOV t1.B16, K.B16 \ - VMOV t2.S[1], t1.S[1] \ - VMOV K.S[1], t1.S[2] \ - VMOV t2.S[2], t2.S[1] \ - VMOV K.S[2], t2.S[2] +#include "aesni_arm64.h" #define LOAD_SM4_AESNI_CONSTS() \ LDP nibble_mask<>(SB), (R20, R21) \ @@ -216,36 +140,6 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0 VMOV R20, R24_MASK.D[0] \ VMOV R21, R24_MASK.D[1] -#define SM4_SBOX(x, y, z) \ - ; \ - VAND x.B16, NIBBLE_MASK.B16, z.B16; \ - VTBL z.B16, [M1L.B16], y.B16; \ - VUSHR $4, x.D2, x.D2; \ - VAND x.B16, NIBBLE_MASK.B16, z.B16; \ - VTBL z.B16, [M1H.B16], z.B16; \ - VEOR y.B16, z.B16, x.B16; \ - VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16; \ - AESE ZERO.B16, x.B16; \ - VAND x.B16, NIBBLE_MASK.B16, 
z.B16; \ - VTBL z.B16, [M2L.B16], y.B16; \ - VUSHR $4, x.D2, x.D2; \ - VAND x.B16, NIBBLE_MASK.B16, z.B16; \ - VTBL z.B16, [M2H.B16], z.B16; \ - VEOR y.B16, z.B16, x.B16 - -#define SM4_TAO_L1(x, y, z) \ - SM4_SBOX(x, y, z); \ - VTBL R08_MASK.B16, [x.B16], y.B16; \ - VEOR y.B16, x.B16, y.B16; \ - VTBL R16_MASK.B16, [x.B16], z.B16; \ - VEOR z.B16, y.B16, y.B16; \ - VSHL $2, y.S4, z.S4; \ - VUSHR $30, y.S4, y.S4; \ - VORR y.B16, z.B16, y.B16; \ - VTBL R24_MASK.B16, [x.B16], z.B16; \ - VEOR z.B16, x.B16, x.B16; \ - VEOR y.B16, x.B16, x.B16 - #define SM4_ROUND(RK, x, y, z, t0, t1, t2, t3) \ MOVW.P 4(RK), R19; \ VMOV R19, x.S4; \