From 53e121c2b52e4a8528f8204776bf5235d35e9402 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Thu, 28 Sep 2023 10:11:31 +0800 Subject: [PATCH] sm4: optimize TAO L1 #168 --- sm4/aesni_macros_amd64.s | 74 +++++++++++++++------------------------- sm4/aesni_macros_arm64.s | 34 ++++++------------ sm4/asm_amd64.s | 1 + sm4/asm_arm64.s | 12 ++----- sm4/cbc_arm64.s | 4 +-- sm4/ecb_arm64.s | 4 +-- sm4/gcm_arm64.s | 2 -- sm4/xts_arm64.s | 2 -- 8 files changed, 42 insertions(+), 91 deletions(-) diff --git a/sm4/aesni_macros_amd64.s b/sm4/aesni_macros_amd64.s index b4da364..91c4c6e 100644 --- a/sm4/aesni_macros_amd64.s +++ b/sm4/aesni_macros_amd64.s @@ -41,14 +41,6 @@ DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003 DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B GLOBL r08_mask<>(SB), 8, $16 -DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302 -DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A -GLOBL r16_mask<>(SB), 8, $16 - -DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201 -DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09 -GLOBL r24_mask<>(SB), 8, $16 - DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6 DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197 GLOBL fk_mask<>(SB), 8, $16 @@ -66,18 +58,6 @@ DATA r08_mask256<>+0x10(SB)/8, $0x0605040702010003 DATA r08_mask256<>+0x18(SB)/8, $0x0E0D0C0F0A09080B GLOBL r08_mask256<>(SB), 8, $32 -DATA r16_mask256<>+0x00(SB)/8, $0x0504070601000302 -DATA r16_mask256<>+0x08(SB)/8, $0x0D0C0F0E09080B0A -DATA r16_mask256<>+0x10(SB)/8, $0x0504070601000302 -DATA r16_mask256<>+0x18(SB)/8, $0x0D0C0F0E09080B0A -GLOBL r16_mask256<>(SB), 8, $32 - -DATA r24_mask256<>+0x00(SB)/8, $0x0407060500030201 -DATA r24_mask256<>+0x08(SB)/8, $0x0C0F0E0D080B0A09 -DATA r24_mask256<>+0x10(SB)/8, $0x0407060500030201 -DATA r24_mask256<>+0x18(SB)/8, $0x0C0F0E0D080B0A09 -GLOBL r24_mask256<>(SB), 8, $32 - // Transpose matrix without PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions, bad performance! 
// input: from high to low // r0 = [w3, w2, w1, w0] @@ -164,19 +144,18 @@ GLOBL r24_mask256<>(SB), 8, $32 SM4_SBOX(x, y, z); \ ; \ //#################### 4 parallel L1 linear transforms ##################// MOVOU x, y; \ - PSHUFB r08_mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08) - PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08) - MOVOU x, z; \ - PSHUFB r16_mask<>(SB), z; \ - PXOR z, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16) + PSHUFB r08_mask<>(SB), y; \ //y = x <<< 8 + MOVOU y, z; \ + PSHUFB r08_mask<>(SB), z; \ //z = x <<< 16 + PXOR x, y; \ //y = x ^ (x <<< 8) + PXOR z, y; \ //y = x ^ (x <<< 8) ^ (x <<< 16) + PSHUFB r08_mask<>(SB), z; \ //z = x <<< 24 + PXOR z, x; \ //x = x ^ (x <<< 24) MOVOU y, z; \ PSLLL $2, z; \ PSRLL $30, y; \ - POR z, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30); - MOVOU x, z; \ - PSHUFB r24_mask<>(SB), z; \ - PXOR y, x; \ //x = x xor y - PXOR z, x //x = x xor y xor _mm_shuffle_epi8(x, r24); + POR z, y; \ // y = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18) + PXOR y, x // SM4 single round function, handle 16 bytes data // t0 ^= tao_l1(t1^t2^t3^xk) @@ -239,6 +218,7 @@ GLOBL r24_mask256<>(SB), 8, $32 PSHUFD $0xFF, rk128, x; \ SM4_ONE_ROUND_SSE(x, y, z, t3, t0, t1, t2); \ +// Requires: SSSE3 #define SM4_SINGLE_BLOCK(RK, rk128, x, y, z, t0, t1, t2, t3) \ PSHUFB flip_mask<>(SB), t0; \ PSHUFD $1, t0, t1; \ @@ -388,16 +368,16 @@ GLOBL r24_mask256<>(SB), 8, $32 // - tmp: 128 bits temp register #define AVX_SM4_TAO_L1(x, y, tmp) \ AVX_SM4_SBOX(x, y, tmp); \ - VPSHUFB r08_mask<>(SB), x, y; \ - VPXOR x, y, y; \ - VPSHUFB r16_mask<>(SB), x, tmp; \ - VPXOR tmp, y, y; \ + VPSHUFB r08_mask<>(SB), x, y; \ // y = x <<< 8 + VPSHUFB r08_mask<>(SB), y, tmp; \ // tmp = x <<< 16 + VPXOR x, y, y; \ // y = x ^ (x <<< 8) + VPXOR tmp, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16) + VPSHUFB r08_mask<>(SB), tmp, tmp; \ // tmp = x <<< 24 + VPXOR x, tmp, x; \ // x = x ^ (x <<< 24) VPSLLD $2, y, tmp; \ VPSRLD $30, y, y; \ - VPOR tmp, y, y; \ - VPSHUFB r24_mask<>(SB), x, tmp; \ - VPXOR y, x, x; \ - VPXOR x, tmp, x + VPOR tmp, y, y; \ // y = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18) + VPXOR y, x, x // transpose matrix function, AVX/AVX2 version // parameters: @@ -433,7 +413,7 @@ GLOBL r24_mask256<>(SB), 8, $32 VPXOR t1, x, x; \ VPXOR t2, x, x; \ VPXOR t3, x, x; \ - AVX_SM4_TAO_L1(x, y, tmp); \ + AVX_SM4_TAO_L1(x, y, tmp); \ VPXOR x, t0, t0 @@ -591,16 +571,16 @@ GLOBL r24_mask256<>(SB), 8, $32 // - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier. 
#define AVX2_SM4_TAO_L1(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \ AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask); \ - VPSHUFB r08_mask256<>(SB), x, y; \ - VPXOR x, y, y; \ - VPSHUFB r16_mask256<>(SB), x, z; \ - VPXOR z, y, y; \ + VPSHUFB r08_mask256<>(SB), x, y; \ // y = x <<< 8 + VPSHUFB r08_mask256<>(SB), y, z; \ // z = x <<< 16 + VPXOR x, y, y; \ // y = x ^ (x <<< 8) + VPXOR z, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16) + VPSHUFB r08_mask256<>(SB), z, z; \ // z = x <<< 24 + VPXOR x, z, x; \ // x = x ^ (x <<< 24) VPSLLD $2, y, z; \ VPSRLD $30, y, y; \ - VPOR z, y, y; \ - VPSHUFB r24_mask256<>(SB), x, z; \ - VPXOR y, x, x; \ - VPXOR x, z, x + VPOR z, y, y; \ // y = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18) + VPXOR y, x, x // SM4 round function, AVX2 version, handle 256 bits // t0 ^= tao_l1(t1^t2^t3^xk) diff --git a/sm4/aesni_macros_arm64.s b/sm4/aesni_macros_arm64.s index e1e874b..b3035b4 100644 --- a/sm4/aesni_macros_arm64.s +++ b/sm4/aesni_macros_arm64.s @@ -31,14 +31,6 @@ DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003 DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B GLOBL r08_mask<>(SB), (16+8), $16 -DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302 -DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A -GLOBL r16_mask<>(SB), (16+8), $16 - -DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201 -DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09 -GLOBL r24_mask<>(SB), (16+8), $16 - DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6 DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197 GLOBL fk_mask<>(SB), (16+8), $16 @@ -64,13 +56,7 @@ GLOBL fk_mask<>(SB), (16+8), $16 VMOV R21, INVERSE_SHIFT_ROWS.D[1] \ LDP r08_mask<>(SB), (R20, R21) \ VMOV R20, R08_MASK.D[0] \ - VMOV R21, R08_MASK.D[1] \ - LDP r16_mask<>(SB), (R20, R21) \ - VMOV R20, R16_MASK.D[0] \ - VMOV R21, R16_MASK.D[1] \ - LDP r24_mask<>(SB), (R20, R21) \ - VMOV R20, R24_MASK.D[0] \ - VMOV R21, R24_MASK.D[1] + VMOV R21, R08_MASK.D[1] // input: from high to low // t0 = t0.S3, t0.S2, t0.S1, t0.S0 @@ -141,15 +127,15 @@ GLOBL fk_mask<>(SB), (16+8), $16 // - z: 128 bits temp register #define SM4_TAO_L1(x, y, z) \ SM4_SBOX(x, y, z); \ - VTBL R08_MASK.B16, [x.B16], y.B16; \ - VEOR y.B16, x.B16, y.B16; \ - VTBL R16_MASK.B16, [x.B16], z.B16; \ - VEOR z.B16, y.B16, z.B16; \ - VSHL $2, z.S4, y.S4; \ - VSRI $30, z.S4, y.S4; \ - VTBL R24_MASK.B16, [x.B16], z.B16; \ - VEOR z.B16, x.B16, x.B16; \ - VEOR y.B16, x.B16, x.B16 + VTBL R08_MASK.B16, [x.B16], y.B16; \ // y = x <<< 8 + VTBL R08_MASK.B16, [y.B16], z.B16; \ // z = x <<< 16 + VEOR x.B16, y.B16, y.B16; \ // y = x ^ (x <<< 8) + VEOR z.B16, y.B16, y.B16; \ // y = x ^ (x <<< 8) ^ (x <<< 16) + VTBL R08_MASK.B16, [z.B16], z.B16; \ // z = x <<< 24 + VEOR z.B16, x.B16, x.B16; \ // x = x ^ (x <<< 24) + VSHL $2, y.S4, z.S4; \ + VSRI $30, y.S4, z.S4; \ // z = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18) + VEOR z.B16, x.B16, x.B16 // SM4 round function // t0 ^= tao_l1(t1^t2^t3^xk) diff --git a/sm4/asm_amd64.s b/sm4/asm_amd64.s index f0d0909..90265c7 100644 --- a/sm4/asm_amd64.s +++ b/sm4/asm_amd64.s @@ -322,6 +322,7 @@ avx2_sm4_done: RET // func encryptBlockAsm(xk *uint32, dst, src *byte, inst int) +// Requires: SSSE3 TEXT ·encryptBlockAsm(SB),NOSPLIT,$0 MOVQ xk+0(FP), AX MOVQ dst+8(FP), BX diff --git a/sm4/asm_arm64.s b/sm4/asm_arm64.s index 03bdac0..f7c46af 100644 --- a/sm4/asm_arm64.s +++ b/sm4/asm_arm64.s @@ -21,9 +21,7 @@ #define M2L V24 #define M2H V25 #define R08_MASK V26 -#define R16_MASK V27 -#define R24_MASK V28 -#define FK_MASK V29 +#define FK_MASK V27 #define XTMP6 V6 #define XTMP7 V7 @@ -78,13 +76,7 @@ 
load_global_data_1() \ LDP r08_mask<>(SB), (R0, R1) \ VMOV R0, R08_MASK.D[0] \ - VMOV R1, R08_MASK.D[1] \ - LDP r16_mask<>(SB), (R0, R1) \ - VMOV R0, R16_MASK.D[0] \ - VMOV R1, R16_MASK.D[1] \ - LDP r24_mask<>(SB), (R0, R1) \ - VMOV R0, R24_MASK.D[0] \ - VMOV R1, R24_MASK.D[1] + VMOV R1, R08_MASK.D[1] #define SM4EKEY_EXPORT_KEYS() \ VMOV V9.S[3], V10.S[0] \ diff --git a/sm4/cbc_arm64.s b/sm4/cbc_arm64.s index 7b42c6d..b4e6c7a 100644 --- a/sm4/cbc_arm64.s +++ b/sm4/cbc_arm64.s @@ -17,9 +17,7 @@ #define M2L V24 #define M2H V25 #define R08_MASK V26 -#define R16_MASK V27 -#define R24_MASK V28 -#define FK_MASK V29 +#define FK_MASK V27 #define XTMP6 V6 #define IV V7 diff --git a/sm4/ecb_arm64.s b/sm4/ecb_arm64.s index 96b6f8c..5abd26e 100644 --- a/sm4/ecb_arm64.s +++ b/sm4/ecb_arm64.s @@ -17,9 +17,7 @@ #define M2L V24 #define M2H V25 #define R08_MASK V26 -#define R16_MASK V27 -#define R24_MASK V28 -#define FK_MASK V29 +#define FK_MASK V27 #define XTMP6 V6 #define XTMP7 V7 #define t4 V10 diff --git a/sm4/gcm_arm64.s b/sm4/gcm_arm64.s index 81431e1..4c2edb7 100644 --- a/sm4/gcm_arm64.s +++ b/sm4/gcm_arm64.s @@ -37,8 +37,6 @@ #define M2L V27 #define M2H V28 #define R08_MASK V29 -#define R16_MASK V30 -#define R24_MASK V31 #define reduce() \ VEOR ACC0.B16, ACCM.B16, ACCM.B16 \ diff --git a/sm4/xts_arm64.s b/sm4/xts_arm64.s index 8215e30..4399395 100644 --- a/sm4/xts_arm64.s +++ b/sm4/xts_arm64.s @@ -37,8 +37,6 @@ #define M2L V27 #define M2H V28 #define R08_MASK V29 -#define R16_MASK V30 -#define R24_MASK V31 #include "aesni_macros_arm64.s" #include "xts_macros_arm64.s"
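
Note on the optimization: the r16 and r24 shuffle masks are redundant, because a 32-bit left rotation by 16 or 24 is just the r08 byte shuffle applied two or three times. That is what lets this patch drop the r16_mask/r24_mask (and r16_mask256/r24_mask256) tables and retire two vector registers on arm64. Below is a minimal scalar sketch of the identity the new SM4_TAO_L1 / AVX_SM4_TAO_L1 / AVX2_SM4_TAO_L1 sequences rely on; the helper names and test values are illustrative only and are not part of the repository.

// Scalar sketch of the identity behind this patch: SM4's L1 transform
//   L1(x) = x ^ (x<<<2) ^ (x<<<10) ^ (x<<<18) ^ (x<<<24)
// can be computed with only the byte-rotate-by-8 shuffle (r08_mask), since
// x<<<16 = (x<<<8)<<<8, x<<<24 = (x<<<16)<<<8, and
//   (x<<<2) ^ (x<<<10) ^ (x<<<18) = (x ^ (x<<<8) ^ (x<<<16)) <<< 2.
package main

import (
	"fmt"
	"math/bits"
)

// l1Reference is the textbook form of the SM4 L1 linear transform.
func l1Reference(x uint32) uint32 {
	return x ^ bits.RotateLeft32(x, 2) ^ bits.RotateLeft32(x, 10) ^
		bits.RotateLeft32(x, 18) ^ bits.RotateLeft32(x, 24)
}

// l1Refactored mirrors the new instruction sequence, one rotation mask only.
func l1Refactored(x uint32) uint32 {
	y := bits.RotateLeft32(x, 8) // PSHUFB r08_mask: y = x <<< 8
	z := bits.RotateLeft32(y, 8) // PSHUFB r08_mask: z = x <<< 16
	y = x ^ y ^ z                // y = x ^ (x <<< 8) ^ (x <<< 16)
	z = bits.RotateLeft32(z, 8)  // PSHUFB r08_mask: z = x <<< 24
	x ^= z                       // x = x ^ (x <<< 24)
	y = bits.RotateLeft32(y, 2)  // PSLLL/PSRLL/POR: y = (x<<<2) ^ (x<<<10) ^ (x<<<18)
	return x ^ y
}

func main() {
	for _, v := range []uint32{0, 1, 0xdeadbeef, 0xffffffff} {
		fmt.Printf("%#08x: ref=%#08x new=%#08x\n", v, l1Reference(v), l1Refactored(v))
	}
}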