From 9b364dca8b803f8b2bbf9560ce42cfb469806764 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Mon, 25 Jul 2022 13:20:58 +0800 Subject: [PATCH] sm4: reduce duplicated code and add comments --- sm4/aesni_amd64.h | 187 +++++++++++++++++++++++++++++-------- sm4/aesni_arm64.h | 80 ++++++++++++++++ sm4/asm_amd64.s | 71 ++++++++------ sm4/asm_arm64.s | 25 ++--- sm4/cbc_cipher_asm_amd64.s | 8 -- sm4/cbc_cipher_asm_arm64.s | 106 ++++++--------------- sm4/gcm_amd64.s | 19 +--- sm4/gcm_arm64.s | 110 +++++++--------------- 8 files changed, 349 insertions(+), 257 deletions(-) diff --git a/sm4/aesni_amd64.h b/sm4/aesni_amd64.h index 8f8bc05..099ce5c 100644 --- a/sm4/aesni_amd64.h +++ b/sm4/aesni_amd64.h @@ -53,6 +53,23 @@ DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6 DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197 GLOBL fk_mask<>(SB), RODATA, $16 +// Transpose matrix without PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions, bad performance! +// input: from high to low +// r0 = [w3, w2, w1, w0] +// r1 = [w7, w6, w5, w4] +// r2 = [w11, w10, w9, w8] +// r3 = [w15, w14, w13, w12] +// r: 32/64 temp register +// tmp1: 128 bits temp register +// tmp2: 128 bits temp register +// +// output: from high to low +// r0 = [w12, w8, w4, w0] +// r1 = [w13, w9, w5, w1] +// r2 = [w14, w10, w6, w2] +// r3 = [w15, w11, w7, w3] +// +// SSE2/MMX instructions: // MOVOU r0, tmp2; // PUNPCKHDQ r1, tmp2; // PUNPCKLDQ r1, r0; @@ -122,6 +139,11 @@ GLOBL fk_mask<>(SB), RODATA, $16 PEXTRQ $0, tmp2, r; \ PINSRQ $0, r, r2 +// SM4 sbox function +// parameters: +// - x: 128 bits register as sbox input/output data +// - y: 128 bits temp register +// - z: 128 bits temp register #define SM4_SBOX(x, y, z) \ ; \ //############################# inner affine ############################// MOVOU x, z; \ @@ -149,6 +171,11 @@ GLOBL fk_mask<>(SB), RODATA, $16 MOVOU z, x; \ //x = _mm_shuffle_epi8(m2h, x) PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y; +// SM4 TAO L1 function +// parameters: +// - x: 128 bits register as TAO_L1 input/output data +// - y: 128 bits temp register +// - z: 128 bits temp register #define SM4_TAO_L1(x, y, z) \ SM4_SBOX(x, y, z); \ ; \ //#################### 4 parallel L1 linear transforms ##################// @@ -167,6 +194,56 @@ GLOBL fk_mask<>(SB), RODATA, $16 PXOR y, x; \ //x = x xor y PXOR z, x //x = x xor y xor _mm_shuffle_epi8(x, r24); +// SM4 single round function, handle 16 bytes data +// t0 ^= tao_l1(t1^t2^t3^xk) +// used R19 as temp 32/64 bits register +// parameters: +// - index: round key index immediate number +// - RK: round key register +// - IND: round key index base register +// - x: 128 bits temp register +// - y: 128 bits temp register +// - z: 128 bits temp register +// - t0: 128 bits register for data as result +// - t1: 128 bits register for data +// - t2: 128 bits register for data +// - t3: 128 bits register for data +#define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \ + PINSRD $0, (index * 4)(RK)(IND*1), x; \ + PXOR t1, x; \ + PXOR t2, x; \ + PXOR t3, x; \ + SM4_TAO_L1(x, y, z); \ + PXOR x, t0 + +// SM4 round function, handle 64 bytes data +// t0 ^= tao_l1(t1^t2^t3^xk) +// parameters: +// - index: round key index immediate number +// - RK: round key register +// - IND: round key index base register +// - x: 128 bits temp register +// - y: 128 bits temp register +// - z: 128 bits temp register +// - t0: 128 bits register for data as result +// - t1: 128 bits register for data +// - t2: 128 bits register for data +// - t3: 128 bits register for data +#define SM4_ROUND(index, RK, 
IND, x, y, z, t0, t1, t2, t3) \ + PINSRD $0, (index * 4)(RK)(IND*1), x; \ + PSHUFD $0, x, x; \ + PXOR t1, x; \ + PXOR t2, x; \ + PXOR t3, x; \ + SM4_TAO_L1(x, y, z); \ + PXOR x, t0 + +// SM4 sbox function, AVX version +// parameters: +// - x: 128 bits register as sbox input/output data +// - y: 128 bits temp register +// - X_NIBBLE_MASK: 128 bits register stored nibble mask, should be loaded earlier. +// - tmp: 128 bits temp register #define AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp) \ VPAND X_NIBBLE_MASK, x, tmp; \ VMOVDQU m1_low<>(SB), y; \ @@ -188,8 +265,14 @@ GLOBL fk_mask<>(SB), RODATA, $16 VPSHUFB x, tmp, x; \ VPXOR y, x, x -#define AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp) \ - AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp); \ +// SM4 TAO L1 function, AVX version +// parameters: +// - x: 128 bits register as sbox input/output data +// - y: 128 bits temp register +// - xNibbleMask: 128 bits register stored nibble mask, should be loaded earlier. +// - tmp: 128 bits temp register +#define AVX_SM4_TAO_L1(x, y, xNibbleMask, tmp) \ + AVX_SM4_SBOX(x, y, xNibbleMask, tmp); \ VMOVDQU r08_mask<>(SB), tmp; \ VPSHUFB tmp, x, y; \ VPXOR x, y, y; \ @@ -204,6 +287,14 @@ GLOBL fk_mask<>(SB), RODATA, $16 VPXOR y, x, x; \ VPXOR x, tmp, x +// transpose matrix function, AVX/AVX2 version +// parameters: +// - r0: 128/256 bits register as input/output data +// - r1: 128/256 bits register as input/output data +// - r2: 128/256 bits register as input/output data +// - r3: 128/256 bits register as input/output data +// - tmp1: 128/256 bits temp register +// - tmp2: 128/256 bits temp register #define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \ VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2] VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0] @@ -214,42 +305,60 @@ GLOBL fk_mask<>(SB), RODATA, $16 VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w27, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3] VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2] -#define AVX2_SM4_SBOX(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp) \ - VPAND yNibbleMask, x, tmp; \ - VBROADCASTI128 m1_low<>(SB), y; \ - VPSHUFB tmp, y, y; \ - VPSRLQ $4, x, x; \ - VPAND yNibbleMask, x, x; \ - VBROADCASTI128 m1_high<>(SB), tmp; \ - VPSHUFB x, tmp, x; \ - VPXOR y, x, x; \ - VBROADCASTI128 inverse_shift_rows<>(SB), tmp; \ - VPSHUFB tmp, x, x; \ - VEXTRACTI128 $1, x, yw \ - VAESENCLAST xNibbleMask, xw, xw; \ - VAESENCLAST xNibbleMask, yw, yw; \ - VINSERTI128 $1, yw, x, x; \ - VPANDN yNibbleMask, x, tmp; \ - VBROADCASTI128 m2_low<>(SB), y; \ - VPSHUFB tmp, y, y; \ - VPSRLQ $4, x, x; \ - VPAND yNibbleMask, x, x; \ - VBROADCASTI128 m2_high<>(SB), tmp; \ - VPSHUFB x, tmp, x; \ +// SM4 sbox function, AVX2 version +// parameters: +// - x: 256 bits register as sbox input/output data +// - y: 256 bits temp register +// - z: 256 bits temp register +// - xw: 128 bits temp register +// - yw: 128 bits temp register +// - xNibbleMask: 128 bits register stored nibble mask, should be loaded earlier. +// - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier. 
+#define AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \ + VPAND yNibbleMask, x, z; \ + VBROADCASTI128 m1_low<>(SB), y; \ + VPSHUFB z, y, y; \ + VPSRLQ $4, x, x; \ + VPAND yNibbleMask, x, x; \ + VBROADCASTI128 m1_high<>(SB), z; \ + VPSHUFB x, z, x; \ + VPXOR y, x, x; \ + VBROADCASTI128 inverse_shift_rows<>(SB), z; \ + VPSHUFB z, x, x; \ + VEXTRACTI128 $1, x, yw \ + VAESENCLAST xNibbleMask, xw, xw; \ + VAESENCLAST xNibbleMask, yw, yw; \ + VINSERTI128 $1, yw, x, x; \ + VPANDN yNibbleMask, x, z; \ + VBROADCASTI128 m2_low<>(SB), y; \ + VPSHUFB z, y, y; \ + VPSRLQ $4, x, x; \ + VPAND yNibbleMask, x, x; \ + VBROADCASTI128 m2_high<>(SB), z; \ + VPSHUFB x, z, x; \ VPXOR y, x, x -#define AVX2_SM4_TAO_L1(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp) \ - AVX2_SM4_SBOX(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp); \ - VBROADCASTI128 r08_mask<>(SB), tmp; \ - VPSHUFB tmp, x, y; \ - VPXOR x, y, y; \ - VBROADCASTI128 r16_mask<>(SB), tmp; \ - VPSHUFB tmp, x, tmp; \ - VPXOR tmp, y, y; \ - VPSLLD $2, y, tmp; \ - VPSRLD $30, y, y; \ - VPXOR tmp, y, y; \ - VBROADCASTI128 r24_mask<>(SB), tmp; \ - VPSHUFB tmp, x, tmp; \ - VPXOR y, x, x; \ - VPXOR x, tmp, x +// SM4 TAO L1 function, AVX2 version +// parameters: +// - x: 256 bits register as sbox input/output data +// - y: 256 bits temp register +// - z: 256 bits temp register +// - xw: 128 bits temp register +// - yw: 128 bits temp register +// - xNibbleMask: 128 bits register stored nibble mask, should be loaded earlier. +// - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier. +#define AVX2_SM4_TAO_L1(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \ + AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask); \ + VBROADCASTI128 r08_mask<>(SB), z; \ + VPSHUFB z, x, y; \ + VPXOR x, y, y; \ + VBROADCASTI128 r16_mask<>(SB), z; \ + VPSHUFB z, x, z; \ + VPXOR z, y, y; \ + VPSLLD $2, y, z; \ + VPSRLD $30, y, y; \ + VPXOR z, y, y; \ + VBROADCASTI128 r24_mask<>(SB), z; \ + VPSHUFB z, x, z; \ + VPXOR y, x, x; \ + VPXOR x, z, x diff --git a/sm4/aesni_arm64.h b/sm4/aesni_arm64.h index 6afbeff..363da9c 100644 --- a/sm4/aesni_arm64.h +++ b/sm4/aesni_arm64.h @@ -43,6 +43,45 @@ DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6 DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197 GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16 +#define LOAD_SM4_AESNI_CONSTS() \ + LDP nibble_mask<>(SB), (R20, R21) \ + VMOV R20, NIBBLE_MASK.D[0] \ + VMOV R21, NIBBLE_MASK.D[1] \ + LDP m1_low<>(SB), (R20, R21) \ + VMOV R20, M1L.D[0] \ + VMOV R21, M1L.D[1] \ + LDP m1_high<>(SB), (R20, R21) \ + VMOV R20, M1H.D[0] \ + VMOV R21, M1H.D[1] \ + LDP m2_low<>(SB), (R20, R21) \ + VMOV R20, M2L.D[0] \ + VMOV R21, M2L.D[1] \ + LDP m2_high<>(SB), (R20, R21) \ + VMOV R20, M2H.D[0] \ + VMOV R21, M2H.D[1] \ + LDP inverse_shift_rows<>(SB), (R20, R21) \ + VMOV R20, INVERSE_SHIFT_ROWS.D[0] \ + VMOV R21, INVERSE_SHIFT_ROWS.D[1] \ + LDP r08_mask<>(SB), (R20, R21) \ + VMOV R20, R08_MASK.D[0] \ + VMOV R21, R08_MASK.D[1] \ + LDP r16_mask<>(SB), (R20, R21) \ + VMOV R20, R16_MASK.D[0] \ + VMOV R21, R16_MASK.D[1] \ + LDP r24_mask<>(SB), (R20, R21) \ + VMOV R20, R24_MASK.D[0] \ + VMOV R21, R24_MASK.D[1] + +// input: from high to low +// t0 = t0.S3, t0.S2, t0.S1, t0.S0 +// t1 = t1.S3, t1.S2, t1.S1, t1.S0 +// t2 = t2.S3, t2.S2, t2.S1, t2.S0 +// t3 = t3.S3, t3.S2, t3.S1, t3.S0 +// output: from high to low +// t0 = t3.S0, t2.S0, t1.S0, t0.S0 +// t1 = t3.S1, t2.S1, t1.S1, t0.S1 +// t2 = t3.S2, t2.S2, t1.S2, t0.S2 +// t3 = t3.S3, t2.S3, t1.S3, t0.S3 #define PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \ VMOV t0.B16, K.B16 \ 
VMOV t1.S[0], t0.S[1] \ @@ -60,6 +99,16 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16 VMOV t3.S[2], t2.S[3] \ VMOV K.S[3], t3.S[2] +// input: from high to low +// t0 = t0.S3, t0.S2, t0.S1, t0.S0 +// t1 = t1.S3, t1.S2, t1.S1, t1.S0 +// t2 = t2.S3, t2.S2, t2.S1, t2.S0 +// t3 = t3.S3, t3.S2, t3.S1, t3.S0 +// output: from high to low +// t0 = t0.S0, t1.S0, t2.S0, t3.S0 +// t1 = t0.S1, t1.S1, t2.S1, t3.S1 +// t2 = t0.S2, t1.S2, t2.S2, t3.S2 +// t3 = t0.S3, t1.S3, t2.S3, t3.S3 #define TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \ VMOV t0.B16, K.B16 \ VMOV t3.S[0], t0.S[0] \ @@ -80,6 +129,11 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16 VMOV t2.S[2], t2.S[1] \ VMOV K.S[2], t2.S[2] +// SM4 sbox function +// parameters: +// - x: 128 bits register as sbox input/output data +// - y: 128 bits temp register +// - z: 128 bits temp register #define SM4_SBOX(x, y, z) \ ; \ VAND x.B16, NIBBLE_MASK.B16, z.B16; \ @@ -97,6 +151,11 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16 VTBL z.B16, [M2H.B16], z.B16; \ VEOR y.B16, z.B16, x.B16 +// SM4 TAO L1 function +// parameters: +// - x: 128 bits register as TAO_L1 input/output data +// - y: 128 bits temp register +// - z: 128 bits temp register #define SM4_TAO_L1(x, y, z) \ SM4_SBOX(x, y, z); \ VTBL R08_MASK.B16, [x.B16], y.B16; \ @@ -109,3 +168,24 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16 VTBL R24_MASK.B16, [x.B16], z.B16; \ VEOR z.B16, x.B16, x.B16; \ VEOR y.B16, x.B16, x.B16 + +// SM4 round function +// t0 ^= tao_l1(t1^t2^t3^xk) +// parameters: +// - RK: round key register +// - tmp32: temp 32/64 bits register +// - x: 128 bits temp register +// - y: 128 bits temp register +// - z: 128 bits temp register +// - t0: 128 bits register for data as result +// - t1: 128 bits register for data +// - t2: 128 bits register for data +// - t3: 128 bits register for data +#define SM4_ROUND(RK, tmp32, x, y, z, t0, t1, t2, t3) \ + MOVW.P 4(RK), tmp32; \ + VMOV tmp32, x.S4; \ + VEOR t1.B16, x.B16, x.B16; \ + VEOR t2.B16, x.B16, x.B16; \ + VEOR t3.B16, x.B16, x.B16; \ + SM4_TAO_L1(x, y, z); \ + VEOR x.B16, t0.B16, t0.B16 diff --git a/sm4/asm_amd64.s b/sm4/asm_amd64.s index 3492d9d..66e8584 100644 --- a/sm4/asm_amd64.s +++ b/sm4/asm_amd64.s @@ -16,6 +16,12 @@ #include "aesni_amd64.h" +// SM4 TAO L2 function, used for key expand +// parameters: +// - x: 128 bits register as TAO_L1 input/output data +// - y: 128 bits temp register +// - tmp1: 128 bits temp register +// - tmp2: 128 bits temp register #define SM4_TAO_L2(x, y, tmp1, tmp2) \ SM4_SBOX(x, y, tmp1); \ ; \ //#################### 4 parallel L2 linear transforms ##################// @@ -31,23 +37,16 @@ PXOR tmp2, y; \ PXOR y, x -#define SM4_ROUND(index, x, y, t0, t1, t2, t3) \ - PINSRD $0, (index * 4)(AX)(CX*1), x; \ - PSHUFD $0, x, x; \ - PXOR t1, x; \ - PXOR t2, x; \ - PXOR t3, x; \ - SM4_TAO_L1(x, y, XTMP6); \ - PXOR x, t0 - -#define SM4_SINGLE_ROUND(index, x, y, t0, t1, t2, t3) \ - PINSRD $0, (index * 4)(AX)(CX*1), x; \ - PXOR t1, x; \ - PXOR t2, x; \ - PXOR t3, x; \ - SM4_TAO_L1(x, y, XTMP6); \ - PXOR x, t0 - +// SM4 expand round function +// t0 ^= tao_l2(t1^t2^t3^ck) and store t0.S[0] to enc/dec +// parameters: +// - index: round key index immediate number +// - x: 128 bits temp register +// - y: 128 bits temp register +// - t0: 128 bits register for data +// - t1: 128 bits register for data +// - t2: 128 bits register for data +// - t3: 128 bits register for data #define SM4_EXPANDKEY_ROUND(index, x, y, t0, t1, t2, t3) \ PINSRD $0, (index * 4)(BX)(CX*1), x; \ PXOR t1, x; \ @@ -89,14 +88,34 @@ #define XWORD X8 #define YWORD X9 +// 
SM4 round function, AVX2 version, handle 256 bits +// t0 ^= tao_l1(t1^t2^t3^xk) +// parameters: +// - index: round key index immediate number +// - x: 256 bits temp register +// - y: 256 bits temp register +// - t0: 256 bits register for data as result +// - t1: 256 bits register for data +// - t2: 256 bits register for data +// - t3: 256 bits register for data #define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \ VPBROADCASTD (index * 4)(AX)(CX*1), x; \ VPXOR t1, x, x; \ VPXOR t2, x, x; \ VPXOR t3, x, x; \ - AVX2_SM4_TAO_L1(x, y, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK, XDWTMP0); \ + AVX2_SM4_TAO_L1(x, y, XDWTMP0, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK); \ VPXOR x, t0, t0 +// SM4 round function, AVX version, handle 128 bits +// t0 ^= tao_l1(t1^t2^t3^xk) +// parameters: +// - index: round key index immediate number +// - x: 128 bits temp register +// - y: 128 bits temp register +// - t0: 128 bits register for data as result +// - t1: 128 bits register for data +// - t2: 128 bits register for data +// - t3: 128 bits register for data #define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3) \ VPBROADCASTD (index * 4)(AX)(CX*1), x; \ VPXOR t1, x, x; \ @@ -174,10 +193,10 @@ non_avx2_start: XORL CX, CX loop: - SM4_ROUND(0, x, y, t0, t1, t2, t3) - SM4_ROUND(1, x, y, t1, t2, t3, t0) - SM4_ROUND(2, x, y, t2, t3, t0, t1) - SM4_ROUND(3, x, y, t3, t0, t1, t2) + SM4_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3) + SM4_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0) + SM4_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1) + SM4_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2) ADDL $16, CX CMPL CX, $4*32 @@ -328,10 +347,10 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0 XORL CX, CX loop: - SM4_SINGLE_ROUND(0, x, y, t0, t1, t2, t3) - SM4_SINGLE_ROUND(1, x, y, t1, t2, t3, t0) - SM4_SINGLE_ROUND(2, x, y, t2, t3, t0, t1) - SM4_SINGLE_ROUND(3, x, y, t3, t0, t1, t2) + SM4_SINGLE_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3) + SM4_SINGLE_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0) + SM4_SINGLE_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1) + SM4_SINGLE_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2) ADDL $16, CX CMPL CX, $4*32 diff --git a/sm4/asm_arm64.s b/sm4/asm_arm64.s index 1d59cdb..06ed930 100644 --- a/sm4/asm_arm64.s +++ b/sm4/asm_arm64.s @@ -37,15 +37,6 @@ VEOR XTMP7.B16, y.B16, y.B16; \ VEOR x.B16, y.B16, x.B16 -#define SM4_ROUND(RK, x, y, t0, t1, t2, t3) \ - MOVW.P 4(RK), R19; \ - VMOV R19, x.S4; \ - VEOR t1.B16, x.B16, x.B16; \ - VEOR t2.B16, x.B16, x.B16; \ - VEOR t3.B16, x.B16, x.B16; \ - SM4_TAO_L1(x, y, XTMP6); \ - VEOR x.B16, t0.B16, t0.B16 - #define SM4_EXPANDKEY_ROUND(x, y, t0, t1, t2, t3) \ MOVW.P 4(R9), R19; \ VMOV R19, x.S[0]; \ @@ -226,10 +217,10 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0 EOR R0, R0 encryptBlocksLoop: - SM4_ROUND(R8, x, y, t0, t1, t2, t3) - SM4_ROUND(R8, x, y, t1, t2, t3, t0) - SM4_ROUND(R8, x, y, t2, t3, t0, t1) - SM4_ROUND(R8, x, y, t3, t0, t1, t2) + SM4_ROUND(R8, R19, x, y, XTMP6, t0, t1, t2, t3) + SM4_ROUND(R8, R19, x, y, XTMP6, t1, t2, t3, t0) + SM4_ROUND(R8, R19, x, y, XTMP6, t2, t3, t0, t1) + SM4_ROUND(R8, R19, x, y, XTMP6, t3, t0, t1, t2) ADD $16, R0 CMP $128, R0 @@ -297,10 +288,10 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0 EOR R0, R0 encryptBlockLoop: - SM4_ROUND(R8, x, y, t0, t1, t2, t3) - SM4_ROUND(R8, x, y, t1, t2, t3, t0) - SM4_ROUND(R8, x, y, t2, t3, t0, t1) - SM4_ROUND(R8, x, y, t3, t0, t1, t2) + SM4_ROUND(R8, R19, x, y, XTMP6, t0, t1, t2, t3) + SM4_ROUND(R8, R19, x, y, XTMP6, t1, t2, t3, t0) + SM4_ROUND(R8, R19, x, y, XTMP6, t2, t3, t0, t1) + SM4_ROUND(R8, R19, x, y, XTMP6, t3, t0, 
t1, t2) ADD $16, R0 CMP $128, R0 diff --git a/sm4/cbc_cipher_asm_amd64.s b/sm4/cbc_cipher_asm_amd64.s index 91fe2bc..f01f1b4 100644 --- a/sm4/cbc_cipher_asm_amd64.s +++ b/sm4/cbc_cipher_asm_amd64.s @@ -15,14 +15,6 @@ #include "aesni_amd64.h" -#define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \ - PINSRD $0, (index * 4)(RK)(IND*1), x; \ - PXOR t1, x; \ - PXOR t2, x; \ - PXOR t3, x; \ - SM4_TAO_L1(x, y, z); \ - PXOR x, t0 - // func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte) TEXT ·encryptBlocksChain(SB),NOSPLIT,$0 #define ctx BX diff --git a/sm4/cbc_cipher_asm_arm64.s b/sm4/cbc_cipher_asm_arm64.s index 5f9473c..3edd9b7 100644 --- a/sm4/cbc_cipher_asm_arm64.s +++ b/sm4/cbc_cipher_asm_arm64.s @@ -25,50 +25,6 @@ #include "aesni_arm64.h" -#define SM4_ROUND(RK, x, y, z, t0, t1, t2, t3) \ - MOVW.P 4(RK), R19; \ - VMOV R19, x.S4; \ - VEOR t1.B16, x.B16, x.B16; \ - VEOR t2.B16, x.B16, x.B16; \ - VEOR t3.B16, x.B16, x.B16; \ - SM4_TAO_L1(x, y, z); \ - VEOR x.B16, t0.B16, t0.B16 - -#define load_global_data_1() \ - LDP nibble_mask<>(SB), (R0, R1) \ - VMOV R0, NIBBLE_MASK.D[0] \ - VMOV R1, NIBBLE_MASK.D[1] \ - LDP m1_low<>(SB), (R0, R1) \ - VMOV R0, M1L.D[0] \ - VMOV R1, M1L.D[1] \ - LDP m1_high<>(SB), (R0, R1) \ - VMOV R0, M1H.D[0] \ - VMOV R1, M1H.D[1] \ - LDP m2_low<>(SB), (R0, R1) \ - VMOV R0, M2L.D[0] \ - VMOV R1, M2L.D[1] \ - LDP m2_high<>(SB), (R0, R1) \ - VMOV R0, M2H.D[0] \ - VMOV R1, M2H.D[1] \ - LDP fk_mask<>(SB), (R0, R1) \ - VMOV R0, FK_MASK.D[0] \ - VMOV R1, FK_MASK.D[1] \ - LDP inverse_shift_rows<>(SB), (R0, R1) \ - VMOV R0, INVERSE_SHIFT_ROWS.D[0] \ - VMOV R1, INVERSE_SHIFT_ROWS.D[1] - -#define load_global_data_2() \ - load_global_data_1() \ - LDP r08_mask<>(SB), (R0, R1) \ - VMOV R0, R08_MASK.D[0] \ - VMOV R1, R08_MASK.D[1] \ - LDP r16_mask<>(SB), (R0, R1) \ - VMOV R0, R16_MASK.D[0] \ - VMOV R1, R16_MASK.D[1] \ - LDP r24_mask<>(SB), (R0, R1) \ - VMOV R0, R24_MASK.D[0] \ - VMOV R1, R24_MASK.D[1] - // func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte) TEXT ·encryptBlocksChain(SB),NOSPLIT,$0 #define ctx R1 @@ -76,55 +32,55 @@ TEXT ·encryptBlocksChain(SB),NOSPLIT,$0 #define ptxLen R4 #define rkSave R8 - load_global_data_2() + LOAD_SM4_AESNI_CONSTS() MOVD xk+0(FP), rkSave MOVD dst+8(FP), ctx MOVD src+32(FP), ptx MOVD src_len+40(FP), ptxLen MOVD iv+56(FP), R5 - - VEOR ZERO.B16, ZERO.B16, ZERO.B16 - VLD1 (R5), [IV.B16] + + VEOR ZERO.B16, ZERO.B16, ZERO.B16 + VLD1 (R5), [IV.B16] loopSrc: - CMP $16, ptxLen - BLT done_sm4 - SUB $16, ptxLen + CMP $16, ptxLen + BLT done_sm4 + SUB $16, ptxLen - VLD1.P (ptx), [t0.S4] - VEOR IV.B16, t0.B16, t0.B16 - VREV32 t0.B16, t0.B16 - VMOV t0.S[1], t1.S[0] - VMOV t0.S[2], t2.S[0] - VMOV t0.S[3], t3.S[0] + VLD1.P (ptx), [t0.S4] + VEOR IV.B16, t0.B16, t0.B16 + VREV32 t0.B16, t0.B16 + VMOV t0.S[1], t1.S[0] + VMOV t0.S[2], t2.S[0] + VMOV t0.S[3], t3.S[0] - EOR R2, R2 - MOVD rkSave, R0 + EOR R2, R2 + MOVD rkSave, R0 encryptBlockLoop: - SM4_ROUND(R0, x, y, XTMP6, t0, t1, t2, t3) - SM4_ROUND(R0, x, y, XTMP6, t1, t2, t3, t0) - SM4_ROUND(R0, x, y, XTMP6, t2, t3, t0, t1) - SM4_ROUND(R0, x, y, XTMP6, t3, t0, t1, t2) + SM4_ROUND(R0, R19, x, y, XTMP6, t0, t1, t2, t3) + SM4_ROUND(R0, R19, x, y, XTMP6, t1, t2, t3, t0) + SM4_ROUND(R0, R19, x, y, XTMP6, t2, t3, t0, t1) + SM4_ROUND(R0, R19, x, y, XTMP6, t3, t0, t1, t2) - ADD $16, R2 - CMP $128, R2 - BNE encryptBlockLoop + ADD $16, R2 + CMP $128, R2 + BNE encryptBlockLoop - VMOV t2.S[0], t3.S[1] - VMOV t1.S[0], t3.S[2] - VMOV t0.S[0], t3.S[3] - VREV32 t3.B16, t3.B16 + VMOV t2.S[0], t3.S[1] + 
VMOV t1.S[0], t3.S[2] + VMOV t0.S[0], t3.S[3] + VREV32 t3.B16, t3.B16 - VST1.P [t3.B16], (ctx) - VMOV t3.B16, IV.B16 + VST1.P [t3.B16], (ctx) + VMOV t3.B16, IV.B16 - B loopSrc + B loopSrc done_sm4: - VST1 [IV.B16], (R5) - RET + VST1 [IV.B16], (R5) + RET #undef ctx #undef ptx diff --git a/sm4/gcm_amd64.s b/sm4/gcm_amd64.s index dd0a5cb..c980d44 100644 --- a/sm4/gcm_amd64.s +++ b/sm4/gcm_amd64.s @@ -155,23 +155,6 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0 #undef plen #undef dlen -#define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \ - PINSRD $0, (index * 4)(RK)(IND*1), x; \ - PXOR t1, x; \ - PXOR t2, x; \ - PXOR t3, x; \ - SM4_TAO_L1(x, y, z); \ - PXOR x, t0 - -#define SM4_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \ - PINSRD $0, (index * 4)(RK)(IND*1), x; \ - PSHUFD $0, x, x; \ - PXOR t1, x; \ - PXOR t2, x; \ - PXOR t3, x; \ - SM4_TAO_L1(x, y, z); \ - PXOR x, t0 - #define SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3) \ PSHUFB flip_mask<>(SB), t0; \ PSHUFB flip_mask<>(SB), t1; \ @@ -229,7 +212,7 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0 VPXOR t1, x, x; \ VPXOR t2, x, x; \ VPXOR t3, x, x; \ - AVX2_SM4_TAO_L1(x, y, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK, tmp); \ + AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \ VPXOR x, t0, t0 #define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \ diff --git a/sm4/gcm_arm64.s b/sm4/gcm_arm64.s index 295999a..8b7356d 100644 --- a/sm4/gcm_arm64.s +++ b/sm4/gcm_arm64.s @@ -111,44 +111,6 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0 #include "aesni_arm64.h" -#define LOAD_SM4_AESNI_CONSTS() \ - LDP nibble_mask<>(SB), (R20, R21) \ - VMOV R20, NIBBLE_MASK.D[0] \ - VMOV R21, NIBBLE_MASK.D[1] \ - LDP m1_low<>(SB), (R20, R21) \ - VMOV R20, M1L.D[0] \ - VMOV R21, M1L.D[1] \ - LDP m1_high<>(SB), (R20, R21) \ - VMOV R20, M1H.D[0] \ - VMOV R21, M1H.D[1] \ - LDP m2_low<>(SB), (R20, R21) \ - VMOV R20, M2L.D[0] \ - VMOV R21, M2L.D[1] \ - LDP m2_high<>(SB), (R20, R21) \ - VMOV R20, M2H.D[0] \ - VMOV R21, M2H.D[1] \ - LDP inverse_shift_rows<>(SB), (R20, R21) \ - VMOV R20, INVERSE_SHIFT_ROWS.D[0] \ - VMOV R21, INVERSE_SHIFT_ROWS.D[1] \ - LDP r08_mask<>(SB), (R20, R21) \ - VMOV R20, R08_MASK.D[0] \ - VMOV R21, R08_MASK.D[1] \ - LDP r16_mask<>(SB), (R20, R21) \ - VMOV R20, R16_MASK.D[0] \ - VMOV R21, R16_MASK.D[1] \ - LDP r24_mask<>(SB), (R20, R21) \ - VMOV R20, R24_MASK.D[0] \ - VMOV R21, R24_MASK.D[1] - -#define SM4_ROUND(RK, x, y, z, t0, t1, t2, t3) \ - MOVW.P 4(RK), R19; \ - VMOV R19, x.S4; \ - VEOR t1.B16, x.B16, x.B16; \ - VEOR t2.B16, x.B16, x.B16; \ - VEOR t3.B16, x.B16, x.B16; \ - SM4_TAO_L1(x, y, z); \ - VEOR x.B16, t0.B16, t0.B16 - // func gcmSm4Init(productTable *[256]byte, rk []uint32, inst int) TEXT ·gcmSm4Init(SB),NOSPLIT,$0 #define pTbl R0 @@ -178,10 +140,10 @@ TEXT ·gcmSm4Init(SB),NOSPLIT,$0 EOR R3, R3 sm4InitEncLoop: - SM4_ROUND(RK, K0, K1, K2, B0, B1, B2, B3) - SM4_ROUND(RK, K0, K1, K2, B1, B2, B3, B0) - SM4_ROUND(RK, K0, K1, K2, B2, B3, B0, B1) - SM4_ROUND(RK, K0, K1, K2, B3, B0, B1, B2) + SM4_ROUND(RK, R19, K0, K1, K2, B0, B1, B2, B3) + SM4_ROUND(RK, R19, K0, K1, K2, B1, B2, B3, B0) + SM4_ROUND(RK, R19, K0, K1, K2, B2, B3, B0, B1) + SM4_ROUND(RK, R19, K0, K1, K2, B3, B0, B1, B2) ADD $1, R3 CMP $8, R3 @@ -491,10 +453,10 @@ encOctetsLoop: MOVD rkSave, rk encOctetsEnc4Blocks1: - SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3) - SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0) - SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1) - SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2) + SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3) + SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, 
B3, B0) + SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1) + SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2) ADD $1, R13 CMP $8, R13 @@ -509,10 +471,10 @@ encOctetsEnc4Blocks1: MOVD rkSave, rk encOctetsEnc4Blocks2: - SM4_ROUND(rk, K0, K1, K2, B4, B5, B6, B7) - SM4_ROUND(rk, K0, K1, K2, B5, B6, B7, B4) - SM4_ROUND(rk, K0, K1, K2, B6, B7, B4, B5) - SM4_ROUND(rk, K0, K1, K2, B7, B4, B5, B6) + SM4_ROUND(rk, R19, K0, K1, K2, B4, B5, B6, B7) + SM4_ROUND(rk, R19, K0, K1, K2, B5, B6, B7, B4) + SM4_ROUND(rk, R19, K0, K1, K2, B6, B7, B4, B5) + SM4_ROUND(rk, R19, K0, K1, K2, B7, B4, B5, B6) ADD $1, R13 CMP $16, R13 @@ -586,10 +548,10 @@ encNibblesLoop: MOVD rkSave, rk encNibblesEnc4Blocks: - SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3) - SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0) - SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1) - SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2) + SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3) + SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0) + SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1) + SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2) ADD $1, R13 CMP $8, R13 @@ -631,10 +593,10 @@ encStartSingles: MOVD rkSave, rk encSinglesEnc4Blocks: - SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3) - SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0) - SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1) - SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2) + SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3) + SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0) + SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1) + SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2) ADD $1, R13 CMP $8, R13 @@ -783,10 +745,10 @@ decOctetsLoop: MOVD rkSave, rk decOctetsEnc4Blocks1: - SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3) - SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0) - SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1) - SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2) + SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3) + SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0) + SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1) + SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2) ADD $1, R13 CMP $8, R13 @@ -802,10 +764,10 @@ decOctetsEnc4Blocks1: MOVD rkSave, rk decOctetsEnc4Blocks2: - SM4_ROUND(rk, K0, K1, K2, B4, B5, B6, B7) - SM4_ROUND(rk, K0, K1, K2, B5, B6, B7, B4) - SM4_ROUND(rk, K0, K1, K2, B6, B7, B4, B5) - SM4_ROUND(rk, K0, K1, K2, B7, B4, B5, B6) + SM4_ROUND(rk, R19, K0, K1, K2, B4, B5, B6, B7) + SM4_ROUND(rk, R19, K0, K1, K2, B5, B6, B7, B4) + SM4_ROUND(rk, R19, K0, K1, K2, B6, B7, B4, B5) + SM4_ROUND(rk, R19, K0, K1, K2, B7, B4, B5, B6) ADD $1, R13 CMP $16, R13 @@ -880,10 +842,10 @@ decNibblesLoop: MOVD rkSave, rk decNibblesEnc4Blocks: - SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3) - SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0) - SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1) - SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2) + SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3) + SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0) + SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1) + SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2) ADD $1, R13 CMP $8, R13 @@ -928,10 +890,10 @@ decStartSingles: MOVD rkSave, rk decSinglesEnc4Blocks: - SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3) - SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0) - SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1) - SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2) + SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3) + SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0) + SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1) + SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2) ADD $1, R13 CMP $8, R13
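
Note (not part of the patch): every SM4_ROUND / SM4_SINGLE_ROUND macro consolidated above computes the same thing, t0 ^= L1(tau(t1 ^ t2 ^ t3 ^ rk)), where tau is the byte-wise S-box and L1 is the linear transform built from the r08/r16/r24 shuffle masks plus the PSLLD $2 / PSRLD $30 pair; SM4_TAO_L2 is the key-schedule variant. The Go sketch below models that data flow in scalar code for reference only. It assumes the standard SM4 S-box table (values omitted for brevity), and the helper names (tau, l1, l2, round) are illustrative, not identifiers from this repository.

// Scalar reference model of the round computed by the assembly macros.
package main

import "fmt"

// sbox is the standard SM4 S-box (GB/T 32907-2016); values omitted here,
// so this sketch only demonstrates the data flow, not real SM4 output.
var sbox = [256]byte{ /* standard SM4 S-box values */ }

func rotl32(x uint32, n uint) uint32 { return x<<n | x>>(32-n) }

// tau applies the S-box to each byte of a 32-bit word
// (what SM4_SBOX does on four lanes at once).
func tau(x uint32) uint32 {
	return uint32(sbox[x>>24])<<24 |
		uint32(sbox[(x>>16)&0xff])<<16 |
		uint32(sbox[(x>>8)&0xff])<<8 |
		uint32(sbox[x&0xff])
}

// l1 is the encryption linear transform realized in the macros by the
// r08/r16/r24 byte-rotation masks and the 2-bit word rotation.
func l1(b uint32) uint32 {
	return b ^ rotl32(b, 2) ^ rotl32(b, 10) ^ rotl32(b, 18) ^ rotl32(b, 24)
}

// l2 is the key-expansion linear transform used by SM4_TAO_L2.
func l2(b uint32) uint32 {
	return b ^ rotl32(b, 13) ^ rotl32(b, 23)
}

// round performs one SM4 round: t0 ^= L1(tau(t1 ^ t2 ^ t3 ^ rk)).
func round(rk, t0, t1, t2, t3 uint32) uint32 {
	return t0 ^ l1(tau(t1^t2^t3^rk))
}

func main() {
	// With the S-box left zeroed this prints placeholder output;
	// it exists only to show how the round function is driven.
	fmt.Printf("%08x\n", round(0x12345678, 1, 2, 3, 4))
}

The vectorized macros run this round on four (SSE/AVX) or eight (AVX2) blocks in parallel, which is why the data is transposed into word-sliced form first and transposed back afterwards.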