sm4: reduce duplicated code and add comments

Sun Yimin 2022-07-25 13:20:58 +08:00 committed by GitHub
parent 9204f1f4b2
commit 9b364dca8b
8 changed files with 349 additions and 257 deletions

View File

@ -53,6 +53,23 @@ DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), RODATA, $16
// Transpose matrix without PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions (noticeably slower).
// input: from high to low
// r0 = [w3, w2, w1, w0]
// r1 = [w7, w6, w5, w4]
// r2 = [w11, w10, w9, w8]
// r3 = [w15, w14, w13, w12]
// r: 32/64 temp register
// tmp1: 128 bits temp register
// tmp2: 128 bits temp register
//
// output: from high to low
// r0 = [w12, w8, w4, w0]
// r1 = [w13, w9, w5, w1]
// r2 = [w14, w10, w6, w2]
// r3 = [w15, w11, w7, w3]
//
// SSE2/MMX instructions:
// MOVOU r0, tmp2;
// PUNPCKHDQ r1, tmp2;
// PUNPCKLDQ r1, r0;
@ -122,6 +139,11 @@ GLOBL fk_mask<>(SB), RODATA, $16
PEXTRQ $0, tmp2, r; \
PINSRQ $0, r, r2
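For reference, whichever instruction sequence is used, the macro above performs a plain 4x4 transpose of 32-bit words across four 128-bit registers. A minimal Go sketch of the same data movement (the function and its name are illustrative, not part of this code):

// transpose4x4 mirrors the transpose documented above for one 128-bit lane:
// input  r0 = {w0,w1,w2,w3} ... r3 = {w12,w13,w14,w15} (low word first),
// output r0 = {w0,w4,w8,w12} ... r3 = {w3,w7,w11,w15}.
func transpose4x4(r0, r1, r2, r3 *[4]uint32) {
	in := [4][4]uint32{*r0, *r1, *r2, *r3}
	var out [4][4]uint32
	for i := 0; i < 4; i++ {
		for j := 0; j < 4; j++ {
			out[j][i] = in[i][j] // word j of input i lands in output j, slot i
		}
	}
	*r0, *r1, *r2, *r3 = out[0], out[1], out[2], out[3]
}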
// SM4 sbox function
// parameters:
// - x: 128 bits register as sbox input/output data
// - y: 128 bits temp register
// - z: 128 bits temp register
#define SM4_SBOX(x, y, z) \
; \ //############################# inner affine ############################//
MOVOU x, z; \
@ -149,6 +171,11 @@ GLOBL fk_mask<>(SB), RODATA, $16
MOVOU z, x; \ //x = _mm_shuffle_epi8(m2h, x)
PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y;
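The m1_low/m1_high (and m2_low/m2_high) pairs used by SM4_SBOX implement a byte-wise affine-style transform as two 16-entry PSHUFB lookups, one indexed by the low nibble and one by the shifted-down high nibble, XORed together. A scalar Go sketch of that nibble-split lookup (the helper and its table parameters are illustrative; the real table contents live in the DATA sections of this file):

// affineByte applies an 8-bit-to-8-bit transform to b using two 16-entry
// tables, the decomposition the PAND/PSRLQ/PSHUFB sequence relies on:
// result = lowTab[b & 0xF] ^ highTab[b >> 4].
func affineByte(lowTab, highTab *[16]byte, b byte) byte {
	return lowTab[b&0x0F] ^ highTab[b>>4]
}

One PSHUFB per table performs these 16 byte lookups in parallel across an XMM register.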
// SM4 TAO L1 function
// parameters:
// - x: 128 bits register as TAO_L1 input/output data
// - y: 128 bits temp register
// - z: 128 bits temp register
#define SM4_TAO_L1(x, y, z) \
SM4_SBOX(x, y, z); \
; \ //#################### 4 parallel L1 linear transforms ##################//
@ -167,6 +194,56 @@ GLOBL fk_mask<>(SB), RODATA, $16
PXOR y, x; \ //x = x xor y
PXOR z, x //x = x xor y xor _mm_shuffle_epi8(x, r24);
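The byte shuffles and the 2/30-bit shift pair combined here add up to the standard SM4 L1 linear transform. A scalar Go reference (the helper name l1 is mine; rotations are written as the shift pairs the macro uses):

// l1 is SM4's L1 linear transform for the data path:
// L1(b) = b ^ (b <<< 2) ^ (b <<< 10) ^ (b <<< 18) ^ (b <<< 24).
func l1(b uint32) uint32 {
	return b ^ (b<<2 | b>>30) ^ (b<<10 | b>>22) ^ (b<<18 | b>>14) ^ (b<<24 | b>>8)
}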
// SM4 single round function, handles 16 bytes of data
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - index: round key index immediate number
// - RK: round key register
// - IND: round key index base register
// - x: 128 bits temp register
// - y: 128 bits temp register
// - z: 128 bits temp register
// - t0: 128 bits register for data as result
// - t1: 128 bits register for data
// - t2: 128 bits register for data
// - t3: 128 bits register for data
#define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(RK)(IND*1), x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L1(x, y, z); \
PXOR x, t0
// SM4 round function, handles 64 bytes of data
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - index: round key index immediate number
// - RK: round key register
// - IND: round key index base register
// - x: 128 bits temp register
// - y: 128 bits temp register
// - z: 128 bits temp register
// - t0: 128 bits register for data as result
// - t1: 128 bits register for data
// - t2: 128 bits register for data
// - t3: 128 bits register for data
#define SM4_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(RK)(IND*1), x; \
PSHUFD $0, x, x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L1(x, y, z); \
PXOR x, t0
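Both macros evaluate the same SM4 round on 32-bit words: SM4_SINGLE_ROUND keeps one block spread across the low lanes of t0..t3, while SM4_ROUND additionally broadcasts the round key with PSHUFD so four word-sliced blocks advance together. A scalar Go reference of the round (l1 is the sketch above; the S-box table is passed in rather than reproduced here):

// round computes t0 ^= L1(tau(t1 ^ t2 ^ t3 ^ rk)), i.e. one SM4 round
// per 32-bit lane; tau applies the SM4 S-box to each byte of the word.
func round(t0, t1, t2, t3, rk uint32, sbox *[256]byte) uint32 {
	x := t1 ^ t2 ^ t3 ^ rk
	x = uint32(sbox[x>>24])<<24 | uint32(sbox[x>>16&0xff])<<16 |
		uint32(sbox[x>>8&0xff])<<8 | uint32(sbox[x&0xff])
	return t0 ^ l1(x)
}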
// SM4 sbox function, AVX version
// parameters:
// - x: 128 bits register as sbox input/output data
// - y: 128 bits temp register
// - X_NIBBLE_MASK: 128 bits register stored nibble mask, should be loaded earlier.
// - tmp: 128 bits temp register
#define AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp) \
VPAND X_NIBBLE_MASK, x, tmp; \
VMOVDQU m1_low<>(SB), y; \
@ -188,8 +265,14 @@ GLOBL fk_mask<>(SB), RODATA, $16
VPSHUFB x, tmp, x; \
VPXOR y, x, x
#define AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp) \
AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp); \
// SM4 TAO L1 function, AVX version
// parameters:
// - x: 128 bits register as sbox input/output data
// - y: 128 bits temp register
// - xNibbleMask: 128 bits register stored nibble mask, should be loaded earlier.
// - tmp: 128 bits temp register
#define AVX_SM4_TAO_L1(x, y, xNibbleMask, tmp) \
AVX_SM4_SBOX(x, y, xNibbleMask, tmp); \
VMOVDQU r08_mask<>(SB), tmp; \
VPSHUFB tmp, x, y; \
VPXOR x, y, y; \
@ -204,6 +287,14 @@ GLOBL fk_mask<>(SB), RODATA, $16
VPXOR y, x, x; \
VPXOR x, tmp, x
// transpose matrix function, AVX/AVX2 version
// parameters:
// - r0: 128/256 bits register as input/output data
// - r1: 128/256 bits register as input/output data
// - r2: 128/256 bits register as input/output data
// - r3: 128/256 bits register as input/output data
// - tmp1: 128/256 bits temp register
// - tmp2: 128/256 bits temp register
#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2]
VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0]
@ -214,42 +305,60 @@ GLOBL fk_mask<>(SB), RODATA, $16
VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w27, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3]
VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2]
#define AVX2_SM4_SBOX(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp) \
VPAND yNibbleMask, x, tmp; \
VBROADCASTI128 m1_low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND yNibbleMask, x, x; \
VBROADCASTI128 m1_high<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x; \
VBROADCASTI128 inverse_shift_rows<>(SB), tmp; \
VPSHUFB tmp, x, x; \
VEXTRACTI128 $1, x, yw \
VAESENCLAST xNibbleMask, xw, xw; \
VAESENCLAST xNibbleMask, yw, yw; \
VINSERTI128 $1, yw, x, x; \
VPANDN yNibbleMask, x, tmp; \
VBROADCASTI128 m2_low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND yNibbleMask, x, x; \
VBROADCASTI128 m2_high<>(SB), tmp; \
VPSHUFB x, tmp, x; \
// SM4 sbox function, AVX2 version
// parameters:
// - x: 256 bits register as sbox input/output data
// - y: 256 bits temp register
// - z: 256 bits temp register
// - xw: 128 bits temp register
// - yw: 128 bits temp register
// - xNibbleMask: 128 bits register stored nibble mask, should be loaded earlier.
// - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier.
#define AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
VPAND yNibbleMask, x, z; \
VBROADCASTI128 m1_low<>(SB), y; \
VPSHUFB z, y, y; \
VPSRLQ $4, x, x; \
VPAND yNibbleMask, x, x; \
VBROADCASTI128 m1_high<>(SB), z; \
VPSHUFB x, z, x; \
VPXOR y, x, x; \
VBROADCASTI128 inverse_shift_rows<>(SB), z; \
VPSHUFB z, x, x; \
VEXTRACTI128 $1, x, yw \
VAESENCLAST xNibbleMask, xw, xw; \
VAESENCLAST xNibbleMask, yw, yw; \
VINSERTI128 $1, yw, x, x; \
VPANDN yNibbleMask, x, z; \
VBROADCASTI128 m2_low<>(SB), y; \
VPSHUFB z, y, y; \
VPSRLQ $4, x, x; \
VPAND yNibbleMask, x, x; \
VBROADCASTI128 m2_high<>(SB), z; \
VPSHUFB x, z, x; \
VPXOR y, x, x
#define AVX2_SM4_TAO_L1(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp) \
AVX2_SM4_SBOX(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp); \
VBROADCASTI128 r08_mask<>(SB), tmp; \
VPSHUFB tmp, x, y; \
VPXOR x, y, y; \
VBROADCASTI128 r16_mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR tmp, y, y; \
VPSLLD $2, y, tmp; \
VPSRLD $30, y, y; \
VPXOR tmp, y, y; \
VBROADCASTI128 r24_mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR y, x, x; \
VPXOR x, tmp, x
// SM4 TAO L1 function, AVX2 version
// parameters:
// - x: 256 bits register as sbox input/output data
// - y: 256 bits temp register
// - z: 256 bits temp register
// - xw: 128 bits temp register
// - yw: 128 bits temp register
// - xNibbleMask: 128 bits register stored nibble mask, should be loaded earlier.
// - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier.
#define AVX2_SM4_TAO_L1(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask); \
VBROADCASTI128 r08_mask<>(SB), z; \
VPSHUFB z, x, y; \
VPXOR x, y, y; \
VBROADCASTI128 r16_mask<>(SB), z; \
VPSHUFB z, x, z; \
VPXOR z, y, y; \
VPSLLD $2, y, z; \
VPSRLD $30, y, y; \
VPXOR z, y, y; \
VBROADCASTI128 r24_mask<>(SB), z; \
VPSHUFB z, x, z; \
VPXOR y, x, x; \
VPXOR x, z, x

View File

@ -43,6 +43,45 @@ DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
#define LOAD_SM4_AESNI_CONSTS() \
LDP nibble_mask<>(SB), (R20, R21) \
VMOV R20, NIBBLE_MASK.D[0] \
VMOV R21, NIBBLE_MASK.D[1] \
LDP m1_low<>(SB), (R20, R21) \
VMOV R20, M1L.D[0] \
VMOV R21, M1L.D[1] \
LDP m1_high<>(SB), (R20, R21) \
VMOV R20, M1H.D[0] \
VMOV R21, M1H.D[1] \
LDP m2_low<>(SB), (R20, R21) \
VMOV R20, M2L.D[0] \
VMOV R21, M2L.D[1] \
LDP m2_high<>(SB), (R20, R21) \
VMOV R20, M2H.D[0] \
VMOV R21, M2H.D[1] \
LDP inverse_shift_rows<>(SB), (R20, R21) \
VMOV R20, INVERSE_SHIFT_ROWS.D[0] \
VMOV R21, INVERSE_SHIFT_ROWS.D[1] \
LDP r08_mask<>(SB), (R20, R21) \
VMOV R20, R08_MASK.D[0] \
VMOV R21, R08_MASK.D[1] \
LDP r16_mask<>(SB), (R20, R21) \
VMOV R20, R16_MASK.D[0] \
VMOV R21, R16_MASK.D[1] \
LDP r24_mask<>(SB), (R20, R21) \
VMOV R20, R24_MASK.D[0] \
VMOV R21, R24_MASK.D[1]
// input: from high to low
// t0 = t0.S3, t0.S2, t0.S1, t0.S0
// t1 = t1.S3, t1.S2, t1.S1, t1.S0
// t2 = t2.S3, t2.S2, t2.S1, t2.S0
// t3 = t3.S3, t3.S2, t3.S1, t3.S0
// output: from high to low
// t0 = t3.S0, t2.S0, t1.S0, t0.S0
// t1 = t3.S1, t2.S1, t1.S1, t0.S1
// t2 = t3.S2, t2.S2, t1.S2, t0.S2
// t3 = t3.S3, t2.S3, t1.S3, t0.S3
#define PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \
VMOV t0.B16, K.B16 \
VMOV t1.S[0], t0.S[1] \
@ -60,6 +99,16 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
VMOV t3.S[2], t2.S[3] \
VMOV K.S[3], t3.S[2]
// input: from high to low
// t0 = t0.S3, t0.S2, t0.S1, t0.S0
// t1 = t1.S3, t1.S2, t1.S1, t1.S0
// t2 = t2.S3, t2.S2, t2.S1, t2.S0
// t3 = t3.S3, t3.S2, t3.S1, t3.S0
// output: from high to low
// t0 = t0.S0, t1.S0, t2.S0, t3.S0
// t1 = t0.S1, t1.S1, t2.S1, t3.S1
// t2 = t0.S2, t1.S2, t2.S2, t3.S2
// t3 = t0.S3, t1.S3, t2.S3, t3.S3
#define TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \
VMOV t0.B16, K.B16 \
VMOV t3.S[0], t0.S[0] \
@ -80,6 +129,11 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
VMOV t2.S[2], t2.S[1] \
VMOV K.S[2], t2.S[2]
// SM4 sbox function
// parameters:
// - x: 128 bits register as sbox input/output data
// - y: 128 bits temp register
// - z: 128 bits temp register
#define SM4_SBOX(x, y, z) \
; \
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
@ -97,6 +151,11 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
VTBL z.B16, [M2H.B16], z.B16; \
VEOR y.B16, z.B16, x.B16
// SM4 TAO L1 function
// parameters:
// - x: 128 bits register as TAO_L1 input/output data
// - y: 128 bits temp register
// - z: 128 bits temp register
#define SM4_TAO_L1(x, y, z) \
SM4_SBOX(x, y, z); \
VTBL R08_MASK.B16, [x.B16], y.B16; \
@ -109,3 +168,24 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
VTBL R24_MASK.B16, [x.B16], z.B16; \
VEOR z.B16, x.B16, x.B16; \
VEOR y.B16, x.B16, x.B16
// SM4 round function
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - RK: round key register
// - tmp32: temp 32/64 bits register
// - x: 128 bits temp register
// - y: 128 bits temp register
// - z: 128 bits temp register
// - t0: 128 bits register for data as result
// - t1: 128 bits register for data
// - t2: 128 bits register for data
// - t3: 128 bits register for data
#define SM4_ROUND(RK, tmp32, x, y, z, t0, t1, t2, t3) \
MOVW.P 4(RK), tmp32; \
VMOV tmp32, x.S4; \
VEOR t1.B16, x.B16, x.B16; \
VEOR t2.B16, x.B16, x.B16; \
VEOR t3.B16, x.B16, x.B16; \
SM4_TAO_L1(x, y, z); \
VEOR x.B16, t0.B16, t0.B16

View File

@ -16,6 +16,12 @@
#include "aesni_amd64.h" #include "aesni_amd64.h"
// SM4 TAO L2 function, used for key expansion
// parameters:
// - x: 128 bits register as TAO_L2 input/output data
// - y: 128 bits temp register
// - tmp1: 128 bits temp register
// - tmp2: 128 bits temp register
#define SM4_TAO_L2(x, y, tmp1, tmp2) \
SM4_SBOX(x, y, tmp1); \
; \ //#################### 4 parallel L2 linear transforms ##################//
@ -31,23 +37,16 @@
PXOR tmp2, y; \
PXOR y, x
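SM4_TAO_L2 differs from SM4_TAO_L1 only in the linear layer: key expansion uses L2(b) = b ^ (b<<<13) ^ (b<<<23). A scalar Go sketch of the transform and of the round-key derivation it feeds (helper names are mine; the S-box table and the CK constant are assumed to come from elsewhere):

// l2 is SM4's key-schedule linear transform:
// L2(b) = b ^ (b <<< 13) ^ (b <<< 23).
func l2(b uint32) uint32 {
	return b ^ (b<<13 | b>>19) ^ (b<<23 | b>>9)
}

// expandRound derives one round key: k0 ^ L2(tau(k1 ^ k2 ^ k3 ^ ck)),
// the value SM4_EXPANDKEY_ROUND accumulates into t0 below.
func expandRound(k0, k1, k2, k3, ck uint32, sbox *[256]byte) uint32 {
	x := k1 ^ k2 ^ k3 ^ ck
	x = uint32(sbox[x>>24])<<24 | uint32(sbox[x>>16&0xff])<<16 |
		uint32(sbox[x>>8&0xff])<<8 | uint32(sbox[x&0xff])
	return k0 ^ l2(x)
}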
#define SM4_ROUND(index, x, y, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(AX)(CX*1), x; \
PSHUFD $0, x, x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L1(x, y, XTMP6); \
PXOR x, t0
#define SM4_SINGLE_ROUND(index, x, y, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(AX)(CX*1), x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L1(x, y, XTMP6); \
PXOR x, t0
// SM4 expand round function
// t0 ^= tao_l2(t1^t2^t3^ck) and store t0.S[0] to enc/dec
// parameters:
// - index: round key index immediate number
// - x: 128 bits temp register
// - y: 128 bits temp register
// - t0: 128 bits register for data
// - t1: 128 bits register for data
// - t2: 128 bits register for data
// - t3: 128 bits register for data
#define SM4_EXPANDKEY_ROUND(index, x, y, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(BX)(CX*1), x; \
PXOR t1, x; \
@ -89,14 +88,34 @@
#define XWORD X8
#define YWORD X9
// SM4 round function, AVX2 version, handles 256 bits of data
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - index: round key index immediate number
// - x: 256 bits temp register
// - y: 256 bits temp register
// - t0: 256 bits register for data as result
// - t1: 256 bits register for data
// - t2: 256 bits register for data
// - t3: 256 bits register for data
#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(AX)(CX*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX2_SM4_TAO_L1(x, y, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK, XDWTMP0); \
AVX2_SM4_TAO_L1(x, y, XDWTMP0, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK); \
VPXOR x, t0, t0
// SM4 round function, AVX version, handles 128 bits of data
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - index: round key index immediate number
// - x: 128 bits temp register
// - y: 128 bits temp register
// - t0: 128 bits register for data as result
// - t1: 128 bits register for data
// - t2: 128 bits register for data
// - t3: 128 bits register for data
#define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(AX)(CX*1), x; \
VPXOR t1, x, x; \
@ -174,10 +193,10 @@ non_avx2_start:
XORL CX, CX
loop:
SM4_ROUND(0, x, y, t0, t1, t2, t3)
SM4_ROUND(1, x, y, t1, t2, t3, t0)
SM4_ROUND(2, x, y, t2, t3, t0, t1)
SM4_ROUND(3, x, y, t3, t0, t1, t2)
SM4_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2)
ADDL $16, CX
CMPL CX, $4*32
@ -328,10 +347,10 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
XORL CX, CX
loop:
SM4_SINGLE_ROUND(0, x, y, t0, t1, t2, t3)
SM4_SINGLE_ROUND(1, x, y, t1, t2, t3, t0)
SM4_SINGLE_ROUND(2, x, y, t2, t3, t0, t1)
SM4_SINGLE_ROUND(3, x, y, t3, t0, t1, t2)
SM4_SINGLE_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3)
SM4_SINGLE_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0)
SM4_SINGLE_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1)
SM4_SINGLE_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2)
ADDL $16, CX
CMPL CX, $4*32
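The loops above issue the round macro four times per iteration with the roles of t0..t3 rotated, and run eight iterations (CX advances by 16 up to 4*32) for SM4's 32 rounds. A Go reference of the same rotation pattern, using the round sketch given earlier:

// encrypt32Rounds shows the rotation pattern of the loops above: each
// iteration runs four rounds, cycling which word is updated; rk holds
// the 32 expanded round keys. SM4's final reverse transform (output as
// t3, t2, t1, t0) happens outside this loop, as it does in the assembly.
func encrypt32Rounds(rk *[32]uint32, t0, t1, t2, t3 uint32, sbox *[256]byte) (uint32, uint32, uint32, uint32) {
	for i := 0; i < 32; i += 4 {
		t0 = round(t0, t1, t2, t3, rk[i], sbox)
		t1 = round(t1, t2, t3, t0, rk[i+1], sbox)
		t2 = round(t2, t3, t0, t1, rk[i+2], sbox)
		t3 = round(t3, t0, t1, t2, rk[i+3], sbox)
	}
	return t0, t1, t2, t3
}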

View File

@ -37,15 +37,6 @@
VEOR XTMP7.B16, y.B16, y.B16; \
VEOR x.B16, y.B16, x.B16
#define SM4_ROUND(RK, x, y, t0, t1, t2, t3) \
MOVW.P 4(RK), R19; \
VMOV R19, x.S4; \
VEOR t1.B16, x.B16, x.B16; \
VEOR t2.B16, x.B16, x.B16; \
VEOR t3.B16, x.B16, x.B16; \
SM4_TAO_L1(x, y, XTMP6); \
VEOR x.B16, t0.B16, t0.B16
#define SM4_EXPANDKEY_ROUND(x, y, t0, t1, t2, t3) \
MOVW.P 4(R9), R19; \
VMOV R19, x.S[0]; \
@ -226,10 +217,10 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
EOR R0, R0
encryptBlocksLoop:
SM4_ROUND(R8, x, y, t0, t1, t2, t3)
SM4_ROUND(R8, x, y, t1, t2, t3, t0)
SM4_ROUND(R8, x, y, t2, t3, t0, t1)
SM4_ROUND(R8, x, y, t3, t0, t1, t2)
SM4_ROUND(R8, R19, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(R8, R19, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(R8, R19, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(R8, R19, x, y, XTMP6, t3, t0, t1, t2)
ADD $16, R0
CMP $128, R0
@ -297,10 +288,10 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
EOR R0, R0
encryptBlockLoop:
SM4_ROUND(R8, x, y, t0, t1, t2, t3)
SM4_ROUND(R8, x, y, t1, t2, t3, t0)
SM4_ROUND(R8, x, y, t2, t3, t0, t1)
SM4_ROUND(R8, x, y, t3, t0, t1, t2)
SM4_ROUND(R8, R19, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(R8, R19, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(R8, R19, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(R8, R19, x, y, XTMP6, t3, t0, t1, t2)
ADD $16, R0
CMP $128, R0

View File

@ -15,14 +15,6 @@
#include "aesni_amd64.h" #include "aesni_amd64.h"
#define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(RK)(IND*1), x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L1(x, y, z); \
PXOR x, t0
// func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
TEXT ·encryptBlocksChain(SB),NOSPLIT,$0
#define ctx BX

View File

@ -25,50 +25,6 @@
#include "aesni_arm64.h" #include "aesni_arm64.h"
#define SM4_ROUND(RK, x, y, z, t0, t1, t2, t3) \
MOVW.P 4(RK), R19; \
VMOV R19, x.S4; \
VEOR t1.B16, x.B16, x.B16; \
VEOR t2.B16, x.B16, x.B16; \
VEOR t3.B16, x.B16, x.B16; \
SM4_TAO_L1(x, y, z); \
VEOR x.B16, t0.B16, t0.B16
#define load_global_data_1() \
LDP nibble_mask<>(SB), (R0, R1) \
VMOV R0, NIBBLE_MASK.D[0] \
VMOV R1, NIBBLE_MASK.D[1] \
LDP m1_low<>(SB), (R0, R1) \
VMOV R0, M1L.D[0] \
VMOV R1, M1L.D[1] \
LDP m1_high<>(SB), (R0, R1) \
VMOV R0, M1H.D[0] \
VMOV R1, M1H.D[1] \
LDP m2_low<>(SB), (R0, R1) \
VMOV R0, M2L.D[0] \
VMOV R1, M2L.D[1] \
LDP m2_high<>(SB), (R0, R1) \
VMOV R0, M2H.D[0] \
VMOV R1, M2H.D[1] \
LDP fk_mask<>(SB), (R0, R1) \
VMOV R0, FK_MASK.D[0] \
VMOV R1, FK_MASK.D[1] \
LDP inverse_shift_rows<>(SB), (R0, R1) \
VMOV R0, INVERSE_SHIFT_ROWS.D[0] \
VMOV R1, INVERSE_SHIFT_ROWS.D[1]
#define load_global_data_2() \
load_global_data_1() \
LDP r08_mask<>(SB), (R0, R1) \
VMOV R0, R08_MASK.D[0] \
VMOV R1, R08_MASK.D[1] \
LDP r16_mask<>(SB), (R0, R1) \
VMOV R0, R16_MASK.D[0] \
VMOV R1, R16_MASK.D[1] \
LDP r24_mask<>(SB), (R0, R1) \
VMOV R0, R24_MASK.D[0] \
VMOV R1, R24_MASK.D[1]
// func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
TEXT ·encryptBlocksChain(SB),NOSPLIT,$0
#define ctx R1
@ -76,55 +32,55 @@ TEXT ·encryptBlocksChain(SB),NOSPLIT,$0
#define ptxLen R4
#define rkSave R8
load_global_data_2()
LOAD_SM4_AESNI_CONSTS()
MOVD xk+0(FP), rkSave
MOVD dst+8(FP), ctx
MOVD src+32(FP), ptx
MOVD src_len+40(FP), ptxLen
MOVD iv+56(FP), R5
VEOR ZERO.B16, ZERO.B16, ZERO.B16
VLD1 (R5), [IV.B16]
loopSrc:
CMP $16, ptxLen
BLT done_sm4
SUB $16, ptxLen
VLD1.P (ptx), [t0.S4]
VEOR IV.B16, t0.B16, t0.B16
VREV32 t0.B16, t0.B16
VMOV t0.S[1], t1.S[0]
VMOV t0.S[2], t2.S[0]
VMOV t0.S[3], t3.S[0]
EOR R2, R2
MOVD rkSave, R0
encryptBlockLoop:
SM4_ROUND(R0, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(R0, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(R0, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(R0, x, y, XTMP6, t3, t0, t1, t2)
SM4_ROUND(R0, R19, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(R0, R19, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(R0, R19, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(R0, R19, x, y, XTMP6, t3, t0, t1, t2)
ADD $16, R2
CMP $128, R2
BNE encryptBlockLoop
VMOV t2.S[0], t3.S[1]
VMOV t1.S[0], t3.S[2]
VMOV t0.S[0], t3.S[3]
VREV32 t3.B16, t3.B16
VST1.P [t3.B16], (ctx)
VMOV t3.B16, IV.B16
B loopSrc
done_sm4:
VST1 [IV.B16], (R5)
RET
#undef ctx
#undef ptx
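encryptBlocksChain is plain CBC encryption: each 16-byte block is XORed with the running chaining value, encrypted, written out, and carried forward as the next IV, which is stored back through R5 at the end. A Go sketch of the chaining structure (blockEncrypt stands in for the single-block SM4 loop; names are illustrative):

// cbcEncrypt mirrors the loopSrc structure above: XOR the chaining value
// with the plaintext block, encrypt it, emit it, and reuse it as the IV
// for the next block. len(src) is assumed to be a multiple of 16.
func cbcEncrypt(blockEncrypt func(block []byte), dst, src, iv []byte) {
	chain := make([]byte, 16)
	copy(chain, iv)
	for len(src) >= 16 {
		for i := 0; i < 16; i++ {
			chain[i] ^= src[i]
		}
		blockEncrypt(chain)   // encrypt in place
		copy(dst[:16], chain) // ciphertext out
		dst, src = dst[16:], src[16:]
	}
	copy(iv, chain) // write back the final chaining value (VST1 [IV.B16], (R5))
}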

View File

@ -155,23 +155,6 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
#undef plen
#undef dlen
#define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(RK)(IND*1), x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L1(x, y, z); \
PXOR x, t0
#define SM4_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(RK)(IND*1), x; \
PSHUFD $0, x, x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L1(x, y, z); \
PXOR x, t0
#define SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3) \
PSHUFB flip_mask<>(SB), t0; \
PSHUFB flip_mask<>(SB), t1; \
@ -229,7 +212,7 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX2_SM4_TAO_L1(x, y, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK, tmp); \
AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
VPXOR x, t0, t0
#define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \

View File

@ -111,44 +111,6 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
#include "aesni_arm64.h" #include "aesni_arm64.h"
#define LOAD_SM4_AESNI_CONSTS() \
LDP nibble_mask<>(SB), (R20, R21) \
VMOV R20, NIBBLE_MASK.D[0] \
VMOV R21, NIBBLE_MASK.D[1] \
LDP m1_low<>(SB), (R20, R21) \
VMOV R20, M1L.D[0] \
VMOV R21, M1L.D[1] \
LDP m1_high<>(SB), (R20, R21) \
VMOV R20, M1H.D[0] \
VMOV R21, M1H.D[1] \
LDP m2_low<>(SB), (R20, R21) \
VMOV R20, M2L.D[0] \
VMOV R21, M2L.D[1] \
LDP m2_high<>(SB), (R20, R21) \
VMOV R20, M2H.D[0] \
VMOV R21, M2H.D[1] \
LDP inverse_shift_rows<>(SB), (R20, R21) \
VMOV R20, INVERSE_SHIFT_ROWS.D[0] \
VMOV R21, INVERSE_SHIFT_ROWS.D[1] \
LDP r08_mask<>(SB), (R20, R21) \
VMOV R20, R08_MASK.D[0] \
VMOV R21, R08_MASK.D[1] \
LDP r16_mask<>(SB), (R20, R21) \
VMOV R20, R16_MASK.D[0] \
VMOV R21, R16_MASK.D[1] \
LDP r24_mask<>(SB), (R20, R21) \
VMOV R20, R24_MASK.D[0] \
VMOV R21, R24_MASK.D[1]
#define SM4_ROUND(RK, x, y, z, t0, t1, t2, t3) \
MOVW.P 4(RK), R19; \
VMOV R19, x.S4; \
VEOR t1.B16, x.B16, x.B16; \
VEOR t2.B16, x.B16, x.B16; \
VEOR t3.B16, x.B16, x.B16; \
SM4_TAO_L1(x, y, z); \
VEOR x.B16, t0.B16, t0.B16
// func gcmSm4Init(productTable *[256]byte, rk []uint32, inst int)
TEXT ·gcmSm4Init(SB),NOSPLIT,$0
#define pTbl R0
@ -178,10 +140,10 @@ TEXT ·gcmSm4Init(SB),NOSPLIT,$0
EOR R3, R3
sm4InitEncLoop:
SM4_ROUND(RK, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(RK, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(RK, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(RK, K0, K1, K2, B3, B0, B1, B2)
SM4_ROUND(RK, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(RK, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(RK, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(RK, R19, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R3
CMP $8, R3
@ -491,10 +453,10 @@ encOctetsLoop:
MOVD rkSave, rk
encOctetsEnc4Blocks1:
SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
@ -509,10 +471,10 @@ encOctetsEnc4Blocks1:
MOVD rkSave, rk
encOctetsEnc4Blocks2:
SM4_ROUND(rk, K0, K1, K2, B4, B5, B6, B7)
SM4_ROUND(rk, K0, K1, K2, B5, B6, B7, B4)
SM4_ROUND(rk, K0, K1, K2, B6, B7, B4, B5)
SM4_ROUND(rk, K0, K1, K2, B7, B4, B5, B6)
SM4_ROUND(rk, R19, K0, K1, K2, B4, B5, B6, B7)
SM4_ROUND(rk, R19, K0, K1, K2, B5, B6, B7, B4)
SM4_ROUND(rk, R19, K0, K1, K2, B6, B7, B4, B5)
SM4_ROUND(rk, R19, K0, K1, K2, B7, B4, B5, B6)
ADD $1, R13
CMP $16, R13
@ -586,10 +548,10 @@ encNibblesLoop:
MOVD rkSave, rk
encNibblesEnc4Blocks:
SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
@ -631,10 +593,10 @@ encStartSingles:
MOVD rkSave, rk
encSinglesEnc4Blocks:
SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
@ -783,10 +745,10 @@ decOctetsLoop:
MOVD rkSave, rk
decOctetsEnc4Blocks1:
SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
@ -802,10 +764,10 @@ decOctetsEnc4Blocks1:
MOVD rkSave, rk
decOctetsEnc4Blocks2:
SM4_ROUND(rk, K0, K1, K2, B4, B5, B6, B7)
SM4_ROUND(rk, K0, K1, K2, B5, B6, B7, B4)
SM4_ROUND(rk, K0, K1, K2, B6, B7, B4, B5)
SM4_ROUND(rk, K0, K1, K2, B7, B4, B5, B6)
SM4_ROUND(rk, R19, K0, K1, K2, B4, B5, B6, B7)
SM4_ROUND(rk, R19, K0, K1, K2, B5, B6, B7, B4)
SM4_ROUND(rk, R19, K0, K1, K2, B6, B7, B4, B5)
SM4_ROUND(rk, R19, K0, K1, K2, B7, B4, B5, B6)
ADD $1, R13
CMP $16, R13
@ -880,10 +842,10 @@ decNibblesLoop:
MOVD rkSave, rk
decNibblesEnc4Blocks:
SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
@ -928,10 +890,10 @@ decStartSingles:
MOVD rkSave, rk
decSinglesEnc4Blocks:
SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13