sm4: format asm code

Sun Yimin 2022-07-22 10:08:15 +08:00 committed by GitHub
parent ff434b7bd7
commit acffd83cc9
8 changed files with 548 additions and 547 deletions


@@ -67,189 +67,189 @@ GLOBL fk_mask<>(SB), RODATA, $16
// PUNPCKLQDQ r2, tmp2;
// MOVOU tmp2, r2
#define SSE_TRANSPOSE_MATRIX(r, r0, r1, r2, r3, tmp1, tmp2) \
    PEXTRD $2, r0, r; \
    PINSRD $0, r, tmp2; \
    PEXTRD $2, r1, r; \
    PINSRD $1, r, tmp2; \
    ; \
    PEXTRD $3, r0, r; \
    PINSRD $2, r, tmp2; \
    PEXTRD $3, r1, r; \
    PINSRD $3, r, tmp2; \ // tmp2 = [w7, w3, w6, w2]
    ; \
    PEXTRD $1, r0, r; \
    PINSRD $2, r, r0; \
    PEXTRD $0, r1, r; \
    PINSRD $1, r, r0; \
    PEXTRD $1, r1, r; \
    PINSRD $3, r, r0; \ // r0 = [w5, w1, w4, w0]
    ; \
    PEXTRD $0, r2, r; \
    PINSRD $0, r, tmp1; \
    PEXTRD $0, r3, r; \
    PINSRD $1, r, tmp1; \
    PEXTRD $1, r2, r; \
    PINSRD $2, r, tmp1; \
    PEXTRD $1, r3, r; \
    PINSRD $3, r, tmp1; \ // tmp1 = [w13, w9, w12, w8]
    ; \
    PEXTRD $2, r2, r; \
    PINSRD $0, r, r2; \
    PEXTRD $2, r3, r; \
    PINSRD $1, r, r2; \
    PEXTRD $3, r2, r; \
    PINSRD $2, r, r2; \
    PEXTRD $3, r3, r; \
    PINSRD $3, r, r2; \ // r2 = [w15, w11, w14, w10]
    ; \
    MOVOU r0, r1; \
    PEXTRQ $1, r1, r; \
    PINSRQ $0, r, r1; \
    PEXTRQ $1, tmp1, r; \
    PINSRQ $1, r, r1; \ // r1 = [w13, w9, w5, w1]
    ; \
    PEXTRQ $0, tmp1, r; \
    PINSRQ $1, r, r0; \ // r0 = [w12, w8, w4, w0]
    ; \
    MOVOU tmp2, r3; \
    PEXTRQ $1, r3, r; \
    PINSRQ $0, r, r3; \
    PEXTRQ $1, r2, r; \
    PINSRQ $1, r, r3; \ // r3 = [w15, w11, w7, w3]
    ; \
    PEXTRQ $0, r2, r; \
    PINSRQ $1, r, r2; \
    PEXTRQ $0, tmp2, r; \
    PINSRQ $0, r, r2
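
The PEXTRD/PINSRD/PEXTRQ/PINSRQ sequence above is simply a 4x4 transpose of 32-bit words spread across four XMM registers. A plain-Go sketch of the same data movement (illustrative names, not part of this repository):

    // transpose4x4 swaps rows and columns of a 4x4 matrix of 32-bit words,
    // mirroring the register shuffling done by SSE_TRANSPOSE_MATRIX.
    func transpose4x4(r0, r1, r2, r3 *[4]uint32) {
        m := [4][4]uint32{*r0, *r1, *r2, *r3}
        for i := 0; i < 4; i++ {
            for j := i + 1; j < 4; j++ {
                m[i][j], m[j][i] = m[j][i], m[i][j]
            }
        }
        *r0, *r1, *r2, *r3 = m[0], m[1], m[2], m[3]
    }
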
#define SM4_SBOX(x, y, z) \
    ; \ //############################# inner affine ############################//
    MOVOU x, z; \
    PAND nibble_mask<>(SB), z; \ //y = _mm_and_si128(x, c0f);
    MOVOU m1_low<>(SB), y; \
    PSHUFB z, y; \ //y = _mm_shuffle_epi8(m1l, y);
    PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
    PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
    MOVOU m1_high<>(SB), z; \
    PSHUFB x, z; \ //x = _mm_shuffle_epi8(m1h, x);
    MOVOU z, x; \ //x = _mm_shuffle_epi8(m1h, x);
    PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y;
    ; \ // inverse ShiftRows
    PSHUFB inverse_shift_rows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr);
    AESENCLAST nibble_mask<>(SB), x; \ // AESNI instruction
    ; \ //############################# outer affine ############################//
    MOVOU x, z; \
    PANDN nibble_mask<>(SB), z; \ //z = _mm_andnot_si128(x, c0f);
    MOVOU m2_low<>(SB), y; \
    PSHUFB z, y; \ //y = _mm_shuffle_epi8(m2l, z)
    PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
    PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
    MOVOU m2_high<>(SB), z; \
    PSHUFB x, z; \
    MOVOU z, x; \ //x = _mm_shuffle_epi8(m2h, x)
    PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y;
#define SM4_TAO_L1(x, y, z) \
    SM4_SBOX(x, y, z); \
    ; \ //#################### 4 parallel L1 linear transforms ##################//
    MOVOU x, y; \
    PSHUFB r08_mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08)
    PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08)
    MOVOU x, z; \
    PSHUFB r16_mask<>(SB), z; \
    PXOR z, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16)
    MOVOU y, z; \
    PSLLL $2, z; \
    PSRLL $30, y; \
    POR z, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30);
    MOVOU x, z; \
    PSHUFB r24_mask<>(SB), z; \
    PXOR y, x; \ //x = x xor y
    PXOR z, x //x = x xor y xor _mm_shuffle_epi8(x, r24);
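
Assuming the r08/r16/r24 shuffle masks rotate every 32-bit lane left by 8, 16 and 24 bits (as their names suggest), the shuffle/shift/XOR sequence in SM4_TAO_L1 is SM4's L linear transform applied per lane after the S-box. A scalar Go sketch for one word:

    import "math/bits"

    // sm4L is the SM4 linear transform L computed by SM4_TAO_L1 per lane:
    // the r08/r16 shuffles plus the 2-bit rotate build b<<<2 ^ b<<<10 ^ b<<<18,
    // and the final XORs add b and b<<<24.
    func sm4L(b uint32) uint32 {
        return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
            bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
    }
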
#define AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp) \
    VPAND X_NIBBLE_MASK, x, tmp; \
    VMOVDQU m1_low<>(SB), y; \
    VPSHUFB tmp, y, y; \
    VPSRLQ $4, x, x; \
    VPAND X_NIBBLE_MASK, x, x; \
    VMOVDQU m1_high<>(SB), tmp; \
    VPSHUFB x, tmp, x; \
    VPXOR y, x, x; \
    VMOVDQU inverse_shift_rows<>(SB), tmp; \
    VPSHUFB tmp, x, x; \
    VAESENCLAST X_NIBBLE_MASK, x, x; \
    VPANDN X_NIBBLE_MASK, x, tmp; \
    VMOVDQU m2_low<>(SB), y; \
    VPSHUFB tmp, y, y; \
    VPSRLQ $4, x, x; \
    VPAND X_NIBBLE_MASK, x, x; \
    VMOVDQU m2_high<>(SB), tmp; \
    VPSHUFB x, tmp, x; \
    VPXOR y, x, x
#define AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp) \
    AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp); \
    VMOVDQU r08_mask<>(SB), tmp; \
    VPSHUFB tmp, x, y; \
    VPXOR x, y, y; \
    VMOVDQU r16_mask<>(SB), tmp; \
    VPSHUFB tmp, x, tmp; \
    VPXOR tmp, y, y; \
    VPSLLD $2, y, tmp; \
    VPSRLD $30, y, y; \
    VPXOR tmp, y, y; \
    VMOVDQU r24_mask<>(SB), tmp; \
    VPSHUFB tmp, x, tmp; \
    VPXOR y, x, x; \
    VPXOR x, tmp, x
#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
    VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2]
    VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0]
    VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16] tmp1 = [w13, w9, w12, w8]
    VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w27, w30, w22, w27, w19, w26, w18] r2 = [w15, w11, w14, w10]
    VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, w5, w25, w17, w9, w1] r1 = [w13, w9, w5, w1]
    VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0] r0 = [w12, w8, w4, w0]
    VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w27, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3]
    VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2]
#define AVX2_SM4_SBOX(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp) \
    VPAND yNibbleMask, x, tmp; \
    VBROADCASTI128 m1_low<>(SB), y; \
    VPSHUFB tmp, y, y; \
    VPSRLQ $4, x, x; \
    VPAND yNibbleMask, x, x; \
    VBROADCASTI128 m1_high<>(SB), tmp; \
    VPSHUFB x, tmp, x; \
    VPXOR y, x, x; \
    VBROADCASTI128 inverse_shift_rows<>(SB), tmp; \
    VPSHUFB tmp, x, x; \
    VEXTRACTI128 $1, x, yw \
    VAESENCLAST xNibbleMask, xw, xw; \
    VAESENCLAST xNibbleMask, yw, yw; \
    VINSERTI128 $1, yw, x, x; \
    VPANDN yNibbleMask, x, tmp; \
    VBROADCASTI128 m2_low<>(SB), y; \
    VPSHUFB tmp, y, y; \
    VPSRLQ $4, x, x; \
    VPAND yNibbleMask, x, x; \
    VBROADCASTI128 m2_high<>(SB), tmp; \
    VPSHUFB x, tmp, x; \
    VPXOR y, x, x
#define AVX2_SM4_TAO_L1(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp) \
    AVX2_SM4_SBOX(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp); \
    VBROADCASTI128 r08_mask<>(SB), tmp; \
    VPSHUFB tmp, x, y; \
    VPXOR x, y, y; \
    VBROADCASTI128 r16_mask<>(SB), tmp; \
    VPSHUFB tmp, x, tmp; \
    VPXOR tmp, y, y; \
    VPSLLD $2, y, tmp; \
    VPSRLD $30, y, y; \
    VPXOR tmp, y, y; \
    VBROADCASTI128 r24_mask<>(SB), tmp; \
    VPSHUFB tmp, x, tmp; \
    VPXOR y, x, x; \
    VPXOR x, tmp, x


@@ -62,7 +62,7 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
#define TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \
    VMOV t0.B16, K.B16 \
    VMOV t3.S[0], t0.S[0] \
    VMOV t2.S[0], t0.S[1] \
    VMOV t1.S[0], t0.S[2] \
    VMOV K0.S[0], t0.S[3] \
@@ -81,21 +81,21 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
    VMOV K.S[2], t2.S[2]
#define SM4_SBOX(x, y, z) \
    ; \
    VAND x.B16, NIBBLE_MASK.B16, z.B16; \
    VTBL z.B16, [M1L.B16], y.B16; \
    VUSHR $4, x.D2, x.D2; \
    VAND x.B16, NIBBLE_MASK.B16, z.B16; \
    VTBL z.B16, [M1H.B16], z.B16; \
    VEOR y.B16, z.B16, x.B16; \
    VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16; \
    AESE ZERO.B16, x.B16; \
    VAND x.B16, NIBBLE_MASK.B16, z.B16; \
    VTBL z.B16, [M2L.B16], y.B16; \
    VUSHR $4, x.D2, x.D2; \
    VAND x.B16, NIBBLE_MASK.B16, z.B16; \
    VTBL z.B16, [M2H.B16], z.B16; \
    VEOR y.B16, z.B16, x.B16
#define SM4_TAO_L1(x, y, z) \
    SM4_SBOX(x, y, z); \


@@ -16,48 +16,48 @@
#include "aesni_amd64.h"
-#define SM4_TAO_L2(x, y) \
-    SM4_SBOX(x, y, XTMP6); \
+#define SM4_TAO_L2(x, y, tmp1, tmp2) \
+    SM4_SBOX(x, y, tmp1); \
    ; \ //#################### 4 parallel L2 linear transforms ##################//
    MOVOU x, y; \
-    MOVOU x, XTMP6; \
-    PSLLL $13, XTMP6; \
+    MOVOU x, tmp1; \
+    PSLLL $13, tmp1; \
    PSRLL $19, y; \
-    POR XTMP6, y; \ //y = X roll 13
-    PSLLL $10, XTMP6; \
-    MOVOU x, XTMP7; \
-    PSRLL $9, XTMP7; \
-    POR XTMP6, XTMP7; \ //XTMP7 = x roll 23
-    PXOR XTMP7, y; \
+    POR tmp1, y; \ //y = X roll 13
+    PSLLL $10, tmp1; \
+    MOVOU x, tmp2; \
+    PSRLL $9, tmp2; \
+    POR tmp1, tmp2; \ //tmp2 = x roll 23
+    PXOR tmp2, y; \
    PXOR y, x
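
The shift/OR pairs above build two in-lane rotations, so SM4_TAO_L2 reduces to SM4's key-schedule linear transform L'. In scalar Go terms (reusing math/bits as in the earlier sketch):

    // sm4LPrime is the key-schedule linear transform computed by SM4_TAO_L2:
    // y = b<<<13, tmp2 = b<<<23, result = b ^ (b<<<13) ^ (b<<<23).
    func sm4LPrime(b uint32) uint32 {
        return b ^ bits.RotateLeft32(b, 13) ^ bits.RotateLeft32(b, 23)
    }
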
#define SM4_ROUND(index, x, y, t0, t1, t2, t3) \
    PINSRD $0, (index * 4)(AX)(CX*1), x; \
    PSHUFD $0, x, x; \
    PXOR t1, x; \
    PXOR t2, x; \
    PXOR t3, x; \
    SM4_TAO_L1(x, y, XTMP6); \
    PXOR x, t0
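
SM4_ROUND loads one round-key word from the schedule at (AX)(CX), broadcasts it, XORs in the other three state words, runs the S-box and L transform, and folds the result into t0. A scalar sketch of that round shape (sbox is the standard SM4 S-box table, not reproduced here; sm4L is the sketch above):

    // tauSbox applies the SM4 S-box to each byte of x.
    func tauSbox(x uint32, sbox *[256]byte) uint32 {
        return uint32(sbox[x>>24])<<24 | uint32(sbox[x>>16&0xff])<<16 |
            uint32(sbox[x>>8&0xff])<<8 | uint32(sbox[x&0xff])
    }

    // sm4Round is the per-round update that SM4_ROUND / SM4_SINGLE_ROUND
    // implement on XMM registers: t0 ^= L(tau(rk ^ t1 ^ t2 ^ t3)).
    func sm4Round(rk, t0, t1, t2, t3 uint32, sbox *[256]byte) uint32 {
        return t0 ^ sm4L(tauSbox(rk^t1^t2^t3, sbox))
    }
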
#define SM4_SINGLE_ROUND(index, x, y, t0, t1, t2, t3) \
    PINSRD $0, (index * 4)(AX)(CX*1), x; \
    PXOR t1, x; \
    PXOR t2, x; \
    PXOR t3, x; \
    SM4_TAO_L1(x, y, XTMP6); \
    PXOR x, t0
#define SM4_EXPANDKEY_ROUND(index, x, y, t0, t1, t2, t3) \
    PINSRD $0, (index * 4)(BX)(CX*1), x; \
    PXOR t1, x; \
    PXOR t2, x; \
    PXOR t3, x; \
-    SM4_TAO_L2(x, y); \
+    SM4_TAO_L2(x, y, XTMP6, XTMP7); \
    PXOR x, t0; \
    PEXTRD $0, t0, R8; \
    MOVL R8, (index * 4)(DX)(CX*1); \
    MOVL R8, (12 - index * 4)(DI)(SI*1)
#define XDWORD0 Y4
#define XDWORD1 Y5
@@ -89,253 +89,253 @@
#define XWORD X8
#define YWORD X9
#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
    VPBROADCASTD (index * 4)(AX)(CX*1), x; \
    VPXOR t1, x, x; \
    VPXOR t2, x, x; \
    VPXOR t3, x, x; \
    AVX2_SM4_TAO_L1(x, y, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK, XDWTMP0); \
    VPXOR x, t0, t0
#define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
    VPBROADCASTD (index * 4)(AX)(CX*1), x; \
    VPXOR t1, x, x; \
    VPXOR t2, x, x; \
    VPXOR t3, x, x; \
    AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, XWTMP0); \
    VPXOR x, t0, t0
// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
    MOVQ key+0(FP), AX
    MOVQ ck+8(FP), BX
    MOVQ enc+16(FP), DX
    MOVQ dec+24(FP), DI
    MOVUPS 0(AX), t0
    PSHUFB flip_mask<>(SB), t0
    PXOR fk_mask<>(SB), t0
    PSHUFD $1, t0, t1
    PSHUFD $2, t0, t2
    PSHUFD $3, t0, t3
    XORL CX, CX
    MOVL $112, SI
loop:
    SM4_EXPANDKEY_ROUND(0, x, y, t0, t1, t2, t3)
    SM4_EXPANDKEY_ROUND(1, x, y, t1, t2, t3, t0)
    SM4_EXPANDKEY_ROUND(2, x, y, t2, t3, t0, t1)
    SM4_EXPANDKEY_ROUND(3, x, y, t3, t0, t1, t2)
    ADDL $16, CX
    SUBL $16, SI
    CMPL CX, $4*32
    JB loop
expand_end:
    RET
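
Each SM4_EXPANDKEY_ROUND writes the new round key twice: forward into enc via the increasing DX/CX offset and, through the decreasing SI offset, backward into dec, since SM4 decryption consumes the encryption keys in reverse order. A scalar sketch of the same schedule (fk, ck and sbox are the constants from the SM4 standard, assumed supplied; sm4LPrime and tauSbox are the sketches above):

    // expandKeySketch mirrors the expandKeyAsm loop:
    // K[i+4] = K[i] ^ L'(tau(K[i+1] ^ K[i+2] ^ K[i+3] ^ ck[i])),
    // enc[i] receives the new key, dec receives the same keys reversed.
    func expandKeySketch(mk, fk [4]uint32, ck [32]uint32, sbox *[256]byte) (enc, dec [32]uint32) {
        k := [4]uint32{mk[0] ^ fk[0], mk[1] ^ fk[1], mk[2] ^ fk[2], mk[3] ^ fk[3]}
        for i := 0; i < 32; i++ {
            rk := k[0] ^ sm4LPrime(tauSbox(k[1]^k[2]^k[3]^ck[i], sbox))
            k[0], k[1], k[2], k[3] = k[1], k[2], k[3], rk
            enc[i] = rk
            dec[31-i] = rk // mirrored store, as in the asm
        }
        return
    }
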
// func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
    MOVQ xk+0(FP), AX
    MOVQ dst+8(FP), BX
    MOVQ src+32(FP), DX
    MOVQ src_len+40(FP), DI
    CMPB ·useAVX2(SB), $1
    JE avx2
non_avx2_start:
    PINSRD $0, 0(DX), t0
    PINSRD $1, 16(DX), t0
    PINSRD $2, 32(DX), t0
    PINSRD $3, 48(DX), t0
    PSHUFB flip_mask<>(SB), t0
    PINSRD $0, 4(DX), t1
    PINSRD $1, 20(DX), t1
    PINSRD $2, 36(DX), t1
    PINSRD $3, 52(DX), t1
    PSHUFB flip_mask<>(SB), t1
    PINSRD $0, 8(DX), t2
    PINSRD $1, 24(DX), t2
    PINSRD $2, 40(DX), t2
    PINSRD $3, 56(DX), t2
    PSHUFB flip_mask<>(SB), t2
    PINSRD $0, 12(DX), t3
    PINSRD $1, 28(DX), t3
    PINSRD $2, 44(DX), t3
    PINSRD $3, 60(DX), t3
    PSHUFB flip_mask<>(SB), t3
    XORL CX, CX
loop:
    SM4_ROUND(0, x, y, t0, t1, t2, t3)
    SM4_ROUND(1, x, y, t1, t2, t3, t0)
    SM4_ROUND(2, x, y, t2, t3, t0, t1)
    SM4_ROUND(3, x, y, t3, t0, t1, t2)
    ADDL $16, CX
    CMPL CX, $4*32
    JB loop
    PSHUFB flip_mask<>(SB), t3
    PSHUFB flip_mask<>(SB), t2
    PSHUFB flip_mask<>(SB), t1
    PSHUFB flip_mask<>(SB), t0
    MOVUPS t3, 0(BX)
    MOVUPS t2, 16(BX)
    MOVUPS t1, 32(BX)
    MOVUPS t0, 48(BX)
    MOVL 4(BX), R8
    MOVL 8(BX), R9
    MOVL 12(BX), R10
    MOVL 16(BX), R11
    MOVL 32(BX), R12
    MOVL 48(BX), R13
    MOVL R11, 4(BX)
    MOVL R12, 8(BX)
    MOVL R13, 12(BX)
    MOVL R8, 16(BX)
    MOVL R9, 32(BX)
    MOVL R10, 48(BX)
    MOVL 24(BX), R8
    MOVL 28(BX), R9
    MOVL 36(BX), R10
    MOVL 52(BX), R11
    MOVL R10, 24(BX)
    MOVL R11, 28(BX)
    MOVL R8, 36(BX)
    MOVL R9, 52(BX)
    MOVL 44(BX), R8
    MOVL 56(BX), R9
    MOVL R9, 44(BX)
    MOVL R8, 56(BX)
done_sm4:
    RET
avx2:
    VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
    CMPQ DI, $64
    JBE avx2_4blocks
avx2_8blocks:
    VMOVDQU 0(DX), XDWORD0
    VMOVDQU 32(DX), XDWORD1
    VMOVDQU 64(DX), XDWORD2
    VMOVDQU 96(DX), XDWORD3
    VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
    // Apply Byte Flip Mask: LE -> BE
    VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
    VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
    VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
    VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
    XORL CX, CX
avx2_loop:
    AVX2_SM4_ROUND(0, XDWORD, YDWORD, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
    AVX2_SM4_ROUND(1, XDWORD, YDWORD, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
    AVX2_SM4_ROUND(2, XDWORD, YDWORD, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
    AVX2_SM4_ROUND(3, XDWORD, YDWORD, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
    ADDL $16, CX
    CMPL CX, $4*32
    JB avx2_loop
    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
    VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
    VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
    VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
    VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
    VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
    VMOVDQU XDWORD0, 0(BX)
    VMOVDQU XDWORD1, 32(BX)
    VMOVDQU XDWORD2, 64(BX)
    VMOVDQU XDWORD3, 96(BX)
    JMP avx2_sm4_done
avx2_4blocks:
    VMOVDQU 0(DX), XWORD0
    VMOVDQU 16(DX), XWORD1
    VMOVDQU 32(DX), XWORD2
    VMOVDQU 48(DX), XWORD3
    VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK
    VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
    VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
    VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
    VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
    XORL CX, CX
avx_loop:
    AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3)
    AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0)
    AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1)
    AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2)
    ADDL $16, CX
    CMPL CX, $4*32
    JB avx_loop
    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
    VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK
    VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
    VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
    VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
    VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
    VMOVDQU XWORD0, 0(BX)
    VMOVDQU XWORD1, 16(BX)
    VMOVDQU XWORD2, 32(BX)
    VMOVDQU XWORD3, 48(BX)
avx2_sm4_done:
    VZEROUPPER
    RET
// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
    MOVQ xk+0(FP), AX
    MOVQ dst+8(FP), BX
    MOVQ src+16(FP), DX
    MOVUPS (DX), t0
    PSHUFB flip_mask<>(SB), t0
    PSHUFD $1, t0, t1
    PSHUFD $2, t0, t2
    PSHUFD $3, t0, t3
    XORL CX, CX
loop:
    SM4_SINGLE_ROUND(0, x, y, t0, t1, t2, t3)
    SM4_SINGLE_ROUND(1, x, y, t1, t2, t3, t0)
    SM4_SINGLE_ROUND(2, x, y, t2, t3, t0, t1)
    SM4_SINGLE_ROUND(3, x, y, t3, t0, t1, t2)
    ADDL $16, CX
    CMPL CX, $4*32
    JB loop
    PEXTRD $0, t2, R8
    PINSRD $1, R8, t3
@@ -347,4 +347,4 @@ loop:
    MOVUPS t3, (BX)
done_sm4:
    RET


@@ -103,21 +103,21 @@
    VMOV V8.S[1], V11.S[2] \
    VMOV V8.S[0], V11.S[3] \
    VST1.P [V8.S4, V9.S4], 32(R10) \
    VST1 [V10.S4, V11.S4], (R11) \
    SUB $32, R11, R11
#define SM4E_ROUND() \
    VLD1.P 16(R10), [V8.B16] \
    VREV32 V8.B16, V8.B16 \
    WORD $0x0884c0ce \
    WORD $0x2884c0ce \
    WORD $0x4884c0ce \
    WORD $0x6884c0ce \
    WORD $0x8884c0ce \
    WORD $0xa884c0ce \
    WORD $0xc884c0ce \
    WORD $0xe884c0ce \
    VREV32 V8.B16, V8.B16 \
    VST1.P [V8.B16], 16(R9)
// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
@@ -145,14 +145,14 @@ TEXT ·expandKeyAsm(SB),NOSPLIT,$0
    VEOR ZERO.B16, ZERO.B16, ZERO.B16
ksLoop:
    SM4_EXPANDKEY_ROUND(x, y, t0, t1, t2, t3)
    SM4_EXPANDKEY_ROUND(x, y, t1, t2, t3, t0)
    SM4_EXPANDKEY_ROUND(x, y, t2, t3, t0, t1)
    SM4_EXPANDKEY_ROUND(x, y, t3, t0, t1, t2)
    ADD $16, R0
    CMP $128, R0
    BNE ksLoop
    RET
sm4ekey:
@@ -226,14 +226,14 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
    EOR R0, R0
encryptBlocksLoop:
    SM4_ROUND(R8, x, y, t0, t1, t2, t3)
    SM4_ROUND(R8, x, y, t1, t2, t3, t0)
    SM4_ROUND(R8, x, y, t2, t3, t0, t1)
    SM4_ROUND(R8, x, y, t3, t0, t1, t2)
    ADD $16, R0
    CMP $128, R0
    BNE encryptBlocksLoop
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
@@ -268,10 +268,11 @@ encryptBlocksLoop:
sm4niblocks:
    VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
    VLD1.P 64(R8), [V4.S4, V5.S4, V6.S4, V7.S4]
sm4niblockloop:
    SM4E_ROUND()
    SUB $16, R12, R12 // message length - 16bytes, then compare with 16bytes
    CBNZ R12, sm4niblockloop
    RET
// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
@@ -296,14 +297,14 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
    EOR R0, R0
encryptBlockLoop:
    SM4_ROUND(R8, x, y, t0, t1, t2, t3)
    SM4_ROUND(R8, x, y, t1, t2, t3, t0)
    SM4_ROUND(R8, x, y, t2, t3, t0, t1)
    SM4_ROUND(R8, x, y, t3, t0, t1, t2)
    ADD $16, R0
    CMP $128, R0
    BNE encryptBlockLoop
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16


@@ -38,45 +38,45 @@ TEXT ·encryptBlocksChain(SB),NOSPLIT,$0
    MOVUPS (SI), IV
loopSrc:
    CMPQ ptxLen, $16
    JB done_sm4
    SUBQ $16, ptxLen
    MOVUPS (ptx), t0
    PXOR IV, t0
    PSHUFB flip_mask<>(SB), t0
    PSHUFD $1, t0, t1
    PSHUFD $2, t0, t2
    PSHUFD $3, t0, t3
    XORL CX, CX
loopRound:
    SM4_SINGLE_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3)
    SM4_SINGLE_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0)
    SM4_SINGLE_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1)
    SM4_SINGLE_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2)
    ADDL $16, CX
    CMPL CX, $4*32
    JB loopRound
    PEXTRD $0, t2, R8
    PINSRD $1, R8, t3
    PEXTRD $0, t1, R8
    PINSRD $2, R8, t3
    PEXTRD $0, t0, R8
    PINSRD $3, R8, t3
    PSHUFB flip_mask<>(SB), t3
    MOVOU t3, IV
    MOVUPS t3, (ctx)
    LEAQ 16(ptx), ptx
    LEAQ 16(ctx), ctx
    JMP loopSrc
done_sm4:
    MOVUPS IV, (SI)
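
The loop above is plain CBC encryption: each 16-byte plaintext block is XORed with the previous ciphertext block (initially the IV) before the SM4 rounds run, and the last ciphertext block is written back as the chaining value. A Go sketch of the same control flow, with encryptBlock standing in for the in-assembly block encryption (illustrative only):

    // cbcEncrypt mirrors encryptBlocksChain: ctx[i] = E(ptx[i] ^ ctx[i-1]),
    // where ctx[-1] is the IV; the final block becomes the new IV.
    func cbcEncrypt(encryptBlock func(dst, src []byte), iv, dst, src []byte) {
        prev := iv
        for len(src) >= 16 {
            var in [16]byte
            for i := range in {
                in[i] = src[i] ^ prev[i]
            }
            encryptBlock(dst[:16], in[:])
            prev = dst[:16]
            dst, src = dst[16:], src[16:]
        }
        copy(iv, prev)
    }
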


@@ -86,42 +86,42 @@ TEXT ·encryptBlocksChain(SB),NOSPLIT,$0
    VEOR ZERO.B16, ZERO.B16, ZERO.B16
    VLD1 (R5), [IV.B16]
loopSrc:
    CMP $16, ptxLen
    BLT done_sm4
    SUB $16, ptxLen
    VLD1.P (ptx), [t0.S4]
    VEOR IV.B16, t0.B16, t0.B16
    VREV32 t0.B16, t0.B16
    VMOV t0.S[1], t1.S[0]
    VMOV t0.S[2], t2.S[0]
    VMOV t0.S[3], t3.S[0]
    EOR R2, R2
    MOVD rkSave, R0
encryptBlockLoop:
    SM4_ROUND(R0, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(R0, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(R0, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(R0, x, y, XTMP6, t3, t0, t1, t2)
    ADD $16, R2
    CMP $128, R2
    BNE encryptBlockLoop
    VMOV t2.S[0], t3.S[1]
    VMOV t1.S[0], t3.S[2]
    VMOV t0.S[0], t3.S[3]
    VREV32 t3.B16, t3.B16
    VST1.P [t3.B16], (ctx)
    VMOV t3.B16, IV.B16
    B loopSrc
done_sm4:
    VST1 [IV.B16], (R5)
    RET


@@ -156,21 +156,21 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
#undef dlen
#define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
    PINSRD $0, (index * 4)(RK)(IND*1), x; \
    PXOR t1, x; \
    PXOR t2, x; \
    PXOR t3, x; \
    SM4_TAO_L1(x, y, z); \
    PXOR x, t0
#define SM4_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
    PINSRD $0, (index * 4)(RK)(IND*1), x; \
    PSHUFD $0, x, x; \
    PXOR t1, x; \
    PXOR t2, x; \
    PXOR t3, x; \
    SM4_TAO_L1(x, y, z); \
    PXOR x, t0
#define SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3) \
    PSHUFB flip_mask<>(SB), t0; \
@@ -225,20 +225,20 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
    PSHUFB BSWAP, t0
#define AVX2_SM4_ROUND(index, RK, IND, x, y, xw, yw, tmp, t0, t1, t2, t3) \
    VPBROADCASTD (index * 4)(RK)(IND*1), x; \
    VPXOR t1, x, x; \
    VPXOR t2, x, x; \
    VPXOR t3, x, x; \
    AVX2_SM4_TAO_L1(x, y, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK, tmp); \
    VPXOR x, t0, t0
#define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \
    VPBROADCASTD (index * 4)(RK)(IND*1), x; \
    VPXOR t1, x, x; \
    VPXOR t2, x, x; \
    VPXOR t3, x, x; \
    AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp); \
    VPXOR x, t0, t0
// func gcmSm4Init(productTable *[256]byte, rk []uint32)
TEXT ·gcmSm4Init(SB),NOSPLIT,$0


@@ -112,33 +112,33 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
#include "aesni_arm64.h"
#define LOAD_SM4_AESNI_CONSTS() \
    LDP nibble_mask<>(SB), (R20, R21) \
    VMOV R20, NIBBLE_MASK.D[0] \
    VMOV R21, NIBBLE_MASK.D[1] \
    LDP m1_low<>(SB), (R20, R21) \
    VMOV R20, M1L.D[0] \
    VMOV R21, M1L.D[1] \
    LDP m1_high<>(SB), (R20, R21) \
    VMOV R20, M1H.D[0] \
    VMOV R21, M1H.D[1] \
    LDP m2_low<>(SB), (R20, R21) \
    VMOV R20, M2L.D[0] \
    VMOV R21, M2L.D[1] \
    LDP m2_high<>(SB), (R20, R21) \
    VMOV R20, M2H.D[0] \
    VMOV R21, M2H.D[1] \
    LDP inverse_shift_rows<>(SB), (R20, R21) \
    VMOV R20, INVERSE_SHIFT_ROWS.D[0] \
    VMOV R21, INVERSE_SHIFT_ROWS.D[1] \
    LDP r08_mask<>(SB), (R20, R21) \
    VMOV R20, R08_MASK.D[0] \
    VMOV R21, R08_MASK.D[1] \
    LDP r16_mask<>(SB), (R20, R21) \
    VMOV R20, R16_MASK.D[0] \
    VMOV R21, R16_MASK.D[1] \
    LDP r24_mask<>(SB), (R20, R21) \
    VMOV R20, R24_MASK.D[0] \
    VMOV R21, R24_MASK.D[1]
#define SM4_ROUND(RK, x, y, z, t0, t1, t2, t3) \
    MOVW.P 4(RK), R19; \