sm4: format asm code

Sun Yimin 2022-07-22 10:08:15 +08:00 committed by GitHub
parent ff434b7bd7
commit acffd83cc9
8 changed files with 548 additions and 547 deletions


@@ -67,189 +67,189 @@ GLOBL fk_mask<>(SB), RODATA, $16
// PUNPCKLQDQ r2, tmp2;
// MOVOU tmp2, r2
#define SSE_TRANSPOSE_MATRIX(r, r0, r1, r2, r3, tmp1, tmp2) \
PEXTRD $2, r0, r; \
PINSRD $0, r, tmp2; \
PEXTRD $2, r1, r; \
PINSRD $1, r, tmp2; \
; \
PEXTRD $3, r0, r; \
PINSRD $2, r, tmp2; \
PEXTRD $3, r1, r; \
PINSRD $3, r, tmp2; \ // tmp2 = [w7, w3, w6, w2]
; \
PEXTRD $1, r0, r; \
PINSRD $2, r, r0; \
PEXTRD $0, r1, r; \
PINSRD $1, r, r0; \
PEXTRD $1, r1, r; \
PINSRD $3, r, r0; \ // r0 = [w5, w1, w4, w0]
; \
PEXTRD $0, r2, r; \
PINSRD $0, r, tmp1; \
PEXTRD $0, r3, r; \
PINSRD $1, r, tmp1; \
PEXTRD $1, r2, r; \
PINSRD $2, r, tmp1; \
PEXTRD $1, r3, r; \
PINSRD $3, r, tmp1; \ // tmp1 = [w13, w9, w12, w8]
; \
PEXTRD $2, r2, r; \
PINSRD $0, r, r2; \
PEXTRD $2, r3, r; \
PINSRD $1, r, r2; \
PEXTRD $3, r2, r; \
PINSRD $2, r, r2; \
PEXTRD $3, r3, r; \
PINSRD $3, r, r2; \ // r2 = [w15, w11, w14, w10]
; \
MOVOU r0, r1; \
PEXTRQ $1, r1, r; \
PINSRQ $0, r, r1; \
PEXTRQ $1, tmp1, r; \
PINSRQ $1, r, r1; \ // r1 = [w13, w9, w5, w1]
; \
PEXTRQ $0, tmp1, r; \
PINSRQ $1, r, r0; \ // r0 = [w12, w8, w4, w0]
; \
MOVOU tmp2, r3; \
PEXTRQ $1, r3, r; \
PINSRQ $0, r, r3; \
PEXTRQ $1, r2, r; \
PINSRQ $1, r, r3; \ // r3 = [w15, w11, w7, w3]
; \
PEXTRQ $0, r2, r; \
PINSRQ $1, r, r2; \
PEXTRQ $0, tmp2, r; \
PINSRQ $0, r, r2
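For reference (not part of this commit): SSE_TRANSPOSE_MATRIX gathers the i-th 32-bit word of each of the four input blocks into one register so that four blocks move through the rounds in lock-step. Its net effect is a plain 4x4 transpose of 32-bit words; a scalar Go sketch:

	// transpose4x4 is a scalar sketch of what SSE_TRANSPOSE_MATRIX does:
	// on input, row i holds the four words of block i; on output, row i
	// holds word i of every block.
	func transpose4x4(m *[4][4]uint32) {
		for i := 0; i < 4; i++ {
			for j := i + 1; j < 4; j++ {
				m[i][j], m[j][i] = m[j][i], m[i][j]
			}
		}
	}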
#define SM4_SBOX(x, y, z) \
; \ //############################# inner affine ############################//
MOVOU x, z; \
PAND nibble_mask<>(SB), z; \ //y = _mm_and_si128(x, c0f);
MOVOU m1_low<>(SB), y; \
PSHUFB z, y; \ //y = _mm_shuffle_epi8(m1l, y);
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
MOVOU m1_high<>(SB), z; \
PSHUFB x, z; \ //x = _mm_shuffle_epi8(m1h, x);
MOVOU z, x; \ //x = _mm_shuffle_epi8(m1h, x);
PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y;
; \ // inverse ShiftRows
PSHUFB inverse_shift_rows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr);
AESENCLAST nibble_mask<>(SB), x; \ // AESNI instruction
; \ //############################# outer affine ############################//
MOVOU x, z; \
PANDN nibble_mask<>(SB), z; \ //z = _mm_andnot_si128(x, c0f);
MOVOU m2_low<>(SB), y; \
PSHUFB z, y; \ //y = _mm_shuffle_epi8(m2l, z)
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
MOVOU m2_high<>(SB), z; \
PSHUFB x, z; \
MOVOU z, x; \ //x = _mm_shuffle_epi8(m2h, x)
PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y;
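SM4_SBOX evaluates the SM4 S-box without a 256-byte table: the low and high nibbles of every byte are looked up in m1_low/m1_high via PSHUFB and XORed (the inner affine map), the bytes are permuted with the inverse of the AES ShiftRows order so that AESENCLAST contributes only its SubBytes core, and a second nibble-table pair (m2_low/m2_high) applies the outer affine map. A small check of the ShiftRows cancellation; the shiftRows table below is the standard byte order of AESENCLAST's column-major state and is an assumption of this sketch, not taken from the diff:

	package main

	import "fmt"

	// shiftRows is the byte permutation AESENCLAST applies before SubBytes.
	var shiftRows = [16]int{0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11}

	func main() {
		// inv plays the role of the inverse_shift_rows<> constant.
		var inv [16]int
		for i, s := range shiftRows {
			inv[s] = i
		}
		// Pre-shuffling with inv and then letting AESENCLAST shift rows
		// lands every byte back where it started, leaving only SubBytes.
		for i := range shiftRows {
			if inv[shiftRows[i]] != i {
				panic("not the inverse permutation")
			}
		}
		fmt.Println("inverse ShiftRows followed by ShiftRows is the identity")
	}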
#define SM4_TAO_L1(x, y, z) \
SM4_SBOX(x, y, z); \
; \ //#################### 4 parallel L1 linear transforms ##################//
MOVOU x, y; \
PSHUFB r08_mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08)
PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08)
MOVOU x, z; \
PSHUFB r16_mask<>(SB), z; \
PXOR z, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16)
MOVOU y, z; \
PSLLL $2, z; \
PSRLL $30, y; \
POR z, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30);
MOVOU x, z; \
PSHUFB r24_mask<>(SB), z; \
PXOR y, x; \ //x = x xor y
PXOR z, x //x = x xor y xor _mm_shuffle_epi8(x, r24);
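The r08/r16/r24 masks rotate each 32-bit lane by a multiple of 8 bits, so the tail of SM4_TAO_L1 evaluates the SM4 linear transform L with only one extra shift pair: x ^ rol2(x ^ rol8(x) ^ rol16(x)) ^ rol24(x) = x ^ rol2(x) ^ rol10(x) ^ rol18(x) ^ rol24(x). A scalar check in Go (not part of this commit):

	package main

	import (
		"fmt"
		"math/bits"
	)

	// l1 is the textbook SM4 linear transform L used in the round function.
	func l1(b uint32) uint32 {
		return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
			bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
	}

	// l1AsInMacro follows the instruction order of SM4_TAO_L1.
	func l1AsInMacro(x uint32) uint32 {
		y := x ^ bits.RotateLeft32(x, 8) ^ bits.RotateLeft32(x, 16)
		return x ^ bits.RotateLeft32(y, 2) ^ bits.RotateLeft32(x, 24)
	}

	func main() {
		for _, v := range []uint32{0, 1, 0x12345678, 0xffffffff} {
			fmt.Println(l1(v) == l1AsInMacro(v)) // always true
		}
	}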
#define AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp) \
VPAND X_NIBBLE_MASK, x, tmp; \
VMOVDQU m1_low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND X_NIBBLE_MASK, x, x; \
VMOVDQU m1_high<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x; \
VMOVDQU inverse_shift_rows<>(SB), tmp; \
VPSHUFB tmp, x, x; \
VAESENCLAST X_NIBBLE_MASK, x, x; \
VPANDN X_NIBBLE_MASK, x, tmp; \
VMOVDQU m2_low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND X_NIBBLE_MASK, x, x; \
VMOVDQU m2_high<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x
#define AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp) \
AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp); \
VMOVDQU r08_mask<>(SB), tmp; \
VPSHUFB tmp, x, y; \
VPXOR x, y, y; \
VMOVDQU r16_mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR tmp, y, y; \
VPSLLD $2, y, tmp; \
VPSRLD $30, y, y; \
VPXOR tmp, y, y; \
VMOVDQU r24_mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR y, x, x; \
VPXOR x, tmp, x
#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2]
VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0]
VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16] tmp1 = [w13, w9, w12, w8]
VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w27, w30, w22, w27, w19, w26, w18] r2 = [w15, w11, w14, w10]
VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, w5, w25, w17, w9, w1] r1 = [w13, w9, w5, w1]
VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0] r0 = [w12, w8, w4, w0]
VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w27, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3]
VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2]
#define AVX2_SM4_SBOX(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp) \
VPAND yNibbleMask, x, tmp; \
VBROADCASTI128 m1_low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND yNibbleMask, x, x; \
VBROADCASTI128 m1_high<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x; \
VBROADCASTI128 inverse_shift_rows<>(SB), tmp; \
VPSHUFB tmp, x, x; \
VEXTRACTI128 $1, x, yw \
VAESENCLAST xNibbleMask, xw, xw; \
VAESENCLAST xNibbleMask, yw, yw; \
VINSERTI128 $1, yw, x, x; \
VPANDN yNibbleMask, x, tmp; \
VBROADCASTI128 m2_low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND yNibbleMask, x, x; \
VBROADCASTI128 m2_high<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x
#define AVX2_SM4_TAO_L1(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp) \
AVX2_SM4_SBOX(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp); \
VBROADCASTI128 r08_mask<>(SB), tmp; \
VPSHUFB tmp, x, y; \
VPXOR x, y, y; \
VBROADCASTI128 r16_mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR tmp, y, y; \
VPSLLD $2, y, tmp; \
VPSRLD $30, y, y; \
VPXOR tmp, y, y; \
VBROADCASTI128 r24_mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR y, x, x; \
VPXOR x, tmp, x


@@ -81,21 +81,21 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
VMOV K.S[2], t2.S[2]
#define SM4_SBOX(x, y, z) \
; \
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
VTBL z.B16, [M1L.B16], y.B16; \
VUSHR $4, x.D2, x.D2; \
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
VTBL z.B16, [M1H.B16], z.B16; \
VEOR y.B16, z.B16, x.B16; \
VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16; \
AESE ZERO.B16, x.B16; \
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
VTBL z.B16, [M2L.B16], y.B16; \
VUSHR $4, x.D2, x.D2; \
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
VTBL z.B16, [M2H.B16], z.B16; \
VEOR y.B16, z.B16, x.B16
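The NEON version uses the same construction: VTBL performs the 16-entry nibble-table lookups into M1L/M1H and M2L/M2H, and AESE with an all-zero key reduces to ShiftRows plus SubBytes, which the preceding VTBL through INVERSE_SHIFT_ROWS turns into pure SubBytes. The per-byte shape of the split-nibble trick, with lo and hi as hypothetical 16-byte tables:

	// nibbleLookup shows the PSHUFB/VTBL technique used with the m1/m2
	// tables: an 8-bit affine map is evaluated as the XOR of two 16-entry
	// lookups, one indexed by the low nibble and one by the high nibble.
	func nibbleLookup(lo, hi *[16]byte, b byte) byte {
		return lo[b&0x0f] ^ hi[b>>4]
	}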
#define SM4_TAO_L1(x, y, z) \
SM4_SBOX(x, y, z); \


@@ -16,48 +16,48 @@
#include "aesni_amd64.h"
#define SM4_TAO_L2(x, y) \
SM4_SBOX(x, y, XTMP6); \
; \ //#################### 4 parallel L2 linear transforms ##################//
MOVOU x, y; \
MOVOU x, XTMP6; \
PSLLL $13, XTMP6; \
PSRLL $19, y; \
POR XTMP6, y; \ //y = X roll 13
PSLLL $10, XTMP6; \
MOVOU x, XTMP7; \
PSRLL $9, XTMP7; \
POR XTMP6, XTMP7; \ //XTMP7 = x roll 23
PXOR XTMP7, y; \
PXOR y, x
#define SM4_TAO_L2(x, y, tmp1, tmp2) \
SM4_SBOX(x, y, tmp1); \
; \ //#################### 4 parallel L2 linear transforms ##################//
MOVOU x, y; \
MOVOU x, tmp1; \
PSLLL $13, tmp1; \
PSRLL $19, y; \
POR tmp1, y; \ //y = X roll 13
PSLLL $10, tmp1; \
MOVOU x, tmp2; \
PSRLL $9, tmp2; \
POR tmp1, tmp2; \ //tmp2 = x roll 23
PXOR tmp2, y; \
PXOR y, x
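The key-schedule variant only needs two rotations: L'(b) = b ^ (b <<< 13) ^ (b <<< 23). The macro builds b <<< 13 first and then shifts the same temporary left by another 10 bits to reuse it for b <<< 23. Scalar form, assuming the same math/bits import as in the sketch above:

	// l2 is the SM4 key-schedule linear transform L' computed by SM4_TAO_L2.
	func l2(b uint32) uint32 {
		return b ^ bits.RotateLeft32(b, 13) ^ bits.RotateLeft32(b, 23)
	}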
#define SM4_ROUND(index, x, y, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(AX)(CX*1), x; \
PSHUFD $0, x, x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L1(x, y, XTMP6); \
PXOR x, t0
#define SM4_SINGLE_ROUND(index, x, y, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(AX)(CX*1), x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L1(x, y, XTMP6); \
PXOR x, t0
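Both macros implement one SM4 round: XOR the other three state words with the round key, push the result through T = L composed with the byte-wise S-box (SM4_TAO_L1 above), and XOR into the word being replaced. SM4_ROUND additionally broadcasts the round key to all four lanes with PSHUFD $0 so that four blocks advance together; SM4_SINGLE_ROUND skips the broadcast. A scalar sketch; tau stands for the SM4 S-box applied to each byte and is assumed, not shown here:

	// oneRound is the scalar form of SM4_ROUND / SM4_SINGLE_ROUND.
	// tau is a hypothetical helper applying the SM4 S-box byte-wise.
	func oneRound(rk, t0, t1, t2, t3 uint32) uint32 {
		return t0 ^ l1(tau(t1^t2^t3^rk))
	}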
#define SM4_EXPANDKEY_ROUND(index, x, y, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(BX)(CX*1), x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L2(x, y); \
PXOR x, t0; \
PEXTRD $0, t0, R8; \
MOVL R8, (index * 4)(DX)(CX*1); \
MOVL R8, (12 - index * 4)(DI)(SI*1)
PINSRD $0, (index * 4)(BX)(CX*1), x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L2(x, y, XTMP6, XTMP7); \
PXOR x, t0; \
PEXTRD $0, t0, R8; \
MOVL R8, (index * 4)(DX)(CX*1); \
MOVL R8, (12 - index * 4)(DI)(SI*1)
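Key expansion uses the same round structure with L' instead of L, and each derived word is both the next key-schedule word and a round key. The store at (12 - index * 4)(DI)(SI*1), with SI counting down from 112, writes the same word into the decryption schedule in reverse order, since SM4 decryption applies the round keys backwards. A scalar sketch, with tau again assumed:

	// expandKeyRound mirrors SM4_EXPANDKEY_ROUND for one constant ck:
	// the new word goes forwards into enc and backwards into dec.
	func expandKeyRound(ck, k0, k1, k2, k3 uint32, enc, dec []uint32, i int) uint32 {
		k0 ^= l2(tau(k1 ^ k2 ^ k3 ^ ck))
		enc[i] = k0
		dec[31-i] = k0
		return k0
	}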
#define XDWORD0 Y4
#define XDWORD1 Y5
@@ -90,252 +90,252 @@
#define YWORD X9
#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(AX)(CX*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX2_SM4_TAO_L1(x, y, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK, XDWTMP0); \
VPXOR x, t0, t0
#define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(AX)(CX*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, XWTMP0); \
VPXOR x, t0, t0
// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
MOVQ key+0(FP), AX
MOVQ ck+8(FP), BX
MOVQ enc+16(FP), DX
MOVQ dec+24(FP), DI
MOVUPS 0(AX), t0
PSHUFB flip_mask<>(SB), t0
PXOR fk_mask<>(SB), t0
PSHUFD $1, t0, t1
PSHUFD $2, t0, t2
PSHUFD $3, t0, t3
XORL CX, CX
MOVL $112, SI
loop:
SM4_EXPANDKEY_ROUND(0, x, y, t0, t1, t2, t3)
SM4_EXPANDKEY_ROUND(1, x, y, t1, t2, t3, t0)
SM4_EXPANDKEY_ROUND(2, x, y, t2, t3, t0, t1)
SM4_EXPANDKEY_ROUND(3, x, y, t3, t0, t1, t2)
ADDL $16, CX
SUBL $16, SI
CMPL CX, $4*32
JB loop
expand_end:
RET
// func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
MOVQ xk+0(FP), AX
MOVQ dst+8(FP), BX
MOVQ src+32(FP), DX
MOVQ src_len+40(FP), DI
CMPB ·useAVX2(SB), $1
JE avx2
non_avx2_start:
PINSRD $0, 0(DX), t0
PINSRD $1, 16(DX), t0
PINSRD $2, 32(DX), t0
PINSRD $3, 48(DX), t0
PSHUFB flip_mask<>(SB), t0
PINSRD $0, 4(DX), t1
PINSRD $1, 20(DX), t1
PINSRD $2, 36(DX), t1
PINSRD $3, 52(DX), t1
PSHUFB flip_mask<>(SB), t1
PINSRD $0, 8(DX), t2
PINSRD $1, 24(DX), t2
PINSRD $2, 40(DX), t2
PINSRD $3, 56(DX), t2
PSHUFB flip_mask<>(SB), t2
PINSRD $0, 12(DX), t3
PINSRD $1, 28(DX), t3
PINSRD $2, 44(DX), t3
PINSRD $3, 60(DX), t3
PSHUFB flip_mask<>(SB), t3
XORL CX, CX
loop:
SM4_ROUND(0, x, y, t0, t1, t2, t3)
SM4_ROUND(1, x, y, t1, t2, t3, t0)
SM4_ROUND(2, x, y, t2, t3, t0, t1)
SM4_ROUND(3, x, y, t3, t0, t1, t2)
ADDL $16, CX
CMPL CX, $4*32
JB loop
PSHUFB flip_mask<>(SB), t3
PSHUFB flip_mask<>(SB), t2
PSHUFB flip_mask<>(SB), t1
PSHUFB flip_mask<>(SB), t0
MOVUPS t3, 0(BX)
MOVUPS t2, 16(BX)
MOVUPS t1, 32(BX)
MOVUPS t0, 48(BX)
MOVL 4(BX), R8
MOVL 8(BX), R9
MOVL 12(BX), R10
MOVL 16(BX), R11
MOVL 32(BX), R12
MOVL 48(BX), R13
MOVL R11, 4(BX)
MOVL R12, 8(BX)
MOVL R13, 12(BX)
MOVL R8, 16(BX)
MOVL R9, 32(BX)
MOVL R10, 48(BX)
MOVL 24(BX), R8
MOVL 28(BX), R9
MOVL 36(BX), R10
MOVL 52(BX), R11
MOVL R10, 24(BX)
MOVL R11, 28(BX)
MOVL R8, 36(BX)
MOVL R9, 52(BX)
MOVL 44(BX), R8
MOVL 56(BX), R9
MOVL R9, 44(BX)
MOVL R8, 56(BX)
done_sm4:
RET
avx2:
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
CMPQ DI, $64
JBE avx2_4blocks
avx2_8blocks:
VMOVDQU 0(DX), XDWORD0
VMOVDQU 32(DX), XDWORD1
VMOVDQU 64(DX), XDWORD2
VMOVDQU 96(DX), XDWORD3
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
// Apply Byte Flip Mask: LE -> BE
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
// Transpose the 4x4 matrix of 32-bit words
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
XORL CX, CX
avx2_loop:
AVX2_SM4_ROUND(0, XDWORD, YDWORD, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
AVX2_SM4_ROUND(1, XDWORD, YDWORD, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
AVX2_SM4_ROUND(2, XDWORD, YDWORD, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
AVX2_SM4_ROUND(3, XDWORD, YDWORD, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
ADDL $16, CX
CMPL CX, $4*32
JB avx2_loop
// Transpose the 4x4 matrix of 32-bit words
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
VMOVDQU XDWORD0, 0(BX)
VMOVDQU XDWORD1, 32(BX)
VMOVDQU XDWORD2, 64(BX)
VMOVDQU XDWORD3, 96(BX)
JMP avx2_sm4_done
avx2_4blocks:
VMOVDQU 0(DX), XWORD0
VMOVDQU 16(DX), XWORD1
VMOVDQU 32(DX), XWORD2
VMOVDQU 48(DX), XWORD3
VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
// Transpose the 4x4 matrix of 32-bit words
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
XORL CX, CX
avx_loop:
AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3)
AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0)
AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1)
AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2)
ADDL $16, CX
CMPL CX, $4*32
JB avx_loop
// Transpose the 4x4 matrix of 32-bit words
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
VMOVDQU XWORD0, 0(BX)
VMOVDQU XWORD1, 16(BX)
VMOVDQU XWORD2, 32(BX)
VMOVDQU XWORD3, 48(BX)
avx2_sm4_done:
VZEROUPPER
RET
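The block drivers above all share the same shape: byte-swap the input into big-endian words, transpose so that t0..t3 each hold one word position of every block, run 32 rounds with the roles of t0..t3 rotating each round, then undo the transpose (the SSE path does this with the MOVL exchanges after the t3..t0 stores) and byte-swap back. The reversed t3..t0 order matches SM4's reverse-order output. A scalar sketch of the round loop, reusing oneRound from above:

	// encrypt32Rounds mirrors the unrolled loops in encryptBlocksAsm and
	// the single-block routine that follows: four rounds per iteration
	// with rotated word roles, output words emitted in reverse order.
	func encrypt32Rounds(rk *[32]uint32, t0, t1, t2, t3 uint32) [4]uint32 {
		for i := 0; i < 32; i += 4 {
			t0 = oneRound(rk[i+0], t0, t1, t2, t3)
			t1 = oneRound(rk[i+1], t1, t2, t3, t0)
			t2 = oneRound(rk[i+2], t2, t3, t0, t1)
			t3 = oneRound(rk[i+3], t3, t0, t1, t2)
		}
		return [4]uint32{t3, t2, t1, t0}
	}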
// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
MOVQ xk+0(FP), AX
MOVQ dst+8(FP), BX
MOVQ src+16(FP), DX
MOVUPS (DX), t0
PSHUFB flip_mask<>(SB), t0
PSHUFD $1, t0, t1
PSHUFD $2, t0, t2
PSHUFD $3, t0, t3
XORL CX, CX
loop:
SM4_SINGLE_ROUND(0, x, y, t0, t1, t2, t3)
SM4_SINGLE_ROUND(1, x, y, t1, t2, t3, t0)
SM4_SINGLE_ROUND(2, x, y, t2, t3, t0, t1)
SM4_SINGLE_ROUND(3, x, y, t3, t0, t1, t2)
ADDL $16, CX
CMPL CX, $4*32
JB loop
PEXTRD $0, t2, R8
PINSRD $1, R8, t3
@@ -347,4 +347,4 @@ loop:
MOVUPS t3, (BX)
done_sm4:
RET


@@ -103,12 +103,12 @@
VMOV V8.S[1], V11.S[2] \
VMOV V8.S[0], V11.S[3] \
VST1.P [V8.S4, V9.S4], 32(R10) \
VST1 [V10.S4, V11.S4], (R11) \
SUB $32, R11, R11
#define SM4E_ROUND() \
VLD1.P 16(R10), [V8.B16] \
VREV32 V8.B16, V8.B16 \
WORD $0x0884c0ce \ // SM4E V8.4S, V0.4S
WORD $0x2884c0ce \ // SM4E V8.4S, V1.4S
WORD $0x4884c0ce \ // SM4E V8.4S, V2.4S
@@ -117,7 +117,7 @@
WORD $0xa884c0ce \ // SM4E V8.4S, V5.4S
WORD $0xc884c0ce \ // SM4E V8.4S, V6.4S
WORD $0xe884c0ce \ // SM4E V8.4S, V7.4S
VREV32 V8.B16, V8.B16 \
VST1.P [V8.B16], 16(R9)
// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
@@ -145,14 +145,14 @@ TEXT ·expandKeyAsm(SB),NOSPLIT,$0
VEOR ZERO.B16, ZERO.B16, ZERO.B16
ksLoop:
SM4_EXPANDKEY_ROUND(x, y, t0, t1, t2, t3)
SM4_EXPANDKEY_ROUND(x, y, t1, t2, t3, t0)
SM4_EXPANDKEY_ROUND(x, y, t2, t3, t0, t1)
SM4_EXPANDKEY_ROUND(x, y, t3, t0, t1, t2)
ADD $16, R0
CMP $128, R0
BNE ksLoop
RET
sm4ekey:
@@ -226,14 +226,14 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
EOR R0, R0
encryptBlocksLoop:
SM4_ROUND(R8, x, y, t0, t1, t2, t3)
SM4_ROUND(R8, x, y, t1, t2, t3, t0)
SM4_ROUND(R8, x, y, t2, t3, t0, t1)
SM4_ROUND(R8, x, y, t3, t0, t1, t2)
ADD $16, R0
CMP $128, R0
BNE encryptBlocksLoop
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
@@ -268,10 +268,11 @@ encryptBlocksLoop:
sm4niblocks:
VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
VLD1.P 64(R8), [V4.S4, V5.S4, V6.S4, V7.S4]
sm4niblockloop:
SM4E_ROUND()
SUB $16, R12, R12 // message length -= 16 bytes; loop again while data remains
CBNZ R12, sm4niblockloop
RET
// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
@@ -296,14 +297,14 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
EOR R0, R0
encryptBlockLoop:
SM4_ROUND(R8, x, y, t0, t1, t2, t3)
SM4_ROUND(R8, x, y, t1, t2, t3, t0)
SM4_ROUND(R8, x, y, t2, t3, t0, t1)
SM4_ROUND(R8, x, y, t3, t0, t1, t2)
ADD $16, R0
CMP $128, R0
BNE encryptBlockLoop
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16


@@ -38,45 +38,45 @@ TEXT ·encryptBlocksChain(SB),NOSPLIT,$0
MOVUPS (SI), IV
loopSrc:
CMPQ ptxLen, $16
JB done_sm4
SUBQ $16, ptxLen
MOVUPS (ptx), t0
PXOR IV, t0
PSHUFB flip_mask<>(SB), t0
PSHUFD $1, t0, t1
PSHUFD $2, t0, t2
PSHUFD $3, t0, t3
XORL CX, CX
loopRound:
SM4_SINGLE_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3)
SM4_SINGLE_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0)
SM4_SINGLE_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1)
SM4_SINGLE_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2)
ADDL $16, CX
CMPL CX, $4*32
JB loopRound
PEXTRD $0, t2, R8
PINSRD $1, R8, t3
PEXTRD $0, t1, R8
PINSRD $2, R8, t3
PEXTRD $0, t0, R8
PINSRD $3, R8, t3
PSHUFB flip_mask<>(SB), t3
MOVOU t3, IV
MOVUPS t3, (ctx)
LEAQ 16(ptx), ptx
LEAQ 16(ctx), ctx
JMP loopSrc
done_sm4:
MOVUPS IV, (SI)
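The chaining loop is plain CBC: each plaintext block is XORed with the previous ciphertext block (the IV for the first one) before encryption, and the last ciphertext block is written back as the next IV (MOVUPS IV, (SI)). A Go sketch of the same loop; encryptBlock stands for the single-block SM4 routine and is an assumption of the sketch:

	// cbcEncrypt is a scalar sketch of encryptBlocksChain.
	func cbcEncrypt(encryptBlock func(dst, src []byte), iv, dst, src []byte) {
		prev := iv
		for len(src) >= 16 {
			var x [16]byte
			for i := range x {
				x[i] = src[i] ^ prev[i] // chain with previous ciphertext / IV
			}
			encryptBlock(dst[:16], x[:])
			prev = dst[:16]
			src, dst = src[16:], dst[16:]
		}
		copy(iv, prev) // persist the chaining value for the next call
	}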


@@ -88,40 +88,40 @@ TEXT ·encryptBlocksChain(SB),NOSPLIT,$0
VLD1 (R5), [IV.B16]
loopSrc:
CMP $16, ptxLen
BLT done_sm4
SUB $16, ptxLen
VLD1.P (ptx), [t0.S4]
VEOR IV.B16, t0.B16, t0.B16
VREV32 t0.B16, t0.B16
VMOV t0.S[1], t1.S[0]
VMOV t0.S[2], t2.S[0]
VMOV t0.S[3], t3.S[0]
EOR R2, R2
MOVD rkSave, R0
encryptBlockLoop:
SM4_ROUND(R0, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(R0, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(R0, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(R0, x, y, XTMP6, t3, t0, t1, t2)
ADD $16, R2
CMP $128, R2
BNE encryptBlockLoop
VMOV t2.S[0], t3.S[1]
VMOV t1.S[0], t3.S[2]
VMOV t0.S[0], t3.S[3]
VREV32 t3.B16, t3.B16
VST1.P [t3.B16], (ctx)
VMOV t3.B16, IV.B16
B loopSrc
done_sm4:
VST1 [IV.B16], (R5)
RET


@@ -156,21 +156,21 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
#undef dlen
#define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(RK)(IND*1), x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L1(x, y, z); \
PXOR x, t0
#define SM4_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(RK)(IND*1), x; \
PSHUFD $0, x, x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L1(x, y, z); \
PXOR x, t0
#define SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3) \
PSHUFB flip_mask<>(SB), t0; \
@@ -225,20 +225,20 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
PSHUFB BSWAP, t0
#define AVX2_SM4_ROUND(index, RK, IND, x, y, xw, yw, tmp, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(RK)(IND*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX2_SM4_TAO_L1(x, y, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK, tmp); \
VPXOR x, t0, t0
#define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(RK)(IND*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp); \
VPXOR x, t0, t0
// func gcmSm4Init(productTable *[256]byte, rk []uint32)
TEXT ·gcmSm4Init(SB),NOSPLIT,$0


@@ -112,33 +112,33 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
#include "aesni_arm64.h"
#define LOAD_SM4_AESNI_CONSTS() \
LDP nibble_mask<>(SB), (R20, R21) \
VMOV R20, NIBBLE_MASK.D[0] \
VMOV R21, NIBBLE_MASK.D[1] \
LDP m1_low<>(SB), (R20, R21) \
VMOV R20, M1L.D[0] \
VMOV R21, M1L.D[1] \
LDP m1_high<>(SB), (R20, R21) \
VMOV R20, M1H.D[0] \
VMOV R21, M1H.D[1] \
LDP m2_low<>(SB), (R20, R21) \
VMOV R20, M2L.D[0] \
VMOV R21, M2L.D[1] \
LDP m2_high<>(SB), (R20, R21) \
VMOV R20, M2H.D[0] \
VMOV R21, M2H.D[1] \
LDP inverse_shift_rows<>(SB), (R20, R21) \
VMOV R20, INVERSE_SHIFT_ROWS.D[0] \
VMOV R21, INVERSE_SHIFT_ROWS.D[1] \
LDP r08_mask<>(SB), (R20, R21) \
VMOV R20, R08_MASK.D[0] \
VMOV R21, R08_MASK.D[1] \
LDP r16_mask<>(SB), (R20, R21) \
VMOV R20, R16_MASK.D[0] \
VMOV R21, R16_MASK.D[1] \
LDP r24_mask<>(SB), (R20, R21) \
VMOV R20, R24_MASK.D[0] \
VMOV R21, R24_MASK.D[1]
#define SM4_ROUND(RK, x, y, z, t0, t1, t2, t3) \
MOVW.P 4(RK), R19; \