mirror of https://github.com/emmansun/gmsm.git, synced 2025-04-26 12:16:20 +08:00

sm4: format asm code

This commit is contained in:
parent ff434b7bd7
commit acffd83cc9

@@ -67,189 +67,189 @@ GLOBL fk_mask<>(SB), RODATA, $16
// PUNPCKLQDQ r2, tmp2;
// MOVOU tmp2, r2
#define SSE_TRANSPOSE_MATRIX(r, r0, r1, r2, r3, tmp1, tmp2) \
PEXTRD $2, r0, r; \
PINSRD $0, r, tmp2; \
PEXTRD $2, r1, r; \
PINSRD $1, r, tmp2; \
; \
PEXTRD $3, r0, r; \
PINSRD $2, r, tmp2; \
PEXTRD $3, r1, r; \
PINSRD $3, r, tmp2; \ // tmp2 = [w7, w3, w6, w2]
; \
PEXTRD $1, r0, r; \
PINSRD $2, r, r0; \
PEXTRD $0, r1, r; \
PINSRD $1, r, r0; \
PEXTRD $1, r1, r; \
PINSRD $3, r, r0; \ // r0 = [w5, w1, w4, w0]
; \
PEXTRD $0, r2, r; \
PINSRD $0, r, tmp1; \
PEXTRD $0, r3, r; \
PINSRD $1, r, tmp1; \
PEXTRD $1, r2, r; \
PINSRD $2, r, tmp1; \
PEXTRD $1, r3, r; \
PINSRD $3, r, tmp1; \ // tmp1 = [w13, w9, w12, w8]
; \
PEXTRD $2, r2, r; \
PINSRD $0, r, r2; \
PEXTRD $2, r3, r; \
PINSRD $1, r, r2; \
PEXTRD $3, r2, r; \
PINSRD $2, r, r2; \
PEXTRD $3, r3, r; \
PINSRD $3, r, r2; \ // r2 = [w15, w11, w14, w10]
; \
MOVOU r0, r1; \
PEXTRQ $1, r1, r; \
PINSRQ $0, r, r1; \
PEXTRQ $1, tmp1, r; \
PINSRQ $1, r, r1; \ // r1 = [w13, w9, w5, w1]
; \
PEXTRQ $0, tmp1, r; \
PINSRQ $1, r, r0; \ // r0 = [w12, w8, w4, w0]
; \
MOVOU tmp2, r3; \
PEXTRQ $1, r3, r; \
PINSRQ $0, r, r3; \
PEXTRQ $1, r2, r; \
PINSRQ $1, r, r3; \ // r3 = [w15, w11, w7, w3]
; \
PEXTRQ $0, r2, r; \
PINSRQ $1, r, r2; \
PEXTRQ $0, tmp2, r; \
PINSRQ $0, r, r2
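
The lane comments above show that this macro is a 4x4 transpose of 32-bit words: on entry each register holds one block (w0-w3, w4-w7, ...), and on exit r0-r3 hold word 0, 1, 2 and 3 of every block (the comments list lanes high to low, so [w12, w8, w4, w0] means lane 0 is w0). A minimal scalar Go sketch of the same data movement; the helper name is ours, not part of the source:

package sm4

// transpose4x4 mirrors what SSE_TRANSPOSE_MATRIX does with PEXTRD/PINSRD:
// row i holds the four words of block i on input, and row j holds word j of
// every block on output (r0 = {w0, w4, w8, w12}, ..., r3 = {w3, w7, w11, w15}).
func transpose4x4(r *[4][4]uint32) {
    for i := 0; i < 4; i++ {
        for j := i + 1; j < 4; j++ {
            r[i][j], r[j][i] = r[j][i], r[i][j]
        }
    }
}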

#define SM4_SBOX(x, y, z) \
; \ //############################# inner affine ############################//
MOVOU x, z; \
PAND nibble_mask<>(SB), z; \ //y = _mm_and_si128(x, c0f);
MOVOU m1_low<>(SB), y; \
PSHUFB z, y; \ //y = _mm_shuffle_epi8(m1l, y);
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
MOVOU m1_high<>(SB), z; \
PSHUFB x, z; \ //x = _mm_shuffle_epi8(m1h, x);
MOVOU z, x; \ //x = _mm_shuffle_epi8(m1h, x);
PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y;
; \ // inverse ShiftRows
PSHUFB inverse_shift_rows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr);
AESENCLAST nibble_mask<>(SB), x; \ // AESNI instruction
; \ //############################# outer affine ############################//
MOVOU x, z; \
PANDN nibble_mask<>(SB), z; \ //z = _mm_andnot_si128(x, c0f);
MOVOU m2_low<>(SB), y; \
PSHUFB z, y; \ //y = _mm_shuffle_epi8(m2l, z)
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
MOVOU m2_high<>(SB), z; \
PSHUFB x, z; \
MOVOU z, x; \ //x = _mm_shuffle_epi8(m2h, x)
PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y;

#define SM4_TAO_L1(x, y, z) \
SM4_SBOX(x, y, z); \
; \ //#################### 4 parallel L1 linear transforms ##################//
MOVOU x, y; \
PSHUFB r08_mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08)
PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08)
MOVOU x, z; \
PSHUFB r16_mask<>(SB), z; \
PXOR z, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16)
MOVOU y, z; \
PSLLL $2, z; \
PSRLL $30, y; \
POR z, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30);
MOVOU x, z; \
PSHUFB r24_mask<>(SB), z; \
PXOR y, x; \ //x = x xor y
PXOR z, x //x = x xor y xor _mm_shuffle_epi8(x, r24);
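
Taken together, SM4_SBOX and SM4_TAO_L1 compute SM4's T transform on four words at once: the S-box is built from two affine transforms around AESENCLAST, and in the linear part the r08/r16/r24 byte shuffles rotate every 32-bit lane by 8, 16 and 24 bits while the PSLLL/PSRLL/POR pair rotates by 2. A hedged scalar Go sketch of what one lane sees; the sbox table is assumed to hold the standard SM4 S-box values, which are not reproduced here:

package sm4

import "math/bits"

// sbox is assumed to be the standard 256-byte SM4 S-box table, defined elsewhere.
var sbox [256]byte

// tau applies the S-box to each byte of a word, which is what SM4_SBOX does
// four lanes at a time via the affine transforms plus AESENCLAST.
func tau(x uint32) uint32 {
    return uint32(sbox[x>>24])<<24 | uint32(sbox[x>>16&0xff])<<16 |
        uint32(sbox[x>>8&0xff])<<8 | uint32(sbox[x&0xff])
}

// l1 is the encryption linear transform assembled by SM4_TAO_L1: the byte
// shuffles supply the 8/16/24-bit rotations and the shift pair the rotation by 2.
func l1(b uint32) uint32 {
    return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
        bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
}

// t is the combined per-word transform T = L1(tau(x)).
func t(x uint32) uint32 { return l1(tau(x)) }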

#define AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp) \
VPAND X_NIBBLE_MASK, x, tmp; \
VMOVDQU m1_low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND X_NIBBLE_MASK, x, x; \
VMOVDQU m1_high<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x; \
VMOVDQU inverse_shift_rows<>(SB), tmp; \
VPSHUFB tmp, x, x; \
VAESENCLAST X_NIBBLE_MASK, x, x; \
VPANDN X_NIBBLE_MASK, x, tmp; \
VMOVDQU m2_low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND X_NIBBLE_MASK, x, x; \
VMOVDQU m2_high<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x

#define AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp) \
AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp); \
VMOVDQU r08_mask<>(SB), tmp; \
VPSHUFB tmp, x, y; \
VPXOR x, y, y; \
VMOVDQU r16_mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR tmp, y, y; \
VPSLLD $2, y, tmp; \
VPSRLD $30, y, y; \
VPXOR tmp, y, y; \
VMOVDQU r24_mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR y, x, x; \
VPXOR x, tmp, x

#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2]
VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0]
VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16] tmp1 = [w13, w9, w12, w8]
VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w27, w30, w22, w27, w19, w26, w18] r2 = [w15, w11, w14, w10]
VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, w5, w25, w17, w9, w1] r1 = [w13, w9, w5, w1]
VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0] r0 = [w12, w8, w4, w0]
VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w27, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3]
VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2]

#define AVX2_SM4_SBOX(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp) \
VPAND yNibbleMask, x, tmp; \
VBROADCASTI128 m1_low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND yNibbleMask, x, x; \
VBROADCASTI128 m1_high<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x; \
VBROADCASTI128 inverse_shift_rows<>(SB), tmp; \
VPSHUFB tmp, x, x; \
VEXTRACTI128 $1, x, yw \
VAESENCLAST xNibbleMask, xw, xw; \
VAESENCLAST xNibbleMask, yw, yw; \
VINSERTI128 $1, yw, x, x; \
VPANDN yNibbleMask, x, tmp; \
VBROADCASTI128 m2_low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND yNibbleMask, x, x; \
VBROADCASTI128 m2_high<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x

#define AVX2_SM4_TAO_L1(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp) \
AVX2_SM4_SBOX(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp); \
VBROADCASTI128 r08_mask<>(SB), tmp; \
VPSHUFB tmp, x, y; \
VPXOR x, y, y; \
VBROADCASTI128 r16_mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR tmp, y, y; \
VPSLLD $2, y, tmp; \
VPSRLD $30, y, y; \
VPXOR tmp, y, y; \
VBROADCASTI128 r24_mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR y, x, x; \
VPXOR x, tmp, x

@@ -62,7 +62,7 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16

#define TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \
VMOV t0.B16, K.B16 \
VMOV t3.S[0], t0.S[0] \
VMOV t2.S[0], t0.S[1] \
VMOV t1.S[0], t0.S[2] \
VMOV K0.S[0], t0.S[3] \
@@ -81,21 +81,21 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
VMOV K.S[2], t2.S[2]

#define SM4_SBOX(x, y, z) \
; \
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
VTBL z.B16, [M1L.B16], y.B16; \
VUSHR $4, x.D2, x.D2; \
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
VTBL z.B16, [M1H.B16], z.B16; \
VEOR y.B16, z.B16, x.B16; \
VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16; \
AESE ZERO.B16, x.B16; \
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
VTBL z.B16, [M2L.B16], y.B16; \
VUSHR $4, x.D2, x.D2; \
VAND x.B16, NIBBLE_MASK.B16, z.B16; \
VTBL z.B16, [M2H.B16], z.B16; \
VEOR y.B16, z.B16, x.B16

#define SM4_TAO_L1(x, y, z) \
SM4_SBOX(x, y, z); \
436 sm4/asm_amd64.s

@@ -16,48 +16,48 @@

#include "aesni_amd64.h"

#define SM4_TAO_L2(x, y) \
SM4_SBOX(x, y, XTMP6); \
; \ //#################### 4 parallel L2 linear transforms ##################//
MOVOU x, y; \
MOVOU x, XTMP6; \
PSLLL $13, XTMP6; \
PSRLL $19, y; \
POR XTMP6, y; \ //y = X roll 13
PSLLL $10, XTMP6; \
MOVOU x, XTMP7; \
PSRLL $9, XTMP7; \
POR XTMP6, XTMP7; \ //XTMP7 = x roll 23
PXOR XTMP7, y; \
PXOR y, x
#define SM4_TAO_L2(x, y, tmp1, tmp2) \
SM4_SBOX(x, y, tmp1); \
; \ //#################### 4 parallel L2 linear transforms ##################//
MOVOU x, y; \
MOVOU x, tmp1; \
PSLLL $13, tmp1; \
PSRLL $19, y; \
POR tmp1, y; \ //y = X roll 13
PSLLL $10, tmp1; \
MOVOU x, tmp2; \
PSRLL $9, tmp2; \
POR tmp1, tmp2; \ //tmp2 = x roll 23
PXOR tmp2, y; \
PXOR y, x
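
The two definitions above are the old and new forms of the same macro; only the scratch registers differ (hard-coded XTMP6/XTMP7 versus the tmp1/tmp2 parameters). Per the "roll 13" / "roll 23" comments, the linear part is the key-schedule transform L'. A small Go sketch of one lane; the function name is ours:

package sm4

import "math/bits"

// l2 is the key-expansion linear transform applied per 32-bit lane by
// SM4_TAO_L2: the shift/OR pairs build the 13- and 23-bit rotations and the
// final XORs fold them into the input word.
func l2(b uint32) uint32 {
    return b ^ bits.RotateLeft32(b, 13) ^ bits.RotateLeft32(b, 23)
}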

#define SM4_ROUND(index, x, y, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(AX)(CX*1), x; \
PSHUFD $0, x, x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L1(x, y, XTMP6); \
PXOR x, t0

#define SM4_SINGLE_ROUND(index, x, y, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(AX)(CX*1), x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L1(x, y, XTMP6); \
PXOR x, t0
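
Both round macros follow the same pattern: fetch a round key, XOR it with the other three state words, run the result through SM4_TAO_L1, and fold it into t0. Calling the macro four times with the state registers rotated gives four rounds per loop iteration, and the CMPL CX, $4*32 bound below gives 32 rounds in total, after which the code emits the state reversed (t3, t2, t1, t0). A scalar Go sketch of that control flow; tTransform and the function name are assumptions, not names from the source:

package sm4

// tTransform is assumed to be the per-word transform T = L1(tau(x)) sketched
// after SM4_TAO_L1 above.
var tTransform func(uint32) uint32

// cryptBlockSketch mirrors the "loop:" structure: four unrolled rounds per
// iteration with the state words rotated, eight iterations for 32 rounds.
// rk is assumed to hold the 32 round keys.
func cryptBlockSketch(rk []uint32, t0, t1, t2, t3 uint32) (uint32, uint32, uint32, uint32) {
    for i := 0; i < 32; i += 4 {
        t0 ^= tTransform(rk[i] ^ t1 ^ t2 ^ t3)   // SM4_ROUND(0, x, y, t0, t1, t2, t3)
        t1 ^= tTransform(rk[i+1] ^ t2 ^ t3 ^ t0) // SM4_ROUND(1, x, y, t1, t2, t3, t0)
        t2 ^= tTransform(rk[i+2] ^ t3 ^ t0 ^ t1) // SM4_ROUND(2, x, y, t2, t3, t0, t1)
        t3 ^= tTransform(rk[i+3] ^ t0 ^ t1 ^ t2) // SM4_ROUND(3, x, y, t3, t0, t1, t2)
    }
    return t0, t1, t2, t3 // the caller stores these in reverse order
}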

#define SM4_EXPANDKEY_ROUND(index, x, y, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(BX)(CX*1), x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L2(x, y); \
PXOR x, t0; \
PEXTRD $0, t0, R8; \
MOVL R8, (index * 4)(DX)(CX*1); \
MOVL R8, (12 - index * 4)(DI)(SI*1)
#define SM4_EXPANDKEY_ROUND(index, x, y, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(BX)(CX*1), x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L2(x, y, XTMP6, XTMP7); \
PXOR x, t0; \
PEXTRD $0, t0, R8; \
MOVL R8, (index * 4)(DX)(CX*1); \
MOVL R8, (12 - index * 4)(DI)(SI*1)

#define XDWORD0 Y4
#define XDWORD1 Y5
@@ -89,253 +89,253 @@
#define XWORD X8
#define YWORD X9

#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(AX)(CX*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX2_SM4_TAO_L1(x, y, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK, XDWTMP0); \
VPXOR x, t0, t0

#define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(AX)(CX*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, XWTMP0); \
VPXOR x, t0, t0

// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
MOVQ key+0(FP), AX
MOVQ ck+8(FP), BX
MOVQ enc+16(FP), DX
MOVQ dec+24(FP), DI

MOVUPS 0(AX), t0
PSHUFB flip_mask<>(SB), t0
PXOR fk_mask<>(SB), t0
PSHUFD $1, t0, t1
PSHUFD $2, t0, t2
PSHUFD $3, t0, t3

XORL CX, CX
MOVL $112, SI

loop:
SM4_EXPANDKEY_ROUND(0, x, y, t0, t1, t2, t3)
SM4_EXPANDKEY_ROUND(1, x, y, t1, t2, t3, t0)
SM4_EXPANDKEY_ROUND(2, x, y, t2, t3, t0, t1)
SM4_EXPANDKEY_ROUND(3, x, y, t3, t0, t1, t2)

ADDL $16, CX
SUBL $16, SI
CMPL CX, $4*32
JB loop

expand_end:
RET
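
expandKeyAsm whitens the key with fk_mask, then runs the 32-round key schedule over the ck constants addressed through BX; each SM4_EXPANDKEY_ROUND stores the round key forward into enc and, via the (12 - index * 4)(DI)(SI*1) store with SI counting down from 112, mirrored into dec so decryption can walk the keys in reverse. A scalar Go sketch of that data flow; fk, ck and tPrime are assumed helpers standing in for the constants and the L2-based transform, not names from this file:

package sm4

// fk and ck are assumed to hold the standard SM4 FK and CK constants, and
// tPrime the key-schedule transform L2(tau(x)); only the data flow of
// expandKeyAsm is sketched here.
var (
    fk     [4]uint32
    ck     [32]uint32
    tPrime func(uint32) uint32
)

func expandKeySketch(mk [4]uint32, enc, dec *[32]uint32) {
    k0 := mk[0] ^ fk[0]
    k1 := mk[1] ^ fk[1]
    k2 := mk[2] ^ fk[2]
    k3 := mk[3] ^ fk[3]
    for i := 0; i < 32; i++ {
        rk := k0 ^ tPrime(k1^k2^k3^ck[i])
        enc[i] = rk    // forward store: (index * 4)(DX)(CX*1)
        dec[31-i] = rk // mirrored store: (12 - index * 4)(DI)(SI*1)
        k0, k1, k2, k3 = k1, k2, k3, rk
    }
}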

// func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
MOVQ xk+0(FP), AX
MOVQ dst+8(FP), BX
MOVQ src+32(FP), DX
MOVQ src_len+40(FP), DI

CMPB ·useAVX2(SB), $1
JE avx2

non_avx2_start:
PINSRD $0, 0(DX), t0
PINSRD $1, 16(DX), t0
PINSRD $2, 32(DX), t0
PINSRD $3, 48(DX), t0
PSHUFB flip_mask<>(SB), t0

PINSRD $0, 4(DX), t1
PINSRD $1, 20(DX), t1
PINSRD $2, 36(DX), t1
PINSRD $3, 52(DX), t1
PSHUFB flip_mask<>(SB), t1

PINSRD $0, 8(DX), t2
PINSRD $1, 24(DX), t2
PINSRD $2, 40(DX), t2
PINSRD $3, 56(DX), t2
PSHUFB flip_mask<>(SB), t2

PINSRD $0, 12(DX), t3
PINSRD $1, 28(DX), t3
PINSRD $2, 44(DX), t3
PINSRD $3, 60(DX), t3
PSHUFB flip_mask<>(SB), t3

XORL CX, CX

loop:
SM4_ROUND(0, x, y, t0, t1, t2, t3)
SM4_ROUND(1, x, y, t1, t2, t3, t0)
SM4_ROUND(2, x, y, t2, t3, t0, t1)
SM4_ROUND(3, x, y, t3, t0, t1, t2)

ADDL $16, CX
CMPL CX, $4*32
JB loop

PSHUFB flip_mask<>(SB), t3
PSHUFB flip_mask<>(SB), t2
PSHUFB flip_mask<>(SB), t1
PSHUFB flip_mask<>(SB), t0
MOVUPS t3, 0(BX)
MOVUPS t2, 16(BX)
MOVUPS t1, 32(BX)
MOVUPS t0, 48(BX)
MOVL 4(BX), R8
MOVL 8(BX), R9
MOVL 12(BX), R10
MOVL 16(BX), R11
MOVL 32(BX), R12
MOVL 48(BX), R13
MOVL R11, 4(BX)
MOVL R12, 8(BX)
MOVL R13, 12(BX)
MOVL R8, 16(BX)
MOVL R9, 32(BX)
MOVL R10, 48(BX)
MOVL 24(BX), R8
MOVL 28(BX), R9
MOVL 36(BX), R10
MOVL 52(BX), R11
MOVL R10, 24(BX)
MOVL R11, 28(BX)
MOVL R8, 36(BX)
MOVL R9, 52(BX)
MOVL 44(BX), R8
MOVL 56(BX), R9
MOVL R9, 44(BX)
MOVL R8, 56(BX)

done_sm4:
RET

avx2:
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
CMPQ DI, $64
JBE avx2_4blocks

avx2_8blocks:
VMOVDQU 0(DX), XDWORD0
VMOVDQU 32(DX), XDWORD1
VMOVDQU 64(DX), XDWORD2
VMOVDQU 96(DX), XDWORD3
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK

// Apply Byte Flip Mask: LE -> BE
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3

// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)

XORL CX, CX

avx2_loop:
AVX2_SM4_ROUND(0, XDWORD, YDWORD, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
AVX2_SM4_ROUND(1, XDWORD, YDWORD, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
AVX2_SM4_ROUND(2, XDWORD, YDWORD, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
AVX2_SM4_ROUND(3, XDWORD, YDWORD, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

ADDL $16, CX
CMPL CX, $4*32
JB avx2_loop

// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)

VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3

VMOVDQU XDWORD0, 0(BX)
VMOVDQU XDWORD1, 32(BX)
VMOVDQU XDWORD2, 64(BX)
VMOVDQU XDWORD3, 96(BX)
JMP avx2_sm4_done

avx2_4blocks:
VMOVDQU 0(DX), XWORD0
VMOVDQU 16(DX), XWORD1
VMOVDQU 32(DX), XWORD2
VMOVDQU 48(DX), XWORD3

VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK

VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3

// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)

XORL CX, CX

avx_loop:
AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3)
AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0)
AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1)
AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2)

ADDL $16, CX
CMPL CX, $4*32
JB avx_loop

// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)

VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3

VMOVDQU XWORD0, 0(BX)
VMOVDQU XWORD1, 16(BX)
VMOVDQU XWORD2, 32(BX)
VMOVDQU XWORD3, 48(BX)

avx2_sm4_done:
VZEROUPPER
RET

// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
MOVQ xk+0(FP), AX
MOVQ dst+8(FP), BX
MOVQ src+16(FP), DX

MOVUPS (DX), t0
PSHUFB flip_mask<>(SB), t0
PSHUFD $1, t0, t1
PSHUFD $2, t0, t2
PSHUFD $3, t0, t3

XORL CX, CX

loop:
SM4_SINGLE_ROUND(0, x, y, t0, t1, t2, t3)
SM4_SINGLE_ROUND(1, x, y, t1, t2, t3, t0)
SM4_SINGLE_ROUND(2, x, y, t2, t3, t0, t1)
SM4_SINGLE_ROUND(3, x, y, t3, t0, t1, t2)

ADDL $16, CX
CMPL CX, $4*32
JB loop

PEXTRD $0, t2, R8
PINSRD $1, R8, t3
@@ -347,4 +347,4 @@ loop:
MOVUPS t3, (BX)

done_sm4:
RET
@@ -103,21 +103,21 @@
VMOV V8.S[1], V11.S[2] \
VMOV V8.S[0], V11.S[3] \
VST1.P [V8.S4, V9.S4], 32(R10) \
VST1 [V10.S4, V11.S4], (R11) \
SUB $32, R11, R11

#define SM4E_ROUND() \
VLD1.P 16(R10), [V8.B16] \
VREV32 V8.B16, V8.B16 \
WORD $0x0884c0ce \
WORD $0x2884c0ce \
WORD $0x4884c0ce \
WORD $0x6884c0ce \
WORD $0x8884c0ce \
WORD $0xa884c0ce \
WORD $0xc884c0ce \
WORD $0xe884c0ce \
VREV32 V8.B16, V8.B16 \
VST1.P [V8.B16], 16(R9)

// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
@@ -145,14 +145,14 @@ TEXT ·expandKeyAsm(SB),NOSPLIT,$0
VEOR ZERO.B16, ZERO.B16, ZERO.B16

ksLoop:
SM4_EXPANDKEY_ROUND(x, y, t0, t1, t2, t3)
SM4_EXPANDKEY_ROUND(x, y, t1, t2, t3, t0)
SM4_EXPANDKEY_ROUND(x, y, t2, t3, t0, t1)
SM4_EXPANDKEY_ROUND(x, y, t3, t0, t1, t2)

ADD $16, R0
CMP $128, R0
BNE ksLoop
RET

sm4ekey:
@@ -226,14 +226,14 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
EOR R0, R0

encryptBlocksLoop:
SM4_ROUND(R8, x, y, t0, t1, t2, t3)
SM4_ROUND(R8, x, y, t1, t2, t3, t0)
SM4_ROUND(R8, x, y, t2, t3, t0, t1)
SM4_ROUND(R8, x, y, t3, t0, t1, t2)

ADD $16, R0
CMP $128, R0
BNE encryptBlocksLoop

VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
@@ -268,10 +268,11 @@ encryptBlocksLoop:
sm4niblocks:
VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
VLD1.P 64(R8), [V4.S4, V5.S4, V6.S4, V7.S4]

sm4niblockloop:
SM4E_ROUND()
SUB $16, R12, R12 // message length - 16bytes, then compare with 16bytes
CBNZ R12, sm4niblockloop
RET
// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
@@ -296,14 +297,14 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
EOR R0, R0

encryptBlockLoop:
SM4_ROUND(R8, x, y, t0, t1, t2, t3)
SM4_ROUND(R8, x, y, t1, t2, t3, t0)
SM4_ROUND(R8, x, y, t2, t3, t0, t1)
SM4_ROUND(R8, x, y, t3, t0, t1, t2)

ADD $16, R0
CMP $128, R0
BNE encryptBlockLoop

VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
@@ -38,45 +38,45 @@ TEXT ·encryptBlocksChain(SB),NOSPLIT,$0
MOVUPS (SI), IV

loopSrc:
CMPQ ptxLen, $16
JB done_sm4
SUBQ $16, ptxLen

MOVUPS (ptx), t0
PXOR IV, t0

PSHUFB flip_mask<>(SB), t0
PSHUFD $1, t0, t1
PSHUFD $2, t0, t2
PSHUFD $3, t0, t3

XORL CX, CX

loopRound:
SM4_SINGLE_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3)
SM4_SINGLE_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0)
SM4_SINGLE_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1)
SM4_SINGLE_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2)

ADDL $16, CX
CMPL CX, $4*32
JB loopRound

PEXTRD $0, t2, R8
PINSRD $1, R8, t3
PEXTRD $0, t1, R8
PINSRD $2, R8, t3
PEXTRD $0, t0, R8
PINSRD $3, R8, t3
PSHUFB flip_mask<>(SB), t3

MOVOU t3, IV
MOVUPS t3, (ctx)

LEAQ 16(ptx), ptx
LEAQ 16(ctx), ctx

JMP loopSrc

done_sm4:
MOVUPS IV, (SI)
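
encryptBlocksChain is CBC encryption: each plaintext block is XORed with the running IV (PXOR IV, t0), encrypted by the loopRound body, written out, and the ciphertext becomes the next IV (MOVOU t3, IV); the final IV is written back through SI at done_sm4. A hedged Go sketch of the same control flow, with encryptBlock standing in for the per-block work:

package sm4

// encryptBlock is assumed to encrypt exactly one 16-byte block, the job done
// here by the loopRound body.
var encryptBlock func(dst, src []byte)

// cbcEncryptSketch mirrors loopSrc: whole blocks only, each ciphertext block
// feeds the next block's IV, and the last one is left in iv for the caller.
func cbcEncryptSketch(dst, src, iv []byte) {
    for len(src) >= 16 {
        var x [16]byte
        for i := range x {
            x[i] = src[i] ^ iv[i] // PXOR IV, t0
        }
        encryptBlock(dst[:16], x[:])
        copy(iv, dst[:16]) // MOVOU t3, IV; stored back at done_sm4
        src, dst = src[16:], dst[16:]
    }
}
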
@@ -86,42 +86,42 @@ TEXT ·encryptBlocksChain(SB),NOSPLIT,$0

VEOR ZERO.B16, ZERO.B16, ZERO.B16
VLD1 (R5), [IV.B16]

loopSrc:
CMP $16, ptxLen
BLT done_sm4
SUB $16, ptxLen

VLD1.P (ptx), [t0.S4]
VEOR IV.B16, t0.B16, t0.B16
VREV32 t0.B16, t0.B16
VMOV t0.S[1], t1.S[0]
VMOV t0.S[2], t2.S[0]
VMOV t0.S[3], t3.S[0]

EOR R2, R2
MOVD rkSave, R0

encryptBlockLoop:
SM4_ROUND(R0, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(R0, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(R0, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(R0, x, y, XTMP6, t3, t0, t1, t2)

ADD $16, R2
CMP $128, R2
BNE encryptBlockLoop

VMOV t2.S[0], t3.S[1]
VMOV t1.S[0], t3.S[2]
VMOV t0.S[0], t3.S[3]
VREV32 t3.B16, t3.B16

VST1.P [t3.B16], (ctx)
VMOV t3.B16, IV.B16

B loopSrc

done_sm4:
VST1 [IV.B16], (R5)
RET
@@ -156,21 +156,21 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
#undef dlen

#define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(RK)(IND*1), x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L1(x, y, z); \
PXOR x, t0

#define SM4_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(RK)(IND*1), x; \
PSHUFD $0, x, x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L1(x, y, z); \
PXOR x, t0

#define SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3) \
PSHUFB flip_mask<>(SB), t0; \
@@ -225,20 +225,20 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
PSHUFB BSWAP, t0

#define AVX2_SM4_ROUND(index, RK, IND, x, y, xw, yw, tmp, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(RK)(IND*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX2_SM4_TAO_L1(x, y, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK, tmp); \
VPXOR x, t0, t0

#define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(RK)(IND*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp); \
VPXOR x, t0, t0

// func gcmSm4Init(productTable *[256]byte, rk []uint32)
TEXT ·gcmSm4Init(SB),NOSPLIT,$0
@@ -112,33 +112,33 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
#include "aesni_arm64.h"

#define LOAD_SM4_AESNI_CONSTS() \
LDP nibble_mask<>(SB), (R20, R21) \
VMOV R20, NIBBLE_MASK.D[0] \
VMOV R21, NIBBLE_MASK.D[1] \
LDP m1_low<>(SB), (R20, R21) \
VMOV R20, M1L.D[0] \
VMOV R21, M1L.D[1] \
LDP m1_high<>(SB), (R20, R21) \
VMOV R20, M1H.D[0] \
VMOV R21, M1H.D[1] \
LDP m2_low<>(SB), (R20, R21) \
VMOV R20, M2L.D[0] \
VMOV R21, M2L.D[1] \
LDP m2_high<>(SB), (R20, R21) \
VMOV R20, M2H.D[0] \
VMOV R21, M2H.D[1] \
LDP inverse_shift_rows<>(SB), (R20, R21) \
VMOV R20, INVERSE_SHIFT_ROWS.D[0] \
VMOV R21, INVERSE_SHIFT_ROWS.D[1] \
LDP r08_mask<>(SB), (R20, R21) \
VMOV R20, R08_MASK.D[0] \
VMOV R21, R08_MASK.D[1] \
LDP r16_mask<>(SB), (R20, R21) \
VMOV R20, R16_MASK.D[0] \
VMOV R21, R16_MASK.D[1] \
LDP r24_mask<>(SB), (R20, R21) \
VMOV R20, R24_MASK.D[0] \
VMOV R21, R24_MASK.D[1]

#define SM4_ROUND(RK, x, y, z, t0, t1, t2, t3) \
MOVW.P 4(RK), R19; \