// This SM4 implementation is based on https://github.com/mjosaarinen/sm4ni/blob/master/sm4ni.c
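//
// Layout note: the S-box below is evaluated without lookup tables. Each input
// byte is split into its low and high nibble, each nibble is mapped through a
// 16-entry affine table with PSHUFB, and AESENCLAST supplies the S-box core
// that AES and SM4 share (an extra PSHUFB undoes the ShiftRows step that
// AESENCLAST performs). See the sm4ni write-up above for the derivation.
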
#include "textflag.h"

#define x X0
#define y X1
#define t0 X2
#define t1 X3
#define t2 X4
#define t3 X5

#define XTMP6 X6
#define XTMP7 X7

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), RODATA, $16

// shuffle byte and word order
DATA bswap_mask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA bswap_mask<>+0x08(SB)/8, $0x0001020304050607
GLOBL bswap_mask<>(SB), RODATA, $16

// nibble mask
DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL nibble_mask<>(SB), RODATA, $16

// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), RODATA, $16

// Affine transform 1 (low and high nibbles)
DATA m1_low<>+0x00(SB)/8, $0x9197E2E474720701
DATA m1_low<>+0x08(SB)/8, $0xC7C1B4B222245157
GLOBL m1_low<>(SB), RODATA, $16

DATA m1_high<>+0x00(SB)/8, $0xE240AB09EB49A200
DATA m1_high<>+0x08(SB)/8, $0xF052B91BF95BB012
GLOBL m1_high<>(SB), RODATA, $16

// Affine transform 2 (low and high nibbles)
DATA m2_low<>+0x00(SB)/8, $0x5B67F2CEA19D0834
DATA m2_low<>+0x08(SB)/8, $0xEDD14478172BBE82
GLOBL m2_low<>(SB), RODATA, $16

DATA m2_high<>+0x00(SB)/8, $0xAE7201DD73AFDC00
DATA m2_high<>+0x08(SB)/8, $0x11CDBE62CC1063BF
GLOBL m2_high<>(SB), RODATA, $16

// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), RODATA, $16

DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16_mask<>(SB), RODATA, $16

DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24_mask<>(SB), RODATA, $16
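
// fk_mask holds the SM4 FK constants, XORed into the key once at the start
// of expandKeyAsm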
DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), RODATA, $16

#define SM4_SBOX(x, y) \
	; \ //############################# inner affine ############################//
	MOVOU x, XTMP6; \
	PAND nibble_mask<>(SB), XTMP6; \ //y = _mm_and_si128(x, c0f);
	MOVOU m1_low<>(SB), y; \
	PSHUFB XTMP6, y; \ //y = _mm_shuffle_epi8(m1l, y);
	PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
	PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
	MOVOU m1_high<>(SB), XTMP6; \
	PSHUFB x, XTMP6; \ //x = _mm_shuffle_epi8(m1h, x);
	MOVOU XTMP6, x; \
	PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y;
	; \ // inverse ShiftRows
	PSHUFB inverse_shift_rows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr);
	AESENCLAST nibble_mask<>(SB), x; \ // AES-NI instruction
	; \ //############################# outer affine ############################//
	MOVOU x, XTMP6; \
	PANDN nibble_mask<>(SB), XTMP6; \ //XTMP6 = _mm_andnot_si128(x, c0f);
	MOVOU m2_low<>(SB), y; \
	PSHUFB XTMP6, y; \ //y = _mm_shuffle_epi8(m2l, XTMP6)
	PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
	PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
	MOVOU m2_high<>(SB), XTMP6; \
	PSHUFB x, XTMP6; \
	MOVOU XTMP6, x; \ //x = _mm_shuffle_epi8(m2h, x)
	PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y;
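
// SM4_SBOX leaves every byte of x mapped through the SM4 S-box: an affine
// nibble lookup, the AES S-box core via AESENCLAST, then a second affine
// nibble lookup. The inner step per byte is, roughly (a Go sketch; names
// are illustrative, not part of the package):
//
//	func innerAffine(b byte) byte {
//		return m1Low[b&0x0f] ^ m1High[b>>4]
//	}
//
// The outer step is analogous with the m2 tables, except that the PANDN
// there indexes m2_low by the complemented low nibble.
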
#define SM4_TAO_L1(x, y) \
	SM4_SBOX(x, y); \
	; \ //#################### 4 parallel L1 linear transforms ##################//
	MOVOU x, y; \
	PSHUFB r08_mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08)
	PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08)
	MOVOU x, XTMP6; \
	PSHUFB r16_mask<>(SB), XTMP6; \
	PXOR XTMP6, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16)
	MOVOU y, XTMP6; \
	PSLLL $2, XTMP6; \
	PSRLL $30, y; \
	POR XTMP6, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30);
	MOVOU x, XTMP7; \
	PSHUFB r24_mask<>(SB), XTMP7; \
	PXOR y, x; \ //x = x xor y
	PXOR XTMP7, x //x = x xor y xor _mm_shuffle_epi8(x, r24);
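
// Per 32-bit word, SM4_TAO_L1 computes tau (the S-box layer) followed by the
// cipher's linear transform L. A scalar Go sketch of L (not part of the
// build):
//
//	func l1(b uint32) uint32 {
//		return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
//			bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
//	}
//
// The rotations by 8, 16 and 24 are byte shuffles (the r08/r16/r24 masks);
// only the rotation by 2 needs real shift instructions.
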
#define SM4_TAO_L2(x, y) \
	SM4_SBOX(x, y); \
	; \ //#################### 4 parallel L2 linear transforms ##################//
	MOVOU x, y; \
	MOVOU x, XTMP6; \
	PSLLL $13, XTMP6; \
	PSRLL $19, y; \
	POR XTMP6, y; \ //y = x roll 13
	PSLLL $10, XTMP6; \ //XTMP6 = x << 23
	MOVOU x, XTMP7; \
	PSRLL $9, XTMP7; \
	POR XTMP6, XTMP7; \ //XTMP7 = x roll 23
	PXOR XTMP7, y; \
	PXOR y, x
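
// Per 32-bit word, SM4_TAO_L2 computes tau followed by the key-schedule
// linear transform L' (Go sketch, not part of the build):
//
//	func l2(b uint32) uint32 {
//		return b ^ bits.RotateLeft32(b, 13) ^ bits.RotateLeft32(b, 23)
//	}
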
#define SM4_ROUND(index, x, y, t0, t1, t2, t3) \
	PINSRD $0, (index * 4)(AX)(CX*1), x; \
	PSHUFD $0, x, x; \
	PXOR t1, x; \
	PXOR t2, x; \
	PXOR t3, x; \
	SM4_TAO_L1(x, y); \
	PXOR x, t0
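
// One round over four blocks at once: the round key at byte offset
// index*4 + CX of xk (AX) is broadcast to all lanes, and each register
// holds the same state word of all four blocks. Per block this is (sketch,
// with tau and l1 as described above):
//
//	t0 ^= l1(tau(t1 ^ t2 ^ t3 ^ rk))
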
#define SM4_SINGLE_ROUND(index, x, y, t0, t1, t2, t3) \
	PINSRD $0, (index * 4)(AX)(CX*1), x; \
	PXOR t1, x; \
	PXOR t2, x; \
	PXOR t3, x; \
	SM4_TAO_L1(x, y); \
	PXOR x, t0
#define SM4_EXPANDKEY_ROUND(index, x, y, t0, t1, t2, t3) \
	PINSRD $0, (index * 4)(BX)(CX*1), x; \
	PXOR t1, x; \
	PXOR t2, x; \
	PXOR t3, x; \
	SM4_TAO_L2(x, y); \
	PXOR x, t0; \
	PEXTRD $0, t0, R8; \
	MOVL R8, (index * 4)(DX)(CX*1); \
	MOVL R8, (12 - index * 4)(DI)(SI*1)
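
// One key-schedule round: t0 ^= L'(tau(t1 ^ t2 ^ t3 ^ ck[i])), with ck in
// BX. The new round key is written forward into enc (DX) and mirrored from
// the end into dec (DI, indexed by SI counting down), so the decryption
// keys come out already reversed.
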
#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7

#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7

#define XDWTMP0 Y0
#define XDWTMP1 Y1
#define XDWTMP2 Y2

#define XWTMP0 X0
#define XWTMP1 X1
#define XWTMP2 X2

#define NIBBLE_MASK Y3
#define X_NIBBLE_MASK X3

#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13 // mask to convert LE -> BE

#define XDWORD Y8
#define YDWORD Y9

#define XWORD X8
#define YWORD X9

#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
	VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2]  tmp2 = [w7, w3, w6, w2]
	VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0]  r0 = [w5, w1, w4, w0]
	VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16]  tmp1 = [w13, w9, w12, w8]
	VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w23, w30, w22, w27, w19, w26, w18]  r2 = [w15, w11, w14, w10]
	VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, w5, w25, w17, w9, w1]  r1 = [w13, w9, w5, w1]
	VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0]  r0 = [w12, w8, w4, w0]
	VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w23, w15, w7, w27, w19, w11, w3]  r3 = [w15, w11, w7, w3]
	VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2]  r2 = [w14, w10, w6, w2]
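
// The round macros expect register r_i to hold state word i of every block.
// This 4x4 dword transpose (performed independently in each 128-bit lane in
// the AVX2 case) converts between the one-block-per-register layout used for
// loads/stores and the one-word-per-register layout used by the rounds; it
// is applied again after the rounds to convert back.
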
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
#define AVX2_SM4_SBOX(x, y) \
	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK; \
	VPAND NIBBLE_MASK, x, XDWTMP1; \
	VBROADCASTI128 m1_low<>(SB), y; \
	VPSHUFB XDWTMP1, y, y; \
	VPSRLQ $4, x, x; \
	VPAND NIBBLE_MASK, x, x; \
	VBROADCASTI128 m1_high<>(SB), XDWTMP1; \
	VPSHUFB x, XDWTMP1, x; \
	VPXOR y, x, x; \
	VBROADCASTI128 inverse_shift_rows<>(SB), XDWTMP1; \
	VPSHUFB XDWTMP1, x, x; \
	VEXTRACTI128 $1, x, YWORD; \
	VAESENCLAST X_NIBBLE_MASK, XWORD, XWORD; \
	VAESENCLAST X_NIBBLE_MASK, YWORD, YWORD; \
	VINSERTI128 $1, YWORD, x, x; \
	VPANDN NIBBLE_MASK, x, XDWTMP1; \
	VBROADCASTI128 m2_low<>(SB), y; \
	VPSHUFB XDWTMP1, y, y; \
	VPSRLQ $4, x, x; \
	VPAND NIBBLE_MASK, x, x; \
	VBROADCASTI128 m2_high<>(SB), XDWTMP1; \
	VPSHUFB x, XDWTMP1, x; \
	VPXOR y, x, x
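
// VAESENCLAST operates on 128-bit registers (no 256-bit form is assumed
// here), so the 32-byte state is split with VEXTRACTI128, each half is run
// through VAESENCLAST, and the halves are rejoined with VINSERTI128.
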
#define AVX2_SM4_TAO_L1(x, y) \
	AVX2_SM4_SBOX(x, y); \
	VBROADCASTI128 r08_mask<>(SB), XDWTMP0; \
	VPSHUFB XDWTMP0, x, y; \
	VPXOR x, y, y; \
	VBROADCASTI128 r16_mask<>(SB), XDWTMP0; \
	VPSHUFB XDWTMP0, x, XDWTMP0; \
	VPXOR XDWTMP0, y, y; \
	VPSLLD $2, y, XDWTMP1; \
	VPSRLD $30, y, y; \
	VPXOR XDWTMP1, y, y; \
	VBROADCASTI128 r24_mask<>(SB), XDWTMP0; \
	VPSHUFB XDWTMP0, x, XDWTMP0; \
	VPXOR y, x, x; \
	VPXOR x, XDWTMP0, x
#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
	VPBROADCASTD (index * 4)(AX)(CX*1), x; \
	VPXOR t1, x, x; \
	VPXOR t2, x, x; \
	VPXOR t3, x, x; \
	AVX2_SM4_TAO_L1(x, y); \
	VPXOR x, t0, t0
#define AVX_SM4_SBOX(x, y) \
	VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK; \
	VPAND X_NIBBLE_MASK, x, XWTMP1; \
	VMOVDQU m1_low<>(SB), y; \
	VPSHUFB XWTMP1, y, y; \
	VPSRLQ $4, x, x; \
	VPAND X_NIBBLE_MASK, x, x; \
	VMOVDQU m1_high<>(SB), XWTMP1; \
	VPSHUFB x, XWTMP1, x; \
	VPXOR y, x, x; \
	VMOVDQU inverse_shift_rows<>(SB), XWTMP1; \
	VPSHUFB XWTMP1, x, x; \
	VAESENCLAST X_NIBBLE_MASK, x, x; \
	VPANDN X_NIBBLE_MASK, x, XWTMP1; \
	VMOVDQU m2_low<>(SB), y; \
	VPSHUFB XWTMP1, y, y; \
	VPSRLQ $4, x, x; \
	VPAND X_NIBBLE_MASK, x, x; \
	VMOVDQU m2_high<>(SB), XWTMP1; \
	VPSHUFB x, XWTMP1, x; \
	VPXOR y, x, x
#define AVX_SM4_TAO_L1(x, y) \
	AVX_SM4_SBOX(x, y); \
	VMOVDQU r08_mask<>(SB), XWTMP0; \
	VPSHUFB XWTMP0, x, y; \
	VPXOR x, y, y; \
	VMOVDQU r16_mask<>(SB), XWTMP0; \
	VPSHUFB XWTMP0, x, XWTMP0; \
	VPXOR XWTMP0, y, y; \
	VPSLLD $2, y, XWTMP1; \
	VPSRLD $30, y, y; \
	VPXOR XWTMP1, y, y; \
	VMOVDQU r24_mask<>(SB), XWTMP0; \
	VPSHUFB XWTMP0, x, XWTMP0; \
	VPXOR y, x, x; \
	VPXOR x, XWTMP0, x
#define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
	VPBROADCASTD (index * 4)(AX)(CX*1), x; \
	VPXOR t1, x, x; \
	VPXOR t2, x, x; \
	VPXOR t3, x, x; \
	AVX_SM4_TAO_L1(x, y); \
	VPXOR x, t0, t0
// func expandKeyAsm(key *byte, ck, enc, dec *uint32)
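// key points at the 16-byte cipher key, ck at the 32 CK round constants;
// enc and dec each receive the 32 round keys, dec in reverse order so it
// can be fed unchanged to the decryption path.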
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
	MOVQ key+0(FP), AX
	MOVQ ck+8(FP), BX
	MOVQ enc+16(FP), DX
	MOVQ dec+24(FP), DI

	MOVUPS 0(AX), t0
	PSHUFB flip_mask<>(SB), t0
	PXOR fk_mask<>(SB), t0
	PSHUFD $1, t0, t1
	PSHUFD $2, t0, t2
	PSHUFD $3, t0, t3

	XORL CX, CX
	MOVL $112, SI

loop:
	SM4_EXPANDKEY_ROUND(0, x, y, t0, t1, t2, t3)
	SM4_EXPANDKEY_ROUND(1, x, y, t1, t2, t3, t0)
	SM4_EXPANDKEY_ROUND(2, x, y, t2, t3, t0, t1)
	SM4_EXPANDKEY_ROUND(3, x, y, t3, t0, t1, t2)

	ADDL $16, CX
	SUBL $16, SI
	CMPL CX, $4*32
	JB loop

expand_end:
	RET

// func encryptBlocksAsm(xk *uint32, dst, src []byte)
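// Processes four blocks at a time with SSE; when useAVX2 is set, eight
// blocks via the AVX2 path, or four via the AVX path when src is no more
// than 64 bytes.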
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
	MOVQ xk+0(FP), AX
	MOVQ dst+8(FP), BX
	MOVQ src+32(FP), DX
	MOVQ src_len+40(FP), DI

	CMPB ·useAVX2(SB), $1
	JE avx2

non_avx2_start:
	PINSRD $0, 0(DX), t0
	PINSRD $1, 16(DX), t0
	PINSRD $2, 32(DX), t0
	PINSRD $3, 48(DX), t0
	PSHUFB flip_mask<>(SB), t0

	PINSRD $0, 4(DX), t1
	PINSRD $1, 20(DX), t1
	PINSRD $2, 36(DX), t1
	PINSRD $3, 52(DX), t1
	PSHUFB flip_mask<>(SB), t1

	PINSRD $0, 8(DX), t2
	PINSRD $1, 24(DX), t2
	PINSRD $2, 40(DX), t2
	PINSRD $3, 56(DX), t2
	PSHUFB flip_mask<>(SB), t2

	PINSRD $0, 12(DX), t3
	PINSRD $1, 28(DX), t3
	PINSRD $2, 44(DX), t3
	PINSRD $3, 60(DX), t3
	PSHUFB flip_mask<>(SB), t3

	XORL CX, CX

loop:
	SM4_ROUND(0, x, y, t0, t1, t2, t3)
	SM4_ROUND(1, x, y, t1, t2, t3, t0)
	SM4_ROUND(2, x, y, t2, t3, t0, t1)
	SM4_ROUND(3, x, y, t3, t0, t1, t2)

	ADDL $16, CX
	CMPL CX, $4*32
	JB loop

	PSHUFB flip_mask<>(SB), t3
	PSHUFB flip_mask<>(SB), t2
	PSHUFB flip_mask<>(SB), t1
	PSHUFB flip_mask<>(SB), t0
	MOVUPS t3, 0(BX)
	MOVUPS t2, 16(BX)
	MOVUPS t1, 32(BX)
	MOVUPS t0, 48(BX)
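
	// t3..t0 were stored word-interleaved above (each register held one
	// word of all four blocks); the swaps below transpose the 4x4 word
	// matrix in place at dst so each block becomes contiguous again.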
	MOVL 4(BX), R8
	MOVL 8(BX), R9
	MOVL 12(BX), R10
	MOVL 16(BX), R11
	MOVL 32(BX), R12
	MOVL 48(BX), R13
	MOVL R11, 4(BX)
	MOVL R12, 8(BX)
	MOVL R13, 12(BX)
	MOVL R8, 16(BX)
	MOVL R9, 32(BX)
	MOVL R10, 48(BX)
	MOVL 24(BX), R8
	MOVL 28(BX), R9
	MOVL 36(BX), R10
	MOVL 52(BX), R11
	MOVL R10, 24(BX)
	MOVL R11, 28(BX)
	MOVL R8, 36(BX)
	MOVL R9, 52(BX)
	MOVL 44(BX), R8
	MOVL 56(BX), R9
	MOVL R9, 44(BX)
	MOVL R8, 56(BX)

done_sm4:
	RET

avx2:
	CMPQ DI, $64
	JBE avx2_4blocks

avx2_8blocks:
	VMOVDQU 0(DX), XDWORD0
	VMOVDQU 32(DX), XDWORD1
	VMOVDQU 64(DX), XDWORD2
	VMOVDQU 96(DX), XDWORD3
	VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK

	// Apply Byte Flip Mask: LE -> BE
	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3

	// Transpose the 4x4 matrix of 32-bit words
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)

	XORL CX, CX

avx2_loop:
	AVX2_SM4_ROUND(0, XDWORD, YDWORD, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	AVX2_SM4_ROUND(1, XDWORD, YDWORD, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	AVX2_SM4_ROUND(2, XDWORD, YDWORD, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	AVX2_SM4_ROUND(3, XDWORD, YDWORD, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

	ADDL $16, CX
	CMPL CX, $4*32
	JB avx2_loop

	// Transpose the 4x4 matrix of 32-bit words
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)

	VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3

	VMOVDQU XDWORD0, 0(BX)
	VMOVDQU XDWORD1, 32(BX)
	VMOVDQU XDWORD2, 64(BX)
	VMOVDQU XDWORD3, 96(BX)
	JMP avx2_sm4_done

avx2_4blocks:
	VMOVDQU 0(DX), XWORD0
	VMOVDQU 16(DX), XWORD1
	VMOVDQU 32(DX), XWORD2
	VMOVDQU 48(DX), XWORD3

	VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK

	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3

	// Transpose the 4x4 matrix of 32-bit words
	TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)

	XORL CX, CX

avx_loop:
	AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3)
	AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0)
	AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1)
	AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2)

	ADDL $16, CX
	CMPL CX, $4*32
	JB avx_loop

	// Transpose the 4x4 matrix of 32-bit words
	TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)

	VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK
	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3

	VMOVDQU XWORD0, 0(BX)
	VMOVDQU XWORD1, 16(BX)
	VMOVDQU XWORD2, 32(BX)
	VMOVDQU XWORD3, 48(BX)

avx2_sm4_done:
	VZEROUPPER
	RET

// func encryptBlockAsm(xk *uint32, dst, src *byte)
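// Single-block variant of encryptBlocksAsm: the four state words live in
// the low dword of t0..t3, so SM4_SINGLE_ROUND skips the PSHUFD broadcast
// used by the four-block round.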
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
	MOVQ xk+0(FP), AX
	MOVQ dst+8(FP), BX
	MOVQ src+16(FP), DX

	PINSRD $0, 0(DX), t0
	PSHUFB flip_mask<>(SB), t0

	PINSRD $0, 4(DX), t1
	PSHUFB flip_mask<>(SB), t1

	PINSRD $0, 8(DX), t2
	PSHUFB flip_mask<>(SB), t2

	PINSRD $0, 12(DX), t3
	PSHUFB flip_mask<>(SB), t3

	XORL CX, CX

loop:
	SM4_SINGLE_ROUND(0, x, y, t0, t1, t2, t3)
	SM4_SINGLE_ROUND(1, x, y, t1, t2, t3, t0)
	SM4_SINGLE_ROUND(2, x, y, t2, t3, t0, t1)
	SM4_SINGLE_ROUND(3, x, y, t3, t0, t1, t2)

	ADDL $16, CX
	CMPL CX, $4*32
	JB loop

	PSHUFB flip_mask<>(SB), t3
	PSHUFB flip_mask<>(SB), t2
	PSHUFB flip_mask<>(SB), t1
	PSHUFB flip_mask<>(SB), t0
	MOVUPS t3, 0(BX)
	PEXTRD $0, t2, R8
	MOVL R8, 4(BX)
	PEXTRD $0, t1, R8
	MOVL R8, 8(BX)
	PEXTRD $0, t0, R8
	MOVL R8, 12(BX)

done_sm4:
	RET