sm4: optimize sse version

This commit is contained in:
emmansun 2023-07-03 16:20:04 +08:00
parent f10b09f5a2
commit a53659eb5b
2 changed files with 28 additions and 105 deletions

View File

@ -84,60 +84,19 @@ GLOBL fk_mask<>(SB), 8, $16
// PUNPCKLQDQ r2, tmp2; // PUNPCKLQDQ r2, tmp2;
// MOVOU tmp2, r2 // MOVOU tmp2, r2
#define SSE_TRANSPOSE_MATRIX(r, r0, r1, r2, r3, tmp1, tmp2) \ #define SSE_TRANSPOSE_MATRIX(r, r0, r1, r2, r3, tmp1, tmp2) \
PEXTRD $2, r0, r; \ MOVOU r0, tmp2; \
PINSRD $0, r, tmp2; \ PUNPCKHLQ r1, tmp2; \
PEXTRD $2, r1, r; \ PUNPCKLLQ r1, r0; \
PINSRD $1, r, tmp2; \ MOVOU r2, tmp1; \
; \ PUNPCKLLQ r3, tmp1; \
PEXTRD $3, r0, r; \ PUNPCKHLQ r3, r2; \
PINSRD $2, r, tmp2; \ MOVOU r0, r1; \
PEXTRD $3, r1, r; \ PUNPCKHQDQ tmp1, r1; \
PINSRD $3, r, tmp2; \ // tmp2 = [w7, w3, w6, w2] PUNPCKLQDQ tmp1, r0; \
; \ MOVOU tmp2, r3; \
PEXTRD $1, r0, r; \ PUNPCKHQDQ r2, r3; \
PINSRD $2, r, r0; \ PUNPCKLQDQ r2, tmp2; \
PEXTRD $0, r1, r; \ MOVOU tmp2, r2
PINSRD $1, r, r0; \
PEXTRD $1, r1, r; \
PINSRD $3, r, r0; \ // r0 = [w5, w1, w4, w0]
; \
PEXTRD $0, r2, r; \
PINSRD $0, r, tmp1; \
PEXTRD $0, r3, r; \
PINSRD $1, r, tmp1; \
PEXTRD $1, r2, r; \
PINSRD $2, r, tmp1; \
PEXTRD $1, r3, r; \
PINSRD $3, r, tmp1; \ // tmp1 = [w13, w9, w12, w8]
; \
PEXTRD $2, r2, r; \
PINSRD $0, r, r2; \
PEXTRD $2, r3, r; \
PINSRD $1, r, r2; \
PEXTRD $3, r2, r; \
PINSRD $2, r, r2; \
PEXTRD $3, r3, r; \
PINSRD $3, r, r2; \ // r2 = [w15, w11, w14, w10]
; \
MOVOU r0, r1; \
PEXTRQ $1, r1, r; \
PINSRQ $0, r, r1; \
PEXTRQ $1, tmp1, r; \
PINSRQ $1, r, r1; \ // r1 = [w13, w9, w5, w1]
; \
PEXTRQ $0, tmp1, r; \
PINSRQ $1, r, r0; \ // r0 = [w12, w8, w4, w0]
; \
MOVOU tmp2, r3; \
PEXTRQ $1, r3, r; \
PINSRQ $0, r, r3; \
PEXTRQ $1, r2, r; \
PINSRQ $1, r, r3; \ // r3 = [w15, w11, w7, w3]
; \
PEXTRQ $0, r2, r; \
PINSRQ $1, r, r2; \
PEXTRQ $0, tmp2, r; \
PINSRQ $0, r, r2
// SM4 sbox function // SM4 sbox function
// parameters: // parameters:

View File

@ -169,29 +169,15 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
JE avx JE avx
non_avx2_start: non_avx2_start:
PINSRD $0, 0(DX), t0 MOVOU 0(DX), t0
PINSRD $1, 16(DX), t0 MOVOU 16(DX), t1
PINSRD $2, 32(DX), t0 MOVOU 32(DX), t2
PINSRD $3, 48(DX), t0 MOVOU 48(DX), t3
PSHUFB flip_mask<>(SB), t0 PSHUFB flip_mask<>(SB), t0
PINSRD $0, 4(DX), t1
PINSRD $1, 20(DX), t1
PINSRD $2, 36(DX), t1
PINSRD $3, 52(DX), t1
PSHUFB flip_mask<>(SB), t1 PSHUFB flip_mask<>(SB), t1
PINSRD $0, 8(DX), t2
PINSRD $1, 24(DX), t2
PINSRD $2, 40(DX), t2
PINSRD $3, 56(DX), t2
PSHUFB flip_mask<>(SB), t2 PSHUFB flip_mask<>(SB), t2
PINSRD $0, 12(DX), t3
PINSRD $1, 28(DX), t3
PINSRD $2, 44(DX), t3
PINSRD $3, 60(DX), t3
PSHUFB flip_mask<>(SB), t3 PSHUFB flip_mask<>(SB), t3
SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y);
XORL CX, CX XORL CX, CX
@ -205,38 +191,16 @@ loop:
CMPL CX, $4*32 CMPL CX, $4*32
JB loop JB loop
PSHUFB flip_mask<>(SB), t3 SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y);
PSHUFB flip_mask<>(SB), t2 PSHUFB bswap_mask<>(SB), t3
PSHUFB flip_mask<>(SB), t1 PSHUFB bswap_mask<>(SB), t2
PSHUFB flip_mask<>(SB), t0 PSHUFB bswap_mask<>(SB), t1
MOVUPS t3, 0(BX) PSHUFB bswap_mask<>(SB), t0
MOVUPS t2, 16(BX)
MOVUPS t1, 32(BX) MOVOU t0, 0(BX)
MOVUPS t0, 48(BX) MOVOU t1, 16(BX)
MOVL 4(BX), R8 MOVOU t2, 32(BX)
MOVL 8(BX), R9 MOVOU t3, 48(BX)
MOVL 12(BX), R10
MOVL 16(BX), R11
MOVL 32(BX), R12
MOVL 48(BX), R13
MOVL R11, 4(BX)
MOVL R12, 8(BX)
MOVL R13, 12(BX)
MOVL R8, 16(BX)
MOVL R9, 32(BX)
MOVL R10, 48(BX)
MOVL 24(BX), R8
MOVL 28(BX), R9
MOVL 36(BX), R10
MOVL 52(BX), R11
MOVL R10, 24(BX)
MOVL R11, 28(BX)
MOVL R8, 36(BX)
MOVL R9, 52(BX)
MOVL 44(BX), R8
MOVL 56(BX), R9
MOVL R9, 44(BX)
MOVL R8, 56(BX)
done_sm4: done_sm4:
RET RET