sm4: optimize sse version

This commit is contained in:
emmansun 2023-07-03 16:20:04 +08:00
parent f10b09f5a2
commit a53659eb5b
2 changed files with 28 additions and 105 deletions

View File

@ -84,60 +84,19 @@ GLOBL fk_mask<>(SB), 8, $16
// PUNPCKLQDQ r2, tmp2;
// MOVOU tmp2, r2
// SSE_TRANSPOSE_MATRIX transposes the 4x4 matrix of 32-bit words held in r0..r3.
// parameters:
// -  r: general-purpose scratch register; only the PEXTRD/PINSRD and
//       PEXTRQ/PINSRQ sequence uses it, the unpack sequence never touches it
// -  r0, r1, r2, r3: 128-bit registers holding the four rows; overwritten with the result
// -  tmp1, tmp2: 128-bit temporary registers, clobbered
// Lane notation [a, b, c, d] lists 32-bit lanes 3..0. On entry
// r0 = [w3, w2, w1, w0], r1 = [w7, w6, w5, w4], r2 = [w11, w10, w9, w8],
// r3 = [w15, w14, w13, w12]; on exit r0 = [w12, w8, w4, w0],
// r1 = [w13, w9, w5, w1], r2 = [w14, w10, w6, w2], r3 = [w15, w11, w7, w3].
#define SSE_TRANSPOSE_MATRIX(r, r0, r1, r2, r3, tmp1, tmp2) \
PEXTRD $2, r0, r; \
PINSRD $0, r, tmp2; \
PEXTRD $2, r1, r; \
PINSRD $1, r, tmp2; \
; \
PEXTRD $3, r0, r; \
PINSRD $2, r, tmp2; \
PEXTRD $3, r1, r; \
PINSRD $3, r, tmp2; \ // tmp2 = [w7, w3, w6, w2]
; \
PEXTRD $1, r0, r; \
PINSRD $2, r, r0; \
PEXTRD $0, r1, r; \
PINSRD $1, r, r0; \
PEXTRD $1, r1, r; \
PINSRD $3, r, r0; \ // r0 = [w5, w1, w4, w0]
; \
PEXTRD $0, r2, r; \
PINSRD $0, r, tmp1; \
PEXTRD $0, r3, r; \
PINSRD $1, r, tmp1; \
PEXTRD $1, r2, r; \
PINSRD $2, r, tmp1; \
PEXTRD $1, r3, r; \
PINSRD $3, r, tmp1; \ // tmp1 = [w13, w9, w12, w8]
; \
PEXTRD $2, r2, r; \
PINSRD $0, r, r2; \
PEXTRD $2, r3, r; \
PINSRD $1, r, r2; \
PEXTRD $3, r2, r; \
PINSRD $2, r, r2; \
PEXTRD $3, r3, r; \
PINSRD $3, r, r2; \ // r2 = [w15, w11, w14, w10]
; \
MOVOU r0, r1; \
PEXTRQ $1, r1, r; \
PINSRQ $0, r, r1; \
PEXTRQ $1, tmp1, r; \
PINSRQ $1, r, r1; \ // r1 = [w13, w9, w5, w1]
; \
PEXTRQ $0, tmp1, r; \
PINSRQ $1, r, r0; \ // r0 = [w12, w8, w4, w0]
; \
MOVOU tmp2, r3; \
PEXTRQ $1, r3, r; \
PINSRQ $0, r, r3; \
PEXTRQ $1, r2, r; \
PINSRQ $1, r, r3; \ // r3 = [w15, w11, w7, w3]
; \
PEXTRQ $0, r2, r; \
PINSRQ $1, r, r2; \
PEXTRQ $0, tmp2, r; \
PINSRQ $0, r, r2 // r2 = [w14, w10, w6, w2]
// NOTE(review): the line above has no trailing backslash, so the #define
// continuation chain ends there. The 13 lines below compute the same final
// r0..r3 with dword/qword unpack instructions and never use r. In this diff
// rendering (change markers stripped) they appear to be the replacement body
// for the PEXTRD/PINSRD sequence above - confirm against the commit.
MOVOU r0, tmp2; \ // tmp2 = [w3, w2, w1, w0]
PUNPCKHLQ r1, tmp2; \ // tmp2 = [w7, w3, w6, w2]
PUNPCKLLQ r1, r0; \ // r0 = [w5, w1, w4, w0]
MOVOU r2, tmp1; \ // tmp1 = [w11, w10, w9, w8]
PUNPCKLLQ r3, tmp1; \ // tmp1 = [w13, w9, w12, w8]
PUNPCKHLQ r3, r2; \ // r2 = [w15, w11, w14, w10]
MOVOU r0, r1; \ // r1 = [w5, w1, w4, w0]
PUNPCKHQDQ tmp1, r1; \ // r1 = [w13, w9, w5, w1]
PUNPCKLQDQ tmp1, r0; \ // r0 = [w12, w8, w4, w0]
MOVOU tmp2, r3; \ // r3 = [w7, w3, w6, w2]
PUNPCKHQDQ r2, r3; \ // r3 = [w15, w11, w7, w3]
PUNPCKLQDQ r2, tmp2; \ // tmp2 = [w14, w10, w6, w2]
MOVOU tmp2, r2 // r2 = [w14, w10, w6, w2]
// SM4 sbox function
// parameters:

View File

@ -169,29 +169,15 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
JE avx
non_avx2_start:
// Input load for the non-AVX2 path. Two load strategies are visible in this
// hunk. NOTE(review): change markers were stripped from this diff rendering;
// the PINSRD groups appear to be the removed lines and the MOVOU group the
// added ones - confirm against the commit.
//
// Gather form: lane i of t0 receives the 32-bit word at DX+16*i, so t0
// collects word 0 of each 16-byte group.
PINSRD $0, 0(DX), t0
PINSRD $1, 16(DX), t0
PINSRD $2, 32(DX), t0
PINSRD $3, 48(DX), t0
// Straight form: t0..t3 each receive one whole 16-byte group from DX.
MOVOU 0(DX), t0
MOVOU 16(DX), t1
MOVOU 32(DX), t2
MOVOU 48(DX), t3
// Byte-shuffle t0 with flip_mask (mask contents are defined outside this view).
PSHUFB flip_mask<>(SB), t0
// Gather form for t1: word 1 of each 16-byte group (offsets 4, 20, 36, 52).
PINSRD $0, 4(DX), t1
PINSRD $1, 20(DX), t1
PINSRD $2, 36(DX), t1
PINSRD $3, 52(DX), t1
PSHUFB flip_mask<>(SB), t1
// Gather form for t2: word 2 of each 16-byte group (offsets 8, 24, 40, 56).
PINSRD $0, 8(DX), t2
PINSRD $1, 24(DX), t2
PINSRD $2, 40(DX), t2
PINSRD $3, 56(DX), t2
PSHUFB flip_mask<>(SB), t2
// Gather form for t3: word 3 of each 16-byte group (offsets 12, 28, 44, 60).
PINSRD $0, 12(DX), t3
PINSRD $1, 28(DX), t3
PINSRD $2, 44(DX), t3
PINSRD $3, 60(DX), t3
PSHUFB flip_mask<>(SB), t3
// Transpose t0..t3 in registers; R12 is passed as the macro's general-purpose
// scratch, x and y as its two temporaries.
SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y);
// CX = 0 before the code that follows (not visible in this hunk).
XORL CX, CX
@ -205,38 +191,16 @@ loop:
CMPL CX, $4*32
JB loop
// Output store. Two store strategies are visible in this hunk.
// NOTE(review): change markers were stripped from this diff rendering; the
// flip_mask/MOVUPS/MOVL sequence appears to be the removed code and the
// SSE_TRANSPOSE_MATRIX/bswap_mask/MOVOU sequence the added code - confirm
// against the commit.
//
// Memory form: byte-shuffle each register with flip_mask, then store in
// reverse register order (t3 at offset 0 ... t0 at offset 48).
PSHUFB flip_mask<>(SB), t3
PSHUFB flip_mask<>(SB), t2
PSHUFB flip_mask<>(SB), t1
PSHUFB flip_mask<>(SB), t0
MOVUPS t3, 0(BX)
MOVUPS t2, 16(BX)
MOVUPS t1, 32(BX)
MOVUPS t0, 48(BX)
// Transpose the 64-byte buffer at BX in place as a 4x4 matrix of 32-bit
// words, going through general-purpose registers.
// First pass swaps offsets 4<->16, 8<->32 and 12<->48.
MOVL 4(BX), R8
MOVL 8(BX), R9
MOVL 12(BX), R10
MOVL 16(BX), R11
MOVL 32(BX), R12
MOVL 48(BX), R13
MOVL R11, 4(BX)
MOVL R12, 8(BX)
MOVL R13, 12(BX)
MOVL R8, 16(BX)
MOVL R9, 32(BX)
MOVL R10, 48(BX)
// Second pass swaps offsets 24<->36 and 28<->52.
MOVL 24(BX), R8
MOVL 28(BX), R9
MOVL 36(BX), R10
MOVL 52(BX), R11
MOVL R10, 24(BX)
MOVL R11, 28(BX)
MOVL R8, 36(BX)
MOVL R9, 52(BX)
// Last pass swaps offsets 44<->56.
MOVL 44(BX), R8
MOVL 56(BX), R9
MOVL R9, 44(BX)
MOVL R8, 56(BX)
// Register form: transpose t0..t3 in registers (R12 as the macro's
// general-purpose scratch, x and y as temporaries), byte-shuffle each with
// bswap_mask (mask contents are defined outside this view), then store
// t0..t3 in order at offsets 0, 16, 32, 48 of BX.
SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y);
PSHUFB bswap_mask<>(SB), t3
PSHUFB bswap_mask<>(SB), t2
PSHUFB bswap_mask<>(SB), t1
PSHUFB bswap_mask<>(SB), t0
MOVOU t0, 0(BX)
MOVOU t1, 16(BX)
MOVOU t2, 32(BX)
MOVOU t3, 48(BX)
// Exit label; other paths that jump here are outside this view.
done_sm4:
RET