diff --git a/sm4/aesni_macros_amd64.s b/sm4/aesni_macros_amd64.s
index eb880ac..85bf876 100644
--- a/sm4/aesni_macros_amd64.s
+++ b/sm4/aesni_macros_amd64.s
@@ -84,60 +84,19 @@ GLOBL fk_mask<>(SB), 8, $16
 // PUNPCKLQDQ r2, tmp2;
 // MOVOU tmp2, r2
 #define SSE_TRANSPOSE_MATRIX(r, r0, r1, r2, r3, tmp1, tmp2) \
-	PEXTRD $2, r0, r;   \
-	PINSRD $0, r, tmp2; \
-	PEXTRD $2, r1, r;   \
-	PINSRD $1, r, tmp2; \
-	;                   \
-	PEXTRD $3, r0, r;   \
-	PINSRD $2, r, tmp2; \
-	PEXTRD $3, r1, r;   \
-	PINSRD $3, r, tmp2; \ // tmp2 = [w7, w3, w6, w2]
-	;                   \
-	PEXTRD $1, r0, r;   \
-	PINSRD $2, r, r0;   \
-	PEXTRD $0, r1, r;   \
-	PINSRD $1, r, r0;   \
-	PEXTRD $1, r1, r;   \
-	PINSRD $3, r, r0;   \ // r0 = [w5, w1, w4, w0]
-	;                   \
-	PEXTRD $0, r2, r;   \
-	PINSRD $0, r, tmp1; \
-	PEXTRD $0, r3, r;   \
-	PINSRD $1, r, tmp1; \
-	PEXTRD $1, r2, r;   \
-	PINSRD $2, r, tmp1; \
-	PEXTRD $1, r3, r;   \
-	PINSRD $3, r, tmp1; \ // tmp1 = [w13, w9, w12, w8]
-	;                   \
-	PEXTRD $2, r2, r;   \
-	PINSRD $0, r, r2;   \
-	PEXTRD $2, r3, r;   \
-	PINSRD $1, r, r2;   \
-	PEXTRD $3, r2, r;   \
-	PINSRD $2, r, r2;   \
-	PEXTRD $3, r3, r;   \
-	PINSRD $3, r, r2;   \ // r2 = [w15, w11, w14, w10]
-	;                   \
-	MOVOU r0, r1;       \
-	PEXTRQ $1, r1, r;   \
-	PINSRQ $0, r, r1;   \
-	PEXTRQ $1, tmp1, r; \
-	PINSRQ $1, r, r1;   \ // r1 = [w13, w9, w5, w1]
-	;                   \
-	PEXTRQ $0, tmp1, r; \
-	PINSRQ $1, r, r0;   \ // r0 = [w12, w8, w4, w0]
-	;                   \
-	MOVOU tmp2, r3;     \
-	PEXTRQ $1, r3, r;   \
-	PINSRQ $0, r, r3;   \
-	PEXTRQ $1, r2, r;   \
-	PINSRQ $1, r, r3;   \ // r3 = [w15, w11, w7, w3]
-	;                   \
-	PEXTRQ $0, r2, r;   \
-	PINSRQ $1, r, r2;   \
-	PEXTRQ $0, tmp2, r; \
-	PINSRQ $0, r, r2
+	MOVOU r0, tmp2;      \
+	PUNPCKHLQ r1, tmp2;  \ // tmp2 = [w7, w3, w6, w2]
+	PUNPCKLLQ r1, r0;    \ // r0 = [w5, w1, w4, w0]
+	MOVOU r2, tmp1;      \
+	PUNPCKLLQ r3, tmp1;  \ // tmp1 = [w13, w9, w12, w8]
+	PUNPCKHLQ r3, r2;    \ // r2 = [w15, w11, w14, w10]
+	MOVOU r0, r1;        \
+	PUNPCKHQDQ tmp1, r1; \ // r1 = [w13, w9, w5, w1]
+	PUNPCKLQDQ tmp1, r0; \ // r0 = [w12, w8, w4, w0]
+	MOVOU tmp2, r3;      \
+	PUNPCKHQDQ r2, r3;   \ // r3 = [w15, w11, w7, w3]
+	PUNPCKLQDQ r2, tmp2; \ // tmp2 = [w14, w10, w6, w2]
+	MOVOU tmp2, r2
 
 // SM4 sbox function
 // parameters:
diff --git a/sm4/asm_amd64.s b/sm4/asm_amd64.s
index 191d554..800c54e 100644
--- a/sm4/asm_amd64.s
+++ b/sm4/asm_amd64.s
@@ -169,29 +169,15 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
 	JE avx
 
 non_avx2_start:
-	PINSRD $0, 0(DX), t0
-	PINSRD $1, 16(DX), t0
-	PINSRD $2, 32(DX), t0
-	PINSRD $3, 48(DX), t0
+	MOVOU 0(DX), t0
+	MOVOU 16(DX), t1
+	MOVOU 32(DX), t2
+	MOVOU 48(DX), t3
 	PSHUFB flip_mask<>(SB), t0
-
-	PINSRD $0, 4(DX), t1
-	PINSRD $1, 20(DX), t1
-	PINSRD $2, 36(DX), t1
-	PINSRD $3, 52(DX), t1
 	PSHUFB flip_mask<>(SB), t1
-
-	PINSRD $0, 8(DX), t2
-	PINSRD $1, 24(DX), t2
-	PINSRD $2, 40(DX), t2
-	PINSRD $3, 56(DX), t2
 	PSHUFB flip_mask<>(SB), t2
-
-	PINSRD $0, 12(DX), t3
-	PINSRD $1, 28(DX), t3
-	PINSRD $2, 44(DX), t3
-	PINSRD $3, 60(DX), t3
 	PSHUFB flip_mask<>(SB), t3
+	SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y);
 
 	XORL CX, CX
 
@@ -205,38 +191,16 @@ loop:
 	CMPL CX, $4*32
 	JB loop
 
-	PSHUFB flip_mask<>(SB), t3
-	PSHUFB flip_mask<>(SB), t2
-	PSHUFB flip_mask<>(SB), t1
-	PSHUFB flip_mask<>(SB), t0
-	MOVUPS t3, 0(BX)
-	MOVUPS t2, 16(BX)
-	MOVUPS t1, 32(BX)
-	MOVUPS t0, 48(BX)
-	MOVL 4(BX), R8
-	MOVL 8(BX), R9
-	MOVL 12(BX), R10
-	MOVL 16(BX), R11
-	MOVL 32(BX), R12
-	MOVL 48(BX), R13
-	MOVL R11, 4(BX)
-	MOVL R12, 8(BX)
-	MOVL R13, 12(BX)
-	MOVL R8, 16(BX)
-	MOVL R9, 32(BX)
-	MOVL R10, 48(BX)
-	MOVL 24(BX), R8
-	MOVL 28(BX), R9
-	MOVL 36(BX), R10
-	MOVL 52(BX), R11
-	MOVL R10, 24(BX)
-	MOVL R11, 28(BX)
-	MOVL R8, 36(BX)
-	MOVL R9, 52(BX)
-	MOVL 44(BX), R8
-	MOVL 56(BX), R9
-	MOVL R9, 44(BX)
-	MOVL R8, 56(BX)
+	SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y);
+	PSHUFB bswap_mask<>(SB), t3
+	PSHUFB bswap_mask<>(SB), t2
+	PSHUFB bswap_mask<>(SB), t1
+	PSHUFB bswap_mask<>(SB), t0
+
+	MOVOU t0, 0(BX)
+	MOVOU t1, 16(BX)
+	MOVOU t2, 32(BX)
+	MOVOU t3, 48(BX)
 
 done_sm4:
 	RET
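
For reference, the new macro is the standard unpack-based 4x4 transpose. Below is a minimal Go sketch of what it computes, modelling each 128-bit XMM register as four 32-bit lanes (lane 0 lowest); the `vec` type and helper names are illustrative, not part of this package. The helpers mirror the Go-assembly forms PUNPCKLLQ/PUNPCKHLQ (Intel PUNPCKLDQ/PUNPCKHDQ) and PUNPCKLQDQ/PUNPCKHQDQ, with Go's `src, dst` operand order written as `(dst, src)` arguments:

```go
package main

import "fmt"

// vec models one XMM register as four 32-bit lanes, lane 0 lowest.
type vec [4]uint32

// unpackLo32 mirrors PUNPCKLLQ src, dst: interleave the low 32-bit lanes.
func unpackLo32(dst, src vec) vec { return vec{dst[0], src[0], dst[1], src[1]} }

// unpackHi32 mirrors PUNPCKHLQ src, dst: interleave the high 32-bit lanes.
func unpackHi32(dst, src vec) vec { return vec{dst[2], src[2], dst[3], src[3]} }

// unpackLo64 mirrors PUNPCKLQDQ src, dst: concatenate the low 64-bit halves.
func unpackLo64(dst, src vec) vec { return vec{dst[0], dst[1], src[0], src[1]} }

// unpackHi64 mirrors PUNPCKHQDQ src, dst: concatenate the high 64-bit halves.
func unpackHi64(dst, src vec) vec { return vec{dst[2], dst[3], src[2], src[3]} }

// transpose follows the new SSE_TRANSPOSE_MATRIX step for step
// (lane comments are written low lane first).
func transpose(r0, r1, r2, r3 vec) (vec, vec, vec, vec) {
	tmp2 := unpackHi32(r0, r1) // [w2, w6, w3, w7]
	r0 = unpackLo32(r0, r1)    // [w0, w4, w1, w5]
	tmp1 := unpackLo32(r2, r3) // [w8, w12, w9, w13]
	r2 = unpackHi32(r2, r3)    // [w10, w14, w11, w15]
	r1 = unpackHi64(r0, tmp1)  // [w1, w5, w9, w13]
	r0 = unpackLo64(r0, tmp1)  // [w0, w4, w8, w12]
	r3 = unpackHi64(tmp2, r2)  // [w3, w7, w11, w15]
	r2 = unpackLo64(tmp2, r2)  // [w2, w6, w10, w14]
	return r0, r1, r2, r3
}

func main() {
	r0 := vec{0, 1, 2, 3} // w0..w3
	r1 := vec{4, 5, 6, 7} // w4..w7
	r2 := vec{8, 9, 10, 11}
	r3 := vec{12, 13, 14, 15}
	fmt.Println(transpose(r0, r1, r2, r3))
	// [0 4 8 12] [1 5 9 13] [2 6 10 14] [3 7 11 15]
}
```

Two rounds of interleaving (32-bit lanes, then 64-bit halves) transpose the matrix in 13 instructions, versus 46 by my count for the PEXTRD/PINSRD version, and without bouncing every word through a general-purpose register.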
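The asm_amd64.s side then becomes straightforward. On input, each block is loaded with one MOVOU, byte-swapped per word by flip_mask<>, and the four registers are transposed once, instead of gathering word i of every block with four PINSRDs. On output, one more transpose plus a PSHUFB with bswap_mask<> replaces the long scalar MOVL shuffle; for the two to agree, bswap_mask<> must be the full 16-byte reversal, which folds the back-conversion to big-endian and SM4's final word reversal into a single shuffle. A hypothetical model of that data flow, continuing in the same file as the sketch above (it reuses vec and transpose, and needs encoding/binary imported):

```go
// loadBlocks models the new non_avx2 prologue: four straight 16-byte
// loads plus flip_mask<>'s per-word byte swap (modelled by reading
// big-endian), then one transpose so ti holds word i of blocks 0..3.
func loadBlocks(src []byte) (t0, t1, t2, t3 vec) {
	var rows [4]vec
	for blk := 0; blk < 4; blk++ {
		for w := 0; w < 4; w++ {
			rows[blk][w] = binary.BigEndian.Uint32(src[16*blk+4*w:])
		}
	}
	return transpose(rows[0], rows[1], rows[2], rows[3])
}

// storeBlocks models the new epilogue: transpose back to per-block
// rows, then emulate bswap_mask<>'s assumed 16-byte reversal, which
// both restores big-endian byte order and reverses the word order
// (SM4's final reverse transform R).
func storeBlocks(dst []byte, t0, t1, t2, t3 vec) {
	var rows [4]vec
	rows[0], rows[1], rows[2], rows[3] = transpose(t0, t1, t2, t3)
	for blk := 0; blk < 4; blk++ {
		for w := 0; w < 4; w++ {
			// Output word w of a block comes from lane 3-w of its row.
			binary.BigEndian.PutUint32(dst[16*blk+4*w:], rows[blk][3-w])
		}
	}
}
```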