diff --git a/sm4/aesni_macros_amd64.s b/sm4/aesni_macros_amd64.s
index 85bf876..10cb9a8 100644
--- a/sm4/aesni_macros_amd64.s
+++ b/sm4/aesni_macros_amd64.s
@@ -83,13 +83,13 @@ GLOBL fk_mask<>(SB), 8, $16
 // PUNPCKHQDQ r2, r3;
 // PUNPCKLQDQ r2, tmp2;
 // MOVOU tmp2, r2
-#define SSE_TRANSPOSE_MATRIX(r, r0, r1, r2, r3, tmp1, tmp2) \
+#define SSE_TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
 	MOVOU r0, tmp2; \
 	PUNPCKHLQ r1, tmp2; \
 	PUNPCKLLQ r1, r0; \
 	MOVOU r2, tmp1; \
 	PUNPCKLLQ r3, tmp1; \
-	PUNPCKHLQ r3, r2;  \
+	PUNPCKHLQ r3, r2; \
 	MOVOU r0, r1; \
 	PUNPCKHQDQ tmp1, r1; \
 	PUNPCKLQDQ tmp1, r0; \
diff --git a/sm4/asm_amd64.s b/sm4/asm_amd64.s
index 800c54e..84d4f0c 100644
--- a/sm4/asm_amd64.s
+++ b/sm4/asm_amd64.s
@@ -177,7 +177,7 @@ non_avx2_start:
 	PSHUFB flip_mask<>(SB), t1
 	PSHUFB flip_mask<>(SB), t2
 	PSHUFB flip_mask<>(SB), t3
-	SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y);
+	SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y)
 
 	XORL CX, CX
 
@@ -191,7 +191,7 @@ loop:
 	CMPL CX, $4*32
 	JB loop
 
-	SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y);
+	SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y);
 	PSHUFB bswap_mask<>(SB), t3
 	PSHUFB bswap_mask<>(SB), t2
 	PSHUFB bswap_mask<>(SB), t1
diff --git a/sm4/cbc_cipher_asm_amd64.s b/sm4/cbc_cipher_asm_amd64.s
index 6b38e3d..e46e232 100644
--- a/sm4/cbc_cipher_asm_amd64.s
+++ b/sm4/cbc_cipher_asm_amd64.s
@@ -156,29 +156,15 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
 	JE avx
 
 non_avx2_start:
-	PINSRD $0, 0(DX), t0
-	PINSRD $1, 16(DX), t0
-	PINSRD $2, 32(DX), t0
-	PINSRD $3, 48(DX), t0
+	MOVOU 0(DX), t0
+	MOVOU 16(DX), t1
+	MOVOU 32(DX), t2
+	MOVOU 48(DX), t3
 	PSHUFB flip_mask<>(SB), t0
-
-	PINSRD $0, 4(DX), t1
-	PINSRD $1, 20(DX), t1
-	PINSRD $2, 36(DX), t1
-	PINSRD $3, 52(DX), t1
 	PSHUFB flip_mask<>(SB), t1
-
-	PINSRD $0, 8(DX), t2
-	PINSRD $1, 24(DX), t2
-	PINSRD $2, 40(DX), t2
-	PINSRD $3, 56(DX), t2
 	PSHUFB flip_mask<>(SB), t2
-
-	PINSRD $0, 12(DX), t3
-	PINSRD $1, 28(DX), t3
-	PINSRD $2, 44(DX), t3
-	PINSRD $3, 60(DX), t3
 	PSHUFB flip_mask<>(SB), t3
+	SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y)
 
 	XORL CX, CX
 
@@ -192,22 +178,21 @@ loop:
 	CMPL CX, $4*32
 	JB loop
 
-	PSHUFB flip_mask<>(SB), t3
-	PSHUFB flip_mask<>(SB), t2
-	PSHUFB flip_mask<>(SB), t1
-	PSHUFB flip_mask<>(SB), t0
+	SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y);
+	PSHUFB bswap_mask<>(SB), t3
+	PSHUFB bswap_mask<>(SB), t2
+	PSHUFB bswap_mask<>(SB), t1
+	PSHUFB bswap_mask<>(SB), t0
 
-	SSE_TRANSPOSE_MATRIX(CX, t3, t2, t1, t0, XWORD, YWORD)
+	PXOR 0(SI), t0
+	PXOR 16(SI), t1
+	PXOR 32(SI), t2
+	PXOR 48(SI), t3
 
-	PXOR 0(SI), t3
-	PXOR 16(SI), t2
-	PXOR 32(SI), t1
-	PXOR 48(SI), t0
-
-	MOVUPS t3, 0(BX)
-	MOVUPS t2, 16(BX)
-	MOVUPS t1, 32(BX)
-	MOVUPS t0, 48(BX)
+	MOVUPS t0, 0(BX)
+	MOVUPS t1, 16(BX)
+	MOVUPS t2, 32(BX)
+	MOVUPS t3, 48(BX)
 
 done_sm4:
 	RET
diff --git a/sm4/gcm_amd64.s b/sm4/gcm_amd64.s
index d964838..4823fdf 100644
--- a/sm4/gcm_amd64.s
+++ b/sm4/gcm_amd64.s
@@ -216,7 +216,7 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
 	PSHUFB flip_mask<>(SB), t1; \
 	PSHUFB flip_mask<>(SB), t2; \
 	PSHUFB flip_mask<>(SB), t3; \
-	SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y); \
+	SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
 	XORL IND, IND; \
 	SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
 	SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
@@ -257,7 +257,7 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
 	SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
 	SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
 	SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
-	SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y); \
+	SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
 	PSHUFB BSWAP, t3; \
 	PSHUFB BSWAP, t2; \
 	PSHUFB BSWAP, t1; \
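Note on the macro change: the `r` parameter dropped from `SSE_TRANSPOSE_MATRIX` is never referenced in the macro body, so call sites no longer have to pass a scratch register (R12/CX) for it. The PUNPCKL/PUNPCKH sequence is a plain 4x4 transpose of 32-bit words across the four XMM registers, after which t0..t3 each hold the same word position from all four blocks. A minimal pure-Go sketch of that data movement, with illustrative names only (not code from this repo):

```go
package main

import "fmt"

// transpose4x4 mirrors the data movement of SSE_TRANSPOSE_MATRIX:
// four rows of four 32-bit words (one SM4 block per row) become four
// columns, so each output row holds the same word index from all blocks.
// Pure-Go illustration only; the real implementation stays in assembly.
func transpose4x4(m [4][4]uint32) [4][4]uint32 {
	var out [4][4]uint32
	for i := 0; i < 4; i++ {
		for j := 0; j < 4; j++ {
			out[j][i] = m[i][j]
		}
	}
	return out
}

func main() {
	blocks := [4][4]uint32{
		{0x00, 0x01, 0x02, 0x03}, // block 0: words 0..3
		{0x10, 0x11, 0x12, 0x13}, // block 1
		{0x20, 0x21, 0x22, 0x23}, // block 2
		{0x30, 0x31, 0x32, 0x33}, // block 3
	}
	fmt.Println(transpose4x4(blocks)) // row i now holds word i of every block
}
```

Because the transpose handles the word reordering, the non-AVX2 CBC decrypt path can load each block with a single MOVOU and then transpose, instead of gathering words with four PINSRD instructions per register.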