diff --git a/sm4/cbc_amd64.s b/sm4/cbc_amd64.s index 5895413..b415f5a 100644 --- a/sm4/cbc_amd64.s +++ b/sm4/cbc_amd64.s @@ -132,7 +132,9 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0 JE avx2Start CMPB ·useAVX(SB), $1 - JE avxCbcSm4Octets + JE avxStart + + MOVOU -16(DX), X15 cbcSm4Octets: CMPQ DI, $128 @@ -155,7 +157,7 @@ cbcSm4Octets: PXOR -16(DX), XWORD0 PXOR 0(DX), XWORD1 PXOR 16(DX), XWORD2 - PXOR 32(DX), XWORD3 + PXOR 32(DX), XWORD3 PXOR 48(DX), XWORD4 PXOR 64(DX), XWORD5 PXOR 80(DX), XWORD6 @@ -211,8 +213,6 @@ cbCSm4Single: MOVOU -32(DX), XWORD2 MOVOU -16(DX), XWORD3 - MOVOU XWORD0, XWORD4 - SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) PXOR 0(SI), XWORD0 @@ -224,28 +224,24 @@ cbCSm4Single: MOVUPS XWORD1, -48(BX) MOVUPS XWORD2, -32(BX) MOVUPS XWORD3, -16(BX) - MOVUPS XWORD4, (SI) + JMP cbcSm4Done cbcSm4Single16: MOVOU -16(DX), XWORD0 - MOVOU XWORD0, XWORD4 - SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) PXOR 0(SI), XWORD0 MOVUPS XWORD0, -16(BX) - MOVUPS XWORD4, (SI) + JMP cbcSm4Done cbcSm4Single32: MOVOU -32(DX), XWORD0 MOVOU -16(DX), XWORD1 - MOVOU XWORD0, XWORD4 - SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) PXOR 0(SI), XWORD0 @@ -253,7 +249,7 @@ cbcSm4Single32: MOVUPS XWORD0, -32(BX) MOVUPS XWORD1, -16(BX) - MOVUPS XWORD4, (SI) + JMP cbcSm4Done cbcSm4Single48: @@ -261,8 +257,6 @@ cbcSm4Single48: MOVOU -32(DX), XWORD1 MOVOU -16(DX), XWORD2 - MOVOU XWORD0, XWORD4 - SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) PXOR 0(SI), XWORD0 @@ -272,11 +266,14 @@ cbcSm4Single48: MOVUPS XWORD0, -48(BX) MOVUPS XWORD1, -32(BX) MOVUPS XWORD2, -16(BX) - MOVUPS XWORD4, (SI) cbcSm4Done: + MOVUPS X15, (SI) RET +avxStart: + VMOVDQU -16(DX), X15 + avxCbcSm4Octets: CMPQ DI, $128 JLE avxCbcSm4Nibbles @@ -354,8 +351,6 @@ avxCbCSm4Single: VMOVDQU -32(DX), XWORD2 VMOVDQU -16(DX), XWORD3 - VMOVDQU XWORD0, XWORD4 - AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) VPXOR 0(SI), XWORD0, XWORD0 @@ -367,28 +362,24 @@ avxCbCSm4Single: VMOVDQU XWORD1, -48(BX) VMOVDQU XWORD2, -32(BX) VMOVDQU XWORD3, -16(BX) - VMOVDQU XWORD4, (SI) + JMP avxCbcSm4Done avxCbcSm4Single16: VMOVDQU -16(DX), XWORD0 - VMOVDQU XWORD0, XWORD4 - AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) VPXOR 0(SI), XWORD0, XWORD0 VMOVDQU XWORD0, -16(BX) - VMOVDQU XWORD4, (SI) + JMP avxCbcSm4Done avxCbcSm4Single32: VMOVDQU -32(DX), XWORD0 VMOVDQU -16(DX), XWORD1 - VMOVDQU XWORD0, XWORD4 - AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) VPXOR 0(SI), XWORD0, XWORD0 @@ -396,7 +387,7 @@ avxCbcSm4Single32: VMOVDQU XWORD0, -32(BX) VMOVDQU XWORD1, -16(BX) - VMOVDQU XWORD4, (SI) + JMP avxCbcSm4Done avxCbcSm4Single48: @@ -404,8 +395,6 @@ avxCbcSm4Single48: VMOVDQU -32(DX), XWORD1 VMOVDQU -16(DX), XWORD2 - VMOVDQU XWORD0, XWORD4 - AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) VPXOR 0(SI), XWORD0, XWORD0 @@ -415,9 +404,9 @@ avxCbcSm4Single48: VMOVDQU XWORD0, -48(BX) VMOVDQU XWORD1, -32(BX) VMOVDQU XWORD2, -16(BX) - VMOVDQU XWORD4, (SI) avxCbcSm4Done: + VMOVDQU X15, (SI) RET avx2Start: @@ -425,6 +414,8 @@ avx2Start: VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK + VMOVDQU -16(DX), X15 + avx2_16blocks: CMPQ DI, $256 JLE avx2CbcSm4Octets @@ -572,8 +563,6 @@ avx2CbCSm4Single: VMOVDQU -32(DX), XWORD2 VMOVDQU -16(DX), XWORD3 - VMOVDQU XWORD0, XWORD4 - AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) VPXOR 0(SI), XWORD0, XWORD0 @@ -585,20 +574,17 @@ avx2CbCSm4Single: VMOVDQU XWORD1, -48(BX) VMOVDQU XWORD2, -32(BX) VMOVDQU XWORD3, -16(BX) - VMOVDQU XWORD4, (SI) + JMP avx2CbcSm4Done avx2CbcSm4Single16: VMOVDQU -16(DX), XWORD0 - VMOVDQU XWORD0, XWORD4 - AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) VPXOR 0(SI), XWORD0, XWORD0 VMOVDQU XWORD0, -16(BX) - VMOVDQU XWORD4, (SI) JMP avx2CbcSm4Done @@ -606,8 +592,6 @@ avx2CbcSm4Single32: VMOVDQU -32(DX), XWORD0 VMOVDQU -16(DX), XWORD1 - VMOVDQU XWORD0, XWORD4 - AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) VPXOR 0(SI), XWORD0, XWORD0 @@ -615,7 +599,6 @@ avx2CbcSm4Single32: VMOVDQU XWORD0, -32(BX) VMOVDQU XWORD1, -16(BX) - VMOVDQU XWORD4, (SI) JMP avx2CbcSm4Done @@ -624,8 +607,6 @@ avx2CbcSm4Single48: VMOVDQU -32(DX), XWORD1 VMOVDQU -16(DX), XWORD2 - VMOVDQU XWORD0, XWORD4 - AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) VPXOR 0(SI), XWORD0, XWORD0 @@ -635,8 +616,8 @@ avx2CbcSm4Single48: VMOVDQU XWORD0, -48(BX) VMOVDQU XWORD1, -32(BX) VMOVDQU XWORD2, -16(BX) - VMOVDQU XWORD4, (SI) avx2CbcSm4Done: + VMOVDQU X15, (SI) VZEROUPPER RET diff --git a/sm4/cbc_arm64.s b/sm4/cbc_arm64.s index 1449e5d..7b42c6d 100644 --- a/sm4/cbc_arm64.s +++ b/sm4/cbc_arm64.s @@ -114,6 +114,10 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0 MOVD rk, rkSave VLD1 (R6), [IV.B16] + ADD srcPtr, srcPtrLen, R10 + SUB $16, R10, R10 + VLD1 (R10), [V15.S4] + cbcSm4Octets: CMP $128, srcPtrLen BLE cbcSm4Nibbles @@ -233,7 +237,6 @@ cbcSm4Single: // 4 blocks VLD1 (srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4] - VMOV t0.B16, t4.B16 VREV32 t0.B16, t0.B16 VREV32 t1.B16, t1.B16 VREV32 t2.B16, t2.B16 @@ -263,13 +266,11 @@ cbc4BlocksLoop64: VEOR V8.B16, t3.B16, t3.B16 VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (dstPtr) - VST1 [t4.S4], (R6) B cbcSm4Done cbcSm4Single16: VLD1 (srcPtr), [t0.S4] - VMOV t0.B16, t4.B16 VREV32 t0.B16, t0.B16 VMOV t0.S[1], t1.S[0] VMOV t0.S[2], t2.S[0] @@ -293,13 +294,11 @@ cbc4BlocksLoop16: VEOR IV.B16, t3.B16, t3.B16 VST1 [t3.S4], (dstPtr) - VST1 [t4.S4], (R6) B cbcSm4Done cbcSm4Single32: VLD1 (srcPtr), [t0.S4, t1.S4] - VMOV t0.B16, t4.B16 VREV32 t0.B16, t0.B16 VREV32 t1.B16, t1.B16 PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) @@ -323,12 +322,10 @@ cbc4BlocksLoop32: VEOR V6.B16, t1.B16, t1.B16 VST1 [t0.S4, t1.S4], (dstPtr) - VST1 [t4.S4], (R6) B cbcSm4Done cbcSm4Single48: VLD1 (srcPtr), [t0.S4, t1.S4, t2.S4] - VMOV t0.B16, t4.B16 VREV32 t0.B16, t0.B16 VREV32 t1.B16, t1.B16 VREV32 t2.B16, t2.B16 @@ -355,7 +352,7 @@ cbc4BlocksLoop48: VEOR V7.B16, t2.B16, t2.B16 VST1 [t0.S4, t1.S4, t2.S4], (dstPtr) - VST1 [t4.S4], (R6) cbcSm4Done: + VST1 [V15.S4], (R6) RET