diff --git a/sm4/cbc_amd64.s b/sm4/cbc_amd64.s
index 8d35e57..5895413 100644
--- a/sm4/cbc_amd64.s
+++ b/sm4/cbc_amd64.s
@@ -211,6 +211,8 @@ cbCSm4Single:
 	MOVOU -32(DX), XWORD2
 	MOVOU -16(DX), XWORD3
 
+	MOVOU XWORD3, XWORD4 // stash last ciphertext block (becomes next IV)
+
 	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
 
 	PXOR 0(SI), XWORD0
@@ -222,22 +224,28 @@ cbCSm4Single:
 	MOVUPS XWORD1, -48(BX)
 	MOVUPS XWORD2, -32(BX)
 	MOVUPS XWORD3, -16(BX)
+	MOVUPS XWORD4, (SI) // write next IV back to iv buffer
 	JMP cbcSm4Done
 
 cbcSm4Single16:
 	MOVOU -16(DX), XWORD0
 
+	MOVOU XWORD0, XWORD4 // stash last ciphertext block (becomes next IV)
+
 	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
 
 	PXOR 0(SI), XWORD0
 
 	MOVUPS XWORD0, -16(BX)
+	MOVUPS XWORD4, (SI) // write next IV back to iv buffer
 	JMP cbcSm4Done
 
 cbcSm4Single32:
 	MOVOU -32(DX), XWORD0
 	MOVOU -16(DX), XWORD1
 
+	MOVOU XWORD1, XWORD4 // stash last ciphertext block (becomes next IV)
+
 	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
 
 	PXOR 0(SI), XWORD0
@@ -245,6 +253,7 @@ cbcSm4Single32:
 
 	MOVUPS XWORD0, -32(BX)
 	MOVUPS XWORD1, -16(BX)
+	MOVUPS XWORD4, (SI) // write next IV back to iv buffer
 	JMP cbcSm4Done
 
 cbcSm4Single48:
@@ -252,6 +261,8 @@ cbcSm4Single48:
 	MOVOU -32(DX), XWORD1
 	MOVOU -16(DX), XWORD2
 
+	MOVOU XWORD2, XWORD4 // stash last ciphertext block (becomes next IV)
+
 	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
 
 	PXOR 0(SI), XWORD0
@@ -261,6 +272,7 @@ cbcSm4Single48:
 	MOVUPS XWORD0, -48(BX)
 	MOVUPS XWORD1, -32(BX)
 	MOVUPS XWORD2, -16(BX)
+	MOVUPS XWORD4, (SI) // write next IV back to iv buffer
 
 cbcSm4Done:
 	RET
@@ -342,6 +354,8 @@ avxCbCSm4Single:
 	VMOVDQU -32(DX), XWORD2
 	VMOVDQU -16(DX), XWORD3
 
+	VMOVDQU XWORD3, XWORD4 // stash last ciphertext block (becomes next IV)
+
 	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
 
 	VPXOR 0(SI), XWORD0, XWORD0
@@ -353,22 +367,28 @@ avxCbCSm4Single:
 	VMOVDQU XWORD1, -48(BX)
 	VMOVDQU XWORD2, -32(BX)
 	VMOVDQU XWORD3, -16(BX)
+	VMOVDQU XWORD4, (SI) // write next IV back to iv buffer
 	JMP avxCbcSm4Done
 
 avxCbcSm4Single16:
 	VMOVDQU -16(DX), XWORD0
 
+	VMOVDQU XWORD0, XWORD4 // stash last ciphertext block (becomes next IV)
+
 	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
 
 	VPXOR 0(SI), XWORD0, XWORD0
 
 	VMOVDQU XWORD0, -16(BX)
+	VMOVDQU XWORD4, (SI) // write next IV back to iv buffer
 	JMP avxCbcSm4Done
 
 avxCbcSm4Single32:
 	VMOVDQU -32(DX), XWORD0
 	VMOVDQU -16(DX), XWORD1
 
+	VMOVDQU XWORD1, XWORD4 // stash last ciphertext block (becomes next IV)
+
 	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
 
 	VPXOR 0(SI), XWORD0, XWORD0
@@ -376,6 +396,7 @@ avxCbcSm4Single32:
 
 	VMOVDQU XWORD0, -32(BX)
 	VMOVDQU XWORD1, -16(BX)
+	VMOVDQU XWORD4, (SI) // write next IV back to iv buffer
 	JMP avxCbcSm4Done
 
 avxCbcSm4Single48:
@@ -383,6 +404,8 @@ avxCbcSm4Single48:
 	VMOVDQU -32(DX), XWORD1
 	VMOVDQU -16(DX), XWORD2
 
+	VMOVDQU XWORD2, XWORD4 // stash last ciphertext block (becomes next IV)
+
 	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
 
 	VPXOR 0(SI), XWORD0, XWORD0
@@ -392,6 +415,7 @@ avxCbcSm4Single48:
 	VMOVDQU XWORD0, -48(BX)
 	VMOVDQU XWORD1, -32(BX)
 	VMOVDQU XWORD2, -16(BX)
+	VMOVDQU XWORD4, (SI) // write next IV back to iv buffer
 
 avxCbcSm4Done:
 	RET
@@ -548,6 +572,8 @@ avx2CbCSm4Single:
 	VMOVDQU -32(DX), XWORD2
 	VMOVDQU -16(DX), XWORD3
 
+	VMOVDQU XWORD3, XWORD4 // stash last ciphertext block (becomes next IV)
+
 	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
 
 	VPXOR 0(SI), XWORD0, XWORD0
@@ -559,22 +585,29 @@ avx2CbCSm4Single:
 	VMOVDQU XWORD1, -48(BX)
 	VMOVDQU XWORD2, -32(BX)
 	VMOVDQU XWORD3, -16(BX)
+	VMOVDQU XWORD4, (SI) // write next IV back to iv buffer
 	JMP avx2CbcSm4Done
 
 avx2CbcSm4Single16:
 	VMOVDQU -16(DX), XWORD0
 
+	VMOVDQU XWORD0, XWORD4 // stash last ciphertext block (becomes next IV)
+
 	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
 
 	VPXOR 0(SI), XWORD0, XWORD0
 
 	VMOVDQU XWORD0, -16(BX)
+	VMOVDQU XWORD4, (SI) // write next IV back to iv buffer
+
 	JMP avx2CbcSm4Done
 
 avx2CbcSm4Single32:
 	VMOVDQU -32(DX), XWORD0
 	VMOVDQU -16(DX), XWORD1
 
+	VMOVDQU XWORD1, XWORD4 // stash last ciphertext block (becomes next IV)
+
 	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
 
 	VPXOR 0(SI), XWORD0, XWORD0
@@ -582,6 +615,8 @@ avx2CbcSm4Single32:
 
 	VMOVDQU XWORD0, -32(BX)
 	VMOVDQU XWORD1, -16(BX)
+	VMOVDQU XWORD4, (SI) // write next IV back to iv buffer
+
 	JMP avx2CbcSm4Done
 
 avx2CbcSm4Single48:
@@ -589,6 +624,8 @@ avx2CbcSm4Single48:
 	VMOVDQU -32(DX), XWORD1
 	VMOVDQU -16(DX), XWORD2
 
+	VMOVDQU XWORD2, XWORD4 // stash last ciphertext block (becomes next IV)
+
 	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
 
 	VPXOR 0(SI), XWORD0, XWORD0
@@ -598,6 +635,7 @@ avx2CbcSm4Single48:
 	VMOVDQU XWORD0, -48(BX)
 	VMOVDQU XWORD1, -32(BX)
 	VMOVDQU XWORD2, -16(BX)
+	VMOVDQU XWORD4, (SI) // write next IV back to iv buffer
 
 avx2CbcSm4Done:
 	VZEROUPPER
diff --git a/sm4/cbc_arm64.s b/sm4/cbc_arm64.s
index a9c978f..1449e5d 100644
--- a/sm4/cbc_arm64.s
+++ b/sm4/cbc_arm64.s
@@ -233,6 +233,7 @@ cbcSm4Single:
 
 	// 4 blocks
 	VLD1 (srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4]
+	VMOV t3.B16, t4.B16 // stash last ciphertext block (becomes next IV)
 	VREV32 t0.B16, t0.B16
 	VREV32 t1.B16, t1.B16
 	VREV32 t2.B16, t2.B16
@@ -262,11 +263,13 @@ cbc4BlocksLoop64:
 	VEOR V8.B16, t3.B16, t3.B16
 
 	VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (dstPtr)
+	VST1 [t4.S4], (R6) // write next IV back to iv buffer
 	B cbcSm4Done
 
 cbcSm4Single16:
 	VLD1 (srcPtr), [t0.S4]
+	VMOV t0.B16, t4.B16 // stash last ciphertext block (becomes next IV)
 	VREV32 t0.B16, t0.B16
 
 	VMOV t0.S[1], t1.S[0]
 	VMOV t0.S[2], t2.S[0]
@@ -290,11 +293,13 @@ cbc4BlocksLoop16:
 	VEOR IV.B16, t3.B16, t3.B16
 
 	VST1 [t3.S4], (dstPtr)
+	VST1 [t4.S4], (R6) // write next IV back to iv buffer
 	B cbcSm4Done
 
 cbcSm4Single32:
 	VLD1 (srcPtr), [t0.S4, t1.S4]
+	VMOV t1.B16, t4.B16 // stash last ciphertext block (becomes next IV)
 	VREV32 t0.B16, t0.B16
 	VREV32 t1.B16, t1.B16
 
 	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
@@ -318,10 +323,12 @@ cbc4BlocksLoop32:
 	VEOR V6.B16, t1.B16, t1.B16
 
 	VST1 [t0.S4, t1.S4], (dstPtr)
+	VST1 [t4.S4], (R6) // write next IV back to iv buffer
 	B cbcSm4Done
 
 cbcSm4Single48:
 	VLD1 (srcPtr), [t0.S4, t1.S4, t2.S4]
+	VMOV t2.B16, t4.B16 // stash last ciphertext block (becomes next IV)
 	VREV32 t0.B16, t0.B16
 	VREV32 t1.B16, t1.B16
 	VREV32 t2.B16, t2.B16
@@ -348,6 +355,7 @@ cbc4BlocksLoop48:
 	VEOR V7.B16, t2.B16, t2.B16
 
 	VST1 [t0.S4, t1.S4, t2.S4], (dstPtr)
+	VST1 [t4.S4], (R6) // write next IV back to iv buffer
 
 cbcSm4Done:
 	RET
diff --git a/sm4/cbc_cipher_asm.go b/sm4/cbc_cipher_asm.go
index 423fa05..582b051 100644
--- a/sm4/cbc_cipher_asm.go
+++ b/sm4/cbc_cipher_asm.go
@@ -19,7 +19,6 @@ const cbcDecrypt = 0
 
 type cbc struct {
 	b   *sm4CipherAsm
 	iv  []byte
-	tmp []byte
 	enc int
 }
@@ -28,7 +27,6 @@ func (b *sm4CipherAsm) NewCBCEncrypter(iv []byte) cipher.BlockMode {
 	c.b = b
 	c.enc = cbcEncrypt
 	c.iv = make([]byte, BlockSize)
-	c.tmp = make([]byte, BlockSize)
 	copy(c.iv, iv)
 	return &c
 }
@@ -38,7 +36,6 @@ func (b *sm4CipherAsm) NewCBCDecrypter(iv []byte) cipher.BlockMode {
 	c.b = b
 	c.enc = cbcDecrypt
 	c.iv = make([]byte, BlockSize)
-	c.tmp = make([]byte, BlockSize)
 	copy(c.iv, iv)
 	return &c
 }
@@ -68,16 +65,11 @@ func (x *cbc) CryptBlocks(dst, src []byte) {
 		encryptBlocksChain(&x.b.enc[0], dst, src, &x.iv[0])
 		return
 	}
-	// For each block, we need to xor the decrypted data with the previous block's ciphertext (the iv).
-	// To avoid making a copy each time, we loop over the blocks BACKWARDS.
-	end := len(src)
-	// Copy the last block of ciphertext in preparation as the new iv.
-	copy(x.tmp, src[end-BlockSize:end])
+	// The assembly saves the last ciphertext block into x.iv itself,
+	// before dst overwrites it when decrypting in place, so the Go-side
+	// tmp buffer and pointer swap are no longer needed.
 	decryptBlocksChain(&x.b.dec[0], dst, src, &x.iv[0])
-
-	// Set the new iv to the first block we copied earlier.
-	x.iv, x.tmp = x.tmp, x.iv
 }