From e6d27e8baecc3f8eedee3ae9015ef1330efd9f2e Mon Sep 17 00:00:00 2001
From: Sun Yimin
Date: Thu, 24 Aug 2023 16:42:33 +0800
Subject: [PATCH] sm4: xts asm arm64, fix dead loop bug

---
 sm4/xts_arm64.s | 194 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 193 insertions(+), 1 deletion(-)

diff --git a/sm4/xts_arm64.s b/sm4/xts_arm64.s
index 277d01b..bbb9de7 100644
--- a/sm4/xts_arm64.s
+++ b/sm4/xts_arm64.s
@@ -589,7 +589,7 @@ decNibblesDec4Blocks:
 
 xtsSm4DecSingles:
 	CMP $32, srcPtrLen
-	BLT xtsSm4DecSingles
+	BLT xtsSm4DecTail
 	SUB $16, srcPtrLen
 
 	loadOneBlock
@@ -721,4 +721,196 @@ xtsSm4DecDone:
 
 // func decryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
 TEXT ·decryptSm4XtsGB(SB),0,$128-64
+	LOAD_SM4_AESNI_CONSTS()
+	MOVD xk+0(FP), rk
+	MOVD tweak+8(FP), twPtr
+	MOVD dst+16(FP), dstPtr
+	MOVD src+40(FP), srcPtr
+	MOVD src_len+48(FP), srcPtrLen
+
+	VEOR POLY.B16, POLY.B16, POLY.B16
+	VEOR ZERO.B16, ZERO.B16, ZERO.B16
+
+	MOVD $0xE1, I
+	LSL $56, I
+	VMOV I, POLY.D[1]
+
+	MOVD rk, rkSave
+	VLD1 (twPtr), [TW.B16]
+
+xtsSm4DecOctets:
+	CMP $128, srcPtrLen
+	BLT xtsSm4DecNibbles
+	SUB $128, srcPtrLen
+
+	prepareGB8Tweaks
+	load8blocks
+	MOVD rkSave, rk
+	EOR R13, R13
+
+decOctetsDec8Blocks:
+	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
+	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
+	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
+	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)
+
+	ADD $1, R13
+	CMP $8, R13
+	BNE decOctetsDec8Blocks
+
+	store8blocks
+	B xtsSm4DecOctets
+
+xtsSm4DecNibbles:
+	CMP $64, srcPtrLen
+	BLT xtsSm4DecSingles
+	SUB $64, srcPtrLen
+
+	prepareGB4Tweaks
+	load4blocks
+	MOVD rkSave, rk
+	EOR R13, R13
+
+decNibblesDec4Blocks:
+	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
+	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
+	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
+	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
+
+	ADD $1, R13
+	CMP $8, R13
+	BNE decNibblesDec4Blocks
+
+	store4blocks
+
+xtsSm4DecSingles:
+	CMP $32, srcPtrLen
+	BLT xtsSm4DecTail
+	SUB $16, srcPtrLen
+
+	loadOneBlock
+
+	MOVD rkSave, rk
+	EOR R13, R13
+
+decSinglesDec4Blocks:
+	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
+	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
+	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
+	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
+	ADD $1, R13
+	CMP $8, R13
+	BNE decSinglesDec4Blocks
+
+	storeOneBlock
+	mul2GBInline
+
+	B xtsSm4DecSingles
+
+xtsSm4DecTail:
+	CBZ srcPtrLen, xtsSm4DecDone
+
+	CMP $16, srcPtrLen
+	BEQ xtsSm4DecLastBlock
+
+	VMOV TW.B16, B4.B16
+	mul2GBInline
+	loadOneBlock
+	MOVD rkSave, rk
+	EOR R13, R13
+
+decLastCompleteBlockLoop:
+	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
+	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
+	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
+	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
+	ADD $1, R13
+	CMP $8, R13
+	BNE decLastCompleteBlockLoop
+	storeOneBlock
+	VMOV B4.B16, TW.B16
+	VST1 [B3.B16], (RSP)
+
+	SUB $16, srcPtrLen
+	SUB $16, dstPtr, R7
+	MOVD R7, R9
+	MOVD RSP, R8
+
+	TBZ $3, srcPtrLen, less_than8
+	MOVD.P 8(srcPtr), R11
+	MOVD.P R11, 8(R8)
+	MOVD.P 8(R7), R12
+	MOVD.P R12, 8(dstPtr)
+
+less_than8:
+	TBZ $2, srcPtrLen, less_than4
+	MOVWU.P 4(srcPtr), R11
+	MOVWU.P R11, 4(R8)
+	MOVWU.P 4(R7), R12
+	MOVWU.P R12, 4(dstPtr)
+
+less_than4:
+	TBZ $1, srcPtrLen, less_than2
+	MOVHU.P 2(srcPtr), R11
+	MOVHU.P R11, 2(R8)
+	MOVHU.P 2(R7), R12
+	MOVHU.P R12, 2(dstPtr)
+
+less_than2:
+	TBZ $0, srcPtrLen, xtsSm4DecTailDec
+	MOVBU (srcPtr), R11
+	MOVBU R11, (R8)
+	MOVBU (R7), R12
+	MOVBU R12, (dstPtr)
+
+xtsSm4DecTailDec:
+	VLD1 (RSP), [B0.B16]
+	VEOR TW.B16, B0.B16, B0.B16
+	VREV32 B0.B16, B0.B16
+	VMOV B0.S[1], B1.S[0]
+	VMOV B0.S[2], B2.S[0]
+	VMOV B0.S[3], B3.S[0]
+
+	MOVD rkSave, rk
+	EOR R13, R13
+
+tailDecLoop:
+	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
+	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
+	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
+	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
+	ADD $1, R13
+	CMP $8, R13
+	BNE tailDecLoop
+
+	VMOV B2.S[0], B3.S[1]
+	VMOV B1.S[0], B3.S[2]
+	VMOV B0.S[0], B3.S[3]
+	VREV32 B3.B16, B3.B16
+
+	VEOR TW.B16, B3.B16, B3.B16
+	VST1 [B3.B16], (R9)
+
+	B xtsSm4DecDone
+
+xtsSm4DecLastBlock:
+	loadOneBlock
+
+	MOVD rkSave, rk
+	EOR R13, R13
+
+decLastBlockLoop:
+	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
+	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
+	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
+	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
+	ADD $1, R13
+	CMP $8, R13
+	BNE decLastBlockLoop
+
+	storeOneBlock
+	mul2GBInline
+
+xtsSm4DecDone:
+	VST1 [TW.B16], (twPtr)
 	RET
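
Note on the change: in the existing decryption code, the single-block loop checked CMP $32, srcPtrLen and then branched back to its own label (BLT xtsSm4DecSingles), so once fewer than 32 bytes remained the branch looped on the compare forever; the first hunk redirects that branch to xtsSm4DecTail. The second hunk fills in the GB/T 17964 variant decryptSm4XtsGB, whose xtsSm4DecTail path performs XTS ciphertext stealing: the last full block is decrypted with the advanced tweak, the partial block steals the leftover bytes of that result, and the reassembled block is decrypted with the saved tweak. The Go sketch below is for orientation only, not library code: decryptXTSTail, doubleTweak and xorBlock are hypothetical names, a generic cipher.Block stands in for the SM4 round macros, and the GB tweak doubling (the mul2GBInline macro in the assembly) is taken as a caller-supplied function rather than reproduced here.

package sketch

import "crypto/cipher"

// decryptXTSTail decrypts the final 16+r ciphertext bytes (0 < r < 16) of an
// XTS stream, mirroring the order of operations in xtsSm4DecTail.
func decryptXTSTail(b cipher.Block, tweak *[16]byte, dst, src []byte, doubleTweak func(*[16]byte)) {
	r := len(src) - 16 // length of the partial final block

	prevTweak := *tweak // T_{m-1}, parked in register B4 by the assembly
	doubleTweak(tweak)  // advance to T_m (mul2GBInline in the assembly)

	// PP = D_K(C_{m-1} XOR T_m) XOR T_m
	var pp [16]byte
	xorBlock(pp[:], src[:16], tweak[:])
	b.Decrypt(pp[:], pp[:])
	xorBlock(pp[:], pp[:], tweak[:])

	// P_m is the first r bytes of PP; CC = C_m || last 16-r bytes of PP.
	// The assembly builds CC in its stack frame via the TBZ copy ladder.
	var cc [16]byte
	copy(cc[:], pp[:])
	copy(cc[:r], src[16:])
	copy(dst[16:], pp[:r])

	// P_{m-1} = D_K(CC XOR T_{m-1}) XOR T_{m-1}
	xorBlock(cc[:], cc[:], prevTweak[:])
	b.Decrypt(cc[:], cc[:])
	xorBlock(dst[:16], cc[:], prevTweak[:])
}

func xorBlock(dst, a, b []byte) {
	for i := range dst {
		dst[i] = a[i] ^ b[i]
	}
}

In the assembly the same sequence keeps the previous tweak in B4 (VMOV TW.B16, B4.B16 / VMOV B4.B16, TW.B16) and stages the reassembled block in the 128-byte stack frame (VST1 [B3.B16], (RSP) followed by the TBZ/MOVD.P copy ladder) before tailDecLoop produces the second-to-last plaintext block.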