diff --git a/sm4/xts_arm64.s b/sm4/xts_arm64.s
index e92dbc3..277d01b 100644
--- a/sm4/xts_arm64.s
+++ b/sm4/xts_arm64.s
@@ -526,6 +526,197 @@ xtsSm4EncDone:
 
 // func decryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
 TEXT ·decryptSm4Xts(SB),0,$128-64
+	LOAD_SM4_AESNI_CONSTS()
+	MOVD	xk+0(FP), rk
+	MOVD	tweak+8(FP), twPtr
+	MOVD	dst+16(FP), dstPtr
+	MOVD	src+40(FP), srcPtr
+	MOVD	src_len+48(FP), srcPtrLen
+
+	VEOR	POLY.B16, POLY.B16, POLY.B16
+	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
+
+	MOVD	$0x87, I
+	VMOV	I, POLY.D[0]
+
+	MOVD	rk, rkSave
+	VLD1	(twPtr), [TW.B16]
+
+xtsSm4DecOctets:
+	CMP	$128, srcPtrLen
+	BLT	xtsSm4DecNibbles
+	SUB	$128, srcPtrLen
+
+	prepare8Tweaks
+	load8blocks
+	MOVD	rkSave, rk
+	EOR	R13, R13
+
+decOctetsDec8Blocks:
+	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
+	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
+	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
+	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)
+
+	ADD	$1, R13
+	CMP	$8, R13
+	BNE	decOctetsDec8Blocks
+
+	store8blocks
+	B	xtsSm4DecOctets
+
+xtsSm4DecNibbles:
+	CMP	$64, srcPtrLen
+	BLT	xtsSm4DecSingles
+	SUB	$64, srcPtrLen
+
+	prepare4Tweaks
+	load4blocks
+	MOVD	rkSave, rk
+	EOR	R13, R13
+
+decNibblesDec4Blocks:
+	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
+	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
+	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
+	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
+
+	ADD	$1, R13
+	CMP	$8, R13
+	BNE	decNibblesDec4Blocks
+
+	store4blocks
+
+xtsSm4DecSingles:
+	CMP	$32, srcPtrLen
+	BLT	xtsSm4DecTail
+	SUB	$16, srcPtrLen
+
+	loadOneBlock
+
+	MOVD	rkSave, rk
+	EOR	R13, R13
+
+decSinglesDec4Blocks:
+	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
+	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
+	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
+	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
+	ADD	$1, R13
+	CMP	$8, R13
+	BNE	decSinglesDec4Blocks
+
+	storeOneBlock
+	mul2Inline
+
+	B	xtsSm4DecSingles
+
+xtsSm4DecTail:
+	CBZ	srcPtrLen, xtsSm4DecDone
+
+	CMP	$16, srcPtrLen
+	BEQ	xtsSm4DecLastBlock
+
+	VMOV	TW.B16, B4.B16
+	mul2Inline
+	loadOneBlock
+	MOVD	rkSave, rk
+	EOR	R13, R13
+
+decLastCompleteBlockLoop:
+	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
+	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
+	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
+	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
+	ADD	$1, R13
+	CMP	$8, R13
+	BNE	decLastCompleteBlockLoop
+	storeOneBlock
+	VMOV	B4.B16, TW.B16
+	VST1	[B3.B16], (RSP)
+
+	SUB	$16, srcPtrLen
+	SUB	$16, dstPtr, R7
+	MOVD	R7, R9
+	MOVD	RSP, R8
+
+	TBZ	$3, srcPtrLen, less_than8
+	MOVD.P	8(srcPtr), R11
+	MOVD.P	R11, 8(R8)
+	MOVD.P	8(R7), R12
+	MOVD.P	R12, 8(dstPtr)
+
+less_than8:
+	TBZ	$2, srcPtrLen, less_than4
+	MOVWU.P	4(srcPtr), R11
+	MOVWU.P	R11, 4(R8)
+	MOVWU.P	4(R7), R12
+	MOVWU.P	R12, 4(dstPtr)
+
+less_than4:
+	TBZ	$1, srcPtrLen, less_than2
+	MOVHU.P	2(srcPtr), R11
+	MOVHU.P	R11, 2(R8)
+	MOVHU.P	2(R7), R12
+	MOVHU.P	R12, 2(dstPtr)
+
+less_than2:
+	TBZ	$0, srcPtrLen, xtsSm4DecTailDec
+	MOVBU	(srcPtr), R11
+	MOVBU	R11, (R8)
+	MOVBU	(R7), R12
+	MOVBU	R12, (dstPtr)
+
+xtsSm4DecTailDec:
+	VLD1	(RSP), [B0.B16]
+	VEOR	TW.B16, B0.B16, B0.B16
+	VREV32	B0.B16, B0.B16
+	VMOV	B0.S[1], B1.S[0]
+	VMOV	B0.S[2], B2.S[0]
+	VMOV	B0.S[3], B3.S[0]
+
+	MOVD	rkSave, rk
+	EOR	R13, R13
+
+tailDecLoop:
+	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
+	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
+	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
+	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
+	ADD	$1, R13
+	CMP	$8, R13
+	BNE	tailDecLoop
+
+	VMOV	B2.S[0], B3.S[1]
+	VMOV	B1.S[0], B3.S[2]
+	VMOV	B0.S[0], B3.S[3]
+	VREV32	B3.B16, B3.B16
+
+	VEOR	TW.B16, B3.B16, B3.B16
+	VST1	[B3.B16], (R9)
+
+	B	xtsSm4DecDone
+
+xtsSm4DecLastBlock:
+	loadOneBlock
+
+	MOVD	rkSave, rk
+	EOR	R13, R13
+
+decLastBlockLoop:
+	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
+	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
+	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
+	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
+	ADD	$1, R13
+	CMP	$8, R13
+	BNE	decLastBlockLoop
+
+	storeOneBlock
+	mul2Inline
+
+xtsSm4DecDone:
+	VST1	[TW.B16], (twPtr)
 	RET
 
 // func decryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
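
Reviewer note: the $0x87 loaded into POLY.D[0] is the IEEE P1619 reduction
constant, and the mul2Inline macro advances TW by one block position, i.e.
multiplies the tweak by x in GF(2^128). A minimal scalar Go sketch of that
update, for reference only (mul2 is an illustrative name, not part of this
change):

// mul2 multiplies a 16-byte XTS tweak by x in GF(2^128), using the
// little-endian bit order of IEEE P1619; 0x87 is the same reduction
// constant the assembly keeps in POLY.D[0].
func mul2(tweak *[16]byte) {
	var carry byte
	for i := 0; i < 16; i++ {
		next := tweak[i] >> 7 // bit that shifts into the next byte
		tweak[i] = tweak[i]<<1 | carry
		carry = next
	}
	if carry != 0 {
		tweak[0] ^= 0x87 // fold x^128 back in: x^7 + x^2 + x + 1
	}
}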
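
Reviewer note: the xtsSm4DecTail sequence (save TW into B4, mul2Inline,
decrypt one block, restore TW from B4) is the tweak swap that XTS ciphertext
stealing requires on the decrypt side: the last complete ciphertext block is
processed under the tweak that comes after the partial block's tweak. A rough
Go sketch of that ordering, assuming a hypothetical single-block helper
decBlock (XOR tweak, SM4 decrypt, XOR tweak) and the mul2 sketch above;
neither name is part of this diff:

// decryptTail shows the ciphertext-stealing order used when
// 16 < len(src) < 32 bytes remain: the complete block C(m-1) is
// decrypted under tw*x, the stolen partial block under tw itself.
func decryptTail(dst, src []byte, tw *[16]byte,
	decBlock func(dst, src []byte, tw *[16]byte)) {

	tail := len(src) - 16 // 1..15 trailing bytes

	tw2 := *tw
	mul2(&tw2) // tweak for the final complete block

	var pp [16]byte
	decBlock(pp[:], src[:16], &tw2) // PP = D(C(m-1), tw*x)

	copy(dst[16:], pp[:tail]) // plaintext tail P(m)

	var cc [16]byte
	copy(cc[:], src[16:])      // stolen ciphertext bytes C(m)
	copy(cc[tail:], pp[tail:]) // padded with the remainder of PP

	decBlock(dst[:16], cc[:], tw) // P(m-1) = D(CC, tw)
}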
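
Reviewer note: the TBZ ladder at less_than8/less_than4/less_than2 moves the
1..15 tail bytes in 8-, 4-, 2- and 1-byte chunks selected by the bits of
srcPtrLen, copying the stolen ciphertext into the stack buffer and the
already decrypted block out to dst in the same pass. The chunking scheme by
itself, as a Go sketch (copyTail is an illustrative name):

// copyTail copies n (0..15) bytes by testing bits 3..0 of n and moving
// 8-, 4-, 2- and 1-byte chunks, mirroring the TBZ ladder above.
func copyTail(dst, src []byte, n int) {
	off := 0
	if n&8 != 0 {
		copy(dst[off:off+8], src[off:off+8])
		off += 8
	}
	if n&4 != 0 {
		copy(dst[off:off+4], src[off:off+4])
		off += 4
	}
	if n&2 != 0 {
		copy(dst[off:off+2], src[off:off+2])
		off += 2
	}
	if n&1 != 0 {
		dst[off] = src[off]
	}
}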