diff --git a/sm4/xts_arm64.s b/sm4/xts_arm64.s
index 651ebb6..e92dbc3 100644
--- a/sm4/xts_arm64.s
+++ b/sm4/xts_arm64.s
@@ -372,6 +372,156 @@ xtsSm4EncDone:
 
 // func encryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
 TEXT ·encryptSm4XtsGB(SB),0,$128-64
+	LOAD_SM4_AESNI_CONSTS()                 // macro (defined earlier in this file): load SM4 S-box/affine constants into NEON registers
+	MOVD	xk+0(FP), rk                    // rk = pointer to expanded round keys
+	MOVD	tweak+8(FP), twPtr              // twPtr = pointer to the 16-byte tweak block
+	MOVD	dst+16(FP), dstPtr
+	MOVD	src+40(FP), srcPtr
+	MOVD	src_len+48(FP), srcPtrLen       // srcPtrLen = bytes remaining to encrypt
+
+	VEOR	POLY.B16, POLY.B16, POLY.B16    // POLY = 0
+	VEOR	ZERO.B16, ZERO.B16, ZERO.B16    // ZERO = 0 (all-zero helper constant)
+
+	MOVD	$0xE1, I
+	LSL	$56, I                          // I = 0xE1 << 56
+	VMOV	I, POLY.D[1]                    // POLY high 64 bits = 0xE1<<56 — NOTE(review): presumably the bit-reversed GF(2^128) reduction constant for the GB tweak multiply (vs. 0x87 in IEEE XTS); confirm against mul2GBInline
+
+	MOVD	rk, rkSave                      // save key pointer; rk is advanced by the round macros
+	VLD1	(twPtr), [TW.B16]               // TW = current tweak
+
+xtsSm4EncOctets:                                // main loop: 8 blocks (128 bytes) per iteration
+	CMP	$128, srcPtrLen
+	BLT	xtsSm4EncNibbles                // fewer than 8 full blocks left
+	SUB	$128, srcPtrLen
+
+	prepareGB8Tweaks                        // macro (not visible here): presumably derives the 8 per-block tweaks from TW and advances TW
+	load8blocks                             // macro: presumably loads 8 source blocks and applies tweak pre-whitening
+	MOVD	rkSave, rk                      // rewind to round key 0
+	EOR	R13, R13                        // R13 = round-loop counter = 0
+
+encOctetsEnc8Blocks:                            // 8 iterations x 4 rounds/iteration = 32 SM4 rounds
+	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
+	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
+	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
+	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)
+
+	ADD	$1, R13
+	CMP	$8, R13
+	BNE	encOctetsEnc8Blocks
+
+	store8blocks                            // macro: presumably applies tweak post-whitening and stores 8 ciphertext blocks
+	B	xtsSm4EncOctets
+
+xtsSm4EncNibbles:                               // 4 blocks (64 bytes) per iteration
+	CMP	$64, srcPtrLen
+	BLT	xtsSm4EncSingles
+	SUB	$64, srcPtrLen
+
+	prepareGB4Tweaks                        // macro: presumably derives 4 per-block tweaks from TW
+	load4blocks
+	MOVD	rkSave, rk                      // rewind to round key 0
+	EOR	R13, R13
+
+encNibblesEnc4Blocks:                           // 32 SM4 rounds over 4 blocks
+	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
+	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
+	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
+	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
+
+	ADD	$1, R13
+	CMP	$8, R13
+	BNE	encNibblesEnc4Blocks
+
+	store4blocks
+
+xtsSm4EncSingles:                               // one full block (16 bytes) per iteration
+	CMP	$16, srcPtrLen
+	BLT	xtsSm4EncTail
+	SUB	$16, srcPtrLen
+
+	loadOneBlock                            // macro: presumably loads one block and applies TW pre-whitening
+
+	MOVD	rkSave, rk                      // rewind to round key 0
+	EOR	R13, R13
+
+encSinglesEnc4Blocks:                           // 32 SM4 rounds over a single block (label name notwithstanding)
+	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
+	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
+	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
+	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
+	ADD	$1, R13
+	CMP	$8, R13
+	BNE	encSinglesEnc4Blocks
+
+	storeOneBlock                           // macro: presumably post-whitens with TW and stores the ciphertext block
+	mul2GBInline                            // macro: presumably TW = TW * x in GF(2^128), GB convention — name-based, confirm at definition
+	B	xtsSm4EncSingles
+
+xtsSm4EncTail:                                  // ciphertext stealing for a trailing partial block (srcPtrLen in 1..15)
+	CBZ	srcPtrLen, xtsSm4EncDone        // no tail bytes: done
+	SUB	$16, dstPtr, R7                 // R7 = address of the last full ciphertext block already written
+	MOVD	R7, R9                          // R9 = where the re-encrypted block is stored at the end
+	MOVD	RSP, R8                         // R8 = 16-byte scratch buffer in this frame
+	VLD1	(R7), [B0.B16]
+	VST1	[B0.B16], (R8)                  // scratch = last ciphertext block
+
+	TBZ	$3, srcPtrLen, less_than8       // copy tail in 8/4/2/1-byte pieces keyed on srcPtrLen bits:
+	MOVD.P	8(srcPtr), R11                  //   scratch prefix <- remaining plaintext bytes
+	MOVD.P	R11, 8(R8)
+	MOVD.P	8(R7), R12                      //   dst tail <- stolen bytes of the previous ciphertext block
+	MOVD.P	R12, 8(dstPtr)
+
+less_than8:
+	TBZ	$2, srcPtrLen, less_than4
+	MOVWU.P	4(srcPtr), R11
+	MOVWU.P	R11, 4(R8)
+	MOVWU.P	4(R7), R12
+	MOVWU.P	R12, 4(dstPtr)
+
+less_than4:
+	TBZ	$1, srcPtrLen, less_than2
+	MOVHU.P	2(srcPtr), R11
+	MOVHU.P	R11, 2(R8)
+	MOVHU.P	2(R7), R12
+	MOVHU.P	R12, 2(dstPtr)
+
+less_than2:
+	TBZ	$0, srcPtrLen, xtsSm4EncTailEnc
+	MOVBU	(srcPtr), R11
+	MOVBU	R11, (R8)
+	MOVBU	(R7), R12
+	MOVBU	R12, (dstPtr)
+
+xtsSm4EncTailEnc:                               // encrypt scratch (tail plaintext || stolen ciphertext suffix) as the new last block
+	VLD1	(RSP), [B0.B16]
+	VEOR	TW.B16, B0.B16, B0.B16          // pre-whiten with tweak
+	VREV32	B0.B16, B0.B16                  // byte-swap each 32-bit word — presumably the layout SM4_ROUND expects
+	VMOV	B0.S[1], B1.S[0]                // spread the 4 state words across B0..B3, one word per register
+	VMOV	B0.S[2], B2.S[0]
+	VMOV	B0.S[3], B3.S[0]
+
+	MOVD	rkSave, rk                      // rewind to round key 0
+	EOR	R13, R13
+
+tailEncLoop:                                    // 32 SM4 rounds
+	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
+	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
+	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
+	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
+	ADD	$1, R13
+	CMP	$8, R13
+	BNE	tailEncLoop
+
+	VMOV	B2.S[0], B3.S[1]                // gather the result words back into B3 (inverse of the spread above)
+	VMOV	B1.S[0], B3.S[2]
+	VMOV	B0.S[0], B3.S[3]
+	VREV32	B3.B16, B3.B16
+
+	VEOR	TW.B16, B3.B16, B3.B16          // post-whiten with tweak
+	VST1	[B3.B16], (R9)                  // overwrite the former last full block with the re-encrypted one
+
+xtsSm4EncDone:
+	VST1	[TW.B16], (twPtr)               // persist the final tweak back to *tweak for the caller
 	RET
 
 // func decryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)