diff --git a/sm4/xts_arm64.s b/sm4/xts_arm64.s
index 07e7cea..651ebb6 100644
--- a/sm4/xts_arm64.s
+++ b/sm4/xts_arm64.s
@@ -372,545 +372,12 @@ xtsSm4EncDone:
 // func encryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
 TEXT ·encryptSm4XtsGB(SB),0,$128-64
-	LOAD_SM4_AESNI_CONSTS()
-	MOVD xk+0(FP), rk
-	MOVD tweak+8(FP), twPtr
-	MOVD dst+16(FP), dstPtr
-	MOVD src+40(FP), srcPtr
-	MOVD src_len+48(FP), srcPtrLen
-
-	VEOR POLY.B16, POLY.B16, POLY.B16
-	VEOR ZERO.B16, ZERO.B16, ZERO.B16
-
-	MOVD $0xE1, I
-	LSL $56, I
-	VMOV I, POLY.D[1]
-
-	MOVD rk, rkSave
-	VLD1 (twPtr), [TW.B16]
-
-xtsSm4EncOctets:
-	CMP $128, srcPtrLen
-	BLT xtsSm4EncNibbles
-	SUB $128, srcPtrLen
-
-	prepareGB8Tweaks
-	load8blocks
-	MOVD rkSave, rk
-	EOR R13, R13
-
-encOctetsEnc8Blocks:
-	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
-	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
-	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
-	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)
-
-	ADD $1, R13
-	CMP $8, R13
-	BNE encOctetsEnc8Blocks
-
-	store8blocks
-	B xtsSm4EncOctets
-
-xtsSm4EncNibbles:
-	CMP $64, srcPtrLen
-	BLT xtsSm4EncSingles
-	SUB $64, srcPtrLen
-
-	prepareGB4Tweaks
-	load4blocks
-	MOVD rkSave, rk
-	EOR R13, R13
-
-encNibblesEnc4Blocks:
-	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
-	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
-	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
-	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
-
-	ADD $1, R13
-	CMP $8, R13
-	BNE encNibblesEnc4Blocks
-
-	store4blocks
-
-xtsSm4EncSingles:
-	CMP $16, srcPtrLen
-	BLT xtsSm4EncTail
-	SUB $16, srcPtrLen
-
-	loadOneBlock
-
-	MOVD rkSave, rk
-	EOR R13, R13
-
-encSinglesEnc4Blocks:
-	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
-	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
-	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
-	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
-	ADD $1, R13
-	CMP $8, R13
-	BNE encSinglesEnc4Blocks
-
-	storeOneBlock
-	mul2GBInline
-	B xtsSm4EncSingles
-
-xtsSm4EncTail:
-	CBZ srcPtrLen, xtsSm4EncDone
-	SUB $16, dstPtr, R7
-	MOVD R7, R9
-	MOVD RSP, R8
-	VLD1 (R7), [B0.B16]
-	VST1 [B0.B16], (R8)
-
-	TBZ $3, srcPtrLen, less_than8
-	MOVD.P 8(srcPtr), R11
-	MOVD.P R11, 8(R8)
-	MOVD.P 8(R7), R12
-	MOVD.P R12, 8(dstPtr)
-
-less_than8:
-	TBZ $2, srcPtrLen, less_than4
-	MOVWU.P 4(srcPtr), R11
-	MOVWU.P R11, 4(R8)
-	MOVWU.P 4(R7), R12
-	MOVWU.P R12, 4(dstPtr)
-
-less_than4:
-	TBZ $1, srcPtrLen, less_than2
-	MOVHU.P 2(srcPtr), R11
-	MOVHU.P R11, 2(R8)
-	MOVHU.P 2(R7), R12
-	MOVHU.P R12, 2(dstPtr)
-
-less_than2:
-	TBZ $0, srcPtrLen, xtsSm4EncTailEnc
-	MOVBU (srcPtr), R11
-	MOVBU R11, (R8)
-	MOVBU (R7), R12
-	MOVBU R12, (dstPtr)
-
-xtsSm4EncTailEnc:
-	VLD1 (RSP), [B0.B16]
-	VEOR TW.B16, B0.B16, B0.B16
-	VREV32 B0.B16, B0.B16
-	VMOV B0.S[1], B1.S[0]
-	VMOV B0.S[2], B2.S[0]
-	VMOV B0.S[3], B3.S[0]
-
-	MOVD rkSave, rk
-	EOR R13, R13
-
-tailEncLoop:
-	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
-	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
-	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
-	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
-	ADD $1, R13
-	CMP $8, R13
-	BNE tailEncLoop
-
-	VMOV B2.S[0], B3.S[1]
-	VMOV B1.S[0], B3.S[2]
-	VMOV B0.S[0], B3.S[3]
-	VREV32 B3.B16, B3.B16
-
-	VEOR TW.B16, B3.B16, B3.B16
-	VST1 [B3.B16], (R9)
-
-xtsSm4EncDone:
-	VST1 [TW.B16], (twPtr)
 	RET

 // func decryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
 TEXT ·decryptSm4Xts(SB),0,$128-64
-	LOAD_SM4_AESNI_CONSTS()
-	MOVD xk+0(FP), rk
-	MOVD tweak+8(FP), twPtr
-	MOVD dst+16(FP), dstPtr
-	MOVD src+40(FP), srcPtr
-	MOVD src_len+48(FP), srcPtrLen
-
-	VEOR POLY.B16, POLY.B16, POLY.B16
-	VEOR ZERO.B16, ZERO.B16, ZERO.B16
-
-	MOVD $0x87, I
-	VMOV I, POLY.D[0]
-
-	MOVD rk, rkSave
-	VLD1 (twPtr), [TW.B16]
-
-xtsSm4DecOctets:
-	CMP $128, srcPtrLen
-	BLT xtsSm4DecNibbles
-	SUB $128, srcPtrLen
-
-	prepare8Tweaks
-	load8blocks
-	MOVD rkSave, rk
-	EOR R13, R13
-
-decOctetsDec8Blocks:
-	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
-	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
-	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
-	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)
-
-	ADD $1, R13
-	CMP $8, R13
-	BNE decOctetsDec8Blocks
-
-	store8blocks
-	B xtsSm4DecOctets
-
-xtsSm4DecNibbles:
-	CMP $64, srcPtrLen
-	BLT xtsSm4DecSingles
-	SUB $64, srcPtrLen
-
-	prepare4Tweaks
-	load4blocks
-	MOVD rkSave, rk
-	EOR R13, R13
-
-decNibblesDec4Blocks:
-	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
-	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
-	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
-	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
-
-	ADD $1, R13
-	CMP $8, R13
-	BNE decNibblesDec4Blocks
-
-	store4blocks
-
-xtsSm4DecSingles:
-	CMP $32, srcPtrLen
-	BLT xtsSm4DecTail
-	SUB $16, srcPtrLen
-
-	loadOneBlock
-
-	MOVD rkSave, rk
-	EOR R13, R13
-
-decSinglesDec4Blocks:
-	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
-	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
-	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
-	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
-	ADD $1, R13
-	CMP $8, R13
-	BNE decSinglesDec4Blocks
-
-	storeOneBlock
-	mul2Inline
-
-	B xtsSm4DecSingles
-
-xtsSm4DecTail:
-	CBZ srcPtrLen, xtsSm4DecDone
-
-	CMP $16, srcPtrLen
-	BEQ xtsSm4DecLastBlock
-
-	VMOV TW.B16, B4.B16
-	mul2Inline
-	loadOneBlock
-	MOVD rkSave, rk
-	EOR R13, R13
-
-decLastCompleteBlockLoop:
-	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
-	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
-	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
-	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
-	ADD $1, R13
-	CMP $8, R13
-	BNE decLastCompleteBlockLoop
-	storeOneBlock
-	VMOV B4.B16, TW.B16
-	VST1 [B3.B16], (RSP)
-
-	SUB $16, srcPtrLen
-	SUB $16, dstPtr, R7
-	MOVD R7, R9
-	MOVD RSP, R8
-
-	TBZ $3, srcPtrLen, less_than8
-	MOVD.P 8(srcPtr), R11
-	MOVD.P R11, 8(R8)
-	MOVD.P 8(R7), R12
-	MOVD.P R12, 8(dstPtr)
-
-less_than8:
-	TBZ $2, srcPtrLen, less_than4
-	MOVWU.P 4(srcPtr), R11
-	MOVWU.P R11, 4(R8)
-	MOVWU.P 4(R7), R12
-	MOVWU.P R12, 4(dstPtr)
-
-less_than4:
-	TBZ $1, srcPtrLen, less_than2
-	MOVHU.P 2(srcPtr), R11
-	MOVHU.P R11, 2(R8)
-	MOVHU.P 2(R7), R12
-	MOVHU.P R12, 2(dstPtr)
-
-less_than2:
-	TBZ $0, srcPtrLen, xtsSm4DecTailEnc
-	MOVBU (srcPtr), R11
-	MOVBU R11, (R8)
-	MOVBU (R7), R12
-	MOVBU R12, (dstPtr)
-
-xtsSm4DecTailEnc:
-	VLD1 (RSP), [B0.B16]
-	VEOR TW.B16, B0.B16, B0.B16
-	VREV32 B0.B16, B0.B16
-	VMOV B0.S[1], B1.S[0]
-	VMOV B0.S[2], B2.S[0]
-	VMOV B0.S[3], B3.S[0]
-
-	MOVD rkSave, rk
-	EOR R13, R13
-
-tailDecLoop:
-	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
-	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
-	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
-	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
-	ADD $1, R13
-	CMP $8, R13
-	BNE tailDecLoop
-
-	VMOV B2.S[0], B3.S[1]
-	VMOV B1.S[0], B3.S[2]
-	VMOV B0.S[0], B3.S[3]
-	VREV32 B3.B16, B3.B16
-
-	VEOR TW.B16, B3.B16, B3.B16
-	VST1 [B3.B16], (R9)
-
-	B xtsSm4DecDone
-
-xtsSm4DecLastBlock:
-	loadOneBlock
-
-	MOVD rkSave, rk
-	EOR R13, R13
-
-decLastBlockLoop:
-	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
-	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
-	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
-	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
-	ADD $1, R13
-	CMP $8, R13
-	BNE decLastBlockLoop
-
-	storeOneBlock
-	mul2Inline
-
-xtsSm4DecDone:
-	VST1 [TW.B16], (twPtr)
 	RET

 // func decryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
 TEXT ·decryptSm4XtsGB(SB),0,$128-64
-	LOAD_SM4_AESNI_CONSTS()
-	MOVD xk+0(FP), rk
-	MOVD tweak+8(FP), twPtr
-	MOVD dst+16(FP), dstPtr
-	MOVD src+40(FP), srcPtr
-	MOVD src_len+48(FP), srcPtrLen
-
-	VEOR POLY.B16, POLY.B16, POLY.B16
-	VEOR ZERO.B16, ZERO.B16, ZERO.B16
-
-	MOVD $0xE1, I
-	LSL $56, I
-	VMOV I, POLY.D[1]
-
-	MOVD rk, rkSave
-	VLD1 (twPtr), [TW.B16]
-
-xtsSm4DecOctets:
-	CMP $128, srcPtrLen
-	BLT xtsSm4DecNibbles
-	SUB $128, srcPtrLen
-
-	prepareGB8Tweaks
-	load8blocks
-	MOVD rkSave, rk
-	EOR R13, R13
-
-decOctetsDec8Blocks:
-	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
-	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
-	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
-	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)
-
-	ADD $1, R13
-	CMP $8, R13
-	BNE decOctetsDec8Blocks
-
-	store8blocks
-	B xtsSm4DecOctets
-
-xtsSm4DecNibbles:
-	CMP $64, srcPtrLen
-	BLT xtsSm4DecSingles
-	SUB $64, srcPtrLen
-
-	prepareGB4Tweaks
-	load4blocks
-	MOVD rkSave, rk
-	EOR R13, R13
-
-decNibblesDec4Blocks:
-	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
-	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
-	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
-	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
-
-	ADD $1, R13
-	CMP $8, R13
-	BNE decNibblesDec4Blocks
-
-	store4blocks
-
-xtsSm4DecSingles:
-	CMP $32, srcPtrLen
-	BLT xtsSm4DecTail
-	SUB $16, srcPtrLen
-
-	loadOneBlock
-
-	MOVD rkSave, rk
-	EOR R13, R13
-
-decSinglesDec4Blocks:
-	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
-	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
-	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
-	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
-	ADD $1, R13
-	CMP $8, R13
-	BNE decSinglesDec4Blocks
-
-	storeOneBlock
-	mul2GBInline
-
-	B xtsSm4DecSingles
-
-xtsSm4DecTail:
-	CBZ srcPtrLen, xtsSm4DecDone
-
-	CMP $16, srcPtrLen
-	BEQ xtsSm4DecLastBlock
-
-	VMOV TW.B16, B4.B16
-	mul2GBInline
-	loadOneBlock
-	MOVD rkSave, rk
-	EOR R13, R13
-
-decLastCompleteBlockLoop:
-	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
-	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
-	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
-	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
-	ADD $1, R13
-	CMP $8, R13
-	BNE decLastCompleteBlockLoop
-	storeOneBlock
-	VMOV B4.B16, TW.B16
-	VST1 [B3.B16], (RSP)
-
-	SUB $16, srcPtrLen
-	SUB $16, dstPtr, R7
-	MOVD R7, R9
-	MOVD RSP, R8
-
-	TBZ $3, srcPtrLen, less_than8
-	MOVD.P 8(srcPtr), R11
-	MOVD.P R11, 8(R8)
-	MOVD.P 8(R7), R12
-	MOVD.P R12, 8(dstPtr)
-
-less_than8:
-	TBZ $2, srcPtrLen, less_than4
-	MOVWU.P 4(srcPtr), R11
-	MOVWU.P R11, 4(R8)
-	MOVWU.P 4(R7), R12
-	MOVWU.P R12, 4(dstPtr)
-
-less_than4:
-	TBZ $1, srcPtrLen, less_than2
-	MOVHU.P 2(srcPtr), R11
-	MOVHU.P R11, 2(R8)
-	MOVHU.P 2(R7), R12
-	MOVHU.P R12, 2(dstPtr)
-
-less_than2:
-	TBZ $0, srcPtrLen, xtsSm4DecTailEnc
-	MOVBU (srcPtr), R11
-	MOVBU R11, (R8)
-	MOVBU (R7), R12
-	MOVBU R12, (dstPtr)
-
-xtsSm4DecTailEnc:
-	VLD1 (RSP), [B0.B16]
-	VEOR TW.B16, B0.B16, B0.B16
-	VREV32 B0.B16, B0.B16
-	VMOV B0.S[1], B1.S[0]
-	VMOV B0.S[2], B2.S[0]
-	VMOV B0.S[3], B3.S[0]
-
-	MOVD rkSave, rk
-	EOR R13, R13
-
-tailDecLoop:
-	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
-	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
-	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
-	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
-	ADD $1, R13
-	CMP $8, R13
-	BNE tailDecLoop
-
-	VMOV B2.S[0], B3.S[1]
-	VMOV B1.S[0], B3.S[2]
-	VMOV B0.S[0], B3.S[3]
-	VREV32 B3.B16, B3.B16
-
-	VEOR TW.B16, B3.B16, B3.B16
-	VST1 [B3.B16], (R9)
-
-	B xtsSm4DecDone
-
-xtsSm4DecLastBlock:
-	loadOneBlock
-
-	MOVD rkSave, rk
-	EOR R13, R13
-
-decLastBlockLoop:
-	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
-	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
-	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
-	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
-	ADD $1, R13
-	CMP $8, R13
-	BNE decLastBlockLoop
-
-	storeOneBlock
-	mul2GBInline
-
-xtsSm4DecDone:
-	VST1 [TW.B16], (twPtr)
 	RET