sm4: xts asm arm64 test one func first

Sun Yimin 2023-08-24 16:07:37 +08:00 committed by GitHub
parent 4fc2acf95c
commit 3a304ee8dd


@@ -372,545 +372,12 @@ xtsSm4EncDone:
// func encryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
TEXT ·encryptSm4XtsGB(SB),0,$128-64
LOAD_SM4_AESNI_CONSTS()
MOVD xk+0(FP), rk
MOVD tweak+8(FP), twPtr
MOVD dst+16(FP), dstPtr
MOVD src+40(FP), srcPtr
MOVD src_len+48(FP), srcPtrLen
VEOR POLY.B16, POLY.B16, POLY.B16
VEOR ZERO.B16, ZERO.B16, ZERO.B16
MOVD $0xE1, I
LSL $56, I
VMOV I, POLY.D[1]
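// GB/T 17964 variant: bit-reversed field convention, so the reduction
// constant is 0xE1 in the top byte of POLY (the standard variant below
// uses 0x87 in the low doubleword instead).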
MOVD rk, rkSave
VLD1 (twPtr), [TW.B16]
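// Drain the input 8 blocks (128 bytes) at a time, then 4 blocks, then
// single blocks; any ragged tail is finished with ciphertext stealing.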
xtsSm4EncOctets:
CMP $128, srcPtrLen
BLT xtsSm4EncNibbles
SUB $128, srcPtrLen
prepareGB8Tweaks
load8blocks
MOVD rkSave, rk
EOR R13, R13
encOctetsEnc8Blocks:
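// 8 iterations x 4 round macros = the full 32 SM4 rounds, applied to
// 8 blocks in parallel.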
SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)
ADD $1, R13
CMP $8, R13
BNE encOctetsEnc8Blocks
store8blocks
B xtsSm4EncOctets
xtsSm4EncNibbles:
CMP $64, srcPtrLen
BLT xtsSm4EncSingles
SUB $64, srcPtrLen
prepareGB4Tweaks
load4blocks
MOVD rkSave, rk
EOR R13, R13
encNibblesEnc4Blocks:
SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
BNE encNibblesEnc4Blocks
store4blocks
xtsSm4EncSingles:
CMP $16, srcPtrLen
BLT xtsSm4EncTail
SUB $16, srcPtrLen
loadOneBlock
MOVD rkSave, rk
EOR R13, R13
encSinglesEnc4Blocks:
SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
BNE encSinglesEnc4Blocks
storeOneBlock
mul2GBInline
B xtsSm4EncSingles
xtsSm4EncTail:
CBZ srcPtrLen, xtsSm4EncDone
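// Ciphertext stealing: R7 points at the last full ciphertext block just
// written. Copy it to the stack, overwrite its head with the remaining
// plaintext bytes, and emit its head as the final partial ciphertext.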
SUB $16, dstPtr, R7
MOVD R7, R9
MOVD RSP, R8
VLD1 (R7), [B0.B16]
VST1 [B0.B16], (R8)
TBZ $3, srcPtrLen, less_than8
MOVD.P 8(srcPtr), R11
MOVD.P R11, 8(R8)
MOVD.P 8(R7), R12
MOVD.P R12, 8(dstPtr)
less_than8:
TBZ $2, srcPtrLen, less_than4
MOVWU.P 4(srcPtr), R11
MOVWU.P R11, 4(R8)
MOVWU.P 4(R7), R12
MOVWU.P R12, 4(dstPtr)
less_than4:
TBZ $1, srcPtrLen, less_than2
MOVHU.P 2(srcPtr), R11
MOVHU.P R11, 2(R8)
MOVHU.P 2(R7), R12
MOVHU.P R12, 2(dstPtr)
less_than2:
TBZ $0, srcPtrLen, xtsSm4EncTailEnc
MOVBU (srcPtr), R11
MOVBU R11, (R8)
MOVBU (R7), R12
MOVBU R12, (dstPtr)
xtsSm4EncTailEnc:
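// Re-encrypt the stitched block with the final tweak value. The
// single-block round macros keep the block's four words in
// B0.S[0]..B3.S[0], hence the VREV32 byte swap and lane moves.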
VLD1 (RSP), [B0.B16]
VEOR TW.B16, B0.B16, B0.B16
VREV32 B0.B16, B0.B16
VMOV B0.S[1], B1.S[0]
VMOV B0.S[2], B2.S[0]
VMOV B0.S[3], B3.S[0]
MOVD rkSave, rk
EOR R13, R13
tailEncLoop:
SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
BNE tailEncLoop
VMOV B2.S[0], B3.S[1]
VMOV B1.S[0], B3.S[2]
VMOV B0.S[0], B3.S[3]
VREV32 B3.B16, B3.B16
VEOR TW.B16, B3.B16, B3.B16
VST1 [B3.B16], (R9)
xtsSm4EncDone:
VST1 [TW.B16], (twPtr)
RET
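
For reference, the tweak update that mul2GBInline performs in vector
registers (the macro is defined elsewhere) corresponds to the following
portable Go sketch of the GB/T 17964 bit-reversed doubling; the 0xE1
constant is the same one loaded into POLY above. This is a sketch of the
expected computation, not the macro itself.

// mul2GB multiplies the tweak by x in GF(2^128) under the GB/T 17964
// bit-reversed convention, where bit 7 of tweak[0] holds the
// coefficient of x^0. Portable sketch only.
func mul2GB(tweak *[16]byte) {
	var carryIn byte
	for j := range tweak {
		carryOut := (tweak[j] & 1) << 7
		tweak[j] = tweak[j]>>1 | carryIn
		carryIn = carryOut
	}
	if carryIn != 0 {
		// Reduce: x^128 = x^7 + x^2 + x + 1, i.e. 0xE1 bit-reversed.
		tweak[0] ^= 0xE1
	}
}
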
// func decryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
TEXT ·decryptSm4Xts(SB),0,$128-64
LOAD_SM4_AESNI_CONSTS()
MOVD xk+0(FP), rk
MOVD tweak+8(FP), twPtr
MOVD dst+16(FP), dstPtr
MOVD src+40(FP), srcPtr
MOVD src_len+48(FP), srcPtrLen
VEOR POLY.B16, POLY.B16, POLY.B16
VEOR ZERO.B16, ZERO.B16, ZERO.B16
MOVD $0x87, I
VMOV I, POLY.D[0]
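// Standard IEEE P1619 XTS reduction constant 0x87 in the low doubleword.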
MOVD rk, rkSave
VLD1 (twPtr), [TW.B16]
xtsSm4DecOctets:
CMP $128, srcPtrLen
BLT xtsSm4DecNibbles
SUB $128, srcPtrLen
prepare8Tweaks
load8blocks
MOVD rkSave, rk
EOR R13, R13
decOctetsDec8Blocks:
SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)
ADD $1, R13
CMP $8, R13
BNE decOctetsDec8Blocks
store8blocks
B xtsSm4DecOctets
xtsSm4DecNibbles:
CMP $64, srcPtrLen
BLT xtsSm4DecSingles
SUB $64, srcPtrLen
prepare4Tweaks
load4blocks
MOVD rkSave, rk
EOR R13, R13
decNibblesDec4Blocks:
SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
BNE decNibblesDec4Blocks
store4blocks
xtsSm4DecSingles:
CMP $32, srcPtrLen
BLT xtsSm4DecTail // keep a full block in reserve for ciphertext stealing
SUB $16, srcPtrLen
loadOneBlock
MOVD rkSave, rk
EOR R13, R13
decSinglesDec4Blocks:
SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
BNE decSinglesDec4Blocks
storeOneBlock
mul2Inline
B xtsSm4DecSingles
xtsSm4DecTail:
CBZ srcPtrLen, xtsSm4DecDone
CMP $16, srcPtrLen
BEQ xtsSm4DecLastBlock
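// Decrypt-side stealing: save the current tweak in B4 and advance TW to
// the next tweak, since the last full ciphertext block must be decrypted
// with the later tweak before the partial block can be handled.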
VMOV TW.B16, B4.B16
mul2Inline
loadOneBlock
MOVD rkSave, rk
EOR R13, R13
decLastCompleteBlockLoop:
SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
BNE decLastCompleteBlockLoop
storeOneBlock
VMOV B4.B16, TW.B16
VST1 [B3.B16], (RSP)
SUB $16, srcPtrLen
SUB $16, dstPtr, R7
MOVD R7, R9
MOVD RSP, R8
TBZ $3, srcPtrLen, less_than8
MOVD.P 8(srcPtr), R11
MOVD.P R11, 8(R8)
MOVD.P 8(R7), R12
MOVD.P R12, 8(dstPtr)
less_than8:
TBZ $2, srcPtrLen, less_than4
MOVWU.P 4(srcPtr), R11
MOVWU.P R11, 4(R8)
MOVWU.P 4(R7), R12
MOVWU.P R12, 4(dstPtr)
less_than4:
TBZ $1, srcPtrLen, less_than2
MOVHU.P 2(srcPtr), R11
MOVHU.P R11, 2(R8)
MOVHU.P 2(R7), R12
MOVHU.P R12, 2(dstPtr)
less_than2:
TBZ $0, srcPtrLen, xtsSm4DecTailEnc
MOVBU (srcPtr), R11
MOVBU R11, (R8)
MOVBU (R7), R12
MOVBU R12, (dstPtr)
xtsSm4DecTailEnc:
VLD1 (RSP), [B0.B16]
VEOR TW.B16, B0.B16, B0.B16
VREV32 B0.B16, B0.B16
VMOV B0.S[1], B1.S[0]
VMOV B0.S[2], B2.S[0]
VMOV B0.S[3], B3.S[0]
MOVD rkSave, rk
EOR R13, R13
tailDecLoop:
SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
BNE tailDecLoop
VMOV B2.S[0], B3.S[1]
VMOV B1.S[0], B3.S[2]
VMOV B0.S[0], B3.S[3]
VREV32 B3.B16, B3.B16
VEOR TW.B16, B3.B16, B3.B16
VST1 [B3.B16], (R9)
B xtsSm4DecDone
xtsSm4DecLastBlock:
loadOneBlock
MOVD rkSave, rk
EOR R13, R13
decLastBlockLoop:
SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
BNE decLastBlockLoop
storeOneBlock
mul2Inline
xtsSm4DecDone:
VST1 [TW.B16], (twPtr)
RET
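
The xtsSm4DecTail path above is the decrypt side of XTS ciphertext
stealing: the last full ciphertext block has to be decrypted with the
next tweak (hence the save to B4, mul2Inline, restore sequence) before
the stitched partial block is decrypted with the current one. A plain-Go
sketch of that ordering, where decryptBlock and mul2 are hypothetical
stand-ins for the SM4 block decryption and tweak doubling done in
assembly:

// stealDecrypt sketches the tail handling for 16 < len(cc) < 32 bytes.
// cc is the last full ciphertext block followed by the partial block;
// out receives the corresponding plaintext. decryptBlock and mul2 are
// hypothetical stand-ins for the primitives implemented above.
func stealDecrypt(cc, out []byte, tweak *[16]byte) {
	n := len(cc) - 16 // partial-block length, 1..15

	next := *tweak
	mul2(&next) // tweak m+1, computed after saving TW in B4

	// Decrypt the last full block with tweak m+1 (xor-decrypt-xor).
	var pp [16]byte
	xor16(pp[:], cc[:16], next[:])
	decryptBlock(pp[:], pp[:])
	xor16(pp[:], pp[:], next[:])

	// Stitch the partial ciphertext onto the stolen tail of pp, then
	// decrypt with tweak m (the TW value restored from B4 above).
	var last [16]byte
	copy(last[:], pp[:])
	copy(last[:n], cc[16:])
	xor16(last[:], last[:], tweak[:])
	decryptBlock(last[:], last[:])
	xor16(last[:], last[:], tweak[:])

	copy(out[:16], last[:]) // plaintext block m-1
	copy(out[16:], pp[:n])  // final partial plaintext
}

func xor16(dst, a, b []byte) {
	for i := 0; i < 16; i++ {
		dst[i] = a[i] ^ b[i]
	}
}
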
// func decryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
TEXT ·decryptSm4XtsGB(SB),0,$128-64
LOAD_SM4_AESNI_CONSTS()
MOVD xk+0(FP), rk
MOVD tweak+8(FP), twPtr
MOVD dst+16(FP), dstPtr
MOVD src+40(FP), srcPtr
MOVD src_len+48(FP), srcPtrLen
VEOR POLY.B16, POLY.B16, POLY.B16
VEOR ZERO.B16, ZERO.B16, ZERO.B16
MOVD $0xE1, I
LSL $56, I
VMOV I, POLY.D[1]
MOVD rk, rkSave
VLD1 (twPtr), [TW.B16]
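// Identical structure to decryptSm4Xts above, but with the GB/T tweak
// doubling (mul2GBInline) and the prepareGB*Tweaks macros throughout.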
xtsSm4DecOctets:
CMP $128, srcPtrLen
BLT xtsSm4DecNibbles
SUB $128, srcPtrLen
prepareGB8Tweaks
load8blocks
MOVD rkSave, rk
EOR R13, R13
decOctetsDec8Blocks:
SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)
ADD $1, R13
CMP $8, R13
BNE decOctetsDec8Blocks
store8blocks
B xtsSm4DecOctets
xtsSm4DecNibbles:
CMP $64, srcPtrLen
BLT xtsSm4DecSingles
SUB $64, srcPtrLen
prepareGB4Tweaks
load4blocks
MOVD rkSave, rk
EOR R13, R13
decNibblesDec4Blocks:
SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
BNE decNibblesDec4Blocks
store4blocks
xtsSm4DecSingles:
CMP $32, srcPtrLen
BLT xtsSm4DecTail // keep a full block in reserve for ciphertext stealing
SUB $16, srcPtrLen
loadOneBlock
MOVD rkSave, rk
EOR R13, R13
decSinglesDec4Blocks:
SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
BNE decSinglesDec4Blocks
storeOneBlock
mul2GBInline
B xtsSm4DecSingles
xtsSm4DecTail:
CBZ srcPtrLen, xtsSm4DecDone
CMP $16, srcPtrLen
BEQ xtsSm4DecLastBlock
VMOV TW.B16, B4.B16
mul2GBInline
loadOneBlock
MOVD rkSave, rk
EOR R13, R13
decLastCompleteBlockLoop:
SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
BNE decLastCompleteBlockLoop
storeOneBlock
VMOV B4.B16, TW.B16
VST1 [B3.B16], (RSP)
SUB $16, srcPtrLen
SUB $16, dstPtr, R7
MOVD R7, R9
MOVD RSP, R8
TBZ $3, srcPtrLen, less_than8
MOVD.P 8(srcPtr), R11
MOVD.P R11, 8(R8)
MOVD.P 8(R7), R12
MOVD.P R12, 8(dstPtr)
less_than8:
TBZ $2, srcPtrLen, less_than4
MOVWU.P 4(srcPtr), R11
MOVWU.P R11, 4(R8)
MOVWU.P 4(R7), R12
MOVWU.P R12, 4(dstPtr)
less_than4:
TBZ $1, srcPtrLen, less_than2
MOVHU.P 2(srcPtr), R11
MOVHU.P R11, 2(R8)
MOVHU.P 2(R7), R12
MOVHU.P R12, 2(dstPtr)
less_than2:
TBZ $0, srcPtrLen, xtsSm4DecTailEnc
MOVBU (srcPtr), R11
MOVBU R11, (R8)
MOVBU (R7), R12
MOVBU R12, (dstPtr)
xtsSm4DecTailEnc:
VLD1 (RSP), [B0.B16]
VEOR TW.B16, B0.B16, B0.B16
VREV32 B0.B16, B0.B16
VMOV B0.S[1], B1.S[0]
VMOV B0.S[2], B2.S[0]
VMOV B0.S[3], B3.S[0]
MOVD rkSave, rk
EOR R13, R13
tailDecLoop:
SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
BNE tailDecLoop
VMOV B2.S[0], B3.S[1]
VMOV B1.S[0], B3.S[2]
VMOV B0.S[0], B3.S[3]
VREV32 B3.B16, B3.B16
VEOR TW.B16, B3.B16, B3.B16
VST1 [B3.B16], (R9)
B xtsSm4DecDone
xtsSm4DecLastBlock:
loadOneBlock
MOVD rkSave, rk
EOR R13, R13
decLastBlockLoop:
SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
ADD $1, R13
CMP $8, R13
BNE decLastBlockLoop
storeOneBlock
mul2GBInline
xtsSm4DecDone:
VST1 [TW.B16], (twPtr)
RET
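
Control flow aside, every routine in this file is the same
xor-encrypt-xor loop fanned out over 8, 4, or 1 blocks per pass. A
minimal Go sketch of the full-block path, with encryptBlock and mul2
again hypothetical stand-ins:

// xtsEncryptBlocks sketches the full-block loop that the octets,
// nibbles, and singles paths above unroll: XOR with the tweak, encrypt,
// XOR again, then double the tweak for the next block. Use mul2GB for
// the GB/T 17964 variant.
func xtsEncryptBlocks(dst, src []byte, tweak *[16]byte) {
	for len(src) >= 16 {
		for i := 0; i < 16; i++ {
			dst[i] = src[i] ^ tweak[i]
		}
		encryptBlock(dst[:16], dst[:16]) // one SM4 block encryption (32 rounds)
		for i := 0; i < 16; i++ {
			dst[i] ^= tweak[i]
		}
		mul2(tweak)
		dst, src = dst[16:], src[16:]
	}
	// A ragged 1..15-byte tail is finished with ciphertext stealing, as
	// in the xtsSm4EncTail / xtsSm4DecTail paths above.
}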