[SM4] try arm64 gcmSm4Enc gcmSm4Dec
This commit is contained in:
parent 067a12cb20
commit 129803a389

sm4/gcm_arm64.s: 645 lines

@@ -463,3 +463,648 @@ dataBail:
#undef autLen
#undef H0
#undef pTblSave

// func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4Enc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define rk R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define H0 R9
#define H1 R10
#define pTblSave R11
#define rkSave R12
#define mulRoundSingleWithoutRev(X) \
    VEOR ACC0.B16, X.B16, X.B16 \
    VEXT $8, X.B16, X.B16, T0.B16 \
    VEOR X.B16, T0.B16, T0.B16 \
    VPMULL X.D1, T1.D1, ACC1.Q1 \
    VPMULL2 X.D2, T1.D2, ACC0.Q1 \
    VPMULL T0.D1, T2.D1, ACCM.Q1 \
    reduce() \

#define mulRoundSingle(X) \
    VREV64 X.B16, X.B16 \
    mulRoundSingleWithoutRev(X) \
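
    // Arguments are read straight from the Go ABI0 frame: productTable at +0,
    // the dst slice base at +8, src base/len at +32/+40, ctr at +56, T at +64
    // and the rk slice base at +72. dst's length is not loaded, presumably
    // because the caller guarantees len(dst) >= len(src).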
    MOVD productTable+0(FP), pTbl
    MOVD dst+8(FP), dstPtr
    MOVD src_base+32(FP), srcPtr
    MOVD src_len+40(FP), srcPtrLen
    MOVD ctr+56(FP), ctrPtr
    MOVD T+64(FP), tPtr
    MOVD rk_base+72(FP), rk

    MOVD $0xC2, H1
    LSL $56, H1
    MOVD $1, H0
    VMOV H1, POLY.D[0]
    VMOV H0, POLY.D[1]
    VEOR ZERO.B16, ZERO.B16, ZERO.B16

    MOVD pTbl, pTblSave
    MOVD rk, rkSave
    // Current tag, after AAD
    VLD1 (tPtr), [ACC0.B16]
    VEOR ACC1.B16, ACC1.B16, ACC1.B16
    VEOR ACCM.B16, ACCM.B16, ACCM.B16
    // Prepare initial counter, and the increment vector
    VLD1 (ctrPtr), [CTR.B16]
    VEOR INC.B16, INC.B16, INC.B16
    MOVD $1, H0
    VMOV H0, INC.S[3]
    VREV32 CTR.B16, CTR.B16
    VADD CTR.S4, INC.S4, CTR.S4
    // Skip to <8 blocks loop
    CMP $128, srcPtrLen

    LOAD_SM4_AESNI_CONSTS()

    BLT encNibblesLoop
    // There are at least 8 blocks to encrypt
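
    // The counter block is kept byte-swapped (VREV32) so that the 32-bit
    // big-endian GCM counter in its last word can be advanced with a plain
    // vector add of INC = {0, 0, 0, 1}; each block is swapped back to
    // big-endian form just before it is run through SM4.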
encOctetsLoop:
    SUB $128, srcPtrLen
    // Prepare 8 counters
    VMOV CTR.B16, B0.B16
    VADD B0.S4, INC.S4, B1.S4
    VREV32 B0.B16, B0.B16
    VADD B1.S4, INC.S4, B2.S4
    VREV32 B1.B16, B1.B16
    VADD B2.S4, INC.S4, B3.S4
    VREV32 B2.B16, B2.B16
    VADD B3.S4, INC.S4, B4.S4
    VREV32 B3.B16, B3.B16
    VADD B4.S4, INC.S4, B5.S4
    VREV32 B4.B16, B4.B16
    VADD B5.S4, INC.S4, B6.S4
    VREV32 B5.B16, B5.B16
    VADD B6.S4, INC.S4, B7.S4
    VREV32 B6.B16, B6.B16
    VADD B7.S4, INC.S4, CTR.S4
    VREV32 B7.B16, B7.B16

    // Encrypt the first 4 counter blocks
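    // Each SM4_ROUND consumes one round key; four calls per iteration and
    // eight iterations of R13 give every block the full 32 SM4 rounds. The
    // second loop below keeps counting R13 from 8 to 16 to apply the same
    // 32 rounds to B4-B7.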
    EOR R13, R13
    MOVD rkSave, rk

encOctetsEnc4Blocks1:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
    BNE encOctetsEnc4Blocks1
    VREV32 B0.B16, B0.B16
    VREV32 B1.B16, B1.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16

    // Encrypt the second 4 counter blocks
    MOVD rkSave, rk

encOctetsEnc4Blocks2:
    SM4_ROUND(rk, K0, K1, K2, B4, B5, B6, B7)
    SM4_ROUND(rk, K0, K1, K2, B5, B6, B7, B4)
    SM4_ROUND(rk, K0, K1, K2, B6, B7, B4, B5)
    SM4_ROUND(rk, K0, K1, K2, B7, B4, B5, B6)

    ADD $1, R13
    CMP $16, R13
    BNE encOctetsEnc4Blocks2
    VREV32 B4.B16, B4.B16
    VREV32 B5.B16, B5.B16
    VREV32 B6.B16, B6.B16
    VREV32 B7.B16, B7.B16

    // XOR plaintext and store ciphertext
    VLD1.P 32(srcPtr), [T1.B16, T2.B16]
    VEOR B0.B16, T1.B16, B0.B16
    VEOR B1.B16, T2.B16, B1.B16
    VST1.P [B0.B16, B1.B16], 32(dstPtr)
    VLD1.P 32(srcPtr), [T1.B16, T2.B16]
    VEOR B2.B16, T1.B16, B2.B16
    VEOR B3.B16, T2.B16, B3.B16
    VST1.P [B2.B16, B3.B16], 32(dstPtr)
    VLD1.P 32(srcPtr), [T1.B16, T2.B16]
    VEOR B4.B16, T1.B16, B4.B16
    VEOR B5.B16, T2.B16, B5.B16
    VST1.P [B4.B16, B5.B16], 32(dstPtr)
    VLD1.P 32(srcPtr), [T1.B16, T2.B16]
    VEOR B6.B16, T1.B16, B6.B16
    VEOR B7.B16, T2.B16, B7.B16
    VST1.P [B6.B16, B7.B16], 32(dstPtr)
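
    // GHASH the eight ciphertext blocks: the running tag in ACC0 is folded into
    // the first block, and each mulRound pulls the next precomputed power of H
    // (plus its Karatsuba fold) from productTable, accumulating unreduced
    // carry-less products in ACC0/ACC1/ACCM; reduce() then reduces once modulo
    // the GCM polynomial. The result is equivalent to the serial
    // Y = (Y xor C[i]) * H applied block by block.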
    VLD1.P 32(pTbl), [T1.B16, T2.B16]
    VREV64 B0.B16, B0.B16
    VEOR ACC0.B16, B0.B16, B0.B16
    VEXT $8, B0.B16, B0.B16, T0.B16
    VEOR B0.B16, T0.B16, T0.B16
    VPMULL B0.D1, T1.D1, ACC1.Q1
    VPMULL2 B0.D2, T1.D2, ACC0.Q1
    VPMULL T0.D1, T2.D1, ACCM.Q1

    mulRound(B1)
    mulRound(B2)
    mulRound(B3)
    mulRound(B4)
    mulRound(B5)
    mulRound(B6)
    mulRound(B7)
    MOVD pTblSave, pTbl
    reduce()

    CMP $128, srcPtrLen
    BGE encOctetsLoop
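
    // Fewer than 128 bytes remain: one 4-block pass handles 64..127 bytes,
    // encStartSingles encrypts the remaining whole blocks one at a time, and
    // encTail builds the final partial block.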
encNibblesLoop:
    CBZ srcPtrLen, encDone
    ADD $14*16, pTbl
    // Preload H and its Karatsuba precomp
    VLD1.P (pTbl), [T1.B16, T2.B16]

    CMP $64, srcPtrLen
    BLT encStartSingles
    SUB $64, srcPtrLen

    // Prepare 4 counters
    VMOV CTR.B16, B0.B16
    VADD B0.S4, INC.S4, B1.S4
    VREV32 B0.B16, B0.B16
    VADD B1.S4, INC.S4, B2.S4
    VREV32 B1.B16, B1.B16
    VADD B2.S4, INC.S4, B3.S4
    VREV32 B2.B16, B2.B16
    VADD B3.S4, INC.S4, B4.S4
    VREV32 B3.B16, B3.B16

    // Encrypt the 4 counter blocks
    EOR R13, R13
    MOVD rkSave, rk

encNibblesEnc4Blocks:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
    BNE encNibblesEnc4Blocks
    VREV32 B0.B16, B0.B16
    VREV32 B1.B16, B1.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16

    // XOR plaintext and store ciphertext
    VLD1.P 32(srcPtr), [K1.B16, K2.B16]
    VEOR B0.B16, K1.B16, B0.B16
    VEOR B1.B16, K2.B16, B1.B16
    VST1.P [B0.B16, B1.B16], 32(dstPtr)
    VLD1.P 32(srcPtr), [K1.B16, K2.B16]
    VEOR B2.B16, K1.B16, B2.B16
    VEOR B3.B16, K2.B16, B3.B16
    VST1.P [B2.B16, B3.B16], 32(dstPtr)

    mulRoundSingle(B0)
    mulRoundSingle(B1)
    mulRoundSingle(B2)
    mulRoundSingle(B3)

encStartSingles:
    CBZ srcPtrLen, encDone

    // Prepare 4 counters
    VMOV CTR.B16, B0.B16
    VADD B0.S4, INC.S4, B1.S4
    VREV32 B0.B16, B0.B16
    VADD B1.S4, INC.S4, B2.S4
    VREV32 B1.B16, B1.B16
    VADD B2.S4, INC.S4, B3.S4
    VREV32 B2.B16, B2.B16
    VADD B3.S4, INC.S4, B4.S4
    VREV32 B3.B16, B3.B16

    // Encrypt the 4 counter blocks
    EOR R13, R13
    MOVD rkSave, rk

encSinglesEnc4Blocks:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
    BNE encSinglesEnc4Blocks
    VREV32 B0.B16, B0.B16
    VREV32 B1.B16, B1.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16

    VMOV B0.B16, K0.B16
    CMP $16, srcPtrLen
    BLT encTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingle(K0)

    VMOV B1.B16, K0.B16
    CMP $16, srcPtrLen
    BLT encTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingle(K0)

    VMOV B2.B16, K0.B16
    CMP $16, srcPtrLen
    BLT encTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingle(K0)

    VMOV B3.B16, K0.B16
    CMP $16, srcPtrLen
    BLT encTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingle(K0)
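
    // encTail assembles the final partial block: the remaining 1..15 source
    // bytes are gathered into T0 (tested bit by bit with TBZ on srcPtrLen)
    // while T3 collects a matching byte mask; the block is then XORed with the
    // keystream in K0, masked, written out and folded into GHASH.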
encTail:
    CBZ srcPtrLen, encDone
    VEOR T0.B16, T0.B16, T0.B16
    VEOR T3.B16, T3.B16, T3.B16
    MOVD $0, H1
    SUB $1, H1
    ADD srcPtrLen, srcPtr

    TBZ $3, srcPtrLen, ld4
    MOVD.W -8(srcPtr), H0
    VMOV H0, T0.D[0]
    VMOV H1, T3.D[0]
ld4:
    TBZ $2, srcPtrLen, ld2
    MOVW.W -4(srcPtr), H0
    VEXT $12, T0.B16, ZERO.B16, T0.B16
    VEXT $12, T3.B16, ZERO.B16, T3.B16
    VMOV H0, T0.S[0]
    VMOV H1, T3.S[0]
ld2:
    TBZ $1, srcPtrLen, ld1
    MOVH.W -2(srcPtr), H0
    VEXT $14, T0.B16, ZERO.B16, T0.B16
    VEXT $14, T3.B16, ZERO.B16, T3.B16
    VMOV H0, T0.H[0]
    VMOV H1, T3.H[0]
ld1:
    TBZ $0, srcPtrLen, ld0
    MOVB.W -1(srcPtr), H0
    VEXT $15, T0.B16, ZERO.B16, T0.B16
    VEXT $15, T3.B16, ZERO.B16, T3.B16
    VMOV H0, T0.B[0]
    VMOV H1, T3.B[0]
ld0:
    MOVD ZR, srcPtrLen
    VEOR T0.B16, K0.B16, K0.B16
    VAND T3.B16, K0.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingle(K0)

encDone:
    VST1 [ACC0.B16], (tPtr)
    RET

// func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4Dec(SB),NOSPLIT,$0
    MOVD productTable+0(FP), pTbl
    MOVD dst+8(FP), dstPtr
    MOVD src_base+32(FP), srcPtr
    MOVD src_len+40(FP), srcPtrLen
    MOVD ctr+56(FP), ctrPtr
    MOVD T+64(FP), tPtr
    MOVD rk_base+72(FP), rk

    MOVD $0xC2, H1
    LSL $56, H1
    MOVD $1, H0
    VMOV H1, POLY.D[0]
    VMOV H0, POLY.D[1]
    VEOR ZERO.B16, ZERO.B16, ZERO.B16

    MOVD pTbl, pTblSave
    MOVD rk, rkSave
    // Current tag, after AAD
    VLD1 (tPtr), [ACC0.B16]
    VEOR ACC1.B16, ACC1.B16, ACC1.B16
    VEOR ACCM.B16, ACCM.B16, ACCM.B16
    // Prepare initial counter, and the increment vector
    VLD1 (ctrPtr), [CTR.B16]
    VEOR INC.B16, INC.B16, INC.B16
    MOVD $1, H0
    VMOV H0, INC.S[3]
    VREV32 CTR.B16, CTR.B16
    VADD CTR.S4, INC.S4, CTR.S4

    // Skip to <8 blocks loop
    CMP $128, srcPtrLen

    LOAD_SM4_AESNI_CONSTS()

    BLT decNibblesLoop
    // There are at least 8 blocks to decrypt

decOctetsLoop:
    SUB $128, srcPtrLen

    VMOV CTR.B16, B0.B16
    VADD B0.S4, INC.S4, B1.S4
    VREV32 B0.B16, B0.B16
    VADD B1.S4, INC.S4, B2.S4
    VREV32 B1.B16, B1.B16
    VADD B2.S4, INC.S4, B3.S4
    VREV32 B2.B16, B2.B16
    VADD B3.S4, INC.S4, B4.S4
    VREV32 B3.B16, B3.B16
    VADD B4.S4, INC.S4, B5.S4
    VREV32 B4.B16, B4.B16
    VADD B5.S4, INC.S4, B6.S4
    VREV32 B5.B16, B5.B16
    VADD B6.S4, INC.S4, B7.S4
    VREV32 B6.B16, B6.B16
    VADD B7.S4, INC.S4, CTR.S4
    VREV32 B7.B16, B7.B16

    // Encrypt the first 4 counter blocks
    EOR R13, R13
    MOVD rkSave, rk

decOctetsEnc4Blocks1:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
    BNE decOctetsEnc4Blocks1
    VREV32 B0.B16, T1.B16
    VREV32 B1.B16, T2.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16

    // Encrypt the second 4 counter blocks
    MOVD rkSave, rk

decOctetsEnc4Blocks2:
    SM4_ROUND(rk, K0, K1, K2, B4, B5, B6, B7)
    SM4_ROUND(rk, K0, K1, K2, B5, B6, B7, B4)
    SM4_ROUND(rk, K0, K1, K2, B6, B7, B4, B5)
    SM4_ROUND(rk, K0, K1, K2, B7, B4, B5, B6)

    ADD $1, R13
    CMP $16, R13
    BNE decOctetsEnc4Blocks2
    VREV32 B4.B16, B4.B16
    VREV32 B5.B16, B5.B16
    VREV32 B6.B16, B6.B16
    VREV32 B7.B16, B7.B16
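
    // Note the GHASH ordering for decryption: the ciphertext blocks loaded from
    // srcPtr are hashed as-is, before the keystream is XORed in, so the tag is
    // always computed over ciphertext, matching the encrypt path.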
    VLD1.P 32(srcPtr), [B0.B16, B1.B16]
    VEOR B0.B16, T1.B16, T1.B16
    VEOR B1.B16, T2.B16, T2.B16
    VST1.P [T1.B16, T2.B16], 32(dstPtr)

    VLD1.P 32(pTbl), [T1.B16, T2.B16]
    VREV64 B0.B16, B0.B16
    VEOR ACC0.B16, B0.B16, B0.B16
    VEXT $8, B0.B16, B0.B16, T0.B16
    VEOR B0.B16, T0.B16, T0.B16
    VPMULL B0.D1, T1.D1, ACC1.Q1
    VPMULL2 B0.D2, T1.D2, ACC0.Q1
    VPMULL T0.D1, T2.D1, ACCM.Q1
    mulRound(B1)

    VLD1.P 32(srcPtr), [B0.B16, B1.B16]
    VEOR B2.B16, B0.B16, T1.B16
    VEOR B3.B16, B1.B16, T2.B16
    VST1.P [T1.B16, T2.B16], 32(dstPtr)
    mulRound(B0)
    mulRound(B1)

    VLD1.P 32(srcPtr), [B0.B16, B1.B16]
    VEOR B4.B16, B0.B16, T1.B16
    VEOR B5.B16, B1.B16, T2.B16
    VST1.P [T1.B16, T2.B16], 32(dstPtr)
    mulRound(B0)
    mulRound(B1)

    VLD1.P 32(srcPtr), [B0.B16, B1.B16]
    VEOR B6.B16, B0.B16, T1.B16
    VEOR B7.B16, B1.B16, T2.B16
    VST1.P [T1.B16, T2.B16], 32(dstPtr)
    mulRound(B0)
    mulRound(B1)

    MOVD pTblSave, pTbl
    reduce()

    CMP $128, srcPtrLen
    BGE decOctetsLoop

decNibblesLoop:
    CBZ srcPtrLen, decDone
    ADD $14*16, pTbl
    // Preload H and its Karatsuba precomp
    VLD1.P (pTbl), [T1.B16, T2.B16]
    CMP $64, srcPtrLen
    BLT decStartSingles
    SUB $64, srcPtrLen

    // Prepare 4 counters
    VMOV CTR.B16, B0.B16
    VADD B0.S4, INC.S4, B1.S4
    VREV32 B0.B16, B0.B16
    VADD B1.S4, INC.S4, B2.S4
    VREV32 B1.B16, B1.B16
    VADD B2.S4, INC.S4, B3.S4
    VREV32 B2.B16, B2.B16
    VADD B3.S4, INC.S4, B4.S4
    VREV32 B3.B16, B3.B16

    // Encrypt the 4 counter blocks
    EOR R13, R13
    MOVD rkSave, rk

decNibblesEnc4Blocks:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
    BNE decNibblesEnc4Blocks
    VREV32 B0.B16, B0.B16
    VREV32 B1.B16, B1.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16

    // XOR ciphertext and store plaintext
    VLD1.P 32(srcPtr), [K1.B16, K2.B16]
    VREV64 K1.B16, B4.B16
    VREV64 K2.B16, B5.B16
    VEOR B0.B16, K1.B16, B0.B16
    VEOR B1.B16, K2.B16, B1.B16
    VST1.P [B0.B16, B1.B16], 32(dstPtr)
    VLD1.P 32(srcPtr), [K1.B16, K2.B16]
    VREV64 K1.B16, B6.B16
    VREV64 K2.B16, B7.B16
    VEOR B2.B16, K1.B16, B2.B16
    VEOR B3.B16, K2.B16, B3.B16
    VST1.P [B2.B16, B3.B16], 32(dstPtr)
    mulRoundSingleWithoutRev(B4)
    mulRoundSingleWithoutRev(B5)
    mulRoundSingleWithoutRev(B6)
    mulRoundSingleWithoutRev(B7)

decStartSingles:
    CBZ srcPtrLen, decDone

    // Prepare 4 counters
    VMOV CTR.B16, B0.B16
    VADD B0.S4, INC.S4, B1.S4
    VREV32 B0.B16, B0.B16
    VADD B1.S4, INC.S4, B2.S4
    VREV32 B1.B16, B1.B16
    VADD B2.S4, INC.S4, B3.S4
    VREV32 B2.B16, B2.B16
    VADD B3.S4, INC.S4, B4.S4
    VREV32 B3.B16, B3.B16

    // Encrypt the 4 counter blocks
    EOR R13, R13
    MOVD rkSave, rk

decSinglesEnc4Blocks:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
    BNE decSinglesEnc4Blocks
    VREV32 B0.B16, B0.B16
    VREV32 B1.B16, B1.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16

    VMOV B0.B16, K0.B16
    CMP $16, srcPtrLen
    BLT decTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VREV64 K1.B16, B5.B16
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingleWithoutRev(B5)

    VMOV B1.B16, K0.B16
    CMP $16, srcPtrLen
    BLT decTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VREV64 K1.B16, B5.B16
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingleWithoutRev(B5)

    VMOV B2.B16, K0.B16
    CMP $16, srcPtrLen
    BLT decTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VREV64 K1.B16, B5.B16
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingleWithoutRev(B5)

    VMOV B3.B16, K0.B16
    CMP $16, srcPtrLen
    BLT decTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VREV64 K1.B16, B5.B16
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingleWithoutRev(B5)

decTail:
    CBZ srcPtrLen, decDone
    // Assuming it is safe to load past the end of src due to the presence of the tag
    VLD1 (srcPtr), [B5.B16]

    VEOR B5.B16, K0.B16, B0.B16

    VEOR T3.B16, T3.B16, T3.B16
    MOVD $0, H1
    SUB $1, H1

    TBZ $3, srcPtrLen, decLd4
    VMOV B0.D[0], H0
    MOVD.P H0, 8(dstPtr)
    VMOV H1, T3.D[0]
    VEXT $8, ZERO.B16, B0.B16, B0.B16

decLd4:
    TBZ $2, srcPtrLen, decLd2
    VMOV B0.S[0], H0
    MOVW.P H0, 4(dstPtr)
    VEXT $12, T3.B16, ZERO.B16, T3.B16
    VMOV H1, T3.S[0]
    VEXT $4, ZERO.B16, B0.B16, B0.B16
decLd2:
    TBZ $1, srcPtrLen, decLd1
    VMOV B0.H[0], H0
    MOVH.P H0, 2(dstPtr)
    VEXT $14, T3.B16, ZERO.B16, T3.B16
    VMOV H1, T3.H[0]
    VEXT $2, ZERO.B16, B0.B16, B0.B16
decLd1:
    TBZ $0, srcPtrLen, decLd0
    VMOV B0.B[0], H0
    MOVB.P H0, 1(dstPtr)
    VEXT $15, T3.B16, ZERO.B16, T3.B16
    VMOV H1, T3.B[0]
decLd0:

    VAND T3.B16, B5.B16, B5.B16
    VREV64 B5.B16, B5.B16

    VEOR ACC0.B16, B5.B16, B5.B16
    VEXT $8, B5.B16, B5.B16, T0.B16
    VEOR B5.B16, T0.B16, T0.B16
    VPMULL B5.D1, T1.D1, ACC1.Q1
    VPMULL2 B5.D2, T1.D2, ACC0.Q1
    VPMULL T0.D1, T2.D1, ACCM.Q1
    reduce()

decDone:
    VST1 [ACC0.B16], (tPtr)
    RET

@@ -1,5 +1,5 @@
-//go:build amd64
-// +build amd64
+//go:build amd64 || arm64
+// +build amd64 arm64

 package sm4

@@ -23,6 +23,12 @@ var _ gcmAble = (*sm4CipherGCM)(nil)

//go:noescape
func gcmSm4Init(productTable *[256]byte, rk []uint32)

//go:noescape
func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)

//go:noescape
func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)

//go:noescape
func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)
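
For orientation only: the sketch below is hypothetical and not part of this commit (the real wiring lives in the package's gcmAble Seal/Open code). It shows one plausible way a Seal-style caller could drive the three primitives, assuming counter holds the CTR block to start from and tagMask is E(K, J0); "encoding/binary" is assumed to be imported.

// Hypothetical sketch in package sm4; error handling, nonce derivation and
// dst sizing are omitted.
func sealSketch(productTable *[256]byte, rk []uint32, counter, tagMask *[16]byte,
	dst, plaintext, aad []byte) []byte {
	var tag [16]byte
	gcmSm4Data(productTable, aad, &tag)                        // GHASH the additional data
	gcmSm4Enc(productTable, dst, plaintext, counter, &tag, rk) // CTR-encrypt and GHASH the ciphertext
	var lenBlock [16]byte                                      // len(AAD) || len(C), in bits
	binary.BigEndian.PutUint64(lenBlock[:8], uint64(len(aad))*8)
	binary.BigEndian.PutUint64(lenBlock[8:], uint64(len(plaintext))*8)
	gcmSm4Data(productTable, lenBlock[:], &tag)                // GHASH the length block
	for i := range tag {
		tag[i] ^= tagMask[i] // final tag = GHASH xor E(K, J0), per the GCM spec
	}
	return append(dst[:len(plaintext)], tag[:]...)
}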