diff --git a/sm4/gcm_arm64.s b/sm4/gcm_arm64.s
index 0d8d9fa..45819e9 100644
--- a/sm4/gcm_arm64.s
+++ b/sm4/gcm_arm64.s
@@ -463,3 +463,648 @@ dataBail:
 #undef autLen
 #undef H0
 #undef pTblSave
+
+// func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
+TEXT ·gcmSm4Enc(SB),NOSPLIT,$0
+#define pTbl R0
+#define dstPtr R1
+#define ctrPtr R2
+#define srcPtr R3
+#define rk R4
+#define tPtr R5
+#define srcPtrLen R6
+#define aluCTR R7
+#define aluTMP R8
+#define H0 R9
+#define H1 R10
+#define pTblSave R11
+#define rkSave R12
+#define mulRoundSingleWithoutRev(X) \
+	VEOR ACC0.B16, X.B16, X.B16   \
+	VEXT $8, X.B16, X.B16, T0.B16 \
+	VEOR X.B16, T0.B16, T0.B16    \
+	VPMULL X.D1, T1.D1, ACC1.Q1   \
+	VPMULL2 X.D2, T1.D2, ACC0.Q1  \
+	VPMULL T0.D1, T2.D1, ACCM.Q1  \
+	reduce()                      \
+
+#define mulRoundSingle(X) \
+	VREV64 X.B16, X.B16 \
+	mulRoundSingleWithoutRev(X) \
+
+	MOVD productTable+0(FP), pTbl
+	MOVD dst+8(FP), dstPtr
+	MOVD src_base+32(FP), srcPtr
+	MOVD src_len+40(FP), srcPtrLen
+	MOVD ctr+56(FP), ctrPtr
+	MOVD T+64(FP), tPtr
+	MOVD rk_base+72(FP), rk
+
+	MOVD $0xC2, H1
+	LSL $56, H1
+	MOVD $1, H0
+	VMOV H1, POLY.D[0]
+	VMOV H0, POLY.D[1]
+	VEOR ZERO.B16, ZERO.B16, ZERO.B16
+
+	MOVD pTbl, pTblSave
+	MOVD rk, rkSave
+	// Current tag, after AAD
+	VLD1 (tPtr), [ACC0.B16]
+	VEOR ACC1.B16, ACC1.B16, ACC1.B16
+	VEOR ACCM.B16, ACCM.B16, ACCM.B16
+	// Prepare initial counter, and the increment vector
+	VLD1 (ctrPtr), [CTR.B16]
+	VEOR INC.B16, INC.B16, INC.B16
+	MOVD $1, H0
+	VMOV H0, INC.S[3]
+	VREV32 CTR.B16, CTR.B16
+	VADD CTR.S4, INC.S4, CTR.S4
+	// Skip to <8 blocks loop
+	CMP $128, srcPtrLen
+
+	LOAD_SM4_AESNI_CONSTS()
+
+	BLT encNibblesLoop
+	// There are at least 8 blocks to encrypt
+
+encOctetsLoop:
+	SUB $128, srcPtrLen
+	// Prepare 8 counters
+	VMOV CTR.B16, B0.B16
+	VADD B0.S4, INC.S4, B1.S4
+	VREV32 B0.B16, B0.B16
+	VADD B1.S4, INC.S4, B2.S4
+	VREV32 B1.B16, B1.B16
+	VADD B2.S4, INC.S4, B3.S4
+	VREV32 B2.B16, B2.B16
+	VADD B3.S4, INC.S4, B4.S4
+	VREV32 B3.B16, B3.B16
+	VADD B4.S4, INC.S4, B5.S4
+	VREV32 B4.B16, B4.B16
+	VADD B5.S4, INC.S4, B6.S4
+	VREV32 B5.B16, B5.B16
+	VADD B6.S4, INC.S4, B7.S4
+	VREV32 B6.B16, B6.B16
+	VADD B7.S4, INC.S4, CTR.S4
+	VREV32 B7.B16, B7.B16
+
+	// encrypt first 4 blocks
+	EOR R13, R13
+	MOVD rkSave, rk
+
+encOctetsEnc4Blocks1:
+	SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
+	SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
+	SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
+	SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
+
+	ADD $1, R13
+	CMP $8, R13
+	BNE encOctetsEnc4Blocks1
+	VREV32 B0.B16, B0.B16
+	VREV32 B1.B16, B1.B16
+	VREV32 B2.B16, B2.B16
+	VREV32 B3.B16, B3.B16
+
+	// encrypt second 4 blocks
+	MOVD rkSave, rk
+
+encOctetsEnc4Blocks2:
+	SM4_ROUND(rk, K0, K1, K2, B4, B5, B6, B7)
+	SM4_ROUND(rk, K0, K1, K2, B5, B6, B7, B4)
+	SM4_ROUND(rk, K0, K1, K2, B6, B7, B4, B5)
+	SM4_ROUND(rk, K0, K1, K2, B7, B4, B5, B6)
+
+	ADD $1, R13
+	CMP $16, R13
+	BNE encOctetsEnc4Blocks2
+	VREV32 B4.B16, B4.B16
+	VREV32 B5.B16, B5.B16
+	VREV32 B6.B16, B6.B16
+	VREV32 B7.B16, B7.B16
+
+	// XOR plaintext and store ciphertext
+	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
+	VEOR B0.B16, T1.B16, B0.B16
+	VEOR B1.B16, T2.B16, B1.B16
+	VST1.P [B0.B16, B1.B16], 32(dstPtr)
+	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
+	VEOR B2.B16, T1.B16, B2.B16
+	VEOR B3.B16, T2.B16, B3.B16
+	VST1.P [B2.B16, B3.B16], 32(dstPtr)
+	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
+	VEOR B4.B16, T1.B16, B4.B16
+	VEOR B5.B16, T2.B16, B5.B16
+	VST1.P [B4.B16, B5.B16], 32(dstPtr)
+	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
+	VEOR B6.B16, T1.B16, B6.B16
+	VEOR B7.B16, T2.B16, B7.B16
+	VST1.P [B6.B16, B7.B16], 32(dstPtr)
+
+	VLD1.P 32(pTbl), [T1.B16, T2.B16]
+	VREV64 B0.B16, B0.B16
+	VEOR ACC0.B16, B0.B16, B0.B16
+	VEXT $8, B0.B16, B0.B16, T0.B16
+	VEOR B0.B16, T0.B16, T0.B16
+	VPMULL B0.D1, T1.D1, ACC1.Q1
+	VPMULL2 B0.D2, T1.D2, ACC0.Q1
+	VPMULL T0.D1, T2.D1, ACCM.Q1
+
+	mulRound(B1)
+	mulRound(B2)
+	mulRound(B3)
+	mulRound(B4)
+	mulRound(B5)
+	mulRound(B6)
+	mulRound(B7)
+	MOVD pTblSave, pTbl
+	reduce()
+
+	CMP $128, srcPtrLen
+	BGE encOctetsLoop
+
+encNibblesLoop:
+	CBZ srcPtrLen, encDone
+	ADD $14*16, pTbl
+	// Preload H and its Karatsuba precomp
+	VLD1.P (pTbl), [T1.B16, T2.B16]
+
+	CMP $64, srcPtrLen
+	BLT encStartSingles
+	SUB $64, srcPtrLen
+
+	// Prepare 4 counters
+	VMOV CTR.B16, B0.B16
+	VADD B0.S4, INC.S4, B1.S4
+	VREV32 B0.B16, B0.B16
+	VADD B1.S4, INC.S4, B2.S4
+	VREV32 B1.B16, B1.B16
+	VADD B2.S4, INC.S4, B3.S4
+	VREV32 B2.B16, B2.B16
+	VADD B3.S4, INC.S4, B4.S4
+	VREV32 B3.B16, B3.B16
+
+	// encrypt 4 blocks
+	EOR R13, R13
+	MOVD rkSave, rk
+
+encNibblesEnc4Blocks:
+	SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
+	SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
+	SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
+	SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
+
+	ADD $1, R13
+	CMP $8, R13
+	BNE encNibblesEnc4Blocks
+	VREV32 B0.B16, B0.B16
+	VREV32 B1.B16, B1.B16
+	VREV32 B2.B16, B2.B16
+	VREV32 B3.B16, B3.B16
+
+	// XOR plaintext and store ciphertext
+	VLD1.P 32(srcPtr), [K1.B16, K2.B16]
+	VEOR B0.B16, K1.B16, B0.B16
+	VEOR B1.B16, K2.B16, B1.B16
+	VST1.P [B0.B16, B1.B16], 32(dstPtr)
+	VLD1.P 32(srcPtr), [K1.B16, K2.B16]
+	VEOR B2.B16, K1.B16, B2.B16
+	VEOR B3.B16, K2.B16, B3.B16
+	VST1.P [B2.B16, B3.B16], 32(dstPtr)
+
+	mulRoundSingle(B0)
+	mulRoundSingle(B1)
+	mulRoundSingle(B2)
+	mulRoundSingle(B3)
+
+encStartSingles:
+	CBZ srcPtrLen, encDone
+
+	// Prepare 4 counters
+	VMOV CTR.B16, B0.B16
+	VADD B0.S4, INC.S4, B1.S4
+	VREV32 B0.B16, B0.B16
+	VADD B1.S4, INC.S4, B2.S4
+	VREV32 B1.B16, B1.B16
+	VADD B2.S4, INC.S4, B3.S4
+	VREV32 B2.B16, B2.B16
+	VADD B3.S4, INC.S4, B4.S4
+	VREV32 B3.B16, B3.B16
+
+	// encrypt 4 blocks
+	EOR R13, R13
+	MOVD rkSave, rk
+
+encSinglesEnc4Blocks:
+	SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
+	SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
+	SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
+	SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
+
+	ADD $1, R13
+	CMP $8, R13
+	BNE encSinglesEnc4Blocks
+	VREV32 B0.B16, B0.B16
+	VREV32 B1.B16, B1.B16
+	VREV32 B2.B16, B2.B16
+	VREV32 B3.B16, B3.B16
+
+	VMOV B0.B16, K0.B16
+	CMP $16, srcPtrLen
+	BLT encTail
+	SUB $16, srcPtrLen
+	VLD1.P 16(srcPtr), [K1.B16]
+	VEOR K0.B16, K1.B16, K0.B16
+	VST1.P [K0.B16], 16(dstPtr)
+	mulRoundSingle(K0)
+
+	VMOV B1.B16, K0.B16
+	CMP $16, srcPtrLen
+	BLT encTail
+	SUB $16, srcPtrLen
+	VLD1.P 16(srcPtr), [K1.B16]
+	VEOR K0.B16, K1.B16, K0.B16
+	VST1.P [K0.B16], 16(dstPtr)
+	mulRoundSingle(K0)
+
+	VMOV B2.B16, K0.B16
+	CMP $16, srcPtrLen
+	BLT encTail
+	SUB $16, srcPtrLen
+	VLD1.P 16(srcPtr), [K1.B16]
+	VEOR K0.B16, K1.B16, K0.B16
+	VST1.P [K0.B16], 16(dstPtr)
+	mulRoundSingle(K0)
+
+	VMOV B3.B16, K0.B16
+	CMP $16, srcPtrLen
+	BLT encTail
+	SUB $16, srcPtrLen
+	VLD1.P 16(srcPtr), [K1.B16]
+	VEOR K0.B16, K1.B16, K0.B16
+	VST1.P [K0.B16], 16(dstPtr)
+	mulRoundSingle(K0)
+
+encTail:
+	CBZ srcPtrLen, encDone
+	VEOR T0.B16, T0.B16, T0.B16
+	VEOR T3.B16, T3.B16, T3.B16
+	MOVD $0, H1
+	SUB $1, H1
+	ADD srcPtrLen, srcPtr
+
+	TBZ $3, srcPtrLen, ld4
+	MOVD.W -8(srcPtr), H0
+	VMOV H0, T0.D[0]
+	VMOV H1, T3.D[0]
+ld4:
+	TBZ $2, srcPtrLen, ld2
+	MOVW.W -4(srcPtr), H0
+	VEXT $12, T0.B16, ZERO.B16, T0.B16
+	VEXT $12, T3.B16, ZERO.B16, T3.B16
+	VMOV H0, T0.S[0]
+	VMOV H1, T3.S[0]
+ld2:
+	TBZ $1, srcPtrLen, ld1
+	MOVH.W -2(srcPtr), H0
+	VEXT $14, T0.B16, ZERO.B16, T0.B16
+	VEXT $14, T3.B16, ZERO.B16, T3.B16
+	VMOV H0, T0.H[0]
+	VMOV H1, T3.H[0]
+ld1:
+	TBZ $0, srcPtrLen, ld0
+	MOVB.W -1(srcPtr), H0
+	VEXT $15, T0.B16, ZERO.B16, T0.B16
+	VEXT $15, T3.B16, ZERO.B16, T3.B16
+	VMOV H0, T0.B[0]
+	VMOV H1, T3.B[0]
+ld0:
+	MOVD ZR, srcPtrLen
+	VEOR T0.B16, K0.B16, K0.B16
+	VAND T3.B16, K0.B16, K0.B16
+	VST1.P [K0.B16], 16(dstPtr)
+	mulRoundSingle(K0)
+
+encDone:
+	VST1 [ACC0.B16], (tPtr)
+	RET
+
+// func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
+TEXT ·gcmSm4Dec(SB),NOSPLIT,$0
+	MOVD productTable+0(FP), pTbl
+	MOVD dst+8(FP), dstPtr
+	MOVD src_base+32(FP), srcPtr
+	MOVD src_len+40(FP), srcPtrLen
+	MOVD ctr+56(FP), ctrPtr
+	MOVD T+64(FP), tPtr
+	MOVD rk_base+72(FP), rk
+
+	MOVD $0xC2, H1
+	LSL $56, H1
+	MOVD $1, H0
+	VMOV H1, POLY.D[0]
+	VMOV H0, POLY.D[1]
+	VEOR ZERO.B16, ZERO.B16, ZERO.B16
+
+	MOVD pTbl, pTblSave
+	MOVD rk, rkSave
+	// Current tag, after AAD
+	VLD1 (tPtr), [ACC0.B16]
+	VEOR ACC1.B16, ACC1.B16, ACC1.B16
+	VEOR ACCM.B16, ACCM.B16, ACCM.B16
+	// Prepare initial counter, and the increment vector
+	VLD1 (ctrPtr), [CTR.B16]
+	VEOR INC.B16, INC.B16, INC.B16
+	MOVD $1, H0
+	VMOV H0, INC.S[3]
+	VREV32 CTR.B16, CTR.B16
+	VADD CTR.S4, INC.S4, CTR.S4
+
+	// Skip to <8 blocks loop
+	CMP $128, srcPtrLen
+
+	LOAD_SM4_AESNI_CONSTS()
+
+	BLT decNibblesLoop
+	// There are at least 8 blocks to decrypt
+
+decOctetsLoop:
+	SUB $128, srcPtrLen
+
+	VMOV CTR.B16, B0.B16
+	VADD B0.S4, INC.S4, B1.S4
+	VREV32 B0.B16, B0.B16
+	VADD B1.S4, INC.S4, B2.S4
+	VREV32 B1.B16, B1.B16
+	VADD B2.S4, INC.S4, B3.S4
+	VREV32 B2.B16, B2.B16
+	VADD B3.S4, INC.S4, B4.S4
+	VREV32 B3.B16, B3.B16
+	VADD B4.S4, INC.S4, B5.S4
+	VREV32 B4.B16, B4.B16
+	VADD B5.S4, INC.S4, B6.S4
+	VREV32 B5.B16, B5.B16
+	VADD B6.S4, INC.S4, B7.S4
+	VREV32 B6.B16, B6.B16
+	VADD B7.S4, INC.S4, CTR.S4
+	VREV32 B7.B16, B7.B16
+
+	// encrypt first 4 counter blocks
+	EOR R13, R13
+	MOVD rkSave, rk
+
+decOctetsEnc4Blocks1:
+	SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
+	SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
+	SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
+	SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
+
+	ADD $1, R13
+	CMP $8, R13
+	BNE decOctetsEnc4Blocks1
+	VREV32 B0.B16, T1.B16
+	VREV32 B1.B16, T2.B16
+	VREV32 B2.B16, B2.B16
+	VREV32 B3.B16, B3.B16
+
+	// encrypt second 4 counter blocks
+	MOVD rkSave, rk
+
+decOctetsEnc4Blocks2:
+	SM4_ROUND(rk, K0, K1, K2, B4, B5, B6, B7)
+	SM4_ROUND(rk, K0, K1, K2, B5, B6, B7, B4)
+	SM4_ROUND(rk, K0, K1, K2, B6, B7, B4, B5)
+	SM4_ROUND(rk, K0, K1, K2, B7, B4, B5, B6)
+
+	ADD $1, R13
+	CMP $16, R13
+	BNE decOctetsEnc4Blocks2
+	VREV32 B4.B16, B4.B16
+	VREV32 B5.B16, B5.B16
+	VREV32 B6.B16, B6.B16
+	VREV32 B7.B16, B7.B16
+
+	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
+	VEOR B0.B16, T1.B16, T1.B16
+	VEOR B1.B16, T2.B16, T2.B16
+	VST1.P [T1.B16, T2.B16], 32(dstPtr)
+
+	VLD1.P 32(pTbl), [T1.B16, T2.B16]
+	VREV64 B0.B16, B0.B16
+	VEOR ACC0.B16, B0.B16, B0.B16
+	VEXT $8, B0.B16, B0.B16, T0.B16
+	VEOR B0.B16, T0.B16, T0.B16
+	VPMULL B0.D1, T1.D1, ACC1.Q1
+	VPMULL2 B0.D2, T1.D2, ACC0.Q1
+	VPMULL T0.D1, T2.D1, ACCM.Q1
+	mulRound(B1)
+
+	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
+	VEOR B2.B16, B0.B16, T1.B16
+	VEOR B3.B16, B1.B16, T2.B16
+	VST1.P [T1.B16, T2.B16], 32(dstPtr)
+	mulRound(B0)
+	mulRound(B1)
+
+	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
+	VEOR B4.B16, B0.B16, T1.B16
+	VEOR B5.B16, B1.B16, T2.B16
+	VST1.P [T1.B16, T2.B16], 32(dstPtr)
+	mulRound(B0)
+	mulRound(B1)
+
+	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
+	VEOR B6.B16, B0.B16, T1.B16
+	VEOR B7.B16, B1.B16, T2.B16
+	VST1.P [T1.B16, T2.B16], 32(dstPtr)
+	mulRound(B0)
+	mulRound(B1)
+
+	MOVD pTblSave, pTbl
+	reduce()
+
+	CMP $128, srcPtrLen
+	BGE decOctetsLoop
+
+decNibblesLoop:
+	CBZ srcPtrLen, decDone
+	ADD $14*16, pTbl
+	// Preload H and its Karatsuba precomp
+	VLD1.P (pTbl), [T1.B16, T2.B16]
+	CMP $64, srcPtrLen
+	BLT decStartSingles
+	SUB $64, srcPtrLen
+
+	// Prepare 4 counters
+	VMOV CTR.B16, B0.B16
+	VADD B0.S4, INC.S4, B1.S4
+	VREV32 B0.B16, B0.B16
+	VADD B1.S4, INC.S4, B2.S4
+	VREV32 B1.B16, B1.B16
+	VADD B2.S4, INC.S4, B3.S4
+	VREV32 B2.B16, B2.B16
+	VADD B3.S4, INC.S4, B4.S4
+	VREV32 B3.B16, B3.B16
+
+	// encrypt 4 counter blocks
+	EOR R13, R13
+	MOVD rkSave, rk
+
+decNibblesEnc4Blocks:
+	SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
+	SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
+	SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
+	SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
+
+	ADD $1, R13
+	CMP $8, R13
+	BNE decNibblesEnc4Blocks
+	VREV32 B0.B16, B0.B16
+	VREV32 B1.B16, B1.B16
+	VREV32 B2.B16, B2.B16
+	VREV32 B3.B16, B3.B16
+
+	// XOR ciphertext and store plaintext
+	VLD1.P 32(srcPtr), [K1.B16, K2.B16]
+	VREV64 K1.B16, B4.B16
+	VREV64 K2.B16, B5.B16
+	VEOR B0.B16, K1.B16, B0.B16
+	VEOR B1.B16, K2.B16, B1.B16
+	VST1.P [B0.B16, B1.B16], 32(dstPtr)
+	VLD1.P 32(srcPtr), [K1.B16, K2.B16]
+	VREV64 K1.B16, B6.B16
+	VREV64 K2.B16, B7.B16
+	VEOR B2.B16, K1.B16, B2.B16
+	VEOR B3.B16, K2.B16, B3.B16
+	VST1.P [B2.B16, B3.B16], 32(dstPtr)
+	mulRoundSingleWithoutRev(B4)
+	mulRoundSingleWithoutRev(B5)
+	mulRoundSingleWithoutRev(B6)
+	mulRoundSingleWithoutRev(B7)
+
+decStartSingles:
+	CBZ srcPtrLen, decDone
+
+	// Prepare 4 counters
+	VMOV CTR.B16, B0.B16
+	VADD B0.S4, INC.S4, B1.S4
+	VREV32 B0.B16, B0.B16
+	VADD B1.S4, INC.S4, B2.S4
+	VREV32 B1.B16, B1.B16
+	VADD B2.S4, INC.S4, B3.S4
+	VREV32 B2.B16, B2.B16
+	VADD B3.S4, INC.S4, B4.S4
+	VREV32 B3.B16, B3.B16
+
+	// encrypt 4 counter blocks
+	EOR R13, R13
+	MOVD rkSave, rk
+
+decSinglesEnc4Blocks:
+	SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
+	SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
+	SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
+	SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
+
+	ADD $1, R13
+	CMP $8, R13
+	BNE decSinglesEnc4Blocks
+	VREV32 B0.B16, B0.B16
+	VREV32 B1.B16, B1.B16
+	VREV32 B2.B16, B2.B16
+	VREV32 B3.B16, B3.B16
+
+	VMOV B0.B16, K0.B16
+	CMP $16, srcPtrLen
+	BLT decTail
+	SUB $16, srcPtrLen
+	VLD1.P 16(srcPtr), [K1.B16]
+	VREV64 K1.B16, B5.B16
+	VEOR K0.B16, K1.B16, K0.B16
+	VST1.P [K0.B16], 16(dstPtr)
+	mulRoundSingleWithoutRev(B5)
+
+	VMOV B1.B16, K0.B16
+	CMP $16, srcPtrLen
+	BLT decTail
+	SUB $16, srcPtrLen
+	VLD1.P 16(srcPtr), [K1.B16]
+	VREV64 K1.B16, B5.B16
+	VEOR K0.B16, K1.B16, K0.B16
+	VST1.P [K0.B16], 16(dstPtr)
+	mulRoundSingleWithoutRev(B5)
+
+	VMOV B2.B16, K0.B16
+	CMP $16, srcPtrLen
+	BLT decTail
+	SUB $16, srcPtrLen
+	VLD1.P 16(srcPtr), [K1.B16]
+	VREV64 K1.B16, B5.B16
+	VEOR K0.B16, K1.B16, K0.B16
+	VST1.P [K0.B16], 16(dstPtr)
+	mulRoundSingleWithoutRev(B5)
+
+	VMOV B3.B16, K0.B16
+	CMP $16, srcPtrLen
+	BLT decTail
+	SUB $16, srcPtrLen
+	VLD1.P 16(srcPtr), [K1.B16]
+	VREV64 K1.B16, B5.B16
+	VEOR K0.B16, K1.B16, K0.B16
+	VST1.P [K0.B16], 16(dstPtr)
+	mulRoundSingleWithoutRev(B5)
+
+decTail:
+	CBZ srcPtrLen, decDone
+	// Assuming it is safe to load past srcPtr due to the presence of the tag
+	VLD1 (srcPtr), [B5.B16]
+
+	VEOR B5.B16, K0.B16, B0.B16
+
+	VEOR T3.B16, T3.B16, T3.B16
+	MOVD $0, H1
+	SUB $1, H1
+
+	TBZ $3, srcPtrLen, decLd4
+	VMOV B0.D[0], H0
+	MOVD.P H0, 8(dstPtr)
+	VMOV H1, T3.D[0]
+	VEXT $8, ZERO.B16, B0.B16, B0.B16
+
+decLd4:
+	TBZ $2, srcPtrLen, decLd2
+	VMOV B0.S[0], H0
+	MOVW.P H0, 4(dstPtr)
+	VEXT $12, T3.B16, ZERO.B16, T3.B16
+	VMOV H1, T3.S[0]
+	VEXT $4, ZERO.B16, B0.B16, B0.B16
+decLd2:
+	TBZ $1, srcPtrLen, decLd1
+	VMOV B0.H[0], H0
+	MOVH.P H0, 2(dstPtr)
+	VEXT $14, T3.B16, ZERO.B16, T3.B16
+	VMOV H1, T3.H[0]
+	VEXT $2, ZERO.B16, B0.B16, B0.B16
+decLd1:
+	TBZ $0, srcPtrLen, decLd0
+	VMOV B0.B[0], H0
+	MOVB.P H0, 1(dstPtr)
+	VEXT $15, T3.B16, ZERO.B16, T3.B16
+	VMOV H1, T3.B[0]
+decLd0:
+
+	VAND T3.B16, B5.B16, B5.B16
+	VREV64 B5.B16, B5.B16
+
+	VEOR ACC0.B16, B5.B16, B5.B16
+	VEXT $8, B5.B16, B5.B16, T0.B16
+	VEOR B5.B16, T0.B16, T0.B16
+	VPMULL B5.D1, T1.D1, ACC1.Q1
+	VPMULL2 B5.D2, T1.D2, ACC0.Q1
+	VPMULL T0.D1, T2.D1, ACCM.Q1
+	reduce()
+
+decDone:
+	VST1 [ACC0.B16], (tPtr)
+	RET
diff --git a/sm4/gcm_amd64_test.go b/sm4/gcm_asm_test.go
similarity index 96%
rename from sm4/gcm_amd64_test.go
rename to sm4/gcm_asm_test.go
index a1d9f0b..6506e65 100644
--- a/sm4/gcm_amd64_test.go
+++ b/sm4/gcm_asm_test.go
@@ -1,5 +1,5 @@
-//go:build amd64
-// +build amd64
+//go:build amd64 || arm64
+// +build amd64 arm64
 
 package sm4
diff --git a/sm4/sm4_gcm_arm64.go b/sm4/sm4_gcm_arm64.go
index 9949176..2573bc9 100644
--- a/sm4/sm4_gcm_arm64.go
+++ b/sm4/sm4_gcm_arm64.go
@@ -23,6 +23,12 @@ var _ gcmAble = (*sm4CipherGCM)(nil)
 //go:noescape
 func gcmSm4Init(productTable *[256]byte, rk []uint32)
 
+//go:noescape
+func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
+
+//go:noescape
+func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
+
 //go:noescape
 func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)
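
The new arm64 gcmSm4Enc and gcmSm4Dec mirror the existing amd64 entry points: both CTR-encrypt with SM4 in 8-, 4- and single-block steps and fold the resulting ciphertext into the GHASH accumulator passed via T. Two details of the tail handling matter to callers: the encrypt tail stores a full 16-byte vector through dstPtr even when fewer bytes remain, and the decrypt tail loads a full 16-byte vector from srcPtr, so gcmSm4Enc needs dst, and gcmSm4Dec needs src, to extend at least one block past len(src); in the GCM Seal/Open paths the 16-byte tag slot supplies that slack. Neither routine produces the final tag by itself - T only carries the running GHASH state, and hashing the length block and applying the tag mask happen elsewhere. The sketch below is a hypothetical round-trip check built only from the functions declared in this change; the function name and the zero-AAD setup are assumptions for illustration, not repository code.

// Hypothetical sketch (not part of this change): exercises the calling
// convention of gcmSm4Enc/gcmSm4Dec inside package sm4, with no AAD.
func gcmSm4RoundTripSketch(rk []uint32, counterBlock [16]byte, plaintext []byte) bool {
	var productTable [256]byte
	gcmSm4Init(&productTable, rk) // build the GHASH product table from the round keys

	var ghashEnc, ghashDec [16]byte // running GHASH state, not the final tag

	// dst needs one block of slack past len(plaintext): the encrypt tail
	// stores a full 16-byte vector for a partial final block. In Seal, the
	// tag slot provides this.
	buf := make([]byte, len(plaintext)+16)
	ctr := counterBlock
	gcmSm4Enc(&productTable, buf, plaintext, &ctr, &ghashEnc, rk)
	ciphertext := buf[:len(plaintext)]

	// src needs one block of slack past len(ciphertext): the decrypt tail
	// loads a full 16-byte vector. Decrypting straight out of buf keeps the
	// over-read inside allocated memory, mirroring "tag follows ciphertext".
	recovered := make([]byte, len(ciphertext))
	ctr = counterBlock
	gcmSm4Dec(&productTable, recovered, ciphertext, &ctr, &ghashDec, rk)

	// The real GCM tag would still require the length block and the tag mask;
	// comparing the two GHASH states is enough for a consistency check here.
	return string(recovered) == string(plaintext) && ghashEnc == ghashDec
}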