//go:build arm64 && !purego
// +build arm64,!purego

#include "textflag.h"

#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7

#define ACC0 V8
#define ACC1 V9
#define ACCM V10

#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14

#define POLY V15
#define ZERO V16
#define INC V17
#define CTR V18

#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22
#define K4 V23
#define K5 V24
#define K6 V25
#define K7 V26

#define reduce() \
	VEOR	ACC0.B16, ACCM.B16, ACCM.B16     \
	VEOR	ACC1.B16, ACCM.B16, ACCM.B16     \
	VEXT	$8, ZERO.B16, ACCM.B16, T0.B16   \
	VEXT	$8, ACCM.B16, ZERO.B16, ACCM.B16 \
	VEOR	ACCM.B16, ACC0.B16, ACC0.B16     \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEXT	$8, ACC0.B16, ACC0.B16, ACC0.B16 \
	VEOR	T0.B16, ACC0.B16, ACC0.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VEXT	$8, ACC1.B16, ACC1.B16, ACC1.B16 \
	VEOR	ACC1.B16, ACC0.B16, ACC0.B16

#define mulRound(X) \
	VLD1.P	32(pTbl), [T1.B16, T2.B16] \
	VREV64	X.B16, X.B16               \
	VEXT	$8, X.B16, X.B16, T0.B16   \
	VEOR	X.B16, T0.B16, T0.B16      \
	VPMULL	X.D1, T1.D1, T3.Q1         \
	VEOR	T3.B16, ACC1.B16, ACC1.B16 \
	VPMULL2	X.D2, T1.D2, T3.Q1         \
	VEOR	T3.B16, ACC0.B16, ACC0.B16 \
	VPMULL	T0.D1, T2.D1, T3.Q1        \
	VEOR	T3.B16, ACCM.B16, ACCM.B16

#include "sm4ni_macros_arm64.s"

// func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define rk R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define H0 R9
#define H1 R10
#define pTblSave R11
#define rkSave R12
	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	rk_base+72(FP), rk

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4
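	// The counter block is kept with each 32-bit lane byte-swapped (VREV32), so
	// the big-endian GCM counter word in lane S[3] can be bumped with a plain
	// VADD against INC, which holds 1 in S[3]. A rough Go-equivalent sketch of
	// one counter increment (illustrative only, not part of this file; assumes
	// encoding/binary):
	//
	//	func incCounter(counterBlock *[16]byte) {
	//		ctr := binary.BigEndian.Uint32(counterBlock[12:16])
	//		binary.BigEndian.PutUint32(counterBlock[12:16], ctr+1)
	//	}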
	// Skip to <8 blocks loop
	CMP	$128, srcPtrLen

	MOVD	rk, H0
	// For SM4, the round keys are stored in K0 .. K7
	VLD1.P	64(H0), [K0.S4, K1.S4, K2.S4, K3.S4]
	VLD1.P	64(H0), [K4.S4, K5.S4, K6.S4, K7.S4]

	BLT	startSingles
octetsLoop:
	SUB	$128, srcPtrLen
	// Prepare 8 counters
	VMOV	CTR.B16, B0.B16
	VADD	B0.S4, INC.S4, B1.S4
	VADD	B1.S4, INC.S4, B2.S4
	VADD	B2.S4, INC.S4, B3.S4
	VADD	B3.S4, INC.S4, B4.S4
	VADD	B4.S4, INC.S4, B5.S4
	VADD	B5.S4, INC.S4, B6.S4
	VADD	B6.S4, INC.S4, B7.S4
	VADD	B7.S4, INC.S4, CTR.S4

	sm4eEnc8blocks()

	// XOR plaintext and store ciphertext
	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B0.B16, T1.B16, B0.B16
	VEOR	B1.B16, T2.B16, B1.B16
	VST1.P	[B0.B16, B1.B16], 32(dstPtr)
	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B2.B16, T1.B16, B2.B16
	VEOR	B3.B16, T2.B16, B3.B16
	VST1.P	[B2.B16, B3.B16], 32(dstPtr)
	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B4.B16, T1.B16, B4.B16
	VEOR	B5.B16, T2.B16, B5.B16
	VST1.P	[B4.B16, B5.B16], 32(dstPtr)
	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B6.B16, T1.B16, B6.B16
	VEOR	B7.B16, T2.B16, B7.B16
	VST1.P	[B6.B16, B7.B16], 32(dstPtr)

	VLD1.P	32(pTbl), [T1.B16, T2.B16]
	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16
	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	mulRound(B1)
	mulRound(B2)
	mulRound(B3)
	mulRound(B4)
	mulRound(B5)
	mulRound(B6)
	mulRound(B7)
	MOVD	pTblSave, pTbl
	reduce()

	CMP	$128, srcPtrLen
	BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]

singlesLoop:
	CMP	$16, srcPtrLen
	BLT	tail
	SUB	$16, srcPtrLen

	VMOV	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4
	sm4eEnc1block()

singlesLast:
	VLD1.P	16(srcPtr), [T0.B16]
	VEOR	T0.B16, B0.B16, B0.B16

encReduce:
	VST1.P	[B0.B16], 16(dstPtr)

	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16

	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	reduce()

	B	singlesLoop

tail:
	CBZ	srcPtrLen, done

	VEOR	T0.B16, T0.B16, T0.B16
	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1
	ADD	srcPtrLen, srcPtr

	TBZ	$3, srcPtrLen, ld4
	MOVD.W	-8(srcPtr), H0
	VMOV	H0, T0.D[0]
	VMOV	H1, T3.D[0]

ld4:
	TBZ	$2, srcPtrLen, ld2
	MOVW.W	-4(srcPtr), H0
	VEXT	$12, T0.B16, ZERO.B16, T0.B16
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.S[0]
	VMOV	H1, T3.S[0]
ld2:
	TBZ	$1, srcPtrLen, ld1
	MOVH.W	-2(srcPtr), H0
	VEXT	$14, T0.B16, ZERO.B16, T0.B16
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.H[0]
	VMOV	H1, T3.H[0]
ld1:
	TBZ	$0, srcPtrLen, ld0
	MOVB.W	-1(srcPtr), H0
	VEXT	$15, T0.B16, ZERO.B16, T0.B16
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.B[0]
	VMOV	H1, T3.B[0]
ld0:
	MOVD	ZR, srcPtrLen
	VMOV	CTR.B16, B0.B16
	sm4eEnc1block()

tailLast:
	VEOR	T0.B16, B0.B16, B0.B16
	VAND	T3.B16, B0.B16, B0.B16
	B	encReduce

done:
	VST1	[ACC0.B16], (tPtr)
	RET

// func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niDec(SB),NOSPLIT,$0
	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	rk_base+72(FP), rk

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	MOVD	pTbl, pTblSave
	MOVD	rk, rkSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4
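	// From here on, decryption mirrors gcmSm4niEnc: the keystream is produced by
	// the same sm4eEnc8blocks()/sm4eEnc1block() macros. The difference is the
	// GHASH input: the tag is computed over the ciphertext, so the loops below
	// feed the blocks loaded from srcPtr into mulRound()/reduce() and XOR them
	// with the keystream to recover the plaintext written to dstPtr.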
	// Skip to <8 blocks loop
	CMP	$128, srcPtrLen

	MOVD	rk, H0
	// For SM4, the round keys are stored in K0 .. K7
	VLD1.P	64(H0), [K0.S4, K1.S4, K2.S4, K3.S4]
	VLD1.P	64(H0), [K4.S4, K5.S4, K6.S4, K7.S4]

	BLT	startSingles
octetsLoop:
	SUB	$128, srcPtrLen

	VMOV	CTR.B16, B0.B16
	VADD	B0.S4, INC.S4, B1.S4
	VADD	B1.S4, INC.S4, B2.S4
	VADD	B2.S4, INC.S4, B3.S4
	VADD	B3.S4, INC.S4, B4.S4
	VADD	B4.S4, INC.S4, B5.S4
	VADD	B5.S4, INC.S4, B6.S4
	VADD	B6.S4, INC.S4, B7.S4
	VADD	B7.S4, INC.S4, CTR.S4

	sm4eEnc8blocks()

	VMOV	B0.B16, T1.B16
	VMOV	B1.B16, T2.B16

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B0.B16, T1.B16, T1.B16
	VEOR	B1.B16, T2.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)

	VLD1.P	32(pTbl), [T1.B16, T2.B16]
	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16
	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	mulRound(B1)

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B2.B16, B0.B16, T1.B16
	VEOR	B3.B16, B1.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B4.B16, B0.B16, T1.B16
	VEOR	B5.B16, B1.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B6.B16, B0.B16, T1.B16
	VEOR	B7.B16, B1.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	MOVD	pTblSave, pTbl
	reduce()

	CMP	$128, srcPtrLen
	BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]

singlesLoop:
	CMP	$16, srcPtrLen
	BLT	tail
	SUB	$16, srcPtrLen

	VLD1.P	16(srcPtr), [T0.B16]
	VREV64	T0.B16, B5.B16

	VMOV	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4
	sm4eEnc1block()

singlesLast:
	VEOR	T0.B16, B0.B16, B0.B16
	VST1.P	[B0.B16], 16(dstPtr)

	VEOR	ACC0.B16, B5.B16, B5.B16
	VEXT	$8, B5.B16, B5.B16, T0.B16
	VEOR	B5.B16, T0.B16, T0.B16
	VPMULL	B5.D1, T1.D1, ACC1.Q1
	VPMULL2	B5.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	reduce()

	B	singlesLoop

tail:
	CBZ	srcPtrLen, done

	VMOV	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4
	sm4eEnc1block()

tailLast:
	// Assuming it is safe to load past srcPtr due to the presence of the tag
	// B5 holds the last ciphertext block
	VLD1	(srcPtr), [B5.B16]
	VEOR	B5.B16, B0.B16, B0.B16

	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1

	TBZ	$3, srcPtrLen, ld4 // Test if srcPtrLen < 8; if yes, goto ld4
	VMOV	B0.D[0], H0
	MOVD.P	H0, 8(dstPtr)
	VMOV	H1, T3.D[0]
	VEXT	$8, ZERO.B16, B0.B16, B0.B16

ld4:
	TBZ	$2, srcPtrLen, ld2 // Test if srcPtrLen < 4; if yes, goto ld2
	VMOV	B0.S[0], H0
	MOVW.P	H0, 4(dstPtr)
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.S[0]
	VEXT	$4, ZERO.B16, B0.B16, B0.B16
ld2:
	TBZ	$1, srcPtrLen, ld1 // Test if srcPtrLen < 2; if yes, goto ld1
	VMOV	B0.H[0], H0
	MOVH.P	H0, 2(dstPtr)
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.H[0]
	VEXT	$2, ZERO.B16, B0.B16, B0.B16
ld1:
	TBZ	$0, srcPtrLen, ld0 // Test if srcPtrLen < 1; if yes, goto ld0
	VMOV	B0.B[0], H0
	MOVB.P	H0, 1(dstPtr)
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.B[0]
ld0:
	VAND	T3.B16, B5.B16, B5.B16
	VREV64	B5.B16, B5.B16
	VEOR	ACC0.B16, B5.B16, B5.B16

	VEXT	$8, B5.B16, B5.B16, T0.B16
	VEOR	B5.B16, T0.B16, T0.B16
	VPMULL	B5.D1, T1.D1, ACC1.Q1
	VPMULL2	B5.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	reduce()

done:
	VST1	[ACC0.B16], (tPtr)
	RET
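
// Go-side declarations matching the two routines above (a sketch; the enclosing
// Go file and the //go:noescape directives are assumptions, the signatures are
// taken from the comments above):
//
//	//go:noescape
//	func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
//
//	//go:noescape
//	func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)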