//go:build arm64 && !purego
// +build arm64,!purego

#include "textflag.h"

#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7

#define ACC0 V8
#define ACC1 V9
#define ACCM V10

#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14

#define POLY V15
#define ZERO V16
#define INC V17
#define CTR V18

#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22
#define K4 V23
#define K5 V24
#define K6 V25
#define K7 V26

#define reduce() \
	VEOR	ACC0.B16, ACCM.B16, ACCM.B16     \
	VEOR	ACC1.B16, ACCM.B16, ACCM.B16     \
	VEXT	$8, ZERO.B16, ACCM.B16, T0.B16   \
	VEXT	$8, ACCM.B16, ZERO.B16, ACCM.B16 \
	VEOR	ACCM.B16, ACC0.B16, ACC0.B16     \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEXT	$8, ACC0.B16, ACC0.B16, ACC0.B16 \
	VEOR	T0.B16, ACC0.B16, ACC0.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VEXT	$8, ACC1.B16, ACC1.B16, ACC1.B16 \
	VEOR	ACC1.B16, ACC0.B16, ACC0.B16

#define mulRound(X) \
	VLD1.P	32(pTbl), [T1.B16, T2.B16] \
	VREV64	X.B16, X.B16               \
	VEXT	$8, X.B16, X.B16, T0.B16   \
	VEOR	X.B16, T0.B16, T0.B16      \
	VPMULL	X.D1, T1.D1, T3.Q1         \
	VEOR	T3.B16, ACC1.B16, ACC1.B16 \
	VPMULL2	X.D2, T1.D2, T3.Q1         \
	VEOR	T3.B16, ACC0.B16, ACC0.B16 \
	VPMULL	T0.D1, T2.D1, T3.Q1        \
	VEOR	T3.B16, ACCM.B16, ACCM.B16

#include "sm4ni_macros_arm64.s"

// func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define rk R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define H0 R9
#define H1 R10
#define pTblSave R11
#define rkSave R12
	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	rk_base+72(FP), rk

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4
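	// The counter block is kept with each 32-bit lane byte-swapped (VREV32), so
	// the big-endian GCM counter word in lane S[3] can be bumped with a plain
	// VADD against INC, which holds 1 in S[3]. A rough Go-equivalent sketch of
	// one counter increment (illustrative only, not part of this file; assumes
	// encoding/binary):
	//
	//	func incCounter(counterBlock *[16]byte) {
	//		ctr := binary.BigEndian.Uint32(counterBlock[12:16])
	//		binary.BigEndian.PutUint32(counterBlock[12:16], ctr+1)
	//	}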
	// Skip to <8 blocks loop
	CMP	$128, srcPtrLen

	MOVD	rk, H0
	// For SM4, the round keys are stored in K0 .. K7
	VLD1.P	64(H0), [K0.S4, K1.S4, K2.S4, K3.S4]
	VLD1.P	64(H0), [K4.S4, K5.S4, K6.S4, K7.S4]

	BLT	startSingles
octetsLoop:
	SUB	$128, srcPtrLen
	// Prepare 8 counters
	VMOV	CTR.B16, B0.B16
	VADD	B0.S4, INC.S4, B1.S4
	VADD	B1.S4, INC.S4, B2.S4
	VADD	B2.S4, INC.S4, B3.S4
	VADD	B3.S4, INC.S4, B4.S4
	VADD	B4.S4, INC.S4, B5.S4
	VADD	B5.S4, INC.S4, B6.S4
	VADD	B6.S4, INC.S4, B7.S4
	VADD	B7.S4, INC.S4, CTR.S4

	sm4eEnc8blocks()

	// XOR plaintext and store ciphertext
	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B0.B16, T1.B16, B0.B16
	VEOR	B1.B16, T2.B16, B1.B16
	VST1.P	[B0.B16, B1.B16], 32(dstPtr)
	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B2.B16, T1.B16, B2.B16
	VEOR	B3.B16, T2.B16, B3.B16
	VST1.P	[B2.B16, B3.B16], 32(dstPtr)
	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B4.B16, T1.B16, B4.B16
	VEOR	B5.B16, T2.B16, B5.B16
	VST1.P	[B4.B16, B5.B16], 32(dstPtr)
	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B6.B16, T1.B16, B6.B16
	VEOR	B7.B16, T2.B16, B7.B16
	VST1.P	[B6.B16, B7.B16], 32(dstPtr)

	VLD1.P	32(pTbl), [T1.B16, T2.B16]
	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16
	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	mulRound(B1)
	mulRound(B2)
	mulRound(B3)
	mulRound(B4)
	mulRound(B5)
	mulRound(B6)
	mulRound(B7)
	MOVD	pTblSave, pTbl
	reduce()

	CMP	$128, srcPtrLen
	BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]

singlesLoop:
	CMP	$16, srcPtrLen
	BLT	tail
	SUB	$16, srcPtrLen

	VMOV	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4
	sm4eEnc1block()

singlesLast:
	VLD1.P	16(srcPtr), [T0.B16]
	VEOR	T0.B16, B0.B16, B0.B16

encReduce:
	VST1.P	[B0.B16], 16(dstPtr)

	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16

	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	reduce()

	B	singlesLoop

tail:
	CBZ	srcPtrLen, done

	VEOR	T0.B16, T0.B16, T0.B16
	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1
	ADD	srcPtrLen, srcPtr

	TBZ	$3, srcPtrLen, ld4
	MOVD.W	-8(srcPtr), H0
	VMOV	H0, T0.D[0]
	VMOV	H1, T3.D[0]

ld4:
	TBZ	$2, srcPtrLen, ld2
	MOVW.W	-4(srcPtr), H0
	VEXT	$12, T0.B16, ZERO.B16, T0.B16
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.S[0]
	VMOV	H1, T3.S[0]
ld2:
	TBZ	$1, srcPtrLen, ld1
	MOVH.W	-2(srcPtr), H0
	VEXT	$14, T0.B16, ZERO.B16, T0.B16
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.H[0]
	VMOV	H1, T3.H[0]
ld1:
	TBZ	$0, srcPtrLen, ld0
	MOVB.W	-1(srcPtr), H0
	VEXT	$15, T0.B16, ZERO.B16, T0.B16
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.B[0]
	VMOV	H1, T3.B[0]
ld0:
	MOVD	ZR, srcPtrLen
	VMOV	CTR.B16, B0.B16
	sm4eEnc1block()

tailLast:
	VEOR	T0.B16, B0.B16, B0.B16
	VAND	T3.B16, B0.B16, B0.B16
	B	encReduce

done:
	VST1	[ACC0.B16], (tPtr)
	RET

// func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niDec(SB),NOSPLIT,$0
	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	rk_base+72(FP), rk

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	MOVD	pTbl, pTblSave
	MOVD	rk, rkSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4
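	// From here on, decryption mirrors gcmSm4niEnc: the keystream is produced by
	// the same sm4eEnc8blocks()/sm4eEnc1block() macros. The difference is the
	// GHASH input: the tag is computed over the ciphertext, so the loops below
	// feed the blocks loaded from srcPtr into mulRound()/reduce() and XOR them
	// with the keystream to recover the plaintext written to dstPtr.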
	// Skip to <8 blocks loop
	CMP	$128, srcPtrLen

	MOVD	rk, H0
	// For SM4, the round keys are stored in K0 .. K7
	VLD1.P	64(H0), [K0.S4, K1.S4, K2.S4, K3.S4]
	VLD1.P	64(H0), [K4.S4, K5.S4, K6.S4, K7.S4]

	BLT	startSingles
octetsLoop:
	SUB	$128, srcPtrLen

	VMOV	CTR.B16, B0.B16
	VADD	B0.S4, INC.S4, B1.S4
	VADD	B1.S4, INC.S4, B2.S4
	VADD	B2.S4, INC.S4, B3.S4
	VADD	B3.S4, INC.S4, B4.S4
	VADD	B4.S4, INC.S4, B5.S4
	VADD	B5.S4, INC.S4, B6.S4
	VADD	B6.S4, INC.S4, B7.S4
	VADD	B7.S4, INC.S4, CTR.S4

	sm4eEnc8blocks()

	VMOV	B0.B16, T1.B16
	VMOV	B1.B16, T2.B16

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B0.B16, T1.B16, T1.B16
	VEOR	B1.B16, T2.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)

	VLD1.P	32(pTbl), [T1.B16, T2.B16]
	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16
	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	mulRound(B1)

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B2.B16, B0.B16, T1.B16
	VEOR	B3.B16, B1.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B4.B16, B0.B16, T1.B16
	VEOR	B5.B16, B1.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B6.B16, B0.B16, T1.B16
	VEOR	B7.B16, B1.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	MOVD	pTblSave, pTbl
	reduce()

	CMP	$128, srcPtrLen
	BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]

singlesLoop:
	CMP	$16, srcPtrLen
	BLT	tail
	SUB	$16, srcPtrLen

	VLD1.P	16(srcPtr), [T0.B16]
	VREV64	T0.B16, B5.B16

	VMOV	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4
	sm4eEnc1block()

singlesLast:
	VEOR	T0.B16, B0.B16, B0.B16
	VST1.P	[B0.B16], 16(dstPtr)

	VEOR	ACC0.B16, B5.B16, B5.B16
	VEXT	$8, B5.B16, B5.B16, T0.B16
	VEOR	B5.B16, T0.B16, T0.B16
	VPMULL	B5.D1, T1.D1, ACC1.Q1
	VPMULL2	B5.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	reduce()

	B	singlesLoop

tail:
	CBZ	srcPtrLen, done

	VMOV	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4
	sm4eEnc1block()

tailLast:
	// Assuming it is safe to load past srcPtr due to the presence of the tag
	// B5 holds the last ciphertext block
	VLD1	(srcPtr), [B5.B16]
	VEOR	B5.B16, B0.B16, B0.B16

	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1

	TBZ	$3, srcPtrLen, ld4 // Test if srcPtrLen < 8; if yes, goto ld4
	VMOV	B0.D[0], H0
	MOVD.P	H0, 8(dstPtr)
	VMOV	H1, T3.D[0]
	VEXT	$8, ZERO.B16, B0.B16, B0.B16

ld4:
	TBZ	$2, srcPtrLen, ld2 // Test if srcPtrLen < 4; if yes, goto ld2
	VMOV	B0.S[0], H0
	MOVW.P	H0, 4(dstPtr)
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.S[0]
	VEXT	$4, ZERO.B16, B0.B16, B0.B16
ld2:
	TBZ	$1, srcPtrLen, ld1 // Test if srcPtrLen < 2; if yes, goto ld1
	VMOV	B0.H[0], H0
	MOVH.P	H0, 2(dstPtr)
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.H[0]
	VEXT	$2, ZERO.B16, B0.B16, B0.B16
ld1:
	TBZ	$0, srcPtrLen, ld0 // Test if srcPtrLen < 1; if yes, goto ld0
	VMOV	B0.B[0], H0
	MOVB.P	H0, 1(dstPtr)
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.B[0]
ld0:
	VAND	T3.B16, B5.B16, B5.B16
	VREV64	B5.B16, B5.B16
	VEOR	ACC0.B16, B5.B16, B5.B16

	VEXT	$8, B5.B16, B5.B16, T0.B16
	VEOR	B5.B16, T0.B16, T0.B16
	VPMULL	B5.D1, T1.D1, ACC1.Q1
	VPMULL2	B5.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	reduce()

done:
	VST1	[ACC0.B16], (tPtr)
	RET
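
// Go-side declarations matching the two routines above (a sketch; the enclosing
// Go file and the //go:noescape directives are assumptions, the signatures are
// taken from the comments above):
//
//	//go:noescape
//	func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
//
//	//go:noescape
//	func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)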