From 0a559b5202f10e7ecd0109fed9e78daa4094c9b5 Mon Sep 17 00:00:00 2001
From: Sun Yimin <emmansun@users.noreply.github.com>
Date: Tue, 25 Jul 2023 15:02:41 +0800
Subject: [PATCH] sm9/bn256: arm64 complete add

---
 sm9/bn256/gfp2_g1_arm64.go |  74 -----------------
 sm9/bn256/gfp2_g1_arm64.s  | 165 +++++++++++++++++++++++++++++++++++++
 sm9/bn256/gfp2_g1_decl.go  |   4 +-
 3 files changed, 167 insertions(+), 76 deletions(-)
 delete mode 100644 sm9/bn256/gfp2_g1_arm64.go

diff --git a/sm9/bn256/gfp2_g1_arm64.go b/sm9/bn256/gfp2_g1_arm64.go
deleted file mode 100644
index f92c169..0000000
--- a/sm9/bn256/gfp2_g1_arm64.go
+++ /dev/null
@@ -1,74 +0,0 @@
-//go:build arm64 && !purego
-// +build arm64,!purego
-
-package bn256
-
-// gfP2 multiplication.
-//
-//go:noescape
-func gfp2Mul(c, a, b *gfP2)
-
-// gfP2 multiplication. c = a*b*u
-//
-//go:noescape
-func gfp2MulU(c, a, b *gfP2)
-
-// gfP2 square.
-//
-//go:noescape
-func gfp2Square(c, a *gfP2)
-
-// gfP2 square and mult u.
-//
-//go:noescape
-func gfp2SquareU(c, a *gfP2)
-
-// Point doubling. Sets res = in + in. in can be the point at infinity.
-//
-//go:noescape
-func curvePointDoubleComplete(c, a *curvePoint)
-
-func curvePointAddComplete(c, p1, p2 *curvePoint) {
-	// Complete addition formula for a = 0 from "Complete addition formulas for
-	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §3.2.
-	// Algorithm 7: Complete, projective point addition for prime order j-invariant 0 short Weierstrass curves.
-
-	t0, t1, t2, t3, t4 := new(gfP), new(gfP), new(gfP), new(gfP), new(gfP)
-	x3, y3, z3 := new(gfP), new(gfP), new(gfP)
-	gfpMul(t0, &p1.x, &p2.x)    // t0 := X1X2
-	gfpMul(t1, &p1.y, &p2.y)    // t1 := Y1Y2
-	gfpMul(t2, &p1.z, &p2.z)    // t2 := Z1Z2
-	gfpAdd(t3, &p1.x, &p1.y)    // t3 := X1 + Y1
-	gfpAdd(t4, &p2.x, &p2.y)    // t4 := X2 + Y2
-	gfpMul(t3, t3, t4)          // t3 := t3 * t4 = (X1 + Y1) * (X2 + Y2)
-	gfpAdd(t4, t0, t1)          // t4 := t0 + t1
-	gfpSub(t3, t3, t4)          // t3 := t3 - t4 = X1Y2 + X2Y1
-	gfpAdd(t4, &p1.y, &p1.z)    // t4 := Y1 + Z1
-	gfpAdd(x3, &p2.y, &p2.z)    // X3 := Y2 + Z2
-	gfpMul(t4, t4, x3)          // t4 := t4 * X3 = (Y1 + Z1)(Y2 + Z2)
-	gfpAdd(x3, t1, t2)          // X3 := t1 + t2
-	gfpSub(t4, t4, x3)          // t4 := t4 - X3 = Y1Z2 + Y2Z1
-	gfpAdd(x3, &p1.x, &p1.z)    // X3 := X1 + Z1
-	gfpAdd(y3, &p2.x, &p2.z)    // Y3 := X2 + Z2
-	gfpMul(x3, x3, y3)          // X3 := X3 * Y3
-	gfpAdd(y3, t0, t2)          // Y3 := t0 + t2
-	gfpSub(y3, x3, y3)          // Y3 := X3 - Y3 = X1Z2 + X2Z1
-	gfpTriple(t0, t0)           // t0 := t0 + t0 + t0 = 3X1X2
-	gfpMul(t2, threeCurveB, t2) // t2 := 3b * t2 = 3bZ1Z2
-	gfpAdd(z3, t1, t2)          // Z3 := t1 + t2 = Y1Y2 + 3bZ1Z2
-	gfpSub(t1, t1, t2)          // t1 := t1 - t2 = Y1Y2 - 3bZ1Z2
-	gfpMul(y3, threeCurveB, y3) // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1)
-	gfpMul(x3, t4, y3)          // X3 := t4 * Y3 = 3b(X1Z2 + X2Z1)(Y1Z2 + Y2Z1)
-	gfpMul(t2, t3, t1)          // t2 := t3 * t1 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2)
-	gfpSub(x3, t2, x3)          // X3 := t2 - X3 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - 3b(Y1Z2 + Y2Z1)(X1Z2 + X2Z1)
-	gfpMul(y3, y3, t0)          // Y3 := Y3 * t0 = 9bX1X2(X1Z2 + X2Z1)
-	gfpMul(t1, t1, z3)          // t1 := t1 * Z3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2)
-	gfpAdd(y3, t1, y3)          // Y3 := t1 + Y3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + 9bX1X2(X1Z2 + X2Z1)
-	gfpMul(t0, t0, t3)          // t0 := t0 * t3 = 3X1X2(X1Y2 + X2Y1)
-	gfpMul(z3, z3, t4)          // Z3 := Z3 * t4 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2)
-	gfpAdd(z3, z3, t0)          // Z3 := Z3 + t0 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + 3X1X2(X1Y2 + X2Y1)
-
-	c.x.Set(x3)
-	c.y.Set(y3)
-	c.z.Set(z3)
-}
diff --git a/sm9/bn256/gfp2_g1_arm64.s b/sm9/bn256/gfp2_g1_arm64.s
index 878c3ec..a49aebe 100644
--- a/sm9/bn256/gfp2_g1_arm64.s
+++ b/sm9/bn256/gfp2_g1_arm64.s
@@ -757,6 +757,171 @@ TEXT ·curvePointDoubleComplete(SB),NOSPLIT,$168-16
 
 	RET
 
+/* ---------------------------------------*/
 #undef x3t
 #undef y3t
 #undef z3t
+
+#define tmp2(off) (32*2 + 8 + off)(RSP)
+#define tmp3(off) (32*3 + 8 + off)(RSP)
+#define tmp4(off) (32*4 + 8 + off)(RSP)
+#define x3t(off) (32*5 + 8 + off)(RSP)
+#define y3t(off) (32*6 + 8 + off)(RSP)
+#define z3t(off) (32*7 + 8 + off)(RSP)
+
+// func curvePointAddComplete(c, a, b *curvePoint) -- c = a + b by the complete projective addition steps annotated below; 3b*v is formed as 16v - v.
+TEXT ·curvePointAddComplete(SB),0,$264-24
+	MOVD	in1+8(FP), a_ptr
+	MOVD	in2+16(FP), b_ptr
+
+	MOVD	·np+0x00(SB), hlp1
+	LDP	·p2+0x00(SB), (const0, const1)
+	LDP	·p2+0x10(SB), (const2, const3)
+
+	LDx (x1in)
+	LDy (x2in)
+	CALL gfpMulInternal(SB)         // t0 := X1X2
+	STy (tmp0)
+	LDx (y1in)
+	LDy (y2in)
+	CALL gfpMulInternal(SB)         // t1 := Y1Y2
+	STy (tmp1)
+	LDx (z1in)
+	LDy (z2in)
+	CALL gfpMulInternal(SB)         // t2 := Z1Z2
+	STy (tmp2)
+
+	LDx (x1in)
+	LDy (y1in)
+	gfpAddInline                    // t3 := X1 + Y1
+	STx (tmp3)
+
+	LDx (x2in)
+	LDy (y2in)
+	gfpAddInline                    // t4 := X2 + Y2
+	LDy (tmp3)
+	CALL gfpMulInternal(SB)         // t3 := t3 * t4 = (X1 + Y1) * (X2 + Y2)
+	STy (tmp3)
+
+	LDx (tmp0)
+	LDy (tmp1)
+	gfpAddInline                    // t4 := t0 + t1
+	LDy (tmp3)
+	CALL gfpSubInternal(SB)         // t3 := t3 - t4 = X1Y2 + X2Y1
+	STx (tmp3)
+
+	LDx (y1in)
+	LDy (z1in)
+	gfpAddInline                    // t4 := Y1 + Z1
+	STx (tmp4)
+
+	LDx (y2in)
+	LDy (z2in)
+	gfpAddInline                    // X3 := Y2 + Z2
+	LDy (tmp4)
+	CALL gfpMulInternal(SB)         // t4 := t4 * X3 = (Y1 + Z1)(Y2 + Z2)
+	STy (tmp4)
+
+	LDx (tmp1)
+	LDy (tmp2)
+	gfpAddInline                    // X3 := t1 + t2
+	LDy (tmp4)
+	CALL gfpSubInternal(SB)         // t4 := t4 - X3 = Y1Z2 + Y2Z1
+	STx (tmp4)
+
+	LDx (x1in)
+	LDy (z1in)
+	gfpAddInline                    // X3 := X1 + Z1
+	STx (x3t)
+
+	LDx (x2in)
+	LDy (z2in)
+	gfpAddInline                    // Y3 := X2 + Z2
+	LDy (x3t)
+	CALL gfpMulInternal(SB)         // X3 := X3 * Y3
+	STy (x3t)
+
+	LDx (tmp0)
+	LDy (tmp2)
+	gfpAddInline                    // Y3 := t0 + t2
+	LDy (x3t)
+	CALL gfpSubInternal(SB)         // Y3 := X3 - Y3 = X1Z2 + X2Z1
+	STx (y3t)
+
+	LDy (tmp0)
+	gfpMulBy2Inline
+	gfpAddInline                    // t0 := t0 + t0 + t0 = 3X1X2
+	STx (tmp0)
+
+	LDy (tmp2)
+	gfpMulBy2Inline
+	x2y
+	gfpMulBy2Inline
+	x2y
+	gfpMulBy2Inline
+	x2y
+	gfpMulBy2Inline
+	x2y
+	LDx (tmp2)
+	CALL gfpSubInternal(SB)        // t2 := 3b * t2 = 3bZ1Z2
+	STx (tmp2)
+
+	LDy (tmp1)
+	gfpAddInline                   // Z3 := t1 + t2 = Y1Y2 + 3bZ1Z2
+	STx (z3t)
+
+	LDx (tmp2)
+	CALL gfpSubInternal(SB)        // t1 := t1 - t2 = Y1Y2 - 3bZ1Z2
+	STx (tmp1)
+
+	LDy (y3t)
+	gfpMulBy2Inline
+	x2y
+	gfpMulBy2Inline
+	x2y
+	gfpMulBy2Inline
+	x2y
+	gfpMulBy2Inline
+	x2y
+	LDx (y3t)
+	CALL gfpSubInternal(SB)        // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1)
+	STx (y3t)
+
+	LDy (tmp4)
+	CALL gfpMulInternal(SB)        // X3 := t4 * Y3 = 3b(X1Z2 + X2Z1)(Y1Z2 + Y2Z1)
+	STy (x3t)
+
+	MOVD res+0(FP), b_ptr          // in1/in2 are not read past this point; b_ptr now holds res, so the x2in/y2in/z2in stores below write the result (assumes those macros address through b_ptr)
+
+	LDx (tmp3)
+	LDy (tmp1)
+	CALL gfpMulInternal(SB)        // t2 := t3 * t1 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2)
+	LDx (x3t)
+	CALL gfpSubInternal(SB)        // X3 := t2 - X3 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - 3b(Y1Z2 + Y2Z1)(X1Z2 + X2Z1)
+	STx (x2in)
+
+	LDy (y3t)
+	LDx (tmp0)
+	CALL gfpMulInternal(SB)        // Y3 := Y3 * t0 = 9bX1X2(X1Z2 + X2Z1)
+	STy (y3t)
+
+	LDx (tmp1)
+	LDy (z3t)
+	CALL gfpMulInternal(SB)        // t1 := t1 * Z3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2)
+	LDx (y3t)
+	gfpAddInline                   // Y3 := t1 + Y3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + 9bX1X2(X1Z2 + X2Z1)
+	STx (y2in)
+
+	LDx (tmp0)
+	LDy (tmp3)
+	CALL gfpMulInternal(SB)        // t0 := t0 * t3 = 3X1X2(X1Y2 + X2Y1)
+	STy (tmp0)
+
+	LDx (tmp4)
+	LDy (z3t)
+	CALL gfpMulInternal(SB)        // Z3 := Z3 * t4 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2)
+	LDx (tmp0)
+	gfpAddInline                   // Z3 := Z3 + t0 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + 3X1X2(X1Y2 + X2Y1)
+	STx (z2in)
+
+	RET
diff --git a/sm9/bn256/gfp2_g1_decl.go b/sm9/bn256/gfp2_g1_decl.go
index 71c37f2..6e5078a 100644
--- a/sm9/bn256/gfp2_g1_decl.go
+++ b/sm9/bn256/gfp2_g1_decl.go
@@ -1,5 +1,5 @@
-//go:build amd64 && !purego
-// +build amd64,!purego
+//go:build (amd64 && !purego) || (arm64 && !purego)
+// +build amd64,!purego arm64,!purego
 
 package bn256