From c62d6daf7471facd32843f0ec91effde786ddd1b Mon Sep 17 00:00:00 2001
From: Sun Yimin <emmansun@users.noreply.github.com>
Date: Tue, 25 Jul 2023 08:21:44 +0800
Subject: [PATCH] sm9/bn256: arm64 gfp2MulU

---
 sm9/bn256/gfp2_g1_arm64.go | 27 +++---------------
 sm9/bn256/gfp2_g1_arm64.s  | 57 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 59 insertions(+), 25 deletions(-)

diff --git a/sm9/bn256/gfp2_g1_arm64.go b/sm9/bn256/gfp2_g1_arm64.go
index 7fdf2b8..1c65710 100644
--- a/sm9/bn256/gfp2_g1_arm64.go
+++ b/sm9/bn256/gfp2_g1_arm64.go
@@ -8,29 +8,10 @@ package bn256
 //go:noescape
 func gfp2Mul(c, a, b *gfP2)
 
-func gfp2MulU(c, a, b *gfP2) {
-	tmp := &gfP2{}
-	tx := &tmp.x
-	ty := &tmp.y
-	v0, v1 := &gfP{}, &gfP{}
-
-	gfpMul(v0, &a.y, &b.y)
-	gfpMul(v1, &a.x, &b.x)
-
-	gfpAdd(tx, &a.x, &a.y)
-	gfpAdd(ty, &b.x, &b.y)
-
-	gfpMul(ty, tx, ty)
-	gfpSub(ty, ty, v0)
-	gfpSub(ty, ty, v1)
-	gfpDouble(ty, ty)
-	gfpNeg(ty, ty)
-
-	gfpSub(tx, v0, v1)
-	gfpSub(tx, tx, v1)
-
-	gfp2Copy(c, tmp)
-}
+// gfp2MulU sets c = a*b*u, i.e. c.x = a.y*b.y - 2*a.x*b.x and c.y = -2*(a.x*b.y + a.y*b.x).
+//
+//go:noescape
+func gfp2MulU(c, a, b *gfP2)
 
 func gfp2Square(c, a *gfP2) {
 	tmp := &gfP2{}
diff --git a/sm9/bn256/gfp2_g1_arm64.s b/sm9/bn256/gfp2_g1_arm64.s
index beb414f..36b1af6 100644
--- a/sm9/bn256/gfp2_g1_arm64.s
+++ b/sm9/bn256/gfp2_g1_arm64.s
@@ -515,8 +515,8 @@ TEXT ·gfp2Mul(SB),NOSPLIT,$104-24
 	LDx (y1in)
 	LDy (y2in)
 	CALL gfpMulInternal(SB)
-
 	STy (tmp0)
+
 	LDx (x1in)
 	LDy (x2in)
 	CALL gfpMulInternal(SB)
@@ -538,7 +538,7 @@ TEXT ·gfp2Mul(SB),NOSPLIT,$104-24
 	x2y
 	LDx (tmp1)
 	CALL gfpSubInternal(SB)
-	MOVD	res+0(FP), res_ptr  // not use hlp1 any more    
+	MOVD	res+0(FP), res_ptr  // not use hlp1 any more
 	STx (x3out)
 
 	LDy (tmp1)
@@ -548,3 +548,56 @@ TEXT ·gfp2Mul(SB),NOSPLIT,$104-24
 	STx (y3out)
 
 	RET
+
+// func gfp2MulU(c, a, b *gfP2): c = a*b*u; follows the Go version this patch removes (v0, v1 below are its names).
+TEXT ·gfp2MulU(SB),NOSPLIT,$104-24 // NOTE(review): helper macros/subroutines are defined outside this hunk; x/y roles below are inferred from usage
+	MOVD	in1+8(FP), a_ptr // second argument (a)
+	MOVD	in2+16(FP), b_ptr // third argument (b)
+
+	MOVD	·np+0x00(SB), hlp1 // first word of ·np; presumably consumed by gfpMulInternal - confirm
+	LDP	·p2+0x00(SB), (const0, const1) // ·p2 words 0-1
+	LDP	·p2+0x10(SB), (const2, const3) // ·p2 words 2-3
+
+	LDx (y1in)
+	LDy (y2in)
+	CALL gfpMulInternal(SB) // y = y1in*y2in (the product is stored from y below)
+	STy (tmp0) // tmp0 = v0
+
+	LDx (x1in)
+	LDy (x2in)
+	CALL gfpMulInternal(SB) // y = x1in*x2in
+	STy (tmp1) // tmp1 = v1
+
+	LDx (x1in)
+	LDy (y1in)
+	gfpAddInline // x = x1in + y1in
+	STx (tmp2) // keep the first sum while the second one is formed
+
+	LDx (x2in)
+	LDy (y2in)
+	gfpAddInline // x = x2in + y2in
+	LDy (tmp2)
+	CALL gfpMulInternal(SB) // y = (x1in + y1in)*(x2in + y2in)
+
+	LDx (tmp0)
+	CALL gfpSubInternal(SB) // x = y - v0
+	x2y
+	LDx (tmp1)
+	CALL gfpSubInternal(SB) // x = y - v1, i.e. the cross term
+	x2y
+	gfpMulBy2Inline // x = 2*y
+	MOVD	$0, y0 // y = 0 (four limbs) so the subtraction below negates x
+	MOVD	$0, y1 
+	MOVD	$0, y2 
+	MOVD	$0, y3
+	CALL gfpSubInternal(SB) // x = 0 - 2*(cross term)
+	MOVD	res+0(FP), res_ptr    // first argument (c); hlp1 is not used any more
+	STx (y3out)
+
+	LDy (tmp1)
+	gfpMulBy2Inline // x = 2*v1
+	LDy (tmp0)
+	CALL gfpSubInternal(SB) // x = v0 - 2*v1
+	STx (x3out)
+
+	RET