From 5f72151e74a22b62fae4b880722f09827121c51a Mon Sep 17 00:00:00 2001
From: Sun Yimin <emmansun@users.noreply.github.com>
Date: Wed, 5 Jul 2023 17:58:19 +0800
Subject: [PATCH] sm9/bn256: special square for final exp & optimize gfp2/gfp12
 square #137 #139

---
 sm9/bn256/bn_pair.go     |  69 +++++-----
 sm9/bn256/gfp.go         |   1 +
 sm9/bn256/gfp12.go       | 276 ++++++++++++++++++++++++++-------------
 sm9/bn256/gfp12_exp_u.go |  17 +--
 sm9/bn256/gfp12_test.go  | 131 +++++++++++++++++++
 sm9/bn256/gfp2.go        |  78 +++--------
 sm9/bn256/gfp2_test.go   |  12 ++
 sm9/bn256/gfp4.go        |  57 +-------
 sm9/bn256/gfp4_test.go   |  19 +++
 sm9/bn256/gfp_decl.go    |   1 +
 10 files changed, 420 insertions(+), 241 deletions(-)

diff --git a/sm9/bn256/bn_pair.go b/sm9/bn256/bn_pair.go
index fe79b77..0435f8c 100644
--- a/sm9/bn256/bn_pair.go
+++ b/sm9/bn256/bn_pair.go
@@ -10,7 +10,7 @@ func lineFunctionAdd(r, p, rOut *twistPoint, q *curvePoint, r2, a, b, c *gfP2) {
 	D.Square(D).Sub(D, r2).Sub(D, &r.t).Mul(D, &r.t) // D = ((Yp + Zr)^2 - Zr^2 - Yp^2)*Zr^2 = 2Yp*Zr^3
 
 	H := (&gfP2{}).Sub(B, &r.x) // H = Xp * Zr^2 - Xr
-	I := (&gfP2{}).SquareNC(H)    // I = (Xp * Zr^2 - Xr)^2 = Xp^2*Zr^4 + Xr^2 - 2Xr*Xp*Zr^2
+	I := (&gfP2{}).SquareNC(H)  // I = (Xp * Zr^2 - Xr)^2 = Xp^2*Zr^4 + Xr^2 - 2Xr*Xp*Zr^2
 
 	E := (&gfP2{}).Add(I, I) // E = 2*(Xp * Zr^2 - Xr)^2
 	E.Add(E, E)              // E = 4*(Xp * Zr^2 - Xr)^2
@@ -37,8 +37,8 @@ func lineFunctionAdd(r, p, rOut *twistPoint, q *curvePoint, r2, a, b, c *gfP2) {
 	t.Add(&p.y, &rOut.z).Square(t).Sub(t, r2).Sub(t, &rOut.t) // t = (Yp + rOut.Z)^2 - Yp^2 - rOut.Z^2 = 2Yp*rOut.Z
 
 	t2.Mul(L1, &p.x)
-	t2.Add(t2, t2)           // t2 = 2 L1 * Xp
-	a.Sub(t2, t) // a =  2 L1 * Xp - 2 Yp * rOut.z
+	t2.Add(t2, t2) // t2 = 2 L1 * Xp
+	a.Sub(t2, t)   // a =  2 L1 * Xp - 2 Yp * rOut.z
 
 	c.MulScalar(&rOut.z, &q.y)
 	c.Add(c, c)
@@ -139,9 +139,9 @@ func miller(q *twistPoint, p *curvePoint) *gfP12 {
 			ret.Square(ret)
 		}
 		mulLine(ret, a, b, c)
-		tmpR= r
+		tmpR = r
 		r = newR
-		newR= tmpR
+		newR = tmpR
 		switch sixUPlus2NAF[i-1] {
 		case 1:
 			lineFunctionAdd(r, aAffine, newR, bAffine, r2, a, b, c)
@@ -152,9 +152,9 @@ func miller(q *twistPoint, p *curvePoint) *gfP12 {
 		}
 
 		mulLine(ret, a, b, c)
-		tmpR= r
+		tmpR = r
 		r = newR
-		newR= tmpR
+		newR = tmpR
 	}
 
 	// In order to calculate Q1 we have to convert q from the sextic twist
@@ -187,9 +187,9 @@ func miller(q *twistPoint, p *curvePoint) *gfP12 {
 	r2.Square(&q1.y)
 	lineFunctionAdd(r, q1, newR, bAffine, r2, a, b, c)
 	mulLine(ret, a, b, c)
-	tmpR= r
+	tmpR = r
 	r = newR
-	newR= tmpR
+	newR = tmpR
 
 	r2.Square(&minusQ2.y)
 	lineFunctionAdd(r, minusQ2, newR, bAffine, r2, a, b, c)
@@ -202,51 +202,50 @@ func miller(q *twistPoint, p *curvePoint) *gfP12 {
 // GF(p¹²) to obtain an element of GT. https://eprint.iacr.org/2007/390.pdf
 // http://cryptojedi.org/papers/dclxvi-20100714.pdf
 func finalExponentiation(in *gfP12) *gfP12 {
-	t1 := &gfP12{}
-
 	// This is the p^6-Frobenius
-	t1.FrobeniusP6(in)
+	t1 := (&gfP12{}).FrobeniusP6(in)
 
-	inv := &gfP12{}
-	inv.Invert(in)
+	inv := (&gfP12{}).Invert(in)
 	t1.Mul(t1, inv)
 
-	t2 := (&gfP12{}).FrobeniusP2(t1)
-	t1.Mul(t1, t2)
+	t2 := inv.FrobeniusP2(t1) // reuse inv
+	t1.Mul(t1, t2)            // t1 = in ^ ((p^6 - 1) * (p^2 + 1)), the first two parts of the exponentiation
 
 	fp := (&gfP12{}).Frobenius(t1)
 	fp2 := (&gfP12{}).FrobeniusP2(t1)
 	fp3 := (&gfP12{}).Frobenius(fp2)
 
-	fu := (&gfP12{}).gfP12ExpU(t1)
-	fu2 := (&gfP12{}).gfP12ExpU(fu)
-	fu3 := (&gfP12{}).gfP12ExpU(fu2)
+	y0 := &gfP12{}
+	y0.MulNC(fp, fp2).Mul(y0, fp3) // y0 = (t1^p) * (t1^(p^2)) * (t1^(p^3))
+
+	// reuse fp, fp2, fp3 local variables
+	// [gfP12ExpU] is most time consuming operation
+	fu := fp.gfP12ExpU(t1)
+	fu2 := fp2.gfP12ExpU(fu)
+	fu3 := fp3.gfP12ExpU(fu2)
 
-	y3 := (&gfP12{}).Frobenius(fu)
 	fu2p := (&gfP12{}).Frobenius(fu2)
 	fu3p := (&gfP12{}).Frobenius(fu3)
-	y2 := (&gfP12{}).FrobeniusP2(fu2)
 
-	y0 := &gfP12{}
-	y0.MulNC(fp, fp2).Mul(y0, fp3)
+	y1 := (&gfP12{}).Conjugate(t1)    // y1 = 1 / t1
+	y2 := (&gfP12{}).FrobeniusP2(fu2) // y2 = (t1^(u^2))^(p^2)
+	y3 := (&gfP12{}).Frobenius(fu)    // y3 = (t1^u)^p
+	y3.Conjugate(y3)                  // y3 = 1 / (t1^u)^p
+	y4 := (&gfP12{}).MulNC(fu, fu2p)  // y4 = (t1^u) * ((t1^(u^2))^p)
+	y4.Conjugate(y4)                  // y4 = 1 / ((t1^u) * ((t1^(u^2))^p))
+	y5 := fu2p.Conjugate(fu2)         // y5 = 1 / t1^(u^2), reuse fu2p
+	y6 := (&gfP12{}).MulNC(fu3, fu3p) // y6 = t1^(u^3) * (t1^(u^3))^p
+	y6.Conjugate(y6)                  // y6 = 1 / (t1^(u^3) * (t1^(u^3))^p)
 
-	y1 := (&gfP12{}).Conjugate(t1)
-	y5 := (&gfP12{}).Conjugate(fu2)
-	y3.Conjugate(y3)
-	y4 := (&gfP12{}).MulNC(fu, fu2p)
-	y4.Conjugate(y4)
-
-	y6 := (&gfP12{}).MulNC(fu3, fu3p)
-	y6.Conjugate(y6)
-
-	t0 := (&gfP12{}).SquareNC(y6)
+	// https://eprint.iacr.org/2008/490.pdf
+	t0 := (&gfP12{}).SpecialSquareNC(y6)
 	t0.Mul(t0, y4).Mul(t0, y5)
 	t1.Mul(y3, y5).Mul(t1, t0)
 	t0.Mul(t0, y2)
-	t1.Square(t1).Mul(t1, t0).Square(t1)
+	t1.SpecialSquare(t1).Mul(t1, t0).SpecialSquare(t1)
 	t0.Mul(t1, y1)
 	t1.Mul(t1, y0)
-	t0.Square(t0).Mul(t0, t1)
+	t0.SpecialSquare(t0).Mul(t0, t1)
 
 	return t0
 }
diff --git a/sm9/bn256/gfp.go b/sm9/bn256/gfp.go
index f834a07..9e0e5d7 100644
--- a/sm9/bn256/gfp.go
+++ b/sm9/bn256/gfp.go
@@ -9,6 +9,7 @@ import (
 
 type gfP [4]uint64
 
+var genericZero = &gfP{0} // all limbs zero; minuend for gfpSub when negating (see gfp2.go)
 var zero = newGFp(0)
 var one = newGFp(1)
 var two = newGFp(2)
diff --git a/sm9/bn256/gfp12.go b/sm9/bn256/gfp12.go
index 8070fd0..50f2344 100644
--- a/sm9/bn256/gfp12.go
+++ b/sm9/bn256/gfp12.go
@@ -140,36 +140,7 @@ func (e *gfP12) Mul(a, b *gfP12) *gfP12 {
 	// +x0*z1*w^2 + x0*y1*v + x0*x1*v*w
 	//=(z0*z1+y0*x1*v+x0*y1*v) + (z0*y1+y0*z1+x0*x1*v)w + (z0*x1 + y0*y1 + x0*z1)*w^2
 	tmp := &gfP12{}
-	tx := &tmp.x
-	ty := &tmp.y
-	tz := &tmp.z
-	t, v0, v1, v2 := &gfP4{}, &gfP4{}, &gfP4{}, &gfP4{}
-	v0.MulNC(&a.z, &b.z)
-	v1.MulNC(&a.y, &b.y)
-	v2.MulNC(&a.x, &b.x)
-
-	t.Add(&a.y, &a.x)
-	tz.Add(&b.y, &b.x)
-	t.Mul(t, tz)
-	t.Sub(t, v1)
-	t.Sub(t, v2)
-	t.MulV1(t)
-	tz.Add(t, v0)
-
-	t.Add(&a.z, &a.y)
-	ty.Add(&b.z, &b.y)
-	ty.Mul(t, ty)
-	ty.Sub(ty, v0)
-	ty.Sub(ty, v1)
-	t.MulV1(v2)
-	ty.Add(ty, t)
-
-	t.Add(&a.z, &a.x)
-	tx.Add(&b.z, &b.x)
-	tx.Mul(tx, t)
-	tx.Sub(tx, v0)
-	tx.Add(tx, v1)
-	tx.Sub(tx, v2)
+	tmp.MulNC(a, b)
 	gfp12Copy(e, tmp)
 	return e
 }
@@ -180,6 +151,7 @@ func (e *gfP12) MulNC(a, b *gfP12) *gfP12 {
 	// +y0*z1*w + y0*y1*w^2 + y0*x1*v
 	// +x0*z1*w^2 + x0*y1*v + x0*x1*v*w
 	//=(z0*z1+y0*x1*v+x0*y1*v) + (z0*y1+y0*z1+x0*x1*v)w + (z0*x1 + y0*y1 + x0*z1)*w^2
+	// Karatsuba method
 	tx := &e.x
 	ty := &e.y
 	tz := &e.z
@@ -219,25 +191,7 @@ func (e *gfP12) Square(a *gfP12) *gfP12 {
 	// (z^2 + y*x*v + x*y*v) + (z*y + y*z + v * x^2)w + (z*x + y^2 + x*z)*w^2
 	// (z^2 + 2*x*y*v) + (v*x^2 + 2*y*z) *w + (y^2 + 2*x*z) * w^2
 	tmp := &gfP12{}
-	tx := &tmp.x
-	ty := &tmp.y
-	tz := &tmp.z
-	t := &gfP4{}
-
-	tz.SquareNC(&a.z)
-	t.MulV(&a.x, &a.y)
-	t.Add(t, t)
-	tz.Add(tz, t)
-
-	ty.SquareVNC(&a.x)
-	t.Mul(&a.y, &a.z)
-	t.Add(t, t)
-	ty.Add(ty, t)
-
-	tx.SquareNC(&a.y)
-	t.Mul(&a.x, &a.z)
-	t.Add(t, t)
-	tx.Add(tx, t)
+	tmp.SquareNC(a)
 	gfp12Copy(e, tmp)
 	return e
 }
@@ -247,25 +201,148 @@ func (e *gfP12) SquareNC(a *gfP12) *gfP12 {
 	// z^2 + z*y*w + z*x*w^2 + y*z*w + y^2*w^2 + y*x*v + x*z*w^2 + x*y*v + x^2 *v *w
 	// (z^2 + y*x*v + x*y*v) + (z*y + y*z + v * x^2)w + (z*x + y^2 + x*z)*w^2
 	// (z^2 + 2*x*y*v) + (v*x^2 + 2*y*z) *w + (y^2 + 2*x*z) * w^2
+	// Karatsuba method
 	tx := &e.x
 	ty := &e.y
 	tz := &e.z
-	t := &gfP4{}
+	t, v0, v1, v2 := &gfP4{}, &gfP4{}, &gfP4{}, &gfP4{}
+	v0.SquareNC(&a.z)
+	v1.SquareNC(&a.y)
+	v2.SquareNC(&a.x)
 
-	tz.SquareNC(&a.z)
-	t.MulV(&a.x, &a.y)
-	t.Add(t, t)
-	tz.Add(tz, t)
+	t.Add(&a.y, &a.x)
+	tz.SquareNC(t)
+	tz.Sub(tz, v1)
+	tz.Sub(tz, v2)
+	tz.MulV1(tz)
+	tz.Add(tz, v0)
 
-	ty.SquareVNC(&a.x)
-	t.Mul(&a.y, &a.z)
-	t.Add(t, t)
+	t.Add(&a.z, &a.y)
+	ty.SquareNC(t)
+	ty.Sub(ty, v0)
+	ty.Sub(ty, v1)
+	t.MulV1(v2)
 	ty.Add(ty, t)
 
-	tx.SquareNC(&a.y)
-	t.Mul(&a.x, &a.z)
-	t.Add(t, t)
-	tx.Add(tx, t)
+	t.Add(&a.z, &a.x)
+	tx.SquareNC(t)
+	tx.Sub(tx, v0)
+	tx.Add(tx, v1)
+	tx.Sub(tx, v2)
+
+	return e
+}
+
+// SpecialSquare sets e = a² using the special squaring intended for elements
+// in T_6(fp2), i.e. values produced by the easy part of the final
+// exponentiation; it is used in the hard part of the final exponentiation.
+// Formulas follow Granger/Scott (PKC 2010). e and a may alias.
+func (e *gfP12) SpecialSquare(a *gfP12) *gfP12 {
+	tmp := &gfP12{}
+	tmp.SpecialSquareNC(a)
+	gfp12Copy(e, tmp)
+	return e
+}
+
+func (e *gfP12) SpecialSquares(a *gfP12, n int) *gfP12 {
+	// First round squares a into in; n <= 1 therefore still yields one squaring.
+	in := &gfP12{}
+	tx, ty, tz := &gfP4{}, &gfP4{}, &gfP4{}
+
+	v0 := &in.x
+	v1 := &in.y
+	v2 := &in.z
+
+	v0.SquareVNC(&a.x) // (t02, t10)
+	v1.SquareNC(&a.y)  // (t12, t01)
+	v2.SquareNC(&a.z)  // (t11, t00)
+
+	tx.Add(v0, v0)
+	tx.Add(v0, tx) // tx = 3*v0
+	ty.Add(v1, v1)
+	ty.Add(v1, ty) // ty = 3*v1
+	tz.Add(v2, v2)
+	tz.Add(v2, tz) // tz = 3*v2
+
+	v0.Add(&a.x, &a.x) // (f12, f01)
+	v0.y.Neg(&v0.y)
+	v1.Add(&a.y, &a.y) // (f02, f10)
+	v1.x.Neg(&v1.x)
+	v2.Add(&a.z, &a.z) // (f11, f00)
+	v2.y.Neg(&v2.y)
+
+	v0.Add(ty, v0)
+	v1.Add(tx, v1)
+	v2.Add(tz, v2)
+
+	tmp := &gfP12{}
+	var tmp2 *gfP12
+
+	for i := 1; i < n; i++ {
+		v0 = &tmp.x
+		v1 = &tmp.y
+		v2 = &tmp.z
+
+		v0.SquareVNC(&in.x) // (t02, t10)
+		v1.SquareNC(&in.y)  // (t12, t01)
+		v2.SquareNC(&in.z)  // (t11, t00)
+
+		tx.Add(v0, v0)
+		tx.Add(v0, tx) // tx = 3*v0
+		ty.Add(v1, v1)
+		ty.Add(v1, ty) // ty = 3*v1
+		tz.Add(v2, v2)
+		tz.Add(v2, tz) // tz = 3*v2
+
+		v0.Add(&in.x, &in.x) // (f12, f01)
+		v0.y.Neg(&v0.y)
+		v1.Add(&in.y, &in.y) // (f02, f10)
+		v1.x.Neg(&v1.x)
+		v2.Add(&in.z, &in.z) // (f11, f00)
+		v2.y.Neg(&v2.y)
+
+		v0.Add(ty, v0)
+		v1.Add(tx, v1)
+		v2.Add(tz, v2)
+
+		// Swap buffers so the fresh result becomes the next round's input.
+		tmp2 = in
+		in = tmp
+		tmp = tmp2
+	}
+	gfp12Copy(e, in)
+	return e
+}
+
+func (e *gfP12) SpecialSquareNC(a *gfP12) *gfP12 {
+	tx, ty, tz := &gfP4{}, &gfP4{}, &gfP4{} // scratch for 3*v0, 3*v1, 3*v2
+
+	v0 := &e.x // results are written straight into e, so e must not alias a
+	v1 := &e.y
+	v2 := &e.z
+
+	v0.SquareVNC(&a.x) // (t02, t10)
+	v1.SquareNC(&a.y)  // (t12, t01)
+	v2.SquareNC(&a.z)  // (t11, t00)
+
+	tx.Add(v0, v0)
+	tx.Add(v0, tx) // tx = 3*v0
+	ty.Add(v1, v1)
+	ty.Add(v1, ty) // ty = 3*v1
+	tz.Add(v2, v2)
+	tz.Add(v2, tz) // tz = 3*v2
+
+	v0.Add(&a.x, &a.x) // (f12, f01)
+	v0.y.Neg(&v0.y)
+	v1.Add(&a.y, &a.y) // (f02, f10)
+	v1.x.Neg(&v1.x)
+	v2.Add(&a.z, &a.z) // (f11, f00)
+	v2.y.Neg(&v2.y)
+
+	v0.Add(ty, v0)
+	v1.Add(tx, v1)
+	v2.Add(tz, v2)
+
+	return e
+}
 
@@ -275,51 +352,68 @@ func (e *gfP12) Squares(a *gfP12, n int) *gfP12 {
 	tx := &in.x
 	ty := &in.y
 	tz := &in.z
-	t := &gfP4{}
+	t, v0, v1, v2 := &gfP4{}, &gfP4{}, &gfP4{}, &gfP4{}
 
-	tz.SquareNC(&a.z)
-	t.MulV(&a.x, &a.y)
-	t.Add(t, t)
-	tz.Add(tz, t)
+	v0.SquareNC(&a.z)
+	v1.SquareNC(&a.y)
+	v2.SquareNC(&a.x)
 
-	ty.SquareVNC(&a.x)
-	t.Mul(&a.y, &a.z)
-	t.Add(t, t)
+	t.Add(&a.y, &a.x)
+	tz.SquareNC(t)
+	tz.Sub(tz, v1)
+	tz.Sub(tz, v2)
+	tz.MulV1(tz)
+	tz.Add(tz, v0)
+
+	t.Add(&a.z, &a.y)
+	ty.SquareNC(t)
+	ty.Sub(ty, v0)
+	ty.Sub(ty, v1)
+	t.MulV1(v2)
 	ty.Add(ty, t)
 
-	tx.SquareNC(&a.y)
-	t.Mul(&a.x, &a.z)
-	t.Add(t, t)
-	tx.Add(tx, t)
+	t.Add(&a.z, &a.x)
+	tx.SquareNC(t)
+	tx.Sub(tx, v0)
+	tx.Add(tx, v1)
+	tx.Sub(tx, v2)
 
 	tmp := &gfP12{}
 	var tmp2 *gfP12
-	tx = &tmp.x
-	ty = &tmp.y
-	tz = &tmp.z
-	for i := 1; i < n; i++ {
-		tz.SquareNC(&in.z)
-		t.MulV(&in.x, &in.y)
-		t.Add(t, t)
-		tz.Add(tz, t)
 
-		ty.SquareVNC(&in.x)
-		t.Mul(&in.y, &in.z)
-		t.Add(t, t)
+	for i := 1; i < n; i++ {
+		tx = &tmp.x
+		ty = &tmp.y
+		tz = &tmp.z
+
+		v0.SquareNC(&in.z)
+		v1.SquareNC(&in.y)
+		v2.SquareNC(&in.x)
+
+		t.Add(&in.y, &in.x)
+		tz.SquareNC(t)
+		tz.Sub(tz, v1)
+		tz.Sub(tz, v2)
+		tz.MulV1(tz)
+		tz.Add(tz, v0)
+
+		t.Add(&in.z, &in.y)
+		ty.SquareNC(t)
+		ty.Sub(ty, v0)
+		ty.Sub(ty, v1)
+		t.MulV1(v2)
 		ty.Add(ty, t)
 
-		tx.SquareNC(&in.y)
-		t.Mul(&in.x, &in.z)
-		t.Add(t, t)
-		tx.Add(tx, t)
+		t.Add(&in.z, &in.x)
+		tx.SquareNC(t)
+		tx.Sub(tx, v0)
+		tx.Add(tx, v1)
+		tx.Sub(tx, v2)
 
 		// Switch references
 		tmp2 = in
 		in = tmp
 		tmp = tmp2
-		tx = &tmp.x
-		ty = &tmp.y
-		tz = &tmp.z
 	}
 	gfp12Copy(e, in)
 	return e
diff --git a/sm9/bn256/gfp12_exp_u.go b/sm9/bn256/gfp12_exp_u.go
index 04a4d54..198be7d 100644
--- a/sm9/bn256/gfp12_exp_u.go
+++ b/sm9/bn256/gfp12_exp_u.go
@@ -1,5 +1,6 @@
 package bn256
 
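+// gfP12ExpU uses the special squaring, so x must be an element for which SpecialSquare is valid (the output of the easy part of the final exponentiation).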
 func (e *gfP12) gfP12ExpU(x *gfP12) *gfP12 {
 	// The sequence of 10 multiplications and 61 squarings is derived from the
 	// following addition chain generated with github.com/mmcloughlin/addchain v0.4.0.
@@ -20,23 +21,23 @@ func (e *gfP12) gfP12ExpU(x *gfP12) *gfP12 {
 	var t2 = new(gfP12)
 	var t3 = new(gfP12)
 
-	t2.SquareNC(x)
-	t1.SquareNC(t2)
+	t2.SpecialSquareNC(x)
+	t1.SpecialSquareNC(t2)
 	z.MulNC(x, t1)
 	t0.MulNC(t1, z)
 	t2.Mul(t2, t0)
 	t3.MulNC(x, t2)
-	t3.Squares(t3, 40)
+	t3.SpecialSquares(t3, 40)
 	t3.Mul(t2, t3)
-	t3.Squares(t3, 7)
+	t3.SpecialSquares(t3, 7)
 	t2.Mul(t2, t3)
 	t1.Mul(t1, t2)
-	t1.Squares(t1, 4)
+	t1.SpecialSquares(t1, 4)
 	t0.Mul(t0, t1)
-	t0.Square(t0)
+	t0.SpecialSquare(t0)
 	t0.Mul(x, t0)
-	t0.Squares(t0, 6)
+	t0.SpecialSquares(t0, 6)
 	z.Mul(z, t0)
-	z.Square(z)
+	z.SpecialSquare(z)
 	return e
 }
diff --git a/sm9/bn256/gfp12_test.go b/sm9/bn256/gfp12_test.go
index bedd1c9..580a336 100644
--- a/sm9/bn256/gfp12_test.go
+++ b/sm9/bn256/gfp12_test.go
@@ -35,6 +35,31 @@ func Test_gfP12Square(t *testing.T) {
 	}
 }
 
+func TestSpecialSquare(t *testing.T) {
+	in := &gfP12{
+		testdataP4,
+		testdataP4,
+		*(&gfP4{}).SetOne(),
+	}
+
+	// This is the p^6-Frobenius
+	t1 := (&gfP12{}).FrobeniusP6(in)
+
+	inv := (&gfP12{}).Invert(in)
+	t1.Mul(t1, inv)
+
+	t2 := inv.FrobeniusP2(t1) // reuse inv
+	t1.Mul(t1, t2)            // t1 = in ^ ((p^6 - 1) * (p^2 + 1)), the first two parts of the exponentiation
+
+	got := &gfP12{}
+	expected := &gfP12{}
+	got.SpecialSquare(t1)
+	expected.Square(t1)
+	if *got != *expected {
+		t.Errorf("not same got=%v, expected=%v", got, expected)
+	}
+}
+
 func BenchmarkGfP12Square(b *testing.B) {
 	x := &gfP12{
 		testdataP4,
@@ -49,6 +74,52 @@ func BenchmarkGfP12Square(b *testing.B) {
 	}
 }
 
+func BenchmarkGfP12SpecialSquare(b *testing.B) {
+	in := &gfP12{
+		testdataP4,
+		testdataP4,
+		*(&gfP4{}).SetOne(),
+	}
+
+	// Map in through the easy part of the final exponentiation (p^6-Frobenius first).
+	t1 := (&gfP12{}).FrobeniusP6(in)
+
+	inv := (&gfP12{}).Invert(in)
+	t1.Mul(t1, inv)
+
+	t2 := inv.FrobeniusP2(t1) // reuse inv
+	t1.Mul(t1, t2)            // t1 = in ^ ((p^6 - 1) * (p^2 + 1)), the first two parts of the exponentiation
+	x2 := &gfP12{}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		x2.SpecialSquare(t1)
+	}
+}
+
+func BenchmarkGfP12SpecialSqures(b *testing.B) {
+	in := &gfP12{
+		testdataP4,
+		testdataP4,
+		*(&gfP4{}).SetOne(),
+	}
+
+	// Map in through the easy part of the final exponentiation (p^6-Frobenius first).
+	t1 := (&gfP12{}).FrobeniusP6(in)
+
+	inv := (&gfP12{}).Invert(in)
+	t1.Mul(t1, inv)
+
+	t2 := inv.FrobeniusP2(t1) // reuse inv
+	t1.Mul(t1, t2)            // t1 = in ^ ((p^6 - 1) * (p^2 + 1)), the first two parts of the exponentiation
+	got := &gfP12{}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		got.SpecialSquares(t1, 61)
+	}
+}
+
 func testGfP12Invert(t *testing.T, x *gfP12) {
 	xInv := &gfP12{}
 	xInv.Invert(x)
@@ -281,6 +352,20 @@ func Test_W3(t *testing.T) {
 	}
 }
 
+func BenchmarkGfP12Invert(b *testing.B) {
+	x := &gfP12{
+		testdataP4,
+		testdataP4,
+		testdataP4,
+	}
+	got := &gfP12{}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		got.Invert(x)
+	}
+}
+
 func BenchmarkGfP12Frobenius(b *testing.B) {
 	x := &gfP12{
 		testdataP4,
@@ -300,6 +385,48 @@ func BenchmarkGfP12Frobenius(b *testing.B) {
 	}
 }
 
+func BenchmarkGfP12Mul(b *testing.B) {
+	x := &gfP12{
+		testdataP4,
+		testdataP4,
+		testdataP4,
+	}
+	got := &gfP12{}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		got.Mul(x, x)
+	}
+}
+
+func BenchmarkGfP12Squre(b *testing.B) {
+	x := &gfP12{
+		testdataP4,
+		testdataP4,
+		testdataP4,
+	}
+	got := &gfP12{}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		got.Square(x)
+	}
+}
+
+func BenchmarkGfP12Squres(b *testing.B) {
+	x := &gfP12{
+		testdataP4,
+		testdataP4,
+		testdataP4,
+	}
+	got := &gfP12{}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		got.Squares(x, 61)
+	}
+}
+
 func BenchmarkGfP12ExpU(b *testing.B) {
 	x := &gfP12{
 		testdataP4,
@@ -311,6 +438,8 @@ func BenchmarkGfP12ExpU(b *testing.B) {
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
 		got.gfP12ExpU(x)
+		got.gfP12ExpU(x)
+		got.gfP12ExpU(x)
 	}
 }
 
@@ -325,5 +454,7 @@ func BenchmarkGfP12ExpU2(b *testing.B) {
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
 		got.Exp(x, u)
+		got.Exp(x, u)
+		got.Exp(x, u)
 	}
 }
diff --git a/sm9/bn256/gfp2.go b/sm9/bn256/gfp2.go
index 1e48251..0a17a99 100644
--- a/sm9/bn256/gfp2.go
+++ b/sm9/bn256/gfp2.go
@@ -64,13 +64,13 @@ func (e *gfP2) IsOne() bool {
 
 func (e *gfP2) Conjugate(a *gfP2) *gfP2 {
 	e.y.Set(&a.y)
-	gfpNeg(&e.x, &a.x)
+	gfpSub(&e.x, genericZero, &a.x)
 	return e
 }
 
 func (e *gfP2) Neg(a *gfP2) *gfP2 {
-	gfpNeg(&e.x, &a.x)
-	gfpNeg(&e.y, &a.y)
+	gfpSub(&e.x, genericZero, &a.x)
+	gfpSub(&e.y, genericZero, &a.y)
 	return e
 }
 
@@ -109,22 +109,7 @@ func (e *gfP2) Triple(a *gfP2) *gfP2 {
 // c1 = (a0 + a1)(b0 + b1) - a0*b0 - a1*b1 = a0*b1 + a1*b0
 func (e *gfP2) Mul(a, b *gfP2) *gfP2 {
 	tmp := &gfP2{}
-	tx := &tmp.x
-	ty := &tmp.y
-	v0, v1 := &gfP{}, &gfP{}
-
-	gfpMul(v0, &a.y, &b.y)
-	gfpMul(v1, &a.x, &b.x)
-
-	gfpAdd(tx, &a.x, &a.y)
-	gfpAdd(ty, &b.x, &b.y)
-	gfpMul(tx, tx, ty)
-	gfpSub(tx, tx, v0)
-	gfpSub(tx, tx, v1)
-
-	gfpSub(ty, v0, v1)
-	gfpSub(ty, ty, v1)
-
+	tmp.MulNC(a, b)
 	gfp2Copy(e, tmp)
 	return e
 }
@@ -170,7 +155,7 @@ func (e *gfP2) MulU(a, b *gfP2) *gfP2 {
 	gfpSub(ty, ty, v0)
 	gfpSub(ty, ty, v1)
 	gfpAdd(ty, ty, ty)
-	gfpNeg(ty, ty)
+	gfpSub(ty, genericZero, ty)
 
 	gfpSub(tx, v0, v1)
 	gfpSub(tx, tx, v1)
@@ -186,7 +171,7 @@ func (e *gfP2) MulU(a, b *gfP2) *gfP2 {
 func (e *gfP2) MulU1(a *gfP2) *gfP2 {
 	t := &gfP{}
 	gfpAdd(t, &a.x, &a.x)
-	gfpNeg(t, t)
+	gfpSub(t, genericZero, t)
 
 	gfpCopy(&e.x, &a.y)
 	gfpCopy(&e.y, t)
@@ -197,15 +182,7 @@ func (e *gfP2) Square(a *gfP2) *gfP2 {
 	// Complex squaring algorithm:
 	// (xu+y)² = y^2-2*x^2 + 2*u*x*y
 	tmp := &gfP2{}
-	tx := &tmp.x
-	ty := &tmp.y
-	gfpSqr(tx, &a.x, 1)
-	gfpSqr(ty, &a.y, 1)
-	gfpSub(ty, ty, tx)
-	gfpSub(ty, ty, tx)
-
-	gfpMul(tx, &a.x, &a.y)
-	gfpAdd(tx, tx, tx)
+	tmp.SquareNC(a)
 	gfp2Copy(e, tmp)
 	return e
 }
@@ -215,13 +192,15 @@ func (e *gfP2) SquareNC(a *gfP2) *gfP2 {
 	// (xu+y)² = y^2-2*x^2 + 2*u*x*y
 	tx := &e.x
 	ty := &e.y
-	gfpSqr(tx, &a.x, 1)
-	gfpSqr(ty, &a.y, 1)
-	gfpSub(ty, ty, tx)
-	gfpSub(ty, ty, tx)
 
+	gfpAdd(ty, &a.x, &a.y)
+	gfpAdd(tx, &a.x, &a.x)
+	gfpSub(tx, &a.y, tx)
+	gfpMul(ty, tx, ty)
 	gfpMul(tx, &a.x, &a.y)
+	gfpAdd(ty, tx, ty)
 	gfpAdd(tx, tx, tx)
+
 	return e
 }
 
@@ -230,20 +209,7 @@ func (e *gfP2) SquareU(a *gfP2) *gfP2 {
 	// (xu+y)²*u = (y^2-2*x^2)u - 4*x*y
 
 	tmp := &gfP2{}
-	tx := &tmp.x
-	ty := &tmp.y
-	// tx = a0^2 - 2 * a1^2
-	gfpSqr(ty, &a.x, 1)
-	gfpSqr(tx, &a.y, 1)
-	gfpAdd(ty, ty, ty)
-	gfpSub(tx, tx, ty)
-
-	// ty = -4 * a0 * a1
-	gfpMul(ty, &a.x, &a.y)
-	gfpAdd(ty, ty, ty)
-	gfpAdd(ty, ty, ty)
-	gfpNeg(ty, ty)
-
+	tmp.SquareUNC(a)
 	gfp2Copy(e, tmp)
 	return e
 }
@@ -251,20 +217,18 @@ func (e *gfP2) SquareU(a *gfP2) *gfP2 {
 func (e *gfP2) SquareUNC(a *gfP2) *gfP2 {
 	// Complex squaring algorithm:
 	// (xu+y)²*u = (y^2-2*x^2)u - 4*x*y
-
 	tx := &e.x
 	ty := &e.y
-	// tx = a0^2 - 2 * a1^2
-	gfpSqr(ty, &a.x, 1)
-	gfpSqr(tx, &a.y, 1)
-	gfpAdd(ty, ty, ty)
-	gfpSub(tx, tx, ty)
 
-	// ty = -4 * a0 * a1
+	gfpAdd(tx, &a.x, &a.y)
+	gfpAdd(ty, &a.x, &a.x)
+	gfpSub(ty, &a.y, ty)
+	gfpMul(tx, tx, ty)
 	gfpMul(ty, &a.x, &a.y)
+	gfpAdd(tx, tx, ty)
 	gfpAdd(ty, ty, ty)
 	gfpAdd(ty, ty, ty)
-	gfpNeg(ty, ty)
+	gfpSub(ty, genericZero, ty)
 
 	return e
 }
@@ -287,7 +251,7 @@ func (e *gfP2) Invert(a *gfP2) *gfP2 {
 	inv := &gfP{}
 	inv.Invert(t3) // inv = (2 * a.x ^ 2 + a.y ^ 2) ^ (-1)
 
-	gfpNeg(t1, &a.x)
+	gfpSub(t1, genericZero, &a.x)
 
 	gfpMul(&e.x, t1, inv)   // x = - a.x * inv
 	gfpMul(&e.y, &a.y, inv) // y = a.y * inv
diff --git a/sm9/bn256/gfp2_test.go b/sm9/bn256/gfp2_test.go
index b6fd6d7..92d17bb 100644
--- a/sm9/bn256/gfp2_test.go
+++ b/sm9/bn256/gfp2_test.go
@@ -172,6 +172,18 @@ func BenchmarkGfP2Square(b *testing.B) {
 	}
 }
 
+func BenchmarkGfP2SquareU(b *testing.B) {
+	x := &gfP2{
+		*fromBigInt(bigFromHex("85AEF3D078640C98597B6027B441A01FF1DD2C190F5E93C454806C11D8806141")),
+		*fromBigInt(bigFromHex("3722755292130B08D2AAB97FD34EC120EE265948D19C17ABF9B7213BAF82D65B")),
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		x.SquareU(x)
+	}
+}
+
 /*
 func Test_gfP2QuadraticResidue(t *testing.T) {
 	x := &gfP2{
diff --git a/sm9/bn256/gfp4.go b/sm9/bn256/gfp4.go
index f1eb00a..a8931e5 100644
--- a/sm9/bn256/gfp4.go
+++ b/sm9/bn256/gfp4.go
@@ -99,21 +99,7 @@ func (e *gfP4) Mul(a, b *gfP4) *gfP4 {
 	//c0 = a0*b0 +a1*b1*u
 	//c1 = (a0 + a1)(b0 + b1) - a0*b0 - a1*b1 = a0*b1 + a1*b0
 	tmp := &gfP4{}
-	tx := &tmp.x
-	ty := &tmp.y
-	v0, v1 := &gfP2{}, &gfP2{}
-	v0.MulNC(&a.y, &b.y)
-	v1.MulNC(&a.x, &b.x)
-
-	tx.Add(&a.x, &a.y)
-	ty.Add(&b.x, &b.y)
-	tx.Mul(tx, ty)
-	tx.Sub(tx, v0)
-	tx.Sub(tx, v1)
-
-	ty.MulU1(v1)
-	ty.Add(ty, v0)
-
+	tmp.MulNC(a, b)
 	gfp4Copy(e, tmp)
 	return e
 }
@@ -151,22 +137,7 @@ func (e *gfP4) MulNC(a, b *gfP4) *gfP4 {
 // c1 = a0*b0 + a1*b1*u
 func (e *gfP4) MulV(a, b *gfP4) *gfP4 {
 	tmp := &gfP4{}
-	tx := &tmp.x
-	ty := &tmp.y
-	v0, v1 := &gfP2{}, &gfP2{}
-	v0.MulNC(&a.y, &b.y)
-	v1.MulNC(&a.x, &b.x)
-
-	tx.Add(&a.x, &a.y)
-	ty.Add(&b.x, &b.y)
-	ty.Mul(tx, ty)
-	ty.Sub(ty, v0)
-	ty.Sub(ty, v1)
-	ty.MulU1(ty)
-
-	tx.MulU1(v1)
-	tx.Add(tx, v0)
-
+	tmp.MulVNC(a, b)
 	gfp4Copy(e, tmp)
 	return e
 }
@@ -208,15 +179,7 @@ func (e *gfP4) Square(a *gfP4) *gfP4 {
 	// Complex squaring algorithm:
 	// (xv+y)² = (x^2*u + y^2) + 2*x*y*v
 	tmp := &gfP4{}
-	tx := &tmp.x
-	ty := &tmp.y
-	tx.SquareUNC(&a.x)
-	ty.SquareNC(&a.y)
-	ty.Add(tx, ty)
-
-	tx.Mul(&a.x, &a.y)
-	tx.Add(tx, tx)
-
+	tmp.SquareNC(a)
 	gfp4Copy(e, tmp)
 	return e
 }
@@ -224,13 +187,15 @@ func (e *gfP4) Square(a *gfP4) *gfP4 {
 func (e *gfP4) SquareNC(a *gfP4) *gfP4 {
 	// Complex squaring algorithm:
 	// (xv+y)² = (x^2*u + y^2) + 2*x*y*v
+	// = (xu + y)(x + y) - xy(1+u) + 2xy*v
 	tx := &e.x
 	ty := &e.y
+
 	tx.SquareUNC(&a.x)
 	ty.SquareNC(&a.y)
 	ty.Add(tx, ty)
 
-	tx.Mul(&a.x, &a.y)
+	tx.MulNC(&a.x, &a.y)
 	tx.Add(tx, tx)
 
 	return e
@@ -240,15 +205,7 @@ func (e *gfP4) SquareNC(a *gfP4) *gfP4 {
 // v*(xv+y)² = (x^2*u + y^2)v + 2*x*y*u
 func (e *gfP4) SquareV(a *gfP4) *gfP4 {
 	tmp := &gfP4{}
-	tx := &tmp.x
-	ty := &tmp.y
-	tx.SquareUNC(&a.x)
-	ty.SquareNC(&a.y)
-	tx.Add(tx, ty)
-
-	ty.MulU(&a.x, &a.y)
-	ty.Add(ty, ty)
-
+	tmp.SquareVNC(a)
 	gfp4Copy(e, tmp)
 	return e
 }
diff --git a/sm9/bn256/gfp4_test.go b/sm9/bn256/gfp4_test.go
index a875f54..e45127d 100644
--- a/sm9/bn256/gfp4_test.go
+++ b/sm9/bn256/gfp4_test.go
@@ -207,3 +207,22 @@ func BenchmarkGfP4Mul(b *testing.B) {
 		t.Mul(x, y)
 	}
 }
+
+func BenchmarkGfP4Square(b *testing.B) {
+	x := &gfP4{
+		gfP2{
+			*fromBigInt(bigFromHex("85AEF3D078640C98597B6027B441A01FF1DD2C190F5E93C454806C11D8806141")),
+			*fromBigInt(bigFromHex("3722755292130B08D2AAB97FD34EC120EE265948D19C17ABF9B7213BAF82D65B")),
+		},
+		gfP2{
+			*fromBigInt(bigFromHex("17509B092E845C1266BA0D262CBEE6ED0736A96FA347C8BD856DC76B84EBEB96")),
+			*fromBigInt(bigFromHex("A7CF28D519BE3DA65F3170153D278FF247EFBA98A71A08116215BBA5C999A7C7")),
+		},
+	}
+	t := &gfP4{}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		t.Square(x)
+	}
+}
diff --git a/sm9/bn256/gfp_decl.go b/sm9/bn256/gfp_decl.go
index 7c0a4a1..ed426d1 100644
--- a/sm9/bn256/gfp_decl.go
+++ b/sm9/bn256/gfp_decl.go
@@ -16,6 +16,7 @@ import (
 var supportADX = cpu.X86.HasADX && cpu.X86.HasBMI2
 
 // Set c = p - a, if c == p, then c = 0
+// NOTE: this appears to be slower than calling gfpSub(c, genericZero, a).
 //
 // go:noescape
 func gfpNeg(c, a *gfP)