diff --git a/sm9/bn256/bn_pair.go b/sm9/bn256/bn_pair.go
index acc3ef8..fe79b77 100644
--- a/sm9/bn256/bn_pair.go
+++ b/sm9/bn256/bn_pair.go
@@ -1,35 +1,34 @@
 // Package bn256 defines/implements ShangMi(SM) sm9's curves and pairing.
 package bn256
 
-func lineFunctionAdd(r, p *twistPoint, q *curvePoint, r2 *gfP2) (a, b, c *gfP2, rOut *twistPoint) {
+func lineFunctionAdd(r, p, rOut *twistPoint, q *curvePoint, r2, a, b, c *gfP2) {
 	// See the mixed addition algorithm from "Faster Computation of the
 	// Tate Pairing", http://arxiv.org/pdf/0904.0854v3.pdf
-	B := (&gfP2{}).Mul(&p.x, &r.t) // B = Xp * Zr^2
+	B := (&gfP2{}).MulNC(&p.x, &r.t) // B = Xp * Zr^2
 
 	D := (&gfP2{}).Add(&p.y, &r.z)                   // D = Yp + Zr
 	D.Square(D).Sub(D, r2).Sub(D, &r.t).Mul(D, &r.t) // D = ((Yp + Zr)^2 - Zr^2 - Yp^2)*Zr^2 = 2Yp*Zr^3
 
 	H := (&gfP2{}).Sub(B, &r.x) // H = Xp * Zr^2 - Xr
-	I := (&gfP2{}).Square(H)    // I = (Xp * Zr^2 - Xr)^2 = Xp^2*Zr^4 + Xr^2 - 2Xr*Xp*Zr^2
+	I := (&gfP2{}).SquareNC(H)    // I = (Xp * Zr^2 - Xr)^2 = Xp^2*Zr^4 + Xr^2 - 2Xr*Xp*Zr^2
 
 	E := (&gfP2{}).Add(I, I) // E = 2*(Xp * Zr^2 - Xr)^2
 	E.Add(E, E)              // E = 4*(Xp * Zr^2 - Xr)^2
 
-	J := (&gfP2{}).Mul(H, E) // J =  4*(Xp * Zr^2 - Xr)^3
+	J := (&gfP2{}).MulNC(H, E) // J =  4*(Xp * Zr^2 - Xr)^3
 
 	L1 := (&gfP2{}).Sub(D, &r.y) // L1 = 2Yp*Zr^3 - Yr
 	L1.Sub(L1, &r.y)             // L1 = 2Yp*Zr^3 - 2*Yr
 
-	V := (&gfP2{}).Mul(&r.x, E) // V = 4 * Xr * (Xp * Zr^2 - Xr)^2
+	V := (&gfP2{}).MulNC(&r.x, E) // V = 4 * Xr * (Xp * Zr^2 - Xr)^2
 
-	rOut = &twistPoint{}
 	rOut.x.Square(L1).Sub(&rOut.x, J).Sub(&rOut.x, V).Sub(&rOut.x, V) // rOut.x = L1^2 - J - 2V
 
 	rOut.z.Add(&r.z, H).Square(&rOut.z).Sub(&rOut.z, &r.t).Sub(&rOut.z, I) // rOut.z = (Zr + H)^2 - Zr^2 - I
 
 	t := (&gfP2{}).Sub(V, &rOut.x) // t = V - rOut.x
 	t.Mul(t, L1)                   // t = L1*(V-rOut.x)
-	t2 := (&gfP2{}).Mul(&r.y, J)
+	t2 := (&gfP2{}).MulNC(&r.y, J)
 	t2.Add(t2, t2)    // t2 = 2Yr * J
 	rOut.y.Sub(t, t2) // rOut.y = L1*(V-rOut.x) - 2Yr*J
 
@@ -39,23 +38,21 @@ func lineFunctionAdd(r, p *twistPoint, q *curvePoint, r2 *gfP2) (a, b, c *gfP2,
 
 	t2.Mul(L1, &p.x)
 	t2.Add(t2, t2)           // t2 = 2 L1 * Xp
-	a = (&gfP2{}).Sub(t2, t) // a =  2 L1 * Xp - 2 Yp * rOut.z
+	a.Sub(t2, t) // a =  2 L1 * Xp - 2 Yp * rOut.z
 
-	c = (&gfP2{}).MulScalar(&rOut.z, &q.y)
+	c.MulScalar(&rOut.z, &q.y)
 	c.Add(c, c)
 
-	b = (&gfP2{}).Neg(L1)
+	b.Neg(L1)
 	b.MulScalar(b, &q.x).Add(b, b)
-
-	return
 }
 
-func lineFunctionDouble(r *twistPoint, q *curvePoint) (a, b, c *gfP2, rOut *twistPoint) {
+func lineFunctionDouble(r, rOut *twistPoint, q *curvePoint, a, b, c *gfP2) {
 	// See the doubling algorithm for a=0 from "Faster Computation of the
 	// Tate Pairing", http://arxiv.org/pdf/0904.0854v3.pdf
-	A := (&gfP2{}).Square(&r.x)
-	B := (&gfP2{}).Square(&r.y)
-	C := (&gfP2{}).Square(B) // C = Yr ^ 4
+	A := (&gfP2{}).SquareNC(&r.x)
+	B := (&gfP2{}).SquareNC(&r.y)
+	C := (&gfP2{}).SquareNC(B) // C = Yr ^ 4
 
 	D := (&gfP2{}).Add(&r.x, B)
 	D.Square(D).Sub(D, A).Sub(D, C).Add(D, D)
@@ -63,9 +60,8 @@ func lineFunctionDouble(r *twistPoint, q *curvePoint) (a, b, c *gfP2, rOut *twis
 	E := (&gfP2{}).Add(A, A) //
 	E.Add(E, A)              // E = 3 * Xr ^ 2
 
-	G := (&gfP2{}).Square(E) // G = 9 * Xr^4
+	G := (&gfP2{}).SquareNC(E) // G = 9 * Xr^4
 
-	rOut = &twistPoint{}
 	rOut.x.Sub(G, D).Sub(&rOut.x, D)
 
 	rOut.z.Add(&r.y, &r.z).Square(&rOut.z).Sub(&rOut.z, B).Sub(&rOut.z, &r.t) // Z3 = (Yr + Zr)^2 - Yr^2 - Zr^2 = 2Yr*Zr
@@ -78,18 +74,16 @@ func lineFunctionDouble(r *twistPoint, q *curvePoint) (a, b, c *gfP2, rOut *twis
 	rOut.t.Square(&rOut.z)
 
 	t.Mul(E, &r.t).Add(t, t)
-	b = (&gfP2{}).Neg(t)
+	b.Neg(t)
 	b.MulScalar(b, &q.x)
 
-	a = (&gfP2{}).Add(&r.x, E)
+	a.Add(&r.x, E)
 	a.Square(a).Sub(a, A).Sub(a, G)
 	t.Add(B, B).Add(t, t)
 	a.Sub(a, t)
 
-	c = (&gfP2{}).Mul(&rOut.z, &r.t)
+	c.Mul(&rOut.z, &r.t)
 	c.Add(c, c).MulScalar(c, &q.y)
-
-	return
 }
 
 // (ret.z + ret.y*w + ret.x*w^2)* ((cv+a) + b*w^2)
@@ -98,15 +92,15 @@ func mulLine(ret *gfP12, a, b, c *gfP2) {
 	gfp2Copy(&bz.x, c)
 	gfp2Copy(&bz.y, a)
 
-	tz.Mul(&ret.z, bz)
+	tz.MulNC(&ret.z, bz)
 	t.MulScalar(&ret.y, b).MulV1(t)
 	tz.Add(tz, t)
 
-	t1.Mul(&ret.y, bz)
+	t1.MulNC(&ret.y, bz)
 	t.MulScalar(&ret.x, b).MulV1(t)
 	ret.y.Add(t1, t)
 
-	t.Mul(&ret.x, bz)
+	t.MulNC(&ret.x, bz)
 	t1.MulScalar(&ret.z, b)
 	ret.x.Add(t1, t)
 	gfp4Copy(&ret.z, tz)
@@ -134,26 +128,33 @@ func miller(q *twistPoint, p *curvePoint) *gfP12 {
 	r := &twistPoint{}
 	r.Set(aAffine)
 
-	r2 := (&gfP2{}).Square(&aAffine.y)
+	r2 := (&gfP2{}).SquareNC(&aAffine.y)
 
+	a, b, c := &gfP2{}, &gfP2{}, &gfP2{}
+	newR := &twistPoint{}
+	var tmpR *twistPoint
 	for i := len(sixUPlus2NAF) - 1; i > 0; i-- {
-		a, b, c, newR := lineFunctionDouble(r, bAffine)
+		lineFunctionDouble(r, newR, bAffine, a, b, c)
 		if i != len(sixUPlus2NAF)-1 {
 			ret.Square(ret)
 		}
 		mulLine(ret, a, b, c)
+		tmpR= r
 		r = newR
+		newR= tmpR
 		switch sixUPlus2NAF[i-1] {
 		case 1:
-			a, b, c, newR = lineFunctionAdd(r, aAffine, bAffine, r2)
+			lineFunctionAdd(r, aAffine, newR, bAffine, r2, a, b, c)
 		case -1:
-			a, b, c, newR = lineFunctionAdd(r, minusA, bAffine, r2)
+			lineFunctionAdd(r, minusA, newR, bAffine, r2, a, b, c)
 		default:
 			continue
 		}
 
 		mulLine(ret, a, b, c)
+		tmpR= r
 		r = newR
+		newR= tmpR
 	}
 
 	// In order to calculate Q1 we have to convert q from the sextic twist
@@ -184,12 +185,14 @@ func miller(q *twistPoint, p *curvePoint) *gfP12 {
 	minusQ2.t.SetOne()
 
 	r2.Square(&q1.y)
-	a, b, c, newR := lineFunctionAdd(r, q1, bAffine, r2)
+	lineFunctionAdd(r, q1, newR, bAffine, r2, a, b, c)
 	mulLine(ret, a, b, c)
+	tmpR= r
 	r = newR
+	newR= tmpR
 
 	r2.Square(&minusQ2.y)
-	a, b, c, _ = lineFunctionAdd(r, minusQ2, bAffine, r2)
+	lineFunctionAdd(r, minusQ2, newR, bAffine, r2, a, b, c)
 	mulLine(ret, a, b, c)
 
 	return ret
@@ -225,18 +228,18 @@ func finalExponentiation(in *gfP12) *gfP12 {
 	y2 := (&gfP12{}).FrobeniusP2(fu2)
 
 	y0 := &gfP12{}
-	y0.Mul(fp, fp2).Mul(y0, fp3)
+	y0.MulNC(fp, fp2).Mul(y0, fp3)
 
 	y1 := (&gfP12{}).Conjugate(t1)
 	y5 := (&gfP12{}).Conjugate(fu2)
 	y3.Conjugate(y3)
-	y4 := (&gfP12{}).Mul(fu, fu2p)
+	y4 := (&gfP12{}).MulNC(fu, fu2p)
 	y4.Conjugate(y4)
 
-	y6 := (&gfP12{}).Mul(fu3, fu3p)
+	y6 := (&gfP12{}).MulNC(fu3, fu3p)
 	y6.Conjugate(y6)
 
-	t0 := (&gfP12{}).Square(y6)
+	t0 := (&gfP12{}).SquareNC(y6)
 	t0.Mul(t0, y4).Mul(t0, y5)
 	t1.Mul(y3, y5).Mul(t1, t0)
 	t0.Mul(t0, y2)
diff --git a/sm9/bn256/bn_pair_b6.go b/sm9/bn256/bn_pair_b6.go
index c016783..46d226b 100644
--- a/sm9/bn256/bn_pair_b6.go
+++ b/sm9/bn256/bn_pair_b6.go
@@ -1,9 +1,128 @@
 package bn256
 
+// (ret.x t + ret.y) * ((cs)t + (bs+a))
+//= ((ret.x * (bs+a))+ret.y*cs) t + (ret.y*(bs+a) + ret.x*cs*s)
+//= (ret.x*bs + ret.x*a + ret.y*cs) t + (ret.y*bs + ret.x*cs*s + ret.y * a)
+//ret.x = (ret.x + ret.y)(cs + bs + a) - ret.y(bs+a) - ret.x*cs
+//ret.y = ret.y(bs+a) + ret.x*cs *s
+func mulLineB6(ret *gfP12b6, a, b, c *gfP2) {
+	a2 := &gfP6{}
+	a2.y.Set(b)
+	a2.z.Set(a)
+	a2.Mul(&ret.y, a2)
+	t3 := &gfP6{}
+	t3.MulScalar(&ret.x, c).MulS(t3)
+
+	t := (&gfP2{}).Add(b, c)
+	t2 := &gfP6{}
+	t2.y.Set(t)
+	t2.z.Set(a)
+	ret.x.Add(&ret.x, &ret.y)
+
+	ret.y.Set(t3)
+
+	ret.x.Mul(&ret.x, t2).Sub(&ret.x, a2).Sub(&ret.x, &ret.y)
+
+	ret.y.MulS(&ret.y)
+	ret.y.Add(&ret.y, a2)
+}
+
+// R-ate Pairing G2 x G1 -> GT
+//
+// P is a point of order q in G1. Q(x,y) is a point of order q in G2.
+// Note that Q is a point on the sextic twist of the curve over Fp^2, P(x,y) is a point on the
+// curve over the base field Fp
+//
+func millerB6(q *twistPoint, p *curvePoint) *gfP12b6 {
+	ret := (&gfP12b6{}).SetOne()
+
+	aAffine := &twistPoint{}
+	aAffine.Set(q)
+	aAffine.MakeAffine()
+
+	minusA := &twistPoint{}
+	minusA.Neg(aAffine)
+
+	bAffine := &curvePoint{}
+	bAffine.Set(p)
+	bAffine.MakeAffine()
+
+	r := &twistPoint{}
+	r.Set(aAffine)
+
+	r2 := (&gfP2{}).Square(&aAffine.y)
+
+	a, b, c := &gfP2{}, &gfP2{}, &gfP2{}
+	newR := &twistPoint{}
+	var tmpR *twistPoint
+	for i := len(sixUPlus2NAF) - 1; i > 0; i-- {
+		lineFunctionDouble(r, newR, bAffine, a, b, c)
+		if i != len(sixUPlus2NAF)-1 {
+			ret.Square(ret)
+		}
+		mulLineB6(ret, a, b, c)
+		tmpR= r
+		r = newR
+		newR= tmpR
+		switch sixUPlus2NAF[i-1] {
+		case 1:
+			lineFunctionAdd(r, aAffine, newR, bAffine, r2, a, b, c)
+		case -1:
+			lineFunctionAdd(r, minusA, newR, bAffine, r2, a, b, c)
+		default:
+			continue
+		}
+
+		mulLineB6(ret, a, b, c)
+		tmpR= r
+		r = newR
+		newR= tmpR
+	}
+
+	// In order to calculate Q1 we have to convert q from the sextic twist
+	// to the full GF(p^12) group, apply the Frobenius there, and convert
+	// back.
+	//
+	// The twist isomorphism is (x', y') -> (x*β^(-1/3), y*β^(-1/2)). If we consider just
+	// x for a moment, then after applying the Frobenius, we have x̄*β^(-p/3)
+	// where x̄ is the conjugate of x.	If we are going to apply the inverse
+	// isomorphism we need a value with a single coefficient of β^(-1/3) so we
+	// rewrite this as x̄*β^((-p+1)/3)*β^(-1/3).
+	//
+	// A similar argument can be made for the y value.
+	q1 := &twistPoint{}
+	q1.x.Conjugate(&aAffine.x)
+	q1.x.MulScalar(&q1.x, betaToNegPPlus1Over3)
+	q1.y.Conjugate(&aAffine.y)
+	q1.y.MulScalar(&q1.y, betaToNegPPlus1Over2)
+	q1.z.SetOne()
+	q1.t.SetOne()
+
+	minusQ2 := &twistPoint{}
+	minusQ2.x.Set(&aAffine.x)
+	minusQ2.x.MulScalar(&minusQ2.x, betaToNegP2Plus1Over3)
+	minusQ2.y.Neg(&aAffine.y)
+	minusQ2.y.MulScalar(&minusQ2.y, betaToNegP2Plus1Over2)
+	minusQ2.z.SetOne()
+	minusQ2.t.SetOne()
+
+	r2.Square(&q1.y)
+	lineFunctionAdd(r, q1, newR, bAffine, r2, a, b, c)
+	mulLineB6(ret, a, b, c)
+	tmpR= r
+	r = newR
+	newR= tmpR
+
+	r2.Square(&minusQ2.y)
+	lineFunctionAdd(r, minusQ2, newR, bAffine, r2, a, b, c)
+	mulLineB6(ret, a, b, c)
+
+	return ret
+}
+
 func pairingB6(a *twistPoint, b *curvePoint) *gfP12 {
-	e := miller(a, b)
-	eb6 := (&gfP12b6{}).SetGfP12(e)
-	ret := finalExponentiationB6(eb6)
+	e := millerB6(a, b)
+	ret := finalExponentiationB6(e)
 
 	if a.IsInfinity() || b.IsInfinity() {
 		ret.SetOne()
diff --git a/sm9/bn256/bn_pair_b6_test.go b/sm9/bn256/bn_pair_b6_test.go
index 6427114..8ef6957 100644
--- a/sm9/bn256/bn_pair_b6_test.go
+++ b/sm9/bn256/bn_pair_b6_test.go
@@ -1,10 +1,21 @@
 package bn256
 
 import (
+	"fmt"
 	"math/big"
 	"testing"
 )
 
+func TestToGfP12_1(t *testing.T) {
+	x := &gfP12b6{}
+	x.SetGfP12(expected1)
+	fmt.Printf("%v\n", gfP12b6Decode(x))
+	x.SetGfP12(expected_b2)
+	fmt.Printf("%v\n", gfP12b6Decode(x))
+	x.SetGfP12(expected_b2_2)
+	fmt.Printf("%v\n", gfP12b6Decode(x))
+}
+
 func Test_finalExponentiationB6(t *testing.T) {
 	x := &gfP12b6{
 		p6,
@@ -89,3 +100,20 @@ func BenchmarkFinalExponentiationB6(b *testing.B) {
 		}
 	}
 }
+
+func BenchmarkPairingB6(b *testing.B) {
+	pk := bigFromHex("0130E78459D78545CB54C587E02CF480CE0B66340F319F348A1D5B1F2DC5F4")
+	g2 := &G2{}
+	_, err := g2.ScalarBaseMult(NormalizeScalar(pk.Bytes()))
+	if err != nil {
+		b.Fatal(err)
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		ret := pairingB6(g2.p, curveGen)
+		if *ret != *expected1 {
+			b.Errorf("not expected")
+		}
+	}
+}
diff --git a/sm9/bn256/bn_pair_test.go b/sm9/bn256/bn_pair_test.go
index 019f29d..cb1eb95 100644
--- a/sm9/bn256/bn_pair_test.go
+++ b/sm9/bn256/bn_pair_test.go
@@ -161,3 +161,20 @@ func BenchmarkFinalExponentiation(b *testing.B) {
 		}
 	}
 }
+
+func BenchmarkPairingB4(b *testing.B) {
+	pk := bigFromHex("0130E78459D78545CB54C587E02CF480CE0B66340F319F348A1D5B1F2DC5F4")
+	g2 := &G2{}
+	_, err := g2.ScalarBaseMult(NormalizeScalar(pk.Bytes()))
+	if err != nil {
+		b.Fatal(err)
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		ret := pairing(g2.p, curveGen)
+		if *ret != *expected1 {
+			b.Errorf("not expected")
+		}
+	}
+}
diff --git a/sm9/bn256/gfp12.go b/sm9/bn256/gfp12.go
index fbe1533..8070fd0 100644
--- a/sm9/bn256/gfp12.go
+++ b/sm9/bn256/gfp12.go
@@ -144,9 +144,9 @@ func (e *gfP12) Mul(a, b *gfP12) *gfP12 {
 	ty := &tmp.y
 	tz := &tmp.z
 	t, v0, v1, v2 := &gfP4{}, &gfP4{}, &gfP4{}, &gfP4{}
-	v0.Mul(&a.z, &b.z)
-	v1.Mul(&a.y, &b.y)
-	v2.Mul(&a.x, &b.x)
+	v0.MulNC(&a.z, &b.z)
+	v1.MulNC(&a.y, &b.y)
+	v2.MulNC(&a.x, &b.x)
 
 	t.Add(&a.y, &a.x)
 	tz.Add(&b.y, &b.x)
@@ -174,6 +174,45 @@ func (e *gfP12) Mul(a, b *gfP12) *gfP12 {
 	return e
 }
 
+func (e *gfP12) MulNC(a, b *gfP12) *gfP12 {
+	// (z0 + y0*w + x0*w^2)* (z1 + y1*w + x1*w^2)
+	//  z0*z1 + z0*y1*w + z0*x1*w^2
+	// +y0*z1*w + y0*y1*w^2 + y0*x1*v
+	// +x0*z1*w^2 + x0*y1*v + x0*x1*v*w
+	//=(z0*z1+y0*x1*v+x0*y1*v) + (z0*y1+y0*z1+x0*x1*v)w + (z0*x1 + y0*y1 + x0*z1)*w^2
+	tx := &e.x
+	ty := &e.y
+	tz := &e.z
+	t, v0, v1, v2 := &gfP4{}, &gfP4{}, &gfP4{}, &gfP4{}
+	v0.MulNC(&a.z, &b.z)
+	v1.MulNC(&a.y, &b.y)
+	v2.MulNC(&a.x, &b.x)
+
+	t.Add(&a.y, &a.x)
+	tz.Add(&b.y, &b.x)
+	t.Mul(t, tz)
+	t.Sub(t, v1)
+	t.Sub(t, v2)
+	t.MulV1(t)
+	tz.Add(t, v0)
+
+	t.Add(&a.z, &a.y)
+	ty.Add(&b.z, &b.y)
+	ty.Mul(t, ty)
+	ty.Sub(ty, v0)
+	ty.Sub(ty, v1)
+	t.MulV1(v2)
+	ty.Add(ty, t)
+
+	t.Add(&a.z, &a.x)
+	tx.Add(&b.z, &b.x)
+	tx.Mul(tx, t)
+	tx.Sub(tx, v0)
+	tx.Add(tx, v1)
+	tx.Sub(tx, v2)
+	return e
+}
+
 func (e *gfP12) Square(a *gfP12) *gfP12 {
 	// (z + y*w + x*w^2)* (z + y*w + x*w^2)
 	// z^2 + z*y*w + z*x*w^2 + y*z*w + y^2*w^2 + y*x*v + x*z*w^2 + x*y*v + x^2 *v *w
@@ -185,17 +224,17 @@ func (e *gfP12) Square(a *gfP12) *gfP12 {
 	tz := &tmp.z
 	t := &gfP4{}
 
-	tz.Square(&a.z)
+	tz.SquareNC(&a.z)
 	t.MulV(&a.x, &a.y)
 	t.Add(t, t)
 	tz.Add(tz, t)
 
-	ty.SquareV(&a.x)
+	ty.SquareVNC(&a.x)
 	t.Mul(&a.y, &a.z)
 	t.Add(t, t)
 	ty.Add(ty, t)
 
-	tx.Square(&a.y)
+	tx.SquareNC(&a.y)
 	t.Mul(&a.x, &a.z)
 	t.Add(t, t)
 	tx.Add(tx, t)
@@ -203,6 +242,33 @@ func (e *gfP12) Square(a *gfP12) *gfP12 {
 	return e
 }
 
+func (e *gfP12) SquareNC(a *gfP12) *gfP12 {
+	// (z + y*w + x*w^2)* (z + y*w + x*w^2)
+	// z^2 + z*y*w + z*x*w^2 + y*z*w + y^2*w^2 + y*x*v + x*z*w^2 + x*y*v + x^2 *v *w
+	// (z^2 + y*x*v + x*y*v) + (z*y + y*z + v * x^2)w + (z*x + y^2 + x*z)*w^2
+	// (z^2 + 2*x*y*v) + (v*x^2 + 2*y*z) *w + (y^2 + 2*x*z) * w^2
+	tx := &e.x
+	ty := &e.y
+	tz := &e.z
+	t := &gfP4{}
+
+	tz.SquareNC(&a.z)
+	t.MulV(&a.x, &a.y)
+	t.Add(t, t)
+	tz.Add(tz, t)
+
+	ty.SquareVNC(&a.x)
+	t.Mul(&a.y, &a.z)
+	t.Add(t, t)
+	ty.Add(ty, t)
+
+	tx.SquareNC(&a.y)
+	t.Mul(&a.x, &a.z)
+	t.Add(t, t)
+	tx.Add(tx, t)
+	return e
+}
+
 func (e *gfP12) Squares(a *gfP12, n int) *gfP12 {
 	// Square first round
 	in := &gfP12{}
@@ -211,17 +277,17 @@ func (e *gfP12) Squares(a *gfP12, n int) *gfP12 {
 	tz := &in.z
 	t := &gfP4{}
 
-	tz.Square(&a.z)
+	tz.SquareNC(&a.z)
 	t.MulV(&a.x, &a.y)
 	t.Add(t, t)
 	tz.Add(tz, t)
 
-	ty.SquareV(&a.x)
+	ty.SquareVNC(&a.x)
 	t.Mul(&a.y, &a.z)
 	t.Add(t, t)
 	ty.Add(ty, t)
 
-	tx.Square(&a.y)
+	tx.SquareNC(&a.y)
 	t.Mul(&a.x, &a.z)
 	t.Add(t, t)
 	tx.Add(tx, t)
@@ -232,17 +298,17 @@ func (e *gfP12) Squares(a *gfP12, n int) *gfP12 {
 	ty = &tmp.y
 	tz = &tmp.z
 	for i := 1; i < n; i++ {
-		tz.Square(&in.z)
+		tz.SquareNC(&in.z)
 		t.MulV(&in.x, &in.y)
 		t.Add(t, t)
 		tz.Add(tz, t)
 
-		ty.SquareV(&in.x)
+		ty.SquareVNC(&in.x)
 		t.Mul(&in.y, &in.z)
 		t.Add(t, t)
 		ty.Add(ty, t)
 
-		tx.Square(&in.y)
+		tx.SquareNC(&in.y)
 		t.Mul(&in.x, &in.z)
 		t.Add(t, t)
 		tx.Add(tx, t)
@@ -289,19 +355,19 @@ func (e *gfP12) Invert(a *gfP12) *gfP12 {
 	// = τ²(y²-ξxz) + τ(ξx²-yz) + (z²-ξxy)
 	//
 	// So that's why A = (z²-ξxy), B = (ξx²-yz), C = (y²-ξxz)
-	t1 := (&gfP4{}).MulV(&a.x, &a.y)
-	A := (&gfP4{}).Square(&a.z)
+	t1 := (&gfP4{}).MulVNC(&a.x, &a.y)
+	A := (&gfP4{}).SquareNC(&a.z)
 	A.Sub(A, t1)
 
-	B := (&gfP4{}).SquareV(&a.x)
+	B := (&gfP4{}).SquareVNC(&a.x)
 	t1.Mul(&a.y, &a.z)
 	B.Sub(B, t1)
 
-	C := (&gfP4{}).Square(&a.y)
+	C := (&gfP4{}).SquareNC(&a.y)
 	t1.Mul(&a.x, &a.z)
 	C.Sub(C, t1)
 
-	F := (&gfP4{}).MulV(C, &a.y)
+	F := (&gfP4{}).MulVNC(C, &a.y)
 	t1.Mul(A, &a.z)
 	F.Add(F, t1)
 	t1.MulV(B, &a.x)
diff --git a/sm9/bn256/gfp12_exp_u.go b/sm9/bn256/gfp12_exp_u.go
index 2f95620..04a4d54 100644
--- a/sm9/bn256/gfp12_exp_u.go
+++ b/sm9/bn256/gfp12_exp_u.go
@@ -14,18 +14,18 @@ func (e *gfP12) gfP12ExpU(x *gfP12) *gfP12 {
 	//	i69    = (2*(i56 << 4 + _1001) + 1) << 6
 	//	return   2*(_101 + i69)
 	//
-	var z = new(gfP12)
+	var z = e
 	var t0 = new(gfP12)
 	var t1 = new(gfP12)
 	var t2 = new(gfP12)
 	var t3 = new(gfP12)
 
-	t2.Square(x)
-	t1.Square(t2)
-	z.Mul(x, t1)
-	t0.Mul(t1, z)
+	t2.SquareNC(x)
+	t1.SquareNC(t2)
+	z.MulNC(x, t1)
+	t0.MulNC(t1, z)
 	t2.Mul(t2, t0)
-	t3.Mul(x, t2)
+	t3.MulNC(x, t2)
 	t3.Squares(t3, 40)
 	t3.Mul(t2, t3)
 	t3.Squares(t3, 7)
@@ -38,6 +38,5 @@ func (e *gfP12) gfP12ExpU(x *gfP12) *gfP12 {
 	t0.Squares(t0, 6)
 	z.Mul(z, t0)
 	z.Square(z)
-	gfp12Copy(e, z)
 	return e
 }
diff --git a/sm9/bn256/gfp2.go b/sm9/bn256/gfp2.go
index e89d19d..1e48251 100644
--- a/sm9/bn256/gfp2.go
+++ b/sm9/bn256/gfp2.go
@@ -129,6 +129,27 @@ func (e *gfP2) Mul(a, b *gfP2) *gfP2 {
 	return e
 }
 
+// Mul without Copy
+func (e *gfP2) MulNC(a, b *gfP2) *gfP2 {
+	tx := &e.x
+	ty := &e.y
+	v0, v1 := &gfP{}, &gfP{}
+
+	gfpMul(v0, &a.y, &b.y)
+	gfpMul(v1, &a.x, &b.x)
+
+	gfpAdd(tx, &a.x, &a.y)
+	gfpAdd(ty, &b.x, &b.y)
+	gfpMul(tx, tx, ty)
+	gfpSub(tx, tx, v0)
+	gfpSub(tx, tx, v1)
+
+	gfpSub(ty, v0, v1)
+	gfpSub(ty, ty, v1)
+
+	return e
+}
+
 // MulU: a * b * u
 // (a0+a1*u)(b0+b1*u)*u=c0+c1*u, where
 // c1 = (a0*b0 - 2a1*b1)u
@@ -189,6 +210,21 @@ func (e *gfP2) Square(a *gfP2) *gfP2 {
 	return e
 }
 
+func (e *gfP2) SquareNC(a *gfP2) *gfP2 {
+	// Complex squaring algorithm:
+	// (xu+y)² = y^2-2*x^2 + 2*u*x*y
+	tx := &e.x
+	ty := &e.y
+	gfpSqr(tx, &a.x, 1)
+	gfpSqr(ty, &a.y, 1)
+	gfpSub(ty, ty, tx)
+	gfpSub(ty, ty, tx)
+
+	gfpMul(tx, &a.x, &a.y)
+	gfpAdd(tx, tx, tx)
+	return e
+}
+
 func (e *gfP2) SquareU(a *gfP2) *gfP2 {
 	// Complex squaring algorithm:
 	// (xu+y)²*u = (y^2-2*x^2)u - 4*x*y
@@ -212,6 +248,27 @@ func (e *gfP2) SquareU(a *gfP2) *gfP2 {
 	return e
 }
 
+func (e *gfP2) SquareUNC(a *gfP2) *gfP2 {
+	// Complex squaring algorithm:
+	// (xu+y)²*u = (y^2-2*x^2)u - 4*x*y
+
+	tx := &e.x
+	ty := &e.y
+	// tx = a0^2 - 2 * a1^2
+	gfpSqr(ty, &a.x, 1)
+	gfpSqr(tx, &a.y, 1)
+	gfpAdd(ty, ty, ty)
+	gfpSub(tx, tx, ty)
+
+	// ty = -4 * a0 * a1
+	gfpMul(ty, &a.x, &a.y)
+	gfpAdd(ty, ty, ty)
+	gfpAdd(ty, ty, ty)
+	gfpNeg(ty, ty)
+
+	return e
+}
+
 func (e *gfP2) MulScalar(a *gfP2, b *gfP) *gfP2 {
 	gfpMul(&e.x, &a.x, b)
 	gfpMul(&e.y, &a.y, b)
diff --git a/sm9/bn256/gfp4.go b/sm9/bn256/gfp4.go
index af04808..f1eb00a 100644
--- a/sm9/bn256/gfp4.go
+++ b/sm9/bn256/gfp4.go
@@ -102,8 +102,8 @@ func (e *gfP4) Mul(a, b *gfP4) *gfP4 {
 	tx := &tmp.x
 	ty := &tmp.y
 	v0, v1 := &gfP2{}, &gfP2{}
-	v0.Mul(&a.y, &b.y)
-	v1.Mul(&a.x, &b.x)
+	v0.MulNC(&a.y, &b.y)
+	v1.MulNC(&a.x, &b.x)
 
 	tx.Add(&a.x, &a.y)
 	ty.Add(&b.x, &b.y)
@@ -118,6 +118,31 @@ func (e *gfP4) Mul(a, b *gfP4) *gfP4 {
 	return e
 }
 
+func (e *gfP4) MulNC(a, b *gfP4) *gfP4 {
+	// "Multiplication and Squaring on Pairing-Friendly Fields"
+	// Section 4, Karatsuba method.
+	// http://eprint.iacr.org/2006/471.pdf
+	//(a0+a1*v)(b0+b1*v)=c0+c1*v, where
+	//c0 = a0*b0 +a1*b1*u
+	//c1 = (a0 + a1)(b0 + b1) - a0*b0 - a1*b1 = a0*b1 + a1*b0
+	tx := &e.x
+	ty := &e.y
+	v0, v1 := &gfP2{}, &gfP2{}
+	v0.MulNC(&a.y, &b.y)
+	v1.MulNC(&a.x, &b.x)
+
+	tx.Add(&a.x, &a.y)
+	ty.Add(&b.x, &b.y)
+	tx.Mul(tx, ty)
+	tx.Sub(tx, v0)
+	tx.Sub(tx, v1)
+
+	ty.MulU1(v1)
+	ty.Add(ty, v0)
+
+	return e
+}
+
 // MulV: a * b * v
 // (a0+a1*v)(b0+b1*v)*v=c0+c1*v, where
 // (a0*b0 + a0*b1v + a1*b0*v + a1*b1*u)*v
@@ -129,8 +154,8 @@ func (e *gfP4) MulV(a, b *gfP4) *gfP4 {
 	tx := &tmp.x
 	ty := &tmp.y
 	v0, v1 := &gfP2{}, &gfP2{}
-	v0.Mul(&a.y, &b.y)
-	v1.Mul(&a.x, &b.x)
+	v0.MulNC(&a.y, &b.y)
+	v1.MulNC(&a.x, &b.x)
 
 	tx.Add(&a.x, &a.y)
 	ty.Add(&b.x, &b.y)
@@ -146,6 +171,26 @@ func (e *gfP4) MulV(a, b *gfP4) *gfP4 {
 	return e
 }
 
+func (e *gfP4) MulVNC(a, b *gfP4) *gfP4 {
+	tx := &e.x
+	ty := &e.y
+	v0, v1 := &gfP2{}, &gfP2{}
+	v0.MulNC(&a.y, &b.y)
+	v1.MulNC(&a.x, &b.x)
+
+	tx.Add(&a.x, &a.y)
+	ty.Add(&b.x, &b.y)
+	ty.Mul(tx, ty)
+	ty.Sub(ty, v0)
+	ty.Sub(ty, v1)
+	ty.MulU1(ty)
+
+	tx.MulU1(v1)
+	tx.Add(tx, v0)
+
+	return e
+}
+
 // MulV1: a * v
 // (a0+a1*v)*v=c0+c1*v, where
 // c0 = a1*u
@@ -165,8 +210,8 @@ func (e *gfP4) Square(a *gfP4) *gfP4 {
 	tmp := &gfP4{}
 	tx := &tmp.x
 	ty := &tmp.y
-	tx.SquareU(&a.x)
-	ty.Square(&a.y)
+	tx.SquareUNC(&a.x)
+	ty.SquareNC(&a.y)
 	ty.Add(tx, ty)
 
 	tx.Mul(&a.x, &a.y)
@@ -176,14 +221,29 @@ func (e *gfP4) Square(a *gfP4) *gfP4 {
 	return e
 }
 
+func (e *gfP4) SquareNC(a *gfP4) *gfP4 {
+	// Complex squaring algorithm:
+	// (xv+y)² = (x^2*u + y^2) + 2*x*y*v
+	tx := &e.x
+	ty := &e.y
+	tx.SquareUNC(&a.x)
+	ty.SquareNC(&a.y)
+	ty.Add(tx, ty)
+
+	tx.Mul(&a.x, &a.y)
+	tx.Add(tx, tx)
+
+	return e
+}
+
 // SquareV: (a^2) * v
 // v*(xv+y)² = (x^2*u + y^2)v + 2*x*y*u
 func (e *gfP4) SquareV(a *gfP4) *gfP4 {
 	tmp := &gfP4{}
 	tx := &tmp.x
 	ty := &tmp.y
-	tx.SquareU(&a.x)
-	ty.Square(&a.y)
+	tx.SquareUNC(&a.x)
+	ty.SquareNC(&a.y)
 	tx.Add(tx, ty)
 
 	ty.MulU(&a.x, &a.y)
@@ -193,6 +253,19 @@ func (e *gfP4) SquareV(a *gfP4) *gfP4 {
 	return e
 }
 
+func (e *gfP4) SquareVNC(a *gfP4) *gfP4 {
+	tx := &e.x
+	ty := &e.y
+	tx.SquareUNC(&a.x)
+	ty.SquareNC(&a.y)
+	tx.Add(tx, ty)
+
+	ty.MulU(&a.x, &a.y)
+	ty.Add(ty, ty)
+
+	return e
+}
+
 func (e *gfP4) Invert(a *gfP4) *gfP4 {
 	// See "Implementing cryptographic pairings", M. Scott, section 3.2.
 	// ftp://136.206.11.249/pub/crypto/pairings.pdf
@@ -202,15 +275,15 @@ func (e *gfP4) Invert(a *gfP4) *gfP4 {
 
 	t3 := &gfP2{}
 
-	t3.SquareU(&a.x)
-	t1.Square(&a.y)
+	t3.SquareUNC(&a.x)
+	t1.SquareNC(&a.y)
 	t3.Sub(t3, t1)
 	t3.Invert(t3)
 
 	t1.Mul(&a.y, t3)
 	t1.Neg(t1)
 
-	t2.Mul(&a.x, t3)
+	t2.MulNC(&a.x, t3)
 
 	gfp4Copy(e, tmp)
 	return e
@@ -242,7 +315,7 @@ func (e *gfP4) Frobenius(a *gfP4) *gfP4 {
 	tmp := &gfP4{}
 	x := &tmp.x
 	y := &tmp.y
-	
+
 	x.Conjugate(&a.x)
 	y.Conjugate(&a.y)
 	x.MulScalar(x, vToPMinus1)
diff --git a/sm9/bn256/gt.go b/sm9/bn256/gt.go
index 9c3e316..71b108f 100644
--- a/sm9/bn256/gt.go
+++ b/sm9/bn256/gt.go
@@ -227,15 +227,12 @@ func GenerateGTFieldTable(basePoint *GT) *[32 * 2]GTFieldTable {
 		for j := 1; j < 15; j += 2 {
 			table[i][j] = &GT{}
 			table[i][j].p = &gfP12{}
-			table[i][j].p.Square(table[i][j/2].p)
+			table[i][j].p.SquareNC(table[i][j/2].p)
 			table[i][j+1] = &GT{}
 			table[i][j+1].p = &gfP12{}
 			table[i][j+1].Add(table[i][j], base)
 		}
-		base.p.Square(base.p)
-		base.p.Square(base.p)
-		base.p.Square(base.p)
-		base.p.Square(base.p)
+		base.p.Squares(base.p, 4)
 	}
 	return table
 }
@@ -277,7 +274,7 @@ func ScalarMultGT(a *GT, scalar []byte) (*GT, error) {
 	for i := 1; i < 15; i += 2 {
 		table[i] = &GT{}
 		table[i].p = &gfP12{}
-		table[i].p.Square(table[i/2].p)
+		table[i].p.SquareNC(table[i/2].p)
 
 		table[i+1] = &GT{}
 		table[i+1].p = &gfP12{}
@@ -292,18 +289,12 @@ func ScalarMultGT(a *GT, scalar []byte) (*GT, error) {
 		// No need to double on the first iteration, as p is the identity at
 		// this point, and [N]∞ = ∞.
 		if i != 0 {
-			e.p.Square(e.p)
-			e.p.Square(e.p)
-			e.p.Square(e.p)
-			e.p.Square(e.p)
+			e.p.Squares(e.p, 4)
 		}
 		windowValue := byte >> 4
 		table.Select(t, windowValue)
 		e.Add(e, t)
-		e.p.Square(e.p)
-		e.p.Square(e.p)
-		e.p.Square(e.p)
-		e.p.Square(e.p)
+		e.p.Squares(e.p, 4)
 		windowValue = byte & 0b1111
 		table.Select(t, windowValue)
 		e.Add(e, t)
diff --git a/sm9/bn256/twist.go b/sm9/bn256/twist.go
index 222db35..a2c99e9 100644
--- a/sm9/bn256/twist.go
+++ b/sm9/bn256/twist.go
@@ -100,23 +100,23 @@ func (c *twistPoint) Add(a, b *twistPoint) {
 	}
 
 	// See http://hyperelliptic.org/EFD/g1p/auto-code/shortw/jacobian-0/addition/add-2007-bl.op3
-	z12 := (&gfP2{}).Square(&a.z)
-	z22 := (&gfP2{}).Square(&b.z)
-	u1 := (&gfP2{}).Mul(&a.x, z22)
-	u2 := (&gfP2{}).Mul(&b.x, z12)
+	z12 := (&gfP2{}).SquareNC(&a.z)
+	z22 := (&gfP2{}).SquareNC(&b.z)
+	u1 := (&gfP2{}).MulNC(&a.x, z22)
+	u2 := (&gfP2{}).MulNC(&b.x, z12)
 
-	t := (&gfP2{}).Mul(&b.z, z22)
-	s1 := (&gfP2{}).Mul(&a.y, t)
+	t := (&gfP2{}).MulNC(&b.z, z22)
+	s1 := (&gfP2{}).MulNC(&a.y, t)
 
 	t.Mul(&a.z, z12)
-	s2 := (&gfP2{}).Mul(&b.y, t)
+	s2 := (&gfP2{}).MulNC(&b.y, t)
 
 	h := (&gfP2{}).Sub(u2, u1)
 	xEqual := h.IsZero()
 
 	t.Add(h, h)
-	i := (&gfP2{}).Square(t)
-	j := (&gfP2{}).Mul(h, i)
+	i := (&gfP2{}).SquareNC(t)
+	j := (&gfP2{}).MulNC(h, i)
 
 	t.Sub(s2, s1)
 	yEqual := t.IsZero()
@@ -126,9 +126,9 @@ func (c *twistPoint) Add(a, b *twistPoint) {
 	}
 	r := (&gfP2{}).Add(t, t)
 
-	v := (&gfP2{}).Mul(u1, i)
+	v := (&gfP2{}).MulNC(u1, i)
 
-	t4 := (&gfP2{}).Square(r)
+	t4 := (&gfP2{}).SquareNC(r)
 	t.Add(v, v)
 	t6 := (&gfP2{}).Sub(t4, j)
 	c.x.Sub(t6, t)
@@ -148,18 +148,18 @@ func (c *twistPoint) Add(a, b *twistPoint) {
 
 func (c *twistPoint) Double(a *twistPoint) {
 	// See http://hyperelliptic.org/EFD/g1p/auto-code/shortw/jacobian-0/doubling/dbl-2009-l.op3
-	A := (&gfP2{}).Square(&a.x)
-	B := (&gfP2{}).Square(&a.y)
-	C := (&gfP2{}).Square(B)
+	A := (&gfP2{}).SquareNC(&a.x)
+	B := (&gfP2{}).SquareNC(&a.y)
+	C := (&gfP2{}).SquareNC(B)
 
 	t := (&gfP2{}).Add(&a.x, B)
-	t2 := (&gfP2{}).Square(t)
+	t2 := (&gfP2{}).SquareNC(t)
 	t.Sub(t2, A)
 	t2.Sub(t, C)
 	d := (&gfP2{}).Add(t2, t2)
 	t.Add(A, A)
 	e := (&gfP2{}).Add(t, A)
-	f := (&gfP2{}).Square(e)
+	f := (&gfP2{}).SquareNC(e)
 
 	t.Add(d, d)
 	c.x.Sub(f, t)
@@ -201,8 +201,8 @@ func (c *twistPoint) MakeAffine() {
 	}
 
 	zInv := (&gfP2{}).Invert(&c.z)
-	t := (&gfP2{}).Mul(&c.y, zInv)
-	zInv2 := (&gfP2{}).Square(zInv)
+	t := (&gfP2{}).MulNC(&c.y, zInv)
+	zInv2 := (&gfP2{}).SquareNC(zInv)
 	c.y.Mul(t, zInv2)
 	t.Mul(&c.x, zInv2)
 	c.x.Set(t)