From c7b3d97304eeef82124e430d9fd88b733594a1db Mon Sep 17 00:00:00 2001
From: Sun Yimin <emmansun@users.noreply.github.com>
Date: Fri, 30 Jun 2023 17:51:35 +0800
Subject: [PATCH] sm9/bn256: value copy acceleration #136

---
 sm9/bn256/bn_pair.go        |  15 ++-
 sm9/bn256/generate.go       |  45 +++++++++
 sm9/bn256/gfp.go            |   6 +-
 sm9/bn256/gfp12.go          | 121 ++++++++++++++++-------
 sm9/bn256/gfp12_exp_u.go    |  43 ++++++++
 sm9/bn256/gfp12_test.go     |  42 ++++++++
 sm9/bn256/gfp2.go           |  62 ++++++------
 sm9/bn256/gfp4.go           |  75 ++++++++------
 sm9/bn256/select_amd64.s    | 192 ++++++++++++++++++++++++++++++++++++
 sm9/bn256/select_arm64.s    |  62 ++++++++++++
 sm9/bn256/select_decl.go    |  12 +++
 sm9/bn256/select_generic.go |  36 +++++++
 sm9/bn256/select_test.go    |  54 ++++++++++
 13 files changed, 657 insertions(+), 108 deletions(-)
 create mode 100644 sm9/bn256/gfp12_exp_u.go
 create mode 100644 sm9/bn256/select_test.go

diff --git a/sm9/bn256/bn_pair.go b/sm9/bn256/bn_pair.go
index 90a9e4d..acc3ef8 100644
--- a/sm9/bn256/bn_pair.go
+++ b/sm9/bn256/bn_pair.go
@@ -95,8 +95,8 @@ func lineFunctionDouble(r *twistPoint, q *curvePoint) (a, b, c *gfP2, rOut *twis
 // (ret.z + ret.y*w + ret.x*w^2)* ((cv+a) + b*w^2)
 func mulLine(ret *gfP12, a, b, c *gfP2) {
 	t1, tz, t, bz := &gfP4{}, &gfP4{}, &gfP4{}, &gfP4{}
-	bz.x.Set(c)
-	bz.y.Set(a)
+	gfp2Copy(&bz.x, c)
+	gfp2Copy(&bz.y, a)
 
 	tz.Mul(&ret.z, bz)
 	t.MulScalar(&ret.y, b).MulV1(t)
@@ -109,17 +109,14 @@ func mulLine(ret *gfP12, a, b, c *gfP2) {
 	t.Mul(&ret.x, bz)
 	t1.MulScalar(&ret.z, b)
 	ret.x.Add(t1, t)
-
-	ret.z.Set(tz)
+	gfp4Copy(&ret.z, tz)
 }
 
-//
 // R-ate Pairing G2 x G1 -> GT
 //
 // P is a point of order q in G1. Q(x,y) is a point of order q in G2.
 // Note that Q is a point on the sextic twist of the curve over Fp^2, P(x,y) is a point on the
 // curve over the base field Fp
-//
 func miller(q *twistPoint, p *curvePoint) *gfP12 {
 	ret := (&gfP12{}).SetOne()
 
@@ -218,9 +215,9 @@ func finalExponentiation(in *gfP12) *gfP12 {
 	fp2 := (&gfP12{}).FrobeniusP2(t1)
 	fp3 := (&gfP12{}).Frobenius(fp2)
 
-	fu := (&gfP12{}).Exp(t1, u)
-	fu2 := (&gfP12{}).Exp(fu, u)
-	fu3 := (&gfP12{}).Exp(fu2, u)
+	fu := (&gfP12{}).gfP12ExpU(t1)
+	fu2 := (&gfP12{}).gfP12ExpU(fu)
+	fu3 := (&gfP12{}).gfP12ExpU(fu2)
 
 	y3 := (&gfP12{}).Frobenius(fu)
 	fu2p := (&gfP12{}).Frobenius(fu2)
diff --git a/sm9/bn256/generate.go b/sm9/bn256/generate.go
index ac5b1c7..145cb20 100644
--- a/sm9/bn256/generate.go
+++ b/sm9/bn256/generate.go
@@ -97,6 +97,14 @@ func main() {
 	if err = writeFile("gfp2_sqrt.go", out, out1, out2); err != nil {
 		log.Fatal(err)
 	}
+
+	out, err = generate(tmplAddchainExp12, "0x600000000058f98a", "gfP12")
+	if err != nil {
+		log.Fatal(err)
+	}
+	if err = writeFile("gfp12_exp_u.go", out); err != nil {
+		log.Fatal(err)
+	}
 }
 
 const tmplAddchainExp1 = `// Code generated by {{ .Meta.Name }}. DO NOT EDIT.
@@ -300,3 +308,40 @@ func sqrtCandidate(z, x *Element) {
 	{{- end }}
 }
 `
+
+const tmplAddchainExp12 = `// Code generated by {{ .Meta.Name }}. DO NOT EDIT.
+package bn256
+
+func (e *Element) gfP12ExpU(x *Element) *Element {
+	// The sequence of {{ .Ops.Adds }} multiplications and {{ .Ops.Doubles }} squarings is derived from the
+	// following addition chain generated with {{ .Meta.Module }} {{ .Meta.ReleaseTag }}.
+	//
+	{{- range lines (format .Script) }}
+	//	{{ . }}
+	{{- end }}
+	//
+	var z = new(Element).Set(e)
+	{{- range .Program.Temporaries }}
+	var {{ . }} = new(Element)
+	{{- end }}
+	{{ range $i := .Program.Instructions -}}
+	{{- with add $i.Op }}
+	{{ $i.Output }}.Mul({{ .X }}, {{ .Y }})
+	{{- end -}}
+	{{- with double $i.Op }}
+	{{ $i.Output }}.Square({{ .X }})
+	{{- end -}}
+	{{- with shift $i.Op -}}
+	{{- $first := 0 -}}
+	{{- if ne $i.Output.Identifier .X.Identifier }}
+	{{ $i.Output }}.Square({{ .X }})
+	{{- $first = 1 -}}
+	{{- end }}
+	for s := {{ $first }}; s < {{ .S }}; s++ {
+		{{ $i.Output }}.Square({{ $i.Output }})
+	}
+	{{- end -}}
+	{{- end }}
+	return e.Set(z)
+}
+`
diff --git a/sm9/bn256/gfp.go b/sm9/bn256/gfp.go
index 2a56759..f834a07 100644
--- a/sm9/bn256/gfp.go
+++ b/sm9/bn256/gfp.go
@@ -60,11 +60,7 @@ func (e *gfP) String() string {
 }
 
 func (e *gfP) Set(f *gfP) *gfP {
-	e[0] = f[0]
-	e[1] = f[1]
-	e[2] = f[2]
-	e[3] = f[3]
-
+	gfpCopy(e, f)
 	return e
 }
 
diff --git a/sm9/bn256/gfp12.go b/sm9/bn256/gfp12.go
index ea5f1b4..fbe1533 100644
--- a/sm9/bn256/gfp12.go
+++ b/sm9/bn256/gfp12.go
@@ -58,9 +58,7 @@ func (e *gfP12) String() string {
 }
 
 func (e *gfP12) Set(a *gfP12) *gfP12 {
-	e.x.Set(&a.x)
-	e.y.Set(&a.y)
-	e.z.Set(&a.z)
+	gfp12Copy(e, a)
 	return e
 }
 
@@ -141,7 +139,11 @@ func (e *gfP12) Mul(a, b *gfP12) *gfP12 {
 	// +y0*z1*w + y0*y1*w^2 + y0*x1*v
 	// +x0*z1*w^2 + x0*y1*v + x0*x1*v*w
 	//=(z0*z1+y0*x1*v+x0*y1*v) + (z0*y1+y0*z1+x0*x1*v)w + (z0*x1 + y0*y1 + x0*z1)*w^2
-	tx, ty, tz, t, v0, v1, v2 := &gfP4{}, &gfP4{}, &gfP4{}, &gfP4{}, &gfP4{}, &gfP4{}, &gfP4{}
+	tmp := &gfP12{}
+	tx := &tmp.x
+	ty := &tmp.y
+	tz := &tmp.z
+	t, v0, v1, v2 := &gfP4{}, &gfP4{}, &gfP4{}, &gfP4{}
 	v0.Mul(&a.z, &b.z)
 	v1.Mul(&a.y, &b.y)
 	v2.Mul(&a.x, &b.x)
@@ -168,10 +170,7 @@ func (e *gfP12) Mul(a, b *gfP12) *gfP12 {
 	tx.Sub(tx, v0)
 	tx.Add(tx, v1)
 	tx.Sub(tx, v2)
-
-	e.x.Set(tx)
-	e.y.Set(ty)
-	e.z.Set(tz)
+	gfp12Copy(e, tmp)
 	return e
 }
 
@@ -180,7 +179,37 @@ func (e *gfP12) Square(a *gfP12) *gfP12 {
 	// z^2 + z*y*w + z*x*w^2 + y*z*w + y^2*w^2 + y*x*v + x*z*w^2 + x*y*v + x^2 *v *w
 	// (z^2 + y*x*v + x*y*v) + (z*y + y*z + v * x^2)w + (z*x + y^2 + x*z)*w^2
 	// (z^2 + 2*x*y*v) + (v*x^2 + 2*y*z) *w + (y^2 + 2*x*z) * w^2
-	tx, ty, tz, t := &gfP4{}, &gfP4{}, &gfP4{}, &gfP4{}
+	tmp := &gfP12{}
+	tx := &tmp.x
+	ty := &tmp.y
+	tz := &tmp.z
+	t := &gfP4{}
+
+	tz.Square(&a.z)
+	t.MulV(&a.x, &a.y)
+	t.Add(t, t)
+	tz.Add(tz, t)
+
+	ty.SquareV(&a.x)
+	t.Mul(&a.y, &a.z)
+	t.Add(t, t)
+	ty.Add(ty, t)
+
+	tx.Square(&a.y)
+	t.Mul(&a.x, &a.z)
+	t.Add(t, t)
+	tx.Add(tx, t)
+	gfp12Copy(e, tmp)
+	return e
+}
+
+func (e *gfP12) Squares(a *gfP12, n int) *gfP12 {
+	// Square first round
+	in := &gfP12{}
+	tx := &in.x
+	ty := &in.y
+	tz := &in.z
+	t := &gfP4{}
 
 	tz.Square(&a.z)
 	t.MulV(&a.x, &a.y)
@@ -197,9 +226,36 @@ func (e *gfP12) Square(a *gfP12) *gfP12 {
 	t.Add(t, t)
 	tx.Add(tx, t)
 
-	e.x.Set(tx)
-	e.y.Set(ty)
-	e.z.Set(tz)
+	tmp := &gfP12{}
+	var tmp2 *gfP12
+	tx = &tmp.x
+	ty = &tmp.y
+	tz = &tmp.z
+	for i := 1; i < n; i++ {
+		tz.Square(&in.z)
+		t.MulV(&in.x, &in.y)
+		t.Add(t, t)
+		tz.Add(tz, t)
+
+		ty.SquareV(&in.x)
+		t.Mul(&in.y, &in.z)
+		t.Add(t, t)
+		ty.Add(ty, t)
+
+		tx.Square(&in.y)
+		t.Mul(&in.x, &in.z)
+		t.Add(t, t)
+		tx.Add(tx, t)
+
+		// Switch references
+		tmp2 = in
+		in = tmp
+		tmp = tmp2
+		tx = &tmp.x
+		ty = &tmp.y
+		tz = &tmp.z
+	}
+	gfp12Copy(e, in)
 	return e
 }
 
@@ -215,8 +271,7 @@ func (e *gfP12) Exp(f *gfP12, power *big.Int) *gfP12 {
 			sum.Set(t)
 		}
 	}
-
-	e.Set(sum)
+	gfp12Copy(e, sum)
 	return e
 }
 
@@ -268,38 +323,39 @@ func (e *gfP12) Neg(a *gfP12) *gfP12 {
 }
 
 // (z + y*w + x*w^2)^p
-//= z^p + y^p*w*w^(p-1)+x^p*w^2*(w^2)^(p-1)
+// = z^p + y^p*w*w^(p-1)+x^p*w^2*(w^2)^(p-1)
 // w2ToP2Minus1 = vToPMinus1 * wToPMinus1
 func (e *gfP12) Frobenius(a *gfP12) *gfP12 {
-	x, y := &gfP2{}, &gfP2{}
+	tmp := &gfP4{}
+	x := &tmp.x
+	y := &tmp.y
 
 	x.Conjugate(&a.z.x)
 	y.Conjugate(&a.z.y)
 	x.MulScalar(x, vToPMinus1)
-	e.z.x.Set(x)
-	e.z.y.Set(y)
+	gfp4Copy(&e.z, tmp)
 
 	x.Conjugate(&a.y.x)
 	y.Conjugate(&a.y.y)
 	x.MulScalar(x, w2ToP2Minus1)
 	y.MulScalar(y, wToPMinus1)
-	e.y.x.Set(x)
-	e.y.y.Set(y)
+	gfp4Copy(&e.y, tmp)
 
 	x.Conjugate(&a.x.x)
 	y.Conjugate(&a.x.y)
 	x.MulScalar(x, vToPMinus1Mw2ToPMinus1)
 	y.MulScalar(y, w2ToPMinus1)
-	e.x.x.Set(x)
-	e.x.y.Set(y)
+	gfp4Copy(&e.x, tmp)
 
 	return e
 }
 
 // (z + y*w + x*w^2)^(p^2)
-//= z^(p^2) + y^(p^2)*w*w^((p^2)-1)+x^(p^2)*w^2*(w^2)^((p^2)-1)
+// = z^(p^2) + y^(p^2)*w*w^((p^2)-1)+x^(p^2)*w^2*(w^2)^((p^2)-1)
 func (e *gfP12) FrobeniusP2(a *gfP12) *gfP12 {
-	tx, ty, tz := &gfP4{}, &gfP4{}, &gfP4{}
+	tx := &e.x
+	ty := &e.y
+	tz := &e.z
 
 	tz.Conjugate(&a.z)
 
@@ -308,17 +364,12 @@ func (e *gfP12) FrobeniusP2(a *gfP12) *gfP12 {
 
 	tx.Conjugate(&a.x)
 	tx.MulGFP(tx, w2ToP2Minus1)
-
-	e.x.Set(tx)
-	e.y.Set(ty)
-	e.z.Set(tz)
-
 	return e
 }
 
 // (z + y*w + x*w^2)^(p^3)
-//=z^(p^3) + y^(p^3)*w*w^((p^3)-1)+x^(p^3)*w^2*(w^2)^((p^3)-1)
-//=z^(p^3) + y^(p^3)*w*vToPMinus1-x^(p^3)*w^2
+// =z^(p^3) + y^(p^3)*w*w^((p^3)-1)+x^(p^3)*w^2*(w^2)^((p^3)-1)
+// =z^(p^3) + y^(p^3)*w*vToPMinus1-x^(p^3)*w^2
 // vToPMinus1 * vToPMinus1 = -1
 func (e *gfP12) FrobeniusP3(a *gfP12) *gfP12 {
 	x, y := &gfP2{}, &gfP2{}
@@ -352,7 +403,9 @@ func (e *gfP12) FrobeniusP3(a *gfP12) *gfP12 {
 // (z + y*w + x*w^2)^(p^6)
 // = ((z + y*w + x*w^2)^(p^3))^(p^3)
 func (e *gfP12) FrobeniusP6(a *gfP12) *gfP12 {
-	tx, ty, tz := &gfP4{}, &gfP4{}, &gfP4{}
+	tx := &e.x
+	ty := &e.y
+	tz := &e.z
 
 	tz.Conjugate(&a.z)
 
@@ -361,10 +414,6 @@ func (e *gfP12) FrobeniusP6(a *gfP12) *gfP12 {
 
 	tx.Conjugate(&a.x)
 
-	e.x.Set(tx)
-	e.y.Set(ty)
-	e.z.Set(tz)
-
 	return e
 }
 
diff --git a/sm9/bn256/gfp12_exp_u.go b/sm9/bn256/gfp12_exp_u.go
new file mode 100644
index 0000000..2f95620
--- /dev/null
+++ b/sm9/bn256/gfp12_exp_u.go
@@ -0,0 +1,43 @@
+package bn256
+
+func (e *gfP12) gfP12ExpU(x *gfP12) *gfP12 {
+	// The sequence of 10 multiplications and 61 squarings is derived from the
+	// following addition chain generated with github.com/mmcloughlin/addchain v0.4.0.
+	//
+	//	_10    = 2*1
+	//	_100   = 2*_10
+	//	_101   = 1 + _100
+	//	_1001  = _100 + _101
+	//	_1011  = _10 + _1001
+	//	_1100  = 1 + _1011
+	//	i56    = (_1100 << 40 + _1011) << 7 + _1011 + _100
+	//	i69    = (2*(i56 << 4 + _1001) + 1) << 6
+	//	return   2*(_101 + i69)
+	//
+	var z = new(gfP12)
+	var t0 = new(gfP12)
+	var t1 = new(gfP12)
+	var t2 = new(gfP12)
+	var t3 = new(gfP12)
+
+	t2.Square(x)       // _10
+	t1.Square(t2)      // _100
+	z.Mul(x, t1)       // _101
+	t0.Mul(t1, z)      // _1001
+	t2.Mul(t2, t0)     // _1011
+	t3.Mul(x, t2)      // _1100
+	t3.Squares(t3, 40) // _1100 << 40
+	t3.Mul(t2, t3)     // ... + _1011
+	t3.Squares(t3, 7)  // (...) << 7
+	t2.Mul(t2, t3)     // ... + _1011
+	t1.Mul(t1, t2)     // ... + _100 = i56
+	t1.Squares(t1, 4)  // i56 << 4
+	t0.Mul(t0, t1)     // ... + _1001
+	t0.Square(t0)      // 2*(...)
+	t0.Mul(x, t0)      // ... + 1
+	t0.Squares(t0, 6)  // (...) << 6 = i69
+	z.Mul(z, t0)       // _101 + i69
+	z.Square(z)        // 2*(_101 + i69) = exponent 0x600000000058f98a
+	gfp12Copy(e, z) // e is written only here, after the last read of x, so e may alias x
+	return e
+}
diff --git a/sm9/bn256/gfp12_test.go b/sm9/bn256/gfp12_test.go
index ef4efe7..bedd1c9 100644
--- a/sm9/bn256/gfp12_test.go
+++ b/sm9/bn256/gfp12_test.go
@@ -35,6 +35,20 @@ func Test_gfP12Square(t *testing.T) {
 	}
 }
 
+func BenchmarkGfP12Square(b *testing.B) {
+	x := &gfP12{
+		testdataP4,
+		testdataP4,
+		*(&gfP4{}).SetOne(),
+	}
+	x2 := &gfP12{}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		x2.Square(x)
+	}
+}
+
 func testGfP12Invert(t *testing.T, x *gfP12) {
 	xInv := &gfP12{}
 	xInv.Invert(x)
@@ -285,3 +299,31 @@ func BenchmarkGfP12Frobenius(b *testing.B) {
 		}
 	}
 }
+
+func BenchmarkGfP12ExpU(b *testing.B) {
+	x := &gfP12{
+		testdataP4,
+		testdataP4,
+		testdataP4,
+	}
+	got := &gfP12{}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		got.gfP12ExpU(x)
+	}
+}
+
+func BenchmarkGfP12ExpU2(b *testing.B) {
+	x := &gfP12{
+		testdataP4,
+		testdataP4,
+		testdataP4,
+	}
+	got := &gfP12{}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		got.Exp(x, u)
+	}
+}
diff --git a/sm9/bn256/gfp2.go b/sm9/bn256/gfp2.go
index 40034b4..e89d19d 100644
--- a/sm9/bn256/gfp2.go
+++ b/sm9/bn256/gfp2.go
@@ -26,8 +26,7 @@ func (e *gfP2) String() string {
 }
 
 func (e *gfP2) Set(a *gfP2) *gfP2 {
-	e.x.Set(&a.x)
-	e.y.Set(&a.y)
+	gfp2Copy(e, a)
 	return e
 }
 
@@ -105,11 +104,14 @@ func (e *gfP2) Triple(a *gfP2) *gfP2 {
 // See "Multiplication and Squaring in Pairing-Friendly Fields",
 // http://eprint.iacr.org/2006/471.pdf
 // The Karatsuba method
-//(a0+a1*u)(b0+b1*u)=c0+c1*u, where
-//c0 = a0*b0 - 2a1*b1
-//c1 = (a0 + a1)(b0 + b1) - a0*b0 - a1*b1 = a0*b1 + a1*b0
+// (a0+a1*u)(b0+b1*u)=c0+c1*u, where
+// c0 = a0*b0 - 2a1*b1
+// c1 = (a0 + a1)(b0 + b1) - a0*b0 - a1*b1 = a0*b1 + a1*b0
 func (e *gfP2) Mul(a, b *gfP2) *gfP2 {
-	tx, ty, v0, v1 := &gfP{}, &gfP{}, &gfP{}, &gfP{}
+	tmp := &gfP2{}
+	tx := &tmp.x
+	ty := &tmp.y
+	v0, v1 := &gfP{}, &gfP{}
 
 	gfpMul(v0, &a.y, &b.y)
 	gfpMul(v1, &a.x, &b.x)
@@ -123,17 +125,19 @@ func (e *gfP2) Mul(a, b *gfP2) *gfP2 {
 	gfpSub(ty, v0, v1)
 	gfpSub(ty, ty, v1)
 
-	e.x.Set(tx)
-	e.y.Set(ty)
+	gfp2Copy(e, tmp)
 	return e
 }
 
 // MulU: a * b * u
-//(a0+a1*u)(b0+b1*u)*u=c0+c1*u, where
-//c1 = (a0*b0 - 2a1*b1)u
-//c0 = -2 * ((a0 + a1)(b0 + b1) - a0*b0 - a1*b1) = -2 * (a0*b1 + a1*b0)
+// (a0+a1*u)(b0+b1*u)*u=c0+c1*u, where
+// c1 = (a0*b0 - 2a1*b1)u
+// c0 = -2 * ((a0 + a1)(b0 + b1) - a0*b0 - a1*b1) = -2 * (a0*b1 + a1*b0)
 func (e *gfP2) MulU(a, b *gfP2) *gfP2 {
-	tx, ty, v0, v1 := &gfP{}, &gfP{}, &gfP{}, &gfP{}
+	tmp := &gfP2{}
+	tx := &tmp.x
+	ty := &tmp.y
+	v0, v1 := &gfP{}, &gfP{}
 
 	gfpMul(v0, &a.y, &b.y)
 	gfpMul(v1, &a.x, &b.x)
@@ -150,29 +154,30 @@ func (e *gfP2) MulU(a, b *gfP2) *gfP2 {
 	gfpSub(tx, v0, v1)
 	gfpSub(tx, tx, v1)
 
-	e.x.Set(tx)
-	e.y.Set(ty)
+	gfp2Copy(e, tmp)
 	return e
 }
 
 // MulU1: a  * u
-//(a0+a1*u)u=c0+c1*u, where
-//c1 = a0
-//c0 = -2a1
+// (a0+a1*u)u=c0+c1*u, where
+// c1 = a0
+// c0 = -2a1
 func (e *gfP2) MulU1(a *gfP2) *gfP2 {
 	t := &gfP{}
 	gfpAdd(t, &a.x, &a.x)
 	gfpNeg(t, t)
 
-	e.x.Set(&a.y)
-	e.y.Set(t)
+	gfpCopy(&e.x, &a.y)
+	gfpCopy(&e.y, t)
 	return e
 }
 
 func (e *gfP2) Square(a *gfP2) *gfP2 {
 	// Complex squaring algorithm:
 	// (xu+y)² = y^2-2*x^2 + 2*u*x*y
-	tx, ty := &gfP{}, &gfP{}
+	tmp := &gfP2{}
+	tx := &tmp.x
+	ty := &tmp.y
 	gfpSqr(tx, &a.x, 1)
 	gfpSqr(ty, &a.y, 1)
 	gfpSub(ty, ty, tx)
@@ -180,9 +185,7 @@ func (e *gfP2) Square(a *gfP2) *gfP2 {
 
 	gfpMul(tx, &a.x, &a.y)
 	gfpAdd(tx, tx, tx)
-
-	e.x.Set(tx)
-	e.y.Set(ty)
+	gfp2Copy(e, tmp)
 	return e
 }
 
@@ -190,7 +193,9 @@ func (e *gfP2) SquareU(a *gfP2) *gfP2 {
 	// Complex squaring algorithm:
 	// (xu+y)²*u = (y^2-2*x^2)u - 4*x*y
 
-	tx, ty := &gfP{}, &gfP{}
+	tmp := &gfP2{}
+	tx := &tmp.x
+	ty := &tmp.y
 	// tx = a0^2 - 2 * a1^2
 	gfpSqr(ty, &a.x, 1)
 	gfpSqr(tx, &a.y, 1)
@@ -203,8 +208,7 @@ func (e *gfP2) SquareU(a *gfP2) *gfP2 {
 	gfpAdd(ty, ty, ty)
 	gfpNeg(ty, ty)
 
-	e.x.Set(tx)
-	e.y.Set(ty)
+	gfp2Copy(e, tmp)
 	return e
 }
 
@@ -251,8 +255,10 @@ func (e *gfP2) Exp(f *gfP2, power *big.Int) *gfP2 {
 }
 
 // （xu+y)^p = x * u^p + y
-//  = x * u * u^(p-1) + y
-//  = (-x)*u + y
+//
+//	= x * u * u^(p-1) + y
+//	= (-x)*u + y
+//
 // here u^(p-1) = -1
 func (e *gfP2) Frobenius(a *gfP2) *gfP2 {
 	e.Conjugate(a)
diff --git a/sm9/bn256/gfp4.go b/sm9/bn256/gfp4.go
index 9a00444..af04808 100644
--- a/sm9/bn256/gfp4.go
+++ b/sm9/bn256/gfp4.go
@@ -25,8 +25,7 @@ func (e *gfP4) String() string {
 }
 
 func (e *gfP4) Set(a *gfP4) *gfP4 {
-	e.x.Set(&a.x)
-	e.y.Set(&a.y)
+	gfp4Copy(e, a)
 	return e
 }
 
@@ -99,7 +98,10 @@ func (e *gfP4) Mul(a, b *gfP4) *gfP4 {
 	//(a0+a1*v)(b0+b1*v)=c0+c1*v, where
 	//c0 = a0*b0 +a1*b1*u
 	//c1 = (a0 + a1)(b0 + b1) - a0*b0 - a1*b1 = a0*b1 + a1*b0
-	tx, ty, v0, v1 := &gfP2{}, &gfP2{}, &gfP2{}, &gfP2{}
+	tmp := &gfP4{}
+	tx := &tmp.x
+	ty := &tmp.y
+	v0, v1 := &gfP2{}, &gfP2{}
 	v0.Mul(&a.y, &b.y)
 	v1.Mul(&a.x, &b.x)
 
@@ -112,19 +114,21 @@ func (e *gfP4) Mul(a, b *gfP4) *gfP4 {
 	ty.MulU1(v1)
 	ty.Add(ty, v0)
 
-	e.x.Set(tx)
-	e.y.Set(ty)
+	gfp4Copy(e, tmp)
 	return e
 }
 
 // MulV: a * b * v
-//(a0+a1*v)(b0+b1*v)*v=c0+c1*v, where
+// (a0+a1*v)(b0+b1*v)*v=c0+c1*v, where
 // (a0*b0 + a0*b1v + a1*b0*v + a1*b1*u)*v
 // a0*b0*v + a0*b1*u + a1*b0*u + a1*b1*u*v
 // c0 = a0*b1*u + a1*b0*u
 // c1 = a0*b0 + a1*b1*u
 func (e *gfP4) MulV(a, b *gfP4) *gfP4 {
-	tx, ty, v0, v1 := &gfP2{}, &gfP2{}, &gfP2{}, &gfP2{}
+	tmp := &gfP4{}
+	tx := &tmp.x
+	ty := &tmp.y
+	v0, v1 := &gfP2{}, &gfP2{}
 	v0.Mul(&a.y, &b.y)
 	v1.Mul(&a.x, &b.x)
 
@@ -138,27 +142,29 @@ func (e *gfP4) MulV(a, b *gfP4) *gfP4 {
 	tx.MulU1(v1)
 	tx.Add(tx, v0)
 
-	e.x.Set(tx)
-	e.y.Set(ty)
+	gfp4Copy(e, tmp)
 	return e
 }
 
 // MulV1: a * v
-//(a0+a1*v)*v=c0+c1*v, where
+// (a0+a1*v)*v=c0+c1*v, where
 // c0 = a1*u
 // c1 = a0
 func (e *gfP4) MulV1(a *gfP4) *gfP4 {
-	tx := (&gfP2{}).Set(&a.y)
+	tx := &gfP2{}
+	gfp2Copy(tx, &a.y)
 
 	e.y.MulU1(&a.x)
-	e.x.Set(tx)
+	gfp2Copy(&e.x, tx)
 	return e
 }
 
 func (e *gfP4) Square(a *gfP4) *gfP4 {
 	// Complex squaring algorithm:
 	// (xv+y)² = (x^2*u + y^2) + 2*x*y*v
-	tx, ty := &gfP2{}, &gfP2{}
+	tmp := &gfP4{}
+	tx := &tmp.x
+	ty := &tmp.y
 	tx.SquareU(&a.x)
 	ty.Square(&a.y)
 	ty.Add(tx, ty)
@@ -166,15 +172,16 @@ func (e *gfP4) Square(a *gfP4) *gfP4 {
 	tx.Mul(&a.x, &a.y)
 	tx.Add(tx, tx)
 
-	e.x.Set(tx)
-	e.y.Set(ty)
+	gfp4Copy(e, tmp)
 	return e
 }
 
 // SquareV: (a^2) * v
 // v*(xv+y)² = (x^2*u + y^2)v + 2*x*y*u
 func (e *gfP4) SquareV(a *gfP4) *gfP4 {
-	tx, ty := &gfP2{}, &gfP2{}
+	tmp := &gfP4{}
+	tx := &tmp.x
+	ty := &tmp.y
 	tx.SquareU(&a.x)
 	ty.Square(&a.y)
 	tx.Add(tx, ty)
@@ -182,15 +189,18 @@ func (e *gfP4) SquareV(a *gfP4) *gfP4 {
 	ty.MulU(&a.x, &a.y)
 	ty.Add(ty, ty)
 
-	e.x.Set(tx)
-	e.y.Set(ty)
+	gfp4Copy(e, tmp)
 	return e
 }
 
 func (e *gfP4) Invert(a *gfP4) *gfP4 {
 	// See "Implementing cryptographic pairings", M. Scott, section 3.2.
 	// ftp://136.206.11.249/pub/crypto/pairings.pdf
-	t1, t2, t3 := &gfP2{}, &gfP2{}, &gfP2{}
+	tmp := &gfP4{}
+	t2 := &tmp.x
+	t1 := &tmp.y
+
+	t3 := &gfP2{}
 
 	t3.SquareU(&a.x)
 	t1.Square(&a.y)
@@ -202,8 +212,7 @@ func (e *gfP4) Invert(a *gfP4) *gfP4 {
 
 	t2.Mul(&a.x, t3)
 
-	e.x.Set(t2)
-	e.y.Set(t1)
+	gfp4Copy(e, tmp)
 	return e
 }
 
@@ -224,40 +233,46 @@ func (e *gfP4) Exp(f *gfP4, power *big.Int) *gfP4 {
 	return e
 }
 
-//  (y+x*v)^p
+//	(y+x*v)^p
+//
 // = y^p + x^p*v^p
 // = f(y) + f(x) * v^p
 // = f(y) + f(x) * v * v^(p-1)
 func (e *gfP4) Frobenius(a *gfP4) *gfP4 {
-	x, y := &gfP2{}, &gfP2{}
+	tmp := &gfP4{}
+	x := &tmp.x
+	y := &tmp.y
+
 	x.Conjugate(&a.x)
 	y.Conjugate(&a.y)
 	x.MulScalar(x, vToPMinus1)
 
-	e.x.Set(x)
-	e.y.Set(y)
+	gfp4Copy(e, tmp) // x and y point into tmp, so one copy publishes both halves
 
 	return e
 }
 
-//  (y+x*v)^(p^2)
+//	(y+x*v)^(p^2)
+//
 // y + x*v * v^(p^2-1)
 func (e *gfP4) FrobeniusP2(a *gfP4) *gfP4 {
 	e.Conjugate(a)
 	return e
 }
 
-//  (y+x*v)^(p^3)
+//	(y+x*v)^(p^3)
+//
 // = ((y+x*v)^p)^(p^2)
 func (e *gfP4) FrobeniusP3(a *gfP4) *gfP4 {
-	x, y := &gfP2{}, &gfP2{}
+	tmp := &gfP4{}
+	x := &tmp.x
+	y := &tmp.y
 	x.Conjugate(&a.x)
 	y.Conjugate(&a.y)
 	x.MulScalar(x, vToPMinus1)
 	x.Neg(x)
 
-	e.x.Set(x)
-	e.y.Set(y)
+	gfp4Copy(e, tmp)
 
 	return e
 }
diff --git a/sm9/bn256/select_amd64.s b/sm9/bn256/select_amd64.s
index 4d4a69d..e7584f4 100644
--- a/sm9/bn256/select_amd64.s
+++ b/sm9/bn256/select_amd64.s
@@ -7,6 +7,198 @@
 #define x_ptr SI
 #define y_ptr CX
 
+// func gfpCopy(res, in *gfP)
+TEXT ·gfpCopy(SB),NOSPLIT,$0
+	MOVQ res+0(FP), res_ptr
+	MOVQ in+8(FP), x_ptr // frame name matches the Go declaration (res, in)
+
+	CMPB ·supportAVX2+0(SB), $0x01
+	JEQ  copygfp_avx2
+
+	MOVOU (16*0)(x_ptr), X0 // SSE path: 32 bytes as two unaligned 16-byte moves
+	MOVOU (16*1)(x_ptr), X1
+
+	MOVOU X0, (16*0)(res_ptr)
+	MOVOU X1, (16*1)(res_ptr)
+	RET // SSE copy is complete; must not fall through into the AVX2 code below
+copygfp_avx2:
+	VMOVDQU (x_ptr), Y0 // AVX2 path: one unaligned 32-byte move
+	VMOVDQU Y0, (res_ptr)
+	VZEROUPPER
+	RET
+
+// func gfp2Copy(res, in *gfP2)
+TEXT ·gfp2Copy(SB),NOSPLIT,$0
+	MOVQ res+0(FP), res_ptr
+	MOVQ in+8(FP), x_ptr // frame name matches the Go declaration (res, in)
+
+	CMPB ·supportAVX2+0(SB), $0x01
+	JEQ  copygfp2_avx2
+
+	MOVOU (16*0)(x_ptr), X0 // SSE path: 64 bytes as four unaligned 16-byte moves
+	MOVOU (16*1)(x_ptr), X1
+	MOVOU (16*2)(x_ptr), X2
+	MOVOU (16*3)(x_ptr), X3
+
+	MOVOU X0, (16*0)(res_ptr)
+	MOVOU X1, (16*1)(res_ptr)
+	MOVOU X2, (16*2)(res_ptr)
+	MOVOU X3, (16*3)(res_ptr)
+	RET // SSE copy is complete; must not fall through into the AVX2 code below
+copygfp2_avx2:
+	VMOVDQU (32*0)(x_ptr), Y0 // AVX2 path: 64 bytes as two unaligned 32-byte moves
+	VMOVDQU (32*1)(x_ptr), Y1
+
+	VMOVDQU Y0, (32*0)(res_ptr)
+	VMOVDQU Y1, (32*1)(res_ptr)
+
+	VZEROUPPER
+	RET
+
+// func gfp4Copy(res, in *gfP4)
+TEXT ·gfp4Copy(SB),NOSPLIT,$0
+	MOVQ res+0(FP), res_ptr
+	MOVQ in+8(FP), x_ptr // frame name matches the Go declaration (res, in)
+
+	CMPB ·supportAVX2+0(SB), $0x01
+	JEQ  copygfp4_avx2
+
+	MOVOU (16*0)(x_ptr), X0 // SSE path: 128 bytes as eight unaligned 16-byte moves
+	MOVOU (16*1)(x_ptr), X1
+	MOVOU (16*2)(x_ptr), X2
+	MOVOU (16*3)(x_ptr), X3
+
+	MOVOU (16*4)(x_ptr), X4
+	MOVOU (16*5)(x_ptr), X5
+	MOVOU (16*6)(x_ptr), X6
+	MOVOU (16*7)(x_ptr), X7
+
+	MOVOU X0, (16*0)(res_ptr)
+	MOVOU X1, (16*1)(res_ptr)
+	MOVOU X2, (16*2)(res_ptr)
+	MOVOU X3, (16*3)(res_ptr)
+
+	MOVOU X4, (16*4)(res_ptr)
+	MOVOU X5, (16*5)(res_ptr)
+	MOVOU X6, (16*6)(res_ptr)
+	MOVOU X7, (16*7)(res_ptr)
+	RET // SSE copy is complete; must not fall through into the AVX2 code below
+copygfp4_avx2:
+	VMOVDQU (32*0)(x_ptr), Y0 // AVX2 path: 128 bytes as four unaligned 32-byte moves
+	VMOVDQU (32*1)(x_ptr), Y1
+	VMOVDQU (32*2)(x_ptr), Y2
+	VMOVDQU (32*3)(x_ptr), Y3
+
+	VMOVDQU Y0, (32*0)(res_ptr)
+	VMOVDQU Y1, (32*1)(res_ptr)
+	VMOVDQU Y2, (32*2)(res_ptr)
+	VMOVDQU Y3, (32*3)(res_ptr)
+
+	VZEROUPPER
+	RET
+
+// func gfp12Copy(res, in *gfP12)
+TEXT ·gfp12Copy(SB),NOSPLIT,$0
+	MOVQ res+0(FP), res_ptr
+	MOVQ in+8(FP), x_ptr // frame name matches the Go declaration (res, in)
+
+	CMPB ·supportAVX2+0(SB), $0x01
+	JEQ  copygfp12_avx2
+
+	MOVOU (16*0)(x_ptr), X0 // SSE path: 384 bytes in three rounds of eight 16-byte moves
+	MOVOU (16*1)(x_ptr), X1
+	MOVOU (16*2)(x_ptr), X2
+	MOVOU (16*3)(x_ptr), X3
+
+	MOVOU (16*4)(x_ptr), X4
+	MOVOU (16*5)(x_ptr), X5
+	MOVOU (16*6)(x_ptr), X6
+	MOVOU (16*7)(x_ptr), X7
+
+	MOVOU X0, (16*0)(res_ptr)
+	MOVOU X1, (16*1)(res_ptr)
+	MOVOU X2, (16*2)(res_ptr)
+	MOVOU X3, (16*3)(res_ptr)
+
+	MOVOU X4, (16*4)(res_ptr)
+	MOVOU X5, (16*5)(res_ptr)
+	MOVOU X6, (16*6)(res_ptr)
+	MOVOU X7, (16*7)(res_ptr)
+
+	MOVOU (16*8)(x_ptr), X0
+	MOVOU (16*9)(x_ptr), X1
+	MOVOU (16*10)(x_ptr), X2
+	MOVOU (16*11)(x_ptr), X3
+
+	MOVOU (16*12)(x_ptr), X4
+	MOVOU (16*13)(x_ptr), X5
+	MOVOU (16*14)(x_ptr), X6
+	MOVOU (16*15)(x_ptr), X7
+
+	MOVOU X0, (16*8)(res_ptr)
+	MOVOU X1, (16*9)(res_ptr)
+	MOVOU X2, (16*10)(res_ptr)
+	MOVOU X3, (16*11)(res_ptr)
+
+	MOVOU X4, (16*12)(res_ptr)
+	MOVOU X5, (16*13)(res_ptr)
+	MOVOU X6, (16*14)(res_ptr)
+	MOVOU X7, (16*15)(res_ptr)
+
+	MOVOU (16*16)(x_ptr), X0
+	MOVOU (16*17)(x_ptr), X1
+	MOVOU (16*18)(x_ptr), X2
+	MOVOU (16*19)(x_ptr), X3
+
+	MOVOU (16*20)(x_ptr), X4
+	MOVOU (16*21)(x_ptr), X5
+	MOVOU (16*22)(x_ptr), X6
+	MOVOU (16*23)(x_ptr), X7
+
+	MOVOU X0, (16*16)(res_ptr)
+	MOVOU X1, (16*17)(res_ptr)
+	MOVOU X2, (16*18)(res_ptr)
+	MOVOU X3, (16*19)(res_ptr)
+
+	MOVOU X4, (16*20)(res_ptr)
+	MOVOU X5, (16*21)(res_ptr)
+	MOVOU X6, (16*22)(res_ptr)
+	MOVOU X7, (16*23)(res_ptr)
+	RET // SSE copy is complete; must not fall through into the AVX2 code below
+copygfp12_avx2:
+	VMOVDQU (32*0)(x_ptr), Y0 // AVX2 path: 384 bytes as twelve unaligned 32-byte moves
+	VMOVDQU (32*1)(x_ptr), Y1
+	VMOVDQU (32*2)(x_ptr), Y2
+	VMOVDQU (32*3)(x_ptr), Y3
+
+	VMOVDQU (32*4)(x_ptr), Y4
+	VMOVDQU (32*5)(x_ptr), Y5
+	VMOVDQU (32*6)(x_ptr), Y6
+	VMOVDQU (32*7)(x_ptr), Y7
+
+	VMOVDQU (32*8)(x_ptr), Y8
+	VMOVDQU (32*9)(x_ptr), Y9
+	VMOVDQU (32*10)(x_ptr), Y10
+	VMOVDQU (32*11)(x_ptr), Y11
+
+	VMOVDQU Y0, (32*0)(res_ptr)
+	VMOVDQU Y1, (32*1)(res_ptr)
+	VMOVDQU Y2, (32*2)(res_ptr)
+	VMOVDQU Y3, (32*3)(res_ptr)
+
+	VMOVDQU Y4, (32*4)(res_ptr)
+	VMOVDQU Y5, (32*5)(res_ptr)
+	VMOVDQU Y6, (32*6)(res_ptr)
+	VMOVDQU Y7, (32*7)(res_ptr)
+
+	VMOVDQU Y8, (32*8)(res_ptr)
+	VMOVDQU Y9, (32*9)(res_ptr)
+	VMOVDQU Y10, (32*10)(res_ptr)
+	VMOVDQU Y11, (32*11)(res_ptr)
+
+	VZEROUPPER
+	RET
+
 // func gfP12MovCond(res, a, b *gfP12, cond int)
 TEXT ·gfP12MovCond(SB),NOSPLIT,$0
 	MOVQ res+0(FP), res_ptr
diff --git a/sm9/bn256/select_arm64.s b/sm9/bn256/select_arm64.s
index 519a382..dd1f7bb 100644
--- a/sm9/bn256/select_arm64.s
+++ b/sm9/bn256/select_arm64.s
@@ -7,6 +7,68 @@
 #define a_ptr R1
 #define b_ptr R2
 
+/* ---------------------------------------*/
+// func gfpCopy(res, in *gfP)
+TEXT ·gfpCopy(SB),NOSPLIT,$0
+	MOVD res+0(FP), res_ptr
+	MOVD in+8(FP), a_ptr // frame name matches the Go declaration (res, in)
+
+	VLD1	(a_ptr), [V0.B16, V1.B16] // two 16-byte vectors = 32 bytes
+	VST1	[V0.B16, V1.B16], (res_ptr)
+
+	RET
+
+/* ---------------------------------------*/
+// func gfp2Copy(res, in *gfP2)
+TEXT ·gfp2Copy(SB),NOSPLIT,$0
+	MOVD res+0(FP), res_ptr
+	MOVD in+8(FP), a_ptr // frame name matches the Go declaration (res, in)
+
+	VLD1	(a_ptr), [V0.B16, V1.B16, V2.B16, V3.B16] // four 16-byte vectors = 64 bytes
+	VST1	[V0.B16, V1.B16, V2.B16, V3.B16], (res_ptr)
+
+	RET
+
+/* ---------------------------------------*/
+// func gfp4Copy(res, in *gfP4)
+TEXT ·gfp4Copy(SB),NOSPLIT,$0
+	MOVD res+0(FP), res_ptr
+	MOVD in+8(FP), a_ptr // frame name matches the Go declaration (res, in)
+
+	VLD1.P	64(a_ptr), [V0.B16, V1.B16, V2.B16, V3.B16] // 64 bytes per round, both pointers post-incremented
+	VST1.P	[V0.B16, V1.B16, V2.B16, V3.B16], 64(res_ptr)
+
+	VLD1.P	64(a_ptr), [V0.B16, V1.B16, V2.B16, V3.B16] // second round: 128 bytes in total
+	VST1.P	[V0.B16, V1.B16, V2.B16, V3.B16], 64(res_ptr)
+
+	RET
+
+/* ---------------------------------------*/
+// func gfp12Copy(res, in *gfP12)
+TEXT ·gfp12Copy(SB),NOSPLIT,$0
+	MOVD res+0(FP), res_ptr
+	MOVD in+8(FP), a_ptr // frame name matches the Go declaration (res, in)
+
+	VLD1.P	64(a_ptr), [V0.B16, V1.B16, V2.B16, V3.B16] // six rounds of 64 bytes = 384 bytes, pointers post-incremented
+	VST1.P	[V0.B16, V1.B16, V2.B16, V3.B16], 64(res_ptr)
+
+	VLD1.P	64(a_ptr), [V0.B16, V1.B16, V2.B16, V3.B16]
+	VST1.P	[V0.B16, V1.B16, V2.B16, V3.B16], 64(res_ptr)
+
+	VLD1.P	64(a_ptr), [V0.B16, V1.B16, V2.B16, V3.B16]
+	VST1.P	[V0.B16, V1.B16, V2.B16, V3.B16], 64(res_ptr)
+
+	VLD1.P	64(a_ptr), [V0.B16, V1.B16, V2.B16, V3.B16]
+	VST1.P	[V0.B16, V1.B16, V2.B16, V3.B16], 64(res_ptr)
+
+	VLD1.P	64(a_ptr), [V0.B16, V1.B16, V2.B16, V3.B16]
+	VST1.P	[V0.B16, V1.B16, V2.B16, V3.B16], 64(res_ptr)
+
+	VLD1.P	64(a_ptr), [V0.B16, V1.B16, V2.B16, V3.B16]
+	VST1.P	[V0.B16, V1.B16, V2.B16, V3.B16], 64(res_ptr)
+
+	RET
+
 /* ---------------------------------------*/
 // func gfP12MovCond(res, a, b *gfP12, cond int)
 // If cond == 0 res=b, else res=a
diff --git a/sm9/bn256/select_decl.go b/sm9/bn256/select_decl.go
index 0d5c0da..acd9855 100644
--- a/sm9/bn256/select_decl.go
+++ b/sm9/bn256/select_decl.go
@@ -21,3 +21,15 @@ func curvePointMovCond(res, a, b *curvePoint, cond int)
 //
 //go:noescape
 func twistPointMovCond(res, a, b *twistPoint, cond int)
+
+//go:noescape
+func gfpCopy(res, in *gfP)
+
+//go:noescape
+func gfp2Copy(res, in *gfP2)
+
+//go:noescape
+func gfp4Copy(res, in *gfP4)
+
+//go:noescape
+func gfp12Copy(res, in *gfP12)
diff --git a/sm9/bn256/select_generic.go b/sm9/bn256/select_generic.go
index 7500455..1107bdf 100644
--- a/sm9/bn256/select_generic.go
+++ b/sm9/bn256/select_generic.go
@@ -14,3 +14,39 @@ func curvePointMovCond(res, a, b *curvePoint, cond int) {
 func twistPointMovCond(res, a, b *twistPoint, cond int) {
 	res.Select(a, b, cond)
 }
+
+func gfpCopy(res, in *gfP) {
+	// gfP is indexed through its pointer, so it is a fixed-size array; a
+	// single array assignment copies limbs 0..3 in one statement.
+	// NOTE(review): assumes gfP has exactly four limbs - confirm in gfp.go.
+	*res = *in
+}
+
+func gfp2Copy(res, in *gfP2) {
+	// Copy both coordinates; the right-hand side is read before any store.
+	res.x, res.y = in.x, in.y
+}
+
+func gfp4Copy(res, in *gfP4) {
+	// Copy the two gfP2 halves; gfp2Copy copies the x and y gfP of each
+	// half, so all four gfP values (x.x, x.y, y.x, y.y) are covered.
+	gfp2Copy(&res.x, &in.x)
+	gfp2Copy(&res.y, &in.y)
+}
+
+// gfp12Copy copies in to res. Each of the three gfP4 coefficients x, y and z
+// is copied as its two gfP2 halves, which covers all twelve gfP values in
+// x, y, z order.
+func gfp12Copy(res, in *gfP12) {
+	// x coefficient
+	gfp2Copy(&res.x.x, &in.x.x)
+	gfp2Copy(&res.x.y, &in.x.y)
+
+	// y coefficient
+	gfp2Copy(&res.y.x, &in.y.x)
+	gfp2Copy(&res.y.y, &in.y.y)
+
+	// z coefficient
+	gfp2Copy(&res.z.x, &in.z.x)
+	gfp2Copy(&res.z.y, &in.z.y)
+}
diff --git a/sm9/bn256/select_test.go b/sm9/bn256/select_test.go
new file mode 100644
index 0000000..c00f61e
--- /dev/null
+++ b/sm9/bn256/select_test.go
@@ -0,0 +1,54 @@
+package bn256
+
+import "testing"
+
+func BenchmarkGfP12Copy(b *testing.B) {
+	x := &gfP12{
+		testdataP4,
+		testdataP4,
+		testdataP4,
+	}
+	res := &gfP12{}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		gfp12Copy(res, x)
+	}
+}
+
+func gfpCopyForTest(res, in *gfP) {
+	res[0] = in[0]
+	res[1] = in[1]
+	res[2] = in[2]
+	res[3] = in[3]
+}
+
+func gfp2CopyForTest(res, in *gfP2) {
+	gfpCopyForTest(&res.x, &in.x)
+	gfpCopyForTest(&res.y, &in.y)
+}
+
+func gfp4CopyForTest(res, in *gfP4) {
+	gfp2CopyForTest(&res.x, &in.x)
+	gfp2CopyForTest(&res.y, &in.y)
+}
+
+func gfp12CopyForTest(res, in *gfP12) {
+	gfp4CopyForTest(&res.x, &in.x)
+	gfp4CopyForTest(&res.y, &in.y)
+	gfp4CopyForTest(&res.z, &in.z)
+}
+
+func BenchmarkGfP12Set(b *testing.B) {
+	x := &gfP12{
+		testdataP4,
+		testdataP4,
+		testdataP4,
+	}
+	res := &gfP12{}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		gfp12CopyForTest(res, x)
+	}
+}