From c7b3d97304eeef82124e430d9fd88b733594a1db Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Fri, 30 Jun 2023 17:51:35 +0800 Subject: [PATCH] sm9/bn256: value copy acceleration #136 --- sm9/bn256/bn_pair.go | 15 ++- sm9/bn256/generate.go | 45 +++++++++ sm9/bn256/gfp.go | 6 +- sm9/bn256/gfp12.go | 121 ++++++++++++++++------- sm9/bn256/gfp12_exp_u.go | 43 ++++++++ sm9/bn256/gfp12_test.go | 42 ++++++++ sm9/bn256/gfp2.go | 62 ++++++------ sm9/bn256/gfp4.go | 75 ++++++++------ sm9/bn256/select_amd64.s | 192 ++++++++++++++++++++++++++++++++++++ sm9/bn256/select_arm64.s | 62 ++++++++++++ sm9/bn256/select_decl.go | 12 +++ sm9/bn256/select_generic.go | 36 +++++++ sm9/bn256/select_test.go | 54 ++++++++++ 13 files changed, 657 insertions(+), 108 deletions(-) create mode 100644 sm9/bn256/gfp12_exp_u.go create mode 100644 sm9/bn256/select_test.go diff --git a/sm9/bn256/bn_pair.go b/sm9/bn256/bn_pair.go index 90a9e4d..acc3ef8 100644 --- a/sm9/bn256/bn_pair.go +++ b/sm9/bn256/bn_pair.go @@ -95,8 +95,8 @@ func lineFunctionDouble(r *twistPoint, q *curvePoint) (a, b, c *gfP2, rOut *twis // (ret.z + ret.y*w + ret.x*w^2)* ((cv+a) + b*w^2) func mulLine(ret *gfP12, a, b, c *gfP2) { t1, tz, t, bz := &gfP4{}, &gfP4{}, &gfP4{}, &gfP4{} - bz.x.Set(c) - bz.y.Set(a) + gfp2Copy(&bz.x, c) + gfp2Copy(&bz.y, a) tz.Mul(&ret.z, bz) t.MulScalar(&ret.y, b).MulV1(t) @@ -109,17 +109,14 @@ func mulLine(ret *gfP12, a, b, c *gfP2) { t.Mul(&ret.x, bz) t1.MulScalar(&ret.z, b) ret.x.Add(t1, t) - - ret.z.Set(tz) + gfp4Copy(&ret.z, tz) } -// // R-ate Pairing G2 x G1 -> GT // // P is a point of order q in G1. Q(x,y) is a point of order q in G2. 
// Note that Q is a point on the sextic twist of the curve over Fp^2, P(x,y) is a point on the // curve over the base field Fp -// func miller(q *twistPoint, p *curvePoint) *gfP12 { ret := (&gfP12{}).SetOne() @@ -218,9 +215,9 @@ func finalExponentiation(in *gfP12) *gfP12 { fp2 := (&gfP12{}).FrobeniusP2(t1) fp3 := (&gfP12{}).Frobenius(fp2) - fu := (&gfP12{}).Exp(t1, u) - fu2 := (&gfP12{}).Exp(fu, u) - fu3 := (&gfP12{}).Exp(fu2, u) + fu := (&gfP12{}).gfP12ExpU(t1) + fu2 := (&gfP12{}).gfP12ExpU(fu) + fu3 := (&gfP12{}).gfP12ExpU(fu2) y3 := (&gfP12{}).Frobenius(fu) fu2p := (&gfP12{}).Frobenius(fu2) diff --git a/sm9/bn256/generate.go b/sm9/bn256/generate.go index ac5b1c7..145cb20 100644 --- a/sm9/bn256/generate.go +++ b/sm9/bn256/generate.go @@ -97,6 +97,14 @@ func main() { if err = writeFile("gfp2_sqrt.go", out, out1, out2); err != nil { log.Fatal(err) } + + out, err = generate(tmplAddchainExp12, "0x600000000058f98a", "gfP12") + if err != nil { + log.Fatal(err) + } + if err = writeFile("gfp12_exp_u.go", out); err != nil { + log.Fatal(err) + } } const tmplAddchainExp1 = `// Code generated by {{ .Meta.Name }}. DO NOT EDIT. @@ -300,3 +308,40 @@ func sqrtCandidate(z, x *Element) { {{- end }} } ` + +const tmplAddchainExp12 = `// Code generated by {{ .Meta.Name }}. DO NOT EDIT. +package bn256 + +func (e *Element) gfP12ExpU(x *Element) *Element { + // The sequence of {{ .Ops.Adds }} multiplications and {{ .Ops.Doubles }} squarings is derived from the + // following addition chain generated with {{ .Meta.Module }} {{ .Meta.ReleaseTag }}. + // + {{- range lines (format .Script) }} + // {{ . }} + {{- end }} + // + var z = new(Element).Set(e) + {{- range .Program.Temporaries }} + var {{ . 
}} = new(Element) + {{- end }} + {{ range $i := .Program.Instructions -}} + {{- with add $i.Op }} + {{ $i.Output }}.Mul({{ .X }}, {{ .Y }}) + {{- end -}} + {{- with double $i.Op }} + {{ $i.Output }}.Square({{ .X }}) + {{- end -}} + {{- with shift $i.Op -}} + {{- $first := 0 -}} + {{- if ne $i.Output.Identifier .X.Identifier }} + {{ $i.Output }}.Square({{ .X }}) + {{- $first = 1 -}} + {{- end }} + for s := {{ $first }}; s < {{ .S }}; s++ { + {{ $i.Output }}.Square({{ $i.Output }}) + } + {{- end -}} + {{- end }} + return e.Set(z) +} +` diff --git a/sm9/bn256/gfp.go b/sm9/bn256/gfp.go index 2a56759..f834a07 100644 --- a/sm9/bn256/gfp.go +++ b/sm9/bn256/gfp.go @@ -60,11 +60,7 @@ func (e *gfP) String() string { } func (e *gfP) Set(f *gfP) *gfP { - e[0] = f[0] - e[1] = f[1] - e[2] = f[2] - e[3] = f[3] - + gfpCopy(e, f) return e } diff --git a/sm9/bn256/gfp12.go b/sm9/bn256/gfp12.go index ea5f1b4..fbe1533 100644 --- a/sm9/bn256/gfp12.go +++ b/sm9/bn256/gfp12.go @@ -58,9 +58,7 @@ func (e *gfP12) String() string { } func (e *gfP12) Set(a *gfP12) *gfP12 { - e.x.Set(&a.x) - e.y.Set(&a.y) - e.z.Set(&a.z) + gfp12Copy(e, a) return e } @@ -141,7 +139,11 @@ func (e *gfP12) Mul(a, b *gfP12) *gfP12 { // +y0*z1*w + y0*y1*w^2 + y0*x1*v // +x0*z1*w^2 + x0*y1*v + x0*x1*v*w //=(z0*z1+y0*x1*v+x0*y1*v) + (z0*y1+y0*z1+x0*x1*v)w + (z0*x1 + y0*y1 + x0*z1)*w^2 - tx, ty, tz, t, v0, v1, v2 := &gfP4{}, &gfP4{}, &gfP4{}, &gfP4{}, &gfP4{}, &gfP4{}, &gfP4{} + tmp := &gfP12{} + tx := &tmp.x + ty := &tmp.y + tz := &tmp.z + t, v0, v1, v2 := &gfP4{}, &gfP4{}, &gfP4{}, &gfP4{} v0.Mul(&a.z, &b.z) v1.Mul(&a.y, &b.y) v2.Mul(&a.x, &b.x) @@ -168,10 +170,7 @@ func (e *gfP12) Mul(a, b *gfP12) *gfP12 { tx.Sub(tx, v0) tx.Add(tx, v1) tx.Sub(tx, v2) - - e.x.Set(tx) - e.y.Set(ty) - e.z.Set(tz) + gfp12Copy(e, tmp) return e } @@ -180,7 +179,37 @@ func (e *gfP12) Square(a *gfP12) *gfP12 { // z^2 + z*y*w + z*x*w^2 + y*z*w + y^2*w^2 + y*x*v + x*z*w^2 + x*y*v + x^2 *v *w // (z^2 + y*x*v + x*y*v) + (z*y + y*z + v * x^2)w + 
(z*x + y^2 + x*z)*w^2 // (z^2 + 2*x*y*v) + (v*x^2 + 2*y*z) *w + (y^2 + 2*x*z) * w^2 - tx, ty, tz, t := &gfP4{}, &gfP4{}, &gfP4{}, &gfP4{} + tmp := &gfP12{} + tx := &tmp.x + ty := &tmp.y + tz := &tmp.z + t := &gfP4{} + + tz.Square(&a.z) + t.MulV(&a.x, &a.y) + t.Add(t, t) + tz.Add(tz, t) + + ty.SquareV(&a.x) + t.Mul(&a.y, &a.z) + t.Add(t, t) + ty.Add(ty, t) + + tx.Square(&a.y) + t.Mul(&a.x, &a.z) + t.Add(t, t) + tx.Add(tx, t) + gfp12Copy(e, tmp) + return e +} + +func (e *gfP12) Squares(a *gfP12, n int) *gfP12 { + // Square first round + in := &gfP12{} + tx := &in.x + ty := &in.y + tz := &in.z + t := &gfP4{} tz.Square(&a.z) t.MulV(&a.x, &a.y) @@ -197,9 +226,36 @@ func (e *gfP12) Square(a *gfP12) *gfP12 { t.Add(t, t) tx.Add(tx, t) - e.x.Set(tx) - e.y.Set(ty) - e.z.Set(tz) + tmp := &gfP12{} + var tmp2 *gfP12 + tx = &tmp.x + ty = &tmp.y + tz = &tmp.z + for i := 1; i < n; i++ { + tz.Square(&in.z) + t.MulV(&in.x, &in.y) + t.Add(t, t) + tz.Add(tz, t) + + ty.SquareV(&in.x) + t.Mul(&in.y, &in.z) + t.Add(t, t) + ty.Add(ty, t) + + tx.Square(&in.y) + t.Mul(&in.x, &in.z) + t.Add(t, t) + tx.Add(tx, t) + + // Switch references + tmp2 = in + in = tmp + tmp = tmp2 + tx = &tmp.x + ty = &tmp.y + tz = &tmp.z + } + gfp12Copy(e, in) return e } @@ -215,8 +271,7 @@ func (e *gfP12) Exp(f *gfP12, power *big.Int) *gfP12 { sum.Set(t) } } - - e.Set(sum) + gfp12Copy(e, sum) return e } @@ -268,38 +323,39 @@ func (e *gfP12) Neg(a *gfP12) *gfP12 { } // (z + y*w + x*w^2)^p -//= z^p + y^p*w*w^(p-1)+x^p*w^2*(w^2)^(p-1) +// = z^p + y^p*w*w^(p-1)+x^p*w^2*(w^2)^(p-1) // w2ToP2Minus1 = vToPMinus1 * wToPMinus1 func (e *gfP12) Frobenius(a *gfP12) *gfP12 { - x, y := &gfP2{}, &gfP2{} + tmp := &gfP4{} + x := &tmp.x + y := &tmp.y x.Conjugate(&a.z.x) y.Conjugate(&a.z.y) x.MulScalar(x, vToPMinus1) - e.z.x.Set(x) - e.z.y.Set(y) + gfp4Copy(&e.z, tmp) x.Conjugate(&a.y.x) y.Conjugate(&a.y.y) x.MulScalar(x, w2ToP2Minus1) y.MulScalar(y, wToPMinus1) - e.y.x.Set(x) - e.y.y.Set(y) + gfp4Copy(&e.y, tmp) x.Conjugate(&a.x.x) 
y.Conjugate(&a.x.y) x.MulScalar(x, vToPMinus1Mw2ToPMinus1) y.MulScalar(y, w2ToPMinus1) - e.x.x.Set(x) - e.x.y.Set(y) + gfp4Copy(&e.x, tmp) return e } // (z + y*w + x*w^2)^(p^2) -//= z^(p^2) + y^(p^2)*w*w^((p^2)-1)+x^(p^2)*w^2*(w^2)^((p^2)-1) +// = z^(p^2) + y^(p^2)*w*w^((p^2)-1)+x^(p^2)*w^2*(w^2)^((p^2)-1) func (e *gfP12) FrobeniusP2(a *gfP12) *gfP12 { - tx, ty, tz := &gfP4{}, &gfP4{}, &gfP4{} + tx := &e.x + ty := &e.y + tz := &e.z tz.Conjugate(&a.z) @@ -308,17 +364,12 @@ func (e *gfP12) FrobeniusP2(a *gfP12) *gfP12 { tx.Conjugate(&a.x) tx.MulGFP(tx, w2ToP2Minus1) - - e.x.Set(tx) - e.y.Set(ty) - e.z.Set(tz) - return e } // (z + y*w + x*w^2)^(p^3) -//=z^(p^3) + y^(p^3)*w*w^((p^3)-1)+x^(p^3)*w^2*(w^2)^((p^3)-1) -//=z^(p^3) + y^(p^3)*w*vToPMinus1-x^(p^3)*w^2 +// =z^(p^3) + y^(p^3)*w*w^((p^3)-1)+x^(p^3)*w^2*(w^2)^((p^3)-1) +// =z^(p^3) + y^(p^3)*w*vToPMinus1-x^(p^3)*w^2 // vToPMinus1 * vToPMinus1 = -1 func (e *gfP12) FrobeniusP3(a *gfP12) *gfP12 { x, y := &gfP2{}, &gfP2{} @@ -352,7 +403,9 @@ func (e *gfP12) FrobeniusP3(a *gfP12) *gfP12 { // (z + y*w + x*w^2)^(p^6) // = ((z + y*w + x*w^2)^(p^3))^(p^3) func (e *gfP12) FrobeniusP6(a *gfP12) *gfP12 { - tx, ty, tz := &gfP4{}, &gfP4{}, &gfP4{} + tx := &e.x + ty := &e.y + tz := &e.z tz.Conjugate(&a.z) @@ -361,10 +414,6 @@ func (e *gfP12) FrobeniusP6(a *gfP12) *gfP12 { tx.Conjugate(&a.x) - e.x.Set(tx) - e.y.Set(ty) - e.z.Set(tz) - return e } diff --git a/sm9/bn256/gfp12_exp_u.go b/sm9/bn256/gfp12_exp_u.go new file mode 100644 index 0000000..2f95620 --- /dev/null +++ b/sm9/bn256/gfp12_exp_u.go @@ -0,0 +1,43 @@ +package bn256 + +func (e *gfP12) gfP12ExpU(x *gfP12) *gfP12 { + // The sequence of 10 multiplications and 61 squarings is derived from the + // following addition chain generated with github.com/mmcloughlin/addchain v0.4.0. 
+ // + // _10 = 2*1 + // _100 = 2*_10 + // _101 = 1 + _100 + // _1001 = _100 + _101 + // _1011 = _10 + _1001 + // _1100 = 1 + _1011 + // i56 = (_1100 << 40 + _1011) << 7 + _1011 + _100 + // i69 = (2*(i56 << 4 + _1001) + 1) << 6 + // return 2*(_101 + i69) + // + var z = new(gfP12) + var t0 = new(gfP12) + var t1 = new(gfP12) + var t2 = new(gfP12) + var t3 = new(gfP12) + + t2.Square(x) + t1.Square(t2) + z.Mul(x, t1) + t0.Mul(t1, z) + t2.Mul(t2, t0) + t3.Mul(x, t2) + t3.Squares(t3, 40) + t3.Mul(t2, t3) + t3.Squares(t3, 7) + t2.Mul(t2, t3) + t1.Mul(t1, t2) + t1.Squares(t1, 4) + t0.Mul(t0, t1) + t0.Square(t0) + t0.Mul(x, t0) + t0.Squares(t0, 6) + z.Mul(z, t0) + z.Square(z) + gfp12Copy(e, z) + return e +} diff --git a/sm9/bn256/gfp12_test.go b/sm9/bn256/gfp12_test.go index ef4efe7..bedd1c9 100644 --- a/sm9/bn256/gfp12_test.go +++ b/sm9/bn256/gfp12_test.go @@ -35,6 +35,20 @@ func Test_gfP12Square(t *testing.T) { } } +func BenchmarkGfP12Square(b *testing.B) { + x := &gfP12{ + testdataP4, + testdataP4, + *(&gfP4{}).SetOne(), + } + x2 := &gfP12{} + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + x2.Square(x) + } +} + func testGfP12Invert(t *testing.T, x *gfP12) { xInv := &gfP12{} xInv.Invert(x) @@ -285,3 +299,31 @@ func BenchmarkGfP12Frobenius(b *testing.B) { } } } + +func BenchmarkGfP12ExpU(b *testing.B) { + x := &gfP12{ + testdataP4, + testdataP4, + testdataP4, + } + got := &gfP12{} + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + got.gfP12ExpU(x) + } +} + +func BenchmarkGfP12ExpU2(b *testing.B) { + x := &gfP12{ + testdataP4, + testdataP4, + testdataP4, + } + got := &gfP12{} + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + got.Exp(x, u) + } +} diff --git a/sm9/bn256/gfp2.go b/sm9/bn256/gfp2.go index 40034b4..e89d19d 100644 --- a/sm9/bn256/gfp2.go +++ b/sm9/bn256/gfp2.go @@ -26,8 +26,7 @@ func (e *gfP2) String() string { } func (e *gfP2) Set(a *gfP2) *gfP2 { - e.x.Set(&a.x) - e.y.Set(&a.y) + gfp2Copy(e, a) return e } @@ 
-105,11 +104,14 @@ func (e *gfP2) Triple(a *gfP2) *gfP2 { // See "Multiplication and Squaring in Pairing-Friendly Fields", // http://eprint.iacr.org/2006/471.pdf // The Karatsuba method -//(a0+a1*u)(b0+b1*u)=c0+c1*u, where -//c0 = a0*b0 - 2a1*b1 -//c1 = (a0 + a1)(b0 + b1) - a0*b0 - a1*b1 = a0*b1 + a1*b0 +// (a0+a1*u)(b0+b1*u)=c0+c1*u, where +// c0 = a0*b0 - 2a1*b1 +// c1 = (a0 + a1)(b0 + b1) - a0*b0 - a1*b1 = a0*b1 + a1*b0 func (e *gfP2) Mul(a, b *gfP2) *gfP2 { - tx, ty, v0, v1 := &gfP{}, &gfP{}, &gfP{}, &gfP{} + tmp := &gfP2{} + tx := &tmp.x + ty := &tmp.y + v0, v1 := &gfP{}, &gfP{} gfpMul(v0, &a.y, &b.y) gfpMul(v1, &a.x, &b.x) @@ -123,17 +125,19 @@ func (e *gfP2) Mul(a, b *gfP2) *gfP2 { gfpSub(ty, v0, v1) gfpSub(ty, ty, v1) - e.x.Set(tx) - e.y.Set(ty) + gfp2Copy(e, tmp) return e } // MulU: a * b * u -//(a0+a1*u)(b0+b1*u)*u=c0+c1*u, where -//c1 = (a0*b0 - 2a1*b1)u -//c0 = -2 * ((a0 + a1)(b0 + b1) - a0*b0 - a1*b1) = -2 * (a0*b1 + a1*b0) +// (a0+a1*u)(b0+b1*u)*u=c0+c1*u, where +// c1 = (a0*b0 - 2a1*b1)u +// c0 = -2 * ((a0 + a1)(b0 + b1) - a0*b0 - a1*b1) = -2 * (a0*b1 + a1*b0) func (e *gfP2) MulU(a, b *gfP2) *gfP2 { - tx, ty, v0, v1 := &gfP{}, &gfP{}, &gfP{}, &gfP{} + tmp := &gfP2{} + tx := &tmp.x + ty := &tmp.y + v0, v1 := &gfP{}, &gfP{} gfpMul(v0, &a.y, &b.y) gfpMul(v1, &a.x, &b.x) @@ -150,29 +154,30 @@ func (e *gfP2) MulU(a, b *gfP2) *gfP2 { gfpSub(tx, v0, v1) gfpSub(tx, tx, v1) - e.x.Set(tx) - e.y.Set(ty) + gfp2Copy(e, tmp) return e } // MulU1: a * u -//(a0+a1*u)u=c0+c1*u, where -//c1 = a0 -//c0 = -2a1 +// (a0+a1*u)u=c0+c1*u, where +// c1 = a0 +// c0 = -2a1 func (e *gfP2) MulU1(a *gfP2) *gfP2 { t := &gfP{} gfpAdd(t, &a.x, &a.x) gfpNeg(t, t) - e.x.Set(&a.y) - e.y.Set(t) + gfpCopy(&e.x, &a.y) + gfpCopy(&e.y, t) return e } func (e *gfP2) Square(a *gfP2) *gfP2 { // Complex squaring algorithm: // (xu+y)² = y^2-2*x^2 + 2*u*x*y - tx, ty := &gfP{}, &gfP{} + tmp := &gfP2{} + tx := &tmp.x + ty := &tmp.y gfpSqr(tx, &a.x, 1) gfpSqr(ty, &a.y, 1) gfpSub(ty, ty, tx) @@ -180,9 
+185,7 @@ func (e *gfP2) Square(a *gfP2) *gfP2 { gfpMul(tx, &a.x, &a.y) gfpAdd(tx, tx, tx) - - e.x.Set(tx) - e.y.Set(ty) + gfp2Copy(e, tmp) return e } @@ -190,7 +193,9 @@ func (e *gfP2) SquareU(a *gfP2) *gfP2 { // Complex squaring algorithm: // (xu+y)²*u = (y^2-2*x^2)u - 4*x*y - tx, ty := &gfP{}, &gfP{} + tmp := &gfP2{} + tx := &tmp.x + ty := &tmp.y // tx = a0^2 - 2 * a1^2 gfpSqr(ty, &a.x, 1) gfpSqr(tx, &a.y, 1) @@ -203,8 +208,7 @@ func (e *gfP2) SquareU(a *gfP2) *gfP2 { gfpAdd(ty, ty, ty) gfpNeg(ty, ty) - e.x.Set(tx) - e.y.Set(ty) + gfp2Copy(e, tmp) return e } @@ -251,8 +255,10 @@ func (e *gfP2) Exp(f *gfP2, power *big.Int) *gfP2 { } // (xu+y)^p = x * u^p + y -// = x * u * u^(p-1) + y -// = (-x)*u + y +// +// = x * u * u^(p-1) + y +// = (-x)*u + y +// // here u^(p-1) = -1 func (e *gfP2) Frobenius(a *gfP2) *gfP2 { e.Conjugate(a) diff --git a/sm9/bn256/gfp4.go b/sm9/bn256/gfp4.go index 9a00444..af04808 100644 --- a/sm9/bn256/gfp4.go +++ b/sm9/bn256/gfp4.go @@ -25,8 +25,7 @@ func (e *gfP4) String() string { } func (e *gfP4) Set(a *gfP4) *gfP4 { - e.x.Set(&a.x) - e.y.Set(&a.y) + gfp4Copy(e, a) return e } @@ -99,7 +98,10 @@ func (e *gfP4) Mul(a, b *gfP4) *gfP4 { //(a0+a1*v)(b0+b1*v)=c0+c1*v, where //c0 = a0*b0 +a1*b1*u //c1 = (a0 + a1)(b0 + b1) - a0*b0 - a1*b1 = a0*b1 + a1*b0 - tx, ty, v0, v1 := &gfP2{}, &gfP2{}, &gfP2{}, &gfP2{} + tmp := &gfP4{} + tx := &tmp.x + ty := &tmp.y + v0, v1 := &gfP2{}, &gfP2{} v0.Mul(&a.y, &b.y) v1.Mul(&a.x, &b.x) @@ -112,19 +114,21 @@ func (e *gfP4) Mul(a, b *gfP4) *gfP4 { ty.MulU1(v1) ty.Add(ty, v0) - e.x.Set(tx) - e.y.Set(ty) + gfp4Copy(e, tmp) return e } // MulV: a * b * v -//(a0+a1*v)(b0+b1*v)*v=c0+c1*v, where +// (a0+a1*v)(b0+b1*v)*v=c0+c1*v, where // (a0*b0 + a0*b1v + a1*b0*v + a1*b1*u)*v // a0*b0*v + a0*b1*u + a1*b0*u + a1*b1*u*v // c0 = a0*b1*u + a1*b0*u // c1 = a0*b0 + a1*b1*u func (e *gfP4) MulV(a, b *gfP4) *gfP4 { - tx, ty, v0, v1 := &gfP2{}, &gfP2{}, &gfP2{}, &gfP2{} + tmp := &gfP4{} + tx := &tmp.x + ty := &tmp.y + v0, v1 := 
&gfP2{}, &gfP2{} v0.Mul(&a.y, &b.y) v1.Mul(&a.x, &b.x) @@ -138,27 +142,29 @@ func (e *gfP4) MulV(a, b *gfP4) *gfP4 { tx.MulU1(v1) tx.Add(tx, v0) - e.x.Set(tx) - e.y.Set(ty) + gfp4Copy(e, tmp) return e } // MulV1: a * v -//(a0+a1*v)*v=c0+c1*v, where +// (a0+a1*v)*v=c0+c1*v, where // c0 = a1*u // c1 = a0 func (e *gfP4) MulV1(a *gfP4) *gfP4 { - tx := (&gfP2{}).Set(&a.y) + tx := &gfP2{} + gfp2Copy(tx, &a.y) e.y.MulU1(&a.x) - e.x.Set(tx) + gfp2Copy(&e.x, tx) return e } func (e *gfP4) Square(a *gfP4) *gfP4 { // Complex squaring algorithm: // (xv+y)² = (x^2*u + y^2) + 2*x*y*v - tx, ty := &gfP2{}, &gfP2{} + tmp := &gfP4{} + tx := &tmp.x + ty := &tmp.y tx.SquareU(&a.x) ty.Square(&a.y) ty.Add(tx, ty) @@ -166,15 +172,16 @@ func (e *gfP4) Square(a *gfP4) *gfP4 { tx.Mul(&a.x, &a.y) tx.Add(tx, tx) - e.x.Set(tx) - e.y.Set(ty) + gfp4Copy(e, tmp) return e } // SquareV: (a^2) * v // v*(xv+y)² = (x^2*u + y^2)v + 2*x*y*u func (e *gfP4) SquareV(a *gfP4) *gfP4 { - tx, ty := &gfP2{}, &gfP2{} + tmp := &gfP4{} + tx := &tmp.x + ty := &tmp.y tx.SquareU(&a.x) ty.Square(&a.y) tx.Add(tx, ty) @@ -182,15 +189,18 @@ func (e *gfP4) SquareV(a *gfP4) *gfP4 { ty.MulU(&a.x, &a.y) ty.Add(ty, ty) - e.x.Set(tx) - e.y.Set(ty) + gfp4Copy(e, tmp) return e } func (e *gfP4) Invert(a *gfP4) *gfP4 { // See "Implementing cryptographic pairings", M. Scott, section 3.2. 
// ftp://136.206.11.249/pub/crypto/pairings.pdf - t1, t2, t3 := &gfP2{}, &gfP2{}, &gfP2{} + tmp := &gfP4{} + t2 := &tmp.x + t1 := &tmp.y + + t3 := &gfP2{} t3.SquareU(&a.x) t1.Square(&a.y) @@ -202,8 +212,7 @@ func (e *gfP4) Invert(a *gfP4) *gfP4 { t2.Mul(&a.x, t3) - e.x.Set(t2) - e.y.Set(t1) + gfp4Copy(e, tmp) return e } @@ -224,40 +233,46 @@ func (e *gfP4) Exp(f *gfP4, power *big.Int) *gfP4 { return e } -// (y+x*v)^p +// (y+x*v)^p +// // = y^p + x^p*v^p // = f(y) + f(x) * v^p // = f(y) + f(x) * v * v^(p-1) func (e *gfP4) Frobenius(a *gfP4) *gfP4 { - x, y := &gfP2{}, &gfP2{} + tmp := &gfP4{} + x := &tmp.x + y := &tmp.y + x.Conjugate(&a.x) y.Conjugate(&a.y) x.MulScalar(x, vToPMinus1) - e.x.Set(x) - e.y.Set(y) + gfp4Copy(e, tmp) return e } -// (y+x*v)^(p^2) +// (y+x*v)^(p^2) +// // y + x*v * v^(p^2-1) func (e *gfP4) FrobeniusP2(a *gfP4) *gfP4 { e.Conjugate(a) return e } -// (y+x*v)^(p^3) +// (y+x*v)^(p^3) +// // = ((y+x*v)^p)^(p^2) func (e *gfP4) FrobeniusP3(a *gfP4) *gfP4 { - x, y := &gfP2{}, &gfP2{} + tmp := &gfP4{} + x := &tmp.x + y := &tmp.y x.Conjugate(&a.x) y.Conjugate(&a.y) x.MulScalar(x, vToPMinus1) x.Neg(x) - e.x.Set(x) - e.y.Set(y) + gfp4Copy(e, tmp) return e } diff --git a/sm9/bn256/select_amd64.s b/sm9/bn256/select_amd64.s index 4d4a69d..e7584f4 100644 --- a/sm9/bn256/select_amd64.s +++ b/sm9/bn256/select_amd64.s @@ -7,6 +7,198 @@ #define x_ptr SI #define y_ptr CX +// func gfpCopy(res, a *gfP) +TEXT ·gfpCopy(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ a+8(FP), x_ptr + + CMPB ·supportAVX2+0(SB), $0x01 + JEQ copygfp_avx2 + + MOVOU (16*0)(x_ptr), X0 + MOVOU (16*1)(x_ptr), X1 + + MOVOU X0, (16*0)(res_ptr) + MOVOU X1, (16*1)(res_ptr) + RET // SSE copy is complete; return here so the non-AVX2 path never runs the VEX instructions below +copygfp_avx2: + VMOVDQU (x_ptr), Y0 + VMOVDQU Y0, (res_ptr) + VZEROUPPER + RET + +// func gfp2Copy(res, a *gfP2) +TEXT ·gfp2Copy(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ a+8(FP), x_ptr + + CMPB ·supportAVX2+0(SB), $0x01 + JEQ copygfp2_avx2 + + MOVOU (16*0)(x_ptr), X0 + MOVOU (16*1)(x_ptr), X1 + MOVOU (16*2)(x_ptr), X2
+ MOVOU (16*3)(x_ptr), X3 + + MOVOU X0, (16*0)(res_ptr) + MOVOU X1, (16*1)(res_ptr) + MOVOU X2, (16*2)(res_ptr) + MOVOU X3, (16*3)(res_ptr) + RET // SSE copy is complete; return here so the non-AVX2 path never runs the VEX instructions below +copygfp2_avx2: + VMOVDQU (32*0)(x_ptr), Y0 + VMOVDQU (32*1)(x_ptr), Y1 + + VMOVDQU Y0, (32*0)(res_ptr) + VMOVDQU Y1, (32*1)(res_ptr) + + VZEROUPPER + RET + +// func gfp4Copy(res, a *gfP4) +TEXT ·gfp4Copy(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ a+8(FP), x_ptr + + CMPB ·supportAVX2+0(SB), $0x01 + JEQ copygfp4_avx2 + + MOVOU (16*0)(x_ptr), X0 + MOVOU (16*1)(x_ptr), X1 + MOVOU (16*2)(x_ptr), X2 + MOVOU (16*3)(x_ptr), X3 + + MOVOU (16*4)(x_ptr), X4 + MOVOU (16*5)(x_ptr), X5 + MOVOU (16*6)(x_ptr), X6 + MOVOU (16*7)(x_ptr), X7 + + MOVOU X0, (16*0)(res_ptr) + MOVOU X1, (16*1)(res_ptr) + MOVOU X2, (16*2)(res_ptr) + MOVOU X3, (16*3)(res_ptr) + + MOVOU X4, (16*4)(res_ptr) + MOVOU X5, (16*5)(res_ptr) + MOVOU X6, (16*6)(res_ptr) + MOVOU X7, (16*7)(res_ptr) + RET // SSE copy is complete; return here so the non-AVX2 path never runs the VEX instructions below +copygfp4_avx2: + VMOVDQU (32*0)(x_ptr), Y0 + VMOVDQU (32*1)(x_ptr), Y1 + VMOVDQU (32*2)(x_ptr), Y2 + VMOVDQU (32*3)(x_ptr), Y3 + + VMOVDQU Y0, (32*0)(res_ptr) + VMOVDQU Y1, (32*1)(res_ptr) + VMOVDQU Y2, (32*2)(res_ptr) + VMOVDQU Y3, (32*3)(res_ptr) + + VZEROUPPER + RET + +// func gfp12Copy(res, a *gfP12) +TEXT ·gfp12Copy(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ a+8(FP), x_ptr + + CMPB ·supportAVX2+0(SB), $0x01 + JEQ copygfp12_avx2 + + MOVOU (16*0)(x_ptr), X0 + MOVOU (16*1)(x_ptr), X1 + MOVOU (16*2)(x_ptr), X2 + MOVOU (16*3)(x_ptr), X3 + + MOVOU (16*4)(x_ptr), X4 + MOVOU (16*5)(x_ptr), X5 + MOVOU (16*6)(x_ptr), X6 + MOVOU (16*7)(x_ptr), X7 + + MOVOU X0, (16*0)(res_ptr) + MOVOU X1, (16*1)(res_ptr) + MOVOU X2, (16*2)(res_ptr) + MOVOU X3, (16*3)(res_ptr) + + MOVOU X4, (16*4)(res_ptr) + MOVOU X5, (16*5)(res_ptr) + MOVOU X6, (16*6)(res_ptr) + MOVOU X7, (16*7)(res_ptr) + + MOVOU (16*8)(x_ptr), X0 + MOVOU (16*9)(x_ptr), X1 + MOVOU (16*10)(x_ptr), X2 + MOVOU (16*11)(x_ptr), X3 + + MOVOU (16*12)(x_ptr), X4 + MOVOU (16*13)(x_ptr), X5 + MOVOU (16*14)(x_ptr), X6 + MOVOU
(16*15)(x_ptr), X7 + + MOVOU X0, (16*8)(res_ptr) + MOVOU X1, (16*9)(res_ptr) + MOVOU X2, (16*10)(res_ptr) + MOVOU X3, (16*11)(res_ptr) + + MOVOU X4, (16*12)(res_ptr) + MOVOU X5, (16*13)(res_ptr) + MOVOU X6, (16*14)(res_ptr) + MOVOU X7, (16*15)(res_ptr) + + MOVOU (16*16)(x_ptr), X0 + MOVOU (16*17)(x_ptr), X1 + MOVOU (16*18)(x_ptr), X2 + MOVOU (16*19)(x_ptr), X3 + + MOVOU (16*20)(x_ptr), X4 + MOVOU (16*21)(x_ptr), X5 + MOVOU (16*22)(x_ptr), X6 + MOVOU (16*23)(x_ptr), X7 + + MOVOU X0, (16*16)(res_ptr) + MOVOU X1, (16*17)(res_ptr) + MOVOU X2, (16*18)(res_ptr) + MOVOU X3, (16*19)(res_ptr) + + MOVOU X4, (16*20)(res_ptr) + MOVOU X5, (16*21)(res_ptr) + MOVOU X6, (16*22)(res_ptr) + MOVOU X7, (16*23)(res_ptr) + RET // SSE copy is complete; return here so the non-AVX2 path never runs the VEX instructions below +copygfp12_avx2: + VMOVDQU (32*0)(x_ptr), Y0 + VMOVDQU (32*1)(x_ptr), Y1 + VMOVDQU (32*2)(x_ptr), Y2 + VMOVDQU (32*3)(x_ptr), Y3 + + VMOVDQU (32*4)(x_ptr), Y4 + VMOVDQU (32*5)(x_ptr), Y5 + VMOVDQU (32*6)(x_ptr), Y6 + VMOVDQU (32*7)(x_ptr), Y7 + + VMOVDQU (32*8)(x_ptr), Y8 + VMOVDQU (32*9)(x_ptr), Y9 + VMOVDQU (32*10)(x_ptr), Y10 + VMOVDQU (32*11)(x_ptr), Y11 + + VMOVDQU Y0, (32*0)(res_ptr) + VMOVDQU Y1, (32*1)(res_ptr) + VMOVDQU Y2, (32*2)(res_ptr) + VMOVDQU Y3, (32*3)(res_ptr) + + VMOVDQU Y4, (32*4)(res_ptr) + VMOVDQU Y5, (32*5)(res_ptr) + VMOVDQU Y6, (32*6)(res_ptr) + VMOVDQU Y7, (32*7)(res_ptr) + + VMOVDQU Y8, (32*8)(res_ptr) + VMOVDQU Y9, (32*9)(res_ptr) + VMOVDQU Y10, (32*10)(res_ptr) + VMOVDQU Y11, (32*11)(res_ptr) + + VZEROUPPER + RET + // func gfP12MovCond(res, a, b *gfP12, cond int) TEXT ·gfP12MovCond(SB),NOSPLIT,$0 MOVQ res+0(FP), res_ptr diff --git a/sm9/bn256/select_arm64.s b/sm9/bn256/select_arm64.s index 519a382..dd1f7bb 100644 --- a/sm9/bn256/select_arm64.s +++ b/sm9/bn256/select_arm64.s @@ -7,6 +7,68 @@ #define a_ptr R1 #define b_ptr R2 +/* ---------------------------------------*/ +// func gfpCopy(res, a *gfP) +TEXT ·gfpCopy(SB),NOSPLIT,$0 + MOVD res+0(FP), res_ptr + MOVD a+8(FP), a_ptr + + VLD1 (a_ptr), [V0.B16, V1.B16] + VST1 [V0.B16, V1.B16],
(res_ptr) + + RET + +/* ---------------------------------------*/ +// func gfp2Copy(res, a *gfP2) +TEXT ·gfp2Copy(SB),NOSPLIT,$0 + MOVD res+0(FP), res_ptr + MOVD a+8(FP), a_ptr + + VLD1 (a_ptr), [V0.B16, V1.B16, V2.B16, V3.B16] + VST1 [V0.B16, V1.B16, V2.B16, V3.B16], (res_ptr) + + RET + +/* ---------------------------------------*/ +// func gfp4Copy(res, a *gfP4) +TEXT ·gfp4Copy(SB),NOSPLIT,$0 + MOVD res+0(FP), res_ptr + MOVD a+8(FP), a_ptr + + VLD1.P 64(a_ptr), [V0.B16, V1.B16, V2.B16, V3.B16] + VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(res_ptr) + + VLD1.P 64(a_ptr), [V0.B16, V1.B16, V2.B16, V3.B16] + VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(res_ptr) + + RET + +/* ---------------------------------------*/ +// func gfp12Copy(res, a *gfP12) +TEXT ·gfp12Copy(SB),NOSPLIT,$0 + MOVD res+0(FP), res_ptr + MOVD a+8(FP), a_ptr + + VLD1.P 64(a_ptr), [V0.B16, V1.B16, V2.B16, V3.B16] + VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(res_ptr) + + VLD1.P 64(a_ptr), [V0.B16, V1.B16, V2.B16, V3.B16] + VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(res_ptr) + + VLD1.P 64(a_ptr), [V0.B16, V1.B16, V2.B16, V3.B16] + VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(res_ptr) + + VLD1.P 64(a_ptr), [V0.B16, V1.B16, V2.B16, V3.B16] + VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(res_ptr) + + VLD1.P 64(a_ptr), [V0.B16, V1.B16, V2.B16, V3.B16] + VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(res_ptr) + + VLD1.P 64(a_ptr), [V0.B16, V1.B16, V2.B16, V3.B16] + VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(res_ptr) + + RET + /* ---------------------------------------*/ // func gfP12MovCond(res, a, b *gfP12, cond int) // If cond == 0 res=b, else res=a diff --git a/sm9/bn256/select_decl.go b/sm9/bn256/select_decl.go index 0d5c0da..acd9855 100644 --- a/sm9/bn256/select_decl.go +++ b/sm9/bn256/select_decl.go @@ -21,3 +21,15 @@ func curvePointMovCond(res, a, b *curvePoint, cond int) // //go:noescape func twistPointMovCond(res, a, b *twistPoint, cond int) + +//go:noescape +func gfpCopy(res, a *gfP) // parameter names match the res+0(FP) / a+8(FP) references in select_amd64.s and select_arm64.s +
+//go:noescape +func gfp2Copy(res, a *gfP2) + +//go:noescape +func gfp4Copy(res, a *gfP4) + +//go:noescape +func gfp12Copy(res, a *gfP12) diff --git a/sm9/bn256/select_generic.go b/sm9/bn256/select_generic.go index 7500455..1107bdf 100644 --- a/sm9/bn256/select_generic.go +++ b/sm9/bn256/select_generic.go @@ -14,3 +14,39 @@ func curvePointMovCond(res, a, b *curvePoint, cond int) { func twistPointMovCond(res, a, b *twistPoint, cond int) { res.Select(a, b, cond) } + +func gfpCopy(res, in *gfP) { + res[0] = in[0] + res[1] = in[1] + res[2] = in[2] + res[3] = in[3] +} + +func gfp2Copy(res, in *gfP2) { + gfpCopy(&res.x, &in.x) + gfpCopy(&res.y, &in.y) +} + +func gfp4Copy(res, in *gfP4) { + gfpCopy(&res.x.x, &in.x.x) + gfpCopy(&res.x.y, &in.x.y) + gfpCopy(&res.y.x, &in.y.x) + gfpCopy(&res.y.y, &in.y.y) +} + +func gfp12Copy(res, in *gfP12) { + gfpCopy(&res.x.x.x, &in.x.x.x) + gfpCopy(&res.x.x.y, &in.x.x.y) + gfpCopy(&res.x.y.x, &in.x.y.x) + gfpCopy(&res.x.y.y, &in.x.y.y) + + gfpCopy(&res.y.x.x, &in.y.x.x) + gfpCopy(&res.y.x.y, &in.y.x.y) + gfpCopy(&res.y.y.x, &in.y.y.x) + gfpCopy(&res.y.y.y, &in.y.y.y) + + gfpCopy(&res.z.x.x, &in.z.x.x) + gfpCopy(&res.z.x.y, &in.z.x.y) + gfpCopy(&res.z.y.x, &in.z.y.x) + gfpCopy(&res.z.y.y, &in.z.y.y) +} diff --git a/sm9/bn256/select_test.go b/sm9/bn256/select_test.go new file mode 100644 index 0000000..c00f61e --- /dev/null +++ b/sm9/bn256/select_test.go @@ -0,0 +1,54 @@ +package bn256 + +import "testing" + +func BenchmarkGfP12Copy(b *testing.B) { + x := &gfP12{ + testdataP4, + testdataP4, + testdataP4, + } + res := &gfP12{} + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + gfp12Copy(res, x) + } +} + +func gfpCopyForTest(res, in *gfP) { + res[0] = in[0] + res[1] = in[1] + res[2] = in[2] + res[3] = in[3] +} + +func gfp2CopyForTest(res, in *gfP2) { + gfpCopyForTest(&res.x, &in.x) + gfpCopyForTest(&res.y, &in.y) +} + +func gfp4CopyForTest(res, in *gfP4) { + gfp2CopyForTest(&res.x, &in.x) + gfp2CopyForTest(&res.y, &in.y) +}
+ +func gfp12CopyForTest(res, in *gfP12) { + gfp4CopyForTest(&res.x, &in.x) + gfp4CopyForTest(&res.y, &in.y) + gfp4CopyForTest(&res.z, &in.z) +} + +func BenchmarkGfP12Set(b *testing.B) { + x := &gfP12{ + testdataP4, + testdataP4, + testdataP4, + } + res := &gfP12{} + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + gfp12CopyForTest(res, x) + } +}