From b9bbf94b4766dff6d0c0f4f41373e0a8f6559f13 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Tue, 18 Jul 2023 17:29:10 +0800 Subject: [PATCH 1/6] sm9/bn256: rename special square function name --- sm9/bn256/bn_pair.go | 13 ++++++------- sm9/bn256/bn_pair_b6.go | 13 ++++++------- sm9/bn256/curve.go | 2 +- sm9/bn256/gfp12.go | 22 ++++++++++++++-------- sm9/bn256/gfp12_b6.go | 22 ++++++++++++++-------- sm9/bn256/gfp12_b6_test.go | 2 +- sm9/bn256/gfp12_exp_u.go | 18 +++++++++--------- sm9/bn256/gfp12_test.go | 34 ++++++++++++++++++++++++++-------- sm9/bn256/gfp12b6_exp_u.go | 18 +++++++++--------- 9 files changed, 86 insertions(+), 58 deletions(-) diff --git a/sm9/bn256/bn_pair.go b/sm9/bn256/bn_pair.go index ff9fd81..10e8daf 100644 --- a/sm9/bn256/bn_pair.go +++ b/sm9/bn256/bn_pair.go @@ -218,10 +218,9 @@ func finalExponentiation(in *gfP12) *gfP12 { y0.MulNC(fp, fp2).Mul(y0, fp3) // y0 = (t1^p) * (t1^(p^2)) * (t1^(p^3)) // reuse fp, fp2, fp3 local variables - // [gfP12ExpU] is most time consuming operation - fu := fp.gfP12ExpU(t1) - fu2 := fp2.gfP12ExpU(fu) - fu3 := fp3.gfP12ExpU(fu2) + fu := fp.Cyclo6PowToU(t1) + fu2 := fp2.Cyclo6PowToU(fu) + fu3 := fp3.Cyclo6PowToU(fu2) fu2p := (&gfP12{}).Frobenius(fu2) fu3p := (&gfP12{}).Frobenius(fu3) @@ -237,14 +236,14 @@ func finalExponentiation(in *gfP12) *gfP12 { y6.Conjugate(y6) // y6 = 1 / (t1^(u^3) * (t1^(u^3))^p) // https://eprint.iacr.org/2008/490.pdf - t0 := (&gfP12{}).SpecialSquareNC(y6) + t0 := (&gfP12{}).Cyclo6SquareNC(y6) t0.Mul(t0, y4).Mul(t0, y5) t1.Mul(y3, y5).Mul(t1, t0) t0.Mul(t0, y2) - t1.SpecialSquare(t1).Mul(t1, t0).SpecialSquare(t1) + t1.Cyclo6Square(t1).Mul(t1, t0).Cyclo6Square(t1) t0.Mul(t1, y1) t1.Mul(t1, y0) - t0.SpecialSquare(t0).Mul(t0, t1) + t0.Cyclo6Square(t0).Mul(t0, t1) return t0 } diff --git a/sm9/bn256/bn_pair_b6.go b/sm9/bn256/bn_pair_b6.go index c65682e..6435beb 100644 --- a/sm9/bn256/bn_pair_b6.go +++ b/sm9/bn256/bn_pair_b6.go @@ -155,10 +155,9 @@ func finalExponentiationB6(in *gfP12b6) *gfP12b6 { y0.MulNC(fp, fp2).Mul(y0, fp3) // reuse fp, fp2, fp3 local variables - // [gfP12ExpU] is most time consuming operation - fu := fp.gfP12ExpU(t1) - fu2 := fp2.gfP12ExpU(fu) - fu3 := fp3.gfP12ExpU(fu2) + fu := fp.Cyclo6PowToU(t1) + fu2 := fp2.Cyclo6PowToU(fu) + fu3 := fp3.Cyclo6PowToU(fu2) y3 := (&gfP12b6{}).Frobenius(fu) fu2p := (&gfP12b6{}).Frobenius(fu2) @@ -174,14 +173,14 @@ func finalExponentiationB6(in *gfP12b6) *gfP12b6 { y6 := (&gfP12b6{}).MulNC(fu3, fu3p) y6.Conjugate(y6) - t0 := (&gfP12b6{}).SpecialSquareNC(y6) + t0 := (&gfP12b6{}).Cyclo6SquareNC(y6) t0.Mul(t0, y4).Mul(t0, y5) t1.Mul(y3, y5).Mul(t1, t0) t0.Mul(t0, y2) - t1.SpecialSquare(t1).Mul(t1, t0).SpecialSquare(t1) + t1.Cyclo6Square(t1).Mul(t1, t0).Cyclo6Square(t1) t0.Mul(t1, y1) t1.Mul(t1, y0) - t0.SpecialSquare(t0).Mul(t0, t1) + t0.Cyclo6Square(t0).Mul(t0, t1) return t0 } diff --git a/sm9/bn256/curve.go b/sm9/bn256/curve.go index a2d2328..122920f 100644 --- a/sm9/bn256/curve.go +++ b/sm9/bn256/curve.go @@ -182,7 +182,7 @@ func (c *curvePoint) Double(a *curvePoint) { gfpSub(t2, t, C) d, e, f := &gfP{}, &gfP{}, &gfP{} - gfpAdd(d, t2, t2) + gfpDouble(d, t2) gfpDouble(t, A) gfpAdd(e, t, A) gfpSqr(f, e, 1) diff --git a/sm9/bn256/gfp12.go b/sm9/bn256/gfp12.go index cd51aa6..4f9bc27 100644 --- a/sm9/bn256/gfp12.go +++ b/sm9/bn256/gfp12.go @@ -226,13 +226,19 @@ func (e *gfP12) SquareNC(a *gfP12) *gfP12 { return e } -// Special squaring for use on elements in T_6(fp2) (after the -// easy part of the final exponentiation. 
Used in the hard part
-// of the final exponentiation. Function uses formulas in
-// Granger/Scott (PKC2010).
-func (e *gfP12) SpecialSquare(a *gfP12) *gfP12 {
+// Cyclo6Square is used in the final exponentiation after the easy part (a ^ ((p^2 + 1)(p^6 - 1))).
+// Note that after the easy part of the final exponentiation,
+// the resulting element lies in the cyclotomic subgroup.
+// "New software speed records for cryptographic pairings"
+// Section 3.3, Final exponentiation
+// https://cryptojedi.org/papers/dclxvi-20100714.pdf
+// The formula reference:
+// Granger/Scott (PKC2010).
+// Section 3.2
+// https://eprint.iacr.org/2009/565.pdf
+func (e *gfP12) Cyclo6Square(a *gfP12) *gfP12 {
 	tmp := &gfP12{}
-	tmp.SpecialSquareNC(a)
+	tmp.Cyclo6SquareNC(a)
 	gfp12Copy(e, tmp)
 	return e
 }
@@ -241,7 +247,7 @@
 // Special squaring for use on elements in T_6(fp2) (after the
 // easy part of the final exponentiation. Used in the hard part
 // of the final exponentiation. Function uses formulas in
 // Granger/Scott (PKC2010).
-func (e *gfP12) SpecialSquares(a *gfP12, n int) *gfP12 {
+func (e *gfP12) Cyclo6Squares(a *gfP12, n int) *gfP12 {
 	// Square first round
 	in := &gfP12{}
 	tx, ty, tz := &gfP4{}, &gfP4{}, &gfP4{}
@@ -306,7 +312,7 @@
 }
 
 // Special Square without value copy, will use e directly, so e can't be same as a.
-func (e *gfP12) SpecialSquareNC(a *gfP12) *gfP12 {
+func (e *gfP12) Cyclo6SquareNC(a *gfP12) *gfP12 {
 	tx, ty, tz := &gfP4{}, &gfP4{}, &gfP4{}
 
 	v0 := &e.x
diff --git a/sm9/bn256/gfp12_b6.go b/sm9/bn256/gfp12_b6.go
index 3661796..2a27fc1 100644
--- a/sm9/bn256/gfp12_b6.go
+++ b/sm9/bn256/gfp12_b6.go
@@ -201,20 +201,26 @@ func (e *gfP12b6) SquareNC(a *gfP12b6) *gfP12b6 {
 	return e
 }
 
-// Special squaring for use on elements in T_6(fp2) (after the
-// easy part of the final exponentiation. Used in the hard part
-// of the final exponentiation. Function uses formulas in
-// Granger/Scott (PKC2010).
-func (e *gfP12b6) SpecialSquare(a *gfP12b6) *gfP12b6 {
+// Cyclo6Square is used in the final exponentiation after the easy part (a ^ ((p^2 + 1)(p^6 - 1))).
+// Note that after the easy part of the final exponentiation,
+// the resulting element lies in the cyclotomic subgroup.
+// "New software speed records for cryptographic pairings"
+// Section 3.3, Final exponentiation
+// https://cryptojedi.org/papers/dclxvi-20100714.pdf
+// The formula reference:
+// Granger/Scott (PKC2010).
+// Section 3.2
+// https://eprint.iacr.org/2009/565.pdf
+func (e *gfP12b6) Cyclo6Square(a *gfP12b6) *gfP12b6 {
 	tmp := &gfP12b6{}
-	tmp.SpecialSquareNC(a)
+	tmp.Cyclo6SquareNC(a)
 	e.x.Set(&tmp.x)
 	e.y.Set(&tmp.y)
 	return e
 }
 
 // Special Square without value copy, will use e directly, so e can't be same as a.
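A note on the NC suffix that this rename keeps: the no-copy variants use the receiver as working space, so e must not alias a, while the copying wrappers above are alias-safe. A minimal usage sketch (x is assumed to already hold an element of the cyclotomic subgroup):

	x := &gfP12b6{} // assumption: initialized to a cyclotomic-subgroup element
	var y gfP12b6
	y.Cyclo6SquareNC(x) // safe: distinct receiver, x stays intact
	x.Cyclo6Square(x)   // safe: the wrapper squares into a temporary, then copies back
	// x.Cyclo6SquareNC(x) // unsafe: e aliases a, so x is clobbered mid-computation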
-func (e *gfP12b6) SpecialSquareNC(a *gfP12b6) *gfP12b6 { +func (e *gfP12b6) Cyclo6SquareNC(a *gfP12b6) *gfP12b6 { f02 := &e.y.x f01 := &e.y.y f00 := &e.y.z @@ -265,7 +271,7 @@ func (e *gfP12b6) SpecialSquareNC(a *gfP12b6) *gfP12b6 { return e } -func (e *gfP12b6) SpecialSquares(a *gfP12b6, n int) *gfP12b6 { +func (e *gfP12b6) Cyclo6Squares(a *gfP12b6, n int) *gfP12b6 { // Square first round in := &gfP12b6{} f02 := &in.y.x diff --git a/sm9/bn256/gfp12_b6_test.go b/sm9/bn256/gfp12_b6_test.go index 2bca692..975a157 100644 --- a/sm9/bn256/gfp12_b6_test.go +++ b/sm9/bn256/gfp12_b6_test.go @@ -330,7 +330,7 @@ func TestGfP12b6SpecialSquare(t *testing.T) { got := &gfP12b6{} expected := &gfP12b6{} - got.SpecialSquare(t1) + got.Cyclo6Square(t1) expected.Square(t1) if *got != *expected { t.Errorf("not same got=%v, expected=%v", got, expected) diff --git a/sm9/bn256/gfp12_exp_u.go b/sm9/bn256/gfp12_exp_u.go index 198be7d..c380eba 100644 --- a/sm9/bn256/gfp12_exp_u.go +++ b/sm9/bn256/gfp12_exp_u.go @@ -1,7 +1,7 @@ package bn256 // Use special square -func (e *gfP12) gfP12ExpU(x *gfP12) *gfP12 { +func (e *gfP12) Cyclo6PowToU(x *gfP12) *gfP12 { // The sequence of 10 multiplications and 61 squarings is derived from the // following addition chain generated with github.com/mmcloughlin/addchain v0.4.0. // @@ -21,23 +21,23 @@ func (e *gfP12) gfP12ExpU(x *gfP12) *gfP12 { var t2 = new(gfP12) var t3 = new(gfP12) - t2.SpecialSquareNC(x) - t1.SpecialSquareNC(t2) + t2.Cyclo6SquareNC(x) + t1.Cyclo6SquareNC(t2) z.MulNC(x, t1) t0.MulNC(t1, z) t2.Mul(t2, t0) t3.MulNC(x, t2) - t3.SpecialSquares(t3, 40) + t3.Cyclo6Squares(t3, 40) t3.Mul(t2, t3) - t3.SpecialSquares(t3, 7) + t3.Cyclo6Squares(t3, 7) t2.Mul(t2, t3) t1.Mul(t1, t2) - t1.SpecialSquares(t1, 4) + t1.Cyclo6Squares(t1, 4) t0.Mul(t0, t1) - t0.SpecialSquare(t0) + t0.Cyclo6Square(t0) t0.Mul(x, t0) - t0.SpecialSquares(t0, 6) + t0.Cyclo6Squares(t0, 6) z.Mul(z, t0) - z.SpecialSquare(z) + z.Cyclo6Square(z) return e } diff --git a/sm9/bn256/gfp12_test.go b/sm9/bn256/gfp12_test.go index 4d08177..fd78c05 100644 --- a/sm9/bn256/gfp12_test.go +++ b/sm9/bn256/gfp12_test.go @@ -35,7 +35,7 @@ func Test_gfP12Square(t *testing.T) { } } -func TestSpecialSquare(t *testing.T) { +func TestCyclo6Square(t *testing.T) { in := &gfP12{ testdataP4, testdataP4, @@ -51,9 +51,18 @@ func TestSpecialSquare(t *testing.T) { t2 := inv.FrobeniusP2(t1) // reuse inv t1.Mul(t1, t2) // t1 = in ^ ((p^6 - 1) * (p^2 + 1)), the first two parts of the exponentiation + one := (&gfP12{}).SetOne() + t3 := (&gfP12{}).FrobeniusP2(t1) + t4 := (&gfP12{}).FrobeniusP2(t3) + t5 := (&gfP12{}).Invert(t3) + t5.Mul(t4, t5).Mul(t1, t5) + if *t5 != *one { + t.Errorf("t1 should be in Cyclotomic Subgroup") + } + got := &gfP12{} expected := &gfP12{} - got.SpecialSquare(t1) + got.Cyclo6Square(t1) expected.Square(t1) if *got != *expected { t.Errorf("not same got=%v, expected=%v", got, expected) @@ -74,7 +83,7 @@ func BenchmarkGfP12Square(b *testing.B) { } } -func BenchmarkGfP12SpecialSquare(b *testing.B) { +func BenchmarkGfP12Cyclo6Square(b *testing.B) { in := &gfP12{ testdataP4, testdataP4, @@ -93,7 +102,7 @@ func BenchmarkGfP12SpecialSquare(b *testing.B) { b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { - x2.SpecialSquare(t1) + x2.Cyclo6Square(t1) } } @@ -116,7 +125,7 @@ func BenchmarkGfP12SpecialSqures(b *testing.B) { b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { - got.SpecialSquares(in, 61) + got.Cyclo6Squares(in, 61) } } @@ -433,13 +442,22 @@ func BenchmarkGfP12ExpU(b *testing.B) { testdataP4, 
testdataP4, } + // This is the p^6-Frobenius + t1 := (&gfP12{}).FrobeniusP6(x) + + inv := (&gfP12{}).Invert(x) + t1.Mul(t1, inv) + + t2 := inv.FrobeniusP2(t1) // reuse inv + t1.Mul(t1, t2) // t1 = in ^ ((p^6 - 1) * (p^2 + 1)), the first two parts of the exponentiation + got := &gfP12{} b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { - got.gfP12ExpU(x) - got.gfP12ExpU(x) - got.gfP12ExpU(x) + got.Cyclo6PowToU(t1) + got.Cyclo6PowToU(t1) + got.Cyclo6PowToU(t1) } } diff --git a/sm9/bn256/gfp12b6_exp_u.go b/sm9/bn256/gfp12b6_exp_u.go index 9fd09f3..dc2900f 100644 --- a/sm9/bn256/gfp12b6_exp_u.go +++ b/sm9/bn256/gfp12b6_exp_u.go @@ -1,7 +1,7 @@ package bn256 // Use special square -func (e *gfP12b6) gfP12ExpU(x *gfP12b6) *gfP12b6 { +func (e *gfP12b6) Cyclo6PowToU(x *gfP12b6) *gfP12b6 { // The sequence of 10 multiplications and 61 squarings is derived from the // following addition chain generated with github.com/mmcloughlin/addchain v0.4.0. // @@ -21,23 +21,23 @@ func (e *gfP12b6) gfP12ExpU(x *gfP12b6) *gfP12b6 { var t2 = new(gfP12b6) var t3 = new(gfP12b6) - t2.SpecialSquareNC(x) - t1.SpecialSquareNC(t2) + t2.Cyclo6SquareNC(x) + t1.Cyclo6SquareNC(t2) z.MulNC(x, t1) t0.MulNC(t1, z) t2.Mul(t2, t0) t3.MulNC(x, t2) - t3.SpecialSquares(t3, 40) + t3.Cyclo6Squares(t3, 40) t3.Mul(t2, t3) - t3.SpecialSquares(t3, 7) + t3.Cyclo6Squares(t3, 7) t2.Mul(t2, t3) t1.Mul(t1, t2) - t1.SpecialSquares(t1, 4) + t1.Cyclo6Squares(t1, 4) t0.Mul(t0, t1) - t0.SpecialSquare(t0) + t0.Cyclo6Square(t0) t0.Mul(x, t0) - t0.SpecialSquares(t0, 6) + t0.Cyclo6Squares(t0, 6) z.Mul(z, t0) - z.SpecialSquare(z) + z.Cyclo6Square(z) return e } From bbbf2612bc410c572c1907f489667dd8a7fe6ab9 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Wed, 19 Jul 2023 15:26:38 +0800 Subject: [PATCH 2/6] sm9/bn256: fix add same bug --- sm9/bn256/curve.go | 2 +- sm9/bn256/g1_test.go | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/sm9/bn256/curve.go b/sm9/bn256/curve.go index 122920f..61c4133 100644 --- a/sm9/bn256/curve.go +++ b/sm9/bn256/curve.go @@ -133,7 +133,7 @@ func (c *curvePoint) Add(a, b *curvePoint) { gfpSub(t, s2, s1) - if h.Equal(zero) == 1 && t.Equal(one) == 1 { + if h.Equal(zero) == 1 && t.Equal(zero) == 1 { c.Double(a) return } diff --git a/sm9/bn256/g1_test.go b/sm9/bn256/g1_test.go index acc8bb8..25f990f 100644 --- a/sm9/bn256/g1_test.go +++ b/sm9/bn256/g1_test.go @@ -24,6 +24,16 @@ func TestG1AddNeg(t *testing.T) { } } +func TestG1AddSame(t *testing.T) { + g1, g2 := &G1{}, &G1{} + g1.Add(Gen1, Gen1) + g2.Double(Gen1) + + if !g1.Equal(g2) { + t.Fail() + } +} + type g1BaseMultTest struct { k string } From a173646017d6062a8314553779d47fe02abd9b12 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Thu, 20 Jul 2023 17:49:53 +0800 Subject: [PATCH 3/6] internal/sm2ec: optiomization for ADX usage and supplement comments --- internal/sm2ec/p256_asm_amd64.s | 79 ++++++++++++++++++++------------- 1 file changed, 49 insertions(+), 30 deletions(-) diff --git a/internal/sm2ec/p256_asm_amd64.s b/internal/sm2ec/p256_asm_amd64.s index 37f9fa2..979fcc5 100644 --- a/internal/sm2ec/p256_asm_amd64.s +++ b/internal/sm2ec/p256_asm_amd64.s @@ -2095,6 +2095,7 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8 CMPB ·supportBMI2+0(SB), $0x01 JEQ internalMulBMI2 + // [t3, t2, t1, t0] * acc4 MOVQ acc4, mul0 MULQ t0 MOVQ mul0, acc0 @@ -2118,6 +2119,7 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8 ADCQ $0, mul1 MOVQ mul1, acc4 + // [t3, t2, t1, t0] * acc5 MOVQ acc5, mul0 MULQ t0 ADDQ mul0, acc1 @@ -2148,6 +2150,7 @@ TEXT 
sm2P256MulInternal(SB),NOSPLIT,$8 ADCQ $0, mul1 MOVQ mul1, acc5 + // [t3, t2, t1, t0] * acc6 MOVQ acc6, mul0 MULQ t0 ADDQ mul0, acc2 @@ -2178,6 +2181,7 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8 ADCQ $0, mul1 MOVQ mul1, acc6 + // [t3, t2, t1, t0] * acc7 MOVQ acc7, mul0 MULQ t0 ADDQ mul0, acc3 @@ -2207,6 +2211,8 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8 ADDQ mul0, acc6 ADCQ $0, mul1 MOVQ mul1, acc7 + + // T = [acc7, acc6, acc5, acc4, acc3, acc2, acc1, acc0] // First reduction step MOVQ acc0, mul0 MOVQ acc0, mul1 @@ -2292,22 +2298,23 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8 CMOVQCS acc3, acc7 RET + internalMulBMI2: + // [t3, t2, t1, t0] * acc4 MOVQ acc4, mul1 MULXQ t0, acc0, acc1 MULXQ t1, mul0, acc2 ADDQ mul0, acc1 - ADCQ $0, acc2 MULXQ t2, mul0, acc3 - ADDQ mul0, acc2 - ADCQ $0, acc3 + ADCQ mul0, acc2 MULXQ t3, mul0, acc4 - ADDQ mul0, acc3 + ADCQ mul0, acc3 ADCQ $0, acc4 + // [t3, t2, t1, t0] * acc5 MOVQ acc5, mul1 MULXQ t0, mul0, hlp ADDQ mul0, acc1 @@ -2328,6 +2335,7 @@ internalMulBMI2: ADDQ mul0, acc4 ADCQ $0, acc5 + // [t3, t2, t1, t0] * acc6 MOVQ acc6, mul1 MULXQ t0, mul0, hlp ADDQ mul0, acc2 @@ -2348,6 +2356,7 @@ internalMulBMI2: ADDQ mul0, acc5 ADCQ $0, acc6 + // [t3, t2, t1, t0] * acc7 MOVQ acc7, mul1 MULXQ t0, mul0, hlp ADDQ mul0, acc3 @@ -2368,6 +2377,7 @@ internalMulBMI2: ADDQ mul0, acc6 ADCQ $0, acc7 + // T = [acc7, acc6, acc5, acc4, acc3, acc2, acc1, acc0] // First reduction step MOVQ acc0, mul0 MOVQ acc0, mul1 @@ -2544,6 +2554,7 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8 CMPB ·supportBMI2+0(SB), $0x01 JEQ internalSqrBMI2 + // [acc7, acc6, acc5] * acc4 MOVQ acc4, mul0 MULQ acc5 MOVQ mul0, acc1 @@ -2561,6 +2572,7 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8 ADCQ $0, mul1 MOVQ mul1, t0 + // [acc7, acc6] * acc5 MOVQ acc5, mul0 MULQ acc6 ADDQ mul0, acc3 @@ -2575,6 +2587,7 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8 ADCQ $0, mul1 MOVQ mul1, t1 + // acc7 * acc6 MOVQ acc6, mul0 MULQ acc7 ADDQ mul0, t1 @@ -2615,64 +2628,70 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8 ADCQ mul0, t2 ADCQ DX, t3 + // T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0] sm2P256SqrReductionInternal() RET internalSqrBMI2: + XORQ t3, t3 + + // [acc7, acc6, acc5] * acc4 MOVQ acc4, mul1 MULXQ acc5, acc1, acc2 MULXQ acc6, mul0, acc3 - ADDQ mul0, acc2 + ADOXQ mul0, acc2 MULXQ acc7, mul0, t0 - ADCQ mul0, acc3 - ADCQ $0, t0 + ADOXQ mul0, acc3 + ADOXQ t3, t0 + // [acc7, acc6] * acc5 MOVQ acc5, mul1 MULXQ acc6, mul0, hlp - ADDQ mul0, acc3 - ADCQ hlp, t0 + ADOXQ mul0, acc3 MULXQ acc7, mul0, t1 - ADCQ $0, t1 - ADDQ mul0, t0 + ADCXQ hlp, mul0 + ADOXQ mul0, t0 + ADCXQ t3, t1 + // acc7 * acc6 MOVQ acc6, mul1 MULXQ acc7, mul0, t2 - ADCQ mul0, t1 - ADCQ $0, t2 - XORQ t3, t3 - + ADOXQ mul0, t1 + ADOXQ t3, t2 + // *2 - ADDQ acc1, acc1 - ADCQ acc2, acc2 - ADCQ acc3, acc3 - ADCQ t0, t0 - ADCQ t1, t1 - ADCQ t2, t2 - ADCQ $0, t3 + ADOXQ acc1, acc1 + ADOXQ acc2, acc2 + ADOXQ acc3, acc3 + ADOXQ t0, t0 + ADOXQ t1, t1 + ADOXQ t2, t2 + ADOXQ t3, t3 // Missing products MOVQ acc4, mul1 MULXQ mul1, acc0, acc4 - ADDQ acc4, acc1 + ADCXQ acc4, acc1 MOVQ acc5, mul1 MULXQ mul1, mul0, acc4 - ADCQ mul0, acc2 - ADCQ acc4, acc3 + ADCXQ mul0, acc2 + ADCXQ acc4, acc3 MOVQ acc6, mul1 MULXQ mul1, mul0, acc4 - ADCQ mul0, t0 - ADCQ acc4, t1 + ADCXQ mul0, t0 + ADCXQ acc4, t1 MOVQ acc7, mul1 MULXQ mul1, mul0, acc4 - ADCQ mul0, t2 - ADCQ acc4, t3 - + ADCXQ mul0, t2 + ADCXQ acc4, t3 + + // T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0] sm2P256SqrReductionInternal() RET From 16b2a43dc3e023e8e51dd84e81c1d6e17c11fa43 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Fri, 21 Jul 
2023 17:39:06 +0800 Subject: [PATCH 4/6] sm9/bn256: complete addition fomulas and gfp2 amd64 asm #144 --- sm9/bn256/bn_pair.go | 24 +- sm9/bn256/bn_pair_b6.go | 2 +- sm9/bn256/curve.go | 251 +++--- sm9/bn256/g1.go | 5 +- sm9/bn256/g1_test.go | 238 ++++- sm9/bn256/gfp12_b6.go | 6 +- sm9/bn256/gfp2.go | 101 +-- sm9/bn256/gfp2_g1_amd64.s | 1653 ++++++++++++++++++++++++++++++++++ sm9/bn256/gfp2_g1_decl.go | 36 + sm9/bn256/gfp2_g1_generic.go | 193 ++++ sm9/bn256/gfp2_test.go | 2 +- sm9/bn256/gfp4.go | 28 +- sm9/bn256/gfp6.go | 28 +- sm9/bn256/gfp_amd64.s | 12 +- sm9/bn256/gfp_test.go | 3 +- sm9/bn256/twist.go | 174 ++-- 16 files changed, 2367 insertions(+), 389 deletions(-) create mode 100644 sm9/bn256/gfp2_g1_amd64.s create mode 100644 sm9/bn256/gfp2_g1_decl.go create mode 100644 sm9/bn256/gfp2_g1_generic.go diff --git a/sm9/bn256/bn_pair.go b/sm9/bn256/bn_pair.go index 10e8daf..8b99c2f 100644 --- a/sm9/bn256/bn_pair.go +++ b/sm9/bn256/bn_pair.go @@ -4,23 +4,23 @@ package bn256 func lineFunctionAdd(r, p, rOut *twistPoint, q *curvePoint, r2, a, b, c *gfP2) { // See the mixed addition algorithm from "Faster Computation of the // Tate Pairing", http://arxiv.org/pdf/0904.0854v3.pdf - B := (&gfP2{}).MulNC(&p.x, &r.t) // B = Xp * Zr^2 + B := (&gfP2{}).Mul(&p.x, &r.t) // B = Xp * Zr^2 D := (&gfP2{}).Add(&p.y, &r.z) // D = Yp + Zr D.Square(D).Sub(D, r2).Sub(D, &r.t).Mul(D, &r.t) // D = ((Yp + Zr)^2 - Zr^2 - Yp^2)*Zr^2 = 2Yp*Zr^3 H := (&gfP2{}).Sub(B, &r.x) // H = Xp * Zr^2 - Xr - I := (&gfP2{}).SquareNC(H) // I = (Xp * Zr^2 - Xr)^2 = Xp^2*Zr^4 + Xr^2 - 2Xr*Xp*Zr^2 + I := (&gfP2{}).Square(H) // I = (Xp * Zr^2 - Xr)^2 = Xp^2*Zr^4 + Xr^2 - 2Xr*Xp*Zr^2 E := (&gfP2{}).Double(I) // E = 2*(Xp * Zr^2 - Xr)^2 E.Double(E) // E = 4*(Xp * Zr^2 - Xr)^2 - J := (&gfP2{}).MulNC(H, E) // J = 4*(Xp * Zr^2 - Xr)^3 + J := (&gfP2{}).Mul(H, E) // J = 4*(Xp * Zr^2 - Xr)^3 L1 := (&gfP2{}).Sub(D, &r.y) // L1 = 2Yp*Zr^3 - Yr L1.Sub(L1, &r.y) // L1 = 2Yp*Zr^3 - 2*Yr - V := (&gfP2{}).MulNC(&r.x, E) // V = 4 * Xr * (Xp * Zr^2 - Xr)^2 + V := (&gfP2{}).Mul(&r.x, E) // V = 4 * Xr * (Xp * Zr^2 - Xr)^2 rOut.x.Square(L1).Sub(&rOut.x, J).Sub(&rOut.x, V).Sub(&rOut.x, V) // rOut.x = L1^2 - J - 2V @@ -28,11 +28,11 @@ func lineFunctionAdd(r, p, rOut *twistPoint, q *curvePoint, r2, a, b, c *gfP2) { t := (&gfP2{}).Sub(V, &rOut.x) // t = V - rOut.x t.Mul(t, L1) // t = L1*(V-rOut.x) - t2 := (&gfP2{}).MulNC(&r.y, J) + t2 := (&gfP2{}).Mul(&r.y, J) t2.Double(t2) // t2 = 2Yr * J rOut.y.Sub(t, t2) // rOut.y = L1*(V-rOut.x) - 2Yr*J - rOut.t.SquareNC(&rOut.z) + rOut.t.Square(&rOut.z) // t = (Yp + rOut.Z)^2 - Yp^2 - rOut.Z^2 = 2Yp*rOut.Z t.Add(&p.y, &rOut.z).Square(t).Sub(t, r2).Sub(t, &rOut.t) @@ -51,9 +51,9 @@ func lineFunctionAdd(r, p, rOut *twistPoint, q *curvePoint, r2, a, b, c *gfP2) { func lineFunctionDouble(r, rOut *twistPoint, q *curvePoint, a, b, c *gfP2) { // See the doubling algorithm for a=0 from "Faster Computation of the // Tate Pairing", http://arxiv.org/pdf/0904.0854v3.pdf - A := (&gfP2{}).SquareNC(&r.x) - B := (&gfP2{}).SquareNC(&r.y) - C := (&gfP2{}).SquareNC(B) // C = Yr ^ 4 + A := (&gfP2{}).Square(&r.x) + B := (&gfP2{}).Square(&r.y) + C := (&gfP2{}).Square(B) // C = Yr ^ 4 D := (&gfP2{}).Add(&r.x, B) D.Square(D).Sub(D, A).Sub(D, C).Double(D) @@ -61,7 +61,7 @@ func lineFunctionDouble(r, rOut *twistPoint, q *curvePoint, a, b, c *gfP2) { E := (&gfP2{}).Double(A) // E.Add(E, A) // E = 3 * Xr ^ 2 - G := (&gfP2{}).SquareNC(E) // G = 9 * Xr^4 + G := (&gfP2{}).Square(E) // G = 9 * Xr^4 rOut.x.Sub(G, D).Sub(&rOut.x, D) @@ -72,7 +72,7 @@ 
func lineFunctionDouble(r, rOut *twistPoint, q *curvePoint, a, b, c *gfP2) { t.Double(t).Double(t) // t = 8 * Yr ^ 4 rOut.y.Sub(&rOut.y, t) - rOut.t.SquareNC(&rOut.z) + rOut.t.Square(&rOut.z) t.Mul(E, &r.t).Double(t) // t = 2(E * Tr) b.Neg(t) // b = -2(E * Tr) @@ -127,7 +127,7 @@ func miller(q *twistPoint, p *curvePoint) *gfP12 { r := &twistPoint{} r.Set(aAffine) - r2 := (&gfP2{}).SquareNC(&aAffine.y) + r2 := (&gfP2{}).Square(&aAffine.y) a, b, c := &gfP2{}, &gfP2{}, &gfP2{} newR := &twistPoint{} diff --git a/sm9/bn256/bn_pair_b6.go b/sm9/bn256/bn_pair_b6.go index 6435beb..6957469 100644 --- a/sm9/bn256/bn_pair_b6.go +++ b/sm9/bn256/bn_pair_b6.go @@ -50,7 +50,7 @@ func millerB6(q *twistPoint, p *curvePoint) *gfP12b6 { r := &twistPoint{} r.Set(aAffine) - r2 := (&gfP2{}).SquareNC(&aAffine.y) + r2 := (&gfP2{}).Square(&aAffine.y) a, b, c := &gfP2{}, &gfP2{}, &gfP2{} newR := &twistPoint{} diff --git a/sm9/bn256/curve.go b/sm9/bn256/curve.go index 61c4133..aa46f4f 100644 --- a/sm9/bn256/curve.go +++ b/sm9/bn256/curve.go @@ -12,6 +12,7 @@ type curvePoint struct { } var curveB = newGFp(5) +var threeCurveB = newGFp(3 * 5) // curveGen is the generator of G₁. var curveGen = &curvePoint{ @@ -82,125 +83,6 @@ func (c *curvePoint) IsInfinity() bool { return c.z.Equal(zero) == 1 } -func (c *curvePoint) Add(a, b *curvePoint) { - if a.IsInfinity() { - c.Set(b) - return - } - if b.IsInfinity() { - c.Set(a) - return - } - - // See http://hyperelliptic.org/EFD/g1p/auto-code/shortw/jacobian-0/addition/add-2007-bl.op3 - - // Normalize the points by replacing a = [x1:y1:z1] and b = [x2:y2:z2] - // by [u1:s1:z1·z2] and [u2:s2:z1·z2] - // where u1 = x1·z2², s1 = y1·z2³ and u1 = x2·z1², s2 = y2·z1³ - z12, z22 := &gfP{}, &gfP{} - gfpSqr(z12, &a.z, 1) - gfpSqr(z22, &b.z, 1) - - u1, u2 := &gfP{}, &gfP{} - gfpMul(u1, &a.x, z22) - gfpMul(u2, &b.x, z12) - - t, s1 := &gfP{}, &gfP{} - gfpMul(t, &b.z, z22) - gfpMul(s1, &a.y, t) - - s2 := &gfP{} - gfpMul(t, &a.z, z12) - gfpMul(s2, &b.y, t) - - // Compute x = (2h)²(s²-u1-u2) - // where s = (s2-s1)/(u2-u1) is the slope of the line through - // (u1,s1) and (u2,s2). The extra factor 2h = 2(u2-u1) comes from the value of z below. - // This is also: - // 4(s2-s1)² - 4h²(u1+u2) = 4(s2-s1)² - 4h³ - 4h²(2u1) - // = r² - j - 2v - // with the notations below. 
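(For reference, the Jacobian mixed addition deleted here follows add-2007-bl: with h = u2 - u1, r = 2(s2 - s1), i = (2h)², j = h·i and v = u1·i, it computes x3 = r² - j - 2v, y3 = r(v - x3) - 2·s1·j and z3 = ((z1 + z2)² - z1² - z2²)·h = 2h·z1·z2. The complete projective formulas that replace it appear later in this patch.)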
-	h := &gfP{}
-	gfpSub(h, u2, u1)
-
-	gfpDouble(t, h)
-	// i = 4h²
-	i := &gfP{}
-	gfpSqr(i, t, 1)
-	// j = 4h³
-	j := &gfP{}
-	gfpMul(j, h, i)
-
-	gfpSub(t, s2, s1)
-
-	if h.Equal(zero) == 1 && t.Equal(zero) == 1 {
-		c.Double(a)
-		return
-	}
-	r := &gfP{}
-	gfpDouble(r, t)
-
-	v := &gfP{}
-	gfpMul(v, u1, i)
-
-	// t4 = 4(s2-s1)²
-	t4, t6 := &gfP{}, &gfP{}
-	gfpSqr(t4, r, 1)
-	gfpDouble(t, v)
-	gfpSub(t6, t4, j)
-
-	gfpSub(&c.x, t6, t)
-
-	// Set y = -(2h)³(s1 + s*(x/4h²-u1))
-	// This is also
-	// y = - 2·s1·j - (s2-s1)(2x - 2i·u1) = r(v-x) - 2·s1·j
-	gfpSub(t, v, &c.x) // t7
-	gfpMul(t4, s1, j)  // t8
-	gfpDouble(t6, t4)  // t9
-	gfpMul(t4, r, t)   // t10
-	gfpSub(&c.y, t4, t6)
-
-	// Set z = 2(u2-u1)·z1·z2 = 2h·z1·z2
-	gfpAdd(t, &a.z, &b.z) // t11
-	gfpSqr(t4, t, 1)      // t12
-	gfpSub(t, t4, z12)    // t13
-	gfpSub(t4, t, z22)    // t14
-	gfpMul(&c.z, t4, h)
-}
-
-func (c *curvePoint) Double(a *curvePoint) {
-	// See http://hyperelliptic.org/EFD/g1p/auto-code/shortw/jacobian-0/doubling/dbl-2009-l.op3
-	A, B, C := &gfP{}, &gfP{}, &gfP{}
-	gfpSqr(A, &a.x, 1)
-	gfpSqr(B, &a.y, 1)
-	gfpSqr(C, B, 1)
-
-	t, t2 := &gfP{}, &gfP{}
-	gfpAdd(t, &a.x, B)
-	gfpSqr(t2, t, 1)
-	gfpSub(t, t2, A)
-	gfpSub(t2, t, C)
-
-	d, e, f := &gfP{}, &gfP{}, &gfP{}
-	gfpDouble(d, t2)
-	gfpDouble(t, A)
-	gfpAdd(e, t, A)
-	gfpSqr(f, e, 1)
-
-	gfpDouble(t, d)
-	gfpSub(&c.x, f, t)
-
-	gfpMul(&c.z, &a.y, &a.z)
-	gfpDouble(&c.z, &c.z)
-
-	gfpDouble(t, C)
-	gfpDouble(t2, t)
-	gfpDouble(t, t2)
-	gfpSub(&c.y, d, &c.x)
-	gfpMul(t2, e, &c.y)
-	gfpSub(&c.y, t2, t)
-}
-
 func (c *curvePoint) Mul(a *curvePoint, scalar *big.Int) {
 	sum, t := &curvePoint{}, &curvePoint{}
 	sum.SetInfinity()
@@ -217,7 +99,10 @@ func (c *curvePoint) Mul(a *curvePoint, scalar *big.Int) {
 	c.Set(sum)
 }
 
-func (c *curvePoint) MakeAffine() {
+// AffineFromJacobian reverses the Jacobian transform.
+// The Jacobian coordinates are (x1, y1, z1),
+// where x = x1/z1² and y = y1/z1³.
+func (c *curvePoint) AffineFromJacobian() {
 	if c.z.Equal(one) == 1 {
 		return
 	} else if c.z.Equal(zero) == 1 {
@@ -231,11 +116,11 @@ func (c *curvePoint) MakeAffine() {
 	zInv.Invert(&c.z)
 
 	t, zInv2 := &gfP{}, &gfP{}
-	gfpMul(t, &c.y, zInv)
+	gfpMul(t, &c.y, zInv) // t = y/z
 	gfpSqr(zInv2, zInv, 1)
 
-	gfpMul(&c.x, &c.x, zInv2)
-	gfpMul(&c.y, t, zInv2)
+	gfpMul(&c.x, &c.x, zInv2) // x = x / z^2
+	gfpMul(&c.y, t, zInv2)    // y = y / z^3
 
 	c.z.Set(one)
 	c.t.Set(one)
@@ -265,3 +150,123 @@ func (table *curvePointTable) Select(p *curvePoint, n uint8) {
 		curvePointMovCond(p, f, p, cond)
 	}
 }
+
+// Equal reports whether e and other represent the same point.
+func (e *curvePoint) Equal(other *curvePoint) bool {
+	return e.x.Equal(&other.x) == 1 &&
+		e.y.Equal(&other.y) == 1 &&
+		e.z.Equal(&other.z) == 1 &&
+		e.t.Equal(&other.t) == 1
+}
+
+// The methods below are still a proof of concept; the line add/double
+// functions are still based on Jacobian coordinates.
+func (c *curvePoint) Add(p1, p2 *curvePoint) {
+	// Complete addition formula for a = 0 from "Complete addition formulas for
+	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §3.2.
+	// Algorithm 7: Complete, projective point addition for prime order j-invariant 0 short Weierstrass curves.
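+	//
+	// Unlike the Jacobian formulas removed above, this path is uniform: the
+	// cases p1 == p2, p1 == -p2 and inputs at infinity need no branches. As
+	// the operation sequence below shows, it costs 12 general field
+	// multiplications plus 2 multiplications by the constant 3b (threeCurveB).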
+
+	t0, t1, t2, t3, t4 := new(gfP), new(gfP), new(gfP), new(gfP), new(gfP)
+	x3, y3, z3 := new(gfP), new(gfP), new(gfP)
+	gfpMul(t0, &p1.x, &p2.x)    // t0 := X1X2
+	gfpMul(t1, &p1.y, &p2.y)    // t1 := Y1Y2
+	gfpMul(t2, &p1.z, &p2.z)    // t2 := Z1Z2
+	gfpAdd(t3, &p1.x, &p1.y)    // t3 := X1 + Y1
+	gfpAdd(t4, &p2.x, &p2.y)    // t4 := X2 + Y2
+	gfpMul(t3, t3, t4)          // t3 := t3 * t4 = (X1 + Y1) * (X2 + Y2)
+	gfpAdd(t4, t0, t1)          // t4 := t0 + t1
+	gfpSub(t3, t3, t4)          // t3 := t3 - t4 = X1Y2 + X2Y1
+	gfpAdd(t4, &p1.y, &p1.z)    // t4 := Y1 + Z1
+	gfpAdd(x3, &p2.y, &p2.z)    // X3 := Y2 + Z2
+	gfpMul(t4, t4, x3)          // t4 := t4 * X3 = (Y1 + Z1)(Y2 + Z2)
+	gfpAdd(x3, t1, t2)          // X3 := t1 + t2
+	gfpSub(t4, t4, x3)          // t4 := t4 - X3 = Y1Z2 + Y2Z1
+	gfpAdd(x3, &p1.x, &p1.z)    // X3 := X1 + Z1
+	gfpAdd(y3, &p2.x, &p2.z)    // Y3 := X2 + Z2
+	gfpMul(x3, x3, y3)          // X3 := X3 * Y3
+	gfpAdd(y3, t0, t2)          // Y3 := t0 + t2
+	gfpSub(y3, x3, y3)          // Y3 := X3 - Y3 = X1Z2 + X2Z1
+	gfpTriple(t0, t0)           // t0 := t0 + t0 + t0 = 3X1X2
+	gfpMul(t2, threeCurveB, t2) // t2 := 3b * t2 = 3bZ1Z2
+	gfpAdd(z3, t1, t2)          // Z3 := t1 + t2 = Y1Y2 + 3bZ1Z2
+	gfpSub(t1, t1, t2)          // t1 := t1 - t2 = Y1Y2 - 3bZ1Z2
+	gfpMul(y3, threeCurveB, y3) // Y3 := 3b * Y3 = 3b(X1Z2 + X2Z1)
+	gfpMul(x3, t4, y3)          // X3 := t4 * Y3 = 3b(X1Z2 + X2Z1)(Y1Z2 + Y2Z1)
+	gfpMul(t2, t3, t1)          // t2 := t3 * t1 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2)
+	gfpSub(x3, t2, x3)          // X3 := t2 - X3 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - 3b(Y1Z2 + Y2Z1)(X1Z2 + X2Z1)
+	gfpMul(y3, y3, t0)          // Y3 := Y3 * t0 = 9bX1X2(X1Z2 + X2Z1)
+	gfpMul(t1, t1, z3)          // t1 := t1 * Z3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2)
+	gfpAdd(y3, t1, y3)          // Y3 := t1 + Y3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + 9bX1X2(X1Z2 + X2Z1)
+	gfpMul(t0, t0, t3)          // t0 := t0 * t3 = 3X1X2(X1Y2 + X2Y1)
+	gfpMul(z3, z3, t4)          // Z3 := Z3 * t4 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2)
+	gfpAdd(z3, z3, t0)          // Z3 := Z3 + t0 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + 3X1X2(X1Y2 + X2Y1)
+
+	c.x.Set(x3)
+	c.y.Set(y3)
+	c.z.Set(z3)
+}
+
+func (c *curvePoint) AddComplete(p1, p2 *curvePoint) {
+	c.Add(p1, p2)
+}
+
+func (c *curvePoint) Double(p *curvePoint) {
+	// Complete addition formula for a = 0 from "Complete addition formulas for
+	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §3.2.
+	// Algorithm 9: Exception-free point doubling for prime order j-invariant 0 short Weierstrass curves.
+	t0, t1, t2 := new(gfP), new(gfP), new(gfP)
+	x3, y3, z3 := new(gfP), new(gfP), new(gfP)
+
+	gfpSqr(t0, &p.y, 1)         // t0 := Y^2
+	gfpDouble(z3, t0)           // Z3 := t0 + t0
+	gfpDouble(z3, z3)           // Z3 := Z3 + Z3
+	gfpDouble(z3, z3)           // Z3 := Z3 + Z3
+	gfpMul(t1, &p.y, &p.z)      // t1 := YZ
+	gfpSqr(t2, &p.z, 1)         // t2 := Z^2
+	gfpMul(t2, threeCurveB, t2) // t2 := 3b * t2 = 3bZ^2
+	gfpMul(x3, t2, z3)          // X3 := t2 * Z3
+	gfpAdd(y3, t0, t2)          // Y3 := t0 + t2
+	gfpMul(z3, t1, z3)          // Z3 := t1 * Z3
+	gfpTriple(t2, t2)           // t2 := t2 + t2 + t2
+	gfpSub(t0, t0, t2)          // t0 := t0 - t2
+	gfpMul(y3, t0, y3)          // Y3 := t0 * Y3
+	gfpAdd(y3, x3, y3)          // Y3 := X3 + Y3
+	gfpMul(t1, &p.x, &p.y)      // t1 := XY
+	gfpMul(x3, t0, t1)          // X3 := t0 * t1
+	gfpDouble(x3, x3)           // X3 := X3 + X3
+
+	c.x.Set(x3)
+	c.y.Set(y3)
+	c.z.Set(z3)
+}
+
+func (c *curvePoint) DoubleComplete(p *curvePoint) {
+	c.Double(p)
+}
+
+// MakeAffine reverses the Projective transform.
+// A = 1/Z1
+// X3 = A*X1
+// Y3 = A*Y1
+// Z3 = 1
+func (c *curvePoint) MakeAffine() {
+	// TODO: do we need to change it to constant-time implementation?
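+	// With the projective coordinates used above, x = X/Z and y = Y/Z, so a
+	// single field inversion of Z recovers both affine coordinates.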
+ if c.z.Equal(one) == 1 { + return + } else if c.z.Equal(zero) == 1 { + c.x.Set(zero) + c.y.Set(one) + c.t.Set(zero) + return + } + zInv := &gfP{} + zInv.Invert(&c.z) + gfpMul(&c.x, &c.x, zInv) + gfpMul(&c.y, &c.y, zInv) + c.z.Set(one) + c.t.Set(one) +} + +func (c *curvePoint) AffineFromProjective() { + c.MakeAffine() +} diff --git a/sm9/bn256/g1.go b/sm9/bn256/g1.go index e29df80..672177c 100644 --- a/sm9/bn256/g1.go +++ b/sm9/bn256/g1.go @@ -368,10 +368,7 @@ func (e *G1) Equal(other *G1) bool { if e.p == nil && other.p == nil { return true } - return e.p.x.Equal(&other.p.x) == 1 && - e.p.y.Equal(&other.p.y) == 1 && - e.p.z.Equal(&other.p.z) == 1 && - e.p.t.Equal(&other.p.t) == 1 + return e.p.Equal(other.p) } // IsOnCurve returns true if e is on the curve. diff --git a/sm9/bn256/g1_test.go b/sm9/bn256/g1_test.go index 25f990f..3894e36 100644 --- a/sm9/bn256/g1_test.go +++ b/sm9/bn256/g1_test.go @@ -3,6 +3,7 @@ package bn256 import ( "bytes" "crypto/rand" + "fmt" "io" "math/big" "testing" @@ -34,6 +35,14 @@ func TestG1AddSame(t *testing.T) { } } +func TestCurvePointDouble(t *testing.T) { + p := &curvePoint{} + p.Double(p) + if !p.IsInfinity() { + t.Fail() + } +} + type g1BaseMultTest struct { k string } @@ -167,35 +176,84 @@ func TestG1BaseMult(t *testing.T) { } } -func TestG1ScaleMult(t *testing.T) { - k, e, err := RandomG1(rand.Reader) +func TestG1ScalarMult(t *testing.T) { + checkScalar := func(t *testing.T, scalar []byte) { + p1, err := (&G1{}).ScalarBaseMult(scalar) + fatalIfErr(t, err) + p2, err := (&G1{}).ScalarMult(Gen1, scalar) + fatalIfErr(t, err) + p1.p.MakeAffine() + p2.p.MakeAffine() + if !p1.Equal(p2) { + t.Error("[k]G != ScalarBaseMult(k)") + } + + d := new(big.Int).SetBytes(scalar) + d.Sub(Order, d) + d.Mod(d, Order) + g1, err := (&G1{}).ScalarBaseMult(d.FillBytes(make([]byte, len(scalar)))) + fatalIfErr(t, err) + g1.Add(g1, p1) + g1.p.MakeAffine() + if !g1.p.IsInfinity() { + t.Error("[N - k]G + [k]G != ∞") + } + } + + byteLen := len(Order.Bytes()) + bitLen := Order.BitLen() + t.Run("0", func(t *testing.T) { checkScalar(t, make([]byte, byteLen)) }) + t.Run("1", func(t *testing.T) { + checkScalar(t, big.NewInt(1).FillBytes(make([]byte, byteLen))) + }) + t.Run("N-6", func(t *testing.T) { + checkScalar(t, new(big.Int).Sub(Order, big.NewInt(6)).Bytes()) + }) + t.Run("N-1", func(t *testing.T) { + checkScalar(t, new(big.Int).Sub(Order, big.NewInt(1)).Bytes()) + }) + t.Run("N", func(t *testing.T) { checkScalar(t, Order.Bytes()) }) + t.Run("N+1", func(t *testing.T) { + checkScalar(t, new(big.Int).Add(Order, big.NewInt(1)).Bytes()) + }) + t.Run("N+22", func(t *testing.T) { + checkScalar(t, new(big.Int).Add(Order, big.NewInt(22)).Bytes()) + }) + t.Run("all1s", func(t *testing.T) { + s := new(big.Int).Lsh(big.NewInt(1), uint(bitLen)) + s.Sub(s, big.NewInt(1)) + checkScalar(t, s.Bytes()) + }) + if testing.Short() { + return + } + for i := 0; i < bitLen; i++ { + t.Run(fmt.Sprintf("1<<%d", i), func(t *testing.T) { + s := new(big.Int).Lsh(big.NewInt(1), uint(i)) + checkScalar(t, s.FillBytes(make([]byte, byteLen))) + }) + } + for i := 0; i <= 64; i++ { + t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { + checkScalar(t, big.NewInt(int64(i)).FillBytes(make([]byte, byteLen))) + }) + } + + // Test N-64...N+64 since they risk overlapping with precomputed table values + // in the final additions. 
+ for i := int64(-64); i <= 64; i++ { + t.Run(fmt.Sprintf("N%+d", i), func(t *testing.T) { + checkScalar(t, new(big.Int).Add(Order, big.NewInt(i)).Bytes()) + }) + } + +} + +func fatalIfErr(t *testing.T, err error) { + t.Helper() if err != nil { t.Fatal(err) } - e.p.MakeAffine() - - e2, e3 := &G1{}, &G1{} - - if e2.p == nil { - e2.p = &curvePoint{} - } - - e2.p.Mul(curveGen, k) - e2.p.MakeAffine() - - if !e.Equal(e2) { - t.Errorf("not same") - } - - _, err = e3.ScalarMult(Gen1, NormalizeScalar(k.Bytes())) - if err != nil { - t.Fatal(err) - } - e3.p.MakeAffine() - - if !e.Equal(e3) { - t.Errorf("not same") - } } func TestFuzz(t *testing.T) { @@ -552,3 +610,131 @@ func BenchmarkMarshalUnmarshal(b *testing.B) { }) }) } + +func TestCurvePointAddComplete(t *testing.T) { + t.Run("normal case", func(t *testing.T) { + p1 := &curvePoint{} + curvePointDouble(p1, curveGen) + p1.AffineFromJacobian() + + p2 := &curvePoint{} + p2.AddComplete(p1, curveGen) + p2.AffineFromProjective() + + p3 := &curvePoint{} + curvePointAdd(p3, curveGen, p1) + p3.AffineFromJacobian() + + if !p2.Equal(p3) { + t.Errorf("Got %v, expected %v", p2, p3) + } + }) + t.Run("exception case: double", func(t *testing.T) { + p2 := &curvePoint{} + p2.AddComplete(curveGen, curveGen) + p2.AffineFromProjective() + + p3 := &curvePoint{} + curvePointDouble(p3, curveGen) + p3.AffineFromJacobian() + if !p2.Equal(p3) { + t.Errorf("Got %v, expected %v", p2, p3) + } + }) + t.Run("exception case: neg", func(t *testing.T) { + p1 := &curvePoint{} + p1.Neg(curveGen) + p2 := &curvePoint{} + p2.AddComplete(curveGen, p1) + p2.AffineFromProjective() + if !p2.IsInfinity() { + t.Fatal("should be infinity") + } + }) + t.Run("exception case: IsInfinity", func(t *testing.T) { + p1 := &curvePoint{} + p1.SetInfinity() + p2 := &curvePoint{} + p2.AddComplete(curveGen, p1) + p2.AffineFromProjective() + if !p2.Equal(curveGen) { + t.Fatal("should be curveGen") + } + p2.AddComplete(p1, curveGen) + p2.AffineFromProjective() + if !p2.Equal(curveGen) { + t.Fatal("should be curveGen") + } + p2.AddComplete(p1, p1) + p2.AffineFromProjective() + if !p2.IsInfinity() { + t.Fatal("should be infinity") + } + }) +} + +func BenchmarkAddPoint(b *testing.B) { + p1 := &curvePoint{} + curvePointDouble(p1, curveGen) + p1.AffineFromJacobian() + p2 := &curvePoint{} + + b.Run("Add complete", func(b *testing.B) { + b.ReportAllocs() + for i := 0; i < b.N; i++ { + p2.AddComplete(curveGen, p1) + } + }) + + b.Run("Add traditional", func(b *testing.B) { + b.ReportAllocs() + for i := 0; i < b.N; i++ { + curvePointAdd(p2, curveGen, p1) + } + }) +} + +func TestCurvePointDobuleComplete(t *testing.T) { + t.Run("normal case", func(t *testing.T) { + p2 := &curvePoint{} + p2.DoubleComplete(curveGen) + p2.AffineFromProjective() + + p3 := &curvePoint{} + curvePointDouble(p3, curveGen) + p3.AffineFromJacobian() + + if !p2.Equal(p3) { + t.Errorf("Got %v, expected %v", p2, p3) + } + }) + + t.Run("exception case: IsInfinity", func(t *testing.T) { + p1 := &curvePoint{} + p1.SetInfinity() + p2 := &curvePoint{} + p2.DoubleComplete(p1) + p2.AffineFromProjective() + if !p2.IsInfinity() { + t.Fatal("should be infinity") + } + }) +} + +func BenchmarkDoublePoint(b *testing.B) { + p2 := &curvePoint{} + + b.Run("Double complete", func(b *testing.B) { + b.ReportAllocs() + for i := 0; i < b.N; i++ { + p2.DoubleComplete(curveGen) + } + }) + + b.Run("Double traditional", func(b *testing.B) { + b.ReportAllocs() + for i := 0; i < b.N; i++ { + curvePointDouble(p2, curveGen) + } + }) +} diff --git a/sm9/bn256/gfp12_b6.go 
b/sm9/bn256/gfp12_b6.go index 2a27fc1..e8d84c1 100644 --- a/sm9/bn256/gfp12_b6.go +++ b/sm9/bn256/gfp12_b6.go @@ -380,11 +380,11 @@ func (e *gfP12b6) Cyclo6Squares(a *gfP12b6, n int) *gfP12b6 { } func gfP4Square(retX, retY, x, y *gfP2) { - retX.SquareUNC(x) - retY.SquareNC(y) + retX.SquareU(x) + retY.Square(y) retY.Add(retX, retY) - retX.MulNC(x, y) + retX.Mul(x, y) retX.Add(retX, retX) } diff --git a/sm9/bn256/gfp2.go b/sm9/bn256/gfp2.go index 0c46e5b..d029b51 100644 --- a/sm9/bn256/gfp2.go +++ b/sm9/bn256/gfp2.go @@ -116,37 +116,7 @@ func (e *gfP2) Triple(a *gfP2) *gfP2 { // c0 = a0*b0 - 2a1*b1 // c1 = (a0 + a1)(b0 + b1) - a0*b0 - a1*b1 = a0*b1 + a1*b0 func (e *gfP2) Mul(a, b *gfP2) *gfP2 { - tmp := &gfP2{} - tmp.MulNC(a, b) - gfp2Copy(e, tmp) - return e -} - -// Mul without value copy, will use e directly, so e can't be same as a and b. -func (e *gfP2) MulNC(a, b *gfP2) *gfP2 { - tx := &e.x - ty := &e.y - v0, v1 := &gfP{}, &gfP{} - - gfpMul(v0, &a.y, &b.y) - gfpMul(v1, &a.x, &b.x) - - gfpAdd(tx, &a.x, &a.y) - gfpAdd(ty, &b.x, &b.y) - gfpMul(tx, tx, ty) - gfpSub(tx, tx, v0) - gfpSub(tx, tx, v1) - - gfpSub(ty, v0, v1) - gfpSub(ty, ty, v1) - - return e -} - -func (e *gfP2) MulU(a, b *gfP2) *gfP2 { - tmp := &gfP2{} - tmp.MulUNC(a, b) - gfp2Copy(e, tmp) + gfp2Mul(e, a, b) return e } @@ -155,26 +125,8 @@ func (e *gfP2) MulU(a, b *gfP2) *gfP2 { // (a0+a1*u)(b0+b1*u)*u=c0+c1*u, where // c1 = (a0*b0 - 2a1*b1)u // c0 = -2 * ((a0 + a1)(b0 + b1) - a0*b0 - a1*b1) = -2 * (a0*b1 + a1*b0) -func (e *gfP2) MulUNC(a, b *gfP2) *gfP2 { - tx := &e.x - ty := &e.y - v0, v1 := &gfP{}, &gfP{} - - gfpMul(v0, &a.y, &b.y) - gfpMul(v1, &a.x, &b.x) - - gfpAdd(tx, &a.x, &a.y) - gfpAdd(ty, &b.x, &b.y) - - gfpMul(ty, tx, ty) - gfpSub(ty, ty, v0) - gfpSub(ty, ty, v1) - gfpDouble(ty, ty) - gfpNeg(ty, ty) - - gfpSub(tx, v0, v1) - gfpSub(tx, tx, v1) - +func (e *gfP2) MulU(a, b *gfP2) *gfP2 { + gfp2MulU(e, a, b) return e } @@ -195,57 +147,14 @@ func (e *gfP2) MulU1(a *gfP2) *gfP2 { func (e *gfP2) Square(a *gfP2) *gfP2 { // Complex squaring algorithm: // (xu+y)² = y^2-2*x^2 + 2*u*x*y - tmp := &gfP2{} - tmp.SquareNC(a) - gfp2Copy(e, tmp) - return e -} - -// Square without value copy, will use e directly, so e can't be same as a. -func (e *gfP2) SquareNC(a *gfP2) *gfP2 { - // Complex squaring algorithm: - // (xu+y)² = y^2-2*x^2 + 2*u*x*y - tx := &e.x - ty := &e.y - - gfpAdd(ty, &a.x, &a.y) - gfpDouble(tx, &a.x) - gfpSub(tx, &a.y, tx) - gfpMul(ty, tx, ty) - gfpMul(tx, &a.x, &a.y) - gfpAdd(ty, tx, ty) - gfpDouble(tx, tx) - + gfp2Square(e, a) return e } func (e *gfP2) SquareU(a *gfP2) *gfP2 { // Complex squaring algorithm: // (xu+y)²*u = (y^2-2*x^2)u - 4*x*y - - tmp := &gfP2{} - tmp.SquareUNC(a) - gfp2Copy(e, tmp) - return e -} - -// SquareU without value copy, will use e directly, so e can't be same as a. 
-func (e *gfP2) SquareUNC(a *gfP2) *gfP2 { - // Complex squaring algorithm: - // (xu+y)²*u = (y^2-2*x^2)u - 4*x*y - tx := &e.x - ty := &e.y - - gfpAdd(tx, &a.x, &a.y) - gfpDouble(ty, &a.x) - gfpSub(ty, &a.y, ty) - gfpMul(tx, tx, ty) - gfpMul(ty, &a.x, &a.y) - gfpAdd(tx, tx, ty) - gfpDouble(ty, ty) - gfpDouble(ty, ty) - gfpNeg(ty, ty) - + gfp2SquareU(e, a) return e } diff --git a/sm9/bn256/gfp2_g1_amd64.s b/sm9/bn256/gfp2_g1_amd64.s new file mode 100644 index 0000000..d8a8f03 --- /dev/null +++ b/sm9/bn256/gfp2_g1_amd64.s @@ -0,0 +1,1653 @@ +//go:build amd64 && !purego +// +build amd64,!purego + +#include "textflag.h" + +/* ---------------------------------------*/ +#define mul0 AX +#define mul1 DX +#define acc0 BX +#define acc1 CX +#define acc2 R8 +#define acc3 R9 +#define acc4 R10 +#define acc5 R11 +#define acc6 R12 +#define acc7 R13 +#define t0 R14 +#define t1 R15 +#define t2 DI +#define t3 SI +#define hlp BP +/* ---------------------------------------*/ +// (acc7, acc6, acc5, acc4) = (acc7, acc6, acc5, acc4) - (t3, t2, t1, t0) +TEXT gfpSubInternal(SB),NOSPLIT,$0 + XORQ mul0, mul0 + SUBQ t0, acc4 + SBBQ t1, acc5 + SBBQ t2, acc6 + SBBQ t3, acc7 + SBBQ $0, mul0 + + MOVQ acc4, acc0 + MOVQ acc5, acc1 + MOVQ acc6, acc2 + MOVQ acc7, acc3 + + ADDQ ·p2+0(SB), acc4 + ADCQ ·p2+8(SB), acc5 + ADCQ ·p2+16(SB), acc6 + ADCQ ·p2+24(SB), acc7 + ANDQ $1, mul0 + + // CMOVQEQ: Move if equal (ZF == 1) + CMOVQEQ acc0, acc4 + CMOVQEQ acc1, acc5 + CMOVQEQ acc2, acc6 + CMOVQEQ acc3, acc7 + + RET + +/* ---------------------------------------*/ +// (acc7, acc6, acc5, acc4) = (acc7, acc6, acc5, acc4) * (t3, t2, t1, t0) +// t0, t1 will be overwrited after this function call +TEXT gfpMulInternal(SB),NOSPLIT,$8 + CMPB ·supportADX(SB), $0 + JE noAdxMul + + // [t3, t2, t1, t0] * acc4 + MOVQ acc4, mul1 + MULXQ t0, acc0, acc1 + + MULXQ t1, mul0, acc2 + ADDQ mul0, acc1 + + MULXQ t2, mul0, acc3 + ADCQ mul0, acc2 + + MULXQ t3, mul0, acc4 + ADCQ mul0, acc3 + ADCQ $0, acc4 + + // [t3, t2, t1, t0] * acc5 + MOVQ acc5, mul1 + MULXQ t0, mul0, hlp + ADDQ mul0, acc1 + ADCQ hlp, acc2 + + MULXQ t1, mul0, hlp + ADCQ $0, hlp + ADDQ mul0, acc2 + ADCQ hlp, acc3 + + MULXQ t2, mul0, hlp + ADCQ $0, hlp + ADDQ mul0, acc3 + ADCQ hlp, acc4 + + MULXQ t3, mul0, acc5 + ADCQ $0, acc5 + ADDQ mul0, acc4 + ADCQ $0, acc5 + + // [t3, t2, t1, t0] * acc5 + MOVQ acc6, mul1 + MULXQ t0, mul0, hlp + ADDQ mul0, acc2 + ADCQ hlp, acc3 + + MULXQ t1, mul0, hlp + ADCQ $0, hlp + ADDQ mul0, acc3 + ADCQ hlp, acc4 + + MULXQ t2, mul0, hlp + ADCQ $0, hlp + ADDQ mul0, acc4 + ADCQ hlp, acc5 + + MULXQ t3, mul0, acc6 + ADCQ $0, acc6 + ADDQ mul0, acc5 + ADCQ $0, acc6 + + // [t3, t2, t1, t0] * acc7 + MOVQ acc7, mul1 + MULXQ t0, mul0, hlp + ADDQ mul0, acc3 + ADCQ hlp, acc4 + + MULXQ t1, mul0, hlp + ADCQ $0, hlp + ADDQ mul0, acc4 + ADCQ hlp, acc5 + + MULXQ t2, mul0, hlp + ADCQ $0, hlp + ADDQ mul0, acc5 + ADCQ hlp, acc6 + + MULXQ t3, mul0, acc7 + ADCQ $0, acc7 + ADDQ mul0, acc6 + ADCQ $0, acc7 + + // T = [acc7, acc6, acc5, acc4, acc3, acc2, acc1, acc0] + // First reduction step + XORQ t1, t1 + MOVQ acc0, mul1 + MULXQ ·np+0x00(SB), mul1, mul0 + + MULXQ ·p2+0x00(SB), mul0, t0 + ADOXQ mul0, acc0 // (carry1, acc0) = acc0 + t0 * ord0 + + MULXQ ·p2+0x08(SB), mul0, hlp + ADCXQ t0, mul0 + ADOXQ mul0, acc1 + + MULXQ ·p2+0x10(SB), mul0, t0 + ADCXQ hlp, mul0 + ADOXQ mul0, acc2 + + MULXQ ·p2+0x18(SB), mul0, acc0 + ADCXQ t0, mul0 + ADOXQ mul0, acc3 + ADCXQ t1, acc0 + ADOXQ t1, acc0 + + // Second reduction step + MOVQ acc1, mul1 + MULXQ ·np+0x00(SB), mul1, mul0 + + MULXQ ·p2+0x00(SB), mul0, t0 
+ ADOXQ mul0, acc1 + + MULXQ ·p2+0x08(SB), mul0, hlp + ADCXQ t0, mul0 + ADOXQ mul0, acc2 + + MULXQ ·p2+0x10(SB), mul0, t0 + ADCXQ hlp, mul0 + ADOXQ mul0, acc3 + + MULXQ ·p2+0x18(SB), mul0, acc1 + ADCXQ t0, mul0 + ADOXQ mul0, acc0 + ADCXQ t1, acc1 + ADOXQ t1, acc1 + + // Third reduction step + MOVQ acc2, mul1 + MULXQ ·np+0x00(SB), mul1, mul0 + + MULXQ ·p2+0x00(SB), mul0, t0 + ADOXQ mul0, acc2 + + MULXQ ·p2+0x08(SB), mul0, hlp + ADCXQ t0, mul0 + ADOXQ mul0, acc3 + + MULXQ ·p2+0x10(SB), mul0, t0 + ADCXQ hlp, mul0 + ADOXQ mul0, acc0 + + MULXQ ·p2+0x18(SB), mul0, acc2 + ADCXQ t0, mul0 + ADOXQ mul0, acc1 + ADCXQ t1, acc2 + ADOXQ t1, acc2 + + // Last reduction step + MOVQ acc3, mul1 + MULXQ ·np+0x00(SB), mul1, mul0 + + MULXQ ·p2+0x00(SB), mul0, t0 + ADOXQ mul0, acc3 + + MULXQ ·p2+0x08(SB), mul0, hlp + ADCXQ t0, mul0 + ADOXQ mul0, acc0 + + MULXQ ·p2+0x10(SB), mul0, t0 + ADCXQ hlp, mul0 + ADOXQ mul0, acc1 + + MULXQ ·p2+0x18(SB), mul0, acc3 + ADCXQ t0, mul0 + ADOXQ mul0, acc2 + ADCXQ t1, acc3 + ADOXQ t1, acc3 + + MOVQ $0, hlp + // Add bits [511:256] of the result + ADDQ acc0, acc4 + ADCQ acc1, acc5 + ADCQ acc2, acc6 + ADCQ acc3, acc7 + ADCQ $0, hlp + // Copy result + MOVQ acc4, acc0 + MOVQ acc5, acc1 + MOVQ acc6, acc2 + MOVQ acc7, acc3 + // Subtract p + SUBQ ·p2+0(SB), acc4 + SBBQ ·p2+8(SB), acc5 + SBBQ ·p2+16(SB), acc6 + SBBQ ·p2+24(SB), acc7 + SBBQ $0, hlp + // If the result of the subtraction is negative, restore the previous result + CMOVQCS acc0, acc4 + CMOVQCS acc1, acc5 + CMOVQCS acc2, acc6 + CMOVQCS acc3, acc7 + + RET + +noAdxMul: + // [t3, t2, t1, t0] * acc4 + MOVQ acc4, mul0 + MULQ t0 + MOVQ mul0, acc0 + MOVQ mul1, acc1 + + MOVQ acc4, mul0 + MULQ t1 + ADDQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, acc2 + + MOVQ acc4, mul0 + MULQ t2 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, acc3 + + MOVQ acc4, mul0 + MULQ t3 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, acc4 + + // [t3, t2, t1, t0] * acc5 + MOVQ acc5, mul0 + MULQ t0 + ADDQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc5, mul0 + MULQ t1 + ADDQ hlp, acc2 + ADCQ $0, mul1 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc5, mul0 + MULQ t2 + ADDQ hlp, acc3 + ADCQ $0, mul1 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc5, mul0 + MULQ t3 + ADDQ hlp, acc4 + ADCQ $0, mul1 + ADDQ mul0, acc4 + ADCQ $0, mul1 + MOVQ mul1, acc5 + + // [t3, t2, t1, t0] * acc6 + MOVQ acc6, mul0 + MULQ t0 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc6, mul0 + MULQ t1 + ADDQ hlp, acc3 + ADCQ $0, mul1 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc6, mul0 + MULQ t2 + ADDQ hlp, acc4 + ADCQ $0, mul1 + ADDQ mul0, acc4 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc6, mul0 + MULQ t3 + ADDQ hlp, acc5 + ADCQ $0, mul1 + ADDQ mul0, acc5 + ADCQ $0, mul1 + MOVQ mul1, acc6 + + // [t3, t2, t1, t0] * acc7 + MOVQ acc7, mul0 + MULQ t0 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc7, mul0 + MULQ t1 + ADDQ hlp, acc4 + ADCQ $0, mul1 + ADDQ mul0, acc4 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc7, mul0 + MULQ t2 + ADDQ hlp, acc5 + ADCQ $0, mul1 + ADDQ mul0, acc5 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc7, mul0 + MULQ t3 + ADDQ hlp, acc6 + ADCQ $0, mul1 + ADDQ mul0, acc6 + ADCQ $0, mul1 + MOVQ mul1, acc7 + // T = [acc7, acc6, acc5, acc4, acc3, acc2, acc1, acc0] + // First reduction step + MOVQ acc0, mul0 + MULQ ·np+0x00(SB) + MOVQ mul0, hlp + + MOVQ ·p2+0x00(SB), mul0 + MULQ hlp + ADDQ mul0, acc0 + ADCQ $0, mul1 + MOVQ mul1, t0 + XORQ acc0, acc0 + + MOVQ ·p2+0x08(SB), mul0 + MULQ hlp + ADDQ t0, acc1 
+ ADCQ $0, mul1 + ADDQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, t0 + + MOVQ ·p2+0x10(SB), mul0 + MULQ hlp + ADDQ t0, acc2 + ADCQ $0, mul1 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, t0 + + MOVQ ·p2+0x18(SB), mul0 + MULQ hlp + ADDQ t0, acc3 + ADCQ $0, mul1 + ADDQ mul0, acc3 + ADCQ mul1, acc0 + + // Second reduction step + MOVQ acc1, mul0 + MULQ ·np+0x00(SB) + MOVQ mul0, hlp + + MOVQ ·p2+0x00(SB), mul0 + MULQ hlp + ADDQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, t0 + XORQ acc1, acc1 + + MOVQ ·p2+0x08(SB), mul0 + MULQ hlp + ADDQ t0, acc2 + ADCQ $0, mul1 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, t0 + + MOVQ ·p2+0x10(SB), mul0 + MULQ hlp + ADDQ t0, acc3 + ADCQ $0, mul1 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, t0 + + MOVQ ·p2+0x18(SB), mul0 + MULQ hlp + ADDQ t0, acc0 + ADCQ $0, mul1 + ADDQ mul0, acc0 + ADCQ mul1, acc1 + + // Third reduction step + MOVQ acc2, mul0 + MULQ ·np+0x00(SB) + MOVQ mul0, hlp + + MOVQ ·p2+0x00(SB), mul0 + MULQ hlp + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, t0 + XORQ acc2, acc2 + + MOVQ ·p2+0x08(SB), mul0 + MULQ hlp + ADDQ t0, acc3 + ADCQ $0, mul1 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, t0 + + MOVQ ·p2+0x10(SB), mul0 + MULQ hlp + ADDQ t0, acc0 + ADCQ $0, mul1 + ADDQ mul0, acc0 + ADCQ $0, mul1 + MOVQ mul1, t0 + + MOVQ ·p2+0x18(SB), mul0 + MULQ hlp + ADDQ t0, acc1 + ADCQ $0, mul1 + ADDQ mul0, acc1 + ADCQ mul1, acc2 + + // Last reduction step + MOVQ acc3, mul0 + MULQ ·np+0x00(SB) + MOVQ mul0, hlp + + MOVQ ·p2+0x00(SB), mul0 + MULQ hlp + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, t0 + XORQ acc3, acc3 + + MOVQ ·p2+0x08(SB), mul0 + MULQ hlp + ADDQ t0, acc0 + ADCQ $0, mul1 + ADDQ mul0, acc0 + ADCQ $0, mul1 + MOVQ mul1, t0 + + MOVQ ·p2+0x10(SB), mul0 + MULQ hlp + ADDQ t0, acc1 + ADCQ $0, mul1 + ADDQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, t0 + + MOVQ ·p2+0x18(SB), mul0 + MULQ hlp + ADDQ t0, acc2 + ADCQ $0, mul1 + ADDQ mul0, acc2 + ADCQ mul1, acc3 + + MOVQ $0, hlp + // Add bits [511:256] of the result + ADDQ acc0, acc4 + ADCQ acc1, acc5 + ADCQ acc2, acc6 + ADCQ acc3, acc7 + ADCQ $0, hlp + // Copy result + MOVQ acc4, acc0 + MOVQ acc5, acc1 + MOVQ acc6, acc2 + MOVQ acc7, acc3 + // Subtract p + SUBQ ·p2+0(SB), acc4 + SBBQ ·p2+8(SB), acc5 + SBBQ ·p2+16(SB), acc6 + SBBQ ·p2+24(SB), acc7 + SBBQ $0, hlp + // If the result of the subtraction is negative, restore the previous result + CMOVQCS acc0, acc4 + CMOVQCS acc1, acc5 + CMOVQCS acc2, acc6 + CMOVQCS acc3, acc7 + + RET + +/* ---------------------------------------*/ +// (acc7, acc6, acc5, acc4) = (acc7, acc6, acc5, acc4) ^ 2 +TEXT gfpSqrInternal(SB),NOSPLIT,$8 + CMPB ·supportADX(SB), $0 + JE noAdxSqr + + XORQ t3, t3 + + // [acc7, acc6, acc5] * acc4 + MOVQ acc4, mul1 + MULXQ acc5, acc1, acc2 + + MULXQ acc6, mul0, acc3 + ADOXQ mul0, acc2 + + MULXQ acc7, mul0, t0 + ADOXQ mul0, acc3 + ADOXQ t3, t0 + + // [acc7, acc6] * acc5 + MOVQ acc5, mul1 + MULXQ acc6, mul0, hlp + ADOXQ mul0, acc3 + + MULXQ acc7, mul0, t1 + ADCXQ hlp, mul0 + ADOXQ mul0, t0 + ADCXQ t3, t1 + + // acc7 * acc6 + MOVQ acc6, mul1 + MULXQ acc7, mul0, t2 + ADOXQ mul0, t1 + ADOXQ t3, t2 + + // *2 + ADOXQ acc1, acc1 + ADOXQ acc2, acc2 + ADOXQ acc3, acc3 + ADOXQ t0, t0 + ADOXQ t1, t1 + ADOXQ t2, t2 + ADOXQ t3, t3 + + // Missing products + MOVQ acc4, mul1 + MULXQ mul1, acc0, acc4 + ADCXQ acc4, acc1 + + MOVQ acc5, mul1 + MULXQ mul1, mul0, acc4 + ADCXQ mul0, acc2 + ADCXQ acc4, acc3 + + MOVQ acc6, mul1 + MULXQ mul1, mul0, acc4 + ADCXQ mul0, t0 + ADCXQ acc4, t1 + + MOVQ acc7, mul1 + MULXQ mul1, mul0, acc4 + ADCXQ mul0, t2 + ADCXQ acc4, t3 + + // T = [t3, t2, t1, 
t0, acc3, acc2, acc1, acc0] + // First reduction step + XORQ acc5, acc5 + MOVQ acc0, mul1 + MULXQ ·np+0x00(SB), mul1, mul0 + + MULXQ ·p2+0x00(SB), mul0, acc4 + ADOXQ mul0, acc0 // (carry1, acc0) = acc0 + acc5 * ord0 + + MULXQ ·p2+0x08(SB), mul0, hlp + ADCXQ acc4, mul0 + ADOXQ mul0, acc1 + + MULXQ ·p2+0x10(SB), mul0, acc4 + ADCXQ hlp, mul0 + ADOXQ mul0, acc2 + + MULXQ ·p2+0x18(SB), mul0, acc0 + ADCXQ acc4, mul0 + ADOXQ mul0, acc3 + ADCXQ acc5, acc0 + ADOXQ acc5, acc0 + + // Second reduction step + MOVQ acc1, mul1 + MULXQ ·np+0x00(SB), mul1, mul0 + + MULXQ ·p2+0x00(SB), mul0, acc4 + ADOXQ mul0, acc1 + + MULXQ ·p2+0x08(SB), mul0, hlp + ADCXQ acc4, mul0 + ADOXQ mul0, acc2 + + MULXQ ·p2+0x10(SB), mul0, acc4 + ADCXQ hlp, mul0 + ADOXQ mul0, acc3 + + MULXQ ·p2+0x18(SB), mul0, acc1 + ADCXQ acc4, mul0 + ADOXQ mul0, acc0 + ADCXQ acc5, acc1 + ADOXQ acc5, acc1 + + // Third reduction step + MOVQ acc2, mul1 + MULXQ ·np+0x00(SB), mul1, mul0 + + MULXQ ·p2+0x00(SB), mul0, acc4 + ADOXQ mul0, acc2 + + MULXQ ·p2+0x08(SB), mul0, hlp + ADCXQ acc4, mul0 + ADOXQ mul0, acc3 + + MULXQ ·p2+0x10(SB), mul0, acc4 + ADCXQ hlp, mul0 + ADOXQ mul0, acc0 + + MULXQ ·p2+0x18(SB), mul0, acc2 + ADCXQ acc4, mul0 + ADOXQ mul0, acc1 + ADCXQ acc5, acc2 + ADOXQ acc5, acc2 + + // Last reduction step + MOVQ acc3, mul1 + MULXQ ·np+0x00(SB), mul1, mul0 + + MULXQ ·p2+0x00(SB), mul0, acc4 + ADOXQ mul0, acc3 + + MULXQ ·p2+0x08(SB), mul0, hlp + ADCXQ acc4, mul0 + ADOXQ mul0, acc0 + + MULXQ ·p2+0x10(SB), mul0, acc4 + ADCXQ hlp, mul0 + ADOXQ mul0, acc1 + + MULXQ ·p2+0x18(SB), mul0, acc3 + ADCXQ acc4, mul0 + ADOXQ mul0, acc2 + ADCXQ acc5, acc3 + ADOXQ acc5, acc3 + + MOVQ $0, hlp + // Add bits [511:256] of the result + ADDQ acc0, t0 + ADCQ acc1, t1 + ADCQ acc2, t2 + ADCQ acc3, t3 + ADCQ $0, hlp + // Copy result + MOVQ t0, acc4 + MOVQ t1, acc5 + MOVQ t2, acc6 + MOVQ t3, acc7 + // Subtract p + SUBQ ·p2+0(SB), acc4 + SBBQ ·p2+8(SB), acc5 + SBBQ ·p2+16(SB), acc6 + SBBQ ·p2+24(SB), acc7 + SBBQ $0, hlp + // If the result of the subtraction is negative, restore the previous result + CMOVQCS t0, acc4 + CMOVQCS t1, acc5 + CMOVQCS t2, acc6 + CMOVQCS t3, acc7 + + RET + +noAdxSqr: + MOVQ acc4, mul0 + MULQ acc5 + MOVQ mul0, acc1 + MOVQ mul1, acc2 + + MOVQ acc4, mul0 + MULQ acc6 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, acc3 + + MOVQ acc4, mul0 + MULQ acc7 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, t0 + + MOVQ acc5, mul0 + MULQ acc6 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc5, mul0 + MULQ acc7 + ADDQ hlp, t0 + ADCQ $0, mul1 + ADDQ mul0, t0 + ADCQ $0, mul1 + MOVQ mul1, t1 + + MOVQ acc6, mul0 + MULQ acc7 + ADDQ mul0, t1 + ADCQ $0, mul1 + MOVQ mul1, t2 + XORQ t3, t3 + // *2 + ADDQ acc1, acc1 + ADCQ acc2, acc2 + ADCQ acc3, acc3 + ADCQ t0, t0 + ADCQ t1, t1 + ADCQ t2, t2 + ADCQ $0, t3 + // Missing products + MOVQ acc4, mul0 + MULQ mul0 + MOVQ mul0, acc0 + MOVQ DX, acc4 + + MOVQ acc5, mul0 + MULQ mul0 + ADDQ acc4, acc1 + ADCQ mul0, acc2 + ADCQ $0, DX + MOVQ DX, acc4 + + MOVQ acc6, mul0 + MULQ mul0 + ADDQ acc4, acc3 + ADCQ mul0, t0 + ADCQ $0, DX + MOVQ DX, acc4 + + MOVQ acc7, mul0 + MULQ mul0 + ADDQ acc4, t1 + ADCQ mul0, t2 + ADCQ DX, t3 + // T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0] + // First reduction step + MOVQ acc0, mul0 + MULQ ·np+0x00(SB) + MOVQ mul0, hlp + + MOVQ ·p2+0x00(SB), mul0 + MULQ hlp + ADDQ mul0, acc0 + ADCQ $0, mul1 + MOVQ mul1, acc5 + XORQ acc0, acc0 + + MOVQ ·p2+0x08(SB), mul0 + MULQ hlp + ADDQ acc5, acc1 + ADCQ $0, mul1 + ADDQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, acc5 + + MOVQ ·p2+0x10(SB), mul0 + MULQ hlp + 
ADDQ acc5, acc2 + ADCQ $0, mul1 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, acc5 + + MOVQ ·p2+0x18(SB), mul0 + MULQ hlp + ADDQ acc5, acc3 + ADCQ $0, mul1 + ADDQ mul0, acc3 + ADCQ mul1, acc0 + + // Second reduction step + MOVQ acc1, mul0 + MULQ ·np+0x00(SB) + MOVQ mul0, hlp + + MOVQ ·p2+0x00(SB), mul0 + MULQ hlp + ADDQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, acc5 + XORQ acc1, acc1 + + MOVQ ·p2+0x08(SB), mul0 + MULQ hlp + ADDQ acc5, acc2 + ADCQ $0, mul1 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, acc5 + + MOVQ ·p2+0x10(SB), mul0 + MULQ hlp + ADDQ acc5, acc3 + ADCQ $0, mul1 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, acc5 + + MOVQ ·p2+0x18(SB), mul0 + MULQ hlp + ADDQ acc5, acc0 + ADCQ $0, mul1 + ADDQ mul0, acc0 + ADCQ mul1, acc1 + + // Third reduction step + MOVQ acc2, mul0 + MULQ ·np+0x00(SB) + MOVQ mul0, hlp + + MOVQ ·p2+0x00(SB), mul0 + MULQ hlp + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, acc5 + XORQ acc2, acc2 + + MOVQ ·p2+0x08(SB), mul0 + MULQ hlp + ADDQ acc5, acc3 + ADCQ $0, mul1 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, acc5 + + MOVQ ·p2+0x10(SB), mul0 + MULQ hlp + ADDQ acc5, acc0 + ADCQ $0, mul1 + ADDQ mul0, acc0 + ADCQ $0, mul1 + MOVQ mul1, acc5 + + MOVQ ·p2+0x18(SB), mul0 + MULQ hlp + ADDQ acc5, acc1 + ADCQ $0, mul1 + ADDQ mul0, acc1 + ADCQ mul1, acc2 + + // Last reduction step + MOVQ acc3, mul0 + MULQ ·np+0x00(SB) + MOVQ mul0, hlp + + MOVQ ·p2+0x00(SB), mul0 + MULQ hlp + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, acc5 + XORQ acc3, acc3 + + MOVQ ·p2+0x08(SB), mul0 + MULQ hlp + ADDQ acc5, acc0 + ADCQ $0, mul1 + ADDQ mul0, acc0 + ADCQ $0, mul1 + MOVQ mul1, acc5 + + MOVQ ·p2+0x10(SB), mul0 + MULQ hlp + ADDQ acc5, acc1 + ADCQ $0, mul1 + ADDQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, acc5 + + MOVQ ·p2+0x18(SB), mul0 + MULQ hlp + ADDQ acc5, acc2 + ADCQ $0, mul1 + ADDQ mul0, acc2 + ADCQ mul1, acc3 + + MOVQ $0, hlp + // Add bits [511:256] of the result + ADDQ acc0, t0 + ADCQ acc1, t1 + ADCQ acc2, t2 + ADCQ acc3, t3 + ADCQ $0, hlp + // Copy result + MOVQ t0, acc4 + MOVQ t1, acc5 + MOVQ t2, acc6 + MOVQ t3, acc7 + // Subtract p + SUBQ ·p2+0(SB), acc4 + SBBQ ·p2+8(SB), acc5 + SBBQ ·p2+16(SB), acc6 + SBBQ ·p2+24(SB), acc7 + SBBQ $0, hlp + // If the result of the subtraction is negative, restore the previous result + CMOVQCS t0, acc4 + CMOVQCS t1, acc5 + CMOVQCS t2, acc6 + CMOVQCS t3, acc7 + + RET + +/* ---------------------------------------*/ +// (t3, t2, t1, t0) = 2(acc7, acc6, acc5, acc4) +#define gfpMulBy2Inline \ + XORQ mul0, mul0;\ + ADDQ acc4, acc4;\ + ADCQ acc5, acc5;\ + ADCQ acc6, acc6;\ + ADCQ acc7, acc7;\ + ADCQ $0, mul0;\ + MOVQ acc4, t0;\ + MOVQ acc5, t1;\ + MOVQ acc6, t2;\ + MOVQ acc7, t3;\ + SUBQ ·p2+0(SB), t0;\ + SBBQ ·p2+8(SB), t1;\ + SBBQ ·p2+16(SB), t2;\ + SBBQ ·p2+24(SB), t3;\ + SBBQ $0, mul0;\ + CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1) + CMOVQCS acc5, t1;\ + CMOVQCS acc6, t2;\ + CMOVQCS acc7, t3; + +/* ---------------------------------------*/ +// (t3, t2, t1, t0) = (acc7, acc6, acc5, acc4) + (t3, t2, t1, t0) +#define gfpAddInline \ + XORQ mul0, mul0;\ + ADDQ t0, acc4;\ + ADCQ t1, acc5;\ + ADCQ t2, acc6;\ + ADCQ t3, acc7;\ + ADCQ $0, mul0;\ + MOVQ acc4, t0;\ + MOVQ acc5, t1;\ + MOVQ acc6, t2;\ + MOVQ acc7, t3;\ + SUBQ ·p2+0(SB), t0;\ + SBBQ ·p2+8(SB), t1;\ + SBBQ ·p2+16(SB), t2;\ + SBBQ ·p2+24(SB), t3;\ + SBBQ $0, mul0;\ + CMOVQCS acc4, t0;\ + CMOVQCS acc5, t1;\ + CMOVQCS acc6, t2;\ + CMOVQCS acc7, t3; + +/* ---------------------------------------*/ +#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ 
src(8*3), acc7 +#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3 +#define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3) +#define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3) +#define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3 +#define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7 + +/* ---------------------------------------*/ +#define axin(off) (32*0 + off)(SP) +#define ayin(off) (32*1 + off)(SP) +#define bxin(off) (32*2 + off)(SP) +#define byin(off) (32*3 + off)(SP) +#define tmp0(off) (32*4 + off)(SP) +#define tmp1(off) (32*5 + off)(SP) +#define cxout(off) (32*6 + off)(SP) +#define rptr (32*7)(SP) + +TEXT ·gfp2Mul(SB),NOSPLIT,$256-24 + // Move input to stack in order to free registers + MOVQ res+0(FP), CX + MOVQ in1+8(FP), AX + MOVQ in2+16(FP), BX + + MOVOU (16*0)(AX), X0 + MOVOU (16*1)(AX), X1 + MOVOU (16*2)(AX), X2 + MOVOU (16*3)(AX), X3 + + MOVOU X0, axin(16*0) + MOVOU X1, axin(16*1) + MOVOU X2, ayin(16*0) + MOVOU X3, ayin(16*1) + + MOVOU (16*0)(BX), X0 + MOVOU (16*1)(BX), X1 + MOVOU (16*2)(BX), X2 + MOVOU (16*3)(BX), X3 + + MOVOU X0, bxin(16*0) + MOVOU X1, bxin(16*1) + MOVOU X2, byin(16*0) + MOVOU X3, byin(16*1) + + // Store pointer to result + MOVQ CX, rptr + + LDacc (ayin) + LDt (byin) + CALL gfpMulInternal(SB) + ST (tmp0) + + LDacc (axin) + LDt (bxin) + CALL gfpMulInternal(SB) + ST (tmp1) + + LDacc (axin) + LDt (ayin) + gfpAddInline + STt (cxout) + + LDacc (bxin) + LDt (byin) + gfpAddInline + + LDacc (cxout) + CALL gfpMulInternal(SB) + LDt (tmp0) + CALL gfpSubInternal(SB) + LDt (tmp1) + CALL gfpSubInternal(SB) + + // Store x + MOVQ rptr, AX + MOVQ acc4, (16*0 + 8*0)(AX) + MOVQ acc5, (16*0 + 8*1)(AX) + MOVQ acc6, (16*0 + 8*2)(AX) + MOVQ acc7, (16*0 + 8*3)(AX) + + LDacc (tmp0) + //LDt (tmp1) + CALL gfpSubInternal(SB) + CALL gfpSubInternal(SB) + MOVQ rptr, AX + /////////////////////// + MOVQ $0, rptr + // Store y + MOVQ acc4, (16*2 + 8*0)(AX) + MOVQ acc5, (16*2 + 8*1)(AX) + MOVQ acc6, (16*2 + 8*2)(AX) + MOVQ acc7, (16*2 + 8*3)(AX) + + RET + +TEXT ·gfp2MulU(SB),NOSPLIT,$256-24 + // Move input to stack in order to free registers + MOVQ res+0(FP), CX + MOVQ in1+8(FP), AX + MOVQ in2+16(FP), BX + + MOVOU (16*0)(AX), X0 + MOVOU (16*1)(AX), X1 + MOVOU (16*2)(AX), X2 + MOVOU (16*3)(AX), X3 + + MOVOU X0, axin(16*0) + MOVOU X1, axin(16*1) + MOVOU X2, ayin(16*0) + MOVOU X3, ayin(16*1) + + MOVOU (16*0)(BX), X0 + MOVOU (16*1)(BX), X1 + MOVOU (16*2)(BX), X2 + MOVOU (16*3)(BX), X3 + + MOVOU X0, bxin(16*0) + MOVOU X1, bxin(16*1) + MOVOU X2, byin(16*0) + MOVOU X3, byin(16*1) + + // Store pointer to result + MOVQ CX, rptr + + LDacc (ayin) + LDt (byin) + CALL gfpMulInternal(SB) + ST (tmp0) + + LDacc (axin) + LDt (bxin) + CALL gfpMulInternal(SB) + ST (tmp1) + + LDacc (axin) + LDt (ayin) + gfpAddInline + STt (cxout) + + LDacc (bxin) + LDt (byin) + gfpAddInline + + LDacc (cxout) + CALL gfpMulInternal(SB) + LDt (tmp0) + CALL gfpSubInternal(SB) + LDt (tmp1) + CALL gfpSubInternal(SB) + gfpMulBy2Inline + XORQ acc4, acc4 + XORQ acc5, acc5 + XORQ acc6, acc6 + XORQ acc7, acc7 + CALL gfpSubInternal(SB) + + // Store y + MOVQ rptr, AX + MOVQ acc4, (16*2 + 8*0)(AX) + MOVQ acc5, (16*2 + 8*1)(AX) + MOVQ acc6, (16*2 + 8*2)(AX) + MOVQ acc7, (16*2 + 8*3)(AX) + + LDacc (tmp0) + LDt (tmp1) + CALL gfpSubInternal(SB) + CALL gfpSubInternal(SB) + MOVQ rptr, AX + /////////////////////// + MOVQ $0, rptr + // Store x + MOVQ acc4, (16*0 + 8*0)(AX) + MOVQ acc5, (16*0 
+
+#undef axin
+#undef ayin
+#undef bxin
+#undef byin
+#undef tmp0
+#undef tmp1
+#undef cxout
+#undef rptr
+
+#define axin(off) (32*0 + off)(SP)
+#define ayin(off) (32*1 + off)(SP)
+#define cxout(off) (32*2 + off)(SP)
+#define cyout(off) (32*3 + off)(SP)
+#define rptr (32*4)(SP)
+
+TEXT ·gfp2Square(SB),NOSPLIT,$160-16
+	// Move input to stack in order to free registers
+	MOVQ res+0(FP), AX
+	MOVQ in1+8(FP), BX
+
+	MOVOU (16*0)(BX), X0
+	MOVOU (16*1)(BX), X1
+	MOVOU (16*2)(BX), X2
+	MOVOU (16*3)(BX), X3
+
+	MOVOU X0, axin(16*0)
+	MOVOU X1, axin(16*1)
+	MOVOU X2, ayin(16*0)
+	MOVOU X3, ayin(16*1)
+
+	// Store pointer to result
+	MOVQ AX, rptr
+
+	LDacc (axin)
+	LDt (ayin)
+	gfpAddInline
+	STt (cyout)
+
+	LDacc (axin)
+	gfpMulBy2Inline
+	STt (cxout)
+
+	LDacc (ayin)
+	CALL gfpSubInternal(SB)
+	ST (cxout)
+
+	LDt (cyout)
+	CALL gfpMulInternal(SB)
+	ST (cyout)
+
+	LDacc (axin)
+	LDt (ayin)
+	CALL gfpMulInternal(SB)
+	ST (cxout)
+
+	LDt (cyout)
+	gfpAddInline
+	// Store y
+	MOVQ rptr, AX
+	MOVQ t0, (16*2 + 8*0)(AX)
+	MOVQ t1, (16*2 + 8*1)(AX)
+	MOVQ t2, (16*2 + 8*2)(AX)
+	MOVQ t3, (16*2 + 8*3)(AX)
+
+	LDacc (cxout)
+	gfpMulBy2Inline
+	// Store x
+	MOVQ rptr, AX
+	///////////////////////
+	MOVQ $0, rptr
+	MOVQ t0, (16*0 + 8*0)(AX)
+	MOVQ t1, (16*0 + 8*1)(AX)
+	MOVQ t2, (16*0 + 8*2)(AX)
+	MOVQ t3, (16*0 + 8*3)(AX)
+
+	RET
+
+TEXT ·gfp2SquareU(SB),NOSPLIT,$160-16
+	// Move input to stack in order to free registers
+	MOVQ res+0(FP), AX
+	MOVQ in1+8(FP), BX
+
+	MOVOU (16*0)(BX), X0
+	MOVOU (16*1)(BX), X1
+	MOVOU (16*2)(BX), X2
+	MOVOU (16*3)(BX), X3
+
+	MOVOU X0, axin(16*0)
+	MOVOU X1, axin(16*1)
+	MOVOU X2, ayin(16*0)
+	MOVOU X3, ayin(16*1)
+
+	// Store pointer to result
+	MOVQ AX, rptr
+
+	LDacc (axin)
+	LDt (ayin)
+	gfpAddInline
+	STt (cxout)
+
+	LDacc (axin)
+	gfpMulBy2Inline
+	STt (cyout)
+
+	LDacc (ayin)
+	CALL gfpSubInternal(SB)
+	ST (cyout)
+
+	LDt (cxout)
+	CALL gfpMulInternal(SB)
+	ST (cxout)
+
+	LDacc (axin)
+	LDt (ayin)
+	CALL gfpMulInternal(SB)
+	ST (cyout)
+
+	LDt (cxout)
+	gfpAddInline
+
+	// Store x
+	MOVQ rptr, AX
+	MOVQ t0, (16*0 + 8*0)(AX)
+	MOVQ t1, (16*0 + 8*1)(AX)
+	MOVQ t2, (16*0 + 8*2)(AX)
+	MOVQ t3, (16*0 + 8*3)(AX)
+
+	LDacc (cyout)
+	gfpMulBy2Inline
+	t2acc
+	gfpMulBy2Inline
+	XORQ acc4, acc4
+	XORQ acc5, acc5
+	XORQ acc6, acc6
+	XORQ acc7, acc7
+	CALL gfpSubInternal(SB)
+
+	// Store y
+	MOVQ rptr, AX
+	///////////////////////
+	MOVQ $0, rptr
+	MOVQ acc4, (16*2 + 8*0)(AX)
+	MOVQ acc5, (16*2 + 8*1)(AX)
+	MOVQ acc6, (16*2 + 8*2)(AX)
+	MOVQ acc7, (16*2 + 8*3)(AX)
+
+	RET
+
+#undef axin
+#undef ayin
+#undef cxout
+#undef cyout
+#undef rptr
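A quick way to sanity-check the four assembly routines against one another is a consistency test; this is a sketch, not part of the patch, and it assumes the fromBigInt/bigFromHex test helpers and gfP2.Equal behave as used elsewhere in this package (the hex constants are reused from BenchmarkGfP2Mul):

func TestGfp2SquareConsistency(t *testing.T) {
	x := &gfP2{
		*fromBigInt(bigFromHex("17509B092E845C1266BA0D262CBEE6ED0736A96FA347C8BD856DC76B84EBEB96")),
		*fromBigInt(bigFromHex("A7CF28D519BE3DA65F3170153D278FF247EFBA98A71A08116215BBA5C999A7C7")),
	}
	got, want := &gfP2{}, &gfP2{}
	gfp2Square(got, x) // squaring must agree with self-multiplication
	gfp2Mul(want, x, x)
	if got.Equal(want) != 1 {
		t.Errorf("gfp2Square disagrees with gfp2Mul")
	}
	gfp2SquareU(got, x) // same check for the mul-by-u variants
	gfp2MulU(want, x, x)
	if got.Equal(want) != 1 {
		t.Errorf("gfp2SquareU disagrees with gfp2MulU")
	}
}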
+
+/* ---------------------------------------*/
+#define x(off) (32*0 + off)(SP)
+#define y(off) (32*1 + off)(SP)
+#define z(off) (32*2 + off)(SP)
+
+#define a(off) (32*3 + off)(SP)
+#define b(off) (32*4 + off)(SP)
+#define c(off) (32*5 + off)(SP)
+#define rptr (32*6)(SP)
+
+// func curvePointDouble(c, a *curvePoint)
+TEXT ·curvePointDouble(SB),NOSPLIT,$224-16
+	// https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
+	// Move input to stack in order to free registers
+	MOVQ res+0(FP), AX
+	MOVQ in+8(FP), BX
+
+	MOVOU (16*0)(BX), X0
+	MOVOU (16*1)(BX), X1
+	MOVOU (16*2)(BX), X2
+	MOVOU (16*3)(BX), X3
+	MOVOU (16*4)(BX), X4
+	MOVOU (16*5)(BX), X5
+
+	MOVOU X0, x(16*0)
+	MOVOU X1, x(16*1)
+	MOVOU X2, y(16*0)
+	MOVOU X3, y(16*1)
+	MOVOU X4, z(16*0)
+	MOVOU X5, z(16*1)
+
+	// Store pointer to result
+	MOVQ AX, rptr
+
+	LDacc (y)
+	LDt (z)
+	CALL gfpMulInternal(SB)
+	gfpMulBy2Inline           // Z3 = 2*Y1*Z1
+	MOVQ rptr, AX
+	// Store Z
+	MOVQ t0, (16*4 + 8*0)(AX)
+	MOVQ t1, (16*4 + 8*1)(AX)
+	MOVQ t2, (16*4 + 8*2)(AX)
+	MOVQ t3, (16*4 + 8*3)(AX)
+
+	LDacc (x)
+	CALL gfpSqrInternal(SB)   // A = X1^2
+	ST (a)
+
+	LDacc (y)
+	CALL gfpSqrInternal(SB)   // B = Y1^2
+	ST (b)
+	CALL gfpSqrInternal(SB)   // C = B^2
+	ST (c)
+
+	LDacc (x)
+	LDt (b)
+	gfpAddInline              // X1+B
+	t2acc
+	CALL gfpSqrInternal(SB)   // (X1+B)^2
+	LDt (a)
+	CALL gfpSubInternal(SB)
+	LDt (c)
+	CALL gfpSubInternal(SB)
+	gfpMulBy2Inline           // B = D = 2*((X1+B)^2-A-C)
+	STt (b)                   // Store D
+
+	LDacc (a)
+	gfpMulBy2Inline
+	LDacc (a)
+	gfpAddInline              // A = E = 3*A
+	STt (a)                   // Store E
+	t2acc
+	CALL gfpSqrInternal(SB)   // F = E^2
+
+	LDt (b)                   // Load D
+	CALL gfpSubInternal(SB)
+	LDt (b)                   // Load D
+	CALL gfpSubInternal(SB)   // X3 = F-2*D
+
+	ST (x)
+
+	MOVQ rptr, AX
+	// Store x
+	MOVQ acc4, (16*0 + 8*0)(AX)
+	MOVQ acc5, (16*0 + 8*1)(AX)
+	MOVQ acc6, (16*0 + 8*2)(AX)
+	MOVQ acc7, (16*0 + 8*3)(AX)
+
+	LDacc (c)
+	gfpMulBy2Inline
+	t2acc
+	gfpMulBy2Inline
+	t2acc
+	gfpMulBy2Inline           // 8*C
+	STt (c)
+
+	LDacc (b)                 // Load D
+	LDt (x)
+	CALL gfpSubInternal(SB)   // (D-X3)
+	LDt (a)                   // Load E
+	CALL gfpMulInternal(SB)   // E*(D-X3)
+	LDt (c)
+	CALL gfpSubInternal(SB)   // Y3 = E*(D-X3)-8*C
+
+	MOVQ rptr, AX
+	///////////////////////
+	MOVQ $0, rptr
+	// Store y
+	MOVQ acc4, (16*2 + 8*0)(AX)
+	MOVQ acc5, (16*2 + 8*1)(AX)
+	MOVQ acc6, (16*2 + 8*2)(AX)
+	MOVQ acc7, (16*2 + 8*3)(AX)
+
+	RET
+
+#undef x
+#undef y
+#undef z
+#undef a
+#undef b
+#undef c
+#undef rptr
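For reference, the full dbl-2009-l relation set the routine above walks through (same source as the URL in the header comment), in the document's own A..F naming:

    A = X1^2, B = Y1^2, C = B^2
    D = 2*((X1+B)^2 - A - C)
    E = 3*A, F = E^2
    X3 = F - 2*D
    Y3 = E*(D - X3) - 8*C
    Z3 = 2*Y1*Z1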
+
+// gfpIsZero returns 1 in AX if [acc4..acc7] represents zero and zero
+// otherwise. It writes to [acc4..acc7], t0 and t1.
+TEXT gfpIsZero(SB),NOSPLIT,$0
+	// AX contains a flag that is set if the input is zero.
+	XORQ AX, AX
+	MOVQ $1, t1
+
+	// Check whether [acc4..acc7] are all zero.
+	MOVQ acc4, t0
+	ORQ acc5, t0
+	ORQ acc6, t0
+	ORQ acc7, t0
+
+	// Set the zero flag if so. (CMOV of a constant to a register doesn't
+	// appear to be supported in Go. Thus t1 = 1.)
+	CMOVQEQ t1, AX
+
+	// XOR [acc4..acc7] with P and compare with zero again.
+	XORQ ·p2+0(SB), acc4
+	XORQ ·p2+8(SB), acc5
+	XORQ ·p2+16(SB), acc6
+	XORQ ·p2+24(SB), acc7
+	ORQ acc5, acc4
+	ORQ acc6, acc4
+	ORQ acc7, acc4
+
+	// Set the zero flag if so.
+	CMOVQEQ t1, AX
+	RET
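The same branch-free test, sketched in Go for clarity (an illustration, not part of the patch): a limb vector can represent zero either as the canonical 0 or as the modulus p itself (·p2 in the assembly), so both encodings must be accepted without data-dependent branches.

func isZeroOrP(v, p [4]uint64) int {
	var orV, orVP uint64
	for i := 0; i < 4; i++ {
		orV |= v[i]         // non-zero iff v != 0
		orVP |= v[i] ^ p[i] // non-zero iff v != p
	}
	isZero := 1 ^ int((orV|-orV)>>63)  // 1 iff v == 0
	isP := 1 ^ int((orVP|-orVP)>>63)   // 1 iff v == p
	return isZero | isP
}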
+
+/* ---------------------------------------*/
+#define x1in(off) (32*0 + off)(SP)
+#define y1in(off) (32*1 + off)(SP)
+#define z1in(off) (32*2 + off)(SP)
+#define x2in(off) (32*3 + off)(SP)
+#define y2in(off) (32*4 + off)(SP)
+#define z2in(off) (32*5 + off)(SP)
+
+#define xout(off) (32*6 + off)(SP)
+#define yout(off) (32*7 + off)(SP)
+#define zout(off) (32*8 + off)(SP)
+
+#define u1(off) (32*9 + off)(SP)
+#define u2(off) (32*10 + off)(SP)
+#define s1(off) (32*11 + off)(SP)
+#define s2(off) (32*12 + off)(SP)
+#define z1sqr(off) (32*13 + off)(SP)
+#define z2sqr(off) (32*14 + off)(SP)
+#define h(off) (32*15 + off)(SP)
+#define r(off) (32*16 + off)(SP)
+#define hsqr(off) (32*17 + off)(SP)
+#define rsqr(off) (32*18 + off)(SP)
+#define hcub(off) (32*19 + off)(SP)
+#define rptr (32*20)(SP)
+#define points_eq (32*20+8)(SP)
+
+#define curvePointAddInline \
+	\// Begin point add
+	LDacc (z2in)              \
+	CALL gfpSqrInternal(SB)   \// z2ˆ2
+	ST (z2sqr)                \
+	LDt (z2in)                \
+	CALL gfpMulInternal(SB)   \// z2ˆ3
+	LDt (y1in)                \
+	CALL gfpMulInternal(SB)   \// s1 = z2ˆ3*y1
+	ST (s1)                   \
+	                          \
+	LDacc (z1in)              \
+	CALL gfpSqrInternal(SB)   \// z1ˆ2
+	ST (z1sqr)                \
+	LDt (z1in)                \
+	CALL gfpMulInternal(SB)   \// z1ˆ3
+	LDt (y2in)                \
+	CALL gfpMulInternal(SB)   \// s2 = z1ˆ3*y2
+	ST (s2)                   \
+	                          \
+	LDt (s1)                  \
+	CALL gfpSubInternal(SB)   \// r = s2 - s1
+	ST (r)                    \
+	CALL gfpIsZero(SB)        \
+	MOVQ AX, points_eq        \
+	                          \
+	LDacc (z2sqr)             \
+	LDt (x1in)                \
+	CALL gfpMulInternal(SB)   \// u1 = x1 * z2ˆ2
+	ST (u1)                   \
+	LDacc (z1sqr)             \
+	LDt (x2in)                \
+	CALL gfpMulInternal(SB)   \// u2 = x2 * z1ˆ2
+	ST (u2)                   \
+	                          \
+	LDt (u1)                  \
+	CALL gfpSubInternal(SB)   \// h = u2 - u1
+	ST (h)                    \
+	CALL gfpIsZero(SB)        \
+	ANDQ points_eq, AX        \
+	MOVQ AX, points_eq        \
+	                          \
+	LDacc (r)                 \
+	CALL gfpSqrInternal(SB)   \// rsqr = rˆ2
+	ST (rsqr)                 \
+	                          \
+	LDacc (h)                 \
+	CALL gfpSqrInternal(SB)   \// hsqr = hˆ2
+	ST (hsqr)                 \
+	                          \
+	LDt (h)                   \
+	CALL gfpMulInternal(SB)   \// hcub = hˆ3
+	ST (hcub)                 \
+	                          \
+	LDt (s1)                  \
+	CALL gfpMulInternal(SB)   \
+	ST (s2)                   \
+	                          \
+	LDacc (z1in)              \
+	LDt (z2in)                \
+	CALL gfpMulInternal(SB)   \// z1 * z2
+	LDt (h)                   \
+	CALL gfpMulInternal(SB)   \// z1 * z2 * h
+	ST (zout)                 \
+	                          \
+	LDacc (hsqr)              \
+	LDt (u1)                  \
+	CALL gfpMulInternal(SB)   \// hˆ2 * u1
+	ST (u2)                   \
+	                          \
+	gfpMulBy2Inline           \// u1 * hˆ2 * 2, inline
+	LDacc (rsqr)              \
+	CALL gfpSubInternal(SB)   \// rˆ2 - u1 * hˆ2 * 2
+	                          \
+	LDt (hcub)                \
+	CALL gfpSubInternal(SB)   \
+	ST (xout)                 \
+	                          \
+	MOVQ acc4, t0             \
+	MOVQ acc5, t1             \
+	MOVQ acc6, t2             \
+	MOVQ acc7, t3             \
+	LDacc (u2)                \
+	CALL gfpSubInternal(SB)   \
+	                          \
+	LDt (r)                   \
+	CALL gfpMulInternal(SB)   \
+	                          \
+	LDt (s2)                  \
+	CALL gfpSubInternal(SB)   \
+	ST (yout)                 \
+
+// func curvePointAdd(c, a, b *curvePoint) int
+TEXT ·curvePointAdd(SB),0,$680-32
+	// Move input to stack in order to free registers
+	MOVQ res+0(FP), AX
+	MOVQ in1+8(FP), BX
+	MOVQ in2+16(FP), CX
+
+	MOVOU (16*0)(BX), X0
+	MOVOU (16*1)(BX), X1
+	MOVOU (16*2)(BX), X2
+	MOVOU (16*3)(BX), X3
+	MOVOU (16*4)(BX), X4
+	MOVOU (16*5)(BX), X5
+
+	MOVOU X0, x1in(16*0)
+	MOVOU X1, x1in(16*1)
+	MOVOU X2, y1in(16*0)
+	MOVOU X3, y1in(16*1)
+	MOVOU X4, z1in(16*0)
+	MOVOU X5, z1in(16*1)
+
+	MOVOU (16*0)(CX), X0
+	MOVOU (16*1)(CX), X1
+	MOVOU (16*2)(CX), X2
+	MOVOU (16*3)(CX), X3
+	MOVOU (16*4)(CX), X4
+	MOVOU (16*5)(CX), X5
+
+	MOVOU X0, x2in(16*0)
+	MOVOU X1, x2in(16*1)
+	MOVOU X2, y2in(16*0)
+	MOVOU X3, y2in(16*1)
+	MOVOU X4, z2in(16*0)
+	MOVOU X5, z2in(16*1)
+	// Store pointer to result
+	MOVQ AX, rptr
+
+	curvePointAddInline
+
+	MOVOU xout(16*0), X0
+	MOVOU xout(16*1), X1
+	MOVOU yout(16*0), X2
+	MOVOU yout(16*1), X3
+	MOVOU zout(16*0), X4
+	MOVOU zout(16*1), X5
+	// Finally output the result
+	MOVQ rptr, AX
+	MOVQ $0, rptr
+	MOVOU X0, (16*0)(AX)
+	MOVOU X1, (16*1)(AX)
+	MOVOU X2, (16*2)(AX)
+	MOVOU X3, (16*3)(AX)
+	MOVOU X4, (16*4)(AX)
+	MOVOU X5, (16*5)(AX)
+
+	MOVQ points_eq, AX
+	MOVQ AX, ret+24(FP)
+
+	RET
diff --git a/sm9/bn256/gfp2_g1_decl.go b/sm9/bn256/gfp2_g1_decl.go
new file mode 100644
index 0000000..ea2614c
--- /dev/null
+++ b/sm9/bn256/gfp2_g1_decl.go
@@ -0,0 +1,36 @@
+//go:build amd64 && !purego
+// +build amd64,!purego
+
+package bn256
+
+// gfP2 multiplication.
+//
+//go:noescape
+func gfp2Mul(c, a, b *gfP2)
+
+// gfP2 multiplication. c = a*b*u
+//
+//go:noescape
+func gfp2MulU(c, a, b *gfP2)
+
+// gfP2 square.
+//
+//go:noescape
+func gfp2Square(c, a *gfP2)
+
+// gfP2 square and mult u.
+//
+//go:noescape
+func gfp2SquareU(c, a *gfP2)
+
+// Point doubling. Sets c = 2*a. a can be the point at infinity.
+//
+//go:noescape
+func curvePointDouble(c, a *curvePoint)
+
+// Point addition. Sets c = a + b. Returns one if the two input points
+// were equal and zero otherwise. If a or b is the point at infinity, c
+// and the return value are undefined.
+//
+//go:noescape
+func curvePointAdd(c, a, b *curvePoint) int
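The return-value contract above means callers must dispatch to doubling themselves. A hypothetical wrapper sketching that pattern (curvePointAddDispatch is illustrative only and not part of this patch; it assumes curvePoint offers the same IsInfinity/Set helpers that twistPoint has):

func curvePointAddDispatch(c, a, b *curvePoint) {
	if a.IsInfinity() { // curvePointAdd is undefined at infinity
		c.Set(b)
		return
	}
	if b.IsInfinity() {
		c.Set(a)
		return
	}
	t := &curvePoint{}
	if curvePointAdd(t, a, b) == 1 {
		curvePointDouble(t, a) // inputs were equal: fall back to doubling
	}
	c.Set(t)
}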
diff --git a/sm9/bn256/gfp2_g1_generic.go b/sm9/bn256/gfp2_g1_generic.go
new file mode 100644
index 0000000..db85ce9
--- /dev/null
+++ b/sm9/bn256/gfp2_g1_generic.go
@@ -0,0 +1,193 @@
+//go:build (!amd64) || purego
+// +build !amd64 purego
+
+package bn256
+
+func gfp2Mul(c, a, b *gfP2) {
+	tmp := &gfP2{}
+	tx := &tmp.x
+	ty := &tmp.y
+	v0, v1 := &gfP{}, &gfP{}
+
+	gfpMul(v0, &a.y, &b.y)
+	gfpMul(v1, &a.x, &b.x)
+
+	gfpAdd(tx, &a.x, &a.y)
+	gfpAdd(ty, &b.x, &b.y)
+	gfpMul(tx, tx, ty)
+	gfpSub(tx, tx, v0)
+	gfpSub(tx, tx, v1)
+
+	gfpSub(ty, v0, v1)
+	gfpSub(ty, ty, v1)
+
+	gfp2Copy(c, tmp)
+}
+
+func gfp2MulU(c, a, b *gfP2) {
+	tmp := &gfP2{}
+	tx := &tmp.x
+	ty := &tmp.y
+	v0, v1 := &gfP{}, &gfP{}
+
+	gfpMul(v0, &a.y, &b.y)
+	gfpMul(v1, &a.x, &b.x)
+
+	gfpAdd(tx, &a.x, &a.y)
+	gfpAdd(ty, &b.x, &b.y)
+
+	gfpMul(ty, tx, ty)
+	gfpSub(ty, ty, v0)
+	gfpSub(ty, ty, v1)
+	gfpDouble(ty, ty)
+	gfpNeg(ty, ty)
+
+	gfpSub(tx, v0, v1)
+	gfpSub(tx, tx, v1)
+
+	gfp2Copy(c, tmp)
+}
+
+func gfp2Square(c, a *gfP2) {
+	tmp := &gfP2{}
+	tx := &tmp.x
+	ty := &tmp.y
+
+	gfpAdd(ty, &a.x, &a.y)
+	gfpDouble(tx, &a.x)
+	gfpSub(tx, &a.y, tx)
+	gfpMul(ty, tx, ty)
+	gfpMul(tx, &a.x, &a.y)
+	gfpAdd(ty, tx, ty)
+	gfpDouble(tx, tx)
+
+	gfp2Copy(c, tmp)
+}
+
+func gfp2SquareU(c, a *gfP2) {
+	tmp := &gfP2{}
+	tx := &tmp.x
+	ty := &tmp.y
+
+	gfpAdd(tx, &a.x, &a.y)
+	gfpDouble(ty, &a.x)
+	gfpSub(ty, &a.y, ty)
+	gfpMul(tx, tx, ty)
+	gfpMul(ty, &a.x, &a.y)
+	gfpAdd(tx, tx, ty)
+	gfpDouble(ty, ty)
+	gfpDouble(ty, ty)
+	gfpNeg(ty, ty)
+
+	gfp2Copy(c, tmp)
+}
+
+func curvePointDouble(c, a *curvePoint) {
+	// See http://hyperelliptic.org/EFD/g1p/auto-code/shortw/jacobian-0/doubling/dbl-2009-l.op3
+	A, B, C := &gfP{}, &gfP{}, &gfP{}
+	gfpSqr(A, &a.x, 1)
+	gfpSqr(B, &a.y, 1)
+	gfpSqr(C, B, 1)
+
+	t := &gfP{}
+	gfpAdd(B, &a.x, B)
+	gfpSqr(t, B, 1)
+	gfpSub(B, t, A)
+	gfpSub(t, B, C)
+
+	d, e := &gfP{}, &gfP{}
+	gfpDouble(d, t)
+	gfpDouble(B, A)
+	gfpAdd(e, B, A)
+	gfpSqr(A, e, 1)
+
+	gfpDouble(B, d)
+	gfpSub(&c.x, A, B)
+
+	gfpMul(&c.z, &a.y, &a.z)
+	gfpDouble(&c.z, &c.z)
+
+	gfpDouble(B, C)
+	gfpDouble(t, B)
+	gfpDouble(B, t)
+	gfpSub(&c.y, d, &c.x)
+	gfpMul(t, e, &c.y)
+	gfpSub(&c.y, t, B)
+}
+
+func curvePointAdd(c, a, b *curvePoint) int {
+	// See http://hyperelliptic.org/EFD/g1p/auto-code/shortw/jacobian-0/addition/add-2007-bl.op3
+	var pointEq int
+	// Normalize the points by replacing a = [x1:y1:z1] and b = [x2:y2:z2]
+	// by [u1:s1:z1·z2] and [u2:s2:z1·z2]
+	// where u1 = x1·z2², s1 = y1·z2³ and u2 = x2·z1², s2 = y2·z1³
+	z12, z22 := &gfP{}, &gfP{}
+	gfpSqr(z12, &a.z, 1)
+	gfpSqr(z22, &b.z, 1)
+
+	u1, u2 := &gfP{}, &gfP{}
+	gfpMul(u1, &a.x, z22)
+	gfpMul(u2, &b.x, z12)
+
+	t, s1 := &gfP{}, &gfP{}
+	gfpMul(t, &b.z, z22)
+	gfpMul(s1, &a.y, t)
+
+	s2 := &gfP{}
+	gfpMul(t, &a.z, z12)
+	gfpMul(s2, &b.y, t)
+
+	// Compute x = (2h)²(s²-u1-u2)
+	// where s = (s2-s1)/(u2-u1) is the slope of the line through
+	// (u1,s1) and (u2,s2). The extra factor 2h = 2(u2-u1) comes from the value of z below.
+	// This is also:
+	// 4(s2-s1)² - 4h²(u1+u2) = 4(s2-s1)² - 4h³ - 4h²(2u1)
+	//                        = r² - j - 2v
+	// with the notations below.
+	h := &gfP{}
+	gfpSub(h, u2, u1)
+
+	gfpDouble(t, h)
+	// i = 4h²
+	i := &gfP{}
+	gfpSqr(i, t, 1)
+	// j = 4h³
+	j := &gfP{}
+	gfpMul(j, h, i)
+
+	gfpSub(t, s2, s1)
+
+	pointEq = h.Equal(zero) & t.Equal(zero)
+
+	r := &gfP{}
+	gfpDouble(r, t)
+
+	v := &gfP{}
+	gfpMul(v, u1, i)
+
+	// t4 = 4(s2-s1)²
+	t4, t6 := &gfP{}, &gfP{}
+	gfpSqr(t4, r, 1)
+	gfpDouble(t, v)
+	gfpSub(t6, t4, j)
+
+	gfpSub(&c.x, t6, t)
+
+	// Set y = -(2h)³(s1 + s*(x/4h²-u1))
+	// This is also
+	// y = - 2·s1·j - (s2-s1)(2x - 2i·u1) = r(v-x) - 2·s1·j
+	gfpSub(t, v, &c.x) // t7
+	gfpMul(t4, s1, j)  // t8
+	gfpDouble(t6, t4)  // t9
+	gfpMul(t4, r, t)   // t10
+	gfpSub(&c.y, t4, t6)
+
+	// Set z = 2(u2-u1)·z1·z2 = 2h·z1·z2
+	gfpAdd(t, &a.z, &b.z) // t11
+	gfpSqr(t4, t, 1)      // t12
+	gfpSub(t, t4, z12)    // t13
+	gfpSub(t4, t, z22)    // t14
+	gfpMul(&c.z, t4, h)
+
+	return pointEq
+}
diff --git a/sm9/bn256/gfp2_test.go b/sm9/bn256/gfp2_test.go
index 10ad6a6..4af4fd1 100644
--- a/sm9/bn256/gfp2_test.go
+++ b/sm9/bn256/gfp2_test.go
@@ -135,10 +135,10 @@ func BenchmarkGfP2Mul(b *testing.B) {
 		*fromBigInt(bigFromHex("17509B092E845C1266BA0D262CBEE6ED0736A96FA347C8BD856DC76B84EBEB96")),
 		*fromBigInt(bigFromHex("A7CF28D519BE3DA65F3170153D278FF247EFBA98A71A08116215BBA5C999A7C7")),
 	}
+	t := &gfP2{}
 	b.ReportAllocs()
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		t := &gfP2{}
 		t.Mul(x, y)
 	}
 }
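The gfP4/gfP6 hunks that follow drop the NC ("no copy") spellings at gfP2-level call sites. A sketch of why that is now safe (hypothetical method body, assuming gfP2.Mul dispatches to the gfp2Mul entry point added above): the assembly buffers its inputs on the stack and the generic code computes into a tmp before gfp2Copy, so the destination may alias either operand.

func (e *gfP2) Mul(a, b *gfP2) *gfP2 {
	gfp2Mul(e, a, b) // aliasing-safe: a and b are fully read before e is written
	return e
}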
diff --git a/sm9/bn256/gfp4.go b/sm9/bn256/gfp4.go
index eb9bc0d..f2c7bb5 100644
--- a/sm9/bn256/gfp4.go
+++ b/sm9/bn256/gfp4.go
@@ -121,8 +121,8 @@ func (e *gfP4) MulNC(a, b *gfP4) *gfP4 {
 	tx := &e.x
 	ty := &e.y
 	v0, v1 := &gfP2{}, &gfP2{}
-	v0.MulNC(&a.y, &b.y)
-	v1.MulNC(&a.x, &b.x)
+	v0.Mul(&a.y, &b.y)
+	v1.Mul(&a.x, &b.x)
 	tx.Add(&a.x, &a.y)
 	ty.Add(&b.x, &b.y)
@@ -148,8 +148,8 @@ func (e *gfP4) MulNC2(a *gfP4, x, y *gfP2) *gfP4 {
 	tx := &e.x
 	ty := &e.y
 	v0, v1 := &gfP2{}, &gfP2{}
-	v0.MulNC(&a.y, y)
-	v1.MulNC(&a.x, x)
+	v0.Mul(&a.y, y)
+	v1.Mul(&a.x, x)
 	tx.Add(&a.x, &a.y)
 	ty.Add(x, y)
@@ -181,8 +181,8 @@ func (e *gfP4) MulVNC(a, b *gfP4) *gfP4 {
 	tx := &e.x
 	ty := &e.y
 	v0, v1 := &gfP2{}, &gfP2{}
-	v0.MulNC(&a.y, &b.y)
-	v1.MulNC(&a.x, &b.x)
+	v0.Mul(&a.y, &b.y)
+	v1.Mul(&a.x, &b.x)
 	tx.Add(&a.x, &a.y)
 	ty.Add(&b.x, &b.y)
@@ -227,11 +227,11 @@ func (e *gfP4) SquareNC(a *gfP4) *gfP4 {
 	tx := &e.x
 	ty := &e.y
 
-	tx.SquareUNC(&a.x)
-	ty.SquareNC(&a.y)
+	tx.SquareU(&a.x)
+	ty.Square(&a.y)
 	ty.Add(tx, ty)
 
-	tx.MulNC(&a.x, &a.y)
+	tx.Mul(&a.x, &a.y)
 	tx.Add(tx, tx)
 
 	return e
@@ -250,8 +250,8 @@ func (e *gfP4) SquareVNC(a *gfP4) *gfP4 {
 	tx := &e.x
 	ty := &e.y
-	tx.SquareUNC(&a.x)
-	ty.SquareNC(&a.y)
+	tx.SquareU(&a.x)
+	ty.Square(&a.y)
 	tx.Add(tx, ty)
 
 	ty.MulU(&a.x, &a.y)
@@ -269,15 +269,15 @@ func (e *gfP4) Invert(a *gfP4) *gfP4 {
 
 	t3 := &gfP2{}
 
-	t3.SquareUNC(&a.x)
-	t1.SquareNC(&a.y)
+	t3.SquareU(&a.x)
+	t1.Square(&a.y)
 	t3.Sub(t3, t1)
 	t3.Invert(t3)
 
 	t1.Mul(&a.y, t3)
 	t1.Neg(t1)
 
-	t2.MulNC(&a.x, t3)
+	t2.Mul(&a.x, t3)
 
 	gfp4Copy(e, tmp)
 	return e
diff --git a/sm9/bn256/gfp6.go b/sm9/bn256/gfp6.go
index b6cb982..c767c18 100644
--- a/sm9/bn256/gfp6.go
+++ b/sm9/bn256/gfp6.go
@@ -119,9 +119,9 @@ func (e *gfP6) MulNC(a, b *gfP6) *gfP6 {
 	ty := &e.y
 	tz := &e.z
 	t, v0, v1, v2 := &gfP2{}, &gfP2{}, &gfP2{}, &gfP2{}
-	v0.MulNC(&a.z, &b.z)
-	v1.MulNC(&a.y, &b.y)
-	v2.MulNC(&a.x, &b.x)
+	v0.Mul(&a.z, &b.z)
+	v1.Mul(&a.y, &b.y)
+	v2.Mul(&a.x, &b.x)
 	t.Add(&a.y, &a.x)
 	tz.Add(&b.y, &b.x)
@@ -185,26 +185,26 @@ func (e *gfP6) SquareNC(a *gfP6) *gfP6 {
 	tz := &e.z
 	t, v0, v1, v2 := &gfP2{}, &gfP2{}, &gfP2{}, &gfP2{}
 
-	v0.SquareNC(&a.z)
-	v1.SquareNC(&a.y)
-	v2.SquareNC(&a.x)
+	v0.Square(&a.z)
+	v1.Square(&a.y)
+	v2.Square(&a.x)
 
 	t.Add(&a.y, &a.x)
-	tz.SquareNC(t)
+	tz.Square(t)
 	tz.Sub(tz, v1)
 	tz.Sub(tz, v2)
 	tz.MulU1(tz)
 	tz.Add(tz, v0)
 
 	t.Add(&a.z, &a.y)
-	ty.SquareNC(t)
+	ty.Square(t)
 	ty.Sub(ty, v0)
 	ty.Sub(ty, v1)
 	t.MulU1(v2)
 	ty.Add(ty, t)
 
 	t.Add(&a.z, &a.x)
-	tx.SquareNC(t)
+	tx.Square(t)
 	tx.Sub(tx, v0)
 	tx.Add(tx, v1)
 	tx.Sub(tx, v2)
@@ -233,19 +233,19 @@ func (e *gfP6) Invert(a *gfP6) *gfP6 {
 	// See "Implementing cryptographic pairings", M. Scott, section 3.2.
 	// ftp://136.206.11.249/pub/crypto/pairings.pdf
 
-	t1 := (&gfP2{}).MulUNC(&a.x, &a.y)
-	A := (&gfP2{}).SquareNC(&a.z)
+	t1 := (&gfP2{}).MulU(&a.x, &a.y)
+	A := (&gfP2{}).Square(&a.z)
 	A.Sub(A, t1)
 
-	B := (&gfP2{}).SquareUNC(&a.x)
+	B := (&gfP2{}).SquareU(&a.x)
 	t1.Mul(&a.y, &a.z)
 	B.Sub(B, t1)
 
-	C := (&gfP2{}).SquareNC(&a.y)
+	C := (&gfP2{}).Square(&a.y)
 	t1.Mul(&a.x, &a.z)
 	C.Sub(C, t1)
 
-	F := (&gfP2{}).MulUNC(C, &a.y)
+	F := (&gfP2{}).MulU(C, &a.y)
 	t1.Mul(A, &a.z)
 	F.Add(F, t1)
 	t1.MulU(B, &a.x)
diff --git a/sm9/bn256/gfp_amd64.s b/sm9/bn256/gfp_amd64.s
index 370d826..8027c12 100644
--- a/sm9/bn256/gfp_amd64.s
+++ b/sm9/bn256/gfp_amd64.s
@@ -67,7 +67,7 @@
 	CMOVQCC b2, a2 \
 	CMOVQCC b3, a3
 
-TEXT ·gfpNeg(SB),0,$0-16
+TEXT ·gfpNeg(SB),NOSPLIT,$0-16
 	MOVQ ·p2+0(SB), R8
 	MOVQ ·p2+8(SB), R9
 	MOVQ ·p2+16(SB), R10
@@ -85,7 +85,7 @@ TEXT ·gfpNeg(SB),0,$0-16
 	storeBlock(R8,R9,R10,R11, 0(DI))
 	RET
 
-TEXT ·gfpAdd(SB),0,$0-24
+TEXT ·gfpAdd(SB),NOSPLIT,$0-24
 	MOVQ a+8(FP), DI
 	MOVQ b+16(FP), SI
@@ -104,7 +104,7 @@ TEXT ·gfpAdd(SB),0,$0-24
 	storeBlock(R8,R9,R10,R11, 0(DI))
 	RET
 
-TEXT ·gfpDouble(SB),0,$0-16
+TEXT ·gfpDouble(SB),NOSPLIT,$0-16
 	MOVQ a+0(FP), DI
 	MOVQ b+8(FP), SI
@@ -122,7 +122,7 @@ TEXT ·gfpDouble(SB),0,$0-16
 	storeBlock(R8,R9,R10,R11, 0(DI))
 	RET
 
-TEXT ·gfpTriple(SB),0,$0-16
+TEXT ·gfpTriple(SB),NOSPLIT,$0-16
 	MOVQ a+0(FP), DI
 	MOVQ b+8(FP), SI
@@ -149,7 +149,7 @@ TEXT ·gfpTriple(SB),0,$0-16
 	storeBlock(R8,R9,R10,R11, 0(DI))
 	RET
 
-TEXT ·gfpSub(SB),0,$0-24
+TEXT ·gfpSub(SB),NOSPLIT,$0-24
 	MOVQ a+8(FP), DI
 	MOVQ b+16(FP), SI
@@ -180,7 +180,7 @@ TEXT ·gfpSub(SB),0,$0-24
 	storeBlock(R8,R9,R10,R11, 0(DI))
 	RET
 
-TEXT ·gfpMul(SB),0,$0-24
+TEXT ·gfpMul(SB),NOSPLIT,$0-24
 	MOVQ in1+8(FP), x_ptr
 	MOVQ in2+16(FP), y_ptr
diff --git a/sm9/bn256/gfp_test.go b/sm9/bn256/gfp_test.go
index 4972004..56a8731 100644
--- a/sm9/bn256/gfp_test.go
+++ b/sm9/bn256/gfp_test.go
@@ -56,7 +56,8 @@ func Test_gfpSqr(t *testing.T) {
 	gfpSqr(ret, x, 1)
 	pMinusOne.Mul(pMinusOne, pMinusOne)
 	pMinusOne.Mod(pMinusOne, p)
-	if *ret != *fromBigInt(pMinusOne) {
+	expected := fromBigInt(pMinusOne)
+	if *ret != *expected {
 		t.Errorf("bad sqr")
 	}
 	// p + 1
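A note on the gfP4.Invert hunk above: with gfP4 built as gfP2[v]/(v² - u), the tower implied by the SquareV/MulV helpers, an element a = x·v + y satisfies

    (x·v + y)·(-x·v + y) = y² - u·x²

so a⁻¹ = (-x·v + y)/(y² - u·x²). The code computes t3 = (u·x² - y²)⁻¹ and folds the sign into t1.Neg, which is why the renamed SquareU/Square calls land where they do.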
index 1f760ad..1b53ce5 100644
--- a/sm9/bn256/twist.go
+++ b/sm9/bn256/twist.go
@@ -17,6 +17,11 @@ var twistB = &gfP2{
 	*zero,
 }
 
+var threeTwistB = &gfP2{
+	*newGFp(3 * 5),
+	*zero,
+}
+
 // twistGen is the generator of group G₂.
 var twistGen = &twistPoint{
 	gfP2{
@@ -58,7 +63,7 @@ func NewTwistGenerator() *twistPoint {
 func (c *twistPoint) polynomial(x *gfP2) *gfP2 {
 	x3 := &gfP2{}
-	x3.SquareNC(x).Mul(x3, x).Add(x3, twistB)
+	x3.Square(x).Mul(x3, x).Add(x3, twistB)
 	return x3
 }
@@ -70,7 +75,7 @@ func (c *twistPoint) IsOnCurve() bool {
 	}
 
 	y2 := &gfP2{}
-	y2.SquareNC(&c.y)
+	y2.Square(&c.y)
 	x3 := c.polynomial(&c.x)
 
 	return y2.Equal(x3) == 1
@@ -87,92 +92,79 @@ func (c *twistPoint) IsInfinity() bool {
 	return c.z.IsZero()
 }
 
-func (c *twistPoint) Add(a, b *twistPoint) {
-	// For additional comments, see the same function in curve.go.
+func (c *twistPoint) Add(p1, p2 *twistPoint) {
+	// Complete addition formula for a = 0 from "Complete addition formulas for
+	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §3.2.
+	// Algorithm 7: Complete, projective point addition for prime order j-invariant 0 short Weierstrass curves.
 
-	if a.IsInfinity() {
-		c.Set(b)
-		return
-	}
-	if b.IsInfinity() {
-		c.Set(a)
-		return
-	}
+	t0, t1, t2, t3, t4 := new(gfP2), new(gfP2), new(gfP2), new(gfP2), new(gfP2)
+	x3, y3, z3 := new(gfP2), new(gfP2), new(gfP2)
+	t0.Mul(&p1.x, &p2.x)    // t0 := X1X2
+	t1.Mul(&p1.y, &p2.y)    // t1 := Y1Y2
+	t2.Mul(&p1.z, &p2.z)    // t2 := Z1Z2
+	t3.Add(&p1.x, &p1.y)    // t3 := X1 + Y1
+	t4.Add(&p2.x, &p2.y)    // t4 := X2 + Y2
+	t3.Mul(t3, t4)          // t3 := t3 * t4 = (X1 + Y1) * (X2 + Y2)
+	t4.Add(t0, t1)          // t4 := t0 + t1
+	t3.Sub(t3, t4)          // t3 := t3 - t4 = X1Y2 + X2Y1
+	t4.Add(&p1.y, &p1.z)    // t4 := Y1 + Z1
+	x3.Add(&p2.y, &p2.z)    // X3 := Y2 + Z2
+	t4.Mul(t4, x3)          // t4 := t4 * X3 = (Y1 + Z1)(Y2 + Z2)
+	x3.Add(t1, t2)          // X3 := t1 + t2
+	t4.Sub(t4, x3)          // t4 := t4 - X3 = Y1Z2 + Y2Z1
+	x3.Add(&p1.x, &p1.z)    // X3 := X1 + Z1
+	y3.Add(&p2.x, &p2.z)    // Y3 := X2 + Z2
+	x3.Mul(x3, y3)          // X3 := X3 * Y3
+	y3.Add(t0, t2)          // Y3 := t0 + t2
+	y3.Sub(x3, y3)          // Y3 := X3 - Y3 = X1Z2 + X2Z1
+	t0.Triple(t0)           // t0 := t0 + t0 + t0 = 3X1X2
+	t2.Mul(threeTwistB, t2) // t2 := 3b * t2 = 3bZ1Z2
+	z3.Add(t1, t2)          // Z3 := t1 + t2 = Y1Y2 + 3bZ1Z2
+	t1.Sub(t1, t2)          // t1 := t1 - t2 = Y1Y2 - 3bZ1Z2
+	y3.Mul(threeTwistB, y3) // Y3 := 3b * Y3 = 3b(X1Z2 + X2Z1)
+	x3.Mul(t4, y3)          // X3 := t4 * Y3 = 3b(X1Z2 + X2Z1)(Y1Z2 + Y2Z1)
+	t2.Mul(t3, t1)          // t2 := t3 * t1 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2)
+	x3.Sub(t2, x3)          // X3 := t2 - X3 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - 3b(Y1Z2 + Y2Z1)(X1Z2 + X2Z1)
+	y3.Mul(y3, t0)          // Y3 := Y3 * t0 = 9bX1X2(X1Z2 + X2Z1)
+	t1.Mul(t1, z3)          // t1 := t1 * Z3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2)
+	y3.Add(t1, y3)          // Y3 := t1 + Y3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + 9bX1X2(X1Z2 + X2Z1)
+	t0.Mul(t0, t3)          // t0 := t0 * t3 = 3X1X2(X1Y2 + X2Y1)
+	z3.Mul(z3, t4)          // Z3 := Z3 * t4 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2)
+	z3.Add(z3, t0)          // Z3 := Z3 + t0 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + 3X1X2(X1Y2 + X2Y1)
 
-	// See http://hyperelliptic.org/EFD/g1p/auto-code/shortw/jacobian-0/addition/add-2007-bl.op3
-	z12 := (&gfP2{}).SquareNC(&a.z)
-	z22 := (&gfP2{}).SquareNC(&b.z)
-	u1 := (&gfP2{}).MulNC(&a.x, z22)
-	u2 := (&gfP2{}).MulNC(&b.x, z12)
-
-	t := (&gfP2{}).MulNC(&b.z, z22)
-	s1 := (&gfP2{}).MulNC(&a.y, t)
-
-	t.Mul(&a.z, z12)
-	s2 := (&gfP2{}).MulNC(&b.y, t)
-
-	h := (&gfP2{}).Sub(u2, u1)
-	xEqual := h.IsZero()
-
-	t.Double(h)
-	i := (&gfP2{}).SquareNC(t)
-	j := (&gfP2{}).MulNC(h, i)
-
-	t.Sub(s2, s1)
-	yEqual := t.IsZero()
-	if xEqual && yEqual {
-		c.Double(a)
-		return
-	}
-	r := (&gfP2{}).Double(t)
-
-	v := (&gfP2{}).MulNC(u1, i)
-
-	t4 := (&gfP2{}).SquareNC(r)
-	t.Double(v)
-	t6 := (&gfP2{}).Sub(t4, j)
-	c.x.Sub(t6, t)
-
-	t.Sub(v, &c.x) // t7
-	t4.Mul(s1, j)  // t8
-	t6.Double(t4)  // t9
-	t4.Mul(r, t)   // t10
-	c.y.Sub(t4, t6)
-
-	t.Add(&a.z, &b.z) // t11
-	t4.Square(t)      // t12
-	t.Sub(t4, z12)    // t13
-	t4.Sub(t, z22)    // t14
-	c.z.Mul(t4, h)
+	c.x.Set(x3)
+	c.y.Set(y3)
+	c.z.Set(z3)
 }
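One property worth checking once the complete formulas are in: Add no longer needs the old equal-inputs escape hatch, so Add(P, P) must agree with Double(P). A test sketch, not part of the patch, relying on twistGen, gfP2.Equal and the (now projective) MakeAffine as used elsewhere in this package:

func TestTwistAddSelfMatchesDouble(t *testing.T) {
	sum, dbl := &twistPoint{}, &twistPoint{}
	sum.Add(twistGen, twistGen) // complete formulas handle P1 == P2
	dbl.Double(twistGen)
	sum.MakeAffine() // normalize both representatives before comparing
	dbl.MakeAffine()
	if sum.x.Equal(&dbl.x) != 1 || sum.y.Equal(&dbl.y) != 1 {
		t.Errorf("Add(P,P) != Double(P)")
	}
}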
 
-func (c *twistPoint) Double(a *twistPoint) {
-	// See http://hyperelliptic.org/EFD/g1p/auto-code/shortw/jacobian-0/doubling/dbl-2009-l.op3
-	A := (&gfP2{}).SquareNC(&a.x)
-	B := (&gfP2{}).SquareNC(&a.y)
-	C := (&gfP2{}).SquareNC(B)
+func (c *twistPoint) Double(p *twistPoint) {
+	// Complete addition formula for a = 0 from "Complete addition formulas for
+	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §3.2.
+	// Algorithm 9: Exception-free point doubling for prime order j-invariant 0 short Weierstrass curves.
+	t0, t1, t2 := new(gfP2), new(gfP2), new(gfP2)
+	x3, y3, z3 := new(gfP2), new(gfP2), new(gfP2)
 
-	t := (&gfP2{}).Add(&a.x, B)
-	t2 := (&gfP2{}).SquareNC(t)
-	t.Sub(t2, A)
-	t2.Sub(t, C)
-	d := (&gfP2{}).Double(t2)
-	t.Double(A)
-	e := (&gfP2{}).Add(t, A)
-	f := (&gfP2{}).SquareNC(e)
+	t0.Square(&p.y)         // t0 := Y^2
+	z3.Double(t0)           // Z3 := t0 + t0
+	z3.Double(z3)           // Z3 := Z3 + Z3
+	z3.Double(z3)           // Z3 := Z3 + Z3
+	t1.Mul(&p.y, &p.z)      // t1 := YZ
+	t2.Square(&p.z)         // t2 := Z^2
+	t2.Mul(threeTwistB, t2) // t2 := 3b * t2 = 3bZ^2
+	x3.Mul(t2, z3)          // X3 := t2 * Z3
+	y3.Add(t0, t2)          // Y3 := t0 + t2
+	z3.Mul(t1, z3)          // Z3 := t1 * Z3
+	t2.Triple(t2)           // t2 := t2 + t2 + t2
+	t0.Sub(t0, t2)          // t0 := t0 - t2
+	y3.Mul(t0, y3)          // Y3 := t0 * Y3
+	y3.Add(x3, y3)          // Y3 := X3 + Y3
+	t1.Mul(&p.x, &p.y)      // t1 := XY
+	x3.Mul(t0, t1)          // X3 := t0 * t1
+	x3.Double(x3)           // X3 := X3 + X3
 
-	t.Double(d)
-	c.x.Sub(f, t)
-
-	c.z.Mul(&a.y, &a.z)
-	c.z.Double(&c.z)
-
-	t.Double(C)
-	t2.Double(t)
-	t.Double(t2)
-	c.y.Sub(d, &c.x)
-	t2.Mul(e, &c.y)
-	c.y.Sub(t2, t)
+	c.x.Set(x3)
+	c.y.Set(y3)
+	c.z.Set(z3)
 }
 
 func (c *twistPoint) Mul(a *twistPoint, scalar *big.Int) {
@@ -190,7 +182,13 @@ func (c *twistPoint) Mul(a *twistPoint, scalar *big.Int) {
 	c.Set(sum)
 }
 
+// MakeAffine reverses the Projective transform.
+// A = 1/Z1
+// X3 = A*X1
+// Y3 = A*Y1
+// Z3 = 1
 func (c *twistPoint) MakeAffine() {
+	// TODO: do we need to change it to constant-time implementation?
 	if c.z.IsOne() {
 		return
 	} else if c.z.IsZero() {
@@ -200,12 +198,12 @@ func (c *twistPoint) MakeAffine() {
 		return
 	}
 
-	zInv := (&gfP2{}).Invert(&c.z)
-	t := (&gfP2{}).MulNC(&c.y, zInv)
-	zInv2 := (&gfP2{}).SquareNC(zInv)
-	c.y.Mul(t, zInv2)
-	t.Mul(&c.x, zInv2)
-	c.x.Set(t)
+	zInv := &gfP2{}
+	zInv.Invert(&c.z)
+
+	c.x.Mul(&c.x, zInv)
+	c.y.Mul(&c.y, zInv)
+
 	c.z.SetOne()
 	c.t.SetOne()
 }
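The next patch in the series addresses the coordinate-system mismatch the rewrite above leaves behind: Add, Double and MakeAffine now use homogeneous projective coordinates,

    x = X/Z, y = Y/Z

while Frobenius and related helpers still produce Jacobian coordinates,

    x = X/Z², y = Y/Z³

so normalizing a Frobenius result with the projective MakeAffine would yield a wrong affine point.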
From 5b5b26c095edeb66c5ca3b4bf1da9c8c4cfe7dd5 Mon Sep 17 00:00:00 2001
From: Sun Yimin
Date: Fri, 21 Jul 2023 17:51:25 +0800
Subject: [PATCH 5/6] sm9/bn256: fix twist Frobenius bug due to #144, will
 further review those functions usage

---
 sm9/bn256/twist.go      | 23 +++++++++++++++++++++++
 sm9/bn256/twist_test.go |  9 ++++++---
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/sm9/bn256/twist.go b/sm9/bn256/twist.go
index 1b53ce5..e56575a 100644
--- a/sm9/bn256/twist.go
+++ b/sm9/bn256/twist.go
@@ -208,6 +208,29 @@ func (c *twistPoint) MakeAffine() {
 	c.t.SetOne()
 }
 
+// AffineFromJacobian reverses the Jacobian transform.
+// The Jacobian coordinates are (x1, y1, z1)
+// where x = x1/z1² and y = y1/z1³.
+func (c *twistPoint) AffineFromJacobian() {
+	if c.z.IsOne() {
+		return
+	} else if c.z.IsZero() {
+		c.x.SetZero()
+		c.y.SetOne()
+		c.t.SetZero()
+		return
+	}
+
+	zInv := (&gfP2{}).Invert(&c.z)
+	t := (&gfP2{}).Mul(&c.y, zInv)
+	zInv2 := (&gfP2{}).Square(zInv)
+	c.y.Mul(t, zInv2)
+	t.Mul(&c.x, zInv2)
+	c.x.Set(t)
+	c.z.SetOne()
+	c.t.SetOne()
+}
+
 func (c *twistPoint) Neg(a *twistPoint) {
 	c.x.Set(&a.x)
 	c.y.Neg(&a.y)
diff --git a/sm9/bn256/twist_test.go b/sm9/bn256/twist_test.go
index bf34e7a..6ea1a0b 100644
--- a/sm9/bn256/twist_test.go
+++ b/sm9/bn256/twist_test.go
@@ -28,7 +28,7 @@ func TestAddNeg(t *testing.T) {
 func Test_TwistFrobeniusP(t *testing.T) {
 	ret1, ret2 := &twistPoint{}, &twistPoint{}
 	ret1.Frobenius(twistGen)
-	ret1.MakeAffine()
+	ret1.AffineFromJacobian()
 
 	ret2.x.Conjugate(&twistGen.x)
 	ret2.x.MulScalar(&ret2.x, betaToNegPPlus1Over3)
@@ -49,12 +49,15 @@ func Test_TwistFrobeniusP2(t *testing.T) {
 	ret1, ret2 := &twistPoint{}, &twistPoint{}
 	ret1.Frobenius(twistGen)
+	ret1.AffineFromJacobian()
 	ret1.Frobenius(ret1)
+	ret1.AffineFromJacobian()
 	if !ret1.IsOnCurve() {
 		t.Errorf("point should be on curve")
 	}
 
 	ret2.FrobeniusP2(twistGen)
+	ret2.AffineFromJacobian()
 	if !ret2.IsOnCurve() {
 		t.Errorf("point should be on curve")
 	}
@@ -77,7 +80,7 @@ func Test_TwistFrobeniusP2_Case2(t *testing.T) {
 	}
 
 	ret2.FrobeniusP2(twistGen)
-	ret2.MakeAffine()
+	ret2.AffineFromJacobian()
 	if !ret2.IsOnCurve() {
 		t.Errorf("point should be on curve")
 	}
@@ -100,7 +103,7 @@ func Test_TwistNegFrobeniusP2_Case2(t *testing.T) {
 	}
 
 	ret2.NegFrobeniusP2(twistGen)
-	ret2.MakeAffine()
+	ret2.AffineFromJacobian()
 	if !ret2.IsOnCurve() {
 		t.Errorf("point should be on curve")
 	}

From 76131e643806c01ee0f5a7aa91f439ea18ebc31e Mon Sep 17 00:00:00 2001
From: Sun Yimin
Date: Fri, 21 Jul 2023 18:06:22 +0800
Subject: [PATCH 6/6] internal/sm2ec: not use ADX first

---
 internal/sm2ec/p256_asm_amd64.s | 73 ++++++++++++---------------------
 1 file changed, 26 insertions(+), 47 deletions(-)

diff --git a/internal/sm2ec/p256_asm_amd64.s b/internal/sm2ec/p256_asm_amd64.s
index 979fcc5..7069219 100644
--- a/internal/sm2ec/p256_asm_amd64.s
+++ b/internal/sm2ec/p256_asm_amd64.s
@@ -2095,7 +2095,6 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
 	CMPB ·supportBMI2+0(SB), $0x01
 	JEQ internalMulBMI2
 
-	// [t3, t2, t1, t0] * acc4
 	MOVQ acc4, mul0
 	MULQ t0
 	MOVQ mul0, acc0
@@ -2119,7 +2118,6 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
 	ADCQ $0, mul1
 	MOVQ mul1, acc4
 
-	// [t3, t2, t1, t0] * acc5
 	MOVQ acc5, mul0
 	MULQ t0
 	ADDQ mul0, acc1
@@ -2150,7 +2148,6 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
 	ADCQ $0, mul1
 	MOVQ mul1, acc5
 
-	// [t3, t2, t1, t0] * acc6
 	MOVQ acc6, mul0
 	MULQ t0
 	ADDQ mul0, acc2
@@ -2181,7 +2178,6 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
 	ADCQ $0, mul1
 	MOVQ mul1, acc6
 
-	// [t3, t2, t1, t0] * acc7
 	MOVQ acc7, mul0
 	MULQ t0
 	ADDQ mul0, acc3
@@ -2211,8 +2207,6 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
 	ADDQ mul0, acc6
 	ADCQ $0, mul1
 	MOVQ mul1, acc7
-
-	// T = [acc7, acc6, acc5, acc4, acc3, acc2, acc1, acc0]
 	// First reduction step
 	MOVQ acc0, mul0
 	MOVQ acc0, mul1
@@ -2298,9 +2292,7 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
 	CMOVQCS acc3, acc7
 	RET
-
 internalMulBMI2:
-	// [t3, t2, t1, t0] * acc4
 	MOVQ acc4, mul1
 	MULXQ t0, acc0, acc1
@@ -2314,7 +2306,6 @@ internalMulBMI2:
 	ADCQ mul0, acc3
 	ADCQ $0, acc4
 
-	// [t3, t2, t1, t0] * acc5
 	MOVQ acc5, mul1
 	MULXQ t0, mul0, hlp
 	ADDQ mul0, acc1
@@ -2335,7 +2326,6 @@ internalMulBMI2:
 	ADDQ mul0, acc4
 	ADCQ $0, acc5
 
-	// [t3, t2, t1, t0] * acc6
 	MOVQ acc6, mul1
 	MULXQ t0, mul0, hlp
 	ADDQ mul0, acc2
@@ -2356,7 +2346,6 @@ internalMulBMI2:
 	ADDQ mul0, acc5
 	ADCQ $0, acc6
 
-	// [t3, t2, t1, t0] * acc7
 	MOVQ acc7, mul1
 	MULXQ t0, mul0, hlp
 	ADDQ mul0, acc3
@@ -2377,7 +2366,6 @@ internalMulBMI2:
 	ADDQ mul0, acc6
 	ADCQ $0, acc7
 
-	// T = [acc7, acc6, acc5, acc4, acc3, acc2, acc1, acc0]
 	// First reduction step
 	MOVQ acc0, mul0
 	MOVQ acc0, mul1
@@ -2554,7 +2542,6 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
 	CMPB ·supportBMI2+0(SB), $0x01
 	JEQ internalSqrBMI2
 
-	// [acc7, acc6, acc5] * acc4
 	MOVQ acc4, mul0
 	MULQ acc5
 	MOVQ mul0, acc1
@@ -2572,7 +2559,6 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
 	ADCQ $0, mul1
 	MOVQ mul1, t0
 
-	// [acc7, acc6] * acc5
 	MOVQ acc5, mul0
 	MULQ acc6
 	ADDQ mul0, acc3
@@ -2587,7 +2573,6 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
 	ADCQ $0, mul1
 	MOVQ mul1, t1
 
-	// acc7 * acc6
 	MOVQ acc6, mul0
 	MULQ acc7
 	ADDQ mul0, t1
@@ -2628,70 +2613,64 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
 	ADCQ mul0, t2
 	ADCQ DX, t3
 
-	// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
 	sm2P256SqrReductionInternal()
 
 	RET
 
 internalSqrBMI2:
-	XORQ t3, t3
-
-	// [acc7, acc6, acc5] * acc4
 	MOVQ acc4, mul1
 	MULXQ acc5, acc1, acc2
 
 	MULXQ acc6, mul0, acc3
-	ADOXQ mul0, acc2
+	ADDQ mul0, acc2
 	MULXQ acc7, mul0, t0
-	ADOXQ mul0, acc3
-	ADOXQ t3, t0
+	ADCQ mul0, acc3
+	ADCQ $0, t0
 
-	// [acc7, acc6] * acc5
 	MOVQ acc5, mul1
 	MULXQ acc6, mul0, hlp
-	ADOXQ mul0, acc3
+	ADDQ mul0, acc3
+	ADCQ hlp, t0
 	MULXQ acc7, mul0, t1
-	ADCXQ hlp, mul0
-	ADOXQ mul0, t0
-	ADCXQ t3, t1
+	ADCQ $0, t1
+	ADDQ mul0, t0
 
-	// acc7 * acc6
 	MOVQ acc6, mul1
 	MULXQ acc7, mul0, t2
-	ADOXQ mul0, t1
-	ADOXQ t3, t2
-
+	ADCQ mul0, t1
+	ADCQ $0, t2
+	XORQ t3, t3
+
 	// *2
-	ADOXQ acc1, acc1
-	ADOXQ acc2, acc2
-	ADOXQ acc3, acc3
-	ADOXQ t0, t0
-	ADOXQ t1, t1
-	ADOXQ t2, t2
-	ADOXQ t3, t3
+	ADDQ acc1, acc1
+	ADCQ acc2, acc2
+	ADCQ acc3, acc3
+	ADCQ t0, t0
+	ADCQ t1, t1
+	ADCQ t2, t2
+	ADCQ $0, t3
 
 	// Missing products
 	MOVQ acc4, mul1
 	MULXQ mul1, acc0, acc4
-	ADCXQ acc4, acc1
+	ADDQ acc4, acc1
 
 	MOVQ acc5, mul1
 	MULXQ mul1, mul0, acc4
-	ADCXQ mul0, acc2
-	ADCXQ acc4, acc3
+	ADCQ mul0, acc2
+	ADCQ acc4, acc3
 
 	MOVQ acc6, mul1
 	MULXQ mul1, mul0, acc4
-	ADCXQ mul0, t0
-	ADCXQ acc4, t1
+	ADCQ mul0, t0
+	ADCQ acc4, t1
 
 	MOVQ acc7, mul1
 	MULXQ mul1, mul0, acc4
-	ADCXQ mul0, t2
-	ADCXQ acc4, t3
-
-	// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
+	ADCQ mul0, t2
+	ADCQ acc4, t3
+
 	sm2P256SqrReductionInternal()
 
 	RET
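For readers comparing the MULQ and MULX paths above: both implement plain 4x4-limb schoolbook multiplication with sequential carry chains (ADX's ADOXQ/ADCXQ dual chains are what this patch removes). A Go sketch of the same accumulation structure, for illustration only; the real routine operates on Montgomery-domain limbs and additionally performs the reduction steps shown above.

package main

import (
	"fmt"
	"math/bits"
)

// mul256 returns the full 512-bit product of two little-endian 4-limb
// values, mirroring the single-carry-chain accumulation of the non-ADX path.
func mul256(a, b [4]uint64) [8]uint64 {
	var r [8]uint64
	for i := 0; i < 4; i++ { // b[i] plays the role of acc4..acc7
		var carry uint64
		for j := 0; j < 4; j++ { // a[j] plays the role of t0..t3
			hi, lo := bits.Mul64(a[j], b[i])
			lo, c := bits.Add64(lo, carry, 0)
			hi += c // cannot overflow: hi <= 2^64-2 after Mul64
			r[i+j], c = bits.Add64(r[i+j], lo, 0)
			carry = hi + c // fits: a[j]*b[i] + carry + r[i+j] < 2^128
		}
		r[i+4] = carry
	}
	return r
}

func main() {
	p := mul256(
		[4]uint64{^uint64(0), ^uint64(0), ^uint64(0), ^uint64(0)}, // 2^256 - 1
		[4]uint64{2, 0, 0, 0})
	fmt.Println(p) // limbs of (2^256-1)*2 = 2^257 - 2
}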