From 0a559b5202f10e7ecd0109fed9e78daa4094c9b5 Mon Sep 17 00:00:00 2001
From: Sun Yimin
Date: Tue, 25 Jul 2023 15:02:41 +0800
Subject: [PATCH] sm9/bn256: arm64 complete add

---
 sm9/bn256/gfp2_g1_arm64.go |  74 -----------------
 sm9/bn256/gfp2_g1_arm64.s  | 173 +++++++++++++++++++++++++++++++++++++
 sm9/bn256/gfp2_g1_decl.go  |   4 +-
 3 files changed, 175 insertions(+), 76 deletions(-)
 delete mode 100644 sm9/bn256/gfp2_g1_arm64.go

diff --git a/sm9/bn256/gfp2_g1_arm64.go b/sm9/bn256/gfp2_g1_arm64.go
deleted file mode 100644
index f92c169..0000000
--- a/sm9/bn256/gfp2_g1_arm64.go
+++ /dev/null
@@ -1,74 +0,0 @@
-//go:build arm64 && !purego
-// +build arm64,!purego
-
-package bn256
-
-// gfP2 multiplication.
-//
-//go:noescape
-func gfp2Mul(c, a, b *gfP2)
-
-// gfP2 multiplication. c = a*b*u
-//
-//go:noescape
-func gfp2MulU(c, a, b *gfP2)
-
-// gfP2 square.
-//
-//go:noescape
-func gfp2Square(c, a *gfP2)
-
-// gfP2 square and mult u.
-//
-//go:noescape
-func gfp2SquareU(c, a *gfP2)
-
-// Point doubling. Sets res = in + in. in can be the point at infinity.
-//
-//go:noescape
-func curvePointDoubleComplete(c, a *curvePoint)
-
-func curvePointAddComplete(c, p1, p2 *curvePoint) {
-	// Complete addition formula for a = 0 from "Complete addition formulas for
-	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §3.2.
-	// Algorithm 7: Complete, projective point addition for prime order j-invariant 0 short Weierstrass curves.
-
-	t0, t1, t2, t3, t4 := new(gfP), new(gfP), new(gfP), new(gfP), new(gfP)
-	x3, y3, z3 := new(gfP), new(gfP), new(gfP)
-	gfpMul(t0, &p1.x, &p2.x)    // t0 := X1X2
-	gfpMul(t1, &p1.y, &p2.y)    // t1 := Y1Y2
-	gfpMul(t2, &p1.z, &p2.z)    // t2 := Z1Z2
-	gfpAdd(t3, &p1.x, &p1.y)    // t3 := X1 + Y1
-	gfpAdd(t4, &p2.x, &p2.y)    // t4 := X2 + Y2
-	gfpMul(t3, t3, t4)          // t3 := t3 * t4 = (X1 + Y1) * (X2 + Y2)
-	gfpAdd(t4, t0, t1)          // t4 := t0 + t1
-	gfpSub(t3, t3, t4)          // t3 := t3 - t4 = X1Y2 + X2Y1
-	gfpAdd(t4, &p1.y, &p1.z)    // t4 := Y1 + Z1
-	gfpAdd(x3, &p2.y, &p2.z)    // X3 := Y2 + Z2
-	gfpMul(t4, t4, x3)          // t4 := t4 * X3 = (Y1 + Z1)(Y2 + Z2)
-	gfpAdd(x3, t1, t2)          // X3 := t1 + t2
-	gfpSub(t4, t4, x3)          // t4 := t4 - X3 = Y1Z2 + Y2Z1
-	gfpAdd(x3, &p1.x, &p1.z)    // X3 := X1 + Z1
-	gfpAdd(y3, &p2.x, &p2.z)    // Y3 := X2 + Z2
-	gfpMul(x3, x3, y3)          // X3 := X3 * Y3
-	gfpAdd(y3, t0, t2)          // Y3 := t0 + t2
-	gfpSub(y3, x3, y3)          // Y3 := X3 - Y3 = X1Z2 + X2Z1
-	gfpTriple(t0, t0)           // t0 := t0 + t0 + t0 = 3X1X2
-	gfpMul(t2, threeCurveB, t2) // t2 := 3b * t2 = 3bZ1Z2
-	gfpAdd(z3, t1, t2)          // Z3 := t1 + t2 = Y1Y2 + 3bZ1Z2
-	gfpSub(t1, t1, t2)          // t1 := t1 - t2 = Y1Y2 - 3bZ1Z2
-	gfpMul(y3, threeCurveB, y3) // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1)
-	gfpMul(x3, t4, y3)          // X3 := t4 * Y3 = 3b(X1Z2 + X2Z1)(Y1Z2 + Y2Z1)
-	gfpMul(t2, t3, t1)          // t2 := t3 * t1 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2)
-	gfpSub(x3, t2, x3)          // X3 := t2 - X3 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - 3b(Y1Z2 + Y2Z1)(X1Z2 + X2Z1)
-	gfpMul(y3, y3, t0)          // Y3 := Y3 * t0 = 9bX1X2(X1Z2 + X2Z1)
-	gfpMul(t1, t1, z3)          // t1 := t1 * Z3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2)
-	gfpAdd(y3, t1, y3)          // Y3 := t1 + Y3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + 9bX1X2(X1Z2 + X2Z1)
-	gfpMul(t0, t0, t3)          // t0 := t0 * t3 = 3X1X2(X1Y2 + X2Y1)
-	gfpMul(z3, z3, t4)          // Z3 := Z3 * t4 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2)
-	gfpAdd(z3, z3, t0)          // Z3 := Z3 + t0 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + 3X1X2(X1Y2 + X2Y1)
-
-	c.x.Set(x3)
-	c.y.Set(y3)
-	c.z.Set(z3)
-}
diff --git a/sm9/bn256/gfp2_g1_arm64.s b/sm9/bn256/gfp2_g1_arm64.s
index 878c3ec..a49aebe 100644
--- a/sm9/bn256/gfp2_g1_arm64.s
+++ b/sm9/bn256/gfp2_g1_arm64.s
@@ -757,6 +757,179 @@ TEXT ·curvePointDoubleComplete(SB),NOSPLIT,$168-16
 
 	RET
 
+/* ---------------------------------------*/
 #undef x3t
 #undef y3t
 #undef z3t
+
+#define tmp2(off) (32*2 + 8 + off)(RSP)
+#define tmp3(off) (32*3 + 8 + off)(RSP)
+#define tmp4(off) (32*4 + 8 + off)(RSP)
+#define x3t(off) (32*5 + 8 + off)(RSP)
+#define y3t(off) (32*6 + 8 + off)(RSP)
+#define z3t(off) (32*7 + 8 + off)(RSP)
+
+// func curvePointAddComplete(c, a, b *curvePoint)
+//
+// Complete projective addition c = a + b, following Algorithm 7 (a = 0) of
+// "Complete addition formulas for prime order elliptic curves"
+// (https://eprint.iacr.org/2015/1060, §3.2).
+// Each multiplication by 3b is done as four doublings and one subtraction
+// (16v - v = 15v), so the code relies on 3b = 15.
+// b_ptr is repointed at the result before the final stores; the x2in/y2in/z2in
+// macros are assumed to address through b_ptr, so they then write c.
+TEXT ·curvePointAddComplete(SB),0,$264-24
+	MOVD in1+8(FP), a_ptr
+	MOVD in2+16(FP), b_ptr
+
+	MOVD ·np+0x00(SB), hlp1
+	LDP ·p2+0x00(SB), (const0, const1)
+	LDP ·p2+0x10(SB), (const2, const3)
+
+	LDx (x1in)
+	LDy (x2in)
+	CALL gfpMulInternal(SB)  // t0 := X1X2
+	STy (tmp0)
+	LDx (y1in)
+	LDy (y2in)
+	CALL gfpMulInternal(SB)  // t1 := Y1Y2
+	STy (tmp1)
+	LDx (z1in)
+	LDy (z2in)
+	CALL gfpMulInternal(SB)  // t2 := Z1Z2
+	STy (tmp2)
+
+	LDx (x1in)
+	LDy (y1in)
+	gfpAddInline             // t3 := X1 + Y1
+	STx (tmp3)
+
+	LDx (x2in)
+	LDy (y2in)
+	gfpAddInline             // t4 := X2 + Y2
+	LDy (tmp3)
+	CALL gfpMulInternal(SB)  // t3 := t3 * t4 = (X1 + Y1) * (X2 + Y2)
+	STy (tmp3)
+
+	LDx (tmp0)
+	LDy (tmp1)
+	gfpAddInline             // t4 := t0 + t1
+	LDy (tmp3)
+	CALL gfpSubInternal(SB)  // t3 := t3 - t4 = X1Y2 + X2Y1
+	STx (tmp3)
+
+	LDx (y1in)
+	LDy (z1in)
+	gfpAddInline             // t4 := Y1 + Z1
+	STx (tmp4)
+
+	LDx (y2in)
+	LDy (z2in)
+	gfpAddInline             // X3 := Y2 + Z2
+	LDy (tmp4)
+	CALL gfpMulInternal(SB)  // t4 := t4 * X3 = (Y1 + Z1)(Y2 + Z2)
+	STy (tmp4)
+
+	LDx (tmp1)
+	LDy (tmp2)
+	gfpAddInline             // X3 := t1 + t2
+	LDy (tmp4)
+	CALL gfpSubInternal(SB)  // t4 := t4 - X3 = Y1Z2 + Y2Z1
+	STx (tmp4)
+
+	LDx (x1in)
+	LDy (z1in)
+	gfpAddInline             // X3 := X1 + Z1
+	STx (x3t)
+
+	LDx (x2in)
+	LDy (z2in)
+	gfpAddInline             // Y3 := X2 + Z2
+	LDy (x3t)
+	CALL gfpMulInternal(SB)  // X3 := X3 * Y3
+	STy (x3t)
+
+	LDx (tmp0)
+	LDy (tmp2)
+	gfpAddInline             // Y3 := t0 + t2
+	LDy (x3t)
+	CALL gfpSubInternal(SB)  // Y3 := X3 - Y3 = X1Z2 + X2Z1
+	STx (y3t)
+
+	LDy (tmp0)
+	gfpMulBy2Inline
+	gfpAddInline             // t0 := t0 + t0 + t0 = 3X1X2
+	STx (tmp0)
+
+	LDy (tmp2)
+	gfpMulBy2Inline
+	x2y
+	gfpMulBy2Inline
+	x2y
+	gfpMulBy2Inline
+	x2y
+	gfpMulBy2Inline
+	x2y
+	LDx (tmp2)
+	CALL gfpSubInternal(SB)  // t2 := 3b * t2 = 3bZ1Z2
+	STx (tmp2)
+
+	LDy (tmp1)
+	gfpAddInline             // Z3 := t1 + t2 = Y1Y2 + 3bZ1Z2
+	STx (z3t)
+
+	LDx (tmp2)
+	CALL gfpSubInternal(SB)  // t1 := t1 - t2 = Y1Y2 - 3bZ1Z2
+	STx (tmp1)
+
+	LDy (y3t)
+	gfpMulBy2Inline
+	x2y
+	gfpMulBy2Inline
+	x2y
+	gfpMulBy2Inline
+	x2y
+	gfpMulBy2Inline
+	x2y
+	LDx (y3t)
+	CALL gfpSubInternal(SB)  // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1)
+	STx (y3t)
+
+	LDy (tmp4)
+	CALL gfpMulInternal(SB)  // X3 := t4 * Y3 = 3b(X1Z2 + X2Z1)(Y1Z2 + Y2Z1)
+	STy (x3t)
+
+	MOVD res+0(FP), b_ptr
+
+	LDx (tmp3)
+	LDy (tmp1)
+	CALL gfpMulInternal(SB)  // t2 := t3 * t1 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2)
+	LDx (x3t)
+	CALL gfpSubInternal(SB)  // X3 := t2 - X3 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - 3b(Y1Z2 + Y2Z1)(X1Z2 + X2Z1)
+	STx (x2in)
+
+	LDy (y3t)
+	LDx (tmp0)
+	CALL gfpMulInternal(SB)  // Y3 := Y3 * t0 = 9bX1X2(X1Z2 + X2Z1)
+	STy (y3t)
+
+	LDx (tmp1)
+	LDy (z3t)
+	CALL gfpMulInternal(SB)  // t1 := t1 * Z3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2)
+	LDx (y3t)
+	gfpAddInline             // Y3 := t1 + Y3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + 9bX1X2(X1Z2 + X2Z1)
+	STx (y2in)
+
+	LDx (tmp0)
+	LDy (tmp3)
+	CALL gfpMulInternal(SB)  // t0 := t0 * t3 = 3X1X2(X1Y2 + X2Y1)
+	STy (tmp0)
+
+	LDx (tmp4)
+	LDy (z3t)
+	CALL gfpMulInternal(SB)  // Z3 := Z3 * t4 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2)
+	LDx (tmp0)
+	gfpAddInline             // Z3 := Z3 + t0 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + 3X1X2(X1Y2 + X2Y1)
+	STx (z2in)
+
+	RET
diff --git a/sm9/bn256/gfp2_g1_decl.go b/sm9/bn256/gfp2_g1_decl.go
index 71c37f2..6e5078a 100644
--- a/sm9/bn256/gfp2_g1_decl.go
+++ b/sm9/bn256/gfp2_g1_decl.go
@@ -1,5 +1,5 @@
-//go:build amd64 && !purego
-// +build amd64,!purego
+//go:build (amd64 && !purego) || (arm64 && !purego)
+// +build amd64,!purego arm64,!purego
 
 package bn256
 