diff --git a/sm9/bn256/constants.go b/sm9/bn256/constants.go index 5d8dee4..bb9f977 100644 --- a/sm9/bn256/constants.go +++ b/sm9/bn256/constants.go @@ -26,7 +26,7 @@ var p2 = [4]uint64{0xe56f9b27e351457d, 0x21f2934b1a7aeedb, 0xd603ab4ff58ec745, 0 // np is the negative inverse of p, mod 2^256. var np = [4]uint64{0x892bc42c2f2ee42b, 0x181ae39613c8dbaf, 0x966a4b291522b137, 0xafd2bac5558a13b3} -// b3 is 15 +// Montgomery encoding of 15 var b3 = [4]uint64{0x2dd845ba5a554cbf, 0x3719ead6d3ea67f6, 0x71b2f270db49a754, 0x0cbfffffc8934e29} // rN1 is R^-1 where R = 2^256 mod p. diff --git a/sm9/bn256/curve.go b/sm9/bn256/curve.go index c5b1dfe..d8bf0f5 100644 --- a/sm9/bn256/curve.go +++ b/sm9/bn256/curve.go @@ -313,48 +313,3 @@ func curvePointAdd(c, a, b *curvePoint) int { return pointEq } - -func curvePointAddComplete(c, p1, p2 *curvePoint) { - // Complete addition formula for a = 0 from "Complete addition formulas for - // prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §3.2. - // Algorithm 7: Complete, projective point addition for prime order j-invariant 0 short Weierstrass curves. - - t0, t1, t2, t3, t4 := new(gfP), new(gfP), new(gfP), new(gfP), new(gfP) - x3, y3, z3 := new(gfP), new(gfP), new(gfP) - gfpMul(t0, &p1.x, &p2.x) // t0 := X1X2 - gfpMul(t1, &p1.y, &p2.y) // t1 := Y1Y2 - gfpMul(t2, &p1.z, &p2.z) // t2 := Z1Z2 - gfpAdd(t3, &p1.x, &p1.y) // t3 := X1 + Y1 - gfpAdd(t4, &p2.x, &p2.y) // t4 := X2 + Y2 - gfpMul(t3, t3, t4) // t3 := t3 * t4 = (X1 + Y1) * (X2 + Y2) - gfpAdd(t4, t0, t1) // t4 := t0 + t1 - gfpSub(t3, t3, t4) // t3 := t3 - t4 = X1Y2 + X2Y1 - gfpAdd(t4, &p1.y, &p1.z) // t4 := Y1 + Z1 - gfpAdd(x3, &p2.y, &p2.z) // X3 := Y2 + Z2 - gfpMul(t4, t4, x3) // t4 := t4 * X3 = (Y1 + Z1)(Y2 + Z2) - gfpAdd(x3, t1, t2) // X3 := t1 + t2 - gfpSub(t4, t4, x3) // t4 := t4 - X3 = Y1Z2 + Y2Z1 - gfpAdd(x3, &p1.x, &p1.z) // X3 := X1 + Z1 - gfpAdd(y3, &p2.x, &p2.z) // Y3 := X2 + Z2 - gfpMul(x3, x3, y3) // X3 := X3 * Y3 - gfpAdd(y3, t0, t2) // Y3 := t0 + t2 - gfpSub(y3, x3, y3) // Y3 := X3 - Y3 = X1Z2 + X2Z1 - gfpTriple(t0, t0) // t0 := t0 + t0 + t0 = 3X1X2 - gfpMul(t2, threeCurveB, t2) // t2 := 3b * t2 = 3bZ1Z2 - gfpAdd(z3, t1, t2) // Z3 := t1 + t2 = Y1Y2 + 3bZ1Z2 - gfpSub(t1, t1, t2) // t1 := t1 - t2 = Y1Y2 - 3bZ1Z2 - gfpMul(y3, threeCurveB, y3) // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1) - gfpMul(x3, t4, y3) // X3 := t4 * Y3 = 3b(X1Z2 + X2Z1)(Y1Z2 + Y2Z1) - gfpMul(t2, t3, t1) // t2 := t3 * t1 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - gfpSub(x3, t2, x3) // X3 := t2 - X3 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - 3b(Y1Z2 + Y2Z1)(X1Z2 + X2Z1) - gfpMul(y3, y3, t0) // Y3 := Y3 * t0 = 9bX1X2(X1Z2 + X2Z1) - gfpMul(t1, t1, z3) // t1 := t1 * Z3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) - gfpAdd(y3, t1, y3) // Y3 := t1 + Y3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + 9bX1X2(X1Z2 + X2Z1) - gfpMul(t0, t0, t3) // t0 := t0 * t3 = 3X1X2(X1Y2 + X2Y1) - gfpMul(z3, z3, t4) // Z3 := Z3 * t4 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) - gfpAdd(z3, z3, t0) // Z3 := Z3 + t0 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + 3X1X2(X1Y2 + X2Y1) - - c.x.Set(x3) - c.y.Set(y3) - c.z.Set(z3) -} diff --git a/sm9/bn256/gfp2_g1_amd64.s b/sm9/bn256/gfp2_g1_amd64.s index 74311cf..e9040f6 100644 --- a/sm9/bn256/gfp2_g1_amd64.s +++ b/sm9/bn256/gfp2_g1_amd64.s @@ -1474,6 +1474,263 @@ TEXT gfpIsZero(SB),NOSPLIT,$0 CMOVQEQ t1, AX RET +/* ---------------------------------------*/ +#define x1in(off) (32*0 + off)(SP) +#define y1in(off) (32*1 + off)(SP) +#define z1in(off) (32*2 + off)(SP) +#define x2in(off) (32*3 + off)(SP) +#define y2in(off) (32*4 + off)(SP) +#define z2in(off) (32*5 + off)(SP) +#define xout(off) (32*6 + off)(SP) +#define yout(off) (32*7 + off)(SP) +#define zout(off) (32*8 + off)(SP) +#define tmp0(off) (32*9 + off)(SP) +#define tmp1(off) (32*10 + off)(SP) +#define tmp2(off) (32*11 + off)(SP) +#define tmp3(off) (32*12 + off)(SP) +#define tmp4(off) (32*13 + off)(SP) +#define rptr (32*14)(SP) + +#define curvePointAddCompleteInline \ + LDacc (x1in) \ + LDt (x2in) \ + CALL gfpMulInternal(SB) \ // t0 := X1X2 + ST (tmp0) \ + LDacc (y1in) \ + LDt (y2in) \ + CALL gfpMulInternal(SB) \ // t1 := Y1Y2 + ST (tmp1) \ + LDacc (z1in) \ + LDt (z2in) \ + CALL gfpMulInternal(SB) \ // t2 := Z1Z2 + ST (tmp2) \ + \ + LDacc (x1in) \ + LDt (y1in) \ + gfpAddInline \ + STt (tmp3) \ // t3 := X1 + Y1 + LDacc (x2in) \ + LDt (y2in) \ + gfpAddInline \ + LDacc (tmp3) \ + CALL gfpMulInternal(SB) \ // t3 := t3 * t4 = (X1 + Y1) * (X2 + Y2) + ST (tmp3) \ + LDacc (tmp0) \ + LDt (tmp1) \ + gfpAddInline \ + LDacc (tmp3) \ + CALL gfpSubInternal(SB) \ // t3 := t3 - t4 = X1Y2 + X2Y1 + ST (tmp3) \ + \ + LDacc (y1in) \ + LDt (z1in) \ + gfpAddInline \ // t4 := Y1 + Z1 + STt (tmp4) \ + LDacc (y2in) \ + LDt (z2in) \ + gfpAddInline \ + LDacc (tmp4) \ + CALL gfpMulInternal(SB) \ // t4 := t4 * X3 = (Y1 + Z1)(Y2 + Z2) + ST (tmp4) \ + LDacc (tmp1) \ + LDt (tmp2) \ + gfpAddInline \ + LDacc (tmp4) \ + CALL gfpSubInternal(SB) \ // t4 := t4 - X3 = Y1Z2 + Y2Z1 + ST (tmp4) \ + \ + LDacc (z1in) \ + LDt (x1in) \ + gfpAddInline \ // X3 := X1 + Z1 + STt (xout) \ + LDacc (z2in) \ + LDt (x2in) \ + gfpAddInline \ + LDacc (xout) \ + CALL gfpMulInternal(SB) \ // X3 := X3 * Y3 + ST (xout) \ + LDacc (tmp0) \ + LDt (tmp2) \ + gfpAddInline \ + LDacc (xout) \ + CALL gfpSubInternal(SB) \ // Y3 := X3 - Y3 = X1Z2 + X2Z1 + ST (yout) \ + \ + LDacc (tmp0) \ + gfpMulBy2Inline \ + LDacc (tmp0) \ + gfpAddInline \ // t0 := t0 + t0 + t0 = 3X1X2 + STt (tmp0) \ + \ + LDacc (tmp2) \ + gfpMulBy2Inline \ + t2acc \ + gfpMulBy2Inline \ + t2acc \ + gfpMulBy2Inline \ + t2acc \ + gfpMulBy2Inline \ + t2acc \ + LDt (tmp2) \ + CALL gfpSubInternal(SB) \ // t2 := 3b * t2 = 3bZ1Z2 + ST (tmp2) \ + \ + LDt (tmp1) \ + gfpAddInline \ // Z3 := t1 + t2 = Y1Y2 + 3bZ1Z2 + STt (zout) \ + \ + LDacc (tmp1) \ + LDt (tmp2) \ + CALL gfpSubInternal(SB) \ // t1 := t1 - t2 = Y1Y2 - 3bZ1Z2 + ST (tmp1) \ + \ + LDacc (yout) \ + gfpMulBy2Inline \ + t2acc \ + gfpMulBy2Inline \ + t2acc \ + gfpMulBy2Inline \ + t2acc \ + gfpMulBy2Inline \ + t2acc \ + LDt (yout) \ + CALL gfpSubInternal(SB) \ // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1) + ST (yout) \ + \ + LDt (tmp4) \ + CALL gfpMulInternal(SB) \ // X3 := t4 * Y3 = 3b(X1Z2 + X2Z1)(Y1Z2 + Y2Z1) + ST (xout) \ + \ + LDacc (tmp1) \ + LDt (tmp3) \ + CALL gfpMulInternal(SB) \ // t2 := t3 * t1 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) + LDt (xout) \ + CALL gfpSubInternal(SB) \ // X3 := t2 - X3 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - 3b(Y1Z2 + Y2Z1)(X1Z2 + X2Z1) + MOVQ rptr, AX \ + \// Store x + MOVQ acc4, (16*0 + 8*0)(AX) \ + MOVQ acc5, (16*0 + 8*1)(AX) \ + MOVQ acc6, (16*0 + 8*2)(AX) \ + MOVQ acc7, (16*0 + 8*3)(AX) \ + \ + LDacc (yout) \ + LDt (tmp0) \ + CALL gfpMulInternal(SB) \ // Y3 := Y3 * t0 = 9bX1X2(X1Z2 + X2Z1) + ST (yout) \ + \ + LDacc (tmp1) \ + LDt (zout) \ + CALL gfpMulInternal(SB) \ // t1 := t1 * Z3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + LDt (yout) \ + gfpAddInline \ // Y3 := t1 + Y3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + 9bX1X2(X1Z2 + X2Z1) + MOVQ rptr, AX \ + \// Store y + MOVQ t0, (16*2 + 8*0)(AX) \ + MOVQ t1, (16*2 + 8*1)(AX) \ + MOVQ t2, (16*2 + 8*2)(AX) \ + MOVQ t3, (16*2 + 8*3)(AX) \ + \ + LDacc (tmp0) \ + LDt (tmp3) \ + CALL gfpMulInternal(SB) \ // t0 := t0 * t3 = 3X1X2(X1Y2 + X2Y1) + ST (tmp0) \ + LDacc (zout) \ + LDt (tmp4) \ + CALL gfpMulInternal(SB) \ // Z3 := Z3 * t4 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + LDt (tmp0) \ + gfpAddInline \ // Z3 := Z3 + t0 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + 3X1X2(X1Y2 + X2Y1) + MOVQ rptr, AX \ + MOVQ $0, rptr \ + \// Store z + MOVQ t0, (16*4 + 8*0)(AX) \ + MOVQ t1, (16*4 + 8*1)(AX) \ + MOVQ t2, (16*4 + 8*2)(AX) \ + MOVQ t3, (16*4 + 8*3)(AX) \ + +// func curvePointAddComplete(c, a, b *curvePoint) +TEXT ·curvePointAddComplete(SB),0,$480-24 + // Move input to stack in order to free registers + MOVQ res+0(FP), AX + MOVQ in1+8(FP), BX + MOVQ in2+16(FP), CX + + CMPB ·supportAVX2+0(SB), $0x01 + JEQ pointadd_avx2 + + MOVOU (16*0)(BX), X0 + MOVOU (16*1)(BX), X1 + MOVOU (16*2)(BX), X2 + MOVOU (16*3)(BX), X3 + MOVOU (16*4)(BX), X4 + MOVOU (16*5)(BX), X5 + + MOVOU X0, x1in(16*0) + MOVOU X1, x1in(16*1) + MOVOU X2, y1in(16*0) + MOVOU X3, y1in(16*1) + MOVOU X4, z1in(16*0) + MOVOU X5, z1in(16*1) + + MOVOU (16*0)(CX), X0 + MOVOU (16*1)(CX), X1 + MOVOU (16*2)(CX), X2 + MOVOU (16*3)(CX), X3 + MOVOU (16*4)(CX), X4 + MOVOU (16*5)(CX), X5 + + MOVOU X0, x2in(16*0) + MOVOU X1, x2in(16*1) + MOVOU X2, y2in(16*0) + MOVOU X3, y2in(16*1) + MOVOU X4, z2in(16*0) + MOVOU X5, z2in(16*1) + // Store pointer to result + MOVQ AX, rptr + + curvePointAddCompleteInline + + RET + +pointadd_avx2: + VMOVDQU (32*0)(BX), Y0 + VMOVDQU (32*1)(BX), Y1 + VMOVDQU (32*2)(BX), Y2 + + VMOVDQU Y0, x1in(32*0) + VMOVDQU Y1, y1in(32*0) + VMOVDQU Y2, z1in(32*0) + + VMOVDQU (32*0)(CX), Y0 + VMOVDQU (32*1)(CX), Y1 + VMOVDQU (32*2)(CX), Y2 + + VMOVDQU Y0, x2in(32*0) + VMOVDQU Y1, y2in(32*0) + VMOVDQU Y2, z2in(32*0) + + // Store pointer to result + MOVQ AX, rptr + curvePointAddCompleteInline + + VZEROUPPER + RET + +#undef x1in +#undef y1in +#undef z1in +#undef x2in +#undef y2in +#undef z2in +#undef xout +#undef yout +#undef zout +#undef tmp0 +#undef tmp1 +#undef tmp2 +#undef tmp3 +#undef tmp4 +#undef rptr + /* ---------------------------------------*/ /* #define x1in(off) (32*0 + off)(SP) diff --git a/sm9/bn256/gfp2_g1_decl.go b/sm9/bn256/gfp2_g1_decl.go index f4b8fbb..71c37f2 100644 --- a/sm9/bn256/gfp2_g1_decl.go +++ b/sm9/bn256/gfp2_g1_decl.go @@ -27,9 +27,9 @@ func gfp2SquareU(c, a *gfP2) // //go:noescape func curvePointDoubleComplete(c, a *curvePoint) -/* + // Point addition. Sets res = in1 + in2. in1 can be same as in2, also can be at infinity. // //go:noescape func curvePointAddComplete(c, a, b *curvePoint) -*/ + diff --git a/sm9/bn256/gfp2_g1_generic.go b/sm9/bn256/gfp2_g1_generic.go index fe245fe..83fa955 100644 --- a/sm9/bn256/gfp2_g1_generic.go +++ b/sm9/bn256/gfp2_g1_generic.go @@ -111,3 +111,48 @@ func curvePointDoubleComplete(c, p *curvePoint) { c.y.Set(y3) c.z.Set(z3) } + +func curvePointAddComplete(c, p1, p2 *curvePoint) { + // Complete addition formula for a = 0 from "Complete addition formulas for + // prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §3.2. + // Algorithm 7: Complete, projective point addition for prime order j-invariant 0 short Weierstrass curves. + + t0, t1, t2, t3, t4 := new(gfP), new(gfP), new(gfP), new(gfP), new(gfP) + x3, y3, z3 := new(gfP), new(gfP), new(gfP) + gfpMul(t0, &p1.x, &p2.x) // t0 := X1X2 + gfpMul(t1, &p1.y, &p2.y) // t1 := Y1Y2 + gfpMul(t2, &p1.z, &p2.z) // t2 := Z1Z2 + gfpAdd(t3, &p1.x, &p1.y) // t3 := X1 + Y1 + gfpAdd(t4, &p2.x, &p2.y) // t4 := X2 + Y2 + gfpMul(t3, t3, t4) // t3 := t3 * t4 = (X1 + Y1) * (X2 + Y2) + gfpAdd(t4, t0, t1) // t4 := t0 + t1 + gfpSub(t3, t3, t4) // t3 := t3 - t4 = X1Y2 + X2Y1 + gfpAdd(t4, &p1.y, &p1.z) // t4 := Y1 + Z1 + gfpAdd(x3, &p2.y, &p2.z) // X3 := Y2 + Z2 + gfpMul(t4, t4, x3) // t4 := t4 * X3 = (Y1 + Z1)(Y2 + Z2) + gfpAdd(x3, t1, t2) // X3 := t1 + t2 + gfpSub(t4, t4, x3) // t4 := t4 - X3 = Y1Z2 + Y2Z1 + gfpAdd(x3, &p1.x, &p1.z) // X3 := X1 + Z1 + gfpAdd(y3, &p2.x, &p2.z) // Y3 := X2 + Z2 + gfpMul(x3, x3, y3) // X3 := X3 * Y3 + gfpAdd(y3, t0, t2) // Y3 := t0 + t2 + gfpSub(y3, x3, y3) // Y3 := X3 - Y3 = X1Z2 + X2Z1 + gfpTriple(t0, t0) // t0 := t0 + t0 + t0 = 3X1X2 + gfpMul(t2, threeCurveB, t2) // t2 := 3b * t2 = 3bZ1Z2 + gfpAdd(z3, t1, t2) // Z3 := t1 + t2 = Y1Y2 + 3bZ1Z2 + gfpSub(t1, t1, t2) // t1 := t1 - t2 = Y1Y2 - 3bZ1Z2 + gfpMul(y3, threeCurveB, y3) // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1) + gfpMul(x3, t4, y3) // X3 := t4 * Y3 = 3b(X1Z2 + X2Z1)(Y1Z2 + Y2Z1) + gfpMul(t2, t3, t1) // t2 := t3 * t1 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) + gfpSub(x3, t2, x3) // X3 := t2 - X3 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - 3b(Y1Z2 + Y2Z1)(X1Z2 + X2Z1) + gfpMul(y3, y3, t0) // Y3 := Y3 * t0 = 9bX1X2(X1Z2 + X2Z1) + gfpMul(t1, t1, z3) // t1 := t1 * Z3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + gfpAdd(y3, t1, y3) // Y3 := t1 + Y3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + 9bX1X2(X1Z2 + X2Z1) + gfpMul(t0, t0, t3) // t0 := t0 * t3 = 3X1X2(X1Y2 + X2Y1) + gfpMul(z3, z3, t4) // Z3 := Z3 * t4 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + gfpAdd(z3, z3, t0) // Z3 := Z3 + t0 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + 3X1X2(X1Y2 + X2Y1) + + c.x.Set(x3) + c.y.Set(y3) + c.z.Set(z3) +}