sm9/bn256: curvePointDoubleComplete asm

2025-09-18 21:03:49 +08:00 · 2023-07-22 17:29:19 +08:00 · 2023-07-22 17:29:19 +08:00 · b21a234037
commit b21a234037
parent 2d615c7f94
6 changed files with 287 additions and 275 deletions
--- a/sm9/bn256/constants.go
+++ b/sm9/bn256/constants.go
@ -26,6 +26,9 @@ var p2 = [4]uint64{0xe56f9b27e351457d, 0x21f2934b1a7aeedb, 0xd603ab4ff58ec745, 0
 // np is the negative inverse of p, mod 2^256.
 var np = [4]uint64{0x892bc42c2f2ee42b, 0x181ae39613c8dbaf, 0x966a4b291522b137, 0xafd2bac5558a13b3}
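 // b3 is the field element 15 (presumably 3*b for the curve constant b = 5 — confirm), in the same encoded form as the other constants here.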
 var b3 = [4]uint64{0x2dd845ba5a554cbf, 0x3719ead6d3ea67f6, 0x71b2f270db49a754, 0x0cbfffffc8934e29}
 // rN1 is R^-1 where R = 2^256 mod p.
 var rN1 = &gfP{0x0a1c7970e5df544d, 0xe74504e9a96b56cc, 0xcda02d92d4d62924, 0x7d2bc576fdf597d1}
--- a/sm9/bn256/curve.go
+++ b/sm9/bn256/curve.go
@ -162,6 +162,159 @@ func (e *curvePoint) Equal(other *curvePoint) bool {
 // Add sets c = p1 + p2 by delegating to curvePointAddComplete.
 //
 // NOTE: the methods below are still a proof of concept; the line add/double
 // functions are still based on Jacobian coordinates.
 func (c *curvePoint) Add(p1, p2 *curvePoint) {
 	curvePointAddComplete(c, p1, p2)
 }
 // AddComplete sets c = p1 + p2 using the complete addition routine
 // curvePointAddComplete.
 func (c *curvePoint) AddComplete(p1, p2 *curvePoint) {
 	curvePointAddComplete(c, p1, p2)
 }
 // Double sets c = p + p by delegating to curvePointDoubleComplete.
 func (c *curvePoint) Double(p *curvePoint) {
 	curvePointDoubleComplete(c, p)
 }
 // DoubleComplete sets c = p + p using the complete doubling routine
 // curvePointDoubleComplete.
 func (c *curvePoint) DoubleComplete(p *curvePoint) {
 	curvePointDoubleComplete(c, p)
 }
 // MakeAffine rescales c in place so that its Z coordinate becomes one,
 // undoing the projective transform:
 //
 //	A  = 1/Z1
 //	X3 = A*X1
 //	Y3 = A*Y1
 //	Z3 = 1
 //
 // A point whose Z already equals one is left untouched. A point whose Z is
 // zero gets X = 0, Y = 1 and T = 0, and its Z is not modified.
 func (c *curvePoint) MakeAffine() {
 	// TODO: do we need to change it to constant-time implementation?
 	switch {
 	case c.z.Equal(one) == 1:
 		// Already in affine form, nothing to do.
 		return
 	case c.z.Equal(zero) == 1:
 		c.x.Set(zero)
 		c.y.Set(one)
 		c.t.Set(zero)
 		return
 	}
 	// General case: multiply X and Y by 1/Z, then reset Z and T to one.
 	inv := new(gfP)
 	inv.Invert(&c.z)
 	gfpMul(&c.x, &c.x, inv)
 	gfpMul(&c.y, &c.y, inv)
 	c.z.Set(one)
 	c.t.Set(one)
 }
 // AffineFromProjective converts c to affine form in place; it simply calls
 // MakeAffine.
 func (c *curvePoint) AffineFromProjective() {
 	c.MakeAffine()
 }
 // curvePointDouble sets c = 2*a using the Jacobian-coordinate doubling
 // formula "dbl-2009-l" for curves with a = 0:
 //
 //	A = X1², B = Y1², C = B², D = 2*((X1+B)²-A-C), E = 3*A, F = E²
 //	X3 = F-2*D, Y3 = E*(D-X3)-8*C, Z3 = 2*Y1*Z1
 //
 // The trailing 1 passed to gfpSqr is presumably the number of squarings to
 // perform — TODO confirm against gfpSqr's declaration.
 func curvePointDouble(c, a *curvePoint) {
 	// See http://hyperelliptic.org/EFD/g1p/auto-code/shortw/jacobian-0/doubling/dbl-2009-l.op3
 	A, B, C := &gfP{}, &gfP{}, &gfP{}
 	gfpSqr(A, &a.x, 1) // A = X1²
 	gfpSqr(B, &a.y, 1) // B = Y1²
 	gfpSqr(C, B, 1)    // C = B²
 	t := &gfP{}
 	gfpAdd(B, &a.x, B) // B is reused: X1 + B
 	gfpSqr(t, B, 1)    // (X1+B)²
 	gfpSub(B, t, A)    // (X1+B)² - A
 	gfpSub(t, B, C)    // (X1+B)² - A - C
 	d, e := &gfP{}, &gfP{}
 	gfpDouble(d, t) // D = 2*((X1+B)²-A-C)
 	gfpDouble(B, A) // 2*A
 	gfpAdd(e, B, A) // E = 3*A
 	gfpSqr(A, e, 1) // A is reused: F = E²
 	gfpDouble(B, d) // 2*D
 	gfpSub(&c.x, A, B) // X3 = F - 2*D
 	gfpMul(&c.z, &a.y, &a.z) // Y1*Z1
 	gfpDouble(&c.z, &c.z)    // Z3 = 2*Y1*Z1
 	gfpDouble(B, C) // 2*C
 	gfpDouble(t, B) // 4*C
 	gfpDouble(B, t) // 8*C
 	gfpSub(&c.y, d, &c.x) // D - X3
 	gfpMul(t, e, &c.y)    // E*(D-X3)
 	gfpSub(&c.y, t, B)    // Y3 = E*(D-X3) - 8*C
 }
 // curvePointAdd sets c = a + b using the Jacobian-coordinate addition formula
 // "add-2007-bl". It returns h.Equal(zero) & t.Equal(zero), i.e. 1 when both
 // u2-u1 and s2-s1 are zero (the normalized inputs coincide) and 0 otherwise.
 func curvePointAdd(c, a, b *curvePoint) int {
 	// See http://hyperelliptic.org/EFD/g1p/auto-code/shortw/jacobian-0/addition/add-2007-bl.op3
 	var pointEq int
 	// Normalize the points by replacing a = [x1:y1:z1] and b = [x2:y2:z2]
 	// by [u1:s1:z1·z2] and [u2:s2:z1·z2]
 	// where u1 = x1·z2², s1 = y1·z2³ and u2 = x2·z1², s2 = y2·z1³
 	z12, z22 := &gfP{}, &gfP{}
 	gfpSqr(z12, &a.z, 1)
 	gfpSqr(z22, &b.z, 1)
 	u1, u2 := &gfP{}, &gfP{}
 	gfpMul(u1, &a.x, z22)
 	gfpMul(u2, &b.x, z12)
 	t, s1 := &gfP{}, &gfP{}
 	gfpMul(t, &b.z, z22)
 	gfpMul(s1, &a.y, t)
 	s2 := &gfP{}
 	gfpMul(t, &a.z, z12)
 	gfpMul(s2, &b.y, t)
 	// Compute x = (2h)²(s²-u1-u2)
 	// where s = (s2-s1)/(u2-u1) is the slope of the line through
 	// (u1,s1) and (u2,s2). The extra factor 2h = 2(u2-u1) comes from the value of z below.
 	// This is also:
 	// 4(s2-s1)² - 4h²(u1+u2) = 4(s2-s1)² - 4h³ - 4h²(2u1)
 	//                        = r² - j - 2v
 	// with the notations below.
 	h := &gfP{}
 	gfpSub(h, u2, u1)
 	gfpDouble(t, h)
 	// i = 4h²
 	i := &gfP{}
 	gfpSqr(i, t, 1)
 	// j = 4h³
 	j := &gfP{}
 	gfpMul(j, h, i)
 	gfpSub(t, s2, s1)
 	// Both differences are zero exactly when the normalized points coincide.
 	pointEq = h.Equal(zero) & t.Equal(zero)
 	// r = 2(s2-s1)
 	r := &gfP{}
 	gfpDouble(r, t)
 	// v = u1·i
 	v := &gfP{}
 	gfpMul(v, u1, i)
 	// t4 = 4(s2-s1)²
 	t4, t6 := &gfP{}, &gfP{}
 	gfpSqr(t4, r, 1)
 	gfpDouble(t, v)
 	gfpSub(t6, t4, j)
 	gfpSub(&c.x, t6, t)
 	// Set y = -(2h)³(s1 + s*(x/4h²-u1))
 	// This is also
 	// y = - 2·s1·j - (s2-s1)(2x - 2i·u1) = r(v-x) - 2·s1·j
 	gfpSub(t, v, &c.x) // t7
 	gfpMul(t4, s1, j)  // t8
 	gfpDouble(t6, t4)  // t9
 	gfpMul(t4, r, t)   // t10
 	gfpSub(&c.y, t4, t6)
 	// Set z = 2(u2-u1)·z1·z2 = 2h·z1·z2
 	// using 2·z1·z2 = (z1+z2)² - z1² - z2².
 	gfpAdd(t, &a.z, &b.z) // t11
 	gfpSqr(t4, t, 1)      // t12
 	gfpSub(t, t4, z12)    // t13
 	gfpSub(t4, t, z22)    // t14
 	gfpMul(&c.z, t4, h)
 	return pointEq
 }
 func curvePointAddComplete(c, p1, p2 *curvePoint) {
 	// Complete addition formula for a = 0 from "Complete addition formulas for
 	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §3.2.
 	// Algorithm 7: Complete, projective point addition for prime order j-invariant 0 short Weierstrass curves.
@ -205,68 +358,3 @@ func (c *curvePoint) Add(p1, p2 *curvePoint) {
 	c.y.Set(y3)
 	c.z.Set(z3)
 }
 // AddComplete sets c = p1 + p2; it simply forwards to Add.
 func (c *curvePoint) AddComplete(p1, p2 *curvePoint) {
 	c.Add(p1, p2)
 }
 // Double sets c = p + p. The result is built in the temporaries x3, y3, z3
 // and copied into c only at the end.
 func (c *curvePoint) Double(p *curvePoint) {
 	// Complete addition formula for a = 0 from "Complete addition formulas for
 	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §3.2.
 	// Algorithm 9: Exception-free point doubling for prime order j-invariant 0 short Weierstrass curves.
 	t0, t1, t2 := new(gfP), new(gfP), new(gfP)
 	x3, y3, z3 := new(gfP), new(gfP), new(gfP)
 	gfpSqr(t0, &p.y, 1)         // t0 := Y^2
 	gfpDouble(z3, t0)           // Z3 := t0 + t0
 	gfpDouble(z3, z3)           // Z3 := Z3 + Z3
 	gfpDouble(z3, z3)           // Z3 := Z3 + Z3
 	gfpMul(t1, &p.y, &p.z)      // t1 := YZ
 	gfpSqr(t2, &p.z, 1)         // t2 := Z^2
 	gfpMul(t2, threeCurveB, t2) // t2 := 3b * t2 = 3bZ^2
 	gfpMul(x3, t2, z3)          // X3 := t2 * Z3
 	gfpAdd(y3, t0, t2)          // Y3 := t0 + t2
 	gfpMul(z3, t1, z3)          // Z3 := t1 * Z3
 	gfpTriple(t2, t2)           // t2 := t2 + t2 + t2
 	gfpSub(t0, t0, t2)          // t0 := t0 - t2
 	gfpMul(y3, t0, y3)          // Y3 := t0 * Y3
 	gfpAdd(y3, x3, y3)          // Y3 := X3 + Y3
 	gfpMul(t1, &p.x, &p.y)      // t1 := XY
 	gfpMul(x3, t0, t1)          // X3 := t0 * t1
 	gfpDouble(x3, x3)           // X3 := X3 + X3
 	c.x.Set(x3)
 	c.y.Set(y3)
 	c.z.Set(z3)
 }
 // DoubleComplete sets c = p + p; it simply forwards to Double.
 func (c *curvePoint) DoubleComplete(p *curvePoint) {
 	c.Double(p)
 }
 // MakeAffine rescales c in place so that its Z coordinate becomes one,
 // undoing the projective transform:
 //
 //	A  = 1/Z1
 //	X3 = A*X1
 //	Y3 = A*Y1
 //	Z3 = 1
 //
 // A point whose Z already equals one is left untouched. A point whose Z is
 // zero gets X = 0, Y = 1 and T = 0, and its Z is not modified.
 func (c *curvePoint) MakeAffine() {
 	// TODO: do we need to change it to constant-time implementation?
 	switch {
 	case c.z.Equal(one) == 1:
 		// Already in affine form, nothing to do.
 		return
 	case c.z.Equal(zero) == 1:
 		c.x.Set(zero)
 		c.y.Set(one)
 		c.t.Set(zero)
 		return
 	}
 	// General case: multiply X and Y by 1/Z, then reset Z and T to one.
 	inv := new(gfP)
 	inv.Invert(&c.z)
 	gfpMul(&c.x, &c.x, inv)
 	gfpMul(&c.y, &c.y, inv)
 	c.z.Set(one)
 	c.t.Set(one)
 }
 // AffineFromProjective converts c to affine form in place; it simply calls
 // MakeAffine.
 func (c *curvePoint) AffineFromProjective() {
 	c.MakeAffine()
 }
--- a/sm9/bn256/gfp2_g1_amd64.s
+++ b/sm9/bn256/gfp2_g1_amd64.s
@ -1321,19 +1321,19 @@ TEXT ·gfp2SquareU(SB),NOSPLIT,$160-16
 #undef rptr
 /* ---------------------------------------*/
-#define x(off) (32*0 + off)(SP)
+#define xin(off) (32*0 + off)(SP)
-#define y(off) (32*1 + off)(SP)
+#define yin(off) (32*1 + off)(SP)
-#define z(off) (32*2 + off)(SP)
+#define zin(off) (32*2 + off)(SP)
-#define a(off) (32*3 + off)(SP)
+#define xout(off) (32*3 + off)(SP)
-#define b(off) (32*4 + off)(SP)
+#define yout(off) (32*4 + off)(SP)
-#define c(off) (32*5 + off)(SP)
+#define zout(off) (32*5 + off)(SP)
-#define rptr	  (32*6)(SP)
+#define tmp0(off) (32*6 + off)(SP)
 #define tmp2(off) (32*7 + off)(SP)
 #define rptr	  (32*8)(SP)
-// func curvePointDouble(c, a *curvePoint)
+// func curvePointDoubleComplete(c, a *curvePoint)
-TEXT ·curvePointDouble(SB),NOSPLIT,$224-16
+TEXT ·curvePointDoubleComplete(SB),NOSPLIT,$288-16
 	// https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
 	// Move input to stack in order to free registers
 	MOVQ res+0(FP), AX
 	MOVQ in+8(FP), BX
@ -1344,104 +1344,104 @@ TEXT ·curvePointDouble(SB),NOSPLIT,$224-16
 	MOVOU (16*4)(BX), X4
 	MOVOU (16*5)(BX), X5
-	MOVOU X0, x(16*0)
+	MOVOU X0, xin(16*0)
-	MOVOU X1, x(16*1)
+	MOVOU X1, xin(16*1)
-	MOVOU X2, y(16*0)
+	MOVOU X2, yin(16*0)
-	MOVOU X3, y(16*1)
+	MOVOU X3, yin(16*1)
-	MOVOU X4, z(16*0)
+	MOVOU X4, zin(16*0)
-	MOVOU X5, z(16*1)
+	MOVOU X5, zin(16*1)
 	// Store pointer to result
 	MOVQ AX, rptr
-	LDacc (y)
+	LDacc (yin)
-	LDt (z)
+	CALL gfpSqrInternal(SB) // t0 := Y^2
-	CALL gfpMulInternal(SB)
+	ST (tmp0)
-	gfpMulBy2Inline         // Z3 = 2*Y1*Z1
+
 	gfpMulBy2Inline // Z3 := t0 + t0
 	t2acc
 	gfpMulBy2Inline // Z3 := Z3 + Z3
 	t2acc
 	gfpMulBy2Inline // Z3 := Z3 + Z3
 	STt (zout)	
 	LDacc (zin)
 	CALL gfpSqrInternal(SB) // t2 := Z^2
 	ST (tmp2)
 	gfpMulBy2Inline
 	t2acc
 	gfpMulBy2Inline
 	t2acc
 	gfpMulBy2Inline
 	t2acc
 	gfpMulBy2Inline
 	t2acc
 	LDt (tmp2)
 	CALL gfpSubInternal(SB)  // t2 := 3b * t2
 	ST (tmp2)
 	LDt (zout)
 	CALL gfpMulInternal(SB) // X3 := Z3 * t2
 	ST (xout)
 	LDacc (tmp0)
 	LDt (tmp2)
 	gfpAddInline // Y3 := t0 + t2
 	STt (yout)
 	LDacc (yin)
 	LDt (zin)
 	CALL gfpMulInternal(SB) // t1 := YZ
 	LDt (zout)
 	CALL gfpMulInternal(SB) // Z3 := t1 * Z3
 	MOVQ rptr, AX
 	// Store Z
-	MOVQ t0, (16*4 + 8*0)(AX)
+	MOVQ acc4, (16*4 + 8*0)(AX)
-	MOVQ t1, (16*4 + 8*1)(AX)
+	MOVQ acc5, (16*4 + 8*1)(AX)
-	MOVQ t2, (16*4 + 8*2)(AX)
+	MOVQ acc6, (16*4 + 8*2)(AX)
-	MOVQ t3, (16*4 + 8*3)(AX)	
+	MOVQ acc7, (16*4 + 8*3)(AX)	
-	LDacc (x)
+	LDacc (tmp2) 
 	CALL gfpSqrInternal(SB) // A = X1^2
 	ST (a)
 	LDacc (y)
 	CALL gfpSqrInternal(SB) // B = Y1^2
 	ST (b)
 	CALL gfpSqrInternal(SB) // C = B^2
 	ST (c)
 	LDacc (x)
 	LDt (b)
 	gfpAddInline            // X1+B
 	t2acc
 	CALL gfpSqrInternal(SB) // (X1+B)^2
 	LDt (a)
 	CALL gfpSubInternal(SB)
 	LDt (c)
 	CALL gfpSubInternal(SB)
 	gfpMulBy2Inline         //  B = D = 2*((X1+B)^2-A-C)
 	STt (b)                 // Store D
 	LDacc (a)
 	gfpMulBy2Inline
-	LDacc (a)
+	LDacc (tmp2)
-	gfpAddInline            // A = E = 3*A
+	gfpAddInline // t2 := t2 + t2 + t2
-	STt (a)                 // Store E
+	LDacc (tmp0)
-	t2acc
+	CALL gfpSubInternal(SB) // t0 := t0 - t2
-	CALL gfpSqrInternal(SB) // F = E^2
+	ST (tmp0)
-
+	LDt (yout)
-	LDt (b)                 // Load D
+	CALL gfpMulInternal(SB) // Y3 = t0 * Y3
-	CALL gfpSubInternal(SB)
+	LDt (xout)
-	LDt (b)                 // Load D
+	gfpAddInline // Y3 := X3 + Y3
 	CALL gfpSubInternal(SB) // X3 = F-2*D
 	ST (x)
 	MOVQ rptr, AX
 	// Store x
 	MOVQ acc4, (16*0 + 8*0)(AX)
 	MOVQ acc5, (16*0 + 8*1)(AX)
 	MOVQ acc6, (16*0 + 8*2)(AX)
 	MOVQ acc7, (16*0 + 8*3)(AX)
 	LDacc (c)
 	gfpMulBy2Inline
 	t2acc
 	gfpMulBy2Inline
 	t2acc
 	gfpMulBy2Inline // 8*C
 	STt (c)
 	LDacc (b)               // Load D
 	LDt (x)
 	CALL gfpSubInternal(SB) // (D-X3)
 	LDt (a)                 // Load E
 	CALL gfpMulInternal(SB) // E*(D-X3)
 	LDt (c)
 	CALL gfpSubInternal(SB) // Y3 = E*(D-X3)-8*C
 	MOVQ rptr, AX
 	///////////////////////
 	MOVQ $0, rptr	
 	// Store y
-	MOVQ acc4, (16*2 + 8*0)(AX)
+	MOVQ t0, (16*2 + 8*0)(AX)
-	MOVQ acc5, (16*2 + 8*1)(AX)
+	MOVQ t1, (16*2 + 8*1)(AX)
-	MOVQ acc6, (16*2 + 8*2)(AX)
+	MOVQ t2, (16*2 + 8*2)(AX)
-	MOVQ acc7, (16*2 + 8*3)(AX)
+	MOVQ t3, (16*2 + 8*3)(AX)
 	LDacc (xin)
 	LDt (yin)
 	CALL gfpMulInternal(SB) // t1 := XY
 	LDt (tmp0)
 	CALL gfpMulInternal(SB) // X3 := t0 * t1
 	gfpMulBy2Inline         // X3 := X3 + X3
 	MOVQ rptr, AX
 	MOVQ $0, rptr
 	// Store x
 	MOVQ t0, (16*0 + 8*0)(AX)
 	MOVQ t1, (16*0 + 8*1)(AX)
 	MOVQ t2, (16*0 + 8*2)(AX)
 	MOVQ t3, (16*0 + 8*3)(AX)
 	RET
-#undef x
+#undef xin
-#undef y
+#undef yin
-#undef z
+#undef zin
-#undef a
+#undef xout
-#undef b
+#undef yout
-#undef c
+#undef zout
 #undef tmp0
 #undef tmp2
 #undef rptr
 // gfpIsZero returns 1 in AX if [acc4..acc7] represents zero and zero
@ -1475,6 +1475,7 @@ TEXT gfpIsZero(SB),NOSPLIT,$0
 	RET
 /* ---------------------------------------*/
 /*
 #define x1in(off) (32*0 + off)(SP)
 #define y1in(off) (32*1 + off)(SP)
 #define z1in(off) (32*2 + off)(SP)
@ -1651,3 +1652,4 @@ TEXT ·curvePointAdd(SB),0,$680-32
 	MOVQ AX, ret+24(FP)
 	RET
 */
--- a/sm9/bn256/gfp2_g1_decl.go
+++ b/sm9/bn256/gfp2_g1_decl.go
@ -26,11 +26,10 @@ func gfp2SquareU(c, a *gfP2)
 // Point doubling. Sets res = in + in. in can be the point at infinity.
 //
 //go:noescape
-func curvePointDouble(c, a *curvePoint)
+func curvePointDoubleComplete(c, a *curvePoint)
-
+/*
-// Point addition. Sets res = in1 + in2. Returns one if the two input points
+// Point addition. Sets res = in1 + in2. in1 can be same as in2, also can be at infinity.
 // were equal and zero otherwise. If in1 or in2 are the point at infinity, res
 // and the return value are undefined.
 //
 //go:noescape
-func curvePointAdd(c, a, b *curvePoint) int
+func curvePointAddComplete(c, a, b *curvePoint)
 */
--- a/sm9/bn256/gfp2_g1_generic.go
+++ b/sm9/bn256/gfp2_g1_generic.go
@ -82,112 +82,32 @@ func gfp2SquareU(c, a *gfP2) {
 	gfp2Copy(c, tmp)
 }
-func curvePointDouble(c, a *curvePoint) {
+func curvePointDoubleComplete(c, p *curvePoint) {
-	// See http://hyperelliptic.org/EFD/g1p/auto-code/shortw/jacobian-0/doubling/dbl-2009-l.op3
+	// Complete addition formula for a = 0 from "Complete addition formulas for
-	A, B, C := &gfP{}, &gfP{}, &gfP{}
+	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §3.2.
-	gfpSqr(A, &a.x, 1)
+	// Algorithm 9: Exception-free point doubling for prime order j-invariant 0 short Weierstrass curves.
-	gfpSqr(B, &a.y, 1)
+	t0, t1, t2 := new(gfP), new(gfP), new(gfP)
-	gfpSqr(C, B, 1)
+	x3, y3, z3 := new(gfP), new(gfP), new(gfP)
-	t := &gfP{}
+	gfpSqr(t0, &p.y, 1)         // t0 := Y^2
-	gfpAdd(B, &a.x, B)
+	gfpDouble(z3, t0)           // Z3 := t0 + t0
-	gfpSqr(t, B, 1)
+	gfpDouble(z3, z3)           // Z3 := Z3 + Z3
-	gfpSub(B, t, A)
+	gfpDouble(z3, z3)           // Z3 := Z3 + Z3
-	gfpSub(t, B, C)
+	gfpMul(t1, &p.y, &p.z)      // t1 := YZ
 	gfpSqr(t2, &p.z, 1)         // t2 := Z^2
 	gfpMul(t2, threeCurveB, t2) // t2 := 3b * t2 = 3bZ^2
 	gfpMul(x3, t2, z3)          // X3 := t2 * Z3
 	gfpAdd(y3, t0, t2)          // Y3 := t0 + t2
 	gfpMul(z3, t1, z3)          // Z3 := t1 * Z3
 	gfpTriple(t2, t2)           // t2 := t2 + t2 + t2
 	gfpSub(t0, t0, t2)          // t0 := t0 - t2
 	gfpMul(y3, t0, y3)          // Y3 := t0 * Y3
 	gfpAdd(y3, x3, y3)          // Y3 := X3 + Y3
 	gfpMul(t1, &p.x, &p.y)      // t1 := XY
 	gfpMul(x3, t0, t1)          // X3 := t0 * t1
 	gfpDouble(x3, x3)           // X3 := X3 + X3
-	d, e := &gfP{}, &gfP{}
+	c.x.Set(x3)
-	gfpDouble(d, t)
+	c.y.Set(y3)
-	gfpDouble(B, A)
+	c.z.Set(z3)
 	gfpAdd(e, B, A)
 	gfpSqr(A, e, 1)
 	gfpDouble(B, d)
 	gfpSub(&c.x, A, B)
 	gfpMul(&c.z, &a.y, &a.z)
 	gfpDouble(&c.z, &c.z)
 	gfpDouble(B, C)
 	gfpDouble(t, B)
 	gfpDouble(B, t)
 	gfpSub(&c.y, d, &c.x)
 	gfpMul(t, e, &c.y)
 	gfpSub(&c.y, t, B)	
 }
 // curvePointAdd sets c = a + b using the Jacobian-coordinate addition formula
 // "add-2007-bl". It returns h.Equal(zero) & t.Equal(zero), i.e. 1 when both
 // u2-u1 and s2-s1 are zero (the normalized inputs coincide) and 0 otherwise.
 func curvePointAdd(c, a, b *curvePoint) int {
 	// See http://hyperelliptic.org/EFD/g1p/auto-code/shortw/jacobian-0/addition/add-2007-bl.op3
 	var pointEq int
 	// Normalize the points by replacing a = [x1:y1:z1] and b = [x2:y2:z2]
 	// by [u1:s1:z1·z2] and [u2:s2:z1·z2]
 	// where u1 = x1·z2², s1 = y1·z2³ and u2 = x2·z1², s2 = y2·z1³
 	z12, z22 := &gfP{}, &gfP{}
 	gfpSqr(z12, &a.z, 1)
 	gfpSqr(z22, &b.z, 1)
 	u1, u2 := &gfP{}, &gfP{}
 	gfpMul(u1, &a.x, z22)
 	gfpMul(u2, &b.x, z12)
 	t, s1 := &gfP{}, &gfP{}
 	gfpMul(t, &b.z, z22)
 	gfpMul(s1, &a.y, t)
 	s2 := &gfP{}
 	gfpMul(t, &a.z, z12)
 	gfpMul(s2, &b.y, t)
 	// Compute x = (2h)²(s²-u1-u2)
 	// where s = (s2-s1)/(u2-u1) is the slope of the line through
 	// (u1,s1) and (u2,s2). The extra factor 2h = 2(u2-u1) comes from the value of z below.
 	// This is also:
 	// 4(s2-s1)² - 4h²(u1+u2) = 4(s2-s1)² - 4h³ - 4h²(2u1)
 	//                        = r² - j - 2v
 	// with the notations below.
 	h := &gfP{}
 	gfpSub(h, u2, u1)
 	gfpDouble(t, h)
 	// i = 4h²
 	i := &gfP{}
 	gfpSqr(i, t, 1)
 	// j = 4h³
 	j := &gfP{}
 	gfpMul(j, h, i)
 	gfpSub(t, s2, s1)
 	// Both differences are zero exactly when the normalized points coincide.
 	pointEq = h.Equal(zero) & t.Equal(zero) 
 	// r = 2(s2-s1)
 	r := &gfP{}
 	gfpDouble(r, t)
 	// v = u1·i
 	v := &gfP{}
 	gfpMul(v, u1, i)
 	// t4 = 4(s2-s1)²
 	t4, t6 := &gfP{}, &gfP{}
 	gfpSqr(t4, r, 1)
 	gfpDouble(t, v)
 	gfpSub(t6, t4, j)
 	gfpSub(&c.x, t6, t)
 	// Set y = -(2h)³(s1 + s*(x/4h²-u1))
 	// This is also
 	// y = - 2·s1·j - (s2-s1)(2x - 2i·u1) = r(v-x) - 2·s1·j
 	gfpSub(t, v, &c.x) // t7
 	gfpMul(t4, s1, j)  // t8
 	gfpDouble(t6, t4)  // t9
 	gfpMul(t4, r, t)   // t10
 	gfpSub(&c.y, t4, t6)
 	// Set z = 2(u2-u1)·z1·z2 = 2h·z1·z2
 	// using 2·z1·z2 = (z1+z2)² - z1² - z2².
 	gfpAdd(t, &a.z, &b.z) // t11
 	gfpSqr(t4, t, 1)      // t12
 	gfpSub(t, t4, z12)    // t13
 	gfpSub(t4, t, z22)    // t14
 	gfpMul(&c.z, t4, h)
 	return pointEq
 }
--- a/sm9/bn256/twist.go
+++ b/sm9/bn256/twist.go
@ -149,14 +149,14 @@ func (c *twistPoint) Double(p *twistPoint) {
 	z3.Double(z3)           // Z3 := Z3 + Z3
 	z3.Double(z3)           // Z3 := Z3 + Z3
 	t1.Mul(&p.y, &p.z)      // t1 := YZ
-	t2.Square(&p.z)         // t0 := Z^2
+	t2.Square(&p.z)         // t2 := Z^2
 	t2.Mul(threeTwistB, t2) // t2 := 3b * t2 = 3bZ^2
 	x3.Mul(t2, z3)          // X3 := t2 * Z3
 	y3.Add(t0, t2)          // Y3 := t0 + t2
 	z3.Mul(t1, z3)          // Z3 := t1 * Z3
 	t2.Triple(t2)           // t2 := t2 + t2 + t2
 	t0.Sub(t0, t2)          // t0 := t0 - t2
-	y3.Mul(t0, y3)          // t0 := t0 * Y3
+	y3.Mul(t0, y3)          // Y3 := t0 * Y3
 	y3.Add(x3, y3)          // Y3 := X3 + Y3
 	t1.Mul(&p.x, &p.y)      // t1 := XY
 	x3.Mul(t0, t1)          // X3 := t0 * t1