diff --git a/sm9/bn256/constants.go b/sm9/bn256/constants.go
index cef6ebb..5d8dee4 100644
--- a/sm9/bn256/constants.go
+++ b/sm9/bn256/constants.go
@@ -26,6 +26,9 @@ var p2 = [4]uint64{0xe56f9b27e351457d, 0x21f2934b1a7aeedb, 0xd603ab4ff58ec745, 0
 // np is the negative inverse of p, mod 2^256.
 var np = [4]uint64{0x892bc42c2f2ee42b, 0x181ae39613c8dbaf, 0x966a4b291522b137, 0xafd2bac5558a13b3}
 
+// b3 is 3*b = 15, three times the curve coefficient b. NOTE(review): presumably stored in Montgomery form, since the limbs are not the literal 15 — confirm.
+var b3 = [4]uint64{0x2dd845ba5a554cbf, 0x3719ead6d3ea67f6, 0x71b2f270db49a754, 0x0cbfffffc8934e29}
+
 // rN1 is R^-1 where R = 2^256 mod p.
 var rN1 = &gfP{0x0a1c7970e5df544d, 0xe74504e9a96b56cc, 0xcda02d92d4d62924, 0x7d2bc576fdf597d1}
 
diff --git a/sm9/bn256/curve.go b/sm9/bn256/curve.go
index aa46f4f..c5b1dfe 100644
--- a/sm9/bn256/curve.go
+++ b/sm9/bn256/curve.go
@@ -162,6 +162,159 @@ func (e *curvePoint) Equal(other *curvePoint) bool {
 // Below methods are POC yet, the line add/double functions are still based on
 // Jacobian coordination.
 func (c *curvePoint) Add(p1, p2 *curvePoint) {
+	curvePointAddComplete(c, p1, p2)
+}
+
+func (c *curvePoint) AddComplete(p1, p2 *curvePoint) {
+	curvePointAddComplete(c, p1, p2) // same routine that Add delegates to, so both methods behave identically
+}
+
+func (c *curvePoint) Double(p *curvePoint) {
+	curvePointDoubleComplete(c, p) // c = p + p via the exception-free projective doubling routine
+}
+
+func (c *curvePoint) DoubleComplete(p *curvePoint) {
+	curvePointDoubleComplete(c, p) // same routine that Double delegates to, so both methods behave identically
+}
+
+// MakeAffine normalizes c in place so that z = 1, reversing the projective scaling.
+// A = 1/Z1
+// X3 = A*X1
+// Y3 = A*Y1
+// Z3 = 1
+func (c *curvePoint) MakeAffine() {
+	// TODO: the early returns below branch on the value of z; decide whether a constant-time implementation is needed.
+	if c.z.Equal(one) == 1 { // z already equals one: nothing to do
+		return
+	} else if c.z.Equal(zero) == 1 { // z == 0 (presumably the point at infinity): store x = 0, y = 1, t = 0; z is left at 0
+		c.x.Set(zero)
+		c.y.Set(one)
+		c.t.Set(zero)
+		return
+	}
+	zInv := &gfP{}
+	zInv.Invert(&c.z)        // zInv = 1/z
+	gfpMul(&c.x, &c.x, zInv) // x = x/z
+	gfpMul(&c.y, &c.y, zInv) // y = y/z
+	c.z.Set(one)
+	c.t.Set(one)
+}
+
+func (c *curvePoint) AffineFromProjective() {
+	c.MakeAffine() // alias: performs the same in-place normalization as MakeAffine
+}
+
+func curvePointDouble(c, a *curvePoint) {
+	// Sets c = 2a using the Jacobian formula dbl-2009-l: http://hyperelliptic.org/EFD/g1p/auto-code/shortw/jacobian-0/doubling/dbl-2009-l.op3
+	A, B, C := &gfP{}, &gfP{}, &gfP{}
+	gfpSqr(A, &a.x, 1) // A = X1^2
+	gfpSqr(B, &a.y, 1) // B = Y1^2
+	gfpSqr(C, B, 1)    // C = B^2
+
+	t := &gfP{}
+	gfpAdd(B, &a.x, B) // B = X1 + B
+	gfpSqr(t, B, 1)    // t = (X1+B)^2
+	gfpSub(B, t, A)    // B = (X1+B)^2 - A
+	gfpSub(t, B, C)    // t = (X1+B)^2 - A - C
+
+	d, e := &gfP{}, &gfP{}
+	gfpDouble(d, t) // d = D = 2*((X1+B)^2 - A - C)
+	gfpDouble(B, A) // B = 2*A
+	gfpAdd(e, B, A) // e = E = 3*A
+	gfpSqr(A, e, 1) // A = F = E^2
+
+	gfpDouble(B, d)    // B = 2*D
+	gfpSub(&c.x, A, B) // X3 = F - 2*D
+
+	gfpMul(&c.z, &a.y, &a.z) // Y1*Z1
+	gfpDouble(&c.z, &c.z)    // Z3 = 2*Y1*Z1
+
+	gfpDouble(B, C)       // 2*C
+	gfpDouble(t, B)       // 4*C
+	gfpDouble(B, t)       // B = 8*C
+	gfpSub(&c.y, d, &c.x) // D - X3
+	gfpMul(t, e, &c.y)    // E*(D - X3)
+	gfpSub(&c.y, t, B)    // Y3 = E*(D - X3) - 8*C
+}
+
+func curvePointAdd(c, a, b *curvePoint) int {
+	// Sets c = a + b using the Jacobian formula add-2007-bl: http://hyperelliptic.org/EFD/g1p/auto-code/shortw/jacobian-0/addition/add-2007-bl.op3
+	var pointEq int // returned flag: 1 when h and s2-s1 are both zero (inputs equal), 0 otherwise
+	// Normalize the points by replacing a = [x1:y1:z1] and b = [x2:y2:z2]
+	// by [u1:s1:z1·z2] and [u2:s2:z1·z2]
+	// where u1 = x1·z2², s1 = y1·z2³ and u2 = x2·z1², s2 = y2·z1³
+	z12, z22 := &gfP{}, &gfP{}
+	gfpSqr(z12, &a.z, 1) // z1²
+	gfpSqr(z22, &b.z, 1) // z2²
+
+	u1, u2 := &gfP{}, &gfP{}
+	gfpMul(u1, &a.x, z22) // u1 = x1·z2²
+	gfpMul(u2, &b.x, z12) // u2 = x2·z1²
+
+	t, s1 := &gfP{}, &gfP{}
+	gfpMul(t, &b.z, z22) // z2³
+	gfpMul(s1, &a.y, t)  // s1 = y1·z2³
+
+	s2 := &gfP{}
+	gfpMul(t, &a.z, z12) // z1³
+	gfpMul(s2, &b.y, t)  // s2 = y2·z1³
+
+	// Compute x = (2h)²(s²-u1-u2)
+	// where s = (s2-s1)/(u2-u1) is the slope of the line through
+	// (u1,s1) and (u2,s2). The extra factor 2h = 2(u2-u1) comes from the value of z below.
+	// This is also:
+	// 4(s2-s1)² - 4h²(u1+u2) = 4(s2-s1)² - 4h³ - 4h²(2u1)
+	//                        = r² - j - 2v
+	// with the notations below.
+	h := &gfP{}
+	gfpSub(h, u2, u1) // h = u2 - u1
+
+	gfpDouble(t, h) // t = 2h
+	// i = 4h²
+	i := &gfP{}
+	gfpSqr(i, t, 1)
+	// j = 4h³
+	j := &gfP{}
+	gfpMul(j, h, i)
+
+	gfpSub(t, s2, s1) // t = s2 - s1
+
+	pointEq = h.Equal(zero) & t.Equal(zero) // u1 == u2 and s1 == s2; in that case h = 0 makes the z computed below 0
+
+	r := &gfP{}
+	gfpDouble(r, t) // r = 2(s2 - s1)
+
+	v := &gfP{}
+	gfpMul(v, u1, i) // v = u1·i = 4h²·u1
+
+	// t4 = 4(s2-s1)²
+	t4, t6 := &gfP{}, &gfP{}
+	gfpSqr(t4, r, 1)  // r²
+	gfpDouble(t, v)   // 2v
+	gfpSub(t6, t4, j) // r² - j
+
+	gfpSub(&c.x, t6, t) // x3 = r² - j - 2v
+
+	// Set y = -(2h)³(s1 + s*(x/4h²-u1))
+	// This is also
+	// y = - 2·s1·j - (s2-s1)(2x - 2i·u1) = r(v-x) - 2·s1·j
+	gfpSub(t, v, &c.x) // t7
+	gfpMul(t4, s1, j)  // t8
+	gfpDouble(t6, t4)  // t9
+	gfpMul(t4, r, t)   // t10
+	gfpSub(&c.y, t4, t6)
+
+	// Set z = 2(u2-u1)·z1·z2 = 2h·z1·z2
+	gfpAdd(t, &a.z, &b.z) // t11
+	gfpSqr(t4, t, 1)      // t12
+	gfpSub(t, t4, z12)    // t13
+	gfpSub(t4, t, z22)    // t14
+	gfpMul(&c.z, t4, h)   // z3 = ((z1+z2)² - z1² - z2²)·h = 2·z1·z2·h
+
+	return pointEq
+}
+
+func curvePointAddComplete(c, p1, p2 *curvePoint) {
 	// Complete addition formula for a = 0 from "Complete addition formulas for
 	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §3.2.
 	// Algorithm 7: Complete, projective point addition for prime order j-invariant 0 short Weierstrass curves.
@@ -205,68 +358,3 @@ func (c *curvePoint) Add(p1, p2 *curvePoint) {
 	c.y.Set(y3)
 	c.z.Set(z3)
 }
-
-func (c *curvePoint) AddComplete(p1, p2 *curvePoint) {
-	c.Add(p1, p2)
-}
-
-func (c *curvePoint) Double(p *curvePoint) {
-	// Complete addition formula for a = 0 from "Complete addition formulas for
-	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §3.2.
-	// Algorithm 9: Exception-free point doubling for prime order j-invariant 0 short Weierstrass curves.
-	t0, t1, t2 := new(gfP), new(gfP), new(gfP)
-	x3, y3, z3 := new(gfP), new(gfP), new(gfP)
-
-	gfpSqr(t0, &p.y, 1)         // t0 := Y^2
-	gfpDouble(z3, t0)           // Z3 := t0 + t0
-	gfpDouble(z3, z3)           // Z3 := Z3 + Z3
-	gfpDouble(z3, z3)           // Z3 := Z3 + Z3
-	gfpMul(t1, &p.y, &p.z)      // t1 := YZ
-	gfpSqr(t2, &p.z, 1)         // t0 := Z^2
-	gfpMul(t2, threeCurveB, t2) // t2 := 3b * t2 = 3bZ^2
-	gfpMul(x3, t2, z3)          // X3 := t2 * Z3
-	gfpAdd(y3, t0, t2)          // Y3 := t0 + t2
-	gfpMul(z3, t1, z3)          // Z3 := t1 * Z3
-	gfpTriple(t2, t2)           // t2 := t2 + t2 + t2
-	gfpSub(t0, t0, t2)          // t0 := t0 - t2
-	gfpMul(y3, t0, y3)          // t0 := t0 * Y3
-	gfpAdd(y3, x3, y3)          // Y3 := X3 + Y3
-	gfpMul(t1, &p.x, &p.y)      // t1 := XY
-	gfpMul(x3, t0, t1)          // X3 := t0 * t1
-	gfpDouble(x3, x3)           // X3 := X3 + X3
-
-	c.x.Set(x3)
-	c.y.Set(y3)
-	c.z.Set(z3)
-}
-
-func (c *curvePoint) DoubleComplete(p *curvePoint) {
-	c.Double(p)
-}
-
-// MakeAffine reverses the Projective transform.
-// A = 1/Z1
-// X3 = A*X1
-// Y3 = A*Y1
-// Z3 = 1
-func (c *curvePoint) MakeAffine() {
-	// TODO: do we need to change it to constant-time implementation?
-	if c.z.Equal(one) == 1 {
-		return
-	} else if c.z.Equal(zero) == 1 {
-		c.x.Set(zero)
-		c.y.Set(one)
-		c.t.Set(zero)
-		return
-	}
-	zInv := &gfP{}
-	zInv.Invert(&c.z)
-	gfpMul(&c.x, &c.x, zInv)
-	gfpMul(&c.y, &c.y, zInv)
-	c.z.Set(one)
-	c.t.Set(one)
-}
-
-func (c *curvePoint) AffineFromProjective() {
-	c.MakeAffine()
-}
diff --git a/sm9/bn256/gfp2_g1_amd64.s b/sm9/bn256/gfp2_g1_amd64.s
index d8a8f03..74311cf 100644
--- a/sm9/bn256/gfp2_g1_amd64.s
+++ b/sm9/bn256/gfp2_g1_amd64.s
@@ -1321,19 +1321,19 @@ TEXT ·gfp2SquareU(SB),NOSPLIT,$160-16
 #undef rptr
 
 /* ---------------------------------------*/
-#define x(off) (32*0 + off)(SP)
-#define y(off) (32*1 + off)(SP)
-#define z(off) (32*2 + off)(SP)
+#define xin(off) (32*0 + off)(SP)
+#define yin(off) (32*1 + off)(SP)
+#define zin(off) (32*2 + off)(SP)
 
-#define a(off) (32*3 + off)(SP)
-#define b(off) (32*4 + off)(SP)
-#define c(off) (32*5 + off)(SP)
-#define rptr	  (32*6)(SP)
+#define xout(off) (32*3 + off)(SP)
+#define yout(off) (32*4 + off)(SP)
+#define zout(off) (32*5 + off)(SP)
+#define tmp0(off) (32*6 + off)(SP)
+#define tmp2(off) (32*7 + off)(SP)
+#define rptr	  (32*8)(SP)
 
-// func curvePointDouble(c, a *curvePoint)
-TEXT ·curvePointDouble(SB),NOSPLIT,$224-16
-	// https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
-	// Move input to stack in order to free registers
+// func curvePointDoubleComplete(c, a *curvePoint)
+TEXT ·curvePointDoubleComplete(SB),NOSPLIT,$288-16
 	MOVQ res+0(FP), AX
 	MOVQ in+8(FP), BX
 
@@ -1344,104 +1344,104 @@ TEXT ·curvePointDouble(SB),NOSPLIT,$224-16
 	MOVOU (16*4)(BX), X4
 	MOVOU (16*5)(BX), X5
 	
-	MOVOU X0, x(16*0)
-	MOVOU X1, x(16*1)
-	MOVOU X2, y(16*0)
-	MOVOU X3, y(16*1)
-	MOVOU X4, z(16*0)
-	MOVOU X5, z(16*1)
+	MOVOU X0, xin(16*0)
+	MOVOU X1, xin(16*1)
+	MOVOU X2, yin(16*0)
+	MOVOU X3, yin(16*1)
+	MOVOU X4, zin(16*0)
+	MOVOU X5, zin(16*1)
 
 	// Store pointer to result
 	MOVQ AX, rptr
 
-	LDacc (y)
-	LDt (z)
-	CALL gfpMulInternal(SB)
-	gfpMulBy2Inline         // Z3 = 2*Y1*Z1
+	LDacc (yin)
+	CALL gfpSqrInternal(SB) // t0 := Y^2
+	ST (tmp0) // keep t0; it is reloaded for Y3 := t0 + t2 and t0 := t0 - t2 below
+
+	gfpMulBy2Inline // Z3 := t0 + t0
+	t2acc
+	gfpMulBy2Inline // Z3 := Z3 + Z3
+	t2acc
+	gfpMulBy2Inline // Z3 := Z3 + Z3
+	STt (zout)	// zout now holds 8*Y^2
+
+	LDacc (zin)
+	CALL gfpSqrInternal(SB) // t2 := Z^2
+	ST (tmp2)
+	gfpMulBy2Inline // 2*Z^2
+	t2acc
+	gfpMulBy2Inline // 4*Z^2
+	t2acc
+	gfpMulBy2Inline // 8*Z^2
+	t2acc
+	gfpMulBy2Inline // 16*Z^2
+	t2acc
+	LDt (tmp2)
+	CALL gfpSubInternal(SB)  // t2 := 16*t2 - t2 = 15*t2 = 3b * t2 (3b = 15, so no field multiplication is needed)
+	ST (tmp2)
+	LDt (zout)
+	CALL gfpMulInternal(SB) // X3 := Z3 * t2
+	ST (xout)
+
+	LDacc (tmp0)
+	LDt (tmp2)
+	gfpAddInline // Y3 := t0 + t2
+	STt (yout)
+
+	LDacc (yin)
+	LDt (zin)
+	CALL gfpMulInternal(SB) // t1 := YZ
+	LDt (zout)
+	CALL gfpMulInternal(SB) // Z3 := t1 * Z3
 	MOVQ rptr, AX
 	// Store Z
-	MOVQ t0, (16*4 + 8*0)(AX)
-	MOVQ t1, (16*4 + 8*1)(AX)
-	MOVQ t2, (16*4 + 8*2)(AX)
-	MOVQ t3, (16*4 + 8*3)(AX)	
+	MOVQ acc4, (16*4 + 8*0)(AX)
+	MOVQ acc5, (16*4 + 8*1)(AX)
+	MOVQ acc6, (16*4 + 8*2)(AX)
+	MOVQ acc7, (16*4 + 8*3)(AX)	
 
-	LDacc (x)
-	CALL gfpSqrInternal(SB) // A = X1^2
-	ST (a)
-
-	LDacc (y)
-	CALL gfpSqrInternal(SB) // B = Y1^2
-	ST (b)
-	CALL gfpSqrInternal(SB) // C = B^2
-	ST (c)
-
-	LDacc (x)
-	LDt (b)
-	gfpAddInline            // X1+B
-	t2acc
-	CALL gfpSqrInternal(SB) // (X1+B)^2
-	LDt (a)
-	CALL gfpSubInternal(SB)
-	LDt (c)
-	CALL gfpSubInternal(SB)
-	gfpMulBy2Inline         //  B = D = 2*((X1+B)^2-A-C)
-	STt (b)                 // Store D
-
-	LDacc (a)
+	LDacc (tmp2) 
 	gfpMulBy2Inline
-	LDacc (a)
-	gfpAddInline            // A = E = 3*A
-	STt (a)                 // Store E
-	t2acc
-	CALL gfpSqrInternal(SB) // F = E^2
-
-	LDt (b)                 // Load D
-	CALL gfpSubInternal(SB)
-	LDt (b)                 // Load D
-	CALL gfpSubInternal(SB) // X3 = F-2*D
-
-	ST (x)
-
+	LDacc (tmp2)
+	gfpAddInline // t2 := t2 + t2 + t2
+	LDacc (tmp0)
+	CALL gfpSubInternal(SB) // t0 := t0 - t2
+	ST (tmp0)
+	LDt (yout)
+	CALL gfpMulInternal(SB) // Y3 = t0 * Y3
+	LDt (xout)
+	gfpAddInline // Y3 := X3 + Y3
 	MOVQ rptr, AX
-	// Store x
-	MOVQ acc4, (16*0 + 8*0)(AX)
-	MOVQ acc5, (16*0 + 8*1)(AX)
-	MOVQ acc6, (16*0 + 8*2)(AX)
-	MOVQ acc7, (16*0 + 8*3)(AX)
-
-	LDacc (c)
-	gfpMulBy2Inline
-	t2acc
-	gfpMulBy2Inline
-	t2acc
-	gfpMulBy2Inline // 8*C
-	STt (c)
-
-	LDacc (b)               // Load D
-	LDt (x)
-	CALL gfpSubInternal(SB) // (D-X3)
-	LDt (a)                 // Load E
-	CALL gfpMulInternal(SB) // E*(D-X3)
-	LDt (c)
-	CALL gfpSubInternal(SB) // Y3 = E*(D-X3)-8*C
-
-	MOVQ rptr, AX
-	///////////////////////
-	MOVQ $0, rptr	
 	// Store y
-	MOVQ acc4, (16*2 + 8*0)(AX)
-	MOVQ acc5, (16*2 + 8*1)(AX)
-	MOVQ acc6, (16*2 + 8*2)(AX)
-	MOVQ acc7, (16*2 + 8*3)(AX)
+	MOVQ t0, (16*2 + 8*0)(AX)
+	MOVQ t1, (16*2 + 8*1)(AX)
+	MOVQ t2, (16*2 + 8*2)(AX)
+	MOVQ t3, (16*2 + 8*3)(AX)
+
+	LDacc (xin)
+	LDt (yin)
+	CALL gfpMulInternal(SB) // t1 := XY
+	LDt (tmp0)
+	CALL gfpMulInternal(SB) // X3 := t0 * t1
+	gfpMulBy2Inline         // X3 := X3 + X3
+	MOVQ rptr, AX
+	MOVQ $0, rptr
+	// Store x
+	MOVQ t0, (16*0 + 8*0)(AX)
+	MOVQ t1, (16*0 + 8*1)(AX)
+	MOVQ t2, (16*0 + 8*2)(AX)
+	MOVQ t3, (16*0 + 8*3)(AX)
 
 	RET
 
-#undef x
-#undef y
-#undef z
-#undef a
-#undef b
-#undef c
+#undef xin
+#undef yin
+#undef zin
+#undef xout
+#undef yout
+#undef zout
+#undef tmp0
+#undef tmp2
 #undef rptr
 
 // gfpIsZero returns 1 in AX if [acc4..acc7] represents zero and zero
@@ -1475,6 +1475,7 @@ TEXT gfpIsZero(SB),NOSPLIT,$0
 	RET
 
 /* ---------------------------------------*/
+/*
 #define x1in(off) (32*0 + off)(SP)
 #define y1in(off) (32*1 + off)(SP)
 #define z1in(off) (32*2 + off)(SP)
@@ -1651,3 +1652,4 @@ TEXT ·curvePointAdd(SB),0,$680-32
 	MOVQ AX, ret+24(FP)
 
 	RET
+*/
diff --git a/sm9/bn256/gfp2_g1_decl.go b/sm9/bn256/gfp2_g1_decl.go
index ea2614c..f4b8fbb 100644
--- a/sm9/bn256/gfp2_g1_decl.go
+++ b/sm9/bn256/gfp2_g1_decl.go
@@ -26,11 +26,10 @@ func gfp2SquareU(c, a *gfP2)
 // Point doubling. Sets res = in + in. in can be the point at infinity.
 //
 //go:noescape
-func curvePointDouble(c, a *curvePoint)
-
-// Point addition. Sets res = in1 + in2. Returns one if the two input points
-// were equal and zero otherwise. If in1 or in2 are the point at infinity, res
-// and the return value are undefined.
+func curvePointDoubleComplete(c, a *curvePoint)
+/*
+// Point addition. Sets res = in1 + in2. in1 may be equal to in2, and either input may be the point at infinity.
 //
 //go:noescape
-func curvePointAdd(c, a, b *curvePoint) int
+func curvePointAddComplete(c, a, b *curvePoint)
+*/
diff --git a/sm9/bn256/gfp2_g1_generic.go b/sm9/bn256/gfp2_g1_generic.go
index db85ce9..fe245fe 100644
--- a/sm9/bn256/gfp2_g1_generic.go
+++ b/sm9/bn256/gfp2_g1_generic.go
@@ -82,112 +82,32 @@ func gfp2SquareU(c, a *gfP2) {
 	gfp2Copy(c, tmp)
 }
 
-func curvePointDouble(c, a *curvePoint) {
-	// See http://hyperelliptic.org/EFD/g1p/auto-code/shortw/jacobian-0/doubling/dbl-2009-l.op3
-	A, B, C := &gfP{}, &gfP{}, &gfP{}
-	gfpSqr(A, &a.x, 1)
-	gfpSqr(B, &a.y, 1)
-	gfpSqr(C, B, 1)
+func curvePointDoubleComplete(c, p *curvePoint) {
+	// Complete addition formula for a = 0 from "Complete addition formulas for
+	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §3.2.
+	// Algorithm 9: Exception-free point doubling for prime order j-invariant 0 short Weierstrass curves.
+	t0, t1, t2 := new(gfP), new(gfP), new(gfP) // scratch values
+	x3, y3, z3 := new(gfP), new(gfP), new(gfP) // outputs are staged here and copied into c last, so c may alias p
 
-	t := &gfP{}
-	gfpAdd(B, &a.x, B)
-	gfpSqr(t, B, 1)
-	gfpSub(B, t, A)
-	gfpSub(t, B, C)
+	gfpSqr(t0, &p.y, 1)         // t0 := Y^2
+	gfpDouble(z3, t0)           // Z3 := t0 + t0
+	gfpDouble(z3, z3)           // Z3 := Z3 + Z3
+	gfpDouble(z3, z3)           // Z3 := Z3 + Z3
+	gfpMul(t1, &p.y, &p.z)      // t1 := YZ
+	gfpSqr(t2, &p.z, 1)         // t2 := Z^2
+	gfpMul(t2, threeCurveB, t2) // t2 := 3b * t2 = 3bZ^2
+	gfpMul(x3, t2, z3)          // X3 := t2 * Z3
+	gfpAdd(y3, t0, t2)          // Y3 := t0 + t2
+	gfpMul(z3, t1, z3)          // Z3 := t1 * Z3
+	gfpTriple(t2, t2)           // t2 := t2 + t2 + t2
+	gfpSub(t0, t0, t2)          // t0 := t0 - t2
+	gfpMul(y3, t0, y3)          // Y3 := t0 * Y3
+	gfpAdd(y3, x3, y3)          // Y3 := X3 + Y3
+	gfpMul(t1, &p.x, &p.y)      // t1 := XY
+	gfpMul(x3, t0, t1)          // X3 := t0 * t1
+	gfpDouble(x3, x3)           // X3 := X3 + X3
 
-	d, e := &gfP{}, &gfP{}
-	gfpDouble(d, t)
-	gfpDouble(B, A)
-	gfpAdd(e, B, A)
-	gfpSqr(A, e, 1)
-
-	gfpDouble(B, d)
-	gfpSub(&c.x, A, B)
-
-	gfpMul(&c.z, &a.y, &a.z)
-	gfpDouble(&c.z, &c.z)
-
-	gfpDouble(B, C)
-	gfpDouble(t, B)
-	gfpDouble(B, t)
-	gfpSub(&c.y, d, &c.x)
-	gfpMul(t, e, &c.y)
-	gfpSub(&c.y, t, B)	
-}
-
-func curvePointAdd(c, a, b *curvePoint) int {
-	// See http://hyperelliptic.org/EFD/g1p/auto-code/shortw/jacobian-0/addition/add-2007-bl.op3
-	var pointEq int
-	// Normalize the points by replacing a = [x1:y1:z1] and b = [x2:y2:z2]
-	// by [u1:s1:z1·z2] and [u2:s2:z1·z2]
-	// where u1 = x1·z2², s1 = y1·z2³ and u1 = x2·z1², s2 = y2·z1³
-	z12, z22 := &gfP{}, &gfP{}
-	gfpSqr(z12, &a.z, 1)
-	gfpSqr(z22, &b.z, 1)
-
-	u1, u2 := &gfP{}, &gfP{}
-	gfpMul(u1, &a.x, z22)
-	gfpMul(u2, &b.x, z12)
-
-	t, s1 := &gfP{}, &gfP{}
-	gfpMul(t, &b.z, z22)
-	gfpMul(s1, &a.y, t)
-
-	s2 := &gfP{}
-	gfpMul(t, &a.z, z12)
-	gfpMul(s2, &b.y, t)
-
-	// Compute x = (2h)²(s²-u1-u2)
-	// where s = (s2-s1)/(u2-u1) is the slope of the line through
-	// (u1,s1) and (u2,s2). The extra factor 2h = 2(u2-u1) comes from the value of z below.
-	// This is also:
-	// 4(s2-s1)² - 4h²(u1+u2) = 4(s2-s1)² - 4h³ - 4h²(2u1)
-	//                        = r² - j - 2v
-	// with the notations below.
-	h := &gfP{}
-	gfpSub(h, u2, u1)
-
-	gfpDouble(t, h)
-	// i = 4h²
-	i := &gfP{}
-	gfpSqr(i, t, 1)
-	// j = 4h³
-	j := &gfP{}
-	gfpMul(j, h, i)
-
-	gfpSub(t, s2, s1)
-
-	pointEq = h.Equal(zero) & t.Equal(zero) 
-
-	r := &gfP{}
-	gfpDouble(r, t)
-
-	v := &gfP{}
-	gfpMul(v, u1, i)
-
-	// t4 = 4(s2-s1)²
-	t4, t6 := &gfP{}, &gfP{}
-	gfpSqr(t4, r, 1)
-	gfpDouble(t, v)
-	gfpSub(t6, t4, j)
-
-	gfpSub(&c.x, t6, t)
-
-	// Set y = -(2h)³(s1 + s*(x/4h²-u1))
-	// This is also
-	// y = - 2·s1·j - (s2-s1)(2x - 2i·u1) = r(v-x) - 2·s1·j
-	gfpSub(t, v, &c.x) // t7
-	gfpMul(t4, s1, j)  // t8
-	gfpDouble(t6, t4)  // t9
-	gfpMul(t4, r, t)   // t10
-	gfpSub(&c.y, t4, t6)
-
-	// Set z = 2(u2-u1)·z1·z2 = 2h·z1·z2
-	gfpAdd(t, &a.z, &b.z) // t11
-	gfpSqr(t4, t, 1)      // t12
-	gfpSub(t, t4, z12)    // t13
-	gfpSub(t4, t, z22)    // t14
-	gfpMul(&c.z, t4, h)
-
-	return pointEq
+	c.x.Set(x3)
+	c.y.Set(y3)
+	c.z.Set(z3)
 }
diff --git a/sm9/bn256/twist.go b/sm9/bn256/twist.go
index e56575a..6a2b84e 100644
--- a/sm9/bn256/twist.go
+++ b/sm9/bn256/twist.go
@@ -149,14 +149,14 @@ func (c *twistPoint) Double(p *twistPoint) {
 	z3.Double(z3)           // Z3 := Z3 + Z3
 	z3.Double(z3)           // Z3 := Z3 + Z3
 	t1.Mul(&p.y, &p.z)      // t1 := YZ
-	t2.Square(&p.z)         // t0 := Z^2
+	t2.Square(&p.z)         // t2 := Z^2
 	t2.Mul(threeTwistB, t2) // t2 := 3b * t2 = 3bZ^2
 	x3.Mul(t2, z3)          // X3 := t2 * Z3
 	y3.Add(t0, t2)          // Y3 := t0 + t2
 	z3.Mul(t1, z3)          // Z3 := t1 * Z3
 	t2.Triple(t2)           // t2 := t2 + t2 + t2
 	t0.Sub(t0, t2)          // t0 := t0 - t2
-	y3.Mul(t0, y3)          // t0 := t0 * Y3
+	y3.Mul(t0, y3)          // Y3 := t0 * Y3
 	y3.Add(x3, y3)          // Y3 := X3 + Y3
 	t1.Mul(&p.x, &p.y)      // t1 := XY
 	x3.Mul(t0, t1)          // X3 := t0 * t1