diff --git a/sm9/bn256/constants.go b/sm9/bn256/constants.go
index 5d8dee4..bb9f977 100644
--- a/sm9/bn256/constants.go
+++ b/sm9/bn256/constants.go
@@ -26,7 +26,7 @@ var p2 = [4]uint64{0xe56f9b27e351457d, 0x21f2934b1a7aeedb, 0xd603ab4ff58ec745, 0
 // np is the negative inverse of p, mod 2^256.
 var np = [4]uint64{0x892bc42c2f2ee42b, 0x181ae39613c8dbaf, 0x966a4b291522b137, 0xafd2bac5558a13b3}
 
-// b3 is 15
+// Montgomery encoding of 15
 var b3 = [4]uint64{0x2dd845ba5a554cbf, 0x3719ead6d3ea67f6, 0x71b2f270db49a754, 0x0cbfffffc8934e29}
 
 // rN1 is R^-1 where R = 2^256 mod p.
diff --git a/sm9/bn256/curve.go b/sm9/bn256/curve.go
index c5b1dfe..d8bf0f5 100644
--- a/sm9/bn256/curve.go
+++ b/sm9/bn256/curve.go
@@ -313,48 +313,3 @@ func curvePointAdd(c, a, b *curvePoint) int {
 
 	return pointEq
 }
-
-func curvePointAddComplete(c, p1, p2 *curvePoint) {
-	// Complete addition formula for a = 0 from "Complete addition formulas for
-	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §3.2.
-	// Algorithm 7: Complete, projective point addition for prime order j-invariant 0 short Weierstrass curves.
-
-	t0, t1, t2, t3, t4 := new(gfP), new(gfP), new(gfP), new(gfP), new(gfP)
-	x3, y3, z3 := new(gfP), new(gfP), new(gfP)
-	gfpMul(t0, &p1.x, &p2.x)    // t0 := X1X2
-	gfpMul(t1, &p1.y, &p2.y)    // t1 := Y1Y2
-	gfpMul(t2, &p1.z, &p2.z)    // t2 := Z1Z2
-	gfpAdd(t3, &p1.x, &p1.y)    // t3 := X1 + Y1
-	gfpAdd(t4, &p2.x, &p2.y)    // t4 := X2 + Y2
-	gfpMul(t3, t3, t4)          // t3 := t3 * t4 = (X1 + Y1) * (X2 + Y2)
-	gfpAdd(t4, t0, t1)          // t4 := t0 + t1
-	gfpSub(t3, t3, t4)          // t3 := t3 - t4 = X1Y2 + X2Y1
-	gfpAdd(t4, &p1.y, &p1.z)    // t4 := Y1 + Z1
-	gfpAdd(x3, &p2.y, &p2.z)    // X3 := Y2 + Z2
-	gfpMul(t4, t4, x3)          // t4 := t4 * X3 = (Y1 + Z1)(Y2 + Z2)
-	gfpAdd(x3, t1, t2)          // X3 := t1 + t2
-	gfpSub(t4, t4, x3)          // t4 := t4 - X3 = Y1Z2 + Y2Z1
-	gfpAdd(x3, &p1.x, &p1.z)    // X3 := X1 + Z1
-	gfpAdd(y3, &p2.x, &p2.z)    // Y3 := X2 + Z2
-	gfpMul(x3, x3, y3)          // X3 := X3 * Y3
-	gfpAdd(y3, t0, t2)          // Y3 := t0 + t2
-	gfpSub(y3, x3, y3)          // Y3 := X3 - Y3 = X1Z2 + X2Z1
-	gfpTriple(t0, t0)           // t0 := t0 + t0 + t0 = 3X1X2
-	gfpMul(t2, threeCurveB, t2) // t2 := 3b * t2 = 3bZ1Z2
-	gfpAdd(z3, t1, t2)          // Z3 := t1 + t2 = Y1Y2 + 3bZ1Z2
-	gfpSub(t1, t1, t2)          // t1 := t1 - t2 = Y1Y2 - 3bZ1Z2
-	gfpMul(y3, threeCurveB, y3) // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1)
-	gfpMul(x3, t4, y3)          // X3 := t4 * Y3 = 3b(X1Z2 + X2Z1)(Y1Z2 + Y2Z1)
-	gfpMul(t2, t3, t1)          // t2 := t3 * t1 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2)
-	gfpSub(x3, t2, x3)          // X3 := t2 - X3 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - 3b(Y1Z2 + Y2Z1)(X1Z2 + X2Z1)
-	gfpMul(y3, y3, t0)          // Y3 := Y3 * t0 = 9bX1X2(X1Z2 + X2Z1)
-	gfpMul(t1, t1, z3)          // t1 := t1 * Z3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2)
-	gfpAdd(y3, t1, y3)          // Y3 := t1 + Y3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + 9bX1X2(X1Z2 + X2Z1)
-	gfpMul(t0, t0, t3)          // t0 := t0 * t3 = 3X1X2(X1Y2 + X2Y1)
-	gfpMul(z3, z3, t4)          // Z3 := Z3 * t4 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2)
-	gfpAdd(z3, z3, t0)          // Z3 := Z3 + t0 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + 3X1X2(X1Y2 + X2Y1)
-
-	c.x.Set(x3)
-	c.y.Set(y3)
-	c.z.Set(z3)
-}
diff --git a/sm9/bn256/gfp2_g1_amd64.s b/sm9/bn256/gfp2_g1_amd64.s
index 74311cf..e9040f6 100644
--- a/sm9/bn256/gfp2_g1_amd64.s
+++ b/sm9/bn256/gfp2_g1_amd64.s
@@ -1474,6 +1474,263 @@ TEXT gfpIsZero(SB),NOSPLIT,$0
 	CMOVQEQ t1, AX
 	RET
 
+/* ---------------------------------------*/
+#define x1in(off) (32*0 + off)(SP)
+#define y1in(off) (32*1 + off)(SP)
+#define z1in(off) (32*2 + off)(SP)
+#define x2in(off) (32*3 + off)(SP)
+#define y2in(off) (32*4 + off)(SP)
+#define z2in(off) (32*5 + off)(SP)
+#define xout(off) (32*6 + off)(SP)
+#define yout(off) (32*7 + off)(SP)
+#define zout(off) (32*8 + off)(SP)
+#define tmp0(off) (32*9 + off)(SP)
+#define tmp1(off) (32*10 + off)(SP)
+#define tmp2(off) (32*11 + off)(SP)
+#define tmp3(off) (32*12 + off)(SP)
+#define tmp4(off) (32*13 + off)(SP)
+#define rptr      (32*14)(SP)
+
+#define curvePointAddCompleteInline \
+	LDacc (x1in) \
+	LDt (x2in)   \
+	CALL gfpMulInternal(SB) \ // t0 := X1X2
+	ST (tmp0)    \
+	LDacc (y1in) \
+	LDt (y2in)   \
+	CALL gfpMulInternal(SB) \ // t1 := Y1Y2
+	ST (tmp1)    \
+	LDacc (z1in) \
+	LDt (z2in)   \
+	CALL gfpMulInternal(SB) \ // t2 := Z1Z2
+	ST (tmp2)    \
+	\
+	LDacc (x1in) \
+	LDt (y1in)   \
+	gfpAddInline \
+	STt (tmp3)   \            // t3 := X1 + Y1
+	LDacc (x2in) \
+	LDt (y2in)   \
+	gfpAddInline \
+	LDacc (tmp3) \
+	CALL gfpMulInternal(SB) \ // t3 := t3 * t4 = (X1 + Y1) * (X2 + Y2)
+	ST (tmp3)    \
+	LDacc (tmp0) \
+	LDt (tmp1)   \
+	gfpAddInline \
+	LDacc (tmp3) \
+	CALL gfpSubInternal(SB) \ // t3 := t3 - t4 = X1Y2 + X2Y1
+	ST (tmp3)    \
+	\
+	LDacc (y1in) \
+	LDt (z1in)   \
+	gfpAddInline \            // t4 := Y1 + Z1
+	STt (tmp4)   \
+	LDacc (y2in) \ 
+	LDt (z2in)   \
+	gfpAddInline \
+	LDacc (tmp4) \
+	CALL gfpMulInternal(SB) \ // t4 := t4 * X3 = (Y1 + Z1)(Y2 + Z2)
+	ST (tmp4)    \
+	LDacc (tmp1) \
+	LDt (tmp2)   \
+	gfpAddInline \
+	LDacc (tmp4) \
+	CALL gfpSubInternal(SB) \ // t4 := t4 - X3 = Y1Z2 + Y2Z1
+	ST (tmp4)    \
+	\
+	LDacc (z1in) \
+	LDt (x1in)   \
+	gfpAddInline \            // X3 := X1 + Z1
+	STt (xout)   \
+	LDacc (z2in) \
+	LDt (x2in)   \
+	gfpAddInline \
+	LDacc (xout) \
+	CALL gfpMulInternal(SB) \ // X3 := X3 * Y3
+	ST (xout)    \
+	LDacc (tmp0) \
+	LDt (tmp2)   \
+	gfpAddInline \
+	LDacc (xout) \
+	CALL gfpSubInternal(SB) \ // Y3 := X3 - Y3 = X1Z2 + X2Z1
+	ST (yout)    \
+	\
+	LDacc (tmp0) \
+	gfpMulBy2Inline \
+	LDacc (tmp0)    \
+	gfpAddInline    \         // t0 := t0 + t0 + t0 = 3X1X2
+	STt (tmp0)   \
+	\
+	LDacc (tmp2) \
+	gfpMulBy2Inline  \
+	t2acc        \
+	gfpMulBy2Inline  \
+	t2acc            \
+	gfpMulBy2Inline  \
+	t2acc            \
+	gfpMulBy2Inline  \
+	t2acc            \
+	LDt (tmp2)       \
+	CALL gfpSubInternal(SB) \ // t2 := 3b * t2 = 3bZ1Z2
+	ST (tmp2)        \  
+	\
+	LDt (tmp1)       \  
+	gfpAddInline     \        // Z3 := t1 + t2 = Y1Y2 + 3bZ1Z2
+	STt (zout)       \
+	\
+	LDacc (tmp1)     \
+	LDt (tmp2)       \
+	CALL gfpSubInternal(SB) \ // t1 := t1 - t2 = Y1Y2 - 3bZ1Z2
+	ST (tmp1)        \
+	\
+	LDacc (yout)     \
+	gfpMulBy2Inline  \
+	t2acc            \
+	gfpMulBy2Inline  \
+	t2acc            \
+	gfpMulBy2Inline  \
+	t2acc            \
+	gfpMulBy2Inline  \
+	t2acc            \
+	LDt (yout)       \
+	CALL gfpSubInternal(SB) \ // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1)
+	ST (yout)        \
+	\
+	LDt (tmp4)       \
+	CALL gfpMulInternal(SB) \ // X3 := t4 * Y3 = 3b(X1Z2 + X2Z1)(Y1Z2 + Y2Z1)
+	ST (xout)        \
+	\
+	LDacc (tmp1)     \
+	LDt (tmp3)       \
+	CALL gfpMulInternal(SB) \ // t2 := t3 * t1 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2)
+	LDt (xout)       \
+	CALL gfpSubInternal(SB) \ // X3 := t2 - X3 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - 3b(Y1Z2 + Y2Z1)(X1Z2 + X2Z1)
+	MOVQ rptr, AX    \
+	\// Store x
+	MOVQ acc4, (16*0 + 8*0)(AX) \
+	MOVQ acc5, (16*0 + 8*1)(AX) \
+	MOVQ acc6, (16*0 + 8*2)(AX) \
+	MOVQ acc7, (16*0 + 8*3)(AX) \
+	\
+	LDacc (yout)     \
+	LDt (tmp0)       \
+	CALL gfpMulInternal(SB) \ // Y3 := Y3 * t0 = 9bX1X2(X1Z2 + X2Z1)
+	ST (yout)        \ 
+	\
+	LDacc (tmp1)     \ 
+	LDt (zout)       \
+	CALL gfpMulInternal(SB) \ // t1 := t1 * Z3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2)
+	LDt (yout)       \  
+	gfpAddInline     \        // Y3 := t1 + Y3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + 9bX1X2(X1Z2 + X2Z1)
+	MOVQ rptr, AX    \
+	\// Store y
+	MOVQ t0, (16*2 + 8*0)(AX) \
+	MOVQ t1, (16*2 + 8*1)(AX) \
+	MOVQ t2, (16*2 + 8*2)(AX) \
+	MOVQ t3, (16*2 + 8*3)(AX) \
+	\
+	LDacc (tmp0)     \    
+	LDt (tmp3)       \
+	CALL gfpMulInternal(SB) \ // t0 := t0 * t3 = 3X1X2(X1Y2 + X2Y1)
+	ST (tmp0)        \
+	LDacc (zout)     \
+	LDt (tmp4)       \
+	CALL gfpMulInternal(SB) \ // Z3 := Z3 * t4 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2)
+	LDt (tmp0)       \
+	gfpAddInline     \        // Z3 := Z3 + t0 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + 3X1X2(X1Y2 + X2Y1)
+	MOVQ rptr, AX    \
+	MOVQ $0, rptr    \
+	\// Store z
+	MOVQ t0, (16*4 + 8*0)(AX) \
+	MOVQ t1, (16*4 + 8*1)(AX) \
+	MOVQ t2, (16*4 + 8*2)(AX) \
+	MOVQ t3, (16*4 + 8*3)(AX) \
+
+// func curvePointAddComplete(c, a, b *curvePoint)
+TEXT ·curvePointAddComplete(SB),0,$480-24
+	// Move input to stack in order to free registers
+	MOVQ res+0(FP), AX
+	MOVQ in1+8(FP), BX
+	MOVQ in2+16(FP), CX
+
+	CMPB ·supportAVX2+0(SB), $0x01
+	JEQ  pointadd_avx2
+
+	MOVOU (16*0)(BX), X0
+	MOVOU (16*1)(BX), X1
+	MOVOU (16*2)(BX), X2
+	MOVOU (16*3)(BX), X3
+	MOVOU (16*4)(BX), X4
+	MOVOU (16*5)(BX), X5
+
+	MOVOU X0, x1in(16*0)
+	MOVOU X1, x1in(16*1)
+	MOVOU X2, y1in(16*0)
+	MOVOU X3, y1in(16*1)
+	MOVOU X4, z1in(16*0)
+	MOVOU X5, z1in(16*1)
+
+	MOVOU (16*0)(CX), X0
+	MOVOU (16*1)(CX), X1
+	MOVOU (16*2)(CX), X2
+	MOVOU (16*3)(CX), X3
+	MOVOU (16*4)(CX), X4
+	MOVOU (16*5)(CX), X5
+
+	MOVOU X0, x2in(16*0)
+	MOVOU X1, x2in(16*1)
+	MOVOU X2, y2in(16*0)
+	MOVOU X3, y2in(16*1)
+	MOVOU X4, z2in(16*0)
+	MOVOU X5, z2in(16*1)
+	// Store pointer to result
+	MOVQ AX, rptr
+	
+	curvePointAddCompleteInline
+
+	RET
+	
+pointadd_avx2:	
+	VMOVDQU (32*0)(BX), Y0
+	VMOVDQU (32*1)(BX), Y1
+	VMOVDQU (32*2)(BX), Y2
+
+	VMOVDQU Y0, x1in(32*0)
+	VMOVDQU Y1, y1in(32*0)
+	VMOVDQU Y2, z1in(32*0)
+
+	VMOVDQU (32*0)(CX), Y0
+	VMOVDQU (32*1)(CX), Y1
+	VMOVDQU (32*2)(CX), Y2
+
+	VMOVDQU Y0, x2in(32*0)
+	VMOVDQU Y1, y2in(32*0)
+	VMOVDQU Y2, z2in(32*0)
+
+	// Store pointer to result
+	MOVQ AX, rptr
+	curvePointAddCompleteInline
+
+	VZEROUPPER
+	RET
+
+#undef x1in
+#undef y1in
+#undef z1in
+#undef x2in
+#undef y2in
+#undef z2in
+#undef xout
+#undef yout
+#undef zout
+#undef tmp0
+#undef tmp1
+#undef tmp2
+#undef tmp3
+#undef tmp4
+#undef rptr
+
 /* ---------------------------------------*/
 /*
 #define x1in(off) (32*0 + off)(SP)
diff --git a/sm9/bn256/gfp2_g1_decl.go b/sm9/bn256/gfp2_g1_decl.go
index f4b8fbb..71c37f2 100644
--- a/sm9/bn256/gfp2_g1_decl.go
+++ b/sm9/bn256/gfp2_g1_decl.go
@@ -27,9 +27,9 @@ func gfp2SquareU(c, a *gfP2)
 //
 //go:noescape
 func curvePointDoubleComplete(c, a *curvePoint)
-/*
+
 // Point addition. Sets res = in1 + in2. in1 can be same as in2, also can be at infinity.
 //
 //go:noescape
 func curvePointAddComplete(c, a, b *curvePoint)
-*/
+
diff --git a/sm9/bn256/gfp2_g1_generic.go b/sm9/bn256/gfp2_g1_generic.go
index fe245fe..83fa955 100644
--- a/sm9/bn256/gfp2_g1_generic.go
+++ b/sm9/bn256/gfp2_g1_generic.go
@@ -111,3 +111,48 @@ func curvePointDoubleComplete(c, p *curvePoint) {
 	c.y.Set(y3)
 	c.z.Set(z3)
 }
+
+func curvePointAddComplete(c, p1, p2 *curvePoint) {
+	// Complete addition formula for a = 0 from "Complete addition formulas for
+	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §3.2.
+	// Algorithm 7: Complete, projective point addition for prime order j-invariant 0 short Weierstrass curves.
+
+	t0, t1, t2, t3, t4 := new(gfP), new(gfP), new(gfP), new(gfP), new(gfP)
+	x3, y3, z3 := new(gfP), new(gfP), new(gfP)
+	gfpMul(t0, &p1.x, &p2.x)    // t0 := X1X2
+	gfpMul(t1, &p1.y, &p2.y)    // t1 := Y1Y2
+	gfpMul(t2, &p1.z, &p2.z)    // t2 := Z1Z2
+	gfpAdd(t3, &p1.x, &p1.y)    // t3 := X1 + Y1
+	gfpAdd(t4, &p2.x, &p2.y)    // t4 := X2 + Y2
+	gfpMul(t3, t3, t4)          // t3 := t3 * t4 = (X1 + Y1) * (X2 + Y2)
+	gfpAdd(t4, t0, t1)          // t4 := t0 + t1
+	gfpSub(t3, t3, t4)          // t3 := t3 - t4 = X1Y2 + X2Y1
+	gfpAdd(t4, &p1.y, &p1.z)    // t4 := Y1 + Z1
+	gfpAdd(x3, &p2.y, &p2.z)    // X3 := Y2 + Z2
+	gfpMul(t4, t4, x3)          // t4 := t4 * X3 = (Y1 + Z1)(Y2 + Z2)
+	gfpAdd(x3, t1, t2)          // X3 := t1 + t2
+	gfpSub(t4, t4, x3)          // t4 := t4 - X3 = Y1Z2 + Y2Z1
+	gfpAdd(x3, &p1.x, &p1.z)    // X3 := X1 + Z1
+	gfpAdd(y3, &p2.x, &p2.z)    // Y3 := X2 + Z2
+	gfpMul(x3, x3, y3)          // X3 := X3 * Y3
+	gfpAdd(y3, t0, t2)          // Y3 := t0 + t2
+	gfpSub(y3, x3, y3)          // Y3 := X3 - Y3 = X1Z2 + X2Z1
+	gfpTriple(t0, t0)           // t0 := t0 + t0 + t0 = 3X1X2
+	gfpMul(t2, threeCurveB, t2) // t2 := 3b * t2 = 3bZ1Z2
+	gfpAdd(z3, t1, t2)          // Z3 := t1 + t2 = Y1Y2 + 3bZ1Z2
+	gfpSub(t1, t1, t2)          // t1 := t1 - t2 = Y1Y2 - 3bZ1Z2
+	gfpMul(y3, threeCurveB, y3) // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1)
+	gfpMul(x3, t4, y3)          // X3 := t4 * Y3 = 3b(X1Z2 + X2Z1)(Y1Z2 + Y2Z1)
+	gfpMul(t2, t3, t1)          // t2 := t3 * t1 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2)
+	gfpSub(x3, t2, x3)          // X3 := t2 - X3 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - 3b(Y1Z2 + Y2Z1)(X1Z2 + X2Z1)
+	gfpMul(y3, y3, t0)          // Y3 := Y3 * t0 = 9bX1X2(X1Z2 + X2Z1)
+	gfpMul(t1, t1, z3)          // t1 := t1 * Z3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2)
+	gfpAdd(y3, t1, y3)          // Y3 := t1 + Y3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + 9bX1X2(X1Z2 + X2Z1)
+	gfpMul(t0, t0, t3)          // t0 := t0 * t3 = 3X1X2(X1Y2 + X2Y1)
+	gfpMul(z3, z3, t4)          // Z3 := Z3 * t4 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2)
+	gfpAdd(z3, z3, t0)          // Z3 := Z3 + t0 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + 3X1X2(X1Y2 + X2Y1)
+
+	c.x.Set(x3)
+	c.y.Set(y3)
+	c.z.Set(z3)
+}