diff --git a/sm9/bn256/gfp2_g1_arm64.go b/sm9/bn256/gfp2_g1_arm64.go
new file mode 100644
index 0000000..91d1b90
--- /dev/null
+++ b/sm9/bn256/gfp2_g1_arm64.go
@@ -0,0 +1,144 @@
+//go:build arm64 && !purego
+// +build arm64,!purego
+
+package bn256
+
+// gfP2 multiplication.
+//
+//go:noescape
+func gfp2Mul(c, a, b *gfP2)
+
+// gfp2MulU sets c = a * b * u. Since u^2 = -2, Karatsuba multiplication
+// gives c.x = a.y*b.y - 2*a.x*b.x and c.y = -2*(a.x*b.y + a.y*b.x).
+func gfp2MulU(c, a, b *gfP2) {
+	tmp := &gfP2{}
+	tx := &tmp.x
+	ty := &tmp.y
+	v0, v1 := &gfP{}, &gfP{}
+
+	gfpMul(v0, &a.y, &b.y)
+	gfpMul(v1, &a.x, &b.x)
+
+	gfpAdd(tx, &a.x, &a.y)
+	gfpAdd(ty, &b.x, &b.y)
+
+	gfpMul(ty, tx, ty)
+	gfpSub(ty, ty, v0)
+	gfpSub(ty, ty, v1)
+	gfpDouble(ty, ty)
+	gfpNeg(ty, ty)
+
+	gfpSub(tx, v0, v1)
+	gfpSub(tx, tx, v1)
+
+	gfp2Copy(c, tmp)
+}
+
+// gfp2Square sets c = a * a: c.x = 2*a.x*a.y, c.y = a.y*a.y - 2*a.x*a.x.
+func gfp2Square(c, a *gfP2) {
+	tmp := &gfP2{}
+	tx := &tmp.x
+	ty := &tmp.y
+
+	gfpAdd(ty, &a.x, &a.y)
+	gfpDouble(tx, &a.x)
+	gfpSub(tx, &a.y, tx)
+	gfpMul(ty, tx, ty)
+	gfpMul(tx, &a.x, &a.y)
+	gfpAdd(ty, tx, ty)
+	gfpDouble(tx, tx)
+
+	gfp2Copy(c, tmp)
+}
+
+// gfp2SquareU sets c = a * a * u: c.x = a.y*a.y - 2*a.x*a.x, c.y = -4*a.x*a.y.
+func gfp2SquareU(c, a *gfP2) {
+	tmp := &gfP2{}
+	tx := &tmp.x
+	ty := &tmp.y
+
+	gfpAdd(tx, &a.x, &a.y)
+	gfpDouble(ty, &a.x)
+	gfpSub(ty, &a.y, ty)
+	gfpMul(tx, tx, ty)
+	gfpMul(ty, &a.x, &a.y)
+	gfpAdd(tx, tx, ty)
+	gfpDouble(ty, ty)
+	gfpDouble(ty, ty)
+	gfpNeg(ty, ty)
+
+	gfp2Copy(c, tmp)
+}
+
+func curvePointDoubleComplete(c, p *curvePoint) {
+	// Complete addition formula for a = 0 from "Complete addition formulas for
+	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §3.2.
+	// Algorithm 9: Exception-free point doubling for prime order j-invariant 0 short Weierstrass curves.
+	t0, t1, t2 := new(gfP), new(gfP), new(gfP)
+	x3, y3, z3 := new(gfP), new(gfP), new(gfP)
+
+	gfpSqr(t0, &p.y, 1)         // t0 := Y^2
+	gfpDouble(z3, t0)           // Z3 := t0 + t0
+	gfpDouble(z3, z3)           // Z3 := Z3 + Z3
+	gfpDouble(z3, z3)           // Z3 := Z3 + Z3
+	gfpMul(t1, &p.y, &p.z)      // t1 := YZ
+	gfpSqr(t2, &p.z, 1)         // t2 := Z^2
+	gfpMul(t2, threeCurveB, t2) // t2 := 3b * t2 = 3bZ^2
+	gfpMul(x3, t2, z3)          // X3 := t2 * Z3
+	gfpAdd(y3, t0, t2)          // Y3 := t0 + t2
+	gfpMul(z3, t1, z3)          // Z3 := t1 * Z3
+	gfpTriple(t2, t2)           // t2 := t2 + t2 + t2
+	gfpSub(t0, t0, t2)          // t0 := t0 - t2
+	gfpMul(y3, t0, y3)          // Y3 := t0 * Y3
+	gfpAdd(y3, x3, y3)          // Y3 := X3 + Y3
+	gfpMul(t1, &p.x, &p.y)      // t1 := XY
+	gfpMul(x3, t0, t1)          // X3 := t0 * t1
+	gfpDouble(x3, x3)           // X3 := X3 + X3
+
+	c.x.Set(x3)
+	c.y.Set(y3)
+	c.z.Set(z3)
+}
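+
+// Sanity-check sketch for the two complete formulas: Algorithm 7 below is
+// exception-free, so adding a point to itself must agree with Algorithm 9
+// above. Assuming some projective equality helper, a test could assert:
+//
+//	d, s := &curvePoint{}, &curvePoint{}
+//	curvePointDoubleComplete(d, p) // 2P via Algorithm 9
+//	curvePointAddComplete(s, p, p) // P+P via Algorithm 7
+//	// d and s must represent the same projective point.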
+
+func curvePointAddComplete(c, p1, p2 *curvePoint) {
+	// Complete addition formula for a = 0 from "Complete addition formulas for
+	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §3.2.
+	// Algorithm 7: Complete, projective point addition for prime order j-invariant 0 short Weierstrass curves.
+	t0, t1, t2, t3, t4 := new(gfP), new(gfP), new(gfP), new(gfP), new(gfP)
+	x3, y3, z3 := new(gfP), new(gfP), new(gfP)
+	gfpMul(t0, &p1.x, &p2.x)    // t0 := X1X2
+	gfpMul(t1, &p1.y, &p2.y)    // t1 := Y1Y2
+	gfpMul(t2, &p1.z, &p2.z)    // t2 := Z1Z2
+	gfpAdd(t3, &p1.x, &p1.y)    // t3 := X1 + Y1
+	gfpAdd(t4, &p2.x, &p2.y)    // t4 := X2 + Y2
+	gfpMul(t3, t3, t4)          // t3 := t3 * t4 = (X1 + Y1) * (X2 + Y2)
+	gfpAdd(t4, t0, t1)          // t4 := t0 + t1
+	gfpSub(t3, t3, t4)          // t3 := t3 - t4 = X1Y2 + X2Y1
+	gfpAdd(t4, &p1.y, &p1.z)    // t4 := Y1 + Z1
+	gfpAdd(x3, &p2.y, &p2.z)    // X3 := Y2 + Z2
+	gfpMul(t4, t4, x3)          // t4 := t4 * X3 = (Y1 + Z1)(Y2 + Z2)
+	gfpAdd(x3, t1, t2)          // X3 := t1 + t2
+	gfpSub(t4, t4, x3)          // t4 := t4 - X3 = Y1Z2 + Y2Z1
+	gfpAdd(x3, &p1.x, &p1.z)    // X3 := X1 + Z1
+	gfpAdd(y3, &p2.x, &p2.z)    // Y3 := X2 + Z2
+	gfpMul(x3, x3, y3)          // X3 := X3 * Y3
+	gfpAdd(y3, t0, t2)          // Y3 := t0 + t2
+	gfpSub(y3, x3, y3)          // Y3 := X3 - Y3 = X1Z2 + X2Z1
+	gfpTriple(t0, t0)           // t0 := t0 + t0 + t0 = 3X1X2
+	gfpMul(t2, threeCurveB, t2) // t2 := 3b * t2 = 3bZ1Z2
+	gfpAdd(z3, t1, t2)          // Z3 := t1 + t2 = Y1Y2 + 3bZ1Z2
+	gfpSub(t1, t1, t2)          // t1 := t1 - t2 = Y1Y2 - 3bZ1Z2
+	gfpMul(y3, threeCurveB, y3) // Y3 := 3b * Y3 = 3b(X1Z2 + X2Z1)
+	gfpMul(x3, t4, y3)          // X3 := t4 * Y3 = 3b(X1Z2 + X2Z1)(Y1Z2 + Y2Z1)
+	gfpMul(t2, t3, t1)          // t2 := t3 * t1 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2)
+	gfpSub(x3, t2, x3)          // X3 := t2 - X3 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - 3b(Y1Z2 + Y2Z1)(X1Z2 + X2Z1)
+	gfpMul(y3, y3, t0)          // Y3 := Y3 * t0 = 9bX1X2(X1Z2 + X2Z1)
+	gfpMul(t1, t1, z3)          // t1 := t1 * Z3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2)
+	gfpAdd(y3, t1, y3)          // Y3 := t1 + Y3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + 9bX1X2(X1Z2 + X2Z1)
+	gfpMul(t0, t0, t3)          // t0 := t0 * t3 = 3X1X2(X1Y2 + X2Y1)
+	gfpMul(z3, z3, t4)          // Z3 := Z3 * t4 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2)
+	gfpAdd(z3, z3, t0)          // Z3 := Z3 + t0 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + 3X1X2(X1Y2 + X2Y1)
+
+	c.x.Set(x3)
+	c.y.Set(y3)
+	c.z.Set(z3)
+}
diff --git a/sm9/bn256/gfp2_g1_arm64.s b/sm9/bn256/gfp2_g1_arm64.s
new file mode 100644
index 0000000..583a40e
--- /dev/null
+++ b/sm9/bn256/gfp2_g1_arm64.s
@@ -0,0 +1,543 @@
+//go:build arm64 && !purego
+// +build arm64,!purego
+
+#include "textflag.h"
+
+#define res_ptr R0
+#define a_ptr R1
+#define b_ptr R2
+
+#define acc0 R3
+#define acc1 R4
+#define acc2 R5
+#define acc3 R6
+
+#define acc4 R7
+#define acc5 R8
+#define acc6 R9
+#define acc7 R10
+#define t0 R11
+#define t1 R12
+#define const0 R15
+#define const1 R16
+#define const2 R13
+#define const3 R14
+
+#define hlp0 R17
+#define hlp1 res_ptr
+
+#define x0 R19
+#define x1 R20
+#define x2 R21
+#define x3 R22
+#define y0 R23
+#define y1 R24
+#define y2 R25
+#define y3 R26
+
+/* ---------------------------------------*/
+// (x3, x2, x1, x0) = (y3, y2, y1, y0) - (x3, x2, x1, x0)
+// Requires const0-const3 to hold the modulus p (loaded by the caller).
+TEXT gfpSubInternal<>(SB),NOSPLIT,$0
+	SUBS	x0, y0, acc0
+	SBCS	x1, y1, acc1
+	SBCS	x2, y2, acc2
+	SBCS	x3, y3, acc3
+	SBC	$0, ZR, t0
+
+	ADDS	const0, acc0, acc4
+	ADCS	const1, acc1, acc5
+	ADCS	const2, acc2, acc6
+	ADC	const3, acc3, acc7
+
+	ANDS	$1, t0
+	CSEL	EQ, acc0, acc4, x0
+	CSEL	EQ, acc1, acc5, x1
+	CSEL	EQ, acc2, acc6, x2
+	CSEL	EQ, acc3, acc7, x3
+
+	RET
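+
+// Montgomery reduction, as used by the four "reduction step"s in
+// gfpMulInternal and gfpSqrInternal below: with hlp1 = np (assumed to be
+// -p^-1 mod 2^64) and const0-const3 = p, each step computes
+//
+//	m = (acc mod 2^64) * np mod 2^64
+//	acc = (acc + m*p) / 2^64
+//
+// so the low word is cleared and the accumulator shifts down one word.
+// After four steps the result is x*y*2^-256 mod p, up to one final
+// conditional subtraction of p.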
+
+/* ---------------------------------------*/
+// (y3, y2, y1, y0) = (x3, x2, x1, x0) * (y3, y2, y1, y0)
+TEXT gfpMulInternal<>(SB),NOSPLIT,$0
+	// y[0] * x
+	MUL	y0, x0, acc0
+	UMULH	y0, x0, acc1
+
+	MUL	y0, x1, t0
+	ADDS	t0, acc1
+	UMULH	y0, x1, acc2
+
+	MUL	y0, x2, t0
+	ADCS	t0, acc2
+	UMULH	y0, x2, acc3
+
+	MUL	y0, x3, t0
+	ADCS	t0, acc3
+	UMULH	y0, x3, acc4
+	ADC	$0, acc4
+	// First reduction step
+	MUL	acc0, hlp1, hlp0
+
+	MUL	const0, hlp0, t0
+	ADDS	t0, acc0, acc0
+	UMULH	const0, hlp0, t1
+
+	MUL	const1, hlp0, t0
+	ADCS	t0, acc1, acc1
+	UMULH	const1, hlp0, y0
+
+	MUL	const2, hlp0, t0
+	ADCS	t0, acc2, acc2
+	UMULH	const2, hlp0, acc0
+
+	MUL	const3, hlp0, t0
+	ADCS	t0, acc3, acc3
+
+	UMULH	const3, hlp0, hlp0
+	ADC	$0, acc4
+
+	ADDS	t1, acc1, acc1
+	ADCS	y0, acc2, acc2
+	ADCS	acc0, acc3, acc3
+	ADC	$0, hlp0, acc0
+
+	// y[1] * x
+	MUL	y1, x0, t0
+	ADDS	t0, acc1
+	UMULH	y1, x0, t1
+
+	MUL	y1, x1, t0
+	ADCS	t0, acc2
+	UMULH	y1, x1, y0
+
+	MUL	y1, x2, t0
+	ADCS	t0, acc3
+	UMULH	y1, x2, hlp0
+
+	MUL	y1, x3, t0
+	ADCS	t0, acc4
+	UMULH	y1, x3, y1
+	ADC	$0, ZR, acc5
+
+	ADDS	t1, acc2
+	ADCS	y0, acc3
+	ADCS	hlp0, acc4
+	ADC	y1, acc5
+	// Second reduction step
+	MUL	acc1, hlp1, hlp0
+
+	MUL	const0, hlp0, t0
+	ADDS	t0, acc1, acc1
+	UMULH	const0, hlp0, t1
+
+	MUL	const1, hlp0, t0
+	ADCS	t0, acc2, acc2
+	UMULH	const1, hlp0, y0
+
+	MUL	const2, hlp0, t0
+	ADCS	t0, acc3, acc3
+	UMULH	const2, hlp0, acc1
+
+	MUL	const3, hlp0, t0
+	ADCS	t0, acc0, acc0
+
+	UMULH	const3, hlp0, hlp0
+	ADC	$0, acc5
+
+	ADDS	t1, acc2, acc2
+	ADCS	y0, acc3, acc3
+	ADCS	acc1, acc0, acc0
+	ADC	$0, hlp0, acc1
+
+	// y[2] * x
+	MUL	y2, x0, t0
+	ADDS	t0, acc2
+	UMULH	y2, x0, t1
+
+	MUL	y2, x1, t0
+	ADCS	t0, acc3
+	UMULH	y2, x1, y0
+
+	MUL	y2, x2, t0
+	ADCS	t0, acc4
+	UMULH	y2, x2, y1
+
+	MUL	y2, x3, t0
+	ADCS	t0, acc5
+	UMULH	y2, x3, hlp0
+	ADC	$0, ZR, acc6
+
+	ADDS	t1, acc3
+	ADCS	y0, acc4
+	ADCS	y1, acc5
+	ADC	hlp0, acc6
+	// Third reduction step
+	MUL	acc2, hlp1, hlp0
+
+	MUL	const0, hlp0, t0
+	ADDS	t0, acc2, acc2
+	UMULH	const0, hlp0, t1
+
+	MUL	const1, hlp0, t0
+	ADCS	t0, acc3, acc3
+	UMULH	const1, hlp0, y0
+
+	MUL	const2, hlp0, t0
+	ADCS	t0, acc0, acc0
+	UMULH	const2, hlp0, acc2
+
+	MUL	const3, hlp0, t0
+	ADCS	t0, acc1, acc1
+
+	UMULH	const3, hlp0, hlp0
+	ADC	$0, acc6
+
+	ADDS	t1, acc3, acc3
+	ADCS	y0, acc0, acc0
+	ADCS	acc2, acc1, acc1
+	ADC	$0, hlp0, acc2
+	// y[3] * x
+	MUL	y3, x0, t0
+	ADDS	t0, acc3
+	UMULH	y3, x0, t1
+
+	MUL	y3, x1, t0
+	ADCS	t0, acc4
+	UMULH	y3, x1, y0
+
+	MUL	y3, x2, t0
+	ADCS	t0, acc5
+	UMULH	y3, x2, y1
+
+	MUL	y3, x3, t0
+	ADCS	t0, acc6
+	UMULH	y3, x3, hlp0
+	ADC	$0, ZR, acc7
+
+	ADDS	t1, acc4
+	ADCS	y0, acc5
+	ADCS	y1, acc6
+	ADC	hlp0, acc7
+	// Last reduction step
+	MUL	acc3, hlp1, hlp0
+
+	MUL	const0, hlp0, t0
+	ADDS	t0, acc3, acc3
+	UMULH	const0, hlp0, t1
+
+	MUL	const1, hlp0, t0
+	ADCS	t0, acc0, acc0
+	UMULH	const1, hlp0, y0
+
+	MUL	const2, hlp0, t0
+	ADCS	t0, acc1, acc1
+	UMULH	const2, hlp0, acc3
+
+	MUL	const3, hlp0, t0
+	ADCS	t0, acc2, acc2
+
+	UMULH	const3, hlp0, hlp0
+	ADC	$0, acc7
+
+	ADDS	t1, acc0, acc0
+	ADCS	y0, acc1, acc1
+	ADCS	acc3, acc2, acc2
+	ADC	$0, hlp0, acc3
+	// Add bits [511:256] of the mul result
+	ADDS	acc4, acc0, acc0
+	ADCS	acc5, acc1, acc1
+	ADCS	acc6, acc2, acc2
+	ADCS	acc7, acc3, acc3
+	ADC	$0, ZR, acc4
+
+	SUBS	const0, acc0, t0
+	SBCS	const1, acc1, t1
+	SBCS	const2, acc2, acc6
+	SBCS	const3, acc3, acc7
+	SBCS	$0, acc4, acc4
+
+	CSEL	CS, t0, acc0, y0
+	CSEL	CS, t1, acc1, y1
+	CSEL	CS, acc6, acc2, y2
+	CSEL	CS, acc7, acc3, y3
+
+	RET
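+
+// The squaring routine below computes each cross product x[i]*x[j] (i < j)
+// once, doubles the whole partial sum, then folds in the diagonal squares,
+// using the identity:
+//
+//	(sum_i x[i]*2^(64i))^2 = sum_i x[i]^2 * 2^(128i)
+//	                       + 2 * sum_{i<j} x[i]*x[j] * 2^(64(i+j))
+//
+// This is what the "*2" and "Missing products" steps implement.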
+
+/* ---------------------------------------*/
+// (y3, y2, y1, y0) = (x3, x2, x1, x0) ^ 2
+TEXT gfpSqrInternal<>(SB),NOSPLIT,$0
+	// x[1:] * x[0]
+	MUL	x0, x1, acc1
+	UMULH	x0, x1, acc2
+
+	MUL	x0, x2, t0
+	ADDS	t0, acc2, acc2
+	UMULH	x0, x2, acc3
+
+	MUL	x0, x3, t0
+	ADCS	t0, acc3, acc3
+	UMULH	x0, x3, acc4
+	ADC	$0, acc4, acc4
+	// x[2:] * x[1]
+	MUL	x1, x2, t0
+	ADDS	t0, acc3
+	UMULH	x1, x2, t1
+	ADCS	t1, acc4
+	ADC	$0, ZR, acc5
+
+	MUL	x1, x3, t0
+	ADDS	t0, acc4
+	UMULH	x1, x3, t1
+	ADC	t1, acc5
+	// x[3] * x[2]
+	MUL	x2, x3, t0
+	ADDS	t0, acc5
+	UMULH	x2, x3, acc6
+	ADC	$0, acc6
+
+	MOVD	$0, acc7
+	// *2
+	ADDS	acc1, acc1
+	ADCS	acc2, acc2
+	ADCS	acc3, acc3
+	ADCS	acc4, acc4
+	ADCS	acc5, acc5
+	ADCS	acc6, acc6
+	ADC	$0, acc7
+	// Missing products
+	MUL	x0, x0, acc0
+	UMULH	x0, x0, t0
+	ADDS	t0, acc1, acc1
+
+	MUL	x1, x1, t0
+	ADCS	t0, acc2, acc2
+	UMULH	x1, x1, t1
+	ADCS	t1, acc3, acc3
+
+	MUL	x2, x2, t0
+	ADCS	t0, acc4, acc4
+	UMULH	x2, x2, t1
+	ADCS	t1, acc5, acc5
+
+	MUL	x3, x3, t0
+	ADCS	t0, acc6, acc6
+	UMULH	x3, x3, t1
+	ADCS	t1, acc7, acc7
+	// First reduction step
+	MUL	acc0, hlp1, hlp0
+
+	MUL	const0, hlp0, t0
+	ADDS	t0, acc0, acc0
+	UMULH	const0, hlp0, t1
+
+	MUL	const1, hlp0, t0
+	ADCS	t0, acc1, acc1
+	UMULH	const1, hlp0, y0
+
+	MUL	const2, hlp0, t0
+	ADCS	t0, acc2, acc2
+	UMULH	const2, hlp0, acc0
+
+	MUL	const3, hlp0, t0
+	ADCS	t0, acc3, acc3
+
+	UMULH	const3, hlp0, hlp0
+	ADC	$0, hlp0
+
+	ADDS	t1, acc1, acc1
+	ADCS	y0, acc2, acc2
+	ADCS	acc0, acc3, acc3
+	ADC	$0, hlp0, acc0
+	// Second reduction step
+	MUL	acc1, hlp1, hlp0
+
+	MUL	const0, hlp0, t0
+	ADDS	t0, acc1, acc1
+	UMULH	const0, hlp0, t1
+
+	MUL	const1, hlp0, t0
+	ADCS	t0, acc2, acc2
+	UMULH	const1, hlp0, y0
+
+	MUL	const2, hlp0, t0
+	ADCS	t0, acc3, acc3
+	UMULH	const2, hlp0, acc1
+
+	MUL	const3, hlp0, t0
+	ADCS	t0, acc0, acc0
+
+	UMULH	const3, hlp0, hlp0
+	ADC	$0, hlp0
+
+	ADDS	t1, acc2, acc2
+	ADCS	y0, acc3, acc3
+	ADCS	acc1, acc0, acc0
+	ADC	$0, hlp0, acc1
+	// Third reduction step
+	MUL	acc2, hlp1, hlp0
+
+	MUL	const0, hlp0, t0
+	ADDS	t0, acc2, acc2
+	UMULH	const0, hlp0, t1
+
+	MUL	const1, hlp0, t0
+	ADCS	t0, acc3, acc3
+	UMULH	const1, hlp0, y0
+
+	MUL	const2, hlp0, t0
+	ADCS	t0, acc0, acc0
+	UMULH	const2, hlp0, acc2
+
+	MUL	const3, hlp0, t0
+	ADCS	t0, acc1, acc1
+
+	UMULH	const3, hlp0, hlp0
+	ADC	$0, hlp0
+
+	ADDS	t1, acc3, acc3
+	ADCS	y0, acc0, acc0
+	ADCS	acc2, acc1, acc1
+	ADC	$0, hlp0, acc2
+
+	// Last reduction step
+	MUL	acc3, hlp1, hlp0
+
+	MUL	const0, hlp0, t0
+	ADDS	t0, acc3, acc3
+	UMULH	const0, hlp0, t1
+
+	MUL	const1, hlp0, t0
+	ADCS	t0, acc0, acc0
+	UMULH	const1, hlp0, y0
+
+	MUL	const2, hlp0, t0
+	ADCS	t0, acc1, acc1
+	UMULH	const2, hlp0, acc3
+
+	MUL	const3, hlp0, t0
+	ADCS	t0, acc2, acc2
+
+	UMULH	const3, hlp0, hlp0
+	ADC	$0, acc7
+
+	ADDS	t1, acc0, acc0
+	ADCS	y0, acc1, acc1
+	ADCS	acc3, acc2, acc2
+	ADC	$0, hlp0, acc3
+	// Add bits [511:256] of the sqr result
+	ADDS	acc4, acc0, acc0
+	ADCS	acc5, acc1, acc1
+	ADCS	acc6, acc2, acc2
+	ADCS	acc7, acc3, acc3
+	ADC	$0, ZR, acc4
+
+	SUBS	const0, acc0, t0
+	SBCS	const1, acc1, t1
+	SBCS	const2, acc2, acc6
+	SBCS	const3, acc3, acc7
+	SBCS	$0, acc4, acc4
+
+	CSEL	CS, t0, acc0, y0
+	CSEL	CS, t1, acc1, y1
+	CSEL	CS, acc6, acc2, y2
+	CSEL	CS, acc7, acc3, y3
+	RET
+
+/* ---------------------------------------*/
+// (x3, x2, x1, x0) = 2(y3, y2, y1, y0)
+#define gfpMulBy2Inline \
+	ADDS	y0, y0, x0; \
+	ADCS	y1, y1, x1; \
+	ADCS	y2, y2, x2; \
+	ADCS	y3, y3, x3; \
+	ADC	$0, ZR, hlp0; \
+	SUBS	const0, x0, acc0; \
+	SBCS	const1, x1, acc1; \
+	SBCS	const2, x2, acc2; \
+	SBCS	const3, x3, acc3; \
+	SBCS	$0, hlp0, hlp0; \
+	CSEL	CC, x0, acc0, x0; \
+	CSEL	CC, x1, acc1, x1; \
+	CSEL	CC, x2, acc2, x2; \
+	CSEL	CC, x3, acc3, x3;
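+
+// Reduction pattern shared by gfpMulBy2Inline above and gfpAddInline below:
+// both inputs are already < p, so the raw sum is < 2p and a single
+// conditional subtraction suffices. hlp0 captures the carry out of bit 256,
+// the SBCS chain computes (sum - p), and CSEL CC keeps the raw sum exactly
+// when that subtraction borrowed, i.e. when sum < p.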
+
+/* ---------------------------------------*/
+// (x3, x2, x1, x0) = (x3, x2, x1, x0) + (y3, y2, y1, y0)
+#define gfpAddInline \
+	ADDS	y0, x0, x0; \
+	ADCS	y1, x1, x1; \
+	ADCS	y2, x2, x2; \
+	ADCS	y3, x3, x3; \
+	ADC	$0, ZR, hlp0; \
+	SUBS	const0, x0, acc0; \
+	SBCS	const1, x1, acc1; \
+	SBCS	const2, x2, acc2; \
+	SBCS	const3, x3, acc3; \
+	SBCS	$0, hlp0, hlp0; \
+	CSEL	CC, x0, acc0, x0; \
+	CSEL	CC, x1, acc1, x1; \
+	CSEL	CC, x2, acc2, x2; \
+	CSEL	CC, x3, acc3, x3;
+
+/* ---------------------------------------*/
+#define x1in(off) (off)(a_ptr)
+#define y1in(off) (off + 32)(a_ptr)
+#define z1in(off) (off + 64)(a_ptr)
+#define x2in(off) (off)(b_ptr)
+#define y2in(off) (off + 32)(b_ptr)
+#define z2in(off) (off + 64)(b_ptr)
+#define x3out(off) (off)(res_ptr)
+#define y3out(off) (off + 32)(res_ptr)
+#define z3out(off) (off + 64)(res_ptr)
+#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
+#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
+#define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
+#define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
+#define y2x MOVD y0, x0; MOVD y1, x1; MOVD y2, x2; MOVD y3, x3
+#define x2y MOVD x0, y0; MOVD x1, y1; MOVD x2, y2; MOVD x3, y3
+
+/* ---------------------------------------*/
+#define tmp0(off) (32*0 + 8 + off)(RSP)
+#define tmp1(off) (32*1 + 8 + off)(RSP)
+#define tmp2(off) (32*2 + 8 + off)(RSP)
+
+// func gfp2Mul(c, a, b *gfP2)
+TEXT ·gfp2Mul(SB),NOSPLIT,$104-24
+	MOVD	res+0(FP), res_ptr
+	MOVD	in1+8(FP), a_ptr
+	MOVD	in2+16(FP), b_ptr
+
+	MOVD	·np+0x00(SB), hlp1
+	LDP	·p2+0x00(SB), (const0, const1)
+	LDP	·p2+0x10(SB), (const2, const3)
+
+	LDx	(y1in)
+	LDy	(y2in)
+	CALL	gfpMulInternal<>(SB)
+	STy	(tmp0)
+
+	LDx	(x1in)
+	LDy	(x2in)
+	CALL	gfpMulInternal<>(SB)
+	STy	(tmp1)
+
+	LDx	(x1in)
+	LDy	(y1in)
+	gfpAddInline
+	STx	(tmp2)
+
+	LDx	(x2in)
+	LDy	(y2in)
+	gfpAddInline
+	LDy	(tmp2)
+	CALL	gfpMulInternal<>(SB)
+
+	LDx	(tmp0)
+	CALL	gfpSubInternal<>(SB)
+	x2y
+	LDx	(tmp1)
+	CALL	gfpSubInternal<>(SB)
+	MOVD	res+0(FP), res_ptr	// hlp1 aliases res_ptr and was clobbered by ·np; reload before storing
+	STx	(x3out)
+
+	LDy	(tmp1)
+	gfpMulBy2Inline
+	LDy	(tmp0)
+	CALL	gfpSubInternal<>(SB)
+	STx	(y3out)
+
+	RET
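
The u^2 = -2 arithmetic can be cross-checked from Go against the assembly
path: multiplying the gfp2Mul result by u by hand, via
(d.x*u + d.y)*u = d.y*u - 2*d.x, must reproduce gfp2MulU. A minimal test
sketch with a hypothetical helper, assuming gfP is a comparable array type
as in the purego code:

	// checkMulU is a hypothetical consistency probe: u*(a*b) derived
	// from the assembly gfp2Mul must equal gfp2MulU(a, b).
	func checkMulU(a, b *gfP2) bool {
		d, e := &gfP2{}, &gfP2{}
		gfp2Mul(d, a, b)  // d = a * b (assembly path)
		gfp2MulU(e, a, b) // e = a * b * u
		var x, y gfP
		x = d.y             // (d*u).x = d.y
		gfpDouble(&y, &d.x) // (d*u).y = -2 * d.x
		gfpNeg(&y, &y)
		return x == e.x && y == e.y
	}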