sm9/bn256: gfp2 g1 arm64 method one by one

2025-10-14 15:20:45 +08:00 · 2023-07-24 13:11:38 +08:00 · 2023-07-24 13:11:38 +08:00 · 968dfaafa0
commit 968dfaafa0
parent de62767f53
2 changed files with 687 additions and 0 deletions
--- a/sm9/bn256/gfp2_g1_arm64.go
+++ b/sm9/bn256/gfp2_g1_arm64.go
@ -0,0 +1,144 @@
 //go:build arm64 && !purego
 // +build arm64,!purego
 package bn256
 package bn256
 // gfp2Mul is the gfP2 multiplication c = a * b. It has no Go body; the
 // implementation is the assembly routine ·gfp2Mul in gfp2_g1_arm64.s, which
 // stores
 //
 //	c.x = a.x*b.y + a.y*b.x
 //	c.y = a.y*b.y - 2*a.x*b.x
 //
 //go:noescape
 func gfp2Mul(c, a, b *gfP2)
 // gfp2MulU computes the product of a and b with the roles of the two
 // coordinates swapped relative to gfp2Mul:
 //
 //	c.x = a.y*b.y - 2*a.x*b.x
 //	c.y = -2 * (a.x*b.y + a.y*b.x)
 //
 // The result is assembled in a scratch value and copied into c last.
 func gfp2MulU(c, a, b *gfP2) {
 	var (
 		out    gfP2
 		yy, xx gfP
 	)
 	gfpMul(&yy, &a.y, &b.y) // yy = a.y * b.y
 	gfpMul(&xx, &a.x, &b.x) // xx = a.x * b.x
 	// Cross term via one extra multiplication:
 	// (a.x + a.y)(b.x + b.y) - yy - xx = a.x*b.y + a.y*b.x.
 	gfpAdd(&out.x, &a.x, &a.y)
 	gfpAdd(&out.y, &b.x, &b.y)
 	gfpMul(&out.y, &out.x, &out.y)
 	gfpSub(&out.y, &out.y, &yy)
 	gfpSub(&out.y, &out.y, &xx)
 	// out.y = -2 * cross term.
 	gfpDouble(&out.y, &out.y)
 	gfpNeg(&out.y, &out.y)
 	// out.x = yy - 2*xx.
 	gfpSub(&out.x, &yy, &xx)
 	gfpSub(&out.x, &out.x, &xx)
 	gfp2Copy(c, &out)
 }
 // gfp2Square computes the square of a:
 //
 //	c.x = 2*a.x*a.y
 //	c.y = a.y^2 - 2*a.x^2
 //
 // using two field multiplications. The result is built in a scratch value
 // and copied into c last.
 func gfp2Square(c, a *gfP2) {
 	var out gfP2
 	rx, ry := &out.x, &out.y
 	gfpAdd(ry, &a.x, &a.y) // ry = x + y
 	gfpDouble(rx, &a.x)    // rx = 2x
 	gfpSub(rx, &a.y, rx)   // rx = y - 2x
 	gfpMul(ry, rx, ry)     // ry = (y - 2x)(x + y) = y^2 - xy - 2x^2
 	gfpMul(rx, &a.x, &a.y) // rx = xy
 	gfpAdd(ry, rx, ry)     // ry = y^2 - 2x^2
 	gfpDouble(rx, rx)      // rx = 2xy
 	gfp2Copy(c, &out)
 }
 // gfp2SquareU computes the square of a with the coordinate roles swapped
 // relative to gfp2Square:
 //
 //	c.x = a.y^2 - 2*a.x^2
 //	c.y = -4*a.x*a.y
 //
 // The result is built in a scratch value and copied into c last.
 func gfp2SquareU(c, a *gfP2) {
 	var out gfP2
 	rx, ry := &out.x, &out.y
 	gfpAdd(rx, &a.x, &a.y) // rx = x + y
 	gfpDouble(ry, &a.x)    // ry = 2x
 	gfpSub(ry, &a.y, ry)   // ry = y - 2x
 	gfpMul(rx, rx, ry)     // rx = (x + y)(y - 2x) = y^2 - xy - 2x^2
 	gfpMul(ry, &a.x, &a.y) // ry = xy
 	gfpAdd(rx, rx, ry)     // rx = y^2 - 2x^2
 	gfpDouble(ry, ry)      // ry = 2xy
 	gfpDouble(ry, ry)      // ry = 4xy
 	gfpNeg(ry, ry)         // ry = -4xy
 	gfp2Copy(c, &out)
 }
 // curvePointDoubleComplete sets c = 2*p in projective coordinates.
 //
 // It follows Algorithm 9 (exception-free doubling for j-invariant 0 short
 // Weierstrass curves) of "Complete addition formulas for prime order elliptic
 // curves", https://eprint.iacr.org/2015/1060, §3.2. All intermediate values
 // live in locals, and c is written only after p has been fully read.
 func curvePointDoubleComplete(c, p *curvePoint) {
 	var s0, s1, s2 gfP // the paper's t0, t1, t2
 	var rx, ry, rz gfP // the paper's X3, Y3, Z3
 	// rz = 8*Y^2
 	gfpSqr(&s0, &p.y, 1) // s0 = Y^2
 	gfpDouble(&rz, &s0)
 	gfpDouble(&rz, &rz)
 	gfpDouble(&rz, &rz)
 	gfpMul(&s1, &p.y, &p.z) // s1 = Y*Z
 	// s2 = 3b*Z^2
 	gfpSqr(&s2, &p.z, 1)
 	gfpMul(&s2, threeCurveB, &s2)
 	gfpMul(&rx, &s2, &rz) // rx = s2 * rz
 	gfpAdd(&ry, &s0, &s2) // ry = s0 + s2
 	gfpMul(&rz, &s1, &rz) // rz = s1 * rz  (final Z3)
 	gfpTriple(&s2, &s2)   // s2 = 3 * s2
 	gfpSub(&s0, &s0, &s2) // s0 = s0 - s2
 	gfpMul(&ry, &s0, &ry) // ry = s0 * ry
 	gfpAdd(&ry, &rx, &ry) // ry = rx + ry  (final Y3)
 	// rx = 2 * s0 * X*Y  (final X3)
 	gfpMul(&s1, &p.x, &p.y)
 	gfpMul(&rx, &s0, &s1)
 	gfpDouble(&rx, &rx)
 	c.x.Set(&rx)
 	c.y.Set(&ry)
 	c.z.Set(&rz)
 }
 // curvePointAddComplete sets c = p1 + p2 in projective coordinates.
 //
 // It follows Algorithm 7 (complete projective addition for j-invariant 0
 // short Weierstrass curves) of "Complete addition formulas for prime order
 // elliptic curves", https://eprint.iacr.org/2015/1060, §3.2. All intermediate
 // values live in locals, and c is written only after both inputs have been
 // fully read.
 func curvePointAddComplete(c, p1, p2 *curvePoint) {
 	var s0, s1, s2, s3, s4 gfP // the paper's t0..t4
 	var rx, ry, rz gfP         // the paper's X3, Y3, Z3
 	// Coordinate-wise products.
 	gfpMul(&s0, &p1.x, &p2.x) // s0 = X1X2
 	gfpMul(&s1, &p1.y, &p2.y) // s1 = Y1Y2
 	gfpMul(&s2, &p1.z, &p2.z) // s2 = Z1Z2
 	// s3 = X1Y2 + X2Y1 = (X1 + Y1)(X2 + Y2) - X1X2 - Y1Y2
 	gfpAdd(&s3, &p1.x, &p1.y)
 	gfpAdd(&s4, &p2.x, &p2.y)
 	gfpMul(&s3, &s3, &s4)
 	gfpAdd(&s4, &s0, &s1)
 	gfpSub(&s3, &s3, &s4)
 	// s4 = Y1Z2 + Y2Z1 = (Y1 + Z1)(Y2 + Z2) - Y1Y2 - Z1Z2
 	gfpAdd(&s4, &p1.y, &p1.z)
 	gfpAdd(&rx, &p2.y, &p2.z)
 	gfpMul(&s4, &s4, &rx)
 	gfpAdd(&rx, &s1, &s2)
 	gfpSub(&s4, &s4, &rx)
 	// ry = X1Z2 + X2Z1 = (X1 + Z1)(X2 + Z2) - X1X2 - Z1Z2
 	gfpAdd(&rx, &p1.x, &p1.z)
 	gfpAdd(&ry, &p2.x, &p2.z)
 	gfpMul(&rx, &rx, &ry)
 	gfpAdd(&ry, &s0, &s2)
 	gfpSub(&ry, &rx, &ry)
 	// Scale by the curve constants.
 	gfpTriple(&s0, &s0)           // s0 = 3X1X2
 	gfpMul(&s2, threeCurveB, &s2) // s2 = 3bZ1Z2
 	gfpAdd(&rz, &s1, &s2)         // rz = Y1Y2 + 3bZ1Z2
 	gfpSub(&s1, &s1, &s2)         // s1 = Y1Y2 - 3bZ1Z2
 	gfpMul(&ry, threeCurveB, &ry) // ry = 3b(X1Z2 + X2Z1)
 	// X3 = s3*s1 - s4*ry
 	gfpMul(&rx, &s4, &ry)
 	gfpMul(&s2, &s3, &s1)
 	gfpSub(&rx, &s2, &rx)
 	// Y3 = s1*rz + ry*s0
 	gfpMul(&ry, &ry, &s0)
 	gfpMul(&s1, &s1, &rz)
 	gfpAdd(&ry, &s1, &ry)
 	// Z3 = rz*s4 + s0*s3
 	gfpMul(&s0, &s0, &s3)
 	gfpMul(&rz, &rz, &s4)
 	gfpAdd(&rz, &rz, &s0)
 	c.x.Set(&rx)
 	c.y.Set(&ry)
 	c.z.Set(&rz)
 }
--- a/sm9/bn256/gfp2_g1_arm64.s
+++ b/sm9/bn256/gfp2_g1_arm64.s
@ -0,0 +1,543 @@
 //go:build arm64 && !purego
 // +build arm64,!purego
 #include "textflag.h"
 // Pointer arguments of the exported routines.
 #define res_ptr R0
 #define a_ptr R1
 #define b_ptr R2
 // acc0..acc7: limbs of the wide accumulator used by the multiply, square
 // and reduce code; also used as scratch by the add/sub helpers.
 #define acc0 R3
 #define acc1 R4
 #define acc2 R5
 #define acc3 R6
 #define acc4 R7
 #define acc5 R8
 #define acc6 R9
 #define acc7 R10
 // Short-lived temporaries.
 #define t0 R11
 #define t1 R12
 // const0..const3: the four limbs of ·p2, loaded by ·gfp2Mul before it calls
 // the internal helpers.
 #define const0 R15
 #define const1 R16
 #define const2 R13
 #define const3 R14
 // hlp0 is scratch. hlp1 holds ·np during multiplication/squaring and is the
 // SAME register as res_ptr (R0): res_ptr is not usable while hlp1 is live.
 #define hlp0 R17
 #define hlp1 res_ptr
 // x0..x3 and y0..y3: the two 256-bit operands (least significant limb first)
 // of the internal helpers.
 #define x0 R19
 #define x1 R20
 #define x2 R21
 #define x3 R22
 #define y0 R23
 #define y1 R24
 #define y2 R25
 #define y3 R26
 /* ---------------------------------------*/
 // (x3, x2, x1, x0) = (y3, y2, y1, y0) - (x3, x2, x1, x0) mod p
 //
 // In:  x, y operands; const0..const3 = limbs of the modulus p.
 // Out: x. Clobbers acc0..acc7, t0 and the condition flags; y is preserved.
 //
 // The modulus is taken from const0..const3 rather than from memory: A64
 // add/subtract instructions only accept register or immediate operands.
 TEXT gfpSubInternal<>(SB),NOSPLIT,$0
 	// acc0..acc3 = y - x, t0 = all ones if the subtraction borrowed, else 0.
 	SUBS	x0, y0, acc0
 	SBCS	x1, y1, acc1
 	SBCS	x2, y2, acc2
 	SBCS	x3, y3, acc3
 	SBC	$0, ZR, t0
 	// acc4..acc7 = (y - x) + p, the value to use when the subtraction borrowed.
 	ADDS	const0, acc0, acc4
 	ADCS	const1, acc1, acc5
 	ADCS	const2, acc2, acc6
 	ADC	const3, acc3, acc7
 	// Select without branching: EQ (no borrow) keeps y - x, otherwise y - x + p.
 	ANDS	$1, t0
 	CSEL	EQ, acc0, acc4, x0
 	CSEL	EQ, acc1, acc5, x1
 	CSEL	EQ, acc2, acc6, x2
 	CSEL	EQ, acc3, acc7, x3
 	RET
 /* ---------------------------------------*/
 // (y3, y2, y1, y0) = (x3, x2, x1, x0) * (y3, y2, y1, y0)
 //
 // Word-by-word Montgomery multiplication: each y[i] * x partial product is
 // followed by one reduction step that adds (acc_low * np) * p so the lowest
 // live limb becomes zero and can be dropped.
 //
 // In:  x, y operands; const0..const3 = limbs of the modulus p; hlp1 = np.
 // Out: y. x is preserved. Clobbers acc0..acc7, t0, t1, hlp0 and the flags.
 //
 // acc4..acc7 collect the upper half of the plain product, while acc0..acc3
 // are recycled to collect the reduction terms; the two halves are summed at
 // the end and p is conditionally subtracted once.
 TEXT gfpMulInternal<>(SB),NOSPLIT,$0
 	// y[0] * x
 	MUL	y0, x0, acc0
 	UMULH	y0, x0, acc1
 	MUL	y0, x1, t0
 	ADDS	t0, acc1
 	UMULH	y0, x1, acc2
 	MUL	y0, x2, t0
 	ADCS	t0, acc2
 	UMULH	y0, x2, acc3
 	MUL	y0, x3, t0
 	ADCS	t0, acc3
 	UMULH	y0, x3, acc4
 	ADC	$0, acc4
 	// First reduction step
 	MUL	acc0, hlp1, hlp0
 	MUL	const0, hlp0, t0
 	ADDS	t0, acc0, acc0
 	UMULH	const0, hlp0, t1
 	MUL	const1, hlp0, t0
 	ADCS	t0, acc1, acc1
 	UMULH	const1, hlp0, y0
 	MUL	const2, hlp0, t0
 	ADCS	t0, acc2, acc2
 	UMULH	const2, hlp0, acc0
 	MUL	const3, hlp0, t0
 	ADCS	t0, acc3, acc3
 	UMULH	const3, hlp0, hlp0
 	ADC	$0, acc4
 	ADDS	t1, acc1, acc1
 	ADCS	y0, acc2, acc2
 	ADCS	acc0, acc3, acc3
 	ADC	$0, hlp0, acc0
 	// y[1] * x
 	MUL	y1, x0, t0
 	ADDS	t0, acc1
 	UMULH	y1, x0, t1
 	MUL	y1, x1, t0
 	ADCS	t0, acc2
 	UMULH	y1, x1, y0
 	MUL	y1, x2, t0
 	ADCS	t0, acc3
 	UMULH	y1, x2, hlp0
 	MUL	y1, x3, t0
 	ADCS	t0, acc4
 	UMULH	y1, x3, y1
 	ADC	$0, ZR, acc5
 	ADDS	t1, acc2
 	ADCS	y0, acc3
 	ADCS	hlp0, acc4
 	ADC	y1, acc5
 	// Second reduction step
 	MUL	acc1, hlp1, hlp0
 	MUL	const0, hlp0, t0
 	ADDS	t0, acc1, acc1
 	UMULH	const0, hlp0, t1
 	MUL	const1, hlp0, t0
 	ADCS	t0, acc2, acc2
 	UMULH	const1, hlp0, y0
 	MUL	const2, hlp0, t0
 	ADCS	t0, acc3, acc3
 	UMULH	const2, hlp0, acc1
 	MUL	const3, hlp0, t0
 	ADCS	t0, acc0, acc0
 	UMULH	const3, hlp0, hlp0
 	ADC	$0, acc5
 	ADDS	t1, acc2, acc2
 	ADCS	y0, acc3, acc3
 	ADCS	acc1, acc0, acc0
 	ADC	$0, hlp0, acc1
 	// y[2] * x
 	MUL	y2, x0, t0
 	ADDS	t0, acc2
 	UMULH	y2, x0, t1
 	MUL	y2, x1, t0
 	ADCS	t0, acc3
 	UMULH	y2, x1, y0
 	MUL	y2, x2, t0
 	ADCS	t0, acc4
 	UMULH	y2, x2, y1
 	MUL	y2, x3, t0
 	ADCS	t0, acc5
 	UMULH	y2, x3, hlp0
 	ADC	$0, ZR, acc6
 	ADDS	t1, acc3
 	ADCS	y0, acc4
 	ADCS	y1, acc5
 	ADC	hlp0, acc6
 	// Third reduction step
 	MUL	acc2, hlp1, hlp0
 	MUL	const0, hlp0, t0
 	ADDS	t0, acc2, acc2
 	UMULH	const0, hlp0, t1
 	MUL	const1, hlp0, t0
 	ADCS	t0, acc3, acc3
 	UMULH	const1, hlp0, y0
 	MUL	const2, hlp0, t0
 	ADCS	t0, acc0, acc0
 	UMULH	const2, hlp0, acc2
 	MUL	const3, hlp0, t0
 	ADCS	t0, acc1, acc1
 	UMULH	const3, hlp0, hlp0
 	ADC	$0, acc6
 	ADDS	t1, acc3, acc3
 	ADCS	y0, acc0, acc0
 	ADCS	acc2, acc1, acc1
 	ADC	$0, hlp0, acc2
 	// y[3] * x
 	MUL	y3, x0, t0
 	ADDS	t0, acc3
 	UMULH	y3, x0, t1
 	MUL	y3, x1, t0
 	ADCS	t0, acc4
 	UMULH	y3, x1, y0
 	MUL	y3, x2, t0
 	ADCS	t0, acc5
 	UMULH	y3, x2, y1
 	MUL	y3, x3, t0
 	ADCS	t0, acc6
 	UMULH	y3, x3, hlp0
 	ADC	$0, ZR, acc7
 	ADDS	t1, acc4
 	ADCS	y0, acc5
 	ADCS	y1, acc6
 	ADC	hlp0, acc7
 	// Last reduction step
 	MUL	acc3, hlp1, hlp0
 	MUL	const0, hlp0, t0
 	ADDS	t0, acc3, acc3
 	UMULH	const0, hlp0, t1
 	MUL	const1, hlp0, t0
 	ADCS	t0, acc0, acc0
 	UMULH	const1, hlp0, y0
 	MUL	const2, hlp0, t0
 	ADCS	t0, acc1, acc1
 	UMULH	const2, hlp0, acc3
 	MUL	const3, hlp0, t0
 	ADCS	t0, acc2, acc2
 	UMULH	const3, hlp0, hlp0
 	ADC	$0, acc7
 	// Fold in the high halves of (hlp0 * p), exactly as the earlier steps do.
 	// Without this the t1/y0/acc3/hlp0 words are dropped and acc3 is left
 	// holding a raw UMULH result instead of the top reduction limb.
 	ADDS	t1, acc0, acc0
 	ADCS	y0, acc1, acc1
 	ADCS	acc3, acc2, acc2
 	ADC	$0, hlp0, acc3
 	// Add bits [511:256] of the mul result
 	ADDS	acc4, acc0, acc0
 	ADCS	acc5, acc1, acc1
 	ADCS	acc6, acc2, acc2
 	ADCS	acc7, acc3, acc3
 	ADC	$0, ZR, acc4
 	// Final conditional subtraction: keep (acc - p) when it did not borrow.
 	SUBS	const0, acc0, t0
 	SBCS	const1, acc1, t1
 	SBCS	const2, acc2, acc6
 	SBCS	const3, acc3, acc7
 	SBCS	$0, acc4, acc4
 	CSEL	CS, t0, acc0, y0
 	CSEL	CS, t1, acc1, y1
 	CSEL	CS, acc6, acc2, y2
 	CSEL	CS, acc7, acc3, y3
 	RET
 /* ---------------------------------------*/
 // (y3, y2, y1, y0) = (x3, x2, x1, x0) ^ 2
 //
 // Montgomery squaring. The full 512-bit square is formed first in
 // acc0..acc7 (off-diagonal products once, doubled, then the diagonal
 // x[i]*x[i] terms added), followed by four reduction steps over the low
 // half and one conditional subtraction of the modulus.
 //
 // In:  x operand; const0..const3 = modulus limbs; hlp1 = the per-word
 //      reduction multiplier.
 // Out: y. x is preserved. Clobbers acc0..acc7, t0, t1, hlp0 and the flags.
 TEXT gfpSqrInternal<>(SB),NOSPLIT,$0
 	// x[1:] * x[0]
 	MUL	x0, x1, acc1
 	UMULH	x0, x1, acc2
 	MUL	x0, x2, t0
 	ADDS	t0, acc2, acc2
 	UMULH	x0, x2, acc3
 	MUL	x0, x3, t0
 	ADCS	t0, acc3, acc3
 	UMULH	x0, x3, acc4
 	ADC	$0, acc4, acc4
 	// x[2:] * x[1]
 	MUL	x1, x2, t0
 	ADDS	t0, acc3
 	UMULH	x1, x2, t1
 	ADCS	t1, acc4
 	ADC	$0, ZR, acc5
 	MUL	x1, x3, t0
 	ADDS	t0, acc4
 	UMULH	x1, x3, t1
 	ADC	t1, acc5
 	// x[3] * x[2]
 	MUL	x2, x3, t0
 	ADDS	t0, acc5
 	UMULH	x2, x3, acc6
 	ADC	$0, acc6
 	MOVD	$0, acc7
 	// *2 (each off-diagonal product occurs twice in the square)
 	ADDS	acc1, acc1
 	ADCS	acc2, acc2
 	ADCS	acc3, acc3
 	ADCS	acc4, acc4
 	ADCS	acc5, acc5
 	ADCS	acc6, acc6
 	ADC	$0, acc7
 	// Missing products (the diagonal terms x[i] * x[i])
 	MUL	x0, x0, acc0
 	UMULH	x0, x0, t0
 	ADDS	t0, acc1, acc1
 	MUL	x1, x1, t0
 	ADCS	t0, acc2, acc2
 	UMULH	x1, x1, t1
 	ADCS	t1, acc3, acc3
 	MUL	x2, x2, t0
 	ADCS	t0, acc4, acc4
 	UMULH	x2, x2, t1
 	ADCS	t1, acc5, acc5
 	MUL	x3, x3, t0
 	ADCS	t0, acc6, acc6
 	UMULH	x3, x3, t1
 	ADCS	t1, acc7, acc7
 	// First reduction step
 	// hlp0 = acc0 * hlp1; adding hlp0 * modulus clears the lowest limb, whose
 	// register is then reused for the new top limb of the reduction window.
 	MUL	acc0, hlp1, hlp0
 	MUL	const0, hlp0, t0
 	ADDS	t0, acc0, acc0
 	UMULH	const0, hlp0, t1
 	MUL	const1, hlp0, t0
 	ADCS	t0, acc1, acc1
 	UMULH	const1, hlp0, y0
 	MUL	const2, hlp0, t0
 	ADCS	t0, acc2, acc2
 	UMULH	const2, hlp0, acc0
 	MUL	const3, hlp0, t0
 	ADCS	t0, acc3, acc3
 	UMULH	const3, hlp0, hlp0
 	ADC	$0, hlp0
 	ADDS	t1, acc1, acc1
 	ADCS	y0, acc2, acc2
 	ADCS	acc0, acc3, acc3
 	ADC	$0, hlp0, acc0
 	// Second reduction step
 	MUL	acc1, hlp1, hlp0
 	MUL	const0, hlp0, t0
 	ADDS	t0, acc1, acc1
 	UMULH	const0, hlp0, t1
 	MUL	const1, hlp0, t0
 	ADCS	t0, acc2, acc2
 	UMULH	const1, hlp0, y0
 	MUL	const2, hlp0, t0
 	ADCS	t0, acc3, acc3
 	UMULH	const2, hlp0, acc1
 	MUL	const3, hlp0, t0
 	ADCS	t0, acc0, acc0
 	UMULH	const3, hlp0, hlp0
 	ADC	$0, hlp0
 	ADDS	t1, acc2, acc2
 	ADCS	y0, acc3, acc3
 	ADCS	acc1, acc0, acc0
 	ADC	$0, hlp0, acc1
 	// Third reduction step
 	MUL	acc2, hlp1, hlp0
 	MUL	const0, hlp0, t0
 	ADDS	t0, acc2, acc2
 	UMULH	const0, hlp0, t1
 	MUL	const1, hlp0, t0
 	ADCS	t0, acc3, acc3
 	UMULH	const1, hlp0, y0
 	MUL	const2, hlp0, t0
 	ADCS	t0, acc0, acc0
 	UMULH	const2, hlp0, acc2
 	MUL	const3, hlp0, t0
 	ADCS	t0, acc1, acc1
 	UMULH	const3, hlp0, hlp0
 	ADC	$0, hlp0
 	ADDS	t1, acc3, acc3
 	ADCS	y0, acc0, acc0
 	ADCS	acc2, acc1, acc1
 	ADC	$0, hlp0, acc2
 	// Last reduction step
 	MUL	acc3, hlp1, hlp0
 	MUL	const0, hlp0, t0
 	ADDS	t0, acc3, acc3
 	UMULH	const0, hlp0, t1
 	MUL	const1, hlp0, t0
 	ADCS	t0, acc0, acc0
 	UMULH	const1, hlp0, y0
 	MUL	const2, hlp0, t0
 	ADCS	t0, acc1, acc1
 	UMULH	const2, hlp0, acc3
 	MUL	const3, hlp0, t0
 	ADCS	t0, acc2, acc2
 	UMULH	const3, hlp0, hlp0
 	// NOTE(review): unlike the three steps above, the carry goes into acc7
 	// rather than hlp0. Both carry the same limb weight and are summed below,
 	// so the total is unchanged as long as acc7 cannot wrap - confirm.
 	ADC	$0, acc7
 	ADDS	t1, acc0, acc0
 	ADCS	y0, acc1, acc1
 	ADCS	acc3, acc2, acc2
 	ADC	$0, hlp0, acc3
 	// Add bits [511:256] of the sqr result
 	ADDS	acc4, acc0, acc0
 	ADCS	acc5, acc1, acc1
 	ADCS	acc6, acc2, acc2
 	ADCS	acc7, acc3, acc3
 	ADC	$0, ZR, acc4
 	// Final conditional subtraction: keep (acc - modulus) when it did not borrow.
 	SUBS	const0, acc0, t0
 	SBCS	const1, acc1, t1
 	SBCS	const2, acc2, acc6
 	SBCS	const3, acc3, acc7
 	SBCS	$0, acc4, acc4
 	CSEL	CS, t0, acc0, y0
 	CSEL	CS, t1, acc1, y1
 	CSEL	CS, acc6, acc2, y2
 	CSEL	CS, acc7, acc3, y3
    RET
 /* ---------------------------------------*/
 // (x3, x2, x1, x0) = 2(y3, y2, y1, y0) mod p
 //
 // In:  y operand; const0..const3 = limbs of the modulus p.
 // Out: x. y is preserved. Clobbers acc0..acc3, hlp0 and the flags.
 //
 // The doubled value (with its carry-out in hlp0) has p subtracted; if that
 // subtraction borrows (CC) the unsubtracted value is kept. The modulus is
 // read from const0..const3 because A64 subtract instructions cannot take a
 // memory operand.
 #define gfpMulBy2Inline       \
 	ADDS	y0, y0, x0;    \
 	ADCS	y1, y1, x1;    \
 	ADCS	y2, y2, x2;    \
 	ADCS	y3, y3, x3;    \
 	ADC	$0, ZR, hlp0;  \
 	SUBS	const0, x0, acc0;   \
 	SBCS	const1, x1, acc1;   \
 	SBCS	const2, x2, acc2;   \
 	SBCS	const3, x3, acc3;   \
 	SBCS	$0, hlp0, hlp0;\
 	CSEL	CC, x0, acc0, x0;\
 	CSEL	CC, x1, acc1, x1;\
 	CSEL	CC, x2, acc2, x2;\
 	CSEL	CC, x3, acc3, x3;
 /* ---------------------------------------*/
 // (x3, x2, x1, x0) = (x3, x2, x1, x0) + (y3, y2, y1, y0) mod p
 //
 // In:  x, y operands; const0..const3 = limbs of the modulus p.
 // Out: x. y is preserved. Clobbers acc0..acc3, hlp0 and the flags.
 //
 // The raw sum (with its carry-out in hlp0) has p subtracted; if that
 // subtraction borrows (CC) the unsubtracted sum is kept. The modulus is read
 // from const0..const3 because A64 subtract instructions cannot take a
 // memory operand.
 #define gfpAddInline          \
 	ADDS	y0, x0, x0;    \
 	ADCS	y1, x1, x1;    \
 	ADCS	y2, x2, x2;    \
 	ADCS	y3, x3, x3;    \
 	ADC	$0, ZR, hlp0;  \
 	SUBS	const0, x0, acc0;   \
 	SBCS	const1, x1, acc1;   \
 	SBCS	const2, x2, acc2;   \
 	SBCS	const3, x3, acc3;   \
 	SBCS	$0, hlp0, hlp0;\
 	CSEL	CC, x0, acc0, x0;\
 	CSEL	CC, x1, acc1, x1;\
 	CSEL	CC, x2, acc2, x2;\
 	CSEL	CC, x3, acc3, x3;
 /* ---------------------------------------*/
 // Addressing helpers: 32-byte fields at offsets 0, 32 and 64 of the first
 // input (a_ptr), the second input (b_ptr) and the output (res_ptr).
 #define x1in(off) (off)(a_ptr)
 #define y1in(off) (off + 32)(a_ptr)
 #define z1in(off) (off + 64)(a_ptr)
 #define x2in(off) (off)(b_ptr)
 #define y2in(off) (off + 32)(b_ptr)
 #define z2in(off) (off + 64)(b_ptr)
 #define x3out(off) (off)(res_ptr)
 #define y3out(off) (off + 32)(res_ptr)
 #define z3out(off) (off + 64)(res_ptr)
 // Load/store a 256-bit value between memory and the x or y register group.
 #define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
 #define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
 #define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
 #define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
 // Register-to-register copies between the two groups.
 #define y2x      MOVD y0, x0; MOVD y1, x1; MOVD y2, x2; MOVD y3, x3
 #define x2y      MOVD x0, y0; MOVD x1, y1; MOVD x2, y2; MOVD x3, y3
 /* ---------------------------------------*/
 // Three 32-byte scratch slots in the local frame (8 + 3*32 = 104 bytes).
 #define tmp0(off)	(32*0 + 8 + off)(RSP)
 #define tmp1(off)	(32*1 + 8 + off)(RSP)
 #define tmp2(off)	(32*2 + 8 + off)(RSP)
 // func gfp2Mul(c, a, b *gfP2)
 //
 // Computes, with v0 = a.y*b.y and v1 = a.x*b.x:
 //   c.x = (a.x + a.y)(b.x + b.y) - v0 - v1
 //   c.y = v0 - 2*v1
 //
 // Register notes: hlp1 (np) and res_ptr are both R0, so the result pointer
 // is loaded only after the last multiplication, once np is no longer needed.
 // The internal helpers are file-local symbols and must be called with <>.
 TEXT ·gfp2Mul(SB),NOSPLIT,$104-24
 	MOVD	a+8(FP), a_ptr
 	MOVD	b+16(FP), b_ptr
 	MOVD	·np+0x00(SB), hlp1
 	LDP	·p2+0x00(SB), (const0, const1)
 	LDP	·p2+0x10(SB), (const2, const3)
 	// tmp0 = v0 = a.y * b.y
 	LDx (y1in)
 	LDy (y2in)
 	CALL	gfpMulInternal<>(SB)
 	STy (tmp0)
 	// tmp1 = v1 = a.x * b.x
 	LDx (x1in)
 	LDy (x2in)
 	CALL	gfpMulInternal<>(SB)
 	STy (tmp1)
 	// tmp2 = a.x + a.y
 	LDx (x1in)
 	LDy (y1in)
 	gfpAddInline
 	STx (tmp2)
 	// y = (b.x + b.y) * (a.x + a.y)
 	LDx (x2in)
 	LDy (y2in)
 	gfpAddInline
 	LDy (tmp2)
 	CALL	gfpMulInternal<>(SB)
 	// np is dead from here on; R0 can now carry the result pointer.
 	MOVD	c+0(FP), res_ptr
 	// c.x = y - v0 - v1
 	LDx (tmp0)
 	CALL	gfpSubInternal<>(SB)
 	x2y
 	LDx (tmp1)
 	CALL	gfpSubInternal<>(SB)
 	STx (x3out)
 	// c.y = v0 - 2*v1
 	LDy (tmp1)
 	gfpMulBy2Inline
 	LDy (tmp0)
 	CALL	gfpSubInternal<>(SB)
 	STx (y3out)
 	RET