mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-26 20:26:19 +08:00
sm9/bn256: gfp2 g1 arm64 method one by one
This commit is contained in:
parent
de62767f53
commit
968dfaafa0
144
sm9/bn256/gfp2_g1_arm64.go
Normal file
144
sm9/bn256/gfp2_g1_arm64.go
Normal file
@ -0,0 +1,144 @@
|
||||
//go:build arm64 && !purego
|
||||
// +build arm64,!purego
|
||||
|
||||
package bn256
|
||||
|
||||
package bn256
|
||||
|
||||
// gfP2 multiplication.
|
||||
//
|
||||
//go:noescape
|
||||
func gfp2Mul(c, a, b *gfP2)
|
||||
|
||||
func gfp2MulU(c, a, b *gfP2) {
|
||||
tmp := &gfP2{}
|
||||
tx := &tmp.x
|
||||
ty := &tmp.y
|
||||
v0, v1 := &gfP{}, &gfP{}
|
||||
|
||||
gfpMul(v0, &a.y, &b.y)
|
||||
gfpMul(v1, &a.x, &b.x)
|
||||
|
||||
gfpAdd(tx, &a.x, &a.y)
|
||||
gfpAdd(ty, &b.x, &b.y)
|
||||
|
||||
gfpMul(ty, tx, ty)
|
||||
gfpSub(ty, ty, v0)
|
||||
gfpSub(ty, ty, v1)
|
||||
gfpDouble(ty, ty)
|
||||
gfpNeg(ty, ty)
|
||||
|
||||
gfpSub(tx, v0, v1)
|
||||
gfpSub(tx, tx, v1)
|
||||
|
||||
gfp2Copy(c, tmp)
|
||||
}
|
||||
|
||||
func gfp2Square(c, a *gfP2) {
|
||||
tmp := &gfP2{}
|
||||
tx := &tmp.x
|
||||
ty := &tmp.y
|
||||
|
||||
gfpAdd(ty, &a.x, &a.y)
|
||||
gfpDouble(tx, &a.x)
|
||||
gfpSub(tx, &a.y, tx)
|
||||
gfpMul(ty, tx, ty)
|
||||
gfpMul(tx, &a.x, &a.y)
|
||||
gfpAdd(ty, tx, ty)
|
||||
gfpDouble(tx, tx)
|
||||
|
||||
gfp2Copy(c, tmp)
|
||||
}
|
||||
|
||||
func gfp2SquareU(c, a *gfP2) {
|
||||
tmp := &gfP2{}
|
||||
tx := &tmp.x
|
||||
ty := &tmp.y
|
||||
|
||||
gfpAdd(tx, &a.x, &a.y)
|
||||
gfpDouble(ty, &a.x)
|
||||
gfpSub(ty, &a.y, ty)
|
||||
gfpMul(tx, tx, ty)
|
||||
gfpMul(ty, &a.x, &a.y)
|
||||
gfpAdd(tx, tx, ty)
|
||||
gfpDouble(ty, ty)
|
||||
gfpDouble(ty, ty)
|
||||
gfpNeg(ty, ty)
|
||||
|
||||
gfp2Copy(c, tmp)
|
||||
}
|
||||
|
||||
func curvePointDoubleComplete(c, p *curvePoint) {
|
||||
// Complete addition formula for a = 0 from "Complete addition formulas for
|
||||
// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §3.2.
|
||||
// Algorithm 9: Exception-free point doubling for prime order j-invariant 0 short Weierstrass curves.
|
||||
t0, t1, t2 := new(gfP), new(gfP), new(gfP)
|
||||
x3, y3, z3 := new(gfP), new(gfP), new(gfP)
|
||||
|
||||
gfpSqr(t0, &p.y, 1) // t0 := Y^2
|
||||
gfpDouble(z3, t0) // Z3 := t0 + t0
|
||||
gfpDouble(z3, z3) // Z3 := Z3 + Z3
|
||||
gfpDouble(z3, z3) // Z3 := Z3 + Z3
|
||||
gfpMul(t1, &p.y, &p.z) // t1 := YZ
|
||||
gfpSqr(t2, &p.z, 1) // t2 := Z^2
|
||||
gfpMul(t2, threeCurveB, t2) // t2 := 3b * t2 = 3bZ^2
|
||||
gfpMul(x3, t2, z3) // X3 := t2 * Z3
|
||||
gfpAdd(y3, t0, t2) // Y3 := t0 + t2
|
||||
gfpMul(z3, t1, z3) // Z3 := t1 * Z3
|
||||
gfpTriple(t2, t2) // t2 := t2 + t2 + t2
|
||||
gfpSub(t0, t0, t2) // t0 := t0 - t2
|
||||
gfpMul(y3, t0, y3) // Y3 := t0 * Y3
|
||||
gfpAdd(y3, x3, y3) // Y3 := X3 + Y3
|
||||
gfpMul(t1, &p.x, &p.y) // t1 := XY
|
||||
gfpMul(x3, t0, t1) // X3 := t0 * t1
|
||||
gfpDouble(x3, x3) // X3 := X3 + X3
|
||||
|
||||
c.x.Set(x3)
|
||||
c.y.Set(y3)
|
||||
c.z.Set(z3)
|
||||
}
|
||||
|
||||
func curvePointAddComplete(c, p1, p2 *curvePoint) {
|
||||
// Complete addition formula for a = 0 from "Complete addition formulas for
|
||||
// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §3.2.
|
||||
// Algorithm 7: Complete, projective point addition for prime order j-invariant 0 short Weierstrass curves.
|
||||
|
||||
t0, t1, t2, t3, t4 := new(gfP), new(gfP), new(gfP), new(gfP), new(gfP)
|
||||
x3, y3, z3 := new(gfP), new(gfP), new(gfP)
|
||||
gfpMul(t0, &p1.x, &p2.x) // t0 := X1X2
|
||||
gfpMul(t1, &p1.y, &p2.y) // t1 := Y1Y2
|
||||
gfpMul(t2, &p1.z, &p2.z) // t2 := Z1Z2
|
||||
gfpAdd(t3, &p1.x, &p1.y) // t3 := X1 + Y1
|
||||
gfpAdd(t4, &p2.x, &p2.y) // t4 := X2 + Y2
|
||||
gfpMul(t3, t3, t4) // t3 := t3 * t4 = (X1 + Y1) * (X2 + Y2)
|
||||
gfpAdd(t4, t0, t1) // t4 := t0 + t1
|
||||
gfpSub(t3, t3, t4) // t3 := t3 - t4 = X1Y2 + X2Y1
|
||||
gfpAdd(t4, &p1.y, &p1.z) // t4 := Y1 + Z1
|
||||
gfpAdd(x3, &p2.y, &p2.z) // X3 := Y2 + Z2
|
||||
gfpMul(t4, t4, x3) // t4 := t4 * X3 = (Y1 + Z1)(Y2 + Z2)
|
||||
gfpAdd(x3, t1, t2) // X3 := t1 + t2
|
||||
gfpSub(t4, t4, x3) // t4 := t4 - X3 = Y1Z2 + Y2Z1
|
||||
gfpAdd(x3, &p1.x, &p1.z) // X3 := X1 + Z1
|
||||
gfpAdd(y3, &p2.x, &p2.z) // Y3 := X2 + Z2
|
||||
gfpMul(x3, x3, y3) // X3 := X3 * Y3
|
||||
gfpAdd(y3, t0, t2) // Y3 := t0 + t2
|
||||
gfpSub(y3, x3, y3) // Y3 := X3 - Y3 = X1Z2 + X2Z1
|
||||
gfpTriple(t0, t0) // t0 := t0 + t0 + t0 = 3X1X2
|
||||
gfpMul(t2, threeCurveB, t2) // t2 := 3b * t2 = 3bZ1Z2
|
||||
gfpAdd(z3, t1, t2) // Z3 := t1 + t2 = Y1Y2 + 3bZ1Z2
|
||||
gfpSub(t1, t1, t2) // t1 := t1 - t2 = Y1Y2 - 3bZ1Z2
|
||||
gfpMul(y3, threeCurveB, y3) // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1)
|
||||
gfpMul(x3, t4, y3) // X3 := t4 * Y3 = 3b(X1Z2 + X2Z1)(Y1Z2 + Y2Z1)
|
||||
gfpMul(t2, t3, t1) // t2 := t3 * t1 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2)
|
||||
gfpSub(x3, t2, x3) // X3 := t2 - X3 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - 3b(Y1Z2 + Y2Z1)(X1Z2 + X2Z1)
|
||||
gfpMul(y3, y3, t0) // Y3 := Y3 * t0 = 9bX1X2(X1Z2 + X2Z1)
|
||||
gfpMul(t1, t1, z3) // t1 := t1 * Z3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2)
|
||||
gfpAdd(y3, t1, y3) // Y3 := t1 + Y3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + 9bX1X2(X1Z2 + X2Z1)
|
||||
gfpMul(t0, t0, t3) // t0 := t0 * t3 = 3X1X2(X1Y2 + X2Y1)
|
||||
gfpMul(z3, z3, t4) // Z3 := Z3 * t4 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2)
|
||||
gfpAdd(z3, z3, t0) // Z3 := Z3 + t0 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + 3X1X2(X1Y2 + X2Y1)
|
||||
|
||||
c.x.Set(x3)
|
||||
c.y.Set(y3)
|
||||
c.z.Set(z3)
|
||||
}
|
543
sm9/bn256/gfp2_g1_arm64.s
Normal file
543
sm9/bn256/gfp2_g1_arm64.s
Normal file
@ -0,0 +1,543 @@
|
||||
//go:build arm64 && !purego
|
||||
// +build arm64,!purego
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// Argument pointers.
#define res_ptr R0
#define a_ptr   R1
#define b_ptr   R2

// Accumulator limbs used by the multiply, square, add and subtract code.
#define acc0 R3
#define acc1 R4
#define acc2 R5
#define acc3 R6
#define acc4 R7
#define acc5 R8
#define acc6 R9
#define acc7 R10

// Short-lived temporaries.
#define t0 R11
#define t1 R12

// const0..const3 are loaded with the four words of ·p2 by gfp2Mul.
#define const0 R15
#define const1 R16
#define const2 R13
#define const3 R14

// hlp0 is scratch. hlp1 is loaded with ·np by gfp2Mul and is read by the
// reduction steps. hlp1 is the same register as res_ptr, so res_ptr does not
// hold the result pointer while hlp1 is in use.
#define hlp0 R17
#define hlp1 res_ptr

// First operand limbs (x) and second operand limbs (y), lowest limb first.
#define x0 R19
#define x1 R20
#define x2 R21
#define x3 R22
#define y0 R23
#define y1 R24
#define y2 R25
#define y3 R26
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// (x3, x2, x1, x0) = (y3, y2, y1, y0) - (x3, x2, x1, x0)
//
// When the subtraction borrows, the modulus is added back. The modulus is
// read from (const3, const2, const1, const0), which the caller must have
// loaded: arm64 arithmetic instructions cannot take a memory operand such as
// ·p2+0(SB). Overwrites acc0-acc7, t0 and the condition flags.
TEXT gfpSubInternal<>(SB),NOSPLIT,$0
	SUBS	x0, y0, acc0
	SBCS	x1, y1, acc1
	SBCS	x2, y2, acc2
	SBCS	x3, y3, acc3
	SBC	$0, ZR, t0              // t0 = 0 when there was no borrow, all ones otherwise

	// (acc7..acc4) = difference + modulus
	ADDS	const0, acc0, acc4
	ADCS	const1, acc1, acc5
	ADCS	const2, acc2, acc6
	ADC	const3, acc3, acc7

	// Keep the raw difference when there was no borrow (t0 & 1 == 0).
	ANDS	$1, t0
	CSEL	EQ, acc0, acc4, x0
	CSEL	EQ, acc1, acc5, x1
	CSEL	EQ, acc2, acc6, x2
	CSEL	EQ, acc3, acc7, x3

	RET
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// (y3, y2, y1, y0) = (x3, x2, x1, x0) * (y3, y2, y1, y0)
//
// Word-by-word multiplication interleaved with four reduction steps.
// Inputs expected from the caller:
//   hlp1                            per-word reduction multiplier (gfp2Mul loads ·np)
//   (const3, const2, const1, const0) modulus (gfp2Mul loads ·p2)
// x0-x3 are preserved. y0-y3 are consumed and replaced by the result.
// Overwrites acc0-acc7, t0, t1, hlp0 and the condition flags.
//
// Register roles: the raw product accumulates in acc4..acc7 while the
// reduction contributions rotate through acc0..acc3; the two halves are added
// together after the last reduction step.
TEXT gfpMulInternal<>(SB),NOSPLIT,$0
	// y[0] * x
	MUL	y0, x0, acc0
	UMULH	y0, x0, acc1

	MUL	y0, x1, t0
	ADDS	t0, acc1
	UMULH	y0, x1, acc2

	MUL	y0, x2, t0
	ADCS	t0, acc2
	UMULH	y0, x2, acc3

	MUL	y0, x3, t0
	ADCS	t0, acc3
	UMULH	y0, x3, acc4
	ADC	$0, acc4
	// First reduction step
	MUL	acc0, hlp1, hlp0

	MUL	const0, hlp0, t0
	ADDS	t0, acc0, acc0
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const2, hlp0, acc0

	MUL	const3, hlp0, t0
	ADCS	t0, acc3, acc3

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc4

	ADDS	t1, acc1, acc1
	ADCS	y0, acc2, acc2
	ADCS	acc0, acc3, acc3
	ADC	$0, hlp0, acc0

	// y[1] * x
	MUL	y1, x0, t0
	ADDS	t0, acc1
	UMULH	y1, x0, t1

	MUL	y1, x1, t0
	ADCS	t0, acc2
	UMULH	y1, x1, y0

	MUL	y1, x2, t0
	ADCS	t0, acc3
	UMULH	y1, x2, hlp0

	MUL	y1, x3, t0
	ADCS	t0, acc4
	UMULH	y1, x3, y1
	ADC	$0, ZR, acc5

	ADDS	t1, acc2
	ADCS	y0, acc3
	ADCS	hlp0, acc4
	ADC	y1, acc5
	// Second reduction step
	MUL	acc1, hlp1, hlp0

	MUL	const0, hlp0, t0
	ADDS	t0, acc1, acc1
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const2, hlp0, acc1

	MUL	const3, hlp0, t0
	ADCS	t0, acc0, acc0

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc5

	ADDS	t1, acc2, acc2
	ADCS	y0, acc3, acc3
	ADCS	acc1, acc0, acc0
	ADC	$0, hlp0, acc1

	// y[2] * x
	MUL	y2, x0, t0
	ADDS	t0, acc2
	UMULH	y2, x0, t1

	MUL	y2, x1, t0
	ADCS	t0, acc3
	UMULH	y2, x1, y0

	MUL	y2, x2, t0
	ADCS	t0, acc4
	UMULH	y2, x2, y1

	MUL	y2, x3, t0
	ADCS	t0, acc5
	UMULH	y2, x3, hlp0
	ADC	$0, ZR, acc6

	ADDS	t1, acc3
	ADCS	y0, acc4
	ADCS	y1, acc5
	ADC	hlp0, acc6
	// Third reduction step
	MUL	acc2, hlp1, hlp0

	MUL	const0, hlp0, t0
	ADDS	t0, acc2, acc2
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const2, hlp0, acc2

	MUL	const3, hlp0, t0
	ADCS	t0, acc1, acc1

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc6

	ADDS	t1, acc3, acc3
	ADCS	y0, acc0, acc0
	ADCS	acc2, acc1, acc1
	ADC	$0, hlp0, acc2
	// y[3] * x
	MUL	y3, x0, t0
	ADDS	t0, acc3
	UMULH	y3, x0, t1

	MUL	y3, x1, t0
	ADCS	t0, acc4
	UMULH	y3, x1, y0

	MUL	y3, x2, t0
	ADCS	t0, acc5
	UMULH	y3, x2, y1

	MUL	y3, x3, t0
	ADCS	t0, acc6
	UMULH	y3, x3, hlp0
	ADC	$0, ZR, acc7

	ADDS	t1, acc4
	ADCS	y0, acc5
	ADCS	y1, acc6
	ADC	hlp0, acc7
	// Last reduction step
	MUL	acc3, hlp1, hlp0

	MUL	const0, hlp0, t0
	ADDS	t0, acc3, acc3
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const2, hlp0, acc3

	MUL	const3, hlp0, t0
	ADCS	t0, acc2, acc2

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc7

	// Fold in the high words of hlp0 * modulus, exactly as the first three
	// reduction steps do. Without this, acc3 would still hold the high word
	// of const2 * hlp0 and t1, y0, hlp0 would be dropped.
	ADDS	t1, acc0, acc0
	ADCS	y0, acc1, acc1
	ADCS	acc3, acc2, acc2
	ADC	$0, hlp0, acc3
	// Add bits [511:256] of the mul result
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// Final conditional subtraction of the modulus.
	SUBS	const0, acc0, t0
	SBCS	const1, acc1, t1
	SBCS	const2, acc2, acc6
	SBCS	const3, acc3, acc7
	SBCS	$0, acc4, acc4

	// Carry set means no borrow: take the subtracted value.
	CSEL	CS, t0, acc0, y0
	CSEL	CS, t1, acc1, y1
	CSEL	CS, acc6, acc2, y2
	CSEL	CS, acc7, acc3, y3

	RET
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// (y3, y2, y1, y0) = (x3, x2, x1, x0) ^ 2
//
// Builds the full eight-limb square in acc0..acc7 (off-diagonal products,
// doubled, plus the diagonal squares), runs four reduction steps on the low
// half, adds the high half and finishes with one conditional subtraction.
// Inputs expected from the caller:
//   hlp1                            per-word reduction multiplier (gfp2Mul loads ·np)
//   (const3, const2, const1, const0) modulus (gfp2Mul loads ·p2)
// x0-x3 are preserved. Overwrites y0-y3 (result), acc0-acc7, t0, t1, hlp0 and
// the condition flags.
TEXT gfpSqrInternal<>(SB),NOSPLIT,$0
	// Off-diagonal products with x[0]: x[1]*x[0], x[2]*x[0], x[3]*x[0]
	MUL	x0, x1, acc1
	UMULH	x0, x1, acc2

	MUL	x0, x2, t0
	ADDS	t0, acc2, acc2
	UMULH	x0, x2, acc3

	MUL	x0, x3, t0
	ADCS	t0, acc3, acc3
	UMULH	x0, x3, acc4
	ADC	ZR, acc4, acc4

	// Off-diagonal products with x[1]: x[2]*x[1], x[3]*x[1]
	MUL	x1, x2, t0
	ADDS	t0, acc3, acc3
	UMULH	x1, x2, t1
	ADCS	t1, acc4, acc4
	ADC	ZR, ZR, acc5

	MUL	x1, x3, t0
	ADDS	t0, acc4, acc4
	UMULH	x1, x3, t1
	ADC	t1, acc5, acc5

	// Off-diagonal product x[3]*x[2]
	MUL	x2, x3, t0
	ADDS	t0, acc5, acc5
	UMULH	x2, x3, acc6
	ADC	ZR, acc6, acc6

	MOVD	ZR, acc7

	// Double the off-diagonal sum
	ADDS	acc1, acc1, acc1
	ADCS	acc2, acc2, acc2
	ADCS	acc3, acc3, acc3
	ADCS	acc4, acc4, acc4
	ADCS	acc5, acc5, acc5
	ADCS	acc6, acc6, acc6
	ADC	ZR, acc7, acc7

	// Diagonal squares x[i]*x[i]
	MUL	x0, x0, acc0
	UMULH	x0, x0, t0
	ADDS	t0, acc1, acc1

	MUL	x1, x1, t0
	ADCS	t0, acc2, acc2
	UMULH	x1, x1, t1
	ADCS	t1, acc3, acc3

	MUL	x2, x2, t0
	ADCS	t0, acc4, acc4
	UMULH	x2, x2, t1
	ADCS	t1, acc5, acc5

	MUL	x3, x3, t0
	ADCS	t0, acc6, acc6
	UMULH	x3, x3, t1
	ADCS	t1, acc7, acc7

	// Reduction 1 of 4: clears acc0
	MUL	acc0, hlp1, hlp0

	MUL	const0, hlp0, t0
	ADDS	t0, acc0, acc0
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const2, hlp0, acc0

	MUL	const3, hlp0, t0
	ADCS	t0, acc3, acc3

	UMULH	const3, hlp0, hlp0
	ADC	ZR, hlp0, hlp0

	ADDS	t1, acc1, acc1
	ADCS	y0, acc2, acc2
	ADCS	acc0, acc3, acc3
	ADC	ZR, hlp0, acc0

	// Reduction 2 of 4: clears acc1
	MUL	acc1, hlp1, hlp0

	MUL	const0, hlp0, t0
	ADDS	t0, acc1, acc1
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const2, hlp0, acc1

	MUL	const3, hlp0, t0
	ADCS	t0, acc0, acc0

	UMULH	const3, hlp0, hlp0
	ADC	ZR, hlp0, hlp0

	ADDS	t1, acc2, acc2
	ADCS	y0, acc3, acc3
	ADCS	acc1, acc0, acc0
	ADC	ZR, hlp0, acc1

	// Reduction 3 of 4: clears acc2
	MUL	acc2, hlp1, hlp0

	MUL	const0, hlp0, t0
	ADDS	t0, acc2, acc2
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const2, hlp0, acc2

	MUL	const3, hlp0, t0
	ADCS	t0, acc1, acc1

	UMULH	const3, hlp0, hlp0
	ADC	ZR, hlp0, hlp0

	ADDS	t1, acc3, acc3
	ADCS	y0, acc0, acc0
	ADCS	acc2, acc1, acc1
	ADC	ZR, hlp0, acc2

	// Reduction 4 of 4: clears acc3. The carry out of the low-word chain is
	// added to acc7 here rather than to hlp0.
	MUL	acc3, hlp1, hlp0

	MUL	const0, hlp0, t0
	ADDS	t0, acc3, acc3
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const2, hlp0, acc3

	MUL	const3, hlp0, t0
	ADCS	t0, acc2, acc2

	UMULH	const3, hlp0, hlp0
	ADC	ZR, acc7, acc7

	ADDS	t1, acc0, acc0
	ADCS	y0, acc1, acc1
	ADCS	acc3, acc2, acc2
	ADC	ZR, hlp0, acc3

	// Add the upper half (bits [511:256]) of the square
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	ZR, ZR, acc4

	// Trial subtraction of the modulus
	SUBS	const0, acc0, t0
	SBCS	const1, acc1, t1
	SBCS	const2, acc2, acc6
	SBCS	const3, acc3, acc7
	SBCS	ZR, acc4, acc4

	// Carry set means no borrow: take the subtracted value
	CSEL	CS, t0, acc0, y0
	CSEL	CS, t1, acc1, y1
	CSEL	CS, acc6, acc2, y2
	CSEL	CS, acc7, acc3, y3
	RET
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// (x3, x2, x1, x0) = 2(y3, y2, y1, y0)
//
// Doubles y into x, then subtracts the modulus and keeps the subtracted value
// unless that subtraction borrows. The modulus is read from
// (const3, const2, const1, const0), which the caller must have loaded: arm64
// arithmetic instructions cannot take a memory operand such as ·p2+0(SB).
// Overwrites acc0-acc3, hlp0 and the condition flags.
#define gfpMulBy2Inline \
	ADDS	y0, y0, x0; \
	ADCS	y1, y1, x1; \
	ADCS	y2, y2, x2; \
	ADCS	y3, y3, x3; \
	ADC	$0, ZR, hlp0; \
	SUBS	const0, x0, acc0; \
	SBCS	const1, x1, acc1; \
	SBCS	const2, x2, acc2; \
	SBCS	const3, x3, acc3; \
	SBCS	$0, hlp0, hlp0; \
	CSEL	CC, x0, acc0, x0; \
	CSEL	CC, x1, acc1, x1; \
	CSEL	CC, x2, acc2, x2; \
	CSEL	CC, x3, acc3, x3;
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// (x3, x2, x1, x0) = (x3, x2, x1, x0) + (y3, y2, y1, y0)
//
// Adds y into x, then subtracts the modulus and keeps the subtracted value
// unless that subtraction borrows. The modulus is read from
// (const3, const2, const1, const0), which the caller must have loaded: arm64
// arithmetic instructions cannot take a memory operand such as ·p2+0(SB).
// Overwrites acc0-acc3, hlp0 and the condition flags; y0-y3 are preserved.
#define gfpAddInline \
	ADDS	y0, x0, x0; \
	ADCS	y1, x1, x1; \
	ADCS	y2, x2, x2; \
	ADCS	y3, x3, x3; \
	ADC	$0, ZR, hlp0; \
	SUBS	const0, x0, acc0; \
	SBCS	const1, x1, acc1; \
	SBCS	const2, x2, acc2; \
	SBCS	const3, x3, acc3; \
	SBCS	$0, hlp0, hlp0; \
	CSEL	CC, x0, acc0, x0; \
	CSEL	CC, x1, acc1, x1; \
	CSEL	CC, x2, acc2, x2; \
	CSEL	CC, x3, acc3, x3;
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// Operand addressing. Each field element is 32 bytes; the second and third
// elements of a value sit at offsets 32 and 64.
#define x1in(off) (off)(a_ptr)
#define y1in(off) (off + 32)(a_ptr)
#define z1in(off) (off + 64)(a_ptr)
#define x2in(off) (off)(b_ptr)
#define y2in(off) (off + 32)(b_ptr)
#define z2in(off) (off + 64)(b_ptr)
#define x3out(off) (off)(res_ptr)
#define y3out(off) (off + 32)(res_ptr)
#define z3out(off) (off + 64)(res_ptr)

// Load/store a four-limb value between memory and the x or y register group.
#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
#define STx(dst) STP (x0, x1), dst(0); STP (x2, x3), dst(16)
#define STy(dst) STP (y0, y1), dst(0); STP (y2, y3), dst(16)
// Register-group copies.
#define y2x MOVD y0, x0; MOVD y1, x1; MOVD y2, x2; MOVD y3, x3
#define x2y MOVD x0, y0; MOVD x1, y1; MOVD x2, y2; MOVD x3, y3

/* ---------------------------------------*/
// Three 32-byte scratch slots in the local frame, starting at 8(RSP).
#define tmp0(off) (32*0 + 8 + off)(RSP)
#define tmp1(off) (32*1 + 8 + off)(RSP)
#define tmp2(off) (32*2 + 8 + off)(RSP)

// func gfp2Mul(c, a, b *gfP2)
//
// Computes
//   c.x = (a.x+a.y)*(b.x+b.y) - a.y*b.y - a.x*b.x
//   c.y = a.y*b.y - 2*a.x*b.x
// c is written only after the last read of a and b.
TEXT ·gfp2Mul(SB),NOSPLIT,$104-24
	MOVD	a+8(FP), a_ptr
	MOVD	b+16(FP), b_ptr

	// hlp1 occupies the same register as res_ptr, so the result pointer is
	// loaded only after the last multiplication (see below).
	MOVD	·np+0x00(SB), hlp1
	LDP	·p2+0x00(SB), (const0, const1)
	LDP	·p2+0x10(SB), (const2, const3)

	// tmp0 = a.y * b.y
	LDx (y1in)
	LDy (y2in)
	CALL	gfpMulInternal<>(SB)
	STy (tmp0)

	// tmp1 = a.x * b.x
	LDx (x1in)
	LDy (x2in)
	CALL	gfpMulInternal<>(SB)
	STy (tmp1)

	// tmp2 = a.x + a.y
	LDx (x1in)
	LDy (y1in)
	gfpAddInline
	STx (tmp2)

	// y = (b.x + b.y) * (a.x + a.y)
	LDx (x2in)
	LDy (y2in)
	gfpAddInline
	LDy (tmp2)
	CALL	gfpMulInternal<>(SB)

	// No reduction step runs after this point, so hlp1 is dead and the
	// shared register can hold the result pointer again.
	MOVD	c+0(FP), res_ptr

	// c.x = y - tmp0 - tmp1
	LDx (tmp0)
	CALL	gfpSubInternal<>(SB)
	x2y
	LDx (tmp1)
	CALL	gfpSubInternal<>(SB)
	STx (x3out)

	// c.y = tmp0 - 2*tmp1
	LDy (tmp1)
	gfpMulBy2Inline
	LDy (tmp0)
	CALL	gfpSubInternal<>(SB)
	STx (y3out)

	RET
|
Loading…
x
Reference in New Issue
Block a user