// Origin: gmsm/sm9/bn256/gfp2_g1_arm64.s (Go assembler, arm64)

//go:build arm64 && !purego
// +build arm64,!purego
#include "textflag.h"
// Register aliases shared by all routines in this file.
//
// res_ptr/a_ptr/b_ptr hold the pointer arguments of the exported routines.
#define res_ptr R0
#define a_ptr R1
#define b_ptr R2
// acc0..acc7 are accumulator words; t0/t1 are scratch.
#define acc0 R3
#define acc1 R4
#define acc2 R5
#define acc3 R6
#define acc4 R7
#define acc5 R8
#define acc6 R9
#define acc7 R10
#define t0 R11
#define t1 R12
// const0..const3 receive the four words loaded from ·p2 by the exported
// routines; const0 is the word that starts every carry chain.
#define const0 R13
#define const1 R14
#define const2 R15
#define const3 R16
#define hlp0 R17
// hlp1 shares R0 with res_ptr. The exported routines load the word at ·np into
// it, so R0 cannot hold the result pointer while gfpMulInternal or
// gfpSqrInternal still need hlp1.
#define hlp1 res_ptr
// x0..x3 and y0..y3 are the two 256-bit operands; x0/y0 are the words at the
// lowest address and the first ones processed in each carry chain.
#define x0 R19
#define x1 R20
#define x2 R21
#define x3 R22
#define y0 R23
#define y1 R24
#define y2 R25
#define y3 R26
/* ---------------------------------------*/
// gfpSubInternal: (x3, x2, x1, x0) = (y3, y2, y1, y0) - (x3, x2, x1, x0)
//
// The raw 256-bit difference is computed first. If that subtraction borrowed,
// the difference with (const3, const2, const1, const0) added back is returned
// instead.
//
// In:       x0-x3, y0-y3, const0-const3
// Out:      x0-x3
// Clobbers: acc0-acc7, t0, condition flags (y0-y3 are not written)
TEXT gfpSubInternal(SB),NOSPLIT,$0
	// acc3..acc0 = y - x; t0 = 0 - borrow, i.e. non-zero only when it borrowed.
	SUBS x0, y0, acc0
	SBCS x1, y1, acc1
	SBCS x2, y2, acc2
	SBCS x3, y3, acc3
	SBC $0, ZR, t0

	// acc7..acc4 = (y - x) + const: the candidate used when a borrow occurred.
	ADDS const0, acc0, acc4
	ADCS const1, acc1, acc5
	ADCS const2, acc2, acc6
	ADC const3, acc3, acc7

	// EQ (bit 0 of t0 clear) means no borrow: keep the raw difference.
	ANDS $1, t0
	CSEL EQ, acc0, acc4, x0
	CSEL EQ, acc1, acc5, x1
	CSEL EQ, acc2, acc6, x2
	CSEL EQ, acc3, acc7, x3
	RET
/* ---------------------------------------*/
// gfpMulInternal: (y3, y2, y1, y0) = (x3, x2, x1, x0) * (y3, y2, y1, y0)
//
// The product is built one word of y at a time, and each partial product is
// followed by a reduction step. A reduction step multiplies the lowest live
// accumulator word by hlp1 and adds that multiple of (const3..const0), which
// clears the lowest word so it can be dropped. The register freed this way is
// reused for the new top word, so the low accumulator rotates through
// acc0..acc3. A final conditional subtraction of const produces the result.
// NOTE(review): this has the shape of word-wise Montgomery multiplication and
// presumably expects hlp1 = -const^-1 mod 2^64 (callers load it from ·np);
// the value itself is not visible in this file - confirm.
//
// In:       x0-x3, y0-y3, const0-const3, hlp1
// Out:      y0-y3
// Clobbers: acc0-acc7, t0, t1, hlp0, condition flags (x0-x3 are not written)
TEXT gfpMulInternal(SB),NOSPLIT,$0
	// y[0] * x
	MUL y0, x0, acc0
	UMULH y0, x0, acc1

	MUL y0, x1, t0
	ADDS t0, acc1
	UMULH y0, x1, acc2

	MUL y0, x2, t0
	ADCS t0, acc2
	UMULH y0, x2, acc3

	MUL y0, x3, t0
	ADCS t0, acc3
	UMULH y0, x3, acc4
	ADC $0, acc4

	// First reduction step (y0 is free from here on and serves as scratch)
	MUL acc0, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc0, acc0
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc1, acc1
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc2, acc2
	UMULH const2, hlp0, acc0

	MUL const3, hlp0, t0
	ADCS t0, acc3, acc3
	UMULH const3, hlp0, hlp0
	ADC $0, acc4

	ADDS t1, acc1, acc1
	ADCS y0, acc2, acc2
	ADCS acc0, acc3, acc3
	ADC $0, hlp0, acc0

	// y[1] * x
	MUL y1, x0, t0
	ADDS t0, acc1
	UMULH y1, x0, t1

	MUL y1, x1, t0
	ADCS t0, acc2
	UMULH y1, x1, y0

	MUL y1, x2, t0
	ADCS t0, acc3
	UMULH y1, x2, hlp0

	MUL y1, x3, t0
	ADCS t0, acc4
	UMULH y1, x3, y1
	ADC $0, ZR, acc5

	ADDS t1, acc2
	ADCS y0, acc3
	ADCS hlp0, acc4
	ADC y1, acc5

	// Second reduction step
	MUL acc1, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc1, acc1
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc2, acc2
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc3, acc3
	UMULH const2, hlp0, acc1

	MUL const3, hlp0, t0
	ADCS t0, acc0, acc0
	UMULH const3, hlp0, hlp0
	ADC $0, acc5

	ADDS t1, acc2, acc2
	ADCS y0, acc3, acc3
	ADCS acc1, acc0, acc0
	ADC $0, hlp0, acc1

	// y[2] * x
	MUL y2, x0, t0
	ADDS t0, acc2
	UMULH y2, x0, t1

	MUL y2, x1, t0
	ADCS t0, acc3
	UMULH y2, x1, y0

	MUL y2, x2, t0
	ADCS t0, acc4
	UMULH y2, x2, y1

	MUL y2, x3, t0
	ADCS t0, acc5
	UMULH y2, x3, hlp0
	ADC $0, ZR, acc6

	ADDS t1, acc3
	ADCS y0, acc4
	ADCS y1, acc5
	ADC hlp0, acc6

	// Third reduction step
	MUL acc2, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc2, acc2
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc3, acc3
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc0, acc0
	UMULH const2, hlp0, acc2

	MUL const3, hlp0, t0
	ADCS t0, acc1, acc1
	UMULH const3, hlp0, hlp0
	ADC $0, acc6

	ADDS t1, acc3, acc3
	ADCS y0, acc0, acc0
	ADCS acc2, acc1, acc1
	ADC $0, hlp0, acc2

	// y[3] * x
	MUL y3, x0, t0
	ADDS t0, acc3
	UMULH y3, x0, t1

	MUL y3, x1, t0
	ADCS t0, acc4
	UMULH y3, x1, y0

	MUL y3, x2, t0
	ADCS t0, acc5
	UMULH y3, x2, y1

	MUL y3, x3, t0
	ADCS t0, acc6
	UMULH y3, x3, hlp0
	ADC $0, ZR, acc7

	ADDS t1, acc4
	ADCS y0, acc5
	ADCS y1, acc6
	ADC hlp0, acc7

	// Last reduction step
	MUL acc3, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc3, acc3
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc0, acc0
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc1, acc1
	UMULH const2, hlp0, acc3

	MUL const3, hlp0, t0
	ADCS t0, acc2, acc2
	UMULH const3, hlp0, hlp0
	ADC $0, acc7

	ADDS t1, acc0, acc0
	ADCS y0, acc1, acc1
	ADCS acc3, acc2, acc2
	ADC $0, hlp0, acc3

	// Add bits [511:256] of the mul result
	ADDS acc4, acc0, acc0
	ADCS acc5, acc1, acc1
	ADCS acc6, acc2, acc2
	ADCS acc7, acc3, acc3
	ADC $0, ZR, acc4

	// Subtract const once; keep the difference only if it did not borrow (CS).
	SUBS const0, acc0, t0
	SBCS const1, acc1, t1
	SBCS const2, acc2, acc6
	SBCS const3, acc3, acc7
	SBCS $0, acc4, acc4

	CSEL CS, t0, acc0, y0
	CSEL CS, t1, acc1, y1
	CSEL CS, acc6, acc2, y2
	CSEL CS, acc7, acc3, y3
	RET
/* ---------------------------------------*/
// gfpSqrInternal: (y3, y2, y1, y0) = (x3, x2, x1, x0) ^ 2
//
// The full 512-bit square is formed in acc7..acc0: the off-diagonal products
// are summed once and doubled, then the diagonal products x[i]*x[i] are added.
// Four reduction steps then clear acc0..acc3 in turn (same step shape as in
// gfpMulInternal), the upper half acc7..acc4 is added, and const is
// conditionally subtracted once.
//
// In:       x0-x3, const0-const3, hlp1
// Out:      y0-y3
// Clobbers: acc0-acc7, t0, t1, hlp0, condition flags (x0-x3 are not written)
TEXT gfpSqrInternal(SB),NOSPLIT,$0
	// x[1:] * x[0]
	MUL x0, x1, acc1
	UMULH x0, x1, acc2

	MUL x0, x2, t0
	ADDS t0, acc2, acc2
	UMULH x0, x2, acc3

	MUL x0, x3, t0
	ADCS t0, acc3, acc3
	UMULH x0, x3, acc4
	ADC $0, acc4, acc4

	// x[2:] * x[1]
	MUL x1, x2, t0
	ADDS t0, acc3
	UMULH x1, x2, t1
	ADCS t1, acc4
	ADC $0, ZR, acc5

	MUL x1, x3, t0
	ADDS t0, acc4
	UMULH x1, x3, t1
	ADC t1, acc5

	// x[3] * x[2]
	MUL x2, x3, t0
	ADDS t0, acc5
	UMULH x2, x3, acc6
	ADC $0, acc6

	MOVD $0, acc7
	// *2
	ADDS acc1, acc1
	ADCS acc2, acc2
	ADCS acc3, acc3
	ADCS acc4, acc4
	ADCS acc5, acc5
	ADCS acc6, acc6
	ADC $0, acc7

	// Missing products
	MUL x0, x0, acc0
	UMULH x0, x0, t0
	ADDS t0, acc1, acc1

	MUL x1, x1, t0
	ADCS t0, acc2, acc2
	UMULH x1, x1, t1
	ADCS t1, acc3, acc3

	MUL x2, x2, t0
	ADCS t0, acc4, acc4
	UMULH x2, x2, t1
	ADCS t1, acc5, acc5

	MUL x3, x3, t0
	ADCS t0, acc6, acc6
	UMULH x3, x3, t1
	ADCS t1, acc7, acc7

	// First reduction step
	MUL acc0, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc0, acc0
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc1, acc1
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc2, acc2
	UMULH const2, hlp0, acc0

	MUL const3, hlp0, t0
	ADCS t0, acc3, acc3
	UMULH const3, hlp0, hlp0
	ADC $0, hlp0

	ADDS t1, acc1, acc1
	ADCS y0, acc2, acc2
	ADCS acc0, acc3, acc3
	ADC $0, hlp0, acc0

	// Second reduction step
	MUL acc1, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc1, acc1
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc2, acc2
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc3, acc3
	UMULH const2, hlp0, acc1

	MUL const3, hlp0, t0
	ADCS t0, acc0, acc0
	UMULH const3, hlp0, hlp0
	ADC $0, hlp0

	ADDS t1, acc2, acc2
	ADCS y0, acc3, acc3
	ADCS acc1, acc0, acc0
	ADC $0, hlp0, acc1

	// Third reduction step
	MUL acc2, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc2, acc2
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc3, acc3
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc0, acc0
	UMULH const2, hlp0, acc2

	MUL const3, hlp0, t0
	ADCS t0, acc1, acc1
	UMULH const3, hlp0, hlp0
	ADC $0, hlp0

	ADDS t1, acc3, acc3
	ADCS y0, acc0, acc0
	ADCS acc2, acc1, acc1
	ADC $0, hlp0, acc2

	// Last reduction step
	MUL acc3, hlp1, hlp0

	MUL const0, hlp0, t0
	ADDS t0, acc3, acc3
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc0, acc0
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc1, acc1
	UMULH const2, hlp0, acc3

	MUL const3, hlp0, t0
	ADCS t0, acc2, acc2
	UMULH const3, hlp0, hlp0
	// Unlike the earlier steps, this carry is folded into acc7 rather than
	// hlp0. Both end up summed into acc3 below, so the weight is the same.
	ADC $0, acc7

	ADDS t1, acc0, acc0
	ADCS y0, acc1, acc1
	ADCS acc3, acc2, acc2
	ADC $0, hlp0, acc3

	// Add bits [511:256] of the sqr result
	ADDS acc4, acc0, acc0
	ADCS acc5, acc1, acc1
	ADCS acc6, acc2, acc2
	ADCS acc7, acc3, acc3
	ADC $0, ZR, acc4

	// Subtract const once; keep the difference only if it did not borrow (CS).
	SUBS const0, acc0, t0
	SBCS const1, acc1, t1
	SBCS const2, acc2, acc6
	SBCS const3, acc3, acc7
	SBCS $0, acc4, acc4

	CSEL CS, t0, acc0, y0
	CSEL CS, t1, acc1, y1
	CSEL CS, acc6, acc2, y2
	CSEL CS, acc7, acc3, y3
	RET
/* ---------------------------------------*/
// gfpMulBy2Inline: (x3, x2, x1, x0) = 2(y3, y2, y1, y0)
//
// Doubles y into x with the carry-out kept in hlp0, then subtracts const once
// and keeps the difference unless that subtraction (including the carry word)
// borrowed.
//
// In:       y0-y3, const0-const3
// Out:      x0-x3
// Clobbers: acc0-acc3, hlp0, condition flags (y0-y3 are not written)
//
// Every line of the macro body must end in a backslash continuation, so no
// other text may appear between the lines.
#define gfpMulBy2Inline \
	ADDS y0, y0, x0; \
	ADCS y1, y1, x1; \
	ADCS y2, y2, x2; \
	ADCS y3, y3, x3; \
	ADC $0, ZR, hlp0; \
	SUBS const0, x0, acc0; \
	SBCS const1, x1, acc1; \
	SBCS const2, x2, acc2; \
	SBCS const3, x3, acc3; \
	SBCS $0, hlp0, hlp0; \
	CSEL CC, x0, acc0, x0; \
	CSEL CC, x1, acc1, x1; \
	CSEL CC, x2, acc2, x2; \
	CSEL CC, x3, acc3, x3;
/* ---------------------------------------*/
// gfpAddInline: (x3, x2, x1, x0) = (x3, x2, x1, x0) + (y3, y2, y1, y0)
//
// Adds y into x with the carry-out kept in hlp0, then subtracts const once and
// keeps the difference unless that subtraction (including the carry word)
// borrowed.
//
// In:       x0-x3, y0-y3, const0-const3
// Out:      x0-x3
// Clobbers: acc0-acc3, hlp0, condition flags (y0-y3 are not written)
//
// Every line of the macro body must end in a backslash continuation, so no
// other text may appear between the lines.
#define gfpAddInline \
	ADDS y0, x0, x0; \
	ADCS y1, x1, x1; \
	ADCS y2, x2, x2; \
	ADCS y3, x3, x3; \
	ADC $0, ZR, hlp0; \
	SUBS const0, x0, acc0; \
	SBCS const1, x1, acc1; \
	SBCS const2, x2, acc2; \
	SBCS const3, x3, acc3; \
	SBCS $0, hlp0, hlp0; \
	CSEL CC, x0, acc0, x0; \
	CSEL CC, x1, acc1, x1; \
	CSEL CC, x2, acc2, x2; \
	CSEL CC, x3, acc3, x3;
/* ---------------------------------------*/
// Operand addressing. Each macro takes a byte offset and yields a memory
// operand inside one 32-byte slot: slots 0/1/2 of the first input (a_ptr),
// of the second input (b_ptr) and of the output (res_ptr).
#define x1in(off) (off)(a_ptr)
#define y1in(off) (off + 32)(a_ptr)
#define z1in(off) (off + 64)(a_ptr)
#define x2in(off) (off)(b_ptr)
#define y2in(off) (off + 32)(b_ptr)
#define z2in(off) (off + 64)(b_ptr)
#define x3out(off) (off)(res_ptr)
#define y3out(off) (off + 32)(res_ptr)
#define z3out(off) (off + 64)(res_ptr)

// LDx/LDy load a 32-byte slot into x0-x3 / y0-y3; STx/STy store them back.
// The argument is the name of one of the addressing macros (x1in, tmp0, ...).
#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
#define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
#define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)

// Register-to-register copies between the two operand banks.
#define y2x MOVD y0, x0; MOVD y1, x1; MOVD y2, x2; MOVD y3, x3
#define x2y MOVD x0, y0; MOVD x1, y1; MOVD x2, y2; MOVD x3, y3
/* ---------------------------------------*/
// Stack scratch slots of 32 bytes each, addressed relative to RSP; slot k
// starts at byte 32*k + 8.
// NOTE(review): the 8-byte skew presumably steps over the saved link register
// that the Go arm64 toolchain keeps at 0(RSP) in non-leaf frames - confirm.
#define tmp0(off) (32*0 + 8 + off)(RSP)
#define tmp1(off) (32*1 + 8 + off)(RSP)
#define tmp2(off) (32*2 + 8 + off)(RSP)
// func gfp2Mul(c, a, b *gfP2)
//
// With ax/ay the 32-byte values at a+0/a+32 and bx/by those at b+0/b+32, and
// "*" meaning a gfpMulInternal product, this writes
//   c+0  = (ax + ay)*(bx + by) - ay*by - ax*bx
//   c+32 = ay*by - 2*(ax*bx)
// Stack slots: tmp0 = ay*by, tmp1 = ax*bx, tmp2 = ax + ay.
// Both outputs are stored only after the last read of a and b.
// NOTE(review): the frame references are named res/in1/in2 while the signature
// comment names the parameters c/a/b; go vet's asmdecl check may flag that -
// confirm against the Go declaration.
TEXT ·gfp2Mul(SB),NOSPLIT,$104-24
	MOVD in1+8(FP), a_ptr
	MOVD in2+16(FP), b_ptr

	MOVD ·np+0x00(SB), hlp1
	LDP ·p2+0x00(SB), (const0, const1)
	LDP ·p2+0x10(SB), (const2, const3)

	LDx (y1in)
	LDy (y2in)
	CALL gfpMulInternal(SB)
	STy (tmp0)                     // tmp0 = ay*by

	LDx (x1in)
	LDy (x2in)
	CALL gfpMulInternal(SB)
	STy (tmp1)                     // tmp1 = ax*bx

	LDx (x1in)
	LDy (y1in)
	gfpAddInline
	STx (tmp2)                     // tmp2 = ax + ay

	LDx (x2in)
	LDy (y2in)
	gfpAddInline                   // x = bx + by
	LDy (tmp2)
	CALL gfpMulInternal(SB)        // y = (ax + ay)*(bx + by)

	LDx (tmp0)
	CALL gfpSubInternal(SB)        // x = y - tmp0
	x2y
	LDx (tmp1)
	CALL gfpSubInternal(SB)        // x = y - tmp1

	MOVD res+0(FP), res_ptr        // not use hlp1 any more
	STx (x3out)

	LDy (tmp1)
	gfpMulBy2Inline                // x = 2*tmp1
	LDy (tmp0)
	CALL gfpSubInternal(SB)        // x = tmp0 - 2*tmp1
	STx (y3out)
	RET
// func gfp2MulU(c, a, b *gfP2)
//
// Computes the same two intermediate values as gfp2Mul but stores them
// differently. With ax/ay the 32-byte values at a+0/a+32, bx/by those at
// b+0/b+32, and "*" meaning a gfpMulInternal product:
//   s    = (ax + ay)*(bx + by) - ay*by - ax*bx
//   c+32 = 0 - 2*s
//   c+0  = ay*by - 2*(ax*bx)
// Stack slots: tmp0 = ay*by, tmp1 = ax*bx, tmp2 = ax + ay.
TEXT ·gfp2MulU(SB),NOSPLIT,$104-24
	MOVD in1+8(FP), a_ptr
	MOVD in2+16(FP), b_ptr

	MOVD ·np+0x00(SB), hlp1
	LDP ·p2+0x00(SB), (const0, const1)
	LDP ·p2+0x10(SB), (const2, const3)

	LDx (y1in)
	LDy (y2in)
	CALL gfpMulInternal(SB)
	STy (tmp0)                     // tmp0 = ay*by

	LDx (x1in)
	LDy (x2in)
	CALL gfpMulInternal(SB)
	STy (tmp1)                     // tmp1 = ax*bx

	LDx (x1in)
	LDy (y1in)
	gfpAddInline
	STx (tmp2)                     // tmp2 = ax + ay

	LDx (x2in)
	LDy (y2in)
	gfpAddInline                   // x = bx + by
	LDy (tmp2)
	CALL gfpMulInternal(SB)        // y = (ax + ay)*(bx + by)

	LDx (tmp0)
	CALL gfpSubInternal(SB)        // x = y - tmp0
	x2y
	LDx (tmp1)
	CALL gfpSubInternal(SB)        // x = s
	x2y
	gfpMulBy2Inline                // x = 2*s
	MOVD $0, y0
	MOVD $0, y1
	MOVD $0, y2
	MOVD $0, y3
	CALL gfpSubInternal(SB)        // x = 0 - 2*s

	MOVD res+0(FP), res_ptr        // not use hlp1 any more
	STx (y3out)

	LDy (tmp1)
	gfpMulBy2Inline                // x = 2*tmp1
	LDy (tmp0)
	CALL gfpSubInternal(SB)        // x = tmp0 - 2*tmp1
	STx (x3out)
	RET
// func gfp2Square(c, a *gfP2)
//
// With ax/ay the 32-byte values at a+0/a+32 and "*" meaning a gfpMulInternal
// product, this writes
//   c+32 = (ay - 2*ax)*(ax + ay) + ax*ay   (expands to ay*ay - 2*(ax*ax))
//   c+0  = 2*(ax*ay)
// The result pointer is kept in b_ptr because R0 (res_ptr) doubles as hlp1.
// Both stores happen after the last read of a.
// NOTE(review): the frame is declared as $72 although only tmp0 is used here
// (the tmp1 accesses are commented out) - harmless, but confirm it is intended.
TEXT ·gfp2Square(SB),NOSPLIT,$72-16
MOVD res+0(FP), b_ptr
MOVD in1+8(FP), a_ptr
MOVD ·np+0x00(SB), hlp1
LDP ·p2+0x00(SB), (const0, const1)
LDP ·p2+0x10(SB), (const2, const3)
LDx (y1in)
LDy (x1in)
gfpAddInline
// tmp0 = ax + ay
STx (tmp0)
// x = 2*ax (y still holds ax)
gfpMulBy2Inline
LDy (y1in)
// x = ay - 2*ax
CALL gfpSubInternal(SB)
LDy (tmp0)
// y = (ay - 2*ax)*(ax + ay)
CALL gfpMulInternal(SB)
STy (tmp0)
LDx (y1in)
LDy (x1in)
// y = ax*ay
CALL gfpMulInternal(SB)
//STy (tmp1)
LDx (tmp0)
// x = tmp0 + ax*ay
gfpAddInline
STx (y2in)
//LDy (tmp1)
// x = 2*(ax*ay); y was left untouched by gfpAddInline
gfpMulBy2Inline
STx (x2in)
RET
// func gfp2SquareU(c, a *gfP2)
//
// Computes the same two intermediate values as gfp2Square but stores them
// differently. With ax/ay the 32-byte values at a+0/a+32 and "*" meaning a
// gfpMulInternal product:
//   c+0  = (ay - 2*ax)*(ax + ay) + ax*ay
//   c+32 = 0 - 4*(ax*ay)
// The result pointer is kept in b_ptr because R0 (res_ptr) doubles as hlp1.
// Both stores happen after the last read of a.
TEXT ·gfp2SquareU(SB),NOSPLIT,$72-16
MOVD res+0(FP), b_ptr
MOVD in1+8(FP), a_ptr
MOVD ·np+0x00(SB), hlp1
LDP ·p2+0x00(SB), (const0, const1)
LDP ·p2+0x10(SB), (const2, const3)
LDx (y1in)
LDy (x1in)
gfpAddInline
// tmp0 = ax + ay
STx (tmp0)
// x = 2*ax (y still holds ax)
gfpMulBy2Inline
LDy (y1in)
// x = ay - 2*ax
CALL gfpSubInternal(SB)
LDy (tmp0)
// y = (ay - 2*ax)*(ax + ay)
CALL gfpMulInternal(SB)
STy (tmp0)
LDx (y1in)
LDy (x1in)
// y = ax*ay
CALL gfpMulInternal(SB)
//STy (tmp1)
LDx (tmp0)
// x = tmp0 + ax*ay
gfpAddInline
STx (x2in)
//LDy (tmp1)
// x = 2*(ax*ay); y was left untouched by gfpAddInline
gfpMulBy2Inline
x2y
// x = 4*(ax*ay)
gfpMulBy2Inline
MOVD $0, y0
MOVD $0, y1
MOVD $0, y2
MOVD $0, y3
// x = 0 - 4*(ax*ay)
CALL gfpSubInternal(SB)
STx (y2in)
RET
/* ---------------------------------------*/
// The third stack slot is renamed for this routine: tmp2 becomes x3t, and two
// further slots (y3t, z3t) follow it.
#undef tmp2
#define x3t(off) (32*2 + 8 + off)(RSP)
#define y3t(off) (32*3 + 8 + off)(RSP)
#define z3t(off) (32*4 + 8 + off)(RSP)

// func curvePointDoubleComplete(c, a *curvePoint)
//
// With X/Y/Z the 32-byte values at a+0/a+32/a+64, this follows the step
// sequence named in the inline comments (t0, t1, t2, X3, Y3, Z3) and writes
// X3/Y3/Z3 to c+0/c+32/c+64. The "3b * t2" step is evaluated as 16*t2 - t2.
// Stack slots: tmp0 = t0, tmp1 = t2, x3t/y3t/z3t = partial X3/Y3/Z3.
// The result pointer is kept in b_ptr because R0 (res_ptr) doubles as hlp1.
//
// tmp0 is rewritten with the updated t0 (t0 - t2) right after the subtraction:
// the x registers are overwritten before "X3 := t0 * t1", which reloads t0
// from tmp0 and would otherwise pick up the original Y^2.
TEXT ·curvePointDoubleComplete(SB),NOSPLIT,$168-16
	MOVD res+0(FP), b_ptr
	MOVD in1+8(FP), a_ptr

	MOVD ·np+0x00(SB), hlp1
	LDP ·p2+0x00(SB), (const0, const1)
	LDP ·p2+0x10(SB), (const2, const3)

	LDx (y1in)
	CALL gfpSqrInternal(SB)        // t0 := Y^2
	STy (tmp0)
	gfpMulBy2Inline                // Z3 := t0 + t0
	x2y
	gfpMulBy2Inline                // Z3 := Z3 + Z3
	x2y
	gfpMulBy2Inline                // Z3 := Z3 + Z3
	STx (z3t)

	LDx (z1in)
	CALL gfpSqrInternal(SB)        // t2 := Z^2
	STy (tmp1)
	gfpMulBy2Inline                // four doublings: y ends up as 16 * t2
	x2y
	gfpMulBy2Inline
	x2y
	gfpMulBy2Inline
	x2y
	gfpMulBy2Inline
	x2y
	LDx (tmp1)
	CALL gfpSubInternal(SB)        // t2 := 3b * t2 = 3bZ^2
	STx (tmp1)
	LDy (z3t)
	CALL gfpMulInternal(SB)        // X3 := t2 * Z3
	STy (x3t)

	LDx (tmp0)
	LDy (tmp1)
	gfpAddInline                   // Y3 := t0 + t2
	STx (y3t)

	gfpMulBy2Inline
	gfpAddInline                   // t2 := t2 + t2 + t2 (only needed in x)

	LDy (tmp0)
	CALL gfpSubInternal(SB)        // t0 := t0 - t2
	STx (tmp0)                     // keep the updated t0 for X3 := t0 * t1
	LDy (y3t)
	CALL gfpMulInternal(SB)        // Y3 := t0 * Y3
	LDx (x3t)
	gfpAddInline                   // Y3 := X3 + Y3
	STx (y3t)

	LDx (y1in)
	LDy (z1in)
	CALL gfpMulInternal(SB)        // t1 := YZ
	LDx (z3t)
	CALL gfpMulInternal(SB)        // Z3 := t1 * Z3
	STy (z2in)                     // Store Z3

	LDx (x1in)
	LDy (y1in)
	CALL gfpMulInternal(SB)        // t1 := XY
	LDx (tmp0)
	CALL gfpMulInternal(SB)        // X3 := t0 * t1
	gfpMulBy2Inline                // X3 := X3 + X3
	STx (x2in)                     // Store X3

	// Store Y3
	LDx (y3t)
	STx (y2in)
	RET

#undef x3t
#undef y3t
#undef z3t