sm9/bn256: reduce register move

This commit is contained in:
Sun Yimin 2023-07-27 13:03:25 +08:00 committed by GitHub
parent a10e64f6f5
commit e3d14fb41a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 84 additions and 69 deletions

View File

@ -965,6 +965,28 @@ noAdxSqr:
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
// gfpMulBy2Inline2 doubles a 4-limb value in place and conditionally
// subtracts the 4-limb constant ·p2 from the result:
// (acc7, acc6, acc5, acc4) = 2(acc7, acc6, acc5, acc4)
// acc4 is the least significant limb, acc7 the most significant.
// The result is left in acc4..acc7.
// Overwrites: t0..t3 (copy of the doubled, not yet subtracted value),
// mul0 (carry/borrow word) and the CPU flags.
// NOTE(review): presumably ·p2 holds the field modulus and the input is
// already below it, so one conditional subtraction is enough - confirm.
#define gfpMulBy2Inline2 \
XORQ mul0, mul0;\ // mul0 = 0; it receives the carry out of the doubling
ADDQ acc4, acc4;\ // acc = acc + acc, carry rippling from acc4 up to acc7
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\ // mul0 = carry out of the top limb
MOVQ acc4, t0;\ // keep the doubled value in t0..t3 (MOVQ leaves flags alone)
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ ·p2+0(SB), acc4;\ // acc = acc - ·p2, borrow rippling upwards
SBBQ ·p2+8(SB), acc5;\
SBBQ ·p2+16(SB), acc6;\
SBBQ ·p2+24(SB), acc7;\
SBBQ $0, mul0;\ // fold the borrow into the carry word; CF = final borrow
CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1), i.e. the subtraction borrowed: restore the saved value
CMOVQCS t1, acc5;\
CMOVQCS t2, acc6;\
CMOVQCS t3, acc7;
/* ---------------------------------------*/
// (t3, t2, t1, t0) = (acc7, acc6, acc5, acc4) + (t3, t2, t1, t0)
#define gfpAddInline \
@ -1294,8 +1316,7 @@ TEXT ·gfp2SquareU(SB),NOSPLIT,$160-16
MOVQ t3, (16*0 + 8*3)(AX)
LDacc (cyout)
gfpMulBy2Inline
t2acc
gfpMulBy2Inline2
gfpMulBy2Inline
XORQ acc4, acc4
XORQ acc5, acc5
@ -1358,41 +1379,41 @@ TEXT ·curvePointDoubleComplete(SB),NOSPLIT,$288-16
CALL gfpSqrInternal(SB) // t0 := Y^2
ST (tmp0)
gfpMulBy2Inline // Z3 := t0 + t0
t2acc
gfpMulBy2Inline // Z3 := Z3 + Z3
t2acc
gfpMulBy2Inline // Z3 := Z3 + Z3
gfpMulBy2Inline2 // Z3 := t0 + t0
gfpMulBy2Inline2 // Z3 := Z3 + Z3
gfpMulBy2Inline // Z3 := Z3 + Z3
STt (zout)
LDacc (zin)
CALL gfpSqrInternal(SB) // t2 := Z^2
ST (tmp2)
gfpMulBy2Inline
t2acc
gfpMulBy2Inline
t2acc
gfpMulBy2Inline
t2acc
gfpMulBy2Inline
t2acc
LDt (tmp2)
MOVQ acc4, acc0
MOVQ acc5, acc1
MOVQ acc6, acc2
MOVQ acc7, acc3
gfpMulBy2Inline2
gfpMulBy2Inline2
gfpMulBy2Inline2
gfpMulBy2Inline2
MOVQ acc0, t0
MOVQ acc1, t1
MOVQ acc2, t2
MOVQ acc3, t3
CALL gfpSubInternal(SB) // t2 := 3b * t2
ST (tmp2)
LDt (zout)
CALL gfpMulInternal(SB) // X3 := Z3 * t2
CALL gfpMulInternal(SB) // X3 := Z3 * t2
ST (xout)
LDacc (tmp0)
LDt (tmp2)
gfpAddInline // Y3 := t0 + t2
gfpAddInline // Y3 := t0 + t2
STt (yout)
LDacc (yin)
LDt (zin)
CALL gfpMulInternal(SB) // t1 := YZ
CALL gfpMulInternal(SB) // t1 := YZ
LDt (zout)
CALL gfpMulInternal(SB) // Z3 := t1 * Z3
CALL gfpMulInternal(SB) // Z3 := t1 * Z3
MOVQ rptr, AX
// Store Z
MOVQ acc4, (16*4 + 8*0)(AX)
@ -1403,14 +1424,14 @@ TEXT ·curvePointDoubleComplete(SB),NOSPLIT,$288-16
LDacc (tmp2)
gfpMulBy2Inline
LDacc (tmp2)
gfpAddInline // t2 := t2 + t2 + t2
gfpAddInline // t2 := t2 + t2 + t2
LDacc (tmp0)
CALL gfpSubInternal(SB) // t0 := t0 - t2
ST (tmp0)
LDt (yout)
CALL gfpMulInternal(SB) // Y3 = t0 * Y3
LDt (xout)
gfpAddInline // Y3 := X3 + Y3
gfpAddInline // Y3 := X3 + Y3
MOVQ rptr, AX
// Store y
MOVQ t0, (16*2 + 8*0)(AX)
@ -1563,14 +1584,10 @@ TEXT gfpIsZero(SB),NOSPLIT,$0
STt (tmp0) \
\
LDacc (tmp2) \
gfpMulBy2Inline \
t2acc \
gfpMulBy2Inline \
t2acc \
gfpMulBy2Inline \
t2acc \
gfpMulBy2Inline \
t2acc \
gfpMulBy2Inline2 \
gfpMulBy2Inline2 \
gfpMulBy2Inline2 \
gfpMulBy2Inline2 \
LDt (tmp2) \
CALL gfpSubInternal(SB) \ // t2 := 3b * t2 = 3bZ1Z2
ST (tmp2) \
@ -1585,14 +1602,10 @@ TEXT gfpIsZero(SB),NOSPLIT,$0
ST (tmp1) \
\
LDacc (yout) \
gfpMulBy2Inline \
t2acc \
gfpMulBy2Inline \
t2acc \
gfpMulBy2Inline \
t2acc \
gfpMulBy2Inline \
t2acc \
gfpMulBy2Inline2 \
gfpMulBy2Inline2 \
gfpMulBy2Inline2 \
gfpMulBy2Inline2 \
LDt (yout) \
CALL gfpSubInternal(SB) \ // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1)
ST (yout) \

View File

@ -463,6 +463,23 @@ TEXT gfpSqrInternal(SB),NOSPLIT,$0
CSEL CC, x2, acc2, x2;\
CSEL CC, x3, acc3, x3;
// gfpMulBy2Inline2 doubles a 4-limb value in place and conditionally
// subtracts (const3, const2, const1, const0) from the result:
// (y3, y2, y1, y0) = 2(y3, y2, y1, y0)
// y0 is the least significant limb, y3 the most significant.
// Steps:
//   1. ADDS/ADCS: (x3, x2, x1, x0) = y + y; ADC puts the carry out of the
//      top limb into hlp0.
//   2. SUBS/SBCS: (acc3, acc2, acc1, acc0) = x - const; the final SBCS
//      folds the borrow into hlp0 and leaves the carry flag clear when the
//      5-limb subtraction borrowed.
//   3. CSEL CC: if the subtraction borrowed, y takes the unsubtracted x,
//      otherwise y takes the difference in acc.
// Overwrites: x0..x3, acc0..acc3, hlp0 and the condition flags.
// NOTE(review): presumably const0..const3 hold the field modulus and the
// input is already below it, so one conditional subtraction is enough - confirm.
#define gfpMulBy2Inline2 \
ADDS y0, y0, x0; \
ADCS y1, y1, x1; \
ADCS y2, y2, x2; \
ADCS y3, y3, x3; \
ADC $0, ZR, hlp0; \
SUBS const0, x0, acc0; \
SBCS const1, x1, acc1;\
SBCS const2, x2, acc2; \
SBCS const3, x3, acc3;\
SBCS $0, hlp0, hlp0;\
CSEL CC, x0, acc0, y0;\
CSEL CC, x1, acc1, y1;\
CSEL CC, x2, acc2, y2;\
CSEL CC, x3, acc3, y3;
/* ---------------------------------------*/
// (x3, x2, x1, x0) = (x3, x2, x1, x0) + (y3, y2, y1, y0)
#define gfpAddInline \
@ -665,8 +682,7 @@ TEXT ·gfp2SquareU(SB),NOSPLIT,$72-16
STx (x2in)
//LDy (tmp1)
gfpMulBy2Inline
x2y
gfpMulBy2Inline2
gfpMulBy2Inline
MOVD $0, y0
MOVD $0, y1
@ -696,24 +712,18 @@ TEXT ·curvePointDoubleComplete(SB),NOSPLIT,$168-16
CALL gfpSqrInternal(SB) // t0 := Y^2
STy (tmp0)
gfpMulBy2Inline // Z3 := t0 + t0
x2y
gfpMulBy2Inline // Z3 := Z3 + Z3
x2y
gfpMulBy2Inline2 // Z3 := t0 + t0
gfpMulBy2Inline2 // Z3 := Z3 + Z3
gfpMulBy2Inline // Z3 := Z3 + Z3
STx (z3t)
LDx (z1in)
CALL gfpSqrInternal(SB) // t2 := Z^2
STy (tmp1)
gfpMulBy2Inline
x2y
gfpMulBy2Inline
x2y
gfpMulBy2Inline
x2y
gfpMulBy2Inline
x2y
gfpMulBy2Inline2
gfpMulBy2Inline2
gfpMulBy2Inline2
gfpMulBy2Inline2
LDx (tmp1)
CALL gfpSubInternal(SB) // t2 := 3b * t2 = 3bZ^2
STx (tmp1)
@ -854,14 +864,10 @@ TEXT ·curvePointAddComplete(SB),0,$264-24
STx (tmp0)
LDy (tmp2)
gfpMulBy2Inline
x2y
gfpMulBy2Inline
x2y
gfpMulBy2Inline
x2y
gfpMulBy2Inline
x2y
gfpMulBy2Inline2
gfpMulBy2Inline2
gfpMulBy2Inline2
gfpMulBy2Inline2
LDx (tmp2)
CALL gfpSubInternal(SB) // t2 := 3b * t2 = 3bZ1Z2
STx (tmp2)
@ -875,14 +881,10 @@ TEXT ·curvePointAddComplete(SB),0,$264-24
STx (tmp1)
LDy (y3t)
gfpMulBy2Inline
x2y
gfpMulBy2Inline
x2y
gfpMulBy2Inline
x2y
gfpMulBy2Inline
x2y
gfpMulBy2Inline2
gfpMulBy2Inline2
gfpMulBy2Inline2
gfpMulBy2Inline2
LDx (y3t)
CALL gfpSubInternal(SB) // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1)
STx (y3t)