sm9/bn256: reduce register move

This commit is contained in:
Sun Yimin 2023-07-27 13:03:25 +08:00 committed by GitHub
parent a10e64f6f5
commit e3d14fb41a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 84 additions and 69 deletions

View File

@ -965,6 +965,28 @@ noAdxSqr:
CMOVQCS acc6, t2;\ CMOVQCS acc6, t2;\
CMOVQCS acc7, t3; CMOVQCS acc7, t3;
// (acc7, acc6, acc5, acc4) = 2(acc7, acc6, acc5, acc4)
//
// Doubles the 256-bit value held in acc4..acc7 (acc4 is the least significant
// word) and conditionally subtracts the four 64-bit words stored at ·p2
// (presumably the field modulus -- confirm against the ·p2 definition).
//
// In:       acc4..acc7
// Out:      acc4..acc7 -- the result is written back to the input registers,
//           so several invocations can be placed back to back.
// Clobbers: mul0, t0..t3, flags. On exit t0..t3 hold the doubled value as it
//           was before the subtraction.
#define gfpMulBy2Inline2 \
XORQ mul0, mul0;\ // mul0 receives the carry out of the doubling below
ADDQ acc4, acc4;\
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\ // keep the unreduced doubled value in t0..t3
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ ·p2+0(SB), acc4;\ // acc -= ·p2, word by word with borrow
SBBQ ·p2+8(SB), acc5;\
SBBQ ·p2+16(SB), acc6;\
SBBQ ·p2+24(SB), acc7;\
SBBQ $0, mul0;\ // fold the borrow into the carry word; CF set here means the subtraction underflowed
CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS t1, acc5;\
CMOVQCS t2, acc6;\
CMOVQCS t3, acc7;
/* ---------------------------------------*/ /* ---------------------------------------*/
// (t3, t2, t1, t0) = (acc7, acc6, acc5, acc4) + (t3, t2, t1, t0) // (t3, t2, t1, t0) = (acc7, acc6, acc5, acc4) + (t3, t2, t1, t0)
#define gfpAddInline \ #define gfpAddInline \
@ -1294,8 +1316,7 @@ TEXT ·gfp2SquareU(SB),NOSPLIT,$160-16
MOVQ t3, (16*0 + 8*3)(AX) MOVQ t3, (16*0 + 8*3)(AX)
LDacc (cyout) LDacc (cyout)
gfpMulBy2Inline gfpMulBy2Inline2
t2acc
gfpMulBy2Inline gfpMulBy2Inline
XORQ acc4, acc4 XORQ acc4, acc4
XORQ acc5, acc5 XORQ acc5, acc5
@ -1358,41 +1379,41 @@ TEXT ·curvePointDoubleComplete(SB),NOSPLIT,$288-16
CALL gfpSqrInternal(SB) // t0 := Y^2 CALL gfpSqrInternal(SB) // t0 := Y^2
ST (tmp0) ST (tmp0)
gfpMulBy2Inline // Z3 := t0 + t0 gfpMulBy2Inline2 // Z3 := t0 + t0
t2acc gfpMulBy2Inline2 // Z3 := Z3 + Z3
gfpMulBy2Inline // Z3 := Z3 + Z3 gfpMulBy2Inline // Z3 := Z3 + Z3
t2acc
gfpMulBy2Inline // Z3 := Z3 + Z3
STt (zout) STt (zout)
LDacc (zin) LDacc (zin)
CALL gfpSqrInternal(SB) // t2 := Z^2 CALL gfpSqrInternal(SB) // t2 := Z^2
ST (tmp2) MOVQ acc4, acc0
gfpMulBy2Inline MOVQ acc5, acc1
t2acc MOVQ acc6, acc2
gfpMulBy2Inline MOVQ acc7, acc3
t2acc gfpMulBy2Inline2
gfpMulBy2Inline gfpMulBy2Inline2
t2acc gfpMulBy2Inline2
gfpMulBy2Inline gfpMulBy2Inline2
t2acc MOVQ acc0, t0
LDt (tmp2) MOVQ acc1, t1
MOVQ acc2, t2
MOVQ acc3, t3
CALL gfpSubInternal(SB) // t2 := 3b * t2 CALL gfpSubInternal(SB) // t2 := 3b * t2
ST (tmp2) ST (tmp2)
LDt (zout) LDt (zout)
CALL gfpMulInternal(SB) // X3 := Z3 * t2 CALL gfpMulInternal(SB) // X3 := Z3 * t2
ST (xout) ST (xout)
LDacc (tmp0) LDacc (tmp0)
LDt (tmp2) LDt (tmp2)
gfpAddInline // Y3 := t0 + t2 gfpAddInline // Y3 := t0 + t2
STt (yout) STt (yout)
LDacc (yin) LDacc (yin)
LDt (zin) LDt (zin)
CALL gfpMulInternal(SB) // t1 := YZ CALL gfpMulInternal(SB) // t1 := YZ
LDt (zout) LDt (zout)
CALL gfpMulInternal(SB) // Z3 := t1 * Z3 CALL gfpMulInternal(SB) // Z3 := t1 * Z3
MOVQ rptr, AX MOVQ rptr, AX
// Store Z // Store Z
MOVQ acc4, (16*4 + 8*0)(AX) MOVQ acc4, (16*4 + 8*0)(AX)
@ -1403,14 +1424,14 @@ TEXT ·curvePointDoubleComplete(SB),NOSPLIT,$288-16
LDacc (tmp2) LDacc (tmp2)
gfpMulBy2Inline gfpMulBy2Inline
LDacc (tmp2) LDacc (tmp2)
gfpAddInline // t2 := t2 + t2 + t2 gfpAddInline // t2 := t2 + t2 + t2
LDacc (tmp0) LDacc (tmp0)
CALL gfpSubInternal(SB) // t0 := t0 - t2 CALL gfpSubInternal(SB) // t0 := t0 - t2
ST (tmp0) ST (tmp0)
LDt (yout) LDt (yout)
CALL gfpMulInternal(SB) // Y3 = t0 * Y3 CALL gfpMulInternal(SB) // Y3 = t0 * Y3
LDt (xout) LDt (xout)
gfpAddInline // Y3 := X3 + Y3 gfpAddInline // Y3 := X3 + Y3
MOVQ rptr, AX MOVQ rptr, AX
// Store y // Store y
MOVQ t0, (16*2 + 8*0)(AX) MOVQ t0, (16*2 + 8*0)(AX)
@ -1563,14 +1584,10 @@ TEXT gfpIsZero(SB),NOSPLIT,$0
STt (tmp0) \ STt (tmp0) \
\ \
LDacc (tmp2) \ LDacc (tmp2) \
gfpMulBy2Inline \ gfpMulBy2Inline2 \
t2acc \ gfpMulBy2Inline2 \
gfpMulBy2Inline \ gfpMulBy2Inline2 \
t2acc \ gfpMulBy2Inline2 \
gfpMulBy2Inline \
t2acc \
gfpMulBy2Inline \
t2acc \
LDt (tmp2) \ LDt (tmp2) \
CALL gfpSubInternal(SB) \ // t2 := 3b * t2 = 3bZ1Z2 CALL gfpSubInternal(SB) \ // t2 := 3b * t2 = 3bZ1Z2
ST (tmp2) \ ST (tmp2) \
@ -1585,14 +1602,10 @@ TEXT gfpIsZero(SB),NOSPLIT,$0
ST (tmp1) \ ST (tmp1) \
\ \
LDacc (yout) \ LDacc (yout) \
gfpMulBy2Inline \ gfpMulBy2Inline2 \
t2acc \ gfpMulBy2Inline2 \
gfpMulBy2Inline \ gfpMulBy2Inline2 \
t2acc \ gfpMulBy2Inline2 \
gfpMulBy2Inline \
t2acc \
gfpMulBy2Inline \
t2acc \
LDt (yout) \ LDt (yout) \
CALL gfpSubInternal(SB) \ // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1) CALL gfpSubInternal(SB) \ // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1)
ST (yout) \ ST (yout) \

View File

@ -463,6 +463,23 @@ TEXT gfpSqrInternal(SB),NOSPLIT,$0
CSEL CC, x2, acc2, x2;\ CSEL CC, x2, acc2, x2;\
CSEL CC, x3, acc3, x3; CSEL CC, x3, acc3, x3;
// (y3, y2, y1, y0) = 2(y3, y2, y1, y0)
//
// Doubles the 256-bit value held in y0..y3 (y0 is the least significant word)
// and conditionally subtracts const0..const3 (presumably the field modulus,
// preloaded by the caller -- confirm at the call sites).
//
// Steps:
//   1. x = y + y, with the carry out of the top word captured in hlp0.
//   2. acc = x - const, with the borrow folded into hlp0 by the final SBCS.
//   3. If that final subtraction borrowed (carry clear, CC), y = x (the
//      unreduced sum); otherwise y = acc (the reduced value).
//
// In:       y0..y3, const0..const3
// Out:      y0..y3 -- the result is written back to the input registers,
//           so several invocations can be placed back to back.
// Clobbers: x0..x3, acc0..acc3, hlp0, flags.
#define gfpMulBy2Inline2 \
ADDS y0, y0, x0; \
ADCS y1, y1, x1; \
ADCS y2, y2, x2; \
ADCS y3, y3, x3; \
ADC $0, ZR, hlp0; \
SUBS const0, x0, acc0; \
SBCS const1, x1, acc1;\
SBCS const2, x2, acc2; \
SBCS const3, x3, acc3;\
SBCS $0, hlp0, hlp0;\
CSEL CC, x0, acc0, y0;\
CSEL CC, x1, acc1, y1;\
CSEL CC, x2, acc2, y2;\
CSEL CC, x3, acc3, y3;
/* ---------------------------------------*/ /* ---------------------------------------*/
// (x3, x2, x1, x0) = (x3, x2, x1, x0) + (y3, y2, y1, y0) // (x3, x2, x1, x0) = (x3, x2, x1, x0) + (y3, y2, y1, y0)
#define gfpAddInline \ #define gfpAddInline \
@ -665,8 +682,7 @@ TEXT ·gfp2SquareU(SB),NOSPLIT,$72-16
STx (x2in) STx (x2in)
//LDy (tmp1) //LDy (tmp1)
gfpMulBy2Inline gfpMulBy2Inline2
x2y
gfpMulBy2Inline gfpMulBy2Inline
MOVD $0, y0 MOVD $0, y0
MOVD $0, y1 MOVD $0, y1
@ -696,24 +712,18 @@ TEXT ·curvePointDoubleComplete(SB),NOSPLIT,$168-16
CALL gfpSqrInternal(SB) // t0 := Y^2 CALL gfpSqrInternal(SB) // t0 := Y^2
STy (tmp0) STy (tmp0)
gfpMulBy2Inline // Z3 := t0 + t0 gfpMulBy2Inline2 // Z3 := t0 + t0
x2y gfpMulBy2Inline2 // Z3 := Z3 + Z3
gfpMulBy2Inline // Z3 := Z3 + Z3
x2y
gfpMulBy2Inline // Z3 := Z3 + Z3 gfpMulBy2Inline // Z3 := Z3 + Z3
STx (z3t) STx (z3t)
LDx (z1in) LDx (z1in)
CALL gfpSqrInternal(SB) // t2 := Z^2 CALL gfpSqrInternal(SB) // t2 := Z^2
STy (tmp1) STy (tmp1)
gfpMulBy2Inline gfpMulBy2Inline2
x2y gfpMulBy2Inline2
gfpMulBy2Inline gfpMulBy2Inline2
x2y gfpMulBy2Inline2
gfpMulBy2Inline
x2y
gfpMulBy2Inline
x2y
LDx (tmp1) LDx (tmp1)
CALL gfpSubInternal(SB) // t2 := 3b * t2 = 3bZ^2 CALL gfpSubInternal(SB) // t2 := 3b * t2 = 3bZ^2
STx (tmp1) STx (tmp1)
@ -854,14 +864,10 @@ TEXT ·curvePointAddComplete(SB),0,$264-24
STx (tmp0) STx (tmp0)
LDy (tmp2) LDy (tmp2)
gfpMulBy2Inline gfpMulBy2Inline2
x2y gfpMulBy2Inline2
gfpMulBy2Inline gfpMulBy2Inline2
x2y gfpMulBy2Inline2
gfpMulBy2Inline
x2y
gfpMulBy2Inline
x2y
LDx (tmp2) LDx (tmp2)
CALL gfpSubInternal(SB) // t2 := 3b * t2 = 3bZ1Z2 CALL gfpSubInternal(SB) // t2 := 3b * t2 = 3bZ1Z2
STx (tmp2) STx (tmp2)
@ -875,14 +881,10 @@ TEXT ·curvePointAddComplete(SB),0,$264-24
STx (tmp1) STx (tmp1)
LDy (y3t) LDy (y3t)
gfpMulBy2Inline gfpMulBy2Inline2
x2y gfpMulBy2Inline2
gfpMulBy2Inline gfpMulBy2Inline2
x2y gfpMulBy2Inline2
gfpMulBy2Inline
x2y
gfpMulBy2Inline
x2y
LDx (y3t) LDx (y3t)
CALL gfpSubInternal(SB) // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1) CALL gfpSubInternal(SB) // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1)
STx (y3t) STx (y3t)