internal/sm2ec: amd64, optimize point double

This commit is contained in:
Sun Yimin 2024-02-28 17:39:27 +08:00 committed by GitHub
parent ee7af1bda3
commit fabcb6ad30
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 109 additions and 43 deletions

View File

@ -1654,86 +1654,84 @@ pointadd_avx2:
#define calZ() \ #define calZ() \
LDacc (z) \ LDacc (z) \
CALL sm2P256SqrInternal(SB) \ CALL sm2P256SqrInternal(SB) \
ST (zsqr) \ ST (zsqr) \ // ZZ = Z1^2
\ \
LDt (x) \ LDt (x) \
p256AddInline \ p256AddInline \
STt (m) \ STt (m) \ // M = ZZ + X1
\ \
LDacc (z) \ LDacc (z) \
LDt (y) \ LDt (y) \
CALL sm2P256MulInternal(SB) \ CALL sm2P256MulInternal(SB) \ // Z1 * Y1
p256MulBy2Inline \ p256MulBy2Inline \ // Z3 = 2(Z1 * Y1) = (Y1 + Z1)^2 - Y1^2 - Z1^2
#define calX() \ #define calX() \
LDacc (x) \ LDacc (x) \
LDt (zsqr) \ LDt (zsqr) \
CALL sm2P256SubInternal(SB) \ CALL sm2P256SubInternal(SB) \ // X1 - ZZ
LDt (m) \ LDt (m) \
CALL sm2P256MulInternal(SB) \ CALL sm2P256MulInternal(SB) \ // M = (X1 - ZZ) * (X1 + ZZ) = X1^2 - ZZ^2
ST (m) \ ST (m) \
\// Multiply by 3 \// Multiply by 3
p256MulBy2Inline \ p256TripleInline \
LDacc (m) \ STt (m) \ // M = 3 * (X1^2 - ZZ^2)
p256AddInline \
STt (m) \
\//////////////////////// \////////////////////////
LDacc (y) \ LDacc (y) \
p256MulBy2Inline \ p256MulBy2Inline2 \
t2acc \ CALL sm2P256SqrInternal(SB) \ // 4 * YY = (2*Y1)^2
CALL sm2P256SqrInternal(SB) \ ST (s) \ // S = 4 * YY
ST (s) \ CALL sm2P256SqrInternal(SB) \ // (4 * YY)^2 = 16 * YYYY
CALL sm2P256SqrInternal(SB) \
\// Divide by 2 \// Divide by 2
XORQ mul0, mul0 \ XORQ mul0, mul0 \
MOVQ acc4, t0 \ MOVQ acc4, t0 \
MOVQ acc5, t1 \ MOVQ acc5, t1 \
MOVQ acc6, t2 \ MOVQ acc6, t2 \
MOVQ acc7, t3 \ MOVQ acc7, t3 \
\ \ // [mul0, acc7, acc6, acc5, acc4] := [acc7, acc6, acc5, acc4] + P
ADDQ $-1, acc4 \ ADDQ $-1, acc4 \
ADCQ p256p<>+0x08(SB), acc5 \ ADCQ p256p<>+0x08(SB), acc5 \
ADCQ $-1, acc6 \ ADCQ $-1, acc6 \
ADCQ p256p<>+0x018(SB), acc7 \ ADCQ p256p<>+0x018(SB), acc7 \
ADCQ $0, mul0 \ ADCQ $0, mul0 \
TESTQ $1, t0 \ TESTQ $1, t0 \ // ZF := 1 if (t0 AND 1 == 0)
\ \ // CMOVQEQ: Move if equal (ZF == 1)
CMOVQEQ t0, acc4 \ CMOVQEQ t0, acc4 \ // acc4 := t0 if (ZF == 1)
CMOVQEQ t1, acc5 \ CMOVQEQ t1, acc5 \ // acc5 := t1 if (ZF == 1)
CMOVQEQ t2, acc6 \ CMOVQEQ t2, acc6 \ // acc6 := t2 if (ZF == 1)
CMOVQEQ t3, acc7 \ CMOVQEQ t3, acc7 \ // acc7 := t3 if (ZF == 1)
ANDQ t0, mul0 \ ANDQ t0, mul0 \ // mul0 := t0 AND mul0 (mul0 := 0 if (ZF == 1) else keeping the original value 0 or 1)
\ \ // Divide even by 2
SHRQ $1, acc5, acc4 \ SHRQ $1, acc5, acc4 \ // acc4 := acc4 >> 1 | acc5 << 63
SHRQ $1, acc6, acc5 \ SHRQ $1, acc6, acc5 \ // acc5 := acc5 >> 1 | acc6 << 63
SHRQ $1, acc7, acc6 \ SHRQ $1, acc7, acc6 \ // acc6 := acc6 >> 1 | acc7 << 63
SHRQ $1, mul0, acc7 \ SHRQ $1, mul0, acc7 \ // acc7 := acc7 >> 1 | mul0 << 63
ST (y) \ ST (y) \ // Y3 = 8 * YYYY
\///////////////////////// \/////////////////////////
LDacc (x) \ LDacc (x) \
LDt (s) \ LDt (s) \
CALL sm2P256MulInternal(SB) \ CALL sm2P256MulInternal(SB) \ // X1 * 4 * YY
ST (s) \ ST (s) \ // S = 4 * X1 * YY = 2 * ((X1+YY)^2 - XX - YYYY)
p256MulBy2Inline \ p256MulBy2Inline \
STt (tmp) \ STt (tmp) \ // tmp = 2*S = 8 * X1 * YY
\ \
LDacc (m) \ LDacc (m) \
CALL sm2P256SqrInternal(SB) \ CALL sm2P256SqrInternal(SB) \ // M^2 = (3 * (X1^2 - ZZ^2))^2
LDt (tmp) \ LDt (tmp) \
CALL sm2P256SubInternal(SB) \ CALL sm2P256SubInternal(SB) \ // X3 = M^2 - 2*S
#define calY() \ #define calY() \
acc2t \ acc2t \
LDacc (s) \ LDacc (s) \ // S = 4 * X1 * YY = 2 * ((X1+YY)^2 - XX - YYYY)
CALL sm2P256SubInternal(SB) \ CALL sm2P256SubInternal(SB) \ // S - X3
\ \
LDt (m) \ LDt (m) \
CALL sm2P256MulInternal(SB) \ CALL sm2P256MulInternal(SB) \ // M * (S - X3)
\ \
LDt (y) \ LDt (y) \
CALL sm2P256SubInternal(SB) \ CALL sm2P256SubInternal(SB) \ // Y3 = M * (S - X3) - 8 * YYYY
#define lastP256PointDouble() \ #define lastP256PointDouble() \
\ // See https://hyperelliptic.org/EFD/g1p/data/shortw/jacobian-3/doubling/dbl-2007-bl
calZ() \ calZ() \
MOVQ rptr, AX \ MOVQ rptr, AX \
\// Store z \// Store z

View File

@ -340,6 +340,76 @@ GLOBL p256one<>(SB), 8, $32
CMOVQCS acc5, t1;\ CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\ CMOVQCS acc6, t2;\
CMOVQCS acc7, t3; CMOVQCS acc7, t3;
/* ---------------------------------------*/
// [acc7, acc6, acc5, acc4] = 2[acc7, acc6, acc5, acc4]
//
// p256MulBy2Inline2 doubles the four-word value held in acc4..acc7 (acc4 is
// the least significant word) and leaves the result in the same registers,
// after one conditional subtraction of the four-word constant
// [p256p<>+0x18, -1, p256p<>+0x08, -1] (presumably the field prime P).
//
// Steps:
//   1. mul0 is cleared, the value is added to itself with ADDQ/ADCQ, and the
//      carry out of the top word is collected in mul0, giving a five-word sum
//      [mul0, acc7, acc6, acc5, acc4].
//   2. The low four words of the sum are saved in t0..t3.
//   3. The constant is subtracted from the five-word sum with SUBQ/SBBQ.
//      The final SBBQ on mul0 leaves CF set exactly when the sum was smaller
//      than the constant.
//   4. If CF is set, CMOVQCS restores the saved sum from t0..t3; otherwise
//      the subtracted value is kept. No branch is taken on the data.
//
// Clobbers: mul0, t0, t1, t2, t3 and the flags.
// NOTE(review): the single conditional subtraction yields a fully reduced
// result only if the input is already below the constant - confirm callers
// guarantee this.
#define p256MulBy2Inline2\
XORQ mul0, mul0;\
ADDQ acc4, acc4;\
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, acc4;\
SBBQ p256p<>+0x08(SB), acc5;\
SBBQ $-1, acc6;\
SBBQ p256p<>+0x018(SB), acc7;\
SBBQ $0, mul0;\
CMOVQCS t0, acc4;\
CMOVQCS t1, acc5;\
CMOVQCS t2, acc6;\
CMOVQCS t3, acc7;
/* ---------------------------------------*/
// [t3, t2, t1, t0] = 3[acc7, acc6, acc5, acc4]
//
// p256TripleInline computes three times the four-word value held in
// acc4..acc7 (acc4 is the least significant word) and leaves the result in
// t0..t3. Each of the two additions is followed by one conditional
// subtraction of the four-word constant
// [p256p<>+0x18, -1, p256p<>+0x08, -1] (presumably the field prime P).
//
// Steps:
//   1. The input is copied to acc0..acc3 so it can be added back later.
//   2. Doubling: mul0 is cleared, acc4..acc7 is added to itself, and the
//      carry out is collected in mul0. The sum is saved in t0..t3, the
//      constant is subtracted in place, and CMOVQCS restores the saved sum
//      when the subtraction borrowed. acc4..acc7 now hold the doubled value.
//   3. Adding the original: mul0 is cleared again, acc0..acc3 is added to
//      acc4..acc7, and the carry out is collected in mul0.
//   4. Final reduction: the sum is copied to t0..t3 and the constant is
//      subtracted from the copy. If that subtraction borrowed, CMOVQCS
//      copies the unsubtracted sum from acc4..acc7 back into t0..t3.
//
// The input registers acc4..acc7 are overwritten; on exit they do not hold
// the input. Clobbers: acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7, mul0
// and the flags.
// NOTE(review): each stage performs only one conditional subtraction, so the
// result is fully reduced only if the input is already below the constant -
// confirm callers guarantee this.
#define p256TripleInline\
XORQ mul0, mul0;\
MOVQ acc4, acc0;\
MOVQ acc5, acc1;\
MOVQ acc6, acc2;\
MOVQ acc7, acc3;\
ADDQ acc4, acc4;\
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, acc4;\
SBBQ p256p<>+0x08(SB), acc5;\
SBBQ $-1, acc6;\
SBBQ p256p<>+0x018(SB), acc7;\
SBBQ $0, mul0;\
CMOVQCS t0, acc4;\
CMOVQCS t1, acc5;\
CMOVQCS t2, acc6;\
CMOVQCS t3, acc7;\
XORQ mul0, mul0;\
ADDQ acc0, acc4;\
ADCQ acc1, acc5;\
ADCQ acc2, acc6;\
ADCQ acc3, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, t0;\
SBBQ p256p<>+0x08(SB), t1;\
SBBQ $-1, t2;\
SBBQ p256p<>+0x018(SB), t3;\
SBBQ $0, mul0;\
CMOVQCS acc4, t0;\
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
/* ---------------------------------------*/ /* ---------------------------------------*/
// [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] + [t3, t2, t1, t0] // [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] + [t3, t2, t1, t0]
#define p256AddInline \ #define p256AddInline \

View File

@ -1674,14 +1674,11 @@ pointadd_avx2:
CALL sm2P256MulInternal(SB) \ CALL sm2P256MulInternal(SB) \
ST (m) \ ST (m) \
\// Multiply by 3 \// Multiply by 3
p256MulBy2Inline \ p256TripleInline \
LDacc (m) \
p256AddInline \
STt (m) \ STt (m) \
\//////////////////////// \////////////////////////
LDacc (y) \ LDacc (y) \
p256MulBy2Inline \ p256MulBy2Inline2 \
t2acc \
CALL sm2P256SqrInternal(SB) \ CALL sm2P256SqrInternal(SB) \
ST (s) \ ST (s) \
CALL sm2P256SqrInternal(SB) \ CALL sm2P256SqrInternal(SB) \
@ -1735,6 +1732,7 @@ pointadd_avx2:
CALL sm2P256SubInternal(SB) \ CALL sm2P256SubInternal(SB) \
#define lastP256PointDouble() \ #define lastP256PointDouble() \
\ // See https://hyperelliptic.org/EFD/g1p/data/shortw/jacobian-3/doubling/dbl-2007-bl
calZ() \ calZ() \
MOVQ rptr, AX \ MOVQ rptr, AX \
\// Store z \// Store z