mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-26 04:06:18 +08:00
internal/sm2ec: amd64, optimize point double
This commit is contained in:
parent
ee7af1bda3
commit
fabcb6ad30
@ -1654,86 +1654,84 @@ pointadd_avx2:
|
|||||||
#define calZ() \
|
#define calZ() \
|
||||||
LDacc (z) \
|
LDacc (z) \
|
||||||
CALL sm2P256SqrInternal(SB) \
|
CALL sm2P256SqrInternal(SB) \
|
||||||
ST (zsqr) \
|
ST (zsqr) \ // ZZ = Z1^2
|
||||||
\
|
\
|
||||||
LDt (x) \
|
LDt (x) \
|
||||||
p256AddInline \
|
p256AddInline \
|
||||||
STt (m) \
|
STt (m) \ // M = ZZ + X1
|
||||||
\
|
\
|
||||||
LDacc (z) \
|
LDacc (z) \
|
||||||
LDt (y) \
|
LDt (y) \
|
||||||
CALL sm2P256MulInternal(SB) \
|
CALL sm2P256MulInternal(SB) \ // Z1 * Y1
|
||||||
p256MulBy2Inline \
|
p256MulBy2Inline \ // Z3 = 2(Z1 * Y1) = (Y1 + Z1)^2 - Y1^2 - Z1^2
|
||||||
|
|
||||||
#define calX() \
|
#define calX() \
|
||||||
LDacc (x) \
|
LDacc (x) \
|
||||||
LDt (zsqr) \
|
LDt (zsqr) \
|
||||||
CALL sm2P256SubInternal(SB) \
|
CALL sm2P256SubInternal(SB) \ // X1 - ZZ
|
||||||
LDt (m) \
|
LDt (m) \
|
||||||
CALL sm2P256MulInternal(SB) \
|
CALL sm2P256MulInternal(SB) \ // M = (X1 - ZZ) * (X1 + ZZ) = X1^2 - ZZ^2
|
||||||
ST (m) \
|
ST (m) \
|
||||||
\// Multiply by 3
|
\// Multiply by 3
|
||||||
p256MulBy2Inline \
|
p256TripleInline \
|
||||||
LDacc (m) \
|
STt (m) \ // M = 3 * (X1^2 - ZZ^2)
|
||||||
p256AddInline \
|
|
||||||
STt (m) \
|
|
||||||
\////////////////////////
|
\////////////////////////
|
||||||
LDacc (y) \
|
LDacc (y) \
|
||||||
p256MulBy2Inline \
|
p256MulBy2Inline2 \
|
||||||
t2acc \
|
CALL sm2P256SqrInternal(SB) \ // 4 * YY = (2*Y1)^2
|
||||||
CALL sm2P256SqrInternal(SB) \
|
ST (s) \ // S = 4 * YY
|
||||||
ST (s) \
|
CALL sm2P256SqrInternal(SB) \ // (4 * YY)^2 = 16 * YYYY
|
||||||
CALL sm2P256SqrInternal(SB) \
|
|
||||||
\// Divide by 2
|
\// Divide by 2
|
||||||
XORQ mul0, mul0 \
|
XORQ mul0, mul0 \
|
||||||
MOVQ acc4, t0 \
|
MOVQ acc4, t0 \
|
||||||
MOVQ acc5, t1 \
|
MOVQ acc5, t1 \
|
||||||
MOVQ acc6, t2 \
|
MOVQ acc6, t2 \
|
||||||
MOVQ acc7, t3 \
|
MOVQ acc7, t3 \
|
||||||
\
|
\ // [mul0, acc7, acc6, acc5, acc4] := [acc7, acc6, acc5, acc4] + P
|
||||||
ADDQ $-1, acc4 \
|
ADDQ $-1, acc4 \
|
||||||
ADCQ p256p<>+0x08(SB), acc5 \
|
ADCQ p256p<>+0x08(SB), acc5 \
|
||||||
ADCQ $-1, acc6 \
|
ADCQ $-1, acc6 \
|
||||||
ADCQ p256p<>+0x018(SB), acc7 \
|
ADCQ p256p<>+0x018(SB), acc7 \
|
||||||
ADCQ $0, mul0 \
|
ADCQ $0, mul0 \
|
||||||
TESTQ $1, t0 \
|
TESTQ $1, t0 \ // ZF := 1 if (t0 AND 1 == 0)
|
||||||
\
|
\ // CMOVQEQ: Move if equal (ZF == 1)
|
||||||
CMOVQEQ t0, acc4 \
|
CMOVQEQ t0, acc4 \ // acc4 := t0 if (ZF == 1)
|
||||||
CMOVQEQ t1, acc5 \
|
CMOVQEQ t1, acc5 \ // acc5 := t1 if (ZF == 1)
|
||||||
CMOVQEQ t2, acc6 \
|
CMOVQEQ t2, acc6 \ // acc6 := t2 if (ZF == 1)
|
||||||
CMOVQEQ t3, acc7 \
|
CMOVQEQ t3, acc7 \ // acc7 := t3 if (ZF == 1)
|
||||||
ANDQ t0, mul0 \
|
ANDQ t0, mul0 \ // mul0 := t0 AND mul0 (mul0 := 0 if (ZF == 1) else keeping the original value 0 or 1)
|
||||||
\
|
\ // Divide even by 2
|
||||||
SHRQ $1, acc5, acc4 \
|
SHRQ $1, acc5, acc4 \ // acc4 := acc4 >> 1 | acc5 << 63
|
||||||
SHRQ $1, acc6, acc5 \
|
SHRQ $1, acc6, acc5 \ // acc5 := acc5 >> 1 | acc6 << 63
|
||||||
SHRQ $1, acc7, acc6 \
|
SHRQ $1, acc7, acc6 \ // acc6 := acc6 >> 1 | acc7 << 63
|
||||||
SHRQ $1, mul0, acc7 \
|
SHRQ $1, mul0, acc7 \ // acc7 := acc7 >> 1 | mul0 << 63
|
||||||
ST (y) \
|
ST (y) \ // Y3 = 8 * YYYY
|
||||||
\/////////////////////////
|
\/////////////////////////
|
||||||
LDacc (x) \
|
LDacc (x) \
|
||||||
LDt (s) \
|
LDt (s) \
|
||||||
CALL sm2P256MulInternal(SB) \
|
CALL sm2P256MulInternal(SB) \ // X1 * 4 * YY
|
||||||
ST (s) \
|
ST (s) \ // S = 4 * X1 * YY = 2 * ((X1+YY)^2 - XX - YYYY)
|
||||||
p256MulBy2Inline \
|
p256MulBy2Inline \
|
||||||
STt (tmp) \
|
STt (tmp) \ // tmp = 2*S = 8 * X1 * YY
|
||||||
\
|
\
|
||||||
LDacc (m) \
|
LDacc (m) \
|
||||||
CALL sm2P256SqrInternal(SB) \
|
CALL sm2P256SqrInternal(SB) \ // M^2 = (3 * (X1^2 - ZZ^2))^2
|
||||||
LDt (tmp) \
|
LDt (tmp) \
|
||||||
CALL sm2P256SubInternal(SB) \
|
CALL sm2P256SubInternal(SB) \ // X3 = M^2 - 2*S
|
||||||
|
|
||||||
#define calY() \
|
#define calY() \
|
||||||
acc2t \
|
acc2t \
|
||||||
LDacc (s) \
|
LDacc (s) \ // S = 4 * X1 * YY = 2 * ((X1+YY)^2 - XX - YYYY)
|
||||||
CALL sm2P256SubInternal(SB) \
|
CALL sm2P256SubInternal(SB) \ // S - X3
|
||||||
\
|
\
|
||||||
LDt (m) \
|
LDt (m) \
|
||||||
CALL sm2P256MulInternal(SB) \
|
CALL sm2P256MulInternal(SB) \ // M * (S - X3)
|
||||||
\
|
\
|
||||||
LDt (y) \
|
LDt (y) \
|
||||||
CALL sm2P256SubInternal(SB) \
|
CALL sm2P256SubInternal(SB) \ // Y3 = M * (S - X3) - 8 * YYYY
|
||||||
|
|
||||||
#define lastP256PointDouble() \
|
#define lastP256PointDouble() \
|
||||||
|
\ // See https://hyperelliptic.org/EFD/g1p/data/shortw/jacobian-3/doubling/dbl-2007-bl
|
||||||
calZ() \
|
calZ() \
|
||||||
MOVQ rptr, AX \
|
MOVQ rptr, AX \
|
||||||
\// Store z
|
\// Store z
|
||||||
|
@ -340,6 +340,76 @@ GLOBL p256one<>(SB), 8, $32
|
|||||||
CMOVQCS acc5, t1;\
|
CMOVQCS acc5, t1;\
|
||||||
CMOVQCS acc6, t2;\
|
CMOVQCS acc6, t2;\
|
||||||
CMOVQCS acc7, t3;
|
CMOVQCS acc7, t3;
|
||||||
|
|
||||||
|
/* ---------------------------------------*/
|
||||||
|
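// [acc7, acc6, acc5, acc4] = 2[acc7, acc6, acc5, acc4], followed by one conditional subtraction of p.
// Overwrites mul0 and [t3, t2, t1, t0]; the latter is left holding the low 256 bits of the unreduced double.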
|
||||||
|
#define p256MulBy2Inline2\
|
||||||
|
XORQ mul0, mul0;\
|
||||||
|
ADDQ acc4, acc4;\
|
||||||
|
ADCQ acc5, acc5;\
|
||||||
|
ADCQ acc6, acc6;\
|
||||||
|
ADCQ acc7, acc7;\
|
||||||
|
ADCQ $0, mul0;\
|
||||||
|
MOVQ acc4, t0;\
|
||||||
|
MOVQ acc5, t1;\
|
||||||
|
MOVQ acc6, t2;\
|
||||||
|
MOVQ acc7, t3;\
|
||||||
|
SUBQ $-1, acc4;\
|
||||||
|
SBBQ p256p<>+0x08(SB), acc5;\
|
||||||
|
SBBQ $-1, acc6;\
|
||||||
|
SBBQ p256p<>+0x018(SB), acc7;\
|
||||||
|
SBBQ $0, mul0;\
|
||||||
|
CMOVQCS t0, acc4;\
|
||||||
|
CMOVQCS t1, acc5;\
|
||||||
|
CMOVQCS t2, acc6;\
|
||||||
|
CMOVQCS t3, acc7;
|
||||||
|
|
||||||
|
/* ---------------------------------------*/
|
||||||
|
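// [t3, t2, t1, t0] = 3[acc7, acc6, acc5, acc4], with a conditional subtraction of p after the doubling and again after the final addition.
// Overwrites mul0, [acc3, acc2, acc1, acc0] (used to keep a copy of the input) and [acc7, acc6, acc5, acc4].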
|
||||||
|
#define p256TripleInline\
|
||||||
|
XORQ mul0, mul0;\
|
||||||
|
MOVQ acc4, acc0;\
|
||||||
|
MOVQ acc5, acc1;\
|
||||||
|
MOVQ acc6, acc2;\
|
||||||
|
MOVQ acc7, acc3;\
|
||||||
|
ADDQ acc4, acc4;\
|
||||||
|
ADCQ acc5, acc5;\
|
||||||
|
ADCQ acc6, acc6;\
|
||||||
|
ADCQ acc7, acc7;\
|
||||||
|
ADCQ $0, mul0;\
|
||||||
|
MOVQ acc4, t0;\
|
||||||
|
MOVQ acc5, t1;\
|
||||||
|
MOVQ acc6, t2;\
|
||||||
|
MOVQ acc7, t3;\
|
||||||
|
SUBQ $-1, acc4;\
|
||||||
|
SBBQ p256p<>+0x08(SB), acc5;\
|
||||||
|
SBBQ $-1, acc6;\
|
||||||
|
SBBQ p256p<>+0x018(SB), acc7;\
|
||||||
|
SBBQ $0, mul0;\
|
||||||
|
CMOVQCS t0, acc4;\
|
||||||
|
CMOVQCS t1, acc5;\
|
||||||
|
CMOVQCS t2, acc6;\
|
||||||
|
CMOVQCS t3, acc7;\
|
||||||
|
XORQ mul0, mul0;\
|
||||||
|
ADDQ acc0, acc4;\
|
||||||
|
ADCQ acc1, acc5;\
|
||||||
|
ADCQ acc2, acc6;\
|
||||||
|
ADCQ acc3, acc7;\
|
||||||
|
ADCQ $0, mul0;\
|
||||||
|
MOVQ acc4, t0;\
|
||||||
|
MOVQ acc5, t1;\
|
||||||
|
MOVQ acc6, t2;\
|
||||||
|
MOVQ acc7, t3;\
|
||||||
|
SUBQ $-1, t0;\
|
||||||
|
SBBQ p256p<>+0x08(SB), t1;\
|
||||||
|
SBBQ $-1, t2;\
|
||||||
|
SBBQ p256p<>+0x018(SB), t3;\
|
||||||
|
SBBQ $0, mul0;\
|
||||||
|
CMOVQCS acc4, t0;\
|
||||||
|
CMOVQCS acc5, t1;\
|
||||||
|
CMOVQCS acc6, t2;\
|
||||||
|
CMOVQCS acc7, t3;
|
||||||
|
|
||||||
/* ---------------------------------------*/
|
/* ---------------------------------------*/
|
||||||
// [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] + [t3, t2, t1, t0]
|
// [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] + [t3, t2, t1, t0]
|
||||||
#define p256AddInline \
|
#define p256AddInline \
|
||||||
|
@ -1674,14 +1674,11 @@ pointadd_avx2:
|
|||||||
CALL sm2P256MulInternal(SB) \
|
CALL sm2P256MulInternal(SB) \
|
||||||
ST (m) \
|
ST (m) \
|
||||||
\// Multiply by 3
|
\// Multiply by 3
|
||||||
p256MulBy2Inline \
|
p256TripleInline \
|
||||||
LDacc (m) \
|
|
||||||
p256AddInline \
|
|
||||||
STt (m) \
|
STt (m) \
|
||||||
\////////////////////////
|
\////////////////////////
|
||||||
LDacc (y) \
|
LDacc (y) \
|
||||||
p256MulBy2Inline \
|
p256MulBy2Inline2 \
|
||||||
t2acc \
|
|
||||||
CALL sm2P256SqrInternal(SB) \
|
CALL sm2P256SqrInternal(SB) \
|
||||||
ST (s) \
|
ST (s) \
|
||||||
CALL sm2P256SqrInternal(SB) \
|
CALL sm2P256SqrInternal(SB) \
|
||||||
@ -1735,6 +1732,7 @@ pointadd_avx2:
|
|||||||
CALL sm2P256SubInternal(SB) \
|
CALL sm2P256SubInternal(SB) \
|
||||||
|
|
||||||
#define lastP256PointDouble() \
|
#define lastP256PointDouble() \
|
||||||
|
\ // See https://hyperelliptic.org/EFD/g1p/data/shortw/jacobian-3/doubling/dbl-2007-bl
|
||||||
calZ() \
|
calZ() \
|
||||||
MOVQ rptr, AX \
|
MOVQ rptr, AX \
|
||||||
\// Store z
|
\// Store z
|
||||||
|
Loading…
x
Reference in New Issue
Block a user