mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-26 12:16:20 +08:00
internal/sm2ec: supplement comments
This commit is contained in:
parent
53ac591635
commit
9f7e3ef018
@ -4,6 +4,7 @@
|
||||
// 256-bit primes"
|
||||
// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
|
||||
// https://eprint.iacr.org/2013/816.pdf
|
||||
// https://github.com/emmansun/gmsm/wiki/SM2-WWMM-(2)
|
||||
//go:build amd64 && !purego && !plugin
|
||||
|
||||
#include "textflag.h"
|
||||
@ -423,10 +424,10 @@ internalSqrBMI2:
|
||||
ST (z1sqr) \
|
||||
\
|
||||
LDt (x2in) \
|
||||
CALL sm2P256MulInternal(SB) \// x2 * z1ˆ2
|
||||
CALL sm2P256MulInternal(SB) \// u2 = x2 * z1ˆ2
|
||||
\
|
||||
LDt (x1in) \
|
||||
p256SubInline2 \// h = u2 - u1
|
||||
p256SubInline2 \// h = u2 - x1
|
||||
ST (h) \
|
||||
\
|
||||
LDt (z1in) \
|
||||
@ -441,7 +442,7 @@ internalSqrBMI2:
|
||||
ST (s2) \
|
||||
\
|
||||
LDt (y1in) \
|
||||
p256SubInline2 \// r = s2 - s1
|
||||
p256SubInline2 \// r = s2 - y1
|
||||
ST (r) \
|
||||
\
|
||||
CALL sm2P256SqrInternal(SB) \// rsqr = rˆ2
|
||||
@ -456,26 +457,21 @@ internalSqrBMI2:
|
||||
ST (hcub) \
|
||||
\
|
||||
LDt (y1in) \
|
||||
CALL sm2P256MulInternal(SB) \// y1 * hˆ3
|
||||
CALL sm2P256MulInternal(SB) \// s2 = y1 * hˆ3
|
||||
ST (s2) \
|
||||
\
|
||||
LDacc (x1in) \
|
||||
LDt (hsqr) \
|
||||
CALL sm2P256MulInternal(SB) \// u1 * hˆ2
|
||||
CALL sm2P256MulInternal(SB) \// x1 * hˆ2
|
||||
ST (h) \
|
||||
\
|
||||
p256MulBy2Inline \// u1 * hˆ2 * 2, inline
|
||||
p256MulBy2Inline \// x1 * hˆ2 * 2, inline
|
||||
LDacc (rsqr) \
|
||||
p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2
|
||||
p256SubInline2 \// rˆ2 - x1 * hˆ2 * 2
|
||||
\
|
||||
LDt (hcub) \
|
||||
p256SubInline2 \
|
||||
ST (xout) \
|
||||
\
|
||||
MOVQ acc4, t0 \
|
||||
MOVQ acc5, t1 \
|
||||
MOVQ acc6, t2 \
|
||||
MOVQ acc7, t3 \
|
||||
p256SubInline \
|
||||
STt (xout) \// xout = rˆ2 - 2 * x1 * hˆ2 - h^3
|
||||
LDacc (h) \
|
||||
p256SubInline2 \
|
||||
\
|
||||
@ -532,20 +528,20 @@ TEXT ·p256PointAddAffineAsm(SB),0,$512-48
|
||||
MOVOU zout(16*0), X4
|
||||
MOVOU zout(16*1), X5
|
||||
|
||||
MOVL BX, X6
|
||||
MOVL CX, X7
|
||||
MOVL BX, X6 // sel
|
||||
MOVL CX, X7 // zero
|
||||
|
||||
PXOR X8, X8
|
||||
PCMPEQL X9, X9
|
||||
PXOR X8, X8 // X8's bits are all 0
|
||||
PCMPEQL X9, X9 // X9's bits are all 1
|
||||
|
||||
PSHUFD $0, X6, X6
|
||||
PSHUFD $0, X7, X7
|
||||
|
||||
PCMPEQL X8, X6
|
||||
PCMPEQL X8, X7
|
||||
PCMPEQL X8, X6 // X6's bits are all 1 if sel = 0, else are 0
|
||||
PCMPEQL X8, X7 // X7's bits are all 1 if zero = 0, else are 0
|
||||
|
||||
MOVOU X6, X15
|
||||
PANDN X9, X15
|
||||
PANDN X9, X15 // X15 = NOT(X6)
|
||||
|
||||
MOVOU x1in(16*0), X9
|
||||
MOVOU x1in(16*1), X10
|
||||
@ -577,7 +573,7 @@ TEXT ·p256PointAddAffineAsm(SB),0,$512-48
|
||||
// Similarly if zero == 0
|
||||
PCMPEQL X9, X9
|
||||
MOVOU X7, X15
|
||||
PANDN X9, X15
|
||||
PANDN X9, X15 // X15 = NOT(X7)
|
||||
|
||||
MOVOU x2in(16*0), X9
|
||||
MOVOU x2in(16*1), X10
|
||||
@ -630,24 +626,20 @@ pointaddaffine_avx2:
|
||||
|
||||
p256PointAddAffineInline()
|
||||
// The result is not valid if (sel == 0), conditional choose
|
||||
MOVL BX, X6
|
||||
MOVL CX, X7
|
||||
MOVL BX, X6 // sel
|
||||
MOVL CX, X7 // zero
|
||||
|
||||
VPXOR Y8, Y8, Y8
|
||||
VPCMPEQD Y9, Y9, Y9
|
||||
VPXOR Y8, Y8, Y8 // Y8's bits are all 0
|
||||
|
||||
VPBROADCASTD X6, Y6
|
||||
VPBROADCASTD X7, Y7
|
||||
|
||||
VPCMPEQD Y8, Y6, Y6
|
||||
VPCMPEQD Y8, Y7, Y7
|
||||
VPCMPEQD Y8, Y6, Y6 // Y6's bits are all 1 if sel = 0, else are 0
|
||||
VPCMPEQD Y8, Y7, Y7 // Y7's bits are all 1 if zero = 0, else are 0
|
||||
|
||||
VMOVDQU Y6, Y15
|
||||
VPANDN Y9, Y15, Y15
|
||||
|
||||
VPAND xout(32*0), Y15, Y0
|
||||
VPAND yout(32*0), Y15, Y1
|
||||
VPAND zout(32*0), Y15, Y2
|
||||
VPANDN xout(32*0), Y6, Y0
|
||||
VPANDN yout(32*0), Y6, Y1
|
||||
VPANDN zout(32*0), Y6, Y2
|
||||
|
||||
VPAND x1in(32*0), Y6, Y9
|
||||
VPAND y1in(32*0), Y6, Y10
|
||||
@ -658,12 +650,9 @@ pointaddaffine_avx2:
|
||||
VPXOR Y11, Y2, Y2
|
||||
|
||||
// Similarly if zero == 0
|
||||
VPCMPEQD Y9, Y9, Y9
|
||||
VPANDN Y9, Y7, Y15
|
||||
|
||||
VPAND Y15, Y0, Y0
|
||||
VPAND Y15, Y1, Y1
|
||||
VPAND Y15, Y2, Y2
|
||||
VPANDN Y0, Y7, Y0
|
||||
VPANDN Y1, Y7, Y1
|
||||
VPANDN Y2, Y7, Y2
|
||||
|
||||
VPAND x2in(32*0), Y7, Y9
|
||||
VPAND y2in(32*0), Y7, Y10
|
||||
@ -801,13 +790,8 @@ pointaddaffine_avx2:
|
||||
p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2
|
||||
\
|
||||
LDt (hcub) \
|
||||
p256SubInline2 \
|
||||
ST (xout) \
|
||||
\
|
||||
MOVQ acc4, t0 \
|
||||
MOVQ acc5, t1 \
|
||||
MOVQ acc6, t2 \
|
||||
MOVQ acc7, t3 \
|
||||
p256SubInline \
|
||||
STt (xout) \
|
||||
LDacc (u2) \
|
||||
p256SubInline2 \
|
||||
\
|
||||
|
@ -32,20 +32,20 @@ GLOBL p256one<>(SB), 8, $32
|
||||
\ // First reduction step, [p3, p2, p1, p0] = [1, -0x100000000, 0, (1 - 0x100000000), -1]
|
||||
MOVQ acc0, AX \
|
||||
MOVQ acc0, DX \
|
||||
SHLQ $32, AX \ // AX = L(acc0 * 2^32), low part
|
||||
SHRQ $32, DX \ // DX = H(acc0 * 2^32), high part
|
||||
\// calculate the negative part: [0, -0x100000000, 0, -0x100000000] * acc0
|
||||
SHLQ $32, AX \
|
||||
SHRQ $32, DX \
|
||||
\// calculate the negative part: [1, -0x100000000, 0, -0x100000000] * acc0 + [0, acc3, acc2, acc1]
|
||||
SUBQ AX, acc1 \
|
||||
SBBQ DX, acc2 \
|
||||
SBBQ AX, acc3 \
|
||||
MOVQ acc0, AX \
|
||||
SBBQ DX, acc0 \
|
||||
\ // calculate the positive part: [1, 0, 0, 1] * acc0 + [0, acc3, acc2, acc1],
|
||||
\ // calculate the positive part: [0, 0, 0, AX] + [acc0, acc3, acc2, acc1],
|
||||
\ // since (-1) * acc0 + acc0 == 0, the lowest limb is 0 and is dropped directly, no carry.
|
||||
ADDQ AX, acc1 \ // acc1' = L (acc0 + acc1)
|
||||
ADCQ $0, acc2 \ // acc2' = acc2 + carry1
|
||||
ADCQ $0, acc3 \ // acc3' = acc3 + carry2
|
||||
ADCQ $0, acc0 \ // acc0' = acc0 + carry3
|
||||
ADDQ AX, acc1 \
|
||||
ADCQ $0, acc2 \
|
||||
ADCQ $0, acc3 \
|
||||
ADCQ $0, acc0 \
|
||||
\ // Second reduction step
|
||||
MOVQ acc1, AX \
|
||||
MOVQ acc1, DX \
|
||||
@ -102,6 +102,7 @@ GLOBL p256one<>(SB), 8, $32
|
||||
ADCQ x_ptr, acc3 \
|
||||
ADCQ $0, t0
|
||||
|
||||
/* ---------------------------------------*/
|
||||
#define p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \
|
||||
MOVQ a0, b0 \
|
||||
MOVQ a1, b1 \
|
||||
@ -113,7 +114,7 @@ GLOBL p256one<>(SB), 8, $32
|
||||
SBBQ $-1, a2 \
|
||||
SBBQ p256p<>+0x018(SB), a3 \
|
||||
SBBQ $0, a4 \
|
||||
\
|
||||
\ // If the result of the subtraction is negative, restore the previous result
|
||||
CMOVQCS b0, a0 \ // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS b1, a1 \
|
||||
CMOVQCS b2, a2 \
|
||||
@ -131,13 +132,13 @@ GLOBL p256one<>(SB), 8, $32
|
||||
MOVQ a1, b1 \
|
||||
MOVQ a2, b2 \
|
||||
MOVQ a3, b3 \
|
||||
\// Subtract p256
|
||||
\// Subtract p256ord
|
||||
SUBQ p256ord<>+0x00(SB), a0 \
|
||||
SBBQ p256ord<>+0x08(SB) ,a1 \
|
||||
SBBQ p256ord<>+0x10(SB), a2 \
|
||||
SBBQ p256ord<>+0x18(SB), a3 \
|
||||
SBBQ $0, a4 \
|
||||
\
|
||||
\ // If the result of the subtraction is negative, restore the previous result
|
||||
CMOVQCS b0, a0 \ // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS b1, a1 \
|
||||
CMOVQCS b2, a2 \
|
||||
@ -148,6 +149,7 @@ GLOBL p256one<>(SB), 8, $32
|
||||
MOVQ a2, (8*2)(res) \
|
||||
MOVQ a3, (8*3)(res)
|
||||
|
||||
/* ---------------------------------------*/
|
||||
#define sm2P256SqrReductionInline \
|
||||
\ // First reduction step
|
||||
MOVQ acc0, mul0 \
|
||||
@ -237,6 +239,7 @@ GLOBL p256one<>(SB), 8, $32
|
||||
CMOVQCS t2, acc6 \
|
||||
CMOVQCS t3, acc7
|
||||
|
||||
/* ---------------------------------------*/
|
||||
#define sm2P256MulReductionInline \
|
||||
\// First reduction step
|
||||
MOVQ acc0, mul0 \
|
||||
@ -303,159 +306,6 @@ GLOBL p256one<>(SB), 8, $32
|
||||
ADCQ $0, acc2 \
|
||||
ADCQ $0, acc3
|
||||
|
||||
#define p256PointDoubleInit() \
|
||||
MOVOU (16*0)(BX), X0;\
|
||||
MOVOU (16*1)(BX), X1;\
|
||||
MOVOU (16*2)(BX), X2;\
|
||||
MOVOU (16*3)(BX), X3;\
|
||||
MOVOU (16*4)(BX), X4;\
|
||||
MOVOU (16*5)(BX), X5;\
|
||||
\
|
||||
MOVOU X0, x(16*0);\
|
||||
MOVOU X1, x(16*1);\
|
||||
MOVOU X2, y(16*0);\
|
||||
MOVOU X3, y(16*1);\
|
||||
MOVOU X4, z(16*0);\
|
||||
MOVOU X5, z(16*1);
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// [t3, t2, t1, t0] = 2[acc7, acc6, acc5, acc4]
|
||||
#define p256MulBy2Inline\
|
||||
XORQ mul0, mul0;\
|
||||
ADDQ acc4, acc4;\
|
||||
ADCQ acc5, acc5;\
|
||||
ADCQ acc6, acc6;\
|
||||
ADCQ acc7, acc7;\
|
||||
ADCQ $0, mul0;\
|
||||
MOVQ acc4, t0;\
|
||||
MOVQ acc5, t1;\
|
||||
MOVQ acc6, t2;\
|
||||
MOVQ acc7, t3;\
|
||||
SUBQ $-1, t0;\
|
||||
SBBQ p256p<>+0x08(SB), t1;\
|
||||
SBBQ $-1, t2;\
|
||||
SBBQ p256p<>+0x018(SB), t3;\
|
||||
SBBQ $0, mul0;\
|
||||
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS acc5, t1;\
|
||||
CMOVQCS acc6, t2;\
|
||||
CMOVQCS acc7, t3;
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// [acc7, acc6, acc5, acc4] = 2[acc7, acc6, acc5, acc4]
|
||||
#define p256MulBy2Inline2\
|
||||
XORQ mul0, mul0;\
|
||||
ADDQ acc4, acc4;\
|
||||
ADCQ acc5, acc5;\
|
||||
ADCQ acc6, acc6;\
|
||||
ADCQ acc7, acc7;\
|
||||
ADCQ $0, mul0;\
|
||||
MOVQ acc4, t0;\
|
||||
MOVQ acc5, t1;\
|
||||
MOVQ acc6, t2;\
|
||||
MOVQ acc7, t3;\
|
||||
SUBQ $-1, acc4;\
|
||||
SBBQ p256p<>+0x08(SB), acc5;\
|
||||
SBBQ $-1, acc6;\
|
||||
SBBQ p256p<>+0x018(SB), acc7;\
|
||||
SBBQ $0, mul0;\
|
||||
CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS t1, acc5;\
|
||||
CMOVQCS t2, acc6;\
|
||||
CMOVQCS t3, acc7;
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// [t3, t2, t1, t0] = 3[acc7, acc6, acc5, acc4]
|
||||
#define p256TripleInline\
|
||||
XORQ mul0, mul0;\
|
||||
MOVQ acc4, acc0;\
|
||||
MOVQ acc5, acc1;\
|
||||
MOVQ acc6, acc2;\
|
||||
MOVQ acc7, acc3;\
|
||||
ADDQ acc4, acc4;\
|
||||
ADCQ acc5, acc5;\
|
||||
ADCQ acc6, acc6;\
|
||||
ADCQ acc7, acc7;\
|
||||
ADCQ $0, mul0;\
|
||||
MOVQ acc4, t0;\
|
||||
MOVQ acc5, t1;\
|
||||
MOVQ acc6, t2;\
|
||||
MOVQ acc7, t3;\
|
||||
SUBQ $-1, acc4;\
|
||||
SBBQ p256p<>+0x08(SB), acc5;\
|
||||
SBBQ $-1, acc6;\
|
||||
SBBQ p256p<>+0x018(SB), acc7;\
|
||||
SBBQ $0, mul0;\
|
||||
CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS t1, acc5;\
|
||||
CMOVQCS t2, acc6;\
|
||||
CMOVQCS t3, acc7;\
|
||||
XORQ mul0, mul0;\
|
||||
ADDQ acc0, acc4;\
|
||||
ADCQ acc1, acc5;\
|
||||
ADCQ acc2, acc6;\
|
||||
ADCQ acc3, acc7;\
|
||||
ADCQ $0, mul0;\
|
||||
MOVQ acc4, t0;\
|
||||
MOVQ acc5, t1;\
|
||||
MOVQ acc6, t2;\
|
||||
MOVQ acc7, t3;\
|
||||
SUBQ $-1, t0;\
|
||||
SBBQ p256p<>+0x08(SB), t1;\
|
||||
SBBQ $-1, t2;\
|
||||
SBBQ p256p<>+0x018(SB), t3;\
|
||||
SBBQ $0, mul0;\
|
||||
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS acc5, t1;\
|
||||
CMOVQCS acc6, t2;\
|
||||
CMOVQCS acc7, t3;
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] + [t3, t2, t1, t0]
|
||||
#define p256AddInline \
|
||||
XORQ mul0, mul0;\
|
||||
ADDQ t0, acc4;\
|
||||
ADCQ t1, acc5;\
|
||||
ADCQ t2, acc6;\
|
||||
ADCQ t3, acc7;\
|
||||
ADCQ $0, mul0;\
|
||||
MOVQ acc4, t0;\
|
||||
MOVQ acc5, t1;\
|
||||
MOVQ acc6, t2;\
|
||||
MOVQ acc7, t3;\
|
||||
SUBQ $-1, t0;\
|
||||
SBBQ p256p<>+0x08(SB), t1;\
|
||||
SBBQ $-1, t2;\
|
||||
SBBQ p256p<>+0x018(SB), t3;\
|
||||
SBBQ $0, mul0;\
|
||||
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS acc5, t1;\
|
||||
CMOVQCS acc6, t2;\
|
||||
CMOVQCS acc7, t3;
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]
|
||||
#define p256SubInline2 \
|
||||
XORQ mul0, mul0;\
|
||||
SUBQ t0, acc4;\
|
||||
SBBQ t1, acc5;\
|
||||
SBBQ t2, acc6;\
|
||||
SBBQ t3, acc7;\
|
||||
SBBQ $0, mul0;\
|
||||
MOVQ acc4, acc0;\
|
||||
MOVQ acc5, acc1;\
|
||||
MOVQ acc6, acc2;\
|
||||
MOVQ acc7, acc3;\
|
||||
ADDQ $-1, acc4;\
|
||||
ADCQ p256p<>+0x08(SB), acc5;\
|
||||
ADCQ $-1, acc6;\
|
||||
ADCQ p256p<>+0x018(SB), acc7;\
|
||||
ANDQ $1, mul0;\
|
||||
CMOVQEQ acc0, acc4;\ // CMOVQEQ: Move if equal (ZF == 1)
|
||||
CMOVQEQ acc1, acc5;\
|
||||
CMOVQEQ acc2, acc6;\
|
||||
CMOVQEQ acc3, acc7;\
|
||||
|
||||
/* ---------------------------------------*/
|
||||
#define p256SqrRound(t1) \
|
||||
\// y[1:] * y[0]
|
||||
@ -891,7 +741,7 @@ GLOBL p256one<>(SB), 8, $32
|
||||
MULXQ p256ordK0<>(SB), DX, AX;\
|
||||
\
|
||||
MULXQ p256ord<>+0x00(SB), AX, t0;\
|
||||
ADOXQ AX, acc0 ;\// (carry1, acc0) = acc0 + t0 * ord0
|
||||
ADOXQ AX, acc0;\// (carry1, acc0) = acc0 + t0 * ord0
|
||||
\
|
||||
MULXQ p256ord<>+0x08(SB), AX, t1;\
|
||||
ADCXQ t0, AX;\
|
||||
@ -985,6 +835,168 @@ GLOBL p256one<>(SB), 8, $32
|
||||
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\
|
||||
MOVQ res_ptr, x_ptr;
|
||||
|
||||
// Below macros are used for point operations
|
||||
/* ---------------------------------------*/
|
||||
// [t3, t2, t1, t0] = 2[acc7, acc6, acc5, acc4]
|
||||
#define p256MulBy2Inline\
|
||||
XORQ mul0, mul0;\
|
||||
ADDQ acc4, acc4;\
|
||||
ADCQ acc5, acc5;\
|
||||
ADCQ acc6, acc6;\
|
||||
ADCQ acc7, acc7;\
|
||||
ADCQ $0, mul0;\
|
||||
MOVQ acc4, t0;\
|
||||
MOVQ acc5, t1;\
|
||||
MOVQ acc6, t2;\
|
||||
MOVQ acc7, t3;\
|
||||
SUBQ $-1, t0;\
|
||||
SBBQ p256p<>+0x08(SB), t1;\
|
||||
SBBQ $-1, t2;\
|
||||
SBBQ p256p<>+0x018(SB), t3;\
|
||||
SBBQ $0, mul0;\
|
||||
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS acc5, t1;\
|
||||
CMOVQCS acc6, t2;\
|
||||
CMOVQCS acc7, t3;
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// [acc7, acc6, acc5, acc4] = 2[acc7, acc6, acc5, acc4]
|
||||
#define p256MulBy2Inline2\
|
||||
XORQ mul0, mul0;\
|
||||
ADDQ acc4, acc4;\
|
||||
ADCQ acc5, acc5;\
|
||||
ADCQ acc6, acc6;\
|
||||
ADCQ acc7, acc7;\
|
||||
ADCQ $0, mul0;\
|
||||
MOVQ acc4, t0;\
|
||||
MOVQ acc5, t1;\
|
||||
MOVQ acc6, t2;\
|
||||
MOVQ acc7, t3;\
|
||||
SUBQ $-1, acc4;\
|
||||
SBBQ p256p<>+0x08(SB), acc5;\
|
||||
SBBQ $-1, acc6;\
|
||||
SBBQ p256p<>+0x018(SB), acc7;\
|
||||
SBBQ $0, mul0;\
|
||||
CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS t1, acc5;\
|
||||
CMOVQCS t2, acc6;\
|
||||
CMOVQCS t3, acc7;
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// [t3, t2, t1, t0] = 3[acc7, acc6, acc5, acc4]
|
||||
#define p256TripleInline\
|
||||
XORQ mul0, mul0;\
|
||||
MOVQ acc4, acc0;\
|
||||
MOVQ acc5, acc1;\
|
||||
MOVQ acc6, acc2;\
|
||||
MOVQ acc7, acc3;\
|
||||
ADDQ acc4, acc4;\
|
||||
ADCQ acc5, acc5;\
|
||||
ADCQ acc6, acc6;\
|
||||
ADCQ acc7, acc7;\
|
||||
ADCQ $0, mul0;\
|
||||
MOVQ acc4, t0;\
|
||||
MOVQ acc5, t1;\
|
||||
MOVQ acc6, t2;\
|
||||
MOVQ acc7, t3;\
|
||||
SUBQ $-1, acc4;\
|
||||
SBBQ p256p<>+0x08(SB), acc5;\
|
||||
SBBQ $-1, acc6;\
|
||||
SBBQ p256p<>+0x018(SB), acc7;\
|
||||
SBBQ $0, mul0;\
|
||||
CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS t1, acc5;\
|
||||
CMOVQCS t2, acc6;\
|
||||
CMOVQCS t3, acc7;\
|
||||
XORQ mul0, mul0;\
|
||||
ADDQ acc0, acc4;\
|
||||
ADCQ acc1, acc5;\
|
||||
ADCQ acc2, acc6;\
|
||||
ADCQ acc3, acc7;\
|
||||
ADCQ $0, mul0;\
|
||||
MOVQ acc4, t0;\
|
||||
MOVQ acc5, t1;\
|
||||
MOVQ acc6, t2;\
|
||||
MOVQ acc7, t3;\
|
||||
SUBQ $-1, t0;\
|
||||
SBBQ p256p<>+0x08(SB), t1;\
|
||||
SBBQ $-1, t2;\
|
||||
SBBQ p256p<>+0x018(SB), t3;\
|
||||
SBBQ $0, mul0;\
|
||||
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS acc5, t1;\
|
||||
CMOVQCS acc6, t2;\
|
||||
CMOVQCS acc7, t3;
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] + [t3, t2, t1, t0]
|
||||
#define p256AddInline \
|
||||
XORQ mul0, mul0;\
|
||||
ADDQ t0, acc4;\
|
||||
ADCQ t1, acc5;\
|
||||
ADCQ t2, acc6;\
|
||||
ADCQ t3, acc7;\
|
||||
ADCQ $0, mul0;\
|
||||
MOVQ acc4, t0;\
|
||||
MOVQ acc5, t1;\
|
||||
MOVQ acc6, t2;\
|
||||
MOVQ acc7, t3;\
|
||||
SUBQ $-1, t0;\
|
||||
SBBQ p256p<>+0x08(SB), t1;\
|
||||
SBBQ $-1, t2;\
|
||||
SBBQ p256p<>+0x018(SB), t3;\
|
||||
SBBQ $0, mul0;\
|
||||
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS acc5, t1;\
|
||||
CMOVQCS acc6, t2;\
|
||||
CMOVQCS acc7, t3;
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]
|
||||
#define p256SubInline \
|
||||
XORQ mul0, mul0;\
|
||||
SUBQ t0, acc4;\
|
||||
SBBQ t1, acc5;\
|
||||
SBBQ t2, acc6;\
|
||||
SBBQ t3, acc7;\
|
||||
SBBQ $0, mul0;\
|
||||
MOVQ acc4, t0;\
|
||||
MOVQ acc5, t1;\
|
||||
MOVQ acc6, t2;\
|
||||
MOVQ acc7, t3;\
|
||||
ADDQ $-1, t0;\
|
||||
ADCQ p256p<>+0x08(SB), t1;\
|
||||
ADCQ $-1, t2;\
|
||||
ADCQ p256p<>+0x018(SB), t3;\
|
||||
ANDQ $1, mul0;\
|
||||
CMOVQEQ acc4, t0;\ // CMOVQEQ: Move if equal (ZF == 1)
|
||||
CMOVQEQ acc5, t1;\
|
||||
CMOVQEQ acc6, t2;\
|
||||
CMOVQEQ acc7, t3;\
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]
|
||||
#define p256SubInline2 \
|
||||
XORQ mul0, mul0;\
|
||||
SUBQ t0, acc4;\
|
||||
SBBQ t1, acc5;\
|
||||
SBBQ t2, acc6;\
|
||||
SBBQ t3, acc7;\
|
||||
SBBQ $0, mul0;\
|
||||
MOVQ acc4, acc0;\
|
||||
MOVQ acc5, acc1;\
|
||||
MOVQ acc6, acc2;\
|
||||
MOVQ acc7, acc3;\
|
||||
ADDQ $-1, acc4;\
|
||||
ADCQ p256p<>+0x08(SB), acc5;\
|
||||
ADCQ $-1, acc6;\
|
||||
ADCQ p256p<>+0x018(SB), acc7;\
|
||||
ANDQ $1, mul0;\
|
||||
CMOVQEQ acc0, acc4;\ // CMOVQEQ: Move if equal (ZF == 1)
|
||||
CMOVQEQ acc1, acc5;\
|
||||
CMOVQEQ acc2, acc6;\
|
||||
CMOVQEQ acc3, acc7;\
|
||||
|
||||
#define p256SqrInternalInline \
|
||||
MOVQ acc4, mul0;\
|
||||
MULQ acc5;\
|
||||
@ -1143,3 +1155,18 @@ GLOBL p256one<>(SB), 8, $32
|
||||
\// Set the zero flag if so.
|
||||
\// CMOVQEQ: Move if equal (ZF == 1)
|
||||
CMOVQEQ t1, AX;
|
||||
|
||||
#define p256PointDoubleInit() \
|
||||
MOVOU (16*0)(BX), X0;\
|
||||
MOVOU (16*1)(BX), X1;\
|
||||
MOVOU (16*2)(BX), X2;\
|
||||
MOVOU (16*3)(BX), X3;\
|
||||
MOVOU (16*4)(BX), X4;\
|
||||
MOVOU (16*5)(BX), X5;\
|
||||
\
|
||||
MOVOU X0, x(16*0);\
|
||||
MOVOU X1, x(16*1);\
|
||||
MOVOU X2, y(16*0);\
|
||||
MOVOU X3, y(16*1);\
|
||||
MOVOU X4, z(16*0);\
|
||||
MOVOU X5, z(16*1);
|
||||
|
@ -4,6 +4,7 @@
|
||||
// 256-bit primes"
|
||||
// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
|
||||
// https://eprint.iacr.org/2013/816.pdf
|
||||
// https://github.com/emmansun/gmsm/wiki/SM2-WWMM-(2)
|
||||
//go:build amd64 && !purego && plugin
|
||||
|
||||
// plugin mode - DO NOT use the R15 Register.
|
||||
@ -12,6 +13,7 @@
|
||||
// 2.p256OrdSqr
|
||||
// 3.sm2P256MulInternal
|
||||
// 4.sm2P256SqrInternal
|
||||
// The most affected one is sm2P256MulInternal, it uses SIMD register X0 as temp storage.
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
@ -359,6 +361,7 @@ internalSqrBMI2:
|
||||
p256SqrInternalInlineAdx
|
||||
RET
|
||||
|
||||
// Below is same as non-plugin
|
||||
/* ---------------------------------------*/
|
||||
#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
|
||||
#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
|
||||
@ -412,98 +415,93 @@ internalSqrBMI2:
|
||||
MOVQ acc2, t2 \
|
||||
MOVQ acc3, t3 \
|
||||
\// Add in case the operand was > p256
|
||||
ADDQ $-1, acc0 \
|
||||
ADCQ p256p<>+0x08(SB), acc1 \
|
||||
ADCQ $-1, acc2 \
|
||||
ADCQ p256p<>+0x018(SB), acc3 \
|
||||
ADCQ $0, mul0 \
|
||||
CMOVQNE t0, acc0 \
|
||||
CMOVQNE t1, acc1 \
|
||||
CMOVQNE t2, acc2 \
|
||||
CMOVQNE t3, acc3 \
|
||||
ADDQ $-1, acc0 \
|
||||
ADCQ p256p<>+0x08(SB), acc1 \
|
||||
ADCQ $-1, acc2 \
|
||||
ADCQ p256p<>+0x018(SB), acc3 \
|
||||
ADCQ $0, mul0 \ // ZF := 1 if mul0 == 0 after ADC
|
||||
CMOVQNE t0, acc0 \ // CMOVQNE: Move if not equal (ZF == 0)
|
||||
CMOVQNE t1, acc1 \
|
||||
CMOVQNE t2, acc2 \
|
||||
CMOVQNE t3, acc3 \
|
||||
\// If condition is 0, keep original value
|
||||
TESTQ DX, DX \
|
||||
CMOVQEQ acc4, acc0 \
|
||||
TESTQ DX, DX \ // ZF := 1 if (DX AND DX == 0)
|
||||
CMOVQEQ acc4, acc0 \ // CMOVQEQ: Move if equal (ZF == 1)
|
||||
CMOVQEQ acc5, acc1 \
|
||||
CMOVQEQ acc6, acc2 \
|
||||
CMOVQEQ acc7, acc3 \
|
||||
\// Store result
|
||||
MOVQ acc0, y2in(8*0) \
|
||||
MOVQ acc1, y2in(8*1) \
|
||||
MOVQ acc2, y2in(8*2) \
|
||||
MOVQ acc3, y2in(8*3) \
|
||||
MOVQ acc0, y2in(8*0) \
|
||||
MOVQ acc1, y2in(8*1) \
|
||||
MOVQ acc2, y2in(8*2) \
|
||||
MOVQ acc3, y2in(8*3) \
|
||||
\// Begin point add
|
||||
LDacc (z1in) \
|
||||
CALL sm2P256SqrInternal(SB) \// z1ˆ2
|
||||
ST (z1sqr) \
|
||||
LDacc (z1in) \
|
||||
CALL sm2P256SqrInternal(SB) \// z1ˆ2
|
||||
ST (z1sqr) \
|
||||
\
|
||||
LDt (x2in) \
|
||||
CALL sm2P256MulInternal(SB) \// x2 * z1ˆ2
|
||||
LDt (x2in) \
|
||||
CALL sm2P256MulInternal(SB) \// u2 = x2 * z1ˆ2
|
||||
\
|
||||
LDt (x1in) \
|
||||
p256SubInline2 \// h = u2 - u1
|
||||
ST (h) \
|
||||
LDt (x1in) \
|
||||
p256SubInline2 \// h = u2 - x1
|
||||
ST (h) \
|
||||
\
|
||||
LDt (z1in) \
|
||||
CALL sm2P256MulInternal(SB) \// z3 = h * z1
|
||||
ST (zout) \
|
||||
LDt (z1in) \
|
||||
CALL sm2P256MulInternal(SB) \// z3 = h * z1
|
||||
ST (zout) \
|
||||
\
|
||||
LDacc (z1sqr) \
|
||||
CALL sm2P256MulInternal(SB) \// z1ˆ3
|
||||
LDacc (z1sqr) \
|
||||
CALL sm2P256MulInternal(SB) \// z1ˆ3
|
||||
\
|
||||
LDt (y2in) \
|
||||
CALL sm2P256MulInternal(SB) \// s2 = y2 * z1ˆ3
|
||||
ST (s2) \
|
||||
LDt (y2in) \
|
||||
CALL sm2P256MulInternal(SB) \// s2 = y2 * z1ˆ3
|
||||
ST (s2) \
|
||||
\
|
||||
LDt (y1in) \
|
||||
p256SubInline2 \// r = s2 - s1
|
||||
ST (r) \
|
||||
LDt (y1in) \
|
||||
p256SubInline2 \// r = s2 - y1
|
||||
ST (r) \
|
||||
\
|
||||
CALL sm2P256SqrInternal(SB) \// rsqr = rˆ2
|
||||
ST (rsqr) \
|
||||
CALL sm2P256SqrInternal(SB) \// rsqr = rˆ2
|
||||
ST (rsqr) \
|
||||
\
|
||||
LDacc (h) \
|
||||
CALL sm2P256SqrInternal(SB) \// hsqr = hˆ2
|
||||
ST (hsqr) \
|
||||
LDacc (h) \
|
||||
CALL sm2P256SqrInternal(SB) \// hsqr = hˆ2
|
||||
ST (hsqr) \
|
||||
\
|
||||
LDt (h) \
|
||||
CALL sm2P256MulInternal(SB) \// hcub = hˆ3
|
||||
ST (hcub) \
|
||||
LDt (h) \
|
||||
CALL sm2P256MulInternal(SB) \// hcub = hˆ3
|
||||
ST (hcub) \
|
||||
\
|
||||
LDt (y1in) \
|
||||
CALL sm2P256MulInternal(SB) \// y1 * hˆ3
|
||||
ST (s2) \
|
||||
LDt (y1in) \
|
||||
CALL sm2P256MulInternal(SB) \// s2 = y1 * hˆ3
|
||||
ST (s2) \
|
||||
\
|
||||
LDacc (x1in) \
|
||||
LDt (hsqr) \
|
||||
CALL sm2P256MulInternal(SB) \// u1 * hˆ2
|
||||
ST (h) \
|
||||
LDacc (x1in) \
|
||||
LDt (hsqr) \
|
||||
CALL sm2P256MulInternal(SB) \// x1 * hˆ2
|
||||
ST (h) \
|
||||
\
|
||||
p256MulBy2Inline \// u1 * hˆ2 * 2, inline
|
||||
LDacc (rsqr) \
|
||||
p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2
|
||||
p256MulBy2Inline \// x1 * hˆ2 * 2, inline
|
||||
LDacc (rsqr) \
|
||||
p256SubInline2 \// rˆ2 - x1 * hˆ2 * 2
|
||||
\
|
||||
LDt (hcub) \
|
||||
p256SubInline2 \
|
||||
ST (xout) \
|
||||
LDt (hcub) \
|
||||
p256SubInline \
|
||||
STt (xout) \// xout = rˆ2 - 2 * x1 * hˆ2 - h^3
|
||||
LDacc (h) \
|
||||
p256SubInline2 \
|
||||
\
|
||||
MOVQ acc4, t0 \
|
||||
MOVQ acc5, t1 \
|
||||
MOVQ acc6, t2 \
|
||||
MOVQ acc7, t3 \
|
||||
LDacc (h) \
|
||||
p256SubInline2 \
|
||||
LDt (r) \
|
||||
CALL sm2P256MulInternal(SB) \
|
||||
\
|
||||
LDt (r) \
|
||||
CALL sm2P256MulInternal(SB) \
|
||||
\
|
||||
LDt (s2) \
|
||||
p256SubInline2 \
|
||||
ST (yout) \
|
||||
LDt (s2) \
|
||||
p256SubInline2 \
|
||||
ST (yout) \
|
||||
\// Load stored values from stack
|
||||
MOVQ rptr, AX \
|
||||
MOVL sel_save, BX \
|
||||
MOVL zero_save, CX \
|
||||
MOVQ rptr, AX \
|
||||
MOVL sel_save, BX \
|
||||
MOVL zero_save, CX \
|
||||
|
||||
// func p256PointAddAffineAsm(res, in1 *SM2P256Point, in2 *p256AffinePoint, sign, sel, zero int)
|
||||
TEXT ·p256PointAddAffineAsm(SB),0,$512-48
|
||||
@ -547,20 +545,20 @@ TEXT ·p256PointAddAffineAsm(SB),0,$512-48
|
||||
MOVOU zout(16*0), X4
|
||||
MOVOU zout(16*1), X5
|
||||
|
||||
MOVL BX, X6
|
||||
MOVL CX, X7
|
||||
MOVL BX, X6 // sel
|
||||
MOVL CX, X7 // zero
|
||||
|
||||
PXOR X8, X8
|
||||
PCMPEQL X9, X9
|
||||
PXOR X8, X8 // X8's bits are all 0
|
||||
PCMPEQL X9, X9 // X9's bits are all 1
|
||||
|
||||
PSHUFD $0, X6, X6
|
||||
PSHUFD $0, X7, X7
|
||||
|
||||
PCMPEQL X8, X6
|
||||
PCMPEQL X8, X7
|
||||
PCMPEQL X8, X6 // X6's bits are all 1 if sel = 0, else are 0
|
||||
PCMPEQL X8, X7 // X7's bits are all 1 if zero = 0, else are 0
|
||||
|
||||
MOVOU X6, X15
|
||||
PANDN X9, X15
|
||||
PANDN X9, X15 // X15 = NOT(X6)
|
||||
|
||||
MOVOU x1in(16*0), X9
|
||||
MOVOU x1in(16*1), X10
|
||||
@ -592,7 +590,7 @@ TEXT ·p256PointAddAffineAsm(SB),0,$512-48
|
||||
// Similarly if zero == 0
|
||||
PCMPEQL X9, X9
|
||||
MOVOU X7, X15
|
||||
PANDN X9, X15
|
||||
PANDN X9, X15 // X15 = NOT(X7)
|
||||
|
||||
MOVOU x2in(16*0), X9
|
||||
MOVOU x2in(16*1), X10
|
||||
@ -645,24 +643,20 @@ pointaddaffine_avx2:
|
||||
|
||||
p256PointAddAffineInline()
|
||||
// The result is not valid if (sel == 0), conditional choose
|
||||
MOVL BX, X6
|
||||
MOVL CX, X7
|
||||
MOVL BX, X6 // sel
|
||||
MOVL CX, X7 // zero
|
||||
|
||||
VPXOR Y8, Y8, Y8
|
||||
VPCMPEQD Y9, Y9, Y9
|
||||
VPXOR Y8, Y8, Y8 // Y8's bits are all 0
|
||||
|
||||
VPBROADCASTD X6, Y6
|
||||
VPBROADCASTD X7, Y7
|
||||
|
||||
VPCMPEQD Y8, Y6, Y6
|
||||
VPCMPEQD Y8, Y7, Y7
|
||||
VPCMPEQD Y8, Y6, Y6 // Y6's bits are all 1 if sel = 0, else are 0
|
||||
VPCMPEQD Y8, Y7, Y7 // Y7's bits are all 1 if zero = 0, else are 0
|
||||
|
||||
VMOVDQU Y6, Y15
|
||||
VPANDN Y9, Y15, Y15
|
||||
|
||||
VPAND xout(32*0), Y15, Y0
|
||||
VPAND yout(32*0), Y15, Y1
|
||||
VPAND zout(32*0), Y15, Y2
|
||||
VPANDN xout(32*0), Y6, Y0
|
||||
VPANDN yout(32*0), Y6, Y1
|
||||
VPANDN zout(32*0), Y6, Y2
|
||||
|
||||
VPAND x1in(32*0), Y6, Y9
|
||||
VPAND y1in(32*0), Y6, Y10
|
||||
@ -673,12 +667,9 @@ pointaddaffine_avx2:
|
||||
VPXOR Y11, Y2, Y2
|
||||
|
||||
// Similarly if zero == 0
|
||||
VPCMPEQD Y9, Y9, Y9
|
||||
VPANDN Y9, Y7, Y15
|
||||
|
||||
VPAND Y15, Y0, Y0
|
||||
VPAND Y15, Y1, Y1
|
||||
VPAND Y15, Y2, Y2
|
||||
VPANDN Y0, Y7, Y0
|
||||
VPANDN Y1, Y7, Y1
|
||||
VPANDN Y2, Y7, Y2
|
||||
|
||||
VPAND x2in(32*0), Y7, Y9
|
||||
VPAND y2in(32*0), Y7, Y10
|
||||
@ -762,7 +753,7 @@ pointaddaffine_avx2:
|
||||
ST (s2) \
|
||||
\
|
||||
LDt (s1) \
|
||||
p256SubInline2 \// r = s2 - s1
|
||||
p256SubInline2 \// r = s2 - s1
|
||||
ST (r) \
|
||||
p256IsZeroInline \
|
||||
MOVQ AX, points_eq \
|
||||
@ -816,13 +807,8 @@ pointaddaffine_avx2:
|
||||
p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2
|
||||
\
|
||||
LDt (hcub) \
|
||||
p256SubInline2 \
|
||||
ST (xout) \
|
||||
\
|
||||
MOVQ acc4, t0 \
|
||||
MOVQ acc5, t1 \
|
||||
MOVQ acc6, t2 \
|
||||
MOVQ acc7, t3 \
|
||||
p256SubInline \
|
||||
STt (xout) \
|
||||
LDacc (u2) \
|
||||
p256SubInline2 \
|
||||
\
|
||||
@ -965,83 +951,83 @@ pointadd_avx2:
|
||||
#define rptr (32*7)(SP)
|
||||
|
||||
#define calZ() \
|
||||
LDacc (z) \
|
||||
CALL sm2P256SqrInternal(SB) \
|
||||
ST (zsqr) \
|
||||
LDacc (z) \
|
||||
CALL sm2P256SqrInternal(SB) \
|
||||
ST (zsqr) \ // ZZ = Z1^2
|
||||
\
|
||||
LDt (x) \
|
||||
p256AddInline \
|
||||
STt (m) \
|
||||
LDt (x) \
|
||||
p256AddInline \
|
||||
STt (m) \ // M = ZZ + X1
|
||||
\
|
||||
LDacc (z) \
|
||||
LDt (y) \
|
||||
CALL sm2P256MulInternal(SB) \
|
||||
p256MulBy2Inline \
|
||||
LDacc (z) \
|
||||
LDt (y) \
|
||||
CALL sm2P256MulInternal(SB) \ // Z1 * Y1
|
||||
p256MulBy2Inline \ // Z3 = 2(Z1 * Y1) = (Y1 + Z1)^2 - Y1^2 - Z1^2
|
||||
|
||||
#define calX() \
|
||||
LDacc (x) \
|
||||
LDt (zsqr) \
|
||||
p256SubInline2 \
|
||||
p256SubInline2 \ // X1 - ZZ
|
||||
LDt (m) \
|
||||
CALL sm2P256MulInternal(SB) \
|
||||
CALL sm2P256MulInternal(SB) \ // M = (X1 - ZZ) * (X1 + ZZ) = X1^2 - ZZ^2
|
||||
ST (m) \
|
||||
\// Multiply by 3
|
||||
p256TripleInline \
|
||||
STt (m) \
|
||||
STt (m) \ // M = 3 * (X1^2 - ZZ^2)
|
||||
\////////////////////////
|
||||
LDacc (y) \
|
||||
p256MulBy2Inline2 \
|
||||
CALL sm2P256SqrInternal(SB) \
|
||||
ST (s) \
|
||||
CALL sm2P256SqrInternal(SB) \
|
||||
CALL sm2P256SqrInternal(SB) \ // 4 * YY = (2*Y1)^2
|
||||
ST (s) \ // S = 4 * YY
|
||||
CALL sm2P256SqrInternal(SB) \ // (4 * YY)^2 = 16 * YYYY
|
||||
\// Divide by 2
|
||||
XORQ mul0, mul0 \
|
||||
MOVQ acc4, t0 \
|
||||
MOVQ acc5, t1 \
|
||||
MOVQ acc6, t2 \
|
||||
MOVQ acc7, t3 \
|
||||
\
|
||||
\ // [mul0, acc7, acc6, acc5, acc4] := [acc7, acc6, acc5, acc4] + P
|
||||
ADDQ $-1, acc4 \
|
||||
ADCQ p256p<>+0x08(SB), acc5 \
|
||||
ADCQ $-1, acc6 \
|
||||
ADCQ p256p<>+0x018(SB), acc7 \
|
||||
ADCQ $0, mul0 \
|
||||
TESTQ $1, t0 \
|
||||
\
|
||||
CMOVQEQ t0, acc4 \
|
||||
CMOVQEQ t1, acc5 \
|
||||
CMOVQEQ t2, acc6 \
|
||||
CMOVQEQ t3, acc7 \
|
||||
ANDQ t0, mul0 \
|
||||
\
|
||||
SHRQ $1, acc5, acc4 \
|
||||
SHRQ $1, acc6, acc5 \
|
||||
SHRQ $1, acc7, acc6 \
|
||||
SHRQ $1, mul0, acc7 \
|
||||
ST (y) \
|
||||
TESTQ $1, t0 \ // ZF := 1 if (t0 AND 1 == 0)
|
||||
\ // CMOVQEQ: Move if equal (ZF == 1)
|
||||
CMOVQEQ t0, acc4 \ // acc4 := t0 if (ZF == 1)
|
||||
CMOVQEQ t1, acc5 \ // acc5 := t1 if (ZF == 1)
|
||||
CMOVQEQ t2, acc6 \ // acc6 := t2 if (ZF == 1)
|
||||
CMOVQEQ t3, acc7 \ // acc7 := t3 if (ZF == 1)
|
||||
ANDQ t0, mul0 \ // mul0 := t0 AND mul0 (mul0 := 0 if (ZF == 1) else keeping the original value 0 or 1)
|
||||
\ // Divide even by 2
|
||||
SHRQ $1, acc5, acc4 \ // acc4 := acc4 >> 1 | acc5 << 63
|
||||
SHRQ $1, acc6, acc5 \ // acc5 := acc5 >> 1 | acc6 << 63
|
||||
SHRQ $1, acc7, acc6 \ // acc6 := acc6 >> 1 | acc7 << 63
|
||||
SHRQ $1, mul0, acc7 \ // acc7 := acc7 >> 1 | mul0 << 63
|
||||
ST (y) \ // Y3 = 8 * YYYY
|
||||
\/////////////////////////
|
||||
LDacc (x) \
|
||||
LDt (s) \
|
||||
CALL sm2P256MulInternal(SB) \
|
||||
ST (s) \
|
||||
CALL sm2P256MulInternal(SB) \ // X1 * 4 * YY
|
||||
ST (s) \ // S = 4 * X1 * YY = 2 * ((X1+YY)^2 - XX - YYYY)
|
||||
p256MulBy2Inline \
|
||||
STt (tmp) \
|
||||
STt (tmp) \ // tmp = 2*S = 8 * X1 * YY
|
||||
\
|
||||
LDacc (m) \
|
||||
CALL sm2P256SqrInternal(SB) \
|
||||
CALL sm2P256SqrInternal(SB) \ // M^2 = (3 * (X1^2 - ZZ^2))^2
|
||||
LDt (tmp) \
|
||||
p256SubInline2 \
|
||||
p256SubInline2 \ // X3 = M^2 - 2*S
|
||||
|
||||
#define calY() \
|
||||
acc2t \
|
||||
LDacc (s) \
|
||||
p256SubInline2 \
|
||||
LDacc (s) \ // S = 4 * X1 * YY = 2 * ((X1+YY)^2 - XX - YYYY)
|
||||
p256SubInline2 \ // S - X3
|
||||
\
|
||||
LDt (m) \
|
||||
CALL sm2P256MulInternal(SB) \
|
||||
CALL sm2P256MulInternal(SB) \ // M * (S - X3)
|
||||
\
|
||||
LDt (y) \
|
||||
p256SubInline2 \
|
||||
p256SubInline2 \ // Y3 = M * (S - X3) - 8 * YYYY
|
||||
|
||||
#define lastP256PointDouble() \
|
||||
\ // See https://hyperelliptic.org/EFD/g1p/data/shortw/jacobian-3/doubling/dbl-2007-bl
|
||||
|
Loading…
x
Reference in New Issue
Block a user