internal/sm2ec: supplement comments

This commit is contained in:
Sun Yimin 2024-03-01 17:40:15 +08:00 committed by GitHub
parent 53ac591635
commit 9f7e3ef018
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 352 additions and 355 deletions

View File

@ -4,6 +4,7 @@
// 256-bit primes"
// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf
// https://github.com/emmansun/gmsm/wiki/SM2-WWMM-(2)
//go:build amd64 && !purego && !plugin
#include "textflag.h"
@ -423,10 +424,10 @@ internalSqrBMI2:
ST (z1sqr) \
\
LDt (x2in) \
CALL sm2P256MulInternal(SB) \// x2 * z1ˆ2
CALL sm2P256MulInternal(SB) \// u2 = x2 * z1ˆ2
\
LDt (x1in) \
p256SubInline2 \// h = u2 - u1
p256SubInline2 \// h = u2 - x1
ST (h) \
\
LDt (z1in) \
@ -441,7 +442,7 @@ internalSqrBMI2:
ST (s2) \
\
LDt (y1in) \
p256SubInline2 \// r = s2 - s1
p256SubInline2 \// r = s2 - y1
ST (r) \
\
CALL sm2P256SqrInternal(SB) \// rsqr = rˆ2
@ -456,26 +457,21 @@ internalSqrBMI2:
ST (hcub) \
\
LDt (y1in) \
CALL sm2P256MulInternal(SB) \// y1 * hˆ3
CALL sm2P256MulInternal(SB) \// s2 = y1 * hˆ3
ST (s2) \
\
LDacc (x1in) \
LDt (hsqr) \
CALL sm2P256MulInternal(SB) \// u1 * hˆ2
CALL sm2P256MulInternal(SB) \// x1 * hˆ2
ST (h) \
\
p256MulBy2Inline \// u1 * hˆ2 * 2, inline
p256MulBy2Inline \// x1 * hˆ2 * 2, inline
LDacc (rsqr) \
p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2
p256SubInline2 \// rˆ2 - x1 * hˆ2 * 2
\
LDt (hcub) \
p256SubInline2 \
ST (xout) \
\
MOVQ acc4, t0 \
MOVQ acc5, t1 \
MOVQ acc6, t2 \
MOVQ acc7, t3 \
p256SubInline \
STt (xout) \// xout = rˆ2 - 2 * x1 * hˆ2 - h^3
LDacc (h) \
p256SubInline2 \
\
@ -532,20 +528,20 @@ TEXT ·p256PointAddAffineAsm(SB),0,$512-48
MOVOU zout(16*0), X4
MOVOU zout(16*1), X5
MOVL BX, X6
MOVL CX, X7
MOVL BX, X6 // sel
MOVL CX, X7 // zero
PXOR X8, X8
PCMPEQL X9, X9
PXOR X8, X8 // X8's bits are all 0
PCMPEQL X9, X9 // X9's bits are all 1
PSHUFD $0, X6, X6
PSHUFD $0, X7, X7
PCMPEQL X8, X6
PCMPEQL X8, X7
PCMPEQL X8, X6 // X6's bits are all 1 if sel = 0, else are 0
PCMPEQL X8, X7 // X7's bits are all 1 if zero = 0, else are 0
MOVOU X6, X15
PANDN X9, X15
PANDN X9, X15 // X15 = NOT(X6)
MOVOU x1in(16*0), X9
MOVOU x1in(16*1), X10
@ -577,7 +573,7 @@ TEXT ·p256PointAddAffineAsm(SB),0,$512-48
// Similarly if zero == 0
PCMPEQL X9, X9
MOVOU X7, X15
PANDN X9, X15
PANDN X9, X15 // X15 = NOT(X7)
MOVOU x2in(16*0), X9
MOVOU x2in(16*1), X10
@ -630,24 +626,20 @@ pointaddaffine_avx2:
p256PointAddAffineInline()
// The result is not valid if (sel == 0), conditional choose
MOVL BX, X6
MOVL CX, X7
MOVL BX, X6 // sel
MOVL CX, X7 // zero
VPXOR Y8, Y8, Y8
VPCMPEQD Y9, Y9, Y9
VPXOR Y8, Y8, Y8 // Y8's bits are all 0
VPBROADCASTD X6, Y6
VPBROADCASTD X7, Y7
VPCMPEQD Y8, Y6, Y6
VPCMPEQD Y8, Y7, Y7
VPCMPEQD Y8, Y6, Y6 // Y6's bits are all 1 if sel = 0, else are 0
VPCMPEQD Y8, Y7, Y7 // Y7's bits are all 1 if zero = 0, else are 0
VMOVDQU Y6, Y15
VPANDN Y9, Y15, Y15
VPAND xout(32*0), Y15, Y0
VPAND yout(32*0), Y15, Y1
VPAND zout(32*0), Y15, Y2
VPANDN xout(32*0), Y6, Y0
VPANDN yout(32*0), Y6, Y1
VPANDN zout(32*0), Y6, Y2
VPAND x1in(32*0), Y6, Y9
VPAND y1in(32*0), Y6, Y10
@ -658,12 +650,9 @@ pointaddaffine_avx2:
VPXOR Y11, Y2, Y2
// Similarly if zero == 0
VPCMPEQD Y9, Y9, Y9
VPANDN Y9, Y7, Y15
VPAND Y15, Y0, Y0
VPAND Y15, Y1, Y1
VPAND Y15, Y2, Y2
VPANDN Y0, Y7, Y0
VPANDN Y1, Y7, Y1
VPANDN Y2, Y7, Y2
VPAND x2in(32*0), Y7, Y9
VPAND y2in(32*0), Y7, Y10
@ -801,13 +790,8 @@ pointaddaffine_avx2:
p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2
\
LDt (hcub) \
p256SubInline2 \
ST (xout) \
\
MOVQ acc4, t0 \
MOVQ acc5, t1 \
MOVQ acc6, t2 \
MOVQ acc7, t3 \
p256SubInline \
STt (xout) \
LDacc (u2) \
p256SubInline2 \
\

View File

@ -32,20 +32,20 @@ GLOBL p256one<>(SB), 8, $32
\ // First reduction step, p as five signed base-2^64 digits [p4, p3, p2, p1, p0] = [1, -0x100000000, 0, (1 - 0x100000000), -1]
MOVQ acc0, AX \
MOVQ acc0, DX \
SHLQ $32, AX \ // AX = L(acc0 * 2^32), low part
SHRQ $32, DX \ // DX = H(acc0 * 2^32), high part
\// calculate the negative part: [0, -0x100000000, 0, -0x100000000] * acc0
SHLQ $32, AX \
SHRQ $32, DX \
\// calculate the negative part: [1, -0x100000000, 0, -0x100000000] * acc0 + [0, acc3, acc2, acc1]
SUBQ AX, acc1 \
SBBQ DX, acc2 \
SBBQ AX, acc3 \
MOVQ acc0, AX \
SBBQ DX, acc0 \
\ // calculate the positive part: [1, 0, 0, 1] * acc0 + [0, acc3, acc2, acc1],
\ // calculate the positive part: [0, 0, 0, AX] + [acc0, acc3, acc2, acc1],
\ // since (-1) * acc0 + acc0 == 0, the lowest limb is 0 and is dropped directly; no carry.
ADDQ AX, acc1 \ // acc1' = L (acc0 + acc1)
ADCQ $0, acc2 \ // acc2' = acc2 + carry1
ADCQ $0, acc3 \ // acc3' = acc3 + carry2
ADCQ $0, acc0 \ // acc0' = acc0 + carry3
ADDQ AX, acc1 \
ADCQ $0, acc2 \
ADCQ $0, acc3 \
ADCQ $0, acc0 \
\ // Second reduction step
MOVQ acc1, AX \
MOVQ acc1, DX \
@ -102,6 +102,7 @@ GLOBL p256one<>(SB), 8, $32
ADCQ x_ptr, acc3 \
ADCQ $0, t0
/* ---------------------------------------*/
#define p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \
MOVQ a0, b0 \
MOVQ a1, b1 \
@ -113,7 +114,7 @@ GLOBL p256one<>(SB), 8, $32
SBBQ $-1, a2 \
SBBQ p256p<>+0x018(SB), a3 \
SBBQ $0, a4 \
\
\ // If the result of the subtraction is negative, restore the previous result
CMOVQCS b0, a0 \ // CMOVQCS: Move if below (CF == 1)
CMOVQCS b1, a1 \
CMOVQCS b2, a2 \
@ -131,13 +132,13 @@ GLOBL p256one<>(SB), 8, $32
MOVQ a1, b1 \
MOVQ a2, b2 \
MOVQ a3, b3 \
\// Subtract p256
\// Subtract p256ord
SUBQ p256ord<>+0x00(SB), a0 \
SBBQ p256ord<>+0x08(SB) ,a1 \
SBBQ p256ord<>+0x10(SB), a2 \
SBBQ p256ord<>+0x18(SB), a3 \
SBBQ $0, a4 \
\
\ // If the result of the subtraction is negative, restore the previous result
CMOVQCS b0, a0 \ // CMOVQCS: Move if below (CF == 1)
CMOVQCS b1, a1 \
CMOVQCS b2, a2 \
@ -148,6 +149,7 @@ GLOBL p256one<>(SB), 8, $32
MOVQ a2, (8*2)(res) \
MOVQ a3, (8*3)(res)
/* ---------------------------------------*/
#define sm2P256SqrReductionInline \
\ // First reduction step
MOVQ acc0, mul0 \
@ -237,6 +239,7 @@ GLOBL p256one<>(SB), 8, $32
CMOVQCS t2, acc6 \
CMOVQCS t3, acc7
/* ---------------------------------------*/
#define sm2P256MulReductionInline \
\// First reduction step
MOVQ acc0, mul0 \
@ -303,159 +306,6 @@ GLOBL p256one<>(SB), 8, $32
ADCQ $0, acc2 \
ADCQ $0, acc3
// p256PointDoubleInit copies 96 bytes (six 16-byte chunks) from the address
// held in BX into the local x, y and z slots, 32 bytes each, using X0-X5 as
// staging registers.
// NOTE(review): x, y, z are offset macros defined outside this view;
// presumably they name stack slots of the caller's frame - confirm.
// Clobbers: X0-X5.
#define p256PointDoubleInit() \
MOVOU (16*0)(BX), X0;\
MOVOU (16*1)(BX), X1;\
MOVOU (16*2)(BX), X2;\
MOVOU (16*3)(BX), X3;\
MOVOU (16*4)(BX), X4;\
MOVOU (16*5)(BX), X5;\
\
MOVOU X0, x(16*0);\
MOVOU X1, x(16*1);\
MOVOU X2, y(16*0);\
MOVOU X3, y(16*1);\
MOVOU X4, z(16*0);\
MOVOU X5, z(16*1);
/* ---------------------------------------*/
// [t3, t2, t1, t0] = 2[acc7, acc6, acc5, acc4]
//
// Doubles the 256-bit value in acc4..acc7 in place (the carry-out is kept in
// mul0), copies the doubled value to t0..t3 and subtracts the modulus p256p
// from the copy. If that subtraction, including the carry limb in mul0,
// borrows (CF == 1), the doubled value was already below the modulus and the
// unsubtracted copy in acc4..acc7 is restored into t0..t3.
// The immediates $-1 stand in for limbs 0 and 2 of the modulus
// (presumably both are all ones in p256p - confirm against its definition).
// Result: t0..t3. Side effects: acc4..acc7 are left holding the low 256 bits
// of the unreduced doubled value; mul0 and the flags are clobbered.
#define p256MulBy2Inline\
XORQ mul0, mul0;\
ADDQ acc4, acc4;\
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, t0;\
SBBQ p256p<>+0x08(SB), t1;\
SBBQ $-1, t2;\
SBBQ p256p<>+0x018(SB), t3;\
SBBQ $0, mul0;\
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
/* ---------------------------------------*/
// [acc7, acc6, acc5, acc4] = 2[acc7, acc6, acc5, acc4]
//
// Same computation as p256MulBy2Inline, but the reduced result is produced
// in acc4..acc7: the doubled value is saved to t0..t3, the modulus p256p is
// subtracted from acc4..acc7, and if the subtraction (including the carry
// limb in mul0) borrows (CF == 1) the saved value in t0..t3 is restored.
// Result: acc4..acc7. Side effects: t0..t3 are left holding the low 256 bits
// of the unreduced doubled value; mul0 and the flags are clobbered.
#define p256MulBy2Inline2\
XORQ mul0, mul0;\
ADDQ acc4, acc4;\
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, acc4;\
SBBQ p256p<>+0x08(SB), acc5;\
SBBQ $-1, acc6;\
SBBQ p256p<>+0x018(SB), acc7;\
SBBQ $0, mul0;\
CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS t1, acc5;\
CMOVQCS t2, acc6;\
CMOVQCS t3, acc7;
/* ---------------------------------------*/
// [t3, t2, t1, t0] = 3[acc7, acc6, acc5, acc4]
//
// Two chained steps, each followed by one conditional subtraction of the
// modulus p256p:
//   1. The input is saved in acc0..acc3, then doubled and reduced in
//      acc4..acc7 (same sequence as p256MulBy2Inline2, using t0..t3 as the
//      backup of the unreduced value).
//   2. The saved input in acc0..acc3 is added to acc4..acc7, the sum is
//      copied to t0..t3, the modulus is subtracted from the copy, and the
//      unsubtracted sum is restored into t0..t3 if the subtraction borrows
//      (CF == 1).
// Result: t0..t3. Side effects: acc0..acc3 hold the original input,
// acc4..acc7 hold the low 256 bits of the unreduced final sum; mul0 and the
// flags are clobbered.
#define p256TripleInline\
XORQ mul0, mul0;\
MOVQ acc4, acc0;\
MOVQ acc5, acc1;\
MOVQ acc6, acc2;\
MOVQ acc7, acc3;\
ADDQ acc4, acc4;\
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, acc4;\
SBBQ p256p<>+0x08(SB), acc5;\
SBBQ $-1, acc6;\
SBBQ p256p<>+0x018(SB), acc7;\
SBBQ $0, mul0;\
CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS t1, acc5;\
CMOVQCS t2, acc6;\
CMOVQCS t3, acc7;\
XORQ mul0, mul0;\
ADDQ acc0, acc4;\
ADCQ acc1, acc5;\
ADCQ acc2, acc6;\
ADCQ acc3, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, t0;\
SBBQ p256p<>+0x08(SB), t1;\
SBBQ $-1, t2;\
SBBQ p256p<>+0x018(SB), t3;\
SBBQ $0, mul0;\
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
/* ---------------------------------------*/
// [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] + [t3, t2, t1, t0]
//
// Adds t0..t3 into acc4..acc7 (the carry-out is kept in mul0), copies the sum
// to t0..t3 and subtracts the modulus p256p from the copy. If that
// subtraction, including the carry limb in mul0, borrows (CF == 1), the
// unsubtracted sum in acc4..acc7 is restored into t0..t3.
// Result: t0..t3. Side effects: acc4..acc7 are overwritten with the low
// 256 bits of the unreduced sum; mul0 and the flags are clobbered.
// NOTE(review): a single conditional subtraction only yields a fully reduced
// value if both inputs are already below the modulus - confirm at call sites.
#define p256AddInline \
XORQ mul0, mul0;\
ADDQ t0, acc4;\
ADCQ t1, acc5;\
ADCQ t2, acc6;\
ADCQ t3, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, t0;\
SBBQ p256p<>+0x08(SB), t1;\
SBBQ $-1, t2;\
SBBQ p256p<>+0x018(SB), t3;\
SBBQ $0, mul0;\
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
/* ---------------------------------------*/
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]
//
// Subtracts t0..t3 from acc4..acc7; the final borrow is propagated into mul0
// (mul0 becomes non-zero only when the subtraction borrowed). The raw
// difference is saved in acc0..acc3 and the modulus p256p is then added to
// acc4..acc7. ANDQ $1, mul0 sets ZF when there was no borrow, in which case
// the raw difference is restored, i.e. the modulus is added back only when
// the subtraction went negative.
// The immediates $-1 stand in for limbs 0 and 2 of the modulus
// (presumably both are all ones in p256p - confirm against its definition).
// Result: acc4..acc7. Side effects: acc0..acc3 hold the raw difference;
// mul0 and the flags are clobbered. t0..t3 are not modified.
// The last line intentionally carries no trailing line continuation, so the
// macro ends here regardless of what follows it in the file.
#define p256SubInline2 \
XORQ mul0, mul0;\
SUBQ t0, acc4;\
SBBQ t1, acc5;\
SBBQ t2, acc6;\
SBBQ t3, acc7;\
SBBQ $0, mul0;\
MOVQ acc4, acc0;\
MOVQ acc5, acc1;\
MOVQ acc6, acc2;\
MOVQ acc7, acc3;\
ADDQ $-1, acc4;\
ADCQ p256p<>+0x08(SB), acc5;\
ADCQ $-1, acc6;\
ADCQ p256p<>+0x018(SB), acc7;\
ANDQ $1, mul0;\
CMOVQEQ acc0, acc4;\ // CMOVQEQ: Move if equal (ZF == 1)
CMOVQEQ acc1, acc5;\
CMOVQEQ acc2, acc6;\
CMOVQEQ acc3, acc7;
/* ---------------------------------------*/
#define p256SqrRound(t1) \
\// y[1:] * y[0]
@ -891,7 +741,7 @@ GLOBL p256one<>(SB), 8, $32
MULXQ p256ordK0<>(SB), DX, AX;\
\
MULXQ p256ord<>+0x00(SB), AX, t0;\
ADOXQ AX, acc0 ;\// (carry1, acc0) = acc0 + t0 * ord0
ADOXQ AX, acc0;\// (carry1, acc0) = acc0 + t0 * ord0
\
MULXQ p256ord<>+0x08(SB), AX, t1;\
ADCXQ t0, AX;\
@ -985,6 +835,168 @@ GLOBL p256one<>(SB), 8, $32
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\
MOVQ res_ptr, x_ptr;
// The macros below are used for point operations
/* ---------------------------------------*/
// [t3, t2, t1, t0] = 2[acc7, acc6, acc5, acc4]
//
// Doubles the 256-bit value in acc4..acc7 in place (the carry-out is kept in
// mul0), copies the doubled value to t0..t3 and subtracts the modulus p256p
// from the copy. If that subtraction, including the carry limb in mul0,
// borrows (CF == 1), the doubled value was already below the modulus and the
// unsubtracted copy in acc4..acc7 is restored into t0..t3.
// The immediates $-1 stand in for limbs 0 and 2 of the modulus
// (presumably both are all ones in p256p - confirm against its definition).
// Result: t0..t3. Side effects: acc4..acc7 are left holding the low 256 bits
// of the unreduced doubled value; mul0 and the flags are clobbered.
#define p256MulBy2Inline\
XORQ mul0, mul0;\
ADDQ acc4, acc4;\
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, t0;\
SBBQ p256p<>+0x08(SB), t1;\
SBBQ $-1, t2;\
SBBQ p256p<>+0x018(SB), t3;\
SBBQ $0, mul0;\
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
/* ---------------------------------------*/
// [acc7, acc6, acc5, acc4] = 2[acc7, acc6, acc5, acc4]
//
// Same computation as p256MulBy2Inline, but the reduced result is produced
// in acc4..acc7: the doubled value is saved to t0..t3, the modulus p256p is
// subtracted from acc4..acc7, and if the subtraction (including the carry
// limb in mul0) borrows (CF == 1) the saved value in t0..t3 is restored.
// Result: acc4..acc7. Side effects: t0..t3 are left holding the low 256 bits
// of the unreduced doubled value; mul0 and the flags are clobbered.
#define p256MulBy2Inline2\
XORQ mul0, mul0;\
ADDQ acc4, acc4;\
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, acc4;\
SBBQ p256p<>+0x08(SB), acc5;\
SBBQ $-1, acc6;\
SBBQ p256p<>+0x018(SB), acc7;\
SBBQ $0, mul0;\
CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS t1, acc5;\
CMOVQCS t2, acc6;\
CMOVQCS t3, acc7;
/* ---------------------------------------*/
// [t3, t2, t1, t0] = 3[acc7, acc6, acc5, acc4]
//
// Two chained steps, each followed by one conditional subtraction of the
// modulus p256p:
//   1. The input is saved in acc0..acc3, then doubled and reduced in
//      acc4..acc7 (same sequence as p256MulBy2Inline2, using t0..t3 as the
//      backup of the unreduced value).
//   2. The saved input in acc0..acc3 is added to acc4..acc7, the sum is
//      copied to t0..t3, the modulus is subtracted from the copy, and the
//      unsubtracted sum is restored into t0..t3 if the subtraction borrows
//      (CF == 1).
// Result: t0..t3. Side effects: acc0..acc3 hold the original input,
// acc4..acc7 hold the low 256 bits of the unreduced final sum; mul0 and the
// flags are clobbered.
#define p256TripleInline\
XORQ mul0, mul0;\
MOVQ acc4, acc0;\
MOVQ acc5, acc1;\
MOVQ acc6, acc2;\
MOVQ acc7, acc3;\
ADDQ acc4, acc4;\
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, acc4;\
SBBQ p256p<>+0x08(SB), acc5;\
SBBQ $-1, acc6;\
SBBQ p256p<>+0x018(SB), acc7;\
SBBQ $0, mul0;\
CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS t1, acc5;\
CMOVQCS t2, acc6;\
CMOVQCS t3, acc7;\
XORQ mul0, mul0;\
ADDQ acc0, acc4;\
ADCQ acc1, acc5;\
ADCQ acc2, acc6;\
ADCQ acc3, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, t0;\
SBBQ p256p<>+0x08(SB), t1;\
SBBQ $-1, t2;\
SBBQ p256p<>+0x018(SB), t3;\
SBBQ $0, mul0;\
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
/* ---------------------------------------*/
// [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] + [t3, t2, t1, t0]
//
// Adds t0..t3 into acc4..acc7 (the carry-out is kept in mul0), copies the sum
// to t0..t3 and subtracts the modulus p256p from the copy. If that
// subtraction, including the carry limb in mul0, borrows (CF == 1), the
// unsubtracted sum in acc4..acc7 is restored into t0..t3.
// Result: t0..t3. Side effects: acc4..acc7 are overwritten with the low
// 256 bits of the unreduced sum; mul0 and the flags are clobbered.
// NOTE(review): a single conditional subtraction only yields a fully reduced
// value if both inputs are already below the modulus - confirm at call sites.
#define p256AddInline \
XORQ mul0, mul0;\
ADDQ t0, acc4;\
ADCQ t1, acc5;\
ADCQ t2, acc6;\
ADCQ t3, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, t0;\
SBBQ p256p<>+0x08(SB), t1;\
SBBQ $-1, t2;\
SBBQ p256p<>+0x018(SB), t3;\
SBBQ $0, mul0;\
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
/* ---------------------------------------*/
// [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]
//
// Subtracts t0..t3 from acc4..acc7; the final borrow is propagated into mul0
// (mul0 becomes non-zero only when the subtraction borrowed). The raw
// difference is copied to t0..t3 and the modulus p256p is added to the copy.
// ANDQ $1, mul0 sets ZF when there was no borrow, in which case the raw
// difference in acc4..acc7 is restored into t0..t3, i.e. the modulus is
// added back only when the subtraction went negative.
// The immediates $-1 stand in for limbs 0 and 2 of the modulus
// (presumably both are all ones in p256p - confirm against its definition).
// Result: t0..t3. Side effects: acc4..acc7 are overwritten with the raw
// difference; mul0 and the flags are clobbered.
// The last line intentionally carries no trailing line continuation, so the
// macro ends here regardless of what follows it in the file.
#define p256SubInline \
XORQ mul0, mul0;\
SUBQ t0, acc4;\
SBBQ t1, acc5;\
SBBQ t2, acc6;\
SBBQ t3, acc7;\
SBBQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
ADDQ $-1, t0;\
ADCQ p256p<>+0x08(SB), t1;\
ADCQ $-1, t2;\
ADCQ p256p<>+0x018(SB), t3;\
ANDQ $1, mul0;\
CMOVQEQ acc4, t0;\ // CMOVQEQ: Move if equal (ZF == 1)
CMOVQEQ acc5, t1;\
CMOVQEQ acc6, t2;\
CMOVQEQ acc7, t3;
/* ---------------------------------------*/
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]
//
// Subtracts t0..t3 from acc4..acc7; the final borrow is propagated into mul0
// (mul0 becomes non-zero only when the subtraction borrowed). The raw
// difference is saved in acc0..acc3 and the modulus p256p is then added to
// acc4..acc7. ANDQ $1, mul0 sets ZF when there was no borrow, in which case
// the raw difference is restored, i.e. the modulus is added back only when
// the subtraction went negative.
// The immediates $-1 stand in for limbs 0 and 2 of the modulus
// (presumably both are all ones in p256p - confirm against its definition).
// Result: acc4..acc7. Side effects: acc0..acc3 hold the raw difference;
// mul0 and the flags are clobbered. t0..t3 are not modified.
// The last line intentionally carries no trailing line continuation, so the
// macro ends here and cannot absorb the definition that follows it.
#define p256SubInline2 \
XORQ mul0, mul0;\
SUBQ t0, acc4;\
SBBQ t1, acc5;\
SBBQ t2, acc6;\
SBBQ t3, acc7;\
SBBQ $0, mul0;\
MOVQ acc4, acc0;\
MOVQ acc5, acc1;\
MOVQ acc6, acc2;\
MOVQ acc7, acc3;\
ADDQ $-1, acc4;\
ADCQ p256p<>+0x08(SB), acc5;\
ADCQ $-1, acc6;\
ADCQ p256p<>+0x018(SB), acc7;\
ANDQ $1, mul0;\
CMOVQEQ acc0, acc4;\ // CMOVQEQ: Move if equal (ZF == 1)
CMOVQEQ acc1, acc5;\
CMOVQEQ acc2, acc6;\
CMOVQEQ acc3, acc7;
#define p256SqrInternalInline \
MOVQ acc4, mul0;\
MULQ acc5;\
@ -1143,3 +1155,18 @@ GLOBL p256one<>(SB), 8, $32
\// Set the zero flag if so.
\// CMOVQEQ: Move if equal (ZF == 1)
CMOVQEQ t1, AX;
// p256PointDoubleInit copies 96 bytes (six 16-byte chunks) from the address
// held in BX into the local x, y and z slots, 32 bytes each, using X0-X5 as
// staging registers.
// NOTE(review): x, y, z are offset macros defined outside this view;
// presumably they name stack slots of the caller's frame - confirm.
// Clobbers: X0-X5.
#define p256PointDoubleInit() \
MOVOU (16*0)(BX), X0;\
MOVOU (16*1)(BX), X1;\
MOVOU (16*2)(BX), X2;\
MOVOU (16*3)(BX), X3;\
MOVOU (16*4)(BX), X4;\
MOVOU (16*5)(BX), X5;\
\
MOVOU X0, x(16*0);\
MOVOU X1, x(16*1);\
MOVOU X2, y(16*0);\
MOVOU X3, y(16*1);\
MOVOU X4, z(16*0);\
MOVOU X5, z(16*1);

View File

@ -4,6 +4,7 @@
// 256-bit primes"
// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf
// https://github.com/emmansun/gmsm/wiki/SM2-WWMM-(2)
//go:build amd64 && !purego && plugin
// plugin mode - DO NOT use the R15 Register.
@ -12,6 +13,7 @@
// 2.p256OrdSqr
// 3.sm2P256MulInternal
// 4.sm2P256SqrInternal
// The most affected one is sm2P256MulInternal; it uses SIMD register X0 as temporary storage.
#include "textflag.h"
@ -359,6 +361,7 @@ internalSqrBMI2:
p256SqrInternalInlineAdx
RET
// The code below is the same as in the non-plugin version
/* ---------------------------------------*/
#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
@ -416,14 +419,14 @@ internalSqrBMI2:
ADCQ p256p<>+0x08(SB), acc1 \
ADCQ $-1, acc2 \
ADCQ p256p<>+0x018(SB), acc3 \
ADCQ $0, mul0 \
CMOVQNE t0, acc0 \
ADCQ $0, mul0 \ // ZF := 1 if mul0 == 0 after ADC
CMOVQNE t0, acc0 \ // CMOVQNE: Move if not equal (ZF == 0)
CMOVQNE t1, acc1 \
CMOVQNE t2, acc2 \
CMOVQNE t3, acc3 \
\// If condition is 0, keep original value
TESTQ DX, DX \
CMOVQEQ acc4, acc0 \
TESTQ DX, DX \ // ZF := 1 if (DX AND DX == 0)
CMOVQEQ acc4, acc0 \ // CMOVQEQ: Move if equal (ZF == 1)
CMOVQEQ acc5, acc1 \
CMOVQEQ acc6, acc2 \
CMOVQEQ acc7, acc3 \
@ -438,10 +441,10 @@ internalSqrBMI2:
ST (z1sqr) \
\
LDt (x2in) \
CALL sm2P256MulInternal(SB) \// x2 * z1ˆ2
CALL sm2P256MulInternal(SB) \// u2 = x2 * z1ˆ2
\
LDt (x1in) \
p256SubInline2 \// h = u2 - u1
p256SubInline2 \// h = u2 - x1
ST (h) \
\
LDt (z1in) \
@ -456,7 +459,7 @@ internalSqrBMI2:
ST (s2) \
\
LDt (y1in) \
p256SubInline2 \// r = s2 - s1
p256SubInline2 \// r = s2 - y1
ST (r) \
\
CALL sm2P256SqrInternal(SB) \// rsqr = rˆ2
@ -471,26 +474,21 @@ internalSqrBMI2:
ST (hcub) \
\
LDt (y1in) \
CALL sm2P256MulInternal(SB) \// y1 * hˆ3
CALL sm2P256MulInternal(SB) \// s2 = y1 * hˆ3
ST (s2) \
\
LDacc (x1in) \
LDt (hsqr) \
CALL sm2P256MulInternal(SB) \// u1 * hˆ2
CALL sm2P256MulInternal(SB) \// x1 * hˆ2
ST (h) \
\
p256MulBy2Inline \// u1 * hˆ2 * 2, inline
p256MulBy2Inline \// x1 * hˆ2 * 2, inline
LDacc (rsqr) \
p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2
p256SubInline2 \// rˆ2 - x1 * hˆ2 * 2
\
LDt (hcub) \
p256SubInline2 \
ST (xout) \
\
MOVQ acc4, t0 \
MOVQ acc5, t1 \
MOVQ acc6, t2 \
MOVQ acc7, t3 \
p256SubInline \
STt (xout) \// xout = rˆ2 - 2 * x1 * hˆ2 - h^3
LDacc (h) \
p256SubInline2 \
\
@ -547,20 +545,20 @@ TEXT ·p256PointAddAffineAsm(SB),0,$512-48
MOVOU zout(16*0), X4
MOVOU zout(16*1), X5
MOVL BX, X6
MOVL CX, X7
MOVL BX, X6 // sel
MOVL CX, X7 // zero
PXOR X8, X8
PCMPEQL X9, X9
PXOR X8, X8 // X8's bits are all 0
PCMPEQL X9, X9 // X9's bits are all 1
PSHUFD $0, X6, X6
PSHUFD $0, X7, X7
PCMPEQL X8, X6
PCMPEQL X8, X7
PCMPEQL X8, X6 // X6's bits are all 1 if sel = 0, else are 0
PCMPEQL X8, X7 // X7's bits are all 1 if zero = 0, else are 0
MOVOU X6, X15
PANDN X9, X15
PANDN X9, X15 // X15 = NOT(X6)
MOVOU x1in(16*0), X9
MOVOU x1in(16*1), X10
@ -592,7 +590,7 @@ TEXT ·p256PointAddAffineAsm(SB),0,$512-48
// Similarly if zero == 0
PCMPEQL X9, X9
MOVOU X7, X15
PANDN X9, X15
PANDN X9, X15 // X15 = NOT(X7)
MOVOU x2in(16*0), X9
MOVOU x2in(16*1), X10
@ -645,24 +643,20 @@ pointaddaffine_avx2:
p256PointAddAffineInline()
// The result is not valid if (sel == 0), conditional choose
MOVL BX, X6
MOVL CX, X7
MOVL BX, X6 // sel
MOVL CX, X7 // zero
VPXOR Y8, Y8, Y8
VPCMPEQD Y9, Y9, Y9
VPXOR Y8, Y8, Y8 // Y8's bits are all 0
VPBROADCASTD X6, Y6
VPBROADCASTD X7, Y7
VPCMPEQD Y8, Y6, Y6
VPCMPEQD Y8, Y7, Y7
VPCMPEQD Y8, Y6, Y6 // Y6's bits are all 1 if sel = 0, else are 0
VPCMPEQD Y8, Y7, Y7 // Y7's bits are all 1 if zero = 0, else are 0
VMOVDQU Y6, Y15
VPANDN Y9, Y15, Y15
VPAND xout(32*0), Y15, Y0
VPAND yout(32*0), Y15, Y1
VPAND zout(32*0), Y15, Y2
VPANDN xout(32*0), Y6, Y0
VPANDN yout(32*0), Y6, Y1
VPANDN zout(32*0), Y6, Y2
VPAND x1in(32*0), Y6, Y9
VPAND y1in(32*0), Y6, Y10
@ -673,12 +667,9 @@ pointaddaffine_avx2:
VPXOR Y11, Y2, Y2
// Similarly if zero == 0
VPCMPEQD Y9, Y9, Y9
VPANDN Y9, Y7, Y15
VPAND Y15, Y0, Y0
VPAND Y15, Y1, Y1
VPAND Y15, Y2, Y2
VPANDN Y0, Y7, Y0
VPANDN Y1, Y7, Y1
VPANDN Y2, Y7, Y2
VPAND x2in(32*0), Y7, Y9
VPAND y2in(32*0), Y7, Y10
@ -816,13 +807,8 @@ pointaddaffine_avx2:
p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2
\
LDt (hcub) \
p256SubInline2 \
ST (xout) \
\
MOVQ acc4, t0 \
MOVQ acc5, t1 \
MOVQ acc6, t2 \
MOVQ acc7, t3 \
p256SubInline \
STt (xout) \
LDacc (u2) \
p256SubInline2 \
\
@ -967,81 +953,81 @@ pointadd_avx2:
#define calZ() \
LDacc (z) \
CALL sm2P256SqrInternal(SB) \
ST (zsqr) \
ST (zsqr) \ // ZZ = Z1^2
\
LDt (x) \
p256AddInline \
STt (m) \
STt (m) \ // M = ZZ + X1
\
LDacc (z) \
LDt (y) \
CALL sm2P256MulInternal(SB) \
p256MulBy2Inline \
CALL sm2P256MulInternal(SB) \ // Z1 * Y1
p256MulBy2Inline \ // Z3 = 2(Z1 * Y1) = (Y1 + Z1)^2 - Y1^2 - Z1^2
#define calX() \
LDacc (x) \
LDt (zsqr) \
p256SubInline2 \
p256SubInline2 \ // X1 - ZZ
LDt (m) \
CALL sm2P256MulInternal(SB) \
CALL sm2P256MulInternal(SB) \ // M = (X1 - ZZ) * (X1 + ZZ) = X1^2 - ZZ^2
ST (m) \
\// Multiply by 3
p256TripleInline \
STt (m) \
STt (m) \ // M = 3 * (X1^2 - ZZ^2)
\////////////////////////
LDacc (y) \
p256MulBy2Inline2 \
CALL sm2P256SqrInternal(SB) \
ST (s) \
CALL sm2P256SqrInternal(SB) \
CALL sm2P256SqrInternal(SB) \ // 4 * YY = (2*Y1)^2
ST (s) \ // S = 4 * YY
CALL sm2P256SqrInternal(SB) \ // (4 * YY)^2 = 16 * YYYY
\// Divide by 2
XORQ mul0, mul0 \
MOVQ acc4, t0 \
MOVQ acc5, t1 \
MOVQ acc6, t2 \
MOVQ acc7, t3 \
\
\ // [mul0, acc7, acc6, acc5, acc4] := [acc7, acc6, acc5, acc4] + P
ADDQ $-1, acc4 \
ADCQ p256p<>+0x08(SB), acc5 \
ADCQ $-1, acc6 \
ADCQ p256p<>+0x018(SB), acc7 \
ADCQ $0, mul0 \
TESTQ $1, t0 \
\
CMOVQEQ t0, acc4 \
CMOVQEQ t1, acc5 \
CMOVQEQ t2, acc6 \
CMOVQEQ t3, acc7 \
ANDQ t0, mul0 \
\
SHRQ $1, acc5, acc4 \
SHRQ $1, acc6, acc5 \
SHRQ $1, acc7, acc6 \
SHRQ $1, mul0, acc7 \
ST (y) \
TESTQ $1, t0 \ // ZF := 1 if (t0 AND 1 == 0)
\ // CMOVQEQ: Move if equal (ZF == 1)
CMOVQEQ t0, acc4 \ // acc4 := t0 if (ZF == 1)
CMOVQEQ t1, acc5 \ // acc5 := t1 if (ZF == 1)
CMOVQEQ t2, acc6 \ // acc6 := t2 if (ZF == 1)
CMOVQEQ t3, acc7 \ // acc7 := t3 if (ZF == 1)
ANDQ t0, mul0 \ // mul0 := t0 AND mul0 (mul0 := 0 if (ZF == 1) else keeping the original value 0 or 1)
\ // Divide even by 2
SHRQ $1, acc5, acc4 \ // acc4 := acc4 >> 1 | acc5 << 63
SHRQ $1, acc6, acc5 \ // acc5 := acc5 >> 1 | acc6 << 63
SHRQ $1, acc7, acc6 \ // acc6 := acc6 >> 1 | acc7 << 63
SHRQ $1, mul0, acc7 \ // acc7 := acc7 >> 1 | mul0 << 63
ST (y) \ // Y3 = 8 * YYYY
\/////////////////////////
LDacc (x) \
LDt (s) \
CALL sm2P256MulInternal(SB) \
ST (s) \
CALL sm2P256MulInternal(SB) \ // X1 * 4 * YY
ST (s) \ // S = 4 * X1 * YY = 2 * ((X1+YY)^2 - XX - YYYY)
p256MulBy2Inline \
STt (tmp) \
STt (tmp) \ // tmp = 2*S = 8 * X1 * YY
\
LDacc (m) \
CALL sm2P256SqrInternal(SB) \
CALL sm2P256SqrInternal(SB) \ // M^2 = (3 * (X1^2 - ZZ^2))^2
LDt (tmp) \
p256SubInline2 \
p256SubInline2 \ // X3 = M^2 - 2*S
#define calY() \
acc2t \
LDacc (s) \
p256SubInline2 \
LDacc (s) \ // S = 4 * X1 * YY = 2 * ((X1+YY)^2 - XX - YYYY)
p256SubInline2 \ // S - X3
\
LDt (m) \
CALL sm2P256MulInternal(SB) \
CALL sm2P256MulInternal(SB) \ // M * (S - X3)
\
LDt (y) \
p256SubInline2 \
p256SubInline2 \ // Y3 = M * (S - X3) - 8 * YYYY
#define lastP256PointDouble() \
\ // See https://hyperelliptic.org/EFD/g1p/data/shortw/jacobian-3/doubling/dbl-2007-bl