internal/sm2ec: supplement comments

This commit is contained in:
Sun Yimin 2024-03-01 17:40:15 +08:00 committed by GitHub
parent 53ac591635
commit 9f7e3ef018
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 352 additions and 355 deletions

View File

@ -4,6 +4,7 @@
// 256-bit primes" // 256-bit primes"
// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf // https://eprint.iacr.org/2013/816.pdf
// https://github.com/emmansun/gmsm/wiki/SM2-WWMM-(2)
//go:build amd64 && !purego && !plugin //go:build amd64 && !purego && !plugin
#include "textflag.h" #include "textflag.h"
@ -423,10 +424,10 @@ internalSqrBMI2:
ST (z1sqr) \ ST (z1sqr) \
\ \
LDt (x2in) \ LDt (x2in) \
CALL sm2P256MulInternal(SB) \// x2 * z1ˆ2 CALL sm2P256MulInternal(SB) \// u2 = x2 * z1ˆ2
\ \
LDt (x1in) \ LDt (x1in) \
p256SubInline2 \// h = u2 - u1 p256SubInline2 \// h = u2 - x1
ST (h) \ ST (h) \
\ \
LDt (z1in) \ LDt (z1in) \
@ -441,7 +442,7 @@ internalSqrBMI2:
ST (s2) \ ST (s2) \
\ \
LDt (y1in) \ LDt (y1in) \
p256SubInline2 \// r = s2 - s1 p256SubInline2 \// r = s2 - y1
ST (r) \ ST (r) \
\ \
CALL sm2P256SqrInternal(SB) \// rsqr = rˆ2 CALL sm2P256SqrInternal(SB) \// rsqr = rˆ2
@ -456,26 +457,21 @@ internalSqrBMI2:
ST (hcub) \ ST (hcub) \
\ \
LDt (y1in) \ LDt (y1in) \
CALL sm2P256MulInternal(SB) \// y1 * hˆ3 CALL sm2P256MulInternal(SB) \// s2 = y1 * hˆ3
ST (s2) \ ST (s2) \
\ \
LDacc (x1in) \ LDacc (x1in) \
LDt (hsqr) \ LDt (hsqr) \
CALL sm2P256MulInternal(SB) \// u1 * hˆ2 CALL sm2P256MulInternal(SB) \// x1 * hˆ2
ST (h) \ ST (h) \
\ \
p256MulBy2Inline \// u1 * hˆ2 * 2, inline p256MulBy2Inline \// x1 * hˆ2 * 2, inline
LDacc (rsqr) \ LDacc (rsqr) \
p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2 p256SubInline2 \// rˆ2 - x1 * hˆ2 * 2
\ \
LDt (hcub) \ LDt (hcub) \
p256SubInline2 \ p256SubInline \
ST (xout) \ STt (xout) \// xout = rˆ2 - 2 * x1 * hˆ2 - h^3
\
MOVQ acc4, t0 \
MOVQ acc5, t1 \
MOVQ acc6, t2 \
MOVQ acc7, t3 \
LDacc (h) \ LDacc (h) \
p256SubInline2 \ p256SubInline2 \
\ \
@ -532,20 +528,20 @@ TEXT ·p256PointAddAffineAsm(SB),0,$512-48
MOVOU zout(16*0), X4 MOVOU zout(16*0), X4
MOVOU zout(16*1), X5 MOVOU zout(16*1), X5
MOVL BX, X6 MOVL BX, X6 // sel
MOVL CX, X7 MOVL CX, X7 // zero
PXOR X8, X8 PXOR X8, X8 // X8's bits are all 0
PCMPEQL X9, X9 PCMPEQL X9, X9 // X9's bits are all 1
PSHUFD $0, X6, X6 PSHUFD $0, X6, X6
PSHUFD $0, X7, X7 PSHUFD $0, X7, X7
PCMPEQL X8, X6 PCMPEQL X8, X6 // X6's bits are all 1 if sel = 0, else are 0
PCMPEQL X8, X7 PCMPEQL X8, X7 // X7's bits are all 1 if zero = 0, else are 0
MOVOU X6, X15 MOVOU X6, X15
PANDN X9, X15 PANDN X9, X15 // X15 = NOT(X6)
MOVOU x1in(16*0), X9 MOVOU x1in(16*0), X9
MOVOU x1in(16*1), X10 MOVOU x1in(16*1), X10
@ -577,7 +573,7 @@ TEXT ·p256PointAddAffineAsm(SB),0,$512-48
// Similarly if zero == 0 // Similarly if zero == 0
PCMPEQL X9, X9 PCMPEQL X9, X9
MOVOU X7, X15 MOVOU X7, X15
PANDN X9, X15 PANDN X9, X15 // X15 = NOT(X7)
MOVOU x2in(16*0), X9 MOVOU x2in(16*0), X9
MOVOU x2in(16*1), X10 MOVOU x2in(16*1), X10
@ -630,24 +626,20 @@ pointaddaffine_avx2:
p256PointAddAffineInline() p256PointAddAffineInline()
// The result is not valid if (sel == 0), conditional choose // The result is not valid if (sel == 0), conditional choose
MOVL BX, X6 MOVL BX, X6 // sel
MOVL CX, X7 MOVL CX, X7 // zero
VPXOR Y8, Y8, Y8 VPXOR Y8, Y8, Y8 // Y8's bits are all 0
VPCMPEQD Y9, Y9, Y9
VPBROADCASTD X6, Y6 VPBROADCASTD X6, Y6
VPBROADCASTD X7, Y7 VPBROADCASTD X7, Y7
VPCMPEQD Y8, Y6, Y6 VPCMPEQD Y8, Y6, Y6 // Y6's bits are all 1 if sel = 0, else are 0
VPCMPEQD Y8, Y7, Y7 VPCMPEQD Y8, Y7, Y7 // Y7's bits are all 1 if zero = 0, else are 0
VMOVDQU Y6, Y15 VPANDN xout(32*0), Y6, Y0
VPANDN Y9, Y15, Y15 VPANDN yout(32*0), Y6, Y1
VPANDN zout(32*0), Y6, Y2
VPAND xout(32*0), Y15, Y0
VPAND yout(32*0), Y15, Y1
VPAND zout(32*0), Y15, Y2
VPAND x1in(32*0), Y6, Y9 VPAND x1in(32*0), Y6, Y9
VPAND y1in(32*0), Y6, Y10 VPAND y1in(32*0), Y6, Y10
@ -658,12 +650,9 @@ pointaddaffine_avx2:
VPXOR Y11, Y2, Y2 VPXOR Y11, Y2, Y2
// Similarly if zero == 0 // Similarly if zero == 0
VPCMPEQD Y9, Y9, Y9 VPANDN Y0, Y7, Y0
VPANDN Y9, Y7, Y15 VPANDN Y1, Y7, Y1
VPANDN Y2, Y7, Y2
VPAND Y15, Y0, Y0
VPAND Y15, Y1, Y1
VPAND Y15, Y2, Y2
VPAND x2in(32*0), Y7, Y9 VPAND x2in(32*0), Y7, Y9
VPAND y2in(32*0), Y7, Y10 VPAND y2in(32*0), Y7, Y10
@ -801,13 +790,8 @@ pointaddaffine_avx2:
p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2 p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2
\ \
LDt (hcub) \ LDt (hcub) \
p256SubInline2 \ p256SubInline \
ST (xout) \ STt (xout) \
\
MOVQ acc4, t0 \
MOVQ acc5, t1 \
MOVQ acc6, t2 \
MOVQ acc7, t3 \
LDacc (u2) \ LDacc (u2) \
p256SubInline2 \ p256SubInline2 \
\ \

View File

@ -32,20 +32,20 @@ GLOBL p256one<>(SB), 8, $32
\ // First reduction step, [p3, p2, p1, p0] = [1, -0x100000000, 0, (1 - 0x100000000), -1] \ // First reduction step, [p3, p2, p1, p0] = [1, -0x100000000, 0, (1 - 0x100000000), -1]
MOVQ acc0, AX \ MOVQ acc0, AX \
MOVQ acc0, DX \ MOVQ acc0, DX \
SHLQ $32, AX \ // AX = L(acc0 * 2^32), low part SHLQ $32, AX \
SHRQ $32, DX \ // DX = H(acc0 * 2^32), high part SHRQ $32, DX \
\// calculate the negative part: [0, -0x100000000, 0, -0x100000000] * acc0 \// calculate the negative part: [1, -0x100000000, 0, -0x100000000] * acc0 + [0, acc3, acc2, acc1]
SUBQ AX, acc1 \ SUBQ AX, acc1 \
SBBQ DX, acc2 \ SBBQ DX, acc2 \
SBBQ AX, acc3 \ SBBQ AX, acc3 \
MOVQ acc0, AX \ MOVQ acc0, AX \
SBBQ DX, acc0 \ SBBQ DX, acc0 \
\ // calculate the positive part: [1, 0, 0, 1] * acc0 + [0, acc3, acc2, acc1], \ // calculate the positive part: [0, 0, 0, AX] + [acc0, acc3, acc2, acc1],
\ // due to (-1) * acc0 + acc0 == 0, the lowest limb (now 0) is dropped directly, no carry. \ // due to (-1) * acc0 + acc0 == 0, the lowest limb (now 0) is dropped directly, no carry.
ADDQ AX, acc1 \ // acc1' = L (acc0 + acc1) ADDQ AX, acc1 \
ADCQ $0, acc2 \ // acc2' = acc2 + carry1 ADCQ $0, acc2 \
ADCQ $0, acc3 \ // acc3' = acc3 + carry2 ADCQ $0, acc3 \
ADCQ $0, acc0 \ // acc0' = acc0 + carry3 ADCQ $0, acc0 \
\ // Second reduction step \ // Second reduction step
MOVQ acc1, AX \ MOVQ acc1, AX \
MOVQ acc1, DX \ MOVQ acc1, DX \
@ -102,6 +102,7 @@ GLOBL p256one<>(SB), 8, $32
ADCQ x_ptr, acc3 \ ADCQ x_ptr, acc3 \
ADCQ $0, t0 ADCQ $0, t0
/* ---------------------------------------*/
#define p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \ #define p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \
MOVQ a0, b0 \ MOVQ a0, b0 \
MOVQ a1, b1 \ MOVQ a1, b1 \
@ -113,7 +114,7 @@ GLOBL p256one<>(SB), 8, $32
SBBQ $-1, a2 \ SBBQ $-1, a2 \
SBBQ p256p<>+0x018(SB), a3 \ SBBQ p256p<>+0x018(SB), a3 \
SBBQ $0, a4 \ SBBQ $0, a4 \
\ \ // If the result of the subtraction is negative, restore the previous result
CMOVQCS b0, a0 \ // CMOVQCS: Move if below (CF == 1) CMOVQCS b0, a0 \ // CMOVQCS: Move if below (CF == 1)
CMOVQCS b1, a1 \ CMOVQCS b1, a1 \
CMOVQCS b2, a2 \ CMOVQCS b2, a2 \
@ -131,13 +132,13 @@ GLOBL p256one<>(SB), 8, $32
MOVQ a1, b1 \ MOVQ a1, b1 \
MOVQ a2, b2 \ MOVQ a2, b2 \
MOVQ a3, b3 \ MOVQ a3, b3 \
\// Subtract p256 \// Subtract p256ord
SUBQ p256ord<>+0x00(SB), a0 \ SUBQ p256ord<>+0x00(SB), a0 \
SBBQ p256ord<>+0x08(SB) ,a1 \ SBBQ p256ord<>+0x08(SB) ,a1 \
SBBQ p256ord<>+0x10(SB), a2 \ SBBQ p256ord<>+0x10(SB), a2 \
SBBQ p256ord<>+0x18(SB), a3 \ SBBQ p256ord<>+0x18(SB), a3 \
SBBQ $0, a4 \ SBBQ $0, a4 \
\ \ // If the result of the subtraction is negative, restore the previous result
CMOVQCS b0, a0 \ // CMOVQCS: Move if below (CF == 1) CMOVQCS b0, a0 \ // CMOVQCS: Move if below (CF == 1)
CMOVQCS b1, a1 \ CMOVQCS b1, a1 \
CMOVQCS b2, a2 \ CMOVQCS b2, a2 \
@ -148,6 +149,7 @@ GLOBL p256one<>(SB), 8, $32
MOVQ a2, (8*2)(res) \ MOVQ a2, (8*2)(res) \
MOVQ a3, (8*3)(res) MOVQ a3, (8*3)(res)
/* ---------------------------------------*/
#define sm2P256SqrReductionInline \ #define sm2P256SqrReductionInline \
\ // First reduction step \ // First reduction step
MOVQ acc0, mul0 \ MOVQ acc0, mul0 \
@ -237,6 +239,7 @@ GLOBL p256one<>(SB), 8, $32
CMOVQCS t2, acc6 \ CMOVQCS t2, acc6 \
CMOVQCS t3, acc7 CMOVQCS t3, acc7
/* ---------------------------------------*/
#define sm2P256MulReductionInline \ #define sm2P256MulReductionInline \
\// First reduction step \// First reduction step
MOVQ acc0, mul0 \ MOVQ acc0, mul0 \
@ -303,159 +306,6 @@ GLOBL p256one<>(SB), 8, $32
ADCQ $0, acc2 \ ADCQ $0, acc2 \
ADCQ $0, acc3 ADCQ $0, acc3
#define p256PointDoubleInit() \
MOVOU (16*0)(BX), X0;\
MOVOU (16*1)(BX), X1;\
MOVOU (16*2)(BX), X2;\
MOVOU (16*3)(BX), X3;\
MOVOU (16*4)(BX), X4;\
MOVOU (16*5)(BX), X5;\
\
MOVOU X0, x(16*0);\
MOVOU X1, x(16*1);\
MOVOU X2, y(16*0);\
MOVOU X3, y(16*1);\
MOVOU X4, z(16*0);\
MOVOU X5, z(16*1);
/* ---------------------------------------*/
// [t3, t2, t1, t0] = 2[acc7, acc6, acc5, acc4]
#define p256MulBy2Inline\
XORQ mul0, mul0;\
ADDQ acc4, acc4;\
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, t0;\
SBBQ p256p<>+0x08(SB), t1;\
SBBQ $-1, t2;\
SBBQ p256p<>+0x018(SB), t3;\
SBBQ $0, mul0;\
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
/* ---------------------------------------*/
// [acc7, acc6, acc5, acc4] = 2[acc7, acc6, acc5, acc4]
#define p256MulBy2Inline2\
XORQ mul0, mul0;\
ADDQ acc4, acc4;\
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, acc4;\
SBBQ p256p<>+0x08(SB), acc5;\
SBBQ $-1, acc6;\
SBBQ p256p<>+0x018(SB), acc7;\
SBBQ $0, mul0;\
CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS t1, acc5;\
CMOVQCS t2, acc6;\
CMOVQCS t3, acc7;
/* ---------------------------------------*/
// [t3, t2, t1, t0] = 3[acc7, acc6, acc5, acc4]
#define p256TripleInline\
XORQ mul0, mul0;\
MOVQ acc4, acc0;\
MOVQ acc5, acc1;\
MOVQ acc6, acc2;\
MOVQ acc7, acc3;\
ADDQ acc4, acc4;\
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, acc4;\
SBBQ p256p<>+0x08(SB), acc5;\
SBBQ $-1, acc6;\
SBBQ p256p<>+0x018(SB), acc7;\
SBBQ $0, mul0;\
CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS t1, acc5;\
CMOVQCS t2, acc6;\
CMOVQCS t3, acc7;\
XORQ mul0, mul0;\
ADDQ acc0, acc4;\
ADCQ acc1, acc5;\
ADCQ acc2, acc6;\
ADCQ acc3, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, t0;\
SBBQ p256p<>+0x08(SB), t1;\
SBBQ $-1, t2;\
SBBQ p256p<>+0x018(SB), t3;\
SBBQ $0, mul0;\
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
/* ---------------------------------------*/
// [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] + [t3, t2, t1, t0]
#define p256AddInline \
XORQ mul0, mul0;\
ADDQ t0, acc4;\
ADCQ t1, acc5;\
ADCQ t2, acc6;\
ADCQ t3, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, t0;\
SBBQ p256p<>+0x08(SB), t1;\
SBBQ $-1, t2;\
SBBQ p256p<>+0x018(SB), t3;\
SBBQ $0, mul0;\
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
/* ---------------------------------------*/
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]
#define p256SubInline2 \
XORQ mul0, mul0;\
SUBQ t0, acc4;\
SBBQ t1, acc5;\
SBBQ t2, acc6;\
SBBQ t3, acc7;\
SBBQ $0, mul0;\
MOVQ acc4, acc0;\
MOVQ acc5, acc1;\
MOVQ acc6, acc2;\
MOVQ acc7, acc3;\
ADDQ $-1, acc4;\
ADCQ p256p<>+0x08(SB), acc5;\
ADCQ $-1, acc6;\
ADCQ p256p<>+0x018(SB), acc7;\
ANDQ $1, mul0;\
CMOVQEQ acc0, acc4;\ // CMOVQEQ: Move if equal (ZF == 1)
CMOVQEQ acc1, acc5;\
CMOVQEQ acc2, acc6;\
CMOVQEQ acc3, acc7;\
/* ---------------------------------------*/ /* ---------------------------------------*/
#define p256SqrRound(t1) \ #define p256SqrRound(t1) \
\// y[1:] * y[0] \// y[1:] * y[0]
@ -891,7 +741,7 @@ GLOBL p256one<>(SB), 8, $32
MULXQ p256ordK0<>(SB), DX, AX;\ MULXQ p256ordK0<>(SB), DX, AX;\
\ \
MULXQ p256ord<>+0x00(SB), AX, t0;\ MULXQ p256ord<>+0x00(SB), AX, t0;\
ADOXQ AX, acc0 ;\// (carry1, acc0) = acc0 + t0 * ord0 ADOXQ AX, acc0;\// (carry1, acc0) = acc0 + t0 * ord0
\ \
MULXQ p256ord<>+0x08(SB), AX, t1;\ MULXQ p256ord<>+0x08(SB), AX, t1;\
ADCXQ t0, AX;\ ADCXQ t0, AX;\
@ -985,6 +835,168 @@ GLOBL p256one<>(SB), 8, $32
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\ p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\
MOVQ res_ptr, x_ptr; MOVQ res_ptr, x_ptr;
// The macros below are used for point operations
/* ---------------------------------------*/
// p256MulBy2Inline: [t3, t2, t1, t0] = 2[acc7, acc6, acc5, acc4]
// Doubles the 4-limb value in acc4..acc7 (acc4 is the least-significant limb),
// then performs one conditional subtraction of the modulus; the result is left in t0..t3.
// Side effects: acc4..acc7 are left holding the low 256 bits of the doubled,
// unreduced value; mul0 and the flags are clobbered.
// NOTE(review): the $-1 immediates stand in for limbs 0 and 2 of the modulus,
// presumably because those limbs of p256p are all ones - confirm against the p256p table.
#define p256MulBy2Inline\
XORQ mul0, mul0;\ // mul0 = 0, it collects the carry out of the doubling
ADDQ acc4, acc4;\ // acc = acc + acc, carry chained through the four limbs
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\ // mul0 = carry out of the top limb
MOVQ acc4, t0;\ // t = copy of the doubled value, acc keeps the unreduced copy
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, t0;\ // [mul0, t3..t0] -= modulus, borrow chained through all five limbs
SBBQ p256p<>+0x08(SB), t1;\
SBBQ $-1, t2;\
SBBQ p256p<>+0x018(SB), t3;\
SBBQ $0, mul0;\ // CF == 1 here means the subtraction went negative
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1), i.e. restore the unreduced value
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
/* ---------------------------------------*/
// p256MulBy2Inline2: [acc7, acc6, acc5, acc4] = 2[acc7, acc6, acc5, acc4]
// Same computation as p256MulBy2Inline, but the result stays in acc4..acc7:
// the value is doubled in place, backed up to t0..t3, the modulus is subtracted
// from acc, and the backup is restored if that subtraction borrowed.
// Side effects: t0..t3 are left holding the low 256 bits of the doubled,
// unreduced value; mul0 and the flags are clobbered.
// NOTE(review): the $-1 immediates stand in for limbs 0 and 2 of the modulus,
// presumably because those limbs of p256p are all ones - confirm against the p256p table.
#define p256MulBy2Inline2\
XORQ mul0, mul0;\ // mul0 = 0, it collects the carry out of the doubling
ADDQ acc4, acc4;\ // acc = acc + acc, carry chained through the four limbs
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\ // mul0 = carry out of the top limb
MOVQ acc4, t0;\ // t = backup of the doubled, unreduced value
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, acc4;\ // [mul0, acc7..acc4] -= modulus, borrow chained through all five limbs
SBBQ p256p<>+0x08(SB), acc5;\
SBBQ $-1, acc6;\
SBBQ p256p<>+0x018(SB), acc7;\
SBBQ $0, mul0;\ // CF == 1 here means the subtraction went negative
CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1), i.e. restore the backup
CMOVQCS t1, acc5;\
CMOVQCS t2, acc6;\
CMOVQCS t3, acc7;
/* ---------------------------------------*/
// p256TripleInline: [t3, t2, t1, t0] = 3[acc7, acc6, acc5, acc4]
// Computed in two stages, each followed by one conditional subtraction of the modulus:
//   1. acc = 2 * acc, reduced in place (t0..t3 serve as the backup copy);
//   2. acc = acc + original value (saved in acc0..acc3), with the reduced result written to t0..t3.
// Side effects: acc0..acc3 hold the original input; acc4..acc7 hold the low 256 bits
// of the unreduced stage-2 sum; mul0 and the flags are clobbered.
// NOTE(review): the $-1 immediates stand in for limbs 0 and 2 of the modulus,
// presumably because those limbs of p256p are all ones - confirm against the p256p table.
#define p256TripleInline\
XORQ mul0, mul0;\ // mul0 = 0, it collects the carry out of the doubling
MOVQ acc4, acc0;\ // acc0..acc3 = copy of the original value, added back in stage 2
MOVQ acc5, acc1;\
MOVQ acc6, acc2;\
MOVQ acc7, acc3;\
ADDQ acc4, acc4;\ // stage 1: acc = acc + acc
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\ // mul0 = carry out of the top limb
MOVQ acc4, t0;\ // t = backup of the doubled, unreduced value
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, acc4;\ // [mul0, acc7..acc4] -= modulus
SBBQ p256p<>+0x08(SB), acc5;\
SBBQ $-1, acc6;\
SBBQ p256p<>+0x018(SB), acc7;\
SBBQ $0, mul0;\ // CF == 1 here means the subtraction went negative
CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS t1, acc5;\
CMOVQCS t2, acc6;\
CMOVQCS t3, acc7;\
XORQ mul0, mul0;\ // stage 2: reset the carry collector
ADDQ acc0, acc4;\ // acc = (2 * original, reduced) + original
ADCQ acc1, acc5;\
ADCQ acc2, acc6;\
ADCQ acc3, acc7;\
ADCQ $0, mul0;\ // mul0 = carry out of the top limb
MOVQ acc4, t0;\ // t = copy of the sum, acc keeps the unreduced copy
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, t0;\ // [mul0, t3..t0] -= modulus
SBBQ p256p<>+0x08(SB), t1;\
SBBQ $-1, t2;\
SBBQ p256p<>+0x018(SB), t3;\
SBBQ $0, mul0;\ // CF == 1 here means the subtraction went negative
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
/* ---------------------------------------*/
// p256AddInline: [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] + [t3, t2, t1, t0]
// Adds the two 4-limb values, then performs one conditional subtraction of the
// modulus; the result is left in t0..t3.
// Side effects: acc4..acc7 are left holding the low 256 bits of the unreduced sum;
// mul0 and the flags are clobbered.
// NOTE(review): the $-1 immediates stand in for limbs 0 and 2 of the modulus,
// presumably because those limbs of p256p are all ones - confirm against the p256p table.
#define p256AddInline \
XORQ mul0, mul0;\ // mul0 = 0, it collects the carry out of the addition
ADDQ t0, acc4;\ // acc = acc + t, carry chained through the four limbs
ADCQ t1, acc5;\
ADCQ t2, acc6;\
ADCQ t3, acc7;\
ADCQ $0, mul0;\ // mul0 = carry out of the top limb
MOVQ acc4, t0;\ // t = copy of the sum, acc keeps the unreduced copy
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, t0;\ // [mul0, t3..t0] -= modulus, borrow chained through all five limbs
SBBQ p256p<>+0x08(SB), t1;\
SBBQ $-1, t2;\
SBBQ p256p<>+0x018(SB), t3;\
SBBQ $0, mul0;\ // CF == 1 here means the subtraction went negative
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1), i.e. restore the unreduced sum
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
/* ---------------------------------------*/
// p256SubInline: [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]
// Subtracts t from acc, then adds the modulus back only if the subtraction
// borrowed; the result is left in t0..t3.
// Side effects: acc4..acc7 are left holding the raw (uncorrected) difference;
// mul0 and the flags are clobbered.
// The last instruction deliberately carries no line continuation, so the macro
// body cannot absorb whatever line follows the definition.
// NOTE(review): the $-1 immediates stand in for limbs 0 and 2 of the modulus,
// presumably because those limbs of p256p are all ones - confirm against the p256p table.
#define p256SubInline \
XORQ mul0, mul0;\ // mul0 = 0, it records the borrow of the subtraction
SUBQ t0, acc4;\ // acc = acc - t, borrow chained through the four limbs
SBBQ t1, acc5;\
SBBQ t2, acc6;\
SBBQ t3, acc7;\
SBBQ $0, mul0;\ // mul0 = 0 if no borrow, all ones if the subtraction borrowed
MOVQ acc4, t0;\ // t = copy of the raw difference, acc keeps the uncorrected copy
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
ADDQ $-1, t0;\ // t += modulus (only kept when the subtraction borrowed)
ADCQ p256p<>+0x08(SB), t1;\
ADCQ $-1, t2;\
ADCQ p256p<>+0x018(SB), t3;\
ANDQ $1, mul0;\ // ZF := 1 if there was no borrow
CMOVQEQ acc4, t0;\ // CMOVQEQ: Move if equal (ZF == 1), i.e. keep the raw difference
CMOVQEQ acc5, t1;\
CMOVQEQ acc6, t2;\
CMOVQEQ acc7, t3;
/* ---------------------------------------*/
// p256SubInline2: [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]
// Subtracts t from acc in place, then adds the modulus back only if the
// subtraction borrowed; the result stays in acc4..acc7 and t0..t3 are not modified.
// Side effects: acc0..acc3 are overwritten with the raw (uncorrected) difference;
// mul0 and the flags are clobbered.
// The last instruction deliberately carries no line continuation, so the macro
// body cannot absorb whatever line follows the definition.
// NOTE(review): the $-1 immediates stand in for limbs 0 and 2 of the modulus,
// presumably because those limbs of p256p are all ones - confirm against the p256p table.
#define p256SubInline2 \
XORQ mul0, mul0;\ // mul0 = 0, it records the borrow of the subtraction
SUBQ t0, acc4;\ // acc = acc - t, borrow chained through the four limbs
SBBQ t1, acc5;\
SBBQ t2, acc6;\
SBBQ t3, acc7;\
SBBQ $0, mul0;\ // mul0 = 0 if no borrow, all ones if the subtraction borrowed
MOVQ acc4, acc0;\ // acc0..acc3 = backup of the raw difference
MOVQ acc5, acc1;\
MOVQ acc6, acc2;\
MOVQ acc7, acc3;\
ADDQ $-1, acc4;\ // acc += modulus (only kept when the subtraction borrowed)
ADCQ p256p<>+0x08(SB), acc5;\
ADCQ $-1, acc6;\
ADCQ p256p<>+0x018(SB), acc7;\
ANDQ $1, mul0;\ // ZF := 1 if there was no borrow
CMOVQEQ acc0, acc4;\ // CMOVQEQ: Move if equal (ZF == 1), i.e. restore the raw difference
CMOVQEQ acc1, acc5;\
CMOVQEQ acc2, acc6;\
CMOVQEQ acc3, acc7;
#define p256SqrInternalInline \ #define p256SqrInternalInline \
MOVQ acc4, mul0;\ MOVQ acc4, mul0;\
MULQ acc5;\ MULQ acc5;\
@ -1143,3 +1155,18 @@ GLOBL p256one<>(SB), 8, $32
\// Set the zero flag if so. \// Set the zero flag if so.
\// CMOVQEQ: Move if equal (ZF == 1) \// CMOVQEQ: Move if equal (ZF == 1)
CMOVQEQ t1, AX; CMOVQEQ t1, AX;
// p256PointDoubleInit copies 96 bytes starting at the address in BX into the
// x, y and z slots (32 bytes each), using X0..X5 as staging registers.
// x(), y() and z() are addressing macros that must be defined at the point of use.
// NOTE(review): BX presumably points at the input point (x, y, z coordinates laid
// out back to back) - confirm against the callers.
// Clobbers X0..X5.
#define p256PointDoubleInit() \
MOVOU (16*0)(BX), X0;\ // bytes 0..31 -> X0, X1
MOVOU (16*1)(BX), X1;\
MOVOU (16*2)(BX), X2;\ // bytes 32..63 -> X2, X3
MOVOU (16*3)(BX), X3;\
MOVOU (16*4)(BX), X4;\ // bytes 64..95 -> X4, X5
MOVOU (16*5)(BX), X5;\
\
MOVOU X0, x(16*0);\ // store the first 32 bytes into x
MOVOU X1, x(16*1);\
MOVOU X2, y(16*0);\ // store the next 32 bytes into y
MOVOU X3, y(16*1);\
MOVOU X4, z(16*0);\ // store the last 32 bytes into z
MOVOU X5, z(16*1);

View File

@ -4,6 +4,7 @@
// 256-bit primes" // 256-bit primes"
// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf // https://eprint.iacr.org/2013/816.pdf
// https://github.com/emmansun/gmsm/wiki/SM2-WWMM-(2)
//go:build amd64 && !purego && plugin //go:build amd64 && !purego && plugin
// plugin mode - DO NOT use the R15 Register. // plugin mode - DO NOT use the R15 Register.
@ -12,6 +13,7 @@
// 2.p256OrdSqr // 2.p256OrdSqr
// 3.sm2P256MulInternal // 3.sm2P256MulInternal
// 4.sm2P256SqrInternal // 4.sm2P256SqrInternal
// The most affected one is sm2P256MulInternal, it uses SIMD register X0 as temp storage.
#include "textflag.h" #include "textflag.h"
@ -359,6 +361,7 @@ internalSqrBMI2:
p256SqrInternalInlineAdx p256SqrInternalInlineAdx
RET RET
// Below is same as non-plugin
/* ---------------------------------------*/ /* ---------------------------------------*/
#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7 #define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3 #define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
@ -412,98 +415,93 @@ internalSqrBMI2:
MOVQ acc2, t2 \ MOVQ acc2, t2 \
MOVQ acc3, t3 \ MOVQ acc3, t3 \
\// Add in case the operand was > p256 \// Add in case the operand was > p256
ADDQ $-1, acc0 \ ADDQ $-1, acc0 \
ADCQ p256p<>+0x08(SB), acc1 \ ADCQ p256p<>+0x08(SB), acc1 \
ADCQ $-1, acc2 \ ADCQ $-1, acc2 \
ADCQ p256p<>+0x018(SB), acc3 \ ADCQ p256p<>+0x018(SB), acc3 \
ADCQ $0, mul0 \ ADCQ $0, mul0 \ // ZF := 1 if mul0 == 0 after ADC
CMOVQNE t0, acc0 \ CMOVQNE t0, acc0 \ // CMOVQNE: Move if not equal (ZF == 0)
CMOVQNE t1, acc1 \ CMOVQNE t1, acc1 \
CMOVQNE t2, acc2 \ CMOVQNE t2, acc2 \
CMOVQNE t3, acc3 \ CMOVQNE t3, acc3 \
\// If condition is 0, keep original value \// If condition is 0, keep original value
TESTQ DX, DX \ TESTQ DX, DX \ // ZF := 1 if (DX AND DX == 0)
CMOVQEQ acc4, acc0 \ CMOVQEQ acc4, acc0 \ // CMOVQEQ: Move if equal (ZF == 1)
CMOVQEQ acc5, acc1 \ CMOVQEQ acc5, acc1 \
CMOVQEQ acc6, acc2 \ CMOVQEQ acc6, acc2 \
CMOVQEQ acc7, acc3 \ CMOVQEQ acc7, acc3 \
\// Store result \// Store result
MOVQ acc0, y2in(8*0) \ MOVQ acc0, y2in(8*0) \
MOVQ acc1, y2in(8*1) \ MOVQ acc1, y2in(8*1) \
MOVQ acc2, y2in(8*2) \ MOVQ acc2, y2in(8*2) \
MOVQ acc3, y2in(8*3) \ MOVQ acc3, y2in(8*3) \
\// Begin point add \// Begin point add
LDacc (z1in) \ LDacc (z1in) \
CALL sm2P256SqrInternal(SB) \// z1ˆ2 CALL sm2P256SqrInternal(SB) \// z1ˆ2
ST (z1sqr) \ ST (z1sqr) \
\ \
LDt (x2in) \ LDt (x2in) \
CALL sm2P256MulInternal(SB) \// x2 * z1ˆ2 CALL sm2P256MulInternal(SB) \// u2 = x2 * z1ˆ2
\ \
LDt (x1in) \ LDt (x1in) \
p256SubInline2 \// h = u2 - u1 p256SubInline2 \// h = u2 - x1
ST (h) \ ST (h) \
\ \
LDt (z1in) \ LDt (z1in) \
CALL sm2P256MulInternal(SB) \// z3 = h * z1 CALL sm2P256MulInternal(SB) \// z3 = h * z1
ST (zout) \ ST (zout) \
\ \
LDacc (z1sqr) \ LDacc (z1sqr) \
CALL sm2P256MulInternal(SB) \// z1ˆ3 CALL sm2P256MulInternal(SB) \// z1ˆ3
\ \
LDt (y2in) \ LDt (y2in) \
CALL sm2P256MulInternal(SB) \// s2 = y2 * z1ˆ3 CALL sm2P256MulInternal(SB) \// s2 = y2 * z1ˆ3
ST (s2) \ ST (s2) \
\ \
LDt (y1in) \ LDt (y1in) \
p256SubInline2 \// r = s2 - s1 p256SubInline2 \// r = s2 - y1
ST (r) \ ST (r) \
\ \
CALL sm2P256SqrInternal(SB) \// rsqr = rˆ2 CALL sm2P256SqrInternal(SB) \// rsqr = rˆ2
ST (rsqr) \ ST (rsqr) \
\ \
LDacc (h) \ LDacc (h) \
CALL sm2P256SqrInternal(SB) \// hsqr = hˆ2 CALL sm2P256SqrInternal(SB) \// hsqr = hˆ2
ST (hsqr) \ ST (hsqr) \
\ \
LDt (h) \ LDt (h) \
CALL sm2P256MulInternal(SB) \// hcub = hˆ3 CALL sm2P256MulInternal(SB) \// hcub = hˆ3
ST (hcub) \ ST (hcub) \
\ \
LDt (y1in) \ LDt (y1in) \
CALL sm2P256MulInternal(SB) \// y1 * hˆ3 CALL sm2P256MulInternal(SB) \// s2 = y1 * hˆ3
ST (s2) \ ST (s2) \
\ \
LDacc (x1in) \ LDacc (x1in) \
LDt (hsqr) \ LDt (hsqr) \
CALL sm2P256MulInternal(SB) \// u1 * hˆ2 CALL sm2P256MulInternal(SB) \// x1 * hˆ2
ST (h) \ ST (h) \
\ \
p256MulBy2Inline \// u1 * hˆ2 * 2, inline p256MulBy2Inline \// x1 * hˆ2 * 2, inline
LDacc (rsqr) \ LDacc (rsqr) \
p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2 p256SubInline2 \// rˆ2 - x1 * hˆ2 * 2
\ \
LDt (hcub) \ LDt (hcub) \
p256SubInline2 \ p256SubInline \
ST (xout) \ STt (xout) \// xout = rˆ2 - 2 * x1 * hˆ2 - h^3
LDacc (h) \
p256SubInline2 \
\ \
MOVQ acc4, t0 \ LDt (r) \
MOVQ acc5, t1 \ CALL sm2P256MulInternal(SB) \
MOVQ acc6, t2 \
MOVQ acc7, t3 \
LDacc (h) \
p256SubInline2 \
\ \
LDt (r) \ LDt (s2) \
CALL sm2P256MulInternal(SB) \ p256SubInline2 \
\ ST (yout) \
LDt (s2) \
p256SubInline2 \
ST (yout) \
\// Load stored values from stack \// Load stored values from stack
MOVQ rptr, AX \ MOVQ rptr, AX \
MOVL sel_save, BX \ MOVL sel_save, BX \
MOVL zero_save, CX \ MOVL zero_save, CX \
// func p256PointAddAffineAsm(res, in1 *SM2P256Point, in2 *p256AffinePoint, sign, sel, zero int) // func p256PointAddAffineAsm(res, in1 *SM2P256Point, in2 *p256AffinePoint, sign, sel, zero int)
TEXT ·p256PointAddAffineAsm(SB),0,$512-48 TEXT ·p256PointAddAffineAsm(SB),0,$512-48
@ -547,20 +545,20 @@ TEXT ·p256PointAddAffineAsm(SB),0,$512-48
MOVOU zout(16*0), X4 MOVOU zout(16*0), X4
MOVOU zout(16*1), X5 MOVOU zout(16*1), X5
MOVL BX, X6 MOVL BX, X6 // sel
MOVL CX, X7 MOVL CX, X7 // zero
PXOR X8, X8 PXOR X8, X8 // X8's bits are all 0
PCMPEQL X9, X9 PCMPEQL X9, X9 // X9's bits are all 1
PSHUFD $0, X6, X6 PSHUFD $0, X6, X6
PSHUFD $0, X7, X7 PSHUFD $0, X7, X7
PCMPEQL X8, X6 PCMPEQL X8, X6 // X6's bits are all 1 if sel = 0, else are 0
PCMPEQL X8, X7 PCMPEQL X8, X7 // X7's bits are all 1 if zero = 0, else are 0
MOVOU X6, X15 MOVOU X6, X15
PANDN X9, X15 PANDN X9, X15 // X15 = NOT(X6)
MOVOU x1in(16*0), X9 MOVOU x1in(16*0), X9
MOVOU x1in(16*1), X10 MOVOU x1in(16*1), X10
@ -592,7 +590,7 @@ TEXT ·p256PointAddAffineAsm(SB),0,$512-48
// Similarly if zero == 0 // Similarly if zero == 0
PCMPEQL X9, X9 PCMPEQL X9, X9
MOVOU X7, X15 MOVOU X7, X15
PANDN X9, X15 PANDN X9, X15 // X15 = NOT(X7)
MOVOU x2in(16*0), X9 MOVOU x2in(16*0), X9
MOVOU x2in(16*1), X10 MOVOU x2in(16*1), X10
@ -645,24 +643,20 @@ pointaddaffine_avx2:
p256PointAddAffineInline() p256PointAddAffineInline()
// The result is not valid if (sel == 0), conditional choose // The result is not valid if (sel == 0), conditional choose
MOVL BX, X6 MOVL BX, X6 // sel
MOVL CX, X7 MOVL CX, X7 // zero
VPXOR Y8, Y8, Y8 VPXOR Y8, Y8, Y8 // Y8's bits are all 0
VPCMPEQD Y9, Y9, Y9
VPBROADCASTD X6, Y6 VPBROADCASTD X6, Y6
VPBROADCASTD X7, Y7 VPBROADCASTD X7, Y7
VPCMPEQD Y8, Y6, Y6 VPCMPEQD Y8, Y6, Y6 // Y6's bits are all 1 if sel = 0, else are 0
VPCMPEQD Y8, Y7, Y7 VPCMPEQD Y8, Y7, Y7 // Y7's bits are all 1 if zero = 0, else are 0
VMOVDQU Y6, Y15 VPANDN xout(32*0), Y6, Y0
VPANDN Y9, Y15, Y15 VPANDN yout(32*0), Y6, Y1
VPANDN zout(32*0), Y6, Y2
VPAND xout(32*0), Y15, Y0
VPAND yout(32*0), Y15, Y1
VPAND zout(32*0), Y15, Y2
VPAND x1in(32*0), Y6, Y9 VPAND x1in(32*0), Y6, Y9
VPAND y1in(32*0), Y6, Y10 VPAND y1in(32*0), Y6, Y10
@ -673,12 +667,9 @@ pointaddaffine_avx2:
VPXOR Y11, Y2, Y2 VPXOR Y11, Y2, Y2
// Similarly if zero == 0 // Similarly if zero == 0
VPCMPEQD Y9, Y9, Y9 VPANDN Y0, Y7, Y0
VPANDN Y9, Y7, Y15 VPANDN Y1, Y7, Y1
VPANDN Y2, Y7, Y2
VPAND Y15, Y0, Y0
VPAND Y15, Y1, Y1
VPAND Y15, Y2, Y2
VPAND x2in(32*0), Y7, Y9 VPAND x2in(32*0), Y7, Y9
VPAND y2in(32*0), Y7, Y10 VPAND y2in(32*0), Y7, Y10
@ -762,7 +753,7 @@ pointaddaffine_avx2:
ST (s2) \ ST (s2) \
\ \
LDt (s1) \ LDt (s1) \
p256SubInline2 \// r = s2 - s1 p256SubInline2 \// r = s2 - s1
ST (r) \ ST (r) \
p256IsZeroInline \ p256IsZeroInline \
MOVQ AX, points_eq \ MOVQ AX, points_eq \
@ -816,13 +807,8 @@ pointaddaffine_avx2:
p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2 p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2
\ \
LDt (hcub) \ LDt (hcub) \
p256SubInline2 \ p256SubInline \
ST (xout) \ STt (xout) \
\
MOVQ acc4, t0 \
MOVQ acc5, t1 \
MOVQ acc6, t2 \
MOVQ acc7, t3 \
LDacc (u2) \ LDacc (u2) \
p256SubInline2 \ p256SubInline2 \
\ \
@ -965,83 +951,83 @@ pointadd_avx2:
#define rptr (32*7)(SP) #define rptr (32*7)(SP)
#define calZ() \ #define calZ() \
LDacc (z) \ LDacc (z) \
CALL sm2P256SqrInternal(SB) \ CALL sm2P256SqrInternal(SB) \
ST (zsqr) \ ST (zsqr) \ // ZZ = Z1^2
\ \
LDt (x) \ LDt (x) \
p256AddInline \ p256AddInline \
STt (m) \ STt (m) \ // M = ZZ + X1
\ \
LDacc (z) \ LDacc (z) \
LDt (y) \ LDt (y) \
CALL sm2P256MulInternal(SB) \ CALL sm2P256MulInternal(SB) \ // Z1 * Y1
p256MulBy2Inline \ p256MulBy2Inline \ // Z3 = 2(Z1 * Y1) = (Y1 + Z1)^2 - Y1^2 - Z1^2
#define calX() \ #define calX() \
LDacc (x) \ LDacc (x) \
LDt (zsqr) \ LDt (zsqr) \
p256SubInline2 \ p256SubInline2 \ // X1 - ZZ
LDt (m) \ LDt (m) \
CALL sm2P256MulInternal(SB) \ CALL sm2P256MulInternal(SB) \ // M = (X1 - ZZ) * (X1 + ZZ) = X1^2 - ZZ^2
ST (m) \ ST (m) \
\// Multiply by 3 \// Multiply by 3
p256TripleInline \ p256TripleInline \
STt (m) \ STt (m) \ // M = 3 * (X1^2 - ZZ^2)
\//////////////////////// \////////////////////////
LDacc (y) \ LDacc (y) \
p256MulBy2Inline2 \ p256MulBy2Inline2 \
CALL sm2P256SqrInternal(SB) \ CALL sm2P256SqrInternal(SB) \ // 4 * YY = (2*Y1)^2
ST (s) \ ST (s) \ // S = 4 * YY
CALL sm2P256SqrInternal(SB) \ CALL sm2P256SqrInternal(SB) \ // (4 * YY)^2 = 16 * YYYY
\// Divide by 2 \// Divide by 2
XORQ mul0, mul0 \ XORQ mul0, mul0 \
MOVQ acc4, t0 \ MOVQ acc4, t0 \
MOVQ acc5, t1 \ MOVQ acc5, t1 \
MOVQ acc6, t2 \ MOVQ acc6, t2 \
MOVQ acc7, t3 \ MOVQ acc7, t3 \
\ \ // [mul0, acc7, acc6, acc5, acc4] := [acc7, acc6, acc5, acc4] + P
ADDQ $-1, acc4 \ ADDQ $-1, acc4 \
ADCQ p256p<>+0x08(SB), acc5 \ ADCQ p256p<>+0x08(SB), acc5 \
ADCQ $-1, acc6 \ ADCQ $-1, acc6 \
ADCQ p256p<>+0x018(SB), acc7 \ ADCQ p256p<>+0x018(SB), acc7 \
ADCQ $0, mul0 \ ADCQ $0, mul0 \
TESTQ $1, t0 \ TESTQ $1, t0 \ // ZF := 1 if (t0 AND 1 == 0)
\ \ // CMOVQEQ: Move if equal (ZF == 1)
CMOVQEQ t0, acc4 \ CMOVQEQ t0, acc4 \ // acc4 := t0 if (ZF == 1)
CMOVQEQ t1, acc5 \ CMOVQEQ t1, acc5 \ // acc5 := t1 if (ZF == 1)
CMOVQEQ t2, acc6 \ CMOVQEQ t2, acc6 \ // acc6 := t2 if (ZF == 1)
CMOVQEQ t3, acc7 \ CMOVQEQ t3, acc7 \ // acc7 := t3 if (ZF == 1)
ANDQ t0, mul0 \ ANDQ t0, mul0 \ // mul0 := t0 AND mul0 (mul0 := 0 if (ZF == 1) else keeping the original value 0 or 1)
\ \ // Divide even by 2
SHRQ $1, acc5, acc4 \ SHRQ $1, acc5, acc4 \ // acc4 := acc4 >> 1 | acc5 << 63
SHRQ $1, acc6, acc5 \ SHRQ $1, acc6, acc5 \ // acc5 := acc5 >> 1 | acc6 << 63
SHRQ $1, acc7, acc6 \ SHRQ $1, acc7, acc6 \ // acc6 := acc6 >> 1 | acc7 << 63
SHRQ $1, mul0, acc7 \ SHRQ $1, mul0, acc7 \ // acc7 := acc7 >> 1 | mul0 << 63
ST (y) \ ST (y) \ // Y3 = 8 * YYYY
\///////////////////////// \/////////////////////////
LDacc (x) \ LDacc (x) \
LDt (s) \ LDt (s) \
CALL sm2P256MulInternal(SB) \ CALL sm2P256MulInternal(SB) \ // X1 * 4 * YY
ST (s) \ ST (s) \ // S = 4 * X1 * YY = 2 * ((X1+YY)^2 - XX - YYYY)
p256MulBy2Inline \ p256MulBy2Inline \
STt (tmp) \ STt (tmp) \ // tmp = 2*S = 8 * X1 * YY
\ \
LDacc (m) \ LDacc (m) \
CALL sm2P256SqrInternal(SB) \ CALL sm2P256SqrInternal(SB) \ // M^2 = (3 * (X1^2 - ZZ^2))^2
LDt (tmp) \ LDt (tmp) \
p256SubInline2 \ p256SubInline2 \ // X3 = M^2 - 2*S
#define calY() \ #define calY() \
acc2t \ acc2t \
LDacc (s) \ LDacc (s) \ // S = 4 * X1 * YY = 2 * ((X1+YY)^2 - XX - YYYY)
p256SubInline2 \ p256SubInline2 \ // S - X3
\ \
LDt (m) \ LDt (m) \
CALL sm2P256MulInternal(SB) \ CALL sm2P256MulInternal(SB) \ // M * (S - X3)
\ \
LDt (y) \ LDt (y) \
p256SubInline2 \ p256SubInline2 \ // Y3 = M * (S - X3) - 8 * YYYY
#define lastP256PointDouble() \ #define lastP256PointDouble() \
\ // See https://hyperelliptic.org/EFD/g1p/data/shortw/jacobian-3/doubling/dbl-2007-bl \ // See https://hyperelliptic.org/EFD/g1p/data/shortw/jacobian-3/doubling/dbl-2007-bl