internal/sm2ec: loong64 v1

This commit is contained in:
Sun Yimin 2025-10-15 13:37:10 +08:00 committed by GitHub
parent 31b941908a
commit aa2ef453f1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 877 additions and 450 deletions

View File

@ -7,9 +7,12 @@
#include "textflag.h"
// Register aliases used throughout this file.
// On LoongArch, R0 always reads as zero and R3 is the stack pointer.
#define ZERO R0
#define RSP R3
// Pointer registers used by the exported routines.
#define res_ptr R4
#define x_ptr R5
#define y_ptr R6
// a_ptr / b_ptr are alternative names for x_ptr / y_ptr (the same registers
// R5 / R6); the point routines use them for their two input points.
#define a_ptr x_ptr
#define b_ptr y_ptr
// Accumulator registers.
#define acc0 R7
#define acc1 R8
@ -528,13 +531,13 @@ TEXT ·p256Sqr(SB),NOSPLIT,$0
ADDV $1, const0, const1
sqrLoop:
SUBV $1, y_ptr
CALL sm2P256SqrInternal<>(SB)
MOVV y0, x0
MOVV y1, x1
MOVV y2, x2
MOVV y3, x3
BNE y_ptr, sqrLoop
SUBV $1, y_ptr
CALL sm2P256SqrInternal<>(SB)
MOVV y0, x0
MOVV y1, x1
MOVV y2, x2
MOVV y3, x3
BNE y_ptr, sqrLoop
MOVV y0, (8*0)(res_ptr)
MOVV y1, (8*1)(res_ptr)
@ -1288,11 +1291,147 @@ TEXT ·p256OrdReduce(SB),NOSPLIT,$0
/* ---------------------------------------*/
// func p256Select(res *SM2P256Point, table *p256Table, idx, limit int)
//
// Table lookup without data-dependent branches or addresses: copies the
// idx-th entry (1-based) of table into res. Every one of the `limit` entries
// is read; an entry is kept only when its 1-based position equals idx
// (MASKNEZ clears the loaded words whenever position XOR idx is non-zero).
// If idx is 0 or larger than limit, no entry matches and res is set to all
// zero words.
//
// Each table entry is 96 bytes: x, y and z, 32 bytes each.
// NOTE(review): limit is assumed to be >= 1; the loop is do-while shaped.
TEXT ·p256Select(SB),NOSPLIT,$0
	MOVV	limit+24(FP), x_ptr
	MOVV	idx+16(FP), const0
	MOVV	table+8(FP), y_ptr
	MOVV	res+0(FP), res_ptr

	// Result accumulators: (x3..x0) = X, (y3..y0) = Y, (t3..t0) = Z.
	MOVV	$0, x0
	MOVV	$0, x1
	MOVV	$0, x2
	MOVV	$0, x3
	MOVV	$0, y0
	MOVV	$0, y1
	MOVV	$0, y2
	MOVV	$0, y3
	MOVV	$0, t0
	MOVV	$0, t1
	MOVV	$0, t2
	MOVV	$0, t3

	MOVV	$0, const1              // const1 = 1-based position of the current entry

loop_select:
	ADDV	$1, const1, const1
	XOR	const1, const0, hlp0    // hlp0 == 0 only for the requested entry

	// X coordinate
	MOVV	(8*0)(y_ptr), acc0
	MOVV	(8*1)(y_ptr), acc1
	MOVV	(8*2)(y_ptr), acc2
	MOVV	(8*3)(y_ptr), acc3
	MASKNEZ	hlp0, acc0, acc0
	MASKNEZ	hlp0, acc1, acc1
	MASKNEZ	hlp0, acc2, acc2
	MASKNEZ	hlp0, acc3, acc3
	OR	acc0, x0, x0
	OR	acc1, x1, x1
	OR	acc2, x2, x2
	OR	acc3, x3, x3
	ADDVU	$32, y_ptr, y_ptr

	// Y coordinate
	MOVV	(8*0)(y_ptr), acc0
	MOVV	(8*1)(y_ptr), acc1
	MOVV	(8*2)(y_ptr), acc2
	MOVV	(8*3)(y_ptr), acc3
	MASKNEZ	hlp0, acc0, acc0
	MASKNEZ	hlp0, acc1, acc1
	MASKNEZ	hlp0, acc2, acc2
	MASKNEZ	hlp0, acc3, acc3
	OR	acc0, y0, y0
	OR	acc1, y1, y1
	OR	acc2, y2, y2
	OR	acc3, y3, y3
	ADDVU	$32, y_ptr, y_ptr

	// Z coordinate
	MOVV	(8*0)(y_ptr), acc0
	MOVV	(8*1)(y_ptr), acc1
	MOVV	(8*2)(y_ptr), acc2
	MOVV	(8*3)(y_ptr), acc3
	MASKNEZ	hlp0, acc0, acc0
	MASKNEZ	hlp0, acc1, acc1
	MASKNEZ	hlp0, acc2, acc2
	MASKNEZ	hlp0, acc3, acc3
	OR	acc0, t0, t0
	OR	acc1, t1, t1
	OR	acc2, t2, t2
	OR	acc3, t3, t3
	// Step past Z as well, so the next iteration starts at the X coordinate
	// of the following 96-byte entry rather than at this entry's Z.
	ADDVU	$32, y_ptr, y_ptr

	BNE	const1, x_ptr, loop_select

	MOVV	x0, (8*0)(res_ptr)
	MOVV	x1, (8*1)(res_ptr)
	MOVV	x2, (8*2)(res_ptr)
	MOVV	x3, (8*3)(res_ptr)
	MOVV	y0, (8*4)(res_ptr)
	MOVV	y1, (8*5)(res_ptr)
	MOVV	y2, (8*6)(res_ptr)
	MOVV	y3, (8*7)(res_ptr)
	MOVV	t0, (8*8)(res_ptr)
	MOVV	t1, (8*9)(res_ptr)
	MOVV	t2, (8*10)(res_ptr)
	MOVV	t3, (8*11)(res_ptr)
	RET
/* ---------------------------------------*/
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
//
// Lookup in a 32-entry affine table without data-dependent branches or
// addresses: copies the idx-th entry (1-based) into res. All 32 entries are
// read; an entry is kept only when its 1-based position equals idx. If idx
// is 0 or larger than 32, res is set to all zero words.
//
// Each table entry is 64 bytes: x and y, 32 bytes each.
TEXT ·p256SelectAffine(SB),NOSPLIT,$0
	// MOVV is the 64-bit integer move on loong64 (MOVD moves a float64).
	MOVV	idx+16(FP), t0
	MOVV	table+8(FP), t1
	MOVV	res+0(FP), res_ptr

	// Result accumulators: (x3..x0) = X, (y3..y0) = Y.
	XOR	x0, x0, x0
	XOR	x1, x1, x1
	XOR	x2, x2, x2
	XOR	x3, x3, x3
	XOR	y0, y0, y0
	XOR	y1, y1, y1
	XOR	y2, y2, y2
	XOR	y3, y3, y3

	MOVV	$0, t2                  // t2 = 1-based position of the current entry
	MOVV	$32, const0             // number of table entries

loop_select:
	ADDV	$1, t2, t2
	XOR	t2, t0, hlp0            // hlp0 == 0 only for the requested entry

	// X coordinate
	MOVV	(8*0)(t1), acc0
	MOVV	(8*1)(t1), acc1
	MOVV	(8*2)(t1), acc2
	MOVV	(8*3)(t1), acc3
	MASKNEZ	hlp0, acc0, acc0
	MASKNEZ	hlp0, acc1, acc1
	MASKNEZ	hlp0, acc2, acc2
	MASKNEZ	hlp0, acc3, acc3
	OR	acc0, x0, x0
	OR	acc1, x1, x1
	OR	acc2, x2, x2
	OR	acc3, x3, x3
	ADDVU	$32, t1, t1

	// Y coordinate
	MOVV	(8*0)(t1), acc0
	MOVV	(8*1)(t1), acc1
	MOVV	(8*2)(t1), acc2
	MOVV	(8*3)(t1), acc3
	MASKNEZ	hlp0, acc0, acc0
	MASKNEZ	hlp0, acc1, acc1
	MASKNEZ	hlp0, acc2, acc2
	MASKNEZ	hlp0, acc3, acc3
	OR	acc0, y0, y0
	OR	acc1, y1, y1
	OR	acc2, y2, y2
	OR	acc3, y3, y3
	// Step past Y as well, so the next iteration starts at the X coordinate
	// of the following 64-byte entry rather than at this entry's Y.
	ADDVU	$32, t1, t1

	BNE	t2, const0, loop_select

	MOVV	x0, (8*0)(res_ptr)
	MOVV	x1, (8*1)(res_ptr)
	MOVV	x2, (8*2)(res_ptr)
	MOVV	x3, (8*3)(res_ptr)
	MOVV	y0, (8*4)(res_ptr)
	MOVV	y1, (8*5)(res_ptr)
	MOVV	y2, (8*6)(res_ptr)
	MOVV	y3, (8*7)(res_ptr)
	RET
/* ---------------------------------------*/
@ -1420,9 +1559,304 @@ TEXT ·p256MulBy2(SB),NOSPLIT,$0
MOVV x3, (8*3)(res_ptr)
RET
/* ---------------------------------------*/
// Coordinate accessors. Each takes a byte offset into a 32-byte coordinate.
// Input 1 is addressed through a_ptr, input 2 through b_ptr and the result
// through res_ptr; x, y and z live at byte offsets 0, 32 and 64 of a point.
#define x1in(off) (off)(a_ptr)
#define y1in(off) (off + 32)(a_ptr)
#define z1in(off) (off + 64)(a_ptr)
#define x2in(off) (off)(b_ptr)
#define z2in(off) (off + 64)(b_ptr)
#define x3out(off) (off)(res_ptr)
#define y3out(off) (off + 32)(res_ptr)
#define z3out(off) (off + 64)(res_ptr)

// LDx/LDy load four 64-bit limbs from the location named by an accessor
// macro into (x3..x0) / (y3..y0); STx/STy store them back. The argument is
// the bare name of an accessor macro (for example LDx(z1in)).
#define LDx(src) MOVV src(0), x0; MOVV src(8), x1; MOVV src(16), x2; MOVV src(24), x3
#define LDy(src) MOVV src(0), y0; MOVV src(8), y1; MOVV src(16), y2; MOVV src(24), y3
#define STx(src) MOVV x0, src(0); MOVV x1, src(8); MOVV x2, src(16); MOVV x3, src(24)
#define STy(src) MOVV y0, src(0); MOVV y1, src(8); MOVV y2, src(16); MOVV y3, src(24)
/* ---------------------------------------*/
// Stack scratch slots for the point-addition routines. Each slot holds one
// 32-byte field element and is addressed relative to RSP with an 8-byte bias.
// Slots 0-7 (through hcub) end at byte 264 and slots 0-11 (through u2) end
// at byte 392, matching the frame sizes declared by p256PointAddAffineAsm
// and p256PointAddAsm respectively.
#define y2in(off) (32*0 + 8 + off)(RSP)
#define s2(off) (32*1 + 8 + off)(RSP)
#define z1sqr(off) (32*2 + 8 + off)(RSP)
#define h(off) (32*3 + 8 + off)(RSP)
#define r(off) (32*4 + 8 + off)(RSP)
#define hsqr(off) (32*5 + 8 + off)(RSP)
#define rsqr(off) (32*6 + 8 + off)(RSP)
#define hcub(off) (32*7 + 8 + off)(RSP)
#define z2sqr(off) (32*8 + 8 + off)(RSP)
#define s1(off) (32*9 + 8 + off)(RSP)
#define u1(off) (32*10 + 8 + off)(RSP)
#define u2(off) (32*11 + 8 + off)(RSP)
/* ---------------------------------------*/
// func p256PointAddAffineAsm(res, in1 *SM2P256Point, in2 *p256AffinePoint, sign, sel, zero int)
//
// Mixed Jacobian/affine point addition res = in1 + (±in2), with the result
// overridden branch-free by two flags:
//   sign != 0  : in2 is negated first (y2 = -y2 mod p)
//   sel  == 0  : res = in1
//   zero == 0  : res = (x2, ±y2, 1)   (applied after sel, so it wins)
//
// Conventions assumed for helpers defined elsewhere in this file (verify
// against their definitions): sm2P256SqrInternal leaves x*x in y,
// sm2P256MulInternal leaves x*y in y and preserves x, sm2P256Subinternal
// leaves y-x in x, p256MulBy2Inline leaves 2*y in x. All are assumed to
// preserve a_ptr, b_ptr, res_ptr, hlp0, hlp1, const0 and const1.
TEXT ·p256PointAddAffineAsm(SB),0,$264-48
	MOVV	in1+8(FP), a_ptr
	MOVV	in2+16(FP), b_ptr
	MOVV	sign+24(FP), hlp0
	MOVV	sel+32(FP), hlp1
	// res_ptr carries the "zero" flag here; the result pointer is reloaded
	// from the frame into t0 each time something is stored.
	MOVV	zero+40(FP), res_ptr

	// const0/const1 are limb 1 and limb 3 of p256one; limbs 0 and 2 are
	// materialised as the immediates 1 and 0 below.
	MOVV	p256one<>+0x08(SB), const0
	ADDV	$1, const0, const1

	// Negate y2in based on sign. y2 is the second coordinate of the affine
	// point, i.e. bytes 32..63 of in2.
	MOVV	(8*4)(b_ptr), y0
	MOVV	(8*5)(b_ptr), y1
	MOVV	(8*6)(b_ptr), y2
	MOVV	(8*7)(b_ptr), y3
	// (acc3, acc2, acc1, acc0) = 0 - (y3, y2, y1, y0); t3 tracks the borrow
	SGTU	y0, ZERO, t3
	SUBV	y0, ZERO, acc0
	SGTU	y1, ZERO, t4
	SUBV	y1, ZERO, acc1
	SGTU	t3, acc1, t1
	SUBV	t3, acc1, acc1
	OR	t4, t1, t3
	SGTU	y2, ZERO, t4
	SUBV	y2, ZERO, acc2
	SGTU	t3, acc2, t1
	SUBV	t3, acc2, acc2
	OR	t4, t1, t3
	SGTU	y3, ZERO, t4
	SUBV	y3, ZERO, acc3
	SGTU	t3, acc3, t1
	SUBV	t3, acc3, acc3
	OR	t4, t1, t3

	// If the subtraction borrowed (y2 != 0) the value is 2^256 - y2; subtract
	// p256one = 2^256 - p to obtain p - y2. The subtrahend is masked to zero
	// when there was no borrow.
	MOVV	$1, acc4
	MASKEQZ	t3, acc4, acc4
	MASKEQZ	t3, const0, acc5
	MASKEQZ	t3, const1, acc7
	SGTU	acc4, acc0, t3
	SUBV	acc4, acc0, acc0
	ADDV	t3, acc5, acc5              // no carry
	SGTU	acc5, acc1, t3
	SUBV	acc5, acc1, acc1
	SGTU	t3, acc2, t1
	SUBV	t3, acc2, acc2
	ADDV	t1, acc7, t3                // no carry
	SUBV	t3, acc3, acc3

	// If sign is 0, keep the original value; otherwise take the negation.
	MASKEQZ	hlp0, acc0, acc0
	MASKNEZ	hlp0, y0, y0
	MASKEQZ	hlp0, acc1, acc1
	MASKNEZ	hlp0, y1, y1
	MASKEQZ	hlp0, acc2, acc2
	MASKNEZ	hlp0, y2, y2
	MASKEQZ	hlp0, acc3, acc3
	MASKNEZ	hlp0, y3, y3
	OR	acc0, y0
	OR	acc1, y1
	OR	acc2, y2
	OR	acc3, y3
	// Store result
	STy(y2in)

	// Begin point add
	LDx(z1in)
	CALL	sm2P256SqrInternal<>(SB)    // z1^2
	STy(z1sqr)

	LDx(x2in)
	CALL	sm2P256MulInternal<>(SB)    // u2 = x2 * z1^2

	LDx(x1in)
	CALL	sm2P256Subinternal<>(SB)    // h = u2 - u1
	STx(h)

	LDy(z1in)
	CALL	sm2P256MulInternal<>(SB)    // z3 = h * z1

	// iff select == 0, z3 = z1
	MOVV	(8*8)(a_ptr), acc0
	MOVV	(8*9)(a_ptr), acc1
	MOVV	(8*10)(a_ptr), acc2
	MOVV	(8*11)(a_ptr), acc3
	MASKEQZ	hlp1, y0, y0
	MASKNEZ	hlp1, acc0, acc0
	MASKEQZ	hlp1, y1, y1
	MASKNEZ	hlp1, acc1, acc1
	MASKEQZ	hlp1, y2, y2
	MASKNEZ	hlp1, acc2, acc2
	MASKEQZ	hlp1, y3, y3
	MASKNEZ	hlp1, acc3, acc3
	OR	acc0, y0
	OR	acc1, y1
	OR	acc2, y2
	OR	acc3, y3

	// iff zero == 0, z3 = 1 (p256one, i.e. 1 in the Montgomery domain)
	MOVV	$1, acc0
	MOVV	const0, acc1
	MOVV	$0, acc2
	MOVV	const1, acc3
	MASKEQZ	res_ptr, y0, y0
	MASKNEZ	res_ptr, acc0, acc0
	MASKEQZ	res_ptr, y1, y1
	MASKNEZ	res_ptr, acc1, acc1
	MASKEQZ	res_ptr, y2, y2
	MASKNEZ	res_ptr, acc2, acc2
	MASKEQZ	res_ptr, y3, y3
	MASKNEZ	res_ptr, acc3, acc3
	OR	acc0, y0
	OR	acc1, y1
	OR	acc2, y2
	OR	acc3, y3

	// Load z1 before z3 is stored: res may alias in1.
	LDx(z1in)
	// store z3
	MOVV	res+0(FP), t0
	MOVV	y0, (8*8)(t0)
	MOVV	y1, (8*9)(t0)
	MOVV	y2, (8*10)(t0)
	MOVV	y3, (8*11)(t0)

	LDy(z1sqr)
	CALL	sm2P256MulInternal<>(SB)    // z1^3

	LDx(y2in)
	CALL	sm2P256MulInternal<>(SB)    // s2 = y2 * z1^3
	STy(s2)

	LDx(y1in)
	CALL	sm2P256Subinternal<>(SB)    // r = s2 - s1
	STx(r)

	CALL	sm2P256SqrInternal<>(SB)    // rsqr = r^2
	STy(rsqr)

	LDx(h)
	CALL	sm2P256SqrInternal<>(SB)    // hsqr = h^2
	STy(hsqr)

	CALL	sm2P256MulInternal<>(SB)    // hcub = h^3 (x still holds h)
	STy(hcub)

	LDx(y1in)
	CALL	sm2P256MulInternal<>(SB)    // y1 * h^3
	STy(s2)

	MOVV	hsqr(0*8), x0
	MOVV	hsqr(1*8), x1
	MOVV	hsqr(2*8), x2
	MOVV	hsqr(3*8), x3
	// u1 is x1 itself because the second operand is affine (z2 = 1).
	LDy(x1in)
	CALL	sm2P256MulInternal<>(SB)    // hsqr * u1
	// The h slot is reused for u1 * h^2 from here on.
	MOVV	y0, h(0*8)
	MOVV	y1, h(1*8)
	MOVV	y2, h(2*8)
	MOVV	y3, h(3*8)

	p256MulBy2Inline                    // u1 * h^2 * 2, inline

	LDy(rsqr)
	CALL	sm2P256Subinternal<>(SB)    // r^2 - u1 * h^2 * 2

	MOVV	x0, y0
	MOVV	x1, y1
	MOVV	x2, y2
	MOVV	x3, y3
	LDx(hcub)
	CALL	sm2P256Subinternal<>(SB)    // x3 = r^2 - 2 * u1 * h^2 - h^3

	MOVV	(8*0)(a_ptr), acc0
	MOVV	(8*1)(a_ptr), acc1
	MOVV	(8*2)(a_ptr), acc2
	MOVV	(8*3)(a_ptr), acc3
	// iff select == 0, x3 = x1
	MASKEQZ	hlp1, x0, x0
	MASKNEZ	hlp1, acc0, acc0
	MASKEQZ	hlp1, x1, x1
	MASKNEZ	hlp1, acc1, acc1
	MASKEQZ	hlp1, x2, x2
	MASKNEZ	hlp1, acc2, acc2
	MASKEQZ	hlp1, x3, x3
	MASKNEZ	hlp1, acc3, acc3
	OR	acc0, x0
	OR	acc1, x1
	OR	acc2, x2
	OR	acc3, x3

	MOVV	(8*0)(b_ptr), acc0
	MOVV	(8*1)(b_ptr), acc1
	MOVV	(8*2)(b_ptr), acc2
	MOVV	(8*3)(b_ptr), acc3
	// iff zero == 0, x3 = x2
	MASKEQZ	res_ptr, x0, x0
	MASKNEZ	res_ptr, acc0, acc0
	MASKEQZ	res_ptr, x1, x1
	MASKNEZ	res_ptr, acc1, acc1
	MASKEQZ	res_ptr, x2, x2
	MASKNEZ	res_ptr, acc2, acc2
	MASKEQZ	res_ptr, x3, x3
	MASKNEZ	res_ptr, acc3, acc3
	OR	acc0, x0
	OR	acc1, x1
	OR	acc2, x2
	OR	acc3, x3

	// store x3
	MOVV	res+0(FP), t0
	MOVV	x0, (8*0)(t0)
	MOVV	x1, (8*1)(t0)
	MOVV	x2, (8*2)(t0)
	MOVV	x3, (8*3)(t0)

	// When a flag replaced x3 above, the y3 computed below is meaningless,
	// but it is then replaced by the same flag as well.
	MOVV	h(0*8), y0
	MOVV	h(1*8), y1
	MOVV	h(2*8), y2
	MOVV	h(3*8), y3
	CALL	sm2P256Subinternal<>(SB)    // u1 * h^2 - x3

	MOVV	r(0*8), y0
	MOVV	r(1*8), y1
	MOVV	r(2*8), y2
	MOVV	r(3*8), y3
	CALL	sm2P256MulInternal<>(SB)    // r * (u1 * h^2 - x3)

	MOVV	s2(0*8), x0
	MOVV	s2(1*8), x1
	MOVV	s2(2*8), x2
	MOVV	s2(3*8), x3
	CALL	sm2P256Subinternal<>(SB)    // y3 = r * (u1 * h^2 - x3) - y1 * h^3

	MOVV	(8*4)(a_ptr), acc0
	MOVV	(8*5)(a_ptr), acc1
	MOVV	(8*6)(a_ptr), acc2
	MOVV	(8*7)(a_ptr), acc3
	// iff select == 0, y3 = y1
	MASKEQZ	hlp1, x0, x0
	MASKNEZ	hlp1, acc0, acc0
	MASKEQZ	hlp1, x1, x1
	MASKNEZ	hlp1, acc1, acc1
	MASKEQZ	hlp1, x2, x2
	MASKNEZ	hlp1, acc2, acc2
	MASKEQZ	hlp1, x3, x3
	MASKNEZ	hlp1, acc3, acc3
	OR	acc0, x0
	OR	acc1, x1
	OR	acc2, x2
	OR	acc3, x3

	MOVV	y2in(0*8), acc0
	MOVV	y2in(1*8), acc1
	MOVV	y2in(2*8), acc2
	MOVV	y2in(3*8), acc3
	// iff zero == 0, y3 = y2 (the possibly negated copy on the stack)
	MASKEQZ	res_ptr, x0, x0
	MASKNEZ	res_ptr, acc0, acc0
	MASKEQZ	res_ptr, x1, x1
	MASKNEZ	res_ptr, acc1, acc1
	MASKEQZ	res_ptr, x2, x2
	MASKNEZ	res_ptr, acc2, acc2
	MASKEQZ	res_ptr, x3, x3
	MASKNEZ	res_ptr, acc3, acc3
	OR	acc0, x0
	OR	acc1, x1
	OR	acc2, x2
	OR	acc3, x3

	// store y3
	MOVV	res+0(FP), t0
	MOVV	x0, (8*4)(t0)
	MOVV	x1, (8*5)(t0)
	MOVV	x2, (8*6)(t0)
	MOVV	x3, (8*7)(t0)
	RET
// (x3, x2, x1, x0) = (x3, x2, x1, x0) + (y3, y2, y1, y0)
@ -1497,17 +1931,450 @@ TEXT ·p256Add(SB),NOSPLIT,$0
MOVV x3, (8*3)(res_ptr)
RET
// (y3, y2, y1, y0) = (y3, y2, y1, y0) / 2 mod p
//
// Expects const0 / const1 to hold limb 1 / limb 3 of p256one = 2^256 - p
// (limbs 0 and 2 of p256one are 1 and 0). Clobbers acc1, acc2, acc3, t0, t1, t2.
//
// If the input is odd, p is added first so that the sum is even. Adding p is
// done as "subtract p256one and add 2^256":
//   t0   = low bit of y0 (1 when odd); it masks the subtrahend to 0 when even
//   t1   = running borrow of the limb-wise subtraction
//   bit 256 of the sum is set exactly when the input was odd and the
//   subtraction did NOT borrow out of the top limb (y3 >= acc3); it becomes
//   bit 63 of y3 after the shift.
// The 257-bit sum is shifted right by one; BSTRINSV moves bit 0 of the next
// higher limb into bit 63 of the limb below it.
#define p256DivideBy2 \
	MOVV $1, acc1; \
	AND acc1, y0, t0; \
	MASKEQZ t0, acc1, acc1; \
	MASKEQZ t0, const0, acc2; \
	MASKEQZ t0, const1, acc3; \
	SGTU acc1, y0, t1; \
	SUBV acc1, y0, y0; \
	ADDV t1, acc2, acc2; \
	SRLV $1, y0, y0; \
	SGTU acc2, y1, t1; \
	SUBV acc2, y1, y1; \
	SGTU t1, y2, t2; \
	SUBV t1, y2, y2; \
	BSTRINSV $63, y1, $63, y0; \
	SRLV $1, y1, y1; \
	ADDV t2, acc3, acc3; \
	BSTRINSV $63, y2, $63, y1; \
	SRLV $1, y2, y2; \
	SUBV acc3, y3, t1; \
	SGTU acc3, y3, t2; \
	BSTRINSV $63, t1, $63, y2; \
	SRLV $1, t1, y3; \
	MASKNEZ t2, t0, t2; \
	BSTRINSV $63, t2, $63, y3
/* ---------------------------------------*/
// func p256DivBy2(res, in *p256Element)
//
// Loads the four limbs of in into (y3..y0), applies the p256DivideBy2 macro
// and stores the resulting limbs to res.
TEXT ·p256DivBy2(SB),NOSPLIT,$0
MOVV res+0(FP), res_ptr
MOVV in+8(FP), x_ptr
MOVV (8*0)(x_ptr), y0
MOVV (8*1)(x_ptr), y1
MOVV (8*2)(x_ptr), y2
MOVV (8*3)(x_ptr), y3
// const0 = second limb of p256one, const1 = const0 + 1; both are read by
// p256DivideBy2.
MOVV p256one<>+0x08(SB), const0
ADDV $1, const0, const1
p256DivideBy2
MOVV y0, (8*0)(res_ptr)
MOVV y1, (8*1)(res_ptr)
MOVV y2, (8*2)(res_ptr)
MOVV y3, (8*3)(res_ptr)
RET
// Stack scratch slots for the point-doubling routines: four 32-byte field
// elements addressed relative to RSP with an 8-byte bias (4*32 + 8 = 136,
// the frame size declared below).
#define s(off) (32*0 + 8 + off)(RSP)
#define m(off) (32*1 + 8 + off)(RSP)
#define zsqr(off) (32*2 + 8 + off)(RSP)
#define tmp(off) (32*3 + 8 + off)(RSP)
// func p256PointDoubleAsm(res, in *SM2P256Point)
//
// Jacobian point doubling: res = 2 * in.
//
// The step comments below assume the helper conventions suggested by their
// use in this file (TODO confirm against the helper definitions):
// sm2P256SqrInternal: y = x^2; sm2P256MulInternal: y = x * y;
// sm2P256Subinternal: x = y - x; p256AddInline: x = x + y;
// p256MulBy2Inline: x = 2 * y.
TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-16
MOVV res+0(FP), res_ptr
MOVV in+8(FP), a_ptr
// const0 = second limb of p256one, const1 = const0 + 1 (read by p256DivideBy2).
MOVV p256one<>+0x08(SB), const0
ADDV $1, const0, const1
// Begin point double
// load z
MOVV (8*8)(a_ptr), x0
MOVV (8*9)(a_ptr), x1
MOVV (8*10)(a_ptr), x2
MOVV (8*11)(a_ptr), x3
CALL sm2P256SqrInternal<>(SB) // z1^2
MOVV y0, zsqr(0*8) // store z^2
MOVV y1, zsqr(1*8)
MOVV y2, zsqr(2*8)
MOVV y3, zsqr(3*8)
MOVV (8*0)(a_ptr), x0 // load x
MOVV (8*1)(a_ptr), x1
MOVV (8*2)(a_ptr), x2
MOVV (8*3)(a_ptr), x3
// m = x + z^2 (temporarily)
p256AddInline
STx(m)
// z3 = 2 * y * z
LDx(z1in)
LDy(y1in)
CALL sm2P256MulInternal<>(SB)
p256MulBy2Inline
STx(z3out)
// (x - z^2) * (x + z^2)
LDy(x1in)
LDx(zsqr)
CALL sm2P256Subinternal<>(SB)
LDy(m)
CALL sm2P256MulInternal<>(SB)
// Multiply by 3
p256MulBy2Inline
p256AddInline
STx(m)
// s = (2 * y)^2
LDy(y1in)
p256MulBy2Inline
CALL sm2P256SqrInternal<>(SB)
STy(s)
MOVV y0, x0
MOVV y1, x1
MOVV y2, x2
MOVV y3, x3
// s^2
CALL sm2P256SqrInternal<>(SB)
// Divide by 2
p256DivideBy2
// y3out temporarily holds s^2 / 2
STy(y3out)
// s = x * (2 * y)^2
LDx(x1in)
LDy(s)
CALL sm2P256MulInternal<>(SB)
STy(s)
// tmp = 2 * s
p256MulBy2Inline
STx(tmp)
// x3 = m^2 - 2 * s
LDx(m)
CALL sm2P256SqrInternal<>(SB)
LDx(tmp)
CALL sm2P256Subinternal<>(SB)
STx(x3out)
// y3 = m * (s - x3) - (value parked in y3out)
LDy(s)
CALL sm2P256Subinternal<>(SB)
LDy(m)
CALL sm2P256MulInternal<>(SB)
LDx(y3out)
CALL sm2P256Subinternal<>(SB)
STx(y3out)
RET
// p256PointDoubleRound doubles, in place, the point stored at res_ptr
// (x3out / y3out / z3out). It performs the same sequence of steps as the
// body of p256PointDoubleAsm, except that the inputs are read from the
// output location instead of from a_ptr. It uses the s, m, zsqr and tmp
// stack slots and expects const0 / const1 to be set up by the caller.
#define p256PointDoubleRound() \
LDx(z3out) \ // load z
CALL sm2P256SqrInternal<>(SB) \
MOVV y0, zsqr(0*8) \ // store z^2
MOVV y1, zsqr(1*8) \
MOVV y2, zsqr(2*8) \
MOVV y3, zsqr(3*8) \
\
LDx(x3out) \// load x
p256AddInline \
STx(m) \
\
LDx(z3out) \ // load z
LDy(y3out) \ // load y
CALL sm2P256MulInternal<>(SB) \
p256MulBy2Inline \
STx(z3out) \ // store result z
\
LDy(x3out) \ // load x
LDx(zsqr) \
CALL sm2P256Subinternal<>(SB) \
LDy(m) \
CALL sm2P256MulInternal<>(SB) \
\
\// Multiply by 3
p256MulBy2Inline \
p256AddInline \
STx(m) \
\
LDy(y3out) \ // load y
p256MulBy2Inline \
CALL sm2P256SqrInternal<>(SB) \
STy(s) \
MOVV y0, x0 \
MOVV y1, x1 \
MOVV y2, x2 \
MOVV y3, x3 \
CALL sm2P256SqrInternal<>(SB) \
\
\// Divide by 2
p256DivideBy2 \
STy(y3out) \
\
LDx(x3out) \ // load x
LDy(s) \
CALL sm2P256MulInternal<>(SB) \
STy(s) \
p256MulBy2Inline \
STx(tmp) \
\
LDx(m) \
CALL sm2P256SqrInternal<>(SB) \
LDx(tmp) \
CALL sm2P256Subinternal<>(SB) \
\
STx(x3out) \
\
LDy(s) \
CALL sm2P256Subinternal<>(SB) \
\
LDy(m) \
CALL sm2P256MulInternal<>(SB) \
\
LDx(y3out) \
CALL sm2P256Subinternal<>(SB) \
STx(y3out) \
/* ---------------------------------------*/
// func p256PointDouble6TimesAsm(res, in *SM2P256Point)
//
// Doubles the input point six times: res = 64 * in. The first doubling reads
// its input through a_ptr and writes to res; the remaining five rounds use
// p256PointDoubleRound, which doubles the point stored at res in place.
TEXT ·p256PointDouble6TimesAsm(SB),NOSPLIT,$136-16
MOVV res+0(FP), res_ptr
MOVV in+8(FP), a_ptr
// const0 = second limb of p256one, const1 = const0 + 1 (read by p256DivideBy2).
MOVV p256one<>+0x08(SB), const0
ADDV $1, const0, const1
// Begin point double
// Round 1: same instruction sequence as the body of p256PointDoubleAsm.
MOVV (8*8)(a_ptr), x0
MOVV (8*9)(a_ptr), x1
MOVV (8*10)(a_ptr), x2
MOVV (8*11)(a_ptr), x3
CALL sm2P256SqrInternal<>(SB) // z1^2
MOVV y0, zsqr(0*8) // store z^2
MOVV y1, zsqr(1*8)
MOVV y2, zsqr(2*8)
MOVV y3, zsqr(3*8)
MOVV (8*0)(a_ptr), x0 // load x
MOVV (8*1)(a_ptr), x1
MOVV (8*2)(a_ptr), x2
MOVV (8*3)(a_ptr), x3
p256AddInline
STx(m)
LDx(z1in)
LDy(y1in)
CALL sm2P256MulInternal<>(SB)
p256MulBy2Inline
STx(z3out)
LDy(x1in)
LDx(zsqr)
CALL sm2P256Subinternal<>(SB)
LDy(m)
CALL sm2P256MulInternal<>(SB)
// Multiply by 3
p256MulBy2Inline
p256AddInline
STx(m)
LDy(y1in)
p256MulBy2Inline
CALL sm2P256SqrInternal<>(SB)
STy(s)
MOVV y0, x0
MOVV y1, x1
MOVV y2, x2
MOVV y3, x3
CALL sm2P256SqrInternal<>(SB)
// Divide by 2
p256DivideBy2
STy(y3out)
LDx(x1in)
LDy(s)
CALL sm2P256MulInternal<>(SB)
STy(s)
p256MulBy2Inline
STx(tmp)
LDx(m)
CALL sm2P256SqrInternal<>(SB)
LDx(tmp)
CALL sm2P256Subinternal<>(SB)
STx(x3out)
LDy(s)
CALL sm2P256Subinternal<>(SB)
LDy(m)
CALL sm2P256MulInternal<>(SB)
LDx(y3out)
CALL sm2P256Subinternal<>(SB)
STx(y3out)
// Begin point double rounds 2 - 6
p256PointDoubleRound()
p256PointDoubleRound()
p256PointDoubleRound()
p256PointDoubleRound()
p256PointDoubleRound()
RET
/* ---------------------------------------*/
// Re-target the accessors for p256PointAddAsm: y2in now reads the second
// input's y coordinate directly through b_ptr instead of a stack slot, and
// the output accessors go through b_ptr, which that routine reloads with the
// result pointer before storing.
#undef y2in
#undef x3out
#undef y3out
#undef z3out
#define y2in(off) (off + 32)(b_ptr)
#define x3out(off) (off)(b_ptr)
#define y3out(off) (off + 32)(b_ptr)
#define z3out(off) (off + 64)(b_ptr)
// func p256PointAddAsm(res, in1, in2 *SM2P256Point) int
//
// Jacobian point addition res = in1 + in2.
// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
//
// Returns 1 when both r = s2 - s1 and h = u2 - u1 are zero mod p, i.e. when
// the two inputs represent the same point and the addition formulas do not
// apply (the caller must double instead); returns 0 otherwise.
//
// Conventions assumed for helpers defined elsewhere in this file (verify
// against their definitions): sm2P256SqrInternal leaves x*x in y,
// sm2P256MulInternal leaves x*y in y and preserves x, sm2P256Subinternal
// leaves y-x in x, p256MulBy2Inline leaves 2*y in x. All are assumed to
// preserve a_ptr, b_ptr and res_ptr.
TEXT ·p256PointAddAsm(SB),0,$392-32
	MOVV	in1+8(FP), a_ptr
	MOVV	in2+16(FP), b_ptr

	MOVV	p256one<>+0x08(SB), const0
	ADDV	$1, const0, const1

	// Begin point add
	LDx(z2in)
	CALL	sm2P256SqrInternal<>(SB)    // z2^2
	STy(z2sqr)

	CALL	sm2P256MulInternal<>(SB)    // z2^3

	LDx(y1in)
	CALL	sm2P256MulInternal<>(SB)    // s1 = z2^3 * y1
	STy(s1)

	LDx(z1in)
	CALL	sm2P256SqrInternal<>(SB)    // z1^2
	STy(z1sqr)

	CALL	sm2P256MulInternal<>(SB)    // z1^3

	LDx(y2in)
	CALL	sm2P256MulInternal<>(SB)    // s2 = z1^3 * y2

	LDx(s1)
	CALL	sm2P256Subinternal<>(SB)    // r = s2 - s1
	STx(r)

	// Check if r is zero mod p: the representation may be 0 or p itself.
	// SGTU v, ZERO yields 1 when v != 0, so both flags are "differs from".
	OR	x0, x1, acc0
	OR	x2, x3, acc1
	OR	acc0, acc1, acc1
	SGTU	acc1, ZERO, hlp0            // hlp0 = (r != 0)

	MOVV	$-1, acc0
	MOVV	p256p<>+0x08(SB), acc1
	MOVV	p256p<>+0x18(SB), acc3

	XOR	acc0, x0, acc4
	XOR	acc1, x1, acc5
	XOR	acc0, x2, acc6
	XOR	acc3, x3, acc7
	OR	acc4, acc5, acc4
	OR	acc6, acc7, acc7
	OR	acc4, acc7, acc7
	SGTU	acc7, ZERO, res_ptr         // res_ptr = (r != p)
	AND	hlp0, res_ptr, res_ptr      // res_ptr = 1 iff r is non-zero mod p

	LDx(z2sqr)
	LDy(x1in)
	CALL	sm2P256MulInternal<>(SB)    // u1 = x1 * z2^2
	STy(u1)

	LDx(z1sqr)
	LDy(x2in)
	CALL	sm2P256MulInternal<>(SB)    // u2 = x2 * z1^2
	STy(u2)

	LDx(u1)
	CALL	sm2P256Subinternal<>(SB)    // h = u2 - u1
	STx(h)

	// Check if h is zero mod p, same scheme as for r.
	OR	x0, x1, acc0
	OR	x2, x3, acc1
	OR	acc0, acc1, acc1
	SGTU	acc1, ZERO, hlp0            // hlp0 = (h != 0)

	MOVV	$-1, acc0
	MOVV	p256p<>+0x08(SB), acc1
	MOVV	p256p<>+0x18(SB), acc3

	XOR	acc0, x0, acc4
	XOR	acc1, x1, acc5
	XOR	acc0, x2, acc6
	XOR	acc3, x3, acc7
	OR	acc4, acc5, acc4
	OR	acc6, acc7, acc7
	OR	acc4, acc7, acc7
	SGTU	acc7, ZERO, t0              // t0 = (h != p)
	AND	hlp0, t0, hlp0              // hlp0 = 1 iff h is non-zero mod p
	OR	hlp0, res_ptr, res_ptr      // 1 iff r or h is non-zero mod p
	MOVV	$1, t0
	XOR	t0, res_ptr, res_ptr        // invert: 1 iff r == 0 and h == 0 (mod p)

	LDx(r)
	CALL	sm2P256SqrInternal<>(SB)    // rsqr = r^2
	STy(rsqr)

	LDx(h)
	CALL	sm2P256SqrInternal<>(SB)    // hsqr = h^2
	STy(hsqr)

	LDx(h)
	CALL	sm2P256MulInternal<>(SB)    // hcub = h^3
	STy(hcub)

	LDx(s1)
	CALL	sm2P256MulInternal<>(SB)    // s1 * h^3, kept in the s2 slot
	STy(s2)

	LDx(z1in)
	LDy(z2in)
	CALL	sm2P256MulInternal<>(SB)    // z1 * z2

	LDx(h)
	CALL	sm2P256MulInternal<>(SB)    // z1 * z2 * h

	// All reads of in2 are done; b_ptr now addresses the result.
	MOVV	res+0(FP), b_ptr
	STy(z3out)

	LDx(hsqr)
	LDy(u1)
	CALL	sm2P256MulInternal<>(SB)    // h^2 * u1
	STy(u2)                             // the u2 slot is reused for u1 * h^2

	p256MulBy2Inline                    // u1 * h^2 * 2, inline
	LDy(rsqr)
	CALL	sm2P256Subinternal<>(SB)    // r^2 - u1 * h^2 * 2

	MOVV	x0, y0
	MOVV	x1, y1
	MOVV	x2, y2
	MOVV	x3, y3
	LDx(hcub)
	CALL	sm2P256Subinternal<>(SB)    // x3 = r^2 - 2 * u1 * h^2 - h^3
	STx(x3out)

	LDy(u2)
	CALL	sm2P256Subinternal<>(SB)    // u1 * h^2 - x3

	LDy(r)
	CALL	sm2P256MulInternal<>(SB)    // r * (u1 * h^2 - x3)

	LDx(s2)
	CALL	sm2P256Subinternal<>(SB)    // y3 = r * (u1 * h^2 - x3) - s1 * h^3
	STx(y3out)

	MOVV	res_ptr, ret+24(FP)
	RET

View File

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build purego || !(amd64 || arm64 || s390x || ppc64le)
//go:build purego || !(amd64 || arm64 || loong64 || s390x || ppc64le)
package sm2ec

View File

@ -7,7 +7,7 @@
// 256-bit primes"
// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf
//go:build (amd64 || arm64 || s390x || ppc64le) && !purego
//go:build (amd64 || arm64 || loong64 || s390x || ppc64le) && !purego
package sm2ec

View File

@ -1,64 +0,0 @@
package sm2ec
import (
"github.com/emmansun/gmsm/internal/deps/cpu"
)
// p256Element is a base field element in [0, P-1] in the Montgomery
// domain (with R = 2²⁵⁶), stored as four 64-bit limbs in little-endian
// limb order (index 0 is the least significant limb).
type p256Element [4]uint64

// SM2P256Point1 is a curve point in Jacobian coordinates.
type SM2P256Point1 struct {
// (X:Y:Z) are Jacobian coordinates where x = X/Z² and y = Y/Z³. The point
// at infinity can be represented by any set of coordinates with Z = 0.
x, y, z p256Element
}

// supportLSX mirrors cpu.Loong64.HasLSX (the LSX CPU feature flag).
var supportLSX = cpu.Loong64.HasLSX

// supportLASX mirrors cpu.Loong64.HasLASX (the LASX CPU feature flag).
var supportLASX = cpu.Loong64.HasLASX
// p256BigToLittle converts a 32-byte big-endian value into four
// little-endian 64-bit limbs: the first 8 input bytes become res[3].
//
//go:noescape
func p256BigToLittle(res *p256Element, in *[32]byte)

// p256LittleToBig is the inverse of p256BigToLittle: it serialises the four
// limbs of in as a 32-byte big-endian value.
//
//go:noescape
func p256LittleToBig(res *[32]byte, in *p256Element)

// If cond is 0, sets res = b, otherwise sets res = a.
//
//go:noescape
func p256MovCond(res, a, b *SM2P256Point1, cond int)

// If cond is not 0, sets val = -val mod p.
//
//go:noescape
func p256NegCond(val *p256Element, cond int)

// Montgomery multiplication. Sets res = in1 * in2 * R⁻¹ mod p.
//
//go:noescape
func p256Mul(res, in1, in2 *p256Element)

// Montgomery square, repeated n times (n >= 1).
//
//go:noescape
func p256Sqr(res, in *p256Element, n int)

// Montgomery multiplication by R⁻¹, or 1 outside the domain.
// Sets res = in * R⁻¹, bringing res out of the Montgomery domain.
//
//go:noescape
func p256FromMont(res, in *p256Element)

// p256OrdReduce ensures s is in the range [0, ord(G)-1].
//
//go:noescape
func p256OrdReduce(s *p256OrdElement)

// p256Add sets res = in1 + in2 mod p.
//
//go:noescape
func p256Add(res, in1, in2 *p256Element)

// p256Sub sets res = in1 - in2 mod p.
//
//go:noescape
func p256Sub(res, in1, in2 *p256Element)

// p256MulBy2 sets res = 2 * in mod p.
//
//go:noescape
func p256MulBy2(res, in *p256Element)

View File

@ -1,376 +0,0 @@
//go:build loong64 && !purego
package sm2ec
import (
"bytes"
"crypto/rand"
"encoding/binary"
"fmt"
"io"
"math/big"
"reflect"
"testing"
"time"
)
// TestP256BigToLittle checks that p256BigToLittle unpacks a 32-byte
// big-endian value into four 64-bit limbs stored least-significant first,
// and that p256LittleToBig reverses the conversion.
func TestP256BigToLittle(t *testing.T) {
	// Known big-endian input: the bytes 0x01 .. 0x20.
	var src [32]byte
	for idx := range src {
		src[idx] = byte(idx + 1)
	}

	var limbs p256Element
	p256BigToLittle(&limbs, &src)

	// The limbs are stored in reverse order: the first 8 input bytes must
	// end up in the highest limb.
	for k := len(limbs) - 1; k >= 0; k-- {
		chunk := len(limbs) - 1 - k
		expected := binary.BigEndian.Uint64(src[chunk*8 : (chunk+1)*8])
		if limbs[k] != expected {
			t.Errorf("limb %d: got 0x%x, want 0x%x", k, limbs[k], expected)
		}
	}

	// Round trip through the inverse conversion.
	var roundTrip [32]byte
	p256LittleToBig(&roundTrip, &limbs)
	if !bytes.Equal(src[:], roundTrip[:]) {
		t.Errorf("p256LittleToBig(p256BigToLittle(...)) mismatch\nin: %x\nback: %x", src, roundTrip)
	}
}
// TestP256NegCond checks that p256NegCond negates its argument modulo p when
// cond is non-zero and leaves it untouched when cond is zero.
func TestP256NegCond(t *testing.T) {
	type negCase struct {
		input    p256Element
		cond     int
		expected p256Element
	}
	cases := []negCase{
		// -1 mod p
		{
			input:    p256Element{1, 0, 0, 0},
			cond:     1,
			expected: p256Element{0xfffffffffffffffe, 0xffffffff00000000, 0xffffffffffffffff, 0xfffffffeffffffff},
		},
		// cond == 0 keeps the value
		{
			input:    p256Element{1, 0, 0, 0},
			cond:     0,
			expected: p256Element{1, 0, 0, 0},
		},
		// a value with a large top limb
		{
			input:    p256Element{0x1, 0xffffffff00000001, 0xfffffffffffffffe, 0xfffffffeffffffff},
			cond:     1,
			expected: p256Element{0xfffffffffffffffe, 0xffffffffffffffff, 0, 0},
		},
	}

	for i := range cases {
		tc := &cases[i]
		got := tc.input // arrays are copied by value
		p256NegCond(&got, tc.cond)
		if got != tc.expected {
			t.Errorf("test %d: got %x, want %x", i, got, tc.expected)
		}
	}
}
// newPoint builds a test point whose coordinates are filled with the four
// consecutive values starting at x, y and z respectively.
func newPoint(x, y, z uint64) *SM2P256Point1 {
	// consecutive returns the element {base, base+1, base+2, base+3}.
	consecutive := func(base uint64) p256Element {
		return p256Element{base, base + 1, base + 2, base + 3}
	}

	pt := new(SM2P256Point1)
	pt.x = consecutive(x)
	pt.y = consecutive(y)
	pt.z = consecutive(z)
	return pt
}
// TestP256MovCond checks that p256MovCond copies b when cond is zero and a
// for any non-zero cond, including negative values.
func TestP256MovCond(t *testing.T) {
	fmt.Printf("supportLSX=%v, supportLASX=%v\n", supportLSX, supportLASX)

	first, second := newPoint(10, 20, 30), newPoint(100, 200, 300)
	var dst SM2P256Point1

	// A zero condition selects the second operand.
	if p256MovCond(&dst, first, second, 0); !reflect.DeepEqual(dst, *second) {
		t.Errorf("cond=0: got %+v, want %+v", dst, *second)
	}

	// A non-zero condition selects the first operand.
	if p256MovCond(&dst, first, second, 1); !reflect.DeepEqual(dst, *first) {
		t.Errorf("cond=1: got %+v, want %+v", dst, *first)
	}

	// Negative values count as non-zero as well.
	if p256MovCond(&dst, first, second, -123); !reflect.DeepEqual(dst, *first) {
		t.Errorf("cond=-123: got %+v, want %+v", dst, *first)
	}
}
// fromBig stores the words of big into out, least significant limb first,
// zero-filling any limbs the value does not occupy. The value must fit in
// len(out) words.
func fromBig(out *p256Element, big *big.Int) {
	*out = p256Element{}
	words := big.Bits()
	for i := 0; i < len(words); i++ {
		out[i] = uint64(words[i])
	}
}
// toBigInt converts a limb representation back into a *big.Int by going
// through its 32-byte big-endian serialisation.
func toBigInt(in *p256Element) *big.Int {
	var be [32]byte
	p256LittleToBig(&be, in)

	n := new(big.Int)
	n.SetBytes(be[:])
	return n
}
// p256MulTest multiplies x and y with p256Mul (after converting both to the
// Montgomery domain with factor r) and compares the result, converted back
// with p256FromMont, against x*y mod p computed with math/big.
func p256MulTest(t *testing.T, x, y, p, r *big.Int) {
	// toMont returns v*r mod p in limb form.
	toMont := func(v *big.Int) *p256Element {
		m := new(big.Int).Mul(v, r)
		m.Mod(m, p)
		e := new(p256Element)
		fromBig(e, m)
		return e
	}
	ax, ay := toMont(x), toMont(y)

	var product, plain p256Element
	p256Mul(&product, ax, ay)
	p256FromMont(&plain, &product)
	got := toBigInt(&plain)

	want := new(big.Int).Mul(x, y)
	want.Mod(want, p)
	if got.Cmp(want) != 0 {
		t.Fatalf("p256Mul(%x, %x) = %x, want %x", x, y, got, want)
	}
}
// TestP256MulPMinus1 exercises p256Mul with the largest field element,
// (p-1) * (p-1).
func TestP256MulPMinus1(t *testing.T) {
	const (
		primeHex = "FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF"
		montHex  = "10000000000000000000000000000000000000000000000000000000000000000"
	)
	p, _ := new(big.Int).SetString(primeHex, 16)
	r, _ := new(big.Int).SetString(montHex, 16)

	largest := new(big.Int).Sub(p, big.NewInt(1))
	p256MulTest(t, largest, largest, p, r)
}
// TestFuzzyP256Mul checks p256Mul against math/big on random operands for a
// fixed time budget (10ms with -short, 2s otherwise).
func TestFuzzyP256Mul(t *testing.T) {
	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)

	budget := 2 * time.Second
	if testing.Short() {
		budget = 10 * time.Millisecond
	}
	timeout := time.NewTimer(budget)
	defer timeout.Stop()

	var scalar1, scalar2 [32]byte
	for {
		select {
		case <-timeout.C:
			return
		default:
		}

		// A failed read would silently re-test stale operands, so fail loudly.
		if _, err := io.ReadFull(rand.Reader, scalar1[:]); err != nil {
			t.Fatalf("reading random scalar: %v", err)
		}
		if _, err := io.ReadFull(rand.Reader, scalar2[:]); err != nil {
			t.Fatalf("reading random scalar: %v", err)
		}

		x := new(big.Int).SetBytes(scalar1[:])
		y := new(big.Int).SetBytes(scalar2[:])
		p256MulTest(t, x, y, p, r)
	}
}
// p256SqrTest squares x once with p256Sqr (after converting it to the
// Montgomery domain with factor r) and compares the result, converted back
// with p256FromMont, against x*x mod p computed with math/big.
func p256SqrTest(t *testing.T, x, p, r *big.Int) {
	mont := new(big.Int).Mul(x, r)
	mont.Mod(mont, p)

	var in, squared, plain p256Element
	fromBig(&in, mont)
	p256Sqr(&squared, &in, 1)
	p256FromMont(&plain, &squared)
	got := toBigInt(&plain)

	want := new(big.Int).Mul(x, x)
	want.Mod(want, p)
	if got.Cmp(want) != 0 {
		t.Fatalf("p256Sqr(%x) = %x, want %x", x, got, want)
	}
}
// TestP256SqrPMinus1 exercises p256Sqr with the largest field element, p-1.
func TestP256SqrPMinus1(t *testing.T) {
	const (
		primeHex = "FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF"
		montHex  = "10000000000000000000000000000000000000000000000000000000000000000"
	)
	p, _ := new(big.Int).SetString(primeHex, 16)
	r, _ := new(big.Int).SetString(montHex, 16)

	largest := new(big.Int).Sub(p, big.NewInt(1))
	p256SqrTest(t, largest, p, r)
}
// TestFuzzyP256Sqr checks p256Sqr against math/big on random operands for a
// fixed time budget (10ms with -short, 2s otherwise).
func TestFuzzyP256Sqr(t *testing.T) {
	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)

	budget := 2 * time.Second
	if testing.Short() {
		budget = 10 * time.Millisecond
	}
	timeout := time.NewTimer(budget)
	defer timeout.Stop()

	var scalar1 [32]byte
	for {
		select {
		case <-timeout.C:
			return
		default:
		}

		// A failed read would silently re-test a stale operand, so fail loudly.
		if _, err := io.ReadFull(rand.Reader, scalar1[:]); err != nil {
			t.Fatalf("reading random scalar: %v", err)
		}

		x := new(big.Int).SetBytes(scalar1[:])
		p256SqrTest(t, x, p, r)
	}
}
// TestP256OrdReduce checks p256OrdReduce just below, at, and just above the
// group order.
func TestP256OrdReduce(t *testing.T) {
	order := p256OrdElement{0x53bbf40939d54123, 0x7203df6b21c6052b, 0xffffffffffffffff, 0xfffffffeffffffff}

	// order - 1 is already reduced and must not change.
	below := order
	below[0]--
	reduced := below
	p256OrdReduce(&reduced)
	if reduced != below {
		t.Errorf("p256OrdReduce changed s when s < p256Ord: got %x, want %x", reduced, below)
	}

	// The order itself reduces to zero.
	atOrder := order
	p256OrdReduce(&atOrder)
	var zero p256OrdElement
	if atOrder != zero {
		t.Errorf("p256OrdReduce failed for s == p256Ord: got %x, want 0", atOrder)
	}

	// order + 1 reduces to one.
	above := order
	above[0]++
	p256OrdReduce(&above)
	one := p256OrdElement{1, 0, 0, 0}
	if above != one {
		t.Errorf("p256OrdReduce failed for s == p256Ord+1: got %x, want %x", above, one)
	}
}
// TestP256Sub checks p256Sub for a positive difference, a zero difference
// and a difference that wraps around the modulus.
func TestP256Sub(t *testing.T) {
	var res p256Element

	// in1 > in2: plain subtraction.
	minuend, subtrahend := p256Element{5, 0, 0, 0}, p256Element{3, 0, 0, 0}
	p256Sub(&res, &minuend, &subtrahend)
	if want := (p256Element{2, 0, 0, 0}); res != want {
		t.Errorf("in1 > in2: got %v, want %v", res, want)
	}

	// in1 == in2: the difference is zero.
	minuend, subtrahend = p256Element{7, 8, 9, 10}, p256Element{7, 8, 9, 10}
	p256Sub(&res, &minuend, &subtrahend)
	if want := (p256Element{}); res != want {
		t.Errorf("in1 == in2: got %v, want %v", res, want)
	}

	// in1 < in2: 1 - 2 wraps around the modulus and yields p - 1.
	minuend, subtrahend = p256Element{1, 0, 0, 0}, p256Element{2, 0, 0, 0}
	p256Sub(&res, &minuend, &subtrahend)
	if want := (p256Element{
		0xfffffffffffffffe, 0xffffffff00000000,
		0xffffffffffffffff, 0xfffffffeffffffff,
	}); res != want {
		t.Errorf("in1 < in2: got %v, want %v", res, want)
	}
}
// p256MulBy2Test doubles x with p256MulBy2 (after converting it to the
// Montgomery domain with factor r) and compares the result, converted back
// with p256FromMont, against 2*x mod p computed with math/big.
func p256MulBy2Test(t *testing.T, x, p, r *big.Int) {
	x1 := new(big.Int).Mul(x, r)
	x1 = x1.Mod(x1, p)

	ax := new(p256Element)
	res := new(p256Element)
	res2 := new(p256Element)
	fromBig(ax, x1)
	p256MulBy2(res2, ax)
	p256FromMont(res, res2)
	resInt := toBigInt(res)

	expected := new(big.Int).Mul(x, big.NewInt(2))
	expected = expected.Mod(expected, p)
	if resInt.Cmp(expected) != 0 {
		t.Fatalf("p256MulBy2(%x) = %x, want %x", x, resInt, expected)
	}
}
// TestP256MulBy2 exercises p256MulBy2 with p-1, 0 and 1.
func TestP256MulBy2(t *testing.T) {
	const (
		primeHex = "FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF"
		montHex  = "10000000000000000000000000000000000000000000000000000000000000000"
	)
	p, _ := new(big.Int).SetString(primeHex, 16)
	r, _ := new(big.Int).SetString(montHex, 16)

	inputs := []*big.Int{
		new(big.Int).Sub(p, big.NewInt(1)),
		big.NewInt(0),
		big.NewInt(1),
	}
	for _, x := range inputs {
		p256MulBy2Test(t, x, p, r)
	}
}
// p256AddTest adds x and y with p256Add (after converting both to the
// Montgomery domain with factor r) and compares the result, converted back
// with p256FromMont, against x+y mod p computed with math/big.
func p256AddTest(t *testing.T, x, y, p, r *big.Int) {
	// toMont returns v*r mod p in limb form.
	toMont := func(v *big.Int) *p256Element {
		m := new(big.Int).Mul(v, r)
		m.Mod(m, p)
		e := new(p256Element)
		fromBig(e, m)
		return e
	}
	ax, ay := toMont(x), toMont(y)

	var sum, plain p256Element
	p256Add(&sum, ax, ay)
	p256FromMont(&plain, &sum)
	got := toBigInt(&plain)

	want := new(big.Int).Add(x, y)
	want.Mod(want, p)
	if got.Cmp(want) != 0 {
		t.Fatalf("p256Add(%x, %x) = %x, want %x", x, y, got, want)
	}
}
// TestP256AddPMinus1 exercises p256Add with the largest field element,
// (p-1) + (p-1).
func TestP256AddPMinus1(t *testing.T) {
	const (
		primeHex = "FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF"
		montHex  = "10000000000000000000000000000000000000000000000000000000000000000"
	)
	p, _ := new(big.Int).SetString(primeHex, 16)
	r, _ := new(big.Int).SetString(montHex, 16)

	largest := new(big.Int).Sub(p, big.NewInt(1))
	p256AddTest(t, largest, largest, p, r)
}
// TestFuzzyP256Add checks p256Add against math/big on random operands for a
// fixed time budget (10ms with -short, 2s otherwise).
func TestFuzzyP256Add(t *testing.T) {
	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)

	budget := 2 * time.Second
	if testing.Short() {
		budget = 10 * time.Millisecond
	}
	timeout := time.NewTimer(budget)
	defer timeout.Stop()

	var scalar1, scalar2 [32]byte
	for {
		select {
		case <-timeout.C:
			return
		default:
		}

		// A failed read would silently re-test stale operands, so fail loudly.
		if _, err := io.ReadFull(rand.Reader, scalar1[:]); err != nil {
			t.Fatalf("reading random scalar: %v", err)
		}
		if _, err := io.ReadFull(rand.Reader, scalar2[:]); err != nil {
			t.Fatalf("reading random scalar: %v", err)
		}

		x := new(big.Int).SetBytes(scalar1[:])
		y := new(big.Int).SetBytes(scalar2[:])
		p256AddTest(t, x, y, p, r)
	}
}

View File

@ -1,4 +1,4 @@
//go:build (amd64 || arm64 || s390x || ppc64le) && !purego
//go:build (amd64 || arm64 || loong64 || s390x || ppc64le) && !purego
package sm2ec