internal/sm2ec: loong64 v1

2025-10-15 15:50:46 +08:00 · 2025-10-15 13:37:10 +08:00 · 2025-10-15 13:37:10 +08:00 · aa2ef453f1
commit aa2ef453f1
parent 31b941908a
6 changed files with 877 additions and 450 deletions
--- a/internal/sm2ec/p256_asm_loong64.s
+++ b/internal/sm2ec/p256_asm_loong64.s
@ -7,9 +7,12 @@
 #include "textflag.h"

 #define ZERO R0
+#define RSP R3
 #define res_ptr R4
 #define x_ptr R5
 #define y_ptr R6
+#define a_ptr x_ptr
+#define b_ptr y_ptr

 #define acc0 R7
 #define acc1 R8
@ -528,13 +531,13 @@ TEXT ·p256Sqr(SB),NOSPLIT,$0
 	ADDV $1, const0, const1
 	
 sqrLoop:
-	SUBV $1, y_ptr
-	CALL	sm2P256SqrInternal<>(SB)
-	MOVV y0, x0
-	MOVV y1, x1
-	MOVV y2, x2
-	MOVV y3, x3
-	BNE y_ptr, sqrLoop
+		SUBV $1, y_ptr
+		CALL	sm2P256SqrInternal<>(SB)
+		MOVV y0, x0
+		MOVV y1, x1
+		MOVV y2, x2
+		MOVV y3, x3
+		BNE y_ptr, sqrLoop

 	MOVV y0, (8*0)(res_ptr)
 	MOVV y1, (8*1)(res_ptr)
@ -1288,11 +1291,147 @@ TEXT ·p256OrdReduce(SB),NOSPLIT,$0
 /* ---------------------------------------*/
 // func p256Select(res *SM2P256Point, table *p256Table, idx, limit int)
+//
+// Table lookup that reads every entry regardless of idx. The table is walked
+// as `limit` consecutive 96-byte entries (three groups of four 64-bit limbs).
+// Entries are numbered from 1; entry k is OR-ed into the accumulators only
+// when k == idx, so idx == 0 (or idx > limit) stores an all-zero result.
+// NOTE(review): limit is assumed to be >= 1; the loop tests the counter only
+// after the first iteration.
 TEXT ·p256Select(SB),NOSPLIT,$0
+	MOVV	limit+24(FP), x_ptr
+	MOVV	idx+16(FP), const0
+	MOVV	table+8(FP), y_ptr
+	MOVV	res+0(FP), res_ptr
+
+	// Clear the twelve result limbs.
+	MOVV    $0, x0
+	MOVV    $0, x1
+	MOVV    $0, x2
+	MOVV    $0, x3
+	MOVV    $0, y0
+	MOVV    $0, y1
+	MOVV    $0, y2
+	MOVV    $0, y3
+	MOVV    $0, t0
+	MOVV    $0, t1
+	MOVV    $0, t2
+	MOVV    $0, t3
+
+	// const1 is the 1-based number of the entry being visited.
+	MOVV	$0, const1
+
+loop_select:
+		ADDV $1, const1, const1
+		// hlp0 == 0 only for the wanted entry; MASKNEZ zeroes its
+		// destination whenever hlp0 != 0.
+		XOR  const1, const0, hlp0
+
+		// First 32 bytes of the entry.
+		MOVV    (8*0)(y_ptr), acc0
+		MOVV    (8*1)(y_ptr), acc1
+		MOVV    (8*2)(y_ptr), acc2
+		MOVV    (8*3)(y_ptr), acc3
+		MASKNEZ hlp0, acc0, acc0
+		MASKNEZ hlp0, acc1, acc1
+		MASKNEZ hlp0, acc2, acc2
+		MASKNEZ hlp0, acc3, acc3
+		OR   acc0, x0, x0
+		OR   acc1, x1, x1
+		OR   acc2, x2, x2
+		OR   acc3, x3, x3
+
+		// Second 32 bytes of the entry.
+		ADDVU $32, y_ptr, y_ptr
+		MOVV    (8*0)(y_ptr), acc0
+		MOVV    (8*1)(y_ptr), acc1
+		MOVV    (8*2)(y_ptr), acc2
+		MOVV    (8*3)(y_ptr), acc3
+		MASKNEZ hlp0, acc0, acc0
+		MASKNEZ hlp0, acc1, acc1
+		MASKNEZ hlp0, acc2, acc2
+		MASKNEZ hlp0, acc3, acc3
+		OR   acc0, y0, y0
+		OR   acc1, y1, y1
+		OR   acc2, y2, y2
+		OR   acc3, y3, y3
+
+		// Third 32 bytes of the entry.
+		ADDVU $32, y_ptr, y_ptr
+		MOVV    (8*0)(y_ptr), acc0
+		MOVV    (8*1)(y_ptr), acc1
+		MOVV    (8*2)(y_ptr), acc2
+		MOVV    (8*3)(y_ptr), acc3
+		MASKNEZ hlp0, acc0, acc0
+		MASKNEZ hlp0, acc1, acc1
+		MASKNEZ hlp0, acc2, acc2
+		MASKNEZ hlp0, acc3, acc3
+		OR   acc0, t0, t0
+		OR   acc1, t1, t1
+		OR   acc2, t2, t2
+		OR   acc3, t3, t3
+
+		// Step past the third group so the next iteration starts at the
+		// following 96-byte entry rather than re-reading this one's tail.
+		ADDVU $32, y_ptr, y_ptr
+
+		BNE const1, x_ptr, loop_select
+
+	MOVV    x0, (8*0)(res_ptr)
+	MOVV    x1, (8*1)(res_ptr)
+	MOVV    x2, (8*2)(res_ptr)
+	MOVV    x3, (8*3)(res_ptr)
+	MOVV    y0, (8*4)(res_ptr)
+	MOVV    y1, (8*5)(res_ptr)
+	MOVV    y2, (8*6)(res_ptr)
+	MOVV    y3, (8*7)(res_ptr)
+	MOVV    t0, (8*8)(res_ptr)
+	MOVV    t1, (8*9)(res_ptr)
+	MOVV    t2, (8*10)(res_ptr)
+	MOVV    t3, (8*11)(res_ptr)
+
 	RET

 /* ---------------------------------------*/
 // func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
+//
+// Table lookup over exactly 32 entries of 64 bytes each (two groups of four
+// 64-bit limbs). Every entry is read; entry k (numbered from 1) is OR-ed into
+// the result only when k == idx, so idx == 0 stores an all-zero result.
 TEXT ·p256SelectAffine(SB),NOSPLIT,$0
+	// Integer loads use MOVV; on loong64 MOVD is the floating-point move.
+	MOVV	idx+16(FP), t0
+	MOVV	table+8(FP), t1
+	MOVV	res+0(FP), res_ptr
+
+	// Clear the eight result limbs.
+	XOR	x0, x0, x0
+	XOR	x1, x1, x1
+	XOR	x2, x2, x2
+	XOR	x3, x3, x3
+	XOR	y0, y0, y0
+	XOR	y1, y1, y1
+	XOR	y2, y2, y2
+	XOR	y3, y3, y3
+
+	// t2 is the 1-based entry number, const0 the fixed entry count.
+	MOVV	$0, t2
+	MOVV	$32, const0
+
+loop_select:
+		ADDV $1, t2, t2
+		// hlp0 == 0 only for the wanted entry; MASKNEZ zeroes its
+		// destination whenever hlp0 != 0.
+		XOR  t2, t0, hlp0
+
+		// First 32 bytes of the entry.
+		MOVV    (8*0)(t1), acc0
+		MOVV    (8*1)(t1), acc1
+		MOVV    (8*2)(t1), acc2
+		MOVV    (8*3)(t1), acc3
+		MASKNEZ hlp0, acc0, acc0
+		MASKNEZ hlp0, acc1, acc1
+		MASKNEZ hlp0, acc2, acc2
+		MASKNEZ hlp0, acc3, acc3
+		OR   acc0, x0, x0
+		OR   acc1, x1, x1
+		OR   acc2, x2, x2
+		OR   acc3, x3, x3
+
+		// Second 32 bytes of the entry.
+		ADDVU $32, t1, t1
+		MOVV    (8*0)(t1), acc0
+		MOVV    (8*1)(t1), acc1
+		MOVV    (8*2)(t1), acc2
+		MOVV    (8*3)(t1), acc3
+		MASKNEZ hlp0, acc0, acc0
+		MASKNEZ hlp0, acc1, acc1
+		MASKNEZ hlp0, acc2, acc2
+		MASKNEZ hlp0, acc3, acc3
+		OR   acc0, y0, y0
+		OR   acc1, y1, y1
+		OR   acc2, y2, y2
+		OR   acc3, y3, y3
+
+		// Step past the second group so the next iteration starts at the
+		// following 64-byte entry.
+		ADDVU $32, t1, t1
+
+		BNE t2, const0, loop_select
+
+	MOVV    x0, (8*0)(res_ptr)
+	MOVV    x1, (8*1)(res_ptr)
+	MOVV    x2, (8*2)(res_ptr)
+	MOVV    x3, (8*3)(res_ptr)
+	MOVV    y0, (8*4)(res_ptr)
+	MOVV    y1, (8*5)(res_ptr)
+	MOVV    y2, (8*6)(res_ptr)
+	MOVV    y3, (8*7)(res_ptr)
 	RET

 /* ---------------------------------------*/
@ -1420,9 +1559,304 @@ TEXT ·p256MulBy2(SB),NOSPLIT,$0
 	MOVV x3, (8*3)(res_ptr)
 	RET

+/* ---------------------------------------*/
+// Operand macros for point coordinates. Each coordinate is 32 bytes, so the
+// three coordinates of a point sit at byte offsets 0, 32 and 64 from its base
+// pointer; `off` selects a limb within the coordinate.
+//   x1in/y1in/z1in  - coordinates of the point at a_ptr
+//   x2in/z2in       - coordinates at offsets 0 and 64 of the point at b_ptr
+//   x3out/y3out/z3out - coordinates of the point at res_ptr
+#define x1in(off) (off)(a_ptr)
+#define y1in(off) (off + 32)(a_ptr)
+#define z1in(off) (off + 64)(a_ptr)
+#define x2in(off) (off)(b_ptr)
+#define z2in(off) (off + 64)(b_ptr)
+#define x3out(off) (off)(res_ptr)
+#define y3out(off) (off + 32)(res_ptr)
+#define z3out(off) (off + 64)(res_ptr)
+// LDx/LDy load the four 64-bit limbs addressed by the operand macro `src`
+// (invoked as src(0), src(8), src(16), src(24)) into x0..x3 / y0..y3.
+// STx/STy store x0..x3 / y0..y3 to the same four locations.
+#define LDx(src) MOVV src(0), x0; MOVV src(8), x1; MOVV src(16), x2; MOVV src(24), x3
+#define LDy(src) MOVV src(0), y0; MOVV src(8), y1; MOVV src(16), y2; MOVV src(24), y3
+#define STx(src) MOVV x0, src(0); MOVV x1, src(8); MOVV x2, src(16); MOVV x3, src(24)
+#define STy(src) MOVV y0, src(0); MOVV y1, src(8); MOVV y2, src(16); MOVV y3, src(24)
+/* ---------------------------------------*/
+// Stack scratch slots used by the point-addition routines. Slot k occupies
+// 32 bytes starting at byte 8 + 32*k above RSP; `off` selects a limb within
+// the slot. Slots 0-7 fit the 264-byte frame of p256PointAddAffineAsm;
+// slots 8-11 additionally require the 392-byte frame of p256PointAddAsm.
+#define y2in(off)  (32*0 + 8 + off)(RSP)
+#define s2(off)    (32*1 + 8 + off)(RSP)
+#define z1sqr(off) (32*2 + 8 + off)(RSP)
+#define h(off)	   (32*3 + 8 + off)(RSP)
+#define r(off)	   (32*4 + 8 + off)(RSP)
+#define hsqr(off)  (32*5 + 8 + off)(RSP)
+#define rsqr(off)  (32*6 + 8 + off)(RSP)
+#define hcub(off)  (32*7 + 8 + off)(RSP)
+
+#define z2sqr(off) (32*8 + 8 + off)(RSP)
+#define s1(off) (32*9 + 8 + off)(RSP)
+#define u1(off) (32*10 + 8 + off)(RSP)
+#define u2(off) (32*11 + 8 + off)(RSP)
+
 /* ---------------------------------------*/
 // func p256PointAddAffineAsm(res, in1 *SM2P256Point, in2 *p256AffinePoint, sign, sel, zero int)
+//
+// Mixed addition of the Jacobian point in1 and the affine point in2.
+//   sign != 0: y2 is replaced by p - y2 before the addition.
+//   sel  == 0: the stored result is in1 (x1, y1, z1).
+//   zero == 0: the stored result is (x2, y2 after the optional negation,
+//              (1, const0, 0, const1)); this override is applied after sel.
+// Helper conventions, as indicated by the inline comments of this file (the
+// helpers themselves are defined outside this hunk - confirm there):
+//   sm2P256SqrInternal: y = x^2      sm2P256MulInternal: y = x * y
+//   sm2P256Subinternal: x = y - x    p256MulBy2Inline:   x = 2 * y
+// NOTE(review): the helpers are assumed to preserve a_ptr, b_ptr, hlp0, hlp1,
+// res_ptr, const0 and const1 across calls.
 TEXT ·p256PointAddAffineAsm(SB),0,$264-48
+	MOVV	in1+8(FP), a_ptr
+	MOVV	in2+16(FP), b_ptr
+	MOVV	sign+24(FP), hlp0
+	MOVV	sel+32(FP), hlp1
+	// res_ptr carries the `zero` flag here; res itself is reloaded from the
+	// frame each time a coordinate is stored.
+	MOVV	zero+40(FP), res_ptr
+
+	MOVV p256one<>+0x08(SB), const0
+	ADDV $1, const0, const1
+
+	// Negate y2in based on sign.
+	// y2 is the second 32-byte coordinate of the affine point (offset 32).
+	MOVV (8*4)(b_ptr), y0
+	MOVV (8*5)(b_ptr), y1
+	MOVV (8*6)(b_ptr), y2
+	MOVV (8*7)(b_ptr), y3
+	// (acc3, acc2, acc1, acc0) = - (y3, y2, y1, y0); t3 ends up as the final borrow
+	SGTU y0, ZERO, t3
+	SUBV y0, ZERO, acc0
+	SGTU y1, ZERO, t4
+	SUBV y1, ZERO, acc1
+	SGTU t3, acc1, t1
+	SUBV t3, acc1, acc1
+	OR t4, t1, t3
+	SGTU y2, ZERO, t4
+	SUBV y2, ZERO, acc2
+	SGTU t3, acc2, t1
+	SUBV t3, acc2, acc2
+	OR t4, t1, t3
+	SGTU y3, ZERO, t4
+	SUBV y3, ZERO, acc3
+	SGTU t3, acc3, t1
+	SUBV t3, acc3, acc3
+	OR t4, t1, t3
+
+	// If the negation borrowed (y != 0), subtract (1, const0, 0, const1),
+	// which together with the dropped 2^256 amounts to adding p.
+	MOVV $1, acc4
+	MASKEQZ t3, acc4, acc4
+	MASKEQZ t3, const0, acc5
+	MASKEQZ t3, const1, acc7
+
+	SGTU acc4, acc0, t3
+	SUBV acc4, acc0, acc0
+	ADDV t3, acc5, acc5       // no carry
+	SGTU acc5, acc1, t3
+	SUBV acc5, acc1, acc1
+	SGTU t3, acc2, t1
+	SUBV t3, acc2, acc2
+	ADDV t1, acc7, t3       // no carry
+	SUBV t3, acc3, acc3
+	// If sign is 0, keep the original value
+	MASKEQZ hlp0, acc0, acc0
+	MASKNEZ hlp0, y0, y0
+	MASKEQZ hlp0, acc1, acc1
+	MASKNEZ hlp0, y1, y1
+	MASKEQZ hlp0, acc2, acc2
+	MASKNEZ hlp0, y2, y2
+	MASKEQZ hlp0, acc3, acc3
+	MASKNEZ hlp0, y3, y3
+	OR acc0, y0
+	OR acc1, y1
+	OR acc2, y2
+	OR acc3, y3
+	// Store the (possibly negated) y2 in its stack slot
+	STy(y2in)
+
+	// Begin point add
+	LDx(z1in)
+	CALL	sm2P256SqrInternal<>(SB)    // z1ˆ2
+	STy(z1sqr)
+
+	LDx(x2in)
+	CALL	sm2P256MulInternal<>(SB)    // u2 = x2 * z1ˆ2
+
+	LDx(x1in)
+	CALL	sm2P256Subinternal<>(SB)    // h = u2 - u1
+	STx(h)
+
+	LDy(z1in)
+	CALL	sm2P256MulInternal<>(SB)    // z3 = h * z1
+
+	// iff select == 0, z3 = z1
+	MOVV (8*8)(a_ptr), acc0
+	MOVV (8*9)(a_ptr), acc1
+	MOVV (8*10)(a_ptr), acc2
+	MOVV (8*11)(a_ptr), acc3
+	MASKEQZ hlp1, y0, y0
+	MASKNEZ hlp1, acc0, acc0
+	MASKEQZ hlp1, y1, y1
+	MASKNEZ hlp1, acc1, acc1
+	MASKEQZ hlp1, y2, y2
+	MASKNEZ hlp1, acc2, acc2
+	MASKEQZ hlp1, y3, y3
+	MASKNEZ hlp1, acc3, acc3
+	OR acc0, y0
+	OR acc1, y1
+	OR acc2, y2
+	OR acc3, y3
+	// iff zero == 0, z3 = (1, const0, 0, const1)
+	MOVV $1, acc0
+	MOVV const0, acc1
+	MOVV $0, acc2
+	MOVV const1, acc3
+	MASKEQZ res_ptr, y0, y0
+	MASKNEZ res_ptr, acc0, acc0
+	MASKEQZ res_ptr, y1, y1
+	MASKNEZ res_ptr, acc1, acc1
+	MASKEQZ res_ptr, y2, y2
+	MASKNEZ res_ptr, acc2, acc2
+	MASKEQZ res_ptr, y3, y3
+	MASKNEZ res_ptr, acc3, acc3
+	OR acc0, y0
+	OR acc1, y1
+	OR acc2, y2
+	OR acc3, y3
+	// Read z1 before z3 is stored
+	LDx(z1in)
+	// store z3
+	MOVV res+0(FP), t0
+	MOVV y0, (8*8)(t0)
+	MOVV y1, (8*9)(t0)
+	MOVV y2, (8*10)(t0)
+	MOVV y3, (8*11)(t0)
+
+	LDy(z1sqr)
+	CALL	sm2P256MulInternal<>(SB)    // z1 ^ 3
+
+	LDx(y2in)
+	CALL	sm2P256MulInternal<>(SB)    // s2 = y2 * z1ˆ3
+	STy(s2)
+
+	LDx(y1in)
+	CALL	sm2P256Subinternal<>(SB)    // r = s2 - s1
+	STx(r)
+
+	CALL	sm2P256SqrInternal<>(SB)    // rsqr = rˆ2
+	STy(rsqr)
+
+	LDx(h)
+	CALL	sm2P256SqrInternal<>(SB)    // hsqr = hˆ2
+	STy(hsqr)
+
+	CALL	sm2P256MulInternal<>(SB)    // hcub = hˆ3
+	STy(hcub)
+
+	LDx(y1in)
+	CALL	sm2P256MulInternal<>(SB)    // y1 * hˆ3
+	STy(s2)
+
+	MOVV hsqr(0*8), x0
+	MOVV hsqr(1*8), x1
+	MOVV hsqr(2*8), x2
+	MOVV hsqr(3*8), x3
+	// u1 is x1 (in2 is affine); y still holds y1 * hˆ3, so load it explicitly.
+	MOVV (8*0)(a_ptr), y0
+	MOVV (8*1)(a_ptr), y1
+	MOVV (8*2)(a_ptr), y2
+	MOVV (8*3)(a_ptr), y3
+	CALL	sm2P256MulInternal<>(SB)    // hsqr * u1
+	// The h slot is reused for u1 * hˆ2
+	MOVV y0, h(0*8)
+	MOVV y1, h(1*8)
+	MOVV y2, h(2*8)
+	MOVV y3, h(3*8)
+
+	p256MulBy2Inline               // u1 * hˆ2 * 2, inline
+
+	LDy(rsqr)
+	CALL	sm2P256Subinternal<>(SB)    // rˆ2 - u1 * hˆ2 * 2
+
+	MOVV x0, y0
+	MOVV x1, y1
+	MOVV x2, y2
+	MOVV x3, y3
+	// hcub is the subtrahend and goes in x; y keeps the value just moved there.
+	LDx(hcub)
+	CALL	sm2P256Subinternal<>(SB)    // x3 = rˆ2 - u1 * hˆ2 * 2 - hˆ3
+
+	MOVV (8*0)(a_ptr), acc0
+	MOVV (8*1)(a_ptr), acc1
+	MOVV (8*2)(a_ptr), acc2
+	MOVV (8*3)(a_ptr), acc3
+	// iff select == 0, x3 = x1
+	MASKEQZ hlp1, x0, x0
+	MASKNEZ hlp1, acc0, acc0
+	MASKEQZ hlp1, x1, x1
+	MASKNEZ hlp1, acc1, acc1
+	MASKEQZ hlp1, x2, x2
+	MASKNEZ hlp1, acc2, acc2
+	MASKEQZ hlp1, x3, x3
+	MASKNEZ hlp1, acc3, acc3
+	OR acc0, x0
+	OR acc1, x1
+	OR acc2, x2
+	OR acc3, x3
+	MOVV (8*0)(b_ptr), acc0
+	MOVV (8*1)(b_ptr), acc1
+	MOVV (8*2)(b_ptr), acc2
+	MOVV (8*3)(b_ptr), acc3
+	// iff zero == 0, x3 = x2
+	MASKEQZ res_ptr, x0, x0
+	MASKNEZ res_ptr, acc0, acc0
+	MASKEQZ res_ptr, x1, x1
+	MASKNEZ res_ptr, acc1, acc1
+	MASKEQZ res_ptr, x2, x2
+	MASKNEZ res_ptr, acc2, acc2
+	MASKEQZ res_ptr, x3, x3
+	MASKNEZ res_ptr, acc3, acc3
+	OR acc0, x0
+	OR acc1, x1
+	OR acc2, x2
+	OR acc3, x3
+	// store x3
+	MOVV res+0(FP), t0
+	MOVV x0, (8*0)(t0)
+	MOVV x1, (8*1)(t0)
+	MOVV x2, (8*2)(t0)
+	MOVV x3, (8*3)(t0)
+
+	// x holds the selected x3; when sel or zero is 0 the y3 computed from it
+	// below is discarded by the final selects.
+	MOVV h(0*8), y0
+	MOVV h(1*8), y1
+	MOVV h(2*8), y2
+	MOVV h(3*8), y3
+	CALL	sm2P256Subinternal<>(SB)    // u1 * hˆ2 - x3
+
+	MOVV r(0*8), y0
+	MOVV r(1*8), y1
+	MOVV r(2*8), y2
+	MOVV r(3*8), y3
+	CALL	sm2P256MulInternal<>(SB)    // r * (u1 * hˆ2 - x3)
+
+	MOVV s2(0*8), x0
+	MOVV s2(1*8), x1
+	MOVV s2(2*8), x2
+	MOVV s2(3*8), x3
+	CALL	sm2P256Subinternal<>(SB)    // y3 = r * (u1 * hˆ2 - x3) - y1 * hˆ3
+
+	MOVV (8*4)(a_ptr), acc0
+	MOVV (8*5)(a_ptr), acc1
+	MOVV (8*6)(a_ptr), acc2
+	MOVV (8*7)(a_ptr), acc3
+	// iff select == 0, y3 = y1
+	MASKEQZ hlp1, x0, x0
+	MASKNEZ hlp1, acc0, acc0
+	MASKEQZ hlp1, x1, x1
+	MASKNEZ hlp1, acc1, acc1
+	MASKEQZ hlp1, x2, x2
+	MASKNEZ hlp1, acc2, acc2
+	MASKEQZ hlp1, x3, x3
+	MASKNEZ hlp1, acc3, acc3
+	OR acc0, x0
+	OR acc1, x1
+	OR acc2, x2
+	OR acc3, x3
+	MOVV y2in(0*8), acc0
+	MOVV y2in(1*8), acc1
+	MOVV y2in(2*8), acc2
+	MOVV y2in(3*8), acc3
+	// iff zero == 0, y3 = y2
+	MASKEQZ res_ptr, x0, x0
+	MASKNEZ res_ptr, acc0, acc0
+	MASKEQZ res_ptr, x1, x1
+	MASKNEZ res_ptr, acc1, acc1
+	MASKEQZ res_ptr, x2, x2
+	MASKNEZ res_ptr, acc2, acc2
+	MASKEQZ res_ptr, x3, x3
+	MASKNEZ res_ptr, acc3, acc3
+	OR acc0, x0
+	OR acc1, x1
+	OR acc2, x2
+	OR acc3, x3
+	// store y3
+	MOVV res+0(FP), t0
+	MOVV x0, (8*4)(t0)
+	MOVV x1, (8*5)(t0)
+	MOVV x2, (8*6)(t0)
+	MOVV x3, (8*7)(t0)
+
 	RET

 // (x3, x2, x1, x0) = (x3, x2, x1, x0) + (y3, y2, y1, y0)
@ -1497,17 +1931,450 @@ TEXT ·p256Add(SB),NOSPLIT,$0
 	MOVV x3, (8*3)(res_ptr)
 	RET

+// (y3, y2, y1, y0) = (y3, y2, y1, y0) / 2 mod p
+//
+// If y is odd, p is added first so the sum is even: adding p is done by
+// subtracting (1, const0, 0, const1) and treating "no final borrow" as the
+// carry into bit 256. The 257-bit value is then shifted right by one bit.
+// Requires const0/const1 to be set up by the caller; clobbers t0, t1, t2,
+// acc1, acc2 and acc3.
+//   t0   = low bit of y (1 when p has to be added)
+//   acc* = the subtrahend limbs, zeroed when y is even
+//   t2   = carry out of y + p, i.e. y3 >= acc3 at the top limb; it becomes
+//          bit 63 of the result's top limb
+#define p256DivideBy2 \
+	MOVV $1, acc1;  \
+	AND acc1, y0, t0;  \
+	MASKEQZ t0, acc1, acc1;  \
+	MASKEQZ t0, const0, acc2;  \
+	MASKEQZ t0, const1, acc3;  \
+	SGTU acc1, y0, t1;  \
+	SUBV acc1, y0, y0;  \
+	ADDV t1, acc2, acc2;  \
+	SRLV $1, y0, y0;  \
+	SGTU acc2, y1, t1;  \
+	SUBV acc2, y1, y1;  \
+	SGTU t1, y2, t2;  \
+	SUBV t1, y2, y2;  \
+	BSTRINSV $63, y1, $63, y0;  \
+	SRLV $1, y1, y1;  \
+	ADDV t2, acc3, acc3;  \
+	BSTRINSV $63, y2, $63, y1;  \
+	SRLV $1, y2, y2;  \
+	SUBV acc3, y3, t1;  \
+	SGTU acc3, y3, t2;  \
+	XOR $1, t2, t2;  \
+	BSTRINSV $63, t1, $63, y2;  \
+	SRLV $1, t1, y3;  \
+	MASKEQZ t0, t2, t2;  \
+	BSTRINSV $63, t2, $63, y3
+
 /* ---------------------------------------*/
+// func p256DivBy2(res, in *p256Element)
+//
+// Loads the four limbs of in into y0..y3, applies the p256DivideBy2 macro
+// (halving, per that macro's header comment) and stores y0..y3 to res.
+TEXT ·p256DivBy2(SB),NOSPLIT,$0
+	MOVV res+0(FP), res_ptr
+	MOVV in+8(FP), x_ptr
+	MOVV (8*0)(x_ptr), y0
+	MOVV (8*1)(x_ptr), y1
+	MOVV (8*2)(x_ptr), y2
+	MOVV (8*3)(x_ptr), y3
+	// const0/const1 are the constants p256DivideBy2 expects.
+	MOVV p256one<>+0x08(SB), const0
+	ADDV $1, const0, const1
+	p256DivideBy2
+	MOVV y0, (8*0)(res_ptr)
+	MOVV y1, (8*1)(res_ptr)
+	MOVV y2, (8*2)(res_ptr)
+	MOVV y3, (8*3)(res_ptr)
+	RET
+
+// Stack scratch slots for the point-doubling routines: four 32-byte slots
+// starting at byte 8 above RSP (fits their 136-byte frames); `off` selects a
+// limb within the slot.
+#define s(off)	(32*0 + 8 + off)(RSP)
+#define m(off)	(32*1 + 8 + off)(RSP)
+#define zsqr(off) (32*2 + 8 + off)(RSP)
+#define tmp(off)  (32*3 + 8 + off)(RSP)
+
 //func p256PointDoubleAsm(res, in *SM2P256Point)
+//
+// Doubles the Jacobian point at in and stores the result at res.
+// The step comments below assume the helper conventions suggested by the
+// inline comments elsewhere in this file (helpers are defined outside this
+// hunk - confirm there):
+//   sm2P256SqrInternal: y = x^2      sm2P256MulInternal: y = x * y
+//   sm2P256Subinternal: x = y - x    p256AddInline:      x = x + y
+//   p256MulBy2Inline:   x = 2 * y
 TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-16
+	MOVV	res+0(FP), res_ptr
+	MOVV	in+8(FP), a_ptr
+
+	MOVV p256one<>+0x08(SB), const0
+	ADDV $1, const0, const1
+
+	// Begin point double
+	MOVV (8*8)(a_ptr), x0 
+	MOVV (8*9)(a_ptr), x1
+	MOVV (8*10)(a_ptr), x2
+	MOVV (8*11)(a_ptr), x3
+	CALL	sm2P256SqrInternal<>(SB)    // z1ˆ2
+	MOVV y0, zsqr(0*8)                  // store z^2
+	MOVV y1, zsqr(1*8)
+	MOVV y2, zsqr(2*8)
+	MOVV y3, zsqr(3*8)
+
+	MOVV (8*0)(a_ptr), x0               // load x
+	MOVV (8*1)(a_ptr), x1
+	MOVV (8*2)(a_ptr), x2
+	MOVV (8*3)(a_ptr), x3
+	// m slot <- x + z^2 (y still holds z^2)
+	p256AddInline
+	STx(m)
+
+	// z3 <- 2 * y * z, written to res before x and y of in are re-read
+	LDx(z1in)
+	LDy(y1in)
+	CALL	sm2P256MulInternal<>(SB)
+	p256MulBy2Inline
+	STx(z3out)
+
+	// (x - z^2) * (x + z^2)
+	LDy(x1in)
+	LDx(zsqr)
+	CALL	sm2P256Subinternal<>(SB)
+	LDy(m)
+	CALL	sm2P256MulInternal<>(SB)
+
+	// Multiply by 3
+	p256MulBy2Inline
+	p256AddInline
+	STx(m)
+
+	// s slot <- (2y)^2, then y <- (2y)^4
+	LDy(y1in)
+	p256MulBy2Inline
+	CALL	sm2P256SqrInternal<>(SB)
+	STy(s)
+	MOVV	y0, x0
+	MOVV	y1, x1
+	MOVV	y2, x2
+	MOVV	y3, x3
+	CALL	sm2P256SqrInternal<>(SB)
+
+	// Divide by 2
+	p256DivideBy2
+
+	// y3out temporarily holds (2y)^4 / 2
+	STy(y3out)
+
+	// s slot <- x * (2y)^2, tmp slot <- twice that
+	LDx(x1in)
+	LDy(s)
+	CALL	sm2P256MulInternal<>(SB)
+	STy(s)
+	p256MulBy2Inline
+	STx(tmp)
+
+	// x3 <- m^2 - tmp
+	LDx(m)
+	CALL	sm2P256SqrInternal<>(SB)
+	LDx(tmp)
+	CALL	sm2P256Subinternal<>(SB)
+
+	STx(x3out)
+
+	// y3 <- m * (s - x3) - (value parked in y3out)
+	LDy(s)
+	CALL	sm2P256Subinternal<>(SB)
+
+	LDy(m)
+	CALL	sm2P256MulInternal<>(SB)
+
+	LDx(y3out)
+	CALL	sm2P256Subinternal<>(SB)
+	STx(y3out)
+
 	RET

+// p256PointDoubleRound performs one more doubling in place on the point at
+// res_ptr: it is the same instruction sequence as the body of
+// p256PointDoubleAsm, except that x, y and z are read through
+// x3out/y3out/z3out instead of from a_ptr. It uses the zsqr, m, s and tmp
+// stack slots and expects const0/const1 to be set up by the caller.
+#define p256PointDoubleRound() \
+	LDx(z3out)                       \ // load z
+	CALL	sm2P256SqrInternal<>(SB) \
+	MOVV y0, zsqr(0*8)          \ // store z^2
+	MOVV y1, zsqr(1*8)          \
+	MOVV y2, zsqr(2*8)          \
+	MOVV y3, zsqr(3*8)          \
+	\
+	LDx(x3out)                       \// load x
+	p256AddInline                    \
+	STx(m)                           \
+	\
+	LDx(z3out)                       \ // load z
+	LDy(y3out)                       \ // load y
+	CALL	sm2P256MulInternal<>(SB) \
+	p256MulBy2Inline                 \
+	STx(z3out)                       \ // store result z
+	\
+	LDy(x3out)                       \ // load x
+	LDx(zsqr)                        \
+	CALL	sm2P256Subinternal<>(SB) \
+	LDy(m)                           \
+	CALL	sm2P256MulInternal<>(SB) \
+	\
+	\// Multiply by 3
+	p256MulBy2Inline                 \
+	p256AddInline                    \
+	STx(m)                           \
+	\
+	LDy(y3out)                       \  // load y
+	p256MulBy2Inline                 \
+	CALL	sm2P256SqrInternal<>(SB) \
+	STy(s)                           \
+	MOVV	y0, x0                   \
+	MOVV	y1, x1                   \
+	MOVV	y2, x2                   \
+	MOVV	y3, x3                   \
+	CALL	sm2P256SqrInternal<>(SB) \
+	\
+	\// Divide by 2
+	p256DivideBy2                    \
+	STy(y3out)                       \                
+	\
+	LDx(x3out)                       \  // load x
+	LDy(s)                           \
+	CALL	sm2P256MulInternal<>(SB) \
+	STy(s)                           \
+	p256MulBy2Inline                 \
+	STx(tmp)                         \
+	\
+	LDx(m)                           \
+	CALL	sm2P256SqrInternal<>(SB) \
+	LDx(tmp)                         \
+	CALL	sm2P256Subinternal<>(SB) \
+	\
+	STx(x3out)                       \
+	\
+	LDy(s)                           \
+	CALL	sm2P256Subinternal<>(SB) \
+	\
+	LDy(m)                           \
+	CALL	sm2P256MulInternal<>(SB) \
+	\
+	LDx(y3out)                       \
+	CALL	sm2P256Subinternal<>(SB) \
+	STx(y3out)                       \
+
+
 /* ---------------------------------------*/
 //func p256PointDouble6TimesAsm(res, in *SM2P256Point)
+//
+// Doubles the Jacobian point at in six times and stores the result at res.
+// The first doubling reads from in and writes to res (same sequence as
+// p256PointDoubleAsm); the remaining five are p256PointDoubleRound
+// invocations that work in place on res.
 TEXT ·p256PointDouble6TimesAsm(SB),NOSPLIT,$136-16
+	MOVV	res+0(FP), res_ptr
+	MOVV	in+8(FP), a_ptr
+
+	MOVV p256one<>+0x08(SB), const0
+	ADDV $1, const0, const1
+
+	// Begin point double
+	MOVV (8*8)(a_ptr), x0 
+	MOVV (8*9)(a_ptr), x1
+	MOVV (8*10)(a_ptr), x2
+	MOVV (8*11)(a_ptr), x3
+	CALL	sm2P256SqrInternal<>(SB)    // z1ˆ2
+	MOVV y0, zsqr(0*8)                  // store z^2
+	MOVV y1, zsqr(1*8)
+	MOVV y2, zsqr(2*8)
+	MOVV y3, zsqr(3*8)
+
+	MOVV (8*0)(a_ptr), x0               // load x
+	MOVV (8*1)(a_ptr), x1
+	MOVV (8*2)(a_ptr), x2
+	MOVV (8*3)(a_ptr), x3
+	p256AddInline
+	STx(m)
+
+	// z of the result is stored to res here
+	LDx(z1in)
+	LDy(y1in)
+	CALL	sm2P256MulInternal<>(SB)
+	p256MulBy2Inline
+	STx(z3out)
+
+	LDy(x1in)
+	LDx(zsqr)
+	CALL	sm2P256Subinternal<>(SB)
+	LDy(m)
+	CALL	sm2P256MulInternal<>(SB)
+
+	// Multiply by 3
+	p256MulBy2Inline
+	p256AddInline
+	STx(m)
+
+	LDy(y1in)
+	p256MulBy2Inline
+	CALL	sm2P256SqrInternal<>(SB)
+	STy(s)
+	MOVV	y0, x0
+	MOVV	y1, x1
+	MOVV	y2, x2
+	MOVV	y3, x3
+	CALL	sm2P256SqrInternal<>(SB)
+
+	// Divide by 2
+	p256DivideBy2
+
+	// y3out is used as a temporary until the final store below
+	STy(y3out)
+
+	LDx(x1in)
+	LDy(s)
+	CALL	sm2P256MulInternal<>(SB)
+	STy(s)
+	p256MulBy2Inline
+	STx(tmp)
+
+	LDx(m)
+	CALL	sm2P256SqrInternal<>(SB)
+	LDx(tmp)
+	CALL	sm2P256Subinternal<>(SB)
+
+	// x of the result
+	STx(x3out)
+
+	LDy(s)
+	CALL	sm2P256Subinternal<>(SB)
+
+	LDy(m)
+	CALL	sm2P256MulInternal<>(SB)
+
+	// y of the result
+	LDx(y3out)
+	CALL	sm2P256Subinternal<>(SB)
+	STx(y3out)
+
+	// Begin point double rounds 2 - 6
+	p256PointDoubleRound()
+	p256PointDoubleRound()
+	p256PointDoubleRound()
+	p256PointDoubleRound()
+	p256PointDoubleRound()
+
 	RET

 /* ---------------------------------------*/
+// Rebind operand macros for p256PointAddAsm: y2in now addresses the second
+// coordinate of the point at b_ptr instead of a stack slot, and the output
+// macros address the point at b_ptr instead of res_ptr.
+#undef y2in
+#undef x3out
+#undef y3out
+#undef z3out
+#define y2in(off) (off + 32)(b_ptr)
+#define x3out(off) (off)(b_ptr)
+#define y3out(off) (off + 32)(b_ptr)
+#define z3out(off) (off + 64)(b_ptr)
 // func p256PointAddAsm(res, in1, in2 *SM2P256Point) int
 TEXT ·p256PointAddAsm(SB),0,$392-32
+	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
+	// Move input to stack in order to free registers
+	MOVV	in1+8(FP), a_ptr
+	MOVV	in2+16(FP), b_ptr
+
+	MOVV p256one<>+0x08(SB), const0
+	ADDV $1, const0, const1
+
+	// Begin point add
+	LDx(z2in)
+	CALL	sm2P256SqrInternal<>(SB)    // z2^2
+	STy(z2sqr)
+
+	CALL	sm2P256MulInternal<>(SB)    // z2^3
+
+	LDx(y1in)
+	CALL	sm2P256MulInternal<>(SB)    // s1 = z2ˆ3*y1
+	STy(s1)
+
+	LDx(z1in)
+	CALL	sm2P256SqrInternal<>(SB)    // z1^2
+	STy(z1sqr)
+
+	CALL	sm2P256MulInternal<>(SB)    // z1^3
+
+	LDx(y2in)
+	CALL	sm2P256MulInternal<>(SB)    // s2 = z1ˆ3*y2
+
+	LDx(s1)
+	CALL	sm2P256Subinternal<>(SB)    // r = s2 - s1
+	STx(r)
+
+	// Check if zero mod p256
+	OR x0, x1, acc0
+	OR x2, x3, acc1
+	OR acc0, acc1, acc1
+	SGTU acc1, ZERO, hlp0
+
+	MOVV $-1, acc0
+	MOVV p256p<>+0x08(SB), acc1
+	MOVV p256p<>+0x18(SB), acc3
+
+	XOR acc0, x0, acc4
+	XOR acc1, x1, acc5
+	XOR acc0, x2, acc6
+	XOR acc3, x3, acc7
+	OR acc4, acc5, acc4
+	OR acc6, acc7, acc7
+	OR acc4, acc7, acc7
+	SGTU acc7, ZERO, res_ptr
+	OR hlp0, res_ptr, res_ptr
+
+	LDx(z2sqr)
+	LDy(x1in)
+	CALL	sm2P256MulInternal<>(SB)    // u1 = x1 * z2ˆ2
+	STy(u1)
+
+	LDx(z1sqr)
+	LDy(x2in)
+	CALL	sm2P256MulInternal<>(SB)    // u2 = x2 * z1ˆ2
+	STy(u2)
+
+	LDx(u1)
+	CALL	sm2P256Subinternal<>(SB)    // h = u2 - u1
+	STx(h)
+
+	// Check if zero mod p256
+	OR x0, x1, acc0
+	OR x2, x3, acc1
+	OR acc0, acc1, acc1
+	SGTU acc1, ZERO, hlp0
+
+	MOVV $-1, acc0
+	MOVV p256p<>+0x08(SB), acc1
+	MOVV p256p<>+0x18(SB), acc3
+
+	XOR acc0, x0, acc4
+	XOR acc1, x1, acc5
+	XOR acc0, x2, acc6
+	XOR acc3, x3, acc7
+	OR acc4, acc5, acc4
+	OR acc6, acc7, acc7
+	OR acc4, acc7, acc7
+	SGTU acc7, ZERO, t0
+	OR hlp0, t0, hlp0
+
+	AND hlp0, res_ptr, res_ptr
+
+	LDx(r)
+	CALL	sm2P256SqrInternal<>(SB)    // rsqr = rˆ2
+	STy(rsqr)
+
+	LDx(h)
+	CALL	sm2P256SqrInternal<>(SB)    // hsqr = hˆ2
+	STy(hsqr)
+
+	LDx(h)
+	CALL	sm2P256MulInternal<>(SB)    // hcub = hˆ3
+	STy(hcub)
+
+	LDx(s1)
+	CALL	sm2P256MulInternal<>(SB)
+	STy(s2)
+
+	LDx(z1in)
+	LDy(z2in)
+	CALL	sm2P256MulInternal<>(SB)    // z1 * z2
+	LDx(h)
+	CALL	sm2P256MulInternal<>(SB)    // z1 * z2 * h
+	MOVV	res+0(FP), b_ptr
+	STy(z3out)
+
+	LDx(hsqr)
+	LDy(u1)
+	CALL	sm2P256MulInternal<>(SB)    // hˆ2 * u1
+	STy(u2)
+
+	p256MulBy2Inline               // u1 * hˆ2 * 2, inline
+	LDy(rsqr)
+	CALL	sm2P256Subinternal<>(SB)    // rˆ2 - u1 * hˆ2 * 2
+
+	MOVV	x0, y0
+	MOVV	x1, y1
+	MOVV	x2, y2
+	MOVV	x3, y3
+	LDx(hcub)
+	CALL	sm2P256Subinternal<>(SB)
+	STx(x3out)
+
+	LDy(u2)
+	CALL	sm2P256Subinternal<>(SB)
+
+	LDy(r)
+	CALL	sm2P256MulInternal<>(SB)
+
+	LDx(s2)
+	CALL	sm2P256Subinternal<>(SB)
+	STx(y3out)
+
+	MOVV	res_ptr, ret+24(FP)
+
 	RET
--- a/internal/sm2ec/sm2p256.go
+++ b/internal/sm2ec/sm2p256.go
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-//go:build purego || !(amd64 || arm64 || s390x || ppc64le)
+//go:build purego || !(amd64 || arm64 || loong64 || s390x || ppc64le)

 package sm2ec

--- a/internal/sm2ec/sm2p256_asm.go
+++ b/internal/sm2ec/sm2p256_asm.go
@ -7,7 +7,7 @@
 //                          256-bit primes"
 // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
 // https://eprint.iacr.org/2013/816.pdf
-//go:build (amd64 || arm64 || s390x || ppc64le) && !purego
+//go:build (amd64 || arm64 || loong64 || s390x || ppc64le) && !purego

 package sm2ec

--- a/internal/sm2ec/sm2p256_asm_loong64.go
+++ b/internal/sm2ec/sm2p256_asm_loong64.go
@ -1,64 +0,0 @@
-package sm2ec
-
-import (
-	"github.com/emmansun/gmsm/internal/deps/cpu"
-)
-
-// p256Element is a P-256 base field element in [0, P-1] in the Montgomery
-// domain (with R 2²⁵⁶) as four limbs in little-endian order value.
-type p256Element [4]uint64
-
-type SM2P256Point1 struct {
-	// (X:Y:Z) are Jacobian coordinates where x = X/Z² and y = Y/Z³. The point
-	// at infinity can be represented by any set of coordinates with Z = 0.
-	x, y, z p256Element
-}
-
-var supportLSX = cpu.Loong64.HasLSX
-var supportLASX = cpu.Loong64.HasLASX
-
-//go:noescape
-func p256BigToLittle(res *p256Element, in *[32]byte)
-
-//go:noescape
-func p256LittleToBig(res *[32]byte, in *p256Element)
-
-// If cond is 0, sets res = b, otherwise sets res = a.
-//
-//go:noescape
-func p256MovCond(res, a, b *SM2P256Point1, cond int)
-
-// If cond is not 0, sets val = -val mod p.
-//
-//go:noescape
-func p256NegCond(val *p256Element, cond int)
-
-// Montgomery multiplication. Sets res = in1 * in2 * R⁻¹ mod p.
-//
-//go:noescape
-func p256Mul(res, in1, in2 *p256Element)
-
-// Montgomery square, repeated n times (n >= 1).
-//
-//go:noescape
-func p256Sqr(res, in *p256Element, n int)
-
-// Montgomery multiplication by R⁻¹, or 1 outside the domain.
-// Sets res = in * R⁻¹, bringing res out of the Montgomery domain.
-//
-//go:noescape
-func p256FromMont(res, in *p256Element)
-
-// p256OrdReduce ensures s is in the range [0, ord(G)-1].
-//
-//go:noescape
-func p256OrdReduce(s *p256OrdElement)
-
-//go:noescape
-func p256Add(res, in1, in2 *p256Element)
-
-//go:noescape
-func p256Sub(res, in1, in2 *p256Element)
-
-//go:noescape
-func p256MulBy2(res, in *p256Element)
--- a/internal/sm2ec/sm2p256_asm_loong64_test.go
+++ b/internal/sm2ec/sm2p256_asm_loong64_test.go
@ -1,376 +0,0 @@
-//go:build loong64 && !purego
-
-package sm2ec
-
-import (
-	"bytes"
-	"crypto/rand"
-	"encoding/binary"
-	"fmt"
-	"io"
-	"math/big"
-	"reflect"
-	"testing"
-	"time"
-)
-
-func TestP256BigToLittle(t *testing.T) {
-	// 构造一个已知的 32 字节大端输入
-	var in [32]byte
-	for i := 0; i < 32; i++ {
-		in[i] = byte(i + 1)
-	}
-	var out p256Element
-
-	p256BigToLittle(&out, &in)
-
-	// 检查每个 limb 是否为小端解包
-	for i := 0; i < 4; i++ {
-		expected := binary.BigEndian.Uint64(in[i*8 : (i+1)*8])
-		k := 3 - i // 逆序存储
-		if out[k] != expected {
-			t.Errorf("limb %d: got 0x%x, want 0x%x", k, out[k], expected)
-		}
-	}
-
-	// 逆操作测试
-	var back [32]byte
-	p256LittleToBig(&back, &out)
-	if !bytes.Equal(in[:], back[:]) {
-		t.Errorf("p256LittleToBig(p256BigToLittle(...)) mismatch\nin:   %x\nback: %x", in, back)
-	}
-}
-
-func TestP256NegCond(t *testing.T) {
-	var tests = []struct {
-		input    p256Element
-		cond     int
-		expected p256Element
-	}{
-		{
-			input:    p256Element{1, 0, 0, 0},
-			cond:     1,
-			expected: p256Element{0xfffffffffffffffe, 0xffffffff00000000, 0xffffffffffffffff, 0xfffffffeffffffff},
-		},
-		{
-			input:    p256Element{1, 0, 0, 0},
-			cond:     0,
-			expected: p256Element{1, 0, 0, 0},
-		},
-		{
-			input:    p256Element{0x1, 0xffffffff00000001, 0xfffffffffffffffe, 0xfffffffeffffffff},
-			cond:     1,
-			expected: p256Element{0xfffffffffffffffe, 0xffffffffffffffff, 0, 0},
-		},
-	}
-
-	for i, test := range tests {
-		var result p256Element
-		copy(result[:], test.input[:])
-		p256NegCond(&result, test.cond)
-		if result != test.expected {
-			t.Errorf("test %d: got %x, want %x", i, result, test.expected)
-		}
-	}
-}
-
-func newPoint(x, y, z uint64) *SM2P256Point1 {
-	return &SM2P256Point1{
-		x: p256Element{x, x + 1, x + 2, x + 3},
-		y: p256Element{y, y + 1, y + 2, y + 3},
-		z: p256Element{z, z + 1, z + 2, z + 3},
-	}
-}
-
-func TestP256MovCond(t *testing.T) {
-	fmt.Printf("supportLSX=%v, supportLASX=%v\n", supportLSX, supportLASX)
-	a := newPoint(10, 20, 30)
-	b := newPoint(100, 200, 300)
-	var res SM2P256Point1
-
-	// cond == 0: res = b
-	p256MovCond(&res, a, b, 0)
-	if !reflect.DeepEqual(res, *b) {
-		t.Errorf("cond=0: got %+v, want %+v", res, *b)
-	}
-
-	// cond != 0: res = a
-	p256MovCond(&res, a, b, 1)
-	if !reflect.DeepEqual(res, *a) {
-		t.Errorf("cond=1: got %+v, want %+v", res, *a)
-	}
-
-	// cond < 0: res = a (should treat any nonzero as true)
-	p256MovCond(&res, a, b, -123)
-	if !reflect.DeepEqual(res, *a) {
-		t.Errorf("cond=-123: got %+v, want %+v", res, *a)
-	}
-}
-
-// fromBig converts a *big.Int into a format used by this code.
-func fromBig(out *p256Element, big *big.Int) {
-	for i := range out {
-		out[i] = 0
-	}
-
-	for i, v := range big.Bits() {
-		out[i] = uint64(v)
-	}
-}
-
-func toBigInt(in *p256Element) *big.Int {
-	var valBytes [32]byte
-	p256LittleToBig(&valBytes, in)
-	return new(big.Int).SetBytes(valBytes[:])
-}
-
-func p256MulTest(t *testing.T, x, y, p, r *big.Int) {
-	x1 := new(big.Int).Mul(x, r)
-	x1 = x1.Mod(x1, p)
-	y1 := new(big.Int).Mul(y, r)
-	y1 = y1.Mod(y1, p)
-	ax := new(p256Element)
-	ay := new(p256Element)
-	res := new(p256Element)
-	res2 := new(p256Element)
-	fromBig(ax, x1)
-	fromBig(ay, y1)
-	p256Mul(res2, ax, ay)
-	p256FromMont(res, res2)
-	resInt := toBigInt(res)
-
-	expected := new(big.Int).Mul(x, y)
-	expected = expected.Mod(expected, p)
-	if resInt.Cmp(expected) != 0 {
-		t.Fatalf("p256Mul(%x, %x) = %x, want %x", x, y, resInt, expected)
-	}
-}
-
-func TestP256MulPMinus1(t *testing.T) {
-	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
-	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
-	pMinus1 := new(big.Int).Sub(p, big.NewInt(1))
-	p256MulTest(t, pMinus1, pMinus1, p, r)
-}
-
-func TestFuzzyP256Mul(t *testing.T) {
-	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
-	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
-	var scalar1 [32]byte
-	var scalar2 [32]byte
-	var timeout *time.Timer
-
-	if testing.Short() {
-		timeout = time.NewTimer(10 * time.Millisecond)
-	} else {
-		timeout = time.NewTimer(2 * time.Second)
-	}
-	for {
-		select {
-		case <-timeout.C:
-			return
-		default:
-		}
-		io.ReadFull(rand.Reader, scalar1[:])
-		io.ReadFull(rand.Reader, scalar2[:])
-		x := new(big.Int).SetBytes(scalar1[:])
-		y := new(big.Int).SetBytes(scalar2[:])
-		p256MulTest(t, x, y, p, r)
-	}
-}
-
-func p256SqrTest(t *testing.T, x, p, r *big.Int) {
-	x1 := new(big.Int).Mul(x, r)
-	x1 = x1.Mod(x1, p)
-	ax := new(p256Element)
-	res := new(p256Element)
-	res2 := new(p256Element)
-	fromBig(ax, x1)
-	p256Sqr(res2, ax, 1)
-	p256FromMont(res, res2)
-	resInt := toBigInt(res)
-
-	expected := new(big.Int).Mul(x, x)
-	expected = expected.Mod(expected, p)
-	if resInt.Cmp(expected) != 0 {
-		t.Fatalf("p256Sqr(%x) = %x, want %x", x, resInt, expected)
-	}
-}
-
-func TestP256SqrPMinus1(t *testing.T) {
-	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
-	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
-	pMinus1 := new(big.Int).Sub(p, big.NewInt(1))
-	p256SqrTest(t, pMinus1, p, r)
-}
-
-func TestFuzzyP256Sqr(t *testing.T) {
-	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
-	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
-	var scalar1 [32]byte
-	var timeout *time.Timer
-
-	if testing.Short() {
-		timeout = time.NewTimer(10 * time.Millisecond)
-	} else {
-		timeout = time.NewTimer(2 * time.Second)
-	}
-	for {
-		select {
-		case <-timeout.C:
-			return
-		default:
-		}
-		io.ReadFull(rand.Reader, scalar1[:])
-		x := new(big.Int).SetBytes(scalar1[:])
-		p256SqrTest(t, x, p, r)
-	}
-}
-
-func TestP256OrdReduce(t *testing.T) {
-	p256Ord := &p256OrdElement{0x53bbf40939d54123, 0x7203df6b21c6052b, 0xffffffffffffffff, 0xfffffffeffffffff}
-	// s < p256Ord
-	var s1 p256OrdElement
-	copy(s1[:], p256Ord[:])
-	s1[0] -= 1 // s1 = p256Ord - 1
-	s1Orig := s1
-	p256OrdReduce(&s1)
-	if s1 != s1Orig {
-		t.Errorf("p256OrdReduce changed s when s < p256Ord: got %x, want %x", s1, s1Orig)
-	}
-
-	// s >= p256Ord
-	var s2 p256OrdElement
-	copy(s2[:], p256Ord[:])
-	// s2 = p256Ord
-	p256OrdReduce(&s2)
-	zero := p256OrdElement{}
-	if s2 != zero {
-		t.Errorf("p256OrdReduce failed for s == p256Ord: got %x, want 0", s2)
-	}
-
-	// s2 = p256Ord + 1
-	copy(s2[:], p256Ord[:])
-	s2[0] += 1
-	p256OrdReduce(&s2)
-	one := p256OrdElement{1, 0, 0, 0}
-	if s2 != one {
-		t.Errorf("p256OrdReduce failed for s == p256Ord+1: got %x, want %x", s2, one)
-	}
-}
-
-func TestP256Sub(t *testing.T) {
-	// in1 > in2
-	in1 := p256Element{5, 0, 0, 0}
-	in2 := p256Element{3, 0, 0, 0}
-	var res p256Element
-	p256Sub(&res, &in1, &in2)
-	want := p256Element{2, 0, 0, 0}
-	if !reflect.DeepEqual(res, want) {
-		t.Errorf("in1 > in2: got %v, want %v", res, want)
-	}
-
-	// in1 == in2
-	in1 = p256Element{7, 8, 9, 10}
-	in2 = p256Element{7, 8, 9, 10}
-	p256Sub(&res, &in1, &in2)
-	want = p256Element{0, 0, 0, 0}
-	if !reflect.DeepEqual(res, want) {
-		t.Errorf("in1 == in2: got %v, want %v", res, want)
-	}
-
-	// in1 < in2
-	in1 = p256Element{1, 0, 0, 0}
-	in2 = p256Element{2, 0, 0, 0}
-	p256Sub(&res, &in1, &in2)
-	// 1 - 2 mod 2^64 = 0xFFFFFFFFFFFFFFFF
-	want = p256Element{0xfffffffffffffffe, 0xffffffff00000000,
-		0xffffffffffffffff, 0xfffffffeffffffff}
-	if !reflect.DeepEqual(res, want) {
-		t.Errorf("in1 < in2: got %v, want %v", res, want)
-	}
-}
-
-func p256MulBy2Test(t *testing.T, x, p, r *big.Int) {
-	x1 := new(big.Int).Mul(x, r)
-	x1 = x1.Mod(x1, p)
-	y1 := new(big.Int).Mul(big.NewInt(2), r)
-	y1 = y1.Mod(y1, p)
-	ax := new(p256Element)
-	res := new(p256Element)
-	res2 := new(p256Element)
-	fromBig(ax, x1)
-	p256MulBy2(res2, ax)
-	p256FromMont(res, res2)
-	resInt := toBigInt(res)
-
-	expected := new(big.Int).Mul(x, big.NewInt(2))
-	expected = expected.Mod(expected, p)
-	if resInt.Cmp(expected) != 0 {
-		t.Fatalf("p256MulBy2(%x) = %x, want %x", x, resInt, expected)
-	}
-}
-
-func TestP256MulBy2(t *testing.T) {
-	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
-	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
-	pMinus1 := new(big.Int).Sub(p, big.NewInt(1))
-	p256MulBy2Test(t, pMinus1, p, r)
-	p256MulBy2Test(t, big.NewInt(0), p, r)
-	p256MulBy2Test(t, big.NewInt(1), p, r)
-}
-
-func p256AddTest(t *testing.T, x, y, p, r *big.Int) {
-	x1 := new(big.Int).Mul(x, r)
-	x1 = x1.Mod(x1, p)
-	y1 := new(big.Int).Mul(y, r)
-	y1 = y1.Mod(y1, p)
-	ax := new(p256Element)
-	ay := new(p256Element)
-	res := new(p256Element)
-	res2 := new(p256Element)
-	fromBig(ax, x1)
-	fromBig(ay, y1)
-	p256Add(res2, ax, ay)
-	p256FromMont(res, res2)
-	resInt := toBigInt(res)
-
-	expected := new(big.Int).Add(x, y)
-	expected = expected.Mod(expected, p)
-	if resInt.Cmp(expected) != 0 {
-		t.Fatalf("p256Add(%x, %x) = %x, want %x", x, y, resInt, expected)
-	}
-}
-
-func TestP256AddPMinus1(t *testing.T) {
-	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
-	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
-	pMinus1 := new(big.Int).Sub(p, big.NewInt(1))
-	p256AddTest(t, pMinus1, pMinus1, p, r)
-}
-
-func TestFuzzyP256Add(t *testing.T) {
-	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
-	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
-	var scalar1 [32]byte
-	var scalar2 [32]byte
-	var timeout *time.Timer
-
-	if testing.Short() {
-		timeout = time.NewTimer(10 * time.Millisecond)
-	} else {
-		timeout = time.NewTimer(2 * time.Second)
-	}
-	for {
-		select {
-		case <-timeout.C:
-			return
-		default:
-		}
-		io.ReadFull(rand.Reader, scalar1[:])
-		io.ReadFull(rand.Reader, scalar2[:])
-		x := new(big.Int).SetBytes(scalar1[:])
-		y := new(big.Int).SetBytes(scalar2[:])
-		p256AddTest(t, x, y, p, r)
-	}
-}
--- a/internal/sm2ec/sm2p256_asm_test.go
+++ b/internal/sm2ec/sm2p256_asm_test.go
@ -1,4 +1,4 @@
-//go:build (amd64 || arm64 || s390x || ppc64le) && !purego
+//go:build (amd64 || arm64 || loong64 || s390x || ppc64le) && !purego

 package sm2ec