From 24bb018fade193c91d78e40b1eb2d24efdc3d773 Mon Sep 17 00:00:00 2001
From: Sun Yimin <emmansun@users.noreply.github.com>
Date: Mon, 13 Oct 2025 11:59:15 +0800
Subject: [PATCH] internal/sm2ec: loong64 p256Mul

---
 internal/sm2ec/p256_asm_loong64.s          | 409 ++++++++++++++++++++-
 internal/sm2ec/sm2p256_asm_loong64.go      |   7 +-
 internal/sm2ec/sm2p256_asm_loong64_test.go |  80 +++-
 3 files changed, 487 insertions(+), 9 deletions(-)

diff --git a/internal/sm2ec/p256_asm_loong64.s b/internal/sm2ec/p256_asm_loong64.s
index 7f10e5a..818c951 100644
--- a/internal/sm2ec/p256_asm_loong64.s
+++ b/internal/sm2ec/p256_asm_loong64.s
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a MIT-style
 // license that can be found in the LICENSE file.
 
-//go:build go1.25 && !purego
+//go:build !purego
 
 #include "textflag.h"
 
@@ -17,11 +17,22 @@
 #define acc3 R10
 #define acc4 R11
 #define acc5 R12
-#define t0 R13
-#define t1 R14
-#define t2 R15
-#define t3 R16
-#define t4 R17
+#define acc6 R13
+#define acc7 R14
+
+#define t0 R15
+#define t1 R16
+#define t2 R17
+#define t3 R18
+
+#define x0 R19
+#define x1 R20
+#define x2 R21
+#define x3 R22
+#define y0 R23
+#define y1 R24
+#define y2 R25
+#define y3 R26
 
 DATA p256p<>+0x00(SB)/8, $0xffffffffffffffff
 DATA p256p<>+0x08(SB)/8, $0xffffffff00000000
@@ -322,3 +333,389 @@ TEXT ·p256NegCond(SB),NOSPLIT,$0
 	MOVV acc3, (8*3)(res_ptr)
 
 	RET
+
+// (y3, y2, y1, y0) = (x3, x2, x1, x0) * (y3, y2, y1, y0)
+TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
+	// y[0] * x
+	MULV y0, x0, acc0
+	MULHVU	y0, x0, acc4
+	MULV y0, x1, acc1
+	MULHVU y0, x1, acc5
+	MULV y0, x2, acc2
+	MULHVU y0, x2, acc6
+	MULV y0, x3, acc3
+	MULHVU y0, x3, acc7
+
+	// ADDS acc4, acc1
+	ADDV acc1, acc4, acc1
+	SGTU acc4, acc1, t0
+	// ADCS acc5, acc2
+	ADDV t0, acc5, acc5    // no carry
+	ADDV acc2, acc5, acc2
+	SGTU acc5, acc2, t0
+	// ADCS acc6, acc3
+	ADDV t0, acc6, acc6    // no carry
+	ADDV acc3, acc6, acc3
+	SGTU acc6, acc3, t0
+	// ADC $0, acc7, acc4
+	ADDV t0, acc7, acc4    // no carry
+	// First reduction step
+	SLLV $32, acc0, t0
+	SRLV $32, acc0, t1
+
+	// SUBS t0, acc1
+	SGTU t0, acc1, t2
+	SUBV t0, acc1
+	// SUBCS t1, acc2
+	ADDV t2, t1, t3        // no carry
+	SGTU t3, acc2, t2
+	SUBV t3, acc2
+	// SUBCS t0, acc3
+	ADDV t2, t0, t2        // no carry
+	SGTU t2, acc3, t3
+	SUBV t2, acc3, acc3
+	// SUBC t1, acc0, t2
+	SUBV t1, acc0, t2      // no borrow
+	SUBV t3, t2, t2        // no borrow
+
+	// ADDS acc0, acc1
+	ADDV acc0, acc1, acc1
+	SGTU acc0, acc1, t0
+	// ADCS $0, acc2
+	ADDV t0, acc2, acc2
+	SGTU t0, acc2, t1
+	// ADCS $0, acc3
+	ADDV t1, acc3, acc3
+	SGTU t1, acc3, t0
+	// ADC $0, t2, acc0
+	ADDV t0, t2, acc0      // (acc1, acc2, acc3, acc0) is the result
+
+	// y[1] * x
+	MULV y1, x0, t0
+	// ADDS t0, acc1
+	ADDV t0, acc1, acc1
+	SGTU t0, acc1, t2
+	MULHVU y1, x0, t1
+
+	MULV y1, x1, t0
+	// ADCS t0, acc2
+	ADDV t0, acc2, acc2
+	SGTU t0, acc2, t3
+	ADDV t2, acc2, acc2
+	SGTU t2, acc2, t4
+	OR t3, t4, t2
+	MULHVU y1, x1, y0
+
+	MULV y1, x2, t0
+	// ADCS t0, acc3
+	ADDV t0, acc3, acc3
+	SGTU t0, acc3, t3
+	ADDV t2, acc3, acc3
+	SGTU t2, acc3, t4
+	OR t3, t4, t2
+	MULHVU y1, x2, acc6
+
+	MULV y1, x3, t0
+	// ADCS t0, acc4
+	ADDV t0, acc4, acc4
+	SGTU t0, acc4, t3
+	ADDV t2, acc4, acc4
+	SGTU t2, acc4, t4
+	OR t3, t4, acc5
+	MULHVU y1, x3, acc7
+
+	// ADDS	t1, acc2
+	ADDV t1, acc2, acc2
+	SGTU t1, acc2, t2
+	// ADCS	y0, acc3
+	ADDV y0, acc3, acc3
+	SGTU y0, acc3, t3
+	ADDV t2, acc3, acc3
+	SGTU t2, acc3, t4
+	OR t3, t4, t2
+	// ADCS	acc6, acc4
+	ADDV acc6, acc4, acc4
+	SGTU acc6, acc4, t3
+	ADDV t2, acc4, acc4
+	SGTU t2, acc4, t4
+	OR t3, t4, t2
+	// ADC	acc7, acc5
+	ADDV t2, acc5, acc5
+	ADDV acc7, acc5, acc5
+
+	// Second reduction step
+	SLLV $32, acc1, t0
+	SRLV $32, acc1, t1
+
+	// SUBS t0, acc2
+	SGTU t0, acc2, t2
+	SUBV t0, acc2
+	// SUBCS t1, acc3
+	ADDV t2, t1, t3        // no carry
+	SGTU t3, acc3, t2
+	SUBV t3, acc3
+	// SUBCS t0, acc0
+	ADDV t2, t0, t2        // no carry
+	SGTU t2, acc0, t3
+	SUBV t2, acc0, acc0
+	// SUBC t1, acc1, t2
+	SUBV t1, acc1, t2      // no borrow
+	SUBV t3, t2, t2        // no borrow
+
+	// ADDS acc1, acc2
+	ADDV acc1, acc2, acc2
+	SGTU acc1, acc2, t0
+	// ADCS $0, acc3
+	ADDV t0, acc3, acc3
+	SGTU t0, acc3, t1
+	// ADCS $0, acc0
+	ADDV t1, acc0, acc0
+	SGTU t1, acc0, t0
+	// ADC $0, t2, acc1
+	ADDV t0, t2, acc1      // (acc2, acc3, acc0, acc1) is the result
+
+	// y[2] * x
+	MULV y2, x0, t0
+	// ADDS t0, acc2
+	ADDV t0, acc2, acc2
+	SGTU t0, acc2, t2
+	MULHVU y2, x0, t1
+
+	MULV y2, x1, t0
+	// ADCS t0, acc3
+	ADDV t0, acc3, acc3
+	SGTU t0, acc3, t3
+	ADDV t2, acc3, acc3
+	SGTU t2, acc3, t4
+	OR t3, t4, t2
+	MULHVU y2, x1, y0
+
+	MULV y2, x2, t0
+	// ADCS t0, acc0
+	ADDV t0, acc0, acc0
+	SGTU t0, acc0, t3
+	ADDV t2, acc0, acc0
+	SGTU t2, acc0, t4
+	OR t3, t4, t2
+	MULHVU y2, x2, y1
+
+	MULV y2, x3, t0
+	// ADCS t0, acc1
+	ADDV t0, acc1, acc1
+	SGTU t0, acc1, t3
+	ADDV t2, acc1, acc1
+	SGTU t2, acc1, t4
+	OR t3, t4, acc6
+	MULHVU y2, x3, acc7
+
+	// ADDS	t1, acc3
+	ADDV t1, acc3, acc3
+	SGTU t1, acc3, t2
+	// ADCS	y0, acc4
+	ADDV y0, acc4, acc4
+	SGTU y0, acc4, t3
+	ADDV t2, acc4, acc4
+	SGTU t2, acc4, t4
+	OR t3, t4, t2
+	// ADCS	y1, acc5
+	ADDV y1, acc5, acc5
+	SGTU y1, acc5, t3
+	ADDV t2, acc5, acc5
+	SGTU t2, acc5, t4
+	OR t3, t4, t2
+	// ADC	acc7, acc6
+	ADDV t2, acc6, acc6
+	ADDV acc7, acc6, acc6
+
+	// Third reduction step
+	SLLV $32, acc2, t0
+	SRLV $32, acc2, t1
+
+	// SUBS t0, acc3
+	SGTU t0, acc3, t2
+	SUBV t0, acc3
+	// SUBCS t1, acc0
+	ADDV t2, t1, t3        // no carry
+	SGTU t3, acc0, t2
+	SUBV t3, acc0
+	// SUBCS t0, acc1
+	ADDV t2, t0, t2        // no carry
+	SGTU t2, acc1, t3
+	SUBV t2, acc1, acc1	
+	// SUBC t1, acc2, t2
+	SUBV t1, acc2, t2      // no borrow
+	SUBV t3, t2, t2        // no borrow
+
+	// ADDS acc2, acc3
+	ADDV acc2, acc3, acc3
+	SGTU acc2, acc3, t0
+	// ADCS $0, acc0
+	ADDV t0, acc0, acc0
+	SGTU t0, acc0, t1
+	// ADCS $0, acc1
+	ADDV t1, acc1, acc1
+	SGTU t1, acc1, t0
+	// ADC $0, t2, acc2
+	ADDV t0, t2, acc2      // (acc3, acc0, acc1, acc2) is the result
+
+	// y[2] * x
+	MULV y3, x0, t0
+	// ADDS t0, acc3
+	ADDV t0, acc3, acc3
+	SGTU t0, acc3, t2
+	MULHVU y3, x0, t1
+
+	MULV y3, x1, t0
+	// ADCS t0, acc4
+	ADDV t0, acc4, acc4
+	SGTU t0, acc4, t3
+	ADDV t2, acc4, acc4
+	SGTU t2, acc4, t4
+	OR t3, t4, t2
+	MULHVU y3, x1, y0
+
+	MULV y3, x2, t0
+	// ADCS t0, acc5
+	ADDV t0, acc5, acc5
+	SGTU t0, acc5, t3
+	ADDV t2, acc5, acc5	
+	SGTU t2, acc5, t4
+	OR t3, t4, t2
+	MULHVU y3, x2, y1
+
+	MULV y3, x3, t0
+	// ADCS t0, acc6
+	ADDV t0, acc6, acc6
+	SGTU t0, acc6, t3
+	ADDV t2, acc6, acc6
+	SGTU t2, acc6, t4
+	OR t3, t4, acc7
+	MULHVU y3, x3, t0
+
+	// ADDS	t1, acc4
+	ADDV t1, acc4, acc4
+	SGTU t1, acc4, t2
+	// ADCS	y0, acc5
+	ADDV y0, acc5, acc5
+	SGTU y0, acc5, t3
+	ADDV t2, acc5, acc5
+	SGTU t2, acc5, t4
+	OR t3, t4, t2
+	// ADCS	y1, acc6
+	ADDV y1, acc6, acc6
+	SGTU y1, acc6, t3
+	ADDV t2, acc6, acc6
+	SGTU t2, acc6, t4
+	OR t3, t4, t2
+	// ADC	t0, acc7
+	ADDV t2, acc7, acc7
+	ADDV t0, acc7, acc7
+
+	// Fourth reduction step
+	SLLV $32, acc3, t0
+	SRLV $32, acc3, t1
+
+	// SUBS t0, acc0
+	SGTU t0, acc0, t2
+	SUBV t0, acc0
+	// SUBCS t1, acc1
+	ADDV t2, t1, t3        // no carry
+	SGTU t3, acc1, t2
+	SUBV t3, acc1
+	// SUBCS t0, acc2
+	ADDV t2, t0, t2        // no carry
+	SGTU t2, acc2, t3
+	SUBV t2, acc2, acc2
+	// SUBC t1, acc3, t2
+	SUBV t1, acc3, t2      // no borrow
+	SUBV t3, t2, t2        // no borrow
+
+	// ADDS acc3, acc0
+	ADDV acc3, acc0, acc0
+	SGTU acc3, acc0, t0
+	// ADCS $0, acc1
+	ADDV t0, acc1, acc1
+	SGTU t0, acc1, t1
+	// ADCS $0, acc2
+	ADDV t1, acc2, acc2
+	SGTU t1, acc2, t0
+	// ADC $0, t2, acc3
+	ADDV t0, t2, acc3      // (acc0, acc1, acc2, acc3) is the result
+
+	// Add bits [511:256] of the mul result
+	ADDV acc4, acc0, y0
+	SGTU acc4, y0, t0
+	ADDV acc5, acc1, y1
+	SGTU acc5, y1, t1
+	ADDV t0, y1, y1
+	SGTU t0, y1, t2
+	OR t1, t2, t0
+	ADDV acc6, acc2, y2
+	SGTU acc6, y2, t1
+	ADDV t0, y2, acc2
+	SGTU t0, y2, t2
+	OR t1, t2, t0
+	ADDV acc7, acc3, y3
+	SGTU acc7, y3, t1
+	ADDV t0, y3, y3
+	SGTU t0, y3, t2
+	OR t1, t2, t0
+
+	// Final reduction
+	ADDV $1, y0, acc4
+	SGTU y0, acc4, t1
+	MOVV p256one<>+0X08(SB), t2
+	ADDV t2, t1, t1             // no carry
+	ADDV y1, t1, acc5
+	SGTU y1, acc5, t3
+	ADDV t3, y2, acc6
+	SGTU y2, acc6, t4
+	ADDV $1, t2, t2
+	ADDV t4, t2, t2         // no carry
+	ADDV y3, t2, acc7
+	SGTU y3, acc7, t4
+	OR t0, t4, t0
+
+	MASKNEZ t0, y0, y0
+	MASKEQZ t0, acc4, acc4
+	OR acc4, y0
+
+	MASKNEZ t0, y1, y1
+	MASKEQZ t0, acc5, acc5
+	OR acc5, y1
+
+	MASKNEZ t0, y2, y2
+	MASKEQZ t0, acc6, acc6
+	OR acc6, y2
+
+	MASKNEZ t0, y3, y3
+	MASKEQZ t0, acc7, acc7
+	OR acc7, y3
+
+	RET
+
+/* ---------------------------------------*/
+// func p256Mul(res, in1, in2 *p256Element)
+TEXT ·p256Mul(SB),NOSPLIT,$0
+	MOVV res+0(FP), res_ptr
+	MOVV in1+8(FP), x_ptr
+	MOVV in2+16(FP), y_ptr
+
+	MOVV (8*0)(x_ptr), x0
+	MOVV (8*1)(x_ptr), x1
+	MOVV (8*2)(x_ptr), x2
+	MOVV (8*3)(x_ptr), x3
+
+	MOVV (8*0)(y_ptr), y0
+	MOVV (8*1)(y_ptr), y1
+	MOVV (8*2)(y_ptr), y2
+	MOVV (8*3)(y_ptr), y3
+
+	CALL sm2P256MulInternal<>(SB)
+
+	MOVV y0, (8*0)(res_ptr)
+	MOVV y1, (8*1)(res_ptr)
+	MOVV y2, (8*2)(res_ptr)
+	MOVV y3, (8*3)(res_ptr)
+
+	RET
diff --git a/internal/sm2ec/sm2p256_asm_loong64.go b/internal/sm2ec/sm2p256_asm_loong64.go
index 2287c5a..cf27239 100644
--- a/internal/sm2ec/sm2p256_asm_loong64.go
+++ b/internal/sm2ec/sm2p256_asm_loong64.go
@@ -1,5 +1,3 @@
-//go:build go1.25
-
 package sm2ec
 
 import (
@@ -34,3 +32,8 @@ func p256MovCond(res, a, b *SM2P256Point1, cond int)
 //
 //go:noescape
 func p256NegCond(val *p256Element, cond int)
+
+// Montgomery multiplication. Sets res = in1 * in2 * R⁻¹ mod p.
+//
+//go:noescape
+func p256Mul(res, in1, in2 *p256Element)
diff --git a/internal/sm2ec/sm2p256_asm_loong64_test.go b/internal/sm2ec/sm2p256_asm_loong64_test.go
index f423a74..aa84f39 100644
--- a/internal/sm2ec/sm2p256_asm_loong64_test.go
+++ b/internal/sm2ec/sm2p256_asm_loong64_test.go
@@ -1,13 +1,17 @@
-//go:build loong64 && go1.25 && !purego
+//go:build loong64 && !purego
 
 package sm2ec
 
 import (
 	"bytes"
+	"crypto/rand"
 	"encoding/binary"
 	"fmt"
+	"io"
+	"math/big"
 	"reflect"
 	"testing"
+	"time"
 )
 
 func TestP256BigToLittle(t *testing.T) {
@@ -102,3 +106,77 @@ func TestP256MovCond(t *testing.T) {
 		t.Errorf("cond=-123: got %+v, want %+v", res, *a)
 	}
 }
+
+// fromBig converts a *big.Int into a format used by this code.
+func fromBig(out *p256Element, big *big.Int) {
+	for i := range out {
+		out[i] = 0
+	}
+
+	for i, v := range big.Bits() {
+		out[i] = uint64(v)
+	}
+}
+
+func toBigInt(in *p256Element) *big.Int {
+	var valBytes [32]byte
+	p256LittleToBig(&valBytes, in)
+	return new(big.Int).SetBytes(valBytes[:])
+}
+
+func p256MulTest(t *testing.T, x, y, p, r *big.Int) {
+	x1 := new(big.Int).Mul(x, r)
+	x1 = x1.Mod(x1, p)
+	y1 := new(big.Int).Mul(y, r)
+	y1 = y1.Mod(y1, p)
+	one := new(p256Element)
+	one[0] = 1
+	ax := new(p256Element)
+	ay := new(p256Element)
+	res := new(p256Element)
+	res2 := new(p256Element)
+	fromBig(ax, x1)
+	fromBig(ay, y1)
+	p256Mul(res2, ax, ay)
+	p256Mul(res, res2, one)
+	resInt := toBigInt(res)
+
+	expected := new(big.Int).Mul(x, y)
+	expected = expected.Mod(expected, p)
+	if resInt.Cmp(expected) != 0 {
+		t.FailNow()
+	}
+}
+
+func TestP256MulPMinus1(t *testing.T) {
+	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
+	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
+	pMinus1 := new(big.Int).Sub(p, big.NewInt(1))
+	p256MulTest(t, pMinus1, pMinus1, p, r)
+}
+
+func TestFuzzyP256Mul(t *testing.T) {
+	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
+	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
+	var scalar1 [32]byte
+	var scalar2 [32]byte
+	var timeout *time.Timer
+
+	if testing.Short() {
+		timeout = time.NewTimer(10 * time.Millisecond)
+	} else {
+		timeout = time.NewTimer(2 * time.Second)
+	}
+	for {
+		select {
+		case <-timeout.C:
+			return
+		default:
+		}
+		io.ReadFull(rand.Reader, scalar1[:])
+		io.ReadFull(rand.Reader, scalar2[:])
+		x := new(big.Int).SetBytes(scalar1[:])
+		y := new(big.Int).SetBytes(scalar2[:])
+		p256MulTest(t, x, y, p, r)
+	}
+}