From 24bb018fade193c91d78e40b1eb2d24efdc3d773 Mon Sep 17 00:00:00 2001
From: Sun Yimin
Date: Mon, 13 Oct 2025 11:59:15 +0800
Subject: [PATCH] internal/sm2ec: loong64 p256Mul

---
 internal/sm2ec/p256_asm_loong64.s          | 417 ++++++++++++++++++++-
 internal/sm2ec/sm2p256_asm_loong64.go      |   7 +-
 internal/sm2ec/sm2p256_asm_loong64_test.go |  89 ++++-
 3 files changed, 504 insertions(+), 9 deletions(-)

diff --git a/internal/sm2ec/p256_asm_loong64.s b/internal/sm2ec/p256_asm_loong64.s
index 7f10e5a..818c951 100644
--- a/internal/sm2ec/p256_asm_loong64.s
+++ b/internal/sm2ec/p256_asm_loong64.s
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a MIT-style
 // license that can be found in the LICENSE file.
 
-//go:build go1.25 && !purego
+//go:build !purego
 
 #include "textflag.h"
 
@@ -17,11 +17,24 @@
 #define acc3 R10
 #define acc4 R11
 #define acc5 R12
-#define t0 R13
-#define t1 R14
-#define t2 R15
-#define t3 R16
-#define t4 R17
+#define acc6 R13
+#define acc7 R14
+
+#define t0 R15
+#define t1 R16
+#define t2 R17
+#define t3 R18
+#define t4 R19
+
+// R22 holds g on loong64 and must not be used as a scratch register.
+#define x0 R20
+#define x1 R21
+#define x2 R23
+#define x3 R24
+#define y0 R25
+#define y1 R26
+#define y2 R27
+#define y3 R28
 
 DATA p256p<>+0x00(SB)/8, $0xffffffffffffffff
 DATA p256p<>+0x08(SB)/8, $0xffffffff00000000
@@ -322,3 +335,395 @@ TEXT ·p256NegCond(SB),NOSPLIT,$0
 	MOVV acc3, (8*3)(res_ptr)
 
 	RET
+
+// sm2P256MulInternal computes the Montgomery product
+// (y3, y2, y1, y0) = (x3, x2, x1, x0) * (y3, y2, y1, y0) * 2^-256 mod p.
+// Operands are passed in x0-x3 and y0-y3 and the result replaces y0-y3.
+// x0-x3 are preserved; acc0-acc7 and t0-t4 are clobbered.
+// There is no carry flag, so each ADDS/ADCS/SUBS step named in the comments
+// is emulated: "ADDV a, b, b; SGTU a, b, c" leaves the carry in c, and a
+// borrow is taken with SGTU before the matching SUBV.
+TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
+	// y[0] * x
+	MULV y0, x0, acc0
+	MULHVU y0, x0, acc4
+	MULV y0, x1, acc1
+	MULHVU y0, x1, acc5
+	MULV y0, x2, acc2
+	MULHVU y0, x2, acc6
+	MULV y0, x3, acc3
+	MULHVU y0, x3, acc7
+
+	// ADDS acc4, acc1
+	ADDV acc1, acc4, acc1
+	SGTU acc4, acc1, t0
+	// ADCS acc5, acc2
+	ADDV t0, acc5, acc5 // no carry
+	ADDV acc2, acc5, acc2
+	SGTU acc5, acc2, t0
+	// ADCS acc6, acc3
+	ADDV t0, acc6, acc6 // no carry
+	ADDV acc3, acc6, acc3
+	SGTU acc6, acc3, t0
+	// ADC $0, acc7, acc4
+	ADDV t0, acc7, acc4 // no carry
+	// First reduction step
+	SLLV $32, acc0, t0
+	SRLV $32, acc0, t1
+
+	// SUBS t0, acc1
+	SGTU t0, acc1, t2
+	SUBV t0, acc1
+	// SUBCS t1, acc2
+	ADDV t2, t1, t3 // no carry
+	SGTU t3, acc2, t2
+	SUBV t3, acc2
+	// SUBCS t0, acc3
+	ADDV t2, t0, t2 // no carry
+	SGTU t2, acc3, t3
+	SUBV t2, acc3, acc3
+	// SUBC t1, acc0, t2
+	SUBV t1, acc0, t2 // no borrow
+	SUBV t3, t2, t2 // no borrow
+
+	// ADDS acc0, acc1
+	ADDV acc0, acc1, acc1
+	SGTU acc0, acc1, t0
+	// ADCS $0, acc2
+	ADDV t0, acc2, acc2
+	SGTU t0, acc2, t1
+	// ADCS $0, acc3
+	ADDV t1, acc3, acc3
+	SGTU t1, acc3, t0
+	// ADC $0, t2, acc0
+	ADDV t0, t2, acc0 // (acc1, acc2, acc3, acc0) is the result
+
+	// y[1] * x
+	MULV y1, x0, t0
+	// ADDS t0, acc1
+	ADDV t0, acc1, acc1
+	SGTU t0, acc1, t2
+	MULHVU y1, x0, t1
+
+	MULV y1, x1, t0
+	// ADCS t0, acc2
+	ADDV t0, acc2, acc2
+	SGTU t0, acc2, t3
+	ADDV t2, acc2, acc2
+	SGTU t2, acc2, t4
+	OR t3, t4, t2
+	MULHVU y1, x1, y0
+
+	MULV y1, x2, t0
+	// ADCS t0, acc3
+	ADDV t0, acc3, acc3
+	SGTU t0, acc3, t3
+	ADDV t2, acc3, acc3
+	SGTU t2, acc3, t4
+	OR t3, t4, t2
+	MULHVU y1, x2, acc6
+
+	MULV y1, x3, t0
+	// ADCS t0, acc4
+	ADDV t0, acc4, acc4
+	SGTU t0, acc4, t3
+	ADDV t2, acc4, acc4
+	SGTU t2, acc4, t4
+	OR t3, t4, acc5
+	MULHVU y1, x3, acc7
+
+	// ADDS t1, acc2
+	ADDV t1, acc2, acc2
+	SGTU t1, acc2, t2
+	// ADCS y0, acc3
+	ADDV y0, acc3, acc3
+	SGTU y0, acc3, t3
+	ADDV t2, acc3, acc3
+	SGTU t2, acc3, t4
+	OR t3, t4, t2
+	// ADCS acc6, acc4
+	ADDV acc6, acc4, acc4
+	SGTU acc6, acc4, t3
+	ADDV t2, acc4, acc4
+	SGTU t2, acc4, t4
+	OR t3, t4, t2
+	// ADC acc7, acc5
+	ADDV t2, acc5, acc5
+	ADDV acc7, acc5, acc5
+
+	// Second reduction step
+	SLLV $32, acc1, t0
+	SRLV $32, acc1, t1
+
+	// SUBS t0, acc2
+	SGTU t0, acc2, t2
+	SUBV t0, acc2
+	// SUBCS t1, acc3
+	ADDV t2, t1, t3 // no carry
+	SGTU t3, acc3, t2
+	SUBV t3, acc3
+	// SUBCS t0, acc0
+	ADDV t2, t0, t2 // no carry
+	SGTU t2, acc0, t3
+	SUBV t2, acc0, acc0
+	// SUBC t1, acc1, t2
+	SUBV t1, acc1, t2 // no borrow
+	SUBV t3, t2, t2 // no borrow
+
+	// ADDS acc1, acc2
+	ADDV acc1, acc2, acc2
+	SGTU acc1, acc2, t0
+	// ADCS $0, acc3
+	ADDV t0, acc3, acc3
+	SGTU t0, acc3, t1
+	// ADCS $0, acc0
+	ADDV t1, acc0, acc0
+	SGTU t1, acc0, t0
+	// ADC $0, t2, acc1
+	ADDV t0, t2, acc1 // (acc2, acc3, acc0, acc1) is the result
+
+	// y[2] * x
+	MULV y2, x0, t0
+	// ADDS t0, acc2
+	ADDV t0, acc2, acc2
+	SGTU t0, acc2, t2
+	MULHVU y2, x0, t1
+
+	MULV y2, x1, t0
+	// ADCS t0, acc3
+	ADDV t0, acc3, acc3
+	SGTU t0, acc3, t3
+	ADDV t2, acc3, acc3
+	SGTU t2, acc3, t4
+	OR t3, t4, t2
+	MULHVU y2, x1, y0
+
+	MULV y2, x2, t0
+	// ADCS t0, acc0
+	ADDV t0, acc0, acc0
+	SGTU t0, acc0, t3
+	ADDV t2, acc0, acc0
+	SGTU t2, acc0, t4
+	OR t3, t4, t2
+	MULHVU y2, x2, y1
+
+	MULV y2, x3, t0
+	// ADCS t0, acc1
+	ADDV t0, acc1, acc1
+	SGTU t0, acc1, t3
+	ADDV t2, acc1, acc1
+	SGTU t2, acc1, t4
+	OR t3, t4, acc6
+	MULHVU y2, x3, acc7
+
+	// ADDS t1, acc3
+	ADDV t1, acc3, acc3
+	SGTU t1, acc3, t2
+	// ADCS y0, acc4
+	ADDV y0, acc4, acc4
+	SGTU y0, acc4, t3
+	ADDV t2, acc4, acc4
+	SGTU t2, acc4, t4
+	OR t3, t4, t2
+	// ADCS y1, acc5
+	ADDV y1, acc5, acc5
+	SGTU y1, acc5, t3
+	ADDV t2, acc5, acc5
+	SGTU t2, acc5, t4
+	OR t3, t4, t2
+	// ADC acc7, acc6
+	ADDV t2, acc6, acc6
+	ADDV acc7, acc6, acc6
+
+	// Third reduction step
+	SLLV $32, acc2, t0
+	SRLV $32, acc2, t1
+
+	// SUBS t0, acc3
+	SGTU t0, acc3, t2
+	SUBV t0, acc3
+	// SUBCS t1, acc0
+	ADDV t2, t1, t3 // no carry
+	SGTU t3, acc0, t2
+	SUBV t3, acc0
+	// SUBCS t0, acc1
+	ADDV t2, t0, t2 // no carry
+	SGTU t2, acc1, t3
+	SUBV t2, acc1, acc1
+	// SUBC t1, acc2, t2
+	SUBV t1, acc2, t2 // no borrow
+	SUBV t3, t2, t2 // no borrow
+
+	// ADDS acc2, acc3
+	ADDV acc2, acc3, acc3
+	SGTU acc2, acc3, t0
+	// ADCS $0, acc0
+	ADDV t0, acc0, acc0
+	SGTU t0, acc0, t1
+	// ADCS $0, acc1
+	ADDV t1, acc1, acc1
+	SGTU t1, acc1, t0
+	// ADC $0, t2, acc2
+	ADDV t0, t2, acc2 // (acc3, acc0, acc1, acc2) is the result
+
+	// y[3] * x
+	MULV y3, x0, t0
+	// ADDS t0, acc3
+	ADDV t0, acc3, acc3
+	SGTU t0, acc3, t2
+	MULHVU y3, x0, t1
+
+	MULV y3, x1, t0
+	// ADCS t0, acc4
+	ADDV t0, acc4, acc4
+	SGTU t0, acc4, t3
+	ADDV t2, acc4, acc4
+	SGTU t2, acc4, t4
+	OR t3, t4, t2
+	MULHVU y3, x1, y0
+
+	MULV y3, x2, t0
+	// ADCS t0, acc5
+	ADDV t0, acc5, acc5
+	SGTU t0, acc5, t3
+	ADDV t2, acc5, acc5
+	SGTU t2, acc5, t4
+	OR t3, t4, t2
+	MULHVU y3, x2, y1
+
+	MULV y3, x3, t0
+	// ADCS t0, acc6
+	ADDV t0, acc6, acc6
+	SGTU t0, acc6, t3
+	ADDV t2, acc6, acc6
+	SGTU t2, acc6, t4
+	OR t3, t4, acc7
+	MULHVU y3, x3, t0
+
+	// ADDS t1, acc4
+	ADDV t1, acc4, acc4
+	SGTU t1, acc4, t2
+	// ADCS y0, acc5
+	ADDV y0, acc5, acc5
+	SGTU y0, acc5, t3
+	ADDV t2, acc5, acc5
+	SGTU t2, acc5, t4
+	OR t3, t4, t2
+	// ADCS y1, acc6
+	ADDV y1, acc6, acc6
+	SGTU y1, acc6, t3
+	ADDV t2, acc6, acc6
+	SGTU t2, acc6, t4
+	OR t3, t4, t2
+	// ADC t0, acc7
+	ADDV t2, acc7, acc7
+	ADDV t0, acc7, acc7
+
+	// Fourth reduction step
+	SLLV $32, acc3, t0
+	SRLV $32, acc3, t1
+
+	// SUBS t0, acc0
+	SGTU t0, acc0, t2
+	SUBV t0, acc0
+	// SUBCS t1, acc1
+	ADDV t2, t1, t3 // no carry
+	SGTU t3, acc1, t2
+	SUBV t3, acc1
+	// SUBCS t0, acc2
+	ADDV t2, t0, t2 // no carry
+	SGTU t2, acc2, t3
+	SUBV t2, acc2, acc2
+	// SUBC t1, acc3, t2
+	SUBV t1, acc3, t2 // no borrow
+	SUBV t3, t2, t2 // no borrow
+
+	// ADDS acc3, acc0
+	ADDV acc3, acc0, acc0
+	SGTU acc3, acc0, t0
+	// ADCS $0, acc1
+	ADDV t0, acc1, acc1
+	SGTU t0, acc1, t1
+	// ADCS $0, acc2
+	ADDV t1, acc2, acc2
+	SGTU t1, acc2, t0
+	// ADC $0, t2, acc3
+	ADDV t0, t2, acc3 // (acc0, acc1, acc2, acc3) is the result
+
+	// Add bits [511:256] of the mul result
+	ADDV acc4, acc0, y0
+	SGTU acc4, y0, t0
+	ADDV acc5, acc1, y1
+	SGTU acc5, y1, t1
+	ADDV t0, y1, y1
+	SGTU t0, y1, t2
+	OR t1, t2, t0
+	ADDV acc6, acc2, y2
+	SGTU acc6, y2, t1
+	ADDV t0, y2, y2
+	SGTU t0, y2, t2
+	OR t1, t2, t0
+	ADDV acc7, acc3, y3
+	SGTU acc7, y3, t1
+	ADDV t0, y3, y3
+	SGTU t0, y3, t2
+	OR t1, t2, t0
+
+	// Final reduction
+	ADDV $1, y0, acc4
+	SGTU y0, acc4, t1
+	MOVV p256one<>+0x08(SB), t2
+	ADDV t2, t1, t1 // no carry
+	ADDV y1, t1, acc5
+	SGTU y1, acc5, t3
+	ADDV t3, y2, acc6
+	SGTU y2, acc6, t4
+	ADDV $1, t2, t2
+	ADDV t4, t2, t2 // no carry
+	ADDV y3, t2, acc7
+	SGTU y3, acc7, t4
+	OR t0, t4, t0
+
+	MASKNEZ t0, y0, y0
+	MASKEQZ t0, acc4, acc4
+	OR acc4, y0
+
+	MASKNEZ t0, y1, y1
+	MASKEQZ t0, acc5, acc5
+	OR acc5, y1
+
+	MASKNEZ t0, y2, y2
+	MASKEQZ t0, acc6, acc6
+	OR acc6, y2
+
+	MASKNEZ t0, y3, y3
+	MASKEQZ t0, acc7, acc7
+	OR acc7, y3
+
+	RET
+
+/* ---------------------------------------*/
+// func p256Mul(res, in1, in2 *p256Element)
+TEXT ·p256Mul(SB),NOSPLIT,$0
+	MOVV res+0(FP), res_ptr
+	MOVV in1+8(FP), x_ptr
+	MOVV in2+16(FP), y_ptr
+
+	MOVV (8*0)(x_ptr), x0
+	MOVV (8*1)(x_ptr), x1
+	MOVV (8*2)(x_ptr), x2
+	MOVV (8*3)(x_ptr), x3
+
+	MOVV (8*0)(y_ptr), y0
+	MOVV (8*1)(y_ptr), y1
+	MOVV (8*2)(y_ptr), y2
+	MOVV (8*3)(y_ptr), y3
+
+	CALL sm2P256MulInternal<>(SB)
+
+	MOVV y0, (8*0)(res_ptr)
+	MOVV y1, (8*1)(res_ptr)
+	MOVV y2, (8*2)(res_ptr)
+	MOVV y3, (8*3)(res_ptr)
+
+	RET
diff --git a/internal/sm2ec/sm2p256_asm_loong64.go b/internal/sm2ec/sm2p256_asm_loong64.go
index 2287c5a..cf27239 100644
--- a/internal/sm2ec/sm2p256_asm_loong64.go
+++ b/internal/sm2ec/sm2p256_asm_loong64.go
@@ -1,5 +1,3 @@
-//go:build go1.25
-
 package sm2ec
 
 import (
@@ -34,3 +32,8 @@ func p256MovCond(res, a, b *SM2P256Point1, cond int)
 //
 //go:noescape
 func p256NegCond(val *p256Element, cond int)
+
+// Montgomery multiplication. Sets res = in1 * in2 * R⁻¹ mod p.
+//
+//go:noescape
+func p256Mul(res, in1, in2 *p256Element)
diff --git a/internal/sm2ec/sm2p256_asm_loong64_test.go b/internal/sm2ec/sm2p256_asm_loong64_test.go
index f423a74..aa84f39 100644
--- a/internal/sm2ec/sm2p256_asm_loong64_test.go
+++ b/internal/sm2ec/sm2p256_asm_loong64_test.go
@@ -1,13 +1,17 @@
-//go:build loong64 && go1.25 && !purego
+//go:build loong64 && !purego
 
 package sm2ec
 
 import (
 	"bytes"
+	"crypto/rand"
 	"encoding/binary"
 	"fmt"
+	"io"
+	"math/big"
 	"reflect"
 	"testing"
+	"time"
 )
 
 func TestP256BigToLittle(t *testing.T) {
@@ -102,3 +106,86 @@ func TestP256MovCond(t *testing.T) {
 		t.Errorf("cond=-123: got %+v, want %+v", res, *a)
 	}
 }
+
+// fromBig converts a *big.Int into a format used by this code.
+func fromBig(out *p256Element, n *big.Int) {
+	for i := range out {
+		out[i] = 0
+	}
+
+	for i, v := range n.Bits() {
+		out[i] = uint64(v)
+	}
+}
+
+// toBigInt converts a p256Element back into a *big.Int.
+func toBigInt(in *p256Element) *big.Int {
+	var valBytes [32]byte
+	p256LittleToBig(&valBytes, in)
+	return new(big.Int).SetBytes(valBytes[:])
+}
+
+// p256MulTest checks p256Mul against math/big: x and y are converted to
+// Montgomery form, multiplied, converted back and compared with x*y mod p.
+func p256MulTest(t *testing.T, x, y, p, r *big.Int) {
+	t.Helper()
+	x1 := new(big.Int).Mul(x, r)
+	x1 = x1.Mod(x1, p)
+	y1 := new(big.Int).Mul(y, r)
+	y1 = y1.Mod(y1, p)
+	one := new(p256Element)
+	one[0] = 1
+	ax := new(p256Element)
+	ay := new(p256Element)
+	res := new(p256Element)
+	res2 := new(p256Element)
+	fromBig(ax, x1)
+	fromBig(ay, y1)
+	p256Mul(res2, ax, ay)
+	p256Mul(res, res2, one)
+	resInt := toBigInt(res)
+
+	expected := new(big.Int).Mul(x, y)
+	expected = expected.Mod(expected, p)
+	if resInt.Cmp(expected) != 0 {
+		t.Fatalf("p256Mul(%x, %x) = %x, want %x", x, y, resInt, expected)
+	}
+}
+
+func TestP256MulPMinus1(t *testing.T) {
+	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
+	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
+	pMinus1 := new(big.Int).Sub(p, big.NewInt(1))
+	p256MulTest(t, pMinus1, pMinus1, p, r)
+}
+
+func TestFuzzyP256Mul(t *testing.T) {
+	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
+	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
+	var scalar1 [32]byte
+	var scalar2 [32]byte
+	var timeout *time.Timer
+
+	if testing.Short() {
+		timeout = time.NewTimer(10 * time.Millisecond)
+	} else {
+		timeout = time.NewTimer(2 * time.Second)
+	}
+	defer timeout.Stop()
+	for {
+		select {
+		case <-timeout.C:
+			return
+		default:
+		}
+		if _, err := io.ReadFull(rand.Reader, scalar1[:]); err != nil {
+			t.Fatal(err)
+		}
+		if _, err := io.ReadFull(rand.Reader, scalar2[:]); err != nil {
+			t.Fatal(err)
+		}
+		x := new(big.Int).SetBytes(scalar1[:])
+		y := new(big.Int).SetBytes(scalar2[:])
+		p256MulTest(t, x, y, p, r)
+	}
+}