internal/sm2ec: use ADCX/ADOX for order WWMM mul/sqr

This commit is contained in:
Sun Yimin 2024-02-23 17:35:19 +08:00 committed by GitHub
parent 052040fd82
commit 0996508b5b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 476 additions and 349 deletions

View File

@ -407,176 +407,161 @@ ordSqrLoop:
RET RET
ordSqrLoopBMI2: ordSqrLoopBMI2:
XORQ acc0, acc0
XORQ y_ptr, y_ptr
// y[1:] * y[0] // y[1:] * y[0]
MOVQ (8*0)(x_ptr), DX MOVQ (8*0)(x_ptr), DX
MULXQ (8*1)(x_ptr), acc1, acc2 MULXQ (8*1)(x_ptr), acc1, acc2
MULXQ (8*2)(x_ptr), AX, acc3 MULXQ (8*2)(x_ptr), AX, acc3
ADDQ AX, acc2 ADOXQ AX, acc2
ADCQ $0, acc3
MULXQ (8*3)(x_ptr), AX, acc4 MULXQ (8*3)(x_ptr), AX, acc4
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ $0, acc4 ADOXQ y_ptr, acc4
// y[2:] * y[1] // y[2:] * y[1]
MOVQ (8*1)(x_ptr), DX MOVQ (8*1)(x_ptr), DX
MULXQ (8*2)(x_ptr), AX, t1 MULXQ (8*2)(x_ptr), AX, t1
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ t1, acc4
MULXQ (8*3)(x_ptr), AX, acc5 MULXQ (8*3)(x_ptr), AX, acc5
ADCQ $0, acc5 ADCXQ t1, AX
ADDQ AX, acc4 ADOXQ AX, acc4
ADCQ $0, acc5 ADCXQ y_ptr, acc5
// y[3] * y[2] // y[3] * y[2]
MOVQ (8*2)(x_ptr), DX MOVQ (8*2)(x_ptr), DX
MULXQ (8*3)(x_ptr), AX, y_ptr MULXQ (8*3)(x_ptr), AX, y_ptr
ADDQ AX, acc5 ADOXQ AX, acc5
ADCQ $0, y_ptr ADOXQ acc0, y_ptr
XORQ t1, t1 XORQ t1, t1
// *2 // *2
ADDQ acc1, acc1 ADOXQ acc1, acc1
ADCQ acc2, acc2 ADOXQ acc2, acc2
ADCQ acc3, acc3 ADOXQ acc3, acc3
ADCQ acc4, acc4 ADOXQ acc4, acc4
ADCQ acc5, acc5 ADOXQ acc5, acc5
ADCQ y_ptr, y_ptr ADOXQ y_ptr, y_ptr
ADCQ $0, t1 ADOXQ acc0, t1
// Missing products // Missing products
MOVQ (8*0)(x_ptr), DX MOVQ (8*0)(x_ptr), DX
MULXQ DX, acc0, t0 MULXQ DX, acc0, t0
ADDQ t0, acc1 ADCXQ t0, acc1
MOVQ (8*1)(x_ptr), DX MOVQ (8*1)(x_ptr), DX
MULXQ DX, AX, t0 MULXQ DX, AX, t0
ADCQ AX, acc2 ADCXQ AX, acc2
ADCQ t0, acc3 ADCXQ t0, acc3
MOVQ (8*2)(x_ptr), DX MOVQ (8*2)(x_ptr), DX
MULXQ DX, AX, t0 MULXQ DX, AX, t0
ADCQ AX, acc4 ADCXQ AX, acc4
ADCQ t0, acc5 ADCXQ t0, acc5
MOVQ (8*3)(x_ptr), DX MOVQ (8*3)(x_ptr), DX
MULXQ DX, AX, x_ptr MULXQ DX, AX, x_ptr
ADCQ AX, y_ptr ADCXQ AX, y_ptr
ADCQ t1, x_ptr ADCXQ t1, x_ptr
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0] // T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
// First reduction step, [ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0] // First reduction step
MOVQ acc0, DX MOVQ acc0, DX
MULXQ p256ordK0<>(SB), t0, AX MULXQ p256ordK0<>(SB), DX, AX
// calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0]
// the result is [acc0, acc3, acc2, acc1], last lowest limb is dropped. MULXQ p256ord<>+0x00(SB), AX, t0
MOVQ t0, DX // Y = t0 = (k0 * acc0) mod 2^64 ADOXQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0
MULXQ p256ord<>+0x00(SB), AX, t1
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
ADCQ t1, acc1 // (carry2, acc1) = acc1 + H(t0 * ord0) + carry1
MOVQ t0, acc0 // acc0 = t0
MULXQ p256ord<>+0x08(SB), AX, t1 MULXQ p256ord<>+0x08(SB), AX, t1
ADCQ $0, t1 // t1 = carry2 + H(t0*ord1) ADCXQ t0, AX
ADDQ AX, acc1 // (carry3, acc1) = acc1 + L(t0*ord1) ADOXQ AX, acc1
ADCQ t1, acc2 // (carry4, acc2) = acc2 + t1 + carry3
ADCQ $0, acc3 // (carry5, acc3) = acc3 + carry4
ADCQ $0, acc0 // acc0 = t0 + carry5
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
MOVQ t0, AX
//MOVQ t0, DX // This is not required due to t0=DX already
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc2 MULXQ p256ord<>+0x10(SB), AX, t0
SBBQ AX, acc3 ADCXQ t1, AX
SBBQ DX, acc0 ADOXQ AX, acc2
MULXQ p256ord<>+0x18(SB), AX, acc0
ADCXQ t0, AX
ADOXQ AX, acc3
MOVQ $0, t0
ADCXQ t0, acc0
ADOXQ t0, acc0
// Second reduction step // Second reduction step
MOVQ acc1, DX MOVQ acc1, DX
MULXQ p256ordK0<>(SB), t0, AX MULXQ p256ordK0<>(SB), DX, AX
MOVQ t0, DX MULXQ p256ord<>+0x00(SB), AX, t0
MULXQ p256ord<>+0x00(SB), AX, t1 ADOXQ AX, acc1
ADDQ AX, acc1
ADCQ t1, acc2
MOVQ t0, acc1
MULXQ p256ord<>+0x08(SB), AX, t1 MULXQ p256ord<>+0x08(SB), AX, t1
ADCQ $0, t1 ADCXQ t0, AX
ADDQ AX, acc2 ADOXQ AX, acc2
ADCQ t1, acc3
ADCQ $0, acc0
ADCQ $0, acc1
MOVQ t0, AX MULXQ p256ord<>+0x10(SB), AX, t0
//MOVQ t0, DX // This is not required due to t0=DX already ADCXQ t1, AX
SHLQ $32, AX ADOXQ AX, acc3
SHRQ $32, DX
MULXQ p256ord<>+0x18(SB), AX, acc1
ADCXQ t0, AX
ADOXQ AX, acc0
MOVQ $0, t0
ADCXQ t0, acc1
ADOXQ t0, acc1
SUBQ t0, acc3
SBBQ AX, acc0
SBBQ DX, acc1
// Third reduction step // Third reduction step
MOVQ acc2, DX MOVQ acc2, DX
MULXQ p256ordK0<>(SB), t0, AX MULXQ p256ordK0<>(SB), DX, AX
MOVQ t0, DX MULXQ p256ord<>+0x00(SB), AX, t0
MULXQ p256ord<>+0x00(SB), AX, t1 ADOXQ AX, acc2
ADDQ AX, acc2
ADCQ t1, acc3
MOVQ t0, acc2
MULXQ p256ord<>+0x08(SB), AX, t1 MULXQ p256ord<>+0x08(SB), AX, t1
ADCQ $0, t1 ADCXQ t0, AX
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ t1, acc0
ADCQ $0, acc1
ADCQ $0, acc2
MOVQ t0, AX MULXQ p256ord<>+0x10(SB), AX, t0
//MOVQ t0, DX // This is not required due to t0=DX already ADCXQ t1, AX
SHLQ $32, AX ADOXQ AX, acc0
SHRQ $32, DX
MULXQ p256ord<>+0x18(SB), AX, acc2
ADCXQ t0, AX
ADOXQ AX, acc1
MOVQ $0, t0
ADCXQ t0, acc2
ADOXQ t0, acc2
SUBQ t0, acc0
SBBQ AX, acc1
SBBQ DX, acc2
// Last reduction step // Last reduction step
MOVQ acc3, DX MOVQ acc3, DX
MULXQ p256ordK0<>(SB), t0, AX MULXQ p256ordK0<>(SB), DX, AX
MOVQ t0, DX MULXQ p256ord<>+0x00(SB), AX, t0
MULXQ p256ord<>+0x00(SB), AX, t1 ADOXQ AX, acc3
ADDQ AX, acc3
ADCQ t1, acc0
MOVQ t0, acc3
MULXQ p256ord<>+0x08(SB), AX, t1 MULXQ p256ord<>+0x08(SB), AX, t1
ADCQ $0, t1 ADCXQ t0, AX
ADDQ AX, acc0 ADOXQ AX, acc0
ADCQ t1, acc1
ADCQ $0, acc2
ADCQ $0, acc3
MOVQ t0, AX MULXQ p256ord<>+0x10(SB), AX, t0
//MOVQ t0, DX // This is not required due to t0=DX already ADCXQ t1, AX
SHLQ $32, AX ADOXQ AX, acc1
SHRQ $32, DX
SUBQ t0, acc1 MULXQ p256ord<>+0x18(SB), AX, acc3
SBBQ AX, acc2 ADCXQ t0, AX
SBBQ DX, acc3 ADOXQ AX, acc2
MOVQ $0, t0
ADCXQ t0, acc3
ADOXQ t0, acc3
XORQ t0, t0 XORQ t1, t1
// Add bits [511:256] of the sqr result // Add bits [511:256] of the sqr result
ADCQ acc4, acc0 ADCXQ acc4, acc0
ADCQ acc5, acc1 ADCXQ acc5, acc1
ADCQ y_ptr, acc2 ADCXQ y_ptr, acc2
ADCQ x_ptr, acc3 ADCXQ x_ptr, acc3
ADCQ $0, t0 ADCXQ t1, t0
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr) p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
MOVQ res_ptr, x_ptr MOVQ res_ptr, x_ptr

View File

@ -0,0 +1,154 @@
//go:build (amd64 && !purego) || (arm64 && !purego)
package sm2ec
import (
"crypto/rand"
"io"
"math/big"
"testing"
"time"
)
// ordFromBig copies the machine words of v (least-significant word first, as
// returned by big.Int.Bits) into out, after zeroing every limb of out so that
// limbs v does not cover end up as 0.
//
// v must fit in len(out) words; a larger value makes the indexed store below
// panic with an out-of-range index. The plain Word-to-uint64 conversion
// presumably relies on 64-bit words, which matches the amd64/arm64 build
// constraint at the top of this file.
func ordFromBig(out *p256OrdElement, v *big.Int) {
	// The parameter is deliberately not called "big": that name would shadow
	// the math/big package inside this function.
	for i := range out {
		out[i] = 0
	}
	for i, w := range v.Bits() {
		out[i] = uint64(w)
	}
}
// p256OrderSqrTest checks one squaring done by p256OrdSqr against math/big.
//
// It computes x*r mod p (the Montgomery form of x when r is the Montgomery
// radix, as the callers in this file pass it), loads that into a
// p256OrdElement, calls p256OrdSqr with a count of 1, and interprets the bytes
// returned by p256OrderFromMont as a big-endian integer. That integer must
// equal x*x mod p; otherwise the test is failed immediately with the input,
// the actual value and the expected value.
func p256OrderSqrTest(t *testing.T, x, p, r *big.Int) {
	// Report failures at the caller's line, not inside this helper.
	t.Helper()

	xMont := new(big.Int).Mul(x, r)
	xMont.Mod(xMont, p)

	ax := new(p256OrdElement)
	res2 := new(p256OrdElement)
	ordFromBig(ax, xMont)
	p256OrdSqr(res2, ax, 1)
	got := new(big.Int).SetBytes(p256OrderFromMont(res2))

	want := new(big.Int).Mul(x, x)
	want.Mod(want, p)

	if got.Cmp(want) != 0 {
		t.Fatalf("p256OrdSqr(%x): got %x, want %x", x, got, want)
	}
}
// TestP256OrdSqrOrdMinus1 runs the squaring check on p-1, the largest value
// below the modulus p used throughout this file.
func TestP256OrdSqrOrdMinus1(t *testing.T) {
	var (
		// Both literals are fixed, valid hex, so the ok result is not inspected.
		p, _ = new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
		r, _ = new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
	)
	one := big.NewInt(1)
	p256OrderSqrTest(t, new(big.Int).Sub(p, one), p, r)
}
// TestFuzzyP256OrdSqr repeatedly draws 32 random bytes, interprets them as an
// integer and runs p256OrderSqrTest on it, until a time budget expires: 10ms
// when testing.Short() is set, 2s otherwise.
func TestFuzzyP256OrdSqr(t *testing.T) {
	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)

	budget := 2 * time.Second
	if testing.Short() {
		budget = 10 * time.Millisecond
	}
	timeout := time.NewTimer(budget)
	// Release the timer if the test exits early through a failure.
	defer timeout.Stop()

	var scalar1 [32]byte
	for {
		// Non-blocking poll: stop as soon as the budget has elapsed.
		select {
		case <-timeout.C:
			return
		default:
		}

		// A failed read would leave stale bytes in scalar1 and silently
		// re-test the previous input, so treat it as fatal.
		if _, err := io.ReadFull(rand.Reader, scalar1[:]); err != nil {
			t.Fatalf("reading random scalar: %v", err)
		}
		x := new(big.Int).SetBytes(scalar1[:])
		p256OrderSqrTest(t, x, p, r)
	}
}
// BenchmarkP25OrdSqr times p256OrdSqr called with a count argument of 20 on a
// random operand that has first been mapped to x*r mod p.
//
// NOTE(review): the name reads "P25", probably meant "P256"; it is kept
// unchanged so existing -bench patterns keep matching.
func BenchmarkP25OrdSqr(b *testing.B) {
	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)

	var scalar1 [32]byte
	// Do not benchmark on an all-zero operand if the random source fails.
	if _, err := io.ReadFull(rand.Reader, scalar1[:]); err != nil {
		b.Fatalf("reading random scalar: %v", err)
	}
	x := new(big.Int).SetBytes(scalar1[:])
	xMont := new(big.Int).Mul(x, r)
	xMont.Mod(xMont, p)

	ax := new(p256OrdElement)
	res := new(p256OrdElement)
	ordFromBig(ax, xMont)

	// Exclude the big.Int setup above from the measurement.
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		p256OrdSqr(res, ax, 20)
	}
}
// p256OrdMulTest checks one multiplication done by p256OrdMul against
// math/big.
//
// x and y are each mapped to v*r mod p (their Montgomery forms when r is the
// Montgomery radix, as the callers in this file pass it), loaded into
// p256OrdElement values and multiplied with p256OrdMul. The bytes returned by
// p256OrderFromMont for the product are read as a big-endian integer, which
// must equal x*y mod p; otherwise the test is failed immediately with both
// inputs, the actual value and the expected value.
func p256OrdMulTest(t *testing.T, x, y, p, r *big.Int) {
	// Report failures at the caller's line, not inside this helper.
	t.Helper()

	xMont := new(big.Int).Mul(x, r)
	xMont.Mod(xMont, p)
	yMont := new(big.Int).Mul(y, r)
	yMont.Mod(yMont, p)

	ax := new(p256OrdElement)
	ay := new(p256OrdElement)
	res2 := new(p256OrdElement)
	ordFromBig(ax, xMont)
	ordFromBig(ay, yMont)
	p256OrdMul(res2, ax, ay)
	got := new(big.Int).SetBytes(p256OrderFromMont(res2))

	want := new(big.Int).Mul(x, y)
	want.Mod(want, p)

	if got.Cmp(want) != 0 {
		t.Fatalf("p256OrdMul(%x, %x): got %x, want %x", x, y, got, want)
	}
}
// TestP256OrdMulOrdMinus1 runs the multiplication check with both operands set
// to p-1, the largest value below the modulus p used throughout this file.
func TestP256OrdMulOrdMinus1(t *testing.T) {
	var (
		// Both literals are fixed, valid hex, so the ok result is not inspected.
		p, _ = new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
		r, _ = new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
	)
	operand := new(big.Int).Sub(p, big.NewInt(1))
	p256OrdMulTest(t, operand, operand, p, r)
}
// TestFuzzyP256OrdMul repeatedly draws two independent 32-byte random values,
// interprets them as integers and runs p256OrdMulTest on the pair, until a
// time budget expires: 10ms when testing.Short() is set, 2s otherwise.
func TestFuzzyP256OrdMul(t *testing.T) {
	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)

	budget := 2 * time.Second
	if testing.Short() {
		budget = 10 * time.Millisecond
	}
	timeout := time.NewTimer(budget)
	// Release the timer if the test exits early through a failure.
	defer timeout.Stop()

	var scalar1, scalar2 [32]byte
	for {
		// Non-blocking poll: stop as soon as the budget has elapsed.
		select {
		case <-timeout.C:
			return
		default:
		}

		// A failed read would leave stale bytes in the buffers and silently
		// re-test previous inputs, so treat it as fatal.
		if _, err := io.ReadFull(rand.Reader, scalar1[:]); err != nil {
			t.Fatalf("reading first random scalar: %v", err)
		}
		if _, err := io.ReadFull(rand.Reader, scalar2[:]); err != nil {
			t.Fatalf("reading second random scalar: %v", err)
		}
		x := new(big.Int).SetBytes(scalar1[:])
		y := new(big.Int).SetBytes(scalar2[:])
		p256OrdMulTest(t, x, y, p, r)
	}
}
// BenchmarkP25OrdMul times p256OrdMul multiplying a random operand, first
// mapped to x*r mod p, by itself.
//
// NOTE(review): the name reads "P25", probably meant "P256"; it is kept
// unchanged so existing -bench patterns keep matching.
func BenchmarkP25OrdMul(b *testing.B) {
	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)

	var scalar1 [32]byte
	// Do not benchmark on an all-zero operand if the random source fails.
	if _, err := io.ReadFull(rand.Reader, scalar1[:]); err != nil {
		b.Fatalf("reading random scalar: %v", err)
	}
	x := new(big.Int).SetBytes(scalar1[:])
	xMont := new(big.Int).Mul(x, r)
	xMont.Mod(xMont, p)

	ax := new(p256OrdElement)
	res := new(p256OrdElement)
	ordFromBig(ax, xMont)

	// Exclude the big.Int setup above from the measurement.
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		p256OrdMul(res, ax, ax)
	}
}

View File

@ -876,7 +876,6 @@ TEXT ·p256OrdReduce(SB),NOSPLIT,$0
// func p256OrdMul(res, in1, in2 *p256OrdElement) // func p256OrdMul(res, in1, in2 *p256OrdElement)
TEXT ·p256OrdMul(SB),NOSPLIT,$0 TEXT ·p256OrdMul(SB),NOSPLIT,$0
MOVQ res+0(FP), res_ptr
MOVQ in1+8(FP), x_ptr MOVQ in1+8(FP), x_ptr
MOVQ in2+16(FP), y_ptr MOVQ in2+16(FP), y_ptr
CMPB ·supportBMI2+0(SB), $0x01 CMPB ·supportBMI2+0(SB), $0x01
@ -1125,203 +1124,187 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
SBBQ DX, acc1 SBBQ DX, acc1
SBBQ $0, acc2 SBBQ $0, acc2
MOVQ res+0(FP), res_ptr
p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr) p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
RET RET
ordMulBMI2: ordMulBMI2:
XORQ acc5, acc5
XORQ res_ptr, res_ptr
// x * y[0] // x * y[0]
MOVQ (8*0)(y_ptr), DX MOVQ (8*0)(y_ptr), DX
MULXQ (8*0)(x_ptr), acc0, acc1 MULXQ (8*0)(x_ptr), acc0, acc1
MULXQ (8*1)(x_ptr), AX, acc2 MULXQ (8*1)(x_ptr), AX, acc2
ADDQ AX, acc1 ADCXQ AX, acc1
ADCQ $0, acc2
MULXQ (8*2)(x_ptr), AX, acc3 MULXQ (8*2)(x_ptr), AX, acc3
ADDQ AX, acc2 ADCXQ AX, acc2
ADCQ $0, acc3
MULXQ (8*3)(x_ptr), AX, acc4 MULXQ (8*3)(x_ptr), AX, acc4
ADDQ AX, acc3 ADCXQ AX, acc3
ADCQ $0, acc4 ADCXQ acc5, acc4
XORQ acc5, acc5
// First reduction step // First reduction step
MOVQ acc0, DX MOVQ acc0, DX
MULXQ p256ordK0<>(SB), t0, AX MULXQ p256ordK0<>(SB), DX, AX
MOVQ t0, DX MULXQ p256ord<>+0x00(SB), AX, t0
MULXQ p256ord<>+0x00(SB), AX, BX ADOXQ AX, acc0
ADDQ AX, acc0
ADCQ BX, acc1
MULXQ p256ord<>+0x08(SB), AX, BX MULXQ p256ord<>+0x08(SB), AX, BX
ADCQ $0, BX ADCXQ t0, AX
ADDQ AX, acc1 ADOXQ AX, acc1
ADCQ BX, acc2
ADCQ $0, acc3
ADCQ t0, acc4
ADCQ $0, acc5
MOVQ t0, AX MULXQ p256ord<>+0x10(SB), AX, t0
//MOVQ t0, DX // This is not required due to t0=DX already ADCXQ BX, AX
SHLQ $32, AX ADOXQ AX, acc2
SHRQ $32, DX
MULXQ p256ord<>+0x18(SB), AX, BX
SUBQ t0, acc2 ADCXQ t0, AX
SBBQ AX, acc3 ADOXQ AX, acc3
SBBQ DX, acc4
SBBQ $0, acc5 ADCXQ res_ptr, BX
ADOXQ BX, acc4
ADOXQ res_ptr, acc5
XORQ acc0, acc0
// x * y[1] // x * y[1]
MOVQ (8*1)(y_ptr), DX MOVQ (8*1)(y_ptr), DX
MULXQ (8*0)(x_ptr), AX, BX MULXQ (8*0)(x_ptr), AX, t0
ADDQ AX, acc1 ADOXQ AX, acc1
ADCQ BX, acc2
MULXQ (8*1)(x_ptr), AX, BX MULXQ (8*1)(x_ptr), AX, BX
ADCQ $0, BX ADCXQ t0, AX
ADDQ AX, acc2 ADOXQ AX, acc2
ADCQ BX, acc3
MULXQ (8*2)(x_ptr), AX, BX MULXQ (8*2)(x_ptr), AX, t0
ADCQ $0, BX ADCXQ BX, AX
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ BX, acc4
MULXQ (8*3)(x_ptr), AX, BX MULXQ (8*3)(x_ptr), AX, BX
ADCQ $0, BX ADCXQ t0, AX
ADDQ AX, acc4 ADOXQ AX, acc4
ADCQ BX, acc5
ADCQ $0, acc0 ADCXQ acc0, BX
ADOXQ BX, acc5
ADOXQ res_ptr, acc0
// Second reduction step // Second reduction step
MOVQ acc1, DX MOVQ acc1, DX
MULXQ p256ordK0<>(SB), t0, AX MULXQ p256ordK0<>(SB), DX, AX
MOVQ t0, DX MULXQ p256ord<>+0x00(SB), AX, t0
MULXQ p256ord<>+0x00(SB), AX, BX ADOXQ AX, acc1
ADDQ AX, acc1
ADCQ BX, acc2
MULXQ p256ord<>+0x08(SB), AX, BX MULXQ p256ord<>+0x08(SB), AX, BX
ADCQ $0, BX ADCXQ t0, AX
ADDQ AX, acc2 ADOXQ AX, acc2
ADCQ BX, acc3
ADCQ $0, acc4
ADCQ t0, acc5
ADCQ $0, acc0
MOVQ t0, AX MULXQ p256ord<>+0x10(SB), AX, t0
//MOVQ t0, DX // This is not required due to t0=DX already ADCXQ BX, AX
SHLQ $32, AX ADOXQ AX, acc3
SHRQ $32, DX
MULXQ p256ord<>+0x18(SB), AX, BX
SUBQ t0, acc3 ADCXQ t0, AX
SBBQ AX, acc4 ADOXQ AX, acc4
SBBQ DX, acc5
SBBQ $0, acc0 ADCXQ res_ptr, BX
ADOXQ BX, acc5
ADOXQ res_ptr, acc0
XORQ acc1, acc1
// x * y[2] // x * y[2]
MOVQ (8*2)(y_ptr), DX MOVQ (8*2)(y_ptr), DX
MULXQ (8*0)(x_ptr), AX, BX MULXQ (8*0)(x_ptr), AX, t0
ADDQ AX, acc2 ADOXQ AX, acc2
ADCQ BX, acc3
MULXQ (8*1)(x_ptr), AX, BX MULXQ (8*1)(x_ptr), AX, BX
ADCQ $0, BX ADCXQ t0, AX
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ BX, acc4
MULXQ (8*2)(x_ptr), AX, BX MULXQ (8*2)(x_ptr), AX, t0
ADCQ $0, BX ADCXQ BX, AX
ADDQ AX, acc4 ADOXQ AX, acc4
ADCQ BX, acc5
MULXQ (8*3)(x_ptr), AX, BX MULXQ (8*3)(x_ptr), AX, BX
ADCQ $0, BX ADCXQ t0, AX
ADDQ AX, acc5 ADOXQ AX, acc5
ADCQ BX, acc0
ADCQ $0, acc1 ADCXQ res_ptr, BX
ADOXQ BX, acc0
ADOXQ res_ptr, acc1
// Third reduction step // Third reduction step
MOVQ acc2, DX MOVQ acc2, DX
MULXQ p256ordK0<>(SB), t0, AX MULXQ p256ordK0<>(SB), DX, AX
MOVQ t0, DX MULXQ p256ord<>+0x00(SB), AX, t0
MULXQ p256ord<>+0x00(SB), AX, BX ADOXQ AX, acc2
ADDQ AX, acc2
ADCQ BX, acc3
MULXQ p256ord<>+0x08(SB), AX, BX MULXQ p256ord<>+0x08(SB), AX, BX
ADCQ $0, BX ADCXQ t0, AX
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ BX, acc4
ADCQ $0, acc5
ADCQ t0, acc0
ADCQ $0, acc1
MOVQ t0, AX MULXQ p256ord<>+0x10(SB), AX, t0
//MOVQ t0, DX // This is not required due to t0=DX already ADCXQ BX, AX
SHLQ $32, AX ADOXQ AX, acc4
SHRQ $32, DX
MULXQ p256ord<>+0x18(SB), AX, BX
SUBQ t0, acc4 ADCXQ t0, AX
SBBQ AX, acc5 ADOXQ AX, acc5
SBBQ DX, acc0
SBBQ $0, acc1 ADCXQ res_ptr, BX
ADOXQ BX, acc0
ADOXQ res_ptr, acc1
XORQ acc2, acc2
// x * y[3] // x * y[3]
MOVQ (8*3)(y_ptr), DX MOVQ (8*3)(y_ptr), DX
MULXQ (8*0)(x_ptr), AX, BX MULXQ (8*0)(x_ptr), AX, t0
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ BX, acc4
MULXQ (8*1)(x_ptr), AX, BX MULXQ (8*1)(x_ptr), AX, BX
ADCQ $0, BX ADCXQ t0, AX
ADDQ AX, acc4 ADOXQ AX, acc4
ADCQ BX, acc5
MULXQ (8*2)(x_ptr), AX, BX MULXQ (8*2)(x_ptr), AX, t0
ADCQ $0, BX ADCXQ BX, AX
ADDQ AX, acc5 ADOXQ AX, acc5
ADCQ BX, acc0
MULXQ (8*3)(x_ptr), AX, BX MULXQ (8*3)(x_ptr), AX, BX
ADCQ $0, BX ADCXQ t0, AX
ADDQ AX, acc0 ADOXQ AX, acc0
ADCQ BX, acc1
ADCQ $0, acc2 ADCXQ res_ptr, BX
ADOXQ BX, acc1
ADOXQ res_ptr, acc2
// Last reduction step // Last reduction step
MOVQ acc3, DX MOVQ acc3, DX
MULXQ p256ordK0<>(SB), t0, AX MULXQ p256ordK0<>(SB), DX, AX
MOVQ t0, DX MULXQ p256ord<>+0x00(SB), AX, t0
MULXQ p256ord<>+0x00(SB), AX, BX ADOXQ AX, acc3
ADDQ AX, acc3
ADCQ BX, acc4
MULXQ p256ord<>+0x08(SB), AX, BX MULXQ p256ord<>+0x08(SB), AX, BX
ADCQ $0, BX ADCXQ t0, AX
ADDQ AX, acc4 ADOXQ AX, acc4
ADCQ BX, acc5
ADCQ $0, acc0
ADCQ t0, acc1
ADCQ $0, acc2
MOVQ t0, AX MULXQ p256ord<>+0x10(SB), AX, t0
//MOVQ t0, DX // This is not required due to t0=DX already ADCXQ BX, AX
SHLQ $32, AX ADOXQ AX, acc5
SHRQ $32, DX
SUBQ t0, acc5
SBBQ AX, acc0
SBBQ DX, acc1
SBBQ $0, acc2
MULXQ p256ord<>+0x18(SB), AX, BX
ADCXQ t0, AX
ADOXQ AX, acc0
ADCXQ res_ptr, BX
ADOXQ BX, acc1
ADOXQ res_ptr, acc2
MOVQ res+0(FP), res_ptr
p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr) p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
RET RET

View File

@ -406,176 +406,161 @@ ordSqrLoop:
RET RET
ordSqrLoopBMI2: ordSqrLoopBMI2:
XORQ acc0, acc0
XORQ y_ptr, y_ptr
// y[1:] * y[0] // y[1:] * y[0]
MOVQ (8*0)(x_ptr), DX MOVQ (8*0)(x_ptr), DX
MULXQ (8*1)(x_ptr), acc1, acc2 MULXQ (8*1)(x_ptr), acc1, acc2
MULXQ (8*2)(x_ptr), AX, acc3 MULXQ (8*2)(x_ptr), AX, acc3
ADDQ AX, acc2 ADOXQ AX, acc2
ADCQ $0, acc3
MULXQ (8*3)(x_ptr), AX, acc4 MULXQ (8*3)(x_ptr), AX, acc4
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ $0, acc4 ADOXQ y_ptr, acc4
// y[2:] * y[1] // y[2:] * y[1]
MOVQ (8*1)(x_ptr), DX MOVQ (8*1)(x_ptr), DX
MULXQ (8*2)(x_ptr), AX, BX MULXQ (8*2)(x_ptr), AX, BX
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ BX, acc4
MULXQ (8*3)(x_ptr), AX, acc5 MULXQ (8*3)(x_ptr), AX, acc5
ADCQ $0, acc5 ADCXQ BX, AX
ADDQ AX, acc4 ADOXQ AX, acc4
ADCQ $0, acc5 ADCXQ y_ptr, acc5
// y[3] * y[2] // y[3] * y[2]
MOVQ (8*2)(x_ptr), DX MOVQ (8*2)(x_ptr), DX
MULXQ (8*3)(x_ptr), AX, y_ptr MULXQ (8*3)(x_ptr), AX, y_ptr
ADDQ AX, acc5 ADOXQ AX, acc5
ADCQ $0, y_ptr ADOXQ acc0, y_ptr
XORQ BX, BX XORQ BX, BX
// *2 // *2
ADDQ acc1, acc1 ADOXQ acc1, acc1
ADCQ acc2, acc2 ADOXQ acc2, acc2
ADCQ acc3, acc3 ADOXQ acc3, acc3
ADCQ acc4, acc4 ADOXQ acc4, acc4
ADCQ acc5, acc5 ADOXQ acc5, acc5
ADCQ y_ptr, y_ptr ADOXQ y_ptr, y_ptr
ADCQ $0, BX ADOXQ acc0, BX
// Missing products // Missing products
MOVQ (8*0)(x_ptr), DX MOVQ (8*0)(x_ptr), DX
MULXQ DX, acc0, t0 MULXQ DX, acc0, t0
ADDQ t0, acc1 ADCXQ t0, acc1
MOVQ (8*1)(x_ptr), DX MOVQ (8*1)(x_ptr), DX
MULXQ DX, AX, t0 MULXQ DX, AX, t0
ADCQ AX, acc2 ADCXQ AX, acc2
ADCQ t0, acc3 ADCXQ t0, acc3
MOVQ (8*2)(x_ptr), DX MOVQ (8*2)(x_ptr), DX
MULXQ DX, AX, t0 MULXQ DX, AX, t0
ADCQ AX, acc4 ADCXQ AX, acc4
ADCQ t0, acc5 ADCXQ t0, acc5
MOVQ (8*3)(x_ptr), DX MOVQ (8*3)(x_ptr), DX
MULXQ DX, AX, x_ptr MULXQ DX, AX, x_ptr
ADCQ AX, y_ptr ADCXQ AX, y_ptr
ADCQ BX, x_ptr ADCXQ BX, x_ptr
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0] // T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
// First reduction step, [ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0] // First reduction step
MOVQ acc0, DX MOVQ acc0, DX
MULXQ p256ordK0<>(SB), t0, AX MULXQ p256ordK0<>(SB), DX, AX
// calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0]
// the result is [acc0, acc3, acc2, acc1], last lowest limb is dropped. MULXQ p256ord<>+0x00(SB), AX, t0
MOVQ t0, DX // Y = t0 = (k0 * acc0) mod 2^64 ADOXQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0
MULXQ p256ord<>+0x00(SB), AX, BX
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
ADCQ BX, acc1 // (carry2, acc1) = acc1 + H(t0 * ord0) + carry1
MOVQ t0, acc0 // acc0 = t0
MULXQ p256ord<>+0x08(SB), AX, BX MULXQ p256ord<>+0x08(SB), AX, BX
ADCQ $0, BX // BX = carry2 + H(t0*ord1) ADCXQ t0, AX
ADDQ AX, acc1 // (carry3, acc1) = acc1 + L(t0*ord1) ADOXQ AX, acc1
ADCQ BX, acc2 // (carry4, acc2) = acc2 + BX + carry3
ADCQ $0, acc3 // (carry5, acc3) = acc3 + carry4
ADCQ $0, acc0 // acc0 = t0 + carry5
// calculate the positive part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
MOVQ t0, AX
//MOVQ t0, DX // This is not required due to t0=DX already
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc2 MULXQ p256ord<>+0x10(SB), AX, t0
SBBQ AX, acc3 ADCXQ BX, AX
SBBQ DX, acc0 ADOXQ AX, acc2
MULXQ p256ord<>+0x18(SB), AX, acc0
ADCXQ t0, AX
ADOXQ AX, acc3
MOVQ $0, t0
ADCXQ t0, acc0
ADOXQ t0, acc0
// Second reduction step // Second reduction step
MOVQ acc1, DX MOVQ acc1, DX
MULXQ p256ordK0<>(SB), t0, AX MULXQ p256ordK0<>(SB), DX, AX
MOVQ t0, DX MULXQ p256ord<>+0x00(SB), AX, t0
MULXQ p256ord<>+0x00(SB), AX, BX ADOXQ AX, acc1
ADDQ AX, acc1
ADCQ BX, acc2
MOVQ t0, acc1
MULXQ p256ord<>+0x08(SB), AX, BX MULXQ p256ord<>+0x08(SB), AX, BX
ADCQ $0, BX ADCXQ t0, AX
ADDQ AX, acc2 ADOXQ AX, acc2
ADCQ BX, acc3
ADCQ $0, acc0
ADCQ $0, acc1
MOVQ t0, AX MULXQ p256ord<>+0x10(SB), AX, t0
//MOVQ t0, DX // This is not required due to t0=DX already ADCXQ BX, AX
SHLQ $32, AX ADOXQ AX, acc3
SHRQ $32, DX
MULXQ p256ord<>+0x18(SB), AX, acc1
ADCXQ t0, AX
ADOXQ AX, acc0
MOVQ $0, t0
ADCXQ t0, acc1
ADOXQ t0, acc1
SUBQ t0, acc3
SBBQ AX, acc0
SBBQ DX, acc1
// Third reduction step // Third reduction step
MOVQ acc2, DX MOVQ acc2, DX
MULXQ p256ordK0<>(SB), t0, AX MULXQ p256ordK0<>(SB), DX, AX
MOVQ t0, DX MULXQ p256ord<>+0x00(SB), AX, t0
MULXQ p256ord<>+0x00(SB), AX, BX ADOXQ AX, acc2
ADDQ AX, acc2
ADCQ BX, acc3
MOVQ t0, acc2
MULXQ p256ord<>+0x08(SB), AX, BX MULXQ p256ord<>+0x08(SB), AX, BX
ADCQ $0, BX ADCXQ t0, AX
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ BX, acc0
ADCQ $0, acc1
ADCQ $0, acc2
MOVQ t0, AX MULXQ p256ord<>+0x10(SB), AX, t0
//MOVQ t0, DX // This is not required due to t0=DX already ADCXQ BX, AX
SHLQ $32, AX ADOXQ AX, acc0
SHRQ $32, DX
MULXQ p256ord<>+0x18(SB), AX, acc2
ADCXQ t0, AX
ADOXQ AX, acc1
MOVQ $0, t0
ADCXQ t0, acc2
ADOXQ t0, acc2
SUBQ t0, acc0
SBBQ AX, acc1
SBBQ DX, acc2
// Last reduction step // Last reduction step
MOVQ acc3, DX MOVQ acc3, DX
MULXQ p256ordK0<>(SB), t0, AX MULXQ p256ordK0<>(SB), DX, AX
MOVQ t0, DX MULXQ p256ord<>+0x00(SB), AX, t0
MULXQ p256ord<>+0x00(SB), AX, BX ADOXQ AX, acc3
ADDQ AX, acc3
ADCQ BX, acc0
MOVQ t0, acc3
MULXQ p256ord<>+0x08(SB), AX, BX MULXQ p256ord<>+0x08(SB), AX, BX
ADCQ $0, BX ADCXQ t0, AX
ADDQ AX, acc0 ADOXQ AX, acc0
ADCQ BX, acc1
ADCQ $0, acc2
ADCQ $0, acc3
MOVQ t0, AX MULXQ p256ord<>+0x10(SB), AX, t0
//MOVQ t0, DX // This is not required due to t0=DX already ADCXQ BX, AX
SHLQ $32, AX ADOXQ AX, acc1
SHRQ $32, DX
SUBQ t0, acc1 MULXQ p256ord<>+0x18(SB), AX, acc3
SBBQ AX, acc2 ADCXQ t0, AX
SBBQ DX, acc3 ADOXQ AX, acc2
MOVQ $0, t0
ADCXQ t0, acc3
ADOXQ t0, acc3
XORQ t0, t0 XORQ BX, BX
// Add bits [511:256] of the sqr result // Add bits [511:256] of the sqr result
ADCQ acc4, acc0 ADCXQ acc4, acc0
ADCQ acc5, acc1 ADCXQ acc5, acc1
ADCQ y_ptr, acc2 ADCXQ y_ptr, acc2
ADCQ x_ptr, acc3 ADCXQ x_ptr, acc3
ADCQ $0, t0 ADCXQ BX, t0
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr) p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr)
MOVQ res_ptr, x_ptr MOVQ res_ptr, x_ptr

View File

@ -302,7 +302,9 @@ func p256Sqrt(e, x *p256Element) (isSquare bool) {
} }
// The following assembly functions are implemented in p256_asm_*.s // The following assembly functions are implemented in p256_asm_*.s
var supportBMI2 = cpu.X86.HasBMI2
// amd64 assembly uses ADCX/ADOX/MULX
var supportBMI2 = cpu.X86.HasADX && cpu.X86.HasBMI2
var supportAVX2 = cpu.X86.HasAVX2 var supportAVX2 = cpu.X86.HasAVX2

View File

@ -83,6 +83,23 @@ func TestFuzzyP256Mul(t *testing.T) {
} }
} }
// BenchmarkP256Mul times p256Mul multiplying a random operand, first mapped
// to x*r mod p, by itself.
func BenchmarkP256Mul(b *testing.B) {
	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)

	var scalar1 [32]byte
	// Do not benchmark on an all-zero operand if the random source fails.
	if _, err := io.ReadFull(rand.Reader, scalar1[:]); err != nil {
		b.Fatalf("reading random scalar: %v", err)
	}
	x := new(big.Int).SetBytes(scalar1[:])
	xMont := new(big.Int).Mul(x, r)
	xMont.Mod(xMont, p)

	ax := new(p256Element)
	res := new(p256Element)
	fromBig(ax, xMont)

	// Exclude the big.Int setup above from the measurement.
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		p256Mul(res, ax, ax)
	}
}
func p256SqrTest(t *testing.T, x, p, r *big.Int) { func p256SqrTest(t *testing.T, x, p, r *big.Int) {
x1 := new(big.Int).Mul(x, r) x1 := new(big.Int).Mul(x, r)
x1 = x1.Mod(x1, p) x1 = x1.Mod(x1, p)
@ -142,6 +159,7 @@ func BenchmarkP256Sqr(b *testing.B) {
ax := new(p256Element) ax := new(p256Element)
res := new(p256Element) res := new(p256Element)
fromBig(ax, x1) fromBig(ax, x1)
b.ResetTimer()
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
p256Sqr(res, ax, 20) p256Sqr(res, ax, 20)
} }