mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-26 04:06:18 +08:00
internal/sm2ec: use ADCX/ADOX for order WWMM mul/sqr
This commit is contained in:
parent
052040fd82
commit
0996508b5b
@ -407,176 +407,161 @@ ordSqrLoop:
|
|||||||
RET
|
RET
|
||||||
|
|
||||||
ordSqrLoopBMI2:
|
ordSqrLoopBMI2:
|
||||||
|
XORQ acc0, acc0
|
||||||
|
XORQ y_ptr, y_ptr
|
||||||
// y[1:] * y[0]
|
// y[1:] * y[0]
|
||||||
MOVQ (8*0)(x_ptr), DX
|
MOVQ (8*0)(x_ptr), DX
|
||||||
MULXQ (8*1)(x_ptr), acc1, acc2
|
MULXQ (8*1)(x_ptr), acc1, acc2
|
||||||
|
|
||||||
MULXQ (8*2)(x_ptr), AX, acc3
|
MULXQ (8*2)(x_ptr), AX, acc3
|
||||||
ADDQ AX, acc2
|
ADOXQ AX, acc2
|
||||||
ADCQ $0, acc3
|
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, acc4
|
MULXQ (8*3)(x_ptr), AX, acc4
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ $0, acc4
|
ADOXQ y_ptr, acc4
|
||||||
|
|
||||||
// y[2:] * y[1]
|
// y[2:] * y[1]
|
||||||
MOVQ (8*1)(x_ptr), DX
|
MOVQ (8*1)(x_ptr), DX
|
||||||
MULXQ (8*2)(x_ptr), AX, t1
|
MULXQ (8*2)(x_ptr), AX, t1
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ t1, acc4
|
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, acc5
|
MULXQ (8*3)(x_ptr), AX, acc5
|
||||||
ADCQ $0, acc5
|
ADCXQ t1, AX
|
||||||
ADDQ AX, acc4
|
ADOXQ AX, acc4
|
||||||
ADCQ $0, acc5
|
ADCXQ y_ptr, acc5
|
||||||
|
|
||||||
// y[3] * y[2]
|
// y[3] * y[2]
|
||||||
MOVQ (8*2)(x_ptr), DX
|
MOVQ (8*2)(x_ptr), DX
|
||||||
MULXQ (8*3)(x_ptr), AX, y_ptr
|
MULXQ (8*3)(x_ptr), AX, y_ptr
|
||||||
ADDQ AX, acc5
|
ADOXQ AX, acc5
|
||||||
ADCQ $0, y_ptr
|
ADOXQ acc0, y_ptr
|
||||||
|
|
||||||
XORQ t1, t1
|
XORQ t1, t1
|
||||||
// *2
|
// *2
|
||||||
ADDQ acc1, acc1
|
ADOXQ acc1, acc1
|
||||||
ADCQ acc2, acc2
|
ADOXQ acc2, acc2
|
||||||
ADCQ acc3, acc3
|
ADOXQ acc3, acc3
|
||||||
ADCQ acc4, acc4
|
ADOXQ acc4, acc4
|
||||||
ADCQ acc5, acc5
|
ADOXQ acc5, acc5
|
||||||
ADCQ y_ptr, y_ptr
|
ADOXQ y_ptr, y_ptr
|
||||||
ADCQ $0, t1
|
ADOXQ acc0, t1
|
||||||
|
|
||||||
// Missing products
|
// Missing products
|
||||||
MOVQ (8*0)(x_ptr), DX
|
MOVQ (8*0)(x_ptr), DX
|
||||||
MULXQ DX, acc0, t0
|
MULXQ DX, acc0, t0
|
||||||
ADDQ t0, acc1
|
ADCXQ t0, acc1
|
||||||
|
|
||||||
MOVQ (8*1)(x_ptr), DX
|
MOVQ (8*1)(x_ptr), DX
|
||||||
MULXQ DX, AX, t0
|
MULXQ DX, AX, t0
|
||||||
ADCQ AX, acc2
|
ADCXQ AX, acc2
|
||||||
ADCQ t0, acc3
|
ADCXQ t0, acc3
|
||||||
|
|
||||||
MOVQ (8*2)(x_ptr), DX
|
MOVQ (8*2)(x_ptr), DX
|
||||||
MULXQ DX, AX, t0
|
MULXQ DX, AX, t0
|
||||||
ADCQ AX, acc4
|
ADCXQ AX, acc4
|
||||||
ADCQ t0, acc5
|
ADCXQ t0, acc5
|
||||||
|
|
||||||
MOVQ (8*3)(x_ptr), DX
|
MOVQ (8*3)(x_ptr), DX
|
||||||
MULXQ DX, AX, x_ptr
|
MULXQ DX, AX, x_ptr
|
||||||
ADCQ AX, y_ptr
|
ADCXQ AX, y_ptr
|
||||||
ADCQ t1, x_ptr
|
ADCXQ t1, x_ptr
|
||||||
|
|
||||||
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||||
// First reduction step, [ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0]
|
// First reduction step
|
||||||
MOVQ acc0, DX
|
MOVQ acc0, DX
|
||||||
MULXQ p256ordK0<>(SB), t0, AX
|
MULXQ p256ordK0<>(SB), DX, AX
|
||||||
// calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0]
|
|
||||||
// the result is [acc0, acc3, acc2, acc1], last lowest limb is dropped.
|
MULXQ p256ord<>+0x00(SB), AX, t0
|
||||||
MOVQ t0, DX // Y = t0 = (k0 * acc0) mod 2^64
|
ADOXQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0
|
||||||
MULXQ p256ord<>+0x00(SB), AX, t1
|
|
||||||
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
|
|
||||||
ADCQ t1, acc1 // (carry2, acc1) = acc1 + H(t0 * ord0) + carry1
|
|
||||||
MOVQ t0, acc0 // acc0 = t0
|
|
||||||
|
|
||||||
MULXQ p256ord<>+0x08(SB), AX, t1
|
MULXQ p256ord<>+0x08(SB), AX, t1
|
||||||
ADCQ $0, t1 // t1 = carry2 + H(t0*ord1)
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc1 // (carry3, acc1) = acc1 + L(t0*ord1)
|
ADOXQ AX, acc1
|
||||||
ADCQ t1, acc2 // (carry4, acc2) = acc2 + t1 + carry3
|
|
||||||
ADCQ $0, acc3 // (carry5, acc3) = acc3 + carry4
|
|
||||||
ADCQ $0, acc0 // acc0 = t0 + carry5
|
|
||||||
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
|
|
||||||
MOVQ t0, AX
|
|
||||||
//MOVQ t0, DX // This is not required due to t0=DX already
|
|
||||||
SHLQ $32, AX
|
|
||||||
SHRQ $32, DX
|
|
||||||
|
|
||||||
SUBQ t0, acc2
|
MULXQ p256ord<>+0x10(SB), AX, t0
|
||||||
SBBQ AX, acc3
|
ADCXQ t1, AX
|
||||||
SBBQ DX, acc0
|
ADOXQ AX, acc2
|
||||||
|
|
||||||
|
MULXQ p256ord<>+0x18(SB), AX, acc0
|
||||||
|
ADCXQ t0, AX
|
||||||
|
ADOXQ AX, acc3
|
||||||
|
MOVQ $0, t0
|
||||||
|
ADCXQ t0, acc0
|
||||||
|
ADOXQ t0, acc0
|
||||||
|
|
||||||
// Second reduction step
|
// Second reduction step
|
||||||
MOVQ acc1, DX
|
MOVQ acc1, DX
|
||||||
MULXQ p256ordK0<>(SB), t0, AX
|
MULXQ p256ordK0<>(SB), DX, AX
|
||||||
|
|
||||||
MOVQ t0, DX
|
MULXQ p256ord<>+0x00(SB), AX, t0
|
||||||
MULXQ p256ord<>+0x00(SB), AX, t1
|
ADOXQ AX, acc1
|
||||||
ADDQ AX, acc1
|
|
||||||
ADCQ t1, acc2
|
|
||||||
MOVQ t0, acc1
|
|
||||||
|
|
||||||
MULXQ p256ord<>+0x08(SB), AX, t1
|
MULXQ p256ord<>+0x08(SB), AX, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc2
|
ADOXQ AX, acc2
|
||||||
ADCQ t1, acc3
|
|
||||||
ADCQ $0, acc0
|
|
||||||
ADCQ $0, acc1
|
|
||||||
|
|
||||||
MOVQ t0, AX
|
MULXQ p256ord<>+0x10(SB), AX, t0
|
||||||
//MOVQ t0, DX // This is not required due to t0=DX already
|
ADCXQ t1, AX
|
||||||
SHLQ $32, AX
|
ADOXQ AX, acc3
|
||||||
SHRQ $32, DX
|
|
||||||
|
MULXQ p256ord<>+0x18(SB), AX, acc1
|
||||||
|
ADCXQ t0, AX
|
||||||
|
ADOXQ AX, acc0
|
||||||
|
MOVQ $0, t0
|
||||||
|
ADCXQ t0, acc1
|
||||||
|
ADOXQ t0, acc1
|
||||||
|
|
||||||
SUBQ t0, acc3
|
|
||||||
SBBQ AX, acc0
|
|
||||||
SBBQ DX, acc1
|
|
||||||
// Third reduction step
|
// Third reduction step
|
||||||
MOVQ acc2, DX
|
MOVQ acc2, DX
|
||||||
MULXQ p256ordK0<>(SB), t0, AX
|
MULXQ p256ordK0<>(SB), DX, AX
|
||||||
|
|
||||||
MOVQ t0, DX
|
MULXQ p256ord<>+0x00(SB), AX, t0
|
||||||
MULXQ p256ord<>+0x00(SB), AX, t1
|
ADOXQ AX, acc2
|
||||||
ADDQ AX, acc2
|
|
||||||
ADCQ t1, acc3
|
|
||||||
MOVQ t0, acc2
|
|
||||||
|
|
||||||
MULXQ p256ord<>+0x08(SB), AX, t1
|
MULXQ p256ord<>+0x08(SB), AX, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ t1, acc0
|
|
||||||
ADCQ $0, acc1
|
|
||||||
ADCQ $0, acc2
|
|
||||||
|
|
||||||
MOVQ t0, AX
|
MULXQ p256ord<>+0x10(SB), AX, t0
|
||||||
//MOVQ t0, DX // This is not required due to t0=DX already
|
ADCXQ t1, AX
|
||||||
SHLQ $32, AX
|
ADOXQ AX, acc0
|
||||||
SHRQ $32, DX
|
|
||||||
|
MULXQ p256ord<>+0x18(SB), AX, acc2
|
||||||
|
ADCXQ t0, AX
|
||||||
|
ADOXQ AX, acc1
|
||||||
|
MOVQ $0, t0
|
||||||
|
ADCXQ t0, acc2
|
||||||
|
ADOXQ t0, acc2
|
||||||
|
|
||||||
SUBQ t0, acc0
|
|
||||||
SBBQ AX, acc1
|
|
||||||
SBBQ DX, acc2
|
|
||||||
// Last reduction step
|
// Last reduction step
|
||||||
MOVQ acc3, DX
|
MOVQ acc3, DX
|
||||||
MULXQ p256ordK0<>(SB), t0, AX
|
MULXQ p256ordK0<>(SB), DX, AX
|
||||||
|
|
||||||
MOVQ t0, DX
|
MULXQ p256ord<>+0x00(SB), AX, t0
|
||||||
MULXQ p256ord<>+0x00(SB), AX, t1
|
ADOXQ AX, acc3
|
||||||
ADDQ AX, acc3
|
|
||||||
ADCQ t1, acc0
|
|
||||||
MOVQ t0, acc3
|
|
||||||
|
|
||||||
MULXQ p256ord<>+0x08(SB), AX, t1
|
MULXQ p256ord<>+0x08(SB), AX, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc0
|
ADOXQ AX, acc0
|
||||||
ADCQ t1, acc1
|
|
||||||
ADCQ $0, acc2
|
|
||||||
ADCQ $0, acc3
|
|
||||||
|
|
||||||
MOVQ t0, AX
|
MULXQ p256ord<>+0x10(SB), AX, t0
|
||||||
//MOVQ t0, DX // This is not required due to t0=DX already
|
ADCXQ t1, AX
|
||||||
SHLQ $32, AX
|
ADOXQ AX, acc1
|
||||||
SHRQ $32, DX
|
|
||||||
|
|
||||||
SUBQ t0, acc1
|
MULXQ p256ord<>+0x18(SB), AX, acc3
|
||||||
SBBQ AX, acc2
|
ADCXQ t0, AX
|
||||||
SBBQ DX, acc3
|
ADOXQ AX, acc2
|
||||||
|
MOVQ $0, t0
|
||||||
|
ADCXQ t0, acc3
|
||||||
|
ADOXQ t0, acc3
|
||||||
|
|
||||||
XORQ t0, t0
|
XORQ t1, t1
|
||||||
// Add bits [511:256] of the sqr result
|
// Add bits [511:256] of the sqr result
|
||||||
ADCQ acc4, acc0
|
ADCXQ acc4, acc0
|
||||||
ADCQ acc5, acc1
|
ADCXQ acc5, acc1
|
||||||
ADCQ y_ptr, acc2
|
ADCXQ y_ptr, acc2
|
||||||
ADCQ x_ptr, acc3
|
ADCXQ x_ptr, acc3
|
||||||
ADCQ $0, t0
|
ADCXQ t1, t0
|
||||||
|
|
||||||
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
|
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
|
||||||
MOVQ res_ptr, x_ptr
|
MOVQ res_ptr, x_ptr
|
||||||
|
154
internal/sm2ec/p256_asm_ord_test.go
Normal file
154
internal/sm2ec/p256_asm_ord_test.go
Normal file
@ -0,0 +1,154 @@
|
|||||||
|
//go:build (amd64 && !purego) || (arm64 && !purego)
|
||||||
|
|
||||||
|
package sm2ec
|
||||||
|
|
||||||
|
import (
|
||||||
|
"crypto/rand"
|
||||||
|
"io"
|
||||||
|
"math/big"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func ordFromBig(out *p256OrdElement, big *big.Int) {
|
||||||
|
for i := range out {
|
||||||
|
out[i] = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, v := range big.Bits() {
|
||||||
|
out[i] = uint64(v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func p256OrderSqrTest(t *testing.T, x, p, r *big.Int) {
|
||||||
|
x1 := new(big.Int).Mul(x, r)
|
||||||
|
x1 = x1.Mod(x1, p)
|
||||||
|
ax := new(p256OrdElement)
|
||||||
|
res2 := new(p256OrdElement)
|
||||||
|
ordFromBig(ax, x1)
|
||||||
|
p256OrdSqr(res2, ax, 1)
|
||||||
|
resInt := new(big.Int).SetBytes(p256OrderFromMont(res2))
|
||||||
|
|
||||||
|
expected := new(big.Int).Mul(x, x)
|
||||||
|
expected = expected.Mod(expected, p)
|
||||||
|
if resInt.Cmp(expected) != 0 {
|
||||||
|
t.FailNow()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestP256OrdSqrOrdMinus1(t *testing.T) {
|
||||||
|
p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
|
||||||
|
r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
|
||||||
|
pMinus1 := new(big.Int).Sub(p, big.NewInt(1))
|
||||||
|
p256OrderSqrTest(t, pMinus1, p, r)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFuzzyP256OrdSqr(t *testing.T) {
|
||||||
|
p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
|
||||||
|
r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
|
||||||
|
var scalar1 [32]byte
|
||||||
|
var timeout *time.Timer
|
||||||
|
|
||||||
|
if testing.Short() {
|
||||||
|
timeout = time.NewTimer(10 * time.Millisecond)
|
||||||
|
} else {
|
||||||
|
timeout = time.NewTimer(2 * time.Second)
|
||||||
|
}
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-timeout.C:
|
||||||
|
return
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
io.ReadFull(rand.Reader, scalar1[:])
|
||||||
|
x := new(big.Int).SetBytes(scalar1[:])
|
||||||
|
p256OrderSqrTest(t, x, p, r)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkP25OrdSqr(b *testing.B) {
|
||||||
|
p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
|
||||||
|
r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
|
||||||
|
var scalar1 [32]byte
|
||||||
|
io.ReadFull(rand.Reader, scalar1[:])
|
||||||
|
x := new(big.Int).SetBytes(scalar1[:])
|
||||||
|
x1 := new(big.Int).Mul(x, r)
|
||||||
|
x1 = x1.Mod(x1, p)
|
||||||
|
ax := new(p256OrdElement)
|
||||||
|
res := new(p256OrdElement)
|
||||||
|
ordFromBig(ax, x1)
|
||||||
|
b.ResetTimer()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
p256OrdSqr(res, ax, 20)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func p256OrdMulTest(t *testing.T, x, y, p, r *big.Int) {
|
||||||
|
x1 := new(big.Int).Mul(x, r)
|
||||||
|
x1 = x1.Mod(x1, p)
|
||||||
|
y1 := new(big.Int).Mul(y, r)
|
||||||
|
y1 = y1.Mod(y1, p)
|
||||||
|
ax := new(p256OrdElement)
|
||||||
|
ay := new(p256OrdElement)
|
||||||
|
res2 := new(p256OrdElement)
|
||||||
|
ordFromBig(ax, x1)
|
||||||
|
ordFromBig(ay, y1)
|
||||||
|
p256OrdMul(res2, ax, ay)
|
||||||
|
resInt := new(big.Int).SetBytes(p256OrderFromMont(res2))
|
||||||
|
|
||||||
|
expected := new(big.Int).Mul(x, y)
|
||||||
|
expected = expected.Mod(expected, p)
|
||||||
|
if resInt.Cmp(expected) != 0 {
|
||||||
|
t.FailNow()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestP256OrdMulOrdMinus1(t *testing.T) {
|
||||||
|
p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
|
||||||
|
r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
|
||||||
|
pMinus1 := new(big.Int).Sub(p, big.NewInt(1))
|
||||||
|
p256OrdMulTest(t, pMinus1, pMinus1, p, r)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFuzzyP256OrdMul(t *testing.T) {
|
||||||
|
p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
|
||||||
|
r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
|
||||||
|
var scalar1 [32]byte
|
||||||
|
var scalar2 [32]byte
|
||||||
|
var timeout *time.Timer
|
||||||
|
|
||||||
|
if testing.Short() {
|
||||||
|
timeout = time.NewTimer(10 * time.Millisecond)
|
||||||
|
} else {
|
||||||
|
timeout = time.NewTimer(2 * time.Second)
|
||||||
|
}
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-timeout.C:
|
||||||
|
return
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
io.ReadFull(rand.Reader, scalar1[:])
|
||||||
|
io.ReadFull(rand.Reader, scalar2[:])
|
||||||
|
x := new(big.Int).SetBytes(scalar1[:])
|
||||||
|
y := new(big.Int).SetBytes(scalar2[:])
|
||||||
|
p256OrdMulTest(t, x, y, p, r)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkP25OrdMul(b *testing.B) {
|
||||||
|
p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
|
||||||
|
r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
|
||||||
|
var scalar1 [32]byte
|
||||||
|
io.ReadFull(rand.Reader, scalar1[:])
|
||||||
|
x := new(big.Int).SetBytes(scalar1[:])
|
||||||
|
x1 := new(big.Int).Mul(x, r)
|
||||||
|
x1 = x1.Mod(x1, p)
|
||||||
|
ax := new(p256OrdElement)
|
||||||
|
res := new(p256OrdElement)
|
||||||
|
ordFromBig(ax, x1)
|
||||||
|
b.ResetTimer()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
p256OrdMul(res, ax, ax)
|
||||||
|
}
|
||||||
|
}
|
@ -876,7 +876,6 @@ TEXT ·p256OrdReduce(SB),NOSPLIT,$0
|
|||||||
|
|
||||||
// func p256OrdMul(res, in1, in2 *p256OrdElement)
|
// func p256OrdMul(res, in1, in2 *p256OrdElement)
|
||||||
TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
||||||
MOVQ res+0(FP), res_ptr
|
|
||||||
MOVQ in1+8(FP), x_ptr
|
MOVQ in1+8(FP), x_ptr
|
||||||
MOVQ in2+16(FP), y_ptr
|
MOVQ in2+16(FP), y_ptr
|
||||||
CMPB ·supportBMI2+0(SB), $0x01
|
CMPB ·supportBMI2+0(SB), $0x01
|
||||||
@ -1125,203 +1124,187 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
|||||||
SBBQ DX, acc1
|
SBBQ DX, acc1
|
||||||
SBBQ $0, acc2
|
SBBQ $0, acc2
|
||||||
|
|
||||||
|
MOVQ res+0(FP), res_ptr
|
||||||
p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
|
p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
|
||||||
|
|
||||||
RET
|
RET
|
||||||
|
|
||||||
ordMulBMI2:
|
ordMulBMI2:
|
||||||
|
XORQ acc5, acc5
|
||||||
|
XORQ res_ptr, res_ptr
|
||||||
// x * y[0]
|
// x * y[0]
|
||||||
MOVQ (8*0)(y_ptr), DX
|
MOVQ (8*0)(y_ptr), DX
|
||||||
MULXQ (8*0)(x_ptr), acc0, acc1
|
MULXQ (8*0)(x_ptr), acc0, acc1
|
||||||
|
|
||||||
MULXQ (8*1)(x_ptr), AX, acc2
|
MULXQ (8*1)(x_ptr), AX, acc2
|
||||||
ADDQ AX, acc1
|
ADCXQ AX, acc1
|
||||||
ADCQ $0, acc2
|
|
||||||
|
|
||||||
MULXQ (8*2)(x_ptr), AX, acc3
|
MULXQ (8*2)(x_ptr), AX, acc3
|
||||||
ADDQ AX, acc2
|
ADCXQ AX, acc2
|
||||||
ADCQ $0, acc3
|
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, acc4
|
MULXQ (8*3)(x_ptr), AX, acc4
|
||||||
ADDQ AX, acc3
|
ADCXQ AX, acc3
|
||||||
ADCQ $0, acc4
|
ADCXQ acc5, acc4
|
||||||
|
|
||||||
XORQ acc5, acc5
|
|
||||||
|
|
||||||
// First reduction step
|
// First reduction step
|
||||||
MOVQ acc0, DX
|
MOVQ acc0, DX
|
||||||
MULXQ p256ordK0<>(SB), t0, AX
|
MULXQ p256ordK0<>(SB), DX, AX
|
||||||
|
|
||||||
MOVQ t0, DX
|
MULXQ p256ord<>+0x00(SB), AX, t0
|
||||||
MULXQ p256ord<>+0x00(SB), AX, BX
|
ADOXQ AX, acc0
|
||||||
ADDQ AX, acc0
|
|
||||||
ADCQ BX, acc1
|
|
||||||
|
|
||||||
MULXQ p256ord<>+0x08(SB), AX, BX
|
MULXQ p256ord<>+0x08(SB), AX, BX
|
||||||
ADCQ $0, BX
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc1
|
ADOXQ AX, acc1
|
||||||
ADCQ BX, acc2
|
|
||||||
ADCQ $0, acc3
|
|
||||||
ADCQ t0, acc4
|
|
||||||
ADCQ $0, acc5
|
|
||||||
|
|
||||||
MOVQ t0, AX
|
MULXQ p256ord<>+0x10(SB), AX, t0
|
||||||
//MOVQ t0, DX // This is not required due to t0=DX already
|
ADCXQ BX, AX
|
||||||
SHLQ $32, AX
|
ADOXQ AX, acc2
|
||||||
SHRQ $32, DX
|
|
||||||
|
MULXQ p256ord<>+0x18(SB), AX, BX
|
||||||
SUBQ t0, acc2
|
ADCXQ t0, AX
|
||||||
SBBQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
SBBQ DX, acc4
|
|
||||||
SBBQ $0, acc5
|
ADCXQ res_ptr, BX
|
||||||
|
ADOXQ BX, acc4
|
||||||
|
ADOXQ res_ptr, acc5
|
||||||
|
XORQ acc0, acc0
|
||||||
|
|
||||||
// x * y[1]
|
// x * y[1]
|
||||||
MOVQ (8*1)(y_ptr), DX
|
MOVQ (8*1)(y_ptr), DX
|
||||||
MULXQ (8*0)(x_ptr), AX, BX
|
MULXQ (8*0)(x_ptr), AX, t0
|
||||||
ADDQ AX, acc1
|
ADOXQ AX, acc1
|
||||||
ADCQ BX, acc2
|
|
||||||
|
|
||||||
MULXQ (8*1)(x_ptr), AX, BX
|
MULXQ (8*1)(x_ptr), AX, BX
|
||||||
ADCQ $0, BX
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc2
|
ADOXQ AX, acc2
|
||||||
ADCQ BX, acc3
|
|
||||||
|
|
||||||
MULXQ (8*2)(x_ptr), AX, BX
|
MULXQ (8*2)(x_ptr), AX, t0
|
||||||
ADCQ $0, BX
|
ADCXQ BX, AX
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ BX, acc4
|
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, BX
|
MULXQ (8*3)(x_ptr), AX, BX
|
||||||
ADCQ $0, BX
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc4
|
ADOXQ AX, acc4
|
||||||
ADCQ BX, acc5
|
|
||||||
ADCQ $0, acc0
|
ADCXQ acc0, BX
|
||||||
|
ADOXQ BX, acc5
|
||||||
|
ADOXQ res_ptr, acc0
|
||||||
|
|
||||||
// Second reduction step
|
// Second reduction step
|
||||||
MOVQ acc1, DX
|
MOVQ acc1, DX
|
||||||
MULXQ p256ordK0<>(SB), t0, AX
|
MULXQ p256ordK0<>(SB), DX, AX
|
||||||
|
|
||||||
MOVQ t0, DX
|
MULXQ p256ord<>+0x00(SB), AX, t0
|
||||||
MULXQ p256ord<>+0x00(SB), AX, BX
|
ADOXQ AX, acc1
|
||||||
ADDQ AX, acc1
|
|
||||||
ADCQ BX, acc2
|
|
||||||
|
|
||||||
MULXQ p256ord<>+0x08(SB), AX, BX
|
MULXQ p256ord<>+0x08(SB), AX, BX
|
||||||
ADCQ $0, BX
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc2
|
ADOXQ AX, acc2
|
||||||
ADCQ BX, acc3
|
|
||||||
ADCQ $0, acc4
|
|
||||||
ADCQ t0, acc5
|
|
||||||
ADCQ $0, acc0
|
|
||||||
|
|
||||||
MOVQ t0, AX
|
MULXQ p256ord<>+0x10(SB), AX, t0
|
||||||
//MOVQ t0, DX // This is not required due to t0=DX already
|
ADCXQ BX, AX
|
||||||
SHLQ $32, AX
|
ADOXQ AX, acc3
|
||||||
SHRQ $32, DX
|
|
||||||
|
MULXQ p256ord<>+0x18(SB), AX, BX
|
||||||
SUBQ t0, acc3
|
ADCXQ t0, AX
|
||||||
SBBQ AX, acc4
|
ADOXQ AX, acc4
|
||||||
SBBQ DX, acc5
|
|
||||||
SBBQ $0, acc0
|
ADCXQ res_ptr, BX
|
||||||
|
ADOXQ BX, acc5
|
||||||
|
ADOXQ res_ptr, acc0
|
||||||
|
XORQ acc1, acc1
|
||||||
|
|
||||||
// x * y[2]
|
// x * y[2]
|
||||||
MOVQ (8*2)(y_ptr), DX
|
MOVQ (8*2)(y_ptr), DX
|
||||||
MULXQ (8*0)(x_ptr), AX, BX
|
MULXQ (8*0)(x_ptr), AX, t0
|
||||||
ADDQ AX, acc2
|
ADOXQ AX, acc2
|
||||||
ADCQ BX, acc3
|
|
||||||
|
|
||||||
MULXQ (8*1)(x_ptr), AX, BX
|
MULXQ (8*1)(x_ptr), AX, BX
|
||||||
ADCQ $0, BX
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ BX, acc4
|
|
||||||
|
|
||||||
MULXQ (8*2)(x_ptr), AX, BX
|
MULXQ (8*2)(x_ptr), AX, t0
|
||||||
ADCQ $0, BX
|
ADCXQ BX, AX
|
||||||
ADDQ AX, acc4
|
ADOXQ AX, acc4
|
||||||
ADCQ BX, acc5
|
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, BX
|
MULXQ (8*3)(x_ptr), AX, BX
|
||||||
ADCQ $0, BX
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc5
|
ADOXQ AX, acc5
|
||||||
ADCQ BX, acc0
|
|
||||||
ADCQ $0, acc1
|
ADCXQ res_ptr, BX
|
||||||
|
ADOXQ BX, acc0
|
||||||
|
ADOXQ res_ptr, acc1
|
||||||
|
|
||||||
// Third reduction step
|
// Third reduction step
|
||||||
MOVQ acc2, DX
|
MOVQ acc2, DX
|
||||||
MULXQ p256ordK0<>(SB), t0, AX
|
MULXQ p256ordK0<>(SB), DX, AX
|
||||||
|
|
||||||
MOVQ t0, DX
|
MULXQ p256ord<>+0x00(SB), AX, t0
|
||||||
MULXQ p256ord<>+0x00(SB), AX, BX
|
ADOXQ AX, acc2
|
||||||
ADDQ AX, acc2
|
|
||||||
ADCQ BX, acc3
|
|
||||||
|
|
||||||
MULXQ p256ord<>+0x08(SB), AX, BX
|
MULXQ p256ord<>+0x08(SB), AX, BX
|
||||||
ADCQ $0, BX
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ BX, acc4
|
|
||||||
ADCQ $0, acc5
|
|
||||||
ADCQ t0, acc0
|
|
||||||
ADCQ $0, acc1
|
|
||||||
|
|
||||||
MOVQ t0, AX
|
MULXQ p256ord<>+0x10(SB), AX, t0
|
||||||
//MOVQ t0, DX // This is not required due to t0=DX already
|
ADCXQ BX, AX
|
||||||
SHLQ $32, AX
|
ADOXQ AX, acc4
|
||||||
SHRQ $32, DX
|
|
||||||
|
MULXQ p256ord<>+0x18(SB), AX, BX
|
||||||
SUBQ t0, acc4
|
ADCXQ t0, AX
|
||||||
SBBQ AX, acc5
|
ADOXQ AX, acc5
|
||||||
SBBQ DX, acc0
|
|
||||||
SBBQ $0, acc1
|
ADCXQ res_ptr, BX
|
||||||
|
ADOXQ BX, acc0
|
||||||
|
ADOXQ res_ptr, acc1
|
||||||
|
XORQ acc2, acc2
|
||||||
|
|
||||||
// x * y[3]
|
// x * y[3]
|
||||||
MOVQ (8*3)(y_ptr), DX
|
MOVQ (8*3)(y_ptr), DX
|
||||||
MULXQ (8*0)(x_ptr), AX, BX
|
MULXQ (8*0)(x_ptr), AX, t0
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ BX, acc4
|
|
||||||
|
|
||||||
MULXQ (8*1)(x_ptr), AX, BX
|
MULXQ (8*1)(x_ptr), AX, BX
|
||||||
ADCQ $0, BX
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc4
|
ADOXQ AX, acc4
|
||||||
ADCQ BX, acc5
|
|
||||||
|
|
||||||
MULXQ (8*2)(x_ptr), AX, BX
|
MULXQ (8*2)(x_ptr), AX, t0
|
||||||
ADCQ $0, BX
|
ADCXQ BX, AX
|
||||||
ADDQ AX, acc5
|
ADOXQ AX, acc5
|
||||||
ADCQ BX, acc0
|
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, BX
|
MULXQ (8*3)(x_ptr), AX, BX
|
||||||
ADCQ $0, BX
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc0
|
ADOXQ AX, acc0
|
||||||
ADCQ BX, acc1
|
|
||||||
ADCQ $0, acc2
|
ADCXQ res_ptr, BX
|
||||||
|
ADOXQ BX, acc1
|
||||||
|
ADOXQ res_ptr, acc2
|
||||||
|
|
||||||
// Last reduction step
|
// Last reduction step
|
||||||
MOVQ acc3, DX
|
MOVQ acc3, DX
|
||||||
MULXQ p256ordK0<>(SB), t0, AX
|
MULXQ p256ordK0<>(SB), DX, AX
|
||||||
|
|
||||||
MOVQ t0, DX
|
MULXQ p256ord<>+0x00(SB), AX, t0
|
||||||
MULXQ p256ord<>+0x00(SB), AX, BX
|
ADOXQ AX, acc3
|
||||||
ADDQ AX, acc3
|
|
||||||
ADCQ BX, acc4
|
|
||||||
|
|
||||||
MULXQ p256ord<>+0x08(SB), AX, BX
|
MULXQ p256ord<>+0x08(SB), AX, BX
|
||||||
ADCQ $0, BX
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc4
|
ADOXQ AX, acc4
|
||||||
ADCQ BX, acc5
|
|
||||||
ADCQ $0, acc0
|
|
||||||
ADCQ t0, acc1
|
|
||||||
ADCQ $0, acc2
|
|
||||||
|
|
||||||
MOVQ t0, AX
|
MULXQ p256ord<>+0x10(SB), AX, t0
|
||||||
//MOVQ t0, DX // This is not required due to t0=DX already
|
ADCXQ BX, AX
|
||||||
SHLQ $32, AX
|
ADOXQ AX, acc5
|
||||||
SHRQ $32, DX
|
|
||||||
|
|
||||||
SUBQ t0, acc5
|
|
||||||
SBBQ AX, acc0
|
|
||||||
SBBQ DX, acc1
|
|
||||||
SBBQ $0, acc2
|
|
||||||
|
|
||||||
|
MULXQ p256ord<>+0x18(SB), AX, BX
|
||||||
|
ADCXQ t0, AX
|
||||||
|
ADOXQ AX, acc0
|
||||||
|
|
||||||
|
ADCXQ res_ptr, BX
|
||||||
|
ADOXQ BX, acc1
|
||||||
|
ADOXQ res_ptr, acc2
|
||||||
|
|
||||||
|
MOVQ res+0(FP), res_ptr
|
||||||
p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
|
p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
|
||||||
|
|
||||||
RET
|
RET
|
||||||
|
@ -406,176 +406,161 @@ ordSqrLoop:
|
|||||||
RET
|
RET
|
||||||
|
|
||||||
ordSqrLoopBMI2:
|
ordSqrLoopBMI2:
|
||||||
|
XORQ acc0, acc0
|
||||||
|
XORQ y_ptr, y_ptr
|
||||||
// y[1:] * y[0]
|
// y[1:] * y[0]
|
||||||
MOVQ (8*0)(x_ptr), DX
|
MOVQ (8*0)(x_ptr), DX
|
||||||
MULXQ (8*1)(x_ptr), acc1, acc2
|
MULXQ (8*1)(x_ptr), acc1, acc2
|
||||||
|
|
||||||
MULXQ (8*2)(x_ptr), AX, acc3
|
MULXQ (8*2)(x_ptr), AX, acc3
|
||||||
ADDQ AX, acc2
|
ADOXQ AX, acc2
|
||||||
ADCQ $0, acc3
|
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, acc4
|
MULXQ (8*3)(x_ptr), AX, acc4
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ $0, acc4
|
ADOXQ y_ptr, acc4
|
||||||
|
|
||||||
// y[2:] * y[1]
|
// y[2:] * y[1]
|
||||||
MOVQ (8*1)(x_ptr), DX
|
MOVQ (8*1)(x_ptr), DX
|
||||||
MULXQ (8*2)(x_ptr), AX, BX
|
MULXQ (8*2)(x_ptr), AX, BX
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ BX, acc4
|
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, acc5
|
MULXQ (8*3)(x_ptr), AX, acc5
|
||||||
ADCQ $0, acc5
|
ADCXQ BX, AX
|
||||||
ADDQ AX, acc4
|
ADOXQ AX, acc4
|
||||||
ADCQ $0, acc5
|
ADCXQ y_ptr, acc5
|
||||||
|
|
||||||
// y[3] * y[2]
|
// y[3] * y[2]
|
||||||
MOVQ (8*2)(x_ptr), DX
|
MOVQ (8*2)(x_ptr), DX
|
||||||
MULXQ (8*3)(x_ptr), AX, y_ptr
|
MULXQ (8*3)(x_ptr), AX, y_ptr
|
||||||
ADDQ AX, acc5
|
ADOXQ AX, acc5
|
||||||
ADCQ $0, y_ptr
|
ADOXQ acc0, y_ptr
|
||||||
|
|
||||||
XORQ BX, BX
|
XORQ BX, BX
|
||||||
// *2
|
// *2
|
||||||
ADDQ acc1, acc1
|
ADOXQ acc1, acc1
|
||||||
ADCQ acc2, acc2
|
ADOXQ acc2, acc2
|
||||||
ADCQ acc3, acc3
|
ADOXQ acc3, acc3
|
||||||
ADCQ acc4, acc4
|
ADOXQ acc4, acc4
|
||||||
ADCQ acc5, acc5
|
ADOXQ acc5, acc5
|
||||||
ADCQ y_ptr, y_ptr
|
ADOXQ y_ptr, y_ptr
|
||||||
ADCQ $0, BX
|
ADOXQ acc0, BX
|
||||||
|
|
||||||
// Missing products
|
// Missing products
|
||||||
MOVQ (8*0)(x_ptr), DX
|
MOVQ (8*0)(x_ptr), DX
|
||||||
MULXQ DX, acc0, t0
|
MULXQ DX, acc0, t0
|
||||||
ADDQ t0, acc1
|
ADCXQ t0, acc1
|
||||||
|
|
||||||
MOVQ (8*1)(x_ptr), DX
|
MOVQ (8*1)(x_ptr), DX
|
||||||
MULXQ DX, AX, t0
|
MULXQ DX, AX, t0
|
||||||
ADCQ AX, acc2
|
ADCXQ AX, acc2
|
||||||
ADCQ t0, acc3
|
ADCXQ t0, acc3
|
||||||
|
|
||||||
MOVQ (8*2)(x_ptr), DX
|
MOVQ (8*2)(x_ptr), DX
|
||||||
MULXQ DX, AX, t0
|
MULXQ DX, AX, t0
|
||||||
ADCQ AX, acc4
|
ADCXQ AX, acc4
|
||||||
ADCQ t0, acc5
|
ADCXQ t0, acc5
|
||||||
|
|
||||||
MOVQ (8*3)(x_ptr), DX
|
MOVQ (8*3)(x_ptr), DX
|
||||||
MULXQ DX, AX, x_ptr
|
MULXQ DX, AX, x_ptr
|
||||||
ADCQ AX, y_ptr
|
ADCXQ AX, y_ptr
|
||||||
ADCQ BX, x_ptr
|
ADCXQ BX, x_ptr
|
||||||
|
|
||||||
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||||
// First reduction step, [ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0]
|
// First reduction step
|
||||||
MOVQ acc0, DX
|
MOVQ acc0, DX
|
||||||
MULXQ p256ordK0<>(SB), t0, AX
|
MULXQ p256ordK0<>(SB), DX, AX
|
||||||
// calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0]
|
|
||||||
// the result is [acc0, acc3, acc2, acc1], last lowest limb is dropped.
|
MULXQ p256ord<>+0x00(SB), AX, t0
|
||||||
MOVQ t0, DX // Y = t0 = (k0 * acc0) mod 2^64
|
ADOXQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0
|
||||||
MULXQ p256ord<>+0x00(SB), AX, BX
|
|
||||||
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
|
|
||||||
ADCQ BX, acc1 // (carry2, acc1) = acc1 + H(t0 * ord0) + carry1
|
|
||||||
MOVQ t0, acc0 // acc0 = t0
|
|
||||||
|
|
||||||
MULXQ p256ord<>+0x08(SB), AX, BX
|
MULXQ p256ord<>+0x08(SB), AX, BX
|
||||||
ADCQ $0, BX // BX = carry2 + H(t0*ord1)
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc1 // (carry3, acc1) = acc1 + L(t0*ord1)
|
ADOXQ AX, acc1
|
||||||
ADCQ BX, acc2 // (carry4, acc2) = acc2 + BX + carry3
|
|
||||||
ADCQ $0, acc3 // (carry5, acc3) = acc3 + carry4
|
|
||||||
ADCQ $0, acc0 // acc0 = t0 + carry5
|
|
||||||
// calculate the positive part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
|
|
||||||
MOVQ t0, AX
|
|
||||||
//MOVQ t0, DX // This is not required due to t0=DX already
|
|
||||||
SHLQ $32, AX
|
|
||||||
SHRQ $32, DX
|
|
||||||
|
|
||||||
SUBQ t0, acc2
|
MULXQ p256ord<>+0x10(SB), AX, t0
|
||||||
SBBQ AX, acc3
|
ADCXQ BX, AX
|
||||||
SBBQ DX, acc0
|
ADOXQ AX, acc2
|
||||||
|
|
||||||
|
MULXQ p256ord<>+0x18(SB), AX, acc0
|
||||||
|
ADCXQ t0, AX
|
||||||
|
ADOXQ AX, acc3
|
||||||
|
MOVQ $0, t0
|
||||||
|
ADCXQ t0, acc0
|
||||||
|
ADOXQ t0, acc0
|
||||||
|
|
||||||
// Second reduction step
|
// Second reduction step
|
||||||
MOVQ acc1, DX
|
MOVQ acc1, DX
|
||||||
MULXQ p256ordK0<>(SB), t0, AX
|
MULXQ p256ordK0<>(SB), DX, AX
|
||||||
|
|
||||||
MOVQ t0, DX
|
MULXQ p256ord<>+0x00(SB), AX, t0
|
||||||
MULXQ p256ord<>+0x00(SB), AX, BX
|
ADOXQ AX, acc1
|
||||||
ADDQ AX, acc1
|
|
||||||
ADCQ BX, acc2
|
|
||||||
MOVQ t0, acc1
|
|
||||||
|
|
||||||
MULXQ p256ord<>+0x08(SB), AX, BX
|
MULXQ p256ord<>+0x08(SB), AX, BX
|
||||||
ADCQ $0, BX
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc2
|
ADOXQ AX, acc2
|
||||||
ADCQ BX, acc3
|
|
||||||
ADCQ $0, acc0
|
|
||||||
ADCQ $0, acc1
|
|
||||||
|
|
||||||
MOVQ t0, AX
|
MULXQ p256ord<>+0x10(SB), AX, t0
|
||||||
//MOVQ t0, DX // This is not required due to t0=DX already
|
ADCXQ BX, AX
|
||||||
SHLQ $32, AX
|
ADOXQ AX, acc3
|
||||||
SHRQ $32, DX
|
|
||||||
|
MULXQ p256ord<>+0x18(SB), AX, acc1
|
||||||
|
ADCXQ t0, AX
|
||||||
|
ADOXQ AX, acc0
|
||||||
|
MOVQ $0, t0
|
||||||
|
ADCXQ t0, acc1
|
||||||
|
ADOXQ t0, acc1
|
||||||
|
|
||||||
SUBQ t0, acc3
|
|
||||||
SBBQ AX, acc0
|
|
||||||
SBBQ DX, acc1
|
|
||||||
// Third reduction step
|
// Third reduction step
|
||||||
MOVQ acc2, DX
|
MOVQ acc2, DX
|
||||||
MULXQ p256ordK0<>(SB), t0, AX
|
MULXQ p256ordK0<>(SB), DX, AX
|
||||||
|
|
||||||
MOVQ t0, DX
|
MULXQ p256ord<>+0x00(SB), AX, t0
|
||||||
MULXQ p256ord<>+0x00(SB), AX, BX
|
ADOXQ AX, acc2
|
||||||
ADDQ AX, acc2
|
|
||||||
ADCQ BX, acc3
|
|
||||||
MOVQ t0, acc2
|
|
||||||
|
|
||||||
MULXQ p256ord<>+0x08(SB), AX, BX
|
MULXQ p256ord<>+0x08(SB), AX, BX
|
||||||
ADCQ $0, BX
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ BX, acc0
|
|
||||||
ADCQ $0, acc1
|
|
||||||
ADCQ $0, acc2
|
|
||||||
|
|
||||||
MOVQ t0, AX
|
MULXQ p256ord<>+0x10(SB), AX, t0
|
||||||
//MOVQ t0, DX // This is not required due to t0=DX already
|
ADCXQ BX, AX
|
||||||
SHLQ $32, AX
|
ADOXQ AX, acc0
|
||||||
SHRQ $32, DX
|
|
||||||
|
MULXQ p256ord<>+0x18(SB), AX, acc2
|
||||||
|
ADCXQ t0, AX
|
||||||
|
ADOXQ AX, acc1
|
||||||
|
MOVQ $0, t0
|
||||||
|
ADCXQ t0, acc2
|
||||||
|
ADOXQ t0, acc2
|
||||||
|
|
||||||
SUBQ t0, acc0
|
|
||||||
SBBQ AX, acc1
|
|
||||||
SBBQ DX, acc2
|
|
||||||
// Last reduction step
|
// Last reduction step
|
||||||
MOVQ acc3, DX
|
MOVQ acc3, DX
|
||||||
MULXQ p256ordK0<>(SB), t0, AX
|
MULXQ p256ordK0<>(SB), DX, AX
|
||||||
|
|
||||||
MOVQ t0, DX
|
MULXQ p256ord<>+0x00(SB), AX, t0
|
||||||
MULXQ p256ord<>+0x00(SB), AX, BX
|
ADOXQ AX, acc3
|
||||||
ADDQ AX, acc3
|
|
||||||
ADCQ BX, acc0
|
|
||||||
MOVQ t0, acc3
|
|
||||||
|
|
||||||
MULXQ p256ord<>+0x08(SB), AX, BX
|
MULXQ p256ord<>+0x08(SB), AX, BX
|
||||||
ADCQ $0, BX
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc0
|
ADOXQ AX, acc0
|
||||||
ADCQ BX, acc1
|
|
||||||
ADCQ $0, acc2
|
|
||||||
ADCQ $0, acc3
|
|
||||||
|
|
||||||
MOVQ t0, AX
|
MULXQ p256ord<>+0x10(SB), AX, t0
|
||||||
//MOVQ t0, DX // This is not required due to t0=DX already
|
ADCXQ BX, AX
|
||||||
SHLQ $32, AX
|
ADOXQ AX, acc1
|
||||||
SHRQ $32, DX
|
|
||||||
|
|
||||||
SUBQ t0, acc1
|
MULXQ p256ord<>+0x18(SB), AX, acc3
|
||||||
SBBQ AX, acc2
|
ADCXQ t0, AX
|
||||||
SBBQ DX, acc3
|
ADOXQ AX, acc2
|
||||||
|
MOVQ $0, t0
|
||||||
|
ADCXQ t0, acc3
|
||||||
|
ADOXQ t0, acc3
|
||||||
|
|
||||||
XORQ t0, t0
|
XORQ BX, BX
|
||||||
// Add bits [511:256] of the sqr result
|
// Add bits [511:256] of the sqr result
|
||||||
ADCQ acc4, acc0
|
ADCXQ acc4, acc0
|
||||||
ADCQ acc5, acc1
|
ADCXQ acc5, acc1
|
||||||
ADCQ y_ptr, acc2
|
ADCXQ y_ptr, acc2
|
||||||
ADCQ x_ptr, acc3
|
ADCXQ x_ptr, acc3
|
||||||
ADCQ $0, t0
|
ADCXQ BX, t0
|
||||||
|
|
||||||
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr)
|
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr)
|
||||||
MOVQ res_ptr, x_ptr
|
MOVQ res_ptr, x_ptr
|
||||||
|
@ -302,7 +302,9 @@ func p256Sqrt(e, x *p256Element) (isSquare bool) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// The following assembly functions are implemented in p256_asm_*.s
|
// The following assembly functions are implemented in p256_asm_*.s
|
||||||
var supportBMI2 = cpu.X86.HasBMI2
|
|
||||||
|
// amd64 assembly uses ADCX/ADOX/MULX
|
||||||
|
var supportBMI2 = cpu.X86.HasADX && cpu.X86.HasBMI2
|
||||||
|
|
||||||
var supportAVX2 = cpu.X86.HasAVX2
|
var supportAVX2 = cpu.X86.HasAVX2
|
||||||
|
|
||||||
|
@ -83,6 +83,23 @@ func TestFuzzyP256Mul(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func BenchmarkP256Mul(b *testing.B) {
|
||||||
|
p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
|
||||||
|
r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
|
||||||
|
var scalar1 [32]byte
|
||||||
|
io.ReadFull(rand.Reader, scalar1[:])
|
||||||
|
x := new(big.Int).SetBytes(scalar1[:])
|
||||||
|
x1 := new(big.Int).Mul(x, r)
|
||||||
|
x1 = x1.Mod(x1, p)
|
||||||
|
ax := new(p256Element)
|
||||||
|
res := new(p256Element)
|
||||||
|
fromBig(ax, x1)
|
||||||
|
b.ResetTimer()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
p256Mul(res, ax, ax)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func p256SqrTest(t *testing.T, x, p, r *big.Int) {
|
func p256SqrTest(t *testing.T, x, p, r *big.Int) {
|
||||||
x1 := new(big.Int).Mul(x, r)
|
x1 := new(big.Int).Mul(x, r)
|
||||||
x1 = x1.Mod(x1, p)
|
x1 = x1.Mod(x1, p)
|
||||||
@ -142,6 +159,7 @@ func BenchmarkP256Sqr(b *testing.B) {
|
|||||||
ax := new(p256Element)
|
ax := new(p256Element)
|
||||||
res := new(p256Element)
|
res := new(p256Element)
|
||||||
fromBig(ax, x1)
|
fromBig(ax, x1)
|
||||||
|
b.ResetTimer()
|
||||||
for i := 0; i < b.N; i++ {
|
for i := 0; i < b.N; i++ {
|
||||||
p256Sqr(res, ax, 20)
|
p256Sqr(res, ax, 20)
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user