diff --git a/internal/sm2ec/p256_asm_amd64.s b/internal/sm2ec/p256_asm_amd64.s
index 75e126b..3228901 100644
--- a/internal/sm2ec/p256_asm_amd64.s
+++ b/internal/sm2ec/p256_asm_amd64.s
@@ -407,176 +407,161 @@ ordSqrLoop:
 	RET
 
 ordSqrLoopBMI2:
+	XORQ acc0, acc0
+	XORQ y_ptr, y_ptr
 	// y[1:] * y[0]
 	MOVQ (8*0)(x_ptr), DX
 	MULXQ (8*1)(x_ptr), acc1, acc2
 
 	MULXQ (8*2)(x_ptr), AX, acc3
-	ADDQ AX, acc2
-	ADCQ $0, acc3
+	ADOXQ AX, acc2
 
 	MULXQ (8*3)(x_ptr), AX, acc4
-	ADDQ AX, acc3
-	ADCQ $0, acc4
+	ADOXQ AX, acc3
+	ADOXQ y_ptr, acc4
 
 	// y[2:] * y[1]
 	MOVQ (8*1)(x_ptr), DX
 	MULXQ (8*2)(x_ptr), AX, t1
-	ADDQ AX, acc3
-	ADCQ t1, acc4
+	ADOXQ AX, acc3
 
 	MULXQ (8*3)(x_ptr), AX, acc5
-	ADCQ $0, acc5
-	ADDQ AX, acc4
-	ADCQ $0, acc5
+	ADCXQ t1, AX
+	ADOXQ AX, acc4
+	ADCXQ y_ptr, acc5
 
 	// y[3] * y[2]
 	MOVQ (8*2)(x_ptr), DX
 	MULXQ (8*3)(x_ptr), AX, y_ptr
-	ADDQ AX, acc5
-	ADCQ $0, y_ptr
+	ADOXQ AX, acc5
+	ADOXQ acc0, y_ptr
 
 	XORQ t1, t1
 	// *2
-	ADDQ acc1, acc1
-	ADCQ acc2, acc2
-	ADCQ acc3, acc3
-	ADCQ acc4, acc4
-	ADCQ acc5, acc5
-	ADCQ y_ptr, y_ptr
-	ADCQ $0, t1
+	ADOXQ acc1, acc1
+	ADOXQ acc2, acc2
+	ADOXQ acc3, acc3
+	ADOXQ acc4, acc4
+	ADOXQ acc5, acc5
+	ADOXQ y_ptr, y_ptr
+	ADOXQ acc0, t1
 
 	// Missing products
 	MOVQ (8*0)(x_ptr), DX
 	MULXQ DX, acc0, t0
-	ADDQ t0, acc1
+	ADCXQ t0, acc1
 
 	MOVQ (8*1)(x_ptr), DX
 	MULXQ DX, AX, t0
-	ADCQ AX, acc2
-	ADCQ t0, acc3
+	ADCXQ AX, acc2
+	ADCXQ t0, acc3
 
 	MOVQ (8*2)(x_ptr), DX
 	MULXQ DX, AX, t0
-	ADCQ AX, acc4
-	ADCQ t0, acc5
+	ADCXQ AX, acc4
+	ADCXQ t0, acc5
 
 	MOVQ (8*3)(x_ptr), DX
 	MULXQ DX, AX, x_ptr
-	ADCQ AX, y_ptr
-	ADCQ t1, x_ptr
+	ADCXQ AX, y_ptr
+	ADCXQ t1, x_ptr
 
 	// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
-	// First reduction step, [ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0]
+	// First reduction step
 	MOVQ acc0, DX
-	MULXQ p256ordK0<>(SB), t0, AX
-	// calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0]
-	// the result is [acc0, acc3, acc2, acc1], last lowest limb is dropped.
-	MOVQ t0, DX // Y = t0 = (k0 * acc0) mod 2^64
-	MULXQ p256ord<>+0x00(SB), AX, t1
-	ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
-	ADCQ t1, acc1 // (carry2, acc1) = acc1 + H(t0 * ord0) + carry1
-	MOVQ t0, acc0 // acc0 = t0
+	MULXQ p256ordK0<>(SB), DX, AX
+
+	MULXQ p256ord<>+0x00(SB), AX, t0
+	ADOXQ AX, acc0 // (carry1, acc0) = acc0 + L(m * ord0), m = k0 * acc0 in DX
 
 	MULXQ p256ord<>+0x08(SB), AX, t1
-	ADCQ $0, t1 // t1 = carry2 + H(t0*ord1)
-	ADDQ AX, acc1 // (carry3, acc1) = acc1 + L(t0*ord1)
-	ADCQ t1, acc2 // (carry4, acc2) = acc2 + t1 + carry3
-	ADCQ $0, acc3 // (carry5, acc3) = acc3 + carry4
-	ADCQ $0, acc0 // acc0 = t0 + carry5
-	// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
-	MOVQ t0, AX
-	//MOVQ t0, DX // This is not required due to t0=DX already
-	SHLQ $32, AX
-	SHRQ $32, DX
+	ADCXQ t0, AX
+	ADOXQ AX, acc1
 
-	SUBQ t0, acc2
-	SBBQ AX, acc3
-	SBBQ DX, acc0
+	MULXQ p256ord<>+0x10(SB), AX, t0
+	ADCXQ t1, AX
+	ADOXQ AX, acc2
+
+	MULXQ p256ord<>+0x18(SB), AX, acc0
+	ADCXQ t0, AX
+	ADOXQ AX, acc3
+	MOVQ $0, t0
+	ADCXQ t0, acc0
+	ADOXQ t0, acc0
 
 	// Second reduction step
 	MOVQ acc1, DX
-	MULXQ p256ordK0<>(SB), t0, AX
+	MULXQ p256ordK0<>(SB), DX, AX
 
-	MOVQ t0, DX
-	MULXQ p256ord<>+0x00(SB), AX, t1
-	ADDQ AX, acc1
-	ADCQ t1, acc2
-	MOVQ t0, acc1
+	MULXQ p256ord<>+0x00(SB), AX, t0
+	ADOXQ AX, acc1
 
 	MULXQ p256ord<>+0x08(SB), AX, t1
-	ADCQ $0, t1
-	ADDQ AX, acc2
-	ADCQ t1, acc3
-	ADCQ $0, acc0
-	ADCQ $0, acc1
+	ADCXQ t0, AX
+	ADOXQ AX, acc2
 
-	MOVQ t0, AX
-	//MOVQ t0, DX // This is not required due to t0=DX already
-	SHLQ $32, AX
-	SHRQ $32, DX
+	MULXQ p256ord<>+0x10(SB), AX, t0
+	ADCXQ t1, AX
+	ADOXQ AX, acc3
+
+	MULXQ p256ord<>+0x18(SB), AX, acc1
+	ADCXQ t0, AX
+	ADOXQ AX, acc0
+	MOVQ $0, t0
+	ADCXQ t0, acc1
+	ADOXQ t0, acc1
 
-	SUBQ t0, acc3
-	SBBQ AX, acc0
-	SBBQ DX, acc1
 
 	// Third reduction step
 	MOVQ acc2, DX
-	MULXQ p256ordK0<>(SB), t0, AX
+	MULXQ p256ordK0<>(SB), DX, AX
 
-	MOVQ t0, DX
-	MULXQ p256ord<>+0x00(SB), AX, t1
-	ADDQ AX, acc2
-	ADCQ t1, acc3
-	MOVQ t0, acc2
+	MULXQ p256ord<>+0x00(SB), AX, t0
+	ADOXQ AX, acc2
 
 	MULXQ p256ord<>+0x08(SB), AX, t1
-	ADCQ $0, t1
-	ADDQ AX, acc3
-	ADCQ t1, acc0
-	ADCQ $0, acc1
-	ADCQ $0, acc2
+	ADCXQ t0, AX
+	ADOXQ AX, acc3
 
-	MOVQ t0, AX
-	//MOVQ t0, DX // This is not required due to t0=DX already
-	SHLQ $32, AX
-	SHRQ $32, DX
+	MULXQ p256ord<>+0x10(SB), AX, t0
+	ADCXQ t1, AX
+	ADOXQ AX, acc0
+
+	MULXQ p256ord<>+0x18(SB), AX, acc2
+	ADCXQ t0, AX
+	ADOXQ AX, acc1
+	MOVQ $0, t0
+	ADCXQ t0, acc2
+	ADOXQ t0, acc2
 
-	SUBQ t0, acc0
-	SBBQ AX, acc1
-	SBBQ DX, acc2
 
 	// Last reduction step
 	MOVQ acc3, DX
-	MULXQ p256ordK0<>(SB), t0, AX
+	MULXQ p256ordK0<>(SB), DX, AX
 
-	MOVQ t0, DX
-	MULXQ p256ord<>+0x00(SB), AX, t1
-	ADDQ AX, acc3
-	ADCQ t1, acc0
-	MOVQ t0, acc3
+	MULXQ p256ord<>+0x00(SB), AX, t0
+	ADOXQ AX, acc3
 
 	MULXQ p256ord<>+0x08(SB), AX, t1
-	ADCQ $0, t1
-	ADDQ AX, acc0
-	ADCQ t1, acc1
-	ADCQ $0, acc2
-	ADCQ $0, acc3
+	ADCXQ t0, AX
+	ADOXQ AX, acc0
 
-	MOVQ t0, AX
-	//MOVQ t0, DX // This is not required due to t0=DX already
-	SHLQ $32, AX
-	SHRQ $32, DX
+	MULXQ p256ord<>+0x10(SB), AX, t0
+	ADCXQ t1, AX
+	ADOXQ AX, acc1
 
-	SUBQ t0, acc1
-	SBBQ AX, acc2
-	SBBQ DX, acc3
+	MULXQ p256ord<>+0x18(SB), AX, acc3
+	ADCXQ t0, AX
+	ADOXQ AX, acc2
+	MOVQ $0, t0
+	ADCXQ t0, acc3
+	ADOXQ t0, acc3
 
-	XORQ t0, t0
+	XORQ t1, t1
 	// Add bits [511:256] of the sqr result
-	ADCQ acc4, acc0
-	ADCQ acc5, acc1
-	ADCQ y_ptr, acc2
-	ADCQ x_ptr, acc3
-	ADCQ $0, t0
+	ADCXQ acc4, acc0
+	ADCXQ acc5, acc1
+	ADCXQ y_ptr, acc2
+	ADCXQ x_ptr, acc3
+	ADCXQ t1, t0
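+	// t0 now holds the final carry of the addition above; the reduction
+	// macro below folds it back into [acc3:acc0] modulo ord.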
 
 	p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
 	MOVQ res_ptr, x_ptr
diff --git a/internal/sm2ec/p256_asm_ord_test.go b/internal/sm2ec/p256_asm_ord_test.go
new file mode 100644
index 0000000..0858314
--- /dev/null
+++ b/internal/sm2ec/p256_asm_ord_test.go
@@ -0,0 +1,154 @@
+//go:build (amd64 && !purego) || (arm64 && !purego)
+
+package sm2ec
+
+import (
+	"crypto/rand"
+	"io"
+	"math/big"
+	"testing"
+	"time"
+)
+
+func ordFromBig(out *p256OrdElement, big *big.Int) {
+	for i := range out {
+		out[i] = 0
+	}
+
+	for i, v := range big.Bits() {
+		out[i] = uint64(v)
+	}
+}
+
+func p256OrderSqrTest(t *testing.T, x, p, r *big.Int) {
+	x1 := new(big.Int).Mul(x, r)
+	x1 = x1.Mod(x1, p)
+	ax := new(p256OrdElement)
+	res2 := new(p256OrdElement)
+	ordFromBig(ax, x1)
+	p256OrdSqr(res2, ax, 1)
+	resInt := new(big.Int).SetBytes(p256OrderFromMont(res2))
+
+	expected := new(big.Int).Mul(x, x)
+	expected = expected.Mod(expected, p)
+	if resInt.Cmp(expected) != 0 {
+		t.FailNow()
+	}
+}
+
+func TestP256OrdSqrOrdMinus1(t *testing.T) {
+	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
+	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
+	pMinus1 := new(big.Int).Sub(p, big.NewInt(1))
+	p256OrderSqrTest(t, pMinus1, p, r)
+}
+
+func TestFuzzyP256OrdSqr(t *testing.T) {
+	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
+	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
+	var scalar1 [32]byte
+	var timeout *time.Timer
+
+	if testing.Short() {
+		timeout = time.NewTimer(10 * time.Millisecond)
+	} else {
+		timeout = time.NewTimer(2 * time.Second)
+	}
+	for {
+		select {
+		case <-timeout.C:
+			return
+		default:
+		}
+		io.ReadFull(rand.Reader, scalar1[:])
+		x := new(big.Int).SetBytes(scalar1[:])
+		p256OrderSqrTest(t, x, p, r)
+	}
+}
+
+func BenchmarkP256OrdSqr(b *testing.B) {
+	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
+	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
+	var scalar1 [32]byte
+	io.ReadFull(rand.Reader, scalar1[:])
+	x := new(big.Int).SetBytes(scalar1[:])
+	x1 := new(big.Int).Mul(x, r)
+	x1 = x1.Mod(x1, p)
+	ax := new(p256OrdElement)
+	res := new(p256OrdElement)
+	ordFromBig(ax, x1)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		p256OrdSqr(res, ax, 20)
+	}
+}
+
+func p256OrdMulTest(t *testing.T, x, y, p, r *big.Int) {
+	x1 := new(big.Int).Mul(x, r)
+	x1 = x1.Mod(x1, p)
+	y1 := new(big.Int).Mul(y, r)
+	y1 = y1.Mod(y1, p)
+	ax := new(p256OrdElement)
+	ay := new(p256OrdElement)
+	res2 := new(p256OrdElement)
+	ordFromBig(ax, x1)
+	ordFromBig(ay, y1)
+	p256OrdMul(res2, ax, ay)
+	resInt := new(big.Int).SetBytes(p256OrderFromMont(res2))
+
+	expected := new(big.Int).Mul(x, y)
+	expected = expected.Mod(expected, p)
+	if resInt.Cmp(expected) != 0 {
+		t.FailNow()
+	}
+}
+
+func TestP256OrdMulOrdMinus1(t *testing.T) {
+	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
+	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
+	pMinus1 := new(big.Int).Sub(p, big.NewInt(1))
+	p256OrdMulTest(t, pMinus1, pMinus1, p, r)
+}
+
+func TestFuzzyP256OrdMul(t *testing.T) {
+	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
+	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
+	var scalar1 [32]byte
+	var scalar2 [32]byte
+	var timeout *time.Timer
+
+	if testing.Short() {
+		timeout = time.NewTimer(10 * time.Millisecond)
+	} else {
+		timeout = time.NewTimer(2 * time.Second)
+	}
+	for {
+		select {
+		case <-timeout.C:
+			return
+		default:
+		}
+		io.ReadFull(rand.Reader, scalar1[:])
+		io.ReadFull(rand.Reader, scalar2[:])
+		x := new(big.Int).SetBytes(scalar1[:])
+		y := new(big.Int).SetBytes(scalar2[:])
+		p256OrdMulTest(t, x, y, p, r)
+	}
+}
+
+func BenchmarkP256OrdMul(b *testing.B) {
+	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
+	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
+	var scalar1 [32]byte
+	io.ReadFull(rand.Reader, scalar1[:])
+	x := new(big.Int).SetBytes(scalar1[:])
+	x1 := new(big.Int).Mul(x, r)
+	x1 = x1.Mod(x1, p)
+	ax := new(p256OrdElement)
+	res := new(p256OrdElement)
+	ordFromBig(ax, x1)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		p256OrdMul(res, ax, ax)
+	}
+}
diff --git a/internal/sm2ec/p256_common_amd64.s b/internal/sm2ec/p256_common_amd64.s
index 614355b..c0ee797 100644
--- a/internal/sm2ec/p256_common_amd64.s
+++ b/internal/sm2ec/p256_common_amd64.s
@@ -876,7 +876,6 @@ TEXT ·p256OrdReduce(SB),NOSPLIT,$0
 
 // func p256OrdMul(res, in1, in2 *p256OrdElement)
 TEXT ·p256OrdMul(SB),NOSPLIT,$0
-	MOVQ res+0(FP), res_ptr
 	MOVQ in1+8(FP), x_ptr
 	MOVQ in2+16(FP), y_ptr
 	CMPB ·supportBMI2+0(SB), $0x01
@@ -1125,203 +1124,187 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
 	SBBQ DX, acc1
 	SBBQ $0, acc2
 
+	MOVQ res+0(FP), res_ptr
 	p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
 	RET
 
 ordMulBMI2:
+	XORQ acc5, acc5
+	XORQ res_ptr, res_ptr
 	// x * y[0]
 	MOVQ (8*0)(y_ptr), DX
 	MULXQ (8*0)(x_ptr), acc0, acc1
 
 	MULXQ (8*1)(x_ptr), AX, acc2
-	ADDQ AX, acc1
-	ADCQ $0, acc2
+	ADCXQ AX, acc1
 
 	MULXQ (8*2)(x_ptr), AX, acc3
-	ADDQ AX, acc2
-	ADCQ $0, acc3
+	ADCXQ AX, acc2
 
 	MULXQ (8*3)(x_ptr), AX, acc4
-	ADDQ AX, acc3
-	ADCQ $0, acc4
-
-	XORQ acc5, acc5
+	ADCXQ AX, acc3
+	ADCXQ acc5, acc4
 
 	// First reduction step
 	MOVQ acc0, DX
-	MULXQ p256ordK0<>(SB), t0, AX
+	MULXQ p256ordK0<>(SB), DX, AX
 
-	MOVQ t0, DX
-	MULXQ p256ord<>+0x00(SB), AX, BX
-	ADDQ AX, acc0
-	ADCQ BX, acc1
+	MULXQ p256ord<>+0x00(SB), AX, t0
+	ADOXQ AX, acc0
 
 	MULXQ p256ord<>+0x08(SB), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc1
-	ADCQ BX, acc2
-	ADCQ $0, acc3
-	ADCQ t0, acc4
-	ADCQ $0, acc5
+	ADCXQ t0, AX
+	ADOXQ AX, acc1
 
-	MOVQ t0, AX
-	//MOVQ t0, DX // This is not required due to t0=DX already
-	SHLQ $32, AX
-	SHRQ $32, DX
-
-	SUBQ t0, acc2
-	SBBQ AX, acc3
-	SBBQ DX, acc4
-	SBBQ $0, acc5
+	MULXQ p256ord<>+0x10(SB), AX, t0
+	ADCXQ BX, AX
+	ADOXQ AX, acc2
+
+	MULXQ p256ord<>+0x18(SB), AX, BX
+	ADCXQ t0, AX
+	ADOXQ AX, acc3
+
+	ADCXQ res_ptr, BX
+	ADOXQ BX, acc4
+	ADOXQ res_ptr, acc5
 
+	XORQ acc0, acc0
 	// x * y[1]
 	MOVQ (8*1)(y_ptr), DX
-	MULXQ (8*0)(x_ptr), AX, BX
-	ADDQ AX, acc1
-	ADCQ BX, acc2
+	MULXQ (8*0)(x_ptr), AX, t0
+	ADOXQ AX, acc1
 
-	MULXQ (8*1)(x_ptr), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc2
-	ADCQ BX, acc3
+	MULXQ (8*1)(x_ptr), AX, BX
+	ADCXQ t0, AX
+	ADOXQ AX, acc2
 
-	MULXQ (8*2)(x_ptr), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc3
-	ADCQ BX, acc4
+	MULXQ (8*2)(x_ptr), AX, t0
+	ADCXQ BX, AX
+	ADOXQ AX, acc3
 
 	MULXQ (8*3)(x_ptr), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc4
-	ADCQ BX, acc5
-	ADCQ $0, acc0
+	ADCXQ t0, AX
+	ADOXQ AX, acc4
+
+	ADCXQ acc0, BX
+	ADOXQ BX, acc5
+	ADOXQ res_ptr, acc0
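+	// NB: res_ptr was zeroed at ordMulBMI2 entry and serves as the zero
+	// register for both carry chains; res+0(FP) is reloaded just before
+	// the final reduction, which is why the load moved out of the prologue.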
 
 	// Second reduction step
 	MOVQ acc1, DX
-	MULXQ p256ordK0<>(SB), t0, AX
+	MULXQ p256ordK0<>(SB), DX, AX
 
-	MOVQ t0, DX
-	MULXQ p256ord<>+0x00(SB), AX, BX
-	ADDQ AX, acc1
-	ADCQ BX, acc2
+	MULXQ p256ord<>+0x00(SB), AX, t0
+	ADOXQ AX, acc1
 
 	MULXQ p256ord<>+0x08(SB), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc2
-	ADCQ BX, acc3
-	ADCQ $0, acc4
-	ADCQ t0, acc5
-	ADCQ $0, acc0
+	ADCXQ t0, AX
+	ADOXQ AX, acc2
 
-	MOVQ t0, AX
-	//MOVQ t0, DX // This is not required due to t0=DX already
-	SHLQ $32, AX
-	SHRQ $32, DX
-
-	SUBQ t0, acc3
-	SBBQ AX, acc4
-	SBBQ DX, acc5
-	SBBQ $0, acc0
+	MULXQ p256ord<>+0x10(SB), AX, t0
+	ADCXQ BX, AX
+	ADOXQ AX, acc3
+
+	MULXQ p256ord<>+0x18(SB), AX, BX
+	ADCXQ t0, AX
+	ADOXQ AX, acc4
+
+	ADCXQ res_ptr, BX
+	ADOXQ BX, acc5
+	ADOXQ res_ptr, acc0
 
+	XORQ acc1, acc1
 	// x * y[2]
 	MOVQ (8*2)(y_ptr), DX
-	MULXQ (8*0)(x_ptr), AX, BX
-	ADDQ AX, acc2
-	ADCQ BX, acc3
+	MULXQ (8*0)(x_ptr), AX, t0
+	ADOXQ AX, acc2
 
-	MULXQ (8*1)(x_ptr), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc3
-	ADCQ BX, acc4
+	MULXQ (8*1)(x_ptr), AX, BX
+	ADCXQ t0, AX
+	ADOXQ AX, acc3
 
-	MULXQ (8*2)(x_ptr), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc4
-	ADCQ BX, acc5
+	MULXQ (8*2)(x_ptr), AX, t0
+	ADCXQ BX, AX
+	ADOXQ AX, acc4
 
 	MULXQ (8*3)(x_ptr), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc5
-	ADCQ BX, acc0
-	ADCQ $0, acc1
+	ADCXQ t0, AX
+	ADOXQ AX, acc5
+
+	ADCXQ res_ptr, BX
+	ADOXQ BX, acc0
+	ADOXQ res_ptr, acc1
 
 	// Third reduction step
 	MOVQ acc2, DX
-	MULXQ p256ordK0<>(SB), t0, AX
+	MULXQ p256ordK0<>(SB), DX, AX
 
-	MOVQ t0, DX
-	MULXQ p256ord<>+0x00(SB), AX, BX
-	ADDQ AX, acc2
-	ADCQ BX, acc3
+	MULXQ p256ord<>+0x00(SB), AX, t0
+	ADOXQ AX, acc2
 
 	MULXQ p256ord<>+0x08(SB), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc3
-	ADCQ BX, acc4
-	ADCQ $0, acc5
-	ADCQ t0, acc0
-	ADCQ $0, acc1
+	ADCXQ t0, AX
+	ADOXQ AX, acc3
 
-	MOVQ t0, AX
-	//MOVQ t0, DX // This is not required due to t0=DX already
-	SHLQ $32, AX
-	SHRQ $32, DX
-
-	SUBQ t0, acc4
-	SBBQ AX, acc5
-	SBBQ DX, acc0
-	SBBQ $0, acc1
+	MULXQ p256ord<>+0x10(SB), AX, t0
+	ADCXQ BX, AX
+	ADOXQ AX, acc4
+
+	MULXQ p256ord<>+0x18(SB), AX, BX
+	ADCXQ t0, AX
+	ADOXQ AX, acc5
+
+	ADCXQ res_ptr, BX
+	ADOXQ BX, acc0
+	ADOXQ res_ptr, acc1
 
+	XORQ acc2, acc2
 	// x * y[3]
 	MOVQ (8*3)(y_ptr), DX
-	MULXQ (8*0)(x_ptr), AX, BX
-	ADDQ AX, acc3
-	ADCQ BX, acc4
+	MULXQ (8*0)(x_ptr), AX, t0
+	ADOXQ AX, acc3
 
-	MULXQ (8*1)(x_ptr), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc4
-	ADCQ BX, acc5
+	MULXQ (8*1)(x_ptr), AX, BX
+	ADCXQ t0, AX
+	ADOXQ AX, acc4
 
-	MULXQ (8*2)(x_ptr), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc5
-	ADCQ BX, acc0
+	MULXQ (8*2)(x_ptr), AX, t0
+	ADCXQ BX, AX
+	ADOXQ AX, acc5
 
 	MULXQ (8*3)(x_ptr), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc0
-	ADCQ BX, acc1
-	ADCQ $0, acc2
+	ADCXQ t0, AX
+	ADOXQ AX, acc0
+
+	ADCXQ res_ptr, BX
+	ADOXQ BX, acc1
+	ADOXQ res_ptr, acc2
 
 	// Last reduction step
 	MOVQ acc3, DX
-	MULXQ p256ordK0<>(SB), t0, AX
+	MULXQ p256ordK0<>(SB), DX, AX
 
-	MOVQ t0, DX
-	MULXQ p256ord<>+0x00(SB), AX, BX
-	ADDQ AX, acc3
-	ADCQ BX, acc4
+	MULXQ p256ord<>+0x00(SB), AX, t0
+	ADOXQ AX, acc3
 
 	MULXQ p256ord<>+0x08(SB), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc4
-	ADCQ BX, acc5
-	ADCQ $0, acc0
-	ADCQ t0, acc1
-	ADCQ $0, acc2
+	ADCXQ t0, AX
+	ADOXQ AX, acc4
 
-	MOVQ t0, AX
-	//MOVQ t0, DX // This is not required due to t0=DX already
-	SHLQ $32, AX
-	SHRQ $32, DX
-
-	SUBQ t0, acc5
-	SBBQ AX, acc0
-	SBBQ DX, acc1
-	SBBQ $0, acc2
+	MULXQ p256ord<>+0x10(SB), AX, t0
+	ADCXQ BX, AX
+	ADOXQ AX, acc5
+	MULXQ p256ord<>+0x18(SB), AX, BX
+	ADCXQ t0, AX
+	ADOXQ AX, acc0
+
+	ADCXQ res_ptr, BX
+	ADOXQ BX, acc1
+	ADOXQ res_ptr, acc2
+
+	MOVQ res+0(FP), res_ptr
 	p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
 	RET
diff --git a/internal/sm2ec/p256_plugin_amd64.s b/internal/sm2ec/p256_plugin_amd64.s
index 3223598..a35b5f2 100644
--- a/internal/sm2ec/p256_plugin_amd64.s
+++ b/internal/sm2ec/p256_plugin_amd64.s
@@ -406,176 +406,161 @@ ordSqrLoop:
 	RET
 
 ordSqrLoopBMI2:
+	XORQ acc0, acc0
+	XORQ y_ptr, y_ptr
 	// y[1:] * y[0]
 	MOVQ (8*0)(x_ptr), DX
 	MULXQ (8*1)(x_ptr), acc1, acc2
 
 	MULXQ (8*2)(x_ptr), AX, acc3
-	ADDQ AX, acc2
-	ADCQ $0, acc3
+	ADOXQ AX, acc2
 
 	MULXQ (8*3)(x_ptr), AX, acc4
-	ADDQ AX, acc3
-	ADCQ $0, acc4
+	ADOXQ AX, acc3
+	ADOXQ y_ptr, acc4
 
 	// y[2:] * y[1]
 	MOVQ (8*1)(x_ptr), DX
 	MULXQ (8*2)(x_ptr), AX, BX
-	ADDQ AX, acc3
-	ADCQ BX, acc4
+	ADOXQ AX, acc3
 
 	MULXQ (8*3)(x_ptr), AX, acc5
-	ADCQ $0, acc5
-	ADDQ AX, acc4
-	ADCQ $0, acc5
+	ADCXQ BX, AX
+	ADOXQ AX, acc4
+	ADCXQ y_ptr, acc5
 
 	// y[3] * y[2]
 	MOVQ (8*2)(x_ptr), DX
 	MULXQ (8*3)(x_ptr), AX, y_ptr
-	ADDQ AX, acc5
-	ADCQ $0, y_ptr
+	ADOXQ AX, acc5
+	ADOXQ acc0, y_ptr
 
 	XORQ BX, BX
 	// *2
-	ADDQ acc1, acc1
-	ADCQ acc2, acc2
-	ADCQ acc3, acc3
-	ADCQ acc4, acc4
-	ADCQ acc5, acc5
-	ADCQ y_ptr, y_ptr
-	ADCQ $0, BX
+	ADOXQ acc1, acc1
+	ADOXQ acc2, acc2
+	ADOXQ acc3, acc3
+	ADOXQ acc4, acc4
+	ADOXQ acc5, acc5
+	ADOXQ y_ptr, y_ptr
+	ADOXQ acc0, BX
 
 	// Missing products
 	MOVQ (8*0)(x_ptr), DX
 	MULXQ DX, acc0, t0
-	ADDQ t0, acc1
+	ADCXQ t0, acc1
 
 	MOVQ (8*1)(x_ptr), DX
 	MULXQ DX, AX, t0
-	ADCQ AX, acc2
-	ADCQ t0, acc3
+	ADCXQ AX, acc2
+	ADCXQ t0, acc3
 
 	MOVQ (8*2)(x_ptr), DX
 	MULXQ DX, AX, t0
-	ADCQ AX, acc4
-	ADCQ t0, acc5
+	ADCXQ AX, acc4
+	ADCXQ t0, acc5
 
 	MOVQ (8*3)(x_ptr), DX
 	MULXQ DX, AX, x_ptr
-	ADCQ AX, y_ptr
-	ADCQ BX, x_ptr
+	ADCXQ AX, y_ptr
+	ADCXQ BX, x_ptr
 
 	// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
-	// First reduction step, [ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0]
+	// First reduction step
 	MOVQ acc0, DX
-	MULXQ p256ordK0<>(SB), t0, AX
-	// calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0]
-	// the result is [acc0, acc3, acc2, acc1], last lowest limb is dropped.
-	MOVQ t0, DX // Y = t0 = (k0 * acc0) mod 2^64
-	MULXQ p256ord<>+0x00(SB), AX, BX
-	ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
-	ADCQ BX, acc1 // (carry2, acc1) = acc1 + H(t0 * ord0) + carry1
-	MOVQ t0, acc0 // acc0 = t0
+	MULXQ p256ordK0<>(SB), DX, AX
+
+	MULXQ p256ord<>+0x00(SB), AX, t0
+	ADOXQ AX, acc0 // (carry1, acc0) = acc0 + L(m * ord0), m = k0 * acc0 in DX
 
 	MULXQ p256ord<>+0x08(SB), AX, BX
-	ADCQ $0, BX // BX = carry2 + H(t0*ord1)
-	ADDQ AX, acc1 // (carry3, acc1) = acc1 + L(t0*ord1)
-	ADCQ BX, acc2 // (carry4, acc2) = acc2 + BX + carry3
-	ADCQ $0, acc3 // (carry5, acc3) = acc3 + carry4
-	ADCQ $0, acc0 // acc0 = t0 + carry5
-	// calculate the positive part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
-	MOVQ t0, AX
-	//MOVQ t0, DX // This is not required due to t0=DX already
-	SHLQ $32, AX
-	SHRQ $32, DX
+	ADCXQ t0, AX
+	ADOXQ AX, acc1
 
-	SUBQ t0, acc2
-	SBBQ AX, acc3
-	SBBQ DX, acc0
+	MULXQ p256ord<>+0x10(SB), AX, t0
+	ADCXQ BX, AX
+	ADOXQ AX, acc2
+
+	MULXQ p256ord<>+0x18(SB), AX, acc0
+	ADCXQ t0, AX
+	ADOXQ AX, acc3
+	MOVQ $0, t0
+	ADCXQ t0, acc0
+	ADOXQ t0, acc0
 
 	// Second reduction step
 	MOVQ acc1, DX
-	MULXQ p256ordK0<>(SB), t0, AX
+	MULXQ p256ordK0<>(SB), DX, AX
 
-	MOVQ t0, DX
-	MULXQ p256ord<>+0x00(SB), AX, BX
-	ADDQ AX, acc1
-	ADCQ BX, acc2
-	MOVQ t0, acc1
+	MULXQ p256ord<>+0x00(SB), AX, t0
+	ADOXQ AX, acc1
 
 	MULXQ p256ord<>+0x08(SB), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc2
-	ADCQ BX, acc3
-	ADCQ $0, acc0
-	ADCQ $0, acc1
+	ADCXQ t0, AX
+	ADOXQ AX, acc2
 
-	MOVQ t0, AX
-	//MOVQ t0, DX // This is not required due to t0=DX already
-	SHLQ $32, AX
-	SHRQ $32, DX
+	MULXQ p256ord<>+0x10(SB), AX, t0
+	ADCXQ BX, AX
+	ADOXQ AX, acc3
+
+	MULXQ p256ord<>+0x18(SB), AX, acc1
+	ADCXQ t0, AX
+	ADOXQ AX, acc0
+	MOVQ $0, t0
+	ADCXQ t0, acc1
+	ADOXQ t0, acc1
 
-	SUBQ t0, acc3
-	SBBQ AX, acc0
-	SBBQ DX, acc1
 
 	// Third reduction step
 	MOVQ acc2, DX
-	MULXQ p256ordK0<>(SB), t0, AX
+	MULXQ p256ordK0<>(SB), DX, AX
 
-	MOVQ t0, DX
-	MULXQ p256ord<>+0x00(SB), AX, BX
-	ADDQ AX, acc2
-	ADCQ BX, acc3
-	MOVQ t0, acc2
+	MULXQ p256ord<>+0x00(SB), AX, t0
+	ADOXQ AX, acc2
 
 	MULXQ p256ord<>+0x08(SB), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc3
-	ADCQ BX, acc0
-	ADCQ $0, acc1
-	ADCQ $0, acc2
+	ADCXQ t0, AX
+	ADOXQ AX, acc3
 
-	MOVQ t0, AX
-	//MOVQ t0, DX // This is not required due to t0=DX already
-	SHLQ $32, AX
-	SHRQ $32, DX
+	MULXQ p256ord<>+0x10(SB), AX, t0
+	ADCXQ BX, AX
+	ADOXQ AX, acc0
+
+	MULXQ p256ord<>+0x18(SB), AX, acc2
+	ADCXQ t0, AX
+	ADOXQ AX, acc1
+	MOVQ $0, t0
+	ADCXQ t0, acc2
+	ADOXQ t0, acc2
 
-	SUBQ t0, acc0
-	SBBQ AX, acc1
-	SBBQ DX, acc2
 
 	// Last reduction step
 	MOVQ acc3, DX
-	MULXQ p256ordK0<>(SB), t0, AX
+	MULXQ p256ordK0<>(SB), DX, AX
 
-	MOVQ t0, DX
-	MULXQ p256ord<>+0x00(SB), AX, BX
-	ADDQ AX, acc3
-	ADCQ BX, acc0
-	MOVQ t0, acc3
+	MULXQ p256ord<>+0x00(SB), AX, t0
+	ADOXQ AX, acc3
 
 	MULXQ p256ord<>+0x08(SB), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc0
-	ADCQ BX, acc1
-	ADCQ $0, acc2
-	ADCQ $0, acc3
+	ADCXQ t0, AX
+	ADOXQ AX, acc0
 
-	MOVQ t0, AX
-	//MOVQ t0, DX // This is not required due to t0=DX already
-	SHLQ $32, AX
-	SHRQ $32, DX
+	MULXQ p256ord<>+0x10(SB), AX, t0
+	ADCXQ BX, AX
+	ADOXQ AX, acc1
 
-	SUBQ t0, acc1
-	SBBQ AX, acc2
-	SBBQ DX, acc3
+	MULXQ p256ord<>+0x18(SB), AX, acc3
+	ADCXQ t0, AX
+	ADOXQ AX, acc2
+	MOVQ $0, t0
+	ADCXQ t0, acc3
+	ADOXQ t0, acc3
 
-	XORQ t0, t0
+	XORQ BX, BX
 	// Add bits [511:256] of the sqr result
-	ADCQ acc4, acc0
-	ADCQ acc5, acc1
-	ADCQ y_ptr, acc2
-	ADCQ x_ptr, acc3
-	ADCQ $0, t0
+	ADCXQ acc4, acc0
+	ADCXQ acc5, acc1
+	ADCXQ y_ptr, acc2
+	ADCXQ x_ptr, acc3
+	ADCXQ BX, t0
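+	// As in p256_asm_amd64.s: t0 picks up the final carry here, and
+	// p256OrdReduceInline folds it back in modulo ord.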
 
 	p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr)
 	MOVQ res_ptr, x_ptr
diff --git a/internal/sm2ec/sm2p256_asm.go b/internal/sm2ec/sm2p256_asm.go
index 5efd1b6..e1c8528 100644
--- a/internal/sm2ec/sm2p256_asm.go
+++ b/internal/sm2ec/sm2p256_asm.go
@@ -302,7 +302,9 @@ func p256Sqrt(e, x *p256Element) (isSquare bool) {
 }
 
 // The following assembly functions are implemented in p256_asm_*.s
-var supportBMI2 = cpu.X86.HasBMI2
+
+// The amd64 assembly uses MULX (BMI2) and ADCX/ADOX (ADX), so both features are required.
+var supportBMI2 = cpu.X86.HasADX && cpu.X86.HasBMI2
 
 var supportAVX2 = cpu.X86.HasAVX2
diff --git a/internal/sm2ec/sm2p256_asm_test.go b/internal/sm2ec/sm2p256_asm_test.go
index 576b82f..52b79af 100644
--- a/internal/sm2ec/sm2p256_asm_test.go
+++ b/internal/sm2ec/sm2p256_asm_test.go
@@ -83,6 +83,23 @@ func TestFuzzyP256Mul(t *testing.T) {
 	}
 }
 
+func BenchmarkP256Mul(b *testing.B) {
+	p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
+	r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16)
+	var scalar1 [32]byte
+	io.ReadFull(rand.Reader, scalar1[:])
+	x := new(big.Int).SetBytes(scalar1[:])
+	x1 := new(big.Int).Mul(x, r)
+	x1 = x1.Mod(x1, p)
+	ax := new(p256Element)
+	res := new(p256Element)
+	fromBig(ax, x1)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		p256Mul(res, ax, ax)
+	}
+}
+
 func p256SqrTest(t *testing.T, x, p, r *big.Int) {
 	x1 := new(big.Int).Mul(x, r)
 	x1 = x1.Mod(x1, p)
@@ -142,6 +159,7 @@ func BenchmarkP256Sqr(b *testing.B) {
 	ax := new(p256Element)
 	res := new(p256Element)
 	fromBig(ax, x1)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
 		p256Sqr(res, ax, 20)
 	}
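
Note on the pattern used throughout this change: MULX multiplies without touching flags, ADCX adds through CF only, and ADOX adds through OF only, so each row of the multiplication can keep two independent carry chains in flight. The portable Go sketch below models a single Montgomery reduction step of the order field for reference. It is illustrative only, not code from this patch: the names montReduceStep and negInv64 are invented here, and the two hardware carry chains are serialized into one carry variable.

package main

import (
	"fmt"
	"math/bits"
)

// The SM2 group order n, least-significant 64-bit limb first
// (the same limbs stored at p256ord<> in the assembly).
var ord = [4]uint64{0x53bbf40939d54123, 0x7203df6b21c6052b, 0xffffffffffffffff, 0xfffffffeffffffff}

// negInv64 returns -n0^-1 mod 2^64 for odd n0 (the p256ordK0 constant),
// by Newton-Hensel iteration: each pass doubles the bits of precision.
func negInv64(n0 uint64) uint64 {
	inv := n0 // n0*n0 == 1 (mod 8) for any odd n0
	for i := 0; i < 5; i++ {
		inv *= 2 - n0*inv
	}
	return -inv
}

// montReduceStep returns (acc + m*n) >> 64 with m = acc[0]*k0 mod 2^64,
// which zeroes the lowest limb by construction. The assembly spreads these
// additions over the CF (ADCXQ) and OF (ADOXQ) chains; here they run
// sequentially through a single carry variable.
func montReduceStep(acc [5]uint64) [5]uint64 {
	m := acc[0] * negInv64(ord[0]) // like MULXQ p256ordK0<>(SB), DX, AX
	var carry uint64
	for i := 0; i < 4; i++ {
		hi, lo := bits.Mul64(m, ord[i]) // like MULXQ p256ord<>+8*i(SB), AX, ...
		lo, c := bits.Add64(lo, carry, 0)
		hi += c // cannot wrap: hi <= 2^64-2 after a 64x64 multiply
		acc[i], c = bits.Add64(acc[i], lo, 0)
		carry = hi + c
	}
	top, c := bits.Add64(acc[4], carry, 0)
	return [5]uint64{acc[1], acc[2], acc[3], top, c} // acc[0] is now zero; shift down
}

func main() {
	acc := [5]uint64{0xdeadbeefcafef00d, 1, 2, 3, 4}
	m := acc[0] * negInv64(ord[0])
	fmt.Printf("low limb after folding: %#x\n", acc[0]+m*ord[0]) // prints 0x0
	fmt.Printf("one reduction step:     %#x\n", montReduceStep(acc))
}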