sm9/bn256: re-implement gfp generic

2025-10-14 23:30:48 +08:00 · 2023-06-20 17:48:04 +08:00 · 2023-06-20 17:48:04 +08:00 · 0ad84822fe
commit 0ad84822fe
parent 392e5fb762
1 changed file with 74 additions and 121 deletions
--- a/sm9/bn256/gfp_generic.go
+++ b/sm9/bn256/gfp_generic.go
@@ -1,17 +1,18 @@
-//go:build !amd64 && !arm64 || purego
+//go:build (!amd64 && !arm64) || purego
 // +build !amd64,!arm64 purego
 package bn256
 import (
 	"math/bits"
 )
 func gfpCarry(a *gfP, head uint64) {
 	b := &gfP{}
 	var carry uint64
 	for i, pi := range p2 {
-		ai := a[i]
+		b[i], carry = bits.Sub64(a[i], pi, carry)
 		bi := ai - pi - carry
 		b[i] = bi
 		carry = (pi&^ai | (pi|^ai)&bi) >> 63
 	}
 	carry = carry &^ head
@@ -27,21 +28,16 @@ func gfpCarry(a *gfP, head uint64) {
 func gfpNeg(c, a *gfP) {
 	var carry uint64
 	for i, pi := range p2 {
-		ai := a[i]
+		c[i], carry = bits.Sub64(pi, a[i], carry)
 		ci := pi - ai - carry
 		c[i] = ci
 		carry = (ai&^pi | (ai|^pi)&ci) >> 63
 	}
-	gfpCarry(c, 0)
+	// Is this required?
 	//gfpCarry(c, 0)
 }
 func gfpAdd(c, a, b *gfP) {
 	var carry uint64
 	for i, ai := range a {
-		bi := b[i]
+		c[i], carry = bits.Add64(ai, b[i], carry)
 		ci := ai + bi + carry
 		c[i] = ci
 		carry = (ai&bi | (ai|bi)&^ci) >> 63
 	}
 	gfpCarry(c, carry)
 }
@@ -49,124 +45,81 @@ func gfpAdd(c, a, b *gfP) {
 func gfpSub(c, a, b *gfP) {
 	t := &gfP{}
-	var carry uint64
+	var carry, underflow uint64
 	for i, ai := range a {
 		c[i], underflow = bits.Sub64(ai, b[i], underflow)
 	}
 	for i, pi := range p2 {
-		bi := b[i]
+		t[i], carry = bits.Add64(pi, c[i], carry)
 		ti := pi - bi - carry
 		t[i] = ti
 		carry = (bi&^pi | (bi|^pi)&ti) >> 63
 	}
-	carry = 0
+	mask := -underflow
-	for i, ai := range a {
+	for i, ci := range c {
-		ti := t[i]
+		c[i] ^= mask & (ci ^ t[i])
 		ci := ai + ti + carry
 		c[i] = ci
 		carry = (ai&ti | (ai|ti)&^ci) >> 63
 	}
 	gfpCarry(c, carry)
 }
-func mul(a, b [4]uint64) [8]uint64 {
+// addMulVVW multiplies the multi-word value x by the single-word value y,
-	const (
+// adding the result to the multi-word value z and returning the final carry.
-		mask16 uint64 = 0x0000ffff
+// It can be thought of as one row of a pen-and-paper column multiplication.
-		mask32 uint64 = 0xffffffff
+func addMulVVW(z, x []uint64, y uint64) (carry uint64) {
-	)
+	_ = x[len(z)-1] // bounds check elimination hint
-
+	for i := range z {
-	var buff [32]uint64
+		hi, lo := bits.Mul64(x[i], y)
-	for i, ai := range a {
+		lo, c := bits.Add64(lo, z[i], 0)
-		a0, a1, a2, a3 := ai&mask16, (ai>>16)&mask16, (ai>>32)&mask16, ai>>48
+		// We use bits.Add with zero to get an add-with-carry instruction that
-
+		// absorbs the carry from the previous bits.Add.
-		for j, bj := range b {
+		hi, _ = bits.Add64(hi, 0, c)
-			b0, b2 := bj&mask32, bj>>32
+		lo, c = bits.Add64(lo, carry, 0)
-
+		hi, _ = bits.Add64(hi, 0, c)
-			off := 4 * (i + j)
+		carry = hi
-			buff[off+0] += a0 * b0
+		z[i] = lo
 			buff[off+1] += a1 * b0
 			buff[off+2] += a2*b0 + a0*b2
 			buff[off+3] += a3*b0 + a1*b2
 			buff[off+4] += a2 * b2
 			buff[off+5] += a3 * b2
 		}
 	}
-
+	return carry
 	for i := uint(1); i < 4; i++ {
 		shift := 16 * i
 		var head, carry uint64
 		for j := uint(0); j < 8; j++ {
 			block := 4 * j
 			xi := buff[block]
 			yi := (buff[block+i] << shift) + head
 			zi := xi + yi + carry
 			buff[block] = zi
 			carry = (xi&yi | (xi|yi)&^zi) >> 63
 			head = buff[block+i] >> (64 - shift)
 		}
 	}
 	return [8]uint64{buff[0], buff[4], buff[8], buff[12], buff[16], buff[20], buff[24], buff[28]}
 }
 func halfMul(a, b [4]uint64) [4]uint64 {
 	const (
 		mask16 uint64 = 0x0000ffff
 		mask32 uint64 = 0xffffffff
 	)
 	var buff [18]uint64
 	for i, ai := range a {
 		a0, a1, a2, a3 := ai&mask16, (ai>>16)&mask16, (ai>>32)&mask16, ai>>48
 		for j, bj := range b {
 			if i+j > 3 {
 				break
 			}
 			b0, b2 := bj&mask32, bj>>32
 			off := 4 * (i + j)
 			buff[off+0] += a0 * b0
 			buff[off+1] += a1 * b0
 			buff[off+2] += a2*b0 + a0*b2
 			buff[off+3] += a3*b0 + a1*b2
 			buff[off+4] += a2 * b2
 			buff[off+5] += a3 * b2
 		}
 	}
 	for i := uint(1); i < 4; i++ {
 		shift := 16 * i
 		var head, carry uint64
 		for j := uint(0); j < 4; j++ {
 			block := 4 * j
 			xi := buff[block]
 			yi := (buff[block+i] << shift) + head
 			zi := xi + yi + carry
 			buff[block] = zi
 			carry = (xi&yi | (xi|yi)&^zi) >> 63
 			head = buff[block+i] >> (64 - shift)
 		}
 	}
 	return [4]uint64{buff[0], buff[4], buff[8], buff[12]}
 }
 func gfpMul(c, a, b *gfP) {
-	T := mul(*a, *b)
+	var T [8]uint64
-	m := halfMul([4]uint64{T[0], T[1], T[2], T[3]}, np)
+	// This loop implements Word-by-Word Montgomery Multiplication, as
-	t := mul([4]uint64{m[0], m[1], m[2], m[3]}, p2)
+	// described in Algorithm 4 (Fig. 3) of "Efficient Software
-
+	// Implementations of Modular Exponentiation" by Shay Gueron
 	// [https://eprint.iacr.org/2011/239.pdf].
 	var carry uint64
-	for i, Ti := range T {
+	for i := 0; i < 4; i++ {
-		ti := t[i]
+		// Step 1 (T = a × b) is computed as a large pen-and-paper column
-		zi := Ti + ti + carry
+		// multiplication of two numbers with n base-2^_W digits. If we just
-		T[i] = zi
+		// wanted to produce 2n-wide T, we would do
-		carry = (Ti&ti | (Ti|ti)&^zi) >> 63
+		//
 		//   for i := 0; i < n; i++ {
 		//       d := bLimbs[i]
 		//       T[n+i] = addMulVVW(T[i:n+i], aLimbs, d)
 		//   }
 		//
 		// where d is a digit of the multiplier, T[i:n+i] is the shifted
 		// position of the product of that digit, and T[n+i] is the final carry.
 		// Note that T[i] isn't modified after processing the i-th digit.
 		//
 		// Instead of running two loops, one for Step 1 and one for Steps 2–6,
 		// the result of Step 1 is computed during the next loop. This is
 		// possible because each iteration only uses T[i] in Step 2 and then
 		// discards it in Step 6.
 		d := b[i]
 		c1 := addMulVVW(T[i:4+i], a[:], d)
 		// Step 6 is replaced by shifting the virtual window we operate
 		// over: T of the algorithm is T[i:] for us. That means that T1 in
 		// Step 2 (T mod 2^_W) is simply T[i]. k0 in Step 3 is our np[0].
 		Y := T[i] * np[0]
 		// Steps 4 and 5 add Y × p2 (the modulus) to T, which as mentioned above is stored
 		// at T[i:]. The two carries (from a × d and Y × m) are added up in
 		// the next word T[n+i], and the carry bit from that addition is
 		// brought forward to the next iteration.
 		c2 := addMulVVW(T[i:4+i], p2[:], Y)
 		T[4+i], carry = bits.Add64(c1, c2, carry)
 	}
 	*c = gfP{T[4], T[5], T[6], T[7]}