//go:build (!amd64 && !arm64) || purego
// +build !amd64,!arm64 purego

package bn256

import (
	"math/bits"
)

// gfpCarry conditionally subtracts the modulus p2 from a, in place.
//
// head is the carry-out of the operation that produced a; callers in this
// file pass either 0 or the carry bit returned by bits.Add64. The difference
// a - p2 replaces a when the subtraction did not borrow, or when head is set.
// Otherwise a is left untouched. The choice is made with bit masks rather
// than a branch.
func gfpCarry(a *gfP, head uint64) {
	var diff gfP

	var borrow uint64
	for i := range p2 {
		diff[i], borrow = bits.Sub64(a[i], p2[i], borrow)
	}

	// keepA is all ones when the subtraction borrowed and head is clear
	// (a - p2 would be negative, so a stays), and all zeros otherwise.
	keepA := -(borrow &^ head)
	for i := 0; i < 4; i++ {
		a[i] = (a[i] & keepA) | (diff[i] &^ keepA)
	}
}

// gfpNeg sets c = p2 - a and then runs the result through gfpCarry.
// c and a may point to the same value: each limb of a is read before the
// matching limb of c is written.
func gfpNeg(c, a *gfP) {
	var borrow uint64
	for i := range p2 {
		c[i], borrow = bits.Sub64(p2[i], a[i], borrow)
	}

	// When a is zero the difference is p2 itself, so one more conditional
	// subtraction is needed to bring it back to zero. This matters because
	// bn256 treats the point at infinity as a valid point.
	gfpCarry(c, 0)
}

// gfpAdd sets c = a + b. The limbs are added with carry propagation and the
// final carry-out is handed to gfpCarry, which subtracts p2 once if needed.
// c may alias a and/or b.
func gfpAdd(c, a, b *gfP) {
	var overflow uint64
	for i := range a {
		c[i], overflow = bits.Add64(a[i], b[i], overflow)
	}
	gfpCarry(c, overflow)
}

// gfpDouble sets c = a + a by delegating to gfpAdd, so the result goes
// through the same conditional subtraction of p2 that gfpAdd applies.
func gfpDouble(c, a *gfP) {
	gfpAdd(c, a, a)
}

// gfpTriple sets c = a + a + a using two gfpAdd calls. The intermediate
// double is kept in a local scratch value, so c may alias a.
func gfpTriple(c, a *gfP) {
	var twice gfP
	gfpAdd(&twice, a, a)
	gfpAdd(c, &twice, a)
}

// gfpSub sets c = a - b. The raw limb-wise difference is computed first; if
// that subtraction borrowed, p2 is added back. Both candidates are always
// computed and the result is chosen with a bit mask rather than a branch.
// c may alias a and/or b.
func gfpSub(c, a, b *gfP) {
	var borrow uint64
	for i := range a {
		c[i], borrow = bits.Sub64(a[i], b[i], borrow)
	}

	// wrapped = c + p2; the carry out of the top limb is intentionally dropped.
	var wrapped gfP
	var carry uint64
	for i := range p2 {
		wrapped[i], carry = bits.Add64(p2[i], c[i], carry)
	}

	// useWrapped is all ones when a - b borrowed, all zeros otherwise.
	useWrapped := -borrow
	for i := range c {
		c[i] = (wrapped[i] & useWrapped) | (c[i] &^ useWrapped)
	}
}

// addMulVVW computes z += x * y, where z and x are little-endian multi-word
// values and y is a single word, and returns the word carried out of the top
// of z. Only the first len(z) words of x are used; x must be at least as long
// as z. This is one row of a schoolbook column multiplication.
func addMulVVW(z, x []uint64, y uint64) (carry uint64) {
	_ = x[len(z)-1] // bounds check elimination hint
	for i, zi := range z {
		high, low := bits.Mul64(x[i], y)

		// Add the existing word of z, then the carry from the previous
		// column. Each low-word overflow is folded into high with
		// bits.Add64(high, 0, c) so that an add-with-carry instruction can
		// absorb the carry of the preceding bits.Add64.
		var c0, c1 uint64
		low, c0 = bits.Add64(low, zi, 0)
		high, _ = bits.Add64(high, 0, c0)
		low, c1 = bits.Add64(low, carry, 0)
		high, _ = bits.Add64(high, 0, c1)

		z[i], carry = low, high
	}
	return carry
}

// gfpMul sets c to the Montgomery product of a and b with respect to the
// modulus p2, followed by the final conditional subtraction in gfpCarry.
//
// It implements Word-by-Word Montgomery Multiplication, as described in
// Algorithm 4 (Fig. 3) of "Efficient Software Implementations of Modular
// Exponentiation" by Shay Gueron [https://eprint.iacr.org/2011/239.pdf].
//
// c is written only after all limbs of a and b have been consumed, so it may
// alias either operand.
func gfpMul(c, a, b *gfP) {
	var (
		acc   [8]uint64 // double-width accumulator, the T of the algorithm
		carry uint64    // carry bit brought forward between iterations
	)

	// Step 1 of the algorithm (T = a × b) on its own would be a plain
	// schoolbook multiplication:
	//
	//   for i := 0; i < n; i++ {
	//       d := bLimbs[i]
	//       T[n+i] = addMulVVW(T[i:n+i], aLimbs, d)
	//   }
	//
	// Row i of that product never touches T[i] again once it has been added,
	// and the reduction (Steps 2–6) only looks at T[i] in Step 2 before
	// discarding it in Step 6. The two loops are therefore merged: each
	// iteration adds one row of the product and immediately reduces it.
	for i := 0; i < 4; i++ {
		// Step 6 (the word shift) is done by sliding this window instead of
		// moving data: the algorithm's T is acc[i:] here.
		window := acc[i : i+4]

		// One row of a × b, using limb i of b as the multiplier digit.
		mulCarry := addMulVVW(window, a[:], b[i])

		// Step 2: T mod 2^_W is simply the lowest word of the window.
		// Step 3: np[0] plays the role of k0.
		m := window[0] * np[0]

		// Steps 4 and 5: add m × p2 to T. The carries out of the two rows
		// are summed into the next accumulator word, and the carry bit of
		// that sum is brought forward to the next iteration.
		redCarry := addMulVVW(window, p2[:], m)
		acc[i+4], carry = bits.Add64(mulCarry, redCarry, carry)
	}

	// The upper half of the accumulator holds the result before the final
	// conditional subtraction; carry is its overflow bit.
	*c = gfP{acc[4], acc[5], acc[6], acc[7]}
	gfpCarry(c, carry)
}

// gfpSqr squares in repeatedly using gfpMul and stores the result in res.
// The first squaring always happens; for n > 1 it is followed by n-1 further
// squarings of res, so any n <= 1 yields exactly one squaring.
func gfpSqr(res, in *gfP, n int) {
	gfpMul(res, in, in)
	for k := n; k > 1; k-- {
		gfpMul(res, res, res)
	}
}

// gfpFromMont runs only the reduction half of the loop in gfpMul on in: the
// accumulator starts as in with zeroed upper words, and each round adds the
// multiple of p2 selected by np[0] to the sliding window. The upper half of
// the accumulator, after gfpCarry, is stored in res. res may alias in.
func gfpFromMont(res, in *gfP) {
	var acc [8]uint64
	copy(acc[:], in[:]) // words not covered by the copy stay zero

	var carry uint64
	for i := 0; i < 4; i++ {
		window := acc[i : i+4]
		m := window[0] * np[0]
		redCarry := addMulVVW(window, p2[:], m)
		// No product row is added here, so only the reduction carry and the
		// carry bit from the previous round feed the next word.
		acc[i+4], carry = bits.Add64(0, redCarry, carry)
	}

	*res = gfP{acc[4], acc[5], acc[6], acc[7]}
	gfpCarry(res, carry)
}

// gfpMarshal writes the four limbs of in to out as 32 big-endian bytes:
// out[0] is the most significant byte of in[3] and out[31] is the least
// significant byte of in[0].
func gfpMarshal(out *[32]byte, in *gfP) {
	for idx := range out {
		limb := in[3-idx/8]           // limbs are stored least significant first
		shift := uint(7-idx%8) * 8 // 56 for the first byte of a limb, 0 for the last
		out[idx] = byte(limb >> shift)
	}
}

// gfpUnmarshal reads 32 big-endian bytes from in into the four limbs of out:
// in[0:8] becomes out[3] (most significant limb) and in[24:32] becomes
// out[0]. Every limb of out is overwritten. No range check against the
// modulus is performed here.
func gfpUnmarshal(out *gfP, in *[32]byte) {
	for w := 0; w < 4; w++ {
		var limb uint64
		for _, octet := range in[8*w : 8*w+8] {
			limb = limb<<8 | uint64(octet)
		}
		out[3-w] = limb
	}
}