sm9/bn256: use ADCX ADOX together with MULX #132

This commit is contained in:
Sun Yimin 2023-06-28 17:38:05 +08:00 committed by GitHub
parent ce489e2b4b
commit 0afaeb49eb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 228 additions and 202 deletions

View File

@ -136,198 +136,192 @@ TEXT ·gfpSub(SB),0,$0-24
RET RET
TEXT ·gfpMul(SB),0,$0-24 TEXT ·gfpMul(SB),0,$0-24
MOVQ res+0(FP), res_ptr
MOVQ in1+8(FP), x_ptr MOVQ in1+8(FP), x_ptr
MOVQ in2+16(FP), y_ptr MOVQ in2+16(FP), y_ptr
CMPB ·hasBMI2(SB), $0 CMPB ·supportADX(SB), $0
JE nobmi2Mul JE noAdxMul
XORQ acc5, acc5
XORQ res_ptr, res_ptr
// x * y[0] // x * y[0]
MOVQ (8*0)(y_ptr), DX MOVQ (8*0)(y_ptr), DX
MULXQ (8*0)(x_ptr), acc0, acc1 MULXQ (8*0)(x_ptr), acc0, acc1
MULXQ (8*1)(x_ptr), AX, acc2 MULXQ (8*1)(x_ptr), AX, acc2
ADDQ AX, acc1 ADCXQ AX, acc1
ADCQ $0, acc2
MULXQ (8*2)(x_ptr), AX, acc3 MULXQ (8*2)(x_ptr), AX, acc3
ADDQ AX, acc2 ADCXQ AX, acc2
ADCQ $0, acc3
MULXQ (8*3)(x_ptr), AX, acc4 MULXQ (8*3)(x_ptr), AX, acc4
ADDQ AX, acc3 ADCXQ AX, acc3
ADCQ $0, acc4 ADCXQ acc5, acc4
XORQ acc5, acc5
// First reduction step // First reduction step
MOVQ acc0, DX MOVQ acc0, DX
MULXQ ·np+0x00(SB), DX, AX MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1 MULXQ ·p2+0x00(SB), AX, t0
ADDQ AX, acc0 ADOXQ AX, acc0
ADCQ t1, acc1
MULXQ ·p2+0x08(SB), AX, t1 MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1 ADCXQ t0, AX
ADDQ AX, acc1 ADOXQ AX, acc1
ADCQ t1, acc2
MULXQ ·p2+0x10(SB), AX, t1 MULXQ ·p2+0x10(SB), AX, t0
ADCQ $0, t1 ADCXQ t1, AX
ADDQ AX, acc2 ADOXQ AX, acc2
ADCQ t1, acc3
MULXQ ·p2+0x18(SB), AX, t1 MULXQ ·p2+0x18(SB), AX, t1
ADCQ $0, t1 ADCXQ t0, AX
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ t1, acc4
ADCQ $0, acc5 ADCXQ res_ptr, t1
ADOXQ t1, acc4
ADOXQ res_ptr, acc5
XORQ acc0, acc0
// x * y[1] // x * y[1]
MOVQ (8*1)(y_ptr), DX MOVQ (8*1)(y_ptr), DX
MULXQ (8*0)(x_ptr), AX, t1 MULXQ (8*0)(x_ptr), AX, t0
ADDQ AX, acc1 ADOXQ AX, acc1
ADCQ t1, acc2
MULXQ (8*1)(x_ptr), AX, t1 MULXQ (8*1)(x_ptr), AX, t1
ADCQ $0, t1 ADCXQ t0, AX
ADDQ AX, acc2 ADOXQ AX, acc2
ADCQ t1, acc3
MULXQ (8*2)(x_ptr), AX, t1 MULXQ (8*2)(x_ptr), AX, t0
ADCQ $0, t1 ADCXQ t1, AX
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ t1, acc4
MULXQ (8*3)(x_ptr), AX, t1 MULXQ (8*3)(x_ptr), AX, t1
ADCQ $0, t1 ADCXQ t0, AX
ADDQ AX, acc4 ADOXQ AX, acc4
ADCQ t1, acc5
ADCQ $0, acc0 ADCXQ acc0, t1
ADOXQ t1, acc5
ADOXQ res_ptr, acc0
// Second reduction step // Second reduction step
MOVQ acc1, DX MOVQ acc1, DX
MULXQ ·np+0x00(SB), DX, AX MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1 MULXQ ·p2+0x00(SB), AX, t0
ADDQ AX, acc1 ADOXQ AX, acc1
ADCQ t1, acc2
MULXQ ·p2+0x08(SB), AX, t1 MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1 ADCXQ t0, AX
ADDQ AX, acc2 ADOXQ AX, acc2
ADCQ t1, acc3
MULXQ ·p2+0x10(SB), AX, t1 MULXQ ·p2+0x10(SB), AX, t0
ADCQ $0, t1 ADCXQ t1, AX
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ t1, acc4
MULXQ ·p2+0x18(SB), AX, t1 MULXQ ·p2+0x18(SB), AX, t1
ADCQ $0, t1 ADCXQ t0, AX
ADDQ AX, acc4 ADOXQ AX, acc4
ADCQ t1, acc5
ADCQ $0, acc0 ADCXQ res_ptr, t1
ADOXQ t1, acc5
ADOXQ res_ptr, acc0
XORQ acc1, acc1
// x * y[2] // x * y[2]
MOVQ (8*2)(y_ptr), DX MOVQ (8*2)(y_ptr), DX
MULXQ (8*0)(x_ptr), AX, t1 MULXQ (8*0)(x_ptr), AX, t0
ADDQ AX, acc2 ADOXQ AX, acc2
ADCQ t1, acc3
MULXQ (8*1)(x_ptr), AX, t1 MULXQ (8*1)(x_ptr), AX, t1
ADCQ $0, t1 ADCXQ t0, AX
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ t1, acc4
MULXQ (8*2)(x_ptr), AX, t1 MULXQ (8*2)(x_ptr), AX, t0
ADCQ $0, t1 ADCXQ t1, AX
ADDQ AX, acc4 ADOXQ AX, acc4
ADCQ t1, acc5
MULXQ (8*3)(x_ptr), AX, t1 MULXQ (8*3)(x_ptr), AX, t1
ADCQ $0, t1 ADCXQ t0, AX
ADDQ AX, acc5 ADOXQ AX, acc5
ADCQ t1, acc0
ADCQ $0, acc1 ADCXQ res_ptr, t1
ADOXQ t1, acc0
ADOXQ res_ptr, acc1
// Third reduction step // Third reduction step
MOVQ acc2, DX MOVQ acc2, DX
MULXQ ·np+0x00(SB), DX, AX MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1 MULXQ ·p2+0x00(SB), AX, t0
ADDQ AX, acc2 ADOXQ AX, acc2
ADCQ t1, acc3
MULXQ ·p2+0x08(SB), AX, t1 MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1 ADCXQ t0, AX
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ t1, acc4
MULXQ ·p2+0x10(SB), AX, t1 MULXQ ·p2+0x10(SB), AX, t0
ADCQ $0, t1 ADCXQ t1, AX
ADDQ AX, acc4 ADOXQ AX, acc4
ADCQ t1, acc5
MULXQ ·p2+0x18(SB), AX, t1 MULXQ ·p2+0x18(SB), AX, t1
ADCQ $0, t1 ADCXQ t0, AX
ADDQ AX, acc5 ADOXQ AX, acc5
ADCQ t1, acc0
ADCQ $0, acc1 ADCXQ res_ptr, t1
ADOXQ t1, acc0
ADOXQ res_ptr, acc1
XORQ acc2, acc2
// x * y[3] // x * y[3]
MOVQ (8*3)(y_ptr), DX MOVQ (8*3)(y_ptr), DX
MULXQ (8*0)(x_ptr), AX, t1 MULXQ (8*0)(x_ptr), AX, t0
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ t1, acc4
MULXQ (8*1)(x_ptr), AX, t1 MULXQ (8*1)(x_ptr), AX, t1
ADCQ $0, t1 ADCXQ t0, AX
ADDQ AX, acc4 ADOXQ AX, acc4
ADCQ t1, acc5
MULXQ (8*2)(x_ptr), AX, t1 MULXQ (8*2)(x_ptr), AX, t0
ADCQ $0, t1 ADCXQ t1, AX
ADDQ AX, acc5 ADOXQ AX, acc5
ADCQ t1, acc0
MULXQ (8*3)(x_ptr), AX, t1 MULXQ (8*3)(x_ptr), AX, t1
ADCQ $0, t1 ADCXQ t0, AX
ADDQ AX, acc0 ADOXQ AX, acc0
ADCQ t1, acc1
ADCQ $0, acc2 ADCXQ res_ptr, t1
ADOXQ t1, acc1
ADOXQ res_ptr, acc2
// Last reduction step // Last reduction step
MOVQ acc3, DX MOVQ acc3, DX
MULXQ ·np+0x00(SB), DX, AX MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1 MULXQ ·p2+0x00(SB), AX, t0
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ t1, acc4
MULXQ ·p2+0x08(SB), AX, t1 MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1 ADCXQ t0, AX
ADDQ AX, acc4 ADOXQ AX, acc4
ADCQ t1, acc5
MULXQ ·p2+0x10(SB), AX, t1 MULXQ ·p2+0x10(SB), AX, t0
ADCQ $0, t1 ADCXQ t1, AX
ADDQ AX, acc5 ADOXQ AX, acc5
ADCQ t1, acc0
MULXQ ·p2+0x18(SB), AX, t1 MULXQ ·p2+0x18(SB), AX, t1
ADCQ $0, t1 ADCXQ t0, AX
ADDQ AX, acc0 ADOXQ AX, acc0
ADCQ t1, acc1
ADCQ $0, acc2 ADCXQ res_ptr, t1
ADOXQ t1, acc1
ADOXQ res_ptr, acc2
// Copy result [255:0] // Copy result [255:0]
gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,t1,acc2) gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,t1,acc2)
MOVQ res+0(FP), res_ptr
storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr)) storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr))
RET RET
nobmi2Mul:
noAdxMul:
// x * y[0] // x * y[0]
MOVQ (8*0)(y_ptr), t0 MOVQ (8*0)(y_ptr), t0
@ -588,6 +582,7 @@ nobmi2Mul:
ADCQ $0, acc2 ADCQ $0, acc2
// Copy result [255:0] // Copy result [255:0]
gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,t1,acc2) gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,t1,acc2)
MOVQ res+0(FP), res_ptr
storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr)) storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr))
RET RET
@ -598,175 +593,171 @@ TEXT ·gfpSqr(SB),NOSPLIT,$0
MOVQ in+8(FP), x_ptr MOVQ in+8(FP), x_ptr
MOVQ n+16(FP), BX MOVQ n+16(FP), BX
CMPB ·hasBMI2(SB), $0 CMPB ·supportADX(SB), $0
JE gfpSqrLoop JE gfpSqrLoop
gfpSqrLoopBMI2: gfpSqrLoopAdx:
XORQ acc0, acc0
XORQ y_ptr, y_ptr
// y[1:] * y[0] // y[1:] * y[0]
MOVQ (8*0)(x_ptr), DX MOVQ (8*0)(x_ptr), DX
MULXQ (8*1)(x_ptr), acc1, acc2 MULXQ (8*1)(x_ptr), acc1, acc2
MULXQ (8*2)(x_ptr), AX, acc3 MULXQ (8*2)(x_ptr), AX, acc3
ADDQ AX, acc2 ADOXQ AX, acc2
ADCQ $0, acc3
MULXQ (8*3)(x_ptr), AX, acc4 MULXQ (8*3)(x_ptr), AX, acc4
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ $0, acc4 ADOXQ y_ptr, acc4
// y[2:] * y[1] // y[2:] * y[1]
MOVQ (8*1)(x_ptr), DX MOVQ (8*1)(x_ptr), DX
MULXQ (8*2)(x_ptr), AX, t1 MULXQ (8*2)(x_ptr), AX, t1
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ t1, acc4
MULXQ (8*3)(x_ptr), AX, acc5 MULXQ (8*3)(x_ptr), AX, acc5
ADCQ $0, acc5 ADCXQ t1, AX
ADDQ AX, acc4 ADOXQ AX, acc4
ADCQ $0, acc5 ADCXQ y_ptr, acc5
// y[3] * y[2] // y[3] * y[2]
MOVQ (8*2)(x_ptr), DX MOVQ (8*2)(x_ptr), DX
MULXQ (8*3)(x_ptr), AX, y_ptr MULXQ (8*3)(x_ptr), AX, y_ptr
ADDQ AX, acc5 ADOXQ AX, acc5
ADCQ $0, y_ptr ADOXQ acc0, y_ptr
XORQ t1, t1 XORQ t1, t1
// *2 // *2
ADDQ acc1, acc1 ADOXQ acc1, acc1
ADCQ acc2, acc2 ADOXQ acc2, acc2
ADCQ acc3, acc3 ADOXQ acc3, acc3
ADCQ acc4, acc4 ADOXQ acc4, acc4
ADCQ acc5, acc5 ADOXQ acc5, acc5
ADCQ y_ptr, y_ptr ADOXQ y_ptr, y_ptr
ADCQ $0, t1 ADOXQ acc0, t1
// Missing products // Missing products
MOVQ (8*0)(x_ptr), DX MOVQ (8*0)(x_ptr), DX
MULXQ DX, acc0, t0 MULXQ DX, acc0, t0
ADDQ t0, acc1 ADCXQ t0, acc1
MOVQ (8*1)(x_ptr), DX MOVQ (8*1)(x_ptr), DX
MULXQ DX, AX, t0 MULXQ DX, AX, t0
ADCQ AX, acc2 ADCXQ AX, acc2
ADCQ t0, acc3 ADCXQ t0, acc3
MOVQ (8*2)(x_ptr), DX MOVQ (8*2)(x_ptr), DX
MULXQ DX, AX, t0 MULXQ DX, AX, t0
ADCQ AX, acc4 ADCXQ AX, acc4
ADCQ t0, acc5 ADCXQ t0, acc5
MOVQ (8*3)(x_ptr), DX MOVQ (8*3)(x_ptr), DX
MULXQ DX, AX, x_ptr MULXQ DX, AX, x_ptr
ADCQ AX, y_ptr ADCXQ AX, y_ptr
ADCQ t1, x_ptr ADCXQ t1, x_ptr
// First reduction step // First reduction step
MOVQ acc0, DX MOVQ acc0, DX
MULXQ ·np+0x00(SB), DX, AX MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1 MULXQ ·p2+0x00(SB), AX, t0
ADDQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0 ADOXQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0
ADCQ t1, acc1
MULXQ ·p2+0x08(SB), AX, t1 MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1 ADCXQ t0, AX
ADDQ AX, acc1 ADOXQ AX, acc1
ADCQ t1, acc2
MULXQ ·p2+0x10(SB), AX, t1 MULXQ ·p2+0x10(SB), AX, t0
ADCQ $0, t1 ADCXQ t1, AX
ADDQ AX, acc2 ADOXQ AX, acc2
ADCQ t1, acc3
MULXQ ·p2+0x18(SB), AX, acc0 MULXQ ·p2+0x18(SB), AX, acc0
ADCQ $0, acc0 ADCXQ t0, AX
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ $0, acc0 MOVQ $0, t0
ADCXQ t0, acc0
ADOXQ t0, acc0
// Second reduction step // Second reduction step
MOVQ acc1, DX MOVQ acc1, DX
MULXQ ·np+0x00(SB), DX, AX MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1 MULXQ ·p2+0x00(SB), AX, t0
ADDQ AX, acc1 ADOXQ AX, acc1
ADCQ t1, acc2
MULXQ ·p2+0x08(SB), AX, t1 MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1 ADCXQ t0, AX
ADDQ AX, acc2 ADOXQ AX, acc2
ADCQ t1, acc3
MULXQ ·p2+0x10(SB), AX, t1 MULXQ ·p2+0x10(SB), AX, t0
ADCQ $0, t1 ADCXQ t1, AX
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ t1, acc0
MULXQ ·p2+0x18(SB), AX, acc1 MULXQ ·p2+0x18(SB), AX, acc1
ADCQ $0, acc1 ADCXQ t0, AX
ADDQ AX, acc0 ADOXQ AX, acc4
ADCQ $0, acc1 MOVQ $0, t0
ADCXQ t0, acc1
ADOXQ t0, acc1
// Third reduction step // Third reduction step
MOVQ acc2, DX MOVQ acc2, DX
MULXQ ·np+0x00(SB), DX, AX MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1 MULXQ ·p2+0x00(SB), AX, t0
ADDQ AX, acc2 ADOXQ AX, acc2
ADCQ t1, acc3
MULXQ ·p2+0x08(SB), AX, t1 MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1 ADCXQ t0, AX
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ t1, acc0
MULXQ ·p2+0x10(SB), AX, t1 MULXQ ·p2+0x10(SB), AX, t0
ADCQ $0, t1 ADCXQ t1, AX
ADDQ AX, acc0 ADOXQ AX, acc0
ADCQ t1, acc1
MULXQ ·p2+0x18(SB), AX, acc2 MULXQ ·p2+0x18(SB), AX, acc2
ADCQ $0, acc2 ADCXQ t0, AX
ADDQ AX, acc1 ADOXQ AX, acc1
ADCQ $0, acc2 MOVQ $0, t0
ADCXQ t0, acc2
ADOXQ t0, acc2
// Last reduction step // Last reduction step
MOVQ acc3, DX MOVQ acc3, DX
MULXQ ·np+0x00(SB), DX, AX MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1 MULXQ ·p2+0x00(SB), AX, t0
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ t1, acc0
MULXQ ·p2+0x08(SB), AX, t1 MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1 ADCXQ t0, AX
ADDQ AX, acc0 ADOXQ AX, acc0
ADCQ t1, acc1
MULXQ ·p2+0x10(SB), AX, t1 MULXQ ·p2+0x10(SB), AX, t0
ADCQ $0, t1 ADCXQ t1, AX
ADDQ AX, acc1 ADOXQ AX, acc1
ADCQ t1, acc2
MULXQ ·p2+0x18(SB), AX, acc3 MULXQ ·p2+0x18(SB), AX, acc3
ADCQ $0, acc3 ADCXQ t0, AX
ADDQ AX, acc2 ADOXQ AX, acc2
ADCQ $0, acc3 MOVQ $0, t0
ADCXQ t0, acc3
ADOXQ t0, acc3
XORQ t0, t0 XORQ t1, t1
// Add bits [511:256] of the sqr result // Add bits [511:256] of the sqr result
ADDQ acc4, acc0 ADCXQ acc4, acc0
ADCQ acc5, acc1 ADCXQ acc5, acc1
ADCQ y_ptr, acc2 ADCXQ y_ptr, acc2
ADCQ x_ptr, acc3 ADCXQ x_ptr, acc3
ADCQ $0, t0 ADCXQ t1, t0
gfpCarry(acc0,acc1,acc2,acc3, acc4,acc5,y_ptr,t1,t0) gfpCarry(acc0,acc1,acc2,acc3, acc4,acc5,y_ptr,t1,t0)
storeBlock(acc0,acc1,acc2,acc3, 0(res_ptr)) storeBlock(acc0,acc1,acc2,acc3, 0(res_ptr))
MOVQ res_ptr, x_ptr MOVQ res_ptr, x_ptr
DECQ BX DECQ BX
JNE gfpSqrLoopBMI2 JNE gfpSqrLoopAdx
RET RET

View File

@ -10,7 +10,10 @@ import (
"golang.org/x/sys/cpu" "golang.org/x/sys/cpu"
) )
var hasBMI2 = cpu.X86.HasBMI2 // amd64 assembly uses ADCX/ADOX/MULX if ADX is available to run two carry
// chains in the flags in parallel across the whole operation, and aggressively
// unrolls loops. arm64 processes four words at a time.
var supportADX = cpu.X86.HasADX && cpu.X86.HasBMI2
// Set c = p - a, if c == p, then c = 0 // Set c = p - a, if c == p, then c = 0
// //

View File

@ -134,5 +134,15 @@ func gfpSqr(res, in *gfP, n int) {
} }
func gfpFromMont(res, in *gfP) { func gfpFromMont(res, in *gfP) {
gfpMul(res, in, &gfP{1}) var T [8]uint64
var carry uint64
copy(T[:], in[:])
for i := 0; i < 4; i++ {
Y := T[i] * np[0]
c2 := addMulVVW(T[i:4+i], p2[:], Y)
T[4+i], carry = bits.Add64(uint64(0), c2, carry)
}
*res = gfP{T[4], T[5], T[6], T[7]}
gfpCarry(res, carry)
} }

View File

@ -48,6 +48,28 @@ func Test_gfpBasicOperations(t *testing.T) {
} }
} }
// Test_gfpSqr checks a single squaring (n = 1) performed by gfpSqr against a
// reference value computed with math/big, for the operands p-1 and p+1.
func Test_gfpSqr(t *testing.T) {
	cases := []struct {
		name  string
		value *big.Int
	}{
		{"p-1", new(big.Int).Sub(p, big.NewInt(1))},
		{"p+1", new(big.Int).Add(p, big.NewInt(1))},
	}

	for _, tc := range cases {
		x := fromBigInt(tc.value)
		got := &gfP{}
		gfpSqr(got, x, 1)

		// Reference result: value^2 mod p, converted the same way as the input.
		sq := new(big.Int).Mul(tc.value, tc.value)
		sq.Mod(sq, p)
		want := fromBigInt(sq)

		if *got != *want {
			// Name the failing operand and show both values so a failure is
			// diagnosable without re-running under a debugger.
			t.Errorf("gfpSqr(%s): got %v, want %v", tc.name, *got, *want)
		}
	}
}
func TestFromMont(t *testing.T) { func TestFromMont(t *testing.T) {
x := fromBigInt(bigFromHex("85AEF3D078640C98597B6027B441A01FF1DD2C190F5E93C454806C11D8806141")) x := fromBigInt(bigFromHex("85AEF3D078640C98597B6027B441A01FF1DD2C190F5E93C454806C11D8806141"))
ret1, ret2 := &gfP{}, &gfP{} ret1, ret2 := &gfP{}, &gfP{}