From 0bb1fa5be5819c8f8acaa00dfb429e2b52ffcf96 Mon Sep 17 00:00:00 2001 From: emmansun Date: Thu, 22 Jun 2023 14:35:14 +0800 Subject: [PATCH] sm9/bn256: asm rewrite batch 1, for arm64 test --- sm9/bn256/gfp.go | 6 +- sm9/bn256/gfp2.go | 8 +- sm9/bn256/gfp_amd64.s | 759 ++++++++++++++++++++++++++++++++++- sm9/bn256/gfp_arm64.s | 592 ++++++++++++++++++++++++++- sm9/bn256/gfp_decl.go | 11 + sm9/bn256/gfp_generic.go | 11 + sm9/bn256/gfp_invert_sqrt.go | 242 +++-------- 7 files changed, 1414 insertions(+), 215 deletions(-) diff --git a/sm9/bn256/gfp.go b/sm9/bn256/gfp.go index b4d4726..10291de 100644 --- a/sm9/bn256/gfp.go +++ b/sm9/bn256/gfp.go @@ -91,8 +91,8 @@ func (e *gfP) Mul(a, b *gfP) *gfP { return e } -func (e *gfP) Square(a *gfP) *gfP { - gfpMul(e, a, a) +func (e *gfP) Square(a *gfP, n int) *gfP { + gfpSqr(e, a, n) return e } @@ -150,7 +150,7 @@ func (e *gfP) Unmarshal(in []byte) error { } func montEncode(c, a *gfP) { gfpMul(c, a, r2) } -func montDecode(c, a *gfP) { gfpMul(c, a, &gfP{1}) } +func montDecode(c, a *gfP) { gfpFromMont(c, a) } // cmovznzU64 is a single-word conditional move. // diff --git a/sm9/bn256/gfp2.go b/sm9/bn256/gfp2.go index 69e39a5..606ba2b 100644 --- a/sm9/bn256/gfp2.go +++ b/sm9/bn256/gfp2.go @@ -173,8 +173,8 @@ func (e *gfP2) Square(a *gfP2) *gfP2 { // Complex squaring algorithm: // (xu+y)² = y^2-2*x^2 + 2*u*x*y tx, ty := &gfP{}, &gfP{} - gfpMul(tx, &a.x, &a.x) - gfpMul(ty, &a.y, &a.y) + gfpSqr(tx, &a.x, 1) + gfpSqr(ty, &a.y, 1) gfpSub(ty, ty, tx) gfpSub(ty, ty, tx) @@ -192,8 +192,8 @@ func (e *gfP2) SquareU(a *gfP2) *gfP2 { tx, ty := &gfP{}, &gfP{} // tx = a0^2 - 2 * a1^2 - gfpMul(ty, &a.x, &a.x) - gfpMul(tx, &a.y, &a.y) + gfpSqr(ty, &a.x, 1) + gfpSqr(tx, &a.y, 1) gfpAdd(ty, ty, ty) gfpSub(tx, tx, ty) diff --git a/sm9/bn256/gfp_amd64.s b/sm9/bn256/gfp_amd64.s index 0be3d29..53d2997 100644 --- a/sm9/bn256/gfp_amd64.s +++ b/sm9/bn256/gfp_amd64.s @@ -1,6 +1,21 @@ //go:build amd64 && !purego // +build amd64,!purego +#include "textflag.h" + +#define res_ptr DI +#define x_ptr SI +#define y_ptr CX + +#define acc0 R8 +#define acc1 R9 +#define acc2 R10 +#define acc3 R11 +#define acc4 R12 +#define acc5 R13 +#define t0 R14 +#define t1 R15 + #define storeBlock(a0,a1,a2,a3, r) \ MOVQ a0, 0+r \ MOVQ a1, 8+r \ @@ -34,9 +49,6 @@ CMOVQCC b2, a2 \ CMOVQCC b3, a3 -#include "mul_amd64.h" -#include "mul_bmi2_amd64.h" - TEXT ·gfpNeg(SB),0,$0-16 MOVQ ·p2+0(SB), R8 MOVQ ·p2+8(SB), R9 @@ -106,25 +118,732 @@ TEXT ·gfpSub(SB),0,$0-24 storeBlock(R8,R9,R10,R11, 0(DI)) RET -TEXT ·gfpMul(SB),0,$160-24 - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI +TEXT ·gfpMul(SB),0,$0-24 + MOVQ res+0(FP), res_ptr + MOVQ in1+8(FP), x_ptr + MOVQ in2+16(FP), y_ptr + // x * y[0] + MOVQ (8*0)(y_ptr), t0 - // Jump to a slightly different implementation if MULX isn't supported. 
- CMPB ·hasBMI2(SB), $0 - JE nobmi2Mul + MOVQ (8*0)(x_ptr), AX + MULQ t0 + MOVQ AX, acc0 + MOVQ DX, acc1 - mulBMI2(0(DI),8(DI),16(DI),24(DI), 0(SI)) - storeBlock( R8, R9,R10,R11, 0(SP)) - storeBlock(R12,R13,R14,CX, 32(SP)) - gfpReduceBMI2() - JMP end + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, acc2 -nobmi2Mul: - mul(0(DI),8(DI),16(DI),24(DI), 0(SI), 0(SP)) - gfpReduce(0(SP)) + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc4 + XORQ acc5, acc5 + // First reduction step + MOVQ acc0, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 + + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc0 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ t1, acc1 + ADCQ $0, DX + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ DX, acc4 + ADCQ $0, acc5 + // x * y[1] + MOVQ (8*1)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ DX, acc5 + ADCQ $0, acc0 + // Second reduction step + MOVQ acc1, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 + + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ DX, acc5 + ADCQ $0, acc0 + // x * y[2] + MOVQ (8*2)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ DX, acc0 + ADCQ $0, acc1 + // Third reduction step + MOVQ acc2, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 + + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ DX, acc0 + ADCQ $0, acc1 + // x * y[3] + MOVQ (8*3)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ DX, acc1 + ADCQ $0, acc2 + // Last reduction step + MOVQ acc3, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 + + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, 
DX + MOVQ DX, t1 + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ t1, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ DX, acc1 + ADCQ $0, acc2 + // Copy result [255:0] + MOVQ acc4, x_ptr + MOVQ acc5, acc3 + MOVQ acc0, t0 + MOVQ acc1, t1 + // Subtract p2 + SUBQ ·p2+0x00(SB), acc4 + SBBQ ·p2+0x08(SB) ,acc5 + SBBQ ·p2+0x10(SB), acc0 + SBBQ ·p2+0x18(SB), acc1 + SBBQ $0, acc2 + + CMOVQCS x_ptr, acc4 + CMOVQCS acc3, acc5 + CMOVQCS t0, acc0 + CMOVQCS t1, acc1 + + MOVQ acc4, (8*0)(res_ptr) + MOVQ acc5, (8*1)(res_ptr) + MOVQ acc0, (8*2)(res_ptr) + MOVQ acc1, (8*3)(res_ptr) + + RET + +// func gfpSqr(res, in *gfP, n int) +TEXT ·gfpSqr(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in+8(FP), x_ptr + MOVQ n+16(FP), BX + +gfpSqrLoop: + + // y[1:] * y[0] + MOVQ (8*0)(x_ptr), t0 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + MOVQ AX, acc1 + MOVQ DX, acc2 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc4 + // y[2:] * y[1] + MOVQ (8*1)(x_ptr), t0 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, acc5 + // y[3] * y[2] + MOVQ (8*2)(x_ptr), t0 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, y_ptr + XORQ t1, t1 + // *2 + ADDQ acc1, acc1 + ADCQ acc2, acc2 + ADCQ acc3, acc3 + ADCQ acc4, acc4 + ADCQ acc5, acc5 + ADCQ y_ptr, y_ptr + ADCQ $0, t1 + // Missing products + MOVQ (8*0)(x_ptr), AX + MULQ AX + MOVQ AX, acc0 + MOVQ DX, t0 + + MOVQ (8*1)(x_ptr), AX + MULQ AX + ADDQ t0, acc1 + ADCQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t0 + + MOVQ (8*2)(x_ptr), AX + MULQ AX + ADDQ t0, acc3 + ADCQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t0 + + MOVQ (8*3)(x_ptr), AX + MULQ AX + ADDQ t0, acc5 + ADCQ AX, y_ptr + ADCQ DX, t1 + MOVQ t1, x_ptr + // T = [acc0, acc1, acc2, acc3, acc4, acc5, y_ptr, x_ptr] + // First reduction step + MOVQ acc0, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 // Y + + // Calculate next T = T+Y*P + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc0 // acc0 is free now + ADCQ $0, DX + MOVQ DX, t1 // carry + XORQ acc0, acc0 + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ t1, acc1 + ADCQ $0, DX + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, t1 // carry + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 // carry + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ DX, acc0 + + // Second reduction step + MOVQ acc1, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 // Y + + // Calculate next T = T+Y*P + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc1 // acc1 is free now + ADCQ $0, DX + MOVQ DX, t1 // carry + XORQ acc1, acc1 + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 // carry + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 // carry + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ t1, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ DX, acc1 + + // Third reduction step + MOVQ acc2, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 // Y + + // Calculate next T = T+Y*P + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc2 // acc2 is free now + ADCQ $0, DX + MOVQ DX, t1 // carry + XORQ acc2, acc2 + + 
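+	// With Y = acc2*np (mod 2^64), adding Y*P has just cancelled the low word;
+	// the zeroed acc2 will collect the carry out of the rest of this reduction step.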
MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 // carry + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ t1, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ $0, DX + MOVQ DX, t1 // carry + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ t1, acc1 + ADCQ $0, DX + ADDQ AX, acc1 + ADCQ DX, acc2 + + // Last reduction step + MOVQ acc3, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 // Y + + // Calculate next T = T+Y*P + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc3 // acc3 is free now + ADCQ $0, DX + MOVQ DX, t1 // carry + XORQ acc3, acc3 + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ t1, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ $0, DX + MOVQ DX, t1 // carry + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ t1, acc1 + ADCQ $0, DX + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, t1 // carry + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ DX, acc3 + + XORQ t0, t0 + // Add bits [511:256] of the sqr result + ADCQ acc4, acc0 + ADCQ acc5, acc1 + ADCQ y_ptr, acc2 + ADCQ x_ptr, acc3 + ADCQ $0, t0 + + MOVQ acc0, acc4 + MOVQ acc1, acc5 + MOVQ acc2, y_ptr + MOVQ acc3, t1 + // Subtract p2 + SUBQ ·p2+0x00(SB), acc0 + SBBQ ·p2+0x08(SB) ,acc1 + SBBQ ·p2+0x10(SB), acc2 + SBBQ ·p2+0x18(SB), acc3 + SBBQ $0, t0 + + CMOVQCS acc4, acc0 + CMOVQCS acc5, acc1 + CMOVQCS y_ptr, acc2 + CMOVQCS t1, acc3 + + MOVQ acc0, (8*0)(res_ptr) + MOVQ acc1, (8*1)(res_ptr) + MOVQ acc2, (8*2)(res_ptr) + MOVQ acc3, (8*3)(res_ptr) + MOVQ res_ptr, x_ptr + DECQ BX + JNE gfpSqrLoop + + RET + +/* ---------------------------------------*/ +// func gfpFromMont(res, in *gfP) +TEXT ·gfpFromMont(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in+8(FP), x_ptr + + MOVQ (8*0)(x_ptr), acc0 + MOVQ (8*1)(x_ptr), acc1 + MOVQ (8*2)(x_ptr), acc2 + MOVQ (8*3)(x_ptr), acc3 + XORQ acc4, acc4 + + // Only reduce, no multiplications are needed + // First reduction step + MOVQ acc0, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 // Y + + // Calculate next T = T+Y*P + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc0 // acc0 is free now + ADCQ $0, DX + MOVQ DX, t1 // carry + XORQ acc0, acc0 + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ t1, acc1 + ADCQ $0, DX + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, t1 // carry + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 // carry + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ DX, acc4 + XORQ acc5, acc5 + + // Second reduction step + MOVQ acc1, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 // Y + + // Calculate next T = T+Y*P + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc1 // acc1 is free now + ADCQ $0, DX + MOVQ DX, t1 // carry + XORQ acc1, acc1 + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 // carry + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 // carry + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ DX, acc5 + + // Third reduction step + MOVQ acc2, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 // Y + + // Calculate next T = T+Y*P + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc2 // acc2 is free now + ADCQ $0, DX + MOVQ DX, t1 // carry + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 // carry + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 // carry + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + 
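+	// Fold in the low word of Y*p2[3]; this step's final carry lands in acc0,
+	// which was zeroed back in the first reduction step.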
ADDQ AX, acc5 + ADCQ DX, acc0 + + // Last reduction step + MOVQ acc3, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 // Y + + // Calculate next T = T+Y*P + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc3 // acc3 is free now + ADCQ $0, DX + MOVQ DX, t1 // carry + XORQ acc3, acc3 + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 // carry + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, t1 // carry + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ t1, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ DX, acc1 + + MOVQ acc4, x_ptr + MOVQ acc5, acc3 + MOVQ acc0, t0 + MOVQ acc1, t1 + + SUBQ ·p2+0x00(SB), acc4 + SBBQ ·p2+0x08(SB) ,acc5 + SBBQ ·p2+0x10(SB), acc0 + SBBQ ·p2+0x18(SB), acc1 + + CMOVQCS x_ptr, acc4 + CMOVQCS acc3, acc5 + CMOVQCS t0, acc0 + CMOVQCS t1, acc1 + + MOVQ acc4, (8*0)(res_ptr) + MOVQ acc5, (8*1)(res_ptr) + MOVQ acc0, (8*2)(res_ptr) + MOVQ acc1, (8*3)(res_ptr) -end: - MOVQ c+0(FP), DI - storeBlock(R12,R13,R14,CX, 0(DI)) RET diff --git a/sm9/bn256/gfp_arm64.s b/sm9/bn256/gfp_arm64.s index 009c9cf..f652d62 100644 --- a/sm9/bn256/gfp_arm64.s +++ b/sm9/bn256/gfp_arm64.s @@ -1,6 +1,43 @@ //go:build arm64 && !purego // +build arm64,!purego +#include "textflag.h" + +#define res_ptr R0 +#define a_ptr R1 +#define b_ptr R2 + +#define acc0 R3 +#define acc1 R4 +#define acc2 R5 +#define acc3 R6 + +#define acc4 R7 +#define acc5 R8 +#define acc6 R9 +#define acc7 R10 +#define t0 R11 +#define t1 R12 +#define t2 R13 +#define t3 R14 +#define const0 R15 +#define const1 R16 + +#define hlp0 R17 +#define hlp1 res_ptr + +#define x0 R19 +#define x1 R20 +#define x2 R21 +#define x3 R22 +#define y0 R23 +#define y1 R24 +#define y2 R25 +#define y3 R26 + +#define const2 t2 +#define const3 t3 + #define storeBlock(a0,a1,a2,a3, r) \ MOVD a0, 0+r \ MOVD a1, 8+r \ @@ -19,8 +56,6 @@ MOVD ·p2+16(SB), p2 \ MOVD ·p2+24(SB), p3 -#include "mul_arm64.h" - TEXT ·gfpNeg(SB),0,$0-16 MOVD a+8(FP), R0 loadBlock(0(R0), R1,R2,R3,R4) @@ -100,15 +135,550 @@ TEXT ·gfpSub(SB),0,$0-24 storeBlock(R1,R2,R3,R4, 0(R0)) RET -TEXT ·gfpMul(SB),0,$0-24 - MOVD a+8(FP), R0 - loadBlock(0(R0), R1,R2,R3,R4) - MOVD b+16(FP), R0 - loadBlock(0(R0), R5,R6,R7,R8) +TEXT ·gfpMul(SB),NOSPLIT,$0 + MOVD in1+8(FP), a_ptr + MOVD in2+16(FP), b_ptr - mul(R9,R10,R11,R12,R13,R14,R15,R16) - gfpReduce() + MOVD ·np+0x00(SB), hlp1 + LDP ·p2+0x00(SB), (const0, const1) + LDP ·p2+0x10(SB), (const2, const3) + + LDP 0*16(a_ptr), (x0, x1) + LDP 1*16(a_ptr), (x2, x3) + LDP 0*16(b_ptr), (y0, y1) + LDP 1*16(b_ptr), (y2, y3) + + // y[0] * x + MUL y0, x0, acc0 + UMULH y0, x0, acc1 + + MUL y0, x1, t0 + ADDS t0, acc1 + UMULH y0, x1, acc2 + + MUL y0, x2, t0 + ADCS t0, acc2 + UMULH y0, x2, acc3 + + MUL y0, x3, t0 + ADCS t0, acc3 + UMULH y0, x3, acc4 + ADC $0, acc4 + // First reduction step + MUL acc0, hlp1, hlp0 + + MUL const0, hlp1, t0 + ADDS t0, acc0, acc0 + UMULH const0, hlp0, t1 + + MUL const1, hlp0, t0 + ADCS t0, acc1, acc1 + UMULH const1, hlp0, y0 + + MUL const2, hlp0, t0 + ADCS t0, acc2, acc2 + UMULH const2, hlp0, acc0 + + MUL const3, hlp0, t0 + ADCS t0, acc3, acc3 + + UMULH const3, hlp0, hlp0 + ADC $0, acc4 + + ADDS t1, acc1, acc1 + ADCS y0, acc2, acc2 + ADCS acc0, acc3, acc3 + ADC $0, hlp0, acc0 + // y[1] * x + MUL y1, x0, t0 + ADDS t0, acc1 + UMULH y1, x0, t1 + + MUL y1, x1, t0 + ADCS t0, acc2 + UMULH y1, x1, hlp0 + + MUL y1, x2, t0 + ADCS t0, acc3 + UMULH y1, x2, y0 + + MUL y1, x3, t0 + ADCS t0, acc4 + UMULH y1, x3, y1 + ADC $0, ZR, acc5 + + ADDS t1, acc2 + ADCS hlp0, acc3 + 
ADCS y0, acc4 + ADC y1, acc5 + // Second reduction step + MUL acc1, hlp1, hlp0 + + MUL const0, hlp1, t0 + ADDS t0, acc1, acc1 + UMULH const0, hlp0, t1 + + MUL const1, hlp0, t0 + ADCS t0, acc2, acc2 + UMULH const1, hlp0, y0 + + MUL const2, hlp0, t0 + ADCS t0, acc3, acc3 + UMULH const2, hlp0, acc1 + + MUL const3, hlp0, t0 + ADCS t0, acc0, acc0 + + UMULH const3, hlp0, hlp0 + ADC $0, acc5 + + ADDS t1, acc2, acc2 + ADCS y0, acc3, acc3 + ADCS acc1, acc0, acc0 + ADC $0, hlp0, acc1 + // y[2] * x + MUL y2, x0, t0 + ADDS t0, acc2 + UMULH y2, x0, t1 + + MUL y2, x1, t0 + ADCS t0, acc3 + UMULH y2, x1, hlp0 + + MUL y2, x2, t0 + ADCS t0, acc4 + UMULH y2, x2, y0 + + MUL y2, x3, t0 + ADCS t0, acc5 + UMULH y2, x3, y1 + ADC $0, ZR, acc6 + + ADDS t1, acc3 + ADCS hlp0, acc4 + ADCS y0, acc5 + ADC y1, acc6 + // Third reduction step + MUL acc2, hlp1, hlp0 + + MUL const0, hlp1, t0 + ADDS t0, acc2, acc2 + UMULH const0, hlp0, t1 + + MUL const1, hlp0, t0 + ADCS t0, acc3, acc3 + UMULH const1, hlp0, y0 + + MUL const2, hlp0, t0 + ADCS t0, acc0, acc0 + UMULH const2, hlp0, acc2 + + MUL const3, hlp0, t0 + ADCS t0, acc1, acc1 + + UMULH const3, hlp0, hlp0 + ADC $0, acc6 + + ADDS t1, acc3, acc3 + ADCS y0, acc0, acc0 + ADCS acc2, acc1, acc1 + ADC $0, hlp0, acc2 + // y[3] * x + MUL y3, x0, t0 + ADDS t0, acc3 + UMULH y3, x0, t1 + + MUL y3, x1, t0 + ADCS t0, acc4 + UMULH y3, x1, hlp0 + + MUL y3, x2, t0 + ADCS t0, acc5 + UMULH y3, x2, y0 + + MUL y3, x3, t0 + ADCS t0, acc6 + UMULH y3, x3, y1 + ADC $0, ZR, acc7 + + ADDS t1, acc4 + ADCS hlp0, acc5 + ADCS y0, acc6 + ADC y1, acc7 + // Last reduction step + MUL acc3, hlp1, hlp0 + + MUL const0, hlp1, t0 + ADDS t0, acc3, acc3 + UMULH const0, hlp0, t1 + + MUL const1, hlp0, t0 + ADCS t0, acc0, acc0 + UMULH const1, hlp0, y0 + + MUL const2, hlp0, t0 + ADCS t0, acc1, acc1 + UMULH const2, hlp0, acc3 + + MUL const3, hlp0, t0 + ADCS t0, acc2, acc2 + + UMULH const3, hlp0, hlp0 + ADC $0, acc7 + + ADDS t1, acc0, acc0 + ADCS y0, acc1, acc1 + ADCS acc3, acc2, acc2 + ADC $0, hlp0, acc3 + + ADDS acc4, acc0, acc0 + ADCS acc5, acc1, acc1 + ADCS acc6, acc2, acc2 + ADCS acc7, acc3, acc3 + ADC $0, ZR, acc4 + + SUBS const0, acc0, t0 + SBCS const1, acc1, t1 + SBCS const2, acc2, t2 + SBCS const3, acc3, t3 + SBCS $0, acc4, acc4 + + CSEL CS, t0, acc0, acc0 + CSEL CS, t1, acc1, acc1 + CSEL CS, t2, acc2, acc2 + CSEL CS, t3, acc3, acc3 + + MOVD res+0(FP), res_ptr + STP (acc0, acc1), 0*16(res_ptr) + STP (acc2, acc3), 1*16(res_ptr) + + RET + +// func gfpSqr(res, in *gfP, n int) +TEXT ·gfpSqr(SB),NOSPLIT,$0 + MOVD in+8(FP), a_ptr + MOVD n+16(FP), b_ptr + + MOVD ·np+0x00(SB), hlp1 + LDP ·p2+0x00(SB), (const0, const1) + LDP ·p2+0x10(SB), (const2, const3) + + LDP 0*16(a_ptr), (x0, x1) + LDP 1*16(a_ptr), (x2, x3) + +ordSqrLoop: + SUB $1, b_ptr + + // x[1:] * x[0] + MUL x0, x1, acc1 + UMULH x0, x1, acc2 + + MUL x0, x2, t0 + ADDS t0, acc2, acc2 + UMULH x0, x2, acc3 + + MUL x0, x3, t0 + ADCS t0, acc3, acc3 + UMULH x0, x3, acc4 + ADC $0, acc4, acc4 + // x[2:] * x[1] + MUL x1, x2, t0 + ADDS t0, acc3 + UMULH x1, x2, t1 + ADCS t1, acc4 + ADC $0, ZR, acc5 + + MUL x1, x3, t0 + ADDS t0, acc4 + UMULH x1, x3, t1 + ADC t1, acc5 + // x[3] * x[2] + MUL x2, x3, t0 + ADDS t0, acc5 + UMULH x2, x3, acc6 + ADC $0, acc6 + + MOVD $0, acc7 + // *2 + ADDS acc1, acc1 + ADCS acc2, acc2 + ADCS acc3, acc3 + ADCS acc4, acc4 + ADCS acc5, acc5 + ADCS acc6, acc6 + ADC $0, acc7 + // Missing products + MUL x0, x0, acc0 + UMULH x0, x0, t0 + ADDS t0, acc1, acc1 + + MUL x1, x1, t0 + ADCS t0, acc2, acc2 + UMULH x1, x1, t1 + ADCS t1, acc3, acc3 + + MUL x2, x2, 
t0 + ADCS t0, acc4, acc4 + UMULH x2, x2, t1 + ADCS t1, acc5, acc5 + + MUL x3, x3, t0 + ADCS t0, acc6, acc6 + UMULH x3, x3, t1 + ADC t1, acc7, acc7 + // First reduction step + MUL acc0, hlp1, hlp0 + + MUL const0, hlp1, t0 + ADDS t0, acc0, acc0 + UMULH const0, hlp0, t1 + + MUL const1, hlp0, t0 + ADCS t0, acc1, acc1 + UMULH const1, hlp0, y0 + + MUL const2, hlp0, t0 + ADCS t0, acc2, acc2 + UMULH const2, hlp0, acc0 + + MUL const3, hlp0, t0 + ADCS t0, acc3, acc3 + + UMULH const3, hlp0, hlp0 + ADC $0, hlp0 + + ADDS t1, acc1, acc1 + ADCS y0, acc2, acc2 + ADCS acc0, acc3, acc3 + ADC $0, hlp0, acc0 + // Second reduction step + MUL acc1, hlp1, hlp0 + + MUL const0, hlp1, t0 + ADDS t0, acc1, acc1 + UMULH const0, hlp0, t1 + + MUL const1, hlp0, t0 + ADCS t0, acc2, acc2 + UMULH const1, hlp0, y0 + + MUL const2, hlp0, t0 + ADCS t0, acc3, acc3 + UMULH const2, hlp0, acc1 + + MUL const3, hlp0, t0 + ADCS t0, acc0, acc0 + + UMULH const3, hlp0, hlp0 + ADC $0, hlp0 + + ADDS t1, acc2, acc2 + ADCS y0, acc3, acc3 + ADCS acc1, acc0, acc0 + ADC $0, hlp0, acc1 + // Third reduction step + MUL acc2, hlp1, hlp0 + + MUL const0, hlp1, t0 + ADDS t0, acc2, acc2 + UMULH const0, hlp0, t1 + + MUL const1, hlp0, t0 + ADCS t0, acc3, acc3 + UMULH const1, hlp0, y0 + + MUL const2, hlp0, t0 + ADCS t0, acc0, acc0 + UMULH const2, hlp0, acc2 + + MUL const3, hlp0, t0 + ADCS t0, acc1, acc1 + + UMULH const3, hlp0, hlp0 + ADC $0, hlp0 + + ADDS t1, acc3, acc3 + ADCS y0, acc0, acc0 + ADCS acc2, acc1, acc1 + ADC $0, hlp0, acc2 + + // Last reduction step + MUL acc3, hlp1, hlp0 + + MUL const0, hlp1, t0 + ADDS t0, acc3, acc3 + UMULH const0, hlp0, t1 + + MUL const1, hlp0, t0 + ADCS t0, acc0, acc0 + UMULH const1, hlp0, y0 + + MUL const2, hlp0, t0 + ADCS t0, acc1, acc1 + UMULH const2, hlp0, acc3 + + MUL const3, hlp0, t0 + ADCS t0, acc2, acc2 + + UMULH const3, hlp0, hlp0 + ADC $0, acc7 + + ADDS t1, acc0, acc0 + ADCS y0, acc1, acc1 + ADCS acc3, acc2, acc2 + ADC $0, hlp0, acc3 + + ADDS acc4, acc0, acc0 + ADCS acc5, acc1, acc1 + ADCS acc6, acc2, acc2 + ADCS acc7, acc3, acc3 + ADC $0, ZR, acc4 + + SUBS const0, acc0, y0 + SBCS const1, acc1, y1 + SBCS const2, acc2, y2 + SBCS const3, acc3, y3 + SBCS $0, acc4, acc4 + + CSEL CS, y0, acc0, x0 + CSEL CS, y1, acc1, x1 + CSEL CS, y2, acc2, x2 + CSEL CS, y3, acc3, x3 + + CBNZ b_ptr, ordSqrLoop + + MOVD res+0(FP), res_ptr + STP (x0, x1), 0*16(res_ptr) + STP (x2, x3), 1*16(res_ptr) + + RET + +/* ---------------------------------------*/ +// func gfpFromMont(res, in *gfP) +TEXT ·gfpFromMont(SB),NOSPLIT,$0 + MOVD in+8(FP), a_ptr + + MOVD ·np+0x00(SB), hlp1 + LDP ·p2+0x00(SB), (const0, const1) + LDP ·p2+0x10(SB), (const2, const3) + + LDP 0*16(a_ptr), (x0, x1) + LDP 1*16(a_ptr), (x2, x3) + // Only reduce, no multiplications are needed + // First reduction step + MUL acc0, hlp1, hlp0 + + MUL const0, hlp1, t0 + ADDS t0, acc0, acc0 + UMULH const0, hlp0, t1 + + MUL const1, hlp0, t0 + ADCS t0, acc1, acc1 + UMULH const1, hlp0, y0 + + MUL const2, hlp0, t0 + ADCS t0, acc2, acc2 + UMULH const2, hlp0, acc0 + + MUL const3, hlp0, t0 + ADCS t0, acc3, acc3 + + UMULH const3, hlp0, hlp0 + ADC $0, hlp0 + + ADDS t1, acc1, acc1 + ADCS y0, acc2, acc2 + ADCS acc0, acc3, acc3 + ADC $0, hlp0, acc0 + // Second reduction step + MUL acc1, hlp1, hlp0 + + MUL const0, hlp1, t0 + ADDS t0, acc1, acc1 + UMULH const0, hlp0, t1 + + MUL const1, hlp0, t0 + ADCS t0, acc2, acc2 + UMULH const1, hlp0, y0 + + MUL const2, hlp0, t0 + ADCS t0, acc3, acc3 + UMULH const2, hlp0, acc1 + + MUL const3, hlp0, t0 + ADCS t0, acc0, acc0 + + UMULH const3, hlp0, hlp0 + ADC $0, 
hlp0 + + ADDS t1, acc2, acc2 + ADCS y0, acc3, acc3 + ADCS acc1, acc0, acc0 + ADC $0, hlp0, acc1 + // Third reduction step + MUL acc2, hlp1, hlp0 + + MUL const0, hlp1, t0 + ADDS t0, acc2, acc2 + UMULH const0, hlp0, t1 + + MUL const1, hlp0, t0 + ADCS t0, acc3, acc3 + UMULH const1, hlp0, y0 + + MUL const2, hlp0, t0 + ADCS t0, acc0, acc0 + UMULH const2, hlp0, acc2 + + MUL const3, hlp0, t0 + ADCS t0, acc1, acc1 + + UMULH const3, hlp0, hlp0 + ADC $0, hlp0 + + ADDS t1, acc3, acc3 + ADCS y0, acc0, acc0 + ADCS acc2, acc1, acc1 + ADC $0, hlp0, acc2 + + // Last reduction step + MUL acc3, hlp1, hlp0 + + MUL const0, hlp1, t0 + ADDS t0, acc3, acc3 + UMULH const0, hlp0, t1 + + MUL const1, hlp0, t0 + ADCS t0, acc0, acc0 + UMULH const1, hlp0, y0 + + MUL const2, hlp0, t0 + ADCS t0, acc1, acc1 + UMULH const2, hlp0, acc3 + + MUL const3, hlp0, t0 + ADCS t0, acc2, acc2 + + UMULH const3, hlp0, hlp0 + ADC $0, acc7 + + ADDS t1, acc0, acc0 + ADCS y0, acc1, acc1 + ADCS acc3, acc2, acc2 + ADC $0, hlp0, acc3 + + SUBS const0, acc0, y0 + SBCS const1, acc1, y1 + SBCS const2, acc2, y2 + SBCS const3, acc3, y3 + + CSEL CS, y0, acc0, x0 + CSEL CS, y1, acc1, x1 + CSEL CS, y2, acc2, x2 + CSEL CS, y3, acc3, x3 + + MOVD res+0(FP), res_ptr + STP (x0, x1), 0*16(res_ptr) + STP (x2, x3), 1*16(res_ptr) - MOVD c+0(FP), R0 - storeBlock(R1,R2,R3,R4, 0(R0)) RET diff --git a/sm9/bn256/gfp_decl.go b/sm9/bn256/gfp_decl.go index f19ea4b..9eeaa2a 100644 --- a/sm9/bn256/gfp_decl.go +++ b/sm9/bn256/gfp_decl.go @@ -23,3 +23,14 @@ func gfpSub(c, a, b *gfP) //go:noescape func gfpMul(c, a, b *gfP) + +// Montgomery square, repeated n times (n >= 1). +// +//go:noescape +func gfpSqr(res, in *gfP, n int) + +// Montgomery multiplication by R⁻¹, or 1 outside the domain. +// Sets res = in * R⁻¹, bringing res out of the Montgomery domain. 
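+// Equivalent to a Montgomery multiplication by one; montDecode in gfp.go delegates to it.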
+// +//go:noescape +func gfpFromMont(res, in *gfP) diff --git a/sm9/bn256/gfp_generic.go b/sm9/bn256/gfp_generic.go index e8b7451..fbbf00d 100644 --- a/sm9/bn256/gfp_generic.go +++ b/sm9/bn256/gfp_generic.go @@ -125,3 +125,14 @@ func gfpMul(c, a, b *gfP) { *c = gfP{T[4], T[5], T[6], T[7]} gfpCarry(c, carry) } + +func gfpSqr(res, in *gfP, n int) { + gfpMul(res, in, in) + for i := 1; i < n; i++ { + gfpMul(res, res, res) + } +} + +func gfpFromMont(res, in *gfP) { + gfpMul(res, in, &gfP{1}) +} diff --git a/sm9/bn256/gfp_invert_sqrt.go b/sm9/bn256/gfp_invert_sqrt.go index 6850b72..6ef3196 100644 --- a/sm9/bn256/gfp_invert_sqrt.go +++ b/sm9/bn256/gfp_invert_sqrt.go @@ -73,8 +73,8 @@ func (e *gfP) Invert(x *gfP) *gfP { var t19 = new(gfP) var t20 = new(gfP) - t17.Square(x) - t15.Square(t17) + t17.Square(x, 1) + t15.Square(t17, 1) z.Mul(t17, t15) t2.Mul(t15, z) t14.Mul(x, t2) @@ -100,122 +100,66 @@ func (e *gfP) Invert(x *gfP) *gfP { t17.Mul(z, t18) z.Mul(z, t17) t20.Mul(t13, z) - for s := 0; s < 2; s++ { - t20.Square(t20) - } + t20.Square(t20, 2) t20.Mul(x, t20) - for s := 0; s < 33; s++ { - t20.Square(t20) - } + t20.Square(t20, 33) t19.Mul(t19, t20) - for s := 0; s < 8; s++ { - t19.Square(t19) - } + t19.Square(t19, 8) t19.Mul(t12, t19) - for s := 0; s < 9; s++ { - t19.Square(t19) - } + t19.Square(t19, 9) t18.Mul(t18, t19) - for s := 0; s < 10; s++ { - t18.Square(t18) - } + t18.Square(t18, 10) t18.Mul(t17, t18) - t18.Square(t18) + t18.Square(t18, 1) t18.Mul(x, t18) - for s := 0; s < 14; s++ { - t18.Square(t18) - } + t18.Square(t18, 14) t17.Mul(t17, t18) - for s := 0; s < 5; s++ { - t17.Square(t17) - } + t17.Square(t17, 5) t16.Mul(t16, t17) - for s := 0; s < 9; s++ { - t16.Square(t16) - } + t16.Square(t16, 9) t16.Mul(z, t16) t15.Mul(t15, t16) - t15.Square(t15) + t15.Square(t15, 1) t15.Mul(x, t15) - for s := 0; s < 5; s++ { - t15.Square(t15) - } + t15.Square(t15, 5) t14.Mul(t14, t15) - for s := 0; s < 9; s++ { - t14.Square(t14) - } + t14.Square(t14, 9) t13.Mul(t13, t14) - for s := 0; s < 8; s++ { - t13.Square(t13) - } + t13.Square(t13, 8) t12.Mul(t12, t13) - for s := 0; s < 9; s++ { - t12.Square(t12) - } + t12.Square(t12, 9) t12.Mul(t11, t12) - for s := 0; s < 9; s++ { - t12.Square(t12) - } + t12.Square(t12, 9) t12.Mul(t5, t12) - for s := 0; s < 8; s++ { - t12.Square(t12) - } + t12.Square(t12, 8) t11.Mul(t11, t12) - for s := 0; s < 9; s++ { - t11.Square(t11) - } + t11.Square(t11, 9) t10.Mul(t10, t11) - for s := 0; s < 8; s++ { - t10.Square(t10) - } + t10.Square(t10, 8) t10.Mul(t2, t10) - for s := 0; s < 8; s++ { - t10.Square(t10) - } + t10.Square(t10, 8) t10.Mul(t3, t10) - for s := 0; s < 8; s++ { - t10.Square(t10) - } + t10.Square(t10, 8) t9.Mul(t9, t10) - for s := 0; s < 7; s++ { - t9.Square(t9) - } + t9.Square(t9, 7) t8.Mul(t8, t9) - for s := 0; s < 7; s++ { - t8.Square(t8) - } + t8.Square(t8, 7) t7.Mul(t7, t8) - for s := 0; s < 8; s++ { - t7.Square(t7) - } + t7.Square(t7, 8) t6.Mul(t6, t7) - for s := 0; s < 6; s++ { - t6.Square(t6) - } + t6.Square(t6, 6) t5.Mul(t5, t6) - for s := 0; s < 7; s++ { - t5.Square(t5) - } + t5.Square(t5, 7) t4.Mul(t4, t5) - for s := 0; s < 9; s++ { - t4.Square(t4) - } + t4.Square(t4, 9) t3.Mul(t3, t4) - for s := 0; s < 7; s++ { - t3.Square(t3) - } + t3.Square(t3, 7) t2.Mul(t2, t3) - for s := 0; s < 8; s++ { - t2.Square(t2) - } + t2.Square(t2, 8) t1.Mul(t1, t2) - for s := 0; s < 8; s++ { - t1.Square(t1) - } + t1.Square(t1, 8) t0.Mul(t0, t1) - for s := 0; s < 8; s++ { - t0.Square(t0) - } + t0.Square(t0, 8) z.Mul(z, t0) return e.Set(z) } @@ -231,7 +175,7 @@ func Sqrt(e, x 
*gfP) (isSquare bool) { gfpMul(i, i, b) // i=2(fb)b gfpSub(i, i, one) // i=2(fb)b-1 gfpMul(i, candidate, i) // i=(fb)(2(fb)b-1) - square := new(gfP).Square(i) + square := new(gfP).Square(i, 1) if square.Equal(x) != 1 { return false } @@ -306,8 +250,8 @@ func sqrtCandidate(z, x *gfP) { var t18 = new(gfP) var t19 = new(gfP) - t18.Square(x) - t8.Square(t18) + t18.Square(x, 1) + t8.Square(t18, 1) t16.Mul(t18, t8) t2.Mul(t8, t16) t14.Mul(x, t2) @@ -332,120 +276,64 @@ func sqrtCandidate(z, x *gfP) { t8.Mul(t8, t10) t18.Mul(t18, t8) t16.Mul(t16, t18) - for s := 0; s < 3; s++ { - t19.Square(t19) - } + t19.Square(t19, 3) t19.Mul(x, t19) - for s := 0; s < 33; s++ { - t19.Square(t19) - } + t19.Square(t19, 33) t19.Mul(t15, t19) - for s := 0; s < 8; s++ { - t19.Square(t19) - } + t19.Square(t19, 8) t19.Mul(t12, t19) - for s := 0; s < 9; s++ { - t19.Square(t19) - } + t19.Square(t19, 9) t18.Mul(t18, t19) - for s := 0; s < 10; s++ { - t18.Square(t18) - } + t18.Square(t18, 10) t18.Mul(t16, t18) - t18.Square(t18) + t18.Square(t18, 1) t18.Mul(x, t18) - for s := 0; s < 14; s++ { - t18.Square(t18) - } + t18.Square(t18, 14) t18.Mul(t16, t18) - for s := 0; s < 5; s++ { - t18.Square(t18) - } + t18.Square(t18, 5) t17.Mul(t17, t18) - for s := 0; s < 9; s++ { - t17.Square(t17) - } + t17.Square(t17, 9) t16.Mul(t16, t17) - t16.Square(t16) + t16.Square(t16, 1) t15.Mul(t15, t16) - for s := 0; s < 5; s++ { - t15.Square(t15) - } + t15.Square(t15, 5) t14.Mul(t14, t15) - for s := 0; s < 9; s++ { - t14.Square(t14) - } + t14.Square(t14, 9) t13.Mul(t13, t14) - for s := 0; s < 8; s++ { - t13.Square(t13) - } + t13.Square(t13, 8) t12.Mul(t12, t13) - for s := 0; s < 9; s++ { - t12.Square(t12) - } + t12.Square(t12, 9) t12.Mul(t11, t12) - for s := 0; s < 9; s++ { - t12.Square(t12) - } + t12.Square(t12, 9) t12.Mul(t5, t12) - for s := 0; s < 8; s++ { - t12.Square(t12) - } + t12.Square(t12, 8) t11.Mul(t11, t12) - for s := 0; s < 9; s++ { - t11.Square(t11) - } + t11.Square(t11, 9) t10.Mul(t10, t11) - for s := 0; s < 8; s++ { - t10.Square(t10) - } + t10.Square(t10, 8) t10.Mul(t2, t10) - for s := 0; s < 8; s++ { - t10.Square(t10) - } + t10.Square(t10, 8) t10.Mul(t3, t10) - for s := 0; s < 8; s++ { - t10.Square(t10) - } + t10.Square(t10, 8) t9.Mul(t9, t10) - for s := 0; s < 7; s++ { - t9.Square(t9) - } + t9.Square(t9, 7) t8.Mul(t8, t9) - for s := 0; s < 7; s++ { - t8.Square(t8) - } + t8.Square(t8, 7) t7.Mul(t7, t8) - for s := 0; s < 8; s++ { - t7.Square(t7) - } + t7.Square(t7, 8) t6.Mul(t6, t7) - for s := 0; s < 6; s++ { - t6.Square(t6) - } + t6.Square(t6, 6) t5.Mul(t5, t6) - for s := 0; s < 7; s++ { - t5.Square(t5) - } + t5.Square(t5, 7) t4.Mul(t4, t5) - for s := 0; s < 9; s++ { - t4.Square(t4) - } + t4.Square(t4, 9) t3.Mul(t3, t4) - for s := 0; s < 7; s++ { - t3.Square(t3) - } + t3.Square(t3, 7) t2.Mul(t2, t3) - for s := 0; s < 8; s++ { - t2.Square(t2) - } + t2.Square(t2, 8) t1.Mul(t1, t2) - for s := 0; s < 8; s++ { - t1.Square(t1) - } + t1.Square(t1, 8) t0.Mul(t0, t1) - for s := 0; s < 5; s++ { - t0.Square(t0) - } + t0.Square(t0, 5) z.Mul(z, t0) }
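Since this batch is explicitly staged for arm64 bring-up, a small in-package cross-check of the new entry points against gfpMul is useful while reviewing it. The sketch below is not part of the patch: the test name and input limbs are arbitrary, and it assumes it sits in package bn256 next to the files above, using only the gfP type, the Equal method, and the functions introduced here.

package bn256

import "testing"

func TestGfpSqrFromMontCrossCheck(t *testing.T) {
	// Arbitrary value below p (its top limb is well under p's top limb).
	x := &gfP{0x1122334455667788, 0x99aabbccddeeff00, 0x0123456789abcdef, 0x0fedcba987654321}

	// gfpSqr(res, in, n) must match n successive Montgomery squarings done with gfpMul.
	want, got := &gfP{}, &gfP{}
	gfpMul(want, x, x)
	gfpMul(want, want, want)
	gfpSqr(got, x, 2)
	if got.Equal(want) != 1 {
		t.Errorf("gfpSqr(x, 2) = %v, want %v", got, want)
	}

	// gfpFromMont(res, in) must match a Montgomery multiplication by 1,
	// which is how montDecode was implemented before this change.
	gfpMul(want, x, &gfP{1})
	gfpFromMont(got, x)
	if got.Equal(want) != 1 {
		t.Errorf("gfpFromMont(x) = %v, want %v", got, want)
	}
}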