sm9/bn256: asm rewrite batch 1, for arm64 test

This commit is contained in:
emmansun 2023-06-22 14:35:14 +08:00
parent ecab517411
commit 0bb1fa5be5
7 changed files with 1414 additions and 215 deletions

View File

@ -91,8 +91,8 @@ func (e *gfP) Mul(a, b *gfP) *gfP {
return e
}
func (e *gfP) Square(a *gfP) *gfP {
gfpMul(e, a, a)
func (e *gfP) Square(a *gfP, n int) *gfP {
gfpSqr(e, a, n)
return e
}
@ -150,7 +150,7 @@ func (e *gfP) Unmarshal(in []byte) error {
}
func montEncode(c, a *gfP) { gfpMul(c, a, r2) }
func montDecode(c, a *gfP) { gfpMul(c, a, &gfP{1}) }
func montDecode(c, a *gfP) { gfpFromMont(c, a) }
// cmovznzU64 is a single-word conditional move.
//

View File

@ -173,8 +173,8 @@ func (e *gfP2) Square(a *gfP2) *gfP2 {
// Complex squaring algorithm:
// (xu+y)² = y^2-2*x^2 + 2*u*x*y
tx, ty := &gfP{}, &gfP{}
gfpMul(tx, &a.x, &a.x)
gfpMul(ty, &a.y, &a.y)
gfpSqr(tx, &a.x, 1)
gfpSqr(ty, &a.y, 1)
gfpSub(ty, ty, tx)
gfpSub(ty, ty, tx)
@ -192,8 +192,8 @@ func (e *gfP2) SquareU(a *gfP2) *gfP2 {
tx, ty := &gfP{}, &gfP{}
// tx = a0^2 - 2 * a1^2
gfpMul(ty, &a.x, &a.x)
gfpMul(tx, &a.y, &a.y)
gfpSqr(ty, &a.x, 1)
gfpSqr(tx, &a.y, 1)
gfpAdd(ty, ty, ty)
gfpSub(tx, tx, ty)

View File

@ -1,6 +1,21 @@
//go:build amd64 && !purego
// +build amd64,!purego
#include "textflag.h"
#define res_ptr DI
#define x_ptr SI
#define y_ptr CX
#define acc0 R8
#define acc1 R9
#define acc2 R10
#define acc3 R11
#define acc4 R12
#define acc5 R13
#define t0 R14
#define t1 R15
#define storeBlock(a0,a1,a2,a3, r) \
MOVQ a0, 0+r \
MOVQ a1, 8+r \
@ -34,9 +49,6 @@
CMOVQCC b2, a2 \
CMOVQCC b3, a3
#include "mul_amd64.h"
#include "mul_bmi2_amd64.h"
TEXT ·gfpNeg(SB),0,$0-16
MOVQ ·p2+0(SB), R8
MOVQ ·p2+8(SB), R9
@ -106,25 +118,732 @@ TEXT ·gfpSub(SB),0,$0-24
storeBlock(R8,R9,R10,R11, 0(DI))
RET
TEXT ·gfpMul(SB),0,$160-24
MOVQ a+8(FP), DI
MOVQ b+16(FP), SI
TEXT ·gfpMul(SB),0,$0-24
MOVQ res+0(FP), res_ptr
MOVQ in1+8(FP), x_ptr
MOVQ in2+16(FP), y_ptr
// x * y[0]
MOVQ (8*0)(y_ptr), t0
// Jump to a slightly different implementation if MULX isn't supported.
CMPB ·hasBMI2(SB), $0
JE nobmi2Mul
MOVQ (8*0)(x_ptr), AX
MULQ t0
MOVQ AX, acc0
MOVQ DX, acc1
mulBMI2(0(DI),8(DI),16(DI),24(DI), 0(SI))
storeBlock( R8, R9,R10,R11, 0(SP))
storeBlock(R12,R13,R14,CX, 32(SP))
gfpReduceBMI2()
JMP end
MOVQ (8*1)(x_ptr), AX
MULQ t0
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, acc2
nobmi2Mul:
mul(0(DI),8(DI),16(DI),24(DI), 0(SI), 0(SP))
gfpReduce(0(SP))
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, acc3
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, acc4
XORQ acc5, acc5
// First reduction step
MOVQ acc0, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc0
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ DX, acc4
ADCQ $0, acc5
// x * y[1]
MOVQ (8*1)(y_ptr), t0
MOVQ (8*0)(x_ptr), AX
MULQ t0
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*1)(x_ptr), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ DX, acc5
ADCQ $0, acc0
// Second reduction step
MOVQ acc1, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ DX, acc5
ADCQ $0, acc0
// x * y[2]
MOVQ (8*2)(y_ptr), t0
MOVQ (8*0)(x_ptr), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*1)(x_ptr), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ DX, acc0
ADCQ $0, acc1
// Third reduction step
MOVQ acc2, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ DX, acc0
ADCQ $0, acc1
// x * y[3]
MOVQ (8*3)(y_ptr), t0
MOVQ (8*0)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*1)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
ADCQ $0, acc2
// Last reduction step
MOVQ acc3, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
ADCQ $0, acc2
// Copy result [255:0]
MOVQ acc4, x_ptr
MOVQ acc5, acc3
MOVQ acc0, t0
MOVQ acc1, t1
// Subtract p2
SUBQ ·p2+0x00(SB), acc4
SBBQ ·p2+0x08(SB) ,acc5
SBBQ ·p2+0x10(SB), acc0
SBBQ ·p2+0x18(SB), acc1
SBBQ $0, acc2
CMOVQCS x_ptr, acc4
CMOVQCS acc3, acc5
CMOVQCS t0, acc0
CMOVQCS t1, acc1
MOVQ acc4, (8*0)(res_ptr)
MOVQ acc5, (8*1)(res_ptr)
MOVQ acc0, (8*2)(res_ptr)
MOVQ acc1, (8*3)(res_ptr)
RET
// func gfpSqr(res, in *gfP, n int)
// gfpSqr Montgomery-squares in repeatedly, n times, writing the result to
// res. Callers must pass n >= 1 (BX is decremented before the JNE test, so
// n == 0 would wrap the counter).
TEXT ·gfpSqr(SB),NOSPLIT,$0
MOVQ res+0(FP), res_ptr
MOVQ in+8(FP), x_ptr
// BX holds the number of squarings still to perform.
MOVQ n+16(FP), BX
gfpSqrLoop:
// One iteration: form the 512-bit square of x, Montgomery-reduce it back
// to 256 bits, conditionally subtract p, and store to res.
// y[1:] * y[0]
MOVQ (8*0)(x_ptr), t0
MOVQ (8*1)(x_ptr), AX
MULQ t0
MOVQ AX, acc1
MOVQ DX, acc2
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, acc3
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, acc4
// y[2:] * y[1]
MOVQ (8*1)(x_ptr), t0
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, acc5
// y[3] * y[2]
MOVQ (8*2)(x_ptr), t0
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, y_ptr
XORQ t1, t1
// *2
// Double the off-diagonal products; t1 collects the carry out.
ADDQ acc1, acc1
ADCQ acc2, acc2
ADCQ acc3, acc3
ADCQ acc4, acc4
ADCQ acc5, acc5
ADCQ y_ptr, y_ptr
ADCQ $0, t1
// Missing products
// Add the diagonal terms x[i]^2 into the doubled cross products.
MOVQ (8*0)(x_ptr), AX
MULQ AX
MOVQ AX, acc0
MOVQ DX, t0
MOVQ (8*1)(x_ptr), AX
MULQ AX
ADDQ t0, acc1
ADCQ AX, acc2
ADCQ $0, DX
MOVQ DX, t0
MOVQ (8*2)(x_ptr), AX
MULQ AX
ADDQ t0, acc3
ADCQ AX, acc4
ADCQ $0, DX
MOVQ DX, t0
MOVQ (8*3)(x_ptr), AX
MULQ AX
ADDQ t0, acc5
ADCQ AX, y_ptr
ADCQ DX, t1
MOVQ t1, x_ptr
// T = [acc0, acc1, acc2, acc3, acc4, acc5, y_ptr, x_ptr]
// Four Montgomery reduction steps fold the low 256 bits of T away, one
// 64-bit limb per step (Y = T[i]*np mod 2^64; T += Y*p; shift one limb).
// First reduction step
MOVQ acc0, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc0 // acc0 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
XORQ acc0, acc0
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ DX, acc0
// Second reduction step
MOVQ acc1, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc1 // acc1 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
XORQ acc1, acc1
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
// Third reduction step
MOVQ acc2, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc2 // acc2 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
XORQ acc2, acc2
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ DX, acc2
// Last reduction step
MOVQ acc3, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc3 // acc3 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
XORQ acc3, acc3
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ DX, acc3
// The XORQ also clears CF, so the first ADCQ below acts as ADDQ and
// starts a fresh carry chain; t0 catches the carry out.
XORQ t0, t0
// Add bits [511:256] of the sqr result
ADCQ acc4, acc0
ADCQ acc5, acc1
ADCQ y_ptr, acc2
ADCQ x_ptr, acc3
ADCQ $0, t0
MOVQ acc0, acc4
MOVQ acc1, acc5
MOVQ acc2, y_ptr
MOVQ acc3, t1
// Subtract p2
// Conditional final subtraction: if T - p borrows, keep the saved copy.
SUBQ ·p2+0x00(SB), acc0
SBBQ ·p2+0x08(SB) ,acc1
SBBQ ·p2+0x10(SB), acc2
SBBQ ·p2+0x18(SB), acc3
SBBQ $0, t0
CMOVQCS acc4, acc0
CMOVQCS acc5, acc1
CMOVQCS y_ptr, acc2
CMOVQCS t1, acc3
MOVQ acc0, (8*0)(res_ptr)
MOVQ acc1, (8*1)(res_ptr)
MOVQ acc2, (8*2)(res_ptr)
MOVQ acc3, (8*3)(res_ptr)
// Further squarings (n > 1) read their input from res.
MOVQ res_ptr, x_ptr
DECQ BX
JNE gfpSqrLoop
RET
/* ---------------------------------------*/
// func gfpFromMont(res, in *gfP)
// gfpFromMont converts in out of the Montgomery domain, i.e. computes
// res = in * R⁻¹ mod p. With an implicit second operand of 1, only the
// four Montgomery reduction steps are needed — no multiplication rows.
TEXT ·gfpFromMont(SB),NOSPLIT,$0
MOVQ res+0(FP), res_ptr
MOVQ in+8(FP), x_ptr
MOVQ (8*0)(x_ptr), acc0
MOVQ (8*1)(x_ptr), acc1
MOVQ (8*2)(x_ptr), acc2
MOVQ (8*3)(x_ptr), acc3
XORQ acc4, acc4
// Only reduce, no multiplications are needed
// First reduction step
MOVQ acc0, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc0 // acc0 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
XORQ acc0, acc0
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ DX, acc4
XORQ acc5, acc5
// Second reduction step
MOVQ acc1, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc1 // acc1 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
XORQ acc1, acc1
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ DX, acc5
// Third reduction step
MOVQ acc2, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc2 // acc2 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ DX, acc0
// Last reduction step
MOVQ acc3, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc3 // acc3 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
XORQ acc3, acc3
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
// Result lives in [acc4, acc5, acc0, acc1]; save a copy, then do the
// conditional final subtraction of p.
MOVQ acc4, x_ptr
MOVQ acc5, acc3
MOVQ acc0, t0
MOVQ acc1, t1
SUBQ ·p2+0x00(SB), acc4
SBBQ ·p2+0x08(SB) ,acc5
SBBQ ·p2+0x10(SB), acc0
SBBQ ·p2+0x18(SB), acc1
CMOVQCS x_ptr, acc4
CMOVQCS acc3, acc5
CMOVQCS t0, acc0
CMOVQCS t1, acc1
MOVQ acc4, (8*0)(res_ptr)
MOVQ acc5, (8*1)(res_ptr)
MOVQ acc0, (8*2)(res_ptr)
MOVQ acc1, (8*3)(res_ptr)
// NOTE(review): the "end:" block below appears to be stale residue of the
// removed gfpMul epilogue in this diff rendering — presumably only the
// final RET belongs to gfpFromMont; confirm against the applied file.
end:
MOVQ c+0(FP), DI
storeBlock(R12,R13,R14,CX, 0(DI))
RET

View File

@ -1,6 +1,43 @@
//go:build arm64 && !purego
// +build arm64,!purego
#include "textflag.h"
#define res_ptr R0
#define a_ptr R1
#define b_ptr R2
#define acc0 R3
#define acc1 R4
#define acc2 R5
#define acc3 R6
#define acc4 R7
#define acc5 R8
#define acc6 R9
#define acc7 R10
#define t0 R11
#define t1 R12
#define t2 R13
#define t3 R14
#define const0 R15
#define const1 R16
#define hlp0 R17
#define hlp1 res_ptr
#define x0 R19
#define x1 R20
#define x2 R21
#define x3 R22
#define y0 R23
#define y1 R24
#define y2 R25
#define y3 R26
#define const2 t2
#define const3 t3
#define storeBlock(a0,a1,a2,a3, r) \
MOVD a0, 0+r \
MOVD a1, 8+r \
@ -19,8 +56,6 @@
MOVD ·p2+16(SB), p2 \
MOVD ·p2+24(SB), p3
#include "mul_arm64.h"
TEXT ·gfpNeg(SB),0,$0-16
MOVD a+8(FP), R0
loadBlock(0(R0), R1,R2,R3,R4)
@ -100,15 +135,550 @@ TEXT ·gfpSub(SB),0,$0-24
storeBlock(R1,R2,R3,R4, 0(R0))
RET
TEXT ·gfpMul(SB),0,$0-24
MOVD a+8(FP), R0
loadBlock(0(R0), R1,R2,R3,R4)
MOVD b+16(FP), R0
loadBlock(0(R0), R5,R6,R7,R8)
TEXT ·gfpMul(SB),NOSPLIT,$0
MOVD in1+8(FP), a_ptr
MOVD in2+16(FP), b_ptr
mul(R9,R10,R11,R12,R13,R14,R15,R16)
gfpReduce()
MOVD ·np+0x00(SB), hlp1
LDP ·p2+0x00(SB), (const0, const1)
LDP ·p2+0x10(SB), (const2, const3)
LDP 0*16(a_ptr), (x0, x1)
LDP 1*16(a_ptr), (x2, x3)
LDP 0*16(b_ptr), (y0, y1)
LDP 1*16(b_ptr), (y2, y3)
// y[0] * x
MUL y0, x0, acc0
UMULH y0, x0, acc1
MUL y0, x1, t0
ADDS t0, acc1
UMULH y0, x1, acc2
MUL y0, x2, t0
ADCS t0, acc2
UMULH y0, x2, acc3
MUL y0, x3, t0
ADCS t0, acc3
UMULH y0, x3, acc4
ADC $0, acc4
// First reduction step
MUL acc0, hlp1, hlp0
MUL const0, hlp1, t0
ADDS t0, acc0, acc0
UMULH const0, hlp0, t1
MUL const1, hlp0, t0
ADCS t0, acc1, acc1
UMULH const1, hlp0, y0
MUL const2, hlp0, t0
ADCS t0, acc2, acc2
UMULH const2, hlp0, acc0
MUL const3, hlp0, t0
ADCS t0, acc3, acc3
UMULH const3, hlp0, hlp0
ADC $0, acc4
ADDS t1, acc1, acc1
ADCS y0, acc2, acc2
ADCS acc0, acc3, acc3
ADC $0, hlp0, acc0
// y[1] * x
MUL y1, x0, t0
ADDS t0, acc1
UMULH y1, x0, t1
MUL y1, x1, t0
ADCS t0, acc2
UMULH y1, x1, hlp0
MUL y1, x2, t0
ADCS t0, acc3
UMULH y1, x2, y0
MUL y1, x3, t0
ADCS t0, acc4
UMULH y1, x3, y1
ADC $0, ZR, acc5
ADDS t1, acc2
ADCS hlp0, acc3
ADCS y0, acc4
ADC y1, acc5
// Second reduction step
MUL acc1, hlp1, hlp0
MUL const0, hlp1, t0
ADDS t0, acc1, acc1
UMULH const0, hlp0, t1
MUL const1, hlp0, t0
ADCS t0, acc2, acc2
UMULH const1, hlp0, y0
MUL const2, hlp0, t0
ADCS t0, acc3, acc3
UMULH const2, hlp0, acc1
MUL const3, hlp0, t0
ADCS t0, acc0, acc0
UMULH const3, hlp0, hlp0
ADC $0, acc5
ADDS t1, acc2, acc2
ADCS y0, acc3, acc3
ADCS acc1, acc0, acc0
ADC $0, hlp0, acc1
// y[2] * x
MUL y2, x0, t0
ADDS t0, acc2
UMULH y2, x0, t1
MUL y2, x1, t0
ADCS t0, acc3
UMULH y2, x1, hlp0
MUL y2, x2, t0
ADCS t0, acc4
UMULH y2, x2, y0
MUL y2, x3, t0
ADCS t0, acc5
UMULH y2, x3, y1
ADC $0, ZR, acc6
ADDS t1, acc3
ADCS hlp0, acc4
ADCS y0, acc5
ADC y1, acc6
// Third reduction step
MUL acc2, hlp1, hlp0
MUL const0, hlp1, t0
ADDS t0, acc2, acc2
UMULH const0, hlp0, t1
MUL const1, hlp0, t0
ADCS t0, acc3, acc3
UMULH const1, hlp0, y0
MUL const2, hlp0, t0
ADCS t0, acc0, acc0
UMULH const2, hlp0, acc2
MUL const3, hlp0, t0
ADCS t0, acc1, acc1
UMULH const3, hlp0, hlp0
ADC $0, acc6
ADDS t1, acc3, acc3
ADCS y0, acc0, acc0
ADCS acc2, acc1, acc1
ADC $0, hlp0, acc2
// y[3] * x
MUL y3, x0, t0
ADDS t0, acc3
UMULH y3, x0, t1
MUL y3, x1, t0
ADCS t0, acc4
UMULH y3, x1, hlp0
MUL y3, x2, t0
ADCS t0, acc5
UMULH y3, x2, y0
MUL y3, x3, t0
ADCS t0, acc6
UMULH y3, x3, y1
ADC $0, ZR, acc7
ADDS t1, acc4
ADCS hlp0, acc5
ADCS y0, acc6
ADC y1, acc7
// Last reduction step
MUL acc3, hlp1, hlp0
MUL const0, hlp1, t0
ADDS t0, acc3, acc3
UMULH const0, hlp0, t1
MUL const1, hlp0, t0
ADCS t0, acc0, acc0
UMULH const1, hlp0, y0
MUL const2, hlp0, t0
ADCS t0, acc1, acc1
UMULH const2, hlp0, acc3
MUL const3, hlp0, t0
ADCS t0, acc2, acc2
UMULH const3, hlp0, hlp0
ADC $0, acc7
ADDS t1, acc0, acc0
ADCS y0, acc1, acc1
ADCS acc3, acc2, acc2
ADC $0, hlp0, acc3
ADDS acc4, acc0, acc0
ADCS acc5, acc1, acc1
ADCS acc6, acc2, acc2
ADCS acc7, acc3, acc3
ADC $0, ZR, acc4
SUBS const0, acc0, t0
SBCS const1, acc1, t1
SBCS const2, acc2, t2
SBCS const3, acc3, t3
SBCS $0, acc4, acc4
CSEL CS, t0, acc0, acc0
CSEL CS, t1, acc1, acc1
CSEL CS, t2, acc2, acc2
CSEL CS, t3, acc3, acc3
MOVD res+0(FP), res_ptr
STP (acc0, acc1), 0*16(res_ptr)
STP (acc2, acc3), 1*16(res_ptr)
RET
// func gfpSqr(res, in *gfP, n int)
// gfpSqr Montgomery-squares in repeatedly, n times, writing the result to
// res. Callers must pass n >= 1 (b_ptr is decremented before the CBNZ
// test, so n == 0 would wrap the counter).
TEXT ·gfpSqr(SB),NOSPLIT,$0
MOVD in+8(FP), a_ptr
// b_ptr is reused as the loop counter here, not as a pointer.
MOVD n+16(FP), b_ptr
MOVD ·np+0x00(SB), hlp1
LDP ·p2+0x00(SB), (const0, const1)
LDP ·p2+0x10(SB), (const2, const3)
LDP 0*16(a_ptr), (x0, x1)
LDP 1*16(a_ptr), (x2, x3)
// NOTE(review): label name "ordSqrLoop" looks carried over from
// scalar-order code; "gfpSqrLoop" would match this function's name.
ordSqrLoop:
SUB $1, b_ptr
// x[1:] * x[0]
MUL x0, x1, acc1
UMULH x0, x1, acc2
MUL x0, x2, t0
ADDS t0, acc2, acc2
UMULH x0, x2, acc3
MUL x0, x3, t0
ADCS t0, acc3, acc3
UMULH x0, x3, acc4
ADC $0, acc4, acc4
// x[2:] * x[1]
MUL x1, x2, t0
ADDS t0, acc3
UMULH x1, x2, t1
ADCS t1, acc4
ADC $0, ZR, acc5
MUL x1, x3, t0
ADDS t0, acc4
UMULH x1, x3, t1
ADC t1, acc5
// x[3] * x[2]
MUL x2, x3, t0
ADDS t0, acc5
UMULH x2, x3, acc6
ADC $0, acc6
MOVD $0, acc7
// *2
// Double the off-diagonal products; acc7 collects the carry out.
ADDS acc1, acc1
ADCS acc2, acc2
ADCS acc3, acc3
ADCS acc4, acc4
ADCS acc5, acc5
ADCS acc6, acc6
ADC $0, acc7
// Missing products
// Add the diagonal terms x[i]^2 into the doubled cross products.
MUL x0, x0, acc0
UMULH x0, x0, t0
ADDS t0, acc1, acc1
MUL x1, x1, t0
ADCS t0, acc2, acc2
UMULH x1, x1, t1
ADCS t1, acc3, acc3
MUL x2, x2, t0
ADCS t0, acc4, acc4
UMULH x2, x2, t1
ADCS t1, acc5, acc5
MUL x3, x3, t0
ADCS t0, acc6, acc6
UMULH x3, x3, t1
ADC t1, acc7, acc7
// Four Montgomery reduction steps fold the low half of the 512-bit square
// away, one 64-bit limb per step (hlp0 = m = acc_i * np mod 2^64).
// The "MUL const0, hlp1, t0" in each step computes p[0]*np, which (with
// np presumably being -p⁻¹ mod 2⁶⁴) is ≡ -1 mod 2⁶⁴; only the carry of
// the following ADDS matters, since the low limb is shifted out.
// First reduction step
MUL acc0, hlp1, hlp0
MUL const0, hlp1, t0
ADDS t0, acc0, acc0
UMULH const0, hlp0, t1
MUL const1, hlp0, t0
ADCS t0, acc1, acc1
UMULH const1, hlp0, y0
MUL const2, hlp0, t0
ADCS t0, acc2, acc2
UMULH const2, hlp0, acc0
MUL const3, hlp0, t0
ADCS t0, acc3, acc3
UMULH const3, hlp0, hlp0
ADC $0, hlp0
ADDS t1, acc1, acc1
ADCS y0, acc2, acc2
ADCS acc0, acc3, acc3
ADC $0, hlp0, acc0
// Second reduction step
MUL acc1, hlp1, hlp0
MUL const0, hlp1, t0
ADDS t0, acc1, acc1
UMULH const0, hlp0, t1
MUL const1, hlp0, t0
ADCS t0, acc2, acc2
UMULH const1, hlp0, y0
MUL const2, hlp0, t0
ADCS t0, acc3, acc3
UMULH const2, hlp0, acc1
MUL const3, hlp0, t0
ADCS t0, acc0, acc0
UMULH const3, hlp0, hlp0
ADC $0, hlp0
ADDS t1, acc2, acc2
ADCS y0, acc3, acc3
ADCS acc1, acc0, acc0
ADC $0, hlp0, acc1
// Third reduction step
MUL acc2, hlp1, hlp0
MUL const0, hlp1, t0
ADDS t0, acc2, acc2
UMULH const0, hlp0, t1
MUL const1, hlp0, t0
ADCS t0, acc3, acc3
UMULH const1, hlp0, y0
MUL const2, hlp0, t0
ADCS t0, acc0, acc0
UMULH const2, hlp0, acc2
MUL const3, hlp0, t0
ADCS t0, acc1, acc1
UMULH const3, hlp0, hlp0
ADC $0, hlp0
ADDS t1, acc3, acc3
ADCS y0, acc0, acc0
ADCS acc2, acc1, acc1
ADC $0, hlp0, acc2
// Last reduction step
MUL acc3, hlp1, hlp0
MUL const0, hlp1, t0
ADDS t0, acc3, acc3
UMULH const0, hlp0, t1
MUL const1, hlp0, t0
ADCS t0, acc0, acc0
UMULH const1, hlp0, y0
MUL const2, hlp0, t0
ADCS t0, acc1, acc1
UMULH const2, hlp0, acc3
MUL const3, hlp0, t0
ADCS t0, acc2, acc2
UMULH const3, hlp0, hlp0
// Here the chain carry goes into acc7 (the top word of the high half,
// added below) rather than hlp0 as in the earlier steps.
ADC $0, acc7
ADDS t1, acc0, acc0
ADCS y0, acc1, acc1
ADCS acc3, acc2, acc2
ADC $0, hlp0, acc3
// Add the high half of the square into the reduced low half.
ADDS acc4, acc0, acc0
ADCS acc5, acc1, acc1
ADCS acc6, acc2, acc2
ADCS acc7, acc3, acc3
ADC $0, ZR, acc4
// Conditional final subtraction of p; the CSELs also place the result in
// x0..x3, which feed the next iteration without reloading from a_ptr.
SUBS const0, acc0, y0
SBCS const1, acc1, y1
SBCS const2, acc2, y2
SBCS const3, acc3, y3
SBCS $0, acc4, acc4
CSEL CS, y0, acc0, x0
CSEL CS, y1, acc1, x1
CSEL CS, y2, acc2, x2
CSEL CS, y3, acc3, x3
CBNZ b_ptr, ordSqrLoop
MOVD res+0(FP), res_ptr
STP (x0, x1), 0*16(res_ptr)
STP (x2, x3), 1*16(res_ptr)
RET
/* ---------------------------------------*/
// func gfpFromMont(res, in *gfP)
// gfpFromMont converts in out of the Montgomery domain (res = in * R⁻¹
// mod p); only the four Montgomery reduction steps are performed.
TEXT ·gfpFromMont(SB),NOSPLIT,$0
MOVD in+8(FP), a_ptr
MOVD ·np+0x00(SB), hlp1
LDP ·p2+0x00(SB), (const0, const1)
LDP ·p2+0x10(SB), (const2, const3)
// NOTE(review): the input is loaded into x0..x3 here, but the reduction
// steps below read acc0..acc3, which are never initialized in this
// function — this looks like a bug (the amd64 gfpFromMont loads the
// input into acc0..acc3). Confirm and fix.
LDP 0*16(a_ptr), (x0, x1)
LDP 1*16(a_ptr), (x2, x3)
// Only reduce, no multiplications are needed
// First reduction step
MUL acc0, hlp1, hlp0
MUL const0, hlp1, t0
ADDS t0, acc0, acc0
UMULH const0, hlp0, t1
MUL const1, hlp0, t0
ADCS t0, acc1, acc1
UMULH const1, hlp0, y0
MUL const2, hlp0, t0
ADCS t0, acc2, acc2
UMULH const2, hlp0, acc0
MUL const3, hlp0, t0
ADCS t0, acc3, acc3
UMULH const3, hlp0, hlp0
ADC $0, hlp0
ADDS t1, acc1, acc1
ADCS y0, acc2, acc2
ADCS acc0, acc3, acc3
ADC $0, hlp0, acc0
// Second reduction step
MUL acc1, hlp1, hlp0
MUL const0, hlp1, t0
ADDS t0, acc1, acc1
UMULH const0, hlp0, t1
MUL const1, hlp0, t0
ADCS t0, acc2, acc2
UMULH const1, hlp0, y0
MUL const2, hlp0, t0
ADCS t0, acc3, acc3
UMULH const2, hlp0, acc1
MUL const3, hlp0, t0
ADCS t0, acc0, acc0
UMULH const3, hlp0, hlp0
ADC $0, hlp0
ADDS t1, acc2, acc2
ADCS y0, acc3, acc3
ADCS acc1, acc0, acc0
ADC $0, hlp0, acc1
// Third reduction step
MUL acc2, hlp1, hlp0
MUL const0, hlp1, t0
ADDS t0, acc2, acc2
UMULH const0, hlp0, t1
MUL const1, hlp0, t0
ADCS t0, acc3, acc3
UMULH const1, hlp0, y0
MUL const2, hlp0, t0
ADCS t0, acc0, acc0
UMULH const2, hlp0, acc2
MUL const3, hlp0, t0
ADCS t0, acc1, acc1
UMULH const3, hlp0, hlp0
ADC $0, hlp0
ADDS t1, acc3, acc3
ADCS y0, acc0, acc0
ADCS acc2, acc1, acc1
ADC $0, hlp0, acc2
// Last reduction step
MUL acc3, hlp1, hlp0
MUL const0, hlp1, t0
ADDS t0, acc3, acc3
UMULH const0, hlp0, t1
MUL const1, hlp0, t0
ADCS t0, acc0, acc0
UMULH const1, hlp0, y0
MUL const2, hlp0, t0
ADCS t0, acc1, acc1
UMULH const2, hlp0, acc3
MUL const3, hlp0, t0
ADCS t0, acc2, acc2
UMULH const3, hlp0, hlp0
// NOTE(review): acc7 is never initialized in this function, and the
// chain carry absorbed here is then lost (acc7 is not used again). The
// other reduction steps use "ADC $0, hlp0" at this point; this line
// looks copied from gfpSqr's last reduction step — confirm and fix.
ADC $0, acc7
ADDS t1, acc0, acc0
ADCS y0, acc1, acc1
ADCS acc3, acc2, acc2
ADC $0, hlp0, acc3
// Conditional final subtraction of p.
SUBS const0, acc0, y0
SBCS const1, acc1, y1
SBCS const2, acc2, y2
SBCS const3, acc3, y3
CSEL CS, y0, acc0, x0
CSEL CS, y1, acc1, x1
CSEL CS, y2, acc2, x2
CSEL CS, y3, acc3, x3
MOVD res+0(FP), res_ptr
STP (x0, x1), 0*16(res_ptr)
STP (x2, x3), 1*16(res_ptr)
// NOTE(review): the two lines below look like stale residue of the old
// implementation left by this diff rendering — confirm they are not in
// the applied file.
MOVD c+0(FP), R0
storeBlock(R1,R2,R3,R4, 0(R0))
RET

View File

@ -23,3 +23,14 @@ func gfpSub(c, a, b *gfP)
//go:noescape
func gfpMul(c, a, b *gfP)
// Montgomery square, repeated n times (n >= 1).
//
//go:noescape
func gfpSqr(res, in *gfP, n int)
// Montgomery multiplication by R⁻¹, or 1 outside the domain.
// Sets res = in * R⁻¹, bringing res out of the Montgomery domain.
//
//go:noescape
func gfpFromMont(res, in *gfP)

View File

@ -125,3 +125,14 @@ func gfpMul(c, a, b *gfP) {
*c = gfP{T[4], T[5], T[6], T[7]}
gfpCarry(c, carry)
}
// gfpSqr sets res to in squared n times in the Montgomery domain,
// i.e. res = in^(2^n) mod p. It mirrors the assembly implementations,
// which require n >= 1.
func gfpSqr(res, in *gfP, n int) {
	// The first squaring reads from in; every further squaring squares res.
	gfpMul(res, in, in)
	for n > 1 {
		gfpMul(res, res, res)
		n--
	}
}
// gfpFromMont converts in out of the Montgomery domain, setting
// res = in * R⁻¹ mod p. Montgomery-multiplying by the plain value 1
// (not the Montgomery form of 1) performs exactly this conversion.
func gfpFromMont(res, in *gfP) {
	one := &gfP{1}
	gfpMul(res, in, one)
}

View File

@ -73,8 +73,8 @@ func (e *gfP) Invert(x *gfP) *gfP {
var t19 = new(gfP)
var t20 = new(gfP)
t17.Square(x)
t15.Square(t17)
t17.Square(x, 1)
t15.Square(t17, 1)
z.Mul(t17, t15)
t2.Mul(t15, z)
t14.Mul(x, t2)
@ -100,122 +100,66 @@ func (e *gfP) Invert(x *gfP) *gfP {
t17.Mul(z, t18)
z.Mul(z, t17)
t20.Mul(t13, z)
for s := 0; s < 2; s++ {
t20.Square(t20)
}
t20.Square(t20, 2)
t20.Mul(x, t20)
for s := 0; s < 33; s++ {
t20.Square(t20)
}
t20.Square(t20, 33)
t19.Mul(t19, t20)
for s := 0; s < 8; s++ {
t19.Square(t19)
}
t19.Square(t19, 8)
t19.Mul(t12, t19)
for s := 0; s < 9; s++ {
t19.Square(t19)
}
t19.Square(t19, 9)
t18.Mul(t18, t19)
for s := 0; s < 10; s++ {
t18.Square(t18)
}
t18.Square(t18, 10)
t18.Mul(t17, t18)
t18.Square(t18)
t18.Square(t18, 1)
t18.Mul(x, t18)
for s := 0; s < 14; s++ {
t18.Square(t18)
}
t18.Square(t18, 14)
t17.Mul(t17, t18)
for s := 0; s < 5; s++ {
t17.Square(t17)
}
t17.Square(t17, 5)
t16.Mul(t16, t17)
for s := 0; s < 9; s++ {
t16.Square(t16)
}
t16.Square(t16, 9)
t16.Mul(z, t16)
t15.Mul(t15, t16)
t15.Square(t15)
t15.Square(t15, 1)
t15.Mul(x, t15)
for s := 0; s < 5; s++ {
t15.Square(t15)
}
t15.Square(t15, 5)
t14.Mul(t14, t15)
for s := 0; s < 9; s++ {
t14.Square(t14)
}
t14.Square(t14, 9)
t13.Mul(t13, t14)
for s := 0; s < 8; s++ {
t13.Square(t13)
}
t13.Square(t13, 8)
t12.Mul(t12, t13)
for s := 0; s < 9; s++ {
t12.Square(t12)
}
t12.Square(t12, 9)
t12.Mul(t11, t12)
for s := 0; s < 9; s++ {
t12.Square(t12)
}
t12.Square(t12, 9)
t12.Mul(t5, t12)
for s := 0; s < 8; s++ {
t12.Square(t12)
}
t12.Square(t12, 8)
t11.Mul(t11, t12)
for s := 0; s < 9; s++ {
t11.Square(t11)
}
t11.Square(t11, 9)
t10.Mul(t10, t11)
for s := 0; s < 8; s++ {
t10.Square(t10)
}
t10.Square(t10, 8)
t10.Mul(t2, t10)
for s := 0; s < 8; s++ {
t10.Square(t10)
}
t10.Square(t10, 8)
t10.Mul(t3, t10)
for s := 0; s < 8; s++ {
t10.Square(t10)
}
t10.Square(t10, 8)
t9.Mul(t9, t10)
for s := 0; s < 7; s++ {
t9.Square(t9)
}
t9.Square(t9, 7)
t8.Mul(t8, t9)
for s := 0; s < 7; s++ {
t8.Square(t8)
}
t8.Square(t8, 7)
t7.Mul(t7, t8)
for s := 0; s < 8; s++ {
t7.Square(t7)
}
t7.Square(t7, 8)
t6.Mul(t6, t7)
for s := 0; s < 6; s++ {
t6.Square(t6)
}
t6.Square(t6, 6)
t5.Mul(t5, t6)
for s := 0; s < 7; s++ {
t5.Square(t5)
}
t5.Square(t5, 7)
t4.Mul(t4, t5)
for s := 0; s < 9; s++ {
t4.Square(t4)
}
t4.Square(t4, 9)
t3.Mul(t3, t4)
for s := 0; s < 7; s++ {
t3.Square(t3)
}
t3.Square(t3, 7)
t2.Mul(t2, t3)
for s := 0; s < 8; s++ {
t2.Square(t2)
}
t2.Square(t2, 8)
t1.Mul(t1, t2)
for s := 0; s < 8; s++ {
t1.Square(t1)
}
t1.Square(t1, 8)
t0.Mul(t0, t1)
for s := 0; s < 8; s++ {
t0.Square(t0)
}
t0.Square(t0, 8)
z.Mul(z, t0)
return e.Set(z)
}
@ -231,7 +175,7 @@ func Sqrt(e, x *gfP) (isSquare bool) {
gfpMul(i, i, b) // i=2(fb)b
gfpSub(i, i, one) // i=2(fb)b-1
gfpMul(i, candidate, i) // i=(fb)(2(fb)b-1)
square := new(gfP).Square(i)
square := new(gfP).Square(i, 1)
if square.Equal(x) != 1 {
return false
}
@ -306,8 +250,8 @@ func sqrtCandidate(z, x *gfP) {
var t18 = new(gfP)
var t19 = new(gfP)
t18.Square(x)
t8.Square(t18)
t18.Square(x, 1)
t8.Square(t18, 1)
t16.Mul(t18, t8)
t2.Mul(t8, t16)
t14.Mul(x, t2)
@ -332,120 +276,64 @@ func sqrtCandidate(z, x *gfP) {
t8.Mul(t8, t10)
t18.Mul(t18, t8)
t16.Mul(t16, t18)
for s := 0; s < 3; s++ {
t19.Square(t19)
}
t19.Square(t19, 3)
t19.Mul(x, t19)
for s := 0; s < 33; s++ {
t19.Square(t19)
}
t19.Square(t19, 33)
t19.Mul(t15, t19)
for s := 0; s < 8; s++ {
t19.Square(t19)
}
t19.Square(t19, 8)
t19.Mul(t12, t19)
for s := 0; s < 9; s++ {
t19.Square(t19)
}
t19.Square(t19, 9)
t18.Mul(t18, t19)
for s := 0; s < 10; s++ {
t18.Square(t18)
}
t18.Square(t18, 10)
t18.Mul(t16, t18)
t18.Square(t18)
t18.Square(t18, 1)
t18.Mul(x, t18)
for s := 0; s < 14; s++ {
t18.Square(t18)
}
t18.Square(t18, 14)
t18.Mul(t16, t18)
for s := 0; s < 5; s++ {
t18.Square(t18)
}
t18.Square(t18, 5)
t17.Mul(t17, t18)
for s := 0; s < 9; s++ {
t17.Square(t17)
}
t17.Square(t17, 9)
t16.Mul(t16, t17)
t16.Square(t16)
t16.Square(t16, 1)
t15.Mul(t15, t16)
for s := 0; s < 5; s++ {
t15.Square(t15)
}
t15.Square(t15, 5)
t14.Mul(t14, t15)
for s := 0; s < 9; s++ {
t14.Square(t14)
}
t14.Square(t14, 9)
t13.Mul(t13, t14)
for s := 0; s < 8; s++ {
t13.Square(t13)
}
t13.Square(t13, 8)
t12.Mul(t12, t13)
for s := 0; s < 9; s++ {
t12.Square(t12)
}
t12.Square(t12, 9)
t12.Mul(t11, t12)
for s := 0; s < 9; s++ {
t12.Square(t12)
}
t12.Square(t12, 9)
t12.Mul(t5, t12)
for s := 0; s < 8; s++ {
t12.Square(t12)
}
t12.Square(t12, 8)
t11.Mul(t11, t12)
for s := 0; s < 9; s++ {
t11.Square(t11)
}
t11.Square(t11, 9)
t10.Mul(t10, t11)
for s := 0; s < 8; s++ {
t10.Square(t10)
}
t10.Square(t10, 8)
t10.Mul(t2, t10)
for s := 0; s < 8; s++ {
t10.Square(t10)
}
t10.Square(t10, 8)
t10.Mul(t3, t10)
for s := 0; s < 8; s++ {
t10.Square(t10)
}
t10.Square(t10, 8)
t9.Mul(t9, t10)
for s := 0; s < 7; s++ {
t9.Square(t9)
}
t9.Square(t9, 7)
t8.Mul(t8, t9)
for s := 0; s < 7; s++ {
t8.Square(t8)
}
t8.Square(t8, 7)
t7.Mul(t7, t8)
for s := 0; s < 8; s++ {
t7.Square(t7)
}
t7.Square(t7, 8)
t6.Mul(t6, t7)
for s := 0; s < 6; s++ {
t6.Square(t6)
}
t6.Square(t6, 6)
t5.Mul(t5, t6)
for s := 0; s < 7; s++ {
t5.Square(t5)
}
t5.Square(t5, 7)
t4.Mul(t4, t5)
for s := 0; s < 9; s++ {
t4.Square(t4)
}
t4.Square(t4, 9)
t3.Mul(t3, t4)
for s := 0; s < 7; s++ {
t3.Square(t3)
}
t3.Square(t3, 7)
t2.Mul(t2, t3)
for s := 0; s < 8; s++ {
t2.Square(t2)
}
t2.Square(t2, 8)
t1.Mul(t1, t2)
for s := 0; s < 8; s++ {
t1.Square(t1)
}
t1.Square(t1, 8)
t0.Mul(t0, t1)
for s := 0; s < 5; s++ {
t0.Square(t0)
}
t0.Square(t0, 5)
z.Mul(z, t0)
}