mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-22 10:16:18 +08:00
sm9/bn256: asm rewrite batch 1, for arm64 test
This commit is contained in:
parent
ecab517411
commit
0bb1fa5be5
@ -91,8 +91,8 @@ func (e *gfP) Mul(a, b *gfP) *gfP {
|
||||
return e
|
||||
}
|
||||
|
||||
func (e *gfP) Square(a *gfP) *gfP {
|
||||
gfpMul(e, a, a)
|
||||
func (e *gfP) Square(a *gfP, n int) *gfP {
|
||||
gfpSqr(e, a, n)
|
||||
return e
|
||||
}
|
||||
|
||||
@ -150,7 +150,7 @@ func (e *gfP) Unmarshal(in []byte) error {
|
||||
}
|
||||
|
||||
// montEncode sets c to gfpMul(a, r2).
//
// NOTE(review): r2 is presumably R² mod p, so the Montgomery product a·r2
// yields a in Montgomery form — confirm against the definition of r2.
func montEncode(c, a *gfP) {
	gfpMul(c, a, r2)
}
|
||||
func montDecode(c, a *gfP) { gfpMul(c, a, &gfP{1}) }
|
||||
func montDecode(c, a *gfP) { gfpFromMont(c, a) }
|
||||
|
||||
// cmovznzU64 is a single-word conditional move.
|
||||
//
|
||||
|
@ -173,8 +173,8 @@ func (e *gfP2) Square(a *gfP2) *gfP2 {
|
||||
// Complex squaring algorithm:
|
||||
// (xu+y)² = y^2-2*x^2 + 2*u*x*y
|
||||
tx, ty := &gfP{}, &gfP{}
|
||||
gfpMul(tx, &a.x, &a.x)
|
||||
gfpMul(ty, &a.y, &a.y)
|
||||
gfpSqr(tx, &a.x, 1)
|
||||
gfpSqr(ty, &a.y, 1)
|
||||
gfpSub(ty, ty, tx)
|
||||
gfpSub(ty, ty, tx)
|
||||
|
||||
@ -192,8 +192,8 @@ func (e *gfP2) SquareU(a *gfP2) *gfP2 {
|
||||
|
||||
tx, ty := &gfP{}, &gfP{}
|
||||
// tx = a0^2 - 2 * a1^2
|
||||
gfpMul(ty, &a.x, &a.x)
|
||||
gfpMul(tx, &a.y, &a.y)
|
||||
gfpSqr(ty, &a.x, 1)
|
||||
gfpSqr(tx, &a.y, 1)
|
||||
gfpAdd(ty, ty, ty)
|
||||
gfpSub(tx, tx, ty)
|
||||
|
||||
|
@ -1,6 +1,21 @@
|
||||
//go:build amd64 && !purego
|
||||
// +build amd64,!purego
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
#define res_ptr DI
|
||||
#define x_ptr SI
|
||||
#define y_ptr CX
|
||||
|
||||
#define acc0 R8
|
||||
#define acc1 R9
|
||||
#define acc2 R10
|
||||
#define acc3 R11
|
||||
#define acc4 R12
|
||||
#define acc5 R13
|
||||
#define t0 R14
|
||||
#define t1 R15
|
||||
|
||||
#define storeBlock(a0,a1,a2,a3, r) \
|
||||
MOVQ a0, 0+r \
|
||||
MOVQ a1, 8+r \
|
||||
@ -34,9 +49,6 @@
|
||||
CMOVQCC b2, a2 \
|
||||
CMOVQCC b3, a3
|
||||
|
||||
#include "mul_amd64.h"
|
||||
#include "mul_bmi2_amd64.h"
|
||||
|
||||
TEXT ·gfpNeg(SB),0,$0-16
|
||||
MOVQ ·p2+0(SB), R8
|
||||
MOVQ ·p2+8(SB), R9
|
||||
@ -106,25 +118,732 @@ TEXT ·gfpSub(SB),0,$0-24
|
||||
storeBlock(R8,R9,R10,R11, 0(DI))
|
||||
RET
|
||||
|
||||
TEXT ·gfpMul(SB),0,$160-24
|
||||
MOVQ a+8(FP), DI
|
||||
MOVQ b+16(FP), SI
|
||||
TEXT ·gfpMul(SB),0,$0-24
|
||||
MOVQ res+0(FP), res_ptr
|
||||
MOVQ in1+8(FP), x_ptr
|
||||
MOVQ in2+16(FP), y_ptr
|
||||
// x * y[0]
|
||||
MOVQ (8*0)(y_ptr), t0
|
||||
|
||||
// Jump to a slightly different implementation if MULX isn't supported.
|
||||
CMPB ·hasBMI2(SB), $0
|
||||
JE nobmi2Mul
|
||||
MOVQ (8*0)(x_ptr), AX
|
||||
MULQ t0
|
||||
MOVQ AX, acc0
|
||||
MOVQ DX, acc1
|
||||
|
||||
mulBMI2(0(DI),8(DI),16(DI),24(DI), 0(SI))
|
||||
storeBlock( R8, R9,R10,R11, 0(SP))
|
||||
storeBlock(R12,R13,R14,CX, 32(SP))
|
||||
gfpReduceBMI2()
|
||||
JMP end
|
||||
MOVQ (8*1)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc1
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, acc2
|
||||
|
||||
nobmi2Mul:
|
||||
mul(0(DI),8(DI),16(DI),24(DI), 0(SI), 0(SP))
|
||||
gfpReduce(0(SP))
|
||||
MOVQ (8*2)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, acc3
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, acc4
|
||||
XORQ acc5, acc5
|
||||
// First reduction step
|
||||
MOVQ acc0, AX
|
||||
MULQ ·np+0x00(SB)
|
||||
MOVQ AX, t0
|
||||
|
||||
MOVQ ·p2+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc0
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ ·p2+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc1
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc1
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ ·p2+0x10(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc2
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ ·p2+0x18(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc3
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc3
|
||||
ADCQ DX, acc4
|
||||
ADCQ $0, acc5
|
||||
// x * y[1]
|
||||
MOVQ (8*1)(y_ptr), t0
|
||||
|
||||
MOVQ (8*0)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc1
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ (8*1)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc2
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ (8*2)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc3
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc4
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc4
|
||||
ADCQ DX, acc5
|
||||
ADCQ $0, acc0
|
||||
// Second reduction step
|
||||
MOVQ acc1, AX
|
||||
MULQ ·np+0x00(SB)
|
||||
MOVQ AX, t0
|
||||
|
||||
MOVQ ·p2+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc1
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ ·p2+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc2
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ ·p2+0x10(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc3
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ ·p2+0x18(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc4
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc4
|
||||
ADCQ DX, acc5
|
||||
ADCQ $0, acc0
|
||||
// x * y[2]
|
||||
MOVQ (8*2)(y_ptr), t0
|
||||
|
||||
MOVQ (8*0)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ (8*1)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc3
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ (8*2)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc4
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc4
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc5
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc5
|
||||
ADCQ DX, acc0
|
||||
ADCQ $0, acc1
|
||||
// Third reduction step
|
||||
MOVQ acc2, AX
|
||||
MULQ ·np+0x00(SB)
|
||||
MOVQ AX, t0
|
||||
|
||||
MOVQ ·p2+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ ·p2+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc3
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ ·p2+0x10(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc4
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc4
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ ·p2+0x18(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc5
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc5
|
||||
ADCQ DX, acc0
|
||||
ADCQ $0, acc1
|
||||
// x * y[3]
|
||||
MOVQ (8*3)(y_ptr), t0
|
||||
|
||||
MOVQ (8*0)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ (8*1)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc4
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc4
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ (8*2)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc5
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc5
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc0
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc0
|
||||
ADCQ DX, acc1
|
||||
ADCQ $0, acc2
|
||||
// Last reduction step
|
||||
MOVQ acc3, AX
|
||||
MULQ ·np+0x00(SB)
|
||||
MOVQ AX, t0
|
||||
|
||||
MOVQ ·p2+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ ·p2+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc4
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc4
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ ·p2+0x10(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc5
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc5
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ ·p2+0x18(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc0
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc0
|
||||
ADCQ DX, acc1
|
||||
ADCQ $0, acc2
|
||||
// Copy result [255:0]
|
||||
MOVQ acc4, x_ptr
|
||||
MOVQ acc5, acc3
|
||||
MOVQ acc0, t0
|
||||
MOVQ acc1, t1
|
||||
// Subtract p2
|
||||
SUBQ ·p2+0x00(SB), acc4
|
||||
SBBQ ·p2+0x08(SB) ,acc5
|
||||
SBBQ ·p2+0x10(SB), acc0
|
||||
SBBQ ·p2+0x18(SB), acc1
|
||||
SBBQ $0, acc2
|
||||
|
||||
CMOVQCS x_ptr, acc4
|
||||
CMOVQCS acc3, acc5
|
||||
CMOVQCS t0, acc0
|
||||
CMOVQCS t1, acc1
|
||||
|
||||
MOVQ acc4, (8*0)(res_ptr)
|
||||
MOVQ acc5, (8*1)(res_ptr)
|
||||
MOVQ acc0, (8*2)(res_ptr)
|
||||
MOVQ acc1, (8*3)(res_ptr)
|
||||
|
||||
RET
|
||||
|
||||
// func gfpSqr(res, in *gfP, n int)
|
||||
TEXT ·gfpSqr(SB),NOSPLIT,$0
|
||||
MOVQ res+0(FP), res_ptr
|
||||
MOVQ in+8(FP), x_ptr
|
||||
MOVQ n+16(FP), BX
|
||||
|
||||
gfpSqrLoop:
|
||||
|
||||
// y[1:] * y[0]
|
||||
MOVQ (8*0)(x_ptr), t0
|
||||
|
||||
MOVQ (8*1)(x_ptr), AX
|
||||
MULQ t0
|
||||
MOVQ AX, acc1
|
||||
MOVQ DX, acc2
|
||||
|
||||
MOVQ (8*2)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, acc3
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, acc4
|
||||
// y[2:] * y[1]
|
||||
MOVQ (8*1)(x_ptr), t0
|
||||
|
||||
MOVQ (8*2)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc4
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc4
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, acc5
|
||||
// y[3] * y[2]
|
||||
MOVQ (8*2)(x_ptr), t0
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc5
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, y_ptr
|
||||
XORQ t1, t1
|
||||
// *2
|
||||
ADDQ acc1, acc1
|
||||
ADCQ acc2, acc2
|
||||
ADCQ acc3, acc3
|
||||
ADCQ acc4, acc4
|
||||
ADCQ acc5, acc5
|
||||
ADCQ y_ptr, y_ptr
|
||||
ADCQ $0, t1
|
||||
// Missing products
|
||||
MOVQ (8*0)(x_ptr), AX
|
||||
MULQ AX
|
||||
MOVQ AX, acc0
|
||||
MOVQ DX, t0
|
||||
|
||||
MOVQ (8*1)(x_ptr), AX
|
||||
MULQ AX
|
||||
ADDQ t0, acc1
|
||||
ADCQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t0
|
||||
|
||||
MOVQ (8*2)(x_ptr), AX
|
||||
MULQ AX
|
||||
ADDQ t0, acc3
|
||||
ADCQ AX, acc4
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t0
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ AX
|
||||
ADDQ t0, acc5
|
||||
ADCQ AX, y_ptr
|
||||
ADCQ DX, t1
|
||||
MOVQ t1, x_ptr
|
||||
// T = [acc0, acc1, acc2, acc3, acc4, acc5, y_ptr, x_ptr]
|
||||
// First reduction step
|
||||
MOVQ acc0, AX
|
||||
MULQ ·np+0x00(SB)
|
||||
MOVQ AX, t0 // Y
|
||||
|
||||
// Calculate next T = T+Y*P
|
||||
MOVQ ·p2+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc0 // acc0 is free now
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
XORQ acc0, acc0
|
||||
|
||||
MOVQ ·p2+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc1
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc1
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
|
||||
MOVQ ·p2+0x10(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc2
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
|
||||
MOVQ ·p2+0x18(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc3
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc3
|
||||
ADCQ DX, acc0
|
||||
|
||||
// Second reduction step
|
||||
MOVQ acc1, AX
|
||||
MULQ ·np+0x00(SB)
|
||||
MOVQ AX, t0 // Y
|
||||
|
||||
// Calculate next T = T+Y*P
|
||||
MOVQ ·p2+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc1 // acc1 is free now
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
XORQ acc1, acc1
|
||||
|
||||
MOVQ ·p2+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc2
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
|
||||
MOVQ ·p2+0x10(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc3
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
|
||||
MOVQ ·p2+0x18(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc0
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc0
|
||||
ADCQ DX, acc1
|
||||
|
||||
// Third reduction step
|
||||
MOVQ acc2, AX
|
||||
MULQ ·np+0x00(SB)
|
||||
MOVQ AX, t0 // Y
|
||||
|
||||
// Calculate next T = T+Y*P
|
||||
MOVQ ·p2+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc2 // acc2 is free now
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
XORQ acc2, acc2
|
||||
|
||||
MOVQ ·p2+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc3
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
|
||||
MOVQ ·p2+0x10(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc0
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc0
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
|
||||
MOVQ ·p2+0x18(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc1
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc1
|
||||
ADCQ DX, acc2
|
||||
|
||||
// Last reduction step
|
||||
MOVQ acc3, AX
|
||||
MULQ ·np+0x00(SB)
|
||||
MOVQ AX, t0 // Y
|
||||
|
||||
// Calculate next T = T+Y*P
|
||||
MOVQ ·p2+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc3 // acc3 is free now
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
XORQ acc3, acc3
|
||||
|
||||
MOVQ ·p2+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc0
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc0
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
|
||||
MOVQ ·p2+0x10(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc1
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc1
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
|
||||
MOVQ ·p2+0x18(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc2
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc2
|
||||
ADCQ DX, acc3
|
||||
|
||||
XORQ t0, t0
|
||||
// Add bits [511:256] of the sqr result
|
||||
ADCQ acc4, acc0
|
||||
ADCQ acc5, acc1
|
||||
ADCQ y_ptr, acc2
|
||||
ADCQ x_ptr, acc3
|
||||
ADCQ $0, t0
|
||||
|
||||
MOVQ acc0, acc4
|
||||
MOVQ acc1, acc5
|
||||
MOVQ acc2, y_ptr
|
||||
MOVQ acc3, t1
|
||||
// Subtract p2
|
||||
SUBQ ·p2+0x00(SB), acc0
|
||||
SBBQ ·p2+0x08(SB) ,acc1
|
||||
SBBQ ·p2+0x10(SB), acc2
|
||||
SBBQ ·p2+0x18(SB), acc3
|
||||
SBBQ $0, t0
|
||||
|
||||
CMOVQCS acc4, acc0
|
||||
CMOVQCS acc5, acc1
|
||||
CMOVQCS y_ptr, acc2
|
||||
CMOVQCS t1, acc3
|
||||
|
||||
MOVQ acc0, (8*0)(res_ptr)
|
||||
MOVQ acc1, (8*1)(res_ptr)
|
||||
MOVQ acc2, (8*2)(res_ptr)
|
||||
MOVQ acc3, (8*3)(res_ptr)
|
||||
MOVQ res_ptr, x_ptr
|
||||
DECQ BX
|
||||
JNE gfpSqrLoop
|
||||
|
||||
RET
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// func gfpFromMont(res, in *gfP)
|
||||
TEXT ·gfpFromMont(SB),NOSPLIT,$0
|
||||
MOVQ res+0(FP), res_ptr
|
||||
MOVQ in+8(FP), x_ptr
|
||||
|
||||
MOVQ (8*0)(x_ptr), acc0
|
||||
MOVQ (8*1)(x_ptr), acc1
|
||||
MOVQ (8*2)(x_ptr), acc2
|
||||
MOVQ (8*3)(x_ptr), acc3
|
||||
XORQ acc4, acc4
|
||||
|
||||
// Only reduce, no multiplications are needed
|
||||
// First reduction step
|
||||
MOVQ acc0, AX
|
||||
MULQ ·np+0x00(SB)
|
||||
MOVQ AX, t0 // Y
|
||||
|
||||
// Calculate next T = T+Y*P
|
||||
MOVQ ·p2+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc0 // acc0 is free now
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
XORQ acc0, acc0
|
||||
|
||||
MOVQ ·p2+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc1
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc1
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
|
||||
MOVQ ·p2+0x10(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc2
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
|
||||
MOVQ ·p2+0x18(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc3
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc3
|
||||
ADCQ DX, acc4
|
||||
XORQ acc5, acc5
|
||||
|
||||
// Second reduction step
|
||||
MOVQ acc1, AX
|
||||
MULQ ·np+0x00(SB)
|
||||
MOVQ AX, t0 // Y
|
||||
|
||||
// Calculate next T = T+Y*P
|
||||
MOVQ ·p2+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc1 // acc1 is free now
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
XORQ acc1, acc1
|
||||
|
||||
MOVQ ·p2+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc2
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
|
||||
MOVQ ·p2+0x10(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc3
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
|
||||
MOVQ ·p2+0x18(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc4
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc4
|
||||
ADCQ DX, acc5
|
||||
|
||||
// Third reduction step
|
||||
MOVQ acc2, AX
|
||||
MULQ ·np+0x00(SB)
|
||||
MOVQ AX, t0 // Y
|
||||
|
||||
// Calculate next T = T+Y*P
|
||||
MOVQ ·p2+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc2 // acc2 is free now
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
|
||||
MOVQ ·p2+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc3
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
|
||||
MOVQ ·p2+0x10(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc4
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc4
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
|
||||
MOVQ ·p2+0x18(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc5
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc5
|
||||
ADCQ DX, acc0
|
||||
|
||||
// Last reduction step
|
||||
MOVQ acc3, AX
|
||||
MULQ ·np+0x00(SB)
|
||||
MOVQ AX, t0 // Y
|
||||
|
||||
// Calculate next T = T+Y*P
|
||||
MOVQ ·p2+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc3 // acc3 is free now
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
XORQ acc3, acc3
|
||||
|
||||
MOVQ ·p2+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc4
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc4
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
|
||||
MOVQ ·p2+0x10(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc5
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc5
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1 // carry
|
||||
|
||||
MOVQ ·p2+0x18(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc0
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc0
|
||||
ADCQ DX, acc1
|
||||
|
||||
MOVQ acc4, x_ptr
|
||||
MOVQ acc5, acc3
|
||||
MOVQ acc0, t0
|
||||
MOVQ acc1, t1
|
||||
|
||||
SUBQ ·p2+0x00(SB), acc4
|
||||
SBBQ ·p2+0x08(SB) ,acc5
|
||||
SBBQ ·p2+0x10(SB), acc0
|
||||
SBBQ ·p2+0x18(SB), acc1
|
||||
|
||||
CMOVQCS x_ptr, acc4
|
||||
CMOVQCS acc3, acc5
|
||||
CMOVQCS t0, acc0
|
||||
CMOVQCS t1, acc1
|
||||
|
||||
MOVQ acc4, (8*0)(res_ptr)
|
||||
MOVQ acc5, (8*1)(res_ptr)
|
||||
MOVQ acc0, (8*2)(res_ptr)
|
||||
MOVQ acc1, (8*3)(res_ptr)
|
||||
|
||||
end:
|
||||
MOVQ c+0(FP), DI
|
||||
storeBlock(R12,R13,R14,CX, 0(DI))
|
||||
RET
|
||||
|
@ -1,6 +1,43 @@
|
||||
//go:build arm64 && !purego
|
||||
// +build arm64,!purego
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
#define res_ptr R0
|
||||
#define a_ptr R1
|
||||
#define b_ptr R2
|
||||
|
||||
#define acc0 R3
|
||||
#define acc1 R4
|
||||
#define acc2 R5
|
||||
#define acc3 R6
|
||||
|
||||
#define acc4 R7
|
||||
#define acc5 R8
|
||||
#define acc6 R9
|
||||
#define acc7 R10
|
||||
#define t0 R11
|
||||
#define t1 R12
|
||||
#define t2 R13
|
||||
#define t3 R14
|
||||
#define const0 R15
|
||||
#define const1 R16
|
||||
|
||||
#define hlp0 R17
|
||||
#define hlp1 res_ptr
|
||||
|
||||
#define x0 R19
|
||||
#define x1 R20
|
||||
#define x2 R21
|
||||
#define x3 R22
|
||||
#define y0 R23
|
||||
#define y1 R24
|
||||
#define y2 R25
|
||||
#define y3 R26
|
||||
|
||||
#define const2 t2
|
||||
#define const3 t3
|
||||
|
||||
#define storeBlock(a0,a1,a2,a3, r) \
|
||||
MOVD a0, 0+r \
|
||||
MOVD a1, 8+r \
|
||||
@ -19,8 +56,6 @@
|
||||
MOVD ·p2+16(SB), p2 \
|
||||
MOVD ·p2+24(SB), p3
|
||||
|
||||
#include "mul_arm64.h"
|
||||
|
||||
TEXT ·gfpNeg(SB),0,$0-16
|
||||
MOVD a+8(FP), R0
|
||||
loadBlock(0(R0), R1,R2,R3,R4)
|
||||
@ -100,15 +135,550 @@ TEXT ·gfpSub(SB),0,$0-24
|
||||
storeBlock(R1,R2,R3,R4, 0(R0))
|
||||
RET
|
||||
|
||||
TEXT ·gfpMul(SB),0,$0-24
|
||||
MOVD a+8(FP), R0
|
||||
loadBlock(0(R0), R1,R2,R3,R4)
|
||||
MOVD b+16(FP), R0
|
||||
loadBlock(0(R0), R5,R6,R7,R8)
|
||||
TEXT ·gfpMul(SB),NOSPLIT,$0
|
||||
MOVD in1+8(FP), a_ptr
|
||||
MOVD in2+16(FP), b_ptr
|
||||
|
||||
mul(R9,R10,R11,R12,R13,R14,R15,R16)
|
||||
gfpReduce()
|
||||
MOVD ·np+0x00(SB), hlp1
|
||||
LDP ·p2+0x00(SB), (const0, const1)
|
||||
LDP ·p2+0x10(SB), (const2, const3)
|
||||
|
||||
LDP 0*16(a_ptr), (x0, x1)
|
||||
LDP 1*16(a_ptr), (x2, x3)
|
||||
LDP 0*16(b_ptr), (y0, y1)
|
||||
LDP 1*16(b_ptr), (y2, y3)
|
||||
|
||||
// y[0] * x
|
||||
MUL y0, x0, acc0
|
||||
UMULH y0, x0, acc1
|
||||
|
||||
MUL y0, x1, t0
|
||||
ADDS t0, acc1
|
||||
UMULH y0, x1, acc2
|
||||
|
||||
MUL y0, x2, t0
|
||||
ADCS t0, acc2
|
||||
UMULH y0, x2, acc3
|
||||
|
||||
MUL y0, x3, t0
|
||||
ADCS t0, acc3
|
||||
UMULH y0, x3, acc4
|
||||
ADC $0, acc4
|
||||
// First reduction step
|
||||
MUL acc0, hlp1, hlp0
|
||||
|
||||
MUL const0, hlp1, t0
|
||||
ADDS t0, acc0, acc0
|
||||
UMULH const0, hlp0, t1
|
||||
|
||||
MUL const1, hlp0, t0
|
||||
ADCS t0, acc1, acc1
|
||||
UMULH const1, hlp0, y0
|
||||
|
||||
MUL const2, hlp0, t0
|
||||
ADCS t0, acc2, acc2
|
||||
UMULH const2, hlp0, acc0
|
||||
|
||||
MUL const3, hlp0, t0
|
||||
ADCS t0, acc3, acc3
|
||||
|
||||
UMULH const3, hlp0, hlp0
|
||||
ADC $0, acc4
|
||||
|
||||
ADDS t1, acc1, acc1
|
||||
ADCS y0, acc2, acc2
|
||||
ADCS acc0, acc3, acc3
|
||||
ADC $0, hlp0, acc0
|
||||
// y[1] * x
|
||||
MUL y1, x0, t0
|
||||
ADDS t0, acc1
|
||||
UMULH y1, x0, t1
|
||||
|
||||
MUL y1, x1, t0
|
||||
ADCS t0, acc2
|
||||
UMULH y1, x1, hlp0
|
||||
|
||||
MUL y1, x2, t0
|
||||
ADCS t0, acc3
|
||||
UMULH y1, x2, y0
|
||||
|
||||
MUL y1, x3, t0
|
||||
ADCS t0, acc4
|
||||
UMULH y1, x3, y1
|
||||
ADC $0, ZR, acc5
|
||||
|
||||
ADDS t1, acc2
|
||||
ADCS hlp0, acc3
|
||||
ADCS y0, acc4
|
||||
ADC y1, acc5
|
||||
// Second reduction step
|
||||
MUL acc1, hlp1, hlp0
|
||||
|
||||
MUL const0, hlp1, t0
|
||||
ADDS t0, acc1, acc1
|
||||
UMULH const0, hlp0, t1
|
||||
|
||||
MUL const1, hlp0, t0
|
||||
ADCS t0, acc2, acc2
|
||||
UMULH const1, hlp0, y0
|
||||
|
||||
MUL const2, hlp0, t0
|
||||
ADCS t0, acc3, acc3
|
||||
UMULH const2, hlp0, acc1
|
||||
|
||||
MUL const3, hlp0, t0
|
||||
ADCS t0, acc0, acc0
|
||||
|
||||
UMULH const3, hlp0, hlp0
|
||||
ADC $0, acc5
|
||||
|
||||
ADDS t1, acc2, acc2
|
||||
ADCS y0, acc3, acc3
|
||||
ADCS acc1, acc0, acc0
|
||||
ADC $0, hlp0, acc1
|
||||
// y[2] * x
|
||||
MUL y2, x0, t0
|
||||
ADDS t0, acc2
|
||||
UMULH y2, x0, t1
|
||||
|
||||
MUL y2, x1, t0
|
||||
ADCS t0, acc3
|
||||
UMULH y2, x1, hlp0
|
||||
|
||||
MUL y2, x2, t0
|
||||
ADCS t0, acc4
|
||||
UMULH y2, x2, y0
|
||||
|
||||
MUL y2, x3, t0
|
||||
ADCS t0, acc5
|
||||
UMULH y2, x3, y1
|
||||
ADC $0, ZR, acc6
|
||||
|
||||
ADDS t1, acc3
|
||||
ADCS hlp0, acc4
|
||||
ADCS y0, acc5
|
||||
ADC y1, acc6
|
||||
// Third reduction step
|
||||
MUL acc2, hlp1, hlp0
|
||||
|
||||
MUL const0, hlp1, t0
|
||||
ADDS t0, acc2, acc2
|
||||
UMULH const0, hlp0, t1
|
||||
|
||||
MUL const1, hlp0, t0
|
||||
ADCS t0, acc3, acc3
|
||||
UMULH const1, hlp0, y0
|
||||
|
||||
MUL const2, hlp0, t0
|
||||
ADCS t0, acc0, acc0
|
||||
UMULH const2, hlp0, acc2
|
||||
|
||||
MUL const3, hlp0, t0
|
||||
ADCS t0, acc1, acc1
|
||||
|
||||
UMULH const3, hlp0, hlp0
|
||||
ADC $0, acc6
|
||||
|
||||
ADDS t1, acc3, acc3
|
||||
ADCS y0, acc0, acc0
|
||||
ADCS acc2, acc1, acc1
|
||||
ADC $0, hlp0, acc2
|
||||
// y[3] * x
|
||||
MUL y3, x0, t0
|
||||
ADDS t0, acc3
|
||||
UMULH y3, x0, t1
|
||||
|
||||
MUL y3, x1, t0
|
||||
ADCS t0, acc4
|
||||
UMULH y3, x1, hlp0
|
||||
|
||||
MUL y3, x2, t0
|
||||
ADCS t0, acc5
|
||||
UMULH y3, x2, y0
|
||||
|
||||
MUL y3, x3, t0
|
||||
ADCS t0, acc6
|
||||
UMULH y3, x3, y1
|
||||
ADC $0, ZR, acc7
|
||||
|
||||
ADDS t1, acc4
|
||||
ADCS hlp0, acc5
|
||||
ADCS y0, acc6
|
||||
ADC y1, acc7
|
||||
// Last reduction step
|
||||
MUL acc3, hlp1, hlp0
|
||||
|
||||
MUL const0, hlp1, t0
|
||||
ADDS t0, acc3, acc3
|
||||
UMULH const0, hlp0, t1
|
||||
|
||||
MUL const1, hlp0, t0
|
||||
ADCS t0, acc0, acc0
|
||||
UMULH const1, hlp0, y0
|
||||
|
||||
MUL const2, hlp0, t0
|
||||
ADCS t0, acc1, acc1
|
||||
UMULH const2, hlp0, acc3
|
||||
|
||||
MUL const3, hlp0, t0
|
||||
ADCS t0, acc2, acc2
|
||||
|
||||
UMULH const3, hlp0, hlp0
|
||||
ADC $0, acc7
|
||||
|
||||
ADDS t1, acc0, acc0
|
||||
ADCS y0, acc1, acc1
|
||||
ADCS acc3, acc2, acc2
|
||||
ADC $0, hlp0, acc3
|
||||
|
||||
ADDS acc4, acc0, acc0
|
||||
ADCS acc5, acc1, acc1
|
||||
ADCS acc6, acc2, acc2
|
||||
ADCS acc7, acc3, acc3
|
||||
ADC $0, ZR, acc4
|
||||
|
||||
SUBS const0, acc0, t0
|
||||
SBCS const1, acc1, t1
|
||||
SBCS const2, acc2, t2
|
||||
SBCS const3, acc3, t3
|
||||
SBCS $0, acc4, acc4
|
||||
|
||||
CSEL CS, t0, acc0, acc0
|
||||
CSEL CS, t1, acc1, acc1
|
||||
CSEL CS, t2, acc2, acc2
|
||||
CSEL CS, t3, acc3, acc3
|
||||
|
||||
MOVD res+0(FP), res_ptr
|
||||
STP (acc0, acc1), 0*16(res_ptr)
|
||||
STP (acc2, acc3), 1*16(res_ptr)
|
||||
|
||||
RET
|
||||
|
||||
// func gfpSqr(res, in *gfP, n int)
|
||||
TEXT ·gfpSqr(SB),NOSPLIT,$0
|
||||
MOVD in+8(FP), a_ptr
|
||||
MOVD n+16(FP), b_ptr
|
||||
|
||||
MOVD ·np+0x00(SB), hlp1
|
||||
LDP ·p2+0x00(SB), (const0, const1)
|
||||
LDP ·p2+0x10(SB), (const2, const3)
|
||||
|
||||
LDP 0*16(a_ptr), (x0, x1)
|
||||
LDP 1*16(a_ptr), (x2, x3)
|
||||
|
||||
ordSqrLoop:
|
||||
SUB $1, b_ptr
|
||||
|
||||
// x[1:] * x[0]
|
||||
MUL x0, x1, acc1
|
||||
UMULH x0, x1, acc2
|
||||
|
||||
MUL x0, x2, t0
|
||||
ADDS t0, acc2, acc2
|
||||
UMULH x0, x2, acc3
|
||||
|
||||
MUL x0, x3, t0
|
||||
ADCS t0, acc3, acc3
|
||||
UMULH x0, x3, acc4
|
||||
ADC $0, acc4, acc4
|
||||
// x[2:] * x[1]
|
||||
MUL x1, x2, t0
|
||||
ADDS t0, acc3
|
||||
UMULH x1, x2, t1
|
||||
ADCS t1, acc4
|
||||
ADC $0, ZR, acc5
|
||||
|
||||
MUL x1, x3, t0
|
||||
ADDS t0, acc4
|
||||
UMULH x1, x3, t1
|
||||
ADC t1, acc5
|
||||
// x[3] * x[2]
|
||||
MUL x2, x3, t0
|
||||
ADDS t0, acc5
|
||||
UMULH x2, x3, acc6
|
||||
ADC $0, acc6
|
||||
|
||||
MOVD $0, acc7
|
||||
// *2
|
||||
ADDS acc1, acc1
|
||||
ADCS acc2, acc2
|
||||
ADCS acc3, acc3
|
||||
ADCS acc4, acc4
|
||||
ADCS acc5, acc5
|
||||
ADCS acc6, acc6
|
||||
ADC $0, acc7
|
||||
// Missing products
|
||||
MUL x0, x0, acc0
|
||||
UMULH x0, x0, t0
|
||||
ADDS t0, acc1, acc1
|
||||
|
||||
MUL x1, x1, t0
|
||||
ADCS t0, acc2, acc2
|
||||
UMULH x1, x1, t1
|
||||
ADCS t1, acc3, acc3
|
||||
|
||||
MUL x2, x2, t0
|
||||
ADCS t0, acc4, acc4
|
||||
UMULH x2, x2, t1
|
||||
ADCS t1, acc5, acc5
|
||||
|
||||
MUL x3, x3, t0
|
||||
ADCS t0, acc6, acc6
|
||||
UMULH x3, x3, t1
|
||||
ADC t1, acc7, acc7
|
||||
// First reduction step
|
||||
MUL acc0, hlp1, hlp0
|
||||
|
||||
MUL const0, hlp1, t0
|
||||
ADDS t0, acc0, acc0
|
||||
UMULH const0, hlp0, t1
|
||||
|
||||
MUL const1, hlp0, t0
|
||||
ADCS t0, acc1, acc1
|
||||
UMULH const1, hlp0, y0
|
||||
|
||||
MUL const2, hlp0, t0
|
||||
ADCS t0, acc2, acc2
|
||||
UMULH const2, hlp0, acc0
|
||||
|
||||
MUL const3, hlp0, t0
|
||||
ADCS t0, acc3, acc3
|
||||
|
||||
UMULH const3, hlp0, hlp0
|
||||
ADC $0, hlp0
|
||||
|
||||
ADDS t1, acc1, acc1
|
||||
ADCS y0, acc2, acc2
|
||||
ADCS acc0, acc3, acc3
|
||||
ADC $0, hlp0, acc0
|
||||
// Second reduction step
|
||||
MUL acc1, hlp1, hlp0
|
||||
|
||||
MUL const0, hlp1, t0
|
||||
ADDS t0, acc1, acc1
|
||||
UMULH const0, hlp0, t1
|
||||
|
||||
MUL const1, hlp0, t0
|
||||
ADCS t0, acc2, acc2
|
||||
UMULH const1, hlp0, y0
|
||||
|
||||
MUL const2, hlp0, t0
|
||||
ADCS t0, acc3, acc3
|
||||
UMULH const2, hlp0, acc1
|
||||
|
||||
MUL const3, hlp0, t0
|
||||
ADCS t0, acc0, acc0
|
||||
|
||||
UMULH const3, hlp0, hlp0
|
||||
ADC $0, hlp0
|
||||
|
||||
ADDS t1, acc2, acc2
|
||||
ADCS y0, acc3, acc3
|
||||
ADCS acc1, acc0, acc0
|
||||
ADC $0, hlp0, acc1
|
||||
// Third reduction step
|
||||
MUL acc2, hlp1, hlp0
|
||||
|
||||
MUL const0, hlp1, t0
|
||||
ADDS t0, acc2, acc2
|
||||
UMULH const0, hlp0, t1
|
||||
|
||||
MUL const1, hlp0, t0
|
||||
ADCS t0, acc3, acc3
|
||||
UMULH const1, hlp0, y0
|
||||
|
||||
MUL const2, hlp0, t0
|
||||
ADCS t0, acc0, acc0
|
||||
UMULH const2, hlp0, acc2
|
||||
|
||||
MUL const3, hlp0, t0
|
||||
ADCS t0, acc1, acc1
|
||||
|
||||
UMULH const3, hlp0, hlp0
|
||||
ADC $0, hlp0
|
||||
|
||||
ADDS t1, acc3, acc3
|
||||
ADCS y0, acc0, acc0
|
||||
ADCS acc2, acc1, acc1
|
||||
ADC $0, hlp0, acc2
|
||||
|
||||
// Last reduction step
|
||||
MUL acc3, hlp1, hlp0
|
||||
|
||||
MUL const0, hlp1, t0
|
||||
ADDS t0, acc3, acc3
|
||||
UMULH const0, hlp0, t1
|
||||
|
||||
MUL const1, hlp0, t0
|
||||
ADCS t0, acc0, acc0
|
||||
UMULH const1, hlp0, y0
|
||||
|
||||
MUL const2, hlp0, t0
|
||||
ADCS t0, acc1, acc1
|
||||
UMULH const2, hlp0, acc3
|
||||
|
||||
MUL const3, hlp0, t0
|
||||
ADCS t0, acc2, acc2
|
||||
|
||||
UMULH const3, hlp0, hlp0
|
||||
ADC $0, acc7
|
||||
|
||||
ADDS t1, acc0, acc0
|
||||
ADCS y0, acc1, acc1
|
||||
ADCS acc3, acc2, acc2
|
||||
ADC $0, hlp0, acc3
|
||||
|
||||
ADDS acc4, acc0, acc0
|
||||
ADCS acc5, acc1, acc1
|
||||
ADCS acc6, acc2, acc2
|
||||
ADCS acc7, acc3, acc3
|
||||
ADC $0, ZR, acc4
|
||||
|
||||
SUBS const0, acc0, y0
|
||||
SBCS const1, acc1, y1
|
||||
SBCS const2, acc2, y2
|
||||
SBCS const3, acc3, y3
|
||||
SBCS $0, acc4, acc4
|
||||
|
||||
CSEL CS, y0, acc0, x0
|
||||
CSEL CS, y1, acc1, x1
|
||||
CSEL CS, y2, acc2, x2
|
||||
CSEL CS, y3, acc3, x3
|
||||
|
||||
CBNZ b_ptr, ordSqrLoop
|
||||
|
||||
MOVD res+0(FP), res_ptr
|
||||
STP (x0, x1), 0*16(res_ptr)
|
||||
STP (x2, x3), 1*16(res_ptr)
|
||||
|
||||
RET
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// func gfpFromMont(res, in *gfP)
//
// Takes in out of the Montgomery domain: four word-wise Montgomery reduction
// steps (no multiplication by a second operand), followed by one conditional
// subtraction of the modulus read from ·p2.
//
// Register roles (see the #defines at the top of this file):
//   hlp1            aliases res_ptr (R0) and holds ·np, so res is only loaded
//                   right before the final store;
//   const0..const3  hold the four words of ·p2 (const2/const3 alias t2/t3);
//   acc0..acc3      hold the running value, rotating by one word per step;
//   hlp0            holds the per-step multiplier Y = acc_i * np.
//
// NOTE(review): each step computes t0 = p2[0]*np (not p2[0]*Y), exactly as
// gfpMul and gfpSqr in this file do. This presumably relies on
// np*p2[0] == -1 mod 2^64, so that only the carry out of the following ADDS
// matters — confirm against the definition of ·np.
TEXT ·gfpFromMont(SB),NOSPLIT,$0
	MOVD	in+8(FP), a_ptr

	MOVD	·np+0x00(SB), hlp1
	LDP	·p2+0x00(SB), (const0, const1)
	LDP	·p2+0x10(SB), (const2, const3)

	// The reduction below operates on acc0..acc3, so the input must be
	// loaded into those registers (not x0..x3).
	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)
	// Only reduce, no multiplications are needed
	// First reduction step
	MUL	acc0, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc0, acc0
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const2, hlp0, acc0

	MUL	const3, hlp0, t0
	ADCS	t0, acc3, acc3

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0		// fold the carry of the low-word chain into the top word

	ADDS	t1, acc1, acc1
	ADCS	y0, acc2, acc2
	ADCS	acc0, acc3, acc3
	ADC	$0, hlp0, acc0
	// Second reduction step
	MUL	acc1, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc1, acc1
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const2, hlp0, acc1

	MUL	const3, hlp0, t0
	ADCS	t0, acc0, acc0

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc2, acc2
	ADCS	y0, acc3, acc3
	ADCS	acc1, acc0, acc0
	ADC	$0, hlp0, acc1
	// Third reduction step
	MUL	acc2, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc2, acc2
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const2, hlp0, acc2

	MUL	const3, hlp0, t0
	ADCS	t0, acc1, acc1

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc3, acc3
	ADCS	y0, acc0, acc0
	ADCS	acc2, acc1, acc1
	ADC	$0, hlp0, acc2

	// Last reduction step
	MUL	acc3, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc3, acc3
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const2, hlp0, acc3

	MUL	const3, hlp0, t0
	ADCS	t0, acc2, acc2

	UMULH	const3, hlp0, hlp0
	// The carry must go into hlp0 as in the previous steps: acc7 is never
	// read in this function, so adding it there would drop the carry.
	ADC	$0, hlp0

	ADDS	t1, acc0, acc0
	ADCS	y0, acc1, acc1
	ADCS	acc3, acc2, acc2
	ADC	$0, hlp0, acc3

	// Conditional subtraction: y = acc - p2; keep y when the subtraction did
	// not borrow (carry set), otherwise keep acc.
	SUBS	const0, acc0, y0
	SBCS	const1, acc1, y1
	SBCS	const2, acc2, y2
	SBCS	const3, acc3, y3

	CSEL	CS, y0, acc0, x0
	CSEL	CS, y1, acc1, x1
	CSEL	CS, y2, acc2, x2
	CSEL	CS, y3, acc3, x3

	// R0 was used for ·np (hlp1), so load the destination pointer only now.
	MOVD	res+0(FP), res_ptr
	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)

	RET
|
||||
|
@ -23,3 +23,14 @@ func gfpSub(c, a, b *gfP)
|
||||
|
||||
//go:noescape
|
||||
func gfpMul(c, a, b *gfP)
|
||||
|
||||
// gfpSqr performs n successive Montgomery squarings of in and stores the result in res (n >= 1).
|
||||
//
|
||||
//go:noescape
|
||||
func gfpSqr(res, in *gfP, n int)
|
||||
|
||||
// gfpFromMont performs a Montgomery multiplication by the plain value 1, which is the Montgomery form of R⁻¹.
|
||||
// Sets res = in * R⁻¹, bringing res out of the Montgomery domain.
|
||||
//
|
||||
//go:noescape
|
||||
func gfpFromMont(res, in *gfP)
|
||||
|
@ -125,3 +125,14 @@ func gfpMul(c, a, b *gfP) {
|
||||
*c = gfP{T[4], T[5], T[6], T[7]}
|
||||
gfpCarry(c, carry)
|
||||
}
|
||||
|
||||
func gfpSqr(res, in *gfP, n int) {
|
||||
gfpMul(res, in, in)
|
||||
for i := 1; i < n; i++ {
|
||||
gfpMul(res, res, res)
|
||||
}
|
||||
}
|
||||
|
||||
func gfpFromMont(res, in *gfP) {
|
||||
gfpMul(res, in, &gfP{1})
|
||||
}
|
||||
|
@ -73,8 +73,8 @@ func (e *gfP) Invert(x *gfP) *gfP {
|
||||
var t19 = new(gfP)
|
||||
var t20 = new(gfP)
|
||||
|
||||
t17.Square(x)
|
||||
t15.Square(t17)
|
||||
t17.Square(x, 1)
|
||||
t15.Square(t17, 1)
|
||||
z.Mul(t17, t15)
|
||||
t2.Mul(t15, z)
|
||||
t14.Mul(x, t2)
|
||||
@ -100,122 +100,66 @@ func (e *gfP) Invert(x *gfP) *gfP {
|
||||
t17.Mul(z, t18)
|
||||
z.Mul(z, t17)
|
||||
t20.Mul(t13, z)
|
||||
for s := 0; s < 2; s++ {
|
||||
t20.Square(t20)
|
||||
}
|
||||
t20.Square(t20, 2)
|
||||
t20.Mul(x, t20)
|
||||
for s := 0; s < 33; s++ {
|
||||
t20.Square(t20)
|
||||
}
|
||||
t20.Square(t20, 33)
|
||||
t19.Mul(t19, t20)
|
||||
for s := 0; s < 8; s++ {
|
||||
t19.Square(t19)
|
||||
}
|
||||
t19.Square(t19, 8)
|
||||
t19.Mul(t12, t19)
|
||||
for s := 0; s < 9; s++ {
|
||||
t19.Square(t19)
|
||||
}
|
||||
t19.Square(t19, 9)
|
||||
t18.Mul(t18, t19)
|
||||
for s := 0; s < 10; s++ {
|
||||
t18.Square(t18)
|
||||
}
|
||||
t18.Square(t18, 10)
|
||||
t18.Mul(t17, t18)
|
||||
t18.Square(t18)
|
||||
t18.Square(t18, 1)
|
||||
t18.Mul(x, t18)
|
||||
for s := 0; s < 14; s++ {
|
||||
t18.Square(t18)
|
||||
}
|
||||
t18.Square(t18, 14)
|
||||
t17.Mul(t17, t18)
|
||||
for s := 0; s < 5; s++ {
|
||||
t17.Square(t17)
|
||||
}
|
||||
t17.Square(t17, 5)
|
||||
t16.Mul(t16, t17)
|
||||
for s := 0; s < 9; s++ {
|
||||
t16.Square(t16)
|
||||
}
|
||||
t16.Square(t16, 9)
|
||||
t16.Mul(z, t16)
|
||||
t15.Mul(t15, t16)
|
||||
t15.Square(t15)
|
||||
t15.Square(t15, 1)
|
||||
t15.Mul(x, t15)
|
||||
for s := 0; s < 5; s++ {
|
||||
t15.Square(t15)
|
||||
}
|
||||
t15.Square(t15, 5)
|
||||
t14.Mul(t14, t15)
|
||||
for s := 0; s < 9; s++ {
|
||||
t14.Square(t14)
|
||||
}
|
||||
t14.Square(t14, 9)
|
||||
t13.Mul(t13, t14)
|
||||
for s := 0; s < 8; s++ {
|
||||
t13.Square(t13)
|
||||
}
|
||||
t13.Square(t13, 8)
|
||||
t12.Mul(t12, t13)
|
||||
for s := 0; s < 9; s++ {
|
||||
t12.Square(t12)
|
||||
}
|
||||
t12.Square(t12, 9)
|
||||
t12.Mul(t11, t12)
|
||||
for s := 0; s < 9; s++ {
|
||||
t12.Square(t12)
|
||||
}
|
||||
t12.Square(t12, 9)
|
||||
t12.Mul(t5, t12)
|
||||
for s := 0; s < 8; s++ {
|
||||
t12.Square(t12)
|
||||
}
|
||||
t12.Square(t12, 8)
|
||||
t11.Mul(t11, t12)
|
||||
for s := 0; s < 9; s++ {
|
||||
t11.Square(t11)
|
||||
}
|
||||
t11.Square(t11, 9)
|
||||
t10.Mul(t10, t11)
|
||||
for s := 0; s < 8; s++ {
|
||||
t10.Square(t10)
|
||||
}
|
||||
t10.Square(t10, 8)
|
||||
t10.Mul(t2, t10)
|
||||
for s := 0; s < 8; s++ {
|
||||
t10.Square(t10)
|
||||
}
|
||||
t10.Square(t10, 8)
|
||||
t10.Mul(t3, t10)
|
||||
for s := 0; s < 8; s++ {
|
||||
t10.Square(t10)
|
||||
}
|
||||
t10.Square(t10, 8)
|
||||
t9.Mul(t9, t10)
|
||||
for s := 0; s < 7; s++ {
|
||||
t9.Square(t9)
|
||||
}
|
||||
t9.Square(t9, 7)
|
||||
t8.Mul(t8, t9)
|
||||
for s := 0; s < 7; s++ {
|
||||
t8.Square(t8)
|
||||
}
|
||||
t8.Square(t8, 7)
|
||||
t7.Mul(t7, t8)
|
||||
for s := 0; s < 8; s++ {
|
||||
t7.Square(t7)
|
||||
}
|
||||
t7.Square(t7, 8)
|
||||
t6.Mul(t6, t7)
|
||||
for s := 0; s < 6; s++ {
|
||||
t6.Square(t6)
|
||||
}
|
||||
t6.Square(t6, 6)
|
||||
t5.Mul(t5, t6)
|
||||
for s := 0; s < 7; s++ {
|
||||
t5.Square(t5)
|
||||
}
|
||||
t5.Square(t5, 7)
|
||||
t4.Mul(t4, t5)
|
||||
for s := 0; s < 9; s++ {
|
||||
t4.Square(t4)
|
||||
}
|
||||
t4.Square(t4, 9)
|
||||
t3.Mul(t3, t4)
|
||||
for s := 0; s < 7; s++ {
|
||||
t3.Square(t3)
|
||||
}
|
||||
t3.Square(t3, 7)
|
||||
t2.Mul(t2, t3)
|
||||
for s := 0; s < 8; s++ {
|
||||
t2.Square(t2)
|
||||
}
|
||||
t2.Square(t2, 8)
|
||||
t1.Mul(t1, t2)
|
||||
for s := 0; s < 8; s++ {
|
||||
t1.Square(t1)
|
||||
}
|
||||
t1.Square(t1, 8)
|
||||
t0.Mul(t0, t1)
|
||||
for s := 0; s < 8; s++ {
|
||||
t0.Square(t0)
|
||||
}
|
||||
t0.Square(t0, 8)
|
||||
z.Mul(z, t0)
|
||||
return e.Set(z)
|
||||
}
|
||||
@ -231,7 +175,7 @@ func Sqrt(e, x *gfP) (isSquare bool) {
|
||||
gfpMul(i, i, b) // i=2(fb)b
|
||||
gfpSub(i, i, one) // i=2(fb)b-1
|
||||
gfpMul(i, candidate, i) // i=(fb)(2(fb)b-1)
|
||||
square := new(gfP).Square(i)
|
||||
square := new(gfP).Square(i, 1)
|
||||
if square.Equal(x) != 1 {
|
||||
return false
|
||||
}
|
||||
@ -306,8 +250,8 @@ func sqrtCandidate(z, x *gfP) {
|
||||
var t18 = new(gfP)
|
||||
var t19 = new(gfP)
|
||||
|
||||
t18.Square(x)
|
||||
t8.Square(t18)
|
||||
t18.Square(x, 1)
|
||||
t8.Square(t18, 1)
|
||||
t16.Mul(t18, t8)
|
||||
t2.Mul(t8, t16)
|
||||
t14.Mul(x, t2)
|
||||
@ -332,120 +276,64 @@ func sqrtCandidate(z, x *gfP) {
|
||||
t8.Mul(t8, t10)
|
||||
t18.Mul(t18, t8)
|
||||
t16.Mul(t16, t18)
|
||||
for s := 0; s < 3; s++ {
|
||||
t19.Square(t19)
|
||||
}
|
||||
t19.Square(t19, 3)
|
||||
t19.Mul(x, t19)
|
||||
for s := 0; s < 33; s++ {
|
||||
t19.Square(t19)
|
||||
}
|
||||
t19.Square(t19, 33)
|
||||
t19.Mul(t15, t19)
|
||||
for s := 0; s < 8; s++ {
|
||||
t19.Square(t19)
|
||||
}
|
||||
t19.Square(t19, 8)
|
||||
t19.Mul(t12, t19)
|
||||
for s := 0; s < 9; s++ {
|
||||
t19.Square(t19)
|
||||
}
|
||||
t19.Square(t19, 9)
|
||||
t18.Mul(t18, t19)
|
||||
for s := 0; s < 10; s++ {
|
||||
t18.Square(t18)
|
||||
}
|
||||
t18.Square(t18, 10)
|
||||
t18.Mul(t16, t18)
|
||||
t18.Square(t18)
|
||||
t18.Square(t18, 1)
|
||||
t18.Mul(x, t18)
|
||||
for s := 0; s < 14; s++ {
|
||||
t18.Square(t18)
|
||||
}
|
||||
t18.Square(t18, 14)
|
||||
t18.Mul(t16, t18)
|
||||
for s := 0; s < 5; s++ {
|
||||
t18.Square(t18)
|
||||
}
|
||||
t18.Square(t18, 5)
|
||||
t17.Mul(t17, t18)
|
||||
for s := 0; s < 9; s++ {
|
||||
t17.Square(t17)
|
||||
}
|
||||
t17.Square(t17, 9)
|
||||
t16.Mul(t16, t17)
|
||||
t16.Square(t16)
|
||||
t16.Square(t16, 1)
|
||||
t15.Mul(t15, t16)
|
||||
for s := 0; s < 5; s++ {
|
||||
t15.Square(t15)
|
||||
}
|
||||
t15.Square(t15, 5)
|
||||
t14.Mul(t14, t15)
|
||||
for s := 0; s < 9; s++ {
|
||||
t14.Square(t14)
|
||||
}
|
||||
t14.Square(t14, 9)
|
||||
t13.Mul(t13, t14)
|
||||
for s := 0; s < 8; s++ {
|
||||
t13.Square(t13)
|
||||
}
|
||||
t13.Square(t13, 8)
|
||||
t12.Mul(t12, t13)
|
||||
for s := 0; s < 9; s++ {
|
||||
t12.Square(t12)
|
||||
}
|
||||
t12.Square(t12, 9)
|
||||
t12.Mul(t11, t12)
|
||||
for s := 0; s < 9; s++ {
|
||||
t12.Square(t12)
|
||||
}
|
||||
t12.Square(t12, 9)
|
||||
t12.Mul(t5, t12)
|
||||
for s := 0; s < 8; s++ {
|
||||
t12.Square(t12)
|
||||
}
|
||||
t12.Square(t12, 8)
|
||||
t11.Mul(t11, t12)
|
||||
for s := 0; s < 9; s++ {
|
||||
t11.Square(t11)
|
||||
}
|
||||
t11.Square(t11, 9)
|
||||
t10.Mul(t10, t11)
|
||||
for s := 0; s < 8; s++ {
|
||||
t10.Square(t10)
|
||||
}
|
||||
t10.Square(t10, 8)
|
||||
t10.Mul(t2, t10)
|
||||
for s := 0; s < 8; s++ {
|
||||
t10.Square(t10)
|
||||
}
|
||||
t10.Square(t10, 8)
|
||||
t10.Mul(t3, t10)
|
||||
for s := 0; s < 8; s++ {
|
||||
t10.Square(t10)
|
||||
}
|
||||
t10.Square(t10, 8)
|
||||
t9.Mul(t9, t10)
|
||||
for s := 0; s < 7; s++ {
|
||||
t9.Square(t9)
|
||||
}
|
||||
t9.Square(t9, 7)
|
||||
t8.Mul(t8, t9)
|
||||
for s := 0; s < 7; s++ {
|
||||
t8.Square(t8)
|
||||
}
|
||||
t8.Square(t8, 7)
|
||||
t7.Mul(t7, t8)
|
||||
for s := 0; s < 8; s++ {
|
||||
t7.Square(t7)
|
||||
}
|
||||
t7.Square(t7, 8)
|
||||
t6.Mul(t6, t7)
|
||||
for s := 0; s < 6; s++ {
|
||||
t6.Square(t6)
|
||||
}
|
||||
t6.Square(t6, 6)
|
||||
t5.Mul(t5, t6)
|
||||
for s := 0; s < 7; s++ {
|
||||
t5.Square(t5)
|
||||
}
|
||||
t5.Square(t5, 7)
|
||||
t4.Mul(t4, t5)
|
||||
for s := 0; s < 9; s++ {
|
||||
t4.Square(t4)
|
||||
}
|
||||
t4.Square(t4, 9)
|
||||
t3.Mul(t3, t4)
|
||||
for s := 0; s < 7; s++ {
|
||||
t3.Square(t3)
|
||||
}
|
||||
t3.Square(t3, 7)
|
||||
t2.Mul(t2, t3)
|
||||
for s := 0; s < 8; s++ {
|
||||
t2.Square(t2)
|
||||
}
|
||||
t2.Square(t2, 8)
|
||||
t1.Mul(t1, t2)
|
||||
for s := 0; s < 8; s++ {
|
||||
t1.Square(t1)
|
||||
}
|
||||
t1.Square(t1, 8)
|
||||
t0.Mul(t0, t1)
|
||||
for s := 0; s < 5; s++ {
|
||||
t0.Square(t0)
|
||||
}
|
||||
t0.Square(t0, 5)
|
||||
z.Mul(z, t0)
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user