gmsm/sm9/bn256/gfp_amd64.s

1174 lines
18 KiB
ArmAsm
Raw Normal View History

//go:build amd64 && !purego
// +build amd64,!purego
2022-06-07 17:13:23 +08:00
#include "textflag.h"
#define res_ptr DI
#define x_ptr SI
#define y_ptr CX
#define acc0 R8
#define acc1 R9
#define acc2 R10
#define acc3 R11
#define acc4 R12
#define acc5 R13
#define t0 R14
#define t1 R15
2022-06-07 17:13:23 +08:00
#define storeBlock(a0,a1,a2,a3, r) \
MOVQ a0, 0+r \
MOVQ a1, 8+r \
MOVQ a2, 16+r \
MOVQ a3, 24+r
#define loadBlock(r, a0,a1,a2,a3) \
MOVQ 0+r, a0 \
MOVQ 8+r, a1 \
MOVQ 16+r, a2 \
MOVQ 24+r, a3
#define gfpCarry(a0,a1,a2,a3, b0,b1,b2,b3,b4) \
2022-06-07 17:13:23 +08:00
\ // b = a-p
MOVQ a0, b0 \
MOVQ a1, b1 \
MOVQ a2, b2 \
MOVQ a3, b3 \
\
SUBQ ·p2+0(SB), b0 \
SBBQ ·p2+8(SB), b1 \
SBBQ ·p2+16(SB), b2 \
SBBQ ·p2+24(SB), b3 \
SBBQ $0, b4 \
\
\ // if b is negative then return a
\ // else return b
CMOVQCC b0, a0 \
CMOVQCC b1, a1 \
CMOVQCC b2, a2 \
CMOVQCC b3, a3
#define gfpCarryWithoutCarry(a0,a1,a2,a3, b0,b1,b2,b3) \
\ // b = a-p
MOVQ a0, b0 \
MOVQ a1, b1 \
MOVQ a2, b2 \
MOVQ a3, b3 \
\
SUBQ ·p2+0(SB), b0 \
SBBQ ·p2+8(SB), b1 \
SBBQ ·p2+16(SB), b2 \
SBBQ ·p2+24(SB), b3 \
\
\ // if b is negative then return a
\ // else return b
CMOVQCC b0, a0 \
CMOVQCC b1, a1 \
CMOVQCC b2, a2 \
CMOVQCC b3, a3
2022-06-07 17:13:23 +08:00
TEXT ·gfpNeg(SB),0,$0-16
MOVQ ·p2+0(SB), R8
MOVQ ·p2+8(SB), R9
MOVQ ·p2+16(SB), R10
MOVQ ·p2+24(SB), R11
MOVQ a+8(FP), DI
SUBQ 0(DI), R8
SBBQ 8(DI), R9
SBBQ 16(DI), R10
SBBQ 24(DI), R11
gfpCarryWithoutCarry(R8,R9,R10,R11, R12,R13,R14,CX)
2022-06-07 17:13:23 +08:00
MOVQ c+0(FP), DI
storeBlock(R8,R9,R10,R11, 0(DI))
RET
TEXT ·gfpAdd(SB),0,$0-24
MOVQ a+8(FP), DI
MOVQ b+16(FP), SI
loadBlock(0(DI), R8,R9,R10,R11)
MOVQ $0, R12
ADDQ 0(SI), R8
ADCQ 8(SI), R9
ADCQ 16(SI), R10
ADCQ 24(SI), R11
ADCQ $0, R12
gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12)
2022-06-07 17:13:23 +08:00
MOVQ c+0(FP), DI
storeBlock(R8,R9,R10,R11, 0(DI))
RET
TEXT ·gfpSub(SB),0,$0-24
MOVQ a+8(FP), DI
MOVQ b+16(FP), SI
loadBlock(0(DI), R8,R9,R10,R11)
MOVQ ·p2+0(SB), R12
MOVQ ·p2+8(SB), R13
MOVQ ·p2+16(SB), R14
MOVQ ·p2+24(SB), CX
MOVQ $0, AX
SUBQ 0(SI), R8
SBBQ 8(SI), R9
SBBQ 16(SI), R10
SBBQ 24(SI), R11
CMOVQCC AX, R12
CMOVQCC AX, R13
CMOVQCC AX, R14
CMOVQCC AX, CX
ADDQ R12, R8
ADCQ R13, R9
ADCQ R14, R10
ADCQ CX, R11
MOVQ c+0(FP), DI
storeBlock(R8,R9,R10,R11, 0(DI))
RET
TEXT ·gfpMul(SB),0,$0-24
MOVQ res+0(FP), res_ptr
MOVQ in1+8(FP), x_ptr
MOVQ in2+16(FP), y_ptr
CMPB ·hasBMI2(SB), $0
JE nobmi2Mul
// x * y[0]
MOVQ (8*0)(y_ptr), DX
MULXQ (8*0)(x_ptr), acc0, acc1
MULXQ (8*1)(x_ptr), AX, acc2
ADDQ AX, acc1
ADCQ $0, acc2
MULXQ (8*2)(x_ptr), AX, acc3
ADDQ AX, acc2
ADCQ $0, acc3
MULXQ (8*3)(x_ptr), AX, acc4
ADDQ AX, acc3
ADCQ $0, acc4
XORQ acc5, acc5
// First reduction step
MOVQ acc0, DX
MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1
ADDQ AX, acc0
ADCQ t1, acc1
MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc1
ADCQ t1, acc2
MULXQ ·p2+0x10(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc2
ADCQ t1, acc3
MULXQ ·p2+0x18(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc3
ADCQ t1, acc4
ADCQ $0, acc5
// x * y[1]
MOVQ (8*1)(y_ptr), DX
MULXQ (8*0)(x_ptr), AX, t1
ADDQ AX, acc1
ADCQ t1, acc2
MULXQ (8*1)(x_ptr), AX, t1
ADCQ $0, t1
ADDQ AX, acc2
ADCQ t1, acc3
MULXQ (8*2)(x_ptr), AX, t1
ADCQ $0, t1
ADDQ AX, acc3
ADCQ t1, acc4
MULXQ (8*3)(x_ptr), AX, t1
ADCQ $0, t1
ADDQ AX, acc4
ADCQ t1, acc5
ADCQ $0, acc0
// Second reduction step
MOVQ acc1, DX
MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1
ADDQ AX, acc1
ADCQ t1, acc2
MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc2
ADCQ t1, acc3
MULXQ ·p2+0x10(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc3
ADCQ t1, acc4
MULXQ ·p2+0x18(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc4
ADCQ t1, acc5
ADCQ $0, acc0
// x * y[2]
MOVQ (8*2)(y_ptr), DX
MULXQ (8*0)(x_ptr), AX, t1
ADDQ AX, acc2
ADCQ t1, acc3
MULXQ (8*1)(x_ptr), AX, t1
ADCQ $0, t1
ADDQ AX, acc3
ADCQ t1, acc4
MULXQ (8*2)(x_ptr), AX, t1
ADCQ $0, t1
ADDQ AX, acc4
ADCQ t1, acc5
MULXQ (8*3)(x_ptr), AX, t1
ADCQ $0, t1
ADDQ AX, acc5
ADCQ t1, acc0
ADCQ $0, acc1
// Third reduction step
MOVQ acc2, DX
MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1
ADDQ AX, acc2
ADCQ t1, acc3
MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc3
ADCQ t1, acc4
MULXQ ·p2+0x10(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc4
ADCQ t1, acc5
MULXQ ·p2+0x18(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc5
ADCQ t1, acc0
ADCQ $0, acc1
// x * y[3]
MOVQ (8*3)(y_ptr), DX
MULXQ (8*0)(x_ptr), AX, t1
ADDQ AX, acc3
ADCQ t1, acc4
MULXQ (8*1)(x_ptr), AX, t1
ADCQ $0, t1
ADDQ AX, acc4
ADCQ t1, acc5
MULXQ (8*2)(x_ptr), AX, t1
ADCQ $0, t1
ADDQ AX, acc5
ADCQ t1, acc0
MULXQ (8*3)(x_ptr), AX, t1
ADCQ $0, t1
ADDQ AX, acc0
ADCQ t1, acc1
ADCQ $0, acc2
// Last reduction step
MOVQ acc3, DX
MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1
ADDQ AX, acc3
ADCQ t1, acc4
MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc4
ADCQ t1, acc5
MULXQ ·p2+0x10(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc5
ADCQ t1, acc0
MULXQ ·p2+0x18(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc0
ADCQ t1, acc1
ADCQ $0, acc2
// Copy result [255:0]
gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,t1,acc2)
storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr))
RET
nobmi2Mul:
// x * y[0]
MOVQ (8*0)(y_ptr), t0
2022-06-07 17:13:23 +08:00
MOVQ (8*0)(x_ptr), AX
MULQ t0
MOVQ AX, acc0
MOVQ DX, acc1
2022-06-07 17:13:23 +08:00
MOVQ (8*1)(x_ptr), AX
MULQ t0
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, acc2
2022-06-07 17:13:23 +08:00
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, acc3
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, acc4
XORQ acc5, acc5
// First reduction step
MOVQ acc0, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc0
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ DX, acc4
ADCQ $0, acc5
// x * y[1]
MOVQ (8*1)(y_ptr), t0
MOVQ (8*0)(x_ptr), AX
MULQ t0
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*1)(x_ptr), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ DX, acc5
ADCQ $0, acc0
// Second reduction step
MOVQ acc1, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ DX, acc5
ADCQ $0, acc0
// x * y[2]
MOVQ (8*2)(y_ptr), t0
MOVQ (8*0)(x_ptr), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*1)(x_ptr), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ DX, acc0
ADCQ $0, acc1
// Third reduction step
MOVQ acc2, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ DX, acc0
ADCQ $0, acc1
// x * y[3]
MOVQ (8*3)(y_ptr), t0
MOVQ (8*0)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*1)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
ADCQ $0, acc2
// Last reduction step
MOVQ acc3, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
ADCQ $0, acc2
// Copy result [255:0]
gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,t1,acc2)
storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr))
RET
// func gfpSqr(res, in *gfP, n int)
TEXT ·gfpSqr(SB),NOSPLIT,$0
MOVQ res+0(FP), res_ptr
MOVQ in+8(FP), x_ptr
MOVQ n+16(FP), BX
CMPB ·hasBMI2(SB), $0
JE gfpSqrLoop
gfpSqrLoopBMI2:
// y[1:] * y[0]
MOVQ (8*0)(x_ptr), DX
MULXQ (8*1)(x_ptr), acc1, acc2
MULXQ (8*2)(x_ptr), AX, acc3
ADDQ AX, acc2
ADCQ $0, acc3
MULXQ (8*3)(x_ptr), AX, acc4
ADDQ AX, acc3
ADCQ $0, acc4
// y[2:] * y[1]
MOVQ (8*1)(x_ptr), DX
MULXQ (8*2)(x_ptr), AX, t1
ADDQ AX, acc3
ADCQ t1, acc4
MULXQ (8*3)(x_ptr), AX, acc5
ADCQ $0, acc5
ADDQ AX, acc4
ADCQ $0, acc5
// y[3] * y[2]
MOVQ (8*2)(x_ptr), DX
MULXQ (8*3)(x_ptr), AX, y_ptr
ADDQ AX, acc5
ADCQ $0, y_ptr
XORQ t1, t1
// *2
ADDQ acc1, acc1
ADCQ acc2, acc2
ADCQ acc3, acc3
ADCQ acc4, acc4
ADCQ acc5, acc5
ADCQ y_ptr, y_ptr
ADCQ $0, t1
// Missing products
MOVQ (8*0)(x_ptr), DX
MULXQ DX, acc0, t0
ADDQ t0, acc1
MOVQ (8*1)(x_ptr), DX
MULXQ DX, AX, t0
ADCQ AX, acc2
ADCQ t0, acc3
MOVQ (8*2)(x_ptr), DX
MULXQ DX, AX, t0
ADCQ AX, acc4
ADCQ t0, acc5
MOVQ (8*3)(x_ptr), DX
MULXQ DX, AX, x_ptr
ADCQ AX, y_ptr
ADCQ t1, x_ptr
// First reduction step
MOVQ acc0, DX
MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1
ADDQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0
ADCQ t1, acc1
MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc1
ADCQ t1, acc2
MULXQ ·p2+0x10(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc2
ADCQ t1, acc3
MULXQ ·p2+0x18(SB), AX, acc0
ADCQ $0, acc0
ADDQ AX, acc3
ADCQ $0, acc0
// Second reduction step
MOVQ acc1, DX
MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1
ADDQ AX, acc1
ADCQ t1, acc2
MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc2
ADCQ t1, acc3
MULXQ ·p2+0x10(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc3
ADCQ t1, acc0
MULXQ ·p2+0x18(SB), AX, acc1
ADCQ $0, acc1
ADDQ AX, acc0
ADCQ $0, acc1
// Third reduction step
MOVQ acc2, DX
MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1
ADDQ AX, acc2
ADCQ t1, acc3
MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc3
ADCQ t1, acc0
MULXQ ·p2+0x10(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc0
ADCQ t1, acc1
MULXQ ·p2+0x18(SB), AX, acc2
ADCQ $0, acc2
ADDQ AX, acc1
ADCQ $0, acc2
// Last reduction step
MOVQ acc3, DX
MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1
ADDQ AX, acc3
ADCQ t1, acc0
MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc0
ADCQ t1, acc1
MULXQ ·p2+0x10(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc1
ADCQ t1, acc2
MULXQ ·p2+0x18(SB), AX, acc3
ADCQ $0, acc3
ADDQ AX, acc2
ADCQ $0, acc3
XORQ t0, t0
// Add bits [511:256] of the sqr result
ADDQ acc4, acc0
ADCQ acc5, acc1
ADCQ y_ptr, acc2
ADCQ x_ptr, acc3
ADCQ $0, t0
gfpCarry(acc0,acc1,acc2,acc3, acc4,acc5,y_ptr,t1,t0)
storeBlock(acc0,acc1,acc2,acc3, 0(res_ptr))
MOVQ res_ptr, x_ptr
DECQ BX
JNE gfpSqrLoopBMI2
RET
gfpSqrLoop:
// y[1:] * y[0]
MOVQ (8*0)(x_ptr), t0
MOVQ (8*1)(x_ptr), AX
MULQ t0
MOVQ AX, acc1
MOVQ DX, acc2
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, acc3
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, acc4
// y[2:] * y[1]
MOVQ (8*1)(x_ptr), t0
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, acc5
// y[3] * y[2]
MOVQ (8*2)(x_ptr), t0
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, y_ptr
XORQ t1, t1
// *2
ADDQ acc1, acc1
ADCQ acc2, acc2
ADCQ acc3, acc3
ADCQ acc4, acc4
ADCQ acc5, acc5
ADCQ y_ptr, y_ptr
ADCQ $0, t1
// Missing products
MOVQ (8*0)(x_ptr), AX
MULQ AX
MOVQ AX, acc0
MOVQ DX, t0
MOVQ (8*1)(x_ptr), AX
MULQ AX
ADDQ t0, acc1
ADCQ AX, acc2
ADCQ $0, DX
MOVQ DX, t0
MOVQ (8*2)(x_ptr), AX
MULQ AX
ADDQ t0, acc3
ADCQ AX, acc4
ADCQ $0, DX
MOVQ DX, t0
MOVQ (8*3)(x_ptr), AX
MULQ AX
ADDQ t0, acc5
ADCQ AX, y_ptr
ADCQ DX, t1
MOVQ t1, x_ptr
// T = [acc0, acc1, acc2, acc3, acc4, acc5, y_ptr, x_ptr]
// First reduction step
MOVQ acc0, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc0 // acc0 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
XORQ acc0, acc0
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ DX, acc0
// Second reduction step
MOVQ acc1, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc1 // acc1 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
XORQ acc1, acc1
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
// Third reduction step
MOVQ acc2, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc2 // acc2 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
XORQ acc2, acc2
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ DX, acc2
// Last reduction step
MOVQ acc3, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc3 // acc3 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
XORQ acc3, acc3
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ DX, acc3
XORQ t0, t0
// Add bits [511:256] of the sqr result
ADDQ acc4, acc0
ADCQ acc5, acc1
ADCQ y_ptr, acc2
ADCQ x_ptr, acc3
ADCQ $0, t0
gfpCarry(acc0,acc1,acc2,acc3, acc4,acc5,y_ptr,t1,t0)
storeBlock(acc0,acc1,acc2,acc3, 0(res_ptr))
MOVQ res_ptr, x_ptr
DECQ BX
JNE gfpSqrLoop
RET
/* ---------------------------------------*/
// func gfpFromMont(res, in *gfP)
TEXT ·gfpFromMont(SB),NOSPLIT,$0
MOVQ res+0(FP), res_ptr
MOVQ in+8(FP), x_ptr
MOVQ (8*0)(x_ptr), acc0
MOVQ (8*1)(x_ptr), acc1
MOVQ (8*2)(x_ptr), acc2
MOVQ (8*3)(x_ptr), acc3
XORQ acc4, acc4
// Only reduce, no multiplications are needed
// First reduction step
MOVQ acc0, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc0 // acc0 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
XORQ acc0, acc0
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ DX, acc4
XORQ acc5, acc5
// Second reduction step
MOVQ acc1, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc1 // acc1 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
XORQ acc1, acc1
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ DX, acc5
// Third reduction step
MOVQ acc2, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc2 // acc2 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ DX, acc0
// Last reduction step
MOVQ acc3, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc3 // acc3 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
XORQ acc3, acc3
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
gfpCarryWithoutCarry(acc4, acc5, acc0, acc1, x_ptr, acc3, t0, t1)
storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr))
2022-06-07 17:13:23 +08:00
RET