gmsm/sm9/bn256/gfp_amd64.s
2023-06-22 18:54:09 +08:00

1246 lines
19 KiB
ArmAsm

//go:build amd64 && !purego
// +build amd64,!purego
#include "textflag.h"
// Register aliases used by gfpMul, gfpSqr and gfpFromMont below.
//
// NOTE(review): t1 aliases R15, and those routines read t1 after
// referencing globals such as ·p2(SB). As far as I know the Go toolchain
// uses R15 as a temporary for global accesses under -dynlink, so this file
// may not assemble for plugin/shared builds -- confirm whether that matters.

// Pointer to the 4-limb result.
#define res_ptr DI
// Pointer to the first operand; gfpMul, gfpSqr and gfpFromMont reuse it as
// scratch once the operand is no longer needed.
#define x_ptr SI
// Pointer to the second operand in gfpMul; gfpSqr has no second operand and
// uses it as an extra limb.
#define y_ptr CX
// Rotating accumulator limbs for the multi-precision product/reduction.
#define acc0 R8
#define acc1 R9
#define acc2 R10
#define acc3 R11
#define acc4 R12
#define acc5 R13
// General temporaries (multiplier word, carry word, saved copies).
#define t0 R14
#define t1 R15
// storeBlock writes the four 64-bit words w0..w3 to four consecutive
// 8-byte slots starting at the memory operand dst (for example 0(DI)).
// w0 lands at the lowest address. No flags are modified.
#define storeBlock(w0,w1,w2,w3, dst) \
	MOVQ w0, 0+dst \
	MOVQ w1, 8+dst \
	MOVQ w2, 16+dst \
	MOVQ w3, 24+dst
// loadBlock reads four consecutive 64-bit words starting at the memory
// operand src (for example 0(DI)) into w0..w3; w0 receives the word at the
// lowest address. No flags are modified.
#define loadBlock(src, w0,w1,w2,w3) \
	MOVQ 0+src, w0 \
	MOVQ 8+src, w1 \
	MOVQ 16+src, w2 \
	MOVQ 24+src, w3
// gfpCarry performs one conditional subtraction of the modulus ·p2.
// v0..v3 hold a 4-limb value (v0 least significant) and v4 its overflow
// limb; d0..d4 are scratch registers that get overwritten.
// On exit v0..v3 hold v-p when that subtraction did not borrow, and are
// left unchanged otherwise. v4 is never written back. Flags are clobbered.
#define gfpCarry(v0,v1,v2,v3,v4, d0,d1,d2,d3,d4) \
	\ // d = v (all five limbs)
	MOVQ v0, d0 \
	MOVQ v1, d1 \
	MOVQ v2, d2 \
	MOVQ v3, d3 \
	MOVQ v4, d4 \
	\
	\ // d = d - p, with the borrow rippling into the overflow limb
	SUBQ ·p2+0(SB), d0 \
	SBBQ ·p2+8(SB), d1 \
	SBBQ ·p2+16(SB), d2 \
	SBBQ ·p2+24(SB), d3 \
	SBBQ $0, d4 \
	\
	\ // carry clear means no borrow, i.e. v >= p: take the difference
	\ // carry set means v < p: keep v as it is
	CMOVQCC d0, v0 \
	CMOVQCC d1, v1 \
	CMOVQCC d2, v2 \
	CMOVQCC d3, v3
// gfpNeg stores the negation of the 4-limb value at a into c
// (arguments: c at 0(FP), a at 8(FP), both pointers).
// It computes p - a and then lets gfpCarry subtract p once more when the
// difference is still >= p, which maps a == 0 to 0 instead of p.
// The borrow out of p - a is ignored.
TEXT ·gfpNeg(SB),0,$0-16
	MOVQ c+0(FP), DI
	MOVQ a+8(FP), SI

	// Overflow limb handed to gfpCarry; it is always zero here.
	XORQ AX, AX

	// R11:R10:R9:R8 = p - a
	MOVQ ·p2+0(SB), R8
	MOVQ ·p2+8(SB), R9
	MOVQ ·p2+16(SB), R10
	MOVQ ·p2+24(SB), R11
	SUBQ 0(SI), R8
	SBBQ 8(SI), R9
	SBBQ 16(SI), R10
	SBBQ 24(SI), R11

	gfpCarry(R8,R9,R10,R11,AX, R12,R13,R14,BX,CX)

	MOVQ R8, 0(DI)
	MOVQ R9, 8(DI)
	MOVQ R10, 16(DI)
	MOVQ R11, 24(DI)
	RET
// gfpAdd stores the sum of the 4-limb values at a and b, reduced by at most
// one subtraction of p, into c
// (arguments: c at 0(FP), a at 8(FP), b at 16(FP), all pointers).
TEXT ·gfpAdd(SB),0,$0-24
	MOVQ c+0(FP), DI
	MOVQ a+8(FP), SI
	MOVQ b+16(FP), DX

	// Fifth limb that collects the carry out of the 256-bit addition.
	// Cleared before the ADD chain so the XOR cannot disturb its carry.
	XORQ R12, R12

	// R12:R11:R10:R9:R8 = b + a
	MOVQ 0(DX), R8
	MOVQ 8(DX), R9
	MOVQ 16(DX), R10
	MOVQ 24(DX), R11
	ADDQ 0(SI), R8
	ADCQ 8(SI), R9
	ADCQ 16(SI), R10
	ADCQ 24(SI), R11
	ADCQ $0, R12

	// Subtract p once if the 5-limb sum is >= p.
	gfpCarry(R8,R9,R10,R11,R12, R13,R14,AX,BX,CX)

	MOVQ R8, 0(DI)
	MOVQ R9, 8(DI)
	MOVQ R10, 16(DI)
	MOVQ R11, 24(DI)
	RET
// gfpSub stores a - b into c, adding p back once when the raw 256-bit
// subtraction borrows
// (arguments: c at 0(FP), a at 8(FP), b at 16(FP), all pointers).
// The correction is selected with conditional moves rather than a branch.
TEXT ·gfpSub(SB),0,$0-24
	MOVQ c+0(FP), DI
	MOVQ a+8(FP), SI
	MOVQ b+16(FP), DX

	// Correction term, zero by default. Cleared before the subtraction so
	// the XORs cannot disturb the borrow flag it produces.
	XORQ R12, R12
	XORQ R13, R13
	XORQ R14, R14
	XORQ CX, CX

	// R11:R10:R9:R8 = a - b, CF = borrow
	MOVQ 0(SI), R8
	MOVQ 8(SI), R9
	MOVQ 16(SI), R10
	MOVQ 24(SI), R11
	SUBQ 0(DX), R8
	SBBQ 8(DX), R9
	SBBQ 16(DX), R10
	SBBQ 24(DX), R11

	// On borrow the correction term becomes p (MOV/CMOV leave CF intact).
	CMOVQCS ·p2+0(SB), R12
	CMOVQCS ·p2+8(SB), R13
	CMOVQCS ·p2+16(SB), R14
	CMOVQCS ·p2+24(SB), CX

	// Result = (a - b) + correction; the carry out of this addition is dropped.
	ADDQ R12, R8
	ADCQ R13, R9
	ADCQ R14, R10
	ADCQ CX, R11

	MOVQ R8, 0(DI)
	MOVQ R9, 8(DI)
	MOVQ R10, 16(DI)
	MOVQ R11, 24(DI)
	RET
// gfpMul multiplies the two 4-limb (4 x 64-bit, least significant limb at
// offset 0) values at in1 and in2 and stores a 4-limb result at res
// (arguments: res at 0(FP), in1 at 8(FP), in2 at 16(FP), all pointers).
//
// The product is built one word of in2 at a time, and every partial product
// x*y[i] is followed by a reduction step: Y = lowest live limb * np
// (mod 2^64), then Y*p2 is added so that the lowest live limb cancels.
// This is the shape of a word-by-word Montgomery multiplication; it is
// correct provided np = -p2^-1 mod 2^64 (np is defined elsewhere -- confirm).
// After four rounds the value lives in acc4,acc5,acc0,acc1 with acc2 as the
// overflow limb, and p2 is subtracted once unless that subtraction borrows.
//
// Two equivalent code paths exist: a MULX-based one taken when the byte
// ·hasBMI2 is non-zero, and a MULQ-based one otherwise.
// The result is written only at the very end, after all reads of in1/in2.
TEXT ·gfpMul(SB),0,$0-24
MOVQ res+0(FP), res_ptr
MOVQ in1+8(FP), x_ptr
MOVQ in2+16(FP), y_ptr
// Fall back to the MULQ implementation when hasBMI2 is zero.
CMPB ·hasBMI2(SB), $0
JE nobmi2Mul
// ---- BMI2 path. MULXQ src, lo, hi computes DX*src and leaves the flags
// ---- untouched, so the ADD/ADC chains below can span across multiplies.
// x * y[0]
// acc4:acc3:acc2:acc1:acc0 = x * y[0]; acc5 is cleared as the next top limb.
MOVQ (8*0)(y_ptr), DX
MULXQ (8*0)(x_ptr), acc0, acc1
MULXQ (8*1)(x_ptr), AX, acc2
ADDQ AX, acc1
ADCQ $0, acc2
MULXQ (8*2)(x_ptr), AX, acc3
ADDQ AX, acc2
ADCQ $0, acc3
MULXQ (8*3)(x_ptr), AX, acc4
ADDQ AX, acc3
ADCQ $0, acc4
XORQ acc5, acc5
// First reduction step
// DX = Y = acc0 * np mod 2^64 (the high half in AX is discarded), then
// acc5:...:acc0 += Y * p2. acc0 is expected to become zero.
MOVQ acc0, DX
MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1
ADDQ AX, acc0
ADCQ t1, acc1
MULXQ ·p2+0x08(SB), AX, t1
// Fold the carry of the previous limb into the high word of this product.
ADCQ $0, t1
ADDQ AX, acc1
ADCQ t1, acc2
MULXQ ·p2+0x10(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc2
ADCQ t1, acc3
MULXQ ·p2+0x18(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc3
ADCQ t1, acc4
ADCQ $0, acc5
// x * y[1]
// Live window is now acc1..acc5; acc0 (cancelled above) becomes the new
// top limb and receives the final carry.
MOVQ (8*1)(y_ptr), DX
MULXQ (8*0)(x_ptr), AX, t1
ADDQ AX, acc1
ADCQ t1, acc2
MULXQ (8*1)(x_ptr), AX, t1
ADCQ $0, t1
ADDQ AX, acc2
ADCQ t1, acc3
MULXQ (8*2)(x_ptr), AX, t1
ADCQ $0, t1
ADDQ AX, acc3
ADCQ t1, acc4
MULXQ (8*3)(x_ptr), AX, t1
ADCQ $0, t1
ADDQ AX, acc4
ADCQ t1, acc5
ADCQ $0, acc0
// Second reduction step
// Y = acc1 * np mod 2^64; adding Y * p2 cancels acc1.
MOVQ acc1, DX
MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1
ADDQ AX, acc1
ADCQ t1, acc2
MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc2
ADCQ t1, acc3
MULXQ ·p2+0x10(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc3
ADCQ t1, acc4
MULXQ ·p2+0x18(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc4
ADCQ t1, acc5
ADCQ $0, acc0
// x * y[2]
// Live window acc2..acc0; acc1 (cancelled above) is the new top limb.
MOVQ (8*2)(y_ptr), DX
MULXQ (8*0)(x_ptr), AX, t1
ADDQ AX, acc2
ADCQ t1, acc3
MULXQ (8*1)(x_ptr), AX, t1
ADCQ $0, t1
ADDQ AX, acc3
ADCQ t1, acc4
MULXQ (8*2)(x_ptr), AX, t1
ADCQ $0, t1
ADDQ AX, acc4
ADCQ t1, acc5
MULXQ (8*3)(x_ptr), AX, t1
ADCQ $0, t1
ADDQ AX, acc5
ADCQ t1, acc0
ADCQ $0, acc1
// Third reduction step
// Y = acc2 * np mod 2^64; adding Y * p2 cancels acc2.
MOVQ acc2, DX
MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1
ADDQ AX, acc2
ADCQ t1, acc3
MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc3
ADCQ t1, acc4
MULXQ ·p2+0x10(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc4
ADCQ t1, acc5
MULXQ ·p2+0x18(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc5
ADCQ t1, acc0
ADCQ $0, acc1
// x * y[3]
// Live window acc3..acc1; acc2 (cancelled above) is the new top limb.
MOVQ (8*3)(y_ptr), DX
MULXQ (8*0)(x_ptr), AX, t1
ADDQ AX, acc3
ADCQ t1, acc4
MULXQ (8*1)(x_ptr), AX, t1
ADCQ $0, t1
ADDQ AX, acc4
ADCQ t1, acc5
MULXQ (8*2)(x_ptr), AX, t1
ADCQ $0, t1
ADDQ AX, acc5
ADCQ t1, acc0
MULXQ (8*3)(x_ptr), AX, t1
ADCQ $0, t1
ADDQ AX, acc0
ADCQ t1, acc1
ADCQ $0, acc2
// Last reduction step
// Y = acc3 * np mod 2^64; adding Y * p2 cancels acc3.
MOVQ acc3, DX
MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1
ADDQ AX, acc3
ADCQ t1, acc4
MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc4
ADCQ t1, acc5
MULXQ ·p2+0x10(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc5
ADCQ t1, acc0
MULXQ ·p2+0x18(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc0
ADCQ t1, acc1
ADCQ $0, acc2
// Copy result [255:0]
// The value is acc1:acc0:acc5:acc4 with overflow limb acc2. The input
// pointer x_ptr and the cancelled acc3 are free and hold the backup copy.
MOVQ acc4, x_ptr
MOVQ acc5, acc3
MOVQ acc0, t0
MOVQ acc1, t1
// Subtract p2
SUBQ ·p2+0x00(SB), acc4
SBBQ ·p2+0x08(SB) ,acc5
SBBQ ·p2+0x10(SB), acc0
SBBQ ·p2+0x18(SB), acc1
SBBQ $0, acc2
// A borrow means the value was already below p2: restore the backup.
CMOVQCS x_ptr, acc4
CMOVQCS acc3, acc5
CMOVQCS t0, acc0
CMOVQCS t1, acc1
MOVQ acc4, (8*0)(res_ptr)
MOVQ acc5, (8*1)(res_ptr)
MOVQ acc0, (8*2)(res_ptr)
MOVQ acc1, (8*3)(res_ptr)
RET
// ---- Generic path. MULQ multiplies AX by its operand into DX:AX and
// ---- clobbers the flags, so each partial product is added in two steps:
// ---- first the saved carry word t1, then the low half in AX.
nobmi2Mul:
// x * y[0]
// acc4:acc3:acc2:acc1:acc0 = x * y[0]; acc5 is cleared as the next top limb.
MOVQ (8*0)(y_ptr), t0
MOVQ (8*0)(x_ptr), AX
MULQ t0
MOVQ AX, acc0
MOVQ DX, acc1
MOVQ (8*1)(x_ptr), AX
MULQ t0
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, acc2
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, acc3
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, acc4
XORQ acc5, acc5
// First reduction step
// t0 = Y = acc0 * np mod 2^64 (only the low half in AX is kept), then
// acc5:...:acc0 += Y * p2. acc0 is expected to become zero.
MOVQ acc0, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc0
ADCQ $0, DX
// t1 carries the high word into the next limb.
MOVQ DX, t1
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ DX, acc4
ADCQ $0, acc5
// x * y[1]
// Live window acc1..acc5; acc0 (cancelled above) receives the final carry.
MOVQ (8*1)(y_ptr), t0
MOVQ (8*0)(x_ptr), AX
MULQ t0
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*1)(x_ptr), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ DX, acc5
ADCQ $0, acc0
// Second reduction step
// Y = acc1 * np mod 2^64; adding Y * p2 cancels acc1.
MOVQ acc1, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ DX, acc5
ADCQ $0, acc0
// x * y[2]
// Live window acc2..acc0; acc1 (cancelled above) receives the final carry.
MOVQ (8*2)(y_ptr), t0
MOVQ (8*0)(x_ptr), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*1)(x_ptr), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ DX, acc0
ADCQ $0, acc1
// Third reduction step
// Y = acc2 * np mod 2^64; adding Y * p2 cancels acc2.
MOVQ acc2, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ DX, acc0
ADCQ $0, acc1
// x * y[3]
// Live window acc3..acc1; acc2 (cancelled above) receives the final carry.
MOVQ (8*3)(y_ptr), t0
MOVQ (8*0)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*1)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
ADCQ $0, acc2
// Last reduction step
// Y = acc3 * np mod 2^64; adding Y * p2 cancels acc3.
MOVQ acc3, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, t1
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
ADCQ $0, acc2
// Copy result [255:0]
// Same final step as the BMI2 path: back up acc1:acc0:acc5:acc4, subtract
// p2 with acc2 as overflow limb, and restore the backup on borrow.
MOVQ acc4, x_ptr
MOVQ acc5, acc3
MOVQ acc0, t0
MOVQ acc1, t1
// Subtract p2
SUBQ ·p2+0x00(SB), acc4
SBBQ ·p2+0x08(SB) ,acc5
SBBQ ·p2+0x10(SB), acc0
SBBQ ·p2+0x18(SB), acc1
SBBQ $0, acc2
CMOVQCS x_ptr, acc4
CMOVQCS acc3, acc5
CMOVQCS t0, acc0
CMOVQCS t1, acc1
MOVQ acc4, (8*0)(res_ptr)
MOVQ acc5, (8*1)(res_ptr)
MOVQ acc0, (8*2)(res_ptr)
MOVQ acc1, (8*3)(res_ptr)
RET
// func gfpSqr(res, in *gfP, n int)
//
// gfpSqr squares the 4-limb value at in, reduces it, stores it at res, and
// repeats the squaring on the stored result until the counter n (kept in
// BX) reaches zero; res therefore ends up holding the input squared n times
// in a row.
//
// Each iteration computes the full 8-limb square as
//   2 * (sum of cross products x[i]*x[j], i < j) + (sum of squares x[i]^2),
// runs four reduction steps on the low four limbs (Y = limb * np mod 2^64,
// add Y * p2 so the limb cancels), adds the high four limbs of the square,
// and finally subtracts p2 once unless that subtraction borrows.
// The reduction is correct provided np = -p2^-1 mod 2^64 (np is defined
// elsewhere -- confirm).
//
// NOTE(review): the loop body runs before the counter is decremented and
// exits only when the decrement yields zero, so n == 0 would wrap the
// counter around. Callers presumably always pass n >= 1 -- confirm.
//
// The path is chosen once, before the loop: MULX-based when the byte
// ·hasBMI2 is non-zero, MULQ-based otherwise.
TEXT ·gfpSqr(SB),NOSPLIT,$0
MOVQ res+0(FP), res_ptr
MOVQ in+8(FP), x_ptr
MOVQ n+16(FP), BX
CMPB ·hasBMI2(SB), $0
JE gfpSqrLoop
// ---- BMI2 path. MULXQ src, lo, hi computes DX*src without touching flags.
gfpSqrLoopBMI2:
// y[1:] * y[0]
// Cross products with limb 0 go to acc1..acc4.
MOVQ (8*0)(x_ptr), DX
MULXQ (8*1)(x_ptr), acc1, acc2
MULXQ (8*2)(x_ptr), AX, acc3
ADDQ AX, acc2
ADCQ $0, acc3
MULXQ (8*3)(x_ptr), AX, acc4
ADDQ AX, acc3
ADCQ $0, acc4
// y[2:] * y[1]
MOVQ (8*1)(x_ptr), DX
MULXQ (8*2)(x_ptr), AX, t1
ADDQ AX, acc3
ADCQ t1, acc4
MULXQ (8*3)(x_ptr), AX, acc5
ADCQ $0, acc5
ADDQ AX, acc4
ADCQ $0, acc5
// y[3] * y[2]
// y_ptr is not needed as a pointer here and serves as the seventh limb.
MOVQ (8*2)(x_ptr), DX
MULXQ (8*3)(x_ptr), AX, y_ptr
ADDQ AX, acc5
ADCQ $0, y_ptr
XORQ t1, t1
// *2
// Double the cross products; t1 catches the bit shifted out of the top.
ADDQ acc1, acc1
ADCQ acc2, acc2
ADCQ acc3, acc3
ADCQ acc4, acc4
ADCQ acc5, acc5
ADCQ y_ptr, y_ptr
ADCQ $0, t1
// Missing products
// Add the squares x[i]^2. One carry chain runs through all four of them
// because MOVQ and MULXQ leave the flags alone. The last product's high
// half lands in x_ptr, which destroys the input pointer; it is reloaded
// from res_ptr at the end of the iteration.
MOVQ (8*0)(x_ptr), DX
MULXQ DX, acc0, t0
ADDQ t0, acc1
MOVQ (8*1)(x_ptr), DX
MULXQ DX, AX, t0
ADCQ AX, acc2
ADCQ t0, acc3
MOVQ (8*2)(x_ptr), DX
MULXQ DX, AX, t0
ADCQ AX, acc4
ADCQ t0, acc5
MOVQ (8*3)(x_ptr), DX
MULXQ DX, AX, x_ptr
ADCQ AX, y_ptr
ADCQ t1, x_ptr
// The square is now [acc0, acc1, acc2, acc3, acc4, acc5, y_ptr, x_ptr],
// least significant limb first.
// First reduction step
// DX = Y = acc0 * np mod 2^64 (high half in AX is discarded).
MOVQ acc0, DX
MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1
ADDQ AX, acc0 // acc0 += low(Y * p2[0]); expected to cancel to zero, carry goes on
ADCQ t1, acc1
MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc1
ADCQ t1, acc2
MULXQ ·p2+0x10(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc2
ADCQ t1, acc3
// acc0 is overwritten with the high half and becomes the new top limb of
// the 4-limb reduction window (acc4.. are only added after the reduction).
MULXQ ·p2+0x18(SB), AX, acc0
ADCQ $0, acc0
ADDQ AX, acc3
ADCQ $0, acc0
// Second reduction step
// Y = acc1 * np mod 2^64; acc1 is cancelled and then reused as top limb.
MOVQ acc1, DX
MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1
ADDQ AX, acc1
ADCQ t1, acc2
MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc2
ADCQ t1, acc3
MULXQ ·p2+0x10(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc3
ADCQ t1, acc0
MULXQ ·p2+0x18(SB), AX, acc1
ADCQ $0, acc1
ADDQ AX, acc0
ADCQ $0, acc1
// Third reduction step
// Y = acc2 * np mod 2^64; acc2 is cancelled and then reused as top limb.
MOVQ acc2, DX
MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1
ADDQ AX, acc2
ADCQ t1, acc3
MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc3
ADCQ t1, acc0
MULXQ ·p2+0x10(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc0
ADCQ t1, acc1
MULXQ ·p2+0x18(SB), AX, acc2
ADCQ $0, acc2
ADDQ AX, acc1
ADCQ $0, acc2
// Last reduction step
// Y = acc3 * np mod 2^64; acc3 is cancelled and then reused as top limb.
MOVQ acc3, DX
MULXQ ·np+0x00(SB), DX, AX
MULXQ ·p2+0x00(SB), AX, t1
ADDQ AX, acc3
ADCQ t1, acc0
MULXQ ·p2+0x08(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc0
ADCQ t1, acc1
MULXQ ·p2+0x10(SB), AX, t1
ADCQ $0, t1
ADDQ AX, acc1
ADCQ t1, acc2
MULXQ ·p2+0x18(SB), AX, acc3
ADCQ $0, acc3
ADDQ AX, acc2
ADCQ $0, acc3
XORQ t0, t0
// Add bits [511:256] of the sqr result
// t0 collects the carry out of this addition as the overflow limb.
ADDQ acc4, acc0
ADCQ acc5, acc1
ADCQ y_ptr, acc2
ADCQ x_ptr, acc3
ADCQ $0, t0
// Back up the unreduced value before the trial subtraction.
MOVQ acc0, acc4
MOVQ acc1, acc5
MOVQ acc2, y_ptr
MOVQ acc3, t1
// Subtract p2
SUBQ ·p2+0x00(SB), acc0
SBBQ ·p2+0x08(SB) ,acc1
SBBQ ·p2+0x10(SB), acc2
SBBQ ·p2+0x18(SB), acc3
SBBQ $0, t0
// A borrow means the value was already below p2: restore the backup.
CMOVQCS acc4, acc0
CMOVQCS acc5, acc1
CMOVQCS y_ptr, acc2
CMOVQCS t1, acc3
MOVQ acc0, (8*0)(res_ptr)
MOVQ acc1, (8*1)(res_ptr)
MOVQ acc2, (8*2)(res_ptr)
MOVQ acc3, (8*3)(res_ptr)
// The next iteration squares the value just stored.
MOVQ res_ptr, x_ptr
DECQ BX
JNE gfpSqrLoopBMI2
RET
// ---- Generic path. MULQ multiplies AX by its operand into DX:AX and
// ---- clobbers the flags, so carries are passed along in t0/t1.
gfpSqrLoop:
// y[1:] * y[0]
// Cross products with limb 0 go to acc1..acc4.
MOVQ (8*0)(x_ptr), t0
MOVQ (8*1)(x_ptr), AX
MULQ t0
MOVQ AX, acc1
MOVQ DX, acc2
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, acc3
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, acc4
// y[2:] * y[1]
MOVQ (8*1)(x_ptr), t0
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, acc5
// y[3] * y[2]
// y_ptr is not needed as a pointer here and serves as the seventh limb.
MOVQ (8*2)(x_ptr), t0
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, y_ptr
XORQ t1, t1
// *2
// Double the cross products; t1 catches the bit shifted out of the top.
ADDQ acc1, acc1
ADCQ acc2, acc2
ADCQ acc3, acc3
ADCQ acc4, acc4
ADCQ acc5, acc5
ADCQ y_ptr, y_ptr
ADCQ $0, t1
// Missing products
// Add the squares x[i]^2. The high half of each square is carried to the
// next pair of limbs through t0.
MOVQ (8*0)(x_ptr), AX
MULQ AX
MOVQ AX, acc0
MOVQ DX, t0
MOVQ (8*1)(x_ptr), AX
MULQ AX
ADDQ t0, acc1
ADCQ AX, acc2
ADCQ $0, DX
MOVQ DX, t0
MOVQ (8*2)(x_ptr), AX
MULQ AX
ADDQ t0, acc3
ADCQ AX, acc4
ADCQ $0, DX
MOVQ DX, t0
MOVQ (8*3)(x_ptr), AX
MULQ AX
ADDQ t0, acc5
ADCQ AX, y_ptr
ADCQ DX, t1
// The input pointer is overwritten with the top limb; it is reloaded from
// res_ptr at the end of the iteration.
MOVQ t1, x_ptr
// T = [acc0, acc1, acc2, acc3, acc4, acc5, y_ptr, x_ptr]
// First reduction step
MOVQ acc0, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc0 // acc0 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
// Cleared explicitly so acc0 can serve as the new top limb below; the carry
// has already been saved in t1, so losing the flags here is harmless.
XORQ acc0, acc0
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ DX, acc0
// Second reduction step
MOVQ acc1, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc1 // acc1 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
XORQ acc1, acc1
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
// Third reduction step
MOVQ acc2, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc2 // acc2 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
XORQ acc2, acc2
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ DX, acc2
// Last reduction step
MOVQ acc3, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc3 // acc3 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
XORQ acc3, acc3
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ DX, acc3
XORQ t0, t0
// Add bits [511:256] of the sqr result
// t0 collects the carry out of this addition as the overflow limb.
ADDQ acc4, acc0
ADCQ acc5, acc1
ADCQ y_ptr, acc2
ADCQ x_ptr, acc3
ADCQ $0, t0
// Back up the unreduced value before the trial subtraction.
MOVQ acc0, acc4
MOVQ acc1, acc5
MOVQ acc2, y_ptr
MOVQ acc3, t1
// Subtract p2
SUBQ ·p2+0x00(SB), acc0
SBBQ ·p2+0x08(SB) ,acc1
SBBQ ·p2+0x10(SB), acc2
SBBQ ·p2+0x18(SB), acc3
SBBQ $0, t0
// A borrow means the value was already below p2: restore the backup.
CMOVQCS acc4, acc0
CMOVQCS acc5, acc1
CMOVQCS y_ptr, acc2
CMOVQCS t1, acc3
MOVQ acc0, (8*0)(res_ptr)
MOVQ acc1, (8*1)(res_ptr)
MOVQ acc2, (8*2)(res_ptr)
MOVQ acc3, (8*3)(res_ptr)
// The next iteration squares the value just stored.
MOVQ res_ptr, x_ptr
DECQ BX
JNE gfpSqrLoop
RET
/* ---------------------------------------*/
// func gfpFromMont(res, in *gfP)
//
// gfpFromMont loads the 4-limb value at in, runs four reduction steps on it
// with no multiplication by a second operand, and stores the 4-limb result
// at res. Each step computes Y = lowest live limb * np mod 2^64 and adds
// Y * p2 so that limb cancels; after four steps the value has effectively
// been divided by 2^256 modulo p2, provided np = -p2^-1 mod 2^64 (np is
// defined elsewhere -- confirm). A final trial subtraction of p2 is kept
// only when it does not borrow.
//
// Only the MULQ form is used here; ·hasBMI2 is not consulted.
TEXT ·gfpFromMont(SB),NOSPLIT,$0
MOVQ res+0(FP), res_ptr
MOVQ in+8(FP), x_ptr
MOVQ (8*0)(x_ptr), acc0
MOVQ (8*1)(x_ptr), acc1
MOVQ (8*2)(x_ptr), acc2
MOVQ (8*3)(x_ptr), acc3
// acc4 starts as the empty top limb above the four input limbs.
XORQ acc4, acc4
// Only reduce, no multiplications are needed
// First reduction step
MOVQ acc0, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc0 // acc0 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
// Cleared so acc0 can receive the top carry in the third step; the carry of
// the addition above is already saved in t1.
XORQ acc0, acc0
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ DX, acc4
// acc5 is the empty top limb for the second step.
XORQ acc5, acc5
// Second reduction step
MOVQ acc1, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc1 // acc1 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
// Cleared so acc1 can receive the top carry in the last step.
XORQ acc1, acc1
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ DX, acc5
// Third reduction step
MOVQ acc2, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc2 // acc2 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
// Unlike acc0/acc1 above, acc2 is not cleared: it is not read again in
// this function.
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ DX, acc0
// Last reduction step
MOVQ acc3, AX
MULQ ·np+0x00(SB)
MOVQ AX, t0 // Y
// Calculate next T = T+Y*P
MOVQ ·p2+0x00(SB), AX
MULQ t0
ADDQ AX, acc3 // acc3 is free now
ADCQ $0, DX
MOVQ DX, t1 // carry
XORQ acc3, acc3
MOVQ ·p2+0x08(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x10(SB), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, t1 // carry
MOVQ ·p2+0x18(SB), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
// The value is now acc1:acc0:acc5:acc4. Back it up (x_ptr and acc3 are
// free), subtract p2, and restore the backup if the subtraction borrows.
// No overflow limb takes part in this comparison.
MOVQ acc4, x_ptr
MOVQ acc5, acc3
MOVQ acc0, t0
MOVQ acc1, t1
SUBQ ·p2+0x00(SB), acc4
SBBQ ·p2+0x08(SB) ,acc5
SBBQ ·p2+0x10(SB), acc0
SBBQ ·p2+0x18(SB), acc1
CMOVQCS x_ptr, acc4
CMOVQCS acc3, acc5
CMOVQCS t0, acc0
CMOVQCS t1, acc1
MOVQ acc4, (8*0)(res_ptr)
MOVQ acc5, (8*1)(res_ptr)
MOVQ acc0, (8*2)(res_ptr)
MOVQ acc1, (8*3)(res_ptr)
RET