gmsm/sm9/bn256/gfp2_g1_amd64.s

//go:build amd64 && !purego
// +build amd64,!purego

#include "textflag.h"
/* ---------------------------------------*/
#define mul0 AX
#define mul1 DX
#define acc0 BX
#define acc1 CX
#define acc2 R8
#define acc3 R9
#define acc4 R10
#define acc5 R11
#define acc6 R12
#define acc7 R13
#define t0 R14
#define t1 R15
#define t2 DI
#define t3 SI
#define hlp BP
/* ---------------------------------------*/
// (acc7, acc6, acc5, acc4) = (acc7, acc6, acc5, acc4) - (t3, t2, t1, t0)
TEXT gfpSubInternal(SB),NOSPLIT,$0
XORQ mul0, mul0
SUBQ t0, acc4
SBBQ t1, acc5
SBBQ t2, acc6
SBBQ t3, acc7
SBBQ $0, mul0
MOVQ acc4, acc0
MOVQ acc5, acc1
MOVQ acc6, acc2
MOVQ acc7, acc3
ADDQ ·p2+0(SB), acc4
ADCQ ·p2+8(SB), acc5
ADCQ ·p2+16(SB), acc6
ADCQ ·p2+24(SB), acc7
ANDQ $1, mul0
// CMOVQEQ: Move if equal (ZF == 1)
CMOVQEQ acc0, acc4
CMOVQEQ acc1, acc5
CMOVQEQ acc2, acc6
CMOVQEQ acc3, acc7
RET
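// A hedged Go sketch of the subtraction pattern above (illustration only; x,
// y, d and p2 are little-endian 4-limb uint64 arrays, using math/bits):
//
//	var borrow, carry uint64
//	var d, dp [4]uint64
//	for i := 0; i < 4; i++ {
//		d[i], borrow = bits.Sub64(x[i], y[i], borrow) // raw difference
//	}
//	for i := 0; i < 4; i++ {
//		dp[i], carry = bits.Add64(d[i], p2[i], carry) // difference + p
//	}
//	if borrow == 0 {
//		copy(dp[:], d[:]) // no underflow: keep the raw difference (CMOVQEQ above)
//	}
//	// dp == (x - y) mod p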
/* ---------------------------------------*/
// (acc7, acc6, acc5, acc4) = (acc7, acc6, acc5, acc4) * (t3, t2, t1, t0)
// t0, t1 will be overwritten by this function call
TEXT gfpMulInternal(SB),NOSPLIT,$8
CMPB ·supportADX(SB), $0
JE noAdxMul
// [t3, t2, t1, t0] * acc4
MOVQ acc4, mul1
MULXQ t0, acc0, acc1
MULXQ t1, mul0, acc2
ADDQ mul0, acc1
MULXQ t2, mul0, acc3
ADCQ mul0, acc2
MULXQ t3, mul0, acc4
ADCQ mul0, acc3
ADCQ $0, acc4
// [t3, t2, t1, t0] * acc5
MOVQ acc5, mul1
MULXQ t0, mul0, hlp
ADDQ mul0, acc1
ADCQ hlp, acc2
MULXQ t1, mul0, hlp
ADCQ $0, hlp
ADDQ mul0, acc2
ADCQ hlp, acc3
MULXQ t2, mul0, hlp
ADCQ $0, hlp
ADDQ mul0, acc3
ADCQ hlp, acc4
MULXQ t3, mul0, acc5
ADCQ $0, acc5
ADDQ mul0, acc4
ADCQ $0, acc5
// [t3, t2, t1, t0] * acc6
MOVQ acc6, mul1
MULXQ t0, mul0, hlp
ADDQ mul0, acc2
ADCQ hlp, acc3
MULXQ t1, mul0, hlp
ADCQ $0, hlp
ADDQ mul0, acc3
ADCQ hlp, acc4
MULXQ t2, mul0, hlp
ADCQ $0, hlp
ADDQ mul0, acc4
ADCQ hlp, acc5
MULXQ t3, mul0, acc6
ADCQ $0, acc6
ADDQ mul0, acc5
ADCQ $0, acc6
// [t3, t2, t1, t0] * acc7
MOVQ acc7, mul1
MULXQ t0, mul0, hlp
ADDQ mul0, acc3
ADCQ hlp, acc4
MULXQ t1, mul0, hlp
ADCQ $0, hlp
ADDQ mul0, acc4
ADCQ hlp, acc5
MULXQ t2, mul0, hlp
ADCQ $0, hlp
ADDQ mul0, acc5
ADCQ hlp, acc6
MULXQ t3, mul0, acc7
ADCQ $0, acc7
ADDQ mul0, acc6
ADCQ $0, acc7
// T = [acc7, acc6, acc5, acc4, acc3, acc2, acc1, acc0]
// First reduction step
XORQ t1, t1
MOVQ acc0, mul1
MULXQ ·np+0x00(SB), mul1, mul0
MULXQ ·p2+0x00(SB), mul0, t0
ADOXQ mul0, acc0 // acc0 + low(mul1 * p2[0]); the low limb is eliminated
MULXQ ·p2+0x08(SB), mul0, hlp
ADCXQ t0, mul0
ADOXQ mul0, acc1
MULXQ ·p2+0x10(SB), mul0, t0
ADCXQ hlp, mul0
ADOXQ mul0, acc2
MULXQ ·p2+0x18(SB), mul0, acc0
ADCXQ t0, mul0
ADOXQ mul0, acc3
ADCXQ t1, acc0
ADOXQ t1, acc0
// Second reduction step
MOVQ acc1, mul1
MULXQ ·np+0x00(SB), mul1, mul0
MULXQ ·p2+0x00(SB), mul0, t0
ADOXQ mul0, acc1
MULXQ ·p2+0x08(SB), mul0, hlp
ADCXQ t0, mul0
ADOXQ mul0, acc2
MULXQ ·p2+0x10(SB), mul0, t0
ADCXQ hlp, mul0
ADOXQ mul0, acc3
MULXQ ·p2+0x18(SB), mul0, acc1
ADCXQ t0, mul0
ADOXQ mul0, acc0
ADCXQ t1, acc1
ADOXQ t1, acc1
// Third reduction step
MOVQ acc2, mul1
MULXQ ·np+0x00(SB), mul1, mul0
MULXQ ·p2+0x00(SB), mul0, t0
ADOXQ mul0, acc2
MULXQ ·p2+0x08(SB), mul0, hlp
ADCXQ t0, mul0
ADOXQ mul0, acc3
MULXQ ·p2+0x10(SB), mul0, t0
ADCXQ hlp, mul0
ADOXQ mul0, acc0
MULXQ ·p2+0x18(SB), mul0, acc2
ADCXQ t0, mul0
ADOXQ mul0, acc1
ADCXQ t1, acc2
ADOXQ t1, acc2
// Last reduction step
MOVQ acc3, mul1
MULXQ ·np+0x00(SB), mul1, mul0
MULXQ ·p2+0x00(SB), mul0, t0
ADOXQ mul0, acc3
MULXQ ·p2+0x08(SB), mul0, hlp
ADCXQ t0, mul0
ADOXQ mul0, acc0
MULXQ ·p2+0x10(SB), mul0, t0
ADCXQ hlp, mul0
ADOXQ mul0, acc1
MULXQ ·p2+0x18(SB), mul0, acc3
ADCXQ t0, mul0
ADOXQ mul0, acc2
ADCXQ t1, acc3
ADOXQ t1, acc3
MOVQ $0, hlp
// Add bits [511:256] of the result
ADDQ acc0, acc4
ADCQ acc1, acc5
ADCQ acc2, acc6
ADCQ acc3, acc7
ADCQ $0, hlp
// Copy result
MOVQ acc4, acc0
MOVQ acc5, acc1
MOVQ acc6, acc2
MOVQ acc7, acc3
// Subtract p
SUBQ ·p2+0(SB), acc4
SBBQ ·p2+8(SB), acc5
SBBQ ·p2+16(SB), acc6
SBBQ ·p2+24(SB), acc7
SBBQ $0, hlp
// If the result of the subtraction is negative, restore the previous result
CMOVQCS acc0, acc4
CMOVQCS acc1, acc5
CMOVQCS acc2, acc6
CMOVQCS acc3, acc7
RET
noAdxMul:
// [t3, t2, t1, t0] * acc4
MOVQ acc4, mul0
MULQ t0
MOVQ mul0, acc0
MOVQ mul1, acc1
MOVQ acc4, mul0
MULQ t1
ADDQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, acc2
MOVQ acc4, mul0
MULQ t2
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, acc3
MOVQ acc4, mul0
MULQ t3
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, acc4
// [t3, t2, t1, t0] * acc5
MOVQ acc5, mul0
MULQ t0
ADDQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ acc5, mul0
MULQ t1
ADDQ hlp, acc2
ADCQ $0, mul1
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ acc5, mul0
MULQ t2
ADDQ hlp, acc3
ADCQ $0, mul1
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ acc5, mul0
MULQ t3
ADDQ hlp, acc4
ADCQ $0, mul1
ADDQ mul0, acc4
ADCQ $0, mul1
MOVQ mul1, acc5
// [t3, t2, t1, t0] * acc6
MOVQ acc6, mul0
MULQ t0
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ acc6, mul0
MULQ t1
ADDQ hlp, acc3
ADCQ $0, mul1
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ acc6, mul0
MULQ t2
ADDQ hlp, acc4
ADCQ $0, mul1
ADDQ mul0, acc4
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ acc6, mul0
MULQ t3
ADDQ hlp, acc5
ADCQ $0, mul1
ADDQ mul0, acc5
ADCQ $0, mul1
MOVQ mul1, acc6
// [t3, t2, t1, t0] * acc7
MOVQ acc7, mul0
MULQ t0
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ acc7, mul0
MULQ t1
ADDQ hlp, acc4
ADCQ $0, mul1
ADDQ mul0, acc4
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ acc7, mul0
MULQ t2
ADDQ hlp, acc5
ADCQ $0, mul1
ADDQ mul0, acc5
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ acc7, mul0
MULQ t3
ADDQ hlp, acc6
ADCQ $0, mul1
ADDQ mul0, acc6
ADCQ $0, mul1
MOVQ mul1, acc7
// T = [acc7, acc6, acc5, acc4, acc3, acc2, acc1, acc0]
// First reduction step
MOVQ acc0, mul0
MULQ ·np+0x00(SB)
MOVQ mul0, hlp
MOVQ ·p2+0x00(SB), mul0
MULQ hlp
ADDQ mul0, acc0
ADCQ $0, mul1
MOVQ mul1, t0
XORQ acc0, acc0
MOVQ ·p2+0x08(SB), mul0
MULQ hlp
ADDQ t0, acc1
ADCQ $0, mul1
ADDQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, t0
MOVQ ·p2+0x10(SB), mul0
MULQ hlp
ADDQ t0, acc2
ADCQ $0, mul1
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, t0
MOVQ ·p2+0x18(SB), mul0
MULQ hlp
ADDQ t0, acc3
ADCQ $0, mul1
ADDQ mul0, acc3
ADCQ mul1, acc0
// Second reduction step
MOVQ acc1, mul0
MULQ ·np+0x00(SB)
MOVQ mul0, hlp
MOVQ ·p2+0x00(SB), mul0
MULQ hlp
ADDQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, t0
XORQ acc1, acc1
MOVQ ·p2+0x08(SB), mul0
MULQ hlp
ADDQ t0, acc2
ADCQ $0, mul1
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, t0
MOVQ ·p2+0x10(SB), mul0
MULQ hlp
ADDQ t0, acc3
ADCQ $0, mul1
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, t0
MOVQ ·p2+0x18(SB), mul0
MULQ hlp
ADDQ t0, acc0
ADCQ $0, mul1
ADDQ mul0, acc0
ADCQ mul1, acc1
// Third reduction step
MOVQ acc2, mul0
MULQ ·np+0x00(SB)
MOVQ mul0, hlp
MOVQ ·p2+0x00(SB), mul0
MULQ hlp
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, t0
XORQ acc2, acc2
MOVQ ·p2+0x08(SB), mul0
MULQ hlp
ADDQ t0, acc3
ADCQ $0, mul1
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, t0
MOVQ ·p2+0x10(SB), mul0
MULQ hlp
ADDQ t0, acc0
ADCQ $0, mul1
ADDQ mul0, acc0
ADCQ $0, mul1
MOVQ mul1, t0
MOVQ ·p2+0x18(SB), mul0
MULQ hlp
ADDQ t0, acc1
ADCQ $0, mul1
ADDQ mul0, acc1
ADCQ mul1, acc2
// Last reduction step
MOVQ acc3, mul0
MULQ ·np+0x00(SB)
MOVQ mul0, hlp
MOVQ ·p2+0x00(SB), mul0
MULQ hlp
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, t0
XORQ acc3, acc3
MOVQ ·p2+0x08(SB), mul0
MULQ hlp
ADDQ t0, acc0
ADCQ $0, mul1
ADDQ mul0, acc0
ADCQ $0, mul1
MOVQ mul1, t0
MOVQ ·p2+0x10(SB), mul0
MULQ hlp
ADDQ t0, acc1
ADCQ $0, mul1
ADDQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, t0
MOVQ ·p2+0x18(SB), mul0
MULQ hlp
ADDQ t0, acc2
ADCQ $0, mul1
ADDQ mul0, acc2
ADCQ mul1, acc3
MOVQ $0, hlp
// Add bits [511:256] of the result
ADDQ acc0, acc4
ADCQ acc1, acc5
ADCQ acc2, acc6
ADCQ acc3, acc7
ADCQ $0, hlp
// Copy result
MOVQ acc4, acc0
MOVQ acc5, acc1
MOVQ acc6, acc2
MOVQ acc7, acc3
// Subtract p
SUBQ ·p2+0(SB), acc4
SBBQ ·p2+8(SB), acc5
SBBQ ·p2+16(SB), acc6
SBBQ ·p2+24(SB), acc7
SBBQ $0, hlp
// If the result of the subtraction is negative, restore the previous result
CMOVQCS acc0, acc4
CMOVQCS acc1, acc5
CMOVQCS acc2, acc6
CMOVQCS acc3, acc7
RET
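// Functionally, gfpMulInternal is a Montgomery product: with R = 2^256, ·p2
// the modulus and ·np+0x00 assumed to hold -p^-1 mod 2^64, the routine returns
// a*b*R^-1 mod p. A hedged math/big sketch of that contract (reference only):
//
//	func montMulRef(a, b, p *big.Int) *big.Int {
//		r := new(big.Int).Lsh(big.NewInt(1), 256) // R = 2^256
//		rInv := new(big.Int).ModInverse(r, p)
//		t := new(big.Int).Mul(a, b)               // the 512-bit product built above
//		t.Mul(t, rInv)                            // four reduction steps, each dividing by 2^64
//		return t.Mod(t, p)                        // final conditional subtraction of p
//	}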
/* ---------------------------------------*/
// (acc7, acc6, acc5, acc4) = (acc7, acc6, acc5, acc4) ^ 2
TEXT gfpSqrInternal(SB),NOSPLIT,$8
CMPB ·supportADX(SB), $0
JE noAdxSqr
XORQ t3, t3
// [acc7, acc6, acc5] * acc4
MOVQ acc4, mul1
MULXQ acc5, acc1, acc2
MULXQ acc6, mul0, acc3
ADOXQ mul0, acc2
MULXQ acc7, mul0, t0
ADOXQ mul0, acc3
ADOXQ t3, t0
// [acc7, acc6] * acc5
MOVQ acc5, mul1
MULXQ acc6, mul0, hlp
ADOXQ mul0, acc3
MULXQ acc7, mul0, t1
ADCXQ hlp, mul0
ADOXQ mul0, t0
ADCXQ t3, t1
// acc7 * acc6
MOVQ acc6, mul1
MULXQ acc7, mul0, t2
ADOXQ mul0, t1
ADOXQ t3, t2
// *2
ADOXQ acc1, acc1
ADOXQ acc2, acc2
ADOXQ acc3, acc3
ADOXQ t0, t0
ADOXQ t1, t1
ADOXQ t2, t2
ADOXQ t3, t3
// Missing products
MOVQ acc4, mul1
MULXQ mul1, acc0, acc4
ADCXQ acc4, acc1
MOVQ acc5, mul1
MULXQ mul1, mul0, acc4
ADCXQ mul0, acc2
ADCXQ acc4, acc3
MOVQ acc6, mul1
MULXQ mul1, mul0, acc4
ADCXQ mul0, t0
ADCXQ acc4, t1
MOVQ acc7, mul1
MULXQ mul1, mul0, acc4
ADCXQ mul0, t2
ADCXQ acc4, t3
// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
// First reduction step
XORQ acc5, acc5
MOVQ acc0, mul1
MULXQ ·np+0x00(SB), mul1, mul0
MULXQ ·p2+0x00(SB), mul0, acc4
ADOXQ mul0, acc0 // acc0 + low(mul1 * p2[0]); the low limb is eliminated
MULXQ ·p2+0x08(SB), mul0, hlp
ADCXQ acc4, mul0
ADOXQ mul0, acc1
MULXQ ·p2+0x10(SB), mul0, acc4
ADCXQ hlp, mul0
ADOXQ mul0, acc2
MULXQ ·p2+0x18(SB), mul0, acc0
ADCXQ acc4, mul0
ADOXQ mul0, acc3
ADCXQ acc5, acc0
ADOXQ acc5, acc0
// Second reduction step
MOVQ acc1, mul1
MULXQ ·np+0x00(SB), mul1, mul0
MULXQ ·p2+0x00(SB), mul0, acc4
ADOXQ mul0, acc1
MULXQ ·p2+0x08(SB), mul0, hlp
ADCXQ acc4, mul0
ADOXQ mul0, acc2
MULXQ ·p2+0x10(SB), mul0, acc4
ADCXQ hlp, mul0
ADOXQ mul0, acc3
MULXQ ·p2+0x18(SB), mul0, acc1
ADCXQ acc4, mul0
ADOXQ mul0, acc0
ADCXQ acc5, acc1
ADOXQ acc5, acc1
// Third reduction step
MOVQ acc2, mul1
MULXQ ·np+0x00(SB), mul1, mul0
MULXQ ·p2+0x00(SB), mul0, acc4
ADOXQ mul0, acc2
MULXQ ·p2+0x08(SB), mul0, hlp
ADCXQ acc4, mul0
ADOXQ mul0, acc3
MULXQ ·p2+0x10(SB), mul0, acc4
ADCXQ hlp, mul0
ADOXQ mul0, acc0
MULXQ ·p2+0x18(SB), mul0, acc2
ADCXQ acc4, mul0
ADOXQ mul0, acc1
ADCXQ acc5, acc2
ADOXQ acc5, acc2
// Last reduction step
MOVQ acc3, mul1
MULXQ ·np+0x00(SB), mul1, mul0
MULXQ ·p2+0x00(SB), mul0, acc4
ADOXQ mul0, acc3
MULXQ ·p2+0x08(SB), mul0, hlp
ADCXQ acc4, mul0
ADOXQ mul0, acc0
MULXQ ·p2+0x10(SB), mul0, acc4
ADCXQ hlp, mul0
ADOXQ mul0, acc1
MULXQ ·p2+0x18(SB), mul0, acc3
ADCXQ acc4, mul0
ADOXQ mul0, acc2
ADCXQ acc5, acc3
ADOXQ acc5, acc3
MOVQ $0, hlp
// Add bits [511:256] of the result
ADDQ acc0, t0
ADCQ acc1, t1
ADCQ acc2, t2
ADCQ acc3, t3
ADCQ $0, hlp
// Copy result
MOVQ t0, acc4
MOVQ t1, acc5
MOVQ t2, acc6
MOVQ t3, acc7
// Subtract p
SUBQ ·p2+0(SB), acc4
SBBQ ·p2+8(SB), acc5
SBBQ ·p2+16(SB), acc6
SBBQ ·p2+24(SB), acc7
SBBQ $0, hlp
// If the result of the subtraction is negative, restore the previous result
CMOVQCS t0, acc4
CMOVQCS t1, acc5
CMOVQCS t2, acc6
CMOVQCS t3, acc7
RET
noAdxSqr:
MOVQ acc4, mul0
MULQ acc5
MOVQ mul0, acc1
MOVQ mul1, acc2
MOVQ acc4, mul0
MULQ acc6
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, acc3
MOVQ acc4, mul0
MULQ acc7
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, t0
MOVQ acc5, mul0
MULQ acc6
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ acc5, mul0
MULQ acc7
ADDQ hlp, t0
ADCQ $0, mul1
ADDQ mul0, t0
ADCQ $0, mul1
MOVQ mul1, t1
MOVQ acc6, mul0
MULQ acc7
ADDQ mul0, t1
ADCQ $0, mul1
MOVQ mul1, t2
XORQ t3, t3
// *2
ADDQ acc1, acc1
ADCQ acc2, acc2
ADCQ acc3, acc3
ADCQ t0, t0
ADCQ t1, t1
ADCQ t2, t2
ADCQ $0, t3
// Missing products
MOVQ acc4, mul0
MULQ mul0
MOVQ mul0, acc0
MOVQ DX, acc4
MOVQ acc5, mul0
MULQ mul0
ADDQ acc4, acc1
ADCQ mul0, acc2
ADCQ $0, DX
MOVQ DX, acc4
MOVQ acc6, mul0
MULQ mul0
ADDQ acc4, acc3
ADCQ mul0, t0
ADCQ $0, DX
MOVQ DX, acc4
MOVQ acc7, mul0
MULQ mul0
ADDQ acc4, t1
ADCQ mul0, t2
ADCQ DX, t3
// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
// First reduction step
MOVQ acc0, mul0
MULQ ·np+0x00(SB)
MOVQ mul0, hlp
MOVQ ·p2+0x00(SB), mul0
MULQ hlp
ADDQ mul0, acc0
ADCQ $0, mul1
MOVQ mul1, acc5
XORQ acc0, acc0
MOVQ ·p2+0x08(SB), mul0
MULQ hlp
ADDQ acc5, acc1
ADCQ $0, mul1
ADDQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, acc5
MOVQ ·p2+0x10(SB), mul0
MULQ hlp
ADDQ acc5, acc2
ADCQ $0, mul1
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, acc5
MOVQ ·p2+0x18(SB), mul0
MULQ hlp
ADDQ acc5, acc3
ADCQ $0, mul1
ADDQ mul0, acc3
ADCQ mul1, acc0
// Second reduction step
MOVQ acc1, mul0
MULQ ·np+0x00(SB)
MOVQ mul0, hlp
MOVQ ·p2+0x00(SB), mul0
MULQ hlp
ADDQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, acc5
XORQ acc1, acc1
MOVQ ·p2+0x08(SB), mul0
MULQ hlp
ADDQ acc5, acc2
ADCQ $0, mul1
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, acc5
MOVQ ·p2+0x10(SB), mul0
MULQ hlp
ADDQ acc5, acc3
ADCQ $0, mul1
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, acc5
MOVQ ·p2+0x18(SB), mul0
MULQ hlp
ADDQ acc5, acc0
ADCQ $0, mul1
ADDQ mul0, acc0
ADCQ mul1, acc1
// Third reduction step
MOVQ acc2, mul0
MULQ ·np+0x00(SB)
MOVQ mul0, hlp
MOVQ ·p2+0x00(SB), mul0
MULQ hlp
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, acc5
XORQ acc2, acc2
MOVQ ·p2+0x08(SB), mul0
MULQ hlp
ADDQ acc5, acc3
ADCQ $0, mul1
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, acc5
MOVQ ·p2+0x10(SB), mul0
MULQ hlp
ADDQ acc5, acc0
ADCQ $0, mul1
ADDQ mul0, acc0
ADCQ $0, mul1
MOVQ mul1, acc5
MOVQ ·p2+0x18(SB), mul0
MULQ hlp
ADDQ acc5, acc1
ADCQ $0, mul1
ADDQ mul0, acc1
ADCQ mul1, acc2
// Last reduction step
MOVQ acc3, mul0
MULQ ·np+0x00(SB)
MOVQ mul0, hlp
MOVQ ·p2+0x00(SB), mul0
MULQ hlp
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, acc5
XORQ acc3, acc3
MOVQ ·p2+0x08(SB), mul0
MULQ hlp
ADDQ acc5, acc0
ADCQ $0, mul1
ADDQ mul0, acc0
ADCQ $0, mul1
MOVQ mul1, acc5
MOVQ ·p2+0x10(SB), mul0
MULQ hlp
ADDQ acc5, acc1
ADCQ $0, mul1
ADDQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, acc5
MOVQ ·p2+0x18(SB), mul0
MULQ hlp
ADDQ acc5, acc2
ADCQ $0, mul1
ADDQ mul0, acc2
ADCQ mul1, acc3
MOVQ $0, hlp
// Add bits [511:256] of the result
ADDQ acc0, t0
ADCQ acc1, t1
ADCQ acc2, t2
ADCQ acc3, t3
ADCQ $0, hlp
// Copy result
MOVQ t0, acc4
MOVQ t1, acc5
MOVQ t2, acc6
MOVQ t3, acc7
// Subtract p
SUBQ ·p2+0(SB), acc4
SBBQ ·p2+8(SB), acc5
SBBQ ·p2+16(SB), acc6
SBBQ ·p2+24(SB), acc7
SBBQ $0, hlp
// If the result of the subtraction is negative, restore the previous result
CMOVQCS t0, acc4
CMOVQCS t1, acc5
CMOVQCS t2, acc6
CMOVQCS t3, acc7
RET
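// gfpSqrInternal is the same Montgomery reduction specialised for a == b: the
// cross products a_i*a_j (i < j) are computed once and doubled, then the
// squares a_i^2 are folded in. As a hedged one-line equivalence:
//
//	gfpSqrInternal(a) == gfpMulInternal(a, a) // i.e. a*a*R^-1 mod p, R = 2^256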
/* ---------------------------------------*/
// (t3, t2, t1, t0) = 2(acc7, acc6, acc5, acc4)
#define gfpMulBy2Inline \
XORQ mul0, mul0;\
ADDQ acc4, acc4;\
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ ·p2+0(SB), t0;\
SBBQ ·p2+8(SB), t1;\
SBBQ ·p2+16(SB), t2;\
SBBQ ·p2+24(SB), t3;\
SBBQ $0, mul0;\
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
// (acc7, acc6, acc5, acc4) = 2(acc7, acc6, acc5, acc4)
#define gfpMulBy2Inline2 \
XORQ mul0, mul0;\
ADDQ acc4, acc4;\
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ ·p2+0(SB), acc4;\
SBBQ ·p2+8(SB), acc5;\
SBBQ ·p2+16(SB), acc6;\
SBBQ ·p2+24(SB), acc7;\
SBBQ $0, mul0;\
CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS t1, acc5;\
CMOVQCS t2, acc6;\
CMOVQCS t3, acc7;
/* ---------------------------------------*/
// (t3, t2, t1, t0) = (acc7, acc6, acc5, acc4) + (t3, t2, t1, t0)
#define gfpAddInline \
XORQ mul0, mul0;\
ADDQ t0, acc4;\
ADCQ t1, acc5;\
ADCQ t2, acc6;\
ADCQ t3, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ ·p2+0(SB), t0;\
SBBQ ·p2+8(SB), t1;\
SBBQ ·p2+16(SB), t2;\
SBBQ ·p2+24(SB), t3;\
SBBQ $0, mul0;\
CMOVQCS acc4, t0;\
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
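// A hedged Go sketch of gfpAddInline (the two doubling macros above are this
// same pattern with x == y, differing only in which registers keep the result):
//
//	var carry, borrow uint64
//	var s, d [4]uint64
//	for i := 0; i < 4; i++ {
//		s[i], carry = bits.Add64(x[i], y[i], carry) // 257-bit sum in (carry, s)
//	}
//	for i := 0; i < 4; i++ {
//		d[i], borrow = bits.Sub64(s[i], p2[i], borrow)
//	}
//	if borrow > carry {
//		copy(d[:], s[:]) // sum < p: keep the unreduced sum (CMOVQCS above)
//	}
//	// d == (x + y) mod p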
/* ---------------------------------------*/
#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
#define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
#define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
#define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
#define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
/* ---------------------------------------*/
#define axin(off) (32*0 + off)(SP)
#define ayin(off) (32*1 + off)(SP)
#define bxin(off) (32*2 + off)(SP)
#define byin(off) (32*3 + off)(SP)
#define tmp0(off) (32*4 + off)(SP)
#define tmp1(off) (32*5 + off)(SP)
#define cxout(off) (32*6 + off)(SP)
#define rptr (32*7)(SP)
TEXT ·gfp2Mul(SB),NOSPLIT,$256-24
// Move input to stack in order to free registers
MOVQ res+0(FP), CX
MOVQ in1+8(FP), AX
MOVQ in2+16(FP), BX
MOVOU (16*0)(AX), X0
MOVOU (16*1)(AX), X1
MOVOU (16*2)(AX), X2
MOVOU (16*3)(AX), X3
MOVOU X0, axin(16*0)
MOVOU X1, axin(16*1)
MOVOU X2, ayin(16*0)
MOVOU X3, ayin(16*1)
MOVOU (16*0)(BX), X0
MOVOU (16*1)(BX), X1
MOVOU (16*2)(BX), X2
MOVOU (16*3)(BX), X3
MOVOU X0, bxin(16*0)
MOVOU X1, bxin(16*1)
MOVOU X2, byin(16*0)
MOVOU X3, byin(16*1)
// Store pointer to result
MOVQ CX, rptr
LDacc (ayin)
LDt (byin)
CALL gfpMulInternal(SB)
ST (tmp0)
LDacc (axin)
LDt (bxin)
CALL gfpMulInternal(SB)
ST (tmp1)
LDacc (axin)
LDt (ayin)
gfpAddInline
STt (cxout)
LDacc (bxin)
LDt (byin)
gfpAddInline
LDacc (cxout)
CALL gfpMulInternal(SB)
LDt (tmp0)
CALL gfpSubInternal(SB)
LDt (tmp1)
CALL gfpSubInternal(SB)
// Store x
MOVQ rptr, AX
MOVQ acc4, (16*0 + 8*0)(AX)
MOVQ acc5, (16*0 + 8*1)(AX)
MOVQ acc6, (16*0 + 8*2)(AX)
MOVQ acc7, (16*0 + 8*3)(AX)
LDacc (tmp0)
//LDt (tmp1) // not needed: t0..t3 still hold tmp1 (gfpSubInternal does not clobber them)
CALL gfpSubInternal(SB)
CALL gfpSubInternal(SB)
MOVQ rptr, AX
///////////////////////
MOVQ $0, rptr
// Store y
MOVQ acc4, (16*2 + 8*0)(AX)
MOVQ acc5, (16*2 + 8*1)(AX)
MOVQ acc6, (16*2 + 8*2)(AX)
MOVQ acc7, (16*2 + 8*3)(AX)
RET
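// What ·gfp2Mul above computes, written as a hedged Go-style sketch. An
// element is x*u + y, and the formulas below imply u^2 = -2; gfpMul, gfpAdd,
// gfpSub and gfpDouble are illustrative helper names for the Fp operations:
//
//	v0 := gfpMul(a.y, b.y)                            // tmp0
//	v1 := gfpMul(a.x, b.x)                            // tmp1
//	cx := gfpMul(gfpAdd(a.x, a.y), gfpAdd(b.x, b.y))
//	cx = gfpSub(gfpSub(cx, v0), v1)                   // a.x*b.y + a.y*b.x
//	cy := gfpSub(v0, gfpDouble(v1))                   // a.y*b.y - 2*a.x*b.x
//	// result = cx*u + cy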
TEXT ·gfp2MulU(SB),NOSPLIT,$256-24
// Move input to stack in order to free registers
MOVQ res+0(FP), CX
MOVQ in1+8(FP), AX
MOVQ in2+16(FP), BX
MOVOU (16*0)(AX), X0
MOVOU (16*1)(AX), X1
MOVOU (16*2)(AX), X2
MOVOU (16*3)(AX), X3
MOVOU X0, axin(16*0)
MOVOU X1, axin(16*1)
MOVOU X2, ayin(16*0)
MOVOU X3, ayin(16*1)
MOVOU (16*0)(BX), X0
MOVOU (16*1)(BX), X1
MOVOU (16*2)(BX), X2
MOVOU (16*3)(BX), X3
MOVOU X0, bxin(16*0)
MOVOU X1, bxin(16*1)
MOVOU X2, byin(16*0)
MOVOU X3, byin(16*1)
// Store pointer to result
MOVQ CX, rptr
LDacc (ayin)
LDt (byin)
CALL gfpMulInternal(SB)
ST (tmp0)
LDacc (axin)
LDt (bxin)
CALL gfpMulInternal(SB)
ST (tmp1)
LDacc (axin)
LDt (ayin)
gfpAddInline
STt (cxout)
LDacc (bxin)
LDt (byin)
gfpAddInline
LDacc (cxout)
CALL gfpMulInternal(SB)
LDt (tmp0)
CALL gfpSubInternal(SB)
LDt (tmp1)
CALL gfpSubInternal(SB)
gfpMulBy2Inline
XORQ acc4, acc4
XORQ acc5, acc5
XORQ acc6, acc6
XORQ acc7, acc7
CALL gfpSubInternal(SB)
// Store y
MOVQ rptr, AX
MOVQ acc4, (16*2 + 8*0)(AX)
MOVQ acc5, (16*2 + 8*1)(AX)
MOVQ acc6, (16*2 + 8*2)(AX)
MOVQ acc7, (16*2 + 8*3)(AX)
LDacc (tmp0)
LDt (tmp1)
CALL gfpSubInternal(SB)
CALL gfpSubInternal(SB)
MOVQ rptr, AX
///////////////////////
MOVQ $0, rptr
// Store x
MOVQ acc4, (16*0 + 8*0)(AX)
MOVQ acc5, (16*0 + 8*1)(AX)
MOVQ acc6, (16*0 + 8*2)(AX)
MOVQ acc7, (16*0 + 8*3)(AX)
RET
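// ·gfp2MulU returns (a*b)*u instead of a*b: multiplying cx*u + cy by u with
// u^2 = -2 gives cy*u - 2*cx. A hedged sketch in the same illustrative style
// (gfpNeg is another assumed helper):
//
//	v0, v1 := gfpMul(a.y, b.y), gfpMul(a.x, b.x)
//	s := gfpSub(gfpSub(gfpMul(gfpAdd(a.x, a.y), gfpAdd(b.x, b.y)), v0), v1) // a.x*b.y + a.y*b.x
//	cx := gfpSub(v0, gfpDouble(v1)) // a.y*b.y - 2*a.x*b.x  -> stored as x
//	cy := gfpNeg(gfpDouble(s))      // -2*(a.x*b.y + a.y*b.x) -> stored as y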
#undef axin
#undef ayin
#undef bxin
#undef byin
#undef tmp0
#undef tmp1
#undef cxout
#undef rptr
TEXT ·gfp2MulU1(SB),NOSPLIT,$0-16
// Move input to stack in order to free registers
MOVQ res+0(FP), mul1
MOVQ in1+8(FP), AX
//LDacc (axin)
MOVOU (16*2)(AX), X2
MOVOU (16*3)(AX), X3
MOVQ (16*0 + 8*0)(AX), acc4
MOVQ (16*0 + 8*1)(AX), acc5
MOVQ (16*0 + 8*2)(AX), acc6
MOVQ (16*0 + 8*3)(AX), acc7
gfpMulBy2Inline
XORQ acc4, acc4
XORQ acc5, acc5
XORQ acc6, acc6
XORQ acc7, acc7
CALL gfpSubInternal(SB)
MOVOU X2, (16*0)(mul1)
MOVOU X3, (16*1)(mul1)
MOVQ acc4, (16*2 + 8*0)(mul1)
MOVQ acc5, (16*2 + 8*1)(mul1)
MOVQ acc6, (16*2 + 8*2)(mul1)
MOVQ acc7, (16*2 + 8*3)(mul1)
RET
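// ·gfp2MulU1 multiplies a single element by u: (a.x*u + a.y)*u = a.y*u - 2*a.x
// when u^2 = -2. A hedged sketch (helper names illustrative):
//
//	c.x = a.y                    // copied through X2/X3 above
//	c.y = gfpNeg(gfpDouble(a.x)) // 0 - 2*a.x, via gfpMulBy2Inline + gfpSubInternal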
#define axin(off) (32*0 + off)(SP)
#define ayin(off) (32*1 + off)(SP)
#define cxout(off) (32*2 + off)(SP)
#define cyout(off) (32*3 + off)(SP)
#define rptr (32*4)(SP)
TEXT ·gfp2Square(SB),NOSPLIT,$160-16
// Move input to stack in order to free registers
MOVQ res+0(FP), AX
MOVQ in1+8(FP), BX
MOVOU (16*0)(BX), X0
MOVOU (16*1)(BX), X1
MOVOU (16*2)(BX), X2
MOVOU (16*3)(BX), X3
MOVOU X0, axin(16*0)
MOVOU X1, axin(16*1)
MOVOU X2, ayin(16*0)
MOVOU X3, ayin(16*1)
// Store pointer to result
MOVQ AX, rptr
LDacc (axin)
LDt (ayin)
gfpAddInline
STt (cyout)
LDacc (axin)
gfpMulBy2Inline
LDacc (ayin)
CALL gfpSubInternal(SB)
LDt (cyout)
CALL gfpMulInternal(SB)
ST (cyout)
LDacc (axin)
LDt (ayin)
CALL gfpMulInternal(SB)
ST (cxout)
LDt (cyout)
gfpAddInline
// Store y
MOVQ rptr, AX
MOVQ t0, (16*2 + 8*0)(AX)
MOVQ t1, (16*2 + 8*1)(AX)
MOVQ t2, (16*2 + 8*2)(AX)
MOVQ t3, (16*2 + 8*3)(AX)
LDacc (cxout)
gfpMulBy2Inline
// Store x
MOVQ rptr, AX
///////////////////////
MOVQ $0, rptr
MOVQ t0, (16*0 + 8*0)(AX)
MOVQ t1, (16*0 + 8*1)(AX)
MOVQ t2, (16*0 + 8*2)(AX)
MOVQ t3, (16*0 + 8*3)(AX)
RET
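// ·gfp2Square uses (x*u + y)^2 = 2xy*u + (y^2 - 2x^2) with u^2 = -2, and
// rewrites y^2 - 2x^2 as (y - 2x)(y + x) + xy to trade a squaring for a
// multiplication. A hedged sketch with the same illustrative helpers:
//
//	t := gfpMul(gfpSub(a.y, gfpDouble(a.x)), gfpAdd(a.x, a.y)) // (y - 2x)(x + y)
//	v := gfpMul(a.x, a.y)                                      // x*y
//	c.y = gfpAdd(t, v)                                         // y^2 - 2x^2
//	c.x = gfpDouble(v)                                         // 2xy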
TEXT ·gfp2SquareU(SB),NOSPLIT,$160-16
// Move input to stack in order to free registers
MOVQ res+0(FP), AX
MOVQ in1+8(FP), BX
MOVOU (16*0)(BX), X0
MOVOU (16*1)(BX), X1
MOVOU (16*2)(BX), X2
MOVOU (16*3)(BX), X3
MOVOU X0, axin(16*0)
MOVOU X1, axin(16*1)
MOVOU X2, ayin(16*0)
MOVOU X3, ayin(16*1)
// Store pointer to result
MOVQ AX, rptr
LDacc (axin)
LDt (ayin)
gfpAddInline
STt (cxout)
LDacc (axin)
gfpMulBy2Inline
LDacc (ayin)
CALL gfpSubInternal(SB)
LDt (cxout)
CALL gfpMulInternal(SB)
ST (cxout)
LDacc (axin)
LDt (ayin)
CALL gfpMulInternal(SB)
ST (cyout)
LDt (cxout)
gfpAddInline
// Store x
MOVQ rptr, AX
MOVQ t0, (16*0 + 8*0)(AX)
MOVQ t1, (16*0 + 8*1)(AX)
MOVQ t2, (16*0 + 8*2)(AX)
MOVQ t3, (16*0 + 8*3)(AX)
LDacc (cyout)
gfpMulBy2Inline2
gfpMulBy2Inline
XORQ acc4, acc4
XORQ acc5, acc5
XORQ acc6, acc6
XORQ acc7, acc7
CALL gfpSubInternal(SB)
// Store y
MOVQ rptr, AX
///////////////////////
MOVQ $0, rptr
MOVQ acc4, (16*2 + 8*0)(AX)
MOVQ acc5, (16*2 + 8*1)(AX)
MOVQ acc6, (16*2 + 8*2)(AX)
MOVQ acc7, (16*2 + 8*3)(AX)
RET
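// ·gfp2SquareU returns a^2 * u: from a^2 = 2xy*u + (y^2 - 2x^2) and u^2 = -2,
// the result is (y^2 - 2x^2)*u - 4xy. A hedged sketch:
//
//	v := gfpMul(a.x, a.y)
//	c.x = gfpAdd(gfpMul(gfpSub(a.y, gfpDouble(a.x)), gfpAdd(a.x, a.y)), v) // y^2 - 2x^2
//	c.y = gfpNeg(gfpDouble(gfpDouble(v)))                                  // -4xy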
#undef axin
#undef ayin
#undef cxout
#undef cyout
#undef rptr
/* ---------------------------------------*/
#define xin(off) (32*0 + off)(SP)
#define yin(off) (32*1 + off)(SP)
#define zin(off) (32*2 + off)(SP)
#define xout(off) (32*3 + off)(SP)
#define yout(off) (32*4 + off)(SP)
#define zout(off) (32*5 + off)(SP)
#define tmp0(off) (32*6 + off)(SP)
#define tmp2(off) (32*7 + off)(SP)
#define rptr (32*8)(SP)
// func curvePointDoubleComplete(c, a *curvePoint)
TEXT ·curvePointDoubleComplete(SB),NOSPLIT,$288-16
MOVQ res+0(FP), AX
MOVQ in+8(FP), BX
MOVOU (16*0)(BX), X0
MOVOU (16*1)(BX), X1
MOVOU (16*2)(BX), X2
MOVOU (16*3)(BX), X3
MOVOU (16*4)(BX), X4
MOVOU (16*5)(BX), X5
MOVOU X0, xin(16*0)
MOVOU X1, xin(16*1)
MOVOU X2, yin(16*0)
MOVOU X3, yin(16*1)
MOVOU X4, zin(16*0)
MOVOU X5, zin(16*1)
// Store pointer to result
MOVQ AX, rptr
LDacc (yin)
CALL gfpSqrInternal(SB) // t0 := Y^2
ST (tmp0)
gfpMulBy2Inline2 // Z3 := t0 + t0
gfpMulBy2Inline2 // Z3 := Z3 + Z3
gfpMulBy2Inline // Z3 := Z3 + Z3
STt (zout)
LDacc (zin)
CALL gfpSqrInternal(SB) // t2 := Z^2
MOVQ acc4, acc0
MOVQ acc5, acc1
MOVQ acc6, acc2
MOVQ acc7, acc3
gfpMulBy2Inline2
gfpMulBy2Inline2
gfpMulBy2Inline2
gfpMulBy2Inline2
MOVQ acc0, t0
MOVQ acc1, t1
MOVQ acc2, t2
MOVQ acc3, t3
CALL gfpSubInternal(SB) // t2 := 3b * t2
ST (tmp2)
LDt (zout)
CALL gfpMulInternal(SB) // X3 := Z3 * t2
ST (xout)
LDacc (tmp0)
LDt (tmp2)
gfpAddInline // Y3 := t0 + t2
STt (yout)
LDacc (yin)
LDt (zin)
CALL gfpMulInternal(SB) // t1 := YZ
LDt (zout)
CALL gfpMulInternal(SB) // Z3 := t1 * Z3
MOVQ rptr, AX
// Store Z
MOVQ acc4, (16*4 + 8*0)(AX)
MOVQ acc5, (16*4 + 8*1)(AX)
MOVQ acc6, (16*4 + 8*2)(AX)
MOVQ acc7, (16*4 + 8*3)(AX)
LDacc (tmp2)
gfpMulBy2Inline
LDacc (tmp2)
gfpAddInline // t2 := t2 + t2 + t2
LDacc (tmp0)
CALL gfpSubInternal(SB) // t0 := t0 - t2
ST (tmp0)
LDt (yout)
CALL gfpMulInternal(SB) // Y3 = t0 * Y3
LDt (xout)
gfpAddInline // Y3 := X3 + Y3
MOVQ rptr, AX
// Store y
MOVQ t0, (16*2 + 8*0)(AX)
MOVQ t1, (16*2 + 8*1)(AX)
MOVQ t2, (16*2 + 8*2)(AX)
MOVQ t3, (16*2 + 8*3)(AX)
LDacc (xin)
LDt (yin)
CALL gfpMulInternal(SB) // t1 := XY
LDt (tmp0)
CALL gfpMulInternal(SB) // X3 := t0 * t1
gfpMulBy2Inline // X3 := X3 + X3
MOVQ rptr, AX
MOVQ $0, rptr
// Store x
MOVQ t0, (16*0 + 8*0)(AX)
MOVQ t1, (16*0 + 8*1)(AX)
MOVQ t2, (16*0 + 8*2)(AX)
MOVQ t3, (16*0 + 8*3)(AX)
RET
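// The sequence above matches the complete doubling formulas for a = 0 short
// Weierstrass curves (cf. Renes-Costello-Batina 2016, Alg. 9). The curve
// constant satisfies 3b = 15 (b = 5), which is why 3b*t2 is built as
// 16*t2 - t2. A hedged Go-style sketch (helper names illustrative):
//
//	t0 := gfpSqr(Y)                           // Y^2
//	z3 := gfpDouble(gfpDouble(gfpDouble(t0))) // 8*Y^2
//	t2 := gfpMulBy15(gfpSqr(Z))               // 3b*Z^2, built above as 16*Z^2 - Z^2
//	x3 := gfpMul(z3, t2)
//	y3 := gfpAdd(t0, t2)
//	z3 = gfpMul(gfpMul(Y, Z), z3)
//	t0 = gfpSub(t0, gfpTriple(t2))            // Y^2 - 9b*Z^2
//	y3 = gfpAdd(x3, gfpMul(t0, y3))
//	x3 = gfpDouble(gfpMul(t0, gfpMul(X, Y)))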
#undef xin
#undef yin
#undef zin
#undef xout
#undef yout
#undef zout
#undef tmp0
#undef tmp2
#undef rptr
/* ---------------------------------------*/
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define z2in(off) (32*5 + off)(SP)
#define xout(off) (32*6 + off)(SP)
#define yout(off) (32*7 + off)(SP)
#define zout(off) (32*8 + off)(SP)
#define tmp0(off) (32*9 + off)(SP)
#define tmp1(off) (32*10 + off)(SP)
#define tmp2(off) (32*11 + off)(SP)
#define tmp3(off) (32*12 + off)(SP)
#define tmp4(off) (32*13 + off)(SP)
#define rptr (32*14)(SP)
#define curvePointAddCompleteInline \
LDacc (x1in) \
LDt (x2in) \
CALL gfpMulInternal(SB) \ // t0 := X1X2
ST (tmp0) \
LDacc (y1in) \
LDt (y2in) \
CALL gfpMulInternal(SB) \ // t1 := Y1Y2
ST (tmp1) \
LDacc (z1in) \
LDt (z2in) \
CALL gfpMulInternal(SB) \ // t2 := Z1Z2
ST (tmp2) \
\
LDacc (x1in) \
LDt (y1in) \
gfpAddInline \
STt (tmp3) \ // t3 := X1 + Y1
LDacc (x2in) \
LDt (y2in) \
gfpAddInline \
LDacc (tmp3) \
CALL gfpMulInternal(SB) \ // t3 := t3 * t4 = (X1 + Y1) * (X2 + Y2)
ST (tmp3) \
LDacc (tmp0) \
LDt (tmp1) \
gfpAddInline \
LDacc (tmp3) \
CALL gfpSubInternal(SB) \ // t3 := t3 - t4 = X1Y2 + X2Y1
ST (tmp3) \
\
LDacc (y1in) \
LDt (z1in) \
gfpAddInline \ // t4 := Y1 + Z1
STt (tmp4) \
LDacc (y2in) \
LDt (z2in) \
gfpAddInline \
LDacc (tmp4) \
CALL gfpMulInternal(SB) \ // t4 := t4 * X3 = (Y1 + Z1)(Y2 + Z2)
ST (tmp4) \
LDacc (tmp1) \
LDt (tmp2) \
gfpAddInline \
LDacc (tmp4) \
CALL gfpSubInternal(SB) \ // t4 := t4 - X3 = Y1Z2 + Y2Z1
ST (tmp4) \
\
LDacc (z1in) \
LDt (x1in) \
gfpAddInline \ // X3 := X1 + Z1
STt (xout) \
LDacc (z2in) \
LDt (x2in) \
gfpAddInline \
LDacc (xout) \
CALL gfpMulInternal(SB) \ // X3 := X3 * Y3
ST (xout) \
LDacc (tmp0) \
LDt (tmp2) \
gfpAddInline \
LDacc (xout) \
CALL gfpSubInternal(SB) \ // Y3 := X3 - Y3 = X1Z2 + X2Z1
ST (yout) \
\
LDacc (tmp0) \
gfpMulBy2Inline \
LDacc (tmp0) \
gfpAddInline \ // t0 := t0 + t0 + t0 = 3X1X2
STt (tmp0) \
\
LDacc (tmp2) \
gfpMulBy2Inline2 \
gfpMulBy2Inline2 \
gfpMulBy2Inline2 \
gfpMulBy2Inline2 \
LDt (tmp2) \
CALL gfpSubInternal(SB) \ // t2 := 3b * t2 = 3bZ1Z2
ST (tmp2) \
\
LDt (tmp1) \
gfpAddInline \ // Z3 := t1 + t2 = Y1Y2 + 3bZ1Z2
STt (zout) \
\
LDacc (tmp1) \
LDt (tmp2) \
CALL gfpSubInternal(SB) \ // t1 := t1 - t2 = Y1Y2 - 3bZ1Z2
ST (tmp1) \
\
LDacc (yout) \
gfpMulBy2Inline2 \
gfpMulBy2Inline2 \
gfpMulBy2Inline2 \
gfpMulBy2Inline2 \
LDt (yout) \
CALL gfpSubInternal(SB) \ // Y3 = 3b * Y3 = 3b(X1Z2 + X2Z1)
ST (yout) \
\
LDt (tmp4) \
CALL gfpMulInternal(SB) \ // X3 := t4 * Y3 = 3b(X1Z2 + X2Z1)(Y1Z2 + Y2Z1)
ST (xout) \
\
LDacc (tmp1) \
LDt (tmp3) \
CALL gfpMulInternal(SB) \ // t2 := t3 * t1 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2)
LDt (xout) \
CALL gfpSubInternal(SB) \ // X3 := t2 - X3 = (X1Y2 + X2Y1)(Y1Y2 - 3bZ1Z2) - 3b(Y1Z2 + Y2Z1)(X1Z2 + X2Z1)
MOVQ rptr, AX \
\// Store x
MOVQ acc4, (16*0 + 8*0)(AX) \
MOVQ acc5, (16*0 + 8*1)(AX) \
MOVQ acc6, (16*0 + 8*2)(AX) \
MOVQ acc7, (16*0 + 8*3)(AX) \
\
LDacc (yout) \
LDt (tmp0) \
CALL gfpMulInternal(SB) \ // Y3 := Y3 * t0 = 9bX1X2(X1Z2 + X2Z1)
ST (yout) \
\
LDacc (tmp1) \
LDt (zout) \
CALL gfpMulInternal(SB) \ // t1 := t1 * Z3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2)
LDt (yout) \
gfpAddInline \ // Y3 := t1 + Y3 = (Y1Y2 + 3bZ1Z2)(Y1Y2 - 3bZ1Z2) + 9bX1X2(X1Z2 + X2Z1)
MOVQ rptr, AX \
\// Store y
MOVQ t0, (16*2 + 8*0)(AX) \
MOVQ t1, (16*2 + 8*1)(AX) \
MOVQ t2, (16*2 + 8*2)(AX) \
MOVQ t3, (16*2 + 8*3)(AX) \
\
LDacc (tmp0) \
LDt (tmp3) \
CALL gfpMulInternal(SB) \ // t0 := t0 * t3 = 3X1X2(X1Y2 + X2Y1)
ST (tmp0) \
LDacc (zout) \
LDt (tmp4) \
CALL gfpMulInternal(SB) \ // Z3 := Z3 * t4 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2)
LDt (tmp0) \
gfpAddInline \ // Z3 := Z3 + t0 = (Y1Z2 + Y2Z1)(Y1Y2 + 3bZ1Z2) + 3X1X2(X1Y2 + X2Y1)
MOVQ rptr, AX \
MOVQ $0, rptr \
\// Store z
MOVQ t0, (16*4 + 8*0)(AX) \
MOVQ t1, (16*4 + 8*1)(AX) \
MOVQ t2, (16*4 + 8*2)(AX) \
MOVQ t3, (16*4 + 8*3)(AX) \
// func curvePointAddComplete(c, a, b *curvePoint)
TEXT ·curvePointAddComplete(SB),0,$480-24
// Move input to stack in order to free registers
MOVQ res+0(FP), AX
MOVQ in1+8(FP), BX
MOVQ in2+16(FP), CX
CMPB ·supportAVX2+0(SB), $0x01
JEQ pointadd_avx2
MOVOU (16*0)(BX), X0
MOVOU (16*1)(BX), X1
MOVOU (16*2)(BX), X2
MOVOU (16*3)(BX), X3
MOVOU (16*4)(BX), X4
MOVOU (16*5)(BX), X5
MOVOU X0, x1in(16*0)
MOVOU X1, x1in(16*1)
MOVOU X2, y1in(16*0)
MOVOU X3, y1in(16*1)
MOVOU X4, z1in(16*0)
MOVOU X5, z1in(16*1)
MOVOU (16*0)(CX), X0
MOVOU (16*1)(CX), X1
MOVOU (16*2)(CX), X2
MOVOU (16*3)(CX), X3
MOVOU (16*4)(CX), X4
MOVOU (16*5)(CX), X5
MOVOU X0, x2in(16*0)
MOVOU X1, x2in(16*1)
MOVOU X2, y2in(16*0)
MOVOU X3, y2in(16*1)
MOVOU X4, z2in(16*0)
MOVOU X5, z2in(16*1)
// Store pointer to result
MOVQ AX, rptr
curvePointAddCompleteInline
RET
pointadd_avx2:
VMOVDQU (32*0)(BX), Y0
VMOVDQU (32*1)(BX), Y1
VMOVDQU (32*2)(BX), Y2
VMOVDQU Y0, x1in(32*0)
VMOVDQU Y1, y1in(32*0)
VMOVDQU Y2, z1in(32*0)
VMOVDQU (32*0)(CX), Y0
VMOVDQU (32*1)(CX), Y1
VMOVDQU (32*2)(CX), Y2
VMOVDQU Y0, x2in(32*0)
VMOVDQU Y1, y2in(32*0)
VMOVDQU Y2, z2in(32*0)
// Store pointer to result
MOVQ AX, rptr
curvePointAddCompleteInline
VZEROUPPER
RET
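// ·curvePointAddComplete follows the complete addition formulas for a = 0
// curves (cf. Renes-Costello-Batina 2016, Alg. 7), again with 3b = 15 built
// from repeated doublings. A hedged, condensed sketch of the macro (all
// operations mod p):
//
//	t0, t1, t2 := X1*X2, Y1*Y2, Z1*Z2
//	t3 := (X1+Y1)*(X2+Y2) - t0 - t1 // X1*Y2 + X2*Y1
//	t4 := (Y1+Z1)*(Y2+Z2) - t1 - t2 // Y1*Z2 + Y2*Z1
//	t5 := (X1+Z1)*(X2+Z2) - t0 - t2 // X1*Z2 + X2*Z1
//	X3 := t3*(t1 - 15*t2) - 15*t4*t5
//	Y3 := (t1 + 15*t2)*(t1 - 15*t2) + 45*t0*t5
//	Z3 := t4*(t1 + 15*t2) + 3*t0*t3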
#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef z2in
#undef xout
#undef yout
#undef zout
#undef tmp0
#undef tmp1
#undef tmp2
#undef tmp3
#undef tmp4
#undef rptr
/* ---------------------------------------*/
/*
// gfpIsZero returns 1 in AX if [acc4..acc7] represents zero and zero
// otherwise. It writes to [acc4..acc7], t0 and t1.
TEXT gfpIsZero(SB),NOSPLIT,$0
// AX contains a flag that is set if the input is zero.
XORQ AX, AX
MOVQ $1, t1
// Check whether [acc4..acc7] are all zero.
MOVQ acc4, t0
ORQ acc5, t0
ORQ acc6, t0
ORQ acc7, t0
// Set the zero flag if so. (CMOV of a constant to a register doesn't
// appear to be supported in Go. Thus t1 = 1.)
CMOVQEQ t1, AX
// XOR [acc4..acc7] with P and compare with zero again.
XORQ ·p2+0(SB), acc4
XORQ ·p2+8(SB), acc5
XORQ ·p2+16(SB), acc6
XORQ ·p2+24(SB), acc7
ORQ acc5, acc4
ORQ acc6, acc4
ORQ acc7, acc4
// Set the zero flag if so.
CMOVQEQ t1, AX
RET
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define z2in(off) (32*5 + off)(SP)
#define xout(off) (32*6 + off)(SP)
#define yout(off) (32*7 + off)(SP)
#define zout(off) (32*8 + off)(SP)
#define u1(off) (32*9 + off)(SP)
#define u2(off) (32*10 + off)(SP)
#define s1(off) (32*11 + off)(SP)
#define s2(off) (32*12 + off)(SP)
#define z1sqr(off) (32*13 + off)(SP)
#define z2sqr(off) (32*14 + off)(SP)
#define h(off) (32*15 + off)(SP)
#define r(off) (32*16 + off)(SP)
#define hsqr(off) (32*17 + off)(SP)
#define rsqr(off) (32*18 + off)(SP)
#define hcub(off) (32*19 + off)(SP)
#define rptr (32*20)(SP)
#define points_eq (32*20+8)(SP)
#define curvePointAddInline \
\// Begin point add
LDacc (z2in) \
CALL gfpSqrInternal(SB) \// z2ˆ2
ST (z2sqr) \
LDt (z2in) \
CALL gfpMulInternal(SB) \// z2ˆ3
LDt (y1in) \
CALL gfpMulInternal(SB) \// s1 = z2ˆ3*y1
ST (s1) \
\
LDacc (z1in) \
CALL gfpSqrInternal(SB) \// z1ˆ2
ST (z1sqr) \
LDt (z1in) \
CALL gfpMulInternal(SB) \// z1ˆ3
LDt (y2in) \
CALL gfpMulInternal(SB) \// s2 = z1ˆ3*y2
ST (s2) \
\
LDt (s1) \
CALL gfpSubInternal(SB) \// r = s2 - s1
ST (r) \
CALL gfpIsZero(SB) \
MOVQ AX, points_eq \
\
LDacc (z2sqr) \
LDt (x1in) \
CALL gfpMulInternal(SB) \// u1 = x1 * z2ˆ2
ST (u1) \
LDacc (z1sqr) \
LDt (x2in) \
CALL gfpMulInternal(SB) \// u2 = x2 * z1ˆ2
ST (u2) \
\
LDt (u1) \
CALL gfpSubInternal(SB) \// h = u2 - u1
ST (h) \
CALL gfpIsZero(SB) \
ANDQ points_eq, AX \
MOVQ AX, points_eq \
\
LDacc (r) \
CALL gfpSqrInternal(SB) \// rsqr = rˆ2
ST (rsqr) \
\
LDacc (h) \
CALL gfpSqrInternal(SB) \// hsqr = hˆ2
ST (hsqr) \
\
LDt (h) \
CALL gfpMulInternal(SB) \// hcub = hˆ3
ST (hcub) \
\
LDt (s1) \
CALL gfpMulInternal(SB) \
ST (s2) \
\
LDacc (z1in) \
LDt (z2in) \
CALL gfpMulInternal(SB) \// z1 * z2
LDt (h) \
CALL gfpMulInternal(SB) \// z1 * z2 * h
ST (zout) \
\
LDacc (hsqr) \
LDt (u1) \
CALL gfpMulInternal(SB) \// hˆ2 * u1
ST (u2) \
\
gfpMulBy2Inline \// u1 * hˆ2 * 2, inline
LDacc (rsqr) \
CALL gfpSubInternal(SB) \// rˆ2 - u1 * hˆ2 * 2
\
LDt (hcub) \
CALL gfpSubInternal(SB) \
ST (xout) \
\
MOVQ acc4, t0 \
MOVQ acc5, t1 \
MOVQ acc6, t2 \
MOVQ acc7, t3 \
LDacc (u2) \
CALL gfpSubInternal(SB) \
\
LDt (r) \
CALL gfpMulInternal(SB) \
\
LDt (s2) \
CALL gfpSubInternal(SB) \
ST (yout) \
// func curvePointAdd(c, a, b *curvePoint) int
TEXT ·curvePointAdd(SB),0,$680-32
// Move input to stack in order to free registers
MOVQ res+0(FP), AX
MOVQ in1+8(FP), BX
MOVQ in2+16(FP), CX
MOVOU (16*0)(BX), X0
MOVOU (16*1)(BX), X1
MOVOU (16*2)(BX), X2
MOVOU (16*3)(BX), X3
MOVOU (16*4)(BX), X4
MOVOU (16*5)(BX), X5
MOVOU X0, x1in(16*0)
MOVOU X1, x1in(16*1)
MOVOU X2, y1in(16*0)
MOVOU X3, y1in(16*1)
MOVOU X4, z1in(16*0)
MOVOU X5, z1in(16*1)
MOVOU (16*0)(CX), X0
MOVOU (16*1)(CX), X1
MOVOU (16*2)(CX), X2
MOVOU (16*3)(CX), X3
MOVOU (16*4)(CX), X4
MOVOU (16*5)(CX), X5
MOVOU X0, x2in(16*0)
MOVOU X1, x2in(16*1)
MOVOU X2, y2in(16*0)
MOVOU X3, y2in(16*1)
MOVOU X4, z2in(16*0)
MOVOU X5, z2in(16*1)
// Store pointer to result
MOVQ AX, rptr
curvePointAddInline
MOVOU xout(16*0), X0
MOVOU xout(16*1), X1
MOVOU yout(16*0), X2
MOVOU yout(16*1), X3
MOVOU zout(16*0), X4
MOVOU zout(16*1), X5
// Finally output the result
MOVQ rptr, AX
MOVQ $0, rptr
MOVOU X0, (16*0)(AX)
MOVOU X1, (16*1)(AX)
MOVOU X2, (16*2)(AX)
MOVOU X3, (16*3)(AX)
MOVOU X4, (16*4)(AX)
MOVOU X5, (16*5)(AX)
MOVQ points_eq, AX
MOVQ AX, ret+24(FP)
RET
*/