//go:build !purego

#include "textflag.h"
#include "gfp_macros_amd64.s"

// Field arithmetic on 256-bit values held as four 64-bit limbs, least
// significant limb at byte offset 0 (carries propagate from offset 0 upward
// in every routine below).
//
// Names that come from outside this file:
//   - ·p2          four 64-bit words, read at offsets 0x00..0x18.
//   - ·np          one 64-bit word, multiplied with the low accumulator limb
//                  at the start of every reduction step.
//   - ·supportADX  byte flag selecting the MULX/ADCX/ADOX code path.
//   - x_ptr, y_ptr, res_ptr, acc0..acc5, t0 and the macros loadBlock,
//     storeBlock, gfpCarry, gfpCarryWithoutCarry.
// NOTE(review): presumably p2 is the field modulus p, np is -p^-1 mod 2^64,
// and the register aliases/macros live in gfp_macros_amd64.s — confirm
// against those definitions; none of them are visible here.

// gfpNeg stores (p2 - a), passed through gfpCarryWithoutCarry, at c.
//   c+0(FP)  destination pointer
//   a+8(FP)  operand pointer
TEXT ·gfpNeg(SB),NOSPLIT,$0-16
	// R11:R8 = p2
	MOVQ ·p2+0(SB), R8
	MOVQ ·p2+8(SB), R9
	MOVQ ·p2+16(SB), R10
	MOVQ ·p2+24(SB), R11

	// R11:R8 -= a
	MOVQ a+8(FP), DI
	SUBQ 0(DI), R8
	SBBQ 8(DI), R9
	SBBQ 16(DI), R10
	SBBQ 24(DI), R11

	// NOTE(review): presumably a conditional subtraction of p (so that a == 0
	// yields 0 rather than p), with R12,R13,R14,CX as scratch — confirm in the
	// macro definition.
	gfpCarryWithoutCarry(R8,R9,R10,R11, R12,R13,R14,CX)

	MOVQ c+0(FP), DI
	storeBlock(R8,R9,R10,R11, 0(DI))
	RET

// gfpAdd stores (a + b), passed through gfpCarry, at c.
//   c+0(FP)   destination pointer
//   a+8(FP)   first operand pointer
//   b+16(FP)  second operand pointer
TEXT ·gfpAdd(SB),NOSPLIT,$0-24
	MOVQ a+8(FP), DI
	MOVQ b+16(FP), SI

	loadBlock(0(DI), R8,R9,R10,R11)
	// R12 collects the carry out of the 256-bit addition. The flags written by
	// XORQ are overwritten by the ADDQ that follows.
	XORQ R12, R12

	ADDQ 0(SI), R8
	ADCQ 8(SI), R9
	ADCQ 16(SI), R10
	ADCQ 24(SI), R11
	ADCQ $0, R12

	// NOTE(review): presumably reduces R12:R11..R8 below p using R13,R14,CX,AX
	// as scratch — confirm in the macro definition.
	gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12)

	MOVQ c+0(FP), DI
	storeBlock(R8,R9,R10,R11, 0(DI))
	RET

// gfpDouble stores (2 * operand), passed through gfpCarry, at the destination.
//   a+0(FP)  destination pointer
//   b+8(FP)  operand pointer
// NOTE(review): the FP names here (a = destination, b = operand) differ from
// gfpNeg/gfpAdd, where c is the destination. The assembler only uses the
// offsets, but go vet compares the names with the Go declaration — confirm
// they match.
TEXT ·gfpDouble(SB),NOSPLIT,$0-16
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI

	loadBlock(0(SI), R8,R9,R10,R11)
	XORQ R12, R12

	// R12:R11..R8 = 2 * operand
	ADDQ R8, R8
	ADCQ R9, R9
	ADCQ R10, R10
	ADCQ R11, R11
	ADCQ $0, R12

	gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12)

	storeBlock(R8,R9,R10,R11, 0(DI))
	RET

// gfpTriple stores (3 * operand) at the destination, computed as a doubling
// followed by one more addition of the operand, with gfpCarry applied after
// each of the two steps.
//   a+0(FP)  destination pointer
//   b+8(FP)  operand pointer
// NOTE(review): same FP naming remark as gfpDouble.
TEXT ·gfpTriple(SB),NOSPLIT,$0-16
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI

	loadBlock(0(SI), R8,R9,R10,R11)
	XORQ R12, R12

	// R12:R11..R8 = 2 * operand
	ADDQ R8, R8
	ADCQ R9, R9
	ADCQ R10, R10
	ADCQ R11, R11
	ADCQ $0, R12

	gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12)

	// Add the operand once more. It is re-read from memory; nothing has been
	// stored to the destination yet, so this also works when both pointers
	// refer to the same element.
	XORQ R12, R12
	ADDQ 0(SI), R8
	ADCQ 8(SI), R9
	ADCQ 16(SI), R10
	ADCQ 24(SI), R11
	ADCQ $0, R12

	gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12)

	storeBlock(R8,R9,R10,R11, 0(DI))
	RET

// gfpSub stores (a - b) at c, adding p2 back when the subtraction borrows.
// The correction is selected with conditional moves, not a branch.
//   c+0(FP)   destination pointer
//   a+8(FP)   minuend pointer
//   b+16(FP)  subtrahend pointer
TEXT ·gfpSub(SB),NOSPLIT,$0-24
	MOVQ a+8(FP), DI
	MOVQ b+16(FP), SI

	loadBlock(0(DI), R8,R9,R10,R11)

	// CX:R14:R13:R12 = p2, the correction applied on borrow.
	MOVQ ·p2+0(SB), R12
	MOVQ ·p2+8(SB), R13
	MOVQ ·p2+16(SB), R14
	MOVQ ·p2+24(SB), CX
	// AX = 0. The flags written here are overwritten by the SUBQ below.
	XORQ AX, AX

	SUBQ 0(SI), R8
	SBBQ 8(SI), R9
	SBBQ 16(SI), R10
	SBBQ 24(SI), R11

	// No borrow (carry clear): replace the correction with zero.
	CMOVQCC AX, R12
	CMOVQCC AX, R13
	CMOVQCC AX, R14
	CMOVQCC AX, CX

	ADDQ R12, R8
	ADCQ R13, R9
	ADCQ R14, R10
	ADCQ CX, R11

	MOVQ c+0(FP), DI
	storeBlock(R8,R9,R10,R11, 0(DI))
	RET

// gfpMul multiplies the elements at in1 and in2 and stores the result at res.
//   res+0(FP)   destination pointer
//   in1+8(FP)   first operand pointer  (x)
//   in2+16(FP)  second operand pointer (y)
//
// The product is built in four rounds. Round i adds x * y[i] into the
// accumulator, then a reduction step computes Y = low64(lowest limb * np) and
// adds Y * p2, after which the lowest accumulator register is recycled as the
// new top limb. The surviving limbs acc4,acc5,acc0,acc1 plus the carry in acc2
// are handed to gfpCarry.
// NOTE(review): this looks like word-by-word Montgomery multiplication —
// confirm against the definitions of np and p2.
//
// Two code paths: MULX/ADCX/ADOX when ·supportADX is non-zero, plain MULQ
// otherwise.
// NOTE(review): MULX belongs to BMI2 rather than ADX; presumably supportADX
// is only set when both are available — confirm where it is initialised.
// NOTE(review): the FP names (res, in1, in2) differ from the c/a/b names used
// by gfpAdd/gfpSub; confirm they match the Go declaration.
TEXT ·gfpMul(SB),NOSPLIT,$0-24
	MOVQ in1+8(FP), x_ptr
	MOVQ in2+16(FP), y_ptr

	CMPB ·supportADX(SB), $0
	JE noAdxMul

	// ADX path. res_ptr is used as a constant zero register until it is
	// reloaded from res+0(FP) at the end. The XORQs also clear CF and OF, the
	// two flags that carry the ADCXQ and ADOXQ chains.
	XORQ acc5, acc5
	XORQ res_ptr, res_ptr

	// x * y[0]
	MOVQ (8*0)(y_ptr), DX
	MULXQ (8*0)(x_ptr), acc0, acc1

	MULXQ (8*1)(x_ptr), AX, acc2
	ADCXQ AX, acc1

	MULXQ (8*2)(x_ptr), AX, acc3
	ADCXQ AX, acc2

	MULXQ (8*3)(x_ptr), AX, acc4
	ADCXQ AX, acc3
	ADCXQ acc5, acc4

	// First reduction step
	MOVQ acc0, DX
	// DX = low 64 bits of acc0 * np (Y); the high half in AX is discarded.
	MULXQ ·np+0x00(SB), DX, AX

	MULXQ ·p2+0x00(SB), AX, t0
	ADOXQ AX, acc0

	MULXQ ·p2+0x08(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc1

	MULXQ ·p2+0x10(SB), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc2

	MULXQ ·p2+0x18(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc3

	ADCXQ res_ptr, BX
	ADOXQ BX, acc4
	ADOXQ res_ptr, acc5
	// acc0 is recycled: it is read as a zero operand and then receives the
	// top carry in the next round. XORQ also clears CF and OF.
	// NOTE(review): the original author remarked that this line "seems
	// optional"; that has not been verified here, so it is kept.
	XORQ acc0, acc0

	// x * y[1]
	MOVQ (8*1)(y_ptr), DX
	MULXQ (8*0)(x_ptr), AX, t0
	ADOXQ AX, acc1

	MULXQ (8*1)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc2

	MULXQ (8*2)(x_ptr), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc3

	MULXQ (8*3)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc4

	ADCXQ acc0, BX
	ADOXQ BX, acc5
	ADOXQ res_ptr, acc0

	// Second reduction step
	MOVQ acc1, DX
	MULXQ ·np+0x00(SB), DX, AX

	MULXQ ·p2+0x00(SB), AX, t0
	ADOXQ AX, acc1

	MULXQ ·p2+0x08(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc2

	MULXQ ·p2+0x10(SB), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc3

	MULXQ ·p2+0x18(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc4

	ADCXQ res_ptr, BX
	ADOXQ BX, acc5
	ADOXQ res_ptr, acc0
	// acc1 is recycled as the next top limb; XORQ also clears CF and OF.
	// NOTE(review): flagged as "seems optional" by the original author;
	// unverified, kept.
	XORQ acc1, acc1

	// x * y[2]
	MOVQ (8*2)(y_ptr), DX
	MULXQ (8*0)(x_ptr), AX, t0
	ADOXQ AX, acc2

	MULXQ (8*1)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc3

	MULXQ (8*2)(x_ptr), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc4

	MULXQ (8*3)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc5

	ADCXQ res_ptr, BX
	ADOXQ BX, acc0
	ADOXQ res_ptr, acc1

	// Third reduction step
	MOVQ acc2, DX
	MULXQ ·np+0x00(SB), DX, AX

	MULXQ ·p2+0x00(SB), AX, t0
	ADOXQ AX, acc2

	MULXQ ·p2+0x08(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc3

	MULXQ ·p2+0x10(SB), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc4

	MULXQ ·p2+0x18(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc5

	ADCXQ res_ptr, BX
	ADOXQ BX, acc0
	ADOXQ res_ptr, acc1
	// acc2 is recycled as the next top limb; XORQ also clears CF and OF.
	// NOTE(review): flagged as "seems optional" by the original author;
	// unverified, kept.
	XORQ acc2, acc2

	// x * y[3]
	MOVQ (8*3)(y_ptr), DX
	MULXQ (8*0)(x_ptr), AX, t0
	ADOXQ AX, acc3

	MULXQ (8*1)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc4

	MULXQ (8*2)(x_ptr), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc5

	MULXQ (8*3)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc0

	ADCXQ res_ptr, BX
	ADOXQ BX, acc1
	ADOXQ res_ptr, acc2

	// Last reduction step
	MOVQ acc3, DX
	MULXQ ·np+0x00(SB), DX, AX

	MULXQ ·p2+0x00(SB), AX, t0
	ADOXQ AX, acc3

	MULXQ ·p2+0x08(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc4

	MULXQ ·p2+0x10(SB), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc5

	MULXQ ·p2+0x18(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc0

	ADCXQ res_ptr, BX
	ADOXQ BX, acc1
	ADOXQ res_ptr, acc2

	// Result limbs are acc4,acc5,acc0,acc1 with the carry in acc2. x_ptr,
	// acc3, t0 and BX are no longer needed and are passed as scratch.
	gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,BX,acc2)
	// res_ptr served as the zero register above; load the real destination.
	MOVQ res+0(FP), res_ptr
	storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr))
	RET

noAdxMul:
	// Fallback path using MULQ (result in DX:AX). t0 holds y[i], later Y;
	// BX carries the running high word between limbs.

	// x * y[0]
	MOVQ (8*0)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc0
	MOVQ DX, acc1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	XORQ acc5, acc5

	// First reduction step
	MOVQ acc0, AX
	MULQ ·np+0x00(SB)
	// t0 = Y = low 64 bits of acc0 * np
	MOVQ AX, t0

	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc1
	ADCQ $0, DX
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ BX, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ DX, acc4
	ADCQ $0, acc5
	// acc0 is recycled as the new top limb; it receives the final carry of
	// the next round via ADCQ $0, acc0.
	// NOTE(review): flagged as "seems optional" by the original author;
	// unverified, kept.
	XORQ acc0, acc0

	// x * y[1]
	MOVQ (8*1)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0

	// Second reduction step
	MOVQ acc1, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0

	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	// acc1 is recycled as the new top limb.
	// NOTE(review): flagged as "seems optional" by the original author;
	// unverified, kept.
	XORQ acc1, acc1

	// x * y[2]
	MOVQ (8*2)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1

	// Third reduction step
	MOVQ acc2, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0

	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ BX, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	// acc2 is recycled as the new top limb.
	// NOTE(review): flagged as "seems optional" by the original author;
	// unverified, kept.
	XORQ acc2, acc2

	// x * y[3]
	MOVQ (8*3)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2

	// Last reduction step
	MOVQ acc3, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0

	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ BX, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ BX, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2

	// Result limbs are acc4,acc5,acc0,acc1 with the carry in acc2.
	gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,BX,acc2)
	MOVQ res+0(FP), res_ptr
	storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr))
	RET

/* ---------------------------------------*/
// func gfpFromMont(res, in *gfP)
//
// Runs the four reduction steps of gfpMul on the input alone, with no
// multiplication by a second operand, then applies gfpCarryWithoutCarry and
// stores the four result limbs at res.
//   res+0(FP)  destination pointer
//   in+8(FP)   operand pointer
// The argument size (two pointers, 16 bytes) is declared explicitly, as it is
// for the other functions in this file.
TEXT ·gfpFromMont(SB),NOSPLIT,$0-16
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr

	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3
	XORQ acc4, acc4

	// Only reduce, no multiplications are needed
	// First reduction step
	MOVQ acc0, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0 // Y

	// Calculate next T = T+Y*P
	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc0 // acc0 is free now
	ADCQ $0, DX
	MOVQ DX, BX // carry
	// acc0 is recycled as the top limb of the third step.
	XORQ acc0, acc0

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc1
	ADCQ $0, DX
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, BX // carry

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ BX, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, BX // carry

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ DX, acc4
	XORQ acc5, acc5

	// Second reduction step
	MOVQ acc1, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0 // Y

	// Calculate next T = T+Y*P
	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc1 // acc1 is free now
	ADCQ $0, DX
	MOVQ DX, BX // carry
	// acc1 is recycled as the top limb of the last step.
	XORQ acc1, acc1

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, BX // carry

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX // carry

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5

	// Third reduction step
	MOVQ acc2, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0 // Y

	// Calculate next T = T+Y*P
	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc2 // acc2 is free now
	ADCQ $0, DX
	MOVQ DX, BX // carry

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX // carry

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, BX // carry

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ BX, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0

	// Last reduction step
	MOVQ acc3, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0 // Y

	// Calculate next T = T+Y*P
	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc3 // acc3 is free now
	ADCQ $0, DX
	MOVQ DX, BX // carry
	XORQ acc3, acc3

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, BX // carry

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ BX, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, BX // carry

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ BX, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1

	// Result limbs are acc4,acc5,acc0,acc1; x_ptr, acc3, t0 and BX are
	// passed as scratch.
	gfpCarryWithoutCarry(acc4, acc5, acc0, acc1, x_ptr, acc3, t0, BX)
	storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr))
	RET

/* ---------------------------------------*/
// func gfpUnmarshal(res *gfP, in *[32]byte)
//
// Tail-jumps to gfpMarshal: reversing the byte order of 32 bytes is its own
// inverse, and both functions take the same (res, in) pointer pair.
TEXT ·gfpUnmarshal(SB),NOSPLIT,$0-16
	JMP ·gfpMarshal(SB)

/* ---------------------------------------*/
// func gfpMarshal(res *[32]byte, in *gfP)
//
// Copies the 32 bytes at in to res in reversed byte order: each 64-bit limb
// is byte-swapped and the limbs are written in reverse order, so the most
// significant byte of the input lands at res[0].
TEXT ·gfpMarshal(SB),NOSPLIT,$0-16
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr

	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3

	BSWAPQ acc0
	BSWAPQ acc1
	BSWAPQ acc2
	BSWAPQ acc3

	MOVQ acc3, (8*0)(res_ptr)
	MOVQ acc2, (8*1)(res_ptr)
	MOVQ acc1, (8*2)(res_ptr)
	MOVQ acc0, (8*3)(res_ptr)
	RET