From 3bb540d45fe0874bcbe4346e8f57ccebee84e7fb Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Fri, 8 Sep 2023 16:36:43 +0800 Subject: [PATCH] sm9/bn256: amd64 asm support plugin, eliminate usage of R15 #154 --- sm9/bn256/gfp_amd64.s | 818 +---------------------------------- sm9/bn256/gfp_cmn_amd64.s | 760 ++++++++++++++++++++++++++++++++ sm9/bn256/gfp_decl.go | 4 +- sm9/bn256/gfp_generic.go | 4 +- sm9/bn256/gfp_macros_amd64.s | 62 +++ sm9/bn256/gfp_plugin_amd64.s | 423 ++++++++++++++++++ 6 files changed, 1250 insertions(+), 821 deletions(-) create mode 100644 sm9/bn256/gfp_cmn_amd64.s create mode 100644 sm9/bn256/gfp_macros_amd64.s create mode 100644 sm9/bn256/gfp_plugin_amd64.s diff --git a/sm9/bn256/gfp_amd64.s b/sm9/bn256/gfp_amd64.s index 4949616..f36e191 100644 --- a/sm9/bn256/gfp_amd64.s +++ b/sm9/bn256/gfp_amd64.s @@ -2,636 +2,9 @@ // +build amd64,!purego,!plugin #include "textflag.h" - -#define res_ptr DI -#define x_ptr SI -#define y_ptr CX - -#define acc0 R8 -#define acc1 R9 -#define acc2 R10 -#define acc3 R11 -#define acc4 R12 -#define acc5 R13 -#define t0 R14 +#include "gfp_macros_amd64.s" #define t1 R15 -#define storeBlock(a0,a1,a2,a3, r) \ - MOVQ a0, 0+r \ - MOVQ a1, 8+r \ - MOVQ a2, 16+r \ - MOVQ a3, 24+r - -#define loadBlock(r, a0,a1,a2,a3) \ - MOVQ 0+r, a0 \ - MOVQ 8+r, a1 \ - MOVQ 16+r, a2 \ - MOVQ 24+r, a3 - -#define gfpCarry(a0,a1,a2,a3, b0,b1,b2,b3,b4) \ - \ // b = a-p - MOVQ a0, b0 \ - MOVQ a1, b1 \ - MOVQ a2, b2 \ - MOVQ a3, b3 \ - \ - SUBQ ·p2+0(SB), b0 \ - SBBQ ·p2+8(SB), b1 \ - SBBQ ·p2+16(SB), b2 \ - SBBQ ·p2+24(SB), b3 \ - SBBQ $0, b4 \ - \ - \ // if b is negative then return a - \ // else return b - CMOVQCC b0, a0 \ - CMOVQCC b1, a1 \ - CMOVQCC b2, a2 \ - CMOVQCC b3, a3 - -#define gfpCarryWithoutCarry(a0,a1,a2,a3, b0,b1,b2,b3) \ - \ // b = a-p - MOVQ a0, b0 \ - MOVQ a1, b1 \ - MOVQ a2, b2 \ - MOVQ a3, b3 \ - \ - SUBQ ·p2+0(SB), b0 \ - SBBQ ·p2+8(SB), b1 \ - SBBQ ·p2+16(SB), b2 \ - SBBQ ·p2+24(SB), b3 \ - \ - \ // if b is negative then return a - \ // else return b - CMOVQCC b0, a0 \ - CMOVQCC b1, a1 \ - CMOVQCC b2, a2 \ - CMOVQCC b3, a3 - -TEXT ·gfpNeg(SB),NOSPLIT,$0-16 - MOVQ ·p2+0(SB), R8 - MOVQ ·p2+8(SB), R9 - MOVQ ·p2+16(SB), R10 - MOVQ ·p2+24(SB), R11 - - MOVQ a+8(FP), DI - SUBQ 0(DI), R8 - SBBQ 8(DI), R9 - SBBQ 16(DI), R10 - SBBQ 24(DI), R11 - - gfpCarryWithoutCarry(R8,R9,R10,R11, R12,R13,R14,CX) - - MOVQ c+0(FP), DI - storeBlock(R8,R9,R10,R11, 0(DI)) - RET - -TEXT ·gfpAdd(SB),NOSPLIT,$0-24 - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - - loadBlock(0(DI), R8,R9,R10,R11) - MOVQ $0, R12 - - ADDQ 0(SI), R8 - ADCQ 8(SI), R9 - ADCQ 16(SI), R10 - ADCQ 24(SI), R11 - ADCQ $0, R12 - - gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12) - - MOVQ c+0(FP), DI - storeBlock(R8,R9,R10,R11, 0(DI)) - RET - -TEXT ·gfpDouble(SB),NOSPLIT,$0-16 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - - loadBlock(0(SI), R8,R9,R10,R11) - XORQ R12, R12 - - ADDQ R8, R8 - ADCQ R9, R9 - ADCQ R10, R10 - ADCQ R11, R11 - ADCQ $0, R12 - - gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12) - - storeBlock(R8,R9,R10,R11, 0(DI)) - RET - -TEXT ·gfpTriple(SB),NOSPLIT,$0-16 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - - loadBlock(0(SI), R8,R9,R10,R11) - XORQ R12, R12 - - ADDQ R8, R8 - ADCQ R9, R9 - ADCQ R10, R10 - ADCQ R11, R11 - ADCQ $0, R12 - - gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12) - - XORQ R12, R12 - ADDQ 0(SI), R8 - ADCQ 8(SI), R9 - ADCQ 16(SI), R10 - ADCQ 24(SI), R11 - ADCQ $0, R12 - - gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12) - - storeBlock(R8,R9,R10,R11, 0(DI)) - RET - -TEXT ·gfpSub(SB),NOSPLIT,$0-24 - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - - loadBlock(0(DI), R8,R9,R10,R11) - - MOVQ ·p2+0(SB), R12 - MOVQ ·p2+8(SB), R13 - MOVQ ·p2+16(SB), R14 - MOVQ ·p2+24(SB), CX - MOVQ $0, AX - - SUBQ 0(SI), R8 - SBBQ 8(SI), R9 - SBBQ 16(SI), R10 - SBBQ 24(SI), R11 - - CMOVQCC AX, R12 - CMOVQCC AX, R13 - CMOVQCC AX, R14 - CMOVQCC AX, CX - - ADDQ R12, R8 - ADCQ R13, R9 - ADCQ R14, R10 - ADCQ CX, R11 - - MOVQ c+0(FP), DI - storeBlock(R8,R9,R10,R11, 0(DI)) - RET - -TEXT ·gfpMul(SB),NOSPLIT,$0-24 - MOVQ in1+8(FP), x_ptr - MOVQ in2+16(FP), y_ptr - - CMPB ·supportADX(SB), $0 - JE noAdxMul - - XORQ acc5, acc5 - XORQ res_ptr, res_ptr - // x * y[0] - MOVQ (8*0)(y_ptr), DX - MULXQ (8*0)(x_ptr), acc0, acc1 - - MULXQ (8*1)(x_ptr), AX, acc2 - ADCXQ AX, acc1 - - MULXQ (8*2)(x_ptr), AX, acc3 - ADCXQ AX, acc2 - - MULXQ (8*3)(x_ptr), AX, acc4 - ADCXQ AX, acc3 - ADCXQ acc5, acc4 - - // First reduction step - MOVQ acc0, DX - MULXQ ·np+0x00(SB), DX, AX - - MULXQ ·p2+0x00(SB), AX, t0 - ADOXQ AX, acc0 - - MULXQ ·p2+0x08(SB), AX, t1 - ADCXQ t0, AX - ADOXQ AX, acc1 - - MULXQ ·p2+0x10(SB), AX, t0 - ADCXQ t1, AX - ADOXQ AX, acc2 - - MULXQ ·p2+0x18(SB), AX, t1 - ADCXQ t0, AX - ADOXQ AX, acc3 - - ADCXQ res_ptr, t1 - ADOXQ t1, acc4 - ADOXQ res_ptr, acc5 - XORQ acc0, acc0 - - // x * y[1] - MOVQ (8*1)(y_ptr), DX - MULXQ (8*0)(x_ptr), AX, t0 - ADOXQ AX, acc1 - - MULXQ (8*1)(x_ptr), AX, t1 - ADCXQ t0, AX - ADOXQ AX, acc2 - - MULXQ (8*2)(x_ptr), AX, t0 - ADCXQ t1, AX - ADOXQ AX, acc3 - - MULXQ (8*3)(x_ptr), AX, t1 - ADCXQ t0, AX - ADOXQ AX, acc4 - - ADCXQ acc0, t1 - ADOXQ t1, acc5 - ADOXQ res_ptr, acc0 - - // Second reduction step - MOVQ acc1, DX - MULXQ ·np+0x00(SB), DX, AX - - MULXQ ·p2+0x00(SB), AX, t0 - ADOXQ AX, acc1 - - MULXQ ·p2+0x08(SB), AX, t1 - ADCXQ t0, AX - ADOXQ AX, acc2 - - MULXQ ·p2+0x10(SB), AX, t0 - ADCXQ t1, AX - ADOXQ AX, acc3 - - MULXQ ·p2+0x18(SB), AX, t1 - ADCXQ t0, AX - ADOXQ AX, acc4 - - ADCXQ res_ptr, t1 - ADOXQ t1, acc5 - ADOXQ res_ptr, acc0 - XORQ acc1, acc1 - - // x * y[2] - MOVQ (8*2)(y_ptr), DX - MULXQ (8*0)(x_ptr), AX, t0 - ADOXQ AX, acc2 - - MULXQ (8*1)(x_ptr), AX, t1 - ADCXQ t0, AX - ADOXQ AX, acc3 - - MULXQ (8*2)(x_ptr), AX, t0 - ADCXQ t1, AX - ADOXQ AX, acc4 - - MULXQ (8*3)(x_ptr), AX, t1 - ADCXQ t0, AX - ADOXQ AX, acc5 - - ADCXQ res_ptr, t1 - ADOXQ t1, acc0 - ADOXQ res_ptr, acc1 - - // Third reduction step - MOVQ acc2, DX - MULXQ ·np+0x00(SB), DX, AX - - MULXQ ·p2+0x00(SB), AX, t0 - ADOXQ AX, acc2 - - MULXQ ·p2+0x08(SB), AX, t1 - ADCXQ t0, AX - ADOXQ AX, acc3 - - MULXQ ·p2+0x10(SB), AX, t0 - ADCXQ t1, AX - ADOXQ AX, acc4 - - MULXQ ·p2+0x18(SB), AX, t1 - ADCXQ t0, AX - ADOXQ AX, acc5 - - ADCXQ res_ptr, t1 - ADOXQ t1, acc0 - ADOXQ res_ptr, acc1 - XORQ acc2, acc2 - - // x * y[3] - MOVQ (8*3)(y_ptr), DX - MULXQ (8*0)(x_ptr), AX, t0 - ADOXQ AX, acc3 - - MULXQ (8*1)(x_ptr), AX, t1 - ADCXQ t0, AX - ADOXQ AX, acc4 - - MULXQ (8*2)(x_ptr), AX, t0 - ADCXQ t1, AX - ADOXQ AX, acc5 - - MULXQ (8*3)(x_ptr), AX, t1 - ADCXQ t0, AX - ADOXQ AX, acc0 - - ADCXQ res_ptr, t1 - ADOXQ t1, acc1 - ADOXQ res_ptr, acc2 - - // Last reduction step - MOVQ acc3, DX - MULXQ ·np+0x00(SB), DX, AX - - MULXQ ·p2+0x00(SB), AX, t0 - ADOXQ AX, acc3 - - MULXQ ·p2+0x08(SB), AX, t1 - ADCXQ t0, AX - ADOXQ AX, acc4 - - MULXQ ·p2+0x10(SB), AX, t0 - ADCXQ t1, AX - ADOXQ AX, acc5 - - MULXQ ·p2+0x18(SB), AX, t1 - ADCXQ t0, AX - ADOXQ AX, acc0 - - ADCXQ res_ptr, t1 - ADOXQ t1, acc1 - ADOXQ res_ptr, acc2 - // Copy result [255:0] - gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,t1,acc2) - MOVQ res+0(FP), res_ptr - storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr)) - RET - -noAdxMul: - // x * y[0] - MOVQ (8*0)(y_ptr), t0 - - MOVQ (8*0)(x_ptr), AX - MULQ t0 - MOVQ AX, acc0 - MOVQ DX, acc1 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, acc2 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, acc3 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, acc4 - XORQ acc5, acc5 - // First reduction step - MOVQ acc0, AX - MULQ ·np+0x00(SB) - MOVQ AX, t0 - - MOVQ ·p2+0x00(SB), AX - MULQ t0 - ADDQ AX, acc0 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ ·p2+0x08(SB), AX - MULQ t0 - ADDQ t1, acc1 - ADCQ $0, DX - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ ·p2+0x10(SB), AX - MULQ t0 - ADDQ t1, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ ·p2+0x18(SB), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ DX, acc4 - ADCQ $0, acc5 - // x * y[1] - MOVQ (8*1)(y_ptr), t0 - - MOVQ (8*0)(x_ptr), AX - MULQ t0 - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - ADDQ t1, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ DX, acc5 - ADCQ $0, acc0 - // Second reduction step - MOVQ acc1, AX - MULQ ·np+0x00(SB) - MOVQ AX, t0 - - MOVQ ·p2+0x00(SB), AX - MULQ t0 - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ ·p2+0x08(SB), AX - MULQ t0 - ADDQ t1, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ ·p2+0x10(SB), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ ·p2+0x18(SB), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ DX, acc5 - ADCQ $0, acc0 - // x * y[2] - MOVQ (8*2)(y_ptr), t0 - - MOVQ (8*0)(x_ptr), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ DX, acc0 - ADCQ $0, acc1 - // Third reduction step - MOVQ acc2, AX - MULQ ·np+0x00(SB) - MOVQ AX, t0 - - MOVQ ·p2+0x00(SB), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ ·p2+0x08(SB), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ ·p2+0x10(SB), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ ·p2+0x18(SB), AX - MULQ t0 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ DX, acc0 - ADCQ $0, acc1 - // x * y[3] - MOVQ (8*3)(y_ptr), t0 - - MOVQ (8*0)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ t1, acc0 - ADCQ $0, DX - ADDQ AX, acc0 - ADCQ DX, acc1 - ADCQ $0, acc2 - // Last reduction step - MOVQ acc3, AX - MULQ ·np+0x00(SB) - MOVQ AX, t0 - - MOVQ ·p2+0x00(SB), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ ·p2+0x08(SB), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ ·p2+0x10(SB), AX - MULQ t0 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ ·p2+0x18(SB), AX - MULQ t0 - ADDQ t1, acc0 - ADCQ $0, DX - ADDQ AX, acc0 - ADCQ DX, acc1 - ADCQ $0, acc2 - // Copy result [255:0] - gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,t1,acc2) - MOVQ res+0(FP), res_ptr - storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr)) - - RET - // func gfpSqr(res, in *gfP, n int) TEXT ·gfpSqr(SB),NOSPLIT,$0 MOVQ res+0(FP), res_ptr @@ -1046,192 +419,3 @@ gfpSqrLoop: JNE gfpSqrLoop RET - -/* ---------------------------------------*/ -// func gfpFromMont(res, in *gfP) -TEXT ·gfpFromMont(SB),NOSPLIT,$0 - MOVQ res+0(FP), res_ptr - MOVQ in+8(FP), x_ptr - - MOVQ (8*0)(x_ptr), acc0 - MOVQ (8*1)(x_ptr), acc1 - MOVQ (8*2)(x_ptr), acc2 - MOVQ (8*3)(x_ptr), acc3 - XORQ acc4, acc4 - - // Only reduce, no multiplications are needed - // First reduction step - MOVQ acc0, AX - MULQ ·np+0x00(SB) - MOVQ AX, t0 // Y - - // Calculate next T = T+Y*P - MOVQ ·p2+0x00(SB), AX - MULQ t0 - ADDQ AX, acc0 // acc0 is free now - ADCQ $0, DX - MOVQ DX, t1 // carry - XORQ acc0, acc0 - - MOVQ ·p2+0x08(SB), AX - MULQ t0 - ADDQ t1, acc1 - ADCQ $0, DX - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, t1 // carry - - MOVQ ·p2+0x10(SB), AX - MULQ t0 - ADDQ t1, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 // carry - - MOVQ ·p2+0x18(SB), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ DX, acc4 - XORQ acc5, acc5 - - // Second reduction step - MOVQ acc1, AX - MULQ ·np+0x00(SB) - MOVQ AX, t0 // Y - - // Calculate next T = T+Y*P - MOVQ ·p2+0x00(SB), AX - MULQ t0 - ADDQ AX, acc1 // acc1 is free now - ADCQ $0, DX - MOVQ DX, t1 // carry - XORQ acc1, acc1 - - MOVQ ·p2+0x08(SB), AX - MULQ t0 - ADDQ t1, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 // carry - - MOVQ ·p2+0x10(SB), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 // carry - - MOVQ ·p2+0x18(SB), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ DX, acc5 - - // Third reduction step - MOVQ acc2, AX - MULQ ·np+0x00(SB) - MOVQ AX, t0 // Y - - // Calculate next T = T+Y*P - MOVQ ·p2+0x00(SB), AX - MULQ t0 - ADDQ AX, acc2 // acc2 is free now - ADCQ $0, DX - MOVQ DX, t1 // carry - - MOVQ ·p2+0x08(SB), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 // carry - - MOVQ ·p2+0x10(SB), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 // carry - - MOVQ ·p2+0x18(SB), AX - MULQ t0 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ DX, acc0 - - // Last reduction step - MOVQ acc3, AX - MULQ ·np+0x00(SB) - MOVQ AX, t0 // Y - - // Calculate next T = T+Y*P - MOVQ ·p2+0x00(SB), AX - MULQ t0 - ADDQ AX, acc3 // acc3 is free now - ADCQ $0, DX - MOVQ DX, t1 // carry - XORQ acc3, acc3 - - MOVQ ·p2+0x08(SB), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 // carry - - MOVQ ·p2+0x10(SB), AX - MULQ t0 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ $0, DX - MOVQ DX, t1 // carry - - MOVQ ·p2+0x18(SB), AX - MULQ t0 - ADDQ t1, acc0 - ADCQ $0, DX - ADDQ AX, acc0 - ADCQ DX, acc1 - - gfpCarryWithoutCarry(acc4, acc5, acc0, acc1, x_ptr, acc3, t0, t1) - storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr)) - RET - -/* ---------------------------------------*/ -// func gfpUnmarshal(res *gfP, in *[32]byte) -TEXT ·gfpUnmarshal(SB),NOSPLIT,$0 - JMP ·gfpMarshal(SB) - -/* ---------------------------------------*/ -// func gfpMarshal(res *[32]byte, in *gfP) -TEXT ·gfpMarshal(SB),NOSPLIT,$0 - MOVQ res+0(FP), res_ptr - MOVQ in+8(FP), x_ptr - - MOVQ (8*0)(x_ptr), acc0 - MOVQ (8*1)(x_ptr), acc1 - MOVQ (8*2)(x_ptr), acc2 - MOVQ (8*3)(x_ptr), acc3 - - BSWAPQ acc0 - BSWAPQ acc1 - BSWAPQ acc2 - BSWAPQ acc3 - - MOVQ acc3, (8*0)(res_ptr) - MOVQ acc2, (8*1)(res_ptr) - MOVQ acc1, (8*2)(res_ptr) - MOVQ acc0, (8*3)(res_ptr) - - RET diff --git a/sm9/bn256/gfp_cmn_amd64.s b/sm9/bn256/gfp_cmn_amd64.s new file mode 100644 index 0000000..4af2efe --- /dev/null +++ b/sm9/bn256/gfp_cmn_amd64.s @@ -0,0 +1,760 @@ +//go:build amd64 && !purego +// +build amd64,!purego + +#include "textflag.h" + +#include "gfp_macros_amd64.s" + +TEXT ·gfpNeg(SB),NOSPLIT,$0-16 + MOVQ ·p2+0(SB), R8 + MOVQ ·p2+8(SB), R9 + MOVQ ·p2+16(SB), R10 + MOVQ ·p2+24(SB), R11 + + MOVQ a+8(FP), DI + SUBQ 0(DI), R8 + SBBQ 8(DI), R9 + SBBQ 16(DI), R10 + SBBQ 24(DI), R11 + + gfpCarryWithoutCarry(R8,R9,R10,R11, R12,R13,R14,CX) + + MOVQ c+0(FP), DI + storeBlock(R8,R9,R10,R11, 0(DI)) + RET + +TEXT ·gfpAdd(SB),NOSPLIT,$0-24 + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + + loadBlock(0(DI), R8,R9,R10,R11) + MOVQ $0, R12 + + ADDQ 0(SI), R8 + ADCQ 8(SI), R9 + ADCQ 16(SI), R10 + ADCQ 24(SI), R11 + ADCQ $0, R12 + + gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12) + + MOVQ c+0(FP), DI + storeBlock(R8,R9,R10,R11, 0(DI)) + RET + +TEXT ·gfpDouble(SB),NOSPLIT,$0-16 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + + loadBlock(0(SI), R8,R9,R10,R11) + XORQ R12, R12 + + ADDQ R8, R8 + ADCQ R9, R9 + ADCQ R10, R10 + ADCQ R11, R11 + ADCQ $0, R12 + + gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12) + + storeBlock(R8,R9,R10,R11, 0(DI)) + RET + +TEXT ·gfpTriple(SB),NOSPLIT,$0-16 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + + loadBlock(0(SI), R8,R9,R10,R11) + XORQ R12, R12 + + ADDQ R8, R8 + ADCQ R9, R9 + ADCQ R10, R10 + ADCQ R11, R11 + ADCQ $0, R12 + + gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12) + + XORQ R12, R12 + ADDQ 0(SI), R8 + ADCQ 8(SI), R9 + ADCQ 16(SI), R10 + ADCQ 24(SI), R11 + ADCQ $0, R12 + + gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12) + + storeBlock(R8,R9,R10,R11, 0(DI)) + RET + +TEXT ·gfpSub(SB),NOSPLIT,$0-24 + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + + loadBlock(0(DI), R8,R9,R10,R11) + + MOVQ ·p2+0(SB), R12 + MOVQ ·p2+8(SB), R13 + MOVQ ·p2+16(SB), R14 + MOVQ ·p2+24(SB), CX + MOVQ $0, AX + + SUBQ 0(SI), R8 + SBBQ 8(SI), R9 + SBBQ 16(SI), R10 + SBBQ 24(SI), R11 + + CMOVQCC AX, R12 + CMOVQCC AX, R13 + CMOVQCC AX, R14 + CMOVQCC AX, CX + + ADDQ R12, R8 + ADCQ R13, R9 + ADCQ R14, R10 + ADCQ CX, R11 + + MOVQ c+0(FP), DI + storeBlock(R8,R9,R10,R11, 0(DI)) + RET + +TEXT ·gfpMul(SB),NOSPLIT,$0-24 + MOVQ in1+8(FP), x_ptr + MOVQ in2+16(FP), y_ptr + + CMPB ·supportADX(SB), $0 + JE noAdxMul + + XORQ acc5, acc5 + XORQ res_ptr, res_ptr + // x * y[0] + MOVQ (8*0)(y_ptr), DX + MULXQ (8*0)(x_ptr), acc0, acc1 + + MULXQ (8*1)(x_ptr), AX, acc2 + ADCXQ AX, acc1 + + MULXQ (8*2)(x_ptr), AX, acc3 + ADCXQ AX, acc2 + + MULXQ (8*3)(x_ptr), AX, acc4 + ADCXQ AX, acc3 + ADCXQ acc5, acc4 + + // First reduction step + MOVQ acc0, DX + MULXQ ·np+0x00(SB), DX, AX + + MULXQ ·p2+0x00(SB), AX, t0 + ADOXQ AX, acc0 + + MULXQ ·p2+0x08(SB), AX, BX + ADCXQ t0, AX + ADOXQ AX, acc1 + + MULXQ ·p2+0x10(SB), AX, t0 + ADCXQ BX, AX + ADOXQ AX, acc2 + + MULXQ ·p2+0x18(SB), AX, BX + ADCXQ t0, AX + ADOXQ AX, acc3 + + ADCXQ res_ptr, BX + ADOXQ BX, acc4 + ADOXQ res_ptr, acc5 + XORQ acc0, acc0 + + // x * y[1] + MOVQ (8*1)(y_ptr), DX + MULXQ (8*0)(x_ptr), AX, t0 + ADOXQ AX, acc1 + + MULXQ (8*1)(x_ptr), AX, BX + ADCXQ t0, AX + ADOXQ AX, acc2 + + MULXQ (8*2)(x_ptr), AX, t0 + ADCXQ BX, AX + ADOXQ AX, acc3 + + MULXQ (8*3)(x_ptr), AX, BX + ADCXQ t0, AX + ADOXQ AX, acc4 + + ADCXQ acc0, BX + ADOXQ BX, acc5 + ADOXQ res_ptr, acc0 + + // Second reduction step + MOVQ acc1, DX + MULXQ ·np+0x00(SB), DX, AX + + MULXQ ·p2+0x00(SB), AX, t0 + ADOXQ AX, acc1 + + MULXQ ·p2+0x08(SB), AX, BX + ADCXQ t0, AX + ADOXQ AX, acc2 + + MULXQ ·p2+0x10(SB), AX, t0 + ADCXQ BX, AX + ADOXQ AX, acc3 + + MULXQ ·p2+0x18(SB), AX, BX + ADCXQ t0, AX + ADOXQ AX, acc4 + + ADCXQ res_ptr, BX + ADOXQ BX, acc5 + ADOXQ res_ptr, acc0 + XORQ acc1, acc1 + + // x * y[2] + MOVQ (8*2)(y_ptr), DX + MULXQ (8*0)(x_ptr), AX, t0 + ADOXQ AX, acc2 + + MULXQ (8*1)(x_ptr), AX, BX + ADCXQ t0, AX + ADOXQ AX, acc3 + + MULXQ (8*2)(x_ptr), AX, t0 + ADCXQ BX, AX + ADOXQ AX, acc4 + + MULXQ (8*3)(x_ptr), AX, BX + ADCXQ t0, AX + ADOXQ AX, acc5 + + ADCXQ res_ptr, BX + ADOXQ BX, acc0 + ADOXQ res_ptr, acc1 + + // Third reduction step + MOVQ acc2, DX + MULXQ ·np+0x00(SB), DX, AX + + MULXQ ·p2+0x00(SB), AX, t0 + ADOXQ AX, acc2 + + MULXQ ·p2+0x08(SB), AX, BX + ADCXQ t0, AX + ADOXQ AX, acc3 + + MULXQ ·p2+0x10(SB), AX, t0 + ADCXQ BX, AX + ADOXQ AX, acc4 + + MULXQ ·p2+0x18(SB), AX, BX + ADCXQ t0, AX + ADOXQ AX, acc5 + + ADCXQ res_ptr, BX + ADOXQ BX, acc0 + ADOXQ res_ptr, acc1 + XORQ acc2, acc2 + + // x * y[3] + MOVQ (8*3)(y_ptr), DX + MULXQ (8*0)(x_ptr), AX, t0 + ADOXQ AX, acc3 + + MULXQ (8*1)(x_ptr), AX, BX + ADCXQ t0, AX + ADOXQ AX, acc4 + + MULXQ (8*2)(x_ptr), AX, t0 + ADCXQ BX, AX + ADOXQ AX, acc5 + + MULXQ (8*3)(x_ptr), AX, BX + ADCXQ t0, AX + ADOXQ AX, acc0 + + ADCXQ res_ptr, BX + ADOXQ BX, acc1 + ADOXQ res_ptr, acc2 + + // Last reduction step + MOVQ acc3, DX + MULXQ ·np+0x00(SB), DX, AX + + MULXQ ·p2+0x00(SB), AX, t0 + ADOXQ AX, acc3 + + MULXQ ·p2+0x08(SB), AX, BX + ADCXQ t0, AX + ADOXQ AX, acc4 + + MULXQ ·p2+0x10(SB), AX, t0 + ADCXQ BX, AX + ADOXQ AX, acc5 + + MULXQ ·p2+0x18(SB), AX, BX + ADCXQ t0, AX + ADOXQ AX, acc0 + + ADCXQ res_ptr, BX + ADOXQ BX, acc1 + ADOXQ res_ptr, acc2 + // Copy result [255:0] + gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,BX,acc2) + MOVQ res+0(FP), res_ptr + storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr)) + RET + +noAdxMul: + // x * y[0] + MOVQ (8*0)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + MOVQ AX, acc0 + MOVQ DX, acc1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, acc2 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc4 + XORQ acc5, acc5 + // First reduction step + MOVQ acc0, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 + + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc0 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ BX, acc1 + ADCQ $0, DX + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ BX, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ BX, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ DX, acc4 + ADCQ $0, acc5 + // x * y[1] + MOVQ (8*1)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ BX, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ BX, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ BX, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ DX, acc5 + ADCQ $0, acc0 + // Second reduction step + MOVQ acc1, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 + + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ BX, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ BX, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ BX, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ DX, acc5 + ADCQ $0, acc0 + // x * y[2] + MOVQ (8*2)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ BX, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ BX, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ BX, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ DX, acc0 + ADCQ $0, acc1 + // Third reduction step + MOVQ acc2, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 + + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ BX, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ BX, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ BX, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ DX, acc0 + ADCQ $0, acc1 + // x * y[3] + MOVQ (8*3)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ BX, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ BX, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ BX, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ DX, acc1 + ADCQ $0, acc2 + // Last reduction step + MOVQ acc3, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 + + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ BX, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ BX, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ BX, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ DX, acc1 + ADCQ $0, acc2 + // Copy result [255:0] + gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,BX,acc2) + MOVQ res+0(FP), res_ptr + storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr)) + + RET + +/* ---------------------------------------*/ +// func gfpFromMont(res, in *gfP) +TEXT ·gfpFromMont(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in+8(FP), x_ptr + + MOVQ (8*0)(x_ptr), acc0 + MOVQ (8*1)(x_ptr), acc1 + MOVQ (8*2)(x_ptr), acc2 + MOVQ (8*3)(x_ptr), acc3 + XORQ acc4, acc4 + + // Only reduce, no multiplications are needed + // First reduction step + MOVQ acc0, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 // Y + + // Calculate next T = T+Y*P + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc0 // acc0 is free now + ADCQ $0, DX + MOVQ DX, BX // carry + XORQ acc0, acc0 + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ BX, acc1 + ADCQ $0, DX + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, BX // carry + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ BX, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, BX // carry + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ BX, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ DX, acc4 + XORQ acc5, acc5 + + // Second reduction step + MOVQ acc1, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 // Y + + // Calculate next T = T+Y*P + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc1 // acc1 is free now + ADCQ $0, DX + MOVQ DX, BX // carry + XORQ acc1, acc1 + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ BX, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, BX // carry + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ BX, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, BX // carry + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ BX, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ DX, acc5 + + // Third reduction step + MOVQ acc2, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 // Y + + // Calculate next T = T+Y*P + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc2 // acc2 is free now + ADCQ $0, DX + MOVQ DX, BX // carry + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ BX, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, BX // carry + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ BX, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, BX // carry + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ BX, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ DX, acc0 + + // Last reduction step + MOVQ acc3, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 // Y + + // Calculate next T = T+Y*P + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc3 // acc3 is free now + ADCQ $0, DX + MOVQ DX, BX // carry + XORQ acc3, acc3 + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ BX, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, BX // carry + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ BX, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, BX // carry + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ BX, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ DX, acc1 + + gfpCarryWithoutCarry(acc4, acc5, acc0, acc1, x_ptr, acc3, t0, BX) + storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr)) + RET + +/* ---------------------------------------*/ +// func gfpUnmarshal(res *gfP, in *[32]byte) +TEXT ·gfpUnmarshal(SB),NOSPLIT,$0 + JMP ·gfpMarshal(SB) + +/* ---------------------------------------*/ +// func gfpMarshal(res *[32]byte, in *gfP) +TEXT ·gfpMarshal(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in+8(FP), x_ptr + + MOVQ (8*0)(x_ptr), acc0 + MOVQ (8*1)(x_ptr), acc1 + MOVQ (8*2)(x_ptr), acc2 + MOVQ (8*3)(x_ptr), acc3 + + BSWAPQ acc0 + BSWAPQ acc1 + BSWAPQ acc2 + BSWAPQ acc3 + + MOVQ acc3, (8*0)(res_ptr) + MOVQ acc2, (8*1)(res_ptr) + MOVQ acc1, (8*2)(res_ptr) + MOVQ acc0, (8*3)(res_ptr) + + RET diff --git a/sm9/bn256/gfp_decl.go b/sm9/bn256/gfp_decl.go index 9f826ce..589bd3f 100644 --- a/sm9/bn256/gfp_decl.go +++ b/sm9/bn256/gfp_decl.go @@ -1,5 +1,5 @@ -//go:build (amd64 && !purego && !plugin) || (arm64 && !purego) -// +build amd64,!purego,!plugin arm64,!purego +//go:build (amd64 && !purego) || (arm64 && !purego) +// +build amd64,!purego arm64,!purego package bn256 diff --git a/sm9/bn256/gfp_generic.go b/sm9/bn256/gfp_generic.go index c6d1a24..29264f0 100644 --- a/sm9/bn256/gfp_generic.go +++ b/sm9/bn256/gfp_generic.go @@ -1,5 +1,5 @@ -//go:build (!amd64 && !arm64) || purego || plugin -// +build !amd64,!arm64 purego plugin +//go:build (!amd64 && !arm64) || purego +// +build !amd64,!arm64 purego package bn256 diff --git a/sm9/bn256/gfp_macros_amd64.s b/sm9/bn256/gfp_macros_amd64.s new file mode 100644 index 0000000..ef86d21 --- /dev/null +++ b/sm9/bn256/gfp_macros_amd64.s @@ -0,0 +1,62 @@ +#define res_ptr DI +#define x_ptr SI +#define y_ptr CX + +#define acc0 R8 +#define acc1 R9 +#define acc2 R10 +#define acc3 R11 +#define acc4 R12 +#define acc5 R13 +#define t0 R14 + +#define storeBlock(a0,a1,a2,a3, r) \ + MOVQ a0, 0+r \ + MOVQ a1, 8+r \ + MOVQ a2, 16+r \ + MOVQ a3, 24+r + +#define loadBlock(r, a0,a1,a2,a3) \ + MOVQ 0+r, a0 \ + MOVQ 8+r, a1 \ + MOVQ 16+r, a2 \ + MOVQ 24+r, a3 + +#define gfpCarry(a0,a1,a2,a3, b0,b1,b2,b3,b4) \ + \ // b = a-p + MOVQ a0, b0 \ + MOVQ a1, b1 \ + MOVQ a2, b2 \ + MOVQ a3, b3 \ + \ + SUBQ ·p2+0(SB), b0 \ + SBBQ ·p2+8(SB), b1 \ + SBBQ ·p2+16(SB), b2 \ + SBBQ ·p2+24(SB), b3 \ + SBBQ $0, b4 \ + \ + \ // if b is negative then return a + \ // else return b + CMOVQCC b0, a0 \ + CMOVQCC b1, a1 \ + CMOVQCC b2, a2 \ + CMOVQCC b3, a3 + +#define gfpCarryWithoutCarry(a0,a1,a2,a3, b0,b1,b2,b3) \ + \ // b = a-p + MOVQ a0, b0 \ + MOVQ a1, b1 \ + MOVQ a2, b2 \ + MOVQ a3, b3 \ + \ + SUBQ ·p2+0(SB), b0 \ + SBBQ ·p2+8(SB), b1 \ + SBBQ ·p2+16(SB), b2 \ + SBBQ ·p2+24(SB), b3 \ + \ + \ // if b is negative then return a + \ // else return b + CMOVQCC b0, a0 \ + CMOVQCC b1, a1 \ + CMOVQCC b2, a2 \ + CMOVQCC b3, a3 diff --git a/sm9/bn256/gfp_plugin_amd64.s b/sm9/bn256/gfp_plugin_amd64.s new file mode 100644 index 0000000..cce02b2 --- /dev/null +++ b/sm9/bn256/gfp_plugin_amd64.s @@ -0,0 +1,423 @@ +//go:build amd64 && !purego && plugin +// +build amd64,!purego,plugin + +#include "textflag.h" +#include "gfp_macros_amd64.s" + +// func gfpSqr(res, in *gfP, n int) +TEXT ·gfpSqr(SB),NOSPLIT,$24-8 + MOVQ res+0(FP), res_ptr + MOVQ in+8(FP), x_ptr + MOVQ n+16(FP), BX + + CMPB ·supportADX(SB), $0 + JE gfpSqrLoop + +gfpSqrLoopAdx: + MOVQ BX, (SP) + XORQ acc0, acc0 + XORQ y_ptr, y_ptr + // y[1:] * y[0] + MOVQ (8*0)(x_ptr), DX + MULXQ (8*1)(x_ptr), acc1, acc2 + + MULXQ (8*2)(x_ptr), AX, acc3 + ADOXQ AX, acc2 + + MULXQ (8*3)(x_ptr), AX, acc4 + ADOXQ AX, acc3 + ADOXQ y_ptr, acc4 + + // y[2:] * y[1] + MOVQ (8*1)(x_ptr), DX + MULXQ (8*2)(x_ptr), AX, BX + ADOXQ AX, acc3 + + MULXQ (8*3)(x_ptr), AX, acc5 + ADCXQ BX, AX + ADOXQ AX, acc4 + ADCXQ y_ptr, acc5 + + // y[3] * y[2] + MOVQ (8*2)(x_ptr), DX + MULXQ (8*3)(x_ptr), AX, y_ptr + ADOXQ AX, acc5 + ADOXQ acc0, y_ptr + + XORQ BX, BX + // *2 + ADOXQ acc1, acc1 + ADOXQ acc2, acc2 + ADOXQ acc3, acc3 + ADOXQ acc4, acc4 + ADOXQ acc5, acc5 + ADOXQ y_ptr, y_ptr + ADOXQ acc0, BX + + // Missing products + MOVQ (8*0)(x_ptr), DX + MULXQ DX, acc0, t0 + ADCXQ t0, acc1 + + MOVQ (8*1)(x_ptr), DX + MULXQ DX, AX, t0 + ADCXQ AX, acc2 + ADCXQ t0, acc3 + + MOVQ (8*2)(x_ptr), DX + MULXQ DX, AX, t0 + ADCXQ AX, acc4 + ADCXQ t0, acc5 + + MOVQ (8*3)(x_ptr), DX + MULXQ DX, AX, x_ptr + ADCXQ AX, y_ptr + ADCXQ BX, x_ptr + + // First reduction step + MOVQ acc0, DX + MULXQ ·np+0x00(SB), DX, AX + + MULXQ ·p2+0x00(SB), AX, t0 + ADOXQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0 + + MULXQ ·p2+0x08(SB), AX, BX + ADCXQ t0, AX + ADOXQ AX, acc1 + + MULXQ ·p2+0x10(SB), AX, t0 + ADCXQ BX, AX + ADOXQ AX, acc2 + + MULXQ ·p2+0x18(SB), AX, acc0 + ADCXQ t0, AX + ADOXQ AX, acc3 + MOVQ $0, t0 + ADCXQ t0, acc0 + ADOXQ t0, acc0 + + // Second reduction step + MOVQ acc1, DX + MULXQ ·np+0x00(SB), DX, AX + + MULXQ ·p2+0x00(SB), AX, t0 + ADOXQ AX, acc1 + + MULXQ ·p2+0x08(SB), AX, BX + ADCXQ t0, AX + ADOXQ AX, acc2 + + MULXQ ·p2+0x10(SB), AX, t0 + ADCXQ BX, AX + ADOXQ AX, acc3 + + MULXQ ·p2+0x18(SB), AX, acc1 + ADCXQ t0, AX + ADOXQ AX, acc0 + MOVQ $0, t0 + ADCXQ t0, acc1 + ADOXQ t0, acc1 + + // Third reduction step + MOVQ acc2, DX + MULXQ ·np+0x00(SB), DX, AX + + MULXQ ·p2+0x00(SB), AX, t0 + ADOXQ AX, acc2 + + MULXQ ·p2+0x08(SB), AX, BX + ADCXQ t0, AX + ADOXQ AX, acc3 + + MULXQ ·p2+0x10(SB), AX, t0 + ADCXQ BX, AX + ADOXQ AX, acc0 + + MULXQ ·p2+0x18(SB), AX, acc2 + ADCXQ t0, AX + ADOXQ AX, acc1 + MOVQ $0, t0 + ADCXQ t0, acc2 + ADOXQ t0, acc2 + + // Last reduction step + MOVQ acc3, DX + MULXQ ·np+0x00(SB), DX, AX + + MULXQ ·p2+0x00(SB), AX, t0 + ADOXQ AX, acc3 + + MULXQ ·p2+0x08(SB), AX, BX + ADCXQ t0, AX + ADOXQ AX, acc0 + + MULXQ ·p2+0x10(SB), AX, t0 + ADCXQ BX, AX + ADOXQ AX, acc1 + + MULXQ ·p2+0x18(SB), AX, acc3 + ADCXQ t0, AX + ADOXQ AX, acc2 + MOVQ $0, t0 + ADCXQ t0, acc3 + ADOXQ t0, acc3 + + XORQ BX, BX + // Add bits [511:256] of the sqr result + ADCXQ acc4, acc0 + ADCXQ acc5, acc1 + ADCXQ y_ptr, acc2 + ADCXQ x_ptr, acc3 + ADCXQ BX, t0 + + gfpCarry(acc0,acc1,acc2,acc3, acc4,acc5,y_ptr,BX,t0) + storeBlock(acc0,acc1,acc2,acc3, 0(res_ptr)) + + MOVQ res_ptr, x_ptr + MOVQ (SP), BX + DECQ BX + JNE gfpSqrLoopAdx + + RET + +gfpSqrLoop: + MOVQ BX, (SP) + // y[1:] * y[0] + MOVQ (8*0)(x_ptr), t0 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + MOVQ AX, acc1 + MOVQ DX, acc2 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc4 + // y[2:] * y[1] + MOVQ (8*1)(x_ptr), t0 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ BX, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, acc5 + // y[3] * y[2] + MOVQ (8*2)(x_ptr), t0 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, y_ptr + XORQ BX, BX + // *2 + ADDQ acc1, acc1 + ADCQ acc2, acc2 + ADCQ acc3, acc3 + ADCQ acc4, acc4 + ADCQ acc5, acc5 + ADCQ y_ptr, y_ptr + ADCQ $0, BX + // Missing products + MOVQ (8*0)(x_ptr), AX + MULQ AX + MOVQ AX, acc0 + MOVQ DX, t0 + + MOVQ (8*1)(x_ptr), AX + MULQ AX + ADDQ t0, acc1 + ADCQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t0 + + MOVQ (8*2)(x_ptr), AX + MULQ AX + ADDQ t0, acc3 + ADCQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t0 + + MOVQ (8*3)(x_ptr), AX + MULQ AX + ADDQ t0, acc5 + ADCQ AX, y_ptr + ADCQ DX, BX + MOVQ BX, x_ptr + // T = [acc0, acc1, acc2, acc3, acc4, acc5, y_ptr, x_ptr] + // First reduction step + MOVQ acc0, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 // Y + + // Calculate next T = T+Y*P + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc0 // acc0 is free now + ADCQ $0, DX + MOVQ DX, BX // carry + XORQ acc0, acc0 + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ BX, acc1 + ADCQ $0, DX + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, BX // carry + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ BX, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, BX // carry + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ BX, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ DX, acc0 + + // Second reduction step + MOVQ acc1, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 // Y + + // Calculate next T = T+Y*P + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc1 // acc1 is free now + ADCQ $0, DX + MOVQ DX, BX // carry + XORQ acc1, acc1 + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ BX, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, BX // carry + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ BX, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, BX // carry + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ BX, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ DX, acc1 + + // Third reduction step + MOVQ acc2, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 // Y + + // Calculate next T = T+Y*P + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc2 // acc2 is free now + ADCQ $0, DX + MOVQ DX, BX // carry + XORQ acc2, acc2 + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ BX, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, BX // carry + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ BX, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ $0, DX + MOVQ DX, BX // carry + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ BX, acc1 + ADCQ $0, DX + ADDQ AX, acc1 + ADCQ DX, acc2 + + // Last reduction step + MOVQ acc3, AX + MULQ ·np+0x00(SB) + MOVQ AX, t0 // Y + + // Calculate next T = T+Y*P + MOVQ ·p2+0x00(SB), AX + MULQ t0 + ADDQ AX, acc3 // acc3 is free now + ADCQ $0, DX + MOVQ DX, BX // carry + XORQ acc3, acc3 + + MOVQ ·p2+0x08(SB), AX + MULQ t0 + ADDQ BX, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ $0, DX + MOVQ DX, BX // carry + + MOVQ ·p2+0x10(SB), AX + MULQ t0 + ADDQ BX, acc1 + ADCQ $0, DX + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, BX // carry + + MOVQ ·p2+0x18(SB), AX + MULQ t0 + ADDQ BX, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ DX, acc3 + + XORQ t0, t0 + // Add bits [511:256] of the sqr result + ADDQ acc4, acc0 + ADCQ acc5, acc1 + ADCQ y_ptr, acc2 + ADCQ x_ptr, acc3 + ADCQ $0, t0 + + gfpCarry(acc0,acc1,acc2,acc3, acc4,acc5,y_ptr,BX,t0) + storeBlock(acc0,acc1,acc2,acc3, 0(res_ptr)) + MOVQ res_ptr, x_ptr + MOVQ (SP), BX + DECQ BX + JNE gfpSqrLoop + + RET