diff --git a/sm9/bn256/gfp_amd64.s b/sm9/bn256/gfp_amd64.s
index 53d2997..0031a24 100644
--- a/sm9/bn256/gfp_amd64.s
+++ b/sm9/bn256/gfp_amd64.s
@@ -122,6 +122,213 @@ TEXT ·gfpMul(SB),0,$0-24
 	MOVQ res+0(FP), res_ptr
 	MOVQ in1+8(FP), x_ptr
 	MOVQ in2+16(FP), y_ptr
+	// MULXQ-based path: taken only when the hasBMI2 byte is non-zero.
+	CMPB ·hasBMI2(SB), $0
+	JE nobmi2Mul  // hasBMI2 == 0: use the code at nobmi2Mul instead
+
+	// x * y[0]
+	MOVQ (8*0)(y_ptr), DX  // DX is the implicit multiplicand of every MULXQ below
+	MULXQ (8*0)(x_ptr), acc0, acc1  // acc1:acc0 = x[0] * y[0] (second operand = low half, third = high half)
+
+	MULXQ (8*1)(x_ptr), AX, acc2
+	ADDQ AX, acc1
+	ADCQ $0, acc2
+
+	MULXQ (8*2)(x_ptr), AX, acc3
+	ADDQ AX, acc2
+	ADCQ $0, acc3
+
+	MULXQ (8*3)(x_ptr), AX, acc4
+	ADDQ AX, acc3
+	ADCQ $0, acc4
+
+	XORQ acc5, acc5  // acc5 = 0; it receives the carry out of acc4 below
+
+	// First reduction step
+	MOVQ acc0, DX
+	MULXQ ·np+0x00(SB), DX, AX  // DX = low 64 bits of acc0 * np; the high half in AX is overwritten by the next MULXQ
+
+	MULXQ ·p2+0x00(SB), AX, t1
+	ADDQ AX, acc0  // NOTE(review): presumably leaves acc0 == 0 (if np is the Montgomery constant for p2 - confirm); acc0 is reused as a high limb below
+	ADCQ t1, acc1
+
+	MULXQ ·p2+0x08(SB), AX, t1
+	ADCQ $0, t1  // MULXQ does not touch flags, so the carry of the ADCQ above is folded into the new high half
+	ADDQ AX, acc1
+	ADCQ t1, acc2
+
+	MULXQ ·p2+0x10(SB), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc2
+	ADCQ t1, acc3
+
+	MULXQ ·p2+0x18(SB), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc3
+	ADCQ t1, acc4
+	ADCQ $0, acc5
+
+	// x * y[1]
+	MOVQ (8*1)(y_ptr), DX
+	MULXQ (8*0)(x_ptr), AX, t1
+	ADDQ AX, acc1
+	ADCQ t1, acc2
+
+	MULXQ (8*1)(x_ptr), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc2
+	ADCQ t1, acc3
+
+	MULXQ (8*2)(x_ptr), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc3
+	ADCQ t1, acc4
+
+	MULXQ (8*3)(x_ptr), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc4
+	ADCQ t1, acc5
+	ADCQ $0, acc0  // acc0 now serves as the top limb of the running sum
+
+	// Second reduction step
+	MOVQ acc1, DX
+	MULXQ ·np+0x00(SB), DX, AX
+
+	MULXQ ·p2+0x00(SB), AX, t1
+	ADDQ AX, acc1
+	ADCQ t1, acc2
+
+	MULXQ ·p2+0x08(SB), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc2
+	ADCQ t1, acc3
+
+	MULXQ ·p2+0x10(SB), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc3
+	ADCQ t1, acc4
+
+	MULXQ ·p2+0x18(SB), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc4
+	ADCQ t1, acc5
+	ADCQ $0, acc0
+
+	// x * y[2]
+	MOVQ (8*2)(y_ptr), DX
+	MULXQ (8*0)(x_ptr), AX, t1
+	ADDQ AX, acc2
+	ADCQ t1, acc3
+
+	MULXQ (8*1)(x_ptr), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc3
+	ADCQ t1, acc4
+
+	MULXQ (8*2)(x_ptr), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc4
+	ADCQ t1, acc5
+
+	MULXQ (8*3)(x_ptr), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc5
+	ADCQ t1, acc0
+	ADCQ $0, acc1  // acc1 now serves as the top limb of the running sum
+
+	// Third reduction step
+	MOVQ acc2, DX
+	MULXQ ·np+0x00(SB), DX, AX
+
+	MULXQ ·p2+0x00(SB), AX, t1
+	ADDQ AX, acc2
+	ADCQ t1, acc3
+
+	MULXQ ·p2+0x08(SB), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc3
+	ADCQ t1, acc4
+
+	MULXQ ·p2+0x10(SB), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc4
+	ADCQ t1, acc5
+
+	MULXQ ·p2+0x18(SB), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc5
+	ADCQ t1, acc0
+	ADCQ $0, acc1
+
+	// x * y[3]
+	MOVQ (8*3)(y_ptr), DX
+	MULXQ (8*0)(x_ptr), AX, t1
+	ADDQ AX, acc3
+	ADCQ t1, acc4
+
+	MULXQ (8*1)(x_ptr), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc4
+	ADCQ t1, acc5
+
+	MULXQ (8*2)(x_ptr), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc5
+	ADCQ t1, acc0
+
+	MULXQ (8*3)(x_ptr), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc0
+	ADCQ t1, acc1
+	ADCQ $0, acc2  // acc2 now serves as the top limb of the running sum
+
+	// Last reduction step
+	MOVQ acc3, DX
+	MULXQ ·np+0x00(SB), DX, AX
+
+	MULXQ ·p2+0x00(SB), AX, t1
+	ADDQ AX, acc3
+	ADCQ t1, acc4
+
+	MULXQ ·p2+0x08(SB), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc4
+	ADCQ t1, acc5
+
+	MULXQ ·p2+0x10(SB), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc5
+	ADCQ t1, acc0
+
+	MULXQ ·p2+0x18(SB), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc0
+	ADCQ t1, acc1
+	ADCQ $0, acc2
+	// Copy result [255:0]
+	MOVQ acc4, x_ptr  // x is not read again on this path, so x_ptr is reused as scratch
+	MOVQ acc5, acc3
+	MOVQ acc0, t0
+	MOVQ acc1, t1
+	// Subtract p2
+	SUBQ ·p2+0x00(SB), acc4
+	SBBQ ·p2+0x08(SB) ,acc5
+	SBBQ ·p2+0x10(SB), acc0
+	SBBQ ·p2+0x18(SB), acc1
+	SBBQ $0, acc2  // CF set after this line: the five-limb value (acc2 on top) was smaller than p2
+
+	CMOVQCS x_ptr, acc4  // on borrow, restore the saved (unsubtracted) limbs; no branch is taken either way
+	CMOVQCS acc3, acc5
+	CMOVQCS t0, acc0
+	CMOVQCS t1, acc1
+
+	MOVQ acc4, (8*0)(res_ptr)  // result limbs, least significant first: acc4, acc5, acc0, acc1
+	MOVQ acc5, (8*1)(res_ptr)
+	MOVQ acc0, (8*2)(res_ptr)
+	MOVQ acc1, (8*3)(res_ptr)
+
+	RET
+nobmi2Mul:
 	// x * y[0]
 	MOVQ (8*0)(y_ptr), t0
 
@@ -410,6 +617,195 @@ TEXT ·gfpSqr(SB),NOSPLIT,$0
 	MOVQ in+8(FP), x_ptr
 	MOVQ n+16(FP), BX
 
+	CMPB ·hasBMI2(SB), $0  // the MULXQ-based loop below runs only when the hasBMI2 byte is non-zero
+	JE gfpSqrLoop  // hasBMI2 == 0: use the loop at gfpSqrLoop instead
+
+gfpSqrLoopBMI2:
+	// y[1:] * y[0]
+	MOVQ (8*0)(x_ptr), DX  // DX is the implicit multiplicand of the MULXQ instructions that follow
+	MULXQ (8*1)(x_ptr), acc1, acc2  // acc2:acc1 = x[1] * x[0] (second operand = low half, third = high half)
+
+	MULXQ (8*2)(x_ptr), AX, acc3
+	ADDQ AX, acc2
+	ADCQ $0, acc3
+
+	MULXQ (8*3)(x_ptr), AX, acc4
+	ADDQ AX, acc3
+	ADCQ $0, acc4
+
+	// y[2:] * y[1]
+	MOVQ (8*1)(x_ptr), DX
+	MULXQ (8*2)(x_ptr), AX, t1
+	ADDQ AX, acc3
+	ADCQ t1, acc4
+
+	MULXQ (8*3)(x_ptr), AX, acc5
+	ADCQ $0, acc5  // MULXQ does not touch flags, so the carry of the ADCQ above is folded into the new high half
+	ADDQ AX, acc4
+	ADCQ $0, acc5
+
+	// y[3] * y[2]
+	MOVQ (8*2)(x_ptr), DX
+	MULXQ (8*3)(x_ptr), AX, y_ptr  // y_ptr is used as a data limb from here on
+	ADDQ AX, acc5
+	ADCQ $0, y_ptr
+
+	XORQ t1, t1  // t1 = 0; it receives the bit carried out of the doubling below
+	// *2
+	ADDQ acc1, acc1  // double the cross products held in acc1..acc5, y_ptr as one multi-limb value
+	ADCQ acc2, acc2
+	ADCQ acc3, acc3
+	ADCQ acc4, acc4
+	ADCQ acc5, acc5
+	ADCQ y_ptr, y_ptr
+	ADCQ $0, t1
+
+	// Missing products
+	MOVQ (8*0)(x_ptr), DX
+	MULXQ DX, acc0, t0  // t0:acc0 = x[0]^2
+	ADDQ t0, acc1
+
+	MOVQ (8*1)(x_ptr), DX  // MOVQ and MULXQ leave CF intact, so the carry chain continues across them
+	MULXQ DX, AX, t0
+	ADCQ AX, acc2
+	ADCQ t0, acc3
+
+	MOVQ (8*2)(x_ptr), DX
+	MULXQ DX, AX, t0
+	ADCQ AX, acc4
+	ADCQ t0, acc5
+
+	MOVQ (8*3)(x_ptr), DX
+	MULXQ DX, AX, x_ptr  // x_ptr becomes the top limb; it is reloaded from res_ptr at the end of the iteration
+	ADCQ AX, y_ptr
+	ADCQ t1, x_ptr
+
+	// First reduction step
+	MOVQ acc0, DX
+	MULXQ ·np+0x00(SB), DX, AX  // DX = low 64 bits of acc0 * np; the high half in AX is overwritten by the next MULXQ
+
+	MULXQ ·p2+0x00(SB), AX, t1
+	ADDQ AX, acc0 // acc0 += low half of DX * p2[0]; the carry goes into acc1 on the next line
+	ADCQ t1, acc1
+
+	MULXQ ·p2+0x08(SB), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc1
+	ADCQ t1, acc2
+
+	MULXQ ·p2+0x10(SB), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc2
+	ADCQ t1, acc3
+
+	MULXQ ·p2+0x18(SB), AX, acc0  // acc0 is overwritten and becomes the new top limb of the reduction
+	ADCQ $0, acc0
+	ADDQ AX, acc3
+	ADCQ $0, acc0
+
+	// Second reduction step
+	MOVQ acc1, DX
+	MULXQ ·np+0x00(SB), DX, AX
+
+	MULXQ ·p2+0x00(SB), AX, t1
+	ADDQ AX, acc1
+	ADCQ t1, acc2
+
+	MULXQ ·p2+0x08(SB), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc2
+	ADCQ t1, acc3
+
+	MULXQ ·p2+0x10(SB), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc3
+	ADCQ t1, acc0
+
+	MULXQ ·p2+0x18(SB), AX, acc1  // acc1 is overwritten and becomes the new top limb
+	ADCQ $0, acc1
+	ADDQ AX, acc0
+	ADCQ $0, acc1
+
+	// Third reduction step
+	MOVQ acc2, DX
+	MULXQ ·np+0x00(SB), DX, AX
+
+	MULXQ ·p2+0x00(SB), AX, t1
+	ADDQ AX, acc2
+	ADCQ t1, acc3
+
+	MULXQ ·p2+0x08(SB), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc3
+	ADCQ t1, acc0
+
+	MULXQ ·p2+0x10(SB), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc0
+	ADCQ t1, acc1
+
+	MULXQ ·p2+0x18(SB), AX, acc2  // acc2 is overwritten and becomes the new top limb
+	ADCQ $0, acc2
+	ADDQ AX, acc1
+	ADCQ $0, acc2
+
+	// Last reduction step
+	MOVQ acc3, DX
+	MULXQ ·np+0x00(SB), DX, AX
+
+	MULXQ ·p2+0x00(SB), AX, t1
+	ADDQ AX, acc3
+	ADCQ t1, acc0
+
+	MULXQ ·p2+0x08(SB), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc0
+	ADCQ t1, acc1
+
+	MULXQ ·p2+0x10(SB), AX, t1
+	ADCQ $0, t1
+	ADDQ AX, acc1
+	ADCQ t1, acc2
+
+	MULXQ ·p2+0x18(SB), AX, acc3  // acc3 is overwritten and becomes the new top limb
+	ADCQ $0, acc3
+	ADDQ AX, acc2
+	ADCQ $0, acc3
+
+	XORQ t0, t0  // t0 = 0; it receives the carry out of the four-limb addition below
+	// Add bits [511:256] of the sqr result
+	ADDQ acc4, acc0  // upper half of the square lives in acc4, acc5, y_ptr, x_ptr
+	ADCQ acc5, acc1
+	ADCQ y_ptr, acc2
+	ADCQ x_ptr, acc3
+	ADCQ $0, t0
+
+	MOVQ acc0, acc4  // keep a copy of the unsubtracted limbs for the conditional restore
+	MOVQ acc1, acc5
+	MOVQ acc2, y_ptr
+	MOVQ acc3, t1
+	// Subtract p2
+	SUBQ ·p2+0x00(SB), acc0
+	SBBQ ·p2+0x08(SB) ,acc1
+	SBBQ ·p2+0x10(SB), acc2
+	SBBQ ·p2+0x18(SB), acc3
+	SBBQ $0, t0  // CF set after this line: the five-limb value (t0 on top) was smaller than p2
+
+	CMOVQCS acc4, acc0  // on borrow, restore the saved limbs; no branch is taken either way
+	CMOVQCS acc5, acc1
+	CMOVQCS y_ptr, acc2
+	CMOVQCS t1, acc3
+
+	MOVQ acc0, (8*0)(res_ptr)
+	MOVQ acc1, (8*1)(res_ptr)
+	MOVQ acc2, (8*2)(res_ptr)
+	MOVQ acc3, (8*3)(res_ptr)
+	MOVQ res_ptr, x_ptr  // the next iteration squares the value that was just stored
+	DECQ BX  // BX was loaded from n+16(FP) before the loop
+	JNE gfpSqrLoopBMI2  // loop until BX reaches zero
+
+	RET
+
 gfpSqrLoop:
 
 	// y[1:] * y[0]
@@ -637,7 +1033,7 @@ gfpSqrLoop:
 
 	XORQ t0, t0
 	// Add bits [511:256] of the sqr result
-	ADCQ acc4, acc0
+	ADDQ acc4, acc0  // first add of the chain: ADDQ consumes no incoming carry (CF is clear after the XORQ above anyway)
 	ADCQ acc5, acc1
 	ADCQ y_ptr, acc2
 	ADCQ x_ptr, acc3