mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-26 04:06:18 +08:00
sm9/bn256: use ADCX ADOX together with MULX #132
This commit is contained in:
parent
ce489e2b4b
commit
0afaeb49eb
@ -136,198 +136,192 @@ TEXT ·gfpSub(SB),0,$0-24
|
|||||||
RET
|
RET
|
||||||
|
|
||||||
TEXT ·gfpMul(SB),0,$0-24
|
TEXT ·gfpMul(SB),0,$0-24
|
||||||
MOVQ res+0(FP), res_ptr
|
|
||||||
MOVQ in1+8(FP), x_ptr
|
MOVQ in1+8(FP), x_ptr
|
||||||
MOVQ in2+16(FP), y_ptr
|
MOVQ in2+16(FP), y_ptr
|
||||||
|
|
||||||
CMPB ·hasBMI2(SB), $0
|
CMPB ·supportADX(SB), $0
|
||||||
JE nobmi2Mul
|
JE noAdxMul
|
||||||
|
|
||||||
|
XORQ acc5, acc5
|
||||||
|
XORQ res_ptr, res_ptr
|
||||||
// x * y[0]
|
// x * y[0]
|
||||||
MOVQ (8*0)(y_ptr), DX
|
MOVQ (8*0)(y_ptr), DX
|
||||||
MULXQ (8*0)(x_ptr), acc0, acc1
|
MULXQ (8*0)(x_ptr), acc0, acc1
|
||||||
|
|
||||||
MULXQ (8*1)(x_ptr), AX, acc2
|
MULXQ (8*1)(x_ptr), AX, acc2
|
||||||
ADDQ AX, acc1
|
ADCXQ AX, acc1
|
||||||
ADCQ $0, acc2
|
|
||||||
|
|
||||||
MULXQ (8*2)(x_ptr), AX, acc3
|
MULXQ (8*2)(x_ptr), AX, acc3
|
||||||
ADDQ AX, acc2
|
ADCXQ AX, acc2
|
||||||
ADCQ $0, acc3
|
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, acc4
|
MULXQ (8*3)(x_ptr), AX, acc4
|
||||||
ADDQ AX, acc3
|
ADCXQ AX, acc3
|
||||||
ADCQ $0, acc4
|
ADCXQ acc5, acc4
|
||||||
|
|
||||||
XORQ acc5, acc5
|
|
||||||
|
|
||||||
// First reduction step
|
// First reduction step
|
||||||
MOVQ acc0, DX
|
MOVQ acc0, DX
|
||||||
MULXQ ·np+0x00(SB), DX, AX
|
MULXQ ·np+0x00(SB), DX, AX
|
||||||
|
|
||||||
MULXQ ·p2+0x00(SB), AX, t1
|
MULXQ ·p2+0x00(SB), AX, t0
|
||||||
ADDQ AX, acc0
|
ADOXQ AX, acc0
|
||||||
ADCQ t1, acc1
|
|
||||||
|
|
||||||
MULXQ ·p2+0x08(SB), AX, t1
|
MULXQ ·p2+0x08(SB), AX, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc1
|
ADOXQ AX, acc1
|
||||||
ADCQ t1, acc2
|
|
||||||
|
|
||||||
MULXQ ·p2+0x10(SB), AX, t1
|
MULXQ ·p2+0x10(SB), AX, t0
|
||||||
ADCQ $0, t1
|
ADCXQ t1, AX
|
||||||
ADDQ AX, acc2
|
ADOXQ AX, acc2
|
||||||
ADCQ t1, acc3
|
|
||||||
|
|
||||||
MULXQ ·p2+0x18(SB), AX, t1
|
MULXQ ·p2+0x18(SB), AX, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ t1, acc4
|
|
||||||
ADCQ $0, acc5
|
ADCXQ res_ptr, t1
|
||||||
|
ADOXQ t1, acc4
|
||||||
|
ADOXQ res_ptr, acc5
|
||||||
|
XORQ acc0, acc0
|
||||||
|
|
||||||
// x * y[1]
|
// x * y[1]
|
||||||
MOVQ (8*1)(y_ptr), DX
|
MOVQ (8*1)(y_ptr), DX
|
||||||
MULXQ (8*0)(x_ptr), AX, t1
|
MULXQ (8*0)(x_ptr), AX, t0
|
||||||
ADDQ AX, acc1
|
ADOXQ AX, acc1
|
||||||
ADCQ t1, acc2
|
|
||||||
|
|
||||||
MULXQ (8*1)(x_ptr), AX, t1
|
MULXQ (8*1)(x_ptr), AX, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc2
|
ADOXQ AX, acc2
|
||||||
ADCQ t1, acc3
|
|
||||||
|
|
||||||
MULXQ (8*2)(x_ptr), AX, t1
|
MULXQ (8*2)(x_ptr), AX, t0
|
||||||
ADCQ $0, t1
|
ADCXQ t1, AX
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ t1, acc4
|
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, t1
|
MULXQ (8*3)(x_ptr), AX, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc4
|
ADOXQ AX, acc4
|
||||||
ADCQ t1, acc5
|
|
||||||
ADCQ $0, acc0
|
ADCXQ acc0, t1
|
||||||
|
ADOXQ t1, acc5
|
||||||
|
ADOXQ res_ptr, acc0
|
||||||
|
|
||||||
// Second reduction step
|
// Second reduction step
|
||||||
MOVQ acc1, DX
|
MOVQ acc1, DX
|
||||||
MULXQ ·np+0x00(SB), DX, AX
|
MULXQ ·np+0x00(SB), DX, AX
|
||||||
|
|
||||||
MULXQ ·p2+0x00(SB), AX, t1
|
MULXQ ·p2+0x00(SB), AX, t0
|
||||||
ADDQ AX, acc1
|
ADOXQ AX, acc1
|
||||||
ADCQ t1, acc2
|
|
||||||
|
|
||||||
MULXQ ·p2+0x08(SB), AX, t1
|
MULXQ ·p2+0x08(SB), AX, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc2
|
ADOXQ AX, acc2
|
||||||
ADCQ t1, acc3
|
|
||||||
|
|
||||||
MULXQ ·p2+0x10(SB), AX, t1
|
MULXQ ·p2+0x10(SB), AX, t0
|
||||||
ADCQ $0, t1
|
ADCXQ t1, AX
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ t1, acc4
|
|
||||||
|
|
||||||
MULXQ ·p2+0x18(SB), AX, t1
|
MULXQ ·p2+0x18(SB), AX, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc4
|
ADOXQ AX, acc4
|
||||||
ADCQ t1, acc5
|
|
||||||
ADCQ $0, acc0
|
ADCXQ res_ptr, t1
|
||||||
|
ADOXQ t1, acc5
|
||||||
|
ADOXQ res_ptr, acc0
|
||||||
|
XORQ acc1, acc1
|
||||||
|
|
||||||
// x * y[2]
|
// x * y[2]
|
||||||
MOVQ (8*2)(y_ptr), DX
|
MOVQ (8*2)(y_ptr), DX
|
||||||
MULXQ (8*0)(x_ptr), AX, t1
|
MULXQ (8*0)(x_ptr), AX, t0
|
||||||
ADDQ AX, acc2
|
ADOXQ AX, acc2
|
||||||
ADCQ t1, acc3
|
|
||||||
|
|
||||||
MULXQ (8*1)(x_ptr), AX, t1
|
MULXQ (8*1)(x_ptr), AX, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ t1, acc4
|
|
||||||
|
|
||||||
MULXQ (8*2)(x_ptr), AX, t1
|
MULXQ (8*2)(x_ptr), AX, t0
|
||||||
ADCQ $0, t1
|
ADCXQ t1, AX
|
||||||
ADDQ AX, acc4
|
ADOXQ AX, acc4
|
||||||
ADCQ t1, acc5
|
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, t1
|
MULXQ (8*3)(x_ptr), AX, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc5
|
ADOXQ AX, acc5
|
||||||
ADCQ t1, acc0
|
|
||||||
ADCQ $0, acc1
|
ADCXQ res_ptr, t1
|
||||||
|
ADOXQ t1, acc0
|
||||||
|
ADOXQ res_ptr, acc1
|
||||||
|
|
||||||
// Third reduction step
|
// Third reduction step
|
||||||
MOVQ acc2, DX
|
MOVQ acc2, DX
|
||||||
MULXQ ·np+0x00(SB), DX, AX
|
MULXQ ·np+0x00(SB), DX, AX
|
||||||
|
|
||||||
MULXQ ·p2+0x00(SB), AX, t1
|
MULXQ ·p2+0x00(SB), AX, t0
|
||||||
ADDQ AX, acc2
|
ADOXQ AX, acc2
|
||||||
ADCQ t1, acc3
|
|
||||||
|
|
||||||
MULXQ ·p2+0x08(SB), AX, t1
|
MULXQ ·p2+0x08(SB), AX, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ t1, acc4
|
|
||||||
|
|
||||||
MULXQ ·p2+0x10(SB), AX, t1
|
MULXQ ·p2+0x10(SB), AX, t0
|
||||||
ADCQ $0, t1
|
ADCXQ t1, AX
|
||||||
ADDQ AX, acc4
|
ADOXQ AX, acc4
|
||||||
ADCQ t1, acc5
|
|
||||||
|
|
||||||
MULXQ ·p2+0x18(SB), AX, t1
|
MULXQ ·p2+0x18(SB), AX, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc5
|
ADOXQ AX, acc5
|
||||||
ADCQ t1, acc0
|
|
||||||
ADCQ $0, acc1
|
ADCXQ res_ptr, t1
|
||||||
|
ADOXQ t1, acc0
|
||||||
|
ADOXQ res_ptr, acc1
|
||||||
|
XORQ acc2, acc2
|
||||||
|
|
||||||
// x * y[3]
|
// x * y[3]
|
||||||
MOVQ (8*3)(y_ptr), DX
|
MOVQ (8*3)(y_ptr), DX
|
||||||
MULXQ (8*0)(x_ptr), AX, t1
|
MULXQ (8*0)(x_ptr), AX, t0
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ t1, acc4
|
|
||||||
|
|
||||||
MULXQ (8*1)(x_ptr), AX, t1
|
MULXQ (8*1)(x_ptr), AX, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc4
|
ADOXQ AX, acc4
|
||||||
ADCQ t1, acc5
|
|
||||||
|
|
||||||
MULXQ (8*2)(x_ptr), AX, t1
|
MULXQ (8*2)(x_ptr), AX, t0
|
||||||
ADCQ $0, t1
|
ADCXQ t1, AX
|
||||||
ADDQ AX, acc5
|
ADOXQ AX, acc5
|
||||||
ADCQ t1, acc0
|
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, t1
|
MULXQ (8*3)(x_ptr), AX, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc0
|
ADOXQ AX, acc0
|
||||||
ADCQ t1, acc1
|
|
||||||
ADCQ $0, acc2
|
ADCXQ res_ptr, t1
|
||||||
|
ADOXQ t1, acc1
|
||||||
|
ADOXQ res_ptr, acc2
|
||||||
|
|
||||||
// Last reduction step
|
// Last reduction step
|
||||||
MOVQ acc3, DX
|
MOVQ acc3, DX
|
||||||
MULXQ ·np+0x00(SB), DX, AX
|
MULXQ ·np+0x00(SB), DX, AX
|
||||||
|
|
||||||
MULXQ ·p2+0x00(SB), AX, t1
|
MULXQ ·p2+0x00(SB), AX, t0
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ t1, acc4
|
|
||||||
|
|
||||||
MULXQ ·p2+0x08(SB), AX, t1
|
MULXQ ·p2+0x08(SB), AX, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc4
|
ADOXQ AX, acc4
|
||||||
ADCQ t1, acc5
|
|
||||||
|
|
||||||
MULXQ ·p2+0x10(SB), AX, t1
|
MULXQ ·p2+0x10(SB), AX, t0
|
||||||
ADCQ $0, t1
|
ADCXQ t1, AX
|
||||||
ADDQ AX, acc5
|
ADOXQ AX, acc5
|
||||||
ADCQ t1, acc0
|
|
||||||
|
|
||||||
MULXQ ·p2+0x18(SB), AX, t1
|
MULXQ ·p2+0x18(SB), AX, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc0
|
ADOXQ AX, acc0
|
||||||
ADCQ t1, acc1
|
|
||||||
ADCQ $0, acc2
|
ADCXQ res_ptr, t1
|
||||||
|
ADOXQ t1, acc1
|
||||||
|
ADOXQ res_ptr, acc2
|
||||||
// Copy result [255:0]
|
// Copy result [255:0]
|
||||||
gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,t1,acc2)
|
gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,t1,acc2)
|
||||||
|
MOVQ res+0(FP), res_ptr
|
||||||
storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr))
|
storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr))
|
||||||
|
|
||||||
RET
|
RET
|
||||||
nobmi2Mul:
|
|
||||||
|
noAdxMul:
|
||||||
// x * y[0]
|
// x * y[0]
|
||||||
MOVQ (8*0)(y_ptr), t0
|
MOVQ (8*0)(y_ptr), t0
|
||||||
|
|
||||||
@ -588,6 +582,7 @@ nobmi2Mul:
|
|||||||
ADCQ $0, acc2
|
ADCQ $0, acc2
|
||||||
// Copy result [255:0]
|
// Copy result [255:0]
|
||||||
gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,t1,acc2)
|
gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,t1,acc2)
|
||||||
|
MOVQ res+0(FP), res_ptr
|
||||||
storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr))
|
storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr))
|
||||||
|
|
||||||
RET
|
RET
|
||||||
@ -598,175 +593,171 @@ TEXT ·gfpSqr(SB),NOSPLIT,$0
|
|||||||
MOVQ in+8(FP), x_ptr
|
MOVQ in+8(FP), x_ptr
|
||||||
MOVQ n+16(FP), BX
|
MOVQ n+16(FP), BX
|
||||||
|
|
||||||
CMPB ·hasBMI2(SB), $0
|
CMPB ·supportADX(SB), $0
|
||||||
JE gfpSqrLoop
|
JE gfpSqrLoop
|
||||||
|
|
||||||
gfpSqrLoopBMI2:
|
gfpSqrLoopAdx:
|
||||||
|
XORQ acc0, acc0
|
||||||
|
XORQ y_ptr, y_ptr
|
||||||
// y[1:] * y[0]
|
// y[1:] * y[0]
|
||||||
MOVQ (8*0)(x_ptr), DX
|
MOVQ (8*0)(x_ptr), DX
|
||||||
MULXQ (8*1)(x_ptr), acc1, acc2
|
MULXQ (8*1)(x_ptr), acc1, acc2
|
||||||
|
|
||||||
MULXQ (8*2)(x_ptr), AX, acc3
|
MULXQ (8*2)(x_ptr), AX, acc3
|
||||||
ADDQ AX, acc2
|
ADOXQ AX, acc2
|
||||||
ADCQ $0, acc3
|
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, acc4
|
MULXQ (8*3)(x_ptr), AX, acc4
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ $0, acc4
|
ADOXQ y_ptr, acc4
|
||||||
|
|
||||||
// y[2:] * y[1]
|
// y[2:] * y[1]
|
||||||
MOVQ (8*1)(x_ptr), DX
|
MOVQ (8*1)(x_ptr), DX
|
||||||
MULXQ (8*2)(x_ptr), AX, t1
|
MULXQ (8*2)(x_ptr), AX, t1
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ t1, acc4
|
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, acc5
|
MULXQ (8*3)(x_ptr), AX, acc5
|
||||||
ADCQ $0, acc5
|
ADCXQ t1, AX
|
||||||
ADDQ AX, acc4
|
ADOXQ AX, acc4
|
||||||
ADCQ $0, acc5
|
ADCXQ y_ptr, acc5
|
||||||
|
|
||||||
// y[3] * y[2]
|
// y[3] * y[2]
|
||||||
MOVQ (8*2)(x_ptr), DX
|
MOVQ (8*2)(x_ptr), DX
|
||||||
MULXQ (8*3)(x_ptr), AX, y_ptr
|
MULXQ (8*3)(x_ptr), AX, y_ptr
|
||||||
ADDQ AX, acc5
|
ADOXQ AX, acc5
|
||||||
ADCQ $0, y_ptr
|
ADOXQ acc0, y_ptr
|
||||||
|
|
||||||
XORQ t1, t1
|
XORQ t1, t1
|
||||||
// *2
|
// *2
|
||||||
ADDQ acc1, acc1
|
ADOXQ acc1, acc1
|
||||||
ADCQ acc2, acc2
|
ADOXQ acc2, acc2
|
||||||
ADCQ acc3, acc3
|
ADOXQ acc3, acc3
|
||||||
ADCQ acc4, acc4
|
ADOXQ acc4, acc4
|
||||||
ADCQ acc5, acc5
|
ADOXQ acc5, acc5
|
||||||
ADCQ y_ptr, y_ptr
|
ADOXQ y_ptr, y_ptr
|
||||||
ADCQ $0, t1
|
ADOXQ acc0, t1
|
||||||
|
|
||||||
// Missing products
|
// Missing products
|
||||||
MOVQ (8*0)(x_ptr), DX
|
MOVQ (8*0)(x_ptr), DX
|
||||||
MULXQ DX, acc0, t0
|
MULXQ DX, acc0, t0
|
||||||
ADDQ t0, acc1
|
ADCXQ t0, acc1
|
||||||
|
|
||||||
MOVQ (8*1)(x_ptr), DX
|
MOVQ (8*1)(x_ptr), DX
|
||||||
MULXQ DX, AX, t0
|
MULXQ DX, AX, t0
|
||||||
ADCQ AX, acc2
|
ADCXQ AX, acc2
|
||||||
ADCQ t0, acc3
|
ADCXQ t0, acc3
|
||||||
|
|
||||||
MOVQ (8*2)(x_ptr), DX
|
MOVQ (8*2)(x_ptr), DX
|
||||||
MULXQ DX, AX, t0
|
MULXQ DX, AX, t0
|
||||||
ADCQ AX, acc4
|
ADCXQ AX, acc4
|
||||||
ADCQ t0, acc5
|
ADCXQ t0, acc5
|
||||||
|
|
||||||
MOVQ (8*3)(x_ptr), DX
|
MOVQ (8*3)(x_ptr), DX
|
||||||
MULXQ DX, AX, x_ptr
|
MULXQ DX, AX, x_ptr
|
||||||
ADCQ AX, y_ptr
|
ADCXQ AX, y_ptr
|
||||||
ADCQ t1, x_ptr
|
ADCXQ t1, x_ptr
|
||||||
|
|
||||||
// First reduction step
|
// First reduction step
|
||||||
MOVQ acc0, DX
|
MOVQ acc0, DX
|
||||||
MULXQ ·np+0x00(SB), DX, AX
|
MULXQ ·np+0x00(SB), DX, AX
|
||||||
|
|
||||||
MULXQ ·p2+0x00(SB), AX, t1
|
MULXQ ·p2+0x00(SB), AX, t0
|
||||||
ADDQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0
|
ADOXQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0
|
||||||
ADCQ t1, acc1
|
|
||||||
|
|
||||||
MULXQ ·p2+0x08(SB), AX, t1
|
MULXQ ·p2+0x08(SB), AX, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc1
|
ADOXQ AX, acc1
|
||||||
ADCQ t1, acc2
|
|
||||||
|
|
||||||
MULXQ ·p2+0x10(SB), AX, t1
|
MULXQ ·p2+0x10(SB), AX, t0
|
||||||
ADCQ $0, t1
|
ADCXQ t1, AX
|
||||||
ADDQ AX, acc2
|
ADOXQ AX, acc2
|
||||||
ADCQ t1, acc3
|
|
||||||
|
|
||||||
MULXQ ·p2+0x18(SB), AX, acc0
|
MULXQ ·p2+0x18(SB), AX, acc0
|
||||||
ADCQ $0, acc0
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ $0, acc0
|
MOVQ $0, t0
|
||||||
|
ADCXQ t0, acc0
|
||||||
|
ADOXQ t0, acc0
|
||||||
|
|
||||||
// Second reduction step
|
// Second reduction step
|
||||||
MOVQ acc1, DX
|
MOVQ acc1, DX
|
||||||
MULXQ ·np+0x00(SB), DX, AX
|
MULXQ ·np+0x00(SB), DX, AX
|
||||||
|
|
||||||
MULXQ ·p2+0x00(SB), AX, t1
|
MULXQ ·p2+0x00(SB), AX, t0
|
||||||
ADDQ AX, acc1
|
ADOXQ AX, acc1
|
||||||
ADCQ t1, acc2
|
|
||||||
|
|
||||||
MULXQ ·p2+0x08(SB), AX, t1
|
MULXQ ·p2+0x08(SB), AX, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc2
|
ADOXQ AX, acc2
|
||||||
ADCQ t1, acc3
|
|
||||||
|
|
||||||
MULXQ ·p2+0x10(SB), AX, t1
|
MULXQ ·p2+0x10(SB), AX, t0
|
||||||
ADCQ $0, t1
|
ADCXQ t1, AX
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ t1, acc0
|
|
||||||
|
|
||||||
MULXQ ·p2+0x18(SB), AX, acc1
|
MULXQ ·p2+0x18(SB), AX, acc1
|
||||||
ADCQ $0, acc1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc0
|
ADOXQ AX, acc4
|
||||||
ADCQ $0, acc1
|
MOVQ $0, t0
|
||||||
|
ADCXQ t0, acc1
|
||||||
|
ADOXQ t0, acc1
|
||||||
|
|
||||||
// Third reduction step
|
// Third reduction step
|
||||||
MOVQ acc2, DX
|
MOVQ acc2, DX
|
||||||
MULXQ ·np+0x00(SB), DX, AX
|
MULXQ ·np+0x00(SB), DX, AX
|
||||||
|
|
||||||
MULXQ ·p2+0x00(SB), AX, t1
|
MULXQ ·p2+0x00(SB), AX, t0
|
||||||
ADDQ AX, acc2
|
ADOXQ AX, acc2
|
||||||
ADCQ t1, acc3
|
|
||||||
|
|
||||||
MULXQ ·p2+0x08(SB), AX, t1
|
MULXQ ·p2+0x08(SB), AX, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ t1, acc0
|
|
||||||
|
|
||||||
MULXQ ·p2+0x10(SB), AX, t1
|
MULXQ ·p2+0x10(SB), AX, t0
|
||||||
ADCQ $0, t1
|
ADCXQ t1, AX
|
||||||
ADDQ AX, acc0
|
ADOXQ AX, acc0
|
||||||
ADCQ t1, acc1
|
|
||||||
|
|
||||||
MULXQ ·p2+0x18(SB), AX, acc2
|
MULXQ ·p2+0x18(SB), AX, acc2
|
||||||
ADCQ $0, acc2
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc1
|
ADOXQ AX, acc1
|
||||||
ADCQ $0, acc2
|
MOVQ $0, t0
|
||||||
|
ADCXQ t0, acc2
|
||||||
|
ADOXQ t0, acc2
|
||||||
|
|
||||||
// Last reduction step
|
// Last reduction step
|
||||||
MOVQ acc3, DX
|
MOVQ acc3, DX
|
||||||
MULXQ ·np+0x00(SB), DX, AX
|
MULXQ ·np+0x00(SB), DX, AX
|
||||||
|
|
||||||
MULXQ ·p2+0x00(SB), AX, t1
|
MULXQ ·p2+0x00(SB), AX, t0
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ t1, acc0
|
|
||||||
|
|
||||||
MULXQ ·p2+0x08(SB), AX, t1
|
MULXQ ·p2+0x08(SB), AX, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc0
|
ADOXQ AX, acc0
|
||||||
ADCQ t1, acc1
|
|
||||||
|
|
||||||
MULXQ ·p2+0x10(SB), AX, t1
|
MULXQ ·p2+0x10(SB), AX, t0
|
||||||
ADCQ $0, t1
|
ADCXQ t1, AX
|
||||||
ADDQ AX, acc1
|
ADOXQ AX, acc1
|
||||||
ADCQ t1, acc2
|
|
||||||
|
|
||||||
MULXQ ·p2+0x18(SB), AX, acc3
|
MULXQ ·p2+0x18(SB), AX, acc3
|
||||||
ADCQ $0, acc3
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc2
|
ADOXQ AX, acc2
|
||||||
ADCQ $0, acc3
|
MOVQ $0, t0
|
||||||
|
ADCXQ t0, acc3
|
||||||
|
ADOXQ t0, acc3
|
||||||
|
|
||||||
XORQ t0, t0
|
XORQ t1, t1
|
||||||
// Add bits [511:256] of the sqr result
|
// Add bits [511:256] of the sqr result
|
||||||
ADDQ acc4, acc0
|
ADCXQ acc4, acc0
|
||||||
ADCQ acc5, acc1
|
ADCXQ acc5, acc1
|
||||||
ADCQ y_ptr, acc2
|
ADCXQ y_ptr, acc2
|
||||||
ADCQ x_ptr, acc3
|
ADCXQ x_ptr, acc3
|
||||||
ADCQ $0, t0
|
ADCXQ t1, t0
|
||||||
|
|
||||||
gfpCarry(acc0,acc1,acc2,acc3, acc4,acc5,y_ptr,t1,t0)
|
gfpCarry(acc0,acc1,acc2,acc3, acc4,acc5,y_ptr,t1,t0)
|
||||||
storeBlock(acc0,acc1,acc2,acc3, 0(res_ptr))
|
storeBlock(acc0,acc1,acc2,acc3, 0(res_ptr))
|
||||||
|
|
||||||
MOVQ res_ptr, x_ptr
|
MOVQ res_ptr, x_ptr
|
||||||
DECQ BX
|
DECQ BX
|
||||||
JNE gfpSqrLoopBMI2
|
JNE gfpSqrLoopAdx
|
||||||
|
|
||||||
RET
|
RET
|
||||||
|
|
||||||
|
@ -10,7 +10,10 @@ import (
|
|||||||
"golang.org/x/sys/cpu"
|
"golang.org/x/sys/cpu"
|
||||||
)
|
)
|
||||||
|
|
||||||
var hasBMI2 = cpu.X86.HasBMI2
|
// amd64 assembly uses ADCX/ADOX/MULX if ADX is available to run two carry
|
||||||
|
// chains in the flags in parallel across the whole operation, and aggressively
|
||||||
|
// unrolls loops. arm64 processes four words at a time.
|
||||||
|
var supportADX = cpu.X86.HasADX && cpu.X86.HasBMI2
|
||||||
|
|
||||||
// Set c = p - a, if c == p, then c = 0
|
// Set c = p - a, if c == p, then c = 0
|
||||||
//
|
//
|
||||||
|
@ -134,5 +134,15 @@ func gfpSqr(res, in *gfP, n int) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func gfpFromMont(res, in *gfP) {
|
func gfpFromMont(res, in *gfP) {
|
||||||
gfpMul(res, in, &gfP{1})
|
var T [8]uint64
|
||||||
|
var carry uint64
|
||||||
|
copy(T[:], in[:])
|
||||||
|
for i := 0; i < 4; i++ {
|
||||||
|
Y := T[i] * np[0]
|
||||||
|
c2 := addMulVVW(T[i:4+i], p2[:], Y)
|
||||||
|
T[4+i], carry = bits.Add64(uint64(0), c2, carry)
|
||||||
|
}
|
||||||
|
|
||||||
|
*res = gfP{T[4], T[5], T[6], T[7]}
|
||||||
|
gfpCarry(res, carry)
|
||||||
}
|
}
|
||||||
|
@ -48,6 +48,28 @@ func Test_gfpBasicOperations(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func Test_gfpSqr(t *testing.T) {
|
||||||
|
// p - 1
|
||||||
|
pMinusOne := new(big.Int).Sub(p, big.NewInt(1))
|
||||||
|
x := fromBigInt(pMinusOne)
|
||||||
|
ret := &gfP{}
|
||||||
|
gfpSqr(ret, x, 1)
|
||||||
|
pMinusOne.Mul(pMinusOne, pMinusOne)
|
||||||
|
pMinusOne.Mod(pMinusOne, p)
|
||||||
|
if *ret != *fromBigInt(pMinusOne) {
|
||||||
|
t.Errorf("bad sqr")
|
||||||
|
}
|
||||||
|
// p + 1
|
||||||
|
pPlusOne := new(big.Int).Add(p, big.NewInt(1))
|
||||||
|
x = fromBigInt(pPlusOne)
|
||||||
|
gfpSqr(ret, x, 1)
|
||||||
|
pPlusOne.Mul(pPlusOne, pPlusOne)
|
||||||
|
pPlusOne.Mod(pPlusOne, p)
|
||||||
|
if *ret != *fromBigInt(pPlusOne) {
|
||||||
|
t.Errorf("bad sqr")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestFromMont(t *testing.T) {
|
func TestFromMont(t *testing.T) {
|
||||||
x := fromBigInt(bigFromHex("85AEF3D078640C98597B6027B441A01FF1DD2C190F5E93C454806C11D8806141"))
|
x := fromBigInt(bigFromHex("85AEF3D078640C98597B6027B441A01FF1DD2C190F5E93C454806C11D8806141"))
|
||||||
ret1, ret2 := &gfP{}, &gfP{}
|
ret1, ret2 := &gfP{}, &gfP{}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user