MAGIC - optimize p256Mul p256FromMont

emmansun 2021-02-12 14:54:21 +08:00
parent aa6b436714
commit 371ea2f4b3

@@ -388,26 +388,24 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
 	MOVQ DX, acc4
 	XORQ acc5, acc5
 	// First reduction step
-	MOVQ p256p<>+0x08(SB), AX
-	MULQ acc0
+	MOVQ acc0, AX
+	MOVQ acc0, DX
+	SHLQ $32, AX
+	SHRQ $32, DX
 	ADDQ acc0, acc1
-	ADCQ $0, DX
-	ADDQ AX, acc1
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x010(SB), AX
-	MULQ acc0
-	ADDQ t1, acc2
-	ADCQ $0, DX
-	ADDQ AX, acc2
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x018(SB), AX
-	MULQ acc0
-	ADDQ t1, acc3
-	ADCQ $0, DX
-	ADDQ AX, acc3
-	ADCQ DX, acc4
+	ADCQ $0, acc2
+	SUBQ AX, acc1
+	SBBQ DX, acc2
+	MOVQ acc0, t1
+	ADDQ acc0, acc3
+	ADCQ $0, t1
+	SUBQ AX, acc3
+	SBBQ DX, t1
+	SUBQ acc0, acc3
+	SBBQ $0, t1
+	ADDQ t1, acc4
 	ADCQ $0, acc5
 	XORQ acc0, acc0
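
In the original step, each upper word of the prime is loaded from p256p<> and multiplied by the low accumulator word with MULQ. The rewrite drops those multiplications. Assuming the constants at p256p<>+0x08, +0x010 and +0x018 are the SM2 prime limbs 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF and 0xFFFFFFFEFFFFFFFF (the diff never spells them out), a product such as m * 0xFFFFFFFF00000000 is just (m<<64) - (m<<32), so one SHLQ/SHRQ pair plus subtract-with-borrow replaces a MULQ. A minimal Go sketch of that identity, under this assumption:

package main

import (
	"fmt"
	"math/bits"
)

// mulByP1 computes m * 0xFFFFFFFF00000000 with a real 64x64->128 multiply,
// mirroring the old MULQ-based step.
func mulByP1(m uint64) (hi, lo uint64) {
	return bits.Mul64(m, 0xFFFFFFFF00000000)
}

// mulByP1Shift computes the same 128-bit product as (m<<64) - (m<<32),
// mirroring the new SHLQ/SHRQ + SUBQ/SBBQ sequence: the subtrahend m<<32
// spans two words, m<<32 (low) and m>>32 (high).
func mulByP1Shift(m uint64) (hi, lo uint64) {
	lo, borrow := bits.Sub64(0, m<<32, 0)
	hi, _ = bits.Sub64(m, m>>32, borrow)
	return hi, lo
}

func main() {
	for _, m := range []uint64{0, 1, 0xDEADBEEF, 1 << 63, ^uint64(0)} {
		h1, l1 := mulByP1(m)
		h2, l2 := mulByP1Shift(m)
		fmt.Println(h1 == h2 && l1 == l2) // prints true for every m
	}
}
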
@@ -444,26 +442,24 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
 	ADCQ DX, acc5
 	ADCQ $0, acc0
 	// Second reduction step
-	MOVQ p256p<>+0x08(SB), AX
-	MULQ acc1
+	MOVQ acc1, AX
+	MOVQ acc1, DX
+	SHLQ $32, AX
+	SHRQ $32, DX
 	ADDQ acc1, acc2
-	ADCQ $0, DX
-	ADDQ AX, acc2
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x010(SB), AX
-	MULQ acc1
-	ADDQ t1, acc3
-	ADCQ $0, DX
-	ADDQ AX, acc3
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x018(SB), AX
-	MULQ acc1
-	ADDQ t1, acc4
-	ADCQ $0, DX
-	ADDQ AX, acc4
-	ADCQ DX, acc5
+	ADCQ $0, acc3
+	SUBQ AX, acc2
+	SBBQ DX, acc3
+	MOVQ acc1, t1
+	ADDQ acc1, acc4
+	ADCQ $0, t1
+	SUBQ AX, acc4
+	SBBQ DX, t1
+	SUBQ acc1, acc4
+	SBBQ $0, t1
+	ADDQ t1, acc5
 	ADCQ $0, acc0
 	XORQ acc1, acc1
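
Every reduction step, old and new, also leans on the low limb of the prime being 0xFFFFFFFFFFFFFFFF (again an assumption about p256p<>, consistent with the SM2 prime): then p is congruent to -1 mod 2^64 and -p^-1 mod 2^64 equals 1, so the word being eliminated (acc0, acc1, ...) can serve directly as the Montgomery factor, and the kept ADDQ acc1, acc2 line accounts for acc1 + acc1*p[0] = acc1*2^64. A quick math/big check of that assumption:

package main

import (
	"fmt"
	"math/big"
)

func main() {
	// Assumed SM2 prime; the diff only references it through p256p<>+0x08 etc.
	p, _ := new(big.Int).SetString(
		"FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
	r := new(big.Int).Lsh(big.NewInt(1), 64) // 2^64

	// n0 = -p^-1 mod 2^64. When n0 == 1, the Montgomery factor of a
	// reduction step is just the accumulator word being eliminated,
	// so no extra multiplication is needed to derive it.
	n0 := new(big.Int).ModInverse(p, r)
	n0.Neg(n0)
	n0.Mod(n0, r)
	fmt.Println(n0) // 1
}
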
@@ -500,26 +496,24 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
 	ADCQ DX, acc0
 	ADCQ $0, acc1
 	// Third reduction step
-	MOVQ p256p<>+0x08(SB), AX
-	MULQ acc2
+	MOVQ acc2, AX
+	MOVQ acc2, DX
+	SHLQ $32, AX
+	SHRQ $32, DX
 	ADDQ acc2, acc3
-	ADCQ $0, DX
-	ADDQ AX, acc3
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x010(SB), AX
-	MULQ acc2
-	ADDQ t1, acc4
-	ADCQ $0, DX
-	ADDQ AX, acc4
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x018(SB), AX
-	MULQ acc2
-	ADDQ t1, acc5
-	ADCQ $0, DX
-	ADDQ AX, acc5
-	ADCQ DX, acc0
+	ADCQ $0, acc4
+	SUBQ AX, acc3
+	SBBQ DX, acc4
+	MOVQ acc2, t1
+	ADDQ acc2, acc5
+	ADCQ $0, t1
+	SUBQ AX, acc5
+	SBBQ DX, t1
+	SUBQ acc2, acc5
+	SBBQ $0, t1
+	ADDQ t1, acc0
 	ADCQ $0, acc1
 	XORQ acc2, acc2
 	// x * y[3]
@@ -555,26 +549,24 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
 	ADCQ DX, acc1
 	ADCQ $0, acc2
 	// Last reduction step
-	MOVQ p256p<>+0x08(SB), AX
-	MULQ acc3
+	MOVQ acc3, AX
+	MOVQ acc3, DX
+	SHLQ $32, AX
+	SHRQ $32, DX
 	ADDQ acc3, acc4
-	ADCQ $0, DX
-	ADDQ AX, acc4
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x010(SB), AX
-	MULQ acc3
-	ADDQ t1, acc5
-	ADCQ $0, DX
-	ADDQ AX, acc5
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x018(SB), AX
-	MULQ acc3
-	ADDQ t1, acc0
-	ADCQ $0, DX
-	ADDQ AX, acc0
-	ADCQ DX, acc1
+	ADCQ $0, acc5
+	SUBQ AX, acc4
+	SBBQ DX, acc5
+	MOVQ acc3, t1
+	ADDQ acc3, acc0
+	ADCQ $0, t1
+	SUBQ AX, acc0
+	SBBQ DX, t1
+	SUBQ acc3, acc0
+	SBBQ $0, t1
+	ADDQ t1, acc1
 	ADCQ $0, acc2
 	// Copy result [255:0]
 	MOVQ acc4, x_ptr
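
The second half of each rewritten step handles the top prime limb. Under the same assumption that p256p<>+0x018 holds 0xFFFFFFFEFFFFFFFF, the product m * 0xFFFFFFFEFFFFFFFF equals (m<<64) - (m<<32) - m, which is why the new sequence carries one more borrowing subtraction of the step's low word (SUBQ acc3, acc0 / SBBQ $0, t1 in this last step) than the first half does. A Go sketch of that identity:

package main

import (
	"fmt"
	"math/bits"
)

// mulByP3 computes m * 0xFFFFFFFEFFFFFFFF with a 64x64->128 multiply.
func mulByP3(m uint64) (hi, lo uint64) {
	return bits.Mul64(m, 0xFFFFFFFEFFFFFFFF)
}

// mulByP3Shift computes the same product as (m<<64) - (m<<32) - m,
// i.e. the first-half shift trick plus one extra borrowing subtraction of m.
func mulByP3Shift(m uint64) (hi, lo uint64) {
	lo, b := bits.Sub64(0, m<<32, 0) // subtract the low word of m<<32 ...
	hi, _ = bits.Sub64(m, m>>32, b)  // ... and its high word, from m<<64
	lo, b = bits.Sub64(lo, m, 0)     // the extra -m
	hi, _ = bits.Sub64(hi, 0, b)
	return hi, lo
}

func main() {
	for _, m := range []uint64{0, 1, 0x123456789ABCDEF0, ^uint64(0)} {
		h1, l1 := mulByP3(m)
		h2, l2 := mulByP3Shift(m)
		fmt.Println(h1 == h2 && l1 == l2) // prints true for every m
	}
}
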
@@ -613,93 +605,85 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
 	// Only reduce, no multiplications are needed
 	// First stage
-	MOVQ p256p<>+0x08(SB), AX
-	MULQ acc0
+	MOVQ acc0, AX
+	MOVQ acc0, DX
+	SHLQ $32, AX
+	SHRQ $32, DX
 	ADDQ acc0, acc1
-	ADCQ $0, DX
-	ADDQ AX, acc1
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x010(SB), AX
-	MULQ acc0
-	ADDQ t1, acc2
-	ADCQ $0, DX
-	ADDQ AX, acc2
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x018(SB), AX
-	MULQ acc0
-	ADDQ t1, acc3
-	ADCQ $0, DX
-	ADDQ AX, acc3
-	ADCQ DX, acc4
+	ADCQ $0, acc2
+	SUBQ AX, acc1
+	SBBQ DX, acc2
+	MOVQ acc0, t1
+	ADDQ acc0, acc3
+	ADCQ $0, t1
+	SUBQ AX, acc3
+	SBBQ DX, t1
+	SUBQ acc0, acc3
+	SBBQ $0, t1
+	ADDQ t1, acc4
 	XORQ acc5, acc5
 	// Second stage
-	MOVQ p256p<>+0x08(SB), AX
-	MULQ acc1
+	MOVQ acc1, AX
+	MOVQ acc1, DX
+	SHLQ $32, AX
+	SHRQ $32, DX
 	ADDQ acc1, acc2
-	ADCQ $0, DX
-	ADDQ AX, acc2
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x010(SB), AX
-	MULQ acc1
-	ADDQ t1, acc3
-	ADCQ $0, DX
-	ADDQ AX, acc3
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x018(SB), AX
-	MULQ acc1
-	ADDQ t1, acc4
-	ADCQ $0, DX
-	ADDQ AX, acc4
-	ADCQ DX, acc5
+	ADCQ $0, acc3
+	SUBQ AX, acc2
+	SBBQ DX, acc3
+	MOVQ acc1, t1
+	ADDQ acc1, acc4
+	ADCQ $0, t1
+	SUBQ AX, acc4
+	SBBQ DX, t1
+	SUBQ acc1, acc4
+	SBBQ $0, t1
+	ADDQ t1, acc5
 	XORQ acc0, acc0
 	// Third stage
-	MOVQ p256p<>+0x08(SB), AX
-	MULQ acc2
+	MOVQ acc2, AX
+	MOVQ acc2, DX
+	SHLQ $32, AX
+	SHRQ $32, DX
 	ADDQ acc2, acc3
-	ADCQ $0, DX
-	ADDQ AX, acc3
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x010(SB), AX
-	MULQ acc2
-	ADDQ t1, acc4
-	ADCQ $0, DX
-	ADDQ AX, acc4
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x018(SB), AX
-	MULQ acc2
-	ADDQ t1, acc5
-	ADCQ $0, DX
-	ADDQ AX, acc5
-	ADCQ DX, acc0
+	ADCQ $0, acc4
+	SUBQ AX, acc3
+	SBBQ DX, acc4
+	MOVQ acc2, t1
+	ADDQ acc2, acc5
+	ADCQ $0, t1
+	SUBQ AX, acc5
+	SBBQ DX, t1
+	SUBQ acc2, acc5
+	SBBQ $0, t1
+	ADDQ t1, acc0
 	XORQ acc1, acc1
 	// Last stage
-	MOVQ p256p<>+0x08(SB), AX
-	MULQ acc3
+	MOVQ acc3, AX
+	MOVQ acc3, DX
+	SHLQ $32, AX
+	SHRQ $32, DX
 	ADDQ acc3, acc4
-	ADCQ $0, DX
-	ADDQ AX, acc4
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x010(SB), AX
-	MULQ acc3
-	ADDQ t1, acc5
-	ADCQ $0, DX
-	ADDQ AX, acc5
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x018(SB), AX
-	MULQ acc3
-	ADDQ t1, acc0
-	ADCQ $0, DX
-	ADDQ AX, acc0
-	ADCQ DX, acc1
+	ADCQ $0, acc5
+	SUBQ AX, acc4
+	SBBQ DX, acc5
+	MOVQ acc3, t1
+	ADDQ acc3, acc0
+	ADCQ $0, t1
+	SUBQ AX, acc0
+	SBBQ DX, t1
+	SUBQ acc3, acc0
+	SBBQ $0, t1
+	ADDQ t1, acc1
 	MOVQ acc4, x_ptr
 	MOVQ acc5, acc3
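
p256FromMont applies the same four reduction stages with no multiplications at all, which amounts to computing x * R^-1 mod p for R = 2^256. A change like this is easy to cross-check against a math/big reference; the prime below is again the assumed SM2 value rather than something stated in the diff:

package main

import (
	"fmt"
	"math/big"
)

// Assumed SM2 prime (the diff loads it from p256p<> and never spells it out).
var p, _ = new(big.Int).SetString(
	"FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)

// fromMontRef converts x out of Montgomery form the slow, obviously correct
// way: multiply by R^-1 = 2^-256 mod p. A test can compare the limbs produced
// by the assembly routine against this value.
func fromMontRef(x *big.Int) *big.Int {
	rInv := new(big.Int).ModInverse(new(big.Int).Lsh(big.NewInt(1), 256), p)
	return new(big.Int).Mod(new(big.Int).Mul(x, rInv), p)
}

func main() {
	// The Montgomery form of 1 is R mod p, so converting it back must give 1.
	rModP := new(big.Int).Mod(new(big.Int).Lsh(big.NewInt(1), 256), p)
	fmt.Println(fromMontRef(rModP)) // 1
}
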