Optimize the SM2 P-256 amd64 implementation: reduce the number of multiplications

This commit is contained in:
Emman 2021-12-27 08:54:56 +08:00
parent 4ff0c4547f
commit 381476a913

View File

@ -848,24 +848,20 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
ADDQ t1, acc1 ADDQ t1, acc1
ADCQ $0, DX ADCQ $0, DX
ADDQ AX, acc1 ADDQ AX, acc1
ADCQ $0, DX ADCQ DX, acc2
MOVQ DX, t1 ADCQ $0, acc3
ADCQ t0, acc4
MOVQ p256ord<>+0x10(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256ord<>+0x18(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ DX, acc4
ADCQ $0, acc5 ADCQ $0, acc5
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc2
SBBQ AX, acc3
SBBQ DX, acc4
SBBQ $0, acc5
// x * y[1] // x * y[1]
MOVQ (8*1)(y_ptr), t0 MOVQ (8*1)(y_ptr), t0
@ -914,24 +910,20 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
ADDQ t1, acc2 ADDQ t1, acc2
ADCQ $0, DX ADCQ $0, DX
ADDQ AX, acc2 ADDQ AX, acc2
ADCQ $0, DX ADCQ DX, acc3
MOVQ DX, t1 ADCQ $0, acc4
ADCQ t0, acc5
MOVQ p256ord<>+0x10(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256ord<>+0x18(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ DX, acc5
ADCQ $0, acc0 ADCQ $0, acc0
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc3
SBBQ AX, acc4
SBBQ DX, acc5
SBBQ $0, acc0
// x * y[2] // x * y[2]
MOVQ (8*2)(y_ptr), t0 MOVQ (8*2)(y_ptr), t0
@ -980,24 +972,20 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
ADDQ t1, acc3 ADDQ t1, acc3
ADCQ $0, DX ADCQ $0, DX
ADDQ AX, acc3 ADDQ AX, acc3
ADCQ $0, DX ADCQ DX, acc4
MOVQ DX, t1 ADCQ $0, acc5
ADCQ t0, acc0
MOVQ p256ord<>+0x10(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256ord<>+0x18(SB), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ DX, acc0
ADCQ $0, acc1 ADCQ $0, acc1
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc4
SBBQ AX, acc5
SBBQ DX, acc0
SBBQ $0, acc1
// x * y[3] // x * y[3]
MOVQ (8*3)(y_ptr), t0 MOVQ (8*3)(y_ptr), t0
@ -1046,24 +1034,20 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
ADDQ t1, acc4 ADDQ t1, acc4
ADCQ $0, DX ADCQ $0, DX
ADDQ AX, acc4 ADDQ AX, acc4
ADCQ $0, DX ADCQ DX, acc5
MOVQ DX, t1 ADCQ $0, acc0
ADCQ t0, acc1
MOVQ p256ord<>+0x10(SB), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256ord<>+0x18(SB), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
ADCQ $0, acc2 ADCQ $0, acc2
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc5
SBBQ AX, acc0
SBBQ DX, acc1
SBBQ $0, acc2
// Copy result [255:0] // Copy result [255:0]
MOVQ acc4, x_ptr MOVQ acc4, x_ptr
MOVQ acc5, acc3 MOVQ acc5, acc3