From 381476a913781e0af134f88ab1e8a017439e7479 Mon Sep 17 00:00:00 2001 From: Emman Date: Mon, 27 Dec 2021 08:54:56 +0800 Subject: [PATCH] optimize sm2 p256 amd64 implementation, reduce multiplication --- sm2/p256_asm_amd64.s | 120 +++++++++++++++++++------------------------ 1 file changed, 52 insertions(+), 68 deletions(-) diff --git a/sm2/p256_asm_amd64.s b/sm2/p256_asm_amd64.s index e1fccc7..f9c0ce2 100644 --- a/sm2/p256_asm_amd64.s +++ b/sm2/p256_asm_amd64.s @@ -848,24 +848,20 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0 ADDQ t1, acc1 ADCQ $0, DX ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x10(SB), AX - MULQ t0 - ADDQ t1, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x18(SB), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ DX, acc4 + ADCQ DX, acc2 + ADCQ $0, acc3 + ADCQ t0, acc4 ADCQ $0, acc5 + + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc2 + SBBQ AX, acc3 + SBBQ DX, acc4 + SBBQ $0, acc5 // x * y[1] MOVQ (8*1)(y_ptr), t0 @@ -914,24 +910,20 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0 ADDQ t1, acc2 ADCQ $0, DX ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x10(SB), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x18(SB), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ DX, acc5 + ADCQ DX, acc3 + ADCQ $0, acc4 + ADCQ t0, acc5 ADCQ $0, acc0 + + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc3 + SBBQ AX, acc4 + SBBQ DX, acc5 + SBBQ $0, acc0 // x * y[2] MOVQ (8*2)(y_ptr), t0 @@ -980,24 +972,20 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0 ADDQ t1, acc3 ADCQ $0, DX ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x10(SB), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x18(SB), AX - MULQ t0 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ DX, acc0 + ADCQ DX, acc4 + ADCQ $0, acc5 + ADCQ t0, acc0 ADCQ $0, acc1 + + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc4 + SBBQ AX, acc5 + SBBQ DX, acc0 + SBBQ $0, acc1 // x * y[3] MOVQ (8*3)(y_ptr), t0 @@ -1046,24 +1034,20 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0 ADDQ t1, acc4 ADCQ $0, DX ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x10(SB), AX - MULQ t0 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x18(SB), AX - MULQ t0 - ADDQ t1, acc0 - ADCQ $0, DX - ADDQ AX, acc0 - ADCQ DX, acc1 + ADCQ DX, acc5 + ADCQ $0, acc0 + ADCQ t0, acc1 ADCQ $0, acc2 + + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc5 + SBBQ AX, acc0 + SBBQ DX, acc1 + SBBQ $0, acc2 // Copy result [255:0] MOVQ acc4, x_ptr MOVQ acc5, acc3