From 35466446d44d5a596c927b78399d8158148472ef Mon Sep 17 00:00:00 2001 From: emmansun Date: Sat, 24 Feb 2024 13:36:46 +0800 Subject: [PATCH] internal/sm2ec: order sqr/mul WWMM sub first --- internal/sm2ec/p256_asm_amd64.s | 74 ++++++++++----------- internal/sm2ec/p256_common_amd64.s | 100 +++++++++++++++-------------- internal/sm2ec/p256_plugin_amd64.s | 73 +++++++++++---------- sm9/bn256/gfp_cmn_amd64.s | 6 ++ 4 files changed, 135 insertions(+), 118 deletions(-) diff --git a/internal/sm2ec/p256_asm_amd64.s b/internal/sm2ec/p256_asm_amd64.s index 3228901..3a66253 100644 --- a/internal/sm2ec/p256_asm_amd64.s +++ b/internal/sm2ec/p256_asm_amd64.s @@ -269,12 +269,10 @@ ordSqrLoop: MOVQ t1, x_ptr // T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0] - // First reduction step, [ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0] MOVQ acc0, AX MULQ p256ordK0<>(SB) MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64 - // calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0] - // the result is [acc0, acc3, acc2, acc1], last lowest limb is dropped. + MOVQ p256ord<>+0x00(SB), AX MULQ t0 ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0) @@ -282,6 +280,16 @@ ordSqrLoop: MOVQ DX, t1 // t1 = carry1 + H(t0 * ord0) MOVQ t0, acc0 // acc0 = t0 + // calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0 + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc2 + SBBQ AX, acc3 + SBBQ DX, acc0 + MOVQ p256ord<>+0x08(SB), AX MULQ t0 ADDQ t1, acc1 // (carry2, acc1) = acc1 + t1 @@ -291,15 +299,7 @@ ordSqrLoop: ADCQ DX, acc2 ADCQ $0, acc3 ADCQ $0, acc0 - // calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0 - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - SUBQ t0, acc2 - SBBQ AX, acc3 - SBBQ DX, acc0 // Second reduction step MOVQ acc1, AX MULQ p256ordK0<>(SB) @@ -312,6 +312,15 @@ ordSqrLoop: MOVQ DX, t1 MOVQ t0, acc1 + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc3 + SBBQ AX, acc0 + SBBQ DX, acc1 + MOVQ p256ord<>+0x08(SB), AX MULQ t0 ADDQ t1, acc2 @@ -322,14 +331,6 @@ ordSqrLoop: ADCQ $0, acc0 ADCQ $0, acc1 - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc3 - SBBQ AX, acc0 - SBBQ DX, acc1 // Third reduction step MOVQ acc2, AX MULQ p256ordK0<>(SB) @@ -342,6 +343,15 @@ ordSqrLoop: MOVQ DX, t1 MOVQ t0, acc2 + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc0 + SBBQ AX, acc1 + SBBQ DX, acc2 + MOVQ p256ord<>+0x08(SB), AX MULQ t0 ADDQ t1, acc3 @@ -352,14 +362,6 @@ ordSqrLoop: ADCQ $0, acc1 ADCQ $0, acc2 - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc0 - SBBQ AX, acc1 - SBBQ DX, acc2 // Last reduction step MOVQ acc3, AX MULQ p256ordK0<>(SB) @@ -372,6 +374,15 @@ ordSqrLoop: MOVQ DX, t1 MOVQ t0, acc3 + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc1 + SBBQ AX, acc2 + SBBQ DX, acc3 + MOVQ p256ord<>+0x08(SB), AX MULQ t0 ADDQ t1, acc0 @@ -382,15 +393,6 @@ ordSqrLoop: ADCQ $0, acc2 ADCQ $0, acc3 - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc1 - SBBQ AX, acc2 - SBBQ DX, acc3 - XORQ t0, t0 // Add bits [511:256] of the sqr result ADCQ acc4, acc0 diff --git a/internal/sm2ec/p256_common_amd64.s b/internal/sm2ec/p256_common_amd64.s index c0ee797..a0e9868 100644 --- a/internal/sm2ec/p256_common_amd64.s +++ b/internal/sm2ec/p256_common_amd64.s @@ -918,16 +918,7 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0 ADCQ $0, DX MOVQ DX, BX - MOVQ p256ord<>+0x08(SB), 
AX - MULQ t0 - ADDQ BX, acc1 - ADCQ $0, DX - ADDQ AX, acc1 - ADCQ DX, acc2 - ADCQ $0, acc3 - ADCQ t0, acc4 - ADCQ $0, acc5 - + MOVQ t0, acc0 MOVQ t0, AX MOVQ t0, DX SHLQ $32, AX @@ -935,8 +926,19 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0 SUBQ t0, acc2 SBBQ AX, acc3 - SBBQ DX, acc4 - SBBQ $0, acc5 + SBBQ DX, acc0 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ BX, acc1 + ADCQ $0, DX + ADDQ AX, acc1 + ADCQ DX, acc2 + ADCQ $0, acc3 + ADCQ acc0, acc4 + ADCQ $0, acc5 + + XORQ acc0, acc0 // x * y[1] MOVQ (8*1)(y_ptr), t0 @@ -980,16 +982,7 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0 ADCQ $0, DX MOVQ DX, BX - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ BX, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - ADCQ DX, acc3 - ADCQ $0, acc4 - ADCQ t0, acc5 - ADCQ $0, acc0 - + MOVQ t0, acc1 MOVQ t0, AX MOVQ t0, DX SHLQ $32, AX @@ -997,8 +990,19 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0 SUBQ t0, acc3 SBBQ AX, acc4 - SBBQ DX, acc5 - SBBQ $0, acc0 + SBBQ DX, acc1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ BX, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ DX, acc3 + ADCQ $0, acc4 + ADCQ acc1, acc5 + ADCQ $0, acc0 + + XORQ acc1, acc1 // x * y[2] MOVQ (8*2)(y_ptr), t0 @@ -1042,16 +1046,7 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0 ADCQ $0, DX MOVQ DX, BX - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ BX, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ DX, acc4 - ADCQ $0, acc5 - ADCQ t0, acc0 - ADCQ $0, acc1 - + MOVQ t0, acc2 MOVQ t0, AX MOVQ t0, DX SHLQ $32, AX @@ -1059,8 +1054,19 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0 SUBQ t0, acc4 SBBQ AX, acc5 - SBBQ DX, acc0 - SBBQ $0, acc1 + SBBQ DX, acc2 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ BX, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ DX, acc4 + ADCQ $0, acc5 + ADCQ acc2, acc0 + ADCQ $0, acc1 + + XORQ acc2, acc2 // x * y[3] MOVQ (8*3)(y_ptr), t0 @@ -1104,6 +1110,16 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0 ADCQ $0, DX MOVQ DX, BX + MOVQ t0, acc3 + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc5 + SBBQ AX, acc0 + SBBQ DX, acc3 + MOVQ p256ord<>+0x08(SB), AX MULQ t0 ADDQ BX, acc4 @@ -1111,19 +1127,9 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0 ADDQ AX, acc4 ADCQ DX, acc5 ADCQ $0, acc0 - ADCQ t0, acc1 + ADCQ acc3, acc1 ADCQ $0, acc2 - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc5 - SBBQ AX, acc0 - SBBQ DX, acc1 - SBBQ $0, acc2 - MOVQ res+0(FP), res_ptr p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr) diff --git a/internal/sm2ec/p256_plugin_amd64.s b/internal/sm2ec/p256_plugin_amd64.s index a35b5f2..cdf4b65 100644 --- a/internal/sm2ec/p256_plugin_amd64.s +++ b/internal/sm2ec/p256_plugin_amd64.s @@ -272,8 +272,7 @@ ordSqrLoop: MOVQ acc0, AX MULQ p256ordK0<>(SB) MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64 - // calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0] - // the result is [acc0, acc3, acc2, acc1], last lowest limb is dropped. 
+ MOVQ p256ord<>+0x00(SB), AX MULQ t0 ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0) @@ -281,6 +280,16 @@ ordSqrLoop: MOVQ DX, BX // BX = carry1 + H(t0 * ord0) MOVQ t0, acc0 // acc0 = t0 + // calculate the negative part: [acc0, acc3, acc2] - [0, 0x100000000, 1] * t0 + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc2 + SBBQ AX, acc3 + SBBQ DX, acc0 + MOVQ p256ord<>+0x08(SB), AX MULQ t0 ADDQ BX, acc1 // (carry2, acc1) = acc1 + BX @@ -290,15 +299,7 @@ ordSqrLoop: ADCQ DX, acc2 ADCQ $0, acc3 ADCQ $0, acc0 - // calculate the positive part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0 - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - SUBQ t0, acc2 - SBBQ AX, acc3 - SBBQ DX, acc0 // Second reduction step MOVQ acc1, AX MULQ p256ordK0<>(SB) @@ -311,6 +312,15 @@ ordSqrLoop: MOVQ DX, BX MOVQ t0, acc1 + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc3 + SBBQ AX, acc0 + SBBQ DX, acc1 + MOVQ p256ord<>+0x08(SB), AX MULQ t0 ADDQ BX, acc2 @@ -321,14 +331,6 @@ ordSqrLoop: ADCQ $0, acc0 ADCQ $0, acc1 - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc3 - SBBQ AX, acc0 - SBBQ DX, acc1 // Third reduction step MOVQ acc2, AX MULQ p256ordK0<>(SB) @@ -341,6 +343,15 @@ ordSqrLoop: MOVQ DX, BX MOVQ t0, acc2 + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc0 + SBBQ AX, acc1 + SBBQ DX, acc2 + MOVQ p256ord<>+0x08(SB), AX MULQ t0 ADDQ BX, acc3 @@ -351,14 +362,6 @@ ordSqrLoop: ADCQ $0, acc1 ADCQ $0, acc2 - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc0 - SBBQ AX, acc1 - SBBQ DX, acc2 // Last reduction step MOVQ acc3, AX MULQ p256ordK0<>(SB) @@ -371,6 +374,15 @@ ordSqrLoop: MOVQ DX, BX MOVQ t0, acc3 + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc1 + SBBQ AX, acc2 + SBBQ DX, acc3 + MOVQ p256ord<>+0x08(SB), AX MULQ t0 ADDQ BX, acc0 @@ -381,15 +393,6 @@ ordSqrLoop: ADCQ $0, acc2 ADCQ $0, acc3 - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc1 - SBBQ AX, acc2 - SBBQ DX, acc3 - XORQ t0, t0 // Add bits [511:256] of the sqr result ADCQ acc4, acc0 diff --git a/sm9/bn256/gfp_cmn_amd64.s b/sm9/bn256/gfp_cmn_amd64.s index 1fa43a2..a961542 100644 --- a/sm9/bn256/gfp_cmn_amd64.s +++ b/sm9/bn256/gfp_cmn_amd64.s @@ -364,6 +364,8 @@ noAdxMul: ADDQ AX, acc3 ADCQ DX, acc4 ADCQ $0, acc5 + + XORQ acc0, acc0 // x * y[1] MOVQ (8*1)(y_ptr), t0 @@ -430,6 +432,8 @@ noAdxMul: ADDQ AX, acc4 ADCQ DX, acc5 ADCQ $0, acc0 + + XORQ acc1, acc1 // x * y[2] MOVQ (8*2)(y_ptr), t0 @@ -496,6 +500,8 @@ noAdxMul: ADDQ AX, acc5 ADCQ DX, acc0 ADCQ $0, acc1 + + XORQ acc2, acc2 // x * y[3] MOVQ (8*3)(y_ptr), t0
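
Note (not part of the patch): the hunks above only reorder work inside each WWMM (word-wise Montgomery multiplication) reduction step of the order-field sqr/mul. The shift/subtract block that handles the special upper limbs of the SM2 group order (ord2 = 2^64-1, ord3 = 2^64-2^32-1) is moved in front of the MULQ by ord1, presumably so the SUBQ/SBBQ chain, which depends only on t0, can retire while the multiply is still in flight; the added XORQ instructions explicitly clear the spare accumulator that the reordered step now uses as scratch before it collects the next column's carry. The sketch below is an illustrative uint64 model of one such reduction step, not code from this repository: the names ordReduceStepSubFirst, ordReduceStepRef and negInv64 are hypothetical, the order limbs are the p256ord<> values from the files above, and k0 is derived by Newton iteration instead of reading p256ordK0<>.

// Hypothetical sketch of one "subtract first" WWMM reduction step modulo the
// SM2 group order; not part of the patched files.
package main

import (
	"fmt"
	"math/bits"
)

// SM2 group order n, little-endian 64-bit limbs (the p256ord<> table).
var ord = [4]uint64{
	0x53bbf40939d54123, 0x7203df6b21c6052b,
	0xffffffffffffffff, 0xfffffffeffffffff,
}

// negInv64 returns -n^(-1) mod 2^64 for odd n (the role of p256ordK0<>),
// computed by Newton iteration rather than hard-coding the constant.
func negInv64(n uint64) uint64 {
	inv := n // correct to 3 low bits for odd n
	for i := 0; i < 5; i++ {
		inv *= 2 - n*inv // each pass doubles the number of correct low bits
	}
	return -inv
}

// ordReduceStepSubFirst models one reduction step in the patched layout:
// acc[0] is folded into the order, and the shift/subtract ("negative part")
// is issued before the multiply-add by ord[1].  It uses the signed form
// n = 2^256 - 2^32*2^192 - 2^128 + ord1*2^64 + ord0 and returns
// (acc + y*n) / 2^64, which still fits in four limbs.
func ordReduceStepSubFirst(acc [4]uint64) [4]uint64 {
	y := acc[0] * negInv64(ord[0]) // MULQ p256ordK0<>(SB)

	// acc[0] + L(y*ord0) == 0 mod 2^64 by choice of y; keep only its carry.
	hi, lo := bits.Mul64(y, ord[0])
	_, c := bits.Add64(acc[0], lo, 0)
	t1 := hi + c // cannot overflow: hi < ord0
	top := y     // the +y*2^256 term becomes the new top limb (MOVQ t0, acc0)

	// Negative part first (the reordering this patch makes):
	// [top, a3, a2, a1] -= [0, 2^32, 1, 0] * y, i.e. SUBQ/SBBQ with y,
	// y<<32 and y>>32.  No borrow can escape the four-limb window.
	a1, a2, a3 := acc[1], acc[2], acc[3]
	var b uint64
	a2, b = bits.Sub64(a2, y, 0)
	a3, b = bits.Sub64(a3, y<<32, b)
	top, _ = bits.Sub64(top, y>>32, b)

	// Remaining positive part: add t1 and y*ord1 at limb 0, mirroring the
	// ADDQ/ADCQ chain that now follows the subtraction.
	hi, lo = bits.Mul64(y, ord[1])
	a1, c = bits.Add64(a1, t1, 0)
	hi += c // ADCQ $0, DX; cannot overflow: hi < ord1
	a1, c = bits.Add64(a1, lo, 0)
	a2, c = bits.Add64(a2, hi, c)
	a3, c = bits.Add64(a3, 0, c)
	top, _ = bits.Add64(top, 0, c)

	return [4]uint64{a1, a2, a3, top}
}

// ordReduceStepRef is the plain reference: add y*n limb by limb and drop the
// zero low limb.  It should agree with ordReduceStepSubFirst on every input.
func ordReduceStepRef(acc [4]uint64) [4]uint64 {
	y := acc[0] * negInv64(ord[0])
	var t [5]uint64
	copy(t[:], acc[:])
	var carry uint64
	for i := 0; i < 4; i++ {
		hi, lo := bits.Mul64(y, ord[i])
		var c1, c2 uint64
		t[i], c1 = bits.Add64(t[i], lo, 0)
		t[i], c2 = bits.Add64(t[i], carry, 0)
		carry = hi + c1 + c2 // standard multiply-accumulate bound, no overflow
	}
	t[4] += carry
	return [4]uint64{t[1], t[2], t[3], t[4]} // t[0] is zero by construction
}

func main() {
	acc := [4]uint64{0x0123456789abcdef, 0xfedcba9876543210, 0x0f1e2d3c4b5a6978, 0x8796a5b4c3d2e1f0}
	fmt.Println(ordReduceStepSubFirst(acc) == ordReduceStepRef(acc)) // true
}

main prints true: doing the negative part before the ord1 multiply-add yields the same (acc + y*n)/2^64 as the straightforward reference, which is why the patch is a pure scheduling change with no effect on results.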