internal/sm2ec: order sqr/mul WWMM sub first

This commit is contained in:
emmansun 2024-02-24 13:36:46 +08:00
parent 0996508b5b
commit 35466446d4
4 changed files with 135 additions and 118 deletions

View File

@ -269,12 +269,10 @@ ordSqrLoop:
MOVQ t1, x_ptr
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
// First reduction step. The order [ord3, ord2, ord1, ord0] is rewritten as the signed five-limb value [1, -0x100000000, -1, ord1, ord0]
MOVQ acc0, AX
MULQ p256ordK0<>(SB)
MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64
// calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0]
// the result is [acc0, acc3, acc2, acc1]; the lowest limb is dropped.
MOVQ p256ord<>+0x00(SB), AX
MULQ t0
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
@ -282,6 +280,16 @@ ordSqrLoop:
MOVQ DX, t1 // t1 = carry1 + H(t0 * ord0)
MOVQ t0, acc0 // acc0 = t0
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc2
SBBQ AX, acc3
SBBQ DX, acc0
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc1 // (carry2, acc1) = acc1 + t1
@ -291,15 +299,7 @@ ordSqrLoop:
ADCQ DX, acc2
ADCQ $0, acc3
ADCQ $0, acc0
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc2
SBBQ AX, acc3
SBBQ DX, acc0
// Second reduction step
MOVQ acc1, AX
MULQ p256ordK0<>(SB)
@ -312,6 +312,15 @@ ordSqrLoop:
MOVQ DX, t1
MOVQ t0, acc1
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc3
SBBQ AX, acc0
SBBQ DX, acc1
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc2
@ -322,14 +331,6 @@ ordSqrLoop:
ADCQ $0, acc0
ADCQ $0, acc1
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc3
SBBQ AX, acc0
SBBQ DX, acc1
// Third reduction step
MOVQ acc2, AX
MULQ p256ordK0<>(SB)
@ -342,6 +343,15 @@ ordSqrLoop:
MOVQ DX, t1
MOVQ t0, acc2
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc0
SBBQ AX, acc1
SBBQ DX, acc2
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc3
@ -352,14 +362,6 @@ ordSqrLoop:
ADCQ $0, acc1
ADCQ $0, acc2
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc0
SBBQ AX, acc1
SBBQ DX, acc2
// Last reduction step
MOVQ acc3, AX
MULQ p256ordK0<>(SB)
@ -372,6 +374,15 @@ ordSqrLoop:
MOVQ DX, t1
MOVQ t0, acc3
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc1
SBBQ AX, acc2
SBBQ DX, acc3
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc0
@ -382,15 +393,6 @@ ordSqrLoop:
ADCQ $0, acc2
ADCQ $0, acc3
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc1
SBBQ AX, acc2
SBBQ DX, acc3
XORQ t0, t0
// Add bits [511:256] of the sqr result
ADCQ acc4, acc0

View File

@ -918,16 +918,7 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
ADCQ $0, DX
MOVQ DX, BX
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ BX, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ DX, acc2
ADCQ $0, acc3
ADCQ t0, acc4
ADCQ $0, acc5
MOVQ t0, acc0
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
@ -935,8 +926,19 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
SUBQ t0, acc2
SBBQ AX, acc3
SBBQ DX, acc4
SBBQ $0, acc5
SBBQ DX, acc0
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ BX, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ DX, acc2
ADCQ $0, acc3
ADCQ acc0, acc4
ADCQ $0, acc5
XORQ acc0, acc0
// x * y[1]
MOVQ (8*1)(y_ptr), t0
@ -980,16 +982,7 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
ADCQ $0, DX
MOVQ DX, BX
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ BX, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ DX, acc3
ADCQ $0, acc4
ADCQ t0, acc5
ADCQ $0, acc0
MOVQ t0, acc1
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
@ -997,8 +990,19 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
SUBQ t0, acc3
SBBQ AX, acc4
SBBQ DX, acc5
SBBQ $0, acc0
SBBQ DX, acc1
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ BX, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ DX, acc3
ADCQ $0, acc4
ADCQ acc1, acc5
ADCQ $0, acc0
XORQ acc1, acc1
// x * y[2]
MOVQ (8*2)(y_ptr), t0
@ -1042,16 +1046,7 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
ADCQ $0, DX
MOVQ DX, BX
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ BX, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ DX, acc4
ADCQ $0, acc5
ADCQ t0, acc0
ADCQ $0, acc1
MOVQ t0, acc2
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
@ -1059,8 +1054,19 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
SUBQ t0, acc4
SBBQ AX, acc5
SBBQ DX, acc0
SBBQ $0, acc1
SBBQ DX, acc2
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ BX, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ DX, acc4
ADCQ $0, acc5
ADCQ acc2, acc0
ADCQ $0, acc1
XORQ acc2, acc2
// x * y[3]
MOVQ (8*3)(y_ptr), t0
@ -1104,6 +1110,16 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
ADCQ $0, DX
MOVQ DX, BX
MOVQ t0, acc3
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc5
SBBQ AX, acc0
SBBQ DX, acc3
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ BX, acc4
@ -1111,19 +1127,9 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
ADDQ AX, acc4
ADCQ DX, acc5
ADCQ $0, acc0
ADCQ t0, acc1
ADCQ acc3, acc1
ADCQ $0, acc2
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc5
SBBQ AX, acc0
SBBQ DX, acc1
SBBQ $0, acc2
MOVQ res+0(FP), res_ptr
p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)

View File

@ -272,8 +272,7 @@ ordSqrLoop:
MOVQ acc0, AX
MULQ p256ordK0<>(SB)
MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64
// calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0]
// the result is [acc0, acc3, acc2, acc1]; the lowest limb is dropped.
MOVQ p256ord<>+0x00(SB), AX
MULQ t0
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
@ -281,6 +280,16 @@ ordSqrLoop:
MOVQ DX, BX // BX = carry1 + H(t0 * ord0)
MOVQ t0, acc0 // acc0 = t0
// calculate the negative part: [acc0, acc3, acc2] - [0, 0x100000000, 1] * t0
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc2
SBBQ AX, acc3
SBBQ DX, acc0
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ BX, acc1 // (carry2, acc1) = acc1 + BX
@ -290,15 +299,7 @@ ordSqrLoop:
ADCQ DX, acc2
ADCQ $0, acc3
ADCQ $0, acc0
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc2
SBBQ AX, acc3
SBBQ DX, acc0
// Second reduction step
MOVQ acc1, AX
MULQ p256ordK0<>(SB)
@ -311,6 +312,15 @@ ordSqrLoop:
MOVQ DX, BX
MOVQ t0, acc1
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc3
SBBQ AX, acc0
SBBQ DX, acc1
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ BX, acc2
@ -321,14 +331,6 @@ ordSqrLoop:
ADCQ $0, acc0
ADCQ $0, acc1
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc3
SBBQ AX, acc0
SBBQ DX, acc1
// Third reduction step
MOVQ acc2, AX
MULQ p256ordK0<>(SB)
@ -341,6 +343,15 @@ ordSqrLoop:
MOVQ DX, BX
MOVQ t0, acc2
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc0
SBBQ AX, acc1
SBBQ DX, acc2
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ BX, acc3
@ -351,14 +362,6 @@ ordSqrLoop:
ADCQ $0, acc1
ADCQ $0, acc2
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc0
SBBQ AX, acc1
SBBQ DX, acc2
// Last reduction step
MOVQ acc3, AX
MULQ p256ordK0<>(SB)
@ -371,6 +374,15 @@ ordSqrLoop:
MOVQ DX, BX
MOVQ t0, acc3
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc1
SBBQ AX, acc2
SBBQ DX, acc3
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ BX, acc0
@ -381,15 +393,6 @@ ordSqrLoop:
ADCQ $0, acc2
ADCQ $0, acc3
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc1
SBBQ AX, acc2
SBBQ DX, acc3
XORQ t0, t0
// Add bits [511:256] of the sqr result
ADCQ acc4, acc0

View File

@ -364,6 +364,8 @@ noAdxMul:
ADDQ AX, acc3
ADCQ DX, acc4
ADCQ $0, acc5
XORQ acc0, acc0
// x * y[1]
MOVQ (8*1)(y_ptr), t0
@ -430,6 +432,8 @@ noAdxMul:
ADDQ AX, acc4
ADCQ DX, acc5
ADCQ $0, acc0
XORQ acc1, acc1
// x * y[2]
MOVQ (8*2)(y_ptr), t0
@ -496,6 +500,8 @@ noAdxMul:
ADDQ AX, acc5
ADCQ DX, acc0
ADCQ $0, acc1
XORQ acc2, acc2
// x * y[3]
MOVQ (8*3)(y_ptr), t0