mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-21 17:56:19 +08:00
internal/sm2ec: order sqr/mul WWMM sub first
This commit is contained in:
parent
0996508b5b
commit
35466446d4
@ -269,12 +269,10 @@ ordSqrLoop:
|
||||
MOVQ t1, x_ptr
|
||||
|
||||
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||
// First reduction step, [ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0]
|
||||
MOVQ acc0, AX
|
||||
MULQ p256ordK0<>(SB)
|
||||
MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64
|
||||
// calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0]
|
||||
// the result is [acc0, acc3, acc2, acc1]; the lowest limb is dropped.
|
||||
|
||||
MOVQ p256ord<>+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
|
||||
@ -282,6 +280,16 @@ ordSqrLoop:
|
||||
MOVQ DX, t1 // t1 = carry1 + H(t0 * ord0)
|
||||
MOVQ t0, acc0 // acc0 = t0
|
||||
|
||||
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc2
|
||||
SBBQ AX, acc3
|
||||
SBBQ DX, acc0
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc1 // (carry2, acc1) = acc1 + t1
|
||||
@ -291,15 +299,7 @@ ordSqrLoop:
|
||||
ADCQ DX, acc2
|
||||
ADCQ $0, acc3
|
||||
ADCQ $0, acc0
|
||||
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc2
|
||||
SBBQ AX, acc3
|
||||
SBBQ DX, acc0
|
||||
// Second reduction step
|
||||
MOVQ acc1, AX
|
||||
MULQ p256ordK0<>(SB)
|
||||
@ -312,6 +312,15 @@ ordSqrLoop:
|
||||
MOVQ DX, t1
|
||||
MOVQ t0, acc1
|
||||
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc3
|
||||
SBBQ AX, acc0
|
||||
SBBQ DX, acc1
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc2
|
||||
@ -322,14 +331,6 @@ ordSqrLoop:
|
||||
ADCQ $0, acc0
|
||||
ADCQ $0, acc1
|
||||
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc3
|
||||
SBBQ AX, acc0
|
||||
SBBQ DX, acc1
|
||||
// Third reduction step
|
||||
MOVQ acc2, AX
|
||||
MULQ p256ordK0<>(SB)
|
||||
@ -342,6 +343,15 @@ ordSqrLoop:
|
||||
MOVQ DX, t1
|
||||
MOVQ t0, acc2
|
||||
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc0
|
||||
SBBQ AX, acc1
|
||||
SBBQ DX, acc2
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc3
|
||||
@ -352,14 +362,6 @@ ordSqrLoop:
|
||||
ADCQ $0, acc1
|
||||
ADCQ $0, acc2
|
||||
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc0
|
||||
SBBQ AX, acc1
|
||||
SBBQ DX, acc2
|
||||
// Last reduction step
|
||||
MOVQ acc3, AX
|
||||
MULQ p256ordK0<>(SB)
|
||||
@ -372,6 +374,15 @@ ordSqrLoop:
|
||||
MOVQ DX, t1
|
||||
MOVQ t0, acc3
|
||||
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc1
|
||||
SBBQ AX, acc2
|
||||
SBBQ DX, acc3
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc0
|
||||
@ -382,15 +393,6 @@ ordSqrLoop:
|
||||
ADCQ $0, acc2
|
||||
ADCQ $0, acc3
|
||||
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc1
|
||||
SBBQ AX, acc2
|
||||
SBBQ DX, acc3
|
||||
|
||||
XORQ t0, t0
|
||||
// Add bits [511:256] of the sqr result
|
||||
ADCQ acc4, acc0
|
||||
|
@ -918,16 +918,7 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, BX
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ BX, acc1
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc1
|
||||
ADCQ DX, acc2
|
||||
ADCQ $0, acc3
|
||||
ADCQ t0, acc4
|
||||
ADCQ $0, acc5
|
||||
|
||||
MOVQ t0, acc0
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
@ -935,8 +926,19 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
||||
|
||||
SUBQ t0, acc2
|
||||
SBBQ AX, acc3
|
||||
SBBQ DX, acc4
|
||||
SBBQ $0, acc5
|
||||
SBBQ DX, acc0
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ BX, acc1
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc1
|
||||
ADCQ DX, acc2
|
||||
ADCQ $0, acc3
|
||||
ADCQ acc0, acc4
|
||||
ADCQ $0, acc5
|
||||
|
||||
XORQ acc0, acc0
|
||||
// x * y[1]
|
||||
MOVQ (8*1)(y_ptr), t0
|
||||
|
||||
@ -980,16 +982,7 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, BX
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ BX, acc2
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc2
|
||||
ADCQ DX, acc3
|
||||
ADCQ $0, acc4
|
||||
ADCQ t0, acc5
|
||||
ADCQ $0, acc0
|
||||
|
||||
MOVQ t0, acc1
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
@ -997,8 +990,19 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
||||
|
||||
SUBQ t0, acc3
|
||||
SBBQ AX, acc4
|
||||
SBBQ DX, acc5
|
||||
SBBQ $0, acc0
|
||||
SBBQ DX, acc1
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ BX, acc2
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc2
|
||||
ADCQ DX, acc3
|
||||
ADCQ $0, acc4
|
||||
ADCQ acc1, acc5
|
||||
ADCQ $0, acc0
|
||||
|
||||
XORQ acc1, acc1
|
||||
// x * y[2]
|
||||
MOVQ (8*2)(y_ptr), t0
|
||||
|
||||
@ -1042,16 +1046,7 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, BX
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ BX, acc3
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc3
|
||||
ADCQ DX, acc4
|
||||
ADCQ $0, acc5
|
||||
ADCQ t0, acc0
|
||||
ADCQ $0, acc1
|
||||
|
||||
MOVQ t0, acc2
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
@ -1059,8 +1054,19 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
||||
|
||||
SUBQ t0, acc4
|
||||
SBBQ AX, acc5
|
||||
SBBQ DX, acc0
|
||||
SBBQ $0, acc1
|
||||
SBBQ DX, acc2
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ BX, acc3
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc3
|
||||
ADCQ DX, acc4
|
||||
ADCQ $0, acc5
|
||||
ADCQ acc2, acc0
|
||||
ADCQ $0, acc1
|
||||
|
||||
XORQ acc2, acc2
|
||||
// x * y[3]
|
||||
MOVQ (8*3)(y_ptr), t0
|
||||
|
||||
@ -1104,6 +1110,16 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, BX
|
||||
|
||||
MOVQ t0, acc3
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc5
|
||||
SBBQ AX, acc0
|
||||
SBBQ DX, acc3
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ BX, acc4
|
||||
@ -1111,19 +1127,9 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
||||
ADDQ AX, acc4
|
||||
ADCQ DX, acc5
|
||||
ADCQ $0, acc0
|
||||
ADCQ t0, acc1
|
||||
ADCQ acc3, acc1
|
||||
ADCQ $0, acc2
|
||||
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc5
|
||||
SBBQ AX, acc0
|
||||
SBBQ DX, acc1
|
||||
SBBQ $0, acc2
|
||||
|
||||
MOVQ res+0(FP), res_ptr
|
||||
p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
|
||||
|
||||
|
@ -272,8 +272,7 @@ ordSqrLoop:
|
||||
MOVQ acc0, AX
|
||||
MULQ p256ordK0<>(SB)
|
||||
MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64
|
||||
// calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0]
|
||||
// the result is [acc0, acc3, acc2, acc1]; the lowest limb is dropped.
|
||||
|
||||
MOVQ p256ord<>+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
|
||||
@ -281,6 +280,16 @@ ordSqrLoop:
|
||||
MOVQ DX, BX // BX = carry1 + H(t0 * ord0)
|
||||
MOVQ t0, acc0 // acc0 = t0
|
||||
|
||||
// calculate the negative part: [acc0, acc3, acc2] - [0, 0x100000000, 1] * t0
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc2
|
||||
SBBQ AX, acc3
|
||||
SBBQ DX, acc0
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ BX, acc1 // (carry2, acc1) = acc1 + BX
|
||||
@ -290,15 +299,7 @@ ordSqrLoop:
|
||||
ADCQ DX, acc2
|
||||
ADCQ $0, acc3
|
||||
ADCQ $0, acc0
|
||||
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc2
|
||||
SBBQ AX, acc3
|
||||
SBBQ DX, acc0
|
||||
// Second reduction step
|
||||
MOVQ acc1, AX
|
||||
MULQ p256ordK0<>(SB)
|
||||
@ -311,6 +312,15 @@ ordSqrLoop:
|
||||
MOVQ DX, BX
|
||||
MOVQ t0, acc1
|
||||
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc3
|
||||
SBBQ AX, acc0
|
||||
SBBQ DX, acc1
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ BX, acc2
|
||||
@ -321,14 +331,6 @@ ordSqrLoop:
|
||||
ADCQ $0, acc0
|
||||
ADCQ $0, acc1
|
||||
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc3
|
||||
SBBQ AX, acc0
|
||||
SBBQ DX, acc1
|
||||
// Third reduction step
|
||||
MOVQ acc2, AX
|
||||
MULQ p256ordK0<>(SB)
|
||||
@ -341,6 +343,15 @@ ordSqrLoop:
|
||||
MOVQ DX, BX
|
||||
MOVQ t0, acc2
|
||||
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc0
|
||||
SBBQ AX, acc1
|
||||
SBBQ DX, acc2
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ BX, acc3
|
||||
@ -351,14 +362,6 @@ ordSqrLoop:
|
||||
ADCQ $0, acc1
|
||||
ADCQ $0, acc2
|
||||
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc0
|
||||
SBBQ AX, acc1
|
||||
SBBQ DX, acc2
|
||||
// Last reduction step
|
||||
MOVQ acc3, AX
|
||||
MULQ p256ordK0<>(SB)
|
||||
@ -371,6 +374,15 @@ ordSqrLoop:
|
||||
MOVQ DX, BX
|
||||
MOVQ t0, acc3
|
||||
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc1
|
||||
SBBQ AX, acc2
|
||||
SBBQ DX, acc3
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ BX, acc0
|
||||
@ -381,15 +393,6 @@ ordSqrLoop:
|
||||
ADCQ $0, acc2
|
||||
ADCQ $0, acc3
|
||||
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc1
|
||||
SBBQ AX, acc2
|
||||
SBBQ DX, acc3
|
||||
|
||||
XORQ t0, t0
|
||||
// Add bits [511:256] of the sqr result
|
||||
ADCQ acc4, acc0
|
||||
|
@ -364,6 +364,8 @@ noAdxMul:
|
||||
ADDQ AX, acc3
|
||||
ADCQ DX, acc4
|
||||
ADCQ $0, acc5
|
||||
|
||||
XORQ acc0, acc0
|
||||
// x * y[1]
|
||||
MOVQ (8*1)(y_ptr), t0
|
||||
|
||||
@ -430,6 +432,8 @@ noAdxMul:
|
||||
ADDQ AX, acc4
|
||||
ADCQ DX, acc5
|
||||
ADCQ $0, acc0
|
||||
|
||||
XORQ acc1, acc1
|
||||
// x * y[2]
|
||||
MOVQ (8*2)(y_ptr), t0
|
||||
|
||||
@ -496,6 +500,8 @@ noAdxMul:
|
||||
ADDQ AX, acc5
|
||||
ADCQ DX, acc0
|
||||
ADCQ $0, acc1
|
||||
|
||||
XORQ acc2, acc2
|
||||
// x * y[3]
|
||||
MOVQ (8*3)(y_ptr), t0
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user