mirror of
https://github.com/emmansun/gmsm.git
synced 2025-06-29 00:37:51 +08:00
internal/sm2ec: order sqr/mul WWMM sub first
This commit is contained in:
parent
0996508b5b
commit
35466446d4
@ -269,12 +269,10 @@ ordSqrLoop:
|
|||||||
MOVQ t1, x_ptr
|
MOVQ t1, x_ptr
|
||||||
|
|
||||||
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||||
// First reduction step, [ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0]
|
|
||||||
MOVQ acc0, AX
|
MOVQ acc0, AX
|
||||||
MULQ p256ordK0<>(SB)
|
MULQ p256ordK0<>(SB)
|
||||||
MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64
|
MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64
|
||||||
// calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0]
|
|
||||||
// the result is [acc0, acc3, acc2, acc1], last lowest limb is dropped.
|
|
||||||
MOVQ p256ord<>+0x00(SB), AX
|
MOVQ p256ord<>+0x00(SB), AX
|
||||||
MULQ t0
|
MULQ t0
|
||||||
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
|
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
|
||||||
@ -282,6 +280,16 @@ ordSqrLoop:
|
|||||||
MOVQ DX, t1 // t1 = carry1 + H(t0 * ord0)
|
MOVQ DX, t1 // t1 = carry1 + H(t0 * ord0)
|
||||||
MOVQ t0, acc0 // acc0 = t0
|
MOVQ t0, acc0 // acc0 = t0
|
||||||
|
|
||||||
|
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
|
||||||
|
MOVQ t0, AX
|
||||||
|
MOVQ t0, DX
|
||||||
|
SHLQ $32, AX
|
||||||
|
SHRQ $32, DX
|
||||||
|
|
||||||
|
SUBQ t0, acc2
|
||||||
|
SBBQ AX, acc3
|
||||||
|
SBBQ DX, acc0
|
||||||
|
|
||||||
MOVQ p256ord<>+0x08(SB), AX
|
MOVQ p256ord<>+0x08(SB), AX
|
||||||
MULQ t0
|
MULQ t0
|
||||||
ADDQ t1, acc1 // (carry2, acc1) = acc1 + t1
|
ADDQ t1, acc1 // (carry2, acc1) = acc1 + t1
|
||||||
@ -291,15 +299,7 @@ ordSqrLoop:
|
|||||||
ADCQ DX, acc2
|
ADCQ DX, acc2
|
||||||
ADCQ $0, acc3
|
ADCQ $0, acc3
|
||||||
ADCQ $0, acc0
|
ADCQ $0, acc0
|
||||||
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
|
|
||||||
MOVQ t0, AX
|
|
||||||
MOVQ t0, DX
|
|
||||||
SHLQ $32, AX
|
|
||||||
SHRQ $32, DX
|
|
||||||
|
|
||||||
SUBQ t0, acc2
|
|
||||||
SBBQ AX, acc3
|
|
||||||
SBBQ DX, acc0
|
|
||||||
// Second reduction step
|
// Second reduction step
|
||||||
MOVQ acc1, AX
|
MOVQ acc1, AX
|
||||||
MULQ p256ordK0<>(SB)
|
MULQ p256ordK0<>(SB)
|
||||||
@ -312,6 +312,15 @@ ordSqrLoop:
|
|||||||
MOVQ DX, t1
|
MOVQ DX, t1
|
||||||
MOVQ t0, acc1
|
MOVQ t0, acc1
|
||||||
|
|
||||||
|
MOVQ t0, AX
|
||||||
|
MOVQ t0, DX
|
||||||
|
SHLQ $32, AX
|
||||||
|
SHRQ $32, DX
|
||||||
|
|
||||||
|
SUBQ t0, acc3
|
||||||
|
SBBQ AX, acc0
|
||||||
|
SBBQ DX, acc1
|
||||||
|
|
||||||
MOVQ p256ord<>+0x08(SB), AX
|
MOVQ p256ord<>+0x08(SB), AX
|
||||||
MULQ t0
|
MULQ t0
|
||||||
ADDQ t1, acc2
|
ADDQ t1, acc2
|
||||||
@ -322,14 +331,6 @@ ordSqrLoop:
|
|||||||
ADCQ $0, acc0
|
ADCQ $0, acc0
|
||||||
ADCQ $0, acc1
|
ADCQ $0, acc1
|
||||||
|
|
||||||
MOVQ t0, AX
|
|
||||||
MOVQ t0, DX
|
|
||||||
SHLQ $32, AX
|
|
||||||
SHRQ $32, DX
|
|
||||||
|
|
||||||
SUBQ t0, acc3
|
|
||||||
SBBQ AX, acc0
|
|
||||||
SBBQ DX, acc1
|
|
||||||
// Third reduction step
|
// Third reduction step
|
||||||
MOVQ acc2, AX
|
MOVQ acc2, AX
|
||||||
MULQ p256ordK0<>(SB)
|
MULQ p256ordK0<>(SB)
|
||||||
@ -342,6 +343,15 @@ ordSqrLoop:
|
|||||||
MOVQ DX, t1
|
MOVQ DX, t1
|
||||||
MOVQ t0, acc2
|
MOVQ t0, acc2
|
||||||
|
|
||||||
|
MOVQ t0, AX
|
||||||
|
MOVQ t0, DX
|
||||||
|
SHLQ $32, AX
|
||||||
|
SHRQ $32, DX
|
||||||
|
|
||||||
|
SUBQ t0, acc0
|
||||||
|
SBBQ AX, acc1
|
||||||
|
SBBQ DX, acc2
|
||||||
|
|
||||||
MOVQ p256ord<>+0x08(SB), AX
|
MOVQ p256ord<>+0x08(SB), AX
|
||||||
MULQ t0
|
MULQ t0
|
||||||
ADDQ t1, acc3
|
ADDQ t1, acc3
|
||||||
@ -352,14 +362,6 @@ ordSqrLoop:
|
|||||||
ADCQ $0, acc1
|
ADCQ $0, acc1
|
||||||
ADCQ $0, acc2
|
ADCQ $0, acc2
|
||||||
|
|
||||||
MOVQ t0, AX
|
|
||||||
MOVQ t0, DX
|
|
||||||
SHLQ $32, AX
|
|
||||||
SHRQ $32, DX
|
|
||||||
|
|
||||||
SUBQ t0, acc0
|
|
||||||
SBBQ AX, acc1
|
|
||||||
SBBQ DX, acc2
|
|
||||||
// Last reduction step
|
// Last reduction step
|
||||||
MOVQ acc3, AX
|
MOVQ acc3, AX
|
||||||
MULQ p256ordK0<>(SB)
|
MULQ p256ordK0<>(SB)
|
||||||
@ -372,6 +374,15 @@ ordSqrLoop:
|
|||||||
MOVQ DX, t1
|
MOVQ DX, t1
|
||||||
MOVQ t0, acc3
|
MOVQ t0, acc3
|
||||||
|
|
||||||
|
MOVQ t0, AX
|
||||||
|
MOVQ t0, DX
|
||||||
|
SHLQ $32, AX
|
||||||
|
SHRQ $32, DX
|
||||||
|
|
||||||
|
SUBQ t0, acc1
|
||||||
|
SBBQ AX, acc2
|
||||||
|
SBBQ DX, acc3
|
||||||
|
|
||||||
MOVQ p256ord<>+0x08(SB), AX
|
MOVQ p256ord<>+0x08(SB), AX
|
||||||
MULQ t0
|
MULQ t0
|
||||||
ADDQ t1, acc0
|
ADDQ t1, acc0
|
||||||
@ -382,15 +393,6 @@ ordSqrLoop:
|
|||||||
ADCQ $0, acc2
|
ADCQ $0, acc2
|
||||||
ADCQ $0, acc3
|
ADCQ $0, acc3
|
||||||
|
|
||||||
MOVQ t0, AX
|
|
||||||
MOVQ t0, DX
|
|
||||||
SHLQ $32, AX
|
|
||||||
SHRQ $32, DX
|
|
||||||
|
|
||||||
SUBQ t0, acc1
|
|
||||||
SBBQ AX, acc2
|
|
||||||
SBBQ DX, acc3
|
|
||||||
|
|
||||||
XORQ t0, t0
|
XORQ t0, t0
|
||||||
// Add bits [511:256] of the sqr result
|
// Add bits [511:256] of the sqr result
|
||||||
ADCQ acc4, acc0
|
ADCQ acc4, acc0
|
||||||
|
@ -918,16 +918,7 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
|||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
MOVQ DX, BX
|
MOVQ DX, BX
|
||||||
|
|
||||||
MOVQ p256ord<>+0x08(SB), AX
|
MOVQ t0, acc0
|
||||||
MULQ t0
|
|
||||||
ADDQ BX, acc1
|
|
||||||
ADCQ $0, DX
|
|
||||||
ADDQ AX, acc1
|
|
||||||
ADCQ DX, acc2
|
|
||||||
ADCQ $0, acc3
|
|
||||||
ADCQ t0, acc4
|
|
||||||
ADCQ $0, acc5
|
|
||||||
|
|
||||||
MOVQ t0, AX
|
MOVQ t0, AX
|
||||||
MOVQ t0, DX
|
MOVQ t0, DX
|
||||||
SHLQ $32, AX
|
SHLQ $32, AX
|
||||||
@ -935,8 +926,19 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
|||||||
|
|
||||||
SUBQ t0, acc2
|
SUBQ t0, acc2
|
||||||
SBBQ AX, acc3
|
SBBQ AX, acc3
|
||||||
SBBQ DX, acc4
|
SBBQ DX, acc0
|
||||||
SBBQ $0, acc5
|
|
||||||
|
MOVQ p256ord<>+0x08(SB), AX
|
||||||
|
MULQ t0
|
||||||
|
ADDQ BX, acc1
|
||||||
|
ADCQ $0, DX
|
||||||
|
ADDQ AX, acc1
|
||||||
|
ADCQ DX, acc2
|
||||||
|
ADCQ $0, acc3
|
||||||
|
ADCQ acc0, acc4
|
||||||
|
ADCQ $0, acc5
|
||||||
|
|
||||||
|
XORQ acc0, acc0
|
||||||
// x * y[1]
|
// x * y[1]
|
||||||
MOVQ (8*1)(y_ptr), t0
|
MOVQ (8*1)(y_ptr), t0
|
||||||
|
|
||||||
@ -980,16 +982,7 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
|||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
MOVQ DX, BX
|
MOVQ DX, BX
|
||||||
|
|
||||||
MOVQ p256ord<>+0x08(SB), AX
|
MOVQ t0, acc1
|
||||||
MULQ t0
|
|
||||||
ADDQ BX, acc2
|
|
||||||
ADCQ $0, DX
|
|
||||||
ADDQ AX, acc2
|
|
||||||
ADCQ DX, acc3
|
|
||||||
ADCQ $0, acc4
|
|
||||||
ADCQ t0, acc5
|
|
||||||
ADCQ $0, acc0
|
|
||||||
|
|
||||||
MOVQ t0, AX
|
MOVQ t0, AX
|
||||||
MOVQ t0, DX
|
MOVQ t0, DX
|
||||||
SHLQ $32, AX
|
SHLQ $32, AX
|
||||||
@ -997,8 +990,19 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
|||||||
|
|
||||||
SUBQ t0, acc3
|
SUBQ t0, acc3
|
||||||
SBBQ AX, acc4
|
SBBQ AX, acc4
|
||||||
SBBQ DX, acc5
|
SBBQ DX, acc1
|
||||||
SBBQ $0, acc0
|
|
||||||
|
MOVQ p256ord<>+0x08(SB), AX
|
||||||
|
MULQ t0
|
||||||
|
ADDQ BX, acc2
|
||||||
|
ADCQ $0, DX
|
||||||
|
ADDQ AX, acc2
|
||||||
|
ADCQ DX, acc3
|
||||||
|
ADCQ $0, acc4
|
||||||
|
ADCQ acc1, acc5
|
||||||
|
ADCQ $0, acc0
|
||||||
|
|
||||||
|
XORQ acc1, acc1
|
||||||
// x * y[2]
|
// x * y[2]
|
||||||
MOVQ (8*2)(y_ptr), t0
|
MOVQ (8*2)(y_ptr), t0
|
||||||
|
|
||||||
@ -1042,16 +1046,7 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
|||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
MOVQ DX, BX
|
MOVQ DX, BX
|
||||||
|
|
||||||
MOVQ p256ord<>+0x08(SB), AX
|
MOVQ t0, acc2
|
||||||
MULQ t0
|
|
||||||
ADDQ BX, acc3
|
|
||||||
ADCQ $0, DX
|
|
||||||
ADDQ AX, acc3
|
|
||||||
ADCQ DX, acc4
|
|
||||||
ADCQ $0, acc5
|
|
||||||
ADCQ t0, acc0
|
|
||||||
ADCQ $0, acc1
|
|
||||||
|
|
||||||
MOVQ t0, AX
|
MOVQ t0, AX
|
||||||
MOVQ t0, DX
|
MOVQ t0, DX
|
||||||
SHLQ $32, AX
|
SHLQ $32, AX
|
||||||
@ -1059,8 +1054,19 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
|||||||
|
|
||||||
SUBQ t0, acc4
|
SUBQ t0, acc4
|
||||||
SBBQ AX, acc5
|
SBBQ AX, acc5
|
||||||
SBBQ DX, acc0
|
SBBQ DX, acc2
|
||||||
SBBQ $0, acc1
|
|
||||||
|
MOVQ p256ord<>+0x08(SB), AX
|
||||||
|
MULQ t0
|
||||||
|
ADDQ BX, acc3
|
||||||
|
ADCQ $0, DX
|
||||||
|
ADDQ AX, acc3
|
||||||
|
ADCQ DX, acc4
|
||||||
|
ADCQ $0, acc5
|
||||||
|
ADCQ acc2, acc0
|
||||||
|
ADCQ $0, acc1
|
||||||
|
|
||||||
|
XORQ acc2, acc2
|
||||||
// x * y[3]
|
// x * y[3]
|
||||||
MOVQ (8*3)(y_ptr), t0
|
MOVQ (8*3)(y_ptr), t0
|
||||||
|
|
||||||
@ -1104,6 +1110,16 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
|||||||
ADCQ $0, DX
|
ADCQ $0, DX
|
||||||
MOVQ DX, BX
|
MOVQ DX, BX
|
||||||
|
|
||||||
|
MOVQ t0, acc3
|
||||||
|
MOVQ t0, AX
|
||||||
|
MOVQ t0, DX
|
||||||
|
SHLQ $32, AX
|
||||||
|
SHRQ $32, DX
|
||||||
|
|
||||||
|
SUBQ t0, acc5
|
||||||
|
SBBQ AX, acc0
|
||||||
|
SBBQ DX, acc3
|
||||||
|
|
||||||
MOVQ p256ord<>+0x08(SB), AX
|
MOVQ p256ord<>+0x08(SB), AX
|
||||||
MULQ t0
|
MULQ t0
|
||||||
ADDQ BX, acc4
|
ADDQ BX, acc4
|
||||||
@ -1111,19 +1127,9 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
|||||||
ADDQ AX, acc4
|
ADDQ AX, acc4
|
||||||
ADCQ DX, acc5
|
ADCQ DX, acc5
|
||||||
ADCQ $0, acc0
|
ADCQ $0, acc0
|
||||||
ADCQ t0, acc1
|
ADCQ acc3, acc1
|
||||||
ADCQ $0, acc2
|
ADCQ $0, acc2
|
||||||
|
|
||||||
MOVQ t0, AX
|
|
||||||
MOVQ t0, DX
|
|
||||||
SHLQ $32, AX
|
|
||||||
SHRQ $32, DX
|
|
||||||
|
|
||||||
SUBQ t0, acc5
|
|
||||||
SBBQ AX, acc0
|
|
||||||
SBBQ DX, acc1
|
|
||||||
SBBQ $0, acc2
|
|
||||||
|
|
||||||
MOVQ res+0(FP), res_ptr
|
MOVQ res+0(FP), res_ptr
|
||||||
p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
|
p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
|
||||||
|
|
||||||
|
@ -272,8 +272,7 @@ ordSqrLoop:
|
|||||||
MOVQ acc0, AX
|
MOVQ acc0, AX
|
||||||
MULQ p256ordK0<>(SB)
|
MULQ p256ordK0<>(SB)
|
||||||
MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64
|
MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64
|
||||||
// calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0]
|
|
||||||
// the result is [acc0, acc3, acc2, acc1], last lowest limb is dropped.
|
|
||||||
MOVQ p256ord<>+0x00(SB), AX
|
MOVQ p256ord<>+0x00(SB), AX
|
||||||
MULQ t0
|
MULQ t0
|
||||||
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
|
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
|
||||||
@ -281,6 +280,16 @@ ordSqrLoop:
|
|||||||
MOVQ DX, BX // BX = carry1 + H(t0 * ord0)
|
MOVQ DX, BX // BX = carry1 + H(t0 * ord0)
|
||||||
MOVQ t0, acc0 // acc0 = t0
|
MOVQ t0, acc0 // acc0 = t0
|
||||||
|
|
||||||
|
// calculate the negative part: [acc0, acc3, acc2] - [0, 0x100000000, 1] * t0
|
||||||
|
MOVQ t0, AX
|
||||||
|
MOVQ t0, DX
|
||||||
|
SHLQ $32, AX
|
||||||
|
SHRQ $32, DX
|
||||||
|
|
||||||
|
SUBQ t0, acc2
|
||||||
|
SBBQ AX, acc3
|
||||||
|
SBBQ DX, acc0
|
||||||
|
|
||||||
MOVQ p256ord<>+0x08(SB), AX
|
MOVQ p256ord<>+0x08(SB), AX
|
||||||
MULQ t0
|
MULQ t0
|
||||||
ADDQ BX, acc1 // (carry2, acc1) = acc1 + BX
|
ADDQ BX, acc1 // (carry2, acc1) = acc1 + BX
|
||||||
@ -290,15 +299,7 @@ ordSqrLoop:
|
|||||||
ADCQ DX, acc2
|
ADCQ DX, acc2
|
||||||
ADCQ $0, acc3
|
ADCQ $0, acc3
|
||||||
ADCQ $0, acc0
|
ADCQ $0, acc0
|
||||||
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
|
|
||||||
MOVQ t0, AX
|
|
||||||
MOVQ t0, DX
|
|
||||||
SHLQ $32, AX
|
|
||||||
SHRQ $32, DX
|
|
||||||
|
|
||||||
SUBQ t0, acc2
|
|
||||||
SBBQ AX, acc3
|
|
||||||
SBBQ DX, acc0
|
|
||||||
// Second reduction step
|
// Second reduction step
|
||||||
MOVQ acc1, AX
|
MOVQ acc1, AX
|
||||||
MULQ p256ordK0<>(SB)
|
MULQ p256ordK0<>(SB)
|
||||||
@ -311,6 +312,15 @@ ordSqrLoop:
|
|||||||
MOVQ DX, BX
|
MOVQ DX, BX
|
||||||
MOVQ t0, acc1
|
MOVQ t0, acc1
|
||||||
|
|
||||||
|
MOVQ t0, AX
|
||||||
|
MOVQ t0, DX
|
||||||
|
SHLQ $32, AX
|
||||||
|
SHRQ $32, DX
|
||||||
|
|
||||||
|
SUBQ t0, acc3
|
||||||
|
SBBQ AX, acc0
|
||||||
|
SBBQ DX, acc1
|
||||||
|
|
||||||
MOVQ p256ord<>+0x08(SB), AX
|
MOVQ p256ord<>+0x08(SB), AX
|
||||||
MULQ t0
|
MULQ t0
|
||||||
ADDQ BX, acc2
|
ADDQ BX, acc2
|
||||||
@ -321,14 +331,6 @@ ordSqrLoop:
|
|||||||
ADCQ $0, acc0
|
ADCQ $0, acc0
|
||||||
ADCQ $0, acc1
|
ADCQ $0, acc1
|
||||||
|
|
||||||
MOVQ t0, AX
|
|
||||||
MOVQ t0, DX
|
|
||||||
SHLQ $32, AX
|
|
||||||
SHRQ $32, DX
|
|
||||||
|
|
||||||
SUBQ t0, acc3
|
|
||||||
SBBQ AX, acc0
|
|
||||||
SBBQ DX, acc1
|
|
||||||
// Third reduction step
|
// Third reduction step
|
||||||
MOVQ acc2, AX
|
MOVQ acc2, AX
|
||||||
MULQ p256ordK0<>(SB)
|
MULQ p256ordK0<>(SB)
|
||||||
@ -341,6 +343,15 @@ ordSqrLoop:
|
|||||||
MOVQ DX, BX
|
MOVQ DX, BX
|
||||||
MOVQ t0, acc2
|
MOVQ t0, acc2
|
||||||
|
|
||||||
|
MOVQ t0, AX
|
||||||
|
MOVQ t0, DX
|
||||||
|
SHLQ $32, AX
|
||||||
|
SHRQ $32, DX
|
||||||
|
|
||||||
|
SUBQ t0, acc0
|
||||||
|
SBBQ AX, acc1
|
||||||
|
SBBQ DX, acc2
|
||||||
|
|
||||||
MOVQ p256ord<>+0x08(SB), AX
|
MOVQ p256ord<>+0x08(SB), AX
|
||||||
MULQ t0
|
MULQ t0
|
||||||
ADDQ BX, acc3
|
ADDQ BX, acc3
|
||||||
@ -351,14 +362,6 @@ ordSqrLoop:
|
|||||||
ADCQ $0, acc1
|
ADCQ $0, acc1
|
||||||
ADCQ $0, acc2
|
ADCQ $0, acc2
|
||||||
|
|
||||||
MOVQ t0, AX
|
|
||||||
MOVQ t0, DX
|
|
||||||
SHLQ $32, AX
|
|
||||||
SHRQ $32, DX
|
|
||||||
|
|
||||||
SUBQ t0, acc0
|
|
||||||
SBBQ AX, acc1
|
|
||||||
SBBQ DX, acc2
|
|
||||||
// Last reduction step
|
// Last reduction step
|
||||||
MOVQ acc3, AX
|
MOVQ acc3, AX
|
||||||
MULQ p256ordK0<>(SB)
|
MULQ p256ordK0<>(SB)
|
||||||
@ -371,6 +374,15 @@ ordSqrLoop:
|
|||||||
MOVQ DX, BX
|
MOVQ DX, BX
|
||||||
MOVQ t0, acc3
|
MOVQ t0, acc3
|
||||||
|
|
||||||
|
MOVQ t0, AX
|
||||||
|
MOVQ t0, DX
|
||||||
|
SHLQ $32, AX
|
||||||
|
SHRQ $32, DX
|
||||||
|
|
||||||
|
SUBQ t0, acc1
|
||||||
|
SBBQ AX, acc2
|
||||||
|
SBBQ DX, acc3
|
||||||
|
|
||||||
MOVQ p256ord<>+0x08(SB), AX
|
MOVQ p256ord<>+0x08(SB), AX
|
||||||
MULQ t0
|
MULQ t0
|
||||||
ADDQ BX, acc0
|
ADDQ BX, acc0
|
||||||
@ -381,15 +393,6 @@ ordSqrLoop:
|
|||||||
ADCQ $0, acc2
|
ADCQ $0, acc2
|
||||||
ADCQ $0, acc3
|
ADCQ $0, acc3
|
||||||
|
|
||||||
MOVQ t0, AX
|
|
||||||
MOVQ t0, DX
|
|
||||||
SHLQ $32, AX
|
|
||||||
SHRQ $32, DX
|
|
||||||
|
|
||||||
SUBQ t0, acc1
|
|
||||||
SBBQ AX, acc2
|
|
||||||
SBBQ DX, acc3
|
|
||||||
|
|
||||||
XORQ t0, t0
|
XORQ t0, t0
|
||||||
// Add bits [511:256] of the sqr result
|
// Add bits [511:256] of the sqr result
|
||||||
ADCQ acc4, acc0
|
ADCQ acc4, acc0
|
||||||
|
@ -364,6 +364,8 @@ noAdxMul:
|
|||||||
ADDQ AX, acc3
|
ADDQ AX, acc3
|
||||||
ADCQ DX, acc4
|
ADCQ DX, acc4
|
||||||
ADCQ $0, acc5
|
ADCQ $0, acc5
|
||||||
|
|
||||||
|
XORQ acc0, acc0
|
||||||
// x * y[1]
|
// x * y[1]
|
||||||
MOVQ (8*1)(y_ptr), t0
|
MOVQ (8*1)(y_ptr), t0
|
||||||
|
|
||||||
@ -430,6 +432,8 @@ noAdxMul:
|
|||||||
ADDQ AX, acc4
|
ADDQ AX, acc4
|
||||||
ADCQ DX, acc5
|
ADCQ DX, acc5
|
||||||
ADCQ $0, acc0
|
ADCQ $0, acc0
|
||||||
|
|
||||||
|
XORQ acc1, acc1
|
||||||
// x * y[2]
|
// x * y[2]
|
||||||
MOVQ (8*2)(y_ptr), t0
|
MOVQ (8*2)(y_ptr), t0
|
||||||
|
|
||||||
@ -496,6 +500,8 @@ noAdxMul:
|
|||||||
ADDQ AX, acc5
|
ADDQ AX, acc5
|
||||||
ADCQ DX, acc0
|
ADCQ DX, acc0
|
||||||
ADCQ $0, acc1
|
ADCQ $0, acc1
|
||||||
|
|
||||||
|
XORQ acc2, acc2
|
||||||
// x * y[3]
|
// x * y[3]
|
||||||
MOVQ (8*3)(y_ptr), t0
|
MOVQ (8*3)(y_ptr), t0
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user