Updated SM2 WWMM (2) (markdown)

Sun Yimin 2024-02-26 18:00:00 +08:00
parent dd2e16d3b2
commit 2ce60e917b

@ -407,38 +407,39 @@ $T_3=t_7 \ast 2^{448} + t_6 \ast 2^{384} + t_5 \ast 2^{320} + (t_4+Y) \ast 2^{25
**依然采用先减后加!** **依然采用先减后加!**
```asm ```asm
// First reduction step // First reduction step
MOVQ acc0, AX MOVQ acc0, AX
MULQ p256ordK0<>(SB) MULQ p256ordK0<>(SB)
MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64 MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64
MOVQ p256ord<>+0x00(SB), AX // 处理第一个加法以便释放acc0
MULQ t0 MOVQ p256ord<>+0x00(SB), AX
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0) MULQ t0
ADCQ $0, DX // DX = carry1 + H(t0 * ord0) ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0); acc0 可以被释放了。
MOVQ DX, t1 // t1 = carry1 + H(t0 * ord0) ADCQ $0, DX // DX = carry1 + H(t0 * ord0)
MOVQ t0, acc0 // acc0 = t0 MOVQ DX, t1 // t1 = carry1 + H(t0 * ord0)
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
// 处理减法
MOVQ t0, acc0 // acc0 = t0
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0 SUBQ t0, acc2
MOVQ t0, AX SBBQ AX, acc3
MOVQ t0, DX SBBQ DX, acc0
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc2
SBBQ AX, acc3
SBBQ DX, acc0
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc1 // (carry2, acc1) = acc1 + t1
ADCQ $0, DX // DX = carry2 + H(t0*ord1)
ADDQ AX, acc1 // (carry3, acc1) = acc1 + t1 + L(t0*ord1)
ADCQ DX, acc2
ADCQ $0, acc3
ADCQ $0, acc0
// 处理加法
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc1 // (carry2, acc1) = acc1 + t1
ADCQ $0, DX // DX = carry2 + H(t0*ord1)
ADDQ AX, acc1 // (carry3, acc1) = acc1 + t1 + L(t0*ord1)
ADCQ DX, acc2
ADCQ $0, acc3
ADCQ $0, acc0
``` ```
乘法: 3 乘法: 3
移位: 2 移位: 2
@ -450,15 +451,15 @@ $T_3=t_7 \ast 2^{448} + t_6 \ast 2^{384} + t_5 \ast 2^{320} + (t_4+Y) \ast 2^{25
// First reduction step, [ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0] // First reduction step, [ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0]
MOVQ acc0, DX MOVQ acc0, DX
MULXQ p256ordK0<>(SB), t0, AX MULXQ p256ordK0<>(SB), t0, AX
// calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0] // 处理第一个加法以便释放acc0
// the result is [acc0, acc3, acc2, acc1], last lowest limb is dropped.
MOVQ t0, DX // Y = t0 = (k0 * acc0) mod 2^64 MOVQ t0, DX // Y = t0 = (k0 * acc0) mod 2^64
MULXQ p256ord<>+0x00(SB), AX, t1 MULXQ p256ord<>+0x00(SB), AX, t1
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0) ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0); acc0 可以被释放了。
ADCQ t1, acc1 // (carry2, acc1) = acc1 + H(t0 * ord0) + carry1 ADCQ t1, acc1 // (carry2, acc1) = acc1 + H(t0 * ord0) + carry1
MOVQ t0, acc0 // acc0 = t0
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0 // calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
// 处理减法
MOVQ t0, acc0 // acc0 = t0
MOVQ t0, AX MOVQ t0, AX
SHLQ $32, AX SHLQ $32, AX
SHRQ $32, DX SHRQ $32, DX
@ -467,6 +468,7 @@ $T_3=t_7 \ast 2^{448} + t_6 \ast 2^{384} + t_5 \ast 2^{320} + (t_4+Y) \ast 2^{25
SBBQ AX, acc3 SBBQ AX, acc3
SBBQ DX, acc0 SBBQ DX, acc0
// 处理加法
MOVQ t0, DX MOVQ t0, DX
MULXQ p256ord<>+0x08(SB), AX, t1 MULXQ p256ord<>+0x08(SB), AX, t1
ADCQ $0, t1 // t1 = carry2 + H(t0*ord1) ADCQ $0, t1 // t1 = carry2 + H(t0*ord1)
@ -589,36 +591,39 @@ $T_3=(t_4+Y) \ast 2^{256}+(t_3 - Y \ast 2^{32}) \ast 2^{192} + (t_2 - Y) \ast 2^
伪代码: 伪代码:
```asm ```asm
// First reduction step // First reduction step
MOVQ acc0, AX MOVQ acc0, AX
MULQ p256ordK0<>(SB) MULQ p256ordK0<>(SB)
MOVQ AX, t0 MOVQ AX, t0
MOVQ p256ord<>+0x00(SB), AX // 处理第一个加法以便释放acc0
MULQ t0 MOVQ p256ord<>+0x00(SB), AX
ADDQ AX, acc0 MULQ t0
ADCQ $0, DX ADDQ AX, acc0 // acc0 可以被释放了
MOVQ DX, BX ADCQ $0, DX
MOVQ DX, BX
MOVQ t0, acc0 // 开始处理减法
MOVQ t0, AX MOVQ t0, acc0
MOVQ t0, DX MOVQ t0, AX
SHLQ $32, AX MOVQ t0, DX
SHRQ $32, DX SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc2 SUBQ t0, acc2
SBBQ AX, acc3 SBBQ AX, acc3
SBBQ DX, acc0 SBBQ DX, acc0
MOVQ p256ord<>+0x08(SB), AX // 处理加法
MULQ t0 MOVQ p256ord<>+0x08(SB), AX
ADDQ BX, acc1 MULQ t0
ADCQ $0, DX ADDQ BX, acc1
ADDQ AX, acc1 ADCQ $0, DX
ADCQ DX, acc2 ADDQ AX, acc1
ADCQ $0, acc3 ADCQ DX, acc2
ADCQ acc0, acc4 ADCQ $0, acc3
ADCQ $0, acc5 ADCQ acc0, acc4
ADCQ $0, acc5
``` ```
乘法: 3 乘法: 3
移位: 2 移位: 2
@ -631,12 +636,14 @@ $T_3=(t_4+Y) \ast 2^{256}+(t_3 - Y \ast 2^{32}) \ast 2^{192} + (t_2 - Y) \ast 2^
MOVQ acc0, DX MOVQ acc0, DX
MULXQ p256ordK0<>(SB), t0, AX MULXQ p256ordK0<>(SB), t0, AX
// 处理第一个加法以便释放acc0
MOVQ t0, DX MOVQ t0, DX
MULXQ p256ord<>+0x00(SB), AX, BX MULXQ p256ord<>+0x00(SB), AX, BX
ADDQ AX, acc0 ADDQ AX, acc0 // acc0 可以被释放了
ADCQ BX, acc1 ADCQ BX, acc1
MOVQ t0, acc0
// 开始处理减法
MOVQ t0, acc0
MOVQ t0, AX MOVQ t0, AX
SHLQ $32, AX SHLQ $32, AX
SHRQ $32, DX SHRQ $32, DX
@ -645,6 +652,7 @@ $T_3=(t_4+Y) \ast 2^{256}+(t_3 - Y \ast 2^{32}) \ast 2^{192} + (t_2 - Y) \ast 2^
SBBQ AX, acc3 SBBQ AX, acc3
SBBQ DX, acc0 SBBQ DX, acc0
// 处理加法
MOVQ t0, DX MOVQ t0, DX
MULXQ p256ord<>+0x08(SB), AX, BX MULXQ p256ord<>+0x08(SB), AX, BX
ADCQ $0, BX ADCQ $0, BX