Updated SM2 MFMM (2) (markdown)

Sun Yimin 2024-02-23 08:52:05 +08:00
parent cd25a55fcb
commit 57dd400fa9

@ -379,4 +379,84 @@ $t_0=0+0$
ADOXQ t0, acc0 ADOXQ t0, acc0
``` ```
乘法: 5
加法: 9
### 方案二:(移位、加法、减法)
因为 Order 素数不是 MFMM,所以这个方案其实优势不大,甚至没有优势,尤其是在使用 **MULXQ/ADCXQ/ADOXQ** 的情况下。
移位针对的是与 $O_3$、$O_2$ 的乘法。
```asm
// First reduction step, using the signed-limb form of the order
// (64-bit limbs, most significant first):
//   [ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0]
// i.e. ord = 2^256 - 2^224 - 2^128 + ord1*2^64 + ord0. Only ord1 and ord0 are
// multiplied; the two negative terms are applied with shifts and subtractions.
// Writes AX, DX, t0, t1, acc0..acc3 and the flags.
MOVQ acc0, AX
MULQ p256ordK0<>(SB) // DX:AX = acc0 * k0; only the low half in AX is kept
MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64
// NOTE(review): p256ordK0 is defined outside this snippet; presumably it is
// -ord^-1 mod 2^64, so that acc0 + L(t0 * ord0) == 0 mod 2^64 -- confirm.
// calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0]
// the result is [acc0, acc3, acc2, acc1], last lowest limb is dropped.
MOVQ p256ord<>+0x00(SB), AX
MULQ t0 // DX:AX = t0 * ord0
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0); only carry1 is used, acc0 is overwritten below
ADCQ $0, DX // DX = carry1 + H(t0 * ord0); cannot wrap, H of a 64x64-bit product is at most 2^64-2
MOVQ DX, t1 // t1 = carry1 + H(t0 * ord0)
MOVQ t0, acc0 // acc0 = t0, the new top limb (the 1 * t0 term)
MOVQ p256ord<>+0x08(SB), AX
MULQ t0 // DX:AX = t0 * ord1
ADDQ t1, acc1 // (carry2, acc1) = acc1 + t1
ADCQ $0, DX // DX = carry2 + H(t0*ord1)
ADDQ AX, acc1 // (carry3, acc1) = acc1 + t1 + L(t0*ord1)
ADCQ DX, acc2 // (carry4, acc2) = acc2 + DX + carry3
ADCQ $0, acc3 // (carry5, acc3) = acc3 + carry4
ADCQ $0, acc0 // acc0 = t0 + carry5
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
// 0x100000000 * t0 is up to 96 bits wide and straddles two limbs: its low
// 64 bits (t0 << 32) come off acc3 and its high 32 bits (t0 >> 32) come off acc0.
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX // AX = low 64 bits of t0 * 2^32
SHRQ $32, DX // DX = high 32 bits of t0 * 2^32
SUBQ t0, acc2 // (borrow1, acc2) = acc2 - t0
SBBQ AX, acc3 // (borrow2, acc3) = acc3 - AX - borrow1
SBBQ DX, acc0 // acc0 = acc0 - DX - borrow2
```
乘法: 3
移位: 2
加法: 8
减法: 3
**使用MULXQ**:
```asm
// First reduction step (MULXQ variant), using the signed-limb form of the order
// (64-bit limbs, most significant first):
//   [ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0]
// i.e. ord = 2^256 - 2^224 - 2^128 + ord1*2^64 + ord0. Only ord1 and ord0 are
// multiplied; the two negative terms are applied with shifts and subtractions.
// Writes AX, DX, t0, t1, acc0..acc3 and the flags.
MOVQ acc0, DX // MULXQ takes its implicit multiplicand from DX
MULXQ p256ordK0<>(SB), t0, AX // t0 = low 64 bits of acc0 * k0; AX = high half, not used
// NOTE(review): p256ordK0 is defined outside this snippet; presumably it is
// -ord^-1 mod 2^64, so that acc0 + L(t0 * ord0) == 0 mod 2^64 -- confirm.
// calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0]
// the result is [acc0, acc3, acc2, acc1], last lowest limb is dropped.
MOVQ t0, DX // Y = t0 = (k0 * acc0) mod 2^64; DX stays equal to t0 until the SHRQ below
MULXQ p256ord<>+0x00(SB), AX, t1 // AX = L(t0 * ord0), t1 = H(t0 * ord0)
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
ADCQ t1, acc1 // (carry2, acc1) = acc1 + H(t0 * ord0) + carry1
MOVQ t0, acc0 // acc0 = t0
// MOVQ above and MULXQ below do not modify the flags, so carry2 is still live
// when the following ADCQ consumes it.
MULXQ p256ord<>+0x08(SB), AX, t1 // AX = L(t0*ord1), t1 = H(t0*ord1)
ADCQ $0, t1 // t1 = carry2 + H(t0*ord1); cannot wrap, H of a 64x64-bit product is at most 2^64-2
ADDQ AX, acc1 // (carry3, acc1) = acc1 + L(t0*ord1)
ADCQ t1, acc2 // (carry4, acc2) = acc2 + t1 + carry3
ADCQ $0, acc3 // (carry5, acc3) = acc3 + carry4
ADCQ $0, acc0 // acc0 = t0 + carry5
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
// 0x100000000 * t0 is up to 96 bits wide and straddles two limbs: its low
// 64 bits (t0 << 32) come off acc3 and its high 32 bits (t0 >> 32) come off acc0.
MOVQ t0, AX
// "MOVQ t0, DX" is not needed here: DX was loaded with t0 before the ord0
// multiplication and nothing has written DX since.
SHLQ $32, AX // AX = low 64 bits of t0 * 2^32
SHRQ $32, DX // DX = high 32 bits of t0 * 2^32
SUBQ t0, acc2 // (borrow1, acc2) = acc2 - t0
SBBQ AX, acc3 // (borrow2, acc3) = acc3 - AX - borrow1
SBBQ DX, acc0 // acc0 = acc0 - DX - borrow2
```
乘法: 3
移位: 2
加法: 7
减法: 3
看来在支持**MULXQ/ADCXQ/ADOXQ**的情况下,使用方案一更好!