diff --git a/SM2-WWMM-(2).md b/SM2-WWMM-(2).md index 0317445..3abe98d 100644 --- a/SM2-WWMM-(2).md +++ b/SM2-WWMM-(2).md @@ -446,32 +446,34 @@ $T_3=t_7 \ast 2^{448} + t_6 \ast 2^{384} + t_5 \ast 2^{320} + (t_4+Y) \ast 2^{25 **使用MULXQ**: ```asm - // First reduction step, [ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0] - MOVQ acc0, DX - MULXQ p256ordK0<>(SB), t0, AX - // calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0] - // the result is [acc0, acc3, acc2, acc1], last lowest limb is dropped. - MOVQ t0, DX // Y = t0 = (k0 * acc0) mod 2^64 - MULXQ p256ord<>+0x00(SB), AX, t1 - ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0) - ADCQ t1, acc1 // (carry2, acc1) = acc1 + H(t0 * ord0) + carry1 - MOVQ t0, acc0 // acc0 = t0 + // First reduction step, [ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0] + MOVQ acc0, DX + MULXQ p256ordK0<>(SB), t0, AX + // calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0] + // the result is [acc0, acc3, acc2, acc1], last lowest limb is dropped. 
+ MOVQ t0, DX // Y = t0 = (k0 * acc0) mod 2^64 + MULXQ p256ord<>+0x00(SB), AX, t1 + ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0) + ADCQ t1, acc1 // (carry2, acc1) = acc1 + H(t0 * ord0) + carry1 + MOVQ t0, acc0 // acc0 = t0 - MULXQ p256ord<>+0x08(SB), AX, t1 - ADCQ $0, t1 // t1 = carry2 + H(t0*ord1) - ADDQ AX, acc1 // (carry3, acc1) = acc1 + L(t0*ord1) - ADCQ t1, acc2 // (carry4, acc2) = acc2 + t1 + carry3 - ADCQ $0, acc3 // (carry5, acc3) = acc3 + carry4 - ADCQ $0, acc0 // acc0 = t0 + carry5 - // calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0 - MOVQ t0, AX - //MOVQ t0, DX // This is not required due to t0=DX already - SHLQ $32, AX - SHRQ $32, DX + // calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0 + MOVQ t0, AX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc2 + SBBQ AX, acc3 + SBBQ DX, acc0 + + MOVQ t0, DX + MULXQ p256ord<>+0x08(SB), AX, t1 + ADCQ $0, t1 // t1 = CF + H(t0*ord1); NOTE(review): CF here is the borrow of the SBBQ above, carry2 was overwritten by SHLQ/SHRQ/SUBQ, confirm carry2 is not lost + ADDQ AX, acc1 // (carry3, acc1) = acc1 + L(t0*ord1) + ADCQ t1, acc2 // (carry4, acc2) = acc2 + t1 + carry3 + ADCQ $0, acc3 // (carry5, acc3) = acc3 + carry4 + ADCQ $0, acc0 // acc0 = acc0 + carry5 (acc0 is t0 already reduced by the SBBQ above) - SUBQ t0, acc2 - SBBQ AX, acc3 - SBBQ DX, acc0 ``` 乘法: 3 移位:2 @@ -633,47 +635,48 @@ $t_5=t_5 - 0$ 乘法: 3 移位:2 加法:9 -减法:4 +减法:3 **使用MULXQ**: ```asm - // First reduction step - MOVQ acc0, DX - MULXQ p256ordK0<>(SB), t0, AX + // First reduction step + MOVQ acc0, DX + MULXQ p256ordK0<>(SB), t0, AX - MOVQ t0, DX - MULXQ p256ord<>+0x00(SB), AX, BX - ADDQ AX, acc0 - ADCQ BX, acc1 + MOVQ t0, DX + MULXQ p256ord<>+0x00(SB), AX, BX + ADDQ AX, acc0 + ADCQ BX, acc1 + MOVQ t0, acc0 - MULXQ p256ord<>+0x08(SB), AX, BX - ADCQ $0, BX - ADDQ AX, acc1 - ADCQ BX, acc2 - ADCQ $0, acc3 - ADCQ t0, acc4 - ADCQ $0, acc5 - - MOVQ t0, AX - //MOVQ t0, DX // This is not required due to t0=DX already - SHLQ $32, AX - SHRQ $32, DX + MOVQ t0, AX + SHLQ $32, AX + SHRQ $32, DX - SUBQ t0, acc2 - SBBQ AX, acc3 - SBBQ DX, acc4 - SBBQ $0, acc5 + SUBQ 
t0, acc2 + SBBQ AX, acc3 + SBBQ DX, acc0 + + MOVQ t0, DX + MULXQ p256ord<>+0x08(SB), AX, BX + ADCQ $0, BX + ADDQ AX, acc1 + ADCQ BX, acc2 + ADCQ $0, acc3 + ADCQ acc0, acc4 + ADCQ $0, acc5 + ``` 乘法: 3 移位:2 加法:8 -减法:4 +减法:3 | 方案 | 乘法 | 移位 | 加法 | 减法 | | ----------- | ----------- | ----------- | ----------- | ----------- | | 方案一 | 5 | 0 | 15 | 0 | | 方案一(MULX/ADCX/ADOX) | 5 | 0 | 10 | 0 | | 方案二 | 3 | 2 | 9 | 3 | -| 方案二(MULX) | 3 | 2 | 8 | 4 | +| 方案二(MULX) | 3 | 2 | 8 | 3 | 看来在支持**MULXQ/ADCXQ/ADOXQ**的情况下,使用方案一(MULX/ADCX/ADOX)更好!