diff --git a/SM2-WWMM-(2).md b/SM2-WWMM-(2).md index 1bef80c..51e9d37 100644 --- a/SM2-WWMM-(2).md +++ b/SM2-WWMM-(2).md @@ -407,38 +407,39 @@ $T_3=t_7 \ast 2^{448} + t_6 \ast 2^{384} + t_5 \ast 2^{320} + (t_4+Y) \ast 2^{25 **依然采用先减后加!** ```asm - // First reduction step - MOVQ acc0, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64 + // First reduction step + MOVQ acc0, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64 - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0) - ADCQ $0, DX // DX = carry1 + H(t0 * ord0) - MOVQ DX, t1 // t1 = carry1 + H(t0 * ord0) - MOVQ t0, acc0 // acc0 = t0 + // 处理第一个加法,以便释放acc0 + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0),acc0 可以被释放了。 + ADCQ $0, DX // DX = carry1 + H(t0 * ord0) + MOVQ DX, t1 // t1 = carry1 + H(t0 * ord0) + + // calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0 + // 处理减法 + MOVQ t0, acc0 // acc0 = t0 + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX - // calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0 - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc2 - SBBQ AX, acc3 - SBBQ DX, acc0 - - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ t1, acc1 // (carry2, acc1) = acc1 + t1 - ADCQ $0, DX // DX = carry2 + H(t0*ord1) - - ADDQ AX, acc1 // (carry3, acc1) = acc1 + t1 + L(t0*ord1) - ADCQ DX, acc2 - ADCQ $0, acc3 - ADCQ $0, acc0 + SUBQ t0, acc2 + SBBQ AX, acc3 + SBBQ DX, acc0 + // 处理加法 + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc1 // (carry2, acc1) = acc1 + t1 + ADCQ $0, DX // DX = carry2 + H(t0*ord1) + ADDQ AX, acc1 // (carry3, acc1) = acc1 + t1 + L(t0*ord1) + ADCQ DX, acc2 + ADCQ $0, acc3 + ADCQ $0, acc0 ``` 乘法: 3 移位:2 @@ -450,15 +451,15 @@ $T_3=t_7 \ast 2^{448} + t_6 \ast 2^{384} + t_5 \ast 2^{320} + (t_4+Y) \ast 2^{25 // First reduction step, [ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0] MOVQ acc0, DX MULXQ p256ordK0<>(SB), t0, AX - // calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0] - // the result is [acc0, acc3, acc2, acc1], last lowest limb is dropped. + // 处理第一个加法,以便释放acc0 MOVQ t0, DX // Y = t0 = (k0 * acc0) mod 2^64 MULXQ p256ord<>+0x00(SB), AX, t1 - ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0) + ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0),acc0 可以被释放了。 ADCQ t1, acc1 // (carry2, acc1) = acc1 + H(t0 * ord0) + carry1 - MOVQ t0, acc0 // acc0 = t0 // calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0 + // 处理减法 + MOVQ t0, acc0 // acc0 = t0 MOVQ t0, AX SHLQ $32, AX SHRQ $32, DX @@ -467,6 +468,7 @@ $T_3=t_7 \ast 2^{448} + t_6 \ast 2^{384} + t_5 \ast 2^{320} + (t_4+Y) \ast 2^{25 SBBQ AX, acc3 SBBQ DX, acc0 + // 处理加法 MOVQ t0, DX MULXQ p256ord<>+0x08(SB), AX, t1 ADCQ $0, t1 // t1 = carry2 + H(t0*ord1) @@ -589,36 +591,39 @@ $T_3=(t_4+Y) \ast 2^{256}+(t_3 - Y \ast 2^{32}) \ast 2^{192} + (t_2 - Y) \ast 2^ 伪代码: ```asm - // First reduction step - MOVQ acc0, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 + // First reduction step + MOVQ acc0, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc0 - ADCQ $0, DX - MOVQ DX, BX + // 处理第一个加法,以便释放acc0 + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc0 // acc0 可以被释放了 + ADCQ $0, DX + MOVQ DX, BX - MOVQ t0, acc0 - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX + // 开始处理减法 + MOVQ t0, acc0 + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX - SUBQ t0, acc2 - SBBQ AX, acc3 - SBBQ DX, acc0 + SUBQ t0, acc2 + SBBQ AX, acc3 + SBBQ DX, acc0 - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ BX, acc1 - ADCQ $0, DX - ADDQ AX, acc1 - ADCQ DX, acc2 - ADCQ $0, acc3 - ADCQ acc0, acc4 - ADCQ $0, acc5 + // 处理加法 + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ BX, acc1 + ADCQ $0, DX + ADDQ AX, acc1 + ADCQ DX, acc2 + ADCQ $0, acc3 + ADCQ acc0, acc4 + ADCQ $0, acc5 ``` 乘法: 3 移位:2 @@ -631,12 +636,14 @@ $T_3=(t_4+Y) \ast 2^{256}+(t_3 - Y \ast 2^{32}) \ast 2^{192} + (t_2 - Y) \ast 2^ MOVQ acc0, DX MULXQ p256ordK0<>(SB), t0, AX + // 处理第一个加法,以便释放acc0 MOVQ t0, DX MULXQ p256ord<>+0x00(SB), AX, BX - ADDQ AX, acc0 + ADDQ AX, acc0 // acc0 可以被释放了 ADCQ BX, acc1 - MOVQ t0, acc0 + // 开始处理减法 + MOVQ t0, acc0 MOVQ t0, AX SHLQ $32, AX SHRQ $32, DX @@ -645,6 +652,7 @@ $T_3=(t_4+Y) \ast 2^{256}+(t_3 - Y \ast 2^{32}) \ast 2^{192} + (t_2 - Y) \ast 2^ SBBQ AX, acc3 SBBQ DX, acc0 + // 处理加法 MOVQ t0, DX MULXQ p256ord<>+0x08(SB), AX, BX ADCQ $0, BX