From ba6bd136f9e180cbe9657a65699b9c9f8bdb3851 Mon Sep 17 00:00:00 2001
From: Sun Yimin
Date: Wed, 26 Jul 2023 10:26:32 +0800
Subject: [PATCH] internal/sm2ec: add comment for mont. reduction and others

---
 internal/sm2ec/p256_asm_amd64.s | 67 ++++++++++++++++++++-------------
 internal/sm2ec/p256_asm_arm64.s |  5 +++
 2 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/internal/sm2ec/p256_asm_amd64.s b/internal/sm2ec/p256_asm_amd64.s
index 7069219..14d4156 100644
--- a/internal/sm2ec/p256_asm_amd64.s
+++ b/internal/sm2ec/p256_asm_amd64.s
@@ -224,18 +224,19 @@ TEXT ·p256NegCond(SB),NOSPLIT,$0
 	MOVQ a3, (8*3)(res) \
 
 #define p256SqrMontReduce() \
-	\ // First reduction step
+	\ // First reduction step, [p4, p3, p2, p1, p0] = [1, -0x100000000, 0, (1 - 0x100000000), -1]
 	MOVQ acc0, AX \
 	MOVQ acc0, DX \
-	SHLQ $32, AX \
-	SHRQ $32, DX \
-	\
-	ADDQ acc0, acc1 \
-	ADCQ $0, acc2 \
-	ADCQ $0, acc3 \
-	ADCQ $0, acc0 \
-	\
-	SUBQ AX, acc1 \
+	SHLQ $32, AX \ // AX = L(acc0 * 2^32), low part
+	SHRQ $32, DX \ // DX = H(acc0 * 2^32), high part
+	\ // calculate the positive part first: [1, 0, 0, 1] * acc0 + [0, acc3, acc2, acc1],
+	\ // since (-1) * acc0 + acc0 == 0, the lowest limb is 0 and is dropped directly, no carry.
+	ADDQ acc0, acc1 \ // acc1' = L(acc0 + acc1)
+	ADCQ $0, acc2 \ // acc2' = acc2 + carry1
+	ADCQ $0, acc3 \ // acc3' = acc3 + carry2
+	ADCQ $0, acc0 \ // acc0' = acc0 + carry3
+	\ // calculate the negative part: [0, -0x100000000, 0, -0x100000000] * acc0
+	SUBQ AX, acc1 \
 	SBBQ DX, acc2 \
 	SBBQ AX, acc3 \
 	SBBQ DX, acc0 \
@@ -382,6 +383,7 @@ sqrLoop:
 	ADCQ DX, t1
 	MOVQ t1, x_ptr
 
+	// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
 	p256SqrMontReduce()
 	p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
 	MOVQ res_ptr, x_ptr
@@ -450,6 +452,7 @@ sqrBMI2:
 	ADCQ AX, y_ptr
 	ADCQ t1, x_ptr
 
+	// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
 	p256SqrMontReduce()
 	p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
 	MOVQ res_ptr, x_ptr
@@ -1722,17 +1725,20 @@ ordSqrLoop:
 	ADCQ AX, y_ptr
 	ADCQ DX, t1
 	MOVQ t1, x_ptr
-	// First reduction step
+
+	// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
+	// First reduction step, [ord4, ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0]
 	MOVQ acc0, AX
 	MULQ p256ordK0<>(SB)
 	MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64
-
+	// calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0]
+	// the result is [acc0, acc3, acc2, acc1], the lowest limb is dropped.
 	MOVQ p256ord<>+0x00(SB), AX
 	MULQ t0
-	ADDQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0
+	ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
 	ADCQ $0, DX // DX = carry1 + H(t0 * ord0)
 	MOVQ DX, t1 // t1 = carry1 + H(t0 * ord0)
-	MOVQ t0, acc0
+	MOVQ t0, acc0 // acc0 = t0
 
 	MOVQ p256ord<>+0x08(SB), AX
 	MULQ t0
@@ -1743,7 +1749,7 @@ ordSqrLoop:
 	ADCQ DX, acc2
 	ADCQ $0, acc3
 	ADCQ $0, acc0
-
+	// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
 	MOVQ t0, AX
 	MOVQ t0, DX
 	SHLQ $32, AX
@@ -1918,23 +1924,25 @@ ordSqrLoopBMI2:
 	ADCQ AX, y_ptr
 	ADCQ t1, x_ptr
 
-	// First reduction step
+	// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
+	// First reduction step, [ord4, ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0]
 	MOVQ acc0, DX
 	MULXQ p256ordK0<>(SB), t0, AX
-
+	// calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0]
+	// the result is [acc0, acc3, acc2, acc1], the lowest limb is dropped.
 	MOVQ t0, DX // Y = t0 = (k0 * acc0) mod 2^64
 	MULXQ p256ord<>+0x00(SB), AX, t1
-	ADDQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0
-	ADCQ t1, acc1
-	MOVQ t0, acc0
+	ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
+	ADCQ t1, acc1 // (carry2, acc1) = acc1 + H(t0 * ord0) + carry1
+	MOVQ t0, acc0 // acc0 = t0
 
 	MULXQ p256ord<>+0x08(SB), AX, t1
 	ADCQ $0, t1 // t1 = carry2 + H(t0*ord1)
-	ADDQ AX, acc1 // (carry3, acc1) = acc1 + t1 + L(t0*ord1)
-	ADCQ t1, acc2
-	ADCQ $0, acc3
-	ADCQ $0, acc0
-
+	ADDQ AX, acc1 // (carry3, acc1) = acc1 + L(t0*ord1)
+	ADCQ t1, acc2 // (carry4, acc2) = acc2 + t1 + carry3
+	ADCQ $0, acc3 // (carry5, acc3) = acc3 + carry4
+	ADCQ $0, acc0 // acc0 = t0 + carry5
+	// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
 	MOVQ t0, AX
 	//MOVQ t0, DX // This is not required due to t0=DX already
 	SHLQ $32, AX
@@ -2065,6 +2073,7 @@ ordSqrLoopBMI2:
 #define t3 SI
 #define hlp BP
 /* ---------------------------------------*/
+// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]
 TEXT sm2P256SubInternal(SB),NOSPLIT,$0
 	XORQ mul0, mul0
 	SUBQ t0, acc4
@@ -2091,6 +2100,7 @@ TEXT sm2P256SubInternal(SB),NOSPLIT,$0
 	RET
 
 /* ---------------------------------------*/
+// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] * [t3, t2, t1, t0]
 TEXT sm2P256MulInternal(SB),NOSPLIT,$8
 	CMPB ·supportBMI2+0(SB), $0x01
 	JEQ internalMulBMI2
@@ -2538,6 +2548,7 @@ internalMulBMI2:
 	CMOVQCS t3, acc7 \
 
 /* ---------------------------------------*/
+// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4]^2
 TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
 	CMPB ·supportBMI2+0(SB), $0x01
 	JEQ internalSqrBMI2
@@ -2612,7 +2623,7 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
 	ADDQ acc4, t1
 	ADCQ mul0, t2
 	ADCQ DX, t3
-
+	// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
 	sm2P256SqrReductionInternal()
 
 	RET
@@ -2670,12 +2681,13 @@ internalSqrBMI2:
 	MULXQ mul1, mul0, acc4
 	ADCQ mul0, t2
 	ADCQ acc4, t3
-
+	// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
 	sm2P256SqrReductionInternal()
 
 	RET
 
 /* ---------------------------------------*/
+// [t3, t2, t1, t0] = 2[acc7, acc6, acc5, acc4]
 #define p256MulBy2Inline\
 	XORQ mul0, mul0;\
 	ADDQ acc4, acc4;\
@@ -2697,6 +2709,7 @@
 	CMOVQCS acc6, t2;\
 	CMOVQCS acc7, t3;
 /* ---------------------------------------*/
+// [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] + [t3, t2, t1, t0]
 #define p256AddInline \
 	XORQ mul0, mul0;\
 	ADDQ t0, acc4;\
diff --git a/internal/sm2ec/p256_asm_arm64.s b/internal/sm2ec/p256_asm_arm64.s
index 6ee0ff8..a3b1e04 100644
--- a/internal/sm2ec/p256_asm_arm64.s
+++ b/internal/sm2ec/p256_asm_arm64.s
@@ -837,6 +837,7 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
 	RET
 
 /* ---------------------------------------*/
+// (x3, x2, x1, x0) = (y3, y2, y1, y0) - (x3, x2, x1, x0)
 TEXT sm2P256Subinternal<>(SB),NOSPLIT,$0
 	SUBS x0, y0, acc0
 	SBCS x1, y1, acc1
@@ -858,6 +859,7 @@ TEXT sm2P256Subinternal<>(SB),NOSPLIT,$0
 	RET
 
 /* ---------------------------------------*/
+// (y3, y2, y1, y0) = (x3, x2, x1, x0) ^ 2
 TEXT sm2P256SqrInternal<>(SB),NOSPLIT,$0
 	// x[1:] * x[0]
 	MUL x0, x1, acc1
@@ -988,6 +990,7 @@ TEXT sm2P256SqrInternal<>(SB),NOSPLIT,$0
 	CSEL CS, acc6, acc3, y3
 	RET
 /* ---------------------------------------*/
+// (y3, y2, y1, y0) = (x3, x2, x1, x0) * (y3, y2, y1, y0)
 TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
 	// y[0] * x
 	MUL y0, x0, acc0
@@ -1146,6 +1149,7 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
 	CSEL CS, acc6, acc3, y3
 	RET
 /* ---------------------------------------*/
+// (x3, x2, x1, x0) = 2(y3, y2, y1, y0)
 #define p256MulBy2Inline \
 	ADDS y0, y0, x0; \
 	ADCS y1, y1, x1; \
@@ -1364,6 +1368,7 @@ TEXT ·p256PointAddAffineAsm(SB),0,$264-48
 
 	RET
 
+// (x3, x2, x1, x0) = (x3, x2, x1, x0) + (y3, y2, y1, y0)
 #define p256AddInline \
 	ADDS y0, x0, x0; \
 	ADCS y1, x1, x1; \
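
For reviewers, here is a standalone Go sketch (not part of this patch or of the sm2ec package; the names reduceStepP, limbs and the test values are invented for illustration) of the first p256SqrMontReduce reduction step, using the same positive/negative split as the new comments. The p256OrdSqr step follows the same pattern with the order's signed limbs [1, -0x100000000, -1, ord1, ord0] in place of the prime's.

// Sketch only: mirrors the commented first reduction step in plain Go.
package main

import (
	"fmt"
	"math/big"
	"math/bits"
)

// reduceStepP returns (a + a0*p) / 2^64 for a = [a3, a2, a1, a0], where the SM2
// prime is p = [1, -0x100000000, 0, 1 - 0x100000000, -1] in signed base-2^64
// limbs. Since p == -1 (mod 2^64), the Montgomery factor is a0 itself and the
// lowest limb of a + a0*p is zero, so it is dropped without a carry.
func reduceStepP(a0, a1, a2, a3 uint64) (r1, r2, r3, r4 uint64) {
	// positive part: [1, 0, 0, 1] * a0 + [0, a3, a2, a1]
	var c uint64
	r1, c = bits.Add64(a1, a0, 0)
	r2, c = bits.Add64(a2, 0, c)
	r3, c = bits.Add64(a3, 0, c)
	r4, _ = bits.Add64(a0, 0, c)
	// negative part: [0, -0x100000000, 0, -0x100000000] * a0
	lo, hi := a0<<32, a0>>32 // L(a0 * 2^32), H(a0 * 2^32)
	var b uint64
	r1, b = bits.Sub64(r1, lo, 0)
	r2, b = bits.Sub64(r2, hi, b)
	r3, b = bits.Sub64(r3, lo, b)
	r4, _ = bits.Sub64(r4, hi, b)
	return
}

// limbs assembles four 64-bit limbs (least significant first) into a big.Int.
func limbs(l0, l1, l2, l3 uint64) *big.Int {
	v := new(big.Int)
	for _, l := range []uint64{l3, l2, l1, l0} {
		v.Lsh(v, 64)
		v.Add(v, new(big.Int).SetUint64(l))
	}
	return v
}

func main() {
	p, _ := new(big.Int).SetString(
		"FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)

	// Arbitrary input limbs a = a0 + a1*2^64 + a2*2^128 + a3*2^192.
	a0, a1, a2, a3 := uint64(0x0123456789abcdef), uint64(0xfedcba9876543210),
		uint64(0x0f1e2d3c4b5a6978), uint64(0x1122334455667788)
	r1, r2, r3, r4 := reduceStepP(a0, a1, a2, a3)

	// Cross-check against math/big: the step must equal (a + a0*p) >> 64.
	want := new(big.Int).Add(limbs(a0, a1, a2, a3),
		new(big.Int).Mul(new(big.Int).SetUint64(a0), p))
	want.Rsh(want, 64)
	fmt.Println("match:", want.Cmp(limbs(r1, r2, r3, r4)) == 0)
}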