internal/sm2ec: update WWMM reduction for Square

This commit is contained in:
Sun Yimin 2024-02-22 14:53:02 +08:00 committed by GitHub
parent de2376fe60
commit 2553456216
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 92 additions and 80 deletions

View File

@ -869,58 +869,62 @@ TEXT sm2P256SqrInternal<>(SB),NOSPLIT,$0
ADCS t0, acc6, acc6 ADCS t0, acc6, acc6
UMULH x3, x3, t1 UMULH x3, x3, t1
ADCS t1, acc7, acc7 ADCS t1, acc7, acc7
// First reduction step // First reduction step
LSL $32, acc0, y0 LSL $32, acc0, y0
LSR $32, acc0, y1 LSR $32, acc0, y1
ADDS acc0, acc1, acc1
ADCS $0, acc2, acc2
ADCS $0, acc3, acc3
ADC $0, acc0, acc0
SUBS y0, acc1 SUBS y0, acc1
SBCS y1, acc2 SBCS y1, acc2
SBCS y0, acc3 SBCS y0, acc3
SBC y1, acc0 SBC y1, acc0, y0
ADDS acc0, acc1, acc1
ADCS $0, acc2, acc2
ADCS $0, acc3, acc3
ADC $0, y0, acc0
// Second reduction step // Second reduction step
LSL $32, acc1, y0 LSL $32, acc1, y0
LSR $32, acc1, y1 LSR $32, acc1, y1
ADDS acc1, acc2, acc2
ADCS $0, acc3, acc3
ADCS $0, acc0, acc0
ADC $0, acc1, acc1
SUBS y0, acc2 SUBS y0, acc2
SBCS y1, acc3 SBCS y1, acc3
SBCS y0, acc0 SBCS y0, acc0
SBC y1, acc1 SBC y1, acc1, y0
ADDS acc1, acc2, acc2
ADCS $0, acc3, acc3
ADCS $0, acc0, acc0
ADC $0, y0, acc1
// Third reduction step // Third reduction step
LSL $32, acc2, y0 LSL $32, acc2, y0
LSR $32, acc2, y1 LSR $32, acc2, y1
ADDS acc2, acc3, acc3
ADCS $0, acc0, acc0
ADCS $0, acc1, acc1
ADC $0, acc2, acc2
SUBS y0, acc3 SUBS y0, acc3
SBCS y1, acc0 SBCS y1, acc0
SBCS y0, acc1 SBCS y0, acc1
SBC y1, acc2 SBC y1, acc2, y0
ADDS acc2, acc3, acc3
ADCS $0, acc0, acc0
ADCS $0, acc1, acc1
ADC $0, y0, acc2
// Last reduction step // Last reduction step
LSL $32, acc3, y0 LSL $32, acc3, y0
LSR $32, acc3, y1 LSR $32, acc3, y1
ADDS acc3, acc0, acc0
ADCS $0, acc1, acc1
ADCS $0, acc2, acc2
ADC $0, acc3, acc3
SUBS y0, acc0 SUBS y0, acc0
SBCS y1, acc1 SBCS y1, acc1
SBCS y0, acc2 SBCS y0, acc2
SBC y1, acc3 SBC y1, acc3, y0
ADDS acc3, acc0, acc0
ADCS $0, acc1, acc1
ADCS $0, acc2, acc2
ADC $0, y0, acc3
// Add bits [511:256] of the sqr result // Add bits [511:256] of the sqr result
ADDS acc4, acc0, acc0 ADDS acc4, acc0, acc0

View File

@ -30,72 +30,76 @@ GLOBL p256one<>(SB), 8, $32
#define p256SqrMontReduce() \ #define p256SqrMontReduce() \
\ // First reduction step, [p4, p3, p2, p1, p0] = [1, -0x100000000, 0, (1 - 0x100000000), -1] \ // First reduction step, [p4, p3, p2, p1, p0] = [1, -0x100000000, 0, (1 - 0x100000000), -1]
MOVQ acc0, AX \ MOVQ acc0, AX \
MOVQ acc0, DX \ MOVQ acc0, DX \
SHLQ $32, AX \ // AX = L(acc0 * 2^32), low part SHLQ $32, AX \ // AX = L(acc0 * 2^32), low part
SHRQ $32, DX \ // DX = H(acc0 * 2^32), high part SHRQ $32, DX \ // DX = H(acc0 * 2^32), high part
\ // calculate the positive part first: [1, 0, 0, 1] * acc0 + [0, acc3, acc2, acc1],
\ // since (-1) * acc0 + acc0 == 0, the lowest limb becomes 0 and is dropped directly, with no carry.
ADDQ acc0, acc1 \ // acc1' = L (acc0 + acc1)
ADCQ $0, acc2 \ // acc2' = acc2 + carry1
ADCQ $0, acc3 \ // acc3' = acc3 + carry2
ADCQ $0, acc0 \ // acc0' = acc0 + carry3
\// calculate the negative part: [0, -0x100000000, 0, -0x100000000] * acc0 \// calculate the negative part: [0, -0x100000000, 0, -0x100000000] * acc0
SUBQ AX, acc1 \ SUBQ AX, acc1 \
SBBQ DX, acc2 \ SBBQ DX, acc2 \
SBBQ AX, acc3 \ SBBQ AX, acc3 \
MOVQ acc0, AX \
SBBQ DX, acc0 \ SBBQ DX, acc0 \
\ // calculate the positive part: [1, 0, 0, 1] * acc0 + [0, acc3, acc2, acc1],
\ // since (-1) * acc0 + acc0 == 0, the lowest limb becomes 0 and is dropped directly, with no carry.
ADDQ AX, acc1 \ // acc1' = L (acc0 + acc1)
ADCQ $0, acc2 \ // acc2' = acc2 + carry1
ADCQ $0, acc3 \ // acc3' = acc3 + carry2
ADCQ $0, acc0 \ // acc0' = acc0 + carry3
\ // Second reduction step \ // Second reduction step
MOVQ acc1, AX \ MOVQ acc1, AX \
MOVQ acc1, DX \ MOVQ acc1, DX \
SHLQ $32, AX \ SHLQ $32, AX \
SHRQ $32, DX \ SHRQ $32, DX \
\
ADDQ acc1, acc2 \
ADCQ $0, acc3 \
ADCQ $0, acc0 \
ADCQ $0, acc1 \
\ \
SUBQ AX, acc2 \ SUBQ AX, acc2 \
SBBQ DX, acc3 \ SBBQ DX, acc3 \
SBBQ AX, acc0 \ SBBQ AX, acc0 \
MOVQ acc1, AX \
SBBQ DX, acc1 \ SBBQ DX, acc1 \
\
ADDQ AX, acc2 \
ADCQ $0, acc3 \
ADCQ $0, acc0 \
ADCQ $0, acc1 \
\ // Third reduction step \ // Third reduction step
MOVQ acc2, AX \ MOVQ acc2, AX \
MOVQ acc2, DX \ MOVQ acc2, DX \
SHLQ $32, AX \ SHLQ $32, AX \
SHRQ $32, DX \ SHRQ $32, DX \
\
ADDQ acc2, acc3 \
ADCQ $0, acc0 \
ADCQ $0, acc1 \
ADCQ $0, acc2 \
\ \
SUBQ AX, acc3 \ SUBQ AX, acc3 \
SBBQ DX, acc0 \ SBBQ DX, acc0 \
SBBQ AX, acc1 \ SBBQ AX, acc1 \
MOVQ acc2, AX \
SBBQ DX, acc2 \ SBBQ DX, acc2 \
\ // Last reduction step
XORQ t0, t0 \
MOVQ acc3, AX \
MOVQ acc3, DX \
SHLQ $32, AX \
SHRQ $32, DX \
\ \
ADDQ acc3, acc0 \ ADDQ AX, acc3 \
ADCQ $0, acc0 \
ADCQ $0, acc1 \ ADCQ $0, acc1 \
ADCQ $0, acc2 \ ADCQ $0, acc2 \
ADCQ $0, acc3 \ \ // Last reduction step
XORQ t0, t0 \
MOVQ acc3, AX \
MOVQ acc3, DX \
SHLQ $32, AX \
SHRQ $32, DX \
\ \
SUBQ AX, acc0 \ SUBQ AX, acc0 \
SBBQ DX, acc1 \ SBBQ DX, acc1 \
SBBQ AX, acc2 \ SBBQ AX, acc2 \
MOVQ acc3, AX \
SBBQ DX, acc3 \ SBBQ DX, acc3 \
\
ADDQ AX, acc0 \
ADCQ $0, acc1 \
ADCQ $0, acc2 \
ADCQ $0, acc3 \
\ // Add bits [511:256] of the sqr result \ // Add bits [511:256] of the sqr result
ADCQ acc4, acc0 \ ADCQ acc4, acc0 \
ADCQ acc5, acc1 \ ADCQ acc5, acc1 \
ADCQ y_ptr, acc2 \ ADCQ y_ptr, acc2 \
ADCQ x_ptr, acc3 \ ADCQ x_ptr, acc3 \
ADCQ $0, t0 ADCQ $0, t0
#define p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \ #define p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \
@ -151,60 +155,64 @@ GLOBL p256one<>(SB), 8, $32
SHLQ $32, mul0 \ SHLQ $32, mul0 \
SHRQ $32, mul1 \ SHRQ $32, mul1 \
\ \
ADDQ acc0, acc1 \
ADCQ $0, acc2 \
ADCQ $0, acc3 \
ADCQ $0, acc0 \
\
SUBQ mul0, acc1 \ SUBQ mul0, acc1 \
SBBQ mul1, acc2 \ SBBQ mul1, acc2 \
SBBQ mul0, acc3 \ SBBQ mul0, acc3 \
MOVQ acc0, mul0 \
SBBQ mul1, acc0 \ SBBQ mul1, acc0 \
\
ADDQ mul0, acc1 \
ADCQ $0, acc2 \
ADCQ $0, acc3 \
ADCQ $0, acc0 \
\ // Second reduction step \ // Second reduction step
MOVQ acc1, mul0 \ MOVQ acc1, mul0 \
MOVQ acc1, mul1 \ MOVQ acc1, mul1 \
SHLQ $32, mul0 \ SHLQ $32, mul0 \
SHRQ $32, mul1 \ SHRQ $32, mul1 \
\ \
ADDQ acc1, acc2 \
ADCQ $0, acc3 \
ADCQ $0, acc0 \
ADCQ $0, acc1 \
\
SUBQ mul0, acc2 \ SUBQ mul0, acc2 \
SBBQ mul1, acc3 \ SBBQ mul1, acc3 \
SBBQ mul0, acc0 \ SBBQ mul0, acc0 \
MOVQ acc1, mul0 \
SBBQ mul1, acc1 \ SBBQ mul1, acc1 \
\
ADDQ mul0, acc2 \
ADCQ $0, acc3 \
ADCQ $0, acc0 \
ADCQ $0, acc1 \
\ // Third reduction step \ // Third reduction step
MOVQ acc2, mul0 \ MOVQ acc2, mul0 \
MOVQ acc2, mul1 \ MOVQ acc2, mul1 \
SHLQ $32, mul0 \ SHLQ $32, mul0 \
SHRQ $32, mul1 \ SHRQ $32, mul1 \
\ \
ADDQ acc2, acc3 \
ADCQ $0, acc0 \
ADCQ $0, acc1 \
ADCQ $0, acc2 \
\
SUBQ mul0, acc3 \ SUBQ mul0, acc3 \
SBBQ mul1, acc0 \ SBBQ mul1, acc0 \
SBBQ mul0, acc1 \ SBBQ mul0, acc1 \
MOVQ acc2, mul0 \
SBBQ mul1, acc2 \ SBBQ mul1, acc2 \
\
ADDQ mul0, acc3 \
ADCQ $0, acc0 \
ADCQ $0, acc1 \
ADCQ $0, acc2 \
\ // Last reduction step \ // Last reduction step
MOVQ acc3, mul0 \ MOVQ acc3, mul0 \
MOVQ acc3, mul1 \ MOVQ acc3, mul1 \
SHLQ $32, mul0 \ SHLQ $32, mul0 \
SHRQ $32, mul1 \ SHRQ $32, mul1 \
\ \
ADDQ acc3, acc0 \
ADCQ $0, acc1 \
ADCQ $0, acc2 \
ADCQ $0, acc3 \
\
SUBQ mul0, acc0 \ SUBQ mul0, acc0 \
SBBQ mul1, acc1 \ SBBQ mul1, acc1 \
SBBQ mul0, acc2 \ SBBQ mul0, acc2 \
MOVQ acc3, mul0 \
SBBQ mul1, acc3 \ SBBQ mul1, acc3 \
\
ADDQ mul0, acc0 \
ADCQ $0, acc1 \
ADCQ $0, acc2 \
ADCQ $0, acc3 \
MOVQ $0, mul0 \ MOVQ $0, mul0 \
\ // Add bits [511:256] of the result \ // Add bits [511:256] of the result
ADCQ acc0, t0 \ ADCQ acc0, t0 \