mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-26 12:16:20 +08:00
internal/sm2ec: update WWMM reduction for Square
This commit is contained in:
parent
de2376fe60
commit
2553456216
@ -869,58 +869,62 @@ TEXT sm2P256SqrInternal<>(SB),NOSPLIT,$0
|
||||
ADCS t0, acc6, acc6
|
||||
UMULH x3, x3, t1
|
||||
ADCS t1, acc7, acc7
|
||||
|
||||
// First reduction step
|
||||
LSL $32, acc0, y0
|
||||
LSR $32, acc0, y1
|
||||
|
||||
ADDS acc0, acc1, acc1
|
||||
ADCS $0, acc2, acc2
|
||||
ADCS $0, acc3, acc3
|
||||
ADC $0, acc0, acc0
|
||||
|
||||
SUBS y0, acc1
|
||||
SBCS y1, acc2
|
||||
SBCS y0, acc3
|
||||
SBC y1, acc0
|
||||
SBC y1, acc0, y0
|
||||
|
||||
ADDS acc0, acc1, acc1
|
||||
ADCS $0, acc2, acc2
|
||||
ADCS $0, acc3, acc3
|
||||
ADC $0, y0, acc0
|
||||
|
||||
// Second reduction step
|
||||
LSL $32, acc1, y0
|
||||
LSR $32, acc1, y1
|
||||
|
||||
ADDS acc1, acc2, acc2
|
||||
ADCS $0, acc3, acc3
|
||||
ADCS $0, acc0, acc0
|
||||
ADC $0, acc1, acc1
|
||||
|
||||
SUBS y0, acc2
|
||||
SBCS y1, acc3
|
||||
SBCS y0, acc0
|
||||
SBC y1, acc1
|
||||
SBC y1, acc1, y0
|
||||
|
||||
ADDS acc1, acc2, acc2
|
||||
ADCS $0, acc3, acc3
|
||||
ADCS $0, acc0, acc0
|
||||
ADC $0, y0, acc1
|
||||
|
||||
// Third reduction step
|
||||
LSL $32, acc2, y0
|
||||
LSR $32, acc2, y1
|
||||
|
||||
ADDS acc2, acc3, acc3
|
||||
ADCS $0, acc0, acc0
|
||||
ADCS $0, acc1, acc1
|
||||
ADC $0, acc2, acc2
|
||||
|
||||
SUBS y0, acc3
|
||||
SBCS y1, acc0
|
||||
SBCS y0, acc1
|
||||
SBC y1, acc2
|
||||
SBC y1, acc2, y0
|
||||
|
||||
ADDS acc2, acc3, acc3
|
||||
ADCS $0, acc0, acc0
|
||||
ADCS $0, acc1, acc1
|
||||
ADC $0, y0, acc2
|
||||
|
||||
// Last reduction step
|
||||
LSL $32, acc3, y0
|
||||
LSR $32, acc3, y1
|
||||
|
||||
ADDS acc3, acc0, acc0
|
||||
ADCS $0, acc1, acc1
|
||||
ADCS $0, acc2, acc2
|
||||
ADC $0, acc3, acc3
|
||||
|
||||
SUBS y0, acc0
|
||||
SBCS y1, acc1
|
||||
SBCS y0, acc2
|
||||
SBC y1, acc3
|
||||
SBC y1, acc3, y0
|
||||
|
||||
ADDS acc3, acc0, acc0
|
||||
ADCS $0, acc1, acc1
|
||||
ADCS $0, acc2, acc2
|
||||
ADC $0, y0, acc3
|
||||
|
||||
// Add bits [511:256] of the sqr result
|
||||
ADDS acc4, acc0, acc0
|
||||
|
@ -30,72 +30,76 @@ GLOBL p256one<>(SB), 8, $32
|
||||
|
||||
#define p256SqrMontReduce() \
|
||||
\ // First reduction step; p as signed coefficients of [2^256, 2^192, 2^128, 2^64, 2^0] = [1, -0x100000000, 0, (1 - 0x100000000), -1]
|
||||
MOVQ acc0, AX \
|
||||
MOVQ acc0, DX \
|
||||
SHLQ $32, AX \ // AX = L(acc0 * 2^32), low part
|
||||
SHRQ $32, DX \ // DX = H(acc0 * 2^32), high part
|
||||
\ // calculate the positive part first: [1, 0, 0, 1] * acc0 + [0, acc3, acc2, acc1],
|
||||
\ // since (-1) * acc0 + acc0 == 0, the lowest limb is 0 and is dropped directly, no carry.
|
||||
ADDQ acc0, acc1 \ // acc1' = L (acc0 + acc1)
|
||||
ADCQ $0, acc2 \ // acc2' = acc2 + carry1
|
||||
ADCQ $0, acc3 \ // acc3' = acc3 + carry2
|
||||
ADCQ $0, acc0 \ // acc0' = acc0 + carry3
|
||||
MOVQ acc0, AX \
|
||||
MOVQ acc0, DX \
|
||||
SHLQ $32, AX \ // AX = L(acc0 * 2^32), low part
|
||||
SHRQ $32, DX \ // DX = H(acc0 * 2^32), high part
|
||||
\// calculate the negative part: [0, -0x100000000, 0, -0x100000000] * acc0
|
||||
SUBQ AX, acc1 \
|
||||
SBBQ DX, acc2 \
|
||||
SBBQ AX, acc3 \
|
||||
MOVQ acc0, AX \
|
||||
SBBQ DX, acc0 \
|
||||
\ // calculate the positive part: [1, 0, 0, 1] * acc0 + [0, acc3, acc2, acc1],
|
||||
\ // since (-1) * acc0 + acc0 == 0, the lowest limb is 0 and is dropped directly, no carry.
|
||||
ADDQ AX, acc1 \ // acc1' = L (acc0 + acc1)
|
||||
ADCQ $0, acc2 \ // acc2' = acc2 + carry1
|
||||
ADCQ $0, acc3 \ // acc3' = acc3 + carry2
|
||||
ADCQ $0, acc0 \ // acc0' = acc0 + carry3
|
||||
\ // Second reduction step
|
||||
MOVQ acc1, AX \
|
||||
MOVQ acc1, DX \
|
||||
SHLQ $32, AX \
|
||||
SHRQ $32, DX \
|
||||
\
|
||||
ADDQ acc1, acc2 \
|
||||
ADCQ $0, acc3 \
|
||||
ADCQ $0, acc0 \
|
||||
ADCQ $0, acc1 \
|
||||
SHLQ $32, AX \
|
||||
SHRQ $32, DX \
|
||||
\
|
||||
SUBQ AX, acc2 \
|
||||
SBBQ DX, acc3 \
|
||||
SBBQ AX, acc0 \
|
||||
MOVQ acc1, AX \
|
||||
SBBQ DX, acc1 \
|
||||
\
|
||||
ADDQ AX, acc2 \
|
||||
ADCQ $0, acc3 \
|
||||
ADCQ $0, acc0 \
|
||||
ADCQ $0, acc1 \
|
||||
\ // Third reduction step
|
||||
MOVQ acc2, AX \
|
||||
MOVQ acc2, DX \
|
||||
SHLQ $32, AX \
|
||||
SHRQ $32, DX \
|
||||
\
|
||||
ADDQ acc2, acc3 \
|
||||
ADCQ $0, acc0 \
|
||||
ADCQ $0, acc1 \
|
||||
ADCQ $0, acc2 \
|
||||
SHLQ $32, AX \
|
||||
SHRQ $32, DX \
|
||||
\
|
||||
SUBQ AX, acc3 \
|
||||
SBBQ DX, acc0 \
|
||||
SBBQ AX, acc1 \
|
||||
MOVQ acc2, AX \
|
||||
SBBQ DX, acc2 \
|
||||
\ // Last reduction step
|
||||
XORQ t0, t0 \
|
||||
MOVQ acc3, AX \
|
||||
MOVQ acc3, DX \
|
||||
SHLQ $32, AX \
|
||||
SHRQ $32, DX \
|
||||
\
|
||||
ADDQ acc3, acc0 \
|
||||
ADDQ AX, acc3 \
|
||||
ADCQ $0, acc0 \
|
||||
ADCQ $0, acc1 \
|
||||
ADCQ $0, acc2 \
|
||||
ADCQ $0, acc3 \
|
||||
\ // Last reduction step
|
||||
XORQ t0, t0 \
|
||||
MOVQ acc3, AX \
|
||||
MOVQ acc3, DX \
|
||||
SHLQ $32, AX \
|
||||
SHRQ $32, DX \
|
||||
\
|
||||
SUBQ AX, acc0 \
|
||||
SBBQ DX, acc1 \
|
||||
SBBQ AX, acc2 \
|
||||
MOVQ acc3, AX \
|
||||
SBBQ DX, acc3 \
|
||||
\
|
||||
ADDQ AX, acc0 \
|
||||
ADCQ $0, acc1 \
|
||||
ADCQ $0, acc2 \
|
||||
ADCQ $0, acc3 \
|
||||
\ // Add bits [511:256] of the sqr result
|
||||
ADCQ acc4, acc0 \
|
||||
ADCQ acc5, acc1 \
|
||||
ADCQ y_ptr, acc2 \
|
||||
ADCQ x_ptr, acc3 \
|
||||
ADCQ acc4, acc0 \
|
||||
ADCQ acc5, acc1 \
|
||||
ADCQ y_ptr, acc2 \
|
||||
ADCQ x_ptr, acc3 \
|
||||
ADCQ $0, t0
|
||||
|
||||
#define p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \
|
||||
@ -151,60 +155,64 @@ GLOBL p256one<>(SB), 8, $32
|
||||
SHLQ $32, mul0 \
|
||||
SHRQ $32, mul1 \
|
||||
\
|
||||
ADDQ acc0, acc1 \
|
||||
ADCQ $0, acc2 \
|
||||
ADCQ $0, acc3 \
|
||||
ADCQ $0, acc0 \
|
||||
\
|
||||
SUBQ mul0, acc1 \
|
||||
SBBQ mul1, acc2 \
|
||||
SBBQ mul0, acc3 \
|
||||
MOVQ acc0, mul0 \
|
||||
SBBQ mul1, acc0 \
|
||||
\
|
||||
ADDQ mul0, acc1 \
|
||||
ADCQ $0, acc2 \
|
||||
ADCQ $0, acc3 \
|
||||
ADCQ $0, acc0 \
|
||||
\ // Second reduction step
|
||||
MOVQ acc1, mul0 \
|
||||
MOVQ acc1, mul1 \
|
||||
SHLQ $32, mul0 \
|
||||
SHRQ $32, mul1 \
|
||||
\
|
||||
ADDQ acc1, acc2 \
|
||||
ADCQ $0, acc3 \
|
||||
ADCQ $0, acc0 \
|
||||
ADCQ $0, acc1 \
|
||||
\
|
||||
SUBQ mul0, acc2 \
|
||||
SBBQ mul1, acc3 \
|
||||
SBBQ mul0, acc0 \
|
||||
MOVQ acc1, mul0 \
|
||||
SBBQ mul1, acc1 \
|
||||
\
|
||||
ADDQ mul0, acc2 \
|
||||
ADCQ $0, acc3 \
|
||||
ADCQ $0, acc0 \
|
||||
ADCQ $0, acc1 \
|
||||
\ // Third reduction step
|
||||
MOVQ acc2, mul0 \
|
||||
MOVQ acc2, mul1 \
|
||||
SHLQ $32, mul0 \
|
||||
SHRQ $32, mul1 \
|
||||
\
|
||||
ADDQ acc2, acc3 \
|
||||
ADCQ $0, acc0 \
|
||||
ADCQ $0, acc1 \
|
||||
ADCQ $0, acc2 \
|
||||
\
|
||||
SUBQ mul0, acc3 \
|
||||
SBBQ mul1, acc0 \
|
||||
SBBQ mul0, acc1 \
|
||||
MOVQ acc2, mul0 \
|
||||
SBBQ mul1, acc2 \
|
||||
\
|
||||
ADDQ mul0, acc3 \
|
||||
ADCQ $0, acc0 \
|
||||
ADCQ $0, acc1 \
|
||||
ADCQ $0, acc2 \
|
||||
\ // Last reduction step
|
||||
MOVQ acc3, mul0 \
|
||||
MOVQ acc3, mul1 \
|
||||
SHLQ $32, mul0 \
|
||||
SHRQ $32, mul1 \
|
||||
\
|
||||
ADDQ acc3, acc0 \
|
||||
ADCQ $0, acc1 \
|
||||
ADCQ $0, acc2 \
|
||||
ADCQ $0, acc3 \
|
||||
\
|
||||
SUBQ mul0, acc0 \
|
||||
SBBQ mul1, acc1 \
|
||||
SBBQ mul0, acc2 \
|
||||
MOVQ acc3, mul0 \
|
||||
SBBQ mul1, acc3 \
|
||||
\
|
||||
ADDQ mul0, acc0 \
|
||||
ADCQ $0, acc1 \
|
||||
ADCQ $0, acc2 \
|
||||
ADCQ $0, acc3 \
|
||||
MOVQ $0, mul0 \
|
||||
\ // Add bits [511:256] of the result
|
||||
ADCQ acc0, t0 \
|
||||
|
Loading…
x
Reference in New Issue
Block a user