mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-26 20:26:19 +08:00
internal/sm2ec: update WWMM reduction for Square
This commit is contained in:
parent
de2376fe60
commit
2553456216
@ -869,58 +869,62 @@ TEXT sm2P256SqrInternal<>(SB),NOSPLIT,$0
|
|||||||
ADCS t0, acc6, acc6
|
ADCS t0, acc6, acc6
|
||||||
UMULH x3, x3, t1
|
UMULH x3, x3, t1
|
||||||
ADCS t1, acc7, acc7
|
ADCS t1, acc7, acc7
|
||||||
|
|
||||||
// First reduction step
|
// First reduction step
|
||||||
LSL $32, acc0, y0
|
LSL $32, acc0, y0
|
||||||
LSR $32, acc0, y1
|
LSR $32, acc0, y1
|
||||||
|
|
||||||
ADDS acc0, acc1, acc1
|
|
||||||
ADCS $0, acc2, acc2
|
|
||||||
ADCS $0, acc3, acc3
|
|
||||||
ADC $0, acc0, acc0
|
|
||||||
|
|
||||||
SUBS y0, acc1
|
SUBS y0, acc1
|
||||||
SBCS y1, acc2
|
SBCS y1, acc2
|
||||||
SBCS y0, acc3
|
SBCS y0, acc3
|
||||||
SBC y1, acc0
|
SBC y1, acc0, y0
|
||||||
|
|
||||||
|
ADDS acc0, acc1, acc1
|
||||||
|
ADCS $0, acc2, acc2
|
||||||
|
ADCS $0, acc3, acc3
|
||||||
|
ADC $0, y0, acc0
|
||||||
|
|
||||||
// Second reduction step
|
// Second reduction step
|
||||||
LSL $32, acc1, y0
|
LSL $32, acc1, y0
|
||||||
LSR $32, acc1, y1
|
LSR $32, acc1, y1
|
||||||
|
|
||||||
ADDS acc1, acc2, acc2
|
|
||||||
ADCS $0, acc3, acc3
|
|
||||||
ADCS $0, acc0, acc0
|
|
||||||
ADC $0, acc1, acc1
|
|
||||||
|
|
||||||
SUBS y0, acc2
|
SUBS y0, acc2
|
||||||
SBCS y1, acc3
|
SBCS y1, acc3
|
||||||
SBCS y0, acc0
|
SBCS y0, acc0
|
||||||
SBC y1, acc1
|
SBC y1, acc1, y0
|
||||||
|
|
||||||
|
ADDS acc1, acc2, acc2
|
||||||
|
ADCS $0, acc3, acc3
|
||||||
|
ADCS $0, acc0, acc0
|
||||||
|
ADC $0, y0, acc1
|
||||||
|
|
||||||
// Third reduction step
|
// Third reduction step
|
||||||
LSL $32, acc2, y0
|
LSL $32, acc2, y0
|
||||||
LSR $32, acc2, y1
|
LSR $32, acc2, y1
|
||||||
|
|
||||||
ADDS acc2, acc3, acc3
|
|
||||||
ADCS $0, acc0, acc0
|
|
||||||
ADCS $0, acc1, acc1
|
|
||||||
ADC $0, acc2, acc2
|
|
||||||
|
|
||||||
SUBS y0, acc3
|
SUBS y0, acc3
|
||||||
SBCS y1, acc0
|
SBCS y1, acc0
|
||||||
SBCS y0, acc1
|
SBCS y0, acc1
|
||||||
SBC y1, acc2
|
SBC y1, acc2, y0
|
||||||
|
|
||||||
|
ADDS acc2, acc3, acc3
|
||||||
|
ADCS $0, acc0, acc0
|
||||||
|
ADCS $0, acc1, acc1
|
||||||
|
ADC $0, y0, acc2
|
||||||
|
|
||||||
// Last reduction step
|
// Last reduction step
|
||||||
LSL $32, acc3, y0
|
LSL $32, acc3, y0
|
||||||
LSR $32, acc3, y1
|
LSR $32, acc3, y1
|
||||||
|
|
||||||
ADDS acc3, acc0, acc0
|
|
||||||
ADCS $0, acc1, acc1
|
|
||||||
ADCS $0, acc2, acc2
|
|
||||||
ADC $0, acc3, acc3
|
|
||||||
|
|
||||||
SUBS y0, acc0
|
SUBS y0, acc0
|
||||||
SBCS y1, acc1
|
SBCS y1, acc1
|
||||||
SBCS y0, acc2
|
SBCS y0, acc2
|
||||||
SBC y1, acc3
|
SBC y1, acc3, y0
|
||||||
|
|
||||||
|
ADDS acc3, acc0, acc0
|
||||||
|
ADCS $0, acc1, acc1
|
||||||
|
ADCS $0, acc2, acc2
|
||||||
|
ADC $0, y0, acc3
|
||||||
|
|
||||||
// Add bits [511:256] of the sqr result
|
// Add bits [511:256] of the sqr result
|
||||||
ADDS acc4, acc0, acc0
|
ADDS acc4, acc0, acc0
|
||||||
|
@ -30,72 +30,76 @@ GLOBL p256one<>(SB), 8, $32
|
|||||||
|
|
||||||
#define p256SqrMontReduce() \
|
#define p256SqrMontReduce() \
|
||||||
\ // First reduction step, p = [1, -0x100000000, 0, (1 - 0x100000000), -1] as signed 64-bit limb coefficients of [2^256, 2^192, 2^128, 2^64, 2^0]
|
\ // First reduction step, p = [1, -0x100000000, 0, (1 - 0x100000000), -1] as signed 64-bit limb coefficients of [2^256, 2^192, 2^128, 2^64, 2^0]
|
||||||
MOVQ acc0, AX \
|
MOVQ acc0, AX \
|
||||||
MOVQ acc0, DX \
|
MOVQ acc0, DX \
|
||||||
SHLQ $32, AX \ // AX = L(acc0 * 2^32), low part
|
SHLQ $32, AX \ // AX = L(acc0 * 2^32), low part
|
||||||
SHRQ $32, DX \ // DX = H(acc0 * 2^32), high part
|
SHRQ $32, DX \ // DX = H(acc0 * 2^32), high part
|
||||||
\ // calculate the positive part first: [1, 0, 0, 1] * acc0 + [0, acc3, acc2, acc1],
|
|
||||||
\ // since (-1) * acc0 + acc0 == 0, the lowest limb (limb 0) is dropped directly, with no carry.
|
|
||||||
ADDQ acc0, acc1 \ // acc1' = L (acc0 + acc1)
|
|
||||||
ADCQ $0, acc2 \ // acc2' = acc2 + carry1
|
|
||||||
ADCQ $0, acc3 \ // acc3' = acc3 + carry2
|
|
||||||
ADCQ $0, acc0 \ // acc0' = acc0 + carry3
|
|
||||||
\// calculate the negative part: [0, -0x100000000, 0, -0x100000000] * acc0
|
\// calculate the negative part: [0, -0x100000000, 0, -0x100000000] * acc0
|
||||||
SUBQ AX, acc1 \
|
SUBQ AX, acc1 \
|
||||||
SBBQ DX, acc2 \
|
SBBQ DX, acc2 \
|
||||||
SBBQ AX, acc3 \
|
SBBQ AX, acc3 \
|
||||||
|
MOVQ acc0, AX \
|
||||||
SBBQ DX, acc0 \
|
SBBQ DX, acc0 \
|
||||||
|
\ // calculate the positive part: [1, 0, 0, 1] * acc0 + [0, acc3, acc2, acc1],
|
||||||
|
\ // since (-1) * acc0 + acc0 == 0, the lowest limb (limb 0) is dropped directly, with no carry.
|
||||||
|
ADDQ AX, acc1 \ // acc1' = L (acc0 + acc1)
|
||||||
|
ADCQ $0, acc2 \ // acc2' = acc2 + carry1
|
||||||
|
ADCQ $0, acc3 \ // acc3' = acc3 + carry2
|
||||||
|
ADCQ $0, acc0 \ // acc0' = acc0 + carry3
|
||||||
\ // Second reduction step
|
\ // Second reduction step
|
||||||
MOVQ acc1, AX \
|
MOVQ acc1, AX \
|
||||||
MOVQ acc1, DX \
|
MOVQ acc1, DX \
|
||||||
SHLQ $32, AX \
|
SHLQ $32, AX \
|
||||||
SHRQ $32, DX \
|
SHRQ $32, DX \
|
||||||
\
|
|
||||||
ADDQ acc1, acc2 \
|
|
||||||
ADCQ $0, acc3 \
|
|
||||||
ADCQ $0, acc0 \
|
|
||||||
ADCQ $0, acc1 \
|
|
||||||
\
|
\
|
||||||
SUBQ AX, acc2 \
|
SUBQ AX, acc2 \
|
||||||
SBBQ DX, acc3 \
|
SBBQ DX, acc3 \
|
||||||
SBBQ AX, acc0 \
|
SBBQ AX, acc0 \
|
||||||
|
MOVQ acc1, AX \
|
||||||
SBBQ DX, acc1 \
|
SBBQ DX, acc1 \
|
||||||
|
\
|
||||||
|
ADDQ AX, acc2 \
|
||||||
|
ADCQ $0, acc3 \
|
||||||
|
ADCQ $0, acc0 \
|
||||||
|
ADCQ $0, acc1 \
|
||||||
\ // Third reduction step
|
\ // Third reduction step
|
||||||
MOVQ acc2, AX \
|
MOVQ acc2, AX \
|
||||||
MOVQ acc2, DX \
|
MOVQ acc2, DX \
|
||||||
SHLQ $32, AX \
|
SHLQ $32, AX \
|
||||||
SHRQ $32, DX \
|
SHRQ $32, DX \
|
||||||
\
|
|
||||||
ADDQ acc2, acc3 \
|
|
||||||
ADCQ $0, acc0 \
|
|
||||||
ADCQ $0, acc1 \
|
|
||||||
ADCQ $0, acc2 \
|
|
||||||
\
|
\
|
||||||
SUBQ AX, acc3 \
|
SUBQ AX, acc3 \
|
||||||
SBBQ DX, acc0 \
|
SBBQ DX, acc0 \
|
||||||
SBBQ AX, acc1 \
|
SBBQ AX, acc1 \
|
||||||
|
MOVQ acc2, AX \
|
||||||
SBBQ DX, acc2 \
|
SBBQ DX, acc2 \
|
||||||
\ // Last reduction step
|
|
||||||
XORQ t0, t0 \
|
|
||||||
MOVQ acc3, AX \
|
|
||||||
MOVQ acc3, DX \
|
|
||||||
SHLQ $32, AX \
|
|
||||||
SHRQ $32, DX \
|
|
||||||
\
|
\
|
||||||
ADDQ acc3, acc0 \
|
ADDQ AX, acc3 \
|
||||||
|
ADCQ $0, acc0 \
|
||||||
ADCQ $0, acc1 \
|
ADCQ $0, acc1 \
|
||||||
ADCQ $0, acc2 \
|
ADCQ $0, acc2 \
|
||||||
ADCQ $0, acc3 \
|
\ // Last reduction step
|
||||||
|
XORQ t0, t0 \
|
||||||
|
MOVQ acc3, AX \
|
||||||
|
MOVQ acc3, DX \
|
||||||
|
SHLQ $32, AX \
|
||||||
|
SHRQ $32, DX \
|
||||||
\
|
\
|
||||||
SUBQ AX, acc0 \
|
SUBQ AX, acc0 \
|
||||||
SBBQ DX, acc1 \
|
SBBQ DX, acc1 \
|
||||||
SBBQ AX, acc2 \
|
SBBQ AX, acc2 \
|
||||||
|
MOVQ acc3, AX \
|
||||||
SBBQ DX, acc3 \
|
SBBQ DX, acc3 \
|
||||||
|
\
|
||||||
|
ADDQ AX, acc0 \
|
||||||
|
ADCQ $0, acc1 \
|
||||||
|
ADCQ $0, acc2 \
|
||||||
|
ADCQ $0, acc3 \
|
||||||
\ // Add bits [511:256] of the sqr result
|
\ // Add bits [511:256] of the sqr result
|
||||||
ADCQ acc4, acc0 \
|
ADCQ acc4, acc0 \
|
||||||
ADCQ acc5, acc1 \
|
ADCQ acc5, acc1 \
|
||||||
ADCQ y_ptr, acc2 \
|
ADCQ y_ptr, acc2 \
|
||||||
ADCQ x_ptr, acc3 \
|
ADCQ x_ptr, acc3 \
|
||||||
ADCQ $0, t0
|
ADCQ $0, t0
|
||||||
|
|
||||||
#define p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \
|
#define p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \
|
||||||
@ -151,60 +155,64 @@ GLOBL p256one<>(SB), 8, $32
|
|||||||
SHLQ $32, mul0 \
|
SHLQ $32, mul0 \
|
||||||
SHRQ $32, mul1 \
|
SHRQ $32, mul1 \
|
||||||
\
|
\
|
||||||
ADDQ acc0, acc1 \
|
|
||||||
ADCQ $0, acc2 \
|
|
||||||
ADCQ $0, acc3 \
|
|
||||||
ADCQ $0, acc0 \
|
|
||||||
\
|
|
||||||
SUBQ mul0, acc1 \
|
SUBQ mul0, acc1 \
|
||||||
SBBQ mul1, acc2 \
|
SBBQ mul1, acc2 \
|
||||||
SBBQ mul0, acc3 \
|
SBBQ mul0, acc3 \
|
||||||
|
MOVQ acc0, mul0 \
|
||||||
SBBQ mul1, acc0 \
|
SBBQ mul1, acc0 \
|
||||||
|
\
|
||||||
|
ADDQ mul0, acc1 \
|
||||||
|
ADCQ $0, acc2 \
|
||||||
|
ADCQ $0, acc3 \
|
||||||
|
ADCQ $0, acc0 \
|
||||||
\ // Second reduction step
|
\ // Second reduction step
|
||||||
MOVQ acc1, mul0 \
|
MOVQ acc1, mul0 \
|
||||||
MOVQ acc1, mul1 \
|
MOVQ acc1, mul1 \
|
||||||
SHLQ $32, mul0 \
|
SHLQ $32, mul0 \
|
||||||
SHRQ $32, mul1 \
|
SHRQ $32, mul1 \
|
||||||
\
|
\
|
||||||
ADDQ acc1, acc2 \
|
|
||||||
ADCQ $0, acc3 \
|
|
||||||
ADCQ $0, acc0 \
|
|
||||||
ADCQ $0, acc1 \
|
|
||||||
\
|
|
||||||
SUBQ mul0, acc2 \
|
SUBQ mul0, acc2 \
|
||||||
SBBQ mul1, acc3 \
|
SBBQ mul1, acc3 \
|
||||||
SBBQ mul0, acc0 \
|
SBBQ mul0, acc0 \
|
||||||
|
MOVQ acc1, mul0 \
|
||||||
SBBQ mul1, acc1 \
|
SBBQ mul1, acc1 \
|
||||||
|
\
|
||||||
|
ADDQ mul0, acc2 \
|
||||||
|
ADCQ $0, acc3 \
|
||||||
|
ADCQ $0, acc0 \
|
||||||
|
ADCQ $0, acc1 \
|
||||||
\ // Third reduction step
|
\ // Third reduction step
|
||||||
MOVQ acc2, mul0 \
|
MOVQ acc2, mul0 \
|
||||||
MOVQ acc2, mul1 \
|
MOVQ acc2, mul1 \
|
||||||
SHLQ $32, mul0 \
|
SHLQ $32, mul0 \
|
||||||
SHRQ $32, mul1 \
|
SHRQ $32, mul1 \
|
||||||
\
|
\
|
||||||
ADDQ acc2, acc3 \
|
|
||||||
ADCQ $0, acc0 \
|
|
||||||
ADCQ $0, acc1 \
|
|
||||||
ADCQ $0, acc2 \
|
|
||||||
\
|
|
||||||
SUBQ mul0, acc3 \
|
SUBQ mul0, acc3 \
|
||||||
SBBQ mul1, acc0 \
|
SBBQ mul1, acc0 \
|
||||||
SBBQ mul0, acc1 \
|
SBBQ mul0, acc1 \
|
||||||
|
MOVQ acc2, mul0 \
|
||||||
SBBQ mul1, acc2 \
|
SBBQ mul1, acc2 \
|
||||||
|
\
|
||||||
|
ADDQ mul0, acc3 \
|
||||||
|
ADCQ $0, acc0 \
|
||||||
|
ADCQ $0, acc1 \
|
||||||
|
ADCQ $0, acc2 \
|
||||||
\ // Last reduction step
|
\ // Last reduction step
|
||||||
MOVQ acc3, mul0 \
|
MOVQ acc3, mul0 \
|
||||||
MOVQ acc3, mul1 \
|
MOVQ acc3, mul1 \
|
||||||
SHLQ $32, mul0 \
|
SHLQ $32, mul0 \
|
||||||
SHRQ $32, mul1 \
|
SHRQ $32, mul1 \
|
||||||
\
|
\
|
||||||
ADDQ acc3, acc0 \
|
|
||||||
ADCQ $0, acc1 \
|
|
||||||
ADCQ $0, acc2 \
|
|
||||||
ADCQ $0, acc3 \
|
|
||||||
\
|
|
||||||
SUBQ mul0, acc0 \
|
SUBQ mul0, acc0 \
|
||||||
SBBQ mul1, acc1 \
|
SBBQ mul1, acc1 \
|
||||||
SBBQ mul0, acc2 \
|
SBBQ mul0, acc2 \
|
||||||
|
MOVQ acc3, mul0 \
|
||||||
SBBQ mul1, acc3 \
|
SBBQ mul1, acc3 \
|
||||||
|
\
|
||||||
|
ADDQ mul0, acc0 \
|
||||||
|
ADCQ $0, acc1 \
|
||||||
|
ADCQ $0, acc2 \
|
||||||
|
ADCQ $0, acc3 \
|
||||||
MOVQ $0, mul0 \
|
MOVQ $0, mul0 \
|
||||||
\ // Add bits [511:256] of the result
|
\ // Add bits [511:256] of the result
|
||||||
ADCQ acc0, t0 \
|
ADCQ acc0, t0 \
|
||||||
|
Loading…
x
Reference in New Issue
Block a user