internal/sm2ec: mul WWMM reduction, sub first

This commit is contained in:
Sun Yimin 2024-02-22 17:44:16 +08:00 committed by GitHub
parent 2553456216
commit 052040fd82
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 209 additions and 364 deletions

View File

@ -291,7 +291,7 @@ ordSqrLoop:
ADCQ DX, acc2
ADCQ $0, acc3
ADCQ $0, acc0
// calculate the positive part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
@ -484,7 +484,7 @@ ordSqrLoopBMI2:
ADCQ t1, acc2 // (carry4, acc2) = acc2 + t1 + carry3
ADCQ $0, acc3 // (carry5, acc3) = acc3 + carry4
ADCQ $0, acc0 // acc0 = t0 + carry5
// calculate the positive part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
MOVQ t0, AX
//MOVQ t0, DX // This is not required due to t0=DX already
SHLQ $32, AX
@ -759,66 +759,8 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
ADDQ mul0, acc6
ADCQ $0, mul1
MOVQ mul1, acc7
// First reduction step
MOVQ acc0, mul0
MOVQ acc0, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc0, acc1
ADCQ $0, acc2
ADCQ $0, acc3
ADCQ $0, acc0
sm2P256MulReductionInternal()
SUBQ mul0, acc1
SBBQ mul1, acc2
SBBQ mul0, acc3
SBBQ mul1, acc0
// Second reduction step
MOVQ acc1, mul0
MOVQ acc1, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc1, acc2
ADCQ $0, acc3
ADCQ $0, acc0
ADCQ $0, acc1
SUBQ mul0, acc2
SBBQ mul1, acc3
SBBQ mul0, acc0
SBBQ mul1, acc1
// Third reduction step
MOVQ acc2, mul0
MOVQ acc2, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc2, acc3
ADCQ $0, acc0
ADCQ $0, acc1
ADCQ $0, acc2
SUBQ mul0, acc3
SBBQ mul1, acc0
SBBQ mul0, acc1
SBBQ mul1, acc2
// Last reduction step
MOVQ acc3, mul0
MOVQ acc3, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc3, acc0
ADCQ $0, acc1
ADCQ $0, acc2
ADCQ $0, acc3
SUBQ mul0, acc0
SBBQ mul1, acc1
SBBQ mul0, acc2
SBBQ mul1, acc3
MOVQ $0, BP
// Add bits [511:256] of the result
ADCQ acc0, acc4
@ -918,66 +860,7 @@ internalMulBMI2:
ADDQ mul0, acc6
ADCQ $0, acc7
// First reduction step
MOVQ acc0, mul0
MOVQ acc0, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc0, acc1
ADCQ $0, acc2
ADCQ $0, acc3
ADCQ $0, acc0
SUBQ mul0, acc1
SBBQ mul1, acc2
SBBQ mul0, acc3
SBBQ mul1, acc0
// Second reduction step
MOVQ acc1, mul0
MOVQ acc1, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc1, acc2
ADCQ $0, acc3
ADCQ $0, acc0
ADCQ $0, acc1
SUBQ mul0, acc2
SBBQ mul1, acc3
SBBQ mul0, acc0
SBBQ mul1, acc1
// Third reduction step
MOVQ acc2, mul0
MOVQ acc2, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc2, acc3
ADCQ $0, acc0
ADCQ $0, acc1
ADCQ $0, acc2
SUBQ mul0, acc3
SBBQ mul1, acc0
SBBQ mul0, acc1
SBBQ mul1, acc2
// Last reduction step
MOVQ acc3, mul0
MOVQ acc3, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc3, acc0
ADCQ $0, acc1
ADCQ $0, acc2
ADCQ $0, acc3
SUBQ mul0, acc0
SBBQ mul1, acc1
SBBQ mul0, acc2
SBBQ mul1, acc3
sm2P256MulReductionInternal()
MOVQ $0, BP
// Add bits [511:256] of the result
ADCQ acc0, acc4

View File

@ -207,54 +207,57 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
LSL $32, acc0, y0
LSR $32, acc0, y1
ADDS acc0, acc1, acc1
ADCS $0, acc2, acc2
ADCS $0, acc3, acc3
ADC $0, acc0, acc0
SUBS y0, acc1
SBCS y1, acc2
SBCS y0, acc3
SBC y1, acc0
SBC y1, acc0, y0
ADDS acc0, acc1, acc1
ADCS $0, acc2, acc2
ADCS $0, acc3, acc3
ADC $0, y0, acc0
// Second reduction step
LSL $32, acc1, y0
LSR $32, acc1, y1
ADDS acc1, acc2, acc2
ADCS $0, acc3, acc3
ADCS $0, acc0, acc0
ADC $0, acc1, acc1
SUBS y0, acc2
SBCS y1, acc3
SBCS y0, acc0
SBC y1, acc1
SBC y1, acc1, y0
ADDS acc1, acc2, acc2
ADCS $0, acc3, acc3
ADCS $0, acc0, acc0
ADC $0, y0, acc1
// Third reduction step
LSL $32, acc2, y0
LSR $32, acc2, y1
ADDS acc2, acc3, acc3
ADCS $0, acc0, acc0
ADCS $0, acc1, acc1
ADC $0, acc2, acc2
SUBS y0, acc3
SBCS y1, acc0
SBCS y0, acc1
SBC y1, acc2
SBC y1, acc2, y0
ADDS acc2, acc3, acc3
ADCS $0, acc0, acc0
ADCS $0, acc1, acc1
ADC $0, y0, acc2
// Last reduction step
LSL $32, acc3, y0
LSR $32, acc3, y1
ADDS acc3, acc0, acc0
ADCS $0, acc1, acc1
ADCS $0, acc2, acc2
ADC $0, acc3, acc3
SUBS y0, acc0
SBCS y1, acc1
SBCS y0, acc2
SBC y1, acc3
SBC y1, acc3, y0
ADDS acc3, acc0, acc0
ADCS $0, acc1, acc1
ADCS $0, acc2, acc2
ADC $0, y0, acc3
SUBS const0, acc0, t0
SBCS const1, acc1, t1
@ -967,15 +970,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
LSL $32, acc0, t0
LSR $32, acc0, t1
ADDS acc0, acc1, acc1
ADCS $0, acc2, acc2
ADCS $0, acc3, acc3
ADC $0, acc0, acc0
SUBS t0, acc1
SBCS t1, acc2
SBCS t0, acc3
SBC t1, acc0
SBC t1, acc0, t0
ADDS acc0, acc1, acc1
ADCS $0, acc2, acc2
ADCS $0, acc3, acc3
ADC $0, t0, acc0
// y[1] * x
MUL y1, x0, t0
@ -1003,15 +1006,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
LSL $32, acc1, t0
LSR $32, acc1, t1
ADDS acc1, acc2, acc2
ADCS $0, acc3, acc3
ADCS $0, acc0, acc0
ADC $0, acc1, acc1
SUBS t0, acc2
SBCS t1, acc3
SBCS t0, acc0
SBC t1, acc1
SBC t1, acc1, t0
ADDS acc1, acc2, acc2
ADCS $0, acc3, acc3
ADCS $0, acc0, acc0
ADC $0, t0, acc1
// y[2] * x
MUL y2, x0, t0
@ -1039,15 +1042,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
LSL $32, acc2, t0
LSR $32, acc2, t1
ADDS acc2, acc3, acc3
ADCS $0, acc0, acc0
ADCS $0, acc1, acc1
ADC $0, acc2, acc2
SUBS t0, acc3
SBCS t1, acc0
SBCS t0, acc1
SBC t1, acc2
SBC t1, acc2, t0
ADDS acc2, acc3, acc3
ADCS $0, acc0, acc0
ADCS $0, acc1, acc1
ADC $0, t0, acc2
// y[3] * x
MUL y3, x0, t0
@ -1075,15 +1078,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
LSL $32, acc3, t0
LSR $32, acc3, t1
ADDS acc3, acc0, acc0
ADCS $0, acc1, acc1
ADCS $0, acc2, acc2
ADC $0, acc3, acc3
SUBS t0, acc0
SBCS t1, acc1
SBCS t0, acc2
SBC t1, acc3
SBC t1, acc3, t0
ADDS acc3, acc0, acc0
ADCS $0, acc1, acc1
ADCS $0, acc2, acc2
ADC $0, t0, acc3
// Add bits [511:256] of the mul result
ADDS acc4, acc0, acc0

View File

@ -207,19 +207,19 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc0, acc1
SUBQ AX, acc1
SBBQ DX, acc2
SBBQ AX, acc3
MOVQ acc0, AX
SBBQ DX, acc0
ADDQ AX, acc1
ADCQ $0, acc2
ADCQ $0, acc3
ADCQ acc0, acc4
ADCQ $0, acc5
SUBQ AX, acc1
SBBQ DX, acc2
SBBQ AX, acc3
SBBQ DX, acc4
SBBQ $0, acc5
XORQ acc0, acc0
// x * y[1]
MOVQ (8*1)(y_ptr), t0
@ -258,19 +258,19 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc1, acc2
SUBQ AX, acc2
SBBQ DX, acc3
SBBQ AX, acc4
MOVQ acc1, AX
SBBQ DX, acc1
ADDQ AX, acc2
ADCQ $0, acc3
ADCQ $0, acc4
ADCQ acc1, acc5
ADCQ $0, acc0
SUBQ AX, acc2
SBBQ DX, acc3
SBBQ AX, acc4
SBBQ DX, acc5
SBBQ $0, acc0
XORQ acc1, acc1
// x * y[2]
MOVQ (8*2)(y_ptr), t0
@ -309,17 +309,18 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc2, acc3
SUBQ AX, acc3
SBBQ DX, acc4
SBBQ AX, acc5
MOVQ acc2, AX
SBBQ DX, acc2
ADDQ AX, acc3
ADCQ $0, acc4
ADCQ $0, acc5
ADCQ acc2, acc0
ADCQ $0, acc1
SUBQ AX, acc3
SBBQ DX, acc4
SBBQ AX, acc5
SBBQ DX, acc0
SBBQ $0, acc1
XORQ acc2, acc2
// x * y[3]
MOVQ (8*3)(y_ptr), t0
@ -359,17 +360,18 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc3, acc4
SUBQ AX, acc4
SBBQ DX, acc5
SBBQ AX, acc0
MOVQ acc3, AX
SBBQ DX, acc3
ADDQ AX, acc4
ADCQ $0, acc5
ADCQ $0, acc0
ADCQ acc3, acc1
ADCQ $0, acc2
SUBQ AX, acc4
SBBQ DX, acc5
SBBQ AX, acc0
SBBQ DX, acc1
SBBQ $0, acc2
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
RET
@ -395,19 +397,19 @@ mulBMI2:
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc0, acc1
SUBQ AX, acc1
SBBQ DX, acc2
SBBQ AX, acc3
MOVQ acc0, AX
SBBQ DX, acc0
ADDQ AX, acc1
ADCQ $0, acc2
ADCQ $0, acc3
ADCQ acc0, acc4
ADCQ $0, acc5
SUBQ AX, acc1
SBBQ DX, acc2
SBBQ AX, acc3
SBBQ DX, acc4
SBBQ $0, acc5
XORQ acc0, acc0
XORQ acc0, acc0
// x * y[1]
MOVQ (8*1)(y_ptr), DX
MULXQ (8*0)(x_ptr), AX, BX
@ -436,19 +438,19 @@ mulBMI2:
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc1, acc2
SUBQ AX, acc2
SBBQ DX, acc3
SBBQ AX, acc4
MOVQ acc1, AX
SBBQ DX, acc1
ADDQ AX, acc2
ADCQ $0, acc3
ADCQ $0, acc4
ADCQ acc1, acc5
ADCQ $0, acc0
SUBQ AX, acc2
SBBQ DX, acc3
SBBQ AX, acc4
SBBQ DX, acc5
SBBQ $0, acc0
XORQ acc1, acc1
// x * y[2]
MOVQ (8*2)(y_ptr), DX
@ -477,17 +479,18 @@ mulBMI2:
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc2, acc3
SUBQ AX, acc3
SBBQ DX, acc4
SBBQ AX, acc5
MOVQ acc2, AX
SBBQ DX, acc2
ADDQ AX, acc3
ADCQ $0, acc4
ADCQ $0, acc5
ADCQ acc2, acc0
ADCQ $0, acc1
SUBQ AX, acc3
SBBQ DX, acc4
SBBQ AX, acc5
SBBQ DX, acc0
SBBQ $0, acc1
XORQ acc2, acc2
// x * y[3]
MOVQ (8*3)(y_ptr), DX
@ -517,17 +520,18 @@ mulBMI2:
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc3, acc4
SUBQ AX, acc4
SBBQ DX, acc5
SBBQ AX, acc0
MOVQ acc3, AX
SBBQ DX, acc3
ADDQ AX, acc4
ADCQ $0, acc5
ADCQ $0, acc0
ADCQ acc3, acc1
ADCQ $0, acc2
SUBQ AX, acc4
SBBQ DX, acc5
SBBQ AX, acc0
SBBQ DX, acc1
SBBQ $0, acc2
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
RET
@ -550,32 +554,35 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc0, acc1
SUBQ AX, acc1
SBBQ DX, acc2
SBBQ AX, acc3
MOVQ acc0, AX
SBBQ DX, acc0
ADDQ AX, acc1
ADCQ $0, acc2
ADCQ $0, acc3
ADCQ acc0, acc4
SUBQ AX, acc1
SBBQ DX, acc2
SBBQ AX, acc3
SBBQ DX, acc4
XORQ acc5, acc5
// Second stage
MOVQ acc1, AX
MOVQ acc1, DX
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc1, acc2
SUBQ AX, acc2
SBBQ DX, acc3
SBBQ AX, acc4
MOVQ acc1, AX
SBBQ DX, acc5
ADDQ AX, acc2
ADCQ $0, acc3
ADCQ $0, acc4
ADCQ acc1, acc5
SUBQ AX, acc2
SBBQ DX, acc3
SBBQ AX, acc4
SBBQ DX, acc5
XORQ acc0, acc0
// Third stage
MOVQ acc2, AX
@ -583,15 +590,17 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc2, acc3
SUBQ AX, acc3
SBBQ DX, acc4
SBBQ AX, acc5
MOVQ acc2, AX
SBBQ DX, acc2
ADDQ AX, acc3
ADCQ $0, acc4
ADCQ $0, acc5
ADCQ acc2, acc0
SUBQ AX, acc3
SBBQ DX, acc4
SBBQ AX, acc5
SBBQ DX, acc0
XORQ acc1, acc1
// Last stage
MOVQ acc3, AX
@ -599,15 +608,16 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc3, acc4
ADCQ $0, acc5
ADCQ $0, acc0
ADCQ acc3, acc1
SUBQ AX, acc4
SBBQ DX, acc5
SBBQ AX, acc0
SBBQ DX, acc1
MOVQ acc3, AX
SBBQ DX, acc3
ADDQ AX, acc4
ADCQ $0, acc5
ADCQ $0, acc0
ADCQ acc3, acc1
MOVQ acc4, x_ptr
MOVQ acc5, acc3

View File

@ -237,6 +237,72 @@ GLOBL p256one<>(SB), 8, $32
CMOVQCS t2, acc6 \
CMOVQCS t3, acc7
// sm2P256MulReductionInternal performs four back-to-back reduction steps on the
// four limbs held in acc0..acc3, using mul0 and mul1 as scratch. All six names
// are register aliases defined outside this macro.
//
// Step k (k = 0..3, limb indices taken modulo 4) consumes t = acc[k]:
//   1. mul0 = t << 32, mul1 = t >> 32
//   2. subtract [mul1, mul0, mul1, mul0] (most significant limb first) from
//      [t, acc[k+3], acc[k+2], acc[k+1]] through one borrow chain
//   3. add t to acc[k+1] and ripple the carry up through acc[k]
// so, modulo 2^256, the four limbs become
//   acc[k+3]*2^128 + acc[k+2]*2^64 + acc[k+1] + t*(2^192 - 2^160 - 2^32 + 1)
// with acc[k] reused as the new most significant limb.
//
// After the fourth step the limbs are back in the order acc0 (least
// significant) .. acc3 (most significant). The carry flag is left as produced
// by the final ADCQ.
//
// NOTE(review): this has the shape of a word-by-word Montgomery reduction for a
// modulus p with (p + 1) / 2^64 = 2^192 - 2^160 - 2^32 + 1, presumably the SM2
// prime given the macro name -- confirm against the curve constants.
#define sm2P256MulReductionInternal() \
\// First reduction step: t = acc0, limbs [acc0, acc3, acc2, acc1]
MOVQ acc0, mul0 \
MOVQ acc0, mul1 \
SHLQ $32, mul0 \ // mul0 = t << 32
SHRQ $32, mul1 \ // mul1 = t >> 32
\// subtract first: [acc0, acc3, acc2, acc1] -= [mul1, mul0, mul1, mul0]
SUBQ mul0, acc1 \
SBBQ mul1, acc2 \
SBBQ mul0, acc3 \
MOVQ acc0, mul0 \ // keep t for the add below; MOVQ does not touch the borrow flag
SBBQ mul1, acc0 \ // acc0 becomes the new top limb
\// then add t at the lowest limb and ripple the carry upwards
ADDQ mul0, acc1 \
ADCQ $0, acc2 \
ADCQ $0, acc3 \
ADCQ $0, acc0 \
\// Second reduction step: t = acc1, limbs [acc1, acc0, acc3, acc2]
MOVQ acc1, mul0 \
MOVQ acc1, mul1 \
SHLQ $32, mul0 \ // mul0 = t << 32
SHRQ $32, mul1 \ // mul1 = t >> 32
\// subtract first: [acc1, acc0, acc3, acc2] -= [mul1, mul0, mul1, mul0]
SUBQ mul0, acc2 \
SBBQ mul1, acc3 \
SBBQ mul0, acc0 \
MOVQ acc1, mul0 \ // keep t; borrow flag is preserved across MOVQ
SBBQ mul1, acc1 \ // acc1 becomes the new top limb
\// then add t at the lowest limb and ripple the carry upwards
ADDQ mul0, acc2 \
ADCQ $0, acc3 \
ADCQ $0, acc0 \
ADCQ $0, acc1 \
\// Third reduction step: t = acc2, limbs [acc2, acc1, acc0, acc3]
MOVQ acc2, mul0 \
MOVQ acc2, mul1 \
SHLQ $32, mul0 \ // mul0 = t << 32
SHRQ $32, mul1 \ // mul1 = t >> 32
\// subtract first: [acc2, acc1, acc0, acc3] -= [mul1, mul0, mul1, mul0]
SUBQ mul0, acc3 \
SBBQ mul1, acc0 \
SBBQ mul0, acc1 \
MOVQ acc2, mul0 \ // keep t; borrow flag is preserved across MOVQ
SBBQ mul1, acc2 \ // acc2 becomes the new top limb
\// then add t at the lowest limb and ripple the carry upwards
ADDQ mul0, acc3 \
ADCQ $0, acc0 \
ADCQ $0, acc1 \
ADCQ $0, acc2 \
\// Last reduction step: t = acc3, limbs [acc3, acc2, acc1, acc0]
MOVQ acc3, mul0 \
MOVQ acc3, mul1 \
SHLQ $32, mul0 \ // mul0 = t << 32
SHRQ $32, mul1 \ // mul1 = t >> 32
\// subtract first: [acc3, acc2, acc1, acc0] -= [mul1, mul0, mul1, mul0]
SUBQ mul0, acc0 \
SBBQ mul1, acc1 \
SBBQ mul0, acc2 \
MOVQ acc3, mul0 \ // keep t; borrow flag is preserved across MOVQ
SBBQ mul1, acc3 \ // acc3 is the top limb again
\// then add t at the lowest limb and ripple the carry upwards
ADDQ mul0, acc0 \
ADCQ $0, acc1 \
ADCQ $0, acc2 \
ADCQ $0, acc3 // leaves CF set by this add; mul0 and mul1 are clobbered
#define p256PointDoubleInit() \
MOVOU (16*0)(BX), X0 \
MOVOU (16*1)(BX), X1 \

View File

@ -757,67 +757,9 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
ADDQ mul0, acc6
ADCQ $0, mul1
MOVQ mul1, acc7
// First reduction step
PEXTRQ $0, X0, acc0
MOVQ acc0, mul0
MOVQ acc0, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc0, acc1
ADCQ $0, acc2
ADCQ $0, acc3
ADCQ $0, acc0
SUBQ mul0, acc1
SBBQ mul1, acc2
SBBQ mul0, acc3
SBBQ mul1, acc0
// Second reduction step
MOVQ acc1, mul0
MOVQ acc1, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc1, acc2
ADCQ $0, acc3
ADCQ $0, acc0
ADCQ $0, acc1
SUBQ mul0, acc2
SBBQ mul1, acc3
SBBQ mul0, acc0
SBBQ mul1, acc1
// Third reduction step
MOVQ acc2, mul0
MOVQ acc2, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc2, acc3
ADCQ $0, acc0
ADCQ $0, acc1
ADCQ $0, acc2
SUBQ mul0, acc3
SBBQ mul1, acc0
SBBQ mul0, acc1
SBBQ mul1, acc2
// Last reduction step
MOVQ acc3, mul0
MOVQ acc3, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc3, acc0
ADCQ $0, acc1
ADCQ $0, acc2
ADCQ $0, acc3
SUBQ mul0, acc0
SBBQ mul1, acc1
SBBQ mul0, acc2
SBBQ mul1, acc3
sm2P256MulReductionInternal()
MOVQ $0, mul0
// Add bits [511:256] of the result
ADCQ acc0, acc4
@ -918,67 +860,8 @@ internalMulBMI2:
ADDQ mul0, acc6
ADCQ $0, acc7
// First reduction step
PEXTRQ $0, X0, acc0
MOVQ acc0, mul0
MOVQ acc0, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc0, acc1
ADCQ $0, acc2
ADCQ $0, acc3
ADCQ $0, acc0
SUBQ mul0, acc1
SBBQ mul1, acc2
SBBQ mul0, acc3
SBBQ mul1, acc0
// Second reduction step
MOVQ acc1, mul0
MOVQ acc1, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc1, acc2
ADCQ $0, acc3
ADCQ $0, acc0
ADCQ $0, acc1
SUBQ mul0, acc2
SBBQ mul1, acc3
SBBQ mul0, acc0
SBBQ mul1, acc1
// Third reduction step
MOVQ acc2, mul0
MOVQ acc2, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc2, acc3
ADCQ $0, acc0
ADCQ $0, acc1
ADCQ $0, acc2
SUBQ mul0, acc3
SBBQ mul1, acc0
SBBQ mul0, acc1
SBBQ mul1, acc2
// Last reduction step
MOVQ acc3, mul0
MOVQ acc3, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc3, acc0
ADCQ $0, acc1
ADCQ $0, acc2
ADCQ $0, acc3
SUBQ mul0, acc0
SBBQ mul1, acc1
SBBQ mul0, acc2
SBBQ mul1, acc3
sm2P256MulReductionInternal()
MOVQ $0, mul0
// Add bits [511:256] of the result
ADCQ acc0, acc4