internal/sm2ec: mul WWMM reduction, sub first

This commit is contained in:
Sun Yimin 2024-02-22 17:44:16 +08:00 committed by GitHub
parent 2553456216
commit 052040fd82
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 209 additions and 364 deletions

View File

@ -291,7 +291,7 @@ ordSqrLoop:
ADCQ DX, acc2 ADCQ DX, acc2
ADCQ $0, acc3 ADCQ $0, acc3
ADCQ $0, acc0 ADCQ $0, acc0
// calculate the positive part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0 // calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
MOVQ t0, AX MOVQ t0, AX
MOVQ t0, DX MOVQ t0, DX
SHLQ $32, AX SHLQ $32, AX
@ -484,7 +484,7 @@ ordSqrLoopBMI2:
ADCQ t1, acc2 // (carry4, acc2) = acc2 + t1 + carry3 ADCQ t1, acc2 // (carry4, acc2) = acc2 + t1 + carry3
ADCQ $0, acc3 // (carry5, acc3) = acc3 + carry4 ADCQ $0, acc3 // (carry5, acc3) = acc3 + carry4
ADCQ $0, acc0 // acc0 = t0 + carry5 ADCQ $0, acc0 // acc0 = t0 + carry5
// calculate the positive part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0 // calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
MOVQ t0, AX MOVQ t0, AX
//MOVQ t0, DX // This is not required due to t0=DX already //MOVQ t0, DX // This is not required due to t0=DX already
SHLQ $32, AX SHLQ $32, AX
@ -759,66 +759,8 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
ADDQ mul0, acc6 ADDQ mul0, acc6
ADCQ $0, mul1 ADCQ $0, mul1
MOVQ mul1, acc7 MOVQ mul1, acc7
// First reduction step sm2P256MulReductionInternal()
MOVQ acc0, mul0
MOVQ acc0, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc0, acc1
ADCQ $0, acc2
ADCQ $0, acc3
ADCQ $0, acc0
SUBQ mul0, acc1
SBBQ mul1, acc2
SBBQ mul0, acc3
SBBQ mul1, acc0
// Second reduction step
MOVQ acc1, mul0
MOVQ acc1, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc1, acc2
ADCQ $0, acc3
ADCQ $0, acc0
ADCQ $0, acc1
SUBQ mul0, acc2
SBBQ mul1, acc3
SBBQ mul0, acc0
SBBQ mul1, acc1
// Third reduction step
MOVQ acc2, mul0
MOVQ acc2, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc2, acc3
ADCQ $0, acc0
ADCQ $0, acc1
ADCQ $0, acc2
SUBQ mul0, acc3
SBBQ mul1, acc0
SBBQ mul0, acc1
SBBQ mul1, acc2
// Last reduction step
MOVQ acc3, mul0
MOVQ acc3, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc3, acc0
ADCQ $0, acc1
ADCQ $0, acc2
ADCQ $0, acc3
SUBQ mul0, acc0
SBBQ mul1, acc1
SBBQ mul0, acc2
SBBQ mul1, acc3
MOVQ $0, BP MOVQ $0, BP
// Add bits [511:256] of the result // Add bits [511:256] of the result
ADCQ acc0, acc4 ADCQ acc0, acc4
@ -918,66 +860,7 @@ internalMulBMI2:
ADDQ mul0, acc6 ADDQ mul0, acc6
ADCQ $0, acc7 ADCQ $0, acc7
// First reduction step sm2P256MulReductionInternal()
MOVQ acc0, mul0
MOVQ acc0, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc0, acc1
ADCQ $0, acc2
ADCQ $0, acc3
ADCQ $0, acc0
SUBQ mul0, acc1
SBBQ mul1, acc2
SBBQ mul0, acc3
SBBQ mul1, acc0
// Second reduction step
MOVQ acc1, mul0
MOVQ acc1, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc1, acc2
ADCQ $0, acc3
ADCQ $0, acc0
ADCQ $0, acc1
SUBQ mul0, acc2
SBBQ mul1, acc3
SBBQ mul0, acc0
SBBQ mul1, acc1
// Third reduction step
MOVQ acc2, mul0
MOVQ acc2, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc2, acc3
ADCQ $0, acc0
ADCQ $0, acc1
ADCQ $0, acc2
SUBQ mul0, acc3
SBBQ mul1, acc0
SBBQ mul0, acc1
SBBQ mul1, acc2
// Last reduction step
MOVQ acc3, mul0
MOVQ acc3, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc3, acc0
ADCQ $0, acc1
ADCQ $0, acc2
ADCQ $0, acc3
SUBQ mul0, acc0
SBBQ mul1, acc1
SBBQ mul0, acc2
SBBQ mul1, acc3
MOVQ $0, BP MOVQ $0, BP
// Add bits [511:256] of the result // Add bits [511:256] of the result
ADCQ acc0, acc4 ADCQ acc0, acc4

View File

@ -207,54 +207,57 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
LSL $32, acc0, y0 LSL $32, acc0, y0
LSR $32, acc0, y1 LSR $32, acc0, y1
ADDS acc0, acc1, acc1
ADCS $0, acc2, acc2
ADCS $0, acc3, acc3
ADC $0, acc0, acc0
SUBS y0, acc1 SUBS y0, acc1
SBCS y1, acc2 SBCS y1, acc2
SBCS y0, acc3 SBCS y0, acc3
SBC y1, acc0 SBC y1, acc0, y0
ADDS acc0, acc1, acc1
ADCS $0, acc2, acc2
ADCS $0, acc3, acc3
ADC $0, y0, acc0
// Second reduction step // Second reduction step
LSL $32, acc1, y0 LSL $32, acc1, y0
LSR $32, acc1, y1 LSR $32, acc1, y1
ADDS acc1, acc2, acc2
ADCS $0, acc3, acc3
ADCS $0, acc0, acc0
ADC $0, acc1, acc1
SUBS y0, acc2 SUBS y0, acc2
SBCS y1, acc3 SBCS y1, acc3
SBCS y0, acc0 SBCS y0, acc0
SBC y1, acc1 SBC y1, acc1, y0
ADDS acc1, acc2, acc2
ADCS $0, acc3, acc3
ADCS $0, acc0, acc0
ADC $0, y0, acc1
// Third reduction step // Third reduction step
LSL $32, acc2, y0 LSL $32, acc2, y0
LSR $32, acc2, y1 LSR $32, acc2, y1
ADDS acc2, acc3, acc3
ADCS $0, acc0, acc0
ADCS $0, acc1, acc1
ADC $0, acc2, acc2
SUBS y0, acc3 SUBS y0, acc3
SBCS y1, acc0 SBCS y1, acc0
SBCS y0, acc1 SBCS y0, acc1
SBC y1, acc2 SBC y1, acc2, y0
ADDS acc2, acc3, acc3
ADCS $0, acc0, acc0
ADCS $0, acc1, acc1
ADC $0, y0, acc2
// Last reduction step // Last reduction step
LSL $32, acc3, y0 LSL $32, acc3, y0
LSR $32, acc3, y1 LSR $32, acc3, y1
ADDS acc3, acc0, acc0
ADCS $0, acc1, acc1
ADCS $0, acc2, acc2
ADC $0, acc3, acc3
SUBS y0, acc0 SUBS y0, acc0
SBCS y1, acc1 SBCS y1, acc1
SBCS y0, acc2 SBCS y0, acc2
SBC y1, acc3 SBC y1, acc3, y0
ADDS acc3, acc0, acc0
ADCS $0, acc1, acc1
ADCS $0, acc2, acc2
ADC $0, y0, acc3
SUBS const0, acc0, t0 SUBS const0, acc0, t0
SBCS const1, acc1, t1 SBCS const1, acc1, t1
@ -967,15 +970,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
LSL $32, acc0, t0 LSL $32, acc0, t0
LSR $32, acc0, t1 LSR $32, acc0, t1
ADDS acc0, acc1, acc1
ADCS $0, acc2, acc2
ADCS $0, acc3, acc3
ADC $0, acc0, acc0
SUBS t0, acc1 SUBS t0, acc1
SBCS t1, acc2 SBCS t1, acc2
SBCS t0, acc3 SBCS t0, acc3
SBC t1, acc0 SBC t1, acc0, t0
ADDS acc0, acc1, acc1
ADCS $0, acc2, acc2
ADCS $0, acc3, acc3
ADC $0, t0, acc0
// y[1] * x // y[1] * x
MUL y1, x0, t0 MUL y1, x0, t0
@ -1003,15 +1006,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
LSL $32, acc1, t0 LSL $32, acc1, t0
LSR $32, acc1, t1 LSR $32, acc1, t1
ADDS acc1, acc2, acc2
ADCS $0, acc3, acc3
ADCS $0, acc0, acc0
ADC $0, acc1, acc1
SUBS t0, acc2 SUBS t0, acc2
SBCS t1, acc3 SBCS t1, acc3
SBCS t0, acc0 SBCS t0, acc0
SBC t1, acc1 SBC t1, acc1, t0
ADDS acc1, acc2, acc2
ADCS $0, acc3, acc3
ADCS $0, acc0, acc0
ADC $0, t0, acc1
// y[2] * x // y[2] * x
MUL y2, x0, t0 MUL y2, x0, t0
@ -1039,15 +1042,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
LSL $32, acc2, t0 LSL $32, acc2, t0
LSR $32, acc2, t1 LSR $32, acc2, t1
ADDS acc2, acc3, acc3
ADCS $0, acc0, acc0
ADCS $0, acc1, acc1
ADC $0, acc2, acc2
SUBS t0, acc3 SUBS t0, acc3
SBCS t1, acc0 SBCS t1, acc0
SBCS t0, acc1 SBCS t0, acc1
SBC t1, acc2 SBC t1, acc2, t0
ADDS acc2, acc3, acc3
ADCS $0, acc0, acc0
ADCS $0, acc1, acc1
ADC $0, t0, acc2
// y[3] * x // y[3] * x
MUL y3, x0, t0 MUL y3, x0, t0
@ -1075,15 +1078,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
LSL $32, acc3, t0 LSL $32, acc3, t0
LSR $32, acc3, t1 LSR $32, acc3, t1
ADDS acc3, acc0, acc0
ADCS $0, acc1, acc1
ADCS $0, acc2, acc2
ADC $0, acc3, acc3
SUBS t0, acc0 SUBS t0, acc0
SBCS t1, acc1 SBCS t1, acc1
SBCS t0, acc2 SBCS t0, acc2
SBC t1, acc3 SBC t1, acc3, t0
ADDS acc3, acc0, acc0
ADCS $0, acc1, acc1
ADCS $0, acc2, acc2
ADC $0, t0, acc3
// Add bits [511:256] of the mul result // Add bits [511:256] of the mul result
ADDS acc4, acc0, acc0 ADDS acc4, acc0, acc0

View File

@ -207,19 +207,19 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
SHLQ $32, AX SHLQ $32, AX
SHRQ $32, DX SHRQ $32, DX
ADDQ acc0, acc1 SUBQ AX, acc1
SBBQ DX, acc2
SBBQ AX, acc3
MOVQ acc0, AX
SBBQ DX, acc0
ADDQ AX, acc1
ADCQ $0, acc2 ADCQ $0, acc2
ADCQ $0, acc3 ADCQ $0, acc3
ADCQ acc0, acc4 ADCQ acc0, acc4
ADCQ $0, acc5 ADCQ $0, acc5
SUBQ AX, acc1
SBBQ DX, acc2
SBBQ AX, acc3
SBBQ DX, acc4
SBBQ $0, acc5
XORQ acc0, acc0 XORQ acc0, acc0
// x * y[1] // x * y[1]
MOVQ (8*1)(y_ptr), t0 MOVQ (8*1)(y_ptr), t0
@ -258,19 +258,19 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
SHLQ $32, AX SHLQ $32, AX
SHRQ $32, DX SHRQ $32, DX
ADDQ acc1, acc2 SUBQ AX, acc2
SBBQ DX, acc3
SBBQ AX, acc4
MOVQ acc1, AX
SBBQ DX, acc1
ADDQ AX, acc2
ADCQ $0, acc3 ADCQ $0, acc3
ADCQ $0, acc4 ADCQ $0, acc4
ADCQ acc1, acc5 ADCQ acc1, acc5
ADCQ $0, acc0 ADCQ $0, acc0
SUBQ AX, acc2
SBBQ DX, acc3
SBBQ AX, acc4
SBBQ DX, acc5
SBBQ $0, acc0
XORQ acc1, acc1 XORQ acc1, acc1
// x * y[2] // x * y[2]
MOVQ (8*2)(y_ptr), t0 MOVQ (8*2)(y_ptr), t0
@ -309,17 +309,18 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
SHLQ $32, AX SHLQ $32, AX
SHRQ $32, DX SHRQ $32, DX
ADDQ acc2, acc3 SUBQ AX, acc3
SBBQ DX, acc4
SBBQ AX, acc5
MOVQ acc2, AX
SBBQ DX, acc2
ADDQ AX, acc3
ADCQ $0, acc4 ADCQ $0, acc4
ADCQ $0, acc5 ADCQ $0, acc5
ADCQ acc2, acc0 ADCQ acc2, acc0
ADCQ $0, acc1 ADCQ $0, acc1
SUBQ AX, acc3
SBBQ DX, acc4
SBBQ AX, acc5
SBBQ DX, acc0
SBBQ $0, acc1
XORQ acc2, acc2 XORQ acc2, acc2
// x * y[3] // x * y[3]
MOVQ (8*3)(y_ptr), t0 MOVQ (8*3)(y_ptr), t0
@ -359,17 +360,18 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
SHLQ $32, AX SHLQ $32, AX
SHRQ $32, DX SHRQ $32, DX
ADDQ acc3, acc4 SUBQ AX, acc4
SBBQ DX, acc5
SBBQ AX, acc0
MOVQ acc3, AX
SBBQ DX, acc3
ADDQ AX, acc4
ADCQ $0, acc5 ADCQ $0, acc5
ADCQ $0, acc0 ADCQ $0, acc0
ADCQ acc3, acc1 ADCQ acc3, acc1
ADCQ $0, acc2 ADCQ $0, acc2
SUBQ AX, acc4
SBBQ DX, acc5
SBBQ AX, acc0
SBBQ DX, acc1
SBBQ $0, acc2
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr) p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
RET RET
@ -395,19 +397,19 @@ mulBMI2:
SHLQ $32, AX SHLQ $32, AX
SHRQ $32, DX SHRQ $32, DX
ADDQ acc0, acc1 SUBQ AX, acc1
SBBQ DX, acc2
SBBQ AX, acc3
MOVQ acc0, AX
SBBQ DX, acc0
ADDQ AX, acc1
ADCQ $0, acc2 ADCQ $0, acc2
ADCQ $0, acc3 ADCQ $0, acc3
ADCQ acc0, acc4 ADCQ acc0, acc4
ADCQ $0, acc5 ADCQ $0, acc5
SUBQ AX, acc1
SBBQ DX, acc2
SBBQ AX, acc3
SBBQ DX, acc4
SBBQ $0, acc5
XORQ acc0, acc0 XORQ acc0, acc0
// x * y[1] // x * y[1]
MOVQ (8*1)(y_ptr), DX MOVQ (8*1)(y_ptr), DX
MULXQ (8*0)(x_ptr), AX, BX MULXQ (8*0)(x_ptr), AX, BX
@ -436,19 +438,19 @@ mulBMI2:
SHLQ $32, AX SHLQ $32, AX
SHRQ $32, DX SHRQ $32, DX
ADDQ acc1, acc2 SUBQ AX, acc2
SBBQ DX, acc3
SBBQ AX, acc4
MOVQ acc1, AX
SBBQ DX, acc1
ADDQ AX, acc2
ADCQ $0, acc3 ADCQ $0, acc3
ADCQ $0, acc4 ADCQ $0, acc4
ADCQ acc1, acc5 ADCQ acc1, acc5
ADCQ $0, acc0 ADCQ $0, acc0
SUBQ AX, acc2
SBBQ DX, acc3
SBBQ AX, acc4
SBBQ DX, acc5
SBBQ $0, acc0
XORQ acc1, acc1 XORQ acc1, acc1
// x * y[2] // x * y[2]
MOVQ (8*2)(y_ptr), DX MOVQ (8*2)(y_ptr), DX
@ -477,17 +479,18 @@ mulBMI2:
SHLQ $32, AX SHLQ $32, AX
SHRQ $32, DX SHRQ $32, DX
ADDQ acc2, acc3 SUBQ AX, acc3
SBBQ DX, acc4
SBBQ AX, acc5
MOVQ acc2, AX
SBBQ DX, acc2
ADDQ AX, acc3
ADCQ $0, acc4 ADCQ $0, acc4
ADCQ $0, acc5 ADCQ $0, acc5
ADCQ acc2, acc0 ADCQ acc2, acc0
ADCQ $0, acc1 ADCQ $0, acc1
SUBQ AX, acc3
SBBQ DX, acc4
SBBQ AX, acc5
SBBQ DX, acc0
SBBQ $0, acc1
XORQ acc2, acc2 XORQ acc2, acc2
// x * y[3] // x * y[3]
MOVQ (8*3)(y_ptr), DX MOVQ (8*3)(y_ptr), DX
@ -517,17 +520,18 @@ mulBMI2:
SHLQ $32, AX SHLQ $32, AX
SHRQ $32, DX SHRQ $32, DX
ADDQ acc3, acc4 SUBQ AX, acc4
SBBQ DX, acc5
SBBQ AX, acc0
MOVQ acc3, AX
SBBQ DX, acc3
ADDQ AX, acc4
ADCQ $0, acc5 ADCQ $0, acc5
ADCQ $0, acc0 ADCQ $0, acc0
ADCQ acc3, acc1 ADCQ acc3, acc1
ADCQ $0, acc2 ADCQ $0, acc2
SUBQ AX, acc4
SBBQ DX, acc5
SBBQ AX, acc0
SBBQ DX, acc1
SBBQ $0, acc2
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr) p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
RET RET
@ -550,32 +554,35 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
SHLQ $32, AX SHLQ $32, AX
SHRQ $32, DX SHRQ $32, DX
ADDQ acc0, acc1 SUBQ AX, acc1
SBBQ DX, acc2
SBBQ AX, acc3
MOVQ acc0, AX
SBBQ DX, acc0
ADDQ AX, acc1
ADCQ $0, acc2 ADCQ $0, acc2
ADCQ $0, acc3 ADCQ $0, acc3
ADCQ acc0, acc4 ADCQ acc0, acc4
SUBQ AX, acc1
SBBQ DX, acc2
SBBQ AX, acc3
SBBQ DX, acc4
XORQ acc5, acc5 XORQ acc5, acc5
// Second stage // Second stage
MOVQ acc1, AX MOVQ acc1, AX
MOVQ acc1, DX MOVQ acc1, DX
SHLQ $32, AX SHLQ $32, AX
SHRQ $32, DX SHRQ $32, DX
ADDQ acc1, acc2 SUBQ AX, acc2
SBBQ DX, acc3
SBBQ AX, acc4
MOVQ acc1, AX
SBBQ DX, acc5
ADDQ AX, acc2
ADCQ $0, acc3 ADCQ $0, acc3
ADCQ $0, acc4 ADCQ $0, acc4
ADCQ acc1, acc5 ADCQ acc1, acc5
SUBQ AX, acc2
SBBQ DX, acc3
SBBQ AX, acc4
SBBQ DX, acc5
XORQ acc0, acc0 XORQ acc0, acc0
// Third stage // Third stage
MOVQ acc2, AX MOVQ acc2, AX
@ -583,15 +590,17 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
SHLQ $32, AX SHLQ $32, AX
SHRQ $32, DX SHRQ $32, DX
ADDQ acc2, acc3 SUBQ AX, acc3
SBBQ DX, acc4
SBBQ AX, acc5
MOVQ acc2, AX
SBBQ DX, acc2
ADDQ AX, acc3
ADCQ $0, acc4 ADCQ $0, acc4
ADCQ $0, acc5 ADCQ $0, acc5
ADCQ acc2, acc0 ADCQ acc2, acc0
SUBQ AX, acc3
SBBQ DX, acc4
SBBQ AX, acc5
SBBQ DX, acc0
XORQ acc1, acc1 XORQ acc1, acc1
// Last stage // Last stage
MOVQ acc3, AX MOVQ acc3, AX
@ -599,15 +608,16 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
SHLQ $32, AX SHLQ $32, AX
SHRQ $32, DX SHRQ $32, DX
ADDQ acc3, acc4
ADCQ $0, acc5
ADCQ $0, acc0
ADCQ acc3, acc1
SUBQ AX, acc4 SUBQ AX, acc4
SBBQ DX, acc5 SBBQ DX, acc5
SBBQ AX, acc0 SBBQ AX, acc0
SBBQ DX, acc1 MOVQ acc3, AX
SBBQ DX, acc3
ADDQ AX, acc4
ADCQ $0, acc5
ADCQ $0, acc0
ADCQ acc3, acc1
MOVQ acc4, x_ptr MOVQ acc4, x_ptr
MOVQ acc5, acc3 MOVQ acc5, acc3

View File

@ -237,6 +237,72 @@ GLOBL p256one<>(SB), 8, $32
CMOVQCS t2, acc6 \ CMOVQCS t2, acc6 \
CMOVQCS t3, acc7 CMOVQCS t3, acc7
// sm2P256MulReductionInternal performs four chained reduction steps on the
// limbs held in acc0..acc3, in "subtract first, then add" order.
//
// Each step takes the current lowest limb t (acc0, acc1, acc2, acc3 in turn):
//   1. mul0 = t << 32, mul1 = t >> 32.
//   2. [mul0, mul1, mul0, mul1] is subtracted, with borrow, from the four
//      limbs that follow; the last of those is t's own register, which
//      therefore ends up as t - (t >> 32) - borrow.
//   3. The saved copy of t is added to the first of those limbs and the
//      carry is rippled through the remaining three.
// The limb order rotates by one register per step, so after the fourth step
// the carry chain again runs acc0, acc1, acc2, acc3.
//
// Modifies: acc0, acc1, acc2, acc3. Scratch: mul0, mul1. Flags are clobbered.
//
// NOTE(review): presumably this is the word-wise Montgomery reduction modulo
// the SM2 prime; the modulus itself is not visible in this macro - confirm.
// NOTE(review): the call sites seen alongside this macro continue with ADCQ
// after a MOVQ, i.e. they consume the carry flag left by the final ADCQ
// below - confirm that this carry is always clear.
#define sm2P256MulReductionInternal() \
\// First reduction step: t = acc0, limb order acc1, acc2, acc3, acc0
MOVQ acc0, mul0 \
MOVQ acc0, mul1 \
SHLQ $32, mul0 \
SHRQ $32, mul1 \
\
\// subtract [t<<32, t>>32, t<<32, t>>32]; the borrow ripples upward
SUBQ mul0, acc1 \
SBBQ mul1, acc2 \
SBBQ mul0, acc3 \
\// save t in mul0 before acc0 is overwritten; MOVQ leaves the borrow flag intact
MOVQ acc0, mul0 \
SBBQ mul1, acc0 \
\
\// add t at the lowest limb and propagate the carry
ADDQ mul0, acc1 \
ADCQ $0, acc2 \
ADCQ $0, acc3 \
ADCQ $0, acc0 \
\// Second reduction step: t = acc1, limb order acc2, acc3, acc0, acc1
MOVQ acc1, mul0 \
MOVQ acc1, mul1 \
SHLQ $32, mul0 \
SHRQ $32, mul1 \
\
SUBQ mul0, acc2 \
SBBQ mul1, acc3 \
SBBQ mul0, acc0 \
\// save t (flags untouched by MOVQ), then finish the borrow chain in acc1
MOVQ acc1, mul0 \
SBBQ mul1, acc1 \
\
ADDQ mul0, acc2 \
ADCQ $0, acc3 \
ADCQ $0, acc0 \
ADCQ $0, acc1 \
\// Third reduction step: t = acc2, limb order acc3, acc0, acc1, acc2
MOVQ acc2, mul0 \
MOVQ acc2, mul1 \
SHLQ $32, mul0 \
SHRQ $32, mul1 \
\
SUBQ mul0, acc3 \
SBBQ mul1, acc0 \
SBBQ mul0, acc1 \
\// save t (flags untouched by MOVQ), then finish the borrow chain in acc2
MOVQ acc2, mul0 \
SBBQ mul1, acc2 \
\
ADDQ mul0, acc3 \
ADCQ $0, acc0 \
ADCQ $0, acc1 \
ADCQ $0, acc2 \
\// Last reduction step: t = acc3, limb order acc0, acc1, acc2, acc3
MOVQ acc3, mul0 \
MOVQ acc3, mul1 \
SHLQ $32, mul0 \
SHRQ $32, mul1 \
\
SUBQ mul0, acc0 \
SBBQ mul1, acc1 \
SBBQ mul0, acc2 \
\// save t (flags untouched by MOVQ), then finish the borrow chain in acc3
MOVQ acc3, mul0 \
SBBQ mul1, acc3 \
\
\// final add-back; the result limbs are acc0 (lowest) .. acc3 (highest)
ADDQ mul0, acc0 \
ADCQ $0, acc1 \
ADCQ $0, acc2 \
ADCQ $0, acc3
#define p256PointDoubleInit() \ #define p256PointDoubleInit() \
MOVOU (16*0)(BX), X0 \ MOVOU (16*0)(BX), X0 \
MOVOU (16*1)(BX), X1 \ MOVOU (16*1)(BX), X1 \

View File

@ -757,67 +757,9 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
ADDQ mul0, acc6 ADDQ mul0, acc6
ADCQ $0, mul1 ADCQ $0, mul1
MOVQ mul1, acc7 MOVQ mul1, acc7
// First reduction step
PEXTRQ $0, X0, acc0 PEXTRQ $0, X0, acc0
MOVQ acc0, mul0 sm2P256MulReductionInternal()
MOVQ acc0, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc0, acc1
ADCQ $0, acc2
ADCQ $0, acc3
ADCQ $0, acc0
SUBQ mul0, acc1
SBBQ mul1, acc2
SBBQ mul0, acc3
SBBQ mul1, acc0
// Second reduction step
MOVQ acc1, mul0
MOVQ acc1, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc1, acc2
ADCQ $0, acc3
ADCQ $0, acc0
ADCQ $0, acc1
SUBQ mul0, acc2
SBBQ mul1, acc3
SBBQ mul0, acc0
SBBQ mul1, acc1
// Third reduction step
MOVQ acc2, mul0
MOVQ acc2, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc2, acc3
ADCQ $0, acc0
ADCQ $0, acc1
ADCQ $0, acc2
SUBQ mul0, acc3
SBBQ mul1, acc0
SBBQ mul0, acc1
SBBQ mul1, acc2
// Last reduction step
MOVQ acc3, mul0
MOVQ acc3, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc3, acc0
ADCQ $0, acc1
ADCQ $0, acc2
ADCQ $0, acc3
SUBQ mul0, acc0
SBBQ mul1, acc1
SBBQ mul0, acc2
SBBQ mul1, acc3
MOVQ $0, mul0 MOVQ $0, mul0
// Add bits [511:256] of the result // Add bits [511:256] of the result
ADCQ acc0, acc4 ADCQ acc0, acc4
@ -918,67 +860,8 @@ internalMulBMI2:
ADDQ mul0, acc6 ADDQ mul0, acc6
ADCQ $0, acc7 ADCQ $0, acc7
// First reduction step
PEXTRQ $0, X0, acc0 PEXTRQ $0, X0, acc0
MOVQ acc0, mul0 sm2P256MulReductionInternal()
MOVQ acc0, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc0, acc1
ADCQ $0, acc2
ADCQ $0, acc3
ADCQ $0, acc0
SUBQ mul0, acc1
SBBQ mul1, acc2
SBBQ mul0, acc3
SBBQ mul1, acc0
// Second reduction step
MOVQ acc1, mul0
MOVQ acc1, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc1, acc2
ADCQ $0, acc3
ADCQ $0, acc0
ADCQ $0, acc1
SUBQ mul0, acc2
SBBQ mul1, acc3
SBBQ mul0, acc0
SBBQ mul1, acc1
// Third reduction step
MOVQ acc2, mul0
MOVQ acc2, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc2, acc3
ADCQ $0, acc0
ADCQ $0, acc1
ADCQ $0, acc2
SUBQ mul0, acc3
SBBQ mul1, acc0
SBBQ mul0, acc1
SBBQ mul1, acc2
// Last reduction step
MOVQ acc3, mul0
MOVQ acc3, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ acc3, acc0
ADCQ $0, acc1
ADCQ $0, acc2
ADCQ $0, acc3
SUBQ mul0, acc0
SBBQ mul1, acc1
SBBQ mul0, acc2
SBBQ mul1, acc3
MOVQ $0, mul0 MOVQ $0, mul0
// Add bits [511:256] of the result // Add bits [511:256] of the result
ADCQ acc0, acc4 ADCQ acc0, acc4