mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-26 20:26:19 +08:00
internal/sm2ec: mul WWMM reduction, sub first
This commit is contained in:
parent
2553456216
commit
052040fd82
@ -291,7 +291,7 @@ ordSqrLoop:
|
||||
ADCQ DX, acc2
|
||||
ADCQ $0, acc3
|
||||
ADCQ $0, acc0
|
||||
// calculate the positive part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
|
||||
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
@ -484,7 +484,7 @@ ordSqrLoopBMI2:
|
||||
ADCQ t1, acc2 // (carry4, acc2) = acc2 + t1 + carry3
|
||||
ADCQ $0, acc3 // (carry5, acc3) = acc3 + carry4
|
||||
ADCQ $0, acc0 // acc0 = t0 + carry5
|
||||
// calculate the positive part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
|
||||
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
|
||||
MOVQ t0, AX
|
||||
//MOVQ t0, DX // This is not required due to t0=DX already
|
||||
SHLQ $32, AX
|
||||
@ -759,66 +759,8 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
|
||||
ADDQ mul0, acc6
|
||||
ADCQ $0, mul1
|
||||
MOVQ mul1, acc7
|
||||
// First reduction step
|
||||
MOVQ acc0, mul0
|
||||
MOVQ acc0, mul1
|
||||
SHLQ $32, mul0
|
||||
SHRQ $32, mul1
|
||||
sm2P256MulReductionInternal()
|
||||
|
||||
ADDQ acc0, acc1
|
||||
ADCQ $0, acc2
|
||||
ADCQ $0, acc3
|
||||
ADCQ $0, acc0
|
||||
|
||||
SUBQ mul0, acc1
|
||||
SBBQ mul1, acc2
|
||||
SBBQ mul0, acc3
|
||||
SBBQ mul1, acc0
|
||||
// Second reduction step
|
||||
MOVQ acc1, mul0
|
||||
MOVQ acc1, mul1
|
||||
SHLQ $32, mul0
|
||||
SHRQ $32, mul1
|
||||
|
||||
ADDQ acc1, acc2
|
||||
ADCQ $0, acc3
|
||||
ADCQ $0, acc0
|
||||
ADCQ $0, acc1
|
||||
|
||||
SUBQ mul0, acc2
|
||||
SBBQ mul1, acc3
|
||||
SBBQ mul0, acc0
|
||||
SBBQ mul1, acc1
|
||||
// Third reduction step
|
||||
MOVQ acc2, mul0
|
||||
MOVQ acc2, mul1
|
||||
SHLQ $32, mul0
|
||||
SHRQ $32, mul1
|
||||
|
||||
ADDQ acc2, acc3
|
||||
ADCQ $0, acc0
|
||||
ADCQ $0, acc1
|
||||
ADCQ $0, acc2
|
||||
|
||||
SUBQ mul0, acc3
|
||||
SBBQ mul1, acc0
|
||||
SBBQ mul0, acc1
|
||||
SBBQ mul1, acc2
|
||||
// Last reduction step
|
||||
MOVQ acc3, mul0
|
||||
MOVQ acc3, mul1
|
||||
SHLQ $32, mul0
|
||||
SHRQ $32, mul1
|
||||
|
||||
ADDQ acc3, acc0
|
||||
ADCQ $0, acc1
|
||||
ADCQ $0, acc2
|
||||
ADCQ $0, acc3
|
||||
|
||||
SUBQ mul0, acc0
|
||||
SBBQ mul1, acc1
|
||||
SBBQ mul0, acc2
|
||||
SBBQ mul1, acc3
|
||||
MOVQ $0, BP
|
||||
// Add bits [511:256] of the result
|
||||
ADCQ acc0, acc4
|
||||
@ -918,66 +860,7 @@ internalMulBMI2:
|
||||
ADDQ mul0, acc6
|
||||
ADCQ $0, acc7
|
||||
|
||||
// First reduction step
|
||||
MOVQ acc0, mul0
|
||||
MOVQ acc0, mul1
|
||||
SHLQ $32, mul0
|
||||
SHRQ $32, mul1
|
||||
|
||||
ADDQ acc0, acc1
|
||||
ADCQ $0, acc2
|
||||
ADCQ $0, acc3
|
||||
ADCQ $0, acc0
|
||||
|
||||
SUBQ mul0, acc1
|
||||
SBBQ mul1, acc2
|
||||
SBBQ mul0, acc3
|
||||
SBBQ mul1, acc0
|
||||
// Second reduction step
|
||||
MOVQ acc1, mul0
|
||||
MOVQ acc1, mul1
|
||||
SHLQ $32, mul0
|
||||
SHRQ $32, mul1
|
||||
|
||||
ADDQ acc1, acc2
|
||||
ADCQ $0, acc3
|
||||
ADCQ $0, acc0
|
||||
ADCQ $0, acc1
|
||||
|
||||
SUBQ mul0, acc2
|
||||
SBBQ mul1, acc3
|
||||
SBBQ mul0, acc0
|
||||
SBBQ mul1, acc1
|
||||
// Third reduction step
|
||||
MOVQ acc2, mul0
|
||||
MOVQ acc2, mul1
|
||||
SHLQ $32, mul0
|
||||
SHRQ $32, mul1
|
||||
|
||||
ADDQ acc2, acc3
|
||||
ADCQ $0, acc0
|
||||
ADCQ $0, acc1
|
||||
ADCQ $0, acc2
|
||||
|
||||
SUBQ mul0, acc3
|
||||
SBBQ mul1, acc0
|
||||
SBBQ mul0, acc1
|
||||
SBBQ mul1, acc2
|
||||
// Last reduction step
|
||||
MOVQ acc3, mul0
|
||||
MOVQ acc3, mul1
|
||||
SHLQ $32, mul0
|
||||
SHRQ $32, mul1
|
||||
|
||||
ADDQ acc3, acc0
|
||||
ADCQ $0, acc1
|
||||
ADCQ $0, acc2
|
||||
ADCQ $0, acc3
|
||||
|
||||
SUBQ mul0, acc0
|
||||
SBBQ mul1, acc1
|
||||
SBBQ mul0, acc2
|
||||
SBBQ mul1, acc3
|
||||
sm2P256MulReductionInternal()
|
||||
MOVQ $0, BP
|
||||
// Add bits [511:256] of the result
|
||||
ADCQ acc0, acc4
|
||||
|
@ -207,54 +207,57 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
|
||||
LSL $32, acc0, y0
|
||||
LSR $32, acc0, y1
|
||||
|
||||
ADDS acc0, acc1, acc1
|
||||
ADCS $0, acc2, acc2
|
||||
ADCS $0, acc3, acc3
|
||||
ADC $0, acc0, acc0
|
||||
|
||||
SUBS y0, acc1
|
||||
SBCS y1, acc2
|
||||
SBCS y0, acc3
|
||||
SBC y1, acc0
|
||||
SBC y1, acc0, y0
|
||||
|
||||
ADDS acc0, acc1, acc1
|
||||
ADCS $0, acc2, acc2
|
||||
ADCS $0, acc3, acc3
|
||||
ADC $0, y0, acc0
|
||||
|
||||
// Second reduction step
|
||||
LSL $32, acc1, y0
|
||||
LSR $32, acc1, y1
|
||||
|
||||
ADDS acc1, acc2, acc2
|
||||
ADCS $0, acc3, acc3
|
||||
ADCS $0, acc0, acc0
|
||||
ADC $0, acc1, acc1
|
||||
|
||||
SUBS y0, acc2
|
||||
SBCS y1, acc3
|
||||
SBCS y0, acc0
|
||||
SBC y1, acc1
|
||||
SBC y1, acc1, y0
|
||||
|
||||
ADDS acc1, acc2, acc2
|
||||
ADCS $0, acc3, acc3
|
||||
ADCS $0, acc0, acc0
|
||||
ADC $0, y0, acc1
|
||||
|
||||
// Third reduction step
|
||||
LSL $32, acc2, y0
|
||||
LSR $32, acc2, y1
|
||||
|
||||
ADDS acc2, acc3, acc3
|
||||
ADCS $0, acc0, acc0
|
||||
ADCS $0, acc1, acc1
|
||||
ADC $0, acc2, acc2
|
||||
|
||||
SUBS y0, acc3
|
||||
SBCS y1, acc0
|
||||
SBCS y0, acc1
|
||||
SBC y1, acc2
|
||||
SBC y1, acc2, y0
|
||||
|
||||
ADDS acc2, acc3, acc3
|
||||
ADCS $0, acc0, acc0
|
||||
ADCS $0, acc1, acc1
|
||||
ADC $0, y0, acc2
|
||||
|
||||
// Last reduction step
|
||||
LSL $32, acc3, y0
|
||||
LSR $32, acc3, y1
|
||||
|
||||
ADDS acc3, acc0, acc0
|
||||
ADCS $0, acc1, acc1
|
||||
ADCS $0, acc2, acc2
|
||||
ADC $0, acc3, acc3
|
||||
|
||||
SUBS y0, acc0
|
||||
SBCS y1, acc1
|
||||
SBCS y0, acc2
|
||||
SBC y1, acc3
|
||||
SBC y1, acc3, y0
|
||||
|
||||
ADDS acc3, acc0, acc0
|
||||
ADCS $0, acc1, acc1
|
||||
ADCS $0, acc2, acc2
|
||||
ADC $0, y0, acc3
|
||||
|
||||
SUBS const0, acc0, t0
|
||||
SBCS const1, acc1, t1
|
||||
@ -967,15 +970,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
|
||||
LSL $32, acc0, t0
|
||||
LSR $32, acc0, t1
|
||||
|
||||
ADDS acc0, acc1, acc1
|
||||
ADCS $0, acc2, acc2
|
||||
ADCS $0, acc3, acc3
|
||||
ADC $0, acc0, acc0
|
||||
|
||||
SUBS t0, acc1
|
||||
SBCS t1, acc2
|
||||
SBCS t0, acc3
|
||||
SBC t1, acc0
|
||||
SBC t1, acc0, t0
|
||||
|
||||
ADDS acc0, acc1, acc1
|
||||
ADCS $0, acc2, acc2
|
||||
ADCS $0, acc3, acc3
|
||||
ADC $0, t0, acc0
|
||||
|
||||
// y[1] * x
|
||||
MUL y1, x0, t0
|
||||
@ -1003,15 +1006,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
|
||||
LSL $32, acc1, t0
|
||||
LSR $32, acc1, t1
|
||||
|
||||
ADDS acc1, acc2, acc2
|
||||
ADCS $0, acc3, acc3
|
||||
ADCS $0, acc0, acc0
|
||||
ADC $0, acc1, acc1
|
||||
|
||||
SUBS t0, acc2
|
||||
SBCS t1, acc3
|
||||
SBCS t0, acc0
|
||||
SBC t1, acc1
|
||||
SBC t1, acc1, t0
|
||||
|
||||
ADDS acc1, acc2, acc2
|
||||
ADCS $0, acc3, acc3
|
||||
ADCS $0, acc0, acc0
|
||||
ADC $0, t0, acc1
|
||||
|
||||
// y[2] * x
|
||||
MUL y2, x0, t0
|
||||
@ -1039,15 +1042,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
|
||||
LSL $32, acc2, t0
|
||||
LSR $32, acc2, t1
|
||||
|
||||
ADDS acc2, acc3, acc3
|
||||
ADCS $0, acc0, acc0
|
||||
ADCS $0, acc1, acc1
|
||||
ADC $0, acc2, acc2
|
||||
|
||||
SUBS t0, acc3
|
||||
SBCS t1, acc0
|
||||
SBCS t0, acc1
|
||||
SBC t1, acc2
|
||||
SBC t1, acc2, t0
|
||||
|
||||
ADDS acc2, acc3, acc3
|
||||
ADCS $0, acc0, acc0
|
||||
ADCS $0, acc1, acc1
|
||||
ADC $0, t0, acc2
|
||||
|
||||
// y[3] * x
|
||||
MUL y3, x0, t0
|
||||
@ -1075,15 +1078,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
|
||||
LSL $32, acc3, t0
|
||||
LSR $32, acc3, t1
|
||||
|
||||
ADDS acc3, acc0, acc0
|
||||
ADCS $0, acc1, acc1
|
||||
ADCS $0, acc2, acc2
|
||||
ADC $0, acc3, acc3
|
||||
|
||||
SUBS t0, acc0
|
||||
SBCS t1, acc1
|
||||
SBCS t0, acc2
|
||||
SBC t1, acc3
|
||||
SBC t1, acc3, t0
|
||||
|
||||
ADDS acc3, acc0, acc0
|
||||
ADCS $0, acc1, acc1
|
||||
ADCS $0, acc2, acc2
|
||||
ADC $0, t0, acc3
|
||||
|
||||
// Add bits [511:256] of the mul result
|
||||
ADDS acc4, acc0, acc0
|
||||
|
@ -207,19 +207,19 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
ADDQ acc0, acc1
|
||||
SUBQ AX, acc1
|
||||
SBBQ DX, acc2
|
||||
SBBQ AX, acc3
|
||||
MOVQ acc0, AX
|
||||
SBBQ DX, acc0
|
||||
|
||||
ADDQ AX, acc1
|
||||
ADCQ $0, acc2
|
||||
ADCQ $0, acc3
|
||||
ADCQ acc0, acc4
|
||||
ADCQ $0, acc5
|
||||
|
||||
SUBQ AX, acc1
|
||||
SBBQ DX, acc2
|
||||
SBBQ AX, acc3
|
||||
SBBQ DX, acc4
|
||||
SBBQ $0, acc5
|
||||
XORQ acc0, acc0
|
||||
|
||||
// x * y[1]
|
||||
MOVQ (8*1)(y_ptr), t0
|
||||
|
||||
@ -258,19 +258,19 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
ADDQ acc1, acc2
|
||||
SUBQ AX, acc2
|
||||
SBBQ DX, acc3
|
||||
SBBQ AX, acc4
|
||||
MOVQ acc1, AX
|
||||
SBBQ DX, acc1
|
||||
|
||||
ADDQ AX, acc2
|
||||
ADCQ $0, acc3
|
||||
ADCQ $0, acc4
|
||||
ADCQ acc1, acc5
|
||||
ADCQ $0, acc0
|
||||
|
||||
SUBQ AX, acc2
|
||||
SBBQ DX, acc3
|
||||
SBBQ AX, acc4
|
||||
SBBQ DX, acc5
|
||||
SBBQ $0, acc0
|
||||
XORQ acc1, acc1
|
||||
|
||||
// x * y[2]
|
||||
MOVQ (8*2)(y_ptr), t0
|
||||
|
||||
@ -309,17 +309,18 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
ADDQ acc2, acc3
|
||||
SUBQ AX, acc3
|
||||
SBBQ DX, acc4
|
||||
SBBQ AX, acc5
|
||||
MOVQ acc2, AX
|
||||
SBBQ DX, acc2
|
||||
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, acc4
|
||||
ADCQ $0, acc5
|
||||
ADCQ acc2, acc0
|
||||
ADCQ $0, acc1
|
||||
|
||||
SUBQ AX, acc3
|
||||
SBBQ DX, acc4
|
||||
SBBQ AX, acc5
|
||||
SBBQ DX, acc0
|
||||
SBBQ $0, acc1
|
||||
XORQ acc2, acc2
|
||||
// x * y[3]
|
||||
MOVQ (8*3)(y_ptr), t0
|
||||
@ -359,17 +360,18 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
ADDQ acc3, acc4
|
||||
SUBQ AX, acc4
|
||||
SBBQ DX, acc5
|
||||
SBBQ AX, acc0
|
||||
MOVQ acc3, AX
|
||||
SBBQ DX, acc3
|
||||
|
||||
ADDQ AX, acc4
|
||||
ADCQ $0, acc5
|
||||
ADCQ $0, acc0
|
||||
ADCQ acc3, acc1
|
||||
ADCQ $0, acc2
|
||||
|
||||
SUBQ AX, acc4
|
||||
SBBQ DX, acc5
|
||||
SBBQ AX, acc0
|
||||
SBBQ DX, acc1
|
||||
SBBQ $0, acc2
|
||||
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
|
||||
RET
|
||||
|
||||
@ -395,19 +397,19 @@ mulBMI2:
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
ADDQ acc0, acc1
|
||||
SUBQ AX, acc1
|
||||
SBBQ DX, acc2
|
||||
SBBQ AX, acc3
|
||||
MOVQ acc0, AX
|
||||
SBBQ DX, acc0
|
||||
|
||||
ADDQ AX, acc1
|
||||
ADCQ $0, acc2
|
||||
ADCQ $0, acc3
|
||||
ADCQ acc0, acc4
|
||||
ADCQ $0, acc5
|
||||
|
||||
SUBQ AX, acc1
|
||||
SBBQ DX, acc2
|
||||
SBBQ AX, acc3
|
||||
SBBQ DX, acc4
|
||||
SBBQ $0, acc5
|
||||
XORQ acc0, acc0
|
||||
|
||||
// x * y[1]
|
||||
MOVQ (8*1)(y_ptr), DX
|
||||
MULXQ (8*0)(x_ptr), AX, BX
|
||||
@ -436,19 +438,19 @@ mulBMI2:
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
ADDQ acc1, acc2
|
||||
SUBQ AX, acc2
|
||||
SBBQ DX, acc3
|
||||
SBBQ AX, acc4
|
||||
MOVQ acc1, AX
|
||||
SBBQ DX, acc1
|
||||
|
||||
ADDQ AX, acc2
|
||||
ADCQ $0, acc3
|
||||
ADCQ $0, acc4
|
||||
ADCQ acc1, acc5
|
||||
ADCQ $0, acc0
|
||||
|
||||
SUBQ AX, acc2
|
||||
SBBQ DX, acc3
|
||||
SBBQ AX, acc4
|
||||
SBBQ DX, acc5
|
||||
SBBQ $0, acc0
|
||||
XORQ acc1, acc1
|
||||
|
||||
// x * y[2]
|
||||
MOVQ (8*2)(y_ptr), DX
|
||||
|
||||
@ -477,17 +479,18 @@ mulBMI2:
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
ADDQ acc2, acc3
|
||||
SUBQ AX, acc3
|
||||
SBBQ DX, acc4
|
||||
SBBQ AX, acc5
|
||||
MOVQ acc2, AX
|
||||
SBBQ DX, acc2
|
||||
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, acc4
|
||||
ADCQ $0, acc5
|
||||
ADCQ acc2, acc0
|
||||
ADCQ $0, acc1
|
||||
|
||||
SUBQ AX, acc3
|
||||
SBBQ DX, acc4
|
||||
SBBQ AX, acc5
|
||||
SBBQ DX, acc0
|
||||
SBBQ $0, acc1
|
||||
XORQ acc2, acc2
|
||||
// x * y[3]
|
||||
MOVQ (8*3)(y_ptr), DX
|
||||
@ -517,17 +520,18 @@ mulBMI2:
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
ADDQ acc3, acc4
|
||||
SUBQ AX, acc4
|
||||
SBBQ DX, acc5
|
||||
SBBQ AX, acc0
|
||||
MOVQ acc3, AX
|
||||
SBBQ DX, acc3
|
||||
|
||||
ADDQ AX, acc4
|
||||
ADCQ $0, acc5
|
||||
ADCQ $0, acc0
|
||||
ADCQ acc3, acc1
|
||||
ADCQ $0, acc2
|
||||
|
||||
SUBQ AX, acc4
|
||||
SBBQ DX, acc5
|
||||
SBBQ AX, acc0
|
||||
SBBQ DX, acc1
|
||||
SBBQ $0, acc2
|
||||
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
|
||||
RET
|
||||
|
||||
@ -550,32 +554,35 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
ADDQ acc0, acc1
|
||||
SUBQ AX, acc1
|
||||
SBBQ DX, acc2
|
||||
SBBQ AX, acc3
|
||||
MOVQ acc0, AX
|
||||
SBBQ DX, acc0
|
||||
|
||||
ADDQ AX, acc1
|
||||
ADCQ $0, acc2
|
||||
ADCQ $0, acc3
|
||||
ADCQ acc0, acc4
|
||||
|
||||
SUBQ AX, acc1
|
||||
SBBQ DX, acc2
|
||||
SBBQ AX, acc3
|
||||
SBBQ DX, acc4
|
||||
XORQ acc5, acc5
|
||||
|
||||
// Second stage
|
||||
MOVQ acc1, AX
|
||||
MOVQ acc1, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
ADDQ acc1, acc2
|
||||
SUBQ AX, acc2
|
||||
SBBQ DX, acc3
|
||||
SBBQ AX, acc4
|
||||
MOVQ acc1, AX
|
||||
SBBQ DX, acc5
|
||||
|
||||
ADDQ AX, acc2
|
||||
ADCQ $0, acc3
|
||||
ADCQ $0, acc4
|
||||
ADCQ acc1, acc5
|
||||
|
||||
SUBQ AX, acc2
|
||||
SBBQ DX, acc3
|
||||
SBBQ AX, acc4
|
||||
SBBQ DX, acc5
|
||||
XORQ acc0, acc0
|
||||
// Third stage
|
||||
MOVQ acc2, AX
|
||||
@ -583,15 +590,17 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
ADDQ acc2, acc3
|
||||
SUBQ AX, acc3
|
||||
SBBQ DX, acc4
|
||||
SBBQ AX, acc5
|
||||
MOVQ acc2, AX
|
||||
SBBQ DX, acc2
|
||||
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, acc4
|
||||
ADCQ $0, acc5
|
||||
ADCQ acc2, acc0
|
||||
|
||||
SUBQ AX, acc3
|
||||
SBBQ DX, acc4
|
||||
SBBQ AX, acc5
|
||||
SBBQ DX, acc0
|
||||
XORQ acc1, acc1
|
||||
// Last stage
|
||||
MOVQ acc3, AX
|
||||
@ -599,15 +608,16 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
ADDQ acc3, acc4
|
||||
ADCQ $0, acc5
|
||||
ADCQ $0, acc0
|
||||
ADCQ acc3, acc1
|
||||
|
||||
SUBQ AX, acc4
|
||||
SBBQ DX, acc5
|
||||
SBBQ AX, acc0
|
||||
SBBQ DX, acc1
|
||||
MOVQ acc3, AX
|
||||
SBBQ DX, acc3
|
||||
|
||||
ADDQ AX, acc4
|
||||
ADCQ $0, acc5
|
||||
ADCQ $0, acc0
|
||||
ADCQ acc3, acc1
|
||||
|
||||
MOVQ acc4, x_ptr
|
||||
MOVQ acc5, acc3
|
||||
|
@ -237,6 +237,72 @@ GLOBL p256one<>(SB), 8, $32
|
||||
CMOVQCS t2, acc6 \
|
||||
CMOVQCS t3, acc7
|
||||
|
||||
#define sm2P256MulReductionInternal() \
|
||||
\// First reduction step
|
||||
MOVQ acc0, mul0 \
|
||||
MOVQ acc0, mul1 \
|
||||
SHLQ $32, mul0 \
|
||||
SHRQ $32, mul1 \
|
||||
\
|
||||
SUBQ mul0, acc1 \
|
||||
SBBQ mul1, acc2 \
|
||||
SBBQ mul0, acc3 \
|
||||
MOVQ acc0, mul0 \
|
||||
SBBQ mul1, acc0 \
|
||||
\
|
||||
ADDQ mul0, acc1 \
|
||||
ADCQ $0, acc2 \
|
||||
ADCQ $0, acc3 \
|
||||
ADCQ $0, acc0 \
|
||||
\// Second reduction step
|
||||
MOVQ acc1, mul0 \
|
||||
MOVQ acc1, mul1 \
|
||||
SHLQ $32, mul0 \
|
||||
SHRQ $32, mul1 \
|
||||
\
|
||||
SUBQ mul0, acc2 \
|
||||
SBBQ mul1, acc3 \
|
||||
SBBQ mul0, acc0 \
|
||||
MOVQ acc1, mul0 \
|
||||
SBBQ mul1, acc1 \
|
||||
\
|
||||
ADDQ mul0, acc2 \
|
||||
ADCQ $0, acc3 \
|
||||
ADCQ $0, acc0 \
|
||||
ADCQ $0, acc1 \
|
||||
\// Third reduction step
|
||||
MOVQ acc2, mul0 \
|
||||
MOVQ acc2, mul1 \
|
||||
SHLQ $32, mul0 \
|
||||
SHRQ $32, mul1 \
|
||||
\
|
||||
SUBQ mul0, acc3 \
|
||||
SBBQ mul1, acc0 \
|
||||
SBBQ mul0, acc1 \
|
||||
MOVQ acc2, mul0 \
|
||||
SBBQ mul1, acc2 \
|
||||
\
|
||||
ADDQ mul0, acc3 \
|
||||
ADCQ $0, acc0 \
|
||||
ADCQ $0, acc1 \
|
||||
ADCQ $0, acc2 \
|
||||
\// Last reduction step
|
||||
MOVQ acc3, mul0 \
|
||||
MOVQ acc3, mul1 \
|
||||
SHLQ $32, mul0 \
|
||||
SHRQ $32, mul1 \
|
||||
\
|
||||
SUBQ mul0, acc0 \
|
||||
SBBQ mul1, acc1 \
|
||||
SBBQ mul0, acc2 \
|
||||
MOVQ acc3, mul0 \
|
||||
SBBQ mul1, acc3 \
|
||||
\
|
||||
ADDQ mul0, acc0 \
|
||||
ADCQ $0, acc1 \
|
||||
ADCQ $0, acc2 \
|
||||
ADCQ $0, acc3
|
||||
|
||||
#define p256PointDoubleInit() \
|
||||
MOVOU (16*0)(BX), X0 \
|
||||
MOVOU (16*1)(BX), X1 \
|
||||
|
@ -757,67 +757,9 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
|
||||
ADDQ mul0, acc6
|
||||
ADCQ $0, mul1
|
||||
MOVQ mul1, acc7
|
||||
// First reduction step
|
||||
|
||||
PEXTRQ $0, X0, acc0
|
||||
MOVQ acc0, mul0
|
||||
MOVQ acc0, mul1
|
||||
SHLQ $32, mul0
|
||||
SHRQ $32, mul1
|
||||
|
||||
ADDQ acc0, acc1
|
||||
ADCQ $0, acc2
|
||||
ADCQ $0, acc3
|
||||
ADCQ $0, acc0
|
||||
|
||||
SUBQ mul0, acc1
|
||||
SBBQ mul1, acc2
|
||||
SBBQ mul0, acc3
|
||||
SBBQ mul1, acc0
|
||||
// Second reduction step
|
||||
MOVQ acc1, mul0
|
||||
MOVQ acc1, mul1
|
||||
SHLQ $32, mul0
|
||||
SHRQ $32, mul1
|
||||
|
||||
ADDQ acc1, acc2
|
||||
ADCQ $0, acc3
|
||||
ADCQ $0, acc0
|
||||
ADCQ $0, acc1
|
||||
|
||||
SUBQ mul0, acc2
|
||||
SBBQ mul1, acc3
|
||||
SBBQ mul0, acc0
|
||||
SBBQ mul1, acc1
|
||||
// Third reduction step
|
||||
MOVQ acc2, mul0
|
||||
MOVQ acc2, mul1
|
||||
SHLQ $32, mul0
|
||||
SHRQ $32, mul1
|
||||
|
||||
ADDQ acc2, acc3
|
||||
ADCQ $0, acc0
|
||||
ADCQ $0, acc1
|
||||
ADCQ $0, acc2
|
||||
|
||||
SUBQ mul0, acc3
|
||||
SBBQ mul1, acc0
|
||||
SBBQ mul0, acc1
|
||||
SBBQ mul1, acc2
|
||||
// Last reduction step
|
||||
MOVQ acc3, mul0
|
||||
MOVQ acc3, mul1
|
||||
SHLQ $32, mul0
|
||||
SHRQ $32, mul1
|
||||
|
||||
ADDQ acc3, acc0
|
||||
ADCQ $0, acc1
|
||||
ADCQ $0, acc2
|
||||
ADCQ $0, acc3
|
||||
|
||||
SUBQ mul0, acc0
|
||||
SBBQ mul1, acc1
|
||||
SBBQ mul0, acc2
|
||||
SBBQ mul1, acc3
|
||||
sm2P256MulReductionInternal()
|
||||
MOVQ $0, mul0
|
||||
// Add bits [511:256] of the result
|
||||
ADCQ acc0, acc4
|
||||
@ -918,67 +860,8 @@ internalMulBMI2:
|
||||
ADDQ mul0, acc6
|
||||
ADCQ $0, acc7
|
||||
|
||||
// First reduction step
|
||||
PEXTRQ $0, X0, acc0
|
||||
MOVQ acc0, mul0
|
||||
MOVQ acc0, mul1
|
||||
SHLQ $32, mul0
|
||||
SHRQ $32, mul1
|
||||
|
||||
ADDQ acc0, acc1
|
||||
ADCQ $0, acc2
|
||||
ADCQ $0, acc3
|
||||
ADCQ $0, acc0
|
||||
|
||||
SUBQ mul0, acc1
|
||||
SBBQ mul1, acc2
|
||||
SBBQ mul0, acc3
|
||||
SBBQ mul1, acc0
|
||||
// Second reduction step
|
||||
MOVQ acc1, mul0
|
||||
MOVQ acc1, mul1
|
||||
SHLQ $32, mul0
|
||||
SHRQ $32, mul1
|
||||
|
||||
ADDQ acc1, acc2
|
||||
ADCQ $0, acc3
|
||||
ADCQ $0, acc0
|
||||
ADCQ $0, acc1
|
||||
|
||||
SUBQ mul0, acc2
|
||||
SBBQ mul1, acc3
|
||||
SBBQ mul0, acc0
|
||||
SBBQ mul1, acc1
|
||||
// Third reduction step
|
||||
MOVQ acc2, mul0
|
||||
MOVQ acc2, mul1
|
||||
SHLQ $32, mul0
|
||||
SHRQ $32, mul1
|
||||
|
||||
ADDQ acc2, acc3
|
||||
ADCQ $0, acc0
|
||||
ADCQ $0, acc1
|
||||
ADCQ $0, acc2
|
||||
|
||||
SUBQ mul0, acc3
|
||||
SBBQ mul1, acc0
|
||||
SBBQ mul0, acc1
|
||||
SBBQ mul1, acc2
|
||||
// Last reduction step
|
||||
MOVQ acc3, mul0
|
||||
MOVQ acc3, mul1
|
||||
SHLQ $32, mul0
|
||||
SHRQ $32, mul1
|
||||
|
||||
ADDQ acc3, acc0
|
||||
ADCQ $0, acc1
|
||||
ADCQ $0, acc2
|
||||
ADCQ $0, acc3
|
||||
|
||||
SUBQ mul0, acc0
|
||||
SBBQ mul1, acc1
|
||||
SBBQ mul0, acc2
|
||||
SBBQ mul1, acc3
|
||||
sm2P256MulReductionInternal()
|
||||
MOVQ $0, mul0
|
||||
// Add bits [511:256] of the result
|
||||
ADCQ acc0, acc4
|
||||
|
Loading…
x
Reference in New Issue
Block a user