mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-27 20:56:18 +08:00
internal/sm2ec: mul WWMM reduction, sub first
This commit is contained in:
parent
2553456216
commit
052040fd82
@ -291,7 +291,7 @@ ordSqrLoop:
|
|||||||
ADCQ DX, acc2
|
ADCQ DX, acc2
|
||||||
ADCQ $0, acc3
|
ADCQ $0, acc3
|
||||||
ADCQ $0, acc0
|
ADCQ $0, acc0
|
||||||
// calculate the positive part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
|
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
|
||||||
MOVQ t0, AX
|
MOVQ t0, AX
|
||||||
MOVQ t0, DX
|
MOVQ t0, DX
|
||||||
SHLQ $32, AX
|
SHLQ $32, AX
|
||||||
@ -484,7 +484,7 @@ ordSqrLoopBMI2:
|
|||||||
ADCQ t1, acc2 // (carry4, acc2) = acc2 + t1 + carry3
|
ADCQ t1, acc2 // (carry4, acc2) = acc2 + t1 + carry3
|
||||||
ADCQ $0, acc3 // (carry5, acc3) = acc3 + carry4
|
ADCQ $0, acc3 // (carry5, acc3) = acc3 + carry4
|
||||||
ADCQ $0, acc0 // acc0 = t0 + carry5
|
ADCQ $0, acc0 // acc0 = t0 + carry5
|
||||||
// calculate the positive part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
|
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
|
||||||
MOVQ t0, AX
|
MOVQ t0, AX
|
||||||
//MOVQ t0, DX // This is not required due to t0=DX already
|
//MOVQ t0, DX // This is not required due to t0=DX already
|
||||||
SHLQ $32, AX
|
SHLQ $32, AX
|
||||||
@ -759,66 +759,8 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
|
|||||||
ADDQ mul0, acc6
|
ADDQ mul0, acc6
|
||||||
ADCQ $0, mul1
|
ADCQ $0, mul1
|
||||||
MOVQ mul1, acc7
|
MOVQ mul1, acc7
|
||||||
// First reduction step
|
sm2P256MulReductionInternal()
|
||||||
MOVQ acc0, mul0
|
|
||||||
MOVQ acc0, mul1
|
|
||||||
SHLQ $32, mul0
|
|
||||||
SHRQ $32, mul1
|
|
||||||
|
|
||||||
ADDQ acc0, acc1
|
|
||||||
ADCQ $0, acc2
|
|
||||||
ADCQ $0, acc3
|
|
||||||
ADCQ $0, acc0
|
|
||||||
|
|
||||||
SUBQ mul0, acc1
|
|
||||||
SBBQ mul1, acc2
|
|
||||||
SBBQ mul0, acc3
|
|
||||||
SBBQ mul1, acc0
|
|
||||||
// Second reduction step
|
|
||||||
MOVQ acc1, mul0
|
|
||||||
MOVQ acc1, mul1
|
|
||||||
SHLQ $32, mul0
|
|
||||||
SHRQ $32, mul1
|
|
||||||
|
|
||||||
ADDQ acc1, acc2
|
|
||||||
ADCQ $0, acc3
|
|
||||||
ADCQ $0, acc0
|
|
||||||
ADCQ $0, acc1
|
|
||||||
|
|
||||||
SUBQ mul0, acc2
|
|
||||||
SBBQ mul1, acc3
|
|
||||||
SBBQ mul0, acc0
|
|
||||||
SBBQ mul1, acc1
|
|
||||||
// Third reduction step
|
|
||||||
MOVQ acc2, mul0
|
|
||||||
MOVQ acc2, mul1
|
|
||||||
SHLQ $32, mul0
|
|
||||||
SHRQ $32, mul1
|
|
||||||
|
|
||||||
ADDQ acc2, acc3
|
|
||||||
ADCQ $0, acc0
|
|
||||||
ADCQ $0, acc1
|
|
||||||
ADCQ $0, acc2
|
|
||||||
|
|
||||||
SUBQ mul0, acc3
|
|
||||||
SBBQ mul1, acc0
|
|
||||||
SBBQ mul0, acc1
|
|
||||||
SBBQ mul1, acc2
|
|
||||||
// Last reduction step
|
|
||||||
MOVQ acc3, mul0
|
|
||||||
MOVQ acc3, mul1
|
|
||||||
SHLQ $32, mul0
|
|
||||||
SHRQ $32, mul1
|
|
||||||
|
|
||||||
ADDQ acc3, acc0
|
|
||||||
ADCQ $0, acc1
|
|
||||||
ADCQ $0, acc2
|
|
||||||
ADCQ $0, acc3
|
|
||||||
|
|
||||||
SUBQ mul0, acc0
|
|
||||||
SBBQ mul1, acc1
|
|
||||||
SBBQ mul0, acc2
|
|
||||||
SBBQ mul1, acc3
|
|
||||||
MOVQ $0, BP
|
MOVQ $0, BP
|
||||||
// Add bits [511:256] of the result
|
// Add bits [511:256] of the result
|
||||||
ADCQ acc0, acc4
|
ADCQ acc0, acc4
|
||||||
@ -918,66 +860,7 @@ internalMulBMI2:
|
|||||||
ADDQ mul0, acc6
|
ADDQ mul0, acc6
|
||||||
ADCQ $0, acc7
|
ADCQ $0, acc7
|
||||||
|
|
||||||
// First reduction step
|
sm2P256MulReductionInternal()
|
||||||
MOVQ acc0, mul0
|
|
||||||
MOVQ acc0, mul1
|
|
||||||
SHLQ $32, mul0
|
|
||||||
SHRQ $32, mul1
|
|
||||||
|
|
||||||
ADDQ acc0, acc1
|
|
||||||
ADCQ $0, acc2
|
|
||||||
ADCQ $0, acc3
|
|
||||||
ADCQ $0, acc0
|
|
||||||
|
|
||||||
SUBQ mul0, acc1
|
|
||||||
SBBQ mul1, acc2
|
|
||||||
SBBQ mul0, acc3
|
|
||||||
SBBQ mul1, acc0
|
|
||||||
// Second reduction step
|
|
||||||
MOVQ acc1, mul0
|
|
||||||
MOVQ acc1, mul1
|
|
||||||
SHLQ $32, mul0
|
|
||||||
SHRQ $32, mul1
|
|
||||||
|
|
||||||
ADDQ acc1, acc2
|
|
||||||
ADCQ $0, acc3
|
|
||||||
ADCQ $0, acc0
|
|
||||||
ADCQ $0, acc1
|
|
||||||
|
|
||||||
SUBQ mul0, acc2
|
|
||||||
SBBQ mul1, acc3
|
|
||||||
SBBQ mul0, acc0
|
|
||||||
SBBQ mul1, acc1
|
|
||||||
// Third reduction step
|
|
||||||
MOVQ acc2, mul0
|
|
||||||
MOVQ acc2, mul1
|
|
||||||
SHLQ $32, mul0
|
|
||||||
SHRQ $32, mul1
|
|
||||||
|
|
||||||
ADDQ acc2, acc3
|
|
||||||
ADCQ $0, acc0
|
|
||||||
ADCQ $0, acc1
|
|
||||||
ADCQ $0, acc2
|
|
||||||
|
|
||||||
SUBQ mul0, acc3
|
|
||||||
SBBQ mul1, acc0
|
|
||||||
SBBQ mul0, acc1
|
|
||||||
SBBQ mul1, acc2
|
|
||||||
// Last reduction step
|
|
||||||
MOVQ acc3, mul0
|
|
||||||
MOVQ acc3, mul1
|
|
||||||
SHLQ $32, mul0
|
|
||||||
SHRQ $32, mul1
|
|
||||||
|
|
||||||
ADDQ acc3, acc0
|
|
||||||
ADCQ $0, acc1
|
|
||||||
ADCQ $0, acc2
|
|
||||||
ADCQ $0, acc3
|
|
||||||
|
|
||||||
SUBQ mul0, acc0
|
|
||||||
SBBQ mul1, acc1
|
|
||||||
SBBQ mul0, acc2
|
|
||||||
SBBQ mul1, acc3
|
|
||||||
MOVQ $0, BP
|
MOVQ $0, BP
|
||||||
// Add bits [511:256] of the result
|
// Add bits [511:256] of the result
|
||||||
ADCQ acc0, acc4
|
ADCQ acc0, acc4
|
||||||
|
@ -207,54 +207,57 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
|
|||||||
LSL $32, acc0, y0
|
LSL $32, acc0, y0
|
||||||
LSR $32, acc0, y1
|
LSR $32, acc0, y1
|
||||||
|
|
||||||
ADDS acc0, acc1, acc1
|
|
||||||
ADCS $0, acc2, acc2
|
|
||||||
ADCS $0, acc3, acc3
|
|
||||||
ADC $0, acc0, acc0
|
|
||||||
|
|
||||||
SUBS y0, acc1
|
SUBS y0, acc1
|
||||||
SBCS y1, acc2
|
SBCS y1, acc2
|
||||||
SBCS y0, acc3
|
SBCS y0, acc3
|
||||||
SBC y1, acc0
|
SBC y1, acc0, y0
|
||||||
|
|
||||||
|
ADDS acc0, acc1, acc1
|
||||||
|
ADCS $0, acc2, acc2
|
||||||
|
ADCS $0, acc3, acc3
|
||||||
|
ADC $0, y0, acc0
|
||||||
|
|
||||||
// Second reduction step
|
// Second reduction step
|
||||||
LSL $32, acc1, y0
|
LSL $32, acc1, y0
|
||||||
LSR $32, acc1, y1
|
LSR $32, acc1, y1
|
||||||
|
|
||||||
ADDS acc1, acc2, acc2
|
|
||||||
ADCS $0, acc3, acc3
|
|
||||||
ADCS $0, acc0, acc0
|
|
||||||
ADC $0, acc1, acc1
|
|
||||||
|
|
||||||
SUBS y0, acc2
|
SUBS y0, acc2
|
||||||
SBCS y1, acc3
|
SBCS y1, acc3
|
||||||
SBCS y0, acc0
|
SBCS y0, acc0
|
||||||
SBC y1, acc1
|
SBC y1, acc1, y0
|
||||||
|
|
||||||
|
ADDS acc1, acc2, acc2
|
||||||
|
ADCS $0, acc3, acc3
|
||||||
|
ADCS $0, acc0, acc0
|
||||||
|
ADC $0, y0, acc1
|
||||||
|
|
||||||
// Third reduction step
|
// Third reduction step
|
||||||
LSL $32, acc2, y0
|
LSL $32, acc2, y0
|
||||||
LSR $32, acc2, y1
|
LSR $32, acc2, y1
|
||||||
|
|
||||||
ADDS acc2, acc3, acc3
|
|
||||||
ADCS $0, acc0, acc0
|
|
||||||
ADCS $0, acc1, acc1
|
|
||||||
ADC $0, acc2, acc2
|
|
||||||
|
|
||||||
SUBS y0, acc3
|
SUBS y0, acc3
|
||||||
SBCS y1, acc0
|
SBCS y1, acc0
|
||||||
SBCS y0, acc1
|
SBCS y0, acc1
|
||||||
SBC y1, acc2
|
SBC y1, acc2, y0
|
||||||
|
|
||||||
|
ADDS acc2, acc3, acc3
|
||||||
|
ADCS $0, acc0, acc0
|
||||||
|
ADCS $0, acc1, acc1
|
||||||
|
ADC $0, y0, acc2
|
||||||
|
|
||||||
// Last reduction step
|
// Last reduction step
|
||||||
LSL $32, acc3, y0
|
LSL $32, acc3, y0
|
||||||
LSR $32, acc3, y1
|
LSR $32, acc3, y1
|
||||||
|
|
||||||
ADDS acc3, acc0, acc0
|
|
||||||
ADCS $0, acc1, acc1
|
|
||||||
ADCS $0, acc2, acc2
|
|
||||||
ADC $0, acc3, acc3
|
|
||||||
|
|
||||||
SUBS y0, acc0
|
SUBS y0, acc0
|
||||||
SBCS y1, acc1
|
SBCS y1, acc1
|
||||||
SBCS y0, acc2
|
SBCS y0, acc2
|
||||||
SBC y1, acc3
|
SBC y1, acc3, y0
|
||||||
|
|
||||||
|
ADDS acc3, acc0, acc0
|
||||||
|
ADCS $0, acc1, acc1
|
||||||
|
ADCS $0, acc2, acc2
|
||||||
|
ADC $0, y0, acc3
|
||||||
|
|
||||||
SUBS const0, acc0, t0
|
SUBS const0, acc0, t0
|
||||||
SBCS const1, acc1, t1
|
SBCS const1, acc1, t1
|
||||||
@ -967,15 +970,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
|
|||||||
LSL $32, acc0, t0
|
LSL $32, acc0, t0
|
||||||
LSR $32, acc0, t1
|
LSR $32, acc0, t1
|
||||||
|
|
||||||
ADDS acc0, acc1, acc1
|
|
||||||
ADCS $0, acc2, acc2
|
|
||||||
ADCS $0, acc3, acc3
|
|
||||||
ADC $0, acc0, acc0
|
|
||||||
|
|
||||||
SUBS t0, acc1
|
SUBS t0, acc1
|
||||||
SBCS t1, acc2
|
SBCS t1, acc2
|
||||||
SBCS t0, acc3
|
SBCS t0, acc3
|
||||||
SBC t1, acc0
|
SBC t1, acc0, t0
|
||||||
|
|
||||||
|
ADDS acc0, acc1, acc1
|
||||||
|
ADCS $0, acc2, acc2
|
||||||
|
ADCS $0, acc3, acc3
|
||||||
|
ADC $0, t0, acc0
|
||||||
|
|
||||||
// y[1] * x
|
// y[1] * x
|
||||||
MUL y1, x0, t0
|
MUL y1, x0, t0
|
||||||
@ -1003,15 +1006,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
|
|||||||
LSL $32, acc1, t0
|
LSL $32, acc1, t0
|
||||||
LSR $32, acc1, t1
|
LSR $32, acc1, t1
|
||||||
|
|
||||||
ADDS acc1, acc2, acc2
|
|
||||||
ADCS $0, acc3, acc3
|
|
||||||
ADCS $0, acc0, acc0
|
|
||||||
ADC $0, acc1, acc1
|
|
||||||
|
|
||||||
SUBS t0, acc2
|
SUBS t0, acc2
|
||||||
SBCS t1, acc3
|
SBCS t1, acc3
|
||||||
SBCS t0, acc0
|
SBCS t0, acc0
|
||||||
SBC t1, acc1
|
SBC t1, acc1, t0
|
||||||
|
|
||||||
|
ADDS acc1, acc2, acc2
|
||||||
|
ADCS $0, acc3, acc3
|
||||||
|
ADCS $0, acc0, acc0
|
||||||
|
ADC $0, t0, acc1
|
||||||
|
|
||||||
// y[2] * x
|
// y[2] * x
|
||||||
MUL y2, x0, t0
|
MUL y2, x0, t0
|
||||||
@ -1039,15 +1042,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
|
|||||||
LSL $32, acc2, t0
|
LSL $32, acc2, t0
|
||||||
LSR $32, acc2, t1
|
LSR $32, acc2, t1
|
||||||
|
|
||||||
ADDS acc2, acc3, acc3
|
|
||||||
ADCS $0, acc0, acc0
|
|
||||||
ADCS $0, acc1, acc1
|
|
||||||
ADC $0, acc2, acc2
|
|
||||||
|
|
||||||
SUBS t0, acc3
|
SUBS t0, acc3
|
||||||
SBCS t1, acc0
|
SBCS t1, acc0
|
||||||
SBCS t0, acc1
|
SBCS t0, acc1
|
||||||
SBC t1, acc2
|
SBC t1, acc2, t0
|
||||||
|
|
||||||
|
ADDS acc2, acc3, acc3
|
||||||
|
ADCS $0, acc0, acc0
|
||||||
|
ADCS $0, acc1, acc1
|
||||||
|
ADC $0, t0, acc2
|
||||||
|
|
||||||
// y[3] * x
|
// y[3] * x
|
||||||
MUL y3, x0, t0
|
MUL y3, x0, t0
|
||||||
@ -1075,15 +1078,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
|
|||||||
LSL $32, acc3, t0
|
LSL $32, acc3, t0
|
||||||
LSR $32, acc3, t1
|
LSR $32, acc3, t1
|
||||||
|
|
||||||
ADDS acc3, acc0, acc0
|
|
||||||
ADCS $0, acc1, acc1
|
|
||||||
ADCS $0, acc2, acc2
|
|
||||||
ADC $0, acc3, acc3
|
|
||||||
|
|
||||||
SUBS t0, acc0
|
SUBS t0, acc0
|
||||||
SBCS t1, acc1
|
SBCS t1, acc1
|
||||||
SBCS t0, acc2
|
SBCS t0, acc2
|
||||||
SBC t1, acc3
|
SBC t1, acc3, t0
|
||||||
|
|
||||||
|
ADDS acc3, acc0, acc0
|
||||||
|
ADCS $0, acc1, acc1
|
||||||
|
ADCS $0, acc2, acc2
|
||||||
|
ADC $0, t0, acc3
|
||||||
|
|
||||||
// Add bits [511:256] of the mul result
|
// Add bits [511:256] of the mul result
|
||||||
ADDS acc4, acc0, acc0
|
ADDS acc4, acc0, acc0
|
||||||
|
@ -207,19 +207,19 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
|
|||||||
SHLQ $32, AX
|
SHLQ $32, AX
|
||||||
SHRQ $32, DX
|
SHRQ $32, DX
|
||||||
|
|
||||||
ADDQ acc0, acc1
|
SUBQ AX, acc1
|
||||||
|
SBBQ DX, acc2
|
||||||
|
SBBQ AX, acc3
|
||||||
|
MOVQ acc0, AX
|
||||||
|
SBBQ DX, acc0
|
||||||
|
|
||||||
|
ADDQ AX, acc1
|
||||||
ADCQ $0, acc2
|
ADCQ $0, acc2
|
||||||
ADCQ $0, acc3
|
ADCQ $0, acc3
|
||||||
ADCQ acc0, acc4
|
ADCQ acc0, acc4
|
||||||
ADCQ $0, acc5
|
ADCQ $0, acc5
|
||||||
|
|
||||||
SUBQ AX, acc1
|
|
||||||
SBBQ DX, acc2
|
|
||||||
SBBQ AX, acc3
|
|
||||||
SBBQ DX, acc4
|
|
||||||
SBBQ $0, acc5
|
|
||||||
XORQ acc0, acc0
|
XORQ acc0, acc0
|
||||||
|
|
||||||
// x * y[1]
|
// x * y[1]
|
||||||
MOVQ (8*1)(y_ptr), t0
|
MOVQ (8*1)(y_ptr), t0
|
||||||
|
|
||||||
@ -258,19 +258,19 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
|
|||||||
SHLQ $32, AX
|
SHLQ $32, AX
|
||||||
SHRQ $32, DX
|
SHRQ $32, DX
|
||||||
|
|
||||||
ADDQ acc1, acc2
|
SUBQ AX, acc2
|
||||||
|
SBBQ DX, acc3
|
||||||
|
SBBQ AX, acc4
|
||||||
|
MOVQ acc1, AX
|
||||||
|
SBBQ DX, acc1
|
||||||
|
|
||||||
|
ADDQ AX, acc2
|
||||||
ADCQ $0, acc3
|
ADCQ $0, acc3
|
||||||
ADCQ $0, acc4
|
ADCQ $0, acc4
|
||||||
ADCQ acc1, acc5
|
ADCQ acc1, acc5
|
||||||
ADCQ $0, acc0
|
ADCQ $0, acc0
|
||||||
|
|
||||||
SUBQ AX, acc2
|
|
||||||
SBBQ DX, acc3
|
|
||||||
SBBQ AX, acc4
|
|
||||||
SBBQ DX, acc5
|
|
||||||
SBBQ $0, acc0
|
|
||||||
XORQ acc1, acc1
|
XORQ acc1, acc1
|
||||||
|
|
||||||
// x * y[2]
|
// x * y[2]
|
||||||
MOVQ (8*2)(y_ptr), t0
|
MOVQ (8*2)(y_ptr), t0
|
||||||
|
|
||||||
@ -309,17 +309,18 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
|
|||||||
SHLQ $32, AX
|
SHLQ $32, AX
|
||||||
SHRQ $32, DX
|
SHRQ $32, DX
|
||||||
|
|
||||||
ADDQ acc2, acc3
|
SUBQ AX, acc3
|
||||||
|
SBBQ DX, acc4
|
||||||
|
SBBQ AX, acc5
|
||||||
|
MOVQ acc2, AX
|
||||||
|
SBBQ DX, acc2
|
||||||
|
|
||||||
|
ADDQ AX, acc3
|
||||||
ADCQ $0, acc4
|
ADCQ $0, acc4
|
||||||
ADCQ $0, acc5
|
ADCQ $0, acc5
|
||||||
ADCQ acc2, acc0
|
ADCQ acc2, acc0
|
||||||
ADCQ $0, acc1
|
ADCQ $0, acc1
|
||||||
|
|
||||||
SUBQ AX, acc3
|
|
||||||
SBBQ DX, acc4
|
|
||||||
SBBQ AX, acc5
|
|
||||||
SBBQ DX, acc0
|
|
||||||
SBBQ $0, acc1
|
|
||||||
XORQ acc2, acc2
|
XORQ acc2, acc2
|
||||||
// x * y[3]
|
// x * y[3]
|
||||||
MOVQ (8*3)(y_ptr), t0
|
MOVQ (8*3)(y_ptr), t0
|
||||||
@ -359,17 +360,18 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
|
|||||||
SHLQ $32, AX
|
SHLQ $32, AX
|
||||||
SHRQ $32, DX
|
SHRQ $32, DX
|
||||||
|
|
||||||
ADDQ acc3, acc4
|
SUBQ AX, acc4
|
||||||
|
SBBQ DX, acc5
|
||||||
|
SBBQ AX, acc0
|
||||||
|
MOVQ acc3, AX
|
||||||
|
SBBQ DX, acc3
|
||||||
|
|
||||||
|
ADDQ AX, acc4
|
||||||
ADCQ $0, acc5
|
ADCQ $0, acc5
|
||||||
ADCQ $0, acc0
|
ADCQ $0, acc0
|
||||||
ADCQ acc3, acc1
|
ADCQ acc3, acc1
|
||||||
ADCQ $0, acc2
|
ADCQ $0, acc2
|
||||||
|
|
||||||
SUBQ AX, acc4
|
|
||||||
SBBQ DX, acc5
|
|
||||||
SBBQ AX, acc0
|
|
||||||
SBBQ DX, acc1
|
|
||||||
SBBQ $0, acc2
|
|
||||||
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
|
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
|
||||||
RET
|
RET
|
||||||
|
|
||||||
@ -395,19 +397,19 @@ mulBMI2:
|
|||||||
SHLQ $32, AX
|
SHLQ $32, AX
|
||||||
SHRQ $32, DX
|
SHRQ $32, DX
|
||||||
|
|
||||||
ADDQ acc0, acc1
|
SUBQ AX, acc1
|
||||||
|
SBBQ DX, acc2
|
||||||
|
SBBQ AX, acc3
|
||||||
|
MOVQ acc0, AX
|
||||||
|
SBBQ DX, acc0
|
||||||
|
|
||||||
|
ADDQ AX, acc1
|
||||||
ADCQ $0, acc2
|
ADCQ $0, acc2
|
||||||
ADCQ $0, acc3
|
ADCQ $0, acc3
|
||||||
ADCQ acc0, acc4
|
ADCQ acc0, acc4
|
||||||
ADCQ $0, acc5
|
ADCQ $0, acc5
|
||||||
|
|
||||||
SUBQ AX, acc1
|
|
||||||
SBBQ DX, acc2
|
|
||||||
SBBQ AX, acc3
|
|
||||||
SBBQ DX, acc4
|
|
||||||
SBBQ $0, acc5
|
|
||||||
XORQ acc0, acc0
|
XORQ acc0, acc0
|
||||||
|
|
||||||
// x * y[1]
|
// x * y[1]
|
||||||
MOVQ (8*1)(y_ptr), DX
|
MOVQ (8*1)(y_ptr), DX
|
||||||
MULXQ (8*0)(x_ptr), AX, BX
|
MULXQ (8*0)(x_ptr), AX, BX
|
||||||
@ -436,19 +438,19 @@ mulBMI2:
|
|||||||
SHLQ $32, AX
|
SHLQ $32, AX
|
||||||
SHRQ $32, DX
|
SHRQ $32, DX
|
||||||
|
|
||||||
ADDQ acc1, acc2
|
SUBQ AX, acc2
|
||||||
|
SBBQ DX, acc3
|
||||||
|
SBBQ AX, acc4
|
||||||
|
MOVQ acc1, AX
|
||||||
|
SBBQ DX, acc1
|
||||||
|
|
||||||
|
ADDQ AX, acc2
|
||||||
ADCQ $0, acc3
|
ADCQ $0, acc3
|
||||||
ADCQ $0, acc4
|
ADCQ $0, acc4
|
||||||
ADCQ acc1, acc5
|
ADCQ acc1, acc5
|
||||||
ADCQ $0, acc0
|
ADCQ $0, acc0
|
||||||
|
|
||||||
SUBQ AX, acc2
|
|
||||||
SBBQ DX, acc3
|
|
||||||
SBBQ AX, acc4
|
|
||||||
SBBQ DX, acc5
|
|
||||||
SBBQ $0, acc0
|
|
||||||
XORQ acc1, acc1
|
XORQ acc1, acc1
|
||||||
|
|
||||||
// x * y[2]
|
// x * y[2]
|
||||||
MOVQ (8*2)(y_ptr), DX
|
MOVQ (8*2)(y_ptr), DX
|
||||||
|
|
||||||
@ -477,17 +479,18 @@ mulBMI2:
|
|||||||
SHLQ $32, AX
|
SHLQ $32, AX
|
||||||
SHRQ $32, DX
|
SHRQ $32, DX
|
||||||
|
|
||||||
ADDQ acc2, acc3
|
SUBQ AX, acc3
|
||||||
|
SBBQ DX, acc4
|
||||||
|
SBBQ AX, acc5
|
||||||
|
MOVQ acc2, AX
|
||||||
|
SBBQ DX, acc2
|
||||||
|
|
||||||
|
ADDQ AX, acc3
|
||||||
ADCQ $0, acc4
|
ADCQ $0, acc4
|
||||||
ADCQ $0, acc5
|
ADCQ $0, acc5
|
||||||
ADCQ acc2, acc0
|
ADCQ acc2, acc0
|
||||||
ADCQ $0, acc1
|
ADCQ $0, acc1
|
||||||
|
|
||||||
SUBQ AX, acc3
|
|
||||||
SBBQ DX, acc4
|
|
||||||
SBBQ AX, acc5
|
|
||||||
SBBQ DX, acc0
|
|
||||||
SBBQ $0, acc1
|
|
||||||
XORQ acc2, acc2
|
XORQ acc2, acc2
|
||||||
// x * y[3]
|
// x * y[3]
|
||||||
MOVQ (8*3)(y_ptr), DX
|
MOVQ (8*3)(y_ptr), DX
|
||||||
@ -517,17 +520,18 @@ mulBMI2:
|
|||||||
SHLQ $32, AX
|
SHLQ $32, AX
|
||||||
SHRQ $32, DX
|
SHRQ $32, DX
|
||||||
|
|
||||||
ADDQ acc3, acc4
|
SUBQ AX, acc4
|
||||||
|
SBBQ DX, acc5
|
||||||
|
SBBQ AX, acc0
|
||||||
|
MOVQ acc3, AX
|
||||||
|
SBBQ DX, acc3
|
||||||
|
|
||||||
|
ADDQ AX, acc4
|
||||||
ADCQ $0, acc5
|
ADCQ $0, acc5
|
||||||
ADCQ $0, acc0
|
ADCQ $0, acc0
|
||||||
ADCQ acc3, acc1
|
ADCQ acc3, acc1
|
||||||
ADCQ $0, acc2
|
ADCQ $0, acc2
|
||||||
|
|
||||||
SUBQ AX, acc4
|
|
||||||
SBBQ DX, acc5
|
|
||||||
SBBQ AX, acc0
|
|
||||||
SBBQ DX, acc1
|
|
||||||
SBBQ $0, acc2
|
|
||||||
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
|
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
|
||||||
RET
|
RET
|
||||||
|
|
||||||
@ -550,32 +554,35 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
|
|||||||
SHLQ $32, AX
|
SHLQ $32, AX
|
||||||
SHRQ $32, DX
|
SHRQ $32, DX
|
||||||
|
|
||||||
ADDQ acc0, acc1
|
SUBQ AX, acc1
|
||||||
|
SBBQ DX, acc2
|
||||||
|
SBBQ AX, acc3
|
||||||
|
MOVQ acc0, AX
|
||||||
|
SBBQ DX, acc0
|
||||||
|
|
||||||
|
ADDQ AX, acc1
|
||||||
ADCQ $0, acc2
|
ADCQ $0, acc2
|
||||||
ADCQ $0, acc3
|
ADCQ $0, acc3
|
||||||
ADCQ acc0, acc4
|
ADCQ acc0, acc4
|
||||||
|
|
||||||
SUBQ AX, acc1
|
|
||||||
SBBQ DX, acc2
|
|
||||||
SBBQ AX, acc3
|
|
||||||
SBBQ DX, acc4
|
|
||||||
XORQ acc5, acc5
|
XORQ acc5, acc5
|
||||||
|
|
||||||
// Second stage
|
// Second stage
|
||||||
MOVQ acc1, AX
|
MOVQ acc1, AX
|
||||||
MOVQ acc1, DX
|
MOVQ acc1, DX
|
||||||
SHLQ $32, AX
|
SHLQ $32, AX
|
||||||
SHRQ $32, DX
|
SHRQ $32, DX
|
||||||
|
|
||||||
ADDQ acc1, acc2
|
SUBQ AX, acc2
|
||||||
|
SBBQ DX, acc3
|
||||||
|
SBBQ AX, acc4
|
||||||
|
MOVQ acc1, AX
|
||||||
|
SBBQ DX, acc5
|
||||||
|
|
||||||
|
ADDQ AX, acc2
|
||||||
ADCQ $0, acc3
|
ADCQ $0, acc3
|
||||||
ADCQ $0, acc4
|
ADCQ $0, acc4
|
||||||
ADCQ acc1, acc5
|
ADCQ acc1, acc5
|
||||||
|
|
||||||
SUBQ AX, acc2
|
|
||||||
SBBQ DX, acc3
|
|
||||||
SBBQ AX, acc4
|
|
||||||
SBBQ DX, acc5
|
|
||||||
XORQ acc0, acc0
|
XORQ acc0, acc0
|
||||||
// Third stage
|
// Third stage
|
||||||
MOVQ acc2, AX
|
MOVQ acc2, AX
|
||||||
@ -583,15 +590,17 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
|
|||||||
SHLQ $32, AX
|
SHLQ $32, AX
|
||||||
SHRQ $32, DX
|
SHRQ $32, DX
|
||||||
|
|
||||||
ADDQ acc2, acc3
|
SUBQ AX, acc3
|
||||||
|
SBBQ DX, acc4
|
||||||
|
SBBQ AX, acc5
|
||||||
|
MOVQ acc2, AX
|
||||||
|
SBBQ DX, acc2
|
||||||
|
|
||||||
|
ADDQ AX, acc3
|
||||||
ADCQ $0, acc4
|
ADCQ $0, acc4
|
||||||
ADCQ $0, acc5
|
ADCQ $0, acc5
|
||||||
ADCQ acc2, acc0
|
ADCQ acc2, acc0
|
||||||
|
|
||||||
SUBQ AX, acc3
|
|
||||||
SBBQ DX, acc4
|
|
||||||
SBBQ AX, acc5
|
|
||||||
SBBQ DX, acc0
|
|
||||||
XORQ acc1, acc1
|
XORQ acc1, acc1
|
||||||
// Last stage
|
// Last stage
|
||||||
MOVQ acc3, AX
|
MOVQ acc3, AX
|
||||||
@ -599,15 +608,16 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
|
|||||||
SHLQ $32, AX
|
SHLQ $32, AX
|
||||||
SHRQ $32, DX
|
SHRQ $32, DX
|
||||||
|
|
||||||
ADDQ acc3, acc4
|
|
||||||
ADCQ $0, acc5
|
|
||||||
ADCQ $0, acc0
|
|
||||||
ADCQ acc3, acc1
|
|
||||||
|
|
||||||
SUBQ AX, acc4
|
SUBQ AX, acc4
|
||||||
SBBQ DX, acc5
|
SBBQ DX, acc5
|
||||||
SBBQ AX, acc0
|
SBBQ AX, acc0
|
||||||
SBBQ DX, acc1
|
MOVQ acc3, AX
|
||||||
|
SBBQ DX, acc3
|
||||||
|
|
||||||
|
ADDQ AX, acc4
|
||||||
|
ADCQ $0, acc5
|
||||||
|
ADCQ $0, acc0
|
||||||
|
ADCQ acc3, acc1
|
||||||
|
|
||||||
MOVQ acc4, x_ptr
|
MOVQ acc4, x_ptr
|
||||||
MOVQ acc5, acc3
|
MOVQ acc5, acc3
|
||||||
|
@ -237,6 +237,72 @@ GLOBL p256one<>(SB), 8, $32
|
|||||||
CMOVQCS t2, acc6 \
|
CMOVQCS t2, acc6 \
|
||||||
CMOVQCS t3, acc7
|
CMOVQCS t3, acc7
|
||||||
|
|
||||||
|
#define sm2P256MulReductionInternal() \
|
||||||
|
\// First reduction step
|
||||||
|
MOVQ acc0, mul0 \
|
||||||
|
MOVQ acc0, mul1 \
|
||||||
|
SHLQ $32, mul0 \
|
||||||
|
SHRQ $32, mul1 \
|
||||||
|
\
|
||||||
|
SUBQ mul0, acc1 \
|
||||||
|
SBBQ mul1, acc2 \
|
||||||
|
SBBQ mul0, acc3 \
|
||||||
|
MOVQ acc0, mul0 \
|
||||||
|
SBBQ mul1, acc0 \
|
||||||
|
\
|
||||||
|
ADDQ mul0, acc1 \
|
||||||
|
ADCQ $0, acc2 \
|
||||||
|
ADCQ $0, acc3 \
|
||||||
|
ADCQ $0, acc0 \
|
||||||
|
\// Second reduction step
|
||||||
|
MOVQ acc1, mul0 \
|
||||||
|
MOVQ acc1, mul1 \
|
||||||
|
SHLQ $32, mul0 \
|
||||||
|
SHRQ $32, mul1 \
|
||||||
|
\
|
||||||
|
SUBQ mul0, acc2 \
|
||||||
|
SBBQ mul1, acc3 \
|
||||||
|
SBBQ mul0, acc0 \
|
||||||
|
MOVQ acc1, mul0 \
|
||||||
|
SBBQ mul1, acc1 \
|
||||||
|
\
|
||||||
|
ADDQ mul0, acc2 \
|
||||||
|
ADCQ $0, acc3 \
|
||||||
|
ADCQ $0, acc0 \
|
||||||
|
ADCQ $0, acc1 \
|
||||||
|
\// Third reduction step
|
||||||
|
MOVQ acc2, mul0 \
|
||||||
|
MOVQ acc2, mul1 \
|
||||||
|
SHLQ $32, mul0 \
|
||||||
|
SHRQ $32, mul1 \
|
||||||
|
\
|
||||||
|
SUBQ mul0, acc3 \
|
||||||
|
SBBQ mul1, acc0 \
|
||||||
|
SBBQ mul0, acc1 \
|
||||||
|
MOVQ acc2, mul0 \
|
||||||
|
SBBQ mul1, acc2 \
|
||||||
|
\
|
||||||
|
ADDQ mul0, acc3 \
|
||||||
|
ADCQ $0, acc0 \
|
||||||
|
ADCQ $0, acc1 \
|
||||||
|
ADCQ $0, acc2 \
|
||||||
|
\// Last reduction step
|
||||||
|
MOVQ acc3, mul0 \
|
||||||
|
MOVQ acc3, mul1 \
|
||||||
|
SHLQ $32, mul0 \
|
||||||
|
SHRQ $32, mul1 \
|
||||||
|
\
|
||||||
|
SUBQ mul0, acc0 \
|
||||||
|
SBBQ mul1, acc1 \
|
||||||
|
SBBQ mul0, acc2 \
|
||||||
|
MOVQ acc3, mul0 \
|
||||||
|
SBBQ mul1, acc3 \
|
||||||
|
\
|
||||||
|
ADDQ mul0, acc0 \
|
||||||
|
ADCQ $0, acc1 \
|
||||||
|
ADCQ $0, acc2 \
|
||||||
|
ADCQ $0, acc3
|
||||||
|
|
||||||
#define p256PointDoubleInit() \
|
#define p256PointDoubleInit() \
|
||||||
MOVOU (16*0)(BX), X0 \
|
MOVOU (16*0)(BX), X0 \
|
||||||
MOVOU (16*1)(BX), X1 \
|
MOVOU (16*1)(BX), X1 \
|
||||||
|
@ -757,67 +757,9 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
|
|||||||
ADDQ mul0, acc6
|
ADDQ mul0, acc6
|
||||||
ADCQ $0, mul1
|
ADCQ $0, mul1
|
||||||
MOVQ mul1, acc7
|
MOVQ mul1, acc7
|
||||||
// First reduction step
|
|
||||||
PEXTRQ $0, X0, acc0
|
PEXTRQ $0, X0, acc0
|
||||||
MOVQ acc0, mul0
|
sm2P256MulReductionInternal()
|
||||||
MOVQ acc0, mul1
|
|
||||||
SHLQ $32, mul0
|
|
||||||
SHRQ $32, mul1
|
|
||||||
|
|
||||||
ADDQ acc0, acc1
|
|
||||||
ADCQ $0, acc2
|
|
||||||
ADCQ $0, acc3
|
|
||||||
ADCQ $0, acc0
|
|
||||||
|
|
||||||
SUBQ mul0, acc1
|
|
||||||
SBBQ mul1, acc2
|
|
||||||
SBBQ mul0, acc3
|
|
||||||
SBBQ mul1, acc0
|
|
||||||
// Second reduction step
|
|
||||||
MOVQ acc1, mul0
|
|
||||||
MOVQ acc1, mul1
|
|
||||||
SHLQ $32, mul0
|
|
||||||
SHRQ $32, mul1
|
|
||||||
|
|
||||||
ADDQ acc1, acc2
|
|
||||||
ADCQ $0, acc3
|
|
||||||
ADCQ $0, acc0
|
|
||||||
ADCQ $0, acc1
|
|
||||||
|
|
||||||
SUBQ mul0, acc2
|
|
||||||
SBBQ mul1, acc3
|
|
||||||
SBBQ mul0, acc0
|
|
||||||
SBBQ mul1, acc1
|
|
||||||
// Third reduction step
|
|
||||||
MOVQ acc2, mul0
|
|
||||||
MOVQ acc2, mul1
|
|
||||||
SHLQ $32, mul0
|
|
||||||
SHRQ $32, mul1
|
|
||||||
|
|
||||||
ADDQ acc2, acc3
|
|
||||||
ADCQ $0, acc0
|
|
||||||
ADCQ $0, acc1
|
|
||||||
ADCQ $0, acc2
|
|
||||||
|
|
||||||
SUBQ mul0, acc3
|
|
||||||
SBBQ mul1, acc0
|
|
||||||
SBBQ mul0, acc1
|
|
||||||
SBBQ mul1, acc2
|
|
||||||
// Last reduction step
|
|
||||||
MOVQ acc3, mul0
|
|
||||||
MOVQ acc3, mul1
|
|
||||||
SHLQ $32, mul0
|
|
||||||
SHRQ $32, mul1
|
|
||||||
|
|
||||||
ADDQ acc3, acc0
|
|
||||||
ADCQ $0, acc1
|
|
||||||
ADCQ $0, acc2
|
|
||||||
ADCQ $0, acc3
|
|
||||||
|
|
||||||
SUBQ mul0, acc0
|
|
||||||
SBBQ mul1, acc1
|
|
||||||
SBBQ mul0, acc2
|
|
||||||
SBBQ mul1, acc3
|
|
||||||
MOVQ $0, mul0
|
MOVQ $0, mul0
|
||||||
// Add bits [511:256] of the result
|
// Add bits [511:256] of the result
|
||||||
ADCQ acc0, acc4
|
ADCQ acc0, acc4
|
||||||
@ -918,67 +860,8 @@ internalMulBMI2:
|
|||||||
ADDQ mul0, acc6
|
ADDQ mul0, acc6
|
||||||
ADCQ $0, acc7
|
ADCQ $0, acc7
|
||||||
|
|
||||||
// First reduction step
|
|
||||||
PEXTRQ $0, X0, acc0
|
PEXTRQ $0, X0, acc0
|
||||||
MOVQ acc0, mul0
|
sm2P256MulReductionInternal()
|
||||||
MOVQ acc0, mul1
|
|
||||||
SHLQ $32, mul0
|
|
||||||
SHRQ $32, mul1
|
|
||||||
|
|
||||||
ADDQ acc0, acc1
|
|
||||||
ADCQ $0, acc2
|
|
||||||
ADCQ $0, acc3
|
|
||||||
ADCQ $0, acc0
|
|
||||||
|
|
||||||
SUBQ mul0, acc1
|
|
||||||
SBBQ mul1, acc2
|
|
||||||
SBBQ mul0, acc3
|
|
||||||
SBBQ mul1, acc0
|
|
||||||
// Second reduction step
|
|
||||||
MOVQ acc1, mul0
|
|
||||||
MOVQ acc1, mul1
|
|
||||||
SHLQ $32, mul0
|
|
||||||
SHRQ $32, mul1
|
|
||||||
|
|
||||||
ADDQ acc1, acc2
|
|
||||||
ADCQ $0, acc3
|
|
||||||
ADCQ $0, acc0
|
|
||||||
ADCQ $0, acc1
|
|
||||||
|
|
||||||
SUBQ mul0, acc2
|
|
||||||
SBBQ mul1, acc3
|
|
||||||
SBBQ mul0, acc0
|
|
||||||
SBBQ mul1, acc1
|
|
||||||
// Third reduction step
|
|
||||||
MOVQ acc2, mul0
|
|
||||||
MOVQ acc2, mul1
|
|
||||||
SHLQ $32, mul0
|
|
||||||
SHRQ $32, mul1
|
|
||||||
|
|
||||||
ADDQ acc2, acc3
|
|
||||||
ADCQ $0, acc0
|
|
||||||
ADCQ $0, acc1
|
|
||||||
ADCQ $0, acc2
|
|
||||||
|
|
||||||
SUBQ mul0, acc3
|
|
||||||
SBBQ mul1, acc0
|
|
||||||
SBBQ mul0, acc1
|
|
||||||
SBBQ mul1, acc2
|
|
||||||
// Last reduction step
|
|
||||||
MOVQ acc3, mul0
|
|
||||||
MOVQ acc3, mul1
|
|
||||||
SHLQ $32, mul0
|
|
||||||
SHRQ $32, mul1
|
|
||||||
|
|
||||||
ADDQ acc3, acc0
|
|
||||||
ADCQ $0, acc1
|
|
||||||
ADCQ $0, acc2
|
|
||||||
ADCQ $0, acc3
|
|
||||||
|
|
||||||
SUBQ mul0, acc0
|
|
||||||
SBBQ mul1, acc1
|
|
||||||
SBBQ mul0, acc2
|
|
||||||
SBBQ mul1, acc3
|
|
||||||
MOVQ $0, mul0
|
MOVQ $0, mul0
|
||||||
// Add bits [511:256] of the result
|
// Add bits [511:256] of the result
|
||||||
ADCQ acc0, acc4
|
ADCQ acc0, acc4
|
||||||
|
Loading…
x
Reference in New Issue
Block a user