sm2 p256 arm64: replace multiplications in the reduction steps with shifts and add/subtract

This commit is contained in:
Emman 2021-12-28 15:28:22 +08:00
parent 6e01ca2f63
commit 4991042efd

View File

@@ -215,82 +215,57 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
LDP 1*16(a_ptr), (acc2, acc3) LDP 1*16(a_ptr), (acc2, acc3)
// Only reduce, no multiplications are needed // Only reduce, no multiplications are needed
// First reduction step // First reduction step
MUL const1, acc0, t0 LSL $32, acc0, y0
ADDS t0, acc1, acc1 // (carry1, acc1) = acc1 + L(acc0*p1) LSR $32, acc0, y1
UMULH const1, acc0, y0 // y0 = H(acc0*p1)
ADDS acc0, acc1, acc1
ADCS $0, acc2, acc2
ADCS $0, acc3, acc3
ADC $0, acc0, acc0
MUL const2, acc0, t0 SUBS y0, acc1
ADCS t0, acc2, acc2 // (carry2, acc2) = acc2 + L(acc0*p2) SBCS y1, acc2
UMULH const2, acc0, hlp0 // hlp0 = H(acc0*p2) SBCS y0, acc3
SBC y1, acc0
MUL const3, acc0, t0 // t0 = L(acc0*p3)
ADCS t0, acc3, acc3 // (carry3,acc3) = acc3 + L(acc0*p3)
UMULH const3, acc0, y1 // y1 = H(acc0*p3)
ADC $0, y1
ADDS acc0, acc1, acc1 // (carry4, acc1) = acc0 + acc1 + L(acc0*p1)
ADCS y0, acc2, acc2 // (carry5, acc2) = carry4 + acc2 + L(acc0*p2) + H(acc0*p1)
ADCS hlp0, acc3, acc3 // (carry6, acc3) = carry5 + acc3 + L(acc0*p3) + H(acc0*p2)
ADC $0, y1, acc0 // acc0 = carry6 + H(acc0*p3)
// Second reduction step // Second reduction step
MUL const1, acc1, t0 LSL $32, acc1, y0
ADDS t0, acc2, acc2 // (carry1, acc2) = acc2 + L(acc1*p1) LSR $32, acc1, y1
UMULH const1, acc1, y0 // y0 = H(acc1*p1)
ADDS acc1, acc2, acc2
ADCS $0, acc3, acc3
ADCS $0, acc0, acc0
ADC $0, acc1, acc1
MUL const2, acc1, t0 SUBS y0, acc2
ADCS t0, acc3, acc3 // (carry2, acc3) = acc3 + L(acc1*p2) SBCS y1, acc3
UMULH const2, acc1, hlp0 // hlp0 = H(acc1*p2) SBCS y0, acc0
SBC y1, acc1
MUL const3, acc1, t0 // t0 = L(acc1*p3)
ADCS t0, acc0, acc0 // (carry3,acc0) = acc0 + L(acc1*p3)
UMULH const3, acc1, y1 // y1 = H(acc1*p3)
ADC $0, y1
ADDS acc1, acc2, acc2 // (carry4, acc2) = acc1 + acc2 + L(acc1*p1)
ADCS y0, acc3, acc3 // (carry5, acc3) = carry4 + acc3 + L(acc1*p2) + H(acc1*p1)
ADCS hlp0, acc0, acc0 // (carry6, acc0) = carry5 + acc0 + L(acc1*p3) + H(acc1*p2)
ADC $0, y1, acc1 // acc1 = carry6 + H(acc1*p3)
// Third reduction step // Third reduction step
MUL const1, acc2, t0 LSL $32, acc2, y0
ADDS t0, acc3, acc3 // (carry1, acc3) = acc3 + L(acc2*p1) LSR $32, acc2, y1
UMULH const1, acc2, y0 // y0 = H(acc2*p1)
ADDS acc2, acc3, acc3
ADCS $0, acc0, acc0
ADCS $0, acc1, acc1
ADC $0, acc2, acc2
MUL const2, acc2, t0 SUBS y0, acc3
ADCS t0, acc0, acc0 // (carry2, acc0) = acc0 + L(acc2*p2) SBCS y1, acc0
UMULH const2, acc2, hlp0 // hlp0 = H(acc2*p2) SBCS y0, acc1
SBC y1, acc2
MUL const3, acc2, t0 // t0 = L(acc2*p3)
ADCS t0, acc1, acc1 // (carry3,acc1) = acc1 + L(acc2*p3)
UMULH const3, acc2, y1 // y1 = H(acc2*p3)
ADC $0, y1
ADDS acc2, acc3, acc3 // (carry4, acc3) = acc2 + acc3 + L(acc2*p1)
ADCS y0, acc0, acc0 // (carry5, acc0) = carry4 + acc0 + L(acc2*p2) + H(acc2*p1)
ADCS hlp0, acc1, acc1 // (carry6, acc1) = carry5 + acc1 + L(acc2*p3) + H(acc2*p2)
ADC $0, y1, acc2 // acc2 = carry6 + H(acc2*p3)
// Last reduction step // Last reduction step
MUL const1, acc3, t0 LSL $32, acc3, y0
ADDS t0, acc0, acc0 // (carry1, acc0) = acc0 + L(acc3*p1) LSR $32, acc3, y1
UMULH const1, acc3, y0 // y0 = H(acc3*p1)
ADDS acc3, acc0, acc0
ADCS $0, acc1, acc1
ADCS $0, acc2, acc2
ADC $0, acc3, acc3
MUL const2, acc3, t0 SUBS y0, acc0
ADCS t0, acc1, acc1 // (carry2, acc1) = acc1 + L(acc3*p2) SBCS y1, acc1
UMULH const2, acc3, hlp0 // hlp0 = H(acc3*p2) SBCS y0, acc2
SBC y1, acc3
MUL const3, acc3, t0 // t0 = L(acc3*p3)
ADCS t0, acc2, acc2 // (carry3,acc2) = acc2 + L(acc3*p3)
UMULH const3, acc3, y1 // y1 = H(acc3*p3)
ADC $0, y1
ADDS acc3, acc0, acc0 // (carry4, acc0) = acc3 + acc0 + L(acc3*p1)
ADCS y0, acc1, acc1 // (carry5, acc1) = carry4 + acc1 + L(acc3*p2) + H(acc3*p1)
ADCS hlp0, acc2, acc2 // (carry6, acc2) = carry5 + acc2 + L(acc3*p3) + H(acc3*p2)
ADC $0, y1, acc3 // acc3 = carry6 + H(acc3*p3)
SUBS const0, acc0, t0 SUBS const0, acc0, t0
SBCS const1, acc1, t1 SBCS const1, acc1, t1
@@ -905,81 +880,57 @@ TEXT sm2P256SqrInternal<>(SB),NOSPLIT,$0
UMULH x3, x3, t1 UMULH x3, x3, t1
ADCS t1, acc7, acc7 ADCS t1, acc7, acc7
// First reduction step // First reduction step
MUL const1, acc0, t0 LSL $32, acc0, y0
ADDS t0, acc1, acc1 // (carry1, acc1) = acc1 + L(acc0*p1) LSR $32, acc0, y1
UMULH const1, acc0, y0 // y0 = H(acc0*p1)
ADDS acc0, acc1, acc1
ADCS $0, acc2, acc2
ADCS $0, acc3, acc3
ADC $0, acc0, acc0
MUL const2, acc0, t0 SUBS y0, acc1
ADCS t0, acc2, acc2 // (carry2, acc2) = acc2 + L(acc0*p2) SBCS y1, acc2
UMULH const2, acc0, hlp0 // hlp0 = H(acc0*p2) SBCS y0, acc3
SBC y1, acc0
MUL const3, acc0, t0 // t0 = L(acc0*p3)
ADCS t0, acc3, acc3 // (carry3,acc3) = acc3 + L(acc0*p3)
UMULH const3, acc0, y1 // y1 = H(acc0*p3)
ADC $0, y1
ADDS acc0, acc1, acc1 // (carry4, acc1) = acc0 + acc1 + L(acc0*p1)
ADCS y0, acc2, acc2 // (carry5, acc2) = carry4 + acc2 + L(acc0*p2) + H(acc0*p1)
ADCS hlp0, acc3, acc3 // (carry6, acc3) = carry5 + acc3 + L(acc0*p3) + H(acc0*p2)
ADC $0, y1, acc0 // acc0 = carry6 + H(acc0*p3)
// Second reduction step // Second reduction step
MUL const1, acc1, t0 LSL $32, acc1, y0
ADDS t0, acc2, acc2 // (carry1, acc2) = acc2 + L(acc1*p1) LSR $32, acc1, y1
UMULH const1, acc1, y0 // y0 = H(acc1*p1)
ADDS acc1, acc2, acc2
ADCS $0, acc3, acc3
ADCS $0, acc0, acc0
ADC $0, acc1, acc1
MUL const2, acc1, t0 SUBS y0, acc2
ADCS t0, acc3, acc3 // (carry2, acc3) = acc3 + L(acc1*p2) SBCS y1, acc3
UMULH const2, acc1, hlp0 // hlp0 = H(acc1*p2) SBCS y0, acc0
SBC y1, acc1
MUL const3, acc1, t0 // t0 = L(acc1*p3)
ADCS t0, acc0, acc0 // (carry3,acc0) = acc0 + L(acc1*p3)
UMULH const3, acc1, y1 // y1 = H(acc1*p3)
ADC $0, y1
ADDS acc1, acc2, acc2 // (carry4, acc2) = acc1 + acc2 + L(acc1*p1)
ADCS y0, acc3, acc3 // (carry5, acc3) = carry4 + acc3 + L(acc1*p2) + H(acc1*p1)
ADCS hlp0, acc0, acc0 // (carry6, acc0) = carry5 + acc0 + L(acc1*p3) + H(acc1*p2)
ADC $0, y1, acc1 // acc1 = carry6 + H(acc1*p3)
// Third reduction step // Third reduction step
MUL const1, acc2, t0 LSL $32, acc2, y0
ADDS t0, acc3, acc3 // (carry1, acc3) = acc3 + L(acc2*p1) LSR $32, acc2, y1
UMULH const1, acc2, y0 // y0 = H(acc2*p1)
ADDS acc2, acc3, acc3
ADCS $0, acc0, acc0
ADCS $0, acc1, acc1
ADC $0, acc2, acc2
MUL const2, acc2, t0 SUBS y0, acc3
ADCS t0, acc0, acc0 // (carry2, acc0) = acc0 + L(acc2*p2) SBCS y1, acc0
UMULH const2, acc2, hlp0 // hlp0 = H(acc2*p2) SBCS y0, acc1
SBC y1, acc2
MUL const3, acc2, t0 // t0 = L(acc2*p3)
ADCS t0, acc1, acc1 // (carry3,acc1) = acc1 + L(acc2*p3)
UMULH const3, acc2, y1 // y1 = H(acc2*p3)
ADC $0, y1
ADDS acc2, acc3, acc3 // (carry4, acc3) = acc2 + acc3 + L(acc2*p1)
ADCS y0, acc0, acc0 // (carry5, acc0) = carry4 + acc0 + L(acc2*p2) + H(acc2*p1)
ADCS hlp0, acc1, acc1 // (carry6, acc1) = carry5 + acc1 + L(acc2*p3) + H(acc2*p2)
ADC $0, y1, acc2 // acc2 = carry6 + H(acc2*p3)
// Last reduction step // Last reduction step
MUL const1, acc3, t0 LSL $32, acc3, y0
ADDS t0, acc0, acc0 // (carry1, acc0) = acc0 + L(acc3*p1) LSR $32, acc3, y1
UMULH const1, acc3, y0 // y0 = H(acc3*p1)
ADDS acc3, acc0, acc0
ADCS $0, acc1, acc1
ADCS $0, acc2, acc2
ADC $0, acc3, acc3
MUL const2, acc3, t0 SUBS y0, acc0
ADCS t0, acc1, acc1 // (carry2, acc1) = acc1 + L(acc3*p2) SBCS y1, acc1
UMULH const2, acc3, hlp0 // hlp0 = H(acc3*p2) SBCS y0, acc2
SBC y1, acc3
MUL const3, acc3, t0 // t0 = L(acc3*p3)
ADCS t0, acc2, acc2 // (carry3,acc2) = acc2 + L(acc3*p3)
UMULH const3, acc3, y1 // y1 = H(acc3*p3)
ADC $0, acc7 // acc7 = carry3 + acc7
ADDS acc3, acc0, acc0 // (carry4, acc0) = acc3 + acc0 + L(acc3*p1)
ADCS y0, acc1, acc1 // (carry5, acc1) = carry4 + acc1 + L(acc3*p2) + H(acc3*p1)
ADCS hlp0, acc2, acc2 // (carry6, acc2) = carry5 + acc2 + L(acc3*p3) + H(acc3*p2)
ADC $0, y1, acc3 // acc3 = carry6 + H(acc3*p3)
// Add bits [511:256] of the sqr result // Add bits [511:256] of the sqr result
ADDS acc4, acc0, acc0 ADDS acc4, acc0, acc0
@@ -1018,24 +969,18 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
UMULH y0, x3, acc4 UMULH y0, x3, acc4
ADC $0, acc4 ADC $0, acc4
// First reduction step // First reduction step
MUL const1, acc0, t0 LSL $32, acc0, t0
ADDS t0, acc1, acc1 // (carry1, acc1) = acc1 + L(acc0*p1) LSR $32, acc0, t1
UMULH const1, acc0, y0 // y0 = H(acc0*p1)
ADDS acc0, acc1, acc1
ADCS $0, acc2, acc2
ADCS $0, acc3, acc3
ADC $0, acc0, acc0
MUL const2, acc0, t0 SUBS t0, acc1
ADCS t0, acc2, acc2 // (carry2, acc2) = acc2 + L(acc0*p2) SBCS t1, acc2
UMULH const2, acc0, hlp0 // hlp0 = H(acc0*p2) SBCS t0, acc3
SBC t1, acc0
MUL const3, acc0, t0 // t0 = L(acc0*p3)
ADCS t0, acc3, acc3 // (carry3,acc3) = acc3 + L(acc0*p3)
UMULH const3, acc0, acc5 // acc5 = H(acc0*p3)
ADC $0, acc4 // acc4 = carry3 + acc4
ADDS acc0, acc1, acc1 // (carry4, acc1) = acc0 + acc1 + L(acc0*p1)
ADCS y0, acc2, acc2 // (carry5, acc2) = carry4 + acc2 + L(acc0*p2) + H(acc0*p1)
ADCS hlp0, acc3, acc3 // (carry6, acc3) = carry5 + acc3 + L(acc0*p3) + H(acc0*p2)
ADC $0, acc5, acc0 // acc0 = carry6 + H(acc0*p3)
// y[1] * x // y[1] * x
MUL y1, x0, t0 MUL y1, x0, t0
@@ -1060,24 +1005,18 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
ADCS acc6, acc4 ADCS acc6, acc4
ADC hlp0, acc5 ADC hlp0, acc5
// Second reduction step // Second reduction step
MUL const1, acc1, t0 LSL $32, acc1, t0
ADDS t0, acc2, acc2 // (carry1, acc2) = acc2 + L(acc1*p1) LSR $32, acc1, t1
UMULH const1, acc1, y0 // y0 = H(acc1*p1)
ADDS acc1, acc2, acc2
ADCS $0, acc3, acc3
ADCS $0, acc0, acc0
ADC $0, acc1, acc1
MUL const2, acc1, t0 SUBS t0, acc2
ADCS t0, acc3, acc3 // (carry2, acc3) = acc3 + L(acc1*p2) SBCS t1, acc3
UMULH const2, acc1, hlp0 // hlp0 = H(acc1*p2) SBCS t0, acc0
SBC t1, acc1
MUL const3, acc1, t0 // t0 = L(acc1*p3)
ADCS t0, acc0, acc0 // (carry3,acc0) = acc0 + L(acc1*p3)
UMULH const3, acc1, y1 // y1 = H(acc1*p3)
ADC $0, acc5 // acc5 = carry3 + acc5
ADDS acc1, acc2, acc2 // (carry4, acc2) = acc1 + acc2 + L(acc1*p1)
ADCS y0, acc3, acc3 // (carry5, acc3) = carry4 + acc3 + L(acc1*p2) + H(acc1*p1)
ADCS hlp0, acc0, acc0 // (carry6, acc0) = carry5 + acc0 + L(acc1*p3) + H(acc1*p2)
ADC $0, y1, acc1 // acc1 = carry6 + H(acc1*p3)
// y[2] * x // y[2] * x
MUL y2, x0, t0 MUL y2, x0, t0
@@ -1102,24 +1041,18 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
ADCS y1, acc5 ADCS y1, acc5
ADC hlp0, acc6 ADC hlp0, acc6
// Third reduction step // Third reduction step
MUL const1, acc2, t0 LSL $32, acc2, t0
ADDS t0, acc3, acc3 // (carry1, acc3) = acc3 + L(acc2*p1) LSR $32, acc2, t1
UMULH const1, acc2, y0 // y0 = H(acc2*p1)
ADDS acc2, acc3, acc3
ADCS $0, acc0, acc0
ADCS $0, acc1, acc1
ADC $0, acc2, acc2
MUL const2, acc2, t0 SUBS t0, acc3
ADCS t0, acc0, acc0 // (carry2, acc0) = acc0 + L(acc2*p2) SBCS t1, acc0
UMULH const2, acc2, hlp0 // hlp0 = H(acc2*p2) SBCS t0, acc1
SBC t1, acc2
MUL const3, acc2, t0 // t0 = L(acc2*p3)
ADCS t0, acc1, acc1 // (carry3,acc1) = acc1 + L(acc2*p3)
UMULH const3, acc2, y1 // y1 = H(acc2*p3)
ADC $0, acc6 // acc6 = carry3 + acc6
ADDS acc2, acc3, acc3 // (carry4, acc3) = acc2 + acc3 + L(acc2*p1)
ADCS y0, acc0, acc0 // (carry5, acc0) = carry4 + acc0 + L(acc2*p2) + H(acc2*p1)
ADCS hlp0, acc1, acc1 // (carry6, acc1) = carry5 + acc1 + L(acc2*p3) + H(acc2*p2)
ADC $0, y1, acc2 // acc2 = carry6 + H(acc2*p3)
// y[3] * x // y[3] * x
MUL y3, x0, t0 MUL y3, x0, t0
@@ -1144,24 +1077,18 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
ADCS y1, acc6 ADCS y1, acc6
ADC hlp0, acc7 ADC hlp0, acc7
// Last reduction step // Last reduction step
MUL const1, acc3, t0 LSL $32, acc3, t0
ADDS t0, acc0, acc0 // (carry1, acc0) = acc0 + L(acc3*p1) LSR $32, acc3, t1
UMULH const1, acc3, y0 // y0 = H(acc3*p1)
ADDS acc3, acc0, acc0
ADCS $0, acc1, acc1
ADCS $0, acc2, acc2
ADC $0, acc3, acc3
MUL const2, acc3, t0 SUBS t0, acc0
ADCS t0, acc1, acc1 // (carry2, acc1) = acc1 + L(acc3*p2) SBCS t1, acc1
UMULH const2, acc3, hlp0 // hlp0 = H(acc3*p2) SBCS t0, acc2
SBC t1, acc3
MUL const3, acc3, t0 // t0 = L(acc3*p3)
ADCS t0, acc2, acc2 // (carry3,acc2) = acc2 + L(acc3*p3)
UMULH const3, acc3, y1 // y1 = H(acc3*p3)
ADC $0, acc7 // acc7 = carry3 + acc7
ADDS acc3, acc0, acc0 // (carry4, acc0) = acc3 + acc0 + L(acc3*p1)
ADCS y0, acc1, acc1 // (carry5, acc1) = carry4 + acc1 + L(acc3*p2) + H(acc3*p1)
ADCS hlp0, acc2, acc2 // (carry6, acc2) = carry5 + acc2 + L(acc3*p3) + H(acc3*p2)
ADC $0, y1, acc3 // acc3 = carry6 + H(acc3*p3)
// Add bits [511:256] of the mul result // Add bits [511:256] of the mul result
ADDS acc4, acc0, acc0 ADDS acc4, acc0, acc0