Roll back the reduction process to use multiplication and addition

This commit is contained in:
Emman 2021-12-17 16:50:47 +08:00
parent 996ab5047f
commit 49e0071a8a

View File

@ -246,82 +246,94 @@ sqrLoop:
ADCQ DX, t1 ADCQ DX, t1
MOVQ t1, x_ptr MOVQ t1, x_ptr
// First reduction step // First reduction step
MOVQ acc0, t1 MOVQ p256p<>+0x08(SB), AX
MULQ acc0
MOVQ t1, AX ADDQ acc0, acc1
MOVQ t1, DX ADCQ $0, DX
SHLQ $32, AX ADDQ AX, acc1
SHRQ $32, DX ADCQ $0, DX
MOVQ DX, t1
ADDQ t1, acc1 MOVQ p256p<>+0x010(SB), AX
ADCQ $0, acc2 MULQ acc0
SUBQ AX, acc1
SBBQ DX, acc2
ADDQ t1, acc3
ADCQ $0, acc0
SUBQ AX, acc3
SBBQ DX, acc0
SUBQ t1, acc3
SBBQ $0, acc0
// Second reduction step
MOVQ acc1, t1
MOVQ t1, AX
MOVQ t1, DX
SHLQ $32, AX
SHRQ $32, DX
ADDQ t1, acc2 ADDQ t1, acc2
ADCQ $0, acc3 ADCQ $0, DX
SUBQ AX, acc2 ADDQ AX, acc2
SBBQ DX, acc3 ADCQ $0, DX
MOVQ DX, t1
ADDQ t1, acc0 MOVQ p256p<>+0x018(SB), AX
ADCQ $0, acc1 MULQ acc0
SUBQ AX, acc0
SBBQ DX, acc1
SUBQ t1, acc0
SBBQ $0, acc1
// Third reduction step
MOVQ acc2, t1
MOVQ t1, AX
MOVQ t1, DX
SHLQ $32, AX
SHRQ $32, DX
ADDQ t1, acc3 ADDQ t1, acc3
ADCQ $0, acc0 ADCQ $0, DX
SUBQ AX, acc3 ADDQ AX, acc3
SBBQ DX, acc0 ADCQ $0, DX
MOVQ DX, acc0
// Second reduction step
MOVQ p256p<>+0x08(SB), AX
MULQ acc1
ADDQ acc1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x010(SB), AX
MULQ acc1
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x018(SB), AX
MULQ acc1
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ $0, DX
MOVQ DX, acc1
// Third reduction step
MOVQ p256p<>+0x08(SB), AX
MULQ acc2
ADDQ acc2, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x010(SB), AX
MULQ acc2
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x018(SB), AX
MULQ acc2
ADDQ t1, acc1 ADDQ t1, acc1
ADCQ $0, acc2 ADCQ $0, DX
SUBQ AX, acc1 ADDQ AX, acc1
SBBQ DX, acc2 ADCQ $0, DX
SUBQ t1, acc1 MOVQ DX, acc2
SBBQ $0, acc2
// Last reduction step // Last reduction step
XORQ t0, t0 XORQ t0, t0
MOVQ acc3, t1 MOVQ p256p<>+0x08(SB), AX
MULQ acc3
MOVQ t1, AX ADDQ acc3, acc0
MOVQ t1, DX ADCQ $0, DX
SHLQ $32, AX ADDQ AX, acc0
SHRQ $32, DX ADCQ $0, DX
MOVQ DX, t1
ADDQ t1, acc0 MOVQ p256p<>+0x010(SB), AX
ADCQ $0, acc1 MULQ acc3
SUBQ AX, acc0 ADDQ t1, acc1
SBBQ DX, acc1 ADCQ $0, DX
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256p<>+0x018(SB), AX
MULQ acc3
ADDQ t1, acc2 ADDQ t1, acc2
ADCQ $0, acc3 ADCQ $0, DX
SUBQ AX, acc2 ADDQ AX, acc2
SBBQ DX, acc3 ADCQ $0, DX
SUBQ t1, acc2 MOVQ DX, acc3
SBBQ $0, acc3
// Add bits [511:256] of the sqr result // Add bits [511:256] of the sqr result
ADCQ acc4, acc0 ADCQ acc4, acc0
@ -388,24 +400,26 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
MOVQ DX, acc4 MOVQ DX, acc4
XORQ acc5, acc5 XORQ acc5, acc5
// First reduction step // First reduction step
MOVQ acc0, AX MOVQ p256p<>+0x08(SB), AX
MOVQ acc0, DX MULQ acc0
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc0, acc1 ADDQ acc0, acc1
ADCQ $0, acc2 ADCQ $0, DX
SUBQ AX, acc1 ADDQ AX, acc1
SBBQ DX, acc2 ADCQ $0, DX
MOVQ DX, t1
MOVQ acc0, t1 MOVQ p256p<>+0x010(SB), AX
ADDQ acc0, acc3 MULQ acc0
ADCQ $0, t1 ADDQ t1, acc2
SUBQ AX, acc3 ADCQ $0, DX
SBBQ DX, t1 ADDQ AX, acc2
SUBQ acc0, acc3 ADCQ $0, DX
SBBQ $0, t1 MOVQ DX, t1
ADDQ t1, acc4 MOVQ p256p<>+0x018(SB), AX
MULQ acc0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ DX, acc4
ADCQ $0, acc5 ADCQ $0, acc5
XORQ acc0, acc0 XORQ acc0, acc0
@ -442,24 +456,26 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
ADCQ DX, acc5 ADCQ DX, acc5
ADCQ $0, acc0 ADCQ $0, acc0
// Second reduction step // Second reduction step
MOVQ acc1, AX MOVQ p256p<>+0x08(SB), AX
MOVQ acc1, DX MULQ acc1
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc1, acc2 ADDQ acc1, acc2
ADCQ $0, acc3 ADCQ $0, DX
SUBQ AX, acc2 ADDQ AX, acc2
SBBQ DX, acc3 ADCQ $0, DX
MOVQ DX, t1
MOVQ acc1, t1 MOVQ p256p<>+0x010(SB), AX
ADDQ acc1, acc4 MULQ acc1
ADCQ $0, t1 ADDQ t1, acc3
SUBQ AX, acc4 ADCQ $0, DX
SBBQ DX, t1 ADDQ AX, acc3
SUBQ acc1, acc4 ADCQ $0, DX
SBBQ $0, t1 MOVQ DX, t1
ADDQ t1, acc5 MOVQ p256p<>+0x018(SB), AX
MULQ acc1
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ DX, acc5
ADCQ $0, acc0 ADCQ $0, acc0
XORQ acc1, acc1 XORQ acc1, acc1
@ -496,24 +512,26 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
ADCQ DX, acc0 ADCQ DX, acc0
ADCQ $0, acc1 ADCQ $0, acc1
// Third reduction step // Third reduction step
MOVQ acc2, AX MOVQ p256p<>+0x08(SB), AX
MOVQ acc2, DX MULQ acc2
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc2, acc3 ADDQ acc2, acc3
ADCQ $0, acc4 ADCQ $0, DX
SUBQ AX, acc3 ADDQ AX, acc3
SBBQ DX, acc4 ADCQ $0, DX
MOVQ DX, t1
MOVQ acc2, t1 MOVQ p256p<>+0x010(SB), AX
ADDQ acc2, acc5 MULQ acc2
ADCQ $0, t1 ADDQ t1, acc4
SUBQ AX, acc5 ADCQ $0, DX
SBBQ DX, t1 ADDQ AX, acc4
SUBQ acc2, acc5 ADCQ $0, DX
SBBQ $0, t1 MOVQ DX, t1
ADDQ t1, acc0 MOVQ p256p<>+0x018(SB), AX
MULQ acc2
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ DX, acc0
ADCQ $0, acc1 ADCQ $0, acc1
XORQ acc2, acc2 XORQ acc2, acc2
// x * y[3] // x * y[3]
@ -549,24 +567,26 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
ADCQ DX, acc1 ADCQ DX, acc1
ADCQ $0, acc2 ADCQ $0, acc2
// Last reduction step // Last reduction step
MOVQ acc3, AX MOVQ p256p<>+0x08(SB), AX
MOVQ acc3, DX MULQ acc3
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc3, acc4 ADDQ acc3, acc4
ADCQ $0, acc5 ADCQ $0, DX
SUBQ AX, acc4 ADDQ AX, acc4
SBBQ DX, acc5 ADCQ $0, DX
MOVQ DX, t1
MOVQ acc3, t1 MOVQ p256p<>+0x010(SB), AX
ADDQ acc3, acc0 MULQ acc3
ADCQ $0, t1 ADDQ t1, acc5
SUBQ AX, acc0 ADCQ $0, DX
SBBQ DX, t1 ADDQ AX, acc5
SUBQ acc3, acc0 ADCQ $0, DX
SBBQ $0, t1 MOVQ DX, t1
ADDQ t1, acc1 MOVQ p256p<>+0x018(SB), AX
MULQ acc3
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
ADCQ $0, acc2 ADCQ $0, acc2
// Copy result [255:0] // Copy result [255:0]
MOVQ acc4, x_ptr MOVQ acc4, x_ptr
@ -605,85 +625,93 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
// Only reduce, no multiplications are needed // Only reduce, no multiplications are needed
// First stage // First stage
MOVQ acc0, AX MOVQ p256p<>+0x08(SB), AX
MOVQ acc0, DX MULQ acc0
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc0, acc1 ADDQ acc0, acc1
ADCQ $0, acc2 ADCQ $0, DX
SUBQ AX, acc1 ADDQ AX, acc1
SBBQ DX, acc2 ADCQ $0, DX
MOVQ DX, t1
MOVQ acc0, t1 MOVQ p256p<>+0x010(SB), AX
ADDQ acc0, acc3 MULQ acc0
ADCQ $0, t1 ADDQ t1, acc2
SUBQ AX, acc3 ADCQ $0, DX
SBBQ DX, t1 ADDQ AX, acc2
SUBQ acc0, acc3 ADCQ $0, DX
SBBQ $0, t1 MOVQ DX, t1
ADDQ t1, acc4 MOVQ p256p<>+0x018(SB), AX
MULQ acc0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ DX, acc4
XORQ acc5, acc5 XORQ acc5, acc5
// Second stage // Second stage
MOVQ acc1, AX MOVQ p256p<>+0x08(SB), AX
MOVQ acc1, DX MULQ acc1
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc1, acc2 ADDQ acc1, acc2
ADCQ $0, acc3 ADCQ $0, DX
SUBQ AX, acc2 ADDQ AX, acc2
SBBQ DX, acc3 ADCQ $0, DX
MOVQ DX, t1
MOVQ acc1, t1 MOVQ p256p<>+0x010(SB), AX
ADDQ acc1, acc4 MULQ acc1
ADCQ $0, t1 ADDQ t1, acc3
SUBQ AX, acc4 ADCQ $0, DX
SBBQ DX, t1 ADDQ AX, acc3
SUBQ acc1, acc4 ADCQ $0, DX
SBBQ $0, t1 MOVQ DX, t1
ADDQ t1, acc5 MOVQ p256p<>+0x018(SB), AX
MULQ acc1
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ DX, acc5
XORQ acc0, acc0 XORQ acc0, acc0
// Third stage // Third stage
MOVQ acc2, AX MOVQ p256p<>+0x08(SB), AX
MOVQ acc2, DX MULQ acc2
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc2, acc3 ADDQ acc2, acc3
ADCQ $0, acc4 ADCQ $0, DX
SUBQ AX, acc3 ADDQ AX, acc3
SBBQ DX, acc4 ADCQ $0, DX
MOVQ DX, t1
MOVQ acc2, t1 MOVQ p256p<>+0x010(SB), AX
ADDQ acc2, acc5 MULQ acc2
ADCQ $0, t1 ADDQ t1, acc4
SUBQ AX, acc5 ADCQ $0, DX
SBBQ DX, t1 ADDQ AX, acc4
SUBQ acc2, acc5 ADCQ $0, DX
SBBQ $0, t1 MOVQ DX, t1
ADDQ t1, acc0 MOVQ p256p<>+0x018(SB), AX
MULQ acc2
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ DX, acc0
XORQ acc1, acc1 XORQ acc1, acc1
// Last stage // Last stage
MOVQ acc3, AX MOVQ p256p<>+0x08(SB), AX
MOVQ acc3, DX MULQ acc3
SHLQ $32, AX
SHRQ $32, DX
ADDQ acc3, acc4 ADDQ acc3, acc4
ADCQ $0, acc5 ADCQ $0, DX
SUBQ AX, acc4 ADDQ AX, acc4
SBBQ DX, acc5 ADCQ $0, DX
MOVQ DX, t1
MOVQ acc3, t1 MOVQ p256p<>+0x010(SB), AX
ADDQ acc3, acc0 MULQ acc3
ADCQ $0, t1 ADDQ t1, acc5
SUBQ AX, acc0 ADCQ $0, DX
SBBQ DX, t1 ADDQ AX, acc5
SUBQ acc3, acc0 ADCQ $0, DX
SBBQ $0, t1 MOVQ DX, t1
ADDQ t1, acc1 MOVQ p256p<>+0x018(SB), AX
MULQ acc3
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
MOVQ acc4, x_ptr MOVQ acc4, x_ptr
MOVQ acc5, acc3 MOVQ acc5, acc3
@ -1563,81 +1591,93 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$0
ADCQ $0, mul1 ADCQ $0, mul1
MOVQ mul1, acc7 MOVQ mul1, acc7
// First reduction step // First reduction step
MOVQ acc0, hlp MOVQ p256p<>+0x08(SB), mul0
MULQ acc0
MOVQ hlp, mul0 ADDQ acc0, acc1
MOVQ hlp, mul1 ADCQ $0, mul1
SHLQ $32, mul0 ADDQ mul0, acc1
SHRQ $32, mul1 ADCQ $0, mul1
MOVQ mul1, hlp
ADDQ hlp, acc1 MOVQ p256p<>+0x010(SB), mul0
ADCQ $0, acc2 MULQ acc0
SUBQ mul0, acc1 ADDQ hlp, acc2
SBBQ mul1, acc2 ADCQ $0, mul1
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x018(SB), mul0
MULQ acc0
ADDQ hlp, acc3 ADDQ hlp, acc3
ADCQ $0, acc0 ADCQ $0, mul1
SUBQ mul0, acc3 ADDQ mul0, acc3
SBBQ mul1, acc0 ADCQ $0, mul1
SUBQ hlp, acc3 MOVQ mul1, acc0
SBBQ $0, acc0
// Second reduction step // Second reduction step
MOVQ acc1, hlp MOVQ p256p<>+0x08(SB), mul0
MULQ acc1
MOVQ hlp, mul0 ADDQ acc1, acc2
MOVQ hlp, mul1 ADCQ $0, mul1
SHLQ $32, mul0 ADDQ mul0, acc2
SHRQ $32, mul1 ADCQ $0, mul1
MOVQ mul1, hlp
ADDQ hlp, acc2 MOVQ p256p<>+0x010(SB), mul0
ADCQ $0, acc3 MULQ acc1
SUBQ mul0, acc2
SBBQ mul1, acc3
ADDQ hlp, acc0
ADCQ $0, acc1
SUBQ mul0, acc0
SBBQ mul1, acc1
SUBQ hlp, acc0
SBBQ $0, acc1
// Third reduction step
MOVQ acc2, hlp
MOVQ hlp, mul0
MOVQ hlp, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ hlp, acc3 ADDQ hlp, acc3
ADCQ $0, acc0 ADCQ $0, mul1
SUBQ mul0, acc3 ADDQ mul0, acc3
SBBQ mul1, acc0 ADCQ $0, mul1
MOVQ mul1, hlp
ADDQ hlp, acc1 MOVQ p256p<>+0x018(SB), mul0
ADCQ $0, acc2 MULQ acc1
SUBQ mul0, acc1
SBBQ mul1, acc2
SUBQ hlp, acc1
SBBQ $0, acc2
// Last reduction step
MOVQ acc3, hlp
MOVQ hlp, mul0
MOVQ hlp, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ hlp, acc0 ADDQ hlp, acc0
ADCQ $0, acc1 ADCQ $0, mul1
SUBQ mul0, acc0 ADDQ mul0, acc0
SBBQ mul1, acc1 ADCQ $0, mul1
MOVQ mul1, acc1
// Third reduction step
MOVQ p256p<>+0x08(SB), mul0
MULQ acc2
ADDQ acc2, acc3
ADCQ $0, mul1
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x010(SB), mul0
MULQ acc2
ADDQ hlp, acc0
ADCQ $0, mul1
ADDQ mul0, acc0
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x018(SB), mul0
MULQ acc2
ADDQ hlp, acc1
ADCQ $0, mul1
ADDQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, acc2
// Last reduction step
MOVQ p256p<>+0x08(SB), mul0
MULQ acc3
ADDQ acc3, acc0
ADCQ $0, mul1
ADDQ mul0, acc0
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x010(SB), mul0
MULQ acc3
ADDQ hlp, acc1
ADCQ $0, mul1
ADDQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x018(SB), mul0
MULQ acc3
ADDQ hlp, acc2 ADDQ hlp, acc2
ADCQ $0, acc3 ADCQ $0, mul1
SUBQ mul0, acc2 ADDQ mul0, acc2
SBBQ mul1, acc3 ADCQ $0, mul1
SUBQ hlp, acc2 MOVQ mul1, acc3
SBBQ $0, acc3
MOVQ $0, BP MOVQ $0, BP
// Add bits [511:256] of the result // Add bits [511:256] of the result
ADCQ acc0, acc4 ADCQ acc0, acc4
@ -1737,81 +1777,93 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$0
ADCQ mul0, t2 ADCQ mul0, t2
ADCQ DX, t3 ADCQ DX, t3
// First reduction step // First reduction step
MOVQ acc0, hlp MOVQ p256p<>+0x08(SB), mul0
MULQ acc0
MOVQ hlp, mul0 ADDQ acc0, acc1
MOVQ hlp, mul1 ADCQ $0, mul1
SHLQ $32, mul0 ADDQ mul0, acc1
SHRQ $32, mul1 ADCQ $0, mul1
MOVQ mul1, hlp
ADDQ hlp, acc1 MOVQ p256p<>+0x010(SB), mul0
ADCQ $0, acc2 MULQ acc0
SUBQ mul0, acc1 ADDQ hlp, acc2
SBBQ mul1, acc2 ADCQ $0, mul1
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x018(SB), mul0
MULQ acc0
ADDQ hlp, acc3 ADDQ hlp, acc3
ADCQ $0, acc0 ADCQ $0, mul1
SUBQ mul0, acc3 ADDQ mul0, acc3
SBBQ mul1, acc0 ADCQ $0, mul1
SUBQ hlp, acc3 MOVQ mul1, acc0
SBBQ $0, acc0
// Second reduction step // Second reduction step
MOVQ acc1, hlp MOVQ p256p<>+0x08(SB), mul0
MULQ acc1
MOVQ hlp, mul0 ADDQ acc1, acc2
MOVQ hlp, mul1 ADCQ $0, mul1
SHLQ $32, mul0 ADDQ mul0, acc2
SHRQ $32, mul1 ADCQ $0, mul1
MOVQ mul1, hlp
ADDQ hlp, acc2 MOVQ p256p<>+0x010(SB), mul0
ADCQ $0, acc3 MULQ acc1
SUBQ mul0, acc2
SBBQ mul1, acc3
ADDQ hlp, acc0
ADCQ $0, acc1
SUBQ mul0, acc0
SBBQ mul1, acc1
SUBQ hlp, acc0
SBBQ $0, acc1
// Third reduction step
MOVQ acc2, hlp
MOVQ hlp, mul0
MOVQ hlp, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ hlp, acc3 ADDQ hlp, acc3
ADCQ $0, acc0 ADCQ $0, mul1
SUBQ mul0, acc3 ADDQ mul0, acc3
SBBQ mul1, acc0 ADCQ $0, mul1
MOVQ mul1, hlp
ADDQ hlp, acc1 MOVQ p256p<>+0x018(SB), mul0
ADCQ $0, acc2 MULQ acc1
SUBQ mul0, acc1
SBBQ mul1, acc2
SUBQ hlp, acc1
SBBQ $0, acc2
// Last reduction step
MOVQ acc3, hlp
MOVQ hlp, mul0
MOVQ hlp, mul1
SHLQ $32, mul0
SHRQ $32, mul1
ADDQ hlp, acc0 ADDQ hlp, acc0
ADCQ $0, acc1 ADCQ $0, mul1
SUBQ mul0, acc0 ADDQ mul0, acc0
SBBQ mul1, acc1 ADCQ $0, mul1
MOVQ mul1, acc1
// Third reduction step
MOVQ p256p<>+0x08(SB), mul0
MULQ acc2
ADDQ acc2, acc3
ADCQ $0, mul1
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x010(SB), mul0
MULQ acc2
ADDQ hlp, acc0
ADCQ $0, mul1
ADDQ mul0, acc0
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x018(SB), mul0
MULQ acc2
ADDQ hlp, acc1
ADCQ $0, mul1
ADDQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, acc2
// Last reduction step
MOVQ p256p<>+0x08(SB), mul0
MULQ acc3
ADDQ acc3, acc0
ADCQ $0, mul1
ADDQ mul0, acc0
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x010(SB), mul0
MULQ acc3
ADDQ hlp, acc1
ADCQ $0, mul1
ADDQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ p256p<>+0x018(SB), mul0
MULQ acc3
ADDQ hlp, acc2 ADDQ hlp, acc2
ADCQ $0, acc3 ADCQ $0, mul1
SUBQ mul0, acc2 ADDQ mul0, acc2
SBBQ mul1, acc3 ADCQ $0, mul1
SUBQ hlp, acc2 MOVQ mul1, acc3
SBBQ $0, acc3
MOVQ $0, BP MOVQ $0, BP
// Add bits [511:256] of the result // Add bits [511:256] of the result
ADCQ acc0, t0 ADCQ acc0, t0