MAGIC - optimize p256Sqr, avoid mul

emmansun 2021-02-12 11:55:23 +08:00
parent c0199ac104
commit b97c484b85
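The optimization: the SM2 prime p = 2^256 − 2^224 − 2^96 + 2^64 − 1 satisfies p ≡ −1 (mod 2^64), so in each Montgomery reduction step the multiplier is the lowest limb m itself, and m·p can be folded in using only the two 64-bit halves of m·2^32 (the SHLQ/SHRQ $32 pair in the new code) instead of three MULQs against the stored prime limbs. Below is a minimal Go sketch of one such reduction step, cross-checked with math/big; the names and the explicit fifth limb t4 are illustrative, not from the repository (the assembly rotates its acc registers instead):

	package main

	import (
		"fmt"
		"math/big"
		"math/bits"
	)

	// sm2P256 is the SM2 prime p = 2^256 - 2^224 - 2^96 + 2^64 - 1,
	// i.e. the little-endian limbs stored in p256p<> below.
	var sm2P256, _ = new(big.Int).SetString(
		"fffffffeffffffffffffffffffffffffffffffff00000000ffffffffffffffff", 16)

	// reduceStep computes (acc + acc[0]*p) / 2^64 for a 4-limb accumulator.
	// Because p ≡ -1 (mod 2^64), the Montgomery multiplier is m = acc[0]
	// itself, and m*p folds in with 32-bit shifts only:
	//
	//	m*p = m*2^256 - m*2^224 - m*2^96 + m*2^64 - m
	func reduceStep(acc [4]uint64) [4]uint64 {
		m := acc[0]
		lo, hi := m<<32, m>>32 // the two halves of m*2^32 (SHLQ/SHRQ $32)

		var c, b, t4 uint64
		// +m*2^64 and +m*2^256; the -m term cancels limb 0 exactly (acc[0] == m).
		acc[1], c = bits.Add64(acc[1], m, 0)
		acc[2], c = bits.Add64(acc[2], 0, c)
		acc[3], c = bits.Add64(acc[3], 0, c)
		t4 = m + c // explicit fifth limb; the assembly reuses the freed acc0 here
		// -m*2^96 = -(m*2^32)*2^64: subtract lo:hi starting at limb 1.
		acc[1], b = bits.Sub64(acc[1], lo, 0)
		acc[2], b = bits.Sub64(acc[2], hi, b)
		acc[3], b = bits.Sub64(acc[3], 0, b)
		t4 -= b
		// -m*2^224 = -(m*2^32)*2^192: subtract lo:hi starting at limb 3.
		acc[3], b = bits.Sub64(acc[3], lo, 0)
		t4 -= hi + b

		return [4]uint64{acc[1], acc[2], acc[3], t4} // the /2^64 limb shift
	}

	func main() {
		toBig := func(l [4]uint64) *big.Int {
			x := new(big.Int)
			for i := 3; i >= 0; i-- {
				x.Lsh(x, 64).Or(x, new(big.Int).SetUint64(l[i]))
			}
			return x
		}
		in := [4]uint64{0x1122334455667788, 0x99aabbccddeeff00,
			0x0123456789abcdef, 0xfedcba9876543210}
		out := reduceStep(in)
		// Check out*2^64 == in + in[0]*p.
		want := new(big.Int).Mul(new(big.Int).SetUint64(in[0]), sm2P256)
		want.Add(want, toBig(in))
		fmt.Println("match:", new(big.Int).Lsh(toBig(out), 64).Cmp(want) == 0)
	}

Four applications of this step absorb all four low limbs of the 512-bit product, which is exactly what the rewritten loop below does.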


@@ -1,7 +1,3 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
 // This file contains constant-time, 64-bit assembly implementation of
 // P256. The optimizations performed here are described in detail in:
 // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
@@ -24,8 +20,6 @@
 #define t0 R14
 #define t1 R15
 
-DATA p256const0<>+0x00(SB)/8, $0xffffffff00000000
-DATA p256const1<>+0x00(SB)/8, $0xfffffffeffffffff
 DATA p256p<>+0x00(SB)/8, $0xffffffffffffffff
 DATA p256p<>+0x08(SB)/8, $0xffffffff00000000
 DATA p256p<>+0x10(SB)/8, $0xffffffffffffffff
@@ -39,8 +33,6 @@ DATA p256one<>+0x00(SB)/8, $0x0000000000000001
 DATA p256one<>+0x08(SB)/8, $0x00000000ffffffff
 DATA p256one<>+0x10(SB)/8, $0x0000000000000000
 DATA p256one<>+0x18(SB)/8, $0x0000000100000000
-GLOBL p256const0<>(SB), 8, $8
-GLOBL p256const1<>(SB), 8, $8
 GLOBL p256p<>(SB), RODATA, $32
 GLOBL p256ordK0<>(SB), RODATA, $8
 GLOBL p256ord<>(SB), RODATA, $32
@@ -254,94 +246,82 @@ sqrLoop:
 	ADCQ DX, t1
 	MOVQ t1, x_ptr
 
 	// First reduction step
-	MOVQ p256p<>+0x08(SB), AX
-	MULQ acc0
-	ADDQ acc0, acc1
-	ADCQ $0, DX
-	ADDQ AX, acc1
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x010(SB), AX
-	MULQ acc0
-	ADDQ t1, acc2
-	ADCQ $0, DX
-	ADDQ AX, acc2
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x018(SB), AX
-	MULQ acc0
-	ADDQ t1, acc3
-	ADCQ $0, DX
-	ADDQ AX, acc3
-	ADCQ $0, DX
-	MOVQ DX, acc0
+	MOVQ acc0, t1
+
+	MOVQ t1, AX
+	MOVQ t1, DX
+	SHLQ $32, AX
+	SHRQ $32, DX
+
+	ADDQ t1, acc1
+	ADCQ $0, acc2
+	SUBQ AX, acc1
+	SBBQ DX, acc2
+
+	ADDQ t1, acc3
+	ADCQ $0, acc0
+	SUBQ AX, acc3
+	SBBQ DX, acc0
+	SUBQ t1, acc3
+	SBBQ $0, acc0
 	// Second reduction step
-	MOVQ p256p<>+0x08(SB), AX
-	MULQ acc1
-	ADDQ acc1, acc2
-	ADCQ $0, DX
-	ADDQ AX, acc2
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x010(SB), AX
-	MULQ acc1
-	ADDQ t1, acc3
-	ADCQ $0, DX
-	ADDQ AX, acc3
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x018(SB), AX
-	MULQ acc1
-	ADDQ t1, acc0
-	ADCQ $0, DX
-	ADDQ AX, acc0
-	ADCQ $0, DX
-	MOVQ DX, acc1
-	// Third reduction step
-	MOVQ p256p<>+0x08(SB), AX
-	MULQ acc2
-	ADDQ acc2, acc3
-	ADCQ $0, DX
-	ADDQ AX, acc3
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x010(SB), AX
-	MULQ acc2
-	ADDQ t1, acc0
-	ADCQ $0, DX
-	ADDQ AX, acc0
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x018(SB), AX
-	MULQ acc2
-	ADDQ t1, acc1
-	ADCQ $0, DX
-	ADDQ AX, acc1
-	ADCQ $0, DX
-	MOVQ DX, acc2
+	MOVQ acc1, t1
+
+	MOVQ t1, AX
+	MOVQ t1, DX
+	SHLQ $32, AX
+	SHRQ $32, DX
+
+	ADDQ t1, acc2
+	ADCQ $0, acc3
+	SUBQ AX, acc2
+	SBBQ DX, acc3
+
+	ADDQ t1, acc0
+	ADCQ $0, acc1
+	SUBQ AX, acc0
+	SBBQ DX, acc1
+	SUBQ t1, acc0
+	SBBQ $0, acc1
+	// Third reduction step
+	MOVQ acc2, t1
+
+	MOVQ t1, AX
+	MOVQ t1, DX
+	SHLQ $32, AX
+	SHRQ $32, DX
+
+	ADDQ t1, acc3
+	ADCQ $0, acc0
+	SUBQ AX, acc3
+	SBBQ DX, acc0
+
+	ADDQ t1, acc1
+	ADCQ $0, acc2
+	SUBQ AX, acc1
+	SBBQ DX, acc2
+	SUBQ t1, acc1
+	SBBQ $0, acc2
 	// Last reduction step
 	XORQ t0, t0
-	MOVQ p256p<>+0x08(SB), AX
-	MULQ acc3
-	ADDQ acc3, acc0
-	ADCQ $0, DX
-	ADDQ AX, acc0
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x010(SB), AX
-	MULQ acc3
-	ADDQ t1, acc1
-	ADCQ $0, DX
-	ADDQ AX, acc1
-	ADCQ $0, DX
-	MOVQ DX, t1
-	MOVQ p256p<>+0x018(SB), AX
-	MULQ acc3
-	ADDQ t1, acc2
-	ADCQ $0, DX
-	ADDQ AX, acc2
-	ADCQ $0, DX
-	MOVQ DX, acc3
+	MOVQ acc3, t1
+
+	MOVQ t1, AX
+	MOVQ t1, DX
+	SHLQ $32, AX
+	SHRQ $32, DX
+
+	ADDQ t1, acc0
+	ADCQ $0, acc1
+	SUBQ AX, acc0
+	SBBQ DX, acc1
+
+	ADDQ t1, acc2
+	ADCQ $0, acc3
+	SUBQ AX, acc2
+	SBBQ DX, acc3
+	SUBQ t1, acc2
+	SBBQ $0, acc3
 	// Add bits [511:256] of the sqr result
 	ADCQ acc4, acc0
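Note how each rewritten step reuses registers: t1 holds the absorbed limb m, AX/DX hold the two halves of m·2^32, the ADDQ t1 / SUBQ AX / SBBQ DX groups apply +m·2^64 − m·2^96 to the middle limbs and −m·2^224 to the top, and the register that held m (acc0 in the first step) is kept as the incoming top limb — it already contains m, which is exactly the +m·2^256 term. The four steps therefore rotate the roles of acc0–acc3 instead of moving data.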
@@ -356,9 +336,9 @@ sqrLoop:
 	MOVQ acc3, t1
 	// Subtract p256
 	SUBQ $-1, acc0
-	SBBQ p256const0<>(SB) ,acc1
+	SBBQ p256p<>+0x08(SB), acc1
 	SBBQ $-1, acc2
-	SBBQ p256const1<>(SB), acc3
+	SBBQ p256p<>+0x018(SB), acc3
 	SBBQ $0, t0
 	CMOVQCS acc4, acc0
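This epilogue is the standard constant-time conditional subtraction: subtract p with a full borrow chain (SUBQ/SBBQ against $-1 and the stored p256p limbs), then CMOVQCS keeps the pre-subtraction value when the subtraction borrows. A self-contained Go sketch of the same selection, using a mask in place of CMOV; the function name and mask trick are illustrative:

	package main

	import (
		"fmt"
		"math/bits"
	)

	// condSubP reduces r modulo p once, given the carry bit that
	// overflowed out of r's top limb. The constants are the p256p<>
	// limbs from the assembly; the mask select stands in for CMOVQCS.
	func condSubP(r [4]uint64, carry uint64) [4]uint64 {
		var s [4]uint64
		var b uint64
		s[0], b = bits.Sub64(r[0], 0xffffffffffffffff, 0) // SUBQ $-1
		s[1], b = bits.Sub64(r[1], 0xffffffff00000000, b) // SBBQ p256p<>+0x08
		s[2], b = bits.Sub64(r[2], 0xffffffffffffffff, b) // SBBQ $-1
		s[3], b = bits.Sub64(r[3], 0xfffffffeffffffff, b) // SBBQ p256p<>+0x018
		_, b = bits.Sub64(carry, 0, b)                    // SBBQ $0, t0
		mask := -b // all ones if the subtraction borrowed (r < p), else zero
		for i := range s {
			s[i] = (r[i] & mask) | (s[i] &^ mask) // CMOVQCS
		}
		return s
	}

	func main() {
		fmt.Println(condSubP([4]uint64{1, 0, 0, 0}, 0)) // 1 < p: prints [1 0 0 0]
	}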
@@ -408,22 +388,22 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
 	MOVQ DX, acc4
 	XORQ acc5, acc5
 	// First reduction step
 	MOVQ p256p<>+0x08(SB), AX
 	MULQ acc0
 	ADDQ acc0, acc1
 	ADCQ $0, DX
 	ADDQ AX, acc1
 	ADCQ $0, DX
 	MOVQ DX, t1
 	MOVQ p256p<>+0x010(SB), AX
 	MULQ acc0
 	ADDQ t1, acc2
 	ADCQ $0, DX
 	ADDQ AX, acc2
 	ADCQ $0, DX
 	MOVQ DX, t1
 	MOVQ p256p<>+0x018(SB), AX
 	MULQ acc0
 	ADDQ t1, acc3
 	ADCQ $0, DX
 	ADDQ AX, acc3
@ -464,22 +444,22 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
ADCQ DX, acc5 ADCQ DX, acc5
ADCQ $0, acc0 ADCQ $0, acc0
// Second reduction step // Second reduction step
MOVQ p256p<>+0x08(SB), AX MOVQ p256p<>+0x08(SB), AX
MULQ acc1 MULQ acc1
ADDQ acc1, acc2 ADDQ acc1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX ADCQ $0, DX
MOVQ DX, t1 ADDQ AX, acc2
MOVQ p256p<>+0x010(SB), AX ADCQ $0, DX
MULQ acc1 MOVQ DX, t1
MOVQ p256p<>+0x010(SB), AX
MULQ acc1
ADDQ t1, acc3 ADDQ t1, acc3
ADCQ $0, DX ADCQ $0, DX
ADDQ AX, acc3 ADDQ AX, acc3
ADCQ $0, DX ADCQ $0, DX
MOVQ DX, t1 MOVQ DX, t1
MOVQ p256p<>+0x018(SB), AX MOVQ p256p<>+0x018(SB), AX
MULQ acc1 MULQ acc1
ADDQ t1, acc4 ADDQ t1, acc4
ADCQ $0, DX ADCQ $0, DX
ADDQ AX, acc4 ADDQ AX, acc4
@ -520,22 +500,22 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
ADCQ DX, acc0 ADCQ DX, acc0
ADCQ $0, acc1 ADCQ $0, acc1
// Third reduction step // Third reduction step
MOVQ p256p<>+0x08(SB), AX MOVQ p256p<>+0x08(SB), AX
MULQ acc2 MULQ acc2
ADDQ acc2, acc3 ADDQ acc2, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX ADCQ $0, DX
MOVQ DX, t1 ADDQ AX, acc3
MOVQ p256p<>+0x010(SB), AX ADCQ $0, DX
MULQ acc2 MOVQ DX, t1
MOVQ p256p<>+0x010(SB), AX
MULQ acc2
ADDQ t1, acc4 ADDQ t1, acc4
ADCQ $0, DX ADCQ $0, DX
ADDQ AX, acc4 ADDQ AX, acc4
ADCQ $0, DX ADCQ $0, DX
MOVQ DX, t1 MOVQ DX, t1
MOVQ p256p<>+0x018(SB), AX MOVQ p256p<>+0x018(SB), AX
MULQ acc2 MULQ acc2
ADDQ t1, acc5 ADDQ t1, acc5
ADCQ $0, DX ADCQ $0, DX
ADDQ AX, acc5 ADDQ AX, acc5
@ -575,22 +555,22 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
ADCQ DX, acc1 ADCQ DX, acc1
ADCQ $0, acc2 ADCQ $0, acc2
// Last reduction step // Last reduction step
MOVQ p256p<>+0x08(SB), AX MOVQ p256p<>+0x08(SB), AX
MULQ acc3 MULQ acc3
ADDQ acc3, acc4 ADDQ acc3, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX ADCQ $0, DX
MOVQ DX, t1 ADDQ AX, acc4
MOVQ p256p<>+0x010(SB), AX ADCQ $0, DX
MULQ acc3 MOVQ DX, t1
MOVQ p256p<>+0x010(SB), AX
MULQ acc3
ADDQ t1, acc5 ADDQ t1, acc5
ADCQ $0, DX ADCQ $0, DX
ADDQ AX, acc5 ADDQ AX, acc5
ADCQ $0, DX ADCQ $0, DX
MOVQ DX, t1 MOVQ DX, t1
MOVQ p256p<>+0x018(SB), AX MOVQ p256p<>+0x018(SB), AX
MULQ acc3 MULQ acc3
ADDQ t1, acc0 ADDQ t1, acc0
ADCQ $0, DX ADCQ $0, DX
ADDQ AX, acc0 ADDQ AX, acc0
@@ -603,9 +583,9 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
 	MOVQ acc1, t1
 	// Subtract p256
 	SUBQ $-1, acc4
-	SBBQ p256const0<>(SB) ,acc5
+	SBBQ p256p<>+0x08(SB), acc5
 	SBBQ $-1, acc0
-	SBBQ p256const1<>(SB), acc1
+	SBBQ p256p<>+0x018(SB), acc1
 	SBBQ $0, acc2
 	CMOVQCS x_ptr, acc4
@@ -633,22 +613,22 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
 	// Only reduce, no multiplications are needed
 	// First stage
 	MOVQ p256p<>+0x08(SB), AX
 	MULQ acc0
 	ADDQ acc0, acc1
 	ADCQ $0, DX
 	ADDQ AX, acc1
 	ADCQ $0, DX
 	MOVQ DX, t1
 	MOVQ p256p<>+0x010(SB), AX
 	MULQ acc0
 	ADDQ t1, acc2
 	ADCQ $0, DX
 	ADDQ AX, acc2
 	ADCQ $0, DX
 	MOVQ DX, t1
 	MOVQ p256p<>+0x018(SB), AX
 	MULQ acc0
 	ADDQ t1, acc3
 	ADCQ $0, DX
 	ADDQ AX, acc3
@@ -656,66 +636,66 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
 	XORQ acc5, acc5
 	// Second stage
 	MOVQ p256p<>+0x08(SB), AX
 	MULQ acc1
 	ADDQ acc1, acc2
 	ADCQ $0, DX
 	ADDQ AX, acc2
 	ADCQ $0, DX
 	MOVQ DX, t1
 	MOVQ p256p<>+0x010(SB), AX
 	MULQ acc1
 	ADDQ t1, acc3
 	ADCQ $0, DX
 	ADDQ AX, acc3
 	ADCQ $0, DX
 	MOVQ DX, t1
 	MOVQ p256p<>+0x018(SB), AX
 	MULQ acc1
 	ADDQ t1, acc4
 	ADCQ $0, DX
 	ADDQ AX, acc4
 	ADCQ DX, acc5
 	XORQ acc0, acc0
 	// Third stage
 	MOVQ p256p<>+0x08(SB), AX
 	MULQ acc2
 	ADDQ acc2, acc3
 	ADCQ $0, DX
 	ADDQ AX, acc3
 	ADCQ $0, DX
 	MOVQ DX, t1
 	MOVQ p256p<>+0x010(SB), AX
 	MULQ acc2
 	ADDQ t1, acc4
 	ADCQ $0, DX
 	ADDQ AX, acc4
 	ADCQ $0, DX
 	MOVQ DX, t1
 	MOVQ p256p<>+0x018(SB), AX
 	MULQ acc2
 	ADDQ t1, acc5
 	ADCQ $0, DX
 	ADDQ AX, acc5
 	ADCQ DX, acc0
 	XORQ acc1, acc1
 	// Last stage
 	MOVQ p256p<>+0x08(SB), AX
 	MULQ acc3
 	ADDQ acc3, acc4
 	ADCQ $0, DX
 	ADDQ AX, acc4
 	ADCQ $0, DX
 	MOVQ DX, t1
 	MOVQ p256p<>+0x010(SB), AX
 	MULQ acc3
 	ADDQ t1, acc5
 	ADCQ $0, DX
 	ADDQ AX, acc5
 	ADCQ $0, DX
 	MOVQ DX, t1
 	MOVQ p256p<>+0x018(SB), AX
 	MULQ acc3
 	ADDQ t1, acc0
 	ADCQ $0, DX
 	ADDQ AX, acc0
@@ -727,9 +707,9 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
 	MOVQ acc1, t1
 	SUBQ $-1, acc4
-	SBBQ p256const0<>(SB), acc5
+	SBBQ p256p<>+0x08(SB), acc5
 	SBBQ $-1, acc0
-	SBBQ p256const1<>(SB), acc1
+	SBBQ p256p<>+0x018(SB), acc1
 	CMOVQCS x_ptr, acc4
 	CMOVQCS acc3, acc5
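In this commit only the p256Sqr loop above and sm2P256MulInternal below receive the shift-based reduction; the reduction steps in p256Mul and p256FromMont keep their MULQ sequences, and the substantive change in those functions is limited to the p256const0/1 → p256p<> operand swap in the final subtraction.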
@@ -1473,9 +1453,9 @@ TEXT sm2P256SubInternal(SB),NOSPLIT,$0
 	MOVQ acc7, acc3
 	ADDQ $-1, acc4
-	ADCQ p256const0<>(SB), acc5
+	ADCQ p256p<>+0x08(SB), acc5
 	ADCQ $-1, acc6
-	ADCQ p256const1<>(SB), acc7
+	ADCQ p256p<>+0x018(SB), acc7
 	ANDQ $1, mul0
 	CMOVQEQ acc0, acc4
@@ -1599,93 +1579,81 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$0
 	ADCQ $0, mul1
 	MOVQ mul1, acc7
 	// First reduction step
-	MOVQ p256p<>+0x08(SB), mul0
-	MULQ acc0
-	ADDQ acc0, acc1
-	ADCQ $0, mul1
-	ADDQ mul0, acc1
-	ADCQ $0, mul1
-	MOVQ mul1, hlp
-	MOVQ p256p<>+0x010(SB), mul0
-	MULQ acc0
-	ADDQ hlp, acc2
-	ADCQ $0, mul1
-	ADDQ mul0, acc2
-	ADCQ $0, mul1
-	MOVQ mul1, hlp
-	MOVQ p256p<>+0x018(SB), mul0
-	MULQ acc0
-	ADDQ hlp, acc3
-	ADCQ $0, mul1
-	ADDQ mul0, acc3
-	ADCQ $0, mul1
-	MOVQ mul1, acc0
+	MOVQ acc0, hlp
+
+	MOVQ hlp, mul0
+	MOVQ hlp, mul1
+	SHLQ $32, mul0
+	SHRQ $32, mul1
+
+	ADDQ hlp, acc1
+	ADCQ $0, acc2
+	SUBQ mul0, acc1
+	SBBQ mul1, acc2
+
+	ADDQ hlp, acc3
+	ADCQ $0, acc0
+	SUBQ mul0, acc3
+	SBBQ mul1, acc0
+	SUBQ hlp, acc3
+	SBBQ $0, acc0
 	// Second reduction step
-	MOVQ p256p<>+0x08(SB), mul0
-	MULQ acc1
-	ADDQ acc1, acc2
-	ADCQ $0, mul1
-	ADDQ mul0, acc2
-	ADCQ $0, mul1
-	MOVQ mul1, hlp
-	MOVQ p256p<>+0x010(SB), mul0
-	MULQ acc1
-	ADDQ hlp, acc3
-	ADCQ $0, mul1
-	ADDQ mul0, acc3
-	ADCQ $0, mul1
-	MOVQ mul1, hlp
-	MOVQ p256p<>+0x018(SB), mul0
-	MULQ acc1
-	ADDQ hlp, acc0
-	ADCQ $0, mul1
-	ADDQ mul0, acc0
-	ADCQ $0, mul1
-	MOVQ mul1, acc1
-	// Third reduction step
-	MOVQ p256p<>+0x08(SB), mul0
-	MULQ acc2
-	ADDQ acc2, acc3
-	ADCQ $0, mul1
-	ADDQ mul0, acc3
-	ADCQ $0, mul1
-	MOVQ mul1, hlp
-	MOVQ p256p<>+0x010(SB), mul0
-	MULQ acc2
-	ADDQ hlp, acc0
-	ADCQ $0, mul1
-	ADDQ mul0, acc0
-	ADCQ $0, mul1
-	MOVQ mul1, hlp
-	MOVQ p256p<>+0x018(SB), mul0
-	MULQ acc2
-	ADDQ hlp, acc1
-	ADCQ $0, mul1
-	ADDQ mul0, acc1
-	ADCQ $0, mul1
-	MOVQ mul1, acc2
-	// Last reduction step
-	MOVQ p256p<>+0x08(SB), mul0
-	MULQ acc3
-	ADDQ acc3, acc0
-	ADCQ $0, mul1
-	ADDQ mul0, acc0
-	ADCQ $0, mul1
-	MOVQ mul1, hlp
-	MOVQ p256p<>+0x010(SB), mul0
-	MULQ acc3
-	ADDQ hlp, acc1
-	ADCQ $0, mul1
-	ADDQ mul0, acc1
-	ADCQ $0, mul1
-	MOVQ mul1, hlp
-	MOVQ p256p<>+0x018(SB), mul0
-	MULQ acc3
-	ADDQ hlp, acc2
-	ADCQ $0, mul1
-	ADDQ mul0, acc2
-	ADCQ $0, mul1
-	MOVQ mul1, acc3
+	MOVQ acc1, hlp
+
+	MOVQ hlp, mul0
+	MOVQ hlp, mul1
+	SHLQ $32, mul0
+	SHRQ $32, mul1
+
+	ADDQ hlp, acc2
+	ADCQ $0, acc3
+	SUBQ mul0, acc2
+	SBBQ mul1, acc3
+
+	ADDQ hlp, acc0
+	ADCQ $0, acc1
+	SUBQ mul0, acc0
+	SBBQ mul1, acc1
+	SUBQ hlp, acc0
+	SBBQ $0, acc1
+	// Third reduction step
+	MOVQ acc2, hlp
+
+	MOVQ hlp, mul0
+	MOVQ hlp, mul1
+	SHLQ $32, mul0
+	SHRQ $32, mul1
+
+	ADDQ hlp, acc3
+	ADCQ $0, acc0
+	SUBQ mul0, acc3
+	SBBQ mul1, acc0
+
+	ADDQ hlp, acc1
+	ADCQ $0, acc2
+	SUBQ mul0, acc1
+	SBBQ mul1, acc2
+	SUBQ hlp, acc1
+	SBBQ $0, acc2
+	// Last reduction step
+	MOVQ acc3, hlp
+
+	MOVQ hlp, mul0
+	MOVQ hlp, mul1
+	SHLQ $32, mul0
+	SHRQ $32, mul1
+
+	ADDQ hlp, acc0
+	ADCQ $0, acc1
+	SUBQ mul0, acc0
+	SBBQ mul1, acc1
+
+	ADDQ hlp, acc2
+	ADCQ $0, acc3
+	SUBQ mul0, acc2
+	SBBQ mul1, acc3
+	SUBQ hlp, acc2
+	SBBQ $0, acc3
 	MOVQ $0, BP
 	// Add bits [511:256] of the result
 	ADCQ acc0, acc4
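sm2P256MulInternal receives the same shift-based rewrite as the p256Sqr loop above, with mul0/mul1/hlp playing the roles that AX/DX/t1 play there.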
@@ -1700,9 +1668,9 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$0
 	MOVQ acc7, acc3
 	// Subtract p256
 	SUBQ $-1, acc4
-	SBBQ p256const0<>(SB) ,acc5
+	SBBQ p256p<>+0x08(SB), acc5
 	SBBQ $-1, acc6
-	SBBQ p256const1<>(SB), acc7
+	SBBQ p256p<>+0x018(SB), acc7
 	SBBQ $0, hlp
 	// If the result of the subtraction is negative, restore the previous result
 	CMOVQCS acc0, acc4
@@ -1886,9 +1854,9 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$0
 	MOVQ t3, acc7
 	// Subtract p256
 	SUBQ $-1, acc4
-	SBBQ p256const0<>(SB) ,acc5
+	SBBQ p256p<>+0x08(SB), acc5
 	SBBQ $-1, acc6
-	SBBQ p256const1<>(SB), acc7
+	SBBQ p256p<>+0x018(SB), acc7
 	SBBQ $0, hlp
 	// If the result of the subtraction is negative, restore the previous result
 	CMOVQCS t0, acc4
@@ -1910,9 +1878,9 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$0
 	MOVQ acc6, t2;\
 	MOVQ acc7, t3;\
 	SUBQ $-1, t0;\
-	SBBQ p256const0<>(SB), t1;\
+	SBBQ p256p<>+0x08(SB), t1;\
 	SBBQ $-1, t2;\
-	SBBQ p256const1<>(SB), t3;\
+	SBBQ p256p<>+0x018(SB), t3;\
 	SBBQ $0, mul0;\
 	CMOVQCS acc4, t0;\
 	CMOVQCS acc5, t1;\
@@ -1931,9 +1899,9 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$0
 	MOVQ acc6, t2;\
 	MOVQ acc7, t3;\
 	SUBQ $-1, t0;\
-	SBBQ p256const0<>(SB), t1;\
+	SBBQ p256p<>+0x08(SB), t1;\
 	SBBQ $-1, t2;\
-	SBBQ p256const1<>(SB), t3;\
+	SBBQ p256p<>+0x018(SB), t3;\
 	SBBQ $0, mul0;\
 	CMOVQCS acc4, t0;\
 	CMOVQCS acc5, t1;\
@@ -2005,9 +1973,9 @@ TEXT ·p256PointAddAffineAsm(SB),0,$512-96
 	MOVQ (16*2 + 8*2)(CX), acc6
 	MOVQ (16*2 + 8*3)(CX), acc7
 	MOVQ $-1, acc0
-	MOVQ p256const0<>(SB), acc1
+	MOVQ p256p<>+0x08(SB), acc1
 	MOVQ $-1, acc2
-	MOVQ p256const1<>(SB), acc3
+	MOVQ p256p<>+0x018(SB), acc3
 	XORQ mul0, mul0
 	// Speculatively subtract
 	SUBQ acc4, acc0
@@ -2021,9 +1989,9 @@ TEXT ·p256PointAddAffineAsm(SB),0,$512-96
 	MOVQ acc3, t3
 	// Add in case the operand was > p256
 	ADDQ $-1, acc0
-	ADCQ p256const0<>(SB), acc1
+	ADCQ p256p<>+0x08(SB), acc1
 	ADCQ $-1, acc2
-	ADCQ p256const1<>(SB), acc3
+	ADCQ p256p<>+0x018(SB), acc3
 	ADCQ $0, mul0
 	CMOVQNE t0, acc0
 	CMOVQNE t1, acc1
@@ -2242,9 +2210,9 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0
 	// XOR [acc4..acc7] with P and compare with zero again.
 	XORQ $-1, acc4
-	XORQ p256const0<>(SB), acc5
+	XORQ p256p<>+0x08(SB), acc5
 	XORQ $-1, acc6
-	XORQ p256const1<>(SB), acc7
+	XORQ p256p<>+0x018(SB), acc7
 	ORQ acc5, acc4
 	ORQ acc6, acc4
 	ORQ acc7, acc4
@@ -2526,9 +2494,9 @@ TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-48
 	MOVQ acc7, t3
 	ADDQ $-1, acc4
-	ADCQ p256const0<>(SB), acc5
+	ADCQ p256p<>+0x08(SB), acc5
 	ADCQ $-1, acc6
-	ADCQ p256const1<>(SB), acc7
+	ADCQ p256p<>+0x018(SB), acc7
 	ADCQ $0, mul0
 	TESTQ $1, t0