sm2ec: use asm to implement p256OrdReduce

This commit is contained in:
Sun Yimin 2023-06-16 15:52:28 +08:00 committed by GitHub
parent 0e667b152d
commit 3bd048c903
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 106 additions and 166 deletions

View File

@ -201,7 +201,29 @@ TEXT ·p256NegCond(SB),NOSPLIT,$0
RET
#define p256sqrReduction() \
// p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res)
//
// Final conditional subtraction of p256 for a value held in five limbs
// a4:a3:a2:a1:a0 (a0 is the least significant limb, a4 the overflow limb).
//   1. Save a0..a3 in b0..b3.
//   2. Subtract p256 from a4:a3:a2:a1:a0; the borrow ripples up through a4.
//   3. If the subtraction borrowed (CF set), restore a0..a3 from b0..b3.
//   4. Store a0..a3 to the four consecutive 64-bit words at res.
// The selection is done with CMOVQCS; the macro contains no branches.
// Clobbers a0..a4, b0..b3 and the flags.
// NOTE(review): limbs 0 and 2 of the modulus are encoded as the immediate -1
// instead of being loaded from p256p<>; presumably those limbs are all ones,
// confirm against the p256p<> data definition.
#define p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \
MOVQ a0, b0 \
MOVQ a1, b1 \
MOVQ a2, b2 \
MOVQ a3, b3 \
\ // Subtract p256 from a4:a3:a2:a1:a0; CF after the last SBBQ is the final borrow
SUBQ $-1, a0 \
SBBQ p256p<>+0x08(SB), a1 \
SBBQ $-1, a2 \
SBBQ p256p<>+0x018(SB), a3 \
SBBQ $0, a4 \
\
CMOVQCS b0, a0 \
CMOVQCS b1, a1 \
CMOVQCS b2, a2 \
CMOVQCS b3, a3 \
\
MOVQ a0, (8*0)(res) \
MOVQ a1, (8*1)(res) \
MOVQ a2, (8*2)(res) \
MOVQ a3, (8*3)(res) \
#define p256SqrMontReduce() \
\ // First reduction step
MOVQ acc0, AX \
MOVQ acc0, DX \
@ -268,30 +290,7 @@ TEXT ·p256NegCond(SB),NOSPLIT,$0
ADCQ acc5, acc1 \
ADCQ y_ptr, acc2 \
ADCQ x_ptr, acc3 \
ADCQ $0, t0 \
\
MOVQ acc0, acc4 \
MOVQ acc1, acc5 \
MOVQ acc2, y_ptr \
MOVQ acc3, t1 \
\ // Subtract p256
SUBQ $-1, acc0 \
SBBQ p256p<>+0x08(SB), acc1 \
SBBQ $-1, acc2 \
SBBQ p256p<>+0x018(SB), acc3 \
SBBQ $0, t0 \
\
CMOVQCS acc4, acc0 \
CMOVQCS acc5, acc1 \
CMOVQCS y_ptr, acc2 \
CMOVQCS t1, acc3 \
\
MOVQ acc0, (8*0)(res_ptr) \
MOVQ acc1, (8*1)(res_ptr) \
MOVQ acc2, (8*2)(res_ptr) \
MOVQ acc3, (8*3)(res_ptr) \
MOVQ res_ptr, x_ptr \
DECQ BX \
ADCQ $0, t0 \
/* ---------------------------------------*/
// func p256Sqr(res, in *p256Element, n int)
@ -383,7 +382,10 @@ sqrLoop:
ADCQ DX, t1
MOVQ t1, x_ptr
p256sqrReduction()
p256SqrMontReduce()
p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
MOVQ res_ptr, x_ptr
DECQ BX
JNE sqrLoop
RET
@ -448,7 +450,10 @@ sqrBMI2:
ADCQ AX, y_ptr
ADCQ t1, x_ptr
p256sqrReduction()
p256SqrMontReduce()
p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
MOVQ res_ptr, x_ptr
DECQ BX
JNE sqrBMI2
RET
@ -657,28 +662,7 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
SBBQ AX, acc0
SBBQ DX, acc1
SBBQ $0, acc2
// Copy result [255:0]
MOVQ acc4, x_ptr
MOVQ acc5, acc3
MOVQ acc0, t0
MOVQ acc1, t1
// Subtract p256
SUBQ $-1, acc4
SBBQ p256p<>+0x08(SB), acc5
SBBQ $-1, acc0
SBBQ p256p<>+0x018(SB), acc1
SBBQ $0, acc2
CMOVQCS x_ptr, acc4
CMOVQCS acc3, acc5
CMOVQCS t0, acc0
CMOVQCS t1, acc1
MOVQ acc4, (8*0)(res_ptr)
MOVQ acc5, (8*1)(res_ptr)
MOVQ acc0, (8*2)(res_ptr)
MOVQ acc1, (8*3)(res_ptr)
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, t1, res_ptr)
RET
mulBMI2:
@ -836,27 +820,7 @@ mulBMI2:
SBBQ AX, acc0
SBBQ DX, acc1
SBBQ $0, acc2
// Copy result [255:0]
MOVQ acc4, x_ptr
MOVQ acc5, acc3
MOVQ acc0, t0
MOVQ acc1, t1
// Subtract p256
SUBQ $-1, acc4
SBBQ p256p<>+0x08(SB), acc5
SBBQ $-1, acc0
SBBQ p256p<>+0x018(SB), acc1
SBBQ $0, acc2
CMOVQCS x_ptr, acc4
CMOVQCS acc3, acc5
CMOVQCS t0, acc0
CMOVQCS t1, acc1
MOVQ acc4, (8*0)(res_ptr)
MOVQ acc5, (8*1)(res_ptr)
MOVQ acc0, (8*2)(res_ptr)
MOVQ acc1, (8*3)(res_ptr)
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, t1, res_ptr)
RET
/* ---------------------------------------*/
@ -1182,6 +1146,40 @@ loop_select_base_avx2:
RET
/* ---------------------------------------*/
// p256OrdReduceInline(a0, a1, a2, a3, a4, b0, b1, b2, b3, res)
//
// Final conditional subtraction of p256ord for a value held in five limbs
// a4:a3:a2:a1:a0 (a0 is the least significant limb, a4 the overflow limb).
//   1. Save a0..a3 in b0..b3.
//   2. Subtract the four 64-bit words of p256ord<> from a3:a2:a1:a0 and
//      propagate the borrow into a4.
//   3. If the subtraction borrowed (CF set), restore a0..a3 from b0..b3.
//   4. Store a0..a3 to the four consecutive 64-bit words at res.
// The selection is done with CMOVQCS; the macro contains no branches.
// Clobbers a0..a4, b0..b3 and the flags.
#define p256OrdReduceInline(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \
\// Copy result [255:0]
MOVQ a0, b0 \
MOVQ a1, b1 \
MOVQ a2, b2 \
MOVQ a3, b3 \
\// Subtract p256ord; CF after the last SBBQ is the final borrow
SUBQ p256ord<>+0x00(SB), a0 \
SBBQ p256ord<>+0x08(SB) ,a1 \
SBBQ p256ord<>+0x10(SB), a2 \
SBBQ p256ord<>+0x18(SB), a3 \
SBBQ $0, a4 \
\
CMOVQCS b0, a0 \
CMOVQCS b1, a1 \
CMOVQCS b2, a2 \
CMOVQCS b3, a3 \
\
MOVQ a0, (8*0)(res) \
MOVQ a1, (8*1)(res) \
MOVQ a2, (8*2)(res) \
MOVQ a3, (8*3)(res) \
// func p256OrdReduce(s *p256OrdElement)
//
// Reduces the four 64-bit words at s in place: p256ord is subtracted once and
// the difference is kept only when that subtraction does not borrow.
// In:  s+0(FP) = pointer to the four words, read and then overwritten.
// Uses res_ptr, acc0..acc5, x_ptr, y_ptr and t0 (register aliases defined
// elsewhere in this file) and modifies the flags.
TEXT ·p256OrdReduce(SB),NOSPLIT,$0
MOVQ s+0(FP), res_ptr
// Load the value, least significant word first.
MOVQ (8*0)(res_ptr), acc0
MOVQ (8*1)(res_ptr), acc1
MOVQ (8*2)(res_ptr), acc2
MOVQ (8*3)(res_ptr), acc3
XORQ acc4, acc4 // the input has no fifth limb, so the macro's overflow limb starts at zero
// acc5, x_ptr, y_ptr and t0 only serve as scratch for the saved copy; the
// macro writes the reduced value back through res_ptr.
p256OrdReduceInline(acc0, acc1, acc2, acc3, acc4, acc5, x_ptr, y_ptr, t0, res_ptr)
RET
// func p256OrdMul(res, in1, in2 *p256OrdElement)
TEXT ·p256OrdMul(SB),NOSPLIT,$0
MOVQ res+0(FP), res_ptr
@ -1432,27 +1430,8 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
SBBQ AX, acc0
SBBQ DX, acc1
SBBQ $0, acc2
// Copy result [255:0]
MOVQ acc4, x_ptr
MOVQ acc5, acc3
MOVQ acc0, t0
MOVQ acc1, t1
// Subtract p256
SUBQ p256ord<>+0x00(SB), acc4
SBBQ p256ord<>+0x08(SB) ,acc5
SBBQ p256ord<>+0x10(SB), acc0
SBBQ p256ord<>+0x18(SB), acc1
SBBQ $0, acc2
CMOVQCS x_ptr, acc4
CMOVQCS acc3, acc5
CMOVQCS t0, acc0
CMOVQCS t1, acc1
MOVQ acc4, (8*0)(res_ptr)
MOVQ acc5, (8*1)(res_ptr)
MOVQ acc0, (8*2)(res_ptr)
MOVQ acc1, (8*3)(res_ptr)
p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, t1, res_ptr)
RET
@ -1649,27 +1628,7 @@ ordMulBMI2:
SBBQ DX, acc1
SBBQ $0, acc2
// Copy result [255:0]
MOVQ acc4, x_ptr
MOVQ acc5, acc3
MOVQ acc0, t0
MOVQ acc1, t1
// Subtract p256
SUBQ p256ord<>+0x00(SB), acc4
SBBQ p256ord<>+0x08(SB) ,acc5
SBBQ p256ord<>+0x10(SB), acc0
SBBQ p256ord<>+0x18(SB), acc1
SBBQ $0, acc2
CMOVQCS x_ptr, acc4
CMOVQCS acc3, acc5
CMOVQCS t0, acc0
CMOVQCS t1, acc1
MOVQ acc4, (8*0)(res_ptr)
MOVQ acc5, (8*1)(res_ptr)
MOVQ acc0, (8*2)(res_ptr)
MOVQ acc1, (8*3)(res_ptr)
p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, t1, res_ptr)
RET
@ -1892,26 +1851,7 @@ ordSqrLoop:
ADCQ x_ptr, acc3
ADCQ $0, t0
MOVQ acc0, acc4
MOVQ acc1, acc5
MOVQ acc2, y_ptr
MOVQ acc3, t1
// Subtract p256
SUBQ p256ord<>+0x00(SB), acc0
SBBQ p256ord<>+0x08(SB) ,acc1
SBBQ p256ord<>+0x10(SB), acc2
SBBQ p256ord<>+0x18(SB), acc3
SBBQ $0, t0
CMOVQCS acc4, acc0
CMOVQCS acc5, acc1
CMOVQCS y_ptr, acc2
CMOVQCS t1, acc3
MOVQ acc0, (8*0)(res_ptr)
MOVQ acc1, (8*1)(res_ptr)
MOVQ acc2, (8*2)(res_ptr)
MOVQ acc3, (8*3)(res_ptr)
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
MOVQ res_ptr, x_ptr
DECQ BX
JNE ordSqrLoop
@ -2088,26 +2028,7 @@ ordSqrLoopBMI2:
ADCQ x_ptr, acc3
ADCQ $0, t0
MOVQ acc0, acc4
MOVQ acc1, acc5
MOVQ acc2, y_ptr
MOVQ acc3, t1
// Subtract p256
SUBQ p256ord<>+0x00(SB), acc0
SBBQ p256ord<>+0x08(SB) ,acc1
SBBQ p256ord<>+0x10(SB), acc2
SBBQ p256ord<>+0x18(SB), acc3
SBBQ $0, t0
CMOVQCS acc4, acc0
CMOVQCS acc5, acc1
CMOVQCS y_ptr, acc2
CMOVQCS t1, acc3
MOVQ acc0, (8*0)(res_ptr)
MOVQ acc1, (8*1)(res_ptr)
MOVQ acc2, (8*2)(res_ptr)
MOVQ acc3, (8*3)(res_ptr)
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
MOVQ res_ptr, x_ptr
DECQ BX
JNE ordSqrLoopBMI2

View File

@ -387,6 +387,35 @@ loop_select:
STP (y0, y1), 2*16(res_ptr)
STP (y2, y3), 3*16(res_ptr)
RET
/* ---------------------------------------*/
// func p256OrdReduce(s *p256OrdElement)
//
// Reduces the four 64-bit words at s in place: p256ord is subtracted once and
// the difference is kept only when that subtraction does not borrow.
// In:  s+0(FP) = pointer to the four words, read and then overwritten.
// Uses res_ptr, const0..const3, acc0..acc4, y0..y3 and x0..x3 (register
// aliases defined elsewhere in this file) and modifies the flags.
TEXT ·p256OrdReduce(SB),NOSPLIT,$0
MOVD s+0(FP), res_ptr
// const3:const0 = the four words of p256ord<>
LDP p256ord<>+0x00(SB), (const0, const1)
LDP p256ord<>+0x10(SB), (const2, const3)
// acc3:acc0 = the input value, acc4 = 0 as the overflow limb
LDP 0*16(res_ptr), (acc0, acc1)
LDP 1*16(res_ptr), (acc2, acc3)
EOR acc4, acc4, acc4
// y3:y0 = acc3:acc0 - p256ord, with the borrow carried on into acc4
SUBS const0, acc0, y0
SBCS const1, acc1, y1
SBCS const2, acc2, y2
SBCS const3, acc3, y3
SBCS $0, acc4, acc4
// On arm64 the carry flag is set when a subtraction did NOT borrow, so CS
// selects the difference (y) and carry clear keeps the original value (acc).
CSEL CS, y0, acc0, x0
CSEL CS, y1, acc1, x1
CSEL CS, y2, acc2, x2
CSEL CS, y3, acc3, x3
// Write the selected value back over the input.
STP (x0, x1), 0*16(res_ptr)
STP (x2, x3), 1*16(res_ptr)
RET
/* ---------------------------------------*/
// func p256OrdSqr(res, in *p256OrdElement, n int)
TEXT ·p256OrdSqr(SB),NOSPLIT,$0

View File

@ -353,6 +353,11 @@ func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
//go:noescape
func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
// p256OrdReduce ensures s is in the range [0, ord(G)-1].
//
// s is updated in place. This declaration has no Go body; the implementation
// is the assembly routine of the same name (TEXT ·p256OrdReduce), which
// subtracts ord(G) once and keeps the difference only if the subtraction
// does not borrow.
//
//go:noescape
func p256OrdReduce(s *p256OrdElement)
// p256Table is a table of the first 16 multiples of a point. Points are stored
// at an index offset of -1 so [8]P is at index 7, P is at 0, and [16]P is at 15.
// [0]P is the point at infinity and it's not stored.
@ -425,21 +430,6 @@ func p256PointDouble6TimesAsm(res, in *SM2P256Point)
// Montgomery domain (with R 2²⁵⁶) as four uint64 limbs in little-endian order.
type p256OrdElement [4]uint64
// p256OrdReduce brings s into the range [0, ord(G)-1], updating it in place.
func p256OrdReduce(s *p256OrdElement) {
	// ord(G) as four 64-bit limbs, least significant limb first.
	order := [4]uint64{
		0x53bbf40939d54123,
		0x7203df6b21c6052b,
		0xffffffffffffffff,
		0xfffffffeffffffff,
	}

	// 2 * ord(G) exceeds 2²⁵⁶, so one trial subtraction of ord(G) suffices:
	// the difference is the reduced value unless the subtraction underflows.
	var reduced [4]uint64
	var borrow uint64
	for i := range reduced {
		reduced[i], borrow = bits.Sub64(s[i], order[i], borrow)
	}

	// keep is all ones when there was no underflow and zero otherwise, so the
	// masked XOR either swaps in the difference or leaves s untouched,
	// without branching on the value.
	keep := borrow - 1
	for i := range reduced {
		s[i] ^= (reduced[i] ^ s[i]) & keep
	}
}
// Add sets q = p1 + p2, and returns q. The points may overlap.
func (q *SM2P256Point) Add(r1, r2 *SM2P256Point) *SM2P256Point {
var sum, double SM2P256Point