mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-26 12:16:20 +08:00
sm2ec: use asm to implement p256OrdReduce
This commit is contained in:
parent
0e667b152d
commit
3bd048c903
@ -201,7 +201,29 @@ TEXT ·p256NegCond(SB),NOSPLIT,$0
|
||||
|
||||
RET
|
||||
|
||||
#define p256sqrReduction() \
|
||||
// p256PrimReduce conditionally subtracts p256 from a 5-limb value and stores
// the low four limbs of the result through res.
//
//   a0..a3  value to reduce, a0 being the least significant 64-bit limb
//   a4      overflow limb above bit 255
//   b0..b3  scratch registers; they receive a copy of a0..a3
//   res     register holding the address the four result limbs are written to
//
// The macro computes a4:a3:a2:a1:a0 - p256. Limbs 0 and 2 of the subtrahend
// are given as the immediate -1 and limbs 1 and 3 are read from p256p<>
// (presumably the immediates mirror the corresponding words of p256p<> -
// confirm against its DATA definition). If the subtraction borrows (CF set
// after the last SBBQ) the saved copy in b0..b3 is moved back with CMOVQCS,
// so the selection is done without a branch. a0..a4, b0..b3 and the flags
// are overwritten.
#define p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \
|
||||
MOVQ a0, b0 \
|
||||
MOVQ a1, b1 \
|
||||
MOVQ a2, b2 \
|
||||
MOVQ a3, b3 \
|
||||
\ // Subtract p256; the final SBBQ propagates the borrow through the overflow limb a4
|
||||
SUBQ $-1, a0 \
|
||||
SBBQ p256p<>+0x08(SB), a1 \
|
||||
SBBQ $-1, a2 \
|
||||
SBBQ p256p<>+0x018(SB), a3 \
|
||||
SBBQ $0, a4 \
|
||||
\
|
||||
CMOVQCS b0, a0 \
|
||||
CMOVQCS b1, a1 \
|
||||
CMOVQCS b2, a2 \
|
||||
CMOVQCS b3, a3 \
|
||||
\
|
||||
MOVQ a0, (8*0)(res) \
|
||||
MOVQ a1, (8*1)(res) \
|
||||
MOVQ a2, (8*2)(res) \
|
||||
MOVQ a3, (8*3)(res) \
|
||||
|
||||
#define p256SqrMontReduce() \
|
||||
\ // First reduction step
|
||||
MOVQ acc0, AX \
|
||||
MOVQ acc0, DX \
|
||||
@ -268,30 +290,7 @@ TEXT ·p256NegCond(SB),NOSPLIT,$0
|
||||
ADCQ acc5, acc1 \
|
||||
ADCQ y_ptr, acc2 \
|
||||
ADCQ x_ptr, acc3 \
|
||||
ADCQ $0, t0 \
|
||||
\
|
||||
MOVQ acc0, acc4 \
|
||||
MOVQ acc1, acc5 \
|
||||
MOVQ acc2, y_ptr \
|
||||
MOVQ acc3, t1 \
|
||||
\ // Subtract p256
|
||||
SUBQ $-1, acc0 \
|
||||
SBBQ p256p<>+0x08(SB), acc1 \
|
||||
SBBQ $-1, acc2 \
|
||||
SBBQ p256p<>+0x018(SB), acc3 \
|
||||
SBBQ $0, t0 \
|
||||
\
|
||||
CMOVQCS acc4, acc0 \
|
||||
CMOVQCS acc5, acc1 \
|
||||
CMOVQCS y_ptr, acc2 \
|
||||
CMOVQCS t1, acc3 \
|
||||
\
|
||||
MOVQ acc0, (8*0)(res_ptr) \
|
||||
MOVQ acc1, (8*1)(res_ptr) \
|
||||
MOVQ acc2, (8*2)(res_ptr) \
|
||||
MOVQ acc3, (8*3)(res_ptr) \
|
||||
MOVQ res_ptr, x_ptr \
|
||||
DECQ BX \
|
||||
ADCQ $0, t0 \
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// func p256Sqr(res, in *p256Element, n int)
|
||||
@ -383,7 +382,10 @@ sqrLoop:
|
||||
ADCQ DX, t1
|
||||
MOVQ t1, x_ptr
|
||||
|
||||
p256sqrReduction()
|
||||
p256SqrMontReduce()
|
||||
p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
|
||||
MOVQ res_ptr, x_ptr
|
||||
DECQ BX
|
||||
JNE sqrLoop
|
||||
RET
|
||||
|
||||
@ -448,7 +450,10 @@ sqrBMI2:
|
||||
ADCQ AX, y_ptr
|
||||
ADCQ t1, x_ptr
|
||||
|
||||
p256sqrReduction()
|
||||
p256SqrMontReduce()
|
||||
p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
|
||||
MOVQ res_ptr, x_ptr
|
||||
DECQ BX
|
||||
JNE sqrBMI2
|
||||
RET
|
||||
|
||||
@ -657,28 +662,7 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
|
||||
SBBQ AX, acc0
|
||||
SBBQ DX, acc1
|
||||
SBBQ $0, acc2
|
||||
// Copy result [255:0]
|
||||
MOVQ acc4, x_ptr
|
||||
MOVQ acc5, acc3
|
||||
MOVQ acc0, t0
|
||||
MOVQ acc1, t1
|
||||
// Subtract p256
|
||||
SUBQ $-1, acc4
|
||||
SBBQ p256p<>+0x08(SB), acc5
|
||||
SBBQ $-1, acc0
|
||||
SBBQ p256p<>+0x018(SB), acc1
|
||||
SBBQ $0, acc2
|
||||
|
||||
CMOVQCS x_ptr, acc4
|
||||
CMOVQCS acc3, acc5
|
||||
CMOVQCS t0, acc0
|
||||
CMOVQCS t1, acc1
|
||||
|
||||
MOVQ acc4, (8*0)(res_ptr)
|
||||
MOVQ acc5, (8*1)(res_ptr)
|
||||
MOVQ acc0, (8*2)(res_ptr)
|
||||
MOVQ acc1, (8*3)(res_ptr)
|
||||
|
||||
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, t1, res_ptr)
|
||||
RET
|
||||
|
||||
mulBMI2:
|
||||
@ -836,27 +820,7 @@ mulBMI2:
|
||||
SBBQ AX, acc0
|
||||
SBBQ DX, acc1
|
||||
SBBQ $0, acc2
|
||||
// Copy result [255:0]
|
||||
MOVQ acc4, x_ptr
|
||||
MOVQ acc5, acc3
|
||||
MOVQ acc0, t0
|
||||
MOVQ acc1, t1
|
||||
// Subtract p256
|
||||
SUBQ $-1, acc4
|
||||
SBBQ p256p<>+0x08(SB), acc5
|
||||
SBBQ $-1, acc0
|
||||
SBBQ p256p<>+0x018(SB), acc1
|
||||
SBBQ $0, acc2
|
||||
|
||||
CMOVQCS x_ptr, acc4
|
||||
CMOVQCS acc3, acc5
|
||||
CMOVQCS t0, acc0
|
||||
CMOVQCS t1, acc1
|
||||
|
||||
MOVQ acc4, (8*0)(res_ptr)
|
||||
MOVQ acc5, (8*1)(res_ptr)
|
||||
MOVQ acc0, (8*2)(res_ptr)
|
||||
MOVQ acc1, (8*3)(res_ptr)
|
||||
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, t1, res_ptr)
|
||||
RET
|
||||
|
||||
/* ---------------------------------------*/
|
||||
@ -1182,6 +1146,40 @@ loop_select_base_avx2:
|
||||
RET
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// p256OrdReduceInline conditionally subtracts the value stored at p256ord<>
// from a 5-limb value and stores the low four limbs of the result through res.
//
//   a0..a3  value to reduce, a0 being the least significant 64-bit limb
//   a4      overflow limb above bit 255
//   b0..b3  scratch registers; they receive a copy of a0..a3
//   res     register holding the address the four result limbs are written to
//
// The macro computes a4:a3:a2:a1:a0 - p256ord. If the subtraction borrows
// (CF set after the last SBBQ) the saved copy in b0..b3 is moved back with
// CMOVQCS, so the selection is done without a branch. a0..a4, b0..b3 and the
// flags are overwritten.
#define p256OrdReduceInline(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \
|
||||
\// Copy result [255:0]
|
||||
MOVQ a0, b0 \
|
||||
MOVQ a1, b1 \
|
||||
MOVQ a2, b2 \
|
||||
MOVQ a3, b3 \
|
||||
\// Subtract the order (p256ord), not the field prime p256
|
||||
SUBQ p256ord<>+0x00(SB), a0 \
|
||||
SBBQ p256ord<>+0x08(SB) ,a1 \
|
||||
SBBQ p256ord<>+0x10(SB), a2 \
|
||||
SBBQ p256ord<>+0x18(SB), a3 \
|
||||
SBBQ $0, a4 \
|
||||
\
|
||||
CMOVQCS b0, a0 \
|
||||
CMOVQCS b1, a1 \
|
||||
CMOVQCS b2, a2 \
|
||||
CMOVQCS b3, a3 \
|
||||
\
|
||||
MOVQ a0, (8*0)(res) \
|
||||
MOVQ a1, (8*1)(res) \
|
||||
MOVQ a2, (8*2)(res) \
|
||||
MOVQ a3, (8*3)(res) \
|
||||
|
||||
//func p256OrdReduce(s *p256OrdElement)
//
// amd64 implementation: reduces the four-limb value at s in place by one
// conditional subtraction of p256ord<>. The value is replaced by
// (value - p256ord) unless that subtraction borrows, in which case it is
// left unchanged. A single subtraction is performed, so the result is only
// below the order when the input is below twice the order.
|
||||
TEXT ·p256OrdReduce(SB),NOSPLIT,$0
|
||||
MOVQ s+0(FP), res_ptr
|
||||
// Load the four limbs of s, least significant first.
MOVQ (8*0)(res_ptr), acc0
|
||||
MOVQ (8*1)(res_ptr), acc1
|
||||
MOVQ (8*2)(res_ptr), acc2
|
||||
MOVQ (8*3)(res_ptr), acc3
|
||||
// The input has no limb above bit 255, so the overflow limb is zero.
XORQ acc4, acc4
|
||||
// acc5, x_ptr, y_ptr and t0 are only used as scratch by the macro; the result
// is written back through res_ptr, i.e. into s itself.
p256OrdReduceInline(acc0, acc1, acc2, acc3, acc4, acc5, x_ptr, y_ptr, t0, res_ptr)
|
||||
RET
|
||||
|
||||
// func p256OrdMul(res, in1, in2 *p256OrdElement)
|
||||
TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
||||
MOVQ res+0(FP), res_ptr
|
||||
@ -1432,27 +1430,8 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
||||
SBBQ AX, acc0
|
||||
SBBQ DX, acc1
|
||||
SBBQ $0, acc2
|
||||
// Copy result [255:0]
|
||||
MOVQ acc4, x_ptr
|
||||
MOVQ acc5, acc3
|
||||
MOVQ acc0, t0
|
||||
MOVQ acc1, t1
|
||||
// Subtract p256
|
||||
SUBQ p256ord<>+0x00(SB), acc4
|
||||
SBBQ p256ord<>+0x08(SB) ,acc5
|
||||
SBBQ p256ord<>+0x10(SB), acc0
|
||||
SBBQ p256ord<>+0x18(SB), acc1
|
||||
SBBQ $0, acc2
|
||||
|
||||
CMOVQCS x_ptr, acc4
|
||||
CMOVQCS acc3, acc5
|
||||
CMOVQCS t0, acc0
|
||||
CMOVQCS t1, acc1
|
||||
|
||||
MOVQ acc4, (8*0)(res_ptr)
|
||||
MOVQ acc5, (8*1)(res_ptr)
|
||||
MOVQ acc0, (8*2)(res_ptr)
|
||||
MOVQ acc1, (8*3)(res_ptr)
|
||||
p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, t1, res_ptr)
|
||||
|
||||
RET
|
||||
|
||||
@ -1649,27 +1628,7 @@ ordMulBMI2:
|
||||
SBBQ DX, acc1
|
||||
SBBQ $0, acc2
|
||||
|
||||
// Copy result [255:0]
|
||||
MOVQ acc4, x_ptr
|
||||
MOVQ acc5, acc3
|
||||
MOVQ acc0, t0
|
||||
MOVQ acc1, t1
|
||||
// Subtract p256
|
||||
SUBQ p256ord<>+0x00(SB), acc4
|
||||
SBBQ p256ord<>+0x08(SB) ,acc5
|
||||
SBBQ p256ord<>+0x10(SB), acc0
|
||||
SBBQ p256ord<>+0x18(SB), acc1
|
||||
SBBQ $0, acc2
|
||||
|
||||
CMOVQCS x_ptr, acc4
|
||||
CMOVQCS acc3, acc5
|
||||
CMOVQCS t0, acc0
|
||||
CMOVQCS t1, acc1
|
||||
|
||||
MOVQ acc4, (8*0)(res_ptr)
|
||||
MOVQ acc5, (8*1)(res_ptr)
|
||||
MOVQ acc0, (8*2)(res_ptr)
|
||||
MOVQ acc1, (8*3)(res_ptr)
|
||||
p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, t1, res_ptr)
|
||||
|
||||
RET
|
||||
|
||||
@ -1892,26 +1851,7 @@ ordSqrLoop:
|
||||
ADCQ x_ptr, acc3
|
||||
ADCQ $0, t0
|
||||
|
||||
MOVQ acc0, acc4
|
||||
MOVQ acc1, acc5
|
||||
MOVQ acc2, y_ptr
|
||||
MOVQ acc3, t1
|
||||
// Subtract p256
|
||||
SUBQ p256ord<>+0x00(SB), acc0
|
||||
SBBQ p256ord<>+0x08(SB) ,acc1
|
||||
SBBQ p256ord<>+0x10(SB), acc2
|
||||
SBBQ p256ord<>+0x18(SB), acc3
|
||||
SBBQ $0, t0
|
||||
|
||||
CMOVQCS acc4, acc0
|
||||
CMOVQCS acc5, acc1
|
||||
CMOVQCS y_ptr, acc2
|
||||
CMOVQCS t1, acc3
|
||||
|
||||
MOVQ acc0, (8*0)(res_ptr)
|
||||
MOVQ acc1, (8*1)(res_ptr)
|
||||
MOVQ acc2, (8*2)(res_ptr)
|
||||
MOVQ acc3, (8*3)(res_ptr)
|
||||
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
|
||||
MOVQ res_ptr, x_ptr
|
||||
DECQ BX
|
||||
JNE ordSqrLoop
|
||||
@ -2088,26 +2028,7 @@ ordSqrLoopBMI2:
|
||||
ADCQ x_ptr, acc3
|
||||
ADCQ $0, t0
|
||||
|
||||
MOVQ acc0, acc4
|
||||
MOVQ acc1, acc5
|
||||
MOVQ acc2, y_ptr
|
||||
MOVQ acc3, t1
|
||||
// Subtract p256
|
||||
SUBQ p256ord<>+0x00(SB), acc0
|
||||
SBBQ p256ord<>+0x08(SB) ,acc1
|
||||
SBBQ p256ord<>+0x10(SB), acc2
|
||||
SBBQ p256ord<>+0x18(SB), acc3
|
||||
SBBQ $0, t0
|
||||
|
||||
CMOVQCS acc4, acc0
|
||||
CMOVQCS acc5, acc1
|
||||
CMOVQCS y_ptr, acc2
|
||||
CMOVQCS t1, acc3
|
||||
|
||||
MOVQ acc0, (8*0)(res_ptr)
|
||||
MOVQ acc1, (8*1)(res_ptr)
|
||||
MOVQ acc2, (8*2)(res_ptr)
|
||||
MOVQ acc3, (8*3)(res_ptr)
|
||||
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
|
||||
MOVQ res_ptr, x_ptr
|
||||
DECQ BX
|
||||
JNE ordSqrLoopBMI2
|
||||
|
@ -387,6 +387,35 @@ loop_select:
|
||||
STP (y0, y1), 2*16(res_ptr)
|
||||
STP (y2, y3), 3*16(res_ptr)
|
||||
RET
|
||||
|
||||
/* ---------------------------------------*/
|
||||
//func p256OrdReduce(s *p256OrdElement)
//
// arm64 implementation: reduces the four-limb value at s in place by one
// conditional subtraction of p256ord<>. The value is replaced by
// (value - p256ord) unless that subtraction borrows, in which case it is
// left unchanged. The choice is made with CSEL, without a branch.
|
||||
TEXT ·p256OrdReduce(SB),NOSPLIT,$0
|
||||
MOVD s+0(FP), res_ptr
|
||||
|
||||
// Load the four limbs of the order.
LDP p256ord<>+0x00(SB), (const0, const1)
|
||||
LDP p256ord<>+0x10(SB), (const2, const3)
|
||||
|
||||
// Load the four limbs of s and clear the overflow limb acc4.
LDP 0*16(res_ptr), (acc0, acc1)
|
||||
LDP 1*16(res_ptr), (acc2, acc3)
|
||||
EOR acc4, acc4, acc4
|
||||
|
||||
// y = acc - order; the last SBCS carries the borrow through acc4.
SUBS const0, acc0, y0
|
||||
SBCS const1, acc1, y1
|
||||
SBCS const2, acc2, y2
|
||||
SBCS const3, acc3, y3
|
||||
SBCS $0, acc4, acc4
|
||||
|
||||
// On arm64 the carry flag is set when a subtraction does NOT borrow, so CS
// selects the subtracted value y and otherwise the original acc is kept.
CSEL CS, y0, acc0, x0
|
||||
CSEL CS, y1, acc1, x1
|
||||
CSEL CS, y2, acc2, x2
|
||||
CSEL CS, y3, acc3, x3
|
||||
|
||||
// Write the selected limbs back into s.
STP (x0, x1), 0*16(res_ptr)
|
||||
STP (x2, x3), 1*16(res_ptr)
|
||||
|
||||
RET
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// func p256OrdSqr(res, in *p256OrdElement, n int)
|
||||
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
|
||||
|
@ -353,6 +353,11 @@ func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
|
||||
//go:noescape
|
||||
func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
|
||||
|
||||
// p256OrdReduce ensures s is in the range [0, ord(G)-1].
// It is implemented in assembly and works in place: ord(G) is subtracted
// from s once and the difference is kept only if the subtraction does not
// underflow. A single subtraction is sufficient because 2 * ord(G) > 2²⁵⁶,
// so any four-limb value is below 2 * ord(G).
|
||||
//
|
||||
//go:noescape
|
||||
func p256OrdReduce(s *p256OrdElement)
|
||||
|
||||
// p256Table is a table of the first 16 multiples of a point. Points are stored
|
||||
// at an index offset of -1 so [8]P is at index 7, P is at 0, and [16]P is at 15.
|
||||
// [0]P is the point at infinity and it's not stored.
|
||||
@ -425,21 +430,6 @@ func p256PointDouble6TimesAsm(res, in *SM2P256Point)
|
||||
// Montgomery domain (with R 2²⁵⁶) as four uint64 limbs in little-endian order.
|
||||
type p256OrdElement [4]uint64
|
||||
|
||||
// p256OrdReduce ensures s is in the range [0, ord(G)-1].
|
||||
func p256OrdReduce(s *p256OrdElement) {
|
||||
// Since 2 * ord(G) > 2²⁵⁶, we can just conditionally subtract ord(G),
|
||||
// keeping the result if it doesn't underflow.
|
||||
t0, b := bits.Sub64(s[0], 0x53bbf40939d54123, 0)
|
||||
t1, b := bits.Sub64(s[1], 0x7203df6b21c6052b, b)
|
||||
t2, b := bits.Sub64(s[2], 0xffffffffffffffff, b)
|
||||
t3, b := bits.Sub64(s[3], 0xfffffffeffffffff, b)
|
||||
tMask := b - 1 // zero if subtraction underflowed
|
||||
s[0] ^= (t0 ^ s[0]) & tMask
|
||||
s[1] ^= (t1 ^ s[1]) & tMask
|
||||
s[2] ^= (t2 ^ s[2]) & tMask
|
||||
s[3] ^= (t3 ^ s[3]) & tMask
|
||||
}
|
||||
|
||||
// Add sets q = p1 + p2, and returns q. The points may overlap.
|
||||
func (q *SM2P256Point) Add(r1, r2 *SM2P256Point) *SM2P256Point {
|
||||
var sum, double SM2P256Point
|
||||
|
Loading…
x
Reference in New Issue
Block a user