diff --git a/internal/sm2ec/p256_asm_amd64.s b/internal/sm2ec/p256_asm_amd64.s index 3c5ad79..37f9fa2 100644 --- a/internal/sm2ec/p256_asm_amd64.s +++ b/internal/sm2ec/p256_asm_amd64.s @@ -201,7 +201,29 @@ TEXT ·p256NegCond(SB),NOSPLIT,$0 RET -#define p256sqrReduction() \ +#define p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \ + MOVQ a0, b0 \ + MOVQ a1, b1 \ + MOVQ a2, b2 \ + MOVQ a3, b3 \ + \ // Subtract p256 + SUBQ $-1, a0 \ + SBBQ p256p<>+0x08(SB), a1 \ + SBBQ $-1, a2 \ + SBBQ p256p<>+0x018(SB), a3 \ + SBBQ $0, a4 \ + \ + CMOVQCS b0, a0 \ + CMOVQCS b1, a1 \ + CMOVQCS b2, a2 \ + CMOVQCS b3, a3 \ + \ + MOVQ a0, (8*0)(res) \ + MOVQ a1, (8*1)(res) \ + MOVQ a2, (8*2)(res) \ + MOVQ a3, (8*3)(res) \ + +#define p256SqrMontReduce() \ \ // First reduction step MOVQ acc0, AX \ MOVQ acc0, DX \ @@ -268,30 +290,7 @@ TEXT ·p256NegCond(SB),NOSPLIT,$0 ADCQ acc5, acc1 \ ADCQ y_ptr, acc2 \ ADCQ x_ptr, acc3 \ - ADCQ $0, t0 \ - \ - MOVQ acc0, acc4 \ - MOVQ acc1, acc5 \ - MOVQ acc2, y_ptr \ - MOVQ acc3, t1 \ - \ // Subtract p256 - SUBQ $-1, acc0 \ - SBBQ p256p<>+0x08(SB), acc1 \ - SBBQ $-1, acc2 \ - SBBQ p256p<>+0x018(SB), acc3 \ - SBBQ $0, t0 \ - \ - CMOVQCS acc4, acc0 \ - CMOVQCS acc5, acc1 \ - CMOVQCS y_ptr, acc2 \ - CMOVQCS t1, acc3 \ - \ - MOVQ acc0, (8*0)(res_ptr) \ - MOVQ acc1, (8*1)(res_ptr) \ - MOVQ acc2, (8*2)(res_ptr) \ - MOVQ acc3, (8*3)(res_ptr) \ - MOVQ res_ptr, x_ptr \ - DECQ BX \ + ADCQ $0, t0 \ /* ---------------------------------------*/ // func p256Sqr(res, in *p256Element, n int) @@ -383,7 +382,10 @@ sqrLoop: ADCQ DX, t1 MOVQ t1, x_ptr - p256sqrReduction() + p256SqrMontReduce() + p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr) + MOVQ res_ptr, x_ptr + DECQ BX JNE sqrLoop RET @@ -448,7 +450,10 @@ sqrBMI2: ADCQ AX, y_ptr ADCQ t1, x_ptr - p256sqrReduction() + p256SqrMontReduce() + p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr) + MOVQ res_ptr, x_ptr + DECQ BX JNE sqrBMI2 RET @@ -657,28 +662,7 @@ TEXT 
·p256Mul(SB),NOSPLIT,$0 SBBQ AX, acc0 SBBQ DX, acc1 SBBQ $0, acc2 - // Copy result [255:0] - MOVQ acc4, x_ptr - MOVQ acc5, acc3 - MOVQ acc0, t0 - MOVQ acc1, t1 - // Subtract p256 - SUBQ $-1, acc4 - SBBQ p256p<>+0x08(SB), acc5 - SBBQ $-1, acc0 - SBBQ p256p<>+0x018(SB), acc1 - SBBQ $0, acc2 - - CMOVQCS x_ptr, acc4 - CMOVQCS acc3, acc5 - CMOVQCS t0, acc0 - CMOVQCS t1, acc1 - - MOVQ acc4, (8*0)(res_ptr) - MOVQ acc5, (8*1)(res_ptr) - MOVQ acc0, (8*2)(res_ptr) - MOVQ acc1, (8*3)(res_ptr) - + p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, t1, res_ptr) RET mulBMI2: @@ -836,27 +820,7 @@ mulBMI2: SBBQ AX, acc0 SBBQ DX, acc1 SBBQ $0, acc2 - // Copy result [255:0] - MOVQ acc4, x_ptr - MOVQ acc5, acc3 - MOVQ acc0, t0 - MOVQ acc1, t1 - // Subtract p256 - SUBQ $-1, acc4 - SBBQ p256p<>+0x08(SB), acc5 - SBBQ $-1, acc0 - SBBQ p256p<>+0x018(SB), acc1 - SBBQ $0, acc2 - - CMOVQCS x_ptr, acc4 - CMOVQCS acc3, acc5 - CMOVQCS t0, acc0 - CMOVQCS t1, acc1 - - MOVQ acc4, (8*0)(res_ptr) - MOVQ acc5, (8*1)(res_ptr) - MOVQ acc0, (8*2)(res_ptr) - MOVQ acc1, (8*3)(res_ptr) + p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, t1, res_ptr) RET /* ---------------------------------------*/ @@ -1182,6 +1146,40 @@ loop_select_base_avx2: RET /* ---------------------------------------*/ +#define p256OrdReduceInline(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \ + \// Copy result [255:0] + MOVQ a0, b0 \ + MOVQ a1, b1 \ + MOVQ a2, b2 \ + MOVQ a3, b3 \ + \// Subtract p256ord + SUBQ p256ord<>+0x00(SB), a0 \ + SBBQ p256ord<>+0x08(SB) ,a1 \ + SBBQ p256ord<>+0x10(SB), a2 \ + SBBQ p256ord<>+0x18(SB), a3 \ + SBBQ $0, a4 \ + \ + CMOVQCS b0, a0 \ + CMOVQCS b1, a1 \ + CMOVQCS b2, a2 \ + CMOVQCS b3, a3 \ + \ + MOVQ a0, (8*0)(res) \ + MOVQ a1, (8*1)(res) \ + MOVQ a2, (8*2)(res) \ + MOVQ a3, (8*3)(res) \ + +//func p256OrdReduce(s *p256OrdElement) +TEXT ·p256OrdReduce(SB),NOSPLIT,$0 + MOVQ s+0(FP), res_ptr + MOVQ (8*0)(res_ptr), acc0 + MOVQ (8*1)(res_ptr), acc1 + MOVQ (8*2)(res_ptr), acc2 + MOVQ 
(8*3)(res_ptr), acc3 + XORQ acc4, acc4 + p256OrdReduceInline(acc0, acc1, acc2, acc3, acc4, acc5, x_ptr, y_ptr, t0, res_ptr) + RET + // func p256OrdMul(res, in1, in2 *p256OrdElement) TEXT ·p256OrdMul(SB),NOSPLIT,$0 MOVQ res+0(FP), res_ptr @@ -1432,27 +1430,8 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0 SBBQ AX, acc0 SBBQ DX, acc1 SBBQ $0, acc2 - // Copy result [255:0] - MOVQ acc4, x_ptr - MOVQ acc5, acc3 - MOVQ acc0, t0 - MOVQ acc1, t1 - // Subtract p256 - SUBQ p256ord<>+0x00(SB), acc4 - SBBQ p256ord<>+0x08(SB) ,acc5 - SBBQ p256ord<>+0x10(SB), acc0 - SBBQ p256ord<>+0x18(SB), acc1 - SBBQ $0, acc2 - CMOVQCS x_ptr, acc4 - CMOVQCS acc3, acc5 - CMOVQCS t0, acc0 - CMOVQCS t1, acc1 - - MOVQ acc4, (8*0)(res_ptr) - MOVQ acc5, (8*1)(res_ptr) - MOVQ acc0, (8*2)(res_ptr) - MOVQ acc1, (8*3)(res_ptr) + p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, t1, res_ptr) RET @@ -1649,27 +1628,7 @@ ordMulBMI2: SBBQ DX, acc1 SBBQ $0, acc2 - // Copy result [255:0] - MOVQ acc4, x_ptr - MOVQ acc5, acc3 - MOVQ acc0, t0 - MOVQ acc1, t1 - // Subtract p256 - SUBQ p256ord<>+0x00(SB), acc4 - SBBQ p256ord<>+0x08(SB) ,acc5 - SBBQ p256ord<>+0x10(SB), acc0 - SBBQ p256ord<>+0x18(SB), acc1 - SBBQ $0, acc2 - - CMOVQCS x_ptr, acc4 - CMOVQCS acc3, acc5 - CMOVQCS t0, acc0 - CMOVQCS t1, acc1 - - MOVQ acc4, (8*0)(res_ptr) - MOVQ acc5, (8*1)(res_ptr) - MOVQ acc0, (8*2)(res_ptr) - MOVQ acc1, (8*3)(res_ptr) + p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, t1, res_ptr) RET @@ -1892,26 +1851,7 @@ ordSqrLoop: ADCQ x_ptr, acc3 ADCQ $0, t0 - MOVQ acc0, acc4 - MOVQ acc1, acc5 - MOVQ acc2, y_ptr - MOVQ acc3, t1 - // Subtract p256 - SUBQ p256ord<>+0x00(SB), acc0 - SBBQ p256ord<>+0x08(SB) ,acc1 - SBBQ p256ord<>+0x10(SB), acc2 - SBBQ p256ord<>+0x18(SB), acc3 - SBBQ $0, t0 - - CMOVQCS acc4, acc0 - CMOVQCS acc5, acc1 - CMOVQCS y_ptr, acc2 - CMOVQCS t1, acc3 - - MOVQ acc0, (8*0)(res_ptr) - MOVQ acc1, (8*1)(res_ptr) - MOVQ acc2, (8*2)(res_ptr) - MOVQ acc3, (8*3)(res_ptr) + p256OrdReduceInline(acc0, 
acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr) MOVQ res_ptr, x_ptr DECQ BX JNE ordSqrLoop @@ -2088,26 +2028,7 @@ ordSqrLoopBMI2: ADCQ x_ptr, acc3 ADCQ $0, t0 - MOVQ acc0, acc4 - MOVQ acc1, acc5 - MOVQ acc2, y_ptr - MOVQ acc3, t1 - // Subtract p256 - SUBQ p256ord<>+0x00(SB), acc0 - SBBQ p256ord<>+0x08(SB) ,acc1 - SBBQ p256ord<>+0x10(SB), acc2 - SBBQ p256ord<>+0x18(SB), acc3 - SBBQ $0, t0 - - CMOVQCS acc4, acc0 - CMOVQCS acc5, acc1 - CMOVQCS y_ptr, acc2 - CMOVQCS t1, acc3 - - MOVQ acc0, (8*0)(res_ptr) - MOVQ acc1, (8*1)(res_ptr) - MOVQ acc2, (8*2)(res_ptr) - MOVQ acc3, (8*3)(res_ptr) + p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr) MOVQ res_ptr, x_ptr DECQ BX JNE ordSqrLoopBMI2 diff --git a/internal/sm2ec/p256_asm_arm64.s b/internal/sm2ec/p256_asm_arm64.s index 9ce98de..4a654ab 100644 --- a/internal/sm2ec/p256_asm_arm64.s +++ b/internal/sm2ec/p256_asm_arm64.s @@ -387,6 +387,35 @@ loop_select: STP (y0, y1), 2*16(res_ptr) STP (y2, y3), 3*16(res_ptr) RET + +/* ---------------------------------------*/ +//func p256OrdReduce(s *p256OrdElement) +TEXT ·p256OrdReduce(SB),NOSPLIT,$0 + MOVD s+0(FP), res_ptr + + LDP p256ord<>+0x00(SB), (const0, const1) + LDP p256ord<>+0x10(SB), (const2, const3) + + LDP 0*16(res_ptr), (acc0, acc1) + LDP 1*16(res_ptr), (acc2, acc3) + EOR acc4, acc4, acc4 + + SUBS const0, acc0, y0 + SBCS const1, acc1, y1 + SBCS const2, acc2, y2 + SBCS const3, acc3, y3 + SBCS $0, acc4, acc4 + + CSEL CS, y0, acc0, x0 + CSEL CS, y1, acc1, x1 + CSEL CS, y2, acc2, x2 + CSEL CS, y3, acc3, x3 + + STP (x0, x1), 0*16(res_ptr) + STP (x2, x3), 1*16(res_ptr) + + RET + /* ---------------------------------------*/ // func p256OrdSqr(res, in *p256OrdElement, n int) TEXT ·p256OrdSqr(SB),NOSPLIT,$0 diff --git a/internal/sm2ec/sm2p256_asm.go b/internal/sm2ec/sm2p256_asm.go index c562955..96b5c30 100644 --- a/internal/sm2ec/sm2p256_asm.go +++ b/internal/sm2ec/sm2p256_asm.go @@ -353,6 +353,11 @@ func p256OrdBigToLittle(res *p256OrdElement, in 
*[32]byte) //go:noescape func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement) +// p256OrdReduce ensures s is in the range [0, ord(G)-1]. +// +//go:noescape +func p256OrdReduce(s *p256OrdElement) + // p256Table is a table of the first 16 multiples of a point. Points are stored // at an index offset of -1 so [8]P is at index 7, P is at 0, and [16]P is at 15. // [0]P is the point at infinity and it's not stored. @@ -425,21 +430,6 @@ func p256PointDouble6TimesAsm(res, in *SM2P256Point) // Montgomery domain (with R 2²⁵⁶) as four uint64 limbs in little-endian order. type p256OrdElement [4]uint64 -// p256OrdReduce ensures s is in the range [0, ord(G)-1]. -func p256OrdReduce(s *p256OrdElement) { - // Since 2 * ord(G) > 2²⁵⁶, we can just conditionally subtract ord(G), - // keeping the result if it doesn't underflow. - t0, b := bits.Sub64(s[0], 0x53bbf40939d54123, 0) - t1, b := bits.Sub64(s[1], 0x7203df6b21c6052b, b) - t2, b := bits.Sub64(s[2], 0xffffffffffffffff, b) - t3, b := bits.Sub64(s[3], 0xfffffffeffffffff, b) - tMask := b - 1 // zero if subtraction underflowed - s[0] ^= (t0 ^ s[0]) & tMask - s[1] ^= (t1 ^ s[1]) & tMask - s[2] ^= (t2 ^ s[2]) & tMask - s[3] ^= (t3 ^ s[3]) & tMask -} - // Add sets q = p1 + p2, and returns q. The points may overlap. func (q *SM2P256Point) Add(r1, r2 *SM2P256Point) *SM2P256Point { var sum, double SM2P256Point