mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-26 04:06:18 +08:00
internal/sm2ec: add comment for mont. reduction and others
This commit is contained in:
parent
49513c7420
commit
ba6bd136f9
@ -224,18 +224,19 @@ TEXT ·p256NegCond(SB),NOSPLIT,$0
|
|||||||
MOVQ a3, (8*3)(res) \
|
MOVQ a3, (8*3)(res) \
|
||||||
|
|
||||||
#define p256SqrMontReduce() \
|
#define p256SqrMontReduce() \
|
||||||
\ // First reduction step
|
\ // First reduction step, p = [1, -0x100000000, 0, (1 - 0x100000000), -1] as signed coefficients of [2^256, 2^192, 2^128, 2^64, 2^0]
|
||||||
MOVQ acc0, AX \
|
MOVQ acc0, AX \
|
||||||
MOVQ acc0, DX \
|
MOVQ acc0, DX \
|
||||||
SHLQ $32, AX \
|
SHLQ $32, AX \ // AX = L(acc0 * 2^32), low part
|
||||||
SHRQ $32, DX \
|
SHRQ $32, DX \ // DX = H(acc0 * 2^32), high part
|
||||||
\
|
\ // calculate the positive part first: [1, 0, 0, 1] * acc0 + [0, acc3, acc2, acc1],
|
||||||
ADDQ acc0, acc1 \
|
\ // since (-1) * acc0 + acc0 == 0, the lowest limb is 0 and is dropped directly, with no carry.
|
||||||
ADCQ $0, acc2 \
|
ADDQ acc0, acc1 \ // acc1' = L (acc0 + acc1)
|
||||||
ADCQ $0, acc3 \
|
ADCQ $0, acc2 \ // acc2' = acc2 + carry1
|
||||||
ADCQ $0, acc0 \
|
ADCQ $0, acc3 \ // acc3' = acc3 + carry2
|
||||||
\
|
ADCQ $0, acc0 \ // acc0' = acc0 + carry3
|
||||||
SUBQ AX, acc1 \
|
\// calculate the negative part: [0, -0x100000000, 0, -0x100000000] * acc0
|
||||||
|
SUBQ AX, acc1 \
|
||||||
SBBQ DX, acc2 \
|
SBBQ DX, acc2 \
|
||||||
SBBQ AX, acc3 \
|
SBBQ AX, acc3 \
|
||||||
SBBQ DX, acc0 \
|
SBBQ DX, acc0 \
|
||||||
@ -382,6 +383,7 @@ sqrLoop:
|
|||||||
ADCQ DX, t1
|
ADCQ DX, t1
|
||||||
MOVQ t1, x_ptr
|
MOVQ t1, x_ptr
|
||||||
|
|
||||||
|
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||||
p256SqrMontReduce()
|
p256SqrMontReduce()
|
||||||
p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
|
p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
|
||||||
MOVQ res_ptr, x_ptr
|
MOVQ res_ptr, x_ptr
|
||||||
@ -450,6 +452,7 @@ sqrBMI2:
|
|||||||
ADCQ AX, y_ptr
|
ADCQ AX, y_ptr
|
||||||
ADCQ t1, x_ptr
|
ADCQ t1, x_ptr
|
||||||
|
|
||||||
|
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||||
p256SqrMontReduce()
|
p256SqrMontReduce()
|
||||||
p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
|
p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
|
||||||
MOVQ res_ptr, x_ptr
|
MOVQ res_ptr, x_ptr
|
||||||
@ -1722,17 +1725,20 @@ ordSqrLoop:
|
|||||||
ADCQ AX, y_ptr
|
ADCQ AX, y_ptr
|
||||||
ADCQ DX, t1
|
ADCQ DX, t1
|
||||||
MOVQ t1, x_ptr
|
MOVQ t1, x_ptr
|
||||||
// First reduction step
|
|
||||||
|
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||||
|
// First reduction step, ord = [1, -0x100000000, -1, ord1, ord0] as signed coefficients of [2^256, 2^192, 2^128, 2^64, 2^0]
|
||||||
MOVQ acc0, AX
|
MOVQ acc0, AX
|
||||||
MULQ p256ordK0<>(SB)
|
MULQ p256ordK0<>(SB)
|
||||||
MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64
|
MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64
|
||||||
|
// calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0]
|
||||||
|
// the result is [acc0, acc3, acc2, acc1], last lowest limb is dropped.
|
||||||
MOVQ p256ord<>+0x00(SB), AX
|
MOVQ p256ord<>+0x00(SB), AX
|
||||||
MULQ t0
|
MULQ t0
|
||||||
ADDQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0
|
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
|
||||||
ADCQ $0, DX // DX = carry1 + H(t0 * ord0)
|
ADCQ $0, DX // DX = carry1 + H(t0 * ord0)
|
||||||
MOVQ DX, t1 // t1 = carry1 + H(t0 * ord0)
|
MOVQ DX, t1 // t1 = carry1 + H(t0 * ord0)
|
||||||
MOVQ t0, acc0
|
MOVQ t0, acc0 // acc0 = t0
|
||||||
|
|
||||||
MOVQ p256ord<>+0x08(SB), AX
|
MOVQ p256ord<>+0x08(SB), AX
|
||||||
MULQ t0
|
MULQ t0
|
||||||
@ -1743,7 +1749,7 @@ ordSqrLoop:
|
|||||||
ADCQ DX, acc2
|
ADCQ DX, acc2
|
||||||
ADCQ $0, acc3
|
ADCQ $0, acc3
|
||||||
ADCQ $0, acc0
|
ADCQ $0, acc0
|
||||||
|
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
|
||||||
MOVQ t0, AX
|
MOVQ t0, AX
|
||||||
MOVQ t0, DX
|
MOVQ t0, DX
|
||||||
SHLQ $32, AX
|
SHLQ $32, AX
|
||||||
@ -1918,23 +1924,25 @@ ordSqrLoopBMI2:
|
|||||||
ADCQ AX, y_ptr
|
ADCQ AX, y_ptr
|
||||||
ADCQ t1, x_ptr
|
ADCQ t1, x_ptr
|
||||||
|
|
||||||
// First reduction step
|
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||||
|
// First reduction step, ord = [1, -0x100000000, -1, ord1, ord0] as signed coefficients of [2^256, 2^192, 2^128, 2^64, 2^0]
|
||||||
MOVQ acc0, DX
|
MOVQ acc0, DX
|
||||||
MULXQ p256ordK0<>(SB), t0, AX
|
MULXQ p256ordK0<>(SB), t0, AX
|
||||||
|
// calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0]
|
||||||
|
// the result is [acc0, acc3, acc2, acc1], last lowest limb is dropped.
|
||||||
MOVQ t0, DX // Y = t0 = (k0 * acc0) mod 2^64
|
MOVQ t0, DX // Y = t0 = (k0 * acc0) mod 2^64
|
||||||
MULXQ p256ord<>+0x00(SB), AX, t1
|
MULXQ p256ord<>+0x00(SB), AX, t1
|
||||||
ADDQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0
|
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
|
||||||
ADCQ t1, acc1
|
ADCQ t1, acc1 // (carry2, acc1) = acc1 + H(t0 * ord0) + carry1
|
||||||
MOVQ t0, acc0
|
MOVQ t0, acc0 // acc0 = t0
|
||||||
|
|
||||||
MULXQ p256ord<>+0x08(SB), AX, t1
|
MULXQ p256ord<>+0x08(SB), AX, t1
|
||||||
ADCQ $0, t1 // t1 = carry2 + H(t0*ord1)
|
ADCQ $0, t1 // t1 = carry2 + H(t0*ord1)
|
||||||
ADDQ AX, acc1 // (carry3, acc1) = acc1 + t1 + L(t0*ord1)
|
ADDQ AX, acc1 // (carry3, acc1) = acc1 + L(t0*ord1)
|
||||||
ADCQ t1, acc2
|
ADCQ t1, acc2 // (carry4, acc2) = acc2 + t1 + carry3
|
||||||
ADCQ $0, acc3
|
ADCQ $0, acc3 // (carry5, acc3) = acc3 + carry4
|
||||||
ADCQ $0, acc0
|
ADCQ $0, acc0 // acc0 = t0 + carry5
|
||||||
|
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
|
||||||
MOVQ t0, AX
|
MOVQ t0, AX
|
||||||
//MOVQ t0, DX // This is not required due to t0=DX already
|
//MOVQ t0, DX // This is not required due to t0=DX already
|
||||||
SHLQ $32, AX
|
SHLQ $32, AX
|
||||||
@ -2065,6 +2073,7 @@ ordSqrLoopBMI2:
|
|||||||
#define t3 SI
|
#define t3 SI
|
||||||
#define hlp BP
|
#define hlp BP
|
||||||
/* ---------------------------------------*/
|
/* ---------------------------------------*/
|
||||||
|
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]
|
||||||
TEXT sm2P256SubInternal(SB),NOSPLIT,$0
|
TEXT sm2P256SubInternal(SB),NOSPLIT,$0
|
||||||
XORQ mul0, mul0
|
XORQ mul0, mul0
|
||||||
SUBQ t0, acc4
|
SUBQ t0, acc4
|
||||||
@ -2091,6 +2100,7 @@ TEXT sm2P256SubInternal(SB),NOSPLIT,$0
|
|||||||
|
|
||||||
RET
|
RET
|
||||||
/* ---------------------------------------*/
|
/* ---------------------------------------*/
|
||||||
|
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] * [t3, t2, t1, t0]
|
||||||
TEXT sm2P256MulInternal(SB),NOSPLIT,$8
|
TEXT sm2P256MulInternal(SB),NOSPLIT,$8
|
||||||
CMPB ·supportBMI2+0(SB), $0x01
|
CMPB ·supportBMI2+0(SB), $0x01
|
||||||
JEQ internalMulBMI2
|
JEQ internalMulBMI2
|
||||||
@ -2538,6 +2548,7 @@ internalMulBMI2:
|
|||||||
CMOVQCS t3, acc7 \
|
CMOVQCS t3, acc7 \
|
||||||
|
|
||||||
/* ---------------------------------------*/
|
/* ---------------------------------------*/
|
||||||
|
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4]^2
|
||||||
TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
|
TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
|
||||||
CMPB ·supportBMI2+0(SB), $0x01
|
CMPB ·supportBMI2+0(SB), $0x01
|
||||||
JEQ internalSqrBMI2
|
JEQ internalSqrBMI2
|
||||||
@ -2612,7 +2623,7 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
|
|||||||
ADDQ acc4, t1
|
ADDQ acc4, t1
|
||||||
ADCQ mul0, t2
|
ADCQ mul0, t2
|
||||||
ADCQ DX, t3
|
ADCQ DX, t3
|
||||||
|
// T = [t3, t2,, t1, t0, acc3, acc2, acc1, acc0]
|
||||||
sm2P256SqrReductionInternal()
|
sm2P256SqrReductionInternal()
|
||||||
RET
|
RET
|
||||||
|
|
||||||
@ -2670,12 +2681,13 @@ internalSqrBMI2:
|
|||||||
MULXQ mul1, mul0, acc4
|
MULXQ mul1, mul0, acc4
|
||||||
ADCQ mul0, t2
|
ADCQ mul0, t2
|
||||||
ADCQ acc4, t3
|
ADCQ acc4, t3
|
||||||
|
// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
|
||||||
sm2P256SqrReductionInternal()
|
sm2P256SqrReductionInternal()
|
||||||
|
|
||||||
RET
|
RET
|
||||||
|
|
||||||
/* ---------------------------------------*/
|
/* ---------------------------------------*/
|
||||||
|
// [t3, t2, t1, t0] = 2[acc7, acc6, acc5, acc4]
|
||||||
#define p256MulBy2Inline\
|
#define p256MulBy2Inline\
|
||||||
XORQ mul0, mul0;\
|
XORQ mul0, mul0;\
|
||||||
ADDQ acc4, acc4;\
|
ADDQ acc4, acc4;\
|
||||||
@ -2697,6 +2709,7 @@ internalSqrBMI2:
|
|||||||
CMOVQCS acc6, t2;\
|
CMOVQCS acc6, t2;\
|
||||||
CMOVQCS acc7, t3;
|
CMOVQCS acc7, t3;
|
||||||
/* ---------------------------------------*/
|
/* ---------------------------------------*/
|
||||||
|
// [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] + [t3, t2, t1, t0]
|
||||||
#define p256AddInline \
|
#define p256AddInline \
|
||||||
XORQ mul0, mul0;\
|
XORQ mul0, mul0;\
|
||||||
ADDQ t0, acc4;\
|
ADDQ t0, acc4;\
|
||||||
|
@ -837,6 +837,7 @@ TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
|||||||
|
|
||||||
RET
|
RET
|
||||||
/* ---------------------------------------*/
|
/* ---------------------------------------*/
|
||||||
|
// (x3, x2, x1, x0) = (y3, y2, y1, y0) - (x3, x2, x1, x0)
|
||||||
TEXT sm2P256Subinternal<>(SB),NOSPLIT,$0
|
TEXT sm2P256Subinternal<>(SB),NOSPLIT,$0
|
||||||
SUBS x0, y0, acc0
|
SUBS x0, y0, acc0
|
||||||
SBCS x1, y1, acc1
|
SBCS x1, y1, acc1
|
||||||
@ -858,6 +859,7 @@ TEXT sm2P256Subinternal<>(SB),NOSPLIT,$0
|
|||||||
RET
|
RET
|
||||||
|
|
||||||
/* ---------------------------------------*/
|
/* ---------------------------------------*/
|
||||||
|
// (y3, y2, y1, y0) = (x3, x2, x1, x0) ^ 2
|
||||||
TEXT sm2P256SqrInternal<>(SB),NOSPLIT,$0
|
TEXT sm2P256SqrInternal<>(SB),NOSPLIT,$0
|
||||||
// x[1:] * x[0]
|
// x[1:] * x[0]
|
||||||
MUL x0, x1, acc1
|
MUL x0, x1, acc1
|
||||||
@ -988,6 +990,7 @@ TEXT sm2P256SqrInternal<>(SB),NOSPLIT,$0
|
|||||||
CSEL CS, acc6, acc3, y3
|
CSEL CS, acc6, acc3, y3
|
||||||
RET
|
RET
|
||||||
/* ---------------------------------------*/
|
/* ---------------------------------------*/
|
||||||
|
// (y3, y2, y1, y0) = (x3, x2, x1, x0) * (y3, y2, y1, y0)
|
||||||
TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
|
TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
|
||||||
// y[0] * x
|
// y[0] * x
|
||||||
MUL y0, x0, acc0
|
MUL y0, x0, acc0
|
||||||
@ -1146,6 +1149,7 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
|
|||||||
CSEL CS, acc6, acc3, y3
|
CSEL CS, acc6, acc3, y3
|
||||||
RET
|
RET
|
||||||
/* ---------------------------------------*/
|
/* ---------------------------------------*/
|
||||||
|
// (x3, x2, x1, x0) = 2(y3, y2, y1, y0)
|
||||||
#define p256MulBy2Inline \
|
#define p256MulBy2Inline \
|
||||||
ADDS y0, y0, x0; \
|
ADDS y0, y0, x0; \
|
||||||
ADCS y1, y1, x1; \
|
ADCS y1, y1, x1; \
|
||||||
@ -1364,6 +1368,7 @@ TEXT ·p256PointAddAffineAsm(SB),0,$264-48
|
|||||||
|
|
||||||
RET
|
RET
|
||||||
|
|
||||||
|
// (x3, x2, x1, x0) = (x3, x2, x1, x0) + (y3, y2, y1, y0)
|
||||||
#define p256AddInline \
|
#define p256AddInline \
|
||||||
ADDS y0, x0, x0; \
|
ADDS y0, x0, x0; \
|
||||||
ADCS y1, x1, x1; \
|
ADCS y1, x1, x1; \
|
||||||
|
Loading…
x
Reference in New Issue
Block a user