internal/sm2ec: amd64 refactoring, reduce duplicated code

This commit is contained in:
Sun Yimin 2024-02-29 17:53:28 +08:00 committed by GitHub
parent fabcb6ad30
commit 53ac591635
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 1068 additions and 1747 deletions

View File

@ -20,159 +20,13 @@ TEXT ·p256Sqr(SB),NOSPLIT,$0
JEQ sqrBMI2
sqrLoop:
// y[1:] * y[0]
MOVQ (8*0)(x_ptr), t0
MOVQ (8*1)(x_ptr), AX
MULQ t0
MOVQ AX, acc1
MOVQ DX, acc2
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, acc3
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, acc4
// y[2:] * y[1]
MOVQ (8*1)(x_ptr), t0
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, acc5
// y[3] * y[2]
MOVQ (8*2)(x_ptr), t0
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, y_ptr
XORQ t1, t1
// *2
ADDQ acc1, acc1
ADCQ acc2, acc2
ADCQ acc3, acc3
ADCQ acc4, acc4
ADCQ acc5, acc5
ADCQ y_ptr, y_ptr
ADCQ $0, t1
// Missing products
MOVQ (8*0)(x_ptr), AX
MULQ AX
MOVQ AX, acc0
MOVQ DX, t0
MOVQ (8*1)(x_ptr), AX
MULQ AX
ADDQ t0, acc1
ADCQ AX, acc2
ADCQ $0, DX
MOVQ DX, t0
MOVQ (8*2)(x_ptr), AX
MULQ AX
ADDQ t0, acc3
ADCQ AX, acc4
ADCQ $0, DX
MOVQ DX, t0
MOVQ (8*3)(x_ptr), AX
MULQ AX
ADDQ t0, acc5
ADCQ AX, y_ptr
ADCQ DX, t1
MOVQ t1, x_ptr
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
p256SqrMontReduce()
p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
MOVQ res_ptr, x_ptr
p256SqrRound(t1)
DECQ BX
JNE sqrLoop
RET
sqrBMI2:
XORQ acc0, acc0
XORQ y_ptr, y_ptr
// x[1:] * x[0]
MOVQ (8*0)(x_ptr), DX
MULXQ (8*1)(x_ptr), acc1, acc2
MULXQ (8*2)(x_ptr), AX, acc3
ADOXQ AX, acc2
MULXQ (8*3)(x_ptr), AX, acc4
ADOXQ AX, acc3
ADOXQ y_ptr, acc4
// x[2:] * x[1]
MOVQ (8*1)(x_ptr), DX
MULXQ (8*2)(x_ptr), AX, t1
ADOXQ AX, acc3
MULXQ (8*3)(x_ptr), AX, acc5
ADCXQ t1, AX
ADOXQ AX, acc4
ADCXQ y_ptr, acc5
// y[x] * x[2]
MOVQ (8*2)(x_ptr), DX
MULXQ (8*3)(x_ptr), AX, y_ptr
ADOXQ AX, acc5
ADOXQ acc0, y_ptr
XORQ t1, t1
// *2
ADOXQ acc1, acc1
ADOXQ acc2, acc2
ADOXQ acc3, acc3
ADOXQ acc4, acc4
ADOXQ acc5, acc5
ADOXQ y_ptr, y_ptr
ADOXQ acc0, t1
// Missing products
MOVQ (8*0)(x_ptr), DX
MULXQ DX, acc0, t0
ADCXQ t0, acc1
MOVQ (8*1)(x_ptr), DX
MULXQ DX, AX, t0
ADCXQ AX, acc2
ADCXQ t0, acc3
MOVQ (8*2)(x_ptr), DX
MULXQ DX, AX, t0
ADCXQ AX, acc4
ADCXQ t0, acc5
MOVQ (8*3)(x_ptr), DX
MULXQ DX, AX, x_ptr
ADCXQ AX, y_ptr
ADCXQ t1, x_ptr
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
p256SqrMontReduce()
p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
MOVQ res_ptr, x_ptr
p256SqrRoundAdx(t1)
DECQ BX
JNE sqrBMI2
RET
@ -188,385 +42,14 @@ TEXT ·p256OrdSqr(SB),NOSPLIT,$0
JEQ ordSqrLoopBMI2
ordSqrLoop:
// y[1:] * y[0]
MOVQ (8*0)(x_ptr), t0
MOVQ (8*1)(x_ptr), AX
MULQ t0
MOVQ AX, acc1
MOVQ DX, acc2
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, acc3
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, acc4
// y[2:] * y[1]
MOVQ (8*1)(x_ptr), t0
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, acc5
// y[3] * y[2]
MOVQ (8*2)(x_ptr), t0
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, y_ptr
XORQ t1, t1
// *2
ADDQ acc1, acc1
ADCQ acc2, acc2
ADCQ acc3, acc3
ADCQ acc4, acc4
ADCQ acc5, acc5
ADCQ y_ptr, y_ptr
ADCQ $0, t1
// Missing products
MOVQ (8*0)(x_ptr), AX
MULQ AX
MOVQ AX, acc0
MOVQ DX, t0
MOVQ (8*1)(x_ptr), AX
MULQ AX
ADDQ t0, acc1
ADCQ AX, acc2
ADCQ $0, DX
MOVQ DX, t0
MOVQ (8*2)(x_ptr), AX
MULQ AX
ADDQ t0, acc3
ADCQ AX, acc4
ADCQ $0, DX
MOVQ DX, t0
MOVQ (8*3)(x_ptr), AX
MULQ AX
ADDQ t0, acc5
ADCQ AX, y_ptr
ADCQ DX, t1
MOVQ t1, x_ptr
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
MOVQ acc0, AX
MULQ p256ordK0<>(SB)
MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64
MOVQ p256ord<>+0x00(SB), AX
MULQ t0
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
ADCQ $0, DX // DX = carry1 + H(t0 * ord0)
MOVQ DX, t1 // t1 = carry1 + H(t0 * ord0)
MOVQ t0, acc0 // acc0 = t0
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc2
SBBQ AX, acc3
SBBQ DX, acc0
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc1 // (carry2, acc1) = acc1 + t1
ADCQ $0, DX // DX = carry2 + H(t0*ord1)
ADDQ AX, acc1 // (carry3, acc1) = acc1 + t1 + L(t0*ord1)
ADCQ DX, acc2
ADCQ $0, acc3
ADCQ $0, acc0
// Second reduction step
MOVQ acc1, AX
MULQ p256ordK0<>(SB)
MOVQ AX, t0
MOVQ p256ord<>+0x00(SB), AX
MULQ t0
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1
MOVQ t0, acc1
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc3
SBBQ AX, acc0
SBBQ DX, acc1
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ DX, acc3
ADCQ $0, acc0
ADCQ $0, acc1
// Third reduction step
MOVQ acc2, AX
MULQ p256ordK0<>(SB)
MOVQ AX, t0
MOVQ p256ord<>+0x00(SB), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ t0, acc2
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc0
SBBQ AX, acc1
SBBQ DX, acc2
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ DX, acc0
ADCQ $0, acc1
ADCQ $0, acc2
// Last reduction step
MOVQ acc3, AX
MULQ p256ordK0<>(SB)
MOVQ AX, t0
MOVQ p256ord<>+0x00(SB), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ t0, acc3
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc1
SBBQ AX, acc2
SBBQ DX, acc3
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
ADCQ $0, acc2
ADCQ $0, acc3
XORQ t0, t0
// Add bits [511:256] of the sqr result
ADCQ acc4, acc0
ADCQ acc5, acc1
ADCQ y_ptr, acc2
ADCQ x_ptr, acc3
ADCQ $0, t0
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
MOVQ res_ptr, x_ptr
p256OrdSqrRound(t1)
DECQ BX
JNE ordSqrLoop
RET
ordSqrLoopBMI2:
XORQ acc0, acc0
XORQ y_ptr, y_ptr
// y[1:] * y[0]
MOVQ (8*0)(x_ptr), DX
MULXQ (8*1)(x_ptr), acc1, acc2
MULXQ (8*2)(x_ptr), AX, acc3
ADOXQ AX, acc2
MULXQ (8*3)(x_ptr), AX, acc4
ADOXQ AX, acc3
ADOXQ y_ptr, acc4
// y[2:] * y[1]
MOVQ (8*1)(x_ptr), DX
MULXQ (8*2)(x_ptr), AX, t1
ADOXQ AX, acc3
MULXQ (8*3)(x_ptr), AX, acc5
ADCXQ t1, AX
ADOXQ AX, acc4
ADCXQ y_ptr, acc5
// y[3] * y[2]
MOVQ (8*2)(x_ptr), DX
MULXQ (8*3)(x_ptr), AX, y_ptr
ADOXQ AX, acc5
ADOXQ acc0, y_ptr
XORQ t1, t1
// *2
ADOXQ acc1, acc1
ADOXQ acc2, acc2
ADOXQ acc3, acc3
ADOXQ acc4, acc4
ADOXQ acc5, acc5
ADOXQ y_ptr, y_ptr
ADOXQ acc0, t1
// Missing products
MOVQ (8*0)(x_ptr), DX
MULXQ DX, acc0, t0
ADCXQ t0, acc1
MOVQ (8*1)(x_ptr), DX
MULXQ DX, AX, t0
ADCXQ AX, acc2
ADCXQ t0, acc3
MOVQ (8*2)(x_ptr), DX
MULXQ DX, AX, t0
ADCXQ AX, acc4
ADCXQ t0, acc5
MOVQ (8*3)(x_ptr), DX
MULXQ DX, AX, x_ptr
ADCXQ AX, y_ptr
ADCXQ t1, x_ptr
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
// First reduction step
MOVQ acc0, DX
MULXQ p256ordK0<>(SB), DX, AX
MULXQ p256ord<>+0x00(SB), AX, t0
ADOXQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0
MULXQ p256ord<>+0x08(SB), AX, t1
ADCXQ t0, AX
ADOXQ AX, acc1
MULXQ p256ord<>+0x10(SB), AX, t0
ADCXQ t1, AX
ADOXQ AX, acc2
MULXQ p256ord<>+0x18(SB), AX, acc0
ADCXQ t0, AX
ADOXQ AX, acc3
MOVQ $0, t0
ADCXQ t0, acc0
ADOXQ t0, acc0
// Second reduction step
MOVQ acc1, DX
MULXQ p256ordK0<>(SB), DX, AX
MULXQ p256ord<>+0x00(SB), AX, t0
ADOXQ AX, acc1
MULXQ p256ord<>+0x08(SB), AX, t1
ADCXQ t0, AX
ADOXQ AX, acc2
MULXQ p256ord<>+0x10(SB), AX, t0
ADCXQ t1, AX
ADOXQ AX, acc3
MULXQ p256ord<>+0x18(SB), AX, acc1
ADCXQ t0, AX
ADOXQ AX, acc0
MOVQ $0, t0
ADCXQ t0, acc1
ADOXQ t0, acc1
// Third reduction step
MOVQ acc2, DX
MULXQ p256ordK0<>(SB), DX, AX
MULXQ p256ord<>+0x00(SB), AX, t0
ADOXQ AX, acc2
MULXQ p256ord<>+0x08(SB), AX, t1
ADCXQ t0, AX
ADOXQ AX, acc3
MULXQ p256ord<>+0x10(SB), AX, t0
ADCXQ t1, AX
ADOXQ AX, acc0
MULXQ p256ord<>+0x18(SB), AX, acc2
ADCXQ t0, AX
ADOXQ AX, acc1
MOVQ $0, t0
ADCXQ t0, acc2
ADOXQ t0, acc2
// Last reduction step
MOVQ acc3, DX
MULXQ p256ordK0<>(SB), DX, AX
MULXQ p256ord<>+0x00(SB), AX, t0
ADOXQ AX, acc3
MULXQ p256ord<>+0x08(SB), AX, t1
ADCXQ t0, AX
ADOXQ AX, acc0
MULXQ p256ord<>+0x10(SB), AX, t0
ADCXQ t1, AX
ADOXQ AX, acc1
MULXQ p256ord<>+0x18(SB), AX, acc3
ADCXQ t0, AX
ADOXQ AX, acc2
MOVQ $0, t0
ADCXQ t0, acc3
ADOXQ t0, acc3
XORQ t1, t1
// Add bits [511:256] of the sqr result
ADCXQ acc4, acc0
ADCXQ acc5, acc1
ADCXQ y_ptr, acc2
ADCXQ x_ptr, acc3
ADCXQ t1, t0
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
MOVQ res_ptr, x_ptr
p256OrdSqrRoundAdx(t1)
DECQ BX
JNE ordSqrLoopBMI2
@ -601,33 +84,7 @@ ordSqrLoopBMI2:
#define t2 DI
#define t3 SI
#define hlp BP
/* ---------------------------------------*/
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]
// sm2P256SubInternal: [acc7, acc6, acc5, acc4] -= [t3, t2, t1, t0], with the
// result brought back into range by a constant-time conditional add of the
// prime p (limbs read from p256p<>; $-1 encodes the all-ones limbs).
// Clobbers mul0 and acc0-acc3.
TEXT sm2P256SubInternal(SB),NOSPLIT,$0
XORQ mul0, mul0 // mul0 will record the borrow of the subtraction
SUBQ t0, acc4
SBBQ t1, acc5
SBBQ t2, acc6
SBBQ t3, acc7
SBBQ $0, mul0 // mul0 = 0 if no borrow, all-ones if a borrow occurred
// Save the raw difference before the conditional correction
MOVQ acc4, acc0
MOVQ acc5, acc1
MOVQ acc6, acc2
MOVQ acc7, acc3
// Add p back unconditionally
ADDQ $-1, acc4
ADCQ p256p<>+0x08(SB), acc5
ADCQ $-1, acc6
ADCQ p256p<>+0x018(SB), acc7
// ZF := 1 iff there was no borrow (mul0 & 1 == 0)
ANDQ $1, mul0
// Keep the raw difference when no borrow occurred
CMOVQEQ acc0, acc4
CMOVQEQ acc1, acc5
CMOVQEQ acc2, acc6
CMOVQEQ acc3, acc7
RET
/* ---------------------------------------*/
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] * [t3, t2, t1, t0]
TEXT sm2P256MulInternal(SB),NOSPLIT,$8
@ -746,7 +203,7 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
ADDQ mul0, acc6
ADCQ $0, mul1
MOVQ mul1, acc7
sm2P256MulReductionInternal()
sm2P256MulReductionInline
MOVQ $0, BP
// Add bits [511:256] of the result
@ -767,7 +224,7 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
SBBQ p256p<>+0x018(SB), acc7
SBBQ $0, hlp
// If the result of the subtraction is negative, restore the previous result
CMOVQCS acc0, acc4
CMOVQCS acc0, acc4 // CMOVQCS: Move if below (CF == 1)
CMOVQCS acc1, acc5
CMOVQCS acc2, acc6
CMOVQCS acc3, acc7
@ -847,7 +304,7 @@ internalMulBMI2:
ADDQ mul0, acc6
ADCQ $0, acc7
sm2P256MulReductionInternal()
sm2P256MulReductionInline
MOVQ $0, BP
// Add bits [511:256] of the result
ADCQ acc0, acc4
@ -867,7 +324,7 @@ internalMulBMI2:
SBBQ p256p<>+0x018(SB), acc7
SBBQ $0, hlp
// If the result of the subtraction is negative, restore the previous result
CMOVQCS acc0, acc4
CMOVQCS acc0, acc4 // CMOVQCS: Move if below (CF == 1)
CMOVQCS acc1, acc5
CMOVQCS acc2, acc6
CMOVQCS acc3, acc7
@ -880,140 +337,11 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
CMPB ·supportBMI2+0(SB), $0x01
JEQ internalSqrBMI2
MOVQ acc4, mul0
MULQ acc5
MOVQ mul0, acc1
MOVQ mul1, acc2
MOVQ acc4, mul0
MULQ acc6
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, acc3
MOVQ acc4, mul0
MULQ acc7
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, t0
MOVQ acc5, mul0
MULQ acc6
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, acc0
MOVQ acc5, mul0
MULQ acc7
ADDQ acc0, t0
ADCQ $0, mul1
ADDQ mul0, t0
ADCQ $0, mul1
MOVQ mul1, t1
MOVQ acc6, mul0
MULQ acc7
ADDQ mul0, t1
ADCQ $0, mul1
MOVQ mul1, t2
XORQ t3, t3
// *2
ADDQ acc1, acc1
ADCQ acc2, acc2
ADCQ acc3, acc3
ADCQ t0, t0
ADCQ t1, t1
ADCQ t2, t2
ADCQ $0, t3
// Missing products
MOVQ acc4, mul0
MULQ mul0
MOVQ mul0, acc0
MOVQ DX, acc4
MOVQ acc5, mul0
MULQ mul0
ADDQ acc4, acc1
ADCQ mul0, acc2
ADCQ $0, DX
MOVQ DX, acc4
MOVQ acc6, mul0
MULQ mul0
ADDQ acc4, acc3
ADCQ mul0, t0
ADCQ $0, DX
MOVQ DX, acc4
MOVQ acc7, mul0
MULQ mul0
ADDQ acc4, t1
ADCQ mul0, t2
ADCQ DX, t3
// T = [t3, t2,, t1, t0, acc3, acc2, acc1, acc0]
sm2P256SqrReductionInternal()
p256SqrInternalInline
RET
internalSqrBMI2:
XORQ acc0, acc0
XORQ t2, t2
MOVQ acc4, mul1
MULXQ acc5, acc1, acc2
MULXQ acc6, mul0, acc3
ADOXQ mul0, acc2
MULXQ acc7, mul0, t0
ADOXQ mul0, acc3
ADOXQ t2, t0
MOVQ acc5, mul1
MULXQ acc6, mul0, t3
ADOXQ mul0, acc3
MULXQ acc7, mul0, t1
ADCXQ t3, mul0
ADOXQ mul0, t0
ADCXQ t2, t1
MOVQ acc6, mul1
MULXQ acc7, mul0, t2
ADOXQ mul0, t1
ADOXQ acc0, t2
XORQ t3, t3
// *2
ADOXQ acc1, acc1
ADOXQ acc2, acc2
ADOXQ acc3, acc3
ADOXQ t0, t0
ADOXQ t1, t1
ADOXQ t2, t2
ADOXQ acc0, t3
// Missing products
MOVQ acc4, mul1
MULXQ mul1, acc0, acc4
ADDQ acc4, acc1
MOVQ acc5, mul1
MULXQ mul1, mul0, acc4
ADCXQ mul0, acc2
ADCXQ acc4, acc3
MOVQ acc6, mul1
MULXQ mul1, mul0, acc4
ADCXQ mul0, t0
ADCXQ acc4, t1
MOVQ acc7, mul1
MULXQ mul1, mul0, acc4
ADCXQ mul0, t2
ADCXQ acc4, t3
// T = [t3, t2,, t1, t0, acc3, acc2, acc1, acc0]
sm2P256SqrReductionInternal()
p256SqrInternalInlineAdx
RET
/* ---------------------------------------*/
@ -1073,14 +401,14 @@ internalSqrBMI2:
ADCQ p256p<>+0x08(SB), acc1 \
ADCQ $-1, acc2 \
ADCQ p256p<>+0x018(SB), acc3 \
ADCQ $0, mul0 \
CMOVQNE t0, acc0 \
ADCQ $0, mul0 \ // ZF := 1 if mul0 == 0 after ADC
CMOVQNE t0, acc0 \ // CMOVQNE: Move if not equal (ZF == 0)
CMOVQNE t1, acc1 \
CMOVQNE t2, acc2 \
CMOVQNE t3, acc3 \
\// If condition is 0, keep original value
TESTQ DX, DX \
CMOVQEQ acc4, acc0 \
TESTQ DX, DX \ // ZF := 1 if (DX AND DX == 0)
CMOVQEQ acc4, acc0 \ // CMOVQEQ: Move if equal (ZF == 1)
CMOVQEQ acc5, acc1 \
CMOVQEQ acc6, acc2 \
CMOVQEQ acc7, acc3 \
@ -1098,7 +426,7 @@ internalSqrBMI2:
CALL sm2P256MulInternal(SB) \// x2 * z1ˆ2
\
LDt (x1in) \
CALL sm2P256SubInternal(SB) \// h = u2 - u1
p256SubInline2 \// h = u2 - u1
ST (h) \
\
LDt (z1in) \
@ -1113,7 +441,7 @@ internalSqrBMI2:
ST (s2) \
\
LDt (y1in) \
CALL sm2P256SubInternal(SB) \// r = s2 - s1
p256SubInline2 \// r = s2 - s1
ST (r) \
\
CALL sm2P256SqrInternal(SB) \// rsqr = rˆ2
@ -1138,10 +466,10 @@ internalSqrBMI2:
\
p256MulBy2Inline \// u1 * hˆ2 * 2, inline
LDacc (rsqr) \
CALL sm2P256SubInternal(SB) \// rˆ2 - u1 * hˆ2 * 2
p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2
\
LDt (hcub) \
CALL sm2P256SubInternal(SB) \
p256SubInline2 \
ST (xout) \
\
MOVQ acc4, t0 \
@ -1149,13 +477,13 @@ internalSqrBMI2:
MOVQ acc6, t2 \
MOVQ acc7, t3 \
LDacc (h) \
CALL sm2P256SubInternal(SB) \
p256SubInline2 \
\
LDt (r) \
CALL sm2P256MulInternal(SB) \
\
LDt (s2) \
CALL sm2P256SubInternal(SB) \
p256SubInline2 \
ST (yout) \
\// Load stored values from stack
MOVQ rptr, AX \
@ -1372,36 +700,6 @@ pointaddaffine_avx2:
#undef sel_save
#undef zero_save
// sm2P256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero
// otherwise. It writes to [acc4..acc7], t0 and t1.
TEXT sm2P256IsZero(SB),NOSPLIT,$0
// AX contains a flag that is set if the input is zero.
XORQ AX, AX
MOVQ $1, t1
// Check whether [acc4..acc7] are all zero.
MOVQ acc4, t0
ORQ acc5, t0
ORQ acc6, t0
ORQ acc7, t0
// Set the zero flag if so. (CMOV of a constant to a register doesn't
// appear to be supported in Go. Thus t1 = 1.)
CMOVQEQ t1, AX // CMOVQEQ: Move if equal (ZF == 1, from the ORQ chain above)
// XOR [acc4..acc7] with P and compare with zero again: an input equal
// to P also represents zero, so it must be detected as well.
// ($-1 encodes the all-ones limbs of P.)
XORQ $-1, acc4
XORQ p256p<>+0x08(SB), acc5
XORQ $-1, acc6
XORQ p256p<>+0x018(SB), acc7
ORQ acc5, acc4
ORQ acc6, acc4
ORQ acc7, acc4
// Set the zero flag if so.
CMOVQEQ t1, AX
RET
/* ---------------------------------------*/
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
@ -1449,9 +747,9 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0
ST (s2) \
\
LDt (s1) \
CALL sm2P256SubInternal(SB) \// r = s2 - s1
p256SubInline2 \// r = s2 - s1
ST (r) \
CALL sm2P256IsZero(SB) \
p256IsZeroInline \
MOVQ AX, points_eq \
\
LDacc (z2sqr) \
@ -1464,9 +762,9 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0
ST (u2) \
\
LDt (u1) \
CALL sm2P256SubInternal(SB) \// h = u2 - u1
p256SubInline2 \// h = u2 - u1
ST (h) \
CALL sm2P256IsZero(SB) \
p256IsZeroInline \
ANDQ points_eq, AX \
MOVQ AX, points_eq \
\
@ -1500,10 +798,10 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0
\
p256MulBy2Inline \// u1 * hˆ2 * 2, inline
LDacc (rsqr) \
CALL sm2P256SubInternal(SB) \// rˆ2 - u1 * hˆ2 * 2
p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2
\
LDt (hcub) \
CALL sm2P256SubInternal(SB) \
p256SubInline2 \
ST (xout) \
\
MOVQ acc4, t0 \
@ -1511,13 +809,13 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0
MOVQ acc6, t2 \
MOVQ acc7, t3 \
LDacc (u2) \
CALL sm2P256SubInternal(SB) \
p256SubInline2 \
\
LDt (r) \
CALL sm2P256MulInternal(SB) \
\
LDt (s2) \
CALL sm2P256SubInternal(SB) \
p256SubInline2 \
ST (yout) \
//func p256PointAddAsm(res, in1, in2 *SM2P256Point) int
@ -1668,7 +966,7 @@ pointadd_avx2:
#define calX() \
LDacc (x) \
LDt (zsqr) \
CALL sm2P256SubInternal(SB) \ // X1 - ZZ
p256SubInline2 \ // X1 - ZZ
LDt (m) \
CALL sm2P256MulInternal(SB) \ // M = (X1 - ZZ) * (X1 + ZZ) = X1^2 - ZZ^2
ST (m) \
@ -1717,18 +1015,18 @@ pointadd_avx2:
LDacc (m) \
CALL sm2P256SqrInternal(SB) \ // M^2 = (3 * (X1^2 - ZZ^2))^2
LDt (tmp) \
CALL sm2P256SubInternal(SB) \ // X3 = M^2 - 2*S
p256SubInline2 \ // X3 = M^2 - 2*S
#define calY() \
acc2t \
LDacc (s) \ // S = 4 * X1 * YY = 2 * ((X1+YY)^2 - XX - YYYY)
CALL sm2P256SubInternal(SB) \ // S - X3
p256SubInline2 \ // S - X3
\
LDt (m) \
CALL sm2P256MulInternal(SB) \ // M * (S - X3)
\
LDt (y) \
CALL sm2P256SubInternal(SB) \ // Y3 = M * (S - X3) - 8 * YYYYY
p256SubInline2 \ // Y3 = M * (S - X3) - 8 * YYYYY
#define lastP256PointDouble() \
\ // See https://hyperelliptic.org/EFD/g1p/data/shortw/jacobian-3/doubling/dbl-2007-bl

View File

@ -28,7 +28,7 @@ GLOBL p256ordK0<>(SB), 8, $8
GLOBL p256ord<>(SB), 8, $32
GLOBL p256one<>(SB), 8, $32
#define p256SqrMontReduce() \
#define p256SqrMontReduceInline \
\ // First reduction step, [p3, p2, p1, p0] = [1, -0x100000000, 0, (1 - 0x100000000), -1]
MOVQ acc0, AX \
MOVQ acc0, DX \
@ -114,7 +114,7 @@ GLOBL p256one<>(SB), 8, $32
SBBQ p256p<>+0x018(SB), a3 \
SBBQ $0, a4 \
\
CMOVQCS b0, a0 \
CMOVQCS b0, a0 \ // CMOVQCS: Move if below (CF == 1)
CMOVQCS b1, a1 \
CMOVQCS b2, a2 \
CMOVQCS b3, a3 \
@ -138,7 +138,7 @@ GLOBL p256one<>(SB), 8, $32
SBBQ p256ord<>+0x18(SB), a3 \
SBBQ $0, a4 \
\
CMOVQCS b0, a0 \
CMOVQCS b0, a0 \ // CMOVQCS: Move if below (CF == 1)
CMOVQCS b1, a1 \
CMOVQCS b2, a2 \
CMOVQCS b3, a3 \
@ -148,7 +148,7 @@ GLOBL p256one<>(SB), 8, $32
MOVQ a2, (8*2)(res) \
MOVQ a3, (8*3)(res)
#define sm2P256SqrReductionInternal() \
#define sm2P256SqrReductionInline \
\ // First reduction step
MOVQ acc0, mul0 \
MOVQ acc0, mul1 \
@ -232,12 +232,12 @@ GLOBL p256one<>(SB), 8, $32
SBBQ p256p<>+0x018(SB), acc7\
SBBQ $0, mul0 \
\ // If the result of the subtraction is negative, restore the previous result
CMOVQCS t0, acc4 \
CMOVQCS t0, acc4 \ // CMOVQCS: Move if below (CF == 1)
CMOVQCS t1, acc5 \
CMOVQCS t2, acc6 \
CMOVQCS t3, acc7
#define sm2P256MulReductionInternal() \
#define sm2P256MulReductionInline \
\// First reduction step
MOVQ acc0, mul0 \
MOVQ acc0, mul1 \
@ -304,19 +304,19 @@ GLOBL p256one<>(SB), 8, $32
ADCQ $0, acc3
#define p256PointDoubleInit() \
MOVOU (16*0)(BX), X0 \
MOVOU (16*1)(BX), X1 \
MOVOU (16*2)(BX), X2 \
MOVOU (16*3)(BX), X3 \
MOVOU (16*4)(BX), X4 \
MOVOU (16*5)(BX), X5 \
MOVOU (16*0)(BX), X0;\
MOVOU (16*1)(BX), X1;\
MOVOU (16*2)(BX), X2;\
MOVOU (16*3)(BX), X3;\
MOVOU (16*4)(BX), X4;\
MOVOU (16*5)(BX), X5;\
\
MOVOU X0, x(16*0) \
MOVOU X1, x(16*1) \
MOVOU X2, y(16*0) \
MOVOU X3, y(16*1) \
MOVOU X4, z(16*0) \
MOVOU X5, z(16*1)
MOVOU X0, x(16*0);\
MOVOU X1, x(16*1);\
MOVOU X2, y(16*0);\
MOVOU X3, y(16*1);\
MOVOU X4, z(16*0);\
MOVOU X5, z(16*1);
/* ---------------------------------------*/
// [t3, t2, t1, t0] = 2[acc7, acc6, acc5, acc4]
@ -336,7 +336,7 @@ GLOBL p256one<>(SB), 8, $32
SBBQ $-1, t2;\
SBBQ p256p<>+0x018(SB), t3;\
SBBQ $0, mul0;\
CMOVQCS acc4, t0;\
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
@ -359,7 +359,7 @@ GLOBL p256one<>(SB), 8, $32
SBBQ $-1, acc6;\
SBBQ p256p<>+0x018(SB), acc7;\
SBBQ $0, mul0;\
CMOVQCS t0, acc4;\
CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS t1, acc5;\
CMOVQCS t2, acc6;\
CMOVQCS t3, acc7;
@ -386,7 +386,7 @@ GLOBL p256one<>(SB), 8, $32
SBBQ $-1, acc6;\
SBBQ p256p<>+0x018(SB), acc7;\
SBBQ $0, mul0;\
CMOVQCS t0, acc4;\
CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS t1, acc5;\
CMOVQCS t2, acc6;\
CMOVQCS t3, acc7;\
@ -405,7 +405,7 @@ GLOBL p256one<>(SB), 8, $32
SBBQ $-1, t2;\
SBBQ p256p<>+0x018(SB), t3;\
SBBQ $0, mul0;\
CMOVQCS acc4, t0;\
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
@ -428,7 +428,718 @@ GLOBL p256one<>(SB), 8, $32
SBBQ $-1, t2;\
SBBQ p256p<>+0x018(SB), t3;\
SBBQ $0, mul0;\
CMOVQCS acc4, t0;\
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
/* ---------------------------------------*/
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]
#define p256SubInline2 \
\// mul0 records the borrow: 0 if none, all-ones after a borrow.
XORQ mul0, mul0;\
SUBQ t0, acc4;\
SBBQ t1, acc5;\
SBBQ t2, acc6;\
SBBQ t3, acc7;\
SBBQ $0, mul0;\
\// Keep a copy of the raw difference before the conditional correction.
MOVQ acc4, acc0;\
MOVQ acc5, acc1;\
MOVQ acc6, acc2;\
MOVQ acc7, acc3;\
\// Add the prime p back unconditionally ($-1 encodes its all-ones limbs).
ADDQ $-1, acc4;\
ADCQ p256p<>+0x08(SB), acc5;\
ADCQ $-1, acc6;\
ADCQ p256p<>+0x018(SB), acc7;\
\// ZF := 1 iff there was no borrow; if so, keep the raw difference.
ANDQ $1, mul0;\
CMOVQEQ acc0, acc4;\ // CMOVQEQ: Move if equal (ZF == 1)
CMOVQEQ acc1, acc5;\
CMOVQEQ acc2, acc6;\
CMOVQEQ acc3, acc7;\
/* ---------------------------------------*/
// p256SqrRound(t1): one iteration of p256Sqr (generic MULQ path).
// Squares the 256-bit value at (x_ptr) into the 512-bit
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0], Montgomery-reduces
// it (p256SqrMontReduceInline), applies the final conditional subtraction
// (p256PrimReduce), stores the result at (res_ptr), and points x_ptr at the
// result for the next iteration. t1 names a scratch register.
#define p256SqrRound(t1) \
\// y[1:] * y[0]
MOVQ (8*0)(x_ptr), t0;\
\
MOVQ (8*1)(x_ptr), AX;\
MULQ t0;\
MOVQ AX, acc1;\
MOVQ DX, acc2;\
\
MOVQ (8*2)(x_ptr), AX;\
MULQ t0;\
ADDQ AX, acc2;\
ADCQ $0, DX;\
MOVQ DX, acc3;\
\
MOVQ (8*3)(x_ptr), AX;\
MULQ t0;\
ADDQ AX, acc3;\
ADCQ $0, DX;\
MOVQ DX, acc4;\
\// y[2:] * y[1]
MOVQ (8*1)(x_ptr), t0;\
\
MOVQ (8*2)(x_ptr), AX;\
MULQ t0;\
ADDQ AX, acc3;\
ADCQ $0, DX;\
MOVQ DX, t1;\
\
MOVQ (8*3)(x_ptr), AX;\
MULQ t0;\
ADDQ t1, acc4;\
ADCQ $0, DX;\
ADDQ AX, acc4;\
ADCQ $0, DX;\
MOVQ DX, acc5;\
\// y[3] * y[2]
MOVQ (8*2)(x_ptr), t0;\
\
MOVQ (8*3)(x_ptr), AX;\
MULQ t0;\
ADDQ AX, acc5;\
ADCQ $0, DX;\
MOVQ DX, y_ptr;\
XORQ t1, t1;\
\// *2 (double the off-diagonal partial products)
ADDQ acc1, acc1;\
ADCQ acc2, acc2;\
ADCQ acc3, acc3;\
ADCQ acc4, acc4;\
ADCQ acc5, acc5;\
ADCQ y_ptr, y_ptr;\
ADCQ $0, t1;\
\// Missing products (the squared diagonal terms)
MOVQ (8*0)(x_ptr), AX;\
MULQ AX;\
MOVQ AX, acc0;\
MOVQ DX, t0;\
\
MOVQ (8*1)(x_ptr), AX;\
MULQ AX;\
ADDQ t0, acc1;\
ADCQ AX, acc2;\
ADCQ $0, DX;\
MOVQ DX, t0;\
\
MOVQ (8*2)(x_ptr), AX;\
MULQ AX;\
ADDQ t0, acc3;\
ADCQ AX, acc4;\
ADCQ $0, DX;\
MOVQ DX, t0;\
\
MOVQ (8*3)(x_ptr), AX;\
MULQ AX;\
ADDQ t0, acc5;\
ADCQ AX, y_ptr;\
ADCQ DX, t1;\
MOVQ t1, x_ptr;\
\// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
p256SqrMontReduceInline;\
p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\
MOVQ res_ptr, x_ptr;
/* ---------------------------------------*/
// p256SqrRoundAdx(t1): one iteration of p256Sqr using BMI2/ADX
// (MULX plus the independent ADCX/ADOX carry chains). Same contract as
// p256SqrRound: squares (x_ptr), Montgomery-reduces, stores at (res_ptr)
// and points x_ptr at the result. t1 names a scratch register.
#define p256SqrRoundAdx(t1) \
XORQ acc0, acc0;\
XORQ y_ptr, y_ptr;\
\// x[1:] * x[0]
MOVQ (8*0)(x_ptr), DX;\
MULXQ (8*1)(x_ptr), acc1, acc2;\
\
MULXQ (8*2)(x_ptr), AX, acc3;\
ADOXQ AX, acc2;\
\
MULXQ (8*3)(x_ptr), AX, acc4;\
ADOXQ AX, acc3;\
ADOXQ y_ptr, acc4;\
\
\// x[2:] * x[1]
MOVQ (8*1)(x_ptr), DX;\
MULXQ (8*2)(x_ptr), AX, t1;\
ADOXQ AX, acc3;\
\
MULXQ (8*3)(x_ptr), AX, acc5;\
ADCXQ t1, AX;\
ADOXQ AX, acc4;\
ADCXQ y_ptr, acc5;\
\
\// x[3] * x[2]
MOVQ (8*2)(x_ptr), DX;\
MULXQ (8*3)(x_ptr), AX, y_ptr ;\
ADOXQ AX, acc5;\
ADOXQ acc0, y_ptr;\
\
XORQ t1, t1;\
\
\// *2 (double the off-diagonal partial products)
ADOXQ acc1, acc1;\
ADOXQ acc2, acc2;\
ADOXQ acc3, acc3;\
ADOXQ acc4, acc4;\
ADOXQ acc5, acc5;\
ADOXQ y_ptr, y_ptr;\
ADOXQ acc0, t1;\
\
\// Missing products (the squared diagonal terms)
MOVQ (8*0)(x_ptr), DX;\
MULXQ DX, acc0, t0;\
ADCXQ t0, acc1;\
\
MOVQ (8*1)(x_ptr), DX;\
MULXQ DX, AX, t0;\
ADCXQ AX, acc2;\
ADCXQ t0, acc3;\
\
MOVQ (8*2)(x_ptr), DX;\
MULXQ DX, AX, t0 ;\
ADCXQ AX, acc4;\
ADCXQ t0, acc5;\
\
MOVQ (8*3)(x_ptr), DX;\
MULXQ DX, AX, x_ptr;\
ADCXQ AX, y_ptr;\
ADCXQ t1, x_ptr;\
\
\// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
p256SqrMontReduceInline;\
p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\
MOVQ res_ptr, x_ptr;
/* ---------------------------------------*/
// p256OrdSqrRound(t1): one iteration of p256OrdSqr (generic MULQ path).
// Squares the 256-bit value at (x_ptr) into the 512-bit
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0], performs four
// Montgomery reduction steps modulo the group order (p256ord<>, with
// k0 = p256ordK0<>), folds in bits [511:256] of T, applies the final
// conditional subtraction (p256OrdReduceInline), stores the result at
// (res_ptr), and points x_ptr at the result. t1 names a scratch register.
#define p256OrdSqrRound(t1) \
\// y[1:] * y[0]
MOVQ (8*0)(x_ptr), t0;\
\
MOVQ (8*1)(x_ptr), AX;\
MULQ t0;\
MOVQ AX, acc1;\
MOVQ DX, acc2;\
\
MOVQ (8*2)(x_ptr), AX;\
MULQ t0;\
ADDQ AX, acc2;\
ADCQ $0, DX;\
MOVQ DX, acc3;\
\
MOVQ (8*3)(x_ptr), AX;\
MULQ t0;\
ADDQ AX, acc3;\
ADCQ $0, DX;\
MOVQ DX, acc4;\
\// y[2:] * y[1]
MOVQ (8*1)(x_ptr), t0;\
\
MOVQ (8*2)(x_ptr), AX;\
MULQ t0;\
ADDQ AX, acc3;\
ADCQ $0, DX;\
MOVQ DX, t1;\
\
MOVQ (8*3)(x_ptr), AX;\
MULQ t0;\
ADDQ t1, acc4;\
ADCQ $0, DX;\
ADDQ AX, acc4;\
ADCQ $0, DX;\
MOVQ DX, acc5;\
\// y[3] * y[2]
MOVQ (8*2)(x_ptr), t0;\
\
MOVQ (8*3)(x_ptr), AX;\
MULQ t0;\
ADDQ AX, acc5;\
ADCQ $0, DX;\
MOVQ DX, y_ptr;\
XORQ t1, t1;\
\// *2 (double the off-diagonal partial products)
ADDQ acc1, acc1;\
ADCQ acc2, acc2;\
ADCQ acc3, acc3;\
ADCQ acc4, acc4;\
ADCQ acc5, acc5;\
ADCQ y_ptr, y_ptr;\
ADCQ $0, t1;\
\// Missing products (the squared diagonal terms)
MOVQ (8*0)(x_ptr), AX;\
MULQ AX;\
MOVQ AX, acc0;\
MOVQ DX, t0;\
\
MOVQ (8*1)(x_ptr), AX;\
MULQ AX;\
ADDQ t0, acc1;\
ADCQ AX, acc2;\
ADCQ $0, DX;\
MOVQ DX, t0;\
\
MOVQ (8*2)(x_ptr), AX;\
MULQ AX;\
ADDQ t0, acc3;\
ADCQ AX, acc4;\
ADCQ $0, DX;\
MOVQ DX, t0;\
\
MOVQ (8*3)(x_ptr), AX;\
MULQ AX;\
ADDQ t0, acc5;\
ADCQ AX, y_ptr;\
ADCQ DX, t1;\
MOVQ t1, x_ptr;\
\
\// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
\// First reduction step
MOVQ acc0, AX;\
MULQ p256ordK0<>(SB);\
MOVQ AX, t0;\ // Y = t0 = (k0 * acc0) mod 2^64
\
MOVQ p256ord<>+0x00(SB), AX;\
MULQ t0;\
ADDQ AX, acc0;\ // (carry1, acc0) = acc0 + L(t0 * ord0)
ADCQ $0, DX;\ // DX = carry1 + H(t0 * ord0)
MOVQ DX, t1;\ // t1 = carry1 + H(t0 * ord0)
MOVQ t0, acc0;\ // acc0 = t0
\
\// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
MOVQ t0, AX;\
MOVQ t0, DX;\
SHLQ $32, AX;\
SHRQ $32, DX;\
\
SUBQ t0, acc2;\
SBBQ AX, acc3;\
SBBQ DX, acc0;\
\
MOVQ p256ord<>+0x08(SB), AX;\
MULQ t0;\
ADDQ t1, acc1;\ // (carry2, acc1) = acc1 + t1
ADCQ $0, DX;\ // DX = carry2 + H(t0*ord1)
\
ADDQ AX, acc1;\ // (carry3, acc1) = acc1 + t1 + L(t0*ord1)
ADCQ DX, acc2;\
ADCQ $0, acc3;\
ADCQ $0, acc0;\
\
\// Second reduction step
MOVQ acc1, AX;\
MULQ p256ordK0<>(SB);\
MOVQ AX, t0;\
\
MOVQ p256ord<>+0x00(SB), AX;\
MULQ t0;\
ADDQ AX, acc1;\
ADCQ $0, DX;\
MOVQ DX, t1;\
MOVQ t0, acc1;\
\
MOVQ t0, AX;\
MOVQ t0, DX;\
SHLQ $32, AX;\
SHRQ $32, DX;\
\
SUBQ t0, acc3;\
SBBQ AX, acc0;\
SBBQ DX, acc1;\
\
MOVQ p256ord<>+0x08(SB), AX;\
MULQ t0;\
ADDQ t1, acc2;\
ADCQ $0, DX;\
\
ADDQ AX, acc2;\
ADCQ DX, acc3;\
ADCQ $0, acc0;\
ADCQ $0, acc1;\
\
\// Third reduction step
MOVQ acc2, AX;\
MULQ p256ordK0<>(SB);\
MOVQ AX, t0;\
\
MOVQ p256ord<>+0x00(SB), AX;\
MULQ t0;\
ADDQ AX, acc2;\
ADCQ $0, DX;\
MOVQ DX, t1;\
MOVQ t0, acc2;\
\
MOVQ t0, AX;\
MOVQ t0, DX;\
SHLQ $32, AX;\
SHRQ $32, DX;\
\
SUBQ t0, acc0;\
SBBQ AX, acc1;\
SBBQ DX, acc2;\
\
MOVQ p256ord<>+0x08(SB), AX;\
MULQ t0;\
ADDQ t1, acc3;\
ADCQ $0, DX;\
\
ADDQ AX, acc3;\
ADCQ DX, acc0;\
ADCQ $0, acc1;\
ADCQ $0, acc2;\
\
\// Last reduction step
MOVQ acc3, AX;\
MULQ p256ordK0<>(SB);\
MOVQ AX, t0;\
\
MOVQ p256ord<>+0x00(SB), AX;\
MULQ t0;\
ADDQ AX, acc3;\
ADCQ $0, DX;\
MOVQ DX, t1;\
MOVQ t0, acc3;\
\
MOVQ t0, AX;\
MOVQ t0, DX;\
SHLQ $32, AX;\
SHRQ $32, DX;\
\
SUBQ t0, acc1;\
SBBQ AX, acc2;\
SBBQ DX, acc3;\
\
MOVQ p256ord<>+0x08(SB), AX;\
MULQ t0;\
ADDQ t1, acc0;\
ADCQ $0, DX;\
\
ADDQ AX, acc0;\
ADCQ DX, acc1;\
ADCQ $0, acc2;\
ADCQ $0, acc3;\
XORQ t0, t0;\
\// Add bits [511:256] of the sqr result
ADCQ acc4, acc0;\
ADCQ acc5, acc1;\
ADCQ y_ptr, acc2;\
ADCQ x_ptr, acc3;\
ADCQ $0, t0;\
\
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\
MOVQ res_ptr, x_ptr;
/* ---------------------------------------*/
// p256OrdSqrRoundAdx(t1): one iteration of p256OrdSqr using BMI2/ADX
// (MULX plus the independent ADCX/ADOX carry chains). Same contract as
// p256OrdSqrRound: squares (x_ptr), does four Montgomery reduction steps
// modulo the group order (p256ord<>, k0 = p256ordK0<>), folds in bits
// [511:256], applies p256OrdReduceInline, stores at (res_ptr) and points
// x_ptr at the result. t1 names a scratch register.
#define p256OrdSqrRoundAdx(t1) \
XORQ acc0, acc0;\
XORQ y_ptr, y_ptr;\
\// y[1:] * y[0]
MOVQ (8*0)(x_ptr), DX;\
MULXQ (8*1)(x_ptr), acc1, acc2 ;\
\
MULXQ (8*2)(x_ptr), AX, acc3;\
ADOXQ AX, acc2;\
\
MULXQ (8*3)(x_ptr), AX, acc4;\
ADOXQ AX, acc3;\
ADOXQ y_ptr, acc4;\
\
\// y[2:] * y[1]
MOVQ (8*1)(x_ptr), DX;\
MULXQ (8*2)(x_ptr), AX, t1;\
ADOXQ AX, acc3;\
\
MULXQ (8*3)(x_ptr), AX, acc5;\
ADCXQ t1, AX;\
ADOXQ AX, acc4;\
ADCXQ y_ptr, acc5;\
\
\// y[3] * y[2]
MOVQ (8*2)(x_ptr), DX;\
MULXQ (8*3)(x_ptr), AX, y_ptr;\
ADOXQ AX, acc5;\
ADOXQ acc0, y_ptr;\
\
XORQ t1, t1;\
\// *2 (double the off-diagonal partial products)
ADOXQ acc1, acc1;\
ADOXQ acc2, acc2;\
ADOXQ acc3, acc3;\
ADOXQ acc4, acc4;\
ADOXQ acc5, acc5;\
ADOXQ y_ptr, y_ptr;\
ADOXQ acc0, t1;\
\
\// Missing products (the squared diagonal terms)
MOVQ (8*0)(x_ptr), DX;\
MULXQ DX, acc0, t0;\
ADCXQ t0, acc1;\
\
MOVQ (8*1)(x_ptr), DX;\
MULXQ DX, AX, t0;\
ADCXQ AX, acc2;\
ADCXQ t0, acc3;\
\
MOVQ (8*2)(x_ptr), DX;\
MULXQ DX, AX, t0 ;\
ADCXQ AX, acc4;\
ADCXQ t0, acc5;\
\
MOVQ (8*3)(x_ptr), DX;\
MULXQ DX, AX, x_ptr;\
ADCXQ AX, y_ptr;\
ADCXQ t1, x_ptr;\
\
\// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
\// First reduction step
MOVQ acc0, DX;\
MULXQ p256ordK0<>(SB), DX, AX;\
\
MULXQ p256ord<>+0x00(SB), AX, t0;\
ADOXQ AX, acc0 ;\// (carry1, acc0) = acc0 + t0 * ord0
\
MULXQ p256ord<>+0x08(SB), AX, t1;\
ADCXQ t0, AX;\
ADOXQ AX, acc1;\
\
MULXQ p256ord<>+0x10(SB), AX, t0;\
ADCXQ t1, AX;\
ADOXQ AX, acc2;\
\
MULXQ p256ord<>+0x18(SB), AX, acc0;\
ADCXQ t0, AX;\
ADOXQ AX, acc3;\
MOVQ $0, t0;\
ADCXQ t0, acc0;\
ADOXQ t0, acc0;\
\
\// Second reduction step
MOVQ acc1, DX;\
MULXQ p256ordK0<>(SB), DX, AX;\
\
MULXQ p256ord<>+0x00(SB), AX, t0;\
ADOXQ AX, acc1;\
\
MULXQ p256ord<>+0x08(SB), AX, t1;\
ADCXQ t0, AX;\
ADOXQ AX, acc2;\
\
MULXQ p256ord<>+0x10(SB), AX, t0;\
ADCXQ t1, AX;\
ADOXQ AX, acc3;\
\
MULXQ p256ord<>+0x18(SB), AX, acc1;\
ADCXQ t0, AX;\
ADOXQ AX, acc0;\
MOVQ $0, t0;\
ADCXQ t0, acc1;\
ADOXQ t0, acc1;\
\
\// Third reduction step
MOVQ acc2, DX;\
MULXQ p256ordK0<>(SB), DX, AX;\
\
MULXQ p256ord<>+0x00(SB), AX, t0;\
ADOXQ AX, acc2;\
\
MULXQ p256ord<>+0x08(SB), AX, t1;\
ADCXQ t0, AX;\
ADOXQ AX, acc3;\
\
MULXQ p256ord<>+0x10(SB), AX, t0;\
ADCXQ t1, AX;\
ADOXQ AX, acc0;\
\
MULXQ p256ord<>+0x18(SB), AX, acc2;\
ADCXQ t0, AX;\
ADOXQ AX, acc1;\
MOVQ $0, t0;\
ADCXQ t0, acc2;\
ADOXQ t0, acc2;\
\
\// Last reduction step
MOVQ acc3, DX;\
MULXQ p256ordK0<>(SB), DX, AX;\
\
MULXQ p256ord<>+0x00(SB), AX, t0;\
ADOXQ AX, acc3;\
\
MULXQ p256ord<>+0x08(SB), AX, t1;\
ADCXQ t0, AX;\
ADOXQ AX, acc0;\
\
MULXQ p256ord<>+0x10(SB), AX, t0;\
ADCXQ t1, AX;\
ADOXQ AX, acc1;\
\
MULXQ p256ord<>+0x18(SB), AX, acc3;\
ADCXQ t0, AX;\
ADOXQ AX, acc2;\
MOVQ $0, t0;\
ADCXQ t0, acc3;\
ADOXQ t0, acc3;\
\
XORQ t1, t1;\
\// Add bits [511:256] of the sqr result
ADCXQ acc4, acc0;\
ADCXQ acc5, acc1;\
ADCXQ y_ptr, acc2;\
ADCXQ x_ptr, acc3;\
ADCXQ t1, t0;\
\
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\
MOVQ res_ptr, x_ptr;
// p256SqrInternalInline: square [acc7, acc6, acc5, acc4] into the 512-bit
// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0] using the generic MULQ path
// (mul0/mul1 alias AX/DX, per the MULQ result convention), then reduce via
// sm2P256SqrReductionInline, leaving the result in [acc7, acc6, acc5, acc4].
#define p256SqrInternalInline \
MOVQ acc4, mul0;\
MULQ acc5;\
MOVQ mul0, acc1;\
MOVQ mul1, acc2;\
\
MOVQ acc4, mul0;\
MULQ acc6;\
ADDQ mul0, acc2;\
ADCQ $0, mul1;\
MOVQ mul1, acc3;\
\
MOVQ acc4, mul0;\
MULQ acc7;\
ADDQ mul0, acc3;\
ADCQ $0, mul1;\
MOVQ mul1, t0;\
\
MOVQ acc5, mul0;\
MULQ acc6;\
ADDQ mul0, acc3;\
ADCQ $0, mul1;\
MOVQ mul1, acc0;\
\
MOVQ acc5, mul0;\
MULQ acc7;\
ADDQ acc0, t0;\
ADCQ $0, mul1;\
ADDQ mul0, t0;\
ADCQ $0, mul1;\
MOVQ mul1, t1;\
\
MOVQ acc6, mul0;\
MULQ acc7;\
ADDQ mul0, t1;\
ADCQ $0, mul1;\
MOVQ mul1, t2;\
XORQ t3, t3;\
\// *2 (double the off-diagonal partial products)
ADDQ acc1, acc1;\
ADCQ acc2, acc2;\
ADCQ acc3, acc3;\
ADCQ t0, t0;\
ADCQ t1, t1;\
ADCQ t2, t2;\
ADCQ $0, t3;\
\// Missing products (the squared diagonal terms)
MOVQ acc4, mul0;\
MULQ mul0;\
MOVQ mul0, acc0;\
MOVQ mul1, acc4;\
\
MOVQ acc5, mul0;\
MULQ mul0;\
ADDQ acc4, acc1;\
ADCQ mul0, acc2;\
ADCQ $0, mul1;\
MOVQ mul1, acc4;\
\
MOVQ acc6, mul0;\
MULQ mul0;\
ADDQ acc4, acc3;\
ADCQ mul0, t0;\
ADCQ $0, mul1;\
MOVQ mul1, acc4;\
\
MOVQ acc7, mul0;\
MULQ mul0;\
ADDQ acc4, t1;\
ADCQ mul0, t2;\
ADCQ mul1, t3;\
\// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
sm2P256SqrReductionInline;
// p256SqrInternalInlineAdx: BMI2/ADX variant of p256SqrInternalInline.
// Squares [acc7, acc6, acc5, acc4] into the 512-bit
// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0] using MULX with the
// independent ADCX/ADOX carry chains, then reduces via
// sm2P256SqrReductionInline.
#define p256SqrInternalInlineAdx \
XORQ acc0, acc0;\
XORQ t2, t2;\
MOVQ acc4, mul1;\
MULXQ acc5, acc1, acc2;\
\
MULXQ acc6, mul0, acc3;\
ADOXQ mul0, acc2;\
\
MULXQ acc7, mul0, t0;\
ADOXQ mul0, acc3;\
ADOXQ t2, t0;\
\
MOVQ acc5, mul1;\
MULXQ acc6, mul0, t3;\
ADOXQ mul0, acc3;\
\
MULXQ acc7, mul0, t1;\
ADCXQ t3, mul0;\
ADOXQ mul0, t0;\
ADCXQ t2, t1;\
\
MOVQ acc6, mul1;\
MULXQ acc7, mul0, t2;\
ADOXQ mul0, t1;\
ADOXQ acc0, t2;\
XORQ t3, t3;\
\
\// *2 (double the off-diagonal partial products)
ADOXQ acc1, acc1;\
ADOXQ acc2, acc2;\
ADOXQ acc3, acc3;\
ADOXQ t0, t0;\
ADOXQ t1, t1;\
ADOXQ t2, t2;\
ADOXQ acc0, t3;\
\
\// Missing products (the squared diagonal terms)
MOVQ acc4, mul1;\
MULXQ mul1, acc0, acc4;\
ADDQ acc4, acc1;\
\
MOVQ acc5, mul1;\
MULXQ mul1, mul0, acc4;\
ADCXQ mul0, acc2;\
ADCXQ acc4, acc3;\
\
MOVQ acc6, mul1;\
MULXQ mul1, mul0, acc4;\
ADCXQ mul0, t0;\
ADCXQ acc4, t1;\
\
MOVQ acc7, mul1;\
MULXQ mul1, mul0, acc4;\
ADCXQ mul0, t2;\
ADCXQ acc4, t3;\
\// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
sm2P256SqrReductionInline;
// p256IsZeroInline returns 1 in AX if [acc4..acc7] represents zero and zero
// otherwise. It writes to [acc4..acc7], t0 and t1.
#define p256IsZeroInline \
\// AX contains a flag that is set if the input is zero.
XORQ AX, AX;\
MOVQ $1, t1;\
\// Check whether [acc4..acc7] are all zero.
MOVQ acc4, t0;\
ORQ acc5, t0;\
ORQ acc6, t0;\
ORQ acc7, t0;\
\// Set the zero flag if so. (CMOV of a constant to a register doesn't
\// appear to be supported in Go. Thus t1 = 1.)
CMOVQEQ t1, AX;\ // CMOVQEQ: Move if equal (ZF == 1)
\// XOR [acc4..acc7] with P and compare with zero again: an input equal
\// to P also represents zero, so it must be detected as well.
XORQ $-1, acc4;\
XORQ p256p<>+0x08(SB), acc5;\
XORQ $-1, acc6;\
XORQ p256p<>+0x018(SB), acc7;\
ORQ acc5, acc4;\
ORQ acc6, acc4;\
ORQ acc7, acc4;\
\// Set the zero flag if so.
\// CMOVQEQ: Move if equal (ZF == 1)
CMOVQEQ t1, AX;

View File

@ -6,11 +6,21 @@
// https://eprint.iacr.org/2013/816.pdf
//go:build amd64 && !purego && plugin
// plugin mode - DO NOT use the R15 Register.
// Below functions are different:
// 1.p256Sqr
// 2.p256OrdSqr
// 3.sm2P256MulInternal
// 4.sm2P256SqrInternal
#include "textflag.h"
#include "p256_macros_amd64.s"
/* ---------------------------------------*/
// This func is same as non-plugin mode, except that it uses BP to store n
// and does not use R15.
//
// func p256Sqr(res, in *p256Element, n int)
TEXT ·p256Sqr(SB),NOSPLIT,$0
MOVQ res+0(FP), res_ptr
@ -21,162 +31,21 @@ TEXT ·p256Sqr(SB),NOSPLIT,$0
JEQ sqrBMI2
sqrLoop:
// y[1:] * y[0]
MOVQ (8*0)(x_ptr), t0
MOVQ (8*1)(x_ptr), AX
MULQ t0
MOVQ AX, acc1
MOVQ DX, acc2
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, acc3
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, acc4
// y[2:] * y[1]
MOVQ (8*1)(x_ptr), t0
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, BX
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ BX, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, acc5
// y[3] * y[2]
MOVQ (8*2)(x_ptr), t0
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, y_ptr
XORQ BX, BX
// *2
ADDQ acc1, acc1
ADCQ acc2, acc2
ADCQ acc3, acc3
ADCQ acc4, acc4
ADCQ acc5, acc5
ADCQ y_ptr, y_ptr
ADCQ $0, BX
// Missing products
MOVQ (8*0)(x_ptr), AX
MULQ AX
MOVQ AX, acc0
MOVQ DX, t0
MOVQ (8*1)(x_ptr), AX
MULQ AX
ADDQ t0, acc1
ADCQ AX, acc2
ADCQ $0, DX
MOVQ DX, t0
MOVQ (8*2)(x_ptr), AX
MULQ AX
ADDQ t0, acc3
ADCQ AX, acc4
ADCQ $0, DX
MOVQ DX, t0
MOVQ (8*3)(x_ptr), AX
MULQ AX
ADDQ t0, acc5
ADCQ AX, y_ptr
ADCQ DX, BX
MOVQ BX, x_ptr
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
p256SqrMontReduce()
p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr)
MOVQ res_ptr, x_ptr
p256SqrRound(BX)
DECQ BP
JNE sqrLoop
RET
sqrBMI2:
XORQ acc0, acc0
XORQ y_ptr, y_ptr
// x[1:] * x[0]
MOVQ (8*0)(x_ptr), DX
MULXQ (8*1)(x_ptr), acc1, acc2
MULXQ (8*2)(x_ptr), AX, acc3
ADOXQ AX, acc2
MULXQ (8*3)(x_ptr), AX, acc4
ADOXQ AX, acc3
ADOXQ y_ptr, acc4
// x[2:] * x[1]
MOVQ (8*1)(x_ptr), DX
MULXQ (8*2)(x_ptr), AX, BX
ADOXQ AX, acc3
MULXQ (8*3)(x_ptr), AX, acc5
ADCXQ BX, AX
ADOXQ AX, acc4
ADCXQ y_ptr, acc5
// x[3] * x[2]
MOVQ (8*2)(x_ptr), DX
MULXQ (8*3)(x_ptr), AX, y_ptr
ADOXQ AX, acc5
ADOXQ acc0, y_ptr
XORQ BX, BX
// *2
ADOXQ acc1, acc1
ADOXQ acc2, acc2
ADOXQ acc3, acc3
ADOXQ acc4, acc4
ADOXQ acc5, acc5
ADOXQ y_ptr, y_ptr
ADOXQ acc0, BX
// Missing products
MOVQ (8*0)(x_ptr), DX
MULXQ DX, acc0, t0
ADCXQ t0, acc1
MOVQ (8*1)(x_ptr), DX
MULXQ DX, AX, t0
ADCXQ AX, acc2
ADCXQ t0, acc3
MOVQ (8*2)(x_ptr), DX
MULXQ DX, AX, t0
ADCXQ AX, acc4
ADCXQ t0, acc5
MOVQ (8*3)(x_ptr), DX
MULXQ DX, AX, x_ptr
ADCXQ AX, y_ptr
ADCXQ BX, x_ptr
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
p256SqrMontReduce()
p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr)
MOVQ res_ptr, x_ptr
p256SqrRoundAdx(BX)
DECQ BP
JNE sqrBMI2
RET
/* ---------------------------------------*/
// This func is same as non-plugin mode, except that it uses BP to store n
// and does not use R15.
//
// func p256OrdSqr(res, in *p256OrdElement, n int)
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
MOVQ res+0(FP), res_ptr
@ -187,385 +56,14 @@ TEXT ·p256OrdSqr(SB),NOSPLIT,$0
JEQ ordSqrLoopBMI2
ordSqrLoop:
// y[1:] * y[0]
MOVQ (8*0)(x_ptr), t0
MOVQ (8*1)(x_ptr), AX
MULQ t0
MOVQ AX, acc1
MOVQ DX, acc2
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, acc3
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, acc4
// y[2:] * y[1]
MOVQ (8*1)(x_ptr), t0
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, BX
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ BX, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, acc5
// y[3] * y[2]
MOVQ (8*2)(x_ptr), t0
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, y_ptr
XORQ BX, BX
// *2
ADDQ acc1, acc1
ADCQ acc2, acc2
ADCQ acc3, acc3
ADCQ acc4, acc4
ADCQ acc5, acc5
ADCQ y_ptr, y_ptr
ADCQ $0, BX
// Missing products
MOVQ (8*0)(x_ptr), AX
MULQ AX
MOVQ AX, acc0
MOVQ DX, t0
MOVQ (8*1)(x_ptr), AX
MULQ AX
ADDQ t0, acc1
ADCQ AX, acc2
ADCQ $0, DX
MOVQ DX, t0
MOVQ (8*2)(x_ptr), AX
MULQ AX
ADDQ t0, acc3
ADCQ AX, acc4
ADCQ $0, DX
MOVQ DX, t0
MOVQ (8*3)(x_ptr), AX
MULQ AX
ADDQ t0, acc5
ADCQ AX, y_ptr
ADCQ DX, BX
MOVQ BX, x_ptr
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
// First reduction step, [ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0]
MOVQ acc0, AX
MULQ p256ordK0<>(SB)
MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64
MOVQ p256ord<>+0x00(SB), AX
MULQ t0
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
ADCQ $0, DX // DX = carry1 + H(t0 * ord0)
MOVQ DX, BX // BX = carry1 + H(t0 * ord0)
MOVQ t0, acc0 // acc0 = t0
// calculate the negative part: [acc0, acc3, acc2] - [0, 0x100000000, 1] * t0
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc2
SBBQ AX, acc3
SBBQ DX, acc0
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ BX, acc1 // (carry2, acc1) = acc1 + BX
ADCQ $0, DX // DX = carry2 + H(t0*ord1)
ADDQ AX, acc1 // (carry3, acc1) = acc1 + BX + L(t0*ord1)
ADCQ DX, acc2
ADCQ $0, acc3
ADCQ $0, acc0
// Second reduction step
MOVQ acc1, AX
MULQ p256ordK0<>(SB)
MOVQ AX, t0
MOVQ p256ord<>+0x00(SB), AX
MULQ t0
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, BX
MOVQ t0, acc1
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc3
SBBQ AX, acc0
SBBQ DX, acc1
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ BX, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ DX, acc3
ADCQ $0, acc0
ADCQ $0, acc1
// Third reduction step
MOVQ acc2, AX
MULQ p256ordK0<>(SB)
MOVQ AX, t0
MOVQ p256ord<>+0x00(SB), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, BX
MOVQ t0, acc2
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc0
SBBQ AX, acc1
SBBQ DX, acc2
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ BX, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ DX, acc0
ADCQ $0, acc1
ADCQ $0, acc2
// Last reduction step
MOVQ acc3, AX
MULQ p256ordK0<>(SB)
MOVQ AX, t0
MOVQ p256ord<>+0x00(SB), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, BX
MOVQ t0, acc3
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
SHRQ $32, DX
SUBQ t0, acc1
SBBQ AX, acc2
SBBQ DX, acc3
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ BX, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
ADCQ $0, acc2
ADCQ $0, acc3
XORQ t0, t0
// Add bits [511:256] of the sqr result
ADCQ acc4, acc0
ADCQ acc5, acc1
ADCQ y_ptr, acc2
ADCQ x_ptr, acc3
ADCQ $0, t0
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr)
MOVQ res_ptr, x_ptr
p256OrdSqrRound(BX)
DECQ BP
JNE ordSqrLoop
RET
ordSqrLoopBMI2:
XORQ acc0, acc0
XORQ y_ptr, y_ptr
// y[1:] * y[0]
MOVQ (8*0)(x_ptr), DX
MULXQ (8*1)(x_ptr), acc1, acc2
MULXQ (8*2)(x_ptr), AX, acc3
ADOXQ AX, acc2
MULXQ (8*3)(x_ptr), AX, acc4
ADOXQ AX, acc3
ADOXQ y_ptr, acc4
// y[2:] * y[1]
MOVQ (8*1)(x_ptr), DX
MULXQ (8*2)(x_ptr), AX, BX
ADOXQ AX, acc3
MULXQ (8*3)(x_ptr), AX, acc5
ADCXQ BX, AX
ADOXQ AX, acc4
ADCXQ y_ptr, acc5
// y[3] * y[2]
MOVQ (8*2)(x_ptr), DX
MULXQ (8*3)(x_ptr), AX, y_ptr
ADOXQ AX, acc5
ADOXQ acc0, y_ptr
XORQ BX, BX
// *2
ADOXQ acc1, acc1
ADOXQ acc2, acc2
ADOXQ acc3, acc3
ADOXQ acc4, acc4
ADOXQ acc5, acc5
ADOXQ y_ptr, y_ptr
ADOXQ acc0, BX
// Missing products
MOVQ (8*0)(x_ptr), DX
MULXQ DX, acc0, t0
ADCXQ t0, acc1
MOVQ (8*1)(x_ptr), DX
MULXQ DX, AX, t0
ADCXQ AX, acc2
ADCXQ t0, acc3
MOVQ (8*2)(x_ptr), DX
MULXQ DX, AX, t0
ADCXQ AX, acc4
ADCXQ t0, acc5
MOVQ (8*3)(x_ptr), DX
MULXQ DX, AX, x_ptr
ADCXQ AX, y_ptr
ADCXQ BX, x_ptr
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
// First reduction step
MOVQ acc0, DX
MULXQ p256ordK0<>(SB), DX, AX
MULXQ p256ord<>+0x00(SB), AX, t0
ADOXQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0
MULXQ p256ord<>+0x08(SB), AX, BX
ADCXQ t0, AX
ADOXQ AX, acc1
MULXQ p256ord<>+0x10(SB), AX, t0
ADCXQ BX, AX
ADOXQ AX, acc2
MULXQ p256ord<>+0x18(SB), AX, acc0
ADCXQ t0, AX
ADOXQ AX, acc3
MOVQ $0, t0
ADCXQ t0, acc0
ADOXQ t0, acc0
// Second reduction step
MOVQ acc1, DX
MULXQ p256ordK0<>(SB), DX, AX
MULXQ p256ord<>+0x00(SB), AX, t0
ADOXQ AX, acc1
MULXQ p256ord<>+0x08(SB), AX, BX
ADCXQ t0, AX
ADOXQ AX, acc2
MULXQ p256ord<>+0x10(SB), AX, t0
ADCXQ BX, AX
ADOXQ AX, acc3
MULXQ p256ord<>+0x18(SB), AX, acc1
ADCXQ t0, AX
ADOXQ AX, acc0
MOVQ $0, t0
ADCXQ t0, acc1
ADOXQ t0, acc1
// Third reduction step
MOVQ acc2, DX
MULXQ p256ordK0<>(SB), DX, AX
MULXQ p256ord<>+0x00(SB), AX, t0
ADOXQ AX, acc2
MULXQ p256ord<>+0x08(SB), AX, BX
ADCXQ t0, AX
ADOXQ AX, acc3
MULXQ p256ord<>+0x10(SB), AX, t0
ADCXQ BX, AX
ADOXQ AX, acc0
MULXQ p256ord<>+0x18(SB), AX, acc2
ADCXQ t0, AX
ADOXQ AX, acc1
MOVQ $0, t0
ADCXQ t0, acc2
ADOXQ t0, acc2
// Last reduction step
MOVQ acc3, DX
MULXQ p256ordK0<>(SB), DX, AX
MULXQ p256ord<>+0x00(SB), AX, t0
ADOXQ AX, acc3
MULXQ p256ord<>+0x08(SB), AX, BX
ADCXQ t0, AX
ADOXQ AX, acc0
MULXQ p256ord<>+0x10(SB), AX, t0
ADCXQ BX, AX
ADOXQ AX, acc1
MULXQ p256ord<>+0x18(SB), AX, acc3
ADCXQ t0, AX
ADOXQ AX, acc2
MOVQ $0, t0
ADCXQ t0, acc3
ADOXQ t0, acc3
XORQ BX, BX
// Add bits [511:256] of the sqr result
ADCXQ acc4, acc0
ADCXQ acc5, acc1
ADCXQ y_ptr, acc2
ADCXQ x_ptr, acc3
ADCXQ BX, t0
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr)
MOVQ res_ptr, x_ptr
p256OrdSqrRoundAdx(BX)
DECQ BP
JNE ordSqrLoopBMI2
@ -599,33 +97,6 @@ ordSqrLoopBMI2:
#define t2 SI
#define t3 R9
/* ---------------------------------------*/
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]
TEXT sm2P256SubInternal(SB),NOSPLIT,$0
XORQ mul0, mul0
SUBQ t0, acc4
SBBQ t1, acc5
SBBQ t2, acc6
SBBQ t3, acc7
SBBQ $0, mul0
MOVQ acc4, acc0
MOVQ acc5, acc1
MOVQ acc6, acc2
MOVQ acc7, acc3
ADDQ $-1, acc4
ADCQ p256p<>+0x08(SB), acc5
ADCQ $-1, acc6
ADCQ p256p<>+0x018(SB), acc7
ANDQ $1, mul0
CMOVQEQ acc0, acc4
CMOVQEQ acc1, acc5
CMOVQEQ acc2, acc6
CMOVQEQ acc3, acc7
RET
/* ---------------------------------------*/
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] * [t3, t2, t1, t0]
TEXT sm2P256MulInternal(SB),NOSPLIT,$8
@ -634,7 +105,7 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
MOVQ acc4, mul0
MULQ t0
MOVQ mul0, X0
MOVQ mul0, X0 // uses X0 as temp register/storage
MOVQ mul1, acc1
MOVQ acc4, mul0
@ -746,7 +217,7 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
MOVQ mul1, acc7
PEXTRQ $0, X0, acc0
sm2P256MulReductionInternal()
sm2P256MulReductionInline
MOVQ $0, mul0
// Add bits [511:256] of the result
ADCQ acc0, acc4
@ -775,7 +246,7 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
internalMulBMI2:
MOVQ acc4, mul1
MULXQ t0, acc0, acc1
MOVQ acc0, X0
MOVQ acc0, X0 // uses X0 as temp register/storage
MULXQ t1, mul0, acc2
ADDQ mul0, acc1
@ -848,7 +319,7 @@ internalMulBMI2:
ADCQ $0, acc7
PEXTRQ $0, X0, acc0
sm2P256MulReductionInternal()
sm2P256MulReductionInline
MOVQ $0, mul0
// Add bits [511:256] of the result
ADCQ acc0, acc4
@ -881,140 +352,11 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
CMPB ·supportBMI2+0(SB), $0x01
JEQ internalSqrBMI2
MOVQ acc4, mul0
MULQ acc5
MOVQ mul0, acc1
MOVQ mul1, acc2
MOVQ acc4, mul0
MULQ acc6
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, acc3
MOVQ acc4, mul0
MULQ acc7
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, t0
MOVQ acc5, mul0
MULQ acc6
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, acc0
MOVQ acc5, mul0
MULQ acc7
ADDQ acc0, t0
ADCQ $0, mul1
ADDQ mul0, t0
ADCQ $0, mul1
MOVQ mul1, t1
MOVQ acc6, mul0
MULQ acc7
ADDQ mul0, t1
ADCQ $0, mul1
MOVQ mul1, t2
XORQ t3, t3
// *2
ADDQ acc1, acc1
ADCQ acc2, acc2
ADCQ acc3, acc3
ADCQ t0, t0
ADCQ t1, t1
ADCQ t2, t2
ADCQ $0, t3
// Missing products
MOVQ acc4, mul0
MULQ mul0
MOVQ mul0, acc0
MOVQ mul1, acc4
MOVQ acc5, mul0
MULQ mul0
ADDQ acc4, acc1
ADCQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, acc4
MOVQ acc6, mul0
MULQ mul0
ADDQ acc4, acc3
ADCQ mul0, t0
ADCQ $0, mul1
MOVQ mul1, acc4
MOVQ acc7, mul0
MULQ mul0
ADDQ acc4, t1
ADCQ mul0, t2
ADCQ mul1, t3
// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
sm2P256SqrReductionInternal()
p256SqrInternalInline
RET
internalSqrBMI2:
XORQ acc0, acc0
XORQ t2, t2
MOVQ acc4, mul1
MULXQ acc5, acc1, acc2
MULXQ acc6, mul0, acc3
ADOXQ mul0, acc2
MULXQ acc7, mul0, t0
ADOXQ mul0, acc3
ADOXQ t2, t0
MOVQ acc5, mul1
MULXQ acc6, mul0, t3
ADOXQ mul0, acc3
MULXQ acc7, mul0, t1
ADCXQ t3, mul0
ADOXQ mul0, t0
ADCXQ t2, t1
MOVQ acc6, mul1
MULXQ acc7, mul0, t2
ADOXQ mul0, t1
ADOXQ acc0, t2
XORQ t3, t3
// *2
ADOXQ acc1, acc1
ADOXQ acc2, acc2
ADOXQ acc3, acc3
ADOXQ t0, t0
ADOXQ t1, t1
ADOXQ t2, t2
ADOXQ acc0, t3
// Missing products
MOVQ acc4, mul1
MULXQ mul1, acc0, acc4
ADDQ acc4, acc1
MOVQ acc5, mul1
MULXQ mul1, mul0, acc4
ADCXQ mul0, acc2
ADCXQ acc4, acc3
MOVQ acc6, mul1
MULXQ mul1, mul0, acc4
ADCXQ mul0, t0
ADCXQ acc4, t1
MOVQ acc7, mul1
MULXQ mul1, mul0, acc4
ADCXQ mul0, t2
ADCXQ acc4, t3
// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
sm2P256SqrReductionInternal()
p256SqrInternalInlineAdx
RET
/* ---------------------------------------*/
@ -1099,7 +441,7 @@ internalSqrBMI2:
CALL sm2P256MulInternal(SB) \// x2 * z1ˆ2
\
LDt (x1in) \
CALL sm2P256SubInternal(SB) \// h = u2 - u1
p256SubInline2 \// h = u2 - u1
ST (h) \
\
LDt (z1in) \
@ -1114,7 +456,7 @@ internalSqrBMI2:
ST (s2) \
\
LDt (y1in) \
CALL sm2P256SubInternal(SB) \// r = s2 - s1
p256SubInline2 \// r = s2 - s1
ST (r) \
\
CALL sm2P256SqrInternal(SB) \// rsqr = rˆ2
@ -1139,10 +481,10 @@ internalSqrBMI2:
\
p256MulBy2Inline \// u1 * hˆ2 * 2, inline
LDacc (rsqr) \
CALL sm2P256SubInternal(SB) \// rˆ2 - u1 * hˆ2 * 2
p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2
\
LDt (hcub) \
CALL sm2P256SubInternal(SB) \
p256SubInline2 \
ST (xout) \
\
MOVQ acc4, t0 \
@ -1150,13 +492,13 @@ internalSqrBMI2:
MOVQ acc6, t2 \
MOVQ acc7, t3 \
LDacc (h) \
CALL sm2P256SubInternal(SB) \
p256SubInline2 \
\
LDt (r) \
CALL sm2P256MulInternal(SB) \
\
LDt (s2) \
CALL sm2P256SubInternal(SB) \
p256SubInline2 \
ST (yout) \
\// Load stored values from stack
MOVQ rptr, AX \
@ -1373,36 +715,6 @@ pointaddaffine_avx2:
#undef sel_save
#undef zero_save
// sm2P256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero
// otherwise. It writes to [acc4..acc7], t0 and t1.
TEXT sm2P256IsZero(SB),NOSPLIT,$0
// AX contains a flag that is set if the input is zero.
XORQ AX, AX
MOVQ $1, t1
// Check whether [acc4..acc7] are all zero.
MOVQ acc4, t0
ORQ acc5, t0
ORQ acc6, t0
ORQ acc7, t0
// Set the zero flag if so. (CMOV of a constant to a register doesn't
// appear to be supported in Go. Thus t1 = 1.)
CMOVQEQ t1, AX
// XOR [acc4..acc7] with P and compare with zero again.
XORQ $-1, acc4
XORQ p256p<>+0x08(SB), acc5
XORQ $-1, acc6
XORQ p256p<>+0x018(SB), acc7
ORQ acc5, acc4
ORQ acc6, acc4
ORQ acc7, acc4
// Set the zero flag if so.
CMOVQEQ t1, AX
RET
/* ---------------------------------------*/
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
@ -1450,9 +762,9 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0
ST (s2) \
\
LDt (s1) \
CALL sm2P256SubInternal(SB) \// r = s2 - s1
p256SubInline2 \// r = s2 - s1
ST (r) \
CALL sm2P256IsZero(SB) \
p256IsZeroInline \
MOVQ AX, points_eq \
\
LDacc (z2sqr) \
@ -1465,9 +777,9 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0
ST (u2) \
\
LDt (u1) \
CALL sm2P256SubInternal(SB) \// h = u2 - u1
p256SubInline2 \// h = u2 - u1
ST (h) \
CALL sm2P256IsZero(SB) \
p256IsZeroInline \
ANDQ points_eq, AX \
MOVQ AX, points_eq \
\
@ -1501,10 +813,10 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0
\
p256MulBy2Inline \// u1 * hˆ2 * 2, inline
LDacc (rsqr) \
CALL sm2P256SubInternal(SB) \// rˆ2 - u1 * hˆ2 * 2
p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2
\
LDt (hcub) \
CALL sm2P256SubInternal(SB) \
p256SubInline2 \
ST (xout) \
\
MOVQ acc4, t0 \
@ -1512,13 +824,13 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0
MOVQ acc6, t2 \
MOVQ acc7, t3 \
LDacc (u2) \
CALL sm2P256SubInternal(SB) \
p256SubInline2 \
\
LDt (r) \
CALL sm2P256MulInternal(SB) \
\
LDt (s2) \
CALL sm2P256SubInternal(SB) \
p256SubInline2 \
ST (yout) \
//func p256PointAddAsm(res, in1, in2 *SM2P256Point) int
@ -1669,7 +981,7 @@ pointadd_avx2:
#define calX() \
LDacc (x) \
LDt (zsqr) \
CALL sm2P256SubInternal(SB) \
p256SubInline2 \
LDt (m) \
CALL sm2P256MulInternal(SB) \
ST (m) \
@ -1718,18 +1030,18 @@ pointadd_avx2:
LDacc (m) \
CALL sm2P256SqrInternal(SB) \
LDt (tmp) \
CALL sm2P256SubInternal(SB) \
p256SubInline2 \
#define calY() \
acc2t \
LDacc (s) \
CALL sm2P256SubInternal(SB) \
p256SubInline2 \
\
LDt (m) \
CALL sm2P256MulInternal(SB) \
\
LDt (y) \
CALL sm2P256SubInternal(SB) \
p256SubInline2 \
#define lastP256PointDouble() \
\ // See https://hyperelliptic.org/EFD/g1p/data/shortw/jacobian-3/doubling/dbl-2007-bl