mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-26 12:16:20 +08:00
internal/sm2ec: amd64 refactoring, reduce duplicated code
This commit is contained in:
parent
fabcb6ad30
commit
53ac591635
@ -20,159 +20,13 @@ TEXT ·p256Sqr(SB),NOSPLIT,$0
|
||||
JEQ sqrBMI2
|
||||
|
||||
sqrLoop:
|
||||
|
||||
// y[1:] * y[0]
|
||||
MOVQ (8*0)(x_ptr), t0
|
||||
|
||||
MOVQ (8*1)(x_ptr), AX
|
||||
MULQ t0
|
||||
MOVQ AX, acc1
|
||||
MOVQ DX, acc2
|
||||
|
||||
MOVQ (8*2)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, acc3
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, acc4
|
||||
// y[2:] * y[1]
|
||||
MOVQ (8*1)(x_ptr), t0
|
||||
|
||||
MOVQ (8*2)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc4
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc4
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, acc5
|
||||
// y[3] * y[2]
|
||||
MOVQ (8*2)(x_ptr), t0
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc5
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, y_ptr
|
||||
XORQ t1, t1
|
||||
// *2
|
||||
ADDQ acc1, acc1
|
||||
ADCQ acc2, acc2
|
||||
ADCQ acc3, acc3
|
||||
ADCQ acc4, acc4
|
||||
ADCQ acc5, acc5
|
||||
ADCQ y_ptr, y_ptr
|
||||
ADCQ $0, t1
|
||||
// Missing products
|
||||
MOVQ (8*0)(x_ptr), AX
|
||||
MULQ AX
|
||||
MOVQ AX, acc0
|
||||
MOVQ DX, t0
|
||||
|
||||
MOVQ (8*1)(x_ptr), AX
|
||||
MULQ AX
|
||||
ADDQ t0, acc1
|
||||
ADCQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t0
|
||||
|
||||
MOVQ (8*2)(x_ptr), AX
|
||||
MULQ AX
|
||||
ADDQ t0, acc3
|
||||
ADCQ AX, acc4
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t0
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ AX
|
||||
ADDQ t0, acc5
|
||||
ADCQ AX, y_ptr
|
||||
ADCQ DX, t1
|
||||
MOVQ t1, x_ptr
|
||||
|
||||
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||
p256SqrMontReduce()
|
||||
p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
|
||||
MOVQ res_ptr, x_ptr
|
||||
p256SqrRound(t1)
|
||||
DECQ BX
|
||||
JNE sqrLoop
|
||||
RET
|
||||
|
||||
sqrBMI2:
|
||||
XORQ acc0, acc0
|
||||
XORQ y_ptr, y_ptr
|
||||
// x[1:] * x[0]
|
||||
MOVQ (8*0)(x_ptr), DX
|
||||
MULXQ (8*1)(x_ptr), acc1, acc2
|
||||
|
||||
MULXQ (8*2)(x_ptr), AX, acc3
|
||||
ADOXQ AX, acc2
|
||||
|
||||
MULXQ (8*3)(x_ptr), AX, acc4
|
||||
ADOXQ AX, acc3
|
||||
ADOXQ y_ptr, acc4
|
||||
|
||||
// x[2:] * x[1]
|
||||
MOVQ (8*1)(x_ptr), DX
|
||||
MULXQ (8*2)(x_ptr), AX, t1
|
||||
ADOXQ AX, acc3
|
||||
|
||||
MULXQ (8*3)(x_ptr), AX, acc5
|
||||
ADCXQ t1, AX
|
||||
ADOXQ AX, acc4
|
||||
ADCXQ y_ptr, acc5
|
||||
|
||||
// x[3] * x[2]
|
||||
MOVQ (8*2)(x_ptr), DX
|
||||
MULXQ (8*3)(x_ptr), AX, y_ptr
|
||||
ADOXQ AX, acc5
|
||||
ADOXQ acc0, y_ptr
|
||||
|
||||
XORQ t1, t1
|
||||
|
||||
// *2
|
||||
ADOXQ acc1, acc1
|
||||
ADOXQ acc2, acc2
|
||||
ADOXQ acc3, acc3
|
||||
ADOXQ acc4, acc4
|
||||
ADOXQ acc5, acc5
|
||||
ADOXQ y_ptr, y_ptr
|
||||
ADOXQ acc0, t1
|
||||
|
||||
// Missing products
|
||||
MOVQ (8*0)(x_ptr), DX
|
||||
MULXQ DX, acc0, t0
|
||||
ADCXQ t0, acc1
|
||||
|
||||
MOVQ (8*1)(x_ptr), DX
|
||||
MULXQ DX, AX, t0
|
||||
ADCXQ AX, acc2
|
||||
ADCXQ t0, acc3
|
||||
|
||||
MOVQ (8*2)(x_ptr), DX
|
||||
MULXQ DX, AX, t0
|
||||
ADCXQ AX, acc4
|
||||
ADCXQ t0, acc5
|
||||
|
||||
MOVQ (8*3)(x_ptr), DX
|
||||
MULXQ DX, AX, x_ptr
|
||||
ADCXQ AX, y_ptr
|
||||
ADCXQ t1, x_ptr
|
||||
|
||||
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||
p256SqrMontReduce()
|
||||
p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
|
||||
MOVQ res_ptr, x_ptr
|
||||
p256SqrRoundAdx(t1)
|
||||
DECQ BX
|
||||
JNE sqrBMI2
|
||||
RET
|
||||
@ -188,385 +42,14 @@ TEXT ·p256OrdSqr(SB),NOSPLIT,$0
|
||||
JEQ ordSqrLoopBMI2
|
||||
|
||||
ordSqrLoop:
|
||||
|
||||
// y[1:] * y[0]
|
||||
MOVQ (8*0)(x_ptr), t0
|
||||
|
||||
MOVQ (8*1)(x_ptr), AX
|
||||
MULQ t0
|
||||
MOVQ AX, acc1
|
||||
MOVQ DX, acc2
|
||||
|
||||
MOVQ (8*2)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, acc3
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, acc4
|
||||
// y[2:] * y[1]
|
||||
MOVQ (8*1)(x_ptr), t0
|
||||
|
||||
MOVQ (8*2)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc4
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc4
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, acc5
|
||||
// y[3] * y[2]
|
||||
MOVQ (8*2)(x_ptr), t0
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc5
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, y_ptr
|
||||
XORQ t1, t1
|
||||
// *2
|
||||
ADDQ acc1, acc1
|
||||
ADCQ acc2, acc2
|
||||
ADCQ acc3, acc3
|
||||
ADCQ acc4, acc4
|
||||
ADCQ acc5, acc5
|
||||
ADCQ y_ptr, y_ptr
|
||||
ADCQ $0, t1
|
||||
// Missing products
|
||||
MOVQ (8*0)(x_ptr), AX
|
||||
MULQ AX
|
||||
MOVQ AX, acc0
|
||||
MOVQ DX, t0
|
||||
|
||||
MOVQ (8*1)(x_ptr), AX
|
||||
MULQ AX
|
||||
ADDQ t0, acc1
|
||||
ADCQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t0
|
||||
|
||||
MOVQ (8*2)(x_ptr), AX
|
||||
MULQ AX
|
||||
ADDQ t0, acc3
|
||||
ADCQ AX, acc4
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t0
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ AX
|
||||
ADDQ t0, acc5
|
||||
ADCQ AX, y_ptr
|
||||
ADCQ DX, t1
|
||||
MOVQ t1, x_ptr
|
||||
|
||||
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||
MOVQ acc0, AX
|
||||
MULQ p256ordK0<>(SB)
|
||||
MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64
|
||||
|
||||
MOVQ p256ord<>+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
|
||||
ADCQ $0, DX // DX = carry1 + H(t0 * ord0)
|
||||
MOVQ DX, t1 // t1 = carry1 + H(t0 * ord0)
|
||||
MOVQ t0, acc0 // acc0 = t0
|
||||
|
||||
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc2
|
||||
SBBQ AX, acc3
|
||||
SBBQ DX, acc0
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc1 // (carry2, acc1) = acc1 + t1
|
||||
ADCQ $0, DX // DX = carry2 + H(t0*ord1)
|
||||
|
||||
ADDQ AX, acc1 // (carry3, acc1) = acc1 + t1 + L(t0*ord1)
|
||||
ADCQ DX, acc2
|
||||
ADCQ $0, acc3
|
||||
ADCQ $0, acc0
|
||||
|
||||
// Second reduction step
|
||||
MOVQ acc1, AX
|
||||
MULQ p256ordK0<>(SB)
|
||||
MOVQ AX, t0
|
||||
|
||||
MOVQ p256ord<>+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc1
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
MOVQ t0, acc1
|
||||
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc3
|
||||
SBBQ AX, acc0
|
||||
SBBQ DX, acc1
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc2
|
||||
ADCQ $0, DX
|
||||
|
||||
ADDQ AX, acc2
|
||||
ADCQ DX, acc3
|
||||
ADCQ $0, acc0
|
||||
ADCQ $0, acc1
|
||||
|
||||
// Third reduction step
|
||||
MOVQ acc2, AX
|
||||
MULQ p256ordK0<>(SB)
|
||||
MOVQ AX, t0
|
||||
|
||||
MOVQ p256ord<>+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
MOVQ t0, acc2
|
||||
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc0
|
||||
SBBQ AX, acc1
|
||||
SBBQ DX, acc2
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc3
|
||||
ADCQ $0, DX
|
||||
|
||||
ADDQ AX, acc3
|
||||
ADCQ DX, acc0
|
||||
ADCQ $0, acc1
|
||||
ADCQ $0, acc2
|
||||
|
||||
// Last reduction step
|
||||
MOVQ acc3, AX
|
||||
MULQ p256ordK0<>(SB)
|
||||
MOVQ AX, t0
|
||||
|
||||
MOVQ p256ord<>+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t1
|
||||
MOVQ t0, acc3
|
||||
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc1
|
||||
SBBQ AX, acc2
|
||||
SBBQ DX, acc3
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ t1, acc0
|
||||
ADCQ $0, DX
|
||||
|
||||
ADDQ AX, acc0
|
||||
ADCQ DX, acc1
|
||||
ADCQ $0, acc2
|
||||
ADCQ $0, acc3
|
||||
|
||||
XORQ t0, t0
|
||||
// Add bits [511:256] of the sqr result
|
||||
ADCQ acc4, acc0
|
||||
ADCQ acc5, acc1
|
||||
ADCQ y_ptr, acc2
|
||||
ADCQ x_ptr, acc3
|
||||
ADCQ $0, t0
|
||||
|
||||
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
|
||||
MOVQ res_ptr, x_ptr
|
||||
p256OrdSqrRound(t1)
|
||||
DECQ BX
|
||||
JNE ordSqrLoop
|
||||
|
||||
RET
|
||||
|
||||
ordSqrLoopBMI2:
|
||||
XORQ acc0, acc0
|
||||
XORQ y_ptr, y_ptr
|
||||
// y[1:] * y[0]
|
||||
MOVQ (8*0)(x_ptr), DX
|
||||
MULXQ (8*1)(x_ptr), acc1, acc2
|
||||
|
||||
MULXQ (8*2)(x_ptr), AX, acc3
|
||||
ADOXQ AX, acc2
|
||||
|
||||
MULXQ (8*3)(x_ptr), AX, acc4
|
||||
ADOXQ AX, acc3
|
||||
ADOXQ y_ptr, acc4
|
||||
|
||||
// y[2:] * y[1]
|
||||
MOVQ (8*1)(x_ptr), DX
|
||||
MULXQ (8*2)(x_ptr), AX, t1
|
||||
ADOXQ AX, acc3
|
||||
|
||||
MULXQ (8*3)(x_ptr), AX, acc5
|
||||
ADCXQ t1, AX
|
||||
ADOXQ AX, acc4
|
||||
ADCXQ y_ptr, acc5
|
||||
|
||||
// y[3] * y[2]
|
||||
MOVQ (8*2)(x_ptr), DX
|
||||
MULXQ (8*3)(x_ptr), AX, y_ptr
|
||||
ADOXQ AX, acc5
|
||||
ADOXQ acc0, y_ptr
|
||||
|
||||
XORQ t1, t1
|
||||
// *2
|
||||
ADOXQ acc1, acc1
|
||||
ADOXQ acc2, acc2
|
||||
ADOXQ acc3, acc3
|
||||
ADOXQ acc4, acc4
|
||||
ADOXQ acc5, acc5
|
||||
ADOXQ y_ptr, y_ptr
|
||||
ADOXQ acc0, t1
|
||||
|
||||
// Missing products
|
||||
MOVQ (8*0)(x_ptr), DX
|
||||
MULXQ DX, acc0, t0
|
||||
ADCXQ t0, acc1
|
||||
|
||||
MOVQ (8*1)(x_ptr), DX
|
||||
MULXQ DX, AX, t0
|
||||
ADCXQ AX, acc2
|
||||
ADCXQ t0, acc3
|
||||
|
||||
MOVQ (8*2)(x_ptr), DX
|
||||
MULXQ DX, AX, t0
|
||||
ADCXQ AX, acc4
|
||||
ADCXQ t0, acc5
|
||||
|
||||
MOVQ (8*3)(x_ptr), DX
|
||||
MULXQ DX, AX, x_ptr
|
||||
ADCXQ AX, y_ptr
|
||||
ADCXQ t1, x_ptr
|
||||
|
||||
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||
// First reduction step
|
||||
MOVQ acc0, DX
|
||||
MULXQ p256ordK0<>(SB), DX, AX
|
||||
|
||||
MULXQ p256ord<>+0x00(SB), AX, t0
|
||||
ADOXQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0
|
||||
|
||||
MULXQ p256ord<>+0x08(SB), AX, t1
|
||||
ADCXQ t0, AX
|
||||
ADOXQ AX, acc1
|
||||
|
||||
MULXQ p256ord<>+0x10(SB), AX, t0
|
||||
ADCXQ t1, AX
|
||||
ADOXQ AX, acc2
|
||||
|
||||
MULXQ p256ord<>+0x18(SB), AX, acc0
|
||||
ADCXQ t0, AX
|
||||
ADOXQ AX, acc3
|
||||
MOVQ $0, t0
|
||||
ADCXQ t0, acc0
|
||||
ADOXQ t0, acc0
|
||||
|
||||
// Second reduction step
|
||||
MOVQ acc1, DX
|
||||
MULXQ p256ordK0<>(SB), DX, AX
|
||||
|
||||
MULXQ p256ord<>+0x00(SB), AX, t0
|
||||
ADOXQ AX, acc1
|
||||
|
||||
MULXQ p256ord<>+0x08(SB), AX, t1
|
||||
ADCXQ t0, AX
|
||||
ADOXQ AX, acc2
|
||||
|
||||
MULXQ p256ord<>+0x10(SB), AX, t0
|
||||
ADCXQ t1, AX
|
||||
ADOXQ AX, acc3
|
||||
|
||||
MULXQ p256ord<>+0x18(SB), AX, acc1
|
||||
ADCXQ t0, AX
|
||||
ADOXQ AX, acc0
|
||||
MOVQ $0, t0
|
||||
ADCXQ t0, acc1
|
||||
ADOXQ t0, acc1
|
||||
|
||||
// Third reduction step
|
||||
MOVQ acc2, DX
|
||||
MULXQ p256ordK0<>(SB), DX, AX
|
||||
|
||||
MULXQ p256ord<>+0x00(SB), AX, t0
|
||||
ADOXQ AX, acc2
|
||||
|
||||
MULXQ p256ord<>+0x08(SB), AX, t1
|
||||
ADCXQ t0, AX
|
||||
ADOXQ AX, acc3
|
||||
|
||||
MULXQ p256ord<>+0x10(SB), AX, t0
|
||||
ADCXQ t1, AX
|
||||
ADOXQ AX, acc0
|
||||
|
||||
MULXQ p256ord<>+0x18(SB), AX, acc2
|
||||
ADCXQ t0, AX
|
||||
ADOXQ AX, acc1
|
||||
MOVQ $0, t0
|
||||
ADCXQ t0, acc2
|
||||
ADOXQ t0, acc2
|
||||
|
||||
// Last reduction step
|
||||
MOVQ acc3, DX
|
||||
MULXQ p256ordK0<>(SB), DX, AX
|
||||
|
||||
MULXQ p256ord<>+0x00(SB), AX, t0
|
||||
ADOXQ AX, acc3
|
||||
|
||||
MULXQ p256ord<>+0x08(SB), AX, t1
|
||||
ADCXQ t0, AX
|
||||
ADOXQ AX, acc0
|
||||
|
||||
MULXQ p256ord<>+0x10(SB), AX, t0
|
||||
ADCXQ t1, AX
|
||||
ADOXQ AX, acc1
|
||||
|
||||
MULXQ p256ord<>+0x18(SB), AX, acc3
|
||||
ADCXQ t0, AX
|
||||
ADOXQ AX, acc2
|
||||
MOVQ $0, t0
|
||||
ADCXQ t0, acc3
|
||||
ADOXQ t0, acc3
|
||||
|
||||
XORQ t1, t1
|
||||
// Add bits [511:256] of the sqr result
|
||||
ADCXQ acc4, acc0
|
||||
ADCXQ acc5, acc1
|
||||
ADCXQ y_ptr, acc2
|
||||
ADCXQ x_ptr, acc3
|
||||
ADCXQ t1, t0
|
||||
|
||||
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
|
||||
MOVQ res_ptr, x_ptr
|
||||
p256OrdSqrRoundAdx(t1)
|
||||
DECQ BX
|
||||
JNE ordSqrLoopBMI2
|
||||
|
||||
@ -601,33 +84,7 @@ ordSqrLoopBMI2:
|
||||
#define t2 DI
|
||||
#define t3 SI
|
||||
#define hlp BP
|
||||
/* ---------------------------------------*/
|
||||
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]
|
||||
TEXT sm2P256SubInternal(SB),NOSPLIT,$0
|
||||
XORQ mul0, mul0
|
||||
SUBQ t0, acc4
|
||||
SBBQ t1, acc5
|
||||
SBBQ t2, acc6
|
||||
SBBQ t3, acc7
|
||||
SBBQ $0, mul0
|
||||
|
||||
MOVQ acc4, acc0
|
||||
MOVQ acc5, acc1
|
||||
MOVQ acc6, acc2
|
||||
MOVQ acc7, acc3
|
||||
|
||||
ADDQ $-1, acc4
|
||||
ADCQ p256p<>+0x08(SB), acc5
|
||||
ADCQ $-1, acc6
|
||||
ADCQ p256p<>+0x018(SB), acc7
|
||||
ANDQ $1, mul0
|
||||
|
||||
CMOVQEQ acc0, acc4
|
||||
CMOVQEQ acc1, acc5
|
||||
CMOVQEQ acc2, acc6
|
||||
CMOVQEQ acc3, acc7
|
||||
|
||||
RET
|
||||
/* ---------------------------------------*/
|
||||
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] * [t3, t2, t1, t0]
|
||||
TEXT sm2P256MulInternal(SB),NOSPLIT,$8
|
||||
@ -746,7 +203,7 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
|
||||
ADDQ mul0, acc6
|
||||
ADCQ $0, mul1
|
||||
MOVQ mul1, acc7
|
||||
sm2P256MulReductionInternal()
|
||||
sm2P256MulReductionInline
|
||||
|
||||
MOVQ $0, BP
|
||||
// Add bits [511:256] of the result
|
||||
@ -767,7 +224,7 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
|
||||
SBBQ p256p<>+0x018(SB), acc7
|
||||
SBBQ $0, hlp
|
||||
// If the result of the subtraction is negative, restore the previous result
|
||||
CMOVQCS acc0, acc4
|
||||
CMOVQCS acc0, acc4 // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS acc1, acc5
|
||||
CMOVQCS acc2, acc6
|
||||
CMOVQCS acc3, acc7
|
||||
@ -847,7 +304,7 @@ internalMulBMI2:
|
||||
ADDQ mul0, acc6
|
||||
ADCQ $0, acc7
|
||||
|
||||
sm2P256MulReductionInternal()
|
||||
sm2P256MulReductionInline
|
||||
MOVQ $0, BP
|
||||
// Add bits [511:256] of the result
|
||||
ADCQ acc0, acc4
|
||||
@ -867,7 +324,7 @@ internalMulBMI2:
|
||||
SBBQ p256p<>+0x018(SB), acc7
|
||||
SBBQ $0, hlp
|
||||
// If the result of the subtraction is negative, restore the previous result
|
||||
CMOVQCS acc0, acc4
|
||||
CMOVQCS acc0, acc4 // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS acc1, acc5
|
||||
CMOVQCS acc2, acc6
|
||||
CMOVQCS acc3, acc7
|
||||
@ -880,140 +337,11 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
|
||||
CMPB ·supportBMI2+0(SB), $0x01
|
||||
JEQ internalSqrBMI2
|
||||
|
||||
MOVQ acc4, mul0
|
||||
MULQ acc5
|
||||
MOVQ mul0, acc1
|
||||
MOVQ mul1, acc2
|
||||
|
||||
MOVQ acc4, mul0
|
||||
MULQ acc6
|
||||
ADDQ mul0, acc2
|
||||
ADCQ $0, mul1
|
||||
MOVQ mul1, acc3
|
||||
|
||||
MOVQ acc4, mul0
|
||||
MULQ acc7
|
||||
ADDQ mul0, acc3
|
||||
ADCQ $0, mul1
|
||||
MOVQ mul1, t0
|
||||
|
||||
MOVQ acc5, mul0
|
||||
MULQ acc6
|
||||
ADDQ mul0, acc3
|
||||
ADCQ $0, mul1
|
||||
MOVQ mul1, acc0
|
||||
|
||||
MOVQ acc5, mul0
|
||||
MULQ acc7
|
||||
ADDQ acc0, t0
|
||||
ADCQ $0, mul1
|
||||
ADDQ mul0, t0
|
||||
ADCQ $0, mul1
|
||||
MOVQ mul1, t1
|
||||
|
||||
MOVQ acc6, mul0
|
||||
MULQ acc7
|
||||
ADDQ mul0, t1
|
||||
ADCQ $0, mul1
|
||||
MOVQ mul1, t2
|
||||
XORQ t3, t3
|
||||
// *2
|
||||
ADDQ acc1, acc1
|
||||
ADCQ acc2, acc2
|
||||
ADCQ acc3, acc3
|
||||
ADCQ t0, t0
|
||||
ADCQ t1, t1
|
||||
ADCQ t2, t2
|
||||
ADCQ $0, t3
|
||||
// Missing products
|
||||
MOVQ acc4, mul0
|
||||
MULQ mul0
|
||||
MOVQ mul0, acc0
|
||||
MOVQ DX, acc4
|
||||
|
||||
MOVQ acc5, mul0
|
||||
MULQ mul0
|
||||
ADDQ acc4, acc1
|
||||
ADCQ mul0, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, acc4
|
||||
|
||||
MOVQ acc6, mul0
|
||||
MULQ mul0
|
||||
ADDQ acc4, acc3
|
||||
ADCQ mul0, t0
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, acc4
|
||||
|
||||
MOVQ acc7, mul0
|
||||
MULQ mul0
|
||||
ADDQ acc4, t1
|
||||
ADCQ mul0, t2
|
||||
ADCQ DX, t3
|
||||
// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
|
||||
sm2P256SqrReductionInternal()
|
||||
p256SqrInternalInline
|
||||
RET
|
||||
|
||||
internalSqrBMI2:
|
||||
XORQ acc0, acc0
|
||||
XORQ t2, t2
|
||||
MOVQ acc4, mul1
|
||||
MULXQ acc5, acc1, acc2
|
||||
|
||||
MULXQ acc6, mul0, acc3
|
||||
ADOXQ mul0, acc2
|
||||
|
||||
MULXQ acc7, mul0, t0
|
||||
ADOXQ mul0, acc3
|
||||
ADOXQ t2, t0
|
||||
|
||||
MOVQ acc5, mul1
|
||||
MULXQ acc6, mul0, t3
|
||||
ADOXQ mul0, acc3
|
||||
|
||||
MULXQ acc7, mul0, t1
|
||||
ADCXQ t3, mul0
|
||||
ADOXQ mul0, t0
|
||||
ADCXQ t2, t1
|
||||
|
||||
MOVQ acc6, mul1
|
||||
MULXQ acc7, mul0, t2
|
||||
ADOXQ mul0, t1
|
||||
ADOXQ acc0, t2
|
||||
|
||||
XORQ t3, t3
|
||||
|
||||
// *2
|
||||
ADOXQ acc1, acc1
|
||||
ADOXQ acc2, acc2
|
||||
ADOXQ acc3, acc3
|
||||
ADOXQ t0, t0
|
||||
ADOXQ t1, t1
|
||||
ADOXQ t2, t2
|
||||
ADOXQ acc0, t3
|
||||
|
||||
// Missing products
|
||||
MOVQ acc4, mul1
|
||||
MULXQ mul1, acc0, acc4
|
||||
ADDQ acc4, acc1
|
||||
|
||||
MOVQ acc5, mul1
|
||||
MULXQ mul1, mul0, acc4
|
||||
ADCXQ mul0, acc2
|
||||
ADCXQ acc4, acc3
|
||||
|
||||
MOVQ acc6, mul1
|
||||
MULXQ mul1, mul0, acc4
|
||||
ADCXQ mul0, t0
|
||||
ADCXQ acc4, t1
|
||||
|
||||
MOVQ acc7, mul1
|
||||
MULXQ mul1, mul0, acc4
|
||||
ADCXQ mul0, t2
|
||||
ADCXQ acc4, t3
|
||||
// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
|
||||
sm2P256SqrReductionInternal()
|
||||
|
||||
p256SqrInternalInlineAdx
|
||||
RET
|
||||
|
||||
/* ---------------------------------------*/
|
||||
@ -1073,14 +401,14 @@ internalSqrBMI2:
|
||||
ADCQ p256p<>+0x08(SB), acc1 \
|
||||
ADCQ $-1, acc2 \
|
||||
ADCQ p256p<>+0x018(SB), acc3 \
|
||||
ADCQ $0, mul0 \
|
||||
CMOVQNE t0, acc0 \
|
||||
ADCQ $0, mul0 \ // ZF := 1 if mul0 == 0 after ADC
|
||||
CMOVQNE t0, acc0 \ // CMOVQNE: Move if not equal (ZF == 0)
|
||||
CMOVQNE t1, acc1 \
|
||||
CMOVQNE t2, acc2 \
|
||||
CMOVQNE t3, acc3 \
|
||||
\// If condition is 0, keep original value
|
||||
TESTQ DX, DX \
|
||||
CMOVQEQ acc4, acc0 \
|
||||
TESTQ DX, DX \ // ZF := 1 if (DX AND DX == 0)
|
||||
CMOVQEQ acc4, acc0 \ // CMOVQEQ: Move if equal (ZF == 1)
|
||||
CMOVQEQ acc5, acc1 \
|
||||
CMOVQEQ acc6, acc2 \
|
||||
CMOVQEQ acc7, acc3 \
|
||||
@ -1098,7 +426,7 @@ internalSqrBMI2:
|
||||
CALL sm2P256MulInternal(SB) \// x2 * z1ˆ2
|
||||
\
|
||||
LDt (x1in) \
|
||||
CALL sm2P256SubInternal(SB) \// h = u2 - u1
|
||||
p256SubInline2 \// h = u2 - u1
|
||||
ST (h) \
|
||||
\
|
||||
LDt (z1in) \
|
||||
@ -1113,7 +441,7 @@ internalSqrBMI2:
|
||||
ST (s2) \
|
||||
\
|
||||
LDt (y1in) \
|
||||
CALL sm2P256SubInternal(SB) \// r = s2 - s1
|
||||
p256SubInline2 \// r = s2 - s1
|
||||
ST (r) \
|
||||
\
|
||||
CALL sm2P256SqrInternal(SB) \// rsqr = rˆ2
|
||||
@ -1138,10 +466,10 @@ internalSqrBMI2:
|
||||
\
|
||||
p256MulBy2Inline \// u1 * hˆ2 * 2, inline
|
||||
LDacc (rsqr) \
|
||||
CALL sm2P256SubInternal(SB) \// rˆ2 - u1 * hˆ2 * 2
|
||||
p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2
|
||||
\
|
||||
LDt (hcub) \
|
||||
CALL sm2P256SubInternal(SB) \
|
||||
p256SubInline2 \
|
||||
ST (xout) \
|
||||
\
|
||||
MOVQ acc4, t0 \
|
||||
@ -1149,13 +477,13 @@ internalSqrBMI2:
|
||||
MOVQ acc6, t2 \
|
||||
MOVQ acc7, t3 \
|
||||
LDacc (h) \
|
||||
CALL sm2P256SubInternal(SB) \
|
||||
p256SubInline2 \
|
||||
\
|
||||
LDt (r) \
|
||||
CALL sm2P256MulInternal(SB) \
|
||||
\
|
||||
LDt (s2) \
|
||||
CALL sm2P256SubInternal(SB) \
|
||||
p256SubInline2 \
|
||||
ST (yout) \
|
||||
\// Load stored values from stack
|
||||
MOVQ rptr, AX \
|
||||
@ -1372,36 +700,6 @@ pointaddaffine_avx2:
|
||||
#undef sel_save
|
||||
#undef zero_save
|
||||
|
||||
// sm2P256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero
|
||||
// otherwise. It writes to [acc4..acc7], t0 and t1.
|
||||
TEXT sm2P256IsZero(SB),NOSPLIT,$0
|
||||
// AX contains a flag that is set if the input is zero.
|
||||
XORQ AX, AX
|
||||
MOVQ $1, t1
|
||||
|
||||
// Check whether [acc4..acc7] are all zero.
|
||||
MOVQ acc4, t0
|
||||
ORQ acc5, t0
|
||||
ORQ acc6, t0
|
||||
ORQ acc7, t0
|
||||
|
||||
// Set the zero flag if so. (CMOV of a constant to a register doesn't
|
||||
// appear to be supported in Go. Thus t1 = 1.)
|
||||
CMOVQEQ t1, AX
|
||||
|
||||
// XOR [acc4..acc7] with P and compare with zero again.
|
||||
XORQ $-1, acc4
|
||||
XORQ p256p<>+0x08(SB), acc5
|
||||
XORQ $-1, acc6
|
||||
XORQ p256p<>+0x018(SB), acc7
|
||||
ORQ acc5, acc4
|
||||
ORQ acc6, acc4
|
||||
ORQ acc7, acc4
|
||||
|
||||
// Set the zero flag if so.
|
||||
CMOVQEQ t1, AX
|
||||
RET
|
||||
|
||||
/* ---------------------------------------*/
|
||||
#define x1in(off) (32*0 + off)(SP)
|
||||
#define y1in(off) (32*1 + off)(SP)
|
||||
@ -1449,9 +747,9 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0
|
||||
ST (s2) \
|
||||
\
|
||||
LDt (s1) \
|
||||
CALL sm2P256SubInternal(SB) \// r = s2 - s1
|
||||
p256SubInline2 \// r = s2 - s1
|
||||
ST (r) \
|
||||
CALL sm2P256IsZero(SB) \
|
||||
p256IsZeroInline \
|
||||
MOVQ AX, points_eq \
|
||||
\
|
||||
LDacc (z2sqr) \
|
||||
@ -1464,9 +762,9 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0
|
||||
ST (u2) \
|
||||
\
|
||||
LDt (u1) \
|
||||
CALL sm2P256SubInternal(SB) \// h = u2 - u1
|
||||
p256SubInline2 \// h = u2 - u1
|
||||
ST (h) \
|
||||
CALL sm2P256IsZero(SB) \
|
||||
p256IsZeroInline \
|
||||
ANDQ points_eq, AX \
|
||||
MOVQ AX, points_eq \
|
||||
\
|
||||
@ -1500,10 +798,10 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0
|
||||
\
|
||||
p256MulBy2Inline \// u1 * hˆ2 * 2, inline
|
||||
LDacc (rsqr) \
|
||||
CALL sm2P256SubInternal(SB) \// rˆ2 - u1 * hˆ2 * 2
|
||||
p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2
|
||||
\
|
||||
LDt (hcub) \
|
||||
CALL sm2P256SubInternal(SB) \
|
||||
p256SubInline2 \
|
||||
ST (xout) \
|
||||
\
|
||||
MOVQ acc4, t0 \
|
||||
@ -1511,13 +809,13 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0
|
||||
MOVQ acc6, t2 \
|
||||
MOVQ acc7, t3 \
|
||||
LDacc (u2) \
|
||||
CALL sm2P256SubInternal(SB) \
|
||||
p256SubInline2 \
|
||||
\
|
||||
LDt (r) \
|
||||
CALL sm2P256MulInternal(SB) \
|
||||
\
|
||||
LDt (s2) \
|
||||
CALL sm2P256SubInternal(SB) \
|
||||
p256SubInline2 \
|
||||
ST (yout) \
|
||||
|
||||
//func p256PointAddAsm(res, in1, in2 *SM2P256Point) int
|
||||
@ -1668,7 +966,7 @@ pointadd_avx2:
|
||||
#define calX() \
|
||||
LDacc (x) \
|
||||
LDt (zsqr) \
|
||||
CALL sm2P256SubInternal(SB) \ // X1 - ZZ
|
||||
p256SubInline2 \ // X1 - ZZ
|
||||
LDt (m) \
|
||||
CALL sm2P256MulInternal(SB) \ // M = (X1 - ZZ) * (X1 + ZZ) = X1^2 - ZZ^2
|
||||
ST (m) \
|
||||
@ -1717,18 +1015,18 @@ pointadd_avx2:
|
||||
LDacc (m) \
|
||||
CALL sm2P256SqrInternal(SB) \ // M^2 = (3 * (X1^2 - ZZ^2))^2
|
||||
LDt (tmp) \
|
||||
CALL sm2P256SubInternal(SB) \ // X3 = M^2 - 2*S
|
||||
p256SubInline2 \ // X3 = M^2 - 2*S
|
||||
|
||||
#define calY() \
|
||||
acc2t \
|
||||
LDacc (s) \ // S = 4 * X1 * YY = 2 * ((X1+YY)^2 - XX - YYYY)
|
||||
CALL sm2P256SubInternal(SB) \ // S - X3
|
||||
p256SubInline2 \ // S - X3
|
||||
\
|
||||
LDt (m) \
|
||||
CALL sm2P256MulInternal(SB) \ // M * (S - X3)
|
||||
\
|
||||
LDt (y) \
|
||||
CALL sm2P256SubInternal(SB) \ // Y3 = M * (S - X3) - 8 * YYYY
|
||||
p256SubInline2 \ // Y3 = M * (S - X3) - 8 * YYYY
|
||||
|
||||
#define lastP256PointDouble() \
|
||||
\ // See https://hyperelliptic.org/EFD/g1p/data/shortw/jacobian-3/doubling/dbl-2007-bl
|
||||
|
@ -28,7 +28,7 @@ GLOBL p256ordK0<>(SB), 8, $8
|
||||
GLOBL p256ord<>(SB), 8, $32
|
||||
GLOBL p256one<>(SB), 8, $32
|
||||
|
||||
#define p256SqrMontReduce() \
|
||||
#define p256SqrMontReduceInline \
|
||||
\ // First reduction step, p = 2^256 - 2^224 - 2^96 + 2^64 - 1, i.e. signed limbs [p4, p3, p2, p1, p0] = [1, -0x100000000, 0, (1 - 0x100000000), -1]
|
||||
MOVQ acc0, AX \
|
||||
MOVQ acc0, DX \
|
||||
@ -114,7 +114,7 @@ GLOBL p256one<>(SB), 8, $32
|
||||
SBBQ p256p<>+0x018(SB), a3 \
|
||||
SBBQ $0, a4 \
|
||||
\
|
||||
CMOVQCS b0, a0 \
|
||||
CMOVQCS b0, a0 \ // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS b1, a1 \
|
||||
CMOVQCS b2, a2 \
|
||||
CMOVQCS b3, a3 \
|
||||
@ -138,7 +138,7 @@ GLOBL p256one<>(SB), 8, $32
|
||||
SBBQ p256ord<>+0x18(SB), a3 \
|
||||
SBBQ $0, a4 \
|
||||
\
|
||||
CMOVQCS b0, a0 \
|
||||
CMOVQCS b0, a0 \ // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS b1, a1 \
|
||||
CMOVQCS b2, a2 \
|
||||
CMOVQCS b3, a3 \
|
||||
@ -148,7 +148,7 @@ GLOBL p256one<>(SB), 8, $32
|
||||
MOVQ a2, (8*2)(res) \
|
||||
MOVQ a3, (8*3)(res)
|
||||
|
||||
#define sm2P256SqrReductionInternal() \
|
||||
#define sm2P256SqrReductionInline \
|
||||
\ // First reduction step
|
||||
MOVQ acc0, mul0 \
|
||||
MOVQ acc0, mul1 \
|
||||
@ -232,12 +232,12 @@ GLOBL p256one<>(SB), 8, $32
|
||||
SBBQ p256p<>+0x018(SB), acc7\
|
||||
SBBQ $0, mul0 \
|
||||
\ // If the result of the subtraction is negative, restore the previous result
|
||||
CMOVQCS t0, acc4 \
|
||||
CMOVQCS t0, acc4 \ // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS t1, acc5 \
|
||||
CMOVQCS t2, acc6 \
|
||||
CMOVQCS t3, acc7
|
||||
|
||||
#define sm2P256MulReductionInternal() \
|
||||
#define sm2P256MulReductionInline \
|
||||
\// First reduction step
|
||||
MOVQ acc0, mul0 \
|
||||
MOVQ acc0, mul1 \
|
||||
@ -304,19 +304,19 @@ GLOBL p256one<>(SB), 8, $32
|
||||
ADCQ $0, acc3
|
||||
|
||||
#define p256PointDoubleInit() \
|
||||
MOVOU (16*0)(BX), X0 \
|
||||
MOVOU (16*1)(BX), X1 \
|
||||
MOVOU (16*2)(BX), X2 \
|
||||
MOVOU (16*3)(BX), X3 \
|
||||
MOVOU (16*4)(BX), X4 \
|
||||
MOVOU (16*5)(BX), X5 \
|
||||
MOVOU (16*0)(BX), X0;\
|
||||
MOVOU (16*1)(BX), X1;\
|
||||
MOVOU (16*2)(BX), X2;\
|
||||
MOVOU (16*3)(BX), X3;\
|
||||
MOVOU (16*4)(BX), X4;\
|
||||
MOVOU (16*5)(BX), X5;\
|
||||
\
|
||||
MOVOU X0, x(16*0) \
|
||||
MOVOU X1, x(16*1) \
|
||||
MOVOU X2, y(16*0) \
|
||||
MOVOU X3, y(16*1) \
|
||||
MOVOU X4, z(16*0) \
|
||||
MOVOU X5, z(16*1)
|
||||
MOVOU X0, x(16*0);\
|
||||
MOVOU X1, x(16*1);\
|
||||
MOVOU X2, y(16*0);\
|
||||
MOVOU X3, y(16*1);\
|
||||
MOVOU X4, z(16*0);\
|
||||
MOVOU X5, z(16*1);
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// [t3, t2, t1, t0] = 2[acc7, acc6, acc5, acc4]
|
||||
@ -336,7 +336,7 @@ GLOBL p256one<>(SB), 8, $32
|
||||
SBBQ $-1, t2;\
|
||||
SBBQ p256p<>+0x018(SB), t3;\
|
||||
SBBQ $0, mul0;\
|
||||
CMOVQCS acc4, t0;\
|
||||
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS acc5, t1;\
|
||||
CMOVQCS acc6, t2;\
|
||||
CMOVQCS acc7, t3;
|
||||
@ -359,7 +359,7 @@ GLOBL p256one<>(SB), 8, $32
|
||||
SBBQ $-1, acc6;\
|
||||
SBBQ p256p<>+0x018(SB), acc7;\
|
||||
SBBQ $0, mul0;\
|
||||
CMOVQCS t0, acc4;\
|
||||
CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS t1, acc5;\
|
||||
CMOVQCS t2, acc6;\
|
||||
CMOVQCS t3, acc7;
|
||||
@ -386,7 +386,7 @@ GLOBL p256one<>(SB), 8, $32
|
||||
SBBQ $-1, acc6;\
|
||||
SBBQ p256p<>+0x018(SB), acc7;\
|
||||
SBBQ $0, mul0;\
|
||||
CMOVQCS t0, acc4;\
|
||||
CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS t1, acc5;\
|
||||
CMOVQCS t2, acc6;\
|
||||
CMOVQCS t3, acc7;\
|
||||
@ -405,7 +405,7 @@ GLOBL p256one<>(SB), 8, $32
|
||||
SBBQ $-1, t2;\
|
||||
SBBQ p256p<>+0x018(SB), t3;\
|
||||
SBBQ $0, mul0;\
|
||||
CMOVQCS acc4, t0;\
|
||||
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS acc5, t1;\
|
||||
CMOVQCS acc6, t2;\
|
||||
CMOVQCS acc7, t3;
|
||||
@ -428,7 +428,718 @@ GLOBL p256one<>(SB), 8, $32
|
||||
SBBQ $-1, t2;\
|
||||
SBBQ p256p<>+0x018(SB), t3;\
|
||||
SBBQ $0, mul0;\
|
||||
CMOVQCS acc4, t0;\
|
||||
CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
|
||||
CMOVQCS acc5, t1;\
|
||||
CMOVQCS acc6, t2;\
|
||||
CMOVQCS acc7, t3;
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]
|
||||
#define p256SubInline2 \
|
||||
XORQ mul0, mul0;\
|
||||
SUBQ t0, acc4;\
|
||||
SBBQ t1, acc5;\
|
||||
SBBQ t2, acc6;\
|
||||
SBBQ t3, acc7;\
|
||||
SBBQ $0, mul0;\
|
||||
MOVQ acc4, acc0;\
|
||||
MOVQ acc5, acc1;\
|
||||
MOVQ acc6, acc2;\
|
||||
MOVQ acc7, acc3;\
|
||||
ADDQ $-1, acc4;\
|
||||
ADCQ p256p<>+0x08(SB), acc5;\
|
||||
ADCQ $-1, acc6;\
|
||||
ADCQ p256p<>+0x018(SB), acc7;\
|
||||
ANDQ $1, mul0;\
|
||||
CMOVQEQ acc0, acc4;\ // CMOVQEQ: Move if equal (ZF == 1)
|
||||
CMOVQEQ acc1, acc5;\
|
||||
CMOVQEQ acc2, acc6;\
|
||||
CMOVQEQ acc3, acc7;\
|
||||
|
||||
/* ---------------------------------------*/
|
||||
#define p256SqrRound(t1) \
|
||||
\// y[1:] * y[0]
|
||||
MOVQ (8*0)(x_ptr), t0;\
|
||||
\
|
||||
MOVQ (8*1)(x_ptr), AX;\
|
||||
MULQ t0;\
|
||||
MOVQ AX, acc1;\
|
||||
MOVQ DX, acc2;\
|
||||
\
|
||||
MOVQ (8*2)(x_ptr), AX;\
|
||||
MULQ t0;\
|
||||
ADDQ AX, acc2;\
|
||||
ADCQ $0, DX;\
|
||||
MOVQ DX, acc3;\
|
||||
\
|
||||
MOVQ (8*3)(x_ptr), AX;\
|
||||
MULQ t0;\
|
||||
ADDQ AX, acc3;\
|
||||
ADCQ $0, DX;\
|
||||
MOVQ DX, acc4;\
|
||||
\// y[2:] * y[1]
|
||||
MOVQ (8*1)(x_ptr), t0;\
|
||||
\
|
||||
MOVQ (8*2)(x_ptr), AX;\
|
||||
MULQ t0;\
|
||||
ADDQ AX, acc3;\
|
||||
ADCQ $0, DX;\
|
||||
MOVQ DX, t1;\
|
||||
\
|
||||
MOVQ (8*3)(x_ptr), AX;\
|
||||
MULQ t0;\
|
||||
ADDQ t1, acc4;\
|
||||
ADCQ $0, DX;\
|
||||
ADDQ AX, acc4;\
|
||||
ADCQ $0, DX;\
|
||||
MOVQ DX, acc5;\
|
||||
\// y[3] * y[2]
|
||||
MOVQ (8*2)(x_ptr), t0;\
|
||||
\
|
||||
MOVQ (8*3)(x_ptr), AX;\
|
||||
MULQ t0;\
|
||||
ADDQ AX, acc5;\
|
||||
ADCQ $0, DX;\
|
||||
MOVQ DX, y_ptr;\
|
||||
XORQ t1, t1;\
|
||||
\// *2
|
||||
ADDQ acc1, acc1;\
|
||||
ADCQ acc2, acc2;\
|
||||
ADCQ acc3, acc3;\
|
||||
ADCQ acc4, acc4;\
|
||||
ADCQ acc5, acc5;\
|
||||
ADCQ y_ptr, y_ptr;\
|
||||
ADCQ $0, t1;\
|
||||
\// Missing products
|
||||
MOVQ (8*0)(x_ptr), AX;\
|
||||
MULQ AX;\
|
||||
MOVQ AX, acc0;\
|
||||
MOVQ DX, t0;\
|
||||
\
|
||||
MOVQ (8*1)(x_ptr), AX;\
|
||||
MULQ AX;\
|
||||
ADDQ t0, acc1;\
|
||||
ADCQ AX, acc2;\
|
||||
ADCQ $0, DX;\
|
||||
MOVQ DX, t0;\
|
||||
\
|
||||
MOVQ (8*2)(x_ptr), AX;\
|
||||
MULQ AX;\
|
||||
ADDQ t0, acc3;\
|
||||
ADCQ AX, acc4;\
|
||||
ADCQ $0, DX;\
|
||||
MOVQ DX, t0;\
|
||||
\
|
||||
MOVQ (8*3)(x_ptr), AX;\
|
||||
MULQ AX;\
|
||||
ADDQ t0, acc5;\
|
||||
ADCQ AX, y_ptr;\
|
||||
ADCQ DX, t1;\
|
||||
MOVQ t1, x_ptr;\
|
||||
\// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||
p256SqrMontReduceInline;\
|
||||
p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\
|
||||
MOVQ res_ptr, x_ptr;
|
||||
|
||||
/* ---------------------------------------*/
|
||||
#define p256SqrRoundAdx(t1) \
|
||||
XORQ acc0, acc0;\
|
||||
XORQ y_ptr, y_ptr;\
|
||||
\// x[1:] * x[0]
|
||||
MOVQ (8*0)(x_ptr), DX;\
|
||||
MULXQ (8*1)(x_ptr), acc1, acc2;\
|
||||
\
|
||||
MULXQ (8*2)(x_ptr), AX, acc3;\
|
||||
ADOXQ AX, acc2;\
|
||||
\
|
||||
MULXQ (8*3)(x_ptr), AX, acc4;\
|
||||
ADOXQ AX, acc3;\
|
||||
ADOXQ y_ptr, acc4;\
|
||||
\
|
||||
\// x[2:] * x[1]
|
||||
MOVQ (8*1)(x_ptr), DX;\
|
||||
MULXQ (8*2)(x_ptr), AX, t1;\
|
||||
ADOXQ AX, acc3;\
|
||||
\
|
||||
MULXQ (8*3)(x_ptr), AX, acc5;\
|
||||
ADCXQ t1, AX;\
|
||||
ADOXQ AX, acc4;\
|
||||
ADCXQ y_ptr, acc5;\
|
||||
\
|
||||
\// x[3] * x[2]
|
||||
MOVQ (8*2)(x_ptr), DX;\
|
||||
MULXQ (8*3)(x_ptr), AX, y_ptr ;\
|
||||
ADOXQ AX, acc5;\
|
||||
ADOXQ acc0, y_ptr;\
|
||||
\
|
||||
XORQ t1, t1;\
|
||||
\
|
||||
\// *2
|
||||
ADOXQ acc1, acc1;\
|
||||
ADOXQ acc2, acc2;\
|
||||
ADOXQ acc3, acc3;\
|
||||
ADOXQ acc4, acc4;\
|
||||
ADOXQ acc5, acc5;\
|
||||
ADOXQ y_ptr, y_ptr;\
|
||||
ADOXQ acc0, t1;\
|
||||
\
|
||||
\// Missing products
|
||||
MOVQ (8*0)(x_ptr), DX;\
|
||||
MULXQ DX, acc0, t0;\
|
||||
ADCXQ t0, acc1;\
|
||||
\
|
||||
MOVQ (8*1)(x_ptr), DX;\
|
||||
MULXQ DX, AX, t0;\
|
||||
ADCXQ AX, acc2;\
|
||||
ADCXQ t0, acc3;\
|
||||
\
|
||||
MOVQ (8*2)(x_ptr), DX;\
|
||||
MULXQ DX, AX, t0 ;\
|
||||
ADCXQ AX, acc4;\
|
||||
ADCXQ t0, acc5;\
|
||||
\
|
||||
MOVQ (8*3)(x_ptr), DX;\
|
||||
MULXQ DX, AX, x_ptr;\
|
||||
ADCXQ AX, y_ptr;\
|
||||
ADCXQ t1, x_ptr;\
|
||||
\
|
||||
\// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||
p256SqrMontReduceInline;\
|
||||
p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\
|
||||
MOVQ res_ptr, x_ptr;
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// p256OrdSqrRound(t1) expands to one squaring round that uses only MULQ and
// the ADDQ/ADCQ/SUBQ/SBBQ carry chain (no MULXQ/ADCXQ/ADOXQ).
// It squares the four 64-bit limbs at x_ptr into the 512-bit
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0], performs four
// word-wise reduction steps driven by p256ordK0<> and p256ord<>, adds the
// upper half of T, expands p256OrdReduceInline (defined outside this view)
// and finally copies res_ptr into x_ptr.
// Only the words at p256ord<>+0x00 and +0x08 are multiplied; the remaining
// contribution is applied with MOVQ/SHLQ/SHRQ/SUBQ/SBBQ (see the
// "negative part" comment in the first reduction step).
// t1 is a scratch register chosen by the caller. AX, DX, t0, acc0-acc5,
// y_ptr, x_ptr and the flags are overwritten.
#define p256OrdSqrRound(t1) \
|
||||
\// y[1:] * y[0]
|
||||
MOVQ (8*0)(x_ptr), t0;\
|
||||
\
|
||||
MOVQ (8*1)(x_ptr), AX;\
|
||||
MULQ t0;\
|
||||
MOVQ AX, acc1;\
|
||||
MOVQ DX, acc2;\
|
||||
\
|
||||
MOVQ (8*2)(x_ptr), AX;\
|
||||
MULQ t0;\
|
||||
ADDQ AX, acc2;\
|
||||
ADCQ $0, DX;\
|
||||
MOVQ DX, acc3;\
|
||||
\
|
||||
MOVQ (8*3)(x_ptr), AX;\
|
||||
MULQ t0;\
|
||||
ADDQ AX, acc3;\
|
||||
ADCQ $0, DX;\
|
||||
MOVQ DX, acc4;\
|
||||
\// y[2:] * y[1]
|
||||
MOVQ (8*1)(x_ptr), t0;\
|
||||
\
|
||||
MOVQ (8*2)(x_ptr), AX;\
|
||||
MULQ t0;\
|
||||
ADDQ AX, acc3;\
|
||||
ADCQ $0, DX;\
|
||||
MOVQ DX, t1;\
|
||||
\
|
||||
MOVQ (8*3)(x_ptr), AX;\
|
||||
MULQ t0;\
|
||||
ADDQ t1, acc4;\
|
||||
ADCQ $0, DX;\
|
||||
ADDQ AX, acc4;\
|
||||
ADCQ $0, DX;\
|
||||
MOVQ DX, acc5;\
|
||||
\// y[3] * y[2]
|
||||
MOVQ (8*2)(x_ptr), t0;\
|
||||
\
|
||||
MOVQ (8*3)(x_ptr), AX;\
|
||||
MULQ t0;\
|
||||
ADDQ AX, acc5;\
|
||||
ADCQ $0, DX;\
|
||||
MOVQ DX, y_ptr;\
|
||||
XORQ t1, t1;\
|
||||
\// *2 (t1 collects the carry out of the doubled cross products)
|
||||
ADDQ acc1, acc1;\
|
||||
ADCQ acc2, acc2;\
|
||||
ADCQ acc3, acc3;\
|
||||
ADCQ acc4, acc4;\
|
||||
ADCQ acc5, acc5;\
|
||||
ADCQ y_ptr, y_ptr;\
|
||||
ADCQ $0, t1;\
|
||||
\// Missing products: the squares y[i]*y[i]
|
||||
MOVQ (8*0)(x_ptr), AX;\
|
||||
MULQ AX;\
|
||||
MOVQ AX, acc0;\
|
||||
MOVQ DX, t0;\
|
||||
\
|
||||
MOVQ (8*1)(x_ptr), AX;\
|
||||
MULQ AX;\
|
||||
ADDQ t0, acc1;\
|
||||
ADCQ AX, acc2;\
|
||||
ADCQ $0, DX;\
|
||||
MOVQ DX, t0;\
|
||||
\
|
||||
MOVQ (8*2)(x_ptr), AX;\
|
||||
MULQ AX;\
|
||||
ADDQ t0, acc3;\
|
||||
ADCQ AX, acc4;\
|
||||
ADCQ $0, DX;\
|
||||
MOVQ DX, t0;\
|
||||
\
|
||||
MOVQ (8*3)(x_ptr), AX;\
|
||||
MULQ AX;\
|
||||
ADDQ t0, acc5;\
|
||||
ADCQ AX, y_ptr;\
|
||||
ADCQ DX, t1;\
|
||||
\// x_ptr is no longer needed as a pointer; it now holds the top limb of T.
MOVQ t1, x_ptr;\
|
||||
\
|
||||
\// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||
\// First reduction step
MOVQ acc0, AX;\
|
||||
MULQ p256ordK0<>(SB);\
|
||||
MOVQ AX, t0;\ // Y = t0 = (k0 * acc0) mod 2^64
|
||||
\
|
||||
MOVQ p256ord<>+0x00(SB), AX;\
|
||||
MULQ t0;\
|
||||
ADDQ AX, acc0;\ // (carry1, acc0) = acc0 + L(t0 * ord0)
|
||||
ADCQ $0, DX;\ // DX = carry1 + H(t0 * ord0)
|
||||
MOVQ DX, t1;\ // t1 = carry1 + H(t0 * ord0)
|
||||
MOVQ t0, acc0;\ // acc0 = t0
|
||||
\
|
||||
\// Calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
|
||||
MOVQ t0, AX;\
|
||||
MOVQ t0, DX;\
|
||||
SHLQ $32, AX;\
|
||||
SHRQ $32, DX;\
|
||||
\
|
||||
SUBQ t0, acc2;\
|
||||
SBBQ AX, acc3;\
|
||||
SBBQ DX, acc0;\
|
||||
\
|
||||
MOVQ p256ord<>+0x08(SB), AX;\
|
||||
MULQ t0;\
|
||||
ADDQ t1, acc1;\ // (carry2, acc1) = acc1 + t1
|
||||
ADCQ $0, DX;\ // DX = carry2 + H(t0*ord1)
|
||||
\
|
||||
ADDQ AX, acc1;\ // (carry3, acc1) = acc1 + t1 + L(t0*ord1)
|
||||
ADCQ DX, acc2;\
|
||||
ADCQ $0, acc3;\
|
||||
ADCQ $0, acc0;\
|
||||
\
|
||||
\// Second reduction step (same pattern, accumulator roles rotated by one word)
|
||||
MOVQ acc1, AX;\
|
||||
MULQ p256ordK0<>(SB);\
|
||||
MOVQ AX, t0;\
|
||||
\
|
||||
MOVQ p256ord<>+0x00(SB), AX;\
|
||||
MULQ t0;\
|
||||
ADDQ AX, acc1;\
|
||||
ADCQ $0, DX;\
|
||||
MOVQ DX, t1;\
|
||||
MOVQ t0, acc1;\
|
||||
\
|
||||
MOVQ t0, AX;\
|
||||
MOVQ t0, DX;\
|
||||
SHLQ $32, AX;\
|
||||
SHRQ $32, DX;\
|
||||
\
|
||||
SUBQ t0, acc3;\
|
||||
SBBQ AX, acc0;\
|
||||
SBBQ DX, acc1;\
|
||||
\
|
||||
MOVQ p256ord<>+0x08(SB), AX;\
|
||||
MULQ t0;\
|
||||
ADDQ t1, acc2;\
|
||||
ADCQ $0, DX;\
|
||||
\
|
||||
ADDQ AX, acc2;\
|
||||
ADCQ DX, acc3;\
|
||||
ADCQ $0, acc0;\
|
||||
ADCQ $0, acc1;\
|
||||
\
|
||||
\// Third reduction step
|
||||
MOVQ acc2, AX;\
|
||||
MULQ p256ordK0<>(SB);\
|
||||
MOVQ AX, t0;\
|
||||
\
|
||||
MOVQ p256ord<>+0x00(SB), AX;\
|
||||
MULQ t0;\
|
||||
ADDQ AX, acc2;\
|
||||
ADCQ $0, DX;\
|
||||
MOVQ DX, t1;\
|
||||
MOVQ t0, acc2;\
|
||||
\
|
||||
MOVQ t0, AX;\
|
||||
MOVQ t0, DX;\
|
||||
SHLQ $32, AX;\
|
||||
SHRQ $32, DX;\
|
||||
\
|
||||
SUBQ t0, acc0;\
|
||||
SBBQ AX, acc1;\
|
||||
SBBQ DX, acc2;\
|
||||
\
|
||||
MOVQ p256ord<>+0x08(SB), AX;\
|
||||
MULQ t0;\
|
||||
ADDQ t1, acc3;\
|
||||
ADCQ $0, DX;\
|
||||
\
|
||||
ADDQ AX, acc3;\
|
||||
ADCQ DX, acc0;\
|
||||
ADCQ $0, acc1;\
|
||||
ADCQ $0, acc2;\
|
||||
\
|
||||
\// Last reduction step
|
||||
MOVQ acc3, AX;\
|
||||
MULQ p256ordK0<>(SB);\
|
||||
MOVQ AX, t0;\
|
||||
\
|
||||
MOVQ p256ord<>+0x00(SB), AX;\
|
||||
MULQ t0;\
|
||||
ADDQ AX, acc3;\
|
||||
ADCQ $0, DX;\
|
||||
MOVQ DX, t1;\
|
||||
MOVQ t0, acc3;\
|
||||
\
|
||||
MOVQ t0, AX;\
|
||||
MOVQ t0, DX;\
|
||||
SHLQ $32, AX;\
|
||||
SHRQ $32, DX;\
|
||||
\
|
||||
SUBQ t0, acc1;\
|
||||
SBBQ AX, acc2;\
|
||||
SBBQ DX, acc3;\
|
||||
\
|
||||
MOVQ p256ord<>+0x08(SB), AX;\
|
||||
MULQ t0;\
|
||||
ADDQ t1, acc0;\
|
||||
ADCQ $0, DX;\
|
||||
\
|
||||
ADDQ AX, acc0;\
|
||||
ADCQ DX, acc1;\
|
||||
ADCQ $0, acc2;\
|
||||
ADCQ $0, acc3;\
|
||||
\// XORQ zeroes t0 and clears CF, so the first ADCQ below acts as a plain add.
XORQ t0, t0;\
|
||||
\// Add bits [511:256] of the sqr result; t0 receives the final carry
|
||||
ADCQ acc4, acc0;\
|
||||
ADCQ acc5, acc1;\
|
||||
ADCQ y_ptr, acc2;\
|
||||
ADCQ x_ptr, acc3;\
|
||||
ADCQ $0, t0;\
|
||||
\
|
||||
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\
|
||||
MOVQ res_ptr, x_ptr;
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// p256OrdSqrRoundAdx(t1) expands to one squaring round built on MULXQ with
// the two independent carry chains of ADCXQ (CF) and ADOXQ (OF).
// It squares the four 64-bit limbs at x_ptr into the 512-bit
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0], performs four
// word-wise reduction steps that multiply by all four words of p256ord<>
// (multiplier derived from p256ordK0<>), adds the upper half of T, expands
// p256OrdReduceInline (defined outside this view) and finally copies
// res_ptr into x_ptr.
// t1 is a scratch register chosen by the caller. AX, DX, t0, acc0-acc5,
// y_ptr, x_ptr and the flags are overwritten.
#define p256OrdSqrRoundAdx(t1) \
|
||||
\// Zero acc0 and y_ptr; XORQ also clears CF and OF, so both carry chains start clean.
XORQ acc0, acc0;\
|
||||
XORQ y_ptr, y_ptr;\
|
||||
\// y[1:] * y[0]
|
||||
MOVQ (8*0)(x_ptr), DX;\
|
||||
MULXQ (8*1)(x_ptr), acc1, acc2 ;\
|
||||
\
|
||||
MULXQ (8*2)(x_ptr), AX, acc3;\
|
||||
ADOXQ AX, acc2;\
|
||||
\
|
||||
MULXQ (8*3)(x_ptr), AX, acc4;\
|
||||
ADOXQ AX, acc3;\
|
||||
ADOXQ y_ptr, acc4;\
|
||||
\
|
||||
\// y[2:] * y[1]
|
||||
MOVQ (8*1)(x_ptr), DX;\
|
||||
MULXQ (8*2)(x_ptr), AX, t1;\
|
||||
ADOXQ AX, acc3;\
|
||||
\
|
||||
MULXQ (8*3)(x_ptr), AX, acc5;\
|
||||
ADCXQ t1, AX;\
|
||||
ADOXQ AX, acc4;\
|
||||
ADCXQ y_ptr, acc5;\
|
||||
\
|
||||
\// y[3] * y[2]
|
||||
MOVQ (8*2)(x_ptr), DX;\
|
||||
MULXQ (8*3)(x_ptr), AX, y_ptr;\
|
||||
ADOXQ AX, acc5;\
|
||||
ADOXQ acc0, y_ptr;\
|
||||
\
|
||||
\// Clear t1 and, as a side effect of XORQ, CF and OF before the next two chains.
XORQ t1, t1;\
|
||||
\// *2 (doubling runs on the OF chain; acc0 is still zero here)
|
||||
ADOXQ acc1, acc1;\
|
||||
ADOXQ acc2, acc2;\
|
||||
ADOXQ acc3, acc3;\
|
||||
ADOXQ acc4, acc4;\
|
||||
ADOXQ acc5, acc5;\
|
||||
ADOXQ y_ptr, y_ptr;\
|
||||
ADOXQ acc0, t1;\
|
||||
\
|
||||
\// Missing products: the squares y[i]*y[i], added on the CF chain
|
||||
MOVQ (8*0)(x_ptr), DX;\
|
||||
MULXQ DX, acc0, t0;\
|
||||
ADCXQ t0, acc1;\
|
||||
\
|
||||
MOVQ (8*1)(x_ptr), DX;\
|
||||
MULXQ DX, AX, t0;\
|
||||
ADCXQ AX, acc2;\
|
||||
ADCXQ t0, acc3;\
|
||||
\
|
||||
MOVQ (8*2)(x_ptr), DX;\
|
||||
MULXQ DX, AX, t0 ;\
|
||||
ADCXQ AX, acc4;\
|
||||
ADCXQ t0, acc5;\
|
||||
\
|
||||
\// Last load through x_ptr; the register is then reused for the top limb of T.
MOVQ (8*3)(x_ptr), DX;\
|
||||
MULXQ DX, AX, x_ptr;\
|
||||
ADCXQ AX, y_ptr;\
|
||||
ADCXQ t1, x_ptr;\
|
||||
\
|
||||
\// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||
\// First reduction step
|
||||
MOVQ acc0, DX;\
|
||||
\// DX = low 64 bits of acc0 * k0 (multiplier for this step); the high half in AX is overwritten next.
MULXQ p256ordK0<>(SB), DX, AX;\
|
||||
\
|
||||
MULXQ p256ord<>+0x00(SB), AX, t0;\
|
||||
ADOXQ AX, acc0 ;\// (carry1, acc0) = acc0 + t0 * ord0
|
||||
\
|
||||
MULXQ p256ord<>+0x08(SB), AX, t1;\
|
||||
ADCXQ t0, AX;\
|
||||
ADOXQ AX, acc1;\
|
||||
\
|
||||
MULXQ p256ord<>+0x10(SB), AX, t0;\
|
||||
ADCXQ t1, AX;\
|
||||
ADOXQ AX, acc2;\
|
||||
\
|
||||
MULXQ p256ord<>+0x18(SB), AX, acc0;\
|
||||
ADCXQ t0, AX;\
|
||||
ADOXQ AX, acc3;\
|
||||
\// MOVQ (not XORQ) zeroes t0 without touching CF/OF, which the next two adds still consume.
MOVQ $0, t0;\
|
||||
ADCXQ t0, acc0;\
|
||||
ADOXQ t0, acc0;\
|
||||
\
|
||||
\// Second reduction step (same pattern, accumulator roles rotated by one word)
|
||||
MOVQ acc1, DX;\
|
||||
MULXQ p256ordK0<>(SB), DX, AX;\
|
||||
\
|
||||
MULXQ p256ord<>+0x00(SB), AX, t0;\
|
||||
ADOXQ AX, acc1;\
|
||||
\
|
||||
MULXQ p256ord<>+0x08(SB), AX, t1;\
|
||||
ADCXQ t0, AX;\
|
||||
ADOXQ AX, acc2;\
|
||||
\
|
||||
MULXQ p256ord<>+0x10(SB), AX, t0;\
|
||||
ADCXQ t1, AX;\
|
||||
ADOXQ AX, acc3;\
|
||||
\
|
||||
MULXQ p256ord<>+0x18(SB), AX, acc1;\
|
||||
ADCXQ t0, AX;\
|
||||
ADOXQ AX, acc0;\
|
||||
MOVQ $0, t0;\
|
||||
ADCXQ t0, acc1;\
|
||||
ADOXQ t0, acc1;\
|
||||
\
|
||||
\// Third reduction step
|
||||
MOVQ acc2, DX;\
|
||||
MULXQ p256ordK0<>(SB), DX, AX;\
|
||||
\
|
||||
MULXQ p256ord<>+0x00(SB), AX, t0;\
|
||||
ADOXQ AX, acc2;\
|
||||
\
|
||||
MULXQ p256ord<>+0x08(SB), AX, t1;\
|
||||
ADCXQ t0, AX;\
|
||||
ADOXQ AX, acc3;\
|
||||
\
|
||||
MULXQ p256ord<>+0x10(SB), AX, t0;\
|
||||
ADCXQ t1, AX;\
|
||||
ADOXQ AX, acc0;\
|
||||
\
|
||||
MULXQ p256ord<>+0x18(SB), AX, acc2;\
|
||||
ADCXQ t0, AX;\
|
||||
ADOXQ AX, acc1;\
|
||||
MOVQ $0, t0;\
|
||||
ADCXQ t0, acc2;\
|
||||
ADOXQ t0, acc2;\
|
||||
\
|
||||
\// Last reduction step
|
||||
MOVQ acc3, DX;\
|
||||
MULXQ p256ordK0<>(SB), DX, AX;\
|
||||
\
|
||||
MULXQ p256ord<>+0x00(SB), AX, t0;\
|
||||
ADOXQ AX, acc3;\
|
||||
\
|
||||
MULXQ p256ord<>+0x08(SB), AX, t1;\
|
||||
ADCXQ t0, AX;\
|
||||
ADOXQ AX, acc0;\
|
||||
\
|
||||
MULXQ p256ord<>+0x10(SB), AX, t0;\
|
||||
ADCXQ t1, AX;\
|
||||
ADOXQ AX, acc1;\
|
||||
\
|
||||
MULXQ p256ord<>+0x18(SB), AX, acc3;\
|
||||
ADCXQ t0, AX;\
|
||||
ADOXQ AX, acc2;\
|
||||
MOVQ $0, t0;\
|
||||
ADCXQ t0, acc3;\
|
||||
ADOXQ t0, acc3;\
|
||||
\
|
||||
\// XORQ zeroes t1 and clears CF; t0 is already zero, so it ends up holding only the final carry.
XORQ t1, t1;\
|
||||
\// Add bits [511:256] of the sqr result
|
||||
ADCXQ acc4, acc0;\
|
||||
ADCXQ acc5, acc1;\
|
||||
ADCXQ y_ptr, acc2;\
|
||||
ADCXQ x_ptr, acc3;\
|
||||
ADCXQ t1, t0;\
|
||||
\
|
||||
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\
|
||||
MOVQ res_ptr, x_ptr;
|
||||
|
||||
// p256SqrInternalInline squares the 256-bit value held in
// [acc7, acc6, acc5, acc4] using MULQ and the ADDQ/ADCQ carry chain, leaving
// the 512-bit product in T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0], and
// then expands sm2P256SqrReductionInline (defined outside this view).
// mul0/mul1 are used as the low/high halves of each MULQ result.
// acc4 is reused as carry scratch in the "missing products" part, so the
// input limb in acc4 is not preserved.
#define p256SqrInternalInline \
|
||||
\// Cross products acc4 * [acc5, acc6, acc7]
MOVQ acc4, mul0;\
|
||||
MULQ acc5;\
|
||||
MOVQ mul0, acc1;\
|
||||
MOVQ mul1, acc2;\
|
||||
\
|
||||
MOVQ acc4, mul0;\
|
||||
MULQ acc6;\
|
||||
ADDQ mul0, acc2;\
|
||||
ADCQ $0, mul1;\
|
||||
MOVQ mul1, acc3;\
|
||||
\
|
||||
MOVQ acc4, mul0;\
|
||||
MULQ acc7;\
|
||||
ADDQ mul0, acc3;\
|
||||
ADCQ $0, mul1;\
|
||||
MOVQ mul1, t0;\
|
||||
\
|
||||
\// Cross products acc5 * [acc6, acc7]; acc0 is a temporary for the carry word
MOVQ acc5, mul0;\
|
||||
MULQ acc6;\
|
||||
ADDQ mul0, acc3;\
|
||||
ADCQ $0, mul1;\
|
||||
MOVQ mul1, acc0;\
|
||||
\
|
||||
MOVQ acc5, mul0;\
|
||||
MULQ acc7;\
|
||||
ADDQ acc0, t0;\
|
||||
ADCQ $0, mul1;\
|
||||
ADDQ mul0, t0;\
|
||||
ADCQ $0, mul1;\
|
||||
MOVQ mul1, t1;\
|
||||
\
|
||||
\// Cross product acc6 * acc7
MOVQ acc6, mul0;\
|
||||
MULQ acc7;\
|
||||
ADDQ mul0, t1;\
|
||||
ADCQ $0, mul1;\
|
||||
MOVQ mul1, t2;\
|
||||
XORQ t3, t3;\
|
||||
\// *2 (t3 collects the carry out of the doubled cross products)
|
||||
ADDQ acc1, acc1;\
|
||||
ADCQ acc2, acc2;\
|
||||
ADCQ acc3, acc3;\
|
||||
ADCQ t0, t0;\
|
||||
ADCQ t1, t1;\
|
||||
ADCQ t2, t2;\
|
||||
ADCQ $0, t3;\
|
||||
\// Missing products: the squares of acc4..acc7 (acc4 doubles as carry scratch)
|
||||
MOVQ acc4, mul0;\
|
||||
MULQ mul0;\
|
||||
MOVQ mul0, acc0;\
|
||||
MOVQ mul1, acc4;\
|
||||
\
|
||||
MOVQ acc5, mul0;\
|
||||
MULQ mul0;\
|
||||
ADDQ acc4, acc1;\
|
||||
ADCQ mul0, acc2;\
|
||||
ADCQ $0, mul1;\
|
||||
MOVQ mul1, acc4;\
|
||||
\
|
||||
MOVQ acc6, mul0;\
|
||||
MULQ mul0;\
|
||||
ADDQ acc4, acc3;\
|
||||
ADCQ mul0, t0;\
|
||||
ADCQ $0, mul1;\
|
||||
MOVQ mul1, acc4;\
|
||||
\
|
||||
MOVQ acc7, mul0;\
|
||||
MULQ mul0;\
|
||||
ADDQ acc4, t1;\
|
||||
ADCQ mul0, t2;\
|
||||
ADCQ mul1, t3;\
|
||||
\// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
|
||||
sm2P256SqrReductionInline;
|
||||
|
||||
// p256SqrInternalInlineAdx squares the 256-bit value held in
// [acc7, acc6, acc5, acc4] using MULXQ with the ADCXQ (CF) and ADOXQ (OF)
// carry chains, leaving the 512-bit product in
// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0], and then expands
// sm2P256SqrReductionInline (defined outside this view).
// mul1 is loaded with the multiplier before each MULXQ group, i.e. it is
// expected to be DX, the implicit MULXQ source; mul0 is scratch.
// acc4 is reused as scratch in the "missing products" part, so the input
// limb in acc4 is not preserved.
#define p256SqrInternalInlineAdx \
|
||||
\// Zero acc0 and t2; XORQ also clears CF and OF, so both carry chains start clean.
XORQ acc0, acc0;\
|
||||
XORQ t2, t2;\
|
||||
\// Cross products acc4 * [acc5, acc6, acc7]
MOVQ acc4, mul1;\
|
||||
MULXQ acc5, acc1, acc2;\
|
||||
\
|
||||
MULXQ acc6, mul0, acc3;\
|
||||
ADOXQ mul0, acc2;\
|
||||
\
|
||||
MULXQ acc7, mul0, t0;\
|
||||
ADOXQ mul0, acc3;\
|
||||
ADOXQ t2, t0;\
|
||||
\
|
||||
\// Cross products acc5 * [acc6, acc7]
MOVQ acc5, mul1;\
|
||||
MULXQ acc6, mul0, t3;\
|
||||
ADOXQ mul0, acc3;\
|
||||
\
|
||||
MULXQ acc7, mul0, t1;\
|
||||
ADCXQ t3, mul0;\
|
||||
ADOXQ mul0, t0;\
|
||||
ADCXQ t2, t1;\
|
||||
\
|
||||
\// Cross product acc6 * acc7
MOVQ acc6, mul1;\
|
||||
MULXQ acc7, mul0, t2;\
|
||||
ADOXQ mul0, t1;\
|
||||
ADOXQ acc0, t2;\
|
||||
\// Clear t3 and, as a side effect of XORQ, CF and OF before the doubling chain.
XORQ t3, t3;\
|
||||
\
|
||||
\// *2 (doubling runs on the OF chain; acc0 is still zero here)
|
||||
ADOXQ acc1, acc1;\
|
||||
ADOXQ acc2, acc2;\
|
||||
ADOXQ acc3, acc3;\
|
||||
ADOXQ t0, t0;\
|
||||
ADOXQ t1, t1;\
|
||||
ADOXQ t2, t2;\
|
||||
ADOXQ acc0, t3;\
|
||||
\
|
||||
\// Missing products: the squares of acc4..acc7 (acc4 doubles as scratch)
|
||||
MOVQ acc4, mul1;\
|
||||
MULXQ mul1, acc0, acc4;\
|
||||
\// ADDQ starts the CF carry chain that the ADCXQ instructions below continue.
ADDQ acc4, acc1;\
|
||||
\
|
||||
MOVQ acc5, mul1;\
|
||||
MULXQ mul1, mul0, acc4;\
|
||||
ADCXQ mul0, acc2;\
|
||||
ADCXQ acc4, acc3;\
|
||||
\
|
||||
MOVQ acc6, mul1;\
|
||||
MULXQ mul1, mul0, acc4;\
|
||||
ADCXQ mul0, t0;\
|
||||
ADCXQ acc4, t1;\
|
||||
\
|
||||
MOVQ acc7, mul1;\
|
||||
MULXQ mul1, mul0, acc4;\
|
||||
ADCXQ mul0, t2;\
|
||||
ADCXQ acc4, t3;\
|
||||
\// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
|
||||
sm2P256SqrReductionInline;
|
||||
|
||||
// p256IsZeroInline leaves 1 in AX if [acc4..acc7] is all zero or equals P
|
||||
// (both represent zero), and 0 otherwise. It overwrites [acc4..acc7], t0 and t1.
|
||||
#define p256IsZeroInline \
|
||||
\// AX is the result: it becomes 1 if the input represents zero.
|
||||
XORQ AX, AX;\
|
||||
MOVQ $1, t1;\
|
||||
\// Check whether [acc4..acc7] are all zero (the final ORQ sets ZF if so).
|
||||
MOVQ acc4, t0;\
|
||||
ORQ acc5, t0;\
|
||||
ORQ acc6, t0;\
|
||||
ORQ acc7, t0;\
|
||||
\// Set AX to 1 if so. (CMOV of a constant to a register doesn't
|
||||
\// appear to be supported in Go. Thus t1 = 1.)
|
||||
CMOVQEQ t1, AX;\ // CMOVQEQ: Move if equal (ZF == 1)
|
||||
\// XOR [acc4..acc7] with P and compare with zero again; this destroys acc4..acc7.
|
||||
XORQ $-1, acc4;\
|
||||
XORQ p256p<>+0x08(SB), acc5;\
|
||||
XORQ $-1, acc6;\
|
||||
XORQ p256p<>+0x018(SB), acc7;\
|
||||
ORQ acc5, acc4;\
|
||||
ORQ acc6, acc4;\
|
||||
ORQ acc7, acc4;\
|
||||
\// Set AX to 1 if the input was equal to P.
|
||||
\// CMOVQEQ: Move if equal (ZF == 1)
|
||||
CMOVQEQ t1, AX;
|
||||
|
@ -6,11 +6,21 @@
|
||||
// https://eprint.iacr.org/2013/816.pdf
|
||||
//go:build amd64 && !purego && plugin
|
||||
|
||||
// plugin mode - DO NOT use the R15 Register.
|
||||
// The functions below differ from the non-plugin build:
|
||||
// 1.p256Sqr
|
||||
// 2.p256OrdSqr
|
||||
// 3.sm2P256MulInternal
|
||||
// 4.sm2P256SqrInternal
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
#include "p256_macros_amd64.s"
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// This func is the same as in non-plugin mode, except that it uses BP to store n
|
||||
// and does not use R15.
|
||||
//
|
||||
// func p256Sqr(res, in *p256Element, n int)
|
||||
TEXT ·p256Sqr(SB),NOSPLIT,$0
|
||||
MOVQ res+0(FP), res_ptr
|
||||
@ -21,162 +31,21 @@ TEXT ·p256Sqr(SB),NOSPLIT,$0
|
||||
JEQ sqrBMI2
|
||||
|
||||
sqrLoop:
|
||||
// y[1:] * y[0]
|
||||
MOVQ (8*0)(x_ptr), t0
|
||||
|
||||
MOVQ (8*1)(x_ptr), AX
|
||||
MULQ t0
|
||||
MOVQ AX, acc1
|
||||
MOVQ DX, acc2
|
||||
|
||||
MOVQ (8*2)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, acc3
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, acc4
|
||||
// y[2:] * y[1]
|
||||
MOVQ (8*1)(x_ptr), t0
|
||||
|
||||
MOVQ (8*2)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, BX
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ BX, acc4
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc4
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, acc5
|
||||
// y[3] * y[2]
|
||||
MOVQ (8*2)(x_ptr), t0
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc5
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, y_ptr
|
||||
XORQ BX, BX
|
||||
// *2
|
||||
ADDQ acc1, acc1
|
||||
ADCQ acc2, acc2
|
||||
ADCQ acc3, acc3
|
||||
ADCQ acc4, acc4
|
||||
ADCQ acc5, acc5
|
||||
ADCQ y_ptr, y_ptr
|
||||
ADCQ $0, BX
|
||||
// Missing products
|
||||
MOVQ (8*0)(x_ptr), AX
|
||||
MULQ AX
|
||||
MOVQ AX, acc0
|
||||
MOVQ DX, t0
|
||||
|
||||
MOVQ (8*1)(x_ptr), AX
|
||||
MULQ AX
|
||||
ADDQ t0, acc1
|
||||
ADCQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t0
|
||||
|
||||
MOVQ (8*2)(x_ptr), AX
|
||||
MULQ AX
|
||||
ADDQ t0, acc3
|
||||
ADCQ AX, acc4
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t0
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ AX
|
||||
ADDQ t0, acc5
|
||||
ADCQ AX, y_ptr
|
||||
ADCQ DX, BX
|
||||
MOVQ BX, x_ptr
|
||||
|
||||
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||
p256SqrMontReduce()
|
||||
p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr)
|
||||
MOVQ res_ptr, x_ptr
|
||||
p256SqrRound(BX)
|
||||
DECQ BP
|
||||
JNE sqrLoop
|
||||
RET
|
||||
|
||||
sqrBMI2:
|
||||
XORQ acc0, acc0
|
||||
XORQ y_ptr, y_ptr
|
||||
// x[1:] * x[0]
|
||||
MOVQ (8*0)(x_ptr), DX
|
||||
MULXQ (8*1)(x_ptr), acc1, acc2
|
||||
|
||||
MULXQ (8*2)(x_ptr), AX, acc3
|
||||
ADOXQ AX, acc2
|
||||
|
||||
MULXQ (8*3)(x_ptr), AX, acc4
|
||||
ADOXQ AX, acc3
|
||||
ADOXQ y_ptr, acc4
|
||||
|
||||
// x[2:] * x[1]
|
||||
MOVQ (8*1)(x_ptr), DX
|
||||
MULXQ (8*2)(x_ptr), AX, BX
|
||||
ADOXQ AX, acc3
|
||||
|
||||
MULXQ (8*3)(x_ptr), AX, acc5
|
||||
ADCXQ BX, AX
|
||||
ADOXQ AX, acc4
|
||||
ADCXQ y_ptr, acc5
|
||||
|
||||
// x[3] * x[2]
|
||||
MOVQ (8*2)(x_ptr), DX
|
||||
MULXQ (8*3)(x_ptr), AX, y_ptr
|
||||
ADOXQ AX, acc5
|
||||
ADOXQ acc0, y_ptr
|
||||
XORQ BX, BX
|
||||
|
||||
// *2
|
||||
ADOXQ acc1, acc1
|
||||
ADOXQ acc2, acc2
|
||||
ADOXQ acc3, acc3
|
||||
ADOXQ acc4, acc4
|
||||
ADOXQ acc5, acc5
|
||||
ADOXQ y_ptr, y_ptr
|
||||
ADOXQ acc0, BX
|
||||
|
||||
// Missing products
|
||||
MOVQ (8*0)(x_ptr), DX
|
||||
MULXQ DX, acc0, t0
|
||||
ADCXQ t0, acc1
|
||||
|
||||
MOVQ (8*1)(x_ptr), DX
|
||||
MULXQ DX, AX, t0
|
||||
ADCXQ AX, acc2
|
||||
ADCXQ t0, acc3
|
||||
|
||||
MOVQ (8*2)(x_ptr), DX
|
||||
MULXQ DX, AX, t0
|
||||
ADCXQ AX, acc4
|
||||
ADCXQ t0, acc5
|
||||
|
||||
MOVQ (8*3)(x_ptr), DX
|
||||
MULXQ DX, AX, x_ptr
|
||||
ADCXQ AX, y_ptr
|
||||
ADCXQ BX, x_ptr
|
||||
|
||||
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||
p256SqrMontReduce()
|
||||
p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr)
|
||||
MOVQ res_ptr, x_ptr
|
||||
p256SqrRoundAdx(BX)
|
||||
DECQ BP
|
||||
JNE sqrBMI2
|
||||
RET
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// This func is the same as in non-plugin mode, except that it uses BP to store n
|
||||
// and does not use R15.
|
||||
//
|
||||
// func p256OrdSqr(res, in *p256OrdElement, n int)
|
||||
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
|
||||
MOVQ res+0(FP), res_ptr
|
||||
@ -187,385 +56,14 @@ TEXT ·p256OrdSqr(SB),NOSPLIT,$0
|
||||
JEQ ordSqrLoopBMI2
|
||||
|
||||
ordSqrLoop:
|
||||
// y[1:] * y[0]
|
||||
MOVQ (8*0)(x_ptr), t0
|
||||
|
||||
MOVQ (8*1)(x_ptr), AX
|
||||
MULQ t0
|
||||
MOVQ AX, acc1
|
||||
MOVQ DX, acc2
|
||||
|
||||
MOVQ (8*2)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, acc3
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, acc4
|
||||
// y[2:] * y[1]
|
||||
MOVQ (8*1)(x_ptr), t0
|
||||
|
||||
MOVQ (8*2)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, BX
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ BX, acc4
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, acc4
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, acc5
|
||||
// y[3] * y[2]
|
||||
MOVQ (8*2)(x_ptr), t0
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc5
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, y_ptr
|
||||
XORQ BX, BX
|
||||
// *2
|
||||
ADDQ acc1, acc1
|
||||
ADCQ acc2, acc2
|
||||
ADCQ acc3, acc3
|
||||
ADCQ acc4, acc4
|
||||
ADCQ acc5, acc5
|
||||
ADCQ y_ptr, y_ptr
|
||||
ADCQ $0, BX
|
||||
// Missing products
|
||||
MOVQ (8*0)(x_ptr), AX
|
||||
MULQ AX
|
||||
MOVQ AX, acc0
|
||||
MOVQ DX, t0
|
||||
|
||||
MOVQ (8*1)(x_ptr), AX
|
||||
MULQ AX
|
||||
ADDQ t0, acc1
|
||||
ADCQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t0
|
||||
|
||||
MOVQ (8*2)(x_ptr), AX
|
||||
MULQ AX
|
||||
ADDQ t0, acc3
|
||||
ADCQ AX, acc4
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, t0
|
||||
|
||||
MOVQ (8*3)(x_ptr), AX
|
||||
MULQ AX
|
||||
ADDQ t0, acc5
|
||||
ADCQ AX, y_ptr
|
||||
ADCQ DX, BX
|
||||
MOVQ BX, x_ptr
|
||||
|
||||
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||
// First reduction step, [ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0]
|
||||
MOVQ acc0, AX
|
||||
MULQ p256ordK0<>(SB)
|
||||
MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64
|
||||
|
||||
MOVQ p256ord<>+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0)
|
||||
ADCQ $0, DX // DX = carry1 + H(t0 * ord0)
|
||||
MOVQ DX, BX // BX = carry1 + H(t0 * ord0)
|
||||
MOVQ t0, acc0 // acc0 = t0
|
||||
|
||||
// calculate the negative part: [acc0, acc3, acc2] - [0, 0x100000000, 1] * t0
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc2
|
||||
SBBQ AX, acc3
|
||||
SBBQ DX, acc0
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ BX, acc1 // (carry2, acc1) = acc1 + BX
|
||||
ADCQ $0, DX // DX = carry2 + H(t0*ord1)
|
||||
|
||||
ADDQ AX, acc1 // (carry3, acc1) = acc1 + BX + L(t0*ord1)
|
||||
ADCQ DX, acc2
|
||||
ADCQ $0, acc3
|
||||
ADCQ $0, acc0
|
||||
|
||||
// Second reduction step
|
||||
MOVQ acc1, AX
|
||||
MULQ p256ordK0<>(SB)
|
||||
MOVQ AX, t0
|
||||
|
||||
MOVQ p256ord<>+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc1
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, BX
|
||||
MOVQ t0, acc1
|
||||
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc3
|
||||
SBBQ AX, acc0
|
||||
SBBQ DX, acc1
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ BX, acc2
|
||||
ADCQ $0, DX
|
||||
|
||||
ADDQ AX, acc2
|
||||
ADCQ DX, acc3
|
||||
ADCQ $0, acc0
|
||||
ADCQ $0, acc1
|
||||
|
||||
// Third reduction step
|
||||
MOVQ acc2, AX
|
||||
MULQ p256ordK0<>(SB)
|
||||
MOVQ AX, t0
|
||||
|
||||
MOVQ p256ord<>+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc2
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, BX
|
||||
MOVQ t0, acc2
|
||||
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc0
|
||||
SBBQ AX, acc1
|
||||
SBBQ DX, acc2
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ BX, acc3
|
||||
ADCQ $0, DX
|
||||
|
||||
ADDQ AX, acc3
|
||||
ADCQ DX, acc0
|
||||
ADCQ $0, acc1
|
||||
ADCQ $0, acc2
|
||||
|
||||
// Last reduction step
|
||||
MOVQ acc3, AX
|
||||
MULQ p256ordK0<>(SB)
|
||||
MOVQ AX, t0
|
||||
|
||||
MOVQ p256ord<>+0x00(SB), AX
|
||||
MULQ t0
|
||||
ADDQ AX, acc3
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, BX
|
||||
MOVQ t0, acc3
|
||||
|
||||
MOVQ t0, AX
|
||||
MOVQ t0, DX
|
||||
SHLQ $32, AX
|
||||
SHRQ $32, DX
|
||||
|
||||
SUBQ t0, acc1
|
||||
SBBQ AX, acc2
|
||||
SBBQ DX, acc3
|
||||
|
||||
MOVQ p256ord<>+0x08(SB), AX
|
||||
MULQ t0
|
||||
ADDQ BX, acc0
|
||||
ADCQ $0, DX
|
||||
|
||||
ADDQ AX, acc0
|
||||
ADCQ DX, acc1
|
||||
ADCQ $0, acc2
|
||||
ADCQ $0, acc3
|
||||
|
||||
XORQ t0, t0
|
||||
// Add bits [511:256] of the sqr result
|
||||
ADCQ acc4, acc0
|
||||
ADCQ acc5, acc1
|
||||
ADCQ y_ptr, acc2
|
||||
ADCQ x_ptr, acc3
|
||||
ADCQ $0, t0
|
||||
|
||||
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr)
|
||||
MOVQ res_ptr, x_ptr
|
||||
p256OrdSqrRound(BX)
|
||||
DECQ BP
|
||||
JNE ordSqrLoop
|
||||
|
||||
RET
|
||||
|
||||
ordSqrLoopBMI2:
|
||||
XORQ acc0, acc0
|
||||
XORQ y_ptr, y_ptr
|
||||
// y[1:] * y[0]
|
||||
MOVQ (8*0)(x_ptr), DX
|
||||
MULXQ (8*1)(x_ptr), acc1, acc2
|
||||
|
||||
MULXQ (8*2)(x_ptr), AX, acc3
|
||||
ADOXQ AX, acc2
|
||||
|
||||
MULXQ (8*3)(x_ptr), AX, acc4
|
||||
ADOXQ AX, acc3
|
||||
ADOXQ y_ptr, acc4
|
||||
|
||||
// y[2:] * y[1]
|
||||
MOVQ (8*1)(x_ptr), DX
|
||||
MULXQ (8*2)(x_ptr), AX, BX
|
||||
ADOXQ AX, acc3
|
||||
|
||||
MULXQ (8*3)(x_ptr), AX, acc5
|
||||
ADCXQ BX, AX
|
||||
ADOXQ AX, acc4
|
||||
ADCXQ y_ptr, acc5
|
||||
|
||||
// y[3] * y[2]
|
||||
MOVQ (8*2)(x_ptr), DX
|
||||
MULXQ (8*3)(x_ptr), AX, y_ptr
|
||||
ADOXQ AX, acc5
|
||||
ADOXQ acc0, y_ptr
|
||||
|
||||
XORQ BX, BX
|
||||
// *2
|
||||
ADOXQ acc1, acc1
|
||||
ADOXQ acc2, acc2
|
||||
ADOXQ acc3, acc3
|
||||
ADOXQ acc4, acc4
|
||||
ADOXQ acc5, acc5
|
||||
ADOXQ y_ptr, y_ptr
|
||||
ADOXQ acc0, BX
|
||||
|
||||
// Missing products
|
||||
MOVQ (8*0)(x_ptr), DX
|
||||
MULXQ DX, acc0, t0
|
||||
ADCXQ t0, acc1
|
||||
|
||||
MOVQ (8*1)(x_ptr), DX
|
||||
MULXQ DX, AX, t0
|
||||
ADCXQ AX, acc2
|
||||
ADCXQ t0, acc3
|
||||
|
||||
MOVQ (8*2)(x_ptr), DX
|
||||
MULXQ DX, AX, t0
|
||||
ADCXQ AX, acc4
|
||||
ADCXQ t0, acc5
|
||||
|
||||
MOVQ (8*3)(x_ptr), DX
|
||||
MULXQ DX, AX, x_ptr
|
||||
ADCXQ AX, y_ptr
|
||||
ADCXQ BX, x_ptr
|
||||
|
||||
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||
// First reduction step
|
||||
MOVQ acc0, DX
|
||||
MULXQ p256ordK0<>(SB), DX, AX
|
||||
|
||||
MULXQ p256ord<>+0x00(SB), AX, t0
|
||||
ADOXQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0
|
||||
|
||||
MULXQ p256ord<>+0x08(SB), AX, BX
|
||||
ADCXQ t0, AX
|
||||
ADOXQ AX, acc1
|
||||
|
||||
MULXQ p256ord<>+0x10(SB), AX, t0
|
||||
ADCXQ BX, AX
|
||||
ADOXQ AX, acc2
|
||||
|
||||
MULXQ p256ord<>+0x18(SB), AX, acc0
|
||||
ADCXQ t0, AX
|
||||
ADOXQ AX, acc3
|
||||
MOVQ $0, t0
|
||||
ADCXQ t0, acc0
|
||||
ADOXQ t0, acc0
|
||||
|
||||
// Second reduction step
|
||||
MOVQ acc1, DX
|
||||
MULXQ p256ordK0<>(SB), DX, AX
|
||||
|
||||
MULXQ p256ord<>+0x00(SB), AX, t0
|
||||
ADOXQ AX, acc1
|
||||
|
||||
MULXQ p256ord<>+0x08(SB), AX, BX
|
||||
ADCXQ t0, AX
|
||||
ADOXQ AX, acc2
|
||||
|
||||
MULXQ p256ord<>+0x10(SB), AX, t0
|
||||
ADCXQ BX, AX
|
||||
ADOXQ AX, acc3
|
||||
|
||||
MULXQ p256ord<>+0x18(SB), AX, acc1
|
||||
ADCXQ t0, AX
|
||||
ADOXQ AX, acc0
|
||||
MOVQ $0, t0
|
||||
ADCXQ t0, acc1
|
||||
ADOXQ t0, acc1
|
||||
|
||||
// Third reduction step
|
||||
MOVQ acc2, DX
|
||||
MULXQ p256ordK0<>(SB), DX, AX
|
||||
|
||||
MULXQ p256ord<>+0x00(SB), AX, t0
|
||||
ADOXQ AX, acc2
|
||||
|
||||
MULXQ p256ord<>+0x08(SB), AX, BX
|
||||
ADCXQ t0, AX
|
||||
ADOXQ AX, acc3
|
||||
|
||||
MULXQ p256ord<>+0x10(SB), AX, t0
|
||||
ADCXQ BX, AX
|
||||
ADOXQ AX, acc0
|
||||
|
||||
MULXQ p256ord<>+0x18(SB), AX, acc2
|
||||
ADCXQ t0, AX
|
||||
ADOXQ AX, acc1
|
||||
MOVQ $0, t0
|
||||
ADCXQ t0, acc2
|
||||
ADOXQ t0, acc2
|
||||
|
||||
// Last reduction step
|
||||
MOVQ acc3, DX
|
||||
MULXQ p256ordK0<>(SB), DX, AX
|
||||
|
||||
MULXQ p256ord<>+0x00(SB), AX, t0
|
||||
ADOXQ AX, acc3
|
||||
|
||||
MULXQ p256ord<>+0x08(SB), AX, BX
|
||||
ADCXQ t0, AX
|
||||
ADOXQ AX, acc0
|
||||
|
||||
MULXQ p256ord<>+0x10(SB), AX, t0
|
||||
ADCXQ BX, AX
|
||||
ADOXQ AX, acc1
|
||||
|
||||
MULXQ p256ord<>+0x18(SB), AX, acc3
|
||||
ADCXQ t0, AX
|
||||
ADOXQ AX, acc2
|
||||
MOVQ $0, t0
|
||||
ADCXQ t0, acc3
|
||||
ADOXQ t0, acc3
|
||||
|
||||
XORQ BX, BX
|
||||
// Add bits [511:256] of the sqr result
|
||||
ADCXQ acc4, acc0
|
||||
ADCXQ acc5, acc1
|
||||
ADCXQ y_ptr, acc2
|
||||
ADCXQ x_ptr, acc3
|
||||
ADCXQ BX, t0
|
||||
|
||||
p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr)
|
||||
MOVQ res_ptr, x_ptr
|
||||
p256OrdSqrRoundAdx(BX)
|
||||
DECQ BP
|
||||
JNE ordSqrLoopBMI2
|
||||
|
||||
@ -599,33 +97,6 @@ ordSqrLoopBMI2:
|
||||
#define t2 SI
|
||||
#define t3 R9
|
||||
|
||||
/* ---------------------------------------*/
|
||||
// sm2P256SubInternal computes
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]
// and adds P back when the subtraction borrowed.
// It overwrites acc0..acc3 (copy of the raw difference), mul0 (borrow
// indicator) and the flags.
|
||||
TEXT sm2P256SubInternal(SB),NOSPLIT,$0
|
||||
XORQ mul0, mul0
|
||||
SUBQ t0, acc4
|
||||
SBBQ t1, acc5
|
||||
SBBQ t2, acc6
|
||||
SBBQ t3, acc7
|
||||
// mul0 becomes all ones if the subtraction borrowed, zero otherwise.
SBBQ $0, mul0
|
||||
|
||||
// Keep the raw difference so it can be restored when no correction is needed.
MOVQ acc4, acc0
|
||||
MOVQ acc5, acc1
|
||||
MOVQ acc6, acc2
|
||||
MOVQ acc7, acc3
|
||||
|
||||
// Add P (limbs: -1, p256p<>+0x08, -1, p256p<>+0x18) to the difference.
ADDQ $-1, acc4
|
||||
ADCQ p256p<>+0x08(SB), acc5
|
||||
ADCQ $-1, acc6
|
||||
ADCQ p256p<>+0x018(SB), acc7
|
||||
// ZF is set when there was no borrow.
ANDQ $1, mul0
|
||||
|
||||
// No borrow: discard the corrected value and restore the raw difference.
CMOVQEQ acc0, acc4
|
||||
CMOVQEQ acc1, acc5
|
||||
CMOVQEQ acc2, acc6
|
||||
CMOVQEQ acc3, acc7
|
||||
|
||||
RET
|
||||
/* ---------------------------------------*/
|
||||
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] * [t3, t2, t1, t0]
|
||||
TEXT sm2P256MulInternal(SB),NOSPLIT,$8
|
||||
@ -634,7 +105,7 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
|
||||
|
||||
MOVQ acc4, mul0
|
||||
MULQ t0
|
||||
MOVQ mul0, X0
|
||||
MOVQ mul0, X0 // uses X0 as temp register/storage
|
||||
MOVQ mul1, acc1
|
||||
|
||||
MOVQ acc4, mul0
|
||||
@ -746,7 +217,7 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
|
||||
MOVQ mul1, acc7
|
||||
|
||||
PEXTRQ $0, X0, acc0
|
||||
sm2P256MulReductionInternal()
|
||||
sm2P256MulReductionInline
|
||||
MOVQ $0, mul0
|
||||
// Add bits [511:256] of the result
|
||||
ADCQ acc0, acc4
|
||||
@ -775,7 +246,7 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
|
||||
internalMulBMI2:
|
||||
MOVQ acc4, mul1
|
||||
MULXQ t0, acc0, acc1
|
||||
MOVQ acc0, X0
|
||||
MOVQ acc0, X0 // uses X0 as temp register/storage
|
||||
|
||||
MULXQ t1, mul0, acc2
|
||||
ADDQ mul0, acc1
|
||||
@ -848,7 +319,7 @@ internalMulBMI2:
|
||||
ADCQ $0, acc7
|
||||
|
||||
PEXTRQ $0, X0, acc0
|
||||
sm2P256MulReductionInternal()
|
||||
sm2P256MulReductionInline
|
||||
MOVQ $0, mul0
|
||||
// Add bits [511:256] of the result
|
||||
ADCQ acc0, acc4
|
||||
@ -881,140 +352,11 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
|
||||
CMPB ·supportBMI2+0(SB), $0x01
|
||||
JEQ internalSqrBMI2
|
||||
|
||||
MOVQ acc4, mul0
|
||||
MULQ acc5
|
||||
MOVQ mul0, acc1
|
||||
MOVQ mul1, acc2
|
||||
|
||||
MOVQ acc4, mul0
|
||||
MULQ acc6
|
||||
ADDQ mul0, acc2
|
||||
ADCQ $0, mul1
|
||||
MOVQ mul1, acc3
|
||||
|
||||
MOVQ acc4, mul0
|
||||
MULQ acc7
|
||||
ADDQ mul0, acc3
|
||||
ADCQ $0, mul1
|
||||
MOVQ mul1, t0
|
||||
|
||||
MOVQ acc5, mul0
|
||||
MULQ acc6
|
||||
ADDQ mul0, acc3
|
||||
ADCQ $0, mul1
|
||||
MOVQ mul1, acc0
|
||||
|
||||
MOVQ acc5, mul0
|
||||
MULQ acc7
|
||||
ADDQ acc0, t0
|
||||
ADCQ $0, mul1
|
||||
ADDQ mul0, t0
|
||||
ADCQ $0, mul1
|
||||
MOVQ mul1, t1
|
||||
|
||||
MOVQ acc6, mul0
|
||||
MULQ acc7
|
||||
ADDQ mul0, t1
|
||||
ADCQ $0, mul1
|
||||
MOVQ mul1, t2
|
||||
XORQ t3, t3
|
||||
// *2
|
||||
ADDQ acc1, acc1
|
||||
ADCQ acc2, acc2
|
||||
ADCQ acc3, acc3
|
||||
ADCQ t0, t0
|
||||
ADCQ t1, t1
|
||||
ADCQ t2, t2
|
||||
ADCQ $0, t3
|
||||
// Missing products
|
||||
MOVQ acc4, mul0
|
||||
MULQ mul0
|
||||
MOVQ mul0, acc0
|
||||
MOVQ mul1, acc4
|
||||
|
||||
MOVQ acc5, mul0
|
||||
MULQ mul0
|
||||
ADDQ acc4, acc1
|
||||
ADCQ mul0, acc2
|
||||
ADCQ $0, mul1
|
||||
MOVQ mul1, acc4
|
||||
|
||||
MOVQ acc6, mul0
|
||||
MULQ mul0
|
||||
ADDQ acc4, acc3
|
||||
ADCQ mul0, t0
|
||||
ADCQ $0, mul1
|
||||
MOVQ mul1, acc4
|
||||
|
||||
MOVQ acc7, mul0
|
||||
MULQ mul0
|
||||
ADDQ acc4, t1
|
||||
ADCQ mul0, t2
|
||||
ADCQ mul1, t3
|
||||
// T = [t3, t2,, t1, t0, acc3, acc2, acc1, acc0]
|
||||
sm2P256SqrReductionInternal()
|
||||
p256SqrInternalInline
|
||||
RET
|
||||
|
||||
internalSqrBMI2:
|
||||
XORQ acc0, acc0
|
||||
XORQ t2, t2
|
||||
MOVQ acc4, mul1
|
||||
MULXQ acc5, acc1, acc2
|
||||
|
||||
MULXQ acc6, mul0, acc3
|
||||
ADOXQ mul0, acc2
|
||||
|
||||
MULXQ acc7, mul0, t0
|
||||
ADOXQ mul0, acc3
|
||||
ADOXQ t2, t0
|
||||
|
||||
MOVQ acc5, mul1
|
||||
MULXQ acc6, mul0, t3
|
||||
ADOXQ mul0, acc3
|
||||
|
||||
MULXQ acc7, mul0, t1
|
||||
ADCXQ t3, mul0
|
||||
ADOXQ mul0, t0
|
||||
ADCXQ t2, t1
|
||||
|
||||
MOVQ acc6, mul1
|
||||
MULXQ acc7, mul0, t2
|
||||
ADOXQ mul0, t1
|
||||
ADOXQ acc0, t2
|
||||
|
||||
XORQ t3, t3
|
||||
|
||||
// *2
|
||||
ADOXQ acc1, acc1
|
||||
ADOXQ acc2, acc2
|
||||
ADOXQ acc3, acc3
|
||||
ADOXQ t0, t0
|
||||
ADOXQ t1, t1
|
||||
ADOXQ t2, t2
|
||||
ADOXQ acc0, t3
|
||||
|
||||
// Missing products
|
||||
MOVQ acc4, mul1
|
||||
MULXQ mul1, acc0, acc4
|
||||
ADDQ acc4, acc1
|
||||
|
||||
MOVQ acc5, mul1
|
||||
MULXQ mul1, mul0, acc4
|
||||
ADCXQ mul0, acc2
|
||||
ADCXQ acc4, acc3
|
||||
|
||||
MOVQ acc6, mul1
|
||||
MULXQ mul1, mul0, acc4
|
||||
ADCXQ mul0, t0
|
||||
ADCXQ acc4, t1
|
||||
|
||||
MOVQ acc7, mul1
|
||||
MULXQ mul1, mul0, acc4
|
||||
ADCXQ mul0, t2
|
||||
ADCXQ acc4, t3
|
||||
// T = [t3, t2,, t1, t0, acc3, acc2, acc1, acc0]
|
||||
sm2P256SqrReductionInternal()
|
||||
|
||||
p256SqrInternalInlineAdx
|
||||
RET
|
||||
|
||||
/* ---------------------------------------*/
|
||||
@ -1099,7 +441,7 @@ internalSqrBMI2:
|
||||
CALL sm2P256MulInternal(SB) \// x2 * z1ˆ2
|
||||
\
|
||||
LDt (x1in) \
|
||||
CALL sm2P256SubInternal(SB) \// h = u2 - u1
|
||||
p256SubInline2 \// h = u2 - u1
|
||||
ST (h) \
|
||||
\
|
||||
LDt (z1in) \
|
||||
@ -1114,7 +456,7 @@ internalSqrBMI2:
|
||||
ST (s2) \
|
||||
\
|
||||
LDt (y1in) \
|
||||
CALL sm2P256SubInternal(SB) \// r = s2 - s1
|
||||
p256SubInline2 \// r = s2 - s1
|
||||
ST (r) \
|
||||
\
|
||||
CALL sm2P256SqrInternal(SB) \// rsqr = rˆ2
|
||||
@ -1139,10 +481,10 @@ internalSqrBMI2:
|
||||
\
|
||||
p256MulBy2Inline \// u1 * hˆ2 * 2, inline
|
||||
LDacc (rsqr) \
|
||||
CALL sm2P256SubInternal(SB) \// rˆ2 - u1 * hˆ2 * 2
|
||||
p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2
|
||||
\
|
||||
LDt (hcub) \
|
||||
CALL sm2P256SubInternal(SB) \
|
||||
p256SubInline2 \
|
||||
ST (xout) \
|
||||
\
|
||||
MOVQ acc4, t0 \
|
||||
@ -1150,13 +492,13 @@ internalSqrBMI2:
|
||||
MOVQ acc6, t2 \
|
||||
MOVQ acc7, t3 \
|
||||
LDacc (h) \
|
||||
CALL sm2P256SubInternal(SB) \
|
||||
p256SubInline2 \
|
||||
\
|
||||
LDt (r) \
|
||||
CALL sm2P256MulInternal(SB) \
|
||||
\
|
||||
LDt (s2) \
|
||||
CALL sm2P256SubInternal(SB) \
|
||||
p256SubInline2 \
|
||||
ST (yout) \
|
||||
\// Load stored values from stack
|
||||
MOVQ rptr, AX \
|
||||
@ -1373,36 +715,6 @@ pointaddaffine_avx2:
|
||||
#undef sel_save
|
||||
#undef zero_save
|
||||
|
||||
// sm2P256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero
|
||||
// otherwise. It writes to [acc4..acc7], t0 and t1.
//
// Two inputs are reported as zero: all four limbs equal to 0, and the four
// limbs equal to the value XORed in by the second check (P, per the comment
// below). The result is selected with conditional moves; the routine contains
// no jumps. The input in [acc4..acc7] is destroyed by the second check, so
// callers must store it beforehand if it is still needed.
|
||||
TEXT sm2P256IsZero(SB),NOSPLIT,$0
|
||||
// AX contains a flag that is set if the input is zero.
// It starts at 0 and is only ever overwritten with t1 (= 1).
|
||||
XORQ AX, AX
|
||||
MOVQ $1, t1
|
||||
|
||||
// Check whether [acc4..acc7] are all zero.
// The limbs are ORed into the scratch register t0 so that the input is
// left intact for the second check; ZF is set iff every limb is 0.
|
||||
MOVQ acc4, t0
|
||||
ORQ acc5, t0
|
||||
ORQ acc6, t0
|
||||
ORQ acc7, t0
|
||||
|
||||
// Set the zero flag if so. (CMOV of a constant to a register doesn't
|
||||
// appear to be supported in Go. Thus t1 = 1.)
// "Zero flag" here means the result in AX; the condition comes from the
// ZF produced by the last ORQ above.
|
||||
CMOVQEQ t1, AX
|
||||
|
||||
// XOR [acc4..acc7] with P and compare with zero again.
// Limbs 0 and 2 are XORed with the immediate -1 (all ones), limbs 1 and 3
// with the words at p256p<>+0x08 and p256p<>+0x18.
// NOTE(review): this presumes limbs 0 and 2 of p256p<> are all ones; the
// table is defined outside this view, so confirm against its definition.
|
||||
XORQ $-1, acc4
|
||||
XORQ p256p<>+0x08(SB), acc5
|
||||
XORQ $-1, acc6
|
||||
XORQ p256p<>+0x018(SB), acc7
|
||||
// Fold the four XOR results into acc4; ZF is set iff all of them are 0.
ORQ acc5, acc4
|
||||
ORQ acc6, acc4
|
||||
ORQ acc7, acc4
|
||||
|
||||
// Set the zero flag if so.
// AX keeps a 1 written by the first check; this move can only add a 1.
|
||||
CMOVQEQ t1, AX
|
||||
RET
|
||||
|
||||
/* ---------------------------------------*/
|
||||
#define x1in(off) (32*0 + off)(SP)
|
||||
#define y1in(off) (32*1 + off)(SP)
|
||||
@ -1450,9 +762,9 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0
|
||||
ST (s2) \
|
||||
\
|
||||
LDt (s1) \
|
||||
CALL sm2P256SubInternal(SB) \// r = s2 - s1
|
||||
p256SubInline2 \// r = s2 - s1
|
||||
ST (r) \
|
||||
CALL sm2P256IsZero(SB) \
|
||||
p256IsZeroInline \
|
||||
MOVQ AX, points_eq \
|
||||
\
|
||||
LDacc (z2sqr) \
|
||||
@ -1465,9 +777,9 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0
|
||||
ST (u2) \
|
||||
\
|
||||
LDt (u1) \
|
||||
CALL sm2P256SubInternal(SB) \// h = u2 - u1
|
||||
p256SubInline2 \// h = u2 - u1
|
||||
ST (h) \
|
||||
CALL sm2P256IsZero(SB) \
|
||||
p256IsZeroInline \
|
||||
ANDQ points_eq, AX \
|
||||
MOVQ AX, points_eq \
|
||||
\
|
||||
@ -1501,10 +813,10 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0
|
||||
\
|
||||
p256MulBy2Inline \// u1 * hˆ2 * 2, inline
|
||||
LDacc (rsqr) \
|
||||
CALL sm2P256SubInternal(SB) \// rˆ2 - u1 * hˆ2 * 2
|
||||
p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2
|
||||
\
|
||||
LDt (hcub) \
|
||||
CALL sm2P256SubInternal(SB) \
|
||||
p256SubInline2 \
|
||||
ST (xout) \
|
||||
\
|
||||
MOVQ acc4, t0 \
|
||||
@ -1512,13 +824,13 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0
|
||||
MOVQ acc6, t2 \
|
||||
MOVQ acc7, t3 \
|
||||
LDacc (u2) \
|
||||
CALL sm2P256SubInternal(SB) \
|
||||
p256SubInline2 \
|
||||
\
|
||||
LDt (r) \
|
||||
CALL sm2P256MulInternal(SB) \
|
||||
\
|
||||
LDt (s2) \
|
||||
CALL sm2P256SubInternal(SB) \
|
||||
p256SubInline2 \
|
||||
ST (yout) \
|
||||
|
||||
//func p256PointAddAsm(res, in1, in2 *SM2P256Point) int
|
||||
@ -1669,7 +981,7 @@ pointadd_avx2:
|
||||
#define calX() \
|
||||
LDacc (x) \
|
||||
LDt (zsqr) \
|
||||
CALL sm2P256SubInternal(SB) \
|
||||
p256SubInline2 \
|
||||
LDt (m) \
|
||||
CALL sm2P256MulInternal(SB) \
|
||||
ST (m) \
|
||||
@ -1718,18 +1030,18 @@ pointadd_avx2:
|
||||
LDacc (m) \
|
||||
CALL sm2P256SqrInternal(SB) \
|
||||
LDt (tmp) \
|
||||
CALL sm2P256SubInternal(SB) \
|
||||
p256SubInline2 \
|
||||
|
||||
#define calY() \
|
||||
acc2t \
|
||||
LDacc (s) \
|
||||
CALL sm2P256SubInternal(SB) \
|
||||
p256SubInline2 \
|
||||
\
|
||||
LDt (m) \
|
||||
CALL sm2P256MulInternal(SB) \
|
||||
\
|
||||
LDt (y) \
|
||||
CALL sm2P256SubInternal(SB) \
|
||||
p256SubInline2 \
|
||||
|
||||
#define lastP256PointDouble() \
|
||||
\ // See https://hyperelliptic.org/EFD/g1p/data/shortw/jacobian-3/doubling/dbl-2007-bl
|
||||
|
Loading…
x
Reference in New Issue
Block a user