From 53ac591635f5d9fad9e02ac60bf7ff5d535f7cb5 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Thu, 29 Feb 2024 17:53:28 +0800 Subject: [PATCH] internal/sm2ec: amd64 refactoring, reduce duplicated code --- internal/sm2ec/p256_asm_amd64.s | 904 +++------------------- internal/sm2ec/p256_macros_amd64.s | 1141 ++++++++++++++++++++++------ internal/sm2ec/p256_plugin_amd64.s | 770 +------------------ 3 files changed, 1068 insertions(+), 1747 deletions(-) diff --git a/internal/sm2ec/p256_asm_amd64.s b/internal/sm2ec/p256_asm_amd64.s index cfaff92..d1e713a 100644 --- a/internal/sm2ec/p256_asm_amd64.s +++ b/internal/sm2ec/p256_asm_amd64.s @@ -20,159 +20,13 @@ TEXT ·p256Sqr(SB),NOSPLIT,$0 JEQ sqrBMI2 sqrLoop: - - // y[1:] * y[0] - MOVQ (8*0)(x_ptr), t0 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - MOVQ AX, acc1 - MOVQ DX, acc2 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, acc3 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, acc4 - // y[2:] * y[1] - MOVQ (8*1)(x_ptr), t0 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, acc5 - // y[3] * y[2] - MOVQ (8*2)(x_ptr), t0 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ AX, acc5 - ADCQ $0, DX - MOVQ DX, y_ptr - XORQ t1, t1 - // *2 - ADDQ acc1, acc1 - ADCQ acc2, acc2 - ADCQ acc3, acc3 - ADCQ acc4, acc4 - ADCQ acc5, acc5 - ADCQ y_ptr, y_ptr - ADCQ $0, t1 - // Missing products - MOVQ (8*0)(x_ptr), AX - MULQ AX - MOVQ AX, acc0 - MOVQ DX, t0 - - MOVQ (8*1)(x_ptr), AX - MULQ AX - ADDQ t0, acc1 - ADCQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t0 - - MOVQ (8*2)(x_ptr), AX - MULQ AX - ADDQ t0, acc3 - ADCQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t0 - - MOVQ (8*3)(x_ptr), AX - MULQ AX - ADDQ t0, acc5 - ADCQ AX, y_ptr - ADCQ DX, t1 - MOVQ t1, x_ptr - - // T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0] - p256SqrMontReduce() - p256PrimReduce(acc0, acc1, acc2, acc3, t0, 
acc4, acc5, y_ptr, t1, res_ptr) - MOVQ res_ptr, x_ptr + p256SqrRound(t1) DECQ BX JNE sqrLoop RET sqrBMI2: - XORQ acc0, acc0 - XORQ y_ptr, y_ptr - // x[1:] * x[0] - MOVQ (8*0)(x_ptr), DX - MULXQ (8*1)(x_ptr), acc1, acc2 - - MULXQ (8*2)(x_ptr), AX, acc3 - ADOXQ AX, acc2 - - MULXQ (8*3)(x_ptr), AX, acc4 - ADOXQ AX, acc3 - ADOXQ y_ptr, acc4 - - // x[2:] * x[1] - MOVQ (8*1)(x_ptr), DX - MULXQ (8*2)(x_ptr), AX, t1 - ADOXQ AX, acc3 - - MULXQ (8*3)(x_ptr), AX, acc5 - ADCXQ t1, AX - ADOXQ AX, acc4 - ADCXQ y_ptr, acc5 - - // y[x] * x[2] - MOVQ (8*2)(x_ptr), DX - MULXQ (8*3)(x_ptr), AX, y_ptr - ADOXQ AX, acc5 - ADOXQ acc0, y_ptr - - XORQ t1, t1 - - // *2 - ADOXQ acc1, acc1 - ADOXQ acc2, acc2 - ADOXQ acc3, acc3 - ADOXQ acc4, acc4 - ADOXQ acc5, acc5 - ADOXQ y_ptr, y_ptr - ADOXQ acc0, t1 - - // Missing products - MOVQ (8*0)(x_ptr), DX - MULXQ DX, acc0, t0 - ADCXQ t0, acc1 - - MOVQ (8*1)(x_ptr), DX - MULXQ DX, AX, t0 - ADCXQ AX, acc2 - ADCXQ t0, acc3 - - MOVQ (8*2)(x_ptr), DX - MULXQ DX, AX, t0 - ADCXQ AX, acc4 - ADCXQ t0, acc5 - - MOVQ (8*3)(x_ptr), DX - MULXQ DX, AX, x_ptr - ADCXQ AX, y_ptr - ADCXQ t1, x_ptr - - // T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0] - p256SqrMontReduce() - p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr) - MOVQ res_ptr, x_ptr + p256SqrRoundAdx(t1) DECQ BX JNE sqrBMI2 RET @@ -188,385 +42,14 @@ TEXT ·p256OrdSqr(SB),NOSPLIT,$0 JEQ ordSqrLoopBMI2 ordSqrLoop: - - // y[1:] * y[0] - MOVQ (8*0)(x_ptr), t0 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - MOVQ AX, acc1 - MOVQ DX, acc2 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, acc3 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, acc4 - // y[2:] * y[1] - MOVQ (8*1)(x_ptr), t0 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, acc5 - // y[3] * y[2] - MOVQ (8*2)(x_ptr), t0 - - MOVQ 
(8*3)(x_ptr), AX - MULQ t0 - ADDQ AX, acc5 - ADCQ $0, DX - MOVQ DX, y_ptr - XORQ t1, t1 - // *2 - ADDQ acc1, acc1 - ADCQ acc2, acc2 - ADCQ acc3, acc3 - ADCQ acc4, acc4 - ADCQ acc5, acc5 - ADCQ y_ptr, y_ptr - ADCQ $0, t1 - // Missing products - MOVQ (8*0)(x_ptr), AX - MULQ AX - MOVQ AX, acc0 - MOVQ DX, t0 - - MOVQ (8*1)(x_ptr), AX - MULQ AX - ADDQ t0, acc1 - ADCQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t0 - - MOVQ (8*2)(x_ptr), AX - MULQ AX - ADDQ t0, acc3 - ADCQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t0 - - MOVQ (8*3)(x_ptr), AX - MULQ AX - ADDQ t0, acc5 - ADCQ AX, y_ptr - ADCQ DX, t1 - MOVQ t1, x_ptr - - // T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0] - MOVQ acc0, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64 - - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0) - ADCQ $0, DX // DX = carry1 + H(t0 * ord0) - MOVQ DX, t1 // t1 = carry1 + H(t0 * ord0) - MOVQ t0, acc0 // acc0 = t0 - - // calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0 - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc2 - SBBQ AX, acc3 - SBBQ DX, acc0 - - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ t1, acc1 // (carry2, acc1) = acc1 + t1 - ADCQ $0, DX // DX = carry2 + H(t0*ord1) - - ADDQ AX, acc1 // (carry3, acc1) = acc1 + t1 + L(t0*ord1) - ADCQ DX, acc2 - ADCQ $0, acc3 - ADCQ $0, acc0 - - // Second reduction step - MOVQ acc1, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 - - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ t0, acc1 - - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc3 - SBBQ AX, acc0 - SBBQ DX, acc1 - - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ t1, acc2 - ADCQ $0, DX - - ADDQ AX, acc2 - ADCQ DX, acc3 - ADCQ $0, acc0 - ADCQ $0, acc1 - - // Third reduction step - MOVQ acc2, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 - - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - 
MOVQ t0, acc2 - - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc0 - SBBQ AX, acc1 - SBBQ DX, acc2 - - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - - ADDQ AX, acc3 - ADCQ DX, acc0 - ADCQ $0, acc1 - ADCQ $0, acc2 - - // Last reduction step - MOVQ acc3, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 - - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ t0, acc3 - - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc1 - SBBQ AX, acc2 - SBBQ DX, acc3 - - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ t1, acc0 - ADCQ $0, DX - - ADDQ AX, acc0 - ADCQ DX, acc1 - ADCQ $0, acc2 - ADCQ $0, acc3 - - XORQ t0, t0 - // Add bits [511:256] of the sqr result - ADCQ acc4, acc0 - ADCQ acc5, acc1 - ADCQ y_ptr, acc2 - ADCQ x_ptr, acc3 - ADCQ $0, t0 - - p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr) - MOVQ res_ptr, x_ptr + p256OrdSqrRound(t1) DECQ BX JNE ordSqrLoop RET ordSqrLoopBMI2: - XORQ acc0, acc0 - XORQ y_ptr, y_ptr - // y[1:] * y[0] - MOVQ (8*0)(x_ptr), DX - MULXQ (8*1)(x_ptr), acc1, acc2 - - MULXQ (8*2)(x_ptr), AX, acc3 - ADOXQ AX, acc2 - - MULXQ (8*3)(x_ptr), AX, acc4 - ADOXQ AX, acc3 - ADOXQ y_ptr, acc4 - - // y[2:] * y[1] - MOVQ (8*1)(x_ptr), DX - MULXQ (8*2)(x_ptr), AX, t1 - ADOXQ AX, acc3 - - MULXQ (8*3)(x_ptr), AX, acc5 - ADCXQ t1, AX - ADOXQ AX, acc4 - ADCXQ y_ptr, acc5 - - // y[3] * y[2] - MOVQ (8*2)(x_ptr), DX - MULXQ (8*3)(x_ptr), AX, y_ptr - ADOXQ AX, acc5 - ADOXQ acc0, y_ptr - - XORQ t1, t1 - // *2 - ADOXQ acc1, acc1 - ADOXQ acc2, acc2 - ADOXQ acc3, acc3 - ADOXQ acc4, acc4 - ADOXQ acc5, acc5 - ADOXQ y_ptr, y_ptr - ADOXQ acc0, t1 - - // Missing products - MOVQ (8*0)(x_ptr), DX - MULXQ DX, acc0, t0 - ADCXQ t0, acc1 - - MOVQ (8*1)(x_ptr), DX - MULXQ DX, AX, t0 - ADCXQ AX, acc2 - ADCXQ t0, acc3 - - MOVQ (8*2)(x_ptr), DX - MULXQ DX, AX, t0 - ADCXQ AX, acc4 - ADCXQ t0, acc5 - - MOVQ (8*3)(x_ptr), DX - MULXQ DX, AX, x_ptr - ADCXQ AX, y_ptr - 
ADCXQ t1, x_ptr - - // T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0] - // First reduction step - MOVQ acc0, DX - MULXQ p256ordK0<>(SB), DX, AX - - MULXQ p256ord<>+0x00(SB), AX, t0 - ADOXQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0 - - MULXQ p256ord<>+0x08(SB), AX, t1 - ADCXQ t0, AX - ADOXQ AX, acc1 - - MULXQ p256ord<>+0x10(SB), AX, t0 - ADCXQ t1, AX - ADOXQ AX, acc2 - - MULXQ p256ord<>+0x18(SB), AX, acc0 - ADCXQ t0, AX - ADOXQ AX, acc3 - MOVQ $0, t0 - ADCXQ t0, acc0 - ADOXQ t0, acc0 - - // Second reduction step - MOVQ acc1, DX - MULXQ p256ordK0<>(SB), DX, AX - - MULXQ p256ord<>+0x00(SB), AX, t0 - ADOXQ AX, acc1 - - MULXQ p256ord<>+0x08(SB), AX, t1 - ADCXQ t0, AX - ADOXQ AX, acc2 - - MULXQ p256ord<>+0x10(SB), AX, t0 - ADCXQ t1, AX - ADOXQ AX, acc3 - - MULXQ p256ord<>+0x18(SB), AX, acc1 - ADCXQ t0, AX - ADOXQ AX, acc0 - MOVQ $0, t0 - ADCXQ t0, acc1 - ADOXQ t0, acc1 - - // Third reduction step - MOVQ acc2, DX - MULXQ p256ordK0<>(SB), DX, AX - - MULXQ p256ord<>+0x00(SB), AX, t0 - ADOXQ AX, acc2 - - MULXQ p256ord<>+0x08(SB), AX, t1 - ADCXQ t0, AX - ADOXQ AX, acc3 - - MULXQ p256ord<>+0x10(SB), AX, t0 - ADCXQ t1, AX - ADOXQ AX, acc0 - - MULXQ p256ord<>+0x18(SB), AX, acc2 - ADCXQ t0, AX - ADOXQ AX, acc1 - MOVQ $0, t0 - ADCXQ t0, acc2 - ADOXQ t0, acc2 - - // Last reduction step - MOVQ acc3, DX - MULXQ p256ordK0<>(SB), DX, AX - - MULXQ p256ord<>+0x00(SB), AX, t0 - ADOXQ AX, acc3 - - MULXQ p256ord<>+0x08(SB), AX, t1 - ADCXQ t0, AX - ADOXQ AX, acc0 - - MULXQ p256ord<>+0x10(SB), AX, t0 - ADCXQ t1, AX - ADOXQ AX, acc1 - - MULXQ p256ord<>+0x18(SB), AX, acc3 - ADCXQ t0, AX - ADOXQ AX, acc2 - MOVQ $0, t0 - ADCXQ t0, acc3 - ADOXQ t0, acc3 - - XORQ t1, t1 - // Add bits [511:256] of the sqr result - ADCXQ acc4, acc0 - ADCXQ acc5, acc1 - ADCXQ y_ptr, acc2 - ADCXQ x_ptr, acc3 - ADCXQ t1, t0 - - p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr) - MOVQ res_ptr, x_ptr + p256OrdSqrRoundAdx(t1) DECQ BX JNE ordSqrLoopBMI2 @@ -601,33 +84,7 @@ 
ordSqrLoopBMI2: #define t2 DI #define t3 SI #define hlp BP -/* ---------------------------------------*/ -// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0] -TEXT sm2P256SubInternal(SB),NOSPLIT,$0 - XORQ mul0, mul0 - SUBQ t0, acc4 - SBBQ t1, acc5 - SBBQ t2, acc6 - SBBQ t3, acc7 - SBBQ $0, mul0 - MOVQ acc4, acc0 - MOVQ acc5, acc1 - MOVQ acc6, acc2 - MOVQ acc7, acc3 - - ADDQ $-1, acc4 - ADCQ p256p<>+0x08(SB), acc5 - ADCQ $-1, acc6 - ADCQ p256p<>+0x018(SB), acc7 - ANDQ $1, mul0 - - CMOVQEQ acc0, acc4 - CMOVQEQ acc1, acc5 - CMOVQEQ acc2, acc6 - CMOVQEQ acc3, acc7 - - RET /* ---------------------------------------*/ // [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] * [t3, t2, t1, t0] TEXT sm2P256MulInternal(SB),NOSPLIT,$8 @@ -746,7 +203,7 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8 ADDQ mul0, acc6 ADCQ $0, mul1 MOVQ mul1, acc7 - sm2P256MulReductionInternal() + sm2P256MulReductionInline MOVQ $0, BP // Add bits [511:256] of the result @@ -767,7 +224,7 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8 SBBQ p256p<>+0x018(SB), acc7 SBBQ $0, hlp // If the result of the subtraction is negative, restore the previous result - CMOVQCS acc0, acc4 + CMOVQCS acc0, acc4 // CMOVQCS: Move if below (CF == 1) CMOVQCS acc1, acc5 CMOVQCS acc2, acc6 CMOVQCS acc3, acc7 @@ -847,7 +304,7 @@ internalMulBMI2: ADDQ mul0, acc6 ADCQ $0, acc7 - sm2P256MulReductionInternal() + sm2P256MulReductionInline MOVQ $0, BP // Add bits [511:256] of the result ADCQ acc0, acc4 @@ -867,7 +324,7 @@ internalMulBMI2: SBBQ p256p<>+0x018(SB), acc7 SBBQ $0, hlp // If the result of the subtraction is negative, restore the previous result - CMOVQCS acc0, acc4 + CMOVQCS acc0, acc4 // CMOVQCS: Move if below (CF == 1) CMOVQCS acc1, acc5 CMOVQCS acc2, acc6 CMOVQCS acc3, acc7 @@ -880,140 +337,11 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8 CMPB ·supportBMI2+0(SB), $0x01 JEQ internalSqrBMI2 - MOVQ acc4, mul0 - MULQ acc5 - MOVQ mul0, acc1 - MOVQ mul1, acc2 - - MOVQ acc4, mul0 - MULQ acc6 - ADDQ mul0, acc2 - ADCQ 
$0, mul1 - MOVQ mul1, acc3 - - MOVQ acc4, mul0 - MULQ acc7 - ADDQ mul0, acc3 - ADCQ $0, mul1 - MOVQ mul1, t0 - - MOVQ acc5, mul0 - MULQ acc6 - ADDQ mul0, acc3 - ADCQ $0, mul1 - MOVQ mul1, acc0 - - MOVQ acc5, mul0 - MULQ acc7 - ADDQ acc0, t0 - ADCQ $0, mul1 - ADDQ mul0, t0 - ADCQ $0, mul1 - MOVQ mul1, t1 - - MOVQ acc6, mul0 - MULQ acc7 - ADDQ mul0, t1 - ADCQ $0, mul1 - MOVQ mul1, t2 - XORQ t3, t3 - // *2 - ADDQ acc1, acc1 - ADCQ acc2, acc2 - ADCQ acc3, acc3 - ADCQ t0, t0 - ADCQ t1, t1 - ADCQ t2, t2 - ADCQ $0, t3 - // Missing products - MOVQ acc4, mul0 - MULQ mul0 - MOVQ mul0, acc0 - MOVQ DX, acc4 - - MOVQ acc5, mul0 - MULQ mul0 - ADDQ acc4, acc1 - ADCQ mul0, acc2 - ADCQ $0, DX - MOVQ DX, acc4 - - MOVQ acc6, mul0 - MULQ mul0 - ADDQ acc4, acc3 - ADCQ mul0, t0 - ADCQ $0, DX - MOVQ DX, acc4 - - MOVQ acc7, mul0 - MULQ mul0 - ADDQ acc4, t1 - ADCQ mul0, t2 - ADCQ DX, t3 - // T = [t3, t2,, t1, t0, acc3, acc2, acc1, acc0] - sm2P256SqrReductionInternal() + p256SqrInternalInline RET internalSqrBMI2: - XORQ acc0, acc0 - XORQ t2, t2 - MOVQ acc4, mul1 - MULXQ acc5, acc1, acc2 - - MULXQ acc6, mul0, acc3 - ADOXQ mul0, acc2 - - MULXQ acc7, mul0, t0 - ADOXQ mul0, acc3 - ADOXQ t2, t0 - - MOVQ acc5, mul1 - MULXQ acc6, mul0, t3 - ADOXQ mul0, acc3 - - MULXQ acc7, mul0, t1 - ADCXQ t3, mul0 - ADOXQ mul0, t0 - ADCXQ t2, t1 - - MOVQ acc6, mul1 - MULXQ acc7, mul0, t2 - ADOXQ mul0, t1 - ADOXQ acc0, t2 - - XORQ t3, t3 - - // *2 - ADOXQ acc1, acc1 - ADOXQ acc2, acc2 - ADOXQ acc3, acc3 - ADOXQ t0, t0 - ADOXQ t1, t1 - ADOXQ t2, t2 - ADOXQ acc0, t3 - - // Missing products - MOVQ acc4, mul1 - MULXQ mul1, acc0, acc4 - ADDQ acc4, acc1 - - MOVQ acc5, mul1 - MULXQ mul1, mul0, acc4 - ADCXQ mul0, acc2 - ADCXQ acc4, acc3 - - MOVQ acc6, mul1 - MULXQ mul1, mul0, acc4 - ADCXQ mul0, t0 - ADCXQ acc4, t1 - - MOVQ acc7, mul1 - MULXQ mul1, mul0, acc4 - ADCXQ mul0, t2 - ADCXQ acc4, t3 - // T = [t3, t2,, t1, t0, acc3, acc2, acc1, acc0] - sm2P256SqrReductionInternal() - + p256SqrInternalInlineAdx RET /* 
---------------------------------------*/ @@ -1069,98 +397,98 @@ internalSqrBMI2: MOVQ acc2, t2 \ MOVQ acc3, t3 \ \// Add in case the operand was > p256 - ADDQ $-1, acc0 \ - ADCQ p256p<>+0x08(SB), acc1 \ - ADCQ $-1, acc2 \ - ADCQ p256p<>+0x018(SB), acc3 \ - ADCQ $0, mul0 \ - CMOVQNE t0, acc0 \ - CMOVQNE t1, acc1 \ - CMOVQNE t2, acc2 \ - CMOVQNE t3, acc3 \ + ADDQ $-1, acc0 \ + ADCQ p256p<>+0x08(SB), acc1 \ + ADCQ $-1, acc2 \ + ADCQ p256p<>+0x018(SB), acc3 \ + ADCQ $0, mul0 \ // ZF := 1 if mul0 == 0 after ADC + CMOVQNE t0, acc0 \ // CMOVQNE: Move if not equal (ZF == 0) + CMOVQNE t1, acc1 \ + CMOVQNE t2, acc2 \ + CMOVQNE t3, acc3 \ \// If condition is 0, keep original value - TESTQ DX, DX \ - CMOVQEQ acc4, acc0 \ + TESTQ DX, DX \ // ZF := 1 if (DX AND DX == 0) + CMOVQEQ acc4, acc0 \ // CMOVQEQ: Move if equal (ZF == 1) CMOVQEQ acc5, acc1 \ CMOVQEQ acc6, acc2 \ CMOVQEQ acc7, acc3 \ \// Store result - MOVQ acc0, y2in(8*0) \ - MOVQ acc1, y2in(8*1) \ - MOVQ acc2, y2in(8*2) \ - MOVQ acc3, y2in(8*3) \ + MOVQ acc0, y2in(8*0) \ + MOVQ acc1, y2in(8*1) \ + MOVQ acc2, y2in(8*2) \ + MOVQ acc3, y2in(8*3) \ \// Begin point add - LDacc (z1in) \ - CALL sm2P256SqrInternal(SB) \// z1ˆ2 - ST (z1sqr) \ + LDacc (z1in) \ + CALL sm2P256SqrInternal(SB) \// z1ˆ2 + ST (z1sqr) \ \ - LDt (x2in) \ - CALL sm2P256MulInternal(SB) \// x2 * z1ˆ2 + LDt (x2in) \ + CALL sm2P256MulInternal(SB) \// x2 * z1ˆ2 \ - LDt (x1in) \ - CALL sm2P256SubInternal(SB) \// h = u2 - u1 - ST (h) \ + LDt (x1in) \ + p256SubInline2 \// h = u2 - u1 + ST (h) \ \ - LDt (z1in) \ - CALL sm2P256MulInternal(SB) \// z3 = h * z1 - ST (zout) \ + LDt (z1in) \ + CALL sm2P256MulInternal(SB) \// z3 = h * z1 + ST (zout) \ \ - LDacc (z1sqr) \ - CALL sm2P256MulInternal(SB) \// z1ˆ3 + LDacc (z1sqr) \ + CALL sm2P256MulInternal(SB) \// z1ˆ3 \ - LDt (y2in) \ - CALL sm2P256MulInternal(SB) \// s2 = y2 * z1ˆ3 - ST (s2) \ + LDt (y2in) \ + CALL sm2P256MulInternal(SB) \// s2 = y2 * z1ˆ3 + ST (s2) \ \ - LDt (y1in) \ - CALL sm2P256SubInternal(SB) \// r = 
s2 - s1 - ST (r) \ + LDt (y1in) \ + p256SubInline2 \// r = s2 - s1 + ST (r) \ \ - CALL sm2P256SqrInternal(SB) \// rsqr = rˆ2 - ST (rsqr) \ + CALL sm2P256SqrInternal(SB) \// rsqr = rˆ2 + ST (rsqr) \ \ - LDacc (h) \ - CALL sm2P256SqrInternal(SB) \// hsqr = hˆ2 - ST (hsqr) \ + LDacc (h) \ + CALL sm2P256SqrInternal(SB) \// hsqr = hˆ2 + ST (hsqr) \ \ - LDt (h) \ - CALL sm2P256MulInternal(SB) \// hcub = hˆ3 - ST (hcub) \ + LDt (h) \ + CALL sm2P256MulInternal(SB) \// hcub = hˆ3 + ST (hcub) \ \ - LDt (y1in) \ - CALL sm2P256MulInternal(SB) \// y1 * hˆ3 - ST (s2) \ + LDt (y1in) \ + CALL sm2P256MulInternal(SB) \// y1 * hˆ3 + ST (s2) \ \ - LDacc (x1in) \ - LDt (hsqr) \ - CALL sm2P256MulInternal(SB) \// u1 * hˆ2 - ST (h) \ + LDacc (x1in) \ + LDt (hsqr) \ + CALL sm2P256MulInternal(SB) \// u1 * hˆ2 + ST (h) \ \ - p256MulBy2Inline \// u1 * hˆ2 * 2, inline - LDacc (rsqr) \ - CALL sm2P256SubInternal(SB) \// rˆ2 - u1 * hˆ2 * 2 + p256MulBy2Inline \// u1 * hˆ2 * 2, inline + LDacc (rsqr) \ + p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2 \ - LDt (hcub) \ - CALL sm2P256SubInternal(SB) \ - ST (xout) \ + LDt (hcub) \ + p256SubInline2 \ + ST (xout) \ \ - MOVQ acc4, t0 \ - MOVQ acc5, t1 \ - MOVQ acc6, t2 \ - MOVQ acc7, t3 \ - LDacc (h) \ - CALL sm2P256SubInternal(SB) \ + MOVQ acc4, t0 \ + MOVQ acc5, t1 \ + MOVQ acc6, t2 \ + MOVQ acc7, t3 \ + LDacc (h) \ + p256SubInline2 \ \ - LDt (r) \ - CALL sm2P256MulInternal(SB) \ + LDt (r) \ + CALL sm2P256MulInternal(SB) \ \ - LDt (s2) \ - CALL sm2P256SubInternal(SB) \ - ST (yout) \ + LDt (s2) \ + p256SubInline2 \ + ST (yout) \ \// Load stored values from stack - MOVQ rptr, AX \ - MOVL sel_save, BX \ - MOVL zero_save, CX \ + MOVQ rptr, AX \ + MOVL sel_save, BX \ + MOVL zero_save, CX \ // func p256PointAddAffineAsm(res, in1 *SM2P256Point, in2 *p256AffinePoint, sign, sel, zero int) TEXT ·p256PointAddAffineAsm(SB),0,$512-48 @@ -1372,36 +700,6 @@ pointaddaffine_avx2: #undef sel_save #undef zero_save -// sm2P256IsZero returns 1 in AX if [acc4..acc7] represents zero 
and zero -// otherwise. It writes to [acc4..acc7], t0 and t1. -TEXT sm2P256IsZero(SB),NOSPLIT,$0 - // AX contains a flag that is set if the input is zero. - XORQ AX, AX - MOVQ $1, t1 - - // Check whether [acc4..acc7] are all zero. - MOVQ acc4, t0 - ORQ acc5, t0 - ORQ acc6, t0 - ORQ acc7, t0 - - // Set the zero flag if so. (CMOV of a constant to a register doesn't - // appear to be supported in Go. Thus t1 = 1.) - CMOVQEQ t1, AX - - // XOR [acc4..acc7] with P and compare with zero again. - XORQ $-1, acc4 - XORQ p256p<>+0x08(SB), acc5 - XORQ $-1, acc6 - XORQ p256p<>+0x018(SB), acc7 - ORQ acc5, acc4 - ORQ acc6, acc4 - ORQ acc7, acc4 - - // Set the zero flag if so. - CMOVQEQ t1, AX - RET - /* ---------------------------------------*/ #define x1in(off) (32*0 + off)(SP) #define y1in(off) (32*1 + off)(SP) @@ -1449,9 +747,9 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0 ST (s2) \ \ LDt (s1) \ - CALL sm2P256SubInternal(SB) \// r = s2 - s1 + p256SubInline2 \// r = s2 - s1 ST (r) \ - CALL sm2P256IsZero(SB) \ + p256IsZeroInline \ MOVQ AX, points_eq \ \ LDacc (z2sqr) \ @@ -1464,9 +762,9 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0 ST (u2) \ \ LDt (u1) \ - CALL sm2P256SubInternal(SB) \// h = u2 - u1 + p256SubInline2 \// h = u2 - u1 ST (h) \ - CALL sm2P256IsZero(SB) \ + p256IsZeroInline \ ANDQ points_eq, AX \ MOVQ AX, points_eq \ \ @@ -1500,10 +798,10 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0 \ p256MulBy2Inline \// u1 * hˆ2 * 2, inline LDacc (rsqr) \ - CALL sm2P256SubInternal(SB) \// rˆ2 - u1 * hˆ2 * 2 + p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2 \ LDt (hcub) \ - CALL sm2P256SubInternal(SB) \ + p256SubInline2 \ ST (xout) \ \ MOVQ acc4, t0 \ @@ -1511,13 +809,13 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0 MOVQ acc6, t2 \ MOVQ acc7, t3 \ LDacc (u2) \ - CALL sm2P256SubInternal(SB) \ + p256SubInline2 \ \ LDt (r) \ CALL sm2P256MulInternal(SB) \ \ LDt (s2) \ - CALL sm2P256SubInternal(SB) \ + p256SubInline2 \ ST (yout) \ //func p256PointAddAsm(res, in1, in2 *SM2P256Point) int @@ -1652,23 +950,23 @@ pointadd_avx2: #define 
rptr (32*7)(SP) #define calZ() \ - LDacc (z) \ - CALL sm2P256SqrInternal(SB) \ - ST (zsqr) \ // ZZ = Z1^2 + LDacc (z) \ + CALL sm2P256SqrInternal(SB) \ + ST (zsqr) \ // ZZ = Z1^2 \ - LDt (x) \ - p256AddInline \ - STt (m) \ // M = ZZ + X1 + LDt (x) \ + p256AddInline \ + STt (m) \ // M = ZZ + X1 \ - LDacc (z) \ - LDt (y) \ - CALL sm2P256MulInternal(SB) \ // Z1 * Y1 - p256MulBy2Inline \ // Z3 = 2(Z1 * Y1) = (Y1 + Z1)^2 - Y1^2 - Z1^2 + LDacc (z) \ + LDt (y) \ + CALL sm2P256MulInternal(SB) \ // Z1 * Y1 + p256MulBy2Inline \ // Z3 = 2(Z1 * Y1) = (Y1 + Z1)^2 - Y1^2 - Z1^2 #define calX() \ LDacc (x) \ LDt (zsqr) \ - CALL sm2P256SubInternal(SB) \ // X1 - ZZ + p256SubInline2 \ // X1 - ZZ LDt (m) \ CALL sm2P256MulInternal(SB) \ // M = (X1 - ZZ) * (X1 + ZZ) = X1^2 - ZZ^2 ST (m) \ @@ -1717,18 +1015,18 @@ pointadd_avx2: LDacc (m) \ CALL sm2P256SqrInternal(SB) \ // M^2 = (3 * (X1^2 - ZZ^2))^2 LDt (tmp) \ - CALL sm2P256SubInternal(SB) \ // X3 = M^2 - 2*S + p256SubInline2 \ // X3 = M^2 - 2*S #define calY() \ acc2t \ LDacc (s) \ // S = 4 * X1 * YY = 2 * ((X1+YY)^2 - XX - YYYY) - CALL sm2P256SubInternal(SB) \ // S - X3 + p256SubInline2 \ // S - X3 \ LDt (m) \ CALL sm2P256MulInternal(SB) \ // M * (S - X3) \ LDt (y) \ - CALL sm2P256SubInternal(SB) \ // Y3 = M * (S - X3) - 8 * YYYYY + p256SubInline2 \ // Y3 = M * (S - X3) - 8 * YYYYY #define lastP256PointDouble() \ \ // See https://hyperelliptic.org/EFD/g1p/data/shortw/jacobian-3/doubling/dbl-2007-bl diff --git a/internal/sm2ec/p256_macros_amd64.s b/internal/sm2ec/p256_macros_amd64.s index 383161b..ac091dc 100644 --- a/internal/sm2ec/p256_macros_amd64.s +++ b/internal/sm2ec/p256_macros_amd64.s @@ -28,100 +28,100 @@ GLOBL p256ordK0<>(SB), 8, $8 GLOBL p256ord<>(SB), 8, $32 GLOBL p256one<>(SB), 8, $32 -#define p256SqrMontReduce() \ +#define p256SqrMontReduceInline \ \ // First reduction step, [p3, p2, p1, p0] = [1, -0x100000000, 0, (1 - 0x100000000), -1] - MOVQ acc0, AX \ - MOVQ acc0, DX \ - SHLQ $32, AX \ // AX = L(acc0 * 2^32), low part - 
SHRQ $32, DX \ // DX = H(acc0 * 2^32), high part + MOVQ acc0, AX \ + MOVQ acc0, DX \ + SHLQ $32, AX \ // AX = L(acc0 * 2^32), low part + SHRQ $32, DX \ // DX = H(acc0 * 2^32), high part \// calculate the negative part: [0, -0x100000000, 0, -0x100000000] * acc0 - SUBQ AX, acc1 \ - SBBQ DX, acc2 \ - SBBQ AX, acc3 \ - MOVQ acc0, AX \ - SBBQ DX, acc0 \ + SUBQ AX, acc1 \ + SBBQ DX, acc2 \ + SBBQ AX, acc3 \ + MOVQ acc0, AX \ + SBBQ DX, acc0 \ \ // calculate the positive part: [1, 0, 0, 1] * acc0 + [0, acc3, acc2, acc1], \ // due to (-1) * acc0 + acc0 == 0, so last lowest lamb 0 is dropped directly, no carry. - ADDQ AX, acc1 \ // acc1' = L (acc0 + acc1) - ADCQ $0, acc2 \ // acc2' = acc2 + carry1 - ADCQ $0, acc3 \ // acc3' = acc3 + carry2 - ADCQ $0, acc0 \ // acc0' = acc0 + carry3 + ADDQ AX, acc1 \ // acc1' = L (acc0 + acc1) + ADCQ $0, acc2 \ // acc2' = acc2 + carry1 + ADCQ $0, acc3 \ // acc3' = acc3 + carry2 + ADCQ $0, acc0 \ // acc0' = acc0 + carry3 \ // Second reduction step - MOVQ acc1, AX \ - MOVQ acc1, DX \ - SHLQ $32, AX \ - SHRQ $32, DX \ + MOVQ acc1, AX \ + MOVQ acc1, DX \ + SHLQ $32, AX \ + SHRQ $32, DX \ \ - SUBQ AX, acc2 \ - SBBQ DX, acc3 \ - SBBQ AX, acc0 \ - MOVQ acc1, AX \ - SBBQ DX, acc1 \ + SUBQ AX, acc2 \ + SBBQ DX, acc3 \ + SBBQ AX, acc0 \ + MOVQ acc1, AX \ + SBBQ DX, acc1 \ \ - ADDQ AX, acc2 \ - ADCQ $0, acc3 \ - ADCQ $0, acc0 \ - ADCQ $0, acc1 \ + ADDQ AX, acc2 \ + ADCQ $0, acc3 \ + ADCQ $0, acc0 \ + ADCQ $0, acc1 \ \ // Third reduction step - MOVQ acc2, AX \ - MOVQ acc2, DX \ - SHLQ $32, AX \ - SHRQ $32, DX \ + MOVQ acc2, AX \ + MOVQ acc2, DX \ + SHLQ $32, AX \ + SHRQ $32, DX \ \ - SUBQ AX, acc3 \ - SBBQ DX, acc0 \ - SBBQ AX, acc1 \ - MOVQ acc2, AX \ - SBBQ DX, acc2 \ + SUBQ AX, acc3 \ + SBBQ DX, acc0 \ + SBBQ AX, acc1 \ + MOVQ acc2, AX \ + SBBQ DX, acc2 \ \ - ADDQ AX, acc3 \ - ADCQ $0, acc0 \ - ADCQ $0, acc1 \ - ADCQ $0, acc2 \ + ADDQ AX, acc3 \ + ADCQ $0, acc0 \ + ADCQ $0, acc1 \ + ADCQ $0, acc2 \ \ // Last reduction step - XORQ t0, t0 \ - MOVQ 
acc3, AX \ - MOVQ acc3, DX \ - SHLQ $32, AX \ - SHRQ $32, DX \ + XORQ t0, t0 \ + MOVQ acc3, AX \ + MOVQ acc3, DX \ + SHLQ $32, AX \ + SHRQ $32, DX \ \ - SUBQ AX, acc0 \ - SBBQ DX, acc1 \ - SBBQ AX, acc2 \ - MOVQ acc3, AX \ - SBBQ DX, acc3 \ + SUBQ AX, acc0 \ + SBBQ DX, acc1 \ + SBBQ AX, acc2 \ + MOVQ acc3, AX \ + SBBQ DX, acc3 \ \ - ADDQ AX, acc0 \ - ADCQ $0, acc1 \ - ADCQ $0, acc2 \ - ADCQ $0, acc3 \ + ADDQ AX, acc0 \ + ADCQ $0, acc1 \ + ADCQ $0, acc2 \ + ADCQ $0, acc3 \ \ // Add bits [511:256] of the sqr result - ADCQ acc4, acc0 \ - ADCQ acc5, acc1 \ - ADCQ y_ptr, acc2 \ - ADCQ x_ptr, acc3 \ + ADCQ acc4, acc0 \ + ADCQ acc5, acc1 \ + ADCQ y_ptr, acc2 \ + ADCQ x_ptr, acc3 \ ADCQ $0, t0 #define p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \ - MOVQ a0, b0 \ - MOVQ a1, b1 \ - MOVQ a2, b2 \ - MOVQ a3, b3 \ + MOVQ a0, b0 \ + MOVQ a1, b1 \ + MOVQ a2, b2 \ + MOVQ a3, b3 \ \ // Subtract p256 - SUBQ $-1, a0 \ - SBBQ p256p<>+0x08(SB), a1 \ - SBBQ $-1, a2 \ - SBBQ p256p<>+0x018(SB), a3 \ - SBBQ $0, a4 \ + SUBQ $-1, a0 \ + SBBQ p256p<>+0x08(SB), a1 \ + SBBQ $-1, a2 \ + SBBQ p256p<>+0x018(SB), a3 \ + SBBQ $0, a4 \ \ - CMOVQCS b0, a0 \ - CMOVQCS b1, a1 \ - CMOVQCS b2, a2 \ - CMOVQCS b3, a3 \ + CMOVQCS b0, a0 \ // CMOVQCS: Move if below (CF == 1) + CMOVQCS b1, a1 \ + CMOVQCS b2, a2 \ + CMOVQCS b3, a3 \ \ - MOVQ a0, (8*0)(res) \ - MOVQ a1, (8*1)(res) \ - MOVQ a2, (8*2)(res) \ + MOVQ a0, (8*0)(res) \ + MOVQ a1, (8*1)(res) \ + MOVQ a2, (8*2)(res) \ MOVQ a3, (8*3)(res) /* ---------------------------------------*/ @@ -138,7 +138,7 @@ GLOBL p256one<>(SB), 8, $32 SBBQ p256ord<>+0x18(SB), a3 \ SBBQ $0, a4 \ \ - CMOVQCS b0, a0 \ + CMOVQCS b0, a0 \ // CMOVQCS: Move if below (CF == 1) CMOVQCS b1, a1 \ CMOVQCS b2, a2 \ CMOVQCS b3, a3 \ @@ -148,175 +148,175 @@ GLOBL p256one<>(SB), 8, $32 MOVQ a2, (8*2)(res) \ MOVQ a3, (8*3)(res) -#define sm2P256SqrReductionInternal() \ +#define sm2P256SqrReductionInline \ \ // First reduction step - MOVQ acc0, mul0 \ - MOVQ acc0, mul1 \ - SHLQ $32, 
mul0 \ - SHRQ $32, mul1 \ + MOVQ acc0, mul0 \ + MOVQ acc0, mul1 \ + SHLQ $32, mul0 \ + SHRQ $32, mul1 \ \ - SUBQ mul0, acc1 \ - SBBQ mul1, acc2 \ - SBBQ mul0, acc3 \ - MOVQ acc0, mul0 \ - SBBQ mul1, acc0 \ + SUBQ mul0, acc1 \ + SBBQ mul1, acc2 \ + SBBQ mul0, acc3 \ + MOVQ acc0, mul0 \ + SBBQ mul1, acc0 \ \ - ADDQ mul0, acc1 \ - ADCQ $0, acc2 \ - ADCQ $0, acc3 \ - ADCQ $0, acc0 \ + ADDQ mul0, acc1 \ + ADCQ $0, acc2 \ + ADCQ $0, acc3 \ + ADCQ $0, acc0 \ \ // Second reduction step - MOVQ acc1, mul0 \ - MOVQ acc1, mul1 \ - SHLQ $32, mul0 \ - SHRQ $32, mul1 \ + MOVQ acc1, mul0 \ + MOVQ acc1, mul1 \ + SHLQ $32, mul0 \ + SHRQ $32, mul1 \ \ - SUBQ mul0, acc2 \ - SBBQ mul1, acc3 \ - SBBQ mul0, acc0 \ - MOVQ acc1, mul0 \ - SBBQ mul1, acc1 \ + SUBQ mul0, acc2 \ + SBBQ mul1, acc3 \ + SBBQ mul0, acc0 \ + MOVQ acc1, mul0 \ + SBBQ mul1, acc1 \ \ - ADDQ mul0, acc2 \ - ADCQ $0, acc3 \ - ADCQ $0, acc0 \ - ADCQ $0, acc1 \ + ADDQ mul0, acc2 \ + ADCQ $0, acc3 \ + ADCQ $0, acc0 \ + ADCQ $0, acc1 \ \ // Third reduction step - MOVQ acc2, mul0 \ - MOVQ acc2, mul1 \ - SHLQ $32, mul0 \ - SHRQ $32, mul1 \ + MOVQ acc2, mul0 \ + MOVQ acc2, mul1 \ + SHLQ $32, mul0 \ + SHRQ $32, mul1 \ \ - SUBQ mul0, acc3 \ - SBBQ mul1, acc0 \ - SBBQ mul0, acc1 \ - MOVQ acc2, mul0 \ - SBBQ mul1, acc2 \ + SUBQ mul0, acc3 \ + SBBQ mul1, acc0 \ + SBBQ mul0, acc1 \ + MOVQ acc2, mul0 \ + SBBQ mul1, acc2 \ \ - ADDQ mul0, acc3 \ - ADCQ $0, acc0 \ - ADCQ $0, acc1 \ - ADCQ $0, acc2 \ + ADDQ mul0, acc3 \ + ADCQ $0, acc0 \ + ADCQ $0, acc1 \ + ADCQ $0, acc2 \ \ // Last reduction step - MOVQ acc3, mul0 \ - MOVQ acc3, mul1 \ - SHLQ $32, mul0 \ - SHRQ $32, mul1 \ + MOVQ acc3, mul0 \ + MOVQ acc3, mul1 \ + SHLQ $32, mul0 \ + SHRQ $32, mul1 \ \ - SUBQ mul0, acc0 \ - SBBQ mul1, acc1 \ - SBBQ mul0, acc2 \ - MOVQ acc3, mul0 \ - SBBQ mul1, acc3 \ + SUBQ mul0, acc0 \ + SBBQ mul1, acc1 \ + SBBQ mul0, acc2 \ + MOVQ acc3, mul0 \ + SBBQ mul1, acc3 \ \ - ADDQ mul0, acc0 \ - ADCQ $0, acc1 \ - ADCQ $0, acc2 \ - ADCQ $0, acc3 \ - MOVQ $0, mul0 
\ + ADDQ mul0, acc0 \ + ADCQ $0, acc1 \ + ADCQ $0, acc2 \ + ADCQ $0, acc3 \ + MOVQ $0, mul0 \ \ // Add bits [511:256] of the result - ADCQ acc0, t0 \ - ADCQ acc1, t1 \ - ADCQ acc2, t2 \ - ADCQ acc3, t3 \ - ADCQ $0, mul0 \ + ADCQ acc0, t0 \ + ADCQ acc1, t1 \ + ADCQ acc2, t2 \ + ADCQ acc3, t3 \ + ADCQ $0, mul0 \ \ // Copy result - MOVQ t0, acc4 \ - MOVQ t1, acc5 \ - MOVQ t2, acc6 \ - MOVQ t3, acc7 \ + MOVQ t0, acc4 \ + MOVQ t1, acc5 \ + MOVQ t2, acc6 \ + MOVQ t3, acc7 \ \ // Subtract p256 - SUBQ $-1, acc4 \ - SBBQ p256p<>+0x08(SB), acc5 \ - SBBQ $-1, acc6 \ - SBBQ p256p<>+0x018(SB), acc7 \ - SBBQ $0, mul0 \ + SUBQ $-1, acc4 \ + SBBQ p256p<>+0x08(SB), acc5 \ + SBBQ $-1, acc6 \ + SBBQ p256p<>+0x018(SB), acc7\ + SBBQ $0, mul0 \ \ // If the result of the subtraction is negative, restore the previous result - CMOVQCS t0, acc4 \ - CMOVQCS t1, acc5 \ - CMOVQCS t2, acc6 \ + CMOVQCS t0, acc4 \ // CMOVQCS: Move if below (CF == 1) + CMOVQCS t1, acc5 \ + CMOVQCS t2, acc6 \ CMOVQCS t3, acc7 -#define sm2P256MulReductionInternal() \ +#define sm2P256MulReductionInline \ \// First reduction step - MOVQ acc0, mul0 \ - MOVQ acc0, mul1 \ - SHLQ $32, mul0 \ - SHRQ $32, mul1 \ + MOVQ acc0, mul0 \ + MOVQ acc0, mul1 \ + SHLQ $32, mul0 \ + SHRQ $32, mul1 \ \ - SUBQ mul0, acc1 \ - SBBQ mul1, acc2 \ - SBBQ mul0, acc3 \ - MOVQ acc0, mul0 \ - SBBQ mul1, acc0 \ + SUBQ mul0, acc1 \ + SBBQ mul1, acc2 \ + SBBQ mul0, acc3 \ + MOVQ acc0, mul0 \ + SBBQ mul1, acc0 \ \ - ADDQ mul0, acc1 \ - ADCQ $0, acc2 \ - ADCQ $0, acc3 \ - ADCQ $0, acc0 \ + ADDQ mul0, acc1 \ + ADCQ $0, acc2 \ + ADCQ $0, acc3 \ + ADCQ $0, acc0 \ \// Second reduction step - MOVQ acc1, mul0 \ - MOVQ acc1, mul1 \ - SHLQ $32, mul0 \ - SHRQ $32, mul1 \ + MOVQ acc1, mul0 \ + MOVQ acc1, mul1 \ + SHLQ $32, mul0 \ + SHRQ $32, mul1 \ \ - SUBQ mul0, acc2 \ - SBBQ mul1, acc3 \ - SBBQ mul0, acc0 \ - MOVQ acc1, mul0 \ - SBBQ mul1, acc1 \ + SUBQ mul0, acc2 \ + SBBQ mul1, acc3 \ + SBBQ mul0, acc0 \ + MOVQ acc1, mul0 \ + SBBQ mul1, acc1 \ \ - ADDQ 
mul0, acc2 \ - ADCQ $0, acc3 \ - ADCQ $0, acc0 \ - ADCQ $0, acc1 \ + ADDQ mul0, acc2 \ + ADCQ $0, acc3 \ + ADCQ $0, acc0 \ + ADCQ $0, acc1 \ \// Third reduction step - MOVQ acc2, mul0 \ - MOVQ acc2, mul1 \ - SHLQ $32, mul0 \ - SHRQ $32, mul1 \ + MOVQ acc2, mul0 \ + MOVQ acc2, mul1 \ + SHLQ $32, mul0 \ + SHRQ $32, mul1 \ \ - SUBQ mul0, acc3 \ - SBBQ mul1, acc0 \ - SBBQ mul0, acc1 \ - MOVQ acc2, mul0 \ - SBBQ mul1, acc2 \ + SUBQ mul0, acc3 \ + SBBQ mul1, acc0 \ + SBBQ mul0, acc1 \ + MOVQ acc2, mul0 \ + SBBQ mul1, acc2 \ \ - ADDQ mul0, acc3 \ - ADCQ $0, acc0 \ - ADCQ $0, acc1 \ - ADCQ $0, acc2 \ + ADDQ mul0, acc3 \ + ADCQ $0, acc0 \ + ADCQ $0, acc1 \ + ADCQ $0, acc2 \ \// Last reduction step - MOVQ acc3, mul0 \ - MOVQ acc3, mul1 \ - SHLQ $32, mul0 \ - SHRQ $32, mul1 \ + MOVQ acc3, mul0 \ + MOVQ acc3, mul1 \ + SHLQ $32, mul0 \ + SHRQ $32, mul1 \ \ - SUBQ mul0, acc0 \ - SBBQ mul1, acc1 \ - SBBQ mul0, acc2 \ - MOVQ acc3, mul0 \ - SBBQ mul1, acc3 \ + SUBQ mul0, acc0 \ + SBBQ mul1, acc1 \ + SBBQ mul0, acc2 \ + MOVQ acc3, mul0 \ + SBBQ mul1, acc3 \ \ - ADDQ mul0, acc0 \ - ADCQ $0, acc1 \ - ADCQ $0, acc2 \ + ADDQ mul0, acc0 \ + ADCQ $0, acc1 \ + ADCQ $0, acc2 \ ADCQ $0, acc3 #define p256PointDoubleInit() \ - MOVOU (16*0)(BX), X0 \ - MOVOU (16*1)(BX), X1 \ - MOVOU (16*2)(BX), X2 \ - MOVOU (16*3)(BX), X3 \ - MOVOU (16*4)(BX), X4 \ - MOVOU (16*5)(BX), X5 \ + MOVOU (16*0)(BX), X0;\ + MOVOU (16*1)(BX), X1;\ + MOVOU (16*2)(BX), X2;\ + MOVOU (16*3)(BX), X3;\ + MOVOU (16*4)(BX), X4;\ + MOVOU (16*5)(BX), X5;\ \ - MOVOU X0, x(16*0) \ - MOVOU X1, x(16*1) \ - MOVOU X2, y(16*0) \ - MOVOU X3, y(16*1) \ - MOVOU X4, z(16*0) \ - MOVOU X5, z(16*1) + MOVOU X0, x(16*0);\ + MOVOU X1, x(16*1);\ + MOVOU X2, y(16*0);\ + MOVOU X3, y(16*1);\ + MOVOU X4, z(16*0);\ + MOVOU X5, z(16*1); /* ---------------------------------------*/ // [t3, t2, t1, t0] = 2[acc7, acc6, acc5, acc4] @@ -336,7 +336,7 @@ GLOBL p256one<>(SB), 8, $32 SBBQ $-1, t2;\ SBBQ p256p<>+0x018(SB), t3;\ SBBQ $0, mul0;\ - CMOVQCS acc4, 
t0;\ + CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1) CMOVQCS acc5, t1;\ CMOVQCS acc6, t2;\ CMOVQCS acc7, t3; @@ -359,7 +359,7 @@ GLOBL p256one<>(SB), 8, $32 SBBQ $-1, acc6;\ SBBQ p256p<>+0x018(SB), acc7;\ SBBQ $0, mul0;\ - CMOVQCS t0, acc4;\ + CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1) CMOVQCS t1, acc5;\ CMOVQCS t2, acc6;\ CMOVQCS t3, acc7; @@ -386,7 +386,7 @@ GLOBL p256one<>(SB), 8, $32 SBBQ $-1, acc6;\ SBBQ p256p<>+0x018(SB), acc7;\ SBBQ $0, mul0;\ - CMOVQCS t0, acc4;\ + CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1) CMOVQCS t1, acc5;\ CMOVQCS t2, acc6;\ CMOVQCS t3, acc7;\ @@ -405,7 +405,7 @@ GLOBL p256one<>(SB), 8, $32 SBBQ $-1, t2;\ SBBQ p256p<>+0x018(SB), t3;\ SBBQ $0, mul0;\ - CMOVQCS acc4, t0;\ + CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1) CMOVQCS acc5, t1;\ CMOVQCS acc6, t2;\ CMOVQCS acc7, t3; @@ -428,7 +428,718 @@ GLOBL p256one<>(SB), 8, $32 SBBQ $-1, t2;\ SBBQ p256p<>+0x018(SB), t3;\ SBBQ $0, mul0;\ - CMOVQCS acc4, t0;\ + CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1) CMOVQCS acc5, t1;\ CMOVQCS acc6, t2;\ CMOVQCS acc7, t3; + +/* ---------------------------------------*/ +// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0] +#define p256SubInline2 \ + XORQ mul0, mul0;\ + SUBQ t0, acc4;\ + SBBQ t1, acc5;\ + SBBQ t2, acc6;\ + SBBQ t3, acc7;\ + SBBQ $0, mul0;\ + MOVQ acc4, acc0;\ + MOVQ acc5, acc1;\ + MOVQ acc6, acc2;\ + MOVQ acc7, acc3;\ + ADDQ $-1, acc4;\ + ADCQ p256p<>+0x08(SB), acc5;\ + ADCQ $-1, acc6;\ + ADCQ p256p<>+0x018(SB), acc7;\ + ANDQ $1, mul0;\ + CMOVQEQ acc0, acc4;\ // CMOVQEQ: Move if equal (ZF == 1) + CMOVQEQ acc1, acc5;\ + CMOVQEQ acc2, acc6;\ + CMOVQEQ acc3, acc7;\ + +/* ---------------------------------------*/ +#define p256SqrRound(t1) \ + \// y[1:] * y[0] + MOVQ (8*0)(x_ptr), t0;\ + \ + MOVQ (8*1)(x_ptr), AX;\ + MULQ t0;\ + MOVQ AX, acc1;\ + MOVQ DX, acc2;\ + \ + MOVQ (8*2)(x_ptr), AX;\ + MULQ t0;\ + ADDQ AX, acc2;\ + ADCQ $0, DX;\ + MOVQ DX, acc3;\ + \ + MOVQ 
(8*3)(x_ptr), AX;\ + MULQ t0;\ + ADDQ AX, acc3;\ + ADCQ $0, DX;\ + MOVQ DX, acc4;\ + \// y[2:] * y[1] + MOVQ (8*1)(x_ptr), t0;\ + \ + MOVQ (8*2)(x_ptr), AX;\ + MULQ t0;\ + ADDQ AX, acc3;\ + ADCQ $0, DX;\ + MOVQ DX, t1;\ + \ + MOVQ (8*3)(x_ptr), AX;\ + MULQ t0;\ + ADDQ t1, acc4;\ + ADCQ $0, DX;\ + ADDQ AX, acc4;\ + ADCQ $0, DX;\ + MOVQ DX, acc5;\ + \// y[3] * y[2] + MOVQ (8*2)(x_ptr), t0;\ + \ + MOVQ (8*3)(x_ptr), AX;\ + MULQ t0;\ + ADDQ AX, acc5;\ + ADCQ $0, DX;\ + MOVQ DX, y_ptr;\ + XORQ t1, t1;\ + \// *2 + ADDQ acc1, acc1;\ + ADCQ acc2, acc2;\ + ADCQ acc3, acc3;\ + ADCQ acc4, acc4;\ + ADCQ acc5, acc5;\ + ADCQ y_ptr, y_ptr;\ + ADCQ $0, t1;\ + \// Missing products + MOVQ (8*0)(x_ptr), AX;\ + MULQ AX;\ + MOVQ AX, acc0;\ + MOVQ DX, t0;\ + \ + MOVQ (8*1)(x_ptr), AX;\ + MULQ AX;\ + ADDQ t0, acc1;\ + ADCQ AX, acc2;\ + ADCQ $0, DX;\ + MOVQ DX, t0;\ + \ + MOVQ (8*2)(x_ptr), AX;\ + MULQ AX;\ + ADDQ t0, acc3;\ + ADCQ AX, acc4;\ + ADCQ $0, DX;\ + MOVQ DX, t0;\ + \ + MOVQ (8*3)(x_ptr), AX;\ + MULQ AX;\ + ADDQ t0, acc5;\ + ADCQ AX, y_ptr;\ + ADCQ DX, t1;\ + MOVQ t1, x_ptr;\ + \// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0] + p256SqrMontReduceInline;\ + p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\ + MOVQ res_ptr, x_ptr; + +/* ---------------------------------------*/ +#define p256SqrRoundAdx(t1) \ + XORQ acc0, acc0;\ + XORQ y_ptr, y_ptr;\ + \// x[1:] * x[0] + MOVQ (8*0)(x_ptr), DX;\ + MULXQ (8*1)(x_ptr), acc1, acc2;\ + \ + MULXQ (8*2)(x_ptr), AX, acc3;\ + ADOXQ AX, acc2;\ + \ + MULXQ (8*3)(x_ptr), AX, acc4;\ + ADOXQ AX, acc3;\ + ADOXQ y_ptr, acc4;\ + \ + \// x[2:] * x[1] + MOVQ (8*1)(x_ptr), DX;\ + MULXQ (8*2)(x_ptr), AX, t1;\ + ADOXQ AX, acc3;\ + \ + MULXQ (8*3)(x_ptr), AX, acc5;\ + ADCXQ t1, AX;\ + ADOXQ AX, acc4;\ + ADCXQ y_ptr, acc5;\ + \ + \// x[3] * x[2] + MOVQ (8*2)(x_ptr), DX;\ + MULXQ (8*3)(x_ptr), AX, y_ptr ;\ + ADOXQ AX, acc5;\ + ADOXQ acc0, y_ptr;\ + \ + XORQ t1, t1;\ + \ + \// *2 + ADOXQ acc1, acc1;\ + ADOXQ acc2, acc2;\ +
ADOXQ acc3, acc3;\ + ADOXQ acc4, acc4;\ + ADOXQ acc5, acc5;\ + ADOXQ y_ptr, y_ptr;\ + ADOXQ acc0, t1;\ + \ + \// Missing products + MOVQ (8*0)(x_ptr), DX;\ + MULXQ DX, acc0, t0;\ + ADCXQ t0, acc1;\ + \ + MOVQ (8*1)(x_ptr), DX;\ + MULXQ DX, AX, t0;\ + ADCXQ AX, acc2;\ + ADCXQ t0, acc3;\ + \ + MOVQ (8*2)(x_ptr), DX;\ + MULXQ DX, AX, t0 ;\ + ADCXQ AX, acc4;\ + ADCXQ t0, acc5;\ + \ + MOVQ (8*3)(x_ptr), DX;\ + MULXQ DX, AX, x_ptr;\ + ADCXQ AX, y_ptr;\ + ADCXQ t1, x_ptr;\ + \ + \// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0] + p256SqrMontReduceInline;\ + p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\ + MOVQ res_ptr, x_ptr; + +/* ---------------------------------------*/ +#define p256OrdSqrRound(t1) \ + \// y[1:] * y[0] + MOVQ (8*0)(x_ptr), t0;\ + \ + MOVQ (8*1)(x_ptr), AX;\ + MULQ t0;\ + MOVQ AX, acc1;\ + MOVQ DX, acc2;\ + \ + MOVQ (8*2)(x_ptr), AX;\ + MULQ t0;\ + ADDQ AX, acc2;\ + ADCQ $0, DX;\ + MOVQ DX, acc3;\ + \ + MOVQ (8*3)(x_ptr), AX;\ + MULQ t0;\ + ADDQ AX, acc3;\ + ADCQ $0, DX;\ + MOVQ DX, acc4;\ + \// y[2:] * y[1] + MOVQ (8*1)(x_ptr), t0;\ + \ + MOVQ (8*2)(x_ptr), AX;\ + MULQ t0;\ + ADDQ AX, acc3;\ + ADCQ $0, DX;\ + MOVQ DX, t1;\ + \ + MOVQ (8*3)(x_ptr), AX;\ + MULQ t0;\ + ADDQ t1, acc4;\ + ADCQ $0, DX;\ + ADDQ AX, acc4;\ + ADCQ $0, DX;\ + MOVQ DX, acc5;\ + \// y[3] * y[2] + MOVQ (8*2)(x_ptr), t0;\ + \ + MOVQ (8*3)(x_ptr), AX;\ + MULQ t0;\ + ADDQ AX, acc5;\ + ADCQ $0, DX;\ + MOVQ DX, y_ptr;\ + XORQ t1, t1;\ + \// *2 + ADDQ acc1, acc1;\ + ADCQ acc2, acc2;\ + ADCQ acc3, acc3;\ + ADCQ acc4, acc4;\ + ADCQ acc5, acc5;\ + ADCQ y_ptr, y_ptr;\ + ADCQ $0, t1;\ + \// Missing products + MOVQ (8*0)(x_ptr), AX;\ + MULQ AX;\ + MOVQ AX, acc0;\ + MOVQ DX, t0;\ + \ + MOVQ (8*1)(x_ptr), AX;\ + MULQ AX;\ + ADDQ t0, acc1;\ + ADCQ AX, acc2;\ + ADCQ $0, DX;\ + MOVQ DX, t0;\ + \ + MOVQ (8*2)(x_ptr), AX;\ + MULQ AX;\ + ADDQ t0, acc3;\ + ADCQ AX, acc4;\ + ADCQ $0, DX;\ + MOVQ DX, t0;\ + \ + MOVQ (8*3)(x_ptr), AX;\ + MULQ AX;\ + ADDQ t0, acc5;\ + 
ADCQ AX, y_ptr;\ + ADCQ DX, t1;\ + MOVQ t1, x_ptr;\ + \ + \// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0] + MOVQ acc0, AX;\ + MULQ p256ordK0<>(SB);\ + MOVQ AX, t0;\ // Y = t0 = (k0 * acc0) mod 2^64 + \ + MOVQ p256ord<>+0x00(SB), AX;\ + MULQ t0;\ + ADDQ AX, acc0;\ // (carry1, acc0) = acc0 + L(t0 * ord0) + ADCQ $0, DX;\ // DX = carry1 + H(t0 * ord0) + MOVQ DX, t1;\ // t1 = carry1 + H(t0 * ord0) + MOVQ t0, acc0;\ // acc0 = t0 + \ + \// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0 + MOVQ t0, AX;\ + MOVQ t0, DX;\ + SHLQ $32, AX;\ + SHRQ $32, DX;\ + \ + SUBQ t0, acc2;\ + SBBQ AX, acc3;\ + SBBQ DX, acc0;\ + \ + MOVQ p256ord<>+0x08(SB), AX;\ + MULQ t0;\ + ADDQ t1, acc1;\ // (carry2, acc1) = acc1 + t1 + ADCQ $0, DX;\ // DX = carry2 + H(t0*ord1) + \ + ADDQ AX, acc1;\ // (carry3, acc1) = acc1 + t1 + L(t0*ord1) + ADCQ DX, acc2;\ + ADCQ $0, acc3;\ + ADCQ $0, acc0;\ + \ + \// Second reduction step + MOVQ acc1, AX;\ + MULQ p256ordK0<>(SB);\ + MOVQ AX, t0;\ + \ + MOVQ p256ord<>+0x00(SB), AX;\ + MULQ t0;\ + ADDQ AX, acc1;\ + ADCQ $0, DX;\ + MOVQ DX, t1;\ + MOVQ t0, acc1;\ + \ + MOVQ t0, AX;\ + MOVQ t0, DX;\ + SHLQ $32, AX;\ + SHRQ $32, DX;\ + \ + SUBQ t0, acc3;\ + SBBQ AX, acc0;\ + SBBQ DX, acc1;\ + \ + MOVQ p256ord<>+0x08(SB), AX;\ + MULQ t0;\ + ADDQ t1, acc2;\ + ADCQ $0, DX;\ + \ + ADDQ AX, acc2;\ + ADCQ DX, acc3;\ + ADCQ $0, acc0;\ + ADCQ $0, acc1;\ + \ + \// Third reduction step + MOVQ acc2, AX;\ + MULQ p256ordK0<>(SB);\ + MOVQ AX, t0;\ + \ + MOVQ p256ord<>+0x00(SB), AX;\ + MULQ t0;\ + ADDQ AX, acc2;\ + ADCQ $0, DX;\ + MOVQ DX, t1;\ + MOVQ t0, acc2;\ + \ + MOVQ t0, AX;\ + MOVQ t0, DX;\ + SHLQ $32, AX;\ + SHRQ $32, DX;\ + \ + SUBQ t0, acc0;\ + SBBQ AX, acc1;\ + SBBQ DX, acc2;\ + \ + MOVQ p256ord<>+0x08(SB), AX;\ + MULQ t0;\ + ADDQ t1, acc3;\ + ADCQ $0, DX;\ + \ + ADDQ AX, acc3;\ + ADCQ DX, acc0;\ + ADCQ $0, acc1;\ + ADCQ $0, acc2;\ + \ + \// Last reduction step + MOVQ acc3, AX;\ + MULQ p256ordK0<>(SB);\ + MOVQ AX, t0;\ + \ + MOVQ 
p256ord<>+0x00(SB), AX;\ + MULQ t0;\ + ADDQ AX, acc3;\ + ADCQ $0, DX;\ + MOVQ DX, t1;\ + MOVQ t0, acc3;\ + \ + MOVQ t0, AX;\ + MOVQ t0, DX;\ + SHLQ $32, AX;\ + SHRQ $32, DX;\ + \ + SUBQ t0, acc1;\ + SBBQ AX, acc2;\ + SBBQ DX, acc3;\ + \ + MOVQ p256ord<>+0x08(SB), AX;\ + MULQ t0;\ + ADDQ t1, acc0;\ + ADCQ $0, DX;\ + \ + ADDQ AX, acc0;\ + ADCQ DX, acc1;\ + ADCQ $0, acc2;\ + ADCQ $0, acc3;\ + XORQ t0, t0;\ + \// Add bits [511:256] of the sqr result + ADCQ acc4, acc0;\ + ADCQ acc5, acc1;\ + ADCQ y_ptr, acc2;\ + ADCQ x_ptr, acc3;\ + ADCQ $0, t0;\ + \ + p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\ + MOVQ res_ptr, x_ptr; + +/* ---------------------------------------*/ +#define p256OrdSqrRoundAdx(t1) \ + XORQ acc0, acc0;\ + XORQ y_ptr, y_ptr;\ + \// y[1:] * y[0] + MOVQ (8*0)(x_ptr), DX;\ + MULXQ (8*1)(x_ptr), acc1, acc2 ;\ + \ + MULXQ (8*2)(x_ptr), AX, acc3;\ + ADOXQ AX, acc2;\ + \ + MULXQ (8*3)(x_ptr), AX, acc4;\ + ADOXQ AX, acc3;\ + ADOXQ y_ptr, acc4;\ + \ + \// y[2:] * y[1] + MOVQ (8*1)(x_ptr), DX;\ + MULXQ (8*2)(x_ptr), AX, t1;\ + ADOXQ AX, acc3;\ + \ + MULXQ (8*3)(x_ptr), AX, acc5;\ + ADCXQ t1, AX;\ + ADOXQ AX, acc4;\ + ADCXQ y_ptr, acc5;\ + \ + \// y[3] * y[2] + MOVQ (8*2)(x_ptr), DX;\ + MULXQ (8*3)(x_ptr), AX, y_ptr;\ + ADOXQ AX, acc5;\ + ADOXQ acc0, y_ptr;\ + \ + XORQ t1, t1;\ + \// *2 + ADOXQ acc1, acc1;\ + ADOXQ acc2, acc2;\ + ADOXQ acc3, acc3;\ + ADOXQ acc4, acc4;\ + ADOXQ acc5, acc5;\ + ADOXQ y_ptr, y_ptr;\ + ADOXQ acc0, t1;\ + \ + \// Missing products + MOVQ (8*0)(x_ptr), DX;\ + MULXQ DX, acc0, t0;\ + ADCXQ t0, acc1;\ + \ + MOVQ (8*1)(x_ptr), DX;\ + MULXQ DX, AX, t0;\ + ADCXQ AX, acc2;\ + ADCXQ t0, acc3;\ + \ + MOVQ (8*2)(x_ptr), DX;\ + MULXQ DX, AX, t0 ;\ + ADCXQ AX, acc4;\ + ADCXQ t0, acc5;\ + \ + MOVQ (8*3)(x_ptr), DX;\ + MULXQ DX, AX, x_ptr;\ + ADCXQ AX, y_ptr;\ + ADCXQ t1, x_ptr;\ + \ + \// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0] + \// First reduction step + MOVQ acc0, DX;\ + MULXQ p256ordK0<>(SB), DX, 
AX;\ + \ + MULXQ p256ord<>+0x00(SB), AX, t0;\ + ADOXQ AX, acc0 ;\// (carry1, acc0) = acc0 + t0 * ord0 + \ + MULXQ p256ord<>+0x08(SB), AX, t1;\ + ADCXQ t0, AX;\ + ADOXQ AX, acc1;\ + \ + MULXQ p256ord<>+0x10(SB), AX, t0;\ + ADCXQ t1, AX;\ + ADOXQ AX, acc2;\ + \ + MULXQ p256ord<>+0x18(SB), AX, acc0;\ + ADCXQ t0, AX;\ + ADOXQ AX, acc3;\ + MOVQ $0, t0;\ + ADCXQ t0, acc0;\ + ADOXQ t0, acc0;\ + \ + \// Second reduction step + MOVQ acc1, DX;\ + MULXQ p256ordK0<>(SB), DX, AX;\ + \ + MULXQ p256ord<>+0x00(SB), AX, t0;\ + ADOXQ AX, acc1;\ + \ + MULXQ p256ord<>+0x08(SB), AX, t1;\ + ADCXQ t0, AX;\ + ADOXQ AX, acc2;\ + \ + MULXQ p256ord<>+0x10(SB), AX, t0;\ + ADCXQ t1, AX;\ + ADOXQ AX, acc3;\ + \ + MULXQ p256ord<>+0x18(SB), AX, acc1;\ + ADCXQ t0, AX;\ + ADOXQ AX, acc0;\ + MOVQ $0, t0;\ + ADCXQ t0, acc1;\ + ADOXQ t0, acc1;\ + \ + \// Third reduction step + MOVQ acc2, DX;\ + MULXQ p256ordK0<>(SB), DX, AX;\ + \ + MULXQ p256ord<>+0x00(SB), AX, t0;\ + ADOXQ AX, acc2;\ + \ + MULXQ p256ord<>+0x08(SB), AX, t1;\ + ADCXQ t0, AX;\ + ADOXQ AX, acc3;\ + \ + MULXQ p256ord<>+0x10(SB), AX, t0;\ + ADCXQ t1, AX;\ + ADOXQ AX, acc0;\ + \ + MULXQ p256ord<>+0x18(SB), AX, acc2;\ + ADCXQ t0, AX;\ + ADOXQ AX, acc1;\ + MOVQ $0, t0;\ + ADCXQ t0, acc2;\ + ADOXQ t0, acc2;\ + \ + \// Last reduction step + MOVQ acc3, DX;\ + MULXQ p256ordK0<>(SB), DX, AX;\ + \ + MULXQ p256ord<>+0x00(SB), AX, t0;\ + ADOXQ AX, acc3;\ + \ + MULXQ p256ord<>+0x08(SB), AX, t1;\ + ADCXQ t0, AX;\ + ADOXQ AX, acc0;\ + \ + MULXQ p256ord<>+0x10(SB), AX, t0;\ + ADCXQ t1, AX;\ + ADOXQ AX, acc1;\ + \ + MULXQ p256ord<>+0x18(SB), AX, acc3;\ + ADCXQ t0, AX;\ + ADOXQ AX, acc2;\ + MOVQ $0, t0;\ + ADCXQ t0, acc3;\ + ADOXQ t0, acc3;\ + \ + XORQ t1, t1;\ + \// Add bits [511:256] of the sqr result + ADCXQ acc4, acc0;\ + ADCXQ acc5, acc1;\ + ADCXQ y_ptr, acc2;\ + ADCXQ x_ptr, acc3;\ + ADCXQ t1, t0;\ + \ + p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\ + MOVQ res_ptr, x_ptr; + +#define p256SqrInternalInline \ + MOVQ 
acc4, mul0;\ + MULQ acc5;\ + MOVQ mul0, acc1;\ + MOVQ mul1, acc2;\ + \ + MOVQ acc4, mul0;\ + MULQ acc6;\ + ADDQ mul0, acc2;\ + ADCQ $0, mul1;\ + MOVQ mul1, acc3;\ + \ + MOVQ acc4, mul0;\ + MULQ acc7;\ + ADDQ mul0, acc3;\ + ADCQ $0, mul1;\ + MOVQ mul1, t0;\ + \ + MOVQ acc5, mul0;\ + MULQ acc6;\ + ADDQ mul0, acc3;\ + ADCQ $0, mul1;\ + MOVQ mul1, acc0;\ + \ + MOVQ acc5, mul0;\ + MULQ acc7;\ + ADDQ acc0, t0;\ + ADCQ $0, mul1;\ + ADDQ mul0, t0;\ + ADCQ $0, mul1;\ + MOVQ mul1, t1;\ + \ + MOVQ acc6, mul0;\ + MULQ acc7;\ + ADDQ mul0, t1;\ + ADCQ $0, mul1;\ + MOVQ mul1, t2;\ + XORQ t3, t3;\ + \// *2 + ADDQ acc1, acc1;\ + ADCQ acc2, acc2;\ + ADCQ acc3, acc3;\ + ADCQ t0, t0;\ + ADCQ t1, t1;\ + ADCQ t2, t2;\ + ADCQ $0, t3;\ + \// Missing products + MOVQ acc4, mul0;\ + MULQ mul0;\ + MOVQ mul0, acc0;\ + MOVQ mul1, acc4;\ + \ + MOVQ acc5, mul0;\ + MULQ mul0;\ + ADDQ acc4, acc1;\ + ADCQ mul0, acc2;\ + ADCQ $0, mul1;\ + MOVQ mul1, acc4;\ + \ + MOVQ acc6, mul0;\ + MULQ mul0;\ + ADDQ acc4, acc3;\ + ADCQ mul0, t0;\ + ADCQ $0, mul1;\ + MOVQ mul1, acc4;\ + \ + MOVQ acc7, mul0;\ + MULQ mul0;\ + ADDQ acc4, t1;\ + ADCQ mul0, t2;\ + ADCQ mul1, t3;\ + \// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0] + sm2P256SqrReductionInline; + +#define p256SqrInternalInlineAdx \ + XORQ acc0, acc0;\ + XORQ t2, t2;\ + MOVQ acc4, mul1;\ + MULXQ acc5, acc1, acc2;\ + \ + MULXQ acc6, mul0, acc3;\ + ADOXQ mul0, acc2;\ + \ + MULXQ acc7, mul0, t0;\ + ADOXQ mul0, acc3;\ + ADOXQ t2, t0;\ + \ + MOVQ acc5, mul1;\ + MULXQ acc6, mul0, t3;\ + ADOXQ mul0, acc3;\ + \ + MULXQ acc7, mul0, t1;\ + ADCXQ t3, mul0;\ + ADOXQ mul0, t0;\ + ADCXQ t2, t1;\ + \ + MOVQ acc6, mul1;\ + MULXQ acc7, mul0, t2;\ + ADOXQ mul0, t1;\ + ADOXQ acc0, t2;\ + XORQ t3, t3;\ + \ + \// *2 + ADOXQ acc1, acc1;\ + ADOXQ acc2, acc2;\ + ADOXQ acc3, acc3;\ + ADOXQ t0, t0;\ + ADOXQ t1, t1;\ + ADOXQ t2, t2;\ + ADOXQ acc0, t3;\ + \ + \// Missing products + MOVQ acc4, mul1;\ + MULXQ mul1, acc0, acc4;\ + ADDQ acc4, acc1;\ + \ + MOVQ acc5, mul1;\ + MULXQ mul1,
mul0, acc4;\ + ADCXQ mul0, acc2;\ + ADCXQ acc4, acc3;\ + \ + MOVQ acc6, mul1;\ + MULXQ mul1, mul0, acc4;\ + ADCXQ mul0, t0;\ + ADCXQ acc4, t1;\ + \ + MOVQ acc7, mul1;\ + MULXQ mul1, mul0, acc4;\ + ADCXQ mul0, t2;\ + ADCXQ acc4, t3;\ + \// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0] + sm2P256SqrReductionInline; + +// p256IsZeroInline returns 1 in AX if [acc4..acc7] represents zero and zero +// otherwise. It writes to [acc4..acc7], t0 and t1. +#define p256IsZeroInline \ + \// AX contains a flag that is set if the input is zero. + XORQ AX, AX;\ + MOVQ $1, t1;\ + \// Check whether [acc4..acc7] are all zero. + MOVQ acc4, t0;\ + ORQ acc5, t0;\ + ORQ acc6, t0;\ + ORQ acc7, t0;\ + \// Set the zero flag if so. (CMOV of a constant to a register doesn't + \// appear to be supported in Go. Thus t1 = 1.) + CMOVQEQ t1, AX;\ // CMOVQEQ: Move if equal (ZF == 1) + \// XOR [acc4..acc7] with P and compare with zero again. + XORQ $-1, acc4;\ + XORQ p256p<>+0x08(SB), acc5;\ + XORQ $-1, acc6;\ + XORQ p256p<>+0x018(SB), acc7;\ + ORQ acc5, acc4;\ + ORQ acc6, acc4;\ + ORQ acc7, acc4;\ + \// Set the zero flag if so. + \// CMOVQEQ: Move if equal (ZF == 1) + CMOVQEQ t1, AX; diff --git a/internal/sm2ec/p256_plugin_amd64.s b/internal/sm2ec/p256_plugin_amd64.s index 0507def..dcb97ee 100644 --- a/internal/sm2ec/p256_plugin_amd64.s +++ b/internal/sm2ec/p256_plugin_amd64.s @@ -6,11 +6,21 @@ // https://eprint.iacr.org/2013/816.pdf //go:build amd64 && !purego && plugin +// plugin mode - DO NOT use the R15 Register. +// Below functions are different: +// 1.p256Sqr +// 2.p256OrdSqr +// 3.sm2P256MulInternal +// 4.sm2P256SqrInternal + #include "textflag.h" #include "p256_macros_amd64.s" /* ---------------------------------------*/ +// This func is same as non-plugin mode, except that it uses BP to store n +// and does not use R15. 
+// // func p256Sqr(res, in *p256Element, n int) TEXT ·p256Sqr(SB),NOSPLIT,$0 MOVQ res+0(FP), res_ptr @@ -21,162 +31,21 @@ TEXT ·p256Sqr(SB),NOSPLIT,$0 JEQ sqrBMI2 sqrLoop: - // y[1:] * y[0] - MOVQ (8*0)(x_ptr), t0 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - MOVQ AX, acc1 - MOVQ DX, acc2 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, acc3 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, acc4 - // y[2:] * y[1] - MOVQ (8*1)(x_ptr), t0 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, BX - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ BX, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, acc5 - // y[3] * y[2] - MOVQ (8*2)(x_ptr), t0 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ AX, acc5 - ADCQ $0, DX - MOVQ DX, y_ptr - XORQ BX, BX - // *2 - ADDQ acc1, acc1 - ADCQ acc2, acc2 - ADCQ acc3, acc3 - ADCQ acc4, acc4 - ADCQ acc5, acc5 - ADCQ y_ptr, y_ptr - ADCQ $0, BX - // Missing products - MOVQ (8*0)(x_ptr), AX - MULQ AX - MOVQ AX, acc0 - MOVQ DX, t0 - - MOVQ (8*1)(x_ptr), AX - MULQ AX - ADDQ t0, acc1 - ADCQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t0 - - MOVQ (8*2)(x_ptr), AX - MULQ AX - ADDQ t0, acc3 - ADCQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t0 - - MOVQ (8*3)(x_ptr), AX - MULQ AX - ADDQ t0, acc5 - ADCQ AX, y_ptr - ADCQ DX, BX - MOVQ BX, x_ptr - - // T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0] - p256SqrMontReduce() - p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr) - MOVQ res_ptr, x_ptr + p256SqrRound(BX) DECQ BP JNE sqrLoop RET sqrBMI2: - XORQ acc0, acc0 - XORQ y_ptr, y_ptr - // x[1:] * x[0] - MOVQ (8*0)(x_ptr), DX - MULXQ (8*1)(x_ptr), acc1, acc2 - - MULXQ (8*2)(x_ptr), AX, acc3 - ADOXQ AX, acc2 - - MULXQ (8*3)(x_ptr), AX, acc4 - ADOXQ AX, acc3 - ADOXQ y_ptr, acc4 - - // x[2:] * x[1] - MOVQ (8*1)(x_ptr), DX - MULXQ (8*2)(x_ptr), AX, BX - ADOXQ AX, acc3 - - MULXQ (8*3)(x_ptr), AX, acc5 - ADCXQ BX, AX - ADOXQ AX, acc4 - ADCXQ y_ptr, acc5 - - // x[3] * x[2] - MOVQ 
(8*2)(x_ptr), DX - MULXQ (8*3)(x_ptr), AX, y_ptr - ADOXQ AX, acc5 - ADOXQ acc0, y_ptr - XORQ BX, BX - - // *2 - ADOXQ acc1, acc1 - ADOXQ acc2, acc2 - ADOXQ acc3, acc3 - ADOXQ acc4, acc4 - ADOXQ acc5, acc5 - ADOXQ y_ptr, y_ptr - ADOXQ acc0, BX - - // Missing products - MOVQ (8*0)(x_ptr), DX - MULXQ DX, acc0, t0 - ADCXQ t0, acc1 - - MOVQ (8*1)(x_ptr), DX - MULXQ DX, AX, t0 - ADCXQ AX, acc2 - ADCXQ t0, acc3 - - MOVQ (8*2)(x_ptr), DX - MULXQ DX, AX, t0 - ADCXQ AX, acc4 - ADCXQ t0, acc5 - - MOVQ (8*3)(x_ptr), DX - MULXQ DX, AX, x_ptr - ADCXQ AX, y_ptr - ADCXQ BX, x_ptr - - // T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0] - p256SqrMontReduce() - p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr) - MOVQ res_ptr, x_ptr + p256SqrRoundAdx(BX) DECQ BP JNE sqrBMI2 RET /* ---------------------------------------*/ +// This func is same as non-plugin mode, except that it uses BP to store n +// and does not use R15. +// // func p256OrdSqr(res, in *p256OrdElement, n int) TEXT ·p256OrdSqr(SB),NOSPLIT,$0 MOVQ res+0(FP), res_ptr @@ -187,385 +56,14 @@ TEXT ·p256OrdSqr(SB),NOSPLIT,$0 JEQ ordSqrLoopBMI2 ordSqrLoop: - // y[1:] * y[0] - MOVQ (8*0)(x_ptr), t0 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - MOVQ AX, acc1 - MOVQ DX, acc2 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, acc3 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, acc4 - // y[2:] * y[1] - MOVQ (8*1)(x_ptr), t0 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, BX - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ BX, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, acc5 - // y[3] * y[2] - MOVQ (8*2)(x_ptr), t0 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ AX, acc5 - ADCQ $0, DX - MOVQ DX, y_ptr - XORQ BX, BX - // *2 - ADDQ acc1, acc1 - ADCQ acc2, acc2 - ADCQ acc3, acc3 - ADCQ acc4, acc4 - ADCQ acc5, acc5 - ADCQ y_ptr, y_ptr - ADCQ $0, BX - // Missing products - MOVQ (8*0)(x_ptr), AX - MULQ AX - MOVQ AX, acc0 - MOVQ 
DX, t0 - - MOVQ (8*1)(x_ptr), AX - MULQ AX - ADDQ t0, acc1 - ADCQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t0 - - MOVQ (8*2)(x_ptr), AX - MULQ AX - ADDQ t0, acc3 - ADCQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t0 - - MOVQ (8*3)(x_ptr), AX - MULQ AX - ADDQ t0, acc5 - ADCQ AX, y_ptr - ADCQ DX, BX - MOVQ BX, x_ptr - - // T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0] - // First reduction step, [ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0] - MOVQ acc0, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64 - - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0) - ADCQ $0, DX // DX = carry1 + H(t0 * ord0) - MOVQ DX, BX // BX = carry1 + H(t0 * ord0) - MOVQ t0, acc0 // acc0 = t0 - - // calculate the negative part: [acc0, acc3, acc2] - [0, 0x100000000, 1] * t0 - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc2 - SBBQ AX, acc3 - SBBQ DX, acc0 - - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ BX, acc1 // (carry2, acc1) = acc1 + BX - ADCQ $0, DX // DX = carry2 + H(t0*ord1) - - ADDQ AX, acc1 // (carry3, acc1) = acc1 + BX + L(t0*ord1) - ADCQ DX, acc2 - ADCQ $0, acc3 - ADCQ $0, acc0 - - // Second reduction step - MOVQ acc1, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 - - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, BX - MOVQ t0, acc1 - - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc3 - SBBQ AX, acc0 - SBBQ DX, acc1 - - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ BX, acc2 - ADCQ $0, DX - - ADDQ AX, acc2 - ADCQ DX, acc3 - ADCQ $0, acc0 - ADCQ $0, acc1 - - // Third reduction step - MOVQ acc2, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 - - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, BX - MOVQ t0, acc2 - - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc0 - SBBQ AX, acc1 - SBBQ DX, acc2 - - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ BX, acc3 - ADCQ $0, DX - - ADDQ AX, acc3 - 
ADCQ DX, acc0 - ADCQ $0, acc1 - ADCQ $0, acc2 - - // Last reduction step - MOVQ acc3, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 - - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, BX - MOVQ t0, acc3 - - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc1 - SBBQ AX, acc2 - SBBQ DX, acc3 - - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ BX, acc0 - ADCQ $0, DX - - ADDQ AX, acc0 - ADCQ DX, acc1 - ADCQ $0, acc2 - ADCQ $0, acc3 - - XORQ t0, t0 - // Add bits [511:256] of the sqr result - ADCQ acc4, acc0 - ADCQ acc5, acc1 - ADCQ y_ptr, acc2 - ADCQ x_ptr, acc3 - ADCQ $0, t0 - - p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr) - MOVQ res_ptr, x_ptr + p256OrdSqrRound(BX) DECQ BP JNE ordSqrLoop RET ordSqrLoopBMI2: - XORQ acc0, acc0 - XORQ y_ptr, y_ptr - // y[1:] * y[0] - MOVQ (8*0)(x_ptr), DX - MULXQ (8*1)(x_ptr), acc1, acc2 - - MULXQ (8*2)(x_ptr), AX, acc3 - ADOXQ AX, acc2 - - MULXQ (8*3)(x_ptr), AX, acc4 - ADOXQ AX, acc3 - ADOXQ y_ptr, acc4 - - // y[2:] * y[1] - MOVQ (8*1)(x_ptr), DX - MULXQ (8*2)(x_ptr), AX, BX - ADOXQ AX, acc3 - - MULXQ (8*3)(x_ptr), AX, acc5 - ADCXQ BX, AX - ADOXQ AX, acc4 - ADCXQ y_ptr, acc5 - - // y[3] * y[2] - MOVQ (8*2)(x_ptr), DX - MULXQ (8*3)(x_ptr), AX, y_ptr - ADOXQ AX, acc5 - ADOXQ acc0, y_ptr - - XORQ BX, BX - // *2 - ADOXQ acc1, acc1 - ADOXQ acc2, acc2 - ADOXQ acc3, acc3 - ADOXQ acc4, acc4 - ADOXQ acc5, acc5 - ADOXQ y_ptr, y_ptr - ADOXQ acc0, BX - - // Missing products - MOVQ (8*0)(x_ptr), DX - MULXQ DX, acc0, t0 - ADCXQ t0, acc1 - - MOVQ (8*1)(x_ptr), DX - MULXQ DX, AX, t0 - ADCXQ AX, acc2 - ADCXQ t0, acc3 - - MOVQ (8*2)(x_ptr), DX - MULXQ DX, AX, t0 - ADCXQ AX, acc4 - ADCXQ t0, acc5 - - MOVQ (8*3)(x_ptr), DX - MULXQ DX, AX, x_ptr - ADCXQ AX, y_ptr - ADCXQ BX, x_ptr - - // T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0] - // First reduction step - MOVQ acc0, DX - MULXQ p256ordK0<>(SB), DX, AX - - MULXQ p256ord<>+0x00(SB), AX, t0 - ADOXQ AX, acc0 // (carry1, 
acc0) = acc0 + t0 * ord0 - - MULXQ p256ord<>+0x08(SB), AX, BX - ADCXQ t0, AX - ADOXQ AX, acc1 - - MULXQ p256ord<>+0x10(SB), AX, t0 - ADCXQ BX, AX - ADOXQ AX, acc2 - - MULXQ p256ord<>+0x18(SB), AX, acc0 - ADCXQ t0, AX - ADOXQ AX, acc3 - MOVQ $0, t0 - ADCXQ t0, acc0 - ADOXQ t0, acc0 - - // Second reduction step - MOVQ acc1, DX - MULXQ p256ordK0<>(SB), DX, AX - - MULXQ p256ord<>+0x00(SB), AX, t0 - ADOXQ AX, acc1 - - MULXQ p256ord<>+0x08(SB), AX, BX - ADCXQ t0, AX - ADOXQ AX, acc2 - - MULXQ p256ord<>+0x10(SB), AX, t0 - ADCXQ BX, AX - ADOXQ AX, acc3 - - MULXQ p256ord<>+0x18(SB), AX, acc1 - ADCXQ t0, AX - ADOXQ AX, acc0 - MOVQ $0, t0 - ADCXQ t0, acc1 - ADOXQ t0, acc1 - - // Third reduction step - MOVQ acc2, DX - MULXQ p256ordK0<>(SB), DX, AX - - MULXQ p256ord<>+0x00(SB), AX, t0 - ADOXQ AX, acc2 - - MULXQ p256ord<>+0x08(SB), AX, BX - ADCXQ t0, AX - ADOXQ AX, acc3 - - MULXQ p256ord<>+0x10(SB), AX, t0 - ADCXQ BX, AX - ADOXQ AX, acc0 - - MULXQ p256ord<>+0x18(SB), AX, acc2 - ADCXQ t0, AX - ADOXQ AX, acc1 - MOVQ $0, t0 - ADCXQ t0, acc2 - ADOXQ t0, acc2 - - // Last reduction step - MOVQ acc3, DX - MULXQ p256ordK0<>(SB), DX, AX - - MULXQ p256ord<>+0x00(SB), AX, t0 - ADOXQ AX, acc3 - - MULXQ p256ord<>+0x08(SB), AX, BX - ADCXQ t0, AX - ADOXQ AX, acc0 - - MULXQ p256ord<>+0x10(SB), AX, t0 - ADCXQ BX, AX - ADOXQ AX, acc1 - - MULXQ p256ord<>+0x18(SB), AX, acc3 - ADCXQ t0, AX - ADOXQ AX, acc2 - MOVQ $0, t0 - ADCXQ t0, acc3 - ADOXQ t0, acc3 - - XORQ BX, BX - // Add bits [511:256] of the sqr result - ADCXQ acc4, acc0 - ADCXQ acc5, acc1 - ADCXQ y_ptr, acc2 - ADCXQ x_ptr, acc3 - ADCXQ BX, t0 - - p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr) - MOVQ res_ptr, x_ptr + p256OrdSqrRoundAdx(BX) DECQ BP JNE ordSqrLoopBMI2 @@ -599,33 +97,6 @@ ordSqrLoopBMI2: #define t2 SI #define t3 R9 -/* ---------------------------------------*/ -// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0] -TEXT sm2P256SubInternal(SB),NOSPLIT,$0 - XORQ mul0, mul0 - 
SUBQ t0, acc4 - SBBQ t1, acc5 - SBBQ t2, acc6 - SBBQ t3, acc7 - SBBQ $0, mul0 - - MOVQ acc4, acc0 - MOVQ acc5, acc1 - MOVQ acc6, acc2 - MOVQ acc7, acc3 - - ADDQ $-1, acc4 - ADCQ p256p<>+0x08(SB), acc5 - ADCQ $-1, acc6 - ADCQ p256p<>+0x018(SB), acc7 - ANDQ $1, mul0 - - CMOVQEQ acc0, acc4 - CMOVQEQ acc1, acc5 - CMOVQEQ acc2, acc6 - CMOVQEQ acc3, acc7 - - RET /* ---------------------------------------*/ // [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] * [t3, t2, t1, t0] TEXT sm2P256MulInternal(SB),NOSPLIT,$8 @@ -634,7 +105,7 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8 MOVQ acc4, mul0 MULQ t0 - MOVQ mul0, X0 + MOVQ mul0, X0 // uses X0 as temp register/storage MOVQ mul1, acc1 MOVQ acc4, mul0 @@ -746,7 +217,7 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8 MOVQ mul1, acc7 PEXTRQ $0, X0, acc0 - sm2P256MulReductionInternal() + sm2P256MulReductionInline MOVQ $0, mul0 // Add bits [511:256] of the result ADCQ acc0, acc4 @@ -775,7 +246,7 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8 internalMulBMI2: MOVQ acc4, mul1 MULXQ t0, acc0, acc1 - MOVQ acc0, X0 + MOVQ acc0, X0 // uses X0 as temp register/storage MULXQ t1, mul0, acc2 ADDQ mul0, acc1 @@ -848,7 +319,7 @@ internalMulBMI2: ADCQ $0, acc7 PEXTRQ $0, X0, acc0 - sm2P256MulReductionInternal() + sm2P256MulReductionInline MOVQ $0, mul0 // Add bits [511:256] of the result ADCQ acc0, acc4 @@ -881,140 +352,11 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8 CMPB ·supportBMI2+0(SB), $0x01 JEQ internalSqrBMI2 - MOVQ acc4, mul0 - MULQ acc5 - MOVQ mul0, acc1 - MOVQ mul1, acc2 - - MOVQ acc4, mul0 - MULQ acc6 - ADDQ mul0, acc2 - ADCQ $0, mul1 - MOVQ mul1, acc3 - - MOVQ acc4, mul0 - MULQ acc7 - ADDQ mul0, acc3 - ADCQ $0, mul1 - MOVQ mul1, t0 - - MOVQ acc5, mul0 - MULQ acc6 - ADDQ mul0, acc3 - ADCQ $0, mul1 - MOVQ mul1, acc0 - - MOVQ acc5, mul0 - MULQ acc7 - ADDQ acc0, t0 - ADCQ $0, mul1 - ADDQ mul0, t0 - ADCQ $0, mul1 - MOVQ mul1, t1 - - MOVQ acc6, mul0 - MULQ acc7 - ADDQ mul0, t1 - ADCQ $0, mul1 - MOVQ mul1, t2 - XORQ t3, t3 - // *2 - ADDQ acc1, acc1 - 
ADCQ acc2, acc2 - ADCQ acc3, acc3 - ADCQ t0, t0 - ADCQ t1, t1 - ADCQ t2, t2 - ADCQ $0, t3 - // Missing products - MOVQ acc4, mul0 - MULQ mul0 - MOVQ mul0, acc0 - MOVQ mul1, acc4 - - MOVQ acc5, mul0 - MULQ mul0 - ADDQ acc4, acc1 - ADCQ mul0, acc2 - ADCQ $0, mul1 - MOVQ mul1, acc4 - - MOVQ acc6, mul0 - MULQ mul0 - ADDQ acc4, acc3 - ADCQ mul0, t0 - ADCQ $0, mul1 - MOVQ mul1, acc4 - - MOVQ acc7, mul0 - MULQ mul0 - ADDQ acc4, t1 - ADCQ mul0, t2 - ADCQ mul1, t3 - // T = [t3, t2,, t1, t0, acc3, acc2, acc1, acc0] - sm2P256SqrReductionInternal() + p256SqrInternalInline RET internalSqrBMI2: - XORQ acc0, acc0 - XORQ t2, t2 - MOVQ acc4, mul1 - MULXQ acc5, acc1, acc2 - - MULXQ acc6, mul0, acc3 - ADOXQ mul0, acc2 - - MULXQ acc7, mul0, t0 - ADOXQ mul0, acc3 - ADOXQ t2, t0 - - MOVQ acc5, mul1 - MULXQ acc6, mul0, t3 - ADOXQ mul0, acc3 - - MULXQ acc7, mul0, t1 - ADCXQ t3, mul0 - ADOXQ mul0, t0 - ADCXQ t2, t1 - - MOVQ acc6, mul1 - MULXQ acc7, mul0, t2 - ADOXQ mul0, t1 - ADOXQ acc0, t2 - - XORQ t3, t3 - - // *2 - ADOXQ acc1, acc1 - ADOXQ acc2, acc2 - ADOXQ acc3, acc3 - ADOXQ t0, t0 - ADOXQ t1, t1 - ADOXQ t2, t2 - ADOXQ acc0, t3 - - // Missing products - MOVQ acc4, mul1 - MULXQ mul1, acc0, acc4 - ADDQ acc4, acc1 - - MOVQ acc5, mul1 - MULXQ mul1, mul0, acc4 - ADCXQ mul0, acc2 - ADCXQ acc4, acc3 - - MOVQ acc6, mul1 - MULXQ mul1, mul0, acc4 - ADCXQ mul0, t0 - ADCXQ acc4, t1 - - MOVQ acc7, mul1 - MULXQ mul1, mul0, acc4 - ADCXQ mul0, t2 - ADCXQ acc4, t3 - // T = [t3, t2,, t1, t0, acc3, acc2, acc1, acc0] - sm2P256SqrReductionInternal() - + p256SqrInternalInlineAdx RET /* ---------------------------------------*/ @@ -1099,7 +441,7 @@ internalSqrBMI2: CALL sm2P256MulInternal(SB) \// x2 * z1ˆ2 \ LDt (x1in) \ - CALL sm2P256SubInternal(SB) \// h = u2 - u1 + p256SubInline2 \// h = u2 - u1 ST (h) \ \ LDt (z1in) \ @@ -1114,7 +456,7 @@ internalSqrBMI2: ST (s2) \ \ LDt (y1in) \ - CALL sm2P256SubInternal(SB) \// r = s2 - s1 + p256SubInline2 \// r = s2 - s1 ST (r) \ \ CALL sm2P256SqrInternal(SB) \// 
rsqr = rˆ2 @@ -1139,10 +481,10 @@ internalSqrBMI2: \ p256MulBy2Inline \// u1 * hˆ2 * 2, inline LDacc (rsqr) \ - CALL sm2P256SubInternal(SB) \// rˆ2 - u1 * hˆ2 * 2 + p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2 \ LDt (hcub) \ - CALL sm2P256SubInternal(SB) \ + p256SubInline2 \ ST (xout) \ \ MOVQ acc4, t0 \ @@ -1150,13 +492,13 @@ internalSqrBMI2: MOVQ acc6, t2 \ MOVQ acc7, t3 \ LDacc (h) \ - CALL sm2P256SubInternal(SB) \ + p256SubInline2 \ \ LDt (r) \ CALL sm2P256MulInternal(SB) \ \ LDt (s2) \ - CALL sm2P256SubInternal(SB) \ + p256SubInline2 \ ST (yout) \ \// Load stored values from stack MOVQ rptr, AX \ @@ -1373,36 +715,6 @@ pointaddaffine_avx2: #undef sel_save #undef zero_save -// sm2P256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero -// otherwise. It writes to [acc4..acc7], t0 and t1. -TEXT sm2P256IsZero(SB),NOSPLIT,$0 - // AX contains a flag that is set if the input is zero. - XORQ AX, AX - MOVQ $1, t1 - - // Check whether [acc4..acc7] are all zero. - MOVQ acc4, t0 - ORQ acc5, t0 - ORQ acc6, t0 - ORQ acc7, t0 - - // Set the zero flag if so. (CMOV of a constant to a register doesn't - // appear to be supported in Go. Thus t1 = 1.) - CMOVQEQ t1, AX - - // XOR [acc4..acc7] with P and compare with zero again. - XORQ $-1, acc4 - XORQ p256p<>+0x08(SB), acc5 - XORQ $-1, acc6 - XORQ p256p<>+0x018(SB), acc7 - ORQ acc5, acc4 - ORQ acc6, acc4 - ORQ acc7, acc4 - - // Set the zero flag if so. 
- CMOVQEQ t1, AX - RET - /* ---------------------------------------*/ #define x1in(off) (32*0 + off)(SP) #define y1in(off) (32*1 + off)(SP) @@ -1450,9 +762,9 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0 ST (s2) \ \ LDt (s1) \ - CALL sm2P256SubInternal(SB) \// r = s2 - s1 + p256SubInline2 \// r = s2 - s1 ST (r) \ - CALL sm2P256IsZero(SB) \ + p256IsZeroInline \ MOVQ AX, points_eq \ \ LDacc (z2sqr) \ @@ -1465,9 +777,9 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0 ST (u2) \ \ LDt (u1) \ - CALL sm2P256SubInternal(SB) \// h = u2 - u1 + p256SubInline2 \// h = u2 - u1 ST (h) \ - CALL sm2P256IsZero(SB) \ + p256IsZeroInline \ ANDQ points_eq, AX \ MOVQ AX, points_eq \ \ @@ -1501,10 +813,10 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0 \ p256MulBy2Inline \// u1 * hˆ2 * 2, inline LDacc (rsqr) \ - CALL sm2P256SubInternal(SB) \// rˆ2 - u1 * hˆ2 * 2 + p256SubInline2 \// rˆ2 - u1 * hˆ2 * 2 \ LDt (hcub) \ - CALL sm2P256SubInternal(SB) \ + p256SubInline2 \ ST (xout) \ \ MOVQ acc4, t0 \ @@ -1512,13 +824,13 @@ TEXT sm2P256IsZero(SB),NOSPLIT,$0 MOVQ acc6, t2 \ MOVQ acc7, t3 \ LDacc (u2) \ - CALL sm2P256SubInternal(SB) \ + p256SubInline2 \ \ LDt (r) \ CALL sm2P256MulInternal(SB) \ \ LDt (s2) \ - CALL sm2P256SubInternal(SB) \ + p256SubInline2 \ ST (yout) \ //func p256PointAddAsm(res, in1, in2 *SM2P256Point) int @@ -1669,7 +981,7 @@ pointadd_avx2: #define calX() \ LDacc (x) \ LDt (zsqr) \ - CALL sm2P256SubInternal(SB) \ + p256SubInline2 \ LDt (m) \ CALL sm2P256MulInternal(SB) \ ST (m) \ @@ -1718,18 +1030,18 @@ pointadd_avx2: LDacc (m) \ CALL sm2P256SqrInternal(SB) \ LDt (tmp) \ - CALL sm2P256SubInternal(SB) \ + p256SubInline2 \ #define calY() \ acc2t \ LDacc (s) \ - CALL sm2P256SubInternal(SB) \ + p256SubInline2 \ \ LDt (m) \ CALL sm2P256MulInternal(SB) \ \ LDt (y) \ - CALL sm2P256SubInternal(SB) \ + p256SubInline2 \ #define lastP256PointDouble() \ \ // See https://hyperelliptic.org/EFD/g1p/data/shortw/jacobian-3/doubling/dbl-2007-bl