diff --git a/internal/sm2ec/p256_asm_amd64.s b/internal/sm2ec/p256_asm_amd64.s
index 3a66253..7f508ae 100644
--- a/internal/sm2ec/p256_asm_amd64.s
+++ b/internal/sm2ec/p256_asm_amd64.s
@@ -109,65 +109,65 @@ sqrLoop:
 	RET
 sqrBMI2:
-	// y[1:] * y[0]
+	XORQ acc0, acc0
+	XORQ y_ptr, y_ptr
+	// x[1:] * x[0]
 	MOVQ (8*0)(x_ptr), DX
-
 	MULXQ (8*1)(x_ptr), acc1, acc2
 	MULXQ (8*2)(x_ptr), AX, acc3
-	ADDQ AX, acc2
+	ADOXQ AX, acc2
 	MULXQ (8*3)(x_ptr), AX, acc4
-	ADCQ AX, acc3
-	ADCQ $0, acc4
+	ADOXQ AX, acc3
+	ADOXQ y_ptr, acc4
-	// y[2:] * y[1]
+	// x[2:] * x[1]
 	MOVQ (8*1)(x_ptr), DX
-
 	MULXQ (8*2)(x_ptr), AX, t1
-	ADDQ AX, acc3
-	ADCQ t1, acc4
+	ADOXQ AX, acc3
 	MULXQ (8*3)(x_ptr), AX, acc5
-	ADCQ $0, acc5
-	ADDQ AX, acc4
+	ADCXQ t1, AX
+	ADOXQ AX, acc4
+	ADCXQ y_ptr, acc5
-	// y[3] * y[2]
+	// x[3] * x[2]
 	MOVQ (8*2)(x_ptr), DX
+	MULXQ (8*3)(x_ptr), AX, y_ptr
+	ADOXQ AX, acc5
+	ADOXQ acc0, y_ptr
-	MULXQ (8*3)(x_ptr), AX, y_ptr
-	ADCQ AX, acc5
-	ADCQ $0, y_ptr
 	XORQ t1, t1
 	// *2
-	ADDQ acc1, acc1
-	ADCQ acc2, acc2
-	ADCQ acc3, acc3
-	ADCQ acc4, acc4
-	ADCQ acc5, acc5
-	ADCQ y_ptr, y_ptr
-	ADCQ $0, t1
+	ADOXQ acc1, acc1
+	ADOXQ acc2, acc2
+	ADOXQ acc3, acc3
+	ADOXQ acc4, acc4
+	ADOXQ acc5, acc5
+	ADOXQ y_ptr, y_ptr
+	ADOXQ acc0, t1
 	// Missing products
 	MOVQ (8*0)(x_ptr), DX
 	MULXQ DX, acc0, t0
-	ADDQ t0, acc1
+	ADCXQ t0, acc1
 	MOVQ (8*1)(x_ptr), DX
 	MULXQ DX, AX, t0
-	ADCQ AX, acc2
-	ADCQ t0, acc3
+	ADCXQ AX, acc2
+	ADCXQ t0, acc3
 	MOVQ (8*2)(x_ptr), DX
-	MULXQ DX, AX, t0
-	ADCQ AX, acc4
-	ADCQ t0, acc5
+	MULXQ DX, AX, t0
+	ADCXQ AX, acc4
+	ADCXQ t0, acc5
 	MOVQ (8*3)(x_ptr), DX
 	MULXQ DX, AX, x_ptr
-	ADCQ AX, y_ptr
-	ADCQ t1, x_ptr
+	ADCXQ AX, y_ptr
+	ADCXQ t1, x_ptr
 	// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
 	p256SqrMontReduce()
@@ -901,11 +901,11 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
 	MULQ acc6
 	ADDQ mul0, acc3
 	ADCQ $0, mul1
-	MOVQ mul1, hlp
+	MOVQ mul1, acc0
 	MOVQ acc5, mul0
 	MULQ acc7
-	ADDQ hlp, t0
+	ADDQ acc0, t0
 	ADCQ $0, mul1
 	ADDQ mul0, t0
 	ADCQ $0, mul1
@@ -955,39 +955,42 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
 	RET
 internalSqrBMI2:
+	XORQ acc0, acc0
+	XORQ t2, t2
 	MOVQ acc4, mul1
 	MULXQ acc5, acc1, acc2
 	MULXQ acc6, mul0, acc3
-	ADDQ mul0, acc2
+	ADOXQ mul0, acc2
 	MULXQ acc7, mul0, t0
-	ADCQ mul0, acc3
-	ADCQ $0, t0
+	ADOXQ mul0, acc3
+	ADOXQ t2, t0
 	MOVQ acc5, mul1
-	MULXQ acc6, mul0, hlp
-	ADDQ mul0, acc3
-	ADCQ hlp, t0
+	MULXQ acc6, mul0, t3
+	ADOXQ mul0, acc3
 	MULXQ acc7, mul0, t1
-	ADCQ $0, t1
-	ADDQ mul0, t0
+	ADCXQ t3, mul0
+	ADOXQ mul0, t0
+	ADCXQ t2, t1
 	MOVQ acc6, mul1
 	MULXQ acc7, mul0, t2
-	ADCQ mul0, t1
-	ADCQ $0, t2
+	ADOXQ mul0, t1
+	ADOXQ acc0, t2
+
 	XORQ t3, t3
 	// *2
-	ADDQ acc1, acc1
-	ADCQ acc2, acc2
-	ADCQ acc3, acc3
-	ADCQ t0, t0
-	ADCQ t1, t1
-	ADCQ t2, t2
-	ADCQ $0, t3
+	ADOXQ acc1, acc1
+	ADOXQ acc2, acc2
+	ADOXQ acc3, acc3
+	ADOXQ t0, t0
+	ADOXQ t1, t1
+	ADOXQ t2, t2
+	ADOXQ acc0, t3
 	// Missing products
 	MOVQ acc4, mul1
@@ -996,18 +999,18 @@ internalSqrBMI2:
 	MOVQ acc5, mul1
 	MULXQ mul1, mul0, acc4
-	ADCQ mul0, acc2
-	ADCQ acc4, acc3
+	ADCXQ mul0, acc2
+	ADCXQ acc4, acc3
 	MOVQ acc6, mul1
 	MULXQ mul1, mul0, acc4
-	ADCQ mul0, t0
-	ADCQ acc4, t1
+	ADCXQ mul0, t0
+	ADCXQ acc4, t1
 	MOVQ acc7, mul1
 	MULXQ mul1, mul0, acc4
-	ADCQ mul0, t2
-	ADCQ acc4, t3
+	ADCXQ mul0, t2
+	ADCXQ acc4, t3
 	// T = [t3, t2,, t1, t0, acc3, acc2, acc1, acc0]
 	sm2P256SqrReductionInternal()
diff --git a/internal/sm2ec/p256_common_amd64.s b/internal/sm2ec/p256_common_amd64.s
index 7de66dd..513c0e6 100644
--- a/internal/sm2ec/p256_common_amd64.s
+++ b/internal/sm2ec/p256_common_amd64.s
@@ -168,7 +168,6 @@ TEXT ·p256NegCond(SB),NOSPLIT,$0
 /* ---------------------------------------*/
 // func p256Mul(res, in1, in2 *p256Element)
 TEXT ·p256Mul(SB),NOSPLIT,$0
-	MOVQ res+0(FP), res_ptr
 	MOVQ in1+8(FP), x_ptr
 	MOVQ in2+16(FP), y_ptr
@@ -372,25 +371,27 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
 	ADCQ acc3, acc1
 	ADCQ $0, acc2
+	MOVQ res+0(FP), res_ptr
 	p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
 	RET
 mulBMI2:
+	XORQ acc5, acc5
+	XORQ res_ptr, res_ptr
 	// x * y[0]
 	MOVQ (8*0)(y_ptr), DX
 	MULXQ (8*0)(x_ptr), acc0, acc1
 	MULXQ (8*1)(x_ptr), AX, acc2
-	ADDQ AX, acc1
+	ADCXQ AX, acc1
 	MULXQ (8*2)(x_ptr), AX, acc3
-	ADCQ AX, acc2
+	ADCXQ AX, acc2
 	MULXQ (8*3)(x_ptr), AX, acc4
-	ADCQ AX, acc3
-	ADCQ $0, acc4
+	ADCXQ AX, acc3
+	ADCXQ acc5, acc4
-	XORQ acc5, acc5
 	// First reduction step
 	MOVQ acc0, AX
 	MOVQ acc0, DX
@@ -403,34 +404,32 @@ mulBMI2:
 	MOVQ acc0, AX
 	SBBQ DX, acc0
-	ADDQ AX, acc1
-	ADCQ $0, acc2
-	ADCQ $0, acc3
-	ADCQ acc0, acc4
-	ADCQ $0, acc5
+	ADOXQ AX, acc1
+	ADOXQ res_ptr, acc2
+	ADOXQ res_ptr, acc3
+	ADOXQ acc0, acc4
+	ADOXQ res_ptr, acc5
 	XORQ acc0, acc0
 	// x * y[1]
 	MOVQ (8*1)(y_ptr), DX
-	MULXQ (8*0)(x_ptr), AX, BX
-	ADDQ AX, acc1
-	ADCQ BX, acc2
+	MULXQ (8*0)(x_ptr), AX, t0
+	ADOXQ AX, acc1
 	MULXQ (8*1)(x_ptr), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc2
-	ADCQ BX, acc3
+	ADCXQ t0, AX
+	ADOXQ AX, acc2
-	MULXQ (8*2)(x_ptr), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc3
-	ADCQ BX, acc4
+	MULXQ (8*2)(x_ptr), AX, t0
+	ADCXQ BX, AX
+	ADOXQ AX, acc3
 	MULXQ (8*3)(x_ptr), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc4
-	ADCQ BX, acc5
-	ADCQ $0, acc0
+	ADCXQ t0, AX
+	ADOXQ AX, acc4
+	ADCXQ acc0, BX
+	ADOXQ BX, acc5
+	ADOXQ res_ptr, acc0
 	// Second reduction step
 	MOVQ acc1, AX
@@ -444,35 +443,33 @@ mulBMI2:
 	MOVQ acc1, AX
 	SBBQ DX, acc1
-	ADDQ AX, acc2
-	ADCQ $0, acc3
-	ADCQ $0, acc4
-	ADCQ acc1, acc5
-	ADCQ $0, acc0
+	ADOXQ AX, acc2
+	ADOXQ res_ptr, acc3
+	ADOXQ res_ptr, acc4
+	ADOXQ acc1, acc5
+	ADOXQ res_ptr, acc0
 	XORQ acc1, acc1
 	// x * y[2]
 	MOVQ (8*2)(y_ptr), DX
-
-	MULXQ (8*0)(x_ptr), AX, BX
-	ADDQ AX, acc2
-	ADCQ BX, acc3
+	MULXQ (8*0)(x_ptr), AX, t0
+	ADOXQ AX, acc2
 	MULXQ (8*1)(x_ptr), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc3
-	ADCQ BX, acc4
+	ADCXQ t0, AX
+	ADOXQ AX, acc3
-	MULXQ (8*2)(x_ptr), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc4
-	ADCQ BX, acc5
+	MULXQ (8*2)(x_ptr), AX, t0
+	ADCXQ BX, AX
+	ADOXQ AX, acc4
 	MULXQ (8*3)(x_ptr), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc5
-	ADCQ BX, acc0
-	ADCQ $0, acc1
+	ADCXQ t0, AX
+	ADOXQ AX, acc5
+	ADCXQ res_ptr, BX
+	ADOXQ BX, acc0
+	ADOXQ res_ptr, acc1
+
 	// Third reduction step
 	MOVQ acc2, AX
 	MOVQ acc2, DX
@@ -485,35 +482,33 @@ mulBMI2:
 	MOVQ acc2, AX
 	SBBQ DX, acc2
-	ADDQ AX, acc3
-	ADCQ $0, acc4
-	ADCQ $0, acc5
-	ADCQ acc2, acc0
-	ADCQ $0, acc1
+	ADOXQ AX, acc3
+	ADOXQ res_ptr, acc4
+	ADOXQ res_ptr, acc5
+	ADOXQ acc2, acc0
+	ADOXQ res_ptr, acc1
 	XORQ acc2, acc2
 	// x * y[3]
 	MOVQ (8*3)(y_ptr), DX
-
-	MULXQ (8*0)(x_ptr), AX, BX
-	ADDQ AX, acc3
-	ADCQ BX, acc4
+	MULXQ (8*0)(x_ptr), AX, t0
+	ADOXQ AX, acc3
 	MULXQ (8*1)(x_ptr), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc4
-	ADCQ BX, acc5
+	ADCXQ t0, AX
+	ADOXQ AX, acc4
-	MULXQ (8*2)(x_ptr), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc5
-	ADCQ BX, acc0
+	MULXQ (8*2)(x_ptr), AX, t0
+	ADCXQ BX, AX
+	ADOXQ AX, acc5
 	MULXQ (8*3)(x_ptr), AX, BX
-	ADCQ $0, BX
-	ADDQ AX, acc0
-	ADCQ BX, acc1
-	ADCQ $0, acc2
+	ADCXQ t0, AX
+	ADOXQ AX, acc0
+	ADCXQ res_ptr, BX
+	ADOXQ BX, acc1
+	ADOXQ res_ptr, acc2
+
 	// Last reduction step
 	MOVQ acc3, AX
 	MOVQ acc3, DX
@@ -526,12 +521,13 @@ mulBMI2:
 	MOVQ acc3, AX
 	SBBQ DX, acc3
-	ADDQ AX, acc4
-	ADCQ $0, acc5
-	ADCQ $0, acc0
-	ADCQ acc3, acc1
-	ADCQ $0, acc2
+	ADOXQ AX, acc4
+	ADOXQ res_ptr, acc5
+	ADOXQ res_ptr, acc0
+	ADOXQ acc3, acc1
+	ADOXQ res_ptr, acc2
+	MOVQ res+0(FP), res_ptr
 	p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
 	RET
diff --git a/internal/sm2ec/p256_plugin_amd64.s b/internal/sm2ec/p256_plugin_amd64.s
index cdf4b65..018c5b8 100644
--- a/internal/sm2ec/p256_plugin_amd64.s
+++ b/internal/sm2ec/p256_plugin_amd64.s
@@ -109,65 +109,64 @@ sqrLoop:
 	RET
 sqrBMI2:
-	// y[1:] * y[0]
+	XORQ acc0, acc0
+	XORQ y_ptr, y_ptr
+	// x[1:] * x[0]
 	MOVQ (8*0)(x_ptr), DX
-
 	MULXQ (8*1)(x_ptr), acc1, acc2
-
+
 	MULXQ (8*2)(x_ptr), AX, acc3
-	ADDQ AX, acc2
+	ADOXQ AX, acc2
 	MULXQ (8*3)(x_ptr), AX, acc4
-	ADCQ AX, acc3
-	ADCQ $0, acc4
+	ADOXQ AX, acc3
+	ADOXQ y_ptr, acc4
-	// y[2:] * y[1]
+	// x[2:] * x[1]
 	MOVQ (8*1)(x_ptr), DX
-
 	MULXQ (8*2)(x_ptr), AX, BX
-	ADDQ AX, acc3
-	ADCQ BX, acc4
+	ADOXQ AX, acc3
 	MULXQ (8*3)(x_ptr), AX, acc5
-	ADCQ $0, acc5
-	ADDQ AX, acc4
+	ADCXQ BX, AX
+	ADOXQ AX, acc4
+	ADCXQ y_ptr, acc5
-	// y[3] * y[2]
+	// x[3] * x[2]
 	MOVQ (8*2)(x_ptr), DX
-
 	MULXQ (8*3)(x_ptr), AX, y_ptr
-	ADCQ AX, acc5
-	ADCQ $0, y_ptr
+	ADOXQ AX, acc5
+	ADOXQ acc0, y_ptr
 	XORQ BX, BX
 	// *2
-	ADDQ acc1, acc1
-	ADCQ acc2, acc2
-	ADCQ acc3, acc3
-	ADCQ acc4, acc4
-	ADCQ acc5, acc5
-	ADCQ y_ptr, y_ptr
-	ADCQ $0, BX
+	ADOXQ acc1, acc1
+	ADOXQ acc2, acc2
+	ADOXQ acc3, acc3
+	ADOXQ acc4, acc4
+	ADOXQ acc5, acc5
+	ADOXQ y_ptr, y_ptr
+	ADOXQ acc0, BX
 	// Missing products
 	MOVQ (8*0)(x_ptr), DX
 	MULXQ DX, acc0, t0
-	ADDQ t0, acc1
+	ADCXQ t0, acc1
 	MOVQ (8*1)(x_ptr), DX
 	MULXQ DX, AX, t0
-	ADCQ AX, acc2
-	ADCQ t0, acc3
+	ADCXQ AX, acc2
+	ADCXQ t0, acc3
 	MOVQ (8*2)(x_ptr), DX
 	MULXQ DX, AX, t0
-	ADCQ AX, acc4
-	ADCQ t0, acc5
+	ADCXQ AX, acc4
+	ADCXQ t0, acc5
 	MOVQ (8*3)(x_ptr), DX
 	MULXQ DX, AX, x_ptr
-	ADCQ AX, y_ptr
-	ADCQ BX, x_ptr
+	ADCXQ AX, y_ptr
+	ADCXQ BX, x_ptr
 	// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
 	p256SqrMontReduce()
@@ -931,65 +930,68 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
 	MOVQ acc4, mul0
 	MULQ mul0
 	MOVQ mul0, acc0
-	MOVQ DX, acc4
+	MOVQ mul1, acc4
 	MOVQ acc5, mul0
 	MULQ mul0
 	ADDQ acc4, acc1
 	ADCQ mul0, acc2
-	ADCQ $0, DX
-	MOVQ DX, acc4
+	ADCQ $0, mul1
+	MOVQ mul1, acc4
 	MOVQ acc6, mul0
 	MULQ mul0
 	ADDQ acc4, acc3
 	ADCQ mul0, t0
-	ADCQ $0, DX
-	MOVQ DX, acc4
+	ADCQ $0, mul1
+	MOVQ mul1, acc4
 	MOVQ acc7, mul0
 	MULQ mul0
 	ADDQ acc4, t1
 	ADCQ mul0, t2
-	ADCQ DX, t3
+	ADCQ mul1, t3
 	// T = [t3, t2,, t1, t0, acc3, acc2, acc1, acc0]
 	sm2P256SqrReductionInternal()
 	RET
 internalSqrBMI2:
+	XORQ acc0, acc0
+	XORQ t2, t2
 	MOVQ acc4, mul1
 	MULXQ acc5, acc1, acc2
 	MULXQ acc6, mul0, acc3
-	ADDQ mul0, acc2
+	ADOXQ mul0, acc2
 	MULXQ acc7, mul0, t0
-	ADCQ mul0, acc3
-	ADCQ $0, t0
+	ADOXQ mul0, acc3
+	ADOXQ t2, t0
 	MOVQ acc5, mul1
-	MULXQ acc6, mul0, acc0
-	ADDQ mul0, acc3
-	ADCQ acc0, t0
+	MULXQ acc6, mul0, t3
+	ADOXQ mul0, acc3
 	MULXQ acc7, mul0, t1
-	ADCQ $0, t1
-	ADDQ mul0, t0
+	ADCXQ t3, mul0
+	ADOXQ mul0, t0
+	ADCXQ t2, t1
 	MOVQ acc6, mul1
 	MULXQ acc7, mul0, t2
-	ADCQ mul0, t1
-	ADCQ $0, t2
+	ADOXQ mul0, t1
+	ADOXQ acc0, t2
+
 	XORQ t3, t3
 	// *2
-	ADDQ acc1, acc1
-	ADCQ acc2, acc2
-	ADCQ acc3, acc3
-	ADCQ t0, t0
-	ADCQ t1, t1
-	ADCQ t2, t2
-	ADCQ $0, t3
+	ADOXQ acc1, acc1
+	ADOXQ acc2, acc2
+	ADOXQ acc3, acc3
+	ADOXQ t0, t0
+	ADOXQ t1, t1
+	ADOXQ t2, t2
+	ADOXQ acc0, t3
 	// Missing products
 	MOVQ acc4, mul1
@@ -998,18 +1000,18 @@ internalSqrBMI2:
 	MOVQ acc5, mul1
 	MULXQ mul1, mul0, acc4
-	ADCQ mul0, acc2
-	ADCQ acc4, acc3
+	ADCXQ mul0, acc2
+	ADCXQ acc4, acc3
 	MOVQ acc6, mul1
 	MULXQ mul1, mul0, acc4
-	ADCQ mul0, t0
-	ADCQ acc4, t1
+	ADCXQ mul0, t0
+	ADCXQ acc4, t1
 	MOVQ acc7, mul1
 	MULXQ mul1, mul0, acc4
-	ADCQ mul0, t2
-	ADCQ acc4, t3
+	ADCXQ mul0, t2
+	ADCXQ acc4, t3
 	// T = [t3, t2,, t1, t0, acc3, acc2, acc1, acc0]
 	sm2P256SqrReductionInternal()
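
Editor's note (not part of the patch): every hunk above applies the same MULX/ADCX/ADOX "dual carry chain" rewrite. MULXQ multiplies by DX without touching the flags, ADCXQ propagates a carry only through CF, and ADOXQ only through OF, so the additions that fold each low half of a partial product into the accumulator run on one flag chain while the high halves are carried into the next low half on the other; a single XORQ of a scratch register clears both flags and provides the zero operand that replaces the old "ADCQ $0, r" steps. The sketch below is a minimal, self-contained illustration of that pattern, not code from this repository: the package name dualcarry, the helper addMul, and the register choices are assumptions made for the example, and it presumes a CPU with BMI2 and ADX (a real caller would gate it behind a feature check, as the sqrBMI2/mulBMI2 paths here do).

// File addmul_amd64.go (hypothetical, illustrative only)
//
//go:build amd64

package dualcarry

// addMul computes acc[0..4] += x[0..3] * y using MULX/ADCX/ADOX.
// Assumes BMI2 and ADX are available.
//
//go:noescape
func addMul(acc *[5]uint64, x *[4]uint64, y uint64)

// File addmul_amd64.s (hypothetical, illustrative only)
#include "textflag.h"

// func addMul(acc *[5]uint64, x *[4]uint64, y uint64)
TEXT ·addMul(SB), NOSPLIT, $0-24
	MOVQ acc+0(FP), DI
	MOVQ x+8(FP), SI
	MOVQ y+16(FP), DX        // MULXQ multiplies by DX implicitly

	MOVQ (8*0)(DI), R8       // load the accumulator limbs
	MOVQ (8*1)(DI), R9
	MOVQ (8*2)(DI), R10
	MOVQ (8*3)(DI), R11
	MOVQ (8*4)(DI), R12

	XORQ R13, R13            // R13 = 0; also clears CF and OF, starting both chains

	MULXQ (8*0)(SI), AX, BX  // BX:AX = x[0]*y, flags untouched
	ADOXQ AX, R8             // OF chain: fold the low half into acc[0]

	MULXQ (8*1)(SI), AX, CX  // CX:AX = x[1]*y
	ADCXQ BX, AX             // CF chain: previous high half joins this low half
	ADOXQ AX, R9

	MULXQ (8*2)(SI), AX, BX
	ADCXQ CX, AX
	ADOXQ AX, R10

	MULXQ (8*3)(SI), AX, CX
	ADCXQ BX, AX
	ADOXQ AX, R11

	ADCXQ R13, CX            // close the CF chain into the top high half
	ADOXQ CX, R12            // close the OF chain into acc[4]; a final overflow
	                         // out of acc[4] is dropped, so callers must leave
	                         // headroom there (as the fifth limb does above)

	MOVQ R8, (8*0)(DI)
	MOVQ R9, (8*1)(DI)
	MOVQ R10, (8*2)(DI)
	MOVQ R11, (8*3)(DI)
	MOVQ R12, (8*4)(DI)
	RET

Arithmetically this is identical to the ADDQ/ADCQ version it replaces; the interleaving only removes the serial dependency on the single carry flag, which is what the hunks above do row by row for p256Mul, sqrBMI2, and the two sm2P256SqrInternal BMI2 paths.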