diff --git a/internal/sm2ec/p256_asm_amd64.s b/internal/sm2ec/p256_asm_amd64.s
index 43f1a20..75e126b 100644
--- a/internal/sm2ec/p256_asm_amd64.s
+++ b/internal/sm2ec/p256_asm_amd64.s
@@ -291,7 +291,7 @@ ordSqrLoop:
 	ADCQ DX, acc2
 	ADCQ $0, acc3
 	ADCQ $0, acc0
-	// calculate the positive part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
+	// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
 	MOVQ t0, AX
 	MOVQ t0, DX
 	SHLQ $32, AX
@@ -484,7 +484,7 @@ ordSqrLoopBMI2:
 	ADCQ t1, acc2    // (carry4, acc2) = acc2 + t1 + carry3
 	ADCQ $0, acc3    // (carry5, acc3) = acc3 + carry4
 	ADCQ $0, acc0    // acc0 = t0 + carry5
-	// calculate the positive part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
+	// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
 	MOVQ t0, AX
 	//MOVQ t0, DX // This is not required due to t0=DX already
 	SHLQ $32, AX
@@ -759,66 +759,8 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
 	ADDQ mul0, acc6
 	ADCQ $0, mul1
 	MOVQ mul1, acc7
-	// First reduction step
-	MOVQ acc0, mul0
-	MOVQ acc0, mul1
-	SHLQ $32, mul0
-	SHRQ $32, mul1
-
-	ADDQ acc0, acc1
-	ADCQ $0, acc2
-	ADCQ $0, acc3
-	ADCQ $0, acc0
+	sm2P256MulReductionInternal()
 
-	SUBQ mul0, acc1
-	SBBQ mul1, acc2
-	SBBQ mul0, acc3
-	SBBQ mul1, acc0
-	// Second reduction step
-	MOVQ acc1, mul0
-	MOVQ acc1, mul1
-	SHLQ $32, mul0
-	SHRQ $32, mul1
-
-	ADDQ acc1, acc2
-	ADCQ $0, acc3
-	ADCQ $0, acc0
-	ADCQ $0, acc1
-
-	SUBQ mul0, acc2
-	SBBQ mul1, acc3
-	SBBQ mul0, acc0
-	SBBQ mul1, acc1
-	// Third reduction step
-	MOVQ acc2, mul0
-	MOVQ acc2, mul1
-	SHLQ $32, mul0
-	SHRQ $32, mul1
-
-	ADDQ acc2, acc3
-	ADCQ $0, acc0
-	ADCQ $0, acc1
-	ADCQ $0, acc2
-
-	SUBQ mul0, acc3
-	SBBQ mul1, acc0
-	SBBQ mul0, acc1
-	SBBQ mul1, acc2
-	// Last reduction step
-	MOVQ acc3, mul0
-	MOVQ acc3, mul1
-	SHLQ $32, mul0
-	SHRQ $32, mul1
-
-	ADDQ acc3, acc0
-	ADCQ $0, acc1
-	ADCQ $0, acc2
-	ADCQ $0, acc3
-
-	SUBQ mul0, acc0
-	SBBQ mul1, acc1
-	SBBQ mul0, acc2
-	SBBQ mul1, acc3
 	MOVQ $0, BP
 	// Add bits [511:256] of the result
 	ADCQ acc0, acc4
@@ -918,66 +860,7 @@ internalMulBMI2:
 	ADDQ mul0, acc6
 	ADCQ $0, acc7
 
-	// First reduction step
-	MOVQ acc0, mul0
-	MOVQ acc0, mul1
-	SHLQ $32, mul0
-	SHRQ $32, mul1
-
-	ADDQ acc0, acc1
-	ADCQ $0, acc2
-	ADCQ $0, acc3
-	ADCQ $0, acc0
-
-	SUBQ mul0, acc1
-	SBBQ mul1, acc2
-	SBBQ mul0, acc3
-	SBBQ mul1, acc0
-	// Second reduction step
-	MOVQ acc1, mul0
-	MOVQ acc1, mul1
-	SHLQ $32, mul0
-	SHRQ $32, mul1
-
-	ADDQ acc1, acc2
-	ADCQ $0, acc3
-	ADCQ $0, acc0
-	ADCQ $0, acc1
-
-	SUBQ mul0, acc2
-	SBBQ mul1, acc3
-	SBBQ mul0, acc0
-	SBBQ mul1, acc1
-	// Third reduction step
-	MOVQ acc2, mul0
-	MOVQ acc2, mul1
-	SHLQ $32, mul0
-	SHRQ $32, mul1
-
-	ADDQ acc2, acc3
-	ADCQ $0, acc0
-	ADCQ $0, acc1
-	ADCQ $0, acc2
-
-	SUBQ mul0, acc3
-	SBBQ mul1, acc0
-	SBBQ mul0, acc1
-	SBBQ mul1, acc2
-	// Last reduction step
-	MOVQ acc3, mul0
-	MOVQ acc3, mul1
-	SHLQ $32, mul0
-	SHRQ $32, mul1
-
-	ADDQ acc3, acc0
-	ADCQ $0, acc1
-	ADCQ $0, acc2
-	ADCQ $0, acc3
-
-	SUBQ mul0, acc0
-	SBBQ mul1, acc1
-	SBBQ mul0, acc2
-	SBBQ mul1, acc3
+	sm2P256MulReductionInternal()
 	MOVQ $0, BP
 	// Add bits [511:256] of the result
 	ADCQ acc0, acc4
diff --git a/internal/sm2ec/p256_asm_arm64.s b/internal/sm2ec/p256_asm_arm64.s
index 6c93e1b..398bde4 100644
--- a/internal/sm2ec/p256_asm_arm64.s
+++ b/internal/sm2ec/p256_asm_arm64.s
@@ -207,54 +207,57 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
 	LSL $32, acc0, y0
 	LSR $32, acc0, y1
 
-	ADDS acc0, acc1, acc1
-	ADCS $0, acc2, acc2
-	ADCS $0, acc3, acc3
-	ADC $0, acc0, acc0
-
 	SUBS y0, acc1
 	SBCS y1, acc2
 	SBCS y0, acc3
-	SBC y1, acc0
+	SBC y1, acc0, y0
+
+	ADDS acc0, acc1, acc1
+	ADCS $0, acc2, acc2
+	ADCS $0, acc3, acc3
+	ADC $0, y0, acc0
+
 	// Second reduction step
 	LSL $32, acc1, y0
 	LSR $32, acc1, y1
 
-	ADDS acc1, acc2, acc2
-	ADCS $0, acc3, acc3
-	ADCS $0, acc0, acc0
-	ADC $0, acc1, acc1
-
 	SUBS y0, acc2
 	SBCS y1, acc3
 	SBCS y0, acc0
-	SBC y1, acc1
+	SBC y1, acc1, y0
+
+	ADDS acc1, acc2, acc2
+	ADCS $0, acc3, acc3
+	ADCS $0, acc0, acc0
+	ADC $0, y0, acc1
+
 	// Third reduction step
 	LSL $32, acc2, y0
 	LSR $32, acc2, y1
 
-	ADDS acc2, acc3, acc3
-	ADCS $0, acc0, acc0
-	ADCS $0, acc1, acc1
-	ADC $0, acc2, acc2
-
 	SUBS y0, acc3
 	SBCS y1, acc0
 	SBCS y0, acc1
-	SBC y1, acc2
+	SBC y1, acc2, y0
+
+	ADDS acc2, acc3, acc3
+	ADCS $0, acc0, acc0
+	ADCS $0, acc1, acc1
+	ADC $0, y0, acc2
+
 	// Last reduction step
 	LSL $32, acc3, y0
 	LSR $32, acc3, y1
 
-	ADDS acc3, acc0, acc0
-	ADCS $0, acc1, acc1
-	ADCS $0, acc2, acc2
-	ADC $0, acc3, acc3
-
 	SUBS y0, acc0
 	SBCS y1, acc1
 	SBCS y0, acc2
-	SBC y1, acc3
+	SBC y1, acc3, y0
+
+	ADDS acc3, acc0, acc0
+	ADCS $0, acc1, acc1
+	ADCS $0, acc2, acc2
+	ADC $0, y0, acc3
 
 	SUBS const0, acc0, t0
 	SBCS const1, acc1, t1
@@ -967,15 +970,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
 	LSL $32, acc0, t0
 	LSR $32, acc0, t1
 
-	ADDS acc0, acc1, acc1
-	ADCS $0, acc2, acc2
-	ADCS $0, acc3, acc3
-	ADC $0, acc0, acc0
-
 	SUBS t0, acc1
 	SBCS t1, acc2
 	SBCS t0, acc3
-	SBC t1, acc0
+	SBC t1, acc0, t0
+
+	ADDS acc0, acc1, acc1
+	ADCS $0, acc2, acc2
+	ADCS $0, acc3, acc3
+	ADC $0, t0, acc0
 
 	// y[1] * x
 	MUL y1, x0, t0
@@ -1003,15 +1006,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
 	LSL $32, acc1, t0
 	LSR $32, acc1, t1
 
-	ADDS acc1, acc2, acc2
-	ADCS $0, acc3, acc3
-	ADCS $0, acc0, acc0
-	ADC $0, acc1, acc1
-
 	SUBS t0, acc2
 	SBCS t1, acc3
 	SBCS t0, acc0
-	SBC t1, acc1
+	SBC t1, acc1, t0
+
+	ADDS acc1, acc2, acc2
+	ADCS $0, acc3, acc3
+	ADCS $0, acc0, acc0
+	ADC $0, t0, acc1
 
 	// y[2] * x
 	MUL y2, x0, t0
@@ -1039,15 +1042,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
 	LSL $32, acc2, t0
 	LSR $32, acc2, t1
 
-	ADDS acc2, acc3, acc3
-	ADCS $0, acc0, acc0
-	ADCS $0, acc1, acc1
-	ADC $0, acc2, acc2
-
 	SUBS t0, acc3
 	SBCS t1, acc0
 	SBCS t0, acc1
-	SBC t1, acc2
+	SBC t1, acc2, t0
+
+	ADDS acc2, acc3, acc3
+	ADCS $0, acc0, acc0
+	ADCS $0, acc1, acc1
+	ADC $0, t0, acc2
 
 	// y[3] * x
 	MUL y3, x0, t0
@@ -1075,15 +1078,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
 	LSL $32, acc3, t0
 	LSR $32, acc3, t1
 
-	ADDS acc3, acc0, acc0
-	ADCS $0, acc1, acc1
-	ADCS $0, acc2, acc2
-	ADC $0, acc3, acc3
-
 	SUBS t0, acc0
 	SBCS t1, acc1
 	SBCS t0, acc2
-	SBC t1, acc3
+	SBC t1, acc3, t0
+
+	ADDS acc3, acc0, acc0
+	ADCS $0, acc1, acc1
+	ADCS $0, acc2, acc2
+	ADC $0, t0, acc3
 
 	// Add bits [511:256] of the mul result
 	ADDS acc4, acc0, acc0
diff --git a/internal/sm2ec/p256_common_amd64.s b/internal/sm2ec/p256_common_amd64.s
index 32e874d..614355b 100644
--- a/internal/sm2ec/p256_common_amd64.s
+++ b/internal/sm2ec/p256_common_amd64.s
@@ -207,19 +207,19 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
 	SHLQ $32, AX
 	SHRQ $32, DX
 
-	ADDQ acc0, acc1
+	SUBQ AX, acc1
+	SBBQ DX, acc2
+	SBBQ AX, acc3
+	MOVQ acc0, AX
+	SBBQ DX, acc0
+
+	ADDQ AX, acc1
 	ADCQ $0, acc2
 	ADCQ $0, acc3
 	ADCQ acc0, acc4
 	ADCQ $0, acc5
 
-	SUBQ AX, acc1
-	SBBQ DX, acc2
-	SBBQ AX, acc3
-	SBBQ DX, acc4
-	SBBQ $0, acc5
 	XORQ acc0, acc0
-
 	// x * y[1]
 	MOVQ (8*1)(y_ptr), t0
 
@@ -258,19 +258,19 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
 	SHLQ $32, AX
 	SHRQ $32, DX
 
-	ADDQ acc1, acc2
+	SUBQ AX, acc2
+	SBBQ DX, acc3
+	SBBQ AX, acc4
+	MOVQ acc1, AX
+	SBBQ DX, acc1
+
+	ADDQ AX, acc2
 	ADCQ $0, acc3
 	ADCQ $0, acc4
 	ADCQ acc1, acc5
 	ADCQ $0, acc0
 
-	SUBQ AX, acc2
-	SBBQ DX, acc3
-	SBBQ AX, acc4
-	SBBQ DX, acc5
-	SBBQ $0, acc0
 	XORQ acc1, acc1
-
 	// x * y[2]
 	MOVQ (8*2)(y_ptr), t0
 
@@ -309,17 +309,18 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
 	SHLQ $32, AX
 	SHRQ $32, DX
 
-	ADDQ acc2, acc3
+	SUBQ AX, acc3
+	SBBQ DX, acc4
+	SBBQ AX, acc5
+	MOVQ acc2, AX
+	SBBQ DX, acc2
+
+	ADDQ AX, acc3
 	ADCQ $0, acc4
 	ADCQ $0, acc5
 	ADCQ acc2, acc0
 	ADCQ $0, acc1
 
-	SUBQ AX, acc3
-	SBBQ DX, acc4
-	SBBQ AX, acc5
-	SBBQ DX, acc0
-	SBBQ $0, acc1
 	XORQ acc2, acc2
 	// x * y[3]
 	MOVQ (8*3)(y_ptr), t0
@@ -359,17 +360,18 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
 	SHLQ $32, AX
 	SHRQ $32, DX
 
-	ADDQ acc3, acc4
+	SUBQ AX, acc4
+	SBBQ DX, acc5
+	SBBQ AX, acc0
+	MOVQ acc3, AX
+	SBBQ DX, acc3
+
+	ADDQ AX, acc4
 	ADCQ $0, acc5
 	ADCQ $0, acc0
 	ADCQ acc3, acc1
 	ADCQ $0, acc2
 
-	SUBQ AX, acc4
-	SBBQ DX, acc5
-	SBBQ AX, acc0
-	SBBQ DX, acc1
-	SBBQ $0, acc2
 	p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
 	RET
 
@@ -395,19 +397,19 @@ mulBMI2:
 	SHLQ $32, AX
 	SHRQ $32, DX
 
-	ADDQ acc0, acc1
+	SUBQ AX, acc1
+	SBBQ DX, acc2
+	SBBQ AX, acc3
+	MOVQ acc0, AX
+	SBBQ DX, acc0
+
+	ADDQ AX, acc1
 	ADCQ $0, acc2
 	ADCQ $0, acc3
 	ADCQ acc0, acc4
 	ADCQ $0, acc5
-
-	SUBQ AX, acc1
-	SBBQ DX, acc2
-	SBBQ AX, acc3
-	SBBQ DX, acc4
-	SBBQ $0, acc5
-	XORQ acc0, acc0
+	XORQ acc0, acc0
 
 	// x * y[1]
 	MOVQ (8*1)(y_ptr), DX
 	MULXQ (8*0)(x_ptr), AX, BX
@@ -436,19 +438,19 @@ mulBMI2:
 	SHLQ $32, AX
 	SHRQ $32, DX
 
-	ADDQ acc1, acc2
+	SUBQ AX, acc2
+	SBBQ DX, acc3
+	SBBQ AX, acc4
+	MOVQ acc1, AX
+	SBBQ DX, acc1
+
+	ADDQ AX, acc2
 	ADCQ $0, acc3
 	ADCQ $0, acc4
 	ADCQ acc1, acc5
 	ADCQ $0, acc0
 
-	SUBQ AX, acc2
-	SBBQ DX, acc3
-	SBBQ AX, acc4
-	SBBQ DX, acc5
-	SBBQ $0, acc0
 	XORQ acc1, acc1
-
 	// x * y[2]
 	MOVQ (8*2)(y_ptr), DX
 
@@ -477,17 +479,18 @@ mulBMI2:
 	SHLQ $32, AX
 	SHRQ $32, DX
 
-	ADDQ acc2, acc3
+	SUBQ AX, acc3
+	SBBQ DX, acc4
+	SBBQ AX, acc5
+	MOVQ acc2, AX
+	SBBQ DX, acc2
+
+	ADDQ AX, acc3
 	ADCQ $0, acc4
 	ADCQ $0, acc5
 	ADCQ acc2, acc0
 	ADCQ $0, acc1
 
-	SUBQ AX, acc3
-	SBBQ DX, acc4
-	SBBQ AX, acc5
-	SBBQ DX, acc0
-	SBBQ $0, acc1
 	XORQ acc2, acc2
 	// x * y[3]
 	MOVQ (8*3)(y_ptr), DX
@@ -517,17 +520,18 @@ mulBMI2:
 	SHLQ $32, AX
 	SHRQ $32, DX
 
-	ADDQ acc3, acc4
+	SUBQ AX, acc4
+	SBBQ DX, acc5
+	SBBQ AX, acc0
+	MOVQ acc3, AX
+	SBBQ DX, acc3
+
+	ADDQ AX, acc4
 	ADCQ $0, acc5
 	ADCQ $0, acc0
 	ADCQ acc3, acc1
 	ADCQ $0, acc2
 
-	SUBQ AX, acc4
-	SBBQ DX, acc5
-	SBBQ AX, acc0
-	SBBQ DX, acc1
-	SBBQ $0, acc2
 	p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
 	RET
 
@@ -550,32 +554,35 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
 	SHLQ $32, AX
 	SHRQ $32, DX
 
-	ADDQ acc0, acc1
+	SUBQ AX, acc1
+	SBBQ DX, acc2
+	SBBQ AX, acc3
+	MOVQ acc0, AX
+	SBBQ DX, acc0
+
+	ADDQ AX, acc1
 	ADCQ $0, acc2
 	ADCQ $0, acc3
 	ADCQ acc0, acc4
 
-	SUBQ AX, acc1
-	SBBQ DX, acc2
-	SBBQ AX, acc3
-	SBBQ DX, acc4
 	XORQ acc5, acc5
-
 	// Second stage
 	MOVQ acc1, AX
 	MOVQ acc1, DX
 	SHLQ $32, AX
 	SHRQ $32, DX
 
-	ADDQ acc1, acc2
+	SUBQ AX, acc2
+	SBBQ DX, acc3
+	SBBQ AX, acc4
+	MOVQ acc1, AX
+	SBBQ DX, acc5
+
+	ADDQ AX, acc2
 	ADCQ $0, acc3
 	ADCQ $0, acc4
 	ADCQ acc1, acc5
 
-	SUBQ AX, acc2
-	SBBQ DX, acc3
-	SBBQ AX, acc4
-	SBBQ DX, acc5
 	XORQ acc0, acc0
 	// Third stage
 	MOVQ acc2, AX
@@ -583,15 +590,17 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
 	SHLQ $32, AX
 	SHRQ $32, DX
 
-	ADDQ acc2, acc3
+	SUBQ AX, acc3
+	SBBQ DX, acc4
+	SBBQ AX, acc5
+	MOVQ acc2, AX
+	SBBQ DX, acc2
+
+	ADDQ AX, acc3
 	ADCQ $0, acc4
 	ADCQ $0, acc5
 	ADCQ acc2, acc0
 
-	SUBQ AX, acc3
-	SBBQ DX, acc4
-	SBBQ AX, acc5
-	SBBQ DX, acc0
 	XORQ acc1, acc1
 	// Last stage
 	MOVQ acc3, AX
@@ -599,15 +608,16 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
 	SHLQ $32, AX
 	SHRQ $32, DX
 
-	ADDQ acc3, acc4
-	ADCQ $0, acc5
-	ADCQ $0, acc0
-	ADCQ acc3, acc1
-
 	SUBQ AX, acc4
 	SBBQ DX, acc5
 	SBBQ AX, acc0
-	SBBQ DX, acc1
+	MOVQ acc3, AX
+	SBBQ DX, acc3
+
+	ADDQ AX, acc4
+	ADCQ $0, acc5
+	ADCQ $0, acc0
+	ADCQ acc3, acc1
 
 	MOVQ acc4, x_ptr
 	MOVQ acc5, acc3
diff --git a/internal/sm2ec/p256_macros_amd64.s b/internal/sm2ec/p256_macros_amd64.s
index dd5fb3e..976854c 100644
--- a/internal/sm2ec/p256_macros_amd64.s
+++ b/internal/sm2ec/p256_macros_amd64.s
@@ -237,6 +237,72 @@ GLOBL p256one<>(SB), 8, $32
 	CMOVQCS t2, acc6 \
 	CMOVQCS t3, acc7
 
+#define sm2P256MulReductionInternal() \
+	\// First reduction step
+	MOVQ acc0, mul0 \
+	MOVQ acc0, mul1 \
+	SHLQ $32, mul0 \
+	SHRQ $32, mul1 \
+	\
+	SUBQ mul0, acc1 \
+	SBBQ mul1, acc2 \
+	SBBQ mul0, acc3 \
+	MOVQ acc0, mul0 \
+	SBBQ mul1, acc0 \
+	\
+	ADDQ mul0, acc1 \
+	ADCQ $0, acc2 \
+	ADCQ $0, acc3 \
+	ADCQ $0, acc0 \
+	\// Second reduction step
+	MOVQ acc1, mul0 \
+	MOVQ acc1, mul1 \
+	SHLQ $32, mul0 \
+	SHRQ $32, mul1 \
+	\
+	SUBQ mul0, acc2 \
+	SBBQ mul1, acc3 \
+	SBBQ mul0, acc0 \
+	MOVQ acc1, mul0 \
+	SBBQ mul1, acc1 \
+	\
+	ADDQ mul0, acc2 \
+	ADCQ $0, acc3 \
+	ADCQ $0, acc0 \
+	ADCQ $0, acc1 \
+	\// Third reduction step
+	MOVQ acc2, mul0 \
+	MOVQ acc2, mul1 \
+	SHLQ $32, mul0 \
+	SHRQ $32, mul1 \
+	\
+	SUBQ mul0, acc3 \
+	SBBQ mul1, acc0 \
+	SBBQ mul0, acc1 \
+	MOVQ acc2, mul0 \
+	SBBQ mul1, acc2 \
+	\
+	ADDQ mul0, acc3 \
+	ADCQ $0, acc0 \
+	ADCQ $0, acc1 \
+	ADCQ $0, acc2 \
+	\// Last reduction step
+	MOVQ acc3, mul0 \
+	MOVQ acc3, mul1 \
+	SHLQ $32, mul0 \
+	SHRQ $32, mul1 \
+	\
+	SUBQ mul0, acc0 \
+	SBBQ mul1, acc1 \
+	SBBQ mul0, acc2 \
+	MOVQ acc3, mul0 \
+	SBBQ mul1, acc3 \
+	\
+	ADDQ mul0, acc0 \
+	ADCQ $0, acc1 \
+	ADCQ $0, acc2 \
+	ADCQ $0, acc3
+
 #define p256PointDoubleInit() \
 	MOVOU (16*0)(BX), X0 \
 	MOVOU (16*1)(BX), X1 \
diff --git a/internal/sm2ec/p256_plugin_amd64.s b/internal/sm2ec/p256_plugin_amd64.s
index 8594ca8..3223598 100644
--- a/internal/sm2ec/p256_plugin_amd64.s
+++ b/internal/sm2ec/p256_plugin_amd64.s
@@ -757,67 +757,9 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
 	ADDQ mul0, acc6
 	ADCQ $0, mul1
 	MOVQ mul1, acc7
-	// First reduction step
+
 	PEXTRQ $0, X0, acc0
-	MOVQ acc0, mul0
-	MOVQ acc0, mul1
-	SHLQ $32, mul0
-	SHRQ $32, mul1
-
-	ADDQ acc0, acc1
-	ADCQ $0, acc2
-	ADCQ $0, acc3
-	ADCQ $0, acc0
-
-	SUBQ mul0, acc1
-	SBBQ mul1, acc2
-	SBBQ mul0, acc3
-	SBBQ mul1, acc0
-	// Second reduction step
-	MOVQ acc1, mul0
-	MOVQ acc1, mul1
-	SHLQ $32, mul0
-	SHRQ $32, mul1
-
-	ADDQ acc1, acc2
-	ADCQ $0, acc3
-	ADCQ $0, acc0
-	ADCQ $0, acc1
-
-	SUBQ mul0, acc2
-	SBBQ mul1, acc3
-	SBBQ mul0, acc0
-	SBBQ mul1, acc1
-	// Third reduction step
-	MOVQ acc2, mul0
-	MOVQ acc2, mul1
-	SHLQ $32, mul0
-	SHRQ $32, mul1
-
-	ADDQ acc2, acc3
-	ADCQ $0, acc0
-	ADCQ $0, acc1
-	ADCQ $0, acc2
-
-	SUBQ mul0, acc3
-	SBBQ mul1, acc0
-	SBBQ mul0, acc1
-	SBBQ mul1, acc2
-	// Last reduction step
-	MOVQ acc3, mul0
-	MOVQ acc3, mul1
-	SHLQ $32, mul0
-	SHRQ $32, mul1
-
-	ADDQ acc3, acc0
-	ADCQ $0, acc1
-	ADCQ $0, acc2
-	ADCQ $0, acc3
-
-	SUBQ mul0, acc0
-	SBBQ mul1, acc1
-	SBBQ mul0, acc2
-	SBBQ mul1, acc3
+	sm2P256MulReductionInternal()
 	MOVQ $0, mul0
 	// Add bits [511:256] of the result
 	ADCQ acc0, acc4
@@ -918,67 +860,8 @@ internalMulBMI2:
 	ADDQ mul0, acc6
 	ADCQ $0, acc7
 
-	// First reduction step
 	PEXTRQ $0, X0, acc0
-	MOVQ acc0, mul0
-	MOVQ acc0, mul1
-	SHLQ $32, mul0
-	SHRQ $32, mul1
-
-	ADDQ acc0, acc1
-	ADCQ $0, acc2
-	ADCQ $0, acc3
-	ADCQ $0, acc0
-
-	SUBQ mul0, acc1
-	SBBQ mul1, acc2
-	SBBQ mul0, acc3
-	SBBQ mul1, acc0
-	// Second reduction step
-	MOVQ acc1, mul0
-	MOVQ acc1, mul1
-	SHLQ $32, mul0
-	SHRQ $32, mul1
-
-	ADDQ acc1, acc2
-	ADCQ $0, acc3
-	ADCQ $0, acc0
-	ADCQ $0, acc1
-
-	SUBQ mul0, acc2
-	SBBQ mul1, acc3
-	SBBQ mul0, acc0
-	SBBQ mul1, acc1
-	// Third reduction step
-	MOVQ acc2, mul0
-	MOVQ acc2, mul1
-	SHLQ $32, mul0
-	SHRQ $32, mul1
-
-	ADDQ acc2, acc3
-	ADCQ $0, acc0
-	ADCQ $0, acc1
-	ADCQ $0, acc2
-
-	SUBQ mul0, acc3
-	SBBQ mul1, acc0
-	SBBQ mul0, acc1
-	SBBQ mul1, acc2
-	// Last reduction step
-	MOVQ acc3, mul0
-	MOVQ acc3, mul1
-	SHLQ $32, mul0
-	SHRQ $32, mul1
-
-	ADDQ acc3, acc0
-	ADCQ $0, acc1
-	ADCQ $0, acc2
-	ADCQ $0, acc3
-
-	SUBQ mul0, acc0
-	SBBQ mul1, acc1
-	SBBQ mul0, acc2
-	SBBQ mul1, acc3
+	sm2P256MulReductionInternal()
 	MOVQ $0, mul0
 	// Add bits [511:256] of the result
 	ADCQ acc0, acc4
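For reference, the reordered reduction step used throughout this change can be read as the following portable Go sketch. It is illustrative only (the helper name, package name and limb layout are not part of the patch): the "negative part" [0, 0x100000000, 1, 0] * a0 is subtracted first, the original low limb is saved, and then added back with a plain carry chain, which is the SUB/SBB-before-ADD/ADC ordering the assembly now uses.

// Hypothetical illustration of one Montgomery reduction step for the SM2 prime
// p = 2^256 - 2^224 - 2^96 + 2^64 - 1; not part of the patch.
package sm2sketch

import "math/bits"

// reduceStep returns (a + a0*p) / 2^64 for a = [a0, a1, a2, a3] (little-endian
// 64-bit limbs), mirroring the instruction order of the rewritten assembly.
func reduceStep(a0, a1, a2, a3 uint64) (r0, r1, r2, r3 uint64) {
	lo := a0 << 32 // SHLQ $32 / LSL $32
	hi := a0 >> 32 // SHRQ $32 / LSR $32

	// Negative part: [a0, a3, a2, a1] - [0, 0x100000000, 1, 0]*a0, i.e.
	// subtract [lo, hi, lo, hi] with a borrow chain (SUBQ/SBBQ, SUBS/SBCS).
	var b uint64
	r0, b = bits.Sub64(a1, lo, 0)
	r1, b = bits.Sub64(a2, hi, b)
	r2, b = bits.Sub64(a3, lo, b)
	r3, _ = bits.Sub64(a0, hi, b) // the saved a0 is still needed below

	// Positive part: add the original a0 at the lowest remaining limb and
	// propagate the carry (ADDQ/ADCQ, ADDS/ADCS/ADC).
	var c uint64
	r0, c = bits.Add64(r0, a0, 0)
	r1, c = bits.Add64(r1, 0, c)
	r2, c = bits.Add64(r2, 0, c)
	r3, _ = bits.Add64(r3, 0, c)
	return
}

Doing the subtraction first keeps the borrow chain inside the four low limbs, which is why the p256Mul and p256FromMont stages no longer need the extra SBBQ into the upper limbs, and why the four identical steps of sm2P256MulInternal can collapse into the shared sm2P256MulReductionInternal() macro.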