diff --git a/internal/sm2ec/p256_asm_arm64.s b/internal/sm2ec/p256_asm_arm64.s index e1a5f15..6c93e1b 100644 --- a/internal/sm2ec/p256_asm_arm64.s +++ b/internal/sm2ec/p256_asm_arm64.s @@ -869,58 +869,62 @@ TEXT sm2P256SqrInternal<>(SB),NOSPLIT,$0 ADCS t0, acc6, acc6 UMULH x3, x3, t1 ADCS t1, acc7, acc7 + // First reduction step LSL $32, acc0, y0 LSR $32, acc0, y1 - ADDS acc0, acc1, acc1 - ADCS $0, acc2, acc2 - ADCS $0, acc3, acc3 - ADC $0, acc0, acc0 - SUBS y0, acc1 SBCS y1, acc2 SBCS y0, acc3 - SBC y1, acc0 + SBC y1, acc0, y0 + + ADDS acc0, acc1, acc1 + ADCS $0, acc2, acc2 + ADCS $0, acc3, acc3 + ADC $0, y0, acc0 + // Second reduction step LSL $32, acc1, y0 LSR $32, acc1, y1 - ADDS acc1, acc2, acc2 - ADCS $0, acc3, acc3 - ADCS $0, acc0, acc0 - ADC $0, acc1, acc1 - SUBS y0, acc2 SBCS y1, acc3 SBCS y0, acc0 - SBC y1, acc1 + SBC y1, acc1, y0 + + ADDS acc1, acc2, acc2 + ADCS $0, acc3, acc3 + ADCS $0, acc0, acc0 + ADC $0, y0, acc1 + // Third reduction step LSL $32, acc2, y0 LSR $32, acc2, y1 - ADDS acc2, acc3, acc3 - ADCS $0, acc0, acc0 - ADCS $0, acc1, acc1 - ADC $0, acc2, acc2 - SUBS y0, acc3 SBCS y1, acc0 SBCS y0, acc1 - SBC y1, acc2 + SBC y1, acc2, y0 + + ADDS acc2, acc3, acc3 + ADCS $0, acc0, acc0 + ADCS $0, acc1, acc1 + ADC $0, y0, acc2 + // Last reduction step LSL $32, acc3, y0 LSR $32, acc3, y1 - ADDS acc3, acc0, acc0 - ADCS $0, acc1, acc1 - ADCS $0, acc2, acc2 - ADC $0, acc3, acc3 - SUBS y0, acc0 SBCS y1, acc1 SBCS y0, acc2 - SBC y1, acc3 + SBC y1, acc3, y0 + + ADDS acc3, acc0, acc0 + ADCS $0, acc1, acc1 + ADCS $0, acc2, acc2 + ADC $0, y0, acc3 // Add bits [511:256] of the sqr result ADDS acc4, acc0, acc0 diff --git a/internal/sm2ec/p256_macros_amd64.s b/internal/sm2ec/p256_macros_amd64.s index 5cc2728..dd5fb3e 100644 --- a/internal/sm2ec/p256_macros_amd64.s +++ b/internal/sm2ec/p256_macros_amd64.s @@ -30,72 +30,76 @@ GLOBL p256one<>(SB), 8, $32 #define p256SqrMontReduce() \ \ // First reduction step, [p3, p2, p1, p0] = [1, -0x100000000, 0, (1 - 0x100000000), 
-1] - MOVQ acc0, AX \ - MOVQ acc0, DX \ - SHLQ $32, AX \ // AX = L(acc0 * 2^32), low part - SHRQ $32, DX \ // DX = H(acc0 * 2^32), high part - \ // calculate the positive part first: [1, 0, 0, 1] * acc0 + [0, acc3, acc2, acc1], - \ // due to (-1) * acc0 + acc0 == 0, so last lowest lamb 0 is dropped directly, no carry. - ADDQ acc0, acc1 \ // acc1' = L (acc0 + acc1) - ADCQ $0, acc2 \ // acc2' = acc2 + carry1 - ADCQ $0, acc3 \ // acc3' = acc3 + carry2 - ADCQ $0, acc0 \ // acc0' = acc0 + carry3 + MOVQ acc0, AX \ + MOVQ acc0, DX \ + SHLQ $32, AX \ // AX = L(acc0 * 2^32), low part + SHRQ $32, DX \ // DX = H(acc0 * 2^32), high part \// calculate the negative part: [0, -0x100000000, 0, -0x100000000] * acc0 SUBQ AX, acc1 \ SBBQ DX, acc2 \ SBBQ AX, acc3 \ + MOVQ acc0, AX \ SBBQ DX, acc0 \ + \ // calculate the positive part: [1, 0, 0, 1] * acc0 + [0, acc3, acc2, acc1], + \ // due to (-1) * acc0 + acc0 == 0, so last lowest limb 0 is dropped directly, no carry. + ADDQ AX, acc1 \ // acc1' = L (acc0 + acc1) + ADCQ $0, acc2 \ // acc2' = acc2 + carry1 + ADCQ $0, acc3 \ // acc3' = acc3 + carry2 + ADCQ $0, acc0 \ // acc0' = acc0 + carry3 \ // Second reduction step MOVQ acc1, AX \ MOVQ acc1, DX \ - SHLQ $32, AX \ - SHRQ $32, DX \ - \ - ADDQ acc1, acc2 \ - ADCQ $0, acc3 \ - ADCQ $0, acc0 \ - ADCQ $0, acc1 \ + SHLQ $32, AX \ + SHRQ $32, DX \ \ SUBQ AX, acc2 \ SBBQ DX, acc3 \ SBBQ AX, acc0 \ + MOVQ acc1, AX \ SBBQ DX, acc1 \ + \ + ADDQ AX, acc2 \ + ADCQ $0, acc3 \ + ADCQ $0, acc0 \ + ADCQ $0, acc1 \ \ // Third reduction step MOVQ acc2, AX \ MOVQ acc2, DX \ - SHLQ $32, AX \ - SHRQ $32, DX \ - \ - ADDQ acc2, acc3 \ - ADCQ $0, acc0 \ - ADCQ $0, acc1 \ - ADCQ $0, acc2 \ + SHLQ $32, AX \ + SHRQ $32, DX \ \ SUBQ AX, acc3 \ SBBQ DX, acc0 \ SBBQ AX, acc1 \ + MOVQ acc2, AX \ SBBQ DX, acc2 \ - \ // Last reduction step - XORQ t0, t0 \ - MOVQ acc3, AX \ - MOVQ acc3, DX \ - SHLQ $32, AX \ - SHRQ $32, DX \ \ - ADDQ acc3, acc0 \ + ADDQ AX, acc3 \ + ADCQ $0, acc0 \ ADCQ $0, acc1 \ ADCQ $0, acc2 \ - ADCQ
$0, acc3 \ + \ // Last reduction step + XORQ t0, t0 \ + MOVQ acc3, AX \ + MOVQ acc3, DX \ + SHLQ $32, AX \ + SHRQ $32, DX \ \ SUBQ AX, acc0 \ SBBQ DX, acc1 \ SBBQ AX, acc2 \ + MOVQ acc3, AX \ SBBQ DX, acc3 \ + \ + ADDQ AX, acc0 \ + ADCQ $0, acc1 \ + ADCQ $0, acc2 \ + ADCQ $0, acc3 \ \ // Add bits [511:256] of the sqr result - ADCQ acc4, acc0 \ - ADCQ acc5, acc1 \ - ADCQ y_ptr, acc2 \ - ADCQ x_ptr, acc3 \ + ADCQ acc4, acc0 \ + ADCQ acc5, acc1 \ + ADCQ y_ptr, acc2 \ + ADCQ x_ptr, acc3 \ ADCQ $0, t0 #define p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \ @@ -151,60 +155,64 @@ GLOBL p256one<>(SB), 8, $32 SHLQ $32, mul0 \ SHRQ $32, mul1 \ \ - ADDQ acc0, acc1 \ - ADCQ $0, acc2 \ - ADCQ $0, acc3 \ - ADCQ $0, acc0 \ - \ SUBQ mul0, acc1 \ SBBQ mul1, acc2 \ SBBQ mul0, acc3 \ + MOVQ acc0, mul0 \ SBBQ mul1, acc0 \ + \ + ADDQ mul0, acc1 \ + ADCQ $0, acc2 \ + ADCQ $0, acc3 \ + ADCQ $0, acc0 \ \ // Second reduction step MOVQ acc1, mul0 \ MOVQ acc1, mul1 \ SHLQ $32, mul0 \ SHRQ $32, mul1 \ \ - ADDQ acc1, acc2 \ - ADCQ $0, acc3 \ - ADCQ $0, acc0 \ - ADCQ $0, acc1 \ - \ SUBQ mul0, acc2 \ SBBQ mul1, acc3 \ SBBQ mul0, acc0 \ + MOVQ acc1, mul0 \ SBBQ mul1, acc1 \ + \ + ADDQ mul0, acc2 \ + ADCQ $0, acc3 \ + ADCQ $0, acc0 \ + ADCQ $0, acc1 \ \ // Third reduction step MOVQ acc2, mul0 \ MOVQ acc2, mul1 \ SHLQ $32, mul0 \ SHRQ $32, mul1 \ \ - ADDQ acc2, acc3 \ - ADCQ $0, acc0 \ - ADCQ $0, acc1 \ - ADCQ $0, acc2 \ - \ SUBQ mul0, acc3 \ SBBQ mul1, acc0 \ SBBQ mul0, acc1 \ + MOVQ acc2, mul0 \ SBBQ mul1, acc2 \ + \ + ADDQ mul0, acc3 \ + ADCQ $0, acc0 \ + ADCQ $0, acc1 \ + ADCQ $0, acc2 \ \ // Last reduction step MOVQ acc3, mul0 \ MOVQ acc3, mul1 \ SHLQ $32, mul0 \ SHRQ $32, mul1 \ \ - ADDQ acc3, acc0 \ - ADCQ $0, acc1 \ - ADCQ $0, acc2 \ - ADCQ $0, acc3 \ - \ SUBQ mul0, acc0 \ SBBQ mul1, acc1 \ SBBQ mul0, acc2 \ + MOVQ acc3, mul0 \ SBBQ mul1, acc3 \ + \ + ADDQ mul0, acc0 \ + ADCQ $0, acc1 \ + ADCQ $0, acc2 \ + ADCQ $0, acc3 \ MOVQ $0, mul0 \ \ // Add bits [511:256] of the result ADCQ 
acc0, t0 \