From ef55df365775405130158c866fbc87c005ac48a6 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Tue, 7 Nov 2023 15:33:52 +0800 Subject: [PATCH] sm9/bn256: use neon inst. for conditional move --- sm9/bn256/select_arm64.s | 347 +++++++++++---------------------------- 1 file changed, 93 insertions(+), 254 deletions(-) diff --git a/sm9/bn256/select_arm64.s b/sm9/bn256/select_arm64.s index 44eec5e..09f6cd8 100644 --- a/sm9/bn256/select_arm64.s +++ b/sm9/bn256/select_arm64.s @@ -95,137 +95,57 @@ TEXT ·gfP12MovCond(SB),NOSPLIT,$0 MOVD b+16(FP), b_ptr MOVD cond+24(FP), R3 - CMP $0, R3 - // Two remarks: - // 1) Will want to revisit NEON, when support is better - // 2) CSEL might not be constant time on all ARM processors - LDP 0*16(a_ptr), (R4, R5) - LDP 1*16(a_ptr), (R6, R7) - LDP 2*16(a_ptr), (R8, R9) - LDP 0*16(b_ptr), (R16, R17) - LDP 1*16(b_ptr), (R19, R20) - LDP 2*16(b_ptr), (R21, R22) - CSEL EQ, R16, R4, R4 - CSEL EQ, R17, R5, R5 - CSEL EQ, R19, R6, R6 - CSEL EQ, R20, R7, R7 - CSEL EQ, R21, R8, R8 - CSEL EQ, R22, R9, R9 - STP (R4, R5), 0*16(res_ptr) - STP (R6, R7), 1*16(res_ptr) - STP (R8, R9), 2*16(res_ptr) + VEOR V0.B16, V0.B16, V0.B16 + VMOV R3, V1.S4 + VCMEQ V0.S4, V1.S4, V2.S4 - LDP 3*16(a_ptr), (R4, R5) - LDP 4*16(a_ptr), (R6, R7) - LDP 5*16(a_ptr), (R8, R9) - LDP 3*16(b_ptr), (R16, R17) - LDP 4*16(b_ptr), (R19, R20) - LDP 5*16(b_ptr), (R21, R22) - CSEL EQ, R16, R4, R4 - CSEL EQ, R17, R5, R5 - CSEL EQ, R19, R6, R6 - CSEL EQ, R20, R7, R7 - CSEL EQ, R21, R8, R8 - CSEL EQ, R22, R9, R9 - STP (R4, R5), 3*16(res_ptr) - STP (R6, R7), 4*16(res_ptr) - STP (R8, R9), 5*16(res_ptr) + VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16] + VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16] + VBIT V2.B16, V7.B16, V3.B16 + VBIT V2.B16, V8.B16, V4.B16 + VBIT V2.B16, V9.B16, V5.B16 + VBIT V2.B16, V10.B16, V6.B16 + VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr) - LDP 6*16(a_ptr), (R4, R5) - LDP 7*16(a_ptr), (R6, R7) - LDP 8*16(a_ptr), (R8, R9) - LDP 6*16(b_ptr), (R16, R17) - LDP 7*16(b_ptr), (R19, R20) - LDP 8*16(b_ptr), (R21, R22) - CSEL EQ, R16, R4, R4 - CSEL EQ, R17, R5, R5 - CSEL EQ, R19, R6, R6 - CSEL EQ, R20, R7, R7 - CSEL EQ, R21, R8, R8 - CSEL EQ, R22, R9, R9 - STP (R4, R5), 6*16(res_ptr) - STP (R6, R7), 7*16(res_ptr) - STP (R8, R9), 8*16(res_ptr) + VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16] + VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16] + VBIT V2.B16, V7.B16, V3.B16 + VBIT V2.B16, V8.B16, V4.B16 + VBIT V2.B16, V9.B16, V5.B16 + VBIT V2.B16, V10.B16, V6.B16 + VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr) - LDP 9*16(a_ptr), (R4, R5) - LDP 10*16(a_ptr), (R6, R7) - LDP 11*16(a_ptr), (R8, R9) - LDP 9*16(b_ptr), (R16, R17) - LDP 10*16(b_ptr), (R19, R20) - LDP 11*16(b_ptr), (R21, R22) - CSEL EQ, R16, R4, R4 - CSEL EQ, R17, R5, R5 - CSEL EQ, R19, R6, R6 - CSEL EQ, R20, R7, R7 - CSEL EQ, R21, R8, R8 - CSEL EQ, R22, R9, R9 - STP (R4, R5), 9*16(res_ptr) - STP (R6, R7), 10*16(res_ptr) - STP (R8, R9), 11*16(res_ptr) + VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16] + VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16] + VBIT V2.B16, V7.B16, V3.B16 + VBIT V2.B16, V8.B16, V4.B16 + VBIT V2.B16, V9.B16, V5.B16 + VBIT V2.B16, V10.B16, V6.B16 + VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr) - LDP 12*16(a_ptr), (R4, R5) - LDP 13*16(a_ptr), (R6, R7) - LDP 14*16(a_ptr), (R8, R9) - LDP 12*16(b_ptr), (R16, R17) - LDP 13*16(b_ptr), (R19, R20) - LDP 14*16(b_ptr), (R21, R22) - CSEL EQ, R16, R4, R4 - CSEL EQ, R17, R5, R5 - CSEL EQ, R19, R6, R6 - CSEL EQ, R20, R7, R7 - CSEL EQ, R21, R8, R8 - CSEL EQ, R22, R9, R9 - STP (R4, R5), 12*16(res_ptr) - STP (R6, R7), 13*16(res_ptr) - STP (R8, R9), 14*16(res_ptr) + VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16] + VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16] + VBIT V2.B16, V7.B16, V3.B16 + VBIT V2.B16, V8.B16, V4.B16 + VBIT V2.B16, V9.B16, V5.B16 + VBIT V2.B16, V10.B16, V6.B16 + VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr) - LDP 15*16(a_ptr), (R4, R5) - LDP 16*16(a_ptr), (R6, R7) - LDP 17*16(a_ptr), (R8, R9) - LDP 15*16(b_ptr), (R16, R17) - LDP 16*16(b_ptr), (R19, R20) - LDP 17*16(b_ptr), (R21, R22) - CSEL EQ, R16, R4, R4 - CSEL EQ, R17, R5, R5 - CSEL EQ, R19, R6, R6 - CSEL EQ, R20, R7, R7 - CSEL EQ, R21, R8, R8 - CSEL EQ, R22, R9, R9 - STP (R4, R5), 15*16(res_ptr) - STP (R6, R7), 16*16(res_ptr) - STP (R8, R9), 17*16(res_ptr) + VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16] + VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16] + VBIT V2.B16, V7.B16, V3.B16 + VBIT V2.B16, V8.B16, V4.B16 + VBIT V2.B16, V9.B16, V5.B16 + VBIT V2.B16, V10.B16, V6.B16 + VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr) - LDP 18*16(a_ptr), (R4, R5) - LDP 19*16(a_ptr), (R6, R7) - LDP 20*16(a_ptr), (R8, R9) - LDP 18*16(b_ptr), (R16, R17) - LDP 19*16(b_ptr), (R19, R20) - LDP 20*16(b_ptr), (R21, R22) - CSEL EQ, R16, R4, R4 - CSEL EQ, R17, R5, R5 - CSEL EQ, R19, R6, R6 - CSEL EQ, R20, R7, R7 - CSEL EQ, R21, R8, R8 - CSEL EQ, R22, R9, R9 - STP (R4, R5), 18*16(res_ptr) - STP (R6, R7), 19*16(res_ptr) - STP (R8, R9), 20*16(res_ptr) - - LDP 21*16(a_ptr), (R4, R5) - LDP 22*16(a_ptr), (R6, R7) - LDP 23*16(a_ptr), (R8, R9) - LDP 21*16(b_ptr), (R16, R17) - LDP 22*16(b_ptr), (R19, R20) - LDP 23*16(b_ptr), (R21, R22) - CSEL EQ, R16, R4, R4 - CSEL EQ, R17, R5, R5 - CSEL EQ, R19, R6, R6 - CSEL EQ, R20, R7, R7 - CSEL EQ, R21, R8, R8 - CSEL EQ, R22, R9, R9 - STP (R4, R5), 21*16(res_ptr) - STP (R6, R7), 22*16(res_ptr) - STP (R8, R9), 23*16(res_ptr) + VLD1 (a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16] + VLD1 (b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16] + VBIT V2.B16, V7.B16, V3.B16 + VBIT V2.B16, V8.B16, V4.B16 + VBIT V2.B16, V9.B16, V5.B16 + VBIT V2.B16, V10.B16, V6.B16 + VST1 [V3.B16, V4.B16, V5.B16, V6.B16], (res_ptr) RET @@ -238,52 +158,25 @@ TEXT ·curvePointMovCond(SB),NOSPLIT,$0 MOVD b+16(FP), b_ptr MOVD cond+24(FP), R3 - CMP $0, R3 - // Two remarks: - // 1) Will want to revisit NEON, when support is better - // 2) CSEL might not be constant time on all ARM processors - LDP 0*16(a_ptr), (R4, R5) - LDP 1*16(a_ptr), (R6, R7) - LDP 2*16(a_ptr), (R8, R9) - LDP 0*16(b_ptr), (R16, R17) - LDP 1*16(b_ptr), (R19, R20) - LDP 2*16(b_ptr), (R21, R22) - CSEL EQ, R16, R4, R4 - CSEL EQ, R17, R5, R5 - CSEL EQ, R19, R6, R6 - CSEL EQ, R20, R7, R7 - CSEL EQ, R21, R8, R8 - CSEL EQ, R22, R9, R9 - STP (R4, R5), 0*16(res_ptr) - STP (R6, R7), 1*16(res_ptr) - STP (R8, R9), 2*16(res_ptr) + VEOR V0.B16, V0.B16, V0.B16 + VMOV R3, V1.S4 + VCMEQ V0.S4, V1.S4, V2.S4 - LDP 3*16(a_ptr), (R4, R5) - LDP 4*16(a_ptr), (R6, R7) - LDP 5*16(a_ptr), (R8, R9) - LDP 3*16(b_ptr), (R16, R17) - LDP 4*16(b_ptr), (R19, R20) - LDP 5*16(b_ptr), (R21, R22) - CSEL EQ, R16, R4, R4 - CSEL EQ, R17, R5, R5 - CSEL EQ, R19, R6, R6 - CSEL EQ, R20, R7, R7 - CSEL EQ, R21, R8, R8 - CSEL EQ, R22, R9, R9 - STP (R4, R5), 3*16(res_ptr) - STP (R6, R7), 4*16(res_ptr) - STP (R8, R9), 5*16(res_ptr) + VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16] + VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16] + VBIT V2.B16, V7.B16, V3.B16 + VBIT V2.B16, V8.B16, V4.B16 + VBIT V2.B16, V9.B16, V5.B16 + VBIT V2.B16, V10.B16, V6.B16 + VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr) - LDP 6*16(a_ptr), (R4, R5) - LDP 7*16(a_ptr), (R6, R7) - LDP 6*16(b_ptr), (R16, R17) - LDP 7*16(b_ptr), (R19, R20) - CSEL EQ, R16, R4, R4 - CSEL EQ, R17, R5, R5 - CSEL EQ, R19, R6, R6 - CSEL EQ, R20, R7, R7 - STP (R4, R5), 6*16(res_ptr) - STP (R6, R7), 7*16(res_ptr) + VLD1 (a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16] + VLD1 (b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16] + VBIT V2.B16, V7.B16, V3.B16 + VBIT V2.B16, V8.B16, V4.B16 + VBIT V2.B16, V9.B16, V5.B16 + VBIT V2.B16, V10.B16, V6.B16 + VST1 [V3.B16, V4.B16, V5.B16, V6.B16], (res_ptr) RET @@ -296,94 +189,40 @@ TEXT ·twistPointMovCond(SB),NOSPLIT,$0 MOVD b+16(FP), b_ptr MOVD cond+24(FP), R3 - CMP $0, R3 - // Two remarks: - // 1) Will want to revisit NEON, when support is better - // 2) CSEL might not be constant time on all ARM processors - LDP 0*16(a_ptr), (R4, R5) - LDP 1*16(a_ptr), (R6, R7) - LDP 2*16(a_ptr), (R8, R9) - LDP 0*16(b_ptr), (R16, R17) - LDP 1*16(b_ptr), (R19, R20) - LDP 2*16(b_ptr), (R21, R22) - CSEL EQ, R16, R4, R4 - CSEL EQ, R17, R5, R5 - CSEL EQ, R19, R6, R6 - CSEL EQ, R20, R7, R7 - CSEL EQ, R21, R8, R8 - CSEL EQ, R22, R9, R9 - STP (R4, R5), 0*16(res_ptr) - STP (R6, R7), 1*16(res_ptr) - STP (R8, R9), 2*16(res_ptr) + VEOR V0.B16, V0.B16, V0.B16 + VMOV R3, V1.S4 + VCMEQ V0.S4, V1.S4, V2.S4 - LDP 3*16(a_ptr), (R4, R5) - LDP 4*16(a_ptr), (R6, R7) - LDP 5*16(a_ptr), (R8, R9) - LDP 3*16(b_ptr), (R16, R17) - LDP 4*16(b_ptr), (R19, R20) - LDP 5*16(b_ptr), (R21, R22) - CSEL EQ, R16, R4, R4 - CSEL EQ, R17, R5, R5 - CSEL EQ, R19, R6, R6 - CSEL EQ, R20, R7, R7 - CSEL EQ, R21, R8, R8 - CSEL EQ, R22, R9, R9 - STP (R4, R5), 3*16(res_ptr) - STP (R6, R7), 4*16(res_ptr) - STP (R8, R9), 5*16(res_ptr) + VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16] + VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16] + VBIT V2.B16, V7.B16, V3.B16 + VBIT V2.B16, V8.B16, V4.B16 + VBIT V2.B16, V9.B16, V5.B16 + VBIT V2.B16, V10.B16, V6.B16 + VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr) - LDP 6*16(a_ptr), (R4, R5) - LDP 7*16(a_ptr), (R6, R7) - LDP 8*16(a_ptr), (R8, R9) - LDP 6*16(b_ptr), (R16, R17) - LDP 7*16(b_ptr), (R19, R20) - LDP 8*16(b_ptr), (R21, R22) - CSEL EQ, R16, R4, R4 - CSEL EQ, R17, R5, R5 - CSEL EQ, R19, R6, R6 - CSEL EQ, R20, R7, R7 - CSEL EQ, R21, R8, R8 - CSEL EQ, R22, R9, R9 - STP (R4, R5), 6*16(res_ptr) - STP (R6, R7), 7*16(res_ptr) - STP (R8, R9), 8*16(res_ptr) + VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16] + VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16] + VBIT V2.B16, V7.B16, V3.B16 + VBIT V2.B16, V8.B16, V4.B16 + VBIT V2.B16, V9.B16, V5.B16 + VBIT V2.B16, V10.B16, V6.B16 + VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr) - LDP 9*16(a_ptr), (R4, R5) - LDP 10*16(a_ptr), (R6, R7) - LDP 11*16(a_ptr), (R8, R9) - LDP 9*16(b_ptr), (R16, R17) - LDP 10*16(b_ptr), (R19, R20) - LDP 11*16(b_ptr), (R21, R22) - CSEL EQ, R16, R4, R4 - CSEL EQ, R17, R5, R5 - CSEL EQ, R19, R6, R6 - CSEL EQ, R20, R7, R7 - CSEL EQ, R21, R8, R8 - CSEL EQ, R22, R9, R9 - STP (R4, R5), 9*16(res_ptr) - STP (R6, R7), 10*16(res_ptr) - STP (R8, R9), 11*16(res_ptr) + VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16] + VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16] + VBIT V2.B16, V7.B16, V3.B16 + VBIT V2.B16, V8.B16, V4.B16 + VBIT V2.B16, V9.B16, V5.B16 + VBIT V2.B16, V10.B16, V6.B16 + VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr) - LDP 12*16(a_ptr), (R4, R5) - LDP 13*16(a_ptr), (R6, R7) - LDP 14*16(a_ptr), (R8, R9) - LDP 12*16(b_ptr), (R16, R17) - LDP 13*16(b_ptr), (R19, R20) - LDP 14*16(b_ptr), (R21, R22) - CSEL EQ, R16, R4, R4 - CSEL EQ, R17, R5, R5 - CSEL EQ, R19, R6, R6 - CSEL EQ, R20, R7, R7 - CSEL EQ, R21, R8, R8 - CSEL EQ, R22, R9, R9 - STP (R4, R5), 12*16(res_ptr) - STP (R6, R7), 13*16(res_ptr) - STP (R8, R9), 14*16(res_ptr) - - LDP 15*16(a_ptr), (R4, R5) - LDP 15*16(b_ptr), (R16, R17) - CSEL EQ, R16, R4, R4 - CSEL EQ, R17, R5, R5 - STP (R4, R5), 15*16(res_ptr) + VLD1 (a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16] + VLD1 (b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16] + VBIT V2.B16, V7.B16, V3.B16 + VBIT V2.B16, V8.B16, V4.B16 + VBIT V2.B16, V9.B16, V5.B16 + VBIT V2.B16, V10.B16, V6.B16 + VST1 [V3.B16, V4.B16, V5.B16, V6.B16], (res_ptr) RET