sm9/bn256: use NEON instructions for conditional move

This commit is contained in:
Sun Yimin 2023-11-07 15:33:52 +08:00 committed by GitHub
parent 06a310dd4d
commit ef55df3657
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -95,137 +95,57 @@ TEXT ·gfP12MovCond(SB),NOSPLIT,$0
MOVD b+16(FP), b_ptr MOVD b+16(FP), b_ptr
MOVD cond+24(FP), R3 MOVD cond+24(FP), R3
CMP $0, R3 VEOR V0.B16, V0.B16, V0.B16
// Two remarks: VMOV R3, V1.S4
// 1) Will want to revisit NEON, when support is better VCMEQ V0.S4, V1.S4, V2.S4
// 2) CSEL might not be constant time on all ARM processors
LDP 0*16(a_ptr), (R4, R5)
LDP 1*16(a_ptr), (R6, R7)
LDP 2*16(a_ptr), (R8, R9)
LDP 0*16(b_ptr), (R16, R17)
LDP 1*16(b_ptr), (R19, R20)
LDP 2*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 0*16(res_ptr)
STP (R6, R7), 1*16(res_ptr)
STP (R8, R9), 2*16(res_ptr)
LDP 3*16(a_ptr), (R4, R5) VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
LDP 4*16(a_ptr), (R6, R7) VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
LDP 5*16(a_ptr), (R8, R9) VBIT V2.B16, V7.B16, V3.B16
LDP 3*16(b_ptr), (R16, R17) VBIT V2.B16, V8.B16, V4.B16
LDP 4*16(b_ptr), (R19, R20) VBIT V2.B16, V9.B16, V5.B16
LDP 5*16(b_ptr), (R21, R22) VBIT V2.B16, V10.B16, V6.B16
CSEL EQ, R16, R4, R4 VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 3*16(res_ptr)
STP (R6, R7), 4*16(res_ptr)
STP (R8, R9), 5*16(res_ptr)
LDP 6*16(a_ptr), (R4, R5) VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
LDP 7*16(a_ptr), (R6, R7) VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
LDP 8*16(a_ptr), (R8, R9) VBIT V2.B16, V7.B16, V3.B16
LDP 6*16(b_ptr), (R16, R17) VBIT V2.B16, V8.B16, V4.B16
LDP 7*16(b_ptr), (R19, R20) VBIT V2.B16, V9.B16, V5.B16
LDP 8*16(b_ptr), (R21, R22) VBIT V2.B16, V10.B16, V6.B16
CSEL EQ, R16, R4, R4 VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 6*16(res_ptr)
STP (R6, R7), 7*16(res_ptr)
STP (R8, R9), 8*16(res_ptr)
LDP 9*16(a_ptr), (R4, R5) VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
LDP 10*16(a_ptr), (R6, R7) VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
LDP 11*16(a_ptr), (R8, R9) VBIT V2.B16, V7.B16, V3.B16
LDP 9*16(b_ptr), (R16, R17) VBIT V2.B16, V8.B16, V4.B16
LDP 10*16(b_ptr), (R19, R20) VBIT V2.B16, V9.B16, V5.B16
LDP 11*16(b_ptr), (R21, R22) VBIT V2.B16, V10.B16, V6.B16
CSEL EQ, R16, R4, R4 VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 9*16(res_ptr)
STP (R6, R7), 10*16(res_ptr)
STP (R8, R9), 11*16(res_ptr)
LDP 12*16(a_ptr), (R4, R5) VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
LDP 13*16(a_ptr), (R6, R7) VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
LDP 14*16(a_ptr), (R8, R9) VBIT V2.B16, V7.B16, V3.B16
LDP 12*16(b_ptr), (R16, R17) VBIT V2.B16, V8.B16, V4.B16
LDP 13*16(b_ptr), (R19, R20) VBIT V2.B16, V9.B16, V5.B16
LDP 14*16(b_ptr), (R21, R22) VBIT V2.B16, V10.B16, V6.B16
CSEL EQ, R16, R4, R4 VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 12*16(res_ptr)
STP (R6, R7), 13*16(res_ptr)
STP (R8, R9), 14*16(res_ptr)
LDP 15*16(a_ptr), (R4, R5) VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
LDP 16*16(a_ptr), (R6, R7) VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
LDP 17*16(a_ptr), (R8, R9) VBIT V2.B16, V7.B16, V3.B16
LDP 15*16(b_ptr), (R16, R17) VBIT V2.B16, V8.B16, V4.B16
LDP 16*16(b_ptr), (R19, R20) VBIT V2.B16, V9.B16, V5.B16
LDP 17*16(b_ptr), (R21, R22) VBIT V2.B16, V10.B16, V6.B16
CSEL EQ, R16, R4, R4 VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 15*16(res_ptr)
STP (R6, R7), 16*16(res_ptr)
STP (R8, R9), 17*16(res_ptr)
LDP 18*16(a_ptr), (R4, R5) VLD1 (a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
LDP 19*16(a_ptr), (R6, R7) VLD1 (b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
LDP 20*16(a_ptr), (R8, R9) VBIT V2.B16, V7.B16, V3.B16
LDP 18*16(b_ptr), (R16, R17) VBIT V2.B16, V8.B16, V4.B16
LDP 19*16(b_ptr), (R19, R20) VBIT V2.B16, V9.B16, V5.B16
LDP 20*16(b_ptr), (R21, R22) VBIT V2.B16, V10.B16, V6.B16
CSEL EQ, R16, R4, R4 VST1 [V3.B16, V4.B16, V5.B16, V6.B16], (res_ptr)
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 18*16(res_ptr)
STP (R6, R7), 19*16(res_ptr)
STP (R8, R9), 20*16(res_ptr)
LDP 21*16(a_ptr), (R4, R5)
LDP 22*16(a_ptr), (R6, R7)
LDP 23*16(a_ptr), (R8, R9)
LDP 21*16(b_ptr), (R16, R17)
LDP 22*16(b_ptr), (R19, R20)
LDP 23*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 21*16(res_ptr)
STP (R6, R7), 22*16(res_ptr)
STP (R8, R9), 23*16(res_ptr)
RET RET
@ -238,52 +158,25 @@ TEXT ·curvePointMovCond(SB),NOSPLIT,$0
MOVD b+16(FP), b_ptr MOVD b+16(FP), b_ptr
MOVD cond+24(FP), R3 MOVD cond+24(FP), R3
CMP $0, R3 VEOR V0.B16, V0.B16, V0.B16
// Two remarks: VMOV R3, V1.S4
// 1) Will want to revisit NEON, when support is better VCMEQ V0.S4, V1.S4, V2.S4
// 2) CSEL might not be constant time on all ARM processors
LDP 0*16(a_ptr), (R4, R5)
LDP 1*16(a_ptr), (R6, R7)
LDP 2*16(a_ptr), (R8, R9)
LDP 0*16(b_ptr), (R16, R17)
LDP 1*16(b_ptr), (R19, R20)
LDP 2*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 0*16(res_ptr)
STP (R6, R7), 1*16(res_ptr)
STP (R8, R9), 2*16(res_ptr)
LDP 3*16(a_ptr), (R4, R5) VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
LDP 4*16(a_ptr), (R6, R7) VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
LDP 5*16(a_ptr), (R8, R9) VBIT V2.B16, V7.B16, V3.B16
LDP 3*16(b_ptr), (R16, R17) VBIT V2.B16, V8.B16, V4.B16
LDP 4*16(b_ptr), (R19, R20) VBIT V2.B16, V9.B16, V5.B16
LDP 5*16(b_ptr), (R21, R22) VBIT V2.B16, V10.B16, V6.B16
CSEL EQ, R16, R4, R4 VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 3*16(res_ptr)
STP (R6, R7), 4*16(res_ptr)
STP (R8, R9), 5*16(res_ptr)
LDP 6*16(a_ptr), (R4, R5) VLD1 (a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
LDP 7*16(a_ptr), (R6, R7) VLD1 (b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
LDP 6*16(b_ptr), (R16, R17) VBIT V2.B16, V7.B16, V3.B16
LDP 7*16(b_ptr), (R19, R20) VBIT V2.B16, V8.B16, V4.B16
CSEL EQ, R16, R4, R4 VBIT V2.B16, V9.B16, V5.B16
CSEL EQ, R17, R5, R5 VBIT V2.B16, V10.B16, V6.B16
CSEL EQ, R19, R6, R6 VST1 [V3.B16, V4.B16, V5.B16, V6.B16], (res_ptr)
CSEL EQ, R20, R7, R7
STP (R4, R5), 6*16(res_ptr)
STP (R6, R7), 7*16(res_ptr)
RET RET
@ -296,94 +189,40 @@ TEXT ·twistPointMovCond(SB),NOSPLIT,$0
MOVD b+16(FP), b_ptr MOVD b+16(FP), b_ptr
MOVD cond+24(FP), R3 MOVD cond+24(FP), R3
CMP $0, R3 VEOR V0.B16, V0.B16, V0.B16
// Two remarks: VMOV R3, V1.S4
// 1) Will want to revisit NEON, when support is better VCMEQ V0.S4, V1.S4, V2.S4
// 2) CSEL might not be constant time on all ARM processors
LDP 0*16(a_ptr), (R4, R5)
LDP 1*16(a_ptr), (R6, R7)
LDP 2*16(a_ptr), (R8, R9)
LDP 0*16(b_ptr), (R16, R17)
LDP 1*16(b_ptr), (R19, R20)
LDP 2*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 0*16(res_ptr)
STP (R6, R7), 1*16(res_ptr)
STP (R8, R9), 2*16(res_ptr)
LDP 3*16(a_ptr), (R4, R5) VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
LDP 4*16(a_ptr), (R6, R7) VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
LDP 5*16(a_ptr), (R8, R9) VBIT V2.B16, V7.B16, V3.B16
LDP 3*16(b_ptr), (R16, R17) VBIT V2.B16, V8.B16, V4.B16
LDP 4*16(b_ptr), (R19, R20) VBIT V2.B16, V9.B16, V5.B16
LDP 5*16(b_ptr), (R21, R22) VBIT V2.B16, V10.B16, V6.B16
CSEL EQ, R16, R4, R4 VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 3*16(res_ptr)
STP (R6, R7), 4*16(res_ptr)
STP (R8, R9), 5*16(res_ptr)
LDP 6*16(a_ptr), (R4, R5) VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
LDP 7*16(a_ptr), (R6, R7) VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
LDP 8*16(a_ptr), (R8, R9) VBIT V2.B16, V7.B16, V3.B16
LDP 6*16(b_ptr), (R16, R17) VBIT V2.B16, V8.B16, V4.B16
LDP 7*16(b_ptr), (R19, R20) VBIT V2.B16, V9.B16, V5.B16
LDP 8*16(b_ptr), (R21, R22) VBIT V2.B16, V10.B16, V6.B16
CSEL EQ, R16, R4, R4 VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 6*16(res_ptr)
STP (R6, R7), 7*16(res_ptr)
STP (R8, R9), 8*16(res_ptr)
LDP 9*16(a_ptr), (R4, R5) VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
LDP 10*16(a_ptr), (R6, R7) VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
LDP 11*16(a_ptr), (R8, R9) VBIT V2.B16, V7.B16, V3.B16
LDP 9*16(b_ptr), (R16, R17) VBIT V2.B16, V8.B16, V4.B16
LDP 10*16(b_ptr), (R19, R20) VBIT V2.B16, V9.B16, V5.B16
LDP 11*16(b_ptr), (R21, R22) VBIT V2.B16, V10.B16, V6.B16
CSEL EQ, R16, R4, R4 VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 9*16(res_ptr)
STP (R6, R7), 10*16(res_ptr)
STP (R8, R9), 11*16(res_ptr)
LDP 12*16(a_ptr), (R4, R5) VLD1 (a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
LDP 13*16(a_ptr), (R6, R7) VLD1 (b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
LDP 14*16(a_ptr), (R8, R9) VBIT V2.B16, V7.B16, V3.B16
LDP 12*16(b_ptr), (R16, R17) VBIT V2.B16, V8.B16, V4.B16
LDP 13*16(b_ptr), (R19, R20) VBIT V2.B16, V9.B16, V5.B16
LDP 14*16(b_ptr), (R21, R22) VBIT V2.B16, V10.B16, V6.B16
CSEL EQ, R16, R4, R4 VST1 [V3.B16, V4.B16, V5.B16, V6.B16], (res_ptr)
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 12*16(res_ptr)
STP (R6, R7), 13*16(res_ptr)
STP (R8, R9), 14*16(res_ptr)
LDP 15*16(a_ptr), (R4, R5)
LDP 15*16(b_ptr), (R16, R17)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
STP (R4, R5), 15*16(res_ptr)
RET RET