sm9/bn256: use NEON instructions for conditional move

This commit is contained in:
Sun Yimin 2023-11-07 15:33:52 +08:00 committed by GitHub
parent 06a310dd4d
commit ef55df3657
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -95,137 +95,57 @@ TEXT ·gfP12MovCond(SB),NOSPLIT,$0
MOVD b+16(FP), b_ptr
MOVD cond+24(FP), R3
CMP $0, R3
// Two remarks:
// 1) Will want to revisit NEON, when support is better
// 2) CSEL might not be constant time on all ARM processors
LDP 0*16(a_ptr), (R4, R5)
LDP 1*16(a_ptr), (R6, R7)
LDP 2*16(a_ptr), (R8, R9)
LDP 0*16(b_ptr), (R16, R17)
LDP 1*16(b_ptr), (R19, R20)
LDP 2*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 0*16(res_ptr)
STP (R6, R7), 1*16(res_ptr)
STP (R8, R9), 2*16(res_ptr)
VEOR V0.B16, V0.B16, V0.B16
VMOV R3, V1.S4
VCMEQ V0.S4, V1.S4, V2.S4
LDP 3*16(a_ptr), (R4, R5)
LDP 4*16(a_ptr), (R6, R7)
LDP 5*16(a_ptr), (R8, R9)
LDP 3*16(b_ptr), (R16, R17)
LDP 4*16(b_ptr), (R19, R20)
LDP 5*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 3*16(res_ptr)
STP (R6, R7), 4*16(res_ptr)
STP (R8, R9), 5*16(res_ptr)
VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)
LDP 6*16(a_ptr), (R4, R5)
LDP 7*16(a_ptr), (R6, R7)
LDP 8*16(a_ptr), (R8, R9)
LDP 6*16(b_ptr), (R16, R17)
LDP 7*16(b_ptr), (R19, R20)
LDP 8*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 6*16(res_ptr)
STP (R6, R7), 7*16(res_ptr)
STP (R8, R9), 8*16(res_ptr)
VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)
LDP 9*16(a_ptr), (R4, R5)
LDP 10*16(a_ptr), (R6, R7)
LDP 11*16(a_ptr), (R8, R9)
LDP 9*16(b_ptr), (R16, R17)
LDP 10*16(b_ptr), (R19, R20)
LDP 11*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 9*16(res_ptr)
STP (R6, R7), 10*16(res_ptr)
STP (R8, R9), 11*16(res_ptr)
VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)
LDP 12*16(a_ptr), (R4, R5)
LDP 13*16(a_ptr), (R6, R7)
LDP 14*16(a_ptr), (R8, R9)
LDP 12*16(b_ptr), (R16, R17)
LDP 13*16(b_ptr), (R19, R20)
LDP 14*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 12*16(res_ptr)
STP (R6, R7), 13*16(res_ptr)
STP (R8, R9), 14*16(res_ptr)
VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)
LDP 15*16(a_ptr), (R4, R5)
LDP 16*16(a_ptr), (R6, R7)
LDP 17*16(a_ptr), (R8, R9)
LDP 15*16(b_ptr), (R16, R17)
LDP 16*16(b_ptr), (R19, R20)
LDP 17*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 15*16(res_ptr)
STP (R6, R7), 16*16(res_ptr)
STP (R8, R9), 17*16(res_ptr)
VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)
LDP 18*16(a_ptr), (R4, R5)
LDP 19*16(a_ptr), (R6, R7)
LDP 20*16(a_ptr), (R8, R9)
LDP 18*16(b_ptr), (R16, R17)
LDP 19*16(b_ptr), (R19, R20)
LDP 20*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 18*16(res_ptr)
STP (R6, R7), 19*16(res_ptr)
STP (R8, R9), 20*16(res_ptr)
LDP 21*16(a_ptr), (R4, R5)
LDP 22*16(a_ptr), (R6, R7)
LDP 23*16(a_ptr), (R8, R9)
LDP 21*16(b_ptr), (R16, R17)
LDP 22*16(b_ptr), (R19, R20)
LDP 23*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 21*16(res_ptr)
STP (R6, R7), 22*16(res_ptr)
STP (R8, R9), 23*16(res_ptr)
VLD1 (a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1 (b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1 [V3.B16, V4.B16, V5.B16, V6.B16], (res_ptr)
RET
@ -238,52 +158,25 @@ TEXT ·curvePointMovCond(SB),NOSPLIT,$0
MOVD b+16(FP), b_ptr
MOVD cond+24(FP), R3
CMP $0, R3
// Two remarks:
// 1) Will want to revisit NEON, when support is better
// 2) CSEL might not be constant time on all ARM processors
LDP 0*16(a_ptr), (R4, R5)
LDP 1*16(a_ptr), (R6, R7)
LDP 2*16(a_ptr), (R8, R9)
LDP 0*16(b_ptr), (R16, R17)
LDP 1*16(b_ptr), (R19, R20)
LDP 2*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 0*16(res_ptr)
STP (R6, R7), 1*16(res_ptr)
STP (R8, R9), 2*16(res_ptr)
VEOR V0.B16, V0.B16, V0.B16
VMOV R3, V1.S4
VCMEQ V0.S4, V1.S4, V2.S4
LDP 3*16(a_ptr), (R4, R5)
LDP 4*16(a_ptr), (R6, R7)
LDP 5*16(a_ptr), (R8, R9)
LDP 3*16(b_ptr), (R16, R17)
LDP 4*16(b_ptr), (R19, R20)
LDP 5*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 3*16(res_ptr)
STP (R6, R7), 4*16(res_ptr)
STP (R8, R9), 5*16(res_ptr)
VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)
LDP 6*16(a_ptr), (R4, R5)
LDP 7*16(a_ptr), (R6, R7)
LDP 6*16(b_ptr), (R16, R17)
LDP 7*16(b_ptr), (R19, R20)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
STP (R4, R5), 6*16(res_ptr)
STP (R6, R7), 7*16(res_ptr)
VLD1 (a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1 (b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1 [V3.B16, V4.B16, V5.B16, V6.B16], (res_ptr)
RET
@ -296,94 +189,40 @@ TEXT ·twistPointMovCond(SB),NOSPLIT,$0
MOVD b+16(FP), b_ptr
MOVD cond+24(FP), R3
CMP $0, R3
// Two remarks:
// 1) Will want to revisit NEON, when support is better
// 2) CSEL might not be constant time on all ARM processors
LDP 0*16(a_ptr), (R4, R5)
LDP 1*16(a_ptr), (R6, R7)
LDP 2*16(a_ptr), (R8, R9)
LDP 0*16(b_ptr), (R16, R17)
LDP 1*16(b_ptr), (R19, R20)
LDP 2*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 0*16(res_ptr)
STP (R6, R7), 1*16(res_ptr)
STP (R8, R9), 2*16(res_ptr)
VEOR V0.B16, V0.B16, V0.B16
VMOV R3, V1.S4
VCMEQ V0.S4, V1.S4, V2.S4
LDP 3*16(a_ptr), (R4, R5)
LDP 4*16(a_ptr), (R6, R7)
LDP 5*16(a_ptr), (R8, R9)
LDP 3*16(b_ptr), (R16, R17)
LDP 4*16(b_ptr), (R19, R20)
LDP 5*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 3*16(res_ptr)
STP (R6, R7), 4*16(res_ptr)
STP (R8, R9), 5*16(res_ptr)
VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)
LDP 6*16(a_ptr), (R4, R5)
LDP 7*16(a_ptr), (R6, R7)
LDP 8*16(a_ptr), (R8, R9)
LDP 6*16(b_ptr), (R16, R17)
LDP 7*16(b_ptr), (R19, R20)
LDP 8*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 6*16(res_ptr)
STP (R6, R7), 7*16(res_ptr)
STP (R8, R9), 8*16(res_ptr)
VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)
LDP 9*16(a_ptr), (R4, R5)
LDP 10*16(a_ptr), (R6, R7)
LDP 11*16(a_ptr), (R8, R9)
LDP 9*16(b_ptr), (R16, R17)
LDP 10*16(b_ptr), (R19, R20)
LDP 11*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 9*16(res_ptr)
STP (R6, R7), 10*16(res_ptr)
STP (R8, R9), 11*16(res_ptr)
VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)
LDP 12*16(a_ptr), (R4, R5)
LDP 13*16(a_ptr), (R6, R7)
LDP 14*16(a_ptr), (R8, R9)
LDP 12*16(b_ptr), (R16, R17)
LDP 13*16(b_ptr), (R19, R20)
LDP 14*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 12*16(res_ptr)
STP (R6, R7), 13*16(res_ptr)
STP (R8, R9), 14*16(res_ptr)
LDP 15*16(a_ptr), (R4, R5)
LDP 15*16(b_ptr), (R16, R17)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
STP (R4, R5), 15*16(res_ptr)
VLD1 (a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1 (b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1 [V3.B16, V4.B16, V5.B16, V6.B16], (res_ptr)
RET