internal/sm2ec: use NEON instructions for p256SelectAffine

This commit is contained in:
Sun Yimin 2023-11-07 16:11:05 +08:00 committed by GitHub
parent ef55df3657
commit b5c593005d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -333,40 +333,29 @@ TEXT ·p256SelectAffine(SB),NOSPLIT,$0
MOVD table+8(FP), t1 MOVD table+8(FP), t1
MOVD res+0(FP), res_ptr MOVD res+0(FP), res_ptr
EOR x0, x0, x0 VMOV t0, V0.S4 // will use VDUP after upgrade go to 1.17+
EOR x1, x1, x1
EOR x2, x2, x2 VEOR V2.B16, V2.B16, V2.B16
EOR x3, x3, x3 VEOR V3.B16, V3.B16, V3.B16
EOR y0, y0, y0 VEOR V4.B16, V4.B16, V4.B16
EOR y1, y1, y1 VEOR V5.B16, V5.B16, V5.B16
EOR y2, y2, y2
EOR y3, y3, y3
MOVD $0, t2 MOVD $0, t2
loop_select: loop_select:
ADD $1, t2 ADD $1, t2
CMP t0, t2 VMOV t2, V1.S4 // will use VDUP after upgrade go to 1.17+
LDP.P 16(t1), (acc0, acc1) VCMEQ V0.S4, V1.S4, V10.S4
CSEL EQ, acc0, x0, x0 VLD1.P (64)(t1), [V6.B16, V7.B16, V8.B16, V9.B16]
CSEL EQ, acc1, x1, x1 VBIT V10.B16, V6.B16, V2.B16
LDP.P 16(t1), (acc2, acc3) VBIT V10.B16, V7.B16, V3.B16
CSEL EQ, acc2, x2, x2 VBIT V10.B16, V8.B16, V4.B16
CSEL EQ, acc3, x3, x3 VBIT V10.B16, V9.B16, V5.B16
LDP.P 16(t1), (acc4, acc5)
CSEL EQ, acc4, y0, y0
CSEL EQ, acc5, y1, y1
LDP.P 16(t1), (acc6, acc7)
CSEL EQ, acc6, y2, y2
CSEL EQ, acc7, y3, y3
CMP $32, t2 CMP $32, t2
BNE loop_select BNE loop_select
STP (x0, x1), 0*16(res_ptr) VST1 [V2.B16, V3.B16, V4.B16, V5.B16], (res_ptr)
STP (x2, x3), 1*16(res_ptr)
STP (y0, y1), 2*16(res_ptr)
STP (y2, y3), 3*16(res_ptr)
RET RET
/* ---------------------------------------*/ /* ---------------------------------------*/