mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-25 19:56:18 +08:00
390 lines
9.9 KiB
ArmAsm
390 lines
9.9 KiB
ArmAsm
//go:build arm64 && !purego
|
|
// +build arm64,!purego
|
|
|
|
#include "textflag.h"
|
|
|
|
#define res_ptr R0
|
|
#define a_ptr R1
|
|
#define b_ptr R2
|
|
|
|
/* ---------------------------------------*/
|
|
// func gfpCopy(res, a *gfP)
// Copies 32 bytes from a to res: one NEON load of two 16-byte registers
// followed by one store of the same pair.
TEXT ·gfpCopy(SB),NOSPLIT,$0
	MOVD a+8(FP), a_ptr
	MOVD res+0(FP), res_ptr

	VLD1 (a_ptr), [V16.B16, V17.B16]
	VST1 [V16.B16, V17.B16], (res_ptr)
	RET
|
|
|
|
/* ---------------------------------------*/
|
|
// func gfp2Copy(res, a *gfP2)
// Copies 64 bytes from a to res: the whole value is pulled into four
// 16-byte NEON registers and then written out in a single store.
TEXT ·gfp2Copy(SB),NOSPLIT,$0
	MOVD a+8(FP), a_ptr
	MOVD res+0(FP), res_ptr

	VLD1 (a_ptr), [V16.B16, V17.B16, V18.B16, V19.B16]
	VST1 [V16.B16, V17.B16, V18.B16, V19.B16], (res_ptr)
	RET
|
|
|
|
/* ---------------------------------------*/
|
|
// func gfp4Copy(res, a *gfP4)
// Copies 128 bytes from a to res as two 64-byte chunks. Each chunk is
// loaded and stored with post-increment addressing, so both pointers
// advance by 64 after every access; a chunk is always stored before the
// next one is loaded.
TEXT ·gfp4Copy(SB),NOSPLIT,$0
	MOVD a+8(FP), a_ptr
	MOVD res+0(FP), res_ptr

	// bytes 0..63
	VLD1.P 64(a_ptr), [V4.B16, V5.B16, V6.B16, V7.B16]
	VST1.P [V4.B16, V5.B16, V6.B16, V7.B16], 64(res_ptr)

	// bytes 64..127
	VLD1.P 64(a_ptr), [V16.B16, V17.B16, V18.B16, V19.B16]
	VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(res_ptr)
	RET
|
|
|
|
/* ---------------------------------------*/
|
|
// func gfp6Copy(res, a *gfP6)
// Copies 192 bytes from a to res as three 64-byte chunks. Post-increment
// addressing moves both pointers forward by 64 after every load/store;
// each chunk is written out before the next one is read.
TEXT ·gfp6Copy(SB),NOSPLIT,$0
	MOVD a+8(FP), a_ptr
	MOVD res+0(FP), res_ptr

	// bytes 0..63
	VLD1.P 64(a_ptr), [V4.B16, V5.B16, V6.B16, V7.B16]
	VST1.P [V4.B16, V5.B16, V6.B16, V7.B16], 64(res_ptr)

	// bytes 64..127
	VLD1.P 64(a_ptr), [V16.B16, V17.B16, V18.B16, V19.B16]
	VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(res_ptr)

	// bytes 128..191
	VLD1.P 64(a_ptr), [V4.B16, V5.B16, V6.B16, V7.B16]
	VST1.P [V4.B16, V5.B16, V6.B16, V7.B16], 64(res_ptr)
	RET
|
|
|
|
/* ---------------------------------------*/
|
|
// func gfp12Copy(res, a *gfP12)
// Copies 384 bytes from a to res as six 64-byte chunks. Post-increment
// addressing moves both pointers forward by 64 after every load/store;
// each chunk is written out before the next one is read.
TEXT ·gfp12Copy(SB),NOSPLIT,$0
	MOVD a+8(FP), a_ptr
	MOVD res+0(FP), res_ptr

	// bytes 0..63
	VLD1.P 64(a_ptr), [V4.B16, V5.B16, V6.B16, V7.B16]
	VST1.P [V4.B16, V5.B16, V6.B16, V7.B16], 64(res_ptr)

	// bytes 64..127
	VLD1.P 64(a_ptr), [V16.B16, V17.B16, V18.B16, V19.B16]
	VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(res_ptr)

	// bytes 128..191
	VLD1.P 64(a_ptr), [V4.B16, V5.B16, V6.B16, V7.B16]
	VST1.P [V4.B16, V5.B16, V6.B16, V7.B16], 64(res_ptr)

	// bytes 192..255
	VLD1.P 64(a_ptr), [V16.B16, V17.B16, V18.B16, V19.B16]
	VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(res_ptr)

	// bytes 256..319
	VLD1.P 64(a_ptr), [V4.B16, V5.B16, V6.B16, V7.B16]
	VST1.P [V4.B16, V5.B16, V6.B16, V7.B16], 64(res_ptr)

	// bytes 320..383
	VLD1.P 64(a_ptr), [V16.B16, V17.B16, V18.B16, V19.B16]
	VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(res_ptr)
	RET
|
|
|
|
/* ---------------------------------------*/
|
|
// func gfP12MovCond(res, a, b *gfP12, cond int)
// Branch-free select over 384 bytes: res = b when cond == 0, otherwise res = a.
//
// The flags are set once from cond and reused by every CSEL below (LDP,
// CSEL and STP leave them untouched). Data is processed in eight groups
// of three 16-byte lanes: a goes to R4-R9, b goes to R10-R15, the chosen
// words end up in R4-R9 and are stored to res. All loads of a group are
// issued before any of its stores.
//
// Notes carried over from the original implementation:
//   1) A NEON version is worth revisiting once support is better.
//   2) CSEL might not be constant time on all ARM processors.
TEXT ·gfP12MovCond(SB),NOSPLIT,$0
	MOVD cond+24(FP), R3
	MOVD b+16(FP), b_ptr
	MOVD a+8(FP), a_ptr
	MOVD res+0(FP), res_ptr

	CMP $0, R3	// EQ <=> cond == 0 <=> take b

	// lanes 0..2
	LDP 0*16(a_ptr), (R4, R5)
	LDP 0*16(b_ptr), (R10, R11)
	LDP 1*16(a_ptr), (R6, R7)
	LDP 1*16(b_ptr), (R12, R13)
	LDP 2*16(a_ptr), (R8, R9)
	LDP 2*16(b_ptr), (R14, R15)
	CSEL EQ, R10, R4, R4
	CSEL EQ, R11, R5, R5
	CSEL EQ, R12, R6, R6
	CSEL EQ, R13, R7, R7
	CSEL EQ, R14, R8, R8
	CSEL EQ, R15, R9, R9
	STP (R4, R5), 0*16(res_ptr)
	STP (R6, R7), 1*16(res_ptr)
	STP (R8, R9), 2*16(res_ptr)

	// lanes 3..5
	LDP 3*16(a_ptr), (R4, R5)
	LDP 3*16(b_ptr), (R10, R11)
	LDP 4*16(a_ptr), (R6, R7)
	LDP 4*16(b_ptr), (R12, R13)
	LDP 5*16(a_ptr), (R8, R9)
	LDP 5*16(b_ptr), (R14, R15)
	CSEL EQ, R10, R4, R4
	CSEL EQ, R11, R5, R5
	CSEL EQ, R12, R6, R6
	CSEL EQ, R13, R7, R7
	CSEL EQ, R14, R8, R8
	CSEL EQ, R15, R9, R9
	STP (R4, R5), 3*16(res_ptr)
	STP (R6, R7), 4*16(res_ptr)
	STP (R8, R9), 5*16(res_ptr)

	// lanes 6..8
	LDP 6*16(a_ptr), (R4, R5)
	LDP 6*16(b_ptr), (R10, R11)
	LDP 7*16(a_ptr), (R6, R7)
	LDP 7*16(b_ptr), (R12, R13)
	LDP 8*16(a_ptr), (R8, R9)
	LDP 8*16(b_ptr), (R14, R15)
	CSEL EQ, R10, R4, R4
	CSEL EQ, R11, R5, R5
	CSEL EQ, R12, R6, R6
	CSEL EQ, R13, R7, R7
	CSEL EQ, R14, R8, R8
	CSEL EQ, R15, R9, R9
	STP (R4, R5), 6*16(res_ptr)
	STP (R6, R7), 7*16(res_ptr)
	STP (R8, R9), 8*16(res_ptr)

	// lanes 9..11
	LDP 9*16(a_ptr), (R4, R5)
	LDP 9*16(b_ptr), (R10, R11)
	LDP 10*16(a_ptr), (R6, R7)
	LDP 10*16(b_ptr), (R12, R13)
	LDP 11*16(a_ptr), (R8, R9)
	LDP 11*16(b_ptr), (R14, R15)
	CSEL EQ, R10, R4, R4
	CSEL EQ, R11, R5, R5
	CSEL EQ, R12, R6, R6
	CSEL EQ, R13, R7, R7
	CSEL EQ, R14, R8, R8
	CSEL EQ, R15, R9, R9
	STP (R4, R5), 9*16(res_ptr)
	STP (R6, R7), 10*16(res_ptr)
	STP (R8, R9), 11*16(res_ptr)

	// lanes 12..14
	LDP 12*16(a_ptr), (R4, R5)
	LDP 12*16(b_ptr), (R10, R11)
	LDP 13*16(a_ptr), (R6, R7)
	LDP 13*16(b_ptr), (R12, R13)
	LDP 14*16(a_ptr), (R8, R9)
	LDP 14*16(b_ptr), (R14, R15)
	CSEL EQ, R10, R4, R4
	CSEL EQ, R11, R5, R5
	CSEL EQ, R12, R6, R6
	CSEL EQ, R13, R7, R7
	CSEL EQ, R14, R8, R8
	CSEL EQ, R15, R9, R9
	STP (R4, R5), 12*16(res_ptr)
	STP (R6, R7), 13*16(res_ptr)
	STP (R8, R9), 14*16(res_ptr)

	// lanes 15..17
	LDP 15*16(a_ptr), (R4, R5)
	LDP 15*16(b_ptr), (R10, R11)
	LDP 16*16(a_ptr), (R6, R7)
	LDP 16*16(b_ptr), (R12, R13)
	LDP 17*16(a_ptr), (R8, R9)
	LDP 17*16(b_ptr), (R14, R15)
	CSEL EQ, R10, R4, R4
	CSEL EQ, R11, R5, R5
	CSEL EQ, R12, R6, R6
	CSEL EQ, R13, R7, R7
	CSEL EQ, R14, R8, R8
	CSEL EQ, R15, R9, R9
	STP (R4, R5), 15*16(res_ptr)
	STP (R6, R7), 16*16(res_ptr)
	STP (R8, R9), 17*16(res_ptr)

	// lanes 18..20
	LDP 18*16(a_ptr), (R4, R5)
	LDP 18*16(b_ptr), (R10, R11)
	LDP 19*16(a_ptr), (R6, R7)
	LDP 19*16(b_ptr), (R12, R13)
	LDP 20*16(a_ptr), (R8, R9)
	LDP 20*16(b_ptr), (R14, R15)
	CSEL EQ, R10, R4, R4
	CSEL EQ, R11, R5, R5
	CSEL EQ, R12, R6, R6
	CSEL EQ, R13, R7, R7
	CSEL EQ, R14, R8, R8
	CSEL EQ, R15, R9, R9
	STP (R4, R5), 18*16(res_ptr)
	STP (R6, R7), 19*16(res_ptr)
	STP (R8, R9), 20*16(res_ptr)

	// lanes 21..23
	LDP 21*16(a_ptr), (R4, R5)
	LDP 21*16(b_ptr), (R10, R11)
	LDP 22*16(a_ptr), (R6, R7)
	LDP 22*16(b_ptr), (R12, R13)
	LDP 23*16(a_ptr), (R8, R9)
	LDP 23*16(b_ptr), (R14, R15)
	CSEL EQ, R10, R4, R4
	CSEL EQ, R11, R5, R5
	CSEL EQ, R12, R6, R6
	CSEL EQ, R13, R7, R7
	CSEL EQ, R14, R8, R8
	CSEL EQ, R15, R9, R9
	STP (R4, R5), 21*16(res_ptr)
	STP (R6, R7), 22*16(res_ptr)
	STP (R8, R9), 23*16(res_ptr)

	RET
|
|
|
|
/* ---------------------------------------*/
|
|
// func curvePointMovCond(res, a, b *curvePoint, cond int)
// Branch-free select over 128 bytes: res = b when cond == 0, otherwise res = a.
//
// The flags are set once from cond and reused by every CSEL below (LDP,
// CSEL and STP leave them untouched). Data is processed as two groups of
// three 16-byte lanes plus a final group of two lanes: a goes to R4-R9,
// b goes to R10-R15, the chosen words end up in R4-R9 and are stored to
// res. All loads of a group are issued before any of its stores.
//
// Notes carried over from the original implementation:
//   1) A NEON version is worth revisiting once support is better.
//   2) CSEL might not be constant time on all ARM processors.
TEXT ·curvePointMovCond(SB),NOSPLIT,$0
	MOVD cond+24(FP), R3
	MOVD b+16(FP), b_ptr
	MOVD a+8(FP), a_ptr
	MOVD res+0(FP), res_ptr

	CMP $0, R3	// EQ <=> cond == 0 <=> take b

	// lanes 0..2
	LDP 0*16(a_ptr), (R4, R5)
	LDP 0*16(b_ptr), (R10, R11)
	LDP 1*16(a_ptr), (R6, R7)
	LDP 1*16(b_ptr), (R12, R13)
	LDP 2*16(a_ptr), (R8, R9)
	LDP 2*16(b_ptr), (R14, R15)
	CSEL EQ, R10, R4, R4
	CSEL EQ, R11, R5, R5
	CSEL EQ, R12, R6, R6
	CSEL EQ, R13, R7, R7
	CSEL EQ, R14, R8, R8
	CSEL EQ, R15, R9, R9
	STP (R4, R5), 0*16(res_ptr)
	STP (R6, R7), 1*16(res_ptr)
	STP (R8, R9), 2*16(res_ptr)

	// lanes 3..5
	LDP 3*16(a_ptr), (R4, R5)
	LDP 3*16(b_ptr), (R10, R11)
	LDP 4*16(a_ptr), (R6, R7)
	LDP 4*16(b_ptr), (R12, R13)
	LDP 5*16(a_ptr), (R8, R9)
	LDP 5*16(b_ptr), (R14, R15)
	CSEL EQ, R10, R4, R4
	CSEL EQ, R11, R5, R5
	CSEL EQ, R12, R6, R6
	CSEL EQ, R13, R7, R7
	CSEL EQ, R14, R8, R8
	CSEL EQ, R15, R9, R9
	STP (R4, R5), 3*16(res_ptr)
	STP (R6, R7), 4*16(res_ptr)
	STP (R8, R9), 5*16(res_ptr)

	// lanes 6..7 (tail)
	LDP 6*16(a_ptr), (R4, R5)
	LDP 6*16(b_ptr), (R10, R11)
	LDP 7*16(a_ptr), (R6, R7)
	LDP 7*16(b_ptr), (R12, R13)
	CSEL EQ, R10, R4, R4
	CSEL EQ, R11, R5, R5
	CSEL EQ, R12, R6, R6
	CSEL EQ, R13, R7, R7
	STP (R4, R5), 6*16(res_ptr)
	STP (R6, R7), 7*16(res_ptr)

	RET
|
|
|
|
/* ---------------------------------------*/
|
|
// func twistPointMovCond(res, a, b *twistPoint, cond int)
// Branch-free select over 256 bytes: res = b when cond == 0, otherwise res = a.
//
// The flags are set once from cond and reused by every CSEL below (LDP,
// CSEL and STP leave them untouched). Data is processed as five groups of
// three 16-byte lanes plus a final single lane: a goes to R4-R9, b goes
// to R10-R15, the chosen words end up in R4-R9 and are stored to res.
// All loads of a group are issued before any of its stores.
//
// Notes carried over from the original implementation:
//   1) A NEON version is worth revisiting once support is better.
//   2) CSEL might not be constant time on all ARM processors.
TEXT ·twistPointMovCond(SB),NOSPLIT,$0
	MOVD cond+24(FP), R3
	MOVD b+16(FP), b_ptr
	MOVD a+8(FP), a_ptr
	MOVD res+0(FP), res_ptr

	CMP $0, R3	// EQ <=> cond == 0 <=> take b

	// lanes 0..2
	LDP 0*16(a_ptr), (R4, R5)
	LDP 0*16(b_ptr), (R10, R11)
	LDP 1*16(a_ptr), (R6, R7)
	LDP 1*16(b_ptr), (R12, R13)
	LDP 2*16(a_ptr), (R8, R9)
	LDP 2*16(b_ptr), (R14, R15)
	CSEL EQ, R10, R4, R4
	CSEL EQ, R11, R5, R5
	CSEL EQ, R12, R6, R6
	CSEL EQ, R13, R7, R7
	CSEL EQ, R14, R8, R8
	CSEL EQ, R15, R9, R9
	STP (R4, R5), 0*16(res_ptr)
	STP (R6, R7), 1*16(res_ptr)
	STP (R8, R9), 2*16(res_ptr)

	// lanes 3..5
	LDP 3*16(a_ptr), (R4, R5)
	LDP 3*16(b_ptr), (R10, R11)
	LDP 4*16(a_ptr), (R6, R7)
	LDP 4*16(b_ptr), (R12, R13)
	LDP 5*16(a_ptr), (R8, R9)
	LDP 5*16(b_ptr), (R14, R15)
	CSEL EQ, R10, R4, R4
	CSEL EQ, R11, R5, R5
	CSEL EQ, R12, R6, R6
	CSEL EQ, R13, R7, R7
	CSEL EQ, R14, R8, R8
	CSEL EQ, R15, R9, R9
	STP (R4, R5), 3*16(res_ptr)
	STP (R6, R7), 4*16(res_ptr)
	STP (R8, R9), 5*16(res_ptr)

	// lanes 6..8
	LDP 6*16(a_ptr), (R4, R5)
	LDP 6*16(b_ptr), (R10, R11)
	LDP 7*16(a_ptr), (R6, R7)
	LDP 7*16(b_ptr), (R12, R13)
	LDP 8*16(a_ptr), (R8, R9)
	LDP 8*16(b_ptr), (R14, R15)
	CSEL EQ, R10, R4, R4
	CSEL EQ, R11, R5, R5
	CSEL EQ, R12, R6, R6
	CSEL EQ, R13, R7, R7
	CSEL EQ, R14, R8, R8
	CSEL EQ, R15, R9, R9
	STP (R4, R5), 6*16(res_ptr)
	STP (R6, R7), 7*16(res_ptr)
	STP (R8, R9), 8*16(res_ptr)

	// lanes 9..11
	LDP 9*16(a_ptr), (R4, R5)
	LDP 9*16(b_ptr), (R10, R11)
	LDP 10*16(a_ptr), (R6, R7)
	LDP 10*16(b_ptr), (R12, R13)
	LDP 11*16(a_ptr), (R8, R9)
	LDP 11*16(b_ptr), (R14, R15)
	CSEL EQ, R10, R4, R4
	CSEL EQ, R11, R5, R5
	CSEL EQ, R12, R6, R6
	CSEL EQ, R13, R7, R7
	CSEL EQ, R14, R8, R8
	CSEL EQ, R15, R9, R9
	STP (R4, R5), 9*16(res_ptr)
	STP (R6, R7), 10*16(res_ptr)
	STP (R8, R9), 11*16(res_ptr)

	// lanes 12..14
	LDP 12*16(a_ptr), (R4, R5)
	LDP 12*16(b_ptr), (R10, R11)
	LDP 13*16(a_ptr), (R6, R7)
	LDP 13*16(b_ptr), (R12, R13)
	LDP 14*16(a_ptr), (R8, R9)
	LDP 14*16(b_ptr), (R14, R15)
	CSEL EQ, R10, R4, R4
	CSEL EQ, R11, R5, R5
	CSEL EQ, R12, R6, R6
	CSEL EQ, R13, R7, R7
	CSEL EQ, R14, R8, R8
	CSEL EQ, R15, R9, R9
	STP (R4, R5), 12*16(res_ptr)
	STP (R6, R7), 13*16(res_ptr)
	STP (R8, R9), 14*16(res_ptr)

	// lane 15 (tail)
	LDP 15*16(a_ptr), (R4, R5)
	LDP 15*16(b_ptr), (R10, R11)
	CSEL EQ, R10, R4, R4
	CSEL EQ, R11, R5, R5
	STP (R4, R5), 15*16(res_ptr)

	RET
|