// Mirror of https://github.com/emmansun/gmsm.git (synced 2025-04-25 19:56:18 +08:00).
//go:build !purego

#include "textflag.h"

// Register aliases used by every routine in this file.
#define res_ptr DI
#define x_ptr SI
#define y_ptr CX
// func gfpCopy(res, a *gfP)
// Copies the 32-byte gfP element at a into res.
TEXT ·gfpCopy(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+8(FP), x_ptr

	// Dispatch on the package-level AVX2 capability flag.
	CMPB ·supportAVX2+0(SB), $0x01
	JEQ copygfp_avx2

	// SSE path: two unaligned 16-byte moves cover all 32 bytes.
	MOVOU (16*0)(x_ptr), X0
	MOVOU (16*1)(x_ptr), X1

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)

	RET

copygfp_avx2:
	// AVX2 path: a single unaligned 32-byte move.
	VMOVDQU (x_ptr), Y0
	VMOVDQU Y0, (res_ptr)
	// Clear upper YMM state before returning to SSE/Go code.
	VZEROUPPER
	RET
// func gfp2Copy(res, a *gfP2)
// Copies the 64-byte gfP2 element at a into res.
TEXT ·gfp2Copy(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+8(FP), x_ptr

	// Dispatch on the package-level AVX2 capability flag.
	CMPB ·supportAVX2+0(SB), $0x01
	JEQ copygfp2_avx2

	// SSE path: four unaligned 16-byte moves.
	MOVOU (16*0)(x_ptr), X0
	MOVOU (16*1)(x_ptr), X1
	MOVOU (16*2)(x_ptr), X2
	MOVOU (16*3)(x_ptr), X3

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)
	RET

copygfp2_avx2:
	// AVX2 path: two unaligned 32-byte moves.
	VMOVDQU (32*0)(x_ptr), Y0
	VMOVDQU (32*1)(x_ptr), Y1

	VMOVDQU Y0, (32*0)(res_ptr)
	VMOVDQU Y1, (32*1)(res_ptr)

	// Clear upper YMM state before returning to SSE/Go code.
	VZEROUPPER
	RET
// func gfp4Copy(res, a *gfP4)
// Copies the 128-byte gfP4 element at a into res.
TEXT ·gfp4Copy(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+8(FP), x_ptr

	// Dispatch on the package-level AVX2 capability flag.
	CMPB ·supportAVX2+0(SB), $0x01
	JEQ copygfp4_avx2

	// SSE path: eight unaligned 16-byte moves.
	MOVOU (16*0)(x_ptr), X0
	MOVOU (16*1)(x_ptr), X1
	MOVOU (16*2)(x_ptr), X2
	MOVOU (16*3)(x_ptr), X3

	MOVOU (16*4)(x_ptr), X4
	MOVOU (16*5)(x_ptr), X5
	MOVOU (16*6)(x_ptr), X6
	MOVOU (16*7)(x_ptr), X7

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)

	MOVOU X4, (16*4)(res_ptr)
	MOVOU X5, (16*5)(res_ptr)
	MOVOU X6, (16*6)(res_ptr)
	MOVOU X7, (16*7)(res_ptr)

	RET

copygfp4_avx2:
	// AVX2 path: four unaligned 32-byte moves.
	VMOVDQU (32*0)(x_ptr), Y0
	VMOVDQU (32*1)(x_ptr), Y1
	VMOVDQU (32*2)(x_ptr), Y2
	VMOVDQU (32*3)(x_ptr), Y3

	VMOVDQU Y0, (32*0)(res_ptr)
	VMOVDQU Y1, (32*1)(res_ptr)
	VMOVDQU Y2, (32*2)(res_ptr)
	VMOVDQU Y3, (32*3)(res_ptr)

	// Clear upper YMM state before returning to SSE/Go code.
	VZEROUPPER
	RET
// func gfp6Copy(res, a *gfP6)
// Copies the 192-byte gfP6 element at a into res.
TEXT ·gfp6Copy(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+8(FP), x_ptr

	// Dispatch on the package-level AVX2 capability flag.
	CMPB ·supportAVX2+0(SB), $0x01
	JEQ copygfp6_avx2

	// SSE path: twelve unaligned 16-byte moves.
	MOVOU (16*0)(x_ptr), X0
	MOVOU (16*1)(x_ptr), X1
	MOVOU (16*2)(x_ptr), X2
	MOVOU (16*3)(x_ptr), X3

	MOVOU (16*4)(x_ptr), X4
	MOVOU (16*5)(x_ptr), X5
	MOVOU (16*6)(x_ptr), X6
	MOVOU (16*7)(x_ptr), X7

	MOVOU (16*8)(x_ptr), X8
	MOVOU (16*9)(x_ptr), X9
	MOVOU (16*10)(x_ptr), X10
	MOVOU (16*11)(x_ptr), X11

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)

	MOVOU X4, (16*4)(res_ptr)
	MOVOU X5, (16*5)(res_ptr)
	MOVOU X6, (16*6)(res_ptr)
	MOVOU X7, (16*7)(res_ptr)

	MOVOU X8, (16*8)(res_ptr)
	MOVOU X9, (16*9)(res_ptr)
	MOVOU X10, (16*10)(res_ptr)
	MOVOU X11, (16*11)(res_ptr)

	RET

copygfp6_avx2:
	// AVX2 path: six unaligned 32-byte moves.
	VMOVDQU (32*0)(x_ptr), Y0
	VMOVDQU (32*1)(x_ptr), Y1
	VMOVDQU (32*2)(x_ptr), Y2
	VMOVDQU (32*3)(x_ptr), Y3
	VMOVDQU (32*4)(x_ptr), Y4
	VMOVDQU (32*5)(x_ptr), Y5

	VMOVDQU Y0, (32*0)(res_ptr)
	VMOVDQU Y1, (32*1)(res_ptr)
	VMOVDQU Y2, (32*2)(res_ptr)
	VMOVDQU Y3, (32*3)(res_ptr)
	VMOVDQU Y4, (32*4)(res_ptr)
	VMOVDQU Y5, (32*5)(res_ptr)

	// Clear upper YMM state before returning to SSE/Go code.
	VZEROUPPER
	RET
// func gfp12Copy(res, a *gfP12)
// Copies the 384-byte gfP12 element at a into res.
TEXT ·gfp12Copy(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+8(FP), x_ptr

	// Dispatch on the package-level AVX2 capability flag.
	CMPB ·supportAVX2+0(SB), $0x01
	JEQ copygfp12_avx2

	// SSE path: 24 unaligned 16-byte moves, done in three
	// rounds of eight so only X0-X7 are needed.

	// Round 1: bytes 0..127.
	MOVOU (16*0)(x_ptr), X0
	MOVOU (16*1)(x_ptr), X1
	MOVOU (16*2)(x_ptr), X2
	MOVOU (16*3)(x_ptr), X3

	MOVOU (16*4)(x_ptr), X4
	MOVOU (16*5)(x_ptr), X5
	MOVOU (16*6)(x_ptr), X6
	MOVOU (16*7)(x_ptr), X7

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)

	MOVOU X4, (16*4)(res_ptr)
	MOVOU X5, (16*5)(res_ptr)
	MOVOU X6, (16*6)(res_ptr)
	MOVOU X7, (16*7)(res_ptr)

	// Round 2: bytes 128..255.
	MOVOU (16*8)(x_ptr), X0
	MOVOU (16*9)(x_ptr), X1
	MOVOU (16*10)(x_ptr), X2
	MOVOU (16*11)(x_ptr), X3

	MOVOU (16*12)(x_ptr), X4
	MOVOU (16*13)(x_ptr), X5
	MOVOU (16*14)(x_ptr), X6
	MOVOU (16*15)(x_ptr), X7

	MOVOU X0, (16*8)(res_ptr)
	MOVOU X1, (16*9)(res_ptr)
	MOVOU X2, (16*10)(res_ptr)
	MOVOU X3, (16*11)(res_ptr)

	MOVOU X4, (16*12)(res_ptr)
	MOVOU X5, (16*13)(res_ptr)
	MOVOU X6, (16*14)(res_ptr)
	MOVOU X7, (16*15)(res_ptr)

	// Round 3: bytes 256..383.
	MOVOU (16*16)(x_ptr), X0
	MOVOU (16*17)(x_ptr), X1
	MOVOU (16*18)(x_ptr), X2
	MOVOU (16*19)(x_ptr), X3

	MOVOU (16*20)(x_ptr), X4
	MOVOU (16*21)(x_ptr), X5
	MOVOU (16*22)(x_ptr), X6
	MOVOU (16*23)(x_ptr), X7

	MOVOU X0, (16*16)(res_ptr)
	MOVOU X1, (16*17)(res_ptr)
	MOVOU X2, (16*18)(res_ptr)
	MOVOU X3, (16*19)(res_ptr)

	MOVOU X4, (16*20)(res_ptr)
	MOVOU X5, (16*21)(res_ptr)
	MOVOU X6, (16*22)(res_ptr)
	MOVOU X7, (16*23)(res_ptr)

	RET

copygfp12_avx2:
	// AVX2 path: twelve unaligned 32-byte moves.
	VMOVDQU (32*0)(x_ptr), Y0
	VMOVDQU (32*1)(x_ptr), Y1
	VMOVDQU (32*2)(x_ptr), Y2
	VMOVDQU (32*3)(x_ptr), Y3

	VMOVDQU (32*4)(x_ptr), Y4
	VMOVDQU (32*5)(x_ptr), Y5
	VMOVDQU (32*6)(x_ptr), Y6
	VMOVDQU (32*7)(x_ptr), Y7

	VMOVDQU (32*8)(x_ptr), Y8
	VMOVDQU (32*9)(x_ptr), Y9
	VMOVDQU (32*10)(x_ptr), Y10
	VMOVDQU (32*11)(x_ptr), Y11

	VMOVDQU Y0, (32*0)(res_ptr)
	VMOVDQU Y1, (32*1)(res_ptr)
	VMOVDQU Y2, (32*2)(res_ptr)
	VMOVDQU Y3, (32*3)(res_ptr)

	VMOVDQU Y4, (32*4)(res_ptr)
	VMOVDQU Y5, (32*5)(res_ptr)
	VMOVDQU Y6, (32*6)(res_ptr)
	VMOVDQU Y7, (32*7)(res_ptr)

	VMOVDQU Y8, (32*8)(res_ptr)
	VMOVDQU Y9, (32*9)(res_ptr)
	VMOVDQU Y10, (32*10)(res_ptr)
	VMOVDQU Y11, (32*11)(res_ptr)

	// Clear upper YMM state before returning to SSE/Go code.
	VZEROUPPER
	RET
// func gfP12MovCond(res, a, b *gfP12, cond int)
// Branch-free conditional move over the 384-byte gfP12:
//   res = a  if cond != 0
//   res = b  if cond == 0
// The selection is computed with masks rather than branches so the
// chosen operand does not leak through timing.
TEXT ·gfP12MovCond(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+8(FP), x_ptr
	MOVQ b+16(FP), y_ptr
	MOVQ cond+24(FP), X12   // low quadword of X12 = cond

	// Dispatch on the package-level AVX2 capability flag.
	CMPB ·supportAVX2+0(SB), $0x01
	JEQ move_avx2

	// Build the selection mask:
	// X12 = all-ones if cond == 0, all-zeros otherwise.
	PXOR X13, X13
	PSHUFD $0, X12, X12
	PCMPEQL X13, X12

	// Round 1 (bytes 0..95): res = (a &^ mask) ^ (b & mask).
	MOVOU X12, X0
	MOVOU (16*0)(x_ptr), X6
	PANDN X6, X0            // X0 = a &^ mask

	MOVOU X12, X1
	MOVOU (16*1)(x_ptr), X7
	PANDN X7, X1

	MOVOU X12, X2
	MOVOU (16*2)(x_ptr), X8
	PANDN X8, X2

	MOVOU X12, X3
	MOVOU (16*3)(x_ptr), X9
	PANDN X9, X3

	MOVOU X12, X4
	MOVOU (16*4)(x_ptr), X10
	PANDN X10, X4

	MOVOU X12, X5
	MOVOU (16*5)(x_ptr), X11
	PANDN X11, X5

	MOVOU (16*0)(y_ptr), X6
	MOVOU (16*1)(y_ptr), X7
	MOVOU (16*2)(y_ptr), X8
	MOVOU (16*3)(y_ptr), X9
	MOVOU (16*4)(y_ptr), X10
	MOVOU (16*5)(y_ptr), X11

	PAND X12, X6            // b & mask
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0             // combine the disjoint halves
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)
	MOVOU X4, (16*4)(res_ptr)
	MOVOU X5, (16*5)(res_ptr)

	// Round 2 (bytes 96..191).
	MOVOU X12, X0
	MOVOU (16*6)(x_ptr), X6
	PANDN X6, X0

	MOVOU X12, X1
	MOVOU (16*7)(x_ptr), X7
	PANDN X7, X1

	MOVOU X12, X2
	MOVOU (16*8)(x_ptr), X8
	PANDN X8, X2

	MOVOU X12, X3
	MOVOU (16*9)(x_ptr), X9
	PANDN X9, X3

	MOVOU X12, X4
	MOVOU (16*10)(x_ptr), X10
	PANDN X10, X4

	MOVOU X12, X5
	MOVOU (16*11)(x_ptr), X11
	PANDN X11, X5

	MOVOU (16*6)(y_ptr), X6
	MOVOU (16*7)(y_ptr), X7
	MOVOU (16*8)(y_ptr), X8
	MOVOU (16*9)(y_ptr), X9
	MOVOU (16*10)(y_ptr), X10
	MOVOU (16*11)(y_ptr), X11

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*6)(res_ptr)
	MOVOU X1, (16*7)(res_ptr)
	MOVOU X2, (16*8)(res_ptr)
	MOVOU X3, (16*9)(res_ptr)
	MOVOU X4, (16*10)(res_ptr)
	MOVOU X5, (16*11)(res_ptr)

	// Round 3 (bytes 192..287).
	MOVOU X12, X0
	MOVOU (16*12)(x_ptr), X6
	PANDN X6, X0

	MOVOU X12, X1
	MOVOU (16*13)(x_ptr), X7
	PANDN X7, X1

	MOVOU X12, X2
	MOVOU (16*14)(x_ptr), X8
	PANDN X8, X2

	MOVOU X12, X3
	MOVOU (16*15)(x_ptr), X9
	PANDN X9, X3

	MOVOU X12, X4
	MOVOU (16*16)(x_ptr), X10
	PANDN X10, X4

	MOVOU X12, X5
	MOVOU (16*17)(x_ptr), X11
	PANDN X11, X5

	MOVOU (16*12)(y_ptr), X6
	MOVOU (16*13)(y_ptr), X7
	MOVOU (16*14)(y_ptr), X8
	MOVOU (16*15)(y_ptr), X9
	MOVOU (16*16)(y_ptr), X10
	MOVOU (16*17)(y_ptr), X11

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*12)(res_ptr)
	MOVOU X1, (16*13)(res_ptr)
	MOVOU X2, (16*14)(res_ptr)
	MOVOU X3, (16*15)(res_ptr)
	MOVOU X4, (16*16)(res_ptr)
	MOVOU X5, (16*17)(res_ptr)

	// Round 4 (bytes 288..383).
	MOVOU X12, X0
	MOVOU (16*18)(x_ptr), X6
	PANDN X6, X0

	MOVOU X12, X1
	MOVOU (16*19)(x_ptr), X7
	PANDN X7, X1

	MOVOU X12, X2
	MOVOU (16*20)(x_ptr), X8
	PANDN X8, X2

	MOVOU X12, X3
	MOVOU (16*21)(x_ptr), X9
	PANDN X9, X3

	MOVOU X12, X4
	MOVOU (16*22)(x_ptr), X10
	PANDN X10, X4

	MOVOU X12, X5
	MOVOU (16*23)(x_ptr), X11
	PANDN X11, X5

	MOVOU (16*18)(y_ptr), X6
	MOVOU (16*19)(y_ptr), X7
	MOVOU (16*20)(y_ptr), X8
	MOVOU (16*21)(y_ptr), X9
	MOVOU (16*22)(y_ptr), X10
	MOVOU (16*23)(y_ptr), X11

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*18)(res_ptr)
	MOVOU X1, (16*19)(res_ptr)
	MOVOU X2, (16*20)(res_ptr)
	MOVOU X3, (16*21)(res_ptr)
	MOVOU X4, (16*22)(res_ptr)
	MOVOU X5, (16*23)(res_ptr)

	RET

move_avx2:
	// Build the 256-bit mask: Y12 = all-ones if cond == 0.
	VPXOR Y13, Y13, Y13
	VPBROADCASTD X12, Y12
	VPCMPEQD Y13, Y12, Y12

	// Round 1 (bytes 0..191): res = (a &^ mask) ^ (b & mask).
	VPANDN (32*0)(x_ptr), Y12, Y0
	VPANDN (32*1)(x_ptr), Y12, Y1
	VPANDN (32*2)(x_ptr), Y12, Y2
	VPANDN (32*3)(x_ptr), Y12, Y3
	VPANDN (32*4)(x_ptr), Y12, Y4
	VPANDN (32*5)(x_ptr), Y12, Y5

	VPAND (32*0)(y_ptr), Y12, Y6
	VPAND (32*1)(y_ptr), Y12, Y7
	VPAND (32*2)(y_ptr), Y12, Y8
	VPAND (32*3)(y_ptr), Y12, Y9
	VPAND (32*4)(y_ptr), Y12, Y10
	VPAND (32*5)(y_ptr), Y12, Y11

	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3
	VPXOR Y10, Y4, Y4
	VPXOR Y11, Y5, Y5

	VMOVDQU Y0, (32*0)(res_ptr)
	VMOVDQU Y1, (32*1)(res_ptr)
	VMOVDQU Y2, (32*2)(res_ptr)
	VMOVDQU Y3, (32*3)(res_ptr)
	VMOVDQU Y4, (32*4)(res_ptr)
	VMOVDQU Y5, (32*5)(res_ptr)

	// Round 2 (bytes 192..383).
	VPANDN (32*6)(x_ptr), Y12, Y0
	VPANDN (32*7)(x_ptr), Y12, Y1
	VPANDN (32*8)(x_ptr), Y12, Y2
	VPANDN (32*9)(x_ptr), Y12, Y3
	VPANDN (32*10)(x_ptr), Y12, Y4
	VPANDN (32*11)(x_ptr), Y12, Y5

	VPAND (32*6)(y_ptr), Y12, Y6
	VPAND (32*7)(y_ptr), Y12, Y7
	VPAND (32*8)(y_ptr), Y12, Y8
	VPAND (32*9)(y_ptr), Y12, Y9
	VPAND (32*10)(y_ptr), Y12, Y10
	VPAND (32*11)(y_ptr), Y12, Y11

	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3
	VPXOR Y10, Y4, Y4
	VPXOR Y11, Y5, Y5

	VMOVDQU Y0, (32*6)(res_ptr)
	VMOVDQU Y1, (32*7)(res_ptr)
	VMOVDQU Y2, (32*8)(res_ptr)
	VMOVDQU Y3, (32*9)(res_ptr)
	VMOVDQU Y4, (32*10)(res_ptr)
	VMOVDQU Y5, (32*11)(res_ptr)

	// Clear upper YMM state before returning to SSE/Go code.
	VZEROUPPER
	RET
// func curvePointMovCond(res, a, b *curvePoint, cond int)
// Branch-free conditional move over the 128-byte curvePoint:
//   res = a  if cond != 0
//   res = b  if cond == 0
// Mask-based selection keeps the choice timing-invariant.
TEXT ·curvePointMovCond(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+8(FP), x_ptr
	MOVQ b+16(FP), y_ptr
	MOVQ cond+24(FP), X12   // low quadword of X12 = cond

	// Dispatch on the package-level AVX2 capability flag.
	CMPB ·supportAVX2+0(SB), $0x01
	JEQ move_avx2

	// Build the selection mask:
	// X12 = all-ones if cond == 0, all-zeros otherwise.
	PXOR X13, X13
	PSHUFD $0, X12, X12
	PCMPEQL X13, X12

	// Round 1 (bytes 0..95): res = (a &^ mask) ^ (b & mask).
	MOVOU X12, X0
	MOVOU (16*0)(x_ptr), X6
	PANDN X6, X0            // X0 = a &^ mask

	MOVOU X12, X1
	MOVOU (16*1)(x_ptr), X7
	PANDN X7, X1

	MOVOU X12, X2
	MOVOU (16*2)(x_ptr), X8
	PANDN X8, X2

	MOVOU X12, X3
	MOVOU (16*3)(x_ptr), X9
	PANDN X9, X3

	MOVOU X12, X4
	MOVOU (16*4)(x_ptr), X10
	PANDN X10, X4

	MOVOU X12, X5
	MOVOU (16*5)(x_ptr), X11
	PANDN X11, X5

	MOVOU (16*0)(y_ptr), X6
	MOVOU (16*1)(y_ptr), X7
	MOVOU (16*2)(y_ptr), X8
	MOVOU (16*3)(y_ptr), X9
	MOVOU (16*4)(y_ptr), X10
	MOVOU (16*5)(y_ptr), X11

	PAND X12, X6            // b & mask
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0             // combine the disjoint halves
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)
	MOVOU X4, (16*4)(res_ptr)
	MOVOU X5, (16*5)(res_ptr)

	// Round 2 (bytes 96..127): remaining two 16-byte lanes.
	MOVOU X12, X0
	MOVOU (16*6)(x_ptr), X6
	PANDN X6, X0

	MOVOU X12, X1
	MOVOU (16*7)(x_ptr), X7
	PANDN X7, X1

	MOVOU (16*6)(y_ptr), X6
	MOVOU (16*7)(y_ptr), X7

	PAND X12, X6
	PAND X12, X7

	PXOR X6, X0
	PXOR X7, X1

	MOVOU X0, (16*6)(res_ptr)
	MOVOU X1, (16*7)(res_ptr)

	RET

move_avx2:
	// Build the 256-bit mask: Y12 = all-ones if cond == 0.
	VPXOR Y13, Y13, Y13
	VPBROADCASTD X12, Y12
	VPCMPEQD Y13, Y12, Y12

	// Four 32-byte lanes: res = (a &^ mask) ^ (b & mask).
	VPANDN (32*0)(x_ptr), Y12, Y0
	VPANDN (32*1)(x_ptr), Y12, Y1
	VPANDN (32*2)(x_ptr), Y12, Y2
	VPANDN (32*3)(x_ptr), Y12, Y3

	VPAND (32*0)(y_ptr), Y12, Y6
	VPAND (32*1)(y_ptr), Y12, Y7
	VPAND (32*2)(y_ptr), Y12, Y8
	VPAND (32*3)(y_ptr), Y12, Y9

	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	VMOVDQU Y0, (32*0)(res_ptr)
	VMOVDQU Y1, (32*1)(res_ptr)
	VMOVDQU Y2, (32*2)(res_ptr)
	VMOVDQU Y3, (32*3)(res_ptr)

	// Clear upper YMM state before returning to SSE/Go code.
	VZEROUPPER
	RET
// func twistPointMovCond(res, a, b *twistPoint, cond int)
// Branch-free conditional move over the 256-byte twistPoint:
//   res = a  if cond != 0
//   res = b  if cond == 0
// Mask-based selection keeps the choice timing-invariant.
TEXT ·twistPointMovCond(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+8(FP), x_ptr
	MOVQ b+16(FP), y_ptr
	MOVQ cond+24(FP), X12   // low quadword of X12 = cond

	// Dispatch on the package-level AVX2 capability flag.
	CMPB ·supportAVX2+0(SB), $0x01
	JEQ move_avx2

	// Build the selection mask:
	// X12 = all-ones if cond == 0, all-zeros otherwise.
	PXOR X13, X13
	PSHUFD $0, X12, X12
	PCMPEQL X13, X12

	// Round 1 (bytes 0..95): res = (a &^ mask) ^ (b & mask).
	MOVOU X12, X0
	MOVOU (16*0)(x_ptr), X6
	PANDN X6, X0            // X0 = a &^ mask

	MOVOU X12, X1
	MOVOU (16*1)(x_ptr), X7
	PANDN X7, X1

	MOVOU X12, X2
	MOVOU (16*2)(x_ptr), X8
	PANDN X8, X2

	MOVOU X12, X3
	MOVOU (16*3)(x_ptr), X9
	PANDN X9, X3

	MOVOU X12, X4
	MOVOU (16*4)(x_ptr), X10
	PANDN X10, X4

	MOVOU X12, X5
	MOVOU (16*5)(x_ptr), X11
	PANDN X11, X5

	MOVOU (16*0)(y_ptr), X6
	MOVOU (16*1)(y_ptr), X7
	MOVOU (16*2)(y_ptr), X8
	MOVOU (16*3)(y_ptr), X9
	MOVOU (16*4)(y_ptr), X10
	MOVOU (16*5)(y_ptr), X11

	PAND X12, X6            // b & mask
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0             // combine the disjoint halves
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)
	MOVOU X4, (16*4)(res_ptr)
	MOVOU X5, (16*5)(res_ptr)

	// Round 2 (bytes 96..191).
	MOVOU X12, X0
	MOVOU (16*6)(x_ptr), X6
	PANDN X6, X0

	MOVOU X12, X1
	MOVOU (16*7)(x_ptr), X7
	PANDN X7, X1

	MOVOU X12, X2
	MOVOU (16*8)(x_ptr), X8
	PANDN X8, X2

	MOVOU X12, X3
	MOVOU (16*9)(x_ptr), X9
	PANDN X9, X3

	MOVOU X12, X4
	MOVOU (16*10)(x_ptr), X10
	PANDN X10, X4

	MOVOU X12, X5
	MOVOU (16*11)(x_ptr), X11
	PANDN X11, X5

	MOVOU (16*6)(y_ptr), X6
	MOVOU (16*7)(y_ptr), X7
	MOVOU (16*8)(y_ptr), X8
	MOVOU (16*9)(y_ptr), X9
	MOVOU (16*10)(y_ptr), X10
	MOVOU (16*11)(y_ptr), X11

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*6)(res_ptr)
	MOVOU X1, (16*7)(res_ptr)
	MOVOU X2, (16*8)(res_ptr)
	MOVOU X3, (16*9)(res_ptr)
	MOVOU X4, (16*10)(res_ptr)
	MOVOU X5, (16*11)(res_ptr)

	// Round 3 (bytes 192..255): remaining four 16-byte lanes.
	MOVOU X12, X0
	MOVOU (16*12)(x_ptr), X6
	PANDN X6, X0

	MOVOU X12, X1
	MOVOU (16*13)(x_ptr), X7
	PANDN X7, X1

	MOVOU X12, X2
	MOVOU (16*14)(x_ptr), X8
	PANDN X8, X2

	MOVOU X12, X3
	MOVOU (16*15)(x_ptr), X9
	PANDN X9, X3

	MOVOU (16*12)(y_ptr), X6
	MOVOU (16*13)(y_ptr), X7
	MOVOU (16*14)(y_ptr), X8
	MOVOU (16*15)(y_ptr), X9

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3

	MOVOU X0, (16*12)(res_ptr)
	MOVOU X1, (16*13)(res_ptr)
	MOVOU X2, (16*14)(res_ptr)
	MOVOU X3, (16*15)(res_ptr)

	RET

move_avx2:
	// Build the 256-bit mask: Y12 = all-ones if cond == 0.
	VPXOR Y13, Y13, Y13
	VPBROADCASTD X12, Y12
	VPCMPEQD Y13, Y12, Y12

	// Round 1 (bytes 0..191): res = (a &^ mask) ^ (b & mask).
	VPANDN (32*0)(x_ptr), Y12, Y0
	VPANDN (32*1)(x_ptr), Y12, Y1
	VPANDN (32*2)(x_ptr), Y12, Y2
	VPANDN (32*3)(x_ptr), Y12, Y3
	VPANDN (32*4)(x_ptr), Y12, Y4
	VPANDN (32*5)(x_ptr), Y12, Y5

	VPAND (32*0)(y_ptr), Y12, Y6
	VPAND (32*1)(y_ptr), Y12, Y7
	VPAND (32*2)(y_ptr), Y12, Y8
	VPAND (32*3)(y_ptr), Y12, Y9
	VPAND (32*4)(y_ptr), Y12, Y10
	VPAND (32*5)(y_ptr), Y12, Y11

	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3
	VPXOR Y10, Y4, Y4
	VPXOR Y11, Y5, Y5

	VMOVDQU Y0, (32*0)(res_ptr)
	VMOVDQU Y1, (32*1)(res_ptr)
	VMOVDQU Y2, (32*2)(res_ptr)
	VMOVDQU Y3, (32*3)(res_ptr)
	VMOVDQU Y4, (32*4)(res_ptr)
	VMOVDQU Y5, (32*5)(res_ptr)

	// Round 2 (bytes 192..255): remaining two 32-byte lanes.
	VPANDN (32*6)(x_ptr), Y12, Y0
	VPANDN (32*7)(x_ptr), Y12, Y1

	VPAND (32*6)(y_ptr), Y12, Y6
	VPAND (32*7)(y_ptr), Y12, Y7

	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1

	VMOVDQU Y0, (32*6)(res_ptr)
	VMOVDQU Y1, (32*7)(res_ptr)

	// Clear upper YMM state before returning to SSE/Go code.
	VZEROUPPER
	RET