sm9/bn256: gfP12MovCond asm implementation for amd64 & arm64

This commit is contained in:
Sun Yimin 2023-06-21 14:35:58 +08:00 committed by GitHub
parent 9bcbf9bae9
commit aa535e3aaa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 469 additions and 1 deletions

View File

@ -213,7 +213,7 @@ func (table *GTFieldTable) Select(p *GT, n uint8) {
p.p.SetOne() p.p.SetOne()
for i, f := range table { for i, f := range table {
cond := subtle.ConstantTimeByteEq(uint8(i+1), n) cond := subtle.ConstantTimeByteEq(uint8(i+1), n)
p.p.Select(f.p, p.p, cond) gfP12MovCond(p.p, f.p, p.p, cond)
} }
} }

296
sm9/bn256/select_amd64.s Normal file
View File

@ -0,0 +1,296 @@
//go:build amd64 && !purego
// +build amd64,!purego
#include "textflag.h"
#define res_ptr DI
#define x_ptr SI
#define y_ptr CX
// func gfP12MovCond(res, a, b *gfP12, cond int)
//
// Conditional copy that never branches on cond: res = b when cond == 0,
// res = a otherwise. Both a and b are always read in full and the result is
// formed with AND/ANDN/XOR against a mask, so the only branch taken is the
// CPU-feature dispatch on supportAVX2.
//
// Notes:
//  - Only the low 32 bits of cond are inspected (the mask is built from a
//    broadcast of dword 0 compared against zero).
//  - Exactly 384 bytes are processed (24 x 16 on the XMM path, 12 x 32 on the
//    YMM path); assumes sizeof(gfP12) == 384 — confirm against the type.
//  - Every chunk is loaded from a and b before it is stored to res at the same
//    offset, so res may be the same pointer as a or b.
TEXT ·gfP12MovCond(SB),NOSPLIT,$0
MOVQ res+0(FP), res_ptr
MOVQ a+8(FP), x_ptr
MOVQ b+16(FP), y_ptr
MOVQ cond+24(FP), X12
// Dispatch on the Go-side supportAVX2 flag; the 256-bit path is below.
CMPB ·supportAVX2+0(SB), $0x01
JEQ move_avx2
// 128-bit (XMM) path.
// Build the mask in X12: broadcast the low dword of cond to all four lanes,
// then compare with zero. X12 = all ones if cond == 0, all zeros otherwise.
PXOR X13, X13
PSHUFD $0, X12, X12
PCMPEQL X13, X12
// Bytes 0-95: X0..X5 = ~mask & a (PANDN computes ~dst & src).
MOVOU X12, X0
MOVOU (16*0)(x_ptr), X6
PANDN X6, X0
MOVOU X12, X1
MOVOU (16*1)(x_ptr), X7
PANDN X7, X1
MOVOU X12, X2
MOVOU (16*2)(x_ptr), X8
PANDN X8, X2
MOVOU X12, X3
MOVOU (16*3)(x_ptr), X9
PANDN X9, X3
MOVOU X12, X4
MOVOU (16*4)(x_ptr), X10
PANDN X10, X4
MOVOU X12, X5
MOVOU (16*5)(x_ptr), X11
PANDN X11, X5
// X6..X11 = mask & b.
MOVOU (16*0)(y_ptr), X6
MOVOU (16*1)(y_ptr), X7
MOVOU (16*2)(y_ptr), X8
MOVOU (16*3)(y_ptr), X9
MOVOU (16*4)(y_ptr), X10
MOVOU (16*5)(y_ptr), X11
PAND X12, X6
PAND X12, X7
PAND X12, X8
PAND X12, X9
PAND X12, X10
PAND X12, X11
// Merge: exactly one of the two masked operands is non-zero per lane.
PXOR X6, X0
PXOR X7, X1
PXOR X8, X2
PXOR X9, X3
PXOR X10, X4
PXOR X11, X5
MOVOU X0, (16*0)(res_ptr)
MOVOU X1, (16*1)(res_ptr)
MOVOU X2, (16*2)(res_ptr)
MOVOU X3, (16*3)(res_ptr)
MOVOU X4, (16*4)(res_ptr)
MOVOU X5, (16*5)(res_ptr)
// Bytes 96-191: same sequence as above.
MOVOU X12, X0
MOVOU (16*6)(x_ptr), X6
PANDN X6, X0
MOVOU X12, X1
MOVOU (16*7)(x_ptr), X7
PANDN X7, X1
MOVOU X12, X2
MOVOU (16*8)(x_ptr), X8
PANDN X8, X2
MOVOU X12, X3
MOVOU (16*9)(x_ptr), X9
PANDN X9, X3
MOVOU X12, X4
MOVOU (16*10)(x_ptr), X10
PANDN X10, X4
MOVOU X12, X5
MOVOU (16*11)(x_ptr), X11
PANDN X11, X5
MOVOU (16*6)(y_ptr), X6
MOVOU (16*7)(y_ptr), X7
MOVOU (16*8)(y_ptr), X8
MOVOU (16*9)(y_ptr), X9
MOVOU (16*10)(y_ptr), X10
MOVOU (16*11)(y_ptr), X11
PAND X12, X6
PAND X12, X7
PAND X12, X8
PAND X12, X9
PAND X12, X10
PAND X12, X11
PXOR X6, X0
PXOR X7, X1
PXOR X8, X2
PXOR X9, X3
PXOR X10, X4
PXOR X11, X5
MOVOU X0, (16*6)(res_ptr)
MOVOU X1, (16*7)(res_ptr)
MOVOU X2, (16*8)(res_ptr)
MOVOU X3, (16*9)(res_ptr)
MOVOU X4, (16*10)(res_ptr)
MOVOU X5, (16*11)(res_ptr)
// Bytes 192-287: same sequence as above.
MOVOU X12, X0
MOVOU (16*12)(x_ptr), X6
PANDN X6, X0
MOVOU X12, X1
MOVOU (16*13)(x_ptr), X7
PANDN X7, X1
MOVOU X12, X2
MOVOU (16*14)(x_ptr), X8
PANDN X8, X2
MOVOU X12, X3
MOVOU (16*15)(x_ptr), X9
PANDN X9, X3
MOVOU X12, X4
MOVOU (16*16)(x_ptr), X10
PANDN X10, X4
MOVOU X12, X5
MOVOU (16*17)(x_ptr), X11
PANDN X11, X5
MOVOU (16*12)(y_ptr), X6
MOVOU (16*13)(y_ptr), X7
MOVOU (16*14)(y_ptr), X8
MOVOU (16*15)(y_ptr), X9
MOVOU (16*16)(y_ptr), X10
MOVOU (16*17)(y_ptr), X11
PAND X12, X6
PAND X12, X7
PAND X12, X8
PAND X12, X9
PAND X12, X10
PAND X12, X11
PXOR X6, X0
PXOR X7, X1
PXOR X8, X2
PXOR X9, X3
PXOR X10, X4
PXOR X11, X5
MOVOU X0, (16*12)(res_ptr)
MOVOU X1, (16*13)(res_ptr)
MOVOU X2, (16*14)(res_ptr)
MOVOU X3, (16*15)(res_ptr)
MOVOU X4, (16*16)(res_ptr)
MOVOU X5, (16*17)(res_ptr)
// Bytes 288-383: same sequence as above (final chunk).
MOVOU X12, X0
MOVOU (16*18)(x_ptr), X6
PANDN X6, X0
MOVOU X12, X1
MOVOU (16*19)(x_ptr), X7
PANDN X7, X1
MOVOU X12, X2
MOVOU (16*20)(x_ptr), X8
PANDN X8, X2
MOVOU X12, X3
MOVOU (16*21)(x_ptr), X9
PANDN X9, X3
MOVOU X12, X4
MOVOU (16*22)(x_ptr), X10
PANDN X10, X4
MOVOU X12, X5
MOVOU (16*23)(x_ptr), X11
PANDN X11, X5
MOVOU (16*18)(y_ptr), X6
MOVOU (16*19)(y_ptr), X7
MOVOU (16*20)(y_ptr), X8
MOVOU (16*21)(y_ptr), X9
MOVOU (16*22)(y_ptr), X10
MOVOU (16*23)(y_ptr), X11
PAND X12, X6
PAND X12, X7
PAND X12, X8
PAND X12, X9
PAND X12, X10
PAND X12, X11
PXOR X6, X0
PXOR X7, X1
PXOR X8, X2
PXOR X9, X3
PXOR X10, X4
PXOR X11, X5
MOVOU X0, (16*18)(res_ptr)
MOVOU X1, (16*19)(res_ptr)
MOVOU X2, (16*20)(res_ptr)
MOVOU X3, (16*21)(res_ptr)
MOVOU X4, (16*22)(res_ptr)
MOVOU X5, (16*23)(res_ptr)
RET
// 256-bit (YMM) path, taken when supportAVX2 == 1.
move_avx2:
// Build the mask in Y12: broadcast the low dword of cond to all eight lanes,
// then compare with zero. Y12 = all ones if cond == 0, all zeros otherwise.
VPXOR Y13, Y13, Y13
VPBROADCASTD X12, Y12
VPCMPEQD Y13, Y12, Y12
// Bytes 0-191: Y0..Y5 = ~mask & a, read straight from memory.
VPANDN (32*0)(x_ptr), Y12, Y0
VPANDN (32*1)(x_ptr), Y12, Y1
VPANDN (32*2)(x_ptr), Y12, Y2
VPANDN (32*3)(x_ptr), Y12, Y3
VPANDN (32*4)(x_ptr), Y12, Y4
VPANDN (32*5)(x_ptr), Y12, Y5
// Y6..Y11 = mask & b.
VPAND (32*0)(y_ptr), Y12, Y6
VPAND (32*1)(y_ptr), Y12, Y7
VPAND (32*2)(y_ptr), Y12, Y8
VPAND (32*3)(y_ptr), Y12, Y9
VPAND (32*4)(y_ptr), Y12, Y10
VPAND (32*5)(y_ptr), Y12, Y11
// Merge the two masked halves and store.
VPXOR Y6, Y0, Y0
VPXOR Y7, Y1, Y1
VPXOR Y8, Y2, Y2
VPXOR Y9, Y3, Y3
VPXOR Y10, Y4, Y4
VPXOR Y11, Y5, Y5
VMOVDQU Y0, (32*0)(res_ptr)
VMOVDQU Y1, (32*1)(res_ptr)
VMOVDQU Y2, (32*2)(res_ptr)
VMOVDQU Y3, (32*3)(res_ptr)
VMOVDQU Y4, (32*4)(res_ptr)
VMOVDQU Y5, (32*5)(res_ptr)
// Bytes 192-383: same sequence as above (final chunk).
VPANDN (32*6)(x_ptr), Y12, Y0
VPANDN (32*7)(x_ptr), Y12, Y1
VPANDN (32*8)(x_ptr), Y12, Y2
VPANDN (32*9)(x_ptr), Y12, Y3
VPANDN (32*10)(x_ptr), Y12, Y4
VPANDN (32*11)(x_ptr), Y12, Y5
VPAND (32*6)(y_ptr), Y12, Y6
VPAND (32*7)(y_ptr), Y12, Y7
VPAND (32*8)(y_ptr), Y12, Y8
VPAND (32*9)(y_ptr), Y12, Y9
VPAND (32*10)(y_ptr), Y12, Y10
VPAND (32*11)(y_ptr), Y12, Y11
VPXOR Y6, Y0, Y0
VPXOR Y7, Y1, Y1
VPXOR Y8, Y2, Y2
VPXOR Y9, Y3, Y3
VPXOR Y10, Y4, Y4
VPXOR Y11, Y5, Y5
VMOVDQU Y0, (32*6)(res_ptr)
VMOVDQU Y1, (32*7)(res_ptr)
VMOVDQU Y2, (32*8)(res_ptr)
VMOVDQU Y3, (32*9)(res_ptr)
VMOVDQU Y4, (32*10)(res_ptr)
VMOVDQU Y5, (32*11)(res_ptr)
// Zero the upper halves of the YMM registers before returning to code that
// may execute legacy SSE instructions.
VZEROUPPER
RET

151
sm9/bn256/select_arm64.s Normal file
View File

@ -0,0 +1,151 @@
//go:build arm64 && !purego
// +build arm64,!purego
#include "textflag.h"
#define res_ptr R0
#define a_ptr R1
#define b_ptr R2
/* ---------------------------------------*/
// func gfP12MovCond(res, a, b *gfP12, cond int)
//
// Conditional copy that never branches on cond: res = b when cond == 0,
// res = a otherwise. Both inputs are always loaded in full and each 64-bit
// word is chosen with CSEL.
//
// The copy runs in 8 rounds of 48 bytes (three LDP/STP pairs), 384 bytes in
// total; assumes sizeof(gfP12) == 384 — confirm against the type.
// Each round loads from a and b before storing to res at the same offsets, so
// res may be the same pointer as a or b.
//
// Register use: R4-R9 hold words of a and receive the selected result;
// R16, R17, R19-R22 hold the matching words of b (R18 is skipped, presumably
// because it is the reserved platform register — confirm).
TEXT ·gfP12MovCond(SB),NOSPLIT,$0
MOVD res+0(FP), res_ptr
MOVD a+8(FP), a_ptr
MOVD b+16(FP), b_ptr
MOVD cond+24(FP), R3
// Set the condition flags once (EQ <=> cond == 0). None of the LDP, STP or
// CSEL instructions below modify the flags, so every CSEL tests this result.
CMP $0, R3
// Two remarks:
// 1) Will want to revisit NEON, when support is better
// 2) CSEL might not be constant time on all ARM processors
// Bytes 0-47: load a and b, keep b if EQ (cond == 0) else a, store to res.
LDP 0*16(a_ptr), (R4, R5)
LDP 1*16(a_ptr), (R6, R7)
LDP 2*16(a_ptr), (R8, R9)
LDP 0*16(b_ptr), (R16, R17)
LDP 1*16(b_ptr), (R19, R20)
LDP 2*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 0*16(res_ptr)
STP (R6, R7), 1*16(res_ptr)
STP (R8, R9), 2*16(res_ptr)
// Bytes 48-95.
LDP 3*16(a_ptr), (R4, R5)
LDP 4*16(a_ptr), (R6, R7)
LDP 5*16(a_ptr), (R8, R9)
LDP 3*16(b_ptr), (R16, R17)
LDP 4*16(b_ptr), (R19, R20)
LDP 5*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 3*16(res_ptr)
STP (R6, R7), 4*16(res_ptr)
STP (R8, R9), 5*16(res_ptr)
// Bytes 96-143.
LDP 6*16(a_ptr), (R4, R5)
LDP 7*16(a_ptr), (R6, R7)
LDP 8*16(a_ptr), (R8, R9)
LDP 6*16(b_ptr), (R16, R17)
LDP 7*16(b_ptr), (R19, R20)
LDP 8*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 6*16(res_ptr)
STP (R6, R7), 7*16(res_ptr)
STP (R8, R9), 8*16(res_ptr)
// Bytes 144-191.
LDP 9*16(a_ptr), (R4, R5)
LDP 10*16(a_ptr), (R6, R7)
LDP 11*16(a_ptr), (R8, R9)
LDP 9*16(b_ptr), (R16, R17)
LDP 10*16(b_ptr), (R19, R20)
LDP 11*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 9*16(res_ptr)
STP (R6, R7), 10*16(res_ptr)
STP (R8, R9), 11*16(res_ptr)
// Bytes 192-239.
LDP 12*16(a_ptr), (R4, R5)
LDP 13*16(a_ptr), (R6, R7)
LDP 14*16(a_ptr), (R8, R9)
LDP 12*16(b_ptr), (R16, R17)
LDP 13*16(b_ptr), (R19, R20)
LDP 14*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 12*16(res_ptr)
STP (R6, R7), 13*16(res_ptr)
STP (R8, R9), 14*16(res_ptr)
// Bytes 240-287.
LDP 15*16(a_ptr), (R4, R5)
LDP 16*16(a_ptr), (R6, R7)
LDP 17*16(a_ptr), (R8, R9)
LDP 15*16(b_ptr), (R16, R17)
LDP 16*16(b_ptr), (R19, R20)
LDP 17*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 15*16(res_ptr)
STP (R6, R7), 16*16(res_ptr)
STP (R8, R9), 17*16(res_ptr)
// Bytes 288-335.
LDP 18*16(a_ptr), (R4, R5)
LDP 19*16(a_ptr), (R6, R7)
LDP 20*16(a_ptr), (R8, R9)
LDP 18*16(b_ptr), (R16, R17)
LDP 19*16(b_ptr), (R19, R20)
LDP 20*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 18*16(res_ptr)
STP (R6, R7), 19*16(res_ptr)
STP (R8, R9), 20*16(res_ptr)
// Bytes 336-383 (final round).
LDP 21*16(a_ptr), (R4, R5)
LDP 22*16(a_ptr), (R6, R7)
LDP 23*16(a_ptr), (R8, R9)
LDP 21*16(b_ptr), (R16, R17)
LDP 22*16(b_ptr), (R19, R20)
LDP 23*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 21*16(res_ptr)
STP (R6, R7), 22*16(res_ptr)
STP (R8, R9), 23*16(res_ptr)
RET

13
sm9/bn256/select_decl.go Normal file
View File

@ -0,0 +1,13 @@
//go:build (amd64 && !purego) || (arm64 && !purego)
// +build amd64,!purego arm64,!purego
package bn256
import "golang.org/x/sys/cpu"
// supportAVX2 caches cpu.X86.HasAVX2. The amd64 assembly implementation of
// gfP12MovCond reads it to choose between its 128-bit and 256-bit code paths.
var supportAVX2 = cpu.X86.HasAVX2
// gfP12MovCond sets res = b if cond is 0, and res = a otherwise.
//
// It is implemented in assembly (select_amd64.s and select_arm64.s) and does
// not branch on cond. The amd64 implementation inspects only the low 32 bits
// of cond, so callers should pass a small value such as 0 or 1.
//
//go:noescape
func gfP12MovCond(res, a, b *gfP12, cond int)

View File

@ -0,0 +1,8 @@
//go:build (!amd64 && !arm64) || purego
// +build !amd64,!arm64 purego
package bn256
// gfP12MovCond is the pure-Go fallback built when neither the amd64 nor the
// arm64 assembly implementation is selected (or when the purego tag is set).
// It delegates to gfP12.Select.
//
// NOTE(review): the assembly versions set res = b when cond == 0 and res = a
// otherwise; confirm that Select(a, b, cond) follows the same convention.
func gfP12MovCond(res, a, b *gfP12, cond int) {
res.Select(a, b, cond)
}