// Mirror of https://github.com/emmansun/gmsm.git (synced 2025-04-25 19:56:18 +08:00).
// 1308 lines, 20 KiB. NOTE: the mirror page labeled this file "ArmAsm", but the
// content is x86-64 (amd64) Go assembly, not ARM.
//go:build !purego
|
|
|
|
#include "textflag.h"
|
|
|
|
#include "p256_macros_amd64.s"
|
|
|
|
/* ---------------------------------------*/
|
|
// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
|
|
TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0
|
|
JMP ·p256BigToLittle(SB)
|
|
/* ---------------------------------------*/
|
|
// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
|
|
TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0
|
|
JMP ·p256BigToLittle(SB)
|
|
/* ---------------------------------------*/
|
|
// func p256LittleToBig(res *[32]byte, in *p256Element)
|
|
TEXT ·p256LittleToBig(SB),NOSPLIT,$0
|
|
JMP ·p256BigToLittle(SB)
|
|
/* ---------------------------------------*/
|
|
// func p256BigToLittle(res *p256Element, in *[32]byte)
|
|
TEXT ·p256BigToLittle(SB),NOSPLIT,$0
|
|
MOVQ res+0(FP), res_ptr
|
|
MOVQ in+8(FP), x_ptr
|
|
|
|
MOVQ (8*0)(x_ptr), acc0
|
|
MOVQ (8*1)(x_ptr), acc1
|
|
MOVQ (8*2)(x_ptr), acc2
|
|
MOVQ (8*3)(x_ptr), acc3
|
|
|
|
BSWAPQ acc0
|
|
BSWAPQ acc1
|
|
BSWAPQ acc2
|
|
BSWAPQ acc3
|
|
|
|
MOVQ acc3, (8*0)(res_ptr)
|
|
MOVQ acc2, (8*1)(res_ptr)
|
|
MOVQ acc1, (8*2)(res_ptr)
|
|
MOVQ acc0, (8*3)(res_ptr)
|
|
|
|
RET
|
|
/* ---------------------------------------*/
|
|
// func p256MovCond(res, a, b *SM2P256Point, cond int)
|
|
TEXT ·p256MovCond(SB),NOSPLIT,$0
|
|
MOVQ res+0(FP), res_ptr
|
|
MOVQ a+8(FP), x_ptr
|
|
MOVQ b+16(FP), y_ptr
|
|
MOVQ cond+24(FP), X12
|
|
|
|
CMPB ·supportAVX2+0(SB), $0x01
|
|
JEQ move_avx2
|
|
|
|
PXOR X13, X13
|
|
PSHUFD $0, X12, X12
|
|
PCMPEQL X13, X12
|
|
|
|
MOVOU X12, X0
|
|
MOVOU (16*0)(x_ptr), X6
|
|
PANDN X6, X0
|
|
|
|
MOVOU X12, X1
|
|
MOVOU (16*1)(x_ptr), X7
|
|
PANDN X7, X1
|
|
|
|
MOVOU X12, X2
|
|
MOVOU (16*2)(x_ptr), X8
|
|
PANDN X8, X2
|
|
|
|
MOVOU X12, X3
|
|
MOVOU (16*3)(x_ptr), X9
|
|
PANDN X9, X3
|
|
|
|
MOVOU X12, X4
|
|
MOVOU (16*4)(x_ptr), X10
|
|
PANDN X10, X4
|
|
|
|
MOVOU X12, X5
|
|
MOVOU (16*5)(x_ptr), X11
|
|
PANDN X11, X5
|
|
|
|
MOVOU (16*0)(y_ptr), X6
|
|
MOVOU (16*1)(y_ptr), X7
|
|
MOVOU (16*2)(y_ptr), X8
|
|
MOVOU (16*3)(y_ptr), X9
|
|
MOVOU (16*4)(y_ptr), X10
|
|
MOVOU (16*5)(y_ptr), X11
|
|
|
|
PAND X12, X6
|
|
PAND X12, X7
|
|
PAND X12, X8
|
|
PAND X12, X9
|
|
PAND X12, X10
|
|
PAND X12, X11
|
|
|
|
PXOR X6, X0
|
|
PXOR X7, X1
|
|
PXOR X8, X2
|
|
PXOR X9, X3
|
|
PXOR X10, X4
|
|
PXOR X11, X5
|
|
|
|
MOVOU X0, (16*0)(res_ptr)
|
|
MOVOU X1, (16*1)(res_ptr)
|
|
MOVOU X2, (16*2)(res_ptr)
|
|
MOVOU X3, (16*3)(res_ptr)
|
|
MOVOU X4, (16*4)(res_ptr)
|
|
MOVOU X5, (16*5)(res_ptr)
|
|
|
|
RET
|
|
|
|
move_avx2:
|
|
VPXOR Y13, Y13, Y13
|
|
VPBROADCASTD X12, Y12
|
|
VPCMPEQD Y13, Y12, Y12
|
|
|
|
VPANDN (32*0)(x_ptr), Y12, Y0
|
|
VPANDN (32*1)(x_ptr), Y12, Y1
|
|
VPANDN (32*2)(x_ptr), Y12, Y2
|
|
|
|
VPAND (32*0)(y_ptr), Y12, Y3
|
|
VPAND (32*1)(y_ptr), Y12, Y4
|
|
VPAND (32*2)(y_ptr), Y12, Y5
|
|
|
|
VPXOR Y3, Y0, Y0
|
|
VPXOR Y4, Y1, Y1
|
|
VPXOR Y5, Y2, Y2
|
|
|
|
VMOVDQU Y0, (32*0)(res_ptr)
|
|
VMOVDQU Y1, (32*1)(res_ptr)
|
|
VMOVDQU Y2, (32*2)(res_ptr)
|
|
|
|
VZEROUPPER
|
|
RET
|
|
|
|
/* ---------------------------------------*/
|
|
// func p256NegCond(val *p256Element, cond int)
|
|
TEXT ·p256NegCond(SB),NOSPLIT,$0
|
|
MOVQ val+0(FP), res_ptr
|
|
MOVQ cond+8(FP), t0
|
|
// acc = poly
|
|
MOVQ $-1, acc0
|
|
MOVQ p256p<>+0x08(SB), acc1
|
|
MOVQ $-1, acc2
|
|
MOVQ p256p<>+0x18(SB), acc3
|
|
// Load the original value
|
|
MOVQ (8*0)(res_ptr), acc4
|
|
MOVQ (8*1)(res_ptr), x_ptr
|
|
MOVQ (8*2)(res_ptr), y_ptr
|
|
MOVQ (8*3)(res_ptr), acc5
|
|
// Speculatively subtract
|
|
SUBQ acc4, acc0
|
|
SBBQ x_ptr, acc1
|
|
SBBQ y_ptr, acc2
|
|
SBBQ acc5, acc3
|
|
// If condition is 0, keep original value
|
|
TESTQ t0, t0
|
|
CMOVQEQ acc4, acc0
|
|
CMOVQEQ x_ptr, acc1
|
|
CMOVQEQ y_ptr, acc2
|
|
CMOVQEQ acc5, acc3
|
|
// Store result
|
|
MOVQ acc0, (8*0)(res_ptr)
|
|
MOVQ acc1, (8*1)(res_ptr)
|
|
MOVQ acc2, (8*2)(res_ptr)
|
|
MOVQ acc3, (8*3)(res_ptr)
|
|
|
|
RET
|
|
|
|
/* ---------------------------------------*/
|
|
// func p256Mul(res, in1, in2 *p256Element)
|
|
TEXT ·p256Mul(SB),NOSPLIT,$0
|
|
MOVQ in1+8(FP), x_ptr
|
|
MOVQ in2+16(FP), y_ptr
|
|
|
|
CMPB ·supportBMI2+0(SB), $0x01
|
|
JEQ mulBMI2
|
|
|
|
// x * y[0]
|
|
MOVQ (8*0)(y_ptr), t0
|
|
|
|
MOVQ (8*0)(x_ptr), AX
|
|
MULQ t0
|
|
MOVQ AX, acc0
|
|
MOVQ DX, acc1
|
|
|
|
MOVQ (8*1)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ AX, acc1
|
|
ADCQ $0, DX
|
|
MOVQ DX, acc2
|
|
|
|
MOVQ (8*2)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ AX, acc2
|
|
ADCQ $0, DX
|
|
MOVQ DX, acc3
|
|
|
|
MOVQ (8*3)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ AX, acc3
|
|
ADCQ $0, DX
|
|
MOVQ DX, acc4
|
|
XORQ acc5, acc5
|
|
// First reduction step
|
|
MOVQ acc0, AX
|
|
MOVQ acc0, DX
|
|
SHLQ $32, AX
|
|
SHRQ $32, DX
|
|
|
|
SUBQ AX, acc1
|
|
SBBQ DX, acc2
|
|
SBBQ AX, acc3
|
|
MOVQ acc0, AX
|
|
SBBQ DX, acc0
|
|
|
|
ADDQ AX, acc1
|
|
ADCQ $0, acc2
|
|
ADCQ $0, acc3
|
|
ADCQ acc0, acc4
|
|
ADCQ $0, acc5
|
|
|
|
XORQ acc0, acc0
|
|
// x * y[1]
|
|
MOVQ (8*1)(y_ptr), t0
|
|
|
|
MOVQ (8*0)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ AX, acc1
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ (8*1)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ BX, acc2
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc2
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ (8*2)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ BX, acc3
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc3
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ (8*3)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ BX, acc4
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc4
|
|
ADCQ DX, acc5
|
|
ADCQ $0, acc0
|
|
// Second reduction step
|
|
MOVQ acc1, AX
|
|
MOVQ acc1, DX
|
|
SHLQ $32, AX
|
|
SHRQ $32, DX
|
|
|
|
SUBQ AX, acc2
|
|
SBBQ DX, acc3
|
|
SBBQ AX, acc4
|
|
MOVQ acc1, AX
|
|
SBBQ DX, acc1
|
|
|
|
ADDQ AX, acc2
|
|
ADCQ $0, acc3
|
|
ADCQ $0, acc4
|
|
ADCQ acc1, acc5
|
|
ADCQ $0, acc0
|
|
|
|
XORQ acc1, acc1
|
|
// x * y[2]
|
|
MOVQ (8*2)(y_ptr), t0
|
|
|
|
MOVQ (8*0)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ AX, acc2
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ (8*1)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ BX, acc3
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc3
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ (8*2)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ BX, acc4
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc4
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ (8*3)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ BX, acc5
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc5
|
|
ADCQ DX, acc0
|
|
ADCQ $0, acc1
|
|
// Third reduction step
|
|
MOVQ acc2, AX
|
|
MOVQ acc2, DX
|
|
SHLQ $32, AX
|
|
SHRQ $32, DX
|
|
|
|
SUBQ AX, acc3
|
|
SBBQ DX, acc4
|
|
SBBQ AX, acc5
|
|
MOVQ acc2, AX
|
|
SBBQ DX, acc2
|
|
|
|
ADDQ AX, acc3
|
|
ADCQ $0, acc4
|
|
ADCQ $0, acc5
|
|
ADCQ acc2, acc0
|
|
ADCQ $0, acc1
|
|
|
|
XORQ acc2, acc2
|
|
// x * y[3]
|
|
MOVQ (8*3)(y_ptr), t0
|
|
|
|
MOVQ (8*0)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ AX, acc3
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ (8*1)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ BX, acc4
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc4
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ (8*2)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ BX, acc5
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc5
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ (8*3)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ BX, acc0
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc0
|
|
ADCQ DX, acc1
|
|
ADCQ $0, acc2
|
|
// Last reduction step
|
|
MOVQ acc3, AX
|
|
MOVQ acc3, DX
|
|
SHLQ $32, AX
|
|
SHRQ $32, DX
|
|
|
|
SUBQ AX, acc4
|
|
SBBQ DX, acc5
|
|
SBBQ AX, acc0
|
|
MOVQ acc3, AX
|
|
SBBQ DX, acc3
|
|
|
|
ADDQ AX, acc4
|
|
ADCQ $0, acc5
|
|
ADCQ $0, acc0
|
|
ADCQ acc3, acc1
|
|
ADCQ $0, acc2
|
|
|
|
MOVQ res+0(FP), res_ptr
|
|
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
|
|
RET
|
|
|
|
mulBMI2:
|
|
XORQ acc5, acc5
|
|
XORQ res_ptr, res_ptr
|
|
// x * y[0]
|
|
MOVQ (8*0)(y_ptr), DX
|
|
MULXQ (8*0)(x_ptr), acc0, acc1
|
|
|
|
MULXQ (8*1)(x_ptr), AX, acc2
|
|
ADCXQ AX, acc1
|
|
|
|
MULXQ (8*2)(x_ptr), AX, acc3
|
|
ADCXQ AX, acc2
|
|
|
|
MULXQ (8*3)(x_ptr), AX, acc4
|
|
ADCXQ AX, acc3
|
|
ADCXQ acc5, acc4
|
|
|
|
// First reduction step
|
|
MOVQ acc0, AX
|
|
MOVQ acc0, DX
|
|
SHLQ $32, AX
|
|
SHRQ $32, DX
|
|
|
|
SUBQ AX, acc1
|
|
SBBQ DX, acc2
|
|
SBBQ AX, acc3
|
|
MOVQ acc0, AX
|
|
SBBQ DX, acc0
|
|
|
|
ADOXQ AX, acc1
|
|
ADOXQ res_ptr, acc2
|
|
ADOXQ res_ptr, acc3
|
|
ADOXQ acc0, acc4
|
|
ADOXQ res_ptr, acc5
|
|
|
|
XORQ acc0, acc0
|
|
// x * y[1]
|
|
MOVQ (8*1)(y_ptr), DX
|
|
MULXQ (8*0)(x_ptr), AX, t0
|
|
ADOXQ AX, acc1
|
|
|
|
MULXQ (8*1)(x_ptr), AX, BX
|
|
ADCXQ t0, AX
|
|
ADOXQ AX, acc2
|
|
|
|
MULXQ (8*2)(x_ptr), AX, t0
|
|
ADCXQ BX, AX
|
|
ADOXQ AX, acc3
|
|
|
|
MULXQ (8*3)(x_ptr), AX, BX
|
|
ADCXQ t0, AX
|
|
ADOXQ AX, acc4
|
|
ADCXQ acc0, BX
|
|
ADOXQ BX, acc5
|
|
ADOXQ res_ptr, acc0
|
|
|
|
// Second reduction step
|
|
MOVQ acc1, AX
|
|
MOVQ acc1, DX
|
|
SHLQ $32, AX
|
|
SHRQ $32, DX
|
|
|
|
SUBQ AX, acc2
|
|
SBBQ DX, acc3
|
|
SBBQ AX, acc4
|
|
MOVQ acc1, AX
|
|
SBBQ DX, acc1
|
|
|
|
ADOXQ AX, acc2
|
|
ADOXQ res_ptr, acc3
|
|
ADOXQ res_ptr, acc4
|
|
ADOXQ acc1, acc5
|
|
ADOXQ res_ptr, acc0
|
|
|
|
XORQ acc1, acc1
|
|
// x * y[2]
|
|
MOVQ (8*2)(y_ptr), DX
|
|
MULXQ (8*0)(x_ptr), AX, t0
|
|
ADOXQ AX, acc2
|
|
|
|
MULXQ (8*1)(x_ptr), AX, BX
|
|
ADCXQ t0, AX
|
|
ADOXQ AX, acc3
|
|
|
|
MULXQ (8*2)(x_ptr), AX, t0
|
|
ADCXQ BX, AX
|
|
ADOXQ AX, acc4
|
|
|
|
MULXQ (8*3)(x_ptr), AX, BX
|
|
ADCXQ t0, AX
|
|
ADOXQ AX, acc5
|
|
ADCXQ res_ptr, BX
|
|
ADOXQ BX, acc0
|
|
ADOXQ res_ptr, acc1
|
|
|
|
// Third reduction step
|
|
MOVQ acc2, AX
|
|
MOVQ acc2, DX
|
|
SHLQ $32, AX
|
|
SHRQ $32, DX
|
|
|
|
SUBQ AX, acc3
|
|
SBBQ DX, acc4
|
|
SBBQ AX, acc5
|
|
MOVQ acc2, AX
|
|
SBBQ DX, acc2
|
|
|
|
ADOXQ AX, acc3
|
|
ADOXQ res_ptr, acc4
|
|
ADOXQ res_ptr, acc5
|
|
ADOXQ acc2, acc0
|
|
ADOXQ res_ptr, acc1
|
|
|
|
XORQ acc2, acc2
|
|
// x * y[3]
|
|
MOVQ (8*3)(y_ptr), DX
|
|
MULXQ (8*0)(x_ptr), AX, t0
|
|
ADOXQ AX, acc3
|
|
|
|
MULXQ (8*1)(x_ptr), AX, BX
|
|
ADCXQ t0, AX
|
|
ADOXQ AX, acc4
|
|
|
|
MULXQ (8*2)(x_ptr), AX, t0
|
|
ADCXQ BX, AX
|
|
ADOXQ AX, acc5
|
|
|
|
MULXQ (8*3)(x_ptr), AX, BX
|
|
ADCXQ t0, AX
|
|
ADOXQ AX, acc0
|
|
ADCXQ res_ptr, BX
|
|
ADOXQ BX, acc1
|
|
ADOXQ res_ptr, acc2
|
|
|
|
// Last reduction step
|
|
MOVQ acc3, AX
|
|
MOVQ acc3, DX
|
|
SHLQ $32, AX
|
|
SHRQ $32, DX
|
|
|
|
SUBQ AX, acc4
|
|
SBBQ DX, acc5
|
|
SBBQ AX, acc0
|
|
MOVQ acc3, AX
|
|
SBBQ DX, acc3
|
|
|
|
ADOXQ AX, acc4
|
|
ADOXQ res_ptr, acc5
|
|
ADOXQ res_ptr, acc0
|
|
ADOXQ acc3, acc1
|
|
ADOXQ res_ptr, acc2
|
|
|
|
MOVQ res+0(FP), res_ptr
|
|
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
|
|
RET
|
|
|
|
/* ---------------------------------------*/
|
|
// func p256FromMont(res, in *p256Element)
|
|
TEXT ·p256FromMont(SB),NOSPLIT,$0
|
|
MOVQ res+0(FP), res_ptr
|
|
MOVQ in+8(FP), x_ptr
|
|
|
|
MOVQ (8*0)(x_ptr), acc0
|
|
MOVQ (8*1)(x_ptr), acc1
|
|
MOVQ (8*2)(x_ptr), acc2
|
|
MOVQ (8*3)(x_ptr), acc3
|
|
XORQ acc4, acc4
|
|
|
|
// Only reduce, no multiplications are needed
|
|
// First stage
|
|
MOVQ acc0, AX
|
|
MOVQ acc0, DX
|
|
SHLQ $32, AX
|
|
SHRQ $32, DX
|
|
|
|
SUBQ AX, acc1
|
|
SBBQ DX, acc2
|
|
SBBQ AX, acc3
|
|
MOVQ acc0, AX
|
|
SBBQ DX, acc0
|
|
|
|
ADDQ AX, acc1
|
|
ADCQ $0, acc2
|
|
ADCQ $0, acc3
|
|
ADCQ acc0, acc4
|
|
|
|
XORQ acc5, acc5
|
|
// Second stage
|
|
MOVQ acc1, AX
|
|
MOVQ acc1, DX
|
|
SHLQ $32, AX
|
|
SHRQ $32, DX
|
|
|
|
SUBQ AX, acc2
|
|
SBBQ DX, acc3
|
|
SBBQ AX, acc4
|
|
MOVQ acc1, AX
|
|
SBBQ DX, acc5
|
|
|
|
ADDQ AX, acc2
|
|
ADCQ $0, acc3
|
|
ADCQ $0, acc4
|
|
ADCQ acc1, acc5
|
|
|
|
XORQ acc0, acc0
|
|
// Third stage
|
|
MOVQ acc2, AX
|
|
MOVQ acc2, DX
|
|
SHLQ $32, AX
|
|
SHRQ $32, DX
|
|
|
|
SUBQ AX, acc3
|
|
SBBQ DX, acc4
|
|
SBBQ AX, acc5
|
|
MOVQ acc2, AX
|
|
SBBQ DX, acc2
|
|
|
|
ADDQ AX, acc3
|
|
ADCQ $0, acc4
|
|
ADCQ $0, acc5
|
|
ADCQ acc2, acc0
|
|
|
|
XORQ acc1, acc1
|
|
// Last stage
|
|
MOVQ acc3, AX
|
|
MOVQ acc3, DX
|
|
SHLQ $32, AX
|
|
SHRQ $32, DX
|
|
|
|
SUBQ AX, acc4
|
|
SBBQ DX, acc5
|
|
SBBQ AX, acc0
|
|
MOVQ acc3, AX
|
|
SBBQ DX, acc3
|
|
|
|
ADDQ AX, acc4
|
|
ADCQ $0, acc5
|
|
ADCQ $0, acc0
|
|
ADCQ acc3, acc1
|
|
|
|
MOVQ acc4, x_ptr
|
|
MOVQ acc5, acc3
|
|
MOVQ acc0, t0
|
|
MOVQ acc1, BX
|
|
|
|
SUBQ $-1, acc4
|
|
SBBQ p256p<>+0x08(SB), acc5
|
|
SBBQ $-1, acc0
|
|
SBBQ p256p<>+0x018(SB), acc1
|
|
|
|
CMOVQCS x_ptr, acc4
|
|
CMOVQCS acc3, acc5
|
|
CMOVQCS t0, acc0
|
|
CMOVQCS BX, acc1
|
|
|
|
MOVQ acc4, (8*0)(res_ptr)
|
|
MOVQ acc5, (8*1)(res_ptr)
|
|
MOVQ acc0, (8*2)(res_ptr)
|
|
MOVQ acc1, (8*3)(res_ptr)
|
|
|
|
RET
|
|
/* ---------------------------------------*/
|
|
// func p256Select(res *SM2P256Point, table *p256Table, idx, limit int)
|
|
TEXT ·p256Select(SB),NOSPLIT,$0
|
|
//MOVQ idx+16(FP),AX
|
|
MOVQ table+8(FP),DI
|
|
MOVQ res+0(FP),DX
|
|
|
|
CMPB ·supportAVX2+0(SB), $0x01
|
|
JEQ select_avx2
|
|
|
|
PXOR X15, X15 // X15 = 0
|
|
PCMPEQL X14, X14 // X14 = -1
|
|
PSUBL X14, X15 // X15 = 1
|
|
MOVL idx+16(FP), X14
|
|
PSHUFD $0, X14, X14
|
|
|
|
PXOR X0, X0
|
|
PXOR X1, X1
|
|
PXOR X2, X2
|
|
PXOR X3, X3
|
|
PXOR X4, X4
|
|
PXOR X5, X5
|
|
MOVQ limit+24(FP),AX
|
|
|
|
MOVOU X15, X13
|
|
|
|
loop_select:
|
|
|
|
MOVOU X13, X12
|
|
PADDL X15, X13
|
|
PCMPEQL X14, X12
|
|
|
|
MOVOU (16*0)(DI), X6
|
|
MOVOU (16*1)(DI), X7
|
|
MOVOU (16*2)(DI), X8
|
|
MOVOU (16*3)(DI), X9
|
|
MOVOU (16*4)(DI), X10
|
|
MOVOU (16*5)(DI), X11
|
|
ADDQ $(16*6), DI
|
|
|
|
PAND X12, X6
|
|
PAND X12, X7
|
|
PAND X12, X8
|
|
PAND X12, X9
|
|
PAND X12, X10
|
|
PAND X12, X11
|
|
|
|
PXOR X6, X0
|
|
PXOR X7, X1
|
|
PXOR X8, X2
|
|
PXOR X9, X3
|
|
PXOR X10, X4
|
|
PXOR X11, X5
|
|
|
|
DECQ AX
|
|
JNE loop_select
|
|
|
|
MOVOU X0, (16*0)(DX)
|
|
MOVOU X1, (16*1)(DX)
|
|
MOVOU X2, (16*2)(DX)
|
|
MOVOU X3, (16*3)(DX)
|
|
MOVOU X4, (16*4)(DX)
|
|
MOVOU X5, (16*5)(DX)
|
|
|
|
RET
|
|
|
|
select_avx2:
|
|
VPXOR Y15, Y15, Y15
|
|
VPCMPEQD Y14, Y14, Y14
|
|
VPSUBD Y14, Y15, Y15 // Y15 = 1
|
|
VPBROADCASTD idx+16(FP), Y14
|
|
|
|
MOVQ limit+24(FP),AX
|
|
VMOVDQU Y15, Y13
|
|
|
|
VPXOR Y0, Y0, Y0
|
|
VPXOR Y1, Y1, Y1
|
|
VPXOR Y2, Y2, Y2
|
|
|
|
loop_select_avx2:
|
|
VPCMPEQD Y14, Y13, Y12
|
|
VPADDD Y15, Y13, Y13
|
|
|
|
VPAND (32*0)(DI), Y12, Y3
|
|
VPAND (32*1)(DI), Y12, Y4
|
|
VPAND (32*2)(DI), Y12, Y5
|
|
|
|
ADDQ $(32*3), DI
|
|
|
|
VPXOR Y3, Y0, Y0
|
|
VPXOR Y4, Y1, Y1
|
|
VPXOR Y5, Y2, Y2
|
|
|
|
DECQ AX
|
|
JNE loop_select_avx2
|
|
|
|
VMOVDQU Y0, (32*0)(DX)
|
|
VMOVDQU Y1, (32*1)(DX)
|
|
VMOVDQU Y2, (32*2)(DX)
|
|
VZEROUPPER
|
|
RET
|
|
|
|
/* ---------------------------------------*/
|
|
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
|
|
TEXT ·p256SelectAffine(SB),NOSPLIT,$0
|
|
MOVQ idx+16(FP),AX
|
|
MOVQ table+8(FP),DI
|
|
MOVQ res+0(FP),DX
|
|
|
|
CMPB ·supportAVX2+0(SB), $0x01
|
|
JEQ select_base_avx2
|
|
|
|
PXOR X15, X15 // X15 = 0
|
|
PCMPEQL X14, X14 // X14 = -1
|
|
PSUBL X14, X15 // X15 = 1
|
|
MOVL idx+16(FP), X14 // x14 = idx
|
|
PSHUFD $0, X14, X14
|
|
|
|
MOVQ $16, AX
|
|
MOVOU X15, X13
|
|
|
|
PXOR X0, X0
|
|
PXOR X1, X1
|
|
PXOR X2, X2
|
|
PXOR X3, X3
|
|
|
|
loop_select_base:
|
|
|
|
MOVOU X13, X12
|
|
PADDL X15, X13
|
|
PCMPEQL X14, X12
|
|
|
|
MOVOU (16*0)(DI), X4
|
|
MOVOU (16*1)(DI), X5
|
|
MOVOU (16*2)(DI), X6
|
|
MOVOU (16*3)(DI), X7
|
|
|
|
MOVOU (16*4)(DI), X8
|
|
MOVOU (16*5)(DI), X9
|
|
MOVOU (16*6)(DI), X10
|
|
MOVOU (16*7)(DI), X11
|
|
|
|
ADDQ $(16*8), DI
|
|
|
|
PAND X12, X4
|
|
PAND X12, X5
|
|
PAND X12, X6
|
|
PAND X12, X7
|
|
|
|
MOVOU X13, X12
|
|
PADDL X15, X13
|
|
PCMPEQL X14, X12
|
|
|
|
PAND X12, X8
|
|
PAND X12, X9
|
|
PAND X12, X10
|
|
PAND X12, X11
|
|
|
|
PXOR X4, X0
|
|
PXOR X5, X1
|
|
PXOR X6, X2
|
|
PXOR X7, X3
|
|
|
|
PXOR X8, X0
|
|
PXOR X9, X1
|
|
PXOR X10, X2
|
|
PXOR X11, X3
|
|
|
|
DECQ AX
|
|
JNE loop_select_base
|
|
|
|
MOVOU X0, (16*0)(DX)
|
|
MOVOU X1, (16*1)(DX)
|
|
MOVOU X2, (16*2)(DX)
|
|
MOVOU X3, (16*3)(DX)
|
|
|
|
RET
|
|
|
|
select_base_avx2:
|
|
VPXOR Y15, Y15, Y15
|
|
VPCMPEQD Y14, Y14, Y14
|
|
VPSUBD Y14, Y15, Y15
|
|
VPBROADCASTD idx+16(FP), Y14
|
|
|
|
MOVQ $16, AX
|
|
VMOVDQU Y15, Y13
|
|
VPXOR Y0, Y0, Y0
|
|
VPXOR Y1, Y1, Y1
|
|
|
|
loop_select_base_avx2:
|
|
VPCMPEQD Y14, Y13, Y12
|
|
VPADDD Y15, Y13, Y13
|
|
|
|
VPAND (32*0)(DI), Y12, Y2
|
|
VPAND (32*1)(DI), Y12, Y3
|
|
|
|
VPCMPEQD Y14, Y13, Y12
|
|
VPADDD Y15, Y13, Y13
|
|
|
|
VPAND (32*2)(DI), Y12, Y4
|
|
VPAND (32*3)(DI), Y12, Y5
|
|
|
|
ADDQ $(32*4), DI
|
|
|
|
VPXOR Y2, Y0, Y0
|
|
VPXOR Y3, Y1, Y1
|
|
|
|
VPXOR Y4, Y0, Y0
|
|
VPXOR Y5, Y1, Y1
|
|
|
|
DECQ AX
|
|
JNE loop_select_base_avx2
|
|
|
|
VMOVDQU Y0, (32*0)(DX)
|
|
VMOVDQU Y1, (32*1)(DX)
|
|
VZEROUPPER
|
|
RET
|
|
|
|
//func p256OrdReduce(s *p256OrdElement)
|
|
TEXT ·p256OrdReduce(SB),NOSPLIT,$0
|
|
MOVQ s+0(FP), res_ptr
|
|
MOVQ (8*0)(res_ptr), acc0
|
|
MOVQ (8*1)(res_ptr), acc1
|
|
MOVQ (8*2)(res_ptr), acc2
|
|
MOVQ (8*3)(res_ptr), acc3
|
|
XORQ acc4, acc4
|
|
p256OrdReduceInline(acc0, acc1, acc2, acc3, acc4, acc5, x_ptr, y_ptr, t0, res_ptr)
|
|
RET
|
|
|
|
// func p256OrdMul(res, in1, in2 *p256OrdElement)
|
|
TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
|
MOVQ in1+8(FP), x_ptr
|
|
MOVQ in2+16(FP), y_ptr
|
|
CMPB ·supportBMI2+0(SB), $0x01
|
|
JEQ ordMulBMI2
|
|
|
|
// x * y[0]
|
|
MOVQ (8*0)(y_ptr), t0
|
|
|
|
MOVQ (8*0)(x_ptr), AX
|
|
MULQ t0
|
|
MOVQ AX, acc0
|
|
MOVQ DX, acc1
|
|
|
|
MOVQ (8*1)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ AX, acc1
|
|
ADCQ $0, DX
|
|
MOVQ DX, acc2
|
|
|
|
MOVQ (8*2)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ AX, acc2
|
|
ADCQ $0, DX
|
|
MOVQ DX, acc3
|
|
|
|
MOVQ (8*3)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ AX, acc3
|
|
ADCQ $0, DX
|
|
MOVQ DX, acc4
|
|
XORQ acc5, acc5
|
|
// First reduction step
|
|
MOVQ acc0, AX
|
|
MULQ p256ordK0<>(SB)
|
|
MOVQ AX, t0
|
|
|
|
MOVQ p256ord<>+0x00(SB), AX
|
|
MULQ t0
|
|
ADDQ AX, acc0
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ t0, acc0
|
|
MOVQ t0, AX
|
|
MOVQ t0, DX
|
|
SHLQ $32, AX
|
|
SHRQ $32, DX
|
|
|
|
SUBQ t0, acc2
|
|
SBBQ AX, acc3
|
|
SBBQ DX, acc0
|
|
|
|
MOVQ p256ord<>+0x08(SB), AX
|
|
MULQ t0
|
|
ADDQ BX, acc1
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc1
|
|
ADCQ DX, acc2
|
|
ADCQ $0, acc3
|
|
ADCQ acc0, acc4
|
|
ADCQ $0, acc5
|
|
|
|
XORQ acc0, acc0 // It seems this line is optional.
|
|
// x * y[1]
|
|
MOVQ (8*1)(y_ptr), t0
|
|
|
|
MOVQ (8*0)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ AX, acc1
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ (8*1)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ BX, acc2
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc2
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ (8*2)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ BX, acc3
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc3
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ (8*3)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ BX, acc4
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc4
|
|
ADCQ DX, acc5
|
|
ADCQ $0, acc0
|
|
// Second reduction step
|
|
MOVQ acc1, AX
|
|
MULQ p256ordK0<>(SB)
|
|
MOVQ AX, t0
|
|
|
|
MOVQ p256ord<>+0x00(SB), AX
|
|
MULQ t0
|
|
ADDQ AX, acc1
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ t0, acc1
|
|
MOVQ t0, AX
|
|
MOVQ t0, DX
|
|
SHLQ $32, AX
|
|
SHRQ $32, DX
|
|
|
|
SUBQ t0, acc3
|
|
SBBQ AX, acc4
|
|
SBBQ DX, acc1
|
|
|
|
MOVQ p256ord<>+0x08(SB), AX
|
|
MULQ t0
|
|
ADDQ BX, acc2
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc2
|
|
ADCQ DX, acc3
|
|
ADCQ $0, acc4
|
|
ADCQ acc1, acc5
|
|
ADCQ $0, acc0
|
|
|
|
XORQ acc1, acc1 // It seems this line is optional.
|
|
// x * y[2]
|
|
MOVQ (8*2)(y_ptr), t0
|
|
|
|
MOVQ (8*0)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ AX, acc2
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ (8*1)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ BX, acc3
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc3
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ (8*2)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ BX, acc4
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc4
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ (8*3)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ BX, acc5
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc5
|
|
ADCQ DX, acc0
|
|
ADCQ $0, acc1
|
|
// Third reduction step
|
|
MOVQ acc2, AX
|
|
MULQ p256ordK0<>(SB)
|
|
MOVQ AX, t0
|
|
|
|
MOVQ p256ord<>+0x00(SB), AX
|
|
MULQ t0
|
|
ADDQ AX, acc2
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ t0, acc2
|
|
MOVQ t0, AX
|
|
MOVQ t0, DX
|
|
SHLQ $32, AX
|
|
SHRQ $32, DX
|
|
|
|
SUBQ t0, acc4
|
|
SBBQ AX, acc5
|
|
SBBQ DX, acc2
|
|
|
|
MOVQ p256ord<>+0x08(SB), AX
|
|
MULQ t0
|
|
ADDQ BX, acc3
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc3
|
|
ADCQ DX, acc4
|
|
ADCQ $0, acc5
|
|
ADCQ acc2, acc0
|
|
ADCQ $0, acc1
|
|
|
|
XORQ acc2, acc2 // It seems this line is optional.
|
|
// x * y[3]
|
|
MOVQ (8*3)(y_ptr), t0
|
|
|
|
MOVQ (8*0)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ AX, acc3
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ (8*1)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ BX, acc4
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc4
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ (8*2)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ BX, acc5
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc5
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ (8*3)(x_ptr), AX
|
|
MULQ t0
|
|
ADDQ BX, acc0
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc0
|
|
ADCQ DX, acc1
|
|
ADCQ $0, acc2
|
|
// Last reduction step
|
|
MOVQ acc3, AX
|
|
MULQ p256ordK0<>(SB)
|
|
MOVQ AX, t0
|
|
|
|
MOVQ p256ord<>+0x00(SB), AX
|
|
MULQ t0
|
|
ADDQ AX, acc3
|
|
ADCQ $0, DX
|
|
MOVQ DX, BX
|
|
|
|
MOVQ t0, acc3
|
|
MOVQ t0, AX
|
|
MOVQ t0, DX
|
|
SHLQ $32, AX
|
|
SHRQ $32, DX
|
|
|
|
SUBQ t0, acc5
|
|
SBBQ AX, acc0
|
|
SBBQ DX, acc3
|
|
|
|
MOVQ p256ord<>+0x08(SB), AX
|
|
MULQ t0
|
|
ADDQ BX, acc4
|
|
ADCQ $0, DX
|
|
ADDQ AX, acc4
|
|
ADCQ DX, acc5
|
|
ADCQ $0, acc0
|
|
ADCQ acc3, acc1
|
|
ADCQ $0, acc2
|
|
|
|
MOVQ res+0(FP), res_ptr
|
|
p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
|
|
|
|
RET
|
|
|
|
ordMulBMI2:
|
|
XORQ acc5, acc5
|
|
XORQ res_ptr, res_ptr
|
|
// x * y[0]
|
|
MOVQ (8*0)(y_ptr), DX
|
|
MULXQ (8*0)(x_ptr), acc0, acc1
|
|
|
|
MULXQ (8*1)(x_ptr), AX, acc2
|
|
ADCXQ AX, acc1
|
|
|
|
MULXQ (8*2)(x_ptr), AX, acc3
|
|
ADCXQ AX, acc2
|
|
|
|
MULXQ (8*3)(x_ptr), AX, acc4
|
|
ADCXQ AX, acc3
|
|
ADCXQ acc5, acc4
|
|
|
|
// First reduction step
|
|
MOVQ acc0, DX
|
|
MULXQ p256ordK0<>(SB), DX, AX
|
|
|
|
MULXQ p256ord<>+0x00(SB), AX, t0
|
|
ADOXQ AX, acc0
|
|
|
|
MULXQ p256ord<>+0x08(SB), AX, BX
|
|
ADCXQ t0, AX
|
|
ADOXQ AX, acc1
|
|
|
|
MULXQ p256ord<>+0x10(SB), AX, t0
|
|
ADCXQ BX, AX
|
|
ADOXQ AX, acc2
|
|
|
|
MULXQ p256ord<>+0x18(SB), AX, BX
|
|
ADCXQ t0, AX
|
|
ADOXQ AX, acc3
|
|
|
|
ADCXQ res_ptr, BX
|
|
ADOXQ BX, acc4
|
|
ADOXQ res_ptr, acc5
|
|
XORQ acc0, acc0 // It seems this line is optional.
|
|
|
|
// x * y[1]
|
|
MOVQ (8*1)(y_ptr), DX
|
|
MULXQ (8*0)(x_ptr), AX, t0
|
|
ADOXQ AX, acc1
|
|
|
|
MULXQ (8*1)(x_ptr), AX, BX
|
|
ADCXQ t0, AX
|
|
ADOXQ AX, acc2
|
|
|
|
MULXQ (8*2)(x_ptr), AX, t0
|
|
ADCXQ BX, AX
|
|
ADOXQ AX, acc3
|
|
|
|
MULXQ (8*3)(x_ptr), AX, BX
|
|
ADCXQ t0, AX
|
|
ADOXQ AX, acc4
|
|
|
|
ADCXQ acc0, BX
|
|
ADOXQ BX, acc5
|
|
ADOXQ res_ptr, acc0
|
|
|
|
// Second reduction step
|
|
MOVQ acc1, DX
|
|
MULXQ p256ordK0<>(SB), DX, AX
|
|
|
|
MULXQ p256ord<>+0x00(SB), AX, t0
|
|
ADOXQ AX, acc1
|
|
|
|
MULXQ p256ord<>+0x08(SB), AX, BX
|
|
ADCXQ t0, AX
|
|
ADOXQ AX, acc2
|
|
|
|
MULXQ p256ord<>+0x10(SB), AX, t0
|
|
ADCXQ BX, AX
|
|
ADOXQ AX, acc3
|
|
|
|
MULXQ p256ord<>+0x18(SB), AX, BX
|
|
ADCXQ t0, AX
|
|
ADOXQ AX, acc4
|
|
|
|
ADCXQ res_ptr, BX
|
|
ADOXQ BX, acc5
|
|
ADOXQ res_ptr, acc0
|
|
XORQ acc1, acc1 // It seems this line is optional.
|
|
|
|
// x * y[2]
|
|
MOVQ (8*2)(y_ptr), DX
|
|
MULXQ (8*0)(x_ptr), AX, t0
|
|
ADOXQ AX, acc2
|
|
|
|
MULXQ (8*1)(x_ptr), AX, BX
|
|
ADCXQ t0, AX
|
|
ADOXQ AX, acc3
|
|
|
|
MULXQ (8*2)(x_ptr), AX, t0
|
|
ADCXQ BX, AX
|
|
ADOXQ AX, acc4
|
|
|
|
MULXQ (8*3)(x_ptr), AX, BX
|
|
ADCXQ t0, AX
|
|
ADOXQ AX, acc5
|
|
|
|
ADCXQ res_ptr, BX
|
|
ADOXQ BX, acc0
|
|
ADOXQ res_ptr, acc1
|
|
|
|
// Third reduction step
|
|
MOVQ acc2, DX
|
|
MULXQ p256ordK0<>(SB), DX, AX
|
|
|
|
MULXQ p256ord<>+0x00(SB), AX, t0
|
|
ADOXQ AX, acc2
|
|
|
|
MULXQ p256ord<>+0x08(SB), AX, BX
|
|
ADCXQ t0, AX
|
|
ADOXQ AX, acc3
|
|
|
|
MULXQ p256ord<>+0x10(SB), AX, t0
|
|
ADCXQ BX, AX
|
|
ADOXQ AX, acc4
|
|
|
|
MULXQ p256ord<>+0x18(SB), AX, BX
|
|
ADCXQ t0, AX
|
|
ADOXQ AX, acc5
|
|
|
|
ADCXQ res_ptr, BX
|
|
ADOXQ BX, acc0
|
|
ADOXQ res_ptr, acc1
|
|
XORQ acc2, acc2 // It seems this line is optional.
|
|
|
|
// x * y[3]
|
|
MOVQ (8*3)(y_ptr), DX
|
|
MULXQ (8*0)(x_ptr), AX, t0
|
|
ADOXQ AX, acc3
|
|
|
|
MULXQ (8*1)(x_ptr), AX, BX
|
|
ADCXQ t0, AX
|
|
ADOXQ AX, acc4
|
|
|
|
MULXQ (8*2)(x_ptr), AX, t0
|
|
ADCXQ BX, AX
|
|
ADOXQ AX, acc5
|
|
|
|
MULXQ (8*3)(x_ptr), AX, BX
|
|
ADCXQ t0, AX
|
|
ADOXQ AX, acc0
|
|
|
|
ADCXQ res_ptr, BX
|
|
ADOXQ BX, acc1
|
|
ADOXQ res_ptr, acc2
|
|
|
|
// Last reduction step
|
|
MOVQ acc3, DX
|
|
MULXQ p256ordK0<>(SB), DX, AX
|
|
|
|
MULXQ p256ord<>+0x00(SB), AX, t0
|
|
ADOXQ AX, acc3
|
|
|
|
MULXQ p256ord<>+0x08(SB), AX, BX
|
|
ADCXQ t0, AX
|
|
ADOXQ AX, acc4
|
|
|
|
MULXQ p256ord<>+0x10(SB), AX, t0
|
|
ADCXQ BX, AX
|
|
ADOXQ AX, acc5
|
|
|
|
MULXQ p256ord<>+0x18(SB), AX, BX
|
|
ADCXQ t0, AX
|
|
ADOXQ AX, acc0
|
|
|
|
ADCXQ res_ptr, BX
|
|
ADOXQ BX, acc1
|
|
ADOXQ res_ptr, acc2
|
|
|
|
MOVQ res+0(FP), res_ptr
|
|
p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
|
|
|
|
RET
|