// This file contains a constant-time, 64-bit assembly implementation of
// P256. The optimizations performed here are described in detail in:
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
//                          256-bit primes"
// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf
//go:build amd64 && !purego && !plugin
// +build amd64,!purego,!plugin

#include "textflag.h"

// Pointer registers. Several routines below reuse x_ptr / y_ptr as plain
// scratch registers once the pointers are no longer needed.
#define res_ptr DI
#define x_ptr SI
#define y_ptr CX

// Accumulator limbs and temporaries used by the field arithmetic.
#define acc0 R8
#define acc1 R9
#define acc2 R10
#define acc3 R11
#define acc4 R12
#define acc5 R13
#define t0 R14
#define t1 R15

// All constants are stored as four 64-bit limbs, least significant limb first.

// p256p: the field prime p = 2^256 - 2^224 - 2^96 + 2^64 - 1.
DATA p256p<>+0x00(SB)/8, $0xffffffffffffffff
DATA p256p<>+0x08(SB)/8, $0xffffffff00000000
DATA p256p<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256p<>+0x18(SB)/8, $0xfffffffeffffffff
// p256ordK0: presumably the Montgomery constant -1/ord mod 2^64 used by the
// p256Ord* routines; it is not referenced in this part of the file - confirm.
DATA p256ordK0<>+0x00(SB)/8, $0x327f9e8872350975
// p256ord: the modulus subtracted by p256OrdReduceInline (presumably the
// group order - confirm against the curve parameters).
DATA p256ord<>+0x00(SB)/8, $0x53bbf40939d54123
DATA p256ord<>+0x08(SB)/8, $0x7203df6b21c6052b
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xfffffffeffffffff
// p256one: 2^256 - p, i.e. 2^256 mod p.
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0x00000000ffffffff
DATA p256one<>+0x10(SB)/8, $0x0000000000000000
DATA p256one<>+0x18(SB)/8, $0x0000000100000000
GLOBL p256p<>(SB), RODATA, $32
GLOBL p256ordK0<>(SB), RODATA, $8
GLOBL p256ord<>(SB), RODATA, $32
GLOBL p256one<>(SB), RODATA, $32

/* ---------------------------------------*/
// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
//
// Tail-jumps to p256BigToLittle: reversing all 32 bytes is its own inverse,
// so the same routine serves both conversion directions. The jump keeps the
// caller's argument frame, so res and in are read by the target unchanged.
TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0
	JMP ·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
//
// Tail-jumps to p256BigToLittle, which performs the 32-byte reversal using
// this function's own argument frame.
TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0
	JMP ·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256LittleToBig(res *[32]byte, in *p256Element)
//
// Tail-jumps to p256BigToLittle: reversing all 32 bytes is its own inverse,
// so the same routine serves both conversion directions.
TEXT ·p256LittleToBig(SB),NOSPLIT,$0
	JMP ·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256BigToLittle(res *p256Element, in *[32]byte)
//
// Reverses the byte order of the 32-byte value at in and stores it at res.
// All four words are loaded before any store is issued.
TEXT ·p256BigToLittle(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr

	// Load the four 64-bit words.
	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3

	// Reverse the bytes inside each word ...
	BSWAPQ acc0
	BSWAPQ acc1
	BSWAPQ acc2
	BSWAPQ acc3

	// ... and store the words in reverse order.
	MOVQ acc3, (8*0)(res_ptr)
	MOVQ acc2, (8*1)(res_ptr)
	MOVQ acc1, (8*2)(res_ptr)
	MOVQ acc0, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256MovCond(res, a, b *SM2P256Point, cond int)
//
// Branch-free select over 96 bytes: res = a when cond != 0, res = b when
// cond == 0. Only the low 32 bits of cond are examined (they are broadcast
// and compared as dwords). The only branch is on the supportAVX2 flag.
TEXT ·p256MovCond(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+8(FP), x_ptr
	MOVQ b+16(FP), y_ptr
	MOVQ cond+24(FP), X12

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ  move_avx2

	// X12 = all-ones when cond == 0, all-zeros otherwise.
	PXOR X13, X13
	PSHUFD $0, X12, X12
	PCMPEQL X13, X12

	// X0..X5 = a & ^mask
	MOVOU X12, X0
	MOVOU (16*0)(x_ptr), X6
	PANDN X6, X0

	MOVOU X12, X1
	MOVOU (16*1)(x_ptr), X7
	PANDN X7, X1

	MOVOU X12, X2
	MOVOU (16*2)(x_ptr), X8
	PANDN X8, X2

	MOVOU X12, X3
	MOVOU (16*3)(x_ptr), X9
	PANDN X9, X3

	MOVOU X12, X4
	MOVOU (16*4)(x_ptr), X10
	PANDN X10, X4

	MOVOU X12, X5
	MOVOU (16*5)(x_ptr), X11
	PANDN X11, X5

	// X6..X11 = b & mask
	MOVOU (16*0)(y_ptr), X6
	MOVOU (16*1)(y_ptr), X7
	MOVOU (16*2)(y_ptr), X8
	MOVOU (16*3)(y_ptr), X9
	MOVOU (16*4)(y_ptr), X10
	MOVOU (16*5)(y_ptr), X11

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	// Exactly one of the two halves is non-zero, so XOR merges them.
	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)
	MOVOU X4, (16*4)(res_ptr)
	MOVOU X5, (16*5)(res_ptr)

	RET

move_avx2:
	// Same computation on three 32-byte lanes.
	VPXOR Y13, Y13, Y13
	VPBROADCASTD X12, Y12
	VPCMPEQD Y13, Y12, Y12 // Y12 = all-ones when cond == 0

	VPANDN (32*0)(x_ptr), Y12, Y0 // a & ^mask
	VPANDN (32*1)(x_ptr), Y12, Y1
	VPANDN (32*2)(x_ptr), Y12, Y2

	VPAND (32*0)(y_ptr), Y12, Y3 // b & mask
	VPAND (32*1)(y_ptr), Y12, Y4
	VPAND (32*2)(y_ptr), Y12, Y5

	VPXOR Y3, Y0, Y0
	VPXOR Y4, Y1, Y1
	VPXOR Y5, Y2, Y2

	VMOVDQU Y0, (32*0)(res_ptr)
	VMOVDQU Y1, (32*1)(res_ptr)
	VMOVDQU Y2, (32*2)(res_ptr)

	VZEROUPPER
	RET

/* ---------------------------------------*/
// func p256NegCond(val *p256Element, cond int)
//
// In place and without branches: val = p - val when cond != 0, val is left
// unchanged when cond == 0. The subtraction is always computed and the
// original limbs are restored with CMOV when cond is zero. No further
// reduction is applied, so val == 0 with cond != 0 stores p itself.
// x_ptr and y_ptr are used as scratch registers here.
TEXT ·p256NegCond(SB),NOSPLIT,$0
	MOVQ val+0(FP), res_ptr
	MOVQ cond+8(FP), t0
	// acc = poly
	MOVQ $-1, acc0
	MOVQ p256p<>+0x08(SB), acc1
	MOVQ $-1, acc2
	MOVQ p256p<>+0x18(SB), acc3
	// Load the original value
	MOVQ (8*0)(res_ptr), acc5
	MOVQ (8*1)(res_ptr), x_ptr
	MOVQ (8*2)(res_ptr), y_ptr
	MOVQ (8*3)(res_ptr), t1
	// Speculatively subtract
	SUBQ acc5, acc0
	SBBQ x_ptr, acc1
	SBBQ y_ptr, acc2
	SBBQ t1, acc3
	// If condition is 0, keep original value
	TESTQ t0, t0
	CMOVQEQ acc5, acc0
	CMOVQEQ x_ptr, acc1
	CMOVQEQ y_ptr, acc2
	CMOVQEQ t1, acc3
	// Store result
	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)

	RET

// p256PrimReduce performs the final conditional subtraction of p.
//
// Input:  the five-limb value [a4, a3, a2, a1, a0] (a0 least significant).
// Output: stores four limbs to (res): value - p when that subtraction does
//         not borrow, otherwise the original [a3, a2, a1, a0].
// Scratch: b0..b3 receive a copy of a0..a3. Clobbers a0..a4 and the flags.
// The selection uses CMOV, so there is no data-dependent branch.
#define p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \
	MOVQ a0, b0 \
	MOVQ a1, b1 \
	MOVQ a2, b2 \
	MOVQ a3, b3 \
	\ // Subtract p; limbs 0 and 2 of p are all-ones, hence the $-1 immediates
	SUBQ $-1, a0 \
	SBBQ p256p<>+0x08(SB), a1 \
	SBBQ $-1, a2 \
	SBBQ p256p<>+0x18(SB), a3 \
	SBBQ $0, a4 \
	\ // A borrow out of the fifth limb means the value was below p: restore it
	CMOVQCS b0, a0 \
	CMOVQCS b1, a1 \
	CMOVQCS b2, a2 \
	CMOVQCS b3, a3 \
	\
	MOVQ a0, (8*0)(res) \
	MOVQ a1, (8*1)(res) \
	MOVQ a2, (8*2)(res) \
	MOVQ a3, (8*3)(res)

// p256SqrMontReduce folds the low four limbs of an eight-limb square into
// the high four limbs, one limb per reduction step.
//
// Input:  T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0].
// Output: the five-limb value [t0, acc3, acc2, acc1, acc0], ready for
//         p256PrimReduce. Clobbers AX, DX and the flags.
#define p256SqrMontReduce() \
	\ // First reduction step, [p4, p3, p2, p1, p0] = [1, -0x100000000, 0, (1 - 0x100000000), -1]
	MOVQ acc0, AX \
	MOVQ acc0, DX \
	SHLQ $32, AX \ // AX = L(acc0 * 2^32), low part
	SHRQ $32, DX \ // DX = H(acc0 * 2^32), high part
	\ // calculate the positive part first: [1, 0, 0, 1] * acc0 + [0, acc3, acc2, acc1],
	\ // since (-1) * acc0 + acc0 == 0, the lowest limb is dropped directly, no carry.
	ADDQ acc0, acc1 \ // acc1 = L(acc0 + acc1)
	ADCQ $0, acc2 \ // acc2 = acc2 + carry1
	ADCQ $0, acc3 \ // acc3 = acc3 + carry2
	ADCQ $0, acc0 \ // acc0 = acc0 + carry3, now the top limb
	\ // calculate the negative part: [0, -0x100000000, 0, -0x100000000] * acc0
	SUBQ AX, acc1 \
	SBBQ DX, acc2 \
	SBBQ AX, acc3 \
	SBBQ DX, acc0 \
	\ // Second reduction step
	MOVQ acc1, AX \
	MOVQ acc1, DX \
	SHLQ $32, AX \
	SHRQ $32, DX \
	\
	ADDQ acc1, acc2 \
	ADCQ $0, acc3 \
	ADCQ $0, acc0 \
	ADCQ $0, acc1 \
	\
	SUBQ AX, acc2 \
	SBBQ DX, acc3 \
	SBBQ AX, acc0 \
	SBBQ DX, acc1 \
	\ // Third reduction step
	MOVQ acc2, AX \
	MOVQ acc2, DX \
	SHLQ $32, AX \
	SHRQ $32, DX \
	\
	ADDQ acc2, acc3 \
	ADCQ $0, acc0 \
	ADCQ $0, acc1 \
	ADCQ $0, acc2 \
	\
	SUBQ AX, acc3 \
	SBBQ DX, acc0 \
	SBBQ AX, acc1 \
	SBBQ DX, acc2 \
	\ // Last reduction step
	XORQ t0, t0 \
	MOVQ acc3, AX \
	MOVQ acc3, DX \
	SHLQ $32, AX \
	SHRQ $32, DX \
	\
	ADDQ acc3, acc0 \
	ADCQ $0, acc1 \
	ADCQ $0, acc2 \
	ADCQ $0, acc3 \
	\
	SUBQ AX, acc0 \
	SBBQ DX, acc1 \
	SBBQ AX, acc2 \
	SBBQ DX, acc3 \
	\ // Add bits [511:256] of the sqr result
	\ // NOTE(review): the first ADCQ consumes the CF left by the SBBQ above,
	\ // so this relies on that borrow being clear - confirm.
	ADCQ acc4, acc0 \
	ADCQ acc5, acc1 \
	ADCQ y_ptr, acc2 \
	ADCQ x_ptr, acc3 \
	ADCQ $0, t0

/* ---------------------------------------*/
// func p256Sqr(res, in *p256Element, n int)
//
// Performs n successive squarings with reduction: the first iteration
// squares in, every later iteration squares the previous result, and each
// result is stored to res.
// The loop is do-while shaped (body first, then DECQ/JNE), so n must be >= 1.
// Two equivalent code paths: MULQ based, and MULX based when supportBMI2 is
// set. x_ptr and y_ptr are reused as the two top limbs of the 512-bit square.
TEXT ·p256Sqr(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr
	MOVQ n+16(FP), BX
	CMPB ·supportBMI2+0(SB), $0x01
	JEQ  sqrBMI2

sqrLoop:

	// y[1:] * y[0]
	MOVQ (8*0)(x_ptr), t0

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc1
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	// y[2:] * y[1]
	MOVQ (8*1)(x_ptr), t0

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, acc5
	// y[3] * y[2]
	MOVQ (8*2)(x_ptr), t0

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, y_ptr
	XORQ t1, t1
	// *2 (every cross product occurs twice in the square)
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ acc4, acc4
	ADCQ acc5, acc5
	ADCQ y_ptr, y_ptr
	ADCQ $0, t1
	// Missing products (the diagonal terms y[i]*y[i])
	MOVQ (8*0)(x_ptr), AX
	MULQ AX
	MOVQ AX, acc0
	MOVQ DX, t0

	MOVQ (8*1)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*2)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc3
	ADCQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*3)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc5
	ADCQ AX, y_ptr
	ADCQ DX, t1
	MOVQ t1, x_ptr // input pointer is dead from here; x_ptr becomes the top limb

	// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
	p256SqrMontReduce()
	p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
	MOVQ res_ptr, x_ptr // the next iteration squares the value just stored
	DECQ BX
	JNE  sqrLoop
	RET

sqrBMI2:
	// y[1:] * y[0]
	MOVQ (8*0)(x_ptr), DX

	MULXQ (8*1)(x_ptr), acc1, acc2

	MULXQ (8*2)(x_ptr), AX, acc3
	ADDQ AX, acc2

	MULXQ (8*3)(x_ptr), AX, acc4
	ADCQ AX, acc3
	ADCQ $0, acc4

	// y[2:] * y[1]
	MOVQ (8*1)(x_ptr), DX

	MULXQ (8*2)(x_ptr), AX, t1
	ADDQ AX, acc3
	ADCQ t1, acc4

	// MULX leaves the flags alone, so the carry from the ADCQ above is
	// still live when it is folded into acc5 here.
	MULXQ (8*3)(x_ptr), AX, acc5
	ADCQ $0, acc5
	ADDQ AX, acc4

	// y[3] * y[2]
	MOVQ (8*2)(x_ptr), DX

	MULXQ (8*3)(x_ptr), AX, y_ptr
	ADCQ AX, acc5
	ADCQ $0, y_ptr
	XORQ t1, t1

	// *2 (every cross product occurs twice in the square)
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ acc4, acc4
	ADCQ acc5, acc5
	ADCQ y_ptr, y_ptr
	ADCQ $0, t1

	// Missing products (the diagonal terms y[i]*y[i]); one carry chain runs
	// through all four because MOVQ and MULX do not modify the flags.
	MOVQ (8*0)(x_ptr), DX
	MULXQ DX, acc0, t0
	ADDQ t0, acc1

	MOVQ (8*1)(x_ptr), DX
	MULXQ DX, AX, t0
	ADCQ AX, acc2
	ADCQ t0, acc3

	MOVQ (8*2)(x_ptr), DX
	MULXQ DX, AX, t0
	ADCQ AX, acc4
	ADCQ t0, acc5

	MOVQ (8*3)(x_ptr), DX
	MULXQ DX, AX, x_ptr // input pointer is dead from here; x_ptr becomes the top limb
	ADCQ AX, y_ptr
	ADCQ t1, x_ptr

	// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
	p256SqrMontReduce()
	p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
	MOVQ res_ptr, x_ptr // the next iteration squares the value just stored
	DECQ BX
	JNE  sqrBMI2
	RET

/* ---------------------------------------*/
// p256MulReduceStep is one reduction step of p256Mul. It eliminates the
// lowest live limb a0 of the running sum [a5, a4, a3, a2, a1, a0] by adding
// a0 * p; afterwards the sum lives in [a5, a4, a3, a2, a1] and a0 is free.
// Clobbers AX, DX and the flags. It expands to exactly the instruction
// sequence that used to be written out after every partial product.
#define p256MulReduceStep(a0, a1, a2, a3, a4, a5) \
	MOVQ a0, AX \
	MOVQ a0, DX \
	SHLQ $32, AX \ // AX = low half of a0 * 2^32
	SHRQ $32, DX \ // DX = high half of a0 * 2^32
	\
	ADDQ a0, a1 \
	ADCQ $0, a2 \
	ADCQ $0, a3 \
	ADCQ a0, a4 \
	ADCQ $0, a5 \
	\
	SUBQ AX, a1 \
	SBBQ DX, a2 \
	SBBQ AX, a3 \
	SBBQ DX, a4 \
	SBBQ $0, a5

// func p256Mul(res, in1, in2 *p256Element)
//
// Multiplies in1 by in2 with an interleaved reduction: after each partial
// product x * y[i] one reduction step removes the lowest limb, and a final
// conditional subtraction of p (p256PrimReduce) stores four limbs to res.
// Two equivalent code paths: MULQ based, and MULX based when supportBMI2 is
// set. The six accumulator registers rotate roles from step to step.
TEXT ·p256Mul(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in1+8(FP), x_ptr
	MOVQ in2+16(FP), y_ptr

	CMPB ·supportBMI2+0(SB), $0x01
	JEQ  mulBMI2

	// x * y[0]
	MOVQ (8*0)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc0
	MOVQ DX, acc1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	XORQ acc5, acc5
	// First reduction step
	p256MulReduceStep(acc0, acc1, acc2, acc3, acc4, acc5)
	XORQ acc0, acc0

	// x * y[1]
	MOVQ (8*1)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	// Second reduction step
	p256MulReduceStep(acc1, acc2, acc3, acc4, acc5, acc0)
	XORQ acc1, acc1

	// x * y[2]
	MOVQ (8*2)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	// Third reduction step
	p256MulReduceStep(acc2, acc3, acc4, acc5, acc0, acc1)
	XORQ acc2, acc2
	// x * y[3]
	MOVQ (8*3)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Last reduction step
	p256MulReduceStep(acc3, acc4, acc5, acc0, acc1, acc2)
	// Result is [acc2, acc1, acc0, acc5, acc4]; x_ptr, acc3, t0, t1 are scratch.
	p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, t1, res_ptr)
	RET

mulBMI2:
	// x * y[0]
	MOVQ (8*0)(y_ptr), DX
	MULXQ (8*0)(x_ptr), acc0, acc1

	MULXQ (8*1)(x_ptr), AX, acc2
	ADDQ AX, acc1

	MULXQ (8*2)(x_ptr), AX, acc3
	ADCQ AX, acc2

	MULXQ (8*3)(x_ptr), AX, acc4
	ADCQ AX, acc3
	ADCQ $0, acc4

	XORQ acc5, acc5
	// First reduction step
	p256MulReduceStep(acc0, acc1, acc2, acc3, acc4, acc5)
	XORQ acc0, acc0

	// x * y[1]
	// MULX does not modify the flags, so the carry of each ADCQ stays live
	// across the following MULXQ and is absorbed by the ADCQ $0, t1 after it.
	MOVQ (8*1)(y_ptr), DX
	MULXQ (8*0)(x_ptr), AX, t1
	ADDQ AX, acc1
	ADCQ t1, acc2

	MULXQ (8*1)(x_ptr), AX, t1
	ADCQ $0, t1
	ADDQ AX, acc2
	ADCQ t1, acc3

	MULXQ (8*2)(x_ptr), AX, t1
	ADCQ $0, t1
	ADDQ AX, acc3
	ADCQ t1, acc4

	MULXQ (8*3)(x_ptr), AX, t1
	ADCQ $0, t1
	ADDQ AX, acc4
	ADCQ t1, acc5
	ADCQ $0, acc0

	// Second reduction step
	p256MulReduceStep(acc1, acc2, acc3, acc4, acc5, acc0)
	XORQ acc1, acc1

	// x * y[2]
	MOVQ (8*2)(y_ptr), DX

	MULXQ (8*0)(x_ptr), AX, t1
	ADDQ AX, acc2
	ADCQ t1, acc3

	MULXQ (8*1)(x_ptr), AX, t1
	ADCQ $0, t1
	ADDQ AX, acc3
	ADCQ t1, acc4

	MULXQ (8*2)(x_ptr), AX, t1
	ADCQ $0, t1
	ADDQ AX, acc4
	ADCQ t1, acc5

	MULXQ (8*3)(x_ptr), AX, t1
	ADCQ $0, t1
	ADDQ AX, acc5
	ADCQ t1, acc0
	ADCQ $0, acc1
	// Third reduction step
	p256MulReduceStep(acc2, acc3, acc4, acc5, acc0, acc1)
	XORQ acc2, acc2
	// x * y[3]
	MOVQ (8*3)(y_ptr), DX

	MULXQ (8*0)(x_ptr), AX, t1
	ADDQ AX, acc3
	ADCQ t1, acc4

	MULXQ (8*1)(x_ptr), AX, t1
	ADCQ $0, t1
	ADDQ AX, acc4
	ADCQ t1, acc5

	MULXQ (8*2)(x_ptr), AX, t1
	ADCQ $0, t1
	ADDQ AX, acc5
	ADCQ t1, acc0

	MULXQ (8*3)(x_ptr), AX, t1
	ADCQ $0, t1
	ADDQ AX, acc0
	ADCQ t1, acc1
	ADCQ $0, acc2
	// Last reduction step
	p256MulReduceStep(acc3, acc4, acc5, acc0, acc1, acc2)
	// Result is [acc2, acc1, acc0, acc5, acc4]; x_ptr, acc3, t0, t1 are scratch.
	p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, t1, res_ptr)
	RET

/* ---------------------------------------*/
// func p256FromMont(res, in *p256Element)
//
// Runs the four reduction stages on the four limbs of in without any
// multiplication, then conditionally subtracts p and stores four limbs to
// res. Each stage removes the lowest live limb; the accumulator registers
// rotate roles so the final value sits in [acc1, acc0, acc5, acc4].
TEXT ·p256FromMont(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr

	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3
	XORQ acc4, acc4

	// Only reduce, no multiplications are needed
	// First stage
	MOVQ acc0, AX
	MOVQ acc0, DX
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ acc0, acc1
	ADCQ $0, acc2
	ADCQ $0, acc3
	ADCQ acc0, acc4

	SUBQ AX, acc1
	SBBQ DX, acc2
	SBBQ AX, acc3
	SBBQ DX, acc4
	XORQ acc5, acc5

	// Second stage
	MOVQ acc1, AX
	MOVQ acc1, DX
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ acc1, acc2
	ADCQ $0, acc3
	ADCQ $0, acc4
	ADCQ acc1, acc5

	SUBQ AX, acc2
	SBBQ DX, acc3
	SBBQ AX, acc4
	SBBQ DX, acc5
	XORQ acc0, acc0
	// Third stage
	MOVQ acc2, AX
	MOVQ acc2, DX
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ acc2, acc3
	ADCQ $0, acc4
	ADCQ $0, acc5
	ADCQ acc2, acc0

	SUBQ AX, acc3
	SBBQ DX, acc4
	SBBQ AX, acc5
	SBBQ DX, acc0
	XORQ acc1, acc1
	// Last stage
	MOVQ acc3, AX
	MOVQ acc3, DX
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ acc3, acc4
	ADCQ $0, acc5
	ADCQ $0, acc0
	ADCQ acc3, acc1

	SUBQ AX, acc4
	SBBQ DX, acc5
	SBBQ AX, acc0
	SBBQ DX, acc1

	// Keep a copy, subtract p, and restore the copy if the subtraction borrowed.
	MOVQ acc4, x_ptr
	MOVQ acc5, acc3
	MOVQ acc0, t0
	MOVQ acc1, t1

	SUBQ $-1, acc4
	SBBQ p256p<>+0x08(SB), acc5
	SBBQ $-1, acc0
	SBBQ p256p<>+0x18(SB), acc1

	CMOVQCS x_ptr, acc4
	CMOVQCS acc3, acc5
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Select(res *SM2P256Point, table *p256Table, idx, limit int)
//
// Scans the first limit 96-byte entries of table and copies entry idx-1 to
// res (entries are numbered from 1). Every entry is read and masked, so the
// memory access pattern does not depend on idx. If no entry matches (for
// example idx == 0) res is set to all zeros. idx is compared as a 32-bit
// value.
//
// limit <= 0 stores all zeros without touching table. The loop is do-while
// shaped, so without this guard a zero limit would wrap the counter and read
// far past the table. The guard branches only on limit, never on idx.
TEXT ·p256Select(SB),NOSPLIT,$0
	MOVQ table+8(FP), DI
	MOVQ res+0(FP), DX

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ  select_avx2

	PXOR X15, X15    // X15 = 0
	PCMPEQL X14, X14 // X14 = -1
	PSUBL X14, X15   // X15 = 1
	MOVL idx+16(FP), X14
	PSHUFD $0, X14, X14 // broadcast idx to all four lanes

	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	PXOR X4, X4
	PXOR X5, X5
	MOVQ limit+24(FP), AX

	MOVOU X15, X13 // running entry number, starts at 1

	TESTQ AX, AX
	JLE  select_store // nothing to scan: store the zeroed accumulators

loop_select:

	// X12 = all-ones when the current entry number equals idx.
	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	MOVOU (16*0)(DI), X6
	MOVOU (16*1)(DI), X7
	MOVOU (16*2)(DI), X8
	MOVOU (16*3)(DI), X9
	MOVOU (16*4)(DI), X10
	MOVOU (16*5)(DI), X11
	ADDQ $(16*6), DI

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	DECQ AX
	JNE  loop_select

select_store:
	MOVOU X0, (16*0)(DX)
	MOVOU X1, (16*1)(DX)
	MOVOU X2, (16*2)(DX)
	MOVOU X3, (16*3)(DX)
	MOVOU X4, (16*4)(DX)
	MOVOU X5, (16*5)(DX)

	RET

select_avx2:
	VPXOR Y15, Y15, Y15
	VPCMPEQD Y14, Y14, Y14
	VPSUBD Y14, Y15, Y15 // Y15 = 1 in every lane
	MOVL idx+16(FP), X14 // x14 = idx
	VPBROADCASTD X14, Y14

	MOVQ limit+24(FP), AX
	VMOVDQU Y15, Y13 // running entry number, starts at 1

	VPXOR Y0, Y0, Y0
	VPXOR Y1, Y1, Y1
	VPXOR Y2, Y2, Y2

	TESTQ AX, AX
	JLE  select_store_avx2 // nothing to scan: store the zeroed accumulators

loop_select_avx2:
	// Y12 = all-ones when the current entry number equals idx.
	VMOVDQU Y13, Y12
	VPADDD Y15, Y13, Y13
	VPCMPEQD Y14, Y12, Y12

	VPAND (32*0)(DI), Y12, Y3
	VPAND (32*1)(DI), Y12, Y4
	VPAND (32*2)(DI), Y12, Y5

	ADDQ $(32*3), DI

	VPXOR Y3, Y0, Y0
	VPXOR Y4, Y1, Y1
	VPXOR Y5, Y2, Y2

	DECQ AX
	JNE  loop_select_avx2

select_store_avx2:
	VMOVDQU Y0, (32*0)(DX)
	VMOVDQU Y1, (32*1)(DX)
	VMOVDQU Y2, (32*2)(DX)
	VZEROUPPER
	RET

/* ---------------------------------------*/
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
//
// Scans 32 entries of 64 bytes each (16 loop iterations, two entries per
// iteration) and copies entry idx-1 to res (entries are numbered from 1).
// Every entry is read and masked, so the memory access pattern does not
// depend on idx. If no entry matches (for example idx == 0) res is set to
// all zeros. idx is compared as a 32-bit value.
TEXT ·p256SelectAffine(SB),NOSPLIT,$0
	MOVQ idx+16(FP), AX
	MOVQ table+8(FP), DI
	MOVQ res+0(FP), DX

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ  select_base_avx2

	PXOR X15, X15    // X15 = 0
	PCMPEQL X14, X14 // X14 = -1
	PSUBL X14, X15   // X15 = 1
	MOVL AX, X14     // x14 = idx
	PSHUFD $0, X14, X14

	MOVQ $16, AX   // AX is reused as the iteration counter
	MOVOU X15, X13 // running entry number, starts at 1

	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3

loop_select_base:

	// Mask for the first entry of the pair.
	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	MOVOU (16*0)(DI), X4
	MOVOU (16*1)(DI), X5
	MOVOU (16*2)(DI), X6
	MOVOU (16*3)(DI), X7

	MOVOU (16*4)(DI), X8
	MOVOU (16*5)(DI), X9
	MOVOU (16*6)(DI), X10
	MOVOU (16*7)(DI), X11

	ADDQ $(16*8), DI

	PAND X12, X4
	PAND X12, X5
	PAND X12, X6
	PAND X12, X7

	// Mask for the second entry of the pair.
	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3

	PXOR X8, X0
	PXOR X9, X1
	PXOR X10, X2
	PXOR X11, X3

	DECQ AX
	JNE  loop_select_base

	MOVOU X0, (16*0)(DX)
	MOVOU X1, (16*1)(DX)
	MOVOU X2, (16*2)(DX)
	MOVOU X3, (16*3)(DX)

	RET

select_base_avx2:
	VPXOR Y15, Y15, Y15
	VPCMPEQD Y14, Y14, Y14
	VPSUBD Y14, Y15, Y15 // Y15 = 1 in every lane
	MOVL AX, X14         // x14 = idx
	VPBROADCASTD X14, Y14

	MOVQ $16, AX     // AX is reused as the iteration counter
	VMOVDQU Y15, Y13 // running entry number, starts at 1
	VPXOR Y0, Y0, Y0
	VPXOR Y1, Y1, Y1

loop_select_base_avx2:
	// Mask for the first entry of the pair.
	VMOVDQU Y13, Y12
	VPADDD Y15, Y13, Y13
	VPCMPEQD Y14, Y12, Y12

	VPAND (32*0)(DI), Y12, Y2
	VPAND (32*1)(DI), Y12, Y3

	// Mask for the second entry of the pair.
	VMOVDQU Y13, Y12
	VPADDD Y15, Y13, Y13
	VPCMPEQD Y14, Y12, Y12

	VPAND (32*2)(DI), Y12, Y4
	VPAND (32*3)(DI), Y12, Y5

	ADDQ $(32*4), DI

	VPXOR Y2, Y0, Y0
	VPXOR Y3, Y1, Y1

	VPXOR Y4, Y0, Y0
	VPXOR Y5, Y1, Y1

	DECQ AX
	JNE  loop_select_base_avx2

	VMOVDQU Y0, (32*0)(DX)
	VMOVDQU Y1, (32*1)(DX)
	VZEROUPPER
	RET

/* ---------------------------------------*/
|
2023-06-16 15:52:28 +08:00
|
|
|
|
// p256OrdReduceInline(a0, a1, a2, a3, a4, b0, b1, b2, b3, res)
//
// Performs one conditional subtraction of the group order (p256ord) from the
// five-limb value a4:a3:a2:a1:a0 and stores the low four limbs to res.
//
//   a0..a3  in: low four limbs (little-endian limb order); out: stored limbs
//   a4      in: fifth (overflow) limb; clobbered
//   b0..b3  scratch registers that receive a copy of a0..a3; clobbered
//   res     register holding the address of the 32-byte destination
//
// The selection is done with CMOVQCS on the final borrow, so there is no
// branch on the value. Flags are clobbered.
#define p256OrdReduceInline(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \
	\// Keep a copy of the unreduced limbs [255:0]
	MOVQ a0, b0 \
	MOVQ a1, b1 \
	MOVQ a2, b2 \
	MOVQ a3, b3 \
	\// Subtract the group order (p256ord), propagating the borrow into a4
	SUBQ p256ord<>+0x00(SB), a0 \
	SBBQ p256ord<>+0x08(SB), a1 \
	SBBQ p256ord<>+0x10(SB), a2 \
	SBBQ p256ord<>+0x18(SB), a3 \
	SBBQ $0, a4 \
	\// A final borrow means the value was below the order: restore the copy
	CMOVQCS b0, a0 \
	CMOVQCS b1, a1 \
	CMOVQCS b2, a2 \
	CMOVQCS b3, a3 \
	\// Write the result
	MOVQ a0, (8*0)(res) \
	MOVQ a1, (8*1)(res) \
	MOVQ a2, (8*2)(res) \
	MOVQ a3, (8*3)(res)
|
|
|
|
|
|
|
|
|
|
// func p256OrdReduce(s *p256OrdElement)
//
// Reduces *s in place with a single conditional subtraction of the group
// order: the four limbs of s are loaded, extended with a zero fifth limb, and
// handed to p256OrdReduceInline, which writes the result back through res_ptr.
// Because only one subtraction is performed, the stored value is below the
// order only when the input was below twice the order.
//
// Clobbers acc0..acc5, x_ptr, y_ptr, t0 and flags.
TEXT ·p256OrdReduce(SB),NOSPLIT,$0
	MOVQ s+0(FP), res_ptr

	// Load s as acc3:acc2:acc1:acc0 and clear the overflow limb.
	MOVQ (8*0)(res_ptr), acc0
	MOVQ (8*1)(res_ptr), acc1
	MOVQ (8*2)(res_ptr), acc2
	MOVQ (8*3)(res_ptr), acc3
	XORQ acc4, acc4

	// acc5, x_ptr, y_ptr and t0 serve only as scratch for the saved copy.
	p256OrdReduceInline(acc0, acc1, acc2, acc3, acc4, acc5, x_ptr, y_ptr, t0, res_ptr)
	RET
|
|
|
|
|
|
2022-08-17 15:23:59 +08:00
|
|
|
|
// Helper macros for p256OrdMul. Each one expands to exactly the instruction
// sequence that one round of the multiplication needs; the six register
// arguments name the current accumulator window, lowest limb first, and are
// rotated by one position from round to round.
//
// The group order is handled as the signed five-limb value
//   ord = [1, -0x100000000, -1, ord1, ord0]
// (ord1 = p256ord<>+0x08, ord0 = p256ord<>+0x00), so adding t0*ord is done as
// "add t0*[ord1, ord0] at limbs 0..2, add t0 at limb 4, subtract t0 at limb 2
// and subtract t0<<32 at limb 3".

// p256OrdMulAddQ(a0..a5): a5:a4:a3:a2:a1:a0 += x * t0, using MULQ.
//   in:  t0 = multiplier limb, x_ptr = address of the four limbs of x
//   clobbers AX, DX, t1, flags
#define p256OrdMulAddQ(a0, a1, a2, a3, a4, a5) \
	MOVQ (8*0)(x_ptr), AX \
	MULQ t0 \
	ADDQ AX, a0 \
	ADCQ $0, DX \
	MOVQ DX, t1 \
	\
	MOVQ (8*1)(x_ptr), AX \
	MULQ t0 \
	ADDQ t1, a1 \
	ADCQ $0, DX \
	ADDQ AX, a1 \
	ADCQ $0, DX \
	MOVQ DX, t1 \
	\
	MOVQ (8*2)(x_ptr), AX \
	MULQ t0 \
	ADDQ t1, a2 \
	ADCQ $0, DX \
	ADDQ AX, a2 \
	ADCQ $0, DX \
	MOVQ DX, t1 \
	\
	MOVQ (8*3)(x_ptr), AX \
	MULQ t0 \
	ADDQ t1, a3 \
	ADCQ $0, DX \
	ADDQ AX, a3 \
	ADCQ DX, a4 \
	ADCQ $0, a5

// p256OrdMulReduceQ(a0..a5): one reduction round, using MULQ.
//   t0 = low 64 bits of a0 * p256ordK0, then a5..a0 += t0 * ord.
//   K0 is presumably -ord^-1 mod 2^64 (standard Montgomery constant), which
//   makes the low limb a0 cancel; a0 is not read again by the caller before it
//   is reused as the top of the next window.
//   clobbers AX, DX, t0, t1, flags
#define p256OrdMulReduceQ(a0, a1, a2, a3, a4, a5) \
	MOVQ a0, AX \
	MULQ p256ordK0<>(SB) \
	MOVQ AX, t0 \
	\// positive part: t0 * [1, 0, 0, ord1, ord0]
	MOVQ p256ord<>+0x00(SB), AX \
	MULQ t0 \
	ADDQ AX, a0 \
	ADCQ $0, DX \
	MOVQ DX, t1 \
	\
	MOVQ p256ord<>+0x08(SB), AX \
	MULQ t0 \
	ADDQ t1, a1 \
	ADCQ $0, DX \
	ADDQ AX, a1 \
	ADCQ DX, a2 \
	ADCQ $0, a3 \
	ADCQ t0, a4 \
	ADCQ $0, a5 \
	\// negative part: t0 * [0, 0x100000000, 1, 0, 0]
	MOVQ t0, AX \
	MOVQ t0, DX \
	SHLQ $32, AX \
	SHRQ $32, DX \
	\
	SUBQ t0, a2 \
	SBBQ AX, a3 \
	SBBQ DX, a4 \
	SBBQ $0, a5

// p256OrdMulAddX(a0..a5): a5:a4:a3:a2:a1:a0 += x * DX, using MULX.
//   in:  DX = multiplier limb, x_ptr = address of the four limbs of x
//   MULX leaves the flags untouched, so the carry of each ADCQ pair is folded
//   into the high half of the following product.
//   clobbers AX, t1, flags
#define p256OrdMulAddX(a0, a1, a2, a3, a4, a5) \
	MULXQ (8*0)(x_ptr), AX, t1 \
	ADDQ AX, a0 \
	ADCQ t1, a1 \
	\
	MULXQ (8*1)(x_ptr), AX, t1 \
	ADCQ $0, t1 \
	ADDQ AX, a1 \
	ADCQ t1, a2 \
	\
	MULXQ (8*2)(x_ptr), AX, t1 \
	ADCQ $0, t1 \
	ADDQ AX, a2 \
	ADCQ t1, a3 \
	\
	MULXQ (8*3)(x_ptr), AX, t1 \
	ADCQ $0, t1 \
	ADDQ AX, a3 \
	ADCQ t1, a4 \
	ADCQ $0, a5

// p256OrdMulReduceX(a0..a5): one reduction round, using MULX.
//   Same arithmetic as p256OrdMulReduceQ.
//   clobbers AX, DX, t0, t1, flags
#define p256OrdMulReduceX(a0, a1, a2, a3, a4, a5) \
	MOVQ a0, DX \
	MULXQ p256ordK0<>(SB), t0, AX \
	\// positive part: t0 * [1, 0, 0, ord1, ord0]
	MOVQ t0, DX \
	MULXQ p256ord<>+0x00(SB), AX, t1 \
	ADDQ AX, a0 \
	ADCQ t1, a1 \
	\
	MULXQ p256ord<>+0x08(SB), AX, t1 \
	ADCQ $0, t1 \
	ADDQ AX, a1 \
	ADCQ t1, a2 \
	ADCQ $0, a3 \
	ADCQ t0, a4 \
	ADCQ $0, a5 \
	\// negative part; DX still holds t0, so only AX needs to be loaded
	MOVQ t0, AX \
	SHLQ $32, AX \
	SHRQ $32, DX \
	\
	SUBQ t0, a2 \
	SBBQ AX, a3 \
	SBBQ DX, a4 \
	SBBQ $0, a5

// func p256OrdMul(res, in1, in2 *p256OrdElement)
//
// Multiplies in1 by in2 modulo the group order and stores the result in res.
// The product is built one limb of in2 at a time; after each limb a reduction
// round (driven by p256ordK0 and p256ord) retires the lowest accumulator limb,
// and a final conditional subtraction of the order is applied before storing.
// NOTE(review): this is the word-by-word Montgomery pattern, so operands and
// result are presumably in Montgomery form - confirm against the Go callers.
//
// Two code paths compute the same thing: the default one uses MULQ, and
// ordMulBMI2 uses MULX when ·supportBMI2 is 1.
//
// Registers: res_ptr/x_ptr/y_ptr hold res/in1/in2; acc0..acc5 form a rotating
// six-limb accumulator; t0, t1, AX and DX are scratch.
TEXT ·p256OrdMul(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in1+8(FP), x_ptr
	MOVQ in2+16(FP), y_ptr

	CMPB ·supportBMI2+0(SB), $0x01
	JEQ ordMulBMI2

	// x * y[0]: initialises acc4:acc3:acc2:acc1:acc0, acc5 = 0
	MOVQ (8*0)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc0
	MOVQ DX, acc1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	XORQ acc5, acc5

	// First reduction step
	p256OrdMulReduceQ(acc0, acc1, acc2, acc3, acc4, acc5)

	// x * y[1]
	MOVQ (8*1)(y_ptr), t0
	p256OrdMulAddQ(acc1, acc2, acc3, acc4, acc5, acc0)

	// Second reduction step
	p256OrdMulReduceQ(acc1, acc2, acc3, acc4, acc5, acc0)

	// x * y[2]
	MOVQ (8*2)(y_ptr), t0
	p256OrdMulAddQ(acc2, acc3, acc4, acc5, acc0, acc1)

	// Third reduction step
	p256OrdMulReduceQ(acc2, acc3, acc4, acc5, acc0, acc1)

	// x * y[3]
	MOVQ (8*3)(y_ptr), t0
	p256OrdMulAddQ(acc3, acc4, acc5, acc0, acc1, acc2)

	// Last reduction step
	p256OrdMulReduceQ(acc3, acc4, acc5, acc0, acc1, acc2)

	// Result is acc2:acc1:acc0:acc5:acc4; x_ptr, acc3, t0, t1 are free scratch now.
	p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, t1, res_ptr)

	RET

ordMulBMI2:
	// x * y[0]: initialises acc4:acc3:acc2:acc1:acc0, acc5 = 0
	MOVQ (8*0)(y_ptr), DX
	MULXQ (8*0)(x_ptr), acc0, acc1

	MULXQ (8*1)(x_ptr), AX, acc2
	ADDQ AX, acc1
	ADCQ $0, acc2

	MULXQ (8*2)(x_ptr), AX, acc3
	ADDQ AX, acc2
	ADCQ $0, acc3

	MULXQ (8*3)(x_ptr), AX, acc4
	ADDQ AX, acc3
	ADCQ $0, acc4

	XORQ acc5, acc5

	// First reduction step
	p256OrdMulReduceX(acc0, acc1, acc2, acc3, acc4, acc5)

	// x * y[1]
	MOVQ (8*1)(y_ptr), DX
	p256OrdMulAddX(acc1, acc2, acc3, acc4, acc5, acc0)

	// Second reduction step
	p256OrdMulReduceX(acc1, acc2, acc3, acc4, acc5, acc0)

	// x * y[2]
	MOVQ (8*2)(y_ptr), DX
	p256OrdMulAddX(acc2, acc3, acc4, acc5, acc0, acc1)

	// Third reduction step
	p256OrdMulReduceX(acc2, acc3, acc4, acc5, acc0, acc1)

	// x * y[3]
	MOVQ (8*3)(y_ptr), DX
	p256OrdMulAddX(acc3, acc4, acc5, acc0, acc1, acc2)

	// Last reduction step
	p256OrdMulReduceX(acc3, acc4, acc5, acc0, acc1, acc2)

	// Result is acc2:acc1:acc0:acc5:acc4; x_ptr, acc3, t0, t1 are free scratch now.
	p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, t1, res_ptr)

	RET

#undef p256OrdMulAddQ
#undef p256OrdMulReduceQ
#undef p256OrdMulAddX
#undef p256OrdMulReduceX
|
|
|
|
|
|
|
|
|
|
/* ---------------------------------------*/
|
|
|
|
|
// Helper macros for p256OrdSqr. Each expands to one reduction round over the
// low half of the 512-bit square; the four register arguments name the current
// window, lowest limb first, and are rotated by one position per round.
//
// The group order is handled as the signed five-limb value
//   ord = [1, -0x100000000, -1, ord1, ord0]
// (ord1 = p256ord<>+0x08, ord0 = p256ord<>+0x00). With
//   Y = t0 = (p256ordK0 * a0) mod 2^64
// a round computes
//   positive part: [1, 0, 0, ord1, ord0] * t0 + [0, a3, a2, a1, a0]
//                  -> [a0, a3, a2, a1]; the lowest limb is dropped and a0 is
//                     reused to hold the new top limb (t0 plus carry)
//   negative part: [a0, a3, a2, a1] - [0, 0x100000000, 1, 0] * t0

// p256OrdSqrReduceQ(a0, a1, a2, a3): reduction round using MULQ.
//   clobbers AX, DX, t0, t1, flags
#define p256OrdSqrReduceQ(a0, a1, a2, a3) \
	MOVQ a0, AX \
	MULQ p256ordK0<>(SB) \
	MOVQ AX, t0 \
	\// (carry1, a0) = a0 + L(t0*ord0); t1 = carry1 + H(t0*ord0); a0 = t0
	MOVQ p256ord<>+0x00(SB), AX \
	MULQ t0 \
	ADDQ AX, a0 \
	ADCQ $0, DX \
	MOVQ DX, t1 \
	MOVQ t0, a0 \
	\// (carry2, a1) = a1 + t1; DX = carry2 + H(t0*ord1)
	MOVQ p256ord<>+0x08(SB), AX \
	MULQ t0 \
	ADDQ t1, a1 \
	ADCQ $0, DX \
	\// a1 += L(t0*ord1), then ripple the carry up to the new top limb
	ADDQ AX, a1 \
	ADCQ DX, a2 \
	ADCQ $0, a3 \
	ADCQ $0, a0 \
	\// negative part
	MOVQ t0, AX \
	MOVQ t0, DX \
	SHLQ $32, AX \
	SHRQ $32, DX \
	\
	SUBQ t0, a2 \
	SBBQ AX, a3 \
	SBBQ DX, a0

// p256OrdSqrReduceX(a0, a1, a2, a3): reduction round using MULX.
//   Same arithmetic as p256OrdSqrReduceQ. MULX and MOVQ leave the flags
//   untouched, so carry2 survives until the ADCQ $0, t1 that consumes it.
//   clobbers AX, DX, t0, t1, flags
#define p256OrdSqrReduceX(a0, a1, a2, a3) \
	MOVQ a0, DX \
	MULXQ p256ordK0<>(SB), t0, AX \
	\// (carry1, a0) = a0 + L(t0*ord0); (carry2, a1) = a1 + H(t0*ord0) + carry1
	MOVQ t0, DX \
	MULXQ p256ord<>+0x00(SB), AX, t1 \
	ADDQ AX, a0 \
	ADCQ t1, a1 \
	MOVQ t0, a0 \
	\// t1 = carry2 + H(t0*ord1); a1 += L(t0*ord1); ripple carry to the top limb
	MULXQ p256ord<>+0x08(SB), AX, t1 \
	ADCQ $0, t1 \
	ADDQ AX, a1 \
	ADCQ t1, a2 \
	ADCQ $0, a3 \
	ADCQ $0, a0 \
	\// negative part; DX still holds t0, so only AX needs to be loaded
	MOVQ t0, AX \
	SHLQ $32, AX \
	SHRQ $32, DX \
	\
	SUBQ t0, a2 \
	SBBQ AX, a3 \
	SBBQ DX, a0

// func p256OrdSqr(res, in *p256OrdElement, n int)
//
// Squares in modulo the group order n times in a row and stores the final
// value in res. Every iteration writes its result to res and then reads its
// next input from res.
// NOTE(review): same word-by-word Montgomery pattern as p256OrdMul, so values
// are presumably in Montgomery form - confirm against the Go callers.
//
// The loop counter (BX) is tested after the body, so the body always runs at
// least once; n must be >= 1 (n == 0 would wrap the counter).
//
// Two code paths compute the same thing: ordSqrLoop uses MULQ, and
// ordSqrLoopBMI2 uses MULX when ·supportBMI2 is 1.
//
// Per iteration the 512-bit square is held as
//   T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
// (x_ptr and y_ptr are reused as data registers once the input is consumed).
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr
	MOVQ n+16(FP), BX

	CMPB ·supportBMI2+0(SB), $0x01
	JEQ ordSqrLoopBMI2

ordSqrLoop:

	// y[1:] * y[0]
	MOVQ (8*0)(x_ptr), t0

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc1
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4

	// y[2:] * y[1]
	MOVQ (8*1)(x_ptr), t0

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, acc5

	// y[3] * y[2]
	MOVQ (8*2)(x_ptr), t0

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, y_ptr
	XORQ t1, t1

	// *2: double the cross products, top carry goes to t1
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ acc4, acc4
	ADCQ acc5, acc5
	ADCQ y_ptr, y_ptr
	ADCQ $0, t1

	// Missing products: add the squares y[i]*y[i]
	MOVQ (8*0)(x_ptr), AX
	MULQ AX
	MOVQ AX, acc0
	MOVQ DX, t0

	MOVQ (8*1)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*2)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc3
	ADCQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*3)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc5
	ADCQ AX, y_ptr
	ADCQ DX, t1
	// The input pointer is no longer needed; x_ptr now carries the top limb.
	MOVQ t1, x_ptr

	// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
	// First reduction step
	p256OrdSqrReduceQ(acc0, acc1, acc2, acc3)
	// Second reduction step
	p256OrdSqrReduceQ(acc1, acc2, acc3, acc0)
	// Third reduction step
	p256OrdSqrReduceQ(acc2, acc3, acc0, acc1)
	// Last reduction step
	p256OrdSqrReduceQ(acc3, acc0, acc1, acc2)

	// XORQ zeroes t0 and clears CF, so the first ADCQ below adds without carry-in.
	XORQ t0, t0
	// Add bits [511:256] of the sqr result
	ADCQ acc4, acc0
	ADCQ acc5, acc1
	ADCQ y_ptr, acc2
	ADCQ x_ptr, acc3
	ADCQ $0, t0

	p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
	// The next iteration squares the value just stored.
	MOVQ res_ptr, x_ptr
	DECQ BX
	JNE ordSqrLoop

	RET

ordSqrLoopBMI2:
	// y[1:] * y[0]
	MOVQ (8*0)(x_ptr), DX
	MULXQ (8*1)(x_ptr), acc1, acc2

	MULXQ (8*2)(x_ptr), AX, acc3
	ADDQ AX, acc2
	ADCQ $0, acc3

	MULXQ (8*3)(x_ptr), AX, acc4
	ADDQ AX, acc3
	ADCQ $0, acc4

	// y[2:] * y[1]
	MOVQ (8*1)(x_ptr), DX
	MULXQ (8*2)(x_ptr), AX, t1
	ADDQ AX, acc3
	ADCQ t1, acc4

	MULXQ (8*3)(x_ptr), AX, acc5
	ADCQ $0, acc5
	ADDQ AX, acc4
	ADCQ $0, acc5

	// y[3] * y[2]
	MOVQ (8*2)(x_ptr), DX
	MULXQ (8*3)(x_ptr), AX, y_ptr
	ADDQ AX, acc5
	ADCQ $0, y_ptr

	XORQ t1, t1
	// *2: double the cross products, top carry goes to t1
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ acc4, acc4
	ADCQ acc5, acc5
	ADCQ y_ptr, y_ptr
	ADCQ $0, t1

	// Missing products: add the squares y[i]*y[i]; the carry chain runs
	// across the MOVQ/MULXQ pairs because neither touches the flags.
	MOVQ (8*0)(x_ptr), DX
	MULXQ DX, acc0, t0
	ADDQ t0, acc1

	MOVQ (8*1)(x_ptr), DX
	MULXQ DX, AX, t0
	ADCQ AX, acc2
	ADCQ t0, acc3

	MOVQ (8*2)(x_ptr), DX
	MULXQ DX, AX, t0
	ADCQ AX, acc4
	ADCQ t0, acc5

	// Last read of the input: the high half lands in x_ptr itself.
	MOVQ (8*3)(x_ptr), DX
	MULXQ DX, AX, x_ptr
	ADCQ AX, y_ptr
	ADCQ t1, x_ptr

	// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
	// First reduction step
	p256OrdSqrReduceX(acc0, acc1, acc2, acc3)
	// Second reduction step
	p256OrdSqrReduceX(acc1, acc2, acc3, acc0)
	// Third reduction step
	p256OrdSqrReduceX(acc2, acc3, acc0, acc1)
	// Last reduction step
	p256OrdSqrReduceX(acc3, acc0, acc1, acc2)

	// XORQ zeroes t0 and clears CF, so the first ADCQ below adds without carry-in.
	XORQ t0, t0
	// Add bits [511:256] of the sqr result
	ADCQ acc4, acc0
	ADCQ acc5, acc1
	ADCQ y_ptr, acc2
	ADCQ x_ptr, acc3
	ADCQ $0, t0

	p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr)
	// The next iteration squares the value just stored.
	MOVQ res_ptr, x_ptr
	DECQ BX
	JNE ordSqrLoopBMI2

	RET

#undef p256OrdSqrReduceQ
#undef p256OrdSqrReduceX
|
2023-06-10 10:55:17 +08:00
|
|
|
|
|
2022-08-17 15:23:59 +08:00
|
|
|
|
/* ---------------------------------------*/
|
|
|
|
|
// Drop the register aliases used by the routines above. The internal helpers
// below bind the same names (acc0.., t0, t1) to a different register set and
// reuse DI, SI and CX, which were res_ptr, x_ptr and y_ptr, as data registers.
#undef res_ptr
#undef x_ptr
#undef y_ptr

#undef acc0
#undef acc1
#undef acc2
#undef acc3
#undef acc4
#undef acc5
#undef t0
#undef t1
/* ---------------------------------------*/
// Register aliases for the sm2P256*Internal helpers, which take their
// operands in registers instead of on the stack.

// AX and DX: implicit operand/result pair of MULQ; mul0 also serves as the
// borrow mask in sm2P256SubInternal.
#define mul0 AX
#define mul1 DX
// acc0..acc3: scratch limbs (saved copy of the result / low product limbs).
#define acc0 BX
#define acc1 CX
#define acc2 R8
#define acc3 R9
// acc4..acc7: first operand on entry and result on return, acc4 lowest.
#define acc4 R10
#define acc5 R11
#define acc6 R12
#define acc7 R13
// t0..t3: second operand, t0 lowest.
#define t0 R14
#define t1 R15
#define t2 DI
#define t3 SI
// hlp: extra scratch register used for carries in sm2P256MulInternal.
#define hlp BP
|
|
|
|
|
/* ---------------------------------------*/
|
2023-07-26 10:26:32 +08:00
|
|
|
|
// sm2P256SubInternal
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]
//
// Register-based internal helper (no stack arguments). Subtracts the second
// operand from the first; if the subtraction borrows, the prime p256p is added
// back, so the result is the difference modulo p provided both inputs are
// already below p (assumed, not checked here).
//
//   in:       acc4..acc7 = minuend, t0..t3 = subtrahend (lowest limb first)
//   out:      acc4..acc7 = result
//   clobbers: acc0..acc3, mul0, flags
//
// The choice between the two candidates is made with CMOVQEQ, not a branch.
TEXT sm2P256SubInternal(SB),NOSPLIT,$0
	// mul0 becomes all-ones if the 256-bit subtraction borrows, else 0.
	XORQ mul0, mul0
	SUBQ t0, acc4
	SBBQ t1, acc5
	SBBQ t2, acc6
	SBBQ t3, acc7
	SBBQ $0, mul0

	// Keep the raw difference in acc0..acc3.
	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3

	// Add p; limbs 0 and 2 of p256p are all-ones, hence the $-1 immediates.
	ADDQ $-1, acc4
	ADCQ p256p<>+0x08(SB), acc5
	ADCQ $-1, acc6
	ADCQ p256p<>+0x18(SB), acc7
	// ZF = 1 when there was no borrow.
	ANDQ $1, mul0

	// No borrow: discard the "+p" candidate and take the raw difference.
	CMOVQEQ acc0, acc4
	CMOVQEQ acc1, acc5
	CMOVQEQ acc2, acc6
	CMOVQEQ acc3, acc7

	RET
|
|
|
|
|
/* ---------------------------------------*/
|
2023-07-26 10:26:32 +08:00
|
|
|
|
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] * [t3, t2, t1, t0]
|
2022-08-17 15:23:59 +08:00
|
|
|
|
TEXT sm2P256MulInternal(SB),NOSPLIT,$8
|
2023-06-10 10:55:17 +08:00
|
|
|
|
CMPB ·supportBMI2+0(SB), $0x01
|
|
|
|
|
JEQ internalMulBMI2
|
|
|
|
|
|
2022-08-17 15:23:59 +08:00
|
|
|
|
MOVQ acc4, mul0
|
|
|
|
|
MULQ t0
|
|
|
|
|
MOVQ mul0, acc0
|
|
|
|
|
MOVQ mul1, acc1
|
|
|
|
|
|
|
|
|
|
MOVQ acc4, mul0
|
|
|
|
|
MULQ t1
|
|
|
|
|
ADDQ mul0, acc1
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
MOVQ mul1, acc2
|
|
|
|
|
|
|
|
|
|
MOVQ acc4, mul0
|
|
|
|
|
MULQ t2
|
|
|
|
|
ADDQ mul0, acc2
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
MOVQ mul1, acc3
|
|
|
|
|
|
|
|
|
|
MOVQ acc4, mul0
|
|
|
|
|
MULQ t3
|
|
|
|
|
ADDQ mul0, acc3
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
MOVQ mul1, acc4
|
|
|
|
|
|
|
|
|
|
MOVQ acc5, mul0
|
|
|
|
|
MULQ t0
|
|
|
|
|
ADDQ mul0, acc1
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
MOVQ mul1, hlp
|
|
|
|
|
|
|
|
|
|
MOVQ acc5, mul0
|
|
|
|
|
MULQ t1
|
|
|
|
|
ADDQ hlp, acc2
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
ADDQ mul0, acc2
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
MOVQ mul1, hlp
|
|
|
|
|
|
|
|
|
|
MOVQ acc5, mul0
|
|
|
|
|
MULQ t2
|
|
|
|
|
ADDQ hlp, acc3
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
ADDQ mul0, acc3
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
MOVQ mul1, hlp
|
|
|
|
|
|
|
|
|
|
MOVQ acc5, mul0
|
|
|
|
|
MULQ t3
|
|
|
|
|
ADDQ hlp, acc4
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
ADDQ mul0, acc4
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
MOVQ mul1, acc5
|
|
|
|
|
|
|
|
|
|
MOVQ acc6, mul0
|
|
|
|
|
MULQ t0
|
|
|
|
|
ADDQ mul0, acc2
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
MOVQ mul1, hlp
|
|
|
|
|
|
|
|
|
|
MOVQ acc6, mul0
|
|
|
|
|
MULQ t1
|
|
|
|
|
ADDQ hlp, acc3
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
ADDQ mul0, acc3
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
MOVQ mul1, hlp
|
|
|
|
|
|
|
|
|
|
MOVQ acc6, mul0
|
|
|
|
|
MULQ t2
|
|
|
|
|
ADDQ hlp, acc4
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
ADDQ mul0, acc4
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
MOVQ mul1, hlp
|
|
|
|
|
|
|
|
|
|
MOVQ acc6, mul0
|
|
|
|
|
MULQ t3
|
|
|
|
|
ADDQ hlp, acc5
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
ADDQ mul0, acc5
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
MOVQ mul1, acc6
|
|
|
|
|
|
|
|
|
|
MOVQ acc7, mul0
|
|
|
|
|
MULQ t0
|
|
|
|
|
ADDQ mul0, acc3
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
MOVQ mul1, hlp
|
|
|
|
|
|
|
|
|
|
MOVQ acc7, mul0
|
|
|
|
|
MULQ t1
|
|
|
|
|
ADDQ hlp, acc4
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
ADDQ mul0, acc4
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
MOVQ mul1, hlp
|
|
|
|
|
|
|
|
|
|
MOVQ acc7, mul0
|
|
|
|
|
MULQ t2
|
|
|
|
|
ADDQ hlp, acc5
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
ADDQ mul0, acc5
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
MOVQ mul1, hlp
|
|
|
|
|
|
|
|
|
|
MOVQ acc7, mul0
|
|
|
|
|
MULQ t3
|
|
|
|
|
ADDQ hlp, acc6
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
ADDQ mul0, acc6
|
|
|
|
|
ADCQ $0, mul1
|
|
|
|
|
MOVQ mul1, acc7
|
|
|
|
|
// First reduction step
|
|
|
|
|
MOVQ acc0, mul0
|
|
|
|
|
MOVQ acc0, mul1
|
|
|
|
|
SHLQ $32, mul0
|
|
|
|
|
SHRQ $32, mul1
|
|
|
|
|
|
|
|
|
|
ADDQ acc0, acc1
|
|
|
|
|
ADCQ $0, acc2
|
|
|
|
|
ADCQ $0, acc3
|
|
|
|
|
ADCQ $0, acc0
|
|
|
|
|
|
|
|
|
|
SUBQ mul0, acc1
|
|
|
|
|
SBBQ mul1, acc2
|
|
|
|
|
SBBQ mul0, acc3
|
|
|
|
|
SBBQ mul1, acc0
|
|
|
|
|
// Second reduction step
|
|
|
|
|
MOVQ acc1, mul0
|
|
|
|
|
MOVQ acc1, mul1
|
|
|
|
|
SHLQ $32, mul0
|
|
|
|
|
SHRQ $32, mul1
|
|
|
|
|
|
|
|
|
|
ADDQ acc1, acc2
|
|
|
|
|
ADCQ $0, acc3
|
|
|
|
|
ADCQ $0, acc0
|
|
|
|
|
ADCQ $0, acc1
|
|
|
|
|
|
|
|
|
|
SUBQ mul0, acc2
|
|
|
|
|
SBBQ mul1, acc3
|
|
|
|
|
SBBQ mul0, acc0
|
|
|
|
|
SBBQ mul1, acc1
|
|
|
|
|
// Third reduction step
|
|
|
|
|
MOVQ acc2, mul0
|
|
|
|
|
MOVQ acc2, mul1
|
|
|
|
|
SHLQ $32, mul0
|
|
|
|
|
SHRQ $32, mul1
|
|
|
|
|
|
|
|
|
|
ADDQ acc2, acc3
|
|
|
|
|
ADCQ $0, acc0
|
|
|
|
|
ADCQ $0, acc1
|
|
|
|
|
ADCQ $0, acc2
|
|
|
|
|
|
|
|
|
|
SUBQ mul0, acc3
|
|
|
|
|
SBBQ mul1, acc0
|
|
|
|
|
SBBQ mul0, acc1
|
|
|
|
|
SBBQ mul1, acc2
|
|
|
|
|
// Last reduction step
|
|
|
|
|
MOVQ acc3, mul0
|
|
|
|
|
MOVQ acc3, mul1
|
|
|
|
|
SHLQ $32, mul0
|
|
|
|
|
SHRQ $32, mul1
|
|
|
|
|
|
|
|
|
|
ADDQ acc3, acc0
|
|
|
|
|
ADCQ $0, acc1
|
|
|
|
|
ADCQ $0, acc2
|
|
|
|
|
ADCQ $0, acc3
|
|
|
|
|
|
|
|
|
|
SUBQ mul0, acc0
|
|
|
|
|
SBBQ mul1, acc1
|
|
|
|
|
SBBQ mul0, acc2
|
|
|
|
|
SBBQ mul1, acc3
|
|
|
|
|
MOVQ $0, BP
|
|
|
|
|
// Add bits [511:256] of the result
|
|
|
|
|
ADCQ acc0, acc4
|
|
|
|
|
ADCQ acc1, acc5
|
|
|
|
|
ADCQ acc2, acc6
|
|
|
|
|
ADCQ acc3, acc7
|
|
|
|
|
ADCQ $0, hlp
|
|
|
|
|
// Copy result
|
|
|
|
|
MOVQ acc4, acc0
|
|
|
|
|
MOVQ acc5, acc1
|
|
|
|
|
MOVQ acc6, acc2
|
|
|
|
|
MOVQ acc7, acc3
|
|
|
|
|
// Subtract p256
|
|
|
|
|
SUBQ $-1, acc4
|
|
|
|
|
SBBQ p256p<>+0x08(SB), acc5
|
|
|
|
|
SBBQ $-1, acc6
|
|
|
|
|
SBBQ p256p<>+0x018(SB), acc7
|
|
|
|
|
SBBQ $0, hlp
|
|
|
|
|
// If the result of the subtraction is negative, restore the previous result
|
|
|
|
|
CMOVQCS acc0, acc4
|
|
|
|
|
CMOVQCS acc1, acc5
|
|
|
|
|
CMOVQCS acc2, acc6
|
|
|
|
|
CMOVQCS acc3, acc7
|
|
|
|
|
|
|
|
|
|
RET
|
2023-06-10 10:55:17 +08:00
|
|
|
|
internalMulBMI2:
|
|
|
|
|
MOVQ acc4, mul1
|
|
|
|
|
MULXQ t0, acc0, acc1
|
|
|
|
|
|
|
|
|
|
MULXQ t1, mul0, acc2
|
|
|
|
|
ADDQ mul0, acc1
|
|
|
|
|
|
|
|
|
|
MULXQ t2, mul0, acc3
|
2023-07-20 17:49:53 +08:00
|
|
|
|
ADCQ mul0, acc2
|
2023-06-10 10:55:17 +08:00
|
|
|
|
|
|
|
|
|
MULXQ t3, mul0, acc4
|
2023-07-20 17:49:53 +08:00
|
|
|
|
ADCQ mul0, acc3
|
2023-06-10 10:55:17 +08:00
|
|
|
|
ADCQ $0, acc4
|
|
|
|
|
|
|
|
|
|
MOVQ acc5, mul1
|
|
|
|
|
MULXQ t0, mul0, hlp
|
|
|
|
|
ADDQ mul0, acc1
|
|
|
|
|
ADCQ hlp, acc2
|
|
|
|
|
|
|
|
|
|
MULXQ t1, mul0, hlp
|
|
|
|
|
ADCQ $0, hlp
|
|
|
|
|
ADDQ mul0, acc2
|
|
|
|
|
ADCQ hlp, acc3
|
|
|
|
|
|
|
|
|
|
MULXQ t2, mul0, hlp
|
|
|
|
|
ADCQ $0, hlp
|
|
|
|
|
ADDQ mul0, acc3
|
|
|
|
|
ADCQ hlp, acc4
|
|
|
|
|
|
|
|
|
|
MULXQ t3, mul0, acc5
|
|
|
|
|
ADCQ $0, acc5
|
|
|
|
|
ADDQ mul0, acc4
|
|
|
|
|
ADCQ $0, acc5
|
|
|
|
|
|
|
|
|
|
MOVQ acc6, mul1
|
|
|
|
|
MULXQ t0, mul0, hlp
|
|
|
|
|
ADDQ mul0, acc2
|
|
|
|
|
ADCQ hlp, acc3
|
|
|
|
|
|
|
|
|
|
MULXQ t1, mul0, hlp
|
|
|
|
|
ADCQ $0, hlp
|
|
|
|
|
ADDQ mul0, acc3
|
|
|
|
|
ADCQ hlp, acc4
|
|
|
|
|
|
|
|
|
|
MULXQ t2, mul0, hlp
|
|
|
|
|
ADCQ $0, hlp
|
|
|
|
|
ADDQ mul0, acc4
|
|
|
|
|
ADCQ hlp, acc5
|
|
|
|
|
|
|
|
|
|
MULXQ t3, mul0, acc6
|
|
|
|
|
ADCQ $0, acc6
|
|
|
|
|
ADDQ mul0, acc5
|
|
|
|
|
ADCQ $0, acc6
|
|
|
|
|
|
|
|
|
|
MOVQ acc7, mul1
|
|
|
|
|
MULXQ t0, mul0, hlp
|
|
|
|
|
ADDQ mul0, acc3
|
|
|
|
|
ADCQ hlp, acc4
|
|
|
|
|
|
|
|
|
|
MULXQ t1, mul0, hlp
|
|
|
|
|
ADCQ $0, hlp
|
|
|
|
|
ADDQ mul0, acc4
|
|
|
|
|
ADCQ hlp, acc5
|
|
|
|
|
|
|
|
|
|
MULXQ t2, mul0, hlp
|
|
|
|
|
ADCQ $0, hlp
|
|
|
|
|
ADDQ mul0, acc5
|
|
|
|
|
ADCQ hlp, acc6
|
|
|
|
|
|
|
|
|
|
MULXQ t3, mul0, acc7
|
|
|
|
|
ADCQ $0, acc7
|
|
|
|
|
ADDQ mul0, acc6
|
|
|
|
|
ADCQ $0, acc7
|
|
|
|
|
|
|
|
|
|
// First reduction step
|
|
|
|
|
MOVQ acc0, mul0
|
|
|
|
|
MOVQ acc0, mul1
|
|
|
|
|
SHLQ $32, mul0
|
|
|
|
|
SHRQ $32, mul1
|
|
|
|
|
|
|
|
|
|
ADDQ acc0, acc1
|
|
|
|
|
ADCQ $0, acc2
|
|
|
|
|
ADCQ $0, acc3
|
|
|
|
|
ADCQ $0, acc0
|
|
|
|
|
|
|
|
|
|
SUBQ mul0, acc1
|
|
|
|
|
SBBQ mul1, acc2
|
|
|
|
|
SBBQ mul0, acc3
|
|
|
|
|
SBBQ mul1, acc0
|
|
|
|
|
// Second reduction step
|
|
|
|
|
MOVQ acc1, mul0
|
|
|
|
|
MOVQ acc1, mul1
|
|
|
|
|
SHLQ $32, mul0
|
|
|
|
|
SHRQ $32, mul1
|
|
|
|
|
|
|
|
|
|
ADDQ acc1, acc2
|
|
|
|
|
ADCQ $0, acc3
|
|
|
|
|
ADCQ $0, acc0
|
|
|
|
|
ADCQ $0, acc1
|
|
|
|
|
|
|
|
|
|
SUBQ mul0, acc2
|
|
|
|
|
SBBQ mul1, acc3
|
|
|
|
|
SBBQ mul0, acc0
|
|
|
|
|
SBBQ mul1, acc1
|
|
|
|
|
// Third reduction step
|
|
|
|
|
MOVQ acc2, mul0
|
|
|
|
|
MOVQ acc2, mul1
|
|
|
|
|
SHLQ $32, mul0
|
|
|
|
|
SHRQ $32, mul1
|
|
|
|
|
|
|
|
|
|
ADDQ acc2, acc3
|
|
|
|
|
ADCQ $0, acc0
|
|
|
|
|
ADCQ $0, acc1
|
|
|
|
|
ADCQ $0, acc2
|
|
|
|
|
|
|
|
|
|
SUBQ mul0, acc3
|
|
|
|
|
SBBQ mul1, acc0
|
|
|
|
|
SBBQ mul0, acc1
|
|
|
|
|
SBBQ mul1, acc2
|
|
|
|
|
// Last reduction step
|
|
|
|
|
MOVQ acc3, mul0
|
|
|
|
|
MOVQ acc3, mul1
|
|
|
|
|
SHLQ $32, mul0
|
|
|
|
|
SHRQ $32, mul1
|
|
|
|
|
|
|
|
|
|
ADDQ acc3, acc0
|
|
|
|
|
ADCQ $0, acc1
|
|
|
|
|
ADCQ $0, acc2
|
|
|
|
|
ADCQ $0, acc3
|
|
|
|
|
|
|
|
|
|
SUBQ mul0, acc0
|
|
|
|
|
SBBQ mul1, acc1
|
|
|
|
|
SBBQ mul0, acc2
|
|
|
|
|
SBBQ mul1, acc3
|
|
|
|
|
MOVQ $0, BP
|
|
|
|
|
// Add bits [511:256] of the result
|
|
|
|
|
ADCQ acc0, acc4
|
|
|
|
|
ADCQ acc1, acc5
|
|
|
|
|
ADCQ acc2, acc6
|
|
|
|
|
ADCQ acc3, acc7
|
|
|
|
|
ADCQ $0, hlp
|
|
|
|
|
// Copy result
|
|
|
|
|
MOVQ acc4, acc0
|
|
|
|
|
MOVQ acc5, acc1
|
|
|
|
|
MOVQ acc6, acc2
|
|
|
|
|
MOVQ acc7, acc3
|
|
|
|
|
// Subtract p256
|
|
|
|
|
SUBQ $-1, acc4
|
|
|
|
|
SBBQ p256p<>+0x08(SB), acc5
|
|
|
|
|
SBBQ $-1, acc6
|
|
|
|
|
SBBQ p256p<>+0x018(SB), acc7
|
|
|
|
|
SBBQ $0, hlp
|
|
|
|
|
// If the result of the subtraction is negative, restore the previous result
|
|
|
|
|
CMOVQCS acc0, acc4
|
|
|
|
|
CMOVQCS acc1, acc5
|
|
|
|
|
CMOVQCS acc2, acc6
|
|
|
|
|
CMOVQCS acc3, acc7
|
|
|
|
|
|
|
|
|
|
RET
|
|
|
|
|
|
|
|
|
|
// sm2P256SqrReductionInternal reduces the 512-bit value
//   T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]   (most significant limb first)
// and leaves the 256-bit result in [acc7, acc6, acc5, acc4].
//
// Four reduction steps process acc0, acc1, acc2 and acc3 in turn, using only
// 32-bit shifts, additions and subtractions. The folded low half is then added
// to [t3..t0], p256p is subtracted, and the pre-subtraction value is restored
// if that subtraction borrows.
//
// Clobbers: acc0-acc3, t0-t3, mul0, mul1, hlp, flags.
// mul0/mul1/hlp are register aliases defined outside this excerpt
// (presumably AX/DX/BP - confirm against the #define block).
#define sm2P256SqrReductionInternal() \
	\ // First reduction step
	MOVQ acc0, mul0 \
	MOVQ acc0, mul1 \
	SHLQ $32, mul0 \
	SHRQ $32, mul1 \
	\
	ADDQ acc0, acc1 \
	ADCQ $0, acc2 \
	ADCQ $0, acc3 \
	ADCQ $0, acc0 \
	\
	SUBQ mul0, acc1 \
	SBBQ mul1, acc2 \
	SBBQ mul0, acc3 \
	SBBQ mul1, acc0 \
	\ // Second reduction step
	MOVQ acc1, mul0 \
	MOVQ acc1, mul1 \
	SHLQ $32, mul0 \
	SHRQ $32, mul1 \
	\
	ADDQ acc1, acc2 \
	ADCQ $0, acc3 \
	ADCQ $0, acc0 \
	ADCQ $0, acc1 \
	\
	SUBQ mul0, acc2 \
	SBBQ mul1, acc3 \
	SBBQ mul0, acc0 \
	SBBQ mul1, acc1 \
	\ // Third reduction step
	MOVQ acc2, mul0 \
	MOVQ acc2, mul1 \
	SHLQ $32, mul0 \
	SHRQ $32, mul1 \
	\
	ADDQ acc2, acc3 \
	ADCQ $0, acc0 \
	ADCQ $0, acc1 \
	ADCQ $0, acc2 \
	\
	SUBQ mul0, acc3 \
	SBBQ mul1, acc0 \
	SBBQ mul0, acc1 \
	SBBQ mul1, acc2 \
	\ // Last reduction step
	MOVQ acc3, mul0 \
	MOVQ acc3, mul1 \
	SHLQ $32, mul0 \
	SHRQ $32, mul1 \
	\
	ADDQ acc3, acc0 \
	ADCQ $0, acc1 \
	ADCQ $0, acc2 \
	ADCQ $0, acc3 \
	\
	SUBQ mul0, acc0 \
	SBBQ mul1, acc1 \
	SBBQ mul0, acc2 \
	SBBQ mul1, acc3 \
	\ // Zero the overflow limb with MOVQ (not XORQ) so the flags are left untouched.
	\ // NOTE(review): the first ADCQ below consumes the CF left by the SBBQ above;
	\ // that is only correct if the final borrow is always clear - confirm.
	MOVQ $0, BP \
	\ // Add bits [511:256] of the result
	ADCQ acc0, t0 \
	ADCQ acc1, t1 \
	ADCQ acc2, t2 \
	ADCQ acc3, t3 \
	ADCQ $0, hlp \
	\ // Copy result
	MOVQ t0, acc4 \
	MOVQ t1, acc5 \
	MOVQ t2, acc6 \
	MOVQ t3, acc7 \
	\ // Subtract p256 ($-1 matches limbs 0 and 2 of p256p)
	SUBQ $-1, acc4 \
	SBBQ p256p<>+0x08(SB), acc5 \
	SBBQ $-1, acc6 \
	SBBQ p256p<>+0x018(SB), acc7 \
	SBBQ $0, hlp \
	\ // If the result of the subtraction is negative, restore the previous result
	CMOVQCS t0, acc4 \
	CMOVQCS t1, acc5 \
	CMOVQCS t2, acc6 \
	CMOVQCS t3, acc7
|
|
|
|
|
|
2022-08-17 15:23:59 +08:00
|
|
|
|
/* ---------------------------------------*/
|
2023-07-26 10:26:32 +08:00
|
|
|
|
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4]^2
//
// sm2P256SqrInternal squares the 4-limb value held in acc4..acc7 and reduces
// the 512-bit product with sm2P256SqrReductionInternal, returning the result
// in the same registers.
//
// When ·supportBMI2 is 1 the MULX-based path at internalSqrBMI2 is taken;
// otherwise the MULQ-based path is used. Both paths follow the same scheme:
//   1. compute the six cross products a[i]*a[j] for i < j,
//   2. double them,
//   3. add the four squares a[i]*a[i] ("missing products").
//
// Clobbers: acc0-acc3, t0-t3, mul0, mul1, hlp, flags.
TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
	CMPB ·supportBMI2+0(SB), $0x01
	JEQ internalSqrBMI2

	// a0*a1
	MOVQ acc4, mul0
	MULQ acc5
	MOVQ mul0, acc1
	MOVQ mul1, acc2

	// a0*a2
	MOVQ acc4, mul0
	MULQ acc6
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3

	// a0*a3
	MOVQ acc4, mul0
	MULQ acc7
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, t0

	// a1*a2 (high half parked in hlp until the next product)
	MOVQ acc5, mul0
	MULQ acc6
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	// a1*a3
	MOVQ acc5, mul0
	MULQ acc7
	ADDQ hlp, t0
	ADCQ $0, mul1
	ADDQ mul0, t0
	ADCQ $0, mul1
	MOVQ mul1, t1

	// a2*a3
	MOVQ acc6, mul0
	MULQ acc7
	ADDQ mul0, t1
	ADCQ $0, mul1
	MOVQ mul1, t2
	XORQ t3, t3
	// *2
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ t0, t0
	ADCQ t1, t1
	ADCQ t2, t2
	ADCQ $0, t3
	// Missing products (the squares). acc4 is reused as a carry temporary
	// once a0 has been consumed.
	MOVQ acc4, mul0
	MULQ mul0
	MOVQ mul0, acc0
	MOVQ DX, acc4

	MOVQ acc5, mul0
	MULQ mul0
	ADDQ acc4, acc1
	ADCQ mul0, acc2
	ADCQ $0, DX
	MOVQ DX, acc4

	MOVQ acc6, mul0
	MULQ mul0
	ADDQ acc4, acc3
	ADCQ mul0, t0
	ADCQ $0, DX
	MOVQ DX, acc4

	MOVQ acc7, mul0
	MULQ mul0
	ADDQ acc4, t1
	ADCQ mul0, t2
	ADCQ DX, t3
	// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
	sm2P256SqrReductionInternal()
	RET

internalSqrBMI2:
	// MULXQ takes its implicit multiplicand from mul1 and does not modify
	// flags, so carry chains may span the multiplications below.
	// a0*a1, a0*a2, a0*a3
	MOVQ acc4, mul1
	MULXQ acc5, acc1, acc2

	MULXQ acc6, mul0, acc3
	ADDQ mul0, acc2

	MULXQ acc7, mul0, t0
	ADCQ mul0, acc3
	ADCQ $0, t0

	// a1*a2, a1*a3
	MOVQ acc5, mul1
	MULXQ acc6, mul0, hlp
	ADDQ mul0, acc3
	ADCQ hlp, t0

	MULXQ acc7, mul0, t1
	ADCQ $0, t1
	ADDQ mul0, t0

	// a2*a3 (the ADCQ consumes the carry of the ADDQ above)
	MOVQ acc6, mul1
	MULXQ acc7, mul0, t2
	ADCQ mul0, t1
	ADCQ $0, t2
	XORQ t3, t3

	// *2
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ t0, t0
	ADCQ t1, t1
	ADCQ t2, t2
	ADCQ $0, t3

	// Missing products (the squares); acc4 is reused for the high halves.
	MOVQ acc4, mul1
	MULXQ mul1, acc0, acc4
	ADDQ acc4, acc1

	MOVQ acc5, mul1
	MULXQ mul1, mul0, acc4
	ADCQ mul0, acc2
	ADCQ acc4, acc3

	MOVQ acc6, mul1
	MULXQ mul1, mul0, acc4
	ADCQ mul0, t0
	ADCQ acc4, t1

	MOVQ acc7, mul1
	MULXQ mul1, mul0, acc4
	ADCQ mul0, t2
	ADCQ acc4, t3
	// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
	sm2P256SqrReductionInternal()

	RET
|
2023-06-10 10:55:17 +08:00
|
|
|
|
|
2022-08-17 15:23:59 +08:00
|
|
|
|
/* ---------------------------------------*/
|
2023-07-26 10:26:32 +08:00
|
|
|
|
// [t3, t2, t1, t0] = 2[acc7, acc6, acc5, acc4]
//
// Doubles acc4..acc7 in place (carry-out collected in mul0), copies the sum to
// t0..t3, subtracts p256p from the copy, and restores the unsubtracted sum if
// the subtraction borrows. On exit acc4..acc7 hold the low 256 bits of the
// doubled value without that subtraction applied.
// Clobbers: acc4-acc7, mul0, flags.
#define p256MulBy2Inline\
	XORQ mul0, mul0;\
	ADDQ acc4, acc4;\
	ADCQ acc5, acc5;\
	ADCQ acc6, acc6;\
	ADCQ acc7, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, t0;\
	SBBQ p256p<>+0x08(SB), t1;\
	SBBQ $-1, t2;\
	SBBQ p256p<>+0x018(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;
|
|
|
|
|
/* ---------------------------------------*/
|
2023-07-26 10:26:32 +08:00
|
|
|
|
// [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] + [t3, t2, t1, t0]
//
// Adds t0..t3 into acc4..acc7 (carry-out collected in mul0), copies the sum
// back to t0..t3, subtracts p256p from the copy, and restores the unsubtracted
// sum if the subtraction borrows. On exit acc4..acc7 hold the low 256 bits of
// the sum without that subtraction applied.
// Clobbers: acc4-acc7, mul0, flags.
#define p256AddInline \
	XORQ mul0, mul0;\
	ADDQ t0, acc4;\
	ADCQ t1, acc5;\
	ADCQ t2, acc6;\
	ADCQ t3, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, t0;\
	SBBQ p256p<>+0x08(SB), t1;\
	SBBQ $-1, t2;\
	SBBQ p256p<>+0x018(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;
|
|
|
|
|
/* ---------------------------------------*/
|
|
|
|
|
// Helpers that move a 4-limb (256-bit) value between memory and the two
// register groups acc4..acc7 and t0..t3. The src/dst argument is the name of
// an offset-taking operand macro (for example one of the stack-slot macros
// defined below), which is invoked with the byte offset of each limb.
// LDacc: load memory -> acc4..acc7
#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
// LDt: load memory -> t0..t3
#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
// ST: store acc4..acc7 -> memory
#define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
// STt: store t0..t3 -> memory
#define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
// acc2t: copy acc4..acc7 -> t0..t3
#define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
// t2acc: copy t0..t3 -> acc4..acc7
#define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
|
|
|
|
|
/* ---------------------------------------*/
|
|
|
|
|
// Stack frame layout used by p256PointAddAffineInline / p256PointAddAffineAsm.
// Fifteen 32-byte slots hold field elements; they are followed by the saved
// result pointer (8 bytes) and the saved sel / zero arguments (4 bytes each,
// accessed with MOVL).
// Inputs: in1 = (x1in, y1in, z1in), in2 = (x2in, y2in).
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
// Outputs.
#define xout(off) (32*5 + off)(SP)
#define yout(off) (32*6 + off)(SP)
#define zout(off) (32*7 + off)(SP)
// Temporaries.
#define s2(off)   (32*8 + off)(SP)
#define z1sqr(off) (32*9 + off)(SP)
#define h(off)	  (32*10 + off)(SP)
#define r(off)	  (32*11 + off)(SP)
#define hsqr(off) (32*12 + off)(SP)
#define rsqr(off) (32*13 + off)(SP)
#define hcub(off) (32*14 + off)(SP)
// Saved scalars.
#define rptr	  (32*15)(SP)
#define sel_save  (32*15 + 8)(SP)
#define zero_save (32*15 + 8 + 4)(SP)
|
|
|
|
|
|
2023-06-12 09:04:09 +08:00
|
|
|
|
// p256PointAddAffineInline is the shared body of p256PointAddAffineAsm.
//
// On entry:
//   mul0 = result pointer, CX = pointer to in2, DX = sign,
//   t1 = sel, t2 = zero,
//   x1in/y1in/z1in and x2in already populated on the stack.
// It loads y2 from offset 32 of in2, replaces it by (p256p - y2) when
// sign != 0, stores it to y2in, then computes xout/yout/zout.
// On exit: AX = result pointer, BX = sel, CX = zero (reloaded from the stack).
//
// Formulas (read off the calls below, with u1 = x1 and s1 = y1, assuming
// sm2P256SubInternal computes acc - t as the existing "h = u2 - u1" comment
// indicates):
//   u2 = x2*z1^2, s2 = y2*z1^3, h = u2 - u1, r = s2 - s1
//   zout = h*z1
//   xout = r^2 - 2*u1*h^2 - h^3
//   yout = r*(u1*h^2 - xout) - y1*h^3
#define p256PointAddAffineInline() \
	\// Store pointer to result
	MOVQ mul0, rptr \
	MOVL t1, sel_save \
	MOVL t2, zero_save \
	\// Negate y2in based on sign
	MOVQ (16*2 + 8*0)(CX), acc4 \
	MOVQ (16*2 + 8*1)(CX), acc5 \
	MOVQ (16*2 + 8*2)(CX), acc6 \
	MOVQ (16*2 + 8*3)(CX), acc7 \
	MOVQ $-1, acc0 \
	MOVQ p256p<>+0x08(SB), acc1 \
	MOVQ $-1, acc2 \
	MOVQ p256p<>+0x018(SB), acc3 \
	XORQ mul0, mul0 \
	\// Speculatively subtract: [acc3..acc0] = p256p - y2, borrow in mul0
	SUBQ acc4, acc0 \
	SBBQ acc5, acc1 \
	SBBQ acc6, acc2 \
	SBBQ acc7, acc3 \
	SBBQ $0, mul0 \
	MOVQ acc0, t0 \
	MOVQ acc1, t1 \
	MOVQ acc2, t2 \
	MOVQ acc3, t3 \
	\// Add p256p back in case the operand was > p256; keep the sum only if
	\// mul0 ends up zero, otherwise restore the plain difference.
	ADDQ $-1, acc0 \
	ADCQ p256p<>+0x08(SB), acc1 \
	ADCQ $-1, acc2 \
	ADCQ p256p<>+0x018(SB), acc3 \
	ADCQ $0, mul0 \
	CMOVQNE t0, acc0 \
	CMOVQNE t1, acc1 \
	CMOVQNE t2, acc2 \
	CMOVQNE t3, acc3 \
	\// If condition (sign, in DX) is 0, keep original value
	TESTQ DX, DX \
	CMOVQEQ acc4, acc0 \
	CMOVQEQ acc5, acc1 \
	CMOVQEQ acc6, acc2 \
	CMOVQEQ acc7, acc3 \
	\// Store result
	MOVQ acc0, y2in(8*0) \
	MOVQ acc1, y2in(8*1) \
	MOVQ acc2, y2in(8*2) \
	MOVQ acc3, y2in(8*3) \
	\// Begin point add
	LDacc (z1in) \
	CALL sm2P256SqrInternal(SB) \// z1^2
	ST (z1sqr) \
	\
	LDt (x2in) \
	CALL sm2P256MulInternal(SB) \// x2 * z1^2
	\
	LDt (x1in) \
	CALL sm2P256SubInternal(SB) \// h = u2 - u1
	ST (h) \
	\
	LDt (z1in) \
	CALL sm2P256MulInternal(SB) \// z3 = h * z1
	ST (zout) \
	\
	LDacc (z1sqr) \
	CALL sm2P256MulInternal(SB) \// z1^3
	\
	LDt (y2in) \
	CALL sm2P256MulInternal(SB) \// s2 = y2 * z1^3
	ST (s2) \
	\
	LDt (y1in) \
	CALL sm2P256SubInternal(SB) \// r = s2 - s1
	ST (r) \
	\
	CALL sm2P256SqrInternal(SB) \// rsqr = r^2
	ST (rsqr) \
	\
	LDacc (h) \
	CALL sm2P256SqrInternal(SB) \// hsqr = h^2
	ST (hsqr) \
	\
	LDt (h) \
	CALL sm2P256MulInternal(SB) \// hcub = h^3
	ST (hcub) \
	\
	LDt (y1in) \
	CALL sm2P256MulInternal(SB) \// y1 * h^3
	ST (s2) \
	\
	LDacc (x1in) \
	LDt (hsqr) \
	CALL sm2P256MulInternal(SB) \// u1 * h^2
	ST (h) \
	\
	p256MulBy2Inline \// u1 * h^2 * 2, inline
	LDacc (rsqr) \
	CALL sm2P256SubInternal(SB) \// r^2 - u1 * h^2 * 2
	\
	LDt (hcub) \
	CALL sm2P256SubInternal(SB) \// xout = r^2 - u1 * h^2 * 2 - h^3
	ST (xout) \
	\
	MOVQ acc4, t0 \
	MOVQ acc5, t1 \
	MOVQ acc6, t2 \
	MOVQ acc7, t3 \
	LDacc (h) \
	CALL sm2P256SubInternal(SB) \// u1 * h^2 - xout (slot h now holds u1 * h^2)
	\
	LDt (r) \
	CALL sm2P256MulInternal(SB) \// r * (u1 * h^2 - xout)
	\
	LDt (s2) \
	CALL sm2P256SubInternal(SB) \// yout = r * (u1 * h^2 - xout) - y1 * h^3
	ST (yout) \
	\// Load stored values from stack
	MOVQ rptr, AX \
	MOVL sel_save, BX \
	MOVL zero_save, CX
|
|
|
|
|
|
2022-08-25 16:45:18 +08:00
|
|
|
|
// func p256PointAddAffineAsm(res, in1 *SM2P256Point, in2 *p256AffinePoint, sign, sel, zero int)
//
// Adds the affine point in2 (with y2 replaced by p256p - y2 when sign != 0) to
// the point in1 and writes the 96-byte result to res. The output is then
// selected with masks rather than branches:
//   sel  == 0 -> res = in1
//   zero == 0 -> res = (x2, y2 after the sign step, p256one)
//   otherwise -> res = computed sum
// (the zero == 0 selection is applied last and therefore wins).
//
// When ·supportAVX2 is 1 the copies and the selection use 256-bit AVX2
// instructions (pointaddaffine_avx2); otherwise 128-bit SSE instructions.
TEXT ·p256PointAddAffineAsm(SB),0,$512-48
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX
	MOVQ in2+16(FP), CX
	MOVQ sign+24(FP), DX
	MOVQ sel+32(FP), t1
	MOVQ zero+40(FP), t2

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ pointaddaffine_avx2

	// Copy in1 (x, y, z) to the stack.
	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	// Copy x2 only; y2 is loaded and stored by p256PointAddAffineInline.
	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)

	p256PointAddAffineInline()
	// The result is not valid if (sel == 0); select conditionally.
	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5

	// BX = sel, CX = zero (restored by the macro above).
	MOVL BX, X6
	MOVL CX, X7

	PXOR X8, X8
	PCMPEQL X9, X9

	PSHUFD $0, X6, X6
	PSHUFD $0, X7, X7

	// X6 = all-ones if sel == 0, X7 = all-ones if zero == 0.
	PCMPEQL X8, X6
	PCMPEQL X8, X7

	// X15 = ^X6
	MOVOU X6, X15
	PANDN X9, X15

	MOVOU x1in(16*0), X9
	MOVOU x1in(16*1), X10
	MOVOU y1in(16*0), X11
	MOVOU y1in(16*1), X12
	MOVOU z1in(16*0), X13
	MOVOU z1in(16*1), X14

	// result = (sum & ^mask) ^ (in1 & mask)
	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X6, X9
	PAND X6, X10
	PAND X6, X11
	PAND X6, X12
	PAND X6, X13
	PAND X6, X14

	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5
	// Similarly if zero == 0
	PCMPEQL X9, X9
	MOVOU X7, X15
	PANDN X9, X15

	MOVOU x2in(16*0), X9
	MOVOU x2in(16*1), X10
	MOVOU y2in(16*0), X11
	MOVOU y2in(16*1), X12
	MOVOU p256one<>+0x00(SB), X13
	MOVOU p256one<>+0x10(SB), X14

	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X7, X9
	PAND X7, X10
	PAND X7, X11
	PAND X7, X12
	PAND X7, X13
	PAND X7, X14

	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5
	// Finally output the result
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)
	// Clear the saved result pointer from the frame.
	MOVQ $0, rptr

	RET
pointaddaffine_avx2:
	// Copy in1 (x, y, z) to the stack, 32 bytes at a time.
	VMOVDQU (32*0)(BX), Y0
	VMOVDQU (32*1)(BX), Y1
	VMOVDQU (32*2)(BX), Y2

	VMOVDQU Y0, x1in(32*0)
	VMOVDQU Y1, y1in(32*0)
	VMOVDQU Y2, z1in(32*0)

	// Copy x2 only; y2 is loaded and stored by p256PointAddAffineInline.
	VMOVDQU (32*0)(CX), Y0
	VMOVDQU Y0, x2in(32*0)

	p256PointAddAffineInline()
	// The result is not valid if (sel == 0); select conditionally.
	// BX = sel, CX = zero (restored by the macro above).
	MOVL BX, X6
	MOVL CX, X7

	VPXOR Y8, Y8, Y8
	VPCMPEQD Y9, Y9, Y9

	VPBROADCASTD X6, Y6
	VPBROADCASTD X7, Y7

	// Y6 = all-ones if sel == 0, Y7 = all-ones if zero == 0.
	VPCMPEQD Y8, Y6, Y6
	VPCMPEQD Y8, Y7, Y7

	// Y15 = ^Y6
	VMOVDQU Y6, Y15
	VPANDN Y9, Y15, Y15

	// result = (sum & ^mask) ^ (in1 & mask)
	VPAND xout(32*0), Y15, Y0
	VPAND yout(32*0), Y15, Y1
	VPAND zout(32*0), Y15, Y2

	VPAND x1in(32*0), Y6, Y9
	VPAND y1in(32*0), Y6, Y10
	VPAND z1in(32*0), Y6, Y11

	VPXOR Y9, Y0, Y0
	VPXOR Y10, Y1, Y1
	VPXOR Y11, Y2, Y2

	// Similarly if zero == 0
	VPCMPEQD Y9, Y9, Y9
	VPANDN Y9, Y7, Y15

	VPAND Y15, Y0, Y0
	VPAND Y15, Y1, Y1
	VPAND Y15, Y2, Y2

	VPAND x2in(32*0), Y7, Y9
	VPAND y2in(32*0), Y7, Y10
	VPAND p256one<>+0x00(SB), Y7, Y11

	VPXOR Y9, Y0, Y0
	VPXOR Y10, Y1, Y1
	VPXOR Y11, Y2, Y2

	// Finally output the result
	VMOVDQU Y0, (32*0)(AX)
	VMOVDQU Y1, (32*1)(AX)
	VMOVDQU Y2, (32*2)(AX)
	// Clear the saved result pointer from the frame.
	MOVQ $0, rptr

	VZEROUPPER
	RET
|
2022-08-17 15:23:59 +08:00
|
|
|
|
// Release the frame-layout names used by p256PointAddAffineAsm so that the
// following routines can define their own layouts.
#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef xout
#undef yout
#undef zout
#undef s2
#undef z1sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
#undef sel_save
#undef zero_save
|
|
|
|
|
|
|
|
|
|
// sm2P256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero
// otherwise. "Represents zero" means the four limbs are either all zero or
// equal to p256p. It writes to [acc4..acc7], t0 and t1.
TEXT sm2P256IsZero(SB),NOSPLIT,$0
	// AX contains a flag that is set if the input is zero.
	XORQ AX, AX
	MOVQ $1, t1

	// Check whether [acc4..acc7] are all zero.
	MOVQ acc4, t0
	ORQ acc5, t0
	ORQ acc6, t0
	ORQ acc7, t0

	// Set the zero flag if so. (CMOV of a constant to a register doesn't
	// appear to be supported in Go. Thus t1 = 1.)
	CMOVQEQ t1, AX

	// XOR [acc4..acc7] with P and compare with zero again.
	// ($-1 matches limbs 0 and 2 of p256p.)
	XORQ $-1, acc4
	XORQ p256p<>+0x08(SB), acc5
	XORQ $-1, acc6
	XORQ p256p<>+0x018(SB), acc7
	ORQ acc5, acc4
	ORQ acc6, acc4
	ORQ acc7, acc4

	// Set the zero flag if so.
	CMOVQEQ t1, AX
	RET
|
|
|
|
|
|
|
|
|
|
/* ---------------------------------------*/
|
|
|
|
|
// Stack frame layout used by p256PointAddInline / p256PointAddAsm.
// Twenty 32-byte slots hold field elements; they are followed by the saved
// result pointer and the points_eq flag (8 bytes each).
// Inputs: in1 = (x1in, y1in, z1in), in2 = (x2in, y2in, z2in).
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define z2in(off) (32*5 + off)(SP)

// Outputs.
#define xout(off) (32*6 + off)(SP)
#define yout(off) (32*7 + off)(SP)
#define zout(off) (32*8 + off)(SP)

// Temporaries.
#define u1(off)    (32*9 + off)(SP)
#define u2(off)    (32*10 + off)(SP)
#define s1(off)    (32*11 + off)(SP)
#define s2(off)    (32*12 + off)(SP)
#define z1sqr(off) (32*13 + off)(SP)
#define z2sqr(off) (32*14 + off)(SP)
#define h(off)     (32*15 + off)(SP)
#define r(off)     (32*16 + off)(SP)
#define hsqr(off)  (32*17 + off)(SP)
#define rsqr(off)  (32*18 + off)(SP)
#define hcub(off)  (32*19 + off)(SP)
// Saved scalars.
#define rptr       (32*20)(SP)
#define points_eq  (32*20+8)(SP)
|
|
|
|
|
|
2023-06-12 09:04:09 +08:00
|
|
|
|
// p256PointAddInline is the shared body of p256PointAddAsm.
//
// Expects x1in/y1in/z1in and x2in/y2in/z2in populated on the stack and writes
// xout/yout/zout. It also stores in points_eq the AND of the sm2P256IsZero
// results for r and h, i.e. 1 only when both differences represent zero.
//
// Formulas (read off the calls below, assuming sm2P256SubInternal computes
// acc - t as the existing "r = s2 - s1" comment indicates):
//   s1 = y1*z2^3, s2 = y2*z1^3, r = s2 - s1
//   u1 = x1*z2^2, u2 = x2*z1^2, h = u2 - u1
//   zout = z1*z2*h
//   xout = r^2 - 2*u1*h^2 - h^3
//   yout = r*(u1*h^2 - xout) - s1*h^3
#define p256PointAddInline() \
	\// Begin point add
	LDacc (z2in) \
	CALL sm2P256SqrInternal(SB) \// z2^2
	ST (z2sqr) \
	LDt (z2in) \
	CALL sm2P256MulInternal(SB) \// z2^3
	LDt (y1in) \
	CALL sm2P256MulInternal(SB) \// s1 = z2^3*y1
	ST (s1) \
	\
	LDacc (z1in) \
	CALL sm2P256SqrInternal(SB) \// z1^2
	ST (z1sqr) \
	LDt (z1in) \
	CALL sm2P256MulInternal(SB) \// z1^3
	LDt (y2in) \
	CALL sm2P256MulInternal(SB) \// s2 = z1^3*y2
	ST (s2) \
	\
	LDt (s1) \
	CALL sm2P256SubInternal(SB) \// r = s2 - s1
	ST (r) \
	CALL sm2P256IsZero(SB) \// AX = 1 if r represents zero
	MOVQ AX, points_eq \
	\
	LDacc (z2sqr) \
	LDt (x1in) \
	CALL sm2P256MulInternal(SB) \// u1 = x1 * z2^2
	ST (u1) \
	LDacc (z1sqr) \
	LDt (x2in) \
	CALL sm2P256MulInternal(SB) \// u2 = x2 * z1^2
	ST (u2) \
	\
	LDt (u1) \
	CALL sm2P256SubInternal(SB) \// h = u2 - u1
	ST (h) \
	CALL sm2P256IsZero(SB) \// AX = 1 if h represents zero
	ANDQ points_eq, AX \
	MOVQ AX, points_eq \
	\
	LDacc (r) \
	CALL sm2P256SqrInternal(SB) \// rsqr = r^2
	ST (rsqr) \
	\
	LDacc (h) \
	CALL sm2P256SqrInternal(SB) \// hsqr = h^2
	ST (hsqr) \
	\
	LDt (h) \
	CALL sm2P256MulInternal(SB) \// hcub = h^3
	ST (hcub) \
	\
	LDt (s1) \
	CALL sm2P256MulInternal(SB) \// s1 * h^3 (stored in slot s2)
	ST (s2) \
	\
	LDacc (z1in) \
	LDt (z2in) \
	CALL sm2P256MulInternal(SB) \// z1 * z2
	LDt (h) \
	CALL sm2P256MulInternal(SB) \// z1 * z2 * h
	ST (zout) \
	\
	LDacc (hsqr) \
	LDt (u1) \
	CALL sm2P256MulInternal(SB) \// h^2 * u1 (stored in slot u2)
	ST (u2) \
	\
	p256MulBy2Inline \// u1 * h^2 * 2, inline
	LDacc (rsqr) \
	CALL sm2P256SubInternal(SB) \// r^2 - u1 * h^2 * 2
	\
	LDt (hcub) \
	CALL sm2P256SubInternal(SB) \// xout = r^2 - u1 * h^2 * 2 - h^3
	ST (xout) \
	\
	MOVQ acc4, t0 \
	MOVQ acc5, t1 \
	MOVQ acc6, t2 \
	MOVQ acc7, t3 \
	LDacc (u2) \
	CALL sm2P256SubInternal(SB) \// u1 * h^2 - xout
	\
	LDt (r) \
	CALL sm2P256MulInternal(SB) \// r * (u1 * h^2 - xout)
	\
	LDt (s2) \
	CALL sm2P256SubInternal(SB) \// yout = r * (u1 * h^2 - xout) - s1 * h^3
	ST (yout)
|
|
|
|
|
|
2022-08-25 16:45:18 +08:00
|
|
|
|
// func p256PointAddAsm(res, in1, in2 *SM2P256Point) int
//
// Adds the points in1 and in2 (96 bytes each: x, y, z) and writes the 96-byte
// sum to res. The return value is the points_eq flag computed by
// p256PointAddInline: 1 when both r = s2 - s1 and h = u2 - u1 represent zero,
// 0 otherwise.
//
// When ·supportAVX2 is 1 the copies use 256-bit AVX2 moves (pointadd_avx2);
// otherwise 128-bit SSE moves.
TEXT ·p256PointAddAsm(SB),0,$680-32
	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX
	MOVQ in2+16(FP), CX

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ pointadd_avx2

	// Copy in1 to the stack.
	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	// Copy in2 to the stack.
	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1
	MOVOU (16*2)(CX), X2
	MOVOU (16*3)(CX), X3
	MOVOU (16*4)(CX), X4
	MOVOU (16*5)(CX), X5

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)
	MOVOU X2, y2in(16*0)
	MOVOU X3, y2in(16*1)
	MOVOU X4, z2in(16*0)
	MOVOU X5, z2in(16*1)
	// Store pointer to result
	MOVQ AX, rptr
	p256PointAddInline()

	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5
	// Finally output the result (the saved pointer is cleared from the frame
	// once it has been reloaded).
	MOVQ rptr, AX
	MOVQ $0, rptr
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)

	// Return the points_eq flag.
	MOVQ points_eq, AX
	MOVQ AX, ret+24(FP)

	RET
pointadd_avx2:
	// Copy in1 to the stack, 32 bytes at a time.
	VMOVDQU (32*0)(BX), Y0
	VMOVDQU (32*1)(BX), Y1
	VMOVDQU (32*2)(BX), Y2

	VMOVDQU Y0, x1in(32*0)
	VMOVDQU Y1, y1in(32*0)
	VMOVDQU Y2, z1in(32*0)

	// Copy in2 to the stack.
	VMOVDQU (32*0)(CX), Y0
	VMOVDQU (32*1)(CX), Y1
	VMOVDQU (32*2)(CX), Y2

	VMOVDQU Y0, x2in(32*0)
	VMOVDQU Y1, y2in(32*0)
	VMOVDQU Y2, z2in(32*0)

	// Store pointer to result
	MOVQ AX, rptr
	p256PointAddInline()

	VMOVDQU xout(32*0), Y0
	VMOVDQU yout(32*0), Y1
	VMOVDQU zout(32*0), Y2
	// Finally output the result (the saved pointer is cleared from the frame
	// once it has been reloaded).
	MOVQ rptr, AX
	MOVQ $0, rptr
	VMOVDQU Y0, (32*0)(AX)
	VMOVDQU Y1, (32*1)(AX)
	VMOVDQU Y2, (32*2)(AX)

	// Return the points_eq flag.
	MOVQ points_eq, AX
	MOVQ AX, ret+24(FP)

	VZEROUPPER
	RET
|
|
|
|
|
|
2022-08-17 15:23:59 +08:00
|
|
|
|
// Release the frame-layout names used by p256PointAddAsm so that the
// following routines can define their own layouts.
// NOTE(review): points_eq is defined together with these names but is not
// undefined here - confirm whether that is intentional.
#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef z2in
#undef xout
#undef yout
#undef zout
#undef s1
#undef s2
#undef u1
#undef u2
#undef z1sqr
#undef z2sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
|
|
|
|
|
/* ---------------------------------------*/
|
|
|
|
|
// Frame layout for the point-doubling code: seven 32-byte slots addressed as
// name(off), followed by one slot that holds the result pointer.

// Coordinates of the point being doubled.
#define x(off) (32*0 + off)(SP)
#define y(off) (32*1 + off)(SP)
#define z(off) (32*2 + off)(SP)

// Scratch values shared by calZ(), calX() and calY().
#define s(off)    (32*3 + off)(SP)
#define m(off)    (32*4 + off)(SP)
#define zsqr(off) (32*5 + off)(SP)
#define tmp(off)  (32*6 + off)(SP)

// Pointer to the caller's result point.
#define rptr (32*7)(SP)
|
|
|
|
|
|
2023-06-10 10:55:17 +08:00
|
|
|
|
// calZ is the first step of a point doubling. It reads the x, y and z frame
// slots, writes the zsqr and m scratch slots (both are read later by calX()),
// and leaves its result in t0..t3, which the callers store as the new z.
// NOTE(review): the formulas in the step comments assume the helpers defined
// outside this section (LDacc, LDt, ST, STt, p256AddInline, p256MulBy2Inline,
// sm2P256SqrInternal, sm2P256MulInternal) load/store/combine field elements
// through the acc and t register groups in the usual way - confirm against
// their definitions.
#define calZ() \
	\// zsqr = z^2
	LDacc (z)                     \
	CALL sm2P256SqrInternal(SB)   \
	ST (zsqr)                     \
	\// m = x + zsqr
	LDt (x)                       \
	p256AddInline                 \
	STt (m)                       \
	\// new z = 2 * y * z
	LDacc (z)                     \
	LDt (y)                       \
	CALL sm2P256MulInternal(SB)   \
	p256MulBy2Inline
|
|
|
|
|
|
|
|
|
|
// calX is the second step of a point doubling and must run after calZ(): it
// reads the zsqr and m slots that calZ() filled. It overwrites the m, s, y and
// tmp slots and leaves its result in acc4..acc7, which the callers store as
// the new x.
// NOTE(review): the formulas in the step comments assume the helpers defined
// outside this section (LDacc, LDt, ST, STt, t2acc, p256AddInline,
// p256MulBy2Inline, sm2P256*Internal) have their usual accumulator/temporary
// semantics - confirm against their definitions.
#define calX() \
	\// m = (x - zsqr) * m
	LDacc (x)                     \
	LDt (zsqr)                    \
	CALL sm2P256SubInternal(SB)   \
	LDt (m)                       \
	CALL sm2P256MulInternal(SB)   \
	ST (m)                        \
	\// m = 3 * m, formed as 2*m + m
	p256MulBy2Inline              \
	LDacc (m)                     \
	p256AddInline                 \
	STt (m)                       \
	\// s = (2 * y)^2
	LDacc (y)                     \
	p256MulBy2Inline              \
	t2acc                         \
	CALL sm2P256SqrInternal(SB)   \
	ST (s)                        \
	\// y = s^2 / 2
	CALL sm2P256SqrInternal(SB)   \
	\// Branch-free halving: keep the value in t0..t3, add p to acc4..acc7
	\// with the carry-out collected in mul0.
	XORQ mul0, mul0               \
	MOVQ acc4, t0                 \
	MOVQ acc5, t1                 \
	MOVQ acc6, t2                 \
	MOVQ acc7, t3                 \
	\// $-1 is the all-ones limb of p
	ADDQ $-1, acc4                \
	ADCQ p256p<>+0x08(SB), acc5   \
	ADCQ $-1, acc6                \
	ADCQ p256p<>+0x18(SB), acc7   \
	ADCQ $0, mul0                 \
	TESTQ $1, t0                  \
	\// Low bit clear (value already even): restore the saved limbs, and
	\// the ANDQ then zeroes the carry because bit 0 of t0 is 0.
	CMOVQEQ t0, acc4              \
	CMOVQEQ t1, acc5              \
	CMOVQEQ t2, acc6              \
	CMOVQEQ t3, acc7              \
	ANDQ t0, mul0                 \
	\// Shift mul0:acc7:acc6:acc5:acc4 right by one bit
	SHRQ $1, acc5, acc4           \
	SHRQ $1, acc6, acc5           \
	SHRQ $1, acc7, acc6           \
	SHRQ $1, mul0, acc7           \
	ST (y)                        \
	\// s = x * s, tmp = 2 * s
	LDacc (x)                     \
	LDt (s)                       \
	CALL sm2P256MulInternal(SB)   \
	ST (s)                        \
	p256MulBy2Inline              \
	STt (tmp)                     \
	\// new x = m^2 - tmp
	LDacc (m)                     \
	CALL sm2P256SqrInternal(SB)   \
	LDt (tmp)                     \
	CALL sm2P256SubInternal(SB)
|
|
|
|
|
|
|
|
|
|
// calY is the last step of a point doubling and must run directly after
// calX(): it starts from the value calX() left in the accumulator and reads
// the s, m and y slots that calX() wrote. Its result is left in acc4..acc7,
// which the callers store as the new y.
// NOTE(review): the formulas in the step comments assume acc2t, LDacc, LDt and
// the sm2P256*Internal routines (all defined outside this section) have their
// usual accumulator/temporary semantics - confirm against their definitions.
#define calY() \
	\// acc = s - (value left by calX)
	acc2t                         \
	LDacc (s)                     \
	CALL sm2P256SubInternal(SB)   \
	\// acc = acc * m
	LDt (m)                       \
	CALL sm2P256MulInternal(SB)   \
	\// new y = acc - y
	LDt (y)                       \
	CALL sm2P256SubInternal(SB)
|
|
|
|
|
|
2023-06-12 09:04:09 +08:00
|
|
|
|
// lastP256PointDouble performs one doubling of the point held in the x/y/z
// frame slots and writes the three result coordinates directly to the point
// addressed by rptr (x at byte 0, y at byte 32, z at byte 64). The saved
// pointer is reloaded into AX before each group of stores and the rptr slot
// is zeroed at the end.
// NOTE(review): the reload presumably exists because AX does not survive the
// cal*() steps, and the final zeroing presumably avoids leaving a stale
// pointer in the frame - confirm both.
#define lastP256PointDouble() \
	calZ()                         \
	MOVQ rptr, AX                  \
	\// z result lives in t0..t3
	MOVQ t0, (32*2 + 8*0)(AX)      \
	MOVQ t1, (32*2 + 8*1)(AX)      \
	MOVQ t2, (32*2 + 8*2)(AX)      \
	MOVQ t3, (32*2 + 8*3)(AX)      \
	\
	calX()                         \
	MOVQ rptr, AX                  \
	\// x result lives in acc4..acc7
	MOVQ acc4, (32*0 + 8*0)(AX)    \
	MOVQ acc5, (32*0 + 8*1)(AX)    \
	MOVQ acc6, (32*0 + 8*2)(AX)    \
	MOVQ acc7, (32*0 + 8*3)(AX)    \
	\
	calY()                         \
	MOVQ rptr, AX                  \
	\// y result lives in acc4..acc7
	MOVQ acc4, (32*1 + 8*0)(AX)    \
	MOVQ acc5, (32*1 + 8*1)(AX)    \
	MOVQ acc6, (32*1 + 8*2)(AX)    \
	MOVQ acc7, (32*1 + 8*3)(AX)    \
	\// Drop the saved result pointer
	MOVQ $0, rptr
|
|
|
|
|
|
|
|
|
|
// p256PointDoubleInit copies the 96-byte point addressed by BX into the x, y
// and z frame slots, 16 bytes at a time through X0..X5, using unaligned moves.
// BX is only read.
#define p256PointDoubleInit() \
	\// x: bytes 0..31
	MOVOU (16*0)(BX), X0   \
	MOVOU (16*1)(BX), X1   \
	MOVOU X0, x(16*0)      \
	MOVOU X1, x(16*1)      \
	\// y: bytes 32..63
	MOVOU (16*2)(BX), X2   \
	MOVOU (16*3)(BX), X3   \
	MOVOU X2, y(16*0)      \
	MOVOU X3, y(16*1)      \
	\// z: bytes 64..95
	MOVOU (16*4)(BX), X4   \
	MOVOU (16*5)(BX), X5   \
	MOVOU X4, z(16*0)      \
	MOVOU X5, z(16*1)
|
|
|
|
|
|
|
|
|
|
// func p256PointDoubleAsm(res, in *SM2P256Point)
//
// Doubles the point at in once and writes the result to res. The input is
// copied into the local frame first, so the computation works only on
// registers and frame slots until the final stores.
TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-16
	// Keep the result pointer in the frame for the final stores.
	MOVQ res+0(FP), AX
	MOVQ AX, rptr

	// Move input to stack in order to free registers.
	MOVQ in+8(FP), BX
	p256PointDoubleInit()

	// One doubling, stored straight to res.
	lastP256PointDouble()

	RET
|
|
|
|
|
|
2023-06-10 10:55:17 +08:00
|
|
|
|
// storeTmpX writes acc4..acc7 (lowest limb first) to the x frame slot.
#define storeTmpX() \
	MOVQ acc4, x(8*0)   \
	MOVQ acc5, x(8*1)   \
	MOVQ acc6, x(8*2)   \
	MOVQ acc7, x(8*3)
|
|
|
|
|
|
|
|
|
|
// storeTmpY writes acc4..acc7 (lowest limb first) to the y frame slot.
#define storeTmpY() \
	MOVQ acc4, y(8*0)   \
	MOVQ acc5, y(8*1)   \
	MOVQ acc6, y(8*2)   \
	MOVQ acc7, y(8*3)
|
|
|
|
|
|
|
|
|
|
// storeTmpZ writes t0..t3 (lowest limb first) to the z frame slot.
#define storeTmpZ() \
	MOVQ t0, z(8*0)   \
	MOVQ t1, z(8*1)   \
	MOVQ t2, z(8*2)   \
	MOVQ t3, z(8*3)
|
|
|
|
|
|
2023-06-12 09:04:09 +08:00
|
|
|
|
// p256PointDoubleRound doubles the point held in the x/y/z frame slots in
// place: each cal*() step is followed by a store of its result back into the
// matching slot, so another round can be chained directly after this one.
// The z, x, y order is required: calX() reads scratch values produced by
// calZ(), and calY() continues from the state calX() leaves behind.
#define p256PointDoubleRound() \
	calZ()        \
	storeTmpZ()   \
	\
	calX()        \
	storeTmpX()   \
	\
	calY()        \
	storeTmpY()
|
2023-06-10 10:55:17 +08:00
|
|
|
|
|
2023-06-14 17:30:58 +08:00
|
|
|
|
// func p256PointDouble6TimesAsm(res, in *SM2P256Point)
//
// Doubles the point at in six times in a row and writes the final result to
// res. The first five doublings keep their result in the local frame; only the
// sixth one stores to res.
TEXT ·p256PointDouble6TimesAsm(SB),NOSPLIT,$256-16
	// Keep the result pointer in the frame for the final stores.
	MOVQ res+0(FP), AX
	MOVQ AX, rptr

	// Move input to stack in order to free registers.
	MOVQ in+8(FP), BX
	p256PointDoubleInit()

	// Doublings 1 through 5, in place on the frame copy.
	p256PointDoubleRound()
	p256PointDoubleRound()
	p256PointDoubleRound()
	p256PointDoubleRound()
	p256PointDoubleRound()

	// Doubling 6, stored straight to res.
	lastP256PointDouble()

	RET
|
|
|
|
|
/* ---------------------------------------*/
|