// gmsm/internal/sm2ec/p256_macros_amd64.s
#define res_ptr DI
#define x_ptr SI
#define y_ptr CX
#define acc0 R8
#define acc1 R9
#define acc2 R10
#define acc3 R11
#define acc4 R12
#define acc5 R13
#define t0 R14
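// Constants for the SM2 curve, stored below as little-endian 64-bit limbs:
//   p256p     - the SM2 prime p = 2^256 - 2^224 - 2^96 + 2^64 - 1
//   p256ordK0 - -n^-1 mod 2^64, the Montgomery constant for reduction mod n
//   p256ord   - n, the order of the SM2 base point
//   p256one   - 2^256 mod p, i.e. 1 in the Montgomery domain
//
// A minimal Go sketch (not part of this file) that cross-checks p256one
// with math/big; the hex string is the standard SM2 prime:
//
//   p, _ := new(big.Int).SetString("fffffffeffffffffffffffffffffffffffffffff00000000ffffffffffffffff", 16)
//   one := new(big.Int).Mod(new(big.Int).Lsh(big.NewInt(1), 256), p)
//   // one.Bits() now matches the p256one limbs below, least significant first.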
DATA p256p<>+0x00(SB)/8, $0xffffffffffffffff
DATA p256p<>+0x08(SB)/8, $0xffffffff00000000
DATA p256p<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256p<>+0x18(SB)/8, $0xfffffffeffffffff
DATA p256ordK0<>+0x00(SB)/8, $0x327f9e8872350975
DATA p256ord<>+0x00(SB)/8, $0x53bbf40939d54123
DATA p256ord<>+0x08(SB)/8, $0x7203df6b21c6052b
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xfffffffeffffffff
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0x00000000ffffffff
DATA p256one<>+0x10(SB)/8, $0x0000000000000000
DATA p256one<>+0x18(SB)/8, $0x0000000100000000
GLOBL p256p<>(SB), 8, $32
GLOBL p256ordK0<>(SB), 8, $8
GLOBL p256ord<>(SB), 8, $32
GLOBL p256one<>(SB), 8, $32
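/* ---------------------------------------*/
// p256SqrMontReduce performs the four Montgomery reduction steps modulo the
// SM2 prime p. Because p = -1 mod 2^64, the per-step Montgomery multiplier
// is the lowest limb itself (m = acc0 * (-p^-1 mod 2^64) = acc0), so each
// step just adds acc0*p, realized with one 32-bit shift pair, a subtract
// chain and an add chain; the lowest limb cancels to zero and is shifted
// out. A minimal Go sketch of one such step using math/bits (hypothetical
// helper, not part of the package; the dropped borrow and carry cancel
// because the step result always fits in four limbs):
//
//   func reduceStep(a0, a1, a2, a3 uint64) (r0, r1, r2, r3 uint64) {
//       lo, hi := a0<<32, a0>>32
//       var b, c uint64
//       a1, b = bits.Sub64(a1, lo, 0) // -0x100000000 digits of p,
//       a2, b = bits.Sub64(a2, hi, b) // at positions 1 and 3
//       a3, b = bits.Sub64(a3, lo, b)
//       top, _ := bits.Sub64(a0, hi, b) // +1 digit at position 4: old a0
//       a1, c = bits.Add64(a1, a0, 0)   // serves as the new top limb;
//       a2, c = bits.Add64(a2, 0, c)    // +1 digit at position 1; the -1
//       a3, c = bits.Add64(a3, 0, c)    // digit at position 0 cancels a0
//       top, _ = bits.Add64(top, 0, c)  // exactly, so limb 0 is dropped
//       return a1, a2, a3, top
//   }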
#define p256SqrMontReduce() \
\ // First reduction step: p as signed base-2^64 digits is [p4, p3, p2, p1, p0] = [1, -0x100000000, 0, (1 - 0x100000000), -1]
MOVQ acc0, AX \
MOVQ acc0, DX \
SHLQ $32, AX \ // AX = L(acc0 * 2^32), low part
SHRQ $32, DX \ // DX = H(acc0 * 2^32), high part
\ // calculate the negative part: [0, -0x100000000, 0, -0x100000000, 0] * acc0
SUBQ AX, acc1 \
SBBQ DX, acc2 \
SBBQ AX, acc3 \
MOVQ acc0, AX \
SBBQ DX, acc0 \
\ // calculate the positive part: [1, 0, 0, 1] * acc0 + [0, acc3, acc2, acc1];
\ // since (-1) * acc0 + acc0 == 0, the lowest limb is zero and is dropped directly, with no carry.
ADDQ AX, acc1 \ // acc1' = L (acc0 + acc1)
ADCQ $0, acc2 \ // acc2' = acc2 + carry1
ADCQ $0, acc3 \ // acc3' = acc3 + carry2
ADCQ $0, acc0 \ // acc0' = acc0 + carry3
\ // Second reduction step
MOVQ acc1, AX \
MOVQ acc1, DX \
SHLQ $32, AX \
SHRQ $32, DX \
\
SUBQ AX, acc2 \
SBBQ DX, acc3 \
SBBQ AX, acc0 \
MOVQ acc1, AX \
SBBQ DX, acc1 \
\
ADDQ AX, acc2 \
ADCQ $0, acc3 \
ADCQ $0, acc0 \
ADCQ $0, acc1 \
\ // Third reduction step
MOVQ acc2, AX \
MOVQ acc2, DX \
SHLQ $32, AX \
SHRQ $32, DX \
\
SUBQ AX, acc3 \
SBBQ DX, acc0 \
SBBQ AX, acc1 \
MOVQ acc2, AX \
SBBQ DX, acc2 \
\
ADDQ AX, acc3 \
ADCQ $0, acc0 \
ADCQ $0, acc1 \
ADCQ $0, acc2 \
\ // Last reduction step
XORQ t0, t0 \
MOVQ acc3, AX \
MOVQ acc3, DX \
SHLQ $32, AX \
SHRQ $32, DX \
\
SUBQ AX, acc0 \
SBBQ DX, acc1 \
SBBQ AX, acc2 \
MOVQ acc3, AX \
SBBQ DX, acc3 \
\
ADDQ AX, acc0 \
ADCQ $0, acc1 \
ADCQ $0, acc2 \
ADCQ $0, acc3 \
\ // Add bits [511:256] of the square result (limbs 4..7, held in acc4, acc5, y_ptr, x_ptr)
ADCQ acc4, acc0 \
ADCQ acc5, acc1 \
ADCQ y_ptr, acc2 \
ADCQ x_ptr, acc3 \
ADCQ $0, t0
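/* ---------------------------------------*/
// p256PrimReduce conditionally subtracts p from the 5-limb value
// [a4, a3, a2, a1, a0] and stores the low four limbs at res. If the
// subtraction borrows, the value was already < p and CMOVQCS (move if carry
// set) restores the saved copy. A minimal Go sketch of the same pattern
// (hypothetical helper, not part of the package):
//
//   func condSubP(a [4]uint64, a4 uint64) [4]uint64 {
//       p := [4]uint64{0xffffffffffffffff, 0xffffffff00000000,
//           0xffffffffffffffff, 0xfffffffeffffffff}
//       var r [4]uint64
//       var b uint64
//       r[0], b = bits.Sub64(a[0], p[0], 0)
//       r[1], b = bits.Sub64(a[1], p[1], b)
//       r[2], b = bits.Sub64(a[2], p[2], b)
//       r[3], b = bits.Sub64(a[3], p[3], b)
//       _, b = bits.Sub64(a4, 0, b)
//       if b == 1 { // borrowed: a < p, keep the original
//           return a
//       }
//       return r
//   }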
#define p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \
MOVQ a0, b0 \
MOVQ a1, b1 \
MOVQ a2, b2 \
MOVQ a3, b3 \
\ // Subtract p256
SUBQ $-1, a0 \
SBBQ p256p<>+0x08(SB), a1 \
SBBQ $-1, a2 \
SBBQ p256p<>+0x18(SB), a3 \
SBBQ $0, a4 \
\
CMOVQCS b0, a0 \
CMOVQCS b1, a1 \
CMOVQCS b2, a2 \
CMOVQCS b3, a3 \
\
MOVQ a0, (8*0)(res) \
MOVQ a1, (8*1)(res) \
MOVQ a2, (8*2)(res) \
MOVQ a3, (8*3)(res)
/* ---------------------------------------*/
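// p256OrdReduceInline is the same conditional subtraction as p256PrimReduce,
// but against the group order n (p256ord) instead of the prime p; the fifth
// limb a4 again holds the carry from the preceding computation.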
#define p256OrdReduceInline(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \
\// Copy result [255:0]
MOVQ a0, b0 \
MOVQ a1, b1 \
MOVQ a2, b2 \
MOVQ a3, b3 \
\ // Subtract p256ord
SUBQ p256ord<>+0x00(SB), a0 \
SBBQ p256ord<>+0x08(SB), a1 \
SBBQ p256ord<>+0x10(SB), a2 \
SBBQ p256ord<>+0x18(SB), a3 \
SBBQ $0, a4 \
\
CMOVQCS b0, a0 \
CMOVQCS b1, a1 \
CMOVQCS b2, a2 \
CMOVQCS b3, a3 \
\
MOVQ a0, (8*0)(res) \
MOVQ a1, (8*1)(res) \
MOVQ a2, (8*2)(res) \
MOVQ a3, (8*3)(res)
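/* ---------------------------------------*/
// sm2P256SqrReductionInternal: the same four reduction steps as
// p256SqrMontReduce, with mul0/mul1 as scratch registers and the high half
// of the square held in [t3, t2, t1, t0]; it finishes with a full
// conditional subtraction of p, leaving the result in
// [acc7, acc6, acc5, acc4].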
#define sm2P256SqrReductionInternal() \
\ // First reduction step
MOVQ acc0, mul0 \
MOVQ acc0, mul1 \
SHLQ $32, mul0 \
SHRQ $32, mul1 \
\
SUBQ mul0, acc1 \
SBBQ mul1, acc2 \
SBBQ mul0, acc3 \
MOVQ acc0, mul0 \
SBBQ mul1, acc0 \
\
ADDQ mul0, acc1 \
ADCQ $0, acc2 \
ADCQ $0, acc3 \
ADCQ $0, acc0 \
\ // Second reduction step
MOVQ acc1, mul0 \
MOVQ acc1, mul1 \
SHLQ $32, mul0 \
SHRQ $32, mul1 \
\
SUBQ mul0, acc2 \
SBBQ mul1, acc3 \
SBBQ mul0, acc0 \
MOVQ acc1, mul0 \
SBBQ mul1, acc1 \
\
ADDQ mul0, acc2 \
ADCQ $0, acc3 \
ADCQ $0, acc0 \
ADCQ $0, acc1 \
\ // Third reduction step
MOVQ acc2, mul0 \
MOVQ acc2, mul1 \
SHLQ $32, mul0 \
SHRQ $32, mul1 \
\
SUBQ mul0, acc3 \
SBBQ mul1, acc0 \
SBBQ mul0, acc1 \
MOVQ acc2, mul0 \
SBBQ mul1, acc2 \
\
ADDQ mul0, acc3 \
ADCQ $0, acc0 \
ADCQ $0, acc1 \
ADCQ $0, acc2 \
\ // Last reduction step
MOVQ acc3, mul0 \
MOVQ acc3, mul1 \
SHLQ $32, mul0 \
SHRQ $32, mul1 \
\
SUBQ mul0, acc0 \
SBBQ mul1, acc1 \
SBBQ mul0, acc2 \
MOVQ acc3, mul0 \
SBBQ mul1, acc3 \
\
ADDQ mul0, acc0 \
ADCQ $0, acc1 \
ADCQ $0, acc2 \
ADCQ $0, acc3 \
MOVQ $0, mul0 \
\ // Add bits [511:256] of the result
ADCQ acc0, t0 \
ADCQ acc1, t1 \
ADCQ acc2, t2 \
ADCQ acc3, t3 \
ADCQ $0, mul0 \
\ // Copy result
MOVQ t0, acc4 \
MOVQ t1, acc5 \
MOVQ t2, acc6 \
MOVQ t3, acc7 \
\ // Subtract p256
SUBQ $-1, acc4 \
SBBQ p256p<>+0x08(SB), acc5 \
SBBQ $-1, acc6 \
SBBQ p256p<>+0x18(SB), acc7 \
SBBQ $0, mul0 \
\ // If the result of the subtraction is negative, restore the previous result
CMOVQCS t0, acc4 \
CMOVQCS t1, acc5 \
CMOVQCS t2, acc6 \
CMOVQCS t3, acc7
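/* ---------------------------------------*/
// sm2P256MulReductionInternal: the four reduction steps only. Unlike the
// Sqr variant above, it stops after the steps, leaving
// [acc3, acc2, acc1, acc0]; adding the high half of the product and the
// final conditional subtraction are left to the invoking code.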
#define sm2P256MulReductionInternal() \
\// First reduction step
MOVQ acc0, mul0 \
MOVQ acc0, mul1 \
SHLQ $32, mul0 \
SHRQ $32, mul1 \
\
SUBQ mul0, acc1 \
SBBQ mul1, acc2 \
SBBQ mul0, acc3 \
MOVQ acc0, mul0 \
SBBQ mul1, acc0 \
\
ADDQ mul0, acc1 \
ADCQ $0, acc2 \
ADCQ $0, acc3 \
ADCQ $0, acc0 \
\// Second reduction step
MOVQ acc1, mul0 \
MOVQ acc1, mul1 \
SHLQ $32, mul0 \
SHRQ $32, mul1 \
\
SUBQ mul0, acc2 \
SBBQ mul1, acc3 \
SBBQ mul0, acc0 \
MOVQ acc1, mul0 \
SBBQ mul1, acc1 \
\
ADDQ mul0, acc2 \
ADCQ $0, acc3 \
ADCQ $0, acc0 \
ADCQ $0, acc1 \
\// Third reduction step
MOVQ acc2, mul0 \
MOVQ acc2, mul1 \
SHLQ $32, mul0 \
SHRQ $32, mul1 \
\
SUBQ mul0, acc3 \
SBBQ mul1, acc0 \
SBBQ mul0, acc1 \
MOVQ acc2, mul0 \
SBBQ mul1, acc2 \
\
ADDQ mul0, acc3 \
ADCQ $0, acc0 \
ADCQ $0, acc1 \
ADCQ $0, acc2 \
\// Last reduction step
MOVQ acc3, mul0 \
MOVQ acc3, mul1 \
SHLQ $32, mul0 \
SHRQ $32, mul1 \
\
SUBQ mul0, acc0 \
SBBQ mul1, acc1 \
SBBQ mul0, acc2 \
MOVQ acc3, mul0 \
SBBQ mul1, acc3 \
\
ADDQ mul0, acc0 \
ADCQ $0, acc1 \
ADCQ $0, acc2 \
ADCQ $0, acc3
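/* ---------------------------------------*/
// p256PointDoubleInit copies the input point at BX into the local x/y/z
// slots; the x, y and z macros are assumed to be defined by the file that
// includes these macros.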
#define p256PointDoubleInit() \
MOVOU (16*0)(BX), X0 \
MOVOU (16*1)(BX), X1 \
MOVOU (16*2)(BX), X2 \
MOVOU (16*3)(BX), X3 \
MOVOU (16*4)(BX), X4 \
MOVOU (16*5)(BX), X5 \
\
MOVOU X0, x(16*0) \
MOVOU X1, x(16*1) \
MOVOU X2, y(16*0) \
MOVOU X3, y(16*1) \
MOVOU X4, z(16*0) \
MOVOU X5, z(16*1)
/* ---------------------------------------*/
// [t3, t2, t1, t0] = 2[acc7, acc6, acc5, acc4]
#define p256MulBy2Inline\
XORQ mul0, mul0;\
ADDQ acc4, acc4;\
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, t0;\
SBBQ p256p<>+0x08(SB), t1;\
SBBQ $-1, t2;\
SBBQ p256p<>+0x18(SB), t3;\
SBBQ $0, mul0;\
CMOVQCS acc4, t0;\
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
/* ---------------------------------------*/
// [acc7, acc6, acc5, acc4] = 2[acc7, acc6, acc5, acc4]
#define p256MulBy2Inline2\
XORQ mul0, mul0;\
ADDQ acc4, acc4;\
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, acc4;\
SBBQ p256p<>+0x08(SB), acc5;\
SBBQ $-1, acc6;\
SBBQ p256p<>+0x18(SB), acc7;\
SBBQ $0, mul0;\
CMOVQCS t0, acc4;\
CMOVQCS t1, acc5;\
CMOVQCS t2, acc6;\
CMOVQCS t3, acc7;
/* ---------------------------------------*/
// [t3, t2, t1, t0] = 3[acc7, acc6, acc5, acc4]
#define p256TripleInline\
XORQ mul0, mul0;\
MOVQ acc4, acc0;\
MOVQ acc5, acc1;\
MOVQ acc6, acc2;\
MOVQ acc7, acc3;\
ADDQ acc4, acc4;\
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, acc4;\
SBBQ p256p<>+0x08(SB), acc5;\
SBBQ $-1, acc6;\
SBBQ p256p<>+0x18(SB), acc7;\
SBBQ $0, mul0;\
CMOVQCS t0, acc4;\
CMOVQCS t1, acc5;\
CMOVQCS t2, acc6;\
CMOVQCS t3, acc7;\
XORQ mul0, mul0;\
ADDQ acc0, acc4;\
ADCQ acc1, acc5;\
ADCQ acc2, acc6;\
ADCQ acc3, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, t0;\
SBBQ p256p<>+0x08(SB), t1;\
SBBQ $-1, t2;\
SBBQ p256p<>+0x18(SB), t3;\
SBBQ $0, mul0;\
CMOVQCS acc4, t0;\
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
/* ---------------------------------------*/
// [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] + [t3, t2, t1, t0]
#define p256AddInline \
XORQ mul0, mul0;\
ADDQ t0, acc4;\
ADCQ t1, acc5;\
ADCQ t2, acc6;\
ADCQ t3, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, t0;\
SBBQ p256p<>+0x08(SB), t1;\
SBBQ $-1, t2;\
SBBQ p256p<>+0x18(SB), t3;\
SBBQ $0, mul0;\
CMOVQCS acc4, t0;\
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
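/* ---------------------------------------*/
// p256AddInline, p256MulBy2Inline/2 and p256TripleInline all follow the
// same pattern: a plain multi-precision addition with the carry collected
// in mul0, followed by the conditional subtraction of p. A minimal Go
// sketch of the whole pattern (hypothetical helper, reusing condSubP from
// the p256PrimReduce comment above):
//
//   func modAdd(x, y [4]uint64) [4]uint64 {
//       var s [4]uint64
//       var c uint64
//       s[0], c = bits.Add64(x[0], y[0], 0)
//       s[1], c = bits.Add64(x[1], y[1], c)
//       s[2], c = bits.Add64(x[2], y[2], c)
//       s[3], c = bits.Add64(x[3], y[3], c)
//       return condSubP(s, c) // c plays the role of mul0
//   }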