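// Mirror of https://github.com/emmansun/gmsm.git (synced 2025-04-26 20:26:19 +08:00).
// Listing metadata from the hosting page: 365 lines, 11 KiB, labelled "ArmAsm".
// The code below uses x86-64 registers and mnemonics in Go assembler syntax.
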
// Register aliases used by the macros in this file.
// x_ptr and y_ptr are named as pointers, but p256SqrMontReduce also reads them
// as data limbs (bits [511:384] of the square), so they are not reserved for
// addresses only.
#define res_ptr DI
#define x_ptr SI
#define y_ptr CX

// 64-bit accumulator limbs and one temporary.
#define acc0 R8
#define acc1 R9
#define acc2 R10
#define acc3 R11
#define acc4 R12
#define acc5 R13
#define t0 R14

// p256p: the modulus subtracted by the "Subtract p256" sequences below, stored
// as four 64-bit limbs, least significant limb first.
DATA p256p<>+0x00(SB)/8, $0xffffffffffffffff
DATA p256p<>+0x08(SB)/8, $0xffffffff00000000
DATA p256p<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256p<>+0x18(SB)/8, $0xfffffffeffffffff
// p256ordK0: single 64-bit constant paired with p256ord.
// NOTE(review): presumably the Montgomery constant -p256ord^-1 mod 2^64;
// it is not used in this file, confirm at the use site.
DATA p256ordK0<>+0x00(SB)/8, $0x327f9e8872350975
// p256ord: the modulus subtracted by p256OrdReduceInline (by its name, the
// group order), least significant limb first.
DATA p256ord<>+0x00(SB)/8, $0x53bbf40939d54123
DATA p256ord<>+0x08(SB)/8, $0x7203df6b21c6052b
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xfffffffeffffffff
// p256one: equals 2^256 - p256p (adding it to p256p limb by limb gives 2^256),
// i.e. the value 1 in Montgomery form with R = 2^256.
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0x00000000ffffffff
DATA p256one<>+0x10(SB)/8, $0x0000000000000000
DATA p256one<>+0x18(SB)/8, $0x0000000100000000
// The flag value 8 is RODATA in Go's textflag.h; the numeric form is kept so
// this file does not depend on that header being included.
GLOBL p256p<>(SB), 8, $32
GLOBL p256ordK0<>(SB), 8, $8
GLOBL p256ord<>(SB), 8, $32
GLOBL p256one<>(SB), 8, $32

// p256SqrMontReduce runs four word-wise reduction steps on the low half
// [acc3, acc2, acc1, acc0] of a square and then adds the high half
// [x_ptr, y_ptr, acc5, acc4].
//
// In:       acc0..acc3 = bits [255:0]; acc4, acc5, y_ptr, x_ptr = bits [511:256].
// Out:      [t0, acc3, acc2, acc1, acc0] = the sum, with t0 receiving the final carry.
//           No conditional subtraction of the modulus is performed here.
// Clobbers: AX, DX, flags.
#define p256SqrMontReduce() \
	\ // First reduction step. The modulus, written as signed coefficients of
	\ // 2^256, 2^192, 2^128, 2^64, 2^0, is [1, -0x100000000, 0, (1 - 0x100000000), -1].
	MOVQ acc0, AX \
	MOVQ acc0, DX \
	SHLQ $32, AX \ // AX = L(acc0 * 2^32), low part
	SHRQ $32, DX \ // DX = H(acc0 * 2^32), high part
	\ // Negative part: [0, -0x100000000, 0, -0x100000000] * acc0
	SUBQ AX, acc1 \
	SBBQ DX, acc2 \
	SBBQ AX, acc3 \
	MOVQ acc0, AX \ // keep acc0 for the positive part; MOVQ leaves the borrow flag intact
	SBBQ DX, acc0 \
	\ // Positive part: [1, 0, 0, 1] * acc0 + [0, acc3, acc2, acc1].
	\ // Since (-1) * acc0 + acc0 == 0, the lowest limb is dropped directly, no carry.
	ADDQ AX, acc1 \ // acc1' = L(acc0 + acc1)
	ADCQ $0, acc2 \ // acc2' = acc2 + carry1
	ADCQ $0, acc3 \ // acc3' = acc3 + carry2
	ADCQ $0, acc0 \ // acc0' = acc0 + carry3; acc0 is now the top limb
	\ // Second reduction step (eliminates acc1)
	MOVQ acc1, AX \
	MOVQ acc1, DX \
	SHLQ $32, AX \
	SHRQ $32, DX \
	\
	SUBQ AX, acc2 \
	SBBQ DX, acc3 \
	SBBQ AX, acc0 \
	MOVQ acc1, AX \
	SBBQ DX, acc1 \
	\
	ADDQ AX, acc2 \
	ADCQ $0, acc3 \
	ADCQ $0, acc0 \
	ADCQ $0, acc1 \
	\ // Third reduction step (eliminates acc2)
	MOVQ acc2, AX \
	MOVQ acc2, DX \
	SHLQ $32, AX \
	SHRQ $32, DX \
	\
	SUBQ AX, acc3 \
	SBBQ DX, acc0 \
	SBBQ AX, acc1 \
	MOVQ acc2, AX \
	SBBQ DX, acc2 \
	\
	ADDQ AX, acc3 \
	ADCQ $0, acc0 \
	ADCQ $0, acc1 \
	ADCQ $0, acc2 \
	\ // Last reduction step (eliminates acc3); limbs are back in acc0..acc3 order afterwards
	XORQ t0, t0 \ // t0 collects the final carry below
	MOVQ acc3, AX \
	MOVQ acc3, DX \
	SHLQ $32, AX \
	SHRQ $32, DX \
	\
	SUBQ AX, acc0 \
	SBBQ DX, acc1 \
	SBBQ AX, acc2 \
	MOVQ acc3, AX \
	SBBQ DX, acc3 \
	\
	ADDQ AX, acc0 \
	ADCQ $0, acc1 \
	ADCQ $0, acc2 \
	ADCQ $0, acc3 \
	\ // Add bits [511:256] of the sqr result.
	\ // The first ADCQ also consumes the carry flag left by the ADCQ above.
	ADCQ acc4, acc0 \
	ADCQ acc5, acc1 \
	ADCQ y_ptr, acc2 \
	ADCQ x_ptr, acc3 \
	ADCQ $0, t0

// p256PrimReduce conditionally subtracts p256p from the five-limb value
// [a4, a3, a2, a1, a0] and stores the resulting four limbs at res.
//
// a0..a3:   value limbs, least significant first; on exit they hold the stored result.
// a4:       top limb; overwritten by the borrow propagation.
// b0..b3:   scratch registers, receive a copy of the incoming a0..a3.
// res:      base register of the 32-byte destination.
// Clobbers: flags.
#define p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \
	\ // Keep the unreduced limbs so they can be restored on borrow
	MOVQ a0, b0 \
	MOVQ a1, b1 \
	MOVQ a2, b2 \
	MOVQ a3, b3 \
	\ // Subtract p256; $-1 is the all-ones limb (p256p limbs 0 and 2)
	SUBQ $-1, a0 \
	SBBQ p256p<>+0x08(SB), a1 \
	SBBQ $-1, a2 \
	SBBQ p256p<>+0x18(SB), a3 \
	SBBQ $0, a4 \
	\ // A borrow out of a4 means the value was below p256: restore the copy
	CMOVQCS b0, a0 \
	CMOVQCS b1, a1 \
	CMOVQCS b2, a2 \
	CMOVQCS b3, a3 \
	\ // Store the result
	MOVQ a0, (8*0)(res) \
	MOVQ a1, (8*1)(res) \
	MOVQ a2, (8*2)(res) \
	MOVQ a3, (8*3)(res)

/* ---------------------------------------*/
// p256OrdReduceInline conditionally subtracts p256ord from the five-limb value
// [a4, a3, a2, a1, a0] and stores the resulting four limbs at res.
//
// a0..a3:   value limbs, least significant first; on exit they hold the stored result.
// a4:       top limb; overwritten by the borrow propagation.
// b0..b3:   scratch registers, receive a copy of the incoming a0..a3.
// res:      base register of the 32-byte destination.
// Clobbers: flags.
#define p256OrdReduceInline(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \
	\ // Copy result [255:0]
	MOVQ a0, b0 \
	MOVQ a1, b1 \
	MOVQ a2, b2 \
	MOVQ a3, b3 \
	\ // Subtract the order (p256ord)
	SUBQ p256ord<>+0x00(SB), a0 \
	SBBQ p256ord<>+0x08(SB), a1 \
	SBBQ p256ord<>+0x10(SB), a2 \
	SBBQ p256ord<>+0x18(SB), a3 \
	SBBQ $0, a4 \
	\ // A borrow out of a4 means the value was below p256ord: restore the copy
	CMOVQCS b0, a0 \
	CMOVQCS b1, a1 \
	CMOVQCS b2, a2 \
	CMOVQCS b3, a3 \
	\ // Store the result
	MOVQ a0, (8*0)(res) \
	MOVQ a1, (8*1)(res) \
	MOVQ a2, (8*2)(res) \
	MOVQ a3, (8*3)(res)

// sm2P256SqrReductionInternal reduces a 512-bit value held in registers.
//
// In:       [acc3, acc2, acc1, acc0] = bits [255:0]; [t3, t2, t1, t0] = bits [511:256].
// Out:      [acc7, acc6, acc5, acc4] = result after one conditional subtraction of p256p.
//           [t3, t2, t1, t0] keep the sum from before that subtraction.
// Clobbers: acc0..acc3, mul0, mul1, flags.
// mul0, mul1, t1..t3, acc6 and acc7 are not defined in this file; the file that
// uses this macro has to define them.
#define sm2P256SqrReductionInternal() \
	\ // First reduction step (eliminates acc0)
	MOVQ acc0, mul0 \
	MOVQ acc0, mul1 \
	SHLQ $32, mul0 \ // mul0 = low 64 bits of acc0 * 2^32
	SHRQ $32, mul1 \ // mul1 = high 64 bits of acc0 * 2^32
	\
	SUBQ mul0, acc1 \
	SBBQ mul1, acc2 \
	SBBQ mul0, acc3 \
	MOVQ acc0, mul0 \ // MOVQ leaves the borrow flag intact for the SBBQ below
	SBBQ mul1, acc0 \
	\
	ADDQ mul0, acc1 \
	ADCQ $0, acc2 \
	ADCQ $0, acc3 \
	ADCQ $0, acc0 \
	\ // Second reduction step (eliminates acc1)
	MOVQ acc1, mul0 \
	MOVQ acc1, mul1 \
	SHLQ $32, mul0 \
	SHRQ $32, mul1 \
	\
	SUBQ mul0, acc2 \
	SBBQ mul1, acc3 \
	SBBQ mul0, acc0 \
	MOVQ acc1, mul0 \
	SBBQ mul1, acc1 \
	\
	ADDQ mul0, acc2 \
	ADCQ $0, acc3 \
	ADCQ $0, acc0 \
	ADCQ $0, acc1 \
	\ // Third reduction step (eliminates acc2)
	MOVQ acc2, mul0 \
	MOVQ acc2, mul1 \
	SHLQ $32, mul0 \
	SHRQ $32, mul1 \
	\
	SUBQ mul0, acc3 \
	SBBQ mul1, acc0 \
	SBBQ mul0, acc1 \
	MOVQ acc2, mul0 \
	SBBQ mul1, acc2 \
	\
	ADDQ mul0, acc3 \
	ADCQ $0, acc0 \
	ADCQ $0, acc1 \
	ADCQ $0, acc2 \
	\ // Last reduction step (eliminates acc3)
	MOVQ acc3, mul0 \
	MOVQ acc3, mul1 \
	SHLQ $32, mul0 \
	SHRQ $32, mul1 \
	\
	SUBQ mul0, acc0 \
	SBBQ mul1, acc1 \
	SBBQ mul0, acc2 \
	MOVQ acc3, mul0 \
	SBBQ mul1, acc3 \
	\
	ADDQ mul0, acc0 \
	ADCQ $0, acc1 \
	ADCQ $0, acc2 \
	ADCQ $0, acc3 \
	MOVQ $0, mul0 \ // zero via MOVQ, not XORQ: the carry flag must survive for the ADCQ chain
	\ // Add bits [511:256] of the result
	ADCQ acc0, t0 \
	ADCQ acc1, t1 \
	ADCQ acc2, t2 \
	ADCQ acc3, t3 \
	ADCQ $0, mul0 \ // mul0 = carry out of the 256-bit sum
	\ // Copy result
	MOVQ t0, acc4 \
	MOVQ t1, acc5 \
	MOVQ t2, acc6 \
	MOVQ t3, acc7 \
	\ // Subtract p256; $-1 is the all-ones limb (p256p limbs 0 and 2)
	SUBQ $-1, acc4 \
	SBBQ p256p<>+0x08(SB), acc5 \
	SBBQ $-1, acc6 \
	SBBQ p256p<>+0x18(SB), acc7 \
	SBBQ $0, mul0 \
	\ // If the result of the subtraction is negative, restore the previous result
	CMOVQCS t0, acc4 \
	CMOVQCS t1, acc5 \
	CMOVQCS t2, acc6 \
	CMOVQCS t3, acc7

// sm2P256MulReductionInternal runs four word-wise reduction steps on
// [acc3, acc2, acc1, acc0]. Each step eliminates the current lowest limb and
// reuses its register as the new top limb, so after four steps the limbs are
// back in acc0..acc3 order.
//
// In/Out:   [acc3, acc2, acc1, acc0], least significant limb in acc0.
//           The high half is not added and no conditional subtraction is done here.
// Clobbers: mul0, mul1, flags.
// mul0 and mul1 are not defined in this file; the file that uses this macro
// has to define them.
#define sm2P256MulReductionInternal() \
	\ // First reduction step (eliminates acc0)
	MOVQ acc0, mul0 \
	MOVQ acc0, mul1 \
	SHLQ $32, mul0 \ // mul0 = low 64 bits of acc0 * 2^32
	SHRQ $32, mul1 \ // mul1 = high 64 bits of acc0 * 2^32
	\
	SUBQ mul0, acc1 \
	SBBQ mul1, acc2 \
	SBBQ mul0, acc3 \
	MOVQ acc0, mul0 \ // MOVQ leaves the borrow flag intact for the SBBQ below
	SBBQ mul1, acc0 \
	\
	ADDQ mul0, acc1 \
	ADCQ $0, acc2 \
	ADCQ $0, acc3 \
	ADCQ $0, acc0 \
	\ // Second reduction step (eliminates acc1)
	MOVQ acc1, mul0 \
	MOVQ acc1, mul1 \
	SHLQ $32, mul0 \
	SHRQ $32, mul1 \
	\
	SUBQ mul0, acc2 \
	SBBQ mul1, acc3 \
	SBBQ mul0, acc0 \
	MOVQ acc1, mul0 \
	SBBQ mul1, acc1 \
	\
	ADDQ mul0, acc2 \
	ADCQ $0, acc3 \
	ADCQ $0, acc0 \
	ADCQ $0, acc1 \
	\ // Third reduction step (eliminates acc2)
	MOVQ acc2, mul0 \
	MOVQ acc2, mul1 \
	SHLQ $32, mul0 \
	SHRQ $32, mul1 \
	\
	SUBQ mul0, acc3 \
	SBBQ mul1, acc0 \
	SBBQ mul0, acc1 \
	MOVQ acc2, mul0 \
	SBBQ mul1, acc2 \
	\
	ADDQ mul0, acc3 \
	ADCQ $0, acc0 \
	ADCQ $0, acc1 \
	ADCQ $0, acc2 \
	\ // Last reduction step (eliminates acc3)
	MOVQ acc3, mul0 \
	MOVQ acc3, mul1 \
	SHLQ $32, mul0 \
	SHRQ $32, mul1 \
	\
	SUBQ mul0, acc0 \
	SBBQ mul1, acc1 \
	SBBQ mul0, acc2 \
	MOVQ acc3, mul0 \
	SBBQ mul1, acc3 \
	\
	ADDQ mul0, acc0 \
	ADCQ $0, acc1 \
	ADCQ $0, acc2 \
	ADCQ $0, acc3

// p256PointDoubleInit copies 96 bytes starting at (BX) into the x, y and z
// slots, 32 bytes each, using unaligned 16-byte moves.
//
// In:       BX = address of the 96-byte source.
// Out:      x(0..31), y(0..31), z(0..31) filled from source bytes 0..31, 32..63, 64..95.
// Clobbers: X0..X5.
// x(), y() and z() are not defined in this file; the file that uses this macro
// has to define them.
#define p256PointDoubleInit() \
	\ // Load all six 16-byte chunks first
	MOVOU (16*0)(BX), X0 \
	MOVOU (16*1)(BX), X1 \
	MOVOU (16*2)(BX), X2 \
	MOVOU (16*3)(BX), X3 \
	MOVOU (16*4)(BX), X4 \
	MOVOU (16*5)(BX), X5 \
	\ // Then store them: two chunks per coordinate slot
	MOVOU X0, x(16*0) \
	MOVOU X1, x(16*1) \
	MOVOU X2, y(16*0) \
	MOVOU X3, y(16*1) \
	MOVOU X4, z(16*0) \
	MOVOU X5, z(16*1)

/* ---------------------------------------*/
// [t3, t2, t1, t0] = 2[acc7, acc6, acc5, acc4]
//
// The doubling is done in place, so on exit [acc7, acc6, acc5, acc4] hold the
// doubled value modulo 2^256 (not reduced). p256p is then subtracted from a
// copy in t0..t3; if that borrows, the unreduced doubled limbs are selected
// instead.
// Clobbers: acc4..acc7, mul0 (carry/borrow limb), flags.
// mul0, t1..t3, acc6 and acc7 are not defined in this file; the file that uses
// this macro has to define them.
#define p256MulBy2Inline \
	XORQ mul0, mul0;\
	ADDQ acc4, acc4;\
	ADCQ acc5, acc5;\
	ADCQ acc6, acc6;\
	ADCQ acc7, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, t0;\
	SBBQ p256p<>+0x08(SB), t1;\
	SBBQ $-1, t2;\
	SBBQ p256p<>+0x18(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;

/* ---------------------------------------*/
// [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] + [t3, t2, t1, t0]
//
// The sum is accumulated into acc4..acc7, so on exit they hold the sum modulo
// 2^256 (not reduced). p256p is then subtracted from a copy in t0..t3; if that
// borrows, the unreduced sum limbs are selected instead.
// Clobbers: acc4..acc7, mul0 (carry/borrow limb), flags.
// mul0, t1..t3, acc6 and acc7 are not defined in this file; the file that uses
// this macro has to define them.
#define p256AddInline \
	XORQ mul0, mul0;\
	ADDQ t0, acc4;\
	ADCQ t1, acc5;\
	ADCQ t2, acc6;\
	ADCQ t3, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, t0;\
	SBBQ p256p<>+0x08(SB), t1;\
	SBBQ $-1, t2;\
	SBBQ p256p<>+0x18(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;
