diff --git a/internal/sm2ec/p256_asm_amd64.s b/internal/sm2ec/p256_asm_amd64.s index e078cf7..c63c133 100644 --- a/internal/sm2ec/p256_asm_amd64.s +++ b/internal/sm2ec/p256_asm_amd64.s @@ -8,291 +8,9 @@ // +build amd64,!purego,!plugin #include "textflag.h" - -#define res_ptr DI -#define x_ptr SI -#define y_ptr CX - -#define acc0 R8 -#define acc1 R9 -#define acc2 R10 -#define acc3 R11 -#define acc4 R12 -#define acc5 R13 -#define t0 R14 +#include "p256_macros_amd64.s" #define t1 R15 -DATA p256p<>+0x00(SB)/8, $0xffffffffffffffff -DATA p256p<>+0x08(SB)/8, $0xffffffff00000000 -DATA p256p<>+0x10(SB)/8, $0xffffffffffffffff -DATA p256p<>+0x18(SB)/8, $0xfffffffeffffffff -DATA p256ordK0<>+0x00(SB)/8, $0x327f9e8872350975 -DATA p256ord<>+0x00(SB)/8, $0x53bbf40939d54123 -DATA p256ord<>+0x08(SB)/8, $0x7203df6b21c6052b -DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff -DATA p256ord<>+0x18(SB)/8, $0xfffffffeffffffff -DATA p256one<>+0x00(SB)/8, $0x0000000000000001 -DATA p256one<>+0x08(SB)/8, $0x00000000ffffffff -DATA p256one<>+0x10(SB)/8, $0x0000000000000000 -DATA p256one<>+0x18(SB)/8, $0x0000000100000000 -GLOBL p256p<>(SB), RODATA, $32 -GLOBL p256ordK0<>(SB), RODATA, $8 -GLOBL p256ord<>(SB), RODATA, $32 -GLOBL p256one<>(SB), RODATA, $32 - -/* ---------------------------------------*/ -// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement) -TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0 - JMP ·p256BigToLittle(SB) -/* ---------------------------------------*/ -// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte) -TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0 - JMP ·p256BigToLittle(SB) -/* ---------------------------------------*/ -// func p256LittleToBig(res *[32]byte, in *p256Element) -TEXT ·p256LittleToBig(SB),NOSPLIT,$0 - JMP ·p256BigToLittle(SB) -/* ---------------------------------------*/ -// func p256BigToLittle(res *p256Element, in *[32]byte) -TEXT ·p256BigToLittle(SB),NOSPLIT,$0 - MOVQ res+0(FP), res_ptr - MOVQ in+8(FP), x_ptr - - MOVQ (8*0)(x_ptr), acc0 - MOVQ (8*1)(x_ptr), acc1 - MOVQ (8*2)(x_ptr), acc2 - MOVQ (8*3)(x_ptr), acc3 - - BSWAPQ acc0 - BSWAPQ acc1 - BSWAPQ acc2 - BSWAPQ acc3 - - MOVQ acc3, (8*0)(res_ptr) - MOVQ acc2, (8*1)(res_ptr) - MOVQ acc1, (8*2)(res_ptr) - MOVQ acc0, (8*3)(res_ptr) - - RET -/* ---------------------------------------*/ -// func p256MovCond(res, a, b *SM2P256Point, cond int) -TEXT ·p256MovCond(SB),NOSPLIT,$0 - MOVQ res+0(FP), res_ptr - MOVQ a+8(FP), x_ptr - MOVQ b+16(FP), y_ptr - MOVQ cond+24(FP), X12 - - CMPB ·supportAVX2+0(SB), $0x01 - JEQ move_avx2 - - PXOR X13, X13 - PSHUFD $0, X12, X12 - PCMPEQL X13, X12 - - MOVOU X12, X0 - MOVOU (16*0)(x_ptr), X6 - PANDN X6, X0 - - MOVOU X12, X1 - MOVOU (16*1)(x_ptr), X7 - PANDN X7, X1 - - MOVOU X12, X2 - MOVOU (16*2)(x_ptr), X8 - PANDN X8, X2 - - MOVOU X12, X3 - MOVOU (16*3)(x_ptr), X9 - PANDN X9, X3 - - MOVOU X12, X4 - MOVOU (16*4)(x_ptr), X10 - PANDN X10, X4 - - MOVOU X12, X5 - MOVOU (16*5)(x_ptr), X11 - PANDN X11, X5 - - MOVOU (16*0)(y_ptr), X6 - MOVOU (16*1)(y_ptr), X7 - MOVOU (16*2)(y_ptr), X8 - MOVOU (16*3)(y_ptr), X9 - MOVOU (16*4)(y_ptr), X10 - MOVOU (16*5)(y_ptr), X11 - - PAND X12, X6 - PAND X12, X7 - PAND X12, X8 - PAND X12, X9 - PAND X12, X10 - PAND X12, X11 - - PXOR X6, X0 - PXOR X7, X1 - PXOR X8, X2 - PXOR X9, X3 - PXOR X10, X4 - PXOR X11, X5 - - MOVOU X0, (16*0)(res_ptr) - MOVOU X1, (16*1)(res_ptr) - MOVOU X2, (16*2)(res_ptr) - MOVOU X3, (16*3)(res_ptr) - MOVOU X4, (16*4)(res_ptr) - MOVOU X5, (16*5)(res_ptr) - - RET - -move_avx2: - VPXOR Y13, Y13, Y13 - VPBROADCASTD X12, Y12 - VPCMPEQD Y13, Y12, Y12 - - VPANDN 
(32*0)(x_ptr), Y12, Y0 - VPANDN (32*1)(x_ptr), Y12, Y1 - VPANDN (32*2)(x_ptr), Y12, Y2 - - VPAND (32*0)(y_ptr), Y12, Y3 - VPAND (32*1)(y_ptr), Y12, Y4 - VPAND (32*2)(y_ptr), Y12, Y5 - - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y2, Y2 - - VMOVDQU Y0, (32*0)(res_ptr) - VMOVDQU Y1, (32*1)(res_ptr) - VMOVDQU Y2, (32*2)(res_ptr) - - VZEROUPPER - RET - -/* ---------------------------------------*/ -// func p256NegCond(val *p256Element, cond int) -TEXT ·p256NegCond(SB),NOSPLIT,$0 - MOVQ val+0(FP), res_ptr - MOVQ cond+8(FP), t0 - // acc = poly - MOVQ $-1, acc0 - MOVQ p256p<>+0x08(SB), acc1 - MOVQ $-1, acc2 - MOVQ p256p<>+0x18(SB), acc3 - // Load the original value - MOVQ (8*0)(res_ptr), acc5 - MOVQ (8*1)(res_ptr), x_ptr - MOVQ (8*2)(res_ptr), y_ptr - MOVQ (8*3)(res_ptr), t1 - // Speculatively subtract - SUBQ acc5, acc0 - SBBQ x_ptr, acc1 - SBBQ y_ptr, acc2 - SBBQ t1, acc3 - // If condition is 0, keep original value - TESTQ t0, t0 - CMOVQEQ acc5, acc0 - CMOVQEQ x_ptr, acc1 - CMOVQEQ y_ptr, acc2 - CMOVQEQ t1, acc3 - // Store result - MOVQ acc0, (8*0)(res_ptr) - MOVQ acc1, (8*1)(res_ptr) - MOVQ acc2, (8*2)(res_ptr) - MOVQ acc3, (8*3)(res_ptr) - - RET - -#define p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \ - MOVQ a0, b0 \ - MOVQ a1, b1 \ - MOVQ a2, b2 \ - MOVQ a3, b3 \ - \ // Subtract p256 - SUBQ $-1, a0 \ - SBBQ p256p<>+0x08(SB), a1 \ - SBBQ $-1, a2 \ - SBBQ p256p<>+0x018(SB), a3 \ - SBBQ $0, a4 \ - \ - CMOVQCS b0, a0 \ - CMOVQCS b1, a1 \ - CMOVQCS b2, a2 \ - CMOVQCS b3, a3 \ - \ - MOVQ a0, (8*0)(res) \ - MOVQ a1, (8*1)(res) \ - MOVQ a2, (8*2)(res) \ - MOVQ a3, (8*3)(res) \ - -#define p256SqrMontReduce() \ - \ // First reduction step, [p3, p2, p1, p0] = [1, -0x100000000, 0, (1 - 0x100000000), -1] - MOVQ acc0, AX \ - MOVQ acc0, DX \ - SHLQ $32, AX \ // AX = L(acc0 * 2^32), low part - SHRQ $32, DX \ // DX = H(acc0 * 2^32), high part - \ // calculate the positive part first: [1, 0, 0, 1] * acc0 + [0, acc3, acc2, acc1], - \ // due to (-1) * acc0 + acc0 == 0, so last lowest lamb 0 is dropped directly, no carry. 
- ADDQ acc0, acc1 \ // acc1' = L (acc0 + acc1) - ADCQ $0, acc2 \ // acc2' = acc2 + carry1 - ADCQ $0, acc3 \ // acc3' = acc3 + carry2 - ADCQ $0, acc0 \ // acc0' = acc0 + carry3 - \// calculate the negative part: [0, -0x100000000, 0, -0x100000000] * acc0 - SUBQ AX, acc1 \ - SBBQ DX, acc2 \ - SBBQ AX, acc3 \ - SBBQ DX, acc0 \ - \ // Second reduction step - MOVQ acc1, AX \ - MOVQ acc1, DX \ - SHLQ $32, AX \ - SHRQ $32, DX \ - \ - ADDQ acc1, acc2 \ - ADCQ $0, acc3 \ - ADCQ $0, acc0 \ - ADCQ $0, acc1 \ - \ - SUBQ AX, acc2 \ - SBBQ DX, acc3 \ - SBBQ AX, acc0 \ - SBBQ DX, acc1 \ - \ // Third reduction step - MOVQ acc2, AX \ - MOVQ acc2, DX \ - SHLQ $32, AX \ - SHRQ $32, DX \ - \ - ADDQ acc2, acc3 \ - ADCQ $0, acc0 \ - ADCQ $0, acc1 \ - ADCQ $0, acc2 \ - \ - SUBQ AX, acc3 \ - SBBQ DX, acc0 \ - SBBQ AX, acc1 \ - SBBQ DX, acc2 \ - \ // Last reduction step - XORQ t0, t0 \ - MOVQ acc3, AX \ - MOVQ acc3, DX \ - SHLQ $32, AX \ - SHRQ $32, DX \ - \ - ADDQ acc3, acc0 \ - ADCQ $0, acc1 \ - ADCQ $0, acc2 \ - ADCQ $0, acc3 \ - \ - SUBQ AX, acc0 \ - SBBQ DX, acc1 \ - SBBQ AX, acc2 \ - SBBQ DX, acc3 \ - \ // Add bits [511:256] of the sqr result - ADCQ acc4, acc0 \ - ADCQ acc5, acc1 \ - ADCQ y_ptr, acc2 \ - ADCQ x_ptr, acc3 \ - ADCQ $0, t0 \ - /* ---------------------------------------*/ // func p256Sqr(res, in *p256Element, n int) TEXT ·p256Sqr(SB),NOSPLIT,$0 @@ -460,1181 +178,6 @@ sqrBMI2: JNE sqrBMI2 RET -/* ---------------------------------------*/ -// func p256Mul(res, in1, in2 *p256Element) -TEXT ·p256Mul(SB),NOSPLIT,$0 - MOVQ res+0(FP), res_ptr - MOVQ in1+8(FP), x_ptr - MOVQ in2+16(FP), y_ptr - - CMPB ·supportBMI2+0(SB), $0x01 - JEQ mulBMI2 - - // x * y[0] - MOVQ (8*0)(y_ptr), t0 - - MOVQ (8*0)(x_ptr), AX - MULQ t0 - MOVQ AX, acc0 - MOVQ DX, acc1 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, acc2 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, acc3 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, acc4 - XORQ acc5, acc5 - // First reduction step - MOVQ acc0, AX - MOVQ acc0, DX - SHLQ $32, AX - SHRQ $32, DX - - ADDQ acc0, acc1 - ADCQ $0, acc2 - ADCQ $0, acc3 - ADCQ acc0, acc4 - ADCQ $0, acc5 - - SUBQ AX, acc1 - SBBQ DX, acc2 - SBBQ AX, acc3 - SBBQ DX, acc4 - SBBQ $0, acc5 - XORQ acc0, acc0 - - // x * y[1] - MOVQ (8*1)(y_ptr), t0 - - MOVQ (8*0)(x_ptr), AX - MULQ t0 - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - ADDQ t1, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ DX, acc5 - ADCQ $0, acc0 - // Second reduction step - MOVQ acc1, AX - MOVQ acc1, DX - SHLQ $32, AX - SHRQ $32, DX - - ADDQ acc1, acc2 - ADCQ $0, acc3 - ADCQ $0, acc4 - ADCQ acc1, acc5 - ADCQ $0, acc0 - - SUBQ AX, acc2 - SBBQ DX, acc3 - SBBQ AX, acc4 - SBBQ DX, acc5 - SBBQ $0, acc0 - XORQ acc1, acc1 - - // x * y[2] - MOVQ (8*2)(y_ptr), t0 - - MOVQ (8*0)(x_ptr), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ DX, acc0 - ADCQ $0, acc1 - // Third reduction step - MOVQ acc2, AX - MOVQ acc2, DX - SHLQ $32, AX 
- SHRQ $32, DX - - ADDQ acc2, acc3 - ADCQ $0, acc4 - ADCQ $0, acc5 - ADCQ acc2, acc0 - ADCQ $0, acc1 - - SUBQ AX, acc3 - SBBQ DX, acc4 - SBBQ AX, acc5 - SBBQ DX, acc0 - SBBQ $0, acc1 - XORQ acc2, acc2 - // x * y[3] - MOVQ (8*3)(y_ptr), t0 - - MOVQ (8*0)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ t1, acc0 - ADCQ $0, DX - ADDQ AX, acc0 - ADCQ DX, acc1 - ADCQ $0, acc2 - // Last reduction step - MOVQ acc3, AX - MOVQ acc3, DX - SHLQ $32, AX - SHRQ $32, DX - - ADDQ acc3, acc4 - ADCQ $0, acc5 - ADCQ $0, acc0 - ADCQ acc3, acc1 - ADCQ $0, acc2 - - SUBQ AX, acc4 - SBBQ DX, acc5 - SBBQ AX, acc0 - SBBQ DX, acc1 - SBBQ $0, acc2 - p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, t1, res_ptr) - RET - -mulBMI2: - // x * y[0] - MOVQ (8*0)(y_ptr), DX - MULXQ (8*0)(x_ptr), acc0, acc1 - - MULXQ (8*1)(x_ptr), AX, acc2 - ADDQ AX, acc1 - - MULXQ (8*2)(x_ptr), AX, acc3 - ADCQ AX, acc2 - - MULXQ (8*3)(x_ptr), AX, acc4 - ADCQ AX, acc3 - ADCQ $0, acc4 - - XORQ acc5, acc5 - // First reduction step - MOVQ acc0, AX - MOVQ acc0, DX - SHLQ $32, AX - SHRQ $32, DX - - ADDQ acc0, acc1 - ADCQ $0, acc2 - ADCQ $0, acc3 - ADCQ acc0, acc4 - ADCQ $0, acc5 - - SUBQ AX, acc1 - SBBQ DX, acc2 - SBBQ AX, acc3 - SBBQ DX, acc4 - SBBQ $0, acc5 - XORQ acc0, acc0 - - // x * y[1] - MOVQ (8*1)(y_ptr), DX - MULXQ (8*0)(x_ptr), AX, t1 - ADDQ AX, acc1 - ADCQ t1, acc2 - - MULXQ (8*1)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc2 - ADCQ t1, acc3 - - MULXQ (8*2)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc3 - ADCQ t1, acc4 - - MULXQ (8*3)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc4 - ADCQ t1, acc5 - ADCQ $0, acc0 - - // Second reduction step - MOVQ acc1, AX - MOVQ acc1, DX - SHLQ $32, AX - SHRQ $32, DX - - ADDQ acc1, acc2 - ADCQ $0, acc3 - ADCQ $0, acc4 - ADCQ acc1, acc5 - ADCQ $0, acc0 - - SUBQ AX, acc2 - SBBQ DX, acc3 - SBBQ AX, acc4 - SBBQ DX, acc5 - SBBQ $0, acc0 - XORQ acc1, acc1 - - // x * y[2] - MOVQ (8*2)(y_ptr), DX - - MULXQ (8*0)(x_ptr), AX, t1 - ADDQ AX, acc2 - ADCQ t1, acc3 - - MULXQ (8*1)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc3 - ADCQ t1, acc4 - - MULXQ (8*2)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc4 - ADCQ t1, acc5 - - MULXQ (8*3)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc5 - ADCQ t1, acc0 - ADCQ $0, acc1 - // Third reduction step - MOVQ acc2, AX - MOVQ acc2, DX - SHLQ $32, AX - SHRQ $32, DX - - ADDQ acc2, acc3 - ADCQ $0, acc4 - ADCQ $0, acc5 - ADCQ acc2, acc0 - ADCQ $0, acc1 - - SUBQ AX, acc3 - SBBQ DX, acc4 - SBBQ AX, acc5 - SBBQ DX, acc0 - SBBQ $0, acc1 - XORQ acc2, acc2 - // x * y[3] - MOVQ (8*3)(y_ptr), DX - - MULXQ (8*0)(x_ptr), AX, t1 - ADDQ AX, acc3 - ADCQ t1, acc4 - - MULXQ (8*1)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc4 - ADCQ t1, acc5 - - MULXQ (8*2)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc5 - ADCQ t1, acc0 - - MULXQ (8*3)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc0 - ADCQ t1, acc1 - ADCQ $0, acc2 - // Last reduction step - MOVQ acc3, AX - MOVQ acc3, DX - SHLQ $32, AX - SHRQ $32, DX - - ADDQ acc3, acc4 - ADCQ $0, acc5 - ADCQ $0, acc0 - ADCQ acc3, acc1 - ADCQ $0, acc2 - - SUBQ AX, acc4 - SBBQ DX, acc5 - SBBQ AX, acc0 - SBBQ DX, acc1 - SBBQ $0, acc2 - p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, t1, res_ptr) - RET - -/* ---------------------------------------*/ -// func p256FromMont(res, in 
*p256Element) -TEXT ·p256FromMont(SB),NOSPLIT,$0 - MOVQ res+0(FP), res_ptr - MOVQ in+8(FP), x_ptr - - MOVQ (8*0)(x_ptr), acc0 - MOVQ (8*1)(x_ptr), acc1 - MOVQ (8*2)(x_ptr), acc2 - MOVQ (8*3)(x_ptr), acc3 - XORQ acc4, acc4 - - // Only reduce, no multiplications are needed - // First stage - MOVQ acc0, AX - MOVQ acc0, DX - SHLQ $32, AX - SHRQ $32, DX - - ADDQ acc0, acc1 - ADCQ $0, acc2 - ADCQ $0, acc3 - ADCQ acc0, acc4 - - SUBQ AX, acc1 - SBBQ DX, acc2 - SBBQ AX, acc3 - SBBQ DX, acc4 - XORQ acc5, acc5 - - // Second stage - MOVQ acc1, AX - MOVQ acc1, DX - SHLQ $32, AX - SHRQ $32, DX - - ADDQ acc1, acc2 - ADCQ $0, acc3 - ADCQ $0, acc4 - ADCQ acc1, acc5 - - SUBQ AX, acc2 - SBBQ DX, acc3 - SBBQ AX, acc4 - SBBQ DX, acc5 - XORQ acc0, acc0 - // Third stage - MOVQ acc2, AX - MOVQ acc2, DX - SHLQ $32, AX - SHRQ $32, DX - - ADDQ acc2, acc3 - ADCQ $0, acc4 - ADCQ $0, acc5 - ADCQ acc2, acc0 - - SUBQ AX, acc3 - SBBQ DX, acc4 - SBBQ AX, acc5 - SBBQ DX, acc0 - XORQ acc1, acc1 - // Last stage - MOVQ acc3, AX - MOVQ acc3, DX - SHLQ $32, AX - SHRQ $32, DX - - ADDQ acc3, acc4 - ADCQ $0, acc5 - ADCQ $0, acc0 - ADCQ acc3, acc1 - - SUBQ AX, acc4 - SBBQ DX, acc5 - SBBQ AX, acc0 - SBBQ DX, acc1 - - MOVQ acc4, x_ptr - MOVQ acc5, acc3 - MOVQ acc0, t0 - MOVQ acc1, t1 - - SUBQ $-1, acc4 - SBBQ p256p<>+0x08(SB), acc5 - SBBQ $-1, acc0 - SBBQ p256p<>+0x018(SB), acc1 - - CMOVQCS x_ptr, acc4 - CMOVQCS acc3, acc5 - CMOVQCS t0, acc0 - CMOVQCS t1, acc1 - - MOVQ acc4, (8*0)(res_ptr) - MOVQ acc5, (8*1)(res_ptr) - MOVQ acc0, (8*2)(res_ptr) - MOVQ acc1, (8*3)(res_ptr) - - RET -/* ---------------------------------------*/ -// func p256Select(res *SM2P256Point, table *p256Table, idx, limit int) -TEXT ·p256Select(SB),NOSPLIT,$0 - //MOVQ idx+16(FP),AX - MOVQ table+8(FP),DI - MOVQ res+0(FP),DX - - CMPB ·supportAVX2+0(SB), $0x01 - JEQ select_avx2 - - PXOR X15, X15 // X15 = 0 - PCMPEQL X14, X14 // X14 = -1 - PSUBL X14, X15 // X15 = 1 - MOVL idx+16(FP), X14 - PSHUFD $0, X14, X14 - - PXOR X0, X0 - PXOR X1, X1 - PXOR X2, X2 - PXOR X3, X3 - PXOR X4, X4 - PXOR X5, X5 - MOVQ limit+24(FP),AX - - MOVOU X15, X13 - -loop_select: - - MOVOU X13, X12 - PADDL X15, X13 - PCMPEQL X14, X12 - - MOVOU (16*0)(DI), X6 - MOVOU (16*1)(DI), X7 - MOVOU (16*2)(DI), X8 - MOVOU (16*3)(DI), X9 - MOVOU (16*4)(DI), X10 - MOVOU (16*5)(DI), X11 - ADDQ $(16*6), DI - - PAND X12, X6 - PAND X12, X7 - PAND X12, X8 - PAND X12, X9 - PAND X12, X10 - PAND X12, X11 - - PXOR X6, X0 - PXOR X7, X1 - PXOR X8, X2 - PXOR X9, X3 - PXOR X10, X4 - PXOR X11, X5 - - DECQ AX - JNE loop_select - - MOVOU X0, (16*0)(DX) - MOVOU X1, (16*1)(DX) - MOVOU X2, (16*2)(DX) - MOVOU X3, (16*3)(DX) - MOVOU X4, (16*4)(DX) - MOVOU X5, (16*5)(DX) - - RET - -select_avx2: - VPXOR Y15, Y15, Y15 - VPCMPEQD Y14, Y14, Y14 - VPSUBD Y14, Y15, Y15 - MOVL idx+16(FP), X14 // x14 = idx - VPBROADCASTD X14, Y14 - - MOVQ limit+24(FP),AX - VMOVDQU Y15, Y13 - - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - -loop_select_avx2: - VMOVDQU Y13, Y12 - VPADDD Y15, Y13, Y13 - VPCMPEQD Y14, Y12, Y12 - - VPAND (32*0)(DI), Y12, Y3 - VPAND (32*1)(DI), Y12, Y4 - VPAND (32*2)(DI), Y12, Y5 - - ADDQ $(32*3), DI - - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y2, Y2 - - DECQ AX - JNE loop_select_avx2 - - VMOVDQU Y0, (32*0)(DX) - VMOVDQU Y1, (32*1)(DX) - VMOVDQU Y2, (32*2)(DX) - VZEROUPPER - RET - -/* ---------------------------------------*/ -// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int) -TEXT ·p256SelectAffine(SB),NOSPLIT,$0 - MOVQ idx+16(FP),AX - MOVQ table+8(FP),DI - MOVQ res+0(FP),DX - - 
CMPB ·supportAVX2+0(SB), $0x01 - JEQ select_base_avx2 - - PXOR X15, X15 // X15 = 0 - PCMPEQL X14, X14 // X14 = -1 - PSUBL X14, X15 // X15 = 1 - MOVL AX, X14 // x14 = idx - PSHUFD $0, X14, X14 - - MOVQ $16, AX - MOVOU X15, X13 - - PXOR X0, X0 - PXOR X1, X1 - PXOR X2, X2 - PXOR X3, X3 - -loop_select_base: - - MOVOU X13, X12 - PADDL X15, X13 - PCMPEQL X14, X12 - - MOVOU (16*0)(DI), X4 - MOVOU (16*1)(DI), X5 - MOVOU (16*2)(DI), X6 - MOVOU (16*3)(DI), X7 - - MOVOU (16*4)(DI), X8 - MOVOU (16*5)(DI), X9 - MOVOU (16*6)(DI), X10 - MOVOU (16*7)(DI), X11 - - ADDQ $(16*8), DI - - PAND X12, X4 - PAND X12, X5 - PAND X12, X6 - PAND X12, X7 - - MOVOU X13, X12 - PADDL X15, X13 - PCMPEQL X14, X12 - - PAND X12, X8 - PAND X12, X9 - PAND X12, X10 - PAND X12, X11 - - PXOR X4, X0 - PXOR X5, X1 - PXOR X6, X2 - PXOR X7, X3 - - PXOR X8, X0 - PXOR X9, X1 - PXOR X10, X2 - PXOR X11, X3 - - DECQ AX - JNE loop_select_base - - MOVOU X0, (16*0)(DX) - MOVOU X1, (16*1)(DX) - MOVOU X2, (16*2)(DX) - MOVOU X3, (16*3)(DX) - - RET - -select_base_avx2: - VPXOR Y15, Y15, Y15 - VPCMPEQD Y14, Y14, Y14 - VPSUBD Y14, Y15, Y15 - MOVL AX, X14 // x14 = idx - VPBROADCASTD X14, Y14 - - MOVQ $16, AX - VMOVDQU Y15, Y13 - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - -loop_select_base_avx2: - VMOVDQU Y13, Y12 - VPADDD Y15, Y13, Y13 - VPCMPEQD Y14, Y12, Y12 - - VPAND (32*0)(DI), Y12, Y2 - VPAND (32*1)(DI), Y12, Y3 - - VMOVDQU Y13, Y12 - VPADDD Y15, Y13, Y13 - VPCMPEQD Y14, Y12, Y12 - - VPAND (32*2)(DI), Y12, Y4 - VPAND (32*3)(DI), Y12, Y5 - - ADDQ $(32*4), DI - - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y1, Y1 - - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y1, Y1 - - DECQ AX - JNE loop_select_base_avx2 - - VMOVDQU Y0, (32*0)(DX) - VMOVDQU Y1, (32*1)(DX) - VZEROUPPER - RET - -/* ---------------------------------------*/ -#define p256OrdReduceInline(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \ - \// Copy result [255:0] - MOVQ a0, b0 \ - MOVQ a1, b1 \ - MOVQ a2, b2 \ - MOVQ a3, b3 \ - \// Subtract p256 - SUBQ p256ord<>+0x00(SB), a0 \ - SBBQ p256ord<>+0x08(SB) ,a1 \ - SBBQ p256ord<>+0x10(SB), a2 \ - SBBQ p256ord<>+0x18(SB), a3 \ - SBBQ $0, a4 \ - \ - CMOVQCS b0, a0 \ - CMOVQCS b1, a1 \ - CMOVQCS b2, a2 \ - CMOVQCS b3, a3 \ - \ - MOVQ a0, (8*0)(res) \ - MOVQ a1, (8*1)(res) \ - MOVQ a2, (8*2)(res) \ - MOVQ a3, (8*3)(res) \ - -//func p256OrdReduce(s *p256OrdElement) -TEXT ·p256OrdReduce(SB),NOSPLIT,$0 - MOVQ s+0(FP), res_ptr - MOVQ (8*0)(res_ptr), acc0 - MOVQ (8*1)(res_ptr), acc1 - MOVQ (8*2)(res_ptr), acc2 - MOVQ (8*3)(res_ptr), acc3 - XORQ acc4, acc4 - p256OrdReduceInline(acc0, acc1, acc2, acc3, acc4, acc5, x_ptr, y_ptr, t0, res_ptr) - RET - -// func p256OrdMul(res, in1, in2 *p256OrdElement) -TEXT ·p256OrdMul(SB),NOSPLIT,$0 - MOVQ res+0(FP), res_ptr - MOVQ in1+8(FP), x_ptr - MOVQ in2+16(FP), y_ptr - CMPB ·supportBMI2+0(SB), $0x01 - JEQ ordMulBMI2 - - // x * y[0] - MOVQ (8*0)(y_ptr), t0 - - MOVQ (8*0)(x_ptr), AX - MULQ t0 - MOVQ AX, acc0 - MOVQ DX, acc1 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, acc2 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, acc3 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, acc4 - XORQ acc5, acc5 - // First reduction step - MOVQ acc0, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 - - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc0 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ t1, acc1 - ADCQ $0, DX - ADDQ AX, acc1 - ADCQ DX, acc2 - ADCQ $0, acc3 - ADCQ t0, acc4 - ADCQ $0, acc5 - - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - 
- SUBQ t0, acc2 - SBBQ AX, acc3 - SBBQ DX, acc4 - SBBQ $0, acc5 - // x * y[1] - MOVQ (8*1)(y_ptr), t0 - - MOVQ (8*0)(x_ptr), AX - MULQ t0 - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - ADDQ t1, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ DX, acc5 - ADCQ $0, acc0 - // Second reduction step - MOVQ acc1, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 - - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ t1, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - ADCQ DX, acc3 - ADCQ $0, acc4 - ADCQ t0, acc5 - ADCQ $0, acc0 - - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc3 - SBBQ AX, acc4 - SBBQ DX, acc5 - SBBQ $0, acc0 - // x * y[2] - MOVQ (8*2)(y_ptr), t0 - - MOVQ (8*0)(x_ptr), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ DX, acc0 - ADCQ $0, acc1 - // Third reduction step - MOVQ acc2, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 - - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ DX, acc4 - ADCQ $0, acc5 - ADCQ t0, acc0 - ADCQ $0, acc1 - - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc4 - SBBQ AX, acc5 - SBBQ DX, acc0 - SBBQ $0, acc1 - // x * y[3] - MOVQ (8*3)(y_ptr), t0 - - MOVQ (8*0)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ t1, acc0 - ADCQ $0, DX - ADDQ AX, acc0 - ADCQ DX, acc1 - ADCQ $0, acc2 - // Last reduction step - MOVQ acc3, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 - - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ DX, acc5 - ADCQ $0, acc0 - ADCQ t0, acc1 - ADCQ $0, acc2 - - MOVQ t0, AX - MOVQ t0, DX - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc5 - SBBQ AX, acc0 - SBBQ DX, acc1 - SBBQ $0, acc2 - - p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, t1, res_ptr) - - RET - -ordMulBMI2: - // x * y[0] - MOVQ (8*0)(y_ptr), DX - MULXQ (8*0)(x_ptr), acc0, acc1 - - MULXQ (8*1)(x_ptr), AX, acc2 - ADDQ AX, acc1 - ADCQ $0, acc2 - - MULXQ (8*2)(x_ptr), AX, acc3 - ADDQ AX, acc2 - ADCQ $0, acc3 - - MULXQ (8*3)(x_ptr), AX, acc4 - ADDQ AX, acc3 - ADCQ $0, acc4 - - XORQ acc5, acc5 - - // First reduction step - MOVQ acc0, DX - MULXQ p256ordK0<>(SB), t0, AX - - MOVQ t0, DX - MULXQ p256ord<>+0x00(SB), AX, t1 - ADDQ AX, acc0 - ADCQ t1, acc1 - - MULXQ p256ord<>+0x08(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc1 - ADCQ t1, acc2 - ADCQ $0, acc3 - ADCQ t0, acc4 - ADCQ $0, acc5 - - MOVQ t0, AX - //MOVQ t0, DX // This is not required due to t0=DX already - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, 
acc2 - SBBQ AX, acc3 - SBBQ DX, acc4 - SBBQ $0, acc5 - - // x * y[1] - MOVQ (8*1)(y_ptr), DX - MULXQ (8*0)(x_ptr), AX, t1 - ADDQ AX, acc1 - ADCQ t1, acc2 - - MULXQ (8*1)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc2 - ADCQ t1, acc3 - - MULXQ (8*2)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc3 - ADCQ t1, acc4 - - MULXQ (8*3)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc4 - ADCQ t1, acc5 - ADCQ $0, acc0 - - // Second reduction step - MOVQ acc1, DX - MULXQ p256ordK0<>(SB), t0, AX - - MOVQ t0, DX - MULXQ p256ord<>+0x00(SB), AX, t1 - ADDQ AX, acc1 - ADCQ t1, acc2 - - MULXQ p256ord<>+0x08(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc2 - ADCQ t1, acc3 - ADCQ $0, acc4 - ADCQ t0, acc5 - ADCQ $0, acc0 - - MOVQ t0, AX - //MOVQ t0, DX // This is not required due to t0=DX already - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc3 - SBBQ AX, acc4 - SBBQ DX, acc5 - SBBQ $0, acc0 - - // x * y[2] - MOVQ (8*2)(y_ptr), DX - MULXQ (8*0)(x_ptr), AX, t1 - ADDQ AX, acc2 - ADCQ t1, acc3 - - MULXQ (8*1)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc3 - ADCQ t1, acc4 - - MULXQ (8*2)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc4 - ADCQ t1, acc5 - - MULXQ (8*3)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc5 - ADCQ t1, acc0 - ADCQ $0, acc1 - - // Third reduction step - MOVQ acc2, DX - MULXQ p256ordK0<>(SB), t0, AX - - MOVQ t0, DX - MULXQ p256ord<>+0x00(SB), AX, t1 - ADDQ AX, acc2 - ADCQ t1, acc3 - - MULXQ p256ord<>+0x08(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc3 - ADCQ t1, acc4 - ADCQ $0, acc5 - ADCQ t0, acc0 - ADCQ $0, acc1 - - MOVQ t0, AX - //MOVQ t0, DX // This is not required due to t0=DX already - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc4 - SBBQ AX, acc5 - SBBQ DX, acc0 - SBBQ $0, acc1 - - // x * y[3] - MOVQ (8*3)(y_ptr), DX - MULXQ (8*0)(x_ptr), AX, t1 - ADDQ AX, acc3 - ADCQ t1, acc4 - - MULXQ (8*1)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc4 - ADCQ t1, acc5 - - MULXQ (8*2)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc5 - ADCQ t1, acc0 - - MULXQ (8*3)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc0 - ADCQ t1, acc1 - ADCQ $0, acc2 - - // Last reduction step - MOVQ acc3, DX - MULXQ p256ordK0<>(SB), t0, AX - - MOVQ t0, DX - MULXQ p256ord<>+0x00(SB), AX, t1 - ADDQ AX, acc3 - ADCQ t1, acc4 - - MULXQ p256ord<>+0x08(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc4 - ADCQ t1, acc5 - ADCQ $0, acc0 - ADCQ t0, acc1 - ADCQ $0, acc2 - - MOVQ t0, AX - //MOVQ t0, DX // This is not required due to t0=DX already - SHLQ $32, AX - SHRQ $32, DX - - SUBQ t0, acc5 - SBBQ AX, acc0 - SBBQ DX, acc1 - SBBQ $0, acc2 - - p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, t1, res_ptr) - - RET - /* ---------------------------------------*/ // func p256OrdSqr(res, in *p256OrdElement, n int) TEXT ·p256OrdSqr(SB),NOSPLIT,$0 @@ -2462,91 +1005,6 @@ internalMulBMI2: RET -#define sm2P256SqrReductionInternal() \ - \ // First reduction step - MOVQ acc0, mul0 \ - MOVQ acc0, mul1 \ - SHLQ $32, mul0 \ - SHRQ $32, mul1 \ - \ - ADDQ acc0, acc1 \ - ADCQ $0, acc2 \ - ADCQ $0, acc3 \ - ADCQ $0, acc0 \ - \ - SUBQ mul0, acc1 \ - SBBQ mul1, acc2 \ - SBBQ mul0, acc3 \ - SBBQ mul1, acc0 \ - \ // Second reduction step - MOVQ acc1, mul0 \ - MOVQ acc1, mul1 \ - SHLQ $32, mul0 \ - SHRQ $32, mul1 \ - \ - ADDQ acc1, acc2 \ - ADCQ $0, acc3 \ - ADCQ $0, acc0 \ - ADCQ $0, acc1 \ - \ - SUBQ mul0, acc2 \ - SBBQ mul1, acc3 \ - SBBQ mul0, acc0 \ - SBBQ mul1, acc1 \ - \ // Third reduction step - MOVQ acc2, mul0 \ - MOVQ acc2, mul1 \ - SHLQ $32, mul0 \ - SHRQ $32, mul1 \ - \ - ADDQ acc2, acc3 \ - ADCQ $0, acc0 \ - ADCQ $0, acc1 \ - ADCQ $0, acc2 \ - \ - SUBQ mul0, acc3 \ - SBBQ mul1, 
acc0 \ - SBBQ mul0, acc1 \ - SBBQ mul1, acc2 \ - \ // Last reduction step - MOVQ acc3, mul0 \ - MOVQ acc3, mul1 \ - SHLQ $32, mul0 \ - SHRQ $32, mul1 \ - \ - ADDQ acc3, acc0 \ - ADCQ $0, acc1 \ - ADCQ $0, acc2 \ - ADCQ $0, acc3 \ - \ - SUBQ mul0, acc0 \ - SBBQ mul1, acc1 \ - SBBQ mul0, acc2 \ - SBBQ mul1, acc3 \ - MOVQ $0, BP \ - \ // Add bits [511:256] of the result - ADCQ acc0, t0 \ - ADCQ acc1, t1 \ - ADCQ acc2, t2 \ - ADCQ acc3, t3 \ - ADCQ $0, hlp \ - \ // Copy result - MOVQ t0, acc4 \ - MOVQ t1, acc5 \ - MOVQ t2, acc6 \ - MOVQ t3, acc7 \ - \ // Subtract p256 - SUBQ $-1, acc4 \ - SBBQ p256p<>+0x08(SB), acc5 \ - SBBQ $-1, acc6 \ - SBBQ p256p<>+0x018(SB), acc7 \ - SBBQ $0, hlp \ - \ // If the result of the subtraction is negative, restore the previous result - CMOVQCS t0, acc4 \ - CMOVQCS t1, acc5 \ - CMOVQCS t2, acc6 \ - CMOVQCS t3, acc7 \ - /* ---------------------------------------*/ // [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4]^2 TEXT sm2P256SqrInternal(SB),NOSPLIT,$8 @@ -2687,50 +1145,6 @@ internalSqrBMI2: RET /* ---------------------------------------*/ -// [t3, t2, t1, t0] = 2[acc7, acc6, acc5, acc4] -#define p256MulBy2Inline\ - XORQ mul0, mul0;\ - ADDQ acc4, acc4;\ - ADCQ acc5, acc5;\ - ADCQ acc6, acc6;\ - ADCQ acc7, acc7;\ - ADCQ $0, mul0;\ - MOVQ acc4, t0;\ - MOVQ acc5, t1;\ - MOVQ acc6, t2;\ - MOVQ acc7, t3;\ - SUBQ $-1, t0;\ - SBBQ p256p<>+0x08(SB), t1;\ - SBBQ $-1, t2;\ - SBBQ p256p<>+0x018(SB), t3;\ - SBBQ $0, mul0;\ - CMOVQCS acc4, t0;\ - CMOVQCS acc5, t1;\ - CMOVQCS acc6, t2;\ - CMOVQCS acc7, t3; -/* ---------------------------------------*/ -// [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] + [t3, t2, t1, t0] -#define p256AddInline \ - XORQ mul0, mul0;\ - ADDQ t0, acc4;\ - ADCQ t1, acc5;\ - ADCQ t2, acc6;\ - ADCQ t3, acc7;\ - ADCQ $0, mul0;\ - MOVQ acc4, t0;\ - MOVQ acc5, t1;\ - MOVQ acc6, t2;\ - MOVQ acc7, t3;\ - SUBQ $-1, t0;\ - SBBQ p256p<>+0x08(SB), t1;\ - SBBQ $-1, t2;\ - SBBQ p256p<>+0x018(SB), t3;\ - SBBQ $0, mul0;\ - CMOVQCS acc4, t0;\ - CMOVQCS acc5, t1;\ - CMOVQCS acc6, t2;\ - CMOVQCS acc7, t3; -/* ---------------------------------------*/ #define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7 #define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3 #define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3) @@ -3474,21 +1888,6 @@ pointadd_avx2: \/////////////////////// MOVQ $0, rptr \ -#define p256PointDoubleInit() \ - MOVOU (16*0)(BX), X0 \ - MOVOU (16*1)(BX), X1 \ - MOVOU (16*2)(BX), X2 \ - MOVOU (16*3)(BX), X3 \ - MOVOU (16*4)(BX), X4 \ - MOVOU (16*5)(BX), X5 \ - \ - MOVOU X0, x(16*0) \ - MOVOU X1, x(16*1) \ - MOVOU X2, y(16*0) \ - MOVOU X3, y(16*1) \ - MOVOU X4, z(16*0) \ - MOVOU X5, z(16*1) \ - //func p256PointDoubleAsm(res, in *SM2P256Point) TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-16 // Move input to stack in order to free registers diff --git a/internal/sm2ec/p256_asm_ord.go b/internal/sm2ec/p256_asm_ord.go index de2548f..2dd7fee 100644 --- a/internal/sm2ec/p256_asm_ord.go +++ b/internal/sm2ec/p256_asm_ord.go @@ -1,5 +1,5 @@ -//go:build (amd64 && !purego && !plugin) || (arm64 && !purego) -// +build amd64,!purego,!plugin arm64,!purego +//go:build (amd64 && !purego) || (arm64 && !purego) +// +build amd64,!purego arm64,!purego package sm2ec diff --git a/internal/sm2ec/p256_common_amd64.s b/internal/sm2ec/p256_common_amd64.s new file mode 100644 index 0000000..5dca464 --- /dev/null +++ b/internal/sm2ec/p256_common_amd64.s @@ -0,0 
+1,1318 @@ +//go:build amd64 && !purego +// +build amd64,!purego + +#include "textflag.h" + +#include "p256_macros_amd64.s" + +/* ---------------------------------------*/ +// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement) +TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0 + JMP ·p256BigToLittle(SB) +/* ---------------------------------------*/ +// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte) +TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0 + JMP ·p256BigToLittle(SB) +/* ---------------------------------------*/ +// func p256LittleToBig(res *[32]byte, in *p256Element) +TEXT ·p256LittleToBig(SB),NOSPLIT,$0 + JMP ·p256BigToLittle(SB) +/* ---------------------------------------*/ +// func p256BigToLittle(res *p256Element, in *[32]byte) +TEXT ·p256BigToLittle(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in+8(FP), x_ptr + + MOVQ (8*0)(x_ptr), acc0 + MOVQ (8*1)(x_ptr), acc1 + MOVQ (8*2)(x_ptr), acc2 + MOVQ (8*3)(x_ptr), acc3 + + BSWAPQ acc0 + BSWAPQ acc1 + BSWAPQ acc2 + BSWAPQ acc3 + + MOVQ acc3, (8*0)(res_ptr) + MOVQ acc2, (8*1)(res_ptr) + MOVQ acc1, (8*2)(res_ptr) + MOVQ acc0, (8*3)(res_ptr) + + RET +/* ---------------------------------------*/ +// func p256MovCond(res, a, b *SM2P256Point, cond int) +TEXT ·p256MovCond(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ a+8(FP), x_ptr + MOVQ b+16(FP), y_ptr + MOVQ cond+24(FP), X12 + + CMPB ·supportAVX2+0(SB), $0x01 + JEQ move_avx2 + + PXOR X13, X13 + PSHUFD $0, X12, X12 + PCMPEQL X13, X12 + + MOVOU X12, X0 + MOVOU (16*0)(x_ptr), X6 + PANDN X6, X0 + + MOVOU X12, X1 + MOVOU (16*1)(x_ptr), X7 + PANDN X7, X1 + + MOVOU X12, X2 + MOVOU (16*2)(x_ptr), X8 + PANDN X8, X2 + + MOVOU X12, X3 + MOVOU (16*3)(x_ptr), X9 + PANDN X9, X3 + + MOVOU X12, X4 + MOVOU (16*4)(x_ptr), X10 + PANDN X10, X4 + + MOVOU X12, X5 + MOVOU (16*5)(x_ptr), X11 + PANDN X11, X5 + + MOVOU (16*0)(y_ptr), X6 + MOVOU (16*1)(y_ptr), X7 + MOVOU (16*2)(y_ptr), X8 + MOVOU (16*3)(y_ptr), X9 + MOVOU (16*4)(y_ptr), X10 + MOVOU (16*5)(y_ptr), X11 + + PAND X12, X6 + PAND X12, X7 + PAND X12, X8 + PAND X12, X9 + PAND X12, X10 + PAND X12, X11 + + PXOR X6, X0 + PXOR X7, X1 + PXOR X8, X2 + PXOR X9, X3 + PXOR X10, X4 + PXOR X11, X5 + + MOVOU X0, (16*0)(res_ptr) + MOVOU X1, (16*1)(res_ptr) + MOVOU X2, (16*2)(res_ptr) + MOVOU X3, (16*3)(res_ptr) + MOVOU X4, (16*4)(res_ptr) + MOVOU X5, (16*5)(res_ptr) + + RET + +move_avx2: + VPXOR Y13, Y13, Y13 + VPBROADCASTD X12, Y12 + VPCMPEQD Y13, Y12, Y12 + + VPANDN (32*0)(x_ptr), Y12, Y0 + VPANDN (32*1)(x_ptr), Y12, Y1 + VPANDN (32*2)(x_ptr), Y12, Y2 + + VPAND (32*0)(y_ptr), Y12, Y3 + VPAND (32*1)(y_ptr), Y12, Y4 + VPAND (32*2)(y_ptr), Y12, Y5 + + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y2, Y2 + + VMOVDQU Y0, (32*0)(res_ptr) + VMOVDQU Y1, (32*1)(res_ptr) + VMOVDQU Y2, (32*2)(res_ptr) + + VZEROUPPER + RET + +/* ---------------------------------------*/ +// func p256NegCond(val *p256Element, cond int) +TEXT ·p256NegCond(SB),NOSPLIT,$0 + MOVQ val+0(FP), res_ptr + MOVQ cond+8(FP), t0 + // acc = poly + MOVQ $-1, acc0 + MOVQ p256p<>+0x08(SB), acc1 + MOVQ $-1, acc2 + MOVQ p256p<>+0x18(SB), acc3 + // Load the original value + MOVQ (8*0)(res_ptr), acc4 + MOVQ (8*1)(res_ptr), x_ptr + MOVQ (8*2)(res_ptr), y_ptr + MOVQ (8*3)(res_ptr), acc5 + // Speculatively subtract + SUBQ acc4, acc0 + SBBQ x_ptr, acc1 + SBBQ y_ptr, acc2 + SBBQ acc5, acc3 + // If condition is 0, keep original value + TESTQ t0, t0 + CMOVQEQ acc4, acc0 + CMOVQEQ x_ptr, acc1 + CMOVQEQ y_ptr, acc2 + CMOVQEQ acc5, acc3 + // Store result + MOVQ acc0, (8*0)(res_ptr) + MOVQ acc1, (8*1)(res_ptr) + MOVQ 
acc2, (8*2)(res_ptr) + MOVQ acc3, (8*3)(res_ptr) + + RET + +/* ---------------------------------------*/ +// func p256Mul(res, in1, in2 *p256Element) +TEXT ·p256Mul(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in1+8(FP), x_ptr + MOVQ in2+16(FP), y_ptr + + CMPB ·supportBMI2+0(SB), $0x01 + JEQ mulBMI2 + + // x * y[0] + MOVQ (8*0)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + MOVQ AX, acc0 + MOVQ DX, acc1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, acc2 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc4 + XORQ acc5, acc5 + // First reduction step + MOVQ acc0, AX + MOVQ acc0, DX + SHLQ $32, AX + SHRQ $32, DX + + ADDQ acc0, acc1 + ADCQ $0, acc2 + ADCQ $0, acc3 + ADCQ acc0, acc4 + ADCQ $0, acc5 + + SUBQ AX, acc1 + SBBQ DX, acc2 + SBBQ AX, acc3 + SBBQ DX, acc4 + SBBQ $0, acc5 + XORQ acc0, acc0 + + // x * y[1] + MOVQ (8*1)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ BX, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ BX, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ BX, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ DX, acc5 + ADCQ $0, acc0 + // Second reduction step + MOVQ acc1, AX + MOVQ acc1, DX + SHLQ $32, AX + SHRQ $32, DX + + ADDQ acc1, acc2 + ADCQ $0, acc3 + ADCQ $0, acc4 + ADCQ acc1, acc5 + ADCQ $0, acc0 + + SUBQ AX, acc2 + SBBQ DX, acc3 + SBBQ AX, acc4 + SBBQ DX, acc5 + SBBQ $0, acc0 + XORQ acc1, acc1 + + // x * y[2] + MOVQ (8*2)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ BX, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ BX, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ BX, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ DX, acc0 + ADCQ $0, acc1 + // Third reduction step + MOVQ acc2, AX + MOVQ acc2, DX + SHLQ $32, AX + SHRQ $32, DX + + ADDQ acc2, acc3 + ADCQ $0, acc4 + ADCQ $0, acc5 + ADCQ acc2, acc0 + ADCQ $0, acc1 + + SUBQ AX, acc3 + SBBQ DX, acc4 + SBBQ AX, acc5 + SBBQ DX, acc0 + SBBQ $0, acc1 + XORQ acc2, acc2 + // x * y[3] + MOVQ (8*3)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ BX, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ BX, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ BX, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ DX, acc1 + ADCQ $0, acc2 + // Last reduction step + MOVQ acc3, AX + MOVQ acc3, DX + SHLQ $32, AX + SHRQ $32, DX + + ADDQ acc3, acc4 + ADCQ $0, acc5 + ADCQ $0, acc0 + ADCQ acc3, acc1 + ADCQ $0, acc2 + + SUBQ AX, acc4 + SBBQ DX, acc5 + SBBQ AX, acc0 + SBBQ DX, acc1 + SBBQ $0, acc2 + p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr) + RET + +mulBMI2: + // x * y[0] + MOVQ (8*0)(y_ptr), DX + MULXQ (8*0)(x_ptr), acc0, acc1 + + MULXQ (8*1)(x_ptr), AX, acc2 + ADDQ AX, acc1 + + MULXQ (8*2)(x_ptr), AX, acc3 + ADCQ AX, acc2 + + MULXQ (8*3)(x_ptr), AX, acc4 + ADCQ AX, acc3 + ADCQ $0, acc4 + + XORQ acc5, acc5 + // First reduction step + MOVQ acc0, AX + MOVQ acc0, DX + SHLQ $32, AX + SHRQ $32, DX + + 
ADDQ acc0, acc1 + ADCQ $0, acc2 + ADCQ $0, acc3 + ADCQ acc0, acc4 + ADCQ $0, acc5 + + SUBQ AX, acc1 + SBBQ DX, acc2 + SBBQ AX, acc3 + SBBQ DX, acc4 + SBBQ $0, acc5 + XORQ acc0, acc0 + + // x * y[1] + MOVQ (8*1)(y_ptr), DX + MULXQ (8*0)(x_ptr), AX, BX + ADDQ AX, acc1 + ADCQ BX, acc2 + + MULXQ (8*1)(x_ptr), AX, BX + ADCQ $0, BX + ADDQ AX, acc2 + ADCQ BX, acc3 + + MULXQ (8*2)(x_ptr), AX, BX + ADCQ $0, BX + ADDQ AX, acc3 + ADCQ BX, acc4 + + MULXQ (8*3)(x_ptr), AX, BX + ADCQ $0, BX + ADDQ AX, acc4 + ADCQ BX, acc5 + ADCQ $0, acc0 + + // Second reduction step + MOVQ acc1, AX + MOVQ acc1, DX + SHLQ $32, AX + SHRQ $32, DX + + ADDQ acc1, acc2 + ADCQ $0, acc3 + ADCQ $0, acc4 + ADCQ acc1, acc5 + ADCQ $0, acc0 + + SUBQ AX, acc2 + SBBQ DX, acc3 + SBBQ AX, acc4 + SBBQ DX, acc5 + SBBQ $0, acc0 + XORQ acc1, acc1 + + // x * y[2] + MOVQ (8*2)(y_ptr), DX + + MULXQ (8*0)(x_ptr), AX, BX + ADDQ AX, acc2 + ADCQ BX, acc3 + + MULXQ (8*1)(x_ptr), AX, BX + ADCQ $0, BX + ADDQ AX, acc3 + ADCQ BX, acc4 + + MULXQ (8*2)(x_ptr), AX, BX + ADCQ $0, BX + ADDQ AX, acc4 + ADCQ BX, acc5 + + MULXQ (8*3)(x_ptr), AX, BX + ADCQ $0, BX + ADDQ AX, acc5 + ADCQ BX, acc0 + ADCQ $0, acc1 + // Third reduction step + MOVQ acc2, AX + MOVQ acc2, DX + SHLQ $32, AX + SHRQ $32, DX + + ADDQ acc2, acc3 + ADCQ $0, acc4 + ADCQ $0, acc5 + ADCQ acc2, acc0 + ADCQ $0, acc1 + + SUBQ AX, acc3 + SBBQ DX, acc4 + SBBQ AX, acc5 + SBBQ DX, acc0 + SBBQ $0, acc1 + XORQ acc2, acc2 + // x * y[3] + MOVQ (8*3)(y_ptr), DX + + MULXQ (8*0)(x_ptr), AX, BX + ADDQ AX, acc3 + ADCQ BX, acc4 + + MULXQ (8*1)(x_ptr), AX, BX + ADCQ $0, BX + ADDQ AX, acc4 + ADCQ BX, acc5 + + MULXQ (8*2)(x_ptr), AX, BX + ADCQ $0, BX + ADDQ AX, acc5 + ADCQ BX, acc0 + + MULXQ (8*3)(x_ptr), AX, BX + ADCQ $0, BX + ADDQ AX, acc0 + ADCQ BX, acc1 + ADCQ $0, acc2 + // Last reduction step + MOVQ acc3, AX + MOVQ acc3, DX + SHLQ $32, AX + SHRQ $32, DX + + ADDQ acc3, acc4 + ADCQ $0, acc5 + ADCQ $0, acc0 + ADCQ acc3, acc1 + ADCQ $0, acc2 + + SUBQ AX, acc4 + SBBQ DX, acc5 + SBBQ AX, acc0 + SBBQ DX, acc1 + SBBQ $0, acc2 + p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr) + RET + +/* ---------------------------------------*/ +// func p256FromMont(res, in *p256Element) +TEXT ·p256FromMont(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in+8(FP), x_ptr + + MOVQ (8*0)(x_ptr), acc0 + MOVQ (8*1)(x_ptr), acc1 + MOVQ (8*2)(x_ptr), acc2 + MOVQ (8*3)(x_ptr), acc3 + XORQ acc4, acc4 + + // Only reduce, no multiplications are needed + // First stage + MOVQ acc0, AX + MOVQ acc0, DX + SHLQ $32, AX + SHRQ $32, DX + + ADDQ acc0, acc1 + ADCQ $0, acc2 + ADCQ $0, acc3 + ADCQ acc0, acc4 + + SUBQ AX, acc1 + SBBQ DX, acc2 + SBBQ AX, acc3 + SBBQ DX, acc4 + XORQ acc5, acc5 + + // Second stage + MOVQ acc1, AX + MOVQ acc1, DX + SHLQ $32, AX + SHRQ $32, DX + + ADDQ acc1, acc2 + ADCQ $0, acc3 + ADCQ $0, acc4 + ADCQ acc1, acc5 + + SUBQ AX, acc2 + SBBQ DX, acc3 + SBBQ AX, acc4 + SBBQ DX, acc5 + XORQ acc0, acc0 + // Third stage + MOVQ acc2, AX + MOVQ acc2, DX + SHLQ $32, AX + SHRQ $32, DX + + ADDQ acc2, acc3 + ADCQ $0, acc4 + ADCQ $0, acc5 + ADCQ acc2, acc0 + + SUBQ AX, acc3 + SBBQ DX, acc4 + SBBQ AX, acc5 + SBBQ DX, acc0 + XORQ acc1, acc1 + // Last stage + MOVQ acc3, AX + MOVQ acc3, DX + SHLQ $32, AX + SHRQ $32, DX + + ADDQ acc3, acc4 + ADCQ $0, acc5 + ADCQ $0, acc0 + ADCQ acc3, acc1 + + SUBQ AX, acc4 + SBBQ DX, acc5 + SBBQ AX, acc0 + SBBQ DX, acc1 + + MOVQ acc4, x_ptr + MOVQ acc5, acc3 + MOVQ acc0, t0 + MOVQ acc1, BX + + SUBQ $-1, acc4 + SBBQ p256p<>+0x08(SB), acc5 + SBBQ $-1, acc0 + SBBQ p256p<>+0x018(SB), acc1 + + 
CMOVQCS x_ptr, acc4 + CMOVQCS acc3, acc5 + CMOVQCS t0, acc0 + CMOVQCS BX, acc1 + + MOVQ acc4, (8*0)(res_ptr) + MOVQ acc5, (8*1)(res_ptr) + MOVQ acc0, (8*2)(res_ptr) + MOVQ acc1, (8*3)(res_ptr) + + RET +/* ---------------------------------------*/ +// func p256Select(res *SM2P256Point, table *p256Table, idx, limit int) +TEXT ·p256Select(SB),NOSPLIT,$0 + //MOVQ idx+16(FP),AX + MOVQ table+8(FP),DI + MOVQ res+0(FP),DX + + CMPB ·supportAVX2+0(SB), $0x01 + JEQ select_avx2 + + PXOR X15, X15 // X15 = 0 + PCMPEQL X14, X14 // X14 = -1 + PSUBL X14, X15 // X15 = 1 + MOVL idx+16(FP), X14 + PSHUFD $0, X14, X14 + + PXOR X0, X0 + PXOR X1, X1 + PXOR X2, X2 + PXOR X3, X3 + PXOR X4, X4 + PXOR X5, X5 + MOVQ limit+24(FP),AX + + MOVOU X15, X13 + +loop_select: + + MOVOU X13, X12 + PADDL X15, X13 + PCMPEQL X14, X12 + + MOVOU (16*0)(DI), X6 + MOVOU (16*1)(DI), X7 + MOVOU (16*2)(DI), X8 + MOVOU (16*3)(DI), X9 + MOVOU (16*4)(DI), X10 + MOVOU (16*5)(DI), X11 + ADDQ $(16*6), DI + + PAND X12, X6 + PAND X12, X7 + PAND X12, X8 + PAND X12, X9 + PAND X12, X10 + PAND X12, X11 + + PXOR X6, X0 + PXOR X7, X1 + PXOR X8, X2 + PXOR X9, X3 + PXOR X10, X4 + PXOR X11, X5 + + DECQ AX + JNE loop_select + + MOVOU X0, (16*0)(DX) + MOVOU X1, (16*1)(DX) + MOVOU X2, (16*2)(DX) + MOVOU X3, (16*3)(DX) + MOVOU X4, (16*4)(DX) + MOVOU X5, (16*5)(DX) + + RET + +select_avx2: + VPXOR Y15, Y15, Y15 + VPCMPEQD Y14, Y14, Y14 + VPSUBD Y14, Y15, Y15 + MOVL idx+16(FP), X14 // x14 = idx + VPBROADCASTD X14, Y14 + + MOVQ limit+24(FP),AX + VMOVDQU Y15, Y13 + + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + +loop_select_avx2: + VMOVDQU Y13, Y12 + VPADDD Y15, Y13, Y13 + VPCMPEQD Y14, Y12, Y12 + + VPAND (32*0)(DI), Y12, Y3 + VPAND (32*1)(DI), Y12, Y4 + VPAND (32*2)(DI), Y12, Y5 + + ADDQ $(32*3), DI + + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y2, Y2 + + DECQ AX + JNE loop_select_avx2 + + VMOVDQU Y0, (32*0)(DX) + VMOVDQU Y1, (32*1)(DX) + VMOVDQU Y2, (32*2)(DX) + VZEROUPPER + RET + +/* ---------------------------------------*/ +// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int) +TEXT ·p256SelectAffine(SB),NOSPLIT,$0 + MOVQ idx+16(FP),AX + MOVQ table+8(FP),DI + MOVQ res+0(FP),DX + + CMPB ·supportAVX2+0(SB), $0x01 + JEQ select_base_avx2 + + PXOR X15, X15 // X15 = 0 + PCMPEQL X14, X14 // X14 = -1 + PSUBL X14, X15 // X15 = 1 + MOVL AX, X14 // x14 = idx + PSHUFD $0, X14, X14 + + MOVQ $16, AX + MOVOU X15, X13 + + PXOR X0, X0 + PXOR X1, X1 + PXOR X2, X2 + PXOR X3, X3 + +loop_select_base: + + MOVOU X13, X12 + PADDL X15, X13 + PCMPEQL X14, X12 + + MOVOU (16*0)(DI), X4 + MOVOU (16*1)(DI), X5 + MOVOU (16*2)(DI), X6 + MOVOU (16*3)(DI), X7 + + MOVOU (16*4)(DI), X8 + MOVOU (16*5)(DI), X9 + MOVOU (16*6)(DI), X10 + MOVOU (16*7)(DI), X11 + + ADDQ $(16*8), DI + + PAND X12, X4 + PAND X12, X5 + PAND X12, X6 + PAND X12, X7 + + MOVOU X13, X12 + PADDL X15, X13 + PCMPEQL X14, X12 + + PAND X12, X8 + PAND X12, X9 + PAND X12, X10 + PAND X12, X11 + + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X2 + PXOR X7, X3 + + PXOR X8, X0 + PXOR X9, X1 + PXOR X10, X2 + PXOR X11, X3 + + DECQ AX + JNE loop_select_base + + MOVOU X0, (16*0)(DX) + MOVOU X1, (16*1)(DX) + MOVOU X2, (16*2)(DX) + MOVOU X3, (16*3)(DX) + + RET + +select_base_avx2: + VPXOR Y15, Y15, Y15 + VPCMPEQD Y14, Y14, Y14 + VPSUBD Y14, Y15, Y15 + MOVL AX, X14 // x14 = idx + VPBROADCASTD X14, Y14 + + MOVQ $16, AX + VMOVDQU Y15, Y13 + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + +loop_select_base_avx2: + VMOVDQU Y13, Y12 + VPADDD Y15, Y13, Y13 + VPCMPEQD Y14, Y12, Y12 + + VPAND (32*0)(DI), Y12, Y2 + VPAND 
(32*1)(DI), Y12, Y3 + + VMOVDQU Y13, Y12 + VPADDD Y15, Y13, Y13 + VPCMPEQD Y14, Y12, Y12 + + VPAND (32*2)(DI), Y12, Y4 + VPAND (32*3)(DI), Y12, Y5 + + ADDQ $(32*4), DI + + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y1, Y1 + + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y1, Y1 + + DECQ AX + JNE loop_select_base_avx2 + + VMOVDQU Y0, (32*0)(DX) + VMOVDQU Y1, (32*1)(DX) + VZEROUPPER + RET + +//func p256OrdReduce(s *p256OrdElement) +TEXT ·p256OrdReduce(SB),NOSPLIT,$0 + MOVQ s+0(FP), res_ptr + MOVQ (8*0)(res_ptr), acc0 + MOVQ (8*1)(res_ptr), acc1 + MOVQ (8*2)(res_ptr), acc2 + MOVQ (8*3)(res_ptr), acc3 + XORQ acc4, acc4 + p256OrdReduceInline(acc0, acc1, acc2, acc3, acc4, acc5, x_ptr, y_ptr, t0, res_ptr) + RET + +// func p256OrdMul(res, in1, in2 *p256OrdElement) +TEXT ·p256OrdMul(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in1+8(FP), x_ptr + MOVQ in2+16(FP), y_ptr + CMPB ·supportBMI2+0(SB), $0x01 + JEQ ordMulBMI2 + + // x * y[0] + MOVQ (8*0)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + MOVQ AX, acc0 + MOVQ DX, acc1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, acc2 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc4 + XORQ acc5, acc5 + // First reduction step + MOVQ acc0, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc0 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ BX, acc1 + ADCQ $0, DX + ADDQ AX, acc1 + ADCQ DX, acc2 + ADCQ $0, acc3 + ADCQ t0, acc4 + ADCQ $0, acc5 + + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc2 + SBBQ AX, acc3 + SBBQ DX, acc4 + SBBQ $0, acc5 + // x * y[1] + MOVQ (8*1)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ BX, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ BX, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ BX, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ DX, acc5 + ADCQ $0, acc0 + // Second reduction step + MOVQ acc1, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ BX, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ DX, acc3 + ADCQ $0, acc4 + ADCQ t0, acc5 + ADCQ $0, acc0 + + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc3 + SBBQ AX, acc4 + SBBQ DX, acc5 + SBBQ $0, acc0 + // x * y[2] + MOVQ (8*2)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ BX, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ BX, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ BX, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ DX, acc0 + ADCQ $0, acc1 + // Third reduction step + MOVQ acc2, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ BX, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ DX, acc4 + ADCQ $0, acc5 + ADCQ t0, acc0 + ADCQ $0, acc1 + + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc4 + SBBQ AX, acc5 + SBBQ DX, acc0 + SBBQ $0, acc1 + // x * y[3] + MOVQ (8*3)(y_ptr), t0 + + MOVQ 
(8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ BX, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ BX, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ BX, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ DX, acc1 + ADCQ $0, acc2 + // Last reduction step + MOVQ acc3, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ BX, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ DX, acc5 + ADCQ $0, acc0 + ADCQ t0, acc1 + ADCQ $0, acc2 + + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc5 + SBBQ AX, acc0 + SBBQ DX, acc1 + SBBQ $0, acc2 + + p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr) + + RET + +ordMulBMI2: + // x * y[0] + MOVQ (8*0)(y_ptr), DX + MULXQ (8*0)(x_ptr), acc0, acc1 + + MULXQ (8*1)(x_ptr), AX, acc2 + ADDQ AX, acc1 + ADCQ $0, acc2 + + MULXQ (8*2)(x_ptr), AX, acc3 + ADDQ AX, acc2 + ADCQ $0, acc3 + + MULXQ (8*3)(x_ptr), AX, acc4 + ADDQ AX, acc3 + ADCQ $0, acc4 + + XORQ acc5, acc5 + + // First reduction step + MOVQ acc0, DX + MULXQ p256ordK0<>(SB), t0, AX + + MOVQ t0, DX + MULXQ p256ord<>+0x00(SB), AX, BX + ADDQ AX, acc0 + ADCQ BX, acc1 + + MULXQ p256ord<>+0x08(SB), AX, BX + ADCQ $0, BX + ADDQ AX, acc1 + ADCQ BX, acc2 + ADCQ $0, acc3 + ADCQ t0, acc4 + ADCQ $0, acc5 + + MOVQ t0, AX + //MOVQ t0, DX // This is not required due to t0=DX already + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc2 + SBBQ AX, acc3 + SBBQ DX, acc4 + SBBQ $0, acc5 + + // x * y[1] + MOVQ (8*1)(y_ptr), DX + MULXQ (8*0)(x_ptr), AX, BX + ADDQ AX, acc1 + ADCQ BX, acc2 + + MULXQ (8*1)(x_ptr), AX, BX + ADCQ $0, BX + ADDQ AX, acc2 + ADCQ BX, acc3 + + MULXQ (8*2)(x_ptr), AX, BX + ADCQ $0, BX + ADDQ AX, acc3 + ADCQ BX, acc4 + + MULXQ (8*3)(x_ptr), AX, BX + ADCQ $0, BX + ADDQ AX, acc4 + ADCQ BX, acc5 + ADCQ $0, acc0 + + // Second reduction step + MOVQ acc1, DX + MULXQ p256ordK0<>(SB), t0, AX + + MOVQ t0, DX + MULXQ p256ord<>+0x00(SB), AX, BX + ADDQ AX, acc1 + ADCQ BX, acc2 + + MULXQ p256ord<>+0x08(SB), AX, BX + ADCQ $0, BX + ADDQ AX, acc2 + ADCQ BX, acc3 + ADCQ $0, acc4 + ADCQ t0, acc5 + ADCQ $0, acc0 + + MOVQ t0, AX + //MOVQ t0, DX // This is not required due to t0=DX already + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc3 + SBBQ AX, acc4 + SBBQ DX, acc5 + SBBQ $0, acc0 + + // x * y[2] + MOVQ (8*2)(y_ptr), DX + MULXQ (8*0)(x_ptr), AX, BX + ADDQ AX, acc2 + ADCQ BX, acc3 + + MULXQ (8*1)(x_ptr), AX, BX + ADCQ $0, BX + ADDQ AX, acc3 + ADCQ BX, acc4 + + MULXQ (8*2)(x_ptr), AX, BX + ADCQ $0, BX + ADDQ AX, acc4 + ADCQ BX, acc5 + + MULXQ (8*3)(x_ptr), AX, BX + ADCQ $0, BX + ADDQ AX, acc5 + ADCQ BX, acc0 + ADCQ $0, acc1 + + // Third reduction step + MOVQ acc2, DX + MULXQ p256ordK0<>(SB), t0, AX + + MOVQ t0, DX + MULXQ p256ord<>+0x00(SB), AX, BX + ADDQ AX, acc2 + ADCQ BX, acc3 + + MULXQ p256ord<>+0x08(SB), AX, BX + ADCQ $0, BX + ADDQ AX, acc3 + ADCQ BX, acc4 + ADCQ $0, acc5 + ADCQ t0, acc0 + ADCQ $0, acc1 + + MOVQ t0, AX + //MOVQ t0, DX // This is not required due to t0=DX already + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc4 + SBBQ AX, acc5 + SBBQ DX, acc0 + SBBQ $0, acc1 + + // x * y[3] + MOVQ (8*3)(y_ptr), DX + MULXQ (8*0)(x_ptr), AX, BX + ADDQ AX, acc3 + ADCQ BX, acc4 + + MULXQ (8*1)(x_ptr), AX, BX + ADCQ $0, BX + ADDQ AX, acc4 + ADCQ BX, acc5 + + MULXQ (8*2)(x_ptr), AX, BX + ADCQ $0, BX + 
ADDQ AX, acc5 + ADCQ BX, acc0 + + MULXQ (8*3)(x_ptr), AX, BX + ADCQ $0, BX + ADDQ AX, acc0 + ADCQ BX, acc1 + ADCQ $0, acc2 + + // Last reduction step + MOVQ acc3, DX + MULXQ p256ordK0<>(SB), t0, AX + + MOVQ t0, DX + MULXQ p256ord<>+0x00(SB), AX, BX + ADDQ AX, acc3 + ADCQ BX, acc4 + + MULXQ p256ord<>+0x08(SB), AX, BX + ADCQ $0, BX + ADDQ AX, acc4 + ADCQ BX, acc5 + ADCQ $0, acc0 + ADCQ t0, acc1 + ADCQ $0, acc2 + + MOVQ t0, AX + //MOVQ t0, DX // This is not required due to t0=DX already + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc5 + SBBQ AX, acc0 + SBBQ DX, acc1 + SBBQ $0, acc2 + + p256OrdReduceInline(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr) + + RET diff --git a/internal/sm2ec/p256_macros_amd64.s b/internal/sm2ec/p256_macros_amd64.s new file mode 100644 index 0000000..5cc2728 --- /dev/null +++ b/internal/sm2ec/p256_macros_amd64.s @@ -0,0 +1,290 @@ +#define res_ptr DI +#define x_ptr SI +#define y_ptr CX + +#define acc0 R8 +#define acc1 R9 +#define acc2 R10 +#define acc3 R11 +#define acc4 R12 +#define acc5 R13 +#define t0 R14 + +DATA p256p<>+0x00(SB)/8, $0xffffffffffffffff +DATA p256p<>+0x08(SB)/8, $0xffffffff00000000 +DATA p256p<>+0x10(SB)/8, $0xffffffffffffffff +DATA p256p<>+0x18(SB)/8, $0xfffffffeffffffff +DATA p256ordK0<>+0x00(SB)/8, $0x327f9e8872350975 +DATA p256ord<>+0x00(SB)/8, $0x53bbf40939d54123 +DATA p256ord<>+0x08(SB)/8, $0x7203df6b21c6052b +DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff +DATA p256ord<>+0x18(SB)/8, $0xfffffffeffffffff +DATA p256one<>+0x00(SB)/8, $0x0000000000000001 +DATA p256one<>+0x08(SB)/8, $0x00000000ffffffff +DATA p256one<>+0x10(SB)/8, $0x0000000000000000 +DATA p256one<>+0x18(SB)/8, $0x0000000100000000 +GLOBL p256p<>(SB), 8, $32 +GLOBL p256ordK0<>(SB), 8, $8 +GLOBL p256ord<>(SB), 8, $32 +GLOBL p256one<>(SB), 8, $32 + +#define p256SqrMontReduce() \ + \ // First reduction step, [p3, p2, p1, p0] = [1, -0x100000000, 0, (1 - 0x100000000), -1] + MOVQ acc0, AX \ + MOVQ acc0, DX \ + SHLQ $32, AX \ // AX = L(acc0 * 2^32), low part + SHRQ $32, DX \ // DX = H(acc0 * 2^32), high part + \ // calculate the positive part first: [1, 0, 0, 1] * acc0 + [0, acc3, acc2, acc1], + \ // due to (-1) * acc0 + acc0 == 0, so last lowest lamb 0 is dropped directly, no carry. 
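+	\ // (A note on the reduction, stated here since the same pattern repeats for all four steps:
+	\ // p = 2^256 - 2^224 - 2^96 + 2^64 - 1, so p mod 2^64 = -1 and the Montgomery multiplier is
+	\ // the limb itself, and limb*p is assembled from the 32-bit shifts in AX/DX instead of MULs.)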
+ ADDQ acc0, acc1 \ // acc1' = L (acc0 + acc1) + ADCQ $0, acc2 \ // acc2' = acc2 + carry1 + ADCQ $0, acc3 \ // acc3' = acc3 + carry2 + ADCQ $0, acc0 \ // acc0' = acc0 + carry3 + \// calculate the negative part: [0, -0x100000000, 0, -0x100000000] * acc0 + SUBQ AX, acc1 \ + SBBQ DX, acc2 \ + SBBQ AX, acc3 \ + SBBQ DX, acc0 \ + \ // Second reduction step + MOVQ acc1, AX \ + MOVQ acc1, DX \ + SHLQ $32, AX \ + SHRQ $32, DX \ + \ + ADDQ acc1, acc2 \ + ADCQ $0, acc3 \ + ADCQ $0, acc0 \ + ADCQ $0, acc1 \ + \ + SUBQ AX, acc2 \ + SBBQ DX, acc3 \ + SBBQ AX, acc0 \ + SBBQ DX, acc1 \ + \ // Third reduction step + MOVQ acc2, AX \ + MOVQ acc2, DX \ + SHLQ $32, AX \ + SHRQ $32, DX \ + \ + ADDQ acc2, acc3 \ + ADCQ $0, acc0 \ + ADCQ $0, acc1 \ + ADCQ $0, acc2 \ + \ + SUBQ AX, acc3 \ + SBBQ DX, acc0 \ + SBBQ AX, acc1 \ + SBBQ DX, acc2 \ + \ // Last reduction step + XORQ t0, t0 \ + MOVQ acc3, AX \ + MOVQ acc3, DX \ + SHLQ $32, AX \ + SHRQ $32, DX \ + \ + ADDQ acc3, acc0 \ + ADCQ $0, acc1 \ + ADCQ $0, acc2 \ + ADCQ $0, acc3 \ + \ + SUBQ AX, acc0 \ + SBBQ DX, acc1 \ + SBBQ AX, acc2 \ + SBBQ DX, acc3 \ + \ // Add bits [511:256] of the sqr result + ADCQ acc4, acc0 \ + ADCQ acc5, acc1 \ + ADCQ y_ptr, acc2 \ + ADCQ x_ptr, acc3 \ + ADCQ $0, t0 + +#define p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \ + MOVQ a0, b0 \ + MOVQ a1, b1 \ + MOVQ a2, b2 \ + MOVQ a3, b3 \ + \ // Subtract p256 + SUBQ $-1, a0 \ + SBBQ p256p<>+0x08(SB), a1 \ + SBBQ $-1, a2 \ + SBBQ p256p<>+0x018(SB), a3 \ + SBBQ $0, a4 \ + \ + CMOVQCS b0, a0 \ + CMOVQCS b1, a1 \ + CMOVQCS b2, a2 \ + CMOVQCS b3, a3 \ + \ + MOVQ a0, (8*0)(res) \ + MOVQ a1, (8*1)(res) \ + MOVQ a2, (8*2)(res) \ + MOVQ a3, (8*3)(res) + +/* ---------------------------------------*/ +#define p256OrdReduceInline(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \ + \// Copy result [255:0] + MOVQ a0, b0 \ + MOVQ a1, b1 \ + MOVQ a2, b2 \ + MOVQ a3, b3 \ + \// Subtract p256 + SUBQ p256ord<>+0x00(SB), a0 \ + SBBQ p256ord<>+0x08(SB) ,a1 \ + SBBQ p256ord<>+0x10(SB), a2 \ + SBBQ p256ord<>+0x18(SB), a3 \ + SBBQ $0, a4 \ + \ + CMOVQCS b0, a0 \ + CMOVQCS b1, a1 \ + CMOVQCS b2, a2 \ + CMOVQCS b3, a3 \ + \ + MOVQ a0, (8*0)(res) \ + MOVQ a1, (8*1)(res) \ + MOVQ a2, (8*2)(res) \ + MOVQ a3, (8*3)(res) + +#define sm2P256SqrReductionInternal() \ + \ // First reduction step + MOVQ acc0, mul0 \ + MOVQ acc0, mul1 \ + SHLQ $32, mul0 \ + SHRQ $32, mul1 \ + \ + ADDQ acc0, acc1 \ + ADCQ $0, acc2 \ + ADCQ $0, acc3 \ + ADCQ $0, acc0 \ + \ + SUBQ mul0, acc1 \ + SBBQ mul1, acc2 \ + SBBQ mul0, acc3 \ + SBBQ mul1, acc0 \ + \ // Second reduction step + MOVQ acc1, mul0 \ + MOVQ acc1, mul1 \ + SHLQ $32, mul0 \ + SHRQ $32, mul1 \ + \ + ADDQ acc1, acc2 \ + ADCQ $0, acc3 \ + ADCQ $0, acc0 \ + ADCQ $0, acc1 \ + \ + SUBQ mul0, acc2 \ + SBBQ mul1, acc3 \ + SBBQ mul0, acc0 \ + SBBQ mul1, acc1 \ + \ // Third reduction step + MOVQ acc2, mul0 \ + MOVQ acc2, mul1 \ + SHLQ $32, mul0 \ + SHRQ $32, mul1 \ + \ + ADDQ acc2, acc3 \ + ADCQ $0, acc0 \ + ADCQ $0, acc1 \ + ADCQ $0, acc2 \ + \ + SUBQ mul0, acc3 \ + SBBQ mul1, acc0 \ + SBBQ mul0, acc1 \ + SBBQ mul1, acc2 \ + \ // Last reduction step + MOVQ acc3, mul0 \ + MOVQ acc3, mul1 \ + SHLQ $32, mul0 \ + SHRQ $32, mul1 \ + \ + ADDQ acc3, acc0 \ + ADCQ $0, acc1 \ + ADCQ $0, acc2 \ + ADCQ $0, acc3 \ + \ + SUBQ mul0, acc0 \ + SBBQ mul1, acc1 \ + SBBQ mul0, acc2 \ + SBBQ mul1, acc3 \ + MOVQ $0, mul0 \ + \ // Add bits [511:256] of the result + ADCQ acc0, t0 \ + ADCQ acc1, t1 \ + ADCQ acc2, t2 \ + ADCQ acc3, t3 \ + ADCQ $0, mul0 \ + \ // Copy result + MOVQ t0, acc4 \ + MOVQ t1, acc5 \ + 
MOVQ t2, acc6 \ + MOVQ t3, acc7 \ + \ // Subtract p256 + SUBQ $-1, acc4 \ + SBBQ p256p<>+0x08(SB), acc5 \ + SBBQ $-1, acc6 \ + SBBQ p256p<>+0x018(SB), acc7 \ + SBBQ $0, mul0 \ + \ // If the result of the subtraction is negative, restore the previous result + CMOVQCS t0, acc4 \ + CMOVQCS t1, acc5 \ + CMOVQCS t2, acc6 \ + CMOVQCS t3, acc7 + +#define p256PointDoubleInit() \ + MOVOU (16*0)(BX), X0 \ + MOVOU (16*1)(BX), X1 \ + MOVOU (16*2)(BX), X2 \ + MOVOU (16*3)(BX), X3 \ + MOVOU (16*4)(BX), X4 \ + MOVOU (16*5)(BX), X5 \ + \ + MOVOU X0, x(16*0) \ + MOVOU X1, x(16*1) \ + MOVOU X2, y(16*0) \ + MOVOU X3, y(16*1) \ + MOVOU X4, z(16*0) \ + MOVOU X5, z(16*1) + +/* ---------------------------------------*/ +// [t3, t2, t1, t0] = 2[acc7, acc6, acc5, acc4] +#define p256MulBy2Inline\ + XORQ mul0, mul0;\ + ADDQ acc4, acc4;\ + ADCQ acc5, acc5;\ + ADCQ acc6, acc6;\ + ADCQ acc7, acc7;\ + ADCQ $0, mul0;\ + MOVQ acc4, t0;\ + MOVQ acc5, t1;\ + MOVQ acc6, t2;\ + MOVQ acc7, t3;\ + SUBQ $-1, t0;\ + SBBQ p256p<>+0x08(SB), t1;\ + SBBQ $-1, t2;\ + SBBQ p256p<>+0x018(SB), t3;\ + SBBQ $0, mul0;\ + CMOVQCS acc4, t0;\ + CMOVQCS acc5, t1;\ + CMOVQCS acc6, t2;\ + CMOVQCS acc7, t3; +/* ---------------------------------------*/ +// [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] + [t3, t2, t1, t0] +#define p256AddInline \ + XORQ mul0, mul0;\ + ADDQ t0, acc4;\ + ADCQ t1, acc5;\ + ADCQ t2, acc6;\ + ADCQ t3, acc7;\ + ADCQ $0, mul0;\ + MOVQ acc4, t0;\ + MOVQ acc5, t1;\ + MOVQ acc6, t2;\ + MOVQ acc7, t3;\ + SUBQ $-1, t0;\ + SBBQ p256p<>+0x08(SB), t1;\ + SBBQ $-1, t2;\ + SBBQ p256p<>+0x018(SB), t3;\ + SBBQ $0, mul0;\ + CMOVQCS acc4, t0;\ + CMOVQCS acc5, t1;\ + CMOVQCS acc6, t2;\ + CMOVQCS acc7, t3; diff --git a/internal/sm2ec/p256_plugin_amd64.s b/internal/sm2ec/p256_plugin_amd64.s new file mode 100644 index 0000000..64b0d00 --- /dev/null +++ b/internal/sm2ec/p256_plugin_amd64.s @@ -0,0 +1,1961 @@ +// This file contains constant-time, 64-bit assembly implementation of +// P256. 
The optimizations performed here are described in detail in: +// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with +// 256-bit primes" +// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x +// https://eprint.iacr.org/2013/816.pdf +//go:build amd64 && !purego && plugin +// +build amd64,!purego,plugin + +#include "textflag.h" + +#include "p256_macro_amd64.s" + +/* ---------------------------------------*/ +// func p256Sqr(res, in *p256Element, n int) +TEXT ·p256Sqr(SB),NOSPLIT,$8-24 + MOVQ res+0(FP), res_ptr + MOVQ in+8(FP), x_ptr + MOVQ n+16(FP), BX + + CMPB ·supportBMI2+0(SB), $0x01 + JEQ sqrBMI2 + +sqrLoop: + MOVQ BX, (SP) + // y[1:] * y[0] + MOVQ (8*0)(x_ptr), t0 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + MOVQ AX, acc1 + MOVQ DX, acc2 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc4 + // y[2:] * y[1] + MOVQ (8*1)(x_ptr), t0 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ BX, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, acc5 + // y[3] * y[2] + MOVQ (8*2)(x_ptr), t0 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, y_ptr + XORQ BX, BX + // *2 + ADDQ acc1, acc1 + ADCQ acc2, acc2 + ADCQ acc3, acc3 + ADCQ acc4, acc4 + ADCQ acc5, acc5 + ADCQ y_ptr, y_ptr + ADCQ $0, BX + // Missing products + MOVQ (8*0)(x_ptr), AX + MULQ AX + MOVQ AX, acc0 + MOVQ DX, t0 + + MOVQ (8*1)(x_ptr), AX + MULQ AX + ADDQ t0, acc1 + ADCQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t0 + + MOVQ (8*2)(x_ptr), AX + MULQ AX + ADDQ t0, acc3 + ADCQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t0 + + MOVQ (8*3)(x_ptr), AX + MULQ AX + ADDQ t0, acc5 + ADCQ AX, y_ptr + ADCQ DX, BX + MOVQ BX, x_ptr + + // T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0] + p256SqrMontReduce() + p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr) + MOVQ res_ptr, x_ptr + MOVQ (SP), BX + DECQ BX + JNE sqrLoop + RET + +sqrBMI2: + MOVQ BX, (SP) + // y[1:] * y[0] + MOVQ (8*0)(x_ptr), DX + + MULXQ (8*1)(x_ptr), acc1, acc2 + + MULXQ (8*2)(x_ptr), AX, acc3 + ADDQ AX, acc2 + + MULXQ (8*3)(x_ptr), AX, acc4 + ADCQ AX, acc3 + ADCQ $0, acc4 + + // y[2:] * y[1] + MOVQ (8*1)(x_ptr), DX + + MULXQ (8*2)(x_ptr), AX, BX + ADDQ AX, acc3 + ADCQ BX, acc4 + + MULXQ (8*3)(x_ptr), AX, acc5 + ADCQ $0, acc5 + ADDQ AX, acc4 + + // y[3] * y[2] + MOVQ (8*2)(x_ptr), DX + + MULXQ (8*3)(x_ptr), AX, y_ptr + ADCQ AX, acc5 + ADCQ $0, y_ptr + XORQ BX, BX + + // *2 + ADDQ acc1, acc1 + ADCQ acc2, acc2 + ADCQ acc3, acc3 + ADCQ acc4, acc4 + ADCQ acc5, acc5 + ADCQ y_ptr, y_ptr + ADCQ $0, BX + + // Missing products + MOVQ (8*0)(x_ptr), DX + MULXQ DX, acc0, t0 + ADDQ t0, acc1 + + MOVQ (8*1)(x_ptr), DX + MULXQ DX, AX, t0 + ADCQ AX, acc2 + ADCQ t0, acc3 + + MOVQ (8*2)(x_ptr), DX + MULXQ DX, AX, t0 + ADCQ AX, acc4 + ADCQ t0, acc5 + + MOVQ (8*3)(x_ptr), DX + MULXQ DX, AX, x_ptr + ADCQ AX, y_ptr + ADCQ BX, x_ptr + + // T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0] + p256SqrMontReduce() + p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr) + MOVQ res_ptr, x_ptr + MOVQ (SP), BX + DECQ BX + JNE sqrBMI2 + RET + +/* ---------------------------------------*/ +// func p256OrdSqr(res, in *p256OrdElement, n int) +TEXT ·p256OrdSqr(SB),NOSPLIT,$8-24 + MOVQ res+0(FP), res_ptr + MOVQ in+8(FP), x_ptr + MOVQ n+16(FP), BX + + CMPB ·supportBMI2+0(SB), $0x01 + JEQ ordSqrLoopBMI2 + +ordSqrLoop: + MOVQ BX, (SP) + // 
y[1:] * y[0] + MOVQ (8*0)(x_ptr), t0 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + MOVQ AX, acc1 + MOVQ DX, acc2 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc4 + // y[2:] * y[1] + MOVQ (8*1)(x_ptr), t0 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, BX + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ BX, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, acc5 + // y[3] * y[2] + MOVQ (8*2)(x_ptr), t0 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, y_ptr + XORQ BX, BX + // *2 + ADDQ acc1, acc1 + ADCQ acc2, acc2 + ADCQ acc3, acc3 + ADCQ acc4, acc4 + ADCQ acc5, acc5 + ADCQ y_ptr, y_ptr + ADCQ $0, BX + // Missing products + MOVQ (8*0)(x_ptr), AX + MULQ AX + MOVQ AX, acc0 + MOVQ DX, t0 + + MOVQ (8*1)(x_ptr), AX + MULQ AX + ADDQ t0, acc1 + ADCQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t0 + + MOVQ (8*2)(x_ptr), AX + MULQ AX + ADDQ t0, acc3 + ADCQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t0 + + MOVQ (8*3)(x_ptr), AX + MULQ AX + ADDQ t0, acc5 + ADCQ AX, y_ptr + ADCQ DX, BX + MOVQ BX, x_ptr + + // T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0] + // First reduction step, [ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0] + MOVQ acc0, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64 + // calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0] + // the result is [acc0, acc3, acc2, acc1], last lowest limb is dropped. + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0) + ADCQ $0, DX // DX = carry1 + H(t0 * ord0) + MOVQ DX, BX // BX = carry1 + H(t0 * ord0) + MOVQ t0, acc0 // acc0 = t0 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ BX, acc1 // (carry2, acc1) = acc1 + BX + ADCQ $0, DX // DX = carry2 + H(t0*ord1) + + ADDQ AX, acc1 // (carry3, acc1) = acc1 + BX + L(t0*ord1) + ADCQ DX, acc2 + ADCQ $0, acc3 + ADCQ $0, acc0 + // calculate the positive part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0 + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc2 + SBBQ AX, acc3 + SBBQ DX, acc0 + // Second reduction step + MOVQ acc1, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, BX + MOVQ t0, acc1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ BX, acc2 + ADCQ $0, DX + + ADDQ AX, acc2 + ADCQ DX, acc3 + ADCQ $0, acc0 + ADCQ $0, acc1 + + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc3 + SBBQ AX, acc0 + SBBQ DX, acc1 + // Third reduction step + MOVQ acc2, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, BX + MOVQ t0, acc2 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ BX, acc3 + ADCQ $0, DX + + ADDQ AX, acc3 + ADCQ DX, acc0 + ADCQ $0, acc1 + ADCQ $0, acc2 + + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc0 + SBBQ AX, acc1 + SBBQ DX, acc2 + // Last reduction step + MOVQ acc3, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, BX + MOVQ t0, acc3 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ BX, acc0 + ADCQ $0, DX + + ADDQ AX, acc0 + ADCQ DX, acc1 + ADCQ $0, acc2 + ADCQ $0, acc3 + + MOVQ t0, AX + MOVQ t0, DX + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc1 + SBBQ AX, acc2 + SBBQ DX, acc3 + + XORQ t0, t0 + // Add bits [511:256] of the sqr result + 
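+ // t0 picks up the final carry, so p256OrdReduceInline below sees a 5-limb value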
ADCQ acc4, acc0 + ADCQ acc5, acc1 + ADCQ y_ptr, acc2 + ADCQ x_ptr, acc3 + ADCQ $0, t0 + + p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr) + MOVQ res_ptr, x_ptr + MOVQ (SP), BX + DECQ BX + JNE ordSqrLoop + + RET + +ordSqrLoopBMI2: + MOVQ BX, (SP) + // y[1:] * y[0] + MOVQ (8*0)(x_ptr), DX + MULXQ (8*1)(x_ptr), acc1, acc2 + + MULXQ (8*2)(x_ptr), AX, acc3 + ADDQ AX, acc2 + ADCQ $0, acc3 + + MULXQ (8*3)(x_ptr), AX, acc4 + ADDQ AX, acc3 + ADCQ $0, acc4 + + // y[2:] * y[1] + MOVQ (8*1)(x_ptr), DX + MULXQ (8*2)(x_ptr), AX, BX + ADDQ AX, acc3 + ADCQ BX, acc4 + + MULXQ (8*3)(x_ptr), AX, acc5 + ADCQ $0, acc5 + ADDQ AX, acc4 + ADCQ $0, acc5 + + // y[3] * y[2] + MOVQ (8*2)(x_ptr), DX + MULXQ (8*3)(x_ptr), AX, y_ptr + ADDQ AX, acc5 + ADCQ $0, y_ptr + + XORQ BX, BX + // *2 + ADDQ acc1, acc1 + ADCQ acc2, acc2 + ADCQ acc3, acc3 + ADCQ acc4, acc4 + ADCQ acc5, acc5 + ADCQ y_ptr, y_ptr + ADCQ $0, BX + + // Missing products + MOVQ (8*0)(x_ptr), DX + MULXQ DX, acc0, t0 + ADDQ t0, acc1 + + MOVQ (8*1)(x_ptr), DX + MULXQ DX, AX, t0 + ADCQ AX, acc2 + ADCQ t0, acc3 + + MOVQ (8*2)(x_ptr), DX + MULXQ DX, AX, t0 + ADCQ AX, acc4 + ADCQ t0, acc5 + + MOVQ (8*3)(x_ptr), DX + MULXQ DX, AX, x_ptr + ADCQ AX, y_ptr + ADCQ BX, x_ptr + + // T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0] + // First reduction step, [ord3, ord2, ord1, ord0] = [1, -0x100000000, -1, ord1, ord0] + MOVQ acc0, DX + MULXQ p256ordK0<>(SB), t0, AX + // calculate the positive part first: [1, 0, 0, ord1, ord0] * t0 + [0, acc3, acc2, acc1, acc0] + // the result is [acc0, acc3, acc2, acc1], last lowest limb is dropped. + MOVQ t0, DX // Y = t0 = (k0 * acc0) mod 2^64 + MULXQ p256ord<>+0x00(SB), AX, BX + ADDQ AX, acc0 // (carry1, acc0) = acc0 + L(t0 * ord0) + ADCQ BX, acc1 // (carry2, acc1) = acc1 + H(t0 * ord0) + carry1 + MOVQ t0, acc0 // acc0 = t0 + + MULXQ p256ord<>+0x08(SB), AX, BX + ADCQ $0, BX // BX = carry2 + H(t0*ord1) + ADDQ AX, acc1 // (carry3, acc1) = acc1 + L(t0*ord1) + ADCQ BX, acc2 // (carry4, acc2) = acc2 + BX + carry3 + ADCQ $0, acc3 // (carry5, acc3) = acc3 + carry4 + ADCQ $0, acc0 // acc0 = t0 + carry5 + // calculate the positive part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0 + MOVQ t0, AX + //MOVQ t0, DX // This is not required due to t0=DX already + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc2 + SBBQ AX, acc3 + SBBQ DX, acc0 + + // Second reduction step + MOVQ acc1, DX + MULXQ p256ordK0<>(SB), t0, AX + + MOVQ t0, DX + MULXQ p256ord<>+0x00(SB), AX, BX + ADDQ AX, acc1 + ADCQ BX, acc2 + MOVQ t0, acc1 + + MULXQ p256ord<>+0x08(SB), AX, BX + ADCQ $0, BX + ADDQ AX, acc2 + ADCQ BX, acc3 + ADCQ $0, acc0 + ADCQ $0, acc1 + + MOVQ t0, AX + //MOVQ t0, DX // This is not required due to t0=DX already + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc3 + SBBQ AX, acc0 + SBBQ DX, acc1 + // Third reduction step + MOVQ acc2, DX + MULXQ p256ordK0<>(SB), t0, AX + + MOVQ t0, DX + MULXQ p256ord<>+0x00(SB), AX, BX + ADDQ AX, acc2 + ADCQ BX, acc3 + MOVQ t0, acc2 + + MULXQ p256ord<>+0x08(SB), AX, BX + ADCQ $0, BX + ADDQ AX, acc3 + ADCQ BX, acc0 + ADCQ $0, acc1 + ADCQ $0, acc2 + + MOVQ t0, AX + //MOVQ t0, DX // This is not required due to t0=DX already + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc0 + SBBQ AX, acc1 + SBBQ DX, acc2 + // Last reduction step + MOVQ acc3, DX + MULXQ p256ordK0<>(SB), t0, AX + + MOVQ t0, DX + MULXQ p256ord<>+0x00(SB), AX, BX + ADDQ AX, acc3 + ADCQ BX, acc0 + MOVQ t0, acc3 + + MULXQ p256ord<>+0x08(SB), AX, BX + ADCQ $0, BX + ADDQ AX, acc0 + ADCQ BX, acc1 + ADCQ $0, acc2 + ADCQ $0, acc3 + + 
MOVQ t0, AX + //MOVQ t0, DX // This is not required due to t0=DX already + SHLQ $32, AX + SHRQ $32, DX + + SUBQ t0, acc1 + SBBQ AX, acc2 + SBBQ DX, acc3 + + XORQ t0, t0 + // Add bits [511:256] of the sqr result + ADCQ acc4, acc0 + ADCQ acc5, acc1 + ADCQ y_ptr, acc2 + ADCQ x_ptr, acc3 + ADCQ $0, t0 + + p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr) + MOVQ res_ptr, x_ptr + MOVQ (SP), BX + DECQ BX + JNE ordSqrLoopBMI2 + + RET + +/* ---------------------------------------*/ +#undef res_ptr +#undef x_ptr +#undef y_ptr + +#undef acc0 +#undef acc1 +#undef acc2 +#undef acc3 +#undef acc4 +#undef acc5 +#undef t0 +/* ---------------------------------------*/ +#define mul0 AX +#define mul1 DX +#define acc0 BX +#define acc1 CX +#define acc2 R8 +#define acc3 BP +#define acc4 R10 +#define acc5 R11 +#define acc6 R12 +#define acc7 R13 +#define t0 R14 +#define t1 DI +#define t2 SI +#define t3 R9 + +/* ---------------------------------------*/ +// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0] +TEXT sm2P256SubInternal(SB),NOSPLIT,$0 + XORQ mul0, mul0 + SUBQ t0, acc4 + SBBQ t1, acc5 + SBBQ t2, acc6 + SBBQ t3, acc7 + SBBQ $0, mul0 + + MOVQ acc4, acc0 + MOVQ acc5, acc1 + MOVQ acc6, acc2 + MOVQ acc7, acc3 + + ADDQ $-1, acc4 + ADCQ p256p<>+0x08(SB), acc5 + ADCQ $-1, acc6 + ADCQ p256p<>+0x018(SB), acc7 + ANDQ $1, mul0 + + CMOVQEQ acc0, acc4 + CMOVQEQ acc1, acc5 + CMOVQEQ acc2, acc6 + CMOVQEQ acc3, acc7 + + RET +/* ---------------------------------------*/ +// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] * [t3, t2, t1, t0] +TEXT sm2P256MulInternal(SB),NOSPLIT,$8 + CMPB ·supportBMI2+0(SB), $0x01 + JEQ internalMulBMI2 + + MOVQ acc4, mul0 + MULQ t0 + MOVQ mul0, X0 + MOVQ mul1, acc1 + + MOVQ acc4, mul0 + MULQ t1 + ADDQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, acc2 + + MOVQ acc4, mul0 + MULQ t2 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, acc3 + + MOVQ acc4, mul0 + MULQ t3 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, acc4 + + MOVQ acc5, mul0 + MULQ t0 + ADDQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, acc0 + + MOVQ acc5, mul0 + MULQ t1 + ADDQ acc0, acc2 + ADCQ $0, mul1 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, acc0 + + MOVQ acc5, mul0 + MULQ t2 + ADDQ acc0, acc3 + ADCQ $0, mul1 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, acc0 + + MOVQ acc5, mul0 + MULQ t3 + ADDQ acc0, acc4 + ADCQ $0, mul1 + ADDQ mul0, acc4 + ADCQ $0, mul1 + MOVQ mul1, acc5 + + MOVQ acc6, mul0 + MULQ t0 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, acc0 + + MOVQ acc6, mul0 + MULQ t1 + ADDQ acc0, acc3 + ADCQ $0, mul1 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, acc0 + + MOVQ acc6, mul0 + MULQ t2 + ADDQ acc0, acc4 + ADCQ $0, mul1 + ADDQ mul0, acc4 + ADCQ $0, mul1 + MOVQ mul1, acc0 + + MOVQ acc6, mul0 + MULQ t3 + ADDQ acc0, acc5 + ADCQ $0, mul1 + ADDQ mul0, acc5 + ADCQ $0, mul1 + MOVQ mul1, acc6 + + MOVQ acc7, mul0 + MULQ t0 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, acc0 + + MOVQ acc7, mul0 + MULQ t1 + ADDQ acc0, acc4 + ADCQ $0, mul1 + ADDQ mul0, acc4 + ADCQ $0, mul1 + MOVQ mul1, acc0 + + MOVQ acc7, mul0 + MULQ t2 + ADDQ acc0, acc5 + ADCQ $0, mul1 + ADDQ mul0, acc5 + ADCQ $0, mul1 + MOVQ mul1, acc0 + + MOVQ acc7, mul0 + MULQ t3 + ADDQ acc0, acc6 + ADCQ $0, mul1 + ADDQ mul0, acc6 + ADCQ $0, mul1 + MOVQ mul1, acc7 + // First reduction step + PEXTRQ $0, X0, acc0 + MOVQ acc0, mul0 + MOVQ acc0, mul1 + SHLQ $32, mul0 + SHRQ $32, mul1 + + ADDQ acc0, acc1 + ADCQ $0, acc2 + ADCQ $0, acc3 + ADCQ $0, acc0 + + SUBQ mul0, acc1 + SBBQ mul1, acc2 + SBBQ mul0, acc3 + SBBQ mul1, acc0 + // Second 
reduction step + MOVQ acc1, mul0 + MOVQ acc1, mul1 + SHLQ $32, mul0 + SHRQ $32, mul1 + + ADDQ acc1, acc2 + ADCQ $0, acc3 + ADCQ $0, acc0 + ADCQ $0, acc1 + + SUBQ mul0, acc2 + SBBQ mul1, acc3 + SBBQ mul0, acc0 + SBBQ mul1, acc1 + // Third reduction step + MOVQ acc2, mul0 + MOVQ acc2, mul1 + SHLQ $32, mul0 + SHRQ $32, mul1 + + ADDQ acc2, acc3 + ADCQ $0, acc0 + ADCQ $0, acc1 + ADCQ $0, acc2 + + SUBQ mul0, acc3 + SBBQ mul1, acc0 + SBBQ mul0, acc1 + SBBQ mul1, acc2 + // Last reduction step + MOVQ acc3, mul0 + MOVQ acc3, mul1 + SHLQ $32, mul0 + SHRQ $32, mul1 + + ADDQ acc3, acc0 + ADCQ $0, acc1 + ADCQ $0, acc2 + ADCQ $0, acc3 + + SUBQ mul0, acc0 + SBBQ mul1, acc1 + SBBQ mul0, acc2 + SBBQ mul1, acc3 + MOVQ $0, mul0 + // Add bits [511:256] of the result + ADCQ acc0, acc4 + ADCQ acc1, acc5 + ADCQ acc2, acc6 + ADCQ acc3, acc7 + ADCQ $0, mul0 + // Copy result + MOVQ acc4, acc0 + MOVQ acc5, acc1 + MOVQ acc6, acc2 + MOVQ acc7, acc3 + // Subtract p256 + SUBQ $-1, acc4 + SBBQ p256p<>+0x08(SB), acc5 + SBBQ $-1, acc6 + SBBQ p256p<>+0x018(SB), acc7 + SBBQ $0, mul0 + // If the result of the subtraction is negative, restore the previous result + CMOVQCS acc0, acc4 + CMOVQCS acc1, acc5 + CMOVQCS acc2, acc6 + CMOVQCS acc3, acc7 + + RET +internalMulBMI2: + MOVQ acc4, mul1 + MULXQ t0, acc0, acc1 + MOVQ acc0, X0 + + MULXQ t1, mul0, acc2 + ADDQ mul0, acc1 + + MULXQ t2, mul0, acc3 + ADCQ mul0, acc2 + + MULXQ t3, mul0, acc4 + ADCQ mul0, acc3 + ADCQ $0, acc4 + + MOVQ acc5, mul1 + MULXQ t0, mul0, acc0 + ADDQ mul0, acc1 + ADCQ acc0, acc2 + + MULXQ t1, mul0, acc0 + ADCQ $0, acc0 + ADDQ mul0, acc2 + ADCQ acc0, acc3 + + MULXQ t2, mul0, acc0 + ADCQ $0, acc0 + ADDQ mul0, acc3 + ADCQ acc0, acc4 + + MULXQ t3, mul0, acc5 + ADCQ $0, acc5 + ADDQ mul0, acc4 + ADCQ $0, acc5 + + MOVQ acc6, mul1 + MULXQ t0, mul0, acc0 + ADDQ mul0, acc2 + ADCQ acc0, acc3 + + MULXQ t1, mul0, acc0 + ADCQ $0, acc0 + ADDQ mul0, acc3 + ADCQ acc0, acc4 + + MULXQ t2, mul0, acc0 + ADCQ $0, acc0 + ADDQ mul0, acc4 + ADCQ acc0, acc5 + + MULXQ t3, mul0, acc6 + ADCQ $0, acc6 + ADDQ mul0, acc5 + ADCQ $0, acc6 + + MOVQ acc7, mul1 + MULXQ t0, mul0, acc0 + ADDQ mul0, acc3 + ADCQ acc0, acc4 + + MULXQ t1, mul0, acc0 + ADCQ $0, acc0 + ADDQ mul0, acc4 + ADCQ acc0, acc5 + + MULXQ t2, mul0, acc0 + ADCQ $0, acc0 + ADDQ mul0, acc5 + ADCQ acc0, acc6 + + MULXQ t3, mul0, acc7 + ADCQ $0, acc7 + ADDQ mul0, acc6 + ADCQ $0, acc7 + + // First reduction step + PEXTRQ $0, X0, acc0 + MOVQ acc0, mul0 + MOVQ acc0, mul1 + SHLQ $32, mul0 + SHRQ $32, mul1 + + ADDQ acc0, acc1 + ADCQ $0, acc2 + ADCQ $0, acc3 + ADCQ $0, acc0 + + SUBQ mul0, acc1 + SBBQ mul1, acc2 + SBBQ mul0, acc3 + SBBQ mul1, acc0 + // Second reduction step + MOVQ acc1, mul0 + MOVQ acc1, mul1 + SHLQ $32, mul0 + SHRQ $32, mul1 + + ADDQ acc1, acc2 + ADCQ $0, acc3 + ADCQ $0, acc0 + ADCQ $0, acc1 + + SUBQ mul0, acc2 + SBBQ mul1, acc3 + SBBQ mul0, acc0 + SBBQ mul1, acc1 + // Third reduction step + MOVQ acc2, mul0 + MOVQ acc2, mul1 + SHLQ $32, mul0 + SHRQ $32, mul1 + + ADDQ acc2, acc3 + ADCQ $0, acc0 + ADCQ $0, acc1 + ADCQ $0, acc2 + + SUBQ mul0, acc3 + SBBQ mul1, acc0 + SBBQ mul0, acc1 + SBBQ mul1, acc2 + // Last reduction step + MOVQ acc3, mul0 + MOVQ acc3, mul1 + SHLQ $32, mul0 + SHRQ $32, mul1 + + ADDQ acc3, acc0 + ADCQ $0, acc1 + ADCQ $0, acc2 + ADCQ $0, acc3 + + SUBQ mul0, acc0 + SBBQ mul1, acc1 + SBBQ mul0, acc2 + SBBQ mul1, acc3 + MOVQ $0, mul0 + // Add bits [511:256] of the result + ADCQ acc0, acc4 + ADCQ acc1, acc5 + ADCQ acc2, acc6 + ADCQ acc3, acc7 + ADCQ $0, mul0 + // Copy result + MOVQ acc4, acc0 + MOVQ acc5, acc1 + 
MOVQ acc6, acc2 + MOVQ acc7, acc3 + // Subtract p256 + SUBQ $-1, acc4 + SBBQ p256p<>+0x08(SB), acc5 + SBBQ $-1, acc6 + SBBQ p256p<>+0x018(SB), acc7 + SBBQ $0, mul0 + // If the result of the subtraction is negative, restore the previous result + CMOVQCS acc0, acc4 + CMOVQCS acc1, acc5 + CMOVQCS acc2, acc6 + CMOVQCS acc3, acc7 + + RET + +/* ---------------------------------------*/ +// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4]^2 +TEXT sm2P256SqrInternal(SB),NOSPLIT,$8 + CMPB ·supportBMI2+0(SB), $0x01 + JEQ internalSqrBMI2 + + MOVQ acc4, mul0 + MULQ acc5 + MOVQ mul0, acc1 + MOVQ mul1, acc2 + + MOVQ acc4, mul0 + MULQ acc6 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, acc3 + + MOVQ acc4, mul0 + MULQ acc7 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, t0 + + MOVQ acc5, mul0 + MULQ acc6 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, acc0 + + MOVQ acc5, mul0 + MULQ acc7 + ADDQ acc0, t0 + ADCQ $0, mul1 + ADDQ mul0, t0 + ADCQ $0, mul1 + MOVQ mul1, t1 + + MOVQ acc6, mul0 + MULQ acc7 + ADDQ mul0, t1 + ADCQ $0, mul1 + MOVQ mul1, t2 + XORQ t3, t3 + // *2 + ADDQ acc1, acc1 + ADCQ acc2, acc2 + ADCQ acc3, acc3 + ADCQ t0, t0 + ADCQ t1, t1 + ADCQ t2, t2 + ADCQ $0, t3 + // Missing products + MOVQ acc4, mul0 + MULQ mul0 + MOVQ mul0, acc0 + MOVQ DX, acc4 + + MOVQ acc5, mul0 + MULQ mul0 + ADDQ acc4, acc1 + ADCQ mul0, acc2 + ADCQ $0, DX + MOVQ DX, acc4 + + MOVQ acc6, mul0 + MULQ mul0 + ADDQ acc4, acc3 + ADCQ mul0, t0 + ADCQ $0, DX + MOVQ DX, acc4 + + MOVQ acc7, mul0 + MULQ mul0 + ADDQ acc4, t1 + ADCQ mul0, t2 + ADCQ DX, t3 + // T = [t3, t2,, t1, t0, acc3, acc2, acc1, acc0] + sm2P256SqrReductionInternal() + RET + +internalSqrBMI2: + MOVQ acc4, mul1 + MULXQ acc5, acc1, acc2 + + MULXQ acc6, mul0, acc3 + ADDQ mul0, acc2 + + MULXQ acc7, mul0, t0 + ADCQ mul0, acc3 + ADCQ $0, t0 + + MOVQ acc5, mul1 + MULXQ acc6, mul0, acc0 + ADDQ mul0, acc3 + ADCQ acc0, t0 + + MULXQ acc7, mul0, t1 + ADCQ $0, t1 + ADDQ mul0, t0 + + MOVQ acc6, mul1 + MULXQ acc7, mul0, t2 + ADCQ mul0, t1 + ADCQ $0, t2 + XORQ t3, t3 + + // *2 + ADDQ acc1, acc1 + ADCQ acc2, acc2 + ADCQ acc3, acc3 + ADCQ t0, t0 + ADCQ t1, t1 + ADCQ t2, t2 + ADCQ $0, t3 + + // Missing products + MOVQ acc4, mul1 + MULXQ mul1, acc0, acc4 + ADDQ acc4, acc1 + + MOVQ acc5, mul1 + MULXQ mul1, mul0, acc4 + ADCQ mul0, acc2 + ADCQ acc4, acc3 + + MOVQ acc6, mul1 + MULXQ mul1, mul0, acc4 + ADCQ mul0, t0 + ADCQ acc4, t1 + + MOVQ acc7, mul1 + MULXQ mul1, mul0, acc4 + ADCQ mul0, t2 + ADCQ acc4, t3 + // T = [t3, t2,, t1, t0, acc3, acc2, acc1, acc0] + sm2P256SqrReductionInternal() + + RET + +/* ---------------------------------------*/ +#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7 +#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3 +#define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3) +#define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3) +#define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3 +#define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7 +/* ---------------------------------------*/ +#define x1in(off) (32*0 + off)(SP) +#define y1in(off) (32*1 + off)(SP) +#define z1in(off) (32*2 + off)(SP) +#define x2in(off) (32*3 + off)(SP) +#define y2in(off) (32*4 + off)(SP) +#define xout(off) (32*5 + off)(SP) +#define yout(off) (32*6 + off)(SP) +#define zout(off) (32*7 + off)(SP) +#define s2(off) (32*8 + off)(SP) +#define z1sqr(off) (32*9 + off)(SP) +#define h(off) (32*10 
+ off)(SP) +#define r(off) (32*11 + off)(SP) +#define hsqr(off) (32*12 + off)(SP) +#define rsqr(off) (32*13 + off)(SP) +#define hcub(off) (32*14 + off)(SP) +#define rptr (32*15)(SP) +#define sel_save (32*15 + 8)(SP) +#define zero_save (32*15 + 8 + 4)(SP) + +#define p256PointAddAffineInline() \ + \// Store pointer to result + MOVQ mul0, rptr \ + MOVL t1, sel_save \ + MOVL t2, zero_save \ + \// Negate y2in based on sign + MOVQ (16*2 + 8*0)(CX), acc4 \ + MOVQ (16*2 + 8*1)(CX), acc5 \ + MOVQ (16*2 + 8*2)(CX), acc6 \ + MOVQ (16*2 + 8*3)(CX), acc7 \ + MOVQ $-1, acc0 \ + MOVQ p256p<>+0x08(SB), acc1 \ + MOVQ $-1, acc2 \ + MOVQ p256p<>+0x018(SB), acc3 \ + XORQ mul0, mul0 \ + \// Speculatively subtract + SUBQ acc4, acc0 \ + SBBQ acc5, acc1 \ + SBBQ acc6, acc2 \ + SBBQ acc7, acc3 \ + SBBQ $0, mul0 \ + MOVQ acc0, t0 \ + MOVQ acc1, t1 \ + MOVQ acc2, t2 \ + MOVQ acc3, t3 \ + \// Add in case the operand was > p256 + ADDQ $-1, acc0 \ + ADCQ p256p<>+0x08(SB), acc1 \ + ADCQ $-1, acc2 \ + ADCQ p256p<>+0x018(SB), acc3 \ + ADCQ $0, mul0 \ + CMOVQNE t0, acc0 \ + CMOVQNE t1, acc1 \ + CMOVQNE t2, acc2 \ + CMOVQNE t3, acc3 \ + \// If condition is 0, keep original value + TESTQ DX, DX \ + CMOVQEQ acc4, acc0 \ + CMOVQEQ acc5, acc1 \ + CMOVQEQ acc6, acc2 \ + CMOVQEQ acc7, acc3 \ + \// Store result + MOVQ acc0, y2in(8*0) \ + MOVQ acc1, y2in(8*1) \ + MOVQ acc2, y2in(8*2) \ + MOVQ acc3, y2in(8*3) \ + \// Begin point add + LDacc (z1in) \ + CALL sm2P256SqrInternal(SB) \// z1ˆ2 + ST (z1sqr) \ + \ + LDt (x2in) \ + CALL sm2P256MulInternal(SB) \// x2 * z1ˆ2 + \ + LDt (x1in) \ + CALL sm2P256SubInternal(SB) \// h = u2 - u1 + ST (h) \ + \ + LDt (z1in) \ + CALL sm2P256MulInternal(SB) \// z3 = h * z1 + ST (zout) \ + \ + LDacc (z1sqr) \ + CALL sm2P256MulInternal(SB) \// z1ˆ3 + \ + LDt (y2in) \ + CALL sm2P256MulInternal(SB) \// s2 = y2 * z1ˆ3 + ST (s2) \ + \ + LDt (y1in) \ + CALL sm2P256SubInternal(SB) \// r = s2 - s1 + ST (r) \ + \ + CALL sm2P256SqrInternal(SB) \// rsqr = rˆ2 + ST (rsqr) \ + \ + LDacc (h) \ + CALL sm2P256SqrInternal(SB) \// hsqr = hˆ2 + ST (hsqr) \ + \ + LDt (h) \ + CALL sm2P256MulInternal(SB) \// hcub = hˆ3 + ST (hcub) \ + \ + LDt (y1in) \ + CALL sm2P256MulInternal(SB) \// y1 * hˆ3 + ST (s2) \ + \ + LDacc (x1in) \ + LDt (hsqr) \ + CALL sm2P256MulInternal(SB) \// u1 * hˆ2 + ST (h) \ + \ + p256MulBy2Inline \// u1 * hˆ2 * 2, inline + LDacc (rsqr) \ + CALL sm2P256SubInternal(SB) \// rˆ2 - u1 * hˆ2 * 2 + \ + LDt (hcub) \ + CALL sm2P256SubInternal(SB) \ + ST (xout) \ + \ + MOVQ acc4, t0 \ + MOVQ acc5, t1 \ + MOVQ acc6, t2 \ + MOVQ acc7, t3 \ + LDacc (h) \ + CALL sm2P256SubInternal(SB) \ + \ + LDt (r) \ + CALL sm2P256MulInternal(SB) \ + \ + LDt (s2) \ + CALL sm2P256SubInternal(SB) \ + ST (yout) \ + \// Load stored values from stack + MOVQ rptr, AX \ + MOVL sel_save, BX \ + MOVL zero_save, CX \ + +// func p256PointAddAffineAsm(res, in1 *SM2P256Point, in2 *p256AffinePoint, sign, sel, zero int) +TEXT ·p256PointAddAffineAsm(SB),0,$512-48 + // Move input to stack in order to free registers + MOVQ res+0(FP), AX + MOVQ in1+8(FP), BX + MOVQ in2+16(FP), CX + MOVQ sign+24(FP), DX + MOVQ sel+32(FP), t1 + MOVQ zero+40(FP), t2 + + CMPB ·supportAVX2+0(SB), $0x01 + JEQ pointaddaffine_avx2 + + MOVOU (16*0)(BX), X0 + MOVOU (16*1)(BX), X1 + MOVOU (16*2)(BX), X2 + MOVOU (16*3)(BX), X3 + MOVOU (16*4)(BX), X4 + MOVOU (16*5)(BX), X5 + + MOVOU X0, x1in(16*0) + MOVOU X1, x1in(16*1) + MOVOU X2, y1in(16*0) + MOVOU X3, y1in(16*1) + MOVOU X4, z1in(16*0) + MOVOU X5, z1in(16*1) + + MOVOU (16*0)(CX), X0 + MOVOU (16*1)(CX), X1 + + MOVOU X0, x2in(16*0) 
+ MOVOU X1, x2in(16*1) + + p256PointAddAffineInline() + // The result is not valid if (sel == 0), conditional choose + MOVOU xout(16*0), X0 + MOVOU xout(16*1), X1 + MOVOU yout(16*0), X2 + MOVOU yout(16*1), X3 + MOVOU zout(16*0), X4 + MOVOU zout(16*1), X5 + + MOVL BX, X6 + MOVL CX, X7 + + PXOR X8, X8 + PCMPEQL X9, X9 + + PSHUFD $0, X6, X6 + PSHUFD $0, X7, X7 + + PCMPEQL X8, X6 + PCMPEQL X8, X7 + + MOVOU X6, X15 + PANDN X9, X15 + + MOVOU x1in(16*0), X9 + MOVOU x1in(16*1), X10 + MOVOU y1in(16*0), X11 + MOVOU y1in(16*1), X12 + MOVOU z1in(16*0), X13 + MOVOU z1in(16*1), X14 + + PAND X15, X0 + PAND X15, X1 + PAND X15, X2 + PAND X15, X3 + PAND X15, X4 + PAND X15, X5 + + PAND X6, X9 + PAND X6, X10 + PAND X6, X11 + PAND X6, X12 + PAND X6, X13 + PAND X6, X14 + + PXOR X9, X0 + PXOR X10, X1 + PXOR X11, X2 + PXOR X12, X3 + PXOR X13, X4 + PXOR X14, X5 + // Similarly if zero == 0 + PCMPEQL X9, X9 + MOVOU X7, X15 + PANDN X9, X15 + + MOVOU x2in(16*0), X9 + MOVOU x2in(16*1), X10 + MOVOU y2in(16*0), X11 + MOVOU y2in(16*1), X12 + MOVOU p256one<>+0x00(SB), X13 + MOVOU p256one<>+0x10(SB), X14 + + PAND X15, X0 + PAND X15, X1 + PAND X15, X2 + PAND X15, X3 + PAND X15, X4 + PAND X15, X5 + + PAND X7, X9 + PAND X7, X10 + PAND X7, X11 + PAND X7, X12 + PAND X7, X13 + PAND X7, X14 + + PXOR X9, X0 + PXOR X10, X1 + PXOR X11, X2 + PXOR X12, X3 + PXOR X13, X4 + PXOR X14, X5 + // Finally output the result + MOVOU X0, (16*0)(AX) + MOVOU X1, (16*1)(AX) + MOVOU X2, (16*2)(AX) + MOVOU X3, (16*3)(AX) + MOVOU X4, (16*4)(AX) + MOVOU X5, (16*5)(AX) + MOVQ $0, rptr + + RET +pointaddaffine_avx2: + VMOVDQU (32*0)(BX), Y0 + VMOVDQU (32*1)(BX), Y1 + VMOVDQU (32*2)(BX), Y2 + + VMOVDQU Y0, x1in(32*0) + VMOVDQU Y1, y1in(32*0) + VMOVDQU Y2, z1in(32*0) + + VMOVDQU (32*0)(CX), Y0 + VMOVDQU Y0, x2in(32*0) + + p256PointAddAffineInline() + // The result is not valid if (sel == 0), conditional choose + MOVL BX, X6 + MOVL CX, X7 + + VPXOR Y8, Y8, Y8 + VPCMPEQD Y9, Y9, Y9 + + VPBROADCASTD X6, Y6 + VPBROADCASTD X7, Y7 + + VPCMPEQD Y8, Y6, Y6 + VPCMPEQD Y8, Y7, Y7 + + VMOVDQU Y6, Y15 + VPANDN Y9, Y15, Y15 + + VPAND xout(32*0), Y15, Y0 + VPAND yout(32*0), Y15, Y1 + VPAND zout(32*0), Y15, Y2 + + VPAND x1in(32*0), Y6, Y9 + VPAND y1in(32*0), Y6, Y10 + VPAND z1in(32*0), Y6, Y11 + + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y2, Y2 + + // Similarly if zero == 0 + VPCMPEQD Y9, Y9, Y9 + VPANDN Y9, Y7, Y15 + + VPAND Y15, Y0, Y0 + VPAND Y15, Y1, Y1 + VPAND Y15, Y2, Y2 + + VPAND x2in(32*0), Y7, Y9 + VPAND y2in(32*0), Y7, Y10 + VPAND p256one<>+0x00(SB), Y7, Y11 + + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y2, Y2 + + // Finally output the result + VMOVDQU Y0, (32*0)(AX) + VMOVDQU Y1, (32*1)(AX) + VMOVDQU Y2, (32*2)(AX) + MOVQ $0, rptr + + VZEROUPPER + RET +#undef x1in +#undef y1in +#undef z1in +#undef x2in +#undef y2in +#undef xout +#undef yout +#undef zout +#undef s2 +#undef z1sqr +#undef h +#undef r +#undef hsqr +#undef rsqr +#undef hcub +#undef rptr +#undef sel_save +#undef zero_save + +// sm2P256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero +// otherwise. It writes to [acc4..acc7], t0 and t1. +TEXT sm2P256IsZero(SB),NOSPLIT,$0 + // AX contains a flag that is set if the input is zero. + XORQ AX, AX + MOVQ $1, t1 + + // Check whether [acc4..acc7] are all zero. + MOVQ acc4, t0 + ORQ acc5, t0 + ORQ acc6, t0 + ORQ acc7, t0 + + // Set the zero flag if so. (CMOV of a constant to a register doesn't + // appear to be supported in Go. Thus t1 = 1.) + CMOVQEQ t1, AX + + // XOR [acc4..acc7] with P and compare with zero again. 
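+ // (the comparison against P covers the case where the limbs hold P itself, which is also congruent to zero)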
+ XORQ $-1, acc4 + XORQ p256p<>+0x08(SB), acc5 + XORQ $-1, acc6 + XORQ p256p<>+0x018(SB), acc7 + ORQ acc5, acc4 + ORQ acc6, acc4 + ORQ acc7, acc4 + + // Set the zero flag if so. + CMOVQEQ t1, AX + RET + +/* ---------------------------------------*/ +#define x1in(off) (32*0 + off)(SP) +#define y1in(off) (32*1 + off)(SP) +#define z1in(off) (32*2 + off)(SP) +#define x2in(off) (32*3 + off)(SP) +#define y2in(off) (32*4 + off)(SP) +#define z2in(off) (32*5 + off)(SP) + +#define xout(off) (32*6 + off)(SP) +#define yout(off) (32*7 + off)(SP) +#define zout(off) (32*8 + off)(SP) + +#define u1(off) (32*9 + off)(SP) +#define u2(off) (32*10 + off)(SP) +#define s1(off) (32*11 + off)(SP) +#define s2(off) (32*12 + off)(SP) +#define z1sqr(off) (32*13 + off)(SP) +#define z2sqr(off) (32*14 + off)(SP) +#define h(off) (32*15 + off)(SP) +#define r(off) (32*16 + off)(SP) +#define hsqr(off) (32*17 + off)(SP) +#define rsqr(off) (32*18 + off)(SP) +#define hcub(off) (32*19 + off)(SP) +#define rptr (32*20)(SP) +#define points_eq (32*20+8)(SP) + +#define p256PointAddInline() \ + \// Begin point add + LDacc (z2in) \ + CALL sm2P256SqrInternal(SB) \// z2ˆ2 + ST (z2sqr) \ + LDt (z2in) \ + CALL sm2P256MulInternal(SB) \// z2ˆ3 + LDt (y1in) \ + CALL sm2P256MulInternal(SB) \// s1 = z2ˆ3*y1 + ST (s1) \ + \ + LDacc (z1in) \ + CALL sm2P256SqrInternal(SB) \// z1ˆ2 + ST (z1sqr) \ + LDt (z1in) \ + CALL sm2P256MulInternal(SB) \// z1ˆ3 + LDt (y2in) \ + CALL sm2P256MulInternal(SB) \// s2 = z1ˆ3*y2 + ST (s2) \ + \ + LDt (s1) \ + CALL sm2P256SubInternal(SB) \// r = s2 - s1 + ST (r) \ + CALL sm2P256IsZero(SB) \ + MOVQ AX, points_eq \ + \ + LDacc (z2sqr) \ + LDt (x1in) \ + CALL sm2P256MulInternal(SB) \// u1 = x1 * z2ˆ2 + ST (u1) \ + LDacc (z1sqr) \ + LDt (x2in) \ + CALL sm2P256MulInternal(SB) \// u2 = x2 * z1ˆ2 + ST (u2) \ + \ + LDt (u1) \ + CALL sm2P256SubInternal(SB) \// h = u2 - u1 + ST (h) \ + CALL sm2P256IsZero(SB) \ + ANDQ points_eq, AX \ + MOVQ AX, points_eq \ + \ + LDacc (r) \ + CALL sm2P256SqrInternal(SB) \// rsqr = rˆ2 + ST (rsqr) \ + \ + LDacc (h) \ + CALL sm2P256SqrInternal(SB) \// hsqr = hˆ2 + ST (hsqr) \ + \ + LDt (h) \ + CALL sm2P256MulInternal(SB) \// hcub = hˆ3 + ST (hcub) \ + \ + LDt (s1) \ + CALL sm2P256MulInternal(SB) \ + ST (s2) \ + \ + LDacc (z1in) \ + LDt (z2in) \ + CALL sm2P256MulInternal(SB) \// z1 * z2 + LDt (h) \ + CALL sm2P256MulInternal(SB) \// z1 * z2 * h + ST (zout) \ + \ + LDacc (hsqr) \ + LDt (u1) \ + CALL sm2P256MulInternal(SB) \// hˆ2 * u1 + ST (u2) \ + \ + p256MulBy2Inline \// u1 * hˆ2 * 2, inline + LDacc (rsqr) \ + CALL sm2P256SubInternal(SB) \// rˆ2 - u1 * hˆ2 * 2 + \ + LDt (hcub) \ + CALL sm2P256SubInternal(SB) \ + ST (xout) \ + \ + MOVQ acc4, t0 \ + MOVQ acc5, t1 \ + MOVQ acc6, t2 \ + MOVQ acc7, t3 \ + LDacc (u2) \ + CALL sm2P256SubInternal(SB) \ + \ + LDt (r) \ + CALL sm2P256MulInternal(SB) \ + \ + LDt (s2) \ + CALL sm2P256SubInternal(SB) \ + ST (yout) \ + +//func p256PointAddAsm(res, in1, in2 *SM2P256Point) int +TEXT ·p256PointAddAsm(SB),0,$680-32 + // See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl + // Move input to stack in order to free registers + MOVQ res+0(FP), AX + MOVQ in1+8(FP), BX + MOVQ in2+16(FP), CX + + CMPB ·supportAVX2+0(SB), $0x01 + JEQ pointadd_avx2 + + MOVOU (16*0)(BX), X0 + MOVOU (16*1)(BX), X1 + MOVOU (16*2)(BX), X2 + MOVOU (16*3)(BX), X3 + MOVOU (16*4)(BX), X4 + MOVOU (16*5)(BX), X5 + + MOVOU X0, x1in(16*0) + MOVOU X1, x1in(16*1) + MOVOU X2, y1in(16*0) + MOVOU X3, y1in(16*1) + MOVOU X4, z1in(16*0) + MOVOU X5, z1in(16*1) + + MOVOU 
(16*0)(CX), X0 + MOVOU (16*1)(CX), X1 + MOVOU (16*2)(CX), X2 + MOVOU (16*3)(CX), X3 + MOVOU (16*4)(CX), X4 + MOVOU (16*5)(CX), X5 + + MOVOU X0, x2in(16*0) + MOVOU X1, x2in(16*1) + MOVOU X2, y2in(16*0) + MOVOU X3, y2in(16*1) + MOVOU X4, z2in(16*0) + MOVOU X5, z2in(16*1) + // Store pointer to result + MOVQ AX, rptr + p256PointAddInline() + + MOVOU xout(16*0), X0 + MOVOU xout(16*1), X1 + MOVOU yout(16*0), X2 + MOVOU yout(16*1), X3 + MOVOU zout(16*0), X4 + MOVOU zout(16*1), X5 + // Finally output the result + MOVQ rptr, AX + MOVQ $0, rptr + MOVOU X0, (16*0)(AX) + MOVOU X1, (16*1)(AX) + MOVOU X2, (16*2)(AX) + MOVOU X3, (16*3)(AX) + MOVOU X4, (16*4)(AX) + MOVOU X5, (16*5)(AX) + + MOVQ points_eq, AX + MOVQ AX, ret+24(FP) + + RET +pointadd_avx2: + VMOVDQU (32*0)(BX), Y0 + VMOVDQU (32*1)(BX), Y1 + VMOVDQU (32*2)(BX), Y2 + + VMOVDQU Y0, x1in(32*0) + VMOVDQU Y1, y1in(32*0) + VMOVDQU Y2, z1in(32*0) + + VMOVDQU (32*0)(CX), Y0 + VMOVDQU (32*1)(CX), Y1 + VMOVDQU (32*2)(CX), Y2 + + VMOVDQU Y0, x2in(32*0) + VMOVDQU Y1, y2in(32*0) + VMOVDQU Y2, z2in(32*0) + + // Store pointer to result + MOVQ AX, rptr + p256PointAddInline() + + VMOVDQU xout(32*0), Y0 + VMOVDQU yout(32*0), Y1 + VMOVDQU zout(32*0), Y2 + // Finally output the result + MOVQ rptr, AX + MOVQ $0, rptr + VMOVDQU Y0, (32*0)(AX) + VMOVDQU Y1, (32*1)(AX) + VMOVDQU Y2, (32*2)(AX) + + MOVQ points_eq, AX + MOVQ AX, ret+24(FP) + + VZEROUPPER + RET + +#undef x1in +#undef y1in +#undef z1in +#undef x2in +#undef y2in +#undef z2in +#undef xout +#undef yout +#undef zout +#undef s1 +#undef s2 +#undef u1 +#undef u2 +#undef z1sqr +#undef z2sqr +#undef h +#undef r +#undef hsqr +#undef rsqr +#undef hcub +#undef rptr +/* ---------------------------------------*/ +#define x(off) (32*0 + off)(SP) +#define y(off) (32*1 + off)(SP) +#define z(off) (32*2 + off)(SP) + +#define s(off) (32*3 + off)(SP) +#define m(off) (32*4 + off)(SP) +#define zsqr(off) (32*5 + off)(SP) +#define tmp(off) (32*6 + off)(SP) +#define rptr (32*7)(SP) + +#define calZ() \ + LDacc (z) \ + CALL sm2P256SqrInternal(SB) \ + ST (zsqr) \ + \ + LDt (x) \ + p256AddInline \ + STt (m) \ + \ + LDacc (z) \ + LDt (y) \ + CALL sm2P256MulInternal(SB) \ + p256MulBy2Inline \ + +#define calX() \ + LDacc (x) \ + LDt (zsqr) \ + CALL sm2P256SubInternal(SB) \ + LDt (m) \ + CALL sm2P256MulInternal(SB) \ + ST (m) \ + \// Multiply by 3 + p256MulBy2Inline \ + LDacc (m) \ + p256AddInline \ + STt (m) \ + \//////////////////////// + LDacc (y) \ + p256MulBy2Inline \ + t2acc \ + CALL sm2P256SqrInternal(SB) \ + ST (s) \ + CALL sm2P256SqrInternal(SB) \ + \// Divide by 2 + XORQ mul0, mul0 \ + MOVQ acc4, t0 \ + MOVQ acc5, t1 \ + MOVQ acc6, t2 \ + MOVQ acc7, t3 \ + \ + ADDQ $-1, acc4 \ + ADCQ p256p<>+0x08(SB), acc5 \ + ADCQ $-1, acc6 \ + ADCQ p256p<>+0x018(SB), acc7 \ + ADCQ $0, mul0 \ + TESTQ $1, t0 \ + \ + CMOVQEQ t0, acc4 \ + CMOVQEQ t1, acc5 \ + CMOVQEQ t2, acc6 \ + CMOVQEQ t3, acc7 \ + ANDQ t0, mul0 \ + \ + SHRQ $1, acc5, acc4 \ + SHRQ $1, acc6, acc5 \ + SHRQ $1, acc7, acc6 \ + SHRQ $1, mul0, acc7 \ + ST (y) \ + \///////////////////////// + LDacc (x) \ + LDt (s) \ + CALL sm2P256MulInternal(SB) \ + ST (s) \ + p256MulBy2Inline \ + STt (tmp) \ + \ + LDacc (m) \ + CALL sm2P256SqrInternal(SB) \ + LDt (tmp) \ + CALL sm2P256SubInternal(SB) \ + +#define calY() \ + acc2t \ + LDacc (s) \ + CALL sm2P256SubInternal(SB) \ + \ + LDt (m) \ + CALL sm2P256MulInternal(SB) \ + \ + LDt (y) \ + CALL sm2P256SubInternal(SB) \ + +#define lastP256PointDouble() \ + calZ() \ + MOVQ rptr, AX \ + \// Store z + MOVQ t0, (16*4 + 8*0)(AX) \ + MOVQ t1, (16*4 + 
8*1)(AX) \ + MOVQ t2, (16*4 + 8*2)(AX) \ + MOVQ t3, (16*4 + 8*3)(AX) \ + \ + calX() \ + MOVQ rptr, AX \ + \// Store x + MOVQ acc4, (16*0 + 8*0)(AX) \ + MOVQ acc5, (16*0 + 8*1)(AX) \ + MOVQ acc6, (16*0 + 8*2)(AX) \ + MOVQ acc7, (16*0 + 8*3)(AX) \ + \ + calY() \ + MOVQ rptr, AX \ + \// Store y + MOVQ acc4, (16*2 + 8*0)(AX) \ + MOVQ acc5, (16*2 + 8*1)(AX) \ + MOVQ acc6, (16*2 + 8*2)(AX) \ + MOVQ acc7, (16*2 + 8*3)(AX) \ + \/////////////////////// + MOVQ $0, rptr \ + +//func p256PointDoubleAsm(res, in *SM2P256Point) +TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-16 + // Move input to stack in order to free registers + MOVQ res+0(FP), AX + MOVQ in+8(FP), BX + + p256PointDoubleInit() + // Store pointer to result + MOVQ AX, rptr + // Begin point double + lastP256PointDouble() + + RET + +#define storeTmpX() \ + MOVQ acc4, x(8*0) \ + MOVQ acc5, x(8*1) \ + MOVQ acc6, x(8*2) \ + MOVQ acc7, x(8*3) \ + +#define storeTmpY() \ + MOVQ acc4, y(8*0) \ + MOVQ acc5, y(8*1) \ + MOVQ acc6, y(8*2) \ + MOVQ acc7, y(8*3) \ + +#define storeTmpZ() \ + MOVQ t0, z(8*0) \ + MOVQ t1, z(8*1) \ + MOVQ t2, z(8*2) \ + MOVQ t3, z(8*3) \ + +#define p256PointDoubleRound() \ + calZ() \ + storeTmpZ() \ + calX() \ + storeTmpX() \ + calY() \ + storeTmpY() \ + +//func p256PointDouble6TimesAsm(res, in *SM2P256Point) +TEXT ·p256PointDouble6TimesAsm(SB),NOSPLIT,$256-16 + // Move input to stack in order to free registers + MOVQ res+0(FP), AX + MOVQ in+8(FP), BX + + p256PointDoubleInit() + // Store pointer to result + MOVQ AX, rptr + + // point double 1-5 rounds + p256PointDoubleRound() + p256PointDoubleRound() + p256PointDoubleRound() + p256PointDoubleRound() + p256PointDoubleRound() + + // last point double round + lastP256PointDouble() + + RET +/* ---------------------------------------*/ diff --git a/internal/sm2ec/sm2p256.go b/internal/sm2ec/sm2p256.go index dd83cfd..cbd5638 100644 --- a/internal/sm2ec/sm2p256.go +++ b/internal/sm2ec/sm2p256.go @@ -4,8 +4,8 @@ // Code generated by generate.go. DO NOT EDIT. -//go:build !amd64 && !arm64 || purego || plugin -// +build !amd64,!arm64 purego plugin +//go:build !amd64 && !arm64 || purego +// +build !amd64,!arm64 purego package sm2ec diff --git a/internal/sm2ec/sm2p256_asm.go b/internal/sm2ec/sm2p256_asm.go index 7a98e97..96b5c30 100644 --- a/internal/sm2ec/sm2p256_asm.go +++ b/internal/sm2ec/sm2p256_asm.go @@ -7,8 +7,8 @@ // 256-bit primes" // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x // https://eprint.iacr.org/2013/816.pdf -//go:build (amd64 && !purego && !plugin) || (arm64 && !purego) -// +build amd64,!purego,!plugin arm64,!purego +//go:build (amd64 && !purego) || (arm64 && !purego) +// +build amd64,!purego arm64,!purego package sm2ec diff --git a/internal/sm2ec/sm2p256_asm_test.go b/internal/sm2ec/sm2p256_asm_test.go index f4ef0b8..f467825 100644 --- a/internal/sm2ec/sm2p256_asm_test.go +++ b/internal/sm2ec/sm2p256_asm_test.go @@ -1,5 +1,5 @@ -//go:build (amd64 && !purego && !plugin) || (arm64 && !purego) -// +build amd64,!purego,!plugin arm64,!purego +//go:build (amd64 && !purego) || (arm64 && !purego) +// +build amd64,!purego arm64,!purego package sm2ec diff --git a/internal/sm2ec/sm2p256_ord.go b/internal/sm2ec/sm2p256_ord.go index 1267def..1f3a5e7 100644 --- a/internal/sm2ec/sm2p256_ord.go +++ b/internal/sm2ec/sm2p256_ord.go @@ -1,5 +1,5 @@ -//go:build (!amd64 && !arm64) || purego || plugin -// +build !amd64,!arm64 purego plugin +//go:build (!amd64 && !arm64) || purego +// +build !amd64,!arm64 purego package sm2ec
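A minimal Go sketch (not part of the patch above) of the final conditional subtraction that p256PrimReduce and p256OrdReduceInline perform: subtract the modulus from the 5-limb intermediate and keep the original limbs when the subtraction borrows. The [4]uint64 little-endian limb layout, the carry parameter and the condSub name are assumptions for illustration; the assembly keeps the selection constant-time with CMOVQCS, whereas this sketch branches for readability. The order path runs the same logic against the p256ord<> limbs.

package main

import (
	"fmt"
	"math/bits"
)

// condSub subtracts m once from the value [carry, a3, a2, a1, a0] (assumed < 2*m)
// and keeps the original limbs when the subtraction borrows.
func condSub(a [4]uint64, carry uint64, m [4]uint64) [4]uint64 {
	var r [4]uint64
	var b uint64
	r[0], b = bits.Sub64(a[0], m[0], 0)
	r[1], b = bits.Sub64(a[1], m[1], b)
	r[2], b = bits.Sub64(a[2], m[2], b)
	r[3], b = bits.Sub64(a[3], m[3], b)
	_, b = bits.Sub64(carry, 0, b)
	if b != 0 { // borrow: a < m, keep a (the CMOVQCS path)
		return a
	}
	return r
}

func main() {
	// SM2 prime p, little-endian limbs, matching the p256p<> DATA section.
	p := [4]uint64{0xffffffffffffffff, 0xffffffff00000000, 0xffffffffffffffff, 0xfffffffeffffffff}
	fmt.Println(condSub([4]uint64{42, 0, 0, 0}, 0, p)) // below p: unchanged
	fmt.Println(condSub(p, 0, p))                      // exactly p: reduces to zero
}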