diff --git a/internal/sm2ec/p256_asm_s390x.s b/internal/sm2ec/p256_asm_s390x.s new file mode 100644 index 0000000..4978a20 --- /dev/null +++ b/internal/sm2ec/p256_asm_s390x.s @@ -0,0 +1,1007 @@ +// This is a port of the NIST P256 s390x asm implementation to SM2 P256. +// +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !purego + +#include "textflag.h" +#include "go_asm.h" + +DATA p256ordK0<>+0x00(SB)/4, $0x72350975 +DATA p256ord<>+0x00(SB)/8, $0xfffffffeffffffff +DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff +DATA p256ord<>+0x10(SB)/8, $0x7203df6b21c6052b +DATA p256ord<>+0x18(SB)/8, $0x53bbf40939d54123 +DATA p256<>+0x00(SB)/8, $0xfffffffeffffffff // P256 +DATA p256<>+0x08(SB)/8, $0xffffffffffffffff // P256 +DATA p256<>+0x10(SB)/8, $0xffffffff00000000 // P256 +DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256 +DATA p256<>+0x40(SB)/8, $0x0000000000000000 // SEL 0 0 d1 d0 +DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL 0 0 d1 d0 +DATA p256<>+0x50(SB)/8, $0x0706050403020100 // LE2BE permute mask +DATA p256<>+0x58(SB)/8, $0x0f0e0d0c0b0a0908 // LE2BE permute mask +DATA p256mul<>+0x00(SB)/8, $0xfffffffeffffffff // P256 +DATA p256mul<>+0x08(SB)/8, $0xffffffffffffffff // P256 +DATA p256mul<>+0x10(SB)/8, $0xffffffff00000000 // P256 +DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256 +DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0 +DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0 +DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0 0 d1 d0 +DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0 0 d1 d0 +DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL 0 d1 d0 d1 +DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL 0 d1 d0 d1 +DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL 0 0 d1 d0 +DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0 +DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 +DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 +DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0 d1 d0 0 +DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0 d1 d0 0 +DATA p256mul<>+0x80(SB)/8, $0x0000000100000000 // (1*2^256)%P256 +DATA p256mul<>+0x88(SB)/8, $0x0000000000000000 // (1*2^256)%P256 +DATA p256mul<>+0x90(SB)/8, $0x00000000ffffffff // (1*2^256)%P256 +DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256 +GLOBL p256ordK0<>(SB), 8, $4 +GLOBL p256ord<>(SB), 8, $32 +GLOBL p256<>(SB), 8, $64 +GLOBL p256mul<>(SB), 8, $160 + +// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement) +TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0 + JMP ·p256BigToLittle(SB) + +// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte) +TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0 + JMP ·p256BigToLittle(SB) + +// --------------------------------------- +// func p256LittleToBig(res *[32]byte, in *p256Element) +TEXT ·p256LittleToBig(SB), NOSPLIT, $0 + JMP ·p256BigToLittle(SB) + +// func p256BigToLittle(res *p256Element, in *[32]byte) +#define res_ptr R1 +#define in_ptr R2 +#define T1L V2 +#define T1H V3 + +TEXT ·p256BigToLittle(SB), NOSPLIT, $0 + MOVD res+0(FP), res_ptr + MOVD in+8(FP), in_ptr + + VL 0(in_ptr), T1H + VL 16(in_ptr), T1L + + VPDI $0x4, T1L, T1L, T1L + VPDI $0x4, T1H, T1H, T1H + + VST T1L, 0(res_ptr) + VST T1H, 16(res_ptr) + RET + +#undef res_ptr +#undef in_ptr +#undef T1L +#undef T1H + +// --------------------------------------- +// iff cond == 1 val <- -val +// func p256NegCond(val *p256Element, cond int) +#define P1ptr R1 +#define CPOOL R4 + +#define Y1L V0 +#define Y1H V1 +#define T1L V2 +#define T1H V3 + +#define PL V30 +#define PH V31 + +#define ZER V4 +#define SEL1 V5 +#define CAR1 V6 +TEXT ·p256NegCond(SB), NOSPLIT, $0 + MOVD val+0(FP), P1ptr + + MOVD $p256mul<>+0x00(SB), CPOOL + VL 16(CPOOL), PL + VL 0(CPOOL), PH + + VL 16(P1ptr), Y1H + VPDI $0x4, Y1H, Y1H, Y1H + VL 0(P1ptr), Y1L + VPDI $0x4, Y1L, Y1L, Y1L + + VLREPG cond+8(FP), SEL1 + VZERO ZER + VCEQG SEL1, ZER, SEL1 + + VSCBIQ Y1L, PL, CAR1 + VSQ Y1L, PL, T1L + VSBIQ PH, Y1H, CAR1, T1H + + VSEL Y1L, T1L, SEL1, Y1L + VSEL Y1H, T1H, SEL1, Y1H + + VPDI $0x4, Y1H, Y1H, Y1H + VST Y1H, 16(P1ptr) + VPDI $0x4, Y1L, Y1L, Y1L + VST Y1L, 0(P1ptr) + RET + +#undef P1ptr +#undef CPOOL +#undef Y1L +#undef Y1H +#undef T1L +#undef T1H +#undef PL +#undef PH +#undef ZER +#undef SEL1 +#undef CAR1 + +// --------------------------------------- +// if cond == 0 res <- b; else res <- a +// func p256MovCond(res, a, b *P256Point, cond int) +#define P3ptr R1 +#define P1ptr R2 +#define P2ptr R3 + +#define X1L V0 +#define X1H V1 +#define Y1L V2 +#define Y1H V3 +#define Z1L V4 +#define Z1H V5 +#define X2L V6 +#define X2H V7 +#define Y2L V8 +#define Y2H V9 +#define Z2L V10 +#define Z2H V11 + +#define ZER V18 +#define SEL1 V19 +TEXT ·p256MovCond(SB), NOSPLIT, $0 + MOVD res+0(FP), P3ptr + MOVD a+8(FP), P1ptr + MOVD b+16(FP), P2ptr + VLREPG cond+24(FP), SEL1 + VZERO ZER + VCEQG SEL1, ZER, SEL1 + + VL 0(P1ptr), X1H + VL 16(P1ptr), X1L + VL 32(P1ptr), Y1H + VL 48(P1ptr), Y1L + VL 64(P1ptr), Z1H + VL 80(P1ptr), Z1L + + VL 0(P2ptr), X2H + VL 16(P2ptr), X2L + VL 32(P2ptr), Y2H + VL 48(P2ptr), Y2L + VL 64(P2ptr), Z2H + VL 80(P2ptr), Z2L + + VSEL X2L, X1L, SEL1, X1L + VSEL X2H, X1H, SEL1, X1H + VSEL Y2L, Y1L, SEL1, Y1L + VSEL Y2H, Y1H, SEL1, Y1H + VSEL Z2L, Z1L, SEL1, Z1L + VSEL Z2H, Z1H, SEL1, Z1H + + VST X1H, 0(P3ptr) + VST X1L, 16(P3ptr) + VST Y1H, 32(P3ptr) + VST Y1L, 48(P3ptr) + VST Z1H, 64(P3ptr) + VST Z1L, 80(P3ptr) + + RET + +#undef P3ptr +#undef P1ptr +#undef P2ptr +#undef X1L +#undef X1H +#undef Y1L +#undef Y1H +#undef Z1L +#undef Z1H +#undef X2L +#undef X2H +#undef Y2L +#undef Y2H +#undef Z2L +#undef Z2H +#undef ZER +#undef SEL1 + +// --------------------------------------- +// Constant time table access +// Indexed from 1 to 15, with -1 offset +// (index 0 is implicitly point at infinity) +// func p256Select(res *P256Point, table *p256Table, idx int) +#define P3ptr R1 +#define P1ptr R2 +#define COUNT R4 + +#define X1L V0 +#define X1H V1 +#define Y1L V2 +#define Y1H V3 +#define Z1L V4 +#define Z1H V5 +#define X2L V6 +#define X2H V7 +#define Y2L V8 +#define Y2H V9 +#define Z2L V10 +#define Z2H V11 + +#define ONE V18 +#define IDX V19 +#define SEL1 V20 +#define SEL2 V21 +TEXT ·p256Select(SB), NOSPLIT, $0 + MOVD res+0(FP), P3ptr + MOVD table+8(FP), P1ptr + VLREPB idx+(16+7)(FP), IDX + VREPIB $1, ONE + VREPIB $1, SEL2 + MOVD $1, COUNT + + VZERO X1H + VZERO X1L + VZERO Y1H + VZERO Y1L + VZERO Z1H + VZERO Z1L + +loop_select: + VL 0(P1ptr), X2H + VL 16(P1ptr), X2L + VL 32(P1ptr), Y2H + VL 48(P1ptr), Y2L + VL 64(P1ptr), Z2H + VL 80(P1ptr), Z2L + + VCEQG SEL2, IDX, SEL1 + + VSEL X2L, X1L, SEL1, X1L + VSEL X2H, X1H, SEL1, X1H + VSEL Y2L, Y1L, SEL1, Y1L + VSEL Y2H, Y1H, SEL1, Y1H + VSEL Z2L, Z1L, SEL1, Z1L + VSEL Z2H, Z1H, SEL1, Z1H + + VAB SEL2, ONE, SEL2 + ADDW $1, COUNT + ADD $96, P1ptr + CMPW COUNT, $17 + BLT loop_select + + VST X1H, 0(P3ptr) + VST X1L, 16(P3ptr) + VST Y1H, 32(P3ptr) + VST Y1L, 48(P3ptr) + VST Z1H, 64(P3ptr) + VST Z1L, 80(P3ptr) + RET + +#undef P3ptr +#undef P1ptr +#undef COUNT +#undef X1L +#undef X1H +#undef Y1L +#undef Y1H +#undef Z1L +#undef Z1H +#undef X2L +#undef X2H +#undef Y2L +#undef Y2H +#undef Z2L +#undef Z2H +#undef ONE +#undef IDX +#undef SEL1 +#undef SEL2 + +// --------------------------------------- + +// func p256FromMont(res, in *p256Element) +#define res_ptr R1 +#define x_ptr R2 +#define CPOOL R4 + +#define T0 V0 +#define T1 V1 +#define T2 V2 +#define TT0 V3 +#define TT1 V4 + +#define ZER V6 +#define SEL1 V7 +#define SEL2 V8 +#define CAR1 V9 +#define CAR2 V10 +#define RED1 V11 +#define RED2 V12 +#define PL V14 +#define PH V15 + +TEXT ·p256FromMont(SB), NOSPLIT, $0 + MOVD res+0(FP), res_ptr + MOVD in+8(FP), x_ptr + + VZERO T2 + VZERO ZER + MOVD $p256<>+0x00(SB), CPOOL + VL 16(CPOOL), PL + VL 0(CPOOL), PH + VL 48(CPOOL), SEL2 + VL 64(CPOOL), SEL1 + + VL (0*16)(x_ptr), T0 + VPDI $0x4, T0, T0, T0 + VL (1*16)(x_ptr), T1 + VPDI $0x4, T1, T1, T1 + + // First round + VPERM ZER, T0, SEL1, RED1 // 0 0 d1 d0 + VSLDB $4, RED1, ZER, TT0 // 0 d1 d0 0 + VSLDB $4, TT0, ZER, RED2 // d1 d0 0 0 + VSCBIQ TT0, RED1, CAR1 + VSQ TT0, RED1, RED1 + VSBIQ RED2, TT0, CAR1, RED2 // Guaranteed not to underflow + + VSLDB $8, T1, T0, T0 + VSLDB $8, T2, T1, T1 + + VACCQ T0, RED1, CAR1 + VAQ T0, RED1, T0 + VACCCQ T1, RED2, CAR1, CAR2 + VACQ T1, RED2, CAR1, T1 + VAQ T2, CAR2, T2 + + // Second round + VPERM ZER, T0, SEL1, RED1 // 0 0 d1 d0 + VSLDB $4, RED1, ZER, TT0 // 0 d1 d0 0 + VSLDB $4, TT0, ZER, RED2 // d1 d0 0 0 + VSCBIQ TT0, RED1, CAR1 + VSQ TT0, RED1, RED1 + VSBIQ RED2, TT0, CAR1, RED2 // Guaranteed not to underflow + + VSLDB $8, T1, T0, T0 + VSLDB $8, T2, T1, T1 + + VACCQ T0, RED1, CAR1 + VAQ T0, RED1, T0 + VACCCQ T1, RED2, CAR1, CAR2 + VACQ T1, RED2, CAR1, T1 + VAQ T2, CAR2, T2 + + // Third round + VPERM ZER, T0, SEL1, RED1 // 0 0 d1 d0 + VSLDB $4, RED1, ZER, TT0 // 0 d1 d0 0 + VSLDB $4, TT0, ZER, RED2 // d1 d0 0 0 + VSCBIQ TT0, RED1, CAR1 + VSQ TT0, RED1, RED1 + VSBIQ RED2, TT0, CAR1, RED2 // Guaranteed not to underflow + + VSLDB $8, T1, T0, T0 + VSLDB $8, T2, T1, T1 + + VACCQ T0, RED1, CAR1 + VAQ T0, RED1, T0 + VACCCQ T1, RED2, CAR1, CAR2 + VACQ T1, RED2, CAR1, T1 + VAQ T2, CAR2, T2 + + // Last round + VPERM ZER, T0, SEL1, RED1 // 0 0 d1 d0 + VSLDB $4, RED1, ZER, TT0 // 0 d1 d0 0 + VSLDB $4, TT0, ZER, RED2 // d1 d0 0 0 + VSCBIQ TT0, RED1, CAR1 + VSQ TT0, RED1, RED1 + VSBIQ RED2, TT0, CAR1, RED2 // Guaranteed not to underflow + + VSLDB $8, T1, T0, T0 + VSLDB $8, T2, T1, T1 + + VACCQ T0, RED1, CAR1 + VAQ T0, RED1, T0 + VACCCQ T1, RED2, CAR1, CAR2 + VACQ T1, RED2, CAR1, T1 + VAQ T2, CAR2, T2 + + // --------------------------------------------------- + + VSCBIQ PL, T0, CAR1 + VSQ PL, T0, TT0 + VSBCBIQ T1, PH, CAR1, CAR2 + VSBIQ T1, PH, CAR1, TT1 + VSBIQ T2, ZER, CAR2, T2 + + // what output to use, TT1||TT0 or T1||T0? + VSEL T0, TT0, T2, T0 + VSEL T1, TT1, T2, T1 + + VPDI $0x4, T0, T0, TT0 + VST TT0, (0*16)(res_ptr) + VPDI $0x4, T1, T1, TT1 + VST TT1, (1*16)(res_ptr) + RET + +#undef res_ptr +#undef x_ptr +#undef CPOOL +#undef T0 +#undef T1 +#undef T2 +#undef TT0 +#undef TT1 +#undef ZER +#undef SEL1 +#undef SEL2 +#undef CAR1 +#undef CAR2 +#undef RED1 +#undef RED2 +#undef PL +#undef PH + +// Constant time table access +// Indexed from 1 to 15, with -1 offset +// (index 0 is implicitly point at infinity) +// func p256SelectBase(point *p256Point, table []p256Point, idx int) +// new : func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int) + +#define P3ptr R1 +#define P1ptr R2 +#define COUNT R4 +#define CPOOL R5 + +#define X1L V0 +#define X1H V1 +#define Y1L V2 +#define Y1H V3 +#define Z1L V4 +#define Z1H V5 +#define X2L V6 +#define X2H V7 +#define Y2L V8 +#define Y2H V9 +#define Z2L V10 +#define Z2H V11 +#define LE2BE V12 + +#define ONE V18 +#define IDX V19 +#define SEL1 V20 +#define SEL2 V21 + +TEXT ·p256SelectAffine(SB), NOSPLIT, $0 + MOVD res+0(FP), P3ptr + MOVD table+8(FP), P1ptr + MOVD $p256<>+0x00(SB), CPOOL + VLREPB idx+(16+7)(FP), IDX + VREPIB $1, ONE + VREPIB $1, SEL2 + MOVD $1, COUNT + VL 80(CPOOL), LE2BE + + VZERO X1H + VZERO X1L + VZERO Y1H + VZERO Y1L + +loop_select: + VL 0(P1ptr), X2H + VL 16(P1ptr), X2L + VL 32(P1ptr), Y2H + VL 48(P1ptr), Y2L + + VCEQG SEL2, IDX, SEL1 + + VSEL X2L, X1L, SEL1, X1L + VSEL X2H, X1H, SEL1, X1H + VSEL Y2L, Y1L, SEL1, Y1L + VSEL Y2H, Y1H, SEL1, Y1H + + VAB SEL2, ONE, SEL2 + ADDW $1, COUNT + ADD $64, P1ptr + CMPW COUNT, $65 + BLT loop_select + VST X1H, 0(P3ptr) + VST X1L, 16(P3ptr) + VST Y1H, 32(P3ptr) + VST Y1L, 48(P3ptr) + + RET + +#undef P3ptr +#undef P1ptr +#undef COUNT +#undef X1L +#undef X1H +#undef Y1L +#undef Y1H +#undef Z1L +#undef Z1H +#undef X2L +#undef X2H +#undef Y2L +#undef Y2H +#undef Z2L +#undef Z2H +#undef ONE +#undef IDX +#undef SEL1 +#undef SEL2 +#undef CPOOL + +// --------------------------------------- + +// func p256OrdMul(res, in1, in2 *p256OrdElement) +#define res_ptr R1 +#define x_ptr R2 +#define y_ptr R3 +#define X0 V0 +#define X1 V1 +#define Y0 V2 +#define Y1 V3 +#define M0 V4 +#define M1 V5 +#define T0 V6 +#define T1 V7 +#define T2 V8 +#define YDIG V9 + +#define ADD1 V16 +#define ADD1H V17 +#define ADD2 V18 +#define ADD2H V19 +#define RED1 V20 +#define RED1H V21 +#define RED2 V22 +#define RED2H V23 +#define CAR1 V24 +#define CAR1M V25 + +#define MK0 V30 +#define K0 V31 +TEXT ·p256OrdMul<>(SB), NOSPLIT, $0 + MOVD res+0(FP), res_ptr + MOVD in1+8(FP), x_ptr + MOVD in2+16(FP), y_ptr + + VZERO T2 + MOVD $p256ordK0<>+0x00(SB), R4 + + // VLEF $3, 0(R4), K0 + WORD $0xE7F40000 + BYTE $0x38 + BYTE $0x03 + MOVD $p256ord<>+0x00(SB), R4 + VL 16(R4), M0 + VL 0(R4), M1 + + VL (0*16)(x_ptr), X0 + VPDI $0x4, X0, X0, X0 + VL (1*16)(x_ptr), X1 + VPDI $0x4, X1, X1, X1 + VL (0*16)(y_ptr), Y0 + VPDI $0x4, Y0, Y0, Y0 + VL (1*16)(y_ptr), Y1 + VPDI $0x4, Y1, Y1, Y1 + + // ---------------------------------------------------------------------------/ + VREPF $3, Y0, YDIG + VMLF X0, YDIG, ADD1 + VMLF ADD1, K0, MK0 + VREPF $3, MK0, MK0 + + VMLF X1, YDIG, ADD2 + VMLHF X0, YDIG, ADD1H + VMLHF X1, YDIG, ADD2H + + VMALF M0, MK0, ADD1, RED1 + VMALHF M0, MK0, ADD1, RED1H + VMALF M1, MK0, ADD2, RED2 + VMALHF M1, MK0, ADD2, RED2H + + VSLDB $12, RED2, RED1, RED1 + VSLDB $12, T2, RED2, RED2 + + VACCQ RED1, ADD1H, CAR1 + VAQ RED1, ADD1H, T0 + VACCQ RED1H, T0, CAR1M + VAQ RED1H, T0, T0 + + // << ready for next MK0 + + VACQ RED2, ADD2H, CAR1, T1 + VACCCQ RED2, ADD2H, CAR1, CAR1 + VACCCQ RED2H, T1, CAR1M, T2 + VACQ RED2H, T1, CAR1M, T1 + VAQ CAR1, T2, T2 + + // --------------------------------------------------- +/* * + * ---+--------+--------+ + * T2| T1 | T0 | + * ---+--------+--------+ + * *(add)* + * +--------+--------+ + * | X1 | X0 | + * +--------+--------+ + * *(mul)* + * +--------+--------+ + * | YDIG | YDIG | + * +--------+--------+ + * *(add)* + * +--------+--------+ + * | M1 | M0 | + * +--------+--------+ + * *(mul)* + * +--------+--------+ + * | MK0 | MK0 | + * +--------+--------+ + * + * --------------------- + * + * +--------+--------+ + * | ADD2 | ADD1 | + * +--------+--------+ + * +--------+--------+ + * | ADD2H | ADD1H | + * +--------+--------+ + * +--------+--------+ + * | RED2 | RED1 | + * +--------+--------+ + * +--------+--------+ + * | RED2H | RED1H | + * +--------+--------+ + */ + VREPF $2, Y0, YDIG + VMALF X0, YDIG, T0, ADD1 + VMLF ADD1, K0, MK0 + VREPF $3, MK0, MK0 + + VMALF X1, YDIG, T1, ADD2 + VMALHF X0, YDIG, T0, ADD1H + VMALHF X1, YDIG, T1, ADD2H + + VMALF M0, MK0, ADD1, RED1 + VMALHF M0, MK0, ADD1, RED1H + VMALF M1, MK0, ADD2, RED2 + VMALHF M1, MK0, ADD2, RED2H + + VSLDB $12, RED2, RED1, RED1 + VSLDB $12, T2, RED2, RED2 + + VACCQ RED1, ADD1H, CAR1 + VAQ RED1, ADD1H, T0 + VACCQ RED1H, T0, CAR1M + VAQ RED1H, T0, T0 + + // << ready for next MK0 + + VACQ RED2, ADD2H, CAR1, T1 + VACCCQ RED2, ADD2H, CAR1, CAR1 + VACCCQ RED2H, T1, CAR1M, T2 + VACQ RED2H, T1, CAR1M, T1 + VAQ CAR1, T2, T2 + + // --------------------------------------------------- + VREPF $1, Y0, YDIG + VMALF X0, YDIG, T0, ADD1 + VMLF ADD1, K0, MK0 + VREPF $3, MK0, MK0 + + VMALF X1, YDIG, T1, ADD2 + VMALHF X0, YDIG, T0, ADD1H + VMALHF X1, YDIG, T1, ADD2H + + VMALF M0, MK0, ADD1, RED1 + VMALHF M0, MK0, ADD1, RED1H + VMALF M1, MK0, ADD2, RED2 + VMALHF M1, MK0, ADD2, RED2H + + VSLDB $12, RED2, RED1, RED1 + VSLDB $12, T2, RED2, RED2 + + VACCQ RED1, ADD1H, CAR1 + VAQ RED1, ADD1H, T0 + VACCQ RED1H, T0, CAR1M + VAQ RED1H, T0, T0 + + // << ready for next MK0 + + VACQ RED2, ADD2H, CAR1, T1 + VACCCQ RED2, ADD2H, CAR1, CAR1 + VACCCQ RED2H, T1, CAR1M, T2 + VACQ RED2H, T1, CAR1M, T1 + VAQ CAR1, T2, T2 + + // --------------------------------------------------- + VREPF $0, Y0, YDIG + VMALF X0, YDIG, T0, ADD1 + VMLF ADD1, K0, MK0 + VREPF $3, MK0, MK0 + + VMALF X1, YDIG, T1, ADD2 + VMALHF X0, YDIG, T0, ADD1H + VMALHF X1, YDIG, T1, ADD2H + + VMALF M0, MK0, ADD1, RED1 + VMALHF M0, MK0, ADD1, RED1H + VMALF M1, MK0, ADD2, RED2 + VMALHF M1, MK0, ADD2, RED2H + + VSLDB $12, RED2, RED1, RED1 + VSLDB $12, T2, RED2, RED2 + + VACCQ RED1, ADD1H, CAR1 + VAQ RED1, ADD1H, T0 + VACCQ RED1H, T0, CAR1M + VAQ RED1H, T0, T0 + + // << ready for next MK0 + + VACQ RED2, ADD2H, CAR1, T1 + VACCCQ RED2, ADD2H, CAR1, CAR1 + VACCCQ RED2H, T1, CAR1M, T2 + VACQ RED2H, T1, CAR1M, T1 + VAQ CAR1, T2, T2 + + // --------------------------------------------------- + VREPF $3, Y1, YDIG + VMALF X0, YDIG, T0, ADD1 + VMLF ADD1, K0, MK0 + VREPF $3, MK0, MK0 + + VMALF X1, YDIG, T1, ADD2 + VMALHF X0, YDIG, T0, ADD1H + VMALHF X1, YDIG, T1, ADD2H + + VMALF M0, MK0, ADD1, RED1 + VMALHF M0, MK0, ADD1, RED1H + VMALF M1, MK0, ADD2, RED2 + VMALHF M1, MK0, ADD2, RED2H + + VSLDB $12, RED2, RED1, RED1 + VSLDB $12, T2, RED2, RED2 + + VACCQ RED1, ADD1H, CAR1 + VAQ RED1, ADD1H, T0 + VACCQ RED1H, T0, CAR1M + VAQ RED1H, T0, T0 + + // << ready for next MK0 + + VACQ RED2, ADD2H, CAR1, T1 + VACCCQ RED2, ADD2H, CAR1, CAR1 + VACCCQ RED2H, T1, CAR1M, T2 + VACQ RED2H, T1, CAR1M, T1 + VAQ CAR1, T2, T2 + + // --------------------------------------------------- + VREPF $2, Y1, YDIG + VMALF X0, YDIG, T0, ADD1 + VMLF ADD1, K0, MK0 + VREPF $3, MK0, MK0 + + VMALF X1, YDIG, T1, ADD2 + VMALHF X0, YDIG, T0, ADD1H + VMALHF X1, YDIG, T1, ADD2H + + VMALF M0, MK0, ADD1, RED1 + VMALHF M0, MK0, ADD1, RED1H + VMALF M1, MK0, ADD2, RED2 + VMALHF M1, MK0, ADD2, RED2H + + VSLDB $12, RED2, RED1, RED1 + VSLDB $12, T2, RED2, RED2 + + VACCQ RED1, ADD1H, CAR1 + VAQ RED1, ADD1H, T0 + VACCQ RED1H, T0, CAR1M + VAQ RED1H, T0, T0 + + // << ready for next MK0 + + VACQ RED2, ADD2H, CAR1, T1 + VACCCQ RED2, ADD2H, CAR1, CAR1 + VACCCQ RED2H, T1, CAR1M, T2 + VACQ RED2H, T1, CAR1M, T1 + VAQ CAR1, T2, T2 + + // --------------------------------------------------- + VREPF $1, Y1, YDIG + VMALF X0, YDIG, T0, ADD1 + VMLF ADD1, K0, MK0 + VREPF $3, MK0, MK0 + + VMALF X1, YDIG, T1, ADD2 + VMALHF X0, YDIG, T0, ADD1H + VMALHF X1, YDIG, T1, ADD2H + + VMALF M0, MK0, ADD1, RED1 + VMALHF M0, MK0, ADD1, RED1H + VMALF M1, MK0, ADD2, RED2 + VMALHF M1, MK0, ADD2, RED2H + + VSLDB $12, RED2, RED1, RED1 + VSLDB $12, T2, RED2, RED2 + + VACCQ RED1, ADD1H, CAR1 + VAQ RED1, ADD1H, T0 + VACCQ RED1H, T0, CAR1M + VAQ RED1H, T0, T0 + + // << ready for next MK0 + + VACQ RED2, ADD2H, CAR1, T1 + VACCCQ RED2, ADD2H, CAR1, CAR1 + VACCCQ RED2H, T1, CAR1M, T2 + VACQ RED2H, T1, CAR1M, T1 + VAQ CAR1, T2, T2 + + // --------------------------------------------------- + VREPF $0, Y1, YDIG + VMALF X0, YDIG, T0, ADD1 + VMLF ADD1, K0, MK0 + VREPF $3, MK0, MK0 + + VMALF X1, YDIG, T1, ADD2 + VMALHF X0, YDIG, T0, ADD1H + VMALHF X1, YDIG, T1, ADD2H + + VMALF M0, MK0, ADD1, RED1 + VMALHF M0, MK0, ADD1, RED1H + VMALF M1, MK0, ADD2, RED2 + VMALHF M1, MK0, ADD2, RED2H + + VSLDB $12, RED2, RED1, RED1 + VSLDB $12, T2, RED2, RED2 + + VACCQ RED1, ADD1H, CAR1 + VAQ RED1, ADD1H, T0 + VACCQ RED1H, T0, CAR1M + VAQ RED1H, T0, T0 + + // << ready for next MK0 + + VACQ RED2, ADD2H, CAR1, T1 + VACCCQ RED2, ADD2H, CAR1, CAR1 + VACCCQ RED2H, T1, CAR1M, T2 + VACQ RED2H, T1, CAR1M, T1 + VAQ CAR1, T2, T2 + + // --------------------------------------------------- + + VZERO RED1 + VSCBIQ M0, T0, CAR1 + VSQ M0, T0, ADD1 + VSBCBIQ T1, M1, CAR1, CAR1M + VSBIQ T1, M1, CAR1, ADD2 + VSBIQ T2, RED1, CAR1M, T2 + + // what output to use, ADD2||ADD1 or T1||T0? + VSEL T0, ADD1, T2, T0 + VSEL T1, ADD2, T2, T1 + + VPDI $0x4, T0, T0, T0 + VST T0, (0*16)(res_ptr) + VPDI $0x4, T1, T1, T1 + VST T1, (1*16)(res_ptr) + RET + +#undef res_ptr +#undef x_ptr +#undef y_ptr +#undef X0 +#undef X1 +#undef Y0 +#undef Y1 +#undef M0 +#undef M1 +#undef T0 +#undef T1 +#undef T2 +#undef YDIG + +#undef ADD1 +#undef ADD1H +#undef ADD2 +#undef ADD2H +#undef RED1 +#undef RED1H +#undef RED2 +#undef RED2H +#undef CAR1 +#undef CAR1M + +#undef MK0 +#undef K0 + +TEXT ·p256Mul(SB), NOSPLIT, $0 + RET + +TEXT ·p256Sqr(SB), NOSPLIT, $0 + RET + +TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0 + RET + +TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0 + RET + +TEXT ·p256PointAddAsm(SB), NOSPLIT, $0 + RET + +TEXT ·p256PointDouble6TimesAsm(SB), NOSPLIT, $0 + RET + +#define res_ptr R1 +#define CPOOL R4 +#define T0 V0 +#define T1 V1 +#define T2 V2 +#define TT0 V3 +#define TT1 V4 +#define CAR1 V5 +#define CAR2 V6 +#define PL V7 +#define PH V8 + +//func p256OrdReduce(s *p256OrdElement) +TEXT ·p256OrdReduce(SB),NOSPLIT,$0 + MOVD res+0(FP), res_ptr + + VZERO T2 + MOVD $p256ord<>+0x00(SB), CPOOL + VL 16(CPOOL), PL + VL 0(CPOOL), PH + + VL (0*16)(res_ptr), T0 + VPDI $0x4, T0, T0, T0 + VL (1*16)(res_ptr), T1 + VPDI $0x4, T1, T1, T1 + + VSCBIQ PL, T0, CAR1 + VSQ PL, T0, TT0 + VSBCBIQ T1, PH, CAR1, CAR2 + VSBIQ T1, PH, CAR1, TT1 + VSBIQ T2, ZER, CAR2, T2 + + // what output to use, TT1||TT0 or T1||T0? + VSEL T0, TT0, T2, T0 + VSEL T1, TT1, T2, T1 + + VPDI $0x4, T0, T0, TT0 + VST TT0, (0*16)(res_ptr) + VPDI $0x4, T1, T1, TT1 + VST TT1, (1*16)(res_ptr) + + RET +#undef res_ptr +#undef CPOOL +#undef T0 +#undef T1 +#undef T2 +#undef TT0 +#undef TT1 +#undef CAR1 +#undef CAR2 +#undef PL +#undef PH diff --git a/internal/sm2ec/sm2p256_asm_s390x.go b/internal/sm2ec/sm2p256_asm_s390x.go new file mode 100644 index 0000000..e428acd --- /dev/null +++ b/internal/sm2ec/sm2p256_asm_s390x.go @@ -0,0 +1,55 @@ +//go:build !purego + +package sm2ec + + +// p256Element is a P-256 base field element in [0, P-1] in the Montgomery +// domain (with R 2²⁵⁶) as four limbs in little-endian order value. +type p256Element [4]uint64 + +// p256OrdElement is a P-256 scalar field element in [0, ord(G)-1] in the +// Montgomery domain (with R 2²⁵⁶) as four uint64 limbs in little-endian order. +type p256OrdElement [4]uint64 + +// Montgomery multiplication. Sets res = in1 * in2 * R⁻¹ mod p. +// +//go:noescape +func p256Mul(res, in1, in2 *p256Element) + +// Montgomery square, repeated n times (n >= 1). +// +//go:noescape +func p256Sqr(res, in *p256Element, n int) + +// Montgomery multiplication by R⁻¹, or 1 outside the domain. +// Sets res = in * R⁻¹, bringing res out of the Montgomery domain. +// +//go:noescape +func p256FromMont(res, in *p256Element) + +// If cond is not 0, sets val = -val mod p. +// +//go:noescape +func p256NegCond(val *p256Element, cond int) + +// If cond is 0, sets res = b, otherwise sets res = a. +// +//go:noescape +func p256MovCond(res, a, b *SM2P256Point, cond int) + +//go:noescape +func p256BigToLittle(res *p256Element, in *[32]byte) + +//go:noescape +func p256LittleToBig(res *[32]byte, in *p256Element) + +//go:noescape +func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte) + +//go:noescape +func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement) + +// p256OrdReduce ensures s is in the range [0, ord(G)-1]. +// +//go:noescape +func p256OrdReduce(s *p256OrdElement) diff --git a/internal/sm2ec/sm2p256_asm_s390x_test.go b/internal/sm2ec/sm2p256_asm_s390x_test.go new file mode 100644 index 0000000..c6ec3b9 --- /dev/null +++ b/internal/sm2ec/sm2p256_asm_s390x_test.go @@ -0,0 +1,82 @@ +//go:build s390x && !purego + +package sm2ec + +import ( + "math/big" + "testing" +) + +var bigOne = big.NewInt(1) + +// fromBig converts a *big.Int into a format used by this code. +func fromBig(out *[4]uint64, big *big.Int) { + for i := range out { + out[i] = 0 + } + + for i, v := range big.Bits() { + out[i] = uint64(v) + } +} + +func montFromBig(out *[4]uint64, n *big.Int) { + p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16) + r := new(big.Int).Lsh(bigOne, 256) + // out = big * R mod P + outBig := new(big.Int).Mul(n, r) + outBig.Mod(outBig, p) + fromBig(out, outBig) +} + +func toBigInt(in *p256Element) *big.Int { + var valBytes [32]byte + p256LittleToBig(&valBytes, in) + return new(big.Int).SetBytes(valBytes[:]) +} + +func ordElmToBigInt(in *p256OrdElement) *big.Int { + var valBytes [32]byte + p256OrdLittleToBig(&valBytes, in) + return new(big.Int).SetBytes(valBytes[:]) +} + +func testP256FromMont(v *big.Int, t *testing.T) { + val := new(p256Element) + montFromBig((*[4]uint64)(val), v) + res := new(p256Element) + p256FromMont(res, val) + if toBigInt(res).Cmp(v) != 0 { + t.Fatalf("p256FromMont failed for %v", v) + } +} + +func TestP256FromMont(t *testing.T) { + p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16) + for i := 0; i < 20; i++ { + bigVal := big.NewInt(int64(i)) + testP256FromMont(bigVal, t) + bigVal = new(big.Int).Sub(p, big.NewInt(int64(i))) + testP256FromMont(bigVal, t) + } +} + +func testP256OrderReduce(v *big.Int, t *testing.T) { + val := new(p256OrdElement) + montFromBig((*[4]uint64)(val), v) + p256OrdReduce(val) + if ordElmToBigInt(val).Cmp(v) != 0 { + t.Fatalf("p256OrdReduce failed for %v", v) + } +} + + +func TestP256OrderReduce(t *testing.T) { + p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16) + for i := 0; i < 20; i++ { + bigVal := big.NewInt(int64(i)) + testP256OrderReduce(bigVal, t) + bigVal = new(big.Int).Sub(p, big.NewInt(int64(i))) + testP256OrderReduce(bigVal, t) + } +}