From 68d387a75b096e2bad4ae5fde29aa5366a5b3526 Mon Sep 17 00:00:00 2001
From: Sun Yimin
Date: Mon, 26 Aug 2024 10:53:41 +0800
Subject: [PATCH] internal/sm2ec: s390x p256Mul

---
 internal/sm2ec/p256_asm_s390x.s          | 1397 +++++++++++++++++++++-
 internal/sm2ec/sm2p256_asm_s390x_test.go |  103 ++
 2 files changed, 1484 insertions(+), 16 deletions(-)

diff --git a/internal/sm2ec/p256_asm_s390x.s b/internal/sm2ec/p256_asm_s390x.s
index 48948ad..065aed0 100644
--- a/internal/sm2ec/p256_asm_s390x.s
+++ b/internal/sm2ec/p256_asm_s390x.s
@@ -28,24 +28,16 @@
 DATA p256mul<>+0x10(SB)/8, $0xffffffff00000000 // P256
 DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
 DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0
 DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0
-DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0 0 d1 d0
-DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0 0 d1 d0
-DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL 0 d1 d0 d1
-DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL 0 d1 d0 d1
-DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL 0 0 d1 d0
-DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0
-DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
-DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
-DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0 d1 d0 0
-DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0 d1 d0 0
-DATA p256mul<>+0x80(SB)/8, $0x0000000100000000 // (1*2^256)%P256
-DATA p256mul<>+0x88(SB)/8, $0x0000000000000000 // (1*2^256)%P256
-DATA p256mul<>+0x90(SB)/8, $0x00000000ffffffff // (1*2^256)%P256
-DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256
+DATA p256mul<>+0x30(SB)/8, $0x0405060708090a0b // SEL 0 0 d1 d0
+DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0
+DATA p256mul<>+0x40(SB)/8, $0x0000000100000000 // (1*2^256)%P256
+DATA p256mul<>+0x48(SB)/8, $0x0000000000000000 // (1*2^256)%P256
+DATA p256mul<>+0x50(SB)/8, $0x00000000ffffffff // (1*2^256)%P256
+DATA p256mul<>+0x58(SB)/8, $0x0000000000000001 // (1*2^256)%P256
 GLOBL p256ordK0<>(SB), 8, $4
 GLOBL p256ord<>(SB), 8, $32
 GLOBL p256<>(SB), 8, $64
-GLOBL p256mul<>(SB), 8, $160
+GLOBL p256mul<>(SB), 8, $96

 // func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
 TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0
@@ -492,7 +484,7 @@ TEXT ·p256SelectAffine(SB), NOSPLIT, $0
 	VREPIB $1, ONE
 	VREPIB $1, SEL2
 	MOVD   $1, COUNT
-	VL     80(CPOOL), LE2BE
+	VL     48(CPOOL), LE2BE

 	VZERO X1H
 	VZERO X1L
@@ -1038,19 +1030,1392 @@ loop:
 #undef T1
 #undef K0
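Editor's note on the constant-pool hunk above: only two VPERM selectors plus the Montgomery constant (1*2^256)%P256 survive, so p256mul<> shrinks from 160 to 96 bytes, and the loads below use the new decimal offsets 32 and 48 (0x20 and 0x30). For readers without a z/Architecture manual at hand, here is a host-side Go model of how a selector such as the new +0x30 entry picks bytes. This snippet is illustrative and not part of the patch; it follows the usual mod-32 VPERM indexing over the concatenated operands.

```go
package main

import "fmt"

// vperm mimics the z/Architecture VPERM instruction: each selector byte
// 0x00-0x0f picks a byte of a, 0x10-0x1f picks a byte of b (indices mod 32).
func vperm(a, b, sel [16]byte) (r [16]byte) {
	for i, s := range sel {
		if s&0x1f < 16 {
			r[i] = a[s&0x0f]
		} else {
			r[i] = b[s&0x0f]
		}
	}
	return
}

func main() {
	// The new p256mul<>+0x30 selector: bytes 04..0b pull words 1-2 of the
	// first operand, 1c..1f pulls word 3 of the second, 0c..0f word 3 of
	// the first.
	sel := [16]byte{0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
		0x1c, 0x1d, 0x1e, 0x1f, 0x0c, 0x0d, 0x0e, 0x0f}
	var a, b [16]byte
	for i := range a {
		a[i], b[i] = byte(i), byte(0xb0+i) // distinct markers per source byte
	}
	fmt.Printf("% x\n", vperm(a, b, sel))
}
```

The output word layout is [a.w1, a.w2, b.w3, a.w3], which matches the "SEL 0 0 d1 d0" annotation once the first operand holds [d0 0 0 d0] and the second is a product row.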
+// ---------------------------------------
+// sm2p256MulInternal
+// V0-V3,V30,V31 - Not Modified
+// V4-V15 - Volatile
+
+#define CPOOL R4
+
+// Parameters
+#define X0 V0 // Not modified
+#define X1 V1 // Not modified
+#define Y0 V2 // Not modified
+#define Y1 V3 // Not modified
+#define T0 V4
+#define T1 V5
+#define P0 V30 // Not modified
+#define P1 V31 // Not modified
+
+// Temporaries
+#define YDIG V6 // Overloaded with CAR2, ZER
+#define ADD1H V7 // Overloaded with ADD3H
+#define ADD2H V8 // Overloaded with ADD4H
+#define ADD3 V9 // Overloaded with SEL2,SEL5
+#define ADD4 V10 // Overloaded with SEL3,SEL6
+#define RED1 V11 // Overloaded with CAR2
+#define RED2 V12
+#define RED3 V13 // Overloaded with SEL1
+#define T2 V14
+// Overloaded temporaries
+#define ADD1 V4 // Overloaded with T0
+#define ADD2 V5 // Overloaded with T1
+#define ADD3H V7 // Overloaded with ADD1H
+#define ADD4H V8 // Overloaded with ADD2H
+#define ZER V6 // Overloaded with YDIG, CAR2
+#define CAR1 V6 // Overloaded with YDIG, ZER
+#define CAR2 V11 // Overloaded with RED1
+// Constant Selects
+#define SEL1 V13 // Overloaded with RED3
+#define SEL2 V9 // Overloaded with ADD3,SEL5
+#define SEL3 V10 // Overloaded with ADD4,SEL6
+#define SEL4 V6 // Overloaded with YDIG,CAR2,ZER
+#define SEL5 V9 // Overloaded with ADD3,SEL2
+#define SEL6 V10 // Overloaded with ADD4,SEL3
+
+TEXT sm2p256MulInternal<>(SB), NOSPLIT, $0-0
+	// ---------------------------------------------------
+
+	VREPF $3, Y0, YDIG
+	VMLHF X0, YDIG, ADD1H
+	VMLHF X1, YDIG, ADD2H
+	VMLF  X0, YDIG, ADD1
+	VMLF  X1, YDIG, ADD2
+
+	VREPF  $2, Y0, YDIG
+	VMALF  X0, YDIG, ADD1H, ADD3
+	VMALF  X1, YDIG, ADD2H, ADD4
+	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
+	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
+
+	VZERO ZER
+	VL    32(CPOOL), SEL1
+	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
+
+	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
+	VSLDB $12, ZER, ADD2, T1  // ADD2 Free
+
+	VACCQ  T0, ADD3, CAR1
+	VAQ    T0, ADD3, T0 // ADD3 Free
+	VACCCQ T1, ADD4, CAR1, T2
+	VACQ   T1, ADD4, CAR1, T1 // ADD4 Free
+
+	VL     48(CPOOL), SEL2
+	VPERM  RED3, T0, SEL2, RED1 // [ 0 0 d1 d0]
+	VSLDB  $4, RED1, ZER, RED3  // [ 0 d1 d0 0]
+	VSLDB  $4, RED3, ZER, RED2  // [d1 d0 0 0]
+	VSCBIQ RED3, RED1, CAR1
+	VSQ    RED3, RED1, RED1
+	VSBIQ  RED2, RED3, CAR1, RED2 // Guaranteed not to underflow
+
+	VSLDB $12, T1, T0, T0
+	VSLDB $12, T2, T1, T1
+
+	VACCQ  T0, ADD3H, CAR1
+	VAQ    T0, ADD3H, T0
+	VACCCQ T1, ADD4H, CAR1, T2
+	VACQ   T1, ADD4H, CAR1, T1
+
+	VACCQ  T0, RED1, CAR1
+	VAQ    T0, RED1, T0
+	VACCCQ T1, RED2, CAR1, CAR2
+	VACQ   T1, RED2, CAR1, T1
+	VAQ    T2, CAR2, T2
+	// ---------------------------------------------------
+
+	VREPF  $1, Y0, YDIG
+	VMALHF X0, YDIG, T0, ADD1H
+	VMALHF X1, YDIG, T1, ADD2H
+	VMALF  X0, YDIG, T0, ADD1 // T0 Free->ADD1
+	VMALF  X1, YDIG, T1, ADD2 // T1 Free->ADD2
+
+	VREPF  $0, Y0, YDIG
+	VMALF  X0, YDIG, ADD1H, ADD3
+	VMALF  X1, YDIG, ADD2H, ADD4
+	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
+	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
+
+	VZERO ZER
+	VL    32(CPOOL), SEL1
+	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
+
+	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0
+	VSLDB $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free
+
+	VACCQ  T0, ADD3, CAR1
+	VAQ    T0, ADD3, T0
+	VACCCQ T1, ADD4, CAR1, T2
+	VACQ   T1, ADD4, CAR1, T1
+
+	VL     48(CPOOL), SEL2
+	VPERM  RED3, T0, SEL2, RED1 // [ 0 0 d1 d0]
+	VSLDB  $4, RED1, ZER, RED3  // [ 0 d1 d0 0]
+	VSLDB  $4, RED3, ZER, RED2  // [d1 d0 0 0]
+	VSCBIQ RED3, RED1, CAR1
+	VSQ    RED3, RED1, RED1
+	VSBIQ  RED2, RED3, CAR1, RED2 // Guaranteed not to underflow
+
+	VSLDB $12, T1, T0, T0
+	VSLDB $12, T2, T1, T1
+
+	VACCQ  T0, ADD3H, CAR1
+	VAQ    T0, ADD3H, T0
+	VACCCQ T1, ADD4H, CAR1, T2
+	VACQ   T1, ADD4H, CAR1, T1
+
+	VACCQ  T0, RED1, CAR1
+	VAQ    T0, RED1, T0
+	VACCCQ T1, RED2, CAR1, CAR2
+	VACQ   T1, RED2, CAR1, T1
+	VAQ    T2, CAR2, T2
+	// ---------------------------------------------------
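Each half-pass above broadcasts one 32-bit digit of Y with VREPF and folds X times that digit into the running accumulator, with VMLF/VMALF producing the low 32 bits of each lane product and VMLHF/VMALHF the high 32 bits. On scalars the lane arithmetic reduces to the sketch below; it assumes VMALHF folds the 32-bit addend in before taking the high half, which is what the carry bookkeeping in these passes relies on. Not part of the patch.

```go
package main

import "fmt"

// mulAddLane models one VMALF/VMALHF lane pair: multiply two 32-bit lanes,
// add a 32-bit accumulator lane, and split the exact 64-bit result into
// its low and high halves.
func mulAddLane(x, d, acc uint32) (lo, hi uint32) {
	full := uint64(x)*uint64(d) + uint64(acc)
	return uint32(full), uint32(full >> 32)
}

func main() {
	// Worst case: 0xffffffff^2 + 0xffffffff still fits in 64 bits.
	lo, hi := mulAddLane(0xffffffff, 0xffffffff, 0xffffffff)
	fmt.Printf("lo=%#x hi=%#x\n", lo, hi) // lo=0x0 hi=0xffffffff
}
```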
+
+	VREPF  $3, Y1, YDIG
+	VMALHF X0, YDIG, T0, ADD1H
+	VMALHF X1, YDIG, T1, ADD2H
+	VMALF  X0, YDIG, T0, ADD1
+	VMALF  X1, YDIG, T1, ADD2
+
+	VREPF  $2, Y1, YDIG
+	VMALF  X0, YDIG, ADD1H, ADD3
+	VMALF  X1, YDIG, ADD2H, ADD4
+	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
+	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
+
+	VZERO ZER
+	VL    32(CPOOL), SEL1
+	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
+
+	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
+	VSLDB $12, T2, ADD2, T1   // ADD2 Free
+
+	VACCQ  T0, ADD3, CAR1
+	VAQ    T0, ADD3, T0
+	VACCCQ T1, ADD4, CAR1, T2
+	VACQ   T1, ADD4, CAR1, T1
+
+	VL     48(CPOOL), SEL2
+	VPERM  RED3, T0, SEL2, RED1 // [ 0 0 d1 d0]
+	VSLDB  $4, RED1, ZER, RED3  // [ 0 d1 d0 0]
+	VSLDB  $4, RED3, ZER, RED2  // [d1 d0 0 0]
+	VSCBIQ RED3, RED1, CAR1
+	VSQ    RED3, RED1, RED1
+	VSBIQ  RED2, RED3, CAR1, RED2 // Guaranteed not to underflow
+
+	VSLDB $12, T1, T0, T0
+	VSLDB $12, T2, T1, T1
+
+	VACCQ  T0, ADD3H, CAR1
+	VAQ    T0, ADD3H, T0
+	VACCCQ T1, ADD4H, CAR1, T2
+	VACQ   T1, ADD4H, CAR1, T1
+
+	VACCQ  T0, RED1, CAR1
+	VAQ    T0, RED1, T0
+	VACCCQ T1, RED2, CAR1, CAR2
+	VACQ   T1, RED2, CAR1, T1
+	VAQ    T2, CAR2, T2
+	// ---------------------------------------------------
+
+	VREPF  $1, Y1, YDIG
+	VMALHF X0, YDIG, T0, ADD1H
+	VMALHF X1, YDIG, T1, ADD2H
+	VMALF  X0, YDIG, T0, ADD1
+	VMALF  X1, YDIG, T1, ADD2
+
+	VREPF  $0, Y1, YDIG
+	VMALF  X0, YDIG, ADD1H, ADD3
+	VMALF  X1, YDIG, ADD2H, ADD4
+	VMALHF X0, YDIG, ADD1H, ADD3H
+	VMALHF X1, YDIG, ADD2H, ADD4H
+
+	VZERO ZER
+	VL    32(CPOOL), SEL1
+	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
+
+	VSLDB $12, ADD2, ADD1, T0
+	VSLDB $12, T2, ADD2, T1
+
+	VACCQ  T0, ADD3, CAR1
+	VAQ    T0, ADD3, T0
+	VACCCQ T1, ADD4, CAR1, T2
+	VACQ   T1, ADD4, CAR1, T1
+
+	VL     48(CPOOL), SEL2
+	VPERM  RED3, T0, SEL2, RED1 // [ 0 0 d1 d0]
+	VSLDB  $4, RED1, ZER, RED3  // [ 0 d1 d0 0]
+	VSLDB  $4, RED3, ZER, RED2  // [d1 d0 0 0]
+	VSCBIQ RED3, RED1, CAR1
+	VSQ    RED3, RED1, RED1
+	VSBIQ  RED2, RED3, CAR1, RED2 // Guaranteed not to underflow
+
+	VSLDB $12, T1, T0, T0
+	VSLDB $12, T2, T1, T1
+
+	VACCQ  T0, ADD3H, CAR1
+	VAQ    T0, ADD3H, T0
+	VACCCQ T1, ADD4H, CAR1, T2
+	VACQ   T1, ADD4H, CAR1, T1
+
+	VACCQ  T0, RED1, CAR1
+	VAQ    T0, RED1, T0
+	VACCCQ T1, RED2, CAR1, CAR2
+	VACQ   T1, RED2, CAR1, T1
+	VAQ    T2, CAR2, T2
+
+	// ---------------------------------------------------
+
+	VZERO   RED3
+	VSCBIQ  P0, T0, CAR1
+	VSQ     P0, T0, ADD1H
+	VSBCBIQ T1, P1, CAR1, CAR2
+	VSBIQ   T1, P1, CAR1, ADD2H
+	VSBIQ   T2, RED3, CAR2, T2
+
+	// what output to use, ADD2H||ADD1H or T1||T0?
+	VSEL T0, ADD1H, T2, T0
+	VSEL T1, ADD2H, T2, T1
+	RET
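Taken together, the four passes interleave the digit products of X and Y with per-digit reduction folds, and the tail above conditionally subtracts p once. If this routine keeps the contract of the NIST P-256 p256MulInternal it is patterned on (and the "(1*2^256)%P256" pool constant suggests it does), the value left in T1||T0 is X*Y*2^-256 mod p. A math/big model of that assumed contract, useful for generating test vectors:

```go
package sm2ref

import "math/big"

// sm2P is the SM2 prime, matching the P256 constants in the pool above.
var sm2P, _ = new(big.Int).SetString(
	"FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)

// montMul models sm2p256MulInternal's assumed contract: the Montgomery
// product a*b*2^-256 mod p. This is an assumption carried over from the
// upstream NIST P-256 s390x code, not a line of the patch.
func montMul(a, b *big.Int) *big.Int {
	rInv := new(big.Int).ModInverse(new(big.Int).Lsh(big.NewInt(1), 256), sm2P)
	t := new(big.Int).Mul(a, b)
	t.Mul(t, rInv)
	return t.Mod(t, sm2P)
}
```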
+
+#undef CPOOL
+
+#undef X0
+#undef X1
+#undef Y0
+#undef Y1
+#undef T0
+#undef T1
+#undef P0
+#undef P1
+
+#undef SEL1
+#undef SEL2
+#undef SEL3
+#undef SEL4
+#undef SEL5
+#undef SEL6
+
+#undef YDIG
+#undef ADD1H
+#undef ADD2H
+#undef ADD3
+#undef ADD4
+#undef RED1
+#undef RED2
+#undef RED3
+#undef T2
+#undef ADD1
+#undef ADD2
+#undef ADD3H
+#undef ADD4H
+#undef ZER
+#undef CAR1
+#undef CAR2
+
+// ---------------------------------------
+
+// Parameters
+#define X0 V0
+#define X1 V1
+#define Y0 V2
+#define Y1 V3
+
+TEXT sm2p256SqrInternal<>(SB), NOFRAME|NOSPLIT, $0
+	VLR X0, Y0
+	VLR X1, Y1
+	BR  sm2p256MulInternal<>(SB)
+
+#undef X0
+#undef X1
+#undef Y0
+#undef Y1
+
+#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
+	VZERO   ZER                \
+	VSCBIQ  Y0, X0, CAR1       \
+	VSQ     Y0, X0, T0         \
+	VSBCBIQ X1, Y1, CAR1, SEL1 \
+	VSBIQ   X1, Y1, CAR1, T1   \
+	VSQ     SEL1, ZER, SEL1    \
+	                           \
+	VACCQ   T0, PL, CAR1       \
+	VAQ     T0, PL, TT0        \
+	VACQ    T1, PH, CAR1, TT1  \
+	                           \
+	VSEL    T0, TT0, SEL1, T0  \
+	VSEL    T1, TT1, SEL1, T1  \
+
+#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
+	VACCQ   X0, Y0, CAR1        \
+	VAQ     X0, Y0, T0          \
+	VACCCQ  X1, Y1, CAR1, T2    \
+	VACQ    X1, Y1, CAR1, T1    \
+	                            \
+	VZERO   ZER                 \
+	VSCBIQ  PL, T0, CAR1        \
+	VSQ     PL, T0, TT0         \
+	VSBCBIQ T1, PH, CAR1, CAR2  \
+	VSBIQ   T1, PH, CAR1, TT1   \
+	VSBIQ   T2, ZER, CAR2, SEL1 \
+	                            \
+	VSEL    T0, TT0, SEL1, T0   \
+	VSEL    T1, TT1, SEL1, T1
+
+#define p256HalfInternal(T1, T0, X1, X0) \
+	VZERO  ZER                \
+	VSBIQ  ZER, ZER, X0, SEL1 \
+	                          \
+	VACCQ  X0, PL, CAR1       \
+	VAQ    X0, PL, T0         \
+	VACCCQ X1, PH, CAR1, T2   \
+	VACQ   X1, PH, CAR1, T1   \
+	                          \
+	VSEL   X0, T0, SEL1, T0   \
+	VSEL   X1, T1, SEL1, T1   \
+	VSEL   ZER, T2, SEL1, T2  \
+	                          \
+	VSLDB  $15, T2, ZER, TT1  \
+	VSLDB  $15, T1, ZER, TT0  \
+	VREPIB $1, SEL1           \
+	VSRL   SEL1, T0, T0       \
+	VSRL   SEL1, T1, T1       \
+	VREPIB $7, SEL1           \
+	VSL    SEL1, TT0, TT0     \
+	VSL    SEL1, TT1, TT1     \
+	VO     T0, TT0, T0        \
+	VO     T1, TT1, T1
+
+// ---------------------------------------
+// func p256Mul(res, in1, in2 *p256Element)
+#define res_ptr R1
+#define x_ptr R2
+#define y_ptr R3
+#define CPOOL R4
+
+// Parameters
+#define X0 V0
+#define X1 V1
+#define Y0 V2
+#define Y1 V3
+#define T0 V4
+#define T1 V5
+
+// Constants
+#define P0 V30
+#define P1 V31
 TEXT ·p256Mul(SB), NOSPLIT, $0
+	MOVD res+0(FP), res_ptr
+	MOVD in1+8(FP), x_ptr
+	MOVD in2+16(FP), y_ptr
+
+	VL   (0*16)(x_ptr), X0
+	VPDI $0x4, X0, X0, X0
+	VL   (1*16)(x_ptr), X1
+	VPDI $0x4, X1, X1, X1
+	VL   (0*16)(y_ptr), Y0
+	VPDI $0x4, Y0, Y0, Y0
+	VL   (1*16)(y_ptr), Y1
+	VPDI $0x4, Y1, Y1, Y1
+
+	MOVD $p256mul<>+0x00(SB), CPOOL
+	VL   16(CPOOL), P0
+	VL   0(CPOOL), P1
+
+	CALL sm2p256MulInternal<>(SB)
+
+	VPDI $0x4, T0, T0, T0
+	VST  T0, (0*16)(res_ptr)
+	VPDI $0x4, T1, T1, T1
+	VST  T1, (1*16)(res_ptr)
 	RET
+
+#undef res_ptr
+#undef x_ptr
+#undef y_ptr
+#undef CPOOL
+
+#undef X0
+#undef X1
+#undef Y0
+#undef Y1
+#undef T0
+#undef T1
+#undef P0
+#undef P1
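p256SubInternal above is the usual borrow-then-select pattern: subtract, speculatively add p back, and keep whichever result did not underflow. The same algebra on scalar 64-bit limbs, for checking the macro off-box (a constant-time sketch, not the vector code itself):

```go
package sm2ref

import "math/bits"

// pLimbs is the SM2 prime in little-endian 64-bit limbs.
var pLimbs = [4]uint64{
	0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000,
	0xFFFFFFFFFFFFFFFF, 0xFFFFFFFEFFFFFFFF,
}

// subModP computes x-y mod p: a full subtraction, then p is added back
// under a mask derived from the final borrow, mirroring the
// VSCBIQ/VSQ ... VACCQ/VAQ ... VSEL sequence of p256SubInternal.
func subModP(x, y [4]uint64) (d [4]uint64) {
	var b uint64
	d[0], b = bits.Sub64(x[0], y[0], 0)
	d[1], b = bits.Sub64(x[1], y[1], b)
	d[2], b = bits.Sub64(x[2], y[2], b)
	d[3], b = bits.Sub64(x[3], y[3], b)
	mask := -b // all-ones iff the subtraction borrowed
	var c uint64
	d[0], c = bits.Add64(d[0], pLimbs[0]&mask, 0)
	d[1], c = bits.Add64(d[1], pLimbs[1]&mask, c)
	d[2], c = bits.Add64(d[2], pLimbs[2]&mask, c)
	d[3], _ = bits.Add64(d[3], pLimbs[3]&mask, c)
	return
}
```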
+
+// ---------------------------------------
+// func p256Sqr(res, in *p256Element, n int)
+#define res_ptr R1
+#define x_ptr R2
+#define y_ptr R3
+#define CPOOL R4
+#define COUNT R5
+#define N R6
+
+// Parameters
+#define X0 V0
+#define X1 V1
+#define T0 V4
+#define T1 V5
+
+// Constants
+#define P0 V30
+#define P1 V31
 TEXT ·p256Sqr(SB), NOSPLIT, $0
+	MOVD res+0(FP), res_ptr
+	MOVD in+8(FP), x_ptr
+
+	VL   (0*16)(x_ptr), X0
+	VPDI $0x4, X0, X0, X0
+	VL   (1*16)(x_ptr), X1
+	VPDI $0x4, X1, X1, X1
+
+	MOVD $p256mul<>+0x00(SB), CPOOL
+	MOVD $0, COUNT
+	MOVD n+16(FP), N
+	VL   16(CPOOL), P0
+	VL   0(CPOOL), P1
+
+loop:
+	CALL sm2p256SqrInternal<>(SB)
+	VLR  T0, X0
+	VLR  T1, X1
+	ADDW $1, COUNT
+	CMPW COUNT, N
+	BLT  loop
+
+	VPDI $0x4, T0, T0, T0
+	VST  T0, (0*16)(res_ptr)
+	VPDI $0x4, T1, T1, T1
+	VST  T1, (1*16)(res_ptr)
 	RET
+
+#undef res_ptr
+#undef x_ptr
+#undef y_ptr
+#undef CPOOL
+#undef COUNT
+#undef N
+
+#undef X0
+#undef X1
+#undef T0
+#undef T1
+#undef P0
+#undef P1
+
+// Point add with P2 being affine point
+// If sign == 1 -> P2 = -P2
+// If sel == 0 -> P3 = P1
+// if zero == 0 -> P3 = P2
+// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
+#define P3ptr R1
+#define P1ptr R2
+#define P2ptr R3
+#define CPOOL R4
+
+// Temporaries in REGs
+#define Y2L V15
+#define Y2H V16
+#define T1L V17
+#define T1H V18
+#define T2L V19
+#define T2H V20
+#define T3L V21
+#define T3H V22
+#define T4L V23
+#define T4H V24
+
+// Temps for Sub and Add
+#define TT0 V11
+#define TT1 V12
+#define T2 V13
+
+// p256MulAsm Parameters
+#define X0 V0
+#define X1 V1
+#define Y0 V2
+#define Y1 V3
+#define T0 V4
+#define T1 V5
+
+#define PL V30
+#define PH V31
+
+// Names for zero/sel selects
+#define X1L V0
+#define X1H V1
+#define Y1L V2 // p256MulAsmParmY
+#define Y1H V3 // p256MulAsmParmY
+#define Z1L V4
+#define Z1H V5
+#define X2L V0
+#define X2H V1
+#define Z2L V4
+#define Z2H V5
+#define X3L V17 // T1L
+#define X3H V18 // T1H
+#define Y3L V21 // T3L
+#define Y3H V22 // T3H
+#define Z3L V28
+#define Z3H V29
+
+#define ZER V6
+#define SEL1 V7
+#define CAR1 V8
+#define CAR2 V9
+/* *
+ * Three operand formula:
+ * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
+ * T1 = Z1²
+ * T2 = T1*Z1
+ * T1 = T1*X2
+ * T2 = T2*Y2
+ * T1 = T1-X1
+ * T2 = T2-Y1
+ * Z3 = Z1*T1
+ * T3 = T1²
+ * T4 = T3*T1
+ * T3 = T3*X1
+ * T1 = 2*T3
+ * X3 = T2²
+ * X3 = X3-T1
+ * X3 = X3-T4
+ * T3 = T3-X3
+ * T3 = T3*T2
+ * T4 = T4*Y1
+ * Y3 = T3-T4
+
+ * Three operand formulas, but with MulInternal X,Y used to store temps
+X=Z1; Y=Z1; MUL;T-   // T1 = Z1²      T1
+X=T ; Y- ;  MUL;T2=T // T2 = T1*Z1    T1   T2
+X- ;  Y=X2; MUL;T1=T // T1 = T1*X2    T1   T2
+X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2
+SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
+SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
+X=Z1; Y- ;  MUL;Z3:=T// Z3 = Z1*T1         T2
+X=Y;  Y- ;  MUL;X=T  // T3 = T1*T1         T2
+X- ;  Y- ;  MUL;T4=T // T4 = T3*T1         T2        T4
+X- ;  Y=X1; MUL;T3=T // T3 = T3*X1         T2   T3   T4
+ADD(T1<T+T)          // T1 = 2*T3     T1   T2   T3   T4
+X=T2; Y- ;  MUL;T-   // X3 = T2*T2    T1   T2   T3   T4
+SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4
+SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
+SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
+X- ;  Y- ;  MUL;T3=T // T3 = T3*T2         T2   T3   T4
+X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1              T3   T4
+SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4
+ */
 TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
+	MOVD res+0(FP), P3ptr
+	MOVD in1+8(FP), P1ptr
+	MOVD in2+16(FP), P2ptr
+
+	MOVD $p256mul<>+0x00(SB), CPOOL
+	VL   16(CPOOL), PL
+	VL   0(CPOOL), PH
+
+	// if (sign == 1) {
+	//     Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2 = P-Y2
+	// }
+
+	VL   48(P2ptr), Y2H
+	VPDI $0x4, Y2H, Y2H, Y2H
+	VL   32(P2ptr), Y2L
+	VPDI $0x4, Y2L, Y2L, Y2L
+
+	VLREPG sign+24(FP), SEL1
+	VZERO  ZER
+	VCEQG  SEL1, ZER, SEL1
+
+	VSCBIQ Y2L, PL, CAR1
+	VSQ    Y2L, PL, T1L
+	VSBIQ  PH, Y2H, CAR1, T1H
+
+	VSEL Y2L, T1L, SEL1, Y2L
+	VSEL Y2H, T1H, SEL1, Y2H
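The function now walks the quoted Hankerson-Menezes-Vanstone formula one MUL/SUB at a time, with Y2 already conditionally negated by the block just above. A direct math/big transcription of the same formula is handy for generating p256PointAddAffineAsm test vectors. Variable-time reference only; sm2P and the mod helper are local to this sketch, and the Montgomery-domain detail is deliberately ignored here.

```go
package sm2ref

import "math/big"

var sm2P, _ = new(big.Int).SetString(
	"FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)

// addAffine transcribes the comment's three-operand formula: (x1,y1,z1) is
// a Jacobian point, (x2,y2) an affine point (y2 already negated if sign==1).
func addAffine(x1, y1, z1, x2, y2 *big.Int) (x3, y3, z3 *big.Int) {
	mod := func(v *big.Int) *big.Int { return v.Mod(v, sm2P) }
	t1 := mod(new(big.Int).Mul(z1, z1)) // T1 = Z1²
	t2 := mod(new(big.Int).Mul(t1, z1)) // T2 = T1*Z1
	t1 = mod(t1.Mul(t1, x2))            // T1 = T1*X2
	t2 = mod(t2.Mul(t2, y2))            // T2 = T2*Y2
	t1 = mod(t1.Sub(t1, x1))            // T1 = T1-X1
	t2 = mod(t2.Sub(t2, y1))            // T2 = T2-Y1
	z3 = mod(new(big.Int).Mul(z1, t1))  // Z3 = Z1*T1
	t3 := mod(new(big.Int).Mul(t1, t1)) // T3 = T1²
	t4 := mod(new(big.Int).Mul(t3, t1)) // T4 = T3*T1
	t3 = mod(t3.Mul(t3, x1))            // T3 = T3*X1
	t1 = mod(new(big.Int).Lsh(t3, 1))   // T1 = 2*T3
	x3 = mod(new(big.Int).Mul(t2, t2))  // X3 = T2²
	x3 = mod(x3.Sub(x3, t1))            // X3 = X3-T1
	x3 = mod(x3.Sub(x3, t4))            // X3 = X3-T4
	t3 = mod(t3.Sub(t3, x3))            // T3 = T3-X3
	t3 = mod(t3.Mul(t3, t2))            // T3 = T3*T2
	t4 = mod(t4.Mul(t4, y1))            // T4 = T4*Y1
	y3 = mod(new(big.Int).Sub(t3, t4))  // Y3 = T3-T4
	return
}
```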
+
+/* *
+ * Three operand formula:
+ * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
+ */
+	// X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1
+	VL   80(P1ptr), X1 // Z1H
+	VPDI $0x4, X1, X1, X1
+	VL   64(P1ptr), X0 // Z1L
+	VPDI $0x4, X0, X0, X0
+	VLR  X0, Y0
+	VLR  X1, Y1
+	CALL sm2p256SqrInternal<>(SB)
+
+	// X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2
+	VLR  T0, X0
+	VLR  T1, X1
+	CALL sm2p256MulInternal<>(SB)
+	VLR  T0, T2L
+	VLR  T1, T2H
+
+	// X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2
+	VL   16(P2ptr), Y1 // X2H
+	VPDI $0x4, Y1, Y1, Y1
+	VL   0(P2ptr), Y0 // X2L
+	VPDI $0x4, Y0, Y0, Y0
+	CALL sm2p256MulInternal<>(SB)
+	VLR  T0, T1L
+	VLR  T1, T1H
+
+	// X=T2; Y=Y2; MUL; T- // T2 = T2*Y2 T1 T2
+	VLR  T2L, X0
+	VLR  T2H, X1
+	VLR  Y2L, Y0
+	VLR  Y2H, Y1
+	CALL sm2p256MulInternal<>(SB)
+
+	// SUB(T2(SB)
+
+	// VST T1, 64(P3ptr)
+	// VST T0, 80(P3ptr)
+	VLR T0, Z3L
+	VLR T1, Z3H
+
+	// X=Y; Y- ; MUL; X=T // T3 = T1*T1 T2
+	VLR  Y0, X0
+	VLR  Y1, X1
+	CALL sm2p256SqrInternal<>(SB)
+	VLR  T0, X0
+	VLR  T1, X1
+
+	// X- ; Y- ; MUL; T4=T // T4 = T3*T1 T2 T4
+	CALL sm2p256MulInternal<>(SB)
+	VLR  T0, T4L
+	VLR  T1, T4H
+
+	// X- ; Y=X1; MUL; T3=T // T3 = T3*X1 T2 T3 T4
+	VL   16(P1ptr), Y1 // X1H
+	VPDI $0x4, Y1, Y1, Y1
+	VL   0(P1ptr), Y0 // X1L
+	VPDI $0x4, Y0, Y0, Y0
+	CALL sm2p256MulInternal<>(SB)
+	VLR  T0, T3L
+	VLR  T1, T3H
+
+	// ADD(T1(SB)
+
+	// SUB(T(SB)
+	VLR T0, T3L
+	VLR T1, T3H
+
+	// X=T4; Y=Y1; MUL; T- // T4 = T4*Y1 T3 T4
+	VLR  T4L, X0
+	VLR  T4H, X1
+	VL   48(P1ptr), Y1 // Y1H
+	VPDI $0x4, Y1, Y1, Y1
+	VL   32(P1ptr), Y0 // Y1L
+	VPDI $0x4, Y0, Y0, Y0
+	CALL sm2p256MulInternal<>(SB)
+
+	// SUB(T+0x00(SB), CPOOL
+	VL   16(CPOOL), PL
+	VL   0(CPOOL), PH
+
+	// X=Z1; Y=Z1; MUL; T- // T1 = Z1²
+	VL   80(P1ptr), X1 // Z1H
+	VPDI $0x4, X1, X1, X1
+	VL   64(P1ptr), X0 // Z1L
+	VPDI $0x4, X0, X0, X0
+	VLR  X0, Y0
+	VLR  X1, Y1
+	CALL sm2p256SqrInternal<>(SB)
+
+	// SUB(X(SB)
+
+	// ADD(T2(SB)
+	VPDI $0x4, T1, T1, TT1
+	VST  TT1, 80(P3ptr)
+	VPDI $0x4, T0, T0, TT0
+	VST  TT0, 64(P3ptr)
+
+	// X- ; Y=X ; MUL; T- // Y3 = Y3²
+	VLR  X0, Y0
+	VLR  X1, Y1
+	CALL sm2p256SqrInternal<>(SB)
+
+	// X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1
+	VLR  T0, X0
+	VLR  T1, X1
+	VL   16(P1ptr), Y1
+	VPDI $0x4, Y1, Y1, Y1
+	VL   0(P1ptr), Y0
+	VPDI $0x4, Y0, Y0, Y0
+	CALL sm2p256MulInternal<>(SB)
+	VLR  T0, T3L
+	VLR  T1, T3H
+
+	// X- ; Y=X ; MUL; T- // Y3 = Y3²
+	VLR  X0, Y0
+	VLR  X1, Y1
+	CALL sm2p256SqrInternal<>(SB)
+
+	// HAL(Y3(SB)
+
+	// ADD(T1(SB)
+
+	// SUB(Y3+0x00(SB), CPOOL
+	VL   16(CPOOL), PL
+	VL   0(CPOOL), PH
+
+	// X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
+	VL   80(P1ptr), X1 // Z1H
+	VPDI $0x4, X1, X1, X1
+	VL   64(P1ptr), X0 // Z1L
+	VPDI $0x4, X0, X0, X0
+	VLR  X0, Y0
+	VLR  X1, Y1
+	CALL sm2p256SqrInternal<>(SB)
+
+	// X- ; Y=T ; MUL; R=T // R = Z1*T1
+	VLR  T0, Y0
+	VLR  T1, Y1
+	CALL sm2p256MulInternal<>(SB)
+	VLR  T0, RL
+	VLR  T1, RH
+
+	// X=X2; Y- ; MUL; H=T // H = X2*T1
+	VL   16(P2ptr), X1 // X2H
+	VPDI $0x4, X1, X1, X1
+	VL   0(P2ptr), X0 // X2L
+	VPDI $0x4, X0, X0, X0
+	CALL sm2p256MulInternal<>(SB)
+	VLR  T0, HL
+	VLR  T1, HH
+
+	// X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
+	VL   80(P2ptr), X1 // Z2H
+	VPDI $0x4, X1, X1, X1
+	VL   64(P2ptr), X0 // Z2L
+	VPDI $0x4, X0, X0, X0
+	VLR  X0, Y0
+	VLR  X1, Y1
+	CALL sm2p256SqrInternal<>(SB)
+
+	// X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
+	VLR  T0, Y0
+	VLR  T1, Y1
+	CALL sm2p256MulInternal<>(SB)
+	VLR  T0, S1L
+	VLR  T1, S1H
+
+	// X=X1; Y- ; MUL; U1=T // U1 = X1*T2
+	VL   16(P1ptr), X1 // X1H
+	VPDI $0x4, X1, X1, X1
+	VL   0(P1ptr), X0 // X1L
+	VPDI $0x4, X0, X0, X0
+	CALL sm2p256MulInternal<>(SB)
+	VLR  T0, U1L
+	VLR  T1, U1H
+
+	// SUB(H(SB)
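From T1 = Z1*Z1 down to U1 = X1*T2 this is the standard Jacobian point addition; the bodies of several SUB(...) steps were lost from this copy of the hunk, but the surviving step comments fix the intent. A variable-time reference with the same U1/S1/H/R names, assuming the standard formula (the H == 0 / R == 0 doubling escape that similar code carries is omitted here):

```go
package sm2ref

import "math/big"

var sm2P, _ = new(big.Int).SetString(
	"FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)

// pointAdd adds two Jacobian points using the textbook formula the step
// comments above describe. Reference only; not constant-time.
func pointAdd(x1, y1, z1, x2, y2, z2 *big.Int) (x3, y3, z3 *big.Int) {
	mod := func(v *big.Int) *big.Int { return v.Mod(v, sm2P) }
	t1 := mod(new(big.Int).Mul(z1, z1)) // T1 = Z1*Z1
	r := mod(new(big.Int).Mul(z1, t1))  // R  = Z1*T1 = Z1³
	h := mod(new(big.Int).Mul(x2, t1))  // H  = X2*T1 = X2*Z1²
	t2 := mod(new(big.Int).Mul(z2, z2)) // T2 = Z2*Z2
	s1 := mod(new(big.Int).Mul(z2, t2)) // S1 = Z2*T2 = Z2³
	u1 := mod(new(big.Int).Mul(x1, t2)) // U1 = X1*T2 = X1*Z2²
	h = mod(h.Sub(h, u1))               // H  = H-U1   (the lost SUB(H...) step)
	z3 = mod(new(big.Int).Mul(z1, z2))
	z3 = mod(z3.Mul(z3, h))             // Z3 = Z1*Z2*H
	s1 = mod(s1.Mul(s1, y1))            // S1 = Y1*S1
	r = mod(r.Mul(r, y2))               // R  = Y2*R
	r = mod(r.Sub(r, s1))               // R  = R-S1   (the lost SUB(R...) step)
	hh := mod(new(big.Int).Mul(h, h))   // T1 = H*H
	t2 = mod(new(big.Int).Mul(h, hh))   // T2 = H*T1 = H³
	u1 = mod(u1.Mul(u1, hh))            // U1 = U1*T1 = U1*H²
	x3 = mod(new(big.Int).Mul(r, r))    // X3 = R*R
	x3 = mod(x3.Sub(x3, t2))            // X3 = X3-H³
	x3 = mod(x3.Sub(x3, new(big.Int).Lsh(u1, 1))) // X3 = X3-2*U1*H²
	y3 = mod(new(big.Int).Sub(u1, x3))  // Y3 = U1*H²-X3
	y3 = mod(y3.Mul(y3, r))             // Y3 = R*(U1*H²-X3)
	s1 = mod(s1.Mul(s1, t2))            // S1 = S1*H³
	y3 = mod(y3.Sub(y3, s1))            // Y3 = Y3-S1*H³
	return
}
```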
+
+	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
+	VLR  T0, X0
+	VLR  T1, X1
+	VLR  HL, Y0
+	VLR  HH, Y1
+	CALL sm2p256MulInternal<>(SB)
+	VPDI $0x4, T1, T1, TT1
+	VST  TT1, 80(P3ptr)
+	VPDI $0x4, T0, T0, TT0
+	VST  TT0, 64(P3ptr)
+
+	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
+	VL   48(P1ptr), X1
+	VPDI $0x4, X1, X1, X1
+	VL   32(P1ptr), X0
+	VPDI $0x4, X0, X0, X0
+	VLR  S1L, Y0
+	VLR  S1H, Y1
+	CALL sm2p256MulInternal<>(SB)
+	VLR  T0, S1L
+	VLR  T1, S1H
+
+	// X=Y2; Y=R ; MUL; T- // R = Y2*R
+	VL   48(P2ptr), X1
+	VPDI $0x4, X1, X1, X1
+	VL   32(P2ptr), X0
+	VPDI $0x4, X0, X0, X0
+	VLR  RL, Y0
+	VLR  RH, Y1
+	CALL sm2p256MulInternal<>(SB)
+
+	// SUB(R(SB)
+
+	// X- ; Y=T ; MUL; T2=T // T2 = H*T1
+	VLR  T0, Y0
+	VLR  T1, Y1
+	CALL sm2p256MulInternal<>(SB)
+	VLR  T0, T2L
+	VLR  T1, T2H
+
+	// X=U1; Y- ; MUL; U1=T // U1 = U1*T1
+	VLR  U1L, X0
+	VLR  U1H, X1
+	CALL sm2p256MulInternal<>(SB)
+	VLR  T0, U1L
+	VLR  T1, U1H
+
+	// X=R ; Y=R ; MUL; T- // X3 = R*R
+	VLR  RL, X0
+	VLR  RH, X1
+	VLR  RL, Y0
+	VLR  RH, Y1
+	CALL sm2p256SqrInternal<>(SB)
+
+	// SUB(T(SB)
+	VLR T0, U1L
+	VLR T1, U1H
+
+	// X=S1; Y=T2; MUL; T- // T2 = S1*T2
+	VLR  S1L, X0
+	VLR  S1H, X1
+	VLR  T2L, Y0
+	VLR  T2H, Y1
+	CALL sm2p256MulInternal<>(SB)
+
+	// SUB(T
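The patch text is cut off above, before the hunk for internal/sm2ec/sm2p256_asm_s390x_test.go that the diffstat lists. A cross-check in its spirit might look like the sketch below; the build tags, the p256Element layout ([4]uint64, little-endian limbs), and p256Mul's Montgomery contract are all assumptions, not lines from the patch.

```go
//go:build s390x && !purego

package sm2ec

import (
	"math/big"
	"testing"
)

var testP, _ = new(big.Int).SetString(
	"FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)

// toBig/fromBig assume p256Element is [4]uint64 in little-endian limb order.
func toBig(e *p256Element) *big.Int {
	r := new(big.Int)
	for i := 3; i >= 0; i-- {
		r.Lsh(r, 64)
		r.Or(r, new(big.Int).SetUint64(e[i]))
	}
	return r
}

func fromBig(b *big.Int) (e p256Element) {
	t := new(big.Int).Set(b)
	mask := new(big.Int).SetUint64(^uint64(0))
	for i := 0; i < 4; i++ {
		e[i] = new(big.Int).And(t, mask).Uint64()
		t.Rsh(t, 64)
	}
	return
}

func TestP256MulMontgomery(t *testing.T) {
	a := big.NewInt(0xdeadbeef)
	b := big.NewInt(0x12345678)
	ae, be := fromBig(a), fromBig(b)
	var res p256Element
	p256Mul(&res, &ae, &be)

	// Assumed contract: res = a*b*2^-256 mod p.
	rInv := new(big.Int).ModInverse(new(big.Int).Lsh(big.NewInt(1), 256), testP)
	want := new(big.Int).Mul(a, b)
	want.Mul(want, rInv).Mod(want, testP)
	if toBig(&res).Cmp(want) != 0 {
		t.Fatalf("p256Mul: got %x, want %x", toBig(&res), want)
	}
}
```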