From 187a4f7b803b2a7dcfff41dc2655a8517c2b5c7d Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Tue, 27 Aug 2024 15:14:15 +0800 Subject: [PATCH] internal/sm2ec: enable ppc64le step 1 --- internal/sm2ec/p256_asm_ppc64le.s | 1494 ++++++++++++++++++++ internal/sm2ec/sm2p256.go | 2 +- internal/sm2ec/sm2p256_asm.go | 2 +- internal/sm2ec/sm2p256_asm_ppc64le.go | 44 - internal/sm2ec/sm2p256_asm_ppc64le_test.go | 88 -- internal/sm2ec/sm2p256_asm_test.go | 2 +- 6 files changed, 1497 insertions(+), 135 deletions(-) delete mode 100644 internal/sm2ec/sm2p256_asm_ppc64le.go delete mode 100644 internal/sm2ec/sm2p256_asm_ppc64le_test.go diff --git a/internal/sm2ec/p256_asm_ppc64le.s b/internal/sm2ec/p256_asm_ppc64le.s index 896e68e..e77ba5f 100644 --- a/internal/sm2ec/p256_asm_ppc64le.s +++ b/internal/sm2ec/p256_asm_ppc64le.s @@ -700,3 +700,1497 @@ TEXT ·p256OrdReduce(SB),NOSPLIT,$0 #undef CAR2 #undef PL #undef PH + +// --------------------------------------- +// sm2p256MulInternal +// V0-V3 V30,V31 - Not Modified +// V4-V15 V27-V29 - Volatile + +#define CPOOL R7 + +// Parameters +#define X0 V0 // Not modified +#define X1 V1 // Not modified +#define Y0 V2 // Not modified +#define Y1 V3 // Not modified +#define T0 V4 // Result +#define T1 V5 // Result +#define P0 V30 // Not modified +#define P1 V31 // Not modified + +// Temporaries: lots of reused vector regs +#define YDIG V6 // Overloaded with CAR2 +#define ADD1H V7 // Overloaded with ADD3H +#define ADD2H V8 // Overloaded with ADD4H +#define ADD3 V9 // Overloaded with SEL2,SEL5 +#define ADD4 V10 // Overloaded with SEL3,SEL6 +#define RED1 V11 // Overloaded with CAR2 +#define RED2 V12 +#define RED3 V13 // Overloaded with SEL1 +#define T2 V14 +// Overloaded temporaries +#define ADD1 V4 // Overloaded with T0 +#define ADD2 V5 // Overloaded with T1 +#define ADD3H V7 // Overloaded with ADD1H +#define ADD4H V8 // Overloaded with ADD2H +#define ZER V28 // Overloaded with TMP1 +#define CAR1 V6 // Overloaded with YDIG +#define CAR2 V11 // Overloaded with RED1 +// Constant Selects +#define SEL1 V13 // Overloaded with RED3 +#define SEL2 V9 // Overloaded with ADD3,SEL5 +#define SEL3 V10 // Overloaded with ADD4,SEL6 +#define SEL4 V6 // Overloaded with YDIG,CAR1 +#define SEL5 V9 // Overloaded with ADD3,SEL2 +#define SEL6 V10 // Overloaded with ADD4,SEL3 + +// TMP1, TMP2 used in +// VMULT macros +#define TMP1 V13 // Overloaded with RED3 +#define TMP2 V27 +#define ONE V29 // 1s splatted by word + +TEXT sm2p256MulInternal<>(SB), NOSPLIT, $0-16 + // CPOOL loaded from caller + MOVD $16, R16 + MOVD $32, R17 + MOVD $48, R18 + + // --------------------------------------------------- + + VSPLTW $3, Y0, YDIG // VREPF Y0 is input + + // VMLHF X0, YDIG, ADD1H + // VMLHF X1, YDIG, ADD2H + // VMLF X0, YDIG, ADD1 + // VMLF X1, YDIG, ADD2 + // + VMULT(X0, YDIG, ADD1, ADD1H) + VMULT(X1, YDIG, ADD2, ADD2H) + + VSPLTISW $1, ONE + VSPLTW $2, Y0, YDIG // VREPF + + // VMALF X0, YDIG, ADD1H, ADD3 + // VMALF X1, YDIG, ADD2H, ADD4 + // VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free + // VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free + VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H) + VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H) + + LXVD2X (R17)(CPOOL), SEL1 + VSPLTISB $0, ZER // VZERO ZER + VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] + + VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB + VSLDOI $12, ZER, ADD2, T1 // ADD2 Free // VSLDB + + VADDCUQ T0, ADD3, CAR1 // VACCQ + VADDUQM T0, ADD3, T0 // ADD3 Free // VAQ + VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ + VADDEUQM T1, ADD4, CAR1, T1 // ADD4 Free // VACQ + + 
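// Fold the accumulated low digits back modulo p: the SM2 prime
+	// p = 2^256 - 2^224 - 2^96 + 2^64 - 1 is congruent to -1 mod 2^32,
+	// so the Montgomery quotient digit equals the digit itself, and d*p
+	// is assembled below from shifted copies of d (the bracketed word
+	// layouts) with no extra multiplies.
+	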
LXVD2X (R18)(CPOOL), SEL2 + VPERM RED3, T0, SEL2, RED1 // [ 0 0 d1 d0] + VSLDOI $4, RED1, ZER, RED3 // [ 0 d1 d0 0] + VSLDOI $4, RED3, ZER, RED2 // [d1 d0 0 0] + VSUBCUQ RED1, RED3, CAR1 + VSUBUQM RED1, RED3, RED1 + VSUBEUQM RED2, RED3, CAR1, RED2 // Guaranteed not to underflow + + VSLDOI $12, T1, T0, T0 // VSLDB + VSLDOI $12, T2, T1, T1 // VSLDB + + VADDCUQ T0, ADD3H, CAR1 // VACCQ + VADDUQM T0, ADD3H, T0 // VAQ + VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ + VADDEUQM T1, ADD4H, CAR1, T1 // VACQ + + VADDCUQ T0, RED1, CAR1 // VACCQ + VADDUQM T0, RED1, T0 // VAQ + VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ + VADDEUQM T1, RED2, CAR1, T1 // VACQ + VADDUQM T2, CAR2, T2 // VAQ + + // --------------------------------------------------- + + VSPLTW $1, Y0, YDIG // VREPF + + // VMALHF X0, YDIG, T0, ADD1H + // VMALHF X1, YDIG, T1, ADD2H + // VMALF X0, YDIG, T0, ADD1 // T0 Free->ADD1 + // VMALF X1, YDIG, T1, ADD2 // T1 Free->ADD2 + VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H) + VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H) + + VSPLTW $0, Y0, YDIG // VREPF + + // VMALF X0, YDIG, ADD1H, ADD3 + // VMALF X1, YDIG, ADD2H, ADD4 + // VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H + // VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER + VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H) + VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H) + + VSPLTISB $0, ZER // VZERO ZER + LXVD2X (R17)(CPOOL), SEL1 + VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] + + VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free->T0 // VSLDB + VSLDOI $12, T2, ADD2, T1 // ADD2 Free->T1, T2 Free // VSLDB + + VADDCUQ T0, ADD3, CAR1 // VACCQ + VADDUQM T0, ADD3, T0 // VAQ + VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ + VADDEUQM T1, ADD4, CAR1, T1 // VACQ + + LXVD2X (R18)(CPOOL), SEL2 + VPERM RED3, T0, SEL2, RED1 // [ 0 0 d1 d0] + VSLDOI $4, RED1, ZER, RED3 // [ 0 d1 d0 0] + VSLDOI $4, RED3, ZER, RED2 // [d1 d0 0 0] + VSUBCUQ RED1, RED3, CAR1 + VSUBUQM RED1, RED3, RED1 + VSUBEUQM RED2, RED3, CAR1, RED2 // Guaranteed not to underflow + + VSLDOI $12, T1, T0, T0 // VSLDB + VSLDOI $12, T2, T1, T1 // VSLDB + + VADDCUQ T0, ADD3H, CAR1 // VACCQ + VADDUQM T0, ADD3H, T0 // VAQ + VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ + VADDEUQM T1, ADD4H, CAR1, T1 // VACQ + + VADDCUQ T0, RED1, CAR1 // VACCQ + VADDUQM T0, RED1, T0 // VAQ + VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ + VADDEUQM T1, RED2, CAR1, T1 // VACQ + VADDUQM T2, CAR2, T2 // VAQ + // --------------------------------------------------- + + VSPLTW $3, Y1, YDIG // VREPF + + // VMALHF X0, YDIG, T0, ADD1H + // VMALHF X1, YDIG, T1, ADD2H + // VMALF X0, YDIG, T0, ADD1 + // VMALF X1, YDIG, T1, ADD2 + VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H) + VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H) + + VSPLTW $2, Y1, YDIG // VREPF + + // VMALF X0, YDIG, ADD1H, ADD3 + // VMALF X1, YDIG, ADD2H, ADD4 + // VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free + // VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free + VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H) + VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H) + + VSPLTISB $0, ZER // VZERO ZER + LXVD2X (R17)(CPOOL), SEL1 + VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] + + VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB + VSLDOI $12, T2, ADD2, T1 // ADD2 Free // VSLDB + + VADDCUQ T0, ADD3, CAR1 // VACCQ + VADDUQM T0, ADD3, T0 // VAQ + VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ + VADDEUQM T1, ADD4, CAR1, T1 // VACQ + + LXVD2X (R18)(CPOOL), SEL2 + VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0] + VSLDOI $4, RED1, ZER, RED3 // [ 0 d1 d0 0] + VSLDOI $4, RED3, ZER, RED2 // [d1 d0 0 0] + VSUBCUQ RED1, RED3, CAR1 + VSUBUQM RED1, RED3, RED1 + 
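// Propagate the borrow from the low half of the shifted-copy
+	// subtraction into the high half:
+	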
VSUBEUQM RED2, RED3, CAR1, RED2 // Guaranteed not to underflow + + VSLDOI $12, T1, T0, T0 // VSLDB + VSLDOI $12, T2, T1, T1 // VSLDB + + VADDCUQ T0, ADD3H, CAR1 // VACCQ + VADDUQM T0, ADD3H, T0 // VAQ + VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ + VADDEUQM T1, ADD4H, CAR1, T1 // VACQ + + VADDCUQ T0, RED1, CAR1 // VACCQ + VADDUQM T0, RED1, T0 // VAQ + VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ + VADDEUQM T1, RED2, CAR1, T1 // VACQ + VADDUQM T2, CAR2, T2 // VAQ + // --------------------------------------------------- + + VSPLTW $1, Y1, YDIG // VREPF + + // VMALHF X0, YDIG, T0, ADD1H + // VMALHF X1, YDIG, T1, ADD2H + // VMALF X0, YDIG, T0, ADD1 + // VMALF X1, YDIG, T1, ADD2 + VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H) + VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H) + + VSPLTW $0, Y1, YDIG // VREPF + + // VMALF X0, YDIG, ADD1H, ADD3 + // VMALF X1, YDIG, ADD2H, ADD4 + // VMALHF X0, YDIG, ADD1H, ADD3H + // VMALHF X1, YDIG, ADD2H, ADD4H + VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H) + VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H) + + VSPLTISB $0, ZER // VZERO ZER + LXVD2X (R17)(CPOOL), SEL1 + VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] + + VSLDOI $12, ADD2, ADD1, T0 // VSLDB + VSLDOI $12, T2, ADD2, T1 // VSLDB + + VADDCUQ T0, ADD3, CAR1 // VACCQ + VADDUQM T0, ADD3, T0 // VAQ + VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ + VADDEUQM T1, ADD4, CAR1, T1 // VACQ + + LXVD2X (R18)(CPOOL), SEL2 + VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0] + VSLDOI $4, RED1, ZER, RED3 // [ 0 d1 d0 0] + VSLDOI $4, RED3, ZER, RED2 // [d1 d0 0 0] + VSUBCUQ RED1, RED3, CAR1 + VSUBUQM RED1, RED3, RED1 + VSUBEUQM RED2, RED3, CAR1, RED2 // Guaranteed not to underflow + + VSLDOI $12, T1, T0, T0 // VSLDB + VSLDOI $12, T2, T1, T1 // VSLDB + + VADDCUQ T0, ADD3H, CAR1 // VACCQ + VADDUQM T0, ADD3H, T0 // VAQ + VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ + VADDEUQM T1, ADD4H, CAR1, T1 // VACQ + + VADDCUQ T0, RED1, CAR1 // VACCQ + VADDUQM T0, RED1, T0 // VAQ + VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ + VADDEUQM T1, RED2, CAR1, T1 // VACQ + VADDUQM T2, CAR2, T2 // VAQ + + // --------------------------------------------------- + + VSPLTISB $0, RED3 // VZERO RED3 + VSUBCUQ T0, P0, CAR1 // VSCBIQ + VSUBUQM T0, P0, ADD1H // VSQ + VSUBECUQ T1, P1, CAR1, CAR2 // VSBCBIQ + VSUBEUQM T1, P1, CAR1, ADD2H // VSBIQ + VSUBEUQM T2, RED3, CAR2, T2 // VSBIQ + + // what output to use, ADD2H||ADD1H or T1||T0? 
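+	// T2 now carries the borrow state of the conditional subtraction
+	// T - p above: the VSELs below select, in constant time, whichever
+	// of ADD2H||ADD1H (T-p) and T1||T0 (T) is the fully reduced result.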
+ VSEL ADD1H, T0, T2, T0 + VSEL ADD2H, T1, T2, T1 + RET + +#undef CPOOL + +#undef X0 +#undef X1 +#undef Y0 +#undef Y1 +#undef T0 +#undef T1 +#undef P0 +#undef P1 + +#undef SEL1 +#undef SEL2 +#undef SEL3 +#undef SEL4 +#undef SEL5 +#undef SEL6 + +#undef YDIG +#undef ADD1H +#undef ADD2H +#undef ADD3 +#undef ADD4 +#undef RED1 +#undef RED2 +#undef RED3 +#undef T2 +#undef ADD1 +#undef ADD2 +#undef ADD3H +#undef ADD4H +#undef ZER +#undef CAR1 +#undef CAR2 + +#undef TMP1 +#undef TMP2 + +#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \ + VSPLTISB $0, ZER \ // VZERO + VSUBCUQ X0, Y0, CAR1 \ + VSUBUQM X0, Y0, T0 \ + VSUBECUQ X1, Y1, CAR1, SEL1 \ + VSUBEUQM X1, Y1, CAR1, T1 \ + VSUBUQM ZER, SEL1, SEL1 \ // VSQ + \ + VADDCUQ T0, PL, CAR1 \ // VACCQ + VADDUQM T0, PL, TT0 \ // VAQ + VADDEUQM T1, PH, CAR1, TT1 \ // VACQ + \ + VSEL TT0, T0, SEL1, T0 \ + VSEL TT1, T1, SEL1, T1 \ + +#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \ + VADDCUQ X0, Y0, CAR1 \ + VADDUQM X0, Y0, T0 \ + VADDECUQ X1, Y1, CAR1, T2 \ // VACCCQ + VADDEUQM X1, Y1, CAR1, T1 \ + \ + VSPLTISB $0, ZER \ + VSUBCUQ T0, PL, CAR1 \ // VSCBIQ + VSUBUQM T0, PL, TT0 \ + VSUBECUQ T1, PH, CAR1, CAR2 \ // VSBCBIQ + VSUBEUQM T1, PH, CAR1, TT1 \ // VSBIQ + VSUBEUQM T2, ZER, CAR2, SEL1 \ + \ + VSEL TT0, T0, SEL1, T0 \ + VSEL TT1, T1, SEL1, T1 + +#define p256HalfInternal(T1, T0, X1, X0) \ + VSPLTISB $0, ZER \ + VSUBEUQM ZER, ZER, X0, SEL1 \ + \ + VADDCUQ X0, PL, CAR1 \ + VADDUQM X0, PL, T0 \ + VADDECUQ X1, PH, CAR1, T2 \ + VADDEUQM X1, PH, CAR1, T1 \ + \ + VSEL T0, X0, SEL1, T0 \ + VSEL T1, X1, SEL1, T1 \ + VSEL T2, ZER, SEL1, T2 \ + \ + VSLDOI $15, T2, ZER, TT1 \ + VSLDOI $15, T1, ZER, TT0 \ + VSPLTISB $1, SEL1 \ + VSR T0, SEL1, T0 \ // VSRL + VSR T1, SEL1, T1 \ + VSPLTISB $7, SEL1 \ // VREPIB + VSL TT0, SEL1, TT0 \ + VSL TT1, SEL1, TT1 \ + VOR T0, TT0, T0 \ + VOR T1, TT1, T1 + +#define res_ptr R3 +#define x_ptr R4 +#define y_ptr R5 +#define CPOOL R7 +#define TEMP R8 +#define N R9 + +// Parameters +#define X0 V0 +#define X1 V1 +#define Y0 V2 +#define Y1 V3 +#define T0 V4 +#define T1 V5 + +// Constants +#define P0 V30 +#define P1 V31 +// func p256MulAsm(res, in1, in2 *p256Element) +TEXT ·p256Mul(SB), NOSPLIT, $0-24 + MOVD res+0(FP), res_ptr + MOVD in1+8(FP), x_ptr + MOVD in2+16(FP), y_ptr + MOVD $16, R16 + MOVD $32, R17 + + MOVD $p256mul<>+0x00(SB), CPOOL + + + LXVD2X (R0)(x_ptr), X0 + LXVD2X (R16)(x_ptr), X1 + + XXPERMDI X0, X0, $2, X0 + XXPERMDI X1, X1, $2, X1 + + LXVD2X (R0)(y_ptr), Y0 + LXVD2X (R16)(y_ptr), Y1 + + XXPERMDI Y0, Y0, $2, Y0 + XXPERMDI Y1, Y1, $2, Y1 + + LXVD2X (R16)(CPOOL), P1 + LXVD2X (R0)(CPOOL), P0 + + CALL sm2p256MulInternal<>(SB) + + MOVD $p256mul<>+0x00(SB), CPOOL + + XXPERMDI T0, T0, $2, T0 + XXPERMDI T1, T1, $2, T1 + STXVD2X T0, (R0)(res_ptr) + STXVD2X T1, (R16)(res_ptr) + RET + +// func p256Sqr(res, in *p256Element, n int) +TEXT ·p256Sqr(SB), NOSPLIT, $0-24 + MOVD res+0(FP), res_ptr + MOVD in+8(FP), x_ptr + MOVD $16, R16 + MOVD $32, R17 + + MOVD $p256mul<>+0x00(SB), CPOOL + + LXVD2X (R0)(x_ptr), X0 + LXVD2X (R16)(x_ptr), X1 + + XXPERMDI X0, X0, $2, X0 + XXPERMDI X1, X1, $2, X1 + +sqrLoop: + // Sqr uses same value for both + + VOR X0, X0, Y0 + VOR X1, X1, Y1 + + LXVD2X (R16)(CPOOL), P1 + LXVD2X (R0)(CPOOL), P0 + + CALL sm2p256MulInternal<>(SB) + + MOVD n+16(FP), N + ADD $-1, N + CMP $0, N + BEQ done + MOVD N, n+16(FP) // Save counter to avoid clobber + VOR T0, T0, X0 + VOR T1, T1, X1 + BR sqrLoop + +done: + MOVD $p256mul<>+0x00(SB), CPOOL + + XXPERMDI T0, T0, $2, T0 + XXPERMDI T1, T1, $2, T1 + STXVD2X T0, (R0)(res_ptr) + 
STXVD2X T1, (R16)(res_ptr) + RET + +#undef res_ptr +#undef x_ptr +#undef y_ptr +#undef CPOOL + +#undef X0 +#undef X1 +#undef Y0 +#undef Y1 +#undef T0 +#undef T1 +#undef P0 +#undef P1 + +#define P3ptr R3 +#define P1ptr R4 +#define P2ptr R5 +#define CPOOL R7 + +// Temporaries in REGs +#define Y2L V15 +#define Y2H V16 +#define T1L V17 +#define T1H V18 +#define T2L V19 +#define T2H V20 +#define T3L V21 +#define T3H V22 +#define T4L V23 +#define T4H V24 + +// Temps for Sub and Add +#define TT0 V11 +#define TT1 V12 +#define T2 V13 + +// p256MulAsm Parameters +#define X0 V0 +#define X1 V1 +#define Y0 V2 +#define Y1 V3 +#define T0 V4 +#define T1 V5 + +#define PL V30 +#define PH V31 + +// Names for zero/sel selects +#define X1L V0 +#define X1H V1 +#define Y1L V2 // p256MulAsmParmY +#define Y1H V3 // p256MulAsmParmY +#define Z1L V4 +#define Z1H V5 +#define X2L V0 +#define X2H V1 +#define Z2L V4 +#define Z2H V5 +#define X3L V17 // T1L +#define X3H V18 // T1H +#define Y3L V21 // T3L +#define Y3H V22 // T3H +#define Z3L V25 +#define Z3H V26 + +#define ZER V6 +#define SEL1 V7 +#define CAR1 V8 +#define CAR2 V9 +/* * + * Three operand formula: + * Source: 2004 Hankerson–Menezes–Vanstone, page 91. + * T1 = Z1² + * T2 = T1*Z1 + * T1 = T1*X2 + * T2 = T2*Y2 + * T1 = T1-X1 + * T2 = T2-Y1 + * Z3 = Z1*T1 + * T3 = T1² + * T4 = T3*T1 + * T3 = T3*X1 + * T1 = 2*T3 + * X3 = T2² + * X3 = X3-T1 + * X3 = X3-T4 + * T3 = T3-X3 + * T3 = T3*T2 + * T4 = T4*Y1 + * Y3 = T3-T4 + + * Three operand formulas, but with MulInternal X,Y used to store temps +X=Z1; Y=Z1; MUL;T- // T1 = Z1² T1 +X=T ; Y- ; MUL;T2=T // T2 = T1*Z1 T1 T2 +X- ; Y=X2; MUL;T1=T // T1 = T1*X2 T1 T2 +X=T2; Y=Y2; MUL;T- // T2 = T2*Y2 T1 T2 +SUB(T2+0x00(SB), CPOOL + + MOVD $16, R16 + MOVD $32, R17 + MOVD $48, R18 + MOVD $64, R19 + MOVD $80, R20 + MOVD $96, R21 + MOVD $112, R22 + MOVD $128, R23 + MOVD $144, R24 + MOVD $160, R25 + MOVD $104, R26 // offset of sign+24(FP) + + LXVD2X (R16)(CPOOL), PH + LXVD2X (R0)(CPOOL), PL + + LXVD2X (R17)(P2ptr), Y2L + LXVD2X (R18)(P2ptr), Y2H + XXPERMDI Y2H, Y2H, $2, Y2H + XXPERMDI Y2L, Y2L, $2, Y2L + + // Equivalent of VLREPG sign+24(FP), SEL1 + LXVDSX (R1)(R26), SEL1 + VSPLTISB $0, ZER + VCMPEQUD SEL1, ZER, SEL1 + + VSUBCUQ PL, Y2L, CAR1 + VSUBUQM PL, Y2L, T1L + VSUBEUQM PH, Y2H, CAR1, T1H + + VSEL T1L, Y2L, SEL1, Y2L + VSEL T1H, Y2H, SEL1, Y2H + +/* * + * Three operand formula: + * Source: 2004 Hankerson–Menezes–Vanstone, page 91. 
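+ * (Same step schedule as the list earlier in this file; each MUL
+ * below is a call to sm2p256MulInternal with X,Y as operands and
+ * T as the result.)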
+ */ + // X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1 + LXVD2X (R19)(P1ptr), X0 // Z1H + LXVD2X (R20)(P1ptr), X1 // Z1L + XXPERMDI X0, X0, $2, X0 + XXPERMDI X1, X1, $2, X1 + VOR X0, X0, Y0 + VOR X1, X1, Y1 + CALL sm2p256MulInternal<>(SB) + + // X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2 + VOR T0, T0, X0 + VOR T1, T1, X1 + CALL sm2p256MulInternal<>(SB) + VOR T0, T0, T2L + VOR T1, T1, T2H + + // X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2 + MOVD in2+16(FP), P2ptr + LXVD2X (R0)(P2ptr), Y0 // X2H + LXVD2X (R16)(P2ptr), Y1 // X2L + XXPERMDI Y0, Y0, $2, Y0 + XXPERMDI Y1, Y1, $2, Y1 + CALL sm2p256MulInternal<>(SB) + VOR T0, T0, T1L + VOR T1, T1, T1H + + // X=T2; Y=Y2; MUL; T- // T2 = T2*Y2 T1 T2 + VOR T2L, T2L, X0 + VOR T2H, T2H, X1 + VOR Y2L, Y2L, Y0 + VOR Y2H, Y2H, Y1 + CALL sm2p256MulInternal<>(SB) + + // SUB(T2(SB) + + VOR T0, T0, Z3L + VOR T1, T1, Z3H + + // X=Y; Y- ; MUL; X=T // T3 = T1*T1 T2 + VOR Y0, Y0, X0 + VOR Y1, Y1, X1 + CALL sm2p256MulInternal<>(SB) + VOR T0, T0, X0 + VOR T1, T1, X1 + + // X- ; Y- ; MUL; T4=T // T4 = T3*T1 T2 T4 + CALL sm2p256MulInternal<>(SB) + VOR T0, T0, T4L + VOR T1, T1, T4H + + // X- ; Y=X1; MUL; T3=T // T3 = T3*X1 T2 T3 T4 + MOVD in1+8(FP), P1ptr + LXVD2X (R0)(P1ptr), Y0 // X1H + LXVD2X (R16)(P1ptr), Y1 // X1L + XXPERMDI Y1, Y1, $2, Y1 + XXPERMDI Y0, Y0, $2, Y0 + CALL sm2p256MulInternal<>(SB) + VOR T0, T0, T3L + VOR T1, T1, T3H + + // ADD(T1(SB) + + // SUB(T(SB) + VOR T0, T0, T3L + VOR T1, T1, T3H + + // X=T4; Y=Y1; MUL; T- // T4 = T4*Y1 T3 T4 + VOR T4L, T4L, X0 + VOR T4H, T4H, X1 + MOVD in1+8(FP), P1ptr + LXVD2X (R17)(P1ptr), Y0 // Y1H + LXVD2X (R18)(P1ptr), Y1 // Y1L + XXPERMDI Y0, Y0, $2, Y0 + XXPERMDI Y1, Y1, $2, Y1 + CALL sm2p256MulInternal<>(SB) + + // SUB(T(SB) \ + \ + \// SUB(X(SB) \ + \ + \ // ADD(T2(SB) \ + \ + \ // Leave T0, T1 as is. 
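+	\ // Store the freshly computed Z3 to the Z coordinate of P3 in
+	\ // memory doubleword order via TT0/TT1.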
+ XXPERMDI T0, T0, $2, TT0 \ + XXPERMDI T1, T1, $2, TT1 \ + STXVD2X TT0, (R19)(P3ptr) \ + STXVD2X TT1, (R20)(P3ptr) \ + \ + \ // X- ; Y=X ; MUL; T- // Y3 = Y3² + VOR X0, X0, Y0 \ + VOR X1, X1, Y1 \ + CALL sm2p256MulInternal<>(SB) \ + \ + \ // X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1 + VOR T0, T0, X0 \ + VOR T1, T1, X1 \ + LXVD2X (R0)(P1ptr), Y0 \ + LXVD2X (R16)(P1ptr), Y1 \ + XXPERMDI Y0, Y0, $2, Y0 \ + XXPERMDI Y1, Y1, $2, Y1 \ + CALL sm2p256MulInternal<>(SB) \ + VOR T0, T0, T3L \ + VOR T1, T1, T3H \ + \ + \ // X- ; Y=X ; MUL; T- // Y3 = Y3² + VOR X0, X0, Y0 \ + VOR X1, X1, Y1 \ + CALL sm2p256MulInternal<>(SB) \ + \ + \ // HAL(Y3(SB) \ + \ + \ // ADD(T1(SB) \ + \ + \ // SUB(Y3+0x00(SB), CPOOL + + MOVD $16, R16 + MOVD $32, R17 + MOVD $48, R18 + MOVD $64, R19 + MOVD $80, R20 + + LXVD2X (R16)(CPOOL), PH + LXVD2X (R0)(CPOOL), PL + + p256PointDoubleRound(P1ptr, P3ptr) + RET + +TEXT ·p256PointDouble6TimesAsm(SB), NOSPLIT, $0-16 + MOVD res+0(FP), P3ptr + MOVD in+8(FP), P1ptr + + MOVD $p256mul<>+0x00(SB), CPOOL + + MOVD $16, R16 + MOVD $32, R17 + MOVD $48, R18 + MOVD $64, R19 + MOVD $80, R20 + + LXVD2X (R16)(CPOOL), PH + LXVD2X (R0)(CPOOL), PL + + p256PointDoubleRound(P1ptr, P3ptr) + p256PointDoubleRound(P3ptr, P3ptr) + p256PointDoubleRound(P3ptr, P3ptr) + p256PointDoubleRound(P3ptr, P3ptr) + p256PointDoubleRound(P3ptr, P3ptr) + p256PointDoubleRound(P3ptr, P3ptr) + RET + +#undef P3ptr +#undef P1ptr +#undef CPOOL +#undef X3L +#undef X3H +#undef Y3L +#undef Y3H +#undef T1L +#undef T1H +#undef T2L +#undef T2H +#undef T3L +#undef T3H +#undef X1L +#undef X1H +#undef Y1L +#undef Y1H +#undef Z1L +#undef Z1H +#undef TT0 +#undef TT1 +#undef T2 +#undef X0 +#undef X1 +#undef Y0 +#undef Y1 +#undef T0 +#undef T1 +#undef PL +#undef PH +#undef Z3L +#undef Z3H +#undef ZER +#undef SEL1 +#undef CAR1 +#undef CAR2 + +#define P3ptr R3 +#define P1ptr R4 +#define P2ptr R5 +#define CPOOL R7 +#define TRUE R14 +#define RES1 R9 +#define RES2 R10 + +// Temporaries in REGs +#define T1L V16 +#define T1H V17 +#define T2L V18 +#define T2H V19 +#define U1L V20 +#define U1H V21 +#define S1L V22 +#define S1H V23 +#define HL V24 +#define HH V25 +#define RL V26 +#define RH V27 + +// Temps for Sub and Add +#define ZER V6 +#define SEL1 V7 +#define CAR1 V8 +#define CAR2 V9 +#define TT0 V11 +#define TT1 V12 +#define T2 V13 + +// p256MulAsm Parameters +#define X0 V0 +#define X1 V1 +#define Y0 V2 +#define Y1 V3 +#define T0 V4 +#define T1 V5 + +#define PL V30 +#define PH V31 +/* + * https://choucroutage.com/Papers/SideChannelAttacks/ctrsa-2011-brown.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields" + * + * A = X₁×Z₂² + * B = Y₁×Z₂³ + * C = X₂×Z₁²-A + * D = Y₂×Z₁³-B + * X₃ = D² - 2A×C² - C³ + * Y₃ = D×(A×C² - X₃) - B×C³ + * Z₃ = Z₁×Z₂×C + * + * Three-operand formula (adopted): http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2 + * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R + * + * T1 = Z1*Z1 + * T2 = Z2*Z2 + * U1 = X1*T2 + * H = X2*T1 + * H = H-U1 + * Z3 = Z1*Z2 + * Z3 = Z3*H << store-out Z3 result reg.. 
could override Z1, if slices have same backing array + * + * S1 = Z2*T2 + * S1 = Y1*S1 + * R = Z1*T1 + * R = Y2*R + * R = R-S1 + * + * T1 = H*H + * T2 = H*T1 + * U1 = U1*T1 + * + * X3 = R*R + * X3 = X3-T2 + * T1 = 2*U1 + * X3 = X3-T1 << store-out X3 result reg + * + * T2 = S1*T2 + * Y3 = U1-X3 + * Y3 = R*Y3 + * Y3 = Y3-T2 << store-out Y3 result reg + + // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1 + // X- ; Y=T ; MUL; R=T // R = Z1*T1 + // X=X2; Y- ; MUL; H=T // H = X2*T1 + // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2 + // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2 + // X=X1; Y- ; MUL; U1=T // U1 = X1*T2 + // SUB(H+0x00(SB), CPOOL + MOVD $16, R16 + MOVD $32, R17 + MOVD $48, R18 + MOVD $64, R19 + MOVD $80, R20 + + LXVD2X (R16)(CPOOL), PH + LXVD2X (R0)(CPOOL), PL + + // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1 + LXVD2X (R19)(P1ptr), X0 // Z1L + LXVD2X (R20)(P1ptr), X1 // Z1H + XXPERMDI X0, X0, $2, X0 + XXPERMDI X1, X1, $2, X1 + VOR X0, X0, Y0 + VOR X1, X1, Y1 + CALL sm2p256MulInternal<>(SB) + + // X- ; Y=T ; MUL; R=T // R = Z1*T1 + VOR T0, T0, Y0 + VOR T1, T1, Y1 + CALL sm2p256MulInternal<>(SB) + VOR T0, T0, RL // SAVE: RL + VOR T1, T1, RH // SAVE: RH + + STXVD2X RH, (R1)(R17) // V27 has to be saved + + // X=X2; Y- ; MUL; H=T // H = X2*T1 + MOVD in2+16(FP), P2ptr + LXVD2X (R0)(P2ptr), X0 // X2L + LXVD2X (R16)(P2ptr), X1 // X2H + XXPERMDI X0, X0, $2, X0 + XXPERMDI X1, X1, $2, X1 + CALL sm2p256MulInternal<>(SB) + VOR T0, T0, HL // SAVE: HL + VOR T1, T1, HH // SAVE: HH + + // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2 + MOVD in2+16(FP), P2ptr + LXVD2X (R19)(P2ptr), X0 // Z2L + LXVD2X (R20)(P2ptr), X1 // Z2H + XXPERMDI X0, X0, $2, X0 + XXPERMDI X1, X1, $2, X1 + VOR X0, X0, Y0 + VOR X1, X1, Y1 + CALL sm2p256MulInternal<>(SB) + + // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2 + VOR T0, T0, Y0 + VOR T1, T1, Y1 + CALL sm2p256MulInternal<>(SB) + VOR T0, T0, S1L // SAVE: S1L + VOR T1, T1, S1H // SAVE: S1H + + // X=X1; Y- ; MUL; U1=T // U1 = X1*T2 + MOVD in1+8(FP), P1ptr + LXVD2X (R0)(P1ptr), X0 // X1L + LXVD2X (R16)(P1ptr), X1 // X1H + XXPERMDI X0, X0, $2, X0 + XXPERMDI X1, X1, $2, X1 + CALL sm2p256MulInternal<>(SB) + VOR T0, T0, U1L // SAVE: U1L + VOR T1, T1, U1H // SAVE: U1H + + // SUB(H(SB) + + // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H + VOR T0, T0, X0 + VOR T1, T1, X1 + VOR HL, HL, Y0 + VOR HH, HH, Y1 + CALL sm2p256MulInternal<>(SB) + MOVD res+0(FP), P3ptr + XXPERMDI T1, T1, $2, TT1 + XXPERMDI T0, T0, $2, TT0 + STXVD2X TT0, (R19)(P3ptr) + STXVD2X TT1, (R20)(P3ptr) + + // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1 + MOVD in1+8(FP), P1ptr + LXVD2X (R17)(P1ptr), X0 + LXVD2X (R18)(P1ptr), X1 + XXPERMDI X0, X0, $2, X0 + XXPERMDI X1, X1, $2, X1 + VOR S1L, S1L, Y0 + VOR S1H, S1H, Y1 + CALL sm2p256MulInternal<>(SB) + VOR T0, T0, S1L + VOR T1, T1, S1H + + // X=Y2; Y=R ; MUL; T- // R = Y2*R + MOVD in2+16(FP), P2ptr + LXVD2X (R17)(P2ptr), X0 + LXVD2X (R18)(P2ptr), X1 + XXPERMDI X0, X0, $2, X0 + XXPERMDI X1, X1, $2, X1 + VOR RL, RL, Y0 + + // VOR RH, RH, Y1 RH was saved above in D2X format + LXVD2X (R1)(R17), Y1 + CALL sm2p256MulInternal<>(SB) + + // SUB(R(SB) + + // X- ; Y=T ; MUL; T2=T // T2 = H*T1 + VOR T0, T0, Y0 + VOR T1, T1, Y1 + CALL sm2p256MulInternal<>(SB) + VOR T0, T0, T2L + VOR T1, T1, T2H + + // X=U1; Y- ; MUL; U1=T // U1 = U1*T1 + VOR U1L, U1L, X0 + VOR U1H, U1H, X1 + CALL sm2p256MulInternal<>(SB) + VOR T0, T0, U1L + VOR T1, T1, U1H + + // X=R ; Y=R ; MUL; T- // X3 = R*R + VOR RL, RL, X0 + + // VOR RH, RH, X1 + VOR RL, RL, Y0 + + // RH was saved above using STXVD2X + LXVD2X (R1)(R17), X1 + VOR X1, X1, Y1 + + // VOR RH, RH, Y1 + CALL 
sm2p256MulInternal<>(SB)
+
+	// SUB(T<T-T2)            // X3 = X3-T2
+	p256SubInternal(T1,T0,T1,T0,T2H,T2L)
+
+	// ADD(X<U1+U1)           // T1 = 2*U1
+	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)
+
+	// SUB(T<T-X) X3:=T       // X3 = X3-T1 << store-out X3 result reg
+	p256SubInternal(T1,T0,T1,T0,X1,X0)
+	MOVD res+0(FP), P3ptr
+	XXPERMDI T1, T1, $2, TT1
+	XXPERMDI T0, T0, $2, TT0
+	STXVD2X TT0, (R0)(P3ptr)
+	STXVD2X TT1, (R16)(P3ptr)
+
+	// SUB(Y<U1-T) Y3:=Y3     // Y3 = U1-X3
+	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)
+
+	// X=R ; Y- ; MUL; U1=T   // Y3 = Y3*R
+	VOR RL, RL, X0
+
+	// RH was saved above in D2X format
+	LXVD2X (R1)(R17), X1
+	CALL sm2p256MulInternal<>(SB)
+	VOR T0, T0, U1L
+	VOR T1, T1, U1H
+
+	// X=S1; Y=T2; MUL; T-    // T2 = S1*T2
+	VOR S1L, S1L, X0
+	VOR S1H, S1H, X1
+	VOR T2L, T2L, Y0
+	VOR T2H, T2H, Y1
+	CALL sm2p256MulInternal<>(SB)
+
+	// SUB(T