gmsm/internal/sm2ec/p256_asm_s390x.s

1113 lines
22 KiB
ArmAsm
Raw Normal View History

2024-08-23 14:42:49 +08:00
// This is a port of the NIST P256 s390x asm implementation to SM2 P256.
//
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !purego
#include "textflag.h"
#include "go_asm.h"
// Constant pools.
//
// Multi-doubleword values are stored most-significant doubleword first, so
// the routines below load the high 128 bits from offset 0 and the low 128
// bits from offset 16 of each 256-bit value.

// Montgomery multiplier for the group order; p256OrdMul and p256OrdSqr load
// these four bytes into K0.
// Presumably -1/ord mod 2^32 — re-derive if p256ord<> is ever changed.
DATA p256ordK0<>+0x00(SB)/4, $0x72350975

// Group order: the modulus used by p256OrdMul, p256OrdSqr and p256OrdReduce.
DATA p256ord<>+0x00(SB)/8, $0xfffffffeffffffff
DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x10(SB)/8, $0x7203df6b21c6052b
DATA p256ord<>+0x18(SB)/8, $0x53bbf40939d54123

// Field prime (0x00-0x1f) followed by two 16-byte VPERM control vectors.
// The whole symbol is 64 bytes; nothing may be read at offset 0x40 or above.
DATA p256<>+0x00(SB)/8, $0xfffffffeffffffff // P256
DATA p256<>+0x08(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x10(SB)/8, $0xffffffff00000000 // P256
DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x20(SB)/8, $0x0000000000000000 // SEL 0 0 d1 d0
DATA p256<>+0x28(SB)/8, $0x18191a1b1c1d1e1f // SEL 0 0 d1 d0
DATA p256<>+0x30(SB)/8, $0x0706050403020100 // LE2BE permute mask
DATA p256<>+0x38(SB)/8, $0x0f0e0d0c0b0a0908 // LE2BE permute mask

// Field prime, VPERM control vectors and 2^256 mod P256.
// NOTE(review): the "SEL ..." annotations were carried over with the port and
// have not been re-checked against the byte patterns — confirm before use.
DATA p256mul<>+0x00(SB)/8, $0xfffffffeffffffff // P256
DATA p256mul<>+0x08(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x10(SB)/8, $0xffffffff00000000 // P256
DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0
DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0
DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0 0 d1 d0
DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0 0 d1 d0
DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL 0 d1 d0 d1
DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL 0 d1 d0 d1
DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL 0 0 d1 d0
DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0
DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0 d1 d0 0
DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0 d1 d0 0
DATA p256mul<>+0x80(SB)/8, $0x0000000100000000 // (1*2^256)%P256
DATA p256mul<>+0x88(SB)/8, $0x0000000000000000 // (1*2^256)%P256
DATA p256mul<>+0x90(SB)/8, $0x00000000ffffffff // (1*2^256)%P256
DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256

// All pools are read-only; the last operand is the symbol size in bytes.
GLOBL p256ordK0<>(SB), RODATA, $4
GLOBL p256ord<>(SB), RODATA, $32
GLOBL p256<>(SB), RODATA, $64
GLOBL p256mul<>(SB), RODATA, $160
// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
//
// Thin alias: the arguments sit at the same frame offsets as those of
// p256BigToLittle, so control is handed straight to that routine.
TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0
	BR ·p256BigToLittle(SB)
// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
//
// Thin alias: the arguments sit at the same frame offsets as those of
// p256BigToLittle, so control is handed straight to that routine.
TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0
	BR ·p256BigToLittle(SB)
// ---------------------------------------
// func p256LittleToBig(res *[32]byte, in *p256Element)
//
// Thin alias: the arguments sit at the same frame offsets as those of
// p256BigToLittle, so control is handed straight to that routine.
TEXT ·p256LittleToBig(SB), NOSPLIT, $0
	BR ·p256BigToLittle(SB)
// func p256BigToLittle(res *p256Element, in *[32]byte)
//
// Copies the 32 bytes at in to res with the order of the four 8-byte
// doublewords reversed. Bytes inside a doubleword keep their order.
// Both 16-byte halves are read before anything is written.
#define DSTP R1
#define SRCP R2
#define HALF_HI V3 // in[0:16]
#define HALF_LO V2 // in[16:32]
TEXT ·p256BigToLittle(SB), NOSPLIT, $0
	MOVD in+8(FP), SRCP
	MOVD res+0(FP), DSTP

	VL 16(SRCP), HALF_LO
	VL 0(SRCP), HALF_HI

	// Exchange the two doublewords inside each half.
	VPDI $0x4, HALF_HI, HALF_HI, HALF_HI
	VPDI $0x4, HALF_LO, HALF_LO, HALF_LO

	// The halves themselves trade places on the way out.
	VST HALF_HI, 16(DSTP)
	VST HALF_LO, 0(DSTP)
	RET
#undef DSTP
#undef SRCP
#undef HALF_HI
#undef HALF_LO
// ---------------------------------------
// func p256NegCond(val *p256Element, cond int)
//
// Conditional negation, performed in place and without branching:
//   cond == 0 : val is written back unchanged
//   otherwise : val <- P - val, with P read from the start of p256mul<>
// Both candidates are always computed; a mask picks the one to store.
#define VALP R1
#define POOL R4
#define VAL_LO V0
#define VAL_HI V1
#define NEG_LO V2
#define NEG_HI V3
#define PRIME_LO V30
#define PRIME_HI V31
#define ZEROV V4
#define KEEP V5
#define BORROW V6
TEXT ·p256NegCond(SB), NOSPLIT, $0
	MOVD $p256mul<>+0x00(SB), POOL
	MOVD val+0(FP), VALP

	// KEEP = all ones when cond == 0, all zeros otherwise.
	VZERO ZEROV
	VLREPG cond+8(FP), KEEP
	VCEQG KEEP, ZEROV, KEEP

	VL 0(POOL), PRIME_HI
	VL 16(POOL), PRIME_LO

	// Fetch val; VPDI swaps the doublewords of each 16-byte half.
	VL 0(VALP), VAL_LO
	VL 16(VALP), VAL_HI
	VPDI $0x4, VAL_LO, VAL_LO, VAL_LO
	VPDI $0x4, VAL_HI, VAL_HI, VAL_HI

	// NEG = P - val as a 256-bit subtraction, borrow chained via BORROW.
	VSCBIQ VAL_LO, PRIME_LO, BORROW
	VSQ VAL_LO, PRIME_LO, NEG_LO
	VSBIQ PRIME_HI, VAL_HI, BORROW, NEG_HI

	// Mask-select between the original and the negated value.
	VSEL VAL_LO, NEG_LO, KEEP, VAL_LO
	VSEL VAL_HI, NEG_HI, KEEP, VAL_HI

	// Undo the doubleword swap and write back.
	VPDI $0x4, VAL_LO, VAL_LO, VAL_LO
	VPDI $0x4, VAL_HI, VAL_HI, VAL_HI
	VST VAL_HI, 16(VALP)
	VST VAL_LO, 0(VALP)
	RET
#undef VALP
#undef POOL
#undef VAL_LO
#undef VAL_HI
#undef NEG_LO
#undef NEG_HI
#undef PRIME_LO
#undef PRIME_HI
#undef ZEROV
#undef KEEP
#undef BORROW
// ---------------------------------------
// func p256MovCond(res, a, b *P256Point, cond int)
//
// Branch-free conditional copy of a 96-byte point:
//   cond == 0 : res <- b
//   otherwise : res <- a
// All 96 bytes of both inputs are loaded before the first store, so res may
// overlap a or b.
#define OUTP R1
#define APTR R2
#define BPTR R3
#define A_X0 V1 // a[ 0:16]
#define A_X1 V0 // a[16:32]
#define A_Y0 V3 // a[32:48]
#define A_Y1 V2 // a[48:64]
#define A_Z0 V5 // a[64:80]
#define A_Z1 V4 // a[80:96]
#define B_X0 V7 // b[ 0:16]
#define B_X1 V6 // b[16:32]
#define B_Y0 V9 // b[32:48]
#define B_Y1 V8 // b[48:64]
#define B_Z0 V11 // b[64:80]
#define B_Z1 V10 // b[80:96]
#define ZEROV V18
#define TAKE_B V19
TEXT ·p256MovCond(SB), NOSPLIT, $0
	MOVD b+16(FP), BPTR
	MOVD a+8(FP), APTR
	MOVD res+0(FP), OUTP

	// TAKE_B = all ones when cond == 0, all zeros otherwise.
	VZERO ZEROV
	VLREPG cond+24(FP), TAKE_B
	VCEQG TAKE_B, ZEROV, TAKE_B

	VL 0(BPTR), B_X0
	VL 16(BPTR), B_X1
	VL 32(BPTR), B_Y0
	VL 48(BPTR), B_Y1
	VL 64(BPTR), B_Z0
	VL 80(BPTR), B_Z1

	VL 0(APTR), A_X0
	VL 16(APTR), A_X1
	VL 32(APTR), A_Y0
	VL 48(APTR), A_Y1
	VL 64(APTR), A_Z0
	VL 80(APTR), A_Z1

	// The chosen value overwrites the register that held a's chunk.
	VSEL B_X0, A_X0, TAKE_B, A_X0
	VSEL B_X1, A_X1, TAKE_B, A_X1
	VSEL B_Y0, A_Y0, TAKE_B, A_Y0
	VSEL B_Y1, A_Y1, TAKE_B, A_Y1
	VSEL B_Z0, A_Z0, TAKE_B, A_Z0
	VSEL B_Z1, A_Z1, TAKE_B, A_Z1

	VST A_X0, 0(OUTP)
	VST A_X1, 16(OUTP)
	VST A_Y0, 32(OUTP)
	VST A_Y1, 48(OUTP)
	VST A_Z0, 64(OUTP)
	VST A_Z1, 80(OUTP)
	RET
#undef OUTP
#undef APTR
#undef BPTR
#undef A_X0
#undef A_X1
#undef A_Y0
#undef A_Y1
#undef A_Z0
#undef A_Z1
#undef B_X0
#undef B_X1
#undef B_Y0
#undef B_Y1
#undef B_Z0
#undef B_Z1
#undef ZEROV
#undef TAKE_B
// ---------------------------------------
// func p256Select(res *P256Point, table *p256Table, idx int)
//
// Constant-time table lookup. All 16 entries (96 bytes each) are read on
// every call; the entry whose 1-based position equals the low byte of idx is
// the one kept. When no position matches (idx == 0 in particular) res is
// written as all zeros, which stands for the point at infinity.
#define OUTP R1
#define TBLP R2
#define POS R4
#define ACC_X1 V0
#define ACC_X0 V1
#define ACC_Y1 V2
#define ACC_Y0 V3
#define ACC_Z1 V4
#define ACC_Z0 V5
#define ENT_X1 V6
#define ENT_X0 V7
#define ENT_Y1 V8
#define ENT_Y0 V9
#define ENT_Z1 V10
#define ENT_Z0 V11
#define STEP V18
#define WANT V19
#define HIT V20
#define CURR V21
TEXT ·p256Select(SB), NOSPLIT, $0
	MOVD table+8(FP), TBLP
	MOVD res+0(FP), OUTP

	// Low byte of idx (big-endian int, hence offset +7) in every byte lane.
	VLREPB idx+(16+7)(FP), WANT
	VREPIB $1, STEP
	VREPIB $1, CURR // position of the entry about to be examined
	MOVD $1, POS

	VZERO ACC_X0
	VZERO ACC_X1
	VZERO ACC_Y0
	VZERO ACC_Y1
	VZERO ACC_Z0
	VZERO ACC_Z1

scan_table:
	// HIT = all ones only when the current position is the wanted one.
	VCEQG CURR, WANT, HIT

	VL 0(TBLP), ENT_X0
	VL 16(TBLP), ENT_X1
	VL 32(TBLP), ENT_Y0
	VL 48(TBLP), ENT_Y1
	VL 64(TBLP), ENT_Z0
	VL 80(TBLP), ENT_Z1

	VSEL ENT_X0, ACC_X0, HIT, ACC_X0
	VSEL ENT_X1, ACC_X1, HIT, ACC_X1
	VSEL ENT_Y0, ACC_Y0, HIT, ACC_Y0
	VSEL ENT_Y1, ACC_Y1, HIT, ACC_Y1
	VSEL ENT_Z0, ACC_Z0, HIT, ACC_Z0
	VSEL ENT_Z1, ACC_Z1, HIT, ACC_Z1

	// Advance to the next entry; stop once positions 1..16 have been seen.
	VAB CURR, STEP, CURR
	ADD $96, TBLP
	ADDW $1, POS
	CMPW POS, $17
	BLT scan_table

	VST ACC_X0, 0(OUTP)
	VST ACC_X1, 16(OUTP)
	VST ACC_Y0, 32(OUTP)
	VST ACC_Y1, 48(OUTP)
	VST ACC_Z0, 64(OUTP)
	VST ACC_Z1, 80(OUTP)
	RET
#undef OUTP
#undef TBLP
#undef POS
#undef ACC_X1
#undef ACC_X0
#undef ACC_Y1
#undef ACC_Y0
#undef ACC_Z1
#undef ACC_Z0
#undef ENT_X1
#undef ENT_X0
#undef ENT_Y1
#undef ENT_Y0
#undef ENT_Z1
#undef ENT_Z0
#undef STEP
#undef WANT
#undef HIT
#undef CURR
// ---------------------------------------
// func p256FromMont(res, in *p256Element)
//
// Takes a field element out of the Montgomery domain. The value T2||T1||T0
// goes through four identical reduction rounds followed by one conditional
// subtraction of the prime.
//
// Each round takes the lowest 64 bits m = d1||d0 of the running value,
// shifts the value right by 64 bits and adds RED2||RED1, built as
//   RED1 = m - (m << 32)               (low 128 bits, borrow into CAR1)
//   RED2 = (m << 64) - (m << 32) - borrow
// i.e. m * (2^192 - 2^160 - 2^32 + 1), which equals m * (P + 1) / 2^64 for
// the prime stored in p256<>.
#define res_ptr R1
#define x_ptr R2
#define CPOOL R4
#define T0 V0
#define T1 V1
#define T2 V2
#define TT0 V3
#define TT1 V4
#define ZER V6
#define SEL1 V7
#define CAR1 V9
#define CAR2 V10
#define RED1 V11
#define RED2 V12
#define PL V14
#define PH V15
TEXT ·p256FromMont(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr
	VZERO T2
	VZERO ZER
	MOVD $p256<>+0x00(SB), CPOOL
	VL 16(CPOOL), PL // low 128 bits of the prime
	VL 0(CPOOL), PH // high 128 bits of the prime
	VL 32(CPOOL), SEL1 // VPERM control "0 0 d1 d0"
	VL (0*16)(x_ptr), T0
	VPDI $0x4, T0, T0, T0
	VL (1*16)(x_ptr), T1
	VPDI $0x4, T1, T1, T1
	// First round
	VPERM ZER, T0, SEL1, RED1 // 0 0 d1 d0
	VSLDB $4, RED1, ZER, TT0 // 0 d1 d0 0
	VSLDB $4, TT0, ZER, RED2 // d1 d0 0 0
	VSCBIQ TT0, RED1, CAR1
	VSQ TT0, RED1, RED1
	VSBIQ RED2, TT0, CAR1, RED2 // Guaranteed not to underflow
	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1
	VACCQ T0, RED1, CAR1
	VAQ T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ T1, RED2, CAR1, T1
	VAQ T2, CAR2, T2
	// Second round
	VPERM ZER, T0, SEL1, RED1 // 0 0 d1 d0
	VSLDB $4, RED1, ZER, TT0 // 0 d1 d0 0
	VSLDB $4, TT0, ZER, RED2 // d1 d0 0 0
	VSCBIQ TT0, RED1, CAR1
	VSQ TT0, RED1, RED1
	VSBIQ RED2, TT0, CAR1, RED2 // Guaranteed not to underflow
	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1
	VACCQ T0, RED1, CAR1
	VAQ T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ T1, RED2, CAR1, T1
	VAQ T2, CAR2, T2
	// Third round
	VPERM ZER, T0, SEL1, RED1 // 0 0 d1 d0
	VSLDB $4, RED1, ZER, TT0 // 0 d1 d0 0
	VSLDB $4, TT0, ZER, RED2 // d1 d0 0 0
	VSCBIQ TT0, RED1, CAR1
	VSQ TT0, RED1, RED1
	VSBIQ RED2, TT0, CAR1, RED2 // Guaranteed not to underflow
	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1
	VACCQ T0, RED1, CAR1
	VAQ T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ T1, RED2, CAR1, T1
	VAQ T2, CAR2, T2
	// Last round
	VPERM ZER, T0, SEL1, RED1 // 0 0 d1 d0
	VSLDB $4, RED1, ZER, TT0 // 0 d1 d0 0
	VSLDB $4, TT0, ZER, RED2 // d1 d0 0 0
	VSCBIQ TT0, RED1, CAR1
	VSQ TT0, RED1, RED1
	VSBIQ RED2, TT0, CAR1, RED2 // Guaranteed not to underflow
	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1
	VACCQ T0, RED1, CAR1
	VAQ T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ T1, RED2, CAR1, T1
	VAQ T2, CAR2, T2
	// ---------------------------------------------------
	// Final step: TT1||TT0 = T1||T0 - P, with the borrow folded into T2 so
	// that T2 can serve as the select mask below.
	VSCBIQ PL, T0, CAR1
	VSQ PL, T0, TT0
	VSBCBIQ T1, PH, CAR1, CAR2
	VSBIQ T1, PH, CAR1, TT1
	VSBIQ T2, ZER, CAR2, T2
	// what output to use, TT1||TT0 or T1||T0?
	VSEL T0, TT0, T2, T0
	VSEL T1, TT1, T2, T1
	// Swap the doublewords back and store.
	VPDI $0x4, T0, T0, TT0
	VST TT0, (0*16)(res_ptr)
	VPDI $0x4, T1, T1, TT1
	VST TT1, (1*16)(res_ptr)
	RET
#undef res_ptr
#undef x_ptr
#undef CPOOL
#undef T0
#undef T1
#undef T2
#undef TT0
#undef TT1
#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2
#undef RED1
#undef RED2
#undef PL
#undef PH
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
//
// Constant-time table lookup. All 64 entries (64 bytes each: two 32-byte
// coordinates) are read on every call; the entry whose 1-based position
// equals the low byte of idx is the one kept. When no position matches
// (idx == 0 in particular) res is written as all zeros.
//
// The routine needs nothing from the constant pools: the 64-byte p256<>
// symbol has no data at offset 80, so no load from there may be issued.
#define P3ptr R1
#define P1ptr R2
#define COUNT R4
#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define ONE V18
#define IDX V19
#define SEL1 V20
#define SEL2 V21
TEXT ·p256SelectAffine(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	// Low byte of idx (big-endian int, hence offset +7) in every byte lane.
	VLREPB idx+(16+7)(FP), IDX
	VREPIB $1, ONE
	VREPIB $1, SEL2 // position of the entry about to be examined
	MOVD $1, COUNT

	// Accumulator starts as zero and is only replaced on a match.
	VZERO X1H
	VZERO X1L
	VZERO Y1H
	VZERO Y1L

loop_select:
	VL 0(P1ptr), X2H
	VL 16(P1ptr), X2L
	VL 32(P1ptr), Y2H
	VL 48(P1ptr), Y2L

	// SEL1 = all ones only when the current position is the wanted one.
	VCEQG SEL2, IDX, SEL1

	VSEL X2L, X1L, SEL1, X1L
	VSEL X2H, X1H, SEL1, X1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Y2H, Y1H, SEL1, Y1H

	// Advance to the next entry; stop once positions 1..64 have been seen.
	VAB SEL2, ONE, SEL2
	ADDW $1, COUNT
	ADD $64, P1ptr
	CMPW COUNT, $65
	BLT loop_select

	VST X1H, 0(P3ptr)
	VST X1L, 16(P3ptr)
	VST Y1H, 32(P3ptr)
	VST Y1L, 48(P3ptr)
	RET
#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2
// ---------------------------------------
// sm2p256OrdMulInternal
//
// Register-based helper; it takes no stack arguments and returns nothing on
// the stack.
//
//   Inputs  : X1||X0, Y1||Y0  the two 256-bit operands (high||low halves)
//             M1||M0          the modulus
//             K0              per-word multiplier; only word element 3 is used
//   Output  : T1||T0
//   Scratch : T2, YDIG, ADD1, ADD1H, ADD2, ADD2H, RED1, RED1H, RED2, RED2H,
//             CAR1, CAR1M, MK0
//
// The body is eight unrolled rounds, one per 32-bit word of Y starting with
// the least significant one (word element 3 of Y0). A round accumulates
// X * word into T, derives MK0 from the low word of the sum and K0, adds
// M * MK0 and shifts the total right by one word. A final conditional
// subtraction of M follows.
// Presumably the overall result is the Montgomery product
// X * Y * 2^-256 mod M — confirm against the pure-Go reference.
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define M0 V4
#define M1 V5
#define T0 V6
#define T1 V7
#define T2 V8
#define YDIG V9
#define ADD1 V16
#define ADD1H V17
#define ADD2 V18
#define ADD2H V19
#define RED1 V20
#define RED1H V21
#define RED2 V22
#define RED2H V23
#define CAR1 V24
#define CAR1M V25
#define MK0 V30
#define K0 V31
TEXT sm2p256OrdMulInternal<>(SB), NOSPLIT, $0-0
	// ---------------------------------------------------------------------------/
	// Round 1: word 3 of Y0. T is still empty, so plain multiplies (no
	// accumulate) are used and T2 is cleared here.
	VREPF $3, Y0, YDIG
	VMLF X0, YDIG, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0
	VMLF X1, YDIG, ADD2
	VMLHF X0, YDIG, ADD1H
	VMLHF X1, YDIG, ADD2H
	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H
	VZERO T2
	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2
	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0
	// << ready for next MK0
	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2
	// ---------------------------------------------------
	/* *
	 * ---+--------+--------+
	 *  T2|   T1   |   T0   |
	 * ---+--------+--------+
	 *           *(add)*
	 *    +--------+--------+
	 *    |   X1   |   X0   |
	 *    +--------+--------+
	 *           *(mul)*
	 *    +--------+--------+
	 *    |  YDIG  |  YDIG  |
	 *    +--------+--------+
	 *           *(add)*
	 *    +--------+--------+
	 *    |   M1   |   M0   |
	 *    +--------+--------+
	 *           *(mul)*
	 *    +--------+--------+
	 *    |   MK0  |   MK0  |
	 *    +--------+--------+
	 *
	 *   ---------------------
	 *
	 *    +--------+--------+
	 *    |  ADD2  |  ADD1  |
	 *    +--------+--------+
	 *  +--------+--------+
	 *  | ADD2H  | ADD1H  |
	 *  +--------+--------+
	 *    +--------+--------+
	 *    |  RED2  |  RED1  |
	 *    +--------+--------+
	 *  +--------+--------+
	 *  | RED2H  | RED1H  |
	 *  +--------+--------+
	 */
	// Round 2: word 2 of Y0.
	VREPF $2, Y0, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0
	VMALF X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H
	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2
	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0
	// << ready for next MK0
	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2
	// ---------------------------------------------------
	// Round 3: word 1 of Y0.
	VREPF $1, Y0, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0
	VMALF X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H
	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2
	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0
	// << ready for next MK0
	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2
	// ---------------------------------------------------
	// Round 4: word 0 of Y0.
	VREPF $0, Y0, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0
	VMALF X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H
	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2
	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0
	// << ready for next MK0
	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2
	// ---------------------------------------------------
	// Round 5: word 3 of Y1.
	VREPF $3, Y1, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0
	VMALF X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H
	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2
	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0
	// << ready for next MK0
	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2
	// ---------------------------------------------------
	// Round 6: word 2 of Y1.
	VREPF $2, Y1, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0
	VMALF X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H
	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2
	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0
	// << ready for next MK0
	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2
	// ---------------------------------------------------
	// Round 7: word 1 of Y1.
	VREPF $1, Y1, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0
	VMALF X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H
	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2
	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0
	// << ready for next MK0
	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2
	// ---------------------------------------------------
	// Round 8: word 0 of Y1.
	VREPF $0, Y1, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF ADD1, K0, MK0
	VREPF $3, MK0, MK0
	VMALF X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H
	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2
	VACCQ RED1, ADD1H, CAR1
	VAQ RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ RED1H, T0, T0
	// << ready for next MK0
	VACQ RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ RED2H, T1, CAR1M, T1
	VAQ CAR1, T2, T2
	// ---------------------------------------------------
	// Final step: ADD2||ADD1 = T1||T0 - M, with the borrow folded into T2 so
	// that T2 can serve as the select mask below. RED1 is reused as zero.
	VZERO RED1
	VSCBIQ M0, T0, CAR1
	VSQ M0, T0, ADD1
	VSBCBIQ T1, M1, CAR1, CAR1M
	VSBIQ T1, M1, CAR1, ADD2
	VSBIQ T2, RED1, CAR1M, T2
	// what output to use, ADD2||ADD1 or T1||T0?
	VSEL T0, ADD1, T2, T0
	VSEL T1, ADD2, T2, T1
	RET
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef M0
#undef M1
#undef T0
#undef T1
#undef T2
#undef YDIG
#undef ADD1
#undef ADD1H
#undef ADD2
#undef ADD2H
#undef RED1
#undef RED1H
#undef RED2
#undef RED2H
#undef CAR1
#undef CAR1M
#undef MK0
#undef K0
// ---------------------------------------
// sm2p256OrdSqrInternal
//
// Squaring helper with a register-based interface: copies X1||X0 into
// Y1||Y0 and tail-branches into sm2p256OrdMulInternal, so all other inputs,
// the output registers and the clobbers are those of that routine. It has no
// frame of its own, and the multiply's RET goes back to the original caller.
// Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
TEXT sm2p256OrdSqrInternal<>(SB), NOFRAME|NOSPLIT, $0
	VLR X0, Y0
	VLR X1, Y1
	BR sm2p256OrdMulInternal<>(SB)
#undef X0
#undef X1
#undef Y0
#undef Y1
// ---------------------------------------
// func p256OrdMul(res, in1, in2 *p256OrdElement)
//
// Loads in1 and in2, runs sm2p256OrdMulInternal with the group order from
// p256ord<> as the modulus and p256ordK0<> as the per-word multiplier, and
// stores the 256-bit result to res. Every load and store swaps the two
// doublewords of its 16-byte half (VPDI $0x4).
#define res_ptr R1
#define x_ptr R2
#define y_ptr R3
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define M0 V4
#define M1 V5
#define T0 V6
#define T1 V7
#define K0 V31 // must stay the register sm2p256OrdMulInternal reads as K0
TEXT ·p256OrdMul(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in1+8(FP), x_ptr
	MOVD in2+16(FP), y_ptr

	// Only word element 3 of K0 is written, which is the only element the
	// multiply routine uses.
	MOVD $p256ordK0<>+0x00(SB), R4
	VLEF $3, 0(R4), K0

	MOVD $p256ord<>+0x00(SB), R4
	VL 16(R4), M0 // low 128 bits of the order
	VL 0(R4), M1 // high 128 bits of the order

	VL (0*16)(x_ptr), X0
	VPDI $0x4, X0, X0, X0
	VL (1*16)(x_ptr), X1
	VPDI $0x4, X1, X1, X1

	VL (0*16)(y_ptr), Y0
	VPDI $0x4, Y0, Y0, Y0
	VL (1*16)(y_ptr), Y1
	VPDI $0x4, Y1, Y1, Y1

	CALL sm2p256OrdMulInternal<>(SB)

	VPDI $0x4, T0, T0, T0
	VST T0, (0*16)(res_ptr)
	VPDI $0x4, T1, T1, T1
	VST T1, (1*16)(res_ptr)
	RET
#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef M0
#undef M1
#undef T0
#undef T1
#undef K0
// ---------------------------------------
// func p256OrdSqr(res, in *p256OrdElement, n int)
//
// Applies sm2p256OrdSqrInternal repeatedly, starting from in, and stores the
// final value to res. The modulus is the group order from p256ord<> and the
// per-word multiplier comes from p256ordK0<>.
//
// The loop condition is tested after the body, so one squaring is always
// performed: n <= 0 behaves like n == 1. Only the low 32 bits of n take part
// in the comparison (CMPW).
#define res_ptr R1
#define x_ptr R2
#define COUNT R5
#define N R6
#define X0 V0
#define X1 V1
#define M0 V4
#define M1 V5
#define T0 V6
#define T1 V7
#define K0 V31 // must stay the register sm2p256OrdMulInternal reads as K0
TEXT ·p256OrdSqr(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr
	MOVD n+16(FP), N
	MOVD $0, COUNT

	// Only word element 3 of K0 is written, which is the only element the
	// multiply routine uses.
	MOVD $p256ordK0<>+0x00(SB), R4
	VLEF $3, 0(R4), K0

	MOVD $p256ord<>+0x00(SB), R4
	VL 16(R4), M0 // low 128 bits of the order
	VL 0(R4), M1 // high 128 bits of the order

	VL (0*16)(x_ptr), X0
	VPDI $0x4, X0, X0, X0
	VL (1*16)(x_ptr), X1
	VPDI $0x4, X1, X1, X1

loop:
	CALL sm2p256OrdSqrInternal<>(SB)
	// Feed the result back in as the next operand.
	VLR T0, X0
	VLR T1, X1
	ADDW $1, COUNT
	CMPW COUNT, N
	BLT loop

	VPDI $0x4, T0, T0, T0
	VST T0, (0*16)(res_ptr)
	VPDI $0x4, T1, T1, T1
	VST T1, (1*16)(res_ptr)
	RET
#undef res_ptr
#undef x_ptr
#undef COUNT
#undef N
#undef X0
#undef X1
#undef M0
#undef M1
#undef T0
#undef T1
#undef K0
2024-08-23 14:42:49 +08:00
// NOTE(review): the six routines below are empty placeholders. Each one
// returns immediately: no argument is read and nothing is written to any
// output, so callers get back whatever their result buffers already held.
// TODO: implement (or route to a generic fallback) before relying on these
// entry points on s390x.

// Placeholder: field multiplication, not implemented.
TEXT ·p256Mul(SB), NOSPLIT, $0
RET
// Placeholder: field squaring, not implemented.
TEXT ·p256Sqr(SB), NOSPLIT, $0
RET
// Placeholder: mixed (affine) point addition, not implemented.
TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
RET
// Placeholder: point doubling, not implemented.
TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
RET
// Placeholder: point addition, not implemented.
TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
RET
// Placeholder: six consecutive point doublings, not implemented.
TEXT ·p256PointDouble6TimesAsm(SB), NOSPLIT, $0
RET
// func p256OrdReduce(s *p256OrdElement)
//
// In-place conditional subtraction of the group order: computes s - ord and
// keeps s when that subtraction borrows, s - ord otherwise. At most one
// subtraction is performed. The load and the store swap the two doublewords
// of each 16-byte half (VPDI $0x4).
#define res_ptr R1
#define CPOOL R4
#define T0 V0
#define T1 V1
#define T2 V2
#define TT0 V3
#define TT1 V4
#define ZER V6
#define CAR1 V7
#define CAR2 V8
#define PL V9
#define PH V10
TEXT ·p256OrdReduce(SB),NOSPLIT,$0
	// The argument name has to match the Go declaration (s) for vet's
	// asmdecl check.
	MOVD s+0(FP), res_ptr
	VZERO T2
	VZERO ZER

	MOVD $p256ord<>+0x00(SB), CPOOL
	VL 16(CPOOL), PL // low 128 bits of the order
	VL 0(CPOOL), PH // high 128 bits of the order

	VL (0*16)(res_ptr), T0
	VPDI $0x4, T0, T0, T0
	VL (1*16)(res_ptr), T1
	VPDI $0x4, T1, T1, T1

	// TT1||TT0 = T1||T0 - ord, with the borrow folded into T2 so that T2
	// can serve as the select mask below.
	VSCBIQ PL, T0, CAR1
	VSQ PL, T0, TT0
	VSBCBIQ T1, PH, CAR1, CAR2
	VSBIQ T1, PH, CAR1, TT1
	VSBIQ T2, ZER, CAR2, T2

	// what output to use, TT1||TT0 or T1||T0?
	VSEL T0, TT0, T2, T0
	VSEL T1, TT1, T2, T1

	VPDI $0x4, T0, T0, TT0
	VST TT0, (0*16)(res_ptr)
	VPDI $0x4, T1, T1, TT1
	VST TT1, (1*16)(res_ptr)
	RET
#undef res_ptr
#undef CPOOL
#undef T0
#undef T1
#undef T2
#undef TT0
#undef TT1
#undef ZER
#undef CAR1
#undef CAR2
#undef PL
#undef PH