mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-27 04:36:19 +08:00
internal/sm2ec: ppc64le kick start
This commit is contained in:
parent
5af95316f0
commit
77c51c2295
9
.github/workflows/test_ppc64.yaml
vendored
9
.github/workflows/test_ppc64.yaml
vendored
@ -29,11 +29,12 @@ jobs:
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Build
|
||||
run: go build -v ./internal/bigmod/...
|
||||
|
||||
- name: Test
|
||||
run: go test -v -short ./internal/bigmod/...
|
||||
env:
|
||||
GODEBUG: x509sha1=1
|
||||
GOARCH: ${{ matrix.arch }}
|
||||
|
||||
- name: Test
|
||||
run: go test -v -short ./internal/sm2ec/...
|
||||
env:
|
||||
GOARCH: ${{ matrix.arch }}
|
||||
|
702
internal/sm2ec/p256_asm_ppc64le.s
Normal file
702
internal/sm2ec/p256_asm_ppc64le.s
Normal file
@ -0,0 +1,702 @@
|
||||
//go:build !purego
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// This is a port of the s390x asm implementation.
|
||||
// to ppc64le.
|
||||
|
||||
// Some changes were needed due to differences in
|
||||
// the Go opcodes and/or available instructions
|
||||
// between s390x and ppc64le.
|
||||
|
||||
// 1. There were operand order differences in the
|
||||
// VSUBUQM, VSUBCUQ, and VSEL instructions.
|
||||
|
||||
// 2. ppc64 does not have a multiply high and low
|
||||
// like s390x, so those were implemented using
|
||||
// macros to compute the equivalent values.
|
||||
|
||||
// 3. The LVX, STVX instructions on ppc64 require
|
||||
// 16 byte alignment of the data. To avoid that
|
||||
// requirement, data is loaded using LXVD2X and
|
||||
// STXVD2X with VPERM to reorder bytes correctly.
|
||||
|
||||
// I have identified some areas where I believe
|
||||
// changes would be needed to make this work for big
|
||||
// endian; however additional changes beyond what I
|
||||
// have noted are most likely needed to make it work.
|
||||
// - The string used with VPERM to swap the byte order
|
||||
// for loads and stores.
|
||||
// - The constants that are loaded from CPOOL.
|
||||
//
|
||||
|
||||
// The following constants are defined in an order
|
||||
// that is correct for use with LXVD2X/STXVD2X
|
||||
// on little endian.
|
||||
DATA p256ordK0<>+0x00(SB)/4, $0x72350975
|
||||
DATA p256ord<>+0x00(SB)/8, $0xfffffffeffffffff
|
||||
DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff
|
||||
DATA p256ord<>+0x10(SB)/8, $0x7203df6b21c6052b
|
||||
DATA p256ord<>+0x18(SB)/8, $0x53bbf40939d54123
|
||||
DATA p256<>+0x00(SB)/8, $0xfffffffeffffffff // P256
|
||||
DATA p256<>+0x08(SB)/8, $0xffffffffffffffff // P256
|
||||
DATA p256<>+0x10(SB)/8, $0xffffffff00000000 // P256
|
||||
DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
|
||||
DATA p256<>+0x20(SB)/8, $0x0000000000000000 // SEL 0 0 d1 d0
|
||||
DATA p256<>+0x28(SB)/8, $0x18191a1b1c1d1e1f // SEL 0 0 d1 d0
|
||||
DATA p256mul<>+0x00(SB)/8, $0xffffffff00000000 // P256 original
|
||||
DATA p256mul<>+0x08(SB)/8, $0xffffffffffffffff // P256
|
||||
DATA p256mul<>+0x10(SB)/8, $0xfffffffeffffffff // P256 original
|
||||
DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
|
||||
DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0
|
||||
DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0
|
||||
DATA p256mul<>+0x30(SB)/8, $0x0405060708090a0b // SEL 0 0 d1 d0
|
||||
DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0
|
||||
DATA p256mul<>+0x40(SB)/8, $0x00000000ffffffff // (1*2^256)%P256
|
||||
DATA p256mul<>+0x48(SB)/8, $0x0000000000000001 // (1*2^256)%P256
|
||||
DATA p256mul<>+0x50(SB)/8, $0x0000000100000000 // (1*2^256)%P256
|
||||
DATA p256mul<>+0x58(SB)/8, $0x0000000000000000 // (1*2^256)%P256
|
||||
|
||||
// External declarations for constants
|
||||
GLOBL p256ordK0<>(SB), 8, $4
|
||||
GLOBL p256ord<>(SB), 8, $32
|
||||
GLOBL p256<>(SB), 8, $48
|
||||
GLOBL p256mul<>(SB), 8, $96
|
||||
|
||||
// The following macros are used to implement the ppc64le
|
||||
// equivalent function from the corresponding s390x
|
||||
// instruction for vector multiply high, low, and add,
|
||||
// since there aren't exact equivalent instructions.
|
||||
// The corresponding s390x instructions appear in the
|
||||
// comments.
|
||||
// Implementation for big endian would have to be
|
||||
// investigated, I think it would be different.
|
||||
//
|
||||
//
|
||||
// Vector multiply word
|
||||
//
|
||||
// VMLF x0, x1, out_low
|
||||
// VMLHF x0, x1, out_hi
|
||||
#define VMULT(x1, x2, out_low, out_hi) \
|
||||
VMULEUW x1, x2, TMP1; \
|
||||
VMULOUW x1, x2, TMP2; \
|
||||
VMRGEW TMP1, TMP2, out_hi; \
|
||||
VMRGOW TMP1, TMP2, out_low
|
||||
|
||||
//
|
||||
// Vector multiply add word
|
||||
//
|
||||
// VMALF x0, x1, y, out_low
|
||||
// VMALHF x0, x1, y, out_hi
|
||||
#define VMULT_ADD(x1, x2, y, one, out_low, out_hi) \
|
||||
VMULEUW y, one, TMP2; \
|
||||
VMULOUW y, one, TMP1; \
|
||||
VMULEUW x1, x2, out_low; \
|
||||
VMULOUW x1, x2, out_hi; \
|
||||
VADDUDM TMP2, out_low, TMP2; \
|
||||
VADDUDM TMP1, out_hi, TMP1; \
|
||||
VMRGOW TMP2, TMP1, out_low; \
|
||||
VMRGEW TMP2, TMP1, out_hi
|
||||
|
||||
#define res_ptr R3
|
||||
#define a_ptr R4
|
||||
|
||||
#undef res_ptr
|
||||
#undef a_ptr
|
||||
|
||||
#define P1ptr R3
|
||||
#define CPOOL R7
|
||||
|
||||
#define Y1L V0
|
||||
#define Y1H V1
|
||||
#define T1L V2
|
||||
#define T1H V3
|
||||
|
||||
#define PL V30
|
||||
#define PH V31
|
||||
|
||||
#define CAR1 V6
|
||||
// func p256NegCond(val *p256Point, cond int)
|
||||
TEXT ·p256NegCond(SB), NOSPLIT, $0-16
|
||||
MOVD val+0(FP), P1ptr
|
||||
MOVD $16, R16
|
||||
|
||||
MOVD cond+8(FP), R6
|
||||
CMP $0, R6
|
||||
BC 12, 2, LR // just return if cond == 0
|
||||
|
||||
MOVD $p256mul<>+0x00(SB), CPOOL
|
||||
|
||||
LXVD2X (P1ptr)(R0), Y1L
|
||||
LXVD2X (P1ptr)(R16), Y1H
|
||||
|
||||
XXPERMDI Y1H, Y1H, $2, Y1H
|
||||
XXPERMDI Y1L, Y1L, $2, Y1L
|
||||
|
||||
LXVD2X (CPOOL)(R0), PL
|
||||
LXVD2X (CPOOL)(R16), PH
|
||||
|
||||
VSUBCUQ PL, Y1L, CAR1 // subtract part2 giving carry
|
||||
VSUBUQM PL, Y1L, T1L // subtract part2 giving result
|
||||
VSUBEUQM PH, Y1H, CAR1, T1H // subtract part1 using carry from part2
|
||||
|
||||
XXPERMDI T1H, T1H, $2, T1H
|
||||
XXPERMDI T1L, T1L, $2, T1L
|
||||
|
||||
STXVD2X T1L, (R0+P1ptr)
|
||||
STXVD2X T1H, (R16+P1ptr)
|
||||
RET
|
||||
|
||||
#undef P1ptr
|
||||
#undef CPOOL
|
||||
#undef Y1L
|
||||
#undef Y1H
|
||||
#undef T1L
|
||||
#undef T1H
|
||||
#undef PL
|
||||
#undef PH
|
||||
#undef CAR1
|
||||
|
||||
#define P3ptr R3
|
||||
#define P1ptr R4
|
||||
#define P2ptr R5
|
||||
|
||||
#define X1L V0
|
||||
#define X1H V1
|
||||
#define Y1L V2
|
||||
#define Y1H V3
|
||||
#define Z1L V4
|
||||
#define Z1H V5
|
||||
#define X2L V6
|
||||
#define X2H V7
|
||||
#define Y2L V8
|
||||
#define Y2H V9
|
||||
#define Z2L V10
|
||||
#define Z2H V11
|
||||
#define SEL V12
|
||||
#define ZER V13
|
||||
|
||||
// This function uses LXVD2X and STXVD2X to avoid the
|
||||
// data alignment requirement for LVX, STVX. Since
|
||||
// this code is just moving bytes and not doing arithmetic,
|
||||
// order of the bytes doesn't matter.
|
||||
//
|
||||
// func p256MovCond(res, a, b *p256Point, cond int)
|
||||
TEXT ·p256MovCond(SB), NOSPLIT, $0-32
|
||||
MOVD res+0(FP), P3ptr
|
||||
MOVD a+8(FP), P1ptr
|
||||
MOVD b+16(FP), P2ptr
|
||||
MOVD $16, R16
|
||||
MOVD $32, R17
|
||||
MOVD $48, R18
|
||||
MOVD $56, R21
|
||||
MOVD $64, R19
|
||||
MOVD $80, R20
|
||||
// cond is R1 + 24 (cond offset) + 32
|
||||
LXVDSX (R1)(R21), SEL
|
||||
VSPLTISB $0, ZER
|
||||
// SEL controls whether to store a or b
|
||||
VCMPEQUD SEL, ZER, SEL
|
||||
|
||||
LXVD2X (P1ptr+R0), X1H
|
||||
LXVD2X (P1ptr+R16), X1L
|
||||
LXVD2X (P1ptr+R17), Y1H
|
||||
LXVD2X (P1ptr+R18), Y1L
|
||||
LXVD2X (P1ptr+R19), Z1H
|
||||
LXVD2X (P1ptr+R20), Z1L
|
||||
|
||||
LXVD2X (P2ptr+R0), X2H
|
||||
LXVD2X (P2ptr+R16), X2L
|
||||
LXVD2X (P2ptr+R17), Y2H
|
||||
LXVD2X (P2ptr+R18), Y2L
|
||||
LXVD2X (P2ptr+R19), Z2H
|
||||
LXVD2X (P2ptr+R20), Z2L
|
||||
|
||||
VSEL X1H, X2H, SEL, X1H
|
||||
VSEL X1L, X2L, SEL, X1L
|
||||
VSEL Y1H, Y2H, SEL, Y1H
|
||||
VSEL Y1L, Y2L, SEL, Y1L
|
||||
VSEL Z1H, Z2H, SEL, Z1H
|
||||
VSEL Z1L, Z2L, SEL, Z1L
|
||||
|
||||
STXVD2X X1H, (P3ptr+R0)
|
||||
STXVD2X X1L, (P3ptr+R16)
|
||||
STXVD2X Y1H, (P3ptr+R17)
|
||||
STXVD2X Y1L, (P3ptr+R18)
|
||||
STXVD2X Z1H, (P3ptr+R19)
|
||||
STXVD2X Z1L, (P3ptr+R20)
|
||||
|
||||
RET
|
||||
|
||||
#undef P3ptr
|
||||
#undef P1ptr
|
||||
#undef P2ptr
|
||||
#undef X1L
|
||||
#undef X1H
|
||||
#undef Y1L
|
||||
#undef Y1H
|
||||
#undef Z1L
|
||||
#undef Z1H
|
||||
#undef X2L
|
||||
#undef X2H
|
||||
#undef Y2L
|
||||
#undef Y2H
|
||||
#undef Z2L
|
||||
#undef Z2H
|
||||
#undef SEL
|
||||
#undef ZER
|
||||
|
||||
#define P3ptr R3
|
||||
#define P1ptr R4
|
||||
#define COUNT R5
|
||||
|
||||
#define X1L V0
|
||||
#define X1H V1
|
||||
#define Y1L V2
|
||||
#define Y1H V3
|
||||
#define Z1L V4
|
||||
#define Z1H V5
|
||||
#define X2L V6
|
||||
#define X2H V7
|
||||
#define Y2L V8
|
||||
#define Y2H V9
|
||||
#define Z2L V10
|
||||
#define Z2H V11
|
||||
|
||||
#define ONE V18
|
||||
#define IDX V19
|
||||
#define SEL1 V20
|
||||
#define SEL2 V21
|
||||
// func p256Select(point *p256Point, table *p256Table, idx int, limit int)
|
||||
TEXT ·p256Select(SB), NOSPLIT, $0-24
|
||||
MOVD res+0(FP), P3ptr
|
||||
MOVD table+8(FP), P1ptr
|
||||
MOVD limit+24(FP), COUNT
|
||||
MOVD $16, R16
|
||||
MOVD $32, R17
|
||||
MOVD $48, R18
|
||||
MOVD $64, R19
|
||||
MOVD $80, R20
|
||||
|
||||
LXVDSX (R1)(R18), SEL1 // VLREPG idx+32(FP), SEL1
|
||||
VSPLTB $7, SEL1, IDX // splat byte
|
||||
VSPLTISB $1, ONE // VREPIB $1, ONE
|
||||
VSPLTISB $1, SEL2 // VREPIB $1, SEL2
|
||||
MOVD COUNT, CTR // set up ctr
|
||||
|
||||
VSPLTISB $0, X1H // VZERO X1H
|
||||
VSPLTISB $0, X1L // VZERO X1L
|
||||
VSPLTISB $0, Y1H // VZERO Y1H
|
||||
VSPLTISB $0, Y1L // VZERO Y1L
|
||||
VSPLTISB $0, Z1H // VZERO Z1H
|
||||
VSPLTISB $0, Z1L // VZERO Z1L
|
||||
|
||||
loop_select:
|
||||
|
||||
// LVXD2X is used here since data alignment doesn't
|
||||
// matter.
|
||||
|
||||
LXVD2X (P1ptr+R0), X2H
|
||||
LXVD2X (P1ptr+R16), X2L
|
||||
LXVD2X (P1ptr+R17), Y2H
|
||||
LXVD2X (P1ptr+R18), Y2L
|
||||
LXVD2X (P1ptr+R19), Z2H
|
||||
LXVD2X (P1ptr+R20), Z2L
|
||||
|
||||
VCMPEQUD SEL2, IDX, SEL1 // VCEQG SEL2, IDX, SEL1 OK
|
||||
|
||||
// This will result in SEL1 being all 0s or 1s, meaning
|
||||
// the result is either X1L or X2L, no individual byte
|
||||
// selection.
|
||||
|
||||
VSEL X1L, X2L, SEL1, X1L
|
||||
VSEL X1H, X2H, SEL1, X1H
|
||||
VSEL Y1L, Y2L, SEL1, Y1L
|
||||
VSEL Y1H, Y2H, SEL1, Y1H
|
||||
VSEL Z1L, Z2L, SEL1, Z1L
|
||||
VSEL Z1H, Z2H, SEL1, Z1H
|
||||
|
||||
// Add 1 to all bytes in SEL2
|
||||
VADDUBM SEL2, ONE, SEL2 // VAB SEL2, ONE, SEL2 OK
|
||||
ADD $96, P1ptr
|
||||
BDNZ loop_select
|
||||
|
||||
// STXVD2X is used here so that alignment doesn't
|
||||
// need to be verified. Since values were loaded
|
||||
// using LXVD2X this is OK.
|
||||
STXVD2X X1H, (P3ptr+R0)
|
||||
STXVD2X X1L, (P3ptr+R16)
|
||||
STXVD2X Y1H, (P3ptr+R17)
|
||||
STXVD2X Y1L, (P3ptr+R18)
|
||||
STXVD2X Z1H, (P3ptr+R19)
|
||||
STXVD2X Z1L, (P3ptr+R20)
|
||||
RET
|
||||
|
||||
#undef P3ptr
|
||||
#undef P1ptr
|
||||
#undef COUNT
|
||||
#undef X1L
|
||||
#undef X1H
|
||||
#undef Y1L
|
||||
#undef Y1H
|
||||
#undef Z1L
|
||||
#undef Z1H
|
||||
#undef X2L
|
||||
#undef X2H
|
||||
#undef Y2L
|
||||
#undef Y2H
|
||||
#undef Z2L
|
||||
#undef Z2H
|
||||
#undef ONE
|
||||
#undef IDX
|
||||
#undef SEL1
|
||||
#undef SEL2
|
||||
|
||||
// The following functions all reverse the byte order.
|
||||
|
||||
//func p256BigToLittle(res *p256Element, in *[32]byte)
|
||||
TEXT ·p256BigToLittle(SB), NOSPLIT, $0-16
|
||||
MOVD res+0(FP), R3
|
||||
MOVD in+8(FP), R4
|
||||
BR p256InternalEndianSwap<>(SB)
|
||||
|
||||
//func p256LittleToBig(res *[32]byte, in *p256Element)
|
||||
TEXT ·p256LittleToBig(SB), NOSPLIT, $0-16
|
||||
MOVD res+0(FP), R3
|
||||
MOVD in+8(FP), R4
|
||||
BR p256InternalEndianSwap<>(SB)
|
||||
|
||||
//func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
|
||||
TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0-16
|
||||
MOVD res+0(FP), R3
|
||||
MOVD in+8(FP), R4
|
||||
BR p256InternalEndianSwap<>(SB)
|
||||
|
||||
//func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
|
||||
TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0-16
|
||||
MOVD res+0(FP), R3
|
||||
MOVD in+8(FP), R4
|
||||
BR p256InternalEndianSwap<>(SB)
|
||||
|
||||
TEXT p256InternalEndianSwap<>(SB), NOSPLIT, $0-0
|
||||
// Index registers needed for BR movs
|
||||
MOVD $8, R9
|
||||
MOVD $16, R10
|
||||
MOVD $24, R14
|
||||
|
||||
MOVDBR (R0)(R4), R5
|
||||
MOVDBR (R9)(R4), R6
|
||||
MOVDBR (R10)(R4), R7
|
||||
MOVDBR (R14)(R4), R8
|
||||
|
||||
MOVD R8, 0(R3)
|
||||
MOVD R7, 8(R3)
|
||||
MOVD R6, 16(R3)
|
||||
MOVD R5, 24(R3)
|
||||
|
||||
RET
|
||||
|
||||
#define P3ptr R3
|
||||
#define P1ptr R4
|
||||
#define COUNT R5
|
||||
|
||||
#define X1L V0
|
||||
#define X1H V1
|
||||
#define Y1L V2
|
||||
#define Y1H V3
|
||||
#define Z1L V4
|
||||
#define Z1H V5
|
||||
#define X2L V6
|
||||
#define X2H V7
|
||||
#define Y2L V8
|
||||
#define Y2H V9
|
||||
#define Z2L V10
|
||||
#define Z2H V11
|
||||
|
||||
#define ONE V18
|
||||
#define IDX V19
|
||||
#define SEL1 V20
|
||||
#define SEL2 V21
|
||||
|
||||
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
|
||||
TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
|
||||
MOVD res+0(FP), P3ptr
|
||||
MOVD table+8(FP), P1ptr
|
||||
MOVD $16, R16
|
||||
MOVD $32, R17
|
||||
MOVD $48, R18
|
||||
|
||||
LXVDSX (R1)(R18), SEL1
|
||||
VSPLTB $7, SEL1, IDX // splat byte
|
||||
|
||||
VSPLTISB $1, ONE // Vector with byte 1s
|
||||
VSPLTISB $1, SEL2 // Vector with byte 1s
|
||||
MOVD $32, COUNT
|
||||
MOVD COUNT, CTR // loop count
|
||||
|
||||
VSPLTISB $0, X1H // VZERO X1H
|
||||
VSPLTISB $0, X1L // VZERO X1L
|
||||
VSPLTISB $0, Y1H // VZERO Y1H
|
||||
VSPLTISB $0, Y1L // VZERO Y1L
|
||||
|
||||
loop_select:
|
||||
LXVD2X (P1ptr+R0), X2H
|
||||
LXVD2X (P1ptr+R16), X2L
|
||||
LXVD2X (P1ptr+R17), Y2H
|
||||
LXVD2X (P1ptr+R18), Y2L
|
||||
|
||||
VCMPEQUD SEL2, IDX, SEL1 // Compare against idx
|
||||
|
||||
VSEL X1L, X2L, SEL1, X1L // Select if idx matched
|
||||
VSEL X1H, X2H, SEL1, X1H
|
||||
VSEL Y1L, Y2L, SEL1, Y1L
|
||||
VSEL Y1H, Y2H, SEL1, Y1H
|
||||
|
||||
VADDUBM SEL2, ONE, SEL2 // Increment SEL2 bytes by 1
|
||||
ADD $64, P1ptr // Next chunk
|
||||
BDNZ loop_select
|
||||
|
||||
STXVD2X X1H, (P3ptr+R0)
|
||||
STXVD2X X1L, (P3ptr+R16)
|
||||
STXVD2X Y1H, (P3ptr+R17)
|
||||
STXVD2X Y1L, (P3ptr+R18)
|
||||
RET
|
||||
|
||||
#undef P3ptr
|
||||
#undef P1ptr
|
||||
#undef COUNT
|
||||
#undef X1L
|
||||
#undef X1H
|
||||
#undef Y1L
|
||||
#undef Y1H
|
||||
#undef Z1L
|
||||
#undef Z1H
|
||||
#undef X2L
|
||||
#undef X2H
|
||||
#undef Y2L
|
||||
#undef Y2H
|
||||
#undef Z2L
|
||||
#undef Z2H
|
||||
#undef ONE
|
||||
#undef IDX
|
||||
#undef SEL1
|
||||
#undef SEL2
|
||||
|
||||
#define res_ptr R3
|
||||
#define x_ptr R4
|
||||
#define CPOOL R7
|
||||
|
||||
#define T0 V0
|
||||
#define T1 V1
|
||||
#define T2 V2
|
||||
#define TT0 V3
|
||||
#define TT1 V4
|
||||
|
||||
#define ZER V6
|
||||
#define SEL1 V7
|
||||
#define SEL2 V8
|
||||
#define CAR1 V9
|
||||
#define CAR2 V10
|
||||
#define RED1 V11
|
||||
#define RED2 V12
|
||||
#define PL V13
|
||||
#define PH V14
|
||||
|
||||
// func p256FromMont(res, in *p256Element)
|
||||
TEXT ·p256FromMont(SB), NOSPLIT, $0-16
|
||||
MOVD res+0(FP), res_ptr
|
||||
MOVD in+8(FP), x_ptr
|
||||
|
||||
MOVD $16, R16
|
||||
MOVD $32, R17
|
||||
MOVD $p256<>+0x00(SB), CPOOL
|
||||
|
||||
VSPLTISB $0, T2 // VZERO T2
|
||||
VSPLTISB $0, ZER // VZERO ZER
|
||||
|
||||
// Constants are defined so that the LXVD2X is correct
|
||||
LXVD2X (CPOOL+R0), PH
|
||||
LXVD2X (CPOOL+R16), PL
|
||||
|
||||
// VPERM byte selections
|
||||
LXVD2X (CPOOL+R17), SEL1
|
||||
|
||||
LXVD2X (R16)(x_ptr), T1
|
||||
LXVD2X (R0)(x_ptr), T0
|
||||
|
||||
// Put in true little endian order
|
||||
XXPERMDI T0, T0, $2, T0
|
||||
XXPERMDI T1, T1, $2, T1
|
||||
|
||||
// First round
|
||||
VPERM ZER, T0, SEL1, RED1 // 0 0 d1 d0
|
||||
VSLDOI $4, RED1, ZER, TT0 // 0 d1 d0 0
|
||||
VSLDOI $4, TT0, ZER, RED2 // d1 d0 0 0
|
||||
VSUBCUQ RED1, TT0, CAR1 // VSCBIQ TT0, RED1, CAR1
|
||||
VSUBUQM RED1, TT0, RED1 // VSQ TT0, RED1, RED1
|
||||
VSUBEUQM RED2, TT0, CAR1, RED2 // VSBIQ RED2, TT0, CAR1, RED2 // Guaranteed not to underflow
|
||||
|
||||
VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
|
||||
VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1
|
||||
|
||||
VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1
|
||||
VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0
|
||||
VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
|
||||
VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1
|
||||
VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2
|
||||
|
||||
// Second round
|
||||
VPERM ZER, T0, SEL1, RED1 // 0 0 d1 d0
|
||||
VSLDOI $4, RED1, ZER, TT0 // 0 d1 d0 0
|
||||
VSLDOI $4, TT0, ZER, RED2 // d1 d0 0 0
|
||||
VSUBCUQ RED1, TT0, CAR1 // VSCBIQ TT0, RED1, CAR1
|
||||
VSUBUQM RED1, TT0, RED1 // VSQ TT0, RED1, RED1
|
||||
VSUBEUQM RED2, TT0, CAR1, RED2 // VSBIQ RED2, TT0, CAR1, RED2 // Guaranteed not to underflow
|
||||
|
||||
VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
|
||||
VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1
|
||||
|
||||
VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1
|
||||
VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0
|
||||
VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
|
||||
VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1
|
||||
VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2
|
||||
|
||||
// Third round
|
||||
VPERM ZER, T0, SEL1, RED1 // 0 0 d1 d0
|
||||
VSLDOI $4, RED1, ZER, TT0 // 0 d1 d0 0
|
||||
VSLDOI $4, TT0, ZER, RED2 // d1 d0 0 0
|
||||
VSUBCUQ RED1, TT0, CAR1 // VSCBIQ TT0, RED1, CAR1
|
||||
VSUBUQM RED1, TT0, RED1 // VSQ TT0, RED1, RED1
|
||||
VSUBEUQM RED2, TT0, CAR1, RED2 // VSBIQ RED2, TT0, CAR1, RED2 // Guaranteed not to underflow
|
||||
|
||||
VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
|
||||
VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1
|
||||
|
||||
VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1
|
||||
VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0
|
||||
VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
|
||||
VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1
|
||||
VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2
|
||||
|
||||
// Last round
|
||||
VPERM ZER, T0, SEL1, RED1 // 0 0 d1 d0
|
||||
VSLDOI $4, RED1, ZER, TT0 // 0 d1 d0 0
|
||||
VSLDOI $4, TT0, ZER, RED2 // d1 d0 0 0
|
||||
VSUBCUQ RED1, TT0, CAR1 // VSCBIQ TT0, RED1, CAR1
|
||||
VSUBUQM RED1, TT0, RED1 // VSQ TT0, RED1, RED1
|
||||
VSUBEUQM RED2, TT0, CAR1, RED2 // VSBIQ RED2, TT0, CAR1, RED2 // Guaranteed not to underflow
|
||||
|
||||
VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
|
||||
VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1
|
||||
|
||||
VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1
|
||||
VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0
|
||||
VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
|
||||
VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1
|
||||
VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2
|
||||
|
||||
// ---------------------------------------------------
|
||||
|
||||
VSUBCUQ T0, PL, CAR1 // VSCBIQ PL, T0, CAR1
|
||||
VSUBUQM T0, PL, TT0 // VSQ PL, T0, TT0
|
||||
VSUBECUQ T1, PH, CAR1, CAR2 // VSBCBIQ T1, PH, CAR1, CAR2
|
||||
VSUBEUQM T1, PH, CAR1, TT1 // VSBIQ T1, PH, CAR1, TT1
|
||||
VSUBEUQM T2, ZER, CAR2, T2 // VSBIQ T2, ZER, CAR2, T2
|
||||
|
||||
VSEL TT0, T0, T2, T0
|
||||
VSEL TT1, T1, T2, T1
|
||||
|
||||
// Reorder the bytes so STXVD2X can be used.
|
||||
// TT0, TT1 used for VPERM result in case
|
||||
// the caller expects T0, T1 to be good.
|
||||
XXPERMDI T0, T0, $2, TT0
|
||||
XXPERMDI T1, T1, $2, TT1
|
||||
|
||||
STXVD2X TT0, (R0)(res_ptr)
|
||||
STXVD2X TT1, (R16)(res_ptr)
|
||||
RET
|
||||
|
||||
#undef res_ptr
|
||||
#undef x_ptr
|
||||
#undef CPOOL
|
||||
#undef T0
|
||||
#undef T1
|
||||
#undef T2
|
||||
#undef TT0
|
||||
#undef TT1
|
||||
#undef ZER
|
||||
#undef SEL1
|
||||
#undef SEL2
|
||||
#undef CAR1
|
||||
#undef CAR2
|
||||
#undef RED1
|
||||
#undef RED2
|
||||
#undef PL
|
||||
#undef PH
|
||||
|
||||
//func p256OrdReduce(s *p256OrdElement)
|
||||
#define res_ptr R1
|
||||
#define CPOOL R4
|
||||
|
||||
#define T0 V0
|
||||
#define T1 V1
|
||||
#define T2 V2
|
||||
#define TT0 V3
|
||||
#define TT1 V4
|
||||
|
||||
#define ZER V6
|
||||
#define CAR1 V7
|
||||
#define CAR2 V8
|
||||
#define PL V9
|
||||
#define PH V10
|
||||
|
||||
TEXT ·p256OrdReduce(SB),NOSPLIT,$0
|
||||
MOVD res+0(FP), res_ptr
|
||||
MOVD $16, R16
|
||||
|
||||
VSPLTISB $0, T2 // VZERO T2
|
||||
VSPLTISB $0, ZER // VZERO ZER
|
||||
|
||||
MOVD $p256ord<>+0x00(SB), CPOOL
|
||||
LXVD2X (CPOOL+R0), PH
|
||||
LXVD2X (CPOOL+R16), PL
|
||||
|
||||
LXVD2X (R16)(res_ptr), T1
|
||||
LXVD2X (R0)(res_ptr), T0
|
||||
|
||||
// Put in true little endian order
|
||||
XXPERMDI T0, T0, $2, T0
|
||||
XXPERMDI T1, T1, $2, T1
|
||||
|
||||
VSUBCUQ T0, PL, CAR1 // VSCBIQ PL, T0, CAR1
|
||||
VSUBUQM T0, PL, TT0 // VSQ PL, T0, TT0
|
||||
VSUBECUQ T1, PH, CAR1, CAR2 // VSBCBIQ T1, PH, CAR1, CAR2
|
||||
VSUBEUQM T1, PH, CAR1, TT1 // VSBIQ T1, PH, CAR1, TT1
|
||||
VSUBEUQM T2, ZER, CAR2, T2 // VSBIQ T2, ZER, CAR2, T2
|
||||
|
||||
VSEL TT0, T0, T2, T0
|
||||
VSEL TT1, T1, T2, T1
|
||||
|
||||
// Reorder the bytes so STXVD2X can be used.
|
||||
// TT0, TT1 used for VPERM result in case
|
||||
// the caller expects T0, T1 to be good.
|
||||
XXPERMDI T0, T0, $2, TT0
|
||||
XXPERMDI T1, T1, $2, TT1
|
||||
|
||||
STXVD2X TT0, (R0)(res_ptr)
|
||||
STXVD2X TT1, (R16)(res_ptr)
|
||||
|
||||
RET
|
||||
#undef res_ptr
|
||||
#undef CPOOL
|
||||
#undef T0
|
||||
#undef T1
|
||||
#undef T2
|
||||
#undef TT0
|
||||
#undef TT1
|
||||
#undef ZER
|
||||
#undef CAR1
|
||||
#undef CAR2
|
||||
#undef PL
|
||||
#undef PH
|
44
internal/sm2ec/sm2p256_asm_ppc64le.go
Normal file
44
internal/sm2ec/sm2p256_asm_ppc64le.go
Normal file
@ -0,0 +1,44 @@
|
||||
//go:build !purego
|
||||
|
||||
package sm2ec
|
||||
|
||||
// p256Element is a P-256 base field element in [0, P-1] in the Montgomery
|
||||
// domain (with R 2²⁵⁶) as four limbs in little-endian order value.
|
||||
type p256Element [4]uint64
|
||||
|
||||
// p256OrdElement is a P-256 scalar field element in [0, ord(G)-1] in the
|
||||
// Montgomery domain (with R 2²⁵⁶) as four uint64 limbs in little-endian order.
|
||||
type p256OrdElement [4]uint64
|
||||
|
||||
// Montgomery multiplication by R⁻¹, or 1 outside the domain.
|
||||
// Sets res = in * R⁻¹, bringing res out of the Montgomery domain.
|
||||
//
|
||||
//go:noescape
|
||||
func p256FromMont(res, in *p256Element)
|
||||
|
||||
// If cond is not 0, sets val = -val mod p.
|
||||
//
|
||||
//go:noescape
|
||||
func p256NegCond(val *p256Element, cond int)
|
||||
|
||||
// If cond is 0, sets res = b, otherwise sets res = a.
|
||||
//
|
||||
//go:noescape
|
||||
func p256MovCond(res, a, b *SM2P256Point, cond int)
|
||||
|
||||
//go:noescape
|
||||
func p256BigToLittle(res *p256Element, in *[32]byte)
|
||||
|
||||
//go:noescape
|
||||
func p256LittleToBig(res *[32]byte, in *p256Element)
|
||||
|
||||
//go:noescape
|
||||
func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
|
||||
|
||||
//go:noescape
|
||||
func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
|
||||
|
||||
// p256OrdReduce ensures s is in the range [0, ord(G)-1].
|
||||
//
|
||||
//go:noescape
|
||||
func p256OrdReduce(s *p256OrdElement)
|
82
internal/sm2ec/sm2p256_asm_ppc64le_test.go
Normal file
82
internal/sm2ec/sm2p256_asm_ppc64le_test.go
Normal file
@ -0,0 +1,82 @@
|
||||
//go:build ppc64le && !purego
|
||||
|
||||
package sm2ec
|
||||
|
||||
import (
|
||||
"math/big"
|
||||
"testing"
|
||||
)
|
||||
|
||||
var bigOne = big.NewInt(1)
|
||||
|
||||
// fromBig converts a *big.Int into a format used by this code.
|
||||
func fromBig(out *[4]uint64, big *big.Int) {
|
||||
for i := range out {
|
||||
out[i] = 0
|
||||
}
|
||||
|
||||
for i, v := range big.Bits() {
|
||||
out[i] = uint64(v)
|
||||
}
|
||||
}
|
||||
|
||||
func montFromBig(out *[4]uint64, n *big.Int) {
|
||||
p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
|
||||
r := new(big.Int).Lsh(bigOne, 256)
|
||||
// out = big * R mod P
|
||||
outBig := new(big.Int).Mul(n, r)
|
||||
outBig.Mod(outBig, p)
|
||||
fromBig(out, outBig)
|
||||
}
|
||||
|
||||
func toBigInt(in *p256Element) *big.Int {
|
||||
var valBytes [32]byte
|
||||
p256LittleToBig(&valBytes, in)
|
||||
return new(big.Int).SetBytes(valBytes[:])
|
||||
}
|
||||
|
||||
func ordElmToBigInt(in *p256OrdElement) *big.Int {
|
||||
var valBytes [32]byte
|
||||
p256OrdLittleToBig(&valBytes, in)
|
||||
return new(big.Int).SetBytes(valBytes[:])
|
||||
}
|
||||
|
||||
func testP256FromMont(v *big.Int, t *testing.T) {
|
||||
val := new(p256Element)
|
||||
montFromBig((*[4]uint64)(val), v)
|
||||
res := new(p256Element)
|
||||
p256FromMont(res, val)
|
||||
if toBigInt(res).Cmp(v) != 0 {
|
||||
t.Fatalf("p256FromMont failed for %v", v)
|
||||
}
|
||||
}
|
||||
|
||||
func TestP256FromMont(t *testing.T) {
|
||||
p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
|
||||
for i := 0; i < 20; i++ {
|
||||
bigVal := big.NewInt(int64(i))
|
||||
testP256FromMont(bigVal, t)
|
||||
bigVal = new(big.Int).Sub(p, big.NewInt(int64(i)))
|
||||
testP256FromMont(bigVal, t)
|
||||
}
|
||||
}
|
||||
|
||||
func testP256OrderReduce(v *big.Int, t *testing.T) {
|
||||
val := new(p256OrdElement)
|
||||
montFromBig((*[4]uint64)(val), v)
|
||||
p256OrdReduce(val)
|
||||
if ordElmToBigInt(val).Cmp(v) != 0 {
|
||||
t.Fatalf("p256OrdReduce failed for %v", v)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
func TestP256OrderReduce(t *testing.T) {
|
||||
p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
|
||||
for i := 0; i < 20; i++ {
|
||||
bigVal := big.NewInt(int64(i))
|
||||
testP256OrderReduce(bigVal, t)
|
||||
bigVal = new(big.Int).Sub(p, big.NewInt(int64(i)))
|
||||
testP256OrderReduce(bigVal, t)
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user