From 8be54ddf9ea2e1c431a83e5a9b533b87a06935f1 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Wed, 28 Aug 2024 08:18:43 +0800 Subject: [PATCH] internal/subtle: port ppc64x from golang --- .github/workflows/test_ppc64.yaml | 8 +- .github/workflows/test_s390x.yaml | 6 +- internal/subtle/xor_generic.go | 2 +- internal/subtle/xor_ppc64x.go | 10 +++ internal/subtle/xor_ppc64x.s | 142 ++++++++++++++++++++++++++++++ 5 files changed, 158 insertions(+), 10 deletions(-) create mode 100644 internal/subtle/xor_ppc64x.go create mode 100644 internal/subtle/xor_ppc64x.s diff --git a/.github/workflows/test_ppc64.yaml b/.github/workflows/test_ppc64.yaml index 9df5881..14c05bd 100644 --- a/.github/workflows/test_ppc64.yaml +++ b/.github/workflows/test_ppc64.yaml @@ -29,12 +29,8 @@ jobs: - name: Check out code uses: actions/checkout@v4 - - name: Test - run: go test -v -short ./internal/bigmod/... + - name: Test internal + run: go test ./internal/... env: GOARCH: ${{ matrix.arch }} - - name: Test SM2EC - run: go test -v ./internal/sm2ec/... - env: - GOARCH: ${{ matrix.arch }} diff --git a/.github/workflows/test_s390x.yaml b/.github/workflows/test_s390x.yaml index 3ca226d..dd4b3d1 100644 --- a/.github/workflows/test_s390x.yaml +++ b/.github/workflows/test_s390x.yaml @@ -29,11 +29,11 @@ jobs: - name: Check out code uses: actions/checkout@v4 - - name: Test SM2EC - run: go test -v ./internal/sm2ec/... + - name: Test internal + run: go test -v ./internal/... env: GOARCH: ${{ matrix.arch }} - + # - name: Test # run: go test -v -short ./... # env: diff --git a/internal/subtle/xor_generic.go b/internal/subtle/xor_generic.go index 18dafd0..c515f53 100644 --- a/internal/subtle/xor_generic.go +++ b/internal/subtle/xor_generic.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // -//go:build purego || !(amd64 || arm64 || s390x) +//go:build purego || !(amd64 || arm64 || s390x || ppc64 || ppc64le) package subtle diff --git a/internal/subtle/xor_ppc64x.go b/internal/subtle/xor_ppc64x.go new file mode 100644 index 0000000..760463c --- /dev/null +++ b/internal/subtle/xor_ppc64x.go @@ -0,0 +1,10 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (ppc64 || ppc64le) && !purego + +package subtle + +//go:noescape +func xorBytes(dst, a, b *byte, n int) diff --git a/internal/subtle/xor_ppc64x.s b/internal/subtle/xor_ppc64x.s new file mode 100644 index 0000000..c1f72c5 --- /dev/null +++ b/internal/subtle/xor_ppc64x.s @@ -0,0 +1,142 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (ppc64 || ppc64le) && !purego + +#include "textflag.h" + +// func xorBytes(dst, a, b *byte, n int) +TEXT ·xorBytes(SB), NOSPLIT, $0 + MOVD dst+0(FP), R3 // R3 = dst + MOVD a+8(FP), R4 // R4 = a + MOVD b+16(FP), R5 // R5 = b + MOVD n+24(FP), R6 // R6 = n + + CMPU R6, $64, CR7 // Check if n ≥ 64 bytes + MOVD R0, R8 // R8 = index + CMPU R6, $8, CR6 // Check if 8 ≤ n < 64 bytes + BLE CR6, small // <= 8 + BLT CR7, xor32 // Case for 32 ≤ n < 64 bytes + + // Case for n ≥ 64 bytes +preloop64: + SRD $6, R6, R7 // Set up loop counter + MOVD R7, CTR + MOVD $16, R10 + MOVD $32, R14 + MOVD $48, R15 + ANDCC $63, R6, R9 // Check for tailing bytes for later + PCALIGN $16 + // Case for >= 64 bytes + // Process 64 bytes per iteration + // Load 4 vectors of a and b + // XOR the corresponding vectors + // from a and b and store the result +loop64: + LXVD2X (R4)(R8), VS32 + LXVD2X (R4)(R10), VS34 + LXVD2X (R4)(R14), VS36 + LXVD2X (R4)(R15), VS38 + LXVD2X (R5)(R8), VS33 + LXVD2X (R5)(R10), VS35 + LXVD2X (R5)(R14), VS37 + LXVD2X (R5)(R15), VS39 + XXLXOR VS32, VS33, VS32 + XXLXOR VS34, VS35, VS34 + XXLXOR VS36, VS37, VS36 + XXLXOR VS38, VS39, VS38 + STXVD2X VS32, (R3)(R8) + STXVD2X VS34, (R3)(R10) + STXVD2X VS36, (R3)(R14) + STXVD2X VS38, (R3)(R15) + ADD $64, R8 + ADD $64, R10 + ADD $64, R14 + ADD $64, R15 + BDNZ loop64 + BC 12,2,LR // BEQLR + MOVD R9, R6 + CMP R6, $8 + BLE small + // Case for 8 <= n < 64 bytes + // Process 32 bytes if available +xor32: + CMP R6, $32 + BLT xor16 + ADD $16, R8, R9 + LXVD2X (R4)(R8), VS32 + LXVD2X (R4)(R9), VS33 + LXVD2X (R5)(R8), VS34 + LXVD2X (R5)(R9), VS35 + XXLXOR VS32, VS34, VS32 + XXLXOR VS33, VS35, VS33 + STXVD2X VS32, (R3)(R8) + STXVD2X VS33, (R3)(R9) + ADD $32, R8 + ADD $-32, R6 + CMP R6, $8 + BLE small + // Case for 8 <= n < 32 bytes + // Process 16 bytes if available +xor16: + CMP R6, $16 + BLT xor8 + LXVD2X (R4)(R8), VS32 + LXVD2X (R5)(R8), VS33 + XXLXOR VS32, VS33, VS32 + STXVD2X VS32, (R3)(R8) + ADD $16, R8 + ADD $-16, R6 +small: + CMP R6, $0 + BC 12,2,LR // BEQLR +xor8: +#ifdef GOPPC64_power10 + SLD $56,R6,R17 + ADD R4,R8,R18 + ADD R5,R8,R19 + ADD R3,R8,R20 + LXVL R18,R17,V0 + LXVL R19,R17,V1 + VXOR V0,V1,V1 + STXVL V1,R20,R17 + RET +#else + CMP R6, $8 + BLT xor4 + // Case for 8 ≤ n < 16 bytes + MOVD (R4)(R8), R14 // R14 = a[i,...,i+7] + MOVD (R5)(R8), R15 // R15 = b[i,...,i+7] + XOR R14, R15, R16 // R16 = a[] ^ b[] + SUB $8, R6 // n = n - 8 + MOVD R16, (R3)(R8) // Store to dst + ADD $8, R8 +xor4: + CMP R6, $4 + BLT xor2 + MOVWZ (R4)(R8), R14 + MOVWZ (R5)(R8), R15 + XOR R14, R15, R16 + MOVW R16, (R3)(R8) + ADD $4,R8 + ADD $-4,R6 +xor2: + CMP R6, $2 + BLT xor1 + MOVHZ (R4)(R8), R14 + MOVHZ (R5)(R8), R15 + XOR R14, R15, R16 + MOVH R16, (R3)(R8) + ADD $2,R8 + ADD $-2,R6 +xor1: + CMP R6, $0 + BC 12,2,LR // BEQLR + MOVBZ (R4)(R8), R14 // R14 = a[i] + MOVBZ (R5)(R8), R15 // R15 = b[i] + XOR R14, R15, R16 // R16 = a[i] ^ b[i] + MOVB R16, (R3)(R8) // Store to dst +#endif +done: + RET