From 8be54ddf9ea2e1c431a83e5a9b533b87a06935f1 Mon Sep 17 00:00:00 2001
From: Sun Yimin <emmansun@users.noreply.github.com>
Date: Wed, 28 Aug 2024 08:18:43 +0800
Subject: [PATCH] internal/subtle: port ppc64x from golang

---
 .github/workflows/test_ppc64.yaml |   8 +-
 .github/workflows/test_s390x.yaml |   6 +-
 internal/subtle/xor_generic.go    |   2 +-
 internal/subtle/xor_ppc64x.go     |  10 +++
 internal/subtle/xor_ppc64x.s      | 142 ++++++++++++++++++++++++++++++
 5 files changed, 158 insertions(+), 10 deletions(-)
 create mode 100644 internal/subtle/xor_ppc64x.go
 create mode 100644 internal/subtle/xor_ppc64x.s

diff --git a/.github/workflows/test_ppc64.yaml b/.github/workflows/test_ppc64.yaml
index 9df5881..14c05bd 100644
--- a/.github/workflows/test_ppc64.yaml
+++ b/.github/workflows/test_ppc64.yaml
@@ -29,12 +29,8 @@ jobs:
     - name: Check out code
       uses: actions/checkout@v4
 
-    - name: Test
-      run: go test -v -short ./internal/bigmod/...
+    - name: Test internal
+      run: go test ./internal/...
       env:
         GOARCH: ${{ matrix.arch }}      
 
-    - name: Test SM2EC
-      run: go test -v ./internal/sm2ec/...
-      env:
-        GOARCH: ${{ matrix.arch }}      
diff --git a/.github/workflows/test_s390x.yaml b/.github/workflows/test_s390x.yaml
index 3ca226d..dd4b3d1 100644
--- a/.github/workflows/test_s390x.yaml
+++ b/.github/workflows/test_s390x.yaml
@@ -29,11 +29,11 @@ jobs:
     - name: Check out code
       uses: actions/checkout@v4
 
-    - name: Test SM2EC
-      run: go test -v ./internal/sm2ec/...
+    - name: Test internal
+      run: go test -v ./internal/...
       env:
         GOARCH: ${{ matrix.arch }}
-  
+
 #    - name: Test
 #      run: go test -v -short ./...
 #      env:
diff --git a/internal/subtle/xor_generic.go b/internal/subtle/xor_generic.go
index 18dafd0..c515f53 100644
--- a/internal/subtle/xor_generic.go
+++ b/internal/subtle/xor_generic.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 //
-//go:build purego || !(amd64 || arm64 || s390x)
+//go:build purego || !(amd64 || arm64 || s390x || ppc64 || ppc64le)
 
 package subtle
 
diff --git a/internal/subtle/xor_ppc64x.go b/internal/subtle/xor_ppc64x.go
new file mode 100644
index 0000000..760463c
--- /dev/null
+++ b/internal/subtle/xor_ppc64x.go
@@ -0,0 +1,10 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (ppc64 || ppc64le) && !purego
+
+package subtle
+
+//go:noescape
+func xorBytes(dst, a, b *byte, n int)
diff --git a/internal/subtle/xor_ppc64x.s b/internal/subtle/xor_ppc64x.s
new file mode 100644
index 0000000..c1f72c5
--- /dev/null
+++ b/internal/subtle/xor_ppc64x.s
@@ -0,0 +1,142 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (ppc64 || ppc64le) && !purego
+
+#include "textflag.h"
+
+// func xorBytes(dst, a, b *byte, n int)
+TEXT ·xorBytes(SB), NOSPLIT, $0
+	MOVD	dst+0(FP), R3	// R3 = dst
+	MOVD	a+8(FP), R4	// R4 = a
+	MOVD	b+16(FP), R5	// R5 = b
+	MOVD	n+24(FP), R6	// R6 = n
+
+	CMPU	R6, $64, CR7	// Check if n ≥ 64 bytes
+	MOVD	R0, R8		// R8 = index
+	CMPU	R6, $8, CR6	// Check if 8 ≤ n < 64 bytes
+	BLE	CR6, small	// <= 8
+	BLT	CR7, xor32	// Case for 32 ≤ n < 64 bytes
+
+	// Case for n ≥ 64 bytes
+preloop64:
+	SRD	$6, R6, R7	// Set up loop counter
+	MOVD	R7, CTR
+	MOVD	$16, R10
+	MOVD	$32, R14
+	MOVD	$48, R15
+	ANDCC	$63, R6, R9	// Check for tailing bytes for later
+	PCALIGN $16
+	// Case for >= 64 bytes
+	// Process 64 bytes per iteration
+	// Load 4 vectors of a and b
+	// XOR the corresponding vectors
+	// from a and b and store the result
+loop64:
+	LXVD2X	(R4)(R8), VS32
+	LXVD2X	(R4)(R10), VS34
+	LXVD2X	(R4)(R14), VS36
+	LXVD2X	(R4)(R15), VS38
+	LXVD2X	(R5)(R8), VS33
+	LXVD2X	(R5)(R10), VS35
+	LXVD2X	(R5)(R14), VS37
+	LXVD2X	(R5)(R15), VS39
+	XXLXOR	VS32, VS33, VS32
+	XXLXOR	VS34, VS35, VS34
+	XXLXOR	VS36, VS37, VS36
+	XXLXOR	VS38, VS39, VS38
+	STXVD2X	VS32, (R3)(R8)
+	STXVD2X	VS34, (R3)(R10)
+	STXVD2X	VS36, (R3)(R14)
+	STXVD2X	VS38, (R3)(R15)
+	ADD	$64, R8
+	ADD	$64, R10
+	ADD	$64, R14
+	ADD	$64, R15
+	BDNZ	loop64
+	BC	12,2,LR		// BEQLR
+	MOVD	R9, R6
+	CMP	R6, $8
+	BLE	small
+	// Case for 8 <= n < 64 bytes
+	// Process 32 bytes if available
+xor32:
+	CMP	R6, $32
+	BLT	xor16
+	ADD	$16, R8, R9
+	LXVD2X	(R4)(R8), VS32
+	LXVD2X	(R4)(R9), VS33
+	LXVD2X	(R5)(R8), VS34
+	LXVD2X	(R5)(R9), VS35
+	XXLXOR	VS32, VS34, VS32
+	XXLXOR	VS33, VS35, VS33
+	STXVD2X	VS32, (R3)(R8)
+	STXVD2X	VS33, (R3)(R9)
+	ADD	$32, R8
+	ADD	$-32, R6
+	CMP	R6, $8
+	BLE	small
+	// Case for 8 <= n < 32 bytes
+	// Process 16 bytes if available
+xor16:
+	CMP	R6, $16
+	BLT	xor8
+	LXVD2X	(R4)(R8), VS32
+	LXVD2X	(R5)(R8), VS33
+	XXLXOR	VS32, VS33, VS32
+	STXVD2X	VS32, (R3)(R8)
+	ADD	$16, R8
+	ADD	$-16, R6
+small:
+	CMP	R6, $0
+	BC	12,2,LR		// BEQLR
+xor8:
+#ifdef GOPPC64_power10
+	SLD	$56,R6,R17
+	ADD	R4,R8,R18
+	ADD	R5,R8,R19
+	ADD	R3,R8,R20
+	LXVL	R18,R17,V0
+	LXVL	R19,R17,V1
+	VXOR	V0,V1,V1
+	STXVL	V1,R20,R17
+	RET
+#else
+	CMP	R6, $8
+	BLT	xor4
+	// Case for 8 ≤ n < 16 bytes
+	MOVD	(R4)(R8), R14   // R14 = a[i,...,i+7]
+	MOVD	(R5)(R8), R15   // R15 = b[i,...,i+7]
+	XOR	R14, R15, R16   // R16 = a[] ^ b[]
+	SUB	$8, R6          // n = n - 8
+	MOVD	R16, (R3)(R8)   // Store to dst
+	ADD	$8, R8
+xor4:
+	CMP	R6, $4
+	BLT	xor2
+	MOVWZ	(R4)(R8), R14
+	MOVWZ	(R5)(R8), R15
+	XOR	R14, R15, R16
+	MOVW	R16, (R3)(R8)
+	ADD	$4,R8
+	ADD	$-4,R6
+xor2:
+	CMP	R6, $2
+	BLT	xor1
+	MOVHZ	(R4)(R8), R14
+	MOVHZ	(R5)(R8), R15
+	XOR	R14, R15, R16
+	MOVH	R16, (R3)(R8)
+	ADD	$2,R8
+	ADD	$-2,R6
+xor1:
+	CMP	R6, $0
+	BC	12,2,LR		// BEQLR
+	MOVBZ	(R4)(R8), R14	// R14 = a[i]
+	MOVBZ	(R5)(R8), R15	// R15 = b[i]
+	XOR	R14, R15, R16	// R16 = a[i] ^ b[i]
+	MOVB	R16, (R3)(R8)	// Store to dst
+#endif
+done:
+	RET