From 4808935f6020cadf2b8e04c1873d77543225029f Mon Sep 17 00:00:00 2001
From: Sun Yimin
Date: Mon, 23 Sep 2024 17:50:16 +0800
Subject: [PATCH] cipher: xts ppc64x

---
 cipher/xts_asm.go      |   2 +-
 cipher/xts_asm_test.go |   2 +-
 cipher/xts_generic.go  |   2 +-
 cipher/xts_ppc64x.s    | 166 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 169 insertions(+), 3 deletions(-)
 create mode 100644 cipher/xts_ppc64x.s

diff --git a/cipher/xts_asm.go b/cipher/xts_asm.go
index 8f08d2f..5b0e0d0 100644
--- a/cipher/xts_asm.go
+++ b/cipher/xts_asm.go
@@ -1,4 +1,4 @@
-//go:build (amd64 || arm64 || s390x) && !purego
+//go:build (amd64 || arm64 || s390x || ppc64 || ppc64le) && !purego
 
 package cipher
 
diff --git a/cipher/xts_asm_test.go b/cipher/xts_asm_test.go
index de383f1..f567f8e 100644
--- a/cipher/xts_asm_test.go
+++ b/cipher/xts_asm_test.go
@@ -1,4 +1,4 @@
-//go:build (amd64 || arm64 || s390x) && !purego
+//go:build (amd64 || arm64 || s390x || ppc64 || ppc64le) && !purego
 
 package cipher
 
diff --git a/cipher/xts_generic.go b/cipher/xts_generic.go
index 1a91420..c091c65 100644
--- a/cipher/xts_generic.go
+++ b/cipher/xts_generic.go
@@ -1,4 +1,4 @@
-//go:build purego || !(amd64 || arm64 || s390x)
+//go:build purego || !(amd64 || arm64 || s390x || ppc64 || ppc64le)
 
 package cipher
 
diff --git a/cipher/xts_ppc64x.s b/cipher/xts_ppc64x.s
new file mode 100644
index 0000000..57a3910
--- /dev/null
+++ b/cipher/xts_ppc64x.s
@@ -0,0 +1,166 @@
+// Copyright 2024 Sun Yimin. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+//go:build (ppc64 || ppc64le) && !purego
+
+#include "textflag.h"
+
+DATA xtsMask<>+0x00(SB)/8, $0x0f0e0d0c0b0a0908 // byte swap BE -> LE
+DATA xtsMask<>+0x08(SB)/8, $0x0706050403020100
+DATA xtsMask<>+0x10(SB)/8, $0x0000000000000000
+DATA xtsMask<>+0x18(SB)/8, $0x0000000000000087
+DATA xtsMask<>+0x20(SB)/8, $0xe100000000000000
+DATA xtsMask<>+0x28(SB)/8, $0x0000000000000000
+GLOBL xtsMask<>(SB), (NOPTR+RODATA), $48
+
+#define ESPERM V21 // Endian swapping permute into BE
+
+#define POLY V0
+#define B0 V1
+#define T0 V2
+#define T1 V3
+#define CPOOL R7
+
+#define doubleTweak(B0, POLY, T0, T1) \
+	\ // Multiply by 2
+	VSPLTB $0, B0, T0 \
+	VSPLTISB $7, T1 \
+	VSRAB T0, T1, T0 \
+	VAND POLY, T0, T0 \ // T0 for reduction
+	\
+	VSPLTISB $1, T1 \
+	VSL B0, T1, T1 \
+	VXOR T0, T1, B0
+
+#define gbDoubleTweak(B0, POLY, T0, T1) \
+	VSPLTB $15, B0, T0 \
+	VSPLTISB $7, T1 \
+	VSLB T0, T1, T0 \
+	VSRAB T0, T1, T0 \
+	VAND POLY, T0, T0 \ // T0 for reduction
+	VSPLTISB $1, T1 \
+	VSR B0, T1, B0 \
+	VXOR T0, B0, B0
+
+// func mul2(tweak *[blockSize]byte, isGB bool)
+TEXT ·mul2(SB), NOSPLIT, $0-9
+	MOVD tweak+0(FP), R3
+	MOVBZ isGB+8(FP), R4
+
+	MOVD $xtsMask<>(SB), CPOOL
+
+	CMP R4, $1
+	BEQ gb_alg
+
+	// Load polynomial for reduction
+	MOVD $16, R5
+	LXVD2X (CPOOL)(R5), POLY
+
+	// Load tweak
+	LXVD2X (R3), B0
+#ifdef GOARCH_ppc64le
+	XXPERMDI B0, B0, $2, B0
+	doubleTweak(B0, POLY, T0, T1)
+	XXPERMDI B0, B0, $2, B0
+#else
+	LXVD2X (CPOOL), ESPERM
+
+	VPERM B0, B0, ESPERM, B0
+	doubleTweak(B0, POLY, T0, T1)
+	VPERM B0, B0, ESPERM, B0
+#endif
+	STXVD2X B0, (R3)
+
+	RET
+
+gb_alg:
+	// Load polynomial for reduction
+	MOVD $32, R5
+	LXVD2X (CPOOL)(R5), POLY
+
+	// Load tweak
+	LXVD2X (R3), B0
+#ifdef GOARCH_ppc64le
+	LVX (CPOOL), ESPERM
+	VPERM B0, B0, ESPERM, B0
+	gbDoubleTweak(B0, POLY, T0, T1)
+	VPERM B0, B0, ESPERM, B0
+#else
+	gbDoubleTweak(B0, POLY, T0, T1)
+#endif
+	STXVD2X B0, (R3)
+	RET
+
+// func doubleTweaks(tweak *[blockSize]byte, tweaks []byte, isGB bool)
+TEXT ·doubleTweaks(SB), NOSPLIT, $0-33
+	MOVD tweak+0(FP), R3
+	MOVD tweaks+8(FP), R4
+	MOVD tweaks_len+16(FP), R5
+	MOVBZ isGB+32(FP), R6
+
+	MOVD $xtsMask<>(SB), CPOOL
+
+	// Load tweak
+	LXVD2X (R3), B0
+
+	CMP R6, $1
+	BEQ gb_alg
+
+	SRD $4, R5
+	MOVD R5, CTR
+
+#ifndef GOARCH_ppc64le
+	LXVD2X (CPOOL), ESPERM
+#endif
+	// Load polynomial for reduction
+	MOVD $16, R5
+	LXVD2X (CPOOL)(R5), POLY
+
+loop:
+	STXVD2X B0, (R4)
+	ADD $16, R4
+
+#ifdef GOARCH_ppc64le
+	XXPERMDI B0, B0, $2, B0
+	doubleTweak(B0, POLY, T0, T1)
+	XXPERMDI B0, B0, $2, B0
+#else
+	VPERM B0, B0, ESPERM, B0
+	doubleTweak(B0, POLY, T0, T1)
+	VPERM B0, B0, ESPERM, B0
+#endif
+
+	BDNZ loop
+
+	STXVD2X B0, (R3)
+	RET
+
+gb_alg:
+	SRD $4, R5
+	MOVD R5, CTR
+
+	// Load polynomial for reduction
+	MOVD $32, R5
+	LXVD2X (CPOOL)(R5), POLY
+
+#ifdef GOARCH_ppc64le
+	LVX (CPOOL), ESPERM
+#endif
+
+gbLoop:
+	STXVD2X B0, (R4)
+	ADD $16, R4
+
+#ifdef GOARCH_ppc64le
+	VPERM B0, B0, ESPERM, B0
+	gbDoubleTweak(B0, POLY, T0, T1)
+	VPERM B0, B0, ESPERM, B0
+#else
+	gbDoubleTweak(B0, POLY, T0, T1)
+#endif
+
+	BDNZ gbLoop
+
+	STXVD2X B0, (R3)
+	RET
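
Note (reviewer sketch, not part of the patch): the assembly above performs the
same GF(2^128) tweak doubling as the pure-Go fallback kept in
cipher/xts_generic.go. Below is a standalone Go model of both variants for
reference; the names mul2Generic and doubleTweaksGeneric are illustrative and
not identifiers from this repo.

package main

import "fmt"

const blockSize = 16

// mul2Generic doubles the 128-bit tweak in place over GF(2^128).
//
// Standard XTS treats the tweak as a little-endian integer: shift left one
// bit and fold the carry back in with 0x87 (the xtsMask constant at offset
// 0x10, used by doubleTweak). The GB/T 17964 variant (the isGB path) treats
// it as a big-endian integer: shift right one bit and reduce with 0xe1 in
// the top byte (the constant at offset 0x20, used by gbDoubleTweak).
func mul2Generic(tweak *[blockSize]byte, isGB bool) {
	if !isGB {
		carry := tweak[15] >> 7 // bit shifted out of the most significant byte
		for i := blockSize - 1; i > 0; i-- {
			tweak[i] = tweak[i]<<1 | tweak[i-1]>>7
		}
		tweak[0] <<= 1
		if carry != 0 {
			tweak[0] ^= 0x87
		}
		return
	}
	carry := tweak[15] & 1 // bit shifted out of the least significant byte
	for i := blockSize - 1; i > 0; i-- {
		tweak[i] = tweak[i]>>1 | tweak[i-1]<<7
	}
	tweak[0] >>= 1
	if carry != 0 {
		tweak[0] ^= 0xe1
	}
}

// doubleTweaksGeneric mirrors the doubleTweaks loop: each 16-byte slot of
// tweaks receives the current tweak, which is then doubled for the next
// slot; the tweak for the next call is left in *tweak, just as the
// assembly's final STXVD2X stores B0 back through R3.
func doubleTweaksGeneric(tweak *[blockSize]byte, tweaks []byte, isGB bool) {
	for i := 0; i+blockSize <= len(tweaks); i += blockSize {
		copy(tweaks[i:], tweak[:])
		mul2Generic(tweak, isGB)
	}
}

func main() {
	var tweak [blockSize]byte
	tweak[15] = 0x80 // MSB set, so standard doubling triggers the 0x87 reduction
	mul2Generic(&tweak, false)
	fmt.Printf("%x\n", tweak) // 87000000000000000000000000000000
}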