// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

#include "textflag.h"

#define REVERSE_WORDS V19
#define M1L V20
#define M1H V21
#define M2L V22
#define M2H V23
#define V_FOUR V24
#define NIBBLE_MASK V29
// For instruction emulation
#define ESPERMW V31 // Endian swapping permute into BE

#define TMP0 V10
#define TMP1 V11
#define TMP2 V12
#define TMP3 V13
#define IV V18

#include "aesni_macros_ppc64x.s"

#ifdef NEEDS_PERMW
#define REVERSE32LE_8BLOCKS \
	VPERM V0, V0, ESPERMW, V0; \
	VPERM V1, V1, ESPERMW, V1; \
	VPERM V2, V2, ESPERMW, V2; \
	VPERM V3, V3, ESPERMW, V3; \
	VPERM V4, V4, ESPERMW, V4; \
	VPERM V5, V5, ESPERMW, V5; \
	VPERM V6, V6, ESPERMW, V6; \
	VPERM V7, V7, ESPERMW, V7
#else
#define REVERSE32LE_8BLOCKS
#endif

// func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
#define dstPtr R3
#define srcPtr R4
#define rk R5
#define srcLen R6
	// prepare/load constants
	VSPLTISB $4, V_FOUR;
#ifdef NEEDS_PERMW
	MOVD	$·rcon(SB), R4
	LVX	(R4), ESPERMW
#endif
	MOVD	$·rcon+0x10(SB), R4
	LOAD_CONSTS(R4, R3)

	// Load IV
	MOVD	iv+56(FP), R7
	PPC64X_LXVW4X(R7, R0, IV)

	MOVD	xk+0(FP), rk
	MOVD	dst+8(FP), dstPtr
	MOVD	src+32(FP), srcPtr
	MOVD	src_len+40(FP), srcLen

	// Byte offsets used as index registers for block loads/stores.
	MOVD	$16, R7
	MOVD	$32, R8
	MOVD	$48, R9
	MOVD	$64, R10
	MOVD	$80, R11
	MOVD	$96, R12
	MOVD	$112, R14

	ADD	srcPtr, srcLen, R15
	ADD	$-16, R15, R15
	LXVD2X	(R15)(R0), V14 // Load last 16 bytes of src into V14

	CMP	srcLen, $144 // 9 blocks
	BLT	lessThan9blocks

	PCALIGN	$16
// Decrypt 8 blocks per iteration, working from the tail of the buffer
// toward the head so that dst may overlap src.
loop8blocks:
	ADD	$-128, srcLen
	ADD	srcPtr, srcLen, R15
	ADD	$-16, R15, R16
	ADD	dstPtr, srcLen, R17
	PPC64X_LXVW4X(R15, R0, V0)
	PPC64X_LXVW4X(R15, R7, V1)
	PPC64X_LXVW4X(R15, R8, V2)
	PPC64X_LXVW4X(R15, R9, V3)
	PPC64X_LXVW4X(R15, R10, V4)
	PPC64X_LXVW4X(R15, R11, V5)
	PPC64X_LXVW4X(R15, R12, V6)
	PPC64X_LXVW4X(R15, R14, V7)
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
	PRE_TRANSPOSE_MATRIX(V4, V5, V6, V7)

	LXVW4X	(rk)(R0), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X	(rk)(R7), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X	(rk)(R8), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X	(rk)(R9), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X	(rk)(R10), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X	(rk)(R11), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X	(rk)(R12), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X	(rk)(R14), V8
	PROCESS_8BLOCKS_4ROUND

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	TRANSPOSE_MATRIX(V4, V5, V6, V7)
	REVERSE32LE_8BLOCKS // for ppc64le

	// XOR each decrypted block with the preceding ciphertext block (at R16).
	LXVW4X	(R16)(R0), TMP0
	LXVW4X	(R16)(R7), TMP1
	LXVW4X	(R16)(R8), TMP2
	LXVW4X	(R16)(R9), TMP3
	VXOR	V0, TMP0, V0
	VXOR	V1, TMP1, V1
	VXOR	V2, TMP2, V2
	VXOR	V3, TMP3, V3
	LXVW4X	(R16)(R10), TMP0
	LXVW4X	(R16)(R11), TMP1
	LXVW4X	(R16)(R12), TMP2
	LXVW4X	(R16)(R14), TMP3
	VXOR	V4, TMP0, V4
	VXOR	V5, TMP1, V5
	VXOR	V6, TMP2, V6
	VXOR	V7, TMP3, V7
	STXVW4X	V0, (R17)(R0)
	STXVW4X	V1, (R17)(R7)
	STXVW4X	V2, (R17)(R8)
	STXVW4X	V3, (R17)(R9)
	STXVW4X	V4, (R17)(R10)
	STXVW4X	V5, (R17)(R11)
	STXVW4X	V6, (R17)(R12)
	STXVW4X	V7, (R17)(R14)

	CMP	srcLen, $144 // 9 blocks
	BGE	loop8blocks

lessThan9blocks:
	CMP	srcLen, $64
	BLE	ble4blocks

	// 5-8 blocks remain: decrypt the last 4 of them first.
	ADD	$-64, srcLen
	ADD	srcPtr, srcLen, R15
	ADD	$-16, R15, R16
	ADD	dstPtr, srcLen, R17
	PPC64X_LXVW4X(R15, R0, V0)
	PPC64X_LXVW4X(R15, R7, V1)
	PPC64X_LXVW4X(R15, R8, V2)
	PPC64X_LXVW4X(R15, R9, V3)
	// Save ciphertext copies needed for the CBC XOR after decryption.
	VOR	V0, V0, V5
	VOR	V1, V1, V6
	VOR	V2, V2, V7
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)

	LXVW4X	(rk)(R0), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R7), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R8), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R9), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R10), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R11), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R12), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R14), V8
	PROCESS_4BLOCKS_4ROUND

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	PPC64X_LXVW4X(R16, R0, V4)
	VXOR	V0, V4, V0
	VXOR	V1, V5, V1
	VXOR	V2, V6, V2
	VXOR	V3, V7, V3
	PPC64X_STXVW4X(V0, R17, R0)
	PPC64X_STXVW4X(V1, R17, R7)
	PPC64X_STXVW4X(V2, R17, R8)
	PPC64X_STXVW4X(V3, R17, R9)

ble4blocks:
	// 1-4 blocks remain; the first of them is chained with IV.
	CMPU	srcLen, $48, CR1
	CMPU	srcLen, $32, CR2
	CMPU	srcLen, $16, CR3
	BEQ	CR1, eq3blocks
	BEQ	CR2, eq2blocks
	BEQ	CR3, eq1block

	// 4 blocks
	PPC64X_LXVW4X(srcPtr, R0, V0)
	PPC64X_LXVW4X(srcPtr, R7, V1)
	PPC64X_LXVW4X(srcPtr, R8, V2)
	PPC64X_LXVW4X(srcPtr, R9, V3)
	VOR	V0, V0, V4
	VOR	V1, V1, V5
	VOR	V2, V2, V6
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)

	LXVW4X	(rk)(R0), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R7), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R8), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R9), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R10), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R11), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R12), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R14), V8
	PROCESS_4BLOCKS_4ROUND

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	VXOR	V0, IV, V0
	VXOR	V1, V4, V1
	VXOR	V2, V5, V2
	VXOR	V3, V6, V3
	PPC64X_STXVW4X(V0, dstPtr, R0)
	PPC64X_STXVW4X(V1, dstPtr, R7)
	PPC64X_STXVW4X(V2, dstPtr, R8)
	PPC64X_STXVW4X(V3, dstPtr, R9)
	BR	done

eq3blocks:
	PPC64X_LXVW4X(srcPtr, R0, V0)
	PPC64X_LXVW4X(srcPtr, R7, V1)
	PPC64X_LXVW4X(srcPtr, R8, V2)
	VOR	V0, V0, V4
	VOR	V1, V1, V5
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)

	LXVW4X	(rk)(R0), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R7), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R8), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R9), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R10), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R11), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R12), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R14), V8
	PROCESS_4BLOCKS_4ROUND

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	VXOR	V0, IV, V0
	VXOR	V1, V4, V1
	VXOR	V2, V5, V2
	PPC64X_STXVW4X(V0, dstPtr, R0)
	PPC64X_STXVW4X(V1, dstPtr, R7)
	PPC64X_STXVW4X(V2, dstPtr, R8)
	BR	done

eq2blocks:
	PPC64X_LXVW4X(srcPtr, R0, V0)
	PPC64X_LXVW4X(srcPtr, R7, V1)
	VOR	V0, V0, V4
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)

	LXVW4X	(rk)(R0), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R7), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R8), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R9), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R10), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R11), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R12), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R14), V8
	PROCESS_4BLOCKS_4ROUND

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	VXOR	V0, IV, V0
	VXOR	V1, V4, V1
	PPC64X_STXVW4X(V0, dstPtr, R0)
	PPC64X_STXVW4X(V1, dstPtr, R7)
	BR	done

eq1block:
	PPC64X_LXVW4X(srcPtr, R0, V0)
	VSLDOI	$4, V0, V0, V1
	VSLDOI	$4, V1, V1, V2
	VSLDOI	$4, V2, V2, V3

	LXVW4X	(rk)(R0), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X	(rk)(R7), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X	(rk)(R8), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X	(rk)(R9), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X	(rk)(R10), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X	(rk)(R11), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X	(rk)(R12), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X	(rk)(R14), V8
	PROCESS_SINGLEBLOCK_4ROUND

	VSLDOI	$4, V3, V3, V3
	VSLDOI	$4, V3, V2, V2
	VSLDOI	$4, V2, V1, V1
	VSLDOI	$4, V1, V0, V0
	VXOR	V0, IV, V0
	PPC64X_STXVW4X(V0, dstPtr, R0)

done:
	// Write the saved last ciphertext block back through the iv pointer.
	MOVD	iv+56(FP), R7
	STXVD2X	V14, (R7)(R0)
	RET