// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

#include "textflag.h"

#define REVERSE_WORDS V19
#define M1L V20
#define M1H V21
#define M2L V22
#define M2H V23
#define V_FOUR V24
#define M0 V25
#define M1 V26
#define M2 V27
#define M3 V28
#define NIBBLE_MASK V29
#define INVERSE_SHIFT_ROWS V30
// For instruction emulation
#define ESPERMW V31 // Endian swapping permute into BE
#define TMP0 V10
#define TMP1 V11
#define TMP2 V12
#define TMP3 V13
#define IV V18
#include "aesni_macros_ppc64x.s"
#ifdef NEEDS_PERMW
// Swap the words of all eight 128-bit state registers back into
// big-endian byte order on little-endian targets.
#define REVERSE32LE_8BLOCKS \
    VPERM V0, V0, ESPERMW, V0; \
    VPERM V1, V1, ESPERMW, V1; \
    VPERM V2, V2, ESPERMW, V2; \
    VPERM V3, V3, ESPERMW, V3; \
    VPERM V4, V4, ESPERMW, V4; \
    VPERM V5, V5, ESPERMW, V5; \
    VPERM V6, V6, ESPERMW, V6; \
    VPERM V7, V7, ESPERMW, V7
#else
#define REVERSE32LE_8BLOCKS
#endif
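
// decryptBlocksChain performs SM4-CBC decryption over the whole src
// buffer. CBC decryption computes P[i] = SM4Decrypt(C[i]) XOR C[i-1],
// with C[-1] = IV, so the routine walks src from the tail toward the
// head: every chaining block is still intact in src when it is needed,
// which keeps the code correct for in-place operation (dst == src). The
// last ciphertext block is captured up front and written back as the
// next IV on exit.
//
// A rough Go-level sketch of the same computation (a reference only;
// decryptBlock and subtle.XORBytes are illustrative stand-ins, not this
// routine's actual callees, and n = len(src)/16):
//
//    for i := n - 1; i >= 0; i-- {
//        prev := iv
//        if i > 0 {
//            prev = src[(i-1)*16 : i*16]
//        }
//        decryptBlock(xk, dst[i*16:], src[i*16:]) // one SM4 block decryption
//        subtle.XORBytes(dst[i*16:(i+1)*16], dst[i*16:(i+1)*16], prev)
//    }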
// func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
#define dstPtr R3 // dst slice base
#define srcPtr R4 // src slice base
#define rk R5     // expanded round-key pointer
#define srcLen R6 // length of src in bytes

    // prepare/load constants
    VSPLTISW $4, V_FOUR
#ifdef NEEDS_PERMW
    MOVD $·rcon(SB), R4
    LVX (R4), ESPERMW
#endif
    MOVD $·rcon+0x10(SB), R4
    LOAD_CONSTS(R4, R3)

    // load IV
    MOVD iv+56(FP), R7
    PPC64X_LXVW4X(R7, R0, IV)

    // load arguments
    MOVD xk+0(FP), rk
    MOVD dst+8(FP), dstPtr
    MOVD src+32(FP), srcPtr
    MOVD src_len+40(FP), srcLen

    // byte offsets of blocks 1..7 within a group
    MOVD $16, R7
    MOVD $32, R8
    MOVD $48, R9
    MOVD $64, R10
    MOVD $80, R11
    MOVD $96, R12
    MOVD $112, R14

    ADD srcPtr, srcLen, R15
    ADD $-16, R15, R15
    LXVD2X (R15)(R0), V14 // load last 16 bytes of src into V14
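    // V14 (the last ciphertext block) is the next IV. It is captured
    // before any output is written so that in-place decryption
    // (dst == src) cannot clobber it; it is stored back through the iv
    // pointer at done.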

    CMP srcLen, $144 // 9 blocks
    BLT lessThan9blocks

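// Main loop: decrypt eight blocks per iteration, walking backwards from
// the end of src. It runs only while at least nine blocks remain, so the
// block preceding the current group is always available in src as the
// chaining value.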
    PCALIGN $16
loop8blocks:
    ADD $-128, srcLen
    ADD srcPtr, srcLen, R15 // R15: start of the current 8-block group
    ADD $-16, R15, R16      // R16: 16 bytes before the group (chaining blocks)
    ADD dstPtr, srcLen, R17 // R17: matching destination
    PPC64X_LXVW4X(R15, R0, V0)
    PPC64X_LXVW4X(R15, R7, V1)
    PPC64X_LXVW4X(R15, R8, V2)
    PPC64X_LXVW4X(R15, R9, V3)
    PPC64X_LXVW4X(R15, R10, V4)
    PPC64X_LXVW4X(R15, R11, V5)
    PPC64X_LXVW4X(R15, R12, V6)
    PPC64X_LXVW4X(R15, R14, V7)
    PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
    PRE_TRANSPOSE_MATRIX(V4, V5, V6, V7)
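    // 32 SM4 rounds as eight 4-round macro invocations; each LXVW4X
    // loads the next four 32-bit round keys into V8.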
    LXVW4X (rk)(R0), V8
    PROCESS_8BLOCKS_4ROUND
    LXVW4X (rk)(R7), V8
    PROCESS_8BLOCKS_4ROUND
    LXVW4X (rk)(R8), V8
    PROCESS_8BLOCKS_4ROUND
    LXVW4X (rk)(R9), V8
    PROCESS_8BLOCKS_4ROUND
    LXVW4X (rk)(R10), V8
    PROCESS_8BLOCKS_4ROUND
    LXVW4X (rk)(R11), V8
    PROCESS_8BLOCKS_4ROUND
    LXVW4X (rk)(R12), V8
    PROCESS_8BLOCKS_4ROUND
    LXVW4X (rk)(R14), V8
    PROCESS_8BLOCKS_4ROUND
    TRANSPOSE_MATRIX(V0, V1, V2, V3)
    TRANSPOSE_MATRIX(V4, V5, V6, V7)
    REVERSE32LE_8BLOCKS // for ppc64le
    // XOR each decrypted block with the preceding ciphertext block.
    LXVW4X (R16)(R0), TMP0
    LXVW4X (R16)(R7), TMP1
    LXVW4X (R16)(R8), TMP2
    LXVW4X (R16)(R9), TMP3
    VXOR V0, TMP0, V0
    VXOR V1, TMP1, V1
    VXOR V2, TMP2, V2
    VXOR V3, TMP3, V3
    LXVW4X (R16)(R10), TMP0
    LXVW4X (R16)(R11), TMP1
    LXVW4X (R16)(R12), TMP2
    LXVW4X (R16)(R14), TMP3
    VXOR V4, TMP0, V4
    VXOR V5, TMP1, V5
    VXOR V6, TMP2, V6
    VXOR V7, TMP3, V7
    STXVW4X V0, (R17)(R0)
    STXVW4X V1, (R17)(R7)
    STXVW4X V2, (R17)(R8)
    STXVW4X V3, (R17)(R9)
    STXVW4X V4, (R17)(R10)
    STXVW4X V5, (R17)(R11)
    STXVW4X V6, (R17)(R12)
    STXVW4X V7, (R17)(R14)
    CMP srcLen, $144 // 9 blocks
    BGE loop8blocks

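// Tail: fewer than nine blocks remain. If more than four blocks are
// left, peel off the top four first (the preceding ciphertext block
// still exists in src), then dispatch on the exact residue of 1..4
// blocks below.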
lessThan9blocks:
    CMP srcLen, $64
    BLE ble4blocks

    // 5..8 blocks remain: decrypt the top four.
    ADD $-64, srcLen
    ADD srcPtr, srcLen, R15 // R15: start of the 4-block group
    ADD $-16, R15, R16      // R16: preceding ciphertext block
    ADD dstPtr, srcLen, R17
    PPC64X_LXVW4X(R15, R0, V0)
    PPC64X_LXVW4X(R15, R7, V1)
    PPC64X_LXVW4X(R15, R8, V2)
    PPC64X_LXVW4X(R15, R9, V3)
    // Save ciphertext copies for chaining before they are overwritten.
    VOR V0, V0, V5
    VOR V1, V1, V6
    VOR V2, V2, V7
    PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
    LXVW4X (rk)(R0), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R7), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R8), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R9), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R10), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R11), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R12), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R14), V8
    PROCESS_4BLOCKS_4ROUND
    TRANSPOSE_MATRIX(V0, V1, V2, V3)
    PPC64X_LXVW4X(R16, R0, V4) // preceding ciphertext block for block 0
    VXOR V0, V4, V0
    VXOR V1, V5, V1
    VXOR V2, V6, V2
    VXOR V3, V7, V3
    PPC64X_STXVW4X(V0, R17, R0)
    PPC64X_STXVW4X(V1, R17, R7)
    PPC64X_STXVW4X(V2, R17, R8)
    PPC64X_STXVW4X(V3, R17, R9)

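// At most four blocks remain, and they are the first blocks of the
// buffer, so block 0 chains from the caller's IV. Dispatch on the exact
// remaining length; falling through all three compares means exactly
// four blocks.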
ble4blocks:
    CMPU srcLen, $48, CR1
    CMPU srcLen, $32, CR2
    CMPU srcLen, $16, CR3
    BEQ CR1, eq3blocks
    BEQ CR2, eq2blocks
    BEQ CR3, eq1block
    // Exactly 4 blocks.
    PPC64X_LXVW4X(srcPtr, R0, V0)
    PPC64X_LXVW4X(srcPtr, R7, V1)
    PPC64X_LXVW4X(srcPtr, R8, V2)
    PPC64X_LXVW4X(srcPtr, R9, V3)
    // Save ciphertext copies of blocks 0..2 for chaining.
    VOR V0, V0, V4
    VOR V1, V1, V5
    VOR V2, V2, V6
    PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
    LXVW4X (rk)(R0), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R7), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R8), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R9), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R10), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R11), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R12), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R14), V8
    PROCESS_4BLOCKS_4ROUND
    TRANSPOSE_MATRIX(V0, V1, V2, V3)
    VXOR V0, IV, V0
    VXOR V1, V4, V1
    VXOR V2, V5, V2
    VXOR V3, V6, V3
    PPC64X_STXVW4X(V0, dstPtr, R0)
    PPC64X_STXVW4X(V1, dstPtr, R7)
    PPC64X_STXVW4X(V2, dstPtr, R8)
    PPC64X_STXVW4X(V3, dstPtr, R9)
    BR done

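// Exactly 3 blocks: same pattern; V4/V5 keep the C0/C1 chaining copies
// and block 0 chains from the IV. The fourth block slot (V3) carries
// don't-care data and is never stored.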
eq3blocks:
    PPC64X_LXVW4X(srcPtr, R0, V0)
    PPC64X_LXVW4X(srcPtr, R7, V1)
    PPC64X_LXVW4X(srcPtr, R8, V2)
    VOR V0, V0, V4
    VOR V1, V1, V5
    PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
    LXVW4X (rk)(R0), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R7), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R8), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R9), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R10), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R11), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R12), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R14), V8
    PROCESS_4BLOCKS_4ROUND
    TRANSPOSE_MATRIX(V0, V1, V2, V3)
    VXOR V0, IV, V0
    VXOR V1, V4, V1
    VXOR V2, V5, V2
    PPC64X_STXVW4X(V0, dstPtr, R0)
    PPC64X_STXVW4X(V1, dstPtr, R7)
    PPC64X_STXVW4X(V2, dstPtr, R8)
    BR done

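// Exactly 2 blocks: V4 keeps the C0 chaining copy; block 0 chains from
// the IV.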
eq2blocks:
    PPC64X_LXVW4X(srcPtr, R0, V0)
    PPC64X_LXVW4X(srcPtr, R7, V1)
    VOR V0, V0, V4
    PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
    LXVW4X (rk)(R0), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R7), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R8), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R9), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R10), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R11), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R12), V8
    PROCESS_4BLOCKS_4ROUND
    LXVW4X (rk)(R14), V8
    PROCESS_4BLOCKS_4ROUND
    TRANSPOSE_MATRIX(V0, V1, V2, V3)
    VXOR V0, IV, V0
    VXOR V1, V4, V1
    PPC64X_STXVW4X(V0, dstPtr, R0)
    PPC64X_STXVW4X(V1, dstPtr, R7)
    BR done

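// Exactly 1 block: fan the block across V0..V3 with 4-byte rotations so
// the single-block round macro can operate on the four word-rotated
// copies, then undo the rotations and XOR with the IV.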
eq1block:
    PPC64X_LXVW4X(srcPtr, R0, V0)
    // Rotated copies of the block for the single-block round macro.
    VSLDOI $4, V0, V0, V1
    VSLDOI $4, V1, V1, V2
    VSLDOI $4, V2, V2, V3
    LXVW4X (rk)(R0), V8
    PROCESS_SINGLEBLOCK_4ROUND
    LXVW4X (rk)(R7), V8
    PROCESS_SINGLEBLOCK_4ROUND
    LXVW4X (rk)(R8), V8
    PROCESS_SINGLEBLOCK_4ROUND
    LXVW4X (rk)(R9), V8
    PROCESS_SINGLEBLOCK_4ROUND
    LXVW4X (rk)(R10), V8
    PROCESS_SINGLEBLOCK_4ROUND
    LXVW4X (rk)(R11), V8
    PROCESS_SINGLEBLOCK_4ROUND
    LXVW4X (rk)(R12), V8
    PROCESS_SINGLEBLOCK_4ROUND
    LXVW4X (rk)(R14), V8
    PROCESS_SINGLEBLOCK_4ROUND
    // Undo the rotations and recombine the result.
    VSLDOI $4, V3, V3, V3
    VSLDOI $4, V3, V2, V2
    VSLDOI $4, V2, V1, V1
    VSLDOI $4, V1, V0, V0
    VXOR V0, IV, V0
    PPC64X_STXVW4X(V0, dstPtr, R0)

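// Store the saved last ciphertext block back through the iv pointer so
// the caller can chain subsequent calls.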
done:
    MOVD iv+56(FP), R7
    STXVD2X V14, (R7)(R0)
    RET