gmsm/sm4/asm_ppc64x.s

// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

#include "textflag.h"

#define REVERSE_WORDS V19
#define M1L V20
#define M1H V21
#define M2L V22
#define M2H V23
#define V_FOUR V24
#define M0 V25
#define M1 V26
#define M2 V27
#define M3 V28
#define NIBBLE_MASK V29
#define INVERSE_SHIFT_ROWS V30
// For instruction emulation
#define ESPERMW V31 // Endian swapping permute into BE
#define TMP0 V10
#define TMP1 V11
#define TMP2 V12
#define TMP3 V13
#include "aesni_macros_ppc64x.s"
#define SM4_TAO_L2(x, y, z) \
SM4_SBOX(x, y, z); \
; \ //#################### 4 parallel L2 linear transforms ##################//
VSPLTISW $13, z; \
VRLW x, z, y; \ // y = x <<< 13
VXOR x, y, x; \
VSPLTISW $10, z; \
VRLW y, z, y; \ // y = x <<< 23
VXOR x, y, x
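
// SM4_EXPANDKEY_ROUND produces one round key,
//   rk[i] = t0 ^ T'(t1 ^ t2 ^ t3 ^ CK[i]),
// leaves it in t0 and shifts the new key word into the collector
// register `target`, so four consecutive rounds fill a full vector.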
#define SM4_EXPANDKEY_ROUND(CK, x, y, z, t0, t1, t2, t3, target) \
VXOR t1, CK, x; \
VXOR t2, x, x; \
VXOR t3, x, x; \
SM4_TAO_L2(x, y, z); \
VXOR x, t0, t0; \
VSLDOI $4, target, t0, target

// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
// prepare/load constants
VSPLTISW $4, V_FOUR;
#ifdef NEEDS_PERMW
MOVD $·rcon(SB), R4
LVX (R4), ESPERMW
#endif
MOVD $·rcon+0x50(SB), R4
LXVD2X (R4)(R0), REVERSE_WORDS
MOVD $16, R3
LXVD2X (R4)(R3), NIBBLE_MASK
MOVD $32, R3
LXVD2X (R4)(R3), INVERSE_SHIFT_ROWS
MOVD $48, R3
LXVD2X (R4)(R3), M1L
MOVD $64, R3
LXVD2X (R4)(R3), M1H
MOVD $80, R3
LXVD2X (R4)(R3), M2L
MOVD $96, R3
LXVD2X (R4)(R3), M2H
MOVD key+0(FP), R3
MOVD ck+8(FP), R4
MOVD enc+16(FP), R5
MOVD dec+24(FP), R6
ADD $112, R6
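// the decryption schedule is the encryption schedule in reverse:
// 32 round keys * 4 bytes = 128 bytes, so start R6 at the last
// 16-byte group (offset 112) and walk it backwards in the loop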
// load fk
MOVD $·fk+0(SB), R7
LXVW4X (R7), V4
// load key
PPC64X_LXVW4X(R3, R0, V0)
// xor key with fk
VXOR V0, V4, V0
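// per the SM4 key schedule: (K0,K1,K2,K3) = (MK0^FK0, MK1^FK1, MK2^FK2, MK3^FK3)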
VSLDOI $4, V0, V0, V1
VSLDOI $4, V1, V1, V2
VSLDOI $4, V2, V2, V3
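// V1..V3 are word-rotated copies of V0, so the first word of
// V0..V3 holds K0..K3 respectively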
// prepare counter
MOVD $8, R7
MOVD R7, CTR
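// 8 iterations x 4 round keys per iteration = 32 round keys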
ksLoop:
LXVW4X (R4), V4
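// V4 holds the next four CK constants; the VSLDOI between rounds
// rotates the following CK word into the first lane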
SM4_EXPANDKEY_ROUND(V4, V7, V8, V9, V0, V1, V2, V3, V5)
VSLDOI $4, V4, V4, V4
SM4_EXPANDKEY_ROUND(V4, V7, V8, V9, V1, V2, V3, V0, V5)
VSLDOI $4, V4, V4, V4
SM4_EXPANDKEY_ROUND(V4, V7, V8, V9, V2, V3, V0, V1, V5)
VSLDOI $4, V4, V4, V4
SM4_EXPANDKEY_ROUND(V4, V7, V8, V9, V3, V0, V1, V2, V5)
STXVW4X V5, (R5)
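// the REVERSE_WORDS permute reverses the word order of the group
// before it is stored into the backward-walking dec schedule, so
// dec ends up holding the round keys in reverse order for decryption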
VPERM V5, V5, REVERSE_WORDS, V5
STXVW4X V5, (R6)
ADD $16, R5
ADD $16, R4
ADD $-16, R6
BDNZ ksLoop
RET

// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
// prepare/load constants
VSPLTISW $4, V_FOUR;
#ifdef NEEDS_PERMW
MOVD $·rcon(SB), R4
LVX (R4), ESPERMW
#endif
MOVD $·rcon+0x50(SB), R4
LXVD2X (R4)(R0), REVERSE_WORDS
MOVD $16, R3
LXVD2X (R4)(R3), NIBBLE_MASK
MOVD $32, R3
LXVD2X (R4)(R3), INVERSE_SHIFT_ROWS
MOVD $48, R3
LXVD2X (R4)(R3), M1L
MOVD $64, R3
LXVD2X (R4)(R3), M1H
MOVD $80, R3
LXVD2X (R4)(R3), M2L
MOVD $96, R3
LXVD2X (R4)(R3), M2H
MOVD xk+0(FP), R3
MOVD dst+8(FP), R4
MOVD src+16(FP), R5
// load src
PPC64X_LXVW4X(R5, R0, V0)
VSLDOI $4, V0, V0, V1
VSLDOI $4, V1, V1, V2
VSLDOI $4, V2, V2, V3
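// V1..V3 are word-rotated copies of the block, so the first word of
// V0..V3 carries state words X0..X3, the layout that
// PROCESS_SINGLEBLOCK_4ROUND (from aesni_macros_ppc64x.s) operates on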
// prepare counter
MOVD $8, R7
MOVD R7, CTR
encryptBlockLoop:
// load xk
LXVW4X (R3), V8
PROCESS_SINGLEBLOCK_4ROUND
ADD $16, R3
BDNZ encryptBlockLoop
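// merge the four state words back into V0 in the order required by
// SM4's final reverse transform: the ciphertext is (X35, X34, X33, X32)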
VSLDOI $4, V3, V3, V3
VSLDOI $4, V3, V2, V2
VSLDOI $4, V2, V1, V1
VSLDOI $4, V1, V0, V0
PPC64X_STXVW4X(V0, R4, R0)
RET

// func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
// prepare/load constants
VSPLTISW $4, V_FOUR;
#ifdef NEEDS_PERMW
MOVD $·rcon(SB), R4
LVX (R4), ESPERMW
#endif
LOAD_CONSTS(R4, R3)
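// LOAD_CONSTS (aesni_macros_ppc64x.s) presumably loads the same
// rcon-derived constants (REVERSE_WORDS, NIBBLE_MASK, INVERSE_SHIFT_ROWS,
// M1L/M1H, M2L/M2H) that the other entry points load explicitly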
MOVD xk+0(FP), R3
MOVD dst+8(FP), R4
MOVD src+32(FP), R5
MOVD src_len+40(FP), R6
CMP R6, $128
BEQ enc8blocks
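// exactly 128 bytes (8 blocks) takes the 8-block path; otherwise
// fall through and process 4 blocks (64 bytes)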
enc4blocks:
MOVD $16, R7
MOVD $32, R8
MOVD $48, R9
PPC64X_LXVW4X(R5, R0, V0)
PPC64X_LXVW4X(R5, R7, V1)
PPC64X_LXVW4X(R5, R8, V2)
PPC64X_LXVW4X(R5, R9, V3)
PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
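// after the transpose, Vi holds word i of each of the four blocks,
// so every vector instruction advances all four blocks at once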
// prepare counter
MOVD $8, R7
MOVD R7, CTR
enc4blocksLoop:
// load xk
LXVW4X (R3), V8
PROCESS_4BLOCKS_4ROUND
ADD $16, R3
BDNZ enc4blocksLoop
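// transpose back to one-block-per-register layout before storing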
TRANSPOSE_MATRIX(V0, V1, V2, V3)
PPC64X_STXVW4X(V0, R4, R0)
PPC64X_STXVW4X(V1, R4, R7)
PPC64X_STXVW4X(V2, R4, R8)
PPC64X_STXVW4X(V3, R4, R9)
RET
enc8blocks:
MOVD $16, R7
MOVD $32, R8
MOVD $48, R9
MOVD $64, R10
MOVD $80, R11
MOVD $96, R12
MOVD $112, R14
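// byte offsets of the 8 input/output blocks; R13 is skipped because
// it is reserved (TLS pointer) on ppc64, so R14 takes the last offset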
PPC64X_LXVW4X(R5, R0, V0)
PPC64X_LXVW4X(R5, R7, V1)
PPC64X_LXVW4X(R5, R8, V2)
PPC64X_LXVW4X(R5, R9, V3)
PPC64X_LXVW4X(R5, R10, V4)
PPC64X_LXVW4X(R5, R11, V5)
PPC64X_LXVW4X(R5, R12, V6)
PPC64X_LXVW4X(R5, R14, V7)
PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
PRE_TRANSPOSE_MATRIX(V4, V5, V6, V7)
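// the eight blocks are handled as two transposed groups of four;
// PROCESS_8BLOCKS_4ROUND presumably advances both groups
// (V0-V3 and V4-V7) each iteration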
// prepare counter
MOVD $8, R7
MOVD R7, CTR
enc8blocksLoop:
LXVW4X (R3), V8
PROCESS_8BLOCKS_4ROUND
ADD $16, R3
BDNZ enc8blocksLoop
TRANSPOSE_MATRIX(V0, V1, V2, V3)
TRANSPOSE_MATRIX(V4, V5, V6, V7)
PPC64X_STXVW4X(V0, R4, R0)
PPC64X_STXVW4X(V1, R4, R7)
PPC64X_STXVW4X(V2, R4, R8)
PPC64X_STXVW4X(V3, R4, R9)
PPC64X_STXVW4X(V4, R4, R10)
PPC64X_STXVW4X(V5, R4, R11)
PPC64X_STXVW4X(V6, R4, R12)
PPC64X_STXVW4X(V7, R4, R14)
RET