sm3: ppc64x change T handling method #267

This commit is contained in:
Sun Yimin 2024-11-13 17:41:09 +08:00 committed by GitHub
parent d263ac5889
commit 8f45f4efcb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -5,7 +5,6 @@
//go:build (ppc64 || ppc64le) && !purego
#include "textflag.h"
#include "sm3_const_asm.s"
#define a V0
#define e V1
@ -15,6 +14,10 @@
#define g V5
#define d V6
#define h V7
#define T0 V8
#define T1 V9
#define T2 V10
#define ONE V11
#define TMP0 V12
#define TMP1 V13
#define TMP2 V14
@ -25,6 +28,12 @@
// For instruction emulation
#define ESPERMW V31 // Endian swapping permute into BE
DATA t_const<>+0x00(SB)/8, $0x79cc451979cc4519
DATA t_const<>+0x08(SB)/8, $0x79cc451979cc4519
DATA t_const<>+0x10(SB)/8, $0x9d8a7a879d8a7a87
DATA t_const<>+0x18(SB)/8, $0x9d8a7a879d8a7a87
GLOBL t_const<>(SB), RODATA, $32
#define R_x08 R15
#define R_x10 R16
#define R_x18 R17
@ -83,20 +92,11 @@
XXPERMDI TMP2, TMP3, $0, T1; \
XXPERMDI TMP2, TMP3, $3, T3
// Load constant T, How to simlify it?
// Solution 1: big constant table like sha256block_ppc64x.s
// Solution 2: 2 constant T, rotate shift left one bit every time
// Solution 1's performance is better but it uses more memory.
#define LOAD_T(index, const, target) \
MOVD $const, R_TMP \
MTVSRWZ R_TMP, target \
VSPLTW $1, target, target
#define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \
#define ROUND_00_11(index, T, a, b, c, d, e, f, g, h) \
PROLD(a, TMP0, 12) \
VOR TMP0, TMP0, TMP1 \
LOAD_T(index, const, TMP2) \
VADDUWM TMP2, TMP0, TMP0 \
VADDUWM T, TMP0, TMP0 \
VRLW T, ONE, T \
VADDUWM e, TMP0, TMP0 \
PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1
VXOR TMP2, TMP1, TMP0 \ // TMP0 = SS2
@ -141,16 +141,16 @@
STXVW4X TMP1, (wordPtr) \
ADD $16, wordPtr \
#define ROUND_12_15(index, const, a, b, c, d, e, f, g, h) \
#define ROUND_12_15(index, T, a, b, c, d, e, f, g, h) \
MESSAGE_SCHEDULE(index) \
ROUND_00_11(index, const, a, b, c, d, e, f, g, h)
ROUND_00_11(index, T, a, b, c, d, e, f, g, h)
#define ROUND_16_63(index, const, a, b, c, d, e, f, g, h) \
#define ROUND_16_63(index, T, a, b, c, d, e, f, g, h) \
MESSAGE_SCHEDULE(index) \ // TMP1 is Wt+4 now, Pls do not use it
PROLD(a, TMP0, 12) \
VOR TMP0, TMP0, TMP4 \
LOAD_T(index, const, TMP2) \
VADDUWM TMP2, TMP0, TMP0 \
VADDUWM T, TMP0, TMP0 \
VRLW T, ONE, T \
VADDUWM e, TMP0, TMP0 \
PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1
VXOR TMP2, TMP4, TMP0 \ // TMP0 = SS2
@ -188,6 +188,10 @@ TEXT ·blockMultBy4(SB), NOSPLIT, $0
MOVD $·flip_mask(SB), R4
LVX (R4), ESPERMW
#endif
VSPLTISW $1, ONE
MOVD $t_const<>(SB), R4
LXVD2X (R0)(R4), T0
LXVD2X (R_x10)(R4), T1
#define digPtr R11
#define srcPtrPtr R5
#define statePtr R4
@ -247,72 +251,74 @@ loop:
prepare4Words
prepare4Words
ROUND_00_11(0, T0, a, b, c, d, e, f, g, h)
ROUND_00_11(1, T1, h, a, b, c, d, e, f, g)
XXLOR T0, T0, T2
ROUND_00_11(0, T2, a, b, c, d, e, f, g, h)
ROUND_00_11(1, T2, h, a, b, c, d, e, f, g)
ROUND_00_11(2, T2, g, h, a, b, c, d, e, f)
ROUND_00_11(3, T3, f, g, h, a, b, c, d, e)
ROUND_00_11(4, T4, e, f, g, h, a, b, c, d)
ROUND_00_11(5, T5, d, e, f, g, h, a, b, c)
ROUND_00_11(6, T6, c, d, e, f, g, h, a, b)
ROUND_00_11(7, T7, b, c, d, e, f, g, h, a)
ROUND_00_11(8, T8, a, b, c, d, e, f, g, h)
ROUND_00_11(9, T9, h, a, b, c, d, e, f, g)
ROUND_00_11(10, T10, g, h, a, b, c, d, e, f)
ROUND_00_11(11, T11, f, g, h, a, b, c, d, e)
ROUND_00_11(3, T2, f, g, h, a, b, c, d, e)
ROUND_00_11(4, T2, e, f, g, h, a, b, c, d)
ROUND_00_11(5, T2, d, e, f, g, h, a, b, c)
ROUND_00_11(6, T2, c, d, e, f, g, h, a, b)
ROUND_00_11(7, T2, b, c, d, e, f, g, h, a)
ROUND_00_11(8, T2, a, b, c, d, e, f, g, h)
ROUND_00_11(9, T2, h, a, b, c, d, e, f, g)
ROUND_00_11(10, T2, g, h, a, b, c, d, e, f)
ROUND_00_11(11, T2, f, g, h, a, b, c, d, e)
ROUND_12_15(12, T12, e, f, g, h, a, b, c, d)
ROUND_12_15(13, T13, d, e, f, g, h, a, b, c)
ROUND_12_15(14, T14, c, d, e, f, g, h, a, b)
ROUND_12_15(15, T15, b, c, d, e, f, g, h, a)
ROUND_12_15(12, T2, e, f, g, h, a, b, c, d)
ROUND_12_15(13, T2, d, e, f, g, h, a, b, c)
ROUND_12_15(14, T2, c, d, e, f, g, h, a, b)
ROUND_12_15(15, T2, b, c, d, e, f, g, h, a)
ROUND_16_63(16, T16, a, b, c, d, e, f, g, h)
ROUND_16_63(17, T17, h, a, b, c, d, e, f, g)
ROUND_16_63(18, T18, g, h, a, b, c, d, e, f)
ROUND_16_63(19, T19, f, g, h, a, b, c, d, e)
ROUND_16_63(20, T20, e, f, g, h, a, b, c, d)
ROUND_16_63(21, T21, d, e, f, g, h, a, b, c)
ROUND_16_63(22, T22, c, d, e, f, g, h, a, b)
ROUND_16_63(23, T23, b, c, d, e, f, g, h, a)
ROUND_16_63(24, T24, a, b, c, d, e, f, g, h)
ROUND_16_63(25, T25, h, a, b, c, d, e, f, g)
ROUND_16_63(26, T26, g, h, a, b, c, d, e, f)
ROUND_16_63(27, T27, f, g, h, a, b, c, d, e)
ROUND_16_63(28, T28, e, f, g, h, a, b, c, d)
ROUND_16_63(29, T29, d, e, f, g, h, a, b, c)
ROUND_16_63(30, T30, c, d, e, f, g, h, a, b)
ROUND_16_63(31, T31, b, c, d, e, f, g, h, a)
ROUND_16_63(32, T32, a, b, c, d, e, f, g, h)
ROUND_16_63(33, T33, h, a, b, c, d, e, f, g)
ROUND_16_63(34, T34, g, h, a, b, c, d, e, f)
ROUND_16_63(35, T35, f, g, h, a, b, c, d, e)
ROUND_16_63(36, T36, e, f, g, h, a, b, c, d)
ROUND_16_63(37, T37, d, e, f, g, h, a, b, c)
ROUND_16_63(38, T38, c, d, e, f, g, h, a, b)
ROUND_16_63(39, T39, b, c, d, e, f, g, h, a)
ROUND_16_63(40, T40, a, b, c, d, e, f, g, h)
ROUND_16_63(41, T41, h, a, b, c, d, e, f, g)
ROUND_16_63(42, T42, g, h, a, b, c, d, e, f)
ROUND_16_63(43, T43, f, g, h, a, b, c, d, e)
ROUND_16_63(44, T44, e, f, g, h, a, b, c, d)
ROUND_16_63(45, T45, d, e, f, g, h, a, b, c)
ROUND_16_63(46, T46, c, d, e, f, g, h, a, b)
ROUND_16_63(47, T47, b, c, d, e, f, g, h, a)
ROUND_16_63(48, T16, a, b, c, d, e, f, g, h)
ROUND_16_63(49, T17, h, a, b, c, d, e, f, g)
ROUND_16_63(50, T18, g, h, a, b, c, d, e, f)
ROUND_16_63(51, T19, f, g, h, a, b, c, d, e)
ROUND_16_63(52, T20, e, f, g, h, a, b, c, d)
ROUND_16_63(53, T21, d, e, f, g, h, a, b, c)
ROUND_16_63(54, T22, c, d, e, f, g, h, a, b)
ROUND_16_63(55, T23, b, c, d, e, f, g, h, a)
ROUND_16_63(56, T24, a, b, c, d, e, f, g, h)
ROUND_16_63(57, T25, h, a, b, c, d, e, f, g)
ROUND_16_63(58, T26, g, h, a, b, c, d, e, f)
ROUND_16_63(59, T27, f, g, h, a, b, c, d, e)
ROUND_16_63(60, T28, e, f, g, h, a, b, c, d)
ROUND_16_63(61, T29, d, e, f, g, h, a, b, c)
ROUND_16_63(62, T30, c, d, e, f, g, h, a, b)
ROUND_16_63(63, T31, b, c, d, e, f, g, h, a)
XXLOR T1, T1, T2
ROUND_16_63(16, T2, a, b, c, d, e, f, g, h)
ROUND_16_63(17, T2, h, a, b, c, d, e, f, g)
ROUND_16_63(18, T2, g, h, a, b, c, d, e, f)
ROUND_16_63(19, T2, f, g, h, a, b, c, d, e)
ROUND_16_63(20, T2, e, f, g, h, a, b, c, d)
ROUND_16_63(21, T2, d, e, f, g, h, a, b, c)
ROUND_16_63(22, T2, c, d, e, f, g, h, a, b)
ROUND_16_63(23, T2, b, c, d, e, f, g, h, a)
ROUND_16_63(24, T2, a, b, c, d, e, f, g, h)
ROUND_16_63(25, T2, h, a, b, c, d, e, f, g)
ROUND_16_63(26, T2, g, h, a, b, c, d, e, f)
ROUND_16_63(27, T2, f, g, h, a, b, c, d, e)
ROUND_16_63(28, T2, e, f, g, h, a, b, c, d)
ROUND_16_63(29, T2, d, e, f, g, h, a, b, c)
ROUND_16_63(30, T2, c, d, e, f, g, h, a, b)
ROUND_16_63(31, T2, b, c, d, e, f, g, h, a)
ROUND_16_63(32, T2, a, b, c, d, e, f, g, h)
ROUND_16_63(33, T2, h, a, b, c, d, e, f, g)
ROUND_16_63(34, T2, g, h, a, b, c, d, e, f)
ROUND_16_63(35, T2, f, g, h, a, b, c, d, e)
ROUND_16_63(36, T2, e, f, g, h, a, b, c, d)
ROUND_16_63(37, T2, d, e, f, g, h, a, b, c)
ROUND_16_63(38, T2, c, d, e, f, g, h, a, b)
ROUND_16_63(39, T2, b, c, d, e, f, g, h, a)
ROUND_16_63(40, T2, a, b, c, d, e, f, g, h)
ROUND_16_63(41, T2, h, a, b, c, d, e, f, g)
ROUND_16_63(42, T2, g, h, a, b, c, d, e, f)
ROUND_16_63(43, T2, f, g, h, a, b, c, d, e)
ROUND_16_63(44, T2, e, f, g, h, a, b, c, d)
ROUND_16_63(45, T2, d, e, f, g, h, a, b, c)
ROUND_16_63(46, T2, c, d, e, f, g, h, a, b)
ROUND_16_63(47, T2, b, c, d, e, f, g, h, a)
ROUND_16_63(48, T2, a, b, c, d, e, f, g, h)
ROUND_16_63(49, T2, h, a, b, c, d, e, f, g)
ROUND_16_63(50, T2, g, h, a, b, c, d, e, f)
ROUND_16_63(51, T2, f, g, h, a, b, c, d, e)
ROUND_16_63(52, T2, e, f, g, h, a, b, c, d)
ROUND_16_63(53, T2, d, e, f, g, h, a, b, c)
ROUND_16_63(54, T2, c, d, e, f, g, h, a, b)
ROUND_16_63(55, T2, b, c, d, e, f, g, h, a)
ROUND_16_63(56, T2, a, b, c, d, e, f, g, h)
ROUND_16_63(57, T2, h, a, b, c, d, e, f, g)
ROUND_16_63(58, T2, g, h, a, b, c, d, e, f)
ROUND_16_63(59, T2, f, g, h, a, b, c, d, e)
ROUND_16_63(60, T2, e, f, g, h, a, b, c, d)
ROUND_16_63(61, T2, d, e, f, g, h, a, b, c)
ROUND_16_63(62, T2, c, d, e, f, g, h, a, b)
ROUND_16_63(63, T2, b, c, d, e, f, g, h, a)
XXLXOR V0, VS24, V0
XXLXOR V1, VS25, V1