sm3: ppc64x, improve code readability

This commit is contained in:
Sun Yimin 2024-09-09 08:46:02 +08:00 committed by GitHub
parent b606bd773f
commit 79e33e2970
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -29,7 +29,12 @@
// For instruction emulation // For instruction emulation
#define ESPERMW V31 // Endian swapping permute into BE #define ESPERMW V31 // Endian swapping permute into BE
#define TEMP R19 #define R_x08 R15
#define R_x10 R16
#define R_x18 R17
#define R_x20 R18
#define R_x30 R19
#define R_TMP R19
DATA ·mask+0x00(SB)/8, $0x0b0a09080f0e0d0c // byte swap per word DATA ·mask+0x00(SB)/8, $0x0b0a09080f0e0d0c // byte swap per word
DATA ·mask+0x08(SB)/8, $0x0302010007060504 DATA ·mask+0x08(SB)/8, $0x0302010007060504
@ -70,8 +75,8 @@ GLOBL ·mask(SB), RODATA, $80
VRLW s, TMP5, r VRLW s, TMP5, r
#define loadWordByIndex(W, i) \ #define loadWordByIndex(W, i) \
MOVD $(16*(i)), TEMP \ MOVD $(16*(i)), R_TMP \
LXVW4X (TEMP)(statePtr), W LXVW4X (R_TMP)(statePtr), W
// one word is 16 bytes // one word is 16 bytes
#define prepare4Words \ #define prepare4Words \
@ -81,14 +86,11 @@ GLOBL ·mask(SB), RODATA, $80
PPC64X_LXVW4X(srcPtr4, srcPtrPtr, V19); \ PPC64X_LXVW4X(srcPtr4, srcPtrPtr, V19); \
TRANSPOSE_MATRIX(V16, V17, V18, V19); \ TRANSPOSE_MATRIX(V16, V17, V18, V19); \
ADD $16, srcPtrPtr; \ ADD $16, srcPtrPtr; \
STXVW4X V16, (wordPtr); \ STXVW4X V16, (R0)(wordPtr); \
ADD $16, wordPtr; \ STXVW4X V17, (R_x10)(wordPtr); \
STXVW4X V17, (wordPtr); \ STXVW4X V18, (R_x20)(wordPtr); \
ADD $16, wordPtr; \ STXVW4X V19, (R_x30)(wordPtr); \
STXVW4X V18, (wordPtr); \ ADD $64, wordPtr
ADD $16, wordPtr; \
STXVW4X V19, (wordPtr); \
ADD $16, wordPtr
#define TRANSPOSE_MATRIX(T0, T1, T2, T3) \ #define TRANSPOSE_MATRIX(T0, T1, T2, T3) \
VPERM T0, T1, M0, TMP0; \ VPERM T0, T1, M0, TMP0; \
@ -105,8 +107,8 @@ GLOBL ·mask(SB), RODATA, $80
// Solution 2: 2 constant T, rotate shift left one bit every time // Solution 2: 2 constant T, rotate shift left one bit every time
// Solution 1's performance is better but it uses more memory. // Solution 1's performance is better but it uses more memory.
#define LOAD_T(index, const, target) \ #define LOAD_T(index, const, target) \
MOVD $const, TEMP \ MOVD $const, R_TMP \
MTVSRWZ TEMP, target \ MTVSRWZ R_TMP, target \
VSPLTW $1, target, target VSPLTW $1, target, target
#define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \ #define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \
@ -200,11 +202,11 @@ GLOBL ·mask(SB), RODATA, $80
// Used general purpose registers R4-R12, R15-R19. // Used general purpose registers R4-R12, R15-R19.
// blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int) // blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int)
TEXT ·blockMultBy4(SB), NOSPLIT, $0 TEXT ·blockMultBy4(SB), NOSPLIT, $0
MOVD $8, R15 MOVD $8, R_x08
MOVD $16, R16 MOVD $16, R_x10
MOVD $24, R17 MOVD $24, R_x18
MOVD $32, R18 MOVD $32, R_x20
MOVD $48, TEMP MOVD $48, R_x30
#ifdef NEEDS_PERMW #ifdef NEEDS_PERMW
MOVD $·mask(SB), R4 MOVD $·mask(SB), R4
LVX (R4), ESPERMW LVX (R4), ESPERMW
@ -213,9 +215,9 @@ TEXT ·blockMultBy4(SB), NOSPLIT, $0
MOVD $·mask+0x10(SB), R4 MOVD $·mask+0x10(SB), R4
#endif #endif
LXVD2X (R0)(R4), M0 LXVD2X (R0)(R4), M0
LXVD2X (R16)(R4), M1 LXVD2X (R_x10)(R4), M1
LXVD2X (R18)(R4), M2 LXVD2X (R_x20)(R4), M2
LXVD2X (TEMP)(R4), M3 LXVD2X (R_x30)(R4), M3
#define digPtr R11 #define digPtr R11
#define srcPtrPtr R5 #define srcPtrPtr R5
#define statePtr R4 #define statePtr R4
@ -231,26 +233,26 @@ TEXT ·blockMultBy4(SB), NOSPLIT, $0
MOVD blocks+24(FP), blockCount MOVD blocks+24(FP), blockCount
// load state // load state
MOVD (R0)(digPtr), TEMP MOVD (R0)(digPtr), R_TMP
LXVW4X (R0)(TEMP), a LXVW4X (R0)(R_TMP), a
LXVW4X (R16)(TEMP), e LXVW4X (R_x10)(R_TMP), e
MOVD (R15)(digPtr), TEMP MOVD (R_x08)(digPtr), R_TMP
LXVW4X (R0)(TEMP), b LXVW4X (R0)(R_TMP), b
LXVW4X (R16)(TEMP), f LXVW4X (R_x10)(R_TMP), f
MOVD (R16)(digPtr), TEMP MOVD (R_x10)(digPtr), R_TMP
LXVW4X (R0)(TEMP), c LXVW4X (R0)(R_TMP), c
LXVW4X (R16)(TEMP), g LXVW4X (R_x10)(R_TMP), g
MOVD (R17)(digPtr), TEMP MOVD (R_x18)(digPtr), R_TMP
LXVW4X (R0)(TEMP), d LXVW4X (R0)(R_TMP), d
LXVW4X (R16)(TEMP), h LXVW4X (R_x10)(R_TMP), h
TRANSPOSE_MATRIX(a, b, c, d) TRANSPOSE_MATRIX(a, b, c, d)
TRANSPOSE_MATRIX(e, f, g, h) TRANSPOSE_MATRIX(e, f, g, h)
MOVD (R0)(srcPtrPtr), srcPtr1 MOVD (R0)(srcPtrPtr), srcPtr1
MOVD (R15)(srcPtrPtr), srcPtr2 MOVD (R_x08)(srcPtrPtr), srcPtr2
MOVD (R16)(srcPtrPtr), srcPtr3 MOVD (R_x10)(srcPtrPtr), srcPtr3
MOVD (R17)(srcPtrPtr), srcPtr4 MOVD (R_x18)(srcPtrPtr), srcPtr4
MOVD $0, srcPtrPtr MOVD $0, srcPtrPtr
MOVD blockCount, CTR MOVD blockCount, CTR
@ -358,18 +360,18 @@ end:
TRANSPOSE_MATRIX(e, f, g, h) TRANSPOSE_MATRIX(e, f, g, h)
// save state // save state
MOVD (R0)(digPtr), TEMP MOVD (R0)(digPtr), R_TMP
STXVW4X a, (R0)(TEMP) STXVW4X a, (R0)(R_TMP)
STXVW4X e, (R16)(TEMP) STXVW4X e, (R_x10)(R_TMP)
MOVD (R15)(digPtr), TEMP MOVD (R_x08)(digPtr), R_TMP
STXVW4X b, (R0)(TEMP) STXVW4X b, (R0)(R_TMP)
STXVW4X f, (R16)(TEMP) STXVW4X f, (R_x10)(R_TMP)
MOVD (R16)(digPtr), TEMP MOVD (R_x10)(digPtr), R_TMP
STXVW4X c, (R0)(TEMP) STXVW4X c, (R0)(R_TMP)
STXVW4X g, (R16)(TEMP) STXVW4X g, (R_x10)(R_TMP)
MOVD (R17)(digPtr), TEMP MOVD (R_x18)(digPtr), R_TMP
STXVW4X d, (R0)(TEMP) STXVW4X d, (R0)(R_TMP)
STXVW4X h, (R16)(TEMP) STXVW4X h, (R_x10)(R_TMP)
RET RET