sm3: ppc64x, s390x mult4 add comments

This commit is contained in:
Sun Yimin 2024-09-06 17:20:01 +08:00 committed by GitHub
parent 2d3220793c
commit 0799942ef9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 41 additions and 36 deletions

View File

@ -29,6 +29,8 @@
// For instruction emulation // For instruction emulation
#define ESPERMW V31 // Endian swapping permute into BE #define ESPERMW V31 // Endian swapping permute into BE
#define TEMP R19
DATA ·mask+0x00(SB)/8, $0x0b0a09080f0e0d0c // byte swap per word DATA ·mask+0x00(SB)/8, $0x0b0a09080f0e0d0c // byte swap per word
DATA ·mask+0x08(SB)/8, $0x0302010007060504 DATA ·mask+0x08(SB)/8, $0x0302010007060504
DATA ·mask+0x10(SB)/8, $0x0001020310111213 // Permute for transpose matrix DATA ·mask+0x10(SB)/8, $0x0001020310111213 // Permute for transpose matrix
@ -59,13 +61,17 @@ GLOBL ·mask(SB), RODATA, $80
#endif // defined(GOARCH_ppc64le) #endif // defined(GOARCH_ppc64le)
// r = s <<< n // r = s <<< n
// Due to VSPLTISW's limitation, n MUST be in the range [0, 15].
// If n > 15, PROLD has to be applied multiple times (e.g. <<< 10 followed by <<< 9 for <<< 19).
// VSPLTISW takes a 5-bit signed immediate value as its operand.
// No vector rotate-left instruction that takes an immediate rotate count was found.
#define PROLD(s, r, n) \ #define PROLD(s, r, n) \
VSPLTISW $n, TMP5 \ VSPLTISW $n, TMP5 \
VRLW s, TMP5, r VRLW s, TMP5, r
#define loadWordByIndex(W, i) \ #define loadWordByIndex(W, i) \
MOVD $(16*(i)), R19 \ MOVD $(16*(i)), TEMP \
LXVW4X (R19)(statePtr), W LXVW4X (TEMP)(statePtr), W
// one word is 16 bytes // one word is 16 bytes
#define prepare4Words \ #define prepare4Words \
@ -95,12 +101,12 @@ GLOBL ·mask(SB), RODATA, $80
VPERM TMP2, TMP3, M3, T3 VPERM TMP2, TMP3, M3, T3
// Load constant T. How to simplify it? // Load constant T. How to simplify it?
// Solution 1: big constant table // Solution 1: big constant table like sha256block_ppc64x.s
// Solution 2: 2 constant T, rotate shift left one bit every time // Solution 2: 2 constant T, rotate shift left one bit every time
// Which solution's performance is better? // Solution 1's performance is better but it uses more memory.
#define LOAD_T(index, const, target) \ #define LOAD_T(index, const, target) \
MOVD $const, R19 \ MOVD $const, TEMP \
MTVSRWZ R19, target \ MTVSRWZ TEMP, target \
VSPLTW $1, target, target VSPLTW $1, target, target
#define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \ #define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \
@ -128,7 +134,7 @@ GLOBL ·mask(SB), RODATA, $80
PROLD(TMP4, b, 9) \ // b = b <<< 9 PROLD(TMP4, b, 9) \ // b = b <<< 9
VOR TMP1, TMP1, h \ // h = TT1 VOR TMP1, TMP1, h \ // h = TT1
PROLD(f, TMP4, 10) \ PROLD(f, TMP4, 10) \
PROLD(TMP4, f, 9) \ // f = f <<< 19 PROLD(TMP4, f, 9) \ // f = f <<< 19, Here we had to ROTL twice: ROTL 10, then ROTL 9
PROLD(TMP3, TMP4, 9) \ // TMP4 = TT2 <<< 9 PROLD(TMP3, TMP4, 9) \ // TMP4 = TT2 <<< 9
PROLD(TMP4, TMP0, 8) \ // TMP0 = TT2 <<< 17 PROLD(TMP4, TMP0, 8) \ // TMP0 = TT2 <<< 17
VXOR TMP3, TMP4, TMP4 \ // TMP4 = TT2 XOR (TT2 <<< 9) VXOR TMP3, TMP4, TMP4 \ // TMP4 = TT2 XOR (TT2 <<< 9)
@ -185,7 +191,7 @@ GLOBL ·mask(SB), RODATA, $80
PROLD(TMP1, b, 9) \ // b = b <<< 9 PROLD(TMP1, b, 9) \ // b = b <<< 9
VOR TMP4, TMP4, h \ // h = TT1 VOR TMP4, TMP4, h \ // h = TT1
PROLD(f, TMP1, 10) \ PROLD(f, TMP1, 10) \
PROLD(TMP1, f, 9) \ // f = f <<< 19 PROLD(TMP1, f, 9) \ // f = f <<< 19, Here we had to ROTL twice: ROTL 10, then ROTL 9
PROLD(TMP3, TMP1, 9) \ // TMP1 = TT2 <<< 9 PROLD(TMP3, TMP1, 9) \ // TMP1 = TT2 <<< 9
PROLD(TMP1, TMP0, 8) \ // TMP0 = TT2 <<< 17 PROLD(TMP1, TMP0, 8) \ // TMP0 = TT2 <<< 17
VXOR TMP3, TMP1, TMP1 \ // TMP1 = TT2 XOR (TT2 <<< 9) VXOR TMP3, TMP1, TMP1 \ // TMP1 = TT2 XOR (TT2 <<< 9)
@ -198,7 +204,7 @@ TEXT ·blockMultBy4(SB), NOSPLIT, $0
MOVD $16, R16 MOVD $16, R16
MOVD $24, R17 MOVD $24, R17
MOVD $32, R18 MOVD $32, R18
MOVD $48, R19 MOVD $48, TEMP
#ifdef NEEDS_PERMW #ifdef NEEDS_PERMW
MOVD $·mask(SB), R4 MOVD $·mask(SB), R4
LVX (R4), ESPERMW LVX (R4), ESPERMW
@ -209,7 +215,7 @@ TEXT ·blockMultBy4(SB), NOSPLIT, $0
LXVD2X (R0)(R4), M0 LXVD2X (R0)(R4), M0
LXVD2X (R16)(R4), M1 LXVD2X (R16)(R4), M1
LXVD2X (R18)(R4), M2 LXVD2X (R18)(R4), M2
LXVD2X (R19)(R4), M3 LXVD2X (TEMP)(R4), M3
#define digPtr R11 #define digPtr R11
#define srcPtrPtr R5 #define srcPtrPtr R5
#define statePtr R4 #define statePtr R4
@ -225,18 +231,18 @@ TEXT ·blockMultBy4(SB), NOSPLIT, $0
MOVD blocks+24(FP), blockCount MOVD blocks+24(FP), blockCount
// load state // load state
MOVD (R0)(digPtr), R19 MOVD (R0)(digPtr), TEMP
LXVW4X (R0)(R19), a LXVW4X (R0)(TEMP), a
LXVW4X (R16)(R19), e LXVW4X (R16)(TEMP), e
MOVD (R15)(digPtr), R19 MOVD (R15)(digPtr), TEMP
LXVW4X (R0)(R19), b LXVW4X (R0)(TEMP), b
LXVW4X (R16)(R19), f LXVW4X (R16)(TEMP), f
MOVD (R16)(digPtr), R19 MOVD (R16)(digPtr), TEMP
LXVW4X (R0)(R19), c LXVW4X (R0)(TEMP), c
LXVW4X (R16)(R19), g LXVW4X (R16)(TEMP), g
MOVD (R17)(digPtr), R19 MOVD (R17)(digPtr), TEMP
LXVW4X (R0)(R19), d LXVW4X (R0)(TEMP), d
LXVW4X (R16)(R19), h LXVW4X (R16)(TEMP), h
TRANSPOSE_MATRIX(a, b, c, d) TRANSPOSE_MATRIX(a, b, c, d)
TRANSPOSE_MATRIX(e, f, g, h) TRANSPOSE_MATRIX(e, f, g, h)
@ -352,18 +358,18 @@ end:
TRANSPOSE_MATRIX(e, f, g, h) TRANSPOSE_MATRIX(e, f, g, h)
// save state // save state
MOVD (R0)(digPtr), R19 MOVD (R0)(digPtr), TEMP
STXVW4X a, (R0)(R19) STXVW4X a, (R0)(TEMP)
STXVW4X e, (R16)(R19) STXVW4X e, (R16)(TEMP)
MOVD (R15)(digPtr), R19 MOVD (R15)(digPtr), TEMP
STXVW4X b, (R0)(R19) STXVW4X b, (R0)(TEMP)
STXVW4X f, (R16)(R19) STXVW4X f, (R16)(TEMP)
MOVD (R16)(digPtr), R19 MOVD (R16)(digPtr), TEMP
STXVW4X c, (R0)(R19) STXVW4X c, (R0)(TEMP)
STXVW4X g, (R16)(R19) STXVW4X g, (R16)(TEMP)
MOVD (R17)(digPtr), R19 MOVD (R17)(digPtr), TEMP
STXVW4X d, (R0)(R19) STXVW4X d, (R0)(TEMP)
STXVW4X h, (R16)(R19) STXVW4X h, (R16)(TEMP)
RET RET

View File

@ -5,7 +5,6 @@
//go:build !purego //go:build !purego
#include "textflag.h" #include "textflag.h"
#include "go_asm.h"
DATA mask<>+0x00(SB)/8, $0x0001020310111213 DATA mask<>+0x00(SB)/8, $0x0001020310111213
DATA mask<>+0x08(SB)/8, $0x0405060714151617 DATA mask<>+0x08(SB)/8, $0x0405060714151617
@ -15,7 +14,7 @@ DATA mask<>+0x20(SB)/8, $0x0001020304050607
DATA mask<>+0x28(SB)/8, $0x1011121314151617 DATA mask<>+0x28(SB)/8, $0x1011121314151617
DATA mask<>+0x30(SB)/8, $0x08090a0b0c0d0e0f DATA mask<>+0x30(SB)/8, $0x08090a0b0c0d0e0f
DATA mask<>+0x38(SB)/8, $0x18191a1b1c1d1e1f DATA mask<>+0x38(SB)/8, $0x18191a1b1c1d1e1f
GLOBL mask<>(SB), 8, $64 GLOBL mask<>(SB), RODATA, $64
#define a V0 #define a V0
#define e V1 #define e V1