sm3: ppc64x block, remove useless code and add comments #245

This commit is contained in:
Sun Yimin 2024-09-09 17:28:42 +08:00 committed by GitHub
parent 86d7648b6f
commit a564207436
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -7,6 +7,11 @@
#include "textflag.h" #include "textflag.h"
#include "sm3_const_asm.s" #include "sm3_const_asm.s"
// We can also use MFVSRWZ to extract the first word from vector register
// and then use VSLDOI to shift the vector register, thus we can avoid the usage of memory (buffer).
// But due to we have no real phisical machine to test the performance difference,
// we'll keep current implementation first.
#ifdef GOARCH_ppc64le #ifdef GOARCH_ppc64le
#define NEEDS_PERMW #define NEEDS_PERMW
@ -77,17 +82,10 @@ GLOBL ·flip_mask(SB), RODATA, $16
XOR tmp, out; \ XOR tmp, out; \
XOR tt2, out XOR tt2, out
// Load w from buffer
#define LOAD_WORD1(idx, dst) \
MOVWZ $(idx*4)(BUFFER), dst
// Load w' from buffer
#define LOAD_WORD2(idx, dst) \
MOVWZ $(idx*4 + 16)(BUFFER), dst
// For rounds [0 - 16) // For rounds [0 - 16)
// addr1 for w, addr2 for w'
#define DO_ROUND_N_0(addr1, addr2, const, a, b, c, d, e, f, g, h) \ #define DO_ROUND_N_0(addr1, addr2, const, a, b, c, d, e, f, g, h) \
; \ // ############################# RND N + 0 ############################// ; \
SS12(a, e, const, y2, y0); \ SS12(a, e, const, y2, y0); \
MOVWZ addr1, y1; \ MOVWZ addr1, y1; \
ADD y1, y2; \ // y2 = SS1 + W ADD y1, y2; \ // y2 = SS1 + W
@ -110,8 +108,9 @@ GLOBL ·flip_mask(SB), RODATA, $16
P0(y2, y0, d) P0(y2, y0, d)
// For rounds [16 - 64) // For rounds [16 - 64)
// addr1 for w, addr2 for w'
#define DO_ROUND_N_1(addr1, addr2, const, a, b, c, d, e, f, g, h) \ #define DO_ROUND_N_1(addr1, addr2, const, a, b, c, d, e, f, g, h) \
; \ // ############################# RND N + 0 ############################// ; \
SS12(a, e, const, y2, y0); \ SS12(a, e, const, y2, y0); \
MOVWZ addr1, y1; \ MOVWZ addr1, y1; \
ADD y1, y2; \ // y2 = SS1 + W ADD y1, y2; \ // y2 = SS1 + W
@ -150,24 +149,24 @@ GLOBL ·flip_mask(SB), RODATA, $16
PROLD(XTMP0, XTMP1, 7); \ // XTMP1 = W[-13] rol 7 PROLD(XTMP0, XTMP1, 7); \ // XTMP1 = W[-13] rol 7
VSLDOI $8, XWORD2, XWORD3, XTMP0; \ // XTMP0 = W[-6] = {w10, w11, w12, w13} VSLDOI $8, XWORD2, XWORD3, XTMP0; \ // XTMP0 = W[-6] = {w10, w11, w12, w13}
VXOR XTMP0, XTMP1, XTMP0; \ // XTMP0 = W[-6] xor (W[-13] rol 7) VXOR XTMP0, XTMP1, XTMP0; \ // XTMP0 = W[-6] xor (W[-13] rol 7)
; \ ; \ // Prepare P1 parameters
VSLDOI $12, XWORD1, XWORD2, XTMP1; \ // XTMP1 = W[-9] = {w7, w8, w9, w10} VSLDOI $12, XWORD1, XWORD2, XTMP1; \ // XTMP1 = W[-9] = {w7, w8, w9, w10}
VXOR XTMP1, XWORD0, XTMP1; \ // XTMP1 = W[-9] xor W[-16] VXOR XTMP1, XWORD0, XTMP1; \ // XTMP1 = W[-9] xor W[-16]
VSLDOI $4, XWORD3, XWORD2, XTMP3; \ // XTMP3 = W[-3] = {w13, w14, w15, w8} VSLDOI $4, XWORD3, XWORD2, XTMP3; \ // XTMP3 = W[-3] = {w13, w14, w15, w8}
PROLD(XTMP3, XTMP2, 15); \ // XTMP2 = W[-3] rol 15 PROLD(XTMP3, XTMP2, 15); \ // XTMP2 = W[-3] rol 15
VXOR XTMP1, XTMP2, XTMP2; \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {ABxx} VXOR XTMP1, XTMP2, XTMP2; \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {ABxx}
; \ ; \ // P1
PROLD(XTMP2, XTMP4, 15); \ // XTMP4 = = XTMP2 rol 15 {ABxx} PROLD(XTMP2, XTMP4, 15); \ // XTMP4 = = XTMP2 rol 15 {ABxx}
PROLD(XTMP4, XTMP3, 8); \ // XTMP3 = XTMP2 rol 23 {ABxx} PROLD(XTMP4, XTMP3, 8); \ // XTMP3 = XTMP2 rol 23 {ABxx}
VXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {ABxx}) VXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {ABxx})
VXOR XTMP4, XTMP3, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {ABxx}) XOR (XTMP2 rol 23 {ABxx}) VXOR XTMP4, XTMP3, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {ABxx}) XOR (XTMP2 rol 23 {ABxx})
; \ // First 2 words message schedule result ; \ // First 2 words message schedule result
VXOR XTMP4, XTMP0, XTMP2; \ // XTMP2 = {w[0], w[1], ..., ...} VXOR XTMP4, XTMP0, XTMP2; \ // XTMP2 = {w[0], w[1], ..., ...}
; \ ; \ // Prepare P1 parameters
VSLDOI $4, XWORD3, XTMP2, XTMP3; \ // XTMP3 = W[-3] = {w13, w14, w15, w0} VSLDOI $4, XWORD3, XTMP2, XTMP3; \ // XTMP3 = W[-3] = {w13, w14, w15, w0}
PROLD(XTMP3, XTMP4, 15); \ // XTMP4 = W[-3] rol 15 PROLD(XTMP3, XTMP4, 15); \ // XTMP4 = W[-3] rol 15
VXOR XTMP1, XTMP4, XTMP4; \ // XTMP4 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {ABCD} VXOR XTMP1, XTMP4, XTMP4; \ // XTMP4 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {ABCD}
; \ ; \ // P1
PROLD(XTMP4, XTMP3, 15); \ // XTMP3 = = XTMP4 rol 15 {ABCD} PROLD(XTMP4, XTMP3, 15); \ // XTMP3 = = XTMP4 rol 15 {ABCD}
PROLD(XTMP3, XTMP1, 8); \ // XTMP1 = XTMP4 rol 23 {ABCD} PROLD(XTMP3, XTMP1, 8); \ // XTMP1 = XTMP4 rol 23 {ABCD}
VXOR XTMP4, XTMP3, XTMP3; \ // XTMP3 = XTMP4 XOR (XTMP4 rol 15 {ABCD}) VXOR XTMP4, XTMP3, XTMP3; \ // XTMP3 = XTMP4 XOR (XTMP4 rol 15 {ABCD})
@ -181,7 +180,6 @@ TEXT ·blockASM(SB), NOSPLIT, $0
#ifdef NEEDS_PERMW #ifdef NEEDS_PERMW
MOVD $·flip_mask(SB), TEMP MOVD $·flip_mask(SB), TEMP
LVX (TEMP), ESPERMW LVX (TEMP), ESPERMW
ADD $0x10, TEMP
#endif #endif
MOVD dig+0(FP), CTX MOVD dig+0(FP), CTX