sm3: ppc64x predefine constants for ROTL

This commit is contained in:
Sun Yimin 2024-11-14 08:32:31 +08:00 committed by GitHub
parent 8f45f4efcb
commit dee08a50f3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 63 additions and 36 deletions

View File

@ -59,6 +59,9 @@
#define XTMP4 V8 #define XTMP4 V8
#define XFER V9 #define XFER V9
#define V_x07 V10
#define V_x08 V11
#define V_x0F V12
// For instruction emulation // For instruction emulation
#define ESPERMW V31 // Endian swapping permute into BE #define ESPERMW V31 // Endian swapping permute into BE
@ -143,29 +146,29 @@ GLOBL ·flip_mask(SB), RODATA, $16
#define MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3) \ #define MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3) \
VSLDOI $12, XWORD0, XWORD1, XTMP0; \ // XTMP0 = W[-13] = {w3, w4, w5, w6} VSLDOI $12, XWORD0, XWORD1, XTMP0; \ // XTMP0 = W[-13] = {w3, w4, w5, w6}
PROLD(XTMP0, XTMP1, 7); \ // XTMP1 = W[-13] rol 7 VRLW XTMP0, V_x07, XTMP1; \ // XTMP1 = W[-13] rol 7
VSLDOI $8, XWORD2, XWORD3, XTMP0; \ // XTMP0 = W[-6] = {w10, w11, w12, w13} VSLDOI $8, XWORD2, XWORD3, XTMP0; \ // XTMP0 = W[-6] = {w10, w11, w12, w13}
VXOR XTMP0, XTMP1, XTMP0; \ // XTMP0 = W[-6] xor (W[-13] rol 7) VXOR XTMP0, XTMP1, XTMP0; \ // XTMP0 = W[-6] xor (W[-13] rol 7)
; \ // Prepare P1 parameters ; \ // Prepare P1 parameters
VSLDOI $12, XWORD1, XWORD2, XTMP1; \ // XTMP1 = W[-9] = {w7, w8, w9, w10} VSLDOI $12, XWORD1, XWORD2, XTMP1; \ // XTMP1 = W[-9] = {w7, w8, w9, w10}
VXOR XTMP1, XWORD0, XTMP1; \ // XTMP1 = W[-9] xor W[-16] VXOR XTMP1, XWORD0, XTMP1; \ // XTMP1 = W[-9] xor W[-16]
VSLDOI $4, XWORD3, XWORD2, XTMP3; \ // XTMP3 = W[-3] = {w13, w14, w15, w8} VSLDOI $4, XWORD3, XWORD2, XTMP3; \ // XTMP3 = W[-3] = {w13, w14, w15, w8}
PROLD(XTMP3, XTMP2, 15); \ // XTMP2 = W[-3] rol 15 VRLW XTMP3, V_x0F, XTMP2; \ // XTMP2 = W[-3] rol 15
VXOR XTMP1, XTMP2, XTMP2; \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {ABxx} VXOR XTMP1, XTMP2, XTMP2; \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {ABxx}
; \ // P1 ; \ // P1
PROLD(XTMP2, XTMP4, 15); \ // XTMP4 = = XTMP2 rol 15 {ABxx} VRLW XTMP2, V_x0F, XTMP4; \ // XTMP4 = XTMP2 rol 15 {ABxx}
PROLD(XTMP4, XTMP3, 8); \ // XTMP3 = XTMP2 rol 23 {ABxx} VRLW XTMP4, V_x08, XTMP3; \ // XTMP3 = XTMP4 rol 8 {ABxx} = XTMP2 rol 23 {ABxx}
VXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {ABxx}) VXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {ABxx})
VXOR XTMP4, XTMP3, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {ABxx}) XOR (XTMP2 rol 23 {ABxx}) VXOR XTMP4, XTMP3, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {ABxx}) XOR (XTMP2 rol 23 {ABxx})
; \ // First 2 words message schedule result ; \ // First 2 words message schedule result
VXOR XTMP4, XTMP0, XTMP2; \ // XTMP2 = {w[0], w[1], ..., ...} VXOR XTMP4, XTMP0, XTMP2; \ // XTMP2 = {w[0], w[1], ..., ...}
; \ // Prepare P1 parameters ; \ // Prepare P1 parameters
VSLDOI $4, XWORD3, XTMP2, XTMP3; \ // XTMP3 = W[-3] = {w13, w14, w15, w0} VSLDOI $4, XWORD3, XTMP2, XTMP3; \ // XTMP3 = W[-3] = {w13, w14, w15, w0}
PROLD(XTMP3, XTMP4, 15); \ // XTMP4 = W[-3] rol 15 VRLW XTMP3, V_x0F, XTMP4; \ // XTMP4 = W[-3] rol 15
VXOR XTMP1, XTMP4, XTMP4; \ // XTMP4 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {ABCD} VXOR XTMP1, XTMP4, XTMP4; \ // XTMP4 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {ABCD}
; \ // P1 ; \ // P1
PROLD(XTMP4, XTMP3, 15); \ // XTMP3 = = XTMP4 rol 15 {ABCD} VRLW XTMP4, V_x0F, XTMP3; \ // XTMP3 = XTMP4 rol 15 {ABCD}
PROLD(XTMP3, XTMP1, 8); \ // XTMP1 = XTMP4 rol 23 {ABCD} VRLW XTMP3, V_x08, XTMP1; \ // XTMP1 = XTMP3 rol 8 {ABCD} = XTMP4 rol 23 {ABCD}
VXOR XTMP4, XTMP3, XTMP3; \ // XTMP3 = XTMP4 XOR (XTMP4 rol 15 {ABCD}) VXOR XTMP4, XTMP3, XTMP3; \ // XTMP3 = XTMP4 XOR (XTMP4 rol 15 {ABCD})
VXOR XTMP3, XTMP1, XTMP1; \ // XTMP1 = XTMP4 XOR (XTMP4 rol 15 {ABCD}) XOR (XTMP4 rol 23 {ABCD}) VXOR XTMP3, XTMP1, XTMP1; \ // XTMP1 = XTMP4 XOR (XTMP4 rol 15 {ABCD}) XOR (XTMP4 rol 23 {ABCD})
; \ // 4 words message schedule result ; \ // 4 words message schedule result
@ -202,6 +205,10 @@ TEXT ·blockASM(SB), NOSPLIT, $0
MOVWZ 24(CTX), g MOVWZ 24(CTX), g
MOVWZ 28(CTX), h MOVWZ 28(CTX), h
VSPLTISW $7, V_x07
VSPLTISW $8, V_x08
VSPLTISW $15, V_x0F
loop: loop:
PPC64X_LXVW4X(INP, R_x000, XWORD0) PPC64X_LXVW4X(INP, R_x000, XWORD0)
PPC64X_LXVW4X(INP, R_x010, XWORD1) PPC64X_LXVW4X(INP, R_x010, XWORD1)

View File

@ -14,10 +14,12 @@
#define g V5 #define g V5
#define d V6 #define d V6
#define h V7 #define h V7
#define T0 V8 #define T0 V8
#define T1 V9 #define T1 V9
#define T2 V10 #define T2 V10
#define ONE V11 #define ONE V11
#define TMP0 V12 #define TMP0 V12
#define TMP1 V13 #define TMP1 V13
#define TMP2 V14 #define TMP2 V14
@ -25,6 +27,18 @@
#define TMP4 V16 #define TMP4 V16
#define TMP5 V17 #define TMP5 V17
#define DATA0 V16
#define DATA1 V17
#define DATA2 V18
#define DATA3 V19
#define V_x07 V20
#define V_x08 V21
#define V_x09 V22
#define V_x0C V23
#define V_x13 V24
#define V_x0F V25
// For instruction emulation // For instruction emulation
#define ESPERMW V31 // Endian swapping permute into BE #define ESPERMW V31 // Endian swapping permute into BE
@ -67,19 +81,19 @@ GLOBL t_const<>(SB), RODATA, $32
// one word is 16 bytes // one word is 16 bytes
#define prepare4Words \ #define prepare4Words \
PPC64X_LXVW4X(srcPtr1, srcPtrPtr, V16); \ PPC64X_LXVW4X(srcPtr1, srcPtrPtr, DATA0); \
PPC64X_LXVW4X(srcPtr2, srcPtrPtr, V17); \ PPC64X_LXVW4X(srcPtr2, srcPtrPtr, DATA1); \
PPC64X_LXVW4X(srcPtr3, srcPtrPtr, V18); \ PPC64X_LXVW4X(srcPtr3, srcPtrPtr, DATA2); \
PPC64X_LXVW4X(srcPtr4, srcPtrPtr, V19); \ PPC64X_LXVW4X(srcPtr4, srcPtrPtr, DATA3); \
TRANSPOSE_MATRIX(V16, V17, V18, V19); \ TRANSPOSE_MATRIX(DATA0, DATA1, DATA2, DATA3);\
ADD $16, srcPtrPtr; \ ADD $16, srcPtrPtr; \
STXVW4X V16, (wordPtr); \ STXVW4X DATA0, (wordPtr); \
ADD $16, wordPtr; \ ADD $16, wordPtr; \
STXVW4X V17, (wordPtr); \ STXVW4X DATA1, (wordPtr); \
ADD $16, wordPtr; \ ADD $16, wordPtr; \
STXVW4X V18, (wordPtr); \ STXVW4X DATA2, (wordPtr); \
ADD $16, wordPtr; \ ADD $16, wordPtr; \
STXVW4X V19, (wordPtr); \ STXVW4X DATA3, (wordPtr); \
ADD $16, wordPtr ADD $16, wordPtr
#define TRANSPOSE_MATRIX(T0, T1, T2, T3) \ #define TRANSPOSE_MATRIX(T0, T1, T2, T3) \
@ -93,12 +107,12 @@ GLOBL t_const<>(SB), RODATA, $32
XXPERMDI TMP2, TMP3, $3, T3 XXPERMDI TMP2, TMP3, $3, T3
#define ROUND_00_11(index, T, a, b, c, d, e, f, g, h) \ #define ROUND_00_11(index, T, a, b, c, d, e, f, g, h) \
PROLD(a, TMP0, 12) \ VRLW a, V_x0C, TMP0 \
VOR TMP0, TMP0, TMP1 \ VOR TMP0, TMP0, TMP1 \
VADDUWM T, TMP0, TMP0 \ VADDUWM T, TMP0, TMP0 \
VRLW T, ONE, T \ VRLW T, ONE, T \
VADDUWM e, TMP0, TMP0 \ VADDUWM e, TMP0, TMP0 \
PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1 VRLW TMP0, V_x07, TMP2 \ // TMP2 = SS1
VXOR TMP2, TMP1, TMP0 \ // TMP0 = SS2 VXOR TMP2, TMP1, TMP0 \ // TMP0 = SS2
VXOR a, b, TMP1 \ VXOR a, b, TMP1 \
VXOR c, TMP1, TMP1 \ VXOR c, TMP1, TMP1 \
@ -114,27 +128,27 @@ GLOBL t_const<>(SB), RODATA, $32
VXOR g, TMP4, TMP4 \ VXOR g, TMP4, TMP4 \
VADDUWM TMP4, TMP3, TMP3 \ // TT2 = (e XOR f XOR g) + Wt + h + SS1 VADDUWM TMP4, TMP3, TMP3 \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
VOR b, b, TMP4 \ VOR b, b, TMP4 \
PROLD(TMP4, b, 9) \ // b = b <<< 9 VRLW TMP4, V_x09, b \ // b = b <<< 9
VOR TMP1, TMP1, h \ // h = TT1 VOR TMP1, TMP1, h \ // h = TT1
PROLD(f, f, 19) \ // f = f <<< 19 VRLW f, V_x13, f \ // f = f <<< 19
PROLD(TMP3, TMP4, 9) \ // TMP4 = TT2 <<< 9 VRLW TMP3, V_x09, TMP4 \ // TMP4 = TT2 <<< 9
PROLD(TMP4, TMP0, 8) \ // TMP0 = TT2 <<< 17 VRLW TMP4, V_x08, TMP0 \ // TMP0 = TT2 <<< 17
VXOR TMP3, TMP4, TMP4 \ // TMP4 = TT2 XOR (TT2 <<< 9) VXOR TMP3, TMP4, TMP4 \ // TMP4 = TT2 XOR (TT2 <<< 9)
VXOR TMP4, TMP0, d \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17) VXOR TMP4, TMP0, d \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
#define MESSAGE_SCHEDULE(index) \ #define MESSAGE_SCHEDULE(index) \
loadWordByIndex(TMP0, index+1) \ // Wj-3 loadWordByIndex(TMP0, index+1) \ // Wj-3
PROLD(TMP0, TMP1, 15) \ VRLW TMP0, V_x0F, TMP1 \
loadWordByIndex(TMP0, index-12) \ // Wj-16 loadWordByIndex(TMP0, index-12) \ // Wj-16
VXOR TMP0, TMP1, TMP0 \ VXOR TMP0, TMP1, TMP0 \
loadWordByIndex(TMP1, index-5) \ // Wj-9 loadWordByIndex(TMP1, index-5) \ // Wj-9
VXOR TMP0, TMP1, TMP0 \ VXOR TMP0, TMP1, TMP0 \
PROLD(TMP0, TMP1, 15) \ VRLW TMP0, V_x0F, TMP1 \
PROLD(TMP1, TMP2, 8) \ VRLW TMP1, V_x08, TMP2 \
VXOR TMP1, TMP0, TMP0 \ VXOR TMP1, TMP0, TMP0 \
VXOR TMP2, TMP0, TMP0 \ // P1 VXOR TMP2, TMP0, TMP0 \ // P1
loadWordByIndex(TMP1, index-9) \ // Wj-13 loadWordByIndex(TMP1, index-9) \ // Wj-13
PROLD(TMP1, TMP2, 7) \ VRLW TMP1, V_x07, TMP2 \
VXOR TMP2, TMP0, TMP0 \ VXOR TMP2, TMP0, TMP0 \
loadWordByIndex(TMP1, index-2) \ // Wj-6 loadWordByIndex(TMP1, index-2) \ // Wj-6
VXOR TMP1, TMP0, TMP1 \ VXOR TMP1, TMP0, TMP1 \
@ -147,12 +161,12 @@ GLOBL t_const<>(SB), RODATA, $32
#define ROUND_16_63(index, T, a, b, c, d, e, f, g, h) \ #define ROUND_16_63(index, T, a, b, c, d, e, f, g, h) \
MESSAGE_SCHEDULE(index) \ // TMP1 is Wt+4 now, please do not use it MESSAGE_SCHEDULE(index) \ // TMP1 is Wt+4 now, please do not use it
PROLD(a, TMP0, 12) \ VRLW a, V_x0C, TMP0 \
VOR TMP0, TMP0, TMP4 \ VOR TMP0, TMP0, TMP4 \
VADDUWM T, TMP0, TMP0 \ VADDUWM T, TMP0, TMP0 \
VRLW T, ONE, T \ VRLW T, ONE, T \
VADDUWM e, TMP0, TMP0 \ VADDUWM e, TMP0, TMP0 \
PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1 VRLW TMP0, V_x07, TMP2 \ // TMP2 = SS1
VXOR TMP2, TMP4, TMP0 \ // TMP0 = SS2 VXOR TMP2, TMP4, TMP0 \ // TMP0 = SS2
VOR a, b, TMP3 \ VOR a, b, TMP3 \
VAND a, b, TMP4 \ VAND a, b, TMP4 \
@ -170,11 +184,11 @@ GLOBL t_const<>(SB), RODATA, $32
VXOR g, TMP1, TMP1 \ // (f XOR g) AND e XOR g VXOR g, TMP1, TMP1 \ // (f XOR g) AND e XOR g
VADDUWM TMP3, TMP1, TMP3 \ // TT2 VADDUWM TMP3, TMP1, TMP3 \ // TT2
VOR b, b, TMP1 \ VOR b, b, TMP1 \
PROLD(TMP1, b, 9) \ // b = b <<< 9 VRLW TMP1, V_x09, b \ // b = b <<< 9
VOR TMP4, TMP4, h \ // h = TT1 VOR TMP4, TMP4, h \ // h = TT1
PROLD(f, f, 19) \ // f = f <<< 19 VRLW f, V_x13, f \ // f = f <<< 19
PROLD(TMP3, TMP1, 9) \ // TMP1 = TT2 <<< 9 VRLW TMP3, V_x09, TMP1 \ // TMP1 = TT2 <<< 9
PROLD(TMP1, TMP0, 8) \ // TMP0 = TT2 <<< 17 VRLW TMP1, V_x08, TMP0 \ // TMP0 = TT2 <<< 17
VXOR TMP3, TMP1, TMP1 \ // TMP1 = TT2 XOR (TT2 <<< 9) VXOR TMP3, TMP1, TMP1 \ // TMP1 = TT2 XOR (TT2 <<< 9)
VXOR TMP1, TMP0, d \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17) VXOR TMP1, TMP0, d \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
@ -189,6 +203,12 @@ TEXT ·blockMultBy4(SB), NOSPLIT, $0
LVX (R4), ESPERMW LVX (R4), ESPERMW
#endif #endif
VSPLTISW $1, ONE VSPLTISW $1, ONE
VSPLTISW $7, V_x07
VSPLTISW $8, V_x08
VSPLTISW $9, V_x09
VSPLTISW $12, V_x0C
VSPLTISW $15, V_x0F
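VSPLTISW $19, V_x13 // NOTE(review): 19 is outside VSPLTISW's 5-bit signed immediate range (-16..15); presumably this relies on the immediate being truncated to 5 bits and on VRLW using only the low 5 bits of each word as the rotate count - confirm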
MOVD $t_const<>(SB), R4 MOVD $t_const<>(SB), R4
LXVD2X (R0)(R4), T0 LXVD2X (R0)(R4), T0
LXVD2X (R_x10)(R4), T1 LXVD2X (R_x10)(R4), T1