diff --git a/sm3/sm3block_ppc64x.s b/sm3/sm3block_ppc64x.s index ebe19e2..9e0ac61 100644 --- a/sm3/sm3block_ppc64x.s +++ b/sm3/sm3block_ppc64x.s @@ -59,6 +59,9 @@ #define XTMP4 V8 #define XFER V9 +#define V_x07 V10 +#define V_x08 V11 +#define V_x0F V12 // For instruction emulation #define ESPERMW V31 // Endian swapping permute into BE @@ -143,29 +146,29 @@ GLOBL ·flip_mask(SB), RODATA, $16 #define MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3) \ VSLDOI $12, XWORD0, XWORD1, XTMP0; \ // XTMP0 = W[-13] = {w3, w4, w5, w6} - PROLD(XTMP0, XTMP1, 7); \ // XTMP1 = W[-13] rol 7 + VRLW XTMP0, V_x07, XTMP1; \ // XTMP1 = W[-13] rol 7 VSLDOI $8, XWORD2, XWORD3, XTMP0; \ // XTMP0 = W[-6] = {w10, w11, w12, w13} VXOR XTMP0, XTMP1, XTMP0; \ // XTMP0 = W[-6] xor (W[-13] rol 7) ; \ // Prepare P1 parameters VSLDOI $12, XWORD1, XWORD2, XTMP1; \ // XTMP1 = W[-9] = {w7, w8, w9, w10} VXOR XTMP1, XWORD0, XTMP1; \ // XTMP1 = W[-9] xor W[-16] VSLDOI $4, XWORD3, XWORD2, XTMP3; \ // XTMP3 = W[-3] = {w13, w14, w15, w8} - PROLD(XTMP3, XTMP2, 15); \ // XTMP2 = W[-3] rol 15 + VRLW XTMP3, V_x0F, XTMP2; \ // XTMP2 = W[-3] rol 15 VXOR XTMP1, XTMP2, XTMP2; \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {ABxx} ; \ // P1 - PROLD(XTMP2, XTMP4, 15); \ // XTMP4 = = XTMP2 rol 15 {ABxx} - PROLD(XTMP4, XTMP3, 8); \ // XTMP3 = XTMP2 rol 23 {ABxx} + VRLW XTMP2, V_x0F, XTMP4; \ // XTMP4 = XTMP2 rol 15 {ABxx} + VRLW XTMP4, V_x08, XTMP3; \ // XTMP3 = XTMP4 rol 8 {ABxx} = XTMP2 rol 23 {ABxx} VXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {ABxx}) VXOR XTMP4, XTMP3, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {ABxx}) XOR (XTMP2 rol 23 {ABxx}) ; \ // First 2 words message schedule result VXOR XTMP4, XTMP0, XTMP2; \ // XTMP2 = {w[0], w[1], ..., ...} ; \ // Prepare P1 parameters VSLDOI $4, XWORD3, XTMP2, XTMP3; \ // XTMP3 = W[-3] = {w13, w14, w15, w0} - PROLD(XTMP3, XTMP4, 15); \ // XTMP4 = W[-3] rol 15 + VRLW XTMP3, V_x0F, XTMP4; \ // XTMP4 = W[-3] rol 15 VXOR XTMP1, XTMP4, XTMP4; \ // XTMP4 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {ABCD} ; \ // P1 - PROLD(XTMP4, XTMP3, 15); \ // XTMP3 = = XTMP4 rol 15 {ABCD} - PROLD(XTMP3, XTMP1, 8); \ // XTMP1 = XTMP4 rol 23 {ABCD} + VRLW XTMP4, V_x0F, XTMP3; \ // XTMP3 = XTMP4 rol 15 {ABCD} + VRLW XTMP3, V_x08, XTMP1; \ // XTMP1 = XTMP4 rol 8 {ABCD} = XTMP4 rol 23 {ABCD} VXOR XTMP4, XTMP3, XTMP3; \ // XTMP3 = XTMP4 XOR (XTMP4 rol 15 {ABCD}) VXOR XTMP3, XTMP1, XTMP1; \ // XTMP1 = XTMP4 XOR (XTMP4 rol 15 {ABCD}) XOR (XTMP4 rol 23 {ABCD}) ; \ // 4 words message schedule result @@ -202,6 +205,10 @@ TEXT ·blockASM(SB), NOSPLIT, $0 MOVWZ 24(CTX), g MOVWZ 28(CTX), h + VSPLTISW $7, V_x07 + VSPLTISW $8, V_x08 + VSPLTISW $15, V_x0F + loop: PPC64X_LXVW4X(INP, R_x000, XWORD0) PPC64X_LXVW4X(INP, R_x010, XWORD1) diff --git a/sm3/sm3blocks_ppc64x.s b/sm3/sm3blocks_ppc64x.s index 05311fb..bcdcc3a 100644 --- a/sm3/sm3blocks_ppc64x.s +++ b/sm3/sm3blocks_ppc64x.s @@ -14,10 +14,12 @@ #define g V5 #define d V6 #define h V7 + #define T0 V8 #define T1 V9 #define T2 V10 #define ONE V11 + #define TMP0 V12 #define TMP1 V13 #define TMP2 V14 @@ -25,6 +27,18 @@ #define TMP4 V16 #define TMP5 V17 +#define DATA0 V16 +#define DATA1 V17 +#define DATA2 V18 +#define DATA3 V19 + +#define V_x07 V20 +#define V_x08 V21 +#define V_x09 V22 +#define V_x0C V23 +#define V_x13 V24 +#define V_x0F V25 + // For instruction emulation #define ESPERMW V31 // Endian swapping permute into BE @@ -67,19 +81,19 @@ GLOBL t_const<>(SB), RODATA, $32 // one word is 16 bytes #define prepare4Words \ - PPC64X_LXVW4X(srcPtr1, srcPtrPtr, V16); \ - PPC64X_LXVW4X(srcPtr2, srcPtrPtr, V17); \ - PPC64X_LXVW4X(srcPtr3, srcPtrPtr, V18); \ - PPC64X_LXVW4X(srcPtr4, srcPtrPtr, V19); \ - TRANSPOSE_MATRIX(V16, V17, V18, V19); \ - ADD $16, srcPtrPtr; \ - STXVW4X V16, (wordPtr); \ - ADD $16, wordPtr; \ - STXVW4X V17, (wordPtr); \ - ADD $16, wordPtr; \ - STXVW4X V18, (wordPtr); \ - ADD $16, wordPtr; \ - STXVW4X V19, (wordPtr); \ + PPC64X_LXVW4X(srcPtr1, srcPtrPtr, DATA0); \ + PPC64X_LXVW4X(srcPtr2, srcPtrPtr, DATA1); \ + PPC64X_LXVW4X(srcPtr3, srcPtrPtr, DATA2); \ + PPC64X_LXVW4X(srcPtr4, srcPtrPtr, DATA3); \ + TRANSPOSE_MATRIX(DATA0, DATA1, DATA2, DATA3);\ + ADD $16, srcPtrPtr; \ + STXVW4X DATA0, (wordPtr); \ + ADD $16, wordPtr; \ + STXVW4X DATA1, (wordPtr); \ + ADD $16, wordPtr; \ + STXVW4X DATA2, (wordPtr); \ + ADD $16, wordPtr; \ + STXVW4X DATA3, (wordPtr); \ ADD $16, wordPtr #define TRANSPOSE_MATRIX(T0, T1, T2, T3) \ @@ -93,12 +107,12 @@ GLOBL t_const<>(SB), RODATA, $32 XXPERMDI TMP2, TMP3, $3, T3 #define ROUND_00_11(index, T, a, b, c, d, e, f, g, h) \ - PROLD(a, TMP0, 12) \ + VRLW a, V_x0C, TMP0 \ VOR TMP0, TMP0, TMP1 \ VADDUWM T, TMP0, TMP0 \ VRLW T, ONE, T \ VADDUWM e, TMP0, TMP0 \ - PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1 + VRLW TMP0, V_x07, TMP2 \ // TMP2 = SS1 VXOR TMP2, TMP1, TMP0 \ // TMP0 = SS2 VXOR a, b, TMP1 \ VXOR c, TMP1, TMP1 \ @@ -114,27 +128,27 @@ GLOBL t_const<>(SB), RODATA, $32 VXOR g, TMP4, TMP4 \ VADDUWM TMP4, TMP3, TMP3 \ // TT2 = (e XOR f XOR g) + Wt + h + SS1 VOR b, b, TMP4 \ - PROLD(TMP4, b, 9) \ // b = b <<< 9 + VRLW TMP4, V_x09, b \ // b = b <<< 9 VOR TMP1, TMP1, h \ // h = TT1 - PROLD(f, f, 19) \ // f = f <<< 19 - PROLD(TMP3, TMP4, 9) \ // TMP4 = TT2 <<< 9 - PROLD(TMP4, TMP0, 8) \ // TMP0 = TT2 <<< 17 + VRLW f, V_x13, f \ // f = f <<< 19 + VRLW TMP3, V_x09, TMP4 \ // TMP4 = TT2 <<< 9 + VRLW TMP4, V_x08, TMP0 \ // TMP0 = TT2 <<< 17 VXOR TMP3, TMP4, TMP4 \ // TMP4 = TT2 XOR (TT2 <<< 9) VXOR TMP4, TMP0, d \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17) #define MESSAGE_SCHEDULE(index) \ loadWordByIndex(TMP0, index+1) \ // Wj-3 - PROLD(TMP0, TMP1, 15) \ + VRLW TMP0, V_x0F, TMP1 \ loadWordByIndex(TMP0, index-12) \ // Wj-16 VXOR TMP0, TMP1, TMP0 \ loadWordByIndex(TMP1, index-5) \ // Wj-9 VXOR TMP0, TMP1, TMP0 \ - PROLD(TMP0, TMP1, 15) \ - PROLD(TMP1, TMP2, 8) \ + VRLW TMP0, V_x0F, TMP1 \ + VRLW TMP1, V_x08, TMP2 \ VXOR TMP1, TMP0, TMP0 \ VXOR TMP2, TMP0, TMP0 \ // P1 loadWordByIndex(TMP1, index-9) \ // Wj-13 - PROLD(TMP1, TMP2, 7) \ + VRLW TMP1, V_x07, TMP2 \ VXOR TMP2, TMP0, TMP0 \ loadWordByIndex(TMP1, index-2) \ // Wj-6 VXOR TMP1, TMP0, TMP1 \ @@ -147,12 +161,12 @@ GLOBL t_const<>(SB), RODATA, $32 #define ROUND_16_63(index, T, a, b, c, d, e, f, g, h) \ MESSAGE_SCHEDULE(index) \ // TMP1 is Wt+4 now, Pls do not use it - PROLD(a, TMP0, 12) \ + VRLW a, V_x0C, TMP0 \ VOR TMP0, TMP0, TMP4 \ VADDUWM T, TMP0, TMP0 \ VRLW T, ONE, T \ VADDUWM e, TMP0, TMP0 \ - PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1 + VRLW TMP0, V_x07, TMP2 \ // TMP2 = SS1 VXOR TMP2, TMP4, TMP0 \ // TMP0 = SS2 VOR a, b, TMP3 \ VAND a, b, TMP4 \ @@ -170,11 +184,11 @@ GLOBL t_const<>(SB), RODATA, $32 VXOR g, TMP1, TMP1 \ // (f XOR g) AND e XOR g VADDUWM TMP3, TMP1, TMP3 \ // TT2 VOR b, b, TMP1 \ - PROLD(TMP1, b, 9) \ // b = b <<< 9 + VRLW TMP1, V_x09, b \ // b = b <<< 9 VOR TMP4, TMP4, h \ // h = TT1 - PROLD(f, f, 19) \ // f = f <<< 19 - PROLD(TMP3, TMP1, 9) \ // TMP1 = TT2 <<< 9 - PROLD(TMP1, TMP0, 8) \ // TMP0 = TT2 <<< 17 + VRLW f, V_x13, f \ // f = f <<< 19 + VRLW TMP3, V_x09, TMP1 \ // TMP1 = TT2 <<< 9 + VRLW TMP1, V_x08, TMP0 \ // TMP0 = TT2 <<< 17 VXOR TMP3, TMP1, TMP1 \ // TMP1 = TT2 XOR (TT2 <<< 9) VXOR TMP1, TMP0, d \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17) @@ -189,6 +203,12 @@ TEXT ·blockMultBy4(SB), NOSPLIT, $0 LVX (R4), ESPERMW #endif VSPLTISW $1, ONE + VSPLTISW $7, V_x07 + VSPLTISW $8, V_x08 + VSPLTISW $9, V_x09 + VSPLTISW $12, V_x0C + VSPLTISW $15, V_x0F + VSPLTISW $19, V_x13 MOVD $t_const<>(SB), R4 LXVD2X (R0)(R4), T0 LXVD2X (R_x10)(R4), T1