diff --git a/sm4/aesni_macros_ppc64x.s b/sm4/aesni_macros_ppc64x.s
index fa8d303..b4cf64b 100644
--- a/sm4/aesni_macros_ppc64x.s
+++ b/sm4/aesni_macros_ppc64x.s
@@ -32,17 +32,9 @@
 	VPERM VS, VS, ESPERMW, VS \
 	STXVW4X VS, (RA+RB)
 
-#define CBC_STXVW4X(VS, VT, RA, RB) \
-	VPERM VS, VS, ESPERMW, VS \
-	VXOR VS, VT, VS \
-	STXVW4X VS, (RA+RB)
-
 #else
 #define PPC64X_LXVW4X(RA,RB,VT) LXVW4X (RA+RB), VT
 #define PPC64X_STXVW4X(VS, RA, RB) STXVW4X VS, (RA+RB)
-#define CBC_STXVW4X(VS, VT, RA, RB) \
-	VXOR VS, VT, VS \
-	STXVW4X VS, (RA+RB)
 #endif // defined(GOARCH_ppc64le)
 
 // r = s <<< n
diff --git a/sm4/cbc_ppc64x.s b/sm4/cbc_ppc64x.s
index 910e3f9..4766eef 100644
--- a/sm4/cbc_ppc64x.s
+++ b/sm4/cbc_ppc64x.s
@@ -29,6 +29,20 @@
 
 #include "aesni_macros_ppc64x.s"
 
+#ifdef NEEDS_PERMW
+#define REVERSE32LE_8BLOCKS \
+	VPERM V0, V0, ESPERMW, V0 \
+	VPERM V1, V1, ESPERMW, V1 \
+	VPERM V2, V2, ESPERMW, V2 \
+	VPERM V3, V3, ESPERMW, V3 \
+	VPERM V4, V4, ESPERMW, V4 \
+	VPERM V5, V5, ESPERMW, V5 \
+	VPERM V6, V6, ESPERMW, V6 \
+	VPERM V7, V7, ESPERMW, V7
+#else
+#define REVERSE32LE_8BLOCKS
+#endif
+
 // func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
 TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
 	#define dstPtr R3
@@ -105,23 +119,32 @@ loop8blocks:
 	TRANSPOSE_MATRIX(V0, V1, V2, V3)
 	TRANSPOSE_MATRIX(V4, V5, V6, V7)
 
+	REVERSE32LE_8BLOCKS // for ppc64le
+
 	LXVW4X (R16)(R0), TMP0
 	LXVW4X (R16)(R7), TMP1
 	LXVW4X (R16)(R8), TMP2
-	LXVW4X (R16)(R9), TMP3
-	CBC_STXVW4X(V0, TMP0, R17, R0)
-	CBC_STXVW4X(V1, TMP1, R17, R7)
-	CBC_STXVW4X(V2, TMP2, R17, R8)
-	CBC_STXVW4X(V3, TMP3, R17, R9)
-
+	LXVW4X (R16)(R9), TMP3
+	VXOR V0, TMP0, V0
+	VXOR V1, TMP1, V1
+	VXOR V2, TMP2, V2
+	VXOR V3, TMP3, V3
 	LXVW4X (R16)(R10), TMP0
 	LXVW4X (R16)(R11), TMP1
 	LXVW4X (R16)(R12), TMP2
 	LXVW4X (R16)(R14), TMP3
-	CBC_STXVW4X(V4, TMP0, R17, R10)
-	CBC_STXVW4X(V5, TMP1, R17, R11)
-	CBC_STXVW4X(V6, TMP2, R17, R12)
-	CBC_STXVW4X(V7, TMP3, R17, R14)
+	VXOR V4, TMP0, V4
+	VXOR V5, TMP1, V5
+	VXOR V6, TMP2, V6
+	VXOR V7, TMP3, V7
+	STXVW4X V0, (R17)(R0)
+	STXVW4X V1, (R17)(R7)
+	STXVW4X V2, (R17)(R8)
+	STXVW4X V3, (R17)(R9)
+	STXVW4X V4, (R17)(R10)
+	STXVW4X V5, (R17)(R11)
+	STXVW4X V6, (R17)(R12)
+	STXVW4X V7, (R17)(R14)
 
 	CMP srcLen, $144 // 9 blocks
 	BGE loop8blocks
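
Note on the reordering: with the old CBC_STXVW4X macro, plaintext blocks 0-3 were stored to (R17) before the chaining inputs for blocks 4-7 were loaded from (R16), so an in-place call (dst aliasing src) could overwrite ciphertext block 3 before it was read back as TMP0. The rewrite appears to address exactly that: all eight LXVW4X loads of previous ciphertext now complete before any STXVW4X store, and the ppc64le byte-reversal is hoisted out of the store path into a single REVERSE32LE_8BLOCKS step. A minimal Go sketch of the intended ordering follows; cbcChain8 and the decrypt callback are hypothetical names for illustration, not this package's API.

```go
package sm4

// cbcChain8 is a hypothetical scalar model of the patched 8-block loop:
// decrypt all blocks into temporaries, read every chaining input, and
// only then write plaintext. The assembly keeps pt in V0..V7 and the
// chaining inputs in TMP0..TMP3.
func cbcChain8(decrypt func(dst, src []byte), dst, src, iv []byte) {
	const bs = 16 // SM4 block size
	var pt [8][bs]byte
	for i := 0; i < 8; i++ {
		decrypt(pt[i][:], src[i*bs:(i+1)*bs])
	}
	// Load all previous-ciphertext blocks BEFORE any store, mirroring
	// the grouped LXVW4X loads above. If dst and src alias (in-place
	// decryption), storing plaintext block 3 first would clobber
	// ciphertext block 3, the chaining input for block 4.
	var prev [8][bs]byte
	copy(prev[0][:], iv)
	for i := 1; i < 8; i++ {
		copy(prev[i][:], src[(i-1)*bs:i*bs])
	}
	// XOR and store, mirroring the grouped VXOR/STXVW4X sequence.
	for i := 0; i < 8; i++ {
		for j := 0; j < bs; j++ {
			dst[i*bs+j] = pt[i][j] ^ prev[i][j]
		}
	}
}
```

The same load-before-store discipline is what crypto/cipher's BlockMode contract ("Dst and src must overlap entirely or not at all") makes worth preserving whenever the loop width changes.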