diff --git a/sm3/sm3blocks_ppc64x.s b/sm3/sm3blocks_ppc64x.s index f67fcf4..8eb7db9 100644 --- a/sm3/sm3blocks_ppc64x.s +++ b/sm3/sm3blocks_ppc64x.s @@ -6,26 +6,37 @@ #include "textflag.h" -DATA mask<>+0x00(SB)/8, $0x0001020310111213 -DATA mask<>+0x08(SB)/8, $0x0405060714151617 -DATA mask<>+0x10(SB)/8, $0x08090a0b18191a1b -DATA mask<>+0x18(SB)/8, $0x0c0d0e0f1c1d1e1f -DATA mask<>+0x20(SB)/8, $0x0001020304050607 -DATA mask<>+0x28(SB)/8, $0x1011121314151617 -DATA mask<>+0x30(SB)/8, $0x08090a0b0c0d0e0f -DATA mask<>+0x38(SB)/8, $0x18191a1b1c1d1e1f -GLOBL mask<>(SB), 8, $64 +// For P9 instruction emulation +#define ESPERM V21 // Endian swapping permute into BE +#define TMP2 V22 // Temporary for P8_STXVB16X +DATA ·mask+0x00(SB)/8, $0x0f0e0d0c0b0a0908 // Permute for vector doubleword endian swap +DATA ·mask+0x08(SB)/8, $0x0706050403020100 +DATA ·mask+0x10(SB)/8, $0x0001020310111213 // Permute for transpose matrix +DATA ·mask+0x18(SB)/8, $0x0405060714151617 +DATA ·mask+0x20(SB)/8, $0x08090a0b18191a1b +DATA ·mask+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f +DATA ·mask+0x30(SB)/8, $0x0001020304050607 +DATA ·mask+0x38(SB)/8, $0x1011121314151617 +DATA ·mask+0x40(SB)/8, $0x08090a0b0c0d0e0f +DATA ·mask+0x48(SB)/8, $0x18191a1b1c1d1e1f +GLOBL ·mask(SB), RODATA, $80 #ifdef GOARCH_ppc64le # ifdef GOPPC64_power9 #define P8_LXVB16X(RA,RB,VT) LXVB16X (RA+RB), VT #define P8_STXVB16X(VS,RA,RB) STXVB16X VS, (RA+RB) # else +// On POWER8/ppc64le, emulate the POWER9 instructions by loading unaligned +// doublewords and byte-swapping each doubleword to emulate BE load/stores. 
+#define NEEDS_ESPERM #define P8_LXVB16X(RA,RB,VT) \ - LXVD2X (RA+RB), VT + LXVD2X (RA+RB), VT \ + VPERM VT, VT, ESPERM, VT #define P8_STXVB16X(VS,RA,RB) \ - STXVD2X VS, (RA+RB) + VPERM VS, VS, ESPERM, TMP2 \ + STXVD2X TMP2, (RA+RB) # endif // defined(GOPPC64_power9) #else @@ -52,6 +63,19 @@ TEXT ·transposeMatrix(SB),NOSPLIT,$0 MOVD $32, R8 MOVD $48, R9 +#ifdef NEEDS_ESPERM + MOVD $·mask(SB), R4 + LVX (R4), ESPERM + ADD $0x10, R4 +#else + MOVD $·mask+0x10(SB), R4 +#endif + + LVX (R0)(R4), V8 + LVX (R6)(R4), V9 + LVX (R8)(R4), V10 + LVX (R9)(R4), V11 + MOVD (R0)(R3), R4 P8_LXVB16X(R4, R0, V0) P8_LXVB16X(R4, R6, V4) @@ -65,12 +89,6 @@ TEXT ·transposeMatrix(SB),NOSPLIT,$0 P8_LXVB16X(R4, R0, V3) P8_LXVB16X(R4, R6, V7) - MOVD $mask<>+0x00(SB), R4 - - LVX (R0)(R4), V8 - LVX (R6)(R4), V9 - LVX (R8)(R4), V10 - LVX (R9)(R4), V11 TRANSPOSE_MATRIX(V0, V1, V2, V3, V8, V9, V10, V11, V12, V13, V14, V15) TRANSPOSE_MATRIX(V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15)