diff --git a/sm4/aesni_macros_ppc64x.s b/sm4/aesni_macros_ppc64x.s
new file mode 100644
index 0000000..d919b06
--- /dev/null
+++ b/sm4/aesni_macros_ppc64x.s
@@ -0,0 +1,228 @@
+DATA ·rcon+0x00(SB)/8, $0x0b0a09080f0e0d0c // byte swap per word
+DATA ·rcon+0x08(SB)/8, $0x0302010007060504
+DATA ·rcon+0x10(SB)/8, $0x0001020310111213 // Permute for transpose matrix
+DATA ·rcon+0x18(SB)/8, $0x0405060714151617
+DATA ·rcon+0x20(SB)/8, $0x08090a0b18191a1b
+DATA ·rcon+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f
+DATA ·rcon+0x30(SB)/8, $0x0001020304050607
+DATA ·rcon+0x38(SB)/8, $0x1011121314151617
+DATA ·rcon+0x40(SB)/8, $0x08090a0b0c0d0e0f
+DATA ·rcon+0x48(SB)/8, $0x18191a1b1c1d1e1f
+DATA ·rcon+0x50(SB)/8, $0x0c0d0e0f08090a0b // reverse words
+DATA ·rcon+0x58(SB)/8, $0x0405060700010203
+DATA ·rcon+0x60(SB)/8, $0x0F0F0F0F0F0F0F0F // nibble mask
+DATA ·rcon+0x68(SB)/8, $0x0F0F0F0F0F0F0F0F
+DATA ·rcon+0x70(SB)/8, $0x000D0A0704010E0B // inverse shift rows
+DATA ·rcon+0x78(SB)/8, $0x0805020F0C090603
+DATA ·rcon+0x80(SB)/8, $0x691CA0D5B6C37F0A // affine transform matrix m1 low
+DATA ·rcon+0x88(SB)/8, $0x53269AEF8CF94530
+DATA ·rcon+0x90(SB)/8, $0x009837AF6CF45BC3 // affine transform matrix m1 high
+DATA ·rcon+0x98(SB)/8, $0xAB339C04C75FF068
+DATA ·rcon+0xa0(SB)/8, $0x616EF1FE050A959A // affine transform matrix m2 low
+DATA ·rcon+0xa8(SB)/8, $0xF5FA656A919E010E
+DATA ·rcon+0xb0(SB)/8, $0x00A4E044CD692D89 // affine transform matrix m2 high
+DATA ·rcon+0xb8(SB)/8, $0xA50145E168CC882C
+GLOBL ·rcon(SB), 8, $192
+
+#define LOAD_CONSTS(baseAddrReg, offsetReg) \
+	MOVD $·rcon+0x10(SB), baseAddrReg; \
+	LXVD2X (baseAddrReg)(R0), M0; \
+	MOVD $0x10, offsetReg; \
+	LXVD2X (baseAddrReg)(offsetReg), M1; \
+	MOVD $0x20, offsetReg; \
+	LXVD2X (baseAddrReg)(offsetReg), M2; \
+	MOVD $0x30, offsetReg; \
+	LXVD2X (baseAddrReg)(offsetReg), M3; \
+	MOVD $0x40, offsetReg; \
+	LXVD2X (baseAddrReg)(offsetReg), REVERSE_WORDS; \
+	MOVD $0x50, offsetReg; \
+	LXVD2X (baseAddrReg)(offsetReg), NIBBLE_MASK; \
+	MOVD $0x60, offsetReg; \
+	LXVD2X (baseAddrReg)(offsetReg), INVERSE_SHIFT_ROWS; \
+	MOVD $0x70, offsetReg; \
+	LXVD2X (baseAddrReg)(offsetReg), M1L; \
+	MOVD $0x80, offsetReg; \
+	LXVD2X (baseAddrReg)(offsetReg), M1H; \
+	MOVD $0x90, offsetReg; \
+	LXVD2X (baseAddrReg)(offsetReg), M2L; \
+	MOVD $0xa0, offsetReg; \
+	LXVD2X (baseAddrReg)(offsetReg), M2H
+
+#ifdef GOARCH_ppc64le
+#define NEEDS_PERMW
+
+#define PPC64X_LXVW4X(RA,RB,VT) \
+	LXVW4X (RA+RB), VT \
+	VPERM VT, VT, ESPERMW, VT
+
+#define PPC64X_STXVW4X(VS, RA, RB) \
+	VPERM VS, VS, ESPERMW, VS \
+	STXVW4X VS, (RA+RB)
+
+#else
+#define PPC64X_LXVW4X(RA,RB,VT) LXVW4X (RA+RB), VT
+#define PPC64X_STXVW4X(VS, RA, RB) STXVW4X VS, (RA+RB)
+#endif // defined(GOARCH_ppc64le)
+
+// r = s <<< n
+// VSPLTISW takes a 5-bit signed immediate, so n MUST be in [0, 15];
+// rotations larger than 15 require applying the macro more than once.
+// There does not appear to be a single vector instruction that rotates
+// words by an immediate amount, hence the count is splatted into a
+// register first.
+#define PROLD(s, r, tmp, n) \ + VSPLTISW $n, tmp \ + VRLW s, tmp, r + +// input: from high to low +// t0 = t0.S3, t0.S2, t0.S1, t0.S0 +// t1 = t1.S3, t1.S2, t1.S1, t1.S0 +// t2 = t2.S3, t2.S2, t2.S1, t2.S0 +// t3 = t3.S3, t3.S2, t3.S1, t3.S0 +// output: from high to low +// t0 = t3.S0, t2.S0, t1.S0, t0.S0 +// t1 = t3.S1, t2.S1, t1.S1, t0.S1 +// t2 = t3.S2, t2.S2, t1.S2, t0.S2 +// t3 = t3.S3, t2.S3, t1.S3, t0.S3 +#define PRE_TRANSPOSE_MATRIX(T0, T1, T2, T3) \ + VPERM T0, T1, M0, TMP0; \ + VPERM T2, T3, M0, TMP1; \ + VPERM T0, T1, M1, TMP2; \ + VPERM T2, T3, M1, TMP3; \ + VPERM TMP0, TMP1, M2, T0; \ + VPERM TMP0, TMP1, M3, T1; \ + VPERM TMP2, TMP3, M2, T2; \ + VPERM TMP2, TMP3, M3, T3 + +// input: from high to low +// t0 = t0.S3, t0.S2, t0.S1, t0.S0 +// t1 = t1.S3, t1.S2, t1.S1, t1.S0 +// t2 = t2.S3, t2.S2, t2.S1, t2.S0 +// t3 = t3.S3, t3.S2, t3.S1, t3.S0 +// output: from high to low +// t0 = t0.S0, t1.S0, t2.S0, t3.S0 +// t1 = t0.S1, t1.S1, t2.S1, t3.S1 +// t2 = t0.S2, t1.S2, t2.S2, t3.S2 +// t3 = t0.S3, t1.S3, t2.S3, t3.S3 +#define TRANSPOSE_MATRIX(T0, T1, T2, T3) \ + VPERM T1, T0, M0, TMP0; \ + VPERM T1, T0, M1, TMP1; \ + VPERM T3, T2, M0, TMP2; \ + VPERM T3, T2, M1, TMP3; \ + VPERM TMP2, TMP0, M2, T0; \ + VPERM TMP2, TMP0, M3, T1; \ + VPERM TMP3, TMP1, M2, T2; \ + VPERM TMP3, TMP1, M3, T3; \ + +// Affine Transform +// parameters: +// - L: table low nibbles +// - H: table high nibbles +// - x: 128 bits register as sbox input/output data +// - y: 128 bits temp register +// - z: 128 bits temp register +#define AFFINE_TRANSFORM(L, H, V_FOUR, x, y, z) \ + VAND NIBBLE_MASK, x, z; \ + VPERM L, L, z, y; \ + VSRD x, V_FOUR, x; \ + VAND NIBBLE_MASK, x, z; \ + VPERM H, H, z, x; \ + VXOR y, x, x + +// Affine Transform +// parameters: +// - L: table low nibbles +// - H: table high nibbles +// - x: 128 bits register as sbox input/output data +// - y: 128 bits temp register +// - z: 128 bits temp register +#define AFFINE_TRANSFORM_NOTX(L, H, V_FOUR, x, y, z) \ + VNOR x, x, z; \ // z = NOT(x) + VAND NIBBLE_MASK, z, z; \ + VPERM L, L, z, y; \ + VSRD x, V_FOUR, x; \ + VAND NIBBLE_MASK, x, z; \ + VPERM H, H, z, x; \ + VXOR y, x, x + +// SM4 sbox function +// parameters: +// - x: 128 bits register as sbox input/output data +// - y: 128 bits temp register +// - z: 128 bits temp register +#define SM4_SBOX(x, y, z) \ + AFFINE_TRANSFORM(M1L, M1H, V_FOUR, x, y, z); \ + VPERM x, x, INVERSE_SHIFT_ROWS, x; \ + VCIPHERLAST x, NIBBLE_MASK, x; \ + AFFINE_TRANSFORM_NOTX(M2L, M2H, V_FOUR, x, y, z) + +// SM4 TAO L1 function +// parameters: +// - x: 128 bits register as TAO_L1 input/output data +// - tmp1: 128 bits temp register +// - tmp2: 128 bits temp register +// - tmp3: 128 bits temp register +#define SM4_TAO_L1(x, tmp1, tmp2, tmp3) \ + SM4_SBOX(x, tmp1, tmp2); \ + ; \ //#################### 4 parallel L1 linear transforms ##################// + VSPLTISW $8, tmp3; \ + VRLW x, tmp3, tmp1; \ // tmp1 = x <<< 8 + VRLW tmp1, tmp3, tmp2; \ // tmp2 = x <<< 16 + VXOR x, tmp1, tmp1; \ // tmp1 = x xor (x <<< 8) + VXOR tmp1, tmp2, tmp1; \ // tmp1 = x xor (x <<< 8) xor (x <<< 16) + VRLW tmp2, tmp3, tmp2; \ // tmp2 = x <<< 24 + VXOR tmp2, x, x; \ // x = x xor (x <<< 24) + VSPLTISW $2, tmp3; \ + VRLW tmp1, tmp3, tmp1; \ // tmp1 = (x xor (x <<< 8) xor (x <<< 16)) <<< 2 + VXOR tmp1, x, x + +// SM4 round function +// t0 ^= tao_l1(t1^t2^t3^xk) +// parameters: +// - RK: round key register +// - x: 128 bits temp register +// - tmp1: 128 bits temp register +// - tmp2: 128 bits temp register +// - tmp3: 128 bits temp register +// - t0: 
128 bits register for data as result +// - t1: 128 bits register for data +// - t2: 128 bits register for data +// - t3: 128 bits register for data +#define SM4_ROUND(RK, x, tmp1, tmp2, tmp3, t0, t1, t2, t3) \ + VXOR RK, t1, x; \ + VXOR t2, x, x; \ + VXOR t3, x, x; \ + SM4_TAO_L1(x, tmp1, tmp2, tmp3); \ + VXOR x, t0, t0 + +#define PROCESS_8BLOCKS_4ROUND \ + VSPLTW $0, V8, V9; \ + SM4_ROUND(V9, TMP0, TMP1, TMP2, TMP3, V0, V1, V2, V3); \ + SM4_ROUND(V9, TMP0, TMP1, TMP2, TMP3, V4, V5, V6, V7); \ + VSPLTW $1, V8, V9; \ + SM4_ROUND(V9, TMP0, TMP1, TMP2, TMP3, V1, V2, V3, V0); \ + SM4_ROUND(V9, TMP0, TMP1, TMP2, TMP3, V5, V6, V7, V4); \ + VSPLTW $2, V8, V9; \ + SM4_ROUND(V9, TMP0, TMP1, TMP2, TMP3, V2, V3, V0, V1); \ + SM4_ROUND(V9, TMP0, TMP1, TMP2, TMP3, V6, V7, V4, V5); \ + VSPLTW $3, V8, V9; \ + SM4_ROUND(V9, TMP0, TMP1, TMP2, TMP3, V3, V0, V1, V2); \ + SM4_ROUND(V9, TMP0, TMP1, TMP2, TMP3, V7, V4, V5, V6) + +#define PROCESS_4BLOCKS_4ROUND \ + VSPLTW $0, V8, V9; \ + SM4_ROUND(V9, TMP0, TMP1, TMP2, TMP3, V0, V1, V2, V3); \ + VSPLTW $1, V8, V9; \ + SM4_ROUND(V9, TMP0, TMP1, TMP2, TMP3, V1, V2, V3, V0); \ + VSPLTW $2, V8, V9; \ + SM4_ROUND(V9, TMP0, TMP1, TMP2, TMP3, V2, V3, V0, V1); \ + VSPLTW $3, V8, V9; \ + SM4_ROUND(V9, TMP0, TMP1, TMP2, TMP3, V3, V0, V1, V2) + +#define PROCESS_SINGLEBLOCK_4ROUND \ + SM4_ROUND(V8, TMP0, TMP1, TMP2, TMP3, V0, V1, V2, V3); \ + VSLDOI $4, V8, V8, V8; \ + SM4_ROUND(V8, TMP0, TMP1, TMP2, TMP3, V1, V2, V3, V0); \ + VSLDOI $4, V8, V8, V8; \ + SM4_ROUND(V8, TMP0, TMP1, TMP2, TMP3, V2, V3, V0, V1); \ + VSLDOI $4, V8, V8, V8; \ + SM4_ROUND(V8, TMP0, TMP1, TMP2, TMP3, V3, V0, V1, V2) diff --git a/sm4/asm_ppc64x.s b/sm4/asm_ppc64x.s index 5ab2115..16879e5 100644 --- a/sm4/asm_ppc64x.s +++ b/sm4/asm_ppc64x.s @@ -6,7 +6,6 @@ #include "textflag.h" -#define ZERO V18 #define REVERSE_WORDS V19 #define M1L V20 #define M1H V21 @@ -22,139 +21,12 @@ // For instruction emulation #define ESPERMW V31 // Endian swapping permute into BE -DATA ·rcon+0x00(SB)/8, $0x0b0a09080f0e0d0c // byte swap per word -DATA ·rcon+0x08(SB)/8, $0x0302010007060504 -DATA ·rcon+0x10(SB)/8, $0x0001020310111213 // Permute for transpose matrix -DATA ·rcon+0x18(SB)/8, $0x0405060714151617 -DATA ·rcon+0x20(SB)/8, $0x08090a0b18191a1b -DATA ·rcon+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f -DATA ·rcon+0x30(SB)/8, $0x0001020304050607 -DATA ·rcon+0x38(SB)/8, $0x1011121314151617 -DATA ·rcon+0x40(SB)/8, $0x08090a0b0c0d0e0f -DATA ·rcon+0x48(SB)/8, $0x18191a1b1c1d1e1f -DATA ·rcon+0x50(SB)/8, $0x0c0d0e0f08090a0b // reverse words -DATA ·rcon+0x58(SB)/8, $0x0405060700010203 -DATA ·rcon+0x60(SB)/8, $0x0F0F0F0F0F0F0F0F // nibble mask -DATA ·rcon+0x68(SB)/8, $0x0F0F0F0F0F0F0F0F -DATA ·rcon+0x70(SB)/8, $0x000D0A0704010E0B // inverse shift rows -DATA ·rcon+0x78(SB)/8, $0x0805020F0C090603 -DATA ·rcon+0x80(SB)/8, $0x691CA0D5B6C37F0A // affine transform matrix m1 low -DATA ·rcon+0x88(SB)/8, $0x53269AEF8CF94530 -DATA ·rcon+0x90(SB)/8, $0x009837AF6CF45BC3 // affine transform matrix m1 high -DATA ·rcon+0x98(SB)/8, $0xAB339C04C75FF068 -DATA ·rcon+0xa0(SB)/8, $0x616EF1FE050A959A // affine transform matrix m2 low -DATA ·rcon+0xa8(SB)/8, $0xF5FA656A919E010E -DATA ·rcon+0xb0(SB)/8, $0x00A4E044CD692D89 // affine transform matrix m2 high -DATA ·rcon+0xb8(SB)/8, $0xA50145E168CC882C +#define TMP0 V10 +#define TMP1 V11 +#define TMP2 V12 +#define TMP3 V13 -GLOBL ·rcon(SB), RODATA, $192 - -#ifdef GOARCH_ppc64le -#define NEEDS_PERMW - -#define PPC64X_LXVW4X(RA,RB,VT) \ - LXVW4X (RA+RB), VT \ - VPERM VT, VT, ESPERMW, VT - -#define PPC64X_STXVW4X(VS, RA, RB) 
\ - VPERM VS, VS, ESPERMW, VS \ - STXVW4X VS, (RA+RB) - -#else -#define PPC64X_LXVW4X(RA,RB,VT) LXVW4X (RA+RB), VT -#define PPC64X_STXVW4X(VS, RA, RB) STXVW4X VS, (RA+RB) -#endif // defined(GOARCH_ppc64le) - -// r = s <<< n -// Due to VSPLTISW's limitation, the n MUST be [0, 15], -// If n > 15, we have to call it multiple times. -// VSPLTISW takes a 5-bit immediate value as an operand. -// I also did NOT find one vector instruction to use immediate value for ROTL. -#define PROLD(s, r, tmp, n) \ - VSPLTISW $n, tmp \ - VRLW s, tmp, r - -// input: from high to low -// t0 = t0.S3, t0.S2, t0.S1, t0.S0 -// t1 = t1.S3, t1.S2, t1.S1, t1.S0 -// t2 = t2.S3, t2.S2, t2.S1, t2.S0 -// t3 = t3.S3, t3.S2, t3.S1, t3.S0 -// output: from high to low -// t0 = t3.S0, t2.S0, t1.S0, t0.S0 -// t1 = t3.S1, t2.S1, t1.S1, t0.S1 -// t2 = t3.S2, t2.S2, t1.S2, t0.S2 -// t3 = t3.S3, t2.S3, t1.S3, t0.S3 -#define PRE_TRANSPOSE_MATRIX(T0, T1, T2, T3) \ - VPERM T0, T1, M0, TMP0; \ - VPERM T2, T3, M0, TMP1; \ - VPERM T0, T1, M1, TMP2; \ - VPERM T2, T3, M1, TMP3; \ - VPERM TMP0, TMP1, M2, T0; \ - VPERM TMP0, TMP1, M3, T1; \ - VPERM TMP2, TMP3, M2, T2; \ - VPERM TMP2, TMP3, M3, T3 - -// input: from high to low -// t0 = t0.S3, t0.S2, t0.S1, t0.S0 -// t1 = t1.S3, t1.S2, t1.S1, t1.S0 -// t2 = t2.S3, t2.S2, t2.S1, t2.S0 -// t3 = t3.S3, t3.S2, t3.S1, t3.S0 -// output: from high to low -// t0 = t0.S0, t1.S0, t2.S0, t3.S0 -// t1 = t0.S1, t1.S1, t2.S1, t3.S1 -// t2 = t0.S2, t1.S2, t2.S2, t3.S2 -// t3 = t0.S3, t1.S3, t2.S3, t3.S3 -#define TRANSPOSE_MATRIX(T0, T1, T2, T3) \ - VPERM T1, T0, M0, TMP0; \ - VPERM T1, T0, M1, TMP1; \ - VPERM T3, T2, M0, TMP2; \ - VPERM T3, T2, M1, TMP3; \ - VPERM TMP2, TMP0, M2, T0; \ - VPERM TMP2, TMP0, M3, T1; \ - VPERM TMP3, TMP1, M2, T2; \ - VPERM TMP3, TMP1, M3, T3; \ - -// Affine Transform -// parameters: -// - L: table low nibbles -// - H: table high nibbles -// - x: 128 bits register as sbox input/output data -// - y: 128 bits temp register -// - z: 128 bits temp register -#define AFFINE_TRANSFORM(L, H, V_FOUR, x, y, z) \ - VAND NIBBLE_MASK, x, z; \ - VPERM L, L, z, y; \ - VSRD x, V_FOUR, x; \ - VAND NIBBLE_MASK, x, z; \ - VPERM H, H, z, x; \ - VXOR y, x, x - -// Affine Transform -// parameters: -// - L: table low nibbles -// - H: table high nibbles -// - x: 128 bits register as sbox input/output data -// - y: 128 bits temp register -// - z: 128 bits temp register -#define AFFINE_TRANSFORM_NOTX(L, H, V_FOUR, x, y, z) \ - VNOR x, x, z; \ // z = NOT(x) - VAND NIBBLE_MASK, z, z; \ - VPERM L, L, z, y; \ - VSRD x, V_FOUR, x; \ - VAND NIBBLE_MASK, x, z; \ - VPERM H, H, z, x; \ - VXOR y, x, x - -// SM4 sbox function -// parameters: -// - x: 128 bits register as sbox input/output data -// - y: 128 bits temp register -// - z: 128 bits temp register -#define SM4_SBOX(x, y, z) \ - AFFINE_TRANSFORM(M1L, M1H, V_FOUR, x, y, z); \ - VPERM x, x, INVERSE_SHIFT_ROWS, x; \ - VCIPHERLAST x, NIBBLE_MASK, x; \ - AFFINE_TRANSFORM_NOTX(M2L, M2H, V_FOUR, x, y, z) +#include "aesni_macros_ppc64x.s" #define SM4_TAO_L2(x, y, z) \ SM4_SBOX(x, y, z); \ @@ -174,45 +46,6 @@ GLOBL ·rcon(SB), RODATA, $192 VXOR x, t0, t0; \ VSLDOI $4, target, t0, target -// SM4 TAO L1 function -// parameters: -// - x: 128 bits register as TAO_L1 input/output data -// - tmp1: 128 bits temp register -// - tmp2: 128 bits temp register -// - tmp3: 128 bits temp register -#define SM4_TAO_L1(x, tmp1, tmp2, tmp3) \ - SM4_SBOX(x, tmp1, tmp2); \ - ; \ //#################### 4 parallel L1 linear transforms ##################// - VSPLTISW $8, tmp3; \ - VRLW x, 
tmp3, tmp1; \ // tmp1 = x <<< 8 - VRLW tmp1, tmp3, tmp2; \ // tmp2 = x <<< 16 - VXOR x, tmp1, tmp1; \ // tmp1 = x xor (x <<< 8) - VXOR tmp1, tmp2, tmp1; \ // tmp1 = x xor (x <<< 8) xor (x <<< 16) - VRLW tmp2, tmp3, tmp2; \ // tmp2 = x <<< 24 - VXOR tmp2, x, x; \ // x = x xor (x <<< 24) - VSPLTISW $2, tmp3; \ - VRLW tmp1, tmp3, tmp1; \ // tmp1 = (x xor (x <<< 8) xor (x <<< 16)) <<< 2 - VXOR tmp1, x, x - -// SM4 round function -// t0 ^= tao_l1(t1^t2^t3^xk) -// parameters: -// - RK: round key register -// - x: 128 bits temp register -// - tmp1: 128 bits temp register -// - tmp2: 128 bits temp register -// - tmp3: 128 bits temp register -// - t0: 128 bits register for data as result -// - t1: 128 bits register for data -// - t2: 128 bits register for data -// - t3: 128 bits register for data -#define SM4_ROUND(RK, x, tmp1, tmp2, tmp3, t0, t1, t2, t3) \ - VXOR RK, t1, x; \ - VXOR t2, x, x; \ - VXOR t3, x, x; \ - SM4_TAO_L1(x, tmp1, tmp2, tmp3); \ - VXOR x, t0, t0 - // func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int) TEXT ·expandKeyAsm(SB),NOSPLIT,$0 // prepare/load constants @@ -319,15 +152,8 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0 encryptBlockLoop: // load xk - LXVW4X (R3), V4 - SM4_ROUND(V4, V5, V6, V7, V8, V0, V1, V2, V3) - VSLDOI $4, V4, V4, V4 - SM4_ROUND(V4, V5, V6, V7, V8, V1, V2, V3, V0) - VSLDOI $4, V4, V4, V4 - SM4_ROUND(V4, V5, V6, V7, V8, V2, V3, V0, V1) - VSLDOI $4, V4, V4, V4 - SM4_ROUND(V4, V5, V6, V7, V8, V3, V0, V1, V2) - + LXVW4X (R3), V8 + PROCESS_SINGLEBLOCK_4ROUND ADD $16, R3 BDNZ encryptBlockLoop @@ -340,10 +166,6 @@ encryptBlockLoop: RET -#define TMP0 V10 -#define TMP1 V11 -#define TMP2 V12 -#define TMP3 V13 // func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int) TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0 // prepare/load constants @@ -352,28 +174,7 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0 MOVD $·rcon(SB), R4 LVX (R4), ESPERMW #endif - MOVD $·rcon+0x10(SB), R4 - LXVD2X (R4)(R0), M0 - MOVD $0x10, R3 - LXVD2X (R4)(R3), M1 - MOVD $0x20, R3 - LXVD2X (R4)(R3), M2 - MOVD $0x30, R3 - LXVD2X (R4)(R3), M3 - MOVD $0x40, R3 - LXVD2X (R4)(R3), REVERSE_WORDS - MOVD $0x50, R3 - LXVD2X (R4)(R3), NIBBLE_MASK - MOVD $0x60, R3 - LXVD2X (R4)(R3), INVERSE_SHIFT_ROWS - MOVD $0x70, R3 - LXVD2X (R4)(R3), M1L - MOVD $0x80, R3 - LXVD2X (R4)(R3), M1H - MOVD $0x90, R3 - LXVD2X (R4)(R3), M2L - MOVD $0xa0, R3 - LXVD2X (R4)(R3), M2H + LOAD_CONSTS(R4, R3) MOVD xk+0(FP), R3 MOVD dst+8(FP), R4 @@ -384,13 +185,13 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0 BEQ enc8blocks enc4blocks: - PPC64X_LXVW4X(R5, R0, V0) MOVD $16, R7 + MOVD $32, R8 + MOVD $48, R9 + PPC64X_LXVW4X(R5, R0, V0) PPC64X_LXVW4X(R5, R7, V1) - MOVD $32, R7 - PPC64X_LXVW4X(R5, R7, V2) - MOVD $48, R7 - PPC64X_LXVW4X(R5, R7, V3) + PPC64X_LXVW4X(R5, R8, V2) + PPC64X_LXVW4X(R5, R9, V3) PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3) // prepare counter MOVD $8, R7 @@ -398,44 +199,34 @@ enc4blocks: enc4blocksLoop: // load xk - LXVW4X (R3), V4 - VSPLTW $0, V4, V8 - SM4_ROUND(V8, TMP0, TMP1, TMP2, TMP3, V0, V1, V2, V3) - VSPLTW $1, V4, V8 - SM4_ROUND(V8, TMP0, TMP1, TMP2, TMP3, V1, V2, V3, V0) - VSPLTW $2, V4, V8 - SM4_ROUND(V8, TMP0, TMP1, TMP2, TMP3, V2, V3, V0, V1) - VSPLTW $3, V4, V8 - SM4_ROUND(V8, TMP0, TMP1, TMP2, TMP3, V3, V0, V1, V2) + LXVW4X (R3), V8 + PROCESS_4BLOCKS_4ROUND ADD $16, R3 BDNZ enc4blocksLoop TRANSPOSE_MATRIX(V0, V1, V2, V3) PPC64X_STXVW4X(V0, R4, R0) - MOVD $16, R7 PPC64X_STXVW4X(V1, R4, R7) - MOVD $32, R7 - PPC64X_STXVW4X(V2, R4, R7) - MOVD $48, R7 - PPC64X_STXVW4X(V3, R4, R7) + PPC64X_STXVW4X(V2, R4, R8) + 
PPC64X_STXVW4X(V3, R4, R9) RET enc8blocks: - PPC64X_LXVW4X(R5, R0, V0) MOVD $16, R7 + MOVD $32, R8 + MOVD $48, R9 + MOVD $64, R10 + MOVD $80, R11 + MOVD $96, R12 + MOVD $112, R14 + PPC64X_LXVW4X(R5, R0, V0) PPC64X_LXVW4X(R5, R7, V1) - MOVD $32, R7 - PPC64X_LXVW4X(R5, R7, V2) - MOVD $48, R7 - PPC64X_LXVW4X(R5, R7, V3) - MOVD $64, R7 - PPC64X_LXVW4X(R5, R7, V4) - MOVD $80, R7 - PPC64X_LXVW4X(R5, R7, V5) - MOVD $96, R7 - PPC64X_LXVW4X(R5, R7, V6) - MOVD $112, R7 - PPC64X_LXVW4X(R5, R7, V7) + PPC64X_LXVW4X(R5, R8, V2) + PPC64X_LXVW4X(R5, R9, V3) + PPC64X_LXVW4X(R5, R10, V4) + PPC64X_LXVW4X(R5, R11, V5) + PPC64X_LXVW4X(R5, R12, V6) + PPC64X_LXVW4X(R5, R14, V7) PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3) PRE_TRANSPOSE_MATRIX(V4, V5, V6, V7) // prepare counter @@ -444,41 +235,19 @@ enc8blocks: enc8blocksLoop: LXVW4X (R3), V8 - VSPLTW $0, V8, V9 - SM4_ROUND(V9, TMP0, TMP1, TMP2, TMP3, V0, V1, V2, V3) - SM4_ROUND(V9, TMP0, TMP1, TMP2, TMP3, V4, V5, V6, V7) - VSPLTW $1, V8, V9 - SM4_ROUND(V9, TMP0, TMP1, TMP2, TMP3, V1, V2, V3, V0) - SM4_ROUND(V9, TMP0, TMP1, TMP2, TMP3, V5, V6, V7, V4) - VSPLTW $2, V8, V9 - SM4_ROUND(V9, TMP0, TMP1, TMP2, TMP3, V2, V3, V0, V1) - SM4_ROUND(V9, TMP0, TMP1, TMP2, TMP3, V6, V7, V4, V5) - VSPLTW $3, V8, V9 - SM4_ROUND(V9, TMP0, TMP1, TMP2, TMP3, V3, V0, V1, V2) - SM4_ROUND(V9, TMP0, TMP1, TMP2, TMP3, V7, V4, V5, V6) + PROCESS_8BLOCKS_4ROUND ADD $16, R3 BDNZ enc8blocksLoop TRANSPOSE_MATRIX(V0, V1, V2, V3) TRANSPOSE_MATRIX(V4, V5, V6, V7) PPC64X_STXVW4X(V0, R4, R0) - MOVD $16, R7 PPC64X_STXVW4X(V1, R4, R7) - MOVD $32, R7 - PPC64X_STXVW4X(V2, R4, R7) - MOVD $48, R7 - PPC64X_STXVW4X(V3, R4, R7) - MOVD $64, R7 - PPC64X_STXVW4X(V4, R4, R7) - MOVD $80, R7 - PPC64X_STXVW4X(V5, R4, R7) - MOVD $96, R7 - PPC64X_STXVW4X(V6, R4, R7) - MOVD $112, R7 - PPC64X_STXVW4X(V7, R4, R7) + PPC64X_STXVW4X(V2, R4, R8) + PPC64X_STXVW4X(V3, R4, R9) + PPC64X_STXVW4X(V4, R4, R10) + PPC64X_STXVW4X(V5, R4, R11) + PPC64X_STXVW4X(V6, R4, R12) + PPC64X_STXVW4X(V7, R4, R14) RET -#undef TMP0 -#undef TMP1 -#undef TMP2 -#undef TMP3
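
For reference, below is a minimal scalar Go sketch (not part of the patch) of the per-32-bit-lane computation that the SM4_TAO_L1 and SM4_ROUND macros vectorize. The package name sm4ref is hypothetical, and the sbox parameter is assumed to be the standard 256-byte SM4 S-box table (not reproduced here); in the macros that byte substitution is performed by SM4_SBOX via the AES-instruction trick (AFFINE_TRANSFORM + VCIPHERLAST + AFFINE_TRANSFORM_NOTX).

package sm4ref

import "math/bits"

// tau applies the SM4 S-box to each byte of a 32-bit word; sbox is
// assumed to hold the standard SM4 S-box table.
func tau(x uint32, sbox *[256]byte) uint32 {
	return uint32(sbox[x>>24])<<24 |
		uint32(sbox[(x>>16)&0xff])<<16 |
		uint32(sbox[(x>>8)&0xff])<<8 |
		uint32(sbox[x&0xff])
}

// l1 is the encryption linear transform
// L(b) = b ^ (b<<<2) ^ (b<<<10) ^ (b<<<18) ^ (b<<<24).
// SM4_TAO_L1 evaluates it as b ^ (b<<<24) ^ ((b ^ (b<<<8) ^ (b<<<16)) <<< 2),
// which expands to the same five terms.
func l1(b uint32) uint32 {
	return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
		bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
}

// round is one SM4 round, t0 ^= L1(tau(t1 ^ t2 ^ t3 ^ rk)),
// i.e. SM4_ROUND restricted to a single word lane.
func round(rk, t0, t1, t2, t3 uint32, sbox *[256]byte) uint32 {
	return t0 ^ l1(tau(t1^t2^t3^rk, sbox))
}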