diff --git a/.github/workflows/test_ppc64.yaml b/.github/workflows/test_ppc64.yaml
index 40e3b80..9f9eb98 100644
--- a/.github/workflows/test_ppc64.yaml
+++ b/.github/workflows/test_ppc64.yaml
@@ -47,3 +47,9 @@ jobs:
       env:
         GOARCH: ${{ matrix.arch }}
         GOPPC64: ${{ matrix.ppc64 }}
+
+    - name: Test Cipher
+      run: go test -v -short ./cipher/...
+      env:
+        GOARCH: ${{ matrix.arch }}
+        GOPPC64: ${{ matrix.ppc64 }}
diff --git a/sm4/ecb_cipher_asm.go b/sm4/ecb_cipher_asm.go
index 6cf5744..287cc27 100644
--- a/sm4/ecb_cipher_asm.go
+++ b/sm4/ecb_cipher_asm.go
@@ -1,4 +1,4 @@
-//go:build (amd64 || arm64) && !purego
+//go:build (amd64 || arm64 || ppc64 || ppc64le) && !purego
 
 package sm4
 
diff --git a/sm4/ecb_ppc64x.s b/sm4/ecb_ppc64x.s
new file mode 100644
index 0000000..68e208b
--- /dev/null
+++ b/sm4/ecb_ppc64x.s
@@ -0,0 +1,262 @@
+// Copyright 2024 Sun Yimin. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+//go:build (ppc64 || ppc64le) && !purego
+
+#include "textflag.h"
+
+DATA ·rcon+0x00(SB)/8, $0x0b0a09080f0e0d0c // byte swap per word
+DATA ·rcon+0x08(SB)/8, $0x0302010007060504
+DATA ·rcon+0x10(SB)/8, $0x0001020310111213 // Permute for transpose matrix
+DATA ·rcon+0x18(SB)/8, $0x0405060714151617
+DATA ·rcon+0x20(SB)/8, $0x08090a0b18191a1b
+DATA ·rcon+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f
+DATA ·rcon+0x30(SB)/8, $0x0001020304050607
+DATA ·rcon+0x38(SB)/8, $0x1011121314151617
+DATA ·rcon+0x40(SB)/8, $0x08090a0b0c0d0e0f
+DATA ·rcon+0x48(SB)/8, $0x18191a1b1c1d1e1f
+DATA ·rcon+0x50(SB)/8, $0x0c0d0e0f08090a0b // reverse words
+DATA ·rcon+0x58(SB)/8, $0x0405060700010203
+DATA ·rcon+0x60(SB)/8, $0x0F0F0F0F0F0F0F0F // nibble mask
+DATA ·rcon+0x68(SB)/8, $0x0F0F0F0F0F0F0F0F
+DATA ·rcon+0x70(SB)/8, $0x000D0A0704010E0B // inverse shift rows
+DATA ·rcon+0x78(SB)/8, $0x0805020F0C090603
+DATA ·rcon+0x80(SB)/8, $0x691CA0D5B6C37F0A // affine transform matrix m1 low
+DATA ·rcon+0x88(SB)/8, $0x53269AEF8CF94530
+DATA ·rcon+0x90(SB)/8, $0x009837AF6CF45BC3 // affine transform matrix m1 high
+DATA ·rcon+0x98(SB)/8, $0xAB339C04C75FF068
+DATA ·rcon+0xa0(SB)/8, $0x616EF1FE050A959A // affine transform matrix m2 low
+DATA ·rcon+0xa8(SB)/8, $0xF5FA656A919E010E
+DATA ·rcon+0xb0(SB)/8, $0x00A4E044CD692D89 // affine transform matrix m2 high
+DATA ·rcon+0xb8(SB)/8, $0xA50145E168CC882C
+GLOBL ·rcon(SB), RODATA, $192
+
+#define REVERSE_WORDS V19
+#define M1L V20
+#define M1H V21
+#define M2L V22
+#define M2H V23
+#define V_FOUR V24
+#define M0 V25
+#define M1 V26
+#define M2 V27
+#define M3 V28
+#define NIBBLE_MASK V29
+#define INVERSE_SHIFT_ROWS V30
+// For instruction emulation
+#define ESPERMW V31 // Endian swapping permute into BE
+
+#define TMP0 V10
+#define TMP1 V11
+#define TMP2 V12
+#define TMP3 V13
+
+#include "aesni_macros_ppc64x.s"
+
+// func encryptSm4Ecb(xk *uint32, dst, src []byte)
+TEXT ·encryptSm4Ecb(SB),NOSPLIT,$0
+#define dstPtr R3
+#define srcPtr R4
+#define rk R5
+#define srcLen R6
+
+	// prepare/load constants
+	VSPLTISW $4, V_FOUR;
+#ifdef NEEDS_PERMW
+	MOVD $·rcon(SB), R4
+	LVX (R4), ESPERMW
+#endif
+	MOVD $·rcon+0x10(SB), R4
+	LOAD_CONSTS(R4, R3)
+
+	MOVD xk+0(FP), rk
+	MOVD dst+8(FP), dstPtr
+	MOVD src+32(FP), srcPtr
+	MOVD src_len+40(FP), srcLen
+
+	CMP srcLen, $128
+	BLT block64
+
+preloop128:
+	SRD $7, srcLen, R7 // Set up loop counter
+	MOVD R7, CTR
+	MOVD $16, R7
+	MOVD $32, R8
+	MOVD $48, R10
+	MOVD $64, R11
+	MOVD $80, R12
+	MOVD $96, R14
+	MOVD $112, R15
+	ANDCC $127, srcLen, R9 // Check for tailing bytes for later
+	PCALIGN $16
+
+block128:
+	// Case for >= 128 bytes
+	PPC64X_LXVW4X(srcPtr, R0, V0)
+	PPC64X_LXVW4X(srcPtr, R7, V1)
+	PPC64X_LXVW4X(srcPtr, R8, V2)
+	PPC64X_LXVW4X(srcPtr, R10, V3)
+	PPC64X_LXVW4X(srcPtr, R11, V4)
+	PPC64X_LXVW4X(srcPtr, R12, V5)
+	PPC64X_LXVW4X(srcPtr, R14, V6)
+	PPC64X_LXVW4X(srcPtr, R15, V7)
+	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
+	PRE_TRANSPOSE_MATRIX(V4, V5, V6, V7)
+
+	LXVW4X (rk)(R0), V8
+	PROCESS_8BLOCKS_4ROUND
+	LXVW4X (rk)(R7), V8
+	PROCESS_8BLOCKS_4ROUND
+	LXVW4X (rk)(R8), V8
+	PROCESS_8BLOCKS_4ROUND
+	LXVW4X (rk)(R10), V8
+	PROCESS_8BLOCKS_4ROUND
+	LXVW4X (rk)(R11), V8
+	PROCESS_8BLOCKS_4ROUND
+	LXVW4X (rk)(R12), V8
+	PROCESS_8BLOCKS_4ROUND
+	LXVW4X (rk)(R14), V8
+	PROCESS_8BLOCKS_4ROUND
+	LXVW4X (rk)(R15), V8
+	PROCESS_8BLOCKS_4ROUND
+
+	TRANSPOSE_MATRIX(V0, V1, V2, V3)
+	TRANSPOSE_MATRIX(V4, V5, V6, V7)
+
+	PPC64X_STXVW4X(V0, dstPtr, R0)
+	PPC64X_STXVW4X(V1, dstPtr, R7)
+	PPC64X_STXVW4X(V2, dstPtr, R8)
+	PPC64X_STXVW4X(V3, dstPtr, R10)
+	PPC64X_STXVW4X(V4, dstPtr, R11)
+	PPC64X_STXVW4X(V5, dstPtr, R12)
+	PPC64X_STXVW4X(V6, dstPtr, R14)
+	PPC64X_STXVW4X(V7, dstPtr, R15)
+
+	ADD $128, srcPtr
+	ADD $128, dstPtr
+	BDNZ block128
+	BC 12,2,LR // BEQLR, fast return
+	MOVD R9, srcLen
+
+block64:
+	CMP srcLen, $64
+	BLT lessThan64
+	PPC64X_LXVW4X(srcPtr, R0, V0)
+	PPC64X_LXVW4X(srcPtr, R7, V1)
+	PPC64X_LXVW4X(srcPtr, R8, V2)
+	PPC64X_LXVW4X(srcPtr, R10, V3)
+	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
+	LXVW4X (rk)(R0), V8
+	PROCESS_4BLOCKS_4ROUND
+	LXVW4X (rk)(R7), V8
+	PROCESS_4BLOCKS_4ROUND
+	LXVW4X (rk)(R8), V8
+	PROCESS_4BLOCKS_4ROUND
+	LXVW4X (rk)(R10), V8
+	PROCESS_4BLOCKS_4ROUND
+	LXVW4X (rk)(R11), V8
+	PROCESS_4BLOCKS_4ROUND
+	LXVW4X (rk)(R12), V8
+	PROCESS_4BLOCKS_4ROUND
+	LXVW4X (rk)(R14), V8
+	PROCESS_4BLOCKS_4ROUND
+	LXVW4X (rk)(R15), V8
+	PROCESS_4BLOCKS_4ROUND
+	TRANSPOSE_MATRIX(V0, V1, V2, V3)
+	PPC64X_STXVW4X(V0, dstPtr, R0)
+	PPC64X_STXVW4X(V1, dstPtr, R7)
+	PPC64X_STXVW4X(V2, dstPtr, R8)
+	PPC64X_STXVW4X(V3, dstPtr, R10)
+	ADD $64, srcPtr
+	ADD $64, dstPtr
+	ADD $-64, srcLen
+
+lessThan64:
+	CMPU srcLen, $48, CR1
+	CMPU srcLen, $32, CR2
+	CMPU srcLen, $16, CR3
+	BEQ CR1, block48
+	BEQ CR2, block32
+	BEQ CR3, block16
+	RET
+
+block48:
+	PPC64X_LXVW4X(srcPtr, R0, V0)
+	PPC64X_LXVW4X(srcPtr, R7, V1)
+	PPC64X_LXVW4X(srcPtr, R8, V2)
+	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
+	LXVW4X (rk)(R0), V8
+	PROCESS_4BLOCKS_4ROUND
+	LXVW4X (rk)(R7), V8
+	PROCESS_4BLOCKS_4ROUND
+	LXVW4X (rk)(R8), V8
+	PROCESS_4BLOCKS_4ROUND
+	LXVW4X (rk)(R10), V8
+	PROCESS_4BLOCKS_4ROUND
+	LXVW4X (rk)(R11), V8
+	PROCESS_4BLOCKS_4ROUND
+	LXVW4X (rk)(R12), V8
+	PROCESS_4BLOCKS_4ROUND
+	LXVW4X (rk)(R14), V8
+	PROCESS_4BLOCKS_4ROUND
+	LXVW4X (rk)(R15), V8
+	PROCESS_4BLOCKS_4ROUND
+	TRANSPOSE_MATRIX(V0, V1, V2, V3)
+	PPC64X_STXVW4X(V0, dstPtr, R0)
+	PPC64X_STXVW4X(V1, dstPtr, R7)
+	PPC64X_STXVW4X(V2, dstPtr, R8)
+	RET
+
+block32:
+	PPC64X_LXVW4X(srcPtr, R0, V0)
+	PPC64X_LXVW4X(srcPtr, R7, V1)
+	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
+	LXVW4X (rk)(R0), V8
+	PROCESS_4BLOCKS_4ROUND
+	LXVW4X (rk)(R7), V8
+	PROCESS_4BLOCKS_4ROUND
+	LXVW4X (rk)(R8), V8
+	PROCESS_4BLOCKS_4ROUND
+	LXVW4X (rk)(R10), V8
+	PROCESS_4BLOCKS_4ROUND
+	LXVW4X (rk)(R11), V8
+	PROCESS_4BLOCKS_4ROUND
+	LXVW4X (rk)(R12), V8
+	PROCESS_4BLOCKS_4ROUND
+	LXVW4X (rk)(R14), V8
+	PROCESS_4BLOCKS_4ROUND
+	LXVW4X (rk)(R15), V8
+	PROCESS_4BLOCKS_4ROUND
+	TRANSPOSE_MATRIX(V0, V1, V2, V3)
+	PPC64X_STXVW4X(V0, dstPtr, R0)
+	PPC64X_STXVW4X(V1, dstPtr, R7)
+	RET
+
+block16:
+	PPC64X_LXVW4X(srcPtr, R0, V0)
+	VSLDOI $4, V0, V0, V1
+	VSLDOI $4, V1, V1, V2
+	VSLDOI $4, V2, V2, V3
+	LXVW4X (rk)(R0), V8
+	PROCESS_SINGLEBLOCK_4ROUND
+	LXVW4X (rk)(R7), V8
+	PROCESS_SINGLEBLOCK_4ROUND
+	LXVW4X (rk)(R8), V8
+	PROCESS_SINGLEBLOCK_4ROUND
+	LXVW4X (rk)(R10), V8
+	PROCESS_SINGLEBLOCK_4ROUND
+	LXVW4X (rk)(R11), V8
+	PROCESS_SINGLEBLOCK_4ROUND
+	LXVW4X (rk)(R12), V8
+	PROCESS_SINGLEBLOCK_4ROUND
+	LXVW4X (rk)(R14), V8
+	PROCESS_SINGLEBLOCK_4ROUND
+	LXVW4X (rk)(R15), V8
+	PROCESS_SINGLEBLOCK_4ROUND
+	VSLDOI $4, V3, V3, V3
+	VSLDOI $4, V3, V2, V2
+	VSLDOI $4, V2, V1, V1
+	VSLDOI $4, V1, V0, V0
+	PPC64X_STXVW4X(V0, dstPtr, R0)
+	RET
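
Side note, not part of the patch: the new encryptSm4Ecb routine only changes how ECB is computed, not what it computes; every 16-byte block is still encrypted independently under the same round keys. The plain-Go sketch below spells out that contract so it can be eyeballed against the vectorized paths (block128 handles 8 blocks per iteration, block64/block48/block32 handle 4 at a time, block16 handles a single block). It assumes the module import path github.com/emmansun/gmsm and its public sm4.NewCipher constructor; the helper name ecbEncryptReference is made up for illustration and is not part of the library.

// ecb_reference_sketch.go
package main

import (
	"crypto/rand"
	"fmt"

	"github.com/emmansun/gmsm/sm4"
)

// ecbEncryptReference encrypts src (whose length must be a multiple of the
// 16-byte SM4 block size) one block at a time, which is exactly the result
// the assembly fast path has to reproduce in wider batches.
func ecbEncryptReference(key, src []byte) ([]byte, error) {
	block, err := sm4.NewCipher(key) // returns a crypto/cipher.Block
	if err != nil {
		return nil, err
	}
	bs := block.BlockSize()
	dst := make([]byte, len(src))
	for i := 0; i < len(src); i += bs {
		block.Encrypt(dst[i:i+bs], src[i:i+bs])
	}
	return dst, nil
}

func main() {
	key := make([]byte, 16)
	src := make([]byte, 128) // 8 blocks: the width of one block128 iteration
	if _, err := rand.Read(key); err != nil {
		panic(err)
	}
	if _, err := rand.Read(src); err != nil {
		panic(err)
	}

	ct, err := ecbEncryptReference(key, src)
	if err != nil {
		panic(err)
	}
	fmt.Printf("SM4-ECB reference ciphertext: %x\n", ct)
}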