diff --git a/sm3/sm3blocks_s390x.go b/sm3/sm3blocks_s390x.go new file mode 100644 index 0000000..01db70b --- /dev/null +++ b/sm3/sm3blocks_s390x.go @@ -0,0 +1,10 @@ +// Copyright 2024 Sun Yimin. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +//go:build !purego + +package sm3 + +//go:noescape +func transposeMatrix(dig **[8]uint32) diff --git a/sm3/sm3blocks_s390x.s b/sm3/sm3blocks_s390x.s new file mode 100644 index 0000000..0755c83 --- /dev/null +++ b/sm3/sm3blocks_s390x.s @@ -0,0 +1,64 @@ +// Copyright 2024 Sun Yimin. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +//go:build !purego + +#include "textflag.h" + +DATA mask<>+0x00(SB)/8, $0x0001020310111213 +DATA mask<>+0x08(SB)/8, $0x0405060714151617 +DATA mask<>+0x10(SB)/8, $0x08090a0b18191a1b +DATA mask<>+0x18(SB)/8, $0x0c0d0e0f1c1d1e1f +DATA mask<>+0x20(SB)/8, $0x0001020304050607 +DATA mask<>+0x28(SB)/8, $0x1011121314151617 +DATA mask<>+0x30(SB)/8, $0x08090a0b0c0d0e0f +DATA mask<>+0x38(SB)/8, $0x18191a1b1c1d1e1f +GLOBL mask<>(SB), 8, $64 + +#define TRANSPOSE_MATRIX(T0, T1, T2, T3, M1, M2, M3, M4, TMP0, TMP1, TMP2, TMP3) \ + VPERM T0, T1, M0, TMP0; \ + VPERM T2, T3, M0, TMP1; \ + VPERM T0, T1, M1, TMP2; \ + VPERM T2, T3, M1, TMP3; \ + VPERM TMP0, TMP1, M2, T0; \ + VPERM TMP0, TMP1, M3, T1; \ + VPERM TMP2, TMP3, M2, T2; \ + VPERM TMP2, TMP3, M3, T3 + +// transposeMatrix(dig **[8]uint32) +TEXT ·transposeMatrix(SB),NOSPLIT,$0 + MOVD dig+0(FP), R1 + + MOVD (R1), R2 + VL 0(R2), V0 + VL 16(R2), V4 + MOVD 8(R1), R2 + VL 0(R2), V1 + VL 16(R2), V5 + MOVD 16(R1), R2 + VL 0(R2), V2 + VL 16(R2), V6 + MOVD 24(R1), R2 + VL 0(R2), V3 + VL 16(R2), V7 + + MOVD $mask<>+0x00(SB), R2 + VLM 0(R2), V8, V11 + + TRANSPOSE_MATRIX(V0, V1, V2, V3, V8, V9, V10, V11, V12, V13, V14, V15) + TRANSPOSE_MATRIX(V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15) + + MOVD (R1), R2 + VST V0, 0(R2) + VST V4, 16(R2) + MOVD 8(R1), R2 + VST V1, 0(R2) + VST V5, 16(R2) + MOVD 16(R1), R2 + VST V2, 0(R2) + VST V6, 16(R2) + MOVD 24(R1), R2 + VST V3, 0(R2) + VST V7, 16(R2) + RET diff --git a/sm3/sm3blocks_s390x_test.go b/sm3/sm3blocks_s390x_test.go new file mode 100644 index 0000000..eda869e --- /dev/null +++ b/sm3/sm3blocks_s390x_test.go @@ -0,0 +1,28 @@ +//go:build s390x && !purego + +package sm3 + +import ( + "fmt" + "testing" +) + +func TestTransposeMatrix(t *testing.T) { + var m [4][8]uint32 + for i := 0; i < 4; i++ { + for j := 0; j < 8; j++ { + m[i][j] = uint32(i*4 + j) + fmt.Printf("%04x ", m[i][j]) + } + fmt.Println() + } + input := [4]*[8]uint32{&m[0], &m[1], &m[2], &m[3]} + transposeMatrix(&input[0]) + for i := 0; i < 4; i++ { + for j := 0; j < 8; j++ { + m[i][j] = uint32(i*4 + j) + fmt.Printf("%04x ", m[i][j]) + } + fmt.Println() + } +}