diff --git a/sm3/kdf_amd64.go b/sm3/kdf_amd64.go
index 08af870..33d2a08 100644
--- a/sm3/kdf_amd64.go
+++ b/sm3/kdf_amd64.go
@@ -1,3 +1,7 @@
+// Copyright 2024 Sun Yimin. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
 //go:build !purego
 
 package sm3
diff --git a/sm3/kdf_arm64.go b/sm3/kdf_arm64.go
index 83c51c1..dfd2349 100644
--- a/sm3/kdf_arm64.go
+++ b/sm3/kdf_arm64.go
@@ -1,3 +1,7 @@
+// Copyright 2024 Sun Yimin. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
 //go:build !purego
 
 package sm3
diff --git a/sm3/kdf_generic.go b/sm3/kdf_generic.go
index 3f07bd5..6bad4b2 100644
--- a/sm3/kdf_generic.go
+++ b/sm3/kdf_generic.go
@@ -1,4 +1,8 @@
-//go:build purego || !(amd64 || arm64)
+// Copyright 2024 Sun Yimin. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+//go:build purego || !(amd64 || arm64 || s390x)
 
 package sm3
 
diff --git a/sm3/kdf_mult4_asm.go b/sm3/kdf_mult4_asm.go
index 1ede296..9edc15c 100644
--- a/sm3/kdf_mult4_asm.go
+++ b/sm3/kdf_mult4_asm.go
@@ -1,4 +1,4 @@
-//go:build (amd64 || arm64) && !purego
+//go:build (amd64 || arm64 || s390x) && !purego
 
 package sm3
 
diff --git a/sm3/kdf_mult8_amd64.go b/sm3/kdf_mult8_amd64.go
index c91e7ac..1e2fd49 100644
--- a/sm3/kdf_mult8_amd64.go
+++ b/sm3/kdf_mult8_amd64.go
@@ -1,3 +1,7 @@
+// Copyright 2024 Sun Yimin. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
 //go:build !purego
 
 package sm3
diff --git a/sm3/sm3blocks_s390x.go b/sm3/kdf_s390x.go
similarity index 54%
rename from sm3/sm3blocks_s390x.go
rename to sm3/kdf_s390x.go
index 1a61e36..088e51c 100644
--- a/sm3/sm3blocks_s390x.go
+++ b/sm3/kdf_s390x.go
@@ -6,8 +6,10 @@
 
 package sm3
 
-//go:noescape
-func transposeMatrix(dig **[8]uint32)
+func kdf(baseMD *digest, keyLen int, limit int) []byte {
+	if limit < 4 {
+		return kdfGeneric(baseMD, keyLen, limit)
+	}
 
-//go:noescape
-func copyResultsBy4(dig *uint32, p *byte)
+	return kdfBy4(baseMD, keyLen, limit)
+}
diff --git a/sm3/sm3blocks_arm64.s b/sm3/sm3blocks_arm64.s
index 1babae3..9647598 100644
--- a/sm3/sm3blocks_arm64.s
+++ b/sm3/sm3blocks_arm64.s
@@ -1,3 +1,7 @@
+// Copyright 2024 Sun Yimin. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
 //go:build !purego
 
 #include "textflag.h"
diff --git a/sm3/sm3blocks_avx2_amd64.s b/sm3/sm3blocks_avx2_amd64.s
index a6f3342..1ccb7b7 100644
--- a/sm3/sm3blocks_avx2_amd64.s
+++ b/sm3/sm3blocks_avx2_amd64.s
@@ -1,3 +1,7 @@
+// Copyright 2024 Sun Yimin. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
 //go:build !purego
 
 #include "textflag.h"
diff --git a/sm3/sm3blocks_avx2_test.go b/sm3/sm3blocks_avx2_test.go
index 0c6676d..1e52d17 100644
--- a/sm3/sm3blocks_avx2_test.go
+++ b/sm3/sm3blocks_avx2_test.go
@@ -1,3 +1,7 @@
+// Copyright 2024 Sun Yimin. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
 //go:build amd64 && !purego
 
 package sm3
diff --git a/sm3/sm3blocks_s390x.s b/sm3/sm3blocks_s390x.s
index e560774..9f459a8 100644
--- a/sm3/sm3blocks_s390x.s
+++ b/sm3/sm3blocks_s390x.s
@@ -17,6 +17,32 @@ DATA mask<>+0x30(SB)/8, $0x08090a0b0c0d0e0f
 DATA mask<>+0x38(SB)/8, $0x18191a1b1c1d1e1f
 GLOBL mask<>(SB), 8, $64
 
+#define a V0
+#define e V1
+#define b V2
+#define f V3
+#define c V4
+#define g V5
+#define d V6
+#define h V7
+#define M0 V8
+#define M1 V9
+#define M2 V10
+#define M3 V11
+#define TMP0 V12
+#define TMP1 V13
+#define TMP2 V14
+#define TMP3 V15
+#define TMP4 V16
+#define aSave V24
+#define bSave V25
+#define cSave V26
+#define dSave V27
+#define eSave V28
+#define fSave V29
+#define gSave V30
+#define hSave V31
+
 #define TRANSPOSE_MATRIX(T0, T1, T2, T3, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3) \
 	VPERM T0, T1, M0, TMP0; \
 	VPERM T2, T3, M0, TMP1; \
@@ -27,6 +53,115 @@ GLOBL mask<>(SB), 8, $64
 	VPERM TMP2, TMP3, M2, T2; \
 	VPERM TMP2, TMP3, M3, T3
 
+// r = s <<< n
+#define PROLD(s, r, n) \
+	VERLLF $n, s, r
+
+#define loadWordByIndex(W, start, i) \
+	VL $(4*i)(start), W
+
+// one word is 16 bytes
+#define prepare4Words \
+	VL 0(srcPtr1)(srcPtrPtr), V16; \
+	VL 0(srcPtr2)(srcPtrPtr), V17; \
+	VL 0(srcPtr3)(srcPtrPtr), V18; \
+	VL 0(srcPtr4)(srcPtrPtr), V19; \
+	TRANSPOSE_MATRIX(V16, V17, V18, V19, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3); \
+	VST V16, V19, (wordPtr); \
+	LAY 16(srcPtrPtr), srcPtrPtr; \
+	ADD $64, wordPtr
+
+#define LOAD_T(const, T) \
+	VREPIF $const, T
+
+#define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \
+	PROLD(a, TMP0, 12)               \
+	VLR TMP0, TMP1                   \
+	LOAD_T(const, TMP2)              \
+	VAF TMP2, TMP1, TMP1             \
+	VAF e, TMP1, TMP1                \
+	PROLD(TMP1, TMP2, 7)             \ // TMP2 = SS1
+	VX TMP2, TMP1, TMP0			     \ // TMP0 = SS2
+	VX a, b, TMP1                    \
+	VX c, TMP1, TMP1				 \
+	VAF TMP1, d, TMP1                \ // TMP1 = (a XOR b XOR c) + d
+	loadWordByIndex(TMP3, index)     \
+	loadWordByIndex(TMP4, index+4)   \
+	VX TMP3, TMP4, TMP4              \
+	VAF TMP4, TMP1, TMP1             \ // TMP1 = (a XOR b XOR c) + d + (Wt XOR Wt+4)
+	VAF TMP1, TMP0, TMP1			 \ // TMP1 = TT1
+	VAF h, TMP3, TMP3				 \
+	VAF TMP3, TMP2, TMP3			 \ // Wt + h + SS1
+	VX e, f, TMP4 				     \
+	VX g, TMP4, TMP4				 \
+	VAF TMP4, TMP3, TMP3			 \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
+	VLR b, TMP4 					 \
+	PROLD(TMP4, b, 9)                \ // b = b <<< 9
+	VLR TMP1, h					     \ // h = TT1
+	VLR f, TMP4 					 \
+	PROLD(TMP4, f, 19)               \ // f = f <<< 19
+	PROLD(TMP3, TMP4, 9)             \ // TMP4 = TT2 <<< 9
+	PROLD(TMP4, TMP0, 8)             \ // TMP0 = TT2 <<< 17
+	VX TMP3, TMP4, TMP4 			 \ // TMP4 = TT2 XOR (TT2 <<< 9)
+	VX TMP4, TMP0, d                 \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
+
+#define MESSAGE_SCHEDULE(index) \
+	loadWordByIndex(TMP0, index+1)    \ // Wj-3
+	PROLD(TMP0, TMP1, 15)             \
+	loadWordByIndex(TMP0, index-12)   \ // Wj-16
+	VX TMP0, TMP1, TMP0               \
+	loadWordByIndex(TMP1, index-5)    \ // Wj-9
+	VX TMP0, TMP1, TMP0               \
+	PROLD(TMP0, TMP1, 15)             \
+	PROLD(TMP1, TMP2, 8)              \
+	VX TMP1, TMP0, TMP0               \
+	VX TMP2, TMP0, TMP0               \ // P1
+	loadWordByIndex(TMP1, index-9)    \ // Wj-13
+	PROLD(TMP1, TMP2, 7)              \
+	VX TMP2, TMP0, TMP0               \
+	loadWordByIndex(TMP1, index-2)    \ // Wj-6
+	VX TMP1, TMP0, TMP1               \
+	VST TMP1, (wordPtr)               \
+	ADD $16, wordPtr                  \
+
+#define ROUND_12_15(index, const, a, b, c, d, e, f, g, h) \
+	MESSAGE_SCHEDULE(index)                               \
+	ROUND_00_11(index, const, a, b, c, d, e, f, g, h)     \
+
+#define ROUND_16_63(index, const, a, b, c, d, e, f, g, h) \
+	MESSAGE_SCHEDULE(index)          \ // V11 is Wt+4 now, Pls do not use it
+	PROLD(a, TMP0, 12)               \
+	VLR TMP0, TMP1                   \
+	LOAD_T(const, tmp1)              \
+	VAF tmp1, TMP0, TMP0             \
+	VAF e, TMP0, TMP0                \
+	PROLD(TMP0, TMP2, 7)             \ // V14 = SS1
+	VX TMP2, TMP1, TMP0              \ // TMP0 = SS2
+	VO a, b, TMP3                    \
+	VN a, b, TMP1                    \
+	VN c, TMP3, TMP3                 \
+	VO TMP1, TMP3, TMP1              \ // (a AND b) OR (a AND c) OR (b AND c)
+	VAF TMP1, d, TMP1                \ // (a AND b) OR (a AND c) OR (b AND c) + d
+	loadWordByIndex(TMP3, index)     \ // Wj
+	VX TMP3, TMP4, TMP4              \ // Wj XOR Wj+4
+	VAF TMP1, TMP4, TMP1             \ // (a AND b) OR (a AND c) OR (b AND c) + d + (Wt XOR Wt+4)
+	VAF TMP1, TMP0, TMP1             \ // TT1
+	VAF h, TMP3, TMP3                \ // Wt + h
+	VAF TMP2, TMP3, TMP3             \ // Wt + h + SS1
+	VX f, g, TMP4                    \
+	VN TMP4, e, TMP4                 \
+	VX g, TMP4, TMP4                 \ // (f XOR g) AND e XOR g
+	VAF TMP3, TMP4, TMP3             \ // TT2
+	VLR b, TMP4                      \
+	PROLD(TMP4, b, 9)                \ // b = b <<< 9
+	VLR TMP1, h                      \ // h = TT1
+	VLR f, TMP4                      \
+	PROLD(TMP4, f, 19)               \ // f = f <<< 19
+	PROLD(TMP3, TMP4, 9)             \ // TMP4 = TT2 <<< 9
+	PROLD(TMP4, TMP0, 8)             \ // TMP0 = TT2 <<< 17
+	VX TMP3, TMP4, TMP4              \ // TMP4 = TT2 XOR (TT2 <<< 9)
+	VX TMP4, TMP0, d                 \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
+
 // transposeMatrix(dig **[8]uint32)
 TEXT ·transposeMatrix(SB),NOSPLIT,$0
 	MOVD	dig+0(FP), R1
@@ -69,3 +204,155 @@ TEXT ·copyResultsBy4(SB),NOSPLIT,$0
 	VSTM V0, V7, (dstPtr)
 
 	RET
+
+
+// blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int)
+TEXT ·blockMultBy4(SB), NOSPLIT, $0
+#define digPtr R0
+#define srcPtrPtr R1
+#define statePtr R2
+#define blockCount R5
+#define srcPtr1 R6
+#define srcPtr2 R7
+#define srcPtr3 R8
+#define srcPtr4 R9
+#define wordPtr R10
+	MOVD	dig+0(FP), digPtr
+	MOVD	p+8(FP), srcPtrPtr
+	MOVD	buffer+16(FP), statePtr
+	MOVD	blocks+24(FP), blockCount
+
+	// load state
+	MOVD 0(digPtr), R4
+	VLM (R4), a, e
+	MOVD 8(digPtr), R4
+	VLM (R4), b, f
+	MOVD 16(digPtr), R4
+	VLM (R4), c, g
+	MOVD 24(digPtr), R4
+	VLM (R4), d, h
+
+	MOVD $mask<>+0x00(SB), R4
+	VLM (R4), M0, M3
+
+	TRANSPOSE_MATRIX(a, b, c, d, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3)
+	TRANSPOSE_MATRIX(e, f, g, h, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3)
+
+	MOVD (srcPtrPtr), srcPtr1
+	MOVD 8(srcPtrPtr), srcPtr2
+	MOVD 16(srcPtrPtr), srcPtr3
+	MOVD 24(srcPtrPtr), srcPtr4
+	MOVD $0, srcPtrPtr
+
+loop:
+	// save state
+	VLR a, aSave
+	VLR b, bSave
+	VLR c, cSave
+	VLR d, dSave
+	VLR e, eSave
+	VLR f, fSave
+	VLR g, gSave
+	VLR h, hSave
+
+	// reset wordPtr
+	MOVD statePtr, wordPtr
+
+	// load message block
+	prepare4Words
+	prepare4Words
+	prepare4Words
+	prepare4Words
+
+	ROUND_00_11(0, T0, a, b, c, d, e, f, g, h)
+	ROUND_00_11(1, T1, h, a, b, c, d, e, f, g)
+	ROUND_00_11(2, T2, g, h, a, b, c, d, e, f)
+	ROUND_00_11(3, T3, f, g, h, a, b, c, d, e)
+	ROUND_00_11(4, T4, e, f, g, h, a, b, c, d)
+	ROUND_00_11(5, T5, d, e, f, g, h, a, b, c)
+	ROUND_00_11(6, T6, c, d, e, f, g, h, a, b)
+	ROUND_00_11(7, T7, b, c, d, e, f, g, h, a)
+	ROUND_00_11(8, T8, a, b, c, d, e, f, g, h)
+	ROUND_00_11(9, T9, h, a, b, c, d, e, f, g)
+	ROUND_00_11(10, T10, g, h, a, b, c, d, e, f)
+	ROUND_00_11(11, T11, f, g, h, a, b, c, d, e)
+
+	ROUND_12_15(12, T12, e, f, g, h, a, b, c, d)
+	ROUND_12_15(13, T13, d, e, f, g, h, a, b, c)
+	ROUND_12_15(14, T14, c, d, e, f, g, h, a, b)
+	ROUND_12_15(15, T15, b, c, d, e, f, g, h, a)
+
+	ROUND_16_63(16, T16, a, b, c, d, e, f, g, h)
+	ROUND_16_63(17, T17, h, a, b, c, d, e, f, g)
+	ROUND_16_63(18, T18, g, h, a, b, c, d, e, f)
+	ROUND_16_63(19, T19, f, g, h, a, b, c, d, e)
+	ROUND_16_63(20, T20, e, f, g, h, a, b, c, d)
+	ROUND_16_63(21, T21, d, e, f, g, h, a, b, c)
+	ROUND_16_63(22, T22, c, d, e, f, g, h, a, b)
+	ROUND_16_63(23, T23, b, c, d, e, f, g, h, a)
+	ROUND_16_63(24, T24, a, b, c, d, e, f, g, h)
+	ROUND_16_63(25, T25, h, a, b, c, d, e, f, g)
+	ROUND_16_63(26, T26, g, h, a, b, c, d, e, f)
+	ROUND_16_63(27, T27, f, g, h, a, b, c, d, e)
+	ROUND_16_63(28, T28, e, f, g, h, a, b, c, d)
+	ROUND_16_63(29, T29, d, e, f, g, h, a, b, c)
+	ROUND_16_63(30, T30, c, d, e, f, g, h, a, b)
+	ROUND_16_63(31, T31, b, c, d, e, f, g, h, a)
+	ROUND_16_63(32, T32, a, b, c, d, e, f, g, h)
+	ROUND_16_63(33, T33, h, a, b, c, d, e, f, g)
+	ROUND_16_63(34, T34, g, h, a, b, c, d, e, f)
+	ROUND_16_63(35, T35, f, g, h, a, b, c, d, e)
+	ROUND_16_63(36, T36, e, f, g, h, a, b, c, d)
+	ROUND_16_63(37, T37, d, e, f, g, h, a, b, c)
+	ROUND_16_63(38, T38, c, d, e, f, g, h, a, b)
+	ROUND_16_63(39, T39, b, c, d, e, f, g, h, a)
+	ROUND_16_63(40, T40, a, b, c, d, e, f, g, h)
+	ROUND_16_63(41, T41, h, a, b, c, d, e, f, g)
+	ROUND_16_63(42, T42, g, h, a, b, c, d, e, f)
+	ROUND_16_63(43, T43, f, g, h, a, b, c, d, e)
+	ROUND_16_63(44, T44, e, f, g, h, a, b, c, d)
+	ROUND_16_63(45, T45, d, e, f, g, h, a, b, c)
+	ROUND_16_63(46, T46, c, d, e, f, g, h, a, b)
+	ROUND_16_63(47, T47, b, c, d, e, f, g, h, a)
+	ROUND_16_63(48, T16, a, b, c, d, e, f, g, h)
+	ROUND_16_63(49, T17, h, a, b, c, d, e, f, g)
+	ROUND_16_63(50, T18, g, h, a, b, c, d, e, f)
+	ROUND_16_63(51, T19, f, g, h, a, b, c, d, e)
+	ROUND_16_63(52, T20, e, f, g, h, a, b, c, d)
+	ROUND_16_63(53, T21, d, e, f, g, h, a, b, c)
+	ROUND_16_63(54, T22, c, d, e, f, g, h, a, b)
+	ROUND_16_63(55, T23, b, c, d, e, f, g, h, a)
+	ROUND_16_63(56, T24, a, b, c, d, e, f, g, h)
+	ROUND_16_63(57, T25, h, a, b, c, d, e, f, g)
+	ROUND_16_63(58, T26, g, h, a, b, c, d, e, f)
+	ROUND_16_63(59, T27, f, g, h, a, b, c, d, e)
+	ROUND_16_63(60, T28, e, f, g, h, a, b, c, d)
+	ROUND_16_63(61, T29, d, e, f, g, h, a, b, c)
+	ROUND_16_63(62, T30, c, d, e, f, g, h, a, b)
+	ROUND_16_63(63, T31, b, c, d, e, f, g, h, a)
+
+	VX a, aSave, a
+	VX b, bSave, b
+	VX c, cSave, c
+	VX d, dSave, d
+	VX e, eSave, e
+	VX f, fSave, f
+	VX g, gSave, g
+	VX h, hSave, h
+
+	SUB $1, blockCount
+	CMPBGT blockCount, $0, loop
+
+	TRANSPOSE_MATRIX(a, b, c, d, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3)
+	TRANSPOSE_MATRIX(e, f, g, h, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3)
+
+	MOVD 	0(digPtr), R4
+	VSTM a, e, (R4)
+	MOVD 	8(digPtr), R4
+	VSTM b, f, (R4)
+	MOVD 	16(digPtr), R4
+	VSTM c, g, (R4)
+	MOVD 	24(digPtr), R4
+	VSTM d, h, (R4)
+
+	RET
diff --git a/sm3/sm3blocks_s390x_test.go b/sm3/sm3blocks_s390x_test.go
deleted file mode 100644
index 1467ab3..0000000
--- a/sm3/sm3blocks_s390x_test.go
+++ /dev/null
@@ -1,54 +0,0 @@
-//go:build s390x && !purego
-
-package sm3
-
-import (
-	"fmt"
-	"testing"
-)
-
-func TestTransposeMatrix(t *testing.T) {
-	var m [4][8]uint32
-	var k uint32 = 0
-	for i := 0; i < 4; i++ {
-		for j := 0; j < 8; j++ {
-			m[i][j] = k
-			k++
-			fmt.Printf("%04x ", m[i][j])
-		}
-		fmt.Println()
-	}
-	input := [4]*[8]uint32{&m[0], &m[1], &m[2], &m[3]}
-	transposeMatrix(&input[0])
-	fmt.Println()
-	fmt.Println()
-	for i := 0; i < 4; i++ {
-		for j := 0; j < 8; j++ {
-			fmt.Printf("%04x ", m[i][j])
-		}
-		fmt.Println()
-	}
-}
-
-func TestCopyResultsBy4(t *testing.T) {
-	var m [4][8]uint32
-	var k uint32 = 0
-	for i := 0; i < 4; i++ {
-		for j := 0; j < 8; j++ {
-			m[i][j] = k << 24
-			k++
-			fmt.Printf("%04x ", m[i][j])
-		}
-		fmt.Println()
-	}
-	var p [128]byte
-	copyResultsBy4(&m[0][0], &p[0])
-	fmt.Println()
-	fmt.Println()
-	for i := 0; i < 128; i++ {
-		fmt.Printf("%02x ", p[i])
-		if i%16 == 15 {
-			fmt.Println()
-		}
-	}
-}
diff --git a/sm3/sm3blocks_simd_amd64.s b/sm3/sm3blocks_simd_amd64.s
index dd8339f..5090353 100644
--- a/sm3/sm3blocks_simd_amd64.s
+++ b/sm3/sm3blocks_simd_amd64.s
@@ -1,3 +1,7 @@
+// Copyright 2024 Sun Yimin. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
 //go:build !purego
 
 #include "textflag.h"
diff --git a/sm3/sm3blocks_test.go b/sm3/sm3blocks_test.go
index 4e97972..c1d8ee8 100644
--- a/sm3/sm3blocks_test.go
+++ b/sm3/sm3blocks_test.go
@@ -1,4 +1,8 @@
-//go:build (amd64 || arm64) && !purego
+// Copyright 2024 Sun Yimin. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+//go:build (amd64 || arm64 || s390x) && !purego
 
 package sm3