diff --git a/sm4/gcm_arm64.s b/sm4/gcm_arm64.s
index 45819e9..da505d9 100644
--- a/sm4/gcm_arm64.s
+++ b/sm4/gcm_arm64.s
@@ -157,6 +157,28 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
 #undef plen
 #undef dlen
 
+#define TRANSPOSE_MATRIX(r0, r1, r2, r3) \
+	VMOV t3.S[0], K0.S[0] \
+	VMOV t2.S[0], K0.S[1] \
+	VMOV t1.S[0], K0.S[2] \
+	VMOV t0.S[0], K0.S[3] \
+	VMOV t3.S[1], K1.S[0] \
+	VMOV t2.S[1], K1.S[1] \
+	VMOV t1.S[1], K1.S[2] \
+	VMOV t0.S[1], K1.S[3] \
+	VMOV t3.S[2], K2.S[0] \
+	VMOV t2.S[2], K2.S[1] \
+	VMOV t1.S[2], K2.S[2] \
+	VMOV t0.S[2], K2.S[3] \
+	VMOV t3.S[3], K3.S[0] \
+	VMOV t2.S[3], K3.S[1] \
+	VMOV t1.S[3], K3.S[2] \
+	VMOV t0.S[3], K3.S[3] \
+	VMOV K0, t0 \
+	VMOV K1, t1 \
+	VMOV K2, t2 \
+	VMOV K3, t3 \
+
 #define LOAD_SM4_AESNI_CONSTS() \
 	LDP nibble_mask<>(SB), (R20, R21) \
 	VMOV R20, NIBBLE_MASK.D[0] \
@@ -566,7 +588,7 @@ encOctetsEnc4Blocks1:
 	VREV32 B1.B16, B1.B16
 	VREV32 B2.B16, B2.B16
 	VREV32 B3.B16, B3.B16
-
+	TRANSPOSE_MATRIX(B0, B1, B2, B3)
 
 	// encryption first 4 blocks
 	MOVD rkSave, rk
@@ -583,6 +605,7 @@ encOctetsEnc4Blocks2:
 	VREV32 B5.B16, B5.B16
 	VREV32 B6.B16, B6.B16
 	VREV32 B7.B16, B7.B16
+	TRANSPOSE_MATRIX(B4, B5, B6, B7)
 
 	// XOR plaintext and store ciphertext
 	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
@@ -642,7 +665,7 @@ encNibblesLoop:
 	VREV32 B1.B16, B1.B16
 	VADD B2.S4, INC.S4, B3.S4
 	VREV32 B2.B16, B2.B16
-	VADD B3.S4, INC.S4, B4.S4
+	VADD B3.S4, INC.S4, CTR.S4
 	VREV32 B3.B16, B3.B16
 
 	// encryption first 4 blocks
@@ -662,6 +685,7 @@ encNibblesEnc4Blocks:
 	VREV32 B1.B16, B1.B16
 	VREV32 B2.B16, B2.B16
 	VREV32 B3.B16, B3.B16
+	TRANSPOSE_MATRIX(B0, B1, B2, B3)
 
 	// XOR plaintext and store ciphertext
 	VLD1.P 32(srcPtr), [K1.B16, K2.B16]
@@ -689,7 +713,7 @@ encStartSingles:
 	VREV32 B1.B16, B1.B16
 	VADD B2.S4, INC.S4, B3.S4
 	VREV32 B2.B16, B2.B16
-	VADD B3.S4, INC.S4, B4.S4
+	VADD B3.S4, INC.S4, CTR.S4
 	VREV32 B3.B16, B3.B16
 
 	// encryption first 4 blocks
@@ -709,6 +733,7 @@ encSinglesEnc4Blocks:
 	VREV32 B1.B16, B1.B16
 	VREV32 B2.B16, B2.B16
 	VREV32 B3.B16, B3.B16
+	TRANSPOSE_MATRIX(B0, B1, B2, B3)
 
 	VMOV B0.B16, K0.B16
 	CMP $16, srcPtrLen
@@ -867,6 +892,7 @@ decOctetsEnc4Blocks1:
 	VREV32 B1.B16, T2.B16
 	VREV32 B2.B16, B2.B16
 	VREV32 B3.B16, B3.B16
+	TRANSPOSE_MATRIX(B0, B1, B2, B3)
 
 	// encryption first 4 blocks
 	MOVD rkSave, rk
@@ -884,6 +910,7 @@ decOctetsEnc4Blocks2:
 	VREV32 B5.B16, B5.B16
 	VREV32 B6.B16, B6.B16
 	VREV32 B7.B16, B7.B16
+	TRANSPOSE_MATRIX(B4, B5, B6, B7)
 
 	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
 	VEOR B0.B16, T1.B16, T1.B16
@@ -944,7 +971,7 @@ decNibblesLoop:
 	VREV32 B1.B16, B1.B16
 	VADD B2.S4, INC.S4, B3.S4
 	VREV32 B2.B16, B2.B16
-	VADD B3.S4, INC.S4, B4.S4
+	VADD B3.S4, INC.S4, CTR.S4
 	VREV32 B3.B16, B3.B16
 
 	// encryption first 4 blocks
@@ -964,6 +991,7 @@ decNibblesEnc4Blocks:
 	VREV32 B1.B16, B1.B16
 	VREV32 B2.B16, B2.B16
 	VREV32 B3.B16, B3.B16
+	TRANSPOSE_MATRIX(B0, B1, B2, B3)
 
 	// XOR plaintext and store ciphertext
 	VLD1.P 32(srcPtr), [K1.B16, K2.B16]
@@ -994,7 +1022,7 @@ decStartSingles:
 	VREV32 B1.B16, B1.B16
 	VADD B2.S4, INC.S4, B3.S4
 	VREV32 B2.B16, B2.B16
-	VADD B3.S4, INC.S4, B4.S4
+	VADD B3.S4, INC.S4, CTR.S4
 	VREV32 B3.B16, B3.B16
 
 	// encryption first 4 blocks
@@ -1014,6 +1042,7 @@ decSinglesEnc4Blocks:
 	VREV32 B1.B16, B1.B16
 	VREV32 B2.B16, B2.B16
 	VREV32 B3.B16, B3.B16
+	TRANSPOSE_MATRIX(B0, B1, B2, B3)
 
 	VMOV B0.B16, K0.B16
 	CMP $16, srcPtrLen
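
Note: the TRANSPOSE_MATRIX macro added above operates on the fixed vector registers t0..t3, using K0..K3 as scratch; its r0..r3 parameters are not referenced in the body. As a minimal sketch of the word-level shuffle it performs, where output row i gathers word i of t3, t2, t1 and t0 in that order before the results are copied back into t0..t3, the Go function below applies the same permutation to four rows of four 32-bit words. The name transpose4x4 and the plain-Go layout are illustrative only; they are not part of the sm4 package.

package main

import "fmt"

// transpose4x4 mirrors the word-level effect of TRANSPOSE_MATRIX:
// k0 = {t3[0], t2[0], t1[0], t0[0]}, k1 = {t3[1], t2[1], t1[1], t0[1]},
// and so on, after which the results are written back into t0..t3.
// Illustrative helper, not part of the sm4 package.
func transpose4x4(t0, t1, t2, t3 *[4]uint32) {
	k0 := [4]uint32{t3[0], t2[0], t1[0], t0[0]}
	k1 := [4]uint32{t3[1], t2[1], t1[1], t0[1]}
	k2 := [4]uint32{t3[2], t2[2], t1[2], t0[2]}
	k3 := [4]uint32{t3[3], t2[3], t1[3], t0[3]}
	*t0, *t1, *t2, *t3 = k0, k1, k2, k3
}

func main() {
	// Four 128-bit blocks, each viewed as four 32-bit words.
	a := [4]uint32{0, 1, 2, 3}
	b := [4]uint32{4, 5, 6, 7}
	c := [4]uint32{8, 9, 10, 11}
	d := [4]uint32{12, 13, 14, 15}
	transpose4x4(&a, &b, &c, &d)
	fmt.Println(a, b, c, d) // [12 8 4 0] [13 9 5 1] [14 10 6 2] [15 11 7 3]
}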