diff --git a/zuc/eia256_asm_arm64.s b/zuc/eia256_asm_arm64.s
index e24a1db..35e4afb 100644
--- a/zuc/eia256_asm_arm64.s
+++ b/zuc/eia256_asm_arm64.s
@@ -80,18 +80,15 @@ TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
 	VLD1	(BX), [XTMP1.B16, XTMP2.B16]
 	VST1	[XTMP2.B16], (BX) // Copy last 16 bytes of KS to the front
 	// TODO: Any better solution???
-	VMOV XTMP1.S[1], KS_L.S[0]
+	VDUP XTMP1.S[1], KS_L.S4
 	VMOV XTMP1.S[0], KS_L.S[1]
-	VMOV XTMP1.S[2], KS_L.S[2]
-	VMOV XTMP1.S[1], KS_L.S[3] // KS bits [63:32 31:0 95:64 63:32]
-	VMOV XTMP1.S[3], KS_M1.S[0]
+	VMOV XTMP1.S[2], KS_L.S[2] // KS bits [63:32 31:0 95:64 63:32]
+	VDUP XTMP1.S[3], KS_M1.S4
 	VMOV XTMP1.S[2], KS_M1.S[1]
-	VMOV XTMP2.S[0], KS_M1.S[2]
-	VMOV XTMP1.S[3], KS_M1.S[3] // KS bits [127:96 95:64 159:128 127:96]
-	VMOV XTMP2.S[1], KS_M2.S[0]
+	VMOV XTMP2.S[0], KS_M1.S[2] // KS bits [127:96 95:64 159:128 127:96]
+	VDUP XTMP2.S[1], KS_M2.S4
 	VMOV XTMP2.S[0], KS_M2.S[1]
-	VMOV XTMP2.S[2], KS_M2.S[2]
-	VMOV XTMP2.S[1], KS_M2.S[3] // KS bits [191:160 159:128 223:192 191:160]
+	VMOV XTMP2.S[2], KS_M2.S[2] // KS bits [191:160 159:128 223:192 191:160]
 
 	// setup DATA
 	VTBL SHUF_MASK_DW0_DW1.B16, [XTMP3.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s]
@@ -159,21 +156,17 @@ TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
 	VLD1	(BX), [XTMP1.B16, XTMP2.B16]
 	VST1	[XTMP2.B16], (BX) // Copy last 16 bytes of KS to the front
 	// TODO: Any better solution??? We can use VTBL, but there are no performance imprvoement if we can't reuse MASK constant
-	VMOV XTMP1.S[1], KS_L.S[0]
+	VDUP XTMP1.S[1], KS_L.S4
 	VMOV XTMP1.S[0], KS_L.S[1]
-	VMOV XTMP1.S[2], KS_L.S[2]
-	VMOV XTMP1.S[1], KS_L.S[3] // KS bits [63:32 31:0 95:64 63:32]
-	VMOV XTMP1.S[3], KS_M1.S[0]
+	VMOV XTMP1.S[2], KS_L.S[2] // KS bits [63:32 31:0 95:64 63:32]
+	VDUP XTMP1.S[3], KS_M1.S4
 	VMOV XTMP1.S[2], KS_M1.S[1]
-	VMOV XTMP2.S[0], KS_M1.S[2]
-	VMOV XTMP1.S[3], KS_M1.S[3] // KS bits [127:96 95:64 159:128 127:96]
-	VMOV XTMP2.S[1], KS_M2.S[0]
+	VMOV XTMP2.S[0], KS_M1.S[2] // KS bits [127:96 95:64 159:128 127:96]
+	VDUP XTMP2.S[1], KS_M2.S4
 	VMOV XTMP2.S[0], KS_M2.S[1]
-	VMOV XTMP2.S[2], KS_M2.S[2]
-	VMOV XTMP2.S[1], KS_M2.S[3] // KS bits [191:160 159:128 223:192 191:160]
-	VMOV XTMP2.S[3], KS_H.S[0]
+	VMOV XTMP2.S[2], KS_M2.S[2] // KS bits [191:160 159:128 223:192 191:160]
+	VDUP XTMP2.S[3], KS_H.S4
 	VMOV XTMP2.S[2], KS_H.S[1]
-	VMOV XTMP2.S[3], KS_H.S[2]
 	VMOV XTMP2.S[2], KS_H.S[3] // KS bits [255:224 223:192 255:224 223:192]
 
 	// setup DATA
diff --git a/zuc/eia_asm_arm64.s b/zuc/eia_asm_arm64.s
index 6680364..ea8826c 100644
--- a/zuc/eia_asm_arm64.s
+++ b/zuc/eia_asm_arm64.s
@@ -81,14 +81,12 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
 	VLD1	(BX), [XTMP1.B16, XTMP2.B16]
 	VST1	[XTMP2.B16], (BX) // Copy last 16 bytes of KS to the front
 	// TODO: Any better solution???
-	VMOV XTMP1.S[1], KS_L.S[0]
+	VDUP XTMP1.S[1], KS_L.S4
 	VMOV XTMP1.S[0], KS_L.S[1]
-	VMOV XTMP1.S[2], KS_L.S[2]
-	VMOV XTMP1.S[1], KS_L.S[3] // KS bits [63:32 31:0 95:64 63:32]
-	VMOV XTMP1.S[3], KS_M1.S[0]
+	VMOV XTMP1.S[2], KS_L.S[2] // KS bits [63:32 31:0 95:64 63:32]
+	VDUP XTMP1.S[3], KS_M1.S4
 	VMOV XTMP1.S[2], KS_M1.S[1]
-	VMOV XTMP2.S[0], KS_M1.S[2]
-	VMOV XTMP1.S[3], KS_M1.S[3] // KS bits [127:96 95:64 159:128 127:96]
+	VMOV XTMP2.S[0], KS_M1.S[2] // KS bits [127:96 95:64 159:128 127:96]
 
 	// setup DATA
 	VTBL SHUF_MASK_DW0_DW1.B16, [XTMP3.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s]
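
Note on the change: each replaced block builds a keystream register whose 32-bit lanes are arranged as [ks[i+1], ks[i], ks[i+2], ks[i+1]] (see the "KS bits" comments). The patch uses VDUP to broadcast ks[i+1] into all four lanes and then overwrites lanes 1 and 2 with VMOV, so each arrangement takes three instructions instead of four; KS_H in eia256RoundTag16 repeats [ks[7], ks[6]] in the same way. Below is a minimal Go sketch of that lane layout, for reference only: the names ks, laneLayout, and the index parameter are illustrative and do not appear in the patch.

package main

import "fmt"

// laneLayout mirrors one VDUP+VMOV triple from the patch (illustrative only):
// broadcast ks[i+1] into all four 32-bit lanes, then overwrite lanes 1 and 2,
// producing the [i+1, i, i+2, i+1] word order used by the MAC step.
func laneLayout(ks [8]uint32, i int) [4]uint32 {
	v := [4]uint32{ks[i+1], ks[i+1], ks[i+1], ks[i+1]} // VDUP XTMPx.S[i+1], KS_x.S4
	v[1] = ks[i]                                       // VMOV XTMPx.S[i], KS_x.S[1]
	v[2] = ks[i+2]                                     // VMOV XTMPx.S[i+2], KS_x.S[2]
	return v
}

func main() {
	ks := [8]uint32{0, 1, 2, 3, 4, 5, 6, 7} // eight 32-bit keystream words
	fmt.Println(laneLayout(ks, 0))          // KS_L:  [1 0 2 1] -> KS bits [63:32 31:0 95:64 63:32]
	fmt.Println(laneLayout(ks, 2))          // KS_M1: [3 2 4 3]
	fmt.Println(laneLayout(ks, 4))          // KS_M2: [5 4 6 5]
}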