diff --git a/README.md b/README.md
index 2b4b7b3..b951364 100644
--- a/README.md
+++ b/README.md
@@ -11,9 +11,9 @@
 ## Packages
 * **SM2** - This is a SM2 sm2p256v1 implementation whose performance is similar like golang native NIST P256 under **amd64** and **arm64**, for implementation detail, please refer [SM2实现细节](https://github.com/emmansun/gmsm/wiki/SM2%E6%80%A7%E8%83%BD%E4%BC%98%E5%8C%96). It supports ShangMi sm2 digital signature, public key encryption algorithm and also key exchange.
 
-* **SM3** - This is also a SM3 implementation whose performance is similar like golang native SHA 256 with SIMD under **amd64**, for implementation detail, please refer [SM3性能优化](https://github.com/emmansun/gmsm/wiki/SM3%E6%80%A7%E8%83%BD%E4%BC%98%E5%8C%96). It also provides A64 cryptographic instructions SM3 POC without test.
+* **SM3** - This is also an SM3 implementation whose performance is similar to that of golang native SHA 256 with SIMD under **amd64**; for implementation details, please refer to [SM3性能优化](https://github.com/emmansun/gmsm/wiki/SM3%E6%80%A7%E8%83%BD%E4%BC%98%E5%8C%96). It also provides an SM3 implementation based on the A64 cryptographic instructions, tested with QEMU.
 
-* **SM4** - For SM4 implementation, SIMD & AES-NI are used under **amd64** and **arm64**, for detail please refer [SM4性能优化](https://github.com/emmansun/gmsm/wiki/SM4%E6%80%A7%E8%83%BD%E4%BC%98%E5%8C%96), it supports ECB/CBC/CFB/OFB/CTR/GCM/CCM/XTS modes. It also provides A64 cryptographic instructions SM4 POC without test.
+* **SM4** - For the SM4 implementation, SIMD & AES-NI are used under **amd64** and **arm64**; for details, please refer to [SM4性能优化](https://github.com/emmansun/gmsm/wiki/SM4%E6%80%A7%E8%83%BD%E4%BC%98%E5%8C%96). It supports the ECB/CBC/CFB/OFB/CTR/GCM/CCM/XTS modes. It also provides an SM4 implementation based on the A64 cryptographic instructions, tested with QEMU.
 
 * **SM9** - For SM9 implementation, please reference [sm9/bn256 README.md](https://github.com/emmansun/gmsm/tree/main/sm9/bn256).
 
diff --git a/sm4/aesni_arm64.h b/sm4/aesni_arm64.h
index c1ad80a..d8a05b1 100644
--- a/sm4/aesni_arm64.h
+++ b/sm4/aesni_arm64.h
@@ -144,10 +144,9 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
 	VTBL R08_MASK.B16, [x.B16], y.B16;                   \
 	VEOR y.B16, x.B16, y.B16;                            \
 	VTBL R16_MASK.B16, [x.B16], z.B16;                   \
-	VEOR z.B16, y.B16, y.B16;                            \
-	VSHL $2, y.S4, z.S4;                                 \
-	VUSHR $30, y.S4, y.S4;                               \
-	VORR y.B16, z.B16, y.B16;                            \
+	VEOR z.B16, y.B16, z.B16;                            \
+	VSHL $2, z.S4, y.S4;                                 \
+	VSRI $30, z.S4, y.S4;                                \
 	VTBL R24_MASK.B16, [x.B16], z.B16;                   \
 	VEOR z.B16, x.B16, x.B16;                            \
 	VEOR y.B16, x.B16, x.B16
diff --git a/sm4/asm_arm64.s b/sm4/asm_arm64.s
index 6f06cc5..62bded0 100644
--- a/sm4/asm_arm64.s
+++ b/sm4/asm_arm64.s
@@ -28,13 +28,11 @@
 #define SM4_TAO_L2(x, y)         \
 	SM4_SBOX(x, y, XTMP6);                      \
 	;                                           \ //####################  4 parallel L2 linear transforms ##################//
-	VSHL $13, x.S4, XTMP6.S4;                   \
-	VUSHR $19, x.S4, y.S4;                      \
-	VORR XTMP6.B16, y.B16, y.B16;               \
+	VSHL $13, x.S4, y.S4;                       \
+	VSRI $19, x.S4, y.S4;                       \
 	VSHL $23, x.S4, XTMP6.S4;                   \
-	VUSHR $9, x.S4, XTMP7.S4;                   \
-	VORR XTMP6.B16, XTMP7.B16, XTMP7.B16;       \
-	VEOR XTMP7.B16, y.B16, y.B16;               \
+	VSRI $9, x.S4, XTMP6.S4;                    \
+	VEOR XTMP6.B16, y.B16, y.B16;               \
 	VEOR x.B16, y.B16, x.B16
 
 #define SM4_EXPANDKEY_ROUND(x, y, t0, t1, t2, t3) \