diff --git a/zuc/README.md b/zuc/README.md
index e82b00d..41d4d11 100644
--- a/zuc/README.md
+++ b/zuc/README.md
@@ -63,3 +63,8 @@ func (s *zucState32) f32(x0, x1, x2 uint32) uint32 {
     cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
     BenchmarkHash1K-6   	  317750	      3833 ns/op	 267.13 MB/s
     BenchmarkHash8K-6   	   40460	     28921 ns/op	 283.26 MB/s
+    BenchmarkHash1K_Tag64-6   	  302163	      3979 ns/op	 257.34 MB/s
+    BenchmarkHash8K_Tag64-6   	   39210	     30859 ns/op	 265.46 MB/s
+    BenchmarkHash1K_Tag128-6   	  279069	      4134 ns/op	 247.70 MB/s
+    BenchmarkHash8K_Tag128-6   	   38238	     31395 ns/op	 260.93 MB/s
+
diff --git a/zuc/eia256.go b/zuc/eia256.go
index fe55b40..123af21 100644
--- a/zuc/eia256.go
+++ b/zuc/eia256.go
@@ -83,7 +83,7 @@ func (m *ZUC256Mac) Reset() {
 	m.genKeywords(m.k0[:4])
 }
 
-func (m *ZUC256Mac) block(p []byte) {
+func block256Generic(m *ZUC256Mac, p []byte) {
 	var k64, t64 uint64
 	if m.tagSize == 4 {
 		t64 = uint64(m.t[0]) << 32
@@ -138,14 +138,14 @@ func (m *ZUC256Mac) Write(p []byte) (nn int, err error) {
 		n := copy(m.x[m.nx:], p)
 		m.nx += n
 		if m.nx == chunk {
-			m.block(m.x[:])
+			block256(m, m.x[:])
 			m.nx = 0
 		}
 		p = p[n:]
 	}
 	if len(p) >= chunk {
 		n := len(p) &^ (chunk - 1)
-		m.block(p[:n])
+		block256(m, p[:n])
 		p = p[n:]
 	}
 	if len(p) > 0 {
diff --git a/zuc/eia256_asm.go b/zuc/eia256_asm.go
new file mode 100644
index 0000000..09c3ede
--- /dev/null
+++ b/zuc/eia256_asm.go
@@ -0,0 +1,29 @@
+//go:build (amd64 && !generic)
+// +build amd64,!generic
+
+package zuc
+
+//go:noescape
+func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte) // amd64 assembly (eia256_asm_amd64.s): reads 16 bytes at p and 8 words at keyStream, XORs a 64-bit update into t, then copies keyStream words 4-7 over words 0-3
+
+//go:noescape
+func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte) // amd64 assembly (eia256_asm_amd64.s): reads 16 bytes at p and 8 words at keyStream, XORs a 128-bit update into t, then copies keyStream words 4-7 over words 0-3
+
+// block256 absorbs each whole chunk-sized block of p into m, using assembly when carry-less multiply is available.
+func block256(m *ZUC256Mac, p []byte) {
+	if !supportsGFMUL {
+		block256Generic(m, p)
+		return
+	}
+	for ; len(p) >= chunk; p = p[chunk:] {
+		m.genKeywords(m.k0[4:]) // presumably refills k0[4:]; the eia256 round routines end by copying words 4-7 over words 0-3
+		switch m.tagSize {
+		case 8:
+			eia256RoundTag8(&m.t[0], &m.k0[0], &p[0])
+		case 16:
+			eia256RoundTag16(&m.t[0], &m.k0[0], &p[0])
+		default:
+			eia3Round16B(&m.t[0], &m.k0[0], &p[0], m.tagSize)
+		}
+	}
+}
diff --git a/zuc/eia256_asm_amd64.s b/zuc/eia256_asm_amd64.s
new file mode 100644
index 0000000..e830d55
--- /dev/null
+++ b/zuc/eia256_asm_amd64.s
@@ -0,0 +1,448 @@
+// Referenced https://github.com/intel/intel-ipsec-mb/
+//go:build amd64 && !generic
+// +build amd64,!generic
+
+#include "textflag.h"
+
+DATA bit_reverse_table_l<>+0x00(SB)/8, $0x0e060a020c040800
+DATA bit_reverse_table_l<>+0x08(SB)/8, $0x0f070b030d050901
+GLOBL bit_reverse_table_l<>(SB), RODATA, $16
+
+DATA bit_reverse_table_h<>+0x00(SB)/8, $0xe060a020c0408000
+DATA bit_reverse_table_h<>+0x08(SB)/8, $0xf070b030d0509010
+GLOBL bit_reverse_table_h<>(SB), RODATA, $16
+
+DATA bit_reverse_and_table<>+0x00(SB)/8, $0x0f0f0f0f0f0f0f0f
+DATA bit_reverse_and_table<>+0x08(SB)/8, $0x0f0f0f0f0f0f0f0f
+GLOBL bit_reverse_and_table<>(SB), RODATA, $16
+
+DATA shuf_mask_dw0_0_dw1_0<>+0x00(SB)/8, $0xffffffff03020100
+DATA shuf_mask_dw0_0_dw1_0<>+0x08(SB)/8, $0xffffffff07060504
+GLOBL shuf_mask_dw0_0_dw1_0<>(SB), RODATA, $16
+
+DATA shuf_mask_0_0_dw1_0<>+0x00(SB)/8, $0xffffffffffffffff
+DATA shuf_mask_0_0_dw1_0<>+0x08(SB)/8, $0xffffffff07060504
+GLOBL shuf_mask_0_0_dw1_0<>(SB), RODATA, $16
+
+DATA shuf_mask_0_0_0_dw1<>+0x00(SB)/8, $0xffffffffffffffff
+DATA shuf_mask_0_0_0_dw1<>+0x08(SB)/8, $0x07060504ffffffff
+GLOBL shuf_mask_0_0_0_dw1<>(SB), RODATA, $16
+
+DATA shuf_mask_dw2_0_dw3_0<>+0x00(SB)/8, $0xffffffff0b0a0908
+DATA shuf_mask_dw2_0_dw3_0<>+0x08(SB)/8, $0xffffffff0f0e0d0c
+GLOBL shuf_mask_dw2_0_dw3_0<>(SB), RODATA, $16
+
+DATA bits_32_63<>+0x00(SB)/8, $0xffffffff00000000
+DATA bits_32_63<>+0x08(SB)/8, $0x0000000000000000
+GLOBL bits_32_63<>(SB), RODATA, $16
+
+
+#define XTMP1 X1
+#define XTMP2 X2
+#define XTMP3 X3
+#define XTMP4 X4
+#define XTMP5 X5
+#define XTMP6 X6
+#define XDATA X7
+#define XDIGEST X8
+#define KS_L X9
+#define KS_M1 X10
+#define KS_M2 X11
+#define KS_H X12
+
+// eia256RoundTag8 reads the 16 bytes at p and 8 keystream words at keyStream, XORs a
+// 64-bit update into the tag at t, and copies keystream words 4-7 over words 0-3.
+// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte)
+TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
+	MOVQ t+0(FP), AX
+	MOVQ keyStream+8(FP), BX
+	MOVQ p+16(FP), CX
+
+	CMPB ·useAVX(SB), $1
+	JE   avx
+
+	// Reverse the bit order within each data byte (two nibble table lookups)
+	MOVUPS (0)(CX), XDATA
+	MOVOU bit_reverse_and_table<>(SB), XTMP4
+	MOVOU XDATA, XTMP2
+	PAND  XTMP4, XTMP2
+
+	PANDN XDATA, XTMP4
+	PSRLQ $4, XTMP4
+
+	MOVOU bit_reverse_table_h<>(SB), XTMP3
+	PSHUFB XTMP2, XTMP3
+
+	MOVOU bit_reverse_table_l<>(SB), XTMP1
+	PSHUFB XTMP4, XTMP1
+
+	PXOR XTMP1, XTMP3  // XTMP3 - bit reverse data bytes
+
+	// ZUC authentication part, 4x32 data bits
+	// setup KS
+	MOVUPS (0*4)(BX), XTMP1
+	MOVUPS (2*4)(BX), XTMP2
+	MOVUPS (4*4)(BX), XTMP4
+	PSHUFD $0x61, XTMP1, KS_L  // KS bits [63:32 31:0 95:64 63:32]
+	PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]
+	PSHUFD $0x61, XTMP4, KS_M2 // KS bits [191:160 159:128 223:192 191:160]
+
+	// setup DATA
+	MOVOU XTMP3, XTMP1
+	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
+	MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]
+
+	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP3
+	MOVOU XTMP3, XDIGEST // XDIGEST/XTMP3 - Data bits [95:64 0s 127:96 0s]
+
+	// clmul
+	// xor the results from 4 32-bit words together
+	// Save data for following products (PCLMULQDQ overwrites its destination)
+	MOVOU XTMP2, XTMP5 //  Data bits [31:0 0s 63:32 0s]
+	MOVOU XTMP3, XTMP6 //  Data bits [95:64 0s 127:96 0s]
+
+	// Calculate lower 32 bits of tag
+	PCLMULQDQ $0x00, KS_L, XTMP1
+	PCLMULQDQ $0x11, KS_L, XTMP2
+	PCLMULQDQ $0x00, KS_M1, XDIGEST
+	PCLMULQDQ $0x11, KS_M1, XTMP3
+
+	// XOR all products and move bits 63-32 to the lower 32 bits
+	PXOR XTMP1, XTMP2
+	PXOR XTMP3, XDIGEST
+	PXOR XTMP2, XDIGEST
+	MOVQ XDIGEST, XDIGEST // Clear top 64 bits
+	PSRLDQ $4, XDIGEST
+
+	// Prepare data and calculate bits 63-32 of tag
+	MOVOU XTMP5, XTMP1
+	MOVOU XTMP5, XTMP2
+	MOVOU XTMP6, XTMP3
+	MOVOU XTMP6, XTMP4
+
+	PCLMULQDQ $0x10, KS_L, XTMP1
+	PCLMULQDQ $0x01, KS_M1, XTMP2
+	PCLMULQDQ $0x10, KS_M1, XTMP3
+	PCLMULQDQ $0x01, KS_M2, XTMP4
+
+	// XOR all the products and keep only bits 63-32
+	PXOR XTMP2, XTMP1
+	PXOR XTMP4, XTMP3
+	PXOR XTMP3, XTMP1
+	PAND bits_32_63<>(SB), XTMP1
+
+	// OR with lower 32 bits, to construct 64 bits of tag
+	POR XTMP1, XDIGEST
+
+	// Update tag: XOR the 64-bit result into the tag in memory
+	MOVQ XDIGEST, R10
+	XORQ R10, (AX)
+
+	// Copy last 16 bytes of KS to the front
+	MOVUPS (4*4)(BX), XTMP1
+	MOVUPS XTMP1, (0*4)(BX)
+
+	RET
+
+avx:
+	VMOVDQU (0)(CX), XDATA
+
+	// Reverse the bit order within each data byte (two nibble table lookups)
+	VMOVDQU bit_reverse_and_table<>(SB), XTMP1
+	VPAND XTMP1, XDATA, XTMP2
+	VPANDN XDATA, XTMP1, XTMP3
+	VPSRLD $4, XTMP3, XTMP3
+
+	VMOVDQU bit_reverse_table_h<>(SB), XTMP1
+	VPSHUFB XTMP2, XTMP1, XTMP4
+	VMOVDQU bit_reverse_table_l<>(SB), XTMP1
+	VPSHUFB XTMP3, XTMP1, XTMP1
+	VPOR XTMP1, XTMP4, XTMP4
+
+	// ZUC authentication part, 4x32 data bits
+	// setup KS
+	VPSHUFD $0x61, (0*4)(BX), KS_L  // KS bits [63:32 31:0 95:64 63:32]
+	VPSHUFD $0x61, (2*4)(BX), KS_M1  // KS bits [127:96 95:64 159:128 127:96]
+	VPSHUFD $0x61, (4*4)(BX), KS_M2  // KS bits [191:160 159:128 223:192 191:160]
+
+	// setup DATA
+	// Data bytes [31:0 0s 63:32 0s]
+	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP4, XTMP1
+	// Data bytes [95:64 0s 127:96 0s]
+	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP4, XTMP2
+
+	// clmul: xor the results from 4 32-bit words together
+	// Calculate lower 32 bits of tag
+	VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
+	VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
+	VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
+	VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6
+
+	VPXOR XTMP3, XTMP4, XTMP3
+	VPXOR XTMP5, XTMP6, XTMP5
+	VPXOR XTMP3, XTMP5, XTMP3
+
+	// Move bits 63-32 of the result down to the low 32 bits of the digest
+	VMOVQ XTMP3, XTMP3  // Clear top 64 bits
+	VPSRLDQ $4, XTMP3, XDIGEST
+
+	VPCLMULQDQ $0x10, KS_L, XTMP1, XTMP3
+	VPCLMULQDQ $0x01, KS_M1, XTMP1, XTMP4
+	VPCLMULQDQ $0x10, KS_M1, XTMP2, XTMP5
+	VPCLMULQDQ $0x01, KS_M2, XTMP2, XTMP6
+
+	// XOR all the products and keep only 32-63 bits
+	VPXOR XTMP4, XTMP3, XTMP3
+	VPXOR XTMP6, XTMP5, XTMP5
+	VPXOR XTMP5, XTMP3, XTMP3
+	VPAND bits_32_63<>(SB), XTMP3, XTMP3
+
+	// Merge into bits 32-63 of the digest (zero after the shift above)
+	VPXOR XTMP3, XDIGEST, XDIGEST
+
+	// Update tag: XOR the 64-bit result into the tag in memory
+	VMOVQ XDIGEST, R10
+	XORQ R10, (AX)
+
+	// Copy last 16 bytes of KS to the front
+	VMOVDQU (4*4)(BX), XTMP1
+	VMOVDQU XTMP1, (0*4)(BX)
+
+	VZEROUPPER
+	RET
+
+// eia256RoundTag16 reads the 16 bytes at p and 8 keystream words at keyStream, XORs a
+// 128-bit update into the tag at t, and copies keystream words 4-7 over words 0-3.
+// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte)
+TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
+	MOVQ t+0(FP), AX
+	MOVQ keyStream+8(FP), BX
+	MOVQ p+16(FP), CX
+
+	CMPB ·useAVX(SB), $1
+	JE   avx
+
+	// Reverse the bit order within each data byte (two nibble table lookups)
+	MOVUPS (0)(CX), XDATA
+	MOVOU bit_reverse_and_table<>(SB), XTMP4
+	MOVOU XDATA, XTMP2
+	PAND  XTMP4, XTMP2
+
+	PANDN XDATA, XTMP4
+	PSRLQ $4, XTMP4
+
+	MOVOU bit_reverse_table_h<>(SB), XTMP3
+	PSHUFB XTMP2, XTMP3
+
+	MOVOU bit_reverse_table_l<>(SB), XTMP1
+	PSHUFB XTMP4, XTMP1
+
+	PXOR XTMP1, XTMP3  // XTMP3 - bit reverse data bytes
+
+	// ZUC authentication part, 4x32 data bits
+	// setup KS
+	MOVUPS (0*4)(BX), XTMP1
+	MOVUPS (2*4)(BX), XTMP2
+	MOVUPS (4*4)(BX), XTMP4
+	PSHUFD $0x61, XTMP1, KS_L  // KS bits [63:32 31:0 95:64 63:32]
+	PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]
+	PSHUFD $0x61, XTMP4, KS_M2 // KS bits [191:160 159:128 223:192 191:160]
+	PSHUFD $0xBB, XTMP4, KS_H // KS bits [255:224 223:192 255:224 223:192]
+
+	// setup DATA
+	MOVOU XTMP3, XTMP1
+	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
+	MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]
+
+	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP3
+	MOVOU XTMP3, XDIGEST // XDIGEST/XTMP3 - Data bits [95:64 0s 127:96 0s]
+
+	// clmul
+	// xor the results from 4 32-bit words together
+	// Save data for following products (PCLMULQDQ overwrites its destination)
+	MOVOU XTMP2, XTMP5 //  Data bits [31:0 0s 63:32 0s]
+	MOVOU XTMP3, XTMP6 //  Data bits [95:64 0s 127:96 0s]
+
+	// Calculate lower 32 bits of tag
+	PCLMULQDQ $0x00, KS_L, XTMP1
+	PCLMULQDQ $0x11, KS_L, XTMP2
+	PCLMULQDQ $0x00, KS_M1, XDIGEST
+	PCLMULQDQ $0x11, KS_M1, XTMP3
+
+	// XOR all products and move bits 63-32 to the lower 32 bits
+	PXOR XTMP1, XTMP2
+	PXOR XTMP3, XDIGEST
+	PXOR XTMP2, XDIGEST
+	MOVQ XDIGEST, XDIGEST // Clear top 64 bits
+	PSRLDQ $4, XDIGEST
+
+	// Prepare data and calculate bits 63-32 of tag
+	MOVOU XTMP5, XTMP1
+	MOVOU XTMP5, XTMP2
+	MOVOU XTMP6, XTMP3
+	MOVOU XTMP6, XTMP4
+
+	PCLMULQDQ $0x10, KS_L, XTMP1
+	PCLMULQDQ $0x01, KS_M1, XTMP2
+	PCLMULQDQ $0x10, KS_M1, XTMP3
+	PCLMULQDQ $0x01, KS_M2, XTMP4
+
+	// XOR all the products and keep only bits 63-32
+	PXOR XTMP2, XTMP1
+	PXOR XTMP4, XTMP3
+	PXOR XTMP3, XTMP1
+	PAND bits_32_63<>(SB), XTMP1
+
+	// OR with lower 32 bits, to construct 64 bits of tag
+	POR XTMP1, XDIGEST
+
+	// Prepare data and calculate bits 95-64 of tag
+	MOVOU XTMP5, XTMP1
+	MOVOU XTMP5, XTMP2
+	MOVOU XTMP6, XTMP3
+	MOVOU XTMP6, XTMP4
+
+	PCLMULQDQ $0x00, KS_M1, XTMP1
+	PCLMULQDQ $0x11, KS_M1, XTMP2
+	PCLMULQDQ $0x00, KS_M2, XTMP3
+	PCLMULQDQ $0x11, KS_M2, XTMP4
+
+	// XOR all the products and move bits 63-32 to bits 95-64
+	PXOR XTMP2, XTMP1
+	PXOR XTMP4, XTMP3
+	PXOR XTMP3, XTMP1
+	PSHUFB shuf_mask_0_0_dw1_0<>(SB), XTMP1
+
+	// OR with lower 64 bits, to construct 96 bits of tag
+	POR XTMP1, XDIGEST
+
+	// Prepare data and calculate bits 127-96 of tag
+	MOVOU XTMP5, XTMP1
+	MOVOU XTMP5, XTMP2
+	MOVOU XTMP6, XTMP3
+	MOVOU XTMP6, XTMP4
+
+	PCLMULQDQ $0x10, KS_M1, XTMP1
+	PCLMULQDQ $0x01, KS_M2, XTMP2
+	PCLMULQDQ $0x10, KS_M2, XTMP3
+	PCLMULQDQ $0x01, KS_H, XTMP4
+
+	// XOR all the products and move bits 63-32 to bits 127-96
+	PXOR XTMP2, XTMP1
+	PXOR XTMP4, XTMP3
+	PXOR XTMP3, XTMP1
+	PSHUFB shuf_mask_0_0_0_dw1<>(SB), XTMP1
+
+	// OR with lower 96 bits, to construct 128 bits of tag
+	POR XTMP1, XDIGEST
+
+	// Update tag: XOR the 128-bit result into the tag in memory (unaligned access)
+	MOVUPS (AX), XTMP1
+	PXOR XTMP1, XDIGEST
+	MOVUPS XDIGEST, (AX)
+
+	// Copy last 16 bytes of KS to the front
+	MOVUPS (4*4)(BX), XTMP1
+	MOVUPS XTMP1, (0*4)(BX)
+
+	RET
+
+avx:
+	VMOVDQU (0)(CX), XDATA
+
+	// Reverse the bit order within each data byte (two nibble table lookups)
+	VMOVDQU bit_reverse_and_table<>(SB), XTMP1
+	VPAND XTMP1, XDATA, XTMP2
+	VPANDN XDATA, XTMP1, XTMP3
+	VPSRLD $4, XTMP3, XTMP3
+
+	VMOVDQU bit_reverse_table_h<>(SB), XTMP1
+	VPSHUFB XTMP2, XTMP1, XTMP4
+	VMOVDQU bit_reverse_table_l<>(SB), XTMP1
+	VPSHUFB XTMP3, XTMP1, XTMP1
+	VPOR XTMP1, XTMP4, XTMP4
+
+	// ZUC authentication part, 4x32 data bits
+	// setup KS
+	VPSHUFD $0x61, (0*4)(BX), KS_L  // KS bits [63:32 31:0 95:64 63:32]
+	VPSHUFD $0x61, (2*4)(BX), KS_M1  // KS bits [127:96 95:64 159:128 127:96]
+	VPSHUFD $0x61, (4*4)(BX), KS_M2  // KS bits [191:160 159:128 223:192 191:160]
+	VPSHUFD $0xBB, (4*4)(BX), KS_H  // KS bits [255:224 223:192 255:224 223:192]
+
+	// setup DATA
+	// Data bytes [31:0 0s 63:32 0s]
+	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP4, XTMP1
+	// Data bytes [95:64 0s 127:96 0s]
+	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP4, XTMP2
+
+	// clmul: xor the results from 4 32-bit words together
+	// Calculate lower 32 bits of tag
+	VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
+	VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
+	VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
+	VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6
+
+	VPXOR XTMP3, XTMP4, XTMP3
+	VPXOR XTMP5, XTMP6, XTMP5
+	VPXOR XTMP3, XTMP5, XTMP3
+
+	// Move bits 63-32 of the result down to the low 32 bits of the digest
+	VMOVQ XTMP3, XTMP3  // Clear top 64 bits
+	VPSRLDQ $4, XTMP3, XDIGEST
+
+	VPCLMULQDQ $0x10, KS_L, XTMP1, XTMP3
+	VPCLMULQDQ $0x01, KS_M1, XTMP1, XTMP4
+	VPCLMULQDQ $0x10, KS_M1, XTMP2, XTMP5
+	VPCLMULQDQ $0x01, KS_M2, XTMP2, XTMP6
+
+	// XOR all the products and keep only 32-63 bits
+	VPXOR XTMP4, XTMP3, XTMP3
+	VPXOR XTMP6, XTMP5, XTMP5
+	VPXOR XTMP5, XTMP3, XTMP3
+	VPAND bits_32_63<>(SB), XTMP3, XTMP3
+
+	// Merge into bits 32-63 of the digest (zero after the shift above)
+	VPXOR XTMP3, XDIGEST, XDIGEST
+
+	// Prepare data and calculate bits 95-64 of tag
+	VPCLMULQDQ $0x00, KS_M1, XTMP1, XTMP3
+	VPCLMULQDQ $0x11, KS_M1, XTMP1, XTMP4
+	VPCLMULQDQ $0x00, KS_M2, XTMP2, XTMP5
+	VPCLMULQDQ $0x11, KS_M2, XTMP2, XTMP6
+
+	// XOR all the products and move bits 63-32 to bits 95-64
+	VPXOR XTMP4, XTMP3, XTMP3
+	VPXOR XTMP6, XTMP5, XTMP5
+	VPXOR XTMP5, XTMP3, XTMP3
+
+	VPSHUFB shuf_mask_0_0_dw1_0<>(SB), XTMP3, XTMP3
+
+	// Merge into bits 64-95 of the digest
+	VPXOR XTMP3, XDIGEST, XDIGEST
+
+	// Prepare data and calculate bits 127-96 of tag
+	VPCLMULQDQ $0x10, KS_M1, XTMP1, XTMP3
+	VPCLMULQDQ $0x01, KS_M2, XTMP1, XTMP4
+	VPCLMULQDQ $0x10, KS_M2, XTMP2, XTMP5
+	VPCLMULQDQ $0x01, KS_H, XTMP2, XTMP6
+
+	// XOR all the products and move bits 63-32 to bits 127-96
+	VPXOR XTMP4, XTMP3, XTMP3
+	VPXOR XTMP6, XTMP5, XTMP5
+	VPXOR XTMP5, XTMP3, XTMP3
+
+	VPSHUFB shuf_mask_0_0_0_dw1<>(SB), XTMP3, XTMP3
+
+	// Merge into bits 96-127 of the digest
+	VPXOR XTMP3, XDIGEST, XDIGEST
+
+	// Update tag; unaligned store because t is not guaranteed to be 16-byte aligned
+	VPXOR (AX), XDIGEST, XDIGEST
+	VMOVDQU XDIGEST, (AX)
+
+	// Copy last 16 bytes of KS to the front
+	VMOVDQU (4*4)(BX), XTMP1
+	VMOVDQU XTMP1, (0*4)(BX)
+
+	VZEROUPPER
+	RET
diff --git a/zuc/eia256_generic.go b/zuc/eia256_generic.go
new file mode 100644
index 0000000..66598f1
--- /dev/null
+++ b/zuc/eia256_generic.go
@@ -0,0 +1,8 @@
+//go:build !amd64 || generic
+// +build !amd64 generic
+
+package zuc
+
+func block256(m *ZUC256Mac, p []byte) {
+	block256Generic(m, p) // this file builds for !amd64 or the "generic" tag, so the pure Go implementation is the only path
+}
diff --git a/zuc/eia_asm.go b/zuc/eia_asm.go
index e084f5f..f6ad22a 100644
--- a/zuc/eia_asm.go
+++ b/zuc/eia_asm.go
@@ -6,7 +6,6 @@ package zuc
 import "golang.org/x/sys/cpu"
 
 var supportsGFMUL = cpu.X86.HasPCLMULQDQ || cpu.ARM64.HasPMULL
-var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
 
 //go:noescape
 func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
diff --git a/zuc/eia_asm_amd64.s b/zuc/eia_asm_amd64.s
index 10433fc..9f500b4 100644
--- a/zuc/eia_asm_amd64.s
+++ b/zuc/eia_asm_amd64.s
@@ -44,8 +44,8 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
 	MOVQ p+16(FP), CX
 	MOVQ tagSize+24(FP), DX
 
-	CMPB ·useAVX2(SB), $1
-	JE   avx2
+	CMPB ·useAVX(SB), $1
+	JE   avx
 
 	// Reverse data bytes
 	MOVUPS (0)(CX), XDATA
@@ -103,7 +103,7 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
 
 	RET
 
-avx2:
+avx:
 	VMOVDQU (0)(CX), XDATA
 
 	// Reverse data bytes
diff --git a/zuc/eia_test.go b/zuc/eia_test.go
index b7d9d5b..f512656 100644
--- a/zuc/eia_test.go
+++ b/zuc/eia_test.go
@@ -150,3 +150,57 @@ func Test_Finish(t *testing.T) {
 		}
 	}
 }
+
+// TestNewHash checks that NewHash rejects a 1-byte key and a 1-byte IV, and
+// that a hash built from a 16-byte key and IV reports the expected sizes.
+func TestNewHash(t *testing.T) {
+	key := make([]byte, 16)
+	iv := make([]byte, 16)
+	// Truncated key must be rejected.
+	if _, err := NewHash(key[:1], iv); err == nil {
+		t.Fatal("error is expected")
+	}
+	// Truncated IV must be rejected.
+	if _, err := NewHash(key, iv[:1]); err == nil {
+		t.Fatal("error is expected")
+	}
+
+	h, err := NewHash(key, iv)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if size := h.Size(); size != 4 {
+		t.Fatal("eia3 mac size should be 4 bytes")
+	}
+	if blockSize := h.BlockSize(); blockSize != 16 {
+		t.Fatal("current eia3 implementation's block size should be 16 bytes")
+	}
+}
+
+func TestSum(t *testing.T) {
+	expected := "6c2db416"
+	h, err := NewEIAHash(zucEIATests[1].key, zucEIATests[1].count, zucEIATests[1].bearer, zucEIATests[1].direction)
+	if err != nil {
+		t.Fatal(err)
+	}
+	_, err = h.Write([]byte("emmansun"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	_, err = h.Write([]byte("shangmi1"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	_, err = h.Write([]byte("emmansun shangmi"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	_, err = h.Write([]byte("emmansun shangmi 1234"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	mac := h.Sum(nil)
+	if hex.EncodeToString(mac) != expected {
+		t.Errorf("expected=%s, result=%s\n", expected, hex.EncodeToString(mac))
+	}
+}