zuc: refactoring

2025-10-14 07:10:45 +08:00 · 2024-11-11 17:53:57 +08:00 · 2024-11-11 17:53:57 +08:00 · 58ad15fde8
commit 58ad15fde8
parent 1f209d2317
8 changed files with 749 additions and 937 deletions
--- a/zuc/eia256_asm.go
+++ b/zuc/eia256_asm.go
@ -18,7 +18,7 @@ func block256(m *ZUC256Mac, p []byte) {
 			case 16:
 				eia256RoundTag16(&m.t[0], &m.k0[0], &p[0])
 			default:
-				eia3Round16B(&m.t[0], &m.k0[0], &p[0], m.tagSize)
+				eiaRoundTag4(&m.t[0], &m.k0[0], &p[0])
 			}
 			p = p[chunk:]
 		}
--- a/zuc/eia256_asm_amd64.s
+++ b/zuc/eia256_asm_amd64.s
@ -1,448 +0,0 @@
-// Referenced Intel(R) Multi-Buffer Crypto for IPsec
-// https://github.com/intel/intel-ipsec-mb/
-//go:build !purego
-
-#include "textflag.h"
-
-DATA bit_reverse_table_l<>+0x00(SB)/8, $0x0e060a020c040800
-DATA bit_reverse_table_l<>+0x08(SB)/8, $0x0f070b030d050901
-GLOBL bit_reverse_table_l<>(SB), RODATA, $16
-
-DATA bit_reverse_table_h<>+0x00(SB)/8, $0xe060a020c0408000
-DATA bit_reverse_table_h<>+0x08(SB)/8, $0xf070b030d0509010
-GLOBL bit_reverse_table_h<>(SB), RODATA, $16
-
-DATA bit_reverse_and_table<>+0x00(SB)/8, $0x0f0f0f0f0f0f0f0f
-DATA bit_reverse_and_table<>+0x08(SB)/8, $0x0f0f0f0f0f0f0f0f
-GLOBL bit_reverse_and_table<>(SB), RODATA, $16
-
-DATA shuf_mask_dw0_0_dw1_0<>+0x00(SB)/8, $0xffffffff03020100
-DATA shuf_mask_dw0_0_dw1_0<>+0x08(SB)/8, $0xffffffff07060504
-GLOBL shuf_mask_dw0_0_dw1_0<>(SB), RODATA, $16
-
-DATA shuf_mask_0_0_dw1_0<>+0x00(SB)/8, $0xffffffffffffffff
-DATA shuf_mask_0_0_dw1_0<>+0x08(SB)/8, $0xffffffff07060504
-GLOBL shuf_mask_0_0_dw1_0<>(SB), RODATA, $16
-
-DATA shuf_mask_0_0_0_dw1<>+0x00(SB)/8, $0xffffffffffffffff
-DATA shuf_mask_0_0_0_dw1<>+0x08(SB)/8, $0x07060504ffffffff
-GLOBL shuf_mask_0_0_0_dw1<>(SB), RODATA, $16
-
-DATA shuf_mask_dw2_0_dw3_0<>+0x00(SB)/8, $0xffffffff0b0a0908
-DATA shuf_mask_dw2_0_dw3_0<>+0x08(SB)/8, $0xffffffff0f0e0d0c
-GLOBL shuf_mask_dw2_0_dw3_0<>(SB), RODATA, $16
-
-DATA bits_32_63<>+0x00(SB)/8, $0xffffffff00000000
-DATA bits_32_63<>+0x08(SB)/8, $0x0000000000000000
-GLOBL bits_32_63<>(SB), RODATA, $16
-
-
-#define XTMP1 X1
-#define XTMP2 X2
-#define XTMP3 X3
-#define XTMP4 X4
-#define XTMP5 X5
-#define XTMP6 X6
-#define XDATA X7
-#define XDIGEST X8
-#define KS_L X9
-#define KS_M1 X10
-#define KS_M2 X11
-#define KS_H X12
-
-// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte)
-TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
-	MOVQ t+0(FP), AX
-	MOVQ ks+8(FP), BX
-	MOVQ p+16(FP), CX
-
-	CMPB ·useAVX(SB), $1
-	JE   avx
-
-	// Reverse data bytes
-	MOVUPS (0)(CX), XDATA
-	MOVOU bit_reverse_and_table<>(SB), XTMP4
-	MOVOU XDATA, XTMP2
-	PAND  XTMP4, XTMP2
-
-	PANDN XDATA, XTMP4
-	PSRLQ $4, XTMP4
-
-	MOVOU bit_reverse_table_h<>(SB), XTMP3
-	PSHUFB XTMP2, XTMP3
-
-	MOVOU bit_reverse_table_l<>(SB), XTMP1
-	PSHUFB XTMP4, XTMP1
-
-	PXOR XTMP1, XTMP3  // XTMP3 - bit reverse data bytes
-
-	// ZUC authentication part, 4x32 data bits
-	// setup KS
-	MOVUPS (0*4)(BX), XTMP1
-	MOVUPS (2*4)(BX), XTMP2
-	MOVUPS (4*4)(BX), XTMP4
-	PSHUFD $0x61, XTMP1, KS_L  // KS bits [63:32 31:0 95:64 63:32]
-	PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]
-	PSHUFD $0x61, XTMP4, KS_M2 // KS bits [191:160 159:128 223:192 191:160]
-
-	// setup DATA
-	MOVOU XTMP3, XTMP1
-	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
-	MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]
-
-	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP3
-	MOVOU XTMP3, XDIGEST // XDIGEST/XTMP3 - Data bits [95:64 0s 127:96 0s]
-
-	// clmul
-	// xor the results from 4 32-bit words together
-	// Save data for following products
-	MOVOU XTMP2, XTMP5 //  Data bits [31:0 0s 63:32 0s]
-	MOVOU XTMP3, XTMP6 //  Data bits [95:64 0s 127:96 0s]
-
-	// Calculate lower 32 bits of tag
-	PCLMULQDQ $0x00, KS_L, XTMP1
-	PCLMULQDQ $0x11, KS_L, XTMP2
-	PCLMULQDQ $0x00, KS_M1, XDIGEST
-	PCLMULQDQ $0x11, KS_M1, XTMP3
-
-	// XOR all products and move bits 63-32 bits to lower 32 bits
-	PXOR XTMP1, XTMP2
-	PXOR XTMP3, XDIGEST
-	PXOR XTMP2, XDIGEST
-	MOVQ XDIGEST, XDIGEST // Clear top 64 bits
-	PSRLDQ $4, XDIGEST
-
-	// Prepare data and calculate bits 63-32 of tag
-	MOVOU XTMP5, XTMP1
-	MOVOU XTMP5, XTMP2
-	MOVOU XTMP6, XTMP3
-	MOVOU XTMP6, XTMP4
-
-	PCLMULQDQ $0x10, KS_L, XTMP1
-	PCLMULQDQ $0x01, KS_M1, XTMP2
-	PCLMULQDQ $0x10, KS_M1, XTMP3
-	PCLMULQDQ $0x01, KS_M2, XTMP4
-
-	// XOR all the products and keep only bits 63-32
-	PXOR XTMP2, XTMP1
-	PXOR XTMP4, XTMP3
-	PXOR XTMP3, XTMP1
-	PAND bits_32_63<>(SB), XTMP1
-
-	// OR with lower 32 bits, to construct 64 bits of tag
-	POR XTMP1, XDIGEST
-
-	// Update tag
-	MOVQ XDIGEST, R10
-	XORQ R10, (AX)
-
-	// Copy last 16 bytes of KS to the front
-	MOVUPS (4*4)(BX), XTMP1
-	MOVUPS XTMP1, (0*4)(BX)
-
-	RET
-
-avx:
-	VMOVDQU (0)(CX), XDATA
-
-	// Reverse data bytes
-	VMOVDQU bit_reverse_and_table<>(SB), XTMP1 
-	VPAND XTMP1, XDATA, XTMP2
-	VPANDN XDATA, XTMP1, XTMP3
-	VPSRLD $4, XTMP3, XTMP3
-
-	VMOVDQU bit_reverse_table_h<>(SB), XTMP1
-	VPSHUFB XTMP2, XTMP1, XTMP4
-	VMOVDQU bit_reverse_table_l<>(SB), XTMP1
-	VPSHUFB XTMP3, XTMP1, XTMP1
-	VPOR XTMP1, XTMP4, XTMP4
-	
-	// ZUC authentication part, 4x32 data bits
-	// setup KS
-	VPSHUFD $0x61, (0*4)(BX), KS_L  // KS bits [63:32 31:0 95:64 63:32]
-	VPSHUFD $0x61, (2*4)(BX), KS_M1  // KS bits [63:32 31:0 95:64 63:32]
-	VPSHUFD $0x61, (4*4)(BX), KS_M2  // KS bits [191:160 159:128 223:192 191:160]
-
-	// setup DATA
-	// Data bytes [31:0 0s 63:32 0s]
-	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP4, XTMP1
-	// Data bytes [95:64 0s 127:96 0s]
-	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP4, XTMP2
-
-
-	// clmul
-	// xor the results from 4 32-bit words together
-	// Calculate lower 32 bits of tag
-	VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
-	VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
-	VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
-	VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6
-
-	VPXOR XTMP3, XTMP4, XTMP3
-	VPXOR XTMP5, XTMP6, XTMP5
-	VPXOR XTMP3, XTMP5, XTMP3
-
-	// Move previous result to low 32 bits and XOR with previous digest
-	VMOVQ XTMP3, XTMP3  // Clear top 64 bits
-	VPSRLDQ $4, XTMP3, XDIGEST
-
-	VPCLMULQDQ $0x10, KS_L, XTMP1, XTMP3
-	VPCLMULQDQ $0x01, KS_M1, XTMP1, XTMP4
-	VPCLMULQDQ $0x10, KS_M1, XTMP2, XTMP5
-	VPCLMULQDQ $0x01, KS_M2, XTMP2, XTMP6
-
-	// XOR all the products and keep only 32-63 bits
-	VPXOR XTMP4, XTMP3, XTMP3
-	VPXOR XTMP6, XTMP5, XTMP5
-	VPXOR XTMP5, XTMP3, XTMP3
-	VPAND bits_32_63<>(SB), XTMP3, XTMP3
-
-	// XOR with bits 32-63 of previous digest
-	VPXOR XTMP3, XDIGEST, XDIGEST
-
-	// Update tag
-	VMOVQ XDIGEST, R10
-	XORQ R10, (AX)
-
-	// Copy last 16 bytes of KS to the front
-	VMOVDQU (4*4)(BX), XTMP1
-	VMOVDQU XTMP1, (0*4)(BX)
-
-	VZEROUPPER
-	RET
-
-// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte)
-TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
-	MOVQ t+0(FP), AX
-	MOVQ ks+8(FP), BX
-	MOVQ p+16(FP), CX
-
-	CMPB ·useAVX(SB), $1
-	JE   avx
-
-	// Reverse data bytes
-	MOVUPS (0)(CX), XDATA
-	MOVOU bit_reverse_and_table<>(SB), XTMP4
-	MOVOU XDATA, XTMP2
-	PAND  XTMP4, XTMP2
-
-	PANDN XDATA, XTMP4
-	PSRLQ $4, XTMP4
-
-	MOVOU bit_reverse_table_h<>(SB), XTMP3
-	PSHUFB XTMP2, XTMP3
-
-	MOVOU bit_reverse_table_l<>(SB), XTMP1
-	PSHUFB XTMP4, XTMP1
-
-	PXOR XTMP1, XTMP3  // XTMP3 - bit reverse data bytes
-
-	// ZUC authentication part, 4x32 data bits
-	// setup KS
-	MOVUPS (0*4)(BX), XTMP1
-	MOVUPS (2*4)(BX), XTMP2
-	MOVUPS (4*4)(BX), XTMP4
-	PSHUFD $0x61, XTMP1, KS_L  // KS bits [63:32 31:0 95:64 63:32]
-	PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]
-	PSHUFD $0x61, XTMP4, KS_M2 // KS bits [191:160 159:128 223:192 191:160]
-	PSHUFD $0xBB, XTMP4, KS_H // KS bits [255:224 223:192 255:224 223:192]
-
-	// setup DATA
-	MOVOU XTMP3, XTMP1
-	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
-	MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]
-
-	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP3
-	MOVOU XTMP3, XDIGEST // XDIGEST/XTMP3 - Data bits [95:64 0s 127:96 0s]
-
-	// clmul
-	// xor the results from 4 32-bit words together
-	// Save data for following products
-	MOVOU XTMP2, XTMP5 //  Data bits [31:0 0s 63:32 0s]
-	MOVOU XTMP3, XTMP6 //  Data bits [95:64 0s 127:96 0s]
-
-	// Calculate lower 32 bits of tag
-	PCLMULQDQ $0x00, KS_L, XTMP1
-	PCLMULQDQ $0x11, KS_L, XTMP2
-	PCLMULQDQ $0x00, KS_M1, XDIGEST
-	PCLMULQDQ $0x11, KS_M1, XTMP3
-
-	// XOR all products and move bits 63-32 bits to lower 32 bits
-	PXOR XTMP1, XTMP2
-	PXOR XTMP3, XDIGEST
-	PXOR XTMP2, XDIGEST
-	MOVQ XDIGEST, XDIGEST // Clear top 64 bits
-	PSRLDQ $4, XDIGEST
-
-	// Prepare data and calculate bits 63-32 of tag
-	MOVOU XTMP5, XTMP1
-	MOVOU XTMP5, XTMP2
-	MOVOU XTMP6, XTMP3
-	MOVOU XTMP6, XTMP4
-
-	PCLMULQDQ $0x10, KS_L, XTMP1
-	PCLMULQDQ $0x01, KS_M1, XTMP2
-	PCLMULQDQ $0x10, KS_M1, XTMP3
-	PCLMULQDQ $0x01, KS_M2, XTMP4
-
-	// XOR all the products and keep only bits 63-32
-	PXOR XTMP2, XTMP1
-	PXOR XTMP4, XTMP3
-	PXOR XTMP3, XTMP1
-	PAND bits_32_63<>(SB), XTMP1
-
-	// OR with lower 32 bits, to construct 64 bits of tag
-	POR XTMP1, XDIGEST
-
-	// Prepare data and calculate bits 95-64 of tag
-	MOVOU XTMP5, XTMP1
-	MOVOU XTMP5, XTMP2
-	MOVOU XTMP6, XTMP3
-	MOVOU XTMP6, XTMP4
-
-	PCLMULQDQ $0x00, KS_M1, XTMP1
-	PCLMULQDQ $0x11, KS_M1, XTMP2
-	PCLMULQDQ $0x00, KS_M2, XTMP3
-	PCLMULQDQ $0x11, KS_M2, XTMP4
-
-	// XOR all the products and move bits 63-32 to bits 95-64
-	PXOR XTMP2, XTMP1
-	PXOR XTMP4, XTMP3
-	PXOR XTMP3, XTMP1
-	PSHUFB shuf_mask_0_0_dw1_0<>(SB), XTMP1
-
-	// OR with lower 64 bits, to construct 96 bits of tag
-	POR XTMP1, XDIGEST
-
-	// Prepare data and calculate bits 127-96 of tag
-	MOVOU XTMP5, XTMP1
-	MOVOU XTMP5, XTMP2
-	MOVOU XTMP6, XTMP3
-	MOVOU XTMP6, XTMP4
-
-	PCLMULQDQ $0x10, KS_M1, XTMP1
-	PCLMULQDQ $0x01, KS_M2, XTMP2
-	PCLMULQDQ $0x10, KS_M2, XTMP3
-	PCLMULQDQ $0x01, KS_H, XTMP4
-
-	// XOR all the products and move bits 63-32 to bits 127-96
-	PXOR XTMP2, XTMP1
-	PXOR XTMP4, XTMP3
-	PXOR XTMP3, XTMP1
-	PSHUFB shuf_mask_0_0_0_dw1<>(SB), XTMP1
-
-	// OR with lower 96 bits, to construct 128 bits of tag
-	POR XTMP1, XDIGEST
-
-	// Update tag
-	MOVUPS (AX), XTMP1
-	PXOR XTMP1, XDIGEST
-	MOVUPS XDIGEST, (AX)
-
-	// Copy last 16 bytes of KS to the front
-	MOVUPS (4*4)(BX), XTMP1
-	MOVUPS XTMP1, (0*4)(BX)
-
-	RET
-
-avx:
-	VMOVDQU (0)(CX), XDATA
-
-	// Reverse data bytes
-	VMOVDQU bit_reverse_and_table<>(SB), XTMP1 
-	VPAND XTMP1, XDATA, XTMP2
-	VPANDN XDATA, XTMP1, XTMP3
-	VPSRLD $4, XTMP3, XTMP3
-
-	VMOVDQU bit_reverse_table_h<>(SB), XTMP1
-	VPSHUFB XTMP2, XTMP1, XTMP4
-	VMOVDQU bit_reverse_table_l<>(SB), XTMP1
-	VPSHUFB XTMP3, XTMP1, XTMP1
-	VPOR XTMP1, XTMP4, XTMP4
-	
-	// ZUC authentication part, 4x32 data bits
-	// setup KS
-	VPSHUFD $0x61, (0*4)(BX), KS_L  // KS bits [63:32 31:0 95:64 63:32]
-	VPSHUFD $0x61, (2*4)(BX), KS_M1  // KS bits [63:32 31:0 95:64 63:32]
-	VPSHUFD $0x61, (4*4)(BX), KS_M2  // KS bits [191:160 159:128 223:192 191:160]
-	VPSHUFD $0xBB, (4*4)(BX), KS_H  // KS bits [255:224 223:192 255:224 223:192]
-
-	// setup DATA
-	// Data bytes [31:0 0s 63:32 0s]
-	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP4, XTMP1
-	// Data bytes [95:64 0s 127:96 0s]
-	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP4, XTMP2
-
-
-	// clmul
-	// xor the results from 4 32-bit words together
-	// Calculate lower 32 bits of tag
-	VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
-	VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
-	VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
-	VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6
-
-	VPXOR XTMP3, XTMP4, XTMP3
-	VPXOR XTMP5, XTMP6, XTMP5
-	VPXOR XTMP3, XTMP5, XTMP3
-
-	// Move previous result to low 32 bits and XOR with previous digest
-	VMOVQ XTMP3, XTMP3  // Clear top 64 bits
-	VPSRLDQ $4, XTMP3, XDIGEST
-
-	VPCLMULQDQ $0x10, KS_L, XTMP1, XTMP3
-	VPCLMULQDQ $0x01, KS_M1, XTMP1, XTMP4
-	VPCLMULQDQ $0x10, KS_M1, XTMP2, XTMP5
-	VPCLMULQDQ $0x01, KS_M2, XTMP2, XTMP6
-
-	// XOR all the products and keep only 32-63 bits
-	VPXOR XTMP4, XTMP3, XTMP3
-	VPXOR XTMP6, XTMP5, XTMP5
-	VPXOR XTMP5, XTMP3, XTMP3
-	VPAND bits_32_63<>(SB), XTMP3, XTMP3
-
-	// XOR with bits 32-63 of previous digest
-	VPXOR XTMP3, XDIGEST, XDIGEST
-
-	// Prepare data and calculate bits 95-64 of tag
-	VPCLMULQDQ $0x00, KS_M1, XTMP1, XTMP3
-	VPCLMULQDQ $0x11, KS_M1, XTMP1, XTMP4
-	VPCLMULQDQ $0x00, KS_M2, XTMP2, XTMP5
-	VPCLMULQDQ $0x11, KS_M2, XTMP2, XTMP6
-
-	// XOR all the products and move bits 63-32 to bits 95-64
-	VPXOR XTMP4, XTMP3, XTMP3
-	VPXOR XTMP6, XTMP5, XTMP5
-	VPXOR XTMP5, XTMP3, XTMP3
-
-	VPSHUFB shuf_mask_0_0_dw1_0<>(SB), XTMP3, XTMP3
-
-	// XOR with previous bits 64-95 of previous digest
-	VPXOR XTMP3, XDIGEST, XDIGEST
-
-	// Prepare data and calculate bits 127-96 of tag
-	VPCLMULQDQ $0x10, KS_M1, XTMP1, XTMP3
-	VPCLMULQDQ $0x01, KS_M2, XTMP1, XTMP4
-	VPCLMULQDQ $0x10, KS_M2, XTMP2, XTMP5
-	VPCLMULQDQ $0x01, KS_H, XTMP2, XTMP6
-
-	// XOR all the products and move bits 63-32 to bits 127-96
-	VPXOR XTMP4, XTMP3, XTMP3
-	VPXOR XTMP6, XTMP5, XTMP5
-	VPXOR XTMP5, XTMP3, XTMP3
-
-	VPSHUFB shuf_mask_0_0_0_dw1<>(SB), XTMP3, XTMP3
-
-	// XOR with previous bits 64-95 of previous digest
-	VPXOR XTMP3, XDIGEST, XDIGEST
-
-	// Update tag
-	VPXOR (AX), XDIGEST, XDIGEST
-	VMOVDQA XDIGEST, (AX)
-
-	// Copy last 16 bytes of KS to the front
-	VMOVDQU (4*4)(BX), XTMP1
-	VMOVDQU XTMP1, (0*4)(BX)
-
-	VZEROUPPER
-	RET
--- a/zuc/eia256_asm_arm64.s
+++ b/zuc/eia256_asm_arm64.s
@ -1,208 +0,0 @@
-//go:build !purego
-
-#include "textflag.h"
-
-#define AX R2
-#define BX R3
-#define CX R4
-#define DX R5
-
-#define XTMP1 V1
-#define XTMP2 V2
-#define XTMP3 V3
-#define XTMP4 V4
-#define XTMP5 V5
-#define XTMP6 V6
-#define XDATA V7
-#define XDIGEST V8
-#define KS_L V9
-#define KS_M1 V10
-#define KS_M2 V11
-#define KS_H V12
-#define BIT_REV_AND_TAB V20
-#define BIT_REV_TAB_L V21
-#define BIT_REV_TAB_H V22
-#define SHUF_MASK_DW0_DW1 V23
-#define SHUF_MASK_DW2_DW3 V24
-
-#define LOAD_GLOBAL_DATA() \
-	MOVD $·eia_const(SB), R0                                                                      \
-	VLD1 (R0), [BIT_REV_TAB_L.B16, BIT_REV_TAB_H.B16, SHUF_MASK_DW0_DW1.B16, SHUF_MASK_DW2_DW3.B16]         \
-	MOVW $0x0F0F0F0F, R0                                                                                   \
-	VDUP R0, BIT_REV_AND_TAB.S4
-
-// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte)
-TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
-	MOVD t+0(FP), AX
-	MOVD ks+8(FP), BX
-	MOVD p+16(FP), CX
-
-	LOAD_GLOBAL_DATA()
-
-	// Reverse data bytes
-	VLD1 (CX), [XDATA.B16]
-	VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
-	VUSHR $4, XDATA.B16, XTMP1.B16
-
-	VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
-	VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
-	VEOR XTMP1.B16, XTMP3.B16, XTMP3.B16 // XTMP3 - bit reverse data bytes
-
-	// ZUC authentication part, 4x32 data bits
-	// setup KS
-	VLD1 (BX), [XTMP1.B16, XTMP2.B16]
-	VST1 [XTMP2.B16], (BX) // Copy last 16 bytes of KS to the front
-	// TODO: Any better solution???
-	VMOVQ $0x0302010007060504, $0x070605040b0a0908, XTMP4
-	VTBL XTMP4.B16, [XTMP1.B16], KS_L.B16 // KS bits [63:32 31:0 95:64 63:32]
-	VTBL XTMP4.B16, [XTMP2.B16], KS_M2.B16 // KS bits [191:160 159:128 223:192 191:160]
-	VDUP XTMP1.S[3], KS_M1.S4
-	VMOV XTMP1.S[2], KS_M1.S[1]
-	VMOV XTMP2.S[0], KS_M1.S[2] // KS bits [127:96 95:64 159:128 127:96]
-	
-	// setup DATA
-	VTBL SHUF_MASK_DW0_DW1.B16, [XTMP3.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s]
-	VTBL SHUF_MASK_DW2_DW3.B16, [XTMP3.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s]
-
-	// clmul
-	// xor the results from 4 32-bit words together
-
-	// Calculate lower 32 bits of tag
-	VPMULL KS_L.D1, XTMP1.D1, XTMP3.Q1
-	VPMULL2 KS_L.D2, XTMP1.D2, XTMP4.Q1
-	VPMULL KS_M1.D1, XTMP2.D1, XTMP5.Q1
-	VPMULL2 KS_M1.D2, XTMP2.D2, XTMP6.Q1
-
-	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
-	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
-	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16
-
-	// Move previous result to low 32 bits and XOR with previous digest
-	VMOV XTMP3.S[1], XDIGEST.S[0]
-
-	// Prepare data and calculate bits 63-32 of tag
-	VEXT	$8, KS_L.B16, KS_L.B16, XTMP5.B16
-	VPMULL XTMP5.D1, XTMP1.D1, XTMP3.Q1
-	VEXT	$8, XTMP1.B16, XTMP1.B16, XTMP5.B16
-	VPMULL KS_M1.D1, XTMP5.D1, XTMP4.Q1
-	VEXT	$8, KS_M1.B16, KS_M1.B16, XTMP1.B16
-	VPMULL XTMP1.D1, XTMP2.D1, XTMP5.Q1
-	VEXT	$8, XTMP2.B16, XTMP2.B16, XTMP1.B16
-	VPMULL KS_M2.D1, XTMP1.D1, XTMP6.Q1
-
-	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
-	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
-	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16
-
-	VMOV XTMP3.S[1], XDIGEST.S[1]
-
-	VMOV XDIGEST.D[0], R10
-	MOVD (AX), R11
-	EOR R10, R11
-	MOVD R11, (AX)
-
-	RET
-
-// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte)
-TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
-	MOVD t+0(FP), AX
-	MOVD ks+8(FP), BX
-	MOVD p+16(FP), CX
-
-	LOAD_GLOBAL_DATA()
-
-	// Reverse data bytes
-	VLD1 (CX), [XDATA.B16]
-	VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
-	VUSHR $4, XDATA.B16, XTMP1.B16
-
-	VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
-	VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
-	VEOR XTMP1.B16, XTMP3.B16, XTMP3.B16 // XTMP3 - bit reverse data bytes
-
-	// ZUC authentication part, 4x32 data bits
-	// setup KS
-	VLD1 (BX), [XTMP1.B16, XTMP2.B16]
-	VST1 [XTMP2.B16], (BX) // Copy last 16 bytes of KS to the front
-	// TODO: Any better solution??? We can use VTBL, but there are no performance imprvoement if we can't reuse MASK constant
-	VMOVQ $0x0302010007060504, $0x070605040b0a0908, XTMP4
-	VTBL XTMP4.B16, [XTMP1.B16], KS_L.B16  // KS bits [63:32 31:0 95:64 63:32]
-	VTBL XTMP4.B16, [XTMP2.B16], KS_M2.B16 // KS bits [191:160 159:128 223:192 191:160]
-	VMOVQ $0x0b0a09080f0e0d0c, $0x0b0a09080f0e0d0c, XTMP4
-	VTBL XTMP4.B16, [XTMP2.B16], KS_H.B16  // KS bits [255:224 223:192 255:224 223:192]
-	VDUP XTMP1.S[3], KS_M1.S4
-	VMOV XTMP1.S[2], KS_M1.S[1]
-	VMOV XTMP2.S[0], KS_M1.S[2] // KS bits [127:96 95:64 159:128 127:96]
-
-	// setup DATA
-	VTBL SHUF_MASK_DW0_DW1.B16, [XTMP3.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s]
-	VTBL SHUF_MASK_DW2_DW3.B16, [XTMP3.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s]
-
-	// clmul
-	// xor the results from 4 32-bit words together
-
-	// Calculate lower 32 bits of tag
-	VPMULL KS_L.D1, XTMP1.D1, XTMP3.Q1
-	VPMULL2 KS_L.D2, XTMP1.D2, XTMP4.Q1
-	VPMULL KS_M1.D1, XTMP2.D1, XTMP5.Q1
-	VPMULL2 KS_M1.D2, XTMP2.D2, XTMP6.Q1
-
-	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
-	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
-	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16
-
-	// Move previous result to low 32 bits and XOR with previous digest
-	VMOV XTMP3.S[1], XDIGEST.S[0]
-
-	// Prepare data and calculate bits 63-32 of tag
-	VEXT	$8, KS_L.B16, KS_L.B16, XTMP5.B16
-	VPMULL XTMP5.D1, XTMP1.D1, XTMP3.Q1
-	VEXT	$8, XTMP1.B16, XTMP1.B16, XTMP5.B16
-	VPMULL KS_M1.D1, XTMP5.D1, XTMP4.Q1
-	VEXT	$8, KS_M1.B16, KS_M1.B16, XTMP6.B16
-	VPMULL XTMP6.D1, XTMP2.D1, XTMP5.Q1
-	VEXT	$8, XTMP2.B16, XTMP2.B16, KS_L.B16
-	VPMULL KS_M2.D1, KS_L.D1, XTMP6.Q1
-
-	// XOR all the products and keep only 32-63 bits
-	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
-	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
-	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16
-
-	VMOV XTMP3.S[1], XDIGEST.S[1]
-
-	// Prepare data and calculate bits 95-64 of tag
-	VPMULL KS_M1.D1, XTMP1.D1, XTMP3.Q1
-	VPMULL2 KS_M1.D2, XTMP1.D2, XTMP4.Q1
-	VPMULL KS_M2.D1, XTMP2.D1, XTMP5.Q1
-	VPMULL2 KS_M2.D2, XTMP2.D2, XTMP6.Q1
-
-	// XOR all the products and move bits 63-32 to bits 95-64
-	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
-	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
-	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16
-
-	VMOV XTMP3.S[1], XDIGEST.S[2]
-
-	// Prepare data and calculate bits 127-96 of tag
-	VEXT	$8, KS_M1.B16, KS_M1.B16, XTMP5.B16
-	VPMULL XTMP5.D1, XTMP1.D1, XTMP3.Q1
-	VEXT	$8, XTMP1.B16, XTMP1.B16, XTMP5.B16
-	VPMULL KS_M2.D1, XTMP5.D1, XTMP4.Q1
-	VEXT	$8, KS_M2.B16, KS_M2.B16, XTMP6.B16
-	VPMULL XTMP6.D1, XTMP2.D1, XTMP5.Q1
-	VEXT	$8, XTMP2.B16, XTMP2.B16, KS_L.B16
-	VPMULL KS_H.D1, KS_L.D1, XTMP6.Q1
-
-	// XOR all the products and move bits 63-32 to bits 127-96
-	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
-	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
-	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16
-
-	VMOV XTMP3.S[1], XDIGEST.S[3]
-
-	VLD1 (AX), [XTMP1.B16]
-	VEOR XTMP1.B16, XDIGEST.B16, XDIGEST.B16
-	VST1 [XDIGEST.B16], (AX)
-
-	RET
--- a/zuc/eia256_asm_ppc64x.s
+++ b/zuc/eia256_asm_ppc64x.s
@ -1,196 +0,0 @@
-// Copyright 2024 Sun Yimin. All rights reserved.
-// Use of this source code is governed by a MIT-style
-// license that can be found in the LICENSE file.
-
-//go:build (ppc64 || ppc64le) && !purego
-
-#include "textflag.h"
-
-#define XTMP1 V0
-#define XTMP2 V1
-#define XTMP3 V2
-#define XTMP4 V3
-#define XTMP5 V4
-#define XTMP6 V5
-#define XDATA V6
-#define XDIGEST V7
-#define KS_L V8
-#define KS_M1 V9
-#define KS_M2 V10
-#define KS_H V11
-#define BIT_REV_TAB_L V12
-#define BIT_REV_TAB_H V13
-#define ZERO V15
-#define PTR R7
-
-// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte)
-TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
-	MOVD t+0(FP), R3
-	MOVD ks+8(FP), R4
-	MOVD p+16(FP), R5
-
-#ifndef GOARCH_ppc64le
-	MOVD	$·rcon(SB), PTR // PTR points to rcon addr
-	LVX	(PTR), XTMP1
-	ADD	$0x10, PTR
-#else
-	MOVD	$·rcon+0x10(SB), PTR // PTR points to rcon addr (skipping permute vector)
-#endif
-
-	LXVD2X (R5)(R0), XDATA
-#ifndef GOARCH_ppc64le
-	VPERM XDATA, XDATA, XTMP1, XDATA
-#endif
-
-	VSPLTISB $4, XTMP2;
-	LXVD2X (PTR)(R0), BIT_REV_TAB_L
-	VSLB  BIT_REV_TAB_L, XTMP2, BIT_REV_TAB_H
-	VPERMXOR BIT_REV_TAB_L, BIT_REV_TAB_H, XDATA, XTMP3 // XTMP3 - bit reverse data bytes
-	
-	// ZUC authentication part, 4x32 data bits
-	// setup data
-	VSPLTISB $0, ZERO
-	MOVD $0x10, R8
-	LXVD2X (PTR)(R8), XTMP4
-	VPERM ZERO, XTMP3, XTMP4, XTMP1
-	MOVD $0x20, R8
-	LXVD2X (PTR)(R8), XTMP4
-	VPERM ZERO, XTMP3, XTMP4, XTMP2
-
-	// setup KS
-	LXVW4X (R4), KS_L
-	MOVD $8, R8
-	LXVW4X (R8)(R4), KS_M1
-	MOVD $16, R8
-	LXVW4X (R8)(R4), KS_M2
-	MOVD $0x30, R8
-	LXVD2X (PTR)(R8), XTMP4
-	VPERM KS_L, KS_L, XTMP4, KS_L
-	VPERM KS_M1, KS_M1, XTMP4, KS_M1
-	VPERM KS_M2, KS_M2, XTMP4, KS_M2
-
-	// clmul
-	// xor the results from 4 32-bit words together
-	// Calculate lower 32 bits of tag
-	VPMSUMD XTMP1, KS_L, XTMP3
-	VPMSUMD XTMP2, KS_M1, XTMP4
-	VXOR XTMP3, XTMP4, XTMP3
-	VSPLTW $2, XTMP3, XDIGEST
-
-	// Calculate upper 32 bits of tag
-	VSLDOI $8, KS_M1, KS_L, KS_L
-	VPMSUMD XTMP1, KS_L, XTMP3
-	VSLDOI $8, KS_M2, KS_M1, KS_M1
-	VPMSUMD XTMP2, KS_M1, XTMP4
-	VXOR XTMP3, XTMP4, XTMP3
-	VSPLTW $2, XTMP3, XTMP3
-
-	// Update tag
-#ifdef GOARCH_ppc64le
-	VSLDOI $12, XTMP3, XDIGEST, XDIGEST
-#else
-	VSLDOI $12, XDIGEST, XTMP3, XDIGEST
-#endif
-	MFVSRD XDIGEST, R8
-	MOVD (R3), R6
-	XOR R6, R8, R6
-	MOVD R6, (R3)
-
-	// Copy last 16 bytes of KS to the front
-	MOVD $16, R8
-	LXVD2X (R8)(R4), XTMP1
-	STXVD2X XTMP1, (R4)(R0)
-
-	RET
-
-// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte)
-TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
-	MOVD t+0(FP), R3
-	MOVD ks+8(FP), R4
-	MOVD p+16(FP), R5
-
-#ifndef GOARCH_ppc64le
-	MOVD	$·rcon(SB), PTR // PTR points to rcon addr
-	LVX	(PTR), XTMP1
-	ADD	$0x10, PTR
-#else
-	MOVD	$·rcon+0x10(SB), PTR // PTR points to rcon addr (skipping permute vector)
-#endif
-
-	LXVD2X (R5)(R0), XDATA
-#ifndef GOARCH_ppc64le
-	VPERM XDATA, XDATA, XTMP1, XDATA
-#endif
-
-	VSPLTISB $4, XTMP2;
-	LXVD2X (PTR)(R0), BIT_REV_TAB_L
-	VSLB  BIT_REV_TAB_L, XTMP2, BIT_REV_TAB_H
-	VPERMXOR BIT_REV_TAB_L, BIT_REV_TAB_H, XDATA, XTMP3 // XTMP3 - bit reverse data bytes
-
-	// ZUC authentication part, 4x32 data bits
-	// setup data
-	VSPLTISB $0, ZERO
-	MOVD $0x10, R8
-	LXVD2X (PTR)(R8), XTMP4
-	VPERM ZERO, XTMP3, XTMP4, XTMP1
-	MOVD $0x20, R8
-	LXVD2X (PTR)(R8), XTMP4
-	VPERM ZERO, XTMP3, XTMP4, XTMP2
-
-	// setup KS
-	LXVW4X (R4), KS_L
-	MOVD $8, R8
-	LXVW4X (R8)(R4), KS_M1
-	MOVD $16, R8
-	LXVW4X (R8)(R4), KS_M2
-	VOR KS_M2, KS_M2, KS_H
-	MOVD $0x30, R8
-	LXVD2X (PTR)(R8), XTMP4
-	VPERM KS_L, KS_L, XTMP4, KS_L
-	VPERM KS_M1, KS_M1, XTMP4, KS_M1
-	VPERM KS_M2, KS_M2, XTMP4, KS_M2
-
-	// clmul
-	// xor the results from 4 32-bit words together
-	// Calculate lower 32 bits of tag
-	VPMSUMD XTMP1, KS_L, XTMP3
-	VPMSUMD XTMP2, KS_M1, XTMP4
-	VXOR XTMP3, XTMP4, XTMP3
-	VSLDOI $12, XTMP3, XTMP3, XDIGEST
-
-	// Calculate upper 32 bits of tag
-	VSLDOI $8, KS_M1, KS_L, KS_L
-	VPMSUMD XTMP1, KS_L, XTMP3
-	VSLDOI $8, KS_M2, KS_M1, XTMP5
-	VPMSUMD XTMP2, XTMP5, XTMP4
-	VXOR XTMP3, XTMP4, XTMP3
-	VSLDOI $8, XTMP3, XTMP3, XTMP3
-	VSLDOI $4, XDIGEST, XTMP3, XDIGEST
-
-	// calculate bits 95-64 of tag
-	VPMSUMD XTMP1, KS_M1, XTMP3
-	VPMSUMD XTMP2, KS_M2, XTMP4
-	VXOR XTMP3, XTMP4, XTMP3
-	VSLDOI $8, XTMP3, XTMP3, XTMP3
-	VSLDOI $4, XDIGEST, XTMP3, XDIGEST
-
-	// calculate bits 127-96 of tag
-	VSLDOI $8, KS_M2, KS_M1, KS_M1
-	VPMSUMD XTMP1, KS_M1, XTMP3
-	VSLDOI $8, KS_H, KS_M2, KS_M2
-	VPMSUMD XTMP2, KS_M2, XTMP4
-	VXOR XTMP3, XTMP4, XTMP3
-	VSLDOI $8, XTMP3, XTMP3, XTMP3
-	VSLDOI $4, XDIGEST, XTMP3, XDIGEST
-
-	// Update tag
-	LXVW4X (R3)(R0), XTMP1
-	VXOR XTMP1, XDIGEST, XDIGEST
-	STXVW4X XDIGEST, (R3)
-
-	// Copy last 16 bytes of KS to the front
-	MOVD $16, R8
-	LXVD2X (R8)(R4), XTMP1
-	STXVD2X XTMP1, (R4)(R0)
-
-	RET
--- a/zuc/eia_asm.go
+++ b/zuc/eia_asm.go
@ -9,13 +9,13 @@ import (
 var supportsGFMUL = cpuid.HasGFMUL || cpuid.HasVPMSUMD

 //go:noescape
-func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
+func eiaRoundTag4(t *uint32, keyStream *uint32, p *byte)

 func block(m *ZUC128Mac, p []byte) {
 	if supportsGFMUL {
 		for len(p) >= chunk {
 			m.genKeywords(m.k0[4:])
-			eia3Round16B(&m.t, &m.k0[0], &p[0], m.tagSize)
+			eiaRoundTag4(&m.t, &m.k0[0], &p[0])
 			p = p[chunk:]
 		}
 	} else {
--- a/zuc/eia_asm_amd64.s
+++ b/zuc/eia_asm_amd64.s
@ -37,32 +37,41 @@ GLOBL shuf_mask_dw2_0_dw3_0<>(SB), RODATA, $16
 #define KS_M2 X11
 #define KS_H X12

-// func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
-TEXT ·eia3Round16B(SB),NOSPLIT,$0
+#define BIT_REVERSE_SSE(XDATA, XTMP1, XTMP2) \
+	MOVOU bit_reverse_and_table<>(SB), XTMP1; \
+	MOVOU XDATA, XTMP2;                       \
+	PAND  XTMP1, XTMP2;                       \
+	PANDN XDATA, XTMP1;                       \
+	PSRLQ $4, XTMP1;                          \
+	MOVOU bit_reverse_table_h<>(SB), XDATA;   \
+	PSHUFB XTMP2, XDATA;                      \
+	MOVOU bit_reverse_table_l<>(SB), XTMP2;   \
+	PSHUFB XTMP1, XTMP2;                      \
+	PXOR XTMP2, XDATA
+
+#define BIT_REVERSE_AVX(XDATA, XTMP1, XTMP2) \
+	VMOVDQU bit_reverse_and_table<>(SB), XTMP1; \
+	VPAND XTMP1, XDATA, XTMP2;                  \
+	VPANDN XDATA, XTMP1, XTMP1;                 \
+	VPSRLD $4, XTMP1, XTMP1;                    \
+	VMOVDQU bit_reverse_table_h<>(SB), XDATA;   \
+	VPSHUFB XTMP2, XDATA, XDATA;                \
+	VMOVDQU bit_reverse_table_l<>(SB), XTMP2;   \
+	VPSHUFB XTMP1, XTMP2, XTMP1;                \
+	VPOR XTMP1, XDATA, XDATA
+
+// func eiaRoundTag4(t *uint32, keyStream *uint32, p *byte)
+TEXT ·eiaRoundTag4(SB),NOSPLIT,$0
 	MOVQ t+0(FP), AX
 	MOVQ ks+8(FP), BX
 	MOVQ p+16(FP), CX
-	MOVQ tagSize+24(FP), DX

 	CMPB ·useAVX(SB), $1
 	JE   avx

 	// Reverse data bytes
 	MOVUPS (0)(CX), XDATA
-	MOVOU bit_reverse_and_table<>(SB), XTMP4
-	MOVOU XDATA, XTMP2
-	PAND  XTMP4, XTMP2
-
-	PANDN XDATA, XTMP4
-	PSRLQ $4, XTMP4
-
-	MOVOU bit_reverse_table_h<>(SB), XTMP3
-	PSHUFB XTMP2, XTMP3
-
-	MOVOU bit_reverse_table_l<>(SB), XTMP1
-	PSHUFB XTMP4, XTMP1
-
-	PXOR XTMP1, XTMP3  // XTMP3 - bit reverse data bytes
+	BIT_REVERSE_SSE(XDATA, XTMP1, XTMP2)

 	// ZUC authentication part, 4x32 data bits
 	// setup KS
@ -72,12 +81,12 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
 	PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]

 	// setup DATA
-	MOVOU XTMP3, XTMP1
+	MOVOU XDATA, XTMP1
 	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
 	MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]

-	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP3
-	MOVOU XTMP3, XDIGEST // XDIGEST/XTMP3 - Data bits [95:64 0s 127:96 0s]
+	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA
+	MOVOU XDATA, XDIGEST // XDIGEST/XDATA - Data bits [95:64 0s 127:96 0s]

 	// clmul
 	// xor the results from 4 32-bit words together
@ -85,11 +94,11 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
 	PCLMULQDQ $0x00, KS_L, XTMP1
 	PCLMULQDQ $0x11, KS_L, XTMP2
 	PCLMULQDQ $0x00, KS_M1, XDIGEST
-	PCLMULQDQ $0x11, KS_M1, XTMP3
+	PCLMULQDQ $0x11, KS_M1, XDATA

 	// XOR all products and move 32-bits to lower 32 bits
 	PXOR XTMP1, XTMP2
-	PXOR XTMP3, XDIGEST
+	PXOR XDATA, XDIGEST
 	PXOR XTMP2, XDIGEST
 	PSRLDQ $4, XDIGEST

@ -105,18 +114,8 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0

 avx:
 	VMOVDQU (0)(CX), XDATA
-
 	// Reverse data bytes
-	VMOVDQU bit_reverse_and_table<>(SB), XTMP1 
-	VPAND XTMP1, XDATA, XTMP2
-	VPANDN XDATA, XTMP1, XTMP3
-	VPSRLD $4, XTMP3, XTMP3
-
-	VMOVDQU bit_reverse_table_h<>(SB), XTMP1
-	VPSHUFB XTMP2, XTMP1, XTMP4
-	VMOVDQU bit_reverse_table_l<>(SB), XTMP1
-	VPSHUFB XTMP3, XTMP1, XTMP1
-	VPOR XTMP1, XTMP4, XTMP4
+	BIT_REVERSE_AVX(XDATA, XTMP1, XTMP2)
 	
 	// ZUC authentication part, 4x32 data bits
 	// setup KS
@ -125,9 +124,9 @@ avx:

 	// setup DATA
 	// Data bytes [31:0 0s 63:32 0s]
-	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP4, XTMP1
+	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XDATA, XTMP1
 	// Data bytes [95:64 0s 127:96 0s]
-	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP4, XTMP2
+	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA, XTMP2

 	// clmul
 	// xor the results from 4 32-bit words together
@ -140,14 +139,339 @@ avx:
 	VPXOR XTMP3, XTMP4, XTMP3
 	VPXOR XTMP5, XTMP6, XTMP5
 	VPXOR XTMP3, XTMP5, XDIGEST
+	VPSRLDQ $4, XDIGEST, XDIGEST // shift RIGHT by 4 bytes: bits 63:32 of the sum -> bits 31:0 (a left shift would zero the dword read below)
 
-	VMOVQ XDIGEST, R10
-	SHRQ $32, R10
+	// Update tag: XOR the selected 32-bit word into (AX)
+	MOVL XDIGEST, R10
 	XORL R10, (AX)
 
 	// Copy last 16 bytes of KS to the front
 	VMOVDQU (4*4)(BX), XTMP1
 	VMOVDQU XTMP1, (0*4)(BX)
 
-	VZEROUPPER
+	RET
+
+// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte)
+TEXT ·eia256RoundTag8(SB),NOSPLIT,$0 // XORs a 64-bit value computed from 16 data bytes and the keystream into (t), then moves keystream bytes 16..31 to 0..15
+	MOVQ t+0(FP), AX  // AX = t, 8 bytes are read-modified-written
+	MOVQ ks+8(FP), BX // BX = keyStream, bytes 0..31 are read, bytes 0..15 are rewritten
+	MOVQ p+16(FP), CX // CX = p, 16 bytes are read
+
+	CMPB ·useAVX(SB), $1
+	JE   avx // VEX-encoded variant of the same computation
+
+	// Load 16 data bytes and bit-reverse them (BIT_REVERSE_SSE)
+	MOVUPS (0)(CX), XDATA
+	BIT_REVERSE_SSE(XDATA, XTMP1, XTMP2)
+
+	// ZUC authentication part, 4x32 data bits
+	// setup KS
+	MOVUPS (0*4)(BX), XTMP1
+	MOVUPS (2*4)(BX), XTMP2
+	MOVUPS (4*4)(BX), XTMP4
+	PSHUFD $0x61, XTMP1, KS_L  // KS bits [63:32 31:0 95:64 63:32]
+	PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]
+	PSHUFD $0x61, XTMP4, KS_M2 // KS bits [191:160 159:128 223:192 191:160]
+
+	// setup DATA
+	MOVOU XDATA, XTMP1
+	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
+	MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]
+
+	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA
+	MOVOU XDATA, XDIGEST // XDIGEST/XDATA - Data bits [95:64 0s 127:96 0s]
+
+	// clmul
+	// xor the results from 4 32-bit words together
+	// Save data for following products (PCLMULQDQ overwrites its destination)
+	MOVOU XTMP2, XTMP5 //  Data bits [31:0 0s 63:32 0s]
+	MOVOU XDATA, XTMP6 //  Data bits [95:64 0s 127:96 0s]
+
+	// Calculate lower 32 bits of tag
+	PCLMULQDQ $0x00, KS_L, XTMP1
+	PCLMULQDQ $0x11, KS_L, XTMP2
+	PCLMULQDQ $0x00, KS_M1, XDIGEST
+	PCLMULQDQ $0x11, KS_M1, XDATA
+
+	// XOR all products; the wanted tag word is bits 63-32 of the sum
+	PXOR XTMP1, XTMP2
+	PXOR XDATA, XDIGEST
+	PXOR XTMP2, XDIGEST
+	PSLLDQ $8, XDIGEST // Move bits 63-32 to bits 127-96
+
+	// Prepare data and calculate bits 63-32 of tag (same data, keystream window one word further)
+	MOVOU XTMP5, XTMP1
+	MOVOU XTMP5, XTMP2
+	MOVOU XTMP6, XTMP3
+	MOVOU XTMP6, XTMP4
+
+	PCLMULQDQ $0x10, KS_L, XTMP1
+	PCLMULQDQ $0x01, KS_M1, XTMP2
+	PCLMULQDQ $0x10, KS_M1, XTMP3
+	PCLMULQDQ $0x01, KS_M2, XTMP4
+
+	// XOR all the products and keep only bits 63-32
+	PXOR XTMP2, XTMP1
+	PXOR XTMP4, XTMP3
+	PXOR XTMP3, XTMP1
+	PSRLDQ $4, XTMP1           // Move bits 63-32 to bits 31-0
+
+	PALIGNR $12, XDIGEST, XTMP1 // XTMP1 = (XTMP1 || XDIGEST) >> 12 bytes: dword 0 = first tag word, dword 1 = second tag word
+
+	// Update tag: XOR the low 64 bits into (AX)
+	MOVQ XTMP1, R10
+	XORQ R10, (AX)
+
+	// Copy last 16 bytes of KS to the front
+	MOVUPS (4*4)(BX), XTMP1
+	MOVUPS XTMP1, (0*4)(BX)
+
+	RET
+
+avx:
+	VMOVDQU (0)(CX), XDATA
+
+	// Bit-reverse the data bytes (BIT_REVERSE_AVX)
+	BIT_REVERSE_AVX(XDATA, XTMP1, XTMP2)
+	
+	// ZUC authentication part, 4x32 data bits
+	// setup KS
+	VPSHUFD $0x61, (0*4)(BX), KS_L  // KS bits [63:32 31:0 95:64 63:32]
+	VPSHUFD $0x61, (2*4)(BX), KS_M1  // KS bits [127:96 95:64 159:128 127:96]
+	VPSHUFD $0x61, (4*4)(BX), KS_M2  // KS bits [191:160 159:128 223:192 191:160]
+
+	// setup DATA
+	// Data bytes [31:0 0s 63:32 0s]
+	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XDATA, XTMP1
+	// Data bytes [95:64 0s 127:96 0s]
+	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA, XTMP2
+
+	// clmul
+	// xor the results from 4 32-bit words together
+	// Calculate lower 32 bits of tag
+	VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
+	VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
+	VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
+	VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6
+
+	VPXOR XTMP3, XTMP4, XTMP3
+	VPXOR XTMP5, XTMP6, XTMP5
+	VPXOR XTMP3, XTMP5, XTMP3
+	VPSLLDQ $8, XTMP3, XDIGEST // bits 63-32 of the sum now sit in bits 127-96 of XDIGEST
+
+	VPCLMULQDQ $0x10, KS_L, XTMP1, XTMP3 // bits 63-32 of tag: keystream window one word further
+	VPCLMULQDQ $0x01, KS_M1, XTMP1, XTMP4
+	VPCLMULQDQ $0x10, KS_M1, XTMP2, XTMP5
+	VPCLMULQDQ $0x01, KS_M2, XTMP2, XTMP6
+
+	// XOR all the products and keep only 32-63 bits
+	VPXOR XTMP4, XTMP3, XTMP3
+	VPXOR XTMP6, XTMP5, XTMP5
+	VPXOR XTMP5, XTMP3, XTMP3
+	VPSRLDQ $4, XTMP3, XTMP3
+
+	VPALIGNR $12, XDIGEST, XTMP3, XDIGEST // dword 0 = first tag word (old XDIGEST bits 127-96), dword 1 = second tag word
+
+	// Update tag: XOR the low 64 bits into (AX)
+	VMOVQ XDIGEST, R10
+	XORQ R10, (AX)
+
+	// Copy last 16 bytes of KS to the front
+	VMOVDQU (4*4)(BX), XTMP1
+	VMOVDQU XTMP1, (0*4)(BX)
+
+	RET
+
+// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte)
+TEXT ·eia256RoundTag16(SB),NOSPLIT,$0 // XORs a 128-bit value computed from 16 data bytes and the keystream into (t), then moves keystream bytes 16..31 to 0..15
+	MOVQ t+0(FP), AX  // AX = t, 16 bytes are read-modified-written (no alignment is assumed)
+	MOVQ ks+8(FP), BX // BX = keyStream, bytes 0..31 are read, bytes 0..15 are rewritten
+	MOVQ p+16(FP), CX // CX = p, 16 bytes are read
+
+	CMPB ·useAVX(SB), $1
+	JE   avx // VEX-encoded variant of the same computation
+
+	// Load 16 data bytes and bit-reverse them (BIT_REVERSE_SSE)
+	MOVUPS (0)(CX), XDATA
+	BIT_REVERSE_SSE(XDATA, XTMP1, XTMP2)
+
+	// ZUC authentication part, 4x32 data bits
+	// setup KS
+	MOVUPS (0*4)(BX), XTMP1
+	MOVUPS (2*4)(BX), XTMP2
+	MOVUPS (4*4)(BX), XTMP4
+	PSHUFD $0x61, XTMP1, KS_L  // KS bits [63:32 31:0 95:64 63:32]
+	PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]
+	PSHUFD $0x61, XTMP4, KS_M2 // KS bits [191:160 159:128 223:192 191:160]
+	PSHUFD $0xBB, XTMP4, KS_H  // KS bits [255:224 223:192 255:224 223:192]
+
+	// setup DATA
+	MOVOU XDATA, XTMP1
+	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
+	MOVOU XTMP1, XTMP2         // XTMP1/2 - Data bits [31:0 0s 63:32 0s]
+
+	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA
+	MOVOU XDATA, XDIGEST       // XDIGEST/XDATA - Data bits [95:64 0s 127:96 0s]
+
+	// clmul
+	// xor the results from 4 32-bit words together
+	// Save data for following products (PCLMULQDQ overwrites its destination)
+	MOVOU XTMP2, XTMP5 //  Data bits [31:0 0s 63:32 0s]
+	MOVOU XDATA, XTMP6 //  Data bits [95:64 0s 127:96 0s]
+
+	// Calculate lower 32 bits of tag
+	PCLMULQDQ $0x00, KS_L, XTMP1
+	PCLMULQDQ $0x11, KS_L, XTMP2
+	PCLMULQDQ $0x00, KS_M1, XDIGEST
+	PCLMULQDQ $0x11, KS_M1, XDATA
+
+	// XOR all products; the wanted tag word is bits 63-32 of the sum
+	PXOR XTMP1, XTMP2
+	PXOR XDATA, XDIGEST
+	PXOR XTMP2, XDIGEST
+	PSLLDQ $8, XDIGEST // Move bits 63-32 to bits 127-96
+
+	// Prepare data and calculate bits 63-32 of tag
+	MOVOU XTMP5, XTMP1
+	MOVOU XTMP5, XTMP2
+	MOVOU XTMP6, XTMP3
+	MOVOU XTMP6, XTMP4
+
+	PCLMULQDQ $0x10, KS_L, XTMP1
+	PCLMULQDQ $0x01, KS_M1, XTMP2
+	PCLMULQDQ $0x10, KS_M1, XTMP3
+	PCLMULQDQ $0x01, KS_M2, XTMP4
+
+	// XOR all the products and keep only bits 63-32
+	PXOR XTMP2, XTMP1
+	PXOR XTMP4, XTMP3
+	PXOR XTMP3, XTMP1
+	PSRLDQ $4, XTMP1           // Move bits 63-32 to bits 31-0
+	PALIGNR $4, XDIGEST, XTMP1 // XTMP1 = (XTMP1 || XDIGEST) >> 4 bytes: new word enters at the top, older words move down
+
+	// Prepare data and calculate bits 95-64 of tag
+	MOVOU XTMP5, XDIGEST
+	MOVOU XTMP5, XTMP2
+	MOVOU XTMP6, XTMP3
+	MOVOU XTMP6, XTMP4
+
+	PCLMULQDQ $0x00, KS_M1, XDIGEST
+	PCLMULQDQ $0x11, KS_M1, XTMP2
+	PCLMULQDQ $0x00, KS_M2, XTMP3
+	PCLMULQDQ $0x11, KS_M2, XTMP4
+
+	// XOR all the products and move bits 63-32 to bits 95-64
+	PXOR XTMP2, XDIGEST
+	PXOR XTMP4, XTMP3
+	PXOR XTMP3, XDIGEST
+	PSRLDQ $4, XDIGEST          // Move bits 63-32 to bits 31-0
+	PALIGNR $4, XTMP1, XDIGEST  // XDIGEST = (XDIGEST || XTMP1) >> 4 bytes
+
+	// Prepare data and calculate bits 127-96 of tag
+	MOVOU XTMP5, XTMP1
+	MOVOU XTMP5, XTMP2
+	MOVOU XTMP6, XTMP3
+	MOVOU XTMP6, XTMP4
+
+	PCLMULQDQ $0x10, KS_M1, XTMP1
+	PCLMULQDQ $0x01, KS_M2, XTMP2
+	PCLMULQDQ $0x10, KS_M2, XTMP3
+	PCLMULQDQ $0x01, KS_H, XTMP4
+
+	// XOR all the products and move bits 63-32 to bits 127-96
+	PXOR XTMP2, XTMP1
+	PXOR XTMP4, XTMP3
+	PXOR XTMP3, XTMP1
+	PSRLDQ $4, XTMP1           // Move bits 63-32 to bits 31-0
+	PALIGNR $4, XDIGEST, XTMP1 // XTMP1 = all four tag words, first word in bits 31-0
+
+	// Update tag (unaligned load/store)
+	MOVUPS (AX), XDIGEST
+	PXOR XTMP1, XDIGEST
+	MOVUPS XDIGEST, (AX)
+
+	// Copy last 16 bytes of KS to the front
+	MOVUPS (4*4)(BX), XTMP1
+	MOVUPS XTMP1, (0*4)(BX)
+
+	RET
+
+avx:
+	VMOVDQU (0)(CX), XDATA
+
+	// Bit-reverse the data bytes (BIT_REVERSE_AVX)
+	BIT_REVERSE_AVX(XDATA, XTMP1, XTMP2)
+	
+	// ZUC authentication part, 4x32 data bits
+	// setup KS
+	VPSHUFD $0x61, (0*4)(BX), KS_L  // KS bits [63:32 31:0 95:64 63:32]
+	VPSHUFD $0x61, (2*4)(BX), KS_M1  // KS bits [127:96 95:64 159:128 127:96]
+	VPSHUFD $0x61, (4*4)(BX), KS_M2  // KS bits [191:160 159:128 223:192 191:160]
+	VPSHUFD $0xBB, (4*4)(BX), KS_H  // KS bits [255:224 223:192 255:224 223:192]
+
+	// setup DATA
+	// Data bytes [31:0 0s 63:32 0s]
+	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XDATA, XTMP1
+	// Data bytes [95:64 0s 127:96 0s]
+	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA, XTMP2
+
+	// clmul
+	// xor the results from 4 32-bit words together
+	// Calculate lower 32 bits of tag
+	VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
+	VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
+	VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
+	VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6
+
+	VPXOR XTMP3, XTMP4, XTMP3
+	VPXOR XTMP5, XTMP6, XTMP5
+	VPXOR XTMP3, XTMP5, XTMP3
+	VPSLLDQ $8, XTMP3, XDIGEST // bits 63-32 of the sum now sit in bits 127-96 of XDIGEST
+
+	VPCLMULQDQ $0x10, KS_L, XTMP1, XTMP3 // bits 63-32 of tag: keystream window one word further
+	VPCLMULQDQ $0x01, KS_M1, XTMP1, XTMP4
+	VPCLMULQDQ $0x10, KS_M1, XTMP2, XTMP5
+	VPCLMULQDQ $0x01, KS_M2, XTMP2, XTMP6
+
+	// XOR all the products and keep only 32-63 bits
+	VPXOR XTMP4, XTMP3, XTMP3
+	VPXOR XTMP6, XTMP5, XTMP5
+	VPXOR XTMP5, XTMP3, XTMP3
+	VPSRLDQ $4, XTMP3, XTMP3
+	VPALIGNR $4, XDIGEST, XTMP3, XDIGEST // XDIGEST = (XTMP3 || XDIGEST) >> 4 bytes: new word enters at the top
+
+	// Prepare data and calculate bits 95-64 of tag
+	VPCLMULQDQ $0x00, KS_M1, XTMP1, XTMP3
+	VPCLMULQDQ $0x11, KS_M1, XTMP1, XTMP4
+	VPCLMULQDQ $0x00, KS_M2, XTMP2, XTMP5
+	VPCLMULQDQ $0x11, KS_M2, XTMP2, XTMP6
+
+	// XOR all the products and move bits 63-32 to bits 95-64
+	VPXOR XTMP4, XTMP3, XTMP3
+	VPXOR XTMP6, XTMP5, XTMP5
+	VPXOR XTMP5, XTMP3, XTMP3
+	VPSRLDQ $4, XTMP3, XTMP3
+	VPALIGNR $4, XDIGEST, XTMP3, XDIGEST
+
+	// Prepare data and calculate bits 127-96 of tag
+	VPCLMULQDQ $0x10, KS_M1, XTMP1, XTMP3
+	VPCLMULQDQ $0x01, KS_M2, XTMP1, XTMP4
+	VPCLMULQDQ $0x10, KS_M2, XTMP2, XTMP5
+	VPCLMULQDQ $0x01, KS_H, XTMP2, XTMP6
+
+	// XOR all the products and move bits 63-32 to bits 127-96
+	VPXOR XTMP4, XTMP3, XTMP3
+	VPXOR XTMP6, XTMP5, XTMP5
+	VPXOR XTMP5, XTMP3, XTMP3
+	VPSRLDQ $4, XTMP3, XTMP3
+	VPALIGNR $4, XDIGEST, XTMP3, XDIGEST // XDIGEST = all four tag words, first word in bits 31-0
+
+	// Update tag
+	VPXOR (AX), XDIGEST, XDIGEST
+	VMOVDQU XDIGEST, (AX) // unaligned store: t is a *uint32, so 16-byte alignment is not guaranteed (VMOVDQA would fault)
+
+	// Copy last 16 bytes of KS to the front
+	VMOVDQU (4*4)(BX), XTMP1
+	VMOVDQU XTMP1, (0*4)(BX)
+
 	RET
--- a/zuc/eia_asm_arm64.s
+++ b/zuc/eia_asm_arm64.s
@ -2,15 +2,15 @@

 #include "textflag.h"

-DATA ·eia_const+0x00(SB)/8, $0x0e060a020c040800 // bit_reverse_table low
-DATA ·eia_const+0x08(SB)/8, $0x0f070b030d050901
-DATA ·eia_const+0x10(SB)/8, $0xe060a020c0408000 // bit_reverse_table high
-DATA ·eia_const+0x18(SB)/8, $0xf070b030d0509010
-DATA ·eia_const+0x20(SB)/8, $0xffffffff03020100 // SHUF_MASK_DW0_DW1
-DATA ·eia_const+0x28(SB)/8, $0xffffffff07060504
-DATA ·eia_const+0x30(SB)/8, $0xffffffff0b0a0908 // SHUF_MASK_DW2_DW3
-DATA ·eia_const+0x38(SB)/8, $0xffffffff0f0e0d0c
-GLOBL ·eia_const(SB), RODATA, $64
+DATA eia_const<>+0x00(SB)/8, $0x0e060a020c040800 // bit_reverse_table low: byte i (0..15) = 4-bit value i with its bits reversed
+DATA eia_const<>+0x08(SB)/8, $0x0f070b030d050901
+DATA eia_const<>+0x10(SB)/8, $0xe060a020c0408000 // bit_reverse_table high: the low table's entries shifted left by 4
+DATA eia_const<>+0x18(SB)/8, $0xf070b030d0509010
+DATA eia_const<>+0x20(SB)/8, $0xffffffff03020100 // SHUF_MASK_DW0_DW1: byte indexes 0-3 and 4-7, each followed by four 0xff indexes
+DATA eia_const<>+0x28(SB)/8, $0xffffffff07060504
+DATA eia_const<>+0x30(SB)/8, $0xffffffff0b0a0908 // SHUF_MASK_DW2_DW3: byte indexes 8-11 and 12-15, each followed by four 0xff indexes
+DATA eia_const<>+0x38(SB)/8, $0xffffffff0f0e0d0c
+GLOBL eia_const<>(SB), RODATA, $64 // 64 bytes, read-only; loaded as four 16-byte vectors by LOAD_GLOBAL_DATA

 #define AX R2
 #define BX R3
@ -36,28 +36,29 @@ GLOBL ·eia_const(SB), RODATA, $64
 #define SHUF_MASK_DW2_DW3 V24

 #define LOAD_GLOBAL_DATA() \
-	MOVD $·eia_const(SB), R0                                                                              \
-	VLD1 (R0), [BIT_REV_TAB_L.B16, BIT_REV_TAB_H.B16, SHUF_MASK_DW0_DW1.B16, SHUF_MASK_DW2_DW3.B16]         \
-	MOVW $0x0F0F0F0F, R0                                                                                    \
-	VDUP R0, BIT_REV_AND_TAB.S4                               
+	MOVD $eia_const<>(SB), R0                                                                              \
+	VLD1 (R0), [BIT_REV_TAB_L.B16, BIT_REV_TAB_H.B16, SHUF_MASK_DW0_DW1.B16, SHUF_MASK_DW2_DW3.B16]       \
+	MOVW $0x0F0F0F0F, R0                                                                                  \
+	VDUP R0, BIT_REV_AND_TAB.S4 // macro result: the four eia_const vectors are loaded and BIT_REV_AND_TAB = 0x0F in every byte; clobbers R0

-// func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
-TEXT ·eia3Round16B(SB),NOSPLIT,$0
+#define BIT_REVERSE(XDATA, XTMP1, XTMP2) \
+	VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP2.B16 \
+	VUSHR $4, XDATA.B16, XTMP1.B16                 \
+	VTBL XTMP2.B16, [BIT_REV_TAB_H.B16], XTMP2.B16 \
+	VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16 \
+	VEOR XTMP1.B16, XTMP2.B16, XDATA.B16 // XDATA = input with the bits of every byte reversed (low nibble looked up in TAB_H, high nibble in TAB_L); clobbers XTMP1, XTMP2; needs LOAD_GLOBAL_DATA() first
+
+// func eiaRoundTag4(t *uint32, keyStream *uint32, p *byte)
+TEXT ·eiaRoundTag4(SB),NOSPLIT,$0
 	MOVD t+0(FP), AX
 	MOVD ks+8(FP), BX
 	MOVD p+16(FP), CX
-	MOVD tagSize+24(FP), DX

 	LOAD_GLOBAL_DATA()

 	// Reverse data bytes
 	VLD1 (CX), [XDATA.B16]
-	VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
-	VUSHR $4, XDATA.B16, XTMP1.B16
-
-	VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
-	VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
-	VEOR XTMP1.B16, XTMP3.B16, XTMP3.B16 // XTMP3 - bit reverse data bytes
+	BIT_REVERSE(XDATA, XTMP1, XTMP2)

 	// ZUC authentication part, 4x32 data bits
 	// setup KS
@ -72,8 +73,8 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
 	VMOV XTMP2.S[0], KS_M1.S[2] // KS bits [127:96 95:64 159:128 127:96]

 	// setup DATA
-	VTBL SHUF_MASK_DW0_DW1.B16, [XTMP3.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s]
-	VTBL SHUF_MASK_DW2_DW3.B16, [XTMP3.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s]
+	VTBL SHUF_MASK_DW0_DW1.B16, [XDATA.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s]
+	VTBL SHUF_MASK_DW2_DW3.B16, [XDATA.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s]

 	// clmul
 	// xor the results from 4 32-bit words together
@ -93,3 +94,169 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
 	MOVW R11, (AX)

 	RET
+
+// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte)
+TEXT ·eia256RoundTag8(SB),NOSPLIT,$0 // XORs a 64-bit value computed from 16 data bytes and the keystream into (t), and moves keystream bytes 16..31 to 0..15
+	MOVD t+0(FP), AX  // AX = t, 8 bytes are read-modified-written
+	MOVD ks+8(FP), BX // BX = keyStream, 32 bytes are read, the first 16 are rewritten
+	MOVD p+16(FP), CX // CX = p, 16 bytes are read
+
+	LOAD_GLOBAL_DATA()
+
+	// Load 16 data bytes and bit-reverse them
+	VLD1 (CX), [XDATA.B16]
+	BIT_REVERSE(XDATA, XTMP1, XTMP2)
+
+	// ZUC authentication part, 4x32 data bits
+	// setup KS
+	VLD1 (BX), [XTMP1.B16, XTMP2.B16]
+	VST1 [XTMP2.B16], (BX) // Copy last 16 bytes of KS to the front
+	// TODO: Any better solution???
+	VMOVQ $0x0302010007060504, $0x070605040b0a0908, XTMP4 // VTBL byte indexes selecting source words 1, 0, 2, 1
+	VTBL XTMP4.B16, [XTMP1.B16], KS_L.B16 // KS bits [63:32 31:0 95:64 63:32]
+	VTBL XTMP4.B16, [XTMP2.B16], KS_M2.B16 // KS bits [191:160 159:128 223:192 191:160]
+	VDUP XTMP1.S[3], KS_M1.S4
+	VMOV XTMP1.S[2], KS_M1.S[1]
+	VMOV XTMP2.S[0], KS_M1.S[2] // KS bits [127:96 95:64 159:128 127:96]
+	
+	// setup DATA
+	VTBL SHUF_MASK_DW0_DW1.B16, [XDATA.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s]
+	VTBL SHUF_MASK_DW2_DW3.B16, [XDATA.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s]
+
+	// clmul
+	// xor the results from 4 32-bit words together
+
+	// Calculate lower 32 bits of tag
+	VPMULL KS_L.D1, XTMP1.D1, XTMP3.Q1
+	VPMULL2 KS_L.D2, XTMP1.D2, XTMP4.Q1
+	VPMULL KS_M1.D1, XTMP2.D1, XTMP5.Q1
+	VPMULL2 KS_M1.D2, XTMP2.D2, XTMP6.Q1
+
+	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
+	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
+	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16
+
+	// Keep bits 63-32 of the sum as the first tag word (the XOR into (t) happens at the end)
+	VMOV XTMP3.S[1], XDIGEST.S[0]
+
+	// Prepare data and calculate bits 63-32 of tag
+	VEXT	$8, KS_L.B16, KS_L.B16, XTMP5.B16 // each VEXT $8 swaps the two 64-bit halves so VPMULL (low halves) can be used
+	VPMULL XTMP5.D1, XTMP1.D1, XTMP3.Q1
+	VEXT	$8, XTMP1.B16, XTMP1.B16, XTMP5.B16
+	VPMULL KS_M1.D1, XTMP5.D1, XTMP4.Q1
+	VEXT	$8, KS_M1.B16, KS_M1.B16, XTMP1.B16 // XTMP1 (data) is no longer needed and is reused as scratch
+	VPMULL XTMP1.D1, XTMP2.D1, XTMP5.Q1
+	VEXT	$8, XTMP2.B16, XTMP2.B16, XTMP1.B16
+	VPMULL KS_M2.D1, XTMP1.D1, XTMP6.Q1
+
+	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
+	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
+	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16
+
+	VMOV XTMP3.S[1], XDIGEST.S[1] // second tag word
+
+	VMOV XDIGEST.D[0], R10 // Update tag: XOR both words into (AX)
+	MOVD (AX), R11
+	EOR R10, R11
+	MOVD R11, (AX)
+
+	RET
+
+// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte)
+TEXT ·eia256RoundTag16(SB),NOSPLIT,$0 // XORs a 128-bit value computed from 16 data bytes and the keystream into (t), and moves keystream bytes 16..31 to 0..15
+	MOVD t+0(FP), AX  // AX = t, 16 bytes are read-modified-written
+	MOVD ks+8(FP), BX // BX = keyStream, 32 bytes are read, the first 16 are rewritten
+	MOVD p+16(FP), CX // CX = p, 16 bytes are read
+
+	LOAD_GLOBAL_DATA()
+
+	// Load 16 data bytes and bit-reverse them
+	VLD1 (CX), [XDATA.B16]
+	BIT_REVERSE(XDATA, XTMP1, XTMP2)
+
+	// ZUC authentication part, 4x32 data bits
+	// setup KS
+	VLD1 (BX), [XTMP1.B16, XTMP2.B16]
+	VST1 [XTMP2.B16], (BX) // Copy last 16 bytes of KS to the front
+	// TODO: Any better solution??? We can use VTBL, but there is no performance improvement if we can't reuse the MASK constant
+	VMOVQ $0x0302010007060504, $0x070605040b0a0908, XTMP4 // VTBL byte indexes selecting source words 1, 0, 2, 1
+	VTBL XTMP4.B16, [XTMP1.B16], KS_L.B16  // KS bits [63:32 31:0 95:64 63:32]
+	VTBL XTMP4.B16, [XTMP2.B16], KS_M2.B16 // KS bits [191:160 159:128 223:192 191:160]
+	VMOVQ $0x0b0a09080f0e0d0c, $0x0b0a09080f0e0d0c, XTMP4 // VTBL byte indexes selecting source words 3, 2, 3, 2
+	VTBL XTMP4.B16, [XTMP2.B16], KS_H.B16  // KS bits [255:224 223:192 255:224 223:192]
+	VDUP XTMP1.S[3], KS_M1.S4
+	VMOV XTMP1.S[2], KS_M1.S[1]
+	VMOV XTMP2.S[0], KS_M1.S[2] // KS bits [127:96 95:64 159:128 127:96]
+
+	// setup DATA
+	VTBL SHUF_MASK_DW0_DW1.B16, [XDATA.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s]
+	VTBL SHUF_MASK_DW2_DW3.B16, [XDATA.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s]
+
+	// clmul
+	// xor the results from 4 32-bit words together
+
+	// Calculate lower 32 bits of tag
+	VPMULL KS_L.D1, XTMP1.D1, XTMP3.Q1
+	VPMULL2 KS_L.D2, XTMP1.D2, XTMP4.Q1
+	VPMULL KS_M1.D1, XTMP2.D1, XTMP5.Q1
+	VPMULL2 KS_M1.D2, XTMP2.D2, XTMP6.Q1
+
+	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
+	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
+	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16
+
+	// Keep bits 63-32 of the sum as the first tag word (the XOR into (t) happens at the end)
+	VMOV XTMP3.S[1], XDIGEST.S[0]
+
+	// Prepare data and calculate bits 63-32 of tag
+	VEXT	$8, KS_L.B16, KS_L.B16, XTMP5.B16 // each VEXT $8 swaps the two 64-bit halves so VPMULL (low halves) can be used
+	VPMULL XTMP5.D1, XTMP1.D1, XTMP3.Q1
+	VEXT	$8, XTMP1.B16, XTMP1.B16, XTMP5.B16
+	VPMULL KS_M1.D1, XTMP5.D1, XTMP4.Q1
+	VEXT	$8, KS_M1.B16, KS_M1.B16, XTMP6.B16
+	VPMULL XTMP6.D1, XTMP2.D1, XTMP5.Q1
+	VEXT	$8, XTMP2.B16, XTMP2.B16, KS_L.B16 // KS_L is not read again, so it is reused as scratch; XTMP1/XTMP2 (data) stay intact
+	VPMULL KS_M2.D1, KS_L.D1, XTMP6.Q1
+
+	// XOR all the products and keep only 32-63 bits
+	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
+	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
+	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16
+
+	VMOV XTMP3.S[1], XDIGEST.S[1]
+
+	// Prepare data and calculate bits 95-64 of tag
+	VPMULL KS_M1.D1, XTMP1.D1, XTMP3.Q1
+	VPMULL2 KS_M1.D2, XTMP1.D2, XTMP4.Q1
+	VPMULL KS_M2.D1, XTMP2.D1, XTMP5.Q1
+	VPMULL2 KS_M2.D2, XTMP2.D2, XTMP6.Q1
+
+	// XOR all the products and move bits 63-32 to bits 95-64
+	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
+	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
+	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16
+
+	VMOV XTMP3.S[1], XDIGEST.S[2]
+
+	// Prepare data and calculate bits 127-96 of tag
+	VEXT	$8, KS_M1.B16, KS_M1.B16, XTMP5.B16
+	VPMULL XTMP5.D1, XTMP1.D1, XTMP3.Q1
+	VEXT	$8, XTMP1.B16, XTMP1.B16, XTMP5.B16
+	VPMULL KS_M2.D1, XTMP5.D1, XTMP4.Q1
+	VEXT	$8, KS_M2.B16, KS_M2.B16, XTMP6.B16
+	VPMULL XTMP6.D1, XTMP2.D1, XTMP5.Q1
+	VEXT	$8, XTMP2.B16, XTMP2.B16, KS_L.B16
+	VPMULL KS_H.D1, KS_L.D1, XTMP6.Q1
+
+	// XOR all the products and move bits 63-32 to bits 127-96
+	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
+	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
+	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16
+
+	VMOV XTMP3.S[1], XDIGEST.S[3]
+
+	VLD1 (AX), [XTMP1.B16] // Update tag: XOR all four words into (AX)
+	VEOR XTMP1.B16, XDIGEST.B16, XDIGEST.B16
+	VST1 [XDIGEST.B16], (AX)
+
+	RET
--- a/zuc/eia_asm_ppc64x.s
+++ b/zuc/eia_asm_ppc64x.s
@ -6,43 +6,53 @@

 #include "textflag.h"

-DATA ·rcon+0x00(SB)/8, $0x0706050403020100 // Permute for vector doubleword endian swap
-DATA ·rcon+0x08(SB)/8, $0x0f0e0d0c0b0a0908
-DATA ·rcon+0x10(SB)/8, $0x0008040c020a060e // bit_reverse_table_l
-DATA ·rcon+0x18(SB)/8, $0x0109050d030b070f // bit_reverse_table_l
-DATA ·rcon+0x20(SB)/8, $0x0000000010111213 // data mask
-DATA ·rcon+0x28(SB)/8, $0x0000000014151617 // data mask
-DATA ·rcon+0x30(SB)/8, $0x0000000018191a1b // data mask
-DATA ·rcon+0x38(SB)/8, $0x000000001c1d1e1f // data mask
-DATA ·rcon+0x40(SB)/8, $0x0405060708090a0b // ks mask
-DATA ·rcon+0x48(SB)/8, $0x0001020304050607 // ks mask
-GLOBL ·rcon(SB), RODATA, $80
+DATA eia_const<>+0x00(SB)/8, $0x0706050403020100 // Permute for vector doubleword endian swap (loaded only when GOARCH_ppc64le is not defined)
+DATA eia_const<>+0x08(SB)/8, $0x0f0e0d0c0b0a0908
+DATA eia_const<>+0x10(SB)/8, $0x0008040c020a060e // bit_reverse_table_l, first 8 bytes
+DATA eia_const<>+0x18(SB)/8, $0x0109050d030b070f // bit_reverse_table_l, last 8 bytes
+DATA eia_const<>+0x20(SB)/8, $0x0000000010111213 // data mask (first VPERM control for the data setup)
+DATA eia_const<>+0x28(SB)/8, $0x0000000014151617 // data mask
+DATA eia_const<>+0x30(SB)/8, $0x0000000018191a1b // data mask (second VPERM control for the data setup)
+DATA eia_const<>+0x38(SB)/8, $0x000000001c1d1e1f // data mask
+DATA eia_const<>+0x40(SB)/8, $0x0405060708090a0b // ks mask (VPERM control applied to each loaded keystream vector)
+DATA eia_const<>+0x48(SB)/8, $0x0001020304050607 // ks mask
+GLOBL eia_const<>(SB), RODATA, $80 // 80 bytes, read-only; the functions address it through PTR = eia_const+0x10

 #define XTMP1 V0
 #define XTMP2 V1
 #define XTMP3 V2
 #define XTMP4 V3
+#define XTMP5 V4 // extra scratch, used by eia256RoundTag16
+#define XTMP6 V5
 #define XDATA V6
 #define XDIGEST V7
 #define KS_L V8
 #define KS_M1 V9
+#define KS_M2 V10 // keystream vector loaded from byte offset 16
+#define KS_H V11 // copy of KS_M2 taken before its VPERM (eia256RoundTag16)
 #define BIT_REV_TAB_L V12
 #define BIT_REV_TAB_H V13
-
+#define ZERO V15 // set to all zero with VSPLTISB $0 before use
 #define PTR R7

-// func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
-TEXT ·eia3Round16B(SB),NOSPLIT,$0
+#define BIT_REVERSE(addr, IN, OUT, XTMP) \
+	LXVD2X (addr)(R0), BIT_REV_TAB_L     \
+	VSPLTISB $4, XTMP                    \
+	VSLB  BIT_REV_TAB_L, XTMP, BIT_REV_TAB_H  \
+	VPERMXOR BIT_REV_TAB_L, BIT_REV_TAB_H, IN, OUT // OUT = bit-reversed data bytes of IN; addr must point at bit_reverse_table_l; clobbers XTMP, BIT_REV_TAB_L, BIT_REV_TAB_H
+
+// func eiaRoundTag4(t *uint32, keyStream *uint32, p *byte)
+TEXT ·eiaRoundTag4(SB),NOSPLIT,$0
 	MOVD t+0(FP), R3
 	MOVD ks+8(FP), R4
 	MOVD p+16(FP), R5

 #ifndef GOARCH_ppc64le
-	MOVD	$·rcon(SB), PTR // PTR points to rcon addr
+	MOVD	$eia_const<>(SB), PTR
 	LVX	(PTR), XTMP1
 	ADD	$0x10, PTR
 #else
-	MOVD	$·rcon+0x10(SB), PTR // PTR points to rcon addr (skipping permute vector)
+	MOVD	$eia_const<>+0x10(SB), PTR
 #endif

 	LXVD2X (R5)(R0), XDATA
@ -50,10 +60,7 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
 	VPERM XDATA, XDATA, XTMP1, XDATA
 #endif

-	VSPLTISB $4, XTMP2;
-	LXVD2X (PTR)(R0), BIT_REV_TAB_L
-	VSLB  BIT_REV_TAB_L, XTMP2, BIT_REV_TAB_H
-	VPERMXOR BIT_REV_TAB_L, BIT_REV_TAB_H, XDATA, XTMP3 // XTMP3 - bit reverse data bytes
+	BIT_REVERSE(PTR, XDATA, XTMP3, XTMP2)

 	// ZUC authentication part, 4x32 data bits
 	// setup data
@ -95,3 +102,169 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
 	STXVD2X XTMP1, (R4)(R0)

 	RET
+
+// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte)
+TEXT ·eia256RoundTag8(SB),NOSPLIT,$0 // XORs a 64-bit value computed from 16 data bytes and the keystream into (t), then moves keystream bytes 16..31 to 0..15
+	MOVD t+0(FP), R3  // R3 = t, 8 bytes are read-modified-written
+	MOVD ks+8(FP), R4 // R4 = keyStream, 32 bytes are read, the first 16 are rewritten
+	MOVD p+16(FP), R5 // R5 = p, 16 bytes are read
+
+#ifndef GOARCH_ppc64le
+	MOVD	$eia_const<>(SB), PTR
+	LVX	(PTR), XTMP1 // XTMP1 = endian-swap permute vector
+	ADD	$0x10, PTR // PTR = eia_const+0x10 in both build variants
+#else
+	MOVD	$eia_const<>+0x10(SB), PTR
+#endif
+
+	LXVD2X (R5)(R0), XDATA
+#ifndef GOARCH_ppc64le
+	VPERM XDATA, XDATA, XTMP1, XDATA
+#endif
+
+	BIT_REVERSE(PTR, XDATA, XTMP3, XTMP2) // XTMP3 = bit-reversed data bytes
+	
+	// ZUC authentication part, 4x32 data bits
+	// setup data
+	VSPLTISB $0, ZERO
+	MOVD $0x10, R8
+	LXVD2X (PTR)(R8), XTMP4 // data mask at eia_const+0x20
+	VPERM ZERO, XTMP3, XTMP4, XTMP1
+	MOVD $0x20, R8
+	LXVD2X (PTR)(R8), XTMP4 // data mask at eia_const+0x30
+	VPERM ZERO, XTMP3, XTMP4, XTMP2
+
+	// setup KS
+	LXVW4X (R4), KS_L
+	MOVD $8, R8
+	LXVW4X (R8)(R4), KS_M1
+	MOVD $16, R8
+	LXVW4X (R8)(R4), KS_M2
+	MOVD $0x30, R8
+	LXVD2X (PTR)(R8), XTMP4 // ks mask at eia_const+0x40
+	VPERM KS_L, KS_L, XTMP4, KS_L
+	VPERM KS_M1, KS_M1, XTMP4, KS_M1
+	VPERM KS_M2, KS_M2, XTMP4, KS_M2
+
+	// clmul
+	// xor the results from 4 32-bit words together
+	// Calculate lower 32 bits of tag
+	VPMSUMD XTMP1, KS_L, XTMP3 // VPMSUMD multiplies both doubleword pairs carry-lessly and XORs the two products
+	VPMSUMD XTMP2, KS_M1, XTMP4
+	VXOR XTMP3, XTMP4, XTMP3
+	VSPLTW $2, XTMP3, XDIGEST // replicate word element 2 of the sum into every word of XDIGEST
+
+	// Calculate upper 32 bits of tag
+	VSLDOI $8, KS_M1, KS_L, KS_L // KS_L and KS_M1 are overwritten with the shifted keystream windows
+	VPMSUMD XTMP1, KS_L, XTMP3
+	VSLDOI $8, KS_M2, KS_M1, KS_M1
+	VPMSUMD XTMP2, KS_M1, XTMP4
+	VXOR XTMP3, XTMP4, XTMP3
+	VSPLTW $2, XTMP3, XTMP3
+
+	// Update tag: combine both words (order depends on the build's endianness) and XOR 64 bits into (R3)
+#ifdef GOARCH_ppc64le
+	VSLDOI $12, XTMP3, XDIGEST, XDIGEST
+#else
+	VSLDOI $12, XDIGEST, XTMP3, XDIGEST
+#endif
+	MFVSRD XDIGEST, R8
+	MOVD (R3), R6
+	XOR R6, R8, R6
+	MOVD R6, (R3)
+
+	// Copy last 16 bytes of KS to the front
+	MOVD $16, R8
+	LXVD2X (R8)(R4), XTMP1
+	STXVD2X XTMP1, (R4)(R0)
+
+	RET
+
+// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte)
+TEXT ·eia256RoundTag16(SB),NOSPLIT,$0 // XORs a 128-bit value computed from 16 data bytes and the keystream into (t), then moves keystream bytes 16..31 to 0..15
+	MOVD t+0(FP), R3  // R3 = t, 16 bytes are read-modified-written
+	MOVD ks+8(FP), R4 // R4 = keyStream, 32 bytes are read, the first 16 are rewritten
+	MOVD p+16(FP), R5 // R5 = p, 16 bytes are read
+
+#ifndef GOARCH_ppc64le
+	MOVD	$eia_const<>(SB), PTR
+	LVX	(PTR), XTMP1 // XTMP1 = endian-swap permute vector
+	ADD	$0x10, PTR // PTR = eia_const+0x10 in both build variants
+#else
+	MOVD	$eia_const<>+0x10(SB), PTR
+#endif
+
+	LXVD2X (R5)(R0), XDATA
+#ifndef GOARCH_ppc64le
+	VPERM XDATA, XDATA, XTMP1, XDATA
+#endif
+
+	BIT_REVERSE(PTR, XDATA, XTMP3, XTMP2) // XTMP3 = bit-reversed data bytes
+
+	// ZUC authentication part, 4x32 data bits
+	// setup data
+	VSPLTISB $0, ZERO
+	MOVD $0x10, R8
+	LXVD2X (PTR)(R8), XTMP4 // data mask at eia_const+0x20
+	VPERM ZERO, XTMP3, XTMP4, XTMP1
+	MOVD $0x20, R8
+	LXVD2X (PTR)(R8), XTMP4 // data mask at eia_const+0x30
+	VPERM ZERO, XTMP3, XTMP4, XTMP2
+
+	// setup KS
+	LXVW4X (R4), KS_L
+	MOVD $8, R8
+	LXVW4X (R8)(R4), KS_M1
+	MOVD $16, R8
+	LXVW4X (R8)(R4), KS_M2
+	VOR KS_M2, KS_M2, KS_H // KS_H = copy of the keystream vector at offset 16, taken before the VPERM below
+	MOVD $0x30, R8
+	LXVD2X (PTR)(R8), XTMP4 // ks mask at eia_const+0x40
+	VPERM KS_L, KS_L, XTMP4, KS_L
+	VPERM KS_M1, KS_M1, XTMP4, KS_M1
+	VPERM KS_M2, KS_M2, XTMP4, KS_M2
+
+	// clmul
+	// xor the results from 4 32-bit words together
+	// Calculate lower 32 bits of tag
+	VPMSUMD XTMP1, KS_L, XTMP3 // VPMSUMD multiplies both doubleword pairs carry-lessly and XORs the two products
+	VPMSUMD XTMP2, KS_M1, XTMP4
+	VXOR XTMP3, XTMP4, XTMP3
+	VSLDOI $12, XTMP3, XTMP3, XDIGEST
+
+	// Calculate bits 63-32 of tag
+	VSLDOI $8, KS_M1, KS_L, KS_L // KS_L is overwritten; KS_M1 and KS_M2 are still needed, so XTMP5 holds the next window
+	VPMSUMD XTMP1, KS_L, XTMP3
+	VSLDOI $8, KS_M2, KS_M1, XTMP5
+	VPMSUMD XTMP2, XTMP5, XTMP4
+	VXOR XTMP3, XTMP4, XTMP3
+	VSLDOI $8, XTMP3, XTMP3, XTMP3
+	VSLDOI $4, XDIGEST, XTMP3, XDIGEST // append the new tag word to XDIGEST
+
+	// calculate bits 95-64 of tag
+	VPMSUMD XTMP1, KS_M1, XTMP3
+	VPMSUMD XTMP2, KS_M2, XTMP4
+	VXOR XTMP3, XTMP4, XTMP3
+	VSLDOI $8, XTMP3, XTMP3, XTMP3
+	VSLDOI $4, XDIGEST, XTMP3, XDIGEST
+
+	// calculate bits 127-96 of tag
+	VSLDOI $8, KS_M2, KS_M1, KS_M1 // last use: KS_M1 and KS_M2 are overwritten in place
+	VPMSUMD XTMP1, KS_M1, XTMP3
+	VSLDOI $8, KS_H, KS_M2, KS_M2
+	VPMSUMD XTMP2, KS_M2, XTMP4
+	VXOR XTMP3, XTMP4, XTMP3
+	VSLDOI $8, XTMP3, XTMP3, XTMP3
+	VSLDOI $4, XDIGEST, XTMP3, XDIGEST
+
+	// Update tag: XOR all four words into (R3)
+	LXVW4X (R3)(R0), XTMP1
+	VXOR XTMP1, XDIGEST, XDIGEST
+	STXVW4X XDIGEST, (R3)
+
+	// Copy last 16 bytes of KS to the front
+	MOVD $16, R8
+	LXVD2X (R8)(R4), XTMP1
+	STXVD2X XTMP1, (R4)(R0)
+
+	RET