diff --git a/zuc/eia256_asm.go b/zuc/eia256_asm.go index 39c1327..a66e594 100644 --- a/zuc/eia256_asm.go +++ b/zuc/eia256_asm.go @@ -18,7 +18,7 @@ func block256(m *ZUC256Mac, p []byte) { case 16: eia256RoundTag16(&m.t[0], &m.k0[0], &p[0]) default: - eia3Round16B(&m.t[0], &m.k0[0], &p[0], m.tagSize) + eiaRoundTag4(&m.t[0], &m.k0[0], &p[0]) } p = p[chunk:] } diff --git a/zuc/eia256_asm_amd64.s b/zuc/eia256_asm_amd64.s deleted file mode 100644 index 27c88d9..0000000 --- a/zuc/eia256_asm_amd64.s +++ /dev/null @@ -1,448 +0,0 @@ -// Referenced Intel(R) Multi-Buffer Crypto for IPsec -// https://github.com/intel/intel-ipsec-mb/ -//go:build !purego - -#include "textflag.h" - -DATA bit_reverse_table_l<>+0x00(SB)/8, $0x0e060a020c040800 -DATA bit_reverse_table_l<>+0x08(SB)/8, $0x0f070b030d050901 -GLOBL bit_reverse_table_l<>(SB), RODATA, $16 - -DATA bit_reverse_table_h<>+0x00(SB)/8, $0xe060a020c0408000 -DATA bit_reverse_table_h<>+0x08(SB)/8, $0xf070b030d0509010 -GLOBL bit_reverse_table_h<>(SB), RODATA, $16 - -DATA bit_reverse_and_table<>+0x00(SB)/8, $0x0f0f0f0f0f0f0f0f -DATA bit_reverse_and_table<>+0x08(SB)/8, $0x0f0f0f0f0f0f0f0f -GLOBL bit_reverse_and_table<>(SB), RODATA, $16 - -DATA shuf_mask_dw0_0_dw1_0<>+0x00(SB)/8, $0xffffffff03020100 -DATA shuf_mask_dw0_0_dw1_0<>+0x08(SB)/8, $0xffffffff07060504 -GLOBL shuf_mask_dw0_0_dw1_0<>(SB), RODATA, $16 - -DATA shuf_mask_0_0_dw1_0<>+0x00(SB)/8, $0xffffffffffffffff -DATA shuf_mask_0_0_dw1_0<>+0x08(SB)/8, $0xffffffff07060504 -GLOBL shuf_mask_0_0_dw1_0<>(SB), RODATA, $16 - -DATA shuf_mask_0_0_0_dw1<>+0x00(SB)/8, $0xffffffffffffffff -DATA shuf_mask_0_0_0_dw1<>+0x08(SB)/8, $0x07060504ffffffff -GLOBL shuf_mask_0_0_0_dw1<>(SB), RODATA, $16 - -DATA shuf_mask_dw2_0_dw3_0<>+0x00(SB)/8, $0xffffffff0b0a0908 -DATA shuf_mask_dw2_0_dw3_0<>+0x08(SB)/8, $0xffffffff0f0e0d0c -GLOBL shuf_mask_dw2_0_dw3_0<>(SB), RODATA, $16 - -DATA bits_32_63<>+0x00(SB)/8, $0xffffffff00000000 -DATA bits_32_63<>+0x08(SB)/8, $0x0000000000000000 -GLOBL bits_32_63<>(SB), RODATA, $16 - - -#define XTMP1 X1 -#define XTMP2 X2 -#define XTMP3 X3 -#define XTMP4 X4 -#define XTMP5 X5 -#define XTMP6 X6 -#define XDATA X7 -#define XDIGEST X8 -#define KS_L X9 -#define KS_M1 X10 -#define KS_M2 X11 -#define KS_H X12 - -// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte) -TEXT ·eia256RoundTag8(SB),NOSPLIT,$0 - MOVQ t+0(FP), AX - MOVQ ks+8(FP), BX - MOVQ p+16(FP), CX - - CMPB ·useAVX(SB), $1 - JE avx - - // Reverse data bytes - MOVUPS (0)(CX), XDATA - MOVOU bit_reverse_and_table<>(SB), XTMP4 - MOVOU XDATA, XTMP2 - PAND XTMP4, XTMP2 - - PANDN XDATA, XTMP4 - PSRLQ $4, XTMP4 - - MOVOU bit_reverse_table_h<>(SB), XTMP3 - PSHUFB XTMP2, XTMP3 - - MOVOU bit_reverse_table_l<>(SB), XTMP1 - PSHUFB XTMP4, XTMP1 - - PXOR XTMP1, XTMP3 // XTMP3 - bit reverse data bytes - - // ZUC authentication part, 4x32 data bits - // setup KS - MOVUPS (0*4)(BX), XTMP1 - MOVUPS (2*4)(BX), XTMP2 - MOVUPS (4*4)(BX), XTMP4 - PSHUFD $0x61, XTMP1, KS_L // KS bits [63:32 31:0 95:64 63:32] - PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96] - PSHUFD $0x61, XTMP4, KS_M2 // KS bits [191:160 159:128 223:192 191:160] - - // setup DATA - MOVOU XTMP3, XTMP1 - PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1 - MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s] - - PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP3 - MOVOU XTMP3, XDIGEST // XDIGEST/XTMP3 - Data bits [95:64 0s 127:96 0s] - - // clmul - // xor the results from 4 32-bit words together - // Save data for following products - MOVOU XTMP2, XTMP5 // Data bits [31:0 0s 63:32 0s] 
- MOVOU XTMP3, XTMP6 // Data bits [95:64 0s 127:96 0s] - - // Calculate lower 32 bits of tag - PCLMULQDQ $0x00, KS_L, XTMP1 - PCLMULQDQ $0x11, KS_L, XTMP2 - PCLMULQDQ $0x00, KS_M1, XDIGEST - PCLMULQDQ $0x11, KS_M1, XTMP3 - - // XOR all products and move bits 63-32 bits to lower 32 bits - PXOR XTMP1, XTMP2 - PXOR XTMP3, XDIGEST - PXOR XTMP2, XDIGEST - MOVQ XDIGEST, XDIGEST // Clear top 64 bits - PSRLDQ $4, XDIGEST - - // Prepare data and calculate bits 63-32 of tag - MOVOU XTMP5, XTMP1 - MOVOU XTMP5, XTMP2 - MOVOU XTMP6, XTMP3 - MOVOU XTMP6, XTMP4 - - PCLMULQDQ $0x10, KS_L, XTMP1 - PCLMULQDQ $0x01, KS_M1, XTMP2 - PCLMULQDQ $0x10, KS_M1, XTMP3 - PCLMULQDQ $0x01, KS_M2, XTMP4 - - // XOR all the products and keep only bits 63-32 - PXOR XTMP2, XTMP1 - PXOR XTMP4, XTMP3 - PXOR XTMP3, XTMP1 - PAND bits_32_63<>(SB), XTMP1 - - // OR with lower 32 bits, to construct 64 bits of tag - POR XTMP1, XDIGEST - - // Update tag - MOVQ XDIGEST, R10 - XORQ R10, (AX) - - // Copy last 16 bytes of KS to the front - MOVUPS (4*4)(BX), XTMP1 - MOVUPS XTMP1, (0*4)(BX) - - RET - -avx: - VMOVDQU (0)(CX), XDATA - - // Reverse data bytes - VMOVDQU bit_reverse_and_table<>(SB), XTMP1 - VPAND XTMP1, XDATA, XTMP2 - VPANDN XDATA, XTMP1, XTMP3 - VPSRLD $4, XTMP3, XTMP3 - - VMOVDQU bit_reverse_table_h<>(SB), XTMP1 - VPSHUFB XTMP2, XTMP1, XTMP4 - VMOVDQU bit_reverse_table_l<>(SB), XTMP1 - VPSHUFB XTMP3, XTMP1, XTMP1 - VPOR XTMP1, XTMP4, XTMP4 - - // ZUC authentication part, 4x32 data bits - // setup KS - VPSHUFD $0x61, (0*4)(BX), KS_L // KS bits [63:32 31:0 95:64 63:32] - VPSHUFD $0x61, (2*4)(BX), KS_M1 // KS bits [63:32 31:0 95:64 63:32] - VPSHUFD $0x61, (4*4)(BX), KS_M2 // KS bits [191:160 159:128 223:192 191:160] - - // setup DATA - // Data bytes [31:0 0s 63:32 0s] - VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP4, XTMP1 - // Data bytes [95:64 0s 127:96 0s] - VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP4, XTMP2 - - - // clmul - // xor the results from 4 32-bit words together - // Calculate lower 32 bits of tag - VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3 - VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4 - VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5 - VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6 - - VPXOR XTMP3, XTMP4, XTMP3 - VPXOR XTMP5, XTMP6, XTMP5 - VPXOR XTMP3, XTMP5, XTMP3 - - // Move previous result to low 32 bits and XOR with previous digest - VMOVQ XTMP3, XTMP3 // Clear top 64 bits - VPSRLDQ $4, XTMP3, XDIGEST - - VPCLMULQDQ $0x10, KS_L, XTMP1, XTMP3 - VPCLMULQDQ $0x01, KS_M1, XTMP1, XTMP4 - VPCLMULQDQ $0x10, KS_M1, XTMP2, XTMP5 - VPCLMULQDQ $0x01, KS_M2, XTMP2, XTMP6 - - // XOR all the products and keep only 32-63 bits - VPXOR XTMP4, XTMP3, XTMP3 - VPXOR XTMP6, XTMP5, XTMP5 - VPXOR XTMP5, XTMP3, XTMP3 - VPAND bits_32_63<>(SB), XTMP3, XTMP3 - - // XOR with bits 32-63 of previous digest - VPXOR XTMP3, XDIGEST, XDIGEST - - // Update tag - VMOVQ XDIGEST, R10 - XORQ R10, (AX) - - // Copy last 16 bytes of KS to the front - VMOVDQU (4*4)(BX), XTMP1 - VMOVDQU XTMP1, (0*4)(BX) - - VZEROUPPER - RET - -// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte) -TEXT ·eia256RoundTag16(SB),NOSPLIT,$0 - MOVQ t+0(FP), AX - MOVQ ks+8(FP), BX - MOVQ p+16(FP), CX - - CMPB ·useAVX(SB), $1 - JE avx - - // Reverse data bytes - MOVUPS (0)(CX), XDATA - MOVOU bit_reverse_and_table<>(SB), XTMP4 - MOVOU XDATA, XTMP2 - PAND XTMP4, XTMP2 - - PANDN XDATA, XTMP4 - PSRLQ $4, XTMP4 - - MOVOU bit_reverse_table_h<>(SB), XTMP3 - PSHUFB XTMP2, XTMP3 - - MOVOU bit_reverse_table_l<>(SB), XTMP1 - PSHUFB XTMP4, XTMP1 - - PXOR XTMP1, XTMP3 // XTMP3 - bit reverse data bytes - - // ZUC 
authentication part, 4x32 data bits - // setup KS - MOVUPS (0*4)(BX), XTMP1 - MOVUPS (2*4)(BX), XTMP2 - MOVUPS (4*4)(BX), XTMP4 - PSHUFD $0x61, XTMP1, KS_L // KS bits [63:32 31:0 95:64 63:32] - PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96] - PSHUFD $0x61, XTMP4, KS_M2 // KS bits [191:160 159:128 223:192 191:160] - PSHUFD $0xBB, XTMP4, KS_H // KS bits [255:224 223:192 255:224 223:192] - - // setup DATA - MOVOU XTMP3, XTMP1 - PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1 - MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s] - - PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP3 - MOVOU XTMP3, XDIGEST // XDIGEST/XTMP3 - Data bits [95:64 0s 127:96 0s] - - // clmul - // xor the results from 4 32-bit words together - // Save data for following products - MOVOU XTMP2, XTMP5 // Data bits [31:0 0s 63:32 0s] - MOVOU XTMP3, XTMP6 // Data bits [95:64 0s 127:96 0s] - - // Calculate lower 32 bits of tag - PCLMULQDQ $0x00, KS_L, XTMP1 - PCLMULQDQ $0x11, KS_L, XTMP2 - PCLMULQDQ $0x00, KS_M1, XDIGEST - PCLMULQDQ $0x11, KS_M1, XTMP3 - - // XOR all products and move bits 63-32 bits to lower 32 bits - PXOR XTMP1, XTMP2 - PXOR XTMP3, XDIGEST - PXOR XTMP2, XDIGEST - MOVQ XDIGEST, XDIGEST // Clear top 64 bits - PSRLDQ $4, XDIGEST - - // Prepare data and calculate bits 63-32 of tag - MOVOU XTMP5, XTMP1 - MOVOU XTMP5, XTMP2 - MOVOU XTMP6, XTMP3 - MOVOU XTMP6, XTMP4 - - PCLMULQDQ $0x10, KS_L, XTMP1 - PCLMULQDQ $0x01, KS_M1, XTMP2 - PCLMULQDQ $0x10, KS_M1, XTMP3 - PCLMULQDQ $0x01, KS_M2, XTMP4 - - // XOR all the products and keep only bits 63-32 - PXOR XTMP2, XTMP1 - PXOR XTMP4, XTMP3 - PXOR XTMP3, XTMP1 - PAND bits_32_63<>(SB), XTMP1 - - // OR with lower 32 bits, to construct 64 bits of tag - POR XTMP1, XDIGEST - - // Prepare data and calculate bits 95-64 of tag - MOVOU XTMP5, XTMP1 - MOVOU XTMP5, XTMP2 - MOVOU XTMP6, XTMP3 - MOVOU XTMP6, XTMP4 - - PCLMULQDQ $0x00, KS_M1, XTMP1 - PCLMULQDQ $0x11, KS_M1, XTMP2 - PCLMULQDQ $0x00, KS_M2, XTMP3 - PCLMULQDQ $0x11, KS_M2, XTMP4 - - // XOR all the products and move bits 63-32 to bits 95-64 - PXOR XTMP2, XTMP1 - PXOR XTMP4, XTMP3 - PXOR XTMP3, XTMP1 - PSHUFB shuf_mask_0_0_dw1_0<>(SB), XTMP1 - - // OR with lower 64 bits, to construct 96 bits of tag - POR XTMP1, XDIGEST - - // Prepare data and calculate bits 127-96 of tag - MOVOU XTMP5, XTMP1 - MOVOU XTMP5, XTMP2 - MOVOU XTMP6, XTMP3 - MOVOU XTMP6, XTMP4 - - PCLMULQDQ $0x10, KS_M1, XTMP1 - PCLMULQDQ $0x01, KS_M2, XTMP2 - PCLMULQDQ $0x10, KS_M2, XTMP3 - PCLMULQDQ $0x01, KS_H, XTMP4 - - // XOR all the products and move bits 63-32 to bits 127-96 - PXOR XTMP2, XTMP1 - PXOR XTMP4, XTMP3 - PXOR XTMP3, XTMP1 - PSHUFB shuf_mask_0_0_0_dw1<>(SB), XTMP1 - - // OR with lower 96 bits, to construct 128 bits of tag - POR XTMP1, XDIGEST - - // Update tag - MOVUPS (AX), XTMP1 - PXOR XTMP1, XDIGEST - MOVUPS XDIGEST, (AX) - - // Copy last 16 bytes of KS to the front - MOVUPS (4*4)(BX), XTMP1 - MOVUPS XTMP1, (0*4)(BX) - - RET - -avx: - VMOVDQU (0)(CX), XDATA - - // Reverse data bytes - VMOVDQU bit_reverse_and_table<>(SB), XTMP1 - VPAND XTMP1, XDATA, XTMP2 - VPANDN XDATA, XTMP1, XTMP3 - VPSRLD $4, XTMP3, XTMP3 - - VMOVDQU bit_reverse_table_h<>(SB), XTMP1 - VPSHUFB XTMP2, XTMP1, XTMP4 - VMOVDQU bit_reverse_table_l<>(SB), XTMP1 - VPSHUFB XTMP3, XTMP1, XTMP1 - VPOR XTMP1, XTMP4, XTMP4 - - // ZUC authentication part, 4x32 data bits - // setup KS - VPSHUFD $0x61, (0*4)(BX), KS_L // KS bits [63:32 31:0 95:64 63:32] - VPSHUFD $0x61, (2*4)(BX), KS_M1 // KS bits [63:32 31:0 95:64 63:32] - VPSHUFD $0x61, (4*4)(BX), KS_M2 // KS bits 
[191:160 159:128 223:192 191:160] - VPSHUFD $0xBB, (4*4)(BX), KS_H // KS bits [255:224 223:192 255:224 223:192] - - // setup DATA - // Data bytes [31:0 0s 63:32 0s] - VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP4, XTMP1 - // Data bytes [95:64 0s 127:96 0s] - VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP4, XTMP2 - - - // clmul - // xor the results from 4 32-bit words together - // Calculate lower 32 bits of tag - VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3 - VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4 - VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5 - VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6 - - VPXOR XTMP3, XTMP4, XTMP3 - VPXOR XTMP5, XTMP6, XTMP5 - VPXOR XTMP3, XTMP5, XTMP3 - - // Move previous result to low 32 bits and XOR with previous digest - VMOVQ XTMP3, XTMP3 // Clear top 64 bits - VPSRLDQ $4, XTMP3, XDIGEST - - VPCLMULQDQ $0x10, KS_L, XTMP1, XTMP3 - VPCLMULQDQ $0x01, KS_M1, XTMP1, XTMP4 - VPCLMULQDQ $0x10, KS_M1, XTMP2, XTMP5 - VPCLMULQDQ $0x01, KS_M2, XTMP2, XTMP6 - - // XOR all the products and keep only 32-63 bits - VPXOR XTMP4, XTMP3, XTMP3 - VPXOR XTMP6, XTMP5, XTMP5 - VPXOR XTMP5, XTMP3, XTMP3 - VPAND bits_32_63<>(SB), XTMP3, XTMP3 - - // XOR with bits 32-63 of previous digest - VPXOR XTMP3, XDIGEST, XDIGEST - - // Prepare data and calculate bits 95-64 of tag - VPCLMULQDQ $0x00, KS_M1, XTMP1, XTMP3 - VPCLMULQDQ $0x11, KS_M1, XTMP1, XTMP4 - VPCLMULQDQ $0x00, KS_M2, XTMP2, XTMP5 - VPCLMULQDQ $0x11, KS_M2, XTMP2, XTMP6 - - // XOR all the products and move bits 63-32 to bits 95-64 - VPXOR XTMP4, XTMP3, XTMP3 - VPXOR XTMP6, XTMP5, XTMP5 - VPXOR XTMP5, XTMP3, XTMP3 - - VPSHUFB shuf_mask_0_0_dw1_0<>(SB), XTMP3, XTMP3 - - // XOR with previous bits 64-95 of previous digest - VPXOR XTMP3, XDIGEST, XDIGEST - - // Prepare data and calculate bits 127-96 of tag - VPCLMULQDQ $0x10, KS_M1, XTMP1, XTMP3 - VPCLMULQDQ $0x01, KS_M2, XTMP1, XTMP4 - VPCLMULQDQ $0x10, KS_M2, XTMP2, XTMP5 - VPCLMULQDQ $0x01, KS_H, XTMP2, XTMP6 - - // XOR all the products and move bits 63-32 to bits 127-96 - VPXOR XTMP4, XTMP3, XTMP3 - VPXOR XTMP6, XTMP5, XTMP5 - VPXOR XTMP5, XTMP3, XTMP3 - - VPSHUFB shuf_mask_0_0_0_dw1<>(SB), XTMP3, XTMP3 - - // XOR with previous bits 64-95 of previous digest - VPXOR XTMP3, XDIGEST, XDIGEST - - // Update tag - VPXOR (AX), XDIGEST, XDIGEST - VMOVDQA XDIGEST, (AX) - - // Copy last 16 bytes of KS to the front - VMOVDQU (4*4)(BX), XTMP1 - VMOVDQU XTMP1, (0*4)(BX) - - VZEROUPPER - RET diff --git a/zuc/eia256_asm_arm64.s b/zuc/eia256_asm_arm64.s deleted file mode 100644 index e67e9f3..0000000 --- a/zuc/eia256_asm_arm64.s +++ /dev/null @@ -1,208 +0,0 @@ -//go:build !purego - -#include "textflag.h" - -#define AX R2 -#define BX R3 -#define CX R4 -#define DX R5 - -#define XTMP1 V1 -#define XTMP2 V2 -#define XTMP3 V3 -#define XTMP4 V4 -#define XTMP5 V5 -#define XTMP6 V6 -#define XDATA V7 -#define XDIGEST V8 -#define KS_L V9 -#define KS_M1 V10 -#define KS_M2 V11 -#define KS_H V12 -#define BIT_REV_AND_TAB V20 -#define BIT_REV_TAB_L V21 -#define BIT_REV_TAB_H V22 -#define SHUF_MASK_DW0_DW1 V23 -#define SHUF_MASK_DW2_DW3 V24 - -#define LOAD_GLOBAL_DATA() \ - MOVD $·eia_const(SB), R0 \ - VLD1 (R0), [BIT_REV_TAB_L.B16, BIT_REV_TAB_H.B16, SHUF_MASK_DW0_DW1.B16, SHUF_MASK_DW2_DW3.B16] \ - MOVW $0x0F0F0F0F, R0 \ - VDUP R0, BIT_REV_AND_TAB.S4 - -// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte) -TEXT ·eia256RoundTag8(SB),NOSPLIT,$0 - MOVD t+0(FP), AX - MOVD ks+8(FP), BX - MOVD p+16(FP), CX - - LOAD_GLOBAL_DATA() - - // Reverse data bytes - VLD1 (CX), [XDATA.B16] - VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16 - VUSHR $4, 
XDATA.B16, XTMP1.B16 - - VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16 - VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16 - VEOR XTMP1.B16, XTMP3.B16, XTMP3.B16 // XTMP3 - bit reverse data bytes - - // ZUC authentication part, 4x32 data bits - // setup KS - VLD1 (BX), [XTMP1.B16, XTMP2.B16] - VST1 [XTMP2.B16], (BX) // Copy last 16 bytes of KS to the front - // TODO: Any better solution??? - VMOVQ $0x0302010007060504, $0x070605040b0a0908, XTMP4 - VTBL XTMP4.B16, [XTMP1.B16], KS_L.B16 // KS bits [63:32 31:0 95:64 63:32] - VTBL XTMP4.B16, [XTMP2.B16], KS_M2.B16 // KS bits [191:160 159:128 223:192 191:160] - VDUP XTMP1.S[3], KS_M1.S4 - VMOV XTMP1.S[2], KS_M1.S[1] - VMOV XTMP2.S[0], KS_M1.S[2] // KS bits [127:96 95:64 159:128 127:96] - - // setup DATA - VTBL SHUF_MASK_DW0_DW1.B16, [XTMP3.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s] - VTBL SHUF_MASK_DW2_DW3.B16, [XTMP3.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s] - - // clmul - // xor the results from 4 32-bit words together - - // Calculate lower 32 bits of tag - VPMULL KS_L.D1, XTMP1.D1, XTMP3.Q1 - VPMULL2 KS_L.D2, XTMP1.D2, XTMP4.Q1 - VPMULL KS_M1.D1, XTMP2.D1, XTMP5.Q1 - VPMULL2 KS_M1.D2, XTMP2.D2, XTMP6.Q1 - - VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16 - VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16 - VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16 - - // Move previous result to low 32 bits and XOR with previous digest - VMOV XTMP3.S[1], XDIGEST.S[0] - - // Prepare data and calculate bits 63-32 of tag - VEXT $8, KS_L.B16, KS_L.B16, XTMP5.B16 - VPMULL XTMP5.D1, XTMP1.D1, XTMP3.Q1 - VEXT $8, XTMP1.B16, XTMP1.B16, XTMP5.B16 - VPMULL KS_M1.D1, XTMP5.D1, XTMP4.Q1 - VEXT $8, KS_M1.B16, KS_M1.B16, XTMP1.B16 - VPMULL XTMP1.D1, XTMP2.D1, XTMP5.Q1 - VEXT $8, XTMP2.B16, XTMP2.B16, XTMP1.B16 - VPMULL KS_M2.D1, XTMP1.D1, XTMP6.Q1 - - VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16 - VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16 - VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16 - - VMOV XTMP3.S[1], XDIGEST.S[1] - - VMOV XDIGEST.D[0], R10 - MOVD (AX), R11 - EOR R10, R11 - MOVD R11, (AX) - - RET - -// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte) -TEXT ·eia256RoundTag16(SB),NOSPLIT,$0 - MOVD t+0(FP), AX - MOVD ks+8(FP), BX - MOVD p+16(FP), CX - - LOAD_GLOBAL_DATA() - - // Reverse data bytes - VLD1 (CX), [XDATA.B16] - VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16 - VUSHR $4, XDATA.B16, XTMP1.B16 - - VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16 - VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16 - VEOR XTMP1.B16, XTMP3.B16, XTMP3.B16 // XTMP3 - bit reverse data bytes - - // ZUC authentication part, 4x32 data bits - // setup KS - VLD1 (BX), [XTMP1.B16, XTMP2.B16] - VST1 [XTMP2.B16], (BX) // Copy last 16 bytes of KS to the front - // TODO: Any better solution??? 
We can use VTBL, but there are no performance imprvoement if we can't reuse MASK constant - VMOVQ $0x0302010007060504, $0x070605040b0a0908, XTMP4 - VTBL XTMP4.B16, [XTMP1.B16], KS_L.B16 // KS bits [63:32 31:0 95:64 63:32] - VTBL XTMP4.B16, [XTMP2.B16], KS_M2.B16 // KS bits [191:160 159:128 223:192 191:160] - VMOVQ $0x0b0a09080f0e0d0c, $0x0b0a09080f0e0d0c, XTMP4 - VTBL XTMP4.B16, [XTMP2.B16], KS_H.B16 // KS bits [255:224 223:192 255:224 223:192] - VDUP XTMP1.S[3], KS_M1.S4 - VMOV XTMP1.S[2], KS_M1.S[1] - VMOV XTMP2.S[0], KS_M1.S[2] // KS bits [127:96 95:64 159:128 127:96] - - // setup DATA - VTBL SHUF_MASK_DW0_DW1.B16, [XTMP3.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s] - VTBL SHUF_MASK_DW2_DW3.B16, [XTMP3.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s] - - // clmul - // xor the results from 4 32-bit words together - - // Calculate lower 32 bits of tag - VPMULL KS_L.D1, XTMP1.D1, XTMP3.Q1 - VPMULL2 KS_L.D2, XTMP1.D2, XTMP4.Q1 - VPMULL KS_M1.D1, XTMP2.D1, XTMP5.Q1 - VPMULL2 KS_M1.D2, XTMP2.D2, XTMP6.Q1 - - VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16 - VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16 - VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16 - - // Move previous result to low 32 bits and XOR with previous digest - VMOV XTMP3.S[1], XDIGEST.S[0] - - // Prepare data and calculate bits 63-32 of tag - VEXT $8, KS_L.B16, KS_L.B16, XTMP5.B16 - VPMULL XTMP5.D1, XTMP1.D1, XTMP3.Q1 - VEXT $8, XTMP1.B16, XTMP1.B16, XTMP5.B16 - VPMULL KS_M1.D1, XTMP5.D1, XTMP4.Q1 - VEXT $8, KS_M1.B16, KS_M1.B16, XTMP6.B16 - VPMULL XTMP6.D1, XTMP2.D1, XTMP5.Q1 - VEXT $8, XTMP2.B16, XTMP2.B16, KS_L.B16 - VPMULL KS_M2.D1, KS_L.D1, XTMP6.Q1 - - // XOR all the products and keep only 32-63 bits - VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16 - VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16 - VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16 - - VMOV XTMP3.S[1], XDIGEST.S[1] - - // Prepare data and calculate bits 95-64 of tag - VPMULL KS_M1.D1, XTMP1.D1, XTMP3.Q1 - VPMULL2 KS_M1.D2, XTMP1.D2, XTMP4.Q1 - VPMULL KS_M2.D1, XTMP2.D1, XTMP5.Q1 - VPMULL2 KS_M2.D2, XTMP2.D2, XTMP6.Q1 - - // XOR all the products and move bits 63-32 to bits 95-64 - VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16 - VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16 - VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16 - - VMOV XTMP3.S[1], XDIGEST.S[2] - - // Prepare data and calculate bits 127-96 of tag - VEXT $8, KS_M1.B16, KS_M1.B16, XTMP5.B16 - VPMULL XTMP5.D1, XTMP1.D1, XTMP3.Q1 - VEXT $8, XTMP1.B16, XTMP1.B16, XTMP5.B16 - VPMULL KS_M2.D1, XTMP5.D1, XTMP4.Q1 - VEXT $8, KS_M2.B16, KS_M2.B16, XTMP6.B16 - VPMULL XTMP6.D1, XTMP2.D1, XTMP5.Q1 - VEXT $8, XTMP2.B16, XTMP2.B16, KS_L.B16 - VPMULL KS_H.D1, KS_L.D1, XTMP6.Q1 - - // XOR all the products and move bits 63-32 to bits 127-96 - VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16 - VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16 - VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16 - - VMOV XTMP3.S[1], XDIGEST.S[3] - - VLD1 (AX), [XTMP1.B16] - VEOR XTMP1.B16, XDIGEST.B16, XDIGEST.B16 - VST1 [XDIGEST.B16], (AX) - - RET diff --git a/zuc/eia256_asm_ppc64x.s b/zuc/eia256_asm_ppc64x.s deleted file mode 100644 index 722d37a..0000000 --- a/zuc/eia256_asm_ppc64x.s +++ /dev/null @@ -1,196 +0,0 @@ -// Copyright 2024 Sun Yimin. All rights reserved. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. 
- -//go:build (ppc64 || ppc64le) && !purego - -#include "textflag.h" - -#define XTMP1 V0 -#define XTMP2 V1 -#define XTMP3 V2 -#define XTMP4 V3 -#define XTMP5 V4 -#define XTMP6 V5 -#define XDATA V6 -#define XDIGEST V7 -#define KS_L V8 -#define KS_M1 V9 -#define KS_M2 V10 -#define KS_H V11 -#define BIT_REV_TAB_L V12 -#define BIT_REV_TAB_H V13 -#define ZERO V15 -#define PTR R7 - -// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte) -TEXT ·eia256RoundTag8(SB),NOSPLIT,$0 - MOVD t+0(FP), R3 - MOVD ks+8(FP), R4 - MOVD p+16(FP), R5 - -#ifndef GOARCH_ppc64le - MOVD $·rcon(SB), PTR // PTR points to rcon addr - LVX (PTR), XTMP1 - ADD $0x10, PTR -#else - MOVD $·rcon+0x10(SB), PTR // PTR points to rcon addr (skipping permute vector) -#endif - - LXVD2X (R5)(R0), XDATA -#ifndef GOARCH_ppc64le - VPERM XDATA, XDATA, XTMP1, XDATA -#endif - - VSPLTISB $4, XTMP2; - LXVD2X (PTR)(R0), BIT_REV_TAB_L - VSLB BIT_REV_TAB_L, XTMP2, BIT_REV_TAB_H - VPERMXOR BIT_REV_TAB_L, BIT_REV_TAB_H, XDATA, XTMP3 // XTMP3 - bit reverse data bytes - - // ZUC authentication part, 4x32 data bits - // setup data - VSPLTISB $0, ZERO - MOVD $0x10, R8 - LXVD2X (PTR)(R8), XTMP4 - VPERM ZERO, XTMP3, XTMP4, XTMP1 - MOVD $0x20, R8 - LXVD2X (PTR)(R8), XTMP4 - VPERM ZERO, XTMP3, XTMP4, XTMP2 - - // setup KS - LXVW4X (R4), KS_L - MOVD $8, R8 - LXVW4X (R8)(R4), KS_M1 - MOVD $16, R8 - LXVW4X (R8)(R4), KS_M2 - MOVD $0x30, R8 - LXVD2X (PTR)(R8), XTMP4 - VPERM KS_L, KS_L, XTMP4, KS_L - VPERM KS_M1, KS_M1, XTMP4, KS_M1 - VPERM KS_M2, KS_M2, XTMP4, KS_M2 - - // clmul - // xor the results from 4 32-bit words together - // Calculate lower 32 bits of tag - VPMSUMD XTMP1, KS_L, XTMP3 - VPMSUMD XTMP2, KS_M1, XTMP4 - VXOR XTMP3, XTMP4, XTMP3 - VSPLTW $2, XTMP3, XDIGEST - - // Calculate upper 32 bits of tag - VSLDOI $8, KS_M1, KS_L, KS_L - VPMSUMD XTMP1, KS_L, XTMP3 - VSLDOI $8, KS_M2, KS_M1, KS_M1 - VPMSUMD XTMP2, KS_M1, XTMP4 - VXOR XTMP3, XTMP4, XTMP3 - VSPLTW $2, XTMP3, XTMP3 - - // Update tag -#ifdef GOARCH_ppc64le - VSLDOI $12, XTMP3, XDIGEST, XDIGEST -#else - VSLDOI $12, XDIGEST, XTMP3, XDIGEST -#endif - MFVSRD XDIGEST, R8 - MOVD (R3), R6 - XOR R6, R8, R6 - MOVD R6, (R3) - - // Copy last 16 bytes of KS to the front - MOVD $16, R8 - LXVD2X (R8)(R4), XTMP1 - STXVD2X XTMP1, (R4)(R0) - - RET - -// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte) -TEXT ·eia256RoundTag16(SB),NOSPLIT,$0 - MOVD t+0(FP), R3 - MOVD ks+8(FP), R4 - MOVD p+16(FP), R5 - -#ifndef GOARCH_ppc64le - MOVD $·rcon(SB), PTR // PTR points to rcon addr - LVX (PTR), XTMP1 - ADD $0x10, PTR -#else - MOVD $·rcon+0x10(SB), PTR // PTR points to rcon addr (skipping permute vector) -#endif - - LXVD2X (R5)(R0), XDATA -#ifndef GOARCH_ppc64le - VPERM XDATA, XDATA, XTMP1, XDATA -#endif - - VSPLTISB $4, XTMP2; - LXVD2X (PTR)(R0), BIT_REV_TAB_L - VSLB BIT_REV_TAB_L, XTMP2, BIT_REV_TAB_H - VPERMXOR BIT_REV_TAB_L, BIT_REV_TAB_H, XDATA, XTMP3 // XTMP3 - bit reverse data bytes - - // ZUC authentication part, 4x32 data bits - // setup data - VSPLTISB $0, ZERO - MOVD $0x10, R8 - LXVD2X (PTR)(R8), XTMP4 - VPERM ZERO, XTMP3, XTMP4, XTMP1 - MOVD $0x20, R8 - LXVD2X (PTR)(R8), XTMP4 - VPERM ZERO, XTMP3, XTMP4, XTMP2 - - // setup KS - LXVW4X (R4), KS_L - MOVD $8, R8 - LXVW4X (R8)(R4), KS_M1 - MOVD $16, R8 - LXVW4X (R8)(R4), KS_M2 - VOR KS_M2, KS_M2, KS_H - MOVD $0x30, R8 - LXVD2X (PTR)(R8), XTMP4 - VPERM KS_L, KS_L, XTMP4, KS_L - VPERM KS_M1, KS_M1, XTMP4, KS_M1 - VPERM KS_M2, KS_M2, XTMP4, KS_M2 - - // clmul - // xor the results from 4 32-bit words together - // Calculate lower 32 bits of tag - 
VPMSUMD XTMP1, KS_L, XTMP3 - VPMSUMD XTMP2, KS_M1, XTMP4 - VXOR XTMP3, XTMP4, XTMP3 - VSLDOI $12, XTMP3, XTMP3, XDIGEST - - // Calculate upper 32 bits of tag - VSLDOI $8, KS_M1, KS_L, KS_L - VPMSUMD XTMP1, KS_L, XTMP3 - VSLDOI $8, KS_M2, KS_M1, XTMP5 - VPMSUMD XTMP2, XTMP5, XTMP4 - VXOR XTMP3, XTMP4, XTMP3 - VSLDOI $8, XTMP3, XTMP3, XTMP3 - VSLDOI $4, XDIGEST, XTMP3, XDIGEST - - // calculate bits 95-64 of tag - VPMSUMD XTMP1, KS_M1, XTMP3 - VPMSUMD XTMP2, KS_M2, XTMP4 - VXOR XTMP3, XTMP4, XTMP3 - VSLDOI $8, XTMP3, XTMP3, XTMP3 - VSLDOI $4, XDIGEST, XTMP3, XDIGEST - - // calculate bits 127-96 of tag - VSLDOI $8, KS_M2, KS_M1, KS_M1 - VPMSUMD XTMP1, KS_M1, XTMP3 - VSLDOI $8, KS_H, KS_M2, KS_M2 - VPMSUMD XTMP2, KS_M2, XTMP4 - VXOR XTMP3, XTMP4, XTMP3 - VSLDOI $8, XTMP3, XTMP3, XTMP3 - VSLDOI $4, XDIGEST, XTMP3, XDIGEST - - // Update tag - LXVW4X (R3)(R0), XTMP1 - VXOR XTMP1, XDIGEST, XDIGEST - STXVW4X XDIGEST, (R3) - - // Copy last 16 bytes of KS to the front - MOVD $16, R8 - LXVD2X (R8)(R4), XTMP1 - STXVD2X XTMP1, (R4)(R0) - - RET diff --git a/zuc/eia_asm.go b/zuc/eia_asm.go index df321dc..fb06888 100644 --- a/zuc/eia_asm.go +++ b/zuc/eia_asm.go @@ -9,13 +9,13 @@ import ( var supportsGFMUL = cpuid.HasGFMUL || cpuid.HasVPMSUMD //go:noescape -func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int) +func eiaRoundTag4(t *uint32, keyStream *uint32, p *byte) func block(m *ZUC128Mac, p []byte) { if supportsGFMUL { for len(p) >= chunk { m.genKeywords(m.k0[4:]) - eia3Round16B(&m.t, &m.k0[0], &p[0], m.tagSize) + eiaRoundTag4(&m.t, &m.k0[0], &p[0]) p = p[chunk:] } } else { diff --git a/zuc/eia_asm_amd64.s b/zuc/eia_asm_amd64.s index dd1d9d4..375e390 100644 --- a/zuc/eia_asm_amd64.s +++ b/zuc/eia_asm_amd64.s @@ -37,32 +37,41 @@ GLOBL shuf_mask_dw2_0_dw3_0<>(SB), RODATA, $16 #define KS_M2 X11 #define KS_H X12 -// func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int) -TEXT ·eia3Round16B(SB),NOSPLIT,$0 +#define BIT_REVERSE_SSE(XDATA, XTMP1, XTMP2) \ + MOVOU bit_reverse_and_table<>(SB), XTMP1; \ + MOVOU XDATA, XTMP2; \ + PAND XTMP1, XTMP2; \ + PANDN XDATA, XTMP1; \ + PSRLQ $4, XTMP1; \ + MOVOU bit_reverse_table_h<>(SB), XDATA; \ + PSHUFB XTMP2, XDATA; \ + MOVOU bit_reverse_table_l<>(SB), XTMP2; \ + PSHUFB XTMP1, XTMP2; \ + PXOR XTMP2, XDATA + +#define BIT_REVERSE_AVX(XDATA, XTMP1, XTMP2) \ + VMOVDQU bit_reverse_and_table<>(SB), XTMP1; \ + VPAND XTMP1, XDATA, XTMP2; \ + VPANDN XDATA, XTMP1, XTMP1; \ + VPSRLD $4, XTMP1, XTMP1; \ + VMOVDQU bit_reverse_table_h<>(SB), XDATA; \ + VPSHUFB XTMP2, XDATA, XDATA; \ + VMOVDQU bit_reverse_table_l<>(SB), XTMP2; \ + VPSHUFB XTMP1, XTMP2, XTMP1; \ + VPOR XTMP1, XDATA, XDATA + +// func eiaRoundTag4(t *uint32, keyStream *uint32, p *byte) +TEXT ·eiaRoundTag4(SB),NOSPLIT,$0 MOVQ t+0(FP), AX MOVQ ks+8(FP), BX MOVQ p+16(FP), CX - MOVQ tagSize+24(FP), DX CMPB ·useAVX(SB), $1 JE avx // Reverse data bytes MOVUPS (0)(CX), XDATA - MOVOU bit_reverse_and_table<>(SB), XTMP4 - MOVOU XDATA, XTMP2 - PAND XTMP4, XTMP2 - - PANDN XDATA, XTMP4 - PSRLQ $4, XTMP4 - - MOVOU bit_reverse_table_h<>(SB), XTMP3 - PSHUFB XTMP2, XTMP3 - - MOVOU bit_reverse_table_l<>(SB), XTMP1 - PSHUFB XTMP4, XTMP1 - - PXOR XTMP1, XTMP3 // XTMP3 - bit reverse data bytes + BIT_REVERSE_SSE(XDATA, XTMP1, XTMP2) // ZUC authentication part, 4x32 data bits // setup KS @@ -72,12 +81,12 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0 PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96] // setup DATA - MOVOU XTMP3, XTMP1 + MOVOU XDATA, XTMP1 PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1 MOVOU XTMP1, 
XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s] - PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP3 - MOVOU XTMP3, XDIGEST // XDIGEST/XTMP3 - Data bits [95:64 0s 127:96 0s] + PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA + MOVOU XDATA, XDIGEST // XDIGEST/XDATA - Data bits [95:64 0s 127:96 0s] // clmul // xor the results from 4 32-bit words together @@ -85,11 +94,11 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0 PCLMULQDQ $0x00, KS_L, XTMP1 PCLMULQDQ $0x11, KS_L, XTMP2 PCLMULQDQ $0x00, KS_M1, XDIGEST - PCLMULQDQ $0x11, KS_M1, XTMP3 + PCLMULQDQ $0x11, KS_M1, XDATA // XOR all products and move 32-bits to lower 32 bits PXOR XTMP1, XTMP2 - PXOR XTMP3, XDIGEST + PXOR XDATA, XDIGEST PXOR XTMP2, XDIGEST PSRLDQ $4, XDIGEST @@ -105,18 +114,8 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0 avx: VMOVDQU (0)(CX), XDATA - // Reverse data bytes - VMOVDQU bit_reverse_and_table<>(SB), XTMP1 - VPAND XTMP1, XDATA, XTMP2 - VPANDN XDATA, XTMP1, XTMP3 - VPSRLD $4, XTMP3, XTMP3 - - VMOVDQU bit_reverse_table_h<>(SB), XTMP1 - VPSHUFB XTMP2, XTMP1, XTMP4 - VMOVDQU bit_reverse_table_l<>(SB), XTMP1 - VPSHUFB XTMP3, XTMP1, XTMP1 - VPOR XTMP1, XTMP4, XTMP4 + BIT_REVERSE_AVX(XDATA, XTMP1, XTMP2) // ZUC authentication part, 4x32 data bits // setup KS @@ -125,9 +124,9 @@ avx: // setup DATA // Data bytes [31:0 0s 63:32 0s] - VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP4, XTMP1 + VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XDATA, XTMP1 // Data bytes [95:64 0s 127:96 0s] - VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP4, XTMP2 + VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA, XTMP2 // clmul // xor the results from 4 32-bit words together @@ -140,14 +139,339 @@ avx: VPXOR XTMP3, XTMP4, XTMP3 VPXOR XTMP5, XTMP6, XTMP5 VPXOR XTMP3, XTMP5, XDIGEST + VPSLLDQ $4, XDIGEST, XDIGEST - VMOVQ XDIGEST, R10 - SHRQ $32, R10 + // Update tag + MOVL XDIGEST, R10 XORL R10, (AX) // Copy last 16 bytes of KS to the front VMOVDQU (4*4)(BX), XTMP1 VMOVDQU XTMP1, (0*4)(BX) - VZEROUPPER + RET + +// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte) +TEXT ·eia256RoundTag8(SB),NOSPLIT,$0 + MOVQ t+0(FP), AX + MOVQ ks+8(FP), BX + MOVQ p+16(FP), CX + + CMPB ·useAVX(SB), $1 + JE avx + + // Reverse data bytes + MOVUPS (0)(CX), XDATA + BIT_REVERSE_SSE(XDATA, XTMP1, XTMP2) + + // ZUC authentication part, 4x32 data bits + // setup KS + MOVUPS (0*4)(BX), XTMP1 + MOVUPS (2*4)(BX), XTMP2 + MOVUPS (4*4)(BX), XTMP4 + PSHUFD $0x61, XTMP1, KS_L // KS bits [63:32 31:0 95:64 63:32] + PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96] + PSHUFD $0x61, XTMP4, KS_M2 // KS bits [191:160 159:128 223:192 191:160] + + // setup DATA + MOVOU XDATA, XTMP1 + PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1 + MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s] + + PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA + MOVOU XDATA, XDIGEST // XDIGEST/XDATA - Data bits [95:64 0s 127:96 0s] + + // clmul + // xor the results from 4 32-bit words together + // Save data for following products + MOVOU XTMP2, XTMP5 // Data bits [31:0 0s 63:32 0s] + MOVOU XDATA, XTMP6 // Data bits [95:64 0s 127:96 0s] + + // Calculate lower 32 bits of tag + PCLMULQDQ $0x00, KS_L, XTMP1 + PCLMULQDQ $0x11, KS_L, XTMP2 + PCLMULQDQ $0x00, KS_M1, XDIGEST + PCLMULQDQ $0x11, KS_M1, XDATA + + // XOR all products and move bits 63-32 bits to lower 32 bits + PXOR XTMP1, XTMP2 + PXOR XDATA, XDIGEST + PXOR XTMP2, XDIGEST + PSLLDQ $8, XDIGEST // Move bits 63-32 to bits 127-96 + + // Prepare data and calculate bits 63-32 of tag + MOVOU XTMP5, XTMP1 + MOVOU XTMP5, XTMP2 + MOVOU XTMP6, XTMP3 + MOVOU XTMP6, XTMP4 + + PCLMULQDQ $0x10, KS_L, XTMP1 + PCLMULQDQ 
$0x01, KS_M1, XTMP2 + PCLMULQDQ $0x10, KS_M1, XTMP3 + PCLMULQDQ $0x01, KS_M2, XTMP4 + + // XOR all the products and keep only bits 63-32 + PXOR XTMP2, XTMP1 + PXOR XTMP4, XTMP3 + PXOR XTMP3, XTMP1 + PSRLDQ $4, XTMP1 // Move bits 63-32 to bits 31-0 + + PALIGNR $12, XDIGEST, XTMP1 // XTMP1 || XDIGEST + + // Update tag + MOVQ XTMP1, R10 + XORQ R10, (AX) + + // Copy last 16 bytes of KS to the front + MOVUPS (4*4)(BX), XTMP1 + MOVUPS XTMP1, (0*4)(BX) + + RET + +avx: + VMOVDQU (0)(CX), XDATA + + // Reverse data bytes + BIT_REVERSE_AVX(XDATA, XTMP1, XTMP2) + + // ZUC authentication part, 4x32 data bits + // setup KS + VPSHUFD $0x61, (0*4)(BX), KS_L // KS bits [63:32 31:0 95:64 63:32] + VPSHUFD $0x61, (2*4)(BX), KS_M1 // KS bits [63:32 31:0 95:64 63:32] + VPSHUFD $0x61, (4*4)(BX), KS_M2 // KS bits [191:160 159:128 223:192 191:160] + + // setup DATA + // Data bytes [31:0 0s 63:32 0s] + VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XDATA, XTMP1 + // Data bytes [95:64 0s 127:96 0s] + VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA, XTMP2 + + // clmul + // xor the results from 4 32-bit words together + // Calculate lower 32 bits of tag + VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3 + VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4 + VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5 + VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6 + + VPXOR XTMP3, XTMP4, XTMP3 + VPXOR XTMP5, XTMP6, XTMP5 + VPXOR XTMP3, XTMP5, XTMP3 + VPSLLDQ $8, XTMP3, XDIGEST + + VPCLMULQDQ $0x10, KS_L, XTMP1, XTMP3 + VPCLMULQDQ $0x01, KS_M1, XTMP1, XTMP4 + VPCLMULQDQ $0x10, KS_M1, XTMP2, XTMP5 + VPCLMULQDQ $0x01, KS_M2, XTMP2, XTMP6 + + // XOR all the products and keep only 32-63 bits + VPXOR XTMP4, XTMP3, XTMP3 + VPXOR XTMP6, XTMP5, XTMP5 + VPXOR XTMP5, XTMP3, XTMP3 + VPSRLDQ $4, XTMP3, XTMP3 + + VPALIGNR $12, XDIGEST, XTMP3, XDIGEST + + // Update tag + VMOVQ XDIGEST, R10 + XORQ R10, (AX) + + // Copy last 16 bytes of KS to the front + VMOVDQU (4*4)(BX), XTMP1 + VMOVDQU XTMP1, (0*4)(BX) + + RET + +// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte) +TEXT ·eia256RoundTag16(SB),NOSPLIT,$0 + MOVQ t+0(FP), AX + MOVQ ks+8(FP), BX + MOVQ p+16(FP), CX + + CMPB ·useAVX(SB), $1 + JE avx + + // Reverse data bytes + MOVUPS (0)(CX), XDATA + BIT_REVERSE_SSE(XDATA, XTMP1, XTMP2) + + // ZUC authentication part, 4x32 data bits + // setup KS + MOVUPS (0*4)(BX), XTMP1 + MOVUPS (2*4)(BX), XTMP2 + MOVUPS (4*4)(BX), XTMP4 + PSHUFD $0x61, XTMP1, KS_L // KS bits [63:32 31:0 95:64 63:32] + PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96] + PSHUFD $0x61, XTMP4, KS_M2 // KS bits [191:160 159:128 223:192 191:160] + PSHUFD $0xBB, XTMP4, KS_H // KS bits [255:224 223:192 255:224 223:192] + + // setup DATA + MOVOU XDATA, XTMP1 + PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1 + MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s] + + PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA + MOVOU XDATA, XDIGEST // XDIGEST/XDATA - Data bits [95:64 0s 127:96 0s] + + // clmul + // xor the results from 4 32-bit words together + // Save data for following products + MOVOU XTMP2, XTMP5 // Data bits [31:0 0s 63:32 0s] + MOVOU XDATA, XTMP6 // Data bits [95:64 0s 127:96 0s] + + // Calculate lower 32 bits of tag + PCLMULQDQ $0x00, KS_L, XTMP1 + PCLMULQDQ $0x11, KS_L, XTMP2 + PCLMULQDQ $0x00, KS_M1, XDIGEST + PCLMULQDQ $0x11, KS_M1, XDATA + + // XOR all products and move bits 63-32 bits to lower 32 bits + PXOR XTMP1, XTMP2 + PXOR XDATA, XDIGEST + PXOR XTMP2, XDIGEST + PSLLDQ $8, XDIGEST // Move bits 63-32 to bits 127-96 + + // Prepare data and calculate bits 63-32 of tag + MOVOU XTMP5, XTMP1 + MOVOU XTMP5, 
XTMP2 + MOVOU XTMP6, XTMP3 + MOVOU XTMP6, XTMP4 + + PCLMULQDQ $0x10, KS_L, XTMP1 + PCLMULQDQ $0x01, KS_M1, XTMP2 + PCLMULQDQ $0x10, KS_M1, XTMP3 + PCLMULQDQ $0x01, KS_M2, XTMP4 + + // XOR all the products and keep only bits 63-32 + PXOR XTMP2, XTMP1 + PXOR XTMP4, XTMP3 + PXOR XTMP3, XTMP1 + PSRLDQ $4, XTMP1 // Move bits 63-32 to bits 31-0 + PALIGNR $4, XDIGEST, XTMP1 // XTMP1 || XDIGEST + + // Prepare data and calculate bits 95-64 of tag + MOVOU XTMP5, XDIGEST + MOVOU XTMP5, XTMP2 + MOVOU XTMP6, XTMP3 + MOVOU XTMP6, XTMP4 + + PCLMULQDQ $0x00, KS_M1, XDIGEST + PCLMULQDQ $0x11, KS_M1, XTMP2 + PCLMULQDQ $0x00, KS_M2, XTMP3 + PCLMULQDQ $0x11, KS_M2, XTMP4 + + // XOR all the products and move bits 63-32 to bits 95-64 + PXOR XTMP2, XDIGEST + PXOR XTMP4, XTMP3 + PXOR XTMP3, XDIGEST + PSRLDQ $4, XDIGEST // Move bits 63-32 to bits 31-0 + PALIGNR $4, XTMP1, XDIGEST // XDIGEST || XTMP1 + + // Prepare data and calculate bits 127-96 of tag + MOVOU XTMP5, XTMP1 + MOVOU XTMP5, XTMP2 + MOVOU XTMP6, XTMP3 + MOVOU XTMP6, XTMP4 + + PCLMULQDQ $0x10, KS_M1, XTMP1 + PCLMULQDQ $0x01, KS_M2, XTMP2 + PCLMULQDQ $0x10, KS_M2, XTMP3 + PCLMULQDQ $0x01, KS_H, XTMP4 + + // XOR all the products and move bits 63-32 to bits 127-96 + PXOR XTMP2, XTMP1 + PXOR XTMP4, XTMP3 + PXOR XTMP3, XTMP1 + PSRLDQ $4, XTMP1 // Move bits 63-32 to bits 31-0 + PALIGNR $4, XDIGEST, XTMP1 // XTMP1 || XDIGEST + + // Update tag + MOVUPS (AX), XDIGEST + PXOR XTMP1, XDIGEST + MOVUPS XDIGEST, (AX) + + // Copy last 16 bytes of KS to the front + MOVUPS (4*4)(BX), XTMP1 + MOVUPS XTMP1, (0*4)(BX) + + RET + +avx: + VMOVDQU (0)(CX), XDATA + + // Reverse data bytes + BIT_REVERSE_AVX(XDATA, XTMP1, XTMP2) + + // ZUC authentication part, 4x32 data bits + // setup KS + VPSHUFD $0x61, (0*4)(BX), KS_L // KS bits [63:32 31:0 95:64 63:32] + VPSHUFD $0x61, (2*4)(BX), KS_M1 // KS bits [63:32 31:0 95:64 63:32] + VPSHUFD $0x61, (4*4)(BX), KS_M2 // KS bits [191:160 159:128 223:192 191:160] + VPSHUFD $0xBB, (4*4)(BX), KS_H // KS bits [255:224 223:192 255:224 223:192] + + // setup DATA + // Data bytes [31:0 0s 63:32 0s] + VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XDATA, XTMP1 + // Data bytes [95:64 0s 127:96 0s] + VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA, XTMP2 + + // clmul + // xor the results from 4 32-bit words together + // Calculate lower 32 bits of tag + VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3 + VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4 + VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5 + VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6 + + VPXOR XTMP3, XTMP4, XTMP3 + VPXOR XTMP5, XTMP6, XTMP5 + VPXOR XTMP3, XTMP5, XTMP3 + VPSLLDQ $8, XTMP3, XDIGEST + + VPCLMULQDQ $0x10, KS_L, XTMP1, XTMP3 + VPCLMULQDQ $0x01, KS_M1, XTMP1, XTMP4 + VPCLMULQDQ $0x10, KS_M1, XTMP2, XTMP5 + VPCLMULQDQ $0x01, KS_M2, XTMP2, XTMP6 + + // XOR all the products and keep only 32-63 bits + VPXOR XTMP4, XTMP3, XTMP3 + VPXOR XTMP6, XTMP5, XTMP5 + VPXOR XTMP5, XTMP3, XTMP3 + VPSRLDQ $4, XTMP3, XTMP3 + VPALIGNR $4, XDIGEST, XTMP3, XDIGEST + + // Prepare data and calculate bits 95-64 of tag + VPCLMULQDQ $0x00, KS_M1, XTMP1, XTMP3 + VPCLMULQDQ $0x11, KS_M1, XTMP1, XTMP4 + VPCLMULQDQ $0x00, KS_M2, XTMP2, XTMP5 + VPCLMULQDQ $0x11, KS_M2, XTMP2, XTMP6 + + // XOR all the products and move bits 63-32 to bits 95-64 + VPXOR XTMP4, XTMP3, XTMP3 + VPXOR XTMP6, XTMP5, XTMP5 + VPXOR XTMP5, XTMP3, XTMP3 + VPSRLDQ $4, XTMP3, XTMP3 + VPALIGNR $4, XDIGEST, XTMP3, XDIGEST + + // Prepare data and calculate bits 127-96 of tag + VPCLMULQDQ $0x10, KS_M1, XTMP1, XTMP3 + VPCLMULQDQ $0x01, KS_M2, XTMP1, XTMP4 + VPCLMULQDQ $0x10, KS_M2, XTMP2, XTMP5 + 
VPCLMULQDQ $0x01, KS_H, XTMP2, XTMP6 + + // XOR all the products and move bits 63-32 to bits 127-96 + VPXOR XTMP4, XTMP3, XTMP3 + VPXOR XTMP6, XTMP5, XTMP5 + VPXOR XTMP5, XTMP3, XTMP3 + VPSRLDQ $4, XTMP3, XTMP3 + VPALIGNR $4, XDIGEST, XTMP3, XDIGEST + + // Update tag + VPXOR (AX), XDIGEST, XDIGEST + VMOVDQA XDIGEST, (AX) + + // Copy last 16 bytes of KS to the front + VMOVDQU (4*4)(BX), XTMP1 + VMOVDQU XTMP1, (0*4)(BX) + RET diff --git a/zuc/eia_asm_arm64.s b/zuc/eia_asm_arm64.s index 23f8190..2bd4487 100644 --- a/zuc/eia_asm_arm64.s +++ b/zuc/eia_asm_arm64.s @@ -2,15 +2,15 @@ #include "textflag.h" -DATA ·eia_const+0x00(SB)/8, $0x0e060a020c040800 // bit_reverse_table low -DATA ·eia_const+0x08(SB)/8, $0x0f070b030d050901 -DATA ·eia_const+0x10(SB)/8, $0xe060a020c0408000 // bit_reverse_table high -DATA ·eia_const+0x18(SB)/8, $0xf070b030d0509010 -DATA ·eia_const+0x20(SB)/8, $0xffffffff03020100 // SHUF_MASK_DW0_DW1 -DATA ·eia_const+0x28(SB)/8, $0xffffffff07060504 -DATA ·eia_const+0x30(SB)/8, $0xffffffff0b0a0908 // SHUF_MASK_DW2_DW3 -DATA ·eia_const+0x38(SB)/8, $0xffffffff0f0e0d0c -GLOBL ·eia_const(SB), RODATA, $64 +DATA eia_const<>+0x00(SB)/8, $0x0e060a020c040800 // bit_reverse_table low +DATA eia_const<>+0x08(SB)/8, $0x0f070b030d050901 +DATA eia_const<>+0x10(SB)/8, $0xe060a020c0408000 // bit_reverse_table high +DATA eia_const<>+0x18(SB)/8, $0xf070b030d0509010 +DATA eia_const<>+0x20(SB)/8, $0xffffffff03020100 // SHUF_MASK_DW0_DW1 +DATA eia_const<>+0x28(SB)/8, $0xffffffff07060504 +DATA eia_const<>+0x30(SB)/8, $0xffffffff0b0a0908 // SHUF_MASK_DW2_DW3 +DATA eia_const<>+0x38(SB)/8, $0xffffffff0f0e0d0c +GLOBL eia_const<>(SB), RODATA, $64 #define AX R2 #define BX R3 @@ -36,28 +36,29 @@ GLOBL ·eia_const(SB), RODATA, $64 #define SHUF_MASK_DW2_DW3 V24 #define LOAD_GLOBAL_DATA() \ - MOVD $·eia_const(SB), R0 \ - VLD1 (R0), [BIT_REV_TAB_L.B16, BIT_REV_TAB_H.B16, SHUF_MASK_DW0_DW1.B16, SHUF_MASK_DW2_DW3.B16] \ - MOVW $0x0F0F0F0F, R0 \ - VDUP R0, BIT_REV_AND_TAB.S4 + MOVD $eia_const<>(SB), R0 \ + VLD1 (R0), [BIT_REV_TAB_L.B16, BIT_REV_TAB_H.B16, SHUF_MASK_DW0_DW1.B16, SHUF_MASK_DW2_DW3.B16] \ + MOVW $0x0F0F0F0F, R0 \ + VDUP R0, BIT_REV_AND_TAB.S4 -// func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int) -TEXT ·eia3Round16B(SB),NOSPLIT,$0 +#define BIT_REVERSE(XDATA, XTMP1, XTMP2) \ + VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP2.B16 \ + VUSHR $4, XDATA.B16, XTMP1.B16 \ + VTBL XTMP2.B16, [BIT_REV_TAB_H.B16], XTMP2.B16 \ + VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16 \ + VEOR XTMP1.B16, XTMP2.B16, XDATA.B16 + +// func eiaRoundTag4(t *uint32, keyStream *uint32, p *byte) +TEXT ·eiaRoundTag4(SB),NOSPLIT,$0 MOVD t+0(FP), AX MOVD ks+8(FP), BX MOVD p+16(FP), CX - MOVD tagSize+24(FP), DX LOAD_GLOBAL_DATA() // Reverse data bytes VLD1 (CX), [XDATA.B16] - VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16 - VUSHR $4, XDATA.B16, XTMP1.B16 - - VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16 - VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16 - VEOR XTMP1.B16, XTMP3.B16, XTMP3.B16 // XTMP3 - bit reverse data bytes + BIT_REVERSE(XDATA, XTMP1, XTMP2) // ZUC authentication part, 4x32 data bits // setup KS @@ -72,8 +73,8 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0 VMOV XTMP2.S[0], KS_M1.S[2] // KS bits [127:96 95:64 159:128 127:96] // setup DATA - VTBL SHUF_MASK_DW0_DW1.B16, [XTMP3.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s] - VTBL SHUF_MASK_DW2_DW3.B16, [XTMP3.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s] + VTBL SHUF_MASK_DW0_DW1.B16, [XDATA.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s] + VTBL 
SHUF_MASK_DW2_DW3.B16, [XDATA.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s] // clmul // xor the results from 4 32-bit words together @@ -93,3 +94,169 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0 MOVW R11, (AX) RET + +// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte) +TEXT ·eia256RoundTag8(SB),NOSPLIT,$0 + MOVD t+0(FP), AX + MOVD ks+8(FP), BX + MOVD p+16(FP), CX + + LOAD_GLOBAL_DATA() + + // Reverse data bytes + VLD1 (CX), [XDATA.B16] + BIT_REVERSE(XDATA, XTMP1, XTMP2) + + // ZUC authentication part, 4x32 data bits + // setup KS + VLD1 (BX), [XTMP1.B16, XTMP2.B16] + VST1 [XTMP2.B16], (BX) // Copy last 16 bytes of KS to the front + // TODO: Any better solution??? + VMOVQ $0x0302010007060504, $0x070605040b0a0908, XTMP4 + VTBL XTMP4.B16, [XTMP1.B16], KS_L.B16 // KS bits [63:32 31:0 95:64 63:32] + VTBL XTMP4.B16, [XTMP2.B16], KS_M2.B16 // KS bits [191:160 159:128 223:192 191:160] + VDUP XTMP1.S[3], KS_M1.S4 + VMOV XTMP1.S[2], KS_M1.S[1] + VMOV XTMP2.S[0], KS_M1.S[2] // KS bits [127:96 95:64 159:128 127:96] + + // setup DATA + VTBL SHUF_MASK_DW0_DW1.B16, [XDATA.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s] + VTBL SHUF_MASK_DW2_DW3.B16, [XDATA.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s] + + // clmul + // xor the results from 4 32-bit words together + + // Calculate lower 32 bits of tag + VPMULL KS_L.D1, XTMP1.D1, XTMP3.Q1 + VPMULL2 KS_L.D2, XTMP1.D2, XTMP4.Q1 + VPMULL KS_M1.D1, XTMP2.D1, XTMP5.Q1 + VPMULL2 KS_M1.D2, XTMP2.D2, XTMP6.Q1 + + VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16 + VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16 + VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16 + + // Move previous result to low 32 bits and XOR with previous digest + VMOV XTMP3.S[1], XDIGEST.S[0] + + // Prepare data and calculate bits 63-32 of tag + VEXT $8, KS_L.B16, KS_L.B16, XTMP5.B16 + VPMULL XTMP5.D1, XTMP1.D1, XTMP3.Q1 + VEXT $8, XTMP1.B16, XTMP1.B16, XTMP5.B16 + VPMULL KS_M1.D1, XTMP5.D1, XTMP4.Q1 + VEXT $8, KS_M1.B16, KS_M1.B16, XTMP1.B16 + VPMULL XTMP1.D1, XTMP2.D1, XTMP5.Q1 + VEXT $8, XTMP2.B16, XTMP2.B16, XTMP1.B16 + VPMULL KS_M2.D1, XTMP1.D1, XTMP6.Q1 + + VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16 + VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16 + VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16 + + VMOV XTMP3.S[1], XDIGEST.S[1] + + VMOV XDIGEST.D[0], R10 + MOVD (AX), R11 + EOR R10, R11 + MOVD R11, (AX) + + RET + +// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte) +TEXT ·eia256RoundTag16(SB),NOSPLIT,$0 + MOVD t+0(FP), AX + MOVD ks+8(FP), BX + MOVD p+16(FP), CX + + LOAD_GLOBAL_DATA() + + // Reverse data bytes + VLD1 (CX), [XDATA.B16] + BIT_REVERSE(XDATA, XTMP1, XTMP2) + + // ZUC authentication part, 4x32 data bits + // setup KS + VLD1 (BX), [XTMP1.B16, XTMP2.B16] + VST1 [XTMP2.B16], (BX) // Copy last 16 bytes of KS to the front + // TODO: Any better solution??? 
We can use VTBL, but there are no performance imprvoement if we can't reuse MASK constant + VMOVQ $0x0302010007060504, $0x070605040b0a0908, XTMP4 + VTBL XTMP4.B16, [XTMP1.B16], KS_L.B16 // KS bits [63:32 31:0 95:64 63:32] + VTBL XTMP4.B16, [XTMP2.B16], KS_M2.B16 // KS bits [191:160 159:128 223:192 191:160] + VMOVQ $0x0b0a09080f0e0d0c, $0x0b0a09080f0e0d0c, XTMP4 + VTBL XTMP4.B16, [XTMP2.B16], KS_H.B16 // KS bits [255:224 223:192 255:224 223:192] + VDUP XTMP1.S[3], KS_M1.S4 + VMOV XTMP1.S[2], KS_M1.S[1] + VMOV XTMP2.S[0], KS_M1.S[2] // KS bits [127:96 95:64 159:128 127:96] + + // setup DATA + VTBL SHUF_MASK_DW0_DW1.B16, [XDATA.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s] + VTBL SHUF_MASK_DW2_DW3.B16, [XDATA.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s] + + // clmul + // xor the results from 4 32-bit words together + + // Calculate lower 32 bits of tag + VPMULL KS_L.D1, XTMP1.D1, XTMP3.Q1 + VPMULL2 KS_L.D2, XTMP1.D2, XTMP4.Q1 + VPMULL KS_M1.D1, XTMP2.D1, XTMP5.Q1 + VPMULL2 KS_M1.D2, XTMP2.D2, XTMP6.Q1 + + VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16 + VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16 + VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16 + + // Move previous result to low 32 bits and XOR with previous digest + VMOV XTMP3.S[1], XDIGEST.S[0] + + // Prepare data and calculate bits 63-32 of tag + VEXT $8, KS_L.B16, KS_L.B16, XTMP5.B16 + VPMULL XTMP5.D1, XTMP1.D1, XTMP3.Q1 + VEXT $8, XTMP1.B16, XTMP1.B16, XTMP5.B16 + VPMULL KS_M1.D1, XTMP5.D1, XTMP4.Q1 + VEXT $8, KS_M1.B16, KS_M1.B16, XTMP6.B16 + VPMULL XTMP6.D1, XTMP2.D1, XTMP5.Q1 + VEXT $8, XTMP2.B16, XTMP2.B16, KS_L.B16 + VPMULL KS_M2.D1, KS_L.D1, XTMP6.Q1 + + // XOR all the products and keep only 32-63 bits + VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16 + VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16 + VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16 + + VMOV XTMP3.S[1], XDIGEST.S[1] + + // Prepare data and calculate bits 95-64 of tag + VPMULL KS_M1.D1, XTMP1.D1, XTMP3.Q1 + VPMULL2 KS_M1.D2, XTMP1.D2, XTMP4.Q1 + VPMULL KS_M2.D1, XTMP2.D1, XTMP5.Q1 + VPMULL2 KS_M2.D2, XTMP2.D2, XTMP6.Q1 + + // XOR all the products and move bits 63-32 to bits 95-64 + VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16 + VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16 + VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16 + + VMOV XTMP3.S[1], XDIGEST.S[2] + + // Prepare data and calculate bits 127-96 of tag + VEXT $8, KS_M1.B16, KS_M1.B16, XTMP5.B16 + VPMULL XTMP5.D1, XTMP1.D1, XTMP3.Q1 + VEXT $8, XTMP1.B16, XTMP1.B16, XTMP5.B16 + VPMULL KS_M2.D1, XTMP5.D1, XTMP4.Q1 + VEXT $8, KS_M2.B16, KS_M2.B16, XTMP6.B16 + VPMULL XTMP6.D1, XTMP2.D1, XTMP5.Q1 + VEXT $8, XTMP2.B16, XTMP2.B16, KS_L.B16 + VPMULL KS_H.D1, KS_L.D1, XTMP6.Q1 + + // XOR all the products and move bits 63-32 to bits 127-96 + VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16 + VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16 + VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16 + + VMOV XTMP3.S[1], XDIGEST.S[3] + + VLD1 (AX), [XTMP1.B16] + VEOR XTMP1.B16, XDIGEST.B16, XDIGEST.B16 + VST1 [XDIGEST.B16], (AX) + + RET diff --git a/zuc/eia_asm_ppc64x.s b/zuc/eia_asm_ppc64x.s index 2517200..3f23838 100644 --- a/zuc/eia_asm_ppc64x.s +++ b/zuc/eia_asm_ppc64x.s @@ -6,43 +6,53 @@ #include "textflag.h" -DATA ·rcon+0x00(SB)/8, $0x0706050403020100 // Permute for vector doubleword endian swap -DATA ·rcon+0x08(SB)/8, $0x0f0e0d0c0b0a0908 -DATA ·rcon+0x10(SB)/8, $0x0008040c020a060e // bit_reverse_table_l -DATA ·rcon+0x18(SB)/8, $0x0109050d030b070f // bit_reverse_table_l -DATA ·rcon+0x20(SB)/8, $0x0000000010111213 // data mask -DATA ·rcon+0x28(SB)/8, $0x0000000014151617 // data mask -DATA ·rcon+0x30(SB)/8, $0x0000000018191a1b 
// data mask -DATA ·rcon+0x38(SB)/8, $0x000000001c1d1e1f // data mask -DATA ·rcon+0x40(SB)/8, $0x0405060708090a0b // ks mask -DATA ·rcon+0x48(SB)/8, $0x0001020304050607 // ks mask -GLOBL ·rcon(SB), RODATA, $80 +DATA eia_const<>+0x00(SB)/8, $0x0706050403020100 // Permute for vector doubleword endian swap +DATA eia_const<>+0x08(SB)/8, $0x0f0e0d0c0b0a0908 +DATA eia_const<>+0x10(SB)/8, $0x0008040c020a060e // bit_reverse_table_l +DATA eia_const<>+0x18(SB)/8, $0x0109050d030b070f // bit_reverse_table_l +DATA eia_const<>+0x20(SB)/8, $0x0000000010111213 // data mask +DATA eia_const<>+0x28(SB)/8, $0x0000000014151617 // data mask +DATA eia_const<>+0x30(SB)/8, $0x0000000018191a1b // data mask +DATA eia_const<>+0x38(SB)/8, $0x000000001c1d1e1f // data mask +DATA eia_const<>+0x40(SB)/8, $0x0405060708090a0b // ks mask +DATA eia_const<>+0x48(SB)/8, $0x0001020304050607 // ks mask +GLOBL eia_const<>(SB), RODATA, $80 #define XTMP1 V0 #define XTMP2 V1 #define XTMP3 V2 #define XTMP4 V3 +#define XTMP5 V4 +#define XTMP6 V5 #define XDATA V6 #define XDIGEST V7 #define KS_L V8 #define KS_M1 V9 +#define KS_M2 V10 +#define KS_H V11 #define BIT_REV_TAB_L V12 #define BIT_REV_TAB_H V13 - +#define ZERO V15 #define PTR R7 -// func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int) -TEXT ·eia3Round16B(SB),NOSPLIT,$0 +#define BIT_REVERSE(addr, IN, OUT, XTMP) \ + LXVD2X (addr)(R0), BIT_REV_TAB_L \ + VSPLTISB $4, XTMP \ + VSLB BIT_REV_TAB_L, XTMP, BIT_REV_TAB_H \ + VPERMXOR BIT_REV_TAB_L, BIT_REV_TAB_H, IN, OUT + +// func eiaRoundTag4(t *uint32, keyStream *uint32, p *byte) +TEXT ·eiaRoundTag4(SB),NOSPLIT,$0 MOVD t+0(FP), R3 MOVD ks+8(FP), R4 MOVD p+16(FP), R5 #ifndef GOARCH_ppc64le - MOVD $·rcon(SB), PTR // PTR points to rcon addr + MOVD $eia_const<>(SB), PTR LVX (PTR), XTMP1 ADD $0x10, PTR #else - MOVD $·rcon+0x10(SB), PTR // PTR points to rcon addr (skipping permute vector) + MOVD $eia_const<>+0x10(SB), PTR #endif LXVD2X (R5)(R0), XDATA @@ -50,10 +60,7 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0 VPERM XDATA, XDATA, XTMP1, XDATA #endif - VSPLTISB $4, XTMP2; - LXVD2X (PTR)(R0), BIT_REV_TAB_L - VSLB BIT_REV_TAB_L, XTMP2, BIT_REV_TAB_H - VPERMXOR BIT_REV_TAB_L, BIT_REV_TAB_H, XDATA, XTMP3 // XTMP3 - bit reverse data bytes + BIT_REVERSE(PTR, XDATA, XTMP3, XTMP2) // ZUC authentication part, 4x32 data bits // setup data @@ -95,3 +102,169 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0 STXVD2X XTMP1, (R4)(R0) RET + +// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte) +TEXT ·eia256RoundTag8(SB),NOSPLIT,$0 + MOVD t+0(FP), R3 + MOVD ks+8(FP), R4 + MOVD p+16(FP), R5 + +#ifndef GOARCH_ppc64le + MOVD $eia_const<>(SB), PTR + LVX (PTR), XTMP1 + ADD $0x10, PTR +#else + MOVD $eia_const<>+0x10(SB), PTR +#endif + + LXVD2X (R5)(R0), XDATA +#ifndef GOARCH_ppc64le + VPERM XDATA, XDATA, XTMP1, XDATA +#endif + + BIT_REVERSE(PTR, XDATA, XTMP3, XTMP2) + + // ZUC authentication part, 4x32 data bits + // setup data + VSPLTISB $0, ZERO + MOVD $0x10, R8 + LXVD2X (PTR)(R8), XTMP4 + VPERM ZERO, XTMP3, XTMP4, XTMP1 + MOVD $0x20, R8 + LXVD2X (PTR)(R8), XTMP4 + VPERM ZERO, XTMP3, XTMP4, XTMP2 + + // setup KS + LXVW4X (R4), KS_L + MOVD $8, R8 + LXVW4X (R8)(R4), KS_M1 + MOVD $16, R8 + LXVW4X (R8)(R4), KS_M2 + MOVD $0x30, R8 + LXVD2X (PTR)(R8), XTMP4 + VPERM KS_L, KS_L, XTMP4, KS_L + VPERM KS_M1, KS_M1, XTMP4, KS_M1 + VPERM KS_M2, KS_M2, XTMP4, KS_M2 + + // clmul + // xor the results from 4 32-bit words together + // Calculate lower 32 bits of tag + VPMSUMD XTMP1, KS_L, XTMP3 + VPMSUMD XTMP2, KS_M1, XTMP4 + VXOR XTMP3, XTMP4, XTMP3 + VSPLTW $2, XTMP3, 
XDIGEST + + // Calculate upper 32 bits of tag + VSLDOI $8, KS_M1, KS_L, KS_L + VPMSUMD XTMP1, KS_L, XTMP3 + VSLDOI $8, KS_M2, KS_M1, KS_M1 + VPMSUMD XTMP2, KS_M1, XTMP4 + VXOR XTMP3, XTMP4, XTMP3 + VSPLTW $2, XTMP3, XTMP3 + + // Update tag +#ifdef GOARCH_ppc64le + VSLDOI $12, XTMP3, XDIGEST, XDIGEST +#else + VSLDOI $12, XDIGEST, XTMP3, XDIGEST +#endif + MFVSRD XDIGEST, R8 + MOVD (R3), R6 + XOR R6, R8, R6 + MOVD R6, (R3) + + // Copy last 16 bytes of KS to the front + MOVD $16, R8 + LXVD2X (R8)(R4), XTMP1 + STXVD2X XTMP1, (R4)(R0) + + RET + +// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte) +TEXT ·eia256RoundTag16(SB),NOSPLIT,$0 + MOVD t+0(FP), R3 + MOVD ks+8(FP), R4 + MOVD p+16(FP), R5 + +#ifndef GOARCH_ppc64le + MOVD $eia_const<>(SB), PTR + LVX (PTR), XTMP1 + ADD $0x10, PTR +#else + MOVD $eia_const<>+0x10(SB), PTR +#endif + + LXVD2X (R5)(R0), XDATA +#ifndef GOARCH_ppc64le + VPERM XDATA, XDATA, XTMP1, XDATA +#endif + + BIT_REVERSE(PTR, XDATA, XTMP3, XTMP2) + + // ZUC authentication part, 4x32 data bits + // setup data + VSPLTISB $0, ZERO + MOVD $0x10, R8 + LXVD2X (PTR)(R8), XTMP4 + VPERM ZERO, XTMP3, XTMP4, XTMP1 + MOVD $0x20, R8 + LXVD2X (PTR)(R8), XTMP4 + VPERM ZERO, XTMP3, XTMP4, XTMP2 + + // setup KS + LXVW4X (R4), KS_L + MOVD $8, R8 + LXVW4X (R8)(R4), KS_M1 + MOVD $16, R8 + LXVW4X (R8)(R4), KS_M2 + VOR KS_M2, KS_M2, KS_H + MOVD $0x30, R8 + LXVD2X (PTR)(R8), XTMP4 + VPERM KS_L, KS_L, XTMP4, KS_L + VPERM KS_M1, KS_M1, XTMP4, KS_M1 + VPERM KS_M2, KS_M2, XTMP4, KS_M2 + + // clmul + // xor the results from 4 32-bit words together + // Calculate lower 32 bits of tag + VPMSUMD XTMP1, KS_L, XTMP3 + VPMSUMD XTMP2, KS_M1, XTMP4 + VXOR XTMP3, XTMP4, XTMP3 + VSLDOI $12, XTMP3, XTMP3, XDIGEST + + // Calculate upper 32 bits of tag + VSLDOI $8, KS_M1, KS_L, KS_L + VPMSUMD XTMP1, KS_L, XTMP3 + VSLDOI $8, KS_M2, KS_M1, XTMP5 + VPMSUMD XTMP2, XTMP5, XTMP4 + VXOR XTMP3, XTMP4, XTMP3 + VSLDOI $8, XTMP3, XTMP3, XTMP3 + VSLDOI $4, XDIGEST, XTMP3, XDIGEST + + // calculate bits 95-64 of tag + VPMSUMD XTMP1, KS_M1, XTMP3 + VPMSUMD XTMP2, KS_M2, XTMP4 + VXOR XTMP3, XTMP4, XTMP3 + VSLDOI $8, XTMP3, XTMP3, XTMP3 + VSLDOI $4, XDIGEST, XTMP3, XDIGEST + + // calculate bits 127-96 of tag + VSLDOI $8, KS_M2, KS_M1, KS_M1 + VPMSUMD XTMP1, KS_M1, XTMP3 + VSLDOI $8, KS_H, KS_M2, KS_M2 + VPMSUMD XTMP2, KS_M2, XTMP4 + VXOR XTMP3, XTMP4, XTMP3 + VSLDOI $8, XTMP3, XTMP3, XTMP3 + VSLDOI $4, XDIGEST, XTMP3, XDIGEST + + // Update tag + LXVW4X (R3)(R0), XTMP1 + VXOR XTMP1, XDIGEST, XDIGEST + STXVW4X XDIGEST, (R3) + + // Copy last 16 bytes of KS to the front + MOVD $16, R8 + LXVD2X (R8)(R4), XTMP1 + STXVD2X XTMP1, (R4)(R0) + + RET
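
For reference, every tag-size-specific round added above (eiaRoundTag4, eia256RoundTag8, eia256RoundTag16) computes the same EIA3 accumulation: for each set bit of the 16-byte message block, XOR the keystream window starting at that bit offset into the running tag, then slide the keystream buffer forward by 128 bits. Below is a minimal pure-Go sketch of the 4-byte-tag case, assuming a keystream slice of at least 8 words laid out as in m.k0; the helper name is hypothetical and this is not the package's generic fallback, only an illustration of what the SIMD code replaces.

// eiaRoundTag4Sketch is a hypothetical, illustrative pure-Go equivalent of the
// eiaRoundTag4 assembly: it folds one 16-byte message block into a 32-bit
// EIA3 tag using keystream words ks[0..7].
func eiaRoundTag4Sketch(t *uint32, ks []uint32, p []byte) {
	_ = ks[7] // the round reads words 0..4 and slides words 4..7 to the front
	for i := 0; i < 16; i++ {
		for j := uint(0); j < 8; j++ {
			if p[i]&(0x80>>j) != 0 {
				off := uint(i*8) + j
				// 32-bit keystream window starting at bit offset off
				// (MSB-first), i.e. bits z[off..off+31] of the
				// concatenated keystream words.
				w := uint64(ks[off/32])<<32 | uint64(ks[off/32+1])
				*t ^= uint32(w >> (32 - off%32))
			}
		}
	}
	// Mirrors the assembly's "Copy last 16 bytes of KS to the front".
	copy(ks[:4], ks[4:8])
}

The assembly above avoids this bit-by-bit loop by bit-reversing the data bytes once and then accumulating the same windows with carry-less multiplies (PCLMULQDQ on amd64, PMULL/PMULL2 on arm64, VPMSUMD on ppc64x); the 8- and 16-byte-tag variants extend the identical pattern to 64- and 128-bit keystream windows.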