From 39274df2bd04599cdd3ebd41ea914665ad97df12 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Wed, 13 Jul 2022 11:51:44 +0800 Subject: [PATCH] zuc: EIA performance improvement --- zuc/README.md | 10 +- zuc/asm_amd64.s | 1064 +++++++++++++++++++++---------------------- zuc/asm_arm64.s | 608 ++++++++++++------------- zuc/eia.go | 16 +- zuc/eia_asm.go | 24 + zuc/eia_asm_amd64.s | 153 +++++++ zuc/eia_generic.go | 8 + 7 files changed, 1038 insertions(+), 845 deletions(-) create mode 100644 zuc/eia_asm.go create mode 100644 zuc/eia_asm_amd64.s create mode 100644 zuc/eia_generic.go diff --git a/zuc/README.md b/zuc/README.md index 98045c7..e82b00d 100644 --- a/zuc/README.md +++ b/zuc/README.md @@ -48,10 +48,18 @@ func (s *zucState32) f32(x0, x1, x2 uint32) uint32 { fmt.Println() ``` -## Performance with AMD64 SIMD & AESNI: +## EEA Performance with AMD64 SIMD & AESNI: goos: windows goarch: amd64 pkg: github.com/emmansun/gmsm/zuc cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz BenchmarkEncrypt1K-6 409755 2802 ns/op 363.62 MB/s BenchmarkEncrypt8K-6 54120 22413 ns/op 365.28 MB/s + +## EIA Performance with AMD64 SIMD & AESNI & CLMUL: + goos: windows + goarch: amd64 + pkg: github.com/emmansun/gmsm/zuc + cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz + BenchmarkHash1K-6 317750 3833 ns/op 267.13 MB/s + BenchmarkHash8K-6 40460 28921 ns/op 283.26 MB/s diff --git a/zuc/asm_amd64.s b/zuc/asm_amd64.s index 4d75de5..513b09c 100644 --- a/zuc/asm_amd64.s +++ b/zuc/asm_amd64.s @@ -85,121 +85,121 @@ GLOBL flip_mask<>(SB), RODATA, $16 #define OFFSET_BRC_X3 (21*4) #define SHLDL(a, b, n) \ // NO SHLDL in GOLANG now - SHLL n, a \ - SHRL n, b \ - ORL b, a + SHLL n, a \ + SHRL n, b \ + ORL b, a #define Rotl_5_SSE(XDATA, XTMP0) \ - MOVOU XDATA, XTMP0 \ - PSLLQ $5, XTMP0 \ // should use pslld - PSRLQ $3, XDATA \ // should use psrld - PAND Top3_bits_of_the_byte<>(SB), XTMP0 \ - PAND Bottom5_bits_of_the_byte<>(SB), XDATA \ - POR XTMP0, XDATA + MOVOU XDATA, XTMP0 \ + PSLLQ $5, XTMP0 \ // should use pslld + PSRLQ $3, XDATA \ // should use psrld + PAND Top3_bits_of_the_byte<>(SB), XTMP0 \ + PAND Bottom5_bits_of_the_byte<>(SB), XDATA \ + POR XTMP0, XDATA #define S0_comput_SSE(IN_OUT, XTMP1, XTMP2) \ - MOVOU IN_OUT, XTMP1 \ - \ - PAND Low_nibble_mask<>(SB), IN_OUT \ - \ - PAND High_nibble_mask<>(SB), XTMP1 \ - PSRLQ $4, XTMP1 \ - \ - MOVOU P1<>(SB), XTMP2 \ - PSHUFB IN_OUT, XTMP2 \ - PXOR XTMP1, XTMP2 \ - \ - MOVOU P2<>(SB), XTMP1 \ - PSHUFB XTMP2, XTMP1 \ - PXOR IN_OUT, XTMP1 \ - \ - MOVOU P3<>(SB), IN_OUT \ - PSHUFB XTMP1, IN_OUT \ - PXOR XTMP2, IN_OUT \ - \ - PSLLQ $4, IN_OUT \ - POR XTMP1, IN_OUT \ - Rotl_5_SSE(IN_OUT, XTMP1) + MOVOU IN_OUT, XTMP1 \ + \ + PAND Low_nibble_mask<>(SB), IN_OUT \ + \ + PAND High_nibble_mask<>(SB), XTMP1 \ + PSRLQ $4, XTMP1 \ + \ + MOVOU P1<>(SB), XTMP2 \ + PSHUFB IN_OUT, XTMP2 \ + PXOR XTMP1, XTMP2 \ + \ + MOVOU P2<>(SB), XTMP1 \ + PSHUFB XTMP2, XTMP1 \ + PXOR IN_OUT, XTMP1 \ + \ + MOVOU P3<>(SB), IN_OUT \ + PSHUFB XTMP1, IN_OUT \ + PXOR XTMP2, IN_OUT \ + \ + PSLLQ $4, IN_OUT \ + POR XTMP1, IN_OUT \ + Rotl_5_SSE(IN_OUT, XTMP1) // Perform 8x8 matrix multiplication using lookup tables with partial results // for high and low nible of each input byte #define MUL_PSHUFB_SSE(XIN, XLO, XHI_OUT, XTMP) \ - MOVOU Low_nibble_mask<>(SB), XTMP \ - PAND XIN, XTMP \ - \ - PSHUFB XTMP, XLO \ - \ - MOVOU High_nibble_mask<>(SB), XTMP \ - PAND XIN, XTMP \ - PSRLQ $4, XTMP \ - \ - PSHUFB XTMP, XHI_OUT \ - \ - PXOR XLO, XHI_OUT + MOVOU Low_nibble_mask<>(SB), XTMP \ + PAND XIN, XTMP \ + \ + PSHUFB XTMP, XLO \ + \ + MOVOU 
High_nibble_mask<>(SB), XTMP \ + PAND XIN, XTMP \ + PSRLQ $4, XTMP \ + \ + PSHUFB XTMP, XHI_OUT \ + \ + PXOR XLO, XHI_OUT // Compute 16 S1 box values from 16 bytes, stored in XMM register #define S1_comput_SSE(XIN_OUT, XTMP1, XTMP2, XTMP3) \ - MOVOU Aes_to_Zuc_mul_low_nibble<>(SB), XTMP1 \ - MOVOU Aes_to_Zuc_mul_high_nibble<>(SB), XTMP2 \ - MUL_PSHUFB_SSE(XIN_OUT, XTMP1, XTMP2, XTMP3) \ - \ - PSHUFB Shuf_mask<>(SB), XTMP2 \ - AESENCLAST Cancel_aes<>(SB), XTMP2 \ - \ - MOVOU Comb_matrix_mul_low_nibble<>(SB), XTMP1 \ - MOVOU Comb_matrix_mul_high_nibble<>(SB), XIN_OUT \ - MUL_PSHUFB_SSE(XTMP2, XTMP1, XIN_OUT, XTMP3) + MOVOU Aes_to_Zuc_mul_low_nibble<>(SB), XTMP1 \ + MOVOU Aes_to_Zuc_mul_high_nibble<>(SB), XTMP2 \ + MUL_PSHUFB_SSE(XIN_OUT, XTMP1, XTMP2, XTMP3) \ + \ + PSHUFB Shuf_mask<>(SB), XTMP2 \ + AESENCLAST Cancel_aes<>(SB), XTMP2 \ + \ + MOVOU Comb_matrix_mul_low_nibble<>(SB), XTMP1 \ + MOVOU Comb_matrix_mul_high_nibble<>(SB), XIN_OUT \ + MUL_PSHUFB_SSE(XTMP2, XTMP1, XIN_OUT, XTMP3) #define Rotl_5_AVX(XDATA, XTMP0) \ - VPSLLD $5, XDATA, XTMP0 \ - VPSRLD $3, XDATA, XDATA \ - VPAND Top3_bits_of_the_byte<>(SB), XTMP0, XTMP0 \ - VPAND Bottom5_bits_of_the_byte<>(SB), XDATA, XDATA \ - VPOR XTMP0, XDATA, XDATA + VPSLLD $5, XDATA, XTMP0 \ + VPSRLD $3, XDATA, XDATA \ + VPAND Top3_bits_of_the_byte<>(SB), XTMP0, XTMP0 \ + VPAND Bottom5_bits_of_the_byte<>(SB), XDATA, XDATA \ + VPOR XTMP0, XDATA, XDATA #define S0_comput_AVX(IN_OUT, XTMP1, XTMP2) \ - VPAND High_nibble_mask<>(SB), IN_OUT, XTMP1 \ - VPSRLQ $4, XTMP1, XTMP1 \ - \ - VPAND Low_nibble_mask<>(SB), IN_OUT, IN_OUT \ - \ - VMOVDQU P1<>(SB), XTMP2 \ - VPSHUFB IN_OUT, XTMP2, XTMP2 \ - VPXOR XTMP1, XTMP2, XTMP2 \ - \ - VMOVDQU P2<>(SB), XTMP1 \ - VPSHUFB XTMP2, XTMP1, XTMP1 \ - VPXOR IN_OUT, XTMP1, XTMP1 \ - \ - VMOVDQU P3<>(SB), IN_OUT \ - VPSHUFB XTMP1, IN_OUT, IN_OUT \ - VPXOR XTMP2, IN_OUT, IN_OUT \ - \ - VPSLLQ $4, IN_OUT, IN_OUT \ - VPOR XTMP1, IN_OUT, IN_OUT \ - Rotl_5_AVX(IN_OUT, XTMP1) + VPAND High_nibble_mask<>(SB), IN_OUT, XTMP1 \ + VPSRLQ $4, XTMP1, XTMP1 \ + \ + VPAND Low_nibble_mask<>(SB), IN_OUT, IN_OUT \ + \ + VMOVDQU P1<>(SB), XTMP2 \ + VPSHUFB IN_OUT, XTMP2, XTMP2 \ + VPXOR XTMP1, XTMP2, XTMP2 \ + \ + VMOVDQU P2<>(SB), XTMP1 \ + VPSHUFB XTMP2, XTMP1, XTMP1 \ + VPXOR IN_OUT, XTMP1, XTMP1 \ + \ + VMOVDQU P3<>(SB), IN_OUT \ + VPSHUFB XTMP1, IN_OUT, IN_OUT \ + VPXOR XTMP2, IN_OUT, IN_OUT \ + \ + VPSLLQ $4, IN_OUT, IN_OUT \ + VPOR XTMP1, IN_OUT, IN_OUT \ + Rotl_5_AVX(IN_OUT, XTMP1) // Perform 8x8 matrix multiplication using lookup tables with partial results // for high and low nible of each input byte #define MUL_PSHUFB_AVX(XIN, XLO, XHI_OUT, XTMP) \ - VPAND Low_nibble_mask<>(SB), XIN, XTMP \ - VPSHUFB XTMP, XLO, XLO \ - VPAND High_nibble_mask<>(SB), XIN, XTMP \ - VPSRLQ $4, XTMP, XTMP \ - VPSHUFB XTMP, XHI_OUT, XHI_OUT \ - VPXOR XLO, XHI_OUT, XHI_OUT + VPAND Low_nibble_mask<>(SB), XIN, XTMP \ + VPSHUFB XTMP, XLO, XLO \ + VPAND High_nibble_mask<>(SB), XIN, XTMP \ + VPSRLQ $4, XTMP, XTMP \ + VPSHUFB XTMP, XHI_OUT, XHI_OUT \ + VPXOR XLO, XHI_OUT, XHI_OUT // Compute 16 S1 box values from 16 bytes, stored in XMM register #define S1_comput_AVX(XIN_OUT, XTMP1, XTMP2, XTMP3) \ - VMOVDQU Aes_to_Zuc_mul_low_nibble<>(SB), XTMP1 \ - VMOVDQU Aes_to_Zuc_mul_high_nibble<>(SB), XTMP2 \ - MUL_PSHUFB_AVX(XIN_OUT, XTMP1, XTMP2, XTMP3) \ - VPSHUFB Shuf_mask<>(SB), XTMP2, XTMP2 \ - VAESENCLAST Cancel_aes<>(SB), XTMP2, XTMP2 \ - VMOVDQU Comb_matrix_mul_low_nibble<>(SB), XTMP1 \ - VMOVDQU Comb_matrix_mul_high_nibble<>(SB), XIN_OUT \ - 
MUL_PSHUFB_AVX(XTMP2, XTMP1, XIN_OUT, XTMP3) + VMOVDQU Aes_to_Zuc_mul_low_nibble<>(SB), XTMP1 \ + VMOVDQU Aes_to_Zuc_mul_high_nibble<>(SB), XTMP2 \ + MUL_PSHUFB_AVX(XIN_OUT, XTMP1, XTMP2, XTMP3) \ + VPSHUFB Shuf_mask<>(SB), XTMP2, XTMP2 \ + VAESENCLAST Cancel_aes<>(SB), XTMP2, XTMP2 \ + VMOVDQU Comb_matrix_mul_low_nibble<>(SB), XTMP1 \ + VMOVDQU Comb_matrix_mul_high_nibble<>(SB), XIN_OUT \ + MUL_PSHUFB_AVX(XTMP2, XTMP1, XIN_OUT, XTMP3) // BITS_REORG(idx) // params @@ -209,532 +209,532 @@ GLOBL flip_mask<>(SB), RODATA, $16 // return // R12, R13, R14, R15 #define BITS_REORG(idx) \ - MOVL (((15 + idx) % 16)*4)(SI), R12 \ - MOVL (((14 + idx) % 16)*4)(SI), AX \ - MOVL (((11 + idx) % 16)*4)(SI), R13 \ - MOVL (((9 + idx) % 16)*4)(SI), BX \ - MOVL (((7 + idx) % 16)*4)(SI), R14 \ - MOVL (((5 + idx) % 16)*4)(SI), CX \ - MOVL (((2 + idx) % 16)*4)(SI), R15 \ - MOVL (((0 + idx) % 16)*4)(SI), DX \ - SHRL $15, R12 \ - SHLL $16, AX \ - SHLL $1, BX \ - SHLL $1, CX \ - SHLL $1, DX \ - SHLDL(R12, AX, $16) \ - SHLDL(R13, BX, $16) \ - SHLDL(R14, CX, $16) \ - SHLDL(R15, DX, $16) + MOVL (((15 + idx) % 16)*4)(SI), R12 \ + MOVL (((14 + idx) % 16)*4)(SI), AX \ + MOVL (((11 + idx) % 16)*4)(SI), R13 \ + MOVL (((9 + idx) % 16)*4)(SI), BX \ + MOVL (((7 + idx) % 16)*4)(SI), R14 \ + MOVL (((5 + idx) % 16)*4)(SI), CX \ + MOVL (((2 + idx) % 16)*4)(SI), R15 \ + MOVL (((0 + idx) % 16)*4)(SI), DX \ + SHRL $15, R12 \ + SHLL $16, AX \ + SHLL $1, BX \ + SHLL $1, CX \ + SHLL $1, DX \ + SHLDL(R12, AX, $16) \ + SHLDL(R13, BX, $16) \ + SHLDL(R14, CX, $16) \ + SHLDL(R15, DX, $16) #define LFSR_UPDT(idx) \ - MOVL (((0 + idx) % 16)*4)(SI), BX \ - MOVL (((4 + idx) % 16)*4)(SI), CX \ - MOVL (((10 + idx) % 16)*4)(SI), DX \ - MOVL (((13 + idx) % 16)*4)(SI), R8 \ - MOVL (((15 + idx) % 16)*4)(SI), R9 \ - ADDQ BX, AX \ - SHLQ $8, BX \ - SHLQ $20, CX \ - SHLQ $21, DX \ - SHLQ $17, R8 \ - SHLQ $15, R9 \ - ADDQ BX, AX \ - ADDQ CX, AX \ - ADDQ DX, AX \ - ADDQ R8, AX \ - ADDQ R9, AX \ - \ - MOVQ AX, BX \ - ANDQ $0x7FFFFFFF, AX \ - SHRQ $31, BX \ - ADDQ BX, AX \ - \ - MOVQ AX, BX \ - SUBQ $0x7FFFFFFF, AX \ - CMOVQCS BX, AX \ - \ - MOVL AX, (((0 + idx) % 16)*4)(SI) + MOVL (((0 + idx) % 16)*4)(SI), BX \ + MOVL (((4 + idx) % 16)*4)(SI), CX \ + MOVL (((10 + idx) % 16)*4)(SI), DX \ + MOVL (((13 + idx) % 16)*4)(SI), R8 \ + MOVL (((15 + idx) % 16)*4)(SI), R9 \ + ADDQ BX, AX \ + SHLQ $8, BX \ + SHLQ $20, CX \ + SHLQ $21, DX \ + SHLQ $17, R8 \ + SHLQ $15, R9 \ + ADDQ BX, AX \ + ADDQ CX, AX \ + ADDQ DX, AX \ + ADDQ R8, AX \ + ADDQ R9, AX \ + \ + MOVQ AX, BX \ + ANDQ $0x7FFFFFFF, AX \ + SHRQ $31, BX \ + ADDQ BX, AX \ + \ + MOVQ AX, BX \ + SUBQ $0x7FFFFFFF, AX \ + CMOVQCS BX, AX \ + \ + MOVL AX, (((0 + idx) % 16)*4)(SI) #define NONLIN_FUN() \ - MOVL R12, AX \ - XORL R10, AX \ - ADDL R11, AX \ - ADDL R13, R10 \ // W1= F_R1 + BRC_X1 - XORL R14, R11 \ // W2= F_R2 ^ BRC_X2 - \ - MOVL R10, DX \ - MOVL R11, CX \ - SHLDL(DX, CX, $16) \ // P = (W1 << 16) | (W2 >> 16) - SHLDL(R11, R10, $16) \ // Q = (W2 << 16) | (W1 >> 16) - MOVL DX, BX \ - MOVL DX, CX \ - MOVL DX, R8 \ - MOVL DX, R9 \ - ROLL $2, BX \ - ROLL $10, CX \ - ROLL $18, R8 \ - ROLL $24, R9 \ - XORL BX, DX \ - XORL CX, DX \ - XORL R8, DX \ - XORL R9, DX \ // U = L1(P) = EDX, hi(RDX)=0 - MOVL R11, BX \ - MOVL R11, CX \ - MOVL R11, R8 \ - MOVL R11, R9 \ - ROLL $8, BX \ - ROLL $14, CX \ - ROLL $22, R8 \ - ROLL $30, R9 \ - XORL BX, R11 \ - XORL CX, R11 \ - XORL R8, R11 \ - XORL R9, R11 \ // V = L2(Q) = R11D, hi(R11)=0 - SHLQ $32, R11 \ - XORQ R11, DX + MOVL R12, AX \ + XORL R10, AX \ + ADDL R11, AX \ + ADDL R13, 
R10 \ // W1= F_R1 + BRC_X1 + XORL R14, R11 \ // W2= F_R2 ^ BRC_X2 + \ + MOVL R10, DX \ + MOVL R11, CX \ + SHLDL(DX, CX, $16) \ // P = (W1 << 16) | (W2 >> 16) + SHLDL(R11, R10, $16) \ // Q = (W2 << 16) | (W1 >> 16) + MOVL DX, BX \ + MOVL DX, CX \ + MOVL DX, R8 \ + MOVL DX, R9 \ + ROLL $2, BX \ + ROLL $10, CX \ + ROLL $18, R8 \ + ROLL $24, R9 \ + XORL BX, DX \ + XORL CX, DX \ + XORL R8, DX \ + XORL R9, DX \ // U = L1(P) = EDX, hi(RDX)=0 + MOVL R11, BX \ + MOVL R11, CX \ + MOVL R11, R8 \ + MOVL R11, R9 \ + ROLL $8, BX \ + ROLL $14, CX \ + ROLL $22, R8 \ + ROLL $30, R9 \ + XORL BX, R11 \ + XORL CX, R11 \ + XORL R8, R11 \ + XORL R9, R11 \ // V = L2(Q) = R11D, hi(R11)=0 + SHLQ $32, R11 \ + XORQ R11, DX #define NONLIN_FUN_SSE() \ - NONLIN_FUN() \ - MOVQ DX, X0 \ - MOVOU X0, X1 \ - S0_comput_SSE(X1, X2, X3) \ - S1_comput_SSE(X0, X2, X3, X4) \ - \ - PAND mask_S1<>(SB), X0 \ - PAND mask_S0<>(SB), X1 \ - PXOR X1, X0 \ - \ - MOVL X0, R10 \ // F_R1 - PEXTRD $1, X0, R11 + NONLIN_FUN() \ + MOVQ DX, X0 \ + MOVOU X0, X1 \ + S0_comput_SSE(X1, X2, X3) \ + S1_comput_SSE(X0, X2, X3, X4) \ + \ + PAND mask_S1<>(SB), X0 \ + PAND mask_S0<>(SB), X1 \ + PXOR X1, X0 \ + \ + MOVL X0, R10 \ // F_R1 + PEXTRD $1, X0, R11 #define RESTORE_LFSR_0() \ - MOVL (0*4)(SI), AX \ - MOVUPS (4)(SI), X0 \ - MOVUPS (20)(SI), X1 \ - MOVUPS (36)(SI), X2 \ - MOVQ (52)(SI), BX \ - MOVL (60)(SI), CX \ - \ - MOVUPS X0, (SI) \ - MOVUPS X1, (16)(SI) \ - MOVUPS X2, (32)(SI) \ - MOVQ BX, (48)(SI) \ - MOVL CX, (56)(SI) \ - MOVL AX, (60)(SI) + MOVL (0*4)(SI), AX \ + MOVUPS (4)(SI), X0 \ + MOVUPS (20)(SI), X1 \ + MOVUPS (36)(SI), X2 \ + MOVQ (52)(SI), BX \ + MOVL (60)(SI), CX \ + \ + MOVUPS X0, (SI) \ + MOVUPS X1, (16)(SI) \ + MOVUPS X2, (32)(SI) \ + MOVQ BX, (48)(SI) \ + MOVL CX, (56)(SI) \ + MOVL AX, (60)(SI) #define RESTORE_LFSR_2() \ - MOVQ (0)(SI), AX \ - MOVUPS (8)(SI), X0 \ - MOVUPS (24)(SI), X1 \ - MOVUPS (40)(SI), X2 \ - MOVQ (56)(SI), BX \ - \ - MOVUPS X0, (SI) \ - MOVUPS X1, (16)(SI) \ - MOVUPS X2, (32)(SI) \ - MOVQ BX, (48)(SI) \ - MOVQ AX, (56)(SI) + MOVQ (0)(SI), AX \ + MOVUPS (8)(SI), X0 \ + MOVUPS (24)(SI), X1 \ + MOVUPS (40)(SI), X2 \ + MOVQ (56)(SI), BX \ + \ + MOVUPS X0, (SI) \ + MOVUPS X1, (16)(SI) \ + MOVUPS X2, (32)(SI) \ + MOVQ BX, (48)(SI) \ + MOVQ AX, (56)(SI) #define RESTORE_LFSR_4() \ - MOVUPS (0)(SI), X0 \ - MOVUPS (16)(SI), X1 \ - MOVUPS (32)(SI), X2 \ - MOVUPS (48)(SI), X3 \ - \ - MOVUPS X1, (0)(SI) \ - MOVUPS X2, (16)(SI) \ - MOVUPS X3, (32)(SI) \ - MOVUPS X0, (48)(SI) + MOVUPS (0)(SI), X0 \ + MOVUPS (16)(SI), X1 \ + MOVUPS (32)(SI), X2 \ + MOVUPS (48)(SI), X3 \ + \ + MOVUPS X1, (0)(SI) \ + MOVUPS X2, (16)(SI) \ + MOVUPS X3, (32)(SI) \ + MOVUPS X0, (48)(SI) #define RESTORE_LFSR_8() \ - MOVUPS (0)(SI), X0 \ - MOVUPS (16)(SI), X1 \ - MOVUPS (32)(SI), X2 \ - MOVUPS (48)(SI), X3 \ - \ - MOVUPS X2, (0)(SI) \ - MOVUPS X3, (16)(SI) \ - MOVUPS X0, (32)(SI) \ - MOVUPS X1, (48)(SI) + MOVUPS (0)(SI), X0 \ + MOVUPS (16)(SI), X1 \ + MOVUPS (32)(SI), X2 \ + MOVUPS (48)(SI), X3 \ + \ + MOVUPS X2, (0)(SI) \ + MOVUPS X3, (16)(SI) \ + MOVUPS X0, (32)(SI) \ + MOVUPS X1, (48)(SI) #define NONLIN_FUN_AVX() \ - NONLIN_FUN() \ - VMOVQ DX, X0 \ - VMOVDQA X0, X1 \ - S0_comput_AVX(X1, X2, X3) \ - S1_comput_AVX(X0, X2, X3, X4) \ - \ - VPAND mask_S1<>(SB), X0, X0 \ - VPAND mask_S0<>(SB), X1, X1 \ - VPXOR X1, X0, X0 \ - \ - MOVL X0, R10 \ // F_R1 - VPEXTRD $1, X0, R11 + NONLIN_FUN() \ + VMOVQ DX, X0 \ + VMOVDQA X0, X1 \ + S0_comput_AVX(X1, X2, X3) \ + S1_comput_AVX(X0, X2, X3, X4) \ + \ + VPAND mask_S1<>(SB), X0, X0 \ + VPAND mask_S0<>(SB), X1, 
X1 \ + VPXOR X1, X0, X0 \ + \ + MOVL X0, R10 \ // F_R1 + VPEXTRD $1, X0, R11 #define LOAD_STATE() \ - MOVL OFFSET_FR1(SI), R10 \ - MOVL OFFSET_FR2(SI), R11 \ - MOVL OFFSET_BRC_X0(SI), R12 \ - MOVL OFFSET_BRC_X1(SI), R13 \ - MOVL OFFSET_BRC_X2(SI), R14 \ - MOVL OFFSET_BRC_X3(SI), R15 + MOVL OFFSET_FR1(SI), R10 \ + MOVL OFFSET_FR2(SI), R11 \ + MOVL OFFSET_BRC_X0(SI), R12 \ + MOVL OFFSET_BRC_X1(SI), R13 \ + MOVL OFFSET_BRC_X2(SI), R14 \ + MOVL OFFSET_BRC_X3(SI), R15 #define SAVE_STATE() \ - MOVL R10, OFFSET_FR1(SI) \ - MOVL R11, OFFSET_FR2(SI) \ - MOVL R12, OFFSET_BRC_X0(SI) \ - MOVL R13, OFFSET_BRC_X1(SI) \ - MOVL R14, OFFSET_BRC_X2(SI) \ - MOVL R15, OFFSET_BRC_X3(SI) + MOVL R10, OFFSET_FR1(SI) \ + MOVL R11, OFFSET_FR2(SI) \ + MOVL R12, OFFSET_BRC_X0(SI) \ + MOVL R13, OFFSET_BRC_X1(SI) \ + MOVL R14, OFFSET_BRC_X2(SI) \ + MOVL R15, OFFSET_BRC_X3(SI) // func genKeywordAsm(s *zucState32) uint32 TEXT ·genKeywordAsm(SB),NOSPLIT,$0 - MOVQ pState+0(FP), SI - - LOAD_STATE() + MOVQ pState+0(FP), SI - BITS_REORG(0) + LOAD_STATE() + + BITS_REORG(0) CMPB ·useAVX(SB), $1 JE avx sse: - NONLIN_FUN_SSE() + NONLIN_FUN_SSE() - XORL R15, AX - MOVL AX, ret+8(FP) - XORQ AX, AX - LFSR_UPDT(0) - SAVE_STATE() - RESTORE_LFSR_0() + XORL R15, AX + MOVL AX, ret+8(FP) + XORQ AX, AX + LFSR_UPDT(0) + SAVE_STATE() + RESTORE_LFSR_0() - RET + RET avx: - NONLIN_FUN_AVX() + NONLIN_FUN_AVX() - XORL R15, AX - MOVL AX, ret+8(FP) - XORQ AX, AX - LFSR_UPDT(0) - SAVE_STATE() - RESTORE_LFSR_0() + XORL R15, AX + MOVL AX, ret+8(FP) + XORQ AX, AX + LFSR_UPDT(0) + SAVE_STATE() + RESTORE_LFSR_0() - VZEROUPPER - RET + VZEROUPPER + RET #define ROUND_SSE(idx) \ - BITS_REORG(idx) \ - NONLIN_FUN_SSE() \ - XORL R15, AX \ - MOVL AX, (idx*4)(DI) \ - XORQ AX, AX \ - LFSR_UPDT(idx) + BITS_REORG(idx) \ + NONLIN_FUN_SSE() \ + XORL R15, AX \ + MOVL AX, (idx*4)(DI) \ + XORQ AX, AX \ + LFSR_UPDT(idx) #define ROUND_AVX(idx) \ - BITS_REORG(idx) \ - NONLIN_FUN_AVX() \ - XORL R15, AX \ - MOVL AX, (idx*4)(DI) \ - XORQ AX, AX \ - LFSR_UPDT(idx) + BITS_REORG(idx) \ + NONLIN_FUN_AVX() \ + XORL R15, AX \ + MOVL AX, (idx*4)(DI) \ + XORQ AX, AX \ + LFSR_UPDT(idx) #define ROUND_REV32_SSE(idx) \ - BITS_REORG(idx) \ - NONLIN_FUN_SSE() \ - XORL R15, AX \ - BSWAPL AX \ - MOVL AX, (idx*4)(DI) \ - XORQ AX, AX \ - LFSR_UPDT(idx) + BITS_REORG(idx) \ + NONLIN_FUN_SSE() \ + XORL R15, AX \ + BSWAPL AX \ + MOVL AX, (idx*4)(DI) \ + XORQ AX, AX \ + LFSR_UPDT(idx) #define ROUND_REV32_AVX(idx) \ - BITS_REORG(idx) \ - NONLIN_FUN_AVX() \ - XORL R15, AX \ - BSWAPL AX \ - MOVL AX, (idx*4)(DI) \ - XORQ AX, AX \ - LFSR_UPDT(idx) + BITS_REORG(idx) \ + NONLIN_FUN_AVX() \ + XORL R15, AX \ + BSWAPL AX \ + MOVL AX, (idx*4)(DI) \ + XORQ AX, AX \ + LFSR_UPDT(idx) // func genKeyStreamAsm(keyStream []uint32, pState *zucState32) TEXT ·genKeyStreamAsm(SB),NOSPLIT,$0 - MOVQ ks+0(FP), DI - MOVQ ks_len+8(FP), BP - MOVQ pState+24(FP), SI + MOVQ ks+0(FP), DI + MOVQ ks_len+8(FP), BP + MOVQ pState+24(FP), SI - LOAD_STATE() + LOAD_STATE() CMPB ·useAVX(SB), $1 JE avxZucSixteens sseZucSixteens: - CMPQ BP, $16 - JB sseZucOctet - SUBQ $16, BP - ROUND_SSE(0) - ROUND_SSE(1) - ROUND_SSE(2) - ROUND_SSE(3) - ROUND_SSE(4) - ROUND_SSE(5) - ROUND_SSE(6) - ROUND_SSE(7) - ROUND_SSE(8) - ROUND_SSE(9) - ROUND_SSE(10) - ROUND_SSE(11) - ROUND_SSE(12) - ROUND_SSE(13) - ROUND_SSE(14) - ROUND_SSE(15) - LEAQ 64(DI), DI - JMP sseZucSixteens + CMPQ BP, $16 + JB sseZucOctet + SUBQ $16, BP + ROUND_SSE(0) + ROUND_SSE(1) + ROUND_SSE(2) + ROUND_SSE(3) + ROUND_SSE(4) + ROUND_SSE(5) + ROUND_SSE(6) + ROUND_SSE(7) + ROUND_SSE(8) + 
ROUND_SSE(9) + ROUND_SSE(10) + ROUND_SSE(11) + ROUND_SSE(12) + ROUND_SSE(13) + ROUND_SSE(14) + ROUND_SSE(15) + LEAQ 64(DI), DI + JMP sseZucSixteens sseZucOctet: - CMPQ BP, $8 - JB sseZucNibble - SUBQ $8, BP - ROUND_SSE(0) - ROUND_SSE(1) - ROUND_SSE(2) - ROUND_SSE(3) - ROUND_SSE(4) - ROUND_SSE(5) - ROUND_SSE(6) - ROUND_SSE(7) - LEAQ 32(DI), DI - RESTORE_LFSR_8() + CMPQ BP, $8 + JB sseZucNibble + SUBQ $8, BP + ROUND_SSE(0) + ROUND_SSE(1) + ROUND_SSE(2) + ROUND_SSE(3) + ROUND_SSE(4) + ROUND_SSE(5) + ROUND_SSE(6) + ROUND_SSE(7) + LEAQ 32(DI), DI + RESTORE_LFSR_8() sseZucNibble: - CMPQ BP, $4 - JB sseZucDouble - SUBQ $4, BP - ROUND_SSE(0) - ROUND_SSE(1) - ROUND_SSE(2) - ROUND_SSE(3) - LEAQ 16(DI), DI - RESTORE_LFSR_4() + CMPQ BP, $4 + JB sseZucDouble + SUBQ $4, BP + ROUND_SSE(0) + ROUND_SSE(1) + ROUND_SSE(2) + ROUND_SSE(3) + LEAQ 16(DI), DI + RESTORE_LFSR_4() sseZucDouble: - CMPQ BP, $2 - JB sseZucSingle - SUBQ $2, BP - ROUND_SSE(0) - ROUND_SSE(1) - LEAQ 8(DI), DI - RESTORE_LFSR_2() + CMPQ BP, $2 + JB sseZucSingle + SUBQ $2, BP + ROUND_SSE(0) + ROUND_SSE(1) + LEAQ 8(DI), DI + RESTORE_LFSR_2() sseZucSingle: - TESTQ BP, BP - JE sseZucRet - ROUND_SSE(0) - RESTORE_LFSR_0() + TESTQ BP, BP + JE sseZucRet + ROUND_SSE(0) + RESTORE_LFSR_0() sseZucRet: - SAVE_STATE() - RET + SAVE_STATE() + RET avxZucSixteens: - CMPQ BP, $16 - JB avxZucOctet - SUBQ $16, BP - ROUND_AVX(0) - ROUND_AVX(1) - ROUND_AVX(2) - ROUND_AVX(3) - ROUND_AVX(4) - ROUND_AVX(5) - ROUND_AVX(6) - ROUND_AVX(7) - ROUND_AVX(8) - ROUND_AVX(9) - ROUND_AVX(10) - ROUND_AVX(11) - ROUND_AVX(12) - ROUND_AVX(13) - ROUND_AVX(14) - ROUND_AVX(15) - LEAQ 64(DI), DI - JMP avxZucSixteens + CMPQ BP, $16 + JB avxZucOctet + SUBQ $16, BP + ROUND_AVX(0) + ROUND_AVX(1) + ROUND_AVX(2) + ROUND_AVX(3) + ROUND_AVX(4) + ROUND_AVX(5) + ROUND_AVX(6) + ROUND_AVX(7) + ROUND_AVX(8) + ROUND_AVX(9) + ROUND_AVX(10) + ROUND_AVX(11) + ROUND_AVX(12) + ROUND_AVX(13) + ROUND_AVX(14) + ROUND_AVX(15) + LEAQ 64(DI), DI + JMP avxZucSixteens avxZucOctet: - CMPQ BP, $8 - JB avxZucNibble - SUBQ $8, BP - ROUND_AVX(0) - ROUND_AVX(1) - ROUND_AVX(2) - ROUND_AVX(3) - ROUND_AVX(4) - ROUND_AVX(5) - ROUND_AVX(6) - ROUND_AVX(7) - LEAQ 32(DI), DI - RESTORE_LFSR_8() + CMPQ BP, $8 + JB avxZucNibble + SUBQ $8, BP + ROUND_AVX(0) + ROUND_AVX(1) + ROUND_AVX(2) + ROUND_AVX(3) + ROUND_AVX(4) + ROUND_AVX(5) + ROUND_AVX(6) + ROUND_AVX(7) + LEAQ 32(DI), DI + RESTORE_LFSR_8() avxZucNibble: - CMPQ BP, $4 - JB avxZucDouble - SUBQ $4, BP - ROUND_AVX(0) - ROUND_AVX(1) - ROUND_AVX(2) - ROUND_AVX(3) - LEAQ 16(DI), DI - RESTORE_LFSR_4() + CMPQ BP, $4 + JB avxZucDouble + SUBQ $4, BP + ROUND_AVX(0) + ROUND_AVX(1) + ROUND_AVX(2) + ROUND_AVX(3) + LEAQ 16(DI), DI + RESTORE_LFSR_4() avxZucDouble: - CMPQ BP, $2 - JB avxZucSingle - SUBQ $2, BP - ROUND_AVX(0) - ROUND_AVX(1) - LEAQ 8(DI), DI - RESTORE_LFSR_2() + CMPQ BP, $2 + JB avxZucSingle + SUBQ $2, BP + ROUND_AVX(0) + ROUND_AVX(1) + LEAQ 8(DI), DI + RESTORE_LFSR_2() avxZucSingle: - TESTQ BP, BP - JE avxZucRet - ROUND_AVX(0) - RESTORE_LFSR_0() + TESTQ BP, BP + JE avxZucRet + ROUND_AVX(0) + RESTORE_LFSR_0() avxZucRet: - SAVE_STATE() - VZEROUPPER - RET + SAVE_STATE() + VZEROUPPER + RET // func genKeyStreamRev32Asm(keyStream []byte, pState *zucState32) TEXT ·genKeyStreamRev32Asm(SB),NOSPLIT,$0 - MOVQ ks+0(FP), DI - MOVQ ks_len+8(FP), BP - MOVQ pState+24(FP), SI + MOVQ ks+0(FP), DI + MOVQ ks_len+8(FP), BP + MOVQ pState+24(FP), SI - SHRQ $2, BP + SHRQ $2, BP - LOAD_STATE() + LOAD_STATE() CMPB ·useAVX(SB), $1 JE avxZucSixteens sseZucSixteens: - CMPQ BP, $16 - JB sseZucOctet - 
SUBQ $16, BP - ROUND_REV32_SSE(0) - ROUND_REV32_SSE(1) - ROUND_REV32_SSE(2) - ROUND_REV32_SSE(3) - ROUND_REV32_SSE(4) - ROUND_REV32_SSE(5) - ROUND_REV32_SSE(6) - ROUND_REV32_SSE(7) - ROUND_REV32_SSE(8) - ROUND_REV32_SSE(9) - ROUND_REV32_SSE(10) - ROUND_REV32_SSE(11) - ROUND_REV32_SSE(12) - ROUND_REV32_SSE(13) - ROUND_REV32_SSE(14) - ROUND_REV32_SSE(15) - LEAQ 64(DI), DI - JMP sseZucSixteens + CMPQ BP, $16 + JB sseZucOctet + SUBQ $16, BP + ROUND_REV32_SSE(0) + ROUND_REV32_SSE(1) + ROUND_REV32_SSE(2) + ROUND_REV32_SSE(3) + ROUND_REV32_SSE(4) + ROUND_REV32_SSE(5) + ROUND_REV32_SSE(6) + ROUND_REV32_SSE(7) + ROUND_REV32_SSE(8) + ROUND_REV32_SSE(9) + ROUND_REV32_SSE(10) + ROUND_REV32_SSE(11) + ROUND_REV32_SSE(12) + ROUND_REV32_SSE(13) + ROUND_REV32_SSE(14) + ROUND_REV32_SSE(15) + LEAQ 64(DI), DI + JMP sseZucSixteens sseZucOctet: - CMPQ BP, $8 - JB sseZucNibble - SUBQ $8, BP - ROUND_REV32_SSE(0) - ROUND_REV32_SSE(1) - ROUND_REV32_SSE(2) - ROUND_REV32_SSE(3) - ROUND_REV32_SSE(4) - ROUND_REV32_SSE(5) - ROUND_REV32_SSE(6) - ROUND_REV32_SSE(7) - LEAQ 32(DI), DI - RESTORE_LFSR_8() + CMPQ BP, $8 + JB sseZucNibble + SUBQ $8, BP + ROUND_REV32_SSE(0) + ROUND_REV32_SSE(1) + ROUND_REV32_SSE(2) + ROUND_REV32_SSE(3) + ROUND_REV32_SSE(4) + ROUND_REV32_SSE(5) + ROUND_REV32_SSE(6) + ROUND_REV32_SSE(7) + LEAQ 32(DI), DI + RESTORE_LFSR_8() sseZucNibble: - CMPQ BP, $4 - JB sseZucDouble - SUBQ $4, BP - ROUND_REV32_SSE(0) - ROUND_REV32_SSE(1) - ROUND_REV32_SSE(2) - ROUND_REV32_SSE(3) - LEAQ 16(DI), DI - RESTORE_LFSR_4() + CMPQ BP, $4 + JB sseZucDouble + SUBQ $4, BP + ROUND_REV32_SSE(0) + ROUND_REV32_SSE(1) + ROUND_REV32_SSE(2) + ROUND_REV32_SSE(3) + LEAQ 16(DI), DI + RESTORE_LFSR_4() sseZucDouble: - CMPQ BP, $2 - JB sseZucSingle - SUBQ $2, BP - ROUND_REV32_SSE(0) - ROUND_REV32_SSE(1) - LEAQ 8(DI), DI - RESTORE_LFSR_2() + CMPQ BP, $2 + JB sseZucSingle + SUBQ $2, BP + ROUND_REV32_SSE(0) + ROUND_REV32_SSE(1) + LEAQ 8(DI), DI + RESTORE_LFSR_2() sseZucSingle: - TESTQ BP, BP - JE sseZucRet - ROUND_REV32_SSE(0) - RESTORE_LFSR_0() + TESTQ BP, BP + JE sseZucRet + ROUND_REV32_SSE(0) + RESTORE_LFSR_0() sseZucRet: - SAVE_STATE() - RET + SAVE_STATE() + RET avxZucSixteens: - CMPQ BP, $16 - JB avxZucOctet - SUBQ $16, BP - ROUND_REV32_AVX(0) - ROUND_REV32_AVX(1) - ROUND_REV32_AVX(2) - ROUND_REV32_AVX(3) - ROUND_REV32_AVX(4) - ROUND_REV32_AVX(5) - ROUND_REV32_AVX(6) - ROUND_REV32_AVX(7) - ROUND_REV32_AVX(8) - ROUND_REV32_AVX(9) - ROUND_REV32_AVX(10) - ROUND_REV32_AVX(11) - ROUND_REV32_AVX(12) - ROUND_REV32_AVX(13) - ROUND_REV32_AVX(14) - ROUND_REV32_AVX(15) - LEAQ 64(DI), DI - JMP avxZucSixteens + CMPQ BP, $16 + JB avxZucOctet + SUBQ $16, BP + ROUND_REV32_AVX(0) + ROUND_REV32_AVX(1) + ROUND_REV32_AVX(2) + ROUND_REV32_AVX(3) + ROUND_REV32_AVX(4) + ROUND_REV32_AVX(5) + ROUND_REV32_AVX(6) + ROUND_REV32_AVX(7) + ROUND_REV32_AVX(8) + ROUND_REV32_AVX(9) + ROUND_REV32_AVX(10) + ROUND_REV32_AVX(11) + ROUND_REV32_AVX(12) + ROUND_REV32_AVX(13) + ROUND_REV32_AVX(14) + ROUND_REV32_AVX(15) + LEAQ 64(DI), DI + JMP avxZucSixteens avxZucOctet: - CMPQ BP, $8 - JB avxZucNibble - SUBQ $8, BP - ROUND_REV32_AVX(0) - ROUND_REV32_AVX(1) - ROUND_REV32_AVX(2) - ROUND_REV32_AVX(3) - ROUND_REV32_AVX(4) - ROUND_REV32_AVX(5) - ROUND_REV32_AVX(6) - ROUND_REV32_AVX(7) - LEAQ 32(DI), DI - RESTORE_LFSR_8() + CMPQ BP, $8 + JB avxZucNibble + SUBQ $8, BP + ROUND_REV32_AVX(0) + ROUND_REV32_AVX(1) + ROUND_REV32_AVX(2) + ROUND_REV32_AVX(3) + ROUND_REV32_AVX(4) + ROUND_REV32_AVX(5) + ROUND_REV32_AVX(6) + ROUND_REV32_AVX(7) + LEAQ 32(DI), DI + RESTORE_LFSR_8() avxZucNibble: 
- CMPQ BP, $4 - JB avxZucDouble - SUBQ $4, BP - ROUND_REV32_AVX(0) - ROUND_REV32_AVX(1) - ROUND_REV32_AVX(2) - ROUND_REV32_AVX(3) - LEAQ 16(DI), DI - RESTORE_LFSR_4() + CMPQ BP, $4 + JB avxZucDouble + SUBQ $4, BP + ROUND_REV32_AVX(0) + ROUND_REV32_AVX(1) + ROUND_REV32_AVX(2) + ROUND_REV32_AVX(3) + LEAQ 16(DI), DI + RESTORE_LFSR_4() avxZucDouble: - CMPQ BP, $2 - JB avxZucSingle - SUBQ $2, BP - ROUND_REV32_AVX(0) - ROUND_REV32_AVX(1) - LEAQ 8(DI), DI - RESTORE_LFSR_2() + CMPQ BP, $2 + JB avxZucSingle + SUBQ $2, BP + ROUND_REV32_AVX(0) + ROUND_REV32_AVX(1) + LEAQ 8(DI), DI + RESTORE_LFSR_2() avxZucSingle: - TESTQ BP, BP - JE avxZucRet - ROUND_REV32_AVX(0) - RESTORE_LFSR_0() + TESTQ BP, BP + JE avxZucRet + ROUND_REV32_AVX(0) + RESTORE_LFSR_0() avxZucRet: - SAVE_STATE() - VZEROUPPER - RET + SAVE_STATE() + VZEROUPPER + RET diff --git a/zuc/asm_arm64.s b/zuc/asm_arm64.s index 00ee010..d3ce577 100644 --- a/zuc/asm_arm64.s +++ b/zuc/asm_arm64.s @@ -127,35 +127,35 @@ GLOBL mask_S1<>(SB), RODATA, $16 VMOV R1, INVERSE_SHIFT_ROWS.D[1] #define SHLDL(a, b, n) \ // NO SHLDL in GOLANG now - LSLW n, a \ - LSRW n, b \ - ORRW b, a + LSLW n, a \ + LSRW n, b \ + ORRW b, a #define Rotl_5(XDATA, XTMP0) \ - VSHL $5, XDATA.S4, XTMP0.S4 \ - VUSHR $3, XDATA.S4, XDATA.S4 \ - VAND TOP3_BITS.B16, XTMP0.B16, XTMP0.B16 \ - VAND BOTTOM5_BITS.B16, XDATA.B16, XDATA.B16 \ - VORR XTMP0.B16, XDATA.B16, XDATA.B16 + VSHL $5, XDATA.S4, XTMP0.S4 \ + VUSHR $3, XDATA.S4, XDATA.S4 \ + VAND TOP3_BITS.B16, XTMP0.B16, XTMP0.B16 \ + VAND BOTTOM5_BITS.B16, XDATA.B16, XDATA.B16 \ + VORR XTMP0.B16, XDATA.B16, XDATA.B16 #define S0_comput(IN_OUT, XTMP1, XTMP2) \ - VUSHR $4, IN_OUT.S4, XTMP1.S4 \ - VAND NIBBLE_MASK.B16, XTMP1.B16, XTMP1.B16 \ - \ - VAND NIBBLE_MASK.B16, IN_OUT.B16, IN_OUT.B16 \ - \ - VTBL IN_OUT.B16, [P1.B16], XTMP2.B16 \ - VEOR XTMP1.B16, XTMP2.B16, XTMP2.B16 \ - \ - VTBL XTMP2.B16, [P2.B16], XTMP1.B16 \ - VEOR IN_OUT.B16, XTMP1.B16, XTMP1.B16 \ - \ - VTBL XTMP1.B16, [P3.B16], IN_OUT.B16 \ - VEOR XTMP2.B16, IN_OUT.B16, IN_OUT.B16 \ - \ - VSHL $4, IN_OUT.S4, IN_OUT.S4 \ - VEOR XTMP1.B16, IN_OUT.B16, IN_OUT.B16 \ - Rotl_5(IN_OUT, XTMP1) + VUSHR $4, IN_OUT.S4, XTMP1.S4 \ + VAND NIBBLE_MASK.B16, XTMP1.B16, XTMP1.B16 \ + \ + VAND NIBBLE_MASK.B16, IN_OUT.B16, IN_OUT.B16 \ + \ + VTBL IN_OUT.B16, [P1.B16], XTMP2.B16 \ + VEOR XTMP1.B16, XTMP2.B16, XTMP2.B16 \ + \ + VTBL XTMP2.B16, [P2.B16], XTMP1.B16 \ + VEOR IN_OUT.B16, XTMP1.B16, XTMP1.B16 \ + \ + VTBL XTMP1.B16, [P3.B16], IN_OUT.B16 \ + VEOR XTMP2.B16, IN_OUT.B16, IN_OUT.B16 \ + \ + VSHL $4, IN_OUT.S4, IN_OUT.S4 \ + VEOR XTMP1.B16, IN_OUT.B16, IN_OUT.B16 \ + Rotl_5(IN_OUT, XTMP1) #define S1_comput(x, XTMP1, XTMP2) \ VAND x.B16, NIBBLE_MASK.B16, XTMP1.B16; \ @@ -174,335 +174,335 @@ GLOBL mask_S1<>(SB), RODATA, $16 VEOR XTMP2.B16, XTMP1.B16, x.B16 #define BITS_REORG(idx) \ - MOVW (((15 + idx) % 16)*4)(SI), R12 \ - MOVW (((14 + idx) % 16)*4)(SI), AX \ - MOVW (((11 + idx) % 16)*4)(SI), R13 \ - MOVW (((9 + idx) % 16)*4)(SI), BX \ - MOVW (((7 + idx) % 16)*4)(SI), R14 \ - MOVW (((5 + idx) % 16)*4)(SI), CX \ - MOVW (((2 + idx) % 16)*4)(SI), R15 \ - MOVW (((0 + idx) % 16)*4)(SI), DX \ - LSRW $15, R12 \ - LSLW $16, AX \ - LSLW $1, BX \ - LSLW $1, CX \ - LSLW $1, DX \ - SHLDL(R12, AX, $16) \ - SHLDL(R13, BX, $16) \ - SHLDL(R14, CX, $16) \ - SHLDL(R15, DX, $16) + MOVW (((15 + idx) % 16)*4)(SI), R12 \ + MOVW (((14 + idx) % 16)*4)(SI), AX \ + MOVW (((11 + idx) % 16)*4)(SI), R13 \ + MOVW (((9 + idx) % 16)*4)(SI), BX \ + MOVW (((7 + idx) % 16)*4)(SI), R14 \ + MOVW (((5 + idx) % 16)*4)(SI), CX \ + 
MOVW (((2 + idx) % 16)*4)(SI), R15 \ + MOVW (((0 + idx) % 16)*4)(SI), DX \ + LSRW $15, R12 \ + LSLW $16, AX \ + LSLW $1, BX \ + LSLW $1, CX \ + LSLW $1, DX \ + SHLDL(R12, AX, $16) \ + SHLDL(R13, BX, $16) \ + SHLDL(R14, CX, $16) \ + SHLDL(R15, DX, $16) #define LFSR_UPDT(idx) \ - MOVW (((0 + idx) % 16)*4)(SI), BX \ - MOVW (((4 + idx) % 16)*4)(SI), CX \ - MOVW (((10 + idx) % 16)*4)(SI), DX \ - MOVW (((13 + idx) % 16)*4)(SI), R8 \ - MOVW (((15 + idx) % 16)*4)(SI), R9 \ - ADD BX, AX \ - LSL $8, BX \ - LSL $20, CX \ - LSL $21, DX \ - LSL $17, R8 \ - LSL $15, R9 \ - ADD BX, AX \ - ADD CX, AX \ - ADD DX, AX \ - ADD R8, AX \ - ADD R9, AX \ - \ - MOVD AX, BX \ - AND $0x7FFFFFFF, AX \ - LSR $31, BX \ - ADD BX, AX \ - \ - SUBS $0x7FFFFFFF, AX, BX \ - CSEL CS, BX, AX, AX \ - \ - MOVW AX, (((0 + idx) % 16)*4)(SI) + MOVW (((0 + idx) % 16)*4)(SI), BX \ + MOVW (((4 + idx) % 16)*4)(SI), CX \ + MOVW (((10 + idx) % 16)*4)(SI), DX \ + MOVW (((13 + idx) % 16)*4)(SI), R8 \ + MOVW (((15 + idx) % 16)*4)(SI), R9 \ + ADD BX, AX \ + LSL $8, BX \ + LSL $20, CX \ + LSL $21, DX \ + LSL $17, R8 \ + LSL $15, R9 \ + ADD BX, AX \ + ADD CX, AX \ + ADD DX, AX \ + ADD R8, AX \ + ADD R9, AX \ + \ + MOVD AX, BX \ + AND $0x7FFFFFFF, AX \ + LSR $31, BX \ + ADD BX, AX \ + \ + SUBS $0x7FFFFFFF, AX, BX \ + CSEL CS, BX, AX, AX \ + \ + MOVW AX, (((0 + idx) % 16)*4)(SI) #define NONLIN_FUN() \ - MOVW R12, AX \ - EORW R10, AX \ - ADDW R11, AX \ - ADDW R13, R10 \ // W1= F_R1 + BRC_X1 - EORW R14, R11 \ // W2= F_R2 ^ BRC_X2 - \ - MOVW R10, DX \ - MOVW R11, CX \ - SHLDL(DX, CX, $16) \ // P = (W1 << 16) | (W2 >> 16) - SHLDL(R11, R10, $16) \ // Q = (W2 << 16) | (W1 >> 16) - MOVW DX, BX \ - MOVW DX, CX \ - MOVW DX, R8 \ - MOVW DX, R9 \ - RORW $30, BX \ - RORW $22, CX \ - RORW $14, R8 \ - RORW $8, R9 \ - EORW BX, DX \ - EORW CX, DX \ - EORW R8, DX \ - EORW R9, DX \ // U = L1(P) = EDX, hi(RDX)=0 - MOVW R11, BX \ - MOVW R11, CX \ - MOVW R11, R8 \ - MOVW R11, R9 \ - RORW $24, BX \ - RORW $18, CX \ - RORW $10, R8 \ - RORW $2, R9 \ - EORW BX, R11 \ - EORW CX, R11 \ - EORW R8, R11 \ - EORW R9, R11 \ // V = L2(Q) = R11D, hi(R11)=0 - LSL $32, R11 \ - EOR R11, DX \ - VMOV DX, V0.D2 \ - VMOV V0.B16, V1.B16 \ - S0_comput(V1, V2, V3) \ - S1_comput(V0, V2, V3) \ - \ - VAND S1_MASK.B16, V0.B16, V0.B16 \ - VAND S0_MASK.B16, V1.B16, V1.B16 \ - VEOR V1.B16, V0.B16, V0.B16 \ - \ - VMOV V0.S[0], R10 \ // F_R1 - VMOV V0.S[1], R11 + MOVW R12, AX \ + EORW R10, AX \ + ADDW R11, AX \ + ADDW R13, R10 \ // W1= F_R1 + BRC_X1 + EORW R14, R11 \ // W2= F_R2 ^ BRC_X2 + \ + MOVW R10, DX \ + MOVW R11, CX \ + SHLDL(DX, CX, $16) \ // P = (W1 << 16) | (W2 >> 16) + SHLDL(R11, R10, $16) \ // Q = (W2 << 16) | (W1 >> 16) + MOVW DX, BX \ + MOVW DX, CX \ + MOVW DX, R8 \ + MOVW DX, R9 \ + RORW $30, BX \ + RORW $22, CX \ + RORW $14, R8 \ + RORW $8, R9 \ + EORW BX, DX \ + EORW CX, DX \ + EORW R8, DX \ + EORW R9, DX \ // U = L1(P) = EDX, hi(RDX)=0 + MOVW R11, BX \ + MOVW R11, CX \ + MOVW R11, R8 \ + MOVW R11, R9 \ + RORW $24, BX \ + RORW $18, CX \ + RORW $10, R8 \ + RORW $2, R9 \ + EORW BX, R11 \ + EORW CX, R11 \ + EORW R8, R11 \ + EORW R9, R11 \ // V = L2(Q) = R11D, hi(R11)=0 + LSL $32, R11 \ + EOR R11, DX \ + VMOV DX, V0.D2 \ + VMOV V0.B16, V1.B16 \ + S0_comput(V1, V2, V3) \ + S1_comput(V0, V2, V3) \ + \ + VAND S1_MASK.B16, V0.B16, V0.B16 \ + VAND S0_MASK.B16, V1.B16, V1.B16 \ + VEOR V1.B16, V0.B16, V0.B16 \ + \ + VMOV V0.S[0], R10 \ // F_R1 + VMOV V0.S[1], R11 #define RESTORE_LFSR_0() \ - MOVW.P 4(SI), AX \ - VLD1 (SI), [V0.B16, V1.B16, V2.B16] \ - SUB $4, SI \ - MOVD (52)(SI), BX \ - 
MOVW (60)(SI), CX \ - \ - VST1 [V0.B16, V1.B16, V2.B16], (SI) \ - MOVD BX, (48)(SI) \ - MOVW CX, (56)(SI) \ - MOVW AX, (60)(SI) + MOVW.P 4(SI), AX \ + VLD1 (SI), [V0.B16, V1.B16, V2.B16] \ + SUB $4, SI \ + MOVD (52)(SI), BX \ + MOVW (60)(SI), CX \ + \ + VST1 [V0.B16, V1.B16, V2.B16], (SI) \ + MOVD BX, (48)(SI) \ + MOVW CX, (56)(SI) \ + MOVW AX, (60)(SI) #define RESTORE_LFSR_2() \ - MOVD.P 8(SI), AX \ - VLD1 (SI), [V0.B16, V1.B16, V2.B16] \ - SUB $8, SI \ - MOVD (56)(SI), BX \ - \ - VST1 [V0.B16, V1.B16, V2.B16], (SI) \ - MOVD BX, (48)(SI) \ - MOVD AX, (56)(SI) + MOVD.P 8(SI), AX \ + VLD1 (SI), [V0.B16, V1.B16, V2.B16] \ + SUB $8, SI \ + MOVD (56)(SI), BX \ + \ + VST1 [V0.B16, V1.B16, V2.B16], (SI) \ + MOVD BX, (48)(SI) \ + MOVD AX, (56)(SI) #define RESTORE_LFSR_4() \ - VLD1 (SI), [V0.B16, V1.B16, V2.B16, V3.B16] \ - \ - VST1.P [V1.B16, V2.B16, V3.B16], 48(SI) \ - VST1 [V0.B16], (SI) \ - SUB $48, SI + VLD1 (SI), [V0.B16, V1.B16, V2.B16, V3.B16] \ + \ + VST1.P [V1.B16, V2.B16, V3.B16], 48(SI) \ + VST1 [V0.B16], (SI) \ + SUB $48, SI #define RESTORE_LFSR_8() \ - VLD1 (SI), [V0.B16, V1.B16, V2.B16, V3.B16] \ - \ - VST1.P [V2.B16, V3.B16], 32(SI) \ - VST1 [V0.B16, V1.B16], (SI) \ - SUB $32, SI + VLD1 (SI), [V0.B16, V1.B16, V2.B16, V3.B16] \ + \ + VST1.P [V2.B16, V3.B16], 32(SI) \ + VST1 [V0.B16, V1.B16], (SI) \ + SUB $32, SI #define LOAD_STATE(r) \ - MOVW 64+r, R10 \ - MOVW 68+r, R11 \ - MOVW 72+r, R12 \ - MOVW 76+r, R13 \ - MOVW 80+r, R14 \ - MOVW 84+r, R15 + MOVW 64+r, R10 \ + MOVW 68+r, R11 \ + MOVW 72+r, R12 \ + MOVW 76+r, R13 \ + MOVW 80+r, R14 \ + MOVW 84+r, R15 #define SAVE_STATE(r) \ - MOVW R10, 64+r \ - MOVW R11, 68+r \ - MOVW R12, 72+r \ - MOVW R13, 76+r \ - MOVW R14, 80+r \ - MOVW R15, 84+r + MOVW R10, 64+r \ + MOVW R11, 68+r \ + MOVW R12, 72+r \ + MOVW R13, 76+r \ + MOVW R14, 80+r \ + MOVW R15, 84+r // func genKeywordAsm(s *zucState32) uint32 TEXT ·genKeywordAsm(SB),NOSPLIT,$0 - LOAD_GLOBAL_DATA() - VEOR ZERO.B16, ZERO.B16, ZERO.B16 + LOAD_GLOBAL_DATA() + VEOR ZERO.B16, ZERO.B16, ZERO.B16 - MOVD pState+0(FP), SI - LOAD_STATE(0(SI)) + MOVD pState+0(FP), SI + LOAD_STATE(0(SI)) - BITS_REORG(0) - NONLIN_FUN() + BITS_REORG(0) + NONLIN_FUN() - EORW R15, AX - MOVW AX, ret+8(FP) - EOR AX, AX - LFSR_UPDT(0) - SAVE_STATE(0(SI)) - RESTORE_LFSR_0() + EORW R15, AX + MOVW AX, ret+8(FP) + EOR AX, AX + LFSR_UPDT(0) + SAVE_STATE(0(SI)) + RESTORE_LFSR_0() - RET + RET #define ONEROUND(idx) \ - BITS_REORG(idx) \ - NONLIN_FUN() \ - EORW R15, AX \ - MOVW AX, (idx*4)(DI) \ - EOR AX, AX \ - LFSR_UPDT(idx) + BITS_REORG(idx) \ + NONLIN_FUN() \ + EORW R15, AX \ + MOVW AX, (idx*4)(DI) \ + EOR AX, AX \ + LFSR_UPDT(idx) #define ROUND_REV32(idx) \ - BITS_REORG(idx) \ - NONLIN_FUN() \ - EORW R15, AX \ - REVW AX, AX \ - MOVW AX, (idx*4)(DI) \ - EOR AX, AX \ - LFSR_UPDT(idx) + BITS_REORG(idx) \ + NONLIN_FUN() \ + EORW R15, AX \ + REVW AX, AX \ + MOVW AX, (idx*4)(DI) \ + EOR AX, AX \ + LFSR_UPDT(idx) // func genKeyStreamAsm(keyStream []uint32, pState *zucState32) TEXT ·genKeyStreamAsm(SB),NOSPLIT,$0 - LOAD_GLOBAL_DATA() - VEOR ZERO.B16, ZERO.B16, ZERO.B16 + LOAD_GLOBAL_DATA() + VEOR ZERO.B16, ZERO.B16, ZERO.B16 - MOVD ks+0(FP), DI - MOVD ks_len+8(FP), BP - MOVD pState+24(FP), SI + MOVD ks+0(FP), DI + MOVD ks_len+8(FP), BP + MOVD pState+24(FP), SI - LOAD_STATE(0(SI)) + LOAD_STATE(0(SI)) zucSixteens: - CMP $16, BP - BLT zucOctet - SUB $16, BP - ONEROUND(0) - ONEROUND(1) - ONEROUND(2) - ONEROUND(3) - ONEROUND(4) - ONEROUND(5) - ONEROUND(6) - ONEROUND(7) - ONEROUND(8) - ONEROUND(9) - ONEROUND(10) - ONEROUND(11) - 
ONEROUND(12) - ONEROUND(13) - ONEROUND(14) - ONEROUND(15) - ADD $4*16, DI - B zucSixteens + CMP $16, BP + BLT zucOctet + SUB $16, BP + ONEROUND(0) + ONEROUND(1) + ONEROUND(2) + ONEROUND(3) + ONEROUND(4) + ONEROUND(5) + ONEROUND(6) + ONEROUND(7) + ONEROUND(8) + ONEROUND(9) + ONEROUND(10) + ONEROUND(11) + ONEROUND(12) + ONEROUND(13) + ONEROUND(14) + ONEROUND(15) + ADD $4*16, DI + B zucSixteens zucOctet: - CMP $8, BP - BLT zucNibble - SUB $8, BP - ONEROUND(0) - ONEROUND(1) - ONEROUND(2) - ONEROUND(3) - ONEROUND(4) - ONEROUND(5) - ONEROUND(6) - ONEROUND(7) - ADD $2*16, DI - RESTORE_LFSR_8() + CMP $8, BP + BLT zucNibble + SUB $8, BP + ONEROUND(0) + ONEROUND(1) + ONEROUND(2) + ONEROUND(3) + ONEROUND(4) + ONEROUND(5) + ONEROUND(6) + ONEROUND(7) + ADD $2*16, DI + RESTORE_LFSR_8() zucNibble: - CMP $4, BP - BLT zucDouble - SUB $4, BP - ONEROUND(0) - ONEROUND(1) - ONEROUND(2) - ONEROUND(3) - ADD $1*16, DI - RESTORE_LFSR_4() + CMP $4, BP + BLT zucDouble + SUB $4, BP + ONEROUND(0) + ONEROUND(1) + ONEROUND(2) + ONEROUND(3) + ADD $1*16, DI + RESTORE_LFSR_4() zucDouble: - CMP $2, BP - BLT zucSingle - SUB $2, BP - ONEROUND(0) - ONEROUND(1) - ADD $8, DI - RESTORE_LFSR_2() + CMP $2, BP + BLT zucSingle + SUB $2, BP + ONEROUND(0) + ONEROUND(1) + ADD $8, DI + RESTORE_LFSR_2() zucSingle: - TBZ $0, BP, zucRet - ONEROUND(0) - RESTORE_LFSR_0() + TBZ $0, BP, zucRet + ONEROUND(0) + RESTORE_LFSR_0() zucRet: - SAVE_STATE(0(SI)) - RET + SAVE_STATE(0(SI)) + RET // func genKeyStreamRev32Asm(keyStream []byte, pState *zucState32) TEXT ·genKeyStreamRev32Asm(SB),NOSPLIT,$0 - LOAD_GLOBAL_DATA() - VEOR ZERO.B16, ZERO.B16, ZERO.B16 + LOAD_GLOBAL_DATA() + VEOR ZERO.B16, ZERO.B16, ZERO.B16 - MOVD ks+0(FP), DI - MOVD ks_len+8(FP), BP - MOVD pState+24(FP), SI + MOVD ks+0(FP), DI + MOVD ks_len+8(FP), BP + MOVD pState+24(FP), SI - LSR $2, BP - LOAD_STATE(0(SI)) + LSR $2, BP + LOAD_STATE(0(SI)) zucSixteens: - CMP $16, BP - BLT zucOctet - SUB $16, BP - ROUND_REV32(0) - ROUND_REV32(1) - ROUND_REV32(2) - ROUND_REV32(3) - ROUND_REV32(4) - ROUND_REV32(5) - ROUND_REV32(6) - ROUND_REV32(7) - ROUND_REV32(8) - ROUND_REV32(9) - ROUND_REV32(10) - ROUND_REV32(11) - ROUND_REV32(12) - ROUND_REV32(13) - ROUND_REV32(14) - ROUND_REV32(15) - ADD $4*16, DI - B zucSixteens + CMP $16, BP + BLT zucOctet + SUB $16, BP + ROUND_REV32(0) + ROUND_REV32(1) + ROUND_REV32(2) + ROUND_REV32(3) + ROUND_REV32(4) + ROUND_REV32(5) + ROUND_REV32(6) + ROUND_REV32(7) + ROUND_REV32(8) + ROUND_REV32(9) + ROUND_REV32(10) + ROUND_REV32(11) + ROUND_REV32(12) + ROUND_REV32(13) + ROUND_REV32(14) + ROUND_REV32(15) + ADD $4*16, DI + B zucSixteens zucOctet: - CMP $8, BP - BLT zucNibble - SUB $8, BP - ROUND_REV32(0) - ROUND_REV32(1) - ROUND_REV32(2) - ROUND_REV32(3) - ROUND_REV32(4) - ROUND_REV32(5) - ROUND_REV32(6) - ROUND_REV32(7) - ADD $2*16, DI - RESTORE_LFSR_8() + CMP $8, BP + BLT zucNibble + SUB $8, BP + ROUND_REV32(0) + ROUND_REV32(1) + ROUND_REV32(2) + ROUND_REV32(3) + ROUND_REV32(4) + ROUND_REV32(5) + ROUND_REV32(6) + ROUND_REV32(7) + ADD $2*16, DI + RESTORE_LFSR_8() zucNibble: - CMP $4, BP - BLT zucDouble - SUB $4, BP - ROUND_REV32(0) - ROUND_REV32(1) - ROUND_REV32(2) - ROUND_REV32(3) - ADD $16, DI - RESTORE_LFSR_4() + CMP $4, BP + BLT zucDouble + SUB $4, BP + ROUND_REV32(0) + ROUND_REV32(1) + ROUND_REV32(2) + ROUND_REV32(3) + ADD $16, DI + RESTORE_LFSR_4() zucDouble: - CMP $2, BP - BLT zucSingle - SUB $2, BP - ROUND_REV32(0) - ROUND_REV32(1) - ADD $8, DI - RESTORE_LFSR_2() + CMP $2, BP + BLT zucSingle + SUB $2, BP + ROUND_REV32(0) + ROUND_REV32(1) + ADD $8, DI + 
RESTORE_LFSR_2() zucSingle: - TBZ $0, BP, zucRet - ROUND_REV32(0) - RESTORE_LFSR_0() + TBZ $0, BP, zucRet + ROUND_REV32(0) + RESTORE_LFSR_0() zucRet: - SAVE_STATE(0(SI)) - RET + SAVE_STATE(0(SI)) + RET diff --git a/zuc/eia.go b/zuc/eia.go index 0dad6b4..8018776 100644 --- a/zuc/eia.go +++ b/zuc/eia.go @@ -1,7 +1,5 @@ package zuc -// Just for reference, no performance advantage due to the block size / chunk are 4 bytes only! - import ( "encoding/binary" "fmt" @@ -29,6 +27,7 @@ func NewHash(key, iv []byte) (*ZUC128Mac, error) { ivLen := len(iv) mac := &ZUC128Mac{} mac.tagSize = 4 + switch k { default: return nil, fmt.Errorf("zuc/eia: invalid key size %d, expect 16 in bytes", k) @@ -38,6 +37,7 @@ func NewHash(key, iv []byte) (*ZUC128Mac, error) { } mac.loadKeyIV16(key, iv) } + // initialization for i := 0; i < 32; i++ { mac.bitReorganization() @@ -89,10 +89,10 @@ func (m *ZUC128Mac) Reset() { m.r1 = m.initState.r1 m.r2 = m.initState.r2 copy(m.lfsr[:], m.initState.lfsr[:]) - m.genKeywords(m.k0[:4]) + m.genKeywords(m.k0[:len(m.k0)/2]) } -func (m *ZUC128Mac) block(p []byte) { +func blockGeneric(m *ZUC128Mac, p []byte) { var k64, t64 uint64 t64 = uint64(m.t) << 32 for len(p) >= chunk { @@ -121,14 +121,14 @@ func (m *ZUC128Mac) Write(p []byte) (nn int, err error) { n := copy(m.x[m.nx:], p) m.nx += n if m.nx == chunk { - m.block(m.x[:]) + block(m, m.x[:]) m.nx = 0 } p = p[n:] } if len(p) >= chunk { n := len(p) &^ (chunk - 1) - m.block(p[:n]) + block(m, p[:n]) p = p[n:] } if len(p) > 0 { @@ -139,7 +139,7 @@ func (m *ZUC128Mac) Write(p []byte) (nn int, err error) { func (m *ZUC128Mac) checkSum(additionalBits int, b byte) [4]byte { if m.nx >= chunk { - panic("m.nx >= 16") + panic("m.nx >= chunk") } kIdx := 0 if m.nx > 0 || additionalBits > 0 { @@ -147,7 +147,7 @@ func (m *ZUC128Mac) checkSum(additionalBits int, b byte) [4]byte { t64 = uint64(m.t) << 32 m.x[m.nx] = b nRemainBits := 8*m.nx + additionalBits - if nRemainBits > 64 { + if nRemainBits > 2*32 { m.genKeywords(m.k0[4:6]) } words := (nRemainBits + 31) / 32 diff --git a/zuc/eia_asm.go b/zuc/eia_asm.go new file mode 100644 index 0000000..e084f5f --- /dev/null +++ b/zuc/eia_asm.go @@ -0,0 +1,24 @@ +//go:build (amd64 && !generic) +// +build amd64,!generic + +package zuc + +import "golang.org/x/sys/cpu" + +var supportsGFMUL = cpu.X86.HasPCLMULQDQ || cpu.ARM64.HasPMULL +var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2 + +//go:noescape +func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int) + +func block(m *ZUC128Mac, p []byte) { + if supportsGFMUL { + for len(p) >= chunk { + m.genKeywords(m.k0[4:]) + eia3Round16B(&m.t, &m.k0[0], &p[0], m.tagSize) + p = p[chunk:] + } + } else { + blockGeneric(m, p) + } +} diff --git a/zuc/eia_asm_amd64.s b/zuc/eia_asm_amd64.s new file mode 100644 index 0000000..10433fc --- /dev/null +++ b/zuc/eia_asm_amd64.s @@ -0,0 +1,153 @@ +// Referenced https://github.com/intel/intel-ipsec-mb/ +//go:build amd64 && !generic +// +build amd64,!generic + +#include "textflag.h" + +DATA bit_reverse_table_l<>+0x00(SB)/8, $0x0e060a020c040800 +DATA bit_reverse_table_l<>+0x08(SB)/8, $0x0f070b030d050901 +GLOBL bit_reverse_table_l<>(SB), RODATA, $16 + +DATA bit_reverse_table_h<>+0x00(SB)/8, $0xe060a020c0408000 +DATA bit_reverse_table_h<>+0x08(SB)/8, $0xf070b030d0509010 +GLOBL bit_reverse_table_h<>(SB), RODATA, $16 + +DATA bit_reverse_and_table<>+0x00(SB)/8, $0x0f0f0f0f0f0f0f0f +DATA bit_reverse_and_table<>+0x08(SB)/8, $0x0f0f0f0f0f0f0f0f +GLOBL bit_reverse_and_table<>(SB), RODATA, $16 + +DATA shuf_mask_dw0_0_dw1_0<>+0x00(SB)/8, 
$0xffffffff03020100 +DATA shuf_mask_dw0_0_dw1_0<>+0x08(SB)/8, $0xffffffff07060504 +GLOBL shuf_mask_dw0_0_dw1_0<>(SB), RODATA, $16 + +DATA shuf_mask_dw2_0_dw3_0<>+0x00(SB)/8, $0xffffffff0b0a0908 +DATA shuf_mask_dw2_0_dw3_0<>+0x08(SB)/8, $0xffffffff0f0e0d0c +GLOBL shuf_mask_dw2_0_dw3_0<>(SB), RODATA, $16 + +#define XTMP1 X1 +#define XTMP2 X2 +#define XTMP3 X3 +#define XTMP4 X4 +#define XTMP5 X5 +#define XTMP6 X6 +#define XDATA X7 +#define XDIGEST X8 +#define KS_L X9 +#define KS_M1 X10 +#define KS_M2 X11 +#define KS_H X12 + +// func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int) +TEXT ·eia3Round16B(SB),NOSPLIT,$0 + MOVQ t+0(FP), AX + MOVQ ks+8(FP), BX + MOVQ p+16(FP), CX + MOVQ tagSize+24(FP), DX + + CMPB ·useAVX2(SB), $1 + JE avx2 + + // Reverse data bytes + MOVUPS (0)(CX), XDATA + MOVOU bit_reverse_and_table<>(SB), XTMP4 + MOVOU XDATA, XTMP2 + PAND XTMP4, XTMP2 + + PANDN XDATA, XTMP4 + PSRLQ $4, XTMP4 + + MOVOU bit_reverse_table_h<>(SB), XTMP3 + PSHUFB XTMP2, XTMP3 + + MOVOU bit_reverse_table_l<>(SB), XTMP1 + PSHUFB XTMP4, XTMP1 + + PXOR XTMP1, XTMP3 // XTMP3 - bit reverse data bytes + + // ZUC authentication part, 4x32 data bits + // setup KS + MOVUPS (0*4)(BX), XTMP1 + MOVUPS (2*4)(BX), XTMP2 + PSHUFD $0x61, XTMP1, KS_L // KS bits [63:32 31:0 95:64 63:32] + PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96] + + // setup DATA + MOVOU XTMP3, XTMP1 + PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1 + MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s] + + PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP3 + MOVOU XTMP3, XDIGEST // XDIGEST/XTMP3 - Data bits [95:64 0s 127:96 0s] + + // clmul + // xor the results from 4 32-bit words together + // Calculate lower 32 bits of tag + PCLMULQDQ $0x00, KS_L, XTMP1 + PCLMULQDQ $0x11, KS_L, XTMP2 + PCLMULQDQ $0x00, KS_M1, XDIGEST + PCLMULQDQ $0x11, KS_M1, XTMP3 + + // XOR all products and move 32-bits to lower 32 bits + PXOR XTMP1, XTMP2 + PXOR XTMP3, XDIGEST + PXOR XTMP2, XDIGEST + PSRLDQ $4, XDIGEST + + // Update tag + MOVL XDIGEST, R10 + XORL R10, (AX) + + // Copy last 16 bytes of KS to the front + MOVUPS (4*4)(BX), XTMP1 + MOVUPS XTMP1, (0*4)(BX) + + RET + +avx2: + VMOVDQU (0)(CX), XDATA + + // Reverse data bytes + VMOVDQU bit_reverse_and_table<>(SB), XTMP1 + VPAND XTMP1, XDATA, XTMP2 + VPANDN XDATA, XTMP1, XTMP3 + VPSRLD $4, XTMP3, XTMP3 + + VMOVDQU bit_reverse_table_h<>(SB), XTMP1 + VPSHUFB XTMP2, XTMP1, XTMP4 + VMOVDQU bit_reverse_table_l<>(SB), XTMP1 + VPSHUFB XTMP3, XTMP1, XTMP1 + VPOR XTMP1, XTMP4, XTMP4 + + // ZUC authentication part, 4x32 data bits + // setup KS + VPSHUFD $0x61, (0*4)(BX), KS_L // KS bits [63:32 31:0 95:64 63:32] + VPSHUFD $0x61, (2*4)(BX), KS_M1 // KS bits [63:32 31:0 95:64 63:32] + + // setup DATA + // Data bytes [31:0 0s 63:32 0s] + VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP4, XTMP1 + // Data bytes [95:64 0s 127:96 0s] + VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP4, XTMP2 + + // clmul + // xor the results from 4 32-bit words together + // Calculate lower 32 bits of tag + VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3 + VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4 + VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5 + VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6 + + VPXOR XTMP3, XTMP4, XTMP3 + VPXOR XTMP5, XTMP6, XTMP5 + VPXOR XTMP3, XTMP5, XDIGEST + + VMOVQ XDIGEST, R10 + SHRQ $32, R10 + XORL R10, (AX) + + // Copy last 16 bytes of KS to the front + VMOVDQU (4*4)(BX), XTMP1 + VMOVDQU XTMP1, (0*4)(BX) + + VZEROUPPER + RET diff --git a/zuc/eia_generic.go b/zuc/eia_generic.go new file mode 100644 index 0000000..9899635 --- /dev/null +++ 
b/zuc/eia_generic.go @@ -0,0 +1,8 @@ +//go:build !amd64 || generic +// +build !amd64 generic + +package zuc + +func block(m *ZUC128Mac, p []byte) { + blockGeneric(m, p) +}
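
For reference, the CLMUL-based eia3Round16B computes the same per-word tag contribution as the generic bit-by-bit loop in blockGeneric, just four 32-bit words at a time. Below is a minimal Go sketch of that equivalence; the helper names (tagUpdate32, tagUpdate32CLMUL, clmul64) and the package name are illustrative only and are not part of this patch.

    package eia3sketch // illustrative only, not part of the patch

    import "math/bits"

    // tagUpdate32 shows the scalar EIA3 accumulation that blockGeneric
    // effectively performs: for every set message bit i of the 32-bit word w,
    // XOR the 32-bit keystream window starting at bit offset i into the tag t.
    // ksHi and ksLo are two consecutive keystream words z[j] and z[j+1].
    func tagUpdate32(t, w, ksHi, ksLo uint32) uint32 {
        k64 := uint64(ksHi)<<32 | uint64(ksLo)
        for i := 0; i < 32; i++ {
            if w&(1<<(31-i)) != 0 { // message bit i is the MSB-first bit of w
                t ^= uint32(k64 >> (32 - i))
            }
        }
        return t
    }

    // clmul64 is a carry-less (GF(2)) 64x64 -> 128 bit multiply, (hi, lo).
    func clmul64(a, b uint64) (hi, lo uint64) {
        for i := uint(0); i < 64; i++ {
            if a&(1<<i) != 0 {
                lo ^= b << i
                hi ^= b >> (64 - i) // shift counts >= 64 yield 0 in Go
            }
        }
        return
    }

    // tagUpdate32CLMUL computes the same contribution via the carry-less
    // multiply formulation used by the SIMD path: bit-reverse the data word,
    // multiply it by the 64-bit keystream window, keep bits 63..32 of the
    // product.
    func tagUpdate32CLMUL(t, w, ksHi, ksLo uint32) uint32 {
        k64 := uint64(ksHi)<<32 | uint64(ksLo)
        _, lo := clmul64(uint64(bits.Reverse32(w)), k64)
        return t ^ uint32(lo>>32)
    }

The assembly routine applies the second form with PCLMULQDQ/VPCLMULQDQ after bit-reversing all 16 data bytes at once, XORs the four 32-bit partial results into *t, and then shifts the remaining keystream words to the front of the buffer for the next 16-byte block.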