mirror of https://github.com/emmansun/gmsm.git (synced 2025-04-26 04:06:18 +08:00)

zuc: add comments

This commit is contained in:
parent 5a85197445
commit 4423ede309

zuc/asm_amd64.s | 221
@@ -89,52 +89,55 @@ GLOBL flip_mask<>(SB), RODATA, $16
 SHRL n, b \
 ORL b, a

+// Rotate left 5 bits in each byte, within an XMM register, SSE version.
 #define Rotl_5_SSE(XDATA, XTMP0) \
 MOVOU XDATA, XTMP0 \
-PSLLQ $5, XTMP0 \ // should use pslld
+PSLLL $5, XTMP0 \
-PSRLQ $3, XDATA \ // should use psrld
+PSRLL $3, XDATA \
 PAND Top3_bits_of_the_byte<>(SB), XTMP0 \
 PAND Bottom5_bits_of_the_byte<>(SB), XDATA \
 POR XTMP0, XDATA

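For reference (not part of the commit), the per-byte rotation that Rotl_5_SSE/Rotl_5_AVX implement can be sketched in scalar Go:

package main

import "fmt"

// rotl5Byte rotates each byte of a 16-byte block left by 5 bits, mirroring
// the macro: shift left 5 and keep the top 3 bits, shift right 3 and keep
// the low 5 bits, then OR the two halves together.
func rotl5Byte(block *[16]byte) {
	for i, b := range block {
		hi := (b << 5) & 0xE0 // top 3 bits of the result
		lo := (b >> 3) & 0x1F // low 5 bits of the result
		block[i] = hi | lo
	}
}

func main() {
	b := [16]byte{0x01, 0x80, 0xFF}
	rotl5Byte(&b)
	fmt.Printf("% x\n", b[:3]) // 20 10 ff
}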
+// Compute 16 S0 box values from 16 bytes, SSE version.
 #define S0_comput_SSE(IN_OUT, XTMP1, XTMP2) \
 MOVOU IN_OUT, XTMP1 \
 \
-PAND Low_nibble_mask<>(SB), IN_OUT \
+PAND Low_nibble_mask<>(SB), IN_OUT \ // x2
 \
 PAND High_nibble_mask<>(SB), XTMP1 \
-PSRLQ $4, XTMP1 \
+PSRLQ $4, XTMP1 \ // x1
 \
 MOVOU P1<>(SB), XTMP2 \
-PSHUFB IN_OUT, XTMP2 \
+PSHUFB IN_OUT, XTMP2 \ // P1[x2]
-PXOR XTMP1, XTMP2 \
+PXOR XTMP1, XTMP2 \ // q = x1 ^ P1[x2], XTMP1 free
 \
 MOVOU P2<>(SB), XTMP1 \
-PSHUFB XTMP2, XTMP1 \
+PSHUFB XTMP2, XTMP1 \ // P2[q]
-PXOR IN_OUT, XTMP1 \
+PXOR IN_OUT, XTMP1 \ // r = x2 ^ P2[q]; IN_OUT free
 \
 MOVOU P3<>(SB), IN_OUT \
-PSHUFB XTMP1, IN_OUT \
+PSHUFB XTMP1, IN_OUT \ // P3[r]
-PXOR XTMP2, IN_OUT \
+PXOR XTMP2, IN_OUT \ // s = q ^ P3[r], XTMP2 free
-\
+\ // s << 4 (since high nibble of each byte is 0, no masking is required)
 PSLLQ $4, IN_OUT \
-POR XTMP1, IN_OUT \
+POR XTMP1, IN_OUT \ // t = (s << 4) | r
 Rotl_5_SSE(IN_OUT, XTMP1)

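The comments added above trace the S0 nibble decomposition. A scalar Go sketch of the same data flow, with p1/p2/p3 standing in for the P1<>/P2<>/P3<> tables defined elsewhere in the file (illustration only, not part of the commit):

package zuc

// s0FromNibbles mirrors the data flow of S0_comput_SSE/AVX for a single
// byte. p1, p2 and p3 are the 16-entry nibble tables from the assembly
// file; their contents are not reproduced here.
func s0FromNibbles(b byte, p1, p2, p3 *[16]byte) byte {
	x1 := b >> 4  // high nibble
	x2 := b & 0xF // low nibble
	q := x1 ^ p1[x2]
	r := x2 ^ p2[q]
	s := q ^ p3[r]
	t := s<<4 | r      // high nibble of s is 0, so no masking is required
	return t<<5 | t>>3 // final rotate left by 5 (Rotl_5_SSE)
}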
 // Perform 8x8 matrix multiplication using lookup tables with partial results
-// for high and low nibble of each input byte
+// for high and low nibble of each input byte, SSE version.
 #define MUL_PSHUFB_SSE(XIN, XLO, XHI_OUT, XTMP) \
+\ // Get low nibble of input data
 MOVOU Low_nibble_mask<>(SB), XTMP \
 PAND XIN, XTMP \
-\
+\ // Get low nibble of output
 PSHUFB XTMP, XLO \
-\
+\ // Get high nibble of input data
 MOVOU High_nibble_mask<>(SB), XTMP \
 PAND XIN, XTMP \
 PSRLQ $4, XTMP \
-\
+\ // Get high nibble of output
 PSHUFB XTMP, XHI_OUT \
-\
+\ // XOR high and low nibbles to get full bytes
 PXOR XLO, XHI_OUT

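The PSHUFB trick relies on the linearity of the matrix product over GF(2): M·x equals the XOR of M applied to the low nibble and M applied to the high nibble of x. A Go sketch of the idea (the row-to-bit ordering chosen here is an assumption for illustration, not taken from the source):

package zuc

import "math/bits"

// mulMatrixByte multiplies an 8x8 bit matrix m (one byte per row, row 0
// producing the most significant output bit) by the byte x over GF(2).
func mulMatrixByte(m *[8]byte, x byte) byte {
	var out byte
	for i := 0; i < 8; i++ {
		if bits.OnesCount8(m[i]&x)&1 == 1 {
			out |= 1 << (7 - i)
		}
	}
	return out
}

// buildNibbleTables precomputes the two 16-entry tables that
// MUL_PSHUFB_SSE/AVX index with PSHUFB: lo[n] = M*n and hi[n] = M*(n<<4),
// so that M*x == lo[x&0xF] ^ hi[x>>4] by linearity.
func buildNibbleTables(m *[8]byte) (lo, hi [16]byte) {
	for n := 0; n < 16; n++ {
		lo[n] = mulMatrixByte(m, byte(n))
		hi[n] = mulMatrixByte(m, byte(n)<<4)
	}
	return
}

// mulPSHUFB is the scalar analogue of MUL_PSHUFB_SSE for one byte.
func mulPSHUFB(x byte, lo, hi *[16]byte) byte {
	return lo[x&0xF] ^ hi[x>>4]
}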
 // Compute 16 S1 box values from 16 bytes, stored in XMM register
@@ -150,7 +153,7 @@ GLOBL flip_mask<>(SB), RODATA, $16
 MOVOU Comb_matrix_mul_high_nibble<>(SB), XIN_OUT \
 MUL_PSHUFB_SSE(XTMP2, XTMP1, XIN_OUT, XTMP3)

+// Rotate left 5 bits in each byte, within an XMM register, AVX version.
 #define Rotl_5_AVX(XDATA, XTMP0) \
 VPSLLD $5, XDATA, XTMP0 \
 VPSRLD $3, XDATA, XDATA \
@@ -158,81 +161,106 @@ GLOBL flip_mask<>(SB), RODATA, $16
 VPAND Bottom5_bits_of_the_byte<>(SB), XDATA, XDATA \
 VPOR XTMP0, XDATA, XDATA

+// Compute 16 S0 box values from 16 bytes, AVX version.
 #define S0_comput_AVX(IN_OUT, XTMP1, XTMP2) \
 VPAND High_nibble_mask<>(SB), IN_OUT, XTMP1 \
-VPSRLQ $4, XTMP1, XTMP1 \
+VPSRLQ $4, XTMP1, XTMP1 \ // x1
 \
-VPAND Low_nibble_mask<>(SB), IN_OUT, IN_OUT \
+VPAND Low_nibble_mask<>(SB), IN_OUT, IN_OUT \ // x2
 \
 VMOVDQU P1<>(SB), XTMP2 \
-VPSHUFB IN_OUT, XTMP2, XTMP2 \
+VPSHUFB IN_OUT, XTMP2, XTMP2 \ // P1[x2]
-VPXOR XTMP1, XTMP2, XTMP2 \
+VPXOR XTMP1, XTMP2, XTMP2 \ // q = x1 ^ P1[x2] ; XTMP1 free
 \
 VMOVDQU P2<>(SB), XTMP1 \
-VPSHUFB XTMP2, XTMP1, XTMP1 \
+VPSHUFB XTMP2, XTMP1, XTMP1 \ // P2[q]
-VPXOR IN_OUT, XTMP1, XTMP1 \
+VPXOR IN_OUT, XTMP1, XTMP1 \ // r = x2 ^ P2[q] ; IN_OUT free
 \
 VMOVDQU P3<>(SB), IN_OUT \
-VPSHUFB XTMP1, IN_OUT, IN_OUT \
+VPSHUFB XTMP1, IN_OUT, IN_OUT \ // P3[r]
-VPXOR XTMP2, IN_OUT, IN_OUT \
+VPXOR XTMP2, IN_OUT, IN_OUT \ // s = q ^ P3[r] ; XTMP2 free
-\
+\ // s << 4 (since high nibble of each byte is 0, no masking is required)
 VPSLLQ $4, IN_OUT, IN_OUT \
-VPOR XTMP1, IN_OUT, IN_OUT \
+VPOR XTMP1, IN_OUT, IN_OUT \ // t = (s << 4) | r
 Rotl_5_AVX(IN_OUT, XTMP1)

 // Perform 8x8 matrix multiplication using lookup tables with partial results
-// for high and low nibble of each input byte
+// for high and low nibble of each input byte, AVX version.
 #define MUL_PSHUFB_AVX(XIN, XLO, XHI_OUT, XTMP) \
+\ // Get low nibble of input data
 VPAND Low_nibble_mask<>(SB), XIN, XTMP \
+\ // Get low nibble of output
 VPSHUFB XTMP, XLO, XLO \
+\ // Get high nibble of input data
 VPAND High_nibble_mask<>(SB), XIN, XTMP \
 VPSRLQ $4, XTMP, XTMP \
+\ // Get high nibble of output
 VPSHUFB XTMP, XHI_OUT, XHI_OUT \
+\ // XOR high and low nibbles to get full bytes
 VPXOR XLO, XHI_OUT, XHI_OUT

 // Compute 16 S1 box values from 16 bytes, stored in XMM register
 #define S1_comput_AVX(XIN_OUT, XTMP1, XTMP2, XTMP3) \
+\ // gf2p8affineqb XIN_OUT, [rel Aes_to_Zuc], 0x00
 VMOVDQU Aes_to_Zuc_mul_low_nibble<>(SB), XTMP1 \
 VMOVDQU Aes_to_Zuc_mul_high_nibble<>(SB), XTMP2 \
 MUL_PSHUFB_AVX(XIN_OUT, XTMP1, XTMP2, XTMP3) \
+\
 VPSHUFB Shuf_mask<>(SB), XTMP2, XTMP2 \
 VAESENCLAST Cancel_aes<>(SB), XTMP2, XTMP2 \
+\ // gf2p8affineqb XIN_OUT, [rel CombMatrix], 0x55
 VMOVDQU Comb_matrix_mul_low_nibble<>(SB), XTMP1 \
 VMOVDQU Comb_matrix_mul_high_nibble<>(SB), XIN_OUT \
 MUL_PSHUFB_AVX(XTMP2, XTMP1, XIN_OUT, XTMP3)

+#define F_R1 R10
+#define F_R2 R11
+#define BRC_X0 R12
+#define BRC_X1 R13
+#define BRC_X2 R14
+#define BRC_X3 R15

 // BITS_REORG(idx)
+//
 // params
 // %1 - round number
 // uses
 // AX, BX, CX, DX
 // return
-// R12, R13, R14, R15
+// updates R12, R13, R14, R15
+//
 #define BITS_REORG(idx) \
-MOVL (((15 + idx) % 16)*4)(SI), R12 \
+MOVL (((15 + idx) % 16)*4)(SI), BRC_X0 \
 MOVL (((14 + idx) % 16)*4)(SI), AX \
-MOVL (((11 + idx) % 16)*4)(SI), R13 \
+MOVL (((11 + idx) % 16)*4)(SI), BRC_X1 \
 MOVL (((9 + idx) % 16)*4)(SI), BX \
-MOVL (((7 + idx) % 16)*4)(SI), R14 \
+MOVL (((7 + idx) % 16)*4)(SI), BRC_X2 \
 MOVL (((5 + idx) % 16)*4)(SI), CX \
-MOVL (((2 + idx) % 16)*4)(SI), R15 \
+MOVL (((2 + idx) % 16)*4)(SI), BRC_X3 \
 MOVL (((0 + idx) % 16)*4)(SI), DX \
-SHRL $15, R12 \
+SHRL $15, BRC_X0 \
 SHLL $16, AX \
 SHLL $1, BX \
 SHLL $1, CX \
 SHLL $1, DX \
-SHLDL(R12, AX, $16) \
+SHLDL(BRC_X0, AX, $16) \
-SHLDL(R13, BX, $16) \
+SHLDL(BRC_X1, BX, $16) \
-SHLDL(R14, CX, $16) \
+SHLDL(BRC_X2, CX, $16) \
-SHLDL(R15, DX, $16)
+SHLDL(BRC_X3, DX, $16)

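A scalar Go sketch of what BITS_REORG computes, shown here for idx = 0 and assuming 31-bit LFSR cells as in the ZUC specification (not part of the commit):

package zuc

// bitsReorg concatenates 16-bit halves of the tapped LFSR cells into the
// four bit-reorganization outputs, as BITS_REORG does with SHRL/SHLL/SHLDL.
func bitsReorg(lfsr *[16]uint32) (x0, x1, x2, x3 uint32) {
	x0 = (lfsr[15]>>15)<<16 | lfsr[14]&0xFFFF // s15H || s14L
	x1 = lfsr[11]<<16 | lfsr[9]>>15           // s11L || s9H
	x2 = lfsr[7]<<16 | lfsr[5]>>15            // s7L || s5H
	x3 = lfsr[2]<<16 | lfsr[0]>>15            // s2L || s0H
	return
}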
+// LFSR_UPDT calculates the next state word and places/overwrites it to lfsr[idx % 16]
+//
+// params
+// %1 - round number
+// uses
+// AX as input (ZERO or W), BX, CX, DX, R8, R9
 #define LFSR_UPDT(idx) \
 MOVL (((0 + idx) % 16)*4)(SI), BX \
 MOVL (((4 + idx) % 16)*4)(SI), CX \
 MOVL (((10 + idx) % 16)*4)(SI), DX \
 MOVL (((13 + idx) % 16)*4)(SI), R8 \
 MOVL (((15 + idx) % 16)*4)(SI), R9 \
+\ // Calculate 64-bit LFSR feedback
 ADDQ BX, AX \
 SHLQ $8, BX \
 SHLQ $20, CX \
@@ -244,7 +272,7 @@ GLOBL flip_mask<>(SB), RODATA, $16
 ADDQ DX, AX \
 ADDQ R8, AX \
 ADDQ R9, AX \
-\
+\ // Reduce it to 31-bit value
 MOVQ AX, BX \
 ANDQ $0x7FFFFFFF, AX \
 SHRQ $31, BX \
@@ -253,21 +281,21 @@ GLOBL flip_mask<>(SB), RODATA, $16
 MOVQ AX, BX \
 SUBQ $0x7FFFFFFF, AX \
 CMOVQCS BX, AX \
-\
+\ // LFSR_S16 = (LFSR_S15++) = AX
 MOVL AX, (((0 + idx) % 16)*4)(SI)

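For reference, the feedback accumulation and 31-bit reduction performed by LFSR_UPDT can be sketched in Go as follows; w is the extra input carried in AX (zero in work mode, a value derived from the output of F during initialisation), and the caller stores the result back into lfsr[idx % 16]:

package zuc

// lfsrNext sketches the data flow of LFSR_UPDT(idx): a 64-bit sum of the
// tapped cells, each weighted by a power of two, reduced modulo 2^31 - 1.
func lfsrNext(lfsr *[16]uint32, idx int, w uint64) uint32 {
	s := func(i int) uint64 { return uint64(lfsr[(i+idx)%16]) }
	v := w + s(0) + s(0)<<8 + s(4)<<20 + s(10)<<21 + s(13)<<17 + s(15)<<15

	// Fold the 64-bit sum into a 31-bit residue modulo 2^31 - 1,
	// then do one conditional subtraction, as the assembly does.
	v = (v & 0x7FFFFFFF) + (v >> 31)
	if v >= 0x7FFFFFFF {
		v -= 0x7FFFFFFF
	}
	return uint32(v)
}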
 #define NONLIN_FUN() \
-MOVL R12, AX \
+MOVL BRC_X0, AX \
-XORL R10, AX \
+XORL F_R1, AX \ // F_R1 xor BRC_X0
-ADDL R11, AX \
+ADDL F_R2, AX \ // W = (F_R1 xor BRC_X0) + F_R2
-ADDL R13, R10 \ // W1= F_R1 + BRC_X1
+ADDL BRC_X1, F_R1 \ // W1= F_R1 + BRC_X1
-XORL R14, R11 \ // W2= F_R2 ^ BRC_X2
+XORL BRC_X2, F_R2 \ // W2= F_R2 ^ BRC_X2
 \
-MOVL R10, DX \
+MOVL F_R1, DX \
-MOVL R11, CX \
+MOVL F_R2, CX \
 SHLDL(DX, CX, $16) \ // P = (W1 << 16) | (W2 >> 16)
-SHLDL(R11, R10, $16) \ // Q = (W2 << 16) | (W1 >> 16)
+SHLDL(F_R2, F_R1, $16) \ // Q = (W2 << 16) | (W1 >> 16)
-MOVL DX, BX \
+MOVL DX, BX \ // start L1
 MOVL DX, CX \
 MOVL DX, R8 \
 MOVL DX, R9 \
@@ -279,21 +307,28 @@ GLOBL flip_mask<>(SB), RODATA, $16
 XORL CX, DX \
 XORL R8, DX \
 XORL R9, DX \ // U = L1(P) = EDX, hi(RDX)=0
-MOVL R11, BX \
+MOVL F_R2, BX \
-MOVL R11, CX \
+MOVL F_R2, CX \
-MOVL R11, R8 \
+MOVL F_R2, R8 \
-MOVL R11, R9 \
+MOVL F_R2, R9 \
 ROLL $8, BX \
 ROLL $14, CX \
 ROLL $22, R8 \
 ROLL $30, R9 \
-XORL BX, R11 \
+XORL BX, F_R2 \
-XORL CX, R11 \
+XORL CX, F_R2 \
-XORL R8, R11 \
+XORL R8, F_R2 \
-XORL R9, R11 \ // V = L2(Q) = R11D, hi(R11)=0
+XORL R9, F_R2 \ // V = L2(Q) = R11D, hi(R11)=0
-SHLQ $32, R11 \
+SHLQ $32, F_R2 \ // DX = V || U
-XORQ R11, DX
+XORQ F_R2, DX

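The added comments spell out W, W1, W2, P and Q. In scalar Go the whole nonlinear function F, including the S-box step that NONLIN_FUN_SSE/AVX perform afterwards, can be sketched as below; sbox32 is a stand-in for the byte-wise S0/S1 substitution, which is not reproduced here:

package zuc

import "math/bits"

// l1 and l2 are the ZUC linear transforms applied to P and Q.
func l1(x uint32) uint32 {
	return x ^ bits.RotateLeft32(x, 2) ^ bits.RotateLeft32(x, 10) ^
		bits.RotateLeft32(x, 18) ^ bits.RotateLeft32(x, 24)
}

func l2(x uint32) uint32 {
	return x ^ bits.RotateLeft32(x, 8) ^ bits.RotateLeft32(x, 14) ^
		bits.RotateLeft32(x, 22) ^ bits.RotateLeft32(x, 30)
}

// f is a scalar sketch of NONLIN_FUN followed by the S-box application.
func f(x0, x1, x2 uint32, r1, r2 *uint32, sbox32 func(uint32) uint32) uint32 {
	w := (x0 ^ *r1) + *r2 // W, the keystream contribution
	w1 := *r1 + x1        // W1
	w2 := *r2 ^ x2        // W2
	p := w1<<16 | w2>>16  // P = (W1 << 16) | (W2 >> 16)
	q := w2<<16 | w1>>16  // Q = (W2 << 16) | (W1 >> 16)
	*r1 = sbox32(l1(p))
	*r2 = sbox32(l2(q))
	return w
}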
+// Non-Linear function F, SSE version.
+// uses
+// AX, BX, CX, DX, R8, R9
+// X0, X1, X2, X3, X4
+// return
+// W in AX
+// updated F_R1, F_R2
 #define NONLIN_FUN_SSE() \
 NONLIN_FUN() \
 MOVQ DX, X0 \
@@ -305,16 +340,17 @@ GLOBL flip_mask<>(SB), RODATA, $16
 PAND mask_S0<>(SB), X1 \
 PXOR X1, X0 \
 \
-MOVL X0, R10 \ // F_R1
+MOVL X0, F_R1 \ // F_R1
-PEXTRD $1, X0, R11
+PEXTRD $1, X0, F_R2

+// RESTORE_LFSR_0, appends the first 4 bytes to last.
 #define RESTORE_LFSR_0() \
-MOVL (0*4)(SI), AX \
+MOVL (0*4)(SI), AX \ // first 4-bytes
 MOVUPS (4)(SI), X0 \
 MOVUPS (20)(SI), X1 \
 MOVUPS (36)(SI), X2 \
 MOVQ (52)(SI), BX \
-MOVL (60)(SI), CX \
+MOVL (60)(SI), CX \ // last 4-bytes
 \
 MOVUPS X0, (SI) \
 MOVUPS X1, (16)(SI) \
@@ -323,12 +359,13 @@ GLOBL flip_mask<>(SB), RODATA, $16
 MOVL CX, (56)(SI) \
 MOVL AX, (60)(SI)

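The RESTORE_LFSR_* macros realise the "appends the first N bytes to last" comments: they rotate the 16-word LFSR buffer left by N/4 words so the implicit idx offset used by BITS_REORG/LFSR_UPDT can start again at zero. A Go sketch of the effect (not part of the commit):

package zuc

// rotateLFSRWords moves the first n words of the LFSR buffer to the end,
// i.e. rotates the buffer left by n 32-bit words.
func rotateLFSRWords(lfsr *[16]uint32, n int) {
	var tmp [16]uint32
	copy(tmp[:], lfsr[n:])     // tail moves to the front
	copy(tmp[16-n:], lfsr[:n]) // first n words appended at the end
	*lfsr = tmp
}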
+// RESTORE_LFSR_2, appends the first 8 bytes to last.
 #define RESTORE_LFSR_2() \
-MOVQ (0)(SI), AX \
+MOVQ (0)(SI), AX \ // first 8-bytes
 MOVUPS (8)(SI), X0 \
 MOVUPS (24)(SI), X1 \
 MOVUPS (40)(SI), X2 \
-MOVQ (56)(SI), BX \
+MOVQ (56)(SI), BX \ // last 8-bytes
 \
 MOVUPS X0, (SI) \
 MOVUPS X1, (16)(SI) \
@@ -336,17 +373,19 @@ GLOBL flip_mask<>(SB), RODATA, $16
 MOVQ BX, (48)(SI) \
 MOVQ AX, (56)(SI)

+// RESTORE_LFSR_4, appends the first 16 bytes to last.
 #define RESTORE_LFSR_4() \
-MOVUPS (0)(SI), X0 \
+MOVUPS (0)(SI), X0 \ // first 16 bytes
 MOVUPS (16)(SI), X1 \
 MOVUPS (32)(SI), X2 \
-MOVUPS (48)(SI), X3 \
+MOVUPS (48)(SI), X3 \ // last 16 bytes
 \
 MOVUPS X1, (0)(SI) \
 MOVUPS X2, (16)(SI) \
 MOVUPS X3, (32)(SI) \
 MOVUPS X0, (48)(SI)

+// RESTORE_LFSR_8, appends the first 32 bytes to last.
 #define RESTORE_LFSR_8() \
 MOVUPS (0)(SI), X0 \
 MOVUPS (16)(SI), X1 \
@@ -358,6 +397,13 @@ GLOBL flip_mask<>(SB), RODATA, $16
 MOVUPS X0, (32)(SI) \
 MOVUPS X1, (48)(SI)

+// Non-Linear function F, AVX version.
+// uses
+// AX, BX, CX, DX, R8, R9
+// X0, X1, X2, X3, X4
+// return
+// W in AX
+// updated F_R1, F_R2
 #define NONLIN_FUN_AVX() \
 NONLIN_FUN() \
 VMOVQ DX, X0 \
@@ -373,20 +419,20 @@ GLOBL flip_mask<>(SB), RODATA, $16
 VPEXTRD $1, X0, R11

 #define LOAD_STATE() \
-MOVL OFFSET_FR1(SI), R10 \
+MOVL OFFSET_FR1(SI), F_R1 \
-MOVL OFFSET_FR2(SI), R11 \
+MOVL OFFSET_FR2(SI), F_R2 \
-MOVL OFFSET_BRC_X0(SI), R12 \
+MOVL OFFSET_BRC_X0(SI), BRC_X0 \
-MOVL OFFSET_BRC_X1(SI), R13 \
+MOVL OFFSET_BRC_X1(SI), BRC_X1 \
-MOVL OFFSET_BRC_X2(SI), R14 \
+MOVL OFFSET_BRC_X2(SI), BRC_X2 \
-MOVL OFFSET_BRC_X3(SI), R15
+MOVL OFFSET_BRC_X3(SI), BRC_X3

 #define SAVE_STATE() \
-MOVL R10, OFFSET_FR1(SI) \
+MOVL F_R1, OFFSET_FR1(SI) \
-MOVL R11, OFFSET_FR2(SI) \
+MOVL F_R2, OFFSET_FR2(SI) \
-MOVL R12, OFFSET_BRC_X0(SI) \
+MOVL BRC_X0, OFFSET_BRC_X0(SI) \
-MOVL R13, OFFSET_BRC_X1(SI) \
+MOVL BRC_X1, OFFSET_BRC_X1(SI) \
-MOVL R14, OFFSET_BRC_X2(SI) \
+MOVL BRC_X2, OFFSET_BRC_X2(SI) \
-MOVL R15, OFFSET_BRC_X3(SI)
+MOVL BRC_X3, OFFSET_BRC_X3(SI)

 // func genKeywordAsm(s *zucState32) uint32
 TEXT ·genKeywordAsm(SB),NOSPLIT,$0
@@ -401,10 +447,14 @@ TEXT ·genKeywordAsm(SB),NOSPLIT,$0
 sse:
 NONLIN_FUN_SSE()

-XORL R15, AX
+// (BRC_X3 xor W) as result
+XORL BRC_X3, AX
 MOVL AX, ret+8(FP)

+// LFSRWithWorkMode
 XORQ AX, AX
 LFSR_UPDT(0)

 SAVE_STATE()
 RESTORE_LFSR_0()

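Putting the annotated pieces together, the per-keyword flow of genKeywordAsm corresponds to the following Go sketch, reusing the helper sketches above (illustration only, not part of the commit):

package zuc

// genKeywordSketch outlines one keystream word: bit reorganization, the
// nonlinear function F, the (BRC_X3 xor W) output, then one work-mode LFSR
// step followed by the physical rotation done by RESTORE_LFSR_0.
func genKeywordSketch(lfsr *[16]uint32, r1, r2 *uint32, sbox32 func(uint32) uint32) uint32 {
	x0, x1, x2, x3 := bitsReorg(lfsr)  // BITS_REORG(0)
	w := f(x0, x1, x2, r1, r2, sbox32) // NONLIN_FUN_SSE / NONLIN_FUN_AVX
	z := x3 ^ w                        // (BRC_X3 xor W) as result
	lfsr[0] = lfsrNext(lfsr, 0, 0)     // LFSRWithWorkMode: LFSR_UPDT(0) with AX = 0
	rotateLFSRWords(lfsr, 1)           // RESTORE_LFSR_0
	return z
}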
@@ -413,14 +463,17 @@ sse:
 avx:
 NONLIN_FUN_AVX()

-XORL R15, AX
+// (BRC_X3 xor W) as result
+XORL BRC_X3, AX
 MOVL AX, ret+8(FP)

+// LFSRWithWorkMode
 XORQ AX, AX
 LFSR_UPDT(0)

 SAVE_STATE()
 RESTORE_LFSR_0()

-VZEROUPPER
 RET

 #define ROUND_SSE(idx) \
@@ -594,7 +647,6 @@ avxZucSingle:
 RESTORE_LFSR_0()
 avxZucRet:
 SAVE_STATE()
-VZEROUPPER
 RET

 // func genKeyStreamRev32Asm(keyStream []byte, pState *zucState32)
@@ -736,5 +788,4 @@ avxZucSingle:
 RESTORE_LFSR_0()
 avxZucRet:
 SAVE_STATE()
-VZEROUPPER
 RET
zuc/core.go | 12
@@ -79,12 +79,12 @@ var zuc256_d = [3][16]byte{

 type zucState32 struct {
 lfsr [16]uint32 // linear feedback shift register
-r1 uint32
+r1 uint32 // register of F
-r2 uint32
+r2 uint32 // register of F
 x0 uint32 // Output X0 of the bit reorganization
 x1 uint32 // Output X1 of the bit reorganization
 x2 uint32 // Output X2 of the bit reorganization
 x3 uint32 // Output X3 of the bit reorganization
 }

 func (s *zucState32) bitReorganization() {
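A small sketch (not part of the commit) of how the OFFSET_FR1/OFFSET_FR2/OFFSET_BRC_Xn constants used by LOAD_STATE and SAVE_STATE in the assembly are expected to line up with this struct's field layout; printStateOffsets is a hypothetical helper using unsafe.Offsetof to check that correspondence:

package zuc

import (
	"fmt"
	"unsafe"
)

// printStateOffsets prints the byte offsets of the zucState32 fields that
// the assembly accesses through SI plus a fixed offset.
func printStateOffsets() {
	var s zucState32
	fmt.Println("r1:", unsafe.Offsetof(s.r1)) // expected OFFSET_FR1
	fmt.Println("r2:", unsafe.Offsetof(s.r2)) // expected OFFSET_FR2
	fmt.Println("x0:", unsafe.Offsetof(s.x0)) // expected OFFSET_BRC_X0
	fmt.Println("x1:", unsafe.Offsetof(s.x1)) // expected OFFSET_BRC_X1
	fmt.Println("x2:", unsafe.Offsetof(s.x2)) // expected OFFSET_BRC_X2
	fmt.Println("x3:", unsafe.Offsetof(s.x3)) // expected OFFSET_BRC_X3
}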
@@ -8,11 +8,15 @@ import (
 )

 var supportsAES = cpu.X86.HasAES || cpu.ARM64.HasAES
-var useAVX = cpu.X86.HasAVX
+var useAVX = false //cpu.X86.HasAVX

+// Generate single keyword, 4 bytes.
+//
 //go:noescape
 func genKeywordAsm(s *zucState32) uint32

+// Generate multiple keywords, n*4 bytes.
+//
 //go:noescape
 func genKeyStreamAsm(keyStream []uint32, pState *zucState32)

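A hypothetical usage sketch of the two declarations (not part of the commit): generating n keywords one at a time with genKeywordAsm should produce the same words that a single genKeyStreamAsm call fills in over the same state.

package zuc

// keystreamWords fills a buffer one keyword at a time.
func keystreamWords(s *zucState32, n int) []uint32 {
	ks := make([]uint32, n)
	for i := range ks {
		ks[i] = genKeywordAsm(s) // 4 bytes of keystream per call
	}
	// genKeyStreamAsm(ks, s) would fill ks in one call instead.
	return ks
}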