Efficient Software Implementations of ZUC - gmsm

mirror of https://github.com/emmansun/gmsm.git synced 2025-10-13 23:00:47 +08:00

Table of Contents

主要涉及：
S1 Sbox生成
从AES S盒计算ZUC S1
计算查找表
AMD64 SHLD SHRD的性能
参考：

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

主要涉及：

Keystream generator

SIMD + AES-NI

Carryless multiplication, 无进位乘法, Use instruction PCLMULQDQ (AMD64)

Multi-Buffer, 多路并行

S1 Sbox生成

改编自AES 和 SM4 的 S 盒生成方法简介

from pyfinite import ffield

# Generator polynomial handed to FField for the 8-bit field (0b110001011 == 0x18B).
gen = 0b110001011
F = ffield.FField(8, gen, useLUT=0) # useLUT=0 is required here, otherwise things go wrong...

# Eight bytes that zuc_sbox_gen XORs together, one per bit of the inverted input (A[0] pairs with the most significant bit).
A = [0b01110111, 0b10111011, 0b11011101, 0b11101110, 0b11001011, 0b01101101, 0b00111110, 0b10010111]

def zuc_sbox_gen(x):
    '''
    Return S(x) for the input byte x.

    x is inverted with F.Inverse; every set bit of the inverse (most
    significant bit first) selects the matching entry of A, the selected
    entries are XORed together, and the constant 0x55 is XORed in.
    '''
    inverse = F.Inverse(x)
    acc = 0x55  # start from the additive constant and fold the columns in
    for idx, column in enumerate(A):
        if (inverse >> (7 - idx)) & 1:
            acc ^= column  # bit is set: XOR in the corresponding column
    return acc

def print_table(table):
    '''
    Print every entry as '0xNN ,' (upper-case hex), starting a new line
    after each group of 16 entries.
    '''
    for count, value in enumerate(table, start=1):
        print('0x%02X' % value, ',', end='')
        if count % 16 == 0:
            print()

# Entry 0 is fixed to 0x55 instead of being computed by zuc_sbox_gen;
# every other entry comes from zuc_sbox_gen.
sbox = [0x55]
for i in range(1, 256):
    sbox.append(zuc_sbox_gen(i))  # generate the S-box entry for input i

print_table(sbox)

从AES S盒计算ZUC S1

参考aes和sm4s盒复合域实现方法的做法：
$S_{zuc}(x)=L(S_{aes}(Mx))+C$，下面我们尝试进行推导 L, M, C
假设复合域求逆运算为 $f$，则:
$$S_{aes}(x)=A_{aes}X_{aes}f(X^{-1}_{aes}x) + 0x63$$

$$S_{zuc}(x)=A_{zuc}X_{zuc}f(X^{-1}_{zuc}x) + 0x55$$

得到
$$L=A_{zuc}X_{zuc}X^{-1}_{aes}A^{-1}_{aes}$$

$$M=X_{aes}X^{-1}_{zuc}$$

$$C=L\cdot 0x63+0x55$$

只有这种S盒和AES盒构造完全同构的，用AESENCLAST时，使用0x63作为enc_key才有点意义。

from pyfinite import ffield
from pyfinite import genericmatrix

def XOR(x, y):
    '''
    Bitwise XOR of x and y; passed to GenericMatrix as both the add and
    the sub callback.
    '''
    return x ^ y


def AND(x, y):
    '''
    Bitwise AND of x and y; passed to GenericMatrix as the mul callback.
    '''
    return x & y


def DIV(x, y):
    '''
    Division callback passed to GenericMatrix: returns x unchanged and
    ignores y.
    '''
    return x

def aes_f():
    '''
    Build the 8-bit FField with generator polynomial 0b100011011 (0x11B),
    with the lookup table disabled (useLUT=0).
    '''
    return ffield.FField(8, 0b100011011, useLUT=0)

def zuc_f():
    '''
    Build the 8-bit FField with generator polynomial 0b110001011 (0x18B),
    with the lookup table disabled (useLUT=0).
    '''
    return ffield.FField(8, 0b110001011, useLUT=0)

# Module-level field instances used by the S-box and matrix generators below.
aesf = aes_f()
zucf = zuc_f()

def field_pow2(x, F):
    '''
    Return x squared, using the multiplication of field F.
    '''
    squared = F.Multiply(x, x)
    return squared

def field_pow3(x, F):
    '''
    Return x cubed in field F, computed as x * (x squared).
    '''
    squared = field_pow2(x, F)
    return F.Multiply(x, squared)

def field_pow4(x, F):
    '''
    Return x to the 4th power in field F by squaring twice.
    '''
    result = x
    for _ in range(2):
        result = field_pow2(result, F)
    return result

def field_pow16(x, F):
    '''
    Return x to the 16th power in field F by applying field_pow4 twice.
    '''
    fourth = field_pow4(x, F)
    return field_pow4(fourth, F)

def get_all_WZY(F):
    '''
    Exhaustively search field F for every (W, Z, Y) triple such that

        W^2 + W + 1   == 0
        Z^2 + Z + W^2 == 0
        Y^2 + Y + u   == 0,  with u = N^2 * Z and N = W^2

    Returns a list of [W, W^2, Z, Z^4, Y, Y^16] entries, in search order.
    '''
    found = []
    for W in range(256):
        if field_pow2(W, F) ^ W ^ 1:
            continue  # W is not a root of x^2 + x + 1
        W_2 = field_pow2(W, F)
        N = W_2
        for Z in range(256):
            if field_pow2(Z, F) ^ Z ^ W_2:
                continue  # Z is not a root of x^2 + x + W^2
            Z_4 = field_pow4(Z, F)
            u = F.Multiply(field_pow2(N, F), Z)
            for Y in range(256):
                if field_pow2(Y, F) ^ Y ^ u:
                    continue  # Y is not a root of x^2 + x + u
                found.append([W, W_2, Z, Z_4, Y, field_pow16(Y, F)])
    return found

def gen_X(F, W, W_2, Z, Z_4, Y, Y_16):
    '''
    Return the eight products (w * z) * y computed in field F, ordered as

        [W^2 Z^4 Y^16, W Z^4 Y^16, W^2 Z Y^16, W Z Y^16,
         W^2 Z^4 Y,    W Z^4 Y,    W^2 Z Y,    W Z Y]
    '''
    return [
        F.Multiply(F.Multiply(w, z), y)
        for y in (Y_16, Y)
        for z in (Z_4, Z)
        for w in (W_2, W)
    ]

def to_matrix(x):
    '''
    Build an 8x8 GenericMatrix over {0, 1} whose column j holds the bits of
    x[j], with the most significant bit in row 0.
    '''
    m = genericmatrix.GenericMatrix(size=(8,8), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for row in range(8):
        shift = 7 - row  # row 0 takes bit 7, row 7 takes bit 0
        m.SetRow(row, [(x[col] >> shift) & 1 for col in range(8)])
    return m

def matrix_col_byte(c):
    '''
    Pack the eight entries of column c into one byte; c[0] becomes the most
    significant bit.
    '''
    value = 0
    for pos in range(8):
        value ^= c[pos] << (7 - pos)
    return value

def matrix_row_byte(c):
    '''
    Pack the eight entries of row c into one byte; c[0] becomes the most
    significant bit.
    '''
    acc = c[7]
    for shift in range(1, 8):
        acc ^= c[7 - shift] << shift
    return acc

def matrix_cols(m):
    '''
    Return the eight columns of m, each packed into a byte by
    matrix_col_byte.
    '''
    return [matrix_col_byte(m.GetColumn(idx)) for idx in range(8)]

def matrix_rows(m):
    '''
    Return the eight rows of m, each packed into a byte by matrix_row_byte.
    '''
    return [matrix_row_byte(m.GetRow(idx)) for idx in range(8)]

def gen_X_inv(x):
    '''
    Interpret the byte list x as matrix columns (see to_matrix), invert the
    matrix, and return the inverse as a list of column bytes.
    '''
    inverse = to_matrix(x).Inverse()
    return matrix_cols(inverse)

def G4_mul(x, y):
    '''
    GF(2^2) multiplication; operands are 2-bit values in the normal basis
    {W^2, W}.
    '''
    x_hi, x_lo = (x >> 1) & 1, x & 1
    y_hi, y_lo = (y >> 1) & 1, y & 1
    # The cross term is shared by both output bits.
    cross = (x_hi ^ x_lo) & (y_hi ^ y_lo)
    hi = (x_hi & y_hi) ^ cross
    lo = (x_lo & y_lo) ^ cross
    return (hi << 1) | lo

def G4_mul_N(x):
    '''
    GF(2^2) multiplication by N, in the normal basis {W^2, W}, with N = W^2.
    '''
    hi = (x >> 1) & 1
    lo = x & 1
    return (lo << 1) | (hi ^ lo)

def G4_mul_N2(x):
    '''
    GF(2^2) multiplication by N^2, in the normal basis {W^2, W}, with N = W^2.
    '''
    hi = (x >> 1) & 1
    lo = x & 1
    mixed = hi ^ lo
    return (mixed << 1) | hi

def G4_inv(x):
    '''
    GF(2^2) inverse operator: swaps the two bits of x.
    '''
    return ((x & 1) << 1) | ((x >> 1) & 1)

def G16_mul(x, y):
    '''
    GF(2^4) multiplication; operands are 4-bit values in the normal basis
    {Z^4, Z}, each half being a GF(2^2) element.
    '''
    x_hi, x_lo = (x >> 2) & 0x03, x & 0x03
    y_hi, y_lo = (y >> 2) & 0x03, y & 0x03
    # Shared cross term, scaled by N.
    cross = G4_mul_N(G4_mul(x_hi ^ x_lo, y_hi ^ y_lo))
    hi = G4_mul(x_hi, y_hi) ^ cross
    lo = G4_mul(x_lo, y_lo) ^ cross
    return (hi << 2) | lo

def G16_sq_mul_u(x):
    '''
    GF(2^4) operator x -> x^2 * u, with u = N^2 Z and N = W^2.
    '''
    hi = (x >> 2) & 0x03
    lo = x & 0x03
    out_hi = G4_inv(hi ^ lo)
    out_lo = G4_mul_N2(G4_inv(lo))
    return (out_hi << 2) | out_lo

def G16_inv(x):
    '''
    GF(2^4) inverse operator, built from GF(2^2) operations on the two
    halves of x.
    '''
    hi = (x >> 2) & 0x03
    lo = x & 0x03
    scaled = G4_mul_N(G4_inv(hi ^ lo))
    product = G4_mul(hi, lo)
    common = G4_inv(scaled ^ product)
    # The halves are crossed: the low half feeds the high output and vice versa.
    out_hi = G4_mul(common, lo)
    out_lo = G4_mul(common, hi)
    return (out_hi << 2) | out_lo

def G256_inv(x):
    '''
    GF(2^8) inverse operator, built from GF(2^4) operations on the two
    nibbles of x.
    '''
    hi = (x >> 4) & 0x0f
    lo = x & 0x0f
    scaled = G16_sq_mul_u(hi ^ lo)
    product = G16_mul(hi, lo)
    common = G16_inv(scaled ^ product)
    # The nibbles are crossed: the low nibble feeds the high output and vice versa.
    out_hi = G16_mul(common, lo)
    out_lo = G16_mul(common, hi)
    return (out_hi << 4) | out_lo

def G256_new_basis(x, b):
    '''
    Representation of x under the new basis b: XOR together the b[i] whose
    matching bit of x is set (b[0] pairs with the most significant bit).
    '''
    result = 0
    for idx in range(8):
        if (x >> (7 - idx)) & 1:
            result ^= b[idx]
    return result

# Bytes XORed together by G256_new_basis in AES_SBOX; to_matrix treats them as the columns of the matrix.
AES_A = [0b10001111, 0b11000111, 0b11100011, 0b11110001, 0b11111000, 0b01111100, 0b00111110, 0b00011111]
# Bits of the constant 0x63, most significant bit first.
AES_C = [0, 1, 1, 0, 0, 0, 1, 1]

def AES_SBOX(X, X_inv):
    '''
    Build the 256-entry table: map each input through X_inv, invert it with
    G256_inv, map back through X, apply AES_A and XOR with 0x63.
    '''
    def substitute(value):
        composite = G256_new_basis(value, X_inv)
        inverted = G256_inv(composite)
        standard = G256_new_basis(inverted, X)
        return G256_new_basis(standard, AES_A) ^ 0x63

    return [substitute(value) for value in range(256)]

def print_sbox(sbox):
    '''
    Print every entry as 'nn ,' (lower-case hex, no prefix), starting a new
    line after each group of 16 entries.
    '''
    for count, value in enumerate(sbox, start=1):
        print('%02x' % value, ',', end='')
        if count % 16 == 0:
            print()

def print_all_aes_sbox():
    '''
    For every parameter set found by get_all_WZY(aesf), build X and its
    inverse and print the resulting AES_SBOX table followed by a blank line.
    '''
    for params in get_all_WZY(aesf):
        X = gen_X(aesf, *params)
        print_sbox(AES_SBOX(X, gen_X_inv(X)))
        print()

# Bytes XORed together by G256_new_basis in ZUC_SBOX; to_matrix treats them as the columns of the matrix.
ZUC_A = [0b01110111, 0b10111011, 0b11011101, 0b11101110, 0b11001011, 0b01101101, 0b00111110, 0b10010111]
# Bits of the constant 0x55, most significant bit first.
ZUC_C = [0, 1, 0, 1, 0, 1, 0, 1]

def ZUC_SBOX(X, X_inv):
    '''
    Build the 256-entry table: map each input through X_inv, invert it with
    G256_inv, map back through X, apply ZUC_A and XOR with 0x55.
    '''
    def substitute(value):
        composite = G256_new_basis(value, X_inv)
        inverted = G256_inv(composite)
        standard = G256_new_basis(inverted, X)
        return G256_new_basis(standard, ZUC_A) ^ 0x55

    return [substitute(value) for value in range(256)]

def print_all_zuc_sbox():
    '''
    For every parameter set found by get_all_WZY(zucf), build X and its
    inverse and print the resulting ZUC_SBOX table followed by a blank line.
    '''
    for params in get_all_WZY(zucf):
        X = gen_X(zucf, *params)
        print_sbox(ZUC_SBOX(X, gen_X_inv(X)))
        print()

def print_m(m):
    '''
    Print every entry of m as '0xnn ,' (lower-case hex) on one line,
    without a trailing newline.
    '''
    for value in m:
        print('0x%02x' % value, ',', end='')

def gen_all_m1_c1_m2_c2():
    '''
    Print M1, C1, M2 and C2 for every combination of an AES parameter set
    and a ZUC parameter set returned by get_all_WZY.

    For each combination:
        M1 = Xaes * Xzuc^-1
        C1 = 0
        M2 = Azuc * Xzuc * Xaes^-1 * Aaes^-1
        C2 = 0x55 ^ (M2 * 0x63)
    Matrices are printed as their eight row bytes.
    '''
    Aaes_inv = to_matrix(AES_A).Inverse()
    Azuc = to_matrix(ZUC_A)

    # The AES constant as an 8x1 column vector.
    Caes = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for i in range(8):
        Caes.SetRow(i, [AES_C[i]])

    # The ZUC basis matrices do not depend on the AES parameter set, so
    # build each (Xzuc, Xzuc^-1) pair once instead of once per outer iteration.
    zuc_bases = []
    for v2 in get_all_WZY(zucf):
        Xzuc = to_matrix(gen_X(zucf, *v2))
        zuc_bases.append((Xzuc, Xzuc.Inverse()))

    for v1 in get_all_WZY(aesf):
        Xaes = to_matrix(gen_X(aesf, *v1))
        Xaes_inv = Xaes.Inverse()
        for Xzuc, Xzuc_inv in zuc_bases:
            M1 = Xaes * Xzuc_inv
            M2 = Azuc * Xzuc * Xaes_inv * Aaes_inv
            C2 = M2 * Caes
            print('M1=', '', end='')
            print_m(matrix_rows(M1))
            print(' C1=', '', end='')
            print(hex(0x0))
            print('M2=', '', end='')
            print_m(matrix_rows(M2))
            print(' C2=', '', end='')
            print(hex(0x55 ^ matrix_col_byte(C2.GetColumn(0))))
            print()

# Run the search and print every (M1, C1, M2, C2) combination.
gen_all_m1_c1_m2_c2()

结果：

M1= 0x28 ,0x58 ,0xf6 ,0x76 ,0x8a ,0x40 ,0x3e ,0xf3 , C1= 0x0
M2= 0x81 ,0xfd ,0x57 ,0x8e ,0xdb ,0x6d ,0xf6 ,0x2e , C2= 0xab

M1= 0x3c ,0xaa ,0xe2 ,0x90 ,0xb2 ,0x78 ,0x3e ,0x2b , C1= 0x0
M2= 0x0e ,0x43 ,0x91 ,0x08 ,0xa3 ,0x93 ,0x70 ,0x6e , C2= 0xbc

M1= 0xc6 ,0xac ,0x18 ,0x9e ,0x5a ,0x4e ,0x12 ,0x95 , C1= 0x0
M2= 0x01 ,0x5d ,0x26 ,0x88 ,0xcc ,0xb3 ,0x36 ,0x96 , C2= 0xd8

M1= 0x0c ,0x5e ,0xd2 ,0xa6 ,0xbc ,0xa8 ,0x12 ,0xbf , C1= 0x0
M2= 0x87 ,0x25 ,0xe0 ,0x07 ,0x72 ,0x82 ,0xb9 ,0xdf , C2= 0x58

M1= 0x70 ,0x7c ,0xae ,0x1e ,0xf0 ,0xc8 ,0x06 ,0xdd , C1= 0x0
M2= 0x02 ,0xa5 ,0xd8 ,0x5a ,0x05 ,0xd9 ,0xed ,0x0d , C2= 0xfe

//Intel也用了这组
M1= 0x96 ,0x50 ,0x48 ,0xd4 ,0xe4 ,0xdc ,0x06 ,0x11 , C1= 0x0
M2= 0x3a ,0xd4 ,0x1e ,0xad ,0xb2 ,0x99 ,0x1a ,0x3c , C2= 0x32

M1= 0x52 ,0x6e ,0x8c ,0x02 ,0x26 ,0xc0 ,0xf4 ,0x47 , C1= 0x0
M2= 0x95 ,0x45 ,0x66 ,0xf5 ,0x9d ,0xe7 ,0x84 ,0x15 , C2= 0xec

M1= 0x6a ,0x42 ,0xb4 ,0x16 ,0xec ,0x0a ,0xf4 ,0xa7 , C1= 0x0
M2= 0x62 ,0xf2 ,0xa0 ,0xcd ,0xec ,0xae ,0xbc ,0xeb , C2= 0xb7

计算查找表

from pyfinite import genericmatrix

def XOR(x, y):
    '''
    Bitwise XOR; passed to GenericMatrix as the add and sub callbacks.
    '''
    return x ^ y


def AND(x, y):
    '''
    Bitwise AND; passed to GenericMatrix as the mul callback.
    '''
    return x & y


def DIV(x, y):
    '''
    Division callback passed to GenericMatrix: returns x, ignoring y.
    '''
    return x

def genCMatrix(c):
    '''
    Return the byte c as an 8x1 GenericMatrix column vector over {0, 1},
    with the most significant bit in row 0.

    The parameter used to be ignored in favour of a hard-coded 0x63; the
    existing caller passes 0x63, so its result is unchanged.
    '''
    Cmatrix = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for j in range(8):
        Cmatrix.SetRow(j, [(c >> (7 - j)) & 1])
    return Cmatrix

def matrix_from_cols(cols):
    '''
    Build an 8x8 GenericMatrix over {0, 1} whose column j holds the bits of
    cols[j], with the most significant bit in row 0.
    '''
    m = genericmatrix.GenericMatrix(size=(8, 8), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for row, shift in enumerate(range(7, -1, -1)):
        m.SetRow(row, [(cols[col] >> shift) & 1 for col in range(8)])

    return m

def gen_matrix_based_table(table):
    '''
    Recover the matrix behind a 256-entry lookup table: column k is the
    table entry for the single-bit input (0x80, 0x40, ..., 0x01) XORed with
    table[0].
    '''
    base = table[0]
    return matrix_from_cols([table[1 << bit] ^ base for bit in range(7, -1, -1)])

def gen_matrix_based_high_low(high, low):
    '''
    Combine two 16-entry nibble tables into the 256-entry table
    high[i] ^ low[j] (index 16*i + j) and recover its matrix with
    gen_matrix_based_table.
    '''
    table = [high[i] ^ low[j] for i in range(16) for j in range(16)]
    return gen_matrix_based_table(table)

def matrix_col_byte(c):
    '''
    Pack the eight entries of column c into one byte; c[0] becomes the most
    significant bit.
    '''
    packed = 0
    for row in range(8):
        packed ^= c[row] << (7 - row)
    return packed

def gen_lookup(m, c):
    '''
    Return the 256-entry table whose entry i is (m * i) ^ c, where i is
    expanded to an 8x1 bit column (most significant bit in row 0).
    '''
    table = []
    for value in range(256):
        vec = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
        for row, shift in enumerate(range(7, -1, -1)):
            vec.SetRow(row, [(value >> shift) & 1])
        product = m * vec
        table.append(matrix_col_byte(product.GetColumn(0)) ^ c)
    return table

def gen_lookup_low(m, c):
    '''
    Like gen_lookup, but only the low nibble of each index is fed to the
    matrix (the upper four rows of the input column are zero).
    Entry i is (m * (i & 0x0F)) ^ c.
    '''
    table = []
    for value in range(256):
        low_nibble = value & 0x0F  # the upper four bits contribute zeros
        vec = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
        for row in range(8):
            vec.SetRow(row, [(low_nibble >> (7 - row)) & 1])
        product = m * vec
        table.append(matrix_col_byte(product.GetColumn(0)) ^ c)
    return table

def gen_lookup_high(m, c):
    '''
    Like gen_lookup, but only the high nibble of each index is fed to the
    matrix (the lower four rows of the input column are zero).
    Entry i is (m * (i & 0xF0)) ^ c.
    '''
    table = []
    for value in range(256):
        high_nibble = value & 0xF0  # the lower four bits contribute zeros
        vec = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
        for row in range(8):
            vec.SetRow(row, [(high_nibble >> (7 - row)) & 1])
        product = m * vec
        table.append(matrix_col_byte(product.GetColumn(0)) ^ c)
    return table

def print_table(table):
    '''
    Print every entry as '0xNN ,' (upper-case hex), starting a new line
    after each group of 16 entries.
    '''
    for count, value in enumerate(table, start=1):
        print('0x%02X' % value, ',', end='')
        if count % 16 == 0:
            print()

def print_high(table):
    '''
    Print every 16th entry of table (indices 0, 16, 32, ...) as '0xNN ,'
    on one line, then end the line.
    '''
    for idx, value in enumerate(table):
        if idx % 16:
            continue  # keep only the first entry of each group of 16
        print('0x%02X' % value, ',', end='')
    print()

def print_low(table):
    '''
    Print the first 16 entries of table as '0xNN ,' on one line, then end
    the line.
    '''
    for idx, value in enumerate(table):
        if idx >= 16:
            break  # nothing past the first 16 entries is printed
        print('0x%02X' % value, ',', end='')
    print()

def to_matrix(x):
    '''
    Build an 8x8 GenericMatrix over {0, 1} whose row i holds the bits of
    x[i], with the most significant bit in column 0.
    '''
    m = genericmatrix.GenericMatrix(size=(8,8), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for row in range(8):
        m.SetRow(row, [(x[row] >> shift) & 1 for shift in range(7, -1, -1)])
    return m

def gen_intel_c(m, c):
    '''
    Return (m * 0x63) ^ c, where 0x63 is expanded to a column vector by
    genCMatrix.
    '''
    product = m * genCMatrix(0x63)
    return matrix_col_byte(product.GetColumn(0)) ^ c

# Row bytes of M2 from the parameter set noted in the results as also used by Intel (its C2 is 0x32).
Mmatrix = to_matrix([0x3a ,0xd4 ,0x1e ,0xad ,0xb2 ,0x99 ,0x1a ,0x3c])
print('High')
# High-nibble table with the constant 0x55 XORed into every entry.
print_high(gen_lookup_high(Mmatrix, 0x55))
print()
print('Low for AMD64 wich use Cancel AES 0x63')
# Low-nibble table with no constant XORed in.
print_low(gen_lookup_low(Mmatrix, 0x00))
print()
print('Low for ARM64')
# Low-nibble table with 0x32 ^ 0x55 XORed in; presumably the High and Low entries are XORed
# together, so the two 0x55 terms cancel and 0x32 remains - TODO confirm against the assembly.
print_low(gen_lookup_low(Mmatrix, 0x32^0x55))

结果：

High
0x55 ,0xBA ,0xCC ,0x23 ,0x15 ,0xFA ,0x8C ,0x63 ,0x09 ,0xE6 ,0x90 ,0x7F ,0x49 ,0xA6 ,0xD0 ,0x3F ,

Low for AMD64 wich use Cancel AES 0x63
0x00 ,0x14 ,0xAA ,0xBE ,0x71 ,0x65 ,0xDB ,0xCF ,0xB7 ,0xA3 ,0x1D ,0x09 ,0xC6 ,0xD2 ,0x6C ,0x78 ,

Low for ARM64
0x67 ,0x73 ,0xCD ,0xD9 ,0x16 ,0x02 ,0xBC ,0xA8 ,0xD0 ,0xC4 ,0x7A ,0x6E ,0xA1 ,0xB5 ,0x0B ,0x1F ,

当然，ARM64的外层查找表也可以写成：

# Same M2 row bytes as the parameter set noted in the results as also used by Intel.
Mmatrix = to_matrix([0x3a ,0xd4 ,0x1e ,0xad ,0xb2 ,0x99 ,0x1a ,0x3c])
print('High')
# High-nibble table with no constant XORed in.
print_high(gen_lookup_high(Mmatrix, 0x00))
print()
print('Low for ARM64')
# Low-nibble table carrying the whole constant 0x32 (the C2 of this parameter set).
print_low(gen_lookup_low(Mmatrix, 0x32))

结果：

High
0x00 ,0xEF ,0x99 ,0x76 ,0x40 ,0xAF ,0xD9 ,0x36 ,0x5C ,0xB3 ,0xC5 ,0x2A ,0x1C ,0xF3 ,0x85 ,0x6A ,

Low for ARM64
0x32 ,0x26 ,0x98 ,0x8C ,0x43 ,0x57 ,0xE9 ,0xFD ,0x85 ,0x91 ,0x2F ,0x3B ,0xF4 ,0xE0 ,0x5E ,0x4A ,

内层查找表：

# Row bytes of M1 from the parameter set noted in the results as also used by Intel (its C1 is 0x0).
Mmatrix = to_matrix([0x96 ,0x50 ,0x48 ,0xd4 ,0xe4 ,0xdc ,0x06 ,0x11])
print('High')
# High-nibble table with no constant XORed in.
print_high(gen_lookup_high(Mmatrix, 0x00))
print()
print('Low')
# Low-nibble table with no constant XORed in.
print_low(gen_lookup_low(Mmatrix, 0x00))

结果：

High
0x00 ,0xD5 ,0x08 ,0xDD ,0x7C ,0xA9 ,0x74 ,0xA1 ,0x9C ,0x49 ,0x94 ,0x41 ,0xE0 ,0x35 ,0xE8 ,0x3D ,

Low
0x00 ,0x01 ,0x82 ,0x83 ,0x9E ,0x9F ,0x1C ,0x1D ,0x24 ,0x25 ,0xA6 ,0xA7 ,0xBA ,0xBB ,0x38 ,0x39 ,

AMD64 SHLD SHRD的性能

经过测试，SHLD/SHRD的性能还不如目前的多条指令实现。

	BYTE $0x41; BYTE $0x0F; BYTE $0xA4; BYTE $0xC3; BYTE $0x10 \ // SHLDL(BRC_X0, AX, $16)
	BYTE $0x41; BYTE $0x0F; BYTE $0xA4; BYTE $0xDC; BYTE $0x10 \ // SHLDL(BRC_X1, BX, $16)
	BYTE $0x41; BYTE $0x0F; BYTE $0xA4; BYTE $0xCD; BYTE $0x10 \ // SHLDL(BRC_X2, CX, $16)
	BYTE $0x41; BYTE $0x0F; BYTE $0xA4; BYTE $0xD6; BYTE $0x10

参考：

zuc sbox with aesni, This is the pure golang code to study ZUC implementation with AESENCLAST/AESE instruction.
Faster 128-EEA3 and 128-EIA3 Software, Delayed modular reduction & Carryless multiplication
Efficient Software Implementations of ZUC-256, 这篇文章有ZUC S0 和 S1 的较详细介绍。
Analyzing SNOW and ZUC Security Algorithms Using NIST SP 800-22 and Enhancing their Randomness
Intel(R) Multi-Buffer Crypto for IPsec Library，在Intel CPU架构实现所有优化。
Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode