mirror of
https://github.com/emmansun/gmsm.git
synced 2025-05-10 11:06:18 +08:00
Page:
Efficient Software Implementations of ZUC
Pages
Armv8.2 SM3和SM4
Efficient Software Implementations of ZUC
GCM for SM4
Golang ppc64x asm Reference
Golang s390x asm Reference
High‐Throughput Elliptic Curve Cryptography using AVX2 Vector Instructions
High‐assurance field inversion for curve‐based cryptography
Home
Intel CPU supports SM3 SM4
PQC: ML‐DSA
PQC: ML‐KEM
PQC: SLH‐DSA
SM2 WWMM (2)
SM2 WWMM
SM2加解密性能
SM2性能优化
SM2性能优化(续)
SM3中的FF2和GG2函数
SM3性能优化
SM4 with AESENCLAST
SM4 with GFNI
SM4性能优化
SM9实现及优化
go crypto and BoringCrypto
is my code constant time?
sbox generation
stealth addresses (隐身地址)
关于CTR模式
关于证书和TLS支持
实现Kyber所需的多项式和线性代数知识
实现ML‐DSA所需的多项式和线性代数知识
无进位乘法和GHASH
门限签名
Clone
Table of Contents
This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
主要涉及:
- Keystream generator
- SIMD + AES-NI
- EIA
- Carryless multiplication, 无进位乘法, Use instruction PCLMULQDQ (AMD64)
- Multi-Buffer, 多路并行
S1 Sbox生成
from pyfinite import ffield
# Field polynomial x^8 + x^7 + x^3 + x + 1 passed to FField below.
gen = 0b110001011
F = ffield.FField(8, gen, useLUT=0) # useLUT=0 must be given here, otherwise things go wrong...
# Eight bytes that zuc_sbox_gen XORs together, one per set bit of the inverse (MSB first).
A = [0b01110111, 0b10111011, 0b11011101, 0b11101110, 0b11001011, 0b01101101, 0b00111110, 0b10010111]
def zuc_sbox_gen(x):
    '''
    Given x, return S(x).

    The field inverse of x is taken in F, each set bit of the inverse
    (most significant first) selects one entry of A to XOR in, and the
    constant 0x55 is added at the end.  x == 0 has no inverse, so it is
    mapped straight to 0x55, the value the table-building script uses
    for index 0.
    '''
    if x == 0:
        return 0x55
    x_inv = F.Inverse(x)
    y = 0
    for i, a in enumerate(A):
        if x_inv & (1 << (7 - i)):
            y ^= a  # this bit is 1, so XOR in the corresponding column
    return y ^ 0x55
def print_table(table):
    '''
    Print every entry of table as "0xNN ," (upper-case hex, two digits),
    starting a new line after each group of 16 entries.
    '''
    for i, s in enumerate(table):
        # Two print arguments on purpose: the default separator yields "0xNN ,".
        print(f'0x{s:02X}', ',', end='')
        if (i + 1) % 16 == 0:
            print()
# Build the 256-entry S-box: index 0 is pinned to 0x55, every other
# index is computed by zuc_sbox_gen.
sbox = [0x55]
sbox.extend(zuc_sbox_gen(value) for value in range(1, 256))
print_table(sbox)
从AES S盒计算ZUC S1
参考aes和sm4s盒复合域实现方法的做法:
$S_{zuc}(x)=L(S_{aes}(Mx))+C$,下面我们尝试推导 $L$、$M$、$C$。
假设复合域求逆运算为 $f$,则:
S_{aes}(x)=A_{aes}X_{aes}f(X^{-1}_{aes}x) + 0x63
S_{zuc}(x)=A_{zuc}X_{zuc}f(X^{-1}_{zuc}x) + 0x55
得到
L=A_{zuc}X_{zuc}X^{-1}_{aes}A^{-1}_{aes} \\
M=X_{aes}X^{-1}_{zuc} \\
C=L\cdot 0x63+0x55
只有这种S盒和AES盒构造完全同构的,用AESENCLAST时,使用0x63作为enc_key才有点意义。
from pyfinite import ffield
from pyfinite import genericmatrix
def XOR(x, y):
    '''Bitwise XOR, used as both addition and subtraction for the GF(2) matrices.'''
    return x ^ y


def AND(x, y):
    '''Bitwise AND, used as multiplication for the GF(2) matrices.'''
    return x & y


def DIV(x, y):
    '''Division stand-in for the GF(2) matrices: returns x and ignores y.'''
    return x
def aes_f():
    '''
    Build the GF(2^8) field for AES: polynomial x^8 + x^4 + x^3 + x + 1,
    created with useLUT=0.
    '''
    return ffield.FField(8, 0b100011011, useLUT=0)
def zuc_f():
    '''
    Build the GF(2^8) field for ZUC S1: polynomial x^8 + x^7 + x^3 + x + 1,
    created with useLUT=0.
    '''
    return ffield.FField(8, 0b110001011, useLUT=0)
# Module-level field instances: aesf uses the AES polynomial, zucf the ZUC one.
aesf = aes_f()
zucf = zuc_f()
def field_pow2(x, F):
    '''Return x^2, computed with F.Multiply.'''
    squared = F.Multiply(x, x)
    return squared
def field_pow3(x, F):
    '''Return x^3, computed as x * (x * x) with F.Multiply.'''
    squared = F.Multiply(x, x)
    return F.Multiply(x, squared)
def field_pow4(x, F):
    '''Return x^4, computed by squaring twice with F.Multiply.'''
    squared = F.Multiply(x, x)
    return F.Multiply(squared, squared)
def field_pow16(x, F):
    '''Return x^16, computed by squaring four times with F.Multiply.'''
    result = x
    for _ in range(4):
        result = F.Multiply(result, result)
    return result
def get_all_WZY(F):
    '''
    Brute-force every (W, Z, Y) combination in the field F.

    W runs over the roots of x^2 + x + 1, Z over the roots of
    x^2 + x + W^2, and Y over the roots of x^2 + x + u with
    u = (W^2)^2 * Z.  Each hit is returned as
    [W, W^2, Z, Z^4, Y, Y^16], in search order.
    '''
    def roots(const):
        # All r in 0..255 with r^2 ^ r ^ const == 0.
        return [r for r in range(256) if field_pow2(r, F) ^ r ^ const == 0]

    found = []
    for w in roots(1):
        w_sq = field_pow2(w, F)
        for z in roots(w_sq):
            z_4 = field_pow4(z, F)
            u = F.Multiply(field_pow2(w_sq, F), z)
            for y in roots(u):
                found.append([w, w_sq, z, z_4, y, field_pow16(y, F)])
    return found
def gen_X(F, W, W_2, Z, Z_4, Y, Y_16):
    '''
    Return the eight products w * z * y for w in (W^2, W), z in (Z^4, Z)
    and y in (Y^16, Y).

    Order of the result: y varies slowest, then z, then w, i.e.
    [W^2 Z^4 Y^16, W Z^4 Y^16, W^2 Z Y^16, W Z Y^16,
     W^2 Z^4 Y,    W Z^4 Y,    W^2 Z Y,    W Z Y].
    '''
    products = []
    for y_factor in (Y_16, Y):
        for z_factor in (Z_4, Z):
            for w_factor in (W_2, W):
                products.append(F.Multiply(F.Multiply(w_factor, z_factor), y_factor))
    return products
def to_matrix(x):
    '''
    Build an 8x8 GF(2) matrix whose columns are the bytes x[0]..x[7].

    Row 0 holds the most significant bit of every byte and row 7 the
    least significant one.
    '''
    m = genericmatrix.GenericMatrix(size=(8,8), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for row in range(8):
        shift = 7 - row
        m.SetRow(row, [(x[col] >> shift) & 1 for col in range(8)])
    return m
def matrix_col_byte(c):
    '''Pack the eight bits c[0]..c[7] into one byte, with c[0] as the most significant bit.'''
    packed = 0
    for position in range(8):
        packed = (packed << 1) ^ c[position]
    return packed
def matrix_row_byte(c):
    '''Pack the eight bits c[0]..c[7] of a matrix row into one byte, with c[0] as the most significant bit.'''
    value = 0
    for index in range(8):
        value = (value << 1) ^ c[index]
    return value
def matrix_cols(m):
    '''Return the eight columns of m, each packed into a byte by matrix_col_byte.'''
    return [matrix_col_byte(m.GetColumn(index)) for index in range(8)]
def matrix_rows(m):
    '''Return the eight rows of m, each packed into a byte by matrix_row_byte.'''
    return [matrix_row_byte(m.GetRow(index)) for index in range(8)]
def gen_X_inv(x):
    '''
    Turn the byte list x into a matrix (via to_matrix), invert it, and
    return the columns of the inverse as bytes.
    '''
    inverse = to_matrix(x).Inverse()
    return matrix_cols(inverse)
def G4_mul(x, y):
    '''
    GF(2^2) multiplication of the 2-bit values x and y; normal basis is {W^2, W}.
    '''
    x_hi, x_lo = (x >> 1) & 1, x & 1
    y_hi, y_lo = (y >> 1) & 1, y & 1
    # Term shared by both output bits.
    shared = (x_hi ^ x_lo) & (y_hi ^ y_lo)
    hi = (x_hi & y_hi) ^ shared
    lo = (x_lo & y_lo) ^ shared
    return (hi << 1) | lo
def G4_mul_N(x):
    '''
    GF(2^2) multiplication of x by N, where N = W^2; normal basis is {W^2, W}.
    '''
    hi = (x >> 1) & 1
    lo = x & 1
    return (lo << 1) | (hi ^ lo)
def G4_mul_N2(x):
    '''
    GF(2^2) multiplication of x by N^2, where N = W^2; normal basis is {W^2, W}.
    '''
    hi = (x >> 1) & 1
    lo = x & 1
    top = hi ^ lo
    return (top << 1) | hi
def G4_inv(x):
    '''
    GF(2^2) inverse operator: swaps the two bits of x.
    '''
    hi = (x >> 1) & 1
    lo = x & 1
    return (lo << 1) | hi
def G16_mul(x, y):
    '''
    GF(2^4) multiplication of the 4-bit values x and y; normal basis is {Z^4, Z}.
    Each 2-bit half is handled with the GF(2^2) helpers.
    '''
    x_hi, x_lo = (x & 0xc) >> 2, x & 0x03
    y_hi, y_lo = (y & 0xc) >> 2, y & 0x03
    # Term shared by both halves: (x_hi + x_lo)(y_hi + y_lo) scaled by N.
    shared = G4_mul_N(G4_mul(x_hi ^ x_lo, y_hi ^ y_lo))
    hi = G4_mul(x_hi, y_hi) ^ shared
    lo = G4_mul(x_lo, y_lo) ^ shared
    return (hi << 2) | lo
def G16_sq_mul_u(x):
    '''
    GF(2^4) operator x^2 * u, with u = N^2 Z and N = W^2.
    '''
    hi = (x & 0xc) >> 2
    lo = x & 0x03
    new_hi = G4_inv(hi ^ lo)
    new_lo = G4_mul_N2(G4_inv(lo))
    return (new_hi << 2) | new_lo
def G16_inv(x):
    '''
    GF(2^4) inverse operator, built from the GF(2^2) helpers.
    '''
    hi = (x & 0xc) >> 2
    lo = x & 0x03
    scaled = G4_mul_N(G4_inv(hi ^ lo))
    product = G4_mul(hi, lo)
    # A single GF(2^2) inversion, then one multiplication per half;
    # the halves come out swapped.
    inv = G4_inv(scaled ^ product)
    new_hi = G4_mul(inv, lo)
    new_lo = G4_mul(inv, hi)
    return (new_hi << 2) | new_lo
def G256_inv(x):
    '''
    GF(2^8) inverse operator, built from the GF(2^4) helpers.
    '''
    hi = (x & 0xf0) >> 4
    lo = x & 0x0f
    scaled = G16_sq_mul_u(hi ^ lo)
    product = G16_mul(hi, lo)
    # A single GF(2^4) inversion, then one multiplication per nibble;
    # the nibbles come out swapped.
    inv = G16_inv(scaled ^ product)
    new_hi = G16_mul(inv, lo)
    new_lo = G16_mul(inv, hi)
    return (new_hi << 4) | new_lo
def G256_new_basis(x, b):
    '''
    Representation of x under the new basis b: XOR together b[i] for
    every set bit of x, where bit 7 of x selects b[0] and bit 0 selects b[7].
    '''
    result = 0
    for index in range(8):
        if (x >> (7 - index)) & 1:
            result ^= b[index]
    return result
# Bytes applied by AES_SBOX as the final linear step (through G256_new_basis), one per input bit, MSB first.
AES_A = [0b10001111, 0b11000111, 0b11100011, 0b11110001, 0b11111000, 0b01111100, 0b00111110, 0b00011111]
# The bits of 0x63, most significant first.
AES_C = [0, 1, 1, 0, 0, 0, 1, 1]
def AES_SBOX(X, X_inv):
    '''
    Build a 256-entry S-box from the basis X and its inverse X_inv.

    Each input is mapped through X_inv, inverted with G256_inv, mapped
    back through X, passed through AES_A and finally XORed with 0x63.
    '''
    def entry(value):
        t = G256_new_basis(value, X_inv)
        t = G256_inv(t)
        t = G256_new_basis(t, X)
        return G256_new_basis(t, AES_A) ^ 0x63

    return [entry(value) for value in range(256)]
def print_sbox(sbox):
    '''
    Print every entry of sbox as "nn ," (lower-case hex, two digits, no
    0x prefix), starting a new line after each group of 16 entries.
    '''
    for i, s in enumerate(sbox):
        # Two print arguments on purpose: the default separator yields "nn ,".
        print(f'{s:02x}', ',', end='')
        if (i + 1) % 16 == 0:
            print()
def print_all_aes_sbox():
    '''
    For every parameter set returned by get_all_WZY(aesf), build the
    basis X and its inverse and print the resulting AES_SBOX table,
    followed by a blank line.
    '''
    for params in get_all_WZY(aesf):
        X = gen_X(aesf, *params)
        print_sbox(AES_SBOX(X, gen_X_inv(X)))
        print()
# Bytes applied by ZUC_SBOX as the final linear step (through G256_new_basis), one per input bit, MSB first.
ZUC_A = [0b01110111, 0b10111011, 0b11011101, 0b11101110, 0b11001011, 0b01101101, 0b00111110, 0b10010111]
# The bits of 0x55, most significant first.
ZUC_C = [0, 1, 0, 1, 0, 1, 0, 1]
def ZUC_SBOX(X, X_inv):
    '''
    Build a 256-entry S-box from the basis X and its inverse X_inv.

    Each input is mapped through X_inv, inverted with G256_inv, mapped
    back through X, passed through ZUC_A and finally XORed with 0x55.
    '''
    def entry(value):
        t = G256_new_basis(value, X_inv)
        t = G256_inv(t)
        t = G256_new_basis(t, X)
        return G256_new_basis(t, ZUC_A) ^ 0x55

    return [entry(value) for value in range(256)]
def print_all_zuc_sbox():
    '''
    For every parameter set returned by get_all_WZY(zucf), build the
    basis X and its inverse and print the resulting ZUC_SBOX table,
    followed by a blank line.
    '''
    for params in get_all_WZY(zucf):
        X = gen_X(zucf, *params)
        print_sbox(ZUC_SBOX(X, gen_X_inv(X)))
        print()
def print_m(m):
    '''
    Print every byte of m as "0xnn ," (lower-case hex, two digits) on a
    single line, without a trailing newline.
    '''
    for s in m:
        # Two print arguments on purpose: the default separator yields "0xnn ,".
        print(f'0x{s:02x}', ',', end='')
def gen_all_m1_c1_m2_c2():
    '''
    Print M1, C1, M2 and C2 for every pairing of an AES basis with a ZUC basis.

    For each pair:
        M1 = Xaes * Xzuc^-1
        M2 = Azuc * Xzuc * Xaes^-1 * Aaes^-1
        C1 = 0
        C2 = M2 * 0x63 + 0x55
    Matrices are printed as their eight row bytes.
    '''
    Aaes_inv = to_matrix(AES_A).Inverse()
    Azuc = to_matrix(ZUC_A)

    # AES affine constant as an 8x1 column vector.
    Caes = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for i in range(8):
        Caes.SetRow(i, [AES_C[i]])
    # ZUC affine constant as a byte; ZUC_C packs to 0x55.
    zuc_const = matrix_col_byte(ZUC_C)

    # The ZUC bases do not depend on the AES basis, so build and invert them once.
    zuc_bases = []
    for v2 in get_all_WZY(zucf):
        Xzuc = to_matrix(gen_X(zucf, *v2))
        zuc_bases.append((Xzuc, Xzuc.Inverse()))

    for v1 in get_all_WZY(aesf):
        Xaes = to_matrix(gen_X(aesf, *v1))
        Xaes_inv = Xaes.Inverse()
        for Xzuc, Xzuc_inv in zuc_bases:
            M1 = Xaes * Xzuc_inv
            M2 = Azuc * Xzuc * Xaes_inv * Aaes_inv
            C2 = M2 * Caes
            print('M1=', '', end='')
            print_m(matrix_rows(M1))
            print(' C1=', '', end='')
            print(hex(0x0))
            print('M2=', '', end='')
            print_m(matrix_rows(M2))
            print(' C2=', '', end='')
            print(hex(zuc_const ^ matrix_col_byte(C2.GetColumn(0))))
            print()
# Run the search and print every (M1, C1, M2, C2) combination.
gen_all_m1_c1_m2_c2()
结果:
M1= 0x28 ,0x58 ,0xf6 ,0x76 ,0x8a ,0x40 ,0x3e ,0xf3 , C1= 0x0
M2= 0x81 ,0xfd ,0x57 ,0x8e ,0xdb ,0x6d ,0xf6 ,0x2e , C2= 0xab
M1= 0x3c ,0xaa ,0xe2 ,0x90 ,0xb2 ,0x78 ,0x3e ,0x2b , C1= 0x0
M2= 0x0e ,0x43 ,0x91 ,0x08 ,0xa3 ,0x93 ,0x70 ,0x6e , C2= 0xbc
M1= 0xc6 ,0xac ,0x18 ,0x9e ,0x5a ,0x4e ,0x12 ,0x95 , C1= 0x0
M2= 0x01 ,0x5d ,0x26 ,0x88 ,0xcc ,0xb3 ,0x36 ,0x96 , C2= 0xd8
M1= 0x0c ,0x5e ,0xd2 ,0xa6 ,0xbc ,0xa8 ,0x12 ,0xbf , C1= 0x0
M2= 0x87 ,0x25 ,0xe0 ,0x07 ,0x72 ,0x82 ,0xb9 ,0xdf , C2= 0x58
M1= 0x70 ,0x7c ,0xae ,0x1e ,0xf0 ,0xc8 ,0x06 ,0xdd , C1= 0x0
M2= 0x02 ,0xa5 ,0xd8 ,0x5a ,0x05 ,0xd9 ,0xed ,0x0d , C2= 0xfe
//Intel也用了这组
M1= 0x96 ,0x50 ,0x48 ,0xd4 ,0xe4 ,0xdc ,0x06 ,0x11 , C1= 0x0
M2= 0x3a ,0xd4 ,0x1e ,0xad ,0xb2 ,0x99 ,0x1a ,0x3c , C2= 0x32
M1= 0x52 ,0x6e ,0x8c ,0x02 ,0x26 ,0xc0 ,0xf4 ,0x47 , C1= 0x0
M2= 0x95 ,0x45 ,0x66 ,0xf5 ,0x9d ,0xe7 ,0x84 ,0x15 , C2= 0xec
M1= 0x6a ,0x42 ,0xb4 ,0x16 ,0xec ,0x0a ,0xf4 ,0xa7 , C1= 0x0
M2= 0x62 ,0xf2 ,0xa0 ,0xcd ,0xec ,0xae ,0xbc ,0xeb , C2= 0xb7
计算查找表
from pyfinite import genericmatrix
def XOR(x, y):
    '''Bitwise XOR, used as both addition and subtraction for the GF(2) matrices.'''
    return x ^ y


def AND(x, y):
    '''Bitwise AND, used as multiplication for the GF(2) matrices.'''
    return x & y


def DIV(x, y):
    '''Division stand-in for the GF(2) matrices: returns x and ignores y.'''
    return x
def genCMatrix(c):
    '''
    Return the byte c as an 8x1 GF(2) column vector, most significant
    bit in row 0.

    The constant is taken from the argument rather than a fixed 0x63,
    so the helper works for any byte; passing 0x63 gives the same
    vector as before.
    '''
    Cmatrix = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for j in range(8):
        Cmatrix.SetRow(j, [(c >> (7 - j)) & 1])
    return Cmatrix
def matrix_from_cols(cols):
    '''
    Build an 8x8 GF(2) matrix whose columns are the bytes cols[0]..cols[7];
    row 0 holds the most significant bit of every byte.
    '''
    m = genericmatrix.GenericMatrix(size=(8, 8), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for row in range(8):
        shift = 7 - row
        mask = 1 << shift
        m.SetRow(row, [(cols[n] & mask) >> shift for n in range(8)])
    return m
def gen_matrix_based_table(table):
    '''
    Build a matrix from a lookup table: column k is
    table[1 << (7 - k)] ^ table[0], i.e. the table entry for a single
    input bit with the entry for input 0 XORed out.
    '''
    columns = [table[1 << bit] ^ table[0] for bit in range(7, -1, -1)]
    return matrix_from_cols(columns)
def gen_matrix_based_high_low(high, low):
    '''
    Combine a 16-entry high-nibble table and a 16-entry low-nibble table
    into a 256-entry table (entry i*16+j is high[i] ^ low[j]) and build
    a matrix from it via gen_matrix_based_table.
    '''
    combined = [high[i] ^ low[j] for i in range(16) for j in range(16)]
    return gen_matrix_based_table(combined)
def matrix_col_byte(c):
    '''Pack the eight bits c[0]..c[7] into one byte, with c[0] as the most significant bit.'''
    packed = 0
    for position in range(8):
        packed = (packed << 1) ^ c[position]
    return packed
def gen_lookup(m, c):
    '''
    Return the 256-entry table whose entry i is (m * i) ^ c, where i is
    treated as an 8x1 bit column with its most significant bit in row 0.
    '''
    table = []
    for value in range(256):
        vec = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
        for row in range(8):
            vec.SetRow(row, [(value >> (7 - row)) & 1])
        product = m * vec
        table.append(matrix_col_byte(product.GetColumn(0)) ^ c)
    return table
def gen_lookup_low(m, c):
    '''
    Return the 256-entry table whose entry i is (m * (i & 0x0f)) ^ c:
    only the low nibble of i feeds the matrix, the high four bits are
    forced to zero.
    '''
    table = []
    for value in range(256):
        vec = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
        for row in range(8):
            # Rows 0-3 carry the high nibble, which is zeroed here.
            bit = (value >> (7 - row)) & 1 if row >= 4 else 0
            vec.SetRow(row, [bit])
        product = m * vec
        table.append(matrix_col_byte(product.GetColumn(0)) ^ c)
    return table
def gen_lookup_high(m, c):
    '''
    Return the 256-entry table whose entry i is (m * (i & 0xf0)) ^ c:
    only the high nibble of i feeds the matrix, the low four bits are
    forced to zero.
    '''
    table = []
    for value in range(256):
        vec = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
        for row in range(8):
            # Rows 4-7 carry the low nibble, which is zeroed here.
            bit = (value >> (7 - row)) & 1 if row < 4 else 0
            vec.SetRow(row, [bit])
        product = m * vec
        table.append(matrix_col_byte(product.GetColumn(0)) ^ c)
    return table
def print_table(table):
    '''
    Print every entry of table as "0xNN ," (upper-case hex, two digits),
    starting a new line after each group of 16 entries.
    '''
    for i, s in enumerate(table):
        # Two print arguments on purpose: the default separator yields "0xNN ,".
        print(f'0x{s:02X}', ',', end='')
        if (i + 1) % 16 == 0:
            print()
def print_high(table):
    '''
    Print every 16th entry of table (indices 0, 16, 32, ...) as "0xNN ,"
    on one line, then end the line.  On a 256-entry table these are the
    16 values selected by the high nibble.
    '''
    for i, s in enumerate(table):
        if i % 16 == 0:
            print(f'0x{s:02X}', ',', end='')
    print()
def print_low(table):
    '''
    Print the first 16 entries of table as "0xNN ," on one line, then
    end the line.  On a 256-entry table these are the 16 values selected
    by the low nibble.
    '''
    for i, s in enumerate(table):
        if i < 16:
            print(f'0x{s:02X}', ',', end='')
    print()
def to_matrix(x):
    '''
    Build an 8x8 GF(2) matrix whose rows are the bytes x[0]..x[7];
    column 0 holds the most significant bit of every byte.
    '''
    m = genericmatrix.GenericMatrix(size=(8,8), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for row in range(8):
        m.SetRow(row, [(x[row] >> (7 - col)) & 1 for col in range(8)])
    return m
def gen_intel_c(m, c):
    '''
    Return (m * 0x63) ^ c as a byte, where 0x63 is supplied as a column
    vector by genCMatrix.
    '''
    product = m * genCMatrix(0x63)
    return matrix_col_byte(product.GetColumn(0)) ^ c
# Rows of M2 from the result set marked above as the one Intel also uses (C2 = 0x32).
Mmatrix = to_matrix([0x3a ,0xd4 ,0x1e ,0xad ,0xb2 ,0x99 ,0x1a ,0x3c])
print('High')
# The high-nibble table carries the constant 0x55.
print_high(gen_lookup_high(Mmatrix, 0x55))
print()
print('Low for AMD64 wich use Cancel AES 0x63')
# No constant in the low table: high ^ low then adds only 0x55.
print_low(gen_lookup_low(Mmatrix, 0x00))
print()
print('Low for ARM64')
# 0x32 ^ 0x55 cancels the 0x55 already in the high table, so high ^ low adds 0x32 in total.
print_low(gen_lookup_low(Mmatrix, 0x32^0x55))
结果:
High
0x55 ,0xBA ,0xCC ,0x23 ,0x15 ,0xFA ,0x8C ,0x63 ,0x09 ,0xE6 ,0x90 ,0x7F ,0x49 ,0xA6 ,0xD0 ,0x3F ,
Low for AMD64 wich use Cancel AES 0x63
0x00 ,0x14 ,0xAA ,0xBE ,0x71 ,0x65 ,0xDB ,0xCF ,0xB7 ,0xA3 ,0x1D ,0x09 ,0xC6 ,0xD2 ,0x6C ,0x78 ,
Low for ARM64
0x67 ,0x73 ,0xCD ,0xD9 ,0x16 ,0x02 ,0xBC ,0xA8 ,0xD0 ,0xC4 ,0x7A ,0x6E ,0xA1 ,0xB5 ,0x0B ,0x1F ,
当然,ARM64的外层查找表也可以写成:
# Same M2 rows as before; here the whole constant 0x32 sits in the low table
# and the high table carries none, so high ^ low still adds 0x32.
Mmatrix = to_matrix([0x3a ,0xd4 ,0x1e ,0xad ,0xb2 ,0x99 ,0x1a ,0x3c])
print('High')
print_high(gen_lookup_high(Mmatrix, 0x00))
print()
print('Low for ARM64')
print_low(gen_lookup_low(Mmatrix, 0x32))
结果:
High
0x00 ,0xEF ,0x99 ,0x76 ,0x40 ,0xAF ,0xD9 ,0x36 ,0x5C ,0xB3 ,0xC5 ,0x2A ,0x1C ,0xF3 ,0x85 ,0x6A ,
Low for ARM64
0x32 ,0x26 ,0x98 ,0x8C ,0x43 ,0x57 ,0xE9 ,0xFD ,0x85 ,0x91 ,0x2F ,0x3B ,0xF4 ,0xE0 ,0x5E ,0x4A ,
内层查找表:
# Rows of M1 from the same result set (C1 = 0x0), so neither table carries a constant.
Mmatrix = to_matrix([0x96 ,0x50 ,0x48 ,0xd4 ,0xe4 ,0xdc ,0x06 ,0x11])
print('High')
print_high(gen_lookup_high(Mmatrix, 0x00))
print()
print('Low')
print_low(gen_lookup_low(Mmatrix, 0x00))
结果:
High
0x00 ,0xD5 ,0x08 ,0xDD ,0x7C ,0xA9 ,0x74 ,0xA1 ,0x9C ,0x49 ,0x94 ,0x41 ,0xE0 ,0x35 ,0xE8 ,0x3D ,
Low
0x00 ,0x01 ,0x82 ,0x83 ,0x9E ,0x9F ,0x1C ,0x1D ,0x24 ,0x25 ,0xA6 ,0xA7 ,0xBA ,0xBB ,0x38 ,0x39 ,
AMD64 SHLD SHRD的性能
经过测试,SHLD/SHRD的性能还不如目前的多条指令实现。
BYTE $0x41; BYTE $0x0F; BYTE $0xA4; BYTE $0xC3; BYTE $0x10 \ // SHLDL(BRC_X0, AX, $16)
BYTE $0x41; BYTE $0x0F; BYTE $0xA4; BYTE $0xDC; BYTE $0x10 \ // SHLDL(BRC_X1, BX, $16)
BYTE $0x41; BYTE $0x0F; BYTE $0xA4; BYTE $0xCD; BYTE $0x10 \ // SHLDL(BRC_X2, CX, $16)
BYTE $0x41; BYTE $0x0F; BYTE $0xA4; BYTE $0xD6; BYTE $0x10 // presumably SHLDL(BRC_X3, DX, $16), following the pattern of the three lines above -- TODO confirm
参考:
- zuc sbox with aesni, This is the pure golang code to study ZUC implementation with AESENCLAST/AESE instruction.
- Faster 128-EEA3 and 128-EIA3 Software, Delayed modular reduction & Carryless multiplication
- Efficient Software Implementations of ZUC-256, 这篇文章有ZUC S0 和 S1 的较详细介绍。
- Analyzing SNOW and ZUC Security Algorithms Using NIST SP 800-22 and Enhancing their Randomness
- Intel(R) Multi-Buffer Crypto for IPsec Library,在Intel CPU架构实现所有优化。
- Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode