Updated SM3性能优化 (markdown)

Sun Yimin 2021-03-09 15:48:18 +08:00
parent ee607864ab
commit 5c221dd095

@ -2,4 +2,50 @@
* 纯golang版本减少WW'预计算;避免使用条件判断。
* 三月初实现了针对AMD64架构的ASM版本。
下一步看看能否实现使用AVX2指令集进行优化。
下一步看看能否实现使用AVX2指令集进行优化。
下面是 SHA256 消息调度(message schedule)的 SIMD 实现:每四轮计算出接下来的 4 个 DWORD。
// SHA-256 message schedule, vectorised: computes the next four schedule words
// W[0..3] from the previous sixteen (w0..w15) and leaves them in XDWORD0.
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
// SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
// SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
// (SIGMA0/SIGMA1 here are the message-schedule functions, written as lowercase sigma in FIPS 180-4.)
// Notation: W[-k] is the schedule word k positions before the first word being produced;
// register contents are listed from the highest 32-bit element down to the lowest.
// AVX2 has no 32-bit vector rotate, so every ROTR below is assembled from shifts.
// The XTMP*/XDWORD* register names and the shuff_* masks are defined outside this excerpt.
// NOTE(review): the element lists show a single 128-bit lane; presumably the other lane carries a second block - confirm.
// Transpose data into high/low parts: imm 0x20 gathers the low 128-bit lanes of the two sources, 0x31 the high lanes.
VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12
VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7] = {w12,w11,w10,w9} (XDWORD3:XDWORD2 shifted right by one dword)
VPADDD XDWORD0, XTMP0, XTMP0; \ // XTMP0 = W[-7] + W[-16] = {w12+w3, w11+w2, w10+w1, w9+w0}
VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15] = {w4,w3,w2,w1}
VPSRLD $7, XTMP1, XTMP2; \ // XTMP2 = W[-15] >> 7 = {w4>>7,w3>>7,w2>>7,w1>>7}
VPSLLD $(32-7), XTMP1, XTMP3; \ // XTMP3 = W[-15] << 25 = {w4<<25,w3<<25,w2<<25,w1<<25}
VPOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7 = {ROTR(7,w4),ROTR(7,w3),ROTR(7,w2),ROTR(7,w1)}
VPSRLD $18, XTMP1, XTMP2; // XTMP2 = W[-15] >> 18 (low part of ror 18). NOTE(review): no line continuation here, unlike neighbouring lines - restore it if this is used inside a #define
VPSRLD $3, XTMP1, XTMP4; \ // XTMP4 = W[-15] >> 3
VPSLLD $(32-18), XTMP1, XTMP1; \ // XTMP1 = W[-15] << 14 (high part of ror 18). No VPOR needed: the >>18 and <<14 halves have disjoint bits, so XORing them in one at a time equals XORing in the rotate and saves an instruction
VPXOR XTMP1, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7 ^ (W[-15] << 14)
VPXOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
VPXOR XTMP4, XTMP3, XTMP1; \ // XTMP1 = s0 = W[-15] ror 7 ^ W[-15] ror 18 ^ (W[-15] >> 3)
VPSHUFD $0xFA, XDWORD3, XTMP2; \ // XTMP2 = W[-2] {BBAA} {w15,w15,w14,w14}
VPADDD XTMP1, XTMP0, XTMP0; \ // XTMP0 = W[-16] + W[-7] + s0
VPSRLD $10, XTMP2, XTMP4 // XTMP4 = W[-2] >> 10 {BBAA}. NOTE(review): no statement separator or line continuation here, unlike neighbouring lines - confirm
VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xBxA} (64-bit shift of a duplicated dword pair: only the low dword of each pair is a true rotate, the high dword "x" is garbage)
VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xBxA}
VPXOR XTMP3, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xBxA}
VPXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = s1 {xBxA} = ror 17 ^ ror 19 ^ (W[-2] >> 10)
VPSHUFB shuff_00BA<>(SB), XTMP4, XTMP4;\ // XTMP4 = s1 {00BA} (mask presumably packs the two valid dwords into the low half and zeroes the rest - confirm against shuff_00BA)
VPADDD XTMP4, XTMP0, XTMP0; \ // XTMP0 = {..., ..., W[1], W[0]} (upper two elements still lack their s1 term)
VPSHUFD $0x50, XTMP0, XTMP2; \ // XTMP2 = W[-2] {DDCC} = {W[1],W[1],W[0],W[0]}, the W[-2] inputs for W[3] and W[2]
VPSRLD $10, XTMP2, XTMP5; \ // XTMP5 = W[-2] >> 10 {DDCC}
VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xDxC} (same duplicated-dword rotate trick as above)
VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xDxC}
VPXOR XTMP3, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xDxC}
VPXOR XTMP2, XTMP5, XTMP5; \ // XTMP5 = s1 {xDxC}
VPSHUFB shuff_DC00<>(SB), XTMP5, XTMP5;\ // XTMP5 = s1 {DC00} (mask presumably packs the two valid dwords into the high half and zeroes the rest - confirm against shuff_DC00)
VPADDD XTMP0, XTMP5, XDWORD0; \ // XDWORD0 = {W[3], W[2], W[1], W[0]}