mirror of
https://github.com/emmansun/gmsm.git
synced 2025-05-11 03:26:17 +08:00
Updated SM3性能优化 (markdown)
parent
ee607864ab
commit
5c221dd095
48
SM3性能优化.md
48
SM3性能优化.md
@ -2,4 +2,50 @@
|
||||
* 纯golang版本,减少W,W'预计算;避免使用条件判断。
|
||||
* 三月初实现了针对AMD64架构的ASM版本。
|
||||
|
||||
下一步,看看能否实现使用AVX2指令集进行优化。
|
||||
下一步,看看能否实现使用AVX2指令集进行优化。
|
||||
|
||||
下面是SHA256 message schedule(消息扩展)的SIMD实现,每四轮计算出接下来的4个DWORD。
|
||||
|
||||
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
|
||||
// SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
|
||||
// SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
|
||||
|
||||
// Transpose data into high/low parts
|
||||
VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
|
||||
VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
|
||||
VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
|
||||
VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12
|
||||
|
||||
|
||||
VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7] = {w12,w11,w10,w9}
|
||||
VPADDD XDWORD0, XTMP0, XTMP0; \ // XTMP0 = W[-7] + W[-16] = {w12+w3, w11+w2, w10+w1, w9+w0}
|
||||
VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15] = {w4,w3,w2,w1}
|
||||
VPSRLD $7, XTMP1, XTMP2; \ // XTMP2 = W[-15] >> 7 = {w4>>7,w3>>7,w2>>7,w1>>7}
|
||||
VPSLLD $(32-7), XTMP1, XTMP3; \ // XTMP3 = W[-15] << 25 = {w4<<25,w3<<25,w2<<25,w1<<25}
|
||||
VPOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7 = {ROTR(7,w4),ROTR(7,w3),ROTR(7,w2),ROTR(7,w1)}
|
||||
VPSRLD $18, XTMP1, XTMP2;
|
||||
|
||||
VPSRLD $3, XTMP1, XTMP4; \ // XTMP4 = W[-15] >> 3
|
||||
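VPSLLD $(32-18), XTMP1, XTMP1; \ // XTMP1 = W[-15] << 14; why no VPOR? (W[-15] >> 18) and (W[-15] << 14) have disjoint bits, so XORing each into XTMP3 equals XORing in W[-15] ror 18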
|
||||
VPXOR XTMP1, XTMP3, XTMP3; \
|
||||
VPXOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
|
||||
VPXOR XTMP4, XTMP3, XTMP1; \ // XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 ^ (W[-15] >> 3)
|
||||
VPSHUFD $0xFA, XDWORD3, XTMP2; \ // XTMP2 = W[-2] {BBAA} {w15,w15,w14,w14}
|
||||
VPADDD XTMP1, XTMP0, XTMP0; \ // XTMP0 = W[-16] + W[-7] + s0
|
||||
VPSRLD $10, XTMP2, XTMP4 // XTMP4 = W[-2] >> 10 {BBAA}
|
||||
|
||||
VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xBxA}
|
||||
VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xBxA}
|
||||
VPXOR XTMP3, XTMP2, XTMP2; \
|
||||
VPXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = s1 {xBxA}
|
||||
VPSHUFB shuff_00BA<>(SB), XTMP4, XTMP4;\ // XTMP4 = s1 {00BA}
|
||||
VPADDD XTMP4, XTMP0, XTMP0; \ // XTMP0 = {..., ..., W[1], W[0]}
|
||||
VPSHUFD $0x50, XTMP0, XTMP2; \ // XTMP2 = W[-2] {DDCC}
|
||||
|
||||
VPSRLD $10, XTMP2, XTMP5; \ // XTMP5 = W[-2] >> 10 {DDCC}
|
||||
VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xDxC}
|
||||
VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xDxC}
|
||||
VPXOR XTMP3, XTMP2, XTMP2; \
|
||||
VPXOR XTMP2, XTMP5, XTMP5; \ // XTMP5 = s1 {xDxC}
|
||||
VPSHUFB shuff_DC00<>(SB), XTMP5, XTMP5;\ // XTMP5 = s1 {DC00}
|
||||
VPADDD XTMP0, XTMP5, XDWORD0; \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
|
Loading…
x
Reference in New Issue
Block a user