diff --git a/SM3性能优化.md b/SM3性能优化.md index 69d058c..834b1ea 100644 --- a/SM3性能优化.md +++ b/SM3性能优化.md @@ -2,4 +2,50 @@ * 纯golang版本,减少W,W'预计算;避免使用条件判断。 * 三月初实现了针对AMD64架构的ASM版本。 -下一步,看看能否实现使用AVX2指令集进行优化。 \ No newline at end of file +下一步,看看能否实现使用AVX2指令集进行优化。 + +下面是SHA256 message schedule的SIMD实现,每四轮计算出接下来的4个DWORD。 + + // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63 + // SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x) + // SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x) + + // Transpose data into high/low parts + VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0 + VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4 + VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8 + VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12 + + + VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7] = {w12,w11,w10,w9} + VPADDD XDWORD0, XTMP0, XTMP0; \ // XTMP0 = W[-7] + W[-16] = {w12+w3, w11+w2, w10+w1, w9+w0} + VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15] = {w4,w3,w2,w1} + VPSRLD $7, XTMP1, XTMP2; \ // XTMP2 = W[-15] >> 7 = {w4>>7,w3>>7,w2>>7,w1>>7} + VPSLLD $(32-7), XTMP1, XTMP3; \ // XTMP3 = W[-15] << 25 = {w4<<25,w3<<25,w2<<25,w1<<25} + VPOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7 = {ROTR(7,w4),ROTR(7,w3),ROTR(7,w2),ROTR(7,w1)} + VPSRLD $18, XTMP1, XTMP2; + + VPSRLD $3, XTMP1, XTMP4; \ // XTMP4 = W[-15] >> 3 + VPSLLD $(32-18), XTMP1, XTMP1; \ // why no VPOR? W[-15] >> 18 (XTMP2) and W[-15] << 14 (XTMP1) share no bit positions, so XORing each of them into XTMP3 is the same as XORing in ROTR(18)
+ VPXOR XTMP1, XTMP3, XTMP3; \ + VPXOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 + VPXOR XTMP4, XTMP3, XTMP1; \ // XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 ^ (W[-15] >> 3) + VPSHUFD $0xFA, XDWORD3, XTMP2; \ // XTMP2 = W[-2] {BBAA} {w15,w15,w14,w14} + VPADDD XTMP1, XTMP0, XTMP0; \ // XTMP0 = W[-16] + W[-7] + s0 + VPSRLD $10, XTMP2, XTMP4 // XTMP4 = W[-2] >> 10 {BBAA} + + VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xBxA} + VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xBxA} + VPXOR XTMP3, XTMP2, XTMP2; \ + VPXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = s1 {xBxA} + VPSHUFB shuff_00BA<>(SB), XTMP4, XTMP4;\ // XTMP4 = s1 {00BA} + VPADDD XTMP4, XTMP0, XTMP0; \ // XTMP0 = {..., ..., W[1], W[0]} + VPSHUFD $0x50, XTMP0, XTMP2; \ // XTMP2 = W[-2] {DDCC} + + VPSRLD $10, XTMP2, XTMP5; \ // XTMP5 = W[-2] >> 10 {DDCC} + VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xDxC} + VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xDxC} + VPXOR XTMP3, XTMP2, XTMP2; \ + VPXOR XTMP2, XTMP5, XTMP5; \ // XTMP5 = s1 {xDxC} + VPSHUFB shuff_DC00<>(SB), XTMP5, XTMP5;\ // XTMP5 = s1 {DC00} + VPADDD XTMP0, XTMP5, XDWORD0; \ // XDWORD0 = {W[3], W[2], W[1], W[0]} \ No newline at end of file