diff --git a/SM3性能优化.md b/SM3性能优化.md index 371659e..3c8a0d4 100644 --- a/SM3性能优化.md +++ b/SM3性能优化.md @@ -8,7 +8,7 @@ # SHA256 SIMD Execution 下面是SHA256 message scheduler SIMD实现,四轮算出下一个4个DWORDs. - +```asm // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63 // SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x) // SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x) @@ -52,6 +52,7 @@ VPXOR XTMP2, XTMP5, XTMP5; \ // XTMP5 = s1 {xDxC} VPSHUFB shuff_DC00<>(SB), XTMP5, XTMP5;\ // XTMP5 = s1 {DC00} VPADDD XTMP0, XTMP5, XDWORD0; \ // XDWORD0 = {W[3], W[2], W[1], W[0]} +``` SM3的message scheduler有两个显著差别: @@ -64,7 +65,7 @@ https://software.intel.com/sites/landingpage/IntrinsicsGuide/ # SM3 SIMD Execution SM3的第一版,比SHA256复杂,不知道有没有继续优化的空间。 - +```asm // Wj ← P1(Wj−16 ⊕ Wj−9 ⊕ (Wj−3 ≪ 15)) ⊕ (Wj−13 ≪ 7) ⊕ Wj−6 // Transpose data into high/low parts VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0 @@ -112,6 +113,7 @@ SM3的第一版,比SHA256复杂,不知道有没有继续优化的空间。 VPXOR XTMP1, XTMP0, XTMP1; \ // XTMP1 = {W[3], W[2], ..., ...} VPALIGNR $8, XTMP1, XTMP2, XTMP3; \ // XTMP3 = {W[1], W[0], W[3], W[2]} VPSHUFD $0x4E, XTMP3, XDWORD0; \ // XDWORD0 = {W[3], W[2], W[1], W[0]} +``` 由于要算**52**个DWORDs,所以