From 76bafca3c420884b2d7683f33d2f70836d140512 Mon Sep 17 00:00:00 2001
From: Sun Yimin
Date: Wed, 20 Sep 2023 10:27:01 +0800
Subject: [PATCH] sm3: format code, #165

---
 sm3/sm3block_simd_amd64.s | 76 +++++++++++++++------------------------
 1 file changed, 28 insertions(+), 48 deletions(-)

diff --git a/sm3/sm3block_simd_amd64.s b/sm3/sm3block_simd_amd64.s
index 4cbf3de..ac89ad9 100644
--- a/sm3/sm3block_simd_amd64.s
+++ b/sm3/sm3block_simd_amd64.s
@@ -56,6 +56,23 @@
 #define _INP _INP_END + INP_END_SIZE
 #define STACK_SIZE _INP + INP_SIZE
 
+#define SS12(a, e, const, ss1, ss2) \
+	MOVL a, ss2; \
+	ROLL $12, ss2; \ // y0 = a <<< 12
+	MOVL e, ss1; \
+	ADDL $const, ss1; \
+	ADDL ss2, ss1; \ // y2 = a <<< 12 + e + T
+	ROLL $7, ss1; \ // y2 = SS1
+	XORL ss1, ss2
+
+#define P0(tt2, tmp, out) \
+	MOVL tt2, tmp; \
+	ROLL $9, tmp; \
+	MOVL tt2, out; \
+	ROLL $17, out; \
+	XORL tmp, out; \
+	XORL tt2, out
+
 // For rounds [0 - 16)
 #define ROUND_AND_SCHED_N_0_0(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) \
 	; \ // ############################# RND N + 0 ############################//
@@ -105,27 +122,27 @@
 	ADDL y0, y2; \ // y2 = a <<< 12 + e + T
 	ROLL $7, y2; \ // y2 = SS1
 	XORL y2, y0 \ // y0 = SS2
-	VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-3] rol 15 {xBxA}
 	ADDL (disp + 1*4)(SP), y2; \ // y2 = SS1 + W
+	VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-3] rol 15 {xBxA}
 	ADDL h, y2; \ // y2 = h + SS1 + W
 	ADDL (disp + 1*4 + 16)(SP), y0; \ // y0 = SS2 + W'
-	VPXOR XTMP1, XTMP2, XTMP2; \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {xxxA}
 	ADDL d, y0; \ // y0 = d + SS2 + W'
+	VPXOR XTMP1, XTMP2, XTMP2; \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {xxxA}
 	MOVL a, h; \
 	XORL b, h; \
-	VPSHUFD $0x00, XTMP2, XTMP2; \ // XTMP2 = {AAAA}
 	XORL c, h; \
 	ADDL y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
+	VPSHUFD $0x00, XTMP2, XTMP2; \ // XTMP2 = {AAAA}
 	MOVL e, y1; \
 	XORL f, y1; \
-	VPSRLQ $17, XTMP2, XTMP3; \ // XTMP3 = XTMP2 rol 15 {xxxA}
 	XORL g, y1; \
 	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
+	VPSRLQ $17, XTMP2, XTMP3; \ // XTMP3 = XTMP2 rol 15 {xxxA}
 	ROLL $9, b; \
 	ROLL $19, f; \
-	VPSRLQ $9, XTMP2, XTMP4; \ // XTMP4 = XTMP2 rol 23 {xxxA}
 	MOVL y2, y0; \
 	ROLL $9, y0; \
+	VPSRLQ $9, XTMP2, XTMP4; \ // XTMP4 = XTMP2 rol 23 {xxxA}
 	MOVL y2, d; \
 	ROLL $17, d; \
 	XORL y0, d; \
@@ -138,8 +155,8 @@
 	ROLL $12, y0; \ // y0 = a <<< 12
 	MOVL e, y2; \
 	ADDL $const, y2; \
-	ADDL y0, y2; \ // y2 = a <<< 12 + e + T
 	VPXOR XTMP4, XTMP3, XTMP4; \ // XTMP4 = XTMP2 ^ (XTMP2 rol 15 {xxxA}) ^ (XTMP2 rol 23 {xxxA})
+	ADDL y0, y2; \ // y2 = a <<< 12 + e + T
 	ROLL $7, y2; \ // y2 = SS1
 	XORL y2, y0 \ // y0 = SS2
 	ADDL (disp + 2*4)(SP), y2; \ // y2 = SS1 + W
@@ -241,12 +258,7 @@
 	ROLL $9, b; \
 	ROLL $19, f; \
 	VPALIGNR $12, XWORD1, XWORD2, XTMP1; \ // XTMP1 = W[-9] = {w10,w9,w8,w7}
-	MOVL y2, y0; \
-	ROLL $9, y0; \
-	MOVL y2, d; \
-	ROLL $17, d; \
-	XORL y0, d; \
-	XORL y2, d; \ // d = P(tt2)
+	P0(y2, y0, d); \
 	VPXOR XWORD0, XTMP1, XTMP1; \ // XTMP1 = W[-9] ^ W[-16]
 
 #define ROUND_AND_SCHED_N_1_1(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) \
@@ -282,12 +294,7 @@
 	ROLL $9, b; \
 	ROLL $19, f; \
 	VPSRLQ $9, XTMP2, XTMP4; \ // XTMP4 = XTMP2 rol 23 {xxxA}
-	MOVL y2, y0; \
-	ROLL $9, y0; \
-	MOVL y2, d; \
-	ROLL $17, d; \
-	XORL y0, d; \
-	XORL y2, d; \ // d = P(tt2)
+	P0(y2, y0, d); \
 	VPXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 23 {xxxA})
 
 #define ROUND_AND_SCHED_N_1_2(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) \
@@ -323,12 +330,7 @@
 	ROLL $9, b; \
 	ROLL $19, f; \
 	VPOR XTMP3, XTMP4, XTMP4; \ // XTMP4 = (W[-3] rol 15) {DCBA}
-	MOVL y2, y0; \
-	ROLL $9, y0; \
-	MOVL y2, d; \
-	ROLL $17, d; \
-	XORL y0, d; \
-	XORL y2, d; \ // d = P(tt2)
+	P0(y2, y0, d); \
 	VPXOR XTMP1, XTMP4, XTMP4; \ // XTMP4 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {DCBA}
 
 #define ROUND_AND_SCHED_N_1_3(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) \
@@ -364,31 +366,9 @@
 	ROLL $9, b; \
 	ROLL $19, f; \
 	VPXOR XTMP3, XTMP1, XTMP1; \ // XTMP1 = XTMP4 ^ (XTMP4 rol 15 {DCBA}) ^ (XTMP4 rol 23 {DCBA})
-	MOVL y2, y0; \
-	ROLL $9, y0; \
-	MOVL y2, d; \
-	ROLL $17, d; \
-	XORL y0, d; \
-	XORL y2, d; \ // d = P(tt2)
+	P0(y2, y0, d); \
 	VPXOR XTMP1, XTMP0, XWORD0; \ // XWORD0 = {W[3], W[2], W[1], W[0]}
 
-#define SS12(a, e, const, ss1, ss2) \
-	MOVL a, ss2; \
-	ROLL $12, ss2; \ // y0 = a <<< 12
-	MOVL e, ss1; \
-	ADDL $const, ss1; \
-	ADDL ss2, ss1; \ // y2 = a <<< 12 + e + T
-	ROLL $7, ss1; \ // y2 = SS1
-	XORL ss1, ss2
-
-#define P0(tt2, tmp, out) \
-	MOVL tt2, tmp; \
-	ROLL $9, tmp; \
-	MOVL tt2, out; \
-	ROLL $17, out; \
-	XORL tmp, out; \
-	XORL tt2, out
-
 // For rounds [0 - 16)
 #define DO_ROUND_N_0(disp, idx, const, a, b, c, d, e, f, g, h) \
 	; \ // ############################# RND N + 0 ############################//
@@ -443,7 +423,7 @@
 
 // Requires: SSE2, SSSE3
 #define MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3) \
-	MOVOU XWORD1, XTMP0; \
+	MOVOU XWORD1, XTMP0; \
 	PALIGNR $12, XWORD0, XTMP0; \ // XTMP0 = W[-13] = {w6,w5,w4,w3}
 	MOVOU XTMP0, XTMP1; \
 	PSLLL $7, XTMP1; \
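Note for reviewers: the SS12 and P0 macros that this patch hoists to the top of the file implement the SM3 round quantities SS1/SS2 and the permutation P0(x) = x ^ (x <<< 9) ^ (x <<< 17) from GB/T 32905-2016. Below is a minimal Go sketch of the same arithmetic, mirroring the MOVL/ROLL/ADDL/XORL sequences one-to-one; the names rotl32, ss12, and p0 are illustrative only and are not part of the sm3 package API.

package main

import "fmt"

// rotl32 rotates x left by n bits (0 < n < 32), the Go
// equivalent of ROLL $n.
func rotl32(x uint32, n uint) uint32 {
	return x<<n | x>>(32-n)
}

// ss12 mirrors the SS12 macro:
//   SS1 = ((a <<< 12) + e + T) <<< 7
//   SS2 = SS1 ^ (a <<< 12)
func ss12(a, e, t uint32) (ss1, ss2 uint32) {
	ss2 = rotl32(a, 12)      // ROLL $12, ss2
	ss1 = rotl32(ss2+e+t, 7) // ADDL $const; ADDL ss2; ROLL $7
	ss2 ^= ss1               // XORL ss1, ss2
	return
}

// p0 mirrors the P0 macro: out = tt2 ^ (tt2 <<< 9) ^ (tt2 <<< 17),
// built from the two ROLL/XORL pairs.
func p0(x uint32) uint32 {
	return x ^ rotl32(x, 9) ^ rotl32(x, 17)
}

func main() {
	// Illustrative inputs: a and e are the first and fifth words of
	// the SM3 IV, t is the round-0 constant T = 0x79cc4519.
	ss1, ss2 := ss12(0x7380166f, 0xa96f30bc, 0x79cc4519)
	fmt.Printf("SS1=%08x SS2=%08x P0(SS1)=%08x\n", ss1, ss2, p0(ss1))
}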