diff --git a/sm3/sm3blocks_s390x.s b/sm3/sm3blocks_s390x.s index b957538..17dcd8a 100644 --- a/sm3/sm3blocks_s390x.s +++ b/sm3/sm3blocks_s390x.s @@ -127,38 +127,38 @@ GLOBL mask<>(SB), 8, $64 ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \ #define ROUND_16_63(index, const, a, b, c, d, e, f, g, h) \ - MESSAGE_SCHEDULE(index) \ // V11 is Wt+4 now, Pls do not use it + MESSAGE_SCHEDULE(index) \ // TMP1 holds Wt+4 from here on; do not overwrite it before the Wj XOR Wj+4 step PROLD(a, TMP0, 12) \ - VLR TMP0, TMP1 \ + VLR TMP0, TMP4 \ VLREPF (index*4)(R3), TMP2 \ VAF TMP2, TMP0, TMP0 \ VAF e, TMP0, TMP0 \ PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1 - VX TMP2, TMP1, TMP0 \ // TMP0 = SS2 + VX TMP2, TMP4, TMP0 \ // TMP0 = SS2 VO a, b, TMP3 \ - VN a, b, TMP1 \ + VN a, b, TMP4 \ VN c, TMP3, TMP3 \ - VO TMP1, TMP3, TMP1 \ // (a AND b) OR (a AND c) OR (b AND c) - VAF TMP1, d, TMP1 \ // (a AND b) OR (a AND c) OR (b AND c) + d + VO TMP4, TMP3, TMP4 \ // (a AND b) OR (a AND c) OR (b AND c) + VAF TMP4, d, TMP4 \ // (a AND b) OR (a AND c) OR (b AND c) + d loadWordByIndex(TMP3, index) \ // Wj - VX TMP3, TMP4, TMP4 \ // Wj XOR Wj+4 - VAF TMP1, TMP4, TMP1 \ // (a AND b) OR (a AND c) OR (b AND c) + d + (Wt XOR Wt+4) - VAF TMP1, TMP0, TMP1 \ // TT1 + VX TMP3, TMP1, TMP1 \ // TMP1 = Wj XOR Wj+4 + VAF TMP4, TMP1, TMP4 \ // (a AND b) OR (a AND c) OR (b AND c) + d + (Wt XOR Wt+4) + VAF TMP4, TMP0, TMP4 \ // TT1 VAF h, TMP3, TMP3 \ // Wt + h VAF TMP2, TMP3, TMP3 \ // Wt + h + SS1 - VX f, g, TMP4 \ - VN TMP4, e, TMP4 \ - VX g, TMP4, TMP4 \ // (f XOR g) AND e XOR g - VAF TMP3, TMP4, TMP3 \ // TT2 - VLR b, TMP4 \ - PROLD(TMP4, b, 9) \ // b = b <<< 9 - VLR TMP1, h \ // h = TT1 - VLR f, TMP4 \ - PROLD(TMP4, f, 19) \ // f = f <<< 19 - PROLD(TMP3, TMP4, 9) \ // TMP4 = TT2 <<< 9 - PROLD(TMP4, TMP0, 8) \ // TMP0 = TT2 <<< 17 - VX TMP3, TMP4, TMP4 \ // TMP4 = TT2 XOR (TT2 <<< 9) - VX TMP4, TMP0, d \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17) + VX f, g, TMP1 \ + VN TMP1, e, TMP1 \ + VX g, TMP1, TMP1 \ // ((f XOR g) AND e) XOR g + VAF TMP3, TMP1, TMP3 \ // 
TT2 + VLR b, TMP1 \ + PROLD(TMP1, b, 9) \ // b = b <<< 9 + VLR TMP4, h \ // h = TT1 + VLR f, TMP1 \ + PROLD(TMP1, f, 19) \ // f = f <<< 19 + PROLD(TMP3, TMP1, 9) \ // TMP1 = TT2 <<< 9 + PROLD(TMP1, TMP0, 8) \ // TMP0 = TT2 <<< 17 + VX TMP3, TMP1, TMP1 \ // TMP1 = TT2 XOR (TT2 <<< 9) + VX TMP1, TMP0, d \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17) // func copyResultsBy4(dig *uint32, dst *byte) TEXT ·copyResultsBy4(SB),NOSPLIT,$0