diff --git a/sm3/sm3_const_asm.s b/sm3/sm3_const_asm.s new file mode 100644 index 0000000..d213cac --- /dev/null +++ b/sm3/sm3_const_asm.s @@ -0,0 +1,71 @@ +#define T0 0x79cc4519 +#define T1 0xf3988a32 +#define T2 0xe7311465 +#define T3 0xce6228cb +#define T4 0x9cc45197 +#define T5 0x3988a32f +#define T6 0x7311465e +#define T7 0xe6228cbc + +#define T8 0xcc451979 +#define T9 0x988a32f3 +#define T10 0x311465e7 +#define T11 0x6228cbce +#define T12 0xc451979c +#define T13 0x88a32f39 +#define T14 0x11465e73 +#define T15 0x228cbce6 + +#define T16 0x9d8a7a87 +#define T17 0x3b14f50f +#define T18 0x7629ea1e +#define T19 0xec53d43c +#define T20 0xd8a7a879 +#define T21 0xb14f50f3 +#define T22 0x629ea1e7 +#define T23 0xc53d43ce + +#define T24 0x8a7a879d +#define T25 0x14f50f3b +#define T26 0x29ea1e76 +#define T27 0x53d43cec +#define T28 0xa7a879d8 +#define T29 0x4f50f3b1 +#define T30 0x9ea1e762 +#define T31 0x3d43cec5 + +#define T32 0x7a879d8a +#define T33 0xf50f3b14 +#define T34 0xea1e7629 +#define T35 0xd43cec53 +#define T36 0xa879d8a7 +#define T37 0x50f3b14f +#define T38 0xa1e7629e +#define T39 0x43cec53d + +#define T40 0x879d8a7a +#define T41 0xf3b14f5 +#define T42 0x1e7629ea +#define T43 0x3cec53d4 +#define T44 0x79d8a7a8 +#define T45 0xf3b14f50 +#define T46 0xe7629ea1 +#define T47 0xcec53d43 + +#define T48 0x9d8a7a87 +#define T49 0x3b14f50f +#define T50 0x7629ea1e +#define T51 0xec53d43c +#define T52 0xd8a7a879 +#define T53 0xb14f50f3 +#define T54 0x629ea1e7 +#define T55 0xc53d43ce + +#define T56 0x8a7a879d +#define T57 0x14f50f3b +#define T58 0x29ea1e76 +#define T59 0x53d43cec +#define T60 0xa7a879d8 +#define T61 0x4f50f3b1 +#define T62 0x9ea1e762 +#define T63 0x3d43cec5 diff --git a/sm3/sm3block_amd64.s b/sm3/sm3block_amd64.s index 0a64ac2..72ffa9b 100644 --- a/sm3/sm3block_amd64.s +++ b/sm3/sm3block_amd64.s @@ -3,6 +3,23 @@ #include "textflag.h" +#include "sm3_const_asm.s" + +// xorm (mem), reg +// Xor reg to mem using reg-mem xor and store +#define xorm(P1, P2) \ + XORL P2, P1; \ + MOVL P1, P2 + +#define a R8 +#define b R9 +#define c R10 +#define d R11 +#define e R12 +#define f R13 +#define g R14 +#define h DI + // Wt = Mt; for 0 <= t <= 3 #define MSGSCHEDULE0(index) \ MOVL (index*4)(SI), AX; \ @@ -148,14 +165,14 @@ TEXT ·blockAMD64(SB), 0, $288-32 JEQ end MOVQ dig+0(FP), BP - MOVL (0*4)(BP), R8 // a = H0 - MOVL (1*4)(BP), R9 // b = H1 - MOVL (2*4)(BP), R10 // c = H2 - MOVL (3*4)(BP), R11 // d = H3 - MOVL (4*4)(BP), R12 // e = H4 - MOVL (5*4)(BP), R13 // f = H5 - MOVL (6*4)(BP), R14 // g = H6 - MOVL (7*4)(BP), DI // h = H7 + MOVL (0*4)(BP), a // a = H0 + MOVL (1*4)(BP), b // b = H1 + MOVL (2*4)(BP), c // c = H2 + MOVL (3*4)(BP), d // d = H3 + MOVL (4*4)(BP), e // e = H4 + MOVL (5*4)(BP), f // f = H5 + MOVL (6*4)(BP), g // g = H6 + MOVL (7*4)(BP), h // h = H7 loop: MOVQ SP, BP @@ -165,91 +182,83 @@ loop: MSGSCHEDULE0(2) MSGSCHEDULE0(3) - SM3ROUND0(0, 0x79cc4519, R8, R9, R10, R11, R12, R13, R14, DI) - SM3ROUND0(1, 0xf3988a32, DI, R8, R9, R10, R11, R12, R13, R14) - SM3ROUND0(2, 0xe7311465, R14, DI, R8, R9, R10, R11, R12, R13) - SM3ROUND0(3, 0xce6228cb, R13, R14, DI, R8, R9, R10, R11, R12) - SM3ROUND0(4, 0x9cc45197, R12, R13, R14, DI, R8, R9, R10, R11) - SM3ROUND0(5, 0x3988a32f, R11, R12, R13, R14, DI, R8, R9, R10) - SM3ROUND0(6, 0x7311465e, R10, R11, R12, R13, R14, DI, R8, R9) - SM3ROUND0(7, 0xe6228cbc, R9, R10, R11, R12, R13, R14, DI, R8) - SM3ROUND0(8, 0xcc451979, R8, R9, R10, R11, R12, R13, R14, DI) - SM3ROUND0(9, 0x988a32f3, DI, R8, R9, R10, R11, R12, R13, R14) - SM3ROUND0(10, 0x311465e7, 
R14, DI, R8, R9, R10, R11, R12, R13) - SM3ROUND0(11, 0x6228cbce, R13, R14, DI, R8, R9, R10, R11, R12) + SM3ROUND0(0, T0, a, b, c, d, e, f, g, h) + SM3ROUND0(1, T1, h, a, b, c, d, e, f, g) + SM3ROUND0(2, T2, g, h, a, b, c, d, e, f) + SM3ROUND0(3, T3, f, g, h, a, b, c, d, e) + SM3ROUND0(4, T4, e, f, g, h, a, b, c, d) + SM3ROUND0(5, T5, d, e, f, g, h, a, b, c) + SM3ROUND0(6, T6, c, d, e, f, g, h, a, b) + SM3ROUND0(7, T7, b, c, d, e, f, g, h, a) + SM3ROUND0(8, T8, a, b, c, d, e, f, g, h) + SM3ROUND0(9, T9, h, a, b, c, d, e, f, g) + SM3ROUND0(10, T10, g, h, a, b, c, d, e, f) + SM3ROUND0(11, T11, f, g, h, a, b, c, d, e) - SM3ROUND1(12, 0xc451979c, R12, R13, R14, DI, R8, R9, R10, R11) - SM3ROUND1(13, 0x88a32f39, R11, R12, R13, R14, DI, R8, R9, R10) - SM3ROUND1(14, 0x11465e73, R10, R11, R12, R13, R14, DI, R8, R9) - SM3ROUND1(15, 0x228cbce6, R9, R10, R11, R12, R13, R14, DI, R8) + SM3ROUND1(12, T12, e, f, g, h, a, b, c, d) + SM3ROUND1(13, T13, d, e, f, g, h, a, b, c) + SM3ROUND1(14, T14, c, d, e, f, g, h, a, b) + SM3ROUND1(15, T15, b, c, d, e, f, g, h, a) - SM3ROUND2(16, 0x9d8a7a87, R8, R9, R10, R11, R12, R13, R14, DI) - SM3ROUND2(17, 0x3b14f50f, DI, R8, R9, R10, R11, R12, R13, R14) - SM3ROUND2(18, 0x7629ea1e, R14, DI, R8, R9, R10, R11, R12, R13) - SM3ROUND2(19, 0xec53d43c, R13, R14, DI, R8, R9, R10, R11, R12) - SM3ROUND2(20, 0xd8a7a879, R12, R13, R14, DI, R8, R9, R10, R11) - SM3ROUND2(21, 0xb14f50f3, R11, R12, R13, R14, DI, R8, R9, R10) - SM3ROUND2(22, 0x629ea1e7, R10, R11, R12, R13, R14, DI, R8, R9) - SM3ROUND2(23, 0xc53d43ce, R9, R10, R11, R12, R13, R14, DI, R8) - SM3ROUND2(24, 0x8a7a879d, R8, R9, R10, R11, R12, R13, R14, DI) - SM3ROUND2(25, 0x14f50f3b, DI, R8, R9, R10, R11, R12, R13, R14) - SM3ROUND2(26, 0x29ea1e76, R14, DI, R8, R9, R10, R11, R12, R13) - SM3ROUND2(27, 0x53d43cec, R13, R14, DI, R8, R9, R10, R11, R12) - SM3ROUND2(28, 0xa7a879d8, R12, R13, R14, DI, R8, R9, R10, R11) - SM3ROUND2(29, 0x4f50f3b1, R11, R12, R13, R14, DI, R8, R9, R10) - SM3ROUND2(30, 0x9ea1e762, R10, R11, R12, R13, R14, DI, R8, R9) - SM3ROUND2(31, 0x3d43cec5, R9, R10, R11, R12, R13, R14, DI, R8) - SM3ROUND2(32, 0x7a879d8a, R8, R9, R10, R11, R12, R13, R14, DI) - SM3ROUND2(33, 0xf50f3b14, DI, R8, R9, R10, R11, R12, R13, R14) - SM3ROUND2(34, 0xea1e7629, R14, DI, R8, R9, R10, R11, R12, R13) - SM3ROUND2(35, 0xd43cec53, R13, R14, DI, R8, R9, R10, R11, R12) - SM3ROUND2(36, 0xa879d8a7, R12, R13, R14, DI, R8, R9, R10, R11) - SM3ROUND2(37, 0x50f3b14f, R11, R12, R13, R14, DI, R8, R9, R10) - SM3ROUND2(38, 0xa1e7629e, R10, R11, R12, R13, R14, DI, R8, R9) - SM3ROUND2(39, 0x43cec53d, R9, R10, R11, R12, R13, R14, DI, R8) - SM3ROUND2(40, 0x879d8a7a, R8, R9, R10, R11, R12, R13, R14, DI) - SM3ROUND2(41, 0xf3b14f5, DI, R8, R9, R10, R11, R12, R13, R14) - SM3ROUND2(42, 0x1e7629ea, R14, DI, R8, R9, R10, R11, R12, R13) - SM3ROUND2(43, 0x3cec53d4, R13, R14, DI, R8, R9, R10, R11, R12) - SM3ROUND2(44, 0x79d8a7a8, R12, R13, R14, DI, R8, R9, R10, R11) - SM3ROUND2(45, 0xf3b14f50, R11, R12, R13, R14, DI, R8, R9, R10) - SM3ROUND2(46, 0xe7629ea1, R10, R11, R12, R13, R14, DI, R8, R9) - SM3ROUND2(47, 0xcec53d43, R9, R10, R11, R12, R13, R14, DI, R8) - SM3ROUND2(48, 0x9d8a7a87, R8, R9, R10, R11, R12, R13, R14, DI) - SM3ROUND2(49, 0x3b14f50f, DI, R8, R9, R10, R11, R12, R13, R14) - SM3ROUND2(50, 0x7629ea1e, R14, DI, R8, R9, R10, R11, R12, R13) - SM3ROUND2(51, 0xec53d43c, R13, R14, DI, R8, R9, R10, R11, R12) - SM3ROUND2(52, 0xd8a7a879, R12, R13, R14, DI, R8, R9, R10, R11) - SM3ROUND2(53, 0xb14f50f3, R11, R12, R13, R14, DI, R8, R9, R10) - SM3ROUND2(54, 
0x629ea1e7, R10, R11, R12, R13, R14, DI, R8, R9) - SM3ROUND2(55, 0xc53d43ce, R9, R10, R11, R12, R13, R14, DI, R8) - SM3ROUND2(56, 0x8a7a879d, R8, R9, R10, R11, R12, R13, R14, DI) - SM3ROUND2(57, 0x14f50f3b, DI, R8, R9, R10, R11, R12, R13, R14) - SM3ROUND2(58, 0x29ea1e76, R14, DI, R8, R9, R10, R11, R12, R13) - SM3ROUND2(59, 0x53d43cec, R13, R14, DI, R8, R9, R10, R11, R12) - SM3ROUND2(60, 0xa7a879d8, R12, R13, R14, DI, R8, R9, R10, R11) - SM3ROUND2(61, 0x4f50f3b1, R11, R12, R13, R14, DI, R8, R9, R10) - SM3ROUND2(62, 0x9ea1e762, R10, R11, R12, R13, R14, DI, R8, R9) - SM3ROUND2(63, 0x3d43cec5, R9, R10, R11, R12, R13, R14, DI, R8) + SM3ROUND2(16, T16, a, b, c, d, e, f, g, h) + SM3ROUND2(17, T17, h, a, b, c, d, e, f, g) + SM3ROUND2(18, T18, g, h, a, b, c, d, e, f) + SM3ROUND2(19, T19, f, g, h, a, b, c, d, e) + SM3ROUND2(20, T20, e, f, g, h, a, b, c, d) + SM3ROUND2(21, T21, d, e, f, g, h, a, b, c) + SM3ROUND2(22, T22, c, d, e, f, g, h, a, b) + SM3ROUND2(23, T23, b, c, d, e, f, g, h, a) + SM3ROUND2(24, T24, a, b, c, d, e, f, g, h) + SM3ROUND2(25, T25, h, a, b, c, d, e, f, g) + SM3ROUND2(26, T26, g, h, a, b, c, d, e, f) + SM3ROUND2(27, T27, f, g, h, a, b, c, d, e) + SM3ROUND2(28, T28, e, f, g, h, a, b, c, d) + SM3ROUND2(29, T29, d, e, f, g, h, a, b, c) + SM3ROUND2(30, T30, c, d, e, f, g, h, a, b) + SM3ROUND2(31, T31, b, c, d, e, f, g, h, a) + SM3ROUND2(32, T32, a, b, c, d, e, f, g, h) + SM3ROUND2(33, T33, h, a, b, c, d, e, f, g) + SM3ROUND2(34, T34, g, h, a, b, c, d, e, f) + SM3ROUND2(35, T35, f, g, h, a, b, c, d, e) + SM3ROUND2(36, T36, e, f, g, h, a, b, c, d) + SM3ROUND2(37, T37, d, e, f, g, h, a, b, c) + SM3ROUND2(38, T38, c, d, e, f, g, h, a, b) + SM3ROUND2(39, T39, b, c, d, e, f, g, h, a) + SM3ROUND2(40, T40, a, b, c, d, e, f, g, h) + SM3ROUND2(41, T41, h, a, b, c, d, e, f, g) + SM3ROUND2(42, T42, g, h, a, b, c, d, e, f) + SM3ROUND2(43, T43, f, g, h, a, b, c, d, e) + SM3ROUND2(44, T44, e, f, g, h, a, b, c, d) + SM3ROUND2(45, T45, d, e, f, g, h, a, b, c) + SM3ROUND2(46, T46, c, d, e, f, g, h, a, b) + SM3ROUND2(47, T47, b, c, d, e, f, g, h, a) + SM3ROUND2(48, T48, a, b, c, d, e, f, g, h) + SM3ROUND2(49, T49, h, a, b, c, d, e, f, g) + SM3ROUND2(50, T50, g, h, a, b, c, d, e, f) + SM3ROUND2(51, T51, f, g, h, a, b, c, d, e) + SM3ROUND2(52, T52, e, f, g, h, a, b, c, d) + SM3ROUND2(53, T53, d, e, f, g, h, a, b, c) + SM3ROUND2(54, T54, c, d, e, f, g, h, a, b) + SM3ROUND2(55, T55, b, c, d, e, f, g, h, a) + SM3ROUND2(56, T56, a, b, c, d, e, f, g, h) + SM3ROUND2(57, T57, h, a, b, c, d, e, f, g) + SM3ROUND2(58, T58, g, h, a, b, c, d, e, f) + SM3ROUND2(59, T59, f, g, h, a, b, c, d, e) + SM3ROUND2(60, T60, e, f, g, h, a, b, c, d) + SM3ROUND2(61, T61, d, e, f, g, h, a, b, c) + SM3ROUND2(62, T62, c, d, e, f, g, h, a, b) + SM3ROUND2(63, T63, b, c, d, e, f, g, h, a) - MOVQ dig+0(FP), BP + MOVQ dig+0(FP), BP - XORL (0*4)(BP), R8 // H0 = a XOR H0 - MOVL R8, (0*4)(BP) - XORL (1*4)(BP), R9 // H1 = b XOR H1 - MOVL R9, (1*4)(BP) - XORL (2*4)(BP), R10 // H2 = c XOR H2 - MOVL R10, (2*4)(BP) - XORL (3*4)(BP), R11 // H3 = d XOR H3 - MOVL R11, (3*4)(BP) - XORL (4*4)(BP), R12 // H4 = e XOR H4 - MOVL R12, (4*4)(BP) - XORL (5*4)(BP), R13 // H5 = f XOR H5 - MOVL R13, (5*4)(BP) - XORL (6*4)(BP), R14 // H6 = g XOR H6 - MOVL R14, (6*4)(BP) - XORL (7*4)(BP), DI // H7 = h XOR H7 - MOVL DI, (7*4)(BP) + xorm( 0(BP), a) + xorm( 4(BP), b) + xorm( 8(BP), c) + xorm( 12(BP), d) + xorm( 16(BP), e) + xorm( 20(BP), f) + xorm( 24(BP), g) + xorm( 28(BP), h) ADDQ $64, SI CMPQ SI, 272(SP) diff --git a/sm3/sm3block_arm64.s b/sm3/sm3block_arm64.s
index b9a8d50..2f394c2 100644 --- a/sm3/sm3block_arm64.s +++ b/sm3/sm3block_arm64.s @@ -3,6 +3,8 @@ #include "textflag.h" +#include "sm3_const_asm.s" + #define XWORD0 V0 #define XWORD1 V1 #define XWORD2 V2 @@ -399,117 +401,117 @@ loop: schedule_compress: // for w0 - w47 // Do 4 rounds and scheduling VEOR XWORD0.B16, XWORD1.B16, Wt.B16 - ROUND_AND_SCHED_N_0_0(0*16, 0x79cc4519, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) - ROUND_AND_SCHED_N_0_1(0*16, 0xf3988a32, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt) - ROUND_AND_SCHED_N_0_2(0*16, 0xe7311465, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt) - ROUND_AND_SCHED_N_0_3(0*16, 0xce6228cb, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt) + ROUND_AND_SCHED_N_0_0(0*16, T0, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) + ROUND_AND_SCHED_N_0_1(0*16, T1, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt) + ROUND_AND_SCHED_N_0_2(0*16, T2, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt) + ROUND_AND_SCHED_N_0_3(0*16, T3, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt) // Do 4 rounds and scheduling VEOR XWORD1.B16, XWORD2.B16, Wt.B16 - ROUND_AND_SCHED_N_0_0(0*16, 0x9cc45197, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0, Wt) - ROUND_AND_SCHED_N_0_1(0*16, 0x3988a32f, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0, Wt) - ROUND_AND_SCHED_N_0_2(0*16, 0x7311465e, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0, Wt) - ROUND_AND_SCHED_N_0_3(0*16, 0xe6228cbc, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0, Wt) + ROUND_AND_SCHED_N_0_0(0*16, T4, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0, Wt) + ROUND_AND_SCHED_N_0_1(0*16, T5, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0, Wt) + ROUND_AND_SCHED_N_0_2(0*16, T6, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0, Wt) + ROUND_AND_SCHED_N_0_3(0*16, T7, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0, Wt) // Do 4 rounds and scheduling VEOR XWORD2.B16, XWORD3.B16, Wt.B16 - ROUND_AND_SCHED_N_0_0(0*16, 0xcc451979, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1, Wt) - ROUND_AND_SCHED_N_0_1(0*16, 0x988a32f3, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1, Wt) - ROUND_AND_SCHED_N_0_2(0*16, 0x311465e7, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1, Wt) - ROUND_AND_SCHED_N_0_3(0*16, 0x6228cbce, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1, Wt) + ROUND_AND_SCHED_N_0_0(0*16, T8, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1, Wt) + ROUND_AND_SCHED_N_0_1(0*16, T9, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1, Wt) + ROUND_AND_SCHED_N_0_2(0*16, T10, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1, Wt) + ROUND_AND_SCHED_N_0_3(0*16, T11, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1, Wt) // Do 4 rounds and scheduling VEOR XWORD3.B16, XWORD0.B16, Wt.B16 - ROUND_AND_SCHED_N_0_0(0*16, 0xc451979c, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2, Wt) - ROUND_AND_SCHED_N_0_1(0*16, 0x88a32f39, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2, Wt) - ROUND_AND_SCHED_N_0_2(0*16, 0x11465e73, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2, Wt) - ROUND_AND_SCHED_N_0_3(0*16, 0x228cbce6, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2, Wt) + ROUND_AND_SCHED_N_0_0(0*16, T12, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2, Wt) + ROUND_AND_SCHED_N_0_1(0*16, T13, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2, Wt) + ROUND_AND_SCHED_N_0_2(0*16, T14, c, d, e, f, g, h, a, b, XWORD3, 
XWORD0, XWORD1, XWORD2, Wt) + ROUND_AND_SCHED_N_0_3(0*16, T15, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2, Wt) // Do 4 rounds and scheduling VEOR XWORD0.B16, XWORD1.B16, Wt.B16 - ROUND_AND_SCHED_N_1_0(0*16, 0x9d8a7a87, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) - ROUND_AND_SCHED_N_1_1(0*16, 0x3b14f50f, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt) - ROUND_AND_SCHED_N_1_2(0*16, 0x7629ea1e, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt) - ROUND_AND_SCHED_N_1_3(0*16, 0xec53d43c, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt) + ROUND_AND_SCHED_N_1_0(0*16, T16, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) + ROUND_AND_SCHED_N_1_1(0*16, T17, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt) + ROUND_AND_SCHED_N_1_2(0*16, T18, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt) + ROUND_AND_SCHED_N_1_3(0*16, T19, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt) // Do 4 rounds and scheduling VEOR XWORD1.B16, XWORD2.B16, Wt.B16 - ROUND_AND_SCHED_N_1_0(0*16, 0xd8a7a879, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0, Wt) - ROUND_AND_SCHED_N_1_1(0*16, 0xb14f50f3, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0, Wt) - ROUND_AND_SCHED_N_1_2(0*16, 0x629ea1e7, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0, Wt) - ROUND_AND_SCHED_N_1_3(0*16, 0xc53d43ce, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0, Wt) + ROUND_AND_SCHED_N_1_0(0*16, T20, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0, Wt) + ROUND_AND_SCHED_N_1_1(0*16, T21, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0, Wt) + ROUND_AND_SCHED_N_1_2(0*16, T22, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0, Wt) + ROUND_AND_SCHED_N_1_3(0*16, T23, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0, Wt) // Do 4 rounds and scheduling VEOR XWORD2.B16, XWORD3.B16, Wt.B16 - ROUND_AND_SCHED_N_1_0(0*16, 0x8a7a879d, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1, Wt) - ROUND_AND_SCHED_N_1_1(0*16, 0x14f50f3b, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1, Wt) - ROUND_AND_SCHED_N_1_2(0*16, 0x29ea1e76, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1, Wt) - ROUND_AND_SCHED_N_1_3(0*16, 0x53d43cec, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1, Wt) + ROUND_AND_SCHED_N_1_0(0*16, T24, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1, Wt) + ROUND_AND_SCHED_N_1_1(0*16, T25, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1, Wt) + ROUND_AND_SCHED_N_1_2(0*16, T26, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1, Wt) + ROUND_AND_SCHED_N_1_3(0*16, T27, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1, Wt) // Do 4 rounds and scheduling VEOR XWORD3.B16, XWORD0.B16, Wt.B16 - ROUND_AND_SCHED_N_1_0(0*16, 0xa7a879d8, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2, Wt) - ROUND_AND_SCHED_N_1_1(0*16, 0x4f50f3b1, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2, Wt) - ROUND_AND_SCHED_N_1_2(0*16, 0x9ea1e762, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2, Wt) - ROUND_AND_SCHED_N_1_3(0*16, 0x3d43cec5, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2, Wt) + ROUND_AND_SCHED_N_1_0(0*16, T28, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2, Wt) + ROUND_AND_SCHED_N_1_1(0*16, T29, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2, Wt) + ROUND_AND_SCHED_N_1_2(0*16, T30, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2, Wt) + ROUND_AND_SCHED_N_1_3(0*16, T31, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2, Wt) // Do 4 rounds 
and scheduling VEOR XWORD0.B16, XWORD1.B16, Wt.B16 - ROUND_AND_SCHED_N_1_0(0*16, 0x7a879d8a, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) - ROUND_AND_SCHED_N_1_1(0*16, 0xf50f3b14, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt) - ROUND_AND_SCHED_N_1_2(0*16, 0xea1e7629, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt) - ROUND_AND_SCHED_N_1_3(0*16, 0xd43cec53, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt) + ROUND_AND_SCHED_N_1_0(0*16, T32, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) + ROUND_AND_SCHED_N_1_1(0*16, T33, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt) + ROUND_AND_SCHED_N_1_2(0*16, T34, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt) + ROUND_AND_SCHED_N_1_3(0*16, T35, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt) // Do 4 rounds and scheduling VEOR XWORD1.B16, XWORD2.B16, Wt.B16 - ROUND_AND_SCHED_N_1_0(0*16, 0xa879d8a7, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0, Wt) - ROUND_AND_SCHED_N_1_1(0*16, 0x50f3b14f, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0, Wt) - ROUND_AND_SCHED_N_1_2(0*16, 0xa1e7629e, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0, Wt) - ROUND_AND_SCHED_N_1_3(0*16, 0x43cec53d, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0, Wt) + ROUND_AND_SCHED_N_1_0(0*16, T36, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0, Wt) + ROUND_AND_SCHED_N_1_1(0*16, T37, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0, Wt) + ROUND_AND_SCHED_N_1_2(0*16, T38, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0, Wt) + ROUND_AND_SCHED_N_1_3(0*16, T39, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0, Wt) // Do 4 rounds and scheduling VEOR XWORD2.B16, XWORD3.B16, Wt.B16 - ROUND_AND_SCHED_N_1_0(0*16, 0x879d8a7a, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1, Wt) - ROUND_AND_SCHED_N_1_1(0*16, 0xf3b14f5, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1, Wt) - ROUND_AND_SCHED_N_1_2(0*16, 0x1e7629ea, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1, Wt) - ROUND_AND_SCHED_N_1_3(0*16, 0x3cec53d4, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1, Wt) + ROUND_AND_SCHED_N_1_0(0*16, T40, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1, Wt) + ROUND_AND_SCHED_N_1_1(0*16, T41, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1, Wt) + ROUND_AND_SCHED_N_1_2(0*16, T42, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1, Wt) + ROUND_AND_SCHED_N_1_3(0*16, T43, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1, Wt) // Do 4 rounds and scheduling VEOR XWORD3.B16, XWORD0.B16, Wt.B16 - ROUND_AND_SCHED_N_1_0(0*16, 0x79d8a7a8, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2, Wt) - ROUND_AND_SCHED_N_1_1(0*16, 0xf3b14f50, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2, Wt) - ROUND_AND_SCHED_N_1_2(0*16, 0xe7629ea1, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2, Wt) - ROUND_AND_SCHED_N_1_3(0*16, 0xcec53d43, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2, Wt) + ROUND_AND_SCHED_N_1_0(0*16, T44, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2, Wt) + ROUND_AND_SCHED_N_1_1(0*16, T45, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2, Wt) + ROUND_AND_SCHED_N_1_2(0*16, T46, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2, Wt) + ROUND_AND_SCHED_N_1_3(0*16, T47, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2, Wt) // w48 - w63 processed with only 4 rounds scheduling (last 16 rounds) // Do 4 rounds and scheduling VEOR XWORD0.B16, XWORD1.B16, Wt.B16 - 
ROUND_AND_SCHED_N_1_0(0*16, 0x9d8a7a87, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) - ROUND_AND_SCHED_N_1_1(0*16, 0x3b14f50f, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt) - ROUND_AND_SCHED_N_1_2(0*16, 0x7629ea1e, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt) - ROUND_AND_SCHED_N_1_3(0*16, 0xec53d43c, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt) + ROUND_AND_SCHED_N_1_0(0*16, T48, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) + ROUND_AND_SCHED_N_1_1(0*16, T49, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt) + ROUND_AND_SCHED_N_1_2(0*16, T50, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt) + ROUND_AND_SCHED_N_1_3(0*16, T51, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt) // w52 - w63 processed with no scheduling (last 12 rounds) // Do 4 rounds VEOR XWORD1.B16, XWORD2.B16, Wt.B16 - DO_ROUND_N_1(0*16, 0, 0xd8a7a879, e, f, g, h, a, b, c, d, XWORD1, Wt) - DO_ROUND_N_1(0*16, 1, 0xb14f50f3, d, e, f, g, h, a, b, c, XWORD1, Wt) - DO_ROUND_N_1(0*16, 2, 0x629ea1e7, c, d, e, f, g, h, a, b, XWORD1, Wt) - DO_ROUND_N_1(0*16, 3, 0xc53d43ce, b, c, d, e, f, g, h, a, XWORD1, Wt) + DO_ROUND_N_1(0*16, 0, T52, e, f, g, h, a, b, c, d, XWORD1, Wt) + DO_ROUND_N_1(0*16, 1, T53, d, e, f, g, h, a, b, c, XWORD1, Wt) + DO_ROUND_N_1(0*16, 2, T54, c, d, e, f, g, h, a, b, XWORD1, Wt) + DO_ROUND_N_1(0*16, 3, T55, b, c, d, e, f, g, h, a, XWORD1, Wt) // Do 4 rounds VEOR XWORD2.B16, XWORD3.B16, Wt.B16 - DO_ROUND_N_1(0*16, 0, 0x8a7a879d, a, b, c, d, e, f, g, h, XWORD2, Wt) - DO_ROUND_N_1(0*16, 1, 0x14f50f3b, h, a, b, c, d, e, f, g, XWORD2, Wt) - DO_ROUND_N_1(0*16, 2, 0x29ea1e76, g, h, a, b, c, d, e, f, XWORD2, Wt) - DO_ROUND_N_1(0*16, 3, 0x53d43cec, f, g, h, a, b, c, d, e, XWORD2, Wt) + DO_ROUND_N_1(0*16, 0, T56, a, b, c, d, e, f, g, h, XWORD2, Wt) + DO_ROUND_N_1(0*16, 1, T57, h, a, b, c, d, e, f, g, XWORD2, Wt) + DO_ROUND_N_1(0*16, 2, T58, g, h, a, b, c, d, e, f, XWORD2, Wt) + DO_ROUND_N_1(0*16, 3, T59, f, g, h, a, b, c, d, e, XWORD2, Wt) // Do 4 rounds VEOR XWORD3.B16, XWORD0.B16, Wt.B16 - DO_ROUND_N_1(0*16, 0, 0xa7a879d8, e, f, g, h, a, b, c, d, XWORD3, Wt) - DO_ROUND_N_1(0*16, 1, 0x4f50f3b1, d, e, f, g, h, a, b, c, XWORD3, Wt) - DO_ROUND_N_1(0*16, 2, 0x9ea1e762, c, d, e, f, g, h, a, b, XWORD3, Wt) - DO_ROUND_N_1(0*16, 3, 0x3d43cec5, b, c, d, e, f, g, h, a, XWORD3, Wt) + DO_ROUND_N_1(0*16, 0, T60, e, f, g, h, a, b, c, d, XWORD3, Wt) + DO_ROUND_N_1(0*16, 1, T61, d, e, f, g, h, a, b, c, XWORD3, Wt) + DO_ROUND_N_1(0*16, 2, T62, c, d, e, f, g, h, a, b, XWORD3, Wt) + DO_ROUND_N_1(0*16, 3, T63, b, c, d, e, f, g, h, a, XWORD3, Wt) EORW a1, a // H0 = a XOR H0 EORW b1, b // H1 = b XOR H1 diff --git a/sm3/sm3block_avx2_amd64.s b/sm3/sm3block_avx2_amd64.s index ed44f05..07b3751 100644 --- a/sm3/sm3block_avx2_amd64.s +++ b/sm3/sm3block_avx2_amd64.s @@ -3,6 +3,8 @@ #include "textflag.h" +#include "sm3_const_asm.s" + // Definitions for AVX2 version // xorm (mem), reg @@ -478,37 +480,37 @@ avx2_schedule_compress: // for w0 - w47 VMOVDQU XDWORD0, (_XFER + 0*32)(SP)(SRND*1) VPXOR XDWORD0, XDWORD1, XFER VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1) - ROUND_AND_SCHED_N_0_0(_XFER + 0*32, 0x79cc4519, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - ROUND_AND_SCHED_N_0_1(_XFER + 0*32, 0xf3988a32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - ROUND_AND_SCHED_N_0_2(_XFER + 0*32, 0xe7311465, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - ROUND_AND_SCHED_N_0_3(_XFER + 0*32, 0xce6228cb, f, g, h, a, b, c, d, 
e, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + ROUND_AND_SCHED_N_0_0(_XFER + 0*32, T0, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + ROUND_AND_SCHED_N_0_1(_XFER + 0*32, T1, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + ROUND_AND_SCHED_N_0_2(_XFER + 0*32, T2, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + ROUND_AND_SCHED_N_0_3(_XFER + 0*32, T3, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3) // Do 4 rounds and scheduling VMOVDQU XDWORD1, (_XFER + 2*32)(SP)(SRND*1) VPXOR XDWORD1, XDWORD2, XFER VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1) - ROUND_AND_SCHED_N_0_0(_XFER + 2*32, 0x9cc45197, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0) - ROUND_AND_SCHED_N_0_1(_XFER + 2*32, 0x3988a32f, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0) - ROUND_AND_SCHED_N_0_2(_XFER + 2*32, 0x7311465e, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0) - ROUND_AND_SCHED_N_0_3(_XFER + 2*32, 0xe6228cbc, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + ROUND_AND_SCHED_N_0_0(_XFER + 2*32, T4, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + ROUND_AND_SCHED_N_0_1(_XFER + 2*32, T5, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + ROUND_AND_SCHED_N_0_2(_XFER + 2*32, T6, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + ROUND_AND_SCHED_N_0_3(_XFER + 2*32, T7, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0) // Do 4 rounds and scheduling VMOVDQU XDWORD2, (_XFER + 4*32)(SP)(SRND*1) VPXOR XDWORD2, XDWORD3, XFER VMOVDQU XFER, (_XFER + 5*32)(SP)(SRND*1) - ROUND_AND_SCHED_N_0_0(_XFER + 4*32, 0xcc451979, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1) - ROUND_AND_SCHED_N_0_1(_XFER + 4*32, 0x988a32f3, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1) - ROUND_AND_SCHED_N_0_2(_XFER + 4*32, 0x311465e7, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1) - ROUND_AND_SCHED_N_0_3(_XFER + 4*32, 0x6228cbce, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + ROUND_AND_SCHED_N_0_0(_XFER + 4*32, T8, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + ROUND_AND_SCHED_N_0_1(_XFER + 4*32, T9, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + ROUND_AND_SCHED_N_0_2(_XFER + 4*32, T10, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + ROUND_AND_SCHED_N_0_3(_XFER + 4*32, T11, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1) // Do 4 rounds and scheduling VMOVDQU XDWORD3, (_XFER + 6*32)(SP)(SRND*1) VPXOR XDWORD3, XDWORD0, XFER VMOVDQU XFER, (_XFER + 7*32)(SP)(SRND*1) - ROUND_AND_SCHED_N_0_0(_XFER + 6*32, 0xc451979c, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2) - ROUND_AND_SCHED_N_0_1(_XFER + 6*32, 0x88a32f39, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2) - ROUND_AND_SCHED_N_0_2(_XFER + 6*32, 0x11465e73, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2) - ROUND_AND_SCHED_N_0_3(_XFER + 6*32, 0x228cbce6, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + ROUND_AND_SCHED_N_0_0(_XFER + 6*32, T12, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + ROUND_AND_SCHED_N_0_1(_XFER + 6*32, T13, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + ROUND_AND_SCHED_N_0_2(_XFER + 6*32, T14, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + ROUND_AND_SCHED_N_0_3(_XFER + 6*32, T15, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2) ADDQ $8*32, SRND @@ -516,37 +518,37 @@ avx2_schedule_compress: // for w0 - w47 VMOVDQU XDWORD0, (_XFER + 0*32)(SP)(SRND*1) 
VPXOR XDWORD0, XDWORD1, XFER VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1) - ROUND_AND_SCHED_N_1_0(_XFER + 0*32, 0x9d8a7a87, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - ROUND_AND_SCHED_N_1_1(_XFER + 0*32, 0x3b14f50f, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - ROUND_AND_SCHED_N_1_2(_XFER + 0*32, 0x7629ea1e, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - ROUND_AND_SCHED_N_1_3(_XFER + 0*32, 0xec53d43c, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + ROUND_AND_SCHED_N_1_0(_XFER + 0*32, T16, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + ROUND_AND_SCHED_N_1_1(_XFER + 0*32, T17, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + ROUND_AND_SCHED_N_1_2(_XFER + 0*32, T18, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + ROUND_AND_SCHED_N_1_3(_XFER + 0*32, T19, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3) // Do 4 rounds and scheduling VMOVDQU XDWORD1, (_XFER + 2*32)(SP)(SRND*1) VPXOR XDWORD1, XDWORD2, XFER VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1) - ROUND_AND_SCHED_N_1_0(_XFER + 2*32, 0xd8a7a879, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0) - ROUND_AND_SCHED_N_1_1(_XFER + 2*32, 0xb14f50f3, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0) - ROUND_AND_SCHED_N_1_2(_XFER + 2*32, 0x629ea1e7, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0) - ROUND_AND_SCHED_N_1_3(_XFER + 2*32, 0xc53d43ce, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + ROUND_AND_SCHED_N_1_0(_XFER + 2*32, T20, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + ROUND_AND_SCHED_N_1_1(_XFER + 2*32, T21, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + ROUND_AND_SCHED_N_1_2(_XFER + 2*32, T22, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + ROUND_AND_SCHED_N_1_3(_XFER + 2*32, T23, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0) // Do 4 rounds and scheduling VMOVDQU XDWORD2, (_XFER + 4*32)(SP)(SRND*1) VPXOR XDWORD2, XDWORD3, XFER VMOVDQU XFER, (_XFER + 5*32)(SP)(SRND*1) - ROUND_AND_SCHED_N_1_0(_XFER + 4*32, 0x8a7a879d, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1) - ROUND_AND_SCHED_N_1_1(_XFER + 4*32, 0x14f50f3b, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1) - ROUND_AND_SCHED_N_1_2(_XFER + 4*32, 0x29ea1e76, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1) - ROUND_AND_SCHED_N_1_3(_XFER + 4*32, 0x53d43cec, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + ROUND_AND_SCHED_N_1_0(_XFER + 4*32, T24, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + ROUND_AND_SCHED_N_1_1(_XFER + 4*32, T25, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + ROUND_AND_SCHED_N_1_2(_XFER + 4*32, T26, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + ROUND_AND_SCHED_N_1_3(_XFER + 4*32, T27, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1) // Do 4 rounds and scheduling VMOVDQU XDWORD3, (_XFER + 6*32)(SP)(SRND*1) VPXOR XDWORD3, XDWORD0, XFER VMOVDQU XFER, (_XFER + 7*32)(SP)(SRND*1) - ROUND_AND_SCHED_N_1_0(_XFER + 6*32, 0xa7a879d8, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2) - ROUND_AND_SCHED_N_1_1(_XFER + 6*32, 0x4f50f3b1, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2) - ROUND_AND_SCHED_N_1_2(_XFER + 6*32, 0x9ea1e762, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2) - ROUND_AND_SCHED_N_1_3(_XFER + 6*32, 0x3d43cec5, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + ROUND_AND_SCHED_N_1_0(_XFER + 6*32, T28, e, f, g, h, 
a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + ROUND_AND_SCHED_N_1_1(_XFER + 6*32, T29, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + ROUND_AND_SCHED_N_1_2(_XFER + 6*32, T30, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + ROUND_AND_SCHED_N_1_3(_XFER + 6*32, T31, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2) ADDQ $8*32, SRND @@ -554,37 +556,37 @@ avx2_schedule_compress: // for w0 - w47 VMOVDQU XDWORD0, (_XFER + 0*32)(SP)(SRND*1) VPXOR XDWORD0, XDWORD1, XFER VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1) - ROUND_AND_SCHED_N_1_0(_XFER + 0*32, 0x7a879d8a, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - ROUND_AND_SCHED_N_1_1(_XFER + 0*32, 0xf50f3b14, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - ROUND_AND_SCHED_N_1_2(_XFER + 0*32, 0xea1e7629, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - ROUND_AND_SCHED_N_1_3(_XFER + 0*32, 0xd43cec53, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + ROUND_AND_SCHED_N_1_0(_XFER + 0*32, T32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + ROUND_AND_SCHED_N_1_1(_XFER + 0*32, T33, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + ROUND_AND_SCHED_N_1_2(_XFER + 0*32, T34, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + ROUND_AND_SCHED_N_1_3(_XFER + 0*32, T35, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3) // Do 4 rounds and scheduling VMOVDQU XDWORD1, (_XFER + 2*32)(SP)(SRND*1) VPXOR XDWORD1, XDWORD2, XFER VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1) - ROUND_AND_SCHED_N_1_0(_XFER + 2*32, 0xa879d8a7, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0) - ROUND_AND_SCHED_N_1_1(_XFER + 2*32, 0x50f3b14f, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0) - ROUND_AND_SCHED_N_1_2(_XFER + 2*32, 0xa1e7629e, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0) - ROUND_AND_SCHED_N_1_3(_XFER + 2*32, 0x43cec53d, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + ROUND_AND_SCHED_N_1_0(_XFER + 2*32, T36, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + ROUND_AND_SCHED_N_1_1(_XFER + 2*32, T37, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + ROUND_AND_SCHED_N_1_2(_XFER + 2*32, T38, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + ROUND_AND_SCHED_N_1_3(_XFER + 2*32, T39, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0) // Do 4 rounds and scheduling VMOVDQU XDWORD2, (_XFER + 4*32)(SP)(SRND*1) VPXOR XDWORD2, XDWORD3, XFER VMOVDQU XFER, (_XFER + 5*32)(SP)(SRND*1) - ROUND_AND_SCHED_N_1_0(_XFER + 4*32, 0x879d8a7a, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1) - ROUND_AND_SCHED_N_1_1(_XFER + 4*32, 0xf3b14f5, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1) - ROUND_AND_SCHED_N_1_2(_XFER + 4*32, 0x1e7629ea, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1) - ROUND_AND_SCHED_N_1_3(_XFER + 4*32, 0x3cec53d4, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + ROUND_AND_SCHED_N_1_0(_XFER + 4*32, T40, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + ROUND_AND_SCHED_N_1_1(_XFER + 4*32, T41, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + ROUND_AND_SCHED_N_1_2(_XFER + 4*32, T42, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + ROUND_AND_SCHED_N_1_3(_XFER + 4*32, T43, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1) // Do 4 rounds and scheduling VMOVDQU XDWORD3, (_XFER + 6*32)(SP)(SRND*1) VPXOR XDWORD3, XDWORD0, XFER VMOVDQU XFER, (_XFER + 7*32)(SP)(SRND*1) - 
ROUND_AND_SCHED_N_1_0(_XFER + 6*32, 0x79d8a7a8, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2) - ROUND_AND_SCHED_N_1_1(_XFER + 6*32, 0xf3b14f50, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2) - ROUND_AND_SCHED_N_1_2(_XFER + 6*32, 0xe7629ea1, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2) - ROUND_AND_SCHED_N_1_3(_XFER + 6*32, 0xcec53d43, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + ROUND_AND_SCHED_N_1_0(_XFER + 6*32, T44, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + ROUND_AND_SCHED_N_1_1(_XFER + 6*32, T45, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + ROUND_AND_SCHED_N_1_2(_XFER + 6*32, T46, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + ROUND_AND_SCHED_N_1_3(_XFER + 6*32, T47, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2) ADDQ $8*32, SRND @@ -593,38 +595,38 @@ avx2_schedule_compress: // for w0 - w47 VMOVDQU XDWORD0, (_XFER + 0*32)(SP)(SRND*1) VPXOR XDWORD0, XDWORD1, XFER VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1) - ROUND_AND_SCHED_N_1_0(_XFER + 0*32, 0x9d8a7a87, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - ROUND_AND_SCHED_N_1_1(_XFER + 0*32, 0x3b14f50f, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - ROUND_AND_SCHED_N_1_2(_XFER + 0*32, 0x7629ea1e, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - ROUND_AND_SCHED_N_1_3(_XFER + 0*32, 0xec53d43c, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + ROUND_AND_SCHED_N_1_0(_XFER + 0*32, T48, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + ROUND_AND_SCHED_N_1_1(_XFER + 0*32, T49, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + ROUND_AND_SCHED_N_1_2(_XFER + 0*32, T50, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + ROUND_AND_SCHED_N_1_3(_XFER + 0*32, T51, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3) // w52 - w63 processed with no scheduling (last 12 rounds) // Do 4 rounds VMOVDQU XDWORD1, (_XFER + 2*32)(SP)(SRND*1) VPXOR XDWORD1, XDWORD2, XFER VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1) - DO_ROUND_N_1(_XFER + 2*32, 0, 0xd8a7a879, e, f, g, h, a, b, c, d) - DO_ROUND_N_1(_XFER + 2*32, 1, 0xb14f50f3, d, e, f, g, h, a, b, c) - DO_ROUND_N_1(_XFER + 2*32, 2, 0x629ea1e7, c, d, e, f, g, h, a, b) - DO_ROUND_N_1(_XFER + 2*32, 3, 0xc53d43ce, b, c, d, e, f, g, h, a) + DO_ROUND_N_1(_XFER + 2*32, 0, T52, e, f, g, h, a, b, c, d) + DO_ROUND_N_1(_XFER + 2*32, 1, T53, d, e, f, g, h, a, b, c) + DO_ROUND_N_1(_XFER + 2*32, 2, T54, c, d, e, f, g, h, a, b) + DO_ROUND_N_1(_XFER + 2*32, 3, T55, b, c, d, e, f, g, h, a) // Do 4 rounds VMOVDQU XDWORD2, (_XFER + 4*32)(SP)(SRND*1) VPXOR XDWORD2, XDWORD3, XFER VMOVDQU XFER, (_XFER + 5*32)(SP)(SRND*1) - DO_ROUND_N_1(_XFER + 4*32, 0, 0x8a7a879d, a, b, c, d, e, f, g, h) - DO_ROUND_N_1(_XFER + 4*32, 1, 0x14f50f3b, h, a, b, c, d, e, f, g) - DO_ROUND_N_1(_XFER + 4*32, 2, 0x29ea1e76, g, h, a, b, c, d, e, f) - DO_ROUND_N_1(_XFER + 4*32, 3, 0x53d43cec, f, g, h, a, b, c, d, e) + DO_ROUND_N_1(_XFER + 4*32, 0, T56, a, b, c, d, e, f, g, h) + DO_ROUND_N_1(_XFER + 4*32, 1, T57, h, a, b, c, d, e, f, g) + DO_ROUND_N_1(_XFER + 4*32, 2, T58, g, h, a, b, c, d, e, f) + DO_ROUND_N_1(_XFER + 4*32, 3, T59, f, g, h, a, b, c, d, e) // Do 4 rounds VMOVDQU XDWORD3, (_XFER + 6*32)(SP)(SRND*1) VPXOR XDWORD3, XDWORD0, XFER VMOVDQU XFER, (_XFER + 7*32)(SP)(SRND*1) - DO_ROUND_N_1(_XFER + 6*32, 0, 0xa7a879d8, e, f, g, h, a, b, c, d) - DO_ROUND_N_1(_XFER + 6*32, 1, 0x4f50f3b1, d, e, f, g, h, a, b, c) - DO_ROUND_N_1(_XFER + 6*32, 2, 0x9ea1e762, 
c, d, e, f, g, h, a, b) - DO_ROUND_N_1(_XFER + 6*32, 3, 0x3d43cec5, b, c, d, e, f, g, h, a) + DO_ROUND_N_1(_XFER + 6*32, 0, T60, e, f, g, h, a, b, c, d) + DO_ROUND_N_1(_XFER + 6*32, 1, T61, d, e, f, g, h, a, b, c) + DO_ROUND_N_1(_XFER + 6*32, 2, T62, c, d, e, f, g, h, a, b) + DO_ROUND_N_1(_XFER + 6*32, 3, T63, b, c, d, e, f, g, h, a) MOVQ dig+0(FP), CTX // d.h[8] MOVQ _INP(SP), INP @@ -644,91 +646,91 @@ avx2_schedule_compress: // for w0 - w47 XORQ SRND, SRND avx2_compress: // Do second block using previously scheduled results - DO_ROUND_N_0(_XFER + 0*32 + 16, 0, 0x79cc4519, a, b, c, d, e, f, g, h) - DO_ROUND_N_0(_XFER + 0*32 + 16, 1, 0xf3988a32, h, a, b, c, d, e, f, g) - DO_ROUND_N_0(_XFER + 0*32 + 16, 2, 0xe7311465, g, h, a, b, c, d, e, f) - DO_ROUND_N_0(_XFER + 0*32 + 16, 3, 0xce6228cb, f, g, h, a, b, c, d, e) + DO_ROUND_N_0(_XFER + 0*32 + 16, 0, T0, a, b, c, d, e, f, g, h) + DO_ROUND_N_0(_XFER + 0*32 + 16, 1, T1, h, a, b, c, d, e, f, g) + DO_ROUND_N_0(_XFER + 0*32 + 16, 2, T2, g, h, a, b, c, d, e, f) + DO_ROUND_N_0(_XFER + 0*32 + 16, 3, T3, f, g, h, a, b, c, d, e) - DO_ROUND_N_0(_XFER + 2*32 + 16, 0, 0x9cc45197, e, f, g, h, a, b, c, d) - DO_ROUND_N_0(_XFER + 2*32 + 16, 1, 0x3988a32f, d, e, f, g, h, a, b, c) - DO_ROUND_N_0(_XFER + 2*32 + 16, 2, 0x7311465e, c, d, e, f, g, h, a, b) - DO_ROUND_N_0(_XFER + 2*32 + 16, 3, 0xe6228cbc, b, c, d, e, f, g, h, a) + DO_ROUND_N_0(_XFER + 2*32 + 16, 0, T4, e, f, g, h, a, b, c, d) + DO_ROUND_N_0(_XFER + 2*32 + 16, 1, T5, d, e, f, g, h, a, b, c) + DO_ROUND_N_0(_XFER + 2*32 + 16, 2, T6, c, d, e, f, g, h, a, b) + DO_ROUND_N_0(_XFER + 2*32 + 16, 3, T7, b, c, d, e, f, g, h, a) - DO_ROUND_N_0(_XFER + 4*32 + 16, 0, 0xcc451979, a, b, c, d, e, f, g, h) - DO_ROUND_N_0(_XFER + 4*32 + 16, 1, 0x988a32f3, h, a, b, c, d, e, f, g) - DO_ROUND_N_0(_XFER + 4*32 + 16, 2, 0x311465e7, g, h, a, b, c, d, e, f) - DO_ROUND_N_0(_XFER + 4*32 + 16, 3, 0x6228cbce, f, g, h, a, b, c, d, e) + DO_ROUND_N_0(_XFER + 4*32 + 16, 0, T8, a, b, c, d, e, f, g, h) + DO_ROUND_N_0(_XFER + 4*32 + 16, 1, T9, h, a, b, c, d, e, f, g) + DO_ROUND_N_0(_XFER + 4*32 + 16, 2, T10, g, h, a, b, c, d, e, f) + DO_ROUND_N_0(_XFER + 4*32 + 16, 3, T11, f, g, h, a, b, c, d, e) - DO_ROUND_N_0(_XFER + 6*32 + 16, 0, 0xc451979c, e, f, g, h, a, b, c, d) - DO_ROUND_N_0(_XFER + 6*32 + 16, 1, 0x88a32f39, d, e, f, g, h, a, b, c) - DO_ROUND_N_0(_XFER + 6*32 + 16, 2, 0x11465e73, c, d, e, f, g, h, a, b) - DO_ROUND_N_0(_XFER + 6*32 + 16, 3, 0x228cbce6, b, c, d, e, f, g, h, a) + DO_ROUND_N_0(_XFER + 6*32 + 16, 0, T12, e, f, g, h, a, b, c, d) + DO_ROUND_N_0(_XFER + 6*32 + 16, 1, T13, d, e, f, g, h, a, b, c) + DO_ROUND_N_0(_XFER + 6*32 + 16, 2, T14, c, d, e, f, g, h, a, b) + DO_ROUND_N_0(_XFER + 6*32 + 16, 3, T15, b, c, d, e, f, g, h, a) ADDQ $8*32, SRND - DO_ROUND_N_1(_XFER + 0*32 + 16, 0, 0x9d8a7a87, a, b, c, d, e, f, g, h) - DO_ROUND_N_1(_XFER + 0*32 + 16, 1, 0x3b14f50f, h, a, b, c, d, e, f, g) - DO_ROUND_N_1(_XFER + 0*32 + 16, 2, 0x7629ea1e, g, h, a, b, c, d, e, f) - DO_ROUND_N_1(_XFER + 0*32 + 16, 3, 0xec53d43c, f, g, h, a, b, c, d, e) + DO_ROUND_N_1(_XFER + 0*32 + 16, 0, T16, a, b, c, d, e, f, g, h) + DO_ROUND_N_1(_XFER + 0*32 + 16, 1, T17, h, a, b, c, d, e, f, g) + DO_ROUND_N_1(_XFER + 0*32 + 16, 2, T18, g, h, a, b, c, d, e, f) + DO_ROUND_N_1(_XFER + 0*32 + 16, 3, T19, f, g, h, a, b, c, d, e) - DO_ROUND_N_1(_XFER + 2*32 + 16, 0, 0xd8a7a879, e, f, g, h, a, b, c, d) - DO_ROUND_N_1(_XFER + 2*32 + 16, 1, 0xb14f50f3, d, e, f, g, h, a, b, c) - DO_ROUND_N_1(_XFER + 2*32 + 16, 2, 0x629ea1e7, c, d, e, f, g, h, a, b) - 
DO_ROUND_N_1(_XFER + 2*32 + 16, 3, 0xc53d43ce, b, c, d, e, f, g, h, a) + DO_ROUND_N_1(_XFER + 2*32 + 16, 0, T20, e, f, g, h, a, b, c, d) + DO_ROUND_N_1(_XFER + 2*32 + 16, 1, T21, d, e, f, g, h, a, b, c) + DO_ROUND_N_1(_XFER + 2*32 + 16, 2, T22, c, d, e, f, g, h, a, b) + DO_ROUND_N_1(_XFER + 2*32 + 16, 3, T23, b, c, d, e, f, g, h, a) - DO_ROUND_N_1(_XFER + 4*32 + 16, 0, 0x8a7a879d, a, b, c, d, e, f, g, h) - DO_ROUND_N_1(_XFER + 4*32 + 16, 1, 0x14f50f3b, h, a, b, c, d, e, f, g) - DO_ROUND_N_1(_XFER + 4*32 + 16, 2, 0x29ea1e76, g, h, a, b, c, d, e, f) - DO_ROUND_N_1(_XFER + 4*32 + 16, 3, 0x53d43cec, f, g, h, a, b, c, d, e) + DO_ROUND_N_1(_XFER + 4*32 + 16, 0, T24, a, b, c, d, e, f, g, h) + DO_ROUND_N_1(_XFER + 4*32 + 16, 1, T25, h, a, b, c, d, e, f, g) + DO_ROUND_N_1(_XFER + 4*32 + 16, 2, T26, g, h, a, b, c, d, e, f) + DO_ROUND_N_1(_XFER + 4*32 + 16, 3, T27, f, g, h, a, b, c, d, e) - DO_ROUND_N_1(_XFER + 6*32 + 16, 0, 0xa7a879d8, e, f, g, h, a, b, c, d) - DO_ROUND_N_1(_XFER + 6*32 + 16, 1, 0x4f50f3b1, d, e, f, g, h, a, b, c) - DO_ROUND_N_1(_XFER + 6*32 + 16, 2, 0x9ea1e762, c, d, e, f, g, h, a, b) - DO_ROUND_N_1(_XFER + 6*32 + 16, 3, 0x3d43cec5, b, c, d, e, f, g, h, a) + DO_ROUND_N_1(_XFER + 6*32 + 16, 0, T28, e, f, g, h, a, b, c, d) + DO_ROUND_N_1(_XFER + 6*32 + 16, 1, T29, d, e, f, g, h, a, b, c) + DO_ROUND_N_1(_XFER + 6*32 + 16, 2, T30, c, d, e, f, g, h, a, b) + DO_ROUND_N_1(_XFER + 6*32 + 16, 3, T31, b, c, d, e, f, g, h, a) ADDQ $8*32, SRND - DO_ROUND_N_1(_XFER + 0*32 + 16, 0, 0x7a879d8a, a, b, c, d, e, f, g, h) - DO_ROUND_N_1(_XFER + 0*32 + 16, 1, 0xf50f3b14, h, a, b, c, d, e, f, g) - DO_ROUND_N_1(_XFER + 0*32 + 16, 2, 0xea1e7629, g, h, a, b, c, d, e, f) - DO_ROUND_N_1(_XFER + 0*32 + 16, 3, 0xd43cec53, f, g, h, a, b, c, d, e) + DO_ROUND_N_1(_XFER + 0*32 + 16, 0, T32, a, b, c, d, e, f, g, h) + DO_ROUND_N_1(_XFER + 0*32 + 16, 1, T33, h, a, b, c, d, e, f, g) + DO_ROUND_N_1(_XFER + 0*32 + 16, 2, T34, g, h, a, b, c, d, e, f) + DO_ROUND_N_1(_XFER + 0*32 + 16, 3, T35, f, g, h, a, b, c, d, e) - DO_ROUND_N_1(_XFER + 2*32 + 16, 0, 0xa879d8a7, e, f, g, h, a, b, c, d) - DO_ROUND_N_1(_XFER + 2*32 + 16, 1, 0x50f3b14f, d, e, f, g, h, a, b, c) - DO_ROUND_N_1(_XFER + 2*32 + 16, 2, 0xa1e7629e, c, d, e, f, g, h, a, b) - DO_ROUND_N_1(_XFER + 2*32 + 16, 3, 0x43cec53d, b, c, d, e, f, g, h, a) + DO_ROUND_N_1(_XFER + 2*32 + 16, 0, T36, e, f, g, h, a, b, c, d) + DO_ROUND_N_1(_XFER + 2*32 + 16, 1, T37, d, e, f, g, h, a, b, c) + DO_ROUND_N_1(_XFER + 2*32 + 16, 2, T38, c, d, e, f, g, h, a, b) + DO_ROUND_N_1(_XFER + 2*32 + 16, 3, T39, b, c, d, e, f, g, h, a) - DO_ROUND_N_1(_XFER + 4*32 + 16, 0, 0x879d8a7a, a, b, c, d, e, f, g, h) - DO_ROUND_N_1(_XFER + 4*32 + 16, 1, 0xf3b14f5, h, a, b, c, d, e, f, g) - DO_ROUND_N_1(_XFER + 4*32 + 16, 2, 0x1e7629ea, g, h, a, b, c, d, e, f) - DO_ROUND_N_1(_XFER + 4*32 + 16, 3, 0x3cec53d4, f, g, h, a, b, c, d, e) + DO_ROUND_N_1(_XFER + 4*32 + 16, 0, T40, a, b, c, d, e, f, g, h) + DO_ROUND_N_1(_XFER + 4*32 + 16, 1, T41, h, a, b, c, d, e, f, g) + DO_ROUND_N_1(_XFER + 4*32 + 16, 2, T42, g, h, a, b, c, d, e, f) + DO_ROUND_N_1(_XFER + 4*32 + 16, 3, T43, f, g, h, a, b, c, d, e) - DO_ROUND_N_1(_XFER + 6*32 + 16, 0, 0x79d8a7a8, e, f, g, h, a, b, c, d) - DO_ROUND_N_1(_XFER + 6*32 + 16, 1, 0xf3b14f50, d, e, f, g, h, a, b, c) - DO_ROUND_N_1(_XFER + 6*32 + 16, 2, 0xe7629ea1, c, d, e, f, g, h, a, b) - DO_ROUND_N_1(_XFER + 6*32 + 16, 3, 0xcec53d43, b, c, d, e, f, g, h, a) + DO_ROUND_N_1(_XFER + 6*32 + 16, 0, T44, e, f, g, h, a, b, c, d) + DO_ROUND_N_1(_XFER + 6*32 + 16, 1, T45, d, e, f, g, h, a, b, c) 
+ DO_ROUND_N_1(_XFER + 6*32 + 16, 2, T46, c, d, e, f, g, h, a, b) + DO_ROUND_N_1(_XFER + 6*32 + 16, 3, T47, b, c, d, e, f, g, h, a) ADDQ $8*32, SRND - DO_ROUND_N_1(_XFER + 0*32 + 16, 0, 0x9d8a7a87, a, b, c, d, e, f, g, h) - DO_ROUND_N_1(_XFER + 0*32 + 16, 1, 0x3b14f50f, h, a, b, c, d, e, f, g) - DO_ROUND_N_1(_XFER + 0*32 + 16, 2, 0x7629ea1e, g, h, a, b, c, d, e, f) - DO_ROUND_N_1(_XFER + 0*32 + 16, 3, 0xec53d43c, f, g, h, a, b, c, d, e) + DO_ROUND_N_1(_XFER + 0*32 + 16, 0, T48, a, b, c, d, e, f, g, h) + DO_ROUND_N_1(_XFER + 0*32 + 16, 1, T49, h, a, b, c, d, e, f, g) + DO_ROUND_N_1(_XFER + 0*32 + 16, 2, T50, g, h, a, b, c, d, e, f) + DO_ROUND_N_1(_XFER + 0*32 + 16, 3, T51, f, g, h, a, b, c, d, e) - DO_ROUND_N_1(_XFER + 2*32 + 16, 0, 0xd8a7a879, e, f, g, h, a, b, c, d) - DO_ROUND_N_1(_XFER + 2*32 + 16, 1, 0xb14f50f3, d, e, f, g, h, a, b, c) - DO_ROUND_N_1(_XFER + 2*32 + 16, 2, 0x629ea1e7, c, d, e, f, g, h, a, b) - DO_ROUND_N_1(_XFER + 2*32 + 16, 3, 0xc53d43ce, b, c, d, e, f, g, h, a) + DO_ROUND_N_1(_XFER + 2*32 + 16, 0, T52, e, f, g, h, a, b, c, d) + DO_ROUND_N_1(_XFER + 2*32 + 16, 1, T53, d, e, f, g, h, a, b, c) + DO_ROUND_N_1(_XFER + 2*32 + 16, 2, T54, c, d, e, f, g, h, a, b) + DO_ROUND_N_1(_XFER + 2*32 + 16, 3, T55, b, c, d, e, f, g, h, a) - DO_ROUND_N_1(_XFER + 4*32 + 16, 0, 0x8a7a879d, a, b, c, d, e, f, g, h) - DO_ROUND_N_1(_XFER + 4*32 + 16, 1, 0x14f50f3b, h, a, b, c, d, e, f, g) - DO_ROUND_N_1(_XFER + 4*32 + 16, 2, 0x29ea1e76, g, h, a, b, c, d, e, f) - DO_ROUND_N_1(_XFER + 4*32 + 16, 3, 0x53d43cec, f, g, h, a, b, c, d, e) + DO_ROUND_N_1(_XFER + 4*32 + 16, 0, T56, a, b, c, d, e, f, g, h) + DO_ROUND_N_1(_XFER + 4*32 + 16, 1, T57, h, a, b, c, d, e, f, g) + DO_ROUND_N_1(_XFER + 4*32 + 16, 2, T58, g, h, a, b, c, d, e, f) + DO_ROUND_N_1(_XFER + 4*32 + 16, 3, T59, f, g, h, a, b, c, d, e) - DO_ROUND_N_1(_XFER + 6*32 + 16, 0, 0xa7a879d8, e, f, g, h, a, b, c, d) - DO_ROUND_N_1(_XFER + 6*32 + 16, 1, 0x4f50f3b1, d, e, f, g, h, a, b, c) - DO_ROUND_N_1(_XFER + 6*32 + 16, 2, 0x9ea1e762, c, d, e, f, g, h, a, b) - DO_ROUND_N_1(_XFER + 6*32 + 16, 3, 0x3d43cec5, b, c, d, e, f, g, h, a) + DO_ROUND_N_1(_XFER + 6*32 + 16, 0, T60, e, f, g, h, a, b, c, d) + DO_ROUND_N_1(_XFER + 6*32 + 16, 1, T61, d, e, f, g, h, a, b, c) + DO_ROUND_N_1(_XFER + 6*32 + 16, 2, T62, c, d, e, f, g, h, a, b) + DO_ROUND_N_1(_XFER + 6*32 + 16, 3, T63, b, c, d, e, f, g, h, a) MOVQ dig+0(FP), CTX // d.h[8] MOVQ _INP(SP), INP diff --git a/sm3/sm3block_avx_amd64.s b/sm3/sm3block_avx_amd64.s index 7e6ff1d..51e420c 100644 --- a/sm3/sm3block_avx_amd64.s +++ b/sm3/sm3block_avx_amd64.s @@ -3,6 +3,7 @@ #include "textflag.h" +#include "sm3_const_asm.s" // Definitions for AVX version // xorm (mem), reg @@ -425,148 +426,148 @@ avx_schedule_compress: // for w0 - w47 VMOVDQU XWORD0, (_XFER + 0*16)(SP) VPXOR XWORD0, XWORD1, XFER VMOVDQU XFER, (_XFER + 1*16)(SP) - ROUND_AND_SCHED_N_0_0(_XFER, 0x79cc4519, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) - ROUND_AND_SCHED_N_0_1(_XFER, 0xf3988a32, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3) - ROUND_AND_SCHED_N_0_2(_XFER, 0xe7311465, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3) - ROUND_AND_SCHED_N_0_3(_XFER, 0xce6228cb, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3) + ROUND_AND_SCHED_N_0_0(_XFER, T0, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) + ROUND_AND_SCHED_N_0_1(_XFER, T1, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3) + ROUND_AND_SCHED_N_0_2(_XFER, T2, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3) + ROUND_AND_SCHED_N_0_3(_XFER, 
T3, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3) // Do 4 rounds and scheduling VMOVDQU XWORD1, (_XFER + 0*16)(SP) VPXOR XWORD1, XWORD2, XFER VMOVDQU XFER, (_XFER + 1*16)(SP) - ROUND_AND_SCHED_N_0_0(_XFER, 0x9cc45197, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0) - ROUND_AND_SCHED_N_0_1(_XFER, 0x3988a32f, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0) - ROUND_AND_SCHED_N_0_2(_XFER, 0x7311465e, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0) - ROUND_AND_SCHED_N_0_3(_XFER, 0xe6228cbc, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0) + ROUND_AND_SCHED_N_0_0(_XFER, T4, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0) + ROUND_AND_SCHED_N_0_1(_XFER, T5, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0) + ROUND_AND_SCHED_N_0_2(_XFER, T6, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0) + ROUND_AND_SCHED_N_0_3(_XFER, T7, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0) // Do 4 rounds and scheduling VMOVDQU XWORD2, (_XFER + 0*16)(SP) VPXOR XWORD2, XWORD3, XFER VMOVDQU XFER, (_XFER + 1*16)(SP) - ROUND_AND_SCHED_N_0_0(_XFER, 0xcc451979, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1) - ROUND_AND_SCHED_N_0_1(_XFER, 0x988a32f3, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1) - ROUND_AND_SCHED_N_0_2(_XFER, 0x311465e7, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1) - ROUND_AND_SCHED_N_0_3(_XFER, 0x6228cbce, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1) + ROUND_AND_SCHED_N_0_0(_XFER, T8, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1) + ROUND_AND_SCHED_N_0_1(_XFER, T9, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1) + ROUND_AND_SCHED_N_0_2(_XFER, T10, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1) + ROUND_AND_SCHED_N_0_3(_XFER, T11, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1) // Do 4 rounds and scheduling VMOVDQU XWORD3, (_XFER + 0*16)(SP) VPXOR XWORD3, XWORD0, XFER VMOVDQU XFER, (_XFER + 1*16)(SP) - ROUND_AND_SCHED_N_0_0(_XFER, 0xc451979c, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2) - ROUND_AND_SCHED_N_0_1(_XFER, 0x88a32f39, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2) - ROUND_AND_SCHED_N_0_2(_XFER, 0x11465e73, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2) - ROUND_AND_SCHED_N_0_3(_XFER, 0x228cbce6, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2) + ROUND_AND_SCHED_N_0_0(_XFER, T12, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2) + ROUND_AND_SCHED_N_0_1(_XFER, T13, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2) + ROUND_AND_SCHED_N_0_2(_XFER, T14, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2) + ROUND_AND_SCHED_N_0_3(_XFER, T15, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2) // Do 4 rounds and scheduling VMOVDQU XWORD0, (_XFER + 0*16)(SP) VPXOR XWORD0, XWORD1, XFER VMOVDQU XFER, (_XFER + 1*16)(SP) - ROUND_AND_SCHED_N_1_0(_XFER, 0x9d8a7a87, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) - ROUND_AND_SCHED_N_1_1(_XFER, 0x3b14f50f, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3) - ROUND_AND_SCHED_N_1_2(_XFER, 0x7629ea1e, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3) - ROUND_AND_SCHED_N_1_3(_XFER, 0xec53d43c, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3) + ROUND_AND_SCHED_N_1_0(_XFER, T16, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) + ROUND_AND_SCHED_N_1_1(_XFER, T17, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3) + ROUND_AND_SCHED_N_1_2(_XFER, T18, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3) + ROUND_AND_SCHED_N_1_3(_XFER, T19, 
f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3) // Do 4 rounds and scheduling VMOVDQU XWORD1, (_XFER + 0*16)(SP) VPXOR XWORD1, XWORD2, XFER VMOVDQU XFER, (_XFER + 1*16)(SP) - ROUND_AND_SCHED_N_1_0(_XFER, 0xd8a7a879, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0) - ROUND_AND_SCHED_N_1_1(_XFER, 0xb14f50f3, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0) - ROUND_AND_SCHED_N_1_2(_XFER, 0x629ea1e7, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0) - ROUND_AND_SCHED_N_1_3(_XFER, 0xc53d43ce, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0) + ROUND_AND_SCHED_N_1_0(_XFER, T20, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0) + ROUND_AND_SCHED_N_1_1(_XFER, T21, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0) + ROUND_AND_SCHED_N_1_2(_XFER, T22, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0) + ROUND_AND_SCHED_N_1_3(_XFER, T23, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0) // Do 4 rounds and scheduling VMOVDQU XWORD2, (_XFER + 0*16)(SP) VPXOR XWORD2, XWORD3, XFER VMOVDQU XFER, (_XFER + 1*16)(SP) - ROUND_AND_SCHED_N_1_0(_XFER, 0x8a7a879d, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1) - ROUND_AND_SCHED_N_1_1(_XFER, 0x14f50f3b, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1) - ROUND_AND_SCHED_N_1_2(_XFER, 0x29ea1e76, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1) - ROUND_AND_SCHED_N_1_3(_XFER, 0x53d43cec, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1) + ROUND_AND_SCHED_N_1_0(_XFER, T24, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1) + ROUND_AND_SCHED_N_1_1(_XFER, T25, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1) + ROUND_AND_SCHED_N_1_2(_XFER, T26, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1) + ROUND_AND_SCHED_N_1_3(_XFER, T27, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1) // Do 4 rounds and scheduling VMOVDQU XWORD3, (_XFER + 0*16)(SP) VPXOR XWORD3, XWORD0, XFER VMOVDQU XFER, (_XFER + 1*16)(SP) - ROUND_AND_SCHED_N_1_0(_XFER, 0xa7a879d8, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2) - ROUND_AND_SCHED_N_1_1(_XFER, 0x4f50f3b1, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2) - ROUND_AND_SCHED_N_1_2(_XFER, 0x9ea1e762, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2) - ROUND_AND_SCHED_N_1_3(_XFER, 0x3d43cec5, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2) + ROUND_AND_SCHED_N_1_0(_XFER, T28, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2) + ROUND_AND_SCHED_N_1_1(_XFER, T29, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2) + ROUND_AND_SCHED_N_1_2(_XFER, T30, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2) + ROUND_AND_SCHED_N_1_3(_XFER, T31, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2) // Do 4 rounds and scheduling VMOVDQU XWORD0, (_XFER + 0*16)(SP) VPXOR XWORD0, XWORD1, XFER VMOVDQU XFER, (_XFER + 1*16)(SP) - ROUND_AND_SCHED_N_1_0(_XFER, 0x7a879d8a, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) - ROUND_AND_SCHED_N_1_1(_XFER, 0xf50f3b14, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3) - ROUND_AND_SCHED_N_1_2(_XFER, 0xea1e7629, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3) - ROUND_AND_SCHED_N_1_3(_XFER, 0xd43cec53, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3) + ROUND_AND_SCHED_N_1_0(_XFER, T32, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) + ROUND_AND_SCHED_N_1_1(_XFER, T33, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3) + ROUND_AND_SCHED_N_1_2(_XFER, T34, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3) + ROUND_AND_SCHED_N_1_3(_XFER, 
T35, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3) // Do 4 rounds and scheduling VMOVDQU XWORD1, (_XFER + 0*16)(SP) VPXOR XWORD1, XWORD2, XFER VMOVDQU XFER, (_XFER + 1*16)(SP) - ROUND_AND_SCHED_N_1_0(_XFER, 0xa879d8a7, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0) - ROUND_AND_SCHED_N_1_1(_XFER, 0x50f3b14f, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0) - ROUND_AND_SCHED_N_1_2(_XFER, 0xa1e7629e, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0) - ROUND_AND_SCHED_N_1_3(_XFER, 0x43cec53d, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0) + ROUND_AND_SCHED_N_1_0(_XFER, T36, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0) + ROUND_AND_SCHED_N_1_1(_XFER, T37, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0) + ROUND_AND_SCHED_N_1_2(_XFER, T38, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0) + ROUND_AND_SCHED_N_1_3(_XFER, T39, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0) // Do 4 rounds and scheduling VMOVDQU XWORD2, (_XFER + 0*16)(SP) VPXOR XWORD2, XWORD3, XFER VMOVDQU XFER, (_XFER + 1*16)(SP) - ROUND_AND_SCHED_N_1_0(_XFER, 0x879d8a7a, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1) - ROUND_AND_SCHED_N_1_1(_XFER, 0xf3b14f5, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1) - ROUND_AND_SCHED_N_1_2(_XFER, 0x1e7629ea, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1) - ROUND_AND_SCHED_N_1_3(_XFER, 0x3cec53d4, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1) + ROUND_AND_SCHED_N_1_0(_XFER, T40, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1) + ROUND_AND_SCHED_N_1_1(_XFER, T41, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1) + ROUND_AND_SCHED_N_1_2(_XFER, T42, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1) + ROUND_AND_SCHED_N_1_3(_XFER, T43, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1) // Do 4 rounds and scheduling VMOVDQU XWORD3, (_XFER + 0*16)(SP) VPXOR XWORD3, XWORD0, XFER VMOVDQU XFER, (_XFER + 1*16)(SP) - ROUND_AND_SCHED_N_1_0(_XFER, 0x79d8a7a8, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2) - ROUND_AND_SCHED_N_1_1(_XFER, 0xf3b14f50, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2) - ROUND_AND_SCHED_N_1_2(_XFER, 0xe7629ea1, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2) - ROUND_AND_SCHED_N_1_3(_XFER, 0xcec53d43, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2) + ROUND_AND_SCHED_N_1_0(_XFER, T44, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2) + ROUND_AND_SCHED_N_1_1(_XFER, T45, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2) + ROUND_AND_SCHED_N_1_2(_XFER, T46, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2) + ROUND_AND_SCHED_N_1_3(_XFER, T47, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2) // w48 - w63 processed with only 4 rounds scheduling (last 16 rounds) // Do 4 rounds and scheduling VMOVDQU XWORD0, (_XFER + 0*16)(SP) VPXOR XWORD0, XWORD1, XFER VMOVDQU XFER, (_XFER + 1*16)(SP) - ROUND_AND_SCHED_N_1_0(_XFER, 0x9d8a7a87, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) - ROUND_AND_SCHED_N_1_1(_XFER, 0x3b14f50f, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3) - ROUND_AND_SCHED_N_1_2(_XFER, 0x7629ea1e, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3) - ROUND_AND_SCHED_N_1_3(_XFER, 0xec53d43c, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3) + ROUND_AND_SCHED_N_1_0(_XFER, T48, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) + ROUND_AND_SCHED_N_1_1(_XFER, T49, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3) + ROUND_AND_SCHED_N_1_2(_XFER, T50, g, h, a, b, c, 
d, e, f, XWORD0, XWORD1, XWORD2, XWORD3) + ROUND_AND_SCHED_N_1_3(_XFER, T51, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3) // w52 - w63 processed with no scheduling (last 12 rounds) // Do 4 rounds VMOVDQU XWORD1, (_XFER + 0*16)(SP) VPXOR XWORD1, XWORD2, XFER VMOVDQU XFER, (_XFER + 1*16)(SP) - DO_ROUND_N_1(_XFER, 0, 0xd8a7a879, e, f, g, h, a, b, c, d) - DO_ROUND_N_1(_XFER, 1, 0xb14f50f3, d, e, f, g, h, a, b, c) - DO_ROUND_N_1(_XFER, 2, 0x629ea1e7, c, d, e, f, g, h, a, b) - DO_ROUND_N_1(_XFER, 3, 0xc53d43ce, b, c, d, e, f, g, h, a) + DO_ROUND_N_1(_XFER, 0, T52, e, f, g, h, a, b, c, d) + DO_ROUND_N_1(_XFER, 1, T53, d, e, f, g, h, a, b, c) + DO_ROUND_N_1(_XFER, 2, T54, c, d, e, f, g, h, a, b) + DO_ROUND_N_1(_XFER, 3, T55, b, c, d, e, f, g, h, a) // Do 4 rounds VMOVDQU XWORD2, (_XFER + 0*16)(SP) VPXOR XWORD2, XWORD3, XFER VMOVDQU XFER, (_XFER + 1*16)(SP) - DO_ROUND_N_1(_XFER, 0, 0x8a7a879d, a, b, c, d, e, f, g, h) - DO_ROUND_N_1(_XFER, 1, 0x14f50f3b, h, a, b, c, d, e, f, g) - DO_ROUND_N_1(_XFER, 2, 0x29ea1e76, g, h, a, b, c, d, e, f) - DO_ROUND_N_1(_XFER, 3, 0x53d43cec, f, g, h, a, b, c, d, e) + DO_ROUND_N_1(_XFER, 0, T56, a, b, c, d, e, f, g, h) + DO_ROUND_N_1(_XFER, 1, T57, h, a, b, c, d, e, f, g) + DO_ROUND_N_1(_XFER, 2, T58, g, h, a, b, c, d, e, f) + DO_ROUND_N_1(_XFER, 3, T59, f, g, h, a, b, c, d, e) // Do 4 rounds VMOVDQU XWORD3, (_XFER + 0*16)(SP) VPXOR XWORD3, XWORD0, XFER VMOVDQU XFER, (_XFER + 1*16)(SP) - DO_ROUND_N_1(_XFER, 0, 0xa7a879d8, e, f, g, h, a, b, c, d) - DO_ROUND_N_1(_XFER, 1, 0x4f50f3b1, d, e, f, g, h, a, b, c) - DO_ROUND_N_1(_XFER, 2, 0x9ea1e762, c, d, e, f, g, h, a, b) - DO_ROUND_N_1(_XFER, 3, 0x3d43cec5, b, c, d, e, f, g, h, a) + DO_ROUND_N_1(_XFER, 0, T60, e, f, g, h, a, b, c, d) + DO_ROUND_N_1(_XFER, 1, T61, d, e, f, g, h, a, b, c) + DO_ROUND_N_1(_XFER, 2, T62, c, d, e, f, g, h, a, b) + DO_ROUND_N_1(_XFER, 3, T63, b, c, d, e, f, g, h, a) MOVQ _INP(SP), INP
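
Reviewer note on the extracted constants: every Tn in sm3/sm3_const_asm.s is the SM3 round constant pre-rotated for its round, i.e. Tj = 0x79cc4519 <<< j for rounds 0-15 and Tj = 0x7a879d8a <<< (j mod 32) for rounds 16-63. That is why T48-T63 repeat T16-T31, and why T41 is written with seven hex digits (0xf3b14f5): its leading nibble is zero. A minimal standalone Go sketch (not part of this patch, shown only to help check the new table against the inline literals it replaces):

```go
package main

import (
	"fmt"
	"math/bits"
)

func main() {
	for j := 0; j < 64; j++ {
		t := uint32(0x7a879d8a) // SM3 T_j for rounds 16..63
		if j < 16 {
			t = 0x79cc4519 // SM3 T_j for rounds 0..15
		}
		// The assembly stores the constant already rotated by the round index.
		fmt.Printf("#define T%d 0x%x\n", j, bits.RotateLeft32(t, j%32))
	}
}
```

Running it should print the same 64 #define lines that sm3_const_asm.s adds.

On the new xorm macro in sm3block_amd64.s: xorm(P1, P2) folds the removed XORL (i*4)(BP), reg / MOVL reg, (i*4)(BP) pair into "mem ^= reg; reg = mem", which leaves both the stored digest word and the register holding the same value (old Hi XOR working variable) as before. A rough Go equivalent of one expansion, with a hypothetical helper name and sample values chosen only for illustration (0x7380166f is SM3's IV word H0):

```go
package main

import "fmt"

// xorm mirrors the two-instruction macro xorm(P1, P2):
// XORL P2, P1 updates the digest word in memory to old ^ reg,
// MOVL P1, P2 then reloads the register with that updated word.
func xorm(mem *uint32, reg *uint32) {
	*mem ^= *reg
	*reg = *mem
}

func main() {
	h0, a := uint32(0x7380166f), uint32(0x12345678) // sample digest word and working variable
	xorm(&h0, &a)
	fmt.Printf("H0=%#x reg=%#x\n", h0, a) // both now hold h0 ^ a
}
```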