mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-22 10:16:18 +08:00
sm3: refactoring asm, extract constants
This commit is contained in:
parent
df3a5c10de
commit
cee7547606
71
sm3/sm3_const_asm.s
Normal file
71
sm3/sm3_const_asm.s
Normal file
@ -0,0 +1,71 @@
|
||||
// Per-round constants for the SM3 round macros: Tj is the constant passed
// to the round macro for round j (0 <= j <= 63).
// T0..T15: 0x79cc4519 rotated left by j bits.
#define T0 0x79cc4519
|
||||
#define T1 0xf3988a32
|
||||
#define T2 0xe7311465
|
||||
#define T3 0xce6228cb
|
||||
#define T4 0x9cc45197
|
||||
#define T5 0x3988a32f
|
||||
#define T6 0x7311465e
|
||||
#define T7 0xe6228cbc
|
||||
|
||||
#define T8 0xcc451979
|
||||
#define T9 0x988a32f3
|
||||
#define T10 0x311465e7
|
||||
#define T11 0x6228cbce
|
||||
#define T12 0xc451979c
|
||||
#define T13 0x88a32f39
|
||||
#define T14 0x11465e73
|
||||
#define T15 0x228cbce6
|
||||
|
||||
// T16..T63: 0x7a879d8a rotated left by (j mod 32) bits.
#define T16 0x9d8a7a87
|
||||
#define T17 0x3b14f50f
|
||||
#define T18 0x7629ea1e
|
||||
#define T19 0xec53d43c
|
||||
#define T20 0xd8a7a879
|
||||
#define T21 0xb14f50f3
|
||||
#define T22 0x629ea1e7
|
||||
#define T23 0xc53d43ce
|
||||
|
||||
|
||||
#define T24 0x8a7a879d
|
||||
#define T25 0x14f50f3b
|
||||
#define T26 0x29ea1e76
|
||||
#define T27 0x53d43cec
|
||||
|
||||
#define T28 0xa7a879d8
|
||||
#define T29 0x4f50f3b1
|
||||
#define T30 0x9ea1e762
|
||||
#define T31 0x3d43cec5
|
||||
|
||||
// T32 is the unrotated base value (32 mod 32 == 0).
#define T32 0x7a879d8a
|
||||
#define T33 0xf50f3b14
|
||||
#define T34 0xea1e7629
|
||||
#define T35 0xd43cec53
|
||||
#define T36 0xa879d8a7
|
||||
#define T37 0x50f3b14f
|
||||
#define T38 0xa1e7629e
|
||||
#define T39 0x43cec53d
|
||||
|
||||
|
||||
#define T40 0x879d8a7a
|
||||
// T41 is 0x0f3b14f5; the leading zero nibble is omitted in the literal.
#define T41 0xf3b14f5
|
||||
#define T42 0x1e7629ea
|
||||
#define T43 0x3cec53d4
|
||||
#define T44 0x79d8a7a8
|
||||
#define T45 0xf3b14f50
|
||||
#define T46 0xe7629ea1
|
||||
#define T47 0xcec53d43
|
||||
|
||||
// T48..T63 repeat the values of T16..T31 (rotation amount wraps at 32).
#define T48 0x9d8a7a87
|
||||
#define T49 0x3b14f50f
|
||||
#define T50 0x7629ea1e
|
||||
#define T51 0xec53d43c
|
||||
#define T52 0xd8a7a879
|
||||
#define T53 0xb14f50f3
|
||||
#define T54 0x629ea1e7
|
||||
#define T55 0xc53d43ce
|
||||
|
||||
|
||||
#define T56 0x8a7a879d
|
||||
#define T57 0x14f50f3b
|
||||
#define T58 0x29ea1e76
|
||||
#define T59 0x53d43cec
|
||||
|
||||
#define T60 0xa7a879d8
|
||||
#define T61 0x4f50f3b1
|
||||
#define T62 0x9ea1e762
|
||||
#define T63 0x3d43cec5
|
@ -3,6 +3,23 @@
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
#include "sm3_const_asm.s"
|
||||
|
||||
// xorm (mem), reg
|
||||
// XOR reg into mem (the memory operand is the destination), then load the
// updated memory word back into reg, so that afterwards both mem and reg
// hold (old mem XOR old reg).
|
||||
#define xorm(P1, P2) \
|
||||
XORL P2, P1; \
|
||||
MOVL P1, P2
|
||||
|
||||
// Register aliases for the eight 32-bit working variables a..h.
// They are loaded from the state words H0..H7 before the block loop and
// XORed back into H0..H7 after the last round of each block.
#define a R8
|
||||
#define b R9
|
||||
#define c R10
|
||||
#define d R11
|
||||
#define e R12
|
||||
#define f R13
|
||||
#define g R14
|
||||
#define h DI
|
||||
|
||||
// Wt = Mt; for 0 <= t <= 3
|
||||
#define MSGSCHEDULE0(index) \
|
||||
MOVL (index*4)(SI), AX; \
|
||||
@ -148,14 +165,14 @@ TEXT ·blockAMD64(SB), 0, $288-32
|
||||
JEQ end
|
||||
|
||||
MOVQ dig+0(FP), BP
|
||||
MOVL (0*4)(BP), R8 // a = H0
|
||||
MOVL (1*4)(BP), R9 // b = H1
|
||||
MOVL (2*4)(BP), R10 // c = H2
|
||||
MOVL (3*4)(BP), R11 // d = H3
|
||||
MOVL (4*4)(BP), R12 // e = H4
|
||||
MOVL (5*4)(BP), R13 // f = H5
|
||||
MOVL (6*4)(BP), R14 // g = H6
|
||||
MOVL (7*4)(BP), DI // h = H7
|
||||
MOVL (0*4)(BP), a // a = H0
|
||||
MOVL (1*4)(BP), b // b = H1
|
||||
MOVL (2*4)(BP), c // c = H2
|
||||
MOVL (3*4)(BP), d // d = H3
|
||||
MOVL (4*4)(BP), e // e = H4
|
||||
MOVL (5*4)(BP), f // f = H5
|
||||
MOVL (6*4)(BP), g // g = H6
|
||||
MOVL (7*4)(BP), h // h = H7
|
||||
|
||||
loop:
|
||||
MOVQ SP, BP
|
||||
@ -165,91 +182,83 @@ loop:
|
||||
MSGSCHEDULE0(2)
|
||||
MSGSCHEDULE0(3)
|
||||
|
||||
SM3ROUND0(0, 0x79cc4519, R8, R9, R10, R11, R12, R13, R14, DI)
|
||||
SM3ROUND0(1, 0xf3988a32, DI, R8, R9, R10, R11, R12, R13, R14)
|
||||
SM3ROUND0(2, 0xe7311465, R14, DI, R8, R9, R10, R11, R12, R13)
|
||||
SM3ROUND0(3, 0xce6228cb, R13, R14, DI, R8, R9, R10, R11, R12)
|
||||
SM3ROUND0(4, 0x9cc45197, R12, R13, R14, DI, R8, R9, R10, R11)
|
||||
SM3ROUND0(5, 0x3988a32f, R11, R12, R13, R14, DI, R8, R9, R10)
|
||||
SM3ROUND0(6, 0x7311465e, R10, R11, R12, R13, R14, DI, R8, R9)
|
||||
SM3ROUND0(7, 0xe6228cbc, R9, R10, R11, R12, R13, R14, DI, R8)
|
||||
SM3ROUND0(8, 0xcc451979, R8, R9, R10, R11, R12, R13, R14, DI)
|
||||
SM3ROUND0(9, 0x988a32f3, DI, R8, R9, R10, R11, R12, R13, R14)
|
||||
SM3ROUND0(10, 0x311465e7, R14, DI, R8, R9, R10, R11, R12, R13)
|
||||
SM3ROUND0(11, 0x6228cbce, R13, R14, DI, R8, R9, R10, R11, R12)
|
||||
SM3ROUND0(0, T0, a, b, c, d, e, f, g, h)
|
||||
SM3ROUND0(1, T1, h, a, b, c, d, e, f, g)
|
||||
SM3ROUND0(2, T2, g, h, a, b, c, d, e, f)
|
||||
SM3ROUND0(3, T3, f, g, h, a, b, c, d, e)
|
||||
SM3ROUND0(4, T4, e, f, g, h, a, b, c, d)
|
||||
SM3ROUND0(5, T5, d, e, f, g, h, a, b, c)
|
||||
SM3ROUND0(6, T6, c, d, e, f, g, h, a, b)
|
||||
SM3ROUND0(7, T7, b, c, d, e, f, g, h, a)
|
||||
SM3ROUND0(8, T8, a, b, c, d, e, f, g, h)
|
||||
SM3ROUND0(9, T9, h, a, b, c, d, e, f, g)
|
||||
SM3ROUND0(10, T10, g, h, a, b, c, d, e, f)
|
||||
SM3ROUND0(11, T11, f, g, h, a, b, c, d, e)
|
||||
|
||||
SM3ROUND1(12, 0xc451979c, R12, R13, R14, DI, R8, R9, R10, R11)
|
||||
SM3ROUND1(13, 0x88a32f39, R11, R12, R13, R14, DI, R8, R9, R10)
|
||||
SM3ROUND1(14, 0x11465e73, R10, R11, R12, R13, R14, DI, R8, R9)
|
||||
SM3ROUND1(15, 0x228cbce6, R9, R10, R11, R12, R13, R14, DI, R8)
|
||||
SM3ROUND1(12, T12, e, f, g, h, a, b, c, d)
|
||||
SM3ROUND1(13, T13, d, e, f, g, h, a, b, c)
|
||||
SM3ROUND1(14, T14, c, d, e, f, g, h, a, b)
|
||||
SM3ROUND1(15, T15, b, c, d, e, f, g, h, a)
|
||||
|
||||
SM3ROUND2(16, 0x9d8a7a87, R8, R9, R10, R11, R12, R13, R14, DI)
|
||||
SM3ROUND2(17, 0x3b14f50f, DI, R8, R9, R10, R11, R12, R13, R14)
|
||||
SM3ROUND2(18, 0x7629ea1e, R14, DI, R8, R9, R10, R11, R12, R13)
|
||||
SM3ROUND2(19, 0xec53d43c, R13, R14, DI, R8, R9, R10, R11, R12)
|
||||
SM3ROUND2(20, 0xd8a7a879, R12, R13, R14, DI, R8, R9, R10, R11)
|
||||
SM3ROUND2(21, 0xb14f50f3, R11, R12, R13, R14, DI, R8, R9, R10)
|
||||
SM3ROUND2(22, 0x629ea1e7, R10, R11, R12, R13, R14, DI, R8, R9)
|
||||
SM3ROUND2(23, 0xc53d43ce, R9, R10, R11, R12, R13, R14, DI, R8)
|
||||
SM3ROUND2(24, 0x8a7a879d, R8, R9, R10, R11, R12, R13, R14, DI)
|
||||
SM3ROUND2(25, 0x14f50f3b, DI, R8, R9, R10, R11, R12, R13, R14)
|
||||
SM3ROUND2(26, 0x29ea1e76, R14, DI, R8, R9, R10, R11, R12, R13)
|
||||
SM3ROUND2(27, 0x53d43cec, R13, R14, DI, R8, R9, R10, R11, R12)
|
||||
SM3ROUND2(28, 0xa7a879d8, R12, R13, R14, DI, R8, R9, R10, R11)
|
||||
SM3ROUND2(29, 0x4f50f3b1, R11, R12, R13, R14, DI, R8, R9, R10)
|
||||
SM3ROUND2(30, 0x9ea1e762, R10, R11, R12, R13, R14, DI, R8, R9)
|
||||
SM3ROUND2(31, 0x3d43cec5, R9, R10, R11, R12, R13, R14, DI, R8)
|
||||
SM3ROUND2(32, 0x7a879d8a, R8, R9, R10, R11, R12, R13, R14, DI)
|
||||
SM3ROUND2(33, 0xf50f3b14, DI, R8, R9, R10, R11, R12, R13, R14)
|
||||
SM3ROUND2(34, 0xea1e7629, R14, DI, R8, R9, R10, R11, R12, R13)
|
||||
SM3ROUND2(35, 0xd43cec53, R13, R14, DI, R8, R9, R10, R11, R12)
|
||||
SM3ROUND2(36, 0xa879d8a7, R12, R13, R14, DI, R8, R9, R10, R11)
|
||||
SM3ROUND2(37, 0x50f3b14f, R11, R12, R13, R14, DI, R8, R9, R10)
|
||||
SM3ROUND2(38, 0xa1e7629e, R10, R11, R12, R13, R14, DI, R8, R9)
|
||||
SM3ROUND2(39, 0x43cec53d, R9, R10, R11, R12, R13, R14, DI, R8)
|
||||
SM3ROUND2(40, 0x879d8a7a, R8, R9, R10, R11, R12, R13, R14, DI)
|
||||
SM3ROUND2(41, 0xf3b14f5, DI, R8, R9, R10, R11, R12, R13, R14)
|
||||
SM3ROUND2(42, 0x1e7629ea, R14, DI, R8, R9, R10, R11, R12, R13)
|
||||
SM3ROUND2(43, 0x3cec53d4, R13, R14, DI, R8, R9, R10, R11, R12)
|
||||
SM3ROUND2(44, 0x79d8a7a8, R12, R13, R14, DI, R8, R9, R10, R11)
|
||||
SM3ROUND2(45, 0xf3b14f50, R11, R12, R13, R14, DI, R8, R9, R10)
|
||||
SM3ROUND2(46, 0xe7629ea1, R10, R11, R12, R13, R14, DI, R8, R9)
|
||||
SM3ROUND2(47, 0xcec53d43, R9, R10, R11, R12, R13, R14, DI, R8)
|
||||
SM3ROUND2(48, 0x9d8a7a87, R8, R9, R10, R11, R12, R13, R14, DI)
|
||||
SM3ROUND2(49, 0x3b14f50f, DI, R8, R9, R10, R11, R12, R13, R14)
|
||||
SM3ROUND2(50, 0x7629ea1e, R14, DI, R8, R9, R10, R11, R12, R13)
|
||||
SM3ROUND2(51, 0xec53d43c, R13, R14, DI, R8, R9, R10, R11, R12)
|
||||
SM3ROUND2(52, 0xd8a7a879, R12, R13, R14, DI, R8, R9, R10, R11)
|
||||
SM3ROUND2(53, 0xb14f50f3, R11, R12, R13, R14, DI, R8, R9, R10)
|
||||
SM3ROUND2(54, 0x629ea1e7, R10, R11, R12, R13, R14, DI, R8, R9)
|
||||
SM3ROUND2(55, 0xc53d43ce, R9, R10, R11, R12, R13, R14, DI, R8)
|
||||
SM3ROUND2(56, 0x8a7a879d, R8, R9, R10, R11, R12, R13, R14, DI)
|
||||
SM3ROUND2(57, 0x14f50f3b, DI, R8, R9, R10, R11, R12, R13, R14)
|
||||
SM3ROUND2(58, 0x29ea1e76, R14, DI, R8, R9, R10, R11, R12, R13)
|
||||
SM3ROUND2(59, 0x53d43cec, R13, R14, DI, R8, R9, R10, R11, R12)
|
||||
SM3ROUND2(60, 0xa7a879d8, R12, R13, R14, DI, R8, R9, R10, R11)
|
||||
SM3ROUND2(61, 0x4f50f3b1, R11, R12, R13, R14, DI, R8, R9, R10)
|
||||
SM3ROUND2(62, 0x9ea1e762, R10, R11, R12, R13, R14, DI, R8, R9)
|
||||
SM3ROUND2(63, 0x3d43cec5, R9, R10, R11, R12, R13, R14, DI, R8)
|
||||
SM3ROUND2(16, T16, a, b, c, d, e, f, g, h)
|
||||
SM3ROUND2(17, T17, h, a, b, c, d, e, f, g)
|
||||
SM3ROUND2(18, T18, g, h, a, b, c, d, e, f)
|
||||
SM3ROUND2(19, T19, f, g, h, a, b, c, d, e)
|
||||
SM3ROUND2(20, T20, e, f, g, h, a, b, c, d)
|
||||
SM3ROUND2(21, T21, d, e, f, g, h, a, b, c)
|
||||
SM3ROUND2(22, T22, c, d, e, f, g, h, a, b)
|
||||
SM3ROUND2(23, T23, b, c, d, e, f, g, h, a)
|
||||
SM3ROUND2(24, T24, a, b, c, d, e, f, g, h)
|
||||
SM3ROUND2(25, T25, h, a, b, c, d, e, f, g)
|
||||
SM3ROUND2(26, T26, g, h, a, b, c, d, e, f)
|
||||
SM3ROUND2(27, T27, f, g, h, a, b, c, d, e)
|
||||
SM3ROUND2(28, T28, e, f, g, h, a, b, c, d)
|
||||
SM3ROUND2(29, T29, d, e, f, g, h, a, b, c)
|
||||
SM3ROUND2(30, T30, c, d, e, f, g, h, a, b)
|
||||
SM3ROUND2(31, T31, b, c, d, e, f, g, h, a)
|
||||
SM3ROUND2(32, T32, a, b, c, d, e, f, g, h)
|
||||
SM3ROUND2(33, T33, h, a, b, c, d, e, f, g)
|
||||
SM3ROUND2(34, T34, g, h, a, b, c, d, e, f)
|
||||
SM3ROUND2(35, T35, f, g, h, a, b, c, d, e)
|
||||
SM3ROUND2(36, T36, e, f, g, h, a, b, c, d)
|
||||
SM3ROUND2(37, T37, d, e, f, g, h, a, b, c)
|
||||
SM3ROUND2(38, T38, c, d, e, f, g, h, a, b)
|
||||
SM3ROUND2(39, T39, b, c, d, e, f, g, h, a)
|
||||
SM3ROUND2(40, T40, a, b, c, d, e, f, g, h)
|
||||
SM3ROUND2(41, T41, h, a, b, c, d, e, f, g)
|
||||
SM3ROUND2(42, T42, g, h, a, b, c, d, e, f)
|
||||
SM3ROUND2(43, T43, f, g, h, a, b, c, d, e)
|
||||
SM3ROUND2(44, T44, e, f, g, h, a, b, c, d)
|
||||
SM3ROUND2(45, T45, d, e, f, g, h, a, b, c)
|
||||
SM3ROUND2(46, T46, c, d, e, f, g, h, a, b)
|
||||
SM3ROUND2(47, T47, b, c, d, e, f, g, h, a)
|
||||
SM3ROUND2(48, T48, a, b, c, d, e, f, g, h)
|
||||
SM3ROUND2(49, T49, h, a, b, c, d, e, f, g)
|
||||
SM3ROUND2(50, T50, g, h, a, b, c, d, e, f)
|
||||
SM3ROUND2(51, T51, f, g, h, a, b, c, d, e)
|
||||
SM3ROUND2(52, T52, e, f, g, h, a, b, c, d)
|
||||
SM3ROUND2(53, T53, d, e, f, g, h, a, b, c)
|
||||
SM3ROUND2(54, T54, c, d, e, f, g, h, a, b)
|
||||
SM3ROUND2(55, T55, b, c, d, e, f, g, h, a)
|
||||
SM3ROUND2(56, T56, a, b, c, d, e, f, g, h)
|
||||
SM3ROUND2(57, T57, h, a, b, c, d, e, f, g)
|
||||
SM3ROUND2(58, T58, g, h, a, b, c, d, e, f)
|
||||
SM3ROUND2(59, T59, f, g, h, a, b, c, d, e)
|
||||
SM3ROUND2(60, T60, e, f, g, h, a, b, c, d)
|
||||
SM3ROUND2(61, T61, d, e, f, g, h, a, b, c)
|
||||
SM3ROUND2(62, T62, c, d, e, f, g, h, a, b)
|
||||
SM3ROUND2(63, T63, b, c, d, e, f, g, h, a)
|
||||
|
||||
MOVQ dig+0(FP), BP
|
||||
MOVQ dig+0(FP), BP // reload the state pointer (first argument, same name as the initial load); BP was overwritten with SP at the top of the loop
|
||||
|
||||
XORL (0*4)(BP), R8 // H0 = a XOR H0
|
||||
MOVL R8, (0*4)(BP)
|
||||
XORL (1*4)(BP), R9 // H1 = b XOR H1
|
||||
MOVL R9, (1*4)(BP)
|
||||
XORL (2*4)(BP), R10 // H2 = c XOR H2
|
||||
MOVL R10, (2*4)(BP)
|
||||
XORL (3*4)(BP), R11 // H3 = d XOR H3
|
||||
MOVL R11, (3*4)(BP)
|
||||
XORL (4*4)(BP), R12 // H4 = e XOR H4
|
||||
MOVL R12, (4*4)(BP)
|
||||
XORL (5*4)(BP), R13 // H5 = f XOR H5
|
||||
MOVL R13, (5*4)(BP)
|
||||
XORL (6*4)(BP), R14 // H6 = g XOR H6
|
||||
MOVL R14, (6*4)(BP)
|
||||
XORL (7*4)(BP), DI // H7 = h XOR H7
|
||||
MOVL DI, (7*4)(BP)
|
||||
xorm( 0(BP), a)
|
||||
xorm( 4(BP), b)
|
||||
xorm( 8(BP), c)
|
||||
xorm( 12(BP), d)
|
||||
xorm( 16(BP), e)
|
||||
xorm( 20(BP), f)
|
||||
xorm( 24(BP), g)
|
||||
xorm( 28(BP), h)
|
||||
|
||||
ADDQ $64, SI
|
||||
CMPQ SI, 272(SP)
|
||||
|
@ -3,6 +3,8 @@
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
#include "sm3_const_asm.s"
|
||||
|
||||
#define XWORD0 V0
|
||||
#define XWORD1 V1
|
||||
#define XWORD2 V2
|
||||
@ -399,117 +401,117 @@ loop:
|
||||
schedule_compress: // for w0 - w47
|
||||
// Do 4 rounds and scheduling
|
||||
VEOR XWORD0.B16, XWORD1.B16, Wt.B16
|
||||
ROUND_AND_SCHED_N_0_0(0*16, 0x79cc4519, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_0_1(0*16, 0xf3988a32, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_0_2(0*16, 0xe7311465, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_0_3(0*16, 0xce6228cb, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_0_0(0*16, T0, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_0_1(0*16, T1, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_0_2(0*16, T2, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_0_3(0*16, T3, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VEOR XWORD1.B16, XWORD2.B16, Wt.B16
|
||||
ROUND_AND_SCHED_N_0_0(0*16, 0x9cc45197, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
ROUND_AND_SCHED_N_0_1(0*16, 0x3988a32f, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
ROUND_AND_SCHED_N_0_2(0*16, 0x7311465e, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
ROUND_AND_SCHED_N_0_3(0*16, 0xe6228cbc, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
ROUND_AND_SCHED_N_0_0(0*16, T4, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
ROUND_AND_SCHED_N_0_1(0*16, T5, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
ROUND_AND_SCHED_N_0_2(0*16, T6, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
ROUND_AND_SCHED_N_0_3(0*16, T7, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VEOR XWORD2.B16, XWORD3.B16, Wt.B16
|
||||
ROUND_AND_SCHED_N_0_0(0*16, 0xcc451979, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
ROUND_AND_SCHED_N_0_1(0*16, 0x988a32f3, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
ROUND_AND_SCHED_N_0_2(0*16, 0x311465e7, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
ROUND_AND_SCHED_N_0_3(0*16, 0x6228cbce, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
ROUND_AND_SCHED_N_0_0(0*16, T8, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
ROUND_AND_SCHED_N_0_1(0*16, T9, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
ROUND_AND_SCHED_N_0_2(0*16, T10, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
ROUND_AND_SCHED_N_0_3(0*16, T11, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VEOR XWORD3.B16, XWORD0.B16, Wt.B16
|
||||
ROUND_AND_SCHED_N_0_0(0*16, 0xc451979c, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
ROUND_AND_SCHED_N_0_1(0*16, 0x88a32f39, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
ROUND_AND_SCHED_N_0_2(0*16, 0x11465e73, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
ROUND_AND_SCHED_N_0_3(0*16, 0x228cbce6, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
ROUND_AND_SCHED_N_0_0(0*16, T12, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
ROUND_AND_SCHED_N_0_1(0*16, T13, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
ROUND_AND_SCHED_N_0_2(0*16, T14, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
ROUND_AND_SCHED_N_0_3(0*16, T15, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VEOR XWORD0.B16, XWORD1.B16, Wt.B16
|
||||
ROUND_AND_SCHED_N_1_0(0*16, 0x9d8a7a87, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_1_1(0*16, 0x3b14f50f, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_1_2(0*16, 0x7629ea1e, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_1_3(0*16, 0xec53d43c, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_1_0(0*16, T16, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_1_1(0*16, T17, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_1_2(0*16, T18, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_1_3(0*16, T19, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VEOR XWORD1.B16, XWORD2.B16, Wt.B16
|
||||
ROUND_AND_SCHED_N_1_0(0*16, 0xd8a7a879, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
ROUND_AND_SCHED_N_1_1(0*16, 0xb14f50f3, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
ROUND_AND_SCHED_N_1_2(0*16, 0x629ea1e7, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
ROUND_AND_SCHED_N_1_3(0*16, 0xc53d43ce, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
ROUND_AND_SCHED_N_1_0(0*16, T20, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
ROUND_AND_SCHED_N_1_1(0*16, T21, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
ROUND_AND_SCHED_N_1_2(0*16, T22, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
ROUND_AND_SCHED_N_1_3(0*16, T23, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VEOR XWORD2.B16, XWORD3.B16, Wt.B16
|
||||
ROUND_AND_SCHED_N_1_0(0*16, 0x8a7a879d, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
ROUND_AND_SCHED_N_1_1(0*16, 0x14f50f3b, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
ROUND_AND_SCHED_N_1_2(0*16, 0x29ea1e76, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
ROUND_AND_SCHED_N_1_3(0*16, 0x53d43cec, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
ROUND_AND_SCHED_N_1_0(0*16, T24, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
ROUND_AND_SCHED_N_1_1(0*16, T25, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
ROUND_AND_SCHED_N_1_2(0*16, T26, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
ROUND_AND_SCHED_N_1_3(0*16, T27, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VEOR XWORD3.B16, XWORD0.B16, Wt.B16
|
||||
ROUND_AND_SCHED_N_1_0(0*16, 0xa7a879d8, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
ROUND_AND_SCHED_N_1_1(0*16, 0x4f50f3b1, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
ROUND_AND_SCHED_N_1_2(0*16, 0x9ea1e762, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
ROUND_AND_SCHED_N_1_3(0*16, 0x3d43cec5, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
ROUND_AND_SCHED_N_1_0(0*16, T28, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
ROUND_AND_SCHED_N_1_1(0*16, T29, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
ROUND_AND_SCHED_N_1_2(0*16, T30, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
ROUND_AND_SCHED_N_1_3(0*16, T31, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VEOR XWORD0.B16, XWORD1.B16, Wt.B16
|
||||
ROUND_AND_SCHED_N_1_0(0*16, 0x7a879d8a, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_1_1(0*16, 0xf50f3b14, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_1_2(0*16, 0xea1e7629, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_1_3(0*16, 0xd43cec53, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_1_0(0*16, T32, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_1_1(0*16, T33, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_1_2(0*16, T34, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_1_3(0*16, T35, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VEOR XWORD1.B16, XWORD2.B16, Wt.B16
|
||||
ROUND_AND_SCHED_N_1_0(0*16, 0xa879d8a7, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
ROUND_AND_SCHED_N_1_1(0*16, 0x50f3b14f, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
ROUND_AND_SCHED_N_1_2(0*16, 0xa1e7629e, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
ROUND_AND_SCHED_N_1_3(0*16, 0x43cec53d, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
ROUND_AND_SCHED_N_1_0(0*16, T36, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
ROUND_AND_SCHED_N_1_1(0*16, T37, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
ROUND_AND_SCHED_N_1_2(0*16, T38, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
ROUND_AND_SCHED_N_1_3(0*16, T39, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VEOR XWORD2.B16, XWORD3.B16, Wt.B16
|
||||
ROUND_AND_SCHED_N_1_0(0*16, 0x879d8a7a, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
ROUND_AND_SCHED_N_1_1(0*16, 0xf3b14f5, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
ROUND_AND_SCHED_N_1_2(0*16, 0x1e7629ea, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
ROUND_AND_SCHED_N_1_3(0*16, 0x3cec53d4, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
ROUND_AND_SCHED_N_1_0(0*16, T40, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
ROUND_AND_SCHED_N_1_1(0*16, T41, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
ROUND_AND_SCHED_N_1_2(0*16, T42, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
ROUND_AND_SCHED_N_1_3(0*16, T43, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VEOR XWORD3.B16, XWORD0.B16, Wt.B16
|
||||
ROUND_AND_SCHED_N_1_0(0*16, 0x79d8a7a8, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
ROUND_AND_SCHED_N_1_1(0*16, 0xf3b14f50, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
ROUND_AND_SCHED_N_1_2(0*16, 0xe7629ea1, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
ROUND_AND_SCHED_N_1_3(0*16, 0xcec53d43, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
ROUND_AND_SCHED_N_1_0(0*16, T44, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
ROUND_AND_SCHED_N_1_1(0*16, T45, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
ROUND_AND_SCHED_N_1_2(0*16, T46, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
ROUND_AND_SCHED_N_1_3(0*16, T47, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
|
||||
|
||||
// w48 - w63 processed with only 4 rounds scheduling (last 16 rounds)
|
||||
// Do 4 rounds and scheduling
|
||||
VEOR XWORD0.B16, XWORD1.B16, Wt.B16
|
||||
ROUND_AND_SCHED_N_1_0(0*16, 0x9d8a7a87, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_1_1(0*16, 0x3b14f50f, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_1_2(0*16, 0x7629ea1e, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_1_3(0*16, 0xec53d43c, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_1_0(0*16, T48, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_1_1(0*16, T49, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_1_2(0*16, T50, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
ROUND_AND_SCHED_N_1_3(0*16, T51, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
|
||||
|
||||
// w52 - w63 processed with no scheduling (last 12 rounds)
|
||||
// Do 4 rounds
|
||||
VEOR XWORD1.B16, XWORD2.B16, Wt.B16
|
||||
DO_ROUND_N_1(0*16, 0, 0xd8a7a879, e, f, g, h, a, b, c, d, XWORD1, Wt)
|
||||
DO_ROUND_N_1(0*16, 1, 0xb14f50f3, d, e, f, g, h, a, b, c, XWORD1, Wt)
|
||||
DO_ROUND_N_1(0*16, 2, 0x629ea1e7, c, d, e, f, g, h, a, b, XWORD1, Wt)
|
||||
DO_ROUND_N_1(0*16, 3, 0xc53d43ce, b, c, d, e, f, g, h, a, XWORD1, Wt)
|
||||
DO_ROUND_N_1(0*16, 0, T52, e, f, g, h, a, b, c, d, XWORD1, Wt)
|
||||
DO_ROUND_N_1(0*16, 1, T53, d, e, f, g, h, a, b, c, XWORD1, Wt)
|
||||
DO_ROUND_N_1(0*16, 2, T54, c, d, e, f, g, h, a, b, XWORD1, Wt)
|
||||
DO_ROUND_N_1(0*16, 3, T55, b, c, d, e, f, g, h, a, XWORD1, Wt)
|
||||
|
||||
// Do 4 rounds
|
||||
VEOR XWORD2.B16, XWORD3.B16, Wt.B16
|
||||
DO_ROUND_N_1(0*16, 0, 0x8a7a879d, a, b, c, d, e, f, g, h, XWORD2, Wt)
|
||||
DO_ROUND_N_1(0*16, 1, 0x14f50f3b, h, a, b, c, d, e, f, g, XWORD2, Wt)
|
||||
DO_ROUND_N_1(0*16, 2, 0x29ea1e76, g, h, a, b, c, d, e, f, XWORD2, Wt)
|
||||
DO_ROUND_N_1(0*16, 3, 0x53d43cec, f, g, h, a, b, c, d, e, XWORD2, Wt)
|
||||
DO_ROUND_N_1(0*16, 0, T56, a, b, c, d, e, f, g, h, XWORD2, Wt)
|
||||
DO_ROUND_N_1(0*16, 1, T57, h, a, b, c, d, e, f, g, XWORD2, Wt)
|
||||
DO_ROUND_N_1(0*16, 2, T58, g, h, a, b, c, d, e, f, XWORD2, Wt)
|
||||
DO_ROUND_N_1(0*16, 3, T59, f, g, h, a, b, c, d, e, XWORD2, Wt)
|
||||
|
||||
// Do 4 rounds
|
||||
VEOR XWORD3.B16, XWORD0.B16, Wt.B16
|
||||
DO_ROUND_N_1(0*16, 0, 0xa7a879d8, e, f, g, h, a, b, c, d, XWORD3, Wt)
|
||||
DO_ROUND_N_1(0*16, 1, 0x4f50f3b1, d, e, f, g, h, a, b, c, XWORD3, Wt)
|
||||
DO_ROUND_N_1(0*16, 2, 0x9ea1e762, c, d, e, f, g, h, a, b, XWORD3, Wt)
|
||||
DO_ROUND_N_1(0*16, 3, 0x3d43cec5, b, c, d, e, f, g, h, a, XWORD3, Wt)
|
||||
DO_ROUND_N_1(0*16, 0, T60, e, f, g, h, a, b, c, d, XWORD3, Wt)
|
||||
DO_ROUND_N_1(0*16, 1, T61, d, e, f, g, h, a, b, c, XWORD3, Wt)
|
||||
DO_ROUND_N_1(0*16, 2, T62, c, d, e, f, g, h, a, b, XWORD3, Wt)
|
||||
DO_ROUND_N_1(0*16, 3, T63, b, c, d, e, f, g, h, a, XWORD3, Wt)
|
||||
|
||||
EORW a1, a // H0 = a XOR H0
|
||||
EORW b1, b // H1 = b XOR H1
|
||||
|
@ -3,6 +3,8 @@
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
#include "sm3_const_asm.s"
|
||||
|
||||
// Definitions for AVX2 version
|
||||
|
||||
// xorm (mem), reg
|
||||
@ -478,37 +480,37 @@ avx2_schedule_compress: // for w0 - w47
|
||||
VMOVDQU XDWORD0, (_XFER + 0*32)(SP)(SRND*1)
|
||||
VPXOR XDWORD0, XDWORD1, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
|
||||
ROUND_AND_SCHED_N_0_0(_XFER + 0*32, 0x79cc4519, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_0_1(_XFER + 0*32, 0xf3988a32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_0_2(_XFER + 0*32, 0xe7311465, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_0_3(_XFER + 0*32, 0xce6228cb, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_0_0(_XFER + 0*32, T0, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_0_1(_XFER + 0*32, T1, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_0_2(_XFER + 0*32, T2, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_0_3(_XFER + 0*32, T3, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XDWORD1, (_XFER + 2*32)(SP)(SRND*1)
|
||||
VPXOR XDWORD1, XDWORD2, XFER
|
||||
VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
|
||||
ROUND_AND_SCHED_N_0_0(_XFER + 2*32, 0x9cc45197, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
ROUND_AND_SCHED_N_0_1(_XFER + 2*32, 0x3988a32f, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
ROUND_AND_SCHED_N_0_2(_XFER + 2*32, 0x7311465e, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
ROUND_AND_SCHED_N_0_3(_XFER + 2*32, 0xe6228cbc, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
ROUND_AND_SCHED_N_0_0(_XFER + 2*32, T4, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
ROUND_AND_SCHED_N_0_1(_XFER + 2*32, T5, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
ROUND_AND_SCHED_N_0_2(_XFER + 2*32, T6, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
ROUND_AND_SCHED_N_0_3(_XFER + 2*32, T7, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XDWORD2, (_XFER + 4*32)(SP)(SRND*1)
|
||||
VPXOR XDWORD2, XDWORD3, XFER
|
||||
VMOVDQU XFER, (_XFER + 5*32)(SP)(SRND*1)
|
||||
ROUND_AND_SCHED_N_0_0(_XFER + 4*32, 0xcc451979, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
ROUND_AND_SCHED_N_0_1(_XFER + 4*32, 0x988a32f3, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
ROUND_AND_SCHED_N_0_2(_XFER + 4*32, 0x311465e7, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
ROUND_AND_SCHED_N_0_3(_XFER + 4*32, 0x6228cbce, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
ROUND_AND_SCHED_N_0_0(_XFER + 4*32, T8, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
ROUND_AND_SCHED_N_0_1(_XFER + 4*32, T9, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
ROUND_AND_SCHED_N_0_2(_XFER + 4*32, T10, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
ROUND_AND_SCHED_N_0_3(_XFER + 4*32, T11, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XDWORD3, (_XFER + 6*32)(SP)(SRND*1)
|
||||
VPXOR XDWORD3, XDWORD0, XFER
|
||||
VMOVDQU XFER, (_XFER + 7*32)(SP)(SRND*1)
|
||||
ROUND_AND_SCHED_N_0_0(_XFER + 6*32, 0xc451979c, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
ROUND_AND_SCHED_N_0_1(_XFER + 6*32, 0x88a32f39, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
ROUND_AND_SCHED_N_0_2(_XFER + 6*32, 0x11465e73, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
ROUND_AND_SCHED_N_0_3(_XFER + 6*32, 0x228cbce6, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
ROUND_AND_SCHED_N_0_0(_XFER + 6*32, T12, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
ROUND_AND_SCHED_N_0_1(_XFER + 6*32, T13, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
ROUND_AND_SCHED_N_0_2(_XFER + 6*32, T14, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
ROUND_AND_SCHED_N_0_3(_XFER + 6*32, T15, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
|
||||
ADDQ $8*32, SRND
|
||||
|
||||
@ -516,37 +518,37 @@ avx2_schedule_compress: // for w0 - w47
|
||||
VMOVDQU XDWORD0, (_XFER + 0*32)(SP)(SRND*1)
|
||||
VPXOR XDWORD0, XDWORD1, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER + 0*32, 0x9d8a7a87, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER + 0*32, 0x3b14f50f, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER + 0*32, 0x7629ea1e, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER + 0*32, 0xec53d43c, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER + 0*32, T16, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER + 0*32, T17, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER + 0*32, T18, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER + 0*32, T19, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XDWORD1, (_XFER + 2*32)(SP)(SRND*1)
|
||||
VPXOR XDWORD1, XDWORD2, XFER
|
||||
VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER + 2*32, 0xd8a7a879, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER + 2*32, 0xb14f50f3, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER + 2*32, 0x629ea1e7, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER + 2*32, 0xc53d43ce, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER + 2*32, T20, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER + 2*32, T21, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER + 2*32, T22, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER + 2*32, T23, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XDWORD2, (_XFER + 4*32)(SP)(SRND*1)
|
||||
VPXOR XDWORD2, XDWORD3, XFER
|
||||
VMOVDQU XFER, (_XFER + 5*32)(SP)(SRND*1)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER + 4*32, 0x8a7a879d, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER + 4*32, 0x14f50f3b, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER + 4*32, 0x29ea1e76, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER + 4*32, 0x53d43cec, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER + 4*32, T24, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER + 4*32, T25, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER + 4*32, T26, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER + 4*32, T27, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XDWORD3, (_XFER + 6*32)(SP)(SRND*1)
|
||||
VPXOR XDWORD3, XDWORD0, XFER
|
||||
VMOVDQU XFER, (_XFER + 7*32)(SP)(SRND*1)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER + 6*32, 0xa7a879d8, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER + 6*32, 0x4f50f3b1, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER + 6*32, 0x9ea1e762, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER + 6*32, 0x3d43cec5, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER + 6*32, T28, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER + 6*32, T29, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER + 6*32, T30, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER + 6*32, T31, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
|
||||
ADDQ $8*32, SRND
|
||||
|
||||
@ -554,37 +556,37 @@ avx2_schedule_compress: // for w0 - w47
|
||||
VMOVDQU XDWORD0, (_XFER + 0*32)(SP)(SRND*1)
|
||||
VPXOR XDWORD0, XDWORD1, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER + 0*32, 0x7a879d8a, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER + 0*32, 0xf50f3b14, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER + 0*32, 0xea1e7629, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER + 0*32, 0xd43cec53, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER + 0*32, T32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER + 0*32, T33, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER + 0*32, T34, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER + 0*32, T35, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XDWORD1, (_XFER + 2*32)(SP)(SRND*1)
|
||||
VPXOR XDWORD1, XDWORD2, XFER
|
||||
VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER + 2*32, 0xa879d8a7, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER + 2*32, 0x50f3b14f, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER + 2*32, 0xa1e7629e, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER + 2*32, 0x43cec53d, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER + 2*32, T36, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER + 2*32, T37, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER + 2*32, T38, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER + 2*32, T39, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XDWORD2, (_XFER + 4*32)(SP)(SRND*1)
|
||||
VPXOR XDWORD2, XDWORD3, XFER
|
||||
VMOVDQU XFER, (_XFER + 5*32)(SP)(SRND*1)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER + 4*32, 0x879d8a7a, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER + 4*32, 0xf3b14f5, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER + 4*32, 0x1e7629ea, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER + 4*32, 0x3cec53d4, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER + 4*32, T40, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER + 4*32, T41, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER + 4*32, T42, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER + 4*32, T43, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XDWORD3, (_XFER + 6*32)(SP)(SRND*1)
|
||||
VPXOR XDWORD3, XDWORD0, XFER
|
||||
VMOVDQU XFER, (_XFER + 7*32)(SP)(SRND*1)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER + 6*32, 0x79d8a7a8, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER + 6*32, 0xf3b14f50, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER + 6*32, 0xe7629ea1, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER + 6*32, 0xcec53d43, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER + 6*32, T44, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER + 6*32, T45, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER + 6*32, T46, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER + 6*32, T47, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
|
||||
ADDQ $8*32, SRND
|
||||
|
||||
@ -593,38 +595,38 @@ avx2_schedule_compress: // for w0 - w47
|
||||
VMOVDQU XDWORD0, (_XFER + 0*32)(SP)(SRND*1)
|
||||
VPXOR XDWORD0, XDWORD1, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER + 0*32, 0x9d8a7a87, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER + 0*32, 0x3b14f50f, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER + 0*32, 0x7629ea1e, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER + 0*32, 0xec53d43c, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER + 0*32, T48, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER + 0*32, T49, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER + 0*32, T50, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER + 0*32, T51, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
|
||||
// w52 - w63 processed with no scheduling (last 12 rounds)
|
||||
// Do 4 rounds
|
||||
VMOVDQU XDWORD1, (_XFER + 2*32)(SP)(SRND*1)
|
||||
VPXOR XDWORD1, XDWORD2, XFER
|
||||
VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
|
||||
DO_ROUND_N_1(_XFER + 2*32, 0, 0xd8a7a879, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_1(_XFER + 2*32, 1, 0xb14f50f3, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_1(_XFER + 2*32, 2, 0x629ea1e7, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_1(_XFER + 2*32, 3, 0xc53d43ce, b, c, d, e, f, g, h, a)
|
||||
DO_ROUND_N_1(_XFER + 2*32, 0, T52, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_1(_XFER + 2*32, 1, T53, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_1(_XFER + 2*32, 2, T54, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_1(_XFER + 2*32, 3, T55, b, c, d, e, f, g, h, a)
|
||||
|
||||
// Do 4 rounds
|
||||
VMOVDQU XDWORD2, (_XFER + 4*32)(SP)(SRND*1)
|
||||
VPXOR XDWORD2, XDWORD3, XFER
|
||||
VMOVDQU XFER, (_XFER + 5*32)(SP)(SRND*1)
|
||||
DO_ROUND_N_1(_XFER + 4*32, 0, 0x8a7a879d, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND_N_1(_XFER + 4*32, 1, 0x14f50f3b, h, a, b, c, d, e, f, g)
|
||||
DO_ROUND_N_1(_XFER + 4*32, 2, 0x29ea1e76, g, h, a, b, c, d, e, f)
|
||||
DO_ROUND_N_1(_XFER + 4*32, 3, 0x53d43cec, f, g, h, a, b, c, d, e)
|
||||
DO_ROUND_N_1(_XFER + 4*32, 0, T56, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND_N_1(_XFER + 4*32, 1, T57, h, a, b, c, d, e, f, g)
|
||||
DO_ROUND_N_1(_XFER + 4*32, 2, T58, g, h, a, b, c, d, e, f)
|
||||
DO_ROUND_N_1(_XFER + 4*32, 3, T59, f, g, h, a, b, c, d, e)
|
||||
|
||||
// Do 4 rounds
|
||||
VMOVDQU XDWORD3, (_XFER + 6*32)(SP)(SRND*1)
|
||||
VPXOR XDWORD3, XDWORD0, XFER
|
||||
VMOVDQU XFER, (_XFER + 7*32)(SP)(SRND*1)
|
||||
DO_ROUND_N_1(_XFER + 6*32, 0, 0xa7a879d8, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_1(_XFER + 6*32, 1, 0x4f50f3b1, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_1(_XFER + 6*32, 2, 0x9ea1e762, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_1(_XFER + 6*32, 3, 0x3d43cec5, b, c, d, e, f, g, h, a)
|
||||
DO_ROUND_N_1(_XFER + 6*32, 0, T60, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_1(_XFER + 6*32, 1, T61, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_1(_XFER + 6*32, 2, T62, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_1(_XFER + 6*32, 3, T63, b, c, d, e, f, g, h, a)
|
||||
|
||||
MOVQ dig+0(FP), CTX // d.h[8]
|
||||
MOVQ _INP(SP), INP
|
||||
@ -644,91 +646,91 @@ avx2_schedule_compress: // for w0 - w47
|
||||
XORQ SRND, SRND
|
||||
|
||||
avx2_compress: // Do second block using previously scheduled results
|
||||
DO_ROUND_N_0(_XFER + 0*32 + 16, 0, 0x79cc4519, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND_N_0(_XFER + 0*32 + 16, 1, 0xf3988a32, h, a, b, c, d, e, f, g)
|
||||
DO_ROUND_N_0(_XFER + 0*32 + 16, 2, 0xe7311465, g, h, a, b, c, d, e, f)
|
||||
DO_ROUND_N_0(_XFER + 0*32 + 16, 3, 0xce6228cb, f, g, h, a, b, c, d, e)
|
||||
DO_ROUND_N_0(_XFER + 0*32 + 16, 0, T0, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND_N_0(_XFER + 0*32 + 16, 1, T1, h, a, b, c, d, e, f, g)
|
||||
DO_ROUND_N_0(_XFER + 0*32 + 16, 2, T2, g, h, a, b, c, d, e, f)
|
||||
DO_ROUND_N_0(_XFER + 0*32 + 16, 3, T3, f, g, h, a, b, c, d, e)
|
||||
|
||||
DO_ROUND_N_0(_XFER + 2*32 + 16, 0, 0x9cc45197, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_0(_XFER + 2*32 + 16, 1, 0x3988a32f, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_0(_XFER + 2*32 + 16, 2, 0x7311465e, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_0(_XFER + 2*32 + 16, 3, 0xe6228cbc, b, c, d, e, f, g, h, a)
|
||||
DO_ROUND_N_0(_XFER + 2*32 + 16, 0, T4, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_0(_XFER + 2*32 + 16, 1, T5, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_0(_XFER + 2*32 + 16, 2, T6, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_0(_XFER + 2*32 + 16, 3, T7, b, c, d, e, f, g, h, a)
|
||||
|
||||
DO_ROUND_N_0(_XFER + 4*32 + 16, 0, 0xcc451979, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND_N_0(_XFER + 4*32 + 16, 1, 0x988a32f3, h, a, b, c, d, e, f, g)
|
||||
DO_ROUND_N_0(_XFER + 4*32 + 16, 2, 0x311465e7, g, h, a, b, c, d, e, f)
|
||||
DO_ROUND_N_0(_XFER + 4*32 + 16, 3, 0x6228cbce, f, g, h, a, b, c, d, e)
|
||||
DO_ROUND_N_0(_XFER + 4*32 + 16, 0, T8, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND_N_0(_XFER + 4*32 + 16, 1, T9, h, a, b, c, d, e, f, g)
|
||||
DO_ROUND_N_0(_XFER + 4*32 + 16, 2, T10, g, h, a, b, c, d, e, f)
|
||||
DO_ROUND_N_0(_XFER + 4*32 + 16, 3, T11, f, g, h, a, b, c, d, e)
|
||||
|
||||
DO_ROUND_N_0(_XFER + 6*32 + 16, 0, 0xc451979c, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_0(_XFER + 6*32 + 16, 1, 0x88a32f39, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_0(_XFER + 6*32 + 16, 2, 0x11465e73, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_0(_XFER + 6*32 + 16, 3, 0x228cbce6, b, c, d, e, f, g, h, a)
|
||||
DO_ROUND_N_0(_XFER + 6*32 + 16, 0, T12, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_0(_XFER + 6*32 + 16, 1, T13, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_0(_XFER + 6*32 + 16, 2, T14, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_0(_XFER + 6*32 + 16, 3, T15, b, c, d, e, f, g, h, a)
|
||||
|
||||
ADDQ $8*32, SRND
|
||||
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 0, 0x9d8a7a87, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 1, 0x3b14f50f, h, a, b, c, d, e, f, g)
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 2, 0x7629ea1e, g, h, a, b, c, d, e, f)
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 3, 0xec53d43c, f, g, h, a, b, c, d, e)
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 0, T16, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 1, T17, h, a, b, c, d, e, f, g)
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 2, T18, g, h, a, b, c, d, e, f)
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 3, T19, f, g, h, a, b, c, d, e)
|
||||
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 0, 0xd8a7a879, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 1, 0xb14f50f3, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 2, 0x629ea1e7, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 3, 0xc53d43ce, b, c, d, e, f, g, h, a)
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 0, T20, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 1, T21, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 2, T22, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 3, T23, b, c, d, e, f, g, h, a)
|
||||
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 0, 0x8a7a879d, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 1, 0x14f50f3b, h, a, b, c, d, e, f, g)
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 2, 0x29ea1e76, g, h, a, b, c, d, e, f)
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 3, 0x53d43cec, f, g, h, a, b, c, d, e)
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 0, T24, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 1, T25, h, a, b, c, d, e, f, g)
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 2, T26, g, h, a, b, c, d, e, f)
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 3, T27, f, g, h, a, b, c, d, e)
|
||||
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 0, 0xa7a879d8, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 1, 0x4f50f3b1, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 2, 0x9ea1e762, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 3, 0x3d43cec5, b, c, d, e, f, g, h, a)
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 0, T28, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 1, T29, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 2, T30, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 3, T31, b, c, d, e, f, g, h, a)
|
||||
|
||||
ADDQ $8*32, SRND
|
||||
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 0, 0x7a879d8a, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 1, 0xf50f3b14, h, a, b, c, d, e, f, g)
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 2, 0xea1e7629, g, h, a, b, c, d, e, f)
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 3, 0xd43cec53, f, g, h, a, b, c, d, e)
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 0, T32, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 1, T33, h, a, b, c, d, e, f, g)
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 2, T34, g, h, a, b, c, d, e, f)
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 3, T35, f, g, h, a, b, c, d, e)
|
||||
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 0, 0xa879d8a7, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 1, 0x50f3b14f, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 2, 0xa1e7629e, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 3, 0x43cec53d, b, c, d, e, f, g, h, a)
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 0, T36, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 1, T37, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 2, T38, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 3, T39, b, c, d, e, f, g, h, a)
|
||||
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 0, 0x879d8a7a, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 1, 0xf3b14f5, h, a, b, c, d, e, f, g)
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 2, 0x1e7629ea, g, h, a, b, c, d, e, f)
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 3, 0x3cec53d4, f, g, h, a, b, c, d, e)
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 0, T40, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 1, T41, h, a, b, c, d, e, f, g)
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 2, T42, g, h, a, b, c, d, e, f)
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 3, T43, f, g, h, a, b, c, d, e)
|
||||
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 0, 0x79d8a7a8, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 1, 0xf3b14f50, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 2, 0xe7629ea1, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 3, 0xcec53d43, b, c, d, e, f, g, h, a)
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 0, T44, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 1, T45, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 2, T46, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 3, T47, b, c, d, e, f, g, h, a)
|
||||
|
||||
ADDQ $8*32, SRND
|
||||
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 0, 0x9d8a7a87, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 1, 0x3b14f50f, h, a, b, c, d, e, f, g)
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 2, 0x7629ea1e, g, h, a, b, c, d, e, f)
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 3, 0xec53d43c, f, g, h, a, b, c, d, e)
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 0, T48, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 1, T49, h, a, b, c, d, e, f, g)
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 2, T50, g, h, a, b, c, d, e, f)
|
||||
DO_ROUND_N_1(_XFER + 0*32 + 16, 3, T51, f, g, h, a, b, c, d, e)
|
||||
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 0, 0xd8a7a879, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 1, 0xb14f50f3, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 2, 0x629ea1e7, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 3, 0xc53d43ce, b, c, d, e, f, g, h, a)
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 0, T52, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 1, T53, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 2, T54, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_1(_XFER + 2*32 + 16, 3, T55, b, c, d, e, f, g, h, a)
|
||||
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 0, 0x8a7a879d, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 1, 0x14f50f3b, h, a, b, c, d, e, f, g)
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 2, 0x29ea1e76, g, h, a, b, c, d, e, f)
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 3, 0x53d43cec, f, g, h, a, b, c, d, e)
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 0, T56, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 1, T57, h, a, b, c, d, e, f, g)
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 2, T58, g, h, a, b, c, d, e, f)
|
||||
DO_ROUND_N_1(_XFER + 4*32 + 16, 3, T59, f, g, h, a, b, c, d, e)
|
||||
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 0, 0xa7a879d8, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 1, 0x4f50f3b1, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 2, 0x9ea1e762, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 3, 0x3d43cec5, b, c, d, e, f, g, h, a)
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 0, T60, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 1, T61, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 2, T62, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_1(_XFER + 6*32 + 16, 3, T63, b, c, d, e, f, g, h, a)
|
||||
|
||||
MOVQ dig+0(FP), CTX // d.h[8]
|
||||
MOVQ _INP(SP), INP
|
||||
|
@ -3,6 +3,7 @@
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
#include "sm3_const_asm.s"
|
||||
// Definitions for AVX version
|
||||
|
||||
// xorm (mem), reg
|
||||
@ -425,148 +426,148 @@ avx_schedule_compress: // for w0 - w47
|
||||
VMOVDQU XWORD0, (_XFER + 0*16)(SP)
|
||||
VPXOR XWORD0, XWORD1, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*16)(SP)
|
||||
ROUND_AND_SCHED_N_0_0(_XFER, 0x79cc4519, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_0_1(_XFER, 0xf3988a32, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_0_2(_XFER, 0xe7311465, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_0_3(_XFER, 0xce6228cb, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_0_0(_XFER, T0, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_0_1(_XFER, T1, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_0_2(_XFER, T2, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_0_3(_XFER, T3, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XWORD1, (_XFER + 0*16)(SP)
|
||||
VPXOR XWORD1, XWORD2, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*16)(SP)
|
||||
ROUND_AND_SCHED_N_0_0(_XFER, 0x9cc45197, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
ROUND_AND_SCHED_N_0_1(_XFER, 0x3988a32f, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
ROUND_AND_SCHED_N_0_2(_XFER, 0x7311465e, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
ROUND_AND_SCHED_N_0_3(_XFER, 0xe6228cbc, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
ROUND_AND_SCHED_N_0_0(_XFER, T4, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
ROUND_AND_SCHED_N_0_1(_XFER, T5, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
ROUND_AND_SCHED_N_0_2(_XFER, T6, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
ROUND_AND_SCHED_N_0_3(_XFER, T7, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XWORD2, (_XFER + 0*16)(SP)
|
||||
VPXOR XWORD2, XWORD3, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*16)(SP)
|
||||
ROUND_AND_SCHED_N_0_0(_XFER, 0xcc451979, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
ROUND_AND_SCHED_N_0_1(_XFER, 0x988a32f3, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
ROUND_AND_SCHED_N_0_2(_XFER, 0x311465e7, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
ROUND_AND_SCHED_N_0_3(_XFER, 0x6228cbce, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
ROUND_AND_SCHED_N_0_0(_XFER, T8, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
ROUND_AND_SCHED_N_0_1(_XFER, T9, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
ROUND_AND_SCHED_N_0_2(_XFER, T10, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
ROUND_AND_SCHED_N_0_3(_XFER, T11, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XWORD3, (_XFER + 0*16)(SP)
|
||||
VPXOR XWORD3, XWORD0, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*16)(SP)
|
||||
ROUND_AND_SCHED_N_0_0(_XFER, 0xc451979c, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
ROUND_AND_SCHED_N_0_1(_XFER, 0x88a32f39, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
ROUND_AND_SCHED_N_0_2(_XFER, 0x11465e73, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
ROUND_AND_SCHED_N_0_3(_XFER, 0x228cbce6, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
ROUND_AND_SCHED_N_0_0(_XFER, T12, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
ROUND_AND_SCHED_N_0_1(_XFER, T13, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
ROUND_AND_SCHED_N_0_2(_XFER, T14, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
ROUND_AND_SCHED_N_0_3(_XFER, T15, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XWORD0, (_XFER + 0*16)(SP)
|
||||
VPXOR XWORD0, XWORD1, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*16)(SP)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER, 0x9d8a7a87, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER, 0x3b14f50f, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER, 0x7629ea1e, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER, 0xec53d43c, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER, T16, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER, T17, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER, T18, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER, T19, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XWORD1, (_XFER + 0*16)(SP)
|
||||
VPXOR XWORD1, XWORD2, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*16)(SP)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER, 0xd8a7a879, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER, 0xb14f50f3, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER, 0x629ea1e7, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER, 0xc53d43ce, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER, T20, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER, T21, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER, T22, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER, T23, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XWORD2, (_XFER + 0*16)(SP)
|
||||
VPXOR XWORD2, XWORD3, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*16)(SP)
|
||||
|
||||
ROUND_AND_SCHED_N_1_0(_XFER, 0x8a7a879d, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER, 0x14f50f3b, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER, 0x29ea1e76, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER, 0x53d43cec, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER, T24, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER, T25, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER, T26, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER, T27, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XWORD3, (_XFER + 0*16)(SP)
|
||||
VPXOR XWORD3, XWORD0, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*16)(SP)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER, 0xa7a879d8, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER, 0x4f50f3b1, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER, 0x9ea1e762, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER, 0x3d43cec5, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER, T28, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER, T29, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER, T30, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER, T31, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XWORD0, (_XFER + 0*16)(SP)
|
||||
VPXOR XWORD0, XWORD1, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*16)(SP)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER, 0x7a879d8a, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER, 0xf50f3b14, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER, 0xea1e7629, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER, 0xd43cec53, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER, T32, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER, T33, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER, T34, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER, T35, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XWORD1, (_XFER + 0*16)(SP)
|
||||
VPXOR XWORD1, XWORD2, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*16)(SP)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER, 0xa879d8a7, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER, 0x50f3b14f, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER, 0xa1e7629e, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER, 0x43cec53d, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER, T36, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER, T37, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER, T38, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER, T39, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XWORD2, (_XFER + 0*16)(SP)
|
||||
VPXOR XWORD2, XWORD3, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*16)(SP)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER, 0x879d8a7a, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER, 0xf3b14f5, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER, 0x1e7629ea, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER, 0x3cec53d4, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER, T40, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER, T41, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER, T42, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER, T43, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1)
|
||||
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XWORD3, (_XFER + 0*16)(SP)
|
||||
VPXOR XWORD3, XWORD0, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*16)(SP)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER, 0x79d8a7a8, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER, 0xf3b14f50, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER, 0xe7629ea1, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER, 0xcec53d43, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER, T44, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER, T45, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER, T46, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER, T47, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2)
|
||||
|
||||
// w48 - w63 (last 16 rounds): only the first 4 of these rounds still perform message scheduling
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XWORD0, (_XFER + 0*16)(SP)
|
||||
VPXOR XWORD0, XWORD1, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*16)(SP)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER, 0x9d8a7a87, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER, 0x3b14f50f, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER, 0x7629ea1e, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER, 0xec53d43c, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_1_0(_XFER, T48, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_1_1(_XFER, T49, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_1_2(_XFER, T50, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
ROUND_AND_SCHED_N_1_3(_XFER, T51, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||
|
||||
// w52 - w63 processed with no scheduling (last 12 rounds)
|
||||
// Do 4 rounds
|
||||
VMOVDQU XWORD1, (_XFER + 0*16)(SP)
|
||||
VPXOR XWORD1, XWORD2, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*16)(SP)
|
||||
DO_ROUND_N_1(_XFER, 0, 0xd8a7a879, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_1(_XFER, 1, 0xb14f50f3, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_1(_XFER, 2, 0x629ea1e7, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_1(_XFER, 3, 0xc53d43ce, b, c, d, e, f, g, h, a)
|
||||
DO_ROUND_N_1(_XFER, 0, T52, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_1(_XFER, 1, T53, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_1(_XFER, 2, T54, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_1(_XFER, 3, T55, b, c, d, e, f, g, h, a)
|
||||
|
||||
// Do 4 rounds
|
||||
VMOVDQU XWORD2, (_XFER + 0*16)(SP)
|
||||
VPXOR XWORD2, XWORD3, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*16)(SP)
|
||||
DO_ROUND_N_1(_XFER, 0, 0x8a7a879d, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND_N_1(_XFER, 1, 0x14f50f3b, h, a, b, c, d, e, f, g)
|
||||
DO_ROUND_N_1(_XFER, 2, 0x29ea1e76, g, h, a, b, c, d, e, f)
|
||||
DO_ROUND_N_1(_XFER, 3, 0x53d43cec, f, g, h, a, b, c, d, e)
|
||||
DO_ROUND_N_1(_XFER, 0, T56, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND_N_1(_XFER, 1, T57, h, a, b, c, d, e, f, g)
|
||||
DO_ROUND_N_1(_XFER, 2, T58, g, h, a, b, c, d, e, f)
|
||||
DO_ROUND_N_1(_XFER, 3, T59, f, g, h, a, b, c, d, e)
|
||||
|
||||
// Do 4 rounds
|
||||
VMOVDQU XWORD3, (_XFER + 0*16)(SP)
|
||||
VPXOR XWORD3, XWORD0, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*16)(SP)
|
||||
DO_ROUND_N_1(_XFER, 0, 0xa7a879d8, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_1(_XFER, 1, 0x4f50f3b1, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_1(_XFER, 2, 0x9ea1e762, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_1(_XFER, 3, 0x3d43cec5, b, c, d, e, f, g, h, a)
|
||||
DO_ROUND_N_1(_XFER, 0, T60, e, f, g, h, a, b, c, d)
|
||||
DO_ROUND_N_1(_XFER, 1, T61, d, e, f, g, h, a, b, c)
|
||||
DO_ROUND_N_1(_XFER, 2, T62, c, d, e, f, g, h, a, b)
|
||||
DO_ROUND_N_1(_XFER, 3, T63, b, c, d, e, f, g, h, a)
|
||||
|
||||
MOVQ _INP(SP), INP
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user