kdf-sm3: mult by 4 way optimization

This commit is contained in:
Sun Yimin 2024-05-23 10:38:06 +08:00 committed by GitHub
parent 499415a21d
commit 2e05c453c8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 182 additions and 282 deletions

View File

@ -32,7 +32,9 @@ func copyResult(result []byte, dig *[8]uint32) {
// p = 64 * 4 * 2 = 512 // p = 64 * 4 * 2 = 512
// state = 8 * 16 = 128 // state = 8 * 16 = 128
// words = 68 * 16 = 1088 // words = 68 * 16 = 1088
const preallocSize = 1728 const preallocSizeBy4 = 1728
const parallelSize4 = 4
func kdfBy4(baseMD *digest, keyLen int, limit int) []byte { func kdfBy4(baseMD *digest, keyLen int, limit int) []byte {
if limit < 4 { if limit < 4 {
@ -51,13 +53,13 @@ func kdfBy4(baseMD *digest, keyLen int, limit int) []byte {
} }
len <<= 3 len <<= 3
// prepare temporary buffer // prepare temporary buffer
tmpStart := 4 * blocks * BlockSize tmpStart := parallelSize4 * blocks * BlockSize
buffer := make([]byte, preallocSize) buffer := make([]byte, preallocSizeBy4)
tmp := buffer[tmpStart:] tmp := buffer[tmpStart:]
// prepare processing data // prepare processing data
var data [4]*byte var data [parallelSize4]*byte
var digs [4]*[8]uint32 var digs [parallelSize4]*[8]uint32
var states [4][8]uint32 var states [parallelSize4][8]uint32
for j := 0; j < 4; j++ { for j := 0; j < 4; j++ {
digs[j] = &states[j] digs[j] = &states[j]
} }
@ -65,9 +67,9 @@ func kdfBy4(baseMD *digest, keyLen int, limit int) []byte {
var ct uint32 = 1 var ct uint32 = 1
k := make([]byte, keyLen) k := make([]byte, keyLen)
ret := k ret := k
times := limit / 4 times := limit / parallelSize4
for i := 0; i < times; i++ { for i := 0; i < times; i++ {
for j := 0; j < 4; j++ { for j := 0; j < parallelSize4; j++ {
// prepare states // prepare states
states[j] = baseMD.h states[j] = baseMD.h
// prepare data // prepare data
@ -77,12 +79,12 @@ func kdfBy4(baseMD *digest, keyLen int, limit int) []byte {
ct++ ct++
} }
blockMultBy4(&digs[0], &data[0], &tmp[0], blocks) blockMultBy4(&digs[0], &data[0], &tmp[0], blocks)
for j := 0; j < 4; j++ { for j := 0; j < parallelSize4; j++ {
copyResult(ret, digs[j]) copyResult(ret, digs[j])
ret = ret[Size:] ret = ret[Size:]
} }
} }
remain := limit % 4 remain := limit % parallelSize4
for i := 0; i < remain; i++ { for i := 0; i < remain; i++ {
binary.BigEndian.PutUint32(tmp[:], ct) binary.BigEndian.PutUint32(tmp[:], ct)
md := *baseMD md := *baseMD

View File

@ -422,6 +422,7 @@ func TestKdf(t *testing.T) {
{"sm3 case 5", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 128}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb"}, {"sm3 case 5", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 128}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb"},
{"sm3 case 6", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 159}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb9abb2c6b0673e349f64c6577d4ba1b0a9c360016bae9478f8a80d5426327e8"}, {"sm3 case 6", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 159}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb9abb2c6b0673e349f64c6577d4ba1b0a9c360016bae9478f8a80d5426327e8"},
{"sm3 case 7", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 300}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb9abb2c6b0673e349f64c6577d4ba1b0a9c360016bae9478f8a80d5426327e84ea915c10ef39a016618b00aaae8735a8a1405180746ddd7ccd05dc890c5e5d07f49c40afdbc09267859ac5967b8c1163dc6defab955604e45e349a51df11d81b298424b84472607249a05b481ae88d98a9273ecdee009add0619641bd7d9f0b13a502e36e67b5836d0480a518a01046fa2738698fbe5e5008de11704b45531532667896158158ea08847a55a9"}, {"sm3 case 7", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 300}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb9abb2c6b0673e349f64c6577d4ba1b0a9c360016bae9478f8a80d5426327e84ea915c10ef39a016618b00aaae8735a8a1405180746ddd7ccd05dc890c5e5d07f49c40afdbc09267859ac5967b8c1163dc6defab955604e45e349a51df11d81b298424b84472607249a05b481ae88d98a9273ecdee009add0619641bd7d9f0b13a502e36e67b5836d0480a518a01046fa2738698fbe5e5008de11704b45531532667896158158ea08847a55a9"},
{"sm3 case 8", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 516}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb9abb2c6b0673e349f64c6577d4ba1b0a9c360016bae9478f8a80d5426327e84ea915c10ef39a016618b00aaae8735a8a1405180746ddd7ccd05dc890c5e5d07f49c40afdbc09267859ac5967b8c1163dc6defab955604e45e349a51df11d81b298424b84472607249a05b481ae88d98a9273ecdee009add0619641bd7d9f0b13a502e36e67b5836d0480a518a01046fa2738698fbe5e5008de11704b45531532667896158158ea08847a55a93f26fd6c99111b3017db1d1e6025d28d88ed3a419eb9c72e4fa3267f19c806092fd80cb91079cc00cefc55db53ad840ed1e6384f4cf02d9f2ecbaed54391e7a6da71fca4ea53ccfdd4d85adf37e4be8af1324f43ee402f109ac6a77915fd7e248d3f14f3698dd0e8ea7ea27e4288b288d75b4343ec8ab3d0cd9491a146e1b6033c512399bcd1cb9568d4f10d582f145c3ad7aae4ace7a14ec0abf831edc5aabcf58a1fb05180fa6e79651aa8753ddbf3ca0877b9a9d745ae1729b253f61cfc726cba4c9113008187830e41d428ca223014c994f317998689"},
} }
for _, tt := range tests { for _, tt := range tests {
wantBytes, _ := hex.DecodeString(tt.want) wantBytes, _ := hex.DecodeString(tt.want)

View File

@ -7,103 +7,10 @@ DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), RODATA, $16 GLOBL flip_mask<>(SB), RODATA, $16
DATA T256_4<>+0x00(SB)/8, $0x79cc451979cc4519 // left rotations of 32-bit words by 8-bit increments
DATA T256_4<>+0x08(SB)/8, $0x79cc451979cc4519 DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA T256_4<>+0x10(SB)/8, $0xf3988a32f3988a32 DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA T256_4<>+0x18(SB)/8, $0xf3988a32f3988a32 GLOBL r08_mask<>(SB), 8, $16
DATA T256_4<>+0x20(SB)/8, $0xe7311465e7311465
DATA T256_4<>+0x28(SB)/8, $0xe7311465e7311465
DATA T256_4<>+0x30(SB)/8, $0xce6228cbce6228cb
DATA T256_4<>+0x38(SB)/8, $0xce6228cbce6228cb
DATA T256_4<>+0x40(SB)/8, $0x9cc451979cc45197
DATA T256_4<>+0x48(SB)/8, $0x9cc451979cc45197
DATA T256_4<>+0x50(SB)/8, $0x3988a32f3988a32f
DATA T256_4<>+0x58(SB)/8, $0x3988a32f3988a32f
DATA T256_4<>+0x60(SB)/8, $0x7311465e7311465e
DATA T256_4<>+0x68(SB)/8, $0x7311465e7311465e
DATA T256_4<>+0x70(SB)/8, $0xe6228cbce6228cbc
DATA T256_4<>+0x78(SB)/8, $0xe6228cbce6228cbc
DATA T256_4<>+0x80(SB)/8, $0xcc451979cc451979
DATA T256_4<>+0x88(SB)/8, $0xcc451979cc451979
DATA T256_4<>+0x90(SB)/8, $0x988a32f3988a32f3
DATA T256_4<>+0x98(SB)/8, $0x988a32f3988a32f3
DATA T256_4<>+0xa0(SB)/8, $0x311465e7311465e7
DATA T256_4<>+0xa8(SB)/8, $0x311465e7311465e7
DATA T256_4<>+0xb0(SB)/8, $0x6228cbce6228cbce
DATA T256_4<>+0xb8(SB)/8, $0x6228cbce6228cbce
DATA T256_4<>+0xc0(SB)/8, $0xc451979cc451979c
DATA T256_4<>+0xc8(SB)/8, $0xc451979cc451979c
DATA T256_4<>+0xd0(SB)/8, $0x88a32f3988a32f39
DATA T256_4<>+0xd8(SB)/8, $0x88a32f3988a32f39
DATA T256_4<>+0xe0(SB)/8, $0x11465e7311465e73
DATA T256_4<>+0xe8(SB)/8, $0x11465e7311465e73
DATA T256_4<>+0xf0(SB)/8, $0x228cbce6228cbce6
DATA T256_4<>+0xf8(SB)/8, $0x228cbce6228cbce6
DATA T256_4<>+0x0100(SB)/8, $0x9d8a7a879d8a7a87
DATA T256_4<>+0x0108(SB)/8, $0x9d8a7a879d8a7a87
DATA T256_4<>+0x0110(SB)/8, $0x3b14f50f3b14f50f
DATA T256_4<>+0x0118(SB)/8, $0x3b14f50f3b14f50f
DATA T256_4<>+0x0120(SB)/8, $0x7629ea1e7629ea1e
DATA T256_4<>+0x0128(SB)/8, $0x7629ea1e7629ea1e
DATA T256_4<>+0x0130(SB)/8, $0xec53d43cec53d43c
DATA T256_4<>+0x0138(SB)/8, $0xec53d43cec53d43c
DATA T256_4<>+0x0140(SB)/8, $0xd8a7a879d8a7a879
DATA T256_4<>+0x0148(SB)/8, $0xd8a7a879d8a7a879
DATA T256_4<>+0x0150(SB)/8, $0xb14f50f3b14f50f3
DATA T256_4<>+0x0158(SB)/8, $0xb14f50f3b14f50f3
DATA T256_4<>+0x0160(SB)/8, $0x629ea1e7629ea1e7
DATA T256_4<>+0x0168(SB)/8, $0x629ea1e7629ea1e7
DATA T256_4<>+0x0170(SB)/8, $0xc53d43cec53d43ce
DATA T256_4<>+0x0178(SB)/8, $0xc53d43cec53d43ce
DATA T256_4<>+0x0180(SB)/8, $0x8a7a879d8a7a879d
DATA T256_4<>+0x0188(SB)/8, $0x8a7a879d8a7a879d
DATA T256_4<>+0x0190(SB)/8, $0x14f50f3b14f50f3b
DATA T256_4<>+0x0198(SB)/8, $0x14f50f3b14f50f3b
DATA T256_4<>+0x01a0(SB)/8, $0x29ea1e7629ea1e76
DATA T256_4<>+0x01a8(SB)/8, $0x29ea1e7629ea1e76
DATA T256_4<>+0x01b0(SB)/8, $0x53d43cec53d43cec
DATA T256_4<>+0x01b8(SB)/8, $0x53d43cec53d43cec
DATA T256_4<>+0x01c0(SB)/8, $0xa7a879d8a7a879d8
DATA T256_4<>+0x01c8(SB)/8, $0xa7a879d8a7a879d8
DATA T256_4<>+0x01d0(SB)/8, $0x4f50f3b14f50f3b1
DATA T256_4<>+0x01d8(SB)/8, $0x4f50f3b14f50f3b1
DATA T256_4<>+0x01e0(SB)/8, $0x9ea1e7629ea1e762
DATA T256_4<>+0x01e8(SB)/8, $0x9ea1e7629ea1e762
DATA T256_4<>+0x01f0(SB)/8, $0x3d43cec53d43cec5
DATA T256_4<>+0x01f8(SB)/8, $0x3d43cec53d43cec5
DATA T256_4<>+0x0200(SB)/8, $0x7a879d8a7a879d8a
DATA T256_4<>+0x0208(SB)/8, $0x7a879d8a7a879d8a
DATA T256_4<>+0x0210(SB)/8, $0xf50f3b14f50f3b14
DATA T256_4<>+0x0218(SB)/8, $0xf50f3b14f50f3b14
DATA T256_4<>+0x0220(SB)/8, $0xea1e7629ea1e7629
DATA T256_4<>+0x0228(SB)/8, $0xea1e7629ea1e7629
DATA T256_4<>+0x0230(SB)/8, $0xd43cec53d43cec53
DATA T256_4<>+0x0238(SB)/8, $0xd43cec53d43cec53
DATA T256_4<>+0x0240(SB)/8, $0xa879d8a7a879d8a7
DATA T256_4<>+0x0248(SB)/8, $0xa879d8a7a879d8a7
DATA T256_4<>+0x0250(SB)/8, $0x50f3b14f50f3b14f
DATA T256_4<>+0x0258(SB)/8, $0x50f3b14f50f3b14f
DATA T256_4<>+0x0260(SB)/8, $0xa1e7629ea1e7629e
DATA T256_4<>+0x0268(SB)/8, $0xa1e7629ea1e7629e
DATA T256_4<>+0x0270(SB)/8, $0x43cec53d43cec53d
DATA T256_4<>+0x0278(SB)/8, $0x43cec53d43cec53d
DATA T256_4<>+0x0280(SB)/8, $0x879d8a7a879d8a7a
DATA T256_4<>+0x0288(SB)/8, $0x879d8a7a879d8a7a
DATA T256_4<>+0x0290(SB)/8, $0x0f3b14f50f3b14f5
DATA T256_4<>+0x0298(SB)/8, $0x0f3b14f50f3b14f5
DATA T256_4<>+0x02a0(SB)/8, $0x1e7629ea1e7629ea
DATA T256_4<>+0x02a8(SB)/8, $0x1e7629ea1e7629ea
DATA T256_4<>+0x02b0(SB)/8, $0x3cec53d43cec53d4
DATA T256_4<>+0x02b8(SB)/8, $0x3cec53d43cec53d4
DATA T256_4<>+0x02c0(SB)/8, $0x79d8a7a879d8a7a8
DATA T256_4<>+0x02c8(SB)/8, $0x79d8a7a879d8a7a8
DATA T256_4<>+0x02d0(SB)/8, $0xf3b14f50f3b14f50
DATA T256_4<>+0x02d8(SB)/8, $0xf3b14f50f3b14f50
DATA T256_4<>+0x02e0(SB)/8, $0xe7629ea1e7629ea1
DATA T256_4<>+0x02e8(SB)/8, $0xe7629ea1e7629ea1
DATA T256_4<>+0x02f0(SB)/8, $0xcec53d43cec53d43
DATA T256_4<>+0x02f8(SB)/8, $0xcec53d43cec53d43
GLOBL T256_4<>(SB), RODATA, $3072 // 48 * 4 * 16
// Transpose matrix without PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions, bad performance! // Transpose matrix without PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions, bad performance!
// input: from high to low // input: from high to low
@ -179,8 +86,8 @@ GLOBL T256_4<>(SB), RODATA, $3072 // 48 * 4 * 16
PXOR tmp1, P2; \ PXOR tmp1, P2; \
MOVOU P2, P1 MOVOU P2, P1
#define store4Words(W, j) MOVOU W, (128+(j)*16)(BX) #define storeWord(W, j) MOVOU W, (128+(j)*16)(BX)
#define load4Words(W, i) MOVOU (128+(i)*16)(BX), W #define loadWord(W, i) MOVOU (128+(i)*16)(BX), W
#define prepareFirst16Words(i) \ #define prepareFirst16Words(i) \
MOVOU (i*16)(R8), X10; \ MOVOU (i*16)(R8), X10; \
@ -195,10 +102,14 @@ GLOBL T256_4<>(SB), RODATA, $3072 // 48 * 4 * 16
PSHUFB tmp1, X12; \ PSHUFB tmp1, X12; \
PSHUFB tmp1, X13; \ PSHUFB tmp1, X13; \
; \ ; \
store4Words(X10, 4*i+0); \ storeWord(X10, 4*i+0); \
store4Words(X11, 4*i+1); \ storeWord(X11, 4*i+1); \
store4Words(X12, 4*i+2); \ storeWord(X12, 4*i+2); \
store4Words(X13, 4*i+3) storeWord(X13, 4*i+3)
#define LOAD_T(index, T) \
MOVL (index*4)(AX), T; \
PSHUFD $0, T, T
// r <<< n, SSE version // r <<< n, SSE version
#define PROLD(r, n) \ #define PROLD(r, n) \
@ -211,7 +122,7 @@ GLOBL T256_4<>(SB), RODATA, $3072 // 48 * 4 * 16
MOVOU a, X12; \ MOVOU a, X12; \
PROLD(X12, 12); \ PROLD(X12, 12); \
MOVOU X12, X13; \ // a <<< 12 MOVOU X12, X13; \ // a <<< 12
MOVOU (index*16)(AX), tmp2; \ LOAD_T(index, tmp2); \
PADDL tmp2, X12; \ PADDL tmp2, X12; \
PADDL e, X12; \ PADDL e, X12; \
PROLD(X12, 7); \ // SS1 PROLD(X12, 7); \ // SS1
@ -220,8 +131,8 @@ GLOBL T256_4<>(SB), RODATA, $3072 // 48 * 4 * 16
PXOR a, X14; \ PXOR a, X14; \
PXOR c, X14; \ // (a XOR b XOR c) PXOR c, X14; \ // (a XOR b XOR c)
PADDL d, X14; \ // (a XOR b XOR c) + d PADDL d, X14; \ // (a XOR b XOR c) + d
load4Words(X10, index); \ loadWord(X10, index); \
load4Words(X11, index+4); \ loadWord(X11, index+4); \
PXOR X10, X11; \ //Wt XOR Wt+4 PXOR X10, X11; \ //Wt XOR Wt+4
PADDL X11, X14; \ // (a XOR b XOR c) + d + Wt XOR Wt+4 PADDL X11, X14; \ // (a XOR b XOR c) + d + Wt XOR Wt+4
PADDL X14, X13; \ // TT1 PADDL X14, X13; \ // TT1
@ -237,40 +148,40 @@ GLOBL T256_4<>(SB), RODATA, $3072 // 48 * 4 * 16
PROLD(f, 19); \ PROLD(f, 19); \
MOVOU X10, X13; \ MOVOU X10, X13; \
PROLD(X13, 9); \ PROLD(X13, 9); \
PXOR X10, X13; \ // tt2 XOR ROTL(9, tt2) PXOR X13, X10; \ // tt2 XOR ROTL(9, tt2)
PROLD(X10, 17); \ PSHUFB r08_mask<>(SB), X13; \ // ROTL(17, tt2)
PXOR X10, X13; \ // tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2) PXOR X10, X13; \ // tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2)
MOVOU X13, d MOVOU X13, d
#define MESSAGE_SCHEDULE(index) \ #define MESSAGE_SCHEDULE(index) \
load4Words(X10, index+1); \ // Wj-3 loadWord(X10, index+1); \ // Wj-3
PROLD(X10, 15); \ PROLD(X10, 15); \
load4Words(X11, index-12); \ // Wj-16 loadWord(X11, index-12); \ // Wj-16
PXOR X11, X10; \ PXOR X11, X10; \
load4Words(X11, index-5); \ // Wj-9 loadWord(X11, index-5); \ // Wj-9
PXOR X11, X10; \ PXOR X11, X10; \
MOVOU X10, X11; \ MOVOU X10, X11; \
PROLD(X11, 15); \ PROLD(X11, 15); \
PXOR X11, X10; \ PXOR X11, X10; \
PROLD(X11, 8); \ PSHUFB r08_mask<>(SB), X11; \
PXOR X11, X10; \ // P1 PXOR X11, X10; \ // P1
load4Words(X11, index-9); \ // Wj-13 loadWord(X11, index-9); \ // Wj-13
PROLD(X11, 7); \ PROLD(X11, 7); \
PXOR X11, X10; \ PXOR X11, X10; \
load4Words(X11, index-2); \ // Wj-6 loadWord(X11, index-2); \ // Wj-6
PXOR X10, X11; \ PXOR X10, X11; \
store4Words(X11, index+4) storeWord(X11, index+4)
#define ROUND_12_15(index, a, b, c, d, e, f, g, h) \ #define ROUND_12_15(index, a, b, c, d, e, f, g, h) \
MESSAGE_SCHEDULE(index); \ MESSAGE_SCHEDULE(index); \
ROUND_00_11(index, a, b, c, d, e, f, g, h) ROUND_00_11(index, a, b, c, d, e, f, g, h)
#define ROUND_16_63(index, cIndex, a, b, c, d, e, f, g, h) \ #define ROUND_16_63(index, a, b, c, d, e, f, g, h) \
MESSAGE_SCHEDULE(index); \ // X11 is Wt+4 now, Pls do not use it MESSAGE_SCHEDULE(index); \ // X11 is Wt+4 now, Pls do not use it
MOVOU a, X12; \ MOVOU a, X12; \
PROLD(X12, 12); \ PROLD(X12, 12); \
MOVOU X12, X13; \ // a <<< 12 MOVOU X12, X13; \ // a <<< 12
MOVOU (cIndex*16)(AX), tmp2; \ LOAD_T(index, tmp2); \
PADDL tmp2, X12; \ PADDL tmp2, X12; \
PADDL e, X12; \ PADDL e, X12; \
PROLD(X12, 7); \ // SS1 PROLD(X12, 7); \ // SS1
@ -283,7 +194,7 @@ GLOBL T256_4<>(SB), RODATA, $3072 // 48 * 4 * 16
PAND c, X14; \ PAND c, X14; \
POR X10, X14; \ // (a AND b) OR (a AND c) OR (b AND c) POR X10, X14; \ // (a AND b) OR (a AND c) OR (b AND c)
PADDL d, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d PADDL d, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d
load4Words(X10, index); \ loadWord(X10, index); \
PXOR X10, X11; \ //Wt XOR Wt+4 PXOR X10, X11; \ //Wt XOR Wt+4
PADDL X11, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d + Wt XOR Wt+4 PADDL X11, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d + Wt XOR Wt+4
PADDL X14, X13; \ // TT1 PADDL X14, X13; \ // TT1
@ -301,8 +212,8 @@ GLOBL T256_4<>(SB), RODATA, $3072 // 48 * 4 * 16
PROLD(f, 19); \ PROLD(f, 19); \
MOVOU X10, X13; \ MOVOU X10, X13; \
PROLD(X13, 9); \ PROLD(X13, 9); \
PXOR X10, X13; \ // tt2 XOR ROTL(9, tt2) PXOR X13, X10; \ // tt2 XOR ROTL(9, tt2)
PROLD(X10, 17); \ PSHUFB r08_mask<>(SB), X13; \ // ROTL(17, tt2)
PXOR X10, X13; \ // tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2) PXOR X10, X13; \ // tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2)
MOVOU X13, d MOVOU X13, d
@ -330,8 +241,8 @@ GLOBL T256_4<>(SB), RODATA, $3072 // 48 * 4 * 16
VPXOR P1, P2, P2; \ VPXOR P1, P2, P2; \
VMOVDQU P2, P1 VMOVDQU P2, P1
#define avxStore4Words(W, j) VMOVDQU W, (128+(j)*16)(BX) #define avxStoreWord(W, j) VMOVDQU W, (128+(j)*16)(BX)
#define avxLoad4Words(W, i) VMOVDQU (128+(i)*16)(BX), W #define avxLoadWord(W, i) VMOVDQU (128+(i)*16)(BX), W
#define avxPrepareFirst16Words(i) \ #define avxPrepareFirst16Words(i) \
VMOVDQU (i*16)(R8), X10; \ VMOVDQU (i*16)(R8), X10; \
@ -345,10 +256,14 @@ GLOBL T256_4<>(SB), RODATA, $3072 // 48 * 4 * 16
VPSHUFB flip_mask<>(SB), X12, X12; \ VPSHUFB flip_mask<>(SB), X12, X12; \
VPSHUFB flip_mask<>(SB), X13, X13; \ VPSHUFB flip_mask<>(SB), X13, X13; \
; \ ; \
avxStore4Words(X10, 4*i+0); \ avxStoreWord(X10, 4*i+0); \
avxStore4Words(X11, 4*i+1); \ avxStoreWord(X11, 4*i+1); \
avxStore4Words(X12, 4*i+2); \ avxStoreWord(X12, 4*i+2); \
avxStore4Words(X13, 4*i+3) avxStoreWord(X13, 4*i+3)
#define AVX_LOAD_T(index, T) \
MOVL (index*4)(AX), T; \
VPSHUFD $0, T, T
// r <<< n // r <<< n
#define VPROLD(r, n) \ #define VPROLD(r, n) \
@ -364,7 +279,8 @@ GLOBL T256_4<>(SB), RODATA, $3072 // 48 * 4 * 16
#define AVX_ROUND_00_11(index, a, b, c, d, e, f, g, h) \ #define AVX_ROUND_00_11(index, a, b, c, d, e, f, g, h) \
VPROLD2(a, X13, 12); \ // a <<< 12 VPROLD2(a, X13, 12); \ // a <<< 12
VPADDD (index*16)(AX), X13, X12; \ AVX_LOAD_T(index, X12); \
VPADDD X12, X13, X12; \
VPADDD e, X12, X12; \ VPADDD e, X12, X12; \
VPROLD(X12, 7); \ // SS1 VPROLD(X12, 7); \ // SS1
VPXOR X12, X13, X13; \ // SS2 VPXOR X12, X13, X13; \ // SS2
@ -372,8 +288,8 @@ GLOBL T256_4<>(SB), RODATA, $3072 // 48 * 4 * 16
VPXOR a, b, X14; \ VPXOR a, b, X14; \
VPXOR c, X14, X14; \ // (a XOR b XOR c) VPXOR c, X14, X14; \ // (a XOR b XOR c)
VPADDD d, X14, X14; \ // (a XOR b XOR c) + d VPADDD d, X14, X14; \ // (a XOR b XOR c) + d
avxLoad4Words(X10, index); \ avxLoadWord(X10, index); \
avxLoad4Words(X11, index+4); \ avxLoadWord(X11, index+4); \
VPXOR X10, X11, X11; \ //Wt XOR Wt+4 VPXOR X10, X11, X11; \ //Wt XOR Wt+4
VPADDD X11, X14, X14; \ // (a XOR b XOR c) + d + Wt XOR Wt+4 VPADDD X11, X14, X14; \ // (a XOR b XOR c) + d + Wt XOR Wt+4
VPADDD X14, X13, X13; \ // TT1 VPADDD X14, X13, X13; \ // TT1
@ -387,34 +303,35 @@ GLOBL T256_4<>(SB), RODATA, $3072 // 48 * 4 * 16
VMOVDQU X13, h; \ VMOVDQU X13, h; \
VPROLD(f, 19); \ VPROLD(f, 19); \
VPROLD2(X10, X13, 9); \ // tt2 <<< 9 VPROLD2(X10, X13, 9); \ // tt2 <<< 9
VPXOR X10, X13, X13; \ // tt2 XOR ROTL(9, tt2) VPXOR X10, X13, X10; \ // tt2 XOR ROTL(9, tt2)
VPROLD(X10, 17); \ // tt2 <<< 17 VPSHUFB r08_mask<>(SB), X13, X13; \ // ROTL(17, tt2)
VPXOR X10, X13, d VPXOR X10, X13, d
#define AVX_MESSAGE_SCHEDULE(index) \ #define AVX_MESSAGE_SCHEDULE(index) \
avxLoad4Words(X10, index+1); \ // Wj-3 avxLoadWord(X10, index+1); \ // Wj-3
VPROLD(X10, 15); \ VPROLD(X10, 15); \
VPXOR (128+(index-12)*16)(BX), X10, X10; \ // Wj-16 VPXOR (128+(index-12)*16)(BX), X10, X10; \ // Wj-16
VPXOR (128+(index-5)*16)(BX), X10, X10; \ // Wj-9 VPXOR (128+(index-5)*16)(BX), X10, X10; \ // Wj-9
; \ // P1 ; \ // P1
VPROLD2(X10, X11, 15); \ VPROLD2(X10, X11, 15); \
VPXOR X11, X10, X10; \ VPXOR X11, X10, X10; \
VPROLD(X11, 8); \ VPSHUFB r08_mask<>(SB), X11, X11; \
VPXOR X11, X10, X10; \ // P1 VPXOR X11, X10, X10; \ // P1
avxLoad4Words(X11, index-9); \ // Wj-13 avxLoadWord(X11, index-9); \ // Wj-13
VPROLD(X11, 7); \ VPROLD(X11, 7); \
VPXOR X11, X10, X10; \ VPXOR X11, X10, X10; \
VPXOR (128+(index-2)*16)(BX), X10, X11; \ VPXOR (128+(index-2)*16)(BX), X10, X11; \
avxStore4Words(X11, index+4) avxStoreWord(X11, index+4)
#define AVX_ROUND_12_15(index, a, b, c, d, e, f, g, h) \ #define AVX_ROUND_12_15(index, a, b, c, d, e, f, g, h) \
AVX_MESSAGE_SCHEDULE(index); \ AVX_MESSAGE_SCHEDULE(index); \
AVX_ROUND_00_11(index, a, b, c, d, e, f, g, h) AVX_ROUND_00_11(index, a, b, c, d, e, f, g, h)
#define AVX_ROUND_16_63(index, cIndex, a, b, c, d, e, f, g, h) \ #define AVX_ROUND_16_63(index, a, b, c, d, e, f, g, h) \
AVX_MESSAGE_SCHEDULE(index); \ // X11 is Wt+4 now, Pls do not use it AVX_MESSAGE_SCHEDULE(index); \ // X11 is Wt+4 now, Pls do not use it
VPROLD2(a, X13, 12); \ // a <<< 12 VPROLD2(a, X13, 12); \ // a <<< 12
VPADDD (cIndex*16)(AX), X13, X12; \ AVX_LOAD_T(index, X12); \
VPADDD X12, X13, X12; \
VPADDD e, X12, X12; \ VPADDD e, X12, X12; \
VPROLD(X12, 7); \ // SS1 VPROLD(X12, 7); \ // SS1
VPXOR X12, X13, X13; \ // SS2 VPXOR X12, X13, X13; \ // SS2
@ -424,7 +341,7 @@ GLOBL T256_4<>(SB), RODATA, $3072 // 48 * 4 * 16
VPAND c, X14, X14; \ VPAND c, X14, X14; \
VPOR X10, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c) VPOR X10, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c)
VPADDD d, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d VPADDD d, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d
avxLoad4Words(X10, index); \ avxLoadWord(X10, index); \
VPXOR X10, X11, X11; \ //Wt XOR Wt+4 VPXOR X10, X11, X11; \ //Wt XOR Wt+4
VPADDD X11, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d + Wt XOR Wt+4 VPADDD X11, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d + Wt XOR Wt+4
VPADDD X14, X13, X13; \ // TT1 VPADDD X14, X13, X13; \ // TT1
@ -440,8 +357,8 @@ GLOBL T256_4<>(SB), RODATA, $3072 // 48 * 4 * 16
VMOVDQU X13, h; \ VMOVDQU X13, h; \
VPROLD(f, 19); \ VPROLD(f, 19); \
VPROLD2(X10, X13, 9); \ // tt2 <<< 9 VPROLD2(X10, X13, 9); \ // tt2 <<< 9
VPXOR X10, X13, X13; \ // tt2 XOR ROTL(9, tt2) VPXOR X10, X13, X10; \ // tt2 XOR ROTL(9, tt2)
VPROLD(X10, 17); \ // tt2 <<< 17 VPSHUFB r08_mask<>(SB), X13, X13; \ // ROTL(17, tt2)
VPXOR X10, X13, d VPXOR X10, X13, d
// blockMultBy4(dig **[8]uint32, p *[]byte, buffer *byte, blocks int) // blockMultBy4(dig **[8]uint32, p *[]byte, buffer *byte, blocks int)
@ -475,7 +392,7 @@ TEXT ·blockMultBy4(SB),NOSPLIT,$0
// store state to temporary buffer // store state to temporary buffer
storeState storeState
MOVQ $T256_4<>(SB), AX MOVQ $·_K+0(SB), AX
MOVQ (SI), R8 MOVQ (SI), R8
MOVQ 8(SI), R9 MOVQ 8(SI), R9
MOVQ 16(SI), R10 MOVQ 16(SI), R10
@ -506,54 +423,54 @@ loop:
ROUND_12_15(14, c, d, e, f, g, h, a, b) ROUND_12_15(14, c, d, e, f, g, h, a, b)
ROUND_12_15(15, b, c, d, e, f, g, h, a) ROUND_12_15(15, b, c, d, e, f, g, h, a)
ROUND_16_63(16, 16, a, b, c, d, e, f, g, h) ROUND_16_63(16, a, b, c, d, e, f, g, h)
ROUND_16_63(17, 17, h, a, b, c, d, e, f, g) ROUND_16_63(17, h, a, b, c, d, e, f, g)
ROUND_16_63(18, 18, g, h, a, b, c, d, e, f) ROUND_16_63(18, g, h, a, b, c, d, e, f)
ROUND_16_63(19, 19, f, g, h, a, b, c, d, e) ROUND_16_63(19, f, g, h, a, b, c, d, e)
ROUND_16_63(20, 20, e, f, g, h, a, b, c, d) ROUND_16_63(20, e, f, g, h, a, b, c, d)
ROUND_16_63(21, 21, d, e, f, g, h, a, b, c) ROUND_16_63(21, d, e, f, g, h, a, b, c)
ROUND_16_63(22, 22, c, d, e, f, g, h, a, b) ROUND_16_63(22, c, d, e, f, g, h, a, b)
ROUND_16_63(23, 23, b, c, d, e, f, g, h, a) ROUND_16_63(23, b, c, d, e, f, g, h, a)
ROUND_16_63(24, 24, a, b, c, d, e, f, g, h) ROUND_16_63(24, a, b, c, d, e, f, g, h)
ROUND_16_63(25, 25, h, a, b, c, d, e, f, g) ROUND_16_63(25, h, a, b, c, d, e, f, g)
ROUND_16_63(26, 26, g, h, a, b, c, d, e, f) ROUND_16_63(26, g, h, a, b, c, d, e, f)
ROUND_16_63(27, 27, f, g, h, a, b, c, d, e) ROUND_16_63(27, f, g, h, a, b, c, d, e)
ROUND_16_63(28, 28, e, f, g, h, a, b, c, d) ROUND_16_63(28, e, f, g, h, a, b, c, d)
ROUND_16_63(29, 29, d, e, f, g, h, a, b, c) ROUND_16_63(29, d, e, f, g, h, a, b, c)
ROUND_16_63(30, 30, c, d, e, f, g, h, a, b) ROUND_16_63(30, c, d, e, f, g, h, a, b)
ROUND_16_63(31, 31, b, c, d, e, f, g, h, a) ROUND_16_63(31, b, c, d, e, f, g, h, a)
ROUND_16_63(32, 32, a, b, c, d, e, f, g, h) ROUND_16_63(32, a, b, c, d, e, f, g, h)
ROUND_16_63(33, 33, h, a, b, c, d, e, f, g) ROUND_16_63(33, h, a, b, c, d, e, f, g)
ROUND_16_63(34, 34, g, h, a, b, c, d, e, f) ROUND_16_63(34, g, h, a, b, c, d, e, f)
ROUND_16_63(35, 35, f, g, h, a, b, c, d, e) ROUND_16_63(35, f, g, h, a, b, c, d, e)
ROUND_16_63(36, 36, e, f, g, h, a, b, c, d) ROUND_16_63(36, e, f, g, h, a, b, c, d)
ROUND_16_63(37, 37, d, e, f, g, h, a, b, c) ROUND_16_63(37, d, e, f, g, h, a, b, c)
ROUND_16_63(38, 38, c, d, e, f, g, h, a, b) ROUND_16_63(38, c, d, e, f, g, h, a, b)
ROUND_16_63(39, 39, b, c, d, e, f, g, h, a) ROUND_16_63(39, b, c, d, e, f, g, h, a)
ROUND_16_63(40, 40, a, b, c, d, e, f, g, h) ROUND_16_63(40, a, b, c, d, e, f, g, h)
ROUND_16_63(41, 41, h, a, b, c, d, e, f, g) ROUND_16_63(41, h, a, b, c, d, e, f, g)
ROUND_16_63(42, 42, g, h, a, b, c, d, e, f) ROUND_16_63(42, g, h, a, b, c, d, e, f)
ROUND_16_63(43, 43, f, g, h, a, b, c, d, e) ROUND_16_63(43, f, g, h, a, b, c, d, e)
ROUND_16_63(44, 44, e, f, g, h, a, b, c, d) ROUND_16_63(44, e, f, g, h, a, b, c, d)
ROUND_16_63(45, 45, d, e, f, g, h, a, b, c) ROUND_16_63(45, d, e, f, g, h, a, b, c)
ROUND_16_63(46, 46, c, d, e, f, g, h, a, b) ROUND_16_63(46, c, d, e, f, g, h, a, b)
ROUND_16_63(47, 47, b, c, d, e, f, g, h, a) ROUND_16_63(47, b, c, d, e, f, g, h, a)
ROUND_16_63(48, 16, a, b, c, d, e, f, g, h) ROUND_16_63(48, a, b, c, d, e, f, g, h)
ROUND_16_63(49, 17, h, a, b, c, d, e, f, g) ROUND_16_63(49, h, a, b, c, d, e, f, g)
ROUND_16_63(50, 18, g, h, a, b, c, d, e, f) ROUND_16_63(50, g, h, a, b, c, d, e, f)
ROUND_16_63(51, 19, f, g, h, a, b, c, d, e) ROUND_16_63(51, f, g, h, a, b, c, d, e)
ROUND_16_63(52, 20, e, f, g, h, a, b, c, d) ROUND_16_63(52, e, f, g, h, a, b, c, d)
ROUND_16_63(53, 21, d, e, f, g, h, a, b, c) ROUND_16_63(53, d, e, f, g, h, a, b, c)
ROUND_16_63(54, 22, c, d, e, f, g, h, a, b) ROUND_16_63(54, c, d, e, f, g, h, a, b)
ROUND_16_63(55, 23, b, c, d, e, f, g, h, a) ROUND_16_63(55, b, c, d, e, f, g, h, a)
ROUND_16_63(56, 24, a, b, c, d, e, f, g, h) ROUND_16_63(56, a, b, c, d, e, f, g, h)
ROUND_16_63(57, 25, h, a, b, c, d, e, f, g) ROUND_16_63(57, h, a, b, c, d, e, f, g)
ROUND_16_63(58, 26, g, h, a, b, c, d, e, f) ROUND_16_63(58, g, h, a, b, c, d, e, f)
ROUND_16_63(59, 27, f, g, h, a, b, c, d, e) ROUND_16_63(59, f, g, h, a, b, c, d, e)
ROUND_16_63(60, 28, e, f, g, h, a, b, c, d) ROUND_16_63(60, e, f, g, h, a, b, c, d)
ROUND_16_63(61, 29, d, e, f, g, h, a, b, c) ROUND_16_63(61, d, e, f, g, h, a, b, c)
ROUND_16_63(62, 30, c, d, e, f, g, h, a, b) ROUND_16_63(62, c, d, e, f, g, h, a, b)
ROUND_16_63(63, 31, b, c, d, e, f, g, h, a) ROUND_16_63(63, b, c, d, e, f, g, h, a)
xorm( 0(BX), a) xorm( 0(BX), a)
xorm( 16(BX), b) xorm( 16(BX), b)
@ -619,7 +536,7 @@ avx:
VMOVDQU g, 96(BX) VMOVDQU g, 96(BX)
VMOVDQU h, 112(BX) VMOVDQU h, 112(BX)
MOVQ $T256_4<>(SB), AX MOVQ $·_K+0(SB), AX
MOVQ (SI), R8 MOVQ (SI), R8
MOVQ 8(SI), R9 MOVQ 8(SI), R9
MOVQ 16(SI), R10 MOVQ 16(SI), R10
@ -650,54 +567,54 @@ avxLoop:
AVX_ROUND_12_15(14, c, d, e, f, g, h, a, b) AVX_ROUND_12_15(14, c, d, e, f, g, h, a, b)
AVX_ROUND_12_15(15, b, c, d, e, f, g, h, a) AVX_ROUND_12_15(15, b, c, d, e, f, g, h, a)
AVX_ROUND_16_63(16, 16, a, b, c, d, e, f, g, h) AVX_ROUND_16_63(16, a, b, c, d, e, f, g, h)
AVX_ROUND_16_63(17, 17, h, a, b, c, d, e, f, g) AVX_ROUND_16_63(17, h, a, b, c, d, e, f, g)
AVX_ROUND_16_63(18, 18, g, h, a, b, c, d, e, f) AVX_ROUND_16_63(18, g, h, a, b, c, d, e, f)
AVX_ROUND_16_63(19, 19, f, g, h, a, b, c, d, e) AVX_ROUND_16_63(19, f, g, h, a, b, c, d, e)
AVX_ROUND_16_63(20, 20, e, f, g, h, a, b, c, d) AVX_ROUND_16_63(20, e, f, g, h, a, b, c, d)
AVX_ROUND_16_63(21, 21, d, e, f, g, h, a, b, c) AVX_ROUND_16_63(21, d, e, f, g, h, a, b, c)
AVX_ROUND_16_63(22, 22, c, d, e, f, g, h, a, b) AVX_ROUND_16_63(22, c, d, e, f, g, h, a, b)
AVX_ROUND_16_63(23, 23, b, c, d, e, f, g, h, a) AVX_ROUND_16_63(23, b, c, d, e, f, g, h, a)
AVX_ROUND_16_63(24, 24, a, b, c, d, e, f, g, h) AVX_ROUND_16_63(24, a, b, c, d, e, f, g, h)
AVX_ROUND_16_63(25, 25, h, a, b, c, d, e, f, g) AVX_ROUND_16_63(25, h, a, b, c, d, e, f, g)
AVX_ROUND_16_63(26, 26, g, h, a, b, c, d, e, f) AVX_ROUND_16_63(26, g, h, a, b, c, d, e, f)
AVX_ROUND_16_63(27, 27, f, g, h, a, b, c, d, e) AVX_ROUND_16_63(27, f, g, h, a, b, c, d, e)
AVX_ROUND_16_63(28, 28, e, f, g, h, a, b, c, d) AVX_ROUND_16_63(28, e, f, g, h, a, b, c, d)
AVX_ROUND_16_63(29, 29, d, e, f, g, h, a, b, c) AVX_ROUND_16_63(29, d, e, f, g, h, a, b, c)
AVX_ROUND_16_63(30, 30, c, d, e, f, g, h, a, b) AVX_ROUND_16_63(30, c, d, e, f, g, h, a, b)
AVX_ROUND_16_63(31, 31, b, c, d, e, f, g, h, a) AVX_ROUND_16_63(31, b, c, d, e, f, g, h, a)
AVX_ROUND_16_63(32, 32, a, b, c, d, e, f, g, h) AVX_ROUND_16_63(32, a, b, c, d, e, f, g, h)
AVX_ROUND_16_63(33, 33, h, a, b, c, d, e, f, g) AVX_ROUND_16_63(33, h, a, b, c, d, e, f, g)
AVX_ROUND_16_63(34, 34, g, h, a, b, c, d, e, f) AVX_ROUND_16_63(34, g, h, a, b, c, d, e, f)
AVX_ROUND_16_63(35, 35, f, g, h, a, b, c, d, e) AVX_ROUND_16_63(35, f, g, h, a, b, c, d, e)
AVX_ROUND_16_63(36, 36, e, f, g, h, a, b, c, d) AVX_ROUND_16_63(36, e, f, g, h, a, b, c, d)
AVX_ROUND_16_63(37, 37, d, e, f, g, h, a, b, c) AVX_ROUND_16_63(37, d, e, f, g, h, a, b, c)
AVX_ROUND_16_63(38, 38, c, d, e, f, g, h, a, b) AVX_ROUND_16_63(38, c, d, e, f, g, h, a, b)
AVX_ROUND_16_63(39, 39, b, c, d, e, f, g, h, a) AVX_ROUND_16_63(39, b, c, d, e, f, g, h, a)
AVX_ROUND_16_63(40, 40, a, b, c, d, e, f, g, h) AVX_ROUND_16_63(40, a, b, c, d, e, f, g, h)
AVX_ROUND_16_63(41, 41, h, a, b, c, d, e, f, g) AVX_ROUND_16_63(41, h, a, b, c, d, e, f, g)
AVX_ROUND_16_63(42, 42, g, h, a, b, c, d, e, f) AVX_ROUND_16_63(42, g, h, a, b, c, d, e, f)
AVX_ROUND_16_63(43, 43, f, g, h, a, b, c, d, e) AVX_ROUND_16_63(43, f, g, h, a, b, c, d, e)
AVX_ROUND_16_63(44, 44, e, f, g, h, a, b, c, d) AVX_ROUND_16_63(44, e, f, g, h, a, b, c, d)
AVX_ROUND_16_63(45, 45, d, e, f, g, h, a, b, c) AVX_ROUND_16_63(45, d, e, f, g, h, a, b, c)
AVX_ROUND_16_63(46, 46, c, d, e, f, g, h, a, b) AVX_ROUND_16_63(46, c, d, e, f, g, h, a, b)
AVX_ROUND_16_63(47, 47, b, c, d, e, f, g, h, a) AVX_ROUND_16_63(47, b, c, d, e, f, g, h, a)
AVX_ROUND_16_63(48, 16, a, b, c, d, e, f, g, h) AVX_ROUND_16_63(48, a, b, c, d, e, f, g, h)
AVX_ROUND_16_63(49, 17, h, a, b, c, d, e, f, g) AVX_ROUND_16_63(49, h, a, b, c, d, e, f, g)
AVX_ROUND_16_63(50, 18, g, h, a, b, c, d, e, f) AVX_ROUND_16_63(50, g, h, a, b, c, d, e, f)
AVX_ROUND_16_63(51, 19, f, g, h, a, b, c, d, e) AVX_ROUND_16_63(51, f, g, h, a, b, c, d, e)
AVX_ROUND_16_63(52, 20, e, f, g, h, a, b, c, d) AVX_ROUND_16_63(52, e, f, g, h, a, b, c, d)
AVX_ROUND_16_63(53, 21, d, e, f, g, h, a, b, c) AVX_ROUND_16_63(53, d, e, f, g, h, a, b, c)
AVX_ROUND_16_63(54, 22, c, d, e, f, g, h, a, b) AVX_ROUND_16_63(54, c, d, e, f, g, h, a, b)
AVX_ROUND_16_63(55, 23, b, c, d, e, f, g, h, a) AVX_ROUND_16_63(55, b, c, d, e, f, g, h, a)
AVX_ROUND_16_63(56, 24, a, b, c, d, e, f, g, h) AVX_ROUND_16_63(56, a, b, c, d, e, f, g, h)
AVX_ROUND_16_63(57, 25, h, a, b, c, d, e, f, g) AVX_ROUND_16_63(57, h, a, b, c, d, e, f, g)
AVX_ROUND_16_63(58, 26, g, h, a, b, c, d, e, f) AVX_ROUND_16_63(58, g, h, a, b, c, d, e, f)
AVX_ROUND_16_63(59, 27, f, g, h, a, b, c, d, e) AVX_ROUND_16_63(59, f, g, h, a, b, c, d, e)
AVX_ROUND_16_63(60, 28, e, f, g, h, a, b, c, d) AVX_ROUND_16_63(60, e, f, g, h, a, b, c, d)
AVX_ROUND_16_63(61, 29, d, e, f, g, h, a, b, c) AVX_ROUND_16_63(61, d, e, f, g, h, a, b, c)
AVX_ROUND_16_63(62, 30, c, d, e, f, g, h, a, b) AVX_ROUND_16_63(62, c, d, e, f, g, h, a, b)
AVX_ROUND_16_63(63, 31, b, c, d, e, f, g, h, a) AVX_ROUND_16_63(63, b, c, d, e, f, g, h, a)
avxXorm( 0(BX), a) avxXorm( 0(BX), a)
avxXorm( 16(BX), b) avxXorm( 16(BX), b)

View File

@ -50,52 +50,32 @@ func createTwoBlocksBy4() [4]*byte {
func TestBlockMultBy4(t *testing.T) { func TestBlockMultBy4(t *testing.T) {
digs := initState4() digs := initState4()
p := createOneBlockBy4() p := createOneBlockBy4()
buffer := make([]byte, preallocSize) buffer := make([]byte, preallocSizeBy4)
blockMultBy4(&digs[0], &p[0], &buffer[0], 1) blockMultBy4(&digs[0], &p[0], &buffer[0], 1)
expected := "[66c7f0f4 62eeedd9 d1f2d46b dc10e4e2 4167c487 5cf2f7a2 297da02b 8f4ba8e0]" expected := "[66c7f0f4 62eeedd9 d1f2d46b dc10e4e2 4167c487 5cf2f7a2 297da02b 8f4ba8e0]"
s := fmt.Sprintf("%x", digs[0][:]) for i:=0; i<4; i++ {
if s != expected { s := fmt.Sprintf("%x", digs[i][:])
t.Errorf("digs[0] got %s", s) if s != expected {
} t.Errorf("digs[%d] got %s", i, s)
s = fmt.Sprintf("%x", digs[1][:]) }
if s != expected {
t.Errorf("digs[1] got %s", s)
}
s = fmt.Sprintf("%x", digs[2][:])
if s != expected {
t.Errorf("digs[2] got %s", s)
}
s = fmt.Sprintf("%x", digs[3][:])
if s != expected {
t.Errorf("digs[3] got %s", s)
} }
digs = initState4() digs = initState4()
p = createTwoBlocksBy4() p = createTwoBlocksBy4()
blockMultBy4(&digs[0], &p[0], &buffer[0], 2) blockMultBy4(&digs[0], &p[0], &buffer[0], 2)
expected = "[debe9ff9 2275b8a1 38604889 c18e5a4d 6fdb70e5 387e5765 293dcba3 9c0c5732]" expected = "[debe9ff9 2275b8a1 38604889 c18e5a4d 6fdb70e5 387e5765 293dcba3 9c0c5732]"
s = fmt.Sprintf("%x", digs[0][:]) for i:=0; i<4; i++ {
if s != expected { s := fmt.Sprintf("%x", digs[i][:])
t.Errorf("digs[0] got %s", s) if s != expected {
} t.Errorf("digs[%d] got %s", i, s)
s = fmt.Sprintf("%x", digs[1][:]) }
if s != expected {
t.Errorf("digs[1] got %s", s)
}
s = fmt.Sprintf("%x", digs[2][:])
if s != expected {
t.Errorf("digs[2] got %s", s)
}
s = fmt.Sprintf("%x", digs[3][:])
if s != expected {
t.Errorf("digs[3] got %s", s)
} }
} }
func BenchmarkOneBlockBy4(b *testing.B) { func BenchmarkOneBlockBy4(b *testing.B) {
digs := initState4() digs := initState4()
p := createOneBlockBy4() p := createOneBlockBy4()
buffer := make([]byte, 1216) buffer := make([]byte, preallocSizeBy4)
b.SetBytes(64 * 4) b.SetBytes(64 * 4)
b.ReportAllocs() b.ReportAllocs()
b.ResetTimer() b.ResetTimer()
@ -107,7 +87,7 @@ func BenchmarkOneBlockBy4(b *testing.B) {
func BenchmarkTwoBlocksBy4(b *testing.B) { func BenchmarkTwoBlocksBy4(b *testing.B) {
digs := initState4() digs := initState4()
p := createTwoBlocksBy4() p := createTwoBlocksBy4()
buffer := make([]byte, 1216) buffer := make([]byte, preallocSizeBy4)
b.SetBytes(64 * 2 * 4) b.SetBytes(64 * 2 * 4)
b.ReportAllocs() b.ReportAllocs()
b.ResetTimer() b.ResetTimer()