// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"
// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), RODATA, $16
// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), RODATA, $16
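// Reference (Go), per 32-bit word; swap32/rotl8 are illustrative names, not
// symbols in this file:
//
//	swap32 := bits.ReverseBytes32 // flip_mask: LE -> BE
//	rotl8 := func(x uint32) uint32 { return bits.RotateLeft32(x, 8) } // r08_mask: x <<< 8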
// Transpose a 4x4 matrix of 32-bit words with PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ
// (the Go assembler spells the first two PUNPCKHLQ/PUNPCKLLQ).
// input: from high to low
// r0 = [w3, w2, w1, w0]
// r1 = [w7, w6, w5, w4]
// r2 = [w11, w10, w9, w8]
// r3 = [w15, w14, w13, w12]
// tmp1: 128 bits temp register
// tmp2: 128 bits temp register
//
// output: from high to low
// r0 = [w12, w8, w4, w0]
// r1 = [w13, w9, w5, w1]
// r2 = [w14, w10, w6, w2]
// r3 = [w15, w11, w7, w3]
//
// The same sequence in Intel SSE2 mnemonics:
// MOVOU r0, tmp2;
// PUNPCKHDQ r1, tmp2;
// PUNPCKLDQ r1, r0;
// MOVOU r2, tmp1;
// PUNPCKLDQ r3, tmp1;
// PUNPCKHDQ r3, r2;
// MOVOU r0, r1;
// PUNPCKHQDQ tmp1, r1;
// PUNPCKLQDQ tmp1, r0;
// MOVOU tmp2, r3;
// PUNPCKHQDQ r2, r3;
// PUNPCKLQDQ r2, tmp2;
// MOVOU tmp2, r2
#define SSE_TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
MOVOU r0, tmp2; \
PUNPCKHLQ r1, tmp2; \
PUNPCKLLQ r1, r0; \
MOVOU r2, tmp1; \
PUNPCKLLQ r3, tmp1; \
PUNPCKHLQ r3, r2; \
MOVOU r0, r1; \
PUNPCKHQDQ tmp1, r1; \
PUNPCKLQDQ tmp1, r0; \
MOVOU tmp2, r3; \
PUNPCKHQDQ r2, r3; \
PUNPCKLQDQ r2, tmp2; \
MOVOU tmp2, r2
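// Reference (Go): the net effect is a 4x4 transpose of 32-bit words (a
// sketch with a hypothetical helper name):
//
//	func transpose4x4(m *[4][4]uint32) {
//		for i := 0; i < 4; i++ {
//			for j := i + 1; j < 4; j++ {
//				m[i][j], m[j][i] = m[j][i], m[i][j]
//			}
//		}
//	}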
#define a X0
#define b X1
#define c X2
#define d X3
#define e X4
#define f X5
#define g X6
#define h X7
#define tmp1 X8
#define tmp2 X9
#define storeState(R) \
	MOVOU a, (R); \
	MOVOU b, 16(R); \
	MOVOU c, 32(R); \
	MOVOU d, 48(R); \
	MOVOU e, 64(R); \
	MOVOU f, 80(R); \
	MOVOU g, 96(R); \
	MOVOU h, 112(R)
#define storeWord(W, j) MOVOU W, (128+(j)*16)(BX)
#define loadWord(W, i) MOVOU (128+(i)*16)(BX), W
#define SSE_REV32(a, b, c, d) \
PSHUFB flip_mask<>(SB), a; \
PSHUFB flip_mask<>(SB), b; \
PSHUFB flip_mask<>(SB), c; \
PSHUFB flip_mask<>(SB), d
#define prepare4Words(i) \
MOVOU (i*16)(R8), X10; \
MOVOU (i*16)(R9), X11; \
MOVOU (i*16)(R10), X12; \
MOVOU (i*16)(R11), X13; \
; \
SSE_TRANSPOSE_MATRIX(X10, X11, X12, X13, tmp1, tmp2); \
SSE_REV32(X10, X11, X12, X13); \
; \
storeWord(X10, 4*i+0); \
storeWord(X11, 4*i+1); \
storeWord(X12, 4*i+2); \
storeWord(X13, 4*i+3)
#define LOAD_T(index, T) \
MOVL (index*4)(AX), T; \
PSHUFD $0, T, T
// r <<< n, SSE version
#define PROLD(r, n) \
MOVOU r, tmp1; \
PSLLL $n, r; \
PSRLL $(32-n), tmp1; \
POR tmp1, r
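// Reference (Go): per 32-bit lane, PROLD(r, n) is
//
//	r = bits.RotateLeft32(r, n) // == r<<n | r>>(32-n)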
#define SSE_SS1SS2(index, a, e, TMP, SS1, SS2) \
MOVOU a, SS1; \
PROLD(SS1, 12); \
MOVOU SS1, SS2; \ // a <<< 12
LOAD_T(index, TMP); \
PADDL TMP, SS1; \
PADDL e, SS1; \
PROLD(SS1, 7); \ // SS1
PXOR SS1, SS2 // SS2
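// Reference (Go), per lane (a sketch; assumes the table at AX holds the
// round constant T_j already rotated left by the round index j, as LOAD_T
// implies):
//
//	a12 := bits.RotateLeft32(a, 12)
//	ss1 := bits.RotateLeft32(a12+e+k[j], 7)
//	ss2 := ss1 ^ a12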
#define SSE_FF0(X, Y, Z, DST) \
MOVOU X, DST; \
PXOR Y, DST; \
PXOR Z, DST
#define SSE_FF1(X, Y, Z, TMP, DST) \
MOVOU X, DST; \
POR Y, DST; \
MOVOU X, TMP; \
PAND Y, TMP; \
PAND Z, DST; \
POR TMP, DST // (X AND Y) OR (X AND Z) OR (Y AND Z)
#define SSE_GG0(X, Y, Z, DST) \
SSE_FF0(X, Y, Z, DST)
// DST = ((Y XOR Z) AND X) XOR Z
#define SSE_GG1(X, Y, Z, DST) \
MOVOU Y, DST; \
PXOR Z, DST; \
PAND X, DST; \
PXOR Z, DST
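// Reference (Go), per 32-bit lane:
//
//	ff0 := func(x, y, z uint32) uint32 { return x ^ y ^ z }
//	ff1 := func(x, y, z uint32) uint32 { return (x & y) | (x & z) | (y & z) }
//	gg0 := ff0
//	gg1 := func(x, y, z uint32) uint32 { return ((y ^ z) & x) ^ z } // == (x & y) | (^x & z)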
#define SSE_COPY_RESULT(b, d, f, h, TT1, TT2) \
PROLD(b, 9); \
MOVOU TT1, h; \
PROLD(f, 19); \
MOVOU TT2, TT1; \
PROLD(TT1, 9); \
PXOR TT1, TT2; \ // tt2 XOR ROTL(9, tt2)
PSHUFB r08_mask<>(SB), TT1; \ // ROTL(17, tt2)
PXOR TT2, TT1; \ // tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2)
MOVOU TT1, d
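// Reference (Go), per lane: the tail of each round performs (a sketch)
//
//	d = tt2 ^ bits.RotateLeft32(tt2, 9) ^ bits.RotateLeft32(tt2, 17) // P0(tt2)
//	b = bits.RotateLeft32(b, 9)
//	f = bits.RotateLeft32(f, 19)
//	h = tt1
//
// The rotate-by-17 is built from the rotate-by-9 result plus the byte-level
// rotate-by-8 of r08_mask.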
#define ROUND_00_11(index, a, b, c, d, e, f, g, h) \
SSE_SS1SS2(index, a, e, tmp2, X12, X13); \
SSE_FF0(a, b, c, X14); \
PADDL d, X14; \ // (a XOR b XOR c) + d
loadWord(X10, index); \
loadWord(X11, index+4); \
PXOR X10, X11; \ // Wt XOR Wt+4
PADDL X11, X14; \ // (a XOR b XOR c) + d + Wt XOR Wt+4
PADDL X14, X13; \ // TT1
PADDL h, X10; \ // Wt + h
PADDL X12, X10; \ // Wt + h + SS1
SSE_GG0(e, f, g, X11); \
PADDL X11, X10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
; \ // copy result
SSE_COPY_RESULT(b, d, f, h, X13, X10)
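// Reference (Go), per lane, rounds 0-11 (a sketch; W is the schedule stored
// at 128(BX), ss1/ss2 from SSE_SS1SS2 above):
//
//	tt1 := ff0(a, b, c) + d + ss2 + (W[j] ^ W[j+4])
//	tt2 := gg0(e, f, g) + h + ss1 + W[j]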
#define MESSAGE_SCHEDULE(index) \
loadWord(X10, index+1); \ // Wj-3
PROLD(X10, 15); \
loadWord(X11, index-12); \ // Wj-16
PXOR X11, X10; \
loadWord(X11, index-5); \ // Wj-9
PXOR X11, X10; \
MOVOU X10, X11; \
PROLD(X11, 15); \
PXOR X11, X10; \
PSHUFB r08_mask<>(SB), X11; \
PXOR X11, X10; \ // P1
loadWord(X11, index-9); \ // Wj-13
PROLD(X11, 7); \
PXOR X11, X10; \
loadWord(X11, index-2); \ // Wj-6
PXOR X10, X11; \
storeWord(X11, index+4)
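// Reference (Go): with j = index+4, one schedule step computes (a sketch)
//
//	W[j] = p1(W[j-16]^W[j-9]^bits.RotateLeft32(W[j-3], 15)) ^
//		bits.RotateLeft32(W[j-13], 7) ^ W[j-6]
//
// where p1(x) = x ^ bits.RotateLeft32(x, 15) ^ bits.RotateLeft32(x, 23).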
#define ROUND_12_15(index, a, b, c, d, e, f, g, h) \
MESSAGE_SCHEDULE(index); \
ROUND_00_11(index, a, b, c, d, e, f, g, h)
#define ROUND_16_63(index, a, b, c, d, e, f, g, h) \
MESSAGE_SCHEDULE(index); \ // X11 now holds Wt+4; do not clobber it
SSE_SS1SS2(index, a, e, tmp2, X12, X13); \
; \
SSE_FF1(a, b, c, X10, X14); \
PADDL d, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d
loadWord(X10, index); \
PXOR X10, X11; \ // Wt XOR Wt+4
PADDL X11, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d + Wt XOR Wt+4
PADDL X14, X13; \ // TT1
; \
PADDL h, X10; \ // Wt + h
PADDL X12, X10; \ // Wt + h + SS1
SSE_GG1(e, f, g, X11); \
PADDL X11, X10; \ // TT2 = GG1(e, f, g) + Wt + h + SS1
; \ // copy result
SSE_COPY_RESULT(b, d, f, h, X13, X10)
// transpose matrix function, AVX version
// parameters:
// - r0: 128 bits register as input/output data
// - r1: 128 bits register as input/output data
// - r2: 128 bits register as input/output data
// - r3: 128 bits register as input/output data
// - tmp1: 128 bits temp register
// - tmp2: 128 bits temp register
#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w07, w03, w06, w02]
VPUNPCKLDQ r1, r0, r0; \ // r0 = [w05, w01, w04, w00]
VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w13, w09, w12, w08]
VPUNPCKHDQ r3, r2, r2; \ // r2 = [w15, w11, w14, w10]
VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w13, w09, w05, w01]
VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w12, w08, w04, w00]
VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w15, w11, w07, w03]
VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w14, w10, w06, w02]
#define avxStoreWord(W, j) VMOVDQU W, (128+(j)*16)(BX)
#define avxLoadWord(W, i) VMOVDQU (128+(i)*16)(BX), W
#define avxStoreState(R) \
	VMOVDQU a, (0*16)(R); \
	VMOVDQU b, (1*16)(R); \
	VMOVDQU c, (2*16)(R); \
	VMOVDQU d, (3*16)(R); \
	VMOVDQU e, (4*16)(R); \
	VMOVDQU f, (5*16)(R); \
	VMOVDQU g, (6*16)(R); \
	VMOVDQU h, (7*16)(R)
#define AVX_REV32(a, b, c, d) \
VPSHUFB flip_mask<>(SB), a, a; \
VPSHUFB flip_mask<>(SB), b, b; \
VPSHUFB flip_mask<>(SB), c, c; \
VPSHUFB flip_mask<>(SB), d, d
#define avxPrepare4Words(i) \
VMOVDQU (i*16)(R8), X10; \
VMOVDQU (i*16)(R9), X11; \
VMOVDQU (i*16)(R10), X12; \
VMOVDQU (i*16)(R11), X13; \
; \
TRANSPOSE_MATRIX(X10, X11, X12, X13, tmp1, tmp2); \
AVX_REV32(X10, X11, X12, X13); \
; \
avxStoreWord(X10, 4*i+0); \
avxStoreWord(X11, 4*i+1); \
avxStoreWord(X12, 4*i+2); \
avxStoreWord(X13, 4*i+3)
#define AVX_LOAD_T(index, T) \
MOVL (index*4)(AX), T; \
VPSHUFD $0, T, T
// r <<< n
#define VPROLD(r, n) \
VPSLLD $(n), r, tmp1; \
VPSRLD $(32-n), r, r; \
VPOR tmp1, r, r
// d = r <<< n
#define VPROLD2(r, d, n) \
VPSLLD $(n), r, tmp1; \
VPSRLD $(32-n), r, d; \
VPOR tmp1, d, d
#define AVX_SS1SS2(index, a, e, SS1, SS2) \
VPROLD2(a, SS2, 12); \ // a <<< 12
AVX_LOAD_T(index, SS1); \
VPADDD SS1, SS2, SS1; \
VPADDD e, SS1, SS1; \
VPROLD(SS1, 7); \ // SS1
VPXOR SS1, SS2, SS2
// DST = X XOR Y XOR Z
#define AVX_FF0(X, Y, Z, DST) \
VPXOR X, Y, DST; \
VPXOR Z, DST, DST
// DST = (X AND Y) OR (X AND Z) OR (Y AND Z)
#define AVX_FF1(X, Y, Z, TMP, DST) \
VPOR X, Y, DST; \
VPAND X, Y, TMP; \
VPAND Z, DST, DST; \
VPOR TMP, DST, DST
// DST = X XOR Y XOR Z
#define AVX_GG0(X, Y, Z, DST) \
AVX_FF0(X, Y, Z, DST)
// DST = ((Y XOR Z) AND X) XOR Z
#define AVX_GG1(X, Y, Z, DST) \
VPXOR Y, Z, DST; \
VPAND X, DST, DST; \
VPXOR Z, DST, DST
#define AVX_COPY_RESULT(b, d, f, h, TT1, TT2) \
VPROLD(b, 9); \
VMOVDQU TT1, h; \
VPROLD(f, 19); \
VPROLD2(TT2, TT1, 9); \ // tt2 <<< 9
VPXOR TT2, TT1, TT2; \ // tt2 XOR ROTL(9, tt2)
VPSHUFB r08_mask<>(SB), TT1, TT1; \ // ROTL(17, tt2)
VPXOR TT2, TT1, d // d = tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2)
#define AVX_ROUND_00_11(index, a, b, c, d, e, f, g, h) \
AVX_SS1SS2(index, a, e, X12, X13); \
; \
AVX_FF0(a, b, c, X14); \
VPADDD d, X14, X14; \ // (a XOR b XOR c) + d
avxLoadWord(X10, index); \
avxLoadWord(X11, index+4); \
VPXOR X10, X11, X11; \ // Wt XOR Wt+4
VPADDD X11, X14, X14; \ // (a XOR b XOR c) + d + Wt XOR Wt+4
VPADDD X14, X13, X13; \ // TT1
VPADDD h, X10, X10; \ // Wt + h
VPADDD X12, X10, X10; \ // Wt + h + SS1
AVX_GG0(e, f, g, X11); \
VPADDD X11, X10, X10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
; \ // copy result
AVX_COPY_RESULT(b, d, f, h, X13, X10)
#define AVX_MESSAGE_SCHEDULE(index) \
avxLoadWord(X10, index+1); \ // Wj-3
VPROLD(X10, 15); \
VPXOR (128+(index-12)*16)(BX), X10, X10; \ // Wj-16
VPXOR (128+(index-5)*16)(BX), X10, X10; \ // Wj-9
; \ // P1
VPROLD2(X10, X11, 15); \
VPXOR X11, X10, X10; \
VPSHUFB r08_mask<>(SB), X11, X11; \
VPXOR X11, X10, X10; \ // P1
avxLoadWord(X11, index-9); \ // Wj-13
VPROLD(X11, 7); \
VPXOR X11, X10, X10; \
VPXOR (128+(index-2)*16)(BX), X10, X11; \
avxStoreWord(X11, index+4)
#define AVX_ROUND_12_15(index, a, b, c, d, e, f, g, h) \
AVX_MESSAGE_SCHEDULE(index); \
AVX_ROUND_00_11(index, a, b, c, d, e, f, g, h)
#define AVX_ROUND_16_63(index, a, b, c, d, e, f, g, h) \
AVX_MESSAGE_SCHEDULE(index); \ // X11 now holds Wt+4; do not clobber it
AVX_SS1SS2(index, a, e, X12, X13); \
; \
AVX_FF1(a, b, c, X10, X14); \
VPADDD d, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d
avxLoadWord(X10, index); \
VPXOR X10, X11, X11; \ // Wt XOR Wt+4
VPADDD X11, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d + Wt XOR Wt+4
VPADDD X14, X13, X13; \ // TT1
; \
VPADDD h, X10, X10; \ // Wt + h
VPADDD X12, X10, X10; \ // Wt + h + SS1
AVX_GG1(e, f, g, X11); \
VPADDD X11, X10, X10; \ // TT2 = GG1(e, f, g) + Wt + h + SS1
; \ // copy result
AVX_COPY_RESULT(b, d, f, h, X13, X10)
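// blockMultBy4 compresses the same-position block of four independent
// messages in parallel: each state word a..h lives in one XMM register with
// one 32-bit lane per message. The scratch buffer at BX holds the saved
// transposed state (8*16 bytes) followed by the expanded message words
// W[0..67] (68*16 bytes).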
// blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int)
TEXT ·blockMultBy4(SB),NOSPLIT,$0
MOVQ dig+0(FP), DI
MOVQ p+8(FP), SI
MOVQ buffer+16(FP), BX
MOVQ blocks+24(FP), DX
CMPB ·useAVX(SB), $1
JE avx
// load state
MOVQ (DI), R8
MOVOU (0*16)(R8), a
MOVOU (1*16)(R8), e
MOVQ 8(DI), R8
MOVOU (0*16)(R8), b
MOVOU (1*16)(R8), f
MOVQ 16(DI), R8
MOVOU (0*16)(R8), c
MOVOU (1*16)(R8), g
MOVQ 24(DI), R8
MOVOU (0*16)(R8), d
MOVOU (1*16)(R8), h
// transpose state
SSE_TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
SSE_TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)
// store state to temporary buffer
storeState(BX)
MOVQ $·_K+0(SB), AX
MOVQ (SI), R8
MOVQ 8(SI), R9
MOVQ 16(SI), R10
MOVQ 24(SI), R11
loop:
// load message block
prepare4Words(0)
prepare4Words(1)
prepare4Words(2)
prepare4Words(3)
ROUND_00_11(0, a, b, c, d, e, f, g, h)
ROUND_00_11(1, h, a, b, c, d, e, f, g)
ROUND_00_11(2, g, h, a, b, c, d, e, f)
ROUND_00_11(3, f, g, h, a, b, c, d, e)
ROUND_00_11(4, e, f, g, h, a, b, c, d)
ROUND_00_11(5, d, e, f, g, h, a, b, c)
ROUND_00_11(6, c, d, e, f, g, h, a, b)
ROUND_00_11(7, b, c, d, e, f, g, h, a)
ROUND_00_11(8, a, b, c, d, e, f, g, h)
ROUND_00_11(9, h, a, b, c, d, e, f, g)
ROUND_00_11(10, g, h, a, b, c, d, e, f)
ROUND_00_11(11, f, g, h, a, b, c, d, e)
ROUND_12_15(12, e, f, g, h, a, b, c, d)
ROUND_12_15(13, d, e, f, g, h, a, b, c)
ROUND_12_15(14, c, d, e, f, g, h, a, b)
ROUND_12_15(15, b, c, d, e, f, g, h, a)
ROUND_16_63(16, a, b, c, d, e, f, g, h)
ROUND_16_63(17, h, a, b, c, d, e, f, g)
ROUND_16_63(18, g, h, a, b, c, d, e, f)
ROUND_16_63(19, f, g, h, a, b, c, d, e)
ROUND_16_63(20, e, f, g, h, a, b, c, d)
ROUND_16_63(21, d, e, f, g, h, a, b, c)
ROUND_16_63(22, c, d, e, f, g, h, a, b)
ROUND_16_63(23, b, c, d, e, f, g, h, a)
ROUND_16_63(24, a, b, c, d, e, f, g, h)
ROUND_16_63(25, h, a, b, c, d, e, f, g)
ROUND_16_63(26, g, h, a, b, c, d, e, f)
ROUND_16_63(27, f, g, h, a, b, c, d, e)
ROUND_16_63(28, e, f, g, h, a, b, c, d)
ROUND_16_63(29, d, e, f, g, h, a, b, c)
ROUND_16_63(30, c, d, e, f, g, h, a, b)
ROUND_16_63(31, b, c, d, e, f, g, h, a)
ROUND_16_63(32, a, b, c, d, e, f, g, h)
ROUND_16_63(33, h, a, b, c, d, e, f, g)
ROUND_16_63(34, g, h, a, b, c, d, e, f)
ROUND_16_63(35, f, g, h, a, b, c, d, e)
ROUND_16_63(36, e, f, g, h, a, b, c, d)
ROUND_16_63(37, d, e, f, g, h, a, b, c)
ROUND_16_63(38, c, d, e, f, g, h, a, b)
ROUND_16_63(39, b, c, d, e, f, g, h, a)
ROUND_16_63(40, a, b, c, d, e, f, g, h)
ROUND_16_63(41, h, a, b, c, d, e, f, g)
ROUND_16_63(42, g, h, a, b, c, d, e, f)
ROUND_16_63(43, f, g, h, a, b, c, d, e)
ROUND_16_63(44, e, f, g, h, a, b, c, d)
ROUND_16_63(45, d, e, f, g, h, a, b, c)
ROUND_16_63(46, c, d, e, f, g, h, a, b)
ROUND_16_63(47, b, c, d, e, f, g, h, a)
ROUND_16_63(48, a, b, c, d, e, f, g, h)
ROUND_16_63(49, h, a, b, c, d, e, f, g)
ROUND_16_63(50, g, h, a, b, c, d, e, f)
ROUND_16_63(51, f, g, h, a, b, c, d, e)
ROUND_16_63(52, e, f, g, h, a, b, c, d)
ROUND_16_63(53, d, e, f, g, h, a, b, c)
ROUND_16_63(54, c, d, e, f, g, h, a, b)
ROUND_16_63(55, b, c, d, e, f, g, h, a)
ROUND_16_63(56, a, b, c, d, e, f, g, h)
ROUND_16_63(57, h, a, b, c, d, e, f, g)
ROUND_16_63(58, g, h, a, b, c, d, e, f)
ROUND_16_63(59, f, g, h, a, b, c, d, e)
ROUND_16_63(60, e, f, g, h, a, b, c, d)
ROUND_16_63(61, d, e, f, g, h, a, b, c)
ROUND_16_63(62, c, d, e, f, g, h, a, b)
ROUND_16_63(63, b, c, d, e, f, g, h, a)
MOVOU (0*16)(BX), tmp1
PXOR tmp1, a
MOVOU (1*16)(BX), tmp1
PXOR tmp1, b
MOVOU (2*16)(BX), tmp1
PXOR tmp1, c
MOVOU (3*16)(BX), tmp1
PXOR tmp1, d
MOVOU (4*16)(BX), tmp1
PXOR tmp1, e
MOVOU (5*16)(BX), tmp1
PXOR tmp1, f
MOVOU (6*16)(BX), tmp1
PXOR tmp1, g
MOVOU (7*16)(BX), tmp1
PXOR tmp1, h
DECQ DX
JZ end
storeState(BX)
LEAQ 64(R8), R8
LEAQ 64(R9), R9
LEAQ 64(R10), R10
LEAQ 64(R11), R11
JMP loop
end:
// transpose state
SSE_TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
SSE_TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)
MOVQ (DI), R8
MOVOU a, (0*16)(R8)
MOVOU e, (1*16)(R8)
MOVQ 8(DI), R8
MOVOU b, (0*16)(R8)
MOVOU f, (1*16)(R8)
MOVQ 16(DI), R8
MOVOU c, (0*16)(R8)
MOVOU g, (1*16)(R8)
MOVQ 24(DI), R8
MOVOU d, (0*16)(R8)
MOVOU h, (1*16)(R8)
RET
avx:
// load state
MOVQ (DI), R8
VMOVDQU (0*16)(R8), a
VMOVDQU (1*16)(R8), e
MOVQ 8(DI), R8
VMOVDQU (0*16)(R8), b
VMOVDQU (1*16)(R8), f
MOVQ 16(DI), R8
VMOVDQU (0*16)(R8), c
VMOVDQU (1*16)(R8), g
MOVQ 24(DI), R8
VMOVDQU (0*16)(R8), d
VMOVDQU (1*16)(R8), h
// transpose state
TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)
avxStoreState(BX)
MOVQ $·_K+0(SB), AX
MOVQ (SI), R8
MOVQ 8(SI), R9
MOVQ 16(SI), R10
MOVQ 24(SI), R11
avxLoop:
// load message block
avxPrepare4Words(0)
avxPrepare4Words(1)
avxPrepare4Words(2)
avxPrepare4Words(3)
AVX_ROUND_00_11(0, a, b, c, d, e, f, g, h)
AVX_ROUND_00_11(1, h, a, b, c, d, e, f, g)
AVX_ROUND_00_11(2, g, h, a, b, c, d, e, f)
AVX_ROUND_00_11(3, f, g, h, a, b, c, d, e)
AVX_ROUND_00_11(4, e, f, g, h, a, b, c, d)
AVX_ROUND_00_11(5, d, e, f, g, h, a, b, c)
AVX_ROUND_00_11(6, c, d, e, f, g, h, a, b)
AVX_ROUND_00_11(7, b, c, d, e, f, g, h, a)
AVX_ROUND_00_11(8, a, b, c, d, e, f, g, h)
AVX_ROUND_00_11(9, h, a, b, c, d, e, f, g)
AVX_ROUND_00_11(10, g, h, a, b, c, d, e, f)
AVX_ROUND_00_11(11, f, g, h, a, b, c, d, e)
AVX_ROUND_12_15(12, e, f, g, h, a, b, c, d)
AVX_ROUND_12_15(13, d, e, f, g, h, a, b, c)
AVX_ROUND_12_15(14, c, d, e, f, g, h, a, b)
AVX_ROUND_12_15(15, b, c, d, e, f, g, h, a)
AVX_ROUND_16_63(16, a, b, c, d, e, f, g, h)
AVX_ROUND_16_63(17, h, a, b, c, d, e, f, g)
AVX_ROUND_16_63(18, g, h, a, b, c, d, e, f)
AVX_ROUND_16_63(19, f, g, h, a, b, c, d, e)
AVX_ROUND_16_63(20, e, f, g, h, a, b, c, d)
AVX_ROUND_16_63(21, d, e, f, g, h, a, b, c)
AVX_ROUND_16_63(22, c, d, e, f, g, h, a, b)
AVX_ROUND_16_63(23, b, c, d, e, f, g, h, a)
AVX_ROUND_16_63(24, a, b, c, d, e, f, g, h)
AVX_ROUND_16_63(25, h, a, b, c, d, e, f, g)
AVX_ROUND_16_63(26, g, h, a, b, c, d, e, f)
AVX_ROUND_16_63(27, f, g, h, a, b, c, d, e)
AVX_ROUND_16_63(28, e, f, g, h, a, b, c, d)
AVX_ROUND_16_63(29, d, e, f, g, h, a, b, c)
AVX_ROUND_16_63(30, c, d, e, f, g, h, a, b)
AVX_ROUND_16_63(31, b, c, d, e, f, g, h, a)
AVX_ROUND_16_63(32, a, b, c, d, e, f, g, h)
AVX_ROUND_16_63(33, h, a, b, c, d, e, f, g)
AVX_ROUND_16_63(34, g, h, a, b, c, d, e, f)
AVX_ROUND_16_63(35, f, g, h, a, b, c, d, e)
AVX_ROUND_16_63(36, e, f, g, h, a, b, c, d)
AVX_ROUND_16_63(37, d, e, f, g, h, a, b, c)
AVX_ROUND_16_63(38, c, d, e, f, g, h, a, b)
AVX_ROUND_16_63(39, b, c, d, e, f, g, h, a)
AVX_ROUND_16_63(40, a, b, c, d, e, f, g, h)
AVX_ROUND_16_63(41, h, a, b, c, d, e, f, g)
AVX_ROUND_16_63(42, g, h, a, b, c, d, e, f)
AVX_ROUND_16_63(43, f, g, h, a, b, c, d, e)
AVX_ROUND_16_63(44, e, f, g, h, a, b, c, d)
AVX_ROUND_16_63(45, d, e, f, g, h, a, b, c)
AVX_ROUND_16_63(46, c, d, e, f, g, h, a, b)
AVX_ROUND_16_63(47, b, c, d, e, f, g, h, a)
AVX_ROUND_16_63(48, a, b, c, d, e, f, g, h)
AVX_ROUND_16_63(49, h, a, b, c, d, e, f, g)
AVX_ROUND_16_63(50, g, h, a, b, c, d, e, f)
AVX_ROUND_16_63(51, f, g, h, a, b, c, d, e)
AVX_ROUND_16_63(52, e, f, g, h, a, b, c, d)
AVX_ROUND_16_63(53, d, e, f, g, h, a, b, c)
AVX_ROUND_16_63(54, c, d, e, f, g, h, a, b)
AVX_ROUND_16_63(55, b, c, d, e, f, g, h, a)
AVX_ROUND_16_63(56, a, b, c, d, e, f, g, h)
AVX_ROUND_16_63(57, h, a, b, c, d, e, f, g)
AVX_ROUND_16_63(58, g, h, a, b, c, d, e, f)
AVX_ROUND_16_63(59, f, g, h, a, b, c, d, e)
AVX_ROUND_16_63(60, e, f, g, h, a, b, c, d)
AVX_ROUND_16_63(61, d, e, f, g, h, a, b, c)
AVX_ROUND_16_63(62, c, d, e, f, g, h, a, b)
AVX_ROUND_16_63(63, b, c, d, e, f, g, h, a)
VPXOR (0*16)(BX), a, a
VPXOR (1*16)(BX), b, b
VPXOR (2*16)(BX), c, c
VPXOR (3*16)(BX), d, d
VPXOR (4*16)(BX), e, e
VPXOR (5*16)(BX), f, f
VPXOR (6*16)(BX), g, g
VPXOR (7*16)(BX), h, h
DECQ DX
JZ avxEnd
// store current state
avxStoreState(BX)
LEAQ 64(R8), R8
LEAQ 64(R9), R9
LEAQ 64(R10), R10
LEAQ 64(R11), R11
JMP avxLoop
avxEnd:
// transpose state
TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)
MOVQ (DI), R8
VMOVDQU a, (0*16)(R8)
VMOVDQU e, (1*16)(R8)
MOVQ 8(DI), R8
VMOVDQU b, (0*16)(R8)
VMOVDQU f, (1*16)(R8)
MOVQ 16(DI), R8
VMOVDQU c, (0*16)(R8)
VMOVDQU g, (1*16)(R8)
MOVQ 24(DI), R8
VMOVDQU d, (0*16)(R8)
VMOVDQU h, (1*16)(R8)
RET
// func copyResultsBy4(dig *uint32, dst *byte)
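// Reference (Go): assuming dig points at the 32 contiguous state words of
// the four digests, this is equivalent to (a sketch):
//
//	for i := 0; i < 32; i++ {
//		binary.BigEndian.PutUint32(dst[i*4:], dig[i])
//	}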
TEXT ·copyResultsBy4(SB),NOSPLIT,$0
MOVQ dig+0(FP), DI
MOVQ dst+8(FP), SI
CMPB ·useAVX(SB), $1
JE avx
// load state
MOVOU (0*16)(DI), a
MOVOU (1*16)(DI), b
MOVOU (2*16)(DI), c
MOVOU (3*16)(DI), d
MOVOU (4*16)(DI), e
MOVOU (5*16)(DI), f
MOVOU (6*16)(DI), g
MOVOU (7*16)(DI), h
SSE_REV32(a, b, c, d)
SSE_REV32(e, f, g, h)
storeState(SI)
RET
avx:
// load state
VMOVDQU (0*16)(DI), a
VMOVDQU (1*16)(DI), b
VMOVDQU (2*16)(DI), c
VMOVDQU (3*16)(DI), d
VMOVDQU (4*16)(DI), e
VMOVDQU (5*16)(DI), f
VMOVDQU (6*16)(DI), g
VMOVDQU (7*16)(DI), h
AVX_REV32(a, b, c, d)
AVX_REV32(e, f, g, h)
avxStoreState(SI)
RET