//go:build !purego

#include "textflag.h"

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), RODATA, $16

// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), RODATA, $16

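// A PSHUFB with r08_mask rotates each 32-bit lane left by 8 bits; the round
// macros below combine it with PROLD(x, 9) or PROLD(x, 15) to obtain the
// <<<17 and <<<23 rotations needed by P0 and P1.
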
// Transpose matrix with PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions.
// input: from high to low
// r0 = [w3, w2, w1, w0]
// r1 = [w7, w6, w5, w4]
// r2 = [w11, w10, w9, w8]
// r3 = [w15, w14, w13, w12]
// tmp1: 128 bits temp register
// tmp2: 128 bits temp register
//
// output: from high to low
// r0 = [w12, w8, w4, w0]
// r1 = [w13, w9, w5, w1]
// r2 = [w14, w10, w6, w2]
// r3 = [w15, w11, w7, w3]
//
// SSE2 instruction sequence (Intel mnemonics):
// MOVOU r0, tmp2;
// PUNPCKHDQ r1, tmp2;
// PUNPCKLDQ r1, r0;
// MOVOU r2, tmp1;
// PUNPCKLDQ r3, tmp1;
// PUNPCKHDQ r3, r2;
// MOVOU r0, r1;
// PUNPCKHQDQ tmp1, r1;
// PUNPCKLQDQ tmp1, r0;
// MOVOU tmp2, r3;
// PUNPCKHQDQ r2, r3;
// PUNPCKLQDQ r2, tmp2;
// MOVOU tmp2, r2
#define SSE_TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
	MOVOU r0, tmp2; \
	PUNPCKHLQ r1, tmp2; \
	PUNPCKLLQ r1, r0; \
	MOVOU r2, tmp1; \
	PUNPCKLLQ r3, tmp1; \
	PUNPCKHLQ r3, r2; \
	MOVOU r0, r1; \
	PUNPCKHQDQ tmp1, r1; \
	PUNPCKLQDQ tmp1, r0; \
	MOVOU tmp2, r3; \
	PUNPCKHQDQ r2, r3; \
	PUNPCKLQDQ r2, tmp2; \
	MOVOU tmp2, r2

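// Note: PUNPCKHLQ/PUNPCKLLQ are the Go assembler spellings of the Intel
// PUNPCKHDQ/PUNPCKLDQ instructions listed in the comment above.
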
#define a X0
#define b X1
#define c X2
#define d X3
#define e X4
#define f X5
#define g X6
#define h X7

#define tmp1 X8
#define tmp2 X9

#define storeState(R) \
	MOVOU a, (R) \
	MOVOU b, 16(R) \
	MOVOU c, 32(R) \
	MOVOU d, 48(R) \
	MOVOU e, 64(R) \
	MOVOU f, 80(R) \
	MOVOU g, 96(R) \
	MOVOU h, 112(R)

#define storeWord(W, j) MOVOU W, (128+(j)*16)(BX)
#define loadWord(W, i) MOVOU (128+(i)*16)(BX), W

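// Buffer layout at BX: bytes 0..127 hold the eight transposed state vectors
// (a..h, one 128-bit register each); from byte 128 on, storeWord/loadWord keep
// the expanded message words W0..W67, one 16-byte slot per word, with the four
// lanes carrying the four independent blocks (1216 bytes in total).
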
#define SSE_REV32(a, b, c, d) \
	PSHUFB flip_mask<>(SB), a; \
	PSHUFB flip_mask<>(SB), b; \
	PSHUFB flip_mask<>(SB), c; \
	PSHUFB flip_mask<>(SB), d

#define prepare4Words(i) \
	MOVOU (i*16)(R8), X10; \
	MOVOU (i*16)(R9), X11; \
	MOVOU (i*16)(R10), X12; \
	MOVOU (i*16)(R11), X13; \
	; \
	SSE_TRANSPOSE_MATRIX(X10, X11, X12, X13, tmp1, tmp2); \
	SSE_REV32(X10, X11, X12, X13); \
	; \
	storeWord(X10, 4*i+0); \
	storeWord(X11, 4*i+1); \
	storeWord(X12, 4*i+2); \
	storeWord(X13, 4*i+3)

#define LOAD_T(index, T) \
	MOVL (index*4)(AX), T; \
	PSHUFD $0, T, T

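// LOAD_T broadcasts the 32-bit round constant at ·_K[index] into all four
// lanes of T; the table is expected to hold the pre-rotated constants
// T_j <<< (j mod 32), so SS1 below only needs the final <<<7.
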
// r <<< n, SSE version
#define PROLD(r, n) \
	MOVOU r, tmp1; \
	PSLLL $n, r; \
	PSRLL $(32-n), tmp1; \
	POR tmp1, r

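// SS1 = ((a <<< 12) + e + T'_j) <<< 7 and SS2 = SS1 XOR (a <<< 12),
// computed for four message blocks at once (one per 32-bit lane).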
#define SSE_SS1SS2(index, a, e, TMP, SS1, SS2) \
	MOVOU a, SS1; \
	PROLD(SS1, 12); \
	MOVOU SS1, SS2; \ // a <<< 12
	LOAD_T(index, TMP); \
	PADDL TMP, SS1; \
	PADDL e, SS1; \
	PROLD(SS1, 7); \ // SS1
	PXOR SS1, SS2 // SS2

#define SSE_FF0(X, Y, Z, DST) \
	MOVOU X, DST; \
	PXOR Y, DST; \
	PXOR Z, DST

#define SSE_FF1(X, Y, Z, TMP, DST) \
	MOVOU X, DST; \
	POR Y, DST; \
	MOVOU X, TMP; \
	PAND Y, TMP; \
	PAND Z, DST; \
	POR TMP, DST // (X AND Y) OR (X AND Z) OR (Y AND Z)

#define SSE_GG0(X, Y, Z, DST) \
	SSE_FF0(X, Y, Z, DST)

// DST = ((Y XOR Z) AND X) XOR Z
#define SSE_GG1(X, Y, Z, DST) \
	MOVOU Y, DST; \
	PXOR Z, DST; \
	PAND X, DST; \
	PXOR Z, DST

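// SSE_GG1 computes GG(X, Y, Z) = (X AND Y) OR (NOT X AND Z) via the
// equivalent form ((Y XOR Z) AND X) XOR Z, which needs fewer instructions.
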
#define SSE_COPY_RESULT(b, d, f, h, TT1, TT2) \
	PROLD(b, 9); \
	MOVOU TT1, h; \
	PROLD(f, 19); \
	MOVOU TT2, TT1; \
	PROLD(TT1, 9); \
	PXOR TT1, TT2; \ // tt2 XOR ROTL(9, tt2)
	PSHUFB r08_mask<>(SB), TT1; \ // ROTL(17, tt2)
	PXOR TT2, TT1; \ // tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2)
	MOVOU TT1, d

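// SSE_COPY_RESULT finishes a round: b = b <<< 9, f = f <<< 19, h = TT1 and
// d = P0(TT2) with P0(x) = x XOR (x <<< 9) XOR (x <<< 17); the rotated
// argument order of successive ROUND_* calls puts each value in its proper slot.
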
#define ROUND_00_11(index, a, b, c, d, e, f, g, h) \
	SSE_SS1SS2(index, a, e, tmp2, X12, X13); \
	SSE_FF0(a, b, c, X14); \
	PADDL d, X14; \ // (a XOR b XOR c) + d
	loadWord(X10, index); \
	loadWord(X11, index+4); \
	PXOR X10, X11; \ // Wt XOR Wt+4
	PADDL X11, X14; \ // (a XOR b XOR c) + d + (Wt XOR Wt+4)
	PADDL X14, X13; \ // TT1
	PADDL h, X10; \ // Wt + h
	PADDL X12, X10; \ // Wt + h + SS1
	SSE_GG0(e, f, g, X11); \
	PADDL X11, X10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
	; \ // copy result
	SSE_COPY_RESULT(b, d, f, h, X13, X10)

#define MESSAGE_SCHEDULE(index) \
	loadWord(X10, index+1); \ // Wj-3
	PROLD(X10, 15); \
	loadWord(X11, index-12); \ // Wj-16
	PXOR X11, X10; \
	loadWord(X11, index-5); \ // Wj-9
	PXOR X11, X10; \
	MOVOU X10, X11; \
	PROLD(X11, 15); \
	PXOR X11, X10; \
	PSHUFB r08_mask<>(SB), X11; \
	PXOR X11, X10; \ // P1
	loadWord(X11, index-9); \ // Wj-13
	PROLD(X11, 7); \
	PXOR X11, X10; \
	loadWord(X11, index-2); \ // Wj-6
	PXOR X10, X11; \
	storeWord(X11, index+4)

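// With j = index+4, MESSAGE_SCHEDULE computes the SM3 expansion
// W[j] = P1(W[j-16] XOR W[j-9] XOR (W[j-3] <<< 15)) XOR (W[j-13] <<< 7) XOR W[j-6],
// where P1(x) = x XOR (x <<< 15) XOR (x <<< 23).
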
#define ROUND_12_15(index, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index); \
	ROUND_00_11(index, a, b, c, d, e, f, g, h)

#define ROUND_16_63(index, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index); \ // X11 holds Wt+4 now, do not clobber it
	SSE_SS1SS2(index, a, e, tmp2, X12, X13); \
	; \
	SSE_FF1(a, b, c, X10, X14); \
	PADDL d, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d
	loadWord(X10, index); \
	PXOR X10, X11; \ // Wt XOR Wt+4
	PADDL X11, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d + (Wt XOR Wt+4)
	PADDL X14, X13; \ // TT1
	; \
	PADDL h, X10; \ // Wt + h
	PADDL X12, X10; \ // Wt + h + SS1
	SSE_GG1(e, f, g, X11); \
	PADDL X11, X10; \ // TT2 = GG(e, f, g) + Wt + h + SS1
	; \ // copy result
	SSE_COPY_RESULT(b, d, f, h, X13, X10)

// transpose matrix function, AVX version
// parameters:
// - r0: 128 bits register as input/output data
// - r1: 128 bits register as input/output data
// - r2: 128 bits register as input/output data
// - r3: 128 bits register as input/output data
// - tmp1: 128 bits temp register
// - tmp2: 128 bits temp register
#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
	VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w07, w03, w06, w02]
	VPUNPCKLDQ r1, r0, r0; \ // r0 = [w05, w01, w04, w00]
	VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w13, w09, w12, w08]
	VPUNPCKHDQ r3, r2, r2; \ // r2 = [w15, w11, w14, w10]
	VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w13, w09, w05, w01]
	VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w12, w08, w04, w00]
	VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w15, w11, w07, w03]
	VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w14, w10, w06, w02]

#define avxStoreWord(W, j) VMOVDQU W, (128+(j)*16)(BX)
#define avxLoadWord(W, i) VMOVDQU (128+(i)*16)(BX), W

#define avxStoreState(R) \
	VMOVDQU a, (0*16)(R) \
	VMOVDQU b, (1*16)(R) \
	VMOVDQU c, (2*16)(R) \
	VMOVDQU d, (3*16)(R) \
	VMOVDQU e, (4*16)(R) \
	VMOVDQU f, (5*16)(R) \
	VMOVDQU g, (6*16)(R) \
	VMOVDQU h, (7*16)(R)

#define AVX_REV32(a, b, c, d) \
	VPSHUFB flip_mask<>(SB), a, a; \
	VPSHUFB flip_mask<>(SB), b, b; \
	VPSHUFB flip_mask<>(SB), c, c; \
	VPSHUFB flip_mask<>(SB), d, d

#define avxPrepare4Words(i) \
	VMOVDQU (i*16)(R8), X10; \
	VMOVDQU (i*16)(R9), X11; \
	VMOVDQU (i*16)(R10), X12; \
	VMOVDQU (i*16)(R11), X13; \
	; \
	TRANSPOSE_MATRIX(X10, X11, X12, X13, tmp1, tmp2); \
	AVX_REV32(X10, X11, X12, X13); \
	; \
	avxStoreWord(X10, 4*i+0); \
	avxStoreWord(X11, 4*i+1); \
	avxStoreWord(X12, 4*i+2); \
	avxStoreWord(X13, 4*i+3)

#define AVX_LOAD_T(index, T) \
	MOVL (index*4)(AX), T; \
	VPSHUFD $0, T, T

// r <<< n
#define VPROLD(r, n) \
	VPSLLD $(n), r, tmp1; \
	VPSRLD $(32-n), r, r; \
	VPOR tmp1, r, r

// d = r <<< n
#define VPROLD2(r, d, n) \
	VPSLLD $(n), r, tmp1; \
	VPSRLD $(32-n), r, d; \
	VPOR tmp1, d, d

#define AVX_SS1SS2(index, a, e, SS1, SS2) \
	VPROLD2(a, SS2, 12); \ // a <<< 12
	AVX_LOAD_T(index, SS1); \
	VPADDD SS1, SS2, SS1; \
	VPADDD e, SS1, SS1; \
	VPROLD(SS1, 7); \ // SS1
	VPXOR SS1, SS2, SS2

// DST = X XOR Y XOR Z
#define AVX_FF0(X, Y, Z, DST) \
	VPXOR X, Y, DST; \
	VPXOR Z, DST, DST

// DST = (X AND Y) OR (X AND Z) OR (Y AND Z)
#define AVX_FF1(X, Y, Z, TMP, DST) \
	VPOR X, Y, DST; \
	VPAND X, Y, TMP; \
	VPAND Z, DST, DST; \
	VPOR TMP, DST, DST

// DST = X XOR Y XOR Z
#define AVX_GG0(X, Y, Z, DST) \
	AVX_FF0(X, Y, Z, DST)

// DST = ((Y XOR Z) AND X) XOR Z
#define AVX_GG1(X, Y, Z, DST) \
	VPXOR Y, Z, DST; \
	VPAND X, DST, DST; \
	VPXOR Z, DST, DST

#define AVX_COPY_RESULT(b, d, f, h, TT1, TT2) \
	VPROLD(b, 9); \
	VMOVDQU TT1, h; \
	VPROLD(f, 19); \
	VPROLD2(TT2, TT1, 9); \ // tt2 <<< 9
	VPXOR TT2, TT1, TT2; \ // tt2 XOR ROTL(9, tt2)
	VPSHUFB r08_mask<>(SB), TT1, TT1; \ // ROTL(17, tt2)
	VPXOR TT2, TT1, d

#define AVX_ROUND_00_11(index, a, b, c, d, e, f, g, h) \
	AVX_SS1SS2(index, a, e, X12, X13); \
	; \
	AVX_FF0(a, b, c, X14); \
	VPADDD d, X14, X14; \ // (a XOR b XOR c) + d
	avxLoadWord(X10, index); \
	avxLoadWord(X11, index+4); \
	VPXOR X10, X11, X11; \ // Wt XOR Wt+4
	VPADDD X11, X14, X14; \ // (a XOR b XOR c) + d + (Wt XOR Wt+4)
	VPADDD X14, X13, X13; \ // TT1
	VPADDD h, X10, X10; \ // Wt + h
	VPADDD X12, X10, X10; \ // Wt + h + SS1
	AVX_GG0(e, f, g, X11); \
	VPADDD X11, X10, X10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
	; \ // copy result
	AVX_COPY_RESULT(b, d, f, h, X13, X10)

#define AVX_MESSAGE_SCHEDULE(index) \
	avxLoadWord(X10, index+1); \ // Wj-3
	VPROLD(X10, 15); \
	VPXOR (128+(index-12)*16)(BX), X10, X10; \ // Wj-16
	VPXOR (128+(index-5)*16)(BX), X10, X10; \ // Wj-9
	; \ // P1
	VPROLD2(X10, X11, 15); \
	VPXOR X11, X10, X10; \
	VPSHUFB r08_mask<>(SB), X11, X11; \
	VPXOR X11, X10, X10; \ // P1
	avxLoadWord(X11, index-9); \ // Wj-13
	VPROLD(X11, 7); \
	VPXOR X11, X10, X10; \
	VPXOR (128+(index-2)*16)(BX), X10, X11; \
	avxStoreWord(X11, index+4)

#define AVX_ROUND_12_15(index, a, b, c, d, e, f, g, h) \
	AVX_MESSAGE_SCHEDULE(index); \
	AVX_ROUND_00_11(index, a, b, c, d, e, f, g, h)

#define AVX_ROUND_16_63(index, a, b, c, d, e, f, g, h) \
	AVX_MESSAGE_SCHEDULE(index); \ // X11 holds Wt+4 now, do not clobber it
	AVX_SS1SS2(index, a, e, X12, X13); \
	; \
	AVX_FF1(a, b, c, X10, X14); \
	VPADDD d, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d
	avxLoadWord(X10, index); \
	VPXOR X10, X11, X11; \ // Wt XOR Wt+4
	VPADDD X11, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d + (Wt XOR Wt+4)
	VPADDD X14, X13, X13; \ // TT1
	; \
	VPADDD h, X10, X10; \ // Wt + h
	VPADDD X12, X10, X10; \ // Wt + h + SS1
	AVX_GG1(e, f, g, X11); \
	VPADDD X11, X10, X10; \ // TT2 = GG(e, f, g) + Wt + h + SS1
	; \ // copy result
	AVX_COPY_RESULT(b, d, f, h, X13, X10)

// blockMultBy4(dig **[8]uint32, p *[]byte, buffer *byte, blocks int)
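// Register usage: DI = dig (four pointers to [8]uint32 states), SI = p (four
// message pointers), BX = temporary buffer, DX = number of 64-byte blocks,
// AX = &·_K, R8..R11 = the four message streams. Each 32-bit lane of the XMM
// registers processes one of the four independent blocks.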
TEXT ·blockMultBy4(SB),NOSPLIT,$0
	MOVQ dig+0(FP), DI
	MOVQ p+8(FP), SI
	MOVQ buffer+16(FP), BX
	MOVQ blocks+24(FP), DX

	CMPB ·useAVX(SB), $1
	JE avx

	// load state
	MOVQ (DI), R8
	MOVOU (0*16)(R8), a
	MOVOU (1*16)(R8), e
	MOVQ 8(DI), R8
	MOVOU (0*16)(R8), b
	MOVOU (1*16)(R8), f
	MOVQ 16(DI), R8
	MOVOU (0*16)(R8), c
	MOVOU (1*16)(R8), g
	MOVQ 24(DI), R8
	MOVOU (0*16)(R8), d
	MOVOU (1*16)(R8), h

	// transpose state
	SSE_TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
	SSE_TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)

	// store state to temporary buffer
	storeState(BX)

	MOVQ $·_K+0(SB), AX
	MOVQ (SI), R8
	MOVQ 8(SI), R9
	MOVQ 16(SI), R10
	MOVQ 24(SI), R11

loop:
	// load message block
	prepare4Words(0)
	prepare4Words(1)
	prepare4Words(2)
	prepare4Words(3)

	ROUND_00_11(0, a, b, c, d, e, f, g, h)
	ROUND_00_11(1, h, a, b, c, d, e, f, g)
	ROUND_00_11(2, g, h, a, b, c, d, e, f)
	ROUND_00_11(3, f, g, h, a, b, c, d, e)
	ROUND_00_11(4, e, f, g, h, a, b, c, d)
	ROUND_00_11(5, d, e, f, g, h, a, b, c)
	ROUND_00_11(6, c, d, e, f, g, h, a, b)
	ROUND_00_11(7, b, c, d, e, f, g, h, a)
	ROUND_00_11(8, a, b, c, d, e, f, g, h)
	ROUND_00_11(9, h, a, b, c, d, e, f, g)
	ROUND_00_11(10, g, h, a, b, c, d, e, f)
	ROUND_00_11(11, f, g, h, a, b, c, d, e)

	ROUND_12_15(12, e, f, g, h, a, b, c, d)
	ROUND_12_15(13, d, e, f, g, h, a, b, c)
	ROUND_12_15(14, c, d, e, f, g, h, a, b)
	ROUND_12_15(15, b, c, d, e, f, g, h, a)

	ROUND_16_63(16, a, b, c, d, e, f, g, h)
	ROUND_16_63(17, h, a, b, c, d, e, f, g)
	ROUND_16_63(18, g, h, a, b, c, d, e, f)
	ROUND_16_63(19, f, g, h, a, b, c, d, e)
	ROUND_16_63(20, e, f, g, h, a, b, c, d)
	ROUND_16_63(21, d, e, f, g, h, a, b, c)
	ROUND_16_63(22, c, d, e, f, g, h, a, b)
	ROUND_16_63(23, b, c, d, e, f, g, h, a)
	ROUND_16_63(24, a, b, c, d, e, f, g, h)
	ROUND_16_63(25, h, a, b, c, d, e, f, g)
	ROUND_16_63(26, g, h, a, b, c, d, e, f)
	ROUND_16_63(27, f, g, h, a, b, c, d, e)
	ROUND_16_63(28, e, f, g, h, a, b, c, d)
	ROUND_16_63(29, d, e, f, g, h, a, b, c)
	ROUND_16_63(30, c, d, e, f, g, h, a, b)
	ROUND_16_63(31, b, c, d, e, f, g, h, a)
	ROUND_16_63(32, a, b, c, d, e, f, g, h)
	ROUND_16_63(33, h, a, b, c, d, e, f, g)
	ROUND_16_63(34, g, h, a, b, c, d, e, f)
	ROUND_16_63(35, f, g, h, a, b, c, d, e)
	ROUND_16_63(36, e, f, g, h, a, b, c, d)
	ROUND_16_63(37, d, e, f, g, h, a, b, c)
	ROUND_16_63(38, c, d, e, f, g, h, a, b)
	ROUND_16_63(39, b, c, d, e, f, g, h, a)
	ROUND_16_63(40, a, b, c, d, e, f, g, h)
	ROUND_16_63(41, h, a, b, c, d, e, f, g)
	ROUND_16_63(42, g, h, a, b, c, d, e, f)
	ROUND_16_63(43, f, g, h, a, b, c, d, e)
	ROUND_16_63(44, e, f, g, h, a, b, c, d)
	ROUND_16_63(45, d, e, f, g, h, a, b, c)
	ROUND_16_63(46, c, d, e, f, g, h, a, b)
	ROUND_16_63(47, b, c, d, e, f, g, h, a)
	ROUND_16_63(48, a, b, c, d, e, f, g, h)
	ROUND_16_63(49, h, a, b, c, d, e, f, g)
	ROUND_16_63(50, g, h, a, b, c, d, e, f)
	ROUND_16_63(51, f, g, h, a, b, c, d, e)
	ROUND_16_63(52, e, f, g, h, a, b, c, d)
	ROUND_16_63(53, d, e, f, g, h, a, b, c)
	ROUND_16_63(54, c, d, e, f, g, h, a, b)
	ROUND_16_63(55, b, c, d, e, f, g, h, a)
	ROUND_16_63(56, a, b, c, d, e, f, g, h)
	ROUND_16_63(57, h, a, b, c, d, e, f, g)
	ROUND_16_63(58, g, h, a, b, c, d, e, f)
	ROUND_16_63(59, f, g, h, a, b, c, d, e)
	ROUND_16_63(60, e, f, g, h, a, b, c, d)
	ROUND_16_63(61, d, e, f, g, h, a, b, c)
	ROUND_16_63(62, c, d, e, f, g, h, a, b)
	ROUND_16_63(63, b, c, d, e, f, g, h, a)

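	// feed-forward: XOR the saved input state (at BX) into the compressed state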
	MOVOU (0*16)(BX), tmp1
	PXOR tmp1, a
	MOVOU (1*16)(BX), tmp1
	PXOR tmp1, b
	MOVOU (2*16)(BX), tmp1
	PXOR tmp1, c
	MOVOU (3*16)(BX), tmp1
	PXOR tmp1, d
	MOVOU (4*16)(BX), tmp1
	PXOR tmp1, e
	MOVOU (5*16)(BX), tmp1
	PXOR tmp1, f
	MOVOU (6*16)(BX), tmp1
	PXOR tmp1, g
	MOVOU (7*16)(BX), tmp1
	PXOR tmp1, h

	DECQ DX
	JZ end

	storeState(BX)
	LEAQ 64(R8), R8
	LEAQ 64(R9), R9
	LEAQ 64(R10), R10
	LEAQ 64(R11), R11
	JMP loop

end:
	// transpose state
	SSE_TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
	SSE_TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)

	MOVQ (DI), R8
	MOVOU a, (0*16)(R8)
	MOVOU e, (1*16)(R8)
	MOVQ 8(DI), R8
	MOVOU b, (0*16)(R8)
	MOVOU f, (1*16)(R8)
	MOVQ 16(DI), R8
	MOVOU c, (0*16)(R8)
	MOVOU g, (1*16)(R8)
	MOVQ 24(DI), R8
	MOVOU d, (0*16)(R8)
	MOVOU h, (1*16)(R8)

	RET

avx:
	// load state
	MOVQ (DI), R8
	VMOVDQU (0*16)(R8), a
	VMOVDQU (1*16)(R8), e
	MOVQ 8(DI), R8
	VMOVDQU (0*16)(R8), b
	VMOVDQU (1*16)(R8), f
	MOVQ 16(DI), R8
	VMOVDQU (0*16)(R8), c
	VMOVDQU (1*16)(R8), g
	MOVQ 24(DI), R8
	VMOVDQU (0*16)(R8), d
	VMOVDQU (1*16)(R8), h

	// transpose state
	TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
	TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)

	avxStoreState(BX)

	MOVQ $·_K+0(SB), AX
	MOVQ (SI), R8
	MOVQ 8(SI), R9
	MOVQ 16(SI), R10
	MOVQ 24(SI), R11

avxLoop:
	// load message block
	avxPrepare4Words(0)
	avxPrepare4Words(1)
	avxPrepare4Words(2)
	avxPrepare4Words(3)

	AVX_ROUND_00_11(0, a, b, c, d, e, f, g, h)
	AVX_ROUND_00_11(1, h, a, b, c, d, e, f, g)
	AVX_ROUND_00_11(2, g, h, a, b, c, d, e, f)
	AVX_ROUND_00_11(3, f, g, h, a, b, c, d, e)
	AVX_ROUND_00_11(4, e, f, g, h, a, b, c, d)
	AVX_ROUND_00_11(5, d, e, f, g, h, a, b, c)
	AVX_ROUND_00_11(6, c, d, e, f, g, h, a, b)
	AVX_ROUND_00_11(7, b, c, d, e, f, g, h, a)
	AVX_ROUND_00_11(8, a, b, c, d, e, f, g, h)
	AVX_ROUND_00_11(9, h, a, b, c, d, e, f, g)
	AVX_ROUND_00_11(10, g, h, a, b, c, d, e, f)
	AVX_ROUND_00_11(11, f, g, h, a, b, c, d, e)

	AVX_ROUND_12_15(12, e, f, g, h, a, b, c, d)
	AVX_ROUND_12_15(13, d, e, f, g, h, a, b, c)
	AVX_ROUND_12_15(14, c, d, e, f, g, h, a, b)
	AVX_ROUND_12_15(15, b, c, d, e, f, g, h, a)

	AVX_ROUND_16_63(16, a, b, c, d, e, f, g, h)
	AVX_ROUND_16_63(17, h, a, b, c, d, e, f, g)
	AVX_ROUND_16_63(18, g, h, a, b, c, d, e, f)
	AVX_ROUND_16_63(19, f, g, h, a, b, c, d, e)
	AVX_ROUND_16_63(20, e, f, g, h, a, b, c, d)
	AVX_ROUND_16_63(21, d, e, f, g, h, a, b, c)
	AVX_ROUND_16_63(22, c, d, e, f, g, h, a, b)
	AVX_ROUND_16_63(23, b, c, d, e, f, g, h, a)
	AVX_ROUND_16_63(24, a, b, c, d, e, f, g, h)
	AVX_ROUND_16_63(25, h, a, b, c, d, e, f, g)
	AVX_ROUND_16_63(26, g, h, a, b, c, d, e, f)
	AVX_ROUND_16_63(27, f, g, h, a, b, c, d, e)
	AVX_ROUND_16_63(28, e, f, g, h, a, b, c, d)
	AVX_ROUND_16_63(29, d, e, f, g, h, a, b, c)
	AVX_ROUND_16_63(30, c, d, e, f, g, h, a, b)
	AVX_ROUND_16_63(31, b, c, d, e, f, g, h, a)
	AVX_ROUND_16_63(32, a, b, c, d, e, f, g, h)
	AVX_ROUND_16_63(33, h, a, b, c, d, e, f, g)
	AVX_ROUND_16_63(34, g, h, a, b, c, d, e, f)
	AVX_ROUND_16_63(35, f, g, h, a, b, c, d, e)
	AVX_ROUND_16_63(36, e, f, g, h, a, b, c, d)
	AVX_ROUND_16_63(37, d, e, f, g, h, a, b, c)
	AVX_ROUND_16_63(38, c, d, e, f, g, h, a, b)
	AVX_ROUND_16_63(39, b, c, d, e, f, g, h, a)
	AVX_ROUND_16_63(40, a, b, c, d, e, f, g, h)
	AVX_ROUND_16_63(41, h, a, b, c, d, e, f, g)
	AVX_ROUND_16_63(42, g, h, a, b, c, d, e, f)
	AVX_ROUND_16_63(43, f, g, h, a, b, c, d, e)
	AVX_ROUND_16_63(44, e, f, g, h, a, b, c, d)
	AVX_ROUND_16_63(45, d, e, f, g, h, a, b, c)
	AVX_ROUND_16_63(46, c, d, e, f, g, h, a, b)
	AVX_ROUND_16_63(47, b, c, d, e, f, g, h, a)
	AVX_ROUND_16_63(48, a, b, c, d, e, f, g, h)
	AVX_ROUND_16_63(49, h, a, b, c, d, e, f, g)
	AVX_ROUND_16_63(50, g, h, a, b, c, d, e, f)
	AVX_ROUND_16_63(51, f, g, h, a, b, c, d, e)
	AVX_ROUND_16_63(52, e, f, g, h, a, b, c, d)
	AVX_ROUND_16_63(53, d, e, f, g, h, a, b, c)
	AVX_ROUND_16_63(54, c, d, e, f, g, h, a, b)
	AVX_ROUND_16_63(55, b, c, d, e, f, g, h, a)
	AVX_ROUND_16_63(56, a, b, c, d, e, f, g, h)
	AVX_ROUND_16_63(57, h, a, b, c, d, e, f, g)
	AVX_ROUND_16_63(58, g, h, a, b, c, d, e, f)
	AVX_ROUND_16_63(59, f, g, h, a, b, c, d, e)
	AVX_ROUND_16_63(60, e, f, g, h, a, b, c, d)
	AVX_ROUND_16_63(61, d, e, f, g, h, a, b, c)
	AVX_ROUND_16_63(62, c, d, e, f, g, h, a, b)
	AVX_ROUND_16_63(63, b, c, d, e, f, g, h, a)

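	// feed-forward: XOR the saved input state (at BX) into the compressed state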
	VPXOR (0*16)(BX), a, a
	VPXOR (1*16)(BX), b, b
	VPXOR (2*16)(BX), c, c
	VPXOR (3*16)(BX), d, d
	VPXOR (4*16)(BX), e, e
	VPXOR (5*16)(BX), f, f
	VPXOR (6*16)(BX), g, g
	VPXOR (7*16)(BX), h, h

	DECQ DX
	JZ avxEnd

	// store current state
	avxStoreState(BX)

	LEAQ 64(R8), R8
	LEAQ 64(R9), R9
	LEAQ 64(R10), R10
	LEAQ 64(R11), R11
	JMP avxLoop

avxEnd:
	// transpose state
	TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
	TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)

	MOVQ (DI), R8
	VMOVDQU a, (0*16)(R8)
	VMOVDQU e, (1*16)(R8)
	MOVQ 8(DI), R8
	VMOVDQU b, (0*16)(R8)
	VMOVDQU f, (1*16)(R8)
	MOVQ 16(DI), R8
	VMOVDQU c, (0*16)(R8)
	VMOVDQU g, (1*16)(R8)
	MOVQ 24(DI), R8
	VMOVDQU d, (0*16)(R8)
	VMOVDQU h, (1*16)(R8)

	RET

// func copyResultsBy4(dig *uint32, dst *byte)
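// Byte-swaps each 32-bit state word to big-endian and writes the 128 bytes to dst.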
TEXT ·copyResultsBy4(SB),NOSPLIT,$0
	MOVQ dig+0(FP), DI
	MOVQ dst+8(FP), SI

	CMPB ·useAVX(SB), $1
	JE avx

	// load state
	MOVOU (0*16)(DI), a
	MOVOU (1*16)(DI), b
	MOVOU (2*16)(DI), c
	MOVOU (3*16)(DI), d
	MOVOU (4*16)(DI), e
	MOVOU (5*16)(DI), f
	MOVOU (6*16)(DI), g
	MOVOU (7*16)(DI), h

	SSE_REV32(a, b, c, d)
	SSE_REV32(e, f, g, h)
	storeState(SI)

	RET

avx:
	// load state
	VMOVDQU (0*16)(DI), a
	VMOVDQU (1*16)(DI), b
	VMOVDQU (2*16)(DI), c
	VMOVDQU (3*16)(DI), d
	VMOVDQU (4*16)(DI), e
	VMOVDQU (5*16)(DI), f
	VMOVDQU (6*16)(DI), g
	VMOVDQU (7*16)(DI), h

	AVX_REV32(a, b, c, d)
	AVX_REV32(e, f, g, h)

	avxStoreState(SI)

	RET