mirror of https://github.com/emmansun/gmsm.git (synced 2025-04-22 10:16:18 +08:00)

zuc: refactoring

parent 1f209d2317
commit 58ad15fde8
@@ -18,7 +18,7 @@ func block256(m *ZUC256Mac, p []byte) {
			case 16:
				eia256RoundTag16(&m.t[0], &m.k0[0], &p[0])
			default:
				eia3Round16B(&m.t[0], &m.k0[0], &p[0], m.tagSize)
				eiaRoundTag4(&m.t[0], &m.k0[0], &p[0])
			}
			p = p[chunk:]
		}
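
For orientation, here is a minimal plain-Go sketch of the dispatch this hunk touches. The surrounding declarations (ZUC256Mac, chunk, genKeywords, supportsGFMUL) are assumed from the package, and the case 8 arm is inferred from the eia256RoundTag8 implementations further down; after the refactoring the default (4-byte tag) case reuses the renamed ZUC-128 round eiaRoundTag4:

func block256(m *ZUC256Mac, p []byte) {
	if supportsGFMUL {
		for len(p) >= chunk {
			m.genKeywords(m.k0[4:]) // refill the upper half of the keystream buffer
			switch m.tagSize {
			case 8:
				eia256RoundTag8(&m.t[0], &m.k0[0], &p[0])
			case 16:
				eia256RoundTag16(&m.t[0], &m.k0[0], &p[0])
			default:
				eiaRoundTag4(&m.t[0], &m.k0[0], &p[0]) // 4-byte tag: same round as ZUC-128
			}
			p = p[chunk:]
		}
		return
	}
	// generic (purego) fallback elided
}
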
@@ -1,448 +0,0 @@
// Referenced Intel(R) Multi-Buffer Crypto for IPsec
// https://github.com/intel/intel-ipsec-mb/
//go:build !purego

#include "textflag.h"

DATA bit_reverse_table_l<>+0x00(SB)/8, $0x0e060a020c040800
DATA bit_reverse_table_l<>+0x08(SB)/8, $0x0f070b030d050901
GLOBL bit_reverse_table_l<>(SB), RODATA, $16

DATA bit_reverse_table_h<>+0x00(SB)/8, $0xe060a020c0408000
DATA bit_reverse_table_h<>+0x08(SB)/8, $0xf070b030d0509010
GLOBL bit_reverse_table_h<>(SB), RODATA, $16

DATA bit_reverse_and_table<>+0x00(SB)/8, $0x0f0f0f0f0f0f0f0f
DATA bit_reverse_and_table<>+0x08(SB)/8, $0x0f0f0f0f0f0f0f0f
GLOBL bit_reverse_and_table<>(SB), RODATA, $16

DATA shuf_mask_dw0_0_dw1_0<>+0x00(SB)/8, $0xffffffff03020100
DATA shuf_mask_dw0_0_dw1_0<>+0x08(SB)/8, $0xffffffff07060504
GLOBL shuf_mask_dw0_0_dw1_0<>(SB), RODATA, $16

DATA shuf_mask_0_0_dw1_0<>+0x00(SB)/8, $0xffffffffffffffff
DATA shuf_mask_0_0_dw1_0<>+0x08(SB)/8, $0xffffffff07060504
GLOBL shuf_mask_0_0_dw1_0<>(SB), RODATA, $16

DATA shuf_mask_0_0_0_dw1<>+0x00(SB)/8, $0xffffffffffffffff
DATA shuf_mask_0_0_0_dw1<>+0x08(SB)/8, $0x07060504ffffffff
GLOBL shuf_mask_0_0_0_dw1<>(SB), RODATA, $16

DATA shuf_mask_dw2_0_dw3_0<>+0x00(SB)/8, $0xffffffff0b0a0908
DATA shuf_mask_dw2_0_dw3_0<>+0x08(SB)/8, $0xffffffff0f0e0d0c
GLOBL shuf_mask_dw2_0_dw3_0<>(SB), RODATA, $16

DATA bits_32_63<>+0x00(SB)/8, $0xffffffff00000000
DATA bits_32_63<>+0x08(SB)/8, $0x0000000000000000
GLOBL bits_32_63<>(SB), RODATA, $16
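
These nibble tables implement byte-wise bit reversal via PSHUFB: the masked low nibble indexes bit_reverse_table_h (the reversed nibble pre-shifted into the high half) and the shifted high nibble indexes bit_reverse_table_l, and the two lookups are combined with XOR. A standalone Go sketch of the same table trick, under no assumptions beyond the table contents above:

package main

import "fmt"

// revNibble[n] is the 4-bit reversal of n; bit_reverse_table_l stores exactly
// these values, and bit_reverse_table_h stores them shifted left by 4.
var revNibble = [16]byte{
	0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
	0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf,
}

// bitReverseByte mirrors the PAND/PANDN/PSRLQ/PSHUFB/PXOR sequence for one byte.
func bitReverseByte(b byte) byte {
	lo := b & 0x0f // bit_reverse_and_table masks the low nibble
	hi := b >> 4   // PANDN + PSRLQ extracts the high nibble
	return revNibble[lo]<<4 | revNibble[hi]
}

func main() {
	fmt.Printf("%08b -> %08b\n", 0b11010010, bitReverseByte(0b11010010))
}
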
#define XTMP1 X1
#define XTMP2 X2
#define XTMP3 X3
#define XTMP4 X4
#define XTMP5 X5
#define XTMP6 X6
#define XDATA X7
#define XDIGEST X8
#define KS_L X9
#define KS_M1 X10
#define KS_M2 X11
#define KS_H X12

// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
	MOVQ t+0(FP), AX
	MOVQ ks+8(FP), BX
	MOVQ p+16(FP), CX

	CMPB ·useAVX(SB), $1
	JE avx

	// Reverse data bytes
	MOVUPS (0)(CX), XDATA
	MOVOU bit_reverse_and_table<>(SB), XTMP4
	MOVOU XDATA, XTMP2
	PAND XTMP4, XTMP2

	PANDN XDATA, XTMP4
	PSRLQ $4, XTMP4

	MOVOU bit_reverse_table_h<>(SB), XTMP3
	PSHUFB XTMP2, XTMP3

	MOVOU bit_reverse_table_l<>(SB), XTMP1
	PSHUFB XTMP4, XTMP1

	PXOR XTMP1, XTMP3 // XTMP3 - bit reverse data bytes

	// ZUC authentication part, 4x32 data bits
	// setup KS
	MOVUPS (0*4)(BX), XTMP1
	MOVUPS (2*4)(BX), XTMP2
	MOVUPS (4*4)(BX), XTMP4
	PSHUFD $0x61, XTMP1, KS_L // KS bits [63:32 31:0 95:64 63:32]
	PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]
	PSHUFD $0x61, XTMP4, KS_M2 // KS bits [191:160 159:128 223:192 191:160]

	// setup DATA
	MOVOU XTMP3, XTMP1
	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
	MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]

	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP3
	MOVOU XTMP3, XDIGEST // XDIGEST/XTMP3 - Data bits [95:64 0s 127:96 0s]

	// clmul
	// xor the results from 4 32-bit words together
	// Save data for following products
	MOVOU XTMP2, XTMP5 // Data bits [31:0 0s 63:32 0s]
	MOVOU XTMP3, XTMP6 // Data bits [95:64 0s 127:96 0s]

	// Calculate lower 32 bits of tag
	PCLMULQDQ $0x00, KS_L, XTMP1
	PCLMULQDQ $0x11, KS_L, XTMP2
	PCLMULQDQ $0x00, KS_M1, XDIGEST
	PCLMULQDQ $0x11, KS_M1, XTMP3

	// XOR all products and move bits 63-32 to lower 32 bits
	PXOR XTMP1, XTMP2
	PXOR XTMP3, XDIGEST
	PXOR XTMP2, XDIGEST
	MOVQ XDIGEST, XDIGEST // Clear top 64 bits
	PSRLDQ $4, XDIGEST

	// Prepare data and calculate bits 63-32 of tag
	MOVOU XTMP5, XTMP1
	MOVOU XTMP5, XTMP2
	MOVOU XTMP6, XTMP3
	MOVOU XTMP6, XTMP4

	PCLMULQDQ $0x10, KS_L, XTMP1
	PCLMULQDQ $0x01, KS_M1, XTMP2
	PCLMULQDQ $0x10, KS_M1, XTMP3
	PCLMULQDQ $0x01, KS_M2, XTMP4

	// XOR all the products and keep only bits 63-32
	PXOR XTMP2, XTMP1
	PXOR XTMP4, XTMP3
	PXOR XTMP3, XTMP1
	PAND bits_32_63<>(SB), XTMP1

	// OR with lower 32 bits, to construct 64 bits of tag
	POR XTMP1, XDIGEST

	// Update tag
	MOVQ XDIGEST, R10
	XORQ R10, (AX)

	// Copy last 16 bytes of KS to the front
	MOVUPS (4*4)(BX), XTMP1
	MOVUPS XTMP1, (0*4)(BX)

	RET
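
Why bit reversal plus PCLMULQDQ yields tag words: with the data word bit-reversed, each set data bit i shifts the 64-bit keystream window left by i in the carry-less product, so bits 63:32 of the product accumulate exactly the XOR of the 32-bit windows z[i..i+31]. A scalar Go sketch of this identity (helper names are ours, not the package's):

package main

import (
	"fmt"
	"math/bits"
)

// clmul64 is a scalar carry-less multiply: the 128-bit GF(2) product of a and b.
func clmul64(a, b uint64) (hi, lo uint64) {
	for i := uint(0); i < 64; i++ {
		if b&(1<<i) != 0 {
			lo ^= a << i
			if i > 0 {
				hi ^= a >> (64 - i)
			}
		}
	}
	return
}

// tagWord32 XORs, over every set bit i of the 32-bit data word d (MSB first),
// the 32-bit keystream window starting at bit i of the 64-bit window k.
func tagWord32(d uint32, k uint64) uint32 {
	_, lo := clmul64(uint64(bits.Reverse32(d)), k)
	return uint32(lo >> 32) // bits 63:32 of the product, as in the asm
}

// naive reference for comparison
func tagWord32Ref(d uint32, k uint64) uint32 {
	var t uint32
	for i := uint(0); i < 32; i++ {
		if d&(1<<(31-i)) != 0 {
			t ^= uint32(k >> (32 - i))
		}
	}
	return t
}

func main() {
	d, k := uint32(0xdeadbeef), uint64(0x0123456789abcdef)
	fmt.Println(tagWord32(d, k) == tagWord32Ref(d, k)) // prints true
}
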
avx:
	VMOVDQU (0)(CX), XDATA

	// Reverse data bytes
	VMOVDQU bit_reverse_and_table<>(SB), XTMP1
	VPAND XTMP1, XDATA, XTMP2
	VPANDN XDATA, XTMP1, XTMP3
	VPSRLD $4, XTMP3, XTMP3

	VMOVDQU bit_reverse_table_h<>(SB), XTMP1
	VPSHUFB XTMP2, XTMP1, XTMP4
	VMOVDQU bit_reverse_table_l<>(SB), XTMP1
	VPSHUFB XTMP3, XTMP1, XTMP1
	VPOR XTMP1, XTMP4, XTMP4

	// ZUC authentication part, 4x32 data bits
	// setup KS
	VPSHUFD $0x61, (0*4)(BX), KS_L // KS bits [63:32 31:0 95:64 63:32]
	VPSHUFD $0x61, (2*4)(BX), KS_M1 // KS bits [127:96 95:64 159:128 127:96]
	VPSHUFD $0x61, (4*4)(BX), KS_M2 // KS bits [191:160 159:128 223:192 191:160]

	// setup DATA
	// Data bytes [31:0 0s 63:32 0s]
	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP4, XTMP1
	// Data bytes [95:64 0s 127:96 0s]
	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP4, XTMP2

	// clmul
	// xor the results from 4 32-bit words together
	// Calculate lower 32 bits of tag
	VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
	VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6

	VPXOR XTMP3, XTMP4, XTMP3
	VPXOR XTMP5, XTMP6, XTMP5
	VPXOR XTMP3, XTMP5, XTMP3

	// Move previous result to low 32 bits and XOR with previous digest
	VMOVQ XTMP3, XTMP3 // Clear top 64 bits
	VPSRLDQ $4, XTMP3, XDIGEST

	VPCLMULQDQ $0x10, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x01, KS_M1, XTMP1, XTMP4
	VPCLMULQDQ $0x10, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x01, KS_M2, XTMP2, XTMP6

	// XOR all the products and keep only bits 63-32
	VPXOR XTMP4, XTMP3, XTMP3
	VPXOR XTMP6, XTMP5, XTMP5
	VPXOR XTMP5, XTMP3, XTMP3
	VPAND bits_32_63<>(SB), XTMP3, XTMP3

	// XOR with bits 32-63 of previous digest
	VPXOR XTMP3, XDIGEST, XDIGEST

	// Update tag
	VMOVQ XDIGEST, R10
	XORQ R10, (AX)

	// Copy last 16 bytes of KS to the front
	VMOVDQU (4*4)(BX), XTMP1
	VMOVDQU XTMP1, (0*4)(BX)

	VZEROUPPER
	RET
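
A plain-Go reference of what eia256RoundTag8 computes, useful when reading either path above. Names are ours; holding the tag in a single uint64 is a simplification of the two 32-bit words the assembly XORs at (AX), and ks must hold the 8 keystream words the assembly loads:

// ksWindow64 returns the 64-bit keystream window starting at bit off,
// with keystream words consumed MSB-first.
func ksWindow64(ks []uint32, off int) uint64 {
	w, r := off/32, uint(off%32)
	v := uint64(ks[w])<<32 | uint64(ks[w+1])
	if r == 0 {
		return v
	}
	return v<<r | uint64(ks[w+2])>>(32-r)
}

// eia256RoundTag8Ref folds one 16-byte chunk into the 64-bit tag:
// every set message bit XORs in the keystream window at its bit offset.
func eia256RoundTag8Ref(t *uint64, ks []uint32, p []byte) {
	for i, b := range p[:16] {
		for j := 0; j < 8; j++ {
			if b&(0x80>>uint(j)) != 0 {
				*t ^= ksWindow64(ks, i*8+j)
			}
		}
	}
}
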
// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
	MOVQ t+0(FP), AX
	MOVQ ks+8(FP), BX
	MOVQ p+16(FP), CX

	CMPB ·useAVX(SB), $1
	JE avx

	// Reverse data bytes
	MOVUPS (0)(CX), XDATA
	MOVOU bit_reverse_and_table<>(SB), XTMP4
	MOVOU XDATA, XTMP2
	PAND XTMP4, XTMP2

	PANDN XDATA, XTMP4
	PSRLQ $4, XTMP4

	MOVOU bit_reverse_table_h<>(SB), XTMP3
	PSHUFB XTMP2, XTMP3

	MOVOU bit_reverse_table_l<>(SB), XTMP1
	PSHUFB XTMP4, XTMP1

	PXOR XTMP1, XTMP3 // XTMP3 - bit reverse data bytes

	// ZUC authentication part, 4x32 data bits
	// setup KS
	MOVUPS (0*4)(BX), XTMP1
	MOVUPS (2*4)(BX), XTMP2
	MOVUPS (4*4)(BX), XTMP4
	PSHUFD $0x61, XTMP1, KS_L // KS bits [63:32 31:0 95:64 63:32]
	PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]
	PSHUFD $0x61, XTMP4, KS_M2 // KS bits [191:160 159:128 223:192 191:160]
	PSHUFD $0xBB, XTMP4, KS_H // KS bits [255:224 223:192 255:224 223:192]

	// setup DATA
	MOVOU XTMP3, XTMP1
	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
	MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]

	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP3
	MOVOU XTMP3, XDIGEST // XDIGEST/XTMP3 - Data bits [95:64 0s 127:96 0s]

	// clmul
	// xor the results from 4 32-bit words together
	// Save data for following products
	MOVOU XTMP2, XTMP5 // Data bits [31:0 0s 63:32 0s]
	MOVOU XTMP3, XTMP6 // Data bits [95:64 0s 127:96 0s]

	// Calculate lower 32 bits of tag
	PCLMULQDQ $0x00, KS_L, XTMP1
	PCLMULQDQ $0x11, KS_L, XTMP2
	PCLMULQDQ $0x00, KS_M1, XDIGEST
	PCLMULQDQ $0x11, KS_M1, XTMP3

	// XOR all products and move bits 63-32 to lower 32 bits
	PXOR XTMP1, XTMP2
	PXOR XTMP3, XDIGEST
	PXOR XTMP2, XDIGEST
	MOVQ XDIGEST, XDIGEST // Clear top 64 bits
	PSRLDQ $4, XDIGEST

	// Prepare data and calculate bits 63-32 of tag
	MOVOU XTMP5, XTMP1
	MOVOU XTMP5, XTMP2
	MOVOU XTMP6, XTMP3
	MOVOU XTMP6, XTMP4

	PCLMULQDQ $0x10, KS_L, XTMP1
	PCLMULQDQ $0x01, KS_M1, XTMP2
	PCLMULQDQ $0x10, KS_M1, XTMP3
	PCLMULQDQ $0x01, KS_M2, XTMP4

	// XOR all the products and keep only bits 63-32
	PXOR XTMP2, XTMP1
	PXOR XTMP4, XTMP3
	PXOR XTMP3, XTMP1
	PAND bits_32_63<>(SB), XTMP1

	// OR with lower 32 bits, to construct 64 bits of tag
	POR XTMP1, XDIGEST

	// Prepare data and calculate bits 95-64 of tag
	MOVOU XTMP5, XTMP1
	MOVOU XTMP5, XTMP2
	MOVOU XTMP6, XTMP3
	MOVOU XTMP6, XTMP4

	PCLMULQDQ $0x00, KS_M1, XTMP1
	PCLMULQDQ $0x11, KS_M1, XTMP2
	PCLMULQDQ $0x00, KS_M2, XTMP3
	PCLMULQDQ $0x11, KS_M2, XTMP4

	// XOR all the products and move bits 63-32 to bits 95-64
	PXOR XTMP2, XTMP1
	PXOR XTMP4, XTMP3
	PXOR XTMP3, XTMP1
	PSHUFB shuf_mask_0_0_dw1_0<>(SB), XTMP1

	// OR with lower 64 bits, to construct 96 bits of tag
	POR XTMP1, XDIGEST

	// Prepare data and calculate bits 127-96 of tag
	MOVOU XTMP5, XTMP1
	MOVOU XTMP5, XTMP2
	MOVOU XTMP6, XTMP3
	MOVOU XTMP6, XTMP4

	PCLMULQDQ $0x10, KS_M1, XTMP1
	PCLMULQDQ $0x01, KS_M2, XTMP2
	PCLMULQDQ $0x10, KS_M2, XTMP3
	PCLMULQDQ $0x01, KS_H, XTMP4

	// XOR all the products and move bits 63-32 to bits 127-96
	PXOR XTMP2, XTMP1
	PXOR XTMP4, XTMP3
	PXOR XTMP3, XTMP1
	PSHUFB shuf_mask_0_0_0_dw1<>(SB), XTMP1

	// OR with lower 96 bits, to construct 128 bits of tag
	POR XTMP1, XDIGEST

	// Update tag
	MOVUPS (AX), XTMP1
	PXOR XTMP1, XDIGEST
	MOVUPS XDIGEST, (AX)

	// Copy last 16 bytes of KS to the front
	MOVUPS (4*4)(BX), XTMP1
	MOVUPS XTMP1, (0*4)(BX)

	RET
avx:
	VMOVDQU (0)(CX), XDATA

	// Reverse data bytes
	VMOVDQU bit_reverse_and_table<>(SB), XTMP1
	VPAND XTMP1, XDATA, XTMP2
	VPANDN XDATA, XTMP1, XTMP3
	VPSRLD $4, XTMP3, XTMP3

	VMOVDQU bit_reverse_table_h<>(SB), XTMP1
	VPSHUFB XTMP2, XTMP1, XTMP4
	VMOVDQU bit_reverse_table_l<>(SB), XTMP1
	VPSHUFB XTMP3, XTMP1, XTMP1
	VPOR XTMP1, XTMP4, XTMP4

	// ZUC authentication part, 4x32 data bits
	// setup KS
	VPSHUFD $0x61, (0*4)(BX), KS_L // KS bits [63:32 31:0 95:64 63:32]
	VPSHUFD $0x61, (2*4)(BX), KS_M1 // KS bits [127:96 95:64 159:128 127:96]
	VPSHUFD $0x61, (4*4)(BX), KS_M2 // KS bits [191:160 159:128 223:192 191:160]
	VPSHUFD $0xBB, (4*4)(BX), KS_H // KS bits [255:224 223:192 255:224 223:192]

	// setup DATA
	// Data bytes [31:0 0s 63:32 0s]
	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP4, XTMP1
	// Data bytes [95:64 0s 127:96 0s]
	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP4, XTMP2

	// clmul
	// xor the results from 4 32-bit words together
	// Calculate lower 32 bits of tag
	VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
	VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6

	VPXOR XTMP3, XTMP4, XTMP3
	VPXOR XTMP5, XTMP6, XTMP5
	VPXOR XTMP3, XTMP5, XTMP3

	// Move previous result to low 32 bits and XOR with previous digest
	VMOVQ XTMP3, XTMP3 // Clear top 64 bits
	VPSRLDQ $4, XTMP3, XDIGEST

	VPCLMULQDQ $0x10, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x01, KS_M1, XTMP1, XTMP4
	VPCLMULQDQ $0x10, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x01, KS_M2, XTMP2, XTMP6

	// XOR all the products and keep only bits 63-32
	VPXOR XTMP4, XTMP3, XTMP3
	VPXOR XTMP6, XTMP5, XTMP5
	VPXOR XTMP5, XTMP3, XTMP3
	VPAND bits_32_63<>(SB), XTMP3, XTMP3

	// XOR with bits 32-63 of previous digest
	VPXOR XTMP3, XDIGEST, XDIGEST

	// Prepare data and calculate bits 95-64 of tag
	VPCLMULQDQ $0x00, KS_M1, XTMP1, XTMP3
	VPCLMULQDQ $0x11, KS_M1, XTMP1, XTMP4
	VPCLMULQDQ $0x00, KS_M2, XTMP2, XTMP5
	VPCLMULQDQ $0x11, KS_M2, XTMP2, XTMP6

	// XOR all the products and move bits 63-32 to bits 95-64
	VPXOR XTMP4, XTMP3, XTMP3
	VPXOR XTMP6, XTMP5, XTMP5
	VPXOR XTMP5, XTMP3, XTMP3

	VPSHUFB shuf_mask_0_0_dw1_0<>(SB), XTMP3, XTMP3

	// XOR with bits 95-64 of previous digest
	VPXOR XTMP3, XDIGEST, XDIGEST

	// Prepare data and calculate bits 127-96 of tag
	VPCLMULQDQ $0x10, KS_M1, XTMP1, XTMP3
	VPCLMULQDQ $0x01, KS_M2, XTMP1, XTMP4
	VPCLMULQDQ $0x10, KS_M2, XTMP2, XTMP5
	VPCLMULQDQ $0x01, KS_H, XTMP2, XTMP6

	// XOR all the products and move bits 63-32 to bits 127-96
	VPXOR XTMP4, XTMP3, XTMP3
	VPXOR XTMP6, XTMP5, XTMP5
	VPXOR XTMP5, XTMP3, XTMP3

	VPSHUFB shuf_mask_0_0_0_dw1<>(SB), XTMP3, XTMP3

	// XOR with bits 127-96 of previous digest
	VPXOR XTMP3, XDIGEST, XDIGEST

	// Update tag
	VPXOR (AX), XDIGEST, XDIGEST
	VMOVDQA XDIGEST, (AX)

	// Copy last 16 bytes of KS to the front
	VMOVDQU (4*4)(BX), XTMP1
	VMOVDQU XTMP1, (0*4)(BX)

	VZEROUPPER
	RET
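
The 16-byte-tag variant runs the same trick four times, once per tag word, each against a keystream window shifted 32 bits further (KS_L through KS_H). A plain-Go reference with hypothetical helper names:

// ksWindow32 returns the 32-bit keystream window starting at bit off.
func ksWindow32(ks []uint32, off int) uint32 {
	w, r := off/32, uint(off%32)
	if r == 0 {
		return ks[w]
	}
	return ks[w]<<r | ks[w+1]>>(32-r)
}

// eia256RoundTag16Ref folds one 16-byte chunk into the 128-bit tag:
// tag word k accumulates the window at bit offset i+32*k, matching the
// four per-word passes in the assembly.
func eia256RoundTag16Ref(t *[4]uint32, ks []uint32, p []byte) {
	for i, b := range p[:16] {
		for j := 0; j < 8; j++ {
			if b&(0x80>>uint(j)) == 0 {
				continue
			}
			off := i*8 + j
			for k := 0; k < 4; k++ {
				t[k] ^= ksWindow32(ks, off+32*k)
			}
		}
	}
}
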
@@ -1,208 +0,0 @@
//go:build !purego

#include "textflag.h"

#define AX R2
#define BX R3
#define CX R4
#define DX R5

#define XTMP1 V1
#define XTMP2 V2
#define XTMP3 V3
#define XTMP4 V4
#define XTMP5 V5
#define XTMP6 V6
#define XDATA V7
#define XDIGEST V8
#define KS_L V9
#define KS_M1 V10
#define KS_M2 V11
#define KS_H V12
#define BIT_REV_AND_TAB V20
#define BIT_REV_TAB_L V21
#define BIT_REV_TAB_H V22
#define SHUF_MASK_DW0_DW1 V23
#define SHUF_MASK_DW2_DW3 V24

#define LOAD_GLOBAL_DATA() \
	MOVD $·eia_const(SB), R0 \
	VLD1 (R0), [BIT_REV_TAB_L.B16, BIT_REV_TAB_H.B16, SHUF_MASK_DW0_DW1.B16, SHUF_MASK_DW2_DW3.B16] \
	MOVW $0x0F0F0F0F, R0 \
	VDUP R0, BIT_REV_AND_TAB.S4

// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
	MOVD t+0(FP), AX
	MOVD ks+8(FP), BX
	MOVD p+16(FP), CX

	LOAD_GLOBAL_DATA()

	// Reverse data bytes
	VLD1 (CX), [XDATA.B16]
	VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
	VUSHR $4, XDATA.B16, XTMP1.B16

	VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
	VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
	VEOR XTMP1.B16, XTMP3.B16, XTMP3.B16 // XTMP3 - bit reverse data bytes

	// ZUC authentication part, 4x32 data bits
	// setup KS
	VLD1 (BX), [XTMP1.B16, XTMP2.B16]
	VST1 [XTMP2.B16], (BX) // Copy last 16 bytes of KS to the front
	// TODO: Any better solution???
	VMOVQ $0x0302010007060504, $0x070605040b0a0908, XTMP4
	VTBL XTMP4.B16, [XTMP1.B16], KS_L.B16 // KS bits [63:32 31:0 95:64 63:32]
	VTBL XTMP4.B16, [XTMP2.B16], KS_M2.B16 // KS bits [191:160 159:128 223:192 191:160]
	VDUP XTMP1.S[3], KS_M1.S4
	VMOV XTMP1.S[2], KS_M1.S[1]
	VMOV XTMP2.S[0], KS_M1.S[2] // KS bits [127:96 95:64 159:128 127:96]

	// setup DATA
	VTBL SHUF_MASK_DW0_DW1.B16, [XTMP3.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s]
	VTBL SHUF_MASK_DW2_DW3.B16, [XTMP3.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s]

	// clmul
	// xor the results from 4 32-bit words together

	// Calculate lower 32 bits of tag
	VPMULL KS_L.D1, XTMP1.D1, XTMP3.Q1
	VPMULL2 KS_L.D2, XTMP1.D2, XTMP4.Q1
	VPMULL KS_M1.D1, XTMP2.D1, XTMP5.Q1
	VPMULL2 KS_M1.D2, XTMP2.D2, XTMP6.Q1

	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16

	// Move previous result to low 32 bits and XOR with previous digest
	VMOV XTMP3.S[1], XDIGEST.S[0]

	// Prepare data and calculate bits 63-32 of tag
	VEXT $8, KS_L.B16, KS_L.B16, XTMP5.B16
	VPMULL XTMP5.D1, XTMP1.D1, XTMP3.Q1
	VEXT $8, XTMP1.B16, XTMP1.B16, XTMP5.B16
	VPMULL KS_M1.D1, XTMP5.D1, XTMP4.Q1
	VEXT $8, KS_M1.B16, KS_M1.B16, XTMP1.B16
	VPMULL XTMP1.D1, XTMP2.D1, XTMP5.Q1
	VEXT $8, XTMP2.B16, XTMP2.B16, XTMP1.B16
	VPMULL KS_M2.D1, XTMP1.D1, XTMP6.Q1

	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16

	VMOV XTMP3.S[1], XDIGEST.S[1]

	VMOV XDIGEST.D[0], R10
	MOVD (AX), R11
	EOR R10, R11
	MOVD R11, (AX)

	RET

// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
	MOVD t+0(FP), AX
	MOVD ks+8(FP), BX
	MOVD p+16(FP), CX

	LOAD_GLOBAL_DATA()

	// Reverse data bytes
	VLD1 (CX), [XDATA.B16]
	VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
	VUSHR $4, XDATA.B16, XTMP1.B16

	VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
	VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
	VEOR XTMP1.B16, XTMP3.B16, XTMP3.B16 // XTMP3 - bit reverse data bytes

	// ZUC authentication part, 4x32 data bits
	// setup KS
	VLD1 (BX), [XTMP1.B16, XTMP2.B16]
	VST1 [XTMP2.B16], (BX) // Copy last 16 bytes of KS to the front
	// TODO: Any better solution??? We can use VTBL, but there is no performance improvement if we can't reuse the MASK constant
	VMOVQ $0x0302010007060504, $0x070605040b0a0908, XTMP4
	VTBL XTMP4.B16, [XTMP1.B16], KS_L.B16 // KS bits [63:32 31:0 95:64 63:32]
	VTBL XTMP4.B16, [XTMP2.B16], KS_M2.B16 // KS bits [191:160 159:128 223:192 191:160]
	VMOVQ $0x0b0a09080f0e0d0c, $0x0b0a09080f0e0d0c, XTMP4
	VTBL XTMP4.B16, [XTMP2.B16], KS_H.B16 // KS bits [255:224 223:192 255:224 223:192]
	VDUP XTMP1.S[3], KS_M1.S4
	VMOV XTMP1.S[2], KS_M1.S[1]
	VMOV XTMP2.S[0], KS_M1.S[2] // KS bits [127:96 95:64 159:128 127:96]

	// setup DATA
	VTBL SHUF_MASK_DW0_DW1.B16, [XTMP3.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s]
	VTBL SHUF_MASK_DW2_DW3.B16, [XTMP3.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s]

	// clmul
	// xor the results from 4 32-bit words together

	// Calculate lower 32 bits of tag
	VPMULL KS_L.D1, XTMP1.D1, XTMP3.Q1
	VPMULL2 KS_L.D2, XTMP1.D2, XTMP4.Q1
	VPMULL KS_M1.D1, XTMP2.D1, XTMP5.Q1
	VPMULL2 KS_M1.D2, XTMP2.D2, XTMP6.Q1

	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16

	// Move previous result to low 32 bits and XOR with previous digest
	VMOV XTMP3.S[1], XDIGEST.S[0]

	// Prepare data and calculate bits 63-32 of tag
	VEXT $8, KS_L.B16, KS_L.B16, XTMP5.B16
	VPMULL XTMP5.D1, XTMP1.D1, XTMP3.Q1
	VEXT $8, XTMP1.B16, XTMP1.B16, XTMP5.B16
	VPMULL KS_M1.D1, XTMP5.D1, XTMP4.Q1
	VEXT $8, KS_M1.B16, KS_M1.B16, XTMP6.B16
	VPMULL XTMP6.D1, XTMP2.D1, XTMP5.Q1
	VEXT $8, XTMP2.B16, XTMP2.B16, KS_L.B16
	VPMULL KS_M2.D1, KS_L.D1, XTMP6.Q1

	// XOR all the products and keep only bits 63-32
	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16

	VMOV XTMP3.S[1], XDIGEST.S[1]

	// Prepare data and calculate bits 95-64 of tag
	VPMULL KS_M1.D1, XTMP1.D1, XTMP3.Q1
	VPMULL2 KS_M1.D2, XTMP1.D2, XTMP4.Q1
	VPMULL KS_M2.D1, XTMP2.D1, XTMP5.Q1
	VPMULL2 KS_M2.D2, XTMP2.D2, XTMP6.Q1

	// XOR all the products and move bits 63-32 to bits 95-64
	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16

	VMOV XTMP3.S[1], XDIGEST.S[2]

	// Prepare data and calculate bits 127-96 of tag
	VEXT $8, KS_M1.B16, KS_M1.B16, XTMP5.B16
	VPMULL XTMP5.D1, XTMP1.D1, XTMP3.Q1
	VEXT $8, XTMP1.B16, XTMP1.B16, XTMP5.B16
	VPMULL KS_M2.D1, XTMP5.D1, XTMP4.Q1
	VEXT $8, KS_M2.B16, KS_M2.B16, XTMP6.B16
	VPMULL XTMP6.D1, XTMP2.D1, XTMP5.Q1
	VEXT $8, XTMP2.B16, XTMP2.B16, KS_L.B16
	VPMULL KS_H.D1, KS_L.D1, XTMP6.Q1

	// XOR all the products and move bits 63-32 to bits 127-96
	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16

	VMOV XTMP3.S[1], XDIGEST.S[3]

	VLD1 (AX), [XTMP1.B16]
	VEOR XTMP1.B16, XDIGEST.B16, XDIGEST.B16
	VST1 [XDIGEST.B16], (AX)

	RET
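
The arm64 port has no PSHUFD, so the VMOVQ+VTBL pair above builds the keystream lane layout from an explicit byte mask (the TODO notes the mask constant cannot easily be shared). What the mask $0x0302010007060504, $0x070605040b0a0908 produces, expressed as a plain-Go helper (name is ours):

// ksLanesL mirrors the VTBL shuffle feeding KS_L: the four 32-bit lanes
// become [ks[1], ks[0], ks[2], ks[1]], i.e. two 64-bit keystream windows
// laid out for VPMULL/VPMULL2.
func ksLanesL(ks []uint32) [4]uint32 {
	return [4]uint32{ks[1], ks[0], ks[2], ks[1]}
}
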
@@ -1,196 +0,0 @@
// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

#include "textflag.h"

#define XTMP1 V0
#define XTMP2 V1
#define XTMP3 V2
#define XTMP4 V3
#define XTMP5 V4
#define XTMP6 V5
#define XDATA V6
#define XDIGEST V7
#define KS_L V8
#define KS_M1 V9
#define KS_M2 V10
#define KS_H V11
#define BIT_REV_TAB_L V12
#define BIT_REV_TAB_H V13
#define ZERO V15
#define PTR R7

// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
	MOVD t+0(FP), R3
	MOVD ks+8(FP), R4
	MOVD p+16(FP), R5

#ifndef GOARCH_ppc64le
	MOVD $·rcon(SB), PTR // PTR points to rcon addr
	LVX (PTR), XTMP1
	ADD $0x10, PTR
#else
	MOVD $·rcon+0x10(SB), PTR // PTR points to rcon addr (skipping permute vector)
#endif

	LXVD2X (R5)(R0), XDATA
#ifndef GOARCH_ppc64le
	VPERM XDATA, XDATA, XTMP1, XDATA
#endif

	VSPLTISB $4, XTMP2;
	LXVD2X (PTR)(R0), BIT_REV_TAB_L
	VSLB BIT_REV_TAB_L, XTMP2, BIT_REV_TAB_H
	VPERMXOR BIT_REV_TAB_L, BIT_REV_TAB_H, XDATA, XTMP3 // XTMP3 - bit reverse data bytes

	// ZUC authentication part, 4x32 data bits
	// setup data
	VSPLTISB $0, ZERO
	MOVD $0x10, R8
	LXVD2X (PTR)(R8), XTMP4
	VPERM ZERO, XTMP3, XTMP4, XTMP1
	MOVD $0x20, R8
	LXVD2X (PTR)(R8), XTMP4
	VPERM ZERO, XTMP3, XTMP4, XTMP2

	// setup KS
	LXVW4X (R4), KS_L
	MOVD $8, R8
	LXVW4X (R8)(R4), KS_M1
	MOVD $16, R8
	LXVW4X (R8)(R4), KS_M2
	MOVD $0x30, R8
	LXVD2X (PTR)(R8), XTMP4
	VPERM KS_L, KS_L, XTMP4, KS_L
	VPERM KS_M1, KS_M1, XTMP4, KS_M1
	VPERM KS_M2, KS_M2, XTMP4, KS_M2

	// clmul
	// xor the results from 4 32-bit words together
	// Calculate lower 32 bits of tag
	VPMSUMD XTMP1, KS_L, XTMP3
	VPMSUMD XTMP2, KS_M1, XTMP4
	VXOR XTMP3, XTMP4, XTMP3
	VSPLTW $2, XTMP3, XDIGEST

	// Calculate upper 32 bits of tag
	VSLDOI $8, KS_M1, KS_L, KS_L
	VPMSUMD XTMP1, KS_L, XTMP3
	VSLDOI $8, KS_M2, KS_M1, KS_M1
	VPMSUMD XTMP2, KS_M1, XTMP4
	VXOR XTMP3, XTMP4, XTMP3
	VSPLTW $2, XTMP3, XTMP3

	// Update tag
#ifdef GOARCH_ppc64le
	VSLDOI $12, XTMP3, XDIGEST, XDIGEST
#else
	VSLDOI $12, XDIGEST, XTMP3, XDIGEST
#endif
	MFVSRD XDIGEST, R8
	MOVD (R3), R6
	XOR R6, R8, R6
	MOVD R6, (R3)

	// Copy last 16 bytes of KS to the front
	MOVD $16, R8
	LXVD2X (R8)(R4), XTMP1
	STXVD2X XTMP1, (R4)(R0)

	RET

// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
	MOVD t+0(FP), R3
	MOVD ks+8(FP), R4
	MOVD p+16(FP), R5

#ifndef GOARCH_ppc64le
	MOVD $·rcon(SB), PTR // PTR points to rcon addr
	LVX (PTR), XTMP1
	ADD $0x10, PTR
#else
	MOVD $·rcon+0x10(SB), PTR // PTR points to rcon addr (skipping permute vector)
#endif

	LXVD2X (R5)(R0), XDATA
#ifndef GOARCH_ppc64le
	VPERM XDATA, XDATA, XTMP1, XDATA
#endif

	VSPLTISB $4, XTMP2;
	LXVD2X (PTR)(R0), BIT_REV_TAB_L
	VSLB BIT_REV_TAB_L, XTMP2, BIT_REV_TAB_H
	VPERMXOR BIT_REV_TAB_L, BIT_REV_TAB_H, XDATA, XTMP3 // XTMP3 - bit reverse data bytes

	// ZUC authentication part, 4x32 data bits
	// setup data
	VSPLTISB $0, ZERO
	MOVD $0x10, R8
	LXVD2X (PTR)(R8), XTMP4
	VPERM ZERO, XTMP3, XTMP4, XTMP1
	MOVD $0x20, R8
	LXVD2X (PTR)(R8), XTMP4
	VPERM ZERO, XTMP3, XTMP4, XTMP2

	// setup KS
	LXVW4X (R4), KS_L
	MOVD $8, R8
	LXVW4X (R8)(R4), KS_M1
	MOVD $16, R8
	LXVW4X (R8)(R4), KS_M2
	VOR KS_M2, KS_M2, KS_H
	MOVD $0x30, R8
	LXVD2X (PTR)(R8), XTMP4
	VPERM KS_L, KS_L, XTMP4, KS_L
	VPERM KS_M1, KS_M1, XTMP4, KS_M1
	VPERM KS_M2, KS_M2, XTMP4, KS_M2

	// clmul
	// xor the results from 4 32-bit words together
	// Calculate lower 32 bits of tag
	VPMSUMD XTMP1, KS_L, XTMP3
	VPMSUMD XTMP2, KS_M1, XTMP4
	VXOR XTMP3, XTMP4, XTMP3
	VSLDOI $12, XTMP3, XTMP3, XDIGEST

	// Calculate upper 32 bits of tag
	VSLDOI $8, KS_M1, KS_L, KS_L
	VPMSUMD XTMP1, KS_L, XTMP3
	VSLDOI $8, KS_M2, KS_M1, XTMP5
	VPMSUMD XTMP2, XTMP5, XTMP4
	VXOR XTMP3, XTMP4, XTMP3
	VSLDOI $8, XTMP3, XTMP3, XTMP3
	VSLDOI $4, XDIGEST, XTMP3, XDIGEST

	// calculate bits 95-64 of tag
	VPMSUMD XTMP1, KS_M1, XTMP3
	VPMSUMD XTMP2, KS_M2, XTMP4
	VXOR XTMP3, XTMP4, XTMP3
	VSLDOI $8, XTMP3, XTMP3, XTMP3
	VSLDOI $4, XDIGEST, XTMP3, XDIGEST

	// calculate bits 127-96 of tag
	VSLDOI $8, KS_M2, KS_M1, KS_M1
	VPMSUMD XTMP1, KS_M1, XTMP3
	VSLDOI $8, KS_H, KS_M2, KS_M2
	VPMSUMD XTMP2, KS_M2, XTMP4
	VXOR XTMP3, XTMP4, XTMP3
	VSLDOI $8, XTMP3, XTMP3, XTMP3
	VSLDOI $4, XDIGEST, XTMP3, XDIGEST

	// Update tag
	LXVW4X (R3)(R0), XTMP1
	VXOR XTMP1, XDIGEST, XDIGEST
	STXVW4X XDIGEST, (R3)

	// Copy last 16 bytes of KS to the front
	MOVD $16, R8
	LXVD2X (R8)(R4), XTMP1
	STXVD2X XTMP1, (R4)(R0)

	RET
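
All three ports end a round the same way: the upper 16 bytes of the keystream buffer slide down so the caller can generate four fresh words. In Go this is just a copy; the 8-word buffer size is an assumption based on the m.k0 usage in the Go hunks below:

// slideKeystream mirrors the "Copy last 16 bytes of KS to the front" step:
// ks[4:8] holds the still-unconsumed words after a 16-byte round, and
// genKeywords refills ks[4:8] before the next round.
func slideKeystream(ks *[8]uint32) {
	copy(ks[:4], ks[4:8])
}
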
@@ -9,13 +9,13 @@ import (
var supportsGFMUL = cpuid.HasGFMUL || cpuid.HasVPMSUMD

//go:noescape
func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
func eiaRoundTag4(t *uint32, keyStream *uint32, p *byte)

func block(m *ZUC128Mac, p []byte) {
	if supportsGFMUL {
		for len(p) >= chunk {
			m.genKeywords(m.k0[4:])
			eia3Round16B(&m.t, &m.k0[0], &p[0], m.tagSize)
			eiaRoundTag4(&m.t, &m.k0[0], &p[0])
			p = p[chunk:]
		}
	} else {
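
The rename drops the unused tagSize argument: the ZUC-128 round always produces a 4-byte tag. A plain-Go model of the renamed round, reusing ksWindow32 from the sketch after the Tag16 section above:

// eiaRoundTag4Ref folds one 16-byte chunk into the 32-bit ZUC-128 tag.
func eiaRoundTag4Ref(t *uint32, ks []uint32, p []byte) {
	for i, b := range p[:16] {
		for j := 0; j < 8; j++ {
			if b&(0x80>>uint(j)) != 0 {
				*t ^= ksWindow32(ks, i*8+j)
			}
		}
	}
}
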
@@ -37,32 +37,41 @@ GLOBL shuf_mask_dw2_0_dw3_0<>(SB), RODATA, $16
#define KS_M2 X11
#define KS_H X12

// func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
TEXT ·eia3Round16B(SB),NOSPLIT,$0
#define BIT_REVERSE_SSE(XDATA, XTMP1, XTMP2) \
	MOVOU bit_reverse_and_table<>(SB), XTMP1; \
	MOVOU XDATA, XTMP2; \
	PAND XTMP1, XTMP2; \
	PANDN XDATA, XTMP1; \
	PSRLQ $4, XTMP1; \
	MOVOU bit_reverse_table_h<>(SB), XDATA; \
	PSHUFB XTMP2, XDATA; \
	MOVOU bit_reverse_table_l<>(SB), XTMP2; \
	PSHUFB XTMP1, XTMP2; \
	PXOR XTMP2, XDATA

#define BIT_REVERSE_AVX(XDATA, XTMP1, XTMP2) \
	VMOVDQU bit_reverse_and_table<>(SB), XTMP1; \
	VPAND XTMP1, XDATA, XTMP2; \
	VPANDN XDATA, XTMP1, XTMP1; \
	VPSRLD $4, XTMP1, XTMP1; \
	VMOVDQU bit_reverse_table_h<>(SB), XDATA; \
	VPSHUFB XTMP2, XDATA, XDATA; \
	VMOVDQU bit_reverse_table_l<>(SB), XTMP2; \
	VPSHUFB XTMP1, XTMP2, XTMP1; \
	VPOR XTMP1, XDATA, XDATA
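
The refactoring hoists the duplicated bit-reversal sequences into these shared macros. A quick property check (a hypothetical test, not part of the commit) that the table trick the macros encode equals a true bit reversal for every byte value:

package zuc_test

import (
	"math/bits"
	"testing"
)

func TestBitReverseTables(t *testing.T) {
	revNibble := [16]byte{0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe, 0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf}
	for v := 0; v < 256; v++ {
		b := byte(v)
		// table_h lookup (reversed low nibble, pre-shifted) XOR table_l lookup
		got := revNibble[b&0x0f]<<4 | revNibble[b>>4]
		if got != bits.Reverse8(b) {
			t.Fatalf("byte %#02x: got %#02x, want %#02x", b, got, bits.Reverse8(b))
		}
	}
}
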
// func eiaRoundTag4(t *uint32, keyStream *uint32, p *byte)
TEXT ·eiaRoundTag4(SB),NOSPLIT,$0
	MOVQ t+0(FP), AX
	MOVQ ks+8(FP), BX
	MOVQ p+16(FP), CX
	MOVQ tagSize+24(FP), DX

	CMPB ·useAVX(SB), $1
	JE avx

	// Reverse data bytes
	MOVUPS (0)(CX), XDATA
	MOVOU bit_reverse_and_table<>(SB), XTMP4
	MOVOU XDATA, XTMP2
	PAND XTMP4, XTMP2

	PANDN XDATA, XTMP4
	PSRLQ $4, XTMP4

	MOVOU bit_reverse_table_h<>(SB), XTMP3
	PSHUFB XTMP2, XTMP3

	MOVOU bit_reverse_table_l<>(SB), XTMP1
	PSHUFB XTMP4, XTMP1

	PXOR XTMP1, XTMP3 // XTMP3 - bit reverse data bytes
	BIT_REVERSE_SSE(XDATA, XTMP1, XTMP2)

	// ZUC authentication part, 4x32 data bits
	// setup KS
@@ -72,12 +81,12 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
	PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]

	// setup DATA
	MOVOU XTMP3, XTMP1
	MOVOU XDATA, XTMP1
	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
	MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]

	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP3
	MOVOU XTMP3, XDIGEST // XDIGEST/XTMP3 - Data bits [95:64 0s 127:96 0s]
	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA
	MOVOU XDATA, XDIGEST // XDIGEST/XDATA - Data bits [95:64 0s 127:96 0s]

	// clmul
	// xor the results from 4 32-bit words together
@@ -85,11 +94,11 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
	PCLMULQDQ $0x00, KS_L, XTMP1
	PCLMULQDQ $0x11, KS_L, XTMP2
	PCLMULQDQ $0x00, KS_M1, XDIGEST
	PCLMULQDQ $0x11, KS_M1, XTMP3
	PCLMULQDQ $0x11, KS_M1, XDATA

	// XOR all products and move bits 63-32 to lower 32 bits
	PXOR XTMP1, XTMP2
	PXOR XTMP3, XDIGEST
	PXOR XDATA, XDIGEST
	PXOR XTMP2, XDIGEST
	PSRLDQ $4, XDIGEST
@@ -105,18 +114,8 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0

avx:
	VMOVDQU (0)(CX), XDATA

	// Reverse data bytes
	VMOVDQU bit_reverse_and_table<>(SB), XTMP1
	VPAND XTMP1, XDATA, XTMP2
	VPANDN XDATA, XTMP1, XTMP3
	VPSRLD $4, XTMP3, XTMP3

	VMOVDQU bit_reverse_table_h<>(SB), XTMP1
	VPSHUFB XTMP2, XTMP1, XTMP4
	VMOVDQU bit_reverse_table_l<>(SB), XTMP1
	VPSHUFB XTMP3, XTMP1, XTMP1
	VPOR XTMP1, XTMP4, XTMP4
	BIT_REVERSE_AVX(XDATA, XTMP1, XTMP2)

	// ZUC authentication part, 4x32 data bits
	// setup KS
@@ -125,9 +124,9 @@ avx:

	// setup DATA
	// Data bytes [31:0 0s 63:32 0s]
	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP4, XTMP1
	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XDATA, XTMP1
	// Data bytes [95:64 0s 127:96 0s]
	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP4, XTMP2
	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA, XTMP2

	// clmul
	// xor the results from 4 32-bit words together
@@ -140,14 +139,339 @@ avx:
	VPXOR XTMP3, XTMP4, XTMP3
	VPXOR XTMP5, XTMP6, XTMP5
	VPXOR XTMP3, XTMP5, XDIGEST
	VPSLLDQ $4, XDIGEST, XDIGEST

	VMOVQ XDIGEST, R10
	SHRQ $32, R10
	// Update tag
	MOVL XDIGEST, R10
	XORL R10, (AX)

	// Copy last 16 bytes of KS to the front
	VMOVDQU (4*4)(BX), XTMP1
	VMOVDQU XTMP1, (0*4)(BX)

	VZEROUPPER
	RET
// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
	MOVQ t+0(FP), AX
	MOVQ ks+8(FP), BX
	MOVQ p+16(FP), CX

	CMPB ·useAVX(SB), $1
	JE avx

	// Reverse data bytes
	MOVUPS (0)(CX), XDATA
	BIT_REVERSE_SSE(XDATA, XTMP1, XTMP2)

	// ZUC authentication part, 4x32 data bits
	// setup KS
	MOVUPS (0*4)(BX), XTMP1
	MOVUPS (2*4)(BX), XTMP2
	MOVUPS (4*4)(BX), XTMP4
	PSHUFD $0x61, XTMP1, KS_L // KS bits [63:32 31:0 95:64 63:32]
	PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]
	PSHUFD $0x61, XTMP4, KS_M2 // KS bits [191:160 159:128 223:192 191:160]

	// setup DATA
	MOVOU XDATA, XTMP1
	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
	MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]

	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA
	MOVOU XDATA, XDIGEST // XDIGEST/XDATA - Data bits [95:64 0s 127:96 0s]

	// clmul
	// xor the results from 4 32-bit words together
	// Save data for following products
	MOVOU XTMP2, XTMP5 // Data bits [31:0 0s 63:32 0s]
	MOVOU XDATA, XTMP6 // Data bits [95:64 0s 127:96 0s]

	// Calculate lower 32 bits of tag
	PCLMULQDQ $0x00, KS_L, XTMP1
	PCLMULQDQ $0x11, KS_L, XTMP2
	PCLMULQDQ $0x00, KS_M1, XDIGEST
	PCLMULQDQ $0x11, KS_M1, XDATA

	// XOR all products
	PXOR XTMP1, XTMP2
	PXOR XDATA, XDIGEST
	PXOR XTMP2, XDIGEST
	PSLLDQ $8, XDIGEST // Move bits 63-32 to bits 127-96

	// Prepare data and calculate bits 63-32 of tag
	MOVOU XTMP5, XTMP1
	MOVOU XTMP5, XTMP2
	MOVOU XTMP6, XTMP3
	MOVOU XTMP6, XTMP4

	PCLMULQDQ $0x10, KS_L, XTMP1
	PCLMULQDQ $0x01, KS_M1, XTMP2
	PCLMULQDQ $0x10, KS_M1, XTMP3
	PCLMULQDQ $0x01, KS_M2, XTMP4

	// XOR all the products and keep only bits 63-32
	PXOR XTMP2, XTMP1
	PXOR XTMP4, XTMP3
	PXOR XTMP3, XTMP1
	PSRLDQ $4, XTMP1 // Move bits 63-32 to bits 31-0

	PALIGNR $12, XDIGEST, XTMP1 // XTMP1 || XDIGEST

	// Update tag
	MOVQ XTMP1, R10
	XORQ R10, (AX)

	// Copy last 16 bytes of KS to the front
	MOVUPS (4*4)(BX), XTMP1
	MOVUPS XTMP1, (0*4)(BX)

	RET
avx:
	VMOVDQU (0)(CX), XDATA

	// Reverse data bytes
	BIT_REVERSE_AVX(XDATA, XTMP1, XTMP2)

	// ZUC authentication part, 4x32 data bits
	// setup KS
	VPSHUFD $0x61, (0*4)(BX), KS_L // KS bits [63:32 31:0 95:64 63:32]
	VPSHUFD $0x61, (2*4)(BX), KS_M1 // KS bits [127:96 95:64 159:128 127:96]
	VPSHUFD $0x61, (4*4)(BX), KS_M2 // KS bits [191:160 159:128 223:192 191:160]

	// setup DATA
	// Data bytes [31:0 0s 63:32 0s]
	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XDATA, XTMP1
	// Data bytes [95:64 0s 127:96 0s]
	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA, XTMP2

	// clmul
	// xor the results from 4 32-bit words together
	// Calculate lower 32 bits of tag
	VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
	VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6

	VPXOR XTMP3, XTMP4, XTMP3
	VPXOR XTMP5, XTMP6, XTMP5
	VPXOR XTMP3, XTMP5, XTMP3
	VPSLLDQ $8, XTMP3, XDIGEST

	VPCLMULQDQ $0x10, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x01, KS_M1, XTMP1, XTMP4
	VPCLMULQDQ $0x10, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x01, KS_M2, XTMP2, XTMP6

	// XOR all the products and keep only bits 63-32
	VPXOR XTMP4, XTMP3, XTMP3
	VPXOR XTMP6, XTMP5, XTMP5
	VPXOR XTMP5, XTMP3, XTMP3
	VPSRLDQ $4, XTMP3, XTMP3

	VPALIGNR $12, XDIGEST, XTMP3, XDIGEST

	// Update tag
	VMOVQ XDIGEST, R10
	XORQ R10, (AX)

	// Copy last 16 bytes of KS to the front
	VMOVDQU (4*4)(BX), XTMP1
	VMOVDQU XTMP1, (0*4)(BX)

	RET
// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
	MOVQ t+0(FP), AX
	MOVQ ks+8(FP), BX
	MOVQ p+16(FP), CX

	CMPB ·useAVX(SB), $1
	JE avx

	// Reverse data bytes
	MOVUPS (0)(CX), XDATA
	BIT_REVERSE_SSE(XDATA, XTMP1, XTMP2)

	// ZUC authentication part, 4x32 data bits
	// setup KS
	MOVUPS (0*4)(BX), XTMP1
	MOVUPS (2*4)(BX), XTMP2
	MOVUPS (4*4)(BX), XTMP4
	PSHUFD $0x61, XTMP1, KS_L // KS bits [63:32 31:0 95:64 63:32]
	PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]
	PSHUFD $0x61, XTMP4, KS_M2 // KS bits [191:160 159:128 223:192 191:160]
	PSHUFD $0xBB, XTMP4, KS_H // KS bits [255:224 223:192 255:224 223:192]

	// setup DATA
	MOVOU XDATA, XTMP1
	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
	MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]

	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA
	MOVOU XDATA, XDIGEST // XDIGEST/XDATA - Data bits [95:64 0s 127:96 0s]

	// clmul
	// xor the results from 4 32-bit words together
	// Save data for following products
	MOVOU XTMP2, XTMP5 // Data bits [31:0 0s 63:32 0s]
	MOVOU XDATA, XTMP6 // Data bits [95:64 0s 127:96 0s]

	// Calculate lower 32 bits of tag
	PCLMULQDQ $0x00, KS_L, XTMP1
	PCLMULQDQ $0x11, KS_L, XTMP2
	PCLMULQDQ $0x00, KS_M1, XDIGEST
	PCLMULQDQ $0x11, KS_M1, XDATA

	// XOR all products
	PXOR XTMP1, XTMP2
	PXOR XDATA, XDIGEST
	PXOR XTMP2, XDIGEST
	PSLLDQ $8, XDIGEST // Move bits 63-32 to bits 127-96

	// Prepare data and calculate bits 63-32 of tag
	MOVOU XTMP5, XTMP1
	MOVOU XTMP5, XTMP2
	MOVOU XTMP6, XTMP3
	MOVOU XTMP6, XTMP4

	PCLMULQDQ $0x10, KS_L, XTMP1
	PCLMULQDQ $0x01, KS_M1, XTMP2
	PCLMULQDQ $0x10, KS_M1, XTMP3
	PCLMULQDQ $0x01, KS_M2, XTMP4

	// XOR all the products and keep only bits 63-32
	PXOR XTMP2, XTMP1
	PXOR XTMP4, XTMP3
	PXOR XTMP3, XTMP1
	PSRLDQ $4, XTMP1 // Move bits 63-32 to bits 31-0
	PALIGNR $4, XDIGEST, XTMP1 // XTMP1 || XDIGEST

	// Prepare data and calculate bits 95-64 of tag
	MOVOU XTMP5, XDIGEST
	MOVOU XTMP5, XTMP2
	MOVOU XTMP6, XTMP3
	MOVOU XTMP6, XTMP4

	PCLMULQDQ $0x00, KS_M1, XDIGEST
	PCLMULQDQ $0x11, KS_M1, XTMP2
	PCLMULQDQ $0x00, KS_M2, XTMP3
	PCLMULQDQ $0x11, KS_M2, XTMP4

	// XOR all the products and move bits 63-32 to bits 95-64
	PXOR XTMP2, XDIGEST
	PXOR XTMP4, XTMP3
	PXOR XTMP3, XDIGEST
	PSRLDQ $4, XDIGEST // Move bits 63-32 to bits 31-0
	PALIGNR $4, XTMP1, XDIGEST // XDIGEST || XTMP1

	// Prepare data and calculate bits 127-96 of tag
	MOVOU XTMP5, XTMP1
	MOVOU XTMP5, XTMP2
	MOVOU XTMP6, XTMP3
	MOVOU XTMP6, XTMP4

	PCLMULQDQ $0x10, KS_M1, XTMP1
	PCLMULQDQ $0x01, KS_M2, XTMP2
	PCLMULQDQ $0x10, KS_M2, XTMP3
	PCLMULQDQ $0x01, KS_H, XTMP4

	// XOR all the products and move bits 63-32 to bits 127-96
	PXOR XTMP2, XTMP1
	PXOR XTMP4, XTMP3
	PXOR XTMP3, XTMP1
	PSRLDQ $4, XTMP1 // Move bits 63-32 to bits 31-0
	PALIGNR $4, XDIGEST, XTMP1 // XTMP1 || XDIGEST

	// Update tag
	MOVUPS (AX), XDIGEST
	PXOR XTMP1, XDIGEST
	MOVUPS XDIGEST, (AX)

	// Copy last 16 bytes of KS to the front
	MOVUPS (4*4)(BX), XTMP1
	MOVUPS XTMP1, (0*4)(BX)

	RET
avx:
	VMOVDQU (0)(CX), XDATA

	// Reverse data bytes
	BIT_REVERSE_AVX(XDATA, XTMP1, XTMP2)

	// ZUC authentication part, 4x32 data bits
	// setup KS
	VPSHUFD $0x61, (0*4)(BX), KS_L // KS bits [63:32 31:0 95:64 63:32]
	VPSHUFD $0x61, (2*4)(BX), KS_M1 // KS bits [127:96 95:64 159:128 127:96]
	VPSHUFD $0x61, (4*4)(BX), KS_M2 // KS bits [191:160 159:128 223:192 191:160]
	VPSHUFD $0xBB, (4*4)(BX), KS_H // KS bits [255:224 223:192 255:224 223:192]

	// setup DATA
	// Data bytes [31:0 0s 63:32 0s]
	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XDATA, XTMP1
	// Data bytes [95:64 0s 127:96 0s]
	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA, XTMP2

	// clmul
	// xor the results from 4 32-bit words together
	// Calculate lower 32 bits of tag
	VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
	VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6

	VPXOR XTMP3, XTMP4, XTMP3
	VPXOR XTMP5, XTMP6, XTMP5
	VPXOR XTMP3, XTMP5, XTMP3
	VPSLLDQ $8, XTMP3, XDIGEST

	VPCLMULQDQ $0x10, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x01, KS_M1, XTMP1, XTMP4
	VPCLMULQDQ $0x10, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x01, KS_M2, XTMP2, XTMP6

	// XOR all the products and keep only bits 63-32
	VPXOR XTMP4, XTMP3, XTMP3
	VPXOR XTMP6, XTMP5, XTMP5
	VPXOR XTMP5, XTMP3, XTMP3
	VPSRLDQ $4, XTMP3, XTMP3
	VPALIGNR $4, XDIGEST, XTMP3, XDIGEST

	// Prepare data and calculate bits 95-64 of tag
	VPCLMULQDQ $0x00, KS_M1, XTMP1, XTMP3
	VPCLMULQDQ $0x11, KS_M1, XTMP1, XTMP4
	VPCLMULQDQ $0x00, KS_M2, XTMP2, XTMP5
	VPCLMULQDQ $0x11, KS_M2, XTMP2, XTMP6

	// XOR all the products and move bits 63-32 to bits 95-64
	VPXOR XTMP4, XTMP3, XTMP3
	VPXOR XTMP6, XTMP5, XTMP5
	VPXOR XTMP5, XTMP3, XTMP3
	VPSRLDQ $4, XTMP3, XTMP3
	VPALIGNR $4, XDIGEST, XTMP3, XDIGEST

	// Prepare data and calculate bits 127-96 of tag
	VPCLMULQDQ $0x10, KS_M1, XTMP1, XTMP3
	VPCLMULQDQ $0x01, KS_M2, XTMP1, XTMP4
	VPCLMULQDQ $0x10, KS_M2, XTMP2, XTMP5
	VPCLMULQDQ $0x01, KS_H, XTMP2, XTMP6

	// XOR all the products and move bits 63-32 to bits 127-96
	VPXOR XTMP4, XTMP3, XTMP3
	VPXOR XTMP6, XTMP5, XTMP5
	VPXOR XTMP5, XTMP3, XTMP3
	VPSRLDQ $4, XTMP3, XTMP3
	VPALIGNR $4, XDIGEST, XTMP3, XDIGEST

	// Update tag
	VPXOR (AX), XDIGEST, XDIGEST
	VMOVDQA XDIGEST, (AX)

	// Copy last 16 bytes of KS to the front
	VMOVDQU (4*4)(BX), XTMP1
	VMOVDQU XTMP1, (0*4)(BX)

	RET
@@ -2,15 +2,15 @@

#include "textflag.h"

DATA ·eia_const+0x00(SB)/8, $0x0e060a020c040800 // bit_reverse_table low
DATA ·eia_const+0x08(SB)/8, $0x0f070b030d050901
DATA ·eia_const+0x10(SB)/8, $0xe060a020c0408000 // bit_reverse_table high
DATA ·eia_const+0x18(SB)/8, $0xf070b030d0509010
DATA ·eia_const+0x20(SB)/8, $0xffffffff03020100 // SHUF_MASK_DW0_DW1
DATA ·eia_const+0x28(SB)/8, $0xffffffff07060504
DATA ·eia_const+0x30(SB)/8, $0xffffffff0b0a0908 // SHUF_MASK_DW2_DW3
DATA ·eia_const+0x38(SB)/8, $0xffffffff0f0e0d0c
GLOBL ·eia_const(SB), RODATA, $64
DATA eia_const<>+0x00(SB)/8, $0x0e060a020c040800 // bit_reverse_table low
DATA eia_const<>+0x08(SB)/8, $0x0f070b030d050901
DATA eia_const<>+0x10(SB)/8, $0xe060a020c0408000 // bit_reverse_table high
DATA eia_const<>+0x18(SB)/8, $0xf070b030d0509010
DATA eia_const<>+0x20(SB)/8, $0xffffffff03020100 // SHUF_MASK_DW0_DW1
DATA eia_const<>+0x28(SB)/8, $0xffffffff07060504
DATA eia_const<>+0x30(SB)/8, $0xffffffff0b0a0908 // SHUF_MASK_DW2_DW3
DATA eia_const<>+0x38(SB)/8, $0xffffffff0f0e0d0c
GLOBL eia_const<>(SB), RODATA, $64

#define AX R2
#define BX R3
@@ -36,28 +36,29 @@ GLOBL ·eia_const(SB), RODATA, $64
#define SHUF_MASK_DW2_DW3 V24

#define LOAD_GLOBAL_DATA() \
	MOVD $·eia_const(SB), R0 \
	VLD1 (R0), [BIT_REV_TAB_L.B16, BIT_REV_TAB_H.B16, SHUF_MASK_DW0_DW1.B16, SHUF_MASK_DW2_DW3.B16] \
	MOVW $0x0F0F0F0F, R0 \
	VDUP R0, BIT_REV_AND_TAB.S4
	MOVD $eia_const<>(SB), R0 \
	VLD1 (R0), [BIT_REV_TAB_L.B16, BIT_REV_TAB_H.B16, SHUF_MASK_DW0_DW1.B16, SHUF_MASK_DW2_DW3.B16] \
	MOVW $0x0F0F0F0F, R0 \
	VDUP R0, BIT_REV_AND_TAB.S4

// func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
TEXT ·eia3Round16B(SB),NOSPLIT,$0
#define BIT_REVERSE(XDATA, XTMP1, XTMP2) \
	VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP2.B16 \
	VUSHR $4, XDATA.B16, XTMP1.B16 \
	VTBL XTMP2.B16, [BIT_REV_TAB_H.B16], XTMP2.B16 \
	VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16 \
	VEOR XTMP1.B16, XTMP2.B16, XDATA.B16

// func eiaRoundTag4(t *uint32, keyStream *uint32, p *byte)
TEXT ·eiaRoundTag4(SB),NOSPLIT,$0
	MOVD t+0(FP), AX
	MOVD ks+8(FP), BX
	MOVD p+16(FP), CX
	MOVD tagSize+24(FP), DX

	LOAD_GLOBAL_DATA()

	// Reverse data bytes
	VLD1 (CX), [XDATA.B16]
	VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
	VUSHR $4, XDATA.B16, XTMP1.B16

	VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
	VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
	VEOR XTMP1.B16, XTMP3.B16, XTMP3.B16 // XTMP3 - bit reverse data bytes
	BIT_REVERSE(XDATA, XTMP1, XTMP2)

	// ZUC authentication part, 4x32 data bits
	// setup KS
@@ -72,8 +73,8 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
	VMOV XTMP2.S[0], KS_M1.S[2] // KS bits [127:96 95:64 159:128 127:96]

	// setup DATA
	VTBL SHUF_MASK_DW0_DW1.B16, [XTMP3.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s]
	VTBL SHUF_MASK_DW2_DW3.B16, [XTMP3.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s]
	VTBL SHUF_MASK_DW0_DW1.B16, [XDATA.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s]
	VTBL SHUF_MASK_DW2_DW3.B16, [XDATA.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s]

	// clmul
	// xor the results from 4 32-bit words together
@@ -93,3 +94,169 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
	MOVW R11, (AX)

	RET

// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
	MOVD t+0(FP), AX
	MOVD ks+8(FP), BX
	MOVD p+16(FP), CX

	LOAD_GLOBAL_DATA()

	// Reverse data bytes
	VLD1 (CX), [XDATA.B16]
	BIT_REVERSE(XDATA, XTMP1, XTMP2)

	// ZUC authentication part, 4x32 data bits
	// setup KS
	VLD1 (BX), [XTMP1.B16, XTMP2.B16]
	VST1 [XTMP2.B16], (BX) // Copy last 16 bytes of KS to the front
	// TODO: Any better solution???
	VMOVQ $0x0302010007060504, $0x070605040b0a0908, XTMP4
	VTBL XTMP4.B16, [XTMP1.B16], KS_L.B16 // KS bits [63:32 31:0 95:64 63:32]
	VTBL XTMP4.B16, [XTMP2.B16], KS_M2.B16 // KS bits [191:160 159:128 223:192 191:160]
	VDUP XTMP1.S[3], KS_M1.S4
	VMOV XTMP1.S[2], KS_M1.S[1]
	VMOV XTMP2.S[0], KS_M1.S[2] // KS bits [127:96 95:64 159:128 127:96]

	// setup DATA
	VTBL SHUF_MASK_DW0_DW1.B16, [XDATA.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s]
	VTBL SHUF_MASK_DW2_DW3.B16, [XDATA.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s]

	// clmul
	// xor the results from 4 32-bit words together

	// Calculate lower 32 bits of tag
	VPMULL KS_L.D1, XTMP1.D1, XTMP3.Q1
	VPMULL2 KS_L.D2, XTMP1.D2, XTMP4.Q1
	VPMULL KS_M1.D1, XTMP2.D1, XTMP5.Q1
	VPMULL2 KS_M1.D2, XTMP2.D2, XTMP6.Q1

	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16

	// Move previous result to low 32 bits and XOR with previous digest
	VMOV XTMP3.S[1], XDIGEST.S[0]

	// Prepare data and calculate bits 63-32 of tag
	VEXT $8, KS_L.B16, KS_L.B16, XTMP5.B16
	VPMULL XTMP5.D1, XTMP1.D1, XTMP3.Q1
	VEXT $8, XTMP1.B16, XTMP1.B16, XTMP5.B16
	VPMULL KS_M1.D1, XTMP5.D1, XTMP4.Q1
	VEXT $8, KS_M1.B16, KS_M1.B16, XTMP1.B16
	VPMULL XTMP1.D1, XTMP2.D1, XTMP5.Q1
	VEXT $8, XTMP2.B16, XTMP2.B16, XTMP1.B16
	VPMULL KS_M2.D1, XTMP1.D1, XTMP6.Q1

	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16

	VMOV XTMP3.S[1], XDIGEST.S[1]

	VMOV XDIGEST.D[0], R10
	MOVD (AX), R11
	EOR R10, R11
	MOVD R11, (AX)

	RET
// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
	MOVD t+0(FP), AX
	MOVD ks+8(FP), BX
	MOVD p+16(FP), CX

	LOAD_GLOBAL_DATA()

	// Reverse data bytes
	VLD1 (CX), [XDATA.B16]
	BIT_REVERSE(XDATA, XTMP1, XTMP2)

	// ZUC authentication part, 4x32 data bits
	// setup KS
	VLD1 (BX), [XTMP1.B16, XTMP2.B16]
	VST1 [XTMP2.B16], (BX) // Copy last 16 bytes of KS to the front
	// TODO: Any better solution??? We can use VTBL, but there is no performance improvement if we can't reuse the MASK constant
	VMOVQ $0x0302010007060504, $0x070605040b0a0908, XTMP4
	VTBL XTMP4.B16, [XTMP1.B16], KS_L.B16 // KS bits [63:32 31:0 95:64 63:32]
	VTBL XTMP4.B16, [XTMP2.B16], KS_M2.B16 // KS bits [191:160 159:128 223:192 191:160]
	VMOVQ $0x0b0a09080f0e0d0c, $0x0b0a09080f0e0d0c, XTMP4
	VTBL XTMP4.B16, [XTMP2.B16], KS_H.B16 // KS bits [255:224 223:192 255:224 223:192]
	VDUP XTMP1.S[3], KS_M1.S4
	VMOV XTMP1.S[2], KS_M1.S[1]
	VMOV XTMP2.S[0], KS_M1.S[2] // KS bits [127:96 95:64 159:128 127:96]

	// setup DATA
	VTBL SHUF_MASK_DW0_DW1.B16, [XDATA.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s]
	VTBL SHUF_MASK_DW2_DW3.B16, [XDATA.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s]

	// clmul
	// xor the results from 4 32-bit words together

	// Calculate lower 32 bits of tag
	VPMULL KS_L.D1, XTMP1.D1, XTMP3.Q1
	VPMULL2 KS_L.D2, XTMP1.D2, XTMP4.Q1
	VPMULL KS_M1.D1, XTMP2.D1, XTMP5.Q1
	VPMULL2 KS_M1.D2, XTMP2.D2, XTMP6.Q1

	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16

	// Move previous result to low 32 bits and XOR with previous digest
	VMOV XTMP3.S[1], XDIGEST.S[0]

	// Prepare data and calculate bits 63-32 of tag
	VEXT $8, KS_L.B16, KS_L.B16, XTMP5.B16
	VPMULL XTMP5.D1, XTMP1.D1, XTMP3.Q1
	VEXT $8, XTMP1.B16, XTMP1.B16, XTMP5.B16
	VPMULL KS_M1.D1, XTMP5.D1, XTMP4.Q1
	VEXT $8, KS_M1.B16, KS_M1.B16, XTMP6.B16
	VPMULL XTMP6.D1, XTMP2.D1, XTMP5.Q1
	VEXT $8, XTMP2.B16, XTMP2.B16, KS_L.B16
	VPMULL KS_M2.D1, KS_L.D1, XTMP6.Q1

	// XOR all the products and keep only bits 63-32
	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16

	VMOV XTMP3.S[1], XDIGEST.S[1]

	// Prepare data and calculate bits 95-64 of tag
	VPMULL KS_M1.D1, XTMP1.D1, XTMP3.Q1
	VPMULL2 KS_M1.D2, XTMP1.D2, XTMP4.Q1
	VPMULL KS_M2.D1, XTMP2.D1, XTMP5.Q1
	VPMULL2 KS_M2.D2, XTMP2.D2, XTMP6.Q1

	// XOR all the products and move bits 63-32 to bits 95-64
	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16

	VMOV XTMP3.S[1], XDIGEST.S[2]

	// Prepare data and calculate bits 127-96 of tag
	VEXT $8, KS_M1.B16, KS_M1.B16, XTMP5.B16
	VPMULL XTMP5.D1, XTMP1.D1, XTMP3.Q1
	VEXT $8, XTMP1.B16, XTMP1.B16, XTMP5.B16
	VPMULL KS_M2.D1, XTMP5.D1, XTMP4.Q1
	VEXT $8, KS_M2.B16, KS_M2.B16, XTMP6.B16
	VPMULL XTMP6.D1, XTMP2.D1, XTMP5.Q1
	VEXT $8, XTMP2.B16, XTMP2.B16, KS_L.B16
	VPMULL KS_H.D1, KS_L.D1, XTMP6.Q1

	// XOR all the products and move bits 63-32 to bits 127-96
	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16

	VMOV XTMP3.S[1], XDIGEST.S[3]

	VLD1 (AX), [XTMP1.B16]
	VEOR XTMP1.B16, XDIGEST.B16, XDIGEST.B16
	VST1 [XDIGEST.B16], (AX)

	RET
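
Unlike the amd64 path, which packs results with PSRLDQ/PALIGNR, the arm64 code deposits each finished tag word straight into a digest lane (VMOV Vn.S[1], XDIGEST.S[k]). The scalar bookkeeping, as a sketch with names of our choosing:

// depositLane mirrors VMOV Vn.S[1] -> XDIGEST.S[k]: lane 1 of the VPMULL
// result, i.e. bits 63:32 of the low half of the carry-less product, is
// already the finished 32-bit tag word.
func depositLane(dig *[4]uint32, k int, prodLo uint64) {
	dig[k] = uint32(prodLo >> 32)
}
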
@@ -6,43 +6,53 @@

#include "textflag.h"

DATA ·rcon+0x00(SB)/8, $0x0706050403020100 // Permute for vector doubleword endian swap
DATA ·rcon+0x08(SB)/8, $0x0f0e0d0c0b0a0908
DATA ·rcon+0x10(SB)/8, $0x0008040c020a060e // bit_reverse_table_l
DATA ·rcon+0x18(SB)/8, $0x0109050d030b070f // bit_reverse_table_l
DATA ·rcon+0x20(SB)/8, $0x0000000010111213 // data mask
DATA ·rcon+0x28(SB)/8, $0x0000000014151617 // data mask
DATA ·rcon+0x30(SB)/8, $0x0000000018191a1b // data mask
DATA ·rcon+0x38(SB)/8, $0x000000001c1d1e1f // data mask
DATA ·rcon+0x40(SB)/8, $0x0405060708090a0b // ks mask
DATA ·rcon+0x48(SB)/8, $0x0001020304050607 // ks mask
GLOBL ·rcon(SB), RODATA, $80
DATA eia_const<>+0x00(SB)/8, $0x0706050403020100 // Permute for vector doubleword endian swap
DATA eia_const<>+0x08(SB)/8, $0x0f0e0d0c0b0a0908
DATA eia_const<>+0x10(SB)/8, $0x0008040c020a060e // bit_reverse_table_l
DATA eia_const<>+0x18(SB)/8, $0x0109050d030b070f // bit_reverse_table_l
DATA eia_const<>+0x20(SB)/8, $0x0000000010111213 // data mask
DATA eia_const<>+0x28(SB)/8, $0x0000000014151617 // data mask
DATA eia_const<>+0x30(SB)/8, $0x0000000018191a1b // data mask
DATA eia_const<>+0x38(SB)/8, $0x000000001c1d1e1f // data mask
DATA eia_const<>+0x40(SB)/8, $0x0405060708090a0b // ks mask
DATA eia_const<>+0x48(SB)/8, $0x0001020304050607 // ks mask
GLOBL eia_const<>(SB), RODATA, $80

#define XTMP1 V0
#define XTMP2 V1
#define XTMP3 V2
#define XTMP4 V3
#define XTMP5 V4
#define XTMP6 V5
#define XDATA V6
#define XDIGEST V7
#define KS_L V8
#define KS_M1 V9
#define KS_M2 V10
#define KS_H V11
#define BIT_REV_TAB_L V12
#define BIT_REV_TAB_H V13

#define ZERO V15
#define PTR R7

// func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
TEXT ·eia3Round16B(SB),NOSPLIT,$0
#define BIT_REVERSE(addr, IN, OUT, XTMP) \
	LXVD2X (addr)(R0), BIT_REV_TAB_L \
	VSPLTISB $4, XTMP \
	VSLB BIT_REV_TAB_L, XTMP, BIT_REV_TAB_H \
	VPERMXOR BIT_REV_TAB_L, BIT_REV_TAB_H, IN, OUT

// func eiaRoundTag4(t *uint32, keyStream *uint32, p *byte)
TEXT ·eiaRoundTag4(SB),NOSPLIT,$0
	MOVD t+0(FP), R3
	MOVD ks+8(FP), R4
	MOVD p+16(FP), R5

#ifndef GOARCH_ppc64le
	MOVD $·rcon(SB), PTR // PTR points to rcon addr
	MOVD $eia_const<>(SB), PTR
	LVX (PTR), XTMP1
	ADD $0x10, PTR
#else
	MOVD $·rcon+0x10(SB), PTR // PTR points to rcon addr (skipping permute vector)
	MOVD $eia_const<>+0x10(SB), PTR
#endif

	LXVD2X (R5)(R0), XDATA
@@ -50,10 +60,7 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
	VPERM XDATA, XDATA, XTMP1, XDATA
#endif

	VSPLTISB $4, XTMP2;
	LXVD2X (PTR)(R0), BIT_REV_TAB_L
	VSLB BIT_REV_TAB_L, XTMP2, BIT_REV_TAB_H
	VPERMXOR BIT_REV_TAB_L, BIT_REV_TAB_H, XDATA, XTMP3 // XTMP3 - bit reverse data bytes
	BIT_REVERSE(PTR, XDATA, XTMP3, XTMP2)

	// ZUC authentication part, 4x32 data bits
	// setup data
@@ -95,3 +102,169 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
	STXVD2X XTMP1, (R4)(R0)

	RET

// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
	MOVD t+0(FP), R3
	MOVD ks+8(FP), R4
	MOVD p+16(FP), R5

#ifndef GOARCH_ppc64le
	MOVD $eia_const<>(SB), PTR
	LVX (PTR), XTMP1
	ADD $0x10, PTR
#else
	MOVD $eia_const<>+0x10(SB), PTR
#endif

	LXVD2X (R5)(R0), XDATA
#ifndef GOARCH_ppc64le
	VPERM XDATA, XDATA, XTMP1, XDATA
#endif

	BIT_REVERSE(PTR, XDATA, XTMP3, XTMP2)

	// ZUC authentication part, 4x32 data bits
	// setup data
	VSPLTISB $0, ZERO
	MOVD $0x10, R8
	LXVD2X (PTR)(R8), XTMP4
	VPERM ZERO, XTMP3, XTMP4, XTMP1
	MOVD $0x20, R8
	LXVD2X (PTR)(R8), XTMP4
	VPERM ZERO, XTMP3, XTMP4, XTMP2

	// setup KS
	LXVW4X (R4), KS_L
	MOVD $8, R8
	LXVW4X (R8)(R4), KS_M1
	MOVD $16, R8
	LXVW4X (R8)(R4), KS_M2
	MOVD $0x30, R8
	LXVD2X (PTR)(R8), XTMP4
	VPERM KS_L, KS_L, XTMP4, KS_L
	VPERM KS_M1, KS_M1, XTMP4, KS_M1
	VPERM KS_M2, KS_M2, XTMP4, KS_M2

	// clmul
	// xor the results from 4 32-bit words together
	// Calculate lower 32 bits of tag
	VPMSUMD XTMP1, KS_L, XTMP3
	VPMSUMD XTMP2, KS_M1, XTMP4
	VXOR XTMP3, XTMP4, XTMP3
	VSPLTW $2, XTMP3, XDIGEST

	// Calculate upper 32 bits of tag
	VSLDOI $8, KS_M1, KS_L, KS_L
	VPMSUMD XTMP1, KS_L, XTMP3
	VSLDOI $8, KS_M2, KS_M1, KS_M1
	VPMSUMD XTMP2, KS_M1, XTMP4
	VXOR XTMP3, XTMP4, XTMP3
	VSPLTW $2, XTMP3, XTMP3

	// Update tag
#ifdef GOARCH_ppc64le
	VSLDOI $12, XTMP3, XDIGEST, XDIGEST
#else
	VSLDOI $12, XDIGEST, XTMP3, XDIGEST
#endif
	MFVSRD XDIGEST, R8
	MOVD (R3), R6
	XOR R6, R8, R6
	MOVD R6, (R3)

	// Copy last 16 bytes of KS to the front
	MOVD $16, R8
	LXVD2X (R8)(R4), XTMP1
	STXVD2X XTMP1, (R4)(R0)

	RET
// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
	MOVD t+0(FP), R3
	MOVD ks+8(FP), R4
	MOVD p+16(FP), R5

#ifndef GOARCH_ppc64le
	MOVD $eia_const<>(SB), PTR
	LVX (PTR), XTMP1
	ADD $0x10, PTR
#else
	MOVD $eia_const<>+0x10(SB), PTR
#endif

	LXVD2X (R5)(R0), XDATA
#ifndef GOARCH_ppc64le
	VPERM XDATA, XDATA, XTMP1, XDATA
#endif

	BIT_REVERSE(PTR, XDATA, XTMP3, XTMP2)

	// ZUC authentication part, 4x32 data bits
	// setup data
	VSPLTISB $0, ZERO
	MOVD $0x10, R8
	LXVD2X (PTR)(R8), XTMP4
	VPERM ZERO, XTMP3, XTMP4, XTMP1
	MOVD $0x20, R8
	LXVD2X (PTR)(R8), XTMP4
	VPERM ZERO, XTMP3, XTMP4, XTMP2

	// setup KS
	LXVW4X (R4), KS_L
	MOVD $8, R8
	LXVW4X (R8)(R4), KS_M1
	MOVD $16, R8
	LXVW4X (R8)(R4), KS_M2
	VOR KS_M2, KS_M2, KS_H
	MOVD $0x30, R8
	LXVD2X (PTR)(R8), XTMP4
	VPERM KS_L, KS_L, XTMP4, KS_L
	VPERM KS_M1, KS_M1, XTMP4, KS_M1
	VPERM KS_M2, KS_M2, XTMP4, KS_M2

	// clmul
	// xor the results from 4 32-bit words together
	// Calculate lower 32 bits of tag
	VPMSUMD XTMP1, KS_L, XTMP3
	VPMSUMD XTMP2, KS_M1, XTMP4
	VXOR XTMP3, XTMP4, XTMP3
	VSLDOI $12, XTMP3, XTMP3, XDIGEST

	// Calculate upper 32 bits of tag
	VSLDOI $8, KS_M1, KS_L, KS_L
	VPMSUMD XTMP1, KS_L, XTMP3
	VSLDOI $8, KS_M2, KS_M1, XTMP5
	VPMSUMD XTMP2, XTMP5, XTMP4
	VXOR XTMP3, XTMP4, XTMP3
	VSLDOI $8, XTMP3, XTMP3, XTMP3
	VSLDOI $4, XDIGEST, XTMP3, XDIGEST

	// calculate bits 95-64 of tag
	VPMSUMD XTMP1, KS_M1, XTMP3
	VPMSUMD XTMP2, KS_M2, XTMP4
	VXOR XTMP3, XTMP4, XTMP3
	VSLDOI $8, XTMP3, XTMP3, XTMP3
	VSLDOI $4, XDIGEST, XTMP3, XDIGEST

	// calculate bits 127-96 of tag
	VSLDOI $8, KS_M2, KS_M1, KS_M1
	VPMSUMD XTMP1, KS_M1, XTMP3
	VSLDOI $8, KS_H, KS_M2, KS_M2
	VPMSUMD XTMP2, KS_M2, XTMP4
	VXOR XTMP3, XTMP4, XTMP3
	VSLDOI $8, XTMP3, XTMP3, XTMP3
	VSLDOI $4, XDIGEST, XTMP3, XDIGEST

	// Update tag
	LXVW4X (R3)(R0), XTMP1
	VXOR XTMP1, XDIGEST, XDIGEST
	STXVW4X XDIGEST, (R3)

	// Copy last 16 bytes of KS to the front
	MOVD $16, R8
	LXVD2X (R8)(R4), XTMP1
	STXVD2X XTMP1, (R4)(R0)

	RET
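
Finally, a hypothetical usage sketch of the MAC these rounds accelerate. The constructor name NewHash256 and the key/IV sizes here are assumptions; consult the zuc package documentation for the exact API:

package main

import (
	"fmt"

	"github.com/emmansun/gmsm/zuc"
)

func main() {
	key := make([]byte, 32) // ZUC-256 uses a 256-bit key (assumed size)
	iv := make([]byte, 23)  // IV length per the package's ZUC-256 API (assumed)
	// tagSize 4, 8 or 16 selects eiaRoundTag4, eia256RoundTag8 or eia256RoundTag16
	h, err := zuc.NewHash256(key, iv, 16)
	if err != nil {
		panic(err)
	}
	h.Write([]byte("message to authenticate"))
	fmt.Printf("tag: %x\n", h.Sum(nil))
}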