// Referenced Intel(R) Multi-Buffer Crypto for IPsec
// https://github.com/intel/intel-ipsec-mb/
//go:build !purego

#include "textflag.h"

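// PSHUFB lookup tables for a nibble-wise bit reversal: bit_reverse_table_l
// maps a 4-bit index to its bit-reversed value in the low nibble,
// bit_reverse_table_h produces the same value shifted into the high nibble,
// and bit_reverse_and_table is the 0x0f mask used to split each byte into nibbles.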
DATA bit_reverse_table_l<>+0x00(SB)/8, $0x0e060a020c040800
DATA bit_reverse_table_l<>+0x08(SB)/8, $0x0f070b030d050901
GLOBL bit_reverse_table_l<>(SB), RODATA, $16

DATA bit_reverse_table_h<>+0x00(SB)/8, $0xe060a020c0408000
DATA bit_reverse_table_h<>+0x08(SB)/8, $0xf070b030d0509010
GLOBL bit_reverse_table_h<>(SB), RODATA, $16

DATA bit_reverse_and_table<>+0x00(SB)/8, $0x0f0f0f0f0f0f0f0f
DATA bit_reverse_and_table<>+0x08(SB)/8, $0x0f0f0f0f0f0f0f0f
GLOBL bit_reverse_and_table<>(SB), RODATA, $16

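// PSHUFB masks that zero-extend data dwords 0/1 and 2/3 into the two
// 64-bit lanes of an XMM register, ready for use as PCLMULQDQ operands.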
DATA shuf_mask_dw0_0_dw1_0<>+0x00(SB)/8, $0xffffffff03020100
DATA shuf_mask_dw0_0_dw1_0<>+0x08(SB)/8, $0xffffffff07060504
GLOBL shuf_mask_dw0_0_dw1_0<>(SB), RODATA, $16

DATA shuf_mask_dw2_0_dw3_0<>+0x00(SB)/8, $0xffffffff0b0a0908
DATA shuf_mask_dw2_0_dw3_0<>+0x08(SB)/8, $0xffffffff0f0e0d0c
GLOBL shuf_mask_dw2_0_dw3_0<>(SB), RODATA, $16

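// Register aliases. KS_L, KS_M1, KS_M2 and KS_H hold overlapping 64-bit
// keystream windows (built with PSHUFD below), XDATA holds the bit-reversed
// input block and XDIGEST ends up holding the tag contribution.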
#define XTMP1 X1
#define XTMP2 X2
#define XTMP3 X3
#define XTMP4 X4
#define XTMP5 X5
#define XTMP6 X6
#define XDATA X7
#define XDIGEST X8
#define KS_L X9
#define KS_M1 X10
#define KS_M2 X11
#define KS_H X12

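// BIT_REVERSE_SSE reverses the bits of every byte in XDATA: the low and
// high nibbles are looked up separately in the tables above and recombined.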
#define BIT_REVERSE_SSE(XDATA, XTMP1, XTMP2) \
	MOVOU bit_reverse_and_table<>(SB), XTMP1; \
	MOVOU XDATA, XTMP2;                       \
	PAND XTMP1, XTMP2;                        \
	PANDN XDATA, XTMP1;                       \
	PSRLQ $4, XTMP1;                          \
	MOVOU bit_reverse_table_h<>(SB), XDATA;   \
	PSHUFB XTMP2, XDATA;                      \
	MOVOU bit_reverse_table_l<>(SB), XTMP2;   \
	PSHUFB XTMP1, XTMP2;                      \
	PXOR XTMP2, XDATA

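// BIT_REVERSE_AVX is the VEX-encoded equivalent of BIT_REVERSE_SSE.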
#define BIT_REVERSE_AVX(XDATA, XTMP1, XTMP2) \
	VMOVDQU bit_reverse_and_table<>(SB), XTMP1; \
	VPAND XTMP1, XDATA, XTMP2;                  \
	VPANDN XDATA, XTMP1, XTMP1;                 \
	VPSRLD $4, XTMP1, XTMP1;                    \
	VMOVDQU bit_reverse_table_h<>(SB), XDATA;   \
	VPSHUFB XTMP2, XDATA, XDATA;                \
	VMOVDQU bit_reverse_table_l<>(SB), XTMP2;   \
	VPSHUFB XTMP1, XTMP2, XTMP1;                \
	VPOR XTMP1, XDATA, XDATA

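// eiaRoundTag4 folds one 16-byte block into a 32-bit ZUC authentication tag:
// the data bits are reversed, carry-less multiplied against 64-bit keystream
// windows, and bits 63:32 of the XORed products are XORed into *t. The first
// 16 bytes of keystream are then consumed by copying keystream words 4..7
// down to words 0..3.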
// func eiaRoundTag4(t *uint32, keyStream *uint32, p *byte)
TEXT ·eiaRoundTag4(SB),NOSPLIT,$0
	MOVQ t+0(FP), AX
	MOVQ ks+8(FP), BX
	MOVQ p+16(FP), CX

	CMPB ·useAVX(SB), $1
	JE avx

	// Reverse data bytes
	MOVUPS (0)(CX), XDATA
	BIT_REVERSE_SSE(XDATA, XTMP1, XTMP2)

	// ZUC authentication part, 4x32 data bits
	// setup KS
	MOVUPS (0*4)(BX), XTMP1
	MOVUPS (2*4)(BX), XTMP2
	PSHUFD $0x61, XTMP1, KS_L  // KS bits [63:32 31:0 95:64 63:32]
	PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]

	// setup DATA
	MOVOU XDATA, XTMP1
	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
	MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]

	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA
	MOVOU XDATA, XDIGEST // XDIGEST/XDATA - Data bits [95:64 0s 127:96 0s]

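	// Each 32-bit data word is carry-less multiplied against a 64-bit
	// keystream window; bits 63:32 of the XOR of the four products are the
	// block's contribution to the 32-bit tag.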
	// clmul
	// xor the results from 4 32-bit words together
	// Calculate lower 32 bits of tag
	PCLMULQDQ $0x00, KS_L, XTMP1
	PCLMULQDQ $0x11, KS_L, XTMP2
	PCLMULQDQ $0x00, KS_M1, XDIGEST
	PCLMULQDQ $0x11, KS_M1, XDATA

	// XOR all products and move bits 63-32 to the lower 32 bits
	PXOR XTMP1, XTMP2
	PXOR XDATA, XDIGEST
	PXOR XTMP2, XDIGEST
	PSRLDQ $4, XDIGEST

	// Update tag
	MOVL XDIGEST, R10
	XORL R10, (AX)

	// Copy last 16 bytes of KS to the front
	MOVUPS (4*4)(BX), XTMP1
	MOVUPS XTMP1, (0*4)(BX)

	RET

avx:
	VMOVDQU (0)(CX), XDATA
	// Reverse data bytes
	BIT_REVERSE_AVX(XDATA, XTMP1, XTMP2)

	// ZUC authentication part, 4x32 data bits
	// setup KS
	VPSHUFD $0x61, (0*4)(BX), KS_L  // KS bits [63:32 31:0 95:64 63:32]
	VPSHUFD $0x61, (2*4)(BX), KS_M1 // KS bits [127:96 95:64 159:128 127:96]

	// setup DATA
	// Data bytes [31:0 0s 63:32 0s]
	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XDATA, XTMP1
	// Data bytes [95:64 0s 127:96 0s]
	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA, XTMP2

	// clmul
	// xor the results from 4 32-bit words together
	// Calculate lower 32 bits of tag
	VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
	VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6

	VPXOR XTMP3, XTMP4, XTMP3
	VPXOR XTMP5, XTMP6, XTMP5
	VPXOR XTMP3, XTMP5, XDIGEST
	VPSRLDQ $4, XDIGEST, XDIGEST

	// Update tag
	MOVL XDIGEST, R10
	XORL R10, (AX)

	// Copy last 16 bytes of KS to the front
	VMOVDQU (4*4)(BX), XTMP1
	VMOVDQU XTMP1, (0*4)(BX)

	RET

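// eia256RoundTag8 is the 64-bit-tag variant for ZUC-256: it computes two
// 32-bit tag words from one 16-byte block, the second word using keystream
// windows advanced by a further 32 bits, and XORs the 64-bit result into *t.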
// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
	MOVQ t+0(FP), AX
	MOVQ ks+8(FP), BX
	MOVQ p+16(FP), CX

	CMPB ·useAVX(SB), $1
	JE avx

	// Reverse data bytes
	MOVUPS (0)(CX), XDATA
	BIT_REVERSE_SSE(XDATA, XTMP1, XTMP2)

	// ZUC authentication part, 4x32 data bits
	// setup KS
	MOVUPS (0*4)(BX), XTMP1
	MOVUPS (2*4)(BX), XTMP2
	MOVUPS (4*4)(BX), XTMP4
	PSHUFD $0x61, XTMP1, KS_L  // KS bits [63:32 31:0 95:64 63:32]
	PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]
	PSHUFD $0x61, XTMP4, KS_M2 // KS bits [191:160 159:128 223:192 191:160]

	// setup DATA
	MOVOU XDATA, XTMP1
	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
	MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]

	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA
	MOVOU XDATA, XDIGEST // XDIGEST/XDATA - Data bits [95:64 0s 127:96 0s]

	// clmul
	// xor the results from 4 32-bit words together
	// Save data for following products
	MOVOU XTMP2, XTMP5 // Data bits [31:0 0s 63:32 0s]
	MOVOU XDATA, XTMP6 // Data bits [95:64 0s 127:96 0s]

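	// The low tag word is computed the same way as in eiaRoundTag4; the high
	// tag word repeats the computation with every keystream window advanced
	// by 32 bits (the $0x10/$0x01 selectors and KS_M2 below).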
	// Calculate lower 32 bits of tag
	PCLMULQDQ $0x00, KS_L, XTMP1
	PCLMULQDQ $0x11, KS_L, XTMP2
	PCLMULQDQ $0x00, KS_M1, XDIGEST
	PCLMULQDQ $0x11, KS_M1, XDATA

	// XOR all products and move bits 63-32 to bits 127-96
	PXOR XTMP1, XTMP2
	PXOR XDATA, XDIGEST
	PXOR XTMP2, XDIGEST
	PSLLDQ $8, XDIGEST // Move bits 63-32 to bits 127-96

	// Prepare data and calculate bits 63-32 of tag
	MOVOU XTMP5, XTMP1
	MOVOU XTMP5, XTMP2
	MOVOU XTMP6, XTMP3
	MOVOU XTMP6, XTMP4

	PCLMULQDQ $0x10, KS_L, XTMP1
	PCLMULQDQ $0x01, KS_M1, XTMP2
	PCLMULQDQ $0x10, KS_M1, XTMP3
	PCLMULQDQ $0x01, KS_M2, XTMP4

	// XOR all the products and keep only bits 63-32
	PXOR XTMP2, XTMP1
	PXOR XTMP4, XTMP3
	PXOR XTMP3, XTMP1
	PSRLDQ $4, XTMP1 // Move bits 63-32 to bits 31-0

	PALIGNR $12, XDIGEST, XTMP1 // XTMP1 || XDIGEST

	// Update tag
	MOVQ XTMP1, R10
	XORQ R10, (AX)

	// Copy last 16 bytes of KS to the front
	MOVUPS (4*4)(BX), XTMP1
	MOVUPS XTMP1, (0*4)(BX)

	RET

avx:
	VMOVDQU (0)(CX), XDATA

	// Reverse data bytes
	BIT_REVERSE_AVX(XDATA, XTMP1, XTMP2)

	// ZUC authentication part, 4x32 data bits
	// setup KS
	VPSHUFD $0x61, (0*4)(BX), KS_L  // KS bits [63:32 31:0 95:64 63:32]
	VPSHUFD $0x61, (2*4)(BX), KS_M1 // KS bits [127:96 95:64 159:128 127:96]
	VPSHUFD $0x61, (4*4)(BX), KS_M2 // KS bits [191:160 159:128 223:192 191:160]

	// setup DATA
	// Data bytes [31:0 0s 63:32 0s]
	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XDATA, XTMP1
	// Data bytes [95:64 0s 127:96 0s]
	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA, XTMP2

	// clmul
	// xor the results from 4 32-bit words together
	// Calculate lower 32 bits of tag
	VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
	VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6

	VPXOR XTMP3, XTMP4, XTMP3
	VPXOR XTMP5, XTMP6, XTMP5
	VPXOR XTMP3, XTMP5, XTMP3
	VPSLLDQ $8, XTMP3, XDIGEST

	VPCLMULQDQ $0x10, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x01, KS_M1, XTMP1, XTMP4
	VPCLMULQDQ $0x10, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x01, KS_M2, XTMP2, XTMP6

	// XOR all the products and keep only bits 63-32
	VPXOR XTMP4, XTMP3, XTMP3
	VPXOR XTMP6, XTMP5, XTMP5
	VPXOR XTMP5, XTMP3, XTMP3
	VPSRLDQ $4, XTMP3, XTMP3

	VPALIGNR $12, XDIGEST, XTMP3, XDIGEST

	// Update tag
	VMOVQ XDIGEST, R10
	XORQ R10, (AX)

	// Copy last 16 bytes of KS to the front
	VMOVDQU (4*4)(BX), XTMP1
	VMOVDQU XTMP1, (0*4)(BX)

	RET

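// eia256RoundTag16 is the 128-bit-tag variant for ZUC-256: four 32-bit tag
// words are computed from one 16-byte block, each word using keystream
// windows advanced by a further 32 bits, and the 128-bit result is XORed
// into the tag at *t.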
// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
	MOVQ t+0(FP), AX
	MOVQ ks+8(FP), BX
	MOVQ p+16(FP), CX

	CMPB ·useAVX(SB), $1
	JE avx

	// Reverse data bytes
	MOVUPS (0)(CX), XDATA
	BIT_REVERSE_SSE(XDATA, XTMP1, XTMP2)

	// ZUC authentication part, 4x32 data bits
	// setup KS
	MOVUPS (0*4)(BX), XTMP1
	MOVUPS (2*4)(BX), XTMP2
	MOVUPS (4*4)(BX), XTMP4
	PSHUFD $0x61, XTMP1, KS_L  // KS bits [63:32 31:0 95:64 63:32]
	PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]
	PSHUFD $0x61, XTMP4, KS_M2 // KS bits [191:160 159:128 223:192 191:160]
	PSHUFD $0xBB, XTMP4, KS_H  // KS bits [255:224 223:192 255:224 223:192]

	// setup DATA
	MOVOU XDATA, XTMP1
	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
	MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]

	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA
	MOVOU XDATA, XDIGEST // XDIGEST/XDATA - Data bits [95:64 0s 127:96 0s]

	// clmul
	// xor the results from 4 32-bit words together
	// Save data for following products
	MOVOU XTMP2, XTMP5 // Data bits [31:0 0s 63:32 0s]
	MOVOU XDATA, XTMP6 // Data bits [95:64 0s 127:96 0s]

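	// The four tag words are computed in turn, each using keystream windows
	// advanced by a further 32 bits (KS_L -> KS_M1 -> KS_M2 -> KS_H with
	// alternating PCLMULQDQ selectors); each finished word is shifted into
	// the accumulator (PSLLDQ / PALIGNR $4) so the final register holds tag
	// word 0 in its lowest dword.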
	// Calculate lower 32 bits of tag
	PCLMULQDQ $0x00, KS_L, XTMP1
	PCLMULQDQ $0x11, KS_L, XTMP2
	PCLMULQDQ $0x00, KS_M1, XDIGEST
	PCLMULQDQ $0x11, KS_M1, XDATA

	// XOR all products and move bits 63-32 to bits 127-96
	PXOR XTMP1, XTMP2
	PXOR XDATA, XDIGEST
	PXOR XTMP2, XDIGEST
	PSLLDQ $8, XDIGEST // Move bits 63-32 to bits 127-96

	// Prepare data and calculate bits 63-32 of tag
	MOVOU XTMP5, XTMP1
	MOVOU XTMP5, XTMP2
	MOVOU XTMP6, XTMP3
	MOVOU XTMP6, XTMP4

	PCLMULQDQ $0x10, KS_L, XTMP1
	PCLMULQDQ $0x01, KS_M1, XTMP2
	PCLMULQDQ $0x10, KS_M1, XTMP3
	PCLMULQDQ $0x01, KS_M2, XTMP4

	// XOR all the products and keep only bits 63-32
	PXOR XTMP2, XTMP1
	PXOR XTMP4, XTMP3
	PXOR XTMP3, XTMP1
	PSRLDQ $4, XTMP1 // Move bits 63-32 to bits 31-0
	PALIGNR $4, XDIGEST, XTMP1 // XTMP1 || XDIGEST

	// Prepare data and calculate bits 95-64 of tag
	MOVOU XTMP5, XDIGEST
	MOVOU XTMP5, XTMP2
	MOVOU XTMP6, XTMP3
	MOVOU XTMP6, XTMP4

	PCLMULQDQ $0x00, KS_M1, XDIGEST
	PCLMULQDQ $0x11, KS_M1, XTMP2
	PCLMULQDQ $0x00, KS_M2, XTMP3
	PCLMULQDQ $0x11, KS_M2, XTMP4

	// XOR all the products and move bits 63-32 to bits 95-64
	PXOR XTMP2, XDIGEST
	PXOR XTMP4, XTMP3
	PXOR XTMP3, XDIGEST
	PSRLDQ $4, XDIGEST // Move bits 63-32 to bits 31-0
	PALIGNR $4, XTMP1, XDIGEST // XDIGEST || XTMP1

	// Prepare data and calculate bits 127-96 of tag
	MOVOU XTMP5, XTMP1
	MOVOU XTMP5, XTMP2
	MOVOU XTMP6, XTMP3
	MOVOU XTMP6, XTMP4

	PCLMULQDQ $0x10, KS_M1, XTMP1
	PCLMULQDQ $0x01, KS_M2, XTMP2
	PCLMULQDQ $0x10, KS_M2, XTMP3
	PCLMULQDQ $0x01, KS_H, XTMP4

	// XOR all the products and move bits 63-32 to bits 127-96
	PXOR XTMP2, XTMP1
	PXOR XTMP4, XTMP3
	PXOR XTMP3, XTMP1
	PSRLDQ $4, XTMP1 // Move bits 63-32 to bits 31-0
	PALIGNR $4, XDIGEST, XTMP1 // XTMP1 || XDIGEST

	// Update tag
	MOVUPS (AX), XDIGEST
	PXOR XTMP1, XDIGEST
	MOVUPS XDIGEST, (AX)

	// Copy last 16 bytes of KS to the front
	MOVUPS (4*4)(BX), XTMP1
	MOVUPS XTMP1, (0*4)(BX)

	RET

avx:
	VMOVDQU (0)(CX), XDATA

	// Reverse data bytes
	BIT_REVERSE_AVX(XDATA, XTMP1, XTMP2)

	// ZUC authentication part, 4x32 data bits
	// setup KS
	VPSHUFD $0x61, (0*4)(BX), KS_L  // KS bits [63:32 31:0 95:64 63:32]
	VPSHUFD $0x61, (2*4)(BX), KS_M1 // KS bits [127:96 95:64 159:128 127:96]
	VPSHUFD $0x61, (4*4)(BX), KS_M2 // KS bits [191:160 159:128 223:192 191:160]
	VPSHUFD $0xBB, (4*4)(BX), KS_H  // KS bits [255:224 223:192 255:224 223:192]

	// setup DATA
	// Data bytes [31:0 0s 63:32 0s]
	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XDATA, XTMP1
	// Data bytes [95:64 0s 127:96 0s]
	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XDATA, XTMP2

	// clmul
	// xor the results from 4 32-bit words together
	// Calculate lower 32 bits of tag
	VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
	VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6

	VPXOR XTMP3, XTMP4, XTMP3
	VPXOR XTMP5, XTMP6, XTMP5
	VPXOR XTMP3, XTMP5, XTMP3
	VPSLLDQ $8, XTMP3, XDIGEST

	VPCLMULQDQ $0x10, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x01, KS_M1, XTMP1, XTMP4
	VPCLMULQDQ $0x10, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x01, KS_M2, XTMP2, XTMP6

	// XOR all the products and keep only bits 63-32
	VPXOR XTMP4, XTMP3, XTMP3
	VPXOR XTMP6, XTMP5, XTMP5
	VPXOR XTMP5, XTMP3, XTMP3
	VPSRLDQ $4, XTMP3, XTMP3
	VPALIGNR $4, XDIGEST, XTMP3, XDIGEST

	// Prepare data and calculate bits 95-64 of tag
	VPCLMULQDQ $0x00, KS_M1, XTMP1, XTMP3
	VPCLMULQDQ $0x11, KS_M1, XTMP1, XTMP4
	VPCLMULQDQ $0x00, KS_M2, XTMP2, XTMP5
	VPCLMULQDQ $0x11, KS_M2, XTMP2, XTMP6

	// XOR all the products and move bits 63-32 to bits 95-64
	VPXOR XTMP4, XTMP3, XTMP3
	VPXOR XTMP6, XTMP5, XTMP5
	VPXOR XTMP5, XTMP3, XTMP3
	VPSRLDQ $4, XTMP3, XTMP3
	VPALIGNR $4, XDIGEST, XTMP3, XDIGEST

	// Prepare data and calculate bits 127-96 of tag
	VPCLMULQDQ $0x10, KS_M1, XTMP1, XTMP3
	VPCLMULQDQ $0x01, KS_M2, XTMP1, XTMP4
	VPCLMULQDQ $0x10, KS_M2, XTMP2, XTMP5
	VPCLMULQDQ $0x01, KS_H, XTMP2, XTMP6

	// XOR all the products and move bits 63-32 to bits 127-96
	VPXOR XTMP4, XTMP3, XTMP3
	VPXOR XTMP6, XTMP5, XTMP5
	VPXOR XTMP5, XTMP3, XTMP3
	VPSRLDQ $4, XTMP3, XTMP3
	VPALIGNR $4, XDIGEST, XTMP3, XDIGEST

	// Update tag
	VPXOR (AX), XDIGEST, XDIGEST
	// Unaligned store: the tag at *t is not guaranteed to be 16-byte aligned
	VMOVDQU XDIGEST, (AX)

	// Copy last 16 bytes of KS to the front
	VMOVDQU (4*4)(BX), XTMP1
	VMOVDQU XTMP1, (0*4)(BX)

	RET