mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-26 20:26:19 +08:00
zuc: EIA performance improvement
This commit is contained in:
parent
87f6f6a736
commit
39274df2bd
@ -48,10 +48,18 @@ func (s *zucState32) f32(x0, x1, x2 uint32) uint32 {
|
||||
fmt.Println()
|
||||
```
|
||||
|
||||
## Performance with AMD64 SIMD & AESNI:
|
||||
## EEA Performance with AMD64 SIMD & AESNI:
|
||||
goos: windows
|
||||
goarch: amd64
|
||||
pkg: github.com/emmansun/gmsm/zuc
|
||||
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
|
||||
BenchmarkEncrypt1K-6 409755 2802 ns/op 363.62 MB/s
|
||||
BenchmarkEncrypt8K-6 54120 22413 ns/op 365.28 MB/s
|
||||
|
||||
## EIA Performance with AMD64 SIMD & AESNI & CLMUL:
|
||||
goos: windows
|
||||
goarch: amd64
|
||||
pkg: github.com/emmansun/gmsm/zuc
|
||||
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
|
||||
BenchmarkHash1K-6 317750 3833 ns/op 267.13 MB/s
|
||||
BenchmarkHash8K-6 40460 28921 ns/op 283.26 MB/s
|
||||
|
16
zuc/eia.go
16
zuc/eia.go
@ -1,7 +1,5 @@
|
||||
package zuc
|
||||
|
||||
// For reference only: there is no performance advantage here, because the block size (chunk) is only 4 bytes.
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
@ -29,6 +27,7 @@ func NewHash(key, iv []byte) (*ZUC128Mac, error) {
|
||||
ivLen := len(iv)
|
||||
mac := &ZUC128Mac{}
|
||||
mac.tagSize = 4
|
||||
|
||||
switch k {
|
||||
default:
|
||||
return nil, fmt.Errorf("zuc/eia: invalid key size %d, expect 16 in bytes", k)
|
||||
@ -38,6 +37,7 @@ func NewHash(key, iv []byte) (*ZUC128Mac, error) {
|
||||
}
|
||||
mac.loadKeyIV16(key, iv)
|
||||
}
|
||||
|
||||
// initialization
|
||||
for i := 0; i < 32; i++ {
|
||||
mac.bitReorganization()
|
||||
@ -89,10 +89,10 @@ func (m *ZUC128Mac) Reset() {
|
||||
m.r1 = m.initState.r1
|
||||
m.r2 = m.initState.r2
|
||||
copy(m.lfsr[:], m.initState.lfsr[:])
|
||||
m.genKeywords(m.k0[:4])
|
||||
m.genKeywords(m.k0[:len(m.k0)/2])
|
||||
}
|
||||
|
||||
func (m *ZUC128Mac) block(p []byte) {
|
||||
func blockGeneric(m *ZUC128Mac, p []byte) {
|
||||
var k64, t64 uint64
|
||||
t64 = uint64(m.t) << 32
|
||||
for len(p) >= chunk {
|
||||
@ -121,14 +121,14 @@ func (m *ZUC128Mac) Write(p []byte) (nn int, err error) {
|
||||
n := copy(m.x[m.nx:], p)
|
||||
m.nx += n
|
||||
if m.nx == chunk {
|
||||
m.block(m.x[:])
|
||||
block(m, m.x[:])
|
||||
m.nx = 0
|
||||
}
|
||||
p = p[n:]
|
||||
}
|
||||
if len(p) >= chunk {
|
||||
n := len(p) &^ (chunk - 1)
|
||||
m.block(p[:n])
|
||||
block(m, p[:n])
|
||||
p = p[n:]
|
||||
}
|
||||
if len(p) > 0 {
|
||||
@ -139,7 +139,7 @@ func (m *ZUC128Mac) Write(p []byte) (nn int, err error) {
|
||||
|
||||
func (m *ZUC128Mac) checkSum(additionalBits int, b byte) [4]byte {
|
||||
if m.nx >= chunk {
|
||||
panic("m.nx >= 16")
|
||||
panic("m.nx >= chunk")
|
||||
}
|
||||
kIdx := 0
|
||||
if m.nx > 0 || additionalBits > 0 {
|
||||
@ -147,7 +147,7 @@ func (m *ZUC128Mac) checkSum(additionalBits int, b byte) [4]byte {
|
||||
t64 = uint64(m.t) << 32
|
||||
m.x[m.nx] = b
|
||||
nRemainBits := 8*m.nx + additionalBits
|
||||
if nRemainBits > 64 {
|
||||
if nRemainBits > 2*32 {
|
||||
m.genKeywords(m.k0[4:6])
|
||||
}
|
||||
words := (nRemainBits + 31) / 32
|
||||
|
24
zuc/eia_asm.go
Normal file
24
zuc/eia_asm.go
Normal file
@ -0,0 +1,24 @@
|
||||
//go:build (amd64 && !generic)
|
||||
// +build amd64,!generic
|
||||
|
||||
package zuc
|
||||
|
||||
import "golang.org/x/sys/cpu"
|
||||
|
||||
// supportsGFMUL gates the carry-less-multiply assembly path in block; when it
// is false, block falls back to blockGeneric.
// NOTE(review): this file only builds for amd64 per its build constraint, so
// the cpu.ARM64.HasPMULL term appears to have no effect here — confirm.
var supportsGFMUL = cpu.X86.HasPCLMULQDQ || cpu.ARM64.HasPMULL
|
||||
// useAVX2 is read directly by the assembly in eia_asm_amd64.s
// (CMPB ·useAVX2(SB), $1) to choose the VEX-encoded code path over the
// legacy SSE one.
var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
|
||||
|
||||
// eia3Round16B is implemented in assembly (eia_asm_amd64.s). It reads 16
// bytes at p and 32 bytes (eight uint32 words) of key stream at keyStream,
// XORs the resulting 32-bit value into *t, and then copies key stream bytes
// 16..31 over bytes 0..15. The current assembly loads tagSize but does not
// otherwise use it.
//
//go:noescape
|
||||
func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
|
||||
|
||||
func block(m *ZUC128Mac, p []byte) {
|
||||
if supportsGFMUL {
|
||||
for len(p) >= chunk {
|
||||
m.genKeywords(m.k0[4:])
|
||||
eia3Round16B(&m.t, &m.k0[0], &p[0], m.tagSize)
|
||||
p = p[chunk:]
|
||||
}
|
||||
} else {
|
||||
blockGeneric(m, p)
|
||||
}
|
||||
}
|
153
zuc/eia_asm_amd64.s
Normal file
153
zuc/eia_asm_amd64.s
Normal file
@ -0,0 +1,153 @@
|
||||
// Referenced https://github.com/intel/intel-ipsec-mb/
|
||||
//go:build amd64 && !generic
|
||||
// +build amd64,!generic
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// bit_reverse_table_l: PSHUFB lookup table; entry i holds the 4-bit
// bit-reversal of i in the low nibble (0x0, 0x8, 0x4, 0xc, ...).
DATA bit_reverse_table_l<>+0x00(SB)/8, $0x0e060a020c040800
|
||||
DATA bit_reverse_table_l<>+0x08(SB)/8, $0x0f070b030d050901
|
||||
GLOBL bit_reverse_table_l<>(SB), RODATA, $16
|
||||
|
||||
// bit_reverse_table_h: PSHUFB lookup table; entry i holds the 4-bit
// bit-reversal of i placed in the high nibble (0x00, 0x80, 0x40, 0xc0, ...).
DATA bit_reverse_table_h<>+0x00(SB)/8, $0xe060a020c0408000
|
||||
DATA bit_reverse_table_h<>+0x08(SB)/8, $0xf070b030d0509010
|
||||
GLOBL bit_reverse_table_h<>(SB), RODATA, $16
|
||||
|
||||
// bit_reverse_and_table: 0x0f in every byte; used to separate the low and
// high nibble of each data byte before the table lookups.
DATA bit_reverse_and_table<>+0x00(SB)/8, $0x0f0f0f0f0f0f0f0f
|
||||
DATA bit_reverse_and_table<>+0x08(SB)/8, $0x0f0f0f0f0f0f0f0f
|
||||
GLOBL bit_reverse_and_table<>(SB), RODATA, $16
|
||||
|
||||
// shuf_mask_dw0_0_dw1_0: PSHUFB mask that places source dword 0 in the low
// half of the low qword and source dword 1 in the low half of the high qword;
// the 0xff selector bytes zero the remaining bytes.
DATA shuf_mask_dw0_0_dw1_0<>+0x00(SB)/8, $0xffffffff03020100
|
||||
DATA shuf_mask_dw0_0_dw1_0<>+0x08(SB)/8, $0xffffffff07060504
|
||||
GLOBL shuf_mask_dw0_0_dw1_0<>(SB), RODATA, $16
|
||||
|
||||
// shuf_mask_dw2_0_dw3_0: PSHUFB mask that places source dword 2 in the low
// half of the low qword and source dword 3 in the low half of the high qword;
// the 0xff selector bytes zero the remaining bytes.
DATA shuf_mask_dw2_0_dw3_0<>+0x00(SB)/8, $0xffffffff0f0e0d0c
|
||||
DATA shuf_mask_dw2_0_dw3_0<>+0x08(SB)/8, $0xffffffff0f0e0d0c
|
||||
GLOBL shuf_mask_dw2_0_dw3_0<>(SB), RODATA, $16
|
||||
|
||||
#define XTMP1 X1
|
||||
#define XTMP2 X2
|
||||
#define XTMP3 X3
|
||||
#define XTMP4 X4
|
||||
#define XTMP5 X5
|
||||
#define XTMP6 X6
|
||||
#define XDATA X7
|
||||
#define XDIGEST X8
|
||||
#define KS_L X9
|
||||
#define KS_M1 X10
|
||||
#define KS_M2 X11
|
||||
#define KS_H X12
|
||||
|
||||
// func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
//
// Processes one 16-byte message block:
//   1. reverses the bits inside each of the 16 data bytes (nibble split +
//      PSHUFB table lookups),
//   2. carry-less multiplies each 32-bit data word with the matching 64-bit
//      key stream window and XORs the four products together,
//   3. XORs bits 63:32 of the combined product into *t,
//   4. copies key stream bytes 16..31 to bytes 0..15 for the next round.
//
// Reads 16 bytes at p and 32 bytes at keyStream; writes 4 bytes at t and
// 16 bytes at keyStream. tagSize is loaded into DX but is not used further.
// The VEX-encoded path is taken when the Go variable useAVX2 is true.
TEXT ·eia3Round16B(SB),NOSPLIT,$0
	// Argument names must match the Go prototype, otherwise go vet's
	// asmdecl check rejects the reference.
	MOVQ t+0(FP), AX
	MOVQ keyStream+8(FP), BX
	MOVQ p+16(FP), CX
	MOVQ tagSize+24(FP), DX

	CMPB ·useAVX2(SB), $1
	JE avx2

	// Reverse the bits within each data byte
	MOVUPS (0)(CX), XDATA
	MOVOU bit_reverse_and_table<>(SB), XTMP4
	MOVOU XDATA, XTMP2
	PAND XTMP4, XTMP2 // XTMP2 - low nibble of every byte

	PANDN XDATA, XTMP4 // XTMP4 = ^XTMP4 & XDATA - high nibble of every byte
	PSRLQ $4, XTMP4 // move the high nibbles down to index the table

	MOVOU bit_reverse_table_h<>(SB), XTMP3
	PSHUFB XTMP2, XTMP3

	MOVOU bit_reverse_table_l<>(SB), XTMP1
	PSHUFB XTMP4, XTMP1

	PXOR XTMP1, XTMP3 // XTMP3 - bit reverse data bytes

	// ZUC authentication part, 4x32 data bits
	// setup KS
	MOVUPS (0*4)(BX), XTMP1
	MOVUPS (2*4)(BX), XTMP2
	PSHUFD $0x61, XTMP1, KS_L // KS bits [63:32 31:0 95:64 63:32]
	PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]

	// setup DATA
	MOVOU XTMP3, XTMP1
	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
	MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]

	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP3
	MOVOU XTMP3, XDIGEST // XDIGEST/XTMP3 - Data bits [95:64 0s 127:96 0s]

	// clmul
	// xor the results from 4 32-bit words together
	// Calculate lower 32 bits of tag
	PCLMULQDQ $0x00, KS_L, XTMP1
	PCLMULQDQ $0x11, KS_L, XTMP2
	PCLMULQDQ $0x00, KS_M1, XDIGEST
	PCLMULQDQ $0x11, KS_M1, XTMP3

	// XOR all products and move 32-bits to lower 32 bits
	PXOR XTMP1, XTMP2
	PXOR XTMP3, XDIGEST
	PXOR XTMP2, XDIGEST
	PSRLDQ $4, XDIGEST

	// Update tag
	MOVL XDIGEST, R10
	XORL R10, (AX)

	// Copy last 16 bytes of KS to the front
	MOVUPS (4*4)(BX), XTMP1
	MOVUPS XTMP1, (0*4)(BX)

	RET

avx2:
	VMOVDQU (0)(CX), XDATA

	// Reverse the bits within each data byte
	VMOVDQU bit_reverse_and_table<>(SB), XTMP1
	VPAND XTMP1, XDATA, XTMP2 // XTMP2 - low nibble of every byte
	VPANDN XDATA, XTMP1, XTMP3 // XTMP3 = ^XTMP1 & XDATA - high nibble of every byte
	VPSRLD $4, XTMP3, XTMP3

	VMOVDQU bit_reverse_table_h<>(SB), XTMP1
	VPSHUFB XTMP2, XTMP1, XTMP4
	VMOVDQU bit_reverse_table_l<>(SB), XTMP1
	VPSHUFB XTMP3, XTMP1, XTMP1
	VPOR XTMP1, XTMP4, XTMP4 // XTMP4 - bit reverse data bytes

	// ZUC authentication part, 4x32 data bits
	// setup KS
	VPSHUFD $0x61, (0*4)(BX), KS_L // KS bits [63:32 31:0 95:64 63:32]
	VPSHUFD $0x61, (2*4)(BX), KS_M1 // KS bits [127:96 95:64 159:128 127:96]

	// setup DATA
	// Data bits [31:0 0s 63:32 0s]
	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP4, XTMP1
	// Data bits [95:64 0s 127:96 0s]
	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP4, XTMP2

	// clmul
	// xor the results from 4 32-bit words together
	// Calculate lower 32 bits of tag
	VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
	VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6

	VPXOR XTMP3, XTMP4, XTMP3
	VPXOR XTMP5, XTMP6, XTMP5
	VPXOR XTMP3, XTMP5, XDIGEST

	// Update tag with bits 63:32 of the combined product
	VMOVQ XDIGEST, R10
	SHRQ $32, R10
	XORL R10, (AX)

	// Copy last 16 bytes of KS to the front
	VMOVDQU (4*4)(BX), XTMP1
	VMOVDQU XTMP1, (0*4)(BX)

	VZEROUPPER
	RET
|
8
zuc/eia_generic.go
Normal file
8
zuc/eia_generic.go
Normal file
@ -0,0 +1,8 @@
|
||||
//go:build !amd64 || generic
|
||||
// +build !amd64 generic
|
||||
|
||||
package zuc
|
||||
|
||||
func block(m *ZUC128Mac, p []byte) {
|
||||
blockGeneric(m, p)
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user