zuc: amd64 performance improvement 3

This commit is contained in:
Sun Yimin 2022-07-01 11:00:42 +08:00 committed by GitHub
parent ff4e5e9606
commit 83c6a58365
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 220 additions and 12 deletions

View File

@ -53,5 +53,5 @@ func (s *zucState32) f32(x0, x1, x2 uint32) uint32 {
goarch: amd64
pkg: github.com/emmansun/gmsm/zuc
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
BenchmarkEncrypt1K-6 253791 4621 ns/op 220.52 MB/s
BenchmarkEncrypt8K-6 31794 37772 ns/op 216.75 MB/s
BenchmarkEncrypt1K-6 409755 2802 ns/op 363.62 MB/s
BenchmarkEncrypt8K-6 54120 22413 ns/op 365.28 MB/s

View File

@ -72,6 +72,11 @@ DATA mask_S1<>+0x00(SB)/8, $0x00ff00ff00ff00ff
DATA mask_S1<>+0x08(SB)/8, $0x00ff00ff00ff00ff
GLOBL mask_S1<>(SB), RODATA, $16
// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), RODATA, $16
#define OFFSET_FR1 (16*4)
#define OFFSET_FR2 (17*4)
#define OFFSET_BRC_X0 (18*4)
@ -429,7 +434,7 @@ avx:
XORQ AX, AX \
LFSR_UPDT(idx)
#define ROUND_AVX(idx) \
#define ROUND_AVX(idx) \
BITS_REORG(idx) \
NONLIN_FUN_AVX() \
XORL R15, AX \
@ -437,6 +442,24 @@ avx:
XORQ AX, AX \
LFSR_UPDT(idx)
// ROUND_REV32_SSE(idx) produces one 32-bit keystream word on the SSE path and
// stores it byte-swapped at offset idx*4 from DI.
// Sequence: BITS_REORG(idx), NONLIN_FUN_SSE(), XOR R15 into AX, BSWAPL AX
// (reverse the byte order of the word), store AX to (idx*4)(DI), clear AX,
// then LFSR_UPDT(idx).
// NOTE(review): presumably AX holds the non-linear function output and R15 the
// X3 word from bit reorganization - confirm against BITS_REORG/NONLIN_FUN_SSE.
#define ROUND_REV32_SSE(idx) \
BITS_REORG(idx) \
NONLIN_FUN_SSE() \
XORL R15, AX \
BSWAPL AX \
MOVL AX, (idx*4)(DI) \
XORQ AX, AX \
LFSR_UPDT(idx)
// ROUND_REV32_AVX(idx) produces one 32-bit keystream word on the AVX path and
// stores it byte-swapped at offset idx*4 from DI.
// Same sequence as ROUND_REV32_SSE, except that NONLIN_FUN_AVX() is used:
// BITS_REORG(idx), NONLIN_FUN_AVX(), XOR R15 into AX, BSWAPL AX (reverse the
// byte order of the word), store AX to (idx*4)(DI), clear AX, LFSR_UPDT(idx).
// NOTE(review): presumably AX holds the non-linear function output and R15 the
// X3 word from bit reorganization - confirm against BITS_REORG/NONLIN_FUN_AVX.
#define ROUND_REV32_AVX(idx) \
BITS_REORG(idx) \
NONLIN_FUN_AVX() \
XORL R15, AX \
BSWAPL AX \
MOVL AX, (idx*4)(DI) \
XORQ AX, AX \
LFSR_UPDT(idx)
// func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
TEXT ·genKeyStreamAsm(SB),NOSPLIT,$0
MOVQ ks+0(FP), DI
@ -576,3 +599,145 @@ avxZucRet:
SAVE_STATE()
VZEROUPPER
RET
// func genKeyStreamRev32Asm(keyStream []byte, pState *zucState32)
//
// Writes len(keyStream)/4 32-bit keystream words into keyStream. Every word is
// byte-swapped (BSWAPL inside ROUND_REV32_SSE / ROUND_REV32_AVX) before being
// stored. Any trailing len(keyStream)%4 bytes are not written, because the
// byte length is shifted right by 2 to obtain the word count.
//
// Register usage visible in this function:
//   DI - write cursor into keyStream
//   BP - number of 32-bit words still to generate
//   SI - pState
//
// The words are produced in batches of 16, then 8, 4, 2 and finally 1, so any
// word count is handled with fully unrolled rounds. After each partial batch a
// RESTORE_LFSR_n() macro is invoked.
// NOTE(review): RESTORE_LFSR_n presumably puts the LFSR registers back into
// canonical order after fewer than 16 rounds - confirm against its definition.
// NOTE(review): BP is used as a general-purpose counter while the frame size
// is $0, so the assembler inserts no BP save/restore - confirm this is
// acceptable for frame-pointer based unwinding/profiling.
// NOTE(review): the argument is referenced as ks/ks_len while the Go
// declaration names it keyStream - confirm vet (asmdecl) is happy with this.
TEXT ·genKeyStreamRev32Asm(SB),NOSPLIT,$0
// Load the slice base pointer, the slice length (in bytes) and the state pointer.
MOVQ ks+0(FP), DI
MOVQ ks_len+8(FP), BP
MOVQ pState+24(FP), SI
// Convert the byte length into a count of 32-bit words.
SHRQ $2, BP
// Presumably loads the cipher state referenced by SI into registers.
LOAD_STATE()
// Take the AVX code path when useAVX is 1, otherwise fall through to SSE.
CMPB ·useAVX(SB), $1
JE avxZucSixteens
// SSE path: loop while at least 16 words remain.
sseZucSixteens:
CMPQ BP, $16
JB sseZucOctet
SUBQ $16, BP
ROUND_REV32_SSE(0)
ROUND_REV32_SSE(1)
ROUND_REV32_SSE(2)
ROUND_REV32_SSE(3)
ROUND_REV32_SSE(4)
ROUND_REV32_SSE(5)
ROUND_REV32_SSE(6)
ROUND_REV32_SSE(7)
ROUND_REV32_SSE(8)
ROUND_REV32_SSE(9)
ROUND_REV32_SSE(10)
ROUND_REV32_SSE(11)
ROUND_REV32_SSE(12)
ROUND_REV32_SSE(13)
ROUND_REV32_SSE(14)
ROUND_REV32_SSE(15)
// Advance the output cursor past the 16 words (64 bytes) just written.
LEAQ 64(DI), DI
JMP sseZucSixteens
// SSE path: one batch of 8 words if at least 8 remain.
sseZucOctet:
CMPQ BP, $8
JB sseZucNibble
SUBQ $8, BP
ROUND_REV32_SSE(0)
ROUND_REV32_SSE(1)
ROUND_REV32_SSE(2)
ROUND_REV32_SSE(3)
ROUND_REV32_SSE(4)
ROUND_REV32_SSE(5)
ROUND_REV32_SSE(6)
ROUND_REV32_SSE(7)
LEAQ 32(DI), DI
RESTORE_LFSR_8()
// SSE path: one batch of 4 words if at least 4 remain.
sseZucNibble:
CMPQ BP, $4
JB sseZucDouble
SUBQ $4, BP
ROUND_REV32_SSE(0)
ROUND_REV32_SSE(1)
ROUND_REV32_SSE(2)
ROUND_REV32_SSE(3)
LEAQ 16(DI), DI
RESTORE_LFSR_4()
// SSE path: one batch of 2 words if at least 2 remain.
sseZucDouble:
CMPQ BP, $2
JB sseZucSingle
SUBQ $2, BP
ROUND_REV32_SSE(0)
ROUND_REV32_SSE(1)
LEAQ 8(DI), DI
RESTORE_LFSR_2()
// SSE path: a final single word if one remains. DI is not advanced because
// nothing is written after this word.
sseZucSingle:
TESTQ BP, BP
JE sseZucRet
ROUND_REV32_SSE(0)
RESTORE_LFSR_0()
// Presumably writes the updated cipher state back through SI.
sseZucRet:
SAVE_STATE()
RET
// AVX path: same batching as the SSE path, using ROUND_REV32_AVX.
avxZucSixteens:
CMPQ BP, $16
JB avxZucOctet
SUBQ $16, BP
ROUND_REV32_AVX(0)
ROUND_REV32_AVX(1)
ROUND_REV32_AVX(2)
ROUND_REV32_AVX(3)
ROUND_REV32_AVX(4)
ROUND_REV32_AVX(5)
ROUND_REV32_AVX(6)
ROUND_REV32_AVX(7)
ROUND_REV32_AVX(8)
ROUND_REV32_AVX(9)
ROUND_REV32_AVX(10)
ROUND_REV32_AVX(11)
ROUND_REV32_AVX(12)
ROUND_REV32_AVX(13)
ROUND_REV32_AVX(14)
ROUND_REV32_AVX(15)
// Advance the output cursor past the 16 words (64 bytes) just written.
LEAQ 64(DI), DI
JMP avxZucSixteens
// AVX path: one batch of 8 words if at least 8 remain.
avxZucOctet:
CMPQ BP, $8
JB avxZucNibble
SUBQ $8, BP
ROUND_REV32_AVX(0)
ROUND_REV32_AVX(1)
ROUND_REV32_AVX(2)
ROUND_REV32_AVX(3)
ROUND_REV32_AVX(4)
ROUND_REV32_AVX(5)
ROUND_REV32_AVX(6)
ROUND_REV32_AVX(7)
LEAQ 32(DI), DI
RESTORE_LFSR_8()
// AVX path: one batch of 4 words if at least 4 remain.
avxZucNibble:
CMPQ BP, $4
JB avxZucDouble
SUBQ $4, BP
ROUND_REV32_AVX(0)
ROUND_REV32_AVX(1)
ROUND_REV32_AVX(2)
ROUND_REV32_AVX(3)
LEAQ 16(DI), DI
RESTORE_LFSR_4()
// AVX path: one batch of 2 words if at least 2 remain.
avxZucDouble:
CMPQ BP, $2
JB avxZucSingle
SUBQ $2, BP
ROUND_REV32_AVX(0)
ROUND_REV32_AVX(1)
LEAQ 8(DI), DI
RESTORE_LFSR_2()
// AVX path: a final single word if one remains. DI is not advanced because
// nothing is written after this word.
avxZucSingle:
TESTQ BP, BP
JE avxZucRet
ROUND_REV32_AVX(0)
RESTORE_LFSR_0()
// Presumably writes the updated cipher state back through SI; VZEROUPPER then
// zeroes the upper halves of the YMM registers before returning to Go code.
avxZucRet:
SAVE_STATE()
VZEROUPPER
RET

View File

@ -14,4 +14,4 @@ func genKeyword(s *zucState32) uint32 {
z := s.x3 ^ s.f32()
s.enterWorkMode()
return z
}
}

View File

@ -8,7 +8,7 @@ import (
"github.com/emmansun/gmsm/internal/xor"
)
const RoundWords = 16
const RoundWords = 32
// NewCipher creates a stream cipher based on the key and iv arguments.
func NewCipher(key, iv []byte) (cipher.Stream, error) {
@ -25,13 +25,7 @@ func NewEEACipher(key []byte, count, bearer, direction uint32) (cipher.Stream, e
return newZUCState(key, iv)
}
func (c *zucState32) XORKeyStream(dst, src []byte) {
if len(dst) < len(src) {
panic("zuc: output smaller than input")
}
if subtle.InexactOverlap(dst[:len(src)], src) {
panic("zuc: invalid buffer overlap")
}
func genericXorKeyStream(c *zucState32, dst, src []byte) {
words := (len(src) + 3) / 4
rounds := words / RoundWords
var keyWords [RoundWords]uint32
@ -53,3 +47,13 @@ func (c *zucState32) XORKeyStream(dst, src []byte) {
xor.XorBytes(dst, src, keyBytes[:])
}
}
// XORKeyStream XORs each byte of src with the cipher's keystream and writes
// the result to dst.
//
// It panics if dst is shorter than src, or if dst[:len(src)] and src overlap
// without being exactly aligned. The actual work is delegated to the
// build-specific xorKeyStream.
func (c *zucState32) XORKeyStream(dst, src []byte) {
	n := len(src)
	switch {
	case len(dst) < n:
		panic("zuc: output smaller than input")
	case subtle.InexactOverlap(dst[:n], src):
		panic("zuc: invalid buffer overlap")
	}
	xorKeyStream(c, dst, src)
}

31
zuc/eea_asm.go Normal file
View File

@ -0,0 +1,31 @@
//go:build (amd64 && !generic)
// +build amd64,!generic
package zuc
import (
"github.com/emmansun/gmsm/internal/xor"
)
// genKeyStreamRev32Asm has no Go body; it is implemented in assembly.
// It generates len(keyStream)/4 32-bit keystream words and stores each word
// byte-swapped into keyStream. Any trailing len(keyStream)%4 bytes are not
// written. The assembly presumably loads the cipher state from pState and
// saves the advanced state back before returning - confirm against
// LOAD_STATE/SAVE_STATE.
//
//go:noescape
func genKeyStreamRev32Asm(keyStream []byte, pState *zucState32)
// xorKeyStream XORs src with the keystream generated from c and writes the
// result to dst. The caller must guarantee len(dst) >= len(src).
//
// When supportsAES is false the work is delegated to genericXorKeyStream.
// Otherwise the keystream is produced by genKeyStreamRev32Asm, always in whole
// 32-bit words: ceil(len(src)/4) words are consumed in total, so a final
// partial word discards its unused keystream bytes.
//
// The keystream is generated into a stack scratch buffer rather than into dst.
// dst and src may be the very same memory (in-place encryption); writing the
// keystream into dst first would overwrite the input and the subsequent XOR
// would yield ks ^ ks = 0 instead of src ^ ks.
func xorKeyStream(c *zucState32, dst, src []byte) {
	if !supportsAES {
		genericXorKeyStream(c, dst, src)
		return
	}

	// Scratch size in bytes; must be a multiple of 4 (whole keystream words).
	const chunkBytes = 512
	var keyBytes [chunkBytes]byte

	// Full chunks.
	for len(src) >= chunkBytes {
		genKeyStreamRev32Asm(keyBytes[:], c)
		xor.XorBytes(dst, src[:chunkBytes], keyBytes[:])
		dst = dst[chunkBytes:]
		src = src[chunkBytes:]
	}

	// Tail: round the remaining length up to a whole number of words so the
	// last, possibly partial, word is covered as well.
	if len(src) > 0 {
		n := (len(src) + 3) / 4 * 4
		genKeyStreamRev32Asm(keyBytes[:n], c)
		xor.XorBytes(dst, src, keyBytes[:n])
	}
}

8
zuc/eea_generic.go Normal file
View File

@ -0,0 +1,8 @@
//go:build !amd64 || generic
// +build !amd64 generic
package zuc
// xorKeyStream is the variant compiled when the build is not amd64 or the
// generic build tag is set (see the build constraint of this file). It simply
// forwards to genericXorKeyStream.
func xorKeyStream(c *zucState32, dst, src []byte) {
genericXorKeyStream(c, dst, src)
}