mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-22 10:16:18 +08:00
zuc: amd64 performance improvement 3
This commit is contained in:
parent
ff4e5e9606
commit
83c6a58365
@ -53,5 +53,5 @@ func (s *zucState32) f32(x0, x1, x2 uint32) uint32 {
|
||||
goarch: amd64
|
||||
pkg: github.com/emmansun/gmsm/zuc
|
||||
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
|
||||
BenchmarkEncrypt1K-6 253791 4621 ns/op 220.52 MB/s
|
||||
BenchmarkEncrypt8K-6 31794 37772 ns/op 216.75 MB/s
|
||||
BenchmarkEncrypt1K-6 409755 2802 ns/op 363.62 MB/s
|
||||
BenchmarkEncrypt8K-6 54120 22413 ns/op 365.28 MB/s
|
||||
|
167
zuc/asm_amd64.s
167
zuc/asm_amd64.s
@ -72,6 +72,11 @@ DATA mask_S1<>+0x00(SB)/8, $0x00ff00ff00ff00ff
|
||||
DATA mask_S1<>+0x08(SB)/8, $0x00ff00ff00ff00ff
|
||||
GLOBL mask_S1<>(SB), RODATA, $16
|
||||
|
||||
// flip_mask is a 16-byte read-only constant for shuffling byte order from LE to BE.
// On little-endian amd64 its bytes are laid out in memory as
// 03 02 01 00 07 06 05 04 0b 0a 09 08 0f 0e 0d 0c, so when used as a
// byte-shuffle control it reverses the byte order inside each 32-bit lane.
// NOTE(review): no reference to flip_mask is visible in these hunks - confirm it is used.
|
||||
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
|
||||
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
|
||||
GLOBL flip_mask<>(SB), RODATA, $16
|
||||
|
||||
#define OFFSET_FR1 (16*4)
|
||||
#define OFFSET_FR2 (17*4)
|
||||
#define OFFSET_BRC_X0 (18*4)
|
||||
@ -429,7 +434,7 @@ avx:
|
||||
XORQ AX, AX \
|
||||
LFSR_UPDT(idx)
|
||||
|
||||
#define ROUND_AVX(idx) \
|
||||
#define ROUND_AVX(idx) \
|
||||
BITS_REORG(idx) \
|
||||
NONLIN_FUN_AVX() \
|
||||
XORL R15, AX \
|
||||
@ -437,6 +442,24 @@ avx:
|
||||
XORQ AX, AX \
|
||||
LFSR_UPDT(idx)
|
||||
|
||||
// ROUND_REV32_SSE(idx) emits one keystream word on the SSE code path and stores
// it byte-swapped. In order it: runs BITS_REORG(idx) and NONLIN_FUN_SSE(), XORs
// R15 into AX, reverses the byte order of AX with BSWAPL, writes the 32-bit
// result to (idx*4)(DI), clears AX, and steps the LFSR with LFSR_UPDT(idx).
// BITS_REORG, NONLIN_FUN_SSE and LFSR_UPDT are defined outside this view;
// presumably AX carries the nonlinear-function output and R15 the X3 word - confirm.
#define ROUND_REV32_SSE(idx) \
|
||||
BITS_REORG(idx) \
|
||||
NONLIN_FUN_SSE() \
|
||||
XORL R15, AX \
|
||||
BSWAPL AX \
|
||||
MOVL AX, (idx*4)(DI) \
|
||||
XORQ AX, AX \
|
||||
LFSR_UPDT(idx)
|
||||
|
||||
// ROUND_REV32_AVX(idx) is the AVX counterpart of ROUND_REV32_SSE: the same
// sequence, but using NONLIN_FUN_AVX() for the nonlinear function. It runs
// BITS_REORG(idx), XORs R15 into AX, byte-swaps AX with BSWAPL, stores the
// 32-bit result at (idx*4)(DI), clears AX, and steps the LFSR with LFSR_UPDT(idx).
// The helper macros are defined outside this view.
#define ROUND_REV32_AVX(idx) \
|
||||
BITS_REORG(idx) \
|
||||
NONLIN_FUN_AVX() \
|
||||
XORL R15, AX \
|
||||
BSWAPL AX \
|
||||
MOVL AX, (idx*4)(DI) \
|
||||
XORQ AX, AX \
|
||||
LFSR_UPDT(idx)
|
||||
|
||||
// func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
|
||||
TEXT ·genKeyStreamAsm(SB),NOSPLIT,$0
|
||||
MOVQ ks+0(FP), DI
|
||||
@ -576,3 +599,145 @@ avxZucRet:
|
||||
SAVE_STATE()
|
||||
VZEROUPPER
|
||||
RET
|
||||
|
||||
// func genKeyStreamRev32Asm(keyStream []byte, pState *zucState32)
//
// Fills keyStream with keystream words, each 32-bit word stored byte-swapped
// (BSWAPL inside ROUND_REV32_SSE / ROUND_REV32_AVX). Only len(keyStream)/4
// whole words are produced; a trailing 1-3 bytes of keyStream are not written.
//
// Register use visible here: DI = output cursor, BP = remaining word count,
// SI = pState pointer. LOAD_STATE, SAVE_STATE and RESTORE_LFSR_n are defined
// outside this view; presumably they move the cipher state between *pState and
// registers and re-align the LFSR after a partial batch - confirm.
//
// NOTE(review): BP is overwritten in a frameless ($0) NOSPLIT function; confirm
// this is acceptable with respect to the frame-pointer register (go vet framepointer).
// NOTE(review): the argument names ks/ks_len differ from the Go declaration's
// parameter name keyStream; go vet's asmdecl check may flag this.
|
||||
TEXT ·genKeyStreamRev32Asm(SB),NOSPLIT,$0
|
||||
MOVQ ks+0(FP), DI
|
||||
MOVQ ks_len+8(FP), BP
|
||||
MOVQ pState+24(FP), SI
|
||||
|
||||
// Convert the byte length into a count of 32-bit words.
SHRQ $2, BP
|
||||
|
||||
LOAD_STATE()
|
||||
|
||||
// Dispatch on the package-level useAVX flag; fall through to the SSE path otherwise.
CMPB ·useAVX(SB), $1
|
||||
JE avxZucSixteens
|
||||
|
||||
// SSE path, main loop: 16 words (64 output bytes) per iteration.
sseZucSixteens:
|
||||
CMPQ BP, $16
|
||||
JB sseZucOctet
|
||||
SUBQ $16, BP
|
||||
ROUND_REV32_SSE(0)
|
||||
ROUND_REV32_SSE(1)
|
||||
ROUND_REV32_SSE(2)
|
||||
ROUND_REV32_SSE(3)
|
||||
ROUND_REV32_SSE(4)
|
||||
ROUND_REV32_SSE(5)
|
||||
ROUND_REV32_SSE(6)
|
||||
ROUND_REV32_SSE(7)
|
||||
ROUND_REV32_SSE(8)
|
||||
ROUND_REV32_SSE(9)
|
||||
ROUND_REV32_SSE(10)
|
||||
ROUND_REV32_SSE(11)
|
||||
ROUND_REV32_SSE(12)
|
||||
ROUND_REV32_SSE(13)
|
||||
ROUND_REV32_SSE(14)
|
||||
ROUND_REV32_SSE(15)
|
||||
LEAQ 64(DI), DI
|
||||
JMP sseZucSixteens
|
||||
|
||||
// SSE tail: fewer than 16 words remain; handle 8, then 4, then 2, then 1.
sseZucOctet:
|
||||
CMPQ BP, $8
|
||||
JB sseZucNibble
|
||||
SUBQ $8, BP
|
||||
ROUND_REV32_SSE(0)
|
||||
ROUND_REV32_SSE(1)
|
||||
ROUND_REV32_SSE(2)
|
||||
ROUND_REV32_SSE(3)
|
||||
ROUND_REV32_SSE(4)
|
||||
ROUND_REV32_SSE(5)
|
||||
ROUND_REV32_SSE(6)
|
||||
ROUND_REV32_SSE(7)
|
||||
LEAQ 32(DI), DI
|
||||
RESTORE_LFSR_8()
|
||||
sseZucNibble:
|
||||
CMPQ BP, $4
|
||||
JB sseZucDouble
|
||||
SUBQ $4, BP
|
||||
ROUND_REV32_SSE(0)
|
||||
ROUND_REV32_SSE(1)
|
||||
ROUND_REV32_SSE(2)
|
||||
ROUND_REV32_SSE(3)
|
||||
LEAQ 16(DI), DI
|
||||
RESTORE_LFSR_4()
|
||||
sseZucDouble:
|
||||
CMPQ BP, $2
|
||||
JB sseZucSingle
|
||||
SUBQ $2, BP
|
||||
ROUND_REV32_SSE(0)
|
||||
ROUND_REV32_SSE(1)
|
||||
LEAQ 8(DI), DI
|
||||
RESTORE_LFSR_2()
|
||||
sseZucSingle:
|
||||
TESTQ BP, BP
|
||||
JE sseZucRet
|
||||
ROUND_REV32_SSE(0)
|
||||
RESTORE_LFSR_0()
|
||||
sseZucRet:
|
||||
SAVE_STATE()
|
||||
RET
|
||||
|
||||
// AVX path: same batching as the SSE path, using ROUND_REV32_AVX.
avxZucSixteens:
|
||||
CMPQ BP, $16
|
||||
JB avxZucOctet
|
||||
SUBQ $16, BP
|
||||
ROUND_REV32_AVX(0)
|
||||
ROUND_REV32_AVX(1)
|
||||
ROUND_REV32_AVX(2)
|
||||
ROUND_REV32_AVX(3)
|
||||
ROUND_REV32_AVX(4)
|
||||
ROUND_REV32_AVX(5)
|
||||
ROUND_REV32_AVX(6)
|
||||
ROUND_REV32_AVX(7)
|
||||
ROUND_REV32_AVX(8)
|
||||
ROUND_REV32_AVX(9)
|
||||
ROUND_REV32_AVX(10)
|
||||
ROUND_REV32_AVX(11)
|
||||
ROUND_REV32_AVX(12)
|
||||
ROUND_REV32_AVX(13)
|
||||
ROUND_REV32_AVX(14)
|
||||
ROUND_REV32_AVX(15)
|
||||
LEAQ 64(DI), DI
|
||||
JMP avxZucSixteens
|
||||
|
||||
// AVX tail: fewer than 16 words remain; handle 8, then 4, then 2, then 1.
avxZucOctet:
|
||||
CMPQ BP, $8
|
||||
JB avxZucNibble
|
||||
SUBQ $8, BP
|
||||
ROUND_REV32_AVX(0)
|
||||
ROUND_REV32_AVX(1)
|
||||
ROUND_REV32_AVX(2)
|
||||
ROUND_REV32_AVX(3)
|
||||
ROUND_REV32_AVX(4)
|
||||
ROUND_REV32_AVX(5)
|
||||
ROUND_REV32_AVX(6)
|
||||
ROUND_REV32_AVX(7)
|
||||
LEAQ 32(DI), DI
|
||||
RESTORE_LFSR_8()
|
||||
avxZucNibble:
|
||||
CMPQ BP, $4
|
||||
JB avxZucDouble
|
||||
SUBQ $4, BP
|
||||
ROUND_REV32_AVX(0)
|
||||
ROUND_REV32_AVX(1)
|
||||
ROUND_REV32_AVX(2)
|
||||
ROUND_REV32_AVX(3)
|
||||
LEAQ 16(DI), DI
|
||||
RESTORE_LFSR_4()
|
||||
avxZucDouble:
|
||||
CMPQ BP, $2
|
||||
JB avxZucSingle
|
||||
SUBQ $2, BP
|
||||
ROUND_REV32_AVX(0)
|
||||
ROUND_REV32_AVX(1)
|
||||
LEAQ 8(DI), DI
|
||||
RESTORE_LFSR_2()
|
||||
avxZucSingle:
|
||||
TESTQ BP, BP
|
||||
JE avxZucRet
|
||||
ROUND_REV32_AVX(0)
|
||||
RESTORE_LFSR_0()
|
||||
avxZucRet:
|
||||
SAVE_STATE()
|
||||
// VZEROUPPER zeroes the upper halves of the YMM registers before returning.
VZEROUPPER
|
||||
RET
|
||||
|
@ -14,4 +14,4 @@ func genKeyword(s *zucState32) uint32 {
|
||||
z := s.x3 ^ s.f32()
|
||||
s.enterWorkMode()
|
||||
return z
|
||||
}
|
||||
}
|
||||
|
20
zuc/eea.go
20
zuc/eea.go
@ -8,7 +8,7 @@ import (
|
||||
"github.com/emmansun/gmsm/internal/xor"
|
||||
)
|
||||
|
||||
const RoundWords = 16
|
||||
const RoundWords = 32
|
||||
|
||||
// NewCipher creates a stream cipher based on the key and iv arguments.
|
||||
func NewCipher(key, iv []byte) (cipher.Stream, error) {
|
||||
@ -25,13 +25,7 @@ func NewEEACipher(key []byte, count, bearer, direction uint32) (cipher.Stream, e
|
||||
return newZUCState(key, iv)
|
||||
}
|
||||
|
||||
func (c *zucState32) XORKeyStream(dst, src []byte) {
|
||||
if len(dst) < len(src) {
|
||||
panic("zuc: output smaller than input")
|
||||
}
|
||||
if subtle.InexactOverlap(dst[:len(src)], src) {
|
||||
panic("zuc: invalid buffer overlap")
|
||||
}
|
||||
func genericXorKeyStream(c *zucState32, dst, src []byte) {
|
||||
words := (len(src) + 3) / 4
|
||||
rounds := words / RoundWords
|
||||
var keyWords [RoundWords]uint32
|
||||
@ -53,3 +47,13 @@ func (c *zucState32) XORKeyStream(dst, src []byte) {
|
||||
xor.XorBytes(dst, src, keyBytes[:])
|
||||
}
|
||||
}
|
||||
|
||||
func (c *zucState32) XORKeyStream(dst, src []byte) {
|
||||
if len(dst) < len(src) {
|
||||
panic("zuc: output smaller than input")
|
||||
}
|
||||
if subtle.InexactOverlap(dst[:len(src)], src) {
|
||||
panic("zuc: invalid buffer overlap")
|
||||
}
|
||||
xorKeyStream(c, dst, src)
|
||||
}
|
||||
|
31
zuc/eea_asm.go
Normal file
31
zuc/eea_asm.go
Normal file
@ -0,0 +1,31 @@
|
||||
//go:build (amd64 && !generic)
|
||||
// +build amd64,!generic
|
||||
|
||||
package zuc
|
||||
|
||||
import (
|
||||
"github.com/emmansun/gmsm/internal/xor"
|
||||
)
|
||||
|
||||
// genKeyStreamRev32Asm is the Go declaration of the assembly routine of the
// same name. Per the assembly in this change, it writes len(keyStream)/4
// keystream words into keyStream, byte-swapping each 32-bit word before the
// store, and saves the state back through pState; a trailing 1-3 bytes of
// keyStream are left untouched.
//
//go:noescape
|
||||
func genKeyStreamRev32Asm(keyStream []byte, pState *zucState32)
|
||||
|
||||
func xorKeyStream(c *zucState32, dst, src []byte) {
|
||||
if supportsAES {
|
||||
words := len(src) / 4
|
||||
// handle complete words first
|
||||
if words > 0 {
|
||||
dstWords := dst[:words*4]
|
||||
genKeyStreamRev32Asm(dstWords, c)
|
||||
xor.XorBytes(dst, src, dstWords)
|
||||
}
|
||||
// handle remain bytes
|
||||
if words*4 < len(src) {
|
||||
var singleWord [4]byte
|
||||
genKeyStreamRev32Asm(singleWord[:], c)
|
||||
xor.XorBytes(dst[words*4:], src[words*4:], singleWord[:])
|
||||
}
|
||||
} else {
|
||||
genericXorKeyStream(c, dst, src)
|
||||
}
|
||||
}
|
8
zuc/eea_generic.go
Normal file
8
zuc/eea_generic.go
Normal file
@ -0,0 +1,8 @@
|
||||
//go:build !amd64 || generic
|
||||
// +build !amd64 generic
|
||||
|
||||
package zuc
|
||||
|
||||
// xorKeyStream is the variant compiled when the amd64 assembly is not used
// (build constraint: !amd64 || generic). It forwards unchanged to
// genericXorKeyStream.
func xorKeyStream(c *zucState32, dst, src []byte) {
|
||||
genericXorKeyStream(c, dst, src)
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user