zuc: amd64 optimization step 2

This commit is contained in:
Sun Yimin 2022-06-30 11:29:42 +08:00 committed by GitHub
parent f7a55494c8
commit bd2543cdf9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 227 additions and 20 deletions

View File

@ -313,6 +313,7 @@ GLOBL mask_S1<>(SB), RODATA, $16
MOVUPS (36)(SI), X2 \
MOVQ (52)(SI), BX \
MOVL (60)(SI), CX \
\
MOVUPS X0, (SI) \
MOVUPS X1, (16)(SI) \
MOVUPS X2, (32)(SI) \
@ -320,6 +321,41 @@ GLOBL mask_S1<>(SB), RODATA, $16
MOVL CX, (56)(SI) \
MOVL AX, (60)(SI)
// RESTORE_LFSR_2 rotates the 64 bytes at (SI) left by 8 bytes (two 32-bit
// words): bytes 8..63 are moved down to offsets 0..55 and the original first
// 8 bytes are written back at offset 56. Everything is loaded into registers
// before anything is stored, so the overlapping source/destination ranges
// are safe.
// genKeyStreamAsm invokes it after its two-round tail.
// Presumably the 64 bytes are the sixteen-word LFSR and the rotation puts the
// words back in logical order after two updates -- confirm against LFSR_UPDT.
// Clobbers AX, BX, X0, X1 and X2.
#define RESTORE_LFSR_2() \
MOVQ (0)(SI), AX \
MOVUPS (8)(SI), X0 \
MOVUPS (24)(SI), X1 \
MOVUPS (40)(SI), X2 \
MOVQ (56)(SI), BX \
\
MOVUPS X0, (SI) \
MOVUPS X1, (16)(SI) \
MOVUPS X2, (32)(SI) \
MOVQ BX, (48)(SI) \
MOVQ AX, (56)(SI)
// RESTORE_LFSR_4 rotates the 64 bytes at (SI) left by 16 bytes (four 32-bit
// words): the four 16-byte chunks are loaded into X0..X3 and stored back in
// the order X1, X2, X3, X0.
// genKeyStreamAsm invokes it after its four-round tail.
// Presumably the 64 bytes are the sixteen-word LFSR -- confirm against
// LFSR_UPDT.
// Clobbers X0, X1, X2 and X3.
#define RESTORE_LFSR_4() \
MOVUPS (0)(SI), X0 \
MOVUPS (16)(SI), X1 \
MOVUPS (32)(SI), X2 \
MOVUPS (48)(SI), X3 \
\
MOVUPS X1, (0)(SI) \
MOVUPS X2, (16)(SI) \
MOVUPS X3, (32)(SI) \
MOVUPS X0, (48)(SI)
// RESTORE_LFSR_8 rotates the 64 bytes at (SI) by 32 bytes (eight 32-bit
// words), i.e. it swaps the lower and upper 32-byte halves: the four 16-byte
// chunks are loaded into X0..X3 and stored back in the order X2, X3, X0, X1.
// genKeyStreamAsm invokes it after its eight-round tail.
// Presumably the 64 bytes are the sixteen-word LFSR -- confirm against
// LFSR_UPDT.
// Clobbers X0, X1, X2 and X3.
#define RESTORE_LFSR_8() \
MOVUPS (0)(SI), X0 \
MOVUPS (16)(SI), X1 \
MOVUPS (32)(SI), X2 \
MOVUPS (48)(SI), X3 \
\
MOVUPS X2, (0)(SI) \
MOVUPS X3, (16)(SI) \
MOVUPS X0, (32)(SI) \
MOVUPS X1, (48)(SI)
#define NONLIN_FUN_AVX() \
NONLIN_FUN() \
VMOVQ DX, X0 \
@ -334,6 +370,14 @@ GLOBL mask_S1<>(SB), RODATA, $16
MOVL X0, R10 \ // F_R1
VPEXTRD $1, X0, R11
// LOAD_STATE loads six 32-bit fields of the state pointed to by SI into
// registers:
//   OFFSET_FR1    -> R10
//   OFFSET_FR2    -> R11
//   OFFSET_BRC_X0 -> R12
//   OFFSET_BRC_X1 -> R13
//   OFFSET_BRC_X2 -> R14
//   OFFSET_BRC_X3 -> R15
// SI must already hold the state pointer. The OFFSET_* constants are defined
// elsewhere in this file.
#define LOAD_STATE() \
MOVL OFFSET_FR1(SI), R10 \
MOVL OFFSET_FR2(SI), R11 \
MOVL OFFSET_BRC_X0(SI), R12 \
MOVL OFFSET_BRC_X1(SI), R13 \
MOVL OFFSET_BRC_X2(SI), R14 \
MOVL OFFSET_BRC_X3(SI), R15
#define SAVE_STATE() \
MOVL R10, OFFSET_FR1(SI) \
MOVL R11, OFFSET_FR2(SI) \
@ -346,13 +390,7 @@ GLOBL mask_S1<>(SB), RODATA, $16
TEXT ·genKeywordAsm(SB),NOSPLIT,$0
MOVQ pState+0(FP), SI
MOVL OFFSET_FR1(SI), R10
MOVL OFFSET_FR2(SI), R11
MOVL OFFSET_BRC_X0(SI), R12
MOVL OFFSET_BRC_X1(SI), R13
MOVL OFFSET_BRC_X2(SI), R14
MOVL OFFSET_BRC_X3(SI), R15
LOAD_STATE()
BITS_REORG(0)
CMPB ·useAVX(SB), $1
@ -382,3 +420,159 @@ avx:
VZEROUPPER
RET
// ROUND_SSE(idx) produces one 32-bit key word on the SSE path:
//   1. BITS_REORG(idx) and NONLIN_FUN_SSE() are expanded in that order;
//   2. R15 is XORed into AX and the low 32 bits of AX are stored at byte
//      offset idx*4 from DI;
//   3. AX is cleared and LFSR_UPDT(idx) is expanded.
// Presumably AX holds the nonlinear-function output on entry to step 2 and
// R15 holds BRC_X3 -- confirm against NONLIN_FUN and BITS_REORG.
// idx selects both the output slot and the argument passed to BITS_REORG /
// LFSR_UPDT; callers in genKeyStreamAsm use consecutive values starting at 0.
#define ROUND_SSE(idx) \
BITS_REORG(idx) \
NONLIN_FUN_SSE() \
XORL R15, AX \
MOVL AX, (idx*4)(DI) \
XORQ AX, AX \
LFSR_UPDT(idx)
// ROUND_AVX(idx) produces one 32-bit key word on the AVX path. It is the same
// sequence as ROUND_SSE except that NONLIN_FUN_AVX() is expanded instead of
// NONLIN_FUN_SSE():
//   1. BITS_REORG(idx) and NONLIN_FUN_AVX() are expanded in that order;
//   2. R15 is XORed into AX and the low 32 bits of AX are stored at byte
//      offset idx*4 from DI;
//   3. AX is cleared and LFSR_UPDT(idx) is expanded.
// Presumably AX holds the nonlinear-function output on entry to step 2 and
// R15 holds BRC_X3 -- confirm against NONLIN_FUN and BITS_REORG.
#define ROUND_AVX(idx) \
BITS_REORG(idx) \
NONLIN_FUN_AVX() \
XORL R15, AX \
MOVL AX, (idx*4)(DI) \
XORQ AX, AX \
LFSR_UPDT(idx)
// func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
//
// genKeyStreamAsm writes len(keyStream) 32-bit key words into keyStream, one
// ROUND_SSE / ROUND_AVX expansion per word, and stores the working registers
// back into *pState with SAVE_STATE before returning.
//
// Register use visible in this function:
//   DI  write cursor into keyStream (advanced after every batch)
//   BP  number of words still to produce
//   SI  pState
//
// Words are produced in batches: as many batches of 16 as fit, then at most
// one batch each of 8, 4, 2 and 1. After a partial batch of n rounds the
// matching RESTORE_LFSR_n macro is expanded; no restore follows a batch of
// 16 (presumably because sixteen updates leave the sixteen-word LFSR in its
// original alignment -- confirm against LFSR_UPDT).
//
// The SSE or AVX path is selected once, from ·useAVX. The AVX path issues
// VZEROUPPER before returning.
//
// Argument references use the names from the Go declaration (keyStream_base,
// keyStream_len, pState) so that go vet's asmdecl check can match them.
//
// NOTE(review): BP is overwritten in a $0-frame function; go vet's
// framepointer check reports this pattern. Moving the counter to another
// register requires knowing which registers BITS_REORG / NONLIN_FUN /
// LFSR_UPDT use -- verify before changing.
TEXT ·genKeyStreamAsm(SB),NOSPLIT,$0
	MOVQ keyStream_base+0(FP), DI
	MOVQ keyStream_len+8(FP), BP
	MOVQ pState+24(FP), SI

	LOAD_STATE()

	CMPB ·useAVX(SB), $1
	JE avxZucSixteens

	// ---- SSE path ----
sseZucSixteens:
	// Full batches of 16 words; loops until fewer than 16 remain.
	CMPQ BP, $16
	JB sseZucOctet
	SUBQ $16, BP
	ROUND_SSE(0)
	ROUND_SSE(1)
	ROUND_SSE(2)
	ROUND_SSE(3)
	ROUND_SSE(4)
	ROUND_SSE(5)
	ROUND_SSE(6)
	ROUND_SSE(7)
	ROUND_SSE(8)
	ROUND_SSE(9)
	ROUND_SSE(10)
	ROUND_SSE(11)
	ROUND_SSE(12)
	ROUND_SSE(13)
	ROUND_SSE(14)
	ROUND_SSE(15)
	LEAQ 64(DI), DI
	JMP sseZucSixteens

sseZucOctet:
	// At most one batch of 8 words.
	CMPQ BP, $8
	JB sseZucNibble
	SUBQ $8, BP
	ROUND_SSE(0)
	ROUND_SSE(1)
	ROUND_SSE(2)
	ROUND_SSE(3)
	ROUND_SSE(4)
	ROUND_SSE(5)
	ROUND_SSE(6)
	ROUND_SSE(7)
	LEAQ 32(DI), DI
	RESTORE_LFSR_8()

sseZucNibble:
	// At most one batch of 4 words.
	CMPQ BP, $4
	JB sseZucDouble
	SUBQ $4, BP
	ROUND_SSE(0)
	ROUND_SSE(1)
	ROUND_SSE(2)
	ROUND_SSE(3)
	LEAQ 16(DI), DI
	RESTORE_LFSR_4()

sseZucDouble:
	// At most one batch of 2 words.
	CMPQ BP, $2
	JB sseZucSingle
	SUBQ $2, BP
	ROUND_SSE(0)
	ROUND_SSE(1)
	LEAQ 8(DI), DI
	RESTORE_LFSR_2()

sseZucSingle:
	// Final single word, if any remains.
	TESTQ BP, BP
	JE sseZucRet
	ROUND_SSE(0)
	RESTORE_LFSR_0()

sseZucRet:
	SAVE_STATE()
	RET

	// ---- AVX path (same batching, ROUND_AVX instead of ROUND_SSE) ----
avxZucSixteens:
	// Full batches of 16 words; loops until fewer than 16 remain.
	CMPQ BP, $16
	JB avxZucOctet
	SUBQ $16, BP
	ROUND_AVX(0)
	ROUND_AVX(1)
	ROUND_AVX(2)
	ROUND_AVX(3)
	ROUND_AVX(4)
	ROUND_AVX(5)
	ROUND_AVX(6)
	ROUND_AVX(7)
	ROUND_AVX(8)
	ROUND_AVX(9)
	ROUND_AVX(10)
	ROUND_AVX(11)
	ROUND_AVX(12)
	ROUND_AVX(13)
	ROUND_AVX(14)
	ROUND_AVX(15)
	LEAQ 64(DI), DI
	JMP avxZucSixteens

avxZucOctet:
	// At most one batch of 8 words.
	CMPQ BP, $8
	JB avxZucNibble
	SUBQ $8, BP
	ROUND_AVX(0)
	ROUND_AVX(1)
	ROUND_AVX(2)
	ROUND_AVX(3)
	ROUND_AVX(4)
	ROUND_AVX(5)
	ROUND_AVX(6)
	ROUND_AVX(7)
	LEAQ 32(DI), DI
	RESTORE_LFSR_8()

avxZucNibble:
	// At most one batch of 4 words.
	CMPQ BP, $4
	JB avxZucDouble
	SUBQ $4, BP
	ROUND_AVX(0)
	ROUND_AVX(1)
	ROUND_AVX(2)
	ROUND_AVX(3)
	LEAQ 16(DI), DI
	RESTORE_LFSR_4()

avxZucDouble:
	// At most one batch of 2 words.
	CMPQ BP, $2
	JB avxZucSingle
	SUBQ $2, BP
	ROUND_AVX(0)
	ROUND_AVX(1)
	LEAQ 8(DI), DI
	RESTORE_LFSR_2()

avxZucSingle:
	// Final single word, if any remains.
	TESTQ BP, BP
	JE avxZucRet
	ROUND_AVX(0)
	RESTORE_LFSR_0()

avxZucRet:
	SAVE_STATE()
	VZEROUPPER
	RET

View File

@ -13,8 +13,14 @@ var useAVX = cpu.X86.HasAVX
//go:noescape
func genKeywordAsm(s *zucState32) uint32
//go:noescape
func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
func genKeyStream(keyStream []uint32, pState *zucState32) {
// TODO: will change the implementation later
if supportsAES {
genKeyStreamAsm(keyStream, pState)
return
}
for i := 0; i < len(keyStream); i++ {
keyStream[i] = genKeyword(pState)
}

View File

@ -1,7 +1,5 @@
package zuc
// Just for reference, no performance advantage due to the block size / chunk are 4 bytes only!
import (
"crypto/cipher"
"encoding/binary"
@ -10,6 +8,8 @@ import (
"github.com/emmansun/gmsm/internal/xor"
)
// RoundWords is the number of 32-bit key words XORKeyStream generates per
// batch, i.e. RoundWords*4 bytes of key stream are XORed at a time.
const RoundWords = 16
// NewCipher creates a stream cipher based on the key and iv arguments.
func NewCipher(key, iv []byte) (cipher.Stream, error) {
return newZUCState(key, iv)
@ -25,8 +25,6 @@ func NewEEACipher(key []byte, count, bearer, direction uint32) (cipher.Stream, e
return newZUCState(key, iv)
}
// Per test, even we generate key stream first, and then XOR once, the performance
// improvement is NOT significant.
func (c *zucState32) XORKeyStream(dst, src []byte) {
if len(dst) < len(src) {
panic("zuc: output smaller than input")
@ -35,14 +33,23 @@ func (c *zucState32) XORKeyStream(dst, src []byte) {
panic("zuc: invalid buffer overlap")
}
words := (len(src) + 3) / 4
var keyWords [4]byte
for i := 0; i < words; i++ {
binary.BigEndian.PutUint32(keyWords[:], c.genKeyword())
xor.XorBytes(dst, src, keyWords[:])
if i < words-1 {
dst = dst[4:]
src = src[4:]
rounds := words / RoundWords
var keyWords [RoundWords]uint32
var keyBytes [RoundWords * 4]byte
for i := 0; i < rounds; i++ {
c.genKeywords(keyWords[:])
for j := 0; j < RoundWords; j++ {
binary.BigEndian.PutUint32(keyBytes[j*4:], keyWords[j])
}
xor.XorBytes(dst, src, keyBytes[:])
dst = dst[RoundWords*4:]
src = src[RoundWords*4:]
}
if rounds*RoundWords < words {
c.genKeywords(keyWords[:words-rounds*RoundWords])
for j := 0; j < words-rounds*RoundWords; j++ {
binary.BigEndian.PutUint32(keyBytes[j*4:], keyWords[j])
}
xor.XorBytes(dst, src, keyBytes[:])
}
}
}