mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-27 04:36:19 +08:00
zuc: amd64 optimization step 2
This commit is contained in:
parent
f7a55494c8
commit
bd2543cdf9
208
zuc/asm_amd64.s
208
zuc/asm_amd64.s
@ -313,6 +313,7 @@ GLOBL mask_S1<>(SB), RODATA, $16
|
||||
MOVUPS (36)(SI), X2 \
|
||||
MOVQ (52)(SI), BX \
|
||||
MOVL (60)(SI), CX \
|
||||
\
|
||||
MOVUPS X0, (SI) \
|
||||
MOVUPS X1, (16)(SI) \
|
||||
MOVUPS X2, (32)(SI) \
|
||||
@ -320,6 +321,41 @@ GLOBL mask_S1<>(SB), RODATA, $16
|
||||
MOVL CX, (56)(SI) \
|
||||
MOVL AX, (60)(SI)
|
||||
|
||||
// RESTORE_LFSR_2 rotates the 64 bytes at (SI) left by 8 bytes: the original
// bytes 8..63 are written back at offsets 0..55 and the original bytes 0..7
// are written at offset 56. Clobbers AX, BX, X0, X1 and X2.
// NOTE(review): presumably (SI) holds the sixteen 32-bit LFSR words and this
// re-aligns them after two keystream rounds - confirm against LFSR_UPDT.
#define RESTORE_LFSR_2() \
|
||||
MOVQ (0)(SI), AX \
|
||||
MOVUPS (8)(SI), X0 \
|
||||
MOVUPS (24)(SI), X1 \
|
||||
MOVUPS (40)(SI), X2 \
|
||||
MOVQ (56)(SI), BX \
|
||||
\
|
||||
MOVUPS X0, (SI) \
|
||||
MOVUPS X1, (16)(SI) \
|
||||
MOVUPS X2, (32)(SI) \
|
||||
MOVQ BX, (48)(SI) \
|
||||
MOVQ AX, (56)(SI)
|
||||
|
||||
// RESTORE_LFSR_4 rotates the 64 bytes at (SI) left by 16 bytes: the four
// 16-byte lanes loaded into X0..X3 are written back in the order X1, X2, X3,
// X0. Clobbers X0, X1, X2 and X3.
// NOTE(review): presumably used to re-align the LFSR words after four
// keystream rounds - confirm against LFSR_UPDT.
#define RESTORE_LFSR_4() \
|
||||
MOVUPS (0)(SI), X0 \
|
||||
MOVUPS (16)(SI), X1 \
|
||||
MOVUPS (32)(SI), X2 \
|
||||
MOVUPS (48)(SI), X3 \
|
||||
\
|
||||
MOVUPS X1, (0)(SI) \
|
||||
MOVUPS X2, (16)(SI) \
|
||||
MOVUPS X3, (32)(SI) \
|
||||
MOVUPS X0, (48)(SI)
|
||||
|
||||
// RESTORE_LFSR_8 swaps the two 32-byte halves of the 64 bytes at (SI): the
// four 16-byte lanes loaded into X0..X3 are written back in the order X2, X3,
// X0, X1 (a rotation by 32 bytes). Clobbers X0, X1, X2 and X3.
// NOTE(review): presumably used to re-align the LFSR words after eight
// keystream rounds - confirm against LFSR_UPDT.
#define RESTORE_LFSR_8() \
|
||||
MOVUPS (0)(SI), X0 \
|
||||
MOVUPS (16)(SI), X1 \
|
||||
MOVUPS (32)(SI), X2 \
|
||||
MOVUPS (48)(SI), X3 \
|
||||
\
|
||||
MOVUPS X2, (0)(SI) \
|
||||
MOVUPS X3, (16)(SI) \
|
||||
MOVUPS X0, (32)(SI) \
|
||||
MOVUPS X1, (48)(SI)
|
||||
|
||||
#define NONLIN_FUN_AVX() \
|
||||
NONLIN_FUN() \
|
||||
VMOVQ DX, X0 \
|
||||
@ -334,6 +370,14 @@ GLOBL mask_S1<>(SB), RODATA, $16
|
||||
MOVL X0, R10 \ // F_R1
|
||||
VPEXTRD $1, X0, R11
|
||||
|
||||
// LOAD_STATE loads six 32-bit values from the structure addressed by SI into
// registers: OFFSET_FR1 -> R10, OFFSET_FR2 -> R11 and OFFSET_BRC_X0..X3 ->
// R12..R15. The OFFSET_* constants are defined outside this macro.
#define LOAD_STATE() \
|
||||
MOVL OFFSET_FR1(SI), R10 \
|
||||
MOVL OFFSET_FR2(SI), R11 \
|
||||
MOVL OFFSET_BRC_X0(SI), R12 \
|
||||
MOVL OFFSET_BRC_X1(SI), R13 \
|
||||
MOVL OFFSET_BRC_X2(SI), R14 \
|
||||
MOVL OFFSET_BRC_X3(SI), R15
|
||||
|
||||
#define SAVE_STATE() \
|
||||
MOVL R10, OFFSET_FR1(SI) \
|
||||
MOVL R11, OFFSET_FR2(SI) \
|
||||
@ -346,13 +390,7 @@ GLOBL mask_S1<>(SB), RODATA, $16
|
||||
TEXT ·genKeywordAsm(SB),NOSPLIT,$0
|
||||
MOVQ pState+0(FP), SI
|
||||
|
||||
MOVL OFFSET_FR1(SI), R10
|
||||
MOVL OFFSET_FR2(SI), R11
|
||||
MOVL OFFSET_BRC_X0(SI), R12
|
||||
MOVL OFFSET_BRC_X1(SI), R13
|
||||
MOVL OFFSET_BRC_X2(SI), R14
|
||||
MOVL OFFSET_BRC_X3(SI), R15
|
||||
|
||||
LOAD_STATE()
|
||||
|
||||
BITS_REORG(0)
|
||||
CMPB ·useAVX(SB), $1
|
||||
@ -382,3 +420,159 @@ avx:
|
||||
|
||||
VZEROUPPER
|
||||
RET
|
||||
|
||||
// ROUND_SSE(idx) emits one 32-bit output word using the SSE helper macros:
// it expands BITS_REORG(idx) and NONLIN_FUN_SSE(), XORs R15 into AX, stores
// AX as a 32-bit word at (idx*4)(DI), zeroes AX, and expands LFSR_UPDT(idx).
// NOTE(review): assumes NONLIN_FUN_SSE leaves its result in AX and that
// LFSR_UPDT expects AX to be zero - confirm against those macro definitions.
#define ROUND_SSE(idx) \
|
||||
BITS_REORG(idx) \
|
||||
NONLIN_FUN_SSE() \
|
||||
XORL R15, AX \
|
||||
MOVL AX, (idx*4)(DI) \
|
||||
XORQ AX, AX \
|
||||
LFSR_UPDT(idx)
|
||||
|
||||
// ROUND_AVX(idx) emits one 32-bit output word using the AVX helper macros:
// it expands BITS_REORG(idx) and NONLIN_FUN_AVX(), XORs R15 into AX, stores
// AX as a 32-bit word at (idx*4)(DI), zeroes AX, and expands LFSR_UPDT(idx).
// NOTE(review): assumes NONLIN_FUN_AVX leaves its result in AX and that
// LFSR_UPDT expects AX to be zero - confirm against those macro definitions.
#define ROUND_AVX(idx) \
|
||||
BITS_REORG(idx) \
|
||||
NONLIN_FUN_AVX() \
|
||||
XORL R15, AX \
|
||||
MOVL AX, (idx*4)(DI) \
|
||||
XORQ AX, AX \
|
||||
LFSR_UPDT(idx)
|
||||
|
||||
// func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
//
// genKeyStreamAsm writes ks_len 32-bit words to the buffer at ks, using the
// state addressed by pState. Register usage visible here: DI = output
// pointer, BP = remaining word count, SI = state pointer; LOAD_STATE fills
// R10..R15 and SAVE_STATE writes them back before returning.
//
// The work is split into groups of 16, 8, 4, 2 and 1 rounds. After a group of
// 8, 4, 2 or 1 rounds the matching RESTORE_LFSR_* macro is expanded; no
// RESTORE_LFSR_* is expanded after a group of 16. The SSE and AVX paths are
// identical apart from ROUND_SSE vs ROUND_AVX and the trailing VZEROUPPER.
//
// NOTE(review): BP is used as a general-purpose counter in a $0-frame
// function. Go's amd64 assembler only saves/restores BP automatically for a
// non-zero frame, and BP is the frame pointer - confirm this is safe or move
// the counter to another free register.
// NOTE(review): the argument names ks/ks_len differ from the Go declaration's
// keyStream; go vet's asmdecl check may report this - confirm.
|
||||
TEXT ·genKeyStreamAsm(SB),NOSPLIT,$0
|
||||
MOVQ ks+0(FP), DI
|
||||
MOVQ ks_len+8(FP), BP
|
||||
MOVQ pState+24(FP), SI
|
||||
|
||||
LOAD_STATE()
|
||||
|
||||
CMPB ·useAVX(SB), $1
|
||||
JE avxZucSixteens
|
||||
|
||||
// SSE path: while at least 16 words remain, emit 16 words and advance DI by 64 bytes.
sseZucSixteens:
|
||||
CMPQ BP, $16
|
||||
JB sseZucOctet
|
||||
SUBQ $16, BP
|
||||
ROUND_SSE(0)
|
||||
ROUND_SSE(1)
|
||||
ROUND_SSE(2)
|
||||
ROUND_SSE(3)
|
||||
ROUND_SSE(4)
|
||||
ROUND_SSE(5)
|
||||
ROUND_SSE(6)
|
||||
ROUND_SSE(7)
|
||||
ROUND_SSE(8)
|
||||
ROUND_SSE(9)
|
||||
ROUND_SSE(10)
|
||||
ROUND_SSE(11)
|
||||
ROUND_SSE(12)
|
||||
ROUND_SSE(13)
|
||||
ROUND_SSE(14)
|
||||
ROUND_SSE(15)
|
||||
LEAQ 64(DI), DI
|
||||
JMP sseZucSixteens
|
||||
|
||||
// Fewer than 16 words remain: emit 8 words if at least 8 are left.
sseZucOctet:
|
||||
CMPQ BP, $8
|
||||
JB sseZucNibble
|
||||
SUBQ $8, BP
|
||||
ROUND_SSE(0)
|
||||
ROUND_SSE(1)
|
||||
ROUND_SSE(2)
|
||||
ROUND_SSE(3)
|
||||
ROUND_SSE(4)
|
||||
ROUND_SSE(5)
|
||||
ROUND_SSE(6)
|
||||
ROUND_SSE(7)
|
||||
LEAQ 32(DI), DI
|
||||
RESTORE_LFSR_8()
|
||||
// Emit 4 words if at least 4 are left.
sseZucNibble:
|
||||
CMPQ BP, $4
|
||||
JB sseZucDouble
|
||||
SUBQ $4, BP
|
||||
ROUND_SSE(0)
|
||||
ROUND_SSE(1)
|
||||
ROUND_SSE(2)
|
||||
ROUND_SSE(3)
|
||||
LEAQ 16(DI), DI
|
||||
RESTORE_LFSR_4()
|
||||
// Emit 2 words if at least 2 are left.
sseZucDouble:
|
||||
CMPQ BP, $2
|
||||
JB sseZucSingle
|
||||
SUBQ $2, BP
|
||||
ROUND_SSE(0)
|
||||
ROUND_SSE(1)
|
||||
LEAQ 8(DI), DI
|
||||
RESTORE_LFSR_2()
|
||||
// Emit the final word if one is left (BP != 0); DI is not advanced afterwards.
sseZucSingle:
|
||||
TESTQ BP, BP
|
||||
JE sseZucRet
|
||||
ROUND_SSE(0)
|
||||
RESTORE_LFSR_0()
|
||||
sseZucRet:
|
||||
SAVE_STATE()
|
||||
RET
|
||||
|
||||
// AVX path: same structure as the SSE path, using ROUND_AVX.
avxZucSixteens:
|
||||
CMPQ BP, $16
|
||||
JB avxZucOctet
|
||||
SUBQ $16, BP
|
||||
ROUND_AVX(0)
|
||||
ROUND_AVX(1)
|
||||
ROUND_AVX(2)
|
||||
ROUND_AVX(3)
|
||||
ROUND_AVX(4)
|
||||
ROUND_AVX(5)
|
||||
ROUND_AVX(6)
|
||||
ROUND_AVX(7)
|
||||
ROUND_AVX(8)
|
||||
ROUND_AVX(9)
|
||||
ROUND_AVX(10)
|
||||
ROUND_AVX(11)
|
||||
ROUND_AVX(12)
|
||||
ROUND_AVX(13)
|
||||
ROUND_AVX(14)
|
||||
ROUND_AVX(15)
|
||||
LEAQ 64(DI), DI
|
||||
JMP avxZucSixteens
|
||||
|
||||
// Fewer than 16 words remain: emit 8 words if at least 8 are left.
avxZucOctet:
|
||||
CMPQ BP, $8
|
||||
JB avxZucNibble
|
||||
SUBQ $8, BP
|
||||
ROUND_AVX(0)
|
||||
ROUND_AVX(1)
|
||||
ROUND_AVX(2)
|
||||
ROUND_AVX(3)
|
||||
ROUND_AVX(4)
|
||||
ROUND_AVX(5)
|
||||
ROUND_AVX(6)
|
||||
ROUND_AVX(7)
|
||||
LEAQ 32(DI), DI
|
||||
RESTORE_LFSR_8()
|
||||
// Emit 4 words if at least 4 are left.
avxZucNibble:
|
||||
CMPQ BP, $4
|
||||
JB avxZucDouble
|
||||
SUBQ $4, BP
|
||||
ROUND_AVX(0)
|
||||
ROUND_AVX(1)
|
||||
ROUND_AVX(2)
|
||||
ROUND_AVX(3)
|
||||
LEAQ 16(DI), DI
|
||||
RESTORE_LFSR_4()
|
||||
// Emit 2 words if at least 2 are left.
avxZucDouble:
|
||||
CMPQ BP, $2
|
||||
JB avxZucSingle
|
||||
SUBQ $2, BP
|
||||
ROUND_AVX(0)
|
||||
ROUND_AVX(1)
|
||||
LEAQ 8(DI), DI
|
||||
RESTORE_LFSR_2()
|
||||
// Emit the final word if one is left (BP != 0); DI is not advanced afterwards.
avxZucSingle:
|
||||
TESTQ BP, BP
|
||||
JE avxZucRet
|
||||
ROUND_AVX(0)
|
||||
RESTORE_LFSR_0()
|
||||
avxZucRet:
|
||||
SAVE_STATE()
|
||||
VZEROUPPER
|
||||
RET
|
||||
|
@ -13,8 +13,14 @@ var useAVX = cpu.X86.HasAVX
|
||||
//go:noescape
|
||||
func genKeywordAsm(s *zucState32) uint32
|
||||
|
||||
//go:noescape
|
||||
func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
|
||||
|
||||
func genKeyStream(keyStream []uint32, pState *zucState32) {
|
||||
// TODO: will change the implementation later
|
||||
if supportsAES {
|
||||
genKeyStreamAsm(keyStream, pState)
|
||||
return
|
||||
}
|
||||
for i := 0; i < len(keyStream); i++ {
|
||||
keyStream[i] = genKeyword(pState)
|
||||
}
|
||||
|
31
zuc/eea.go
31
zuc/eea.go
@ -1,7 +1,5 @@
|
||||
package zuc
|
||||
|
||||
// Just for reference: there is no performance advantage, because the block size / chunk is only 4 bytes!
|
||||
|
||||
import (
|
||||
"crypto/cipher"
|
||||
"encoding/binary"
|
||||
@ -10,6 +8,8 @@ import (
|
||||
"github.com/emmansun/gmsm/internal/xor"
|
||||
)
|
||||
|
||||
// RoundWords is the number of 32-bit key words XORKeyStream requests from
// genKeywords for each full batch.
const RoundWords = 16
|
||||
|
||||
// NewCipher creates a stream cipher based on the key and iv arguments.
|
||||
func NewCipher(key, iv []byte) (cipher.Stream, error) {
|
||||
return newZUCState(key, iv)
|
||||
@ -25,8 +25,6 @@ func NewEEACipher(key []byte, count, bearer, direction uint32) (cipher.Stream, e
|
||||
return newZUCState(key, iv)
|
||||
}
|
||||
|
||||
// Per test, even if we generate the key stream first and then XOR once, the performance
|
||||
// improvement is NOT significant.
|
||||
func (c *zucState32) XORKeyStream(dst, src []byte) {
|
||||
if len(dst) < len(src) {
|
||||
panic("zuc: output smaller than input")
|
||||
@ -35,14 +33,23 @@ func (c *zucState32) XORKeyStream(dst, src []byte) {
|
||||
panic("zuc: invalid buffer overlap")
|
||||
}
|
||||
words := (len(src) + 3) / 4
|
||||
var keyWords [4]byte
|
||||
for i := 0; i < words; i++ {
|
||||
binary.BigEndian.PutUint32(keyWords[:], c.genKeyword())
|
||||
xor.XorBytes(dst, src, keyWords[:])
|
||||
if i < words-1 {
|
||||
dst = dst[4:]
|
||||
src = src[4:]
|
||||
rounds := words / RoundWords
|
||||
var keyWords [RoundWords]uint32
|
||||
var keyBytes [RoundWords * 4]byte
|
||||
for i := 0; i < rounds; i++ {
|
||||
c.genKeywords(keyWords[:])
|
||||
for j := 0; j < RoundWords; j++ {
|
||||
binary.BigEndian.PutUint32(keyBytes[j*4:], keyWords[j])
|
||||
}
|
||||
xor.XorBytes(dst, src, keyBytes[:])
|
||||
dst = dst[RoundWords*4:]
|
||||
src = src[RoundWords*4:]
|
||||
}
|
||||
if rounds*RoundWords < words {
|
||||
c.genKeywords(keyWords[:words-rounds*RoundWords])
|
||||
for j := 0; j < words-rounds*RoundWords; j++ {
|
||||
binary.BigEndian.PutUint32(keyBytes[j*4:], keyWords[j])
|
||||
}
|
||||
xor.XorBytes(dst, src, keyBytes[:])
|
||||
}
|
||||
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user