diff --git a/zuc/asm_amd64.s b/zuc/asm_amd64.s
index 3ac2114..1419440 100644
--- a/zuc/asm_amd64.s
+++ b/zuc/asm_amd64.s
@@ -313,6 +313,7 @@ GLOBL mask_S1<>(SB), RODATA, $16
 	MOVUPS (36)(SI), X2 \
 	MOVQ (52)(SI), BX \
 	MOVL (60)(SI), CX \
+	\
 	MOVUPS X0, (SI) \
 	MOVUPS X1, (16)(SI) \
 	MOVUPS X2, (32)(SI) \
@@ -320,6 +321,47 @@ GLOBL mask_S1<>(SB), RODATA, $16
 	MOVL CX, (56)(SI) \
 	MOVL AX, (60)(SI)
 
+// RESTORE_LFSR_2 rotates the 64 bytes at (SI) left by 8 bytes (two 32-bit
+// words): bytes 8..63 move to offset 0, bytes 0..7 to offset 56. Clobbers AX, BX, X0-X2.
+#define RESTORE_LFSR_2() \
+	MOVQ (0)(SI), AX \
+	MOVUPS (8)(SI), X0 \
+	MOVUPS (24)(SI), X1 \
+	MOVUPS (40)(SI), X2 \
+	MOVQ (56)(SI), BX \
+	\
+	MOVUPS X0, (SI) \
+	MOVUPS X1, (16)(SI) \
+	MOVUPS X2, (32)(SI) \
+	MOVQ BX, (48)(SI) \
+	MOVQ AX, (56)(SI)
+
+// RESTORE_LFSR_4 rotates the 64 bytes at (SI) left by 16 bytes (four 32-bit
+// words). Clobbers X0-X3.
+#define RESTORE_LFSR_4() \
+	MOVUPS (0)(SI), X0 \
+	MOVUPS (16)(SI), X1 \
+	MOVUPS (32)(SI), X2 \
+	MOVUPS (48)(SI), X3 \
+	\
+	MOVUPS X1, (0)(SI) \
+	MOVUPS X2, (16)(SI) \
+	MOVUPS X3, (32)(SI) \
+	MOVUPS X0, (48)(SI)
+
+// RESTORE_LFSR_8 rotates the 64 bytes at (SI) left by 32 bytes (eight 32-bit
+// words), i.e. swaps the two 32-byte halves. Clobbers X0-X3.
+#define RESTORE_LFSR_8() \
+	MOVUPS (0)(SI), X0 \
+	MOVUPS (16)(SI), X1 \
+	MOVUPS (32)(SI), X2 \
+	MOVUPS (48)(SI), X3 \
+	\
+	MOVUPS X2, (0)(SI) \
+	MOVUPS X3, (16)(SI) \
+	MOVUPS X0, (32)(SI) \
+	MOVUPS X1, (48)(SI)
+
 #define NONLIN_FUN_AVX() \
 	NONLIN_FUN() \
 	VMOVQ DX, X0 \
@@ -334,6 +376,16 @@ GLOBL mask_S1<>(SB), RODATA, $16
 	MOVL X0, R10 \ // F_R1
 	VPEXTRD $1, X0, R11
 
+// LOAD_STATE loads the six 32-bit fields at OFFSET_FR1, OFFSET_FR2 and
+// OFFSET_BRC_X0..OFFSET_BRC_X3 of the state at SI into R10-R15, in that order.
+#define LOAD_STATE() \
+	MOVL OFFSET_FR1(SI), R10 \
+	MOVL OFFSET_FR2(SI), R11 \
+	MOVL OFFSET_BRC_X0(SI), R12 \
+	MOVL OFFSET_BRC_X1(SI), R13 \
+	MOVL OFFSET_BRC_X2(SI), R14 \
+	MOVL OFFSET_BRC_X3(SI), R15
+
 #define SAVE_STATE() \
 	MOVL R10, OFFSET_FR1(SI) \
 	MOVL R11, OFFSET_FR2(SI) \
@@ -346,13 +398,7 @@ GLOBL mask_S1<>(SB), RODATA, $16
 TEXT ·genKeywordAsm(SB),NOSPLIT,$0
 	MOVQ pState+0(FP), SI
 
-	MOVL OFFSET_FR1(SI), R10
-	MOVL OFFSET_FR2(SI), R11
-	MOVL OFFSET_BRC_X0(SI), R12
-	MOVL OFFSET_BRC_X1(SI), R13
-	MOVL OFFSET_BRC_X2(SI), R14
-	MOVL OFFSET_BRC_X3(SI), R15
-
+	LOAD_STATE()
 	BITS_REORG(0)
 
 	CMPB ·useAVX(SB), $1
@@ -382,3 +428,169 @@ avx:
 
 	VZEROUPPER
 	RET
+
+// ROUND_SSE runs BITS_REORG(idx) and NONLIN_FUN_SSE, stores AX XOR R15 as
+// 32-bit word idx of the buffer at DI, then zeroes AX and runs LFSR_UPDT(idx).
+#define ROUND_SSE(idx) \
+	BITS_REORG(idx) \
+	NONLIN_FUN_SSE() \
+	XORL R15, AX \
+	MOVL AX, (idx*4)(DI) \
+	XORQ AX, AX \
+	LFSR_UPDT(idx)
+
+// ROUND_AVX is ROUND_SSE with NONLIN_FUN_AVX in place of NONLIN_FUN_SSE.
+#define ROUND_AVX(idx) \
+	BITS_REORG(idx) \
+	NONLIN_FUN_AVX() \
+	XORL R15, AX \
+	MOVL AX, (idx*4)(DI) \
+	XORQ AX, AX \
+	LFSR_UPDT(idx)
+
+// func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
+//
+// Writes len(keyStream) words to keyStream in batches of 16, 8, 4, 2 and 1.
+// After each partial batch the matching RESTORE_LFSR_n macro rotates the
+// 64 bytes at (SI), presumably so the next batch can start again at index 0.
+// Registers: DI = output cursor, BP = words remaining, SI = state pointer.
+// NOTE(review): BP is written in a $0-frame function, so the assembler emits
+// no save/restore for it; confirm this is acceptable or use another register.
+TEXT ·genKeyStreamAsm(SB),NOSPLIT,$0
+	MOVQ keyStream_base+0(FP), DI
+	MOVQ keyStream_len+8(FP), BP
+	MOVQ pState+24(FP), SI
+
+	LOAD_STATE()
+
+	CMPB ·useAVX(SB), $1
+	JE avxZucSixteens
+
+sseZucSixteens:
+	CMPQ BP, $16
+	JB sseZucOctet
+	SUBQ $16, BP
+	ROUND_SSE(0)
+	ROUND_SSE(1)
+	ROUND_SSE(2)
+	ROUND_SSE(3)
+	ROUND_SSE(4)
+	ROUND_SSE(5)
+	ROUND_SSE(6)
+	ROUND_SSE(7)
+	ROUND_SSE(8)
+	ROUND_SSE(9)
+	ROUND_SSE(10)
+	ROUND_SSE(11)
+	ROUND_SSE(12)
+	ROUND_SSE(13)
+	ROUND_SSE(14)
+	ROUND_SSE(15)
+	LEAQ 64(DI), DI
+	JMP sseZucSixteens
+
+sseZucOctet:
+	CMPQ BP, $8
+	JB sseZucNibble
+	SUBQ $8, BP
+	ROUND_SSE(0)
+	ROUND_SSE(1)
+	ROUND_SSE(2)
+	ROUND_SSE(3)
+	ROUND_SSE(4)
+	ROUND_SSE(5)
+	ROUND_SSE(6)
+	ROUND_SSE(7)
+	LEAQ 32(DI), DI
+	RESTORE_LFSR_8()
+sseZucNibble:
+	CMPQ BP, $4
+	JB sseZucDouble
+	SUBQ $4, BP
+	ROUND_SSE(0)
+	ROUND_SSE(1)
+	ROUND_SSE(2)
+	ROUND_SSE(3)
+	LEAQ 16(DI), DI
+	RESTORE_LFSR_4()
+sseZucDouble:
+	CMPQ BP, $2
+	JB sseZucSingle
+	SUBQ $2, BP
+	ROUND_SSE(0)
+	ROUND_SSE(1)
+	LEAQ 8(DI), DI
+	RESTORE_LFSR_2()
+sseZucSingle:
+	TESTQ BP, BP
+	JE sseZucRet
+	ROUND_SSE(0)
+	RESTORE_LFSR_0()
+sseZucRet:
+	SAVE_STATE()
+	RET
+
+avxZucSixteens:
+	CMPQ BP, $16
+	JB avxZucOctet
+	SUBQ $16, BP
+	ROUND_AVX(0)
+	ROUND_AVX(1)
+	ROUND_AVX(2)
+	ROUND_AVX(3)
+	ROUND_AVX(4)
+	ROUND_AVX(5)
+	ROUND_AVX(6)
+	ROUND_AVX(7)
+	ROUND_AVX(8)
+	ROUND_AVX(9)
+	ROUND_AVX(10)
+	ROUND_AVX(11)
+	ROUND_AVX(12)
+	ROUND_AVX(13)
+	ROUND_AVX(14)
+	ROUND_AVX(15)
+	LEAQ 64(DI), DI
+	JMP avxZucSixteens
+
+avxZucOctet:
+	CMPQ BP, $8
+	JB avxZucNibble
+	SUBQ $8, BP
+	ROUND_AVX(0)
+	ROUND_AVX(1)
+	ROUND_AVX(2)
+	ROUND_AVX(3)
+	ROUND_AVX(4)
+	ROUND_AVX(5)
+	ROUND_AVX(6)
+	ROUND_AVX(7)
+	LEAQ 32(DI), DI
+	RESTORE_LFSR_8()
+avxZucNibble:
+	CMPQ BP, $4
+	JB avxZucDouble
+	SUBQ $4, BP
+	ROUND_AVX(0)
+	ROUND_AVX(1)
+	ROUND_AVX(2)
+	ROUND_AVX(3)
+	LEAQ 16(DI), DI
+	RESTORE_LFSR_4()
+avxZucDouble:
+	CMPQ BP, $2
+	JB avxZucSingle
+	SUBQ $2, BP
+	ROUND_AVX(0)
+	ROUND_AVX(1)
+	LEAQ 8(DI), DI
+	RESTORE_LFSR_2()
+avxZucSingle:
+	TESTQ BP, BP
+	JE avxZucRet
+	ROUND_AVX(0)
+	RESTORE_LFSR_0()
+avxZucRet:
+	SAVE_STATE()
+	VZEROUPPER
+	RET
diff --git a/zuc/core_asm.go b/zuc/core_asm.go
index e66552d..c630751 100644
--- a/zuc/core_asm.go
+++ b/zuc/core_asm.go
@@ -13,8 +13,14 @@ var useAVX = cpu.X86.HasAVX
 //go:noescape
 func genKeywordAsm(s *zucState32) uint32
 
+//go:noescape
+func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
+
 func genKeyStream(keyStream []uint32, pState *zucState32) {
-	// TODO: will change the implementation later
+	if supportsAES {
+		genKeyStreamAsm(keyStream, pState)
+		return
+	}
 	for i := 0; i < len(keyStream); i++ {
 		keyStream[i] = genKeyword(pState)
 	}
diff --git a/zuc/eea.go b/zuc/eea.go
index ac46b60..df0b39c 100644
--- a/zuc/eea.go
+++ b/zuc/eea.go
@@ -1,7 +1,5 @@
 package zuc
 
-// Just for reference, no performance advantage due to the block size / chunk are 4 bytes only!
-
 import (
 	"crypto/cipher"
 	"encoding/binary"
@@ -10,6 +8,8 @@ import (
 	"github.com/emmansun/gmsm/internal/xor"
 )
 
+const RoundWords = 16
+
 // NewCipher create a stream cipher based on key and iv aguments.
 func NewCipher(key, iv []byte) (cipher.Stream, error) {
 	return newZUCState(key, iv)
@@ -25,8 +25,6 @@ func NewEEACipher(key []byte, count, bearer, direction uint32) (cipher.Stream, e
 	return newZUCState(key, iv)
 }
 
-// Per test, even we generate key stream first, and then XOR once, the performance
-// improvement is NOT significant.
func (c *zucState32) XORKeyStream(dst, src []byte) { if len(dst) < len(src) { panic("zuc: output smaller than input")
@@ -35,14 +33,27 @@ func (c *zucState32) XORKeyStream(dst, src []byte) {
 		panic("zuc: invalid buffer overlap")
 	}
 	words := (len(src) + 3) / 4
-	var keyWords [4]byte
-	for i := 0; i < words; i++ {
-		binary.BigEndian.PutUint32(keyWords[:], c.genKeyword())
-		xor.XorBytes(dst, src, keyWords[:])
-		if i < words-1 {
-			dst = dst[4:]
-			src = src[4:]
+	// Count only complete 64-byte batches. Deriving rounds from the rounded-up
+	// word count would slice src past its end when len(src) is 1-3 bytes short
+	// of a batch multiple (e.g. 61..63) and panic.
+	rounds := len(src) / (RoundWords * 4)
+	var keyWords [RoundWords]uint32
+	var keyBytes [RoundWords * 4]byte
+	for i := 0; i < rounds; i++ {
+		c.genKeywords(keyWords[:])
+		for j := 0; j < RoundWords; j++ {
+			binary.BigEndian.PutUint32(keyBytes[j*4:], keyWords[j])
 		}
+		xor.XorBytes(dst, src, keyBytes[:])
+		dst = dst[RoundWords*4:]
+		src = src[RoundWords*4:]
+	}
+	// Remaining words (at most RoundWords); the last one may be partial.
+	if tail := words - rounds*RoundWords; tail > 0 {
+		c.genKeywords(keyWords[:tail])
+		for j := 0; j < tail; j++ {
+			binary.BigEndian.PutUint32(keyBytes[j*4:], keyWords[j])
+		}
+		xor.XorBytes(dst, src, keyBytes[:])
 	}
-
 }