diff --git a/zuc/README.md b/zuc/README.md index 69e4999..98045c7 100644 --- a/zuc/README.md +++ b/zuc/README.md @@ -53,5 +53,5 @@ func (s *zucState32) f32(x0, x1, x2 uint32) uint32 { goarch: amd64 pkg: github.com/emmansun/gmsm/zuc cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz - BenchmarkEncrypt1K-6 253791 4621 ns/op 220.52 MB/s - BenchmarkEncrypt8K-6 31794 37772 ns/op 216.75 MB/s + BenchmarkEncrypt1K-6 409755 2802 ns/op 363.62 MB/s + BenchmarkEncrypt8K-6 54120 22413 ns/op 365.28 MB/s diff --git a/zuc/asm_amd64.s b/zuc/asm_amd64.s index 1419440..c26fca6 100644 --- a/zuc/asm_amd64.s +++ b/zuc/asm_amd64.s @@ -72,6 +72,11 @@ DATA mask_S1<>+0x00(SB)/8, $0x00ff00ff00ff00ff DATA mask_S1<>+0x08(SB)/8, $0x00ff00ff00ff00ff GLOBL mask_S1<>(SB), RODATA, $16 +// shuffle byte order from LE to BE +DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203 +DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b +GLOBL flip_mask<>(SB), RODATA, $16 + #define OFFSET_FR1 (16*4) #define OFFSET_FR2 (17*4) #define OFFSET_BRC_X0 (18*4) @@ -429,7 +434,7 @@ avx: XORQ AX, AX \ LFSR_UPDT(idx) -#define ROUND_AVX(idx) \ +#define ROUND_AVX(idx) \ BITS_REORG(idx) \ NONLIN_FUN_AVX() \ XORL R15, AX \ @@ -437,6 +442,24 @@ avx: XORQ AX, AX \ LFSR_UPDT(idx) +#define ROUND_REV32_SSE(idx) \ + BITS_REORG(idx) \ + NONLIN_FUN_SSE() \ + XORL R15, AX \ + BSWAPL AX \ + MOVL AX, (idx*4)(DI) \ + XORQ AX, AX \ + LFSR_UPDT(idx) + +#define ROUND_REV32_AVX(idx) \ + BITS_REORG(idx) \ + NONLIN_FUN_AVX() \ + XORL R15, AX \ + BSWAPL AX \ + MOVL AX, (idx*4)(DI) \ + XORQ AX, AX \ + LFSR_UPDT(idx) + // func genKeyStreamAsm(keyStream []uint32, pState *zucState32) TEXT ·genKeyStreamAsm(SB),NOSPLIT,$0 MOVQ ks+0(FP), DI @@ -576,3 +599,145 @@ avxZucRet: SAVE_STATE() VZEROUPPER RET + +// func genKeyStreamRev32Asm(keyStream []byte, pState *zucState32) +TEXT ·genKeyStreamRev32Asm(SB),NOSPLIT,$0 + MOVQ ks+0(FP), DI + MOVQ ks_len+8(FP), BP + MOVQ pState+24(FP), SI + + SHRQ $2, BP + + LOAD_STATE() + + CMPB ·useAVX(SB), $1 + JE avxZucSixteens + 
+sseZucSixteens: + CMPQ BP, $16 + JB sseZucOctet + SUBQ $16, BP + ROUND_REV32_SSE(0) + ROUND_REV32_SSE(1) + ROUND_REV32_SSE(2) + ROUND_REV32_SSE(3) + ROUND_REV32_SSE(4) + ROUND_REV32_SSE(5) + ROUND_REV32_SSE(6) + ROUND_REV32_SSE(7) + ROUND_REV32_SSE(8) + ROUND_REV32_SSE(9) + ROUND_REV32_SSE(10) + ROUND_REV32_SSE(11) + ROUND_REV32_SSE(12) + ROUND_REV32_SSE(13) + ROUND_REV32_SSE(14) + ROUND_REV32_SSE(15) + LEAQ 64(DI), DI + JMP sseZucSixteens + +sseZucOctet: + CMPQ BP, $8 + JB sseZucNibble + SUBQ $8, BP + ROUND_REV32_SSE(0) + ROUND_REV32_SSE(1) + ROUND_REV32_SSE(2) + ROUND_REV32_SSE(3) + ROUND_REV32_SSE(4) + ROUND_REV32_SSE(5) + ROUND_REV32_SSE(6) + ROUND_REV32_SSE(7) + LEAQ 32(DI), DI + RESTORE_LFSR_8() +sseZucNibble: + CMPQ BP, $4 + JB sseZucDouble + SUBQ $4, BP + ROUND_REV32_SSE(0) + ROUND_REV32_SSE(1) + ROUND_REV32_SSE(2) + ROUND_REV32_SSE(3) + LEAQ 16(DI), DI + RESTORE_LFSR_4() +sseZucDouble: + CMPQ BP, $2 + JB sseZucSingle + SUBQ $2, BP + ROUND_REV32_SSE(0) + ROUND_REV32_SSE(1) + LEAQ 8(DI), DI + RESTORE_LFSR_2() +sseZucSingle: + TESTQ BP, BP + JE sseZucRet + ROUND_REV32_SSE(0) + RESTORE_LFSR_0() +sseZucRet: + SAVE_STATE() + RET + +avxZucSixteens: + CMPQ BP, $16 + JB avxZucOctet + SUBQ $16, BP + ROUND_REV32_AVX(0) + ROUND_REV32_AVX(1) + ROUND_REV32_AVX(2) + ROUND_REV32_AVX(3) + ROUND_REV32_AVX(4) + ROUND_REV32_AVX(5) + ROUND_REV32_AVX(6) + ROUND_REV32_AVX(7) + ROUND_REV32_AVX(8) + ROUND_REV32_AVX(9) + ROUND_REV32_AVX(10) + ROUND_REV32_AVX(11) + ROUND_REV32_AVX(12) + ROUND_REV32_AVX(13) + ROUND_REV32_AVX(14) + ROUND_REV32_AVX(15) + LEAQ 64(DI), DI + JMP avxZucSixteens + +avxZucOctet: + CMPQ BP, $8 + JB avxZucNibble + SUBQ $8, BP + ROUND_REV32_AVX(0) + ROUND_REV32_AVX(1) + ROUND_REV32_AVX(2) + ROUND_REV32_AVX(3) + ROUND_REV32_AVX(4) + ROUND_REV32_AVX(5) + ROUND_REV32_AVX(6) + ROUND_REV32_AVX(7) + LEAQ 32(DI), DI + RESTORE_LFSR_8() +avxZucNibble: + CMPQ BP, $4 + JB avxZucDouble + SUBQ $4, BP + ROUND_REV32_AVX(0) + ROUND_REV32_AVX(1) + ROUND_REV32_AVX(2) + 
ROUND_REV32_AVX(3) + LEAQ 16(DI), DI + RESTORE_LFSR_4() +avxZucDouble: + CMPQ BP, $2 + JB avxZucSingle + SUBQ $2, BP + ROUND_REV32_AVX(0) + ROUND_REV32_AVX(1) + LEAQ 8(DI), DI + RESTORE_LFSR_2() +avxZucSingle: + TESTQ BP, BP + JE avxZucRet + ROUND_REV32_AVX(0) + RESTORE_LFSR_0() +avxZucRet: + SAVE_STATE() + VZEROUPPER + RET diff --git a/zuc/core_generic.go b/zuc/core_generic.go index 2825b1d..f91a358 100644 --- a/zuc/core_generic.go +++ b/zuc/core_generic.go @@ -14,4 +14,4 @@ func genKeyword(s *zucState32) uint32 { z := s.x3 ^ s.f32() s.enterWorkMode() return z -} \ No newline at end of file +} diff --git a/zuc/eea.go b/zuc/eea.go index df0b39c..bf04a28 100644 --- a/zuc/eea.go +++ b/zuc/eea.go @@ -8,7 +8,7 @@ import ( "github.com/emmansun/gmsm/internal/xor" ) -const RoundWords = 16 +const RoundWords = 32 // NewCipher create a stream cipher based on key and iv aguments. func NewCipher(key, iv []byte) (cipher.Stream, error) { @@ -25,13 +25,7 @@ func NewEEACipher(key []byte, count, bearer, direction uint32) (cipher.Stream, e return newZUCState(key, iv) } -func (c *zucState32) XORKeyStream(dst, src []byte) { - if len(dst) < len(src) { - panic("zuc: output smaller than input") - } - if subtle.InexactOverlap(dst[:len(src)], src) { - panic("zuc: invalid buffer overlap") - } +func genericXorKeyStream(c *zucState32, dst, src []byte) { words := (len(src) + 3) / 4 rounds := words / RoundWords var keyWords [RoundWords]uint32 @@ -53,3 +47,13 @@ func (c *zucState32) XORKeyStream(dst, src []byte) { xor.XorBytes(dst, src, keyBytes[:]) } } + +func (c *zucState32) XORKeyStream(dst, src []byte) { + if len(dst) < len(src) { + panic("zuc: output smaller than input") + } + if subtle.InexactOverlap(dst[:len(src)], src) { + panic("zuc: invalid buffer overlap") + } + xorKeyStream(c, dst, src) +} diff --git a/zuc/eea_asm.go b/zuc/eea_asm.go new file mode 100644 index 0000000..65b0e6b --- /dev/null +++ b/zuc/eea_asm.go @@ -0,0 +1,31 @@ +//go:build (amd64 && !generic) +// +build 
amd64,!generic
+
+package zuc
+
+import (
+	"github.com/emmansun/gmsm/internal/xor"
+)
+
+//go:noescape
+func genKeyStreamRev32Asm(keyStream []byte, pState *zucState32)
+
+// xorKeyStream XORs src with the ZUC key stream into dst. dst may alias src
+// exactly, so key words are staged in a scratch buffer, never in dst itself.
+func xorKeyStream(c *zucState32, dst, src []byte) {
+	if !supportsAES {
+		genericXorKeyStream(c, dst, src)
+		return
+	}
+	var ks [128]byte // 32 key words per assembly call
+	for ; len(src) >= len(ks); dst, src = dst[len(ks):], src[len(ks):] {
+		genKeyStreamRev32Asm(ks[:], c)
+		xor.XorBytes(dst, src, ks[:])
+	}
+	if len(src) > 0 {
+		// The assembly emits whole words: a partial tail still costs one word.
+		tail := ks[:(len(src)+3)&^3]
+		genKeyStreamRev32Asm(tail, c)
+		xor.XorBytes(dst, src, tail)
+	}
+}
diff --git a/zuc/eea_generic.go b/zuc/eea_generic.go
new file mode 100644
index 0000000..3a7d839
--- /dev/null
+++ b/zuc/eea_generic.go
@@ -0,0 +1,9 @@
+//go:build !amd64 || generic
+// +build !amd64 generic
+
+package zuc
+
+// xorKeyStream is the fallback for this build; it forwards straight to genericXorKeyStream.
+func xorKeyStream(c *zucState32, dst, src []byte) {
+	genericXorKeyStream(c, dst, src)
+}