# ---------------------------------------------------------------------------
# sm4/asm_amd64.s: adds the assembly routine encryptBlockAsm(xk, dst, src).
#
# What the added code does, as written:
#   * Loads the three pointer arguments into AX (xk), BX (dst) and DX (src).
#   * Inserts the four 32-bit words at src+0/4/8/12 into lane 0 of t0..t3
#     (PINSRD $0) and applies PSHUFB with flip_mask to each.
#   * Runs a loop with CX as a byte offset into xk: CX starts at 0, advances
#     by 16 per pass and the loop repeats while CX < 4*32 (128), i.e. eight
#     passes. Each pass performs four rounds; a round inserts one 32-bit word
#     from xk (offsets 0, 4, 8, 12 from AX+CX), XORs it with three of the
#     state registers, applies SM4_TAO_L1(x, y) and XORs the result into the
#     fourth state register. In total 32 words of xk are consumed.
#   * Applies PSHUFB with flip_mask again and stores t3, t2, t1, t0 to
#     dst+0, +4, +8, +12 respectively.
#
# t0..t3, x, y, flip_mask and SM4_TAO_L1 are not defined in this patch;
# presumably they are register aliases / a macro declared earlier in the same
# file -- TODO confirm.
#
# NOTE(review): PINSRD $0 writes only lane 0, so the other lanes of t0..t3
#   hold whatever was there on entry. Only lane 0 of t2/t1/t0 is extracted
#   for the output.
# NOTE(review): MOVUPS t3, 0(BX) is a 16-byte store; the three MOVL stores
#   that follow overwrite dst+4..dst+15, so only the low 4 bytes of that
#   store survive. A PEXTRD/MOVL pair for t3 would mirror the other words.
# NOTE(review): the added done_sm4 label is not the target of any jump in
#   the added function.
# NOTE(review): the hunk context shows `loop:` and `done_sm4:` also exist in
#   the preceding function; this relies on labels being scoped per TEXT
#   block -- confirm against the Go assembler documentation.
# ---------------------------------------------------------------------------
diff --git a/sm4/asm_amd64.s b/sm4/asm_amd64.s
index 2a8c690..0887ed1 100644
--- a/sm4/asm_amd64.s
+++ b/sm4/asm_amd64.s
@@ -321,6 +321,74 @@ loop:
 done_sm4:
 	RET
 
+// func encryptBlockAsm(xk *uint32, dst, src *byte)
+TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
+	MOVQ xk+0(FP), AX
+	MOVQ dst+8(FP), BX
+	MOVQ src+16(FP), DX
+
+	PINSRD $0, 0(DX), t0
+	PSHUFB flip_mask<>(SB), t0
+
+	PINSRD $0, 4(DX), t1
+	PSHUFB flip_mask<>(SB), t1
+
+	PINSRD $0, 8(DX), t2
+	PSHUFB flip_mask<>(SB), t2
+
+	PINSRD $0, 12(DX), t3
+	PSHUFB flip_mask<>(SB), t3
+
+	XORL CX, CX
+
+loop:
+	PINSRD $0, 0(AX)(CX*1), x
+	PXOR t1, x
+	PXOR t2, x
+	PXOR t3, x
+
+	SM4_TAO_L1(x, y)
+	PXOR x, t0
+
+	PINSRD $0, 4(AX)(CX*1), x
+	PXOR t0, x
+	PXOR t2, x
+	PXOR t3, x
+	SM4_TAO_L1(x, y)
+	PXOR x, t1
+
+	PINSRD $0, 8(AX)(CX*1), x
+	PXOR t0, x
+	PXOR t1, x
+	PXOR t3, x
+	SM4_TAO_L1(x, y)
+	PXOR x, t2
+
+	PINSRD $0, 12(AX)(CX*1), x
+	PXOR t0, x
+	PXOR t1, x
+	PXOR t2, x
+	SM4_TAO_L1(x, y)
+	PXOR x, t3
+
+	ADDL $16, CX
+	CMPL CX, $4*32
+	JB loop
+
+	PSHUFB flip_mask<>(SB), t3
+	PSHUFB flip_mask<>(SB), t2
+	PSHUFB flip_mask<>(SB), t1
+	PSHUFB flip_mask<>(SB), t0
+	MOVUPS t3, 0(BX)
+	PEXTRD $0, t2, R8
+	MOVL R8, 4(BX)
+	PEXTRD $0, t1, R8
+	MOVL R8, 8(BX)
+	PEXTRD $0, t0, R8
+	MOVL R8, 12(BX)
+done_sm4:
+	RET
+
 // func xorBytesSSE2(dst, a, b *byte, n int)
 TEXT ·xorBytesSSE2(SB), NOSPLIT, $0
 	MOVQ dst+0(FP), BX
# ---------------------------------------------------------------------------
# sm4/cipher_asm.go: declares encryptBlockAsm and switches the single-block
# paths over to it.
#
#   * Adds the Go declaration `func encryptBlockAsm(xk *uint32, dst, src
#     *byte)` marked //go:noescape, next to the existing encryptBlocksAsm.
#   * (*sm4CipherAsm).Encrypt and Decrypt previously allocated two
#     FourBlocksSize scratch slices per call, copied src in, ran
#     encryptBlocksAsm and copied BlockSize bytes back out. They now call
#     encryptBlockAsm directly on &dst[0] / &src[0], with c.enc for Encrypt
#     and c.dec for Decrypt, so the per-call allocations and copies go away.
#
# NOTE(review): only the InexactOverlap check is visible in the hunk context.
#   The assembly reads 16 bytes from src and writes 16 bytes to dst, so the
#   callers must guarantee both slices hold at least BlockSize bytes --
#   presumably checked just above the visible context; verify.
# ---------------------------------------------------------------------------
diff --git a/sm4/cipher_asm.go b/sm4/cipher_asm.go
index 101ed60..345f193 100644
--- a/sm4/cipher_asm.go
+++ b/sm4/cipher_asm.go
@@ -11,6 +11,9 @@ import (
 //go:noescape
 func encryptBlocksAsm(xk *uint32, dst, src *byte)
 
+//go:noescape
+func encryptBlockAsm(xk *uint32, dst, src *byte)
+
 //go:noescape
 func expandKeyAsm(key *byte, ck, enc, dec *uint32)
 
@@ -47,11 +50,7 @@ func (c *sm4CipherAsm) Encrypt(dst, src []byte) {
 	if InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
 		panic("sm4: invalid buffer overlap")
 	}
-	var src64 []byte = make([]byte, FourBlocksSize)
-	var dst64 []byte = make([]byte, FourBlocksSize)
-	copy(src64, src)
-	encryptBlocksAsm(&c.enc[0], &dst64[0], &src64[0])
-	copy(dst, dst64[:BlockSize])
+	encryptBlockAsm(&c.enc[0], &dst[0], &src[0])
 }
 
 func (c *sm4CipherAsm) Decrypt(dst, src []byte) {
@@ -64,9 +63,5 @@ func (c *sm4CipherAsm) Decrypt(dst, src []byte) {
 	if InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
 		panic("sm4: invalid buffer overlap")
 	}
-	var src64 []byte = make([]byte, FourBlocksSize)
-	var dst64 []byte = make([]byte, FourBlocksSize)
-	copy(src64, src)
-	encryptBlocksAsm(&c.dec[0], &dst64[0], &src64[0])
-	copy(dst, dst64[:BlockSize])
+	encryptBlockAsm(&c.dec[0], &dst[0], &src[0])
 }
# ---------------------------------------------------------------------------
# sm4_test/benchmark_test.go: makes the benchmarks cipher-agnostic and adds
# an AES counterpart for every existing SM4 benchmark.
#
#   * Imports crypto/aes.
#   * CBC: the loop bodies move into helpers taking a cipher.Block
#     (benchmarkCBCEncrypt1K, benchmarkSM4CBCDecrypt1K); the exported
#     Benchmark{AES,SM4}CBC{Encrypt,Decrypt}1K functions build the cipher
#     from a zero 16-byte key and delegate.
#   * Stream modes: the timed loop moves into benchmarkStream;
#     benchmarkSM4Stream and the new benchmarkAESStream construct the block
#     cipher and delegate. AES variants of the CFB/OFB/CTR benchmarks are
#     added.
#   * GCM: benchmarkGCMSign / benchmarkGCMSeal / benchmarkGCMOpen take a
#     cipher.AEAD; benchmark{AES,SM4}GCM{Sign,Seal,Open} construct it. AES
#     variants of the 1K and 8K benchmarks are added.
#
# NOTE(review): the shared CBC decrypt helper is named
#   benchmarkSM4CBCDecrypt1K although it is also used for AES; the encrypt
#   helper is named benchmarkCBCEncrypt1K.
# NOTE(review): benchmarkStream carries two commented-out lines
#   (`//var key [16]byte`, `//c, _ := sm4.NewCipher(key[:])`).
# NOTE(review): benchmarkSM4Stream and benchmarkAESStream call b.SetBytes and
#   then benchmarkStream calls it again with the same value.
# NOTE(review): benchmarkAESGCMSeal and benchmarkAESGCMOpen name the AES AEAD
#   `sm4gcm`; benchmarkAESGCMSign uses `aesgcm`.
# NOTE(review): errors returned by NewCipher and NewGCM are discarded with
#   `_` throughout.
# ---------------------------------------------------------------------------
diff --git a/sm4_test/benchmark_test.go b/sm4_test/benchmark_test.go
index 26205e7..686aee0 100644
--- a/sm4_test/benchmark_test.go
+++ b/sm4_test/benchmark_test.go
@@ -1,50 +1,87 @@
 package sm4_test
 
 import (
+	"crypto/aes"
 	"crypto/cipher"
 	"testing"
 
 	"github.com/emmansun/gmsm/sm4"
 )
 
-func BenchmarkSM4CBCEncrypt1K(b *testing.B) {
+func benchmarkCBCEncrypt1K(b *testing.B, block cipher.Block) {
 	buf := make([]byte, 1024)
 	b.SetBytes(int64(len(buf)))
 
-	var key [16]byte
 	var iv [16]byte
-	c, _ := sm4.NewCipher(key[:])
-	cbc := cipher.NewCBCEncrypter(c, iv[:])
+	cbc := cipher.NewCBCEncrypter(block, iv[:])
 	for i := 0; i < b.N; i++ {
 		cbc.CryptBlocks(buf, buf)
 	}
 }
 
-func BenchmarkSM4CBCDecrypt1K(b *testing.B) {
+func BenchmarkAESCBCEncrypt1K(b *testing.B) {
+	var key [16]byte
+	c, _ := aes.NewCipher(key[:])
+	benchmarkCBCEncrypt1K(b, c)
+}
+
+func BenchmarkSM4CBCEncrypt1K(b *testing.B) {
+	var key [16]byte
+	c, _ := sm4.NewCipher(key[:])
+	benchmarkCBCEncrypt1K(b, c)
+}
+
+func benchmarkSM4CBCDecrypt1K(b *testing.B, block cipher.Block) {
 	buf := make([]byte, 1024)
 	b.SetBytes(int64(len(buf)))
 
-	var key [16]byte
 	var iv [16]byte
-	c, _ := sm4.NewCipher(key[:])
-	cbc := cipher.NewCBCDecrypter(c, iv[:])
+	cbc := cipher.NewCBCDecrypter(block, iv[:])
 	for i := 0; i < b.N; i++ {
 		cbc.CryptBlocks(buf, buf)
 	}
 }
 
+func BenchmarkAESCBCDecrypt1K(b *testing.B) {
+	var key [16]byte
+	c, _ := aes.NewCipher(key[:])
+	benchmarkSM4CBCDecrypt1K(b, c)
+}
+
+func BenchmarkSM4CBCDecrypt1K(b *testing.B) {
+	var key [16]byte
+	c, _ := sm4.NewCipher(key[:])
+	benchmarkSM4CBCDecrypt1K(b, c)
+}
+
+func benchmarkStream(b *testing.B, block cipher.Block, mode func(cipher.Block, []byte) cipher.Stream, buf []byte) {
+	b.SetBytes(int64(len(buf)))
+
+	//var key [16]byte
+	var iv [16]byte
+	//c, _ := sm4.NewCipher(key[:])
+	stream := mode(block, iv[:])
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		stream.XORKeyStream(buf, buf)
+	}
+}
+
 func benchmarkSM4Stream(b *testing.B, mode func(cipher.Block, []byte) cipher.Stream, buf []byte) {
 	b.SetBytes(int64(len(buf)))
 
 	var key [16]byte
-	var iv [16]byte
 	c, _ := sm4.NewCipher(key[:])
-	stream := mode(c, iv[:])
+	benchmarkStream(b, c, mode, buf)
+}
 
-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		stream.XORKeyStream(buf, buf)
-	}
+func benchmarkAESStream(b *testing.B, mode func(cipher.Block, []byte) cipher.Stream, buf []byte) {
+	b.SetBytes(int64(len(buf)))
+
+	var key [16]byte
+	c, _ := aes.NewCipher(key[:])
+	benchmarkStream(b, c, mode, buf)
 }
 
 // If we test exactly 1K blocks, we would generate exact multiples of
@@ -54,101 +91,182 @@ func benchmarkSM4Stream(b *testing.B, mode func(cipher.Block, []byte) cipher.Str
 const almost1K = 1024 - 5
 const almost8K = 8*1024 - 5
 
+func BenchmarkAESCFBEncrypt1K(b *testing.B) {
+	benchmarkAESStream(b, cipher.NewCFBEncrypter, make([]byte, almost1K))
+}
+
 func BenchmarkSM4CFBEncrypt1K(b *testing.B) {
 	benchmarkSM4Stream(b, cipher.NewCFBEncrypter, make([]byte, almost1K))
 }
 
+func BenchmarkAESCFBDecrypt1K(b *testing.B) {
+	benchmarkAESStream(b, cipher.NewCFBDecrypter, make([]byte, almost1K))
+}
+
 func BenchmarkSM4CFBDecrypt1K(b *testing.B) {
 	benchmarkSM4Stream(b, cipher.NewCFBDecrypter, make([]byte, almost1K))
 }
 
+func BenchmarkAESCFBDecrypt8K(b *testing.B) {
+	benchmarkAESStream(b, cipher.NewCFBDecrypter, make([]byte, almost8K))
+}
+
 func BenchmarkSM4CFBDecrypt8K(b *testing.B) {
 	benchmarkSM4Stream(b, cipher.NewCFBDecrypter, make([]byte, almost8K))
 }
 
+func BenchmarkAESOFB1K(b *testing.B) {
+	benchmarkAESStream(b, cipher.NewOFB, make([]byte, almost1K))
+}
+
 func BenchmarkSM4OFB1K(b *testing.B) {
 	benchmarkSM4Stream(b, cipher.NewOFB, make([]byte, almost1K))
 }
 
+func BenchmarkAESCTR1K(b *testing.B) {
+	benchmarkAESStream(b, cipher.NewCTR, make([]byte, almost1K))
+}
+
 func BenchmarkSM4CTR1K(b *testing.B) {
 	benchmarkSM4Stream(b, cipher.NewCTR, make([]byte, almost1K))
 }
 
+func BenchmarkAESCTR8K(b *testing.B) {
+	benchmarkAESStream(b, cipher.NewCTR, make([]byte, almost8K))
+}
+
 func BenchmarkSM4CTR8K(b *testing.B) {
 	benchmarkSM4Stream(b, cipher.NewCTR, make([]byte, almost8K))
 }
 
-func benchmarkSM4GCMSign(b *testing.B, buf []byte) {
+func benchmarkGCMSign(b *testing.B, aead cipher.AEAD, buf []byte) {
 	b.SetBytes(int64(len(buf)))
 
-	var key [16]byte
 	var nonce [12]byte
-	c, _ := sm4.NewCipher(key[:])
-	sm4gcm, _ := cipher.NewGCM(c)
 	var out []byte
 
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		out = sm4gcm.Seal(out[:0], nonce[:], nil, buf)
+		out = aead.Seal(out[:0], nonce[:], nil, buf)
 	}
 }
 
+func benchmarkAESGCMSign(b *testing.B, buf []byte) {
+	var key [16]byte
+	c, _ := aes.NewCipher(key[:])
+	aesgcm, _ := cipher.NewGCM(c)
+	benchmarkGCMSign(b, aesgcm, buf)
+}
+
+func benchmarkSM4GCMSign(b *testing.B, buf []byte) {
+	var key [16]byte
+	c, _ := sm4.NewCipher(key[:])
+	sm4gcm, _ := cipher.NewGCM(c)
+	benchmarkGCMSign(b, sm4gcm, buf)
+}
+
+func benchmarkGCMSeal(b *testing.B, aead cipher.AEAD, buf []byte) {
+	b.SetBytes(int64(len(buf)))
+
+	var nonce [12]byte
+	var ad [13]byte
+	var out []byte
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		out = aead.Seal(out[:0], nonce[:], buf, ad[:])
+	}
+}
+
+func benchmarkAESGCMSeal(b *testing.B, buf []byte) {
+	var key [16]byte
+	c, _ := aes.NewCipher(key[:])
+	sm4gcm, _ := cipher.NewGCM(c)
+	benchmarkGCMSeal(b, sm4gcm, buf)
+}
+
 func benchmarkSM4GCMSeal(b *testing.B, buf []byte) {
-	b.SetBytes(int64(len(buf)))
-
 	var key [16]byte
-	var nonce [12]byte
-	var ad [13]byte
 	c, _ := sm4.NewCipher(key[:])
 	sm4gcm, _ := cipher.NewGCM(c)
-	var out []byte
-
-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		out = sm4gcm.Seal(out[:0], nonce[:], buf, ad[:])
-	}
+	benchmarkGCMSeal(b, sm4gcm, buf)
 }
 
-func benchmarkSM4GCMOpen(b *testing.B, buf []byte) {
+func benchmarkGCMOpen(b *testing.B, aead cipher.AEAD, buf []byte) {
 	b.SetBytes(int64(len(buf)))
 
-	var key [16]byte
 	var nonce [12]byte
 	var ad [13]byte
-	c, _ := sm4.NewCipher(key[:])
-	sm4gcm, _ := cipher.NewGCM(c)
 	var out []byte
-	out = sm4gcm.Seal(out[:0], nonce[:], buf, ad[:])
+	out = aead.Seal(out[:0], nonce[:], buf, ad[:])
 
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		_, err := sm4gcm.Open(buf[:0], nonce[:], out, ad[:])
+		_, err := aead.Open(buf[:0], nonce[:], out, ad[:])
 		if err != nil {
 			b.Errorf("Open: %v", err)
 		}
 	}
 }
 
+func benchmarkAESGCMOpen(b *testing.B, buf []byte) {
+	var key [16]byte
+	c, _ := aes.NewCipher(key[:])
+	sm4gcm, _ := cipher.NewGCM(c)
+	benchmarkGCMOpen(b, sm4gcm, buf)
+}
+
+func benchmarkSM4GCMOpen(b *testing.B, buf []byte) {
+	var key [16]byte
+	c, _ := sm4.NewCipher(key[:])
+	sm4gcm, _ := cipher.NewGCM(c)
+	benchmarkGCMOpen(b, sm4gcm, buf)
+}
+
+func BenchmarkAESGCMSeal1K(b *testing.B) {
+	benchmarkAESGCMSeal(b, make([]byte, 1024))
+}
+
 func BenchmarkSM4GCMSeal1K(b *testing.B) {
 	benchmarkSM4GCMSeal(b, make([]byte, 1024))
 }
 
+func BenchmarkAESGCMOpen1K(b *testing.B) {
+	benchmarkAESGCMOpen(b, make([]byte, 1024))
+}
+
 func BenchmarkSM4GCMOpen1K(b *testing.B) {
 	benchmarkSM4GCMOpen(b, make([]byte, 1024))
 }
 
+func BenchmarkAESGCMSign1K(b *testing.B) {
+	benchmarkAESGCMSign(b, make([]byte, 1024))
+}
+
 func BenchmarkSM4GCMSign1K(b *testing.B) {
 	benchmarkSM4GCMSign(b, make([]byte, 1024))
 }
 
+func BenchmarkAESGCMSign8K(b *testing.B) {
+	benchmarkAESGCMSign(b, make([]byte, 8*1024))
+}
+
 func BenchmarkSM4GCMSign8K(b *testing.B) {
 	benchmarkSM4GCMSign(b, make([]byte, 8*1024))
 }
 
+func BenchmarkAESGCMSeal8K(b *testing.B) {
+	benchmarkAESGCMSeal(b, make([]byte, 8*1024))
+}
+
 func BenchmarkSM4GCMSeal8K(b *testing.B) {
 	benchmarkSM4GCMSeal(b, make([]byte, 8*1024))
 }
 
+func BenchmarkAESGCMOpen8K(b *testing.B) {
+	benchmarkAESGCMOpen(b, make([]byte, 8*1024))
+}
+
 func BenchmarkSM4GCMOpen8K(b *testing.B) {
 	benchmarkSM4GCMOpen(b, make([]byte, 8*1024))
 }