MAGIC - support ccm mode

2025-07-14 21:36:01 +08:00 · 2021-03-31 11:55:52 +08:00 · 2021-03-31 11:55:52 +08:00 · 85b3ecb129
commit 85b3ecb129
parent 75b6f26331
14 changed files with 814 additions and 118 deletions
--- a/cipher/ccm.go
+++ b/cipher/ccm.go
@ -0,0 +1,237 @@
 package cipher
 import (
 	goCipher "crypto/cipher"
 	"crypto/subtle"
 	"encoding/binary"
 	"math"
 	"errors"
 )
 const (
 	ccmBlockSize         = 16
 	ccmTagSize           = 16
 	ccmMinimumTagSize    = 4
 	ccmStandardNonceSize = 12
 )
 // ccmAble is an interface implemented by ciphers that have a specific optimized
 // implementation of CCM. NewCCMWithNonceAndTagSize hands construction over to
 // NewCCM when the supplied block cipher satisfies this interface.
 type ccmAble interface {
 	NewCCM(nonceSize, tagSize int) (goCipher.AEAD, error)
 }
 // ccm is the generic CCM AEAD built on top of a block cipher.
 type ccm struct {
 	// cipher is the underlying block cipher used for both the MAC and the
 	// counter-mode key stream.
 	cipher    goCipher.Block
 	// nonceSize is the nonce length, in bytes, that Seal and Open require.
 	nonceSize int
 	// tagSize is the length, in bytes, of the authentication tag.
 	tagSize   int
 }
 // NonceSize returns the nonce length, in bytes, that must be passed to Seal
 // and Open.
 func (c *ccm) NonceSize() int {
 	return c.nonceSize
 }
 // Overhead returns the number of bytes Seal adds to the plaintext, which is
 // the configured tag size.
 func (c *ccm) Overhead() int {
 	return c.tagSize
 }
 // MaxLength returns the largest plaintext length, in bytes, that Seal accepts.
 // It is computed by maxlen from the width of the length field
 // (15 - nonce size octets) and the tag size.
 func (c *ccm) MaxLength() int {
 	return maxlen(15-c.NonceSize(), c.Overhead())
 }
 // maxlen reports the largest message length, in bytes, that fits in a length
 // field of L octets, clamped so that the length plus a tagsize-byte tag is
 // still representable as an int on the current platform.
 func maxlen(L, tagsize int) int {
 	// Upper bound when int is 64 bits wide, leaving room for the tag.
 	ceiling := uint64(math.MaxInt64) - uint64(tagsize)
 	limit := uint64(1)<<(8*L) - 1
 	if L > 8 || limit > ceiling {
 		limit = ceiling
 	}
 	if uint64(int(limit)) == limit {
 		return int(limit)
 	}
 	// The value does not survive a round trip through int: int is 32 bits.
 	return math.MaxInt32 - tagsize
 }
 // NewCCM returns the given 128-bit block cipher wrapped in CCM with the
 // standard nonce length (12 bytes) and the full 16-byte tag.
 func NewCCM(cipher goCipher.Block) (goCipher.AEAD, error) {
 	return NewCCMWithNonceAndTagSize(cipher, ccmStandardNonceSize, ccmTagSize)
 }
 // NewCCMWithNonceSize returns the given 128-bit block cipher wrapped in CCM,
 // which accepts nonces of the given length and produces 16-byte tags. The
 // nonce length must be between 7 and 13 bytes, inclusive; other values are
 // rejected with an error.
 func NewCCMWithNonceSize(cipher goCipher.Block, size int) (goCipher.AEAD, error) {
 	return NewCCMWithNonceAndTagSize(cipher, size, ccmTagSize)
 }
 // NewCCMWithTagSize returns the given 128-bit block cipher wrapped in CCM,
 // which generates tags with the given length and uses the standard 12-byte
 // nonce.
 //
 // Even tag sizes between 4 and 16 bytes are allowed; any other value is
 // rejected with an error.
 func NewCCMWithTagSize(cipher goCipher.Block, tagSize int) (goCipher.AEAD, error) {
 	return NewCCMWithNonceAndTagSize(cipher, ccmStandardNonceSize, tagSize)
 }
 // NewCCMWithNonceAndTagSize returns the given 128-bit block cipher wrapped in
 // CCM (RFC 3610, https://tools.ietf.org/html/rfc3610) using nonces of
 // nonceSize bytes and tags of tagSize bytes.
 //
 // tagSize must be an even value between 4 and 16 and nonceSize must be between
 // 7 and 13; otherwise an error is returned. When the cipher provides its own
 // CCM implementation (ccmAble) that implementation is returned; otherwise the
 // cipher must have a 16-byte block size.
 func NewCCMWithNonceAndTagSize(cipher goCipher.Block, nonceSize, tagSize int) (goCipher.AEAD, error) {
 	if tagSize < ccmMinimumTagSize || tagSize > ccmBlockSize || tagSize&1 != 0 {
 		return nil, errors.New("cipher: incorrect tag size given to CCM")
 	}
 	if nonceSize <= 0 {
 		return nil, errors.New("cipher: the nonce can't have zero length, or the security of the key will be immediately compromised")
 	}
 	// The nonce and the message-length field share 15 octets of a block, so the
 	// length field is 15-nonceSize octets wide and must be 2..8 octets.
 	if lenSize := 15 - nonceSize; lenSize < 2 || lenSize > 8 {
 		return nil, errors.New("cipher: invalid ccm nonce size, should be in [7,13]")
 	}
 	if accelerated, ok := cipher.(ccmAble); ok {
 		return accelerated.NewCCM(nonceSize, tagSize)
 	}
 	if cipher.BlockSize() != ccmBlockSize {
 		return nil, errors.New("cipher: NewCCM requires 128-bit block cipher")
 	}
 	return &ccm{cipher: cipher, nonceSize: nonceSize, tagSize: tagSize}, nil
 }
 // deriveCounter writes the leading part of a CCM counter block into counter:
 // a flags octet holding 14 - nonceSize (the length-field width minus one, as
 // the length field is 15 - nonceSize octets) followed by the nonce. The
 // octets after the nonce are not touched; Seal and Open pass a zeroed array.
 // See https://tools.ietf.org/html/rfc3610
 func (c *ccm) deriveCounter(counter *[ccmBlockSize]byte, nonce []byte) {
 	counter[0] = byte(14 - c.nonceSize)
 	copy(counter[1:], nonce)
 }
 // cmac folds data into the running CBC-MAC state held in out: every 16-byte
 // block of data is xored into out and out is then encrypted in place. A
 // trailing partial block is treated as if it were padded with zero bytes.
 func (c *ccm) cmac(out, data []byte) {
 	for len(data) >= ccmBlockSize {
 		XorBytes(out, out, data)
 		c.cipher.Encrypt(out, out)
 		data = data[ccmBlockSize:]
 	}
 	if len(data) > 0 {
 		// XorBytes only touches as many bytes as its shorter operand, so the
 		// rest of out is left unchanged. That equals xoring with a zero-padded
 		// block, and no temporary copy is needed.
 		XorBytes(out, out, data)
 		c.cipher.Encrypt(out, out)
 	}
 }
 // auth computes the CCM authentication tag over additionalData and plaintext
 // for the given nonce, as described in RFC 3610 section 2.2
 // (https://tools.ietf.org/html/rfc3610), and returns its first tagSize bytes
 // after xoring the MAC with tagMask.
 func (c *ccm) auth(nonce, plaintext, additionalData []byte, tagMask *[ccmBlockSize]byte) []byte {
 	// Build block B_0: flags octet, nonce, then the message length.
 	var out [ccmTagSize]byte
 	if len(additionalData) > 0 {
 		out[0] = 1 << 6 // 64*Adata
 	}
 	// tagSize is even, so (tagSize-2)<<2 places (tagSize-2)/2 in bits 3..5.
 	out[0] |= byte(c.tagSize-2) << 2
 	out[0] |= byte(14 - c.nonceSize)
 	// The length is written as 8 octets first; the nonce copy below overwrites
 	// the high-order octets that do not belong to the length field.
 	binary.BigEndian.PutUint64(out[ccmBlockSize-8:], uint64(len(plaintext)))
 	copy(out[1:], nonce)
 	c.cipher.Encrypt(out[:], out[:])
 	if n := uint64(len(additionalData)); n > 0 {
 		// The first associated-data block starts with the encoded length of
 		// the associated data.
 		var block [ccmBlockSize]byte
 		i := 2
 		switch {
 		case n <= 0xfeff:
 			// 0 < l(a) < 2^16 - 2^8: two octets holding l(a).
 			binary.BigEndian.PutUint16(block[:i], uint16(n))
 		case n < uint64(1<<32):
 			// 2^16 - 2^8 <= l(a) < 2^32: 0xff 0xfe followed by four octets.
 			block[0], block[1] = 0xff, 0xfe
 			i = 2 + 4
 			binary.BigEndian.PutUint32(block[2:i], uint32(n))
 		default:
 			// 2^32 <= l(a) < 2^64: 0xff 0xff followed by eight octets.
 			block[0], block[1] = 0xff, 0xff
 			i = 2 + 8
 			binary.BigEndian.PutUint64(block[2:i], n)
 		}
 		// Fill the rest of the first block with associated data, then feed
 		// whatever did not fit.
 		i = copy(block[i:], additionalData)
 		c.cmac(out[:], block[:])
 		c.cmac(out[:], additionalData[i:])
 	}
 	if len(plaintext) > 0 {
 		c.cmac(out[:], plaintext)
 	}
 	XorWords(out[:], out[:], tagMask[:])
 	return out[:c.tagSize]
 }
 // Seal encrypts and authenticates plaintext, authenticates data, and appends
 // the ciphertext followed by the tag to dst, returning the extended slice.
 //
 // It panics if nonce does not have the configured length, if plaintext is
 // longer than MaxLength, or if the output buffer overlaps plaintext inexactly.
 // Reusing the plaintext storage (dst = plaintext[:0]) is supported.
 func (c *ccm) Seal(dst, nonce, plaintext, data []byte) []byte {
 	if len(nonce) != c.nonceSize {
 		panic("cipher: incorrect nonce length given to CCM")
 	}
 	if uint64(len(plaintext)) > uint64(c.MaxLength()) {
 		panic("cipher: message too large for CCM")
 	}
 	ret, out := SliceForAppend(dst, len(plaintext)+c.tagSize)
 	if InexactOverlap(out, plaintext) {
 		panic("cipher: invalid buffer overlap")
 	}
 	var counter, tagMask [ccmBlockSize]byte
 	c.deriveCounter(&counter, nonce)
 	// Counter block 0 masks the tag; the key stream starts at counter 1.
 	c.cipher.Encrypt(tagMask[:], counter[:])
 	// The tag covers the plaintext, so it has to be computed before the
 	// key stream is applied: out may share storage with plaintext, in which
 	// case encryption overwrites the plaintext bytes.
 	tag := c.auth(nonce, plaintext, data, &tagMask)
 	counter[len(counter)-1] |= 1
 	ctr := goCipher.NewCTR(c.cipher, counter[:])
 	ctr.XORKeyStream(out, plaintext)
 	copy(out[len(plaintext):], tag)
 	return ret
 }
 var errOpen = errors.New("cipher: message authentication failed")
 // Open verifies and decrypts ciphertext (which ends with the tag), verifies
 // data, and appends the recovered plaintext to dst. It returns errOpen when
 // the input is too short, too long, or fails authentication, and panics on a
 // wrong nonce length, an unusable tag size, or an inexact buffer overlap.
 func (c *ccm) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) {
 	if len(nonce) != c.nonceSize {
 		panic("cipher: incorrect nonce length given to CCM")
 	}
 	// A zero or tiny tagSize would make every tag comparison trivially pass,
 	// so refuse to run with one.
 	if c.tagSize < ccmMinimumTagSize {
 		panic("cipher: incorrect CCM tag size")
 	}
 	if total := len(ciphertext); total < c.tagSize || total > c.MaxLength()+c.Overhead() {
 		return nil, errOpen
 	}
 	// Split the input into the encrypted body and the trailing tag.
 	split := len(ciphertext) - c.tagSize
 	receivedTag := ciphertext[split:]
 	ciphertext = ciphertext[:split]
 	var counter, tagMask [ccmBlockSize]byte
 	c.deriveCounter(&counter, nonce)
 	c.cipher.Encrypt(tagMask[:], counter[:])
 	ret, out := SliceForAppend(dst, len(ciphertext))
 	if InexactOverlap(out, ciphertext) {
 		panic("cipher: invalid buffer overlap")
 	}
 	counter[len(counter)-1] |= 1
 	goCipher.NewCTR(c.cipher, counter[:]).XORKeyStream(out, ciphertext)
 	computedTag := c.auth(nonce, out, data, &tagMask)
 	if subtle.ConstantTimeCompare(computedTag, receivedTag) == 1 {
 		return ret, nil
 	}
 	// Authentication failed: wipe the decrypted bytes so that no
 	// unauthenticated plaintext is left behind in the caller's buffer.
 	for i := range out {
 		out[i] = 0
 	}
 	return nil, errOpen
 }
--- a/cipher/ccm_test.go
+++ b/cipher/ccm_test.go
@ -0,0 +1,263 @@
 package cipher
 import (
 	"crypto/aes"
 	"encoding/hex"
 	"testing"
 )
 var aesCCMTests = []struct {
 	key, nonce, plaintext, ad, result string
 	tagSize                           int
 }{
 	{
 		"c0c1c2c3c4c5c6c7c8c9cacbcccdcecf",
 		"00000003020100a0a1a2a3a4a5",
 		"08090a0b0c0d0e0f101112131415161718191a1b1c1d1e",
 		"0001020304050607",
 		"588c979a61c663d2f066d0c2c0f989806d5f6b61dac38417e8d12cfdf926e0",
 		8,
 	},
 	{
 		"c0c1c2c3c4c5c6c7c8c9cacbcccdcecf",
 		"00000004030201a0a1a2a3a4a5",
 		"08090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f",
 		"0001020304050607",
 		"72c91a36e135f8cf291ca894085c87e3cc15c439c9e43a3ba091d56e10400916",
 		8,
 	},
 	{
 		"c0c1c2c3c4c5c6c7c8c9cacbcccdcecf",
 		"00000005040302a0a1a2a3a4a5",
 		"08090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f20",
 		"0001020304050607",
 		"51b1e5f44a197d1da46b0f8e2d282ae871e838bb64da8596574adaa76fbd9fb0c5",
 		8,
 	},
 	{
 		"c0c1c2c3c4c5c6c7c8c9cacbcccdcecf",
 		"00000006050403a0a1a2a3a4a5",
 		"0c0d0e0f101112131415161718191a1b1c1d1e",
 		"000102030405060708090a0b",
 		"a28c6865939a9a79faaa5c4c2a9d4a91cdac8c96c861b9c9e61ef1",
 		8,
 	},
 	{
 		"c0c1c2c3c4c5c6c7c8c9cacbcccdcecf",
 		"00000007060504a0a1a2a3a4a5",
 		"0c0d0e0f101112131415161718191a1b1c1d1e1f",
 		"000102030405060708090a0b",
 		"dcf1fb7b5d9e23fb9d4e131253658ad86ebdca3e51e83f077d9c2d93",
 		8,
 	},
 	{
 		"c0c1c2c3c4c5c6c7c8c9cacbcccdcecf",
 		"00000008070605a0a1a2a3a4a5",
 		"0c0d0e0f101112131415161718191a1b1c1d1e1f20",
 		"000102030405060708090a0b",
 		"6fc1b011f006568b5171a42d953d469b2570a4bd87405a0443ac91cb94",
 		8,
 	},
 	{
 		"c0c1c2c3c4c5c6c7c8c9cacbcccdcecf",
 		"00000009080706a0a1a2a3a4a5",
 		"08090a0b0c0d0e0f101112131415161718191a1b1c1d1e",
 		"0001020304050607",
 		"0135d1b2c95f41d5d1d4fec185d166b8094e999dfed96c048c56602c97acbb7490",
 		10,
 	},
 	{
 		"c0c1c2c3c4c5c6c7c8c9cacbcccdcecf",
 		"0000000a090807a0a1a2a3a4a5",
 		"08090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f",
 		"0001020304050607",
 		"7b75399ac0831dd2f0bbd75879a2fd8f6cae6b6cd9b7db24c17b4433f434963f34b4",
 		10,
 	},
 	{
 		"c0c1c2c3c4c5c6c7c8c9cacbcccdcecf",
 		"0000000b0a0908a0a1a2a3a4a5",
 		"08090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f20",
 		"0001020304050607",
 		"82531a60cc24945a4b8279181ab5c84df21ce7f9b73f42e197ea9c07e56b5eb17e5f4e",
 		10,
 	},
 	{
 		"c0c1c2c3c4c5c6c7c8c9cacbcccdcecf",
 		"0000000c0b0a09a0a1a2a3a4a5",
 		"0c0d0e0f101112131415161718191a1b1c1d1e",
 		"000102030405060708090a0b",
 		"07342594157785152b074098330abb141b947b566aa9406b4d999988dd",
 		10,
 	},
 	{
 		"c0c1c2c3c4c5c6c7c8c9cacbcccdcecf",
 		"0000000d0c0b0aa0a1a2a3a4a5",
 		"0c0d0e0f101112131415161718191a1b1c1d1e1f",
 		"000102030405060708090a0b",
 		"676bb20380b0e301e8ab79590a396da78b834934f53aa2e9107a8b6c022c",
 		10,
 	},
 	{
 		"c0c1c2c3c4c5c6c7c8c9cacbcccdcecf",
 		"0000000e0d0c0ba0a1a2a3a4a5",
 		"0c0d0e0f101112131415161718191a1b1c1d1e1f20",
 		"000102030405060708090a0b",
 		"c0ffa0d6f05bdb67f24d43a4338d2aa4bed7b20e43cd1aa31662e7ad65d6db",
 		10,
 	},
 	{
 		"d7828d13b2b0bdc325a76236df93cc6b",
 		"00412b4ea9cdbe3c9696766cfa",
 		"08e8cf97d820ea258460e96ad9cf5289054d895ceac47c",
 		"0be1a88bace018b1",
 		"4cb97f86a2a4689a877947ab8091ef5386a6ffbdd080f8e78cf7cb0cddd7b3",
 		8,
 	},
 	{
 		"d7828d13b2b0bdc325a76236df93cc6b",
 		"0033568ef7b2633c9696766cfa",
 		"9020ea6f91bdd85afa0039ba4baff9bfb79c7028949cd0ec",
 		"63018f76dc8a1bcb",
 		"4ccb1e7ca981befaa0726c55d378061298c85c92814abc33c52ee81d7d77c08a",
 		8,
 	},
 	{
 		"d7828d13b2b0bdc325a76236df93cc6b",
 		"00f8b678094e3b3c9696766cfa",
 		"e88b6a46c78d63e52eb8c546efb5de6f75e9cc0d",
 		"77b60f011c03e1525899bcae",
 		"5545ff1a085ee2efbf52b2e04bee1e2336c73e3f762c0c7744fe7e3c",
 		8,
 	},
 	{
 		"d7828d13b2b0bdc325a76236df93cc6b",
 		"00d560912d3f703c9696766cfa",
 		"6435acbafb11a82e2f071d7ca4a5ebd93a803ba87f",
 		"cd9044d2b71fdb8120ea60c0",
 		"009769ecabdf48625594c59251e6035722675e04c847099e5ae0704551",
 		8,
 	},
 	{
 		"d7828d13b2b0bdc325a76236df93cc6b",
 		"0042fff8f1951c3c9696766cfa",
 		"8a19b950bcf71a018e5e6701c91787659809d67dbedd18",
 		"d85bc7e69f944fb8",
 		"bc218daa947427b6db386a99ac1aef23ade0b52939cb6a637cf9bec2408897c6ba",
 		10,
 	},
 	{
 		"d7828d13b2b0bdc325a76236df93cc6b",
 		"00920f40e56cdc3c9696766cfa",
 		"1761433c37c5a35fc1f39f406302eb907c6163be38c98437",
 		"74a0ebc9069f5b37",
 		"5810e6fd25874022e80361a478e3e9cf484ab04f447efff6f0a477cc2fc9bf548944",
 		10,
 	},
 	{
 		"d7828d13b2b0bdc325a76236df93cc6b",
 		"0027ca0c7120bc3c9696766cfa",
 		"a434a8e58500c6e41530538862d686ea9e81301b5ae4226bfa",
 		"44a3aa3aae6475ca",
 		"f2beed7bc5098e83feb5b31608f8e29c38819a89c8e776f1544d4151a4ed3a8b87b9ce",
 		10,
 	},
 	{
 		"d7828d13b2b0bdc325a76236df93cc6b",
 		"005b8ccbcd9af83c9696766cfa",
 		"b96b49e21d621741632875db7f6c9243d2d7c2",
 		"ec46bb63b02520c33c49fd70",
 		"31d750a09da3ed7fddd49a2032aabf17ec8ebf7d22c8088c666be5c197",
 		10,
 	},
 	{
 		"d7828d13b2b0bdc325a76236df93cc6b",
 		"003ebe94044b9a3c9696766cfa",
 		"e2fcfbb880442c731bf95167c8ffd7895e337076",
 		"47a65ac78b3d594227e85e71",
 		"e882f1dbd38ce3eda7c23f04dd65071eb41342acdf7e00dccec7ae52987d",
 		10,
 	},
 	{
 		"d7828d13b2b0bdc325a76236df93cc6b",
 		"008d493b30ae8b3c9696766cfa",
 		"abf21c0b02feb88f856df4a37381bce3cc128517d4",
 		"6e37a6ef546d955d34ab6059",
 		"f32905b88a641b04b9c9ffb58cc390900f3da12ab16dce9e82efa16da62059",
 		10,
 	},
 }
 // TestCCM seals every vector in aesCCMTests with AES-CCM, compares the output
 // with the expected ciphertext||tag, and checks that Open recovers the
 // original plaintext.
 func TestCCM(t *testing.T) {
 	for i, tt := range aesCCMTests {
 		// mustDecode fails the test instead of silently continuing with a
 		// truncated input when a vector contains malformed hex.
 		mustDecode := func(field, s string) []byte {
 			b, err := hex.DecodeString(s)
 			if err != nil {
 				t.Fatalf("#%d: invalid %s hex: %v", i, field, err)
 			}
 			return b
 		}
 		nonce := mustDecode("nonce", tt.nonce)
 		plaintext := mustDecode("plaintext", tt.plaintext)
 		ad := mustDecode("ad", tt.ad)
 		key := mustDecode("key", tt.key)
 		c, err := aes.NewCipher(key)
 		if err != nil {
 			t.Fatalf("#%d: %v", i, err)
 		}
 		aesccm, err := NewCCMWithNonceAndTagSize(c, len(nonce), tt.tagSize)
 		if err != nil {
 			t.Fatalf("#%d: %v", i, err)
 		}
 		ct := aesccm.Seal(nil, nonce, plaintext, ad)
 		if ctHex := hex.EncodeToString(ct); ctHex != tt.result {
 			t.Errorf("#%d: got %s, want %s", i, ctHex, tt.result)
 			continue
 		}
 		pt, err := aesccm.Open(nil, nonce, ct, ad)
 		if err != nil {
 			t.Fatalf("#%d: Open failed: %v", i, err)
 		}
 		if ptHex := hex.EncodeToString(pt); ptHex != tt.plaintext {
 			t.Errorf("#%d: got %s, want %s", i, ptHex, tt.plaintext)
 			continue
 		}
 	}
 }
 // TestCCMInvalidTagSize checks that NewCCMWithTagSize rejects tag sizes that
 // are below the minimum, odd, or larger than the block size.
 func TestCCMInvalidTagSize(t *testing.T) {
 	key, err := hex.DecodeString("ab72c77b97cb5fe9a382d9fe81ffdbed")
 	if err != nil {
 		t.Fatal(err)
 	}
 	c, err := aes.NewCipher(key)
 	if err != nil {
 		t.Fatal(err)
 	}
 	for _, tagSize := range []int{0, 1, 2, 3, 5, 15, c.BlockSize() + 1} {
 		aesccm, err := NewCCMWithTagSize(c, tagSize)
 		if aesccm != nil || err == nil {
 			t.Fatalf("NewCCMWithTagSize was successful with an invalid %d-byte tag size", tagSize)
 		}
 	}
 }
 // TestTagFailureOverwrite checks that a failed Open returns a nil result and
 // an error, and that it zeroes the part of dst it decrypted into.
 func TestTagFailureOverwrite(t *testing.T) {
 	key, err := hex.DecodeString("ab72c77b97cb5fe9a382d9fe81ffdbed")
 	if err != nil {
 		t.Fatal(err)
 	}
 	nonce, err := hex.DecodeString("54cc7dc2c37ec006bcc6d1db")
 	if err != nil {
 		t.Fatal(err)
 	}
 	ciphertext, err := hex.DecodeString("0e1bde206a07a9c2c1b65300f8c649972b4401346697138c7a4891ee59867d0c")
 	if err != nil {
 		t.Fatal(err)
 	}
 	c, err := aes.NewCipher(key)
 	if err != nil {
 		t.Fatal(err)
 	}
 	aesccm, err := NewCCM(c)
 	if err != nil {
 		t.Fatal(err)
 	}
 	// Pre-fill the destination so that zeroing can be told apart from a
 	// buffer that was never written.
 	dst := make([]byte, len(ciphertext)-aesccm.Overhead())
 	for i := range dst {
 		dst[i] = 42
 	}
 	result, err := aesccm.Open(dst[:0], nonce, ciphertext, nil)
 	if err == nil {
 		t.Fatal("Bad Open still resulted in nil error.")
 	}
 	if result != nil {
 		t.Fatal("Failed Open returned non-nil result.")
 	}
 	for i := range dst {
 		if dst[i] != 0 {
 			t.Fatal("Failed Open didn't zero dst buffer")
 		}
 	}
 }
--- a/cipher/utils.go
+++ b/cipher/utils.go
@ -0,0 +1,39 @@
 package cipher
 import "unsafe"
 // AnyOverlap reports whether the byte ranges covered by x and y (up to their
 // lengths, ignoring spare capacity) have at least one address in common,
 // regardless of index.
 func AnyOverlap(x, y []byte) bool {
 	if len(x) == 0 || len(y) == 0 {
 		return false
 	}
 	xFirst := uintptr(unsafe.Pointer(&x[0]))
 	yLast := uintptr(unsafe.Pointer(&y[len(y)-1]))
 	if xFirst > yLast {
 		return false
 	}
 	yFirst := uintptr(unsafe.Pointer(&y[0]))
 	xLast := uintptr(unsafe.Pointer(&x[len(x)-1]))
 	return yFirst <= xLast
 }
 // InexactOverlap reports whether x and y share memory at positions that do
 // not line up index for index. Slices that start at the same address, and
 // empty slices, never overlap inexactly; only bytes within the slice lengths
 // are considered, and the two lengths may differ.
 //
 // It can be used to enforce the aliasing rules of the crypto/cipher AEAD,
 // Block, BlockMode and Stream interfaces.
 func InexactOverlap(x, y []byte) bool {
 	switch {
 	case len(x) == 0, len(y) == 0:
 		return false
 	case &x[0] == &y[0]:
 		// Same starting address: any shared bytes correspond exactly.
 		return false
 	}
 	return AnyOverlap(x, y)
 }
 // SliceForAppend extends in by n bytes. head holds the contents of in followed
 // by n additional bytes, and tail is the view of head covering only those n
 // bytes. The backing array of in is reused when its capacity suffices;
 // otherwise a new array is allocated and in is copied into it.
 func SliceForAppend(in []byte, n int) (head, tail []byte) {
 	used := len(in)
 	want := used + n
 	if want <= cap(in) {
 		head = in[:want]
 	} else {
 		head = make([]byte, want)
 		copy(head, in)
 	}
 	return head, head[used:]
 }
--- a/cipher/xor_amd64.go
+++ b/cipher/xor_amd64.go
@ -1,8 +1,8 @@
-package sm4
+package cipher
-// xorBytes xors the bytes in a and b. The destination should have enough
+// XorBytes xors the bytes in a and b. The destination should have enough
 // space, otherwise xorBytes will panic. Returns the number of bytes xor'd.
-func xorBytes(dst, a, b []byte) int {
+func XorBytes(dst, a, b []byte) int {
 	n := len(a)
 	if len(b) < n {
 		n = len(b)
@ -15,8 +15,8 @@ func xorBytes(dst, a, b []byte) int {
 	return n
 }
-func xorWords(dst, a, b []byte) {
+func XorWords(dst, a, b []byte) {
-	xorBytes(dst, a, b)
+	XorBytes(dst, a, b)
 }
 //go:noescape
--- a/cipher/xor_amd64.s
+++ b/cipher/xor_amd64.s
@ -0,0 +1,50 @@
 #include "textflag.h"
 // xorBytesSSE2 stores a[i] XOR b[i] into dst[i] for the n bytes addressed by
 // the three pointers. When n is not a multiple of 16, bytes are peeled off
 // from the end first (single bytes until the remaining count is a multiple of
 // 8, then one 8-byte word if needed); what is left is processed from the
 // start in 16-byte steps using SSE2 registers.
 // NOTE(review): the 16-byte loop executes at least once when entered, so
 // n == 0 does not appear to be handled here; presumably callers filter it
 // out before calling - confirm.
 // func xorBytesSSE2(dst, a, b *byte, n int)
 TEXT ·xorBytesSSE2(SB), NOSPLIT, $0
 	MOVQ  dst+0(FP), BX
 	MOVQ  a+8(FP), SI
 	MOVQ  b+16(FP), CX
 	MOVQ  n+24(FP), DX
 	TESTQ $15, DX            // AND 15 & len, if not zero jump to not_aligned.
 	JNZ   not_aligned
 aligned:
 	MOVQ $0, AX // position in slices
 loop16b:
 	MOVOU (SI)(AX*1), X0   // XOR 16byte forwards.
 	MOVOU (CX)(AX*1), X1
 	PXOR  X1, X0
 	MOVOU X0, (BX)(AX*1)
 	ADDQ  $16, AX
 	CMPQ  DX, AX
 	JNE   loop16b
 	RET
 loop_1b:
 	SUBQ  $1, DX           // XOR 1byte backwards.
 	MOVB  (SI)(DX*1), DI
 	MOVB  (CX)(DX*1), AX
 	XORB  AX, DI
 	MOVB  DI, (BX)(DX*1)
 	TESTQ $7, DX           // AND 7 & len, if not zero jump to loop_1b.
 	JNZ   loop_1b
 	CMPQ  DX, $0           // if len is 0, ret.
 	JE    ret
 	TESTQ $15, DX          // AND 15 & len, if zero jump to aligned.
 	JZ    aligned
 not_aligned:
 	TESTQ $7, DX           // AND $7 & len, if not zero jump to loop_1b.
 	JNE   loop_1b
 	SUBQ  $8, DX           // XOR 8bytes backwards.
 	MOVQ  (SI)(DX*1), DI
 	MOVQ  (CX)(DX*1), AX
 	XORQ  AX, DI
 	MOVQ  DI, (BX)(DX*1)
 	CMPQ  DX, $16          // if len is greater or equal 16 here, it must be aligned.
 	JGE   aligned
 ret:
 	RET
--- a/cipher/xor_generic.go
+++ b/cipher/xor_generic.go
@ -0,0 +1,87 @@
 // +build !amd64
 package cipher
 import (
 	"runtime"
 	"unsafe"
 )
 // XorBytes xors a and b into dst and returns the number of bytes processed,
 // which is the shorter of len(a) and len(b). dst must have room for that many
 // bytes, otherwise XorBytes panics.
 func XorBytes(dst, a, b []byte) int {
 	count := len(b)
 	if len(a) < count {
 		count = len(a)
 	}
 	if count == 0 {
 		return 0
 	}
 	if supportsUnaligned {
 		fastXORBytes(dst, a, b, count)
 	} else {
 		// TODO(hanwen): if (dst, a, b) have common alignment
 		// we could still try fastXORBytes. It is not clear
 		// how often this happens, and it's only worth it if
 		// the block encryption itself is hardware
 		// accelerated.
 		safeXORBytes(dst, a, b, count)
 	}
 	return count
 }
 const wordSize = int(unsafe.Sizeof(uintptr(0)))
 const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x"
 // fastXORBytes xors in bulk. It only works on architectures that
 // support unaligned read/writes.
 // n needs to be smaller or equal than the length of a and b.
 //
 // The first n/wordSize machine words are xored through []uintptr views of the
 // three slices; the remaining n%wordSize bytes are xored one at a time.
 func fastXORBytes(dst, a, b []byte, n int) {
 	// Assert dst has enough space
 	_ = dst[n-1]
 	w := n / wordSize
 	if w > 0 {
 		// Reinterpret the three slice headers with a word-sized element type
 		// so each loop iteration handles wordSize bytes at once.
 		dw := *(*[]uintptr)(unsafe.Pointer(&dst))
 		aw := *(*[]uintptr)(unsafe.Pointer(&a))
 		bw := *(*[]uintptr)(unsafe.Pointer(&b))
 		for i := 0; i < w; i++ {
 			dw[i] = aw[i] ^ bw[i]
 		}
 	}
 	// Finish the bytes that did not fill a whole word.
 	for i := (n - n%wordSize); i < n; i++ {
 		dst[i] = a[i] ^ b[i]
 	}
 }
 // safeXORBytes writes a[i]^b[i] to dst[i] for the first n positions, one byte
 // at a time and in increasing index order. n must not exceed the lengths of
 // a, b and dst.
 func safeXORBytes(dst, a, b []byte, n int) {
 	i := 0
 	for i < n {
 		dst[i] = a[i] ^ b[i]
 		i++
 	}
 }
 // fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture.)
 // The arguments are assumed to be of equal length.
 //
 // It processes len(b)/wordSize machine words; any trailing bytes of b that do
 // not fill a whole word are not processed.
 func fastXORWords(dst, a, b []byte) {
 	// View the three slices with a word-sized element type so each iteration
 	// xors wordSize bytes.
 	dw := *(*[]uintptr)(unsafe.Pointer(&dst))
 	aw := *(*[]uintptr)(unsafe.Pointer(&a))
 	bw := *(*[]uintptr)(unsafe.Pointer(&b))
 	n := len(b) / wordSize
 	for i := 0; i < n; i++ {
 		dw[i] = aw[i] ^ bw[i]
 	}
 }
 // XorWords xors a and b into dst. The slice arguments a and b are assumed to
 // be of equal length that is a multiple of 4 or 8 bytes (depending on
 // architecture). The word-wise implementation is used where unaligned access
 // is supported and the byte-wise one elsewhere.
 func XorWords(dst, a, b []byte) {
 	if !supportsUnaligned {
 		safeXORBytes(dst, a, b, len(b))
 		return
 	}
 	fastXORWords(dst, a, b)
 }
--- a/sm4/asm_amd64.s
+++ b/sm4/asm_amd64.s
@ -358,52 +358,3 @@ loop:
  MOVL R8, 12(BX)
 done_sm4:
 	RET
 // func xorBytesSSE2(dst, a, b *byte, n int)
 TEXT ·xorBytesSSE2(SB), NOSPLIT, $0
 	MOVQ  dst+0(FP), BX
 	MOVQ  a+8(FP), SI
 	MOVQ  b+16(FP), CX
 	MOVQ  n+24(FP), DX
 	TESTQ $15, DX            // AND 15 & len, if not zero jump to not_aligned.
 	JNZ   not_aligned
 aligned:
 	MOVQ $0, AX // position in slices
 loop16b:
 	MOVOU (SI)(AX*1), X0   // XOR 16byte forwards.
 	MOVOU (CX)(AX*1), X1
 	PXOR  X1, X0
 	MOVOU X0, (BX)(AX*1)
 	ADDQ  $16, AX
 	CMPQ  DX, AX
 	JNE   loop16b
 	RET
 loop_1b:
 	SUBQ  $1, DX           // XOR 1byte backwards.
 	MOVB  (SI)(DX*1), DI
 	MOVB  (CX)(DX*1), AX
 	XORB  AX, DI
 	MOVB  DI, (BX)(DX*1)
 	TESTQ $7, DX           // AND 7 & len, if not zero jump to loop_1b.
 	JNZ   loop_1b
 	CMPQ  DX, $0           // if len is 0, ret.
 	JE    ret
 	TESTQ $15, DX          // AND 15 & len, if zero jump to aligned.
 	JZ    aligned
 not_aligned:
 	TESTQ $7, DX           // AND $7 & len, if not zero jump to loop_1b.
 	JNE   loop_1b
 	SUBQ  $8, DX           // XOR 8bytes backwards.
 	MOVQ  (SI)(DX*1), DI
 	MOVQ  (CX)(DX*1), AX
 	XORQ  AX, DI
 	MOVQ  DI, (BX)(DX*1)
 	CMPQ  DX, $16          // if len is greater or equal 16 here, it must be aligned.
 	JGE   aligned
 ret:
 	RET
--- a/sm4/cbc_amd64.go
+++ b/sm4/cbc_amd64.go
@ -1,6 +1,10 @@
 package sm4
-import "crypto/cipher"
+import (
 	"crypto/cipher"
 	smcipher "github.com/emmansun/gmsm/cipher"
 )
 // Assert that sm4CipherAsm implements the cbcDecAble interfaces.
 var _ cbcDecAble = (*sm4CipherAsm)(nil)
@ -29,7 +33,7 @@ func (x *cbc) CryptBlocks(dst, src []byte) {
 	if len(dst) < len(src) {
 		panic("crypto/cipher: output smaller than input")
 	}
-	if InexactOverlap(dst[:len(src)], src) {
+	if smcipher.InexactOverlap(dst[:len(src)], src) {
 		panic("crypto/cipher: invalid buffer overlap")
 	}
 	if len(src) == 0 {
@ -42,10 +46,10 @@ func (x *cbc) CryptBlocks(dst, src []byte) {
 	var src64 []byte = make([]byte, FourBlocksSize)
 	for start > 0 {
 		encryptBlocksAsm(&x.b.dec[0], &temp[0], &src[start:end][0])
-		xorBytes(dst[end-BlockSize:end], temp[FourBlocksSize-BlockSize:FourBlocksSize], src[end-2*BlockSize:end-BlockSize])
+		smcipher.XorBytes(dst[end-BlockSize:end], temp[FourBlocksSize-BlockSize:FourBlocksSize], src[end-2*BlockSize:end-BlockSize])
-		xorBytes(dst[end-2*BlockSize:end-BlockSize], temp[FourBlocksSize-2*BlockSize:FourBlocksSize-BlockSize], src[end-3*BlockSize:end-2*BlockSize])
+		smcipher.XorBytes(dst[end-2*BlockSize:end-BlockSize], temp[FourBlocksSize-2*BlockSize:FourBlocksSize-BlockSize], src[end-3*BlockSize:end-2*BlockSize])
-		xorBytes(dst[end-3*BlockSize:end-2*BlockSize], temp[FourBlocksSize-3*BlockSize:FourBlocksSize-2*BlockSize], src[end-4*BlockSize:end-3*BlockSize])
+		smcipher.XorBytes(dst[end-3*BlockSize:end-2*BlockSize], temp[FourBlocksSize-3*BlockSize:FourBlocksSize-2*BlockSize], src[end-4*BlockSize:end-3*BlockSize])
-		xorBytes(dst[end-4*BlockSize:end-3*BlockSize], temp[:BlockSize], src[end-5*BlockSize:end-4*BlockSize])
+		smcipher.XorBytes(dst[end-4*BlockSize:end-3*BlockSize], temp[:BlockSize], src[end-5*BlockSize:end-4*BlockSize])
 		end = start
 		start -= FourBlocksSize
@ -55,10 +59,10 @@ func (x *cbc) CryptBlocks(dst, src []byte) {
 	encryptBlocksAsm(&x.b.dec[0], &temp[0], &src[:end][0])
 	count := end / BlockSize
 	for i := count; i > 1; i-- {
-		xorBytes(dst[end-BlockSize:end], temp[end-BlockSize:end], src[end-2*BlockSize:end-BlockSize])
+		smcipher.XorBytes(dst[end-BlockSize:end], temp[end-BlockSize:end], src[end-2*BlockSize:end-BlockSize])
 		end -= BlockSize
 	}
-	xorBytes(dst[0:end], temp[0:end], x.iv[:])
+	smcipher.XorBytes(dst[0:end], temp[0:end], x.iv[:])
 	// Set the new iv to the first block we copied earlier.
 	x.iv, x.tmp = x.tmp, x.iv
 }
--- a/sm4/cipher.go
+++ b/sm4/cipher.go
@ -3,7 +3,8 @@ package sm4
 import (
 	"crypto/cipher"
 	"fmt"
-	"unsafe"
+
 	smcipher "github.com/emmansun/gmsm/cipher"
 )
 // BlockSize the sm4 block size in bytes.
@ -47,7 +48,7 @@ func (c *sm4Cipher) Encrypt(dst, src []byte) {
 	if len(dst) < BlockSize {
 		panic("sm4: output not full block")
 	}
-	if InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
+	if smcipher.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
 		panic("sm4: invalid buffer overlap")
 	}
 	encryptBlockGo(c.enc, dst, src)
@ -60,29 +61,8 @@ func (c *sm4Cipher) Decrypt(dst, src []byte) {
 	if len(dst) < BlockSize {
 		panic("sm4: output not full block")
 	}
-	if InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
+	if smcipher.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
 		panic("sm4: invalid buffer overlap")
 	}
 	decryptBlockGo(c.dec, dst, src)
 }
 // AnyOverlap reports whether x and y share memory at any (not necessarily
 // corresponding) index. The memory beyond the slice length is ignored.
 func AnyOverlap(x, y []byte) bool {
 	return len(x) > 0 && len(y) > 0 &&
 		uintptr(unsafe.Pointer(&x[0])) <= uintptr(unsafe.Pointer(&y[len(y)-1])) &&
 		uintptr(unsafe.Pointer(&y[0])) <= uintptr(unsafe.Pointer(&x[len(x)-1]))
 }
 // InexactOverlap reports whether x and y share memory at any non-corresponding
 // index. The memory beyond the slice length is ignored. Note that x and y can
 // have different lengths and still not have any inexact overlap.
 //
 // InexactOverlap can be used to implement the requirements of the crypto/cipher
 // AEAD, Block, BlockMode and Stream interfaces.
 func InexactOverlap(x, y []byte) bool {
 	if len(x) == 0 || len(y) == 0 || &x[0] == &y[0] {
 		return false
 	}
 	return AnyOverlap(x, y)
 }
--- a/sm4/cipher_asm.go
+++ b/sm4/cipher_asm.go
@ -5,6 +5,7 @@ package sm4
 import (
 	"crypto/cipher"
 	smcipher "github.com/emmansun/gmsm/cipher"
 	"golang.org/x/sys/cpu"
 )
@ -47,7 +48,7 @@ func (c *sm4CipherAsm) Encrypt(dst, src []byte) {
 	if len(dst) < BlockSize {
 		panic("sm4: output not full block")
 	}
-	if InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
+	if smcipher.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
 		panic("sm4: invalid buffer overlap")
 	}
 	encryptBlockAsm(&c.enc[0], &dst[0], &src[0])
@ -60,7 +61,7 @@ func (c *sm4CipherAsm) Decrypt(dst, src []byte) {
 	if len(dst) < BlockSize {
 		panic("sm4: output not full block")
 	}
-	if InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
+	if smcipher.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
 		panic("sm4: invalid buffer overlap")
 	}
 	encryptBlockAsm(&c.dec[0], &dst[0], &src[0])
--- a/sm4/ctr_amd64.go
+++ b/sm4/ctr_amd64.go
@ -1,6 +1,10 @@
 package sm4
-import "crypto/cipher"
+import (
 	"crypto/cipher"
 	smcipher "github.com/emmansun/gmsm/cipher"
 )
 // Assert that sm4CipherAsm implements the ctrAble interface.
 var _ ctrAble = (*sm4CipherAsm)(nil)
@ -76,14 +80,14 @@ func (x *ctr) XORKeyStream(dst, src []byte) {
 	if len(dst) < len(src) {
 		panic("crypto/cipher: output smaller than input")
 	}
-	if InexactOverlap(dst[:len(src)], src) {
+	if smcipher.InexactOverlap(dst[:len(src)], src) {
 		panic("crypto/cipher: invalid buffer overlap")
 	}
 	for len(src) > 0 {
 		if x.outUsed >= len(x.out)-BlockSize {
 			x.refill()
 		}
-		n := xorBytes(dst, src, x.out[x.outUsed:])
+		n := smcipher.XorBytes(dst, src, x.out[x.outUsed:])
 		dst = dst[n:]
 		src = src[n:]
 		x.outUsed += n
--- a/sm4/gcm_amd64.go
+++ b/sm4/gcm_amd64.go
@ -5,6 +5,8 @@ import (
 	"crypto/subtle"
 	"encoding/binary"
 	"errors"
 	smcipher "github.com/emmansun/gmsm/cipher"
 )
 // Assert that sm4CipherAsm implements the gcmAble interface.
@ -80,8 +82,8 @@ func (g *gcm) Seal(dst, nonce, plaintext, data []byte) []byte {
 		panic("crypto/cipher: message too large for GCM")
 	}
-	ret, out := sliceForAppend(dst, len(plaintext)+g.tagSize)
+	ret, out := smcipher.SliceForAppend(dst, len(plaintext)+g.tagSize)
-	if InexactOverlap(out, plaintext) {
+	if smcipher.InexactOverlap(out, plaintext) {
 		panic("crypto/cipher: invalid buffer overlap")
 	}
@ -131,8 +133,8 @@ func (g *gcm) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) {
 	var expectedTag [gcmTagSize]byte
 	g.auth(expectedTag[:], ciphertext, data, &tagMask)
-	ret, out := sliceForAppend(dst, len(ciphertext))
+	ret, out := smcipher.SliceForAppend(dst, len(ciphertext))
-	if InexactOverlap(out, ciphertext) {
+	if smcipher.InexactOverlap(out, ciphertext) {
 		panic("crypto/cipher: invalid buffer overlap")
 	}
@ -257,21 +259,6 @@ func gcmInc32(counterBlock *[16]byte) {
 	binary.BigEndian.PutUint32(ctr, binary.BigEndian.Uint32(ctr)+1)
 }
 // sliceForAppend takes a slice and a requested number of bytes. It returns a
 // slice with the contents of the given slice followed by that many bytes and a
 // second slice that aliases into it and contains only the extra bytes. If the
 // original slice has sufficient capacity then no allocation is performed.
 func sliceForAppend(in []byte, n int) (head, tail []byte) {
 	if total := len(in) + n; cap(in) >= total {
 		head = in[:total]
 	} else {
 		head = make([]byte, total)
 		copy(head, in)
 	}
 	tail = head[len(in):]
 	return
 }
 // counterCrypt crypts in to out using g.cipher in counter mode.
 func (g *gcm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte) {
 	var mask [FourBlocksSize]byte
@ -288,7 +275,7 @@ func (g *gcm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte) {
 		encryptBlocksAsm(&g.cipher.enc[0], &mask[0], &couters[0])
 		gcmInc32(counter)
-		xorWords(out, in, mask[:])
+		smcipher.XorWords(out, in, mask[:])
 		out = out[FourBlocksSize:]
 		in = in[FourBlocksSize:]
 	}
@ -300,7 +287,7 @@ func (g *gcm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte) {
 			gcmInc32(counter)
 		}
 		encryptBlocksAsm(&g.cipher.enc[0], &mask[0], &couters[0])
-		xorBytes(out, in, mask[:blocks*gcmBlockSize])
+		smcipher.XorBytes(out, in, mask[:blocks*gcmBlockSize])
 	}
 }
@ -342,5 +329,5 @@ func (g *gcm) auth(out, ciphertext, additionalData []byte, tagMask *[gcmTagSize]
 	binary.BigEndian.PutUint64(out, y.low)
 	binary.BigEndian.PutUint64(out[8:], y.high)
-	xorWords(out, out, tagMask[:])
+	smcipher.XorWords(out, out, tagMask[:])
 }
--- a/sm4/sm4_gcm.go
+++ b/sm4/sm4_gcm.go
@ -4,6 +4,8 @@ package sm4
 import (
 	"crypto/cipher"
 	"crypto/subtle"
 	smcipher "github.com/emmansun/gmsm/cipher"
 )
 // sm4CipherGCM implements crypto/cipher.gcmAble so that crypto/cipher.NewGCM
@ -80,8 +82,8 @@ func (g *gcmAsm) Seal(dst, nonce, plaintext, data []byte) []byte {
 	gcmSm4Data(&g.bytesProductTable, data, &tagOut)
-	ret, out := sliceForAppend(dst, len(plaintext)+g.tagSize)
+	ret, out := smcipher.SliceForAppend(dst, len(plaintext)+g.tagSize)
-	if InexactOverlap(out[:len(plaintext)], plaintext) {
+	if smcipher.InexactOverlap(out[:len(plaintext)], plaintext) {
 		panic("crypto/cipher: invalid buffer overlap")
 	}
@ -136,8 +138,8 @@ func (g *gcmAsm) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) {
 	var expectedTag [gcmTagSize]byte
 	gcmSm4Data(&g.bytesProductTable, data, &expectedTag)
-	ret, out := sliceForAppend(dst, len(ciphertext))
+	ret, out := smcipher.SliceForAppend(dst, len(ciphertext))
-	if InexactOverlap(out, ciphertext) {
+	if smcipher.InexactOverlap(out, ciphertext) {
 		panic("crypto/cipher: invalid buffer overlap")
 	}
 	if len(ciphertext) > 0 {
--- a/sm4_test/benchmark_test.go
+++ b/sm4_test/benchmark_test.go
@ -5,6 +5,7 @@ import (
 	"crypto/cipher"
 	"testing"
 	smcipher "github.com/emmansun/gmsm/cipher"
 	"github.com/emmansun/gmsm/sm4"
 )
@ -270,3 +271,93 @@ func BenchmarkAESGCMOpen8K(b *testing.B) {
 func BenchmarkSM4GCMOpen8K(b *testing.B) {
 	benchmarkSM4GCMOpen(b, make([]byte, 8*1024))
 }
 // benchmarkAESCCMSign builds an AES-128 CCM AEAD from an all-zero key and
 // passes it, together with buf, to benchmarkGCMSign.
 func benchmarkAESCCMSign(b *testing.B, buf []byte) {
 	var key [16]byte
 	c, err := aes.NewCipher(key[:])
 	if err != nil {
 		b.Fatal(err)
 	}
 	aesccm, err := smcipher.NewCCM(c)
 	if err != nil {
 		b.Fatal(err)
 	}
 	benchmarkGCMSign(b, aesccm, buf)
 }
 // benchmarkSM4CCMSign builds an SM4 CCM AEAD from an all-zero 16-byte key and
 // passes it, together with buf, to benchmarkGCMSign.
 func benchmarkSM4CCMSign(b *testing.B, buf []byte) {
 	var key [16]byte
 	c, err := sm4.NewCipher(key[:])
 	if err != nil {
 		b.Fatal(err)
 	}
 	sm4ccm, err := smcipher.NewCCM(c)
 	if err != nil {
 		b.Fatal(err)
 	}
 	benchmarkGCMSign(b, sm4ccm, buf)
 }
 // BenchmarkAESCCMSign1K runs benchmarkAESCCMSign with a zeroed 1 KiB buffer.
 func BenchmarkAESCCMSign1K(b *testing.B) {
 	benchmarkAESCCMSign(b, make([]byte, 1024))
 }
 // BenchmarkSM4CCMSign1K runs benchmarkSM4CCMSign with a zeroed 1 KiB buffer.
 func BenchmarkSM4CCMSign1K(b *testing.B) {
 	benchmarkSM4CCMSign(b, make([]byte, 1024))
 }
 // BenchmarkAESCCMSeal1K runs benchmarkAESCCMSeal with a zeroed 1 KiB buffer.
 func BenchmarkAESCCMSeal1K(b *testing.B) {
 	benchmarkAESCCMSeal(b, make([]byte, 1024))
 }
 // BenchmarkSM4CCMSeal1K runs benchmarkSM4CCMSeal with a zeroed 1 KiB buffer.
 func BenchmarkSM4CCMSeal1K(b *testing.B) {
 	benchmarkSM4CCMSeal(b, make([]byte, 1024))
 }
 // BenchmarkAESCCMOpen1K runs benchmarkAESCCMOpen with a zeroed 1 KiB buffer.
 func BenchmarkAESCCMOpen1K(b *testing.B) {
 	benchmarkAESCCMOpen(b, make([]byte, 1024))
 }
 // BenchmarkSM4CCMOpen1K runs benchmarkSM4CCMOpen with a zeroed 1 KiB buffer.
 func BenchmarkSM4CCMOpen1K(b *testing.B) {
 	benchmarkSM4CCMOpen(b, make([]byte, 1024))
 }
 // BenchmarkAESCCMSign8K runs benchmarkAESCCMSign with a zeroed 8 KiB buffer.
 func BenchmarkAESCCMSign8K(b *testing.B) {
 	benchmarkAESCCMSign(b, make([]byte, 8*1024))
 }
 // BenchmarkSM4CCMSign8K runs benchmarkSM4CCMSign with a zeroed 8 KiB buffer.
 func BenchmarkSM4CCMSign8K(b *testing.B) {
 	benchmarkSM4CCMSign(b, make([]byte, 8*1024))
 }
 // BenchmarkAESCCMSeal8K runs benchmarkAESCCMSeal with a zeroed 8 KiB buffer.
 func BenchmarkAESCCMSeal8K(b *testing.B) {
 	benchmarkAESCCMSeal(b, make([]byte, 8*1024))
 }
 // BenchmarkSM4CCMSeal8K runs benchmarkSM4CCMSeal with a zeroed 8 KiB buffer.
 func BenchmarkSM4CCMSeal8K(b *testing.B) {
 	benchmarkSM4CCMSeal(b, make([]byte, 8*1024))
 }
 // BenchmarkAESCCMOpen8K runs benchmarkAESCCMOpen with a zeroed 8 KiB buffer.
 func BenchmarkAESCCMOpen8K(b *testing.B) {
 	benchmarkAESCCMOpen(b, make([]byte, 8*1024))
 }
 // BenchmarkSM4CCMOpen8K runs benchmarkSM4CCMOpen with a zeroed 8 KiB buffer.
 func BenchmarkSM4CCMOpen8K(b *testing.B) {
 	benchmarkSM4CCMOpen(b, make([]byte, 8*1024))
 }
 // benchmarkAESCCMSeal builds an AES-128 CCM AEAD from an all-zero key and
 // passes it, together with buf, to benchmarkGCMSeal.
 func benchmarkAESCCMSeal(b *testing.B, buf []byte) {
 	var key [16]byte
 	c, err := aes.NewCipher(key[:])
 	if err != nil {
 		b.Fatal(err)
 	}
 	aesccm, err := smcipher.NewCCM(c)
 	if err != nil {
 		b.Fatal(err)
 	}
 	benchmarkGCMSeal(b, aesccm, buf)
 }
 // benchmarkSM4CCMSeal builds an SM4 CCM AEAD from an all-zero 16-byte key and
 // passes it, together with buf, to benchmarkGCMSeal.
 func benchmarkSM4CCMSeal(b *testing.B, buf []byte) {
 	var key [16]byte
 	c, err := sm4.NewCipher(key[:])
 	if err != nil {
 		b.Fatal(err)
 	}
 	sm4ccm, err := smcipher.NewCCM(c)
 	if err != nil {
 		b.Fatal(err)
 	}
 	benchmarkGCMSeal(b, sm4ccm, buf)
 }
 // benchmarkAESCCMOpen builds an AES-128 CCM AEAD from an all-zero key and
 // passes it, together with buf, to benchmarkGCMOpen.
 func benchmarkAESCCMOpen(b *testing.B, buf []byte) {
 	var key [16]byte
 	c, err := aes.NewCipher(key[:])
 	if err != nil {
 		b.Fatal(err)
 	}
 	aesccm, err := smcipher.NewCCM(c)
 	if err != nil {
 		b.Fatal(err)
 	}
 	benchmarkGCMOpen(b, aesccm, buf)
 }
 // benchmarkSM4CCMOpen builds an SM4 CCM AEAD from an all-zero 16-byte key and
 // passes it, together with buf, to benchmarkGCMOpen.
 func benchmarkSM4CCMOpen(b *testing.B, buf []byte) {
 	var key [16]byte
 	c, err := sm4.NewCipher(key[:])
 	if err != nil {
 		b.Fatal(err)
 	}
 	sm4ccm, err := smcipher.NewCCM(c)
 	if err != nil {
 		b.Fatal(err)
 	}
 	benchmarkGCMOpen(b, sm4ccm, buf)
 }