From 88e456e9a8899a9ce5df05aa6307e74e2bfebe57 Mon Sep 17 00:00:00 2001 From: Emman Date: Fri, 19 Mar 2021 17:17:19 +0800 Subject: [PATCH] MAGIC - ghash asm tuning --- sm4/cipher_asm.go | 4 + sm4/gcm_amd64.go | 346 ++++++++++++++++++++++++++++++++++++++++ sm4/gcm_amd64.s | 392 ++++++++++++++++++++++++++++++++++++++++++++++ sm4/sm4_gcm.go | 158 +++++++++++++++++++ 4 files changed, 900 insertions(+) create mode 100644 sm4/gcm_amd64.go create mode 100644 sm4/gcm_amd64.s create mode 100644 sm4/sm4_gcm.go diff --git a/sm4/cipher_asm.go b/sm4/cipher_asm.go index 146c0d8..4632bd6 100644 --- a/sm4/cipher_asm.go +++ b/sm4/cipher_asm.go @@ -16,6 +16,7 @@ type sm4CipherAsm struct { } var supportsAES = cpu.X86.HasAES +var supportsGFMUL = cpu.X86.HasPCLMULQDQ func newCipher(key []byte) (cipher.Block, error) { if !supportsAES { @@ -23,6 +24,9 @@ func newCipher(key []byte) (cipher.Block, error) { } c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}} expandKeyGo(key, c.enc, c.dec) + if supportsAES && supportsGFMUL { + return &sm4CipherGCM{c}, nil + } return &c, nil } diff --git a/sm4/gcm_amd64.go b/sm4/gcm_amd64.go new file mode 100644 index 0000000..c718e92 --- /dev/null +++ b/sm4/gcm_amd64.go @@ -0,0 +1,346 @@ +package sm4 + +import ( + "crypto/cipher" + "crypto/subtle" + "encoding/binary" + "errors" +) + +// Assert that sm4CipherAsm implements the gcmAble interface. +var _ gcmAble = (*sm4CipherAsm)(nil) + +// NewGCM returns the AES cipher wrapped in Galois Counter Mode. This is only +// called by crypto/cipher.NewGCM via the gcmAble interface. +func (c *sm4CipherAsm) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) { + var key [gcmBlockSize]byte + c.Encrypt(key[:], key[:]) + g := &gcm{cipher: c, nonceSize: nonceSize, tagSize: tagSize} + // We precompute 16 multiples of |key|. However, when we do lookups + // into this table we'll be using bits from a field element and + // therefore the bits will be in the reverse order. So normally one + // would expect, say, 4*key to be in index 4 of the table but due to + // this bit ordering it will actually be in index 0010 (base 2) = 2. + x := gcmFieldElement{ + binary.BigEndian.Uint64(key[:8]), + binary.BigEndian.Uint64(key[8:]), + } + g.productTable[reverseBits(1)] = x + + for i := 2; i < 16; i += 2 { + g.productTable[reverseBits(i)] = gcmDouble(&g.productTable[reverseBits(i/2)]) + g.productTable[reverseBits(i+1)] = gcmAdd(&g.productTable[reverseBits(i)], &x) + } + + return g, nil +} + +// gcmFieldElement represents a value in GF(2¹²⁸). In order to reflect the GCM +// standard and make binary.BigEndian suitable for marshaling these values, the +// bits are stored in big endian order. For example: +// the coefficient of x⁰ can be obtained by v.low >> 63. +// the coefficient of x⁶³ can be obtained by v.low & 1. +// the coefficient of x⁶⁴ can be obtained by v.high >> 63. +// the coefficient of x¹²⁷ can be obtained by v.high & 1. +type gcmFieldElement struct { + low, high uint64 +} + +// gcm represents a Galois Counter Mode with a specific key. See +// https://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf +type gcm struct { + cipher *sm4CipherAsm + nonceSize int + tagSize int + // productTable contains the first sixteen powers of the key, H. + // However, they are in bit reversed order. See NewGCMWithNonceSize. + productTable [16]gcmFieldElement +} + +const ( + gcmBlockSize = 16 + gcmTagSize = 16 + gcmMinimumTagSize = 12 // NIST SP 800-38D recommends tags with 12 or more bytes. 
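+	// gcmStandardNonceSize is the 96-bit nonce length for which the fast
+	// counter-derivation path in deriveCounter (nonce||1) applies; other
+	// nonce lengths are run through GHASH instead.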
+ gcmStandardNonceSize = 12 +) + +func (g *gcm) NonceSize() int { + return g.nonceSize +} + +func (g *gcm) Overhead() int { + return g.tagSize +} + +func (g *gcm) Seal(dst, nonce, plaintext, data []byte) []byte { + if len(nonce) != g.nonceSize { + panic("crypto/cipher: incorrect nonce length given to GCM") + } + if uint64(len(plaintext)) > ((1<<32)-2)*uint64(g.cipher.BlockSize()) { + panic("crypto/cipher: message too large for GCM") + } + + ret, out := sliceForAppend(dst, len(plaintext)+g.tagSize) + if InexactOverlap(out, plaintext) { + panic("crypto/cipher: invalid buffer overlap") + } + + var counter, tagMask [gcmBlockSize]byte + g.deriveCounter(&counter, nonce) + + g.cipher.Encrypt(tagMask[:], counter[:]) + gcmInc32(&counter) + + g.counterCrypt(out, plaintext, &counter) + + var tag [gcmTagSize]byte + g.auth(tag[:], out[:len(plaintext)], data, &tagMask) + copy(out[len(plaintext):], tag[:]) + + return ret +} + +var errOpen = errors.New("cipher: message authentication failed") + +func (g *gcm) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) { + if len(nonce) != g.nonceSize { + panic("crypto/cipher: incorrect nonce length given to GCM") + } + // Sanity check to prevent the authentication from always succeeding if an implementation + // leaves tagSize uninitialized, for example. + if g.tagSize < gcmMinimumTagSize { + panic("crypto/cipher: incorrect GCM tag size") + } + + if len(ciphertext) < g.tagSize { + return nil, errOpen + } + if uint64(len(ciphertext)) > ((1<<32)-2)*uint64(g.cipher.BlockSize())+uint64(g.tagSize) { + return nil, errOpen + } + + tag := ciphertext[len(ciphertext)-g.tagSize:] + ciphertext = ciphertext[:len(ciphertext)-g.tagSize] + + var counter, tagMask [gcmBlockSize]byte + g.deriveCounter(&counter, nonce) + + g.cipher.Encrypt(tagMask[:], counter[:]) + gcmInc32(&counter) + + var expectedTag [gcmTagSize]byte + g.auth(expectedTag[:], ciphertext, data, &tagMask) + + ret, out := sliceForAppend(dst, len(ciphertext)) + if InexactOverlap(out, ciphertext) { + panic("crypto/cipher: invalid buffer overlap") + } + + if subtle.ConstantTimeCompare(expectedTag[:g.tagSize], tag) != 1 { + // The AESNI code decrypts and authenticates concurrently, and + // so overwrites dst in the event of a tag mismatch. That + // behavior is mimicked here in order to be consistent across + // platforms. + for i := range out { + out[i] = 0 + } + return nil, errOpen + } + + g.counterCrypt(out, ciphertext, &counter) + + return ret, nil +} + +// reverseBits reverses the order of the bits of 4-bit number in i. +func reverseBits(i int) int { + i = ((i << 2) & 0xc) | ((i >> 2) & 0x3) + i = ((i << 1) & 0xa) | ((i >> 1) & 0x5) + return i +} + +// gcmAdd adds two elements of GF(2¹²⁸) and returns the sum. +func gcmAdd(x, y *gcmFieldElement) gcmFieldElement { + // Addition in a characteristic 2 field is just XOR. + return gcmFieldElement{x.low ^ y.low, x.high ^ y.high} +} + +// gcmDouble returns the result of doubling an element of GF(2¹²⁸). +func gcmDouble(x *gcmFieldElement) (double gcmFieldElement) { + msbSet := x.high&1 == 1 + + // Because of the bit-ordering, doubling is actually a right shift. + double.high = x.high >> 1 + double.high |= x.low << 63 + double.low = x.low >> 1 + + // If the most-significant bit was set before shifting then it, + // conceptually, becomes a term of x^128. This is greater than the + // irreducible polynomial so the result has to be reduced. The + // irreducible polynomial is 1+x+x^2+x^7+x^128. 
We can subtract that to + // eliminate the term at x^128 which also means subtracting the other + // four terms. In characteristic 2 fields, subtraction == addition == + // XOR. + if msbSet { + double.low ^= 0xe100000000000000 + } + + return +} + +var gcmReductionTable = []uint16{ + 0x0000, 0x1c20, 0x3840, 0x2460, 0x7080, 0x6ca0, 0x48c0, 0x54e0, + 0xe100, 0xfd20, 0xd940, 0xc560, 0x9180, 0x8da0, 0xa9c0, 0xb5e0, +} + +// mul sets y to y*H, where H is the GCM key, fixed during NewGCMWithNonceSize. +func (g *gcm) mul(y *gcmFieldElement) { + var z gcmFieldElement + + for i := 0; i < 2; i++ { + word := y.high + if i == 1 { + word = y.low + } + + // Multiplication works by multiplying z by 16 and adding in + // one of the precomputed multiples of H. + for j := 0; j < 64; j += 4 { + msw := z.high & 0xf + z.high >>= 4 + z.high |= z.low << 60 + z.low >>= 4 + z.low ^= uint64(gcmReductionTable[msw]) << 48 + + // the values in |table| are ordered for + // little-endian bit positions. See the comment + // in NewGCMWithNonceSize. + t := &g.productTable[word&0xf] + + z.low ^= t.low + z.high ^= t.high + word >>= 4 + } + } + + *y = z +} + +// updateBlocks extends y with more polynomial terms from blocks, based on +// Horner's rule. There must be a multiple of gcmBlockSize bytes in blocks. +func (g *gcm) updateBlocks(y *gcmFieldElement, blocks []byte) { + for len(blocks) > 0 { + y.low ^= binary.BigEndian.Uint64(blocks) + y.high ^= binary.BigEndian.Uint64(blocks[8:]) + g.mul(y) + blocks = blocks[gcmBlockSize:] + } +} + +// update extends y with more polynomial terms from data. If data is not a +// multiple of gcmBlockSize bytes long then the remainder is zero padded. +func (g *gcm) update(y *gcmFieldElement, data []byte) { + fullBlocks := (len(data) >> 4) << 4 + g.updateBlocks(y, data[:fullBlocks]) + + if len(data) != fullBlocks { + var partialBlock [gcmBlockSize]byte + copy(partialBlock[:], data[fullBlocks:]) + g.updateBlocks(y, partialBlock[:]) + } +} + +// gcmInc32 treats the final four bytes of counterBlock as a big-endian value +// and increments it. +func gcmInc32(counterBlock *[16]byte) { + ctr := counterBlock[len(counterBlock)-4:] + binary.BigEndian.PutUint32(ctr, binary.BigEndian.Uint32(ctr)+1) +} + +// sliceForAppend takes a slice and a requested number of bytes. It returns a +// slice with the contents of the given slice followed by that many bytes and a +// second slice that aliases into it and contains only the extra bytes. If the +// original slice has sufficient capacity then no allocation is performed. +func sliceForAppend(in []byte, n int) (head, tail []byte) { + if total := len(in) + n; cap(in) >= total { + head = in[:total] + } else { + head = make([]byte, total) + copy(head, in) + } + tail = head[len(in):] + return +} + +// counterCrypt crypts in to out using g.cipher in counter mode. 
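+// It derives the keystream by encrypting four consecutive counter blocks per
+// encryptBlocksAsm call and XORing the result into out; a final partial group
+// is rounded up to whole blocks and only the needed keystream bytes are used.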
+func (g *gcm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte) {
+	var mask [FourBlocksSize]byte
+	var counters [FourBlocksSize]byte
+
+	for len(in) >= FourBlocksSize {
+		copy(counters[:], counter[:])
+		gcmInc32(counter)
+		copy(counters[gcmBlockSize:], counter[:])
+		gcmInc32(counter)
+		copy(counters[2*gcmBlockSize:], counter[:])
+		gcmInc32(counter)
+		copy(counters[3*gcmBlockSize:], counter[:])
+
+		encryptBlocksAsm(&g.cipher.enc[0], &mask[0], &counters[0])
+		gcmInc32(counter)
+		xorWords(out, in, mask[:])
+		out = out[FourBlocksSize:]
+		in = in[FourBlocksSize:]
+	}
+
+	if len(in) > 0 {
+		blocks := (len(in) + gcmBlockSize - 1) / gcmBlockSize
+		for i := 0; i < blocks; i++ {
+			copy(counters[i*gcmBlockSize:], counter[:])
+			gcmInc32(counter)
+		}
+		encryptBlocksAsm(&g.cipher.enc[0], &mask[0], &counters[0])
+		xorBytes(out, in, mask[:blocks*gcmBlockSize])
+	}
+}
+
+// deriveCounter computes the initial GCM counter state from the given nonce.
+// See NIST SP 800-38D, section 7.1. This assumes that counter is filled with
+// zeros on entry.
+func (g *gcm) deriveCounter(counter *[gcmBlockSize]byte, nonce []byte) {
+	// GCM has two modes of operation with respect to the initial counter
+	// state: a "fast path" for 96-bit (12-byte) nonces, and a "slow path"
+	// for nonces of other lengths. For a 96-bit nonce, the nonce, along
+	// with a four-byte big-endian counter starting at one, is used
+	// directly as the starting counter. For other nonce sizes, the counter
+	// is computed by passing it through the GHASH function.
+	if len(nonce) == gcmStandardNonceSize {
+		copy(counter[:], nonce)
+		counter[gcmBlockSize-1] = 1
+	} else {
+		var y gcmFieldElement
+		g.update(&y, nonce)
+		y.high ^= uint64(len(nonce)) * 8
+		g.mul(&y)
+		binary.BigEndian.PutUint64(counter[:8], y.low)
+		binary.BigEndian.PutUint64(counter[8:], y.high)
+	}
+}
+
+// auth calculates GHASH(ciphertext, additionalData), masks the result with
+// tagMask and writes the result to out.
+func (g *gcm) auth(out, ciphertext, additionalData []byte, tagMask *[gcmTagSize]byte) {
+	var y gcmFieldElement
+	g.update(&y, additionalData)
+	g.update(&y, ciphertext)
+
+	y.low ^= uint64(len(additionalData)) * 8
+	y.high ^= uint64(len(ciphertext)) * 8
+
+	g.mul(&y)
+
+	binary.BigEndian.PutUint64(out, y.low)
+	binary.BigEndian.PutUint64(out[8:], y.high)
+
+	xorWords(out, out, tagMask[:])
+}
diff --git a/sm4/gcm_amd64.s b/sm4/gcm_amd64.s
new file mode 100644
index 0000000..3da26ab
--- /dev/null
+++ b/sm4/gcm_amd64.s
@@ -0,0 +1,392 @@
+// This is an optimized GHASH implementation for SM4-GCM using CLMUL-NI,
+// adapted from the Go standard library's AES-GCM assembly.
+// The implementation uses some optimizations described in:
+// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
+//     Instruction and its Usage for Computing the GCM Mode rev.
2.02 +// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and +// Hardware + +#include "textflag.h" + +#define B0 X0 +#define B1 X1 +#define B2 X2 +#define B3 X3 +#define B4 X4 +#define B5 X5 +#define B6 X6 +#define B7 X7 + +#define ACC0 X8 +#define ACC1 X9 +#define ACCM X10 + +#define T0 X11 +#define T1 X12 +#define T2 X13 +#define POLY X14 +#define BSWAP X15 + +DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f +DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607 + +DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001 +DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000 + +DATA andMask<>+0x00(SB)/8, $0x00000000000000ff +DATA andMask<>+0x08(SB)/8, $0x0000000000000000 +DATA andMask<>+0x10(SB)/8, $0x000000000000ffff +DATA andMask<>+0x18(SB)/8, $0x0000000000000000 +DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff +DATA andMask<>+0x28(SB)/8, $0x0000000000000000 +DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff +DATA andMask<>+0x38(SB)/8, $0x0000000000000000 +DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff +DATA andMask<>+0x48(SB)/8, $0x0000000000000000 +DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff +DATA andMask<>+0x58(SB)/8, $0x0000000000000000 +DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff +DATA andMask<>+0x68(SB)/8, $0x0000000000000000 +DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff +DATA andMask<>+0x78(SB)/8, $0x0000000000000000 +DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff +DATA andMask<>+0x88(SB)/8, $0x00000000000000ff +DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff +DATA andMask<>+0x98(SB)/8, $0x000000000000ffff +DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff +DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff +DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff +DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff +DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff +DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff +DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff +DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff +DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff +DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff + +GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16 +GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16 +GLOBL andMask<>(SB), (NOPTR+RODATA), $240 + +// func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64) +TEXT ·gcmSm4Finish(SB),NOSPLIT,$0 +#define pTbl DI +#define tMsk SI +#define tPtr DX +#define plen AX +#define dlen CX + + MOVQ productTable+0(FP), pTbl + MOVQ tagMask+8(FP), tMsk + MOVQ T+16(FP), tPtr + MOVQ pLen+24(FP), plen + MOVQ dLen+32(FP), dlen + + MOVOU (tPtr), ACC0 + MOVOU (tMsk), T2 + + MOVOU bswapMask<>(SB), BSWAP + MOVOU gcmPoly<>(SB), POLY + + SHLQ $3, plen + SHLQ $3, dlen + + MOVQ plen, B0 + PINSRQ $1, dlen, B0 + + PXOR ACC0, B0 + + MOVOU (16*14)(pTbl), ACC0 + MOVOU (16*15)(pTbl), ACCM + MOVOU ACC0, ACC1 + + PCLMULQDQ $0x00, B0, ACC0 + PCLMULQDQ $0x11, B0, ACC1 + PSHUFD $78, B0, T0 + PXOR B0, T0 + PCLMULQDQ $0x00, T0, ACCM + + PXOR ACC0, ACCM + PXOR ACC1, ACCM + MOVOU ACCM, T0 + PSRLDQ $8, ACCM + PSLLDQ $8, T0 + PXOR ACCM, ACC1 + PXOR T0, ACC0 + + MOVOU POLY, T0 + PCLMULQDQ $0x01, ACC0, T0 + PSHUFD $78, ACC0, ACC0 + PXOR T0, ACC0 + + MOVOU POLY, T0 + PCLMULQDQ $0x01, ACC0, T0 + PSHUFD $78, ACC0, ACC0 + PXOR T0, ACC0 + + PXOR ACC1, ACC0 + + PSHUFB BSWAP, ACC0 + PXOR T2, ACC0 + MOVOU ACC0, (tPtr) + + RET + +#undef pTbl +#undef tMsk +#undef tPtr +#undef plen +#undef dlen + +// func precomputeTableAsm(productTable *[256]byte, src *[16]byte) +TEXT ·precomputeTableAsm(SB),NOSPLIT,$0 +#define dst DI +#define SRC SI + + MOVQ productTable+0(FP), dst + MOVQ src+8(FP), SRC + + MOVOU 
bswapMask<>(SB), BSWAP + MOVOU gcmPoly<>(SB), POLY + + MOVOU (16*0)(SRC), B0 + PSHUFB BSWAP, B0 + + // H * 2 + PSHUFD $0xff, B0, T0 + MOVOU B0, T1 + PSRAL $31, T0 + PAND POLY, T0 + PSRLL $31, T1 + PSLLDQ $4, T1 + PSLLL $1, B0 + PXOR T0, B0 + PXOR T1, B0 + // Karatsuba pre-computations + MOVOU B0, (16*14)(dst) + PSHUFD $78, B0, B1 + PXOR B0, B1 + MOVOU B1, (16*15)(dst) + + MOVOU B0, B2 + MOVOU B1, B3 + // Now prepare powers of H and pre-computations for them + MOVQ $7, AX + +initLoop: + MOVOU B2, T0 + MOVOU B2, T1 + MOVOU B3, T2 + PCLMULQDQ $0x00, B0, T0 + PCLMULQDQ $0x11, B0, T1 + PCLMULQDQ $0x00, B1, T2 + + PXOR T0, T2 + PXOR T1, T2 + MOVOU T2, B4 + PSLLDQ $8, B4 + PSRLDQ $8, T2 + PXOR B4, T0 + PXOR T2, T1 + + MOVOU POLY, B2 + PCLMULQDQ $0x01, T0, B2 + PSHUFD $78, T0, T0 + PXOR B2, T0 + MOVOU POLY, B2 + PCLMULQDQ $0x01, T0, B2 + PSHUFD $78, T0, T0 + PXOR T0, B2 + PXOR T1, B2 + + MOVOU B2, (16*12)(dst) + PSHUFD $78, B2, B3 + PXOR B2, B3 + MOVOU B3, (16*13)(dst) + + DECQ AX + LEAQ (-16*2)(dst), dst + JNE initLoop + + RET + +#undef SRC +#undef dst + +// func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte) +TEXT ·gcmSm4Data(SB),NOSPLIT,$0 +#define pTbl DI +#define aut SI +#define tPtr CX +#define autLen DX + +#define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a +#define mulRoundAAD(X ,i) \ + MOVOU (16*(i*2))(pTbl), T1;\ + MOVOU T1, T2;\ + PCLMULQDQ $0x00, X, T1;\ + PXOR T1, ACC0;\ + PCLMULQDQ $0x11, X, T2;\ + PXOR T2, ACC1;\ + PSHUFD $78, X, T1;\ + PXOR T1, X;\ + MOVOU (16*(i*2+1))(pTbl), T1;\ + PCLMULQDQ $0x00, X, T1;\ + PXOR T1, ACCM + + MOVQ productTable+0(FP), pTbl + MOVQ data_base+8(FP), aut + MOVQ data_len+16(FP), autLen + MOVQ T+32(FP), tPtr + + //PXOR ACC0, ACC0 + MOVOU (tPtr), ACC0 + MOVOU bswapMask<>(SB), BSWAP + MOVOU gcmPoly<>(SB), POLY + + TESTQ autLen, autLen + JEQ dataBail + + CMPQ autLen, $13 // optimize the TLS case + JE dataTLS + CMPQ autLen, $128 + JB startSinglesLoop + JMP dataOctaLoop + +dataTLS: + MOVOU (16*14)(pTbl), T1 + MOVOU (16*15)(pTbl), T2 + PXOR B0, B0 + MOVQ (aut), B0 + PINSRD $2, 8(aut), B0 + PINSRB $12, 12(aut), B0 + XORQ autLen, autLen + JMP dataMul + +dataOctaLoop: + CMPQ autLen, $128 + JB startSinglesLoop + SUBQ $128, autLen + + MOVOU (16*0)(aut), X0 + MOVOU (16*1)(aut), X1 + MOVOU (16*2)(aut), X2 + MOVOU (16*3)(aut), X3 + MOVOU (16*4)(aut), X4 + MOVOU (16*5)(aut), X5 + MOVOU (16*6)(aut), X6 + MOVOU (16*7)(aut), X7 + LEAQ (16*8)(aut), aut + PSHUFB BSWAP, X0 + PSHUFB BSWAP, X1 + PSHUFB BSWAP, X2 + PSHUFB BSWAP, X3 + PSHUFB BSWAP, X4 + PSHUFB BSWAP, X5 + PSHUFB BSWAP, X6 + PSHUFB BSWAP, X7 + PXOR ACC0, X0 + + MOVOU (16*0)(pTbl), ACC0 + MOVOU (16*1)(pTbl), ACCM + MOVOU ACC0, ACC1 + PSHUFD $78, X0, T1 + PXOR X0, T1 + PCLMULQDQ $0x00, X0, ACC0 + PCLMULQDQ $0x11, X0, ACC1 + PCLMULQDQ $0x00, T1, ACCM + + mulRoundAAD(X1, 1) + mulRoundAAD(X2, 2) + mulRoundAAD(X3, 3) + mulRoundAAD(X4, 4) + mulRoundAAD(X5, 5) + mulRoundAAD(X6, 6) + mulRoundAAD(X7, 7) + + PXOR ACC0, ACCM + PXOR ACC1, ACCM + MOVOU ACCM, T0 + PSRLDQ $8, ACCM + PSLLDQ $8, T0 + PXOR ACCM, ACC1 + PXOR T0, ACC0 + reduceRound(ACC0) + reduceRound(ACC0) + PXOR ACC1, ACC0 + JMP dataOctaLoop + +startSinglesLoop: + MOVOU (16*14)(pTbl), T1 + MOVOU (16*15)(pTbl), T2 + +dataSinglesLoop: + + CMPQ autLen, $16 + JB dataEnd + SUBQ $16, autLen + + MOVOU (aut), B0 +dataMul: + PSHUFB BSWAP, B0 + PXOR ACC0, B0 + + MOVOU T1, ACC0 + MOVOU T2, ACCM + MOVOU T1, ACC1 + + PSHUFD $78, B0, T0 + PXOR B0, T0 + PCLMULQDQ $0x00, B0, ACC0 + PCLMULQDQ $0x11, B0, ACC1 + PCLMULQDQ $0x00, 
T0, ACCM
+
+	PXOR ACC0, ACCM
+	PXOR ACC1, ACCM
+	MOVOU ACCM, T0
+	PSRLDQ $8, ACCM
+	PSLLDQ $8, T0
+	PXOR ACCM, ACC1
+	PXOR T0, ACC0
+
+	MOVOU POLY, T0
+	PCLMULQDQ $0x01, ACC0, T0
+	PSHUFD $78, ACC0, ACC0
+	PXOR T0, ACC0
+
+	MOVOU POLY, T0
+	PCLMULQDQ $0x01, ACC0, T0
+	PSHUFD $78, ACC0, ACC0
+	PXOR T0, ACC0
+	PXOR ACC1, ACC0
+
+	LEAQ 16(aut), aut
+
+	JMP dataSinglesLoop
+
+dataEnd:
+
+	TESTQ autLen, autLen
+	JEQ dataBail
+
+	PXOR B0, B0
+	LEAQ -1(aut)(autLen*1), aut
+
+dataLoadLoop:
+
+	PSLLDQ $1, B0
+	PINSRB $0, (aut), B0
+
+	LEAQ -1(aut), aut
+	DECQ autLen
+	JNE dataLoadLoop
+
+	JMP dataMul
+
+dataBail:
+	MOVOU ACC0, (tPtr)
+	RET
+
+#undef pTbl
+#undef aut
+#undef tPtr
+#undef autLen
diff --git a/sm4/sm4_gcm.go b/sm4/sm4_gcm.go
new file mode 100644
index 0000000..85761dc
--- /dev/null
+++ b/sm4/sm4_gcm.go
@@ -0,0 +1,158 @@
+// +build amd64
+
+package sm4
+
+import (
+	"crypto/cipher"
+	"crypto/subtle"
+)
+
+// sm4CipherGCM implements crypto/cipher.gcmAble so that crypto/cipher.NewGCM
+// will use the optimised implementation in this file when possible. Instances
+// of this type only exist when the CPU supports both the AES-NI and PCLMULQDQ
+// instruction sets (see newCipher in cipher_asm.go).
+type sm4CipherGCM struct {
+	sm4CipherAsm
+}
+
+// Assert that sm4CipherGCM implements the gcmAble interface.
+var _ gcmAble = (*sm4CipherGCM)(nil)
+
+//go:noescape
+func precomputeTableAsm(productTable *[256]byte, src *[16]byte)
+
+//go:noescape
+func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)
+
+//go:noescape
+func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
+
+type gcmAsm struct {
+	gcm
+	bytesProductTable [256]byte
+}
+
+// NewGCM returns the SM4 cipher wrapped in Galois Counter Mode. This is only
+// called by crypto/cipher.NewGCM via the gcmAble interface.
+func (c *sm4CipherGCM) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
+	g := &gcmAsm{}
+	g.cipher = &c.sm4CipherAsm
+	g.nonceSize = nonceSize
+	g.tagSize = tagSize
+	var key [gcmBlockSize]byte
+	c.Encrypt(key[:], key[:])
+	precomputeTableAsm(&g.bytesProductTable, &key)
+	return g, nil
+}
+
+func (g *gcmAsm) NonceSize() int {
+	return g.nonceSize
+}
+
+func (g *gcmAsm) Overhead() int {
+	return g.tagSize
+}
+
+// Seal encrypts and authenticates plaintext. See the cipher.AEAD interface for
+// details.
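+// The authentication tag is computed with the assembly GHASH helpers
+// (gcmSm4Data over the additional data and the ciphertext, then gcmSm4Finish
+// with the length block) and masked with the encryption of the initial counter.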
+func (g *gcmAsm) Seal(dst, nonce, plaintext, data []byte) []byte { + if len(nonce) != g.nonceSize { + panic("crypto/cipher: incorrect nonce length given to GCM") + } + if uint64(len(plaintext)) > ((1<<32)-2)*BlockSize { + panic("crypto/cipher: message too large for GCM") + } + + var counter, tagMask [gcmBlockSize]byte + + if len(nonce) == gcmStandardNonceSize { + // Init counter to nonce||1 + copy(counter[:], nonce) + counter[gcmBlockSize-1] = 1 + } else { + // Otherwise counter = GHASH(nonce) + gcmSm4Data(&g.bytesProductTable, nonce, &counter) + gcmSm4Finish(&g.bytesProductTable, &tagMask, &counter, uint64(len(nonce)), uint64(0)) + } + + g.cipher.Encrypt(tagMask[:], counter[:]) + gcmInc32(&counter) + + var tagOut [gcmTagSize]byte + + gcmSm4Data(&g.bytesProductTable, data, &tagOut) + + ret, out := sliceForAppend(dst, len(plaintext)+g.tagSize) + if InexactOverlap(out[:len(plaintext)], plaintext) { + panic("crypto/cipher: invalid buffer overlap") + } + + if len(plaintext) > 0 { + g.counterCrypt(out, plaintext, &counter) + gcmSm4Data(&g.bytesProductTable, out[:len(plaintext)], &tagOut) + } + gcmSm4Finish(&g.bytesProductTable, &tagMask, &tagOut, uint64(len(plaintext)), uint64(len(data))) + copy(out[len(plaintext):], tagOut[:]) + + return ret +} + +// Open authenticates and decrypts ciphertext. See the cipher.AEAD interface +// for details. +func (g *gcmAsm) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) { + if len(nonce) != g.nonceSize { + panic("crypto/cipher: incorrect nonce length given to GCM") + } + // Sanity check to prevent the authentication from always succeeding if an implementation + // leaves tagSize uninitialized, for example. + if g.tagSize < gcmMinimumTagSize { + panic("crypto/cipher: incorrect GCM tag size") + } + + if len(ciphertext) < g.tagSize { + return nil, errOpen + } + if uint64(len(ciphertext)) > ((1<<32)-2)*uint64(BlockSize)+uint64(g.tagSize) { + return nil, errOpen + } + + tag := ciphertext[len(ciphertext)-g.tagSize:] + ciphertext = ciphertext[:len(ciphertext)-g.tagSize] + + // See GCM spec, section 7.1. + var counter, tagMask [gcmBlockSize]byte + + if len(nonce) == gcmStandardNonceSize { + // Init counter to nonce||1 + copy(counter[:], nonce) + counter[gcmBlockSize-1] = 1 + } else { + // Otherwise counter = GHASH(nonce) + gcmSm4Data(&g.bytesProductTable, nonce, &counter) + gcmSm4Finish(&g.bytesProductTable, &tagMask, &counter, uint64(len(nonce)), uint64(0)) + } + + g.cipher.Encrypt(tagMask[:], counter[:]) + gcmInc32(&counter) + + var expectedTag [gcmTagSize]byte + gcmSm4Data(&g.bytesProductTable, data, &expectedTag) + + ret, out := sliceForAppend(dst, len(ciphertext)) + if InexactOverlap(out, ciphertext) { + panic("crypto/cipher: invalid buffer overlap") + } + if len(ciphertext) > 0 { + gcmSm4Data(&g.bytesProductTable, ciphertext, &expectedTag) + } + gcmSm4Finish(&g.bytesProductTable, &tagMask, &expectedTag, uint64(len(ciphertext)), uint64(len(data))) + + if subtle.ConstantTimeCompare(expectedTag[:g.tagSize], tag) != 1 { + for i := range out { + out[i] = 0 + } + return nil, errOpen + } + + g.counterCrypt(out, ciphertext, &counter) + + return ret, nil +}
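
For reference, a minimal usage sketch of the fast path this patch adds, assuming the package exports a NewCipher constructor wrapping newCipher (the import path below is illustrative, not taken from the patch). On CPUs with AES-NI and PCLMULQDQ, newCipher returns a *sm4CipherGCM, and crypto/cipher.NewGCM then routes through the gcmAble implementation in sm4_gcm.go:

package main

import (
	"crypto/cipher"
	"crypto/rand"
	"fmt"
	"io"

	"github.com/emmansun/gmsm/sm4" // hypothetical module path for this sm4 package
)

func main() {
	key := make([]byte, 16)   // SM4 uses a 128-bit key
	nonce := make([]byte, 12) // gcmStandardNonceSize
	if _, err := io.ReadFull(rand.Reader, key); err != nil {
		panic(err)
	}
	if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
		panic(err)
	}

	block, err := sm4.NewCipher(key) // assumed exported wrapper around newCipher
	if err != nil {
		panic(err)
	}
	// cipher.NewGCM detects the gcmAble implementation added in this patch and
	// returns the assembly-backed AEAD when the CPU features are available.
	aead, err := cipher.NewGCM(block)
	if err != nil {
		panic(err)
	}

	ciphertext := aead.Seal(nil, nonce, []byte("hello sm4-gcm"), []byte("header"))
	plaintext, err := aead.Open(nil, nonce, ciphertext, []byte("header"))
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s\n", plaintext)
}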