From 88e456e9a8899a9ce5df05aa6307e74e2bfebe57 Mon Sep 17 00:00:00 2001 From: Emman Date: Fri, 19 Mar 2021 17:17:19 +0800 Subject: [PATCH] MAGIC - ghash asm tuning --- sm4/cipher_asm.go | 4 + sm4/gcm_amd64.go | 346 ++++++++++++++++++++++++++++++++++++++++ sm4/gcm_amd64.s | 392 ++++++++++++++++++++++++++++++++++++++++++++++ sm4/sm4_gcm.go | 158 +++++++++++++++++++ 4 files changed, 900 insertions(+) create mode 100644 sm4/gcm_amd64.go create mode 100644 sm4/gcm_amd64.s create mode 100644 sm4/sm4_gcm.go diff --git a/sm4/cipher_asm.go b/sm4/cipher_asm.go index 146c0d8..4632bd6 100644 --- a/sm4/cipher_asm.go +++ b/sm4/cipher_asm.go @@ -16,6 +16,7 @@ type sm4CipherAsm struct { } var supportsAES = cpu.X86.HasAES +var supportsGFMUL = cpu.X86.HasPCLMULQDQ func newCipher(key []byte) (cipher.Block, error) { if !supportsAES { @@ -23,6 +24,9 @@ func newCipher(key []byte) (cipher.Block, error) { } c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}} expandKeyGo(key, c.enc, c.dec) + if supportsAES && supportsGFMUL { + return &sm4CipherGCM{c}, nil + } return &c, nil } diff --git a/sm4/gcm_amd64.go b/sm4/gcm_amd64.go new file mode 100644 index 0000000..c718e92 --- /dev/null +++ b/sm4/gcm_amd64.go @@ -0,0 +1,346 @@ +package sm4 + +import ( + "crypto/cipher" + "crypto/subtle" + "encoding/binary" + "errors" +) + +// Assert that sm4CipherAsm implements the gcmAble interface. +var _ gcmAble = (*sm4CipherAsm)(nil) + +// NewGCM returns the AES cipher wrapped in Galois Counter Mode. This is only +// called by crypto/cipher.NewGCM via the gcmAble interface. +func (c *sm4CipherAsm) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) { + var key [gcmBlockSize]byte + c.Encrypt(key[:], key[:]) + g := &gcm{cipher: c, nonceSize: nonceSize, tagSize: tagSize} + // We precompute 16 multiples of |key|. However, when we do lookups + // into this table we'll be using bits from a field element and + // therefore the bits will be in the reverse order. So normally one + // would expect, say, 4*key to be in index 4 of the table but due to + // this bit ordering it will actually be in index 0010 (base 2) = 2. + x := gcmFieldElement{ + binary.BigEndian.Uint64(key[:8]), + binary.BigEndian.Uint64(key[8:]), + } + g.productTable[reverseBits(1)] = x + + for i := 2; i < 16; i += 2 { + g.productTable[reverseBits(i)] = gcmDouble(&g.productTable[reverseBits(i/2)]) + g.productTable[reverseBits(i+1)] = gcmAdd(&g.productTable[reverseBits(i)], &x) + } + + return g, nil +} + +// gcmFieldElement represents a value in GF(2¹²⁸). In order to reflect the GCM +// standard and make binary.BigEndian suitable for marshaling these values, the +// bits are stored in big endian order. For example: +// the coefficient of x⁰ can be obtained by v.low >> 63. +// the coefficient of x⁶³ can be obtained by v.low & 1. +// the coefficient of x⁶⁴ can be obtained by v.high >> 63. +// the coefficient of x¹²⁷ can be obtained by v.high & 1. +type gcmFieldElement struct { + low, high uint64 +} + +// gcm represents a Galois Counter Mode with a specific key. See +// https://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf +type gcm struct { + cipher *sm4CipherAsm + nonceSize int + tagSize int + // productTable contains the first sixteen powers of the key, H. + // However, they are in bit reversed order. See NewGCMWithNonceSize. + productTable [16]gcmFieldElement +} + +const ( + gcmBlockSize = 16 + gcmTagSize = 16 + gcmMinimumTagSize = 12 // NIST SP 800-38D recommends tags with 12 or more bytes. 
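+	// gcmStandardNonceSize is the 96-bit nonce length for which the fast
+	// counter-derivation path in deriveCounter (nonce||1) applies; other
+	// nonce lengths are run through GHASH instead.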
+ gcmStandardNonceSize = 12 +) + +func (g *gcm) NonceSize() int { + return g.nonceSize +} + +func (g *gcm) Overhead() int { + return g.tagSize +} + +func (g *gcm) Seal(dst, nonce, plaintext, data []byte) []byte { + if len(nonce) != g.nonceSize { + panic("crypto/cipher: incorrect nonce length given to GCM") + } + if uint64(len(plaintext)) > ((1<<32)-2)*uint64(g.cipher.BlockSize()) { + panic("crypto/cipher: message too large for GCM") + } + + ret, out := sliceForAppend(dst, len(plaintext)+g.tagSize) + if InexactOverlap(out, plaintext) { + panic("crypto/cipher: invalid buffer overlap") + } + + var counter, tagMask [gcmBlockSize]byte + g.deriveCounter(&counter, nonce) + + g.cipher.Encrypt(tagMask[:], counter[:]) + gcmInc32(&counter) + + g.counterCrypt(out, plaintext, &counter) + + var tag [gcmTagSize]byte + g.auth(tag[:], out[:len(plaintext)], data, &tagMask) + copy(out[len(plaintext):], tag[:]) + + return ret +} + +var errOpen = errors.New("cipher: message authentication failed") + +func (g *gcm) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) { + if len(nonce) != g.nonceSize { + panic("crypto/cipher: incorrect nonce length given to GCM") + } + // Sanity check to prevent the authentication from always succeeding if an implementation + // leaves tagSize uninitialized, for example. + if g.tagSize < gcmMinimumTagSize { + panic("crypto/cipher: incorrect GCM tag size") + } + + if len(ciphertext) < g.tagSize { + return nil, errOpen + } + if uint64(len(ciphertext)) > ((1<<32)-2)*uint64(g.cipher.BlockSize())+uint64(g.tagSize) { + return nil, errOpen + } + + tag := ciphertext[len(ciphertext)-g.tagSize:] + ciphertext = ciphertext[:len(ciphertext)-g.tagSize] + + var counter, tagMask [gcmBlockSize]byte + g.deriveCounter(&counter, nonce) + + g.cipher.Encrypt(tagMask[:], counter[:]) + gcmInc32(&counter) + + var expectedTag [gcmTagSize]byte + g.auth(expectedTag[:], ciphertext, data, &tagMask) + + ret, out := sliceForAppend(dst, len(ciphertext)) + if InexactOverlap(out, ciphertext) { + panic("crypto/cipher: invalid buffer overlap") + } + + if subtle.ConstantTimeCompare(expectedTag[:g.tagSize], tag) != 1 { + // The AESNI code decrypts and authenticates concurrently, and + // so overwrites dst in the event of a tag mismatch. That + // behavior is mimicked here in order to be consistent across + // platforms. + for i := range out { + out[i] = 0 + } + return nil, errOpen + } + + g.counterCrypt(out, ciphertext, &counter) + + return ret, nil +} + +// reverseBits reverses the order of the bits of 4-bit number in i. +func reverseBits(i int) int { + i = ((i << 2) & 0xc) | ((i >> 2) & 0x3) + i = ((i << 1) & 0xa) | ((i >> 1) & 0x5) + return i +} + +// gcmAdd adds two elements of GF(2¹²⁸) and returns the sum. +func gcmAdd(x, y *gcmFieldElement) gcmFieldElement { + // Addition in a characteristic 2 field is just XOR. + return gcmFieldElement{x.low ^ y.low, x.high ^ y.high} +} + +// gcmDouble returns the result of doubling an element of GF(2¹²⁸). +func gcmDouble(x *gcmFieldElement) (double gcmFieldElement) { + msbSet := x.high&1 == 1 + + // Because of the bit-ordering, doubling is actually a right shift. + double.high = x.high >> 1 + double.high |= x.low << 63 + double.low = x.low >> 1 + + // If the most-significant bit was set before shifting then it, + // conceptually, becomes a term of x^128. This is greater than the + // irreducible polynomial so the result has to be reduced. The + // irreducible polynomial is 1+x+x^2+x^7+x^128. 
We can subtract that to + // eliminate the term at x^128 which also means subtracting the other + // four terms. In characteristic 2 fields, subtraction == addition == + // XOR. + if msbSet { + double.low ^= 0xe100000000000000 + } + + return +} + +var gcmReductionTable = []uint16{ + 0x0000, 0x1c20, 0x3840, 0x2460, 0x7080, 0x6ca0, 0x48c0, 0x54e0, + 0xe100, 0xfd20, 0xd940, 0xc560, 0x9180, 0x8da0, 0xa9c0, 0xb5e0, +} + +// mul sets y to y*H, where H is the GCM key, fixed during NewGCMWithNonceSize. +func (g *gcm) mul(y *gcmFieldElement) { + var z gcmFieldElement + + for i := 0; i < 2; i++ { + word := y.high + if i == 1 { + word = y.low + } + + // Multiplication works by multiplying z by 16 and adding in + // one of the precomputed multiples of H. + for j := 0; j < 64; j += 4 { + msw := z.high & 0xf + z.high >>= 4 + z.high |= z.low << 60 + z.low >>= 4 + z.low ^= uint64(gcmReductionTable[msw]) << 48 + + // the values in |table| are ordered for + // little-endian bit positions. See the comment + // in NewGCMWithNonceSize. + t := &g.productTable[word&0xf] + + z.low ^= t.low + z.high ^= t.high + word >>= 4 + } + } + + *y = z +} + +// updateBlocks extends y with more polynomial terms from blocks, based on +// Horner's rule. There must be a multiple of gcmBlockSize bytes in blocks. +func (g *gcm) updateBlocks(y *gcmFieldElement, blocks []byte) { + for len(blocks) > 0 { + y.low ^= binary.BigEndian.Uint64(blocks) + y.high ^= binary.BigEndian.Uint64(blocks[8:]) + g.mul(y) + blocks = blocks[gcmBlockSize:] + } +} + +// update extends y with more polynomial terms from data. If data is not a +// multiple of gcmBlockSize bytes long then the remainder is zero padded. +func (g *gcm) update(y *gcmFieldElement, data []byte) { + fullBlocks := (len(data) >> 4) << 4 + g.updateBlocks(y, data[:fullBlocks]) + + if len(data) != fullBlocks { + var partialBlock [gcmBlockSize]byte + copy(partialBlock[:], data[fullBlocks:]) + g.updateBlocks(y, partialBlock[:]) + } +} + +// gcmInc32 treats the final four bytes of counterBlock as a big-endian value +// and increments it. +func gcmInc32(counterBlock *[16]byte) { + ctr := counterBlock[len(counterBlock)-4:] + binary.BigEndian.PutUint32(ctr, binary.BigEndian.Uint32(ctr)+1) +} + +// sliceForAppend takes a slice and a requested number of bytes. It returns a +// slice with the contents of the given slice followed by that many bytes and a +// second slice that aliases into it and contains only the extra bytes. If the +// original slice has sufficient capacity then no allocation is performed. +func sliceForAppend(in []byte, n int) (head, tail []byte) { + if total := len(in) + n; cap(in) >= total { + head = in[:total] + } else { + head = make([]byte, total) + copy(head, in) + } + tail = head[len(in):] + return +} + +// counterCrypt crypts in to out using g.cipher in counter mode. 
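+// It derives the keystream by encrypting four consecutive counter blocks per
+// encryptBlocksAsm call and XORing the result into out; a final partial group
+// is rounded up to whole blocks and only the needed keystream bytes are used.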
+func (g *gcm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte) {
+	var mask [FourBlocksSize]byte
+	var counters [FourBlocksSize]byte
+
+	for len(in) >= FourBlocksSize {
+		copy(counters[:], counter[:])
+		gcmInc32(counter)
+		copy(counters[gcmBlockSize:], counter[:])
+		gcmInc32(counter)
+		copy(counters[2*gcmBlockSize:], counter[:])
+		gcmInc32(counter)
+		copy(counters[3*gcmBlockSize:], counter[:])
+
+		encryptBlocksAsm(&g.cipher.enc[0], &mask[0], &counters[0])
+		gcmInc32(counter)
+		xorWords(out, in, mask[:])
+		out = out[FourBlocksSize:]
+		in = in[FourBlocksSize:]
+	}
+
+	if len(in) > 0 {
+		blocks := (len(in) + gcmBlockSize - 1) / gcmBlockSize
+		for i := 0; i < blocks; i++ {
+			copy(counters[i*gcmBlockSize:], counter[:])
+			gcmInc32(counter)
+		}
+		encryptBlocksAsm(&g.cipher.enc[0], &mask[0], &counters[0])
+		xorBytes(out, in, mask[:blocks*gcmBlockSize])
+	}
+}
+
+// deriveCounter computes the initial GCM counter state from the given nonce.
+// See NIST SP 800-38D, section 7.1. This assumes that counter is filled with
+// zeros on entry.
+func (g *gcm) deriveCounter(counter *[gcmBlockSize]byte, nonce []byte) {
+	// GCM has two modes of operation with respect to the initial counter
+	// state: a "fast path" for 96-bit (12-byte) nonces, and a "slow path"
+	// for nonces of other lengths. For a 96-bit nonce, the nonce, along
+	// with a four-byte big-endian counter starting at one, is used
+	// directly as the starting counter. For other nonce sizes, the counter
+	// is computed by passing it through the GHASH function.
+	if len(nonce) == gcmStandardNonceSize {
+		copy(counter[:], nonce)
+		counter[gcmBlockSize-1] = 1
+	} else {
+		var y gcmFieldElement
+		g.update(&y, nonce)
+		y.high ^= uint64(len(nonce)) * 8
+		g.mul(&y)
+		binary.BigEndian.PutUint64(counter[:8], y.low)
+		binary.BigEndian.PutUint64(counter[8:], y.high)
+	}
+}
+
+// auth calculates GHASH(ciphertext, additionalData), masks the result with
+// tagMask and writes the result to out.
+func (g *gcm) auth(out, ciphertext, additionalData []byte, tagMask *[gcmTagSize]byte) {
+	var y gcmFieldElement
+	g.update(&y, additionalData)
+	g.update(&y, ciphertext)
+
+	y.low ^= uint64(len(additionalData)) * 8
+	y.high ^= uint64(len(ciphertext)) * 8
+
+	g.mul(&y)
+
+	binary.BigEndian.PutUint64(out, y.low)
+	binary.BigEndian.PutUint64(out[8:], y.high)
+
+	xorWords(out, out, tagMask[:])
+}
diff --git a/sm4/gcm_amd64.s b/sm4/gcm_amd64.s
new file mode 100644
index 0000000..3da26ab
--- /dev/null
+++ b/sm4/gcm_amd64.s
@@ -0,0 +1,392 @@
+// This is an optimized GHASH implementation for SM4-GCM using CLMUL-NI,
+// adapted from the Go standard library's AES-GCM assembly.
+// The implementation uses some optimizations described in:
+// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
+//     Instruction and its Usage for Computing the GCM Mode rev.
2.02 +// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and +// Hardware + +#include "textflag.h" + +#define B0 X0 +#define B1 X1 +#define B2 X2 +#define B3 X3 +#define B4 X4 +#define B5 X5 +#define B6 X6 +#define B7 X7 + +#define ACC0 X8 +#define ACC1 X9 +#define ACCM X10 + +#define T0 X11 +#define T1 X12 +#define T2 X13 +#define POLY X14 +#define BSWAP X15 + +DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f +DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607 + +DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001 +DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000 + +DATA andMask<>+0x00(SB)/8, $0x00000000000000ff +DATA andMask<>+0x08(SB)/8, $0x0000000000000000 +DATA andMask<>+0x10(SB)/8, $0x000000000000ffff +DATA andMask<>+0x18(SB)/8, $0x0000000000000000 +DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff +DATA andMask<>+0x28(SB)/8, $0x0000000000000000 +DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff +DATA andMask<>+0x38(SB)/8, $0x0000000000000000 +DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff +DATA andMask<>+0x48(SB)/8, $0x0000000000000000 +DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff +DATA andMask<>+0x58(SB)/8, $0x0000000000000000 +DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff +DATA andMask<>+0x68(SB)/8, $0x0000000000000000 +DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff +DATA andMask<>+0x78(SB)/8, $0x0000000000000000 +DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff +DATA andMask<>+0x88(SB)/8, $0x00000000000000ff +DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff +DATA andMask<>+0x98(SB)/8, $0x000000000000ffff +DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff +DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff +DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff +DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff +DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff +DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff +DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff +DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff +DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff +DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff + +GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16 +GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16 +GLOBL andMask<>(SB), (NOPTR+RODATA), $240 + +// func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64) +TEXT ·gcmSm4Finish(SB),NOSPLIT,$0 +#define pTbl DI +#define tMsk SI +#define tPtr DX +#define plen AX +#define dlen CX + + MOVQ productTable+0(FP), pTbl + MOVQ tagMask+8(FP), tMsk + MOVQ T+16(FP), tPtr + MOVQ pLen+24(FP), plen + MOVQ dLen+32(FP), dlen + + MOVOU (tPtr), ACC0 + MOVOU (tMsk), T2 + + MOVOU bswapMask<>(SB), BSWAP + MOVOU gcmPoly<>(SB), POLY + + SHLQ $3, plen + SHLQ $3, dlen + + MOVQ plen, B0 + PINSRQ $1, dlen, B0 + + PXOR ACC0, B0 + + MOVOU (16*14)(pTbl), ACC0 + MOVOU (16*15)(pTbl), ACCM + MOVOU ACC0, ACC1 + + PCLMULQDQ $0x00, B0, ACC0 + PCLMULQDQ $0x11, B0, ACC1 + PSHUFD $78, B0, T0 + PXOR B0, T0 + PCLMULQDQ $0x00, T0, ACCM + + PXOR ACC0, ACCM + PXOR ACC1, ACCM + MOVOU ACCM, T0 + PSRLDQ $8, ACCM + PSLLDQ $8, T0 + PXOR ACCM, ACC1 + PXOR T0, ACC0 + + MOVOU POLY, T0 + PCLMULQDQ $0x01, ACC0, T0 + PSHUFD $78, ACC0, ACC0 + PXOR T0, ACC0 + + MOVOU POLY, T0 + PCLMULQDQ $0x01, ACC0, T0 + PSHUFD $78, ACC0, ACC0 + PXOR T0, ACC0 + + PXOR ACC1, ACC0 + + PSHUFB BSWAP, ACC0 + PXOR T2, ACC0 + MOVOU ACC0, (tPtr) + + RET + +#undef pTbl +#undef tMsk +#undef tPtr +#undef plen +#undef dlen + +// func precomputeTableAsm(productTable *[256]byte, src *[16]byte) +TEXT ·precomputeTableAsm(SB),NOSPLIT,$0 +#define dst DI +#define SRC SI + + MOVQ productTable+0(FP), dst + MOVQ src+8(FP), SRC + + MOVOU 
bswapMask<>(SB), BSWAP + MOVOU gcmPoly<>(SB), POLY + + MOVOU (16*0)(SRC), B0 + PSHUFB BSWAP, B0 + + // H * 2 + PSHUFD $0xff, B0, T0 + MOVOU B0, T1 + PSRAL $31, T0 + PAND POLY, T0 + PSRLL $31, T1 + PSLLDQ $4, T1 + PSLLL $1, B0 + PXOR T0, B0 + PXOR T1, B0 + // Karatsuba pre-computations + MOVOU B0, (16*14)(dst) + PSHUFD $78, B0, B1 + PXOR B0, B1 + MOVOU B1, (16*15)(dst) + + MOVOU B0, B2 + MOVOU B1, B3 + // Now prepare powers of H and pre-computations for them + MOVQ $7, AX + +initLoop: + MOVOU B2, T0 + MOVOU B2, T1 + MOVOU B3, T2 + PCLMULQDQ $0x00, B0, T0 + PCLMULQDQ $0x11, B0, T1 + PCLMULQDQ $0x00, B1, T2 + + PXOR T0, T2 + PXOR T1, T2 + MOVOU T2, B4 + PSLLDQ $8, B4 + PSRLDQ $8, T2 + PXOR B4, T0 + PXOR T2, T1 + + MOVOU POLY, B2 + PCLMULQDQ $0x01, T0, B2 + PSHUFD $78, T0, T0 + PXOR B2, T0 + MOVOU POLY, B2 + PCLMULQDQ $0x01, T0, B2 + PSHUFD $78, T0, T0 + PXOR T0, B2 + PXOR T1, B2 + + MOVOU B2, (16*12)(dst) + PSHUFD $78, B2, B3 + PXOR B2, B3 + MOVOU B3, (16*13)(dst) + + DECQ AX + LEAQ (-16*2)(dst), dst + JNE initLoop + + RET + +#undef SRC +#undef dst + +// func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte) +TEXT ·gcmSm4Data(SB),NOSPLIT,$0 +#define pTbl DI +#define aut SI +#define tPtr CX +#define autLen DX + +#define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a +#define mulRoundAAD(X ,i) \ + MOVOU (16*(i*2))(pTbl), T1;\ + MOVOU T1, T2;\ + PCLMULQDQ $0x00, X, T1;\ + PXOR T1, ACC0;\ + PCLMULQDQ $0x11, X, T2;\ + PXOR T2, ACC1;\ + PSHUFD $78, X, T1;\ + PXOR T1, X;\ + MOVOU (16*(i*2+1))(pTbl), T1;\ + PCLMULQDQ $0x00, X, T1;\ + PXOR T1, ACCM + + MOVQ productTable+0(FP), pTbl + MOVQ data_base+8(FP), aut + MOVQ data_len+16(FP), autLen + MOVQ T+32(FP), tPtr + + //PXOR ACC0, ACC0 + MOVOU (tPtr), ACC0 + MOVOU bswapMask<>(SB), BSWAP + MOVOU gcmPoly<>(SB), POLY + + TESTQ autLen, autLen + JEQ dataBail + + CMPQ autLen, $13 // optimize the TLS case + JE dataTLS + CMPQ autLen, $128 + JB startSinglesLoop + JMP dataOctaLoop + +dataTLS: + MOVOU (16*14)(pTbl), T1 + MOVOU (16*15)(pTbl), T2 + PXOR B0, B0 + MOVQ (aut), B0 + PINSRD $2, 8(aut), B0 + PINSRB $12, 12(aut), B0 + XORQ autLen, autLen + JMP dataMul + +dataOctaLoop: + CMPQ autLen, $128 + JB startSinglesLoop + SUBQ $128, autLen + + MOVOU (16*0)(aut), X0 + MOVOU (16*1)(aut), X1 + MOVOU (16*2)(aut), X2 + MOVOU (16*3)(aut), X3 + MOVOU (16*4)(aut), X4 + MOVOU (16*5)(aut), X5 + MOVOU (16*6)(aut), X6 + MOVOU (16*7)(aut), X7 + LEAQ (16*8)(aut), aut + PSHUFB BSWAP, X0 + PSHUFB BSWAP, X1 + PSHUFB BSWAP, X2 + PSHUFB BSWAP, X3 + PSHUFB BSWAP, X4 + PSHUFB BSWAP, X5 + PSHUFB BSWAP, X6 + PSHUFB BSWAP, X7 + PXOR ACC0, X0 + + MOVOU (16*0)(pTbl), ACC0 + MOVOU (16*1)(pTbl), ACCM + MOVOU ACC0, ACC1 + PSHUFD $78, X0, T1 + PXOR X0, T1 + PCLMULQDQ $0x00, X0, ACC0 + PCLMULQDQ $0x11, X0, ACC1 + PCLMULQDQ $0x00, T1, ACCM + + mulRoundAAD(X1, 1) + mulRoundAAD(X2, 2) + mulRoundAAD(X3, 3) + mulRoundAAD(X4, 4) + mulRoundAAD(X5, 5) + mulRoundAAD(X6, 6) + mulRoundAAD(X7, 7) + + PXOR ACC0, ACCM + PXOR ACC1, ACCM + MOVOU ACCM, T0 + PSRLDQ $8, ACCM + PSLLDQ $8, T0 + PXOR ACCM, ACC1 + PXOR T0, ACC0 + reduceRound(ACC0) + reduceRound(ACC0) + PXOR ACC1, ACC0 + JMP dataOctaLoop + +startSinglesLoop: + MOVOU (16*14)(pTbl), T1 + MOVOU (16*15)(pTbl), T2 + +dataSinglesLoop: + + CMPQ autLen, $16 + JB dataEnd + SUBQ $16, autLen + + MOVOU (aut), B0 +dataMul: + PSHUFB BSWAP, B0 + PXOR ACC0, B0 + + MOVOU T1, ACC0 + MOVOU T2, ACCM + MOVOU T1, ACC1 + + PSHUFD $78, B0, T0 + PXOR B0, T0 + PCLMULQDQ $0x00, B0, ACC0 + PCLMULQDQ $0x11, B0, ACC1 + PCLMULQDQ $0x00, 
T0, ACCM
+
+	PXOR ACC0, ACCM
+	PXOR ACC1, ACCM
+	MOVOU ACCM, T0
+	PSRLDQ $8, ACCM
+	PSLLDQ $8, T0
+	PXOR ACCM, ACC1
+	PXOR T0, ACC0
+
+	MOVOU POLY, T0
+	PCLMULQDQ $0x01, ACC0, T0
+	PSHUFD $78, ACC0, ACC0
+	PXOR T0, ACC0
+
+	MOVOU POLY, T0
+	PCLMULQDQ $0x01, ACC0, T0
+	PSHUFD $78, ACC0, ACC0
+	PXOR T0, ACC0
+	PXOR ACC1, ACC0
+
+	LEAQ 16(aut), aut
+
+	JMP dataSinglesLoop
+
+dataEnd:
+
+	TESTQ autLen, autLen
+	JEQ dataBail
+
+	PXOR B0, B0
+	LEAQ -1(aut)(autLen*1), aut
+
+dataLoadLoop:
+
+	PSLLDQ $1, B0
+	PINSRB $0, (aut), B0
+
+	LEAQ -1(aut), aut
+	DECQ autLen
+	JNE dataLoadLoop
+
+	JMP dataMul
+
+dataBail:
+	MOVOU ACC0, (tPtr)
+	RET
+
+#undef pTbl
+#undef aut
+#undef tPtr
+#undef autLen
diff --git a/sm4/sm4_gcm.go b/sm4/sm4_gcm.go
new file mode 100644
index 0000000..85761dc
--- /dev/null
+++ b/sm4/sm4_gcm.go
@@ -0,0 +1,158 @@
+// +build amd64
+
+package sm4
+
+import (
+	"crypto/cipher"
+	"crypto/subtle"
+)
+
+// sm4CipherGCM implements crypto/cipher.gcmAble so that crypto/cipher.NewGCM
+// will use the optimised implementation in this file when possible. Instances
+// of this type only exist when the CPU supports both the AES-NI and PCLMULQDQ
+// instruction sets (see newCipher in cipher_asm.go).
+type sm4CipherGCM struct {
+	sm4CipherAsm
+}
+
+// Assert that sm4CipherGCM implements the gcmAble interface.
+var _ gcmAble = (*sm4CipherGCM)(nil)
+
+//go:noescape
+func precomputeTableAsm(productTable *[256]byte, src *[16]byte)
+
+//go:noescape
+func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)
+
+//go:noescape
+func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
+
+type gcmAsm struct {
+	gcm
+	bytesProductTable [256]byte
+}
+
+// NewGCM returns the SM4 cipher wrapped in Galois Counter Mode. This is only
+// called by crypto/cipher.NewGCM via the gcmAble interface.
+func (c *sm4CipherGCM) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
+	g := &gcmAsm{}
+	g.cipher = &c.sm4CipherAsm
+	g.nonceSize = nonceSize
+	g.tagSize = tagSize
+	var key [gcmBlockSize]byte
+	c.Encrypt(key[:], key[:])
+	precomputeTableAsm(&g.bytesProductTable, &key)
+	return g, nil
+}
+
+func (g *gcmAsm) NonceSize() int {
+	return g.nonceSize
+}
+
+func (g *gcmAsm) Overhead() int {
+	return g.tagSize
+}
+
+// Seal encrypts and authenticates plaintext. See the cipher.AEAD interface for
+// details.
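+// The authentication tag is computed with the assembly GHASH helpers
+// (gcmSm4Data over the additional data and the ciphertext, then gcmSm4Finish
+// with the length block) and masked with the encryption of the initial counter.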
+func (g *gcmAsm) Seal(dst, nonce, plaintext, data []byte) []byte { + if len(nonce) != g.nonceSize { + panic("crypto/cipher: incorrect nonce length given to GCM") + } + if uint64(len(plaintext)) > ((1<<32)-2)*BlockSize { + panic("crypto/cipher: message too large for GCM") + } + + var counter, tagMask [gcmBlockSize]byte + + if len(nonce) == gcmStandardNonceSize { + // Init counter to nonce||1 + copy(counter[:], nonce) + counter[gcmBlockSize-1] = 1 + } else { + // Otherwise counter = GHASH(nonce) + gcmSm4Data(&g.bytesProductTable, nonce, &counter) + gcmSm4Finish(&g.bytesProductTable, &tagMask, &counter, uint64(len(nonce)), uint64(0)) + } + + g.cipher.Encrypt(tagMask[:], counter[:]) + gcmInc32(&counter) + + var tagOut [gcmTagSize]byte + + gcmSm4Data(&g.bytesProductTable, data, &tagOut) + + ret, out := sliceForAppend(dst, len(plaintext)+g.tagSize) + if InexactOverlap(out[:len(plaintext)], plaintext) { + panic("crypto/cipher: invalid buffer overlap") + } + + if len(plaintext) > 0 { + g.counterCrypt(out, plaintext, &counter) + gcmSm4Data(&g.bytesProductTable, out[:len(plaintext)], &tagOut) + } + gcmSm4Finish(&g.bytesProductTable, &tagMask, &tagOut, uint64(len(plaintext)), uint64(len(data))) + copy(out[len(plaintext):], tagOut[:]) + + return ret +} + +// Open authenticates and decrypts ciphertext. See the cipher.AEAD interface +// for details. +func (g *gcmAsm) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) { + if len(nonce) != g.nonceSize { + panic("crypto/cipher: incorrect nonce length given to GCM") + } + // Sanity check to prevent the authentication from always succeeding if an implementation + // leaves tagSize uninitialized, for example. + if g.tagSize < gcmMinimumTagSize { + panic("crypto/cipher: incorrect GCM tag size") + } + + if len(ciphertext) < g.tagSize { + return nil, errOpen + } + if uint64(len(ciphertext)) > ((1<<32)-2)*uint64(BlockSize)+uint64(g.tagSize) { + return nil, errOpen + } + + tag := ciphertext[len(ciphertext)-g.tagSize:] + ciphertext = ciphertext[:len(ciphertext)-g.tagSize] + + // See GCM spec, section 7.1. + var counter, tagMask [gcmBlockSize]byte + + if len(nonce) == gcmStandardNonceSize { + // Init counter to nonce||1 + copy(counter[:], nonce) + counter[gcmBlockSize-1] = 1 + } else { + // Otherwise counter = GHASH(nonce) + gcmSm4Data(&g.bytesProductTable, nonce, &counter) + gcmSm4Finish(&g.bytesProductTable, &tagMask, &counter, uint64(len(nonce)), uint64(0)) + } + + g.cipher.Encrypt(tagMask[:], counter[:]) + gcmInc32(&counter) + + var expectedTag [gcmTagSize]byte + gcmSm4Data(&g.bytesProductTable, data, &expectedTag) + + ret, out := sliceForAppend(dst, len(ciphertext)) + if InexactOverlap(out, ciphertext) { + panic("crypto/cipher: invalid buffer overlap") + } + if len(ciphertext) > 0 { + gcmSm4Data(&g.bytesProductTable, ciphertext, &expectedTag) + } + gcmSm4Finish(&g.bytesProductTable, &tagMask, &expectedTag, uint64(len(ciphertext)), uint64(len(data))) + + if subtle.ConstantTimeCompare(expectedTag[:g.tagSize], tag) != 1 { + for i := range out { + out[i] = 0 + } + return nil, errOpen + } + + g.counterCrypt(out, ciphertext, &counter) + + return ret, nil +}
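
For reference, a minimal usage sketch of the fast path this patch adds, assuming the package exports a NewCipher constructor wrapping newCipher (the import path below is illustrative, not taken from the patch). On CPUs with AES-NI and PCLMULQDQ, newCipher returns a *sm4CipherGCM, and crypto/cipher.NewGCM then routes through the gcmAble implementation in sm4_gcm.go:

package main

import (
	"crypto/cipher"
	"crypto/rand"
	"fmt"
	"io"

	"github.com/emmansun/gmsm/sm4" // hypothetical module path for this sm4 package
)

func main() {
	key := make([]byte, 16)   // SM4 uses a 128-bit key
	nonce := make([]byte, 12) // gcmStandardNonceSize
	if _, err := io.ReadFull(rand.Reader, key); err != nil {
		panic(err)
	}
	if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
		panic(err)
	}

	block, err := sm4.NewCipher(key) // assumed exported wrapper around newCipher
	if err != nil {
		panic(err)
	}
	// cipher.NewGCM detects the gcmAble implementation added in this patch and
	// returns the assembly-backed AEAD when the CPU features are available.
	aead, err := cipher.NewGCM(block)
	if err != nil {
		panic(err)
	}

	ciphertext := aead.Seal(nil, nonce, []byte("hello sm4-gcm"), []byte("header"))
	plaintext, err := aead.Open(nil, nonce, ciphertext, []byte("header"))
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s\n", plaintext)
}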