MAGIC - ghash asm tuning

Emman 2021-03-19 17:17:19 +08:00
parent 3e2e3c8c63
commit 88e456e9a8
4 changed files with 900 additions and 0 deletions


@@ -16,6 +16,7 @@ type sm4CipherAsm struct {
}
var supportsAES = cpu.X86.HasAES
var supportsGFMUL = cpu.X86.HasPCLMULQDQ
func newCipher(key []byte) (cipher.Block, error) {
if !supportsAES {
@@ -23,6 +24,9 @@ func newCipher(key []byte) (cipher.Block, error) {
}
c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}}
expandKeyGo(key, c.enc, c.dec)
if supportsAES && supportsGFMUL {
return &sm4CipherGCM{c}, nil
}
return &c, nil
}

sm4/gcm_amd64.go (new file, 346 lines)

@@ -0,0 +1,346 @@
package sm4
import (
"crypto/cipher"
"crypto/subtle"
"encoding/binary"
"errors"
)
// Assert that sm4CipherAsm implements the gcmAble interface.
var _ gcmAble = (*sm4CipherAsm)(nil)
// NewGCM returns the SM4 cipher wrapped in Galois Counter Mode. This is only
// called by crypto/cipher.NewGCM via the gcmAble interface.
func (c *sm4CipherAsm) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
var key [gcmBlockSize]byte
c.Encrypt(key[:], key[:])
g := &gcm{cipher: c, nonceSize: nonceSize, tagSize: tagSize}
// We precompute 16 multiples of |key|. However, when we do lookups
// into this table we'll be using bits from a field element and
// therefore the bits will be in the reverse order. So normally one
// would expect, say, 4*key to be in index 4 of the table but due to
// this bit ordering it will actually be in index 0010 (base 2) = 2.
x := gcmFieldElement{
binary.BigEndian.Uint64(key[:8]),
binary.BigEndian.Uint64(key[8:]),
}
g.productTable[reverseBits(1)] = x
for i := 2; i < 16; i += 2 {
g.productTable[reverseBits(i)] = gcmDouble(&g.productTable[reverseBits(i/2)])
g.productTable[reverseBits(i+1)] = gcmAdd(&g.productTable[reverseBits(i)], &x)
}
return g, nil
}
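A quick illustration of that bit-reversed indexing (a sketch, not part of the commit; it borrows the reverseBits helper defined later in this file):

package main

import "fmt"

func reverseBits(i int) int { // copied from reverseBits below
	i = ((i << 2) & 0xc) | ((i >> 2) & 0x3)
	i = ((i << 1) & 0xa) | ((i >> 1) & 0x5)
	return i
}

func main() {
	for i := 0; i < 16; i++ {
		fmt.Printf("%2d*H -> productTable[%2d]\n", i, reverseBits(i)) // e.g. 4*H -> index 2
	}
}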
// gcmFieldElement represents a value in GF(2¹²⁸). In order to reflect the GCM
// standard and make binary.BigEndian suitable for marshaling these values, the
// bits are stored in big endian order. For example:
// the coefficient of x⁰ can be obtained by v.low >> 63.
// the coefficient of x⁶³ can be obtained by v.low & 1.
// the coefficient of x⁶⁴ can be obtained by v.high >> 63.
// the coefficient of x¹²⁷ can be obtained by v.high & 1.
type gcmFieldElement struct {
low, high uint64
}
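A worked example of that coefficient layout (a fragment, not part of the commit; assumes "fmt" is imported):

v := gcmFieldElement{low: 1 << 63, high: 1 << 63} // the polynomial 1 + x⁶⁴
fmt.Println(v.low>>63, v.high>>63)                // prints "1 1": the coefficients of x⁰ and x⁶⁴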
// gcm represents a Galois Counter Mode with a specific key. See
// https://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
type gcm struct {
cipher *sm4CipherAsm
nonceSize int
tagSize int
// productTable contains the first sixteen powers of the key, H.
// However, they are in bit reversed order. See NewGCM above.
productTable [16]gcmFieldElement
}
const (
gcmBlockSize = 16
gcmTagSize = 16
gcmMinimumTagSize = 12 // NIST SP 800-38D recommends tags with 12 or more bytes.
gcmStandardNonceSize = 12
)
func (g *gcm) NonceSize() int {
return g.nonceSize
}
func (g *gcm) Overhead() int {
return g.tagSize
}
func (g *gcm) Seal(dst, nonce, plaintext, data []byte) []byte {
if len(nonce) != g.nonceSize {
panic("crypto/cipher: incorrect nonce length given to GCM")
}
if uint64(len(plaintext)) > ((1<<32)-2)*uint64(g.cipher.BlockSize()) {
panic("crypto/cipher: message too large for GCM")
}
ret, out := sliceForAppend(dst, len(plaintext)+g.tagSize)
if InexactOverlap(out, plaintext) {
panic("crypto/cipher: invalid buffer overlap")
}
var counter, tagMask [gcmBlockSize]byte
g.deriveCounter(&counter, nonce)
g.cipher.Encrypt(tagMask[:], counter[:])
gcmInc32(&counter)
g.counterCrypt(out, plaintext, &counter)
var tag [gcmTagSize]byte
g.auth(tag[:], out[:len(plaintext)], data, &tagMask)
copy(out[len(plaintext):], tag[:])
return ret
}
var errOpen = errors.New("cipher: message authentication failed")
func (g *gcm) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) {
if len(nonce) != g.nonceSize {
panic("crypto/cipher: incorrect nonce length given to GCM")
}
// Sanity check to prevent the authentication from always succeeding if an implementation
// leaves tagSize uninitialized, for example.
if g.tagSize < gcmMinimumTagSize {
panic("crypto/cipher: incorrect GCM tag size")
}
if len(ciphertext) < g.tagSize {
return nil, errOpen
}
if uint64(len(ciphertext)) > ((1<<32)-2)*uint64(g.cipher.BlockSize())+uint64(g.tagSize) {
return nil, errOpen
}
tag := ciphertext[len(ciphertext)-g.tagSize:]
ciphertext = ciphertext[:len(ciphertext)-g.tagSize]
var counter, tagMask [gcmBlockSize]byte
g.deriveCounter(&counter, nonce)
g.cipher.Encrypt(tagMask[:], counter[:])
gcmInc32(&counter)
var expectedTag [gcmTagSize]byte
g.auth(expectedTag[:], ciphertext, data, &tagMask)
ret, out := sliceForAppend(dst, len(ciphertext))
if InexactOverlap(out, ciphertext) {
panic("crypto/cipher: invalid buffer overlap")
}
if subtle.ConstantTimeCompare(expectedTag[:g.tagSize], tag) != 1 {
// The AESNI code decrypts and authenticates concurrently, and
// so overwrites dst in the event of a tag mismatch. That
// behavior is mimicked here in order to be consistent across
// platforms.
for i := range out {
out[i] = 0
}
return nil, errOpen
}
g.counterCrypt(out, ciphertext, &counter)
return ret, nil
}
// reverseBits reverses the order of the bits of the 4-bit number i.
func reverseBits(i int) int {
i = ((i << 2) & 0xc) | ((i >> 2) & 0x3)
i = ((i << 1) & 0xa) | ((i >> 1) & 0x5)
return i
}
// gcmAdd adds two elements of GF(2¹²⁸) and returns the sum.
func gcmAdd(x, y *gcmFieldElement) gcmFieldElement {
// Addition in a characteristic 2 field is just XOR.
return gcmFieldElement{x.low ^ y.low, x.high ^ y.high}
}
// gcmDouble returns the result of doubling an element of GF(2¹²⁸).
func gcmDouble(x *gcmFieldElement) (double gcmFieldElement) {
msbSet := x.high&1 == 1
// Because of the bit-ordering, doubling is actually a right shift.
double.high = x.high >> 1
double.high |= x.low << 63
double.low = x.low >> 1
// If the most-significant bit was set before shifting then it,
// conceptually, becomes a term of x^128. This is greater than the
// irreducible polynomial so the result has to be reduced. The
// irreducible polynomial is 1+x+x^2+x^7+x^128. We can subtract that to
// eliminate the term at x^128 which also means subtracting the other
// four terms. In characteristic 2 fields, subtraction == addition ==
// XOR.
if msbSet {
double.low ^= 0xe100000000000000
}
return
}
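Where that constant comes from (a worked check, not part of the commit): in this bit-reversed layout the term xⁱ of the low word sits at bit 63-i, so the residue 1 + x + x² + x⁷ of the irreducible polynomial occupies bits 63, 62, 61 and 56:

const reduction = uint64(1)<<63 | uint64(1)<<62 | uint64(1)<<61 | uint64(1)<<56
// reduction == 0xe100000000000000, the literal XORed in by gcmDouble.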
var gcmReductionTable = []uint16{
0x0000, 0x1c20, 0x3840, 0x2460, 0x7080, 0x6ca0, 0x48c0, 0x54e0,
0xe100, 0xfd20, 0xd940, 0xc560, 0x9180, 0x8da0, 0xa9c0, 0xb5e0,
}
// mul sets y to y*H, where H is the GCM key, fixed during NewGCM.
func (g *gcm) mul(y *gcmFieldElement) {
var z gcmFieldElement
for i := 0; i < 2; i++ {
word := y.high
if i == 1 {
word = y.low
}
// Multiplication works by multiplying z by 16 and adding in
// one of the precomputed multiples of H.
for j := 0; j < 64; j += 4 {
msw := z.high & 0xf
z.high >>= 4
z.high |= z.low << 60
z.low >>= 4
z.low ^= uint64(gcmReductionTable[msw]) << 48
// the values in |table| are ordered for
// little-endian bit positions. See the comment
// in NewGCM.
t := &g.productTable[word&0xf]
z.low ^= t.low
z.high ^= t.high
word >>= 4
}
}
*y = z
}
// updateBlocks extends y with more polynomial terms from blocks, based on
// Horner's rule. There must be a multiple of gcmBlockSize bytes in blocks.
func (g *gcm) updateBlocks(y *gcmFieldElement, blocks []byte) {
for len(blocks) > 0 {
y.low ^= binary.BigEndian.Uint64(blocks)
y.high ^= binary.BigEndian.Uint64(blocks[8:])
g.mul(y)
blocks = blocks[gcmBlockSize:]
}
}
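updateBlocks is a direct transcription of Horner's rule: with blocks X_1 … X_n and key H, the state evolves as Y_i = (Y_{i-1} ⊕ X_i)·H, which unrolls to Y_n = X_1·H^n ⊕ X_2·H^(n-1) ⊕ … ⊕ X_n·H. The octa-block path in gcm_amd64.s batches this recurrence by multiplying eight blocks by H^8 … H^1 in a single pass.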
// update extends y with more polynomial terms from data. If data is not a
// multiple of gcmBlockSize bytes long then the remainder is zero padded.
func (g *gcm) update(y *gcmFieldElement, data []byte) {
fullBlocks := (len(data) >> 4) << 4
g.updateBlocks(y, data[:fullBlocks])
if len(data) != fullBlocks {
var partialBlock [gcmBlockSize]byte
copy(partialBlock[:], data[fullBlocks:])
g.updateBlocks(y, partialBlock[:])
}
}
// gcmInc32 treats the final four bytes of counterBlock as a big-endian value
// and increments it.
func gcmInc32(counterBlock *[16]byte) {
ctr := counterBlock[len(counterBlock)-4:]
binary.BigEndian.PutUint32(ctr, binary.BigEndian.Uint32(ctr)+1)
}
// sliceForAppend takes a slice and a requested number of bytes. It returns a
// slice with the contents of the given slice followed by that many bytes and a
// second slice that aliases into it and contains only the extra bytes. If the
// original slice has sufficient capacity then no allocation is performed.
func sliceForAppend(in []byte, n int) (head, tail []byte) {
if total := len(in) + n; cap(in) >= total {
head = in[:total]
} else {
head = make([]byte, total)
copy(head, in)
}
tail = head[len(in):]
return
}
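A small usage sketch of sliceForAppend (a fragment, not part of the commit), showing the in-place growth that Seal relies on:

buf := make([]byte, 4, 32)
head, tail := sliceForAppend(buf, 16)
// cap(buf) >= 20, so head aliases buf's backing array: no allocation.
// len(head) == 20, and tail is the fresh 16-byte region head[4:].
_, _ = head, tail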
// counterCrypt crypts in to out using g.cipher in counter mode.
func (g *gcm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte) {
var mask [FourBlocksSize]byte
var counters [FourBlocksSize]byte
// Fill four counter blocks, then encrypt them with a single call into the
// four-block assembly routine.
for len(in) >= FourBlocksSize {
copy(counters[:], counter[:])
gcmInc32(counter)
copy(counters[gcmBlockSize:], counter[:])
gcmInc32(counter)
copy(counters[2*gcmBlockSize:], counter[:])
gcmInc32(counter)
copy(counters[3*gcmBlockSize:], counter[:])
gcmInc32(counter)
encryptBlocksAsm(&g.cipher.enc[0], &mask[0], &counters[0])
xorWords(out, in, mask[:])
out = out[FourBlocksSize:]
in = in[FourBlocksSize:]
}
if len(in) > 0 {
// Tail: round the remainder up to whole blocks; the xor below only
// consumes len(in) bytes of the keystream.
blocks := (len(in) + gcmBlockSize - 1) / gcmBlockSize
for i := 0; i < blocks; i++ {
copy(counters[i*gcmBlockSize:], counter[:])
gcmInc32(counter)
}
encryptBlocksAsm(&g.cipher.enc[0], &mask[0], &counters[0])
xorBytes(out, in, mask[:blocks*gcmBlockSize])
}
}
}
// deriveCounter computes the initial GCM counter state from the given nonce.
// See NIST SP 800-38D, section 7.1. This assumes that counter is filled with
// zeros on entry.
func (g *gcm) deriveCounter(counter *[gcmBlockSize]byte, nonce []byte) {
// GCM has two modes of operation with respect to the initial counter
// state: a "fast path" for 96-bit (12-byte) nonces, and a "slow path"
// for nonces of other lengths. For a 96-bit nonce, the nonce, along
// with a four-byte big-endian counter starting at one, is used
// directly as the starting counter. For other nonce sizes, the counter
// is computed by passing it through the GHASH function.
if len(nonce) == gcmStandardNonceSize {
copy(counter[:], nonce)
counter[gcmBlockSize-1] = 1
} else {
var y gcmFieldElement
g.update(&y, nonce)
y.high ^= uint64(len(nonce)) * 8
g.mul(&y)
binary.BigEndian.PutUint64(counter[:8], y.low)
binary.BigEndian.PutUint64(counter[8:], y.high)
}
}
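For the common 96-bit nonce, the fast path amounts to counter = nonce || 0x00000001 (a fragment, not part of the commit):

nonce := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11} // 12 bytes
var counter [gcmBlockSize]byte
copy(counter[:], nonce)
counter[gcmBlockSize-1] = 1 // big-endian 32-bit counter starts at 1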
// auth calculates GHASH(ciphertext, additionalData), masks the result with
// tagMask and writes the result to out.
func (g *gcm) auth(out, ciphertext, additionalData []byte, tagMask *[gcmTagSize]byte) {
var y gcmFieldElement
g.update(&y, additionalData)
g.update(&y, ciphertext)
y.low ^= uint64(len(additionalData)) * 8
y.high ^= uint64(len(ciphertext)) * 8
g.mul(&y)
binary.BigEndian.PutUint64(out, y.low)
binary.BigEndian.PutUint64(out[8:], y.high)
xorWords(out, out, tagMask[:])
}
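Taken together, deriveCounter, counterCrypt and auth implement the standard construction from NIST SP 800-38D: the tag is T = GHASH_H(A, C, len(A), len(C)) ⊕ E_K(J_0), where J_0 is the pre-counter block produced by deriveCounter and E_K(J_0) is the tagMask argument passed into auth.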

sm4/gcm_amd64.s (new file, 392 lines)

@@ -0,0 +1,392 @@
// This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI
// The implementation uses some optimization as described in:
// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
// Instruction and its Usage for Computing the GCM Mode rev. 2.02
// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
// Hardware
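// The key trick from [1] is Karatsuba multiplication in GF(2¹²⁸): each
// 128x128-bit carry-less product below needs only three PCLMULQDQ
// instructions (the $0x00 and $0x11 products plus one product of the folded
// halves, kept in ACCM). A minimal Go sketch of that decomposition
// (illustration only, not part of the commit):

// clmul64 is a slow software carry-less multiply of two 64-bit values.
func clmul64(a, b uint64) (hi, lo uint64) {
	for i := uint(0); i < 64; i++ {
		if b&(1<<i) != 0 {
			lo ^= a << i
			hi ^= a >> (64 - i) // Go defines x>>64 == 0, so i == 0 is safe
		}
	}
	return
}

// karatsuba128 multiplies x1||x0 by y1||y0 with three clmuls instead of
// four: x*y = A·z^128 ⊕ (A⊕B⊕C)·z^64 ⊕ B, where A = x1*y1, B = x0*y0 and
// C = (x1⊕x0)*(y1⊕y0).
func karatsuba128(x1, x0, y1, y0 uint64) (r3, r2, r1, r0 uint64) {
	aHi, aLo := clmul64(x1, y1)          // PCLMULQDQ $0x11
	bHi, bLo := clmul64(x0, y0)          // PCLMULQDQ $0x00
	cHi, cLo := clmul64(x1^x0, y1^y0)    // the ACCM product
	mHi, mLo := cHi^aHi^bHi, cLo^aLo^bLo // middle term A⊕B⊕C
	return aHi, aLo ^ mHi, bHi ^ mLo, bLo
}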
#include "textflag.h"
#define B0 X0
#define B1 X1
#define B2 X2
#define B3 X3
#define B4 X4
#define B5 X5
#define B6 X6
#define B7 X7
#define ACC0 X8
#define ACC1 X9
#define ACCM X10
#define T0 X11
#define T1 X12
#define T2 X13
#define POLY X14
#define BSWAP X15
DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
DATA andMask<>+0x08(SB)/8, $0x0000000000000000
DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
DATA andMask<>+0x18(SB)/8, $0x0000000000000000
DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
DATA andMask<>+0x28(SB)/8, $0x0000000000000000
DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
DATA andMask<>+0x38(SB)/8, $0x0000000000000000
DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
DATA andMask<>+0x48(SB)/8, $0x0000000000000000
DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0x58(SB)/8, $0x0000000000000000
DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
DATA andMask<>+0x68(SB)/8, $0x0000000000000000
DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x78(SB)/8, $0x0000000000000000
DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
GLOBL andMask<>(SB), (NOPTR+RODATA), $240
// func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
#define pTbl DI
#define tMsk SI
#define tPtr DX
#define plen AX
#define dlen CX
MOVQ productTable+0(FP), pTbl
MOVQ tagMask+8(FP), tMsk
MOVQ T+16(FP), tPtr
MOVQ pLen+24(FP), plen
MOVQ dLen+32(FP), dlen
MOVOU (tPtr), ACC0
MOVOU (tMsk), T2
MOVOU bswapMask<>(SB), BSWAP
MOVOU gcmPoly<>(SB), POLY
SHLQ $3, plen
SHLQ $3, dlen
MOVQ plen, B0
PINSRQ $1, dlen, B0
PXOR ACC0, B0
MOVOU (16*14)(pTbl), ACC0
MOVOU (16*15)(pTbl), ACCM
MOVOU ACC0, ACC1
PCLMULQDQ $0x00, B0, ACC0
PCLMULQDQ $0x11, B0, ACC1
PSHUFD $78, B0, T0
PXOR B0, T0
PCLMULQDQ $0x00, T0, ACCM
PXOR ACC0, ACCM
PXOR ACC1, ACCM
MOVOU ACCM, T0
PSRLDQ $8, ACCM
PSLLDQ $8, T0
PXOR ACCM, ACC1
PXOR T0, ACC0
MOVOU POLY, T0
PCLMULQDQ $0x01, ACC0, T0
PSHUFD $78, ACC0, ACC0
PXOR T0, ACC0
MOVOU POLY, T0
PCLMULQDQ $0x01, ACC0, T0
PSHUFD $78, ACC0, ACC0
PXOR T0, ACC0
PXOR ACC1, ACC0
PSHUFB BSWAP, ACC0
PXOR T2, ACC0
MOVOU ACC0, (tPtr)
RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen
// func precomputeTableAsm(productTable *[256]byte, src *[16]byte)
TEXT ·precomputeTableAsm(SB),NOSPLIT,$0
#define dst DI
#define SRC SI
MOVQ productTable+0(FP), dst
MOVQ src+8(FP), SRC
MOVOU bswapMask<>(SB), BSWAP
MOVOU gcmPoly<>(SB), POLY
MOVOU (16*0)(SRC), B0
PSHUFB BSWAP, B0
// H * 2
PSHUFD $0xff, B0, T0
MOVOU B0, T1
PSRAL $31, T0
PAND POLY, T0
PSRLL $31, T1
PSLLDQ $4, T1
PSLLL $1, B0
PXOR T0, B0
PXOR T1, B0
// Karatsuba pre-computations
MOVOU B0, (16*14)(dst)
PSHUFD $78, B0, B1
PXOR B0, B1
MOVOU B1, (16*15)(dst)
MOVOU B0, B2
MOVOU B1, B3
// Now prepare powers of H and pre-computations for them
MOVQ $7, AX
initLoop:
MOVOU B2, T0
MOVOU B2, T1
MOVOU B3, T2
PCLMULQDQ $0x00, B0, T0
PCLMULQDQ $0x11, B0, T1
PCLMULQDQ $0x00, B1, T2
PXOR T0, T2
PXOR T1, T2
MOVOU T2, B4
PSLLDQ $8, B4
PSRLDQ $8, T2
PXOR B4, T0
PXOR T2, T1
MOVOU POLY, B2
PCLMULQDQ $0x01, T0, B2
PSHUFD $78, T0, T0
PXOR B2, T0
MOVOU POLY, B2
PCLMULQDQ $0x01, T0, B2
PSHUFD $78, T0, T0
PXOR T0, B2
PXOR T1, B2
MOVOU B2, (16*12)(dst)
PSHUFD $78, B2, B3
PXOR B2, B3
MOVOU B3, (16*13)(dst)
DECQ AX
LEAQ (-16*2)(dst), dst
JNE initLoop
RET
#undef SRC
#undef dst
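For reference, a sketch of the table layout implied by the store offsets above (the odd slots hold the pre-folded Karatsuba halves, hi ⊕ lo):

offset 16*0,  16*1  : H^8 and its folded halves
offset 16*2,  16*3  : H^7 and its folded halves
...
offset 16*14, 16*15 : H^1 and its folded halves

This lets the octa-block loops fetch H^k and its Karatsuba fold with two loads per block.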
// func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)
TEXT ·gcmSm4Data(SB),NOSPLIT,$0
#define pTbl DI
#define aut SI
#define tPtr CX
#define autLen DX
#define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
#define mulRoundAAD(X ,i) \
MOVOU (16*(i*2))(pTbl), T1;\
MOVOU T1, T2;\
PCLMULQDQ $0x00, X, T1;\
PXOR T1, ACC0;\
PCLMULQDQ $0x11, X, T2;\
PXOR T2, ACC1;\
PSHUFD $78, X, T1;\
PXOR T1, X;\
MOVOU (16*(i*2+1))(pTbl), T1;\
PCLMULQDQ $0x00, X, T1;\
PXOR T1, ACCM
MOVQ productTable+0(FP), pTbl
MOVQ data_base+8(FP), aut
MOVQ data_len+16(FP), autLen
MOVQ T+32(FP), tPtr
// Unlike the stdlib AES-GCM data routine, T is loaded rather than zeroed,
// so the hash state accumulates across successive gcmSm4Data calls.
MOVOU (tPtr), ACC0
MOVOU bswapMask<>(SB), BSWAP
MOVOU gcmPoly<>(SB), POLY
TESTQ autLen, autLen
JEQ dataBail
CMPQ autLen, $13 // optimize the TLS case
JE dataTLS
CMPQ autLen, $128
JB startSinglesLoop
JMP dataOctaLoop
dataTLS:
MOVOU (16*14)(pTbl), T1
MOVOU (16*15)(pTbl), T2
PXOR B0, B0
MOVQ (aut), B0
PINSRD $2, 8(aut), B0
PINSRB $12, 12(aut), B0
XORQ autLen, autLen
JMP dataMul
dataOctaLoop:
CMPQ autLen, $128
JB startSinglesLoop
SUBQ $128, autLen
MOVOU (16*0)(aut), X0
MOVOU (16*1)(aut), X1
MOVOU (16*2)(aut), X2
MOVOU (16*3)(aut), X3
MOVOU (16*4)(aut), X4
MOVOU (16*5)(aut), X5
MOVOU (16*6)(aut), X6
MOVOU (16*7)(aut), X7
LEAQ (16*8)(aut), aut
PSHUFB BSWAP, X0
PSHUFB BSWAP, X1
PSHUFB BSWAP, X2
PSHUFB BSWAP, X3
PSHUFB BSWAP, X4
PSHUFB BSWAP, X5
PSHUFB BSWAP, X6
PSHUFB BSWAP, X7
PXOR ACC0, X0
MOVOU (16*0)(pTbl), ACC0
MOVOU (16*1)(pTbl), ACCM
MOVOU ACC0, ACC1
PSHUFD $78, X0, T1
PXOR X0, T1
PCLMULQDQ $0x00, X0, ACC0
PCLMULQDQ $0x11, X0, ACC1
PCLMULQDQ $0x00, T1, ACCM
mulRoundAAD(X1, 1)
mulRoundAAD(X2, 2)
mulRoundAAD(X3, 3)
mulRoundAAD(X4, 4)
mulRoundAAD(X5, 5)
mulRoundAAD(X6, 6)
mulRoundAAD(X7, 7)
PXOR ACC0, ACCM
PXOR ACC1, ACCM
MOVOU ACCM, T0
PSRLDQ $8, ACCM
PSLLDQ $8, T0
PXOR ACCM, ACC1
PXOR T0, ACC0
reduceRound(ACC0)
reduceRound(ACC0)
PXOR ACC1, ACC0
JMP dataOctaLoop
startSinglesLoop:
MOVOU (16*14)(pTbl), T1
MOVOU (16*15)(pTbl), T2
dataSinglesLoop:
CMPQ autLen, $16
JB dataEnd
SUBQ $16, autLen
MOVOU (aut), B0
dataMul:
PSHUFB BSWAP, B0
PXOR ACC0, B0
MOVOU T1, ACC0
MOVOU T2, ACCM
MOVOU T1, ACC1
PSHUFD $78, B0, T0
PXOR B0, T0
PCLMULQDQ $0x00, B0, ACC0
PCLMULQDQ $0x11, B0, ACC1
PCLMULQDQ $0x00, T0, ACCM
PXOR ACC0, ACCM
PXOR ACC1, ACCM
MOVOU ACCM, T0
PSRLDQ $8, ACCM
PSLLDQ $8, T0
PXOR ACCM, ACC1
PXOR T0, ACC0
MOVOU POLY, T0
PCLMULQDQ $0x01, ACC0, T0
PSHUFD $78, ACC0, ACC0
PXOR T0, ACC0
MOVOU POLY, T0
PCLMULQDQ $0x01, ACC0, T0
PSHUFD $78, ACC0, ACC0
PXOR T0, ACC0
PXOR ACC1, ACC0
LEAQ 16(aut), aut
JMP dataSinglesLoop
dataEnd:
TESTQ autLen, autLen
JEQ dataBail
PXOR B0, B0
LEAQ -1(aut)(autLen*1), aut
dataLoadLoop:
PSLLDQ $1, B0
PINSRB $0, (aut), B0
LEAQ -1(aut), aut
DECQ autLen
JNE dataLoadLoop
JMP dataMul
dataBail:
MOVOU ACC0, (tPtr)
RET
#undef pTbl
#undef aut
#undef tPtr
#undef autLen

sm4/sm4_gcm.go (new file, 158 lines)

@@ -0,0 +1,158 @@
// +build amd64
package sm4
import (
"crypto/cipher"
"crypto/subtle"
)
// sm4CipherGCM implements crypto/cipher.gcmAble so that crypto/cipher.NewGCM
// will use the optimised implementation in this file when possible. Instances
// of this type only exist when supportsAES and supportsGFMUL are both true.
type sm4CipherGCM struct {
sm4CipherAsm
}
// Assert that sm4CipherGCM implements the gcmAble interface.
var _ gcmAble = (*sm4CipherGCM)(nil)
//go:noescape
func precomputeTableAsm(productTable *[256]byte, src *[16]byte)
//go:noescape
func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)
//go:noescape
func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
type gcmAsm struct {
gcm
bytesProductTable [256]byte
}
// NewGCM returns the SM4 cipher wrapped in Galois Counter Mode. This is only
// called by crypto/cipher.NewGCM via the gcmAble interface.
func (c *sm4CipherGCM) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
g := &gcmAsm{}
g.cipher = &c.sm4CipherAsm
g.nonceSize = nonceSize
g.tagSize = tagSize
var key [gcmBlockSize]byte
c.Encrypt(key[:], key[:])
precomputeTableAsm(&g.bytesProductTable, &key)
return g, nil
}
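None of this is called directly; the fast path is picked up through crypto/cipher, whose NewGCM structurally matches any block cipher exposing a NewGCM method. A usage sketch (the module import path and the exported NewCipher constructor are assumptions for illustration):

package main

import (
	"crypto/cipher"
	"crypto/rand"
	"fmt"

	"github.com/emmansun/gmsm/sm4" // assumed module path
)

func main() {
	key := make([]byte, 16)
	rand.Read(key)

	block, _ := sm4.NewCipher(key)  // assumed exported wrapper around newCipher
	aead, _ := cipher.NewGCM(block) // finds NewGCM via the gcmAble interface

	nonce := make([]byte, 12)
	rand.Read(nonce)
	ct := aead.Seal(nil, nonce, []byte("hello sm4-gcm"), nil)
	pt, _ := aead.Open(nil, nonce, ct, nil)
	fmt.Println(string(pt))
}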
func (g *gcmAsm) NonceSize() int {
return g.nonceSize
}
func (g *gcmAsm) Overhead() int {
return g.tagSize
}
// Seal encrypts and authenticates plaintext. See the cipher.AEAD interface for
// details.
func (g *gcmAsm) Seal(dst, nonce, plaintext, data []byte) []byte {
if len(nonce) != g.nonceSize {
panic("crypto/cipher: incorrect nonce length given to GCM")
}
if uint64(len(plaintext)) > ((1<<32)-2)*BlockSize {
panic("crypto/cipher: message too large for GCM")
}
var counter, tagMask [gcmBlockSize]byte
if len(nonce) == gcmStandardNonceSize {
// Init counter to nonce||1
copy(counter[:], nonce)
counter[gcmBlockSize-1] = 1
} else {
// Otherwise counter = GHASH(nonce)
gcmSm4Data(&g.bytesProductTable, nonce, &counter)
gcmSm4Finish(&g.bytesProductTable, &tagMask, &counter, uint64(len(nonce)), uint64(0))
}
g.cipher.Encrypt(tagMask[:], counter[:])
gcmInc32(&counter)
var tagOut [gcmTagSize]byte
gcmSm4Data(&g.bytesProductTable, data, &tagOut)
ret, out := sliceForAppend(dst, len(plaintext)+g.tagSize)
if InexactOverlap(out[:len(plaintext)], plaintext) {
panic("crypto/cipher: invalid buffer overlap")
}
if len(plaintext) > 0 {
g.counterCrypt(out, plaintext, &counter)
gcmSm4Data(&g.bytesProductTable, out[:len(plaintext)], &tagOut)
}
gcmSm4Finish(&g.bytesProductTable, &tagMask, &tagOut, uint64(len(plaintext)), uint64(len(data)))
copy(out[len(plaintext):], tagOut[:])
return ret
}
// Open authenticates and decrypts ciphertext. See the cipher.AEAD interface
// for details.
func (g *gcmAsm) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) {
if len(nonce) != g.nonceSize {
panic("crypto/cipher: incorrect nonce length given to GCM")
}
// Sanity check to prevent the authentication from always succeeding if an implementation
// leaves tagSize uninitialized, for example.
if g.tagSize < gcmMinimumTagSize {
panic("crypto/cipher: incorrect GCM tag size")
}
if len(ciphertext) < g.tagSize {
return nil, errOpen
}
if uint64(len(ciphertext)) > ((1<<32)-2)*uint64(BlockSize)+uint64(g.tagSize) {
return nil, errOpen
}
tag := ciphertext[len(ciphertext)-g.tagSize:]
ciphertext = ciphertext[:len(ciphertext)-g.tagSize]
// See GCM spec, section 7.1.
var counter, tagMask [gcmBlockSize]byte
if len(nonce) == gcmStandardNonceSize {
// Init counter to nonce||1
copy(counter[:], nonce)
counter[gcmBlockSize-1] = 1
} else {
// Otherwise counter = GHASH(nonce)
gcmSm4Data(&g.bytesProductTable, nonce, &counter)
gcmSm4Finish(&g.bytesProductTable, &tagMask, &counter, uint64(len(nonce)), uint64(0))
}
g.cipher.Encrypt(tagMask[:], counter[:])
gcmInc32(&counter)
var expectedTag [gcmTagSize]byte
gcmSm4Data(&g.bytesProductTable, data, &expectedTag)
ret, out := sliceForAppend(dst, len(ciphertext))
if InexactOverlap(out, ciphertext) {
panic("crypto/cipher: invalid buffer overlap")
}
if len(ciphertext) > 0 {
gcmSm4Data(&g.bytesProductTable, ciphertext, &expectedTag)
}
gcmSm4Finish(&g.bytesProductTable, &tagMask, &expectedTag, uint64(len(ciphertext)), uint64(len(data)))
if subtle.ConstantTimeCompare(expectedTag[:g.tagSize], tag) != 1 {
for i := range out {
out[i] = 0
}
return nil, errOpen
}
g.counterCrypt(out, ciphertext, &counter)
return ret, nil
}