13
GCM for SM4
Sun Yimin edited this page 2023-06-27 11:38:18 +08:00
This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

为sm4实现的GCM汇编代码是从AES GCM实现中摘抄的。主要为以下三个函数

// precomputeTableAsm builds the 256-byte GHASH product table from src.
// NOTE(review): unlike gcmAesInit, there is no encryption step here —
// src is expected to already be the encrypted hash key (see notes below).
	//go:noescape
func precomputeTableAsm(productTable *[256]byte, src *[16]byte)

// gcmSm4Data folds data into the GHASH accumulator T using productTable.
// NOTE(review): per the notes below, T is both input and output here,
// whereas gcmAesData treats it as output only.
//go:noescape
func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)

// gcmSm4Finish finalizes the GHASH tag in T from the plaintext/AAD bit
// lengths and XORs in tagMask; stated below to be identical to gcmAesFinish.
//go:noescape
func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
  1. gcmSm4Finish没有改变，和gcmAesFinish一模一样。
  2. precomputeTableAsm和gcmAesInit的区别在于：前者没有加密部分，输入参数就是加密结果。
  3. gcmSm4Data和gcmAesData的差别在于：前者那个T参数同时作为输入输出，而后者只作为输出。

加密和GHASH结合ASM优化

主要困难

  1. 底层的CTR加密数据不是block对齐的，更不是4、8 blocks对齐的，所以尾部加密及异或运算处理比较麻烦。
  2. AMD64需要同时支持AVX2和非AVX2两套代码，代码量比较大、比较复杂。
  3. 需要和GHASH混合处理以提高性能。计划先把下面方法转成ASM：
// counterCrypt crypts in to out using g.cipher in counter mode.
// The counter is incremented (32-bit wrap, via gcmInc32) once per
// gcmBlockSize block of keystream generated.
func (g *gcm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte) {
	keystream := make([]byte, g.cipher.blocksSize)
	ctrBuf := make([]byte, g.cipher.blocksSize)

	// Process full multi-block batches of batchBlocks counters at a time.
	for len(in) >= g.cipher.blocksSize {
		for j := 0; j < g.cipher.batchBlocks; j++ {
			copy(ctrBuf[j*gcmBlockSize:(j+1)*gcmBlockSize], counter[:])
			gcmInc32(counter)
		}
		g.cipher.EncryptBlocks(keystream, ctrBuf)
		xor.XorWords(out, in, keystream[:])
		out, in = out[g.cipher.blocksSize:], in[g.cipher.blocksSize:]
	}

	// Partial tail: generate keystream for the rounded-up number of
	// blocks, then XOR only as many bytes as remain.
	if remaining := len(in); remaining > 0 {
		n := (remaining + gcmBlockSize - 1) / gcmBlockSize
		for j := 0; j < n; j++ {
			copy(ctrBuf[j*gcmBlockSize:], counter[:])
			gcmInc32(counter)
		}
		g.cipher.EncryptBlocks(keystream, ctrBuf)
		xor.XorBytes(out, in, keystream[:n*gcmBlockSize])
	}
}

最后再处理和GHASH混合处理。

2022年1月14日

gcmSm4Init方法已完成AMD64和ARM64开发；为了支持golang 1.15.x，不得不放弃使用VMOVQ指令。

gcmSm4Enc方法初步完成AMD64架构非AVX(2)版本开发,正进行更多测试和优化。

2022年1月18日

gcmSm4Enc、gcmSm4Dec已完成AMD64架构下非AVX(2)版本及AVX(2)版本（代码有点臃肿），ARM64版本也已完成。优化的方向为矩阵行列转换。

ARM64矩阵转换

// transpose_4x4 transposes a 4x4 matrix of 32-bit words held in the
// four NEON registers s0..s3, using the zip1/zip2 lane-interleave
// instructions. On exit s_i holds what was column i of the input rows
// (see the per-line lane comments). RTMP0-RTMP3 are used as scratch
// and clobbered.
// Lanes listed from the high end to the low end:
// s0 = s0.S3, s0.S2, s0.S1, s0.S0
// s1 = s1.S3, s1.S2, s1.S1, s1.S0
// s2 = s2.S3, s2.S2, s2.S1, s2.S0
// s3 = s3.S3, s3.S2, s3.S1, s3.S0
#define transpose_4x4(s0, s1, s2, s3)   \
        zip1 RTMP0.4s, s0.4s, s1.4s;    \ // RTMP0 = s1.S1, s0.S1, s1.S0, s0.S0
        zip1 RTMP1.4s, s2.4s, s3.4s;    \ // RTMP1 = s3.S1, s2.S1, s3.S0, s2.S0
        zip2 RTMP2.4s, s0.4s, s1.4s;    \ // RTMP2 = s1.S3, s0.S3, s1.S2, s0.S2
        zip2 RTMP3.4s, s2.4s, s3.4s;    \ // RTMP3 = s3.S3, s2.S3, s3.S2, s2.S2
        zip1 s0.2d, RTMP0.2d, RTMP1.2d; \ // s0 = s3.S0, s2.S0, s1.S0, s0.S0
        zip2 s1.2d, RTMP0.2d, RTMP1.2d; \ // s1 = s3.S1, s2.S1, s1.S1, s0.S1
        zip1 s2.2d, RTMP2.2d, RTMP3.2d; \ // s2 = s3.S2, s2.S2, s1.S2, s0.S2
        zip2 s3.2d, RTMP2.2d, RTMP3.2d;   // s3 = s3.S3, s2.S3, s1.S3, s0.S3

// rotate_clockwise_90 rotates the 4x4 matrix of 32-bit words in
// s0..s3 clockwise by 90 degrees: per the lane comments below, s_i
// ends up holding column i with the source rows in reversed order
// (compare with transpose_4x4 above). RTMP0-RTMP3 are clobbered.
#define rotate_clockwise_90(s0, s1, s2, s3) \
        zip1 RTMP0.4s, s1.4s, s0.4s;        \ // RTMP0 = s0.S1, s1.S1, s0.S0, s1.S0
        zip2 RTMP1.4s, s1.4s, s0.4s;        \ // RTMP1 = s0.S3, s1.S3, s0.S2, s1.S2
        zip1 RTMP2.4s, s3.4s, s2.4s;        \ // RTMP2 = s2.S1, s3.S1, s2.S0, s3.S0
        zip2 RTMP3.4s, s3.4s, s2.4s;        \ // RTMP3 = s2.S3, s3.S3, s2.S2, s3.S2
        zip1 s0.2d, RTMP2.2d, RTMP0.2d;     \ // s0 = s0.S0, s1.S0, s2.S0, s3.S0
        zip2 s1.2d, RTMP2.2d, RTMP0.2d;     \ // s1 = s0.S1, s1.S1, s2.S1, s3.S1
        zip1 s2.2d, RTMP3.2d, RTMP1.2d;     \ // s2 = s0.S2, s1.S2, s2.S2, s3.S2
        zip2 s3.2d, RTMP3.2d, RTMP1.2d;	      // s3 = s0.S3, s1.S3, s2.S3, s3.S3

Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode