sm4: add env var FORCE_SM4BLOCK_AESNI & give up cbc asm implementation

This commit is contained in:
Sun Yimin 2024-01-25 17:46:09 +08:00 committed by GitHub
parent 29b6da1d37
commit a01428eaf3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 35 additions and 159 deletions

View File

@ -238,13 +238,16 @@ type AEAD interface {
## 性能 ## 性能
SM4分组密码算法的软件高效实现不算CPU指令支持的话已知有如下几种方法 SM4分组密码算法的软件高效实现不算CPU指令支持的话已知有如下几种方法
* S盒和L转换预计算 * S盒和L转换预计算本软件库纯Go语言实现采用该方法
* SIMD并行处理并行查表 * SIMD并行处理并行查表
* SIMD并行处理借助CPU的AES指令本软件库采用该方法 * SIMD并行处理借助CPU的AES指令本软件库采用该方法
* SIMD并行处理位切片(bitslicing)[参考实现](https://github.com/emmansun/sm4bs) * SIMD并行处理位切片(bitslicing)[参考实现](https://github.com/emmansun/sm4bs)
当然这些与有CPU指令支持的AES算法相比性能差距依然偏大要是工作模式不支持并行差距就更巨大了。 当然这些与有CPU指令支持的AES算法相比性能差距依然偏大要是工作模式不支持并行差距就更巨大了。
### 混合方式
从**v0.25.0**开始，AMD64/ARM64 支持AES-NI的CPU架构下，**默认会使用混合方式**，即```cipher.Block```的方法会用纯Go语言实现，而对于可以并行的加解密模式，则还是会尽量采用AES-NI和SIMD并行处理。您可以通过环境变量```FORCE_SM4BLOCK_AESNI=1```来强制都使用AES-NI实现（和v0.25.0之前版本的行为一样）。请参考[SM4: 单block的性能问题](https://github.com/emmansun/gmsm/discussions/172)。
## 与KMS集成 ## 与KMS集成
可能您会说如果我在KMS中创建了一个SM4对称密钥就不需要本地加解密了这话很对不过有种场景会用到 可能您会说如果我在KMS中创建了一个SM4对称密钥就不需要本地加解密了这话很对不过有种场景会用到
* 在KMS中只创建非对称密钥KEK * 在KMS中只创建非对称密钥KEK

View File

@ -1,77 +1,8 @@
//go:build amd64 && !purego //go:build amd64 && !purego
#include "textflag.h" #include "textflag.h"
// Register aliases for the single-block SM4 code in encryptBlocksChain.
// t0..t3 carry the four 32-bit state words of the block being processed.
// x, y and XTMP6 are passed to SM4_SINGLE_ROUND; presumably they are
// temporaries for that macro — confirm against aesni_macros_amd64.s.
#define x X0
#define y X1
#define t0 X2
#define t1 X3
#define t2 X4
#define t3 X5
#define XTMP6 X6
#include "aesni_macros_amd64.s" #include "aesni_macros_amd64.s"
// func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
//
// Chained (CBC-style) encryption of src into dst, one 16-byte block at a time.
// Each input block is XORed with the running chaining value before the round
// loop; the resulting output block becomes the chaining value for the next
// block. The chaining value is loaded from iv on entry and the final one is
// stored back to iv on return. Trailing bytes of src that do not fill a whole
// 16-byte block are ignored.
//
// NOTE(review): only src's length is read; dst's length is never checked
// here, so the Go caller presumably guarantees len(dst) >= len(src) — confirm.
TEXT ·encryptBlocksChain(SB),NOSPLIT,$0
#define ctx BX
#define ptx DX
#define ptxLen DI
// Load arguments: AX = xk, ctx = &dst[0], ptx = &src[0], ptxLen = len(src),
// SI = iv.
MOVQ xk+0(FP), AX
MOVQ dst+8(FP), ctx
MOVQ src+32(FP), ptx
MOVQ src_len+40(FP), ptxLen
MOVQ iv+56(FP), SI
// t0 starts out as the 16-byte chaining value read from iv.
MOVOU (SI), t0
loopSrc:
// Stop once fewer than 16 bytes of input remain (unsigned compare).
CMPQ ptxLen, $16
JB done_sm4
SUBQ $16, ptxLen
// t0 = chaining value XOR current input block.
MOVOU (ptx), t1
PXOR t1, t0
// Byte shuffle through flip_mask (defined elsewhere; presumably a per-word
// byte swap so the state words are in the order the rounds expect — confirm).
PSHUFB flip_mask<>(SB), t0
// Spread the state: lane 0 of t1/t2/t3 receives dword 1/2/3 of t0, so each of
// t0..t3 has one state word in its low lane.
PSHUFD $1, t0, t1
PSHUFD $2, t0, t2
PSHUFD $3, t0, t3
// CX is the byte offset handed to the round macro; it starts at 0.
XORL CX, CX
loopRound:
// Four rounds per iteration with the state registers rotated by one position
// each time. SM4_SINGLE_ROUND is not defined in this view (presumably in the
// included aesni_macros_amd64.s); its first argument looks like the round
// index within the group, with AX/CX addressing the round keys — confirm.
SM4_SINGLE_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3)
SM4_SINGLE_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0)
SM4_SINGLE_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1)
SM4_SINGLE_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2)
// Advance by 16 bytes (four 32-bit entries) until 4*32 = 128 bytes have been
// covered: 8 iterations, 32 rounds in total.
ADDL $16, CX
CMPL CX, $4*32
JB loopRound
// Gather the low dword of t3, t2, t1, t0 into lanes 0..3 of t0, i.e. the four
// state words in reverse order.
PALIGNR $4, t3, t3
PALIGNR $4, t3, t2
PALIGNR $4, t2, t1
PALIGNR $4, t1, t0
// Undo the byte shuffle and emit the output block. t0 is left untouched
// afterwards, so it doubles as the chaining value for the next iteration.
PSHUFB flip_mask<>(SB), t0
MOVOU t0, (ctx)
// Step both pointers to the next 16-byte block.
LEAQ 16(ptx), ptx
LEAQ 16(ctx), ctx
JMP loopSrc
done_sm4:
// Write the final chaining value back to iv: the last output block, or the
// original iv unchanged when no full block was processed.
MOVOU t0, (SI)
RET
#undef ctx
#undef ptx
#undef ptxLen
#define XDWTMP0 Y0 #define XDWTMP0 Y0
#define XDWTMP1 Y1 #define XDWTMP1 Y1

View File

@ -8,6 +8,14 @@
#define t1 V3 #define t1 V3
#define t2 V4 #define t2 V4
#define t3 V5 #define t3 V5
#define XTMP6 V6
#define XTMP7 V7
#define t4 V10
#define t5 V11
#define t6 V12
#define t7 V13
#define IV V18
#define ZERO V16 #define ZERO V16
#define NIBBLE_MASK V20 #define NIBBLE_MASK V20
#define INVERSE_SHIFT_ROWS V21 #define INVERSE_SHIFT_ROWS V21
@ -17,86 +25,14 @@
#define M2H V25 #define M2H V25
#define R08_MASK V26 #define R08_MASK V26
#define FK_MASK V27 #define FK_MASK V27
#define XTMP6 V6
#define IV V7
#include "aesni_macros_arm64.s" #include "aesni_macros_arm64.s"
// func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
//
// Chained (CBC-style) encryption of src into dst, one 16-byte block at a time.
// Each input block is XORed with the chaining value held in IV before the
// round loop, and each output block replaces IV for the next block. IV is
// loaded from iv on entry and stored back to iv on return. Trailing bytes of
// src that do not fill a whole 16-byte block are ignored.
//
// NOTE(review): only src's length is read; dst's length is never checked
// here, so the Go caller presumably guarantees len(dst) >= len(src) — confirm.
TEXT ·encryptBlocksChain(SB),NOSPLIT,$0
#define ctx R1
#define ptx R3
#define ptxLen R4
#define rkSave R8
// Macro defined outside this view; presumably loads the constant vectors
// (masks/matrices) used by the round macro — confirm.
LOAD_SM4_AESNI_CONSTS()
// Load arguments: rkSave = xk, ctx = &dst[0], ptx = &src[0],
// ptxLen = len(src), R5 = iv.
MOVD xk+0(FP), rkSave
MOVD dst+8(FP), ctx
MOVD src+32(FP), ptx
MOVD src_len+40(FP), ptxLen
MOVD iv+56(FP), R5
// Clear ZERO and load the initial 16-byte chaining value into IV.
VEOR ZERO.B16, ZERO.B16, ZERO.B16
VLD1 (R5), [IV.B16]
loopSrc:
// Stop once fewer than 16 bytes of input remain.
CMP $16, ptxLen
BLT done_sm4
SUB $16, ptxLen
// Load the next input block (post-indexed form; presumably advances ptx past
// the block — confirm) and XOR it with the chaining value.
VLD1.P (ptx), [t0.S4]
VEOR IV.B16, t0.B16, t0.B16
// Reverse the bytes inside each 32-bit word.
VREV32 t0.B16, t0.B16
// Spread the state: lane 0 of t1/t2/t3 receives word 1/2/3 of t0.
VMOV t0.S[1], t1.S[0]
VMOV t0.S[2], t2.S[0]
VMOV t0.S[3], t3.S[0]
// R2 = 0 is the loop counter; R0 restarts from the saved round-key pointer
// for every block.
EOR R2, R2
MOVD rkSave, R0
encryptBlockLoop:
// Four rounds per iteration with the state registers rotated by one position
// each time. SM4_ROUND is not defined in this view (presumably in the included
// aesni_macros_arm64.s); R0 looks like the round-key cursor and R19 a scratch
// register — confirm.
SM4_ROUND(R0, R19, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(R0, R19, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(R0, R19, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(R0, R19, x, y, XTMP6, t3, t0, t1, t2)
// Count 16 per iteration up to 128: 8 iterations, 32 rounds in total.
ADD $16, R2
CMP $128, R2
BNE encryptBlockLoop
// Gather lane 0 of t2, t1, t0 into lanes 1..3 of t3, so t3 holds the four
// state words in reverse order (t3, t2, t1, t0).
VMOV t2.S[0], t3.S[1]
VMOV t1.S[0], t3.S[2]
VMOV t0.S[0], t3.S[3]
// Restore byte order within each word and emit the output block
// (post-indexed store; presumably advances ctx — confirm).
VREV32 t3.B16, t3.B16
VST1.P [t3.B16], (ctx)
// The output block becomes the chaining value for the next block.
VMOV t3.B16, IV.B16
B loopSrc
done_sm4:
// Write the final chaining value back to iv: the last output block, or the
// original iv unchanged when no full block was processed.
VST1 [IV.B16], (R5)
RET
#undef ctx
#undef ptx
#undef ptxLen
#undef rkSave
#undef IV
#define XTMP7 V7
#define t4 V10
#define t5 V11
#define t6 V12
#define t7 V13
#define dstPtr R1 #define dstPtr R1
#define srcPtr R2 #define srcPtr R2
#define rk R3 #define rk R3
#define rkSave R4 #define rkSave R4
#define srcPtrLen R5 #define srcPtrLen R5
#define IV V18
// func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte) // func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
TEXT ·decryptBlocksChain(SB),NOSPLIT,$0 TEXT ·decryptBlocksChain(SB),NOSPLIT,$0

View File

@ -42,9 +42,6 @@ func (b *sm4CipherAsm) NewCBCDecrypter(iv []byte) cipher.BlockMode {
func (x *cbc) BlockSize() int { return BlockSize } func (x *cbc) BlockSize() int { return BlockSize }
//go:noescape
func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
//go:noescape //go:noescape
func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte) func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)

View File

@ -16,6 +16,7 @@ var supportsAES = cpuid.HasAES
var supportsGFMUL = cpuid.HasGFMUL var supportsGFMUL = cpuid.HasGFMUL
var useAVX2 = cpu.X86.HasAVX2 var useAVX2 = cpu.X86.HasAVX2
var useAVX = cpu.X86.HasAVX var useAVX = cpu.X86.HasAVX
var useAESNI4SingleBlock = os.Getenv("FORCE_SM4BLOCK_AESNI") == "1"
const ( const (
INST_AES int = iota INST_AES int = iota
@ -70,7 +71,28 @@ func (c *sm4CipherAsm) Encrypt(dst, src []byte) {
if alias.InexactOverlap(dst[:BlockSize], src[:BlockSize]) { if alias.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
panic("sm4: invalid buffer overlap") panic("sm4: invalid buffer overlap")
} }
if useAESNI4SingleBlock {
encryptBlockAsm(&c.enc[0], &dst[0], &src[0], INST_AES)
} else {
encryptBlockGo(c.enc, dst, src) encryptBlockGo(c.enc, dst, src)
}
}
func (c *sm4CipherAsm) Decrypt(dst, src []byte) {
if len(src) < BlockSize {
panic("sm4: input not full block")
}
if len(dst) < BlockSize {
panic("sm4: output not full block")
}
if alias.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
panic("sm4: invalid buffer overlap")
}
if useAESNI4SingleBlock {
encryptBlockAsm(&c.dec[0], &dst[0], &src[0], INST_AES)
} else {
decryptBlockGo(c.dec, dst, src)
}
} }
func (c *sm4CipherAsm) EncryptBlocks(dst, src []byte) { func (c *sm4CipherAsm) EncryptBlocks(dst, src []byte) {
@ -86,19 +108,6 @@ func (c *sm4CipherAsm) EncryptBlocks(dst, src []byte) {
encryptBlocksAsm(&c.enc[0], dst, src, INST_AES) encryptBlocksAsm(&c.enc[0], dst, src, INST_AES)
} }
// Decrypt processes a single block: it validates the buffers and then hands
// c.dec, dst and src to decryptBlockGo.
//
// It panics if src or dst is shorter than BlockSize, or if the first
// BlockSize bytes of dst and src overlap inexactly. The checks run in that
// order, so a short src is reported before a short dst.
func (c *sm4CipherAsm) Decrypt(dst, src []byte) {
	switch {
	case len(src) < BlockSize:
		panic("sm4: input not full block")
	case len(dst) < BlockSize:
		panic("sm4: output not full block")
	case alias.InexactOverlap(dst[:BlockSize], src[:BlockSize]):
		panic("sm4: invalid buffer overlap")
	}

	decryptBlockGo(c.dec, dst, src)
}
func (c *sm4CipherAsm) DecryptBlocks(dst, src []byte) { func (c *sm4CipherAsm) DecryptBlocks(dst, src []byte) {
if len(src) < c.blocksSize { if len(src) < c.blocksSize {
panic("sm4: input not full blocks") panic("sm4: input not full blocks")