mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-26 20:26:19 +08:00
sm4: add env var FORCE_SM4BLOCK_AESNI & give up cbc asm implementation
This commit is contained in:
parent
29b6da1d37
commit
a01428eaf3
@ -238,13 +238,16 @@ type AEAD interface {
|
||||
|
||||
## 性能
|
||||
SM4分组密码算法的软件高效实现,不算CPU指令支持的话,已知有如下几种方法:
|
||||
* S盒和L转换预计算
|
||||
* S盒和L转换预计算,本软件库纯Go语言实现采用该方法
|
||||
* SIMD并行处理:并行查表
|
||||
* SIMD并行处理:借助CPU的AES指令,本软件库采用该方法
|
||||
* SIMD并行处理:位切片(bitslicing),[参考实现](https://github.com/emmansun/sm4bs)
|
||||
|
||||
当然,与有CPU指令直接支持的AES算法相比,这些方法的性能差距依然偏大;如果工作模式不支持并行,差距就更大了。
|
||||
|
||||
### 混合方式
|
||||
从**v0.25.0**开始,在支持AES指令的AMD64/ARM64 CPU上,**默认会使用混合方式**:`cipher.Block`接口的方法(单分组加解密)采用纯Go语言实现,而对于可以并行处理的加解密模式,仍会尽量采用AES-NI和SIMD并行处理。您可以通过设置环境变量`FORCE_SM4BLOCK_AESNI=1`,强制单分组加解密也使用AES-NI实现(与v0.25.0之前版本的行为一致);该环境变量在包初始化时读取一次。详情请参考[SM4: 单block的性能问题](https://github.com/emmansun/gmsm/discussions/172)。
|
||||
|
||||
## 与KMS集成
|
||||
可能您会说,如果我在KMS中创建了一个SM4对称密钥,就不需要本地加解密了,这话很对,不过有种场景会用到:
|
||||
* 在KMS中只创建非对称密钥(KEK);
|
||||
|
@ -1,77 +1,8 @@
|
||||
//go:build amd64 && !purego
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// Register aliases for the XMM registers used by the routines in this file.
// x, y and XTMP6 are handed to the SM4 round macros alongside the state
// registers; presumably they serve as scratch space inside the macros
// (TODO confirm in aesni_macros_amd64.s).
#define x X0
|
||||
#define y X1
|
||||
// t0..t3 carry the four 32-bit words of the block being processed.
#define t0 X2
|
||||
#define t1 X3
|
||||
#define t2 X4
|
||||
#define t3 X5
|
||||
|
||||
#define XTMP6 X6
|
||||
|
||||
// The SM4_* macros invoked below are expected to come from this include.
#include "aesni_macros_amd64.s"
|
||||
|
||||
// encryptBlocksChain encrypts src into dst 16 bytes at a time, XORing each
// input block with the previous output block (the first block is XORed with
// the 16 bytes at iv). The last output block is written back to iv on return.
// Any trailing bytes of src beyond a multiple of 16 are left unprocessed.
//
// func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
|
||||
TEXT ·encryptBlocksChain(SB),NOSPLIT,$0
|
||||
// Local register names; they are #undef'd after the function body.
#define ctx BX
|
||||
#define ptx DX
|
||||
#define ptxLen DI
|
||||
|
||||
// Load arguments: AX = xk, ctx = dst base, ptx = src base,
// ptxLen = len(src), SI = iv.
MOVQ xk+0(FP), AX
|
||||
MOVQ dst+8(FP), ctx
|
||||
MOVQ src+32(FP), ptx
|
||||
MOVQ src_len+40(FP), ptxLen
|
||||
MOVQ iv+56(FP), SI
|
||||
|
||||
// t0 holds the chaining value; it starts as the 16 bytes at iv.
MOVOU (SI), t0
|
||||
|
||||
loopSrc:
|
||||
// Stop once fewer than 16 bytes of input remain (unsigned compare).
CMPQ ptxLen, $16
|
||||
JB done_sm4
|
||||
SUBQ $16, ptxLen
|
||||
|
||||
// XOR the next input block into the chaining value.
MOVOU (ptx), t1
|
||||
PXOR t1, t0
|
||||
|
||||
// Byte shuffle through flip_mask (defined outside this excerpt; presumably
// a per-word byte swap - TODO confirm), then copy words 1, 2 and 3 of t0
// into lane 0 of t1, t2 and t3 respectively.
PSHUFB flip_mask<>(SB), t0
|
||||
PSHUFD $1, t0, t1
|
||||
PSHUFD $2, t0, t2
|
||||
PSHUFD $3, t0, t3
|
||||
|
||||
// CX is the loop counter passed to the round macro together with AX (xk).
XORL CX, CX
|
||||
|
||||
loopRound:
|
||||
// Four round-macro invocations per iteration with the state registers
// rotated by one position each time. CX advances by 16 up to 4*32, so the
// loop body runs 8 times (32 invocations in total).
SM4_SINGLE_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3)
|
||||
SM4_SINGLE_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0)
|
||||
SM4_SINGLE_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1)
|
||||
SM4_SINGLE_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2)
|
||||
|
||||
ADDL $16, CX
|
||||
CMPL CX, $4*32
|
||||
JB loopRound
|
||||
|
||||
// Gather lane 0 of t3, t2, t1, t0 (in that order) into lanes 0..3 of t0,
// i.e. the four state words in reverse order.
PALIGNR $4, t3, t3
|
||||
PALIGNR $4, t3, t2
|
||||
PALIGNR $4, t2, t1
|
||||
PALIGNR $4, t1, t0
|
||||
PSHUFB flip_mask<>(SB), t0
|
||||
|
||||
// Emit the output block; t0 stays live as the next chaining value.
MOVOU t0, (ctx)
|
||||
|
||||
// Advance both pointers by one block.
LEAQ 16(ptx), ptx
|
||||
LEAQ 16(ctx), ctx
|
||||
|
||||
JMP loopSrc
|
||||
|
||||
done_sm4:
|
||||
// Write the final chaining value back to iv (the unchanged IV if no block
// was processed).
MOVOU t0, (SI)
|
||||
RET
|
||||
|
||||
#undef ctx
|
||||
#undef ptx
|
||||
#undef ptxLen
|
||||
|
||||
#define XDWTMP0 Y0
|
||||
#define XDWTMP1 Y1
|
||||
|
@ -8,6 +8,14 @@
|
||||
#define t1 V3
|
||||
#define t2 V4
|
||||
#define t3 V5
|
||||
#define XTMP6 V6
|
||||
#define XTMP7 V7
|
||||
#define t4 V10
|
||||
#define t5 V11
|
||||
#define t6 V12
|
||||
#define t7 V13
|
||||
#define IV V18
|
||||
|
||||
#define ZERO V16
|
||||
#define NIBBLE_MASK V20
|
||||
#define INVERSE_SHIFT_ROWS V21
|
||||
@ -17,86 +25,14 @@
|
||||
#define M2H V25
|
||||
#define R08_MASK V26
|
||||
#define FK_MASK V27
|
||||
#define XTMP6 V6
|
||||
#define IV V7
|
||||
|
||||
#include "aesni_macros_arm64.s"
|
||||
|
||||
// encryptBlocksChain encrypts src into dst 16 bytes at a time, XORing each
// input block with the previous output block (the first block is XORed with
// the 16 bytes at iv). The last output block is written back to iv on return.
// Any trailing bytes of src beyond a multiple of 16 are left unprocessed.
//
// func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
|
||||
TEXT ·encryptBlocksChain(SB),NOSPLIT,$0
|
||||
// Local register names; they are #undef'd after the function body.
#define ctx R1
|
||||
#define ptx R3
|
||||
#define ptxLen R4
|
||||
#define rkSave R8
|
||||
|
||||
// Macro defined outside this excerpt; presumably fills the constant vector
// registers used by SM4_ROUND (TODO confirm).
LOAD_SM4_AESNI_CONSTS()
|
||||
|
||||
// Load arguments: rkSave = xk, ctx = dst base, ptx = src base,
// ptxLen = len(src), R5 = iv.
MOVD xk+0(FP), rkSave
|
||||
MOVD dst+8(FP), ctx
|
||||
MOVD src+32(FP), ptx
|
||||
MOVD src_len+40(FP), ptxLen
|
||||
MOVD iv+56(FP), R5
|
||||
|
||||
// Clear ZERO and load the 16-byte chaining value from iv.
VEOR ZERO.B16, ZERO.B16, ZERO.B16
|
||||
VLD1 (R5), [IV.B16]
|
||||
|
||||
loopSrc:
|
||||
// Stop once fewer than 16 bytes of input remain.
CMP $16, ptxLen
|
||||
BLT done_sm4
|
||||
SUB $16, ptxLen
|
||||
|
||||
// Load the next input block (post-incrementing ptx), XOR it with the
// chaining value, reverse the bytes within each 32-bit word, and copy
// words 1, 2 and 3 into lane 0 of t1, t2 and t3.
VLD1.P (ptx), [t0.S4]
|
||||
VEOR IV.B16, t0.B16, t0.B16
|
||||
VREV32 t0.B16, t0.B16
|
||||
VMOV t0.S[1], t1.S[0]
|
||||
VMOV t0.S[2], t2.S[0]
|
||||
VMOV t0.S[3], t3.S[0]
|
||||
|
||||
// R2 counts loop progress; R0 is reset to the start of xk for every block
// (presumably SM4_ROUND advances R0 as it consumes keys - TODO confirm).
EOR R2, R2
|
||||
MOVD rkSave, R0
|
||||
|
||||
encryptBlockLoop:
|
||||
// Four round-macro invocations per iteration with the state registers
// rotated by one position each time. R2 advances by 16 until it equals
// 128, so the loop body runs 8 times (32 invocations in total).
SM4_ROUND(R0, R19, x, y, XTMP6, t0, t1, t2, t3)
|
||||
SM4_ROUND(R0, R19, x, y, XTMP6, t1, t2, t3, t0)
|
||||
SM4_ROUND(R0, R19, x, y, XTMP6, t2, t3, t0, t1)
|
||||
SM4_ROUND(R0, R19, x, y, XTMP6, t3, t0, t1, t2)
|
||||
|
||||
ADD $16, R2
|
||||
CMP $128, R2
|
||||
BNE encryptBlockLoop
|
||||
|
||||
// Assemble the output in t3: lanes 0..3 become lane 0 of t3, t2, t1, t0,
// i.e. the four state words in reverse order, then undo the byte reversal.
VMOV t2.S[0], t3.S[1]
|
||||
VMOV t1.S[0], t3.S[2]
|
||||
VMOV t0.S[0], t3.S[3]
|
||||
VREV32 t3.B16, t3.B16
|
||||
|
||||
// Store the output block (post-incrementing ctx) and keep it as the next
// chaining value.
VST1.P [t3.B16], (ctx)
|
||||
VMOV t3.B16, IV.B16
|
||||
|
||||
B loopSrc
|
||||
|
||||
done_sm4:
|
||||
// Write the final chaining value back to iv (the unchanged IV if no block
// was processed).
VST1 [IV.B16], (R5)
|
||||
RET
|
||||
|
||||
#undef ctx
|
||||
#undef ptx
|
||||
#undef ptxLen
|
||||
#undef rkSave
|
||||
#undef IV
|
||||
|
||||
#define XTMP7 V7
|
||||
#define t4 V10
|
||||
#define t5 V11
|
||||
#define t6 V12
|
||||
#define t7 V13
|
||||
|
||||
#define dstPtr R1
|
||||
#define srcPtr R2
|
||||
#define rk R3
|
||||
#define rkSave R4
|
||||
#define srcPtrLen R5
|
||||
#define IV V18
|
||||
|
||||
// func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
|
||||
TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
|
||||
|
@ -42,9 +42,6 @@ func (b *sm4CipherAsm) NewCBCDecrypter(iv []byte) cipher.BlockMode {
|
||||
|
||||
// BlockSize returns the package-level BlockSize constant.
func (x *cbc) BlockSize() int { return BlockSize }
|
||||
|
||||
// encryptBlocksChain is declared without a body; a TEXT ·encryptBlocksChain
// symbol in the assembly sources provides the implementation. That routine
// reads the chaining value from iv and writes the updated value back to it.
//
//go:noescape
|
||||
func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
|
||||
|
||||
// decryptBlocksChain is declared without a body; a TEXT ·decryptBlocksChain
// symbol in the assembly sources provides the implementation.
//
//go:noescape
|
||||
func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
|
||||
|
||||
|
@ -16,6 +16,7 @@ var supportsAES = cpuid.HasAES
|
||||
// supportsGFMUL mirrors cpuid.HasGFMUL.
var supportsGFMUL = cpuid.HasGFMUL
|
||||
// useAVX2 mirrors cpu.X86.HasAVX2.
var useAVX2 = cpu.X86.HasAVX2
|
||||
// useAVX mirrors cpu.X86.HasAVX.
var useAVX = cpu.X86.HasAVX
|
||||
// useAESNI4SingleBlock is true only when the environment variable
// FORCE_SM4BLOCK_AESNI is exactly "1". It is evaluated once, when this
// package-level variable is initialized, so later changes to the environment
// have no effect.
var useAESNI4SingleBlock = os.Getenv("FORCE_SM4BLOCK_AESNI") == "1"
|
||||
|
||||
const (
|
||||
INST_AES int = iota
|
||||
@ -70,8 +71,29 @@ func (c *sm4CipherAsm) Encrypt(dst, src []byte) {
|
||||
if alias.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
|
||||
panic("sm4: invalid buffer overlap")
|
||||
}
|
||||
if useAESNI4SingleBlock {
|
||||
encryptBlockAsm(&c.enc[0], &dst[0], &src[0], INST_AES)
|
||||
} else {
|
||||
encryptBlockGo(c.enc, dst, src)
|
||||
}
|
||||
}
|
||||
|
||||
// Decrypt decrypts the first BlockSize bytes of src into dst.
//
// It panics if src or dst is shorter than BlockSize, or if the first
// BlockSize bytes of dst and src overlap inexactly. When
// useAESNI4SingleBlock is set the assembly routine is used; otherwise the
// Go implementation decryptBlockGo is used.
func (c *sm4CipherAsm) Decrypt(dst, src []byte) {
|
||||
if len(src) < BlockSize {
|
||||
panic("sm4: input not full block")
|
||||
}
|
||||
if len(dst) < BlockSize {
|
||||
panic("sm4: output not full block")
|
||||
}
|
||||
if alias.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
|
||||
panic("sm4: invalid buffer overlap")
|
||||
}
|
||||
if useAESNI4SingleBlock {
|
||||
// The routine is named encryptBlockAsm but is driven here with the
// c.dec key schedule (presumably the decryption-ordered round keys -
// confirm where c.dec is built).
encryptBlockAsm(&c.dec[0], &dst[0], &src[0], INST_AES)
|
||||
} else {
|
||||
decryptBlockGo(c.dec, dst, src)
|
||||
}
|
||||
}
|
||||
|
||||
func (c *sm4CipherAsm) EncryptBlocks(dst, src []byte) {
|
||||
if len(src) < c.blocksSize {
|
||||
@ -86,19 +108,6 @@ func (c *sm4CipherAsm) EncryptBlocks(dst, src []byte) {
|
||||
encryptBlocksAsm(&c.enc[0], dst, src, INST_AES)
|
||||
}
|
||||
|
||||
// Decrypt decrypts the first BlockSize bytes of src into dst using the Go
// implementation decryptBlockGo with the c.dec key schedule.
//
// It panics if src or dst is shorter than BlockSize, or if the first
// BlockSize bytes of dst and src overlap inexactly.
func (c *sm4CipherAsm) Decrypt(dst, src []byte) {
|
||||
if len(src) < BlockSize {
|
||||
panic("sm4: input not full block")
|
||||
}
|
||||
if len(dst) < BlockSize {
|
||||
panic("sm4: output not full block")
|
||||
}
|
||||
if alias.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
|
||||
panic("sm4: invalid buffer overlap")
|
||||
}
|
||||
decryptBlockGo(c.dec, dst, src)
|
||||
}
|
||||
|
||||
func (c *sm4CipherAsm) DecryptBlocks(dst, src []byte) {
|
||||
if len(src) < c.blocksSize {
|
||||
panic("sm4: input not full blocks")
|
||||
|
Loading…
x
Reference in New Issue
Block a user