From a01428eaf3df5193845ec931a70150c018dc0fd0 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Thu, 25 Jan 2024 17:46:09 +0800 Subject: [PATCH] sm4: add env var FORCE_SM4BLOCK_AESNI & give up cbc asm implementation --- docs/sm4.md | 5 ++- sm4/cbc_amd64.s | 69 ------------------------------------- sm4/cbc_arm64.s | 80 +++++-------------------------------------- sm4/cbc_cipher_asm.go | 3 -- sm4/cipher_asm.go | 37 ++++++++++++-------- 5 files changed, 35 insertions(+), 159 deletions(-) diff --git a/docs/sm4.md b/docs/sm4.md index d9be360..e23f810 100644 --- a/docs/sm4.md +++ b/docs/sm4.md @@ -238,13 +238,16 @@ type AEAD interface { ## 性能 SM4分组密码算法的软件高效实现,不算CPU指令支持的话,已知有如下几种方法: -* S盒和L转换预计算 +* S盒和L转换预计算,本软件库纯Go语言实现采用该方法 * SIMD并行处理:并行查表 * SIMD并行处理:借助CPU的AES指令,本软件库采用该方法 * SIMD并行处理:位切片(bitslicing),[参考实现](https://github.com/emmansun/sm4bs) 当然,这些与有CPU指令支持的AES算法相比,性能差距依然偏大,要是工作模式不支持并行,差距就更巨大了。 +### 混合方式 +从**v0.25.0**开始,AMD64/ARM64 支持AES-NI的CPU架构下,**默认会使用混合方式**,即```cipher.Block```的方法会用纯Go语言实现,而对于可以并行的加解密模式,则还是会尽量采用AES-NI和SIMD并行处理。您可以通过环境变量```FORCE_SM4BLOCK_AESNI=1```来强制都使用AES-NI实现(和v0.25.0之前版本的行为一样)。请参考[SM4: 单block的性能问题](https://github.com/emmansun/gmsm/discussions/172)。 + ## 与KMS集成 可能您会说,如果我在KMS中创建了一个SM4对称密钥,就不需要本地加解密了,这话很对,不过有种场景会用到: * 在KMS中只创建非对称密钥(KEK); diff --git a/sm4/cbc_amd64.s b/sm4/cbc_amd64.s index 3dad037..6d8f760 100644 --- a/sm4/cbc_amd64.s +++ b/sm4/cbc_amd64.s @@ -1,77 +1,8 @@ //go:build amd64 && !purego #include "textflag.h" - -#define x X0 -#define y X1 -#define t0 X2 -#define t1 X3 -#define t2 X4 -#define t3 X5 - -#define XTMP6 X6 - #include "aesni_macros_amd64.s" -// func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte) -TEXT ·encryptBlocksChain(SB),NOSPLIT,$0 -#define ctx BX -#define ptx DX -#define ptxLen DI - - MOVQ xk+0(FP), AX - MOVQ dst+8(FP), ctx - MOVQ src+32(FP), ptx - MOVQ src_len+40(FP), ptxLen - MOVQ iv+56(FP), SI - - MOVOU (SI), t0 - -loopSrc: - CMPQ ptxLen, $16 - JB done_sm4 - SUBQ $16, ptxLen - - MOVOU (ptx), t1 - PXOR t1, t0 - - PSHUFB flip_mask<>(SB), t0 - PSHUFD $1, t0, t1 - PSHUFD $2, t0, t2 - PSHUFD $3, t0, t3 - - XORL CX, CX - -loopRound: - SM4_SINGLE_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3) - SM4_SINGLE_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0) - SM4_SINGLE_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1) - SM4_SINGLE_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2) - - ADDL $16, CX - CMPL CX, $4*32 - JB loopRound - - PALIGNR $4, t3, t3 - PALIGNR $4, t3, t2 - PALIGNR $4, t2, t1 - PALIGNR $4, t1, t0 - PSHUFB flip_mask<>(SB), t0 - - MOVOU t0, (ctx) - - LEAQ 16(ptx), ptx - LEAQ 16(ctx), ctx - - JMP loopSrc - -done_sm4: - MOVOU t0, (SI) - RET - -#undef ctx -#undef ptx -#undef ptxLen #define XDWTMP0 Y0 #define XDWTMP1 Y1 diff --git a/sm4/cbc_arm64.s b/sm4/cbc_arm64.s index 948073c..46b727e 100644 --- a/sm4/cbc_arm64.s +++ b/sm4/cbc_arm64.s @@ -8,6 +8,14 @@ #define t1 V3 #define t2 V4 #define t3 V5 +#define XTMP6 V6 +#define XTMP7 V7 +#define t4 V10 +#define t5 V11 +#define t6 V12 +#define t7 V13 +#define IV V18 + #define ZERO V16 #define NIBBLE_MASK V20 #define INVERSE_SHIFT_ROWS V21 @@ -17,86 +25,14 @@ #define M2H V25 #define R08_MASK V26 #define FK_MASK V27 -#define XTMP6 V6 -#define IV V7 #include "aesni_macros_arm64.s" -// func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte) -TEXT ·encryptBlocksChain(SB),NOSPLIT,$0 -#define ctx R1 -#define ptx R3 -#define ptxLen R4 -#define rkSave R8 - - LOAD_SM4_AESNI_CONSTS() - - MOVD xk+0(FP), rkSave - MOVD dst+8(FP), ctx - MOVD src+32(FP), ptx - MOVD src_len+40(FP), ptxLen - MOVD iv+56(FP), R5 - - VEOR ZERO.B16, ZERO.B16, ZERO.B16 - VLD1 (R5), [IV.B16] - -loopSrc: - CMP $16, ptxLen - BLT done_sm4 - SUB $16, ptxLen - - VLD1.P (ptx), [t0.S4] - VEOR IV.B16, t0.B16, t0.B16 - VREV32 t0.B16, t0.B16 - VMOV t0.S[1], t1.S[0] - VMOV t0.S[2], t2.S[0] - VMOV t0.S[3], t3.S[0] - - EOR R2, R2 - MOVD rkSave, R0 - -encryptBlockLoop: - SM4_ROUND(R0, R19, x, y, XTMP6, t0, t1, t2, t3) - SM4_ROUND(R0, R19, x, y, XTMP6, t1, t2, t3, t0) - SM4_ROUND(R0, R19, x, y, XTMP6, t2, t3, t0, t1) - SM4_ROUND(R0, R19, x, y, XTMP6, t3, t0, t1, t2) - - ADD $16, R2 - CMP $128, R2 - BNE encryptBlockLoop - - VMOV t2.S[0], t3.S[1] - VMOV t1.S[0], t3.S[2] - VMOV t0.S[0], t3.S[3] - VREV32 t3.B16, t3.B16 - - VST1.P [t3.B16], (ctx) - VMOV t3.B16, IV.B16 - - B loopSrc - -done_sm4: - VST1 [IV.B16], (R5) - RET - -#undef ctx -#undef ptx -#undef ptxLen -#undef rkSave -#undef IV - -#define XTMP7 V7 -#define t4 V10 -#define t5 V11 -#define t6 V12 -#define t7 V13 - #define dstPtr R1 #define srcPtr R2 #define rk R3 #define rkSave R4 #define srcPtrLen R5 -#define IV V18 // func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte) TEXT ·decryptBlocksChain(SB),NOSPLIT,$0 diff --git a/sm4/cbc_cipher_asm.go b/sm4/cbc_cipher_asm.go index 4e05fb4..72a3559 100644 --- a/sm4/cbc_cipher_asm.go +++ b/sm4/cbc_cipher_asm.go @@ -42,9 +42,6 @@ func (b *sm4CipherAsm) NewCBCDecrypter(iv []byte) cipher.BlockMode { func (x *cbc) BlockSize() int { return BlockSize } -//go:noescape -func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte) - //go:noescape func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte) diff --git a/sm4/cipher_asm.go b/sm4/cipher_asm.go index 9b7bd01..f926575 100644 --- a/sm4/cipher_asm.go +++ b/sm4/cipher_asm.go @@ -16,6 +16,7 @@ var supportsAES = cpuid.HasAES var supportsGFMUL = cpuid.HasGFMUL var useAVX2 = cpu.X86.HasAVX2 var useAVX = cpu.X86.HasAVX +var useAESNI4SingleBlock = os.Getenv("FORCE_SM4BLOCK_AESNI") == "1" const ( INST_AES int = iota @@ -70,7 +71,28 @@ func (c *sm4CipherAsm) Encrypt(dst, src []byte) { if alias.InexactOverlap(dst[:BlockSize], src[:BlockSize]) { panic("sm4: invalid buffer overlap") } - encryptBlockGo(c.enc, dst, src) + if useAESNI4SingleBlock { + encryptBlockAsm(&c.enc[0], &dst[0], &src[0], INST_AES) + } else { + encryptBlockGo(c.enc, dst, src) + } +} + +func (c *sm4CipherAsm) Decrypt(dst, src []byte) { + if len(src) < BlockSize { + panic("sm4: input not full block") + } + if len(dst) < BlockSize { + panic("sm4: output not full block") + } + if alias.InexactOverlap(dst[:BlockSize], src[:BlockSize]) { + panic("sm4: invalid buffer overlap") + } + if useAESNI4SingleBlock { + encryptBlockAsm(&c.dec[0], &dst[0], &src[0], INST_AES) + } else { + decryptBlockGo(c.dec, dst, src) + } } func (c *sm4CipherAsm) EncryptBlocks(dst, src []byte) { @@ -86,19 +108,6 @@ func (c *sm4CipherAsm) EncryptBlocks(dst, src []byte) { encryptBlocksAsm(&c.enc[0], dst, src, INST_AES) } -func (c *sm4CipherAsm) Decrypt(dst, src []byte) { - if len(src) < BlockSize { - panic("sm4: input not full block") - } - if len(dst) < BlockSize { - panic("sm4: output not full block") - } - if alias.InexactOverlap(dst[:BlockSize], src[:BlockSize]) { - panic("sm4: invalid buffer overlap") - } - decryptBlockGo(c.dec, dst, src) -} - func (c *sm4CipherAsm) DecryptBlocks(dst, src []byte) { if len(src) < c.blocksSize { panic("sm4: input not full blocks")