sm4: improve cbc encryption a little

Sun Yimin 2022-07-21 17:32:11 +08:00 committed by GitHub
parent 8ddf1bc68f
commit 42faebb588
4 changed files with 255 additions and 22 deletions

View File

@@ -319,17 +319,11 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
 	MOVQ dst+8(FP), BX
 	MOVQ src+16(FP), DX
-	PINSRD $0, 0(DX), t0
-	PSHUFB flip_mask<>(SB), t0
-	PINSRD $0, 4(DX), t1
-	PSHUFB flip_mask<>(SB), t1
-	PINSRD $0, 8(DX), t2
-	PSHUFB flip_mask<>(SB), t2
-	PINSRD $0, 12(DX), t3
-	PSHUFB flip_mask<>(SB), t3
+	MOVUPS (DX), t0
+	PSHUFB flip_mask<>(SB), t0
+	PSHUFD $1, t0, t1
+	PSHUFD $2, t0, t2
+	PSHUFD $3, t0, t3
 	XORL CX, CX
@@ -343,16 +337,14 @@ loop:
 	CMPL CX, $4*32
 	JB loop
-	PSHUFB flip_mask<>(SB), t3
-	PSHUFB flip_mask<>(SB), t2
-	PSHUFB flip_mask<>(SB), t1
-	PSHUFB flip_mask<>(SB), t0
-	MOVUPS t3, 0(BX)
-	PEXTRD $0, t2, R8
-	MOVL R8, 4(BX)
-	PEXTRD $0, t1, R8
-	MOVL R8, 8(BX)
-	PEXTRD $0, t0, R8
-	MOVL R8, 12(BX)
+	PEXTRD $0, t2, R8
+	PINSRD $1, R8, t3
+	PEXTRD $0, t1, R8
+	PINSRD $2, R8, t3
+	PEXTRD $0, t0, R8
+	PINSRD $3, R8, t3
+	PSHUFB flip_mask<>(SB), t3
+	MOVUPS t3, (BX)
 done_sm4:
 	RET
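
The rewrite above replaces four scalar PINSRD loads with a single 16-byte MOVUPS followed by PSHUFD lane extractions, and replaces the MOVL store sequence with PINSRD lane inserts plus one 16-byte store. A minimal Go sketch of what the new load/store sequences compute (the helper names here are hypothetical, and it assumes flip_mask is the usual per-word byte swap used elsewhere in this file):

	package main

	import (
		"encoding/binary"
		"fmt"
	)

	// loadWords mirrors MOVUPS + PSHUFB flip_mask + PSHUFD: one 16-byte
	// load, byte-swapped into four big-endian state words.
	func loadWords(src []byte) (t0, t1, t2, t3 uint32) {
		t0 = binary.BigEndian.Uint32(src[0:4])
		t1 = binary.BigEndian.Uint32(src[4:8])
		t2 = binary.BigEndian.Uint32(src[8:12])
		t3 = binary.BigEndian.Uint32(src[12:16])
		return
	}

	// storeWords mirrors PEXTRD/PINSRD + PSHUFB flip_mask + MOVUPS: the
	// output words (in reversed order, per SM4's final reverse transform)
	// are gathered into one register image and written with a single
	// 16-byte store.
	func storeWords(dst []byte, t0, t1, t2, t3 uint32) {
		binary.BigEndian.PutUint32(dst[0:4], t3)
		binary.BigEndian.PutUint32(dst[4:8], t2)
		binary.BigEndian.PutUint32(dst[8:12], t1)
		binary.BigEndian.PutUint32(dst[12:16], t0)
	}

	func main() {
		block := make([]byte, 16)
		t0, t1, t2, t3 := loadWords(block)
		storeWords(block, t0, t1, t2, t3)
		fmt.Printf("%x\n", block)
	}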

View File

@@ -12,16 +12,32 @@ import (
-// Assert that sm4CipherAsm implements the cbcDecAble interface.
+// Assert that sm4CipherAsm implements the cbcEncAble and cbcDecAble interfaces.
+var _ cbcEncAble = (*sm4CipherAsm)(nil)
 var _ cbcDecAble = (*sm4CipherAsm)(nil)
+
+const cbcEncrypt = 1
+const cbcDecrypt = 0
+
 type cbc struct {
 	b   *sm4CipherAsm
 	iv  []byte
 	tmp []byte
+	enc int
 }
+
+func (b *sm4CipherAsm) NewCBCEncrypter(iv []byte) cipher.BlockMode {
+	var c cbc
+	c.b = b
+	c.enc = cbcEncrypt
+	c.iv = make([]byte, BlockSize)
+	c.tmp = make([]byte, BlockSize)
+	copy(c.iv, iv)
+	return &c
+}
+
 func (b *sm4CipherAsm) NewCBCDecrypter(iv []byte) cipher.BlockMode {
 	var c cbc
 	c.b = b
+	c.enc = cbcDecrypt
 	c.iv = make([]byte, BlockSize)
 	c.tmp = make([]byte, BlockSize)
 	copy(c.iv, iv)
@@ -30,6 +46,9 @@ func (b *sm4CipherAsm) NewCBCDecrypter(iv []byte) cipher.BlockMode {
 func (x *cbc) BlockSize() int { return BlockSize }
 
+//go:noescape
+func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
+
 func (x *cbc) CryptBlocks(dst, src []byte) {
 	if len(src)%BlockSize != 0 {
 		panic("cipher: input not full blocks")
@@ -43,6 +62,10 @@ func (x *cbc) CryptBlocks(dst, src []byte) {
 	if len(src) == 0 {
 		return
 	}
+	if x.enc == cbcEncrypt {
+		encryptBlocksChain(&x.b.enc[0], dst, src, &x.iv[0])
+		return
+	}
 	// For each block, we need to xor the decrypted data with the previous block's ciphertext (the iv).
 	// To avoid making a copy each time, we loop over the blocks BACKWARDS.
 	end := len(src)
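
Because the new cbcEncAble assertion matches the hook that crypto/cipher looks for, cipher.NewCBCEncrypter transparently returns this assembly-backed BlockMode. A usage sketch (the import path is assumed from this repository; key and IV values are placeholders):

	package main

	import (
		"crypto/cipher"
		"fmt"

		"github.com/emmansun/gmsm/sm4" // assumed module path for this repo
	)

	func main() {
		key := make([]byte, 16) // placeholder key; use real key material
		iv := make([]byte, 16)  // placeholder IV; should be random in practice
		src := make([]byte, 32) // input must be a multiple of the 16-byte block size
		dst := make([]byte, len(src))

		block, err := sm4.NewCipher(key)
		if err != nil {
			panic(err)
		}
		// NewCBCEncrypter detects the cbcEncAble implementation and returns
		// the cbc BlockMode defined above instead of the generic one.
		mode := cipher.NewCBCEncrypter(block, iv)
		mode.CryptBlocks(dst, src)
		fmt.Printf("%x\n", dst)
	}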

View File

@@ -0,0 +1,87 @@
//go:build amd64 && !generic
// +build amd64,!generic

#include "textflag.h"

#define x X0
#define y X1
#define t0 X2
#define t1 X3
#define t2 X4
#define t3 X5

#define XTMP6 X6
#define IV X8

#include "aesni_amd64.h"

#define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
	PINSRD $0, (index * 4)(RK)(IND*1), x; \
	PXOR t1, x; \
	PXOR t2, x; \
	PXOR t3, x; \
	SM4_TAO_L1(x, y, z); \
	PXOR x, t0

// func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
TEXT ·encryptBlocksChain(SB),NOSPLIT,$0
#define ctx BX
#define ptx DX
#define ptxLen DI

	MOVQ xk+0(FP), AX
	MOVQ dst+8(FP), ctx
	MOVQ src+32(FP), ptx
	MOVQ src_len+40(FP), ptxLen
	MOVQ iv+56(FP), SI

	MOVUPS (SI), IV

loopSrc:
	CMPQ ptxLen, $16
	JB done_sm4
	SUBQ $16, ptxLen

	MOVUPS (ptx), t0
	PXOR IV, t0

	PSHUFB flip_mask<>(SB), t0
	PSHUFD $1, t0, t1
	PSHUFD $2, t0, t2
	PSHUFD $3, t0, t3

	XORL CX, CX

loopRound:
	SM4_SINGLE_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3)
	SM4_SINGLE_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0)
	SM4_SINGLE_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1)
	SM4_SINGLE_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2)

	ADDL $16, CX
	CMPL CX, $4*32
	JB loopRound

	PEXTRD $0, t2, R8
	PINSRD $1, R8, t3
	PEXTRD $0, t1, R8
	PINSRD $2, R8, t3
	PEXTRD $0, t0, R8
	PINSRD $3, R8, t3
	PSHUFB flip_mask<>(SB), t3

	MOVOU t3, IV
	MOVUPS t3, (ctx)

	LEAQ 16(ptx), ptx
	LEAQ 16(ctx), ctx

	JMP loopSrc

done_sm4:
	MOVUPS IV, (SI)
	RET

#undef ctx
#undef ptx
#undef ptxLen
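
The loop above keeps the running chaining value in the IV register and writes it back to *iv only once at done_sm4, so back-to-back CryptBlocks calls continue the chain without extra copies. A plain-Go reference model of the same loop (a sketch over the generic cipher.Block interface rather than the expanded round keys the assembly consumes; AES stands in for SM4 only to keep the example self-contained):

	package main

	import (
		"crypto/aes" // stand-in block cipher for the sketch
		"crypto/cipher"
		"fmt"
	)

	// encryptBlocksChainRef models encryptBlocksChain: for each block,
	// c[i] = E(p[i] XOR c[i-1]) with c[-1] = iv, and the final ciphertext
	// block is copied back into iv so the next call continues the chain.
	func encryptBlocksChainRef(b cipher.Block, dst, src, iv []byte) {
		bs := b.BlockSize()
		for len(src) >= bs {
			for i := 0; i < bs; i++ {
				iv[i] ^= src[i] // plaintext XOR previous ciphertext (running IV)
			}
			b.Encrypt(dst[:bs], iv) // one block cipher call per block
			copy(iv, dst[:bs])      // ciphertext becomes the next chaining value
			src, dst = src[bs:], dst[bs:]
		}
	}

	func main() {
		b, _ := aes.NewCipher(make([]byte, 16))
		iv := make([]byte, 16)
		src := make([]byte, 32)
		dst := make([]byte, len(src))
		encryptBlocksChainRef(b, dst, src, iv)
		fmt.Printf("%x\n", dst)
	}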

sm4/cbc_cipher_asm_arm64.s Normal file
View File

@@ -0,0 +1,131 @@
//go:build arm64 && !generic
// +build arm64,!generic

#include "textflag.h"

#define x V0
#define y V1
#define t0 V2
#define t1 V3
#define t2 V4
#define t3 V5
#define ZERO V16
#define NIBBLE_MASK V20
#define INVERSE_SHIFT_ROWS V21
#define M1L V22
#define M1H V23
#define M2L V24
#define M2H V25
#define R08_MASK V26
#define R16_MASK V27
#define R24_MASK V28
#define FK_MASK V29
#define XTMP6 V6
#define IV V7

#include "aesni_arm64.h"

#define SM4_ROUND(RK, x, y, z, t0, t1, t2, t3) \
	MOVW.P 4(RK), R19; \
	VMOV R19, x.S4; \
	VEOR t1.B16, x.B16, x.B16; \
	VEOR t2.B16, x.B16, x.B16; \
	VEOR t3.B16, x.B16, x.B16; \
	SM4_TAO_L1(x, y, z); \
	VEOR x.B16, t0.B16, t0.B16

#define load_global_data_1() \
	LDP nibble_mask<>(SB), (R0, R1) \
	VMOV R0, NIBBLE_MASK.D[0] \
	VMOV R1, NIBBLE_MASK.D[1] \
	LDP m1_low<>(SB), (R0, R1) \
	VMOV R0, M1L.D[0] \
	VMOV R1, M1L.D[1] \
	LDP m1_high<>(SB), (R0, R1) \
	VMOV R0, M1H.D[0] \
	VMOV R1, M1H.D[1] \
	LDP m2_low<>(SB), (R0, R1) \
	VMOV R0, M2L.D[0] \
	VMOV R1, M2L.D[1] \
	LDP m2_high<>(SB), (R0, R1) \
	VMOV R0, M2H.D[0] \
	VMOV R1, M2H.D[1] \
	LDP fk_mask<>(SB), (R0, R1) \
	VMOV R0, FK_MASK.D[0] \
	VMOV R1, FK_MASK.D[1] \
	LDP inverse_shift_rows<>(SB), (R0, R1) \
	VMOV R0, INVERSE_SHIFT_ROWS.D[0] \
	VMOV R1, INVERSE_SHIFT_ROWS.D[1]

#define load_global_data_2() \
	load_global_data_1() \
	LDP r08_mask<>(SB), (R0, R1) \
	VMOV R0, R08_MASK.D[0] \
	VMOV R1, R08_MASK.D[1] \
	LDP r16_mask<>(SB), (R0, R1) \
	VMOV R0, R16_MASK.D[0] \
	VMOV R1, R16_MASK.D[1] \
	LDP r24_mask<>(SB), (R0, R1) \
	VMOV R0, R24_MASK.D[0] \
	VMOV R1, R24_MASK.D[1]

// func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
TEXT ·encryptBlocksChain(SB),NOSPLIT,$0
#define ctx R1
#define ptx R3
#define ptxLen R4
#define rkSave R8

	load_global_data_2()

	MOVD xk+0(FP), rkSave
	MOVD dst+8(FP), ctx
	MOVD src+32(FP), ptx
	MOVD src_len+40(FP), ptxLen
	MOVD iv+56(FP), R5
	VLD1 (R5), [IV.B16] // load the chaining IV, as MOVUPS (SI), IV does on amd64

	VEOR ZERO.B16, ZERO.B16, ZERO.B16

loopSrc:
	CMP $16, ptxLen
	BLT done_sm4
	SUB $16, ptxLen

	VLD1.P (ptx), [t0.S4]
	VEOR IV.B16, t0.B16, t0.B16
	VREV32 t0.B16, t0.B16
	VMOV t0.S[1], t1.S[0]
	VMOV t0.S[2], t2.S[0]
	VMOV t0.S[3], t3.S[0]

	EOR R2, R2
	MOVD rkSave, R0

encryptBlockLoop:
	SM4_ROUND(R0, x, y, XTMP6, t0, t1, t2, t3)
	SM4_ROUND(R0, x, y, XTMP6, t1, t2, t3, t0)
	SM4_ROUND(R0, x, y, XTMP6, t2, t3, t0, t1)
	SM4_ROUND(R0, x, y, XTMP6, t3, t0, t1, t2)

	ADD $16, R2
	CMP $128, R2
	BNE encryptBlockLoop

	VMOV t2.S[0], t3.S[1]
	VMOV t1.S[0], t3.S[2]
	VMOV t0.S[0], t3.S[3]
	VREV32 t3.B16, t3.B16
	VST1.P [t3.B16], (ctx)
	VMOV t3.B16, IV.B16

	B loopSrc

done_sm4:
	VST1 [IV.B16], (R5)
	RET

#undef ctx
#undef ptx
#undef ptxLen
#undef rkSave
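
For reference, SM4_ROUND (like SM4_SINGLE_ROUND on amd64) implements one step of the SM4 round function, X[i+4] = X[i] XOR T(X[i+1] XOR X[i+2] XOR X[i+3] XOR rk), where T = L composed with the byte-wise S-box substitution tau; that composition is what the SM4_TAO_L1 macro from the aesni headers computes. A Go sketch of the same per-word update (the S-box constants are elided, so this compiles and shows the structure but is not a working cipher):

	package main

	import (
		"fmt"
		"math/bits"
	)

	// sbox would hold the 256 SM4 S-box constants; elided here for brevity.
	var sbox [256]byte

	// tau applies the S-box to each byte of x (the nonlinear step).
	func tau(x uint32) uint32 {
		return uint32(sbox[x>>24])<<24 | uint32(sbox[x>>16&0xff])<<16 |
			uint32(sbox[x>>8&0xff])<<8 | uint32(sbox[x&0xff])
	}

	// l is the linear transform L used during encryption.
	func l(b uint32) uint32 {
		return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
			bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
	}

	// round is the per-word update each SM4_ROUND invocation performs:
	// t0 = t0 XOR L(tau(t1 XOR t2 XOR t3 XOR rk)).
	func round(rk, t0, t1, t2, t3 uint32) uint32 {
		return t0 ^ l(tau(t1^t2^t3^rk))
	}

	func main() {
		fmt.Printf("%08x\n", round(0, 1, 2, 3, 4))
	}

The assembly runs this update four times per loop iteration, rotating the roles of t0 through t3, and exits after 32 rounds (CX or R2 reaching 4*32 bytes of round keys).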