mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-26 12:16:20 +08:00
sm4: improve arm64 single block performance & CBC decrypt performance
This commit is contained in:
parent
3cbabc3d1c
commit
f81bbd17df
@ -32,7 +32,7 @@ func BenchmarkSM4CBCEncrypt1K(b *testing.B) {
|
||||
benchmarkCBCEncrypt1K(b, c)
|
||||
}
|
||||
|
||||
func benchmarkSM4CBCDecrypt1K(b *testing.B, block cipher.Block) {
|
||||
func benchmarkCBCDecrypt1K(b *testing.B, block cipher.Block) {
|
||||
buf := make([]byte, 1024)
|
||||
b.SetBytes(int64(len(buf)))
|
||||
|
||||
@ -46,13 +46,13 @@ func benchmarkSM4CBCDecrypt1K(b *testing.B, block cipher.Block) {
|
||||
func BenchmarkAESCBCDecrypt1K(b *testing.B) {
|
||||
var key [16]byte
|
||||
c, _ := aes.NewCipher(key[:])
|
||||
benchmarkSM4CBCDecrypt1K(b, c)
|
||||
benchmarkCBCDecrypt1K(b, c)
|
||||
}
|
||||
|
||||
func BenchmarkSM4CBCDecrypt1K(b *testing.B) {
|
||||
var key [16]byte
|
||||
c, _ := sm4.NewCipher(key[:])
|
||||
benchmarkSM4CBCDecrypt1K(b, c)
|
||||
benchmarkCBCDecrypt1K(b, c)
|
||||
}
|
||||
|
||||
func benchmarkStream(b *testing.B, block cipher.Block, mode func(cipher.Block, []byte) cipher.Stream, buf []byte) {
|
||||
|
@ -883,7 +883,7 @@ type sm2Curve struct {
|
||||
nMinus2 []byte
|
||||
}
|
||||
|
||||
// pointFromAffine is used to convert the PublicKey to a nistec Point.
|
||||
// pointFromAffine is used to convert the PublicKey to a sm2 Point.
|
||||
func (curve *sm2Curve) pointFromAffine(x, y *big.Int) (p *_sm2ec.SM2P256Point, err error) {
|
||||
bitSize := curve.curve.Params().BitSize
|
||||
// Reject values that would not get correctly encoded.
|
||||
@ -902,7 +902,7 @@ func (curve *sm2Curve) pointFromAffine(x, y *big.Int) (p *_sm2ec.SM2P256Point, e
|
||||
return curve.newPoint().SetBytes(buf)
|
||||
}
|
||||
|
||||
// pointToAffine is used to convert a nistec Point to a PublicKey.
|
||||
// pointToAffine is used to convert a sm2 Point to a PublicKey.
|
||||
func (curve *sm2Curve) pointToAffine(p *_sm2ec.SM2P256Point) (x, y *big.Int, err error) {
|
||||
out := p.Bytes()
|
||||
if len(out) == 1 && out[0] == 0 {
|
||||
|
@ -209,7 +209,7 @@ GLOBL fk_mask<>(SB), RODATA, $16
|
||||
// - t2: 128 bits register for data
|
||||
// - t3: 128 bits register for data
|
||||
#define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
|
||||
PINSRD $0, (index * 4)(RK)(IND*1), x; \
|
||||
MOVL (index * 4)(RK)(IND*1), x; \
|
||||
PXOR t1, x; \
|
||||
PXOR t2, x; \
|
||||
PXOR t3, x; \
|
||||
|
@ -338,7 +338,7 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
|
||||
MOVQ dst+8(FP), BX
|
||||
MOVQ src+16(FP), DX
|
||||
|
||||
MOVUPS (DX), t0
|
||||
MOVOU (DX), t0
|
||||
PSHUFB flip_mask<>(SB), t0
|
||||
PSHUFD $1, t0, t1
|
||||
PSHUFD $2, t0, t2
|
||||
@ -356,14 +356,12 @@ loop:
|
||||
CMPL CX, $4*32
|
||||
JB loop
|
||||
|
||||
PEXTRD $0, t2, R8
|
||||
PINSRD $1, R8, t3
|
||||
PEXTRD $0, t1, R8
|
||||
PINSRD $2, R8, t3
|
||||
PEXTRD $0, t0, R8
|
||||
PINSRD $3, R8, t3
|
||||
PSHUFB flip_mask<>(SB), t3
|
||||
MOVUPS t3, (BX)
|
||||
PALIGNR $4, t3, t3
|
||||
PALIGNR $4, t3, t2
|
||||
PALIGNR $4, t2, t1
|
||||
PALIGNR $4, t1, t0
|
||||
PSHUFB flip_mask<>(SB), t0
|
||||
MOVOU t0, (BX)
|
||||
|
||||
done_sm4:
|
||||
RET
|
||||
|
@ -7,7 +7,6 @@ import (
|
||||
"crypto/cipher"
|
||||
|
||||
"github.com/emmansun/gmsm/internal/alias"
|
||||
"github.com/emmansun/gmsm/internal/subtle"
|
||||
)
|
||||
|
||||
// Assert that sm4CipherAsm implements the cbcEncAble and cbcDecAble interfaces.
|
||||
@ -49,6 +48,9 @@ func (x *cbc) BlockSize() int { return BlockSize }
|
||||
//go:noescape
|
||||
func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
|
||||
|
||||
//go:noescape
|
||||
func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
|
||||
|
||||
func (x *cbc) CryptBlocks(dst, src []byte) {
|
||||
if len(src)%BlockSize != 0 {
|
||||
panic("cipher: input not full blocks")
|
||||
@ -76,19 +78,18 @@ func (x *cbc) CryptBlocks(dst, src []byte) {
|
||||
var temp []byte = make([]byte, x.b.blocksSize)
|
||||
var batchSrc []byte = make([]byte, x.b.blocksSize+BlockSize)
|
||||
|
||||
decKeyPtr := &x.b.dec[0]
|
||||
for start > 0 {
|
||||
x.b.DecryptBlocks(temp, src[start:end])
|
||||
subtle.XORBytes(temp, temp, src[start-BlockSize:end-BlockSize])
|
||||
copy(dst[start:], temp)
|
||||
decryptBlocksChain(decKeyPtr, dst[start:end], src[start:end], &src[start-BlockSize])
|
||||
end = start
|
||||
start -= x.b.blocksSize
|
||||
}
|
||||
|
||||
// Handle remain first blocks
|
||||
copy(batchSrc[BlockSize:], src[:end])
|
||||
x.b.DecryptBlocks(temp, batchSrc[BlockSize:])
|
||||
copy(batchSrc, x.iv)
|
||||
subtle.XORBytes(dst, temp[:end], batchSrc)
|
||||
copy(batchSrc[BlockSize:], src[:end])
|
||||
decryptBlocksChain(decKeyPtr, temp, batchSrc[BlockSize:], &batchSrc[0])
|
||||
copy(dst, temp[:end])
|
||||
|
||||
// Set the new iv to the first block we copied earlier.
|
||||
x.iv, x.tmp = x.tmp, x.iv
|
||||
|
@ -34,7 +34,7 @@ loopSrc:
|
||||
JB done_sm4
|
||||
SUBQ $16, ptxLen
|
||||
|
||||
MOVUPS (ptx), t0
|
||||
MOVOU (ptx), t0
|
||||
PXOR IV, t0
|
||||
|
||||
PSHUFB flip_mask<>(SB), t0
|
||||
@ -54,16 +54,14 @@ loopRound:
|
||||
CMPL CX, $4*32
|
||||
JB loopRound
|
||||
|
||||
PEXTRD $0, t2, R8
|
||||
PINSRD $1, R8, t3
|
||||
PEXTRD $0, t1, R8
|
||||
PINSRD $2, R8, t3
|
||||
PEXTRD $0, t0, R8
|
||||
PINSRD $3, R8, t3
|
||||
PSHUFB flip_mask<>(SB), t3
|
||||
PALIGNR $4, t3, t3
|
||||
PALIGNR $4, t3, t2
|
||||
PALIGNR $4, t2, t1
|
||||
PALIGNR $4, t1, t0
|
||||
PSHUFB flip_mask<>(SB), t0
|
||||
|
||||
MOVOU t3, IV
|
||||
MOVUPS t3, (ctx)
|
||||
MOVOU t0, IV
|
||||
MOVOU t0, (ctx)
|
||||
|
||||
LEAQ 16(ptx), ptx
|
||||
LEAQ 16(ctx), ctx
|
||||
@ -77,3 +75,162 @@ done_sm4:
|
||||
#undef ctx
|
||||
#undef ptx
|
||||
#undef ptxLen
|
||||
|
||||
// Register aliases for the AVX2 (8-block) path of decryptBlocksChain.
#define XDWTMP0 Y0
#define XDWTMP1 Y1
#define XDWTMP2 Y2

#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7

#define NIBBLE_MASK Y3
#define X_NIBBLE_MASK X3
#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE

#define XDWORD Y8
#define YDWORD Y9

// XWORD/YWORD are the low 128-bit halves of XDWORD/YDWORD. The SSE path
// also uses them as plain scratch registers.
#define XWORD X8
#define YWORD X9

// SM4 round function, AVX2 version, handle 256 bits
// t0 ^= tao_l1(t1^t2^t3^xk)
// The round key is read from (index * 4)(AX)(CX*1), so AX must hold the
// round-key base and CX the running byte offset into it.
// parameters:
// - index: round key index immediate number
// - x: 256 bits temp register
// - y: 256 bits temp register
// - t0: 256 bits register for data as result
// - t1: 256 bits register for data
// - t2: 256 bits register for data
// - t3: 256 bits register for data
#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
	VPBROADCASTD (index * 4)(AX)(CX*1), x;                                    \
	VPXOR t1, x, x;                                                           \
	VPXOR t2, x, x;                                                           \
	VPXOR t3, x, x;                                                           \
	AVX2_SM4_TAO_L1(x, y, XDWTMP0, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK); \
	VPXOR x, t0, t0

// func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
//
// decryptBlocksChain decrypts one fixed-size batch of CBC ciphertext and
// XORs the result with the chaining data found at iv.
//
//   - xk:  base of the round-key array; 32 words (128 bytes) are read.
//   - dst: output; 64 bytes are written on the SSE path, 128 on the AVX2 path.
//   - src: input; 64 bytes are read on the SSE path, 128 on the AVX2 path.
//   - iv:  start of the chaining data; as many bytes as the batch size are
//     read from it.
//
// The slice lengths are NOT consulted: the caller must supply full batches.
// All reads of src and iv finish before the first store to dst.
TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
	MOVQ xk+0(FP), AX
	MOVQ dst+8(FP), BX
	MOVQ src+32(FP), DX
	MOVQ iv+56(FP), SI

	CMPB ·useAVX2(SB), $1
	JE   avx2

non_avx2_start:
	// Gather word i of each of the four blocks into t<i> (a transposing
	// load), then byte-swap every 32-bit word.
	PINSRD $0, 0(DX), t0
	PINSRD $1, 16(DX), t0
	PINSRD $2, 32(DX), t0
	PINSRD $3, 48(DX), t0
	PSHUFB flip_mask<>(SB), t0

	PINSRD $0, 4(DX), t1
	PINSRD $1, 20(DX), t1
	PINSRD $2, 36(DX), t1
	PINSRD $3, 52(DX), t1
	PSHUFB flip_mask<>(SB), t1

	PINSRD $0, 8(DX), t2
	PINSRD $1, 24(DX), t2
	PINSRD $2, 40(DX), t2
	PINSRD $3, 56(DX), t2
	PSHUFB flip_mask<>(SB), t2

	PINSRD $0, 12(DX), t3
	PINSRD $1, 28(DX), t3
	PINSRD $2, 44(DX), t3
	PINSRD $3, 60(DX), t3
	PSHUFB flip_mask<>(SB), t3

	XORL CX, CX

loop:
	// Four rounds per iteration; CX advances 16 bytes (4 round keys) each
	// time, for 32 rounds in total.
	SM4_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3)
	SM4_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0)
	SM4_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1)
	SM4_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2)

	ADDL $16, CX
	CMPL CX, $4*32
	JB   loop

	PSHUFB flip_mask<>(SB), t3
	PSHUFB flip_mask<>(SB), t2
	PSHUFB flip_mask<>(SB), t1
	PSHUFB flip_mask<>(SB), t0

	SSE_TRANSPOSE_MATRIX(CX, t3, t2, t1, t0, XWORD, YWORD)

	// XOR with the chaining blocks. iv points into a caller-owned slice
	// with no alignment guarantee, and legacy-SSE PXOR with a memory
	// operand faults unless the address is 16-byte aligned. Load with
	// MOVOU first; XWORD/YWORD are free again after the transpose.
	MOVOU 0(SI), XWORD
	MOVOU 16(SI), YWORD
	PXOR  XWORD, t3
	PXOR  YWORD, t2
	MOVOU 32(SI), XWORD
	MOVOU 48(SI), YWORD
	PXOR  XWORD, t1
	PXOR  YWORD, t0

	MOVOU t3, 0(BX)
	MOVOU t2, 16(BX)
	MOVOU t1, 32(BX)
	MOVOU t0, 48(BX)

done_sm4:
	RET

avx2:
	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK

avx2_8blocks:
	VMOVDQU 0(DX), XDWORD0
	VMOVDQU 32(DX), XDWORD1
	VMOVDQU 64(DX), XDWORD2
	VMOVDQU 96(DX), XDWORD3
	VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK

	// Apply Byte Flip Mask: LE -> BE
	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3

	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)

	XORL CX, CX

avx2_loop:
	AVX2_SM4_ROUND(0, XDWORD, YDWORD, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	AVX2_SM4_ROUND(1, XDWORD, YDWORD, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	AVX2_SM4_ROUND(2, XDWORD, YDWORD, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	AVX2_SM4_ROUND(3, XDWORD, YDWORD, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

	ADDL $16, CX
	CMPL CX, $4*32
	JB   avx2_loop

	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)

	VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3

	// VEX-encoded VPXOR has no alignment requirement on its memory
	// operand, so the chaining data can be XORed straight from memory.
	VPXOR 0(SI), XDWORD0, XDWORD0
	VPXOR 32(SI), XDWORD1, XDWORD1
	VPXOR 64(SI), XDWORD2, XDWORD2
	VPXOR 96(SI), XDWORD3, XDWORD3

	VMOVDQU XDWORD0, 0(BX)
	VMOVDQU XDWORD1, 32(BX)
	VMOVDQU XDWORD2, 64(BX)
	VMOVDQU XDWORD3, 96(BX)

avx2_sm4_done:
	VZEROUPPER
	RET
|
||||
|
@ -86,3 +86,50 @@ done_sm4:
|
||||
#undef ptx
|
||||
#undef ptxLen
|
||||
#undef rkSave
|
||||
|
||||
// func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
//
// decryptBlocksChain decrypts one batch of four 16-byte blocks (64 bytes)
// from src, XORs the result with the 64 bytes starting at iv, and stores
// 64 bytes to dst.
//
//   - xk:  base of the round-key array (R8).
//   - dst: output pointer (R9); 64 bytes are written.
//   - src: input pointer (R10); 64 bytes are read.
//   - iv:  start of the chaining data (R11); 64 bytes are read.
//
// The slice lengths are not consulted: the caller must supply a full
// 4-block batch. All loads from src and iv complete before the store.
TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
	LOAD_SM4_AESNI_CONSTS()

	MOVD xk+0(FP), R8
	MOVD dst+8(FP), R9
	MOVD src+32(FP), R10
	MOVD iv+56(FP), R11

	VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16
	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

	VEOR ZERO.B16, ZERO.B16, ZERO.B16
	EOR R0, R0 // R0 = 0: counter, advanced by 16 per 4 rounds

decryptBlocksLoop:
	SM4_ROUND(R8, R19, x, y, XTMP6, t0, t1, t2, t3)
	SM4_ROUND(R8, R19, x, y, XTMP6, t1, t2, t3, t0)
	SM4_ROUND(R8, R19, x, y, XTMP6, t2, t3, t0, t1)
	SM4_ROUND(R8, R19, x, y, XTMP6, t3, t0, t1, t2)

	// 8 iterations x 4 rounds = 32 rounds.
	ADD $16, R0
	CMP $128, R0
	BNE decryptBlocksLoop

	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16

	// Load the four chaining blocks and XOR them into the decrypted data.
	VLD1 (R11), [V6.S4, V7.S4, V8.S4, V9.S4]

	VEOR V6.B16, t0.B16, t0.B16
	VEOR V7.B16, t1.B16, t1.B16
	VEOR V8.B16, t2.B16, t2.B16
	VEOR V9.B16, t3.B16, t3.B16

	VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R9)
	RET
|
||||
|
Loading…
x
Reference in New Issue
Block a user