mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-26 04:06:18 +08:00
sm4: optimize cbc iv handling
This commit is contained in:
parent
cb47e82478
commit
0fbc30f868
@ -211,6 +211,8 @@ cbCSm4Single:
|
|||||||
MOVOU -32(DX), XWORD2
|
MOVOU -32(DX), XWORD2
|
||||||
MOVOU -16(DX), XWORD3
|
MOVOU -16(DX), XWORD3
|
||||||
|
|
||||||
|
MOVOU XWORD0, XWORD4
|
||||||
|
|
||||||
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||||
|
|
||||||
PXOR 0(SI), XWORD0
|
PXOR 0(SI), XWORD0
|
||||||
@ -222,22 +224,28 @@ cbCSm4Single:
|
|||||||
MOVUPS XWORD1, -48(BX)
|
MOVUPS XWORD1, -48(BX)
|
||||||
MOVUPS XWORD2, -32(BX)
|
MOVUPS XWORD2, -32(BX)
|
||||||
MOVUPS XWORD3, -16(BX)
|
MOVUPS XWORD3, -16(BX)
|
||||||
|
MOVUPS XWORD4, (SI)
|
||||||
JMP cbcSm4Done
|
JMP cbcSm4Done
|
||||||
|
|
||||||
cbcSm4Single16:
|
cbcSm4Single16:
|
||||||
MOVOU -16(DX), XWORD0
|
MOVOU -16(DX), XWORD0
|
||||||
|
|
||||||
|
MOVOU XWORD0, XWORD4
|
||||||
|
|
||||||
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||||
|
|
||||||
PXOR 0(SI), XWORD0
|
PXOR 0(SI), XWORD0
|
||||||
|
|
||||||
MOVUPS XWORD0, -16(BX)
|
MOVUPS XWORD0, -16(BX)
|
||||||
|
MOVUPS XWORD4, (SI)
|
||||||
JMP cbcSm4Done
|
JMP cbcSm4Done
|
||||||
|
|
||||||
cbcSm4Single32:
|
cbcSm4Single32:
|
||||||
MOVOU -32(DX), XWORD0
|
MOVOU -32(DX), XWORD0
|
||||||
MOVOU -16(DX), XWORD1
|
MOVOU -16(DX), XWORD1
|
||||||
|
|
||||||
|
MOVOU XWORD0, XWORD4
|
||||||
|
|
||||||
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||||
|
|
||||||
PXOR 0(SI), XWORD0
|
PXOR 0(SI), XWORD0
|
||||||
@ -245,6 +253,7 @@ cbcSm4Single32:
|
|||||||
|
|
||||||
MOVUPS XWORD0, -32(BX)
|
MOVUPS XWORD0, -32(BX)
|
||||||
MOVUPS XWORD1, -16(BX)
|
MOVUPS XWORD1, -16(BX)
|
||||||
|
MOVUPS XWORD4, (SI)
|
||||||
JMP cbcSm4Done
|
JMP cbcSm4Done
|
||||||
|
|
||||||
cbcSm4Single48:
|
cbcSm4Single48:
|
||||||
@ -252,6 +261,8 @@ cbcSm4Single48:
|
|||||||
MOVOU -32(DX), XWORD1
|
MOVOU -32(DX), XWORD1
|
||||||
MOVOU -16(DX), XWORD2
|
MOVOU -16(DX), XWORD2
|
||||||
|
|
||||||
|
MOVOU XWORD0, XWORD4
|
||||||
|
|
||||||
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||||
|
|
||||||
PXOR 0(SI), XWORD0
|
PXOR 0(SI), XWORD0
|
||||||
@ -261,6 +272,7 @@ cbcSm4Single48:
|
|||||||
MOVUPS XWORD0, -48(BX)
|
MOVUPS XWORD0, -48(BX)
|
||||||
MOVUPS XWORD1, -32(BX)
|
MOVUPS XWORD1, -32(BX)
|
||||||
MOVUPS XWORD2, -16(BX)
|
MOVUPS XWORD2, -16(BX)
|
||||||
|
MOVUPS XWORD4, (SI)
|
||||||
|
|
||||||
cbcSm4Done:
|
cbcSm4Done:
|
||||||
RET
|
RET
|
||||||
@ -342,6 +354,8 @@ avxCbCSm4Single:
|
|||||||
VMOVDQU -32(DX), XWORD2
|
VMOVDQU -32(DX), XWORD2
|
||||||
VMOVDQU -16(DX), XWORD3
|
VMOVDQU -16(DX), XWORD3
|
||||||
|
|
||||||
|
VMOVDQU XWORD0, XWORD4
|
||||||
|
|
||||||
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||||
|
|
||||||
VPXOR 0(SI), XWORD0, XWORD0
|
VPXOR 0(SI), XWORD0, XWORD0
|
||||||
@ -353,22 +367,28 @@ avxCbCSm4Single:
|
|||||||
VMOVDQU XWORD1, -48(BX)
|
VMOVDQU XWORD1, -48(BX)
|
||||||
VMOVDQU XWORD2, -32(BX)
|
VMOVDQU XWORD2, -32(BX)
|
||||||
VMOVDQU XWORD3, -16(BX)
|
VMOVDQU XWORD3, -16(BX)
|
||||||
|
VMOVDQU XWORD4, (SI)
|
||||||
JMP avxCbcSm4Done
|
JMP avxCbcSm4Done
|
||||||
|
|
||||||
avxCbcSm4Single16:
|
avxCbcSm4Single16:
|
||||||
VMOVDQU -16(DX), XWORD0
|
VMOVDQU -16(DX), XWORD0
|
||||||
|
|
||||||
|
VMOVDQU XWORD0, XWORD4
|
||||||
|
|
||||||
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||||
|
|
||||||
VPXOR 0(SI), XWORD0, XWORD0
|
VPXOR 0(SI), XWORD0, XWORD0
|
||||||
|
|
||||||
VMOVDQU XWORD0, -16(BX)
|
VMOVDQU XWORD0, -16(BX)
|
||||||
|
VMOVDQU XWORD4, (SI)
|
||||||
JMP avxCbcSm4Done
|
JMP avxCbcSm4Done
|
||||||
|
|
||||||
avxCbcSm4Single32:
|
avxCbcSm4Single32:
|
||||||
VMOVDQU -32(DX), XWORD0
|
VMOVDQU -32(DX), XWORD0
|
||||||
VMOVDQU -16(DX), XWORD1
|
VMOVDQU -16(DX), XWORD1
|
||||||
|
|
||||||
|
VMOVDQU XWORD0, XWORD4
|
||||||
|
|
||||||
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||||
|
|
||||||
VPXOR 0(SI), XWORD0, XWORD0
|
VPXOR 0(SI), XWORD0, XWORD0
|
||||||
@ -376,6 +396,7 @@ avxCbcSm4Single32:
|
|||||||
|
|
||||||
VMOVDQU XWORD0, -32(BX)
|
VMOVDQU XWORD0, -32(BX)
|
||||||
VMOVDQU XWORD1, -16(BX)
|
VMOVDQU XWORD1, -16(BX)
|
||||||
|
VMOVDQU XWORD4, (SI)
|
||||||
JMP avxCbcSm4Done
|
JMP avxCbcSm4Done
|
||||||
|
|
||||||
avxCbcSm4Single48:
|
avxCbcSm4Single48:
|
||||||
@ -383,6 +404,8 @@ avxCbcSm4Single48:
|
|||||||
VMOVDQU -32(DX), XWORD1
|
VMOVDQU -32(DX), XWORD1
|
||||||
VMOVDQU -16(DX), XWORD2
|
VMOVDQU -16(DX), XWORD2
|
||||||
|
|
||||||
|
VMOVDQU XWORD0, XWORD4
|
||||||
|
|
||||||
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||||
|
|
||||||
VPXOR 0(SI), XWORD0, XWORD0
|
VPXOR 0(SI), XWORD0, XWORD0
|
||||||
@ -392,6 +415,7 @@ avxCbcSm4Single48:
|
|||||||
VMOVDQU XWORD0, -48(BX)
|
VMOVDQU XWORD0, -48(BX)
|
||||||
VMOVDQU XWORD1, -32(BX)
|
VMOVDQU XWORD1, -32(BX)
|
||||||
VMOVDQU XWORD2, -16(BX)
|
VMOVDQU XWORD2, -16(BX)
|
||||||
|
VMOVDQU XWORD4, (SI)
|
||||||
|
|
||||||
avxCbcSm4Done:
|
avxCbcSm4Done:
|
||||||
RET
|
RET
|
||||||
@ -548,6 +572,8 @@ avx2CbCSm4Single:
|
|||||||
VMOVDQU -32(DX), XWORD2
|
VMOVDQU -32(DX), XWORD2
|
||||||
VMOVDQU -16(DX), XWORD3
|
VMOVDQU -16(DX), XWORD3
|
||||||
|
|
||||||
|
VMOVDQU XWORD0, XWORD4
|
||||||
|
|
||||||
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||||
|
|
||||||
VPXOR 0(SI), XWORD0, XWORD0
|
VPXOR 0(SI), XWORD0, XWORD0
|
||||||
@ -559,22 +585,29 @@ avx2CbCSm4Single:
|
|||||||
VMOVDQU XWORD1, -48(BX)
|
VMOVDQU XWORD1, -48(BX)
|
||||||
VMOVDQU XWORD2, -32(BX)
|
VMOVDQU XWORD2, -32(BX)
|
||||||
VMOVDQU XWORD3, -16(BX)
|
VMOVDQU XWORD3, -16(BX)
|
||||||
|
VMOVDQU XWORD4, (SI)
|
||||||
JMP avx2CbcSm4Done
|
JMP avx2CbcSm4Done
|
||||||
|
|
||||||
avx2CbcSm4Single16:
|
avx2CbcSm4Single16:
|
||||||
VMOVDQU -16(DX), XWORD0
|
VMOVDQU -16(DX), XWORD0
|
||||||
|
|
||||||
|
VMOVDQU XWORD0, XWORD4
|
||||||
|
|
||||||
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||||
|
|
||||||
VPXOR 0(SI), XWORD0, XWORD0
|
VPXOR 0(SI), XWORD0, XWORD0
|
||||||
|
|
||||||
VMOVDQU XWORD0, -16(BX)
|
VMOVDQU XWORD0, -16(BX)
|
||||||
|
VMOVDQU XWORD4, (SI)
|
||||||
|
|
||||||
JMP avx2CbcSm4Done
|
JMP avx2CbcSm4Done
|
||||||
|
|
||||||
avx2CbcSm4Single32:
|
avx2CbcSm4Single32:
|
||||||
VMOVDQU -32(DX), XWORD0
|
VMOVDQU -32(DX), XWORD0
|
||||||
VMOVDQU -16(DX), XWORD1
|
VMOVDQU -16(DX), XWORD1
|
||||||
|
|
||||||
|
VMOVDQU XWORD0, XWORD4
|
||||||
|
|
||||||
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||||
|
|
||||||
VPXOR 0(SI), XWORD0, XWORD0
|
VPXOR 0(SI), XWORD0, XWORD0
|
||||||
@ -582,6 +615,8 @@ avx2CbcSm4Single32:
|
|||||||
|
|
||||||
VMOVDQU XWORD0, -32(BX)
|
VMOVDQU XWORD0, -32(BX)
|
||||||
VMOVDQU XWORD1, -16(BX)
|
VMOVDQU XWORD1, -16(BX)
|
||||||
|
VMOVDQU XWORD4, (SI)
|
||||||
|
|
||||||
JMP avx2CbcSm4Done
|
JMP avx2CbcSm4Done
|
||||||
|
|
||||||
avx2CbcSm4Single48:
|
avx2CbcSm4Single48:
|
||||||
@ -589,6 +624,8 @@ avx2CbcSm4Single48:
|
|||||||
VMOVDQU -32(DX), XWORD1
|
VMOVDQU -32(DX), XWORD1
|
||||||
VMOVDQU -16(DX), XWORD2
|
VMOVDQU -16(DX), XWORD2
|
||||||
|
|
||||||
|
VMOVDQU XWORD0, XWORD4
|
||||||
|
|
||||||
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||||
|
|
||||||
VPXOR 0(SI), XWORD0, XWORD0
|
VPXOR 0(SI), XWORD0, XWORD0
|
||||||
@ -598,6 +635,7 @@ avx2CbcSm4Single48:
|
|||||||
VMOVDQU XWORD0, -48(BX)
|
VMOVDQU XWORD0, -48(BX)
|
||||||
VMOVDQU XWORD1, -32(BX)
|
VMOVDQU XWORD1, -32(BX)
|
||||||
VMOVDQU XWORD2, -16(BX)
|
VMOVDQU XWORD2, -16(BX)
|
||||||
|
VMOVDQU XWORD4, (SI)
|
||||||
|
|
||||||
avx2CbcSm4Done:
|
avx2CbcSm4Done:
|
||||||
VZEROUPPER
|
VZEROUPPER
|
||||||
|
@ -233,6 +233,7 @@ cbcSm4Single:
|
|||||||
|
|
||||||
// 4 blocks
|
// 4 blocks
|
||||||
VLD1 (srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4]
|
VLD1 (srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4]
|
||||||
|
VMOV t0.B16, t4.B16
|
||||||
VREV32 t0.B16, t0.B16
|
VREV32 t0.B16, t0.B16
|
||||||
VREV32 t1.B16, t1.B16
|
VREV32 t1.B16, t1.B16
|
||||||
VREV32 t2.B16, t2.B16
|
VREV32 t2.B16, t2.B16
|
||||||
@ -262,11 +263,13 @@ cbc4BlocksLoop64:
|
|||||||
VEOR V8.B16, t3.B16, t3.B16
|
VEOR V8.B16, t3.B16, t3.B16
|
||||||
|
|
||||||
VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (dstPtr)
|
VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (dstPtr)
|
||||||
|
VST1 [t4.S4], (R6)
|
||||||
|
|
||||||
B cbcSm4Done
|
B cbcSm4Done
|
||||||
|
|
||||||
cbcSm4Single16:
|
cbcSm4Single16:
|
||||||
VLD1 (srcPtr), [t0.S4]
|
VLD1 (srcPtr), [t0.S4]
|
||||||
|
VMOV t0.B16, t4.B16
|
||||||
VREV32 t0.B16, t0.B16
|
VREV32 t0.B16, t0.B16
|
||||||
VMOV t0.S[1], t1.S[0]
|
VMOV t0.S[1], t1.S[0]
|
||||||
VMOV t0.S[2], t2.S[0]
|
VMOV t0.S[2], t2.S[0]
|
||||||
@ -290,11 +293,13 @@ cbc4BlocksLoop16:
|
|||||||
VEOR IV.B16, t3.B16, t3.B16
|
VEOR IV.B16, t3.B16, t3.B16
|
||||||
|
|
||||||
VST1 [t3.S4], (dstPtr)
|
VST1 [t3.S4], (dstPtr)
|
||||||
|
VST1 [t4.S4], (R6)
|
||||||
|
|
||||||
B cbcSm4Done
|
B cbcSm4Done
|
||||||
|
|
||||||
cbcSm4Single32:
|
cbcSm4Single32:
|
||||||
VLD1 (srcPtr), [t0.S4, t1.S4]
|
VLD1 (srcPtr), [t0.S4, t1.S4]
|
||||||
|
VMOV t0.B16, t4.B16
|
||||||
VREV32 t0.B16, t0.B16
|
VREV32 t0.B16, t0.B16
|
||||||
VREV32 t1.B16, t1.B16
|
VREV32 t1.B16, t1.B16
|
||||||
PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
|
PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
|
||||||
@ -318,10 +323,12 @@ cbc4BlocksLoop32:
|
|||||||
VEOR V6.B16, t1.B16, t1.B16
|
VEOR V6.B16, t1.B16, t1.B16
|
||||||
|
|
||||||
VST1 [t0.S4, t1.S4], (dstPtr)
|
VST1 [t0.S4, t1.S4], (dstPtr)
|
||||||
|
VST1 [t4.S4], (R6)
|
||||||
B cbcSm4Done
|
B cbcSm4Done
|
||||||
|
|
||||||
cbcSm4Single48:
|
cbcSm4Single48:
|
||||||
VLD1 (srcPtr), [t0.S4, t1.S4, t2.S4]
|
VLD1 (srcPtr), [t0.S4, t1.S4, t2.S4]
|
||||||
|
VMOV t0.B16, t4.B16
|
||||||
VREV32 t0.B16, t0.B16
|
VREV32 t0.B16, t0.B16
|
||||||
VREV32 t1.B16, t1.B16
|
VREV32 t1.B16, t1.B16
|
||||||
VREV32 t2.B16, t2.B16
|
VREV32 t2.B16, t2.B16
|
||||||
@ -348,6 +355,7 @@ cbc4BlocksLoop48:
|
|||||||
VEOR V7.B16, t2.B16, t2.B16
|
VEOR V7.B16, t2.B16, t2.B16
|
||||||
|
|
||||||
VST1 [t0.S4, t1.S4, t2.S4], (dstPtr)
|
VST1 [t0.S4, t1.S4, t2.S4], (dstPtr)
|
||||||
|
VST1 [t4.S4], (R6)
|
||||||
|
|
||||||
cbcSm4Done:
|
cbcSm4Done:
|
||||||
RET
|
RET
|
||||||
|
@ -19,7 +19,6 @@ const cbcDecrypt = 0
|
|||||||
type cbc struct {
|
type cbc struct {
|
||||||
b *sm4CipherAsm
|
b *sm4CipherAsm
|
||||||
iv []byte
|
iv []byte
|
||||||
tmp []byte
|
|
||||||
enc int
|
enc int
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -28,7 +27,6 @@ func (b *sm4CipherAsm) NewCBCEncrypter(iv []byte) cipher.BlockMode {
|
|||||||
c.b = b
|
c.b = b
|
||||||
c.enc = cbcEncrypt
|
c.enc = cbcEncrypt
|
||||||
c.iv = make([]byte, BlockSize)
|
c.iv = make([]byte, BlockSize)
|
||||||
c.tmp = make([]byte, BlockSize)
|
|
||||||
copy(c.iv, iv)
|
copy(c.iv, iv)
|
||||||
return &c
|
return &c
|
||||||
}
|
}
|
||||||
@ -38,7 +36,6 @@ func (b *sm4CipherAsm) NewCBCDecrypter(iv []byte) cipher.BlockMode {
|
|||||||
c.b = b
|
c.b = b
|
||||||
c.enc = cbcDecrypt
|
c.enc = cbcDecrypt
|
||||||
c.iv = make([]byte, BlockSize)
|
c.iv = make([]byte, BlockSize)
|
||||||
c.tmp = make([]byte, BlockSize)
|
|
||||||
copy(c.iv, iv)
|
copy(c.iv, iv)
|
||||||
return &c
|
return &c
|
||||||
}
|
}
|
||||||
@ -68,16 +65,8 @@ func (x *cbc) CryptBlocks(dst, src []byte) {
|
|||||||
encryptBlocksChain(&x.b.enc[0], dst, src, &x.iv[0])
|
encryptBlocksChain(&x.b.enc[0], dst, src, &x.iv[0])
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// For each block, we need to xor the decrypted data with the previous block's ciphertext (the iv).
|
|
||||||
// To avoid making a copy each time, we loop over the blocks BACKWARDS.
|
|
||||||
end := len(src)
|
|
||||||
// Copy the last block of ciphertext in preparation as the new iv.
|
|
||||||
copy(x.tmp, src[end-BlockSize:end])
|
|
||||||
|
|
||||||
decryptBlocksChain(&x.b.dec[0], dst, src, &x.iv[0])
|
decryptBlocksChain(&x.b.dec[0], dst, src, &x.iv[0])
|
||||||
|
|
||||||
// Set the new iv to the first block we copied earlier.
|
|
||||||
x.iv, x.tmp = x.tmp, x.iv
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (x *cbc) SetIV(iv []byte) {
|
func (x *cbc) SetIV(iv []byte) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user