sm4: fix CBC IV issue

This commit is contained in:
Sun Yimin 2023-08-08 13:07:10 +08:00 committed by GitHub
parent 0fbc30f868
commit feb76edda8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 24 additions and 46 deletions

View File

@ -132,7 +132,9 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
JE avx2Start
CMPB ·useAVX(SB), $1
JE avxCbcSm4Octets
JE avxStart
MOVOU -16(DX), X15
cbcSm4Octets:
CMPQ DI, $128
@ -155,7 +157,7 @@ cbcSm4Octets:
PXOR -16(DX), XWORD0
PXOR 0(DX), XWORD1
PXOR 16(DX), XWORD2
PXOR 32(DX), XWORD3
PXOR 32(DX), XWORD3
PXOR 48(DX), XWORD4
PXOR 64(DX), XWORD5
PXOR 80(DX), XWORD6
@ -211,8 +213,6 @@ cbCSm4Single:
MOVOU -32(DX), XWORD2
MOVOU -16(DX), XWORD3
MOVOU XWORD0, XWORD4
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR 0(SI), XWORD0
@ -224,28 +224,24 @@ cbCSm4Single:
MOVUPS XWORD1, -48(BX)
MOVUPS XWORD2, -32(BX)
MOVUPS XWORD3, -16(BX)
MOVUPS XWORD4, (SI)
JMP cbcSm4Done
cbcSm4Single16:
MOVOU -16(DX), XWORD0
MOVOU XWORD0, XWORD4
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR 0(SI), XWORD0
MOVUPS XWORD0, -16(BX)
MOVUPS XWORD4, (SI)
JMP cbcSm4Done
cbcSm4Single32:
MOVOU -32(DX), XWORD0
MOVOU -16(DX), XWORD1
MOVOU XWORD0, XWORD4
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR 0(SI), XWORD0
@ -253,7 +249,7 @@ cbcSm4Single32:
MOVUPS XWORD0, -32(BX)
MOVUPS XWORD1, -16(BX)
MOVUPS XWORD4, (SI)
JMP cbcSm4Done
cbcSm4Single48:
@ -261,8 +257,6 @@ cbcSm4Single48:
MOVOU -32(DX), XWORD1
MOVOU -16(DX), XWORD2
MOVOU XWORD0, XWORD4
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR 0(SI), XWORD0
@ -272,11 +266,14 @@ cbcSm4Single48:
MOVUPS XWORD0, -48(BX)
MOVUPS XWORD1, -32(BX)
MOVUPS XWORD2, -16(BX)
MOVUPS XWORD4, (SI)
cbcSm4Done:
MOVUPS X15, (SI)
RET
avxStart:
VMOVDQU -16(DX), X15
avxCbcSm4Octets:
CMPQ DI, $128
JLE avxCbcSm4Nibbles
@ -354,8 +351,6 @@ avxCbCSm4Single:
VMOVDQU -32(DX), XWORD2
VMOVDQU -16(DX), XWORD3
VMOVDQU XWORD0, XWORD4
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0
@ -367,28 +362,24 @@ avxCbCSm4Single:
VMOVDQU XWORD1, -48(BX)
VMOVDQU XWORD2, -32(BX)
VMOVDQU XWORD3, -16(BX)
VMOVDQU XWORD4, (SI)
JMP avxCbcSm4Done
avxCbcSm4Single16:
VMOVDQU -16(DX), XWORD0
VMOVDQU XWORD0, XWORD4
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0
VMOVDQU XWORD0, -16(BX)
VMOVDQU XWORD4, (SI)
JMP avxCbcSm4Done
avxCbcSm4Single32:
VMOVDQU -32(DX), XWORD0
VMOVDQU -16(DX), XWORD1
VMOVDQU XWORD0, XWORD4
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0
@ -396,7 +387,7 @@ avxCbcSm4Single32:
VMOVDQU XWORD0, -32(BX)
VMOVDQU XWORD1, -16(BX)
VMOVDQU XWORD4, (SI)
JMP avxCbcSm4Done
avxCbcSm4Single48:
@ -404,8 +395,6 @@ avxCbcSm4Single48:
VMOVDQU -32(DX), XWORD1
VMOVDQU -16(DX), XWORD2
VMOVDQU XWORD0, XWORD4
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0
@ -415,9 +404,9 @@ avxCbcSm4Single48:
VMOVDQU XWORD0, -48(BX)
VMOVDQU XWORD1, -32(BX)
VMOVDQU XWORD2, -16(BX)
VMOVDQU XWORD4, (SI)
avxCbcSm4Done:
VMOVDQU X15, (SI)
RET
avx2Start:
@ -425,6 +414,8 @@ avx2Start:
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK
VMOVDQU -16(DX), X15
avx2_16blocks:
CMPQ DI, $256
JLE avx2CbcSm4Octets
@ -572,8 +563,6 @@ avx2CbCSm4Single:
VMOVDQU -32(DX), XWORD2
VMOVDQU -16(DX), XWORD3
VMOVDQU XWORD0, XWORD4
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0
@ -585,20 +574,17 @@ avx2CbCSm4Single:
VMOVDQU XWORD1, -48(BX)
VMOVDQU XWORD2, -32(BX)
VMOVDQU XWORD3, -16(BX)
VMOVDQU XWORD4, (SI)
JMP avx2CbcSm4Done
avx2CbcSm4Single16:
VMOVDQU -16(DX), XWORD0
VMOVDQU XWORD0, XWORD4
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0
VMOVDQU XWORD0, -16(BX)
VMOVDQU XWORD4, (SI)
JMP avx2CbcSm4Done
@ -606,8 +592,6 @@ avx2CbcSm4Single32:
VMOVDQU -32(DX), XWORD0
VMOVDQU -16(DX), XWORD1
VMOVDQU XWORD0, XWORD4
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0
@ -615,7 +599,6 @@ avx2CbcSm4Single32:
VMOVDQU XWORD0, -32(BX)
VMOVDQU XWORD1, -16(BX)
VMOVDQU XWORD4, (SI)
JMP avx2CbcSm4Done
@ -624,8 +607,6 @@ avx2CbcSm4Single48:
VMOVDQU -32(DX), XWORD1
VMOVDQU -16(DX), XWORD2
VMOVDQU XWORD0, XWORD4
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0
@ -635,8 +616,8 @@ avx2CbcSm4Single48:
VMOVDQU XWORD0, -48(BX)
VMOVDQU XWORD1, -32(BX)
VMOVDQU XWORD2, -16(BX)
VMOVDQU XWORD4, (SI)
avx2CbcSm4Done:
VMOVDQU X15, (SI)
VZEROUPPER
RET

View File

@ -114,6 +114,10 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
MOVD rk, rkSave
VLD1 (R6), [IV.B16]
ADD srcPtr, srcPtrLen, R10
SUB $16, R10, R10
VLD1 (R10), [V15.S4]
cbcSm4Octets:
CMP $128, srcPtrLen
BLE cbcSm4Nibbles
@ -233,7 +237,6 @@ cbcSm4Single:
// 4 blocks
VLD1 (srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4]
VMOV t0.B16, t4.B16
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
@ -263,13 +266,11 @@ cbc4BlocksLoop64:
VEOR V8.B16, t3.B16, t3.B16
VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (dstPtr)
VST1 [t4.S4], (R6)
B cbcSm4Done
cbcSm4Single16:
VLD1 (srcPtr), [t0.S4]
VMOV t0.B16, t4.B16
VREV32 t0.B16, t0.B16
VMOV t0.S[1], t1.S[0]
VMOV t0.S[2], t2.S[0]
@ -293,13 +294,11 @@ cbc4BlocksLoop16:
VEOR IV.B16, t3.B16, t3.B16
VST1 [t3.S4], (dstPtr)
VST1 [t4.S4], (R6)
B cbcSm4Done
cbcSm4Single32:
VLD1 (srcPtr), [t0.S4, t1.S4]
VMOV t0.B16, t4.B16
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
@ -323,12 +322,10 @@ cbc4BlocksLoop32:
VEOR V6.B16, t1.B16, t1.B16
VST1 [t0.S4, t1.S4], (dstPtr)
VST1 [t4.S4], (R6)
B cbcSm4Done
cbcSm4Single48:
VLD1 (srcPtr), [t0.S4, t1.S4, t2.S4]
VMOV t0.B16, t4.B16
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
@ -355,7 +352,7 @@ cbc4BlocksLoop48:
VEOR V7.B16, t2.B16, t2.B16
VST1 [t0.S4, t1.S4, t2.S4], (dstPtr)
VST1 [t4.S4], (R6)
cbcSm4Done:
VST1 [V15.S4], (R6)
RET