sm4: fix cbc iv issue

This commit is contained in:
Sun Yimin 2023-08-08 13:07:10 +08:00 committed by GitHub
parent 0fbc30f868
commit feb76edda8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 24 additions and 46 deletions

View File

@ -132,7 +132,9 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
JE avx2Start JE avx2Start
CMPB ·useAVX(SB), $1 CMPB ·useAVX(SB), $1
JE avxCbcSm4Octets JE avxStart
MOVOU -16(DX), X15
cbcSm4Octets: cbcSm4Octets:
CMPQ DI, $128 CMPQ DI, $128
@ -155,7 +157,7 @@ cbcSm4Octets:
PXOR -16(DX), XWORD0 PXOR -16(DX), XWORD0
PXOR 0(DX), XWORD1 PXOR 0(DX), XWORD1
PXOR 16(DX), XWORD2 PXOR 16(DX), XWORD2
PXOR 32(DX), XWORD3 PXOR 32(DX), XWORD3
PXOR 48(DX), XWORD4 PXOR 48(DX), XWORD4
PXOR 64(DX), XWORD5 PXOR 64(DX), XWORD5
PXOR 80(DX), XWORD6 PXOR 80(DX), XWORD6
@ -211,8 +213,6 @@ cbCSm4Single:
MOVOU -32(DX), XWORD2 MOVOU -32(DX), XWORD2
MOVOU -16(DX), XWORD3 MOVOU -16(DX), XWORD3
MOVOU XWORD0, XWORD4
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR 0(SI), XWORD0 PXOR 0(SI), XWORD0
@ -224,28 +224,24 @@ cbCSm4Single:
MOVUPS XWORD1, -48(BX) MOVUPS XWORD1, -48(BX)
MOVUPS XWORD2, -32(BX) MOVUPS XWORD2, -32(BX)
MOVUPS XWORD3, -16(BX) MOVUPS XWORD3, -16(BX)
MOVUPS XWORD4, (SI)
JMP cbcSm4Done JMP cbcSm4Done
cbcSm4Single16: cbcSm4Single16:
MOVOU -16(DX), XWORD0 MOVOU -16(DX), XWORD0
MOVOU XWORD0, XWORD4
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR 0(SI), XWORD0 PXOR 0(SI), XWORD0
MOVUPS XWORD0, -16(BX) MOVUPS XWORD0, -16(BX)
MOVUPS XWORD4, (SI)
JMP cbcSm4Done JMP cbcSm4Done
cbcSm4Single32: cbcSm4Single32:
MOVOU -32(DX), XWORD0 MOVOU -32(DX), XWORD0
MOVOU -16(DX), XWORD1 MOVOU -16(DX), XWORD1
MOVOU XWORD0, XWORD4
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR 0(SI), XWORD0 PXOR 0(SI), XWORD0
@ -253,7 +249,7 @@ cbcSm4Single32:
MOVUPS XWORD0, -32(BX) MOVUPS XWORD0, -32(BX)
MOVUPS XWORD1, -16(BX) MOVUPS XWORD1, -16(BX)
MOVUPS XWORD4, (SI)
JMP cbcSm4Done JMP cbcSm4Done
cbcSm4Single48: cbcSm4Single48:
@ -261,8 +257,6 @@ cbcSm4Single48:
MOVOU -32(DX), XWORD1 MOVOU -32(DX), XWORD1
MOVOU -16(DX), XWORD2 MOVOU -16(DX), XWORD2
MOVOU XWORD0, XWORD4
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR 0(SI), XWORD0 PXOR 0(SI), XWORD0
@ -272,11 +266,14 @@ cbcSm4Single48:
MOVUPS XWORD0, -48(BX) MOVUPS XWORD0, -48(BX)
MOVUPS XWORD1, -32(BX) MOVUPS XWORD1, -32(BX)
MOVUPS XWORD2, -16(BX) MOVUPS XWORD2, -16(BX)
MOVUPS XWORD4, (SI)
cbcSm4Done: cbcSm4Done:
MOVUPS X15, (SI)
RET RET
avxStart:
VMOVDQU -16(DX), X15
avxCbcSm4Octets: avxCbcSm4Octets:
CMPQ DI, $128 CMPQ DI, $128
JLE avxCbcSm4Nibbles JLE avxCbcSm4Nibbles
@ -354,8 +351,6 @@ avxCbCSm4Single:
VMOVDQU -32(DX), XWORD2 VMOVDQU -32(DX), XWORD2
VMOVDQU -16(DX), XWORD3 VMOVDQU -16(DX), XWORD3
VMOVDQU XWORD0, XWORD4
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0 VPXOR 0(SI), XWORD0, XWORD0
@ -367,28 +362,24 @@ avxCbCSm4Single:
VMOVDQU XWORD1, -48(BX) VMOVDQU XWORD1, -48(BX)
VMOVDQU XWORD2, -32(BX) VMOVDQU XWORD2, -32(BX)
VMOVDQU XWORD3, -16(BX) VMOVDQU XWORD3, -16(BX)
VMOVDQU XWORD4, (SI)
JMP avxCbcSm4Done JMP avxCbcSm4Done
avxCbcSm4Single16: avxCbcSm4Single16:
VMOVDQU -16(DX), XWORD0 VMOVDQU -16(DX), XWORD0
VMOVDQU XWORD0, XWORD4
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0 VPXOR 0(SI), XWORD0, XWORD0
VMOVDQU XWORD0, -16(BX) VMOVDQU XWORD0, -16(BX)
VMOVDQU XWORD4, (SI)
JMP avxCbcSm4Done JMP avxCbcSm4Done
avxCbcSm4Single32: avxCbcSm4Single32:
VMOVDQU -32(DX), XWORD0 VMOVDQU -32(DX), XWORD0
VMOVDQU -16(DX), XWORD1 VMOVDQU -16(DX), XWORD1
VMOVDQU XWORD0, XWORD4
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0 VPXOR 0(SI), XWORD0, XWORD0
@ -396,7 +387,7 @@ avxCbcSm4Single32:
VMOVDQU XWORD0, -32(BX) VMOVDQU XWORD0, -32(BX)
VMOVDQU XWORD1, -16(BX) VMOVDQU XWORD1, -16(BX)
VMOVDQU XWORD4, (SI)
JMP avxCbcSm4Done JMP avxCbcSm4Done
avxCbcSm4Single48: avxCbcSm4Single48:
@ -404,8 +395,6 @@ avxCbcSm4Single48:
VMOVDQU -32(DX), XWORD1 VMOVDQU -32(DX), XWORD1
VMOVDQU -16(DX), XWORD2 VMOVDQU -16(DX), XWORD2
VMOVDQU XWORD0, XWORD4
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0 VPXOR 0(SI), XWORD0, XWORD0
@ -415,9 +404,9 @@ avxCbcSm4Single48:
VMOVDQU XWORD0, -48(BX) VMOVDQU XWORD0, -48(BX)
VMOVDQU XWORD1, -32(BX) VMOVDQU XWORD1, -32(BX)
VMOVDQU XWORD2, -16(BX) VMOVDQU XWORD2, -16(BX)
VMOVDQU XWORD4, (SI)
avxCbcSm4Done: avxCbcSm4Done:
VMOVDQU X15, (SI)
RET RET
avx2Start: avx2Start:
@ -425,6 +414,8 @@ avx2Start:
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK
VMOVDQU -16(DX), X15
avx2_16blocks: avx2_16blocks:
CMPQ DI, $256 CMPQ DI, $256
JLE avx2CbcSm4Octets JLE avx2CbcSm4Octets
@ -572,8 +563,6 @@ avx2CbCSm4Single:
VMOVDQU -32(DX), XWORD2 VMOVDQU -32(DX), XWORD2
VMOVDQU -16(DX), XWORD3 VMOVDQU -16(DX), XWORD3
VMOVDQU XWORD0, XWORD4
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0 VPXOR 0(SI), XWORD0, XWORD0
@ -585,20 +574,17 @@ avx2CbCSm4Single:
VMOVDQU XWORD1, -48(BX) VMOVDQU XWORD1, -48(BX)
VMOVDQU XWORD2, -32(BX) VMOVDQU XWORD2, -32(BX)
VMOVDQU XWORD3, -16(BX) VMOVDQU XWORD3, -16(BX)
VMOVDQU XWORD4, (SI)
JMP avx2CbcSm4Done JMP avx2CbcSm4Done
avx2CbcSm4Single16: avx2CbcSm4Single16:
VMOVDQU -16(DX), XWORD0 VMOVDQU -16(DX), XWORD0
VMOVDQU XWORD0, XWORD4
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0 VPXOR 0(SI), XWORD0, XWORD0
VMOVDQU XWORD0, -16(BX) VMOVDQU XWORD0, -16(BX)
VMOVDQU XWORD4, (SI)
JMP avx2CbcSm4Done JMP avx2CbcSm4Done
@ -606,8 +592,6 @@ avx2CbcSm4Single32:
VMOVDQU -32(DX), XWORD0 VMOVDQU -32(DX), XWORD0
VMOVDQU -16(DX), XWORD1 VMOVDQU -16(DX), XWORD1
VMOVDQU XWORD0, XWORD4
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0 VPXOR 0(SI), XWORD0, XWORD0
@ -615,7 +599,6 @@ avx2CbcSm4Single32:
VMOVDQU XWORD0, -32(BX) VMOVDQU XWORD0, -32(BX)
VMOVDQU XWORD1, -16(BX) VMOVDQU XWORD1, -16(BX)
VMOVDQU XWORD4, (SI)
JMP avx2CbcSm4Done JMP avx2CbcSm4Done
@ -624,8 +607,6 @@ avx2CbcSm4Single48:
VMOVDQU -32(DX), XWORD1 VMOVDQU -32(DX), XWORD1
VMOVDQU -16(DX), XWORD2 VMOVDQU -16(DX), XWORD2
VMOVDQU XWORD0, XWORD4
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0 VPXOR 0(SI), XWORD0, XWORD0
@ -635,8 +616,8 @@ avx2CbcSm4Single48:
VMOVDQU XWORD0, -48(BX) VMOVDQU XWORD0, -48(BX)
VMOVDQU XWORD1, -32(BX) VMOVDQU XWORD1, -32(BX)
VMOVDQU XWORD2, -16(BX) VMOVDQU XWORD2, -16(BX)
VMOVDQU XWORD4, (SI)
avx2CbcSm4Done: avx2CbcSm4Done:
VMOVDQU X15, (SI)
VZEROUPPER VZEROUPPER
RET RET

View File

@ -114,6 +114,10 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
MOVD rk, rkSave MOVD rk, rkSave
VLD1 (R6), [IV.B16] VLD1 (R6), [IV.B16]
ADD srcPtr, srcPtrLen, R10
SUB $16, R10, R10
VLD1 (R10), [V15.S4]
cbcSm4Octets: cbcSm4Octets:
CMP $128, srcPtrLen CMP $128, srcPtrLen
BLE cbcSm4Nibbles BLE cbcSm4Nibbles
@ -233,7 +237,6 @@ cbcSm4Single:
// 4 blocks // 4 blocks
VLD1 (srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4] VLD1 (srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4]
VMOV t0.B16, t4.B16
VREV32 t0.B16, t0.B16 VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16 VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16 VREV32 t2.B16, t2.B16
@ -263,13 +266,11 @@ cbc4BlocksLoop64:
VEOR V8.B16, t3.B16, t3.B16 VEOR V8.B16, t3.B16, t3.B16
VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (dstPtr) VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (dstPtr)
VST1 [t4.S4], (R6)
B cbcSm4Done B cbcSm4Done
cbcSm4Single16: cbcSm4Single16:
VLD1 (srcPtr), [t0.S4] VLD1 (srcPtr), [t0.S4]
VMOV t0.B16, t4.B16
VREV32 t0.B16, t0.B16 VREV32 t0.B16, t0.B16
VMOV t0.S[1], t1.S[0] VMOV t0.S[1], t1.S[0]
VMOV t0.S[2], t2.S[0] VMOV t0.S[2], t2.S[0]
@ -293,13 +294,11 @@ cbc4BlocksLoop16:
VEOR IV.B16, t3.B16, t3.B16 VEOR IV.B16, t3.B16, t3.B16
VST1 [t3.S4], (dstPtr) VST1 [t3.S4], (dstPtr)
VST1 [t4.S4], (R6)
B cbcSm4Done B cbcSm4Done
cbcSm4Single32: cbcSm4Single32:
VLD1 (srcPtr), [t0.S4, t1.S4] VLD1 (srcPtr), [t0.S4, t1.S4]
VMOV t0.B16, t4.B16
VREV32 t0.B16, t0.B16 VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16 VREV32 t1.B16, t1.B16
PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
@ -323,12 +322,10 @@ cbc4BlocksLoop32:
VEOR V6.B16, t1.B16, t1.B16 VEOR V6.B16, t1.B16, t1.B16
VST1 [t0.S4, t1.S4], (dstPtr) VST1 [t0.S4, t1.S4], (dstPtr)
VST1 [t4.S4], (R6)
B cbcSm4Done B cbcSm4Done
cbcSm4Single48: cbcSm4Single48:
VLD1 (srcPtr), [t0.S4, t1.S4, t2.S4] VLD1 (srcPtr), [t0.S4, t1.S4, t2.S4]
VMOV t0.B16, t4.B16
VREV32 t0.B16, t0.B16 VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16 VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16 VREV32 t2.B16, t2.B16
@ -355,7 +352,7 @@ cbc4BlocksLoop48:
VEOR V7.B16, t2.B16, t2.B16 VEOR V7.B16, t2.B16, t2.B16
VST1 [t0.S4, t1.S4, t2.S4], (dstPtr) VST1 [t0.S4, t1.S4, t2.S4], (dstPtr)
VST1 [t4.S4], (R6)
cbcSm4Done: cbcSm4Done:
VST1 [V15.S4], (R6)
RET RET