mirror of https://github.com/emmansun/gmsm.git, synced 2025-05-14 13:06:18 +08:00
[SM4] code format
This commit is contained in:
parent 6dde984da4
commit 067a12cb20
264 sm4/gcm_amd64.s
@@ -259,7 +259,7 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
    PXOR t1, x; \
    PXOR t2, x; \
    PXOR t3, x; \
    SM4_TAO_L1(x, y, z); \
    PXOR x, t0

    // MOVOU r0, tmp2;
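Side note on this hunk: the five instructions implement one SM4 round on the t0 word, t0 ^= T(t1 ^ t2 ^ t3 ^ rk), where x is assumed to already hold the broadcast round key (loaded earlier in the enclosing macro, outside this hunk) and SM4_TAO_L1 supplies the S-box plus linear layer. A minimal scalar sketch, assuming the standard SM4 definition (the names roundT and sbox are illustrative, not taken from this file):

package sm4sketch

import "math/bits"

// sbox would hold the standard SM4 S-box; values omitted in this sketch.
var sbox [256]byte

// roundT mirrors the hunk above: t0 ^= T(t1 ^ t2 ^ t3 ^ rk),
// where T = L ∘ τ (byte-wise S-box followed by the linear layer).
func roundT(t0, t1, t2, t3, rk uint32) uint32 {
    x := t1 ^ t2 ^ t3 ^ rk
    // τ: apply the S-box to each byte of x (the S-box half of SM4_TAO_L1).
    x = uint32(sbox[x>>24])<<24 | uint32(sbox[x>>16&0xff])<<16 |
        uint32(sbox[x>>8&0xff])<<8 | uint32(sbox[x&0xff])
    // L: x ^ (x <<< 2) ^ (x <<< 10) ^ (x <<< 18) ^ (x <<< 24).
    x ^= bits.RotateLeft32(x, 2) ^ bits.RotateLeft32(x, 10) ^
        bits.RotateLeft32(x, 18) ^ bits.RotateLeft32(x, 24)
    return t0 ^ x
}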
@@ -332,12 +332,12 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
    PINSRQ $0, r, r2

#define SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3) \
    PSHUFB flipMask<>(SB), t0; \
    PSHUFB flipMask<>(SB), t1; \
    PSHUFB flipMask<>(SB), t2; \
    PSHUFB flipMask<>(SB), t3; \
    SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y); \
    XORL IND, IND; \
    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
@@ -352,36 +352,36 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    ADDL $16, IND; \
    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    ADDL $16, IND; \
    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    ADDL $16, IND; \
    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    ADDL $16, IND; \
    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    ADDL $16, IND; \
    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
    SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y); \
    PSHUFB BSWAP, t3; \
    PSHUFB BSWAP, t2; \
    PSHUFB BSWAP, t1; \
    PSHUFB BSWAP, t0

#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
    VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2]
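The SM4_4BLOCKS macro above keeps four blocks in flight: it byte-flips them, transposes the 4x4 matrix of 32-bit words so each register holds the same word position of all four blocks, runs the 32 rounds with rotating register roles, then transposes and byte-swaps back. A plain-Go sketch of the transpose idea only (the SSE/AVX macros achieve this with unpack and shuffle instructions; the function name is illustrative):

package sm4sketch

// transpose4x4 is a plain-Go view of what SSE_TRANSPOSE_MATRIX/TRANSPOSE_MATRIX
// achieve: four blocks of four 32-bit words are rearranged so that register i
// ends up holding word i of every block.
func transpose4x4(r0, r1, r2, r3 *[4]uint32) {
    m := [4][4]uint32{*r0, *r1, *r2, *r3}
    for i := 0; i < 4; i++ {
        for j := 0; j < i; j++ {
            m[i][j], m[j][i] = m[j][i], m[i][j]
        }
    }
    *r0, *r1, *r2, *r3 = m[0], m[1], m[2], m[3]
}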
@@ -419,16 +419,16 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0

#define AVX2_SM4_TAO_L1(x, y, xw, yw, tmp) \
    AVX2_SM4_SBOX(x, y, xw, yw, tmp); \
    VBROADCASTI128 r08Mask<>(SB), tmp; \
    VPSHUFB tmp, x, y; \
    VPXOR x, y, y; \
    VBROADCASTI128 r16Mask<>(SB), tmp; \
    VPSHUFB tmp, x, tmp; \
    VPXOR tmp, y, y; \
    VPSLLD $2, y, tmp; \
    VPSRLD $30, y, y; \
    VPXOR tmp, y, y; \
    VBROADCASTI128 r24Mask<>(SB), tmp; \
    VPSHUFB tmp, x, tmp; \
    VPXOR y, x, x; \
    VPXOR x, tmp, x
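AVX2_SM4_TAO_L1 evaluates the SM4 linear transform L(x) = x ^ (x<<<2) ^ (x<<<10) ^ (x<<<18) ^ (x<<<24) without per-bit rotates: assuming the r08/r16/r24 masks rotate each 32-bit lane left by 8, 16 and 24 bits, the macro builds x ^ rot8(x) ^ rot16(x), rotates that by 2 (the VPSLLD $2 / VPSRLD $30 pair), and folds in x and rot24(x). A scalar sketch of the identity it relies on:

package sm4sketch

import "math/bits"

// l is the standard SM4 linear transform L.
func l(x uint32) uint32 {
    return x ^ bits.RotateLeft32(x, 2) ^ bits.RotateLeft32(x, 10) ^
        bits.RotateLeft32(x, 18) ^ bits.RotateLeft32(x, 24)
}

// lViaByteRotations recomputes L the way the macro above does: byte-granular
// rotations (the r08/r16/r24 shuffle masks) plus one 2-bit rotation, which
// expand to the same five terms as l.
func lViaByteRotations(x uint32) uint32 {
    y := x ^ bits.RotateLeft32(x, 8) ^ bits.RotateLeft32(x, 16)
    y = bits.RotateLeft32(y, 2) // = rot2(x) ^ rot10(x) ^ rot18(x)
    return x ^ y ^ bits.RotateLeft32(x, 24)
}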
@@ -504,14 +504,14 @@ TEXT ·gcmSm4Init(SB),NOSPLIT,$0
    XORL CX, CX

sm4InitEncLoop:
    SM4_SINGLE_ROUND(0, RK, CX, T0, T1, T2, B0, B1, B2, B3)
    SM4_SINGLE_ROUND(1, RK, CX, T0, T1, T2, B1, B2, B3, B0)
    SM4_SINGLE_ROUND(2, RK, CX, T0, T1, T2, B2, B3, B0, B1)
    SM4_SINGLE_ROUND(3, RK, CX, T0, T1, T2, B3, B0, B1, B2)

    ADDL $16, CX
    CMPL CX, $4*32
    JB sm4InitEncLoop

    PEXTRD $0, B1, R8
    PINSRD $1, R8, B0
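sm4InitEncLoop runs the 32 SM4 rounds on a single block, four per iteration; rotating the B0..B3 argument order means no register moves are needed, and CX steps by 16 bytes of round keys per pass until it reaches $4*32 = 128 bytes, i.e. 32 keys. A sketch of the same control flow, reusing the roundT helper from the earlier sketch (the trailing word reversal corresponds to the standard SM4 output swap, cf. the PEXTRD/PINSRD shuffle that follows the loop):

package sm4sketch

// encryptBlockWords sketches the structure of sm4InitEncLoop: 32 rounds, four
// per iteration, with the destination word rotating through b0..b3. rk holds
// the 32 expanded round keys.
func encryptBlockWords(rk *[32]uint32, b0, b1, b2, b3 uint32) (uint32, uint32, uint32, uint32) {
    for i := 0; i < 32; i += 4 { // ADDL $16, CX / CMPL CX, $4*32 in the assembly
        b0 = roundT(b0, b1, b2, b3, rk[i])
        b1 = roundT(b1, b2, b3, b0, rk[i+1])
        b2 = roundT(b2, b3, b0, b1, rk[i+2])
        b3 = roundT(b3, b0, b1, b2, rk[i+3])
    }
    // Standard SM4 emits the final state in reverse word order.
    return b3, b2, b1, b0
}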
@@ -608,7 +608,7 @@ TEXT ·gcmSm4Data(SB),NOSPLIT,$0
    MOVQ T+32(FP), tPtr

    //PXOR ACC0, ACC0
    MOVOU (tPtr), ACC0
    MOVOU bswapMask<>(SB), BSWAP
    MOVOU gcmPoly<>(SB), POLY

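For context on the constants loaded here: ACC0 is the running GHASH accumulator (seeded from the tag buffer at tPtr), BSWAP puts blocks into big-endian order, and gcmPoly holds the GCM reduction constant. The computation being accelerated is the standard GHASH recurrence over GF(2^128), not anything specific to this change:

\[
Y_0 = 0, \qquad Y_i = (Y_{i-1} \oplus X_i) \cdot H \bmod \left(x^{128} + x^7 + x^2 + x + 1\right),
\]

where the X_i are the 16-byte AAD/ciphertext blocks and H is the hash key.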
@@ -1206,13 +1206,13 @@ avx2GcmSm4Enc:

    VBROADCASTI128 flipMask<>(SB), XDWTMP0
    // Apply Byte Flip Mask: LE -> BE
    VPSHUFB XDWTMP0, DWB0, DWB0
    VPSHUFB XDWTMP0, DWB1, DWB1
    VPSHUFB XDWTMP0, DWB2, DWB2
    VPSHUFB XDWTMP0, DWB3, DWB3

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
    XORL BX, BX
    VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK

@@ -1222,18 +1222,18 @@ avx2GcmSm4Enc8Loop1:
    AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1)
    AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2)

    ADDL $16, BX
    CMPL BX, $4*32
    JB avx2GcmSm4Enc8Loop1

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)

    VBROADCASTI128 bswapMask<>(SB), DWBSWAP
    VPSHUFB DWBSWAP, DWB0, DWB0
    VPSHUFB DWBSWAP, DWB1, DWB1
    VPSHUFB DWBSWAP, DWB2, DWB2
    VPSHUFB DWBSWAP, DWB3, DWB3

    increment(0)
    increment(1)
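increment(i) advances the i-th pre-built counter block for the next batch; GCM's CTR mode increments only the last 32 bits of the counter, big-endian. A minimal sketch of that inc32 step (the function name is illustrative; the macro is assumed to do the equivalent on its in-register copies):

package sm4sketch

import "encoding/binary"

// inc32 bumps the big-endian 32-bit counter in the last four bytes of a
// 16-byte GCM counter block.
func inc32(block *[16]byte) {
    ctr := binary.BigEndian.Uint32(block[12:])
    binary.BigEndian.PutUint32(block[12:], ctr+1)
}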
@@ -1289,10 +1289,10 @@ avx2GcmSm4EncOctetsLoop:

    VBROADCASTI128 flipMask<>(SB), XDWTMP0
    // Apply Byte Flip Mask: LE -> BE
    VPSHUFB XDWTMP0, DWB0, DWB0
    VPSHUFB XDWTMP0, DWB1, DWB1
    VPSHUFB XDWTMP0, DWB2, DWB2
    VPSHUFB XDWTMP0, DWB3, DWB3

    VMOVDQU (16*0)(SP), T0
    VPSHUFD $78, T0, T1
@@ -1306,8 +1306,8 @@ avx2GcmSm4EncOctetsLoop:
    PCLMULQDQ $0x00, T0, ACC0
    PCLMULQDQ $0x11, T0, ACC1

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
    XORL BX, BX
    VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK

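The two PCLMULQDQ instructions compute the low*low (imm8 0x00) and high*high (imm8 0x11) 64-bit carry-less products of the accumulator and the stored key power; the VPSHUFD $78 above swaps T0's halves, which the surrounding code is assumed to use for the Karatsuba middle term before the polynomial reduction. A bit-by-bit model of the carry-less multiply itself:

package sm4sketch

// clmul64 models what a single PCLMULQDQ computes: the 128-bit carry-less
// (GF(2) polynomial) product of two 64-bit operands.
func clmul64(a, b uint64) (hi, lo uint64) {
    for i := 0; i < 64; i++ {
        if b&(1<<uint(i)) != 0 {
            lo ^= a << uint(i)
            if i != 0 {
                hi ^= a >> uint(64-i)
            }
        }
    }
    return hi, lo
}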
@@ -1321,14 +1321,14 @@ avx2GcmSm4Enc8Loop2:
    CMPL BX, $4*32
    JB avx2GcmSm4Enc8Loop2

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)

    VBROADCASTI128 bswapMask<>(SB), DWBSWAP
    VPSHUFB DWBSWAP, DWB0, DWB0
    VPSHUFB DWBSWAP, DWB1, DWB1
    VPSHUFB DWBSWAP, DWB2, DWB2
    VPSHUFB DWBSWAP, DWB3, DWB3

    mulRound(1)
    increment(0)
@@ -1437,8 +1437,8 @@ avx2GcmSm4EncNibbles:
    VMOVDQU (8*16 + 2*16)(SP), B2
    VMOVDQU (8*16 + 3*16)(SP), B3

    VMOVDQU flipMask<>(SB), B4
    VPSHUFB B4, B0, B0
    VPSHUFB B4, B1, B1
    VPSHUFB B4, B2, B2
    VPSHUFB B4, B3, B3
@@ -1448,21 +1448,21 @@ avx2GcmSm4EncNibbles:
    VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK

avx2GcmSm4Enc4Loop2:
    AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
    AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
    AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
    AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)

    ADDL $16, BX
    CMPL BX, $4*32
    JB avx2GcmSm4Enc4Loop2

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
    VPSHUFB BSWAP, B0, B0
    VPSHUFB BSWAP, B1, B1
    VPSHUFB BSWAP, B2, B2
    VPSHUFB BSWAP, B3, B3

    VMOVDQU (16*0)(ptx), T0
    VPXOR T0, B0, B0
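Once the keystream blocks are back in byte order, the plaintext at (ptx) is XORed in: in CTR/GCM the ciphertext is simply plaintext XOR the encrypted counter blocks. A trivial sketch of that step (names illustrative):

package sm4sketch

// xorKeystream shows the step at the end of the block above: ciphertext is
// plaintext XOR keystream, byte for byte.
func xorKeystream(dst, plaintext, keystream []byte) {
    n := len(plaintext)
    if len(keystream) < n {
        n = len(keystream)
    }
    for i := 0; i < n; i++ {
        dst[i] = plaintext[i] ^ keystream[i]
    }
}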
@@ -1500,8 +1500,8 @@ avx2GcmSm4EncSingles:
    VMOVDQU (8*16 + 2*16)(SP), B2
    VMOVDQU (8*16 + 3*16)(SP), B3

    VMOVDQU flipMask<>(SB), B4
    VPSHUFB B4, B0, B0
    VPSHUFB B4, B1, B1
    VPSHUFB B4, B2, B2
    VPSHUFB B4, B3, B3
@@ -1511,21 +1511,21 @@ avx2GcmSm4EncSingles:
    VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK

avx2GcmSm4Enc4Loop1:
    AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
    AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
    AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
    AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)

    ADDL $16, BX
    CMPL BX, $4*32
    JB avx2GcmSm4Enc4Loop1

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
    VPSHUFB BSWAP, B0, B0
    VPSHUFB BSWAP, B1, B1
    VPSHUFB BSWAP, B2, B2
    VPSHUFB BSWAP, B3, B3

    VMOVDQU B0, (16*0)(SP)
    VMOVDQU B1, (16*1)(SP)
@@ -1932,10 +1932,10 @@ avx2GcmSm4DecOctetsLoop:

    VBROADCASTI128 flipMask<>(SB), XDWTMP0
    // Apply Byte Flip Mask: LE -> BE
    VPSHUFB XDWTMP0, DWB0, DWB0
    VPSHUFB XDWTMP0, DWB1, DWB1
    VPSHUFB XDWTMP0, DWB2, DWB2
    VPSHUFB XDWTMP0, DWB3, DWB3

    VMOVDQU (16*0)(ctx), T0
    VPSHUFB BSWAP, T0, T0
@@ -1952,8 +1952,8 @@ avx2GcmSm4DecOctetsLoop:
    PCLMULQDQ $0x11, T0, ACC1


    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
    XORL BX, BX
    VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK

@@ -1963,18 +1963,18 @@ avx2GcmSm4Dec8Loop2:
    AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1)
    AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2)

    ADDL $16, BX
    CMPL BX, $4*32
    JB avx2GcmSm4Dec8Loop2

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)

    VBROADCASTI128 bswapMask<>(SB), DWBSWAP
    VPSHUFB DWBSWAP, DWB0, DWB0
    VPSHUFB DWBSWAP, DWB1, DWB1
    VPSHUFB DWBSWAP, DWB2, DWB2
    VPSHUFB DWBSWAP, DWB3, DWB3
    decMulRound(1)
    increment(0)
    decMulRound(2)
@@ -2034,8 +2034,8 @@ avx2GcmSm4DecNibbles:
    VMOVDQU (1*16)(SP), B1
    VMOVDQU (2*16)(SP), B2
    VMOVDQU (3*16)(SP), B3
    VMOVDQU flipMask<>(SB), B4
    VPSHUFB B4, B0, B0
    VPSHUFB B4, B1, B1
    VPSHUFB B4, B2, B2
    VPSHUFB B4, B3, B3
@@ -2045,21 +2045,21 @@ avx2GcmSm4DecNibbles:
    VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK

avx2GcmSm4Dec4Loop2:
    AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
    AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
    AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
    AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)

    ADDL $16, BX
    CMPL BX, $4*32
    JB avx2GcmSm4Dec4Loop2

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
    VPSHUFB BSWAP, B0, B0
    VPSHUFB BSWAP, B1, B1
    VPSHUFB BSWAP, B2, B2
    VPSHUFB BSWAP, B3, B3

    VMOVDQU (16*14)(pTbl), T2
    VMOVDQU (16*0)(ctx), T0
@@ -2096,8 +2096,8 @@ avx2GcmSm4DecSingles:
    VMOVDQU (2*16)(SP), B2
    VMOVDQU (3*16)(SP), B3

    VMOVDQU flipMask<>(SB), B4
    VPSHUFB B4, B0, B0
    VPSHUFB B4, B1, B1
    VPSHUFB B4, B2, B2
    VPSHUFB B4, B3, B3
@@ -2107,21 +2107,21 @@ avx2GcmSm4DecSingles:
    VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK

avx2GcmSm4Dec4Loop1:
    AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
    AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
    AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
    AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)

    ADDL $16, BX
    CMPL BX, $4*32
    JB avx2GcmSm4Dec4Loop1

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
    VPSHUFB BSWAP, B0, B0
    VPSHUFB BSWAP, B1, B1
    VPSHUFB BSWAP, B2, B2
    VPSHUFB BSWAP, B3, B3

    VMOVDQU B0, (16*4)(SP)
    VMOVDQU B1, (16*5)(SP)