sm4: stop using PXOR with an m128 memory operand directly; load via MOVOU/MOVUPS first

This commit is contained in:
Sun Yimin 2023-11-01 15:00:49 +08:00 committed by GitHub
parent 2f163662b5
commit 8f5e603f94
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 117 additions and 59 deletions

View File

@ -151,15 +151,23 @@ cbcSm4Octets:
MOVOU 112(DX), XWORD7
SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
PXOR -16(DX), XWORD0
PXOR 0(DX), XWORD1
PXOR 16(DX), XWORD2
PXOR 32(DX), XWORD3
PXOR 48(DX), XWORD4
PXOR 64(DX), XWORD5
PXOR 80(DX), XWORD6
PXOR 96(DX), XWORD7
MOVOU -16(DX), XWTMP0
PXOR XWTMP0, XWORD0
MOVOU 0(DX), XWTMP0
PXOR XWTMP0, XWORD1
MOVOU 16(DX), XWTMP0
PXOR XWTMP0, XWORD2
MOVOU 32(DX), XWTMP0
PXOR XWTMP0, XWORD3
MOVOU 48(DX), XWTMP0
PXOR XWTMP0, XWORD4
MOVOU 64(DX), XWTMP0
PXOR XWTMP0, XWORD5
MOVOU 80(DX), XWTMP0
PXOR XWTMP0, XWORD6
MOVOU 96(DX), XWTMP0
PXOR XWTMP0, XWORD7
MOVOU XWORD0, 0(BX)
MOVOU XWORD1, 16(BX)
@ -186,10 +194,14 @@ cbcSm4Nibbles:
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR -16(DX), XWORD0
PXOR 0(DX), XWORD1
PXOR 16(DX), XWORD2
PXOR 32(DX), XWORD3
MOVUPS -16(DX), XWTMP0
PXOR XWTMP0, XWORD0
MOVUPS 0(DX), XWTMP0
PXOR XWTMP0, XWORD1
MOVUPS 16(DX), XWTMP0
PXOR XWTMP0, XWORD2
MOVUPS 32(DX), XWTMP0
PXOR XWTMP0, XWORD3
MOVUPS XWORD0, 0(BX)
MOVUPS XWORD1, 16(BX)
@ -213,10 +225,14 @@ cbCSm4Single:
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR 0(SI), XWORD0
PXOR -64(DX), XWORD1
PXOR -48(DX), XWORD2
PXOR -32(DX), XWORD3
MOVUPS 0(SI), XWTMP0
PXOR XWTMP0, XWORD0
MOVUPS -64(DX), XWTMP0
PXOR XWTMP0, XWORD1
MOVUPS -48(DX), XWTMP0
PXOR XWTMP0, XWORD2
MOVUPS -32(DX), XWTMP0
PXOR XWTMP0, XWORD3
MOVUPS XWORD0, -64(BX)
MOVUPS XWORD1, -48(BX)
@ -230,7 +246,8 @@ cbcSm4Single16:
SM4_SINGLE_BLOCK(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR 0(SI), XWORD0
MOVUPS 0(SI), XWTMP0
PXOR XWTMP0, XWORD0
MOVUPS XWORD0, -16(BX)
@ -242,8 +259,10 @@ cbcSm4Single32:
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR 0(SI), XWORD0
PXOR -32(DX), XWORD1
MOVUPS 0(SI), XWTMP0
PXOR XWTMP0, XWORD0
MOVUPS -32(DX), XWTMP0
PXOR XWTMP0, XWORD1
MOVUPS XWORD0, -32(BX)
MOVUPS XWORD1, -16(BX)
@ -257,9 +276,12 @@ cbcSm4Single48:
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR 0(SI), XWORD0
PXOR -48(DX), XWORD1
PXOR -32(DX), XWORD2
MOVUPS 0(SI), XWTMP0
PXOR XWTMP0, XWORD0
MOVUPS -48(DX), XWTMP0
PXOR XWTMP0, XWORD1
MOVUPS -32(DX), XWTMP0
PXOR XWTMP0, XWORD2
MOVUPS XWORD0, -48(BX)
MOVUPS XWORD1, -32(BX)

View File

@ -677,14 +677,22 @@ gcmSm4EncOctetsLoop:
SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
PXOR (16*0)(ptx), B0
PXOR (16*1)(ptx), B1
PXOR (16*2)(ptx), B2
PXOR (16*3)(ptx), B3
PXOR (16*4)(ptx), B4
PXOR (16*5)(ptx), B5
PXOR (16*6)(ptx), B6
PXOR (16*7)(ptx), B7
MOVOU (16*0)(ptx), T0
PXOR T0, B0
MOVOU (16*1)(ptx), T0
PXOR T0, B1
MOVOU (16*2)(ptx), T0
PXOR T0, B2
MOVOU (16*3)(ptx), T0
PXOR T0, B3
MOVOU (16*4)(ptx), T0
PXOR T0, B4
MOVOU (16*5)(ptx), T0
PXOR T0, B5
MOVOU (16*6)(ptx), T0
PXOR T0, B6
MOVOU (16*7)(ptx), T0
PXOR T0, B7
MOVOU B0, (16*0)(ctx)
PSHUFB BSWAP, B0
@ -765,10 +773,14 @@ gcmSm4EncNibbles:
MOVOU (8*16 + 3*16)(SP), B3
SM4_4BLOCKS_WO_BS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
PXOR (16*0)(ptx), B0
PXOR (16*1)(ptx), B1
PXOR (16*2)(ptx), B2
PXOR (16*3)(ptx), B3
MOVOU (16*0)(ptx), T0
PXOR T0, B0
MOVOU (16*1)(ptx), T0
PXOR T0, B1
MOVOU (16*2)(ptx), T0
PXOR T0, B2
MOVOU (16*3)(ptx), T0
PXOR T0, B3
MOVOU B0, (16*0)(ctx)
MOVOU B1, (16*1)(ctx)
@ -1683,14 +1695,22 @@ gcmSm4DecOctetsLoop:
SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
PXOR (16*0)(ctx), B0
PXOR (16*1)(ctx), B1
PXOR (16*2)(ctx), B2
PXOR (16*3)(ctx), B3
PXOR (16*4)(ctx), B4
PXOR (16*5)(ctx), B5
PXOR (16*6)(ctx), B6
PXOR (16*7)(ctx), B7
MOVOU (16*0)(ctx), T0
PXOR T0, B0
MOVOU (16*1)(ctx), T0
PXOR T0, B1
MOVOU (16*2)(ctx), T0
PXOR T0, B2
MOVOU (16*3)(ctx), T0
PXOR T0, B3
MOVOU (16*4)(ctx), T0
PXOR T0, B4
MOVOU (16*5)(ctx), T0
PXOR T0, B5
MOVOU (16*6)(ctx), T0
PXOR T0, B6
MOVOU (16*7)(ctx), T0
PXOR T0, B7
MOVOU B0, (16*0)(ptx)
MOVOU B1, (16*1)(ptx)

View File

@ -210,44 +210,60 @@ GLOBL gbGcmPoly<>(SB), (NOPTR+RODATA), $16
// sseLoad4Blocks: for i = 0..3, loads 16 bytes from (16*i)(DX) into Bi and
// XORs Bi with the 16 bytes at (16*i)(SP).
// The MOVOU/PXOR pairs overwrite T0 as scratch.
// NOTE(review): this listing looks like a rendered diff with its +/- markers
// stripped, so every XOR appears in two forms: `PXOR mem, Bi` and
// `MOVOU mem, T0; PXOR T0, Bi`. Going by the commit title, only the MOVOU form
// is expected in the committed file - confirm against the repository.
// NOTE(review): presumably the change exists because legacy-SSE PXOR with a
// memory operand requires 16-byte alignment while MOVOU does not - verify.
#define sseLoad4Blocks \
MOVOU (16*0)(DX), B0; \
PXOR (16*0)(SP), B0; \
MOVOU (16*0)(SP), T0; \
PXOR T0, B0; \
MOVOU (16*1)(DX), B1; \
PXOR (16*1)(SP), B1; \
MOVOU (16*1)(SP), T0; \
PXOR T0, B1; \
MOVOU (16*2)(DX), B2; \
PXOR (16*2)(SP), B2; \
MOVOU (16*2)(SP), T0; \
PXOR T0, B2; \
MOVOU (16*3)(DX), B3; \
PXOR (16*3)(SP), B3
MOVOU (16*3)(SP), T0; \
PXOR T0, B3
// sseStore4Blocks: for i = 0..3, XORs Bi with the 16 bytes at (16*i)(SP) and
// then writes Bi to (16*i)(CX).
// The MOVOU/PXOR pairs overwrite T0 as scratch.
// NOTE(review): this listing looks like a rendered diff with its +/- markers
// stripped, so every XOR appears in two forms: `PXOR mem, Bi` and
// `MOVOU mem, T0; PXOR T0, Bi`. Going by the commit title, only the MOVOU form
// is expected in the committed file - confirm against the repository.
#define sseStore4Blocks \
PXOR (16*0)(SP), B0; \
MOVOU (16*0)(SP), T0; \
PXOR T0, B0; \
MOVOU B0, (16*0)(CX); \
PXOR (16*1)(SP), B1; \
MOVOU (16*1)(SP), T0; \
PXOR T0, B1; \
MOVOU B1, (16*1)(CX); \
PXOR (16*2)(SP), B2; \
MOVOU (16*2)(SP), T0; \
PXOR T0, B2; \
MOVOU B2, (16*2)(CX); \
PXOR (16*3)(SP), B3; \
MOVOU (16*3)(SP), T0; \
PXOR T0, B3; \
MOVOU B3, (16*3)(CX)
// sseLoad8Blocks: expands sseLoad4Blocks, then for i = 4..7 loads 16 bytes
// from (16*i)(DX) into Bi and XORs Bi with the 16 bytes at (16*i)(SP).
// The MOVOU/PXOR pairs overwrite T0 as scratch.
// NOTE(review): this listing looks like a rendered diff with its +/- markers
// stripped, so every XOR appears in two forms: `PXOR mem, Bi` and
// `MOVOU mem, T0; PXOR T0, Bi`. Going by the commit title, only the MOVOU form
// is expected in the committed file - confirm against the repository.
#define sseLoad8Blocks \
sseLoad4Blocks; \
MOVOU (16*4)(DX), B4; \
PXOR (16*4)(SP), B4; \
MOVOU (16*4)(SP), T0; \
PXOR T0, B4; \
MOVOU (16*5)(DX), B5; \
PXOR (16*5)(SP), B5; \
MOVOU (16*5)(SP), T0; \
PXOR T0, B5; \
MOVOU (16*6)(DX), B6; \
PXOR (16*6)(SP), B6; \
MOVOU (16*6)(SP), T0; \
PXOR T0, B6; \
MOVOU (16*7)(DX), B7; \
PXOR (16*7)(SP), B7
MOVOU (16*7)(SP), T0; \
PXOR T0, B7
// sseStore8Blocks: expands sseStore4Blocks, then for i = 4..7 XORs Bi with the
// 16 bytes at (16*i)(SP) and writes Bi to (16*i)(CX).
// The MOVOU/PXOR pairs overwrite T0 as scratch.
// NOTE(review): this listing looks like a rendered diff with its +/- markers
// stripped, so every XOR appears in two forms: `PXOR mem, Bi` and
// `MOVOU mem, T0; PXOR T0, Bi`. Going by the commit title, only the MOVOU form
// is expected in the committed file - confirm against the repository.
#define sseStore8Blocks \
sseStore4Blocks; \
PXOR (16*4)(SP), B4; \
MOVOU (16*4)(SP), T0; \
PXOR T0, B4; \
MOVOU B4, (16*4)(CX); \
PXOR (16*5)(SP), B5; \
MOVOU (16*5)(SP), T0; \
PXOR T0, B5; \
MOVOU B5, (16*5)(CX); \
PXOR (16*6)(SP), B6; \
MOVOU (16*6)(SP), T0; \
PXOR T0, B6; \
MOVOU B6, (16*6)(CX); \
PXOR (16*7)(SP), B7; \
MOVOU (16*7)(SP), T0; \
PXOR T0, B7; \
MOVOU B7, (16*7)(CX)
#define avxLoad4Blocks \