sm4: avoid using PXOR with m128 memory operands directly

This commit is contained in:
Sun Yimin 2023-11-01 15:00:49 +08:00 committed by GitHub
parent 2f163662b5
commit 8f5e603f94
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 117 additions and 59 deletions

View File

@ -152,14 +152,22 @@ cbcSm4Octets:
SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7) SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
PXOR -16(DX), XWORD0 MOVOU -16(DX), XWTMP0
PXOR 0(DX), XWORD1 PXOR XWTMP0, XWORD0
PXOR 16(DX), XWORD2 MOVOU 0(DX), XWTMP0
PXOR 32(DX), XWORD3 PXOR XWTMP0, XWORD1
PXOR 48(DX), XWORD4 MOVOU 16(DX), XWTMP0
PXOR 64(DX), XWORD5 PXOR XWTMP0, XWORD2
PXOR 80(DX), XWORD6 MOVOU 32(DX), XWTMP0
PXOR 96(DX), XWORD7 PXOR XWTMP0, XWORD3
MOVOU 48(DX), XWTMP0
PXOR XWTMP0, XWORD4
MOVOU 64(DX), XWTMP0
PXOR XWTMP0, XWORD5
MOVOU 80(DX), XWTMP0
PXOR XWTMP0, XWORD6
MOVOU 96(DX), XWTMP0
PXOR XWTMP0, XWORD7
MOVOU XWORD0, 0(BX) MOVOU XWORD0, 0(BX)
MOVOU XWORD1, 16(BX) MOVOU XWORD1, 16(BX)
@ -186,10 +194,14 @@ cbcSm4Nibbles:
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR -16(DX), XWORD0 MOVUPS -16(DX), XWTMP0
PXOR 0(DX), XWORD1 PXOR XWTMP0, XWORD0
PXOR 16(DX), XWORD2 MOVUPS 0(DX), XWTMP0
PXOR 32(DX), XWORD3 PXOR XWTMP0, XWORD1
MOVUPS 16(DX), XWTMP0
PXOR XWTMP0, XWORD2
MOVUPS 32(DX), XWTMP0
PXOR XWTMP0, XWORD3
MOVUPS XWORD0, 0(BX) MOVUPS XWORD0, 0(BX)
MOVUPS XWORD1, 16(BX) MOVUPS XWORD1, 16(BX)
@ -213,10 +225,14 @@ cbCSm4Single:
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR 0(SI), XWORD0 MOVUPS 0(SI), XWTMP0
PXOR -64(DX), XWORD1 PXOR XWTMP0, XWORD0
PXOR -48(DX), XWORD2 MOVUPS -64(DX), XWTMP0
PXOR -32(DX), XWORD3 PXOR XWTMP0, XWORD1
MOVUPS -48(DX), XWTMP0
PXOR XWTMP0, XWORD2
MOVUPS -32(DX), XWTMP0
PXOR XWTMP0, XWORD3
MOVUPS XWORD0, -64(BX) MOVUPS XWORD0, -64(BX)
MOVUPS XWORD1, -48(BX) MOVUPS XWORD1, -48(BX)
@ -230,7 +246,8 @@ cbcSm4Single16:
SM4_SINGLE_BLOCK(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) SM4_SINGLE_BLOCK(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR 0(SI), XWORD0 MOVUPS 0(SI), XWTMP0
PXOR XWTMP0, XWORD0
MOVUPS XWORD0, -16(BX) MOVUPS XWORD0, -16(BX)
@ -242,8 +259,10 @@ cbcSm4Single32:
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR 0(SI), XWORD0 MOVUPS 0(SI), XWTMP0
PXOR -32(DX), XWORD1 PXOR XWTMP0, XWORD0
MOVUPS -32(DX), XWTMP0
PXOR XWTMP0, XWORD1
MOVUPS XWORD0, -32(BX) MOVUPS XWORD0, -32(BX)
MOVUPS XWORD1, -16(BX) MOVUPS XWORD1, -16(BX)
@ -257,9 +276,12 @@ cbcSm4Single48:
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR 0(SI), XWORD0 MOVUPS 0(SI), XWTMP0
PXOR -48(DX), XWORD1 PXOR XWTMP0, XWORD0
PXOR -32(DX), XWORD2 MOVUPS -48(DX), XWTMP0
PXOR XWTMP0, XWORD1
MOVUPS -32(DX), XWTMP0
PXOR XWTMP0, XWORD2
MOVUPS XWORD0, -48(BX) MOVUPS XWORD0, -48(BX)
MOVUPS XWORD1, -32(BX) MOVUPS XWORD1, -32(BX)

View File

@ -677,14 +677,22 @@ gcmSm4EncOctetsLoop:
SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7) SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
PXOR (16*0)(ptx), B0 MOVOU (16*0)(ptx), T0
PXOR (16*1)(ptx), B1 PXOR T0, B0
PXOR (16*2)(ptx), B2 MOVOU (16*1)(ptx), T0
PXOR (16*3)(ptx), B3 PXOR T0, B1
PXOR (16*4)(ptx), B4 MOVOU (16*2)(ptx), T0
PXOR (16*5)(ptx), B5 PXOR T0, B2
PXOR (16*6)(ptx), B6 MOVOU (16*3)(ptx), T0
PXOR (16*7)(ptx), B7 PXOR T0, B3
MOVOU (16*4)(ptx), T0
PXOR T0, B4
MOVOU (16*5)(ptx), T0
PXOR T0, B5
MOVOU (16*6)(ptx), T0
PXOR T0, B6
MOVOU (16*7)(ptx), T0
PXOR T0, B7
MOVOU B0, (16*0)(ctx) MOVOU B0, (16*0)(ctx)
PSHUFB BSWAP, B0 PSHUFB BSWAP, B0
@ -765,10 +773,14 @@ gcmSm4EncNibbles:
MOVOU (8*16 + 3*16)(SP), B3 MOVOU (8*16 + 3*16)(SP), B3
SM4_4BLOCKS_WO_BS(AX, B4, T0, T1, T2, B0, B1, B2, B3) SM4_4BLOCKS_WO_BS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
PXOR (16*0)(ptx), B0 MOVOU (16*0)(ptx), T0
PXOR (16*1)(ptx), B1 PXOR T0, B0
PXOR (16*2)(ptx), B2 MOVOU (16*1)(ptx), T0
PXOR (16*3)(ptx), B3 PXOR T0, B1
MOVOU (16*2)(ptx), T0
PXOR T0, B2
MOVOU (16*3)(ptx), T0
PXOR T0, B3
MOVOU B0, (16*0)(ctx) MOVOU B0, (16*0)(ctx)
MOVOU B1, (16*1)(ctx) MOVOU B1, (16*1)(ctx)
@ -1683,14 +1695,22 @@ gcmSm4DecOctetsLoop:
SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7) SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
PXOR (16*0)(ctx), B0 MOVOU (16*0)(ctx), T0
PXOR (16*1)(ctx), B1 PXOR T0, B0
PXOR (16*2)(ctx), B2 MOVOU (16*1)(ctx), T0
PXOR (16*3)(ctx), B3 PXOR T0, B1
PXOR (16*4)(ctx), B4 MOVOU (16*2)(ctx), T0
PXOR (16*5)(ctx), B5 PXOR T0, B2
PXOR (16*6)(ctx), B6 MOVOU (16*3)(ctx), T0
PXOR (16*7)(ctx), B7 PXOR T0, B3
MOVOU (16*4)(ctx), T0
PXOR T0, B4
MOVOU (16*5)(ctx), T0
PXOR T0, B5
MOVOU (16*6)(ctx), T0
PXOR T0, B6
MOVOU (16*7)(ctx), T0
PXOR T0, B7
MOVOU B0, (16*0)(ptx) MOVOU B0, (16*0)(ptx)
MOVOU B1, (16*1)(ptx) MOVOU B1, (16*1)(ptx)

View File

@ -210,44 +210,60 @@ GLOBL gbGcmPoly<>(SB), (NOPTR+RODATA), $16
// sseLoad4Blocks loads four 16-byte blocks from (16*0..16*3)(DX) into B0-B3
// and XORs each one with the 16-byte value at the same offset from SP.
// In the updated form the SP value is first copied into T0 with MOVOU and
// PXOR is then done register-to-register, so T0 is overwritten by the macro.
// NOTE(review): presumably done because non-VEX PXOR with a memory operand
// requires 16-byte alignment, which the SP slots may not have - confirm.
// NOTE(review): what the SP slots hold is not visible in this hunk.
#define sseLoad4Blocks \ #define sseLoad4Blocks \
MOVOU (16*0)(DX), B0; \ MOVOU (16*0)(DX), B0; \
PXOR (16*0)(SP), B0; \ MOVOU (16*0)(SP), T0; \
PXOR T0, B0; \
MOVOU (16*1)(DX), B1; \ MOVOU (16*1)(DX), B1; \
PXOR (16*1)(SP), B1; \ MOVOU (16*1)(SP), T0; \
PXOR T0, B1; \
MOVOU (16*2)(DX), B2; \ MOVOU (16*2)(DX), B2; \
PXOR (16*2)(SP), B2; \ MOVOU (16*2)(SP), T0; \
PXOR T0, B2; \
MOVOU (16*3)(DX), B3; \ MOVOU (16*3)(DX), B3; \
PXOR (16*3)(SP), B3 MOVOU (16*3)(SP), T0; \
PXOR T0, B3
// sseStore4Blocks XORs B0-B3 with the 16-byte values at (16*0..16*3)(SP)
// and writes each result to the same offset from CX with MOVOU.
// In the updated form each SP value goes through T0 (MOVOU, then a
// register-to-register PXOR), so T0 is overwritten by the macro.
// NOTE(review): presumably done because non-VEX PXOR with a memory operand
// requires 16-byte alignment, which the SP slots may not have - confirm.
#define sseStore4Blocks \ #define sseStore4Blocks \
PXOR (16*0)(SP), B0; \ MOVOU (16*0)(SP), T0; \
PXOR T0, B0; \
MOVOU B0, (16*0)(CX); \ MOVOU B0, (16*0)(CX); \
PXOR (16*1)(SP), B1; \ MOVOU (16*1)(SP), T0; \
PXOR T0, B1; \
MOVOU B1, (16*1)(CX); \ MOVOU B1, (16*1)(CX); \
PXOR (16*2)(SP), B2; \ MOVOU (16*2)(SP), T0; \
PXOR T0, B2; \
MOVOU B2, (16*2)(CX); \ MOVOU B2, (16*2)(CX); \
PXOR (16*3)(SP), B3; \ MOVOU (16*3)(SP), T0; \
PXOR T0, B3; \
MOVOU B3, (16*3)(CX) MOVOU B3, (16*3)(CX)
// sseLoad8Blocks expands sseLoad4Blocks for B0-B3, then does the same for
// B4-B7: each block is loaded from (16*4..16*7)(DX) and XORed with the
// 16-byte value at the same offset from SP.
// In the updated form the SP value is staged in T0 with MOVOU before a
// register-to-register PXOR, so T0 is overwritten by the macro.
// NOTE(review): presumably done because non-VEX PXOR with a memory operand
// requires 16-byte alignment, which the SP slots may not have - confirm.
#define sseLoad8Blocks \ #define sseLoad8Blocks \
sseLoad4Blocks; \ sseLoad4Blocks; \
MOVOU (16*4)(DX), B4; \ MOVOU (16*4)(DX), B4; \
PXOR (16*4)(SP), B4; \ MOVOU (16*4)(SP), T0; \
PXOR T0, B4; \
MOVOU (16*5)(DX), B5; \ MOVOU (16*5)(DX), B5; \
PXOR (16*5)(SP), B5; \ MOVOU (16*5)(SP), T0; \
PXOR T0, B5; \
MOVOU (16*6)(DX), B6; \ MOVOU (16*6)(DX), B6; \
PXOR (16*6)(SP), B6; \ MOVOU (16*6)(SP), T0; \
PXOR T0, B6; \
MOVOU (16*7)(DX), B7; \ MOVOU (16*7)(DX), B7; \
PXOR (16*7)(SP), B7 MOVOU (16*7)(SP), T0; \
PXOR T0, B7
// sseStore8Blocks expands sseStore4Blocks for B0-B3, then does the same for
// B4-B7: each register is XORed with the 16-byte value at (16*4..16*7)(SP)
// and written to the same offset from CX with MOVOU.
// In the updated form each SP value goes through T0 (MOVOU, then a
// register-to-register PXOR), so T0 is overwritten by the macro.
// NOTE(review): presumably done because non-VEX PXOR with a memory operand
// requires 16-byte alignment, which the SP slots may not have - confirm.
#define sseStore8Blocks \ #define sseStore8Blocks \
sseStore4Blocks; \ sseStore4Blocks; \
PXOR (16*4)(SP), B4; \ MOVOU (16*4)(SP), T0; \
PXOR T0, B4; \
MOVOU B4, (16*4)(CX); \ MOVOU B4, (16*4)(CX); \
PXOR (16*5)(SP), B5; \ MOVOU (16*5)(SP), T0; \
PXOR T0, B5; \
MOVOU B5, (16*5)(CX); \ MOVOU B5, (16*5)(CX); \
PXOR (16*6)(SP), B6; \ MOVOU (16*6)(SP), T0; \
PXOR T0, B6; \
MOVOU B6, (16*6)(CX); \ MOVOU B6, (16*6)(CX); \
PXOR (16*7)(SP), B7; \ MOVOU (16*7)(SP), T0; \
PXOR T0, B7; \
MOVOU B7, (16*7)(CX) MOVOU B7, (16*7)(CX)
#define avxLoad4Blocks \ #define avxLoad4Blocks \