sm4: gcm amd64 optimization

Sun Yimin 2023-08-28 14:41:15 +08:00 committed by GitHub
parent 3f602061fc
commit 8d5c6c4601
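
The recurring pattern in the hunks below is to derive the ALU-side counter directly from the already byte-swapped counter block with PEXTRD/VPEXTRD $3 instead of a separate MOVL + BSWAPL load, and to interleave the increment() calls with the plaintext XORs and ciphertext stores. A minimal pure-Go sketch of why the two counter derivations agree (assuming flip_mask is the usual per-32-bit-word byte-swap mask; the helper names here are illustrative only, not part of the diff):

package main

import (
	"encoding/binary"
	"fmt"
)

// flipMask32 mimics PSHUFB flip_mask<>: byte-swap each 32-bit lane in place.
// (Assumption: flip_mask is the per-word byte-swap mask used by this kernel.)
func flipMask32(b *[16]byte) {
	for i := 0; i < 16; i += 4 {
		b[i], b[i+1], b[i+2], b[i+3] = b[i+3], b[i+2], b[i+1], b[i]
	}
}

func main() {
	var ctr [16]byte
	ctr[15] = 2 // big-endian counter value 2 in the last 32-bit word

	// Old path: MOVL (3*4)(ctrPtr), aluCTR; BSWAPL aluCTR
	oldALU := binary.BigEndian.Uint32(ctr[12:16])

	// New path: PSHUFB flip_mask<>(SB), T0; PEXTRD $3, T0, aluCTR
	t0 := ctr
	flipMask32(&t0)
	newALU := binary.LittleEndian.Uint32(t0[12:16])

	fmt.Println(oldALU == newALU, oldALU) // true 2
}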


@@ -26,7 +26,6 @@
 #define XDWORD Y1
 #define YDWORD Y3
 #define XDWTMP0 Y5
-#define XDWTMP1 Y7
 #define ACC0 X8
 #define ACC1 X9
@@ -38,8 +37,8 @@
 #define POLY X14
 #define BSWAP X15
 #define DWBSWAP Y15
-#define NIBBLE_MASK Y11
-#define X_NIBBLE_MASK X11
+#define NIBBLE_MASK Y7
+#define X_NIBBLE_MASK X7
 DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
 DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
@@ -527,10 +526,9 @@ TEXT ·gcmSm4Enc(SB),0,$256-96
 PXOR ACC1, ACC1
 PXOR ACCM, ACCM
 MOVOU (ctrPtr), T0
-MOVL (3*4)(ctrPtr), aluCTR
-BSWAPL aluCTR
 PSHUFB flip_mask<>(SB), T0
+PEXTRD $3, T0, aluCTR
 MOVOU T0, (8*16 + 0*16)(SP)
 increment(0)
 MOVOU T0, (8*16 + 1*16)(SP)
@@ -567,29 +565,29 @@ TEXT ·gcmSm4Enc(SB),0,$256-96
 SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
 PXOR ACC1, ACC1
 increment(0)
-increment(1)
-increment(2)
-increment(3)
-increment(4)
-increment(5)
-increment(6)
-increment(7)
 // XOR plaintext
 MOVOU (16*0)(ptx), T0
 PXOR T0, B0
+increment(1)
 MOVOU (16*1)(ptx), T0
 PXOR T0, B1
+increment(2)
 MOVOU (16*2)(ptx), T0
 PXOR T0, B2
+increment(3)
 MOVOU (16*3)(ptx), T0
 PXOR T0, B3
+increment(4)
 MOVOU (16*4)(ptx), T0
 PXOR T0, B4
+increment(5)
 MOVOU (16*5)(ptx), T0
 PXOR T0, B5
+increment(6)
 MOVOU (16*6)(ptx), T0
 PXOR T0, B6
+increment(7)
 MOVOU (16*7)(ptx), T0
 PXOR T0, B7
@@ -664,7 +662,7 @@ gcmSm4EncOctetsLoop:
 increment(5)
 mulRound(7)
 increment(6)
-increment(7)
 PXOR ACC0, ACCM
 PXOR ACC1, ACCM
 MOVOU ACCM, T0
@@ -673,28 +671,21 @@ gcmSm4EncOctetsLoop:
 PXOR ACCM, ACC1
 PXOR T0, ACC0
+increment(7)
 reduceRound(ACC0)
 reduceRound(ACC0)
 PXOR ACC1, ACC0
 SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
-MOVOU (16*0)(ptx), T0
-PXOR T0, B0
-MOVOU (16*1)(ptx), T0
-PXOR T0, B1
-MOVOU (16*2)(ptx), T0
-PXOR T0, B2
-MOVOU (16*3)(ptx), T0
-PXOR T0, B3
-MOVOU (16*4)(ptx), T0
-PXOR T0, B4
-MOVOU (16*5)(ptx), T0
-PXOR T0, B5
-MOVOU (16*6)(ptx), T0
-PXOR T0, B6
-MOVOU (16*7)(ptx), T0
-PXOR T0, B7
+PXOR (16*0)(ptx), B0
+PXOR (16*1)(ptx), B1
+PXOR (16*2)(ptx), B2
+PXOR (16*3)(ptx), B3
+PXOR (16*4)(ptx), B4
+PXOR (16*5)(ptx), B5
+PXOR (16*6)(ptx), B6
+PXOR (16*7)(ptx), B7
 MOVOU B0, (16*0)(ctx)
 PSHUFB BSWAP, B0
@@ -775,14 +766,10 @@ gcmSm4EncNibbles:
 MOVOU (8*16 + 3*16)(SP), B3
 SM4_4BLOCKS_WO_BS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
-MOVOU (16*0)(ptx), T0
-PXOR T0, B0
-MOVOU (16*1)(ptx), T0
-PXOR T0, B1
-MOVOU (16*2)(ptx), T0
-PXOR T0, B2
-MOVOU (16*3)(ptx), T0
-PXOR T0, B3
+PXOR (16*0)(ptx), B0
+PXOR (16*1)(ptx), B1
+PXOR (16*2)(ptx), B2
+PXOR (16*3)(ptx), B3
 MOVOU B0, (16*0)(ctx)
 MOVOU B1, (16*1)(ctx)
@@ -790,14 +777,14 @@ gcmSm4EncNibbles:
 MOVOU B3, (16*3)(ctx)
 MOVOU (16*14)(pTbl), T2
-gcmEncDataStep(B0)
-gcmEncDataStep(B1)
-gcmEncDataStep(B2)
-gcmEncDataStep(B3)
 increment(0)
+gcmEncDataStep(B0)
 increment(1)
+gcmEncDataStep(B1)
 increment(2)
+gcmEncDataStep(B2)
 increment(3)
+gcmEncDataStep(B3)
 LEAQ 64(ptx), ptx
 LEAQ 64(ctx), ctx
@@ -871,10 +858,9 @@ avxGcmSm4Enc:
 VPXOR ACC1, ACC1, ACC1
 VPXOR ACCM, ACCM, ACCM
 VMOVDQU (ctrPtr), T0
-MOVL (3*4)(ctrPtr), aluCTR
-BSWAPL aluCTR
 VPSHUFB flip_mask<>(SB), T0, T0
+VPEXTRD $3, T0, aluCTR
 VMOVDQU T0, (8*16 + 0*16)(SP)
 increment(0)
 VMOVDQU T0, (8*16 + 1*16)(SP)
@@ -911,31 +897,32 @@ avxGcmSm4Enc:
 AVX_SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
 VPXOR ACC1, ACC1, ACC1 // clean ACC1
 increment(0)
-increment(1)
-increment(2)
-increment(3)
-increment(4)
-increment(5)
-increment(6)
-increment(7)
 // XOR plaintext
 VPXOR (16*0)(ptx), B0, B0
 VPXOR (16*1)(ptx), B1, B1
+increment(1)
 VPXOR (16*2)(ptx), B2, B2
 VPXOR (16*3)(ptx), B3, B3
+increment(2)
 VPXOR (16*4)(ptx), B4, B4
 VPXOR (16*5)(ptx), B5, B5
+increment(3)
 VPXOR (16*6)(ptx), B6, B6
 VPXOR (16*7)(ptx), B7, B7
 // Store ciphertext
 VMOVDQU B0, (16*0)(ctx)
 VPSHUFB BSWAP, B0, B0
+increment(4)
 VMOVDQU B1, (16*1)(ctx)
 VPSHUFB BSWAP, B1, B1
+increment(5)
 VMOVDQU B2, (16*2)(ctx)
 VPSHUFB BSWAP, B2, B2
+increment(6)
 VMOVDQU B3, (16*3)(ctx)
 VPSHUFB BSWAP, B3, B3
+increment(7)
 VMOVDQU B4, (16*4)(ctx)
 VPSHUFB BSWAP, B4, B4
 VMOVDQU B5, (16*5)(ctx)
@@ -999,7 +986,6 @@ avxGcmSm4EncOctetsLoop:
 increment(5)
 avxMulRound(7)
 increment(6)
-increment(7)
 VPXOR ACC0, ACCM, ACCM
 VPXOR ACC1, ACCM, ACCM
 VPSLLDQ $8, ACCM, T0
@@ -1008,6 +994,7 @@ avxGcmSm4EncOctetsLoop:
 VPXOR ACCM, ACC1, ACC1
 VPXOR T0, ACC0, ACC0
+increment(7)
 avxReduceRound(ACC0)
 avxReduceRound(ACC0)
 VPXOR ACC1, ACC0, ACC0
@@ -1117,14 +1104,14 @@ avxGcmSm4EncNibbles:
 VMOVDQU B3, (16*3)(ctx)
 VMOVDQU (16*14)(pTbl), T2
-avxGcmEncDataStep(B0)
-avxGcmEncDataStep(B1)
-avxGcmEncDataStep(B2)
-avxGcmEncDataStep(B3)
 increment(0)
+avxGcmEncDataStep(B0)
 increment(1)
+avxGcmEncDataStep(B1)
 increment(2)
+avxGcmEncDataStep(B2)
 increment(3)
+avxGcmEncDataStep(B3)
 LEAQ 64(ptx), ptx
 LEAQ 64(ctx), ctx
@@ -1201,16 +1188,14 @@ avx2GcmSm4Enc:
 VPXOR ACCM, ACCM, ACCM
 VMOVDQU (ctrPtr), T0
 VPSHUFB flip_mask<>(SB), T0, T0
-MOVL (3*4)(ctrPtr), aluCTR
-BSWAPL aluCTR
-VMOVDQU T0, (8*16 + 0*16)(SP)
+VPEXTRD $3, T0, aluCTR
+VINSERTI128 $1, T0, Y11, Y11
+VMOVDQU Y11, (8*16 + 0*32)(SP)
 increment(0)
-VMOVDQU T0, (8*16 + 1*16)(SP)
 increment(1)
-VMOVDQU T0, (8*16 + 2*16)(SP)
+VMOVDQU Y11, (8*16 + 1*32)(SP)
 increment(2)
-VMOVDQU T0, (8*16 + 3*16)(SP)
 increment(3)
 CMPQ ptxLen, $128
@@ -1218,13 +1203,11 @@ avx2GcmSm4Enc:
 SUBQ $128, ptxLen
 // We have at least 8 blocks to encrypt, prepare the rest of the counters
-VMOVDQU T0, (8*16 + 4*16)(SP)
+VMOVDQU Y11, (8*16 + 2*32)(SP)
 increment(4)
-VMOVDQU T0, (8*16 + 5*16)(SP)
 increment(5)
-VMOVDQU T0, (8*16 + 6*16)(SP)
+VMOVDQU Y11, (8*16 + 3*32)(SP)
 increment(6)
-VMOVDQU T0, (8*16 + 7*16)(SP)
 increment(7)
 VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
@@ -1234,38 +1217,35 @@ avx2GcmSm4Enc:
 VMOVDQU (4*32 + 2*32)(SP), DWB2
 VMOVDQU (4*32 + 3*32)(SP), DWB3
-increment(0)
 // Transpose matrix 4 x 4 32bits word
-TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
+TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)
 VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
-increment(1)
-AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
-increment(2)
+AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP0, DWB0, DWB1, DWB2, DWB3)
 // Transpose matrix 4 x 4 32bits word
-TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
+TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)
 VPSHUFB DWBSWAP, DWB0, DWB0
 VPSHUFB DWBSWAP, DWB1, DWB1
-increment(3)
 VPSHUFB DWBSWAP, DWB2, DWB2
 VPSHUFB DWBSWAP, DWB3, DWB3
+increment(0)
+increment(1)
+increment(2)
+increment(3)
 increment(4)
+increment(5)
+increment(6)
+increment(7)
 // XOR plaintext
 VMOVDQU (32*0)(ptx), XDWTMP0
 VPXOR XDWTMP0, DWB0, DWB0
 VMOVDQU (32*1)(ptx), XDWTMP0
 VPXOR XDWTMP0, DWB1, DWB1
-increment(5)
 VMOVDQU (32*2)(ptx), XDWTMP0
 VPXOR XDWTMP0, DWB2, DWB2
 VMOVDQU (32*3)(ptx), XDWTMP0
 VPXOR XDWTMP0, DWB3, DWB3
-increment(6)
 // Store ciphertext
 VMOVDQU DWB0, (32*0)(ctx)
@@ -1276,7 +1256,7 @@ avx2GcmSm4Enc:
 VPSHUFB DWBSWAP, DWB2, DWB2
 VMOVDQU DWB3, (32*3)(ctx)
 VPSHUFB DWBSWAP, DWB3, DWB3
-increment(7)
 //VPXOR XDWTMP0, XDWTMP0, XDWTMP0
 //VINSERTI128 $0, ACC0, XDWTMP0, XDWTMP0
 //VPXOR XDWTMP0, DWB0, DWB0
@@ -1312,13 +1292,12 @@ avx2GcmSm4EncOctetsLoop:
 VPCLMULQDQ $0x11, T0, ACC1, ACC1
 // Transpose matrix 4 x 4 32bits word
-TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
-VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
-AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
+TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)
+AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP0, DWB0, DWB1, DWB2, DWB3)
 // Transpose matrix 4 x 4 32bits word
-TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
+TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)
 VPSHUFB DWBSWAP, DWB0, DWB0
 VPSHUFB DWBSWAP, DWB1, DWB1
@@ -1339,7 +1318,7 @@ avx2GcmSm4EncOctetsLoop:
 increment(5)
 avxMulRound(7)
 increment(6)
-increment(7)
 VPXOR ACC0, ACCM, ACCM
 VPXOR ACC1, ACCM, ACCM
 VPSLLDQ $8, ACCM, T0
@@ -1348,6 +1327,7 @@ avx2GcmSm4EncOctetsLoop:
 VPXOR ACCM, ACC1, ACC1
 VPXOR T0, ACC0, ACC0
+increment(7)
 avxReduceRound(ACC0)
 avxReduceRound(ACC0)
 VPXOR ACC1, ACC0, ACC0
@@ -1442,12 +1422,12 @@ avx2GcmSm4EncNibbles:
 VMOVDQU (16*14)(pTbl), T2
 avxGcmEncDataStep(B0)
-avxGcmEncDataStep(B1)
-avxGcmEncDataStep(B2)
-avxGcmEncDataStep(B3)
 increment(0)
+avxGcmEncDataStep(B1)
 increment(1)
+avxGcmEncDataStep(B2)
 increment(2)
+avxGcmEncDataStep(B3)
 increment(3)
 LEAQ 64(ptx), ptx
@@ -1622,10 +1602,9 @@ TEXT ·gcmSm4Dec(SB),0,$128-96
 PXOR ACC1, ACC1
 PXOR ACCM, ACCM
 MOVOU (ctrPtr), T0
-MOVL (3*4)(ctrPtr), aluCTR
-BSWAPL aluCTR
 PSHUFB flip_mask<>(SB), T0
+PEXTRD $3, T0, aluCTR
 MOVOU T0, (0*16)(SP)
 increment(0)
 MOVOU T0, (1*16)(SP)
@@ -1706,22 +1685,14 @@ gcmSm4DecOctetsLoop:
 SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
-MOVOU (16*0)(ctx), T0
-PXOR T0, B0
-MOVOU (16*1)(ctx), T0
-PXOR T0, B1
-MOVOU (16*2)(ctx), T0
-PXOR T0, B2
-MOVOU (16*3)(ctx), T0
-PXOR T0, B3
-MOVOU (16*4)(ctx), T0
-PXOR T0, B4
-MOVOU (16*5)(ctx), T0
-PXOR T0, B5
-MOVOU (16*6)(ctx), T0
-PXOR T0, B6
-MOVOU (16*7)(ctx), T0
-PXOR T0, B7
+PXOR (16*0)(ctx), B0
+PXOR (16*1)(ctx), B1
+PXOR (16*2)(ctx), B2
+PXOR (16*3)(ctx), B3
+PXOR (16*4)(ctx), B4
+PXOR (16*5)(ctx), B5
+PXOR (16*6)(ctx), B6
+PXOR (16*7)(ctx), B7
 MOVOU B0, (16*0)(ptx)
 MOVOU B1, (16*1)(ptx)
@@ -1752,22 +1723,22 @@ gcmSm4DecNibbles:
 SM4_4BLOCKS_WO_BS(rk, B0, T0, T1, T2, B4, B5, B6, B7)
 MOVOU (16*14)(pTbl), T2
-MOVOU (16*0)(ctx), T0
-PXOR T0, B4
-MOVOU (16*1)(ctx), T0
-PXOR T0, B5
-MOVOU (16*2)(ctx), T0
-PXOR T0, B6
-MOVOU (16*3)(ctx), T0
-PXOR T0, B7
-decGhashRound(0)
+MOVOU (16*0)(ctx), B0
+PXOR B0, B4
+internalDecGhashRound()
 increment(0)
-decGhashRound(1)
+MOVOU (16*1)(ctx), B0
+PXOR B0, B5
+internalDecGhashRound()
 increment(1)
-decGhashRound(2)
+MOVOU (16*2)(ctx), B0
+PXOR B0, B6
+internalDecGhashRound()
 increment(2)
-decGhashRound(3)
+MOVOU (16*3)(ctx), B0
+PXOR B0, B7
+internalDecGhashRound()
 increment(3)
 MOVOU B4, (16*0)(ptx)
@@ -1851,10 +1822,9 @@ avxGcmSm4Dec:
 VPXOR ACC1, ACC1, ACC1
 VPXOR ACCM, ACCM, ACCM
 VMOVDQU (ctrPtr), T0
-MOVL (3*4)(ctrPtr), aluCTR
-BSWAPL aluCTR
 VPSHUFB flip_mask<>(SB), T0, T0
+VPEXTRD $3, T0, aluCTR
 VMOVDQU T0, (0*16)(SP)
 increment(0)
 VMOVDQU T0, (1*16)(SP)
@@ -1918,7 +1888,7 @@ avxGcmSm4DecOctetsLoop:
 increment(5)
 avxDecMulRound(7)
 increment(6)
-increment(7)
 VPXOR ACC0, ACCM, ACCM
 VPXOR ACC1, ACCM, ACCM
@@ -1928,6 +1898,7 @@ avxGcmSm4DecOctetsLoop:
 VPXOR ACCM, ACC1, ACC1
 VPXOR T0, ACC0, ACC0
+increment(7)
 avxReduceRound(ACC0)
 avxReduceRound(ACC0)
 VPXOR ACC1, ACC0, ACC0
@@ -2075,33 +2046,30 @@ avx2GcmSm4Dec:
 VPXOR ACC1, ACC1, ACC1
 VPXOR ACCM, ACCM, ACCM
 VMOVDQU (ctrPtr), T0
-MOVL (3*4)(ctrPtr), aluCTR
-BSWAPL aluCTR
 VPSHUFB flip_mask<>(SB), T0, T0
-VMOVDQU T0, (0*16)(SP)
+VPEXTRD $3, T0, aluCTR
+VINSERTI128 $1, T0, Y11, Y11
+VMOVDQU Y11, (0*32)(SP)
 increment(0)
-VMOVDQU T0, (1*16)(SP)
 increment(1)
-VMOVDQU T0, (2*16)(SP)
+VMOVDQU Y11, (1*32)(SP)
 increment(2)
-VMOVDQU T0, (3*16)(SP)
 increment(3)
 CMPQ ptxLen, $128
 JB avx2GcmSm4DecNibbles
 // We have at least 8 blocks to dencrypt, prepare the rest of the counters
-VMOVDQU T0, (4*16)(SP)
+VMOVDQU Y11, (2*32)(SP)
 increment(4)
-VMOVDQU T0, (5*16)(SP)
 increment(5)
-VMOVDQU T0, (6*16)(SP)
+VMOVDQU Y11, (3*32)(SP)
 increment(6)
-VMOVDQU T0, (7*16)(SP)
 increment(7)
 VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
+VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
 avx2GcmSm4DecOctetsLoop:
 CMPQ ptxLen, $128
@@ -2141,7 +2109,6 @@ avx2GcmSm4DecOctetsLoop:
 increment(5)
 avxDecMulRound(7)
 increment(6)
-increment(7)
 VPXOR ACC0, ACCM, ACCM
 VPXOR ACC1, ACCM, ACCM
@@ -2150,19 +2117,19 @@ avx2GcmSm4DecOctetsLoop:
 VPXOR ACCM, ACC1, ACC1
 VPXOR T0, ACC0, ACC0
+increment(7)
 avxReduceRound(ACC0)
 avxReduceRound(ACC0)
 VPXOR ACC1, ACC0, ACC0
 // Transpose matrix 4 x 4 32bits word
-TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
-VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
-AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
+TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)
+AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP0, DWB0, DWB1, DWB2, DWB3)
 // Transpose matrix 4 x 4 32bits word
-TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
+TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)
 VPSHUFB DWBSWAP, DWB0, DWB0
 VPSHUFB DWBSWAP, DWB1, DWB1
@@ -2202,18 +2169,22 @@ avx2GcmSm4DecNibbles:
 VMOVDQU (16*14)(pTbl), T2
 VMOVDQU (16*0)(ctx), B0
 VPXOR B0, B4, B4
+increment(0)
 internalAvxDecGhashRound()
 VMOVDQU (16*1)(ctx), B0
 VPXOR B0, B1, B1
+increment(1)
 internalAvxDecGhashRound()
 VMOVDQU (16*2)(ctx), B0
 VPXOR B0, B2, B2
+increment(2)
 internalAvxDecGhashRound()
 VMOVDQU (16*3)(ctx), B0
 VPXOR B0, B3, B3
+increment(3)
 internalAvxDecGhashRound()
 VMOVDQU B4, (16*0)(ptx)
@@ -2221,11 +2192,6 @@ avx2GcmSm4DecNibbles:
 VMOVDQU B2, (16*2)(ptx)
 VMOVDQU B3, (16*3)(ptx)
-increment(0)
-increment(1)
-increment(2)
-increment(3)
 LEAQ 64(ptx), ptx
 LEAQ 64(ctx), ctx