Repository: https://github.com/emmansun/gmsm.git
Commit 8d5c6c4601 (parent 3f602061fc)

sm4: gcm amd64 optimization

Changed file: sm4/gcm_amd64.s (256 lines changed)
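The hunks below make a few recurring changes to the GCM fast paths in sm4/gcm_amd64.s: the 32-bit CTR value is now taken from the already byte-swapped counter block with PEXTRD/VPEXTRD $3 instead of a separate MOVL plus BSWAPL; the increment(n) counter updates and the gcmEncDataStep/GHASH steps are interleaved with neighboring loads, XORs and stores instead of being issued back to back; plaintext and ciphertext XORs use memory operands (for example PXOR (16*0)(ptx), B0) rather than a MOVOU into T0 followed by a register PXOR; NIBBLE_MASK/X_NIBBLE_MASK move from Y11/X11 to Y7/X7; and the AVX2 paths write counter blocks to the stack two at a time through Y11 using VINSERTI128 and 256-bit VMOVDQU stores.

The Go sketch below is illustrative only and is not part of the commit. It shows why the removed counter load (MOVL of the last word plus BSWAPL) and the new one (PEXTRD of dword lane 3 after the flip_mask shuffle) yield the same value, assuming flip_mask is the usual mask that byte-swaps each 32-bit word of the block.

package main

import (
	"encoding/binary"
	"fmt"
)

// counterOld mirrors the removed sequence MOVL (3*4)(ctrPtr), aluCTR; BSWAPL aluCTR,
// i.e. the last 32-bit word of the counter block read as big-endian.
func counterOld(block [16]byte) uint32 {
	return binary.BigEndian.Uint32(block[12:16])
}

// counterNew mirrors the new sequence PSHUFB flip_mask, T0; PEXTRD $3, T0, aluCTR.
// Assumption: flip_mask byte-swaps each 32-bit word of the 128-bit block.
func counterNew(block [16]byte) uint32 {
	var flipped [16]byte
	for i := 0; i < 16; i += 4 {
		flipped[i], flipped[i+1], flipped[i+2], flipped[i+3] =
			block[i+3], block[i+2], block[i+1], block[i]
	}
	// PEXTRD $3 extracts dword lane 3, which the CPU reads little-endian.
	return binary.LittleEndian.Uint32(flipped[12:16])
}

func main() {
	blk := [16]byte{14: 0x01, 15: 0x02} // the counter lives in block[12:16]
	fmt.Println(counterOld(blk), counterNew(blk), counterOld(blk) == counterNew(blk))
}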
@@ -26,7 +26,6 @@
 #define XDWORD Y1
 #define YDWORD Y3
 #define XDWTMP0 Y5
-#define XDWTMP1 Y7

 #define ACC0 X8
 #define ACC1 X9
@@ -38,8 +37,8 @@
 #define POLY X14
 #define BSWAP X15
 #define DWBSWAP Y15
-#define NIBBLE_MASK Y11
-#define X_NIBBLE_MASK X11
+#define NIBBLE_MASK Y7
+#define X_NIBBLE_MASK X7

 DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
 DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
@@ -527,10 +526,9 @@ TEXT ·gcmSm4Enc(SB),0,$256-96
 PXOR ACC1, ACC1
 PXOR ACCM, ACCM
 MOVOU (ctrPtr), T0
-MOVL (3*4)(ctrPtr), aluCTR
-BSWAPL aluCTR
 PSHUFB flip_mask<>(SB), T0
+PEXTRD $3, T0, aluCTR

 MOVOU T0, (8*16 + 0*16)(SP)
 increment(0)
 MOVOU T0, (8*16 + 1*16)(SP)
@@ -567,29 +565,29 @@ TEXT ·gcmSm4Enc(SB),0,$256-96
 SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
 PXOR ACC1, ACC1
 increment(0)
-increment(1)
-increment(2)
-increment(3)
-increment(4)
-increment(5)
-increment(6)
-increment(7)

 // XOR plaintext
 MOVOU (16*0)(ptx), T0
 PXOR T0, B0
+increment(1)
 MOVOU (16*1)(ptx), T0
 PXOR T0, B1
+increment(2)
 MOVOU (16*2)(ptx), T0
 PXOR T0, B2
+increment(3)
 MOVOU (16*3)(ptx), T0
 PXOR T0, B3
+increment(4)
 MOVOU (16*4)(ptx), T0
 PXOR T0, B4
+increment(5)
 MOVOU (16*5)(ptx), T0
 PXOR T0, B5
+increment(6)
 MOVOU (16*6)(ptx), T0
 PXOR T0, B6
+increment(7)
 MOVOU (16*7)(ptx), T0
 PXOR T0, B7

@@ -664,7 +662,7 @@ gcmSm4EncOctetsLoop:
 increment(5)
 mulRound(7)
 increment(6)
-increment(7)
 PXOR ACC0, ACCM
 PXOR ACC1, ACCM
 MOVOU ACCM, T0
@@ -673,28 +671,21 @@ gcmSm4EncOctetsLoop:
 PXOR ACCM, ACC1
 PXOR T0, ACC0

+increment(7)
 reduceRound(ACC0)
 reduceRound(ACC0)
 PXOR ACC1, ACC0

 SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)

-MOVOU (16*0)(ptx), T0
-PXOR T0, B0
-MOVOU (16*1)(ptx), T0
-PXOR T0, B1
-MOVOU (16*2)(ptx), T0
-PXOR T0, B2
-MOVOU (16*3)(ptx), T0
-PXOR T0, B3
-MOVOU (16*4)(ptx), T0
-PXOR T0, B4
-MOVOU (16*5)(ptx), T0
-PXOR T0, B5
-MOVOU (16*6)(ptx), T0
-PXOR T0, B6
-MOVOU (16*7)(ptx), T0
-PXOR T0, B7
+PXOR (16*0)(ptx), B0
+PXOR (16*1)(ptx), B1
+PXOR (16*2)(ptx), B2
+PXOR (16*3)(ptx), B3
+PXOR (16*4)(ptx), B4
+PXOR (16*5)(ptx), B5
+PXOR (16*6)(ptx), B6
+PXOR (16*7)(ptx), B7

 MOVOU B0, (16*0)(ctx)
 PSHUFB BSWAP, B0
@@ -775,14 +766,10 @@ gcmSm4EncNibbles:
 MOVOU (8*16 + 3*16)(SP), B3

 SM4_4BLOCKS_WO_BS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
-MOVOU (16*0)(ptx), T0
-PXOR T0, B0
-MOVOU (16*1)(ptx), T0
-PXOR T0, B1
-MOVOU (16*2)(ptx), T0
-PXOR T0, B2
-MOVOU (16*3)(ptx), T0
-PXOR T0, B3
+PXOR (16*0)(ptx), B0
+PXOR (16*1)(ptx), B1
+PXOR (16*2)(ptx), B2
+PXOR (16*3)(ptx), B3

 MOVOU B0, (16*0)(ctx)
 MOVOU B1, (16*1)(ctx)
@@ -790,14 +777,14 @@ gcmSm4EncNibbles:
 MOVOU B3, (16*3)(ctx)

 MOVOU (16*14)(pTbl), T2
-gcmEncDataStep(B0)
-gcmEncDataStep(B1)
-gcmEncDataStep(B2)
-gcmEncDataStep(B3)
 increment(0)
+gcmEncDataStep(B0)
 increment(1)
+gcmEncDataStep(B1)
 increment(2)
+gcmEncDataStep(B2)
 increment(3)
+gcmEncDataStep(B3)

 LEAQ 64(ptx), ptx
 LEAQ 64(ctx), ctx
@@ -871,10 +858,9 @@ avxGcmSm4Enc:
 VPXOR ACC1, ACC1, ACC1
 VPXOR ACCM, ACCM, ACCM
 VMOVDQU (ctrPtr), T0
-MOVL (3*4)(ctrPtr), aluCTR
-BSWAPL aluCTR
 VPSHUFB flip_mask<>(SB), T0, T0
+VPEXTRD $3, T0, aluCTR

 VMOVDQU T0, (8*16 + 0*16)(SP)
 increment(0)
 VMOVDQU T0, (8*16 + 1*16)(SP)
@@ -911,31 +897,32 @@ avxGcmSm4Enc:
 AVX_SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
 VPXOR ACC1, ACC1, ACC1 // clean ACC1
 increment(0)
-increment(1)
-increment(2)
-increment(3)
-increment(4)
-increment(5)
-increment(6)
-increment(7)
 // XOR plaintext
 VPXOR (16*0)(ptx), B0, B0
 VPXOR (16*1)(ptx), B1, B1
+increment(1)
 VPXOR (16*2)(ptx), B2, B2
 VPXOR (16*3)(ptx), B3, B3
+increment(2)
 VPXOR (16*4)(ptx), B4, B4
 VPXOR (16*5)(ptx), B5, B5
+increment(3)
 VPXOR (16*6)(ptx), B6, B6
 VPXOR (16*7)(ptx), B7, B7
 // Store ciphertext
 VMOVDQU B0, (16*0)(ctx)
 VPSHUFB BSWAP, B0, B0
+increment(4)
 VMOVDQU B1, (16*1)(ctx)
 VPSHUFB BSWAP, B1, B1
+increment(5)
 VMOVDQU B2, (16*2)(ctx)
 VPSHUFB BSWAP, B2, B2
+increment(6)
 VMOVDQU B3, (16*3)(ctx)
 VPSHUFB BSWAP, B3, B3
+increment(7)
 VMOVDQU B4, (16*4)(ctx)
 VPSHUFB BSWAP, B4, B4
 VMOVDQU B5, (16*5)(ctx)
@@ -999,7 +986,6 @@ avxGcmSm4EncOctetsLoop:
 increment(5)
 avxMulRound(7)
 increment(6)
-increment(7)
 VPXOR ACC0, ACCM, ACCM
 VPXOR ACC1, ACCM, ACCM
 VPSLLDQ $8, ACCM, T0
@@ -1008,6 +994,7 @@ avxGcmSm4EncOctetsLoop:
 VPXOR ACCM, ACC1, ACC1
 VPXOR T0, ACC0, ACC0

+increment(7)
 avxReduceRound(ACC0)
 avxReduceRound(ACC0)
 VPXOR ACC1, ACC0, ACC0
@@ -1117,14 +1104,14 @@ avxGcmSm4EncNibbles:
 VMOVDQU B3, (16*3)(ctx)

 VMOVDQU (16*14)(pTbl), T2
-avxGcmEncDataStep(B0)
-avxGcmEncDataStep(B1)
-avxGcmEncDataStep(B2)
-avxGcmEncDataStep(B3)
 increment(0)
+avxGcmEncDataStep(B0)
 increment(1)
+avxGcmEncDataStep(B1)
 increment(2)
+avxGcmEncDataStep(B2)
 increment(3)
+avxGcmEncDataStep(B3)

 LEAQ 64(ptx), ptx
 LEAQ 64(ctx), ctx
@@ -1201,16 +1188,14 @@ avx2GcmSm4Enc:
 VPXOR ACCM, ACCM, ACCM
 VMOVDQU (ctrPtr), T0
 VPSHUFB flip_mask<>(SB), T0, T0
-MOVL (3*4)(ctrPtr), aluCTR
-BSWAPL aluCTR
+VPEXTRD $3, T0, aluCTR

-VMOVDQU T0, (8*16 + 0*16)(SP)
+VINSERTI128 $1, T0, Y11, Y11
+VMOVDQU Y11, (8*16 + 0*32)(SP)
 increment(0)
-VMOVDQU T0, (8*16 + 1*16)(SP)
 increment(1)
-VMOVDQU T0, (8*16 + 2*16)(SP)
+VMOVDQU Y11, (8*16 + 1*32)(SP)
 increment(2)
-VMOVDQU T0, (8*16 + 3*16)(SP)
 increment(3)

 CMPQ ptxLen, $128
@@ -1218,13 +1203,11 @@ avx2GcmSm4Enc:
 SUBQ $128, ptxLen

 // We have at least 8 blocks to encrypt, prepare the rest of the counters
-VMOVDQU T0, (8*16 + 4*16)(SP)
+VMOVDQU Y11, (8*16 + 2*32)(SP)
 increment(4)
-VMOVDQU T0, (8*16 + 5*16)(SP)
 increment(5)
-VMOVDQU T0, (8*16 + 6*16)(SP)
+VMOVDQU Y11, (8*16 + 3*32)(SP)
 increment(6)
-VMOVDQU T0, (8*16 + 7*16)(SP)
 increment(7)

 VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
@@ -1234,38 +1217,35 @@ avx2GcmSm4Enc:
 VMOVDQU (4*32 + 2*32)(SP), DWB2
 VMOVDQU (4*32 + 3*32)(SP), DWB3

+increment(0)
 // Transpose matrix 4 x 4 32bits word
-TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
+TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)

 VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
-AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
+increment(1)
+AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP0, DWB0, DWB1, DWB2, DWB3)
+increment(2)
 // Transpose matrix 4 x 4 32bits word
-TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
+TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)

 VPSHUFB DWBSWAP, DWB0, DWB0
 VPSHUFB DWBSWAP, DWB1, DWB1
+increment(3)
 VPSHUFB DWBSWAP, DWB2, DWB2
 VPSHUFB DWBSWAP, DWB3, DWB3

-increment(0)
-increment(1)
-increment(2)
-increment(3)
 increment(4)
-increment(5)
-increment(6)
-increment(7)

 // XOR plaintext
 VMOVDQU (32*0)(ptx), XDWTMP0
 VPXOR XDWTMP0, DWB0, DWB0
 VMOVDQU (32*1)(ptx), XDWTMP0
 VPXOR XDWTMP0, DWB1, DWB1
+increment(5)
 VMOVDQU (32*2)(ptx), XDWTMP0
 VPXOR XDWTMP0, DWB2, DWB2
 VMOVDQU (32*3)(ptx), XDWTMP0
 VPXOR XDWTMP0, DWB3, DWB3
+increment(6)

 // Store ciphertext
 VMOVDQU DWB0, (32*0)(ctx)
@@ -1276,7 +1256,7 @@ avx2GcmSm4Enc:
 VPSHUFB DWBSWAP, DWB2, DWB2
 VMOVDQU DWB3, (32*3)(ctx)
 VPSHUFB DWBSWAP, DWB3, DWB3
+increment(7)
 //VPXOR XDWTMP0, XDWTMP0, XDWTMP0
 //VINSERTI128 $0, ACC0, XDWTMP0, XDWTMP0
 //VPXOR XDWTMP0, DWB0, DWB0
@@ -1312,13 +1292,12 @@ avx2GcmSm4EncOctetsLoop:
 VPCLMULQDQ $0x11, T0, ACC1, ACC1

 // Transpose matrix 4 x 4 32bits word
-TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
-VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
+TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)

-AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
+AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP0, DWB0, DWB1, DWB2, DWB3)

 // Transpose matrix 4 x 4 32bits word
-TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
+TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)

 VPSHUFB DWBSWAP, DWB0, DWB0
 VPSHUFB DWBSWAP, DWB1, DWB1
@@ -1339,7 +1318,7 @@ avx2GcmSm4EncOctetsLoop:
 increment(5)
 avxMulRound(7)
 increment(6)
-increment(7)
 VPXOR ACC0, ACCM, ACCM
 VPXOR ACC1, ACCM, ACCM
 VPSLLDQ $8, ACCM, T0
@@ -1348,6 +1327,7 @@ avx2GcmSm4EncOctetsLoop:
 VPXOR ACCM, ACC1, ACC1
 VPXOR T0, ACC0, ACC0

+increment(7)
 avxReduceRound(ACC0)
 avxReduceRound(ACC0)
 VPXOR ACC1, ACC0, ACC0
@@ -1442,12 +1422,12 @@ avx2GcmSm4EncNibbles:

 VMOVDQU (16*14)(pTbl), T2
 avxGcmEncDataStep(B0)
-avxGcmEncDataStep(B1)
-avxGcmEncDataStep(B2)
-avxGcmEncDataStep(B3)
 increment(0)
+avxGcmEncDataStep(B1)
 increment(1)
+avxGcmEncDataStep(B2)
 increment(2)
+avxGcmEncDataStep(B3)
 increment(3)

 LEAQ 64(ptx), ptx
@@ -1622,10 +1602,9 @@ TEXT ·gcmSm4Dec(SB),0,$128-96
 PXOR ACC1, ACC1
 PXOR ACCM, ACCM
 MOVOU (ctrPtr), T0
-MOVL (3*4)(ctrPtr), aluCTR
-BSWAPL aluCTR
 PSHUFB flip_mask<>(SB), T0
+PEXTRD $3, T0, aluCTR

 MOVOU T0, (0*16)(SP)
 increment(0)
 MOVOU T0, (1*16)(SP)
@@ -1706,22 +1685,14 @@ gcmSm4DecOctetsLoop:

 SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)

-MOVOU (16*0)(ctx), T0
-PXOR T0, B0
-MOVOU (16*1)(ctx), T0
-PXOR T0, B1
-MOVOU (16*2)(ctx), T0
-PXOR T0, B2
-MOVOU (16*3)(ctx), T0
-PXOR T0, B3
-MOVOU (16*4)(ctx), T0
-PXOR T0, B4
-MOVOU (16*5)(ctx), T0
-PXOR T0, B5
-MOVOU (16*6)(ctx), T0
-PXOR T0, B6
-MOVOU (16*7)(ctx), T0
-PXOR T0, B7
+PXOR (16*0)(ctx), B0
+PXOR (16*1)(ctx), B1
+PXOR (16*2)(ctx), B2
+PXOR (16*3)(ctx), B3
+PXOR (16*4)(ctx), B4
+PXOR (16*5)(ctx), B5
+PXOR (16*6)(ctx), B6
+PXOR (16*7)(ctx), B7

 MOVOU B0, (16*0)(ptx)
 MOVOU B1, (16*1)(ptx)
@@ -1752,22 +1723,22 @@ gcmSm4DecNibbles:

 SM4_4BLOCKS_WO_BS(rk, B0, T0, T1, T2, B4, B5, B6, B7)
 MOVOU (16*14)(pTbl), T2
-MOVOU (16*0)(ctx), T0
-PXOR T0, B4
-MOVOU (16*1)(ctx), T0
-PXOR T0, B5
-MOVOU (16*2)(ctx), T0
-PXOR T0, B6
-MOVOU (16*3)(ctx), T0
-PXOR T0, B7

-decGhashRound(0)
+MOVOU (16*0)(ctx), B0
+PXOR B0, B4
+internalDecGhashRound()
 increment(0)
-decGhashRound(1)
+MOVOU (16*1)(ctx), B0
+PXOR B0, B5
+internalDecGhashRound()
 increment(1)
-decGhashRound(2)
+MOVOU (16*2)(ctx), B0
+PXOR B0, B6
+internalDecGhashRound()
 increment(2)
-decGhashRound(3)
+MOVOU (16*3)(ctx), B0
+PXOR B0, B7
+internalDecGhashRound()
 increment(3)

 MOVOU B4, (16*0)(ptx)
@@ -1851,10 +1822,9 @@ avxGcmSm4Dec:
 VPXOR ACC1, ACC1, ACC1
 VPXOR ACCM, ACCM, ACCM
 VMOVDQU (ctrPtr), T0
-MOVL (3*4)(ctrPtr), aluCTR
-BSWAPL aluCTR
 VPSHUFB flip_mask<>(SB), T0, T0
+VPEXTRD $3, T0, aluCTR

 VMOVDQU T0, (0*16)(SP)
 increment(0)
 VMOVDQU T0, (1*16)(SP)
@@ -1918,7 +1888,7 @@ avxGcmSm4DecOctetsLoop:
 increment(5)
 avxDecMulRound(7)
 increment(6)
-increment(7)
 VPXOR ACC0, ACCM, ACCM
 VPXOR ACC1, ACCM, ACCM

@@ -1928,6 +1898,7 @@ avxGcmSm4DecOctetsLoop:
 VPXOR ACCM, ACC1, ACC1
 VPXOR T0, ACC0, ACC0

+increment(7)
 avxReduceRound(ACC0)
 avxReduceRound(ACC0)
 VPXOR ACC1, ACC0, ACC0
@@ -2075,33 +2046,30 @@ avx2GcmSm4Dec:
 VPXOR ACC1, ACC1, ACC1
 VPXOR ACCM, ACCM, ACCM
 VMOVDQU (ctrPtr), T0
-MOVL (3*4)(ctrPtr), aluCTR
-BSWAPL aluCTR

 VPSHUFB flip_mask<>(SB), T0, T0
-VMOVDQU T0, (0*16)(SP)
+VPEXTRD $3, T0, aluCTR

+VINSERTI128 $1, T0, Y11, Y11
+VMOVDQU Y11, (0*32)(SP)
 increment(0)
-VMOVDQU T0, (1*16)(SP)
 increment(1)
-VMOVDQU T0, (2*16)(SP)
+VMOVDQU Y11, (1*32)(SP)
 increment(2)
-VMOVDQU T0, (3*16)(SP)
 increment(3)

 CMPQ ptxLen, $128
 JB avx2GcmSm4DecNibbles

 // We have at least 8 blocks to dencrypt, prepare the rest of the counters
-VMOVDQU T0, (4*16)(SP)
+VMOVDQU Y11, (2*32)(SP)
 increment(4)
-VMOVDQU T0, (5*16)(SP)
 increment(5)
-VMOVDQU T0, (6*16)(SP)
+VMOVDQU Y11, (3*32)(SP)
 increment(6)
-VMOVDQU T0, (7*16)(SP)
 increment(7)

 VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
+VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK

 avx2GcmSm4DecOctetsLoop:
 CMPQ ptxLen, $128
@@ -2141,7 +2109,6 @@ avx2GcmSm4DecOctetsLoop:
 increment(5)
 avxDecMulRound(7)
 increment(6)
-increment(7)

 VPXOR ACC0, ACCM, ACCM
 VPXOR ACC1, ACCM, ACCM
@@ -2150,19 +2117,19 @@ avx2GcmSm4DecOctetsLoop:

 VPXOR ACCM, ACC1, ACC1
 VPXOR T0, ACC0, ACC0
+increment(7)

 avxReduceRound(ACC0)
 avxReduceRound(ACC0)
 VPXOR ACC1, ACC0, ACC0

 // Transpose matrix 4 x 4 32bits word
-TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
-VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
+TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)

-AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
+AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP0, DWB0, DWB1, DWB2, DWB3)

 // Transpose matrix 4 x 4 32bits word
-TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
+TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)

 VPSHUFB DWBSWAP, DWB0, DWB0
 VPSHUFB DWBSWAP, DWB1, DWB1
@@ -2202,18 +2169,22 @@ avx2GcmSm4DecNibbles:
 VMOVDQU (16*14)(pTbl), T2
 VMOVDQU (16*0)(ctx), B0
 VPXOR B0, B4, B4
+increment(0)
 internalAvxDecGhashRound()

 VMOVDQU (16*1)(ctx), B0
 VPXOR B0, B1, B1
+increment(1)
 internalAvxDecGhashRound()

 VMOVDQU (16*2)(ctx), B0
 VPXOR B0, B2, B2
+increment(2)
 internalAvxDecGhashRound()

 VMOVDQU (16*3)(ctx), B0
 VPXOR B0, B3, B3
+increment(3)
 internalAvxDecGhashRound()

 VMOVDQU B4, (16*0)(ptx)
@@ -2221,11 +2192,6 @@ avx2GcmSm4DecNibbles:
 VMOVDQU B2, (16*2)(ptx)
 VMOVDQU B3, (16*3)(ptx)

-increment(0)
-increment(1)
-increment(2)
-increment(3)
-
 LEAQ 64(ptx), ptx
 LEAQ 64(ctx), ctx

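For context, a hedged benchmark sketch follows. It is not part of the commit; it is one way to measure the effect of an optimization like this from Go. It assumes the package's public sm4.NewCipher API, and it assumes that on amd64 the AEAD returned by crypto/cipher's NewGCM reaches the assembly routines edited above (save it as a *_test.go file and run go test -bench SM4GCM -benchmem).

package gcmbench

import (
	"crypto/cipher"
	"testing"

	"github.com/emmansun/gmsm/sm4"
)

func BenchmarkSM4GCMSeal8K(b *testing.B) {
	key := make([]byte, 16)   // all-zero key is fine for a throughput measurement
	nonce := make([]byte, 12) // standard 96-bit GCM nonce
	block, err := sm4.NewCipher(key)
	if err != nil {
		b.Fatal(err)
	}
	aead, err := cipher.NewGCM(block)
	if err != nil {
		b.Fatal(err)
	}
	src := make([]byte, 8192) // large enough to stay in the 8-block loops
	dst := make([]byte, 0, len(src)+aead.Overhead())
	b.SetBytes(int64(len(src)))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		dst = aead.Seal(dst[:0], nonce, src, nil)
	}
	_ = dst
}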