Mirror of https://github.com/emmansun/gmsm.git, synced 2025-04-26 12:16:20 +08:00
sm4: optimize avx2 implementation

commit e00fbe696d (parent 24637cf61d)

sm4/gcm_amd64.s (114 changed lines)
@@ -1229,6 +1229,7 @@ avx2GcmSm4Enc:
 	VMOVDQU T0, (8*16 + 7*16)(SP)
 	increment(7)
 
+	VBROADCASTI128 bswapMask<>(SB), DWBSWAP
 	// load 8 ctrs for encryption
 	VMOVDQU (4*32 + 0*32)(SP), DWB0
 	VMOVDQU (4*32 + 1*32)(SP), DWB1
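Note: this hunk hoists the loop-invariant VBROADCASTI128 load of bswapMask into the one-time counter setup, so DWBSWAP is populated once rather than on every pass; the now-redundant in-loop loads are deleted in the next two hunks, which is safe because nothing in the loop writes DWBSWAP. As a scalar model of what the mask does (assuming bswapMask is the usual 15,14,...,0 byte-reversal shuffle, an assumption rather than something visible in this diff), VPSHUFB with the broadcast mask reverses byte order independently within each 16-byte lane of a YMM register, and VBROADCASTI128 merely copies the 128-bit mask into both lanes so one shuffle covers two blocks:

package main

import "fmt"

// bswapLanes models VPSHUFB with the broadcast bswapMask: reverse the byte
// order independently within each 16-byte lane of a 32-byte vector.
func bswapLanes(v [32]byte) [32]byte {
	var out [32]byte
	for lane := 0; lane < 2; lane++ {
		for i := 0; i < 16; i++ {
			out[lane*16+i] = v[lane*16+15-i]
		}
	}
	return out
}

func main() {
	var v [32]byte
	for i := range v {
		v[i] = byte(i)
	}
	// Prints 0f 0e ... 00 followed by 1f 1e ... 10: each lane reversed.
	fmt.Printf("% x\n", bswapLanes(v))
}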
@@ -1251,7 +1252,6 @@ avx2GcmSm4Enc:
 	// Transpose matrix 4 x 4 32bits word
 	TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
 
-	VBROADCASTI128 bswapMask<>(SB), DWBSWAP
 	VPSHUFB DWBSWAP, DWB0, DWB0
 	VPSHUFB DWBSWAP, DWB1, DWB1
 	VPSHUFB DWBSWAP, DWB2, DWB2
@@ -1336,7 +1336,6 @@ avx2GcmSm4EncOctetsLoop:
 	// Transpose matrix 4 x 4 32bits word
 	TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
 
-	VBROADCASTI128 bswapMask<>(SB), DWBSWAP
 	VPSHUFB DWBSWAP, DWB0, DWB0
 	VPSHUFB DWBSWAP, DWB1, DWB1
 	VPSHUFB DWBSWAP, DWB2, DWB2
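Note: same deletion in the main octets loop, again covered by the hoisted load in the first hunk. For orientation, the TRANSPOSE_MATRIX macro in the context performs a 4x4 transpose of 32-bit words, as is usual for block-sliced SM4, so that each register holds the same word index from four states; a scalar sketch of that operation (assuming a plain transpose; the AVX2 macro applies it within each 128-bit lane, covering two blocks at once):

package main

import "fmt"

// transpose4x4 is a scalar model of the TRANSPOSE_MATRIX macro: swap
// element (i,j) with (j,i) so rows of blocks become columns of words.
func transpose4x4(m *[4][4]uint32) {
	for i := 0; i < 4; i++ {
		for j := i + 1; j < 4; j++ {
			m[i][j], m[j][i] = m[j][i], m[i][j]
		}
	}
}

func main() {
	m := [4][4]uint32{{0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}, {12, 13, 14, 15}}
	transpose4x4(&m)
	fmt.Println(m) // [[0 4 8 12] [1 5 9 13] [2 6 10 14] [3 7 11 15]]
}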
@@ -1370,14 +1369,10 @@ avx2GcmSm4EncOctetsLoop:
 	VPXOR ACC1, ACC0, ACC0
 
 	// XOR plaintext
-	VMOVDQU (32*0)(ptx), XDWTMP0
-	VPXOR XDWTMP0, DWB0, DWB0
-	VMOVDQU (32*1)(ptx), XDWTMP0
-	VPXOR XDWTMP0, DWB1, DWB1
-	VMOVDQU (32*2)(ptx), XDWTMP0
-	VPXOR XDWTMP0, DWB2, DWB2
-	VMOVDQU (32*3)(ptx), XDWTMP0
-	VPXOR XDWTMP0, DWB3, DWB3
+	VPXOR (32*0)(ptx), DWB0, DWB0
+	VPXOR (32*1)(ptx), DWB1, DWB1
+	VPXOR (32*2)(ptx), DWB2, DWB2
+	VPXOR (32*3)(ptx), DWB3, DWB3
 
 	// Store ciphertext
 	VMOVDQU DWB0, (32*0)(ctx)
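Note: the eight-instruction load-then-XOR sequence collapses into four VPXOR instructions with memory source operands, freeing XDWTMP0 and removing a serial dependency on the temporary; the ciphertext written out is unchanged. In scalar terms the loop still computes ciphertext = keystream XOR plaintext over a 128-byte chunk; a minimal sketch (the function name is illustrative, not from the source):

package main

import "fmt"

// xorBlocks XORs the SM4-encrypted counter blocks (the keystream) with the
// plaintext. The optimized assembly expresses each pair of former
// load+XOR instructions as one VPXOR reading straight from memory.
func xorBlocks(dst, keystream, src []byte) {
	for i := range dst {
		dst[i] = keystream[i] ^ src[i]
	}
}

func main() {
	ks := make([]byte, 128)
	pt := make([]byte, 128)
	ct := make([]byte, 128)
	for i := range ks {
		ks[i], pt[i] = byte(i*7), byte(i)
	}
	xorBlocks(ct, ks, pt)
	fmt.Printf("% x\n", ct[:16])
}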
@@ -1451,14 +1446,10 @@ avx2GcmSm4EncNibbles:
 
 	AVX_SM4_4BLOCKS(rk, B4, B5, B6, B7, B0, B1, B2, B3)
 
-	VMOVDQU (16*0)(ptx), T0
-	VPXOR T0, B0, B0
-	VMOVDQU (16*1)(ptx), T0
-	VPXOR T0, B1, B1
-	VMOVDQU (16*2)(ptx), T0
-	VPXOR T0, B2, B2
-	VMOVDQU (16*3)(ptx), T0
-	VPXOR T0, B3, B3
+	VPXOR (16*0)(ptx), B0, B0
+	VPXOR (16*1)(ptx), B1, B1
+	VPXOR (16*2)(ptx), B2, B2
+	VPXOR (16*3)(ptx), B3, B3
 
 	VMOVDQU B0, (16*0)(ctx)
 	VMOVDQU B1, (16*1)(ctx)
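Note: the same memory-operand folding, applied to the four-block XMM tail path. The labeled paths fit together roughly as sketched below, an 8-block main loop followed by a 4-block step and a byte tail; the thresholds and names here are illustrative guesses at the control flow, not read off the assembly:

package main

import "fmt"

// processChunks sketches the driver structure suggested by the labels:
// avx2GcmSm4EncOctetsLoop consumes 8 blocks (128 bytes) per iteration,
// avx2GcmSm4EncNibbles handles a 4-block (64-byte) step, and a tail path
// deals with whatever remains.
func processChunks(n int) {
	for n >= 128 {
		fmt.Println("octets loop: 8 blocks")
		n -= 128
	}
	if n >= 64 {
		fmt.Println("nibbles: 4 blocks")
		n -= 64
	}
	if n > 0 {
		fmt.Printf("tail: %d bytes\n", n)
	}
}

func main() {
	processChunks(332) // 2 octet passes, 1 nibble pass, 12-byte tail
}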
@@ -1596,9 +1587,6 @@ TEXT ·gcmSm4Dec(SB),0,$128-96
 #define avxDecMulRound(i) \
 	VMOVDQU (16*i)(ctx), T0;\
 	VPSHUFB BSWAP, T0, T0;\
-	internalAvxDecMulRound(i)
-
-#define internalAvxDecMulRound(i) \
 	VMOVDQU (16*(i*2))(pTbl), T2;\
 	VPCLMULQDQ $0x00, T0, T2, T1;\
 	VPXOR T1, ACC0, ACC0;\
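Note: with the interleaved scheduling removed in the later hunks, the internalAvxDecMulRound split has no caller left besides avxDecMulRound, so its body is folded back in: load a ciphertext block, byte-swap it, then carry-less-multiply by the precomputed pTbl entry and accumulate. The macro accumulates per-block partial products into ACC0, ACC1, and ACCM using the three-multiply Karatsuba decomposition of a 128x128 carry-less product; a scalar model of that decomposition and of the VPSLLDQ/VPSRLDQ fold that appears in a later hunk (clmul64 stands in for VPCLMULQDQ, and the inputs are toy values):

package main

import "fmt"

// clmul64 is a scalar stand-in for VPCLMULQDQ: a carry-less (GF(2))
// multiply of two 64-bit halves, returning the 128-bit product as (hi, lo).
func clmul64(a, b uint64) (hi, lo uint64) {
	for i := uint(0); i < 64; i++ {
		if b&(1<<i) != 0 {
			lo ^= a << i
			hi ^= a >> (64 - i) // Go defines x>>64 == 0, so i == 0 is safe
		}
	}
	return
}

func main() {
	a0, a1 := uint64(0x0123456789abcdef), uint64(0xfedcba9876543210)
	b0, b1 := uint64(0x1111222233334444), uint64(0x5555666677778888)

	// Karatsuba: three multiplies instead of four, as in the assembly.
	loHi, loLo := clmul64(a0, b0)     // VPCLMULQDQ $0x00 -> ACC0
	hiHi, hiLo := clmul64(a1, b1)     // VPCLMULQDQ $0x11 -> ACC1
	mHi, mLo := clmul64(a0^a1, b0^b1) // middle product   -> ACCM
	mHi ^= loHi ^ hiHi                // ACCM ^= ACC0 ^ ACC1
	mLo ^= loLo ^ hiLo
	loHi ^= mLo // VPSLLDQ $8, ACCM, T0; VPXOR T0, ACC0, ACC0
	hiLo ^= mHi // VPSRLDQ $8, ACCM, ACCM; VPXOR ACCM, ACC1, ACC1
	karatsuba := [4]uint64{loLo, loHi, hiLo, hiHi}

	// Schoolbook with four multiplies, for comparison.
	var p [4]uint64
	h, l := clmul64(a0, b0)
	p[0], p[1] = p[0]^l, p[1]^h
	h, l = clmul64(a0, b1)
	p[1], p[2] = p[1]^l, p[2]^h
	h, l = clmul64(a1, b0)
	p[1], p[2] = p[1]^l, p[2]^h
	h, l = clmul64(a1, b1)
	p[2], p[3] = p[2]^l, p[3]^h

	fmt.Println(karatsuba == p) // true
}

Three VPCLMULQDQ per block instead of four is the point of carrying the middle accumulator ACCM through the loop.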
@@ -2126,6 +2114,8 @@ avx2GcmSm4Dec:
 	VMOVDQU T0, (7*16)(SP)
 	increment(7)
 
+	VBROADCASTI128 bswapMask<>(SB), DWBSWAP
+
 avx2GcmSm4DecOctetsLoop:
 	CMPQ ptxLen, $128
 	JB avx2GcmSm4DecEndOctets
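Note: as on the encryption side, the bswapMask broadcast is hoisted ahead of the decryption loop. The increment(i) macro visible in the context maintains the eight counter blocks on the stack; its effect should be the standard GCM inc32, a 32-bit big-endian increment of the last four counter bytes (an assumption about the macro, which is defined elsewhere in the file):

package main

import (
	"encoding/binary"
	"fmt"
)

// inc32 models the increment(i) macro under the assumption above: a GCM
// counter block is 16 bytes with a 32-bit big-endian counter at the end.
func inc32(ctr *[16]byte) {
	n := binary.BigEndian.Uint32(ctr[12:]) + 1
	binary.BigEndian.PutUint32(ctr[12:], n)
}

func main() {
	var ctr [16]byte
	ctr[15] = 0xff
	inc32(&ctr)
	fmt.Printf("% x\n", ctr) // last four bytes roll over to 00 00 01 00
}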
@@ -2150,6 +2140,34 @@ avx2GcmSm4DecOctetsLoop:
 	VPCLMULQDQ $0x00, T0, ACC1, ACC0
 	VPCLMULQDQ $0x11, T0, ACC1, ACC1
 
+	avxDecMulRound(1)
+	increment(0)
+	avxDecMulRound(2)
+	increment(1)
+	avxDecMulRound(3)
+	increment(2)
+	avxDecMulRound(4)
+	increment(3)
+	avxDecMulRound(5)
+	increment(4)
+	avxDecMulRound(6)
+	increment(5)
+	avxDecMulRound(7)
+	increment(6)
+	increment(7)
+
+	VPXOR ACC0, ACCM, ACCM
+	VPXOR ACC1, ACCM, ACCM
+	VPSLLDQ $8, ACCM, T0
+	VPSRLDQ $8, ACCM, ACCM
+
+	VPXOR ACCM, ACC1, ACC1
+	VPXOR T0, ACC0, ACC0
+
+	avxReduceRound(ACC0)
+	avxReduceRound(ACC0)
+	VPXOR ACC1, ACC0, ACC0
+
 	VBROADCASTI128 flip_mask<>(SB), XDWTMP0
 	// Apply Byte Flip Mask: LE -> BE
 	VPSHUFB XDWTMP0, DWB0, DWB0
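Note: the rewritten loop front-loads the seven remaining GHASH multiply rounds (block 0 is handled by the two VPCLMULQDQ context lines above) together with all eight counter increments, then performs the Karatsuba fold and the reduction once per 128-byte chunk, ahead of the SM4 rounds instead of interleaved with them. Algebraically it computes the aggregated form Y' = (Y ^ C0)·H^8 ^ C1·H^7 ^ ... ^ C7·H, which equals eight serial Horner steps Y <- (Y ^ Ci)·H; a scalar check of that identity, where gfMul is the bitwise NIST SP 800-38D multiply standing in for the assembly's table-driven method, and the mapping of pTbl entries to powers of H is assumed:

package main

import "fmt"

// gfMul multiplies two GHASH field elements (16-byte blocks, MSB-first bit
// order) modulo the GCM polynomial, per NIST SP 800-38D.
func gfMul(x, y [16]byte) [16]byte {
	var z [16]byte
	v := x
	for i := 0; i < 128; i++ {
		if y[i/8]&(0x80>>uint(i%8)) != 0 {
			for j := range z {
				z[j] ^= v[j]
			}
		}
		carry := v[15] & 1
		for j := 15; j > 0; j-- {
			v[j] = v[j]>>1 | v[j-1]<<7
		}
		v[0] >>= 1
		if carry != 0 {
			v[0] ^= 0xe1 // reduce by the GCM polynomial
		}
	}
	return z
}

func xor16(a, b [16]byte) [16]byte {
	for i := range a {
		a[i] ^= b[i]
	}
	return a
}

func main() {
	var h [16]byte
	var c [8][16]byte
	for i := range h {
		h[i] = byte(0x40 + i)
	}
	for i := range c {
		for j := range c[i] {
			c[i][j] = byte(i*16 + j + 1)
		}
	}

	// Precomputed powers: hp[k] = H^(k+1), the role played by pTbl.
	var hp [8][16]byte
	hp[0] = h
	for k := 1; k < 8; k++ {
		hp[k] = gfMul(hp[k-1], h)
	}

	// Serial Horner form: Y <- (Y ^ Ci)·H, eight times (Y starts at 0).
	var y [16]byte
	for i := 0; i < 8; i++ {
		y = gfMul(xor16(y, c[i]), h)
	}

	// Aggregated form used by the rewritten loop.
	acc := gfMul(c[0], hp[7]) // (Y ^ C0)·H^8 with Y = 0
	for i := 1; i < 8; i++ {
		acc = xor16(acc, gfMul(c[i], hp[7-i])) // Ci·H^(8-i)
	}

	fmt.Println(acc == y) // true: one reduction serves all eight blocks
}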
@@ -2166,67 +2184,21 @@ avx2GcmSm4DecOctetsLoop:
 	// Transpose matrix 4 x 4 32bits word
 	TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
 
-	VBROADCASTI128 bswapMask<>(SB), DWBSWAP
 	VPSHUFB DWBSWAP, DWB0, DWB0
 	VPSHUFB DWBSWAP, DWB1, DWB1
 	VPSHUFB DWBSWAP, DWB2, DWB2
 	VPSHUFB DWBSWAP, DWB3, DWB3
 
-	VMOVDQU (32*0)(ctx), XDWTMP0
-	VPXOR XDWTMP0, DWB0, DWB0
-	VEXTRACTI128 $1, XDWTMP0, T0
-	VPSHUFB BSWAP, T0, T0
-	internalAvxDecMulRound(1)
-	increment(0)
-
-	VMOVDQU (32*1)(ctx), XDWTMP0
-	VPXOR XDWTMP0, DWB1, DWB1
-	VPSHUFB DWBSWAP, XDWTMP0, XDWTMP0
-	VEXTRACTI128 $0, XDWTMP0, T0
-	internalAvxDecMulRound(2)
-	increment(1)
-	VEXTRACTI128 $1, XDWTMP0, T0
-	internalAvxDecMulRound(3)
-	increment(2)
-
-	VMOVDQU (32*2)(ctx), XDWTMP0
-	VPXOR XDWTMP0, DWB2, DWB2
-	VPSHUFB DWBSWAP, XDWTMP0, XDWTMP0
-	VEXTRACTI128 $0, XDWTMP0, T0
-	internalAvxDecMulRound(4)
-	increment(3)
-	VEXTRACTI128 $1, XDWTMP0, T0
-	internalAvxDecMulRound(5)
-	increment(4)
-
-	VMOVDQU (32*3)(ctx), XDWTMP0
-	VPXOR XDWTMP0, DWB3, DWB3
-	VPSHUFB DWBSWAP, XDWTMP0, XDWTMP0
-	VEXTRACTI128 $0, XDWTMP0, T0
-	internalAvxDecMulRound(6)
-	increment(5)
-	VEXTRACTI128 $1, XDWTMP0, T0
-	internalAvxDecMulRound(7)
-	increment(6)
-	increment(7)
+	VPXOR (32*0)(ctx), DWB0, DWB0
+	VPXOR (32*1)(ctx), DWB1, DWB1
+	VPXOR (32*2)(ctx), DWB2, DWB2
+	VPXOR (32*3)(ctx), DWB3, DWB3
 
 	VMOVDQU DWB0, (32*0)(ptx)
 	VMOVDQU DWB1, (32*1)(ptx)
 	VMOVDQU DWB2, (32*2)(ptx)
 	VMOVDQU DWB3, (32*3)(ptx)
 
-	VPXOR ACC0, ACCM, ACCM
-	VPXOR ACC1, ACCM, ACCM
-	VPSLLDQ $8, ACCM, T0
-	VPSRLDQ $8, ACCM, ACCM
-
-	VPXOR ACCM, ACC1, ACC1
-	VPXOR T0, ACC0, ACC0
-
-	avxReduceRound(ACC0)
-	avxReduceRound(ACC0)
-	VPXOR ACC1, ACC0, ACC0
-
 	LEAQ 128(ptx), ptx
 	LEAQ 128(ctx), ctx
 
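Note: taken together, decryption now reads ciphertext once per purpose: VPXOR pulls it straight from memory for the keystream XOR, and avxDecMulRound reloads it for GHASH, eliminating the VEXTRACTI128 shuffles of the old interleaved scheme. The optimized path is reached through the ordinary GCM interface; a round-trip usage example, assuming cipher.NewGCM picks up this assembly via the package's gcmAble support on AMD64:

package main

import (
	"crypto/cipher"
	"crypto/rand"
	"fmt"

	"github.com/emmansun/gmsm/sm4"
)

func main() {
	key := make([]byte, 16)
	nonce := make([]byte, 12)
	if _, err := rand.Read(key); err != nil {
		panic(err)
	}
	if _, err := rand.Read(nonce); err != nil {
		panic(err)
	}

	block, err := sm4.NewCipher(key) // SM4 has a 128-bit block, so NewGCM accepts it
	if err != nil {
		panic(err)
	}
	aead, err := cipher.NewGCM(block)
	if err != nil {
		panic(err)
	}

	ciphertext := aead.Seal(nil, nonce, []byte("hello, sm4-gcm"), nil)
	plaintext, err := aead.Open(nil, nonce, ciphertext, nil)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s\n", plaintext)
}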