sm4: gcm amd64 reduce byte shuffling #152

emmansun 2023-08-26 10:11:25 +08:00
parent d3c19c171a
commit 3f602061fc
2 changed files with 40 additions and 43 deletions
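The diff below moves the byte shuffling out of the per-block hot path: the counter block is flipped once with PSHUFB flip_mask<> when it is loaded, new *_WO_BS ("without byte swap") macro variants skip the per-call PSHUFB/VPSHUFB flips, and increment() no longer needs a BSWAPL before every store. A minimal Go sketch of the counter handling as I read the diff, assuming flip_mask reverses the bytes inside each 32-bit lane (all names below are illustrative, not taken from the source):

```go
package main

import (
	"encoding/binary"
	"fmt"
)

// flip32 mimics PSHUFB flip_mask<>(SB): reverse the bytes inside each 32-bit
// lane of a 16-byte block (assumed layout of flip_mask).
func flip32(b *[16]byte) {
	for i := 0; i < 16; i += 4 {
		b[i], b[i+3] = b[i+3], b[i]
		b[i+1], b[i+2] = b[i+2], b[i+1]
	}
}

func main() {
	// Counter block as passed in via ctrPtr: a big-endian 32-bit counter
	// sits in the last word.
	var ctrBlock [16]byte
	binary.BigEndian.PutUint32(ctrBlock[12:], 2)
	aluCTR := binary.BigEndian.Uint32(ctrBlock[12:]) // MOVL + BSWAPL, done once

	// Old scheme: the block stays big-endian on the stack, so every
	// increment(i) must swap before the store (MOVL; BSWAPL; MOVL).
	old := ctrBlock
	binary.BigEndian.PutUint32(old[12:], aluCTR+1)

	// New scheme: flip the whole block once up front; each increment(i) is
	// then a plain host-order store (ADDL $1, aluCTR; MOVL aluCTR, ...),
	// and the SM4 *_WO_BS macros skip their own flip.
	flipped := ctrBlock
	flip32(&flipped)
	binary.LittleEndian.PutUint32(flipped[12:], aluCTR+1)

	// Same counter value either way; only where the shuffling happens differs.
	flip32(&flipped)
	fmt.Println(old == flipped) // true
}
```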


@@ -271,6 +271,9 @@ GLOBL r24_mask256<>(SB), 8, $32
 PSHUFB flip_mask<>(SB), t1; \
 PSHUFB flip_mask<>(SB), t2; \
 PSHUFB flip_mask<>(SB), t3; \
+SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3)
+#define SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3) \
 SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
 MOVOU (0*16)(RK), rk128; \
 SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
@@ -321,6 +324,9 @@ GLOBL r24_mask256<>(SB), 8, $32
 PSHUFB flip_mask<>(SB), t5; \
 PSHUFB flip_mask<>(SB), t6; \
 PSHUFB flip_mask<>(SB), t7; \
+SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7)
+#define SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
 SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
 SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \
 MOVOU (0*16)(RK), rk128; \
@@ -454,6 +460,9 @@ GLOBL r24_mask256<>(SB), 8, $32
 VPSHUFB flip_mask<>(SB), t2, t2 \
 VPSHUFB flip_mask<>(SB), t3, t3 \
 ; \
+AVX_SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3)
+#define AVX_SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3) \
 TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
 VMOVDQU (0*16)(RK), rk128; \
 SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
@@ -506,6 +515,9 @@ GLOBL r24_mask256<>(SB), 8, $32
 VPSHUFB flip_mask<>(SB), t6, t6 \
 VPSHUFB flip_mask<>(SB), t7, t7 \
 ; \
+AVX_SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7)
+#define AVX_SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
 TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
 TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \
 VMOVDQU (0*16)(RK), rk128; \
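For readers less used to Go assembly macros, here is a rough Go picture of the split above (names and helpers are mine, purely illustrative): the old SM4_4BLOCKS/SM4_8BLOCKS entry points keep their behaviour by flipping the words and then delegating to the new *_WO_BS core, so callers whose blocks are already flipped, like the GCM code in the second file, can skip the flip entirely.

```go
package main

import (
	"encoding/binary"
	"fmt"
)

// encrypt4WithoutByteSwap plays the role of SM4_4BLOCKS_WO_BS: it assumes its
// four blocks are already in the 32-bit word order the rounds expect.
// The transpose and the 32 rounds are elided in this sketch.
func encrypt4WithoutByteSwap(rk []uint32, blocks *[4][4]uint32) {
	// rounds elided
}

// encrypt4 plays the role of SM4_4BLOCKS: flip each word once (the PSHUFB
// flip_mask step), then reuse the no-byte-swap core.
func encrypt4(rk []uint32, raw *[4][16]byte) [4][4]uint32 {
	var blocks [4][4]uint32
	for i := range raw {
		for w := 0; w < 4; w++ {
			blocks[i][w] = binary.BigEndian.Uint32(raw[i][4*w:])
		}
	}
	encrypt4WithoutByteSwap(rk, &blocks)
	return blocks
}

func main() {
	var raw [4][16]byte
	raw[0][3] = 1 // block 0, word 0 = 0x00000001 in big-endian layout
	fmt.Println(encrypt4(make([]uint32, 32), &raw)[0][0]) // 1
}
```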


@@ -436,7 +436,7 @@ TEXT ·gcmSm4Enc(SB),0,$256-96
 #define aluCTR R10
 #define aluTMP R11
-#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
+#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, (3*4 + 8*16 + i*16)(SP)
 #define mulRound(i) \
 MOVOU (16*i)(SP), T0;\
@@ -528,8 +528,9 @@ TEXT ·gcmSm4Enc(SB),0,$256-96
 PXOR ACCM, ACCM
 MOVOU (ctrPtr), T0
 MOVL (3*4)(ctrPtr), aluCTR
 BSWAPL aluCTR
+PSHUFB flip_mask<>(SB), T0
 MOVOU T0, (8*16 + 0*16)(SP)
 increment(0)
 MOVOU T0, (8*16 + 1*16)(SP)
@@ -563,7 +564,7 @@ TEXT ·gcmSm4Enc(SB),0,$256-96
 MOVOU (8*16 + 6*16)(SP), B6
 MOVOU (8*16 + 7*16)(SP), B7
-SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
+SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
 PXOR ACC1, ACC1
 increment(0)
 increment(1)
@@ -676,7 +677,7 @@ gcmSm4EncOctetsLoop:
 reduceRound(ACC0)
 PXOR ACC1, ACC0
-SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
+SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
 MOVOU (16*0)(ptx), T0
 PXOR T0, B0
@@ -773,7 +774,7 @@ gcmSm4EncNibbles:
 MOVOU (8*16 + 2*16)(SP), B2
 MOVOU (8*16 + 3*16)(SP), B3
-SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
+SM4_4BLOCKS_WO_BS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
 MOVOU (16*0)(ptx), T0
 PXOR T0, B0
 MOVOU (16*1)(ptx), T0
@@ -809,7 +810,7 @@ gcmSm4EncSingles:
 MOVOU (8*16 + 2*16)(SP), B2
 MOVOU (8*16 + 3*16)(SP), B3
-SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
+SM4_4BLOCKS_WO_BS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
 MOVOU B0, (16*0)(SP)
 MOVOU B1, (16*1)(SP)
 MOVOU B2, (16*2)(SP)
@@ -873,6 +874,7 @@ avxGcmSm4Enc:
 MOVL (3*4)(ctrPtr), aluCTR
 BSWAPL aluCTR
+VPSHUFB flip_mask<>(SB), T0, T0
 VMOVDQU T0, (8*16 + 0*16)(SP)
 increment(0)
 VMOVDQU T0, (8*16 + 1*16)(SP)
@@ -906,7 +908,7 @@ avxGcmSm4Enc:
 VMOVDQU (8*16 + 6*16)(SP), B6
 VMOVDQU (8*16 + 7*16)(SP), B7
-AVX_SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
+AVX_SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
 VPXOR ACC1, ACC1, ACC1 // clean ACC1
 increment(0)
 increment(1)
@@ -1010,7 +1012,7 @@ avxGcmSm4EncOctetsLoop:
 avxReduceRound(ACC0)
 VPXOR ACC1, ACC0, ACC0
-AVX_SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
+AVX_SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
 // XOR plaintext
 VPXOR (16*0)(ptx), B0, B0
 VPXOR (16*1)(ptx), B1, B1
@@ -1101,7 +1103,7 @@ avxGcmSm4EncNibbles:
 VMOVDQU (8*16 + 2*16)(SP), B2
 VMOVDQU (8*16 + 3*16)(SP), B3
-AVX_SM4_4BLOCKS(rk, B6, B7, T1, T2, B0, B1, B2, B3)
+AVX_SM4_4BLOCKS_WO_BS(rk, B6, B7, T1, T2, B0, B1, B2, B3)
 // XOR plaintext
 VPXOR (16*0)(ptx), B0, B0
 VPXOR (16*1)(ptx), B1, B1
@@ -1136,7 +1138,7 @@ avxGcmSm4EncSingles:
 VMOVDQU (8*16 + 2*16)(SP), B2
 VMOVDQU (8*16 + 3*16)(SP), B3
-AVX_SM4_4BLOCKS(rk, B6, B7, T1, T2, B0, B1, B2, B3)
+AVX_SM4_4BLOCKS_WO_BS(rk, B6, B7, T1, T2, B0, B1, B2, B3)
 VMOVDQU B0, (16*0)(SP)
 VMOVDQU B1, (16*1)(SP)
 VMOVDQU B2, (16*2)(SP)
@@ -1198,9 +1200,10 @@ avx2GcmSm4Enc:
 VPXOR ACC1, ACC1, ACC1
 VPXOR ACCM, ACCM, ACCM
 VMOVDQU (ctrPtr), T0
+VPSHUFB flip_mask<>(SB), T0, T0
 MOVL (3*4)(ctrPtr), aluCTR
 BSWAPL aluCTR
 VMOVDQU T0, (8*16 + 0*16)(SP)
 increment(0)
 VMOVDQU T0, (8*16 + 1*16)(SP)
@@ -1231,13 +1234,6 @@ avx2GcmSm4Enc:
 VMOVDQU (4*32 + 2*32)(SP), DWB2
 VMOVDQU (4*32 + 3*32)(SP), DWB3
-VBROADCASTI128 flip_mask<>(SB), XDWTMP0
-// Apply Byte Flip Mask: LE -> BE
-VPSHUFB XDWTMP0, DWB0, DWB0
-VPSHUFB XDWTMP0, DWB1, DWB1
-VPSHUFB XDWTMP0, DWB2, DWB2
-VPSHUFB XDWTMP0, DWB3, DWB3
 // Transpose matrix 4 x 4 32bits word
 TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
 VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
@@ -1304,13 +1300,6 @@ avx2GcmSm4EncOctetsLoop:
 VMOVDQU (4*32 + 2*32)(SP), DWB2
 VMOVDQU (4*32 + 3*32)(SP), DWB3
-VBROADCASTI128 flip_mask<>(SB), XDWTMP0
-// Apply Byte Flip Mask: LE -> BE
-VPSHUFB XDWTMP0, DWB0, DWB0
-VPSHUFB XDWTMP0, DWB1, DWB1
-VPSHUFB XDWTMP0, DWB2, DWB2
-VPSHUFB XDWTMP0, DWB3, DWB3
 VMOVDQU (16*0)(SP), T0
 VPSHUFD $78, T0, T1
 VPXOR T0, T1, T1
@@ -1439,7 +1428,7 @@ avx2GcmSm4EncNibbles:
 VMOVDQU (8*16 + 2*16)(SP), B2
 VMOVDQU (8*16 + 3*16)(SP), B3
-AVX_SM4_4BLOCKS(rk, B4, B5, B6, B7, B0, B1, B2, B3)
+AVX_SM4_4BLOCKS_WO_BS(rk, B4, B5, B6, B7, B0, B1, B2, B3)
 VPXOR (16*0)(ptx), B0, B0
 VPXOR (16*1)(ptx), B1, B1
@@ -1473,7 +1462,7 @@ avx2GcmSm4EncSingles:
 VMOVDQU (8*16 + 2*16)(SP), B2
 VMOVDQU (8*16 + 3*16)(SP), B3
-AVX_SM4_4BLOCKS(rk, B4, B5, B6, B7, B0, B1, B2, B3)
+AVX_SM4_4BLOCKS_WO_BS(rk, B4, B5, B6, B7, B0, B1, B2, B3)
 VMOVDQU B0, (16*0)(SP)
 VMOVDQU B1, (16*1)(SP)
@@ -1533,7 +1522,7 @@ avx2GcmSm4EncDone:
 // func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
 TEXT ·gcmSm4Dec(SB),0,$128-96
-#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
+#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, (3*4 + i*16)(SP)
 #define decMulRound(i) \
 MOVOU (16*i)(ctx), T0;\
@@ -1636,6 +1625,7 @@ TEXT ·gcmSm4Dec(SB),0,$128-96
 MOVL (3*4)(ctrPtr), aluCTR
 BSWAPL aluCTR
+PSHUFB flip_mask<>(SB), T0
 MOVOU T0, (0*16)(SP)
 increment(0)
 MOVOU T0, (1*16)(SP)
@@ -1714,7 +1704,7 @@ gcmSm4DecOctetsLoop:
 reduceRound(ACC0)
 PXOR ACC1, ACC0
-SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
+SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
 MOVOU (16*0)(ctx), T0
 PXOR T0, B0
@@ -1760,7 +1750,7 @@ gcmSm4DecNibbles:
 MOVOU (2*16)(SP), B6
 MOVOU (3*16)(SP), B7
-SM4_4BLOCKS(rk, B0, T0, T1, T2, B4, B5, B6, B7)
+SM4_4BLOCKS_WO_BS(rk, B0, T0, T1, T2, B4, B5, B6, B7)
 MOVOU (16*14)(pTbl), T2
 MOVOU (16*0)(ctx), T0
 PXOR T0, B4
@@ -1796,7 +1786,7 @@ gcmSm4DecSingles:
 MOVOU (2*16)(SP), B2
 MOVOU (3*16)(SP), B3
-SM4_4BLOCKS(rk, B4, T0, T1, T2, B0, B1, B2, B3)
+SM4_4BLOCKS_WO_BS(rk, B4, T0, T1, T2, B0, B1, B2, B3)
 MOVOU B0, (16*4)(SP)
 MOVOU B1, (16*5)(SP)
 MOVOU B2, (16*6)(SP)
@@ -1864,6 +1854,7 @@ avxGcmSm4Dec:
 MOVL (3*4)(ctrPtr), aluCTR
 BSWAPL aluCTR
+VPSHUFB flip_mask<>(SB), T0, T0
 VMOVDQU T0, (0*16)(SP)
 increment(0)
 VMOVDQU T0, (1*16)(SP)
@@ -1941,7 +1932,7 @@ avxGcmSm4DecOctetsLoop:
 avxReduceRound(ACC0)
 VPXOR ACC1, ACC0, ACC0
-AVX_SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
+AVX_SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
 VPXOR (16*0)(ctx), B0, B0
 VPXOR (16*1)(ctx), B1, B1
@@ -1979,7 +1970,7 @@ avxGcmSm4DecNibbles:
 VMOVDQU (2*16)(SP), B6
 VMOVDQU (3*16)(SP), B7
-AVX_SM4_4BLOCKS(rk, B0, B1, T1, T2, B4, B5, B6, B7)
+AVX_SM4_4BLOCKS_WO_BS(rk, B0, B1, T1, T2, B4, B5, B6, B7)
 VMOVDQU (16*14)(pTbl), T2
 VMOVDQU (16*0)(ctx), B0
@@ -2019,7 +2010,7 @@ avxGcmSm4DecSingles:
 VMOVDQU (2*16)(SP), B2
 VMOVDQU (3*16)(SP), B3
-AVX_SM4_4BLOCKS(rk, B7, B6, B5, B4, B0, B1, B2, B3)
+AVX_SM4_4BLOCKS_WO_BS(rk, B7, B6, B5, B4, B0, B1, B2, B3)
 VMOVDQU B0, (16*4)(SP)
 VMOVDQU B1, (16*5)(SP)
 VMOVDQU B2, (16*6)(SP)
@@ -2087,6 +2078,7 @@ avx2GcmSm4Dec:
 MOVL (3*4)(ctrPtr), aluCTR
 BSWAPL aluCTR
+VPSHUFB flip_mask<>(SB), T0, T0
 VMOVDQU T0, (0*16)(SP)
 increment(0)
 VMOVDQU T0, (1*16)(SP)
@@ -2163,13 +2155,6 @@ avx2GcmSm4DecOctetsLoop:
 avxReduceRound(ACC0)
 VPXOR ACC1, ACC0, ACC0
-VBROADCASTI128 flip_mask<>(SB), XDWTMP0
-// Apply Byte Flip Mask: LE -> BE
-VPSHUFB XDWTMP0, DWB0, DWB0
-VPSHUFB XDWTMP0, DWB1, DWB1
-VPSHUFB XDWTMP0, DWB2, DWB2
-VPSHUFB XDWTMP0, DWB3, DWB3
 // Transpose matrix 4 x 4 32bits word
 TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
 VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
@@ -2212,7 +2197,7 @@ avx2GcmSm4DecNibbles:
 VMOVDQU (2*16)(SP), B2
 VMOVDQU (3*16)(SP), B3
-AVX_SM4_4BLOCKS(rk, B0, B5, B6, B7, B4, B1, B2, B3)
+AVX_SM4_4BLOCKS_WO_BS(rk, B0, B5, B6, B7, B4, B1, B2, B3)
 VMOVDQU (16*14)(pTbl), T2
 VMOVDQU (16*0)(ctx), B0
@@ -2253,7 +2238,7 @@ avx2GcmSm4DecSingles:
 VMOVDQU (2*16)(SP), B2
 VMOVDQU (3*16)(SP), B3
-AVX_SM4_4BLOCKS(rk, B4, B5, B6, B7, B0, B1, B2, B3)
+AVX_SM4_4BLOCKS_WO_BS(rk, B4, B5, B6, B7, B0, B1, B2, B3)
 VMOVDQU B0, (16*4)(SP)
 VMOVDQU B1, (16*5)(SP)