From 6dde984da43a4ca20e5318b4ee71ccc9f4deed46 Mon Sep 17 00:00:00 2001 From: Emman Date: Tue, 18 Jan 2022 10:31:53 +0800 Subject: [PATCH] [SM4] gcmSm4Enc & gcmSm4Dec --- sm4/gcm_amd64.s | 1688 +++++++++++++++++++++++++++++++++++++++++ sm4/gcm_amd64_test.go | 123 +++ sm4/sm4_gcm_amd64.go | 17 +- 3 files changed, 1819 insertions(+), 9 deletions(-) create mode 100644 sm4/gcm_amd64_test.go diff --git a/sm4/gcm_amd64.s b/sm4/gcm_amd64.s index 6d4a946..310b974 100644 --- a/sm4/gcm_amd64.s +++ b/sm4/gcm_amd64.s @@ -16,6 +16,16 @@ #define B6 X6 #define B7 X7 +#define DWB0 Y0 +#define DWB1 Y2 +#define DWB2 Y4 +#define DWB3 Y6 + +#define XDWORD Y1 +#define YDWORD Y3 +#define XDWTMP0 Y5 +#define XDWTMP1 Y7 + #define ACC0 X8 #define ACC1 X9 #define ACCM X10 @@ -25,6 +35,9 @@ #define T2 X13 #define POLY X14 #define BSWAP X15 +#define DWBSWAP Y15 +#define NIBBLE_MASK Y11 +#define X_NIBBLE_MASK X11 // shuffle byte order from LE to BE DATA flipMask<>+0x00(SB)/8, $0x0405060700010203 @@ -240,6 +253,239 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0 SM4_TAO_L1(x, y, z); \ PXOR x, t0 +#define SM4_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \ + PINSRD $0, (index * 4)(RK)(IND*1), x; \ + PSHUFD $0, x, x; \ + PXOR t1, x; \ + PXOR t2, x; \ + PXOR t3, x; \ + SM4_TAO_L1(x, y, z); \ + PXOR x, t0 + +// MOVOU r0, tmp2; +// PUNPCKHDQ r1, tmp2; +// PUNPCKLDQ r1, r0; +// MOVOU r2, tmp1; +// PUNPCKLDQ r3, tmp1; +// PUNPCKHDQ r3, r2; +// MOVOU r0, r1; +// PUNPCKHQDQ tmp1, r1; +// PUNPCKLQDQ tmp1, r0; +// MOVOU tmp2, r3; +// PUNPCKHQDQ r2, r3; +// PUNPCKLQDQ r2, tmp2; +// MOVOU tmp2, r2 +#define SSE_TRANSPOSE_MATRIX(r, r0, r1, r2, r3, tmp1, tmp2) \ + PEXTRD $2, r0, r; \ + PINSRD $0, r, tmp2; \ + PEXTRD $2, r1, r; \ + PINSRD $1, r, tmp2; \ + ; \ + PEXTRD $3, r0, r; \ + PINSRD $2, r, tmp2; \ + PEXTRD $3, r1, r; \ + PINSRD $3, r, tmp2; \ // tmp2 = [w7, w3, w6, w2] + ; \ + PEXTRD $1, r0, r; \ + PINSRD $2, r, r0; \ + PEXTRD $0, r1, r; \ + PINSRD $1, r, r0; \ + PEXTRD $1, r1, r; \ + PINSRD $3, r, r0; \ // r0 = [w5, w1, w4, w0] + ; \ + PEXTRD $0, r2, r; \ + PINSRD $0, r, tmp1; \ + PEXTRD $0, r3, r; \ + PINSRD $1, r, tmp1; \ + PEXTRD $1, r2, r; \ + PINSRD $2, r, tmp1; \ + PEXTRD $1, r3, r; \ + PINSRD $3, r, tmp1; \ // tmp1 = [w13, w9, w12, w8] + ; \ + PEXTRD $2, r2, r; \ + PINSRD $0, r, r2; \ + PEXTRD $2, r3, r; \ + PINSRD $1, r, r2; \ + PEXTRD $3, r2, r; \ + PINSRD $2, r, r2; \ + PEXTRD $3, r3, r; \ + PINSRD $3, r, r2; \ // r2 = [w15, w11, w14, w10] + ; \ + MOVOU r0, r1; \ + PEXTRQ $1, r1, r; \ + PINSRQ $0, r, r1; \ + PEXTRQ $1, tmp1, r; \ + PINSRQ $1, r, r1; \ // r1 = [w13, w9, w5, w1] + ; \ + PEXTRQ $0, tmp1, r; \ + PINSRQ $1, r, r0; \ // r0 = [w12, w8, w4, w0] + ; \ + MOVOU tmp2, r3; \ + PEXTRQ $1, r3, r; \ + PINSRQ $0, r, r3; \ + PEXTRQ $1, r2, r; \ + PINSRQ $1, r, r3; \ // r3 = [w15, w11, w7, w3] + ; \ + PEXTRQ $0, r2, r; \ + PINSRQ $1, r, r2; \ + PEXTRQ $0, tmp2, r; \ + PINSRQ $0, r, r2 + +#define SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3) \ + PSHUFB flipMask<>(SB), t0; \ + PSHUFB flipMask<>(SB), t1; \ + PSHUFB flipMask<>(SB), t2; \ + PSHUFB flipMask<>(SB), t3; \ + SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y); \ + XORL IND, IND; \ + SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ + SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ + SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ + SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ + ADDL $16, IND; \ + SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ + SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ + SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ + SM4_ROUND(3, RK, 
IND, x, y, z, t3, t0, t1, t2); \ + ADDL $16, IND; \ + SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ + SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ + SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ + SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ + ADDL $16, IND; \ + SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ + SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ + SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ + SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ + ADDL $16, IND; \ + SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ + SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ + SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ + SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ + ADDL $16, IND; \ + SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ + SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ + SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ + SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ + ADDL $16, IND; \ + SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ + SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ + SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ + SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ + ADDL $16, IND; \ + SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ + SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ + SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ + SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ + SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y); \ + PSHUFB BSWAP, t3; \ + PSHUFB BSWAP, t2; \ + PSHUFB BSWAP, t1; \ + PSHUFB BSWAP, t0 + +#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \ + VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2] + VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0] + VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16] tmp1 = [w13, w9, w12, w8] + VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w27, w30, w22, w27, w19, w26, w18] r2 = [w15, w11, w14, w10] + VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, w5, w25, w17, w9, w1] r1 = [w13, w9, w5, w1] + VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0] r0 = [w12, w8, w4, w0] + VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w27, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3] + VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2] + +#define AVX2_SM4_SBOX(x, y, xw, yw, tmp) \ + VPAND NIBBLE_MASK, x, tmp; \ + VBROADCASTI128 m1Low<>(SB), y; \ + VPSHUFB tmp, y, y; \ + VPSRLQ $4, x, x; \ + VPAND NIBBLE_MASK, x, x; \ + VBROADCASTI128 m1High<>(SB), tmp; \ + VPSHUFB x, tmp, x; \ + VPXOR y, x, x; \ + VBROADCASTI128 inverseShiftRows<>(SB), tmp; \ + VPSHUFB tmp, x, x; \ + VEXTRACTI128 $1, x, yw \ + VAESENCLAST X_NIBBLE_MASK, xw, xw; \ + VAESENCLAST X_NIBBLE_MASK, yw, yw; \ + VINSERTI128 $1, yw, x, x; \ + VPANDN NIBBLE_MASK, x, tmp; \ + VBROADCASTI128 m2Low<>(SB), y; \ + VPSHUFB tmp, y, y; \ + VPSRLQ $4, x, x; \ + VPAND NIBBLE_MASK, x, x; \ + VBROADCASTI128 m2High<>(SB), tmp; \ + VPSHUFB x, tmp, x; \ + VPXOR y, x, x + +#define AVX2_SM4_TAO_L1(x, y, xw, yw, tmp) \ + AVX2_SM4_SBOX(x, y, xw, yw, tmp); \ + VBROADCASTI128 r08Mask<>(SB), tmp; \ + VPSHUFB tmp, x, y; \ + VPXOR x, y, y; \ + VBROADCASTI128 r16Mask<>(SB), tmp; \ + VPSHUFB tmp, x, tmp; \ + VPXOR tmp, y, y; \ + VPSLLD $2, y, tmp; \ + VPSRLD $30, y, y; \ + VPXOR tmp, y, y; \ + VBROADCASTI128 r24Mask<>(SB), tmp; \ + VPSHUFB tmp, x, tmp; \ + VPXOR y, x, x; \ + VPXOR x, tmp, x + +#define AVX2_SM4_ROUND(index, RK, IND, x, y, xw, yw, 
tmp, t0, t1, t2, t3) \ + VPBROADCASTD (index * 4)(RK)(IND*1), x; \ + VPXOR t1, x, x; \ + VPXOR t2, x, x; \ + VPXOR t3, x, x; \ + AVX2_SM4_TAO_L1(x, y, xw, yw, tmp); \ + VPXOR x, t0, t0 + +#define AVX_SM4_SBOX(x, y, tmp) \ + VPAND X_NIBBLE_MASK, x, tmp; \ + VMOVDQU m1Low<>(SB), y; \ + VPSHUFB tmp, y, y; \ + VPSRLQ $4, x, x; \ + VPAND X_NIBBLE_MASK, x, x; \ + VMOVDQU m1High<>(SB), tmp; \ + VPSHUFB x, tmp, x; \ + VPXOR y, x, x; \ + VMOVDQU inverseShiftRows<>(SB), tmp; \ + VPSHUFB tmp, x, x; \ + VAESENCLAST X_NIBBLE_MASK, x, x; \ + VPANDN X_NIBBLE_MASK, x, tmp; \ + VMOVDQU m2Low<>(SB), y; \ + VPSHUFB tmp, y, y; \ + VPSRLQ $4, x, x; \ + VPAND X_NIBBLE_MASK, x, x; \ + VMOVDQU m2High<>(SB), tmp; \ + VPSHUFB x, tmp, x; \ + VPXOR y, x, x + +#define AVX_SM4_TAO_L1(x, y, tmp) \ + AVX_SM4_SBOX(x, y, tmp); \ + VMOVDQU r08Mask<>(SB), tmp; \ + VPSHUFB tmp, x, y; \ + VPXOR x, y, y; \ + VMOVDQU r16Mask<>(SB), tmp; \ + VPSHUFB tmp, x, tmp; \ + VPXOR tmp, y, y; \ + VPSLLD $2, y, tmp; \ + VPSRLD $30, y, y; \ + VPXOR tmp, y, y; \ + VMOVDQU r24Mask<>(SB), tmp; \ + VPSHUFB tmp, x, tmp; \ + VPXOR y, x, x; \ + VPXOR x, tmp, x + +#define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \ + VPBROADCASTD (index * 4)(RK)(IND*1), x; \ + VPXOR t1, x, x; \ + VPXOR t2, x, x; \ + VPXOR t3, x, x; \ + AVX_SM4_TAO_L1(x, y, tmp); \ + VPXOR x, t0, t0 + // func gcmSm4Init(productTable *[256]byte, rk []uint32) TEXT ·gcmSm4Init(SB),NOSPLIT,$0 #define dst DI @@ -513,3 +759,1445 @@ dataBail: #undef aut #undef tPtr #undef autLen + + +// func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) +TEXT ·gcmSm4Enc(SB),0,$256-96 +#define pTbl DI +#define ctx DX +#define ctrPtr CX +#define ptx SI +#define rk AX +#define tPtr R8 +#define ptxLen R9 +#define aluCTR R10 +#define aluTMP R11 + +#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP) + +#define mulRound(i) \ + MOVOU (16*i)(SP), T0;\ + MOVOU (16*(i*2))(pTbl), T1;\ + MOVOU T1, T2;\ + PCLMULQDQ $0x00, T0, T1;\ + PXOR T1, ACC0;\ + PCLMULQDQ $0x11, T0, T2;\ + PXOR T2, ACC1;\ + PSHUFD $78, T0, T1;\ + PXOR T1, T0;\ + MOVOU (16*(i*2+1))(pTbl), T1;\ + PCLMULQDQ $0x00, T0, T1;\ + PXOR T1, ACCM + +#define gcmEncDataStep(B) \ + PSHUFB BSWAP, B; \ + PXOR ACC0, B; \ + MOVOU T2, ACC0; \ + MOVOU T2, ACC1; \ + MOVOU (16*15)(pTbl), ACCM; \ + PSHUFD $78, B, T0; \ + PXOR B, T0; \ + PCLMULQDQ $0x00, B, ACC0; \ + PCLMULQDQ $0x11, B, ACC1; \ + PCLMULQDQ $0x00, T0, ACCM; \ + PXOR ACC0, ACCM; \ + PXOR ACC1, ACCM; \ + MOVOU ACCM, T0; \ + PSRLDQ $8, ACCM; \ + PSLLDQ $8, T0; \ + PXOR ACCM, ACC1; \ + PXOR T0, ACC0; \ + reduceRound(ACC0); \ + reduceRound(ACC0); \ + PXOR ACC1, ACC0 + + MOVQ productTable+0(FP), pTbl + MOVQ dst+8(FP), ctx + MOVQ src_base+32(FP), ptx + MOVQ src_len+40(FP), ptxLen + MOVQ ctr+56(FP), ctrPtr + MOVQ T+64(FP), tPtr + MOVQ rk_base+72(FP), rk + + CMPB ·useAVX2(SB), $1 + JE avx2GcmSm4Enc + + MOVOU bswapMask<>(SB), BSWAP + MOVOU gcmPoly<>(SB), POLY + + MOVOU (tPtr), ACC0 + PXOR ACC1, ACC1 + PXOR ACCM, ACCM + MOVOU (ctrPtr), T0 + MOVL (3*4)(ctrPtr), aluCTR + + BSWAPL aluCTR + MOVOU T0, (8*16 + 0*16)(SP) + increment(0) + MOVOU T0, (8*16 + 1*16)(SP) + increment(1) + MOVOU T0, (8*16 + 2*16)(SP) + increment(2) + MOVOU T0, (8*16 + 3*16)(SP) + increment(3) + + CMPQ ptxLen, $128 + JB gcmSm4EncNibbles + SUBQ $128, ptxLen + + // We have at least 8 blocks to encrypt, prepare the rest of the counters + MOVOU T0, (8*16 + 4*16)(SP) + increment(4) + MOVOU T0, (8*16 + 5*16)(SP) + increment(5) + MOVOU T0, (8*16 + 
6*16)(SP) + increment(6) + MOVOU T0, (8*16 + 7*16)(SP) + increment(7) + + // load 8 ctrs for encryption + MOVOU (8*16 + 0*16)(SP), B0 + MOVOU (8*16 + 1*16)(SP), B1 + MOVOU (8*16 + 2*16)(SP), B2 + MOVOU (8*16 + 3*16)(SP), B3 + MOVOU (8*16 + 4*16)(SP), B4 + MOVOU (8*16 + 5*16)(SP), B5 + MOVOU (8*16 + 6*16)(SP), B6 + MOVOU (8*16 + 7*16)(SP), B7 + + SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3) + increment(0) + increment(1) + increment(2) + increment(3) + SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7) + increment(4) + increment(5) + increment(6) + increment(7) + + // XOR plaintext + MOVOU (16*0)(ptx), T0 + PXOR T0, B0 + MOVOU (16*1)(ptx), T0 + PXOR T0, B1 + MOVOU (16*2)(ptx), T0 + PXOR T0, B2 + MOVOU (16*3)(ptx), T0 + PXOR T0, B3 + MOVOU (16*4)(ptx), T0 + PXOR T0, B4 + MOVOU (16*5)(ptx), T0 + PXOR T0, B5 + MOVOU (16*6)(ptx), T0 + PXOR T0, B6 + MOVOU (16*7)(ptx), T0 + PXOR T0, B7 + + // Store ciphertext + MOVOU B0, (16*0)(ctx) + PSHUFB BSWAP, B0 + PXOR ACC0, B0 + MOVOU B1, (16*1)(ctx) + PSHUFB BSWAP, B1 + MOVOU B2, (16*2)(ctx) + PSHUFB BSWAP, B2 + MOVOU B3, (16*3)(ctx) + PSHUFB BSWAP, B3 + MOVOU B4, (16*4)(ctx) + PSHUFB BSWAP, B4 + MOVOU B5, (16*5)(ctx) + PSHUFB BSWAP, B5 + MOVOU B6, (16*6)(ctx) + PSHUFB BSWAP, B6 + MOVOU B7, (16*7)(ctx) + PSHUFB BSWAP, B7 + + MOVOU B0, (16*0)(SP) + MOVOU B1, (16*1)(SP) + MOVOU B2, (16*2)(SP) + MOVOU B3, (16*3)(SP) + MOVOU B4, (16*4)(SP) + MOVOU B5, (16*5)(SP) + MOVOU B6, (16*6)(SP) + MOVOU B7, (16*7)(SP) + + LEAQ 128(ptx), ptx + LEAQ 128(ctx), ctx + +gcmSm4EncOctetsLoop: + CMPQ ptxLen, $128 + JB gcmSm4EncOctetsEnd + SUBQ $128, ptxLen + + MOVOU (8*16 + 0*16)(SP), B0 + MOVOU (8*16 + 1*16)(SP), B1 + MOVOU (8*16 + 2*16)(SP), B2 + MOVOU (8*16 + 3*16)(SP), B3 + MOVOU (8*16 + 4*16)(SP), B4 + MOVOU (8*16 + 5*16)(SP), B5 + MOVOU (8*16 + 6*16)(SP), B6 + MOVOU (8*16 + 7*16)(SP), B7 + + MOVOU (16*0)(SP), T0 + PSHUFD $78, T0, T1 + PXOR T0, T1 + + MOVOU (16*0)(pTbl), ACC0 + MOVOU (16*1)(pTbl), ACCM + MOVOU ACC0, ACC1 + + PCLMULQDQ $0x00, T1, ACCM + PCLMULQDQ $0x00, T0, ACC0 + PCLMULQDQ $0x11, T0, ACC1 + + SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3) + mulRound(1) + increment(0) + mulRound(2) + increment(1) + mulRound(3) + increment(2) + mulRound(4) + increment(3) + SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7) + mulRound(5) + increment(4) + mulRound(6) + increment(5) + mulRound(7) + increment(6) + increment(7) + PXOR ACC0, ACCM + PXOR ACC1, ACCM + MOVOU ACCM, T0 + PSRLDQ $8, ACCM + PSLLDQ $8, T0 + PXOR ACCM, ACC1 + PXOR T0, ACC0 + + reduceRound(ACC0) + reduceRound(ACC0) + PXOR ACC1, ACC0 + + MOVOU (16*0)(ptx), T0 + PXOR T0, B0 + MOVOU (16*1)(ptx), T0 + PXOR T0, B1 + MOVOU (16*2)(ptx), T0 + PXOR T0, B2 + MOVOU (16*3)(ptx), T0 + PXOR T0, B3 + MOVOU (16*4)(ptx), T0 + PXOR T0, B4 + MOVOU (16*5)(ptx), T0 + PXOR T0, B5 + MOVOU (16*6)(ptx), T0 + PXOR T0, B6 + MOVOU (16*7)(ptx), T0 + PXOR T0, B7 + + MOVOU B0, (16*0)(ctx) + PSHUFB BSWAP, B0 + PXOR ACC0, B0 + MOVOU B1, (16*1)(ctx) + PSHUFB BSWAP, B1 + MOVOU B2, (16*2)(ctx) + PSHUFB BSWAP, B2 + MOVOU B3, (16*3)(ctx) + PSHUFB BSWAP, B3 + MOVOU B4, (16*4)(ctx) + PSHUFB BSWAP, B4 + MOVOU B5, (16*5)(ctx) + PSHUFB BSWAP, B5 + MOVOU B6, (16*6)(ctx) + PSHUFB BSWAP, B6 + MOVOU B7, (16*7)(ctx) + PSHUFB BSWAP, B7 + + MOVOU B0, (16*0)(SP) + MOVOU B1, (16*1)(SP) + MOVOU B2, (16*2)(SP) + MOVOU B3, (16*3)(SP) + MOVOU B4, (16*4)(SP) + MOVOU B5, (16*5)(SP) + MOVOU B6, (16*6)(SP) + MOVOU B7, (16*7)(SP) + + LEAQ 128(ptx), ptx + LEAQ 128(ctx), ctx + + JMP gcmSm4EncOctetsLoop + +gcmSm4EncOctetsEnd: + MOVOU (16*0)(SP), T0 + MOVOU (16*0)(pTbl), 
ACC0 + MOVOU (16*1)(pTbl), ACCM + MOVOU ACC0, ACC1 + PSHUFD $78, T0, T1 + PXOR T0, T1 + PCLMULQDQ $0x00, T0, ACC0 + PCLMULQDQ $0x11, T0, ACC1 + PCLMULQDQ $0x00, T1, ACCM + + mulRound(1) + mulRound(2) + mulRound(3) + mulRound(4) + mulRound(5) + mulRound(6) + mulRound(7) + + PXOR ACC0, ACCM + PXOR ACC1, ACCM + MOVOU ACCM, T0 + PSRLDQ $8, ACCM + PSLLDQ $8, T0 + PXOR ACCM, ACC1 + PXOR T0, ACC0 + + reduceRound(ACC0) + reduceRound(ACC0) + PXOR ACC1, ACC0 + + TESTQ ptxLen, ptxLen + JE gcmSm4EncDone + + SUBQ $4, aluCTR + +gcmSm4EncNibbles: + CMPQ ptxLen, $64 + JB gcmSm4EncSingles + SUBQ $64, ptxLen + + MOVOU (8*16 + 0*16)(SP), B0 + MOVOU (8*16 + 1*16)(SP), B1 + MOVOU (8*16 + 2*16)(SP), B2 + MOVOU (8*16 + 3*16)(SP), B3 + + SM4_4BLOCKS(AX, BX, T0, T1, T2, B0, B1, B2, B3) + MOVOU (16*0)(ptx), T0 + PXOR T0, B0 + MOVOU (16*1)(ptx), T0 + PXOR T0, B1 + MOVOU (16*2)(ptx), T0 + PXOR T0, B2 + MOVOU (16*3)(ptx), T0 + PXOR T0, B3 + + MOVOU B0, (16*0)(ctx) + MOVOU B1, (16*1)(ctx) + MOVOU B2, (16*2)(ctx) + MOVOU B3, (16*3)(ctx) + + MOVOU (16*14)(pTbl), T2 + gcmEncDataStep(B0) + gcmEncDataStep(B1) + gcmEncDataStep(B2) + gcmEncDataStep(B3) + increment(0) + increment(1) + increment(2) + increment(3) + + LEAQ 64(ptx), ptx + LEAQ 64(ctx), ctx + +gcmSm4EncSingles: + TESTQ ptxLen, ptxLen + JE gcmSm4EncDone + MOVOU (8*16 + 0*16)(SP), B0 + MOVOU (8*16 + 1*16)(SP), B1 + MOVOU (8*16 + 2*16)(SP), B2 + MOVOU (8*16 + 3*16)(SP), B3 + + SM4_4BLOCKS(AX, BX, T0, T1, T2, B0, B1, B2, B3) + MOVOU B0, (16*0)(SP) + MOVOU B1, (16*1)(SP) + MOVOU B2, (16*2)(SP) + MOVOU B3, (16*3)(SP) + + MOVOU (16*14)(pTbl), T2 + MOVQ SP, BP + +gcmSm4EncSinglesLoop: + CMPQ ptxLen, $16 + JB gcmSm4EncTail + SUBQ $16, ptxLen + MOVOU (16*0)(BP), B0 + MOVOU (ptx), T0 + PXOR T0, B0 + MOVOU B0, (ctx) + gcmEncDataStep(B0) + LEAQ (16*1)(ptx), ptx + LEAQ (16*1)(ctx), ctx + ADDQ $16, BP + JMP gcmSm4EncSinglesLoop + +gcmSm4EncTail: + TESTQ ptxLen, ptxLen + JE gcmSm4EncDone + MOVOU (16*0)(BP), B0 + MOVOU B0, T0 + + LEAQ -1(ptx)(ptxLen*1), ptx + + MOVQ ptxLen, aluTMP + SHLQ $4, aluTMP + + LEAQ andMask<>(SB), aluCTR + MOVOU -16(aluCTR)(aluTMP*1), T1 + PXOR B0, B0 +ptxLoadLoop: + PSLLDQ $1, B0 + PINSRB $0, (ptx), B0 + LEAQ -1(ptx), ptx + DECQ ptxLen + JNE ptxLoadLoop + + PXOR T0, B0 + PAND T1, B0 + MOVOU B0, (ctx) // I assume there is always space, due to TAG in the end of the CT + gcmEncDataStep(B0) + +gcmSm4EncDone: + MOVOU ACC0, (tPtr) + RET + +avx2GcmSm4Enc: + VMOVDQU bswapMask<>(SB), BSWAP + VMOVDQU gcmPoly<>(SB), POLY + + VMOVDQU (tPtr), ACC0 + VPXOR ACC1, ACC1, ACC1 + VPXOR ACCM, ACCM, ACCM + VMOVDQU (ctrPtr), T0 + MOVL (3*4)(ctrPtr), aluCTR + + BSWAPL aluCTR + VMOVDQU T0, (8*16 + 0*16)(SP) + increment(0) + VMOVDQU T0, (8*16 + 1*16)(SP) + increment(1) + VMOVDQU T0, (8*16 + 2*16)(SP) + increment(2) + VMOVDQU T0, (8*16 + 3*16)(SP) + increment(3) + + CMPQ ptxLen, $128 + JB avx2GcmSm4EncNibbles + SUBQ $128, ptxLen + + // We have at least 8 blocks to encrypt, prepare the rest of the counters + VMOVDQU T0, (8*16 + 4*16)(SP) + increment(4) + VMOVDQU T0, (8*16 + 5*16)(SP) + increment(5) + VMOVDQU T0, (8*16 + 6*16)(SP) + increment(6) + VMOVDQU T0, (8*16 + 7*16)(SP) + increment(7) + + // load 8 ctrs for encryption + VMOVDQU (4*32 + 0*32)(SP), DWB0 + VMOVDQU (4*32 + 1*32)(SP), DWB1 + VMOVDQU (4*32 + 2*32)(SP), DWB2 + VMOVDQU (4*32 + 3*32)(SP), DWB3 + + VBROADCASTI128 flipMask<>(SB), XDWTMP0 + // Apply Byte Flip Mask: LE -> BE + VPSHUFB XDWTMP0, DWB0, DWB0 + VPSHUFB XDWTMP0, DWB1, DWB1 + VPSHUFB XDWTMP0, DWB2, DWB2 + VPSHUFB XDWTMP0, DWB3, DWB3 + + // Transpose matrix 4 x 4 
32bits word + TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1) + XORL BX, BX + VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK + +avx2GcmSm4Enc8Loop1: + AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3) + AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0) + AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1) + AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2) + + ADDL $16, BX + CMPL BX, $4*32 + JB avx2GcmSm4Enc8Loop1 + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1) + + VBROADCASTI128 bswapMask<>(SB), DWBSWAP + VPSHUFB DWBSWAP, DWB0, DWB0 + VPSHUFB DWBSWAP, DWB1, DWB1 + VPSHUFB DWBSWAP, DWB2, DWB2 + VPSHUFB DWBSWAP, DWB3, DWB3 + + increment(0) + increment(1) + increment(2) + increment(3) + increment(4) + increment(5) + increment(6) + increment(7) + + // XOR plaintext + VMOVDQU (32*0)(ptx), XDWTMP0 + VPXOR XDWTMP0, DWB0, DWB0 + VMOVDQU (32*1)(ptx), XDWTMP0 + VPXOR XDWTMP0, DWB1, DWB1 + VMOVDQU (32*2)(ptx), XDWTMP0 + VPXOR XDWTMP0, DWB2, DWB2 + VMOVDQU (32*3)(ptx), XDWTMP0 + VPXOR XDWTMP0, DWB3, DWB3 + + // Store ciphertext + VMOVDQU DWB0, (32*0)(ctx) + VPSHUFB DWBSWAP, DWB0, DWB0 + VMOVDQU DWB1, (32*1)(ctx) + VPSHUFB DWBSWAP, DWB1, DWB1 + VMOVDQU DWB2, (32*2)(ctx) + VPSHUFB DWBSWAP, DWB2, DWB2 + VMOVDQU DWB3, (32*3)(ctx) + VPSHUFB DWBSWAP, DWB3, DWB3 + + //VPXOR XDWTMP0, XDWTMP0, XDWTMP0 + //VINSERTI128 $0, ACC0, XDWTMP0, XDWTMP0 + //VPXOR XDWTMP0, DWB0, DWB0 + PXOR ACC0, B0 // Can't call VPXOR here + VMOVDQU DWB0, (32*0)(SP) + VMOVDQU DWB1, (32*1)(SP) + VMOVDQU DWB2, (32*2)(SP) + VMOVDQU DWB3, (32*3)(SP) + + LEAQ 128(ptx), ptx + LEAQ 128(ctx), ctx + +avx2GcmSm4EncOctetsLoop: + CMPQ ptxLen, $128 + JB avx2GcmSm4EncOctetsEnd + SUBQ $128, ptxLen + + // load 8 ctrs for encryption + VMOVDQU (4*32 + 0*32)(SP), DWB0 + VMOVDQU (4*32 + 1*32)(SP), DWB1 + VMOVDQU (4*32 + 2*32)(SP), DWB2 + VMOVDQU (4*32 + 3*32)(SP), DWB3 + + VBROADCASTI128 flipMask<>(SB), XDWTMP0 + // Apply Byte Flip Mask: LE -> BE + VPSHUFB XDWTMP0, DWB0, DWB0 + VPSHUFB XDWTMP0, DWB1, DWB1 + VPSHUFB XDWTMP0, DWB2, DWB2 + VPSHUFB XDWTMP0, DWB3, DWB3 + + VMOVDQU (16*0)(SP), T0 + VPSHUFD $78, T0, T1 + VPXOR T0, T1, T1 + + VMOVDQU (16*0)(pTbl), ACC0 + VMOVDQU (16*1)(pTbl), ACCM + VMOVDQU ACC0, ACC1 + + PCLMULQDQ $0x00, T1, ACCM + PCLMULQDQ $0x00, T0, ACC0 + PCLMULQDQ $0x11, T0, ACC1 + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1) + XORL BX, BX + VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK + +avx2GcmSm4Enc8Loop2: + AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3) + AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0) + AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1) + AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2) + + ADDL $16, BX + CMPL BX, $4*32 + JB avx2GcmSm4Enc8Loop2 + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1) + + VBROADCASTI128 bswapMask<>(SB), DWBSWAP + VPSHUFB DWBSWAP, DWB0, DWB0 + VPSHUFB DWBSWAP, DWB1, DWB1 + VPSHUFB DWBSWAP, DWB2, DWB2 + VPSHUFB DWBSWAP, DWB3, DWB3 + + mulRound(1) + increment(0) + mulRound(2) + increment(1) + mulRound(3) + increment(2) + mulRound(4) + increment(3) + mulRound(5) + increment(4) + mulRound(6) + increment(5) + mulRound(7) + increment(6) + increment(7) + VPXOR ACC0, 
ACCM, ACCM + VPXOR ACC1, ACCM, ACCM + VPSLLDQ $8, ACCM, T0 + VPSRLDQ $8, ACCM, ACCM + + VPXOR ACCM, ACC1, ACC1 + VPXOR T0, ACC0, ACC0 + + reduceRound(ACC0) + reduceRound(ACC0) + VPXOR ACC1, ACC0, ACC0 + + // XOR plaintext + VMOVDQU (32*0)(ptx), XDWTMP0 + VPXOR XDWTMP0, DWB0, DWB0 + VMOVDQU (32*1)(ptx), XDWTMP0 + VPXOR XDWTMP0, DWB1, DWB1 + VMOVDQU (32*2)(ptx), XDWTMP0 + VPXOR XDWTMP0, DWB2, DWB2 + VMOVDQU (32*3)(ptx), XDWTMP0 + VPXOR XDWTMP0, DWB3, DWB3 + + // Store ciphertext + VMOVDQU DWB0, (32*0)(ctx) + VPSHUFB DWBSWAP, DWB0, DWB0 + VMOVDQU DWB1, (32*1)(ctx) + VPSHUFB DWBSWAP, DWB1, DWB1 + VMOVDQU DWB2, (32*2)(ctx) + VPSHUFB DWBSWAP, DWB2, DWB2 + VMOVDQU DWB3, (32*3)(ctx) + VPSHUFB DWBSWAP, DWB3, DWB3 + + //VPXOR XDWTMP0, XDWTMP0, XDWTMP0 + //VINSERTI128 $0, ACC0, XDWTMP0, XDWTMP0 + //VPXOR XDWTMP0, DWB0, DWB0 + PXOR ACC0, B0 // Can't call VPXOR here + VMOVDQU DWB0, (32*0)(SP) + VMOVDQU DWB1, (32*1)(SP) + VMOVDQU DWB2, (32*2)(SP) + VMOVDQU DWB3, (32*3)(SP) + + LEAQ 128(ptx), ptx + LEAQ 128(ctx), ctx + + JMP avx2GcmSm4EncOctetsLoop + +avx2GcmSm4EncOctetsEnd: + VMOVDQU (16*0)(SP), T0 + VMOVDQU (16*0)(pTbl), ACC0 + VMOVDQU (16*1)(pTbl), ACCM + VMOVDQU ACC0, ACC1 + VPSHUFD $78, T0, T1 + VPXOR T0, T1, T1 + PCLMULQDQ $0x00, T0, ACC0 + PCLMULQDQ $0x11, T0, ACC1 + PCLMULQDQ $0x00, T1, ACCM + + mulRound(1) + mulRound(2) + mulRound(3) + mulRound(4) + mulRound(5) + mulRound(6) + mulRound(7) + + VPXOR ACC0, ACCM, ACCM + VPXOR ACC1, ACCM, ACCM + VPSLLDQ $8, ACCM, T0 + VPSRLDQ $8, ACCM, ACCM + + VPXOR ACCM, ACC1, ACC1 + VPXOR T0, ACC0, ACC0 + + reduceRound(ACC0) + reduceRound(ACC0) + VPXOR ACC1, ACC0, ACC0 + + TESTQ ptxLen, ptxLen + JE avx2GcmSm4EncDone + + SUBQ $4, aluCTR + +avx2GcmSm4EncNibbles: + CMPQ ptxLen, $64 + JB avx2GcmSm4EncSingles + SUBQ $64, ptxLen + + VMOVDQU (8*16 + 0*16)(SP), B0 + VMOVDQU (8*16 + 1*16)(SP), B1 + VMOVDQU (8*16 + 2*16)(SP), B2 + VMOVDQU (8*16 + 3*16)(SP), B3 + + VMOVDQU flipMask<>(SB), B4 + VPSHUFB B4, B0, B0 + VPSHUFB B4, B1, B1 + VPSHUFB B4, B2, B2 + VPSHUFB B4, B3, B3 + + TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1) + XORL BX, BX + VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK + +avx2GcmSm4Enc4Loop2: + AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3) + AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0) + AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1) + AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2) + + ADDL $16, BX + CMPL BX, $4*32 + JB avx2GcmSm4Enc4Loop2 + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5) + VPSHUFB BSWAP, B0, B0 + VPSHUFB BSWAP, B1, B1 + VPSHUFB BSWAP, B2, B2 + VPSHUFB BSWAP, B3, B3 + + VMOVDQU (16*0)(ptx), T0 + VPXOR T0, B0, B0 + VMOVDQU (16*1)(ptx), T0 + VPXOR T0, B1, B1 + VMOVDQU (16*2)(ptx), T0 + VPXOR T0, B2, B2 + VMOVDQU (16*3)(ptx), T0 + VPXOR T0, B3, B3 + + VMOVDQU B0, (16*0)(ctx) + VMOVDQU B1, (16*1)(ctx) + VMOVDQU B2, (16*2)(ctx) + VMOVDQU B3, (16*3)(ctx) + + VMOVDQU (16*14)(pTbl), T2 + gcmEncDataStep(B0) + gcmEncDataStep(B1) + gcmEncDataStep(B2) + gcmEncDataStep(B3) + increment(0) + increment(1) + increment(2) + increment(3) + + LEAQ 64(ptx), ptx + LEAQ 64(ctx), ctx + +avx2GcmSm4EncSingles: + TESTQ ptxLen, ptxLen + JE avx2GcmSm4EncDone + + VMOVDQU (8*16 + 0*16)(SP), B0 + VMOVDQU (8*16 + 1*16)(SP), B1 + VMOVDQU (8*16 + 2*16)(SP), B2 + VMOVDQU (8*16 + 3*16)(SP), B3 + + VMOVDQU flipMask<>(SB), B4 + VPSHUFB B4, B0, B0 + VPSHUFB B4, B1, B1 + VPSHUFB B4, B2, B2 + VPSHUFB B4, B3, B3 + + TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1) + XORL BX, BX + VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK + +avx2GcmSm4Enc4Loop1: 
+ AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3) + AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0) + AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1) + AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2) + + ADDL $16, BX + CMPL BX, $4*32 + JB avx2GcmSm4Enc4Loop1 + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5) + VPSHUFB BSWAP, B0, B0 + VPSHUFB BSWAP, B1, B1 + VPSHUFB BSWAP, B2, B2 + VPSHUFB BSWAP, B3, B3 + + VMOVDQU B0, (16*0)(SP) + VMOVDQU B1, (16*1)(SP) + VMOVDQU B2, (16*2)(SP) + VMOVDQU B3, (16*3)(SP) + + VMOVDQU (16*14)(pTbl), T2 + MOVQ SP, BP + +avx2GcmSm4EncSinglesLoop: + CMPQ ptxLen, $16 + JB avx2GcmSm4EncTail + SUBQ $16, ptxLen + VMOVDQU (16*0)(BP), B0 + VMOVDQU (ptx), T0 + VPXOR T0, B0, B0 + VMOVDQU B0, (ctx) + gcmEncDataStep(B0) + LEAQ (16*1)(ptx), ptx + LEAQ (16*1)(ctx), ctx + ADDQ $16, BP + JMP avx2GcmSm4EncSinglesLoop + +avx2GcmSm4EncTail: + TESTQ ptxLen, ptxLen + JE avx2GcmSm4EncDone + VMOVDQU (16*0)(BP), B0 + VMOVDQU B0, T0 + + LEAQ -1(ptx)(ptxLen*1), ptx + + MOVQ ptxLen, aluTMP + SHLQ $4, aluTMP + + LEAQ andMask<>(SB), aluCTR + VMOVDQU -16(aluCTR)(aluTMP*1), T1 + VPXOR B0, B0, B0 + +avx2PtxLoadLoop: + PSLLDQ $1, B0 + PINSRB $0, (ptx), B0 + LEAQ -1(ptx), ptx + DECQ ptxLen + JNE avx2PtxLoadLoop + + VPXOR T0, B0, B0 + VPAND T1, B0, B0 + VMOVDQU B0, (ctx) // I assume there is always space, due to TAG in the end of the CT + gcmEncDataStep(B0) + +avx2GcmSm4EncDone: + VMOVDQU ACC0, (tPtr) + VZEROUPPER + RET + +#undef increment + +// func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) +TEXT ·gcmSm4Dec(SB),0,$128-96 +#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP) + +#define decMulRound(i) \ + MOVOU (16*(i*2))(pTbl), T1;\ + MOVOU T1, T2;\ + MOVOU (16*i)(ctx), T0;\ + PSHUFB BSWAP, T0;\ + PCLMULQDQ $0x00, T0, T1;\ + PXOR T1, ACC0;\ + PSHUFD $78, T0, T1;\ + PCLMULQDQ $0x11, T0, T2;\ + PXOR T1, T0;\ + PXOR T2, ACC1;\ + MOVOU (16*(i*2+1))(pTbl), T2;\ + PCLMULQDQ $0x00, T2, T0;\ + PXOR T0, ACCM + +#define decGhashRound(i) \ + MOVOU (16*i)(ctx), B0; \ + PSHUFB BSWAP, B0; \ + PXOR ACC0, B0; \ + MOVOU T2, ACC0; \ + MOVOU T2, ACC1; \ + MOVOU (16*15)(pTbl), ACCM; \ + PCLMULQDQ $0x00, B0, ACC0; \ + PCLMULQDQ $0x11, B0, ACC1; \ + PSHUFD $78, B0, T0; \ + PXOR B0, T0; \ + PCLMULQDQ $0x00, T0, ACCM; \ + PXOR ACC0, ACCM; \ + PXOR ACC1, ACCM; \ + MOVOU ACCM, T0; \ + PSRLDQ $8, ACCM; \ + PSLLDQ $8, T0; \ + PXOR ACCM, ACC1; \ + PXOR T0, ACC0; \ + reduceRound(ACC0); \ + reduceRound(ACC0); \ + PXOR ACC1, ACC0 + + MOVQ productTable+0(FP), pTbl + MOVQ dst+8(FP), ptx + MOVQ src_base+32(FP), ctx + MOVQ src_len+40(FP), ptxLen + MOVQ ctr+56(FP), ctrPtr + MOVQ T+64(FP), tPtr + MOVQ rk_base+72(FP), rk + + CMPB ·useAVX2(SB), $1 + JE avx2GcmSm4Dec + + MOVOU bswapMask<>(SB), BSWAP + MOVOU gcmPoly<>(SB), POLY + + MOVOU (tPtr), ACC0 + PXOR ACC1, ACC1 + PXOR ACCM, ACCM + MOVOU (ctrPtr), T0 + MOVL (3*4)(ctrPtr), aluCTR + BSWAPL aluCTR + + MOVOU T0, (0*16)(SP) + increment(0) + MOVOU T0, (1*16)(SP) + increment(1) + MOVOU T0, (2*16)(SP) + increment(2) + MOVOU T0, (3*16)(SP) + increment(3) + + CMPQ ptxLen, $128 + JB gcmSm4DecNibbles + + // We have at least 8 blocks to dencrypt, prepare the rest of the counters + MOVOU T0, (4*16)(SP) + increment(4) + MOVOU T0, (5*16)(SP) + increment(5) + MOVOU T0, (6*16)(SP) + increment(6) + MOVOU T0, (7*16)(SP) + increment(7) + +gcmSm4DecOctetsLoop: + CMPQ ptxLen, $128 + JB gcmSm4DecEndOctets + SUBQ $128, ptxLen + + MOVOU (0*16)(SP), B0 + MOVOU 
(1*16)(SP), B1 + MOVOU (2*16)(SP), B2 + MOVOU (3*16)(SP), B3 + MOVOU (4*16)(SP), B4 + MOVOU (5*16)(SP), B5 + MOVOU (6*16)(SP), B6 + MOVOU (7*16)(SP), B7 + + MOVOU (16*0)(ctx), T0 + PSHUFB BSWAP, T0 + PXOR ACC0, T0 + PSHUFD $78, T0, T1 + PXOR T0, T1 + + MOVOU (16*0)(pTbl), ACC0 + MOVOU (16*1)(pTbl), ACCM + MOVOU ACC0, ACC1 + + PCLMULQDQ $0x00, T1, ACCM + PCLMULQDQ $0x00, T0, ACC0 + PCLMULQDQ $0x11, T0, ACC1 + + SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3) + decMulRound(1) + increment(0) + decMulRound(2) + increment(1) + decMulRound(3) + increment(2) + decMulRound(4) + increment(3) + SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7) + decMulRound(5) + increment(4) + decMulRound(6) + increment(5) + decMulRound(7) + increment(6) + increment(7) + + PXOR ACC0, ACCM + PXOR ACC1, ACCM + MOVOU ACCM, T0 + PSRLDQ $8, ACCM + PSLLDQ $8, T0 + PXOR ACCM, ACC1 + PXOR T0, ACC0 + + reduceRound(ACC0) + reduceRound(ACC0) + PXOR ACC1, ACC0 + + MOVOU (16*0)(ctx), T0 + PXOR T0, B0 + MOVOU (16*1)(ctx), T0 + PXOR T0, B1 + MOVOU (16*2)(ctx), T0 + PXOR T0, B2 + MOVOU (16*3)(ctx), T0 + PXOR T0, B3 + MOVOU (16*4)(ctx), T0 + PXOR T0, B4 + MOVOU (16*5)(ctx), T0 + PXOR T0, B5 + MOVOU (16*6)(ctx), T0 + PXOR T0, B6 + MOVOU (16*7)(ctx), T0 + PXOR T0, B7 + + MOVOU B0, (16*0)(ptx) + MOVOU B1, (16*1)(ptx) + MOVOU B2, (16*2)(ptx) + MOVOU B3, (16*3)(ptx) + MOVOU B4, (16*4)(ptx) + MOVOU B5, (16*5)(ptx) + MOVOU B6, (16*6)(ptx) + MOVOU B7, (16*7)(ptx) + + LEAQ 128(ptx), ptx + LEAQ 128(ctx), ctx + + JMP gcmSm4DecOctetsLoop + +gcmSm4DecEndOctets: + SUBQ $4, aluCTR + +gcmSm4DecNibbles: + CMPQ ptxLen, $64 + JB gcmSm4DecSingles + SUBQ $64, ptxLen + + MOVOU (0*16)(SP), B0 + MOVOU (1*16)(SP), B1 + MOVOU (2*16)(SP), B2 + MOVOU (3*16)(SP), B3 + + SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3) + MOVOU (16*14)(pTbl), T2 + MOVOU (16*0)(ctx), T0 + PXOR T0, B0 + MOVOU (16*1)(ctx), T0 + PXOR T0, B1 + MOVOU (16*2)(ctx), T0 + PXOR T0, B2 + MOVOU (16*3)(ctx), T0 + PXOR T0, B3 + + MOVOU B0, (16*0)(ptx) + MOVOU B1, (16*1)(ptx) + MOVOU B2, (16*2)(ptx) + MOVOU B3, (16*3)(ptx) + + + decGhashRound(0) + increment(0) + decGhashRound(1) + increment(1) + decGhashRound(2) + increment(2) + decGhashRound(3) + increment(3) + + LEAQ 64(ptx), ptx + LEAQ 64(ctx), ctx + +gcmSm4DecSingles: + TESTQ ptxLen, ptxLen + JE gcmSm4DecDone + MOVOU (0*16)(SP), B0 + MOVOU (1*16)(SP), B1 + MOVOU (2*16)(SP), B2 + MOVOU (3*16)(SP), B3 + + SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3) + MOVOU B0, (16*4)(SP) + MOVOU B1, (16*5)(SP) + MOVOU B2, (16*6)(SP) + MOVOU B3, (16*7)(SP) + + MOVOU (16*14)(pTbl), T2 + MOVQ SP, BP + ADDQ $64, BP + +gcmSm4DecSinglesLoop: + CMPQ ptxLen, $16 + JB gcmSm4DecTail + SUBQ $16, ptxLen + + MOVOU (16*0)(BP), B0 + MOVOU (ctx), T0 + PXOR T0, B0 + MOVOU B0, (ptx) + + decGhashRound(0) + LEAQ (16*1)(ptx), ptx + LEAQ (16*1)(ctx), ctx + ADDQ $16, BP + JMP gcmSm4DecSinglesLoop + +gcmSm4DecTail: + TESTQ ptxLen, ptxLen + JE gcmSm4DecDone + + MOVQ ptxLen, aluTMP + SHLQ $4, aluTMP + LEAQ andMask<>(SB), aluCTR + MOVOU -16(aluCTR)(aluTMP*1), T1 + + MOVOU (ctx), B0 // I assume there is TAG attached to the ctx, and there is no read overflow + PAND T1, B0 + + MOVOU B0, T1 + PSHUFB BSWAP, B0 + PXOR ACC0, B0 + + MOVOU (16*14)(pTbl), ACC0 + MOVOU (16*15)(pTbl), ACCM + MOVOU ACC0, ACC1 + + PCLMULQDQ $0x00, B0, ACC0 + PCLMULQDQ $0x11, B0, ACC1 + PSHUFD $78, B0, T0 + PXOR B0, T0 + PCLMULQDQ $0x00, T0, ACCM + + PXOR ACC0, ACCM + PXOR ACC1, ACCM + MOVOU ACCM, T0 + PSRLDQ $8, ACCM + PSLLDQ $8, T0 + PXOR ACCM, ACC1 + PXOR T0, ACC0 + + reduceRound(ACC0) + reduceRound(ACC0) + PXOR 
ACC1, ACC0 + + MOVOU (16*0)(BP), B0 + PXOR T1, B0 + +ptxStoreLoop: + PEXTRB $0, B0, (ptx) + PSRLDQ $1, B0 + LEAQ 1(ptx), ptx + DECQ ptxLen + + JNE ptxStoreLoop + +gcmSm4DecDone: + MOVOU ACC0, (tPtr) + RET + +avx2GcmSm4Dec: + VMOVDQU bswapMask<>(SB), BSWAP + VMOVDQU gcmPoly<>(SB), POLY + + VMOVDQU (tPtr), ACC0 + VPXOR ACC1, ACC1, ACC1 + VPXOR ACCM, ACCM, ACCM + VMOVDQU (ctrPtr), T0 + MOVL (3*4)(ctrPtr), aluCTR + BSWAPL aluCTR + + VMOVDQU T0, (0*16)(SP) + increment(0) + VMOVDQU T0, (1*16)(SP) + increment(1) + VMOVDQU T0, (2*16)(SP) + increment(2) + VMOVDQU T0, (3*16)(SP) + increment(3) + + CMPQ ptxLen, $128 + JB avx2GcmSm4DecNibbles + + // We have at least 8 blocks to dencrypt, prepare the rest of the counters + VMOVDQU T0, (4*16)(SP) + increment(4) + VMOVDQU T0, (5*16)(SP) + increment(5) + VMOVDQU T0, (6*16)(SP) + increment(6) + VMOVDQU T0, (7*16)(SP) + increment(7) + +avx2GcmSm4DecOctetsLoop: + CMPQ ptxLen, $128 + JB avx2GcmSm4DecEndOctets + SUBQ $128, ptxLen + + // load 8 ctrs for encryption + VMOVDQU (0*32)(SP), DWB0 + VMOVDQU (1*32)(SP), DWB1 + VMOVDQU (2*32)(SP), DWB2 + VMOVDQU (3*32)(SP), DWB3 + + VBROADCASTI128 flipMask<>(SB), XDWTMP0 + // Apply Byte Flip Mask: LE -> BE + VPSHUFB XDWTMP0, DWB0, DWB0 + VPSHUFB XDWTMP0, DWB1, DWB1 + VPSHUFB XDWTMP0, DWB2, DWB2 + VPSHUFB XDWTMP0, DWB3, DWB3 + + VMOVDQU (16*0)(ctx), T0 + VPSHUFB BSWAP, T0, T0 + VPXOR ACC0, T0, T0 + VPSHUFD $78, T0, T1 + VPXOR T0, T1, T1 + + VMOVDQU (16*0)(pTbl), ACC0 + VMOVDQU (16*1)(pTbl), ACCM + VMOVDQU ACC0, ACC1 + + PCLMULQDQ $0x00, T1, ACCM + PCLMULQDQ $0x00, T0, ACC0 + PCLMULQDQ $0x11, T0, ACC1 + + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1) + XORL BX, BX + VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK + +avx2GcmSm4Dec8Loop2: + AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3) + AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0) + AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1) + AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2) + + ADDL $16, BX + CMPL BX, $4*32 + JB avx2GcmSm4Dec8Loop2 + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1) + + VBROADCASTI128 bswapMask<>(SB), DWBSWAP + VPSHUFB DWBSWAP, DWB0, DWB0 + VPSHUFB DWBSWAP, DWB1, DWB1 + VPSHUFB DWBSWAP, DWB2, DWB2 + VPSHUFB DWBSWAP, DWB3, DWB3 + decMulRound(1) + increment(0) + decMulRound(2) + increment(1) + decMulRound(3) + increment(2) + decMulRound(4) + increment(3) + decMulRound(5) + increment(4) + decMulRound(6) + increment(5) + decMulRound(7) + increment(6) + increment(7) + + VPXOR ACC0, ACCM, ACCM + VPXOR ACC1, ACCM, ACCM + VPSLLDQ $8, ACCM, T0 + VPSRLDQ $8, ACCM, ACCM + + VPXOR ACCM, ACC1, ACC1 + VPXOR T0, ACC0, ACC0 + + reduceRound(ACC0) + reduceRound(ACC0) + VPXOR ACC1, ACC0, ACC0 + + VMOVDQU (32*0)(ctx), XDWTMP0 + VPXOR XDWTMP0, DWB0, DWB0 + VMOVDQU (32*1)(ctx), XDWTMP0 + VPXOR XDWTMP0, DWB1, DWB1 + VMOVDQU (32*2)(ctx), XDWTMP0 + VPXOR XDWTMP0, DWB2, DWB2 + VMOVDQU (32*3)(ctx), XDWTMP0 + VPXOR XDWTMP0, DWB3, DWB3 + + VMOVDQU DWB0, (32*0)(ptx) + VMOVDQU DWB1, (32*1)(ptx) + VMOVDQU DWB2, (32*2)(ptx) + VMOVDQU DWB3, (32*3)(ptx) + + LEAQ 128(ptx), ptx + LEAQ 128(ctx), ctx + + JMP avx2GcmSm4DecOctetsLoop + +avx2GcmSm4DecEndOctets: + SUBQ $4, aluCTR + +avx2GcmSm4DecNibbles: + CMPQ ptxLen, $64 + JB avx2GcmSm4DecSingles + SUBQ $64, ptxLen + + VMOVDQU (0*16)(SP), B0 + VMOVDQU (1*16)(SP), B1 + VMOVDQU (2*16)(SP), B2 + 
VMOVDQU (3*16)(SP), B3 + VMOVDQU flipMask<>(SB), B4 + VPSHUFB B4, B0, B0 + VPSHUFB B4, B1, B1 + VPSHUFB B4, B2, B2 + VPSHUFB B4, B3, B3 + + TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1) + XORL BX, BX + VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK + +avx2GcmSm4Dec4Loop2: + AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3) + AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0) + AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1) + AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2) + + ADDL $16, BX + CMPL BX, $4*32 + JB avx2GcmSm4Dec4Loop2 + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5) + VPSHUFB BSWAP, B0, B0 + VPSHUFB BSWAP, B1, B1 + VPSHUFB BSWAP, B2, B2 + VPSHUFB BSWAP, B3, B3 + + VMOVDQU (16*14)(pTbl), T2 + VMOVDQU (16*0)(ctx), T0 + VPXOR T0, B0, B0 + VMOVDQU (16*1)(ctx), T0 + VPXOR T0, B1, B1 + VMOVDQU (16*2)(ctx), T0 + VPXOR T0, B2, B2 + VMOVDQU (16*3)(ctx), T0 + VPXOR T0, B3, B3 + + VMOVDQU B0, (16*0)(ptx) + VMOVDQU B1, (16*1)(ptx) + VMOVDQU B2, (16*2)(ptx) + VMOVDQU B3, (16*3)(ptx) + + decGhashRound(0) + increment(0) + decGhashRound(1) + increment(1) + decGhashRound(2) + increment(2) + decGhashRound(3) + increment(3) + + LEAQ 64(ptx), ptx + LEAQ 64(ctx), ctx + +avx2GcmSm4DecSingles: + TESTQ ptxLen, ptxLen + JE avx2GcmSm4DecDone + VMOVDQU (0*16)(SP), B0 + VMOVDQU (1*16)(SP), B1 + VMOVDQU (2*16)(SP), B2 + VMOVDQU (3*16)(SP), B3 + + VMOVDQU flipMask<>(SB), B4 + VPSHUFB B4, B0, B0 + VPSHUFB B4, B1, B1 + VPSHUFB B4, B2, B2 + VPSHUFB B4, B3, B3 + + TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1) + XORL BX, BX + VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK + +avx2GcmSm4Dec4Loop1: + AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3) + AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0) + AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1) + AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2) + + ADDL $16, BX + CMPL BX, $4*32 + JB avx2GcmSm4Dec4Loop1 + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5) + VPSHUFB BSWAP, B0, B0 + VPSHUFB BSWAP, B1, B1 + VPSHUFB BSWAP, B2, B2 + VPSHUFB BSWAP, B3, B3 + + VMOVDQU B0, (16*4)(SP) + VMOVDQU B1, (16*5)(SP) + VMOVDQU B2, (16*6)(SP) + VMOVDQU B3, (16*7)(SP) + + VMOVDQU (16*14)(pTbl), T2 + MOVQ SP, BP + ADDQ $64, BP + +avx2GcmSm4DecSinglesLoop: + CMPQ ptxLen, $16 + JB avx2GcmSm4DecTail + SUBQ $16, ptxLen + + VMOVDQU (16*0)(BP), B0 + VMOVDQU (ctx), T0 + VPXOR T0, B0, B0 + VMOVDQU B0, (ptx) + + decGhashRound(0) + LEAQ (16*1)(ptx), ptx + LEAQ (16*1)(ctx), ctx + ADDQ $16, BP + JMP avx2GcmSm4DecSinglesLoop + +avx2GcmSm4DecTail: + TESTQ ptxLen, ptxLen + JE avx2GcmSm4DecDone + + MOVQ ptxLen, aluTMP + SHLQ $4, aluTMP + LEAQ andMask<>(SB), aluCTR + VMOVDQU -16(aluCTR)(aluTMP*1), T1 + + VMOVDQU (ctx), B0 // I assume there is TAG attached to the ctx, and there is no read overflow + VPAND T1, B0, B0 + + VMOVDQU B0, T1 + VPSHUFB BSWAP, B0, B0 + VPXOR ACC0, B0, B0 + + VMOVDQU (16*14)(pTbl), ACC0 + VMOVDQU (16*15)(pTbl), ACCM + VMOVDQU ACC0, ACC1 + + PCLMULQDQ $0x00, B0, ACC0 + PCLMULQDQ $0x11, B0, ACC1 + VPSHUFD $78, B0, T0 + VPXOR B0, T0, T0 + PCLMULQDQ $0x00, T0, ACCM + + VPXOR ACC0, ACCM, ACCM + VPXOR ACC1, ACCM, ACCM + VPSLLDQ $8, ACCM, T0 + VPSRLDQ $8, ACCM, ACCM + + VPXOR ACCM, ACC1, ACC1 + VPXOR T0, ACC0, ACC0 + + reduceRound(ACC0) + reduceRound(ACC0) + VPXOR ACC1, ACC0, ACC0 + + VMOVDQU (16*0)(BP), B0 + VPXOR T1, B0, B0 + +avx2PtxStoreLoop: + PEXTRB $0, B0, (ptx) + PSRLDQ $1, B0 + LEAQ 1(ptx), ptx + DECQ ptxLen + + JNE avx2PtxStoreLoop + +avx2GcmSm4DecDone: + VMOVDQU ACC0, (tPtr) + VZEROUPPER + RET 
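
A note on the 4-block layout used throughout the assembly above: SSE_TRANSPOSE_MATRIX and TRANSPOSE_MATRIX gather word j of each of the four counter blocks (four pairs of blocks in the AVX2 path) into a single register, so every SM4 round operates on the same word position of all blocks at once; the same transpose is applied again after the 32 rounds, before the byte swap, to restore the natural block layout. A minimal pure-Go sketch of that 4x4 word transpose, for orientation only (the helper name is hypothetical, not part of the package):

package main

import "fmt"

// transpose4x4 mirrors what SSE_TRANSPOSE_MATRIX / TRANSPOSE_MATRIX achieve
// with PUNPCK*/VPUNPCK* shuffles: on input r[i] holds the four 32-bit words
// of block i; on output r[j] holds word j of blocks 0..3, matching register
// comments such as "r0 = [w12, w8, w4, w0]".
func transpose4x4(r *[4][4]uint32) {
	var t [4][4]uint32
	for i := 0; i < 4; i++ {
		for j := 0; j < 4; j++ {
			t[j][i] = r[i][j]
		}
	}
	*r = t
}

func main() {
	blocks := [4][4]uint32{
		{0, 1, 2, 3},     // block 0: w0..w3
		{4, 5, 6, 7},     // block 1: w4..w7
		{8, 9, 10, 11},   // block 2: w8..w11
		{12, 13, 14, 15}, // block 3: w12..w15
	}
	transpose4x4(&blocks)
	fmt.Println(blocks[0]) // [0 4 8 12] -> word 0 of each block
}
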
diff --git a/sm4/gcm_amd64_test.go b/sm4/gcm_amd64_test.go new file mode 100644 index 0000000..a1d9f0b --- /dev/null +++ b/sm4/gcm_amd64_test.go @@ -0,0 +1,123 @@ +//go:build amd64 +// +build amd64 + +package sm4 + +import ( + "encoding/hex" + "testing" +) + +func createGcm() *gcmAsm { + key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10} + c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, 4, 64} + expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0]) + c1 := &sm4CipherGCM{c} + g := &gcmAsm{} + g.cipher = &c1.sm4CipherAsm + g.tagSize = 16 + gcmSm4Init(&g.bytesProductTable, g.cipher.enc) + return g +} + +var sm4GCMTests = []struct { + plaintext string +}{ + { // case 0: < 16 + "abcdefg", + }, + { // case 1: = 16 + "abcdefgabcdefghg", + }, + { // case 2: > 16 , < 64 + "abcdefgabcdefghgabcdefgabcdefghgaaa", + }, + { // case 3: = 64 + "abcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghg", + }, + { // case 4: > 64, < 128 + "abcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgaaa", + }, + { // case 5: = 128 + "abcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghg", + }, + { // case 6: 227 > 128, < 256, 128 + 64 + 35 + "abcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgaaa", + }, + { // case 7: = 256 + "abcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghg", + }, + { // case 8: > 256, = 355 + "abcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgabcdefgabcdefghgaaa", + }, +} + +func initCounter(i byte, counter *[16]byte) { + copy(counter[:], []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}) + counter[gcmBlockSize-1] = i +} + +func resetTag(tag *[16]byte) { + for j := 0; j < 16; j++ { + tag[j] = 0 + } +} + +func TestGcmSm4Enc(t *testing.T) { + var counter1, counter2 [16]byte + gcm := createGcm() + var tagOut1, tagOut2 [gcmTagSize]byte + + for i, test := range sm4GCMTests { + initCounter(2, &counter1) + initCounter(1, &counter2) + + gcmSm4Data(&gcm.bytesProductTable, []byte("emmansun"), &tagOut1) + out1 := make([]byte, len(test.plaintext)+gcm.tagSize) + gcm.counterCrypt(out1, []byte(test.plaintext), &counter1) + gcmSm4Data(&gcm.bytesProductTable, out1[:len(test.plaintext)], &tagOut1) + + out2 := make([]byte, len(test.plaintext)+gcm.tagSize) + gcmSm4Data(&gcm.bytesProductTable, []byte("emmansun"), &tagOut2) + gcmSm4Enc(&gcm.bytesProductTable, out2, []byte(test.plaintext), &counter2, &tagOut2, gcm.cipher.enc) + if hex.EncodeToString(out1) != hex.EncodeToString(out2) { + t.Errorf("#%d: out expected %s, got %s", i, hex.EncodeToString(out1), hex.EncodeToString(out2)) + } + if hex.EncodeToString(tagOut1[:]) != hex.EncodeToString(tagOut2[:]) { + t.Errorf("#%d: tag expected %s, got %s", i, hex.EncodeToString(tagOut1[:]), 
hex.EncodeToString(tagOut2[:])) + } + resetTag(&tagOut1) + resetTag(&tagOut2) + } +} + +func TestGcmSm4Dec(t *testing.T) { + var counter1, counter2 [16]byte + gcm := createGcm() + var tagOut1, tagOut2 [gcmTagSize]byte + + for i, test := range sm4GCMTests { + initCounter(2, &counter1) + initCounter(1, &counter2) + + gcmSm4Data(&gcm.bytesProductTable, []byte("emmansun"), &tagOut1) + out1 := make([]byte, len(test.plaintext)+gcm.tagSize) + gcm.counterCrypt(out1, []byte(test.plaintext), &counter1) + gcmSm4Data(&gcm.bytesProductTable, out1[:len(test.plaintext)], &tagOut1) + + out1 = out1[:len(test.plaintext)] + + out2 := make([]byte, len(test.plaintext)+gcm.tagSize) + gcmSm4Data(&gcm.bytesProductTable, []byte("emmansun"), &tagOut2) + gcmSm4Dec(&gcm.bytesProductTable, out2, out1, &counter2, &tagOut2, gcm.cipher.enc) + + if hex.EncodeToString([]byte(test.plaintext)) != hex.EncodeToString(out2[:len(test.plaintext)]) { + t.Errorf("#%d: out expected %s, got %s", i, hex.EncodeToString([]byte(test.plaintext)), hex.EncodeToString(out2[:len(test.plaintext)])) + } + if hex.EncodeToString(tagOut1[:]) != hex.EncodeToString(tagOut2[:]) { + t.Errorf("#%d: tag expected %s, got %s", i, hex.EncodeToString(tagOut1[:]), hex.EncodeToString(tagOut2[:])) + } + resetTag(&tagOut1) + resetTag(&tagOut2) + } +} diff --git a/sm4/sm4_gcm_amd64.go b/sm4/sm4_gcm_amd64.go index 378dc41..0704989 100644 --- a/sm4/sm4_gcm_amd64.go +++ b/sm4/sm4_gcm_amd64.go @@ -23,6 +23,12 @@ var _ gcmAble = (*sm4CipherGCM)(nil) //go:noescape func gcmSm4Init(productTable *[256]byte, rk []uint32) +//go:noescape +func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) + +//go:noescape +func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) + //go:noescape func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte) @@ -76,10 +82,8 @@ func (g *gcmAsm) Seal(dst, nonce, plaintext, data []byte) []byte { } g.cipher.Encrypt(tagMask[:], counter[:]) - gcmInc32(&counter) var tagOut [gcmTagSize]byte - gcmSm4Data(&g.bytesProductTable, data, &tagOut) ret, out := subtle.SliceForAppend(dst, len(plaintext)+g.tagSize) @@ -88,8 +92,7 @@ func (g *gcmAsm) Seal(dst, nonce, plaintext, data []byte) []byte { } if len(plaintext) > 0 { - g.counterCrypt(out, plaintext, &counter) - gcmSm4Data(&g.bytesProductTable, out[:len(plaintext)], &tagOut) + gcmSm4Enc(&g.bytesProductTable, out, plaintext, &counter, &tagOut, g.cipher.enc) } gcmSm4Finish(&g.bytesProductTable, &tagMask, &tagOut, uint64(len(plaintext)), uint64(len(data))) copy(out[len(plaintext):], tagOut[:]) @@ -133,7 +136,6 @@ func (g *gcmAsm) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) { } g.cipher.Encrypt(tagMask[:], counter[:]) - gcmInc32(&counter) var expectedTag [gcmTagSize]byte gcmSm4Data(&g.bytesProductTable, data, &expectedTag) @@ -143,7 +145,7 @@ func (g *gcmAsm) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) { panic("cipher: invalid buffer overlap") } if len(ciphertext) > 0 { - gcmSm4Data(&g.bytesProductTable, ciphertext, &expectedTag) + gcmSm4Dec(&g.bytesProductTable, out, ciphertext, &counter, &expectedTag, g.cipher.enc) } gcmSm4Finish(&g.bytesProductTable, &tagMask, &expectedTag, uint64(len(ciphertext)), uint64(len(data))) @@ -153,8 +155,5 @@ func (g *gcmAsm) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) { } return nil, errOpen } - - g.counterCrypt(out, ciphertext, &counter) - return ret, nil }
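
For orientation on the Go-side change: Seal and Open now make a single call into gcmSm4Enc / gcmSm4Dec instead of counterCrypt plus gcmSm4Data, and they no longer call gcmInc32 first, since the assembly advances the counter itself before using it for the first block (which is also why the test seeds the fused path with a counter one lower than the counterCrypt path). Below is a rough pure-Go sketch of the semantics the new test checks gcmSm4Enc against; encBlock and ghash are hypothetical stand-ins for the SM4 block function and the productTable-based GHASH update, not real package APIs.

package sm4ref

import "encoding/binary"

// gcmSm4EncRef sketches what the gcmSm4Enc assembly fuses into one pass
// (processed in eight-block, 128-byte batches where possible): counter-mode
// encryption of src into dst, plus a GHASH update over the ciphertext. The
// counter's last four bytes are a big-endian uint32, incremented before each
// block is used, as the increment(i) macro does.
func gcmSm4EncRef(dst, src []byte, ctr, tag *[16]byte,
	encBlock func(dst, src *[16]byte), // hypothetical: one SM4 block encryption
	ghash func(tag *[16]byte, data []byte), // hypothetical: gcmSm4Data equivalent
) {
	var ks [16]byte
	for off := 0; off < len(src); off += 16 {
		n := binary.BigEndian.Uint32(ctr[12:])
		binary.BigEndian.PutUint32(ctr[12:], n+1)
		encBlock(&ks, ctr)
		end := off + 16
		if end > len(src) {
			end = len(src)
		}
		for i := off; i < end; i++ {
			dst[i] = src[i] ^ ks[i-off]
		}
	}
	ghash(tag, dst[:len(src)]) // gcmSm4Dec hashes the ciphertext input instead
}
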