gmsm/sm4/gcm_amd64.s

// This is an optimized implementation of SM4-GCM using AES-NI and CLMUL-NI
// The implementation uses some of the optimizations described in:
// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
// Instruction and its Usage for Computing the GCM Mode rev. 2.02
// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
// Hardware
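//
// Overview: this file supplies the amd64 GHASH/CTR glue for SM4-GCM.  GHASH is
// computed with PCLMULQDQ using Karatsuba multiplication over a per-key table
// of powers of H (filled by gcmSm4Init), while the SM4 block function is
// evaluated four or eight blocks at a time, with the S-box realized via
// AESENCLAST plus affine transforms (see SM4_SBOX below).  The bulk
// encrypt/decrypt routines pick between a pure SSE path and an AVX/AVX2 path
// (eight blocks per iteration, 4-block and single-block tails) via ·useAVX2.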
#include "textflag.h"
#define B0 X0
#define B1 X1
#define B2 X2
#define B3 X3
#define B4 X4
#define B5 X5
#define B6 X6
#define B7 X7
#define DWB0 Y0
#define DWB1 Y2
#define DWB2 Y4
#define DWB3 Y6
#define XDWORD Y1
#define YDWORD Y3
#define XDWTMP0 Y5
#define XDWTMP1 Y7
#define ACC0 X8
#define ACC1 X9
#define ACCM X10
#define T0 X11
#define T1 X12
#define T2 X13
#define POLY X14
#define BSWAP X15
#define DWBSWAP Y15
#define NIBBLE_MASK Y11
#define X_NIBBLE_MASK X11
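// Register aliases: B0-B7 hold working blocks (counters, keystream, or data);
// DWB0-DWB3 are their 256-bit counterparts holding two blocks each on the
// AVX2 path.  ACC0/ACC1 accumulate the low/high halves of the 256-bit GHASH
// product and ACCM the middle Karatsuba term; T0-T2 are scratch, POLY holds
// the GCM reduction constant, BSWAP/DWBSWAP the byte-reversal shuffle masks,
// and NIBBLE_MASK/X_NIBBLE_MASK the 0x0F masks used by the S-box transforms.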
// shuffle byte order from LE to BE
DATA flipMask<>+0x00(SB)/8, $0x0405060700010203
DATA flipMask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
//nibble mask
DATA nibbleMask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nibbleMask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
// inverse shift rows
DATA inverseShiftRows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverseShiftRows<>+0x08(SB)/8, $0x0306090C0F020508
// Affine transform 1 (low and high nibbles)
DATA m1Low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1Low<>+0x08(SB)/8, $0x3045F98CEF9A2653
DATA m1High<>+0x00(SB)/8, $0xC35BF46CAF379800
DATA m1High<>+0x08(SB)/8, $0x68F05FC7049C33AB
// Affine transform 2 (low and high nibbles)
DATA m2Low<>+0x00(SB)/8, $0x9A950A05FEF16E61
DATA m2Low<>+0x08(SB)/8, $0x0E019E916A65FAF5
DATA m2High<>+0x00(SB)/8, $0x892D69CD44E0A400
DATA m2High<>+0x08(SB)/8, $0x2C88CC68E14501A5
// left rotations of 32-bit words by 8-bit increments
DATA r08Mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08Mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA r16Mask<>+0x00(SB)/8, $0x0504070601000302
DATA r16Mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
DATA r24Mask<>+0x00(SB)/8, $0x0407060500030201
DATA r24Mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
DATA fkMask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fkMask<>+0x08(SB)/8, $0xb27022dc677d9197
DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
DATA andMask<>+0x08(SB)/8, $0x0000000000000000
DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
DATA andMask<>+0x18(SB)/8, $0x0000000000000000
DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
DATA andMask<>+0x28(SB)/8, $0x0000000000000000
DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
DATA andMask<>+0x38(SB)/8, $0x0000000000000000
DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
DATA andMask<>+0x48(SB)/8, $0x0000000000000000
DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0x58(SB)/8, $0x0000000000000000
DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
DATA andMask<>+0x68(SB)/8, $0x0000000000000000
DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x78(SB)/8, $0x0000000000000000
DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
GLOBL flipMask<>(SB), (NOPTR+RODATA), $16
GLOBL nibbleMask<>(SB), (NOPTR+RODATA), $16
GLOBL inverseShiftRows<>(SB), (NOPTR+RODATA), $16
GLOBL m1Low<>(SB), (NOPTR+RODATA), $16
GLOBL m1High<>(SB), (NOPTR+RODATA), $16
GLOBL m2Low<>(SB), (NOPTR+RODATA), $16
GLOBL m2High<>(SB), (NOPTR+RODATA), $16
GLOBL r08Mask<>(SB), (NOPTR+RODATA), $16
GLOBL r16Mask<>(SB), (NOPTR+RODATA), $16
GLOBL r24Mask<>(SB), (NOPTR+RODATA), $16
GLOBL fkMask<>(SB), (NOPTR+RODATA), $16
GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
GLOBL andMask<>(SB), (NOPTR+RODATA), $240
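// andMask is a table of fifteen 16-byte masks: entry n-1 keeps the low n bytes
// of a block and zeroes the rest.  The tail-handling code indexes it as
// andMask + len*16 - 16 to trim a partial final block.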
// func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
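// Folds the bit lengths of the plaintext and additional data into the
// accumulator, multiplies by H (productTable slots 14/15 hold H and its
// Karatsuba fold), reduces, byte-swaps, and XORs in the tag mask.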
#define pTbl DI
#define tMsk SI
#define tPtr DX
#define plen AX
#define dlen CX
MOVQ productTable+0(FP), pTbl
MOVQ tagMask+8(FP), tMsk
MOVQ T+16(FP), tPtr
MOVQ pLen+24(FP), plen
MOVQ dLen+32(FP), dlen
MOVOU (tPtr), ACC0
MOVOU (tMsk), T2
MOVOU bswapMask<>(SB), BSWAP
MOVOU gcmPoly<>(SB), POLY
SHLQ $3, plen
SHLQ $3, dlen
MOVQ plen, B0
PINSRQ $1, dlen, B0
PXOR ACC0, B0
MOVOU (16*14)(pTbl), ACC0
MOVOU (16*15)(pTbl), ACCM
MOVOU ACC0, ACC1
PCLMULQDQ $0x00, B0, ACC0
PCLMULQDQ $0x11, B0, ACC1
PSHUFD $78, B0, T0
PXOR B0, T0
PCLMULQDQ $0x00, T0, ACCM
PXOR ACC0, ACCM
PXOR ACC1, ACCM
MOVOU ACCM, T0
PSRLDQ $8, ACCM
PSLLDQ $8, T0
PXOR ACCM, ACC1
PXOR T0, ACC0
MOVOU POLY, T0
PCLMULQDQ $0x01, ACC0, T0
PSHUFD $78, ACC0, ACC0
PXOR T0, ACC0
MOVOU POLY, T0
PCLMULQDQ $0x01, ACC0, T0
PSHUFD $78, ACC0, ACC0
PXOR T0, ACC0
PXOR ACC1, ACC0
PSHUFB BSWAP, ACC0
PXOR T2, ACC0
MOVOU ACC0, (tPtr)
RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen
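// SM4_SBOX applies the SM4 S-box to all 16 bytes of x, clobbering y and z.
// The technique is the usual AES-NI one: the m1Low/m1High nibble lookups form
// an input affine transform into the AES field, inverseShiftRows pre-permutes
// the bytes so the ShiftRows step inside AESENCLAST cancels out, AESENCLAST
// supplies the shared GF(2^8) inversion (its output affine and the XOR with
// the round-key operand are, in effect, absorbed into the m2Low/m2High output
// transform), and the second nibble lookup maps back to the SM4 S-box output.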
#define SM4_SBOX(x, y, z) \
; \ //############################# inner affine ############################//
MOVOU x, z; \
PAND nibbleMask<>(SB), z; \ //y = _mm_and_si128(x, c0f);
MOVOU m1Low<>(SB), y; \
PSHUFB z, y; \ //y = _mm_shuffle_epi8(m1l, y);
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
PAND nibbleMask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
MOVOU m1High<>(SB), z; \
PSHUFB x, z; \ //x = _mm_shuffle_epi8(m1h, x);
MOVOU z, x; \ //x = _mm_shuffle_epi8(m1h, x);
PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y;
; \ // inverse ShiftRows
PSHUFB inverseShiftRows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr);
AESENCLAST nibbleMask<>(SB), x; \ // AESNI instruction
; \ //############################# outer affine ############################//
MOVOU x, z; \
PANDN nibbleMask<>(SB), z; \ //z = _mm_andnot_si128(x, c0f);
MOVOU m2Low<>(SB), y; \
PSHUFB z, y; \ //y = _mm_shuffle_epi8(m2l, z)
PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
PAND nibbleMask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
MOVOU m2High<>(SB), z; \
PSHUFB x, z; \
MOVOU z, x; \ //x = _mm_shuffle_epi8(m2h, x)
PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y;
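// SM4_TAO_L1 is the composite transform T (S-box layer followed by the linear
// transform L1), applied to each 32-bit word of x.  The linear part uses the
// identity
//   L1(b) = b ^ rotl2(b ^ rotl8(b) ^ rotl16(b)) ^ rotl24(b)
//         = b ^ rotl2(b) ^ rotl10(b) ^ rotl18(b) ^ rotl24(b)
// with the 8/16/24-bit rotations done as byte shuffles (r08/r16/r24 masks)
// and the 2-bit rotation as a shift/or pair.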
#define SM4_TAO_L1(x, y, z) \
SM4_SBOX(x, y, z); \
; \ //#################### 4 parallel L1 linear transforms ##################//
MOVOU x, y; \
PSHUFB r08Mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08)
PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08)
MOVOU x, z; \
PSHUFB r16Mask<>(SB), z; \
PXOR z, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16)
MOVOU y, z; \
PSLLL $2, z; \
PSRLL $30, y; \
POR z, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30);
MOVOU x, z; \
PSHUFB r24Mask<>(SB), z; \
PXOR y, x; \ //x = x xor y
PXOR z, x //x = x xor y xor _mm_shuffle_epi8(x, r24);
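// SM4_SINGLE_ROUND and SM4_ROUND compute one SM4 round over column-sliced
// state: x = rk[index] ^ t1 ^ t2 ^ t3, then t0 ^= T(x).  SM4_ROUND broadcasts
// the round key to all four lanes (four blocks in parallel); SM4_SINGLE_ROUND
// loads it into lane 0 only and is used by gcmSm4Init, where a single block is
// live.  In pseudocode (for reference only, not assembled), one round is
//   t0 ^= L1(Sbox(rk ^ t1 ^ t2 ^ t3))
// and successive rounds rotate the roles of t0..t3.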
#define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(RK)(IND*1), x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L1(x, y, z); \
PXOR x, t0
#define SM4_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
PINSRD $0, (index * 4)(RK)(IND*1), x; \
PSHUFD $0, x, x; \
PXOR t1, x; \
PXOR t2, x; \
PXOR t3, x; \
SM4_TAO_L1(x, y, z); \
PXOR x, t0
// MOVOU r0, tmp2;
// PUNPCKHDQ r1, tmp2;
// PUNPCKLDQ r1, r0;
// MOVOU r2, tmp1;
// PUNPCKLDQ r3, tmp1;
// PUNPCKHDQ r3, r2;
// MOVOU r0, r1;
// PUNPCKHQDQ tmp1, r1;
// PUNPCKLQDQ tmp1, r0;
// MOVOU tmp2, r3;
// PUNPCKHQDQ r2, r3;
// PUNPCKLQDQ r2, tmp2;
// MOVOU tmp2, r2
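// SSE_TRANSPOSE_MATRIX transposes a 4x4 matrix of 32-bit words held in
// r0..r3, moving elements through the general-purpose scratch register r with
// PEXTRD/PINSRD.  The commented-out PUNPCK sequence above is the equivalent
// shuffle-based formulation.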
#define SSE_TRANSPOSE_MATRIX(r, r0, r1, r2, r3, tmp1, tmp2) \
PEXTRD $2, r0, r; \
PINSRD $0, r, tmp2; \
PEXTRD $2, r1, r; \
PINSRD $1, r, tmp2; \
; \
PEXTRD $3, r0, r; \
PINSRD $2, r, tmp2; \
PEXTRD $3, r1, r; \
PINSRD $3, r, tmp2; \ // tmp2 = [w7, w3, w6, w2]
; \
PEXTRD $1, r0, r; \
PINSRD $2, r, r0; \
PEXTRD $0, r1, r; \
PINSRD $1, r, r0; \
PEXTRD $1, r1, r; \
PINSRD $3, r, r0; \ // r0 = [w5, w1, w4, w0]
; \
PEXTRD $0, r2, r; \
PINSRD $0, r, tmp1; \
PEXTRD $0, r3, r; \
PINSRD $1, r, tmp1; \
PEXTRD $1, r2, r; \
PINSRD $2, r, tmp1; \
PEXTRD $1, r3, r; \
PINSRD $3, r, tmp1; \ // tmp1 = [w13, w9, w12, w8]
; \
PEXTRD $2, r2, r; \
PINSRD $0, r, r2; \
PEXTRD $2, r3, r; \
PINSRD $1, r, r2; \
PEXTRD $3, r2, r; \
PINSRD $2, r, r2; \
PEXTRD $3, r3, r; \
PINSRD $3, r, r2; \ // r2 = [w15, w11, w14, w10]
; \
MOVOU r0, r1; \
PEXTRQ $1, r1, r; \
PINSRQ $0, r, r1; \
PEXTRQ $1, tmp1, r; \
PINSRQ $1, r, r1; \ // r1 = [w13, w9, w5, w1]
; \
PEXTRQ $0, tmp1, r; \
PINSRQ $1, r, r0; \ // r0 = [w12, w8, w4, w0]
; \
MOVOU tmp2, r3; \
PEXTRQ $1, r3, r; \
PINSRQ $0, r, r3; \
PEXTRQ $1, r2, r; \
PINSRQ $1, r, r3; \ // r3 = [w15, w11, w7, w3]
; \
PEXTRQ $0, r2, r; \
PINSRQ $1, r, r2; \
PEXTRQ $0, tmp2, r; \
PINSRQ $0, r, r2
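// SM4_4BLOCKS encrypts four blocks in parallel: byte-swap each 32-bit word
// (flipMask) so the big-endian input words become native integers, transpose
// so t0..t3 each hold one word position of all four blocks, run the 32 rounds
// while stepping IND through the round keys, transpose back, and finally
// PSHUFB with BSWAP (a full 16-byte reversal), which both restores byte order
// and performs SM4's final word reversal.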
#define SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3) \
PSHUFB flipMask<>(SB), t0; \
PSHUFB flipMask<>(SB), t1; \
PSHUFB flipMask<>(SB), t2; \
PSHUFB flipMask<>(SB), t3; \
SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y); \
XORL IND, IND; \
SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
ADDL $16, IND; \
SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
ADDL $16, IND; \
SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
ADDL $16, IND; \
SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
ADDL $16, IND; \
SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
ADDL $16, IND; \
SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
ADDL $16, IND; \
SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
ADDL $16, IND; \
SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y); \
PSHUFB BSWAP, t3; \
PSHUFB BSWAP, t2; \
PSHUFB BSWAP, t1; \
PSHUFB BSWAP, t0
#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2]
VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0]
VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16] tmp1 = [w13, w9, w12, w8]
VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w23, w30, w22, w27, w19, w26, w18] r2 = [w15, w11, w14, w10]
VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, w5, w25, w17, w9, w1] r1 = [w13, w9, w5, w1]
VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0] r0 = [w12, w8, w4, w0]
VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w23, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3]
VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2]
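// The AVX2 variants below work on eight blocks at a time, two per YMM
// register.  Plain AVX2 has no 256-bit AESENCLAST (that needs VAES), so
// AVX2_SM4_SBOX extracts the high 128-bit lane, runs AESENCLAST on each half,
// and re-inserts it; everything else stays in 256-bit VPSHUFB/VPAND/VPXOR.
// The AVX_* variants are the 128-bit (four block) versions used for the tails.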
#define AVX2_SM4_SBOX(x, y, xw, yw, tmp) \
VPAND NIBBLE_MASK, x, tmp; \
VBROADCASTI128 m1Low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND NIBBLE_MASK, x, x; \
VBROADCASTI128 m1High<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x; \
VBROADCASTI128 inverseShiftRows<>(SB), tmp; \
VPSHUFB tmp, x, x; \
VEXTRACTI128 $1, x, yw; \
VAESENCLAST X_NIBBLE_MASK, xw, xw; \
VAESENCLAST X_NIBBLE_MASK, yw, yw; \
VINSERTI128 $1, yw, x, x; \
VPANDN NIBBLE_MASK, x, tmp; \
VBROADCASTI128 m2Low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND NIBBLE_MASK, x, x; \
VBROADCASTI128 m2High<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x
#define AVX2_SM4_TAO_L1(x, y, xw, yw, tmp) \
AVX2_SM4_SBOX(x, y, xw, yw, tmp); \
VBROADCASTI128 r08Mask<>(SB), tmp; \
VPSHUFB tmp, x, y; \
VPXOR x, y, y; \
VBROADCASTI128 r16Mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR tmp, y, y; \
VPSLLD $2, y, tmp; \
VPSRLD $30, y, y; \
VPXOR tmp, y, y; \
VBROADCASTI128 r24Mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR y, x, x; \
VPXOR x, tmp, x
#define AVX2_SM4_ROUND(index, RK, IND, x, y, xw, yw, tmp, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(RK)(IND*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX2_SM4_TAO_L1(x, y, xw, yw, tmp); \
VPXOR x, t0, t0
#define AVX_SM4_SBOX(x, y, tmp) \
VPAND X_NIBBLE_MASK, x, tmp; \
VMOVDQU m1Low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND X_NIBBLE_MASK, x, x; \
VMOVDQU m1High<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x; \
VMOVDQU inverseShiftRows<>(SB), tmp; \
VPSHUFB tmp, x, x; \
VAESENCLAST X_NIBBLE_MASK, x, x; \
VPANDN X_NIBBLE_MASK, x, tmp; \
VMOVDQU m2Low<>(SB), y; \
VPSHUFB tmp, y, y; \
VPSRLQ $4, x, x; \
VPAND X_NIBBLE_MASK, x, x; \
VMOVDQU m2High<>(SB), tmp; \
VPSHUFB x, tmp, x; \
VPXOR y, x, x
#define AVX_SM4_TAO_L1(x, y, tmp) \
AVX_SM4_SBOX(x, y, tmp); \
VMOVDQU r08Mask<>(SB), tmp; \
VPSHUFB tmp, x, y; \
VPXOR x, y, y; \
VMOVDQU r16Mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR tmp, y, y; \
VPSLLD $2, y, tmp; \
VPSRLD $30, y, y; \
VPXOR tmp, y, y; \
VMOVDQU r24Mask<>(SB), tmp; \
VPSHUFB tmp, x, tmp; \
VPXOR y, x, x; \
VPXOR x, tmp, x
#define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(RK)(IND*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX_SM4_TAO_L1(x, y, tmp); \
VPXOR x, t0, t0
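// gcmSm4Init derives the hash key H by running the 32 SM4 rounds over the
// all-zero block (SM4_SINGLE_ROUND, only lane 0 live), gathers the four output
// words, doubles H as required by the bit-reflected GHASH representation (the
// "H * 2" step), and then fills productTable from the top down: slots 14/15
// hold H and its Karatsuba fold (hi^lo), 12/13 hold H^2, ..., 0/1 hold H^8.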
// func gcmSm4Init(productTable *[256]byte, rk []uint32)
TEXT ·gcmSm4Init(SB),NOSPLIT,$0
#define dst DI
#define RK SI
MOVQ productTable+0(FP), dst
MOVQ rk+8(FP), RK
MOVOU gcmPoly<>(SB), POLY
// Encrypt block 0, with the sm4 round keys to generate the hash key H
PXOR B0, B0
PXOR B1, B1
PXOR B2, B2
PXOR B3, B3
XORL CX, CX
sm4InitEncLoop:
SM4_SINGLE_ROUND(0, RK, CX, T0, T1, T2, B0, B1, B2, B3)
SM4_SINGLE_ROUND(1, RK, CX, T0, T1, T2, B1, B2, B3, B0)
SM4_SINGLE_ROUND(2, RK, CX, T0, T1, T2, B2, B3, B0, B1)
SM4_SINGLE_ROUND(3, RK, CX, T0, T1, T2, B3, B0, B1, B2)
ADDL $16, CX
CMPL CX, $4*32
JB sm4InitEncLoop
PEXTRD $0, B1, R8
PINSRD $1, R8, B0
PEXTRD $0, B2, R8
PINSRD $2, R8, B0
PEXTRD $0, B3, R8
PINSRD $3, R8, B0
// H * 2
PSHUFD $0xff, B0, T0
MOVOU B0, T1
PSRAL $31, T0
PAND POLY, T0
PSRLL $31, T1
PSLLDQ $4, T1
PSLLL $1, B0
PXOR T0, B0
PXOR T1, B0
// Karatsuba pre-computations
MOVOU B0, (16*14)(dst)
PSHUFD $78, B0, B1
PXOR B0, B1
MOVOU B1, (16*15)(dst)
MOVOU B0, B2
MOVOU B1, B3
// Now prepare powers of H and pre-computations for them
MOVQ $7, AX
initLoop:
MOVOU B2, T0
MOVOU B2, T1
MOVOU B3, T2
PCLMULQDQ $0x00, B0, T0
PCLMULQDQ $0x11, B0, T1
PCLMULQDQ $0x00, B1, T2
PXOR T0, T2
PXOR T1, T2
MOVOU T2, B4
PSLLDQ $8, B4
PSRLDQ $8, T2
PXOR B4, T0
PXOR T2, T1
MOVOU POLY, B2
PCLMULQDQ $0x01, T0, B2
PSHUFD $78, T0, T0
PXOR B2, T0
MOVOU POLY, B2
PCLMULQDQ $0x01, T0, B2
PSHUFD $78, T0, T0
PXOR T0, B2
PXOR T1, B2
MOVOU B2, (16*12)(dst)
PSHUFD $78, B2, B3
PXOR B2, B3
MOVOU B3, (16*13)(dst)
DECQ AX
LEAQ (-16*2)(dst), dst
JNE initLoop
RET
#undef RK
#undef dst
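// gcmSm4Data absorbs additional authenticated data into the GHASH state at T.
// Full blocks are hashed eight at a time with the aggregated table
// (mulRoundAAD) or one at a time against H; a 13-byte fast path covers the TLS
// record header, and a trailing partial block is loaded byte by byte and
// zero-padded before the final multiplication.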
// func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)
TEXT ·gcmSm4Data(SB),NOSPLIT,$0
#define pTbl DI
#define aut SI
#define tPtr CX
#define autLen DX
#define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
#define mulRoundAAD(X ,i) \
MOVOU (16*(i*2))(pTbl), T1;\
MOVOU T1, T2;\
PCLMULQDQ $0x00, X, T1;\
PXOR T1, ACC0;\
PCLMULQDQ $0x11, X, T2;\
PXOR T2, ACC1;\
PSHUFD $78, X, T1;\
PXOR T1, X;\
MOVOU (16*(i*2+1))(pTbl), T1;\
PCLMULQDQ $0x00, X, T1;\
PXOR T1, ACCM
MOVQ productTable+0(FP), pTbl
MOVQ data_base+8(FP), aut
MOVQ data_len+16(FP), autLen
MOVQ T+32(FP), tPtr
//PXOR ACC0, ACC0
MOVOU (tPtr), ACC0
MOVOU bswapMask<>(SB), BSWAP
MOVOU gcmPoly<>(SB), POLY
TESTQ autLen, autLen
JEQ dataBail
CMPQ autLen, $13 // optimize the TLS case
JE dataTLS
CMPQ autLen, $128
JB startSinglesLoop
JMP dataOctaLoop
dataTLS:
MOVOU (16*14)(pTbl), T1
MOVOU (16*15)(pTbl), T2
PXOR B0, B0
MOVQ (aut), B0
PINSRD $2, 8(aut), B0
PINSRB $12, 12(aut), B0
XORQ autLen, autLen
JMP dataMul
dataOctaLoop:
CMPQ autLen, $128
JB startSinglesLoop
SUBQ $128, autLen
MOVOU (16*0)(aut), X0
MOVOU (16*1)(aut), X1
MOVOU (16*2)(aut), X2
MOVOU (16*3)(aut), X3
MOVOU (16*4)(aut), X4
MOVOU (16*5)(aut), X5
MOVOU (16*6)(aut), X6
MOVOU (16*7)(aut), X7
LEAQ (16*8)(aut), aut
PSHUFB BSWAP, X0
PSHUFB BSWAP, X1
PSHUFB BSWAP, X2
PSHUFB BSWAP, X3
PSHUFB BSWAP, X4
PSHUFB BSWAP, X5
PSHUFB BSWAP, X6
PSHUFB BSWAP, X7
PXOR ACC0, X0
MOVOU (16*0)(pTbl), ACC0
MOVOU (16*1)(pTbl), ACCM
MOVOU ACC0, ACC1
PSHUFD $78, X0, T1
PXOR X0, T1
PCLMULQDQ $0x00, X0, ACC0
PCLMULQDQ $0x11, X0, ACC1
PCLMULQDQ $0x00, T1, ACCM
mulRoundAAD(X1, 1)
mulRoundAAD(X2, 2)
mulRoundAAD(X3, 3)
mulRoundAAD(X4, 4)
mulRoundAAD(X5, 5)
mulRoundAAD(X6, 6)
mulRoundAAD(X7, 7)
PXOR ACC0, ACCM
PXOR ACC1, ACCM
MOVOU ACCM, T0
PSRLDQ $8, ACCM
PSLLDQ $8, T0
PXOR ACCM, ACC1
PXOR T0, ACC0
reduceRound(ACC0)
reduceRound(ACC0)
PXOR ACC1, ACC0
JMP dataOctaLoop
startSinglesLoop:
MOVOU (16*14)(pTbl), T1
MOVOU (16*15)(pTbl), T2
dataSinglesLoop:
CMPQ autLen, $16
JB dataEnd
SUBQ $16, autLen
MOVOU (aut), B0
dataMul:
PSHUFB BSWAP, B0
PXOR ACC0, B0
MOVOU T1, ACC0
MOVOU T2, ACCM
MOVOU T1, ACC1
PSHUFD $78, B0, T0
PXOR B0, T0
PCLMULQDQ $0x00, B0, ACC0
PCLMULQDQ $0x11, B0, ACC1
PCLMULQDQ $0x00, T0, ACCM
PXOR ACC0, ACCM
PXOR ACC1, ACCM
MOVOU ACCM, T0
PSRLDQ $8, ACCM
PSLLDQ $8, T0
PXOR ACCM, ACC1
PXOR T0, ACC0
MOVOU POLY, T0
PCLMULQDQ $0x01, ACC0, T0
PSHUFD $78, ACC0, ACC0
PXOR T0, ACC0
MOVOU POLY, T0
PCLMULQDQ $0x01, ACC0, T0
PSHUFD $78, ACC0, ACC0
PXOR T0, ACC0
PXOR ACC1, ACC0
LEAQ 16(aut), aut
JMP dataSinglesLoop
dataEnd:
TESTQ autLen, autLen
JEQ dataBail
PXOR B0, B0
LEAQ -1(aut)(autLen*1), aut
dataLoadLoop:
PSLLDQ $1, B0
PINSRB $0, (aut), B0
LEAQ -1(aut), aut
DECQ autLen
JNE dataLoadLoop
JMP dataMul
dataBail:
MOVOU ACC0, (tPtr)
RET
#undef pTbl
#undef aut
#undef tPtr
#undef autLen
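// gcmSm4Enc uses a 256-byte frame: SP+0..127 holds the byte-reversed
// ciphertext blocks waiting to be hashed (with the running accumulator already
// folded into block 0), SP+128..255 holds eight staged counter blocks.
// increment(i) bumps the shared 32-bit counter and patches the big-endian
// counter word of staged block i in place.  Each main-loop iteration hashes
// the ciphertext stashed by the previous iteration while encrypting the next
// eight counters.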
// func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4Enc(SB),0,$256-96
#define pTbl DI
#define ctx DX
#define ctrPtr CX
#define ptx SI
#define rk AX
#define tPtr R8
#define ptxLen R9
#define aluCTR R10
#define aluTMP R11
#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
#define mulRound(i) \
MOVOU (16*i)(SP), T0;\
MOVOU (16*(i*2))(pTbl), T1;\
MOVOU T1, T2;\
PCLMULQDQ $0x00, T0, T1;\
PXOR T1, ACC0;\
PCLMULQDQ $0x11, T0, T2;\
PXOR T2, ACC1;\
PSHUFD $78, T0, T1;\
PXOR T1, T0;\
MOVOU (16*(i*2+1))(pTbl), T1;\
PCLMULQDQ $0x00, T0, T1;\
PXOR T1, ACCM
#define gcmEncDataStep(B) \
PSHUFB BSWAP, B; \
PXOR ACC0, B; \
MOVOU T2, ACC0; \
MOVOU T2, ACC1; \
MOVOU (16*15)(pTbl), ACCM; \
PSHUFD $78, B, T0; \
PXOR B, T0; \
PCLMULQDQ $0x00, B, ACC0; \
PCLMULQDQ $0x11, B, ACC1; \
PCLMULQDQ $0x00, T0, ACCM; \
PXOR ACC0, ACCM; \
PXOR ACC1, ACCM; \
MOVOU ACCM, T0; \
PSRLDQ $8, ACCM; \
PSLLDQ $8, T0; \
PXOR ACCM, ACC1; \
PXOR T0, ACC0; \
reduceRound(ACC0); \
reduceRound(ACC0); \
PXOR ACC1, ACC0
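// mulRound(i) multiplies stashed block i by H^(8-i) (table slots 2i/2i+1) and
// adds the partial products into ACC0/ACC1/ACCM; after all eight blocks the
// halves are recombined and reduceRound is applied twice to fold the 256-bit
// product back to 128 bits.  gcmEncDataStep is the single-block update used by
// the 4-block and 1-block tails, with H kept in T2.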
MOVQ productTable+0(FP), pTbl
MOVQ dst+8(FP), ctx
MOVQ src_base+32(FP), ptx
MOVQ src_len+40(FP), ptxLen
MOVQ ctr+56(FP), ctrPtr
MOVQ T+64(FP), tPtr
MOVQ rk_base+72(FP), rk
CMPB ·useAVX2(SB), $1
JE avx2GcmSm4Enc
MOVOU bswapMask<>(SB), BSWAP
MOVOU gcmPoly<>(SB), POLY
MOVOU (tPtr), ACC0
PXOR ACC1, ACC1
PXOR ACCM, ACCM
MOVOU (ctrPtr), T0
MOVL (3*4)(ctrPtr), aluCTR
BSWAPL aluCTR
MOVOU T0, (8*16 + 0*16)(SP)
increment(0)
MOVOU T0, (8*16 + 1*16)(SP)
increment(1)
MOVOU T0, (8*16 + 2*16)(SP)
increment(2)
MOVOU T0, (8*16 + 3*16)(SP)
increment(3)
CMPQ ptxLen, $128
JB gcmSm4EncNibbles
SUBQ $128, ptxLen
// We have at least 8 blocks to encrypt, prepare the rest of the counters
MOVOU T0, (8*16 + 4*16)(SP)
increment(4)
MOVOU T0, (8*16 + 5*16)(SP)
increment(5)
MOVOU T0, (8*16 + 6*16)(SP)
increment(6)
MOVOU T0, (8*16 + 7*16)(SP)
increment(7)
// load 8 ctrs for encryption
MOVOU (8*16 + 0*16)(SP), B0
MOVOU (8*16 + 1*16)(SP), B1
MOVOU (8*16 + 2*16)(SP), B2
MOVOU (8*16 + 3*16)(SP), B3
MOVOU (8*16 + 4*16)(SP), B4
MOVOU (8*16 + 5*16)(SP), B5
MOVOU (8*16 + 6*16)(SP), B6
MOVOU (8*16 + 7*16)(SP), B7
SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
increment(0)
increment(1)
increment(2)
increment(3)
SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
increment(4)
increment(5)
increment(6)
increment(7)
// XOR plaintext
MOVOU (16*0)(ptx), T0
PXOR T0, B0
MOVOU (16*1)(ptx), T0
PXOR T0, B1
MOVOU (16*2)(ptx), T0
PXOR T0, B2
MOVOU (16*3)(ptx), T0
PXOR T0, B3
MOVOU (16*4)(ptx), T0
PXOR T0, B4
MOVOU (16*5)(ptx), T0
PXOR T0, B5
MOVOU (16*6)(ptx), T0
PXOR T0, B6
MOVOU (16*7)(ptx), T0
PXOR T0, B7
// Store ciphertext
MOVOU B0, (16*0)(ctx)
PSHUFB BSWAP, B0
PXOR ACC0, B0
MOVOU B1, (16*1)(ctx)
PSHUFB BSWAP, B1
MOVOU B2, (16*2)(ctx)
PSHUFB BSWAP, B2
MOVOU B3, (16*3)(ctx)
PSHUFB BSWAP, B3
MOVOU B4, (16*4)(ctx)
PSHUFB BSWAP, B4
MOVOU B5, (16*5)(ctx)
PSHUFB BSWAP, B5
MOVOU B6, (16*6)(ctx)
PSHUFB BSWAP, B6
MOVOU B7, (16*7)(ctx)
PSHUFB BSWAP, B7
MOVOU B0, (16*0)(SP)
MOVOU B1, (16*1)(SP)
MOVOU B2, (16*2)(SP)
MOVOU B3, (16*3)(SP)
MOVOU B4, (16*4)(SP)
MOVOU B5, (16*5)(SP)
MOVOU B6, (16*6)(SP)
MOVOU B7, (16*7)(SP)
LEAQ 128(ptx), ptx
LEAQ 128(ctx), ctx
gcmSm4EncOctetsLoop:
CMPQ ptxLen, $128
JB gcmSm4EncOctetsEnd
SUBQ $128, ptxLen
MOVOU (8*16 + 0*16)(SP), B0
MOVOU (8*16 + 1*16)(SP), B1
MOVOU (8*16 + 2*16)(SP), B2
MOVOU (8*16 + 3*16)(SP), B3
MOVOU (8*16 + 4*16)(SP), B4
MOVOU (8*16 + 5*16)(SP), B5
MOVOU (8*16 + 6*16)(SP), B6
MOVOU (8*16 + 7*16)(SP), B7
MOVOU (16*0)(SP), T0
PSHUFD $78, T0, T1
PXOR T0, T1
MOVOU (16*0)(pTbl), ACC0
MOVOU (16*1)(pTbl), ACCM
MOVOU ACC0, ACC1
PCLMULQDQ $0x00, T1, ACCM
PCLMULQDQ $0x00, T0, ACC0
PCLMULQDQ $0x11, T0, ACC1
SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
mulRound(1)
increment(0)
mulRound(2)
increment(1)
mulRound(3)
increment(2)
mulRound(4)
increment(3)
SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
mulRound(5)
increment(4)
mulRound(6)
increment(5)
mulRound(7)
increment(6)
increment(7)
PXOR ACC0, ACCM
PXOR ACC1, ACCM
MOVOU ACCM, T0
PSRLDQ $8, ACCM
PSLLDQ $8, T0
PXOR ACCM, ACC1
PXOR T0, ACC0
reduceRound(ACC0)
reduceRound(ACC0)
PXOR ACC1, ACC0
MOVOU (16*0)(ptx), T0
PXOR T0, B0
MOVOU (16*1)(ptx), T0
PXOR T0, B1
MOVOU (16*2)(ptx), T0
PXOR T0, B2
MOVOU (16*3)(ptx), T0
PXOR T0, B3
MOVOU (16*4)(ptx), T0
PXOR T0, B4
MOVOU (16*5)(ptx), T0
PXOR T0, B5
MOVOU (16*6)(ptx), T0
PXOR T0, B6
MOVOU (16*7)(ptx), T0
PXOR T0, B7
MOVOU B0, (16*0)(ctx)
PSHUFB BSWAP, B0
PXOR ACC0, B0
MOVOU B1, (16*1)(ctx)
PSHUFB BSWAP, B1
MOVOU B2, (16*2)(ctx)
PSHUFB BSWAP, B2
MOVOU B3, (16*3)(ctx)
PSHUFB BSWAP, B3
MOVOU B4, (16*4)(ctx)
PSHUFB BSWAP, B4
MOVOU B5, (16*5)(ctx)
PSHUFB BSWAP, B5
MOVOU B6, (16*6)(ctx)
PSHUFB BSWAP, B6
MOVOU B7, (16*7)(ctx)
PSHUFB BSWAP, B7
MOVOU B0, (16*0)(SP)
MOVOU B1, (16*1)(SP)
MOVOU B2, (16*2)(SP)
MOVOU B3, (16*3)(SP)
MOVOU B4, (16*4)(SP)
MOVOU B5, (16*5)(SP)
MOVOU B6, (16*6)(SP)
MOVOU B7, (16*7)(SP)
LEAQ 128(ptx), ptx
LEAQ 128(ctx), ctx
JMP gcmSm4EncOctetsLoop
gcmSm4EncOctetsEnd:
MOVOU (16*0)(SP), T0
MOVOU (16*0)(pTbl), ACC0
MOVOU (16*1)(pTbl), ACCM
MOVOU ACC0, ACC1
PSHUFD $78, T0, T1
PXOR T0, T1
PCLMULQDQ $0x00, T0, ACC0
PCLMULQDQ $0x11, T0, ACC1
PCLMULQDQ $0x00, T1, ACCM
mulRound(1)
mulRound(2)
mulRound(3)
mulRound(4)
mulRound(5)
mulRound(6)
mulRound(7)
PXOR ACC0, ACCM
PXOR ACC1, ACCM
MOVOU ACCM, T0
PSRLDQ $8, ACCM
PSLLDQ $8, T0
PXOR ACCM, ACC1
PXOR T0, ACC0
reduceRound(ACC0)
reduceRound(ACC0)
PXOR ACC1, ACC0
TESTQ ptxLen, ptxLen
JE gcmSm4EncDone
SUBQ $4, aluCTR
gcmSm4EncNibbles:
CMPQ ptxLen, $64
JBE gcmSm4EncSingles
SUBQ $64, ptxLen
MOVOU (8*16 + 0*16)(SP), B0
MOVOU (8*16 + 1*16)(SP), B1
MOVOU (8*16 + 2*16)(SP), B2
MOVOU (8*16 + 3*16)(SP), B3
SM4_4BLOCKS(AX, BX, T0, T1, T2, B0, B1, B2, B3)
MOVOU (16*0)(ptx), T0
PXOR T0, B0
MOVOU (16*1)(ptx), T0
PXOR T0, B1
MOVOU (16*2)(ptx), T0
PXOR T0, B2
MOVOU (16*3)(ptx), T0
PXOR T0, B3
MOVOU B0, (16*0)(ctx)
MOVOU B1, (16*1)(ctx)
MOVOU B2, (16*2)(ctx)
MOVOU B3, (16*3)(ctx)
MOVOU (16*14)(pTbl), T2
gcmEncDataStep(B0)
gcmEncDataStep(B1)
gcmEncDataStep(B2)
gcmEncDataStep(B3)
increment(0)
increment(1)
increment(2)
increment(3)
LEAQ 64(ptx), ptx
LEAQ 64(ctx), ctx
gcmSm4EncSingles:
TESTQ ptxLen, ptxLen
JE gcmSm4EncDone
MOVOU (8*16 + 0*16)(SP), B0
MOVOU (8*16 + 1*16)(SP), B1
MOVOU (8*16 + 2*16)(SP), B2
MOVOU (8*16 + 3*16)(SP), B3
SM4_4BLOCKS(AX, BX, T0, T1, T2, B0, B1, B2, B3)
MOVOU B0, (16*0)(SP)
MOVOU B1, (16*1)(SP)
MOVOU B2, (16*2)(SP)
MOVOU B3, (16*3)(SP)
MOVOU (16*14)(pTbl), T2
MOVQ SP, BP
gcmSm4EncSinglesLoop:
CMPQ ptxLen, $16
JB gcmSm4EncTail
SUBQ $16, ptxLen
MOVOU (16*0)(BP), B0
MOVOU (ptx), T0
PXOR T0, B0
MOVOU B0, (ctx)
gcmEncDataStep(B0)
LEAQ (16*1)(ptx), ptx
LEAQ (16*1)(ctx), ctx
ADDQ $16, BP
JMP gcmSm4EncSinglesLoop
gcmSm4EncTail:
TESTQ ptxLen, ptxLen
JE gcmSm4EncDone
MOVOU (16*0)(BP), B0
MOVOU B0, T0
LEAQ -1(ptx)(ptxLen*1), ptx
MOVQ ptxLen, aluTMP
SHLQ $4, aluTMP
LEAQ andMask<>(SB), aluCTR
MOVOU -16(aluCTR)(aluTMP*1), T1
PXOR B0, B0
ptxLoadLoop:
PSLLDQ $1, B0
PINSRB $0, (ptx), B0
LEAQ -1(ptx), ptx
DECQ ptxLen
JNE ptxLoadLoop
PXOR T0, B0
PAND T1, B0
MOVOU B0, (ctx) // assume there is always room for a 16-byte store, since the tag follows the ciphertext
gcmEncDataStep(B0)
gcmSm4EncDone:
MOVOU ACC0, (tPtr)
RET
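// avx2GcmSm4Enc: same structure as the SSE path, but the eight staged counter
// blocks are reloaded as four YMM registers (two blocks each), flipped,
// transposed, and run through AVX2_SM4_ROUND.  GHASH still operates on 128-bit
// halves; the accumulator is folded into the first stashed block with a legacy
// PXOR because a VEX-encoded XMM VPXOR would zero the upper lane of DWB0.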
avx2GcmSm4Enc:
VMOVDQU bswapMask<>(SB), BSWAP
VMOVDQU gcmPoly<>(SB), POLY
VMOVDQU (tPtr), ACC0
VPXOR ACC1, ACC1, ACC1
VPXOR ACCM, ACCM, ACCM
VMOVDQU (ctrPtr), T0
MOVL (3*4)(ctrPtr), aluCTR
BSWAPL aluCTR
VMOVDQU T0, (8*16 + 0*16)(SP)
increment(0)
VMOVDQU T0, (8*16 + 1*16)(SP)
increment(1)
VMOVDQU T0, (8*16 + 2*16)(SP)
increment(2)
VMOVDQU T0, (8*16 + 3*16)(SP)
increment(3)
CMPQ ptxLen, $128
JB avx2GcmSm4EncNibbles
SUBQ $128, ptxLen
// We have at least 8 blocks to encrypt, prepare the rest of the counters
VMOVDQU T0, (8*16 + 4*16)(SP)
increment(4)
VMOVDQU T0, (8*16 + 5*16)(SP)
increment(5)
VMOVDQU T0, (8*16 + 6*16)(SP)
increment(6)
VMOVDQU T0, (8*16 + 7*16)(SP)
increment(7)
// load 8 ctrs for encryption
VMOVDQU (4*32 + 0*32)(SP), DWB0
VMOVDQU (4*32 + 1*32)(SP), DWB1
VMOVDQU (4*32 + 2*32)(SP), DWB2
VMOVDQU (4*32 + 3*32)(SP), DWB3
VBROADCASTI128 flipMask<>(SB), XDWTMP0
// Apply Byte Flip Mask: LE -> BE
VPSHUFB XDWTMP0, DWB0, DWB0
VPSHUFB XDWTMP0, DWB1, DWB1
VPSHUFB XDWTMP0, DWB2, DWB2
VPSHUFB XDWTMP0, DWB3, DWB3
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
XORL BX, BX
VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK
avx2GcmSm4Enc8Loop1:
AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0)
AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1)
AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2)
ADDL $16, BX
CMPL BX, $4*32
JB avx2GcmSm4Enc8Loop1
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
VBROADCASTI128 bswapMask<>(SB), DWBSWAP
VPSHUFB DWBSWAP, DWB0, DWB0
VPSHUFB DWBSWAP, DWB1, DWB1
VPSHUFB DWBSWAP, DWB2, DWB2
VPSHUFB DWBSWAP, DWB3, DWB3
increment(0)
increment(1)
increment(2)
increment(3)
increment(4)
increment(5)
increment(6)
increment(7)
// XOR plaintext
VMOVDQU (32*0)(ptx), XDWTMP0
VPXOR XDWTMP0, DWB0, DWB0
VMOVDQU (32*1)(ptx), XDWTMP0
VPXOR XDWTMP0, DWB1, DWB1
VMOVDQU (32*2)(ptx), XDWTMP0
VPXOR XDWTMP0, DWB2, DWB2
VMOVDQU (32*3)(ptx), XDWTMP0
VPXOR XDWTMP0, DWB3, DWB3
// Store ciphertext
VMOVDQU DWB0, (32*0)(ctx)
VPSHUFB DWBSWAP, DWB0, DWB0
VMOVDQU DWB1, (32*1)(ctx)
VPSHUFB DWBSWAP, DWB1, DWB1
VMOVDQU DWB2, (32*2)(ctx)
VPSHUFB DWBSWAP, DWB2, DWB2
VMOVDQU DWB3, (32*3)(ctx)
VPSHUFB DWBSWAP, DWB3, DWB3
//VPXOR XDWTMP0, XDWTMP0, XDWTMP0
//VINSERTI128 $0, ACC0, XDWTMP0, XDWTMP0
//VPXOR XDWTMP0, DWB0, DWB0
PXOR ACC0, B0 // Can't call VPXOR here
VMOVDQU DWB0, (32*0)(SP)
VMOVDQU DWB1, (32*1)(SP)
VMOVDQU DWB2, (32*2)(SP)
VMOVDQU DWB3, (32*3)(SP)
LEAQ 128(ptx), ptx
LEAQ 128(ctx), ctx
avx2GcmSm4EncOctetsLoop:
CMPQ ptxLen, $128
JB avx2GcmSm4EncOctetsEnd
SUBQ $128, ptxLen
// load 8 ctrs for encryption
VMOVDQU (4*32 + 0*32)(SP), DWB0
VMOVDQU (4*32 + 1*32)(SP), DWB1
VMOVDQU (4*32 + 2*32)(SP), DWB2
VMOVDQU (4*32 + 3*32)(SP), DWB3
VBROADCASTI128 flipMask<>(SB), XDWTMP0
// Apply Byte Flip Mask: LE -> BE
VPSHUFB XDWTMP0, DWB0, DWB0
VPSHUFB XDWTMP0, DWB1, DWB1
VPSHUFB XDWTMP0, DWB2, DWB2
VPSHUFB XDWTMP0, DWB3, DWB3
VMOVDQU (16*0)(SP), T0
VPSHUFD $78, T0, T1
VPXOR T0, T1, T1
VMOVDQU (16*0)(pTbl), ACC0
VMOVDQU (16*1)(pTbl), ACCM
VMOVDQU ACC0, ACC1
PCLMULQDQ $0x00, T1, ACCM
PCLMULQDQ $0x00, T0, ACC0
PCLMULQDQ $0x11, T0, ACC1
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
XORL BX, BX
VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK
avx2GcmSm4Enc8Loop2:
AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0)
AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1)
AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2)
ADDL $16, BX
CMPL BX, $4*32
JB avx2GcmSm4Enc8Loop2
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
VBROADCASTI128 bswapMask<>(SB), DWBSWAP
VPSHUFB DWBSWAP, DWB0, DWB0
VPSHUFB DWBSWAP, DWB1, DWB1
VPSHUFB DWBSWAP, DWB2, DWB2
VPSHUFB DWBSWAP, DWB3, DWB3
mulRound(1)
increment(0)
mulRound(2)
increment(1)
mulRound(3)
increment(2)
mulRound(4)
increment(3)
mulRound(5)
increment(4)
mulRound(6)
increment(5)
mulRound(7)
increment(6)
increment(7)
VPXOR ACC0, ACCM, ACCM
VPXOR ACC1, ACCM, ACCM
VPSLLDQ $8, ACCM, T0
VPSRLDQ $8, ACCM, ACCM
VPXOR ACCM, ACC1, ACC1
VPXOR T0, ACC0, ACC0
reduceRound(ACC0)
reduceRound(ACC0)
VPXOR ACC1, ACC0, ACC0
// XOR plaintext
VMOVDQU (32*0)(ptx), XDWTMP0
VPXOR XDWTMP0, DWB0, DWB0
VMOVDQU (32*1)(ptx), XDWTMP0
VPXOR XDWTMP0, DWB1, DWB1
VMOVDQU (32*2)(ptx), XDWTMP0
VPXOR XDWTMP0, DWB2, DWB2
VMOVDQU (32*3)(ptx), XDWTMP0
VPXOR XDWTMP0, DWB3, DWB3
// Store ciphertext
VMOVDQU DWB0, (32*0)(ctx)
VPSHUFB DWBSWAP, DWB0, DWB0
VMOVDQU DWB1, (32*1)(ctx)
VPSHUFB DWBSWAP, DWB1, DWB1
VMOVDQU DWB2, (32*2)(ctx)
VPSHUFB DWBSWAP, DWB2, DWB2
VMOVDQU DWB3, (32*3)(ctx)
VPSHUFB DWBSWAP, DWB3, DWB3
//VPXOR XDWTMP0, XDWTMP0, XDWTMP0
//VINSERTI128 $0, ACC0, XDWTMP0, XDWTMP0
//VPXOR XDWTMP0, DWB0, DWB0
PXOR ACC0, B0 // Can't call VPXOR here
VMOVDQU DWB0, (32*0)(SP)
VMOVDQU DWB1, (32*1)(SP)
VMOVDQU DWB2, (32*2)(SP)
VMOVDQU DWB3, (32*3)(SP)
LEAQ 128(ptx), ptx
LEAQ 128(ctx), ctx
JMP avx2GcmSm4EncOctetsLoop
avx2GcmSm4EncOctetsEnd:
VMOVDQU (16*0)(SP), T0
VMOVDQU (16*0)(pTbl), ACC0
VMOVDQU (16*1)(pTbl), ACCM
VMOVDQU ACC0, ACC1
VPSHUFD $78, T0, T1
VPXOR T0, T1, T1
PCLMULQDQ $0x00, T0, ACC0
PCLMULQDQ $0x11, T0, ACC1
PCLMULQDQ $0x00, T1, ACCM
mulRound(1)
mulRound(2)
mulRound(3)
mulRound(4)
mulRound(5)
mulRound(6)
mulRound(7)
VPXOR ACC0, ACCM, ACCM
VPXOR ACC1, ACCM, ACCM
VPSLLDQ $8, ACCM, T0
VPSRLDQ $8, ACCM, ACCM
VPXOR ACCM, ACC1, ACC1
VPXOR T0, ACC0, ACC0
reduceRound(ACC0)
reduceRound(ACC0)
VPXOR ACC1, ACC0, ACC0
TESTQ ptxLen, ptxLen
JE avx2GcmSm4EncDone
SUBQ $4, aluCTR
avx2GcmSm4EncNibbles:
VMOVDQU flipMask<>(SB), B7
CMPQ ptxLen, $64
JBE avx2GcmSm4EncSingles
SUBQ $64, ptxLen
VMOVDQU (8*16 + 0*16)(SP), B0
VMOVDQU (8*16 + 1*16)(SP), B1
VMOVDQU (8*16 + 2*16)(SP), B2
VMOVDQU (8*16 + 3*16)(SP), B3
VPSHUFB B7, B0, B0
VPSHUFB B7, B1, B1
VPSHUFB B7, B2, B2
VPSHUFB B7, B3, B3
TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
XORL BX, BX
VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK
avx2GcmSm4Enc4Loop2:
AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
ADDL $16, BX
CMPL BX, $4*32
JB avx2GcmSm4Enc4Loop2
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
VPSHUFB BSWAP, B0, B0
VPSHUFB BSWAP, B1, B1
VPSHUFB BSWAP, B2, B2
VPSHUFB BSWAP, B3, B3
VMOVDQU (16*0)(ptx), T0
VPXOR T0, B0, B0
VMOVDQU (16*1)(ptx), T0
VPXOR T0, B1, B1
VMOVDQU (16*2)(ptx), T0
VPXOR T0, B2, B2
VMOVDQU (16*3)(ptx), T0
VPXOR T0, B3, B3
VMOVDQU B0, (16*0)(ctx)
VMOVDQU B1, (16*1)(ctx)
VMOVDQU B2, (16*2)(ctx)
VMOVDQU B3, (16*3)(ctx)
VMOVDQU (16*14)(pTbl), T2
gcmEncDataStep(B0)
gcmEncDataStep(B1)
gcmEncDataStep(B2)
gcmEncDataStep(B3)
increment(0)
increment(1)
increment(2)
increment(3)
LEAQ 64(ptx), ptx
LEAQ 64(ctx), ctx
avx2GcmSm4EncSingles:
TESTQ ptxLen, ptxLen
JE avx2GcmSm4EncDone
VMOVDQU (8*16 + 0*16)(SP), B0
VMOVDQU (8*16 + 1*16)(SP), B1
VMOVDQU (8*16 + 2*16)(SP), B2
VMOVDQU (8*16 + 3*16)(SP), B3
VPSHUFB B7, B0, B0
VPSHUFB B7, B1, B1
VPSHUFB B7, B2, B2
VPSHUFB B7, B3, B3
TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
XORL BX, BX
VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK
avx2GcmSm4Enc4Loop1:
AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
ADDL $16, BX
CMPL BX, $4*32
JB avx2GcmSm4Enc4Loop1
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
VPSHUFB BSWAP, B0, B0
VPSHUFB BSWAP, B1, B1
VPSHUFB BSWAP, B2, B2
VPSHUFB BSWAP, B3, B3
VMOVDQU B0, (16*0)(SP)
VMOVDQU B1, (16*1)(SP)
VMOVDQU B2, (16*2)(SP)
VMOVDQU B3, (16*3)(SP)
VMOVDQU (16*14)(pTbl), T2
MOVQ SP, BP
avx2GcmSm4EncSinglesLoop:
CMPQ ptxLen, $16
JB avx2GcmSm4EncTail
SUBQ $16, ptxLen
VMOVDQU (16*0)(BP), B0
VMOVDQU (ptx), T0
VPXOR T0, B0, B0
VMOVDQU B0, (ctx)
gcmEncDataStep(B0)
LEAQ (16*1)(ptx), ptx
LEAQ (16*1)(ctx), ctx
ADDQ $16, BP
JMP avx2GcmSm4EncSinglesLoop
avx2GcmSm4EncTail:
TESTQ ptxLen, ptxLen
JE avx2GcmSm4EncDone
VMOVDQU (16*0)(BP), B0
VMOVDQU B0, T0
LEAQ -1(ptx)(ptxLen*1), ptx
MOVQ ptxLen, aluTMP
SHLQ $4, aluTMP
LEAQ andMask<>(SB), aluCTR
VMOVDQU -16(aluCTR)(aluTMP*1), T1
VPXOR B0, B0, B0
avx2PtxLoadLoop:
PSLLDQ $1, B0
PINSRB $0, (ptx), B0
LEAQ -1(ptx), ptx
DECQ ptxLen
JNE avx2PtxLoadLoop
VPXOR T0, B0, B0
VPAND T1, B0, B0
VMOVDQU B0, (ctx) // assume there is always room for a 16-byte store, since the tag follows the ciphertext
gcmEncDataStep(B0)
avx2GcmSm4EncDone:
VMOVDQU ACC0, (tPtr)
VZEROUPPER
RET
#undef increment
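// gcmSm4Dec mirrors the encrypt path but needs only a 128-byte frame: the
// staged counters live at SP+0..127, and the single-block tail reuses
// SP+64..127 to park keystream.  GHASH is computed over the incoming
// ciphertext before it is XORed with the keystream, so decMulRound and
// decGhashRound read straight from ctx instead of a stash.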
// func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4Dec(SB),0,$128-96
#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
#define decMulRound(i) \
MOVOU (16*i)(ctx), T0;\
PSHUFB BSWAP, T0;\
internalDecMulRound(i)
#define internalDecMulRound(i) \
MOVOU (16*(i*2))(pTbl), T1;\
MOVOU T1, T2;\
PCLMULQDQ $0x00, T0, T1;\
PXOR T1, ACC0;\
PSHUFD $78, T0, T1;\
PCLMULQDQ $0x11, T0, T2;\
PXOR T1, T0;\
PXOR T2, ACC1;\
MOVOU (16*(i*2+1))(pTbl), T2;\
PCLMULQDQ $0x00, T2, T0;\
PXOR T0, ACCM
#define decGhashRound(i) \
MOVOU (16*i)(ctx), B0; \
internalDecGhashRound()
#define internalDecGhashRound() \
PSHUFB BSWAP, B0; \
PXOR ACC0, B0; \
MOVOU T2, ACC0; \
MOVOU T2, ACC1; \
MOVOU (16*15)(pTbl), ACCM; \
PCLMULQDQ $0x00, B0, ACC0; \
PCLMULQDQ $0x11, B0, ACC1; \
PSHUFD $78, B0, T0; \
PXOR B0, T0; \
PCLMULQDQ $0x00, T0, ACCM; \
PXOR ACC0, ACCM; \
PXOR ACC1, ACCM; \
MOVOU ACCM, T0; \
PSRLDQ $8, ACCM; \
PSLLDQ $8, T0; \
PXOR ACCM, ACC1; \
PXOR T0, ACC0; \
reduceRound(ACC0); \
reduceRound(ACC0); \
PXOR ACC1, ACC0
MOVQ productTable+0(FP), pTbl
MOVQ dst+8(FP), ptx
MOVQ src_base+32(FP), ctx
MOVQ src_len+40(FP), ptxLen
MOVQ ctr+56(FP), ctrPtr
MOVQ T+64(FP), tPtr
MOVQ rk_base+72(FP), rk
CMPB ·useAVX2(SB), $1
JE avx2GcmSm4Dec
MOVOU bswapMask<>(SB), BSWAP
MOVOU gcmPoly<>(SB), POLY
MOVOU (tPtr), ACC0
PXOR ACC1, ACC1
PXOR ACCM, ACCM
MOVOU (ctrPtr), T0
MOVL (3*4)(ctrPtr), aluCTR
BSWAPL aluCTR
MOVOU T0, (0*16)(SP)
increment(0)
MOVOU T0, (1*16)(SP)
increment(1)
MOVOU T0, (2*16)(SP)
increment(2)
MOVOU T0, (3*16)(SP)
increment(3)
CMPQ ptxLen, $128
JB gcmSm4DecNibbles
// We have at least 8 blocks to decrypt, prepare the rest of the counters
MOVOU T0, (4*16)(SP)
increment(4)
MOVOU T0, (5*16)(SP)
increment(5)
MOVOU T0, (6*16)(SP)
increment(6)
MOVOU T0, (7*16)(SP)
increment(7)
gcmSm4DecOctetsLoop:
CMPQ ptxLen, $128
JB gcmSm4DecEndOctets
SUBQ $128, ptxLen
MOVOU (0*16)(SP), B0
MOVOU (1*16)(SP), B1
MOVOU (2*16)(SP), B2
MOVOU (3*16)(SP), B3
MOVOU (4*16)(SP), B4
MOVOU (5*16)(SP), B5
MOVOU (6*16)(SP), B6
MOVOU (7*16)(SP), B7
MOVOU (16*0)(ctx), T0
PSHUFB BSWAP, T0
PXOR ACC0, T0
PSHUFD $78, T0, T1
PXOR T0, T1
MOVOU (16*0)(pTbl), ACC0
MOVOU (16*1)(pTbl), ACCM
MOVOU ACC0, ACC1
PCLMULQDQ $0x00, T1, ACCM
PCLMULQDQ $0x00, T0, ACC0
PCLMULQDQ $0x11, T0, ACC1
SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
decMulRound(1)
increment(0)
decMulRound(2)
increment(1)
decMulRound(3)
increment(2)
decMulRound(4)
increment(3)
SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
decMulRound(5)
increment(4)
decMulRound(6)
increment(5)
decMulRound(7)
increment(6)
increment(7)
PXOR ACC0, ACCM
PXOR ACC1, ACCM
MOVOU ACCM, T0
PSRLDQ $8, ACCM
PSLLDQ $8, T0
PXOR ACCM, ACC1
PXOR T0, ACC0
reduceRound(ACC0)
reduceRound(ACC0)
PXOR ACC1, ACC0
MOVOU (16*0)(ctx), T0
PXOR T0, B0
MOVOU (16*1)(ctx), T0
PXOR T0, B1
MOVOU (16*2)(ctx), T0
PXOR T0, B2
MOVOU (16*3)(ctx), T0
PXOR T0, B3
MOVOU (16*4)(ctx), T0
PXOR T0, B4
MOVOU (16*5)(ctx), T0
PXOR T0, B5
MOVOU (16*6)(ctx), T0
PXOR T0, B6
MOVOU (16*7)(ctx), T0
PXOR T0, B7
MOVOU B0, (16*0)(ptx)
MOVOU B1, (16*1)(ptx)
MOVOU B2, (16*2)(ptx)
MOVOU B3, (16*3)(ptx)
MOVOU B4, (16*4)(ptx)
MOVOU B5, (16*5)(ptx)
MOVOU B6, (16*6)(ptx)
MOVOU B7, (16*7)(ptx)
LEAQ 128(ptx), ptx
LEAQ 128(ctx), ctx
JMP gcmSm4DecOctetsLoop
gcmSm4DecEndOctets:
SUBQ $4, aluCTR
gcmSm4DecNibbles:
CMPQ ptxLen, $64
JBE gcmSm4DecSingles
SUBQ $64, ptxLen
MOVOU (0*16)(SP), B0
MOVOU (1*16)(SP), B1
MOVOU (2*16)(SP), B2
MOVOU (3*16)(SP), B3
SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
MOVOU (16*14)(pTbl), T2
MOVOU (16*0)(ctx), T0
PXOR T0, B0
MOVOU (16*1)(ctx), T0
PXOR T0, B1
MOVOU (16*2)(ctx), T0
PXOR T0, B2
MOVOU (16*3)(ctx), T0
PXOR T0, B3
MOVOU B0, (16*0)(ptx)
MOVOU B1, (16*1)(ptx)
MOVOU B2, (16*2)(ptx)
MOVOU B3, (16*3)(ptx)
decGhashRound(0)
increment(0)
decGhashRound(1)
increment(1)
decGhashRound(2)
increment(2)
decGhashRound(3)
increment(3)
LEAQ 64(ptx), ptx
LEAQ 64(ctx), ctx
gcmSm4DecSingles:
TESTQ ptxLen, ptxLen
JE gcmSm4DecDone
MOVOU (0*16)(SP), B0
MOVOU (1*16)(SP), B1
MOVOU (2*16)(SP), B2
MOVOU (3*16)(SP), B3
SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
MOVOU B0, (16*4)(SP)
MOVOU B1, (16*5)(SP)
MOVOU B2, (16*6)(SP)
MOVOU B3, (16*7)(SP)
MOVOU (16*14)(pTbl), T2
MOVQ SP, BP
ADDQ $64, BP
gcmSm4DecSinglesLoop:
CMPQ ptxLen, $16
JB gcmSm4DecTail
SUBQ $16, ptxLen
MOVOU (16*0)(BP), B0
MOVOU (ctx), T0
PXOR T0, B0
MOVOU B0, (ptx)
decGhashRound(0)
LEAQ (16*1)(ptx), ptx
LEAQ (16*1)(ctx), ctx
ADDQ $16, BP
JMP gcmSm4DecSinglesLoop
gcmSm4DecTail:
TESTQ ptxLen, ptxLen
JE gcmSm4DecDone
MOVQ ptxLen, aluTMP
SHLQ $4, aluTMP
LEAQ andMask<>(SB), aluCTR
MOVOU -16(aluCTR)(aluTMP*1), T1
MOVOU (ctx), B0 // assume the tag follows the ciphertext, so this 16-byte read cannot overflow
PAND T1, B0
MOVOU B0, T1
PSHUFB BSWAP, B0
PXOR ACC0, B0
MOVOU (16*14)(pTbl), ACC0
MOVOU (16*15)(pTbl), ACCM
MOVOU ACC0, ACC1
PCLMULQDQ $0x00, B0, ACC0
PCLMULQDQ $0x11, B0, ACC1
PSHUFD $78, B0, T0
PXOR B0, T0
PCLMULQDQ $0x00, T0, ACCM
PXOR ACC0, ACCM
PXOR ACC1, ACCM
MOVOU ACCM, T0
PSRLDQ $8, ACCM
PSLLDQ $8, T0
PXOR ACCM, ACC1
PXOR T0, ACC0
reduceRound(ACC0)
reduceRound(ACC0)
PXOR ACC1, ACC0
MOVOU (16*0)(BP), B0
PXOR T1, B0
ptxStoreLoop:
PEXTRB $0, B0, (ptx)
PSRLDQ $1, B0
LEAQ 1(ptx), ptx
DECQ ptxLen
JNE ptxStoreLoop
gcmSm4DecDone:
MOVOU ACC0, (tPtr)
RET
avx2GcmSm4Dec:
VMOVDQU bswapMask<>(SB), BSWAP
VMOVDQU gcmPoly<>(SB), POLY
VMOVDQU (tPtr), ACC0
VPXOR ACC1, ACC1, ACC1
VPXOR ACCM, ACCM, ACCM
VMOVDQU (ctrPtr), T0
MOVL (3*4)(ctrPtr), aluCTR
BSWAPL aluCTR
VMOVDQU T0, (0*16)(SP)
increment(0)
VMOVDQU T0, (1*16)(SP)
increment(1)
VMOVDQU T0, (2*16)(SP)
increment(2)
VMOVDQU T0, (3*16)(SP)
increment(3)
CMPQ ptxLen, $128
JB avx2GcmSm4DecNibbles
// We have at least 8 blocks to decrypt, prepare the rest of the counters
VMOVDQU T0, (4*16)(SP)
increment(4)
VMOVDQU T0, (5*16)(SP)
increment(5)
VMOVDQU T0, (6*16)(SP)
increment(6)
VMOVDQU T0, (7*16)(SP)
increment(7)
avx2GcmSm4DecOctetsLoop:
CMPQ ptxLen, $128
JB avx2GcmSm4DecEndOctets
SUBQ $128, ptxLen
// load 8 ctrs for encryption
VMOVDQU (0*32)(SP), DWB0
VMOVDQU (1*32)(SP), DWB1
VMOVDQU (2*32)(SP), DWB2
VMOVDQU (3*32)(SP), DWB3
VBROADCASTI128 flipMask<>(SB), XDWTMP0
// Apply Byte Flip Mask: LE -> BE
VPSHUFB XDWTMP0, DWB0, DWB0
VPSHUFB XDWTMP0, DWB1, DWB1
VPSHUFB XDWTMP0, DWB2, DWB2
VPSHUFB XDWTMP0, DWB3, DWB3
VMOVDQU (16*0)(ctx), T0
VPSHUFB BSWAP, T0, T0
VPXOR ACC0, T0, T0
VPSHUFD $78, T0, T1
VPXOR T0, T1, T1
VMOVDQU (16*0)(pTbl), ACC0
VMOVDQU (16*1)(pTbl), ACCM
VMOVDQU ACC0, ACC1
PCLMULQDQ $0x00, T1, ACCM
PCLMULQDQ $0x00, T0, ACC0
PCLMULQDQ $0x11, T0, ACC1
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
XORL BX, BX
VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK
avx2GcmSm4Dec8Loop2:
AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0)
AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1)
AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2)
ADDL $16, BX
CMPL BX, $4*32
JB avx2GcmSm4Dec8Loop2
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
VBROADCASTI128 bswapMask<>(SB), DWBSWAP
VPSHUFB DWBSWAP, DWB0, DWB0
VPSHUFB DWBSWAP, DWB1, DWB1
VPSHUFB DWBSWAP, DWB2, DWB2
VPSHUFB DWBSWAP, DWB3, DWB3
VMOVDQU (32*0)(ctx), XDWTMP0
VPXOR XDWTMP0, DWB0, DWB0
VPSHUFB DWBSWAP, XDWTMP0, XDWTMP0
VEXTRACTI128 $1, XDWTMP0, T0
internalDecMulRound(1)
increment(0)
VMOVDQU (32*1)(ctx), XDWTMP0
VPXOR XDWTMP0, DWB1, DWB1
VPSHUFB DWBSWAP, XDWTMP0, XDWTMP0
VEXTRACTI128 $0, XDWTMP0, T0
internalDecMulRound(2)
increment(1)
VEXTRACTI128 $1, XDWTMP0, T0
internalDecMulRound(3)
increment(2)
VMOVDQU (32*2)(ctx), XDWTMP0
VPXOR XDWTMP0, DWB2, DWB2
VPSHUFB DWBSWAP, XDWTMP0, XDWTMP0
VEXTRACTI128 $0, XDWTMP0, T0
internalDecMulRound(4)
increment(3)
VEXTRACTI128 $1, XDWTMP0, T0
internalDecMulRound(5)
increment(4)
VMOVDQU (32*3)(ctx), XDWTMP0
VPXOR XDWTMP0, DWB3, DWB3
VPSHUFB DWBSWAP, XDWTMP0, XDWTMP0
VEXTRACTI128 $0, XDWTMP0, T0
internalDecMulRound(6)
increment(5)
VEXTRACTI128 $1, XDWTMP0, T0
internalDecMulRound(7)
increment(6)
increment(7)
VMOVDQU DWB0, (32*0)(ptx)
VMOVDQU DWB1, (32*1)(ptx)
VMOVDQU DWB2, (32*2)(ptx)
VMOVDQU DWB3, (32*3)(ptx)
VPXOR ACC0, ACCM, ACCM
VPXOR ACC1, ACCM, ACCM
VPSLLDQ $8, ACCM, T0
VPSRLDQ $8, ACCM, ACCM
VPXOR ACCM, ACC1, ACC1
VPXOR T0, ACC0, ACC0
reduceRound(ACC0)
reduceRound(ACC0)
VPXOR ACC1, ACC0, ACC0
LEAQ 128(ptx), ptx
LEAQ 128(ctx), ctx
JMP avx2GcmSm4DecOctetsLoop
avx2GcmSm4DecEndOctets:
SUBQ $4, aluCTR
avx2GcmSm4DecNibbles:
VMOVDQU flipMask<>(SB), B7 // DO NOT CHANGE B7
CMPQ ptxLen, $64
JBE avx2GcmSm4DecSingles
SUBQ $64, ptxLen
VMOVDQU (0*16)(SP), B0
VMOVDQU (1*16)(SP), B1
VMOVDQU (2*16)(SP), B2
VMOVDQU (3*16)(SP), B3
VPSHUFB B7, B0, B0
VPSHUFB B7, B1, B1
VPSHUFB B7, B2, B2
VPSHUFB B7, B3, B3
TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
XORL BX, BX
VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK
avx2GcmSm4Dec4Loop2:
AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
ADDL $16, BX
CMPL BX, $4*32
JB avx2GcmSm4Dec4Loop2
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
VPSHUFB BSWAP, B0, B4
VPSHUFB BSWAP, B1, B1
VPSHUFB BSWAP, B2, B2
VPSHUFB BSWAP, B3, B3
VMOVDQU (16*14)(pTbl), T2
VMOVDQU (16*0)(ctx), B0
VPXOR B0, B4, B4
internalDecGhashRound()
VMOVDQU (16*1)(ctx), B0
VPXOR B0, B1, B1
internalDecGhashRound()
VMOVDQU (16*2)(ctx), B0
VPXOR B0, B2, B2
internalDecGhashRound()
VMOVDQU (16*3)(ctx), B0
VPXOR B0, B3, B3
internalDecGhashRound()
VMOVDQU B4, (16*0)(ptx)
VMOVDQU B1, (16*1)(ptx)
VMOVDQU B2, (16*2)(ptx)
VMOVDQU B3, (16*3)(ptx)
increment(0)
increment(1)
increment(2)
increment(3)
LEAQ 64(ptx), ptx
LEAQ 64(ctx), ctx
avx2GcmSm4DecSingles:
TESTQ ptxLen, ptxLen
JE avx2GcmSm4DecDone
VMOVDQU (0*16)(SP), B0
VMOVDQU (1*16)(SP), B1
VMOVDQU (2*16)(SP), B2
VMOVDQU (3*16)(SP), B3
VPSHUFB B7, B0, B0
VPSHUFB B7, B1, B1
VPSHUFB B7, B2, B2
VPSHUFB B7, B3, B3
TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
XORL BX, BX
VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK
avx2GcmSm4Dec4Loop1:
AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
ADDL $16, BX
CMPL BX, $4*32
JB avx2GcmSm4Dec4Loop1
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
VPSHUFB BSWAP, B0, B0
VPSHUFB BSWAP, B1, B1
VPSHUFB BSWAP, B2, B2
VPSHUFB BSWAP, B3, B3
VMOVDQU B0, (16*4)(SP)
VMOVDQU B1, (16*5)(SP)
VMOVDQU B2, (16*6)(SP)
VMOVDQU B3, (16*7)(SP)
VMOVDQU (16*14)(pTbl), T2
MOVQ SP, BP
ADDQ $64, BP
avx2GcmSm4DecSinglesLoop:
CMPQ ptxLen, $16
JB avx2GcmSm4DecTail
SUBQ $16, ptxLen
VMOVDQU (16*0)(BP), T0
VMOVDQU (ctx), B0
VPXOR T0, B0, T0
VMOVDQU T0, (ptx)
internalDecGhashRound()
LEAQ (16*1)(ptx), ptx
LEAQ (16*1)(ctx), ctx
ADDQ $16, BP
JMP avx2GcmSm4DecSinglesLoop
avx2GcmSm4DecTail:
TESTQ ptxLen, ptxLen
JE avx2GcmSm4DecDone
MOVQ ptxLen, aluTMP
SHLQ $4, aluTMP
LEAQ andMask<>(SB), aluCTR
VMOVDQU -16(aluCTR)(aluTMP*1), T1 // fetch the and-mask for ptxLen bytes
VMOVDQU (ctx), B0 // assume the tag follows the ciphertext, so this 16-byte read cannot overflow
VPAND T1, B0, B0 // Just keep ptxLen bytes, others will be zero
VMOVDQU B0, T1
internalDecGhashRound()
VMOVDQU (16*0)(BP), B0
VPXOR T1, B0, B0
avx2PtxStoreLoop:
PEXTRB $0, B0, (ptx)
PSRLDQ $1, B0
LEAQ 1(ptx), ptx
DECQ ptxLen
JNE avx2PtxStoreLoop
avx2GcmSm4DecDone:
VMOVDQU ACC0, (tPtr)
VZEROUPPER
RET
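// The SM4-NI entry points below are empty stubs on amd64 and return
// immediately; the routines above provide the AES-NI/CLMUL implementation.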
// func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0
RET
// func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niDec(SB),NOSPLIT,$0
RET