// gmsm/sm4/gcm_amd64.s

// This is an optimized implementation of SM4-GCM using AES-NI and CLMUL-NI,
// adapted from the AES-GCM implementation. It uses optimizations described in:
// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
// Instruction and its Usage for Computing the GCM Mode rev. 2.02
// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
// Hardware
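//
// Usage note (sketch, not part of the original file): from Go these routines
// sit behind the standard AEAD interface; assuming the usual module path, the
// call chain looks roughly like this, with cipher.NewGCM dispatching to the
// package's assembly-backed GCM when the CPU supports it:
//
//	import (
//		"crypto/cipher"
//		"github.com/emmansun/gmsm/sm4"
//	)
//
//	func seal(key, nonce, plaintext, aad []byte) ([]byte, error) {
//		block, err := sm4.NewCipher(key) // 16-byte SM4 key
//		if err != nil {
//			return nil, err
//		}
//		aead, err := cipher.NewGCM(block)
//		if err != nil {
//			return nil, err
//		}
//		return aead.Seal(nil, nonce, plaintext, aad), nil
//	}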
//go:build !purego
#include "textflag.h"
#define B0 X0
#define B1 X1
#define B2 X2
#define B3 X3
#define B4 X4
#define B5 X5
#define B6 X6
#define B7 X7
#define DWB0 Y0
#define DWB1 Y2
#define DWB2 Y4
#define DWB3 Y6
#define XDWORD Y1
#define YDWORD Y3
#define XDWTMP0 Y5
#define ACC0 X8
#define ACC1 X9
#define ACCM X10
#define T0 X11
#define T1 X12
#define T2 X13
#define POLY X14
#define BSWAP X15
#define DWBSWAP Y15
#define NIBBLE_MASK Y7
#define X_NIBBLE_MASK X7
DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
DATA andMask<>+0x08(SB)/8, $0x0000000000000000
DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
DATA andMask<>+0x18(SB)/8, $0x0000000000000000
DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
DATA andMask<>+0x28(SB)/8, $0x0000000000000000
DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
DATA andMask<>+0x38(SB)/8, $0x0000000000000000
DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
DATA andMask<>+0x48(SB)/8, $0x0000000000000000
DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0x58(SB)/8, $0x0000000000000000
DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
DATA andMask<>+0x68(SB)/8, $0x0000000000000000
DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x78(SB)/8, $0x0000000000000000
DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
GLOBL andMask<>(SB), (NOPTR+RODATA), $240
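// gcmPoly<> holds 0xc2000000000000000000000000000001, the bit-reflected
// reduction constant for the GHASH polynomial g(x) = x^128 + x^7 + x^2 + x + 1.
// andMask<> is a table of 15 byte masks: the entry at offset 16*(n-1) keeps the
// low n bytes of a block (n = 1..15) and is used to mask partial tail blocks.
// Illustrative Go equivalent of one entry (sketch):
//
//	func keepLowBytes(n int) (m [16]byte) { // 1 <= n <= 15
//		for i := 0; i < n; i++ {
//			m[i] = 0xff
//		}
//		return m
//	}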
#include "aesni_macros_amd64.s"
// func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
#define pTbl DI
#define tMsk SI
#define tPtr DX
#define plen AX
#define dlen CX
MOVQ productTable+0(FP), pTbl
MOVQ tagMask+8(FP), tMsk
MOVQ T+16(FP), tPtr
MOVQ pLen+24(FP), plen
MOVQ dLen+32(FP), dlen
MOVOU (tPtr), ACC0
MOVOU (tMsk), T2
MOVOU ·bswap_mask(SB), BSWAP
MOVOU gcmPoly<>(SB), POLY
SHLQ $3, plen
SHLQ $3, dlen
MOVQ plen, B0
PINSRQ $1, dlen, B0
PXOR ACC0, B0
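// The final GHASH block is len64(A) || len64(C) in bits (NIST SP 800-38D).
// ACC0 is kept byte-reflected, and the reflection of that block is simply the
// plaintext bit length in the low qword and the AAD bit length in the high
// qword, so MOVQ/PINSRQ build it without a byte swap. Spec-level Go sketch
// (aadBits/ctBits are placeholder names; uses encoding/binary):
//
//	var lenBlock [16]byte
//	binary.BigEndian.PutUint64(lenBlock[0:8], aadBits)
//	binary.BigEndian.PutUint64(lenBlock[8:16], ctBits)
//	// reversing the 16 bytes of lenBlock yields the value assembled above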
MOVOU (16*14)(pTbl), ACC0
MOVOU (16*15)(pTbl), ACCM
MOVOU ACC0, ACC1
PCLMULQDQ $0x00, B0, ACC0
PCLMULQDQ $0x11, B0, ACC1
PSHUFD $78, B0, T0
PXOR B0, T0
PCLMULQDQ $0x00, T0, ACCM
PXOR ACC0, ACCM
PXOR ACC1, ACCM
MOVOU ACCM, T0
PSRLDQ $8, ACCM
PSLLDQ $8, T0
PXOR ACCM, ACC1
PXOR T0, ACC0
MOVOU POLY, T0
PCLMULQDQ $0x01, ACC0, T0
PSHUFD $78, ACC0, ACC0
PXOR T0, ACC0
MOVOU POLY, T0
PCLMULQDQ $0x01, ACC0, T0
PSHUFD $78, ACC0, ACC0
PXOR T0, ACC0
PXOR ACC1, ACC0
PSHUFB BSWAP, ACC0
PXOR T2, ACC0
MOVOU ACC0, (tPtr)
RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen
// func gcmSm4Init(productTable *[256]byte, rk []uint32)
TEXT ·gcmSm4Init(SB),NOSPLIT,$0
#define dst DI
#define RK SI
MOVQ productTable+0(FP), dst
MOVQ rk+8(FP), RK
MOVOU gcmPoly<>(SB), POLY
// Encrypt the all-zero block with the SM4 round keys to generate the hash key H
PXOR B0, B0
PXOR B1, B1
PXOR B2, B2
PXOR B3, B3
XORL CX, CX
sm4InitEncLoop:
MOVUPS (RK)(CX*1), B4
MOVOU B4, T0
SM4_SINGLE_ROUND(T0, T1, T2, B3, B2, B1, B0)
PSHUFD $1, B4, T0
SM4_SINGLE_ROUND(T0, T1, T2, B2, B1, B0, B3)
PSHUFD $2, B4, T0
SM4_SINGLE_ROUND(T0, T1, T2, B1, B0, B3, B2)
PSHUFD $3, B4, T0
SM4_SINGLE_ROUND(T0, T1, T2, B0, B3, B2, B1)
ADDL $16, CX
CMPL CX, $4*32
JB sm4InitEncLoop
PALIGNR $4, B3, B3
PALIGNR $4, B3, B2
PALIGNR $4, B2, B1
PALIGNR $4, B1, B0
// H * 2
PSHUFD $0xff, B0, T0
MOVOU B0, T1
PSRAL $31, T0
PAND POLY, T0
PSRLL $31, T1
PSLLDQ $4, T1
PSLLL $1, B0
PXOR T0, B0
PXOR T1, B0
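// The block above doubles H in GF(2^128): a 128-bit left shift by one (done
// per 32-bit lane with carry propagation) plus a conditional XOR of gcmPoly
// when the top bit falls off. Go sketch of the same step (assumption: hLo/hHi
// are the low/high qwords of the byte-reflected H held in B0):
//
//	func doubleH(hLo, hHi uint64) (rLo, rHi uint64) {
//		msb := hHi >> 63 // bit shifted out of the 128-bit value
//		rHi = hHi<<1 | hLo>>63
//		rLo = hLo << 1
//		if msb == 1 { // conditional reduction with gcmPoly
//			rLo ^= 0x0000000000000001
//			rHi ^= 0xc200000000000000
//		}
//		return
//	}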
// Karatsuba pre-computations
MOVOU B0, (16*14)(dst)
PSHUFD $78, B0, B1
PXOR B0, B1
MOVOU B1, (16*15)(dst)
MOVOU B0, B2
MOVOU B1, B3
// Now prepare powers of H and pre-computations for them
MOVQ $7, AX
initLoop:
// B0 * B2, Karatsuba Approach
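// Karatsuba identity used here: with A = A1*x^64 ^ A0 and B = B1*x^64 ^ B0
// (all products carry-less),
//	A*B = A1B1*x^128 ^ [(A1^A0)*(B1^B0) ^ A1B1 ^ A0B0]*x^64 ^ A0B0
// so the 128x128-bit product costs three PCLMULQDQs instead of four.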
MOVOU B2, T0
MOVOU B2, T1
MOVOU B3, T2
PCLMULQDQ $0x00, B0, T0 // B0[0] * B2[0]
PCLMULQDQ $0x11, B0, T1 // B0[1] * B2[1]
PCLMULQDQ $0x00, B1, T2 // (B0[0] + B0[1]) * (B2[0] + B2[1])
PXOR T0, T2 // (B0[0] + B0[1]) * (B2[0] + B2[1]) - B0[0] * B2[0]
PXOR T1, T2 // B0[0] * B2[1] + B0[1] * B2[0]
MOVOU T2, B4
PSLLDQ $8, B4
PSRLDQ $8, T2
PXOR B4, T0
PXOR T2, T1 // [T1, T0] = B0 * B2
// Fast reduction
// 1st reduction
MOVOU POLY, B2
PCLMULQDQ $0x01, T0, B2 // B2 = T0[0] * POLY[1]
PSHUFD $78, T0, T0
PXOR B2, T0
// 2nd reduction
MOVOU POLY, B2
PCLMULQDQ $0x01, T0, B2
PSHUFD $78, T0, T0
PXOR T0, B2
PXOR T1, B2
MOVOU B2, (16*12)(dst)
PSHUFD $78, B2, B3
PXOR B2, B3
MOVOU B3, (16*13)(dst)
DECQ AX
LEAQ (-16*2)(dst), dst
JNE initLoop
RET
#undef RK
#undef dst
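// productTable layout written by gcmSm4Init: H^i and its Karatsuba helper
// (high qword ^ low qword) live at offsets 16*(16-2i) and 16*(16-2i)+16, i.e.
// H at (16*14, 16*15), H^2 at (16*12, 16*13), ..., H^8 at (16*0, 16*1).
// The 8-block loops below therefore pair data block j (j = 0..7) with H^(8-j).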
// func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)
TEXT ·gcmSm4Data(SB),NOSPLIT,$0
#define pTbl DI
#define aut SI
#define tPtr CX
#define autLen DX
#define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
#define avxReduceRound(a) VPCLMULQDQ $0x01, a, POLY, T0; VPSHUFD $78, a, a; VPXOR T0, a, a
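// Each reduceRound folds the low qword of its argument into the high half:
//	a = swap64(a) ^ clmul(a[63:0], 0xc200000000000000)
// Applied twice to the low 128 bits of the 256-bit Karatsuba product (ACC0),
// followed by an XOR with the high 128 bits (ACC1), this yields the product
// reduced modulo g. avxReduceRound is the same step with non-destructive VEX
// three-operand encodings.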
#define mulRoundAAD(X ,i) \
MOVOU (16*(i*2))(pTbl), T1;\
MOVOU T1, T2;\
PCLMULQDQ $0x00, X, T1;\
PXOR T1, ACC0;\
PCLMULQDQ $0x11, X, T2;\
PXOR T2, ACC1;\
PSHUFD $78, X, T1;\
PXOR T1, X;\
MOVOU (16*(i*2+1))(pTbl), T1;\
PCLMULQDQ $0x00, X, T1;\
PXOR T1, ACCM
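// The 8-block AAD loop below defers the reduction: with Y the running hash and
// X0..X7 the byte-reflected input blocks,
//	Y' = ((Y ^ X0)*H^8 ^ X1*H^7 ^ ... ^ X7*H) mod g
// so all eight products accumulate in ACC0/ACC1/ACCM and are reduced once.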
MOVQ productTable+0(FP), pTbl
MOVQ data_base+8(FP), aut
MOVQ data_len+16(FP), autLen
MOVQ T+32(FP), tPtr
PXOR ACC0, ACC0
// MOVOU (tPtr), ACC0 // originally the initial tag value was loaded here
MOVOU ·bswap_mask(SB), BSWAP
MOVOU gcmPoly<>(SB), POLY
TESTQ autLen, autLen
JEQ dataBail
CMPQ autLen, $13 // optimize the TLS case
JE dataTLS
CMPQ autLen, $128
JB startSinglesLoop
JMP dataOctaLoop
dataTLS:
MOVOU (16*14)(pTbl), T1
MOVOU (16*15)(pTbl), T2
PXOR B0, B0
MOVQ (aut), B0
PINSRD $2, 8(aut), B0
PINSRB $12, 12(aut), B0
XORQ autLen, autLen
JMP dataMul
dataOctaLoop:
CMPQ autLen, $128
JB startSinglesLoop
SUBQ $128, autLen
MOVOU (16*0)(aut), X0
MOVOU (16*1)(aut), X1
MOVOU (16*2)(aut), X2
MOVOU (16*3)(aut), X3
MOVOU (16*4)(aut), X4
MOVOU (16*5)(aut), X5
MOVOU (16*6)(aut), X6
MOVOU (16*7)(aut), X7
LEAQ (16*8)(aut), aut
PSHUFB BSWAP, X0
PSHUFB BSWAP, X1
PSHUFB BSWAP, X2
PSHUFB BSWAP, X3
PSHUFB BSWAP, X4
PSHUFB BSWAP, X5
PSHUFB BSWAP, X6
PSHUFB BSWAP, X7
PXOR ACC0, X0
MOVOU (16*0)(pTbl), ACC0
MOVOU (16*1)(pTbl), ACCM
MOVOU ACC0, ACC1
PSHUFD $78, X0, T1
PXOR X0, T1
PCLMULQDQ $0x00, X0, ACC0
PCLMULQDQ $0x11, X0, ACC1
PCLMULQDQ $0x00, T1, ACCM
mulRoundAAD(X1, 1)
mulRoundAAD(X2, 2)
mulRoundAAD(X3, 3)
mulRoundAAD(X4, 4)
mulRoundAAD(X5, 5)
mulRoundAAD(X6, 6)
mulRoundAAD(X7, 7)
PXOR ACC0, ACCM
PXOR ACC1, ACCM
MOVOU ACCM, T0
PSRLDQ $8, ACCM
PSLLDQ $8, T0
PXOR ACCM, ACC1
PXOR T0, ACC0
reduceRound(ACC0)
reduceRound(ACC0)
PXOR ACC1, ACC0
JMP dataOctaLoop
startSinglesLoop:
MOVOU (16*14)(pTbl), T1
MOVOU (16*15)(pTbl), T2
dataSinglesLoop:
CMPQ autLen, $16
JB dataEnd
SUBQ $16, autLen
MOVOU (aut), B0
dataMul:
PSHUFB BSWAP, B0
PXOR ACC0, B0
MOVOU T1, ACC0
MOVOU T2, ACCM
MOVOU T1, ACC1
PSHUFD $78, B0, T0
PXOR B0, T0
PCLMULQDQ $0x00, B0, ACC0
PCLMULQDQ $0x11, B0, ACC1
PCLMULQDQ $0x00, T0, ACCM
PXOR ACC0, ACCM
PXOR ACC1, ACCM
MOVOU ACCM, T0
PSRLDQ $8, ACCM
PSLLDQ $8, T0
PXOR ACCM, ACC1
PXOR T0, ACC0
MOVOU POLY, T0
PCLMULQDQ $0x01, ACC0, T0
PSHUFD $78, ACC0, ACC0
PXOR T0, ACC0
MOVOU POLY, T0
PCLMULQDQ $0x01, ACC0, T0
PSHUFD $78, ACC0, ACC0
PXOR T0, ACC0
PXOR ACC1, ACC0
LEAQ 16(aut), aut
JMP dataSinglesLoop
dataEnd:
TESTQ autLen, autLen
JEQ dataBail
PXOR B0, B0
LEAQ -1(aut)(autLen*1), aut
dataLoadLoop:
PSLLDQ $1, B0
PINSRB $0, (aut), B0
LEAQ -1(aut), aut
DECQ autLen
JNE dataLoadLoop
JMP dataMul
dataBail:
MOVOU ACC0, (tPtr)
RET
#undef pTbl
#undef aut
#undef tPtr
#undef autLen
// func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4Enc(SB),0,$256-96
#define pTbl DI
#define ctx DX
#define ctrPtr CX
#define ptx SI
#define rk AX
#define tPtr R8
#define ptxLen R9
#define aluCTR R10
#define aluTMP R11
#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, (3*4 + 8*16 + i*16)(SP)
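// Stack layout for gcmSm4Enc ($256 frame): SP+0*16 .. SP+7*16 hold the eight
// byte-reflected ciphertext blocks whose GHASH is deferred to the next loop
// iteration; SP+8*16 .. SP+15*16 hold the eight CTR blocks. increment(i) bumps
// the 32-bit counter and patches dword 3 of CTR block i in place.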
#define mulRound(i) \
MOVOU (16*i)(SP), T0;\
MOVOU (16*(i*2))(pTbl), T1;\
MOVOU T1, T2;\
PCLMULQDQ $0x00, T0, T1;\
PXOR T1, ACC0;\
PCLMULQDQ $0x11, T0, T2;\
PXOR T2, ACC1;\
PSHUFD $78, T0, T1;\
PXOR T1, T0;\
MOVOU (16*(i*2+1))(pTbl), T1;\
PCLMULQDQ $0x00, T0, T1;\
PXOR T1, ACCM
#define gcmEncDataStep(B) \
PSHUFB BSWAP, B; \
PXOR ACC0, B; \
MOVOU T2, ACC0; \
MOVOU T2, ACC1; \
MOVOU (16*15)(pTbl), ACCM; \
PSHUFD $78, B, T0; \
PXOR B, T0; \
PCLMULQDQ $0x00, B, ACC0; \
PCLMULQDQ $0x11, B, ACC1; \
PCLMULQDQ $0x00, T0, ACCM; \
PXOR ACC0, ACCM; \
PXOR ACC1, ACCM; \
MOVOU ACCM, T0; \
PSRLDQ $8, ACCM; \
PSLLDQ $8, T0; \
PXOR ACCM, ACC1; \
PXOR T0, ACC0; \
reduceRound(ACC0); \
reduceRound(ACC0); \
PXOR ACC1, ACC0
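// gcmEncDataStep(B) is one plain GHASH update with H (T2 caches (16*14)(pTbl),
// i.e. H): ACC0 = (ACC0 ^ bswap(B)) * H mod g, using the Karatsuba split and
// the two reduction rounds above.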
#define avxMulRound(i) \
VMOVDQU (16*i)(SP), T0;\
VMOVDQU (16*(i*2))(pTbl), T2;\
VPCLMULQDQ $0x00, T0, T2, T1;\
VPXOR T1, ACC0, ACC0;\
VPCLMULQDQ $0x11, T0, T2, T2;\
VPXOR T2, ACC1, ACC1;\
VPSHUFD $78, T0, T1;\
VPXOR T1, T0, T0;\
VMOVDQU (16*(i*2+1))(pTbl), T1;\
VPCLMULQDQ $0x00, T0, T1, T1;\
VPXOR T1, ACCM, ACCM
#define avxGcmEncDataStep(B) \
VPSHUFB BSWAP, B, B; \
VPXOR ACC0, B, B; \
VMOVDQU (16*15)(pTbl), ACCM; \
VPSHUFD $78, B, T0; \
VPXOR B, T0, T0; \
VPCLMULQDQ $0x00, B, T2, ACC0; \
VPCLMULQDQ $0x11, B, T2, ACC1; \
VPCLMULQDQ $0x00, T0, ACCM, ACCM; \
VPXOR ACC0, ACCM, ACCM; \
VPXOR ACC1, ACCM, ACCM; \
VPSLLDQ $8, ACCM, T0; \
VPSRLDQ $8, ACCM, ACCM; \
VPXOR ACCM, ACC1, ACC1; \
VPXOR T0, ACC0, ACC0; \
avxReduceRound(ACC0); \
avxReduceRound(ACC0); \
VPXOR ACC1, ACC0, ACC0
MOVQ productTable+0(FP), pTbl
MOVQ dst+8(FP), ctx
MOVQ src_base+32(FP), ptx
MOVQ src_len+40(FP), ptxLen
MOVQ ctr+56(FP), ctrPtr
MOVQ T+64(FP), tPtr
MOVQ rk_base+72(FP), rk
CMPB ·useAVX2(SB), $1
JE avx2GcmSm4Enc
CMPB ·useAVX(SB), $1
JE avxGcmSm4Enc
MOVOU ·bswap_mask(SB), BSWAP
MOVOU gcmPoly<>(SB), POLY
MOVOU (tPtr), ACC0
PXOR ACC1, ACC1
PXOR ACCM, ACCM
MOVOU (ctrPtr), T0
PSHUFB ·flip_mask(SB), T0
PEXTRD $3, T0, aluCTR
MOVOU T0, (8*16 + 0*16)(SP)
increment(0)
MOVOU T0, (8*16 + 1*16)(SP)
increment(1)
MOVOU T0, (8*16 + 2*16)(SP)
increment(2)
MOVOU T0, (8*16 + 3*16)(SP)
increment(3)
CMPQ ptxLen, $128
JB gcmSm4EncNibbles
SUBQ $128, ptxLen
// We have at least 8 blocks to encrypt; prepare the rest of the counters
MOVOU T0, (8*16 + 4*16)(SP)
increment(4)
MOVOU T0, (8*16 + 5*16)(SP)
increment(5)
MOVOU T0, (8*16 + 6*16)(SP)
increment(6)
MOVOU T0, (8*16 + 7*16)(SP)
increment(7)
// load 8 ctrs for encryption
MOVOU (8*16 + 0*16)(SP), B0
MOVOU (8*16 + 1*16)(SP), B1
MOVOU (8*16 + 2*16)(SP), B2
MOVOU (8*16 + 3*16)(SP), B3
MOVOU (8*16 + 4*16)(SP), B4
MOVOU (8*16 + 5*16)(SP), B5
MOVOU (8*16 + 6*16)(SP), B6
MOVOU (8*16 + 7*16)(SP), B7
SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
increment(0)
// XOR plaintext
MOVOU (16*0)(ptx), T0
PXOR T0, B0
increment(1)
MOVOU (16*1)(ptx), T0
PXOR T0, B1
increment(2)
MOVOU (16*2)(ptx), T0
PXOR T0, B2
increment(3)
MOVOU (16*3)(ptx), T0
PXOR T0, B3
increment(4)
MOVOU (16*4)(ptx), T0
PXOR T0, B4
increment(5)
MOVOU (16*5)(ptx), T0
PXOR T0, B5
increment(6)
MOVOU (16*6)(ptx), T0
PXOR T0, B6
increment(7)
MOVOU (16*7)(ptx), T0
PXOR T0, B7
// Store ciphertext
MOVOU B0, (16*0)(ctx)
PSHUFB BSWAP, B0
PXOR ACC0, B0
MOVOU B1, (16*1)(ctx)
PSHUFB BSWAP, B1
MOVOU B2, (16*2)(ctx)
PSHUFB BSWAP, B2
MOVOU B3, (16*3)(ctx)
PSHUFB BSWAP, B3
MOVOU B4, (16*4)(ctx)
PSHUFB BSWAP, B4
MOVOU B5, (16*5)(ctx)
PSHUFB BSWAP, B5
MOVOU B6, (16*6)(ctx)
PSHUFB BSWAP, B6
MOVOU B7, (16*7)(ctx)
PSHUFB BSWAP, B7
MOVOU B0, (16*0)(SP)
MOVOU B1, (16*1)(SP)
MOVOU B2, (16*2)(SP)
MOVOU B3, (16*3)(SP)
MOVOU B4, (16*4)(SP)
MOVOU B5, (16*5)(SP)
MOVOU B6, (16*6)(SP)
MOVOU B7, (16*7)(SP)
LEAQ 128(ptx), ptx
LEAQ 128(ctx), ctx
gcmSm4EncOctetsLoop:
CMPQ ptxLen, $128
JB gcmSm4EncOctetsEnd
SUBQ $128, ptxLen
MOVOU (8*16 + 0*16)(SP), B0
MOVOU (8*16 + 1*16)(SP), B1
MOVOU (8*16 + 2*16)(SP), B2
MOVOU (8*16 + 3*16)(SP), B3
MOVOU (8*16 + 4*16)(SP), B4
MOVOU (8*16 + 5*16)(SP), B5
MOVOU (8*16 + 6*16)(SP), B6
MOVOU (8*16 + 7*16)(SP), B7
MOVOU (16*0)(SP), T0
PSHUFD $78, T0, T1
PXOR T0, T1
MOVOU (16*0)(pTbl), ACC0
MOVOU (16*1)(pTbl), ACCM
MOVOU ACC0, ACC1
PCLMULQDQ $0x00, T1, ACCM
PCLMULQDQ $0x00, T0, ACC0
PCLMULQDQ $0x11, T0, ACC1
mulRound(1)
increment(0)
mulRound(2)
increment(1)
mulRound(3)
increment(2)
mulRound(4)
increment(3)
mulRound(5)
increment(4)
mulRound(6)
increment(5)
mulRound(7)
increment(6)
PXOR ACC0, ACCM
PXOR ACC1, ACCM
MOVOU ACCM, T0
PSRLDQ $8, ACCM
PSLLDQ $8, T0
PXOR ACCM, ACC1
PXOR T0, ACC0
increment(7)
reduceRound(ACC0)
reduceRound(ACC0)
PXOR ACC1, ACC0
SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
MOVOU (16*0)(ptx), T0
PXOR T0, B0
MOVOU (16*1)(ptx), T0
PXOR T0, B1
MOVOU (16*2)(ptx), T0
PXOR T0, B2
MOVOU (16*3)(ptx), T0
PXOR T0, B3
MOVOU (16*4)(ptx), T0
PXOR T0, B4
MOVOU (16*5)(ptx), T0
PXOR T0, B5
MOVOU (16*6)(ptx), T0
PXOR T0, B6
MOVOU (16*7)(ptx), T0
PXOR T0, B7
MOVOU B0, (16*0)(ctx)
PSHUFB BSWAP, B0
PXOR ACC0, B0
MOVOU B1, (16*1)(ctx)
PSHUFB BSWAP, B1
MOVOU B2, (16*2)(ctx)
PSHUFB BSWAP, B2
MOVOU B3, (16*3)(ctx)
PSHUFB BSWAP, B3
MOVOU B4, (16*4)(ctx)
PSHUFB BSWAP, B4
MOVOU B5, (16*5)(ctx)
PSHUFB BSWAP, B5
MOVOU B6, (16*6)(ctx)
PSHUFB BSWAP, B6
MOVOU B7, (16*7)(ctx)
PSHUFB BSWAP, B7
MOVOU B0, (16*0)(SP)
MOVOU B1, (16*1)(SP)
MOVOU B2, (16*2)(SP)
MOVOU B3, (16*3)(SP)
MOVOU B4, (16*4)(SP)
MOVOU B5, (16*5)(SP)
MOVOU B6, (16*6)(SP)
MOVOU B7, (16*7)(SP)
LEAQ 128(ptx), ptx
LEAQ 128(ctx), ctx
JMP gcmSm4EncOctetsLoop
gcmSm4EncOctetsEnd:
MOVOU (16*0)(SP), T0
MOVOU (16*0)(pTbl), ACC0
MOVOU (16*1)(pTbl), ACCM
MOVOU ACC0, ACC1
PSHUFD $78, T0, T1
PXOR T0, T1
PCLMULQDQ $0x00, T0, ACC0
PCLMULQDQ $0x11, T0, ACC1
PCLMULQDQ $0x00, T1, ACCM
mulRound(1)
mulRound(2)
mulRound(3)
mulRound(4)
mulRound(5)
mulRound(6)
mulRound(7)
PXOR ACC0, ACCM
PXOR ACC1, ACCM
MOVOU ACCM, T0
PSRLDQ $8, ACCM
PSLLDQ $8, T0
PXOR ACCM, ACC1
PXOR T0, ACC0
reduceRound(ACC0)
reduceRound(ACC0)
PXOR ACC1, ACC0
TESTQ ptxLen, ptxLen
JE gcmSm4EncDone
SUBQ $4, aluCTR
gcmSm4EncNibbles:
CMPQ ptxLen, $64
JBE gcmSm4EncSingles
SUBQ $64, ptxLen
MOVOU (8*16 + 0*16)(SP), B0
MOVOU (8*16 + 1*16)(SP), B1
MOVOU (8*16 + 2*16)(SP), B2
MOVOU (8*16 + 3*16)(SP), B3
SM4_4BLOCKS_WO_BS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
MOVOU (16*0)(ptx), T0
PXOR T0, B0
MOVOU (16*1)(ptx), T0
PXOR T0, B1
MOVOU (16*2)(ptx), T0
PXOR T0, B2
MOVOU (16*3)(ptx), T0
PXOR T0, B3
MOVOU B0, (16*0)(ctx)
MOVOU B1, (16*1)(ctx)
MOVOU B2, (16*2)(ctx)
MOVOU B3, (16*3)(ctx)
MOVOU (16*14)(pTbl), T2
increment(0)
gcmEncDataStep(B0)
increment(1)
gcmEncDataStep(B1)
increment(2)
gcmEncDataStep(B2)
increment(3)
gcmEncDataStep(B3)
LEAQ 64(ptx), ptx
LEAQ 64(ctx), ctx
gcmSm4EncSingles:
TESTQ ptxLen, ptxLen
JE gcmSm4EncDone
MOVOU (8*16 + 0*16)(SP), B0
MOVOU (8*16 + 1*16)(SP), B1
MOVOU (8*16 + 2*16)(SP), B2
MOVOU (8*16 + 3*16)(SP), B3
SM4_4BLOCKS_WO_BS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
MOVOU B0, (16*0)(SP)
MOVOU B1, (16*1)(SP)
MOVOU B2, (16*2)(SP)
MOVOU B3, (16*3)(SP)
MOVOU (16*14)(pTbl), T2
MOVQ SP, BP
gcmSm4EncSinglesLoop:
CMPQ ptxLen, $16
JB gcmSm4EncTail
SUBQ $16, ptxLen
MOVOU (16*0)(BP), B0
MOVOU (ptx), T0
PXOR T0, B0
MOVOU B0, (ctx)
gcmEncDataStep(B0)
LEAQ (16*1)(ptx), ptx
LEAQ (16*1)(ctx), ctx
ADDQ $16, BP
JMP gcmSm4EncSinglesLoop
gcmSm4EncTail:
TESTQ ptxLen, ptxLen
JE gcmSm4EncDone
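// Tail: load the final 1..15 plaintext bytes back to front into B0, XOR the
// keystream block and mask before storing and hashing. Rough Go equivalent
// (sketch; pt, ct, ks and ghashUpdate are placeholder names):
//
//	n := len(pt) & 15                // remaining bytes, 1..15
//	var last [16]byte
//	copy(last[:n], pt[len(pt)-n:])   // zero-padded partial block
//	for i := range last {
//		last[i] ^= ks[i]             // ks = encrypted counter block
//	}
//	for i := n; i < 16; i++ {
//		last[i] = 0                  // what the andMask<> load enforces
//	}
//	copy(ct[len(ct)-n:], last[:n])   // the asm stores 16 bytes; the tag slot absorbs the rest
//	ghashUpdate(last)                // masked ciphertext block enters GHASH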
MOVOU (16*0)(BP), B0
MOVOU B0, T0
LEAQ -1(ptx)(ptxLen*1), ptx
MOVQ ptxLen, aluTMP
SHLQ $4, aluTMP
LEAQ andMask<>(SB), aluCTR
MOVOU -16(aluCTR)(aluTMP*1), T1
PXOR B0, B0
ptxLoadLoop:
PSLLDQ $1, B0
PINSRB $0, (ptx), B0
LEAQ -1(ptx), ptx
DECQ ptxLen
JNE ptxLoadLoop
PXOR T0, B0
PAND T1, B0
MOVOU B0, (ctx) // assumes there is always room for 16 bytes here, since the tag follows the ciphertext
gcmEncDataStep(B0)
gcmSm4EncDone:
MOVOU ACC0, (tPtr)
RET
avxGcmSm4Enc:
VMOVDQU ·bswap_mask(SB), BSWAP
VMOVDQU gcmPoly<>(SB), POLY
VMOVDQU (tPtr), ACC0
VPXOR ACC1, ACC1, ACC1
VPXOR ACCM, ACCM, ACCM
VMOVDQU (ctrPtr), T0
VPSHUFB ·flip_mask(SB), T0, T0
VPEXTRD $3, T0, aluCTR
VMOVDQU T0, (8*16 + 0*16)(SP)
increment(0)
VMOVDQU T0, (8*16 + 1*16)(SP)
increment(1)
VMOVDQU T0, (8*16 + 2*16)(SP)
increment(2)
VMOVDQU T0, (8*16 + 3*16)(SP)
increment(3)
CMPQ ptxLen, $128
JB avxGcmSm4EncNibbles
SUBQ $128, ptxLen
// We have at least 8 blocks to encrypt; prepare the rest of the counters
VMOVDQU T0, (8*16 + 4*16)(SP)
increment(4)
VMOVDQU T0, (8*16 + 5*16)(SP)
increment(5)
VMOVDQU T0, (8*16 + 6*16)(SP)
increment(6)
VMOVDQU T0, (8*16 + 7*16)(SP)
increment(7)
// load 8 ctrs for encryption
VMOVDQU (8*16 + 0*16)(SP), B0
VMOVDQU (8*16 + 1*16)(SP), B1
VMOVDQU (8*16 + 2*16)(SP), B2
VMOVDQU (8*16 + 3*16)(SP), B3
VMOVDQU (8*16 + 4*16)(SP), B4
VMOVDQU (8*16 + 5*16)(SP), B5
VMOVDQU (8*16 + 6*16)(SP), B6
VMOVDQU (8*16 + 7*16)(SP), B7
AVX_SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
increment(0)
// XOR plaintext
VPXOR (16*0)(ptx), B0, B0
VPXOR (16*1)(ptx), B1, B1
increment(1)
VPXOR (16*2)(ptx), B2, B2
VPXOR (16*3)(ptx), B3, B3
increment(2)
VPXOR (16*4)(ptx), B4, B4
VPXOR (16*5)(ptx), B5, B5
increment(3)
VPXOR (16*6)(ptx), B6, B6
VPXOR (16*7)(ptx), B7, B7
// Store ciphertext
VMOVDQU B0, (16*0)(ctx)
VPSHUFB BSWAP, B0, B0
increment(4)
VMOVDQU B1, (16*1)(ctx)
VPSHUFB BSWAP, B1, B1
increment(5)
VMOVDQU B2, (16*2)(ctx)
VPSHUFB BSWAP, B2, B2
increment(6)
VMOVDQU B3, (16*3)(ctx)
VPSHUFB BSWAP, B3, B3
increment(7)
VMOVDQU B4, (16*4)(ctx)
VPSHUFB BSWAP, B4, B4
VMOVDQU B5, (16*5)(ctx)
VPSHUFB BSWAP, B5, B5
VMOVDQU B6, (16*6)(ctx)
VPSHUFB BSWAP, B6, B6
VMOVDQU B7, (16*7)(ctx)
VPSHUFB BSWAP, B7, B7
VPXOR ACC0, B0, B0
VMOVDQU B0, (16*0)(SP)
VMOVDQU B1, (16*1)(SP)
VMOVDQU B2, (16*2)(SP)
VMOVDQU B3, (16*3)(SP)
VMOVDQU B4, (16*4)(SP)
VMOVDQU B5, (16*5)(SP)
VMOVDQU B6, (16*6)(SP)
VMOVDQU B7, (16*7)(SP)
LEAQ 128(ptx), ptx
LEAQ 128(ctx), ctx
avxGcmSm4EncOctetsLoop:
CMPQ ptxLen, $128
JB avxGcmSm4EncOctetsEnd
SUBQ $128, ptxLen
// load 8 ctrs for encryption
VMOVDQU (8*16 + 0*16)(SP), B0
VMOVDQU (8*16 + 1*16)(SP), B1
VMOVDQU (8*16 + 2*16)(SP), B2
VMOVDQU (8*16 + 3*16)(SP), B3
VMOVDQU (8*16 + 4*16)(SP), B4
VMOVDQU (8*16 + 5*16)(SP), B5
VMOVDQU (8*16 + 6*16)(SP), B6
VMOVDQU (8*16 + 7*16)(SP), B7
VMOVDQU (16*0)(SP), T0
VPSHUFD $78, T0, T1
VPXOR T0, T1, T1
VMOVDQU (16*0)(pTbl), ACC1
VMOVDQU (16*1)(pTbl), ACCM
VPCLMULQDQ $0x00, T1, ACCM, ACCM
VPCLMULQDQ $0x00, T0, ACC1, ACC0
VPCLMULQDQ $0x11, T0, ACC1, ACC1
avxMulRound(1)
increment(0)
avxMulRound(2)
increment(1)
avxMulRound(3)
increment(2)
avxMulRound(4)
increment(3)
avxMulRound(5)
increment(4)
avxMulRound(6)
increment(5)
avxMulRound(7)
increment(6)
VPXOR ACC0, ACCM, ACCM
VPXOR ACC1, ACCM, ACCM
VPSLLDQ $8, ACCM, T0
VPSRLDQ $8, ACCM, ACCM
VPXOR ACCM, ACC1, ACC1
VPXOR T0, ACC0, ACC0
increment(7)
avxReduceRound(ACC0)
avxReduceRound(ACC0)
VPXOR ACC1, ACC0, ACC0
AVX_SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
// XOR plaintext
VPXOR (16*0)(ptx), B0, B0
VPXOR (16*1)(ptx), B1, B1
VPXOR (16*2)(ptx), B2, B2
VPXOR (16*3)(ptx), B3, B3
VPXOR (16*4)(ptx), B4, B4
VPXOR (16*5)(ptx), B5, B5
VPXOR (16*6)(ptx), B6, B6
VPXOR (16*7)(ptx), B7, B7
// Store ciphertext
VMOVDQU B0, (16*0)(ctx)
VPSHUFB BSWAP, B0, B0
VMOVDQU B1, (16*1)(ctx)
VPSHUFB BSWAP, B1, B1
VMOVDQU B2, (16*2)(ctx)
VPSHUFB BSWAP, B2, B2
VMOVDQU B3, (16*3)(ctx)
VPSHUFB BSWAP, B3, B3
VMOVDQU B4, (16*4)(ctx)
VPSHUFB BSWAP, B4, B4
VMOVDQU B5, (16*5)(ctx)
VPSHUFB BSWAP, B5, B5
VMOVDQU B6, (16*6)(ctx)
VPSHUFB BSWAP, B6, B6
VMOVDQU B7, (16*7)(ctx)
VPSHUFB BSWAP, B7, B7
VPXOR ACC0, B0, B0
VMOVDQU B0, (16*0)(SP)
VMOVDQU B1, (16*1)(SP)
VMOVDQU B2, (16*2)(SP)
VMOVDQU B3, (16*3)(SP)
VMOVDQU B4, (16*4)(SP)
VMOVDQU B5, (16*5)(SP)
VMOVDQU B6, (16*6)(SP)
VMOVDQU B7, (16*7)(SP)
LEAQ 128(ptx), ptx
LEAQ 128(ctx), ctx
JMP avxGcmSm4EncOctetsLoop
avxGcmSm4EncOctetsEnd:
VMOVDQU (16*0)(SP), T0
VMOVDQU (16*0)(pTbl), ACC0
VMOVDQU (16*1)(pTbl), ACCM
VMOVDQU ACC0, ACC1
VPSHUFD $78, T0, T1
VPXOR T0, T1, T1
VPCLMULQDQ $0x00, T0, ACC0, ACC0
VPCLMULQDQ $0x11, T0, ACC1, ACC1
VPCLMULQDQ $0x00, T1, ACCM, ACCM
avxMulRound(1)
avxMulRound(2)
avxMulRound(3)
avxMulRound(4)
avxMulRound(5)
avxMulRound(6)
avxMulRound(7)
VPXOR ACC0, ACCM, ACCM
VPXOR ACC1, ACCM, ACCM
VPSLLDQ $8, ACCM, T0
VPSRLDQ $8, ACCM, ACCM
VPXOR ACCM, ACC1, ACC1
VPXOR T0, ACC0, ACC0
avxReduceRound(ACC0)
avxReduceRound(ACC0)
VPXOR ACC1, ACC0, ACC0
TESTQ ptxLen, ptxLen
JE avxGcmSm4EncDone
SUBQ $4, aluCTR
avxGcmSm4EncNibbles:
CMPQ ptxLen, $64
JBE avxGcmSm4EncSingles
SUBQ $64, ptxLen
// load 4 ctrs for encryption
VMOVDQU (8*16 + 0*16)(SP), B0
VMOVDQU (8*16 + 1*16)(SP), B1
VMOVDQU (8*16 + 2*16)(SP), B2
VMOVDQU (8*16 + 3*16)(SP), B3
AVX_SM4_4BLOCKS_WO_BS(rk, B6, B7, T1, T2, B0, B1, B2, B3)
// XOR plaintext
VPXOR (16*0)(ptx), B0, B0
VPXOR (16*1)(ptx), B1, B1
VPXOR (16*2)(ptx), B2, B2
VPXOR (16*3)(ptx), B3, B3
// Store ciphertext
VMOVDQU B0, (16*0)(ctx)
VMOVDQU B1, (16*1)(ctx)
VMOVDQU B2, (16*2)(ctx)
VMOVDQU B3, (16*3)(ctx)
VMOVDQU (16*14)(pTbl), T2
increment(0)
avxGcmEncDataStep(B0)
increment(1)
avxGcmEncDataStep(B1)
increment(2)
avxGcmEncDataStep(B2)
increment(3)
avxGcmEncDataStep(B3)
LEAQ 64(ptx), ptx
LEAQ 64(ctx), ctx
avxGcmSm4EncSingles:
TESTQ ptxLen, ptxLen
JE avxGcmSm4EncDone
VMOVDQU (8*16 + 0*16)(SP), B0
VMOVDQU (8*16 + 1*16)(SP), B1
VMOVDQU (8*16 + 2*16)(SP), B2
VMOVDQU (8*16 + 3*16)(SP), B3
AVX_SM4_4BLOCKS_WO_BS(rk, B6, B7, T1, T2, B0, B1, B2, B3)
VMOVDQU B0, (16*0)(SP)
VMOVDQU B1, (16*1)(SP)
VMOVDQU B2, (16*2)(SP)
VMOVDQU B3, (16*3)(SP)
VMOVDQU (16*14)(pTbl), T2
MOVQ SP, BP
avxGcmSm4EncSinglesLoop:
CMPQ ptxLen, $16
JB avxGcmSm4EncTail
SUBQ $16, ptxLen
VMOVDQU (16*0)(BP), B0
VMOVDQU (ptx), T0
VPXOR T0, B0, B0
VMOVDQU B0, (ctx)
avxGcmEncDataStep(B0)
LEAQ (16*1)(ptx), ptx
LEAQ (16*1)(ctx), ctx
ADDQ $16, BP
JMP avxGcmSm4EncSinglesLoop
avxGcmSm4EncTail:
TESTQ ptxLen, ptxLen
JE avxGcmSm4EncDone
VMOVDQU (16*0)(BP), B0
VMOVDQU B0, T0
LEAQ -1(ptx)(ptxLen*1), ptx
MOVQ ptxLen, aluTMP
SHLQ $4, aluTMP
LEAQ andMask<>(SB), aluCTR
VMOVDQU -16(aluCTR)(aluTMP*1), T1
VPXOR B0, B0, B0
avxPtxLoadLoop:
VPSLLDQ $1, B0, B0
VPINSRB $0, (ptx), B0, B0
LEAQ -1(ptx), ptx
DECQ ptxLen
JNE avxPtxLoadLoop
VPXOR T0, B0, B0
VPAND T1, B0, B0
VMOVDQU B0, (ctx) // assumes there is always room for 16 bytes here, since the tag follows the ciphertext
avxGcmEncDataStep(B0)
avxGcmSm4EncDone:
VMOVDQU ACC0, (tPtr)
RET
avx2GcmSm4Enc:
VMOVDQU ·bswap_mask(SB), BSWAP
VMOVDQU gcmPoly<>(SB), POLY
VMOVDQU (tPtr), ACC0
VPXOR ACC1, ACC1, ACC1
VPXOR ACCM, ACCM, ACCM
VMOVDQU (ctrPtr), T0
VPSHUFB ·flip_mask(SB), T0, T0
VPEXTRD $3, T0, aluCTR
VINSERTI128 $1, T0, Y11, Y11
VMOVDQU Y11, (8*16 + 0*32)(SP)
increment(0)
increment(1)
VMOVDQU Y11, (8*16 + 1*32)(SP)
increment(2)
increment(3)
CMPQ ptxLen, $128
JB avx2GcmSm4EncNibbles
SUBQ $128, ptxLen
// We have at least 8 blocks to encrypt; prepare the rest of the counters
VMOVDQU Y11, (8*16 + 2*32)(SP)
increment(4)
increment(5)
VMOVDQU Y11, (8*16 + 3*32)(SP)
increment(6)
increment(7)
VBROADCASTI128 ·bswap_mask(SB), DWBSWAP
// load 8 ctrs for encryption
VMOVDQU (4*32 + 0*32)(SP), DWB0
VMOVDQU (4*32 + 1*32)(SP), DWB1
VMOVDQU (4*32 + 2*32)(SP), DWB2
VMOVDQU (4*32 + 3*32)(SP), DWB3
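// AVX2 path: each of DWB0..DWB3 holds two consecutive 16-byte counter blocks,
// so the four YMM registers carry all eight. TRANSPOSE_MATRIX switches between
// that block layout and the word-sliced layout the AVX2_SM4_8BLOCKS round
// macro works on (register i holding word i of each block).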
increment(0)
// Transpose the 4x4 matrix of 32-bit words
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)
VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
increment(1)
AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP0, DWB0, DWB1, DWB2, DWB3)
increment(2)
// Transpose the 4x4 matrix of 32-bit words
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)
VPSHUFB DWBSWAP, DWB0, DWB0
VPSHUFB DWBSWAP, DWB1, DWB1
increment(3)
VPSHUFB DWBSWAP, DWB2, DWB2
VPSHUFB DWBSWAP, DWB3, DWB3
increment(4)
// XOR plaintext
VMOVDQU (32*0)(ptx), XDWTMP0
VPXOR XDWTMP0, DWB0, DWB0
VMOVDQU (32*1)(ptx), XDWTMP0
VPXOR XDWTMP0, DWB1, DWB1
increment(5)
VMOVDQU (32*2)(ptx), XDWTMP0
VPXOR XDWTMP0, DWB2, DWB2
VMOVDQU (32*3)(ptx), XDWTMP0
VPXOR XDWTMP0, DWB3, DWB3
increment(6)
// Store ciphertext
VMOVDQU DWB0, (32*0)(ctx)
VPSHUFB DWBSWAP, DWB0, DWB0
VMOVDQU DWB1, (32*1)(ctx)
VPSHUFB DWBSWAP, DWB1, DWB1
VMOVDQU DWB2, (32*2)(ctx)
VPSHUFB DWBSWAP, DWB2, DWB2
VMOVDQU DWB3, (32*3)(ctx)
VPSHUFB DWBSWAP, DWB3, DWB3
increment(7)
//VPXOR XDWTMP0, XDWTMP0, XDWTMP0
//VINSERTI128 $0, ACC0, XDWTMP0, XDWTMP0
//VPXOR XDWTMP0, DWB0, DWB0
PXOR ACC0, B0 // legacy PXOR keeps the upper half of DWB0 intact; a 128-bit VPXOR would zero it
VMOVDQU DWB0, (32*0)(SP)
VMOVDQU DWB1, (32*1)(SP)
VMOVDQU DWB2, (32*2)(SP)
VMOVDQU DWB3, (32*3)(SP)
LEAQ 128(ptx), ptx
LEAQ 128(ctx), ctx
avx2GcmSm4EncOctetsLoop:
CMPQ ptxLen, $128
JB avx2GcmSm4EncOctetsEnd
SUBQ $128, ptxLen
// load 8 ctrs for encryption
VMOVDQU (4*32 + 0*32)(SP), DWB0
VMOVDQU (4*32 + 1*32)(SP), DWB1
VMOVDQU (4*32 + 2*32)(SP), DWB2
VMOVDQU (4*32 + 3*32)(SP), DWB3
VMOVDQU (16*0)(SP), T0
VPSHUFD $78, T0, T1
VPXOR T0, T1, T1
VMOVDQU (16*0)(pTbl), ACC1
VMOVDQU (16*1)(pTbl), ACCM
VPCLMULQDQ $0x00, T1, ACCM, ACCM
VPCLMULQDQ $0x00, T0, ACC1, ACC0
VPCLMULQDQ $0x11, T0, ACC1, ACC1
// Transpose the 4x4 matrix of 32-bit words
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)
AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP0, DWB0, DWB1, DWB2, DWB3)
// Transpose the 4x4 matrix of 32-bit words
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)
VPSHUFB DWBSWAP, DWB0, DWB0
VPSHUFB DWBSWAP, DWB1, DWB1
VPSHUFB DWBSWAP, DWB2, DWB2
VPSHUFB DWBSWAP, DWB3, DWB3
avxMulRound(1)
increment(0)
avxMulRound(2)
increment(1)
avxMulRound(3)
increment(2)
avxMulRound(4)
increment(3)
avxMulRound(5)
increment(4)
avxMulRound(6)
increment(5)
avxMulRound(7)
increment(6)
VPXOR ACC0, ACCM, ACCM
VPXOR ACC1, ACCM, ACCM
VPSLLDQ $8, ACCM, T0
VPSRLDQ $8, ACCM, ACCM
VPXOR ACCM, ACC1, ACC1
VPXOR T0, ACC0, ACC0
increment(7)
avxReduceRound(ACC0)
avxReduceRound(ACC0)
VPXOR ACC1, ACC0, ACC0
// XOR plaintext
VPXOR (32*0)(ptx), DWB0, DWB0
VPXOR (32*1)(ptx), DWB1, DWB1
VPXOR (32*2)(ptx), DWB2, DWB2
VPXOR (32*3)(ptx), DWB3, DWB3
// Store ciphertext
VMOVDQU DWB0, (32*0)(ctx)
VPSHUFB DWBSWAP, DWB0, DWB0
VMOVDQU DWB1, (32*1)(ctx)
VPSHUFB DWBSWAP, DWB1, DWB1
VMOVDQU DWB2, (32*2)(ctx)
VPSHUFB DWBSWAP, DWB2, DWB2
VMOVDQU DWB3, (32*3)(ctx)
VPSHUFB DWBSWAP, DWB3, DWB3
//VPXOR XDWTMP0, XDWTMP0, XDWTMP0
//VINSERTI128 $0, ACC0, XDWTMP0, XDWTMP0
//VPXOR XDWTMP0, DWB0, DWB0
PXOR ACC0, B0 // legacy PXOR keeps the upper half of DWB0 intact; a 128-bit VPXOR would zero it
VMOVDQU DWB0, (32*0)(SP)
VMOVDQU DWB1, (32*1)(SP)
VMOVDQU DWB2, (32*2)(SP)
VMOVDQU DWB3, (32*3)(SP)
LEAQ 128(ptx), ptx
LEAQ 128(ctx), ctx
JMP avx2GcmSm4EncOctetsLoop
avx2GcmSm4EncOctetsEnd:
VMOVDQU (16*0)(SP), T0
VMOVDQU (16*0)(pTbl), ACC0
VMOVDQU (16*1)(pTbl), ACCM
VMOVDQU ACC0, ACC1
VPSHUFD $78, T0, T1
VPXOR T0, T1, T1
VPCLMULQDQ $0x00, T0, ACC0, ACC0
VPCLMULQDQ $0x11, T0, ACC1, ACC1
VPCLMULQDQ $0x00, T1, ACCM, ACCM
avxMulRound(1)
avxMulRound(2)
avxMulRound(3)
avxMulRound(4)
avxMulRound(5)
avxMulRound(6)
avxMulRound(7)
VPXOR ACC0, ACCM, ACCM
VPXOR ACC1, ACCM, ACCM
VPSLLDQ $8, ACCM, T0
VPSRLDQ $8, ACCM, ACCM
VPXOR ACCM, ACC1, ACC1
VPXOR T0, ACC0, ACC0
avxReduceRound(ACC0)
avxReduceRound(ACC0)
VPXOR ACC1, ACC0, ACC0
TESTQ ptxLen, ptxLen
JE avx2GcmSm4EncDone
SUBQ $4, aluCTR
avx2GcmSm4EncNibbles:
CMPQ ptxLen, $64
JBE avx2GcmSm4EncSingles
SUBQ $64, ptxLen
VMOVDQU (8*16 + 0*16)(SP), B0
VMOVDQU (8*16 + 1*16)(SP), B1
VMOVDQU (8*16 + 2*16)(SP), B2
VMOVDQU (8*16 + 3*16)(SP), B3
AVX_SM4_4BLOCKS_WO_BS(rk, B4, B5, B6, B7, B0, B1, B2, B3)
VPXOR (16*0)(ptx), B0, B0
VPXOR (16*1)(ptx), B1, B1
VPXOR (16*2)(ptx), B2, B2
VPXOR (16*3)(ptx), B3, B3
VMOVDQU B0, (16*0)(ctx)
VMOVDQU B1, (16*1)(ctx)
VMOVDQU B2, (16*2)(ctx)
VMOVDQU B3, (16*3)(ctx)
VMOVDQU (16*14)(pTbl), T2
avxGcmEncDataStep(B0)
increment(0)
avxGcmEncDataStep(B1)
increment(1)
avxGcmEncDataStep(B2)
increment(2)
avxGcmEncDataStep(B3)
increment(3)
LEAQ 64(ptx), ptx
LEAQ 64(ctx), ctx
avx2GcmSm4EncSingles:
TESTQ ptxLen, ptxLen
JE avx2GcmSm4EncDone
VMOVDQU (8*16 + 0*16)(SP), B0
VMOVDQU (8*16 + 1*16)(SP), B1
VMOVDQU (8*16 + 2*16)(SP), B2
VMOVDQU (8*16 + 3*16)(SP), B3
AVX_SM4_4BLOCKS_WO_BS(rk, B4, B5, B6, B7, B0, B1, B2, B3)
VMOVDQU B0, (16*0)(SP)
VMOVDQU B1, (16*1)(SP)
VMOVDQU B2, (16*2)(SP)
VMOVDQU B3, (16*3)(SP)
VMOVDQU (16*14)(pTbl), T2
MOVQ SP, BP
avx2GcmSm4EncSinglesLoop:
CMPQ ptxLen, $16
JB avx2GcmSm4EncTail
SUBQ $16, ptxLen
VMOVDQU (16*0)(BP), B0
VMOVDQU (ptx), T0
VPXOR T0, B0, B0
VMOVDQU B0, (ctx)
avxGcmEncDataStep(B0)
LEAQ (16*1)(ptx), ptx
LEAQ (16*1)(ctx), ctx
ADDQ $16, BP
JMP avx2GcmSm4EncSinglesLoop
avx2GcmSm4EncTail:
TESTQ ptxLen, ptxLen
JE avx2GcmSm4EncDone
VMOVDQU (16*0)(BP), B0
VMOVDQU B0, T0
LEAQ -1(ptx)(ptxLen*1), ptx
MOVQ ptxLen, aluTMP
SHLQ $4, aluTMP
LEAQ andMask<>(SB), aluCTR
VMOVDQU -16(aluCTR)(aluTMP*1), T1
VPXOR B0, B0, B0
avx2PtxLoadLoop:
VPSLLDQ $1, B0, B0
VPINSRB $0, (ptx), B0, B0
LEAQ -1(ptx), ptx
DECQ ptxLen
JNE avx2PtxLoadLoop
VPXOR T0, B0, B0
VPAND T1, B0, B0
VMOVDQU B0, (ctx) // assumes there is always room for 16 bytes here, since the tag follows the ciphertext
avxGcmEncDataStep(B0)
avx2GcmSm4EncDone:
VMOVDQU ACC0, (tPtr)
VZEROUPPER
RET
#undef increment
// func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4Dec(SB),0,$128-96
#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, (3*4 + i*16)(SP)
#define decMulRound(i) \
MOVOU (16*i)(ctx), T0;\
PSHUFB BSWAP, T0;\
internalDecMulRound(i)
#define internalDecMulRound(i) \
MOVOU (16*(i*2))(pTbl), T1;\
MOVOU T1, T2;\
PCLMULQDQ $0x00, T0, T1;\
PXOR T1, ACC0;\
PSHUFD $78, T0, T1;\
PCLMULQDQ $0x11, T0, T2;\
PXOR T1, T0;\
PXOR T2, ACC1;\
MOVOU (16*(i*2+1))(pTbl), T2;\
PCLMULQDQ $0x00, T2, T0;\
PXOR T0, ACCM
#define decGhashRound(i) \
MOVOU (16*i)(ctx), B0; \
internalDecGhashRound()
#define internalDecGhashRound() \
PSHUFB BSWAP, B0; \
PXOR ACC0, B0; \
MOVOU T2, ACC0; \
MOVOU T2, ACC1; \
MOVOU (16*15)(pTbl), ACCM; \
PCLMULQDQ $0x00, B0, ACC0; \
PCLMULQDQ $0x11, B0, ACC1; \
PSHUFD $78, B0, T0; \
PXOR B0, T0; \
PCLMULQDQ $0x00, T0, ACCM; \
PXOR ACC0, ACCM; \
PXOR ACC1, ACCM; \
MOVOU ACCM, T0; \
PSRLDQ $8, ACCM; \
PSLLDQ $8, T0; \
PXOR ACCM, ACC1; \
PXOR T0, ACC0; \
reduceRound(ACC0); \
reduceRound(ACC0); \
PXOR ACC1, ACC0
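// For decryption GHASH runs over the ciphertext, so decMulRound/decGhashRound
// read their hash input straight from ctx before the encrypted counter blocks
// are XORed in to recover the plaintext; the internal* variants take a block
// already loaded into T0/B0 so the 4-block paths avoid a second load.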
#define avxDecMulRound(i) \
VMOVDQU (16*i)(ctx), T0;\
VPSHUFB BSWAP, T0, T0;\
VMOVDQU (16*(i*2))(pTbl), T2;\
VPCLMULQDQ $0x00, T0, T2, T1;\
VPXOR T1, ACC0, ACC0;\
VPSHUFD $78, T0, T1;\
VPCLMULQDQ $0x11, T0, T2, T2;\
VPXOR T1, T0, T0;\
VPXOR T2, ACC1, ACC1;\
VMOVDQU (16*(i*2+1))(pTbl), T2;\
VPCLMULQDQ $0x00, T2, T0, T0;\
VPXOR T0, ACCM, ACCM
#define internalAvxDecGhashRound() \
VPSHUFB BSWAP, B0, B0; \
VPXOR ACC0, B0, B0; \
VMOVDQU (16*15)(pTbl), ACCM; \
VPCLMULQDQ $0x00, B0, T2, ACC0; \
VPCLMULQDQ $0x11, B0, T2, ACC1; \
VPSHUFD $78, B0, T0; \
VPXOR B0, T0, T0; \
VPCLMULQDQ $0x00, T0, ACCM, ACCM; \
VPXOR ACC0, ACCM, ACCM; \
VPXOR ACC1, ACCM, ACCM; \
VPSLLDQ $8, ACCM, T0; \
VPSRLDQ $8, ACCM, ACCM; \
VPXOR ACCM, ACC1, ACC1; \
VPXOR T0, ACC0, ACC0; \
avxReduceRound(ACC0); \
avxReduceRound(ACC0); \
VPXOR ACC1, ACC0, ACC0
MOVQ productTable+0(FP), pTbl
MOVQ dst+8(FP), ptx
MOVQ src_base+32(FP), ctx
MOVQ src_len+40(FP), ptxLen
MOVQ ctr+56(FP), ctrPtr
MOVQ T+64(FP), tPtr
MOVQ rk_base+72(FP), rk
CMPB ·useAVX2(SB), $1
JE avx2GcmSm4Dec
CMPB ·useAVX(SB), $1
JE avxGcmSm4Dec
MOVOU ·bswap_mask(SB), BSWAP
MOVOU gcmPoly<>(SB), POLY
MOVOU (tPtr), ACC0
PXOR ACC1, ACC1
PXOR ACCM, ACCM
MOVOU (ctrPtr), T0
PSHUFB ·flip_mask(SB), T0
PEXTRD $3, T0, aluCTR
MOVOU T0, (0*16)(SP)
increment(0)
MOVOU T0, (1*16)(SP)
increment(1)
MOVOU T0, (2*16)(SP)
increment(2)
MOVOU T0, (3*16)(SP)
increment(3)
CMPQ ptxLen, $128
JB gcmSm4DecNibbles
// We have at least 8 blocks to decrypt; prepare the rest of the counters
MOVOU T0, (4*16)(SP)
increment(4)
MOVOU T0, (5*16)(SP)
increment(5)
MOVOU T0, (6*16)(SP)
increment(6)
MOVOU T0, (7*16)(SP)
increment(7)
gcmSm4DecOctetsLoop:
CMPQ ptxLen, $128
JB gcmSm4DecEndOctets
SUBQ $128, ptxLen
MOVOU (0*16)(SP), B0
MOVOU (1*16)(SP), B1
MOVOU (2*16)(SP), B2
MOVOU (3*16)(SP), B3
MOVOU (4*16)(SP), B4
MOVOU (5*16)(SP), B5
MOVOU (6*16)(SP), B6
MOVOU (7*16)(SP), B7
MOVOU (16*0)(ctx), T0
PSHUFB BSWAP, T0
PXOR ACC0, T0
PSHUFD $78, T0, T1
PXOR T0, T1
MOVOU (16*0)(pTbl), ACC0
MOVOU (16*1)(pTbl), ACCM
MOVOU ACC0, ACC1
PCLMULQDQ $0x00, T1, ACCM
PCLMULQDQ $0x00, T0, ACC0
PCLMULQDQ $0x11, T0, ACC1
decMulRound(1)
increment(0)
decMulRound(2)
increment(1)
decMulRound(3)
increment(2)
decMulRound(4)
increment(3)
decMulRound(5)
increment(4)
decMulRound(6)
increment(5)
decMulRound(7)
increment(6)
increment(7)
PXOR ACC0, ACCM
PXOR ACC1, ACCM
MOVOU ACCM, T0
PSRLDQ $8, ACCM
PSLLDQ $8, T0
PXOR ACCM, ACC1
PXOR T0, ACC0
reduceRound(ACC0)
reduceRound(ACC0)
PXOR ACC1, ACC0
SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
MOVOU (16*0)(ctx), T0
PXOR T0, B0
MOVOU (16*1)(ctx), T0
PXOR T0, B1
MOVOU (16*2)(ctx), T0
PXOR T0, B2
MOVOU (16*3)(ctx), T0
PXOR T0, B3
MOVOU (16*4)(ctx), T0
PXOR T0, B4
MOVOU (16*5)(ctx), T0
PXOR T0, B5
MOVOU (16*6)(ctx), T0
PXOR T0, B6
MOVOU (16*7)(ctx), T0
PXOR T0, B7
MOVOU B0, (16*0)(ptx)
MOVOU B1, (16*1)(ptx)
MOVOU B2, (16*2)(ptx)
MOVOU B3, (16*3)(ptx)
MOVOU B4, (16*4)(ptx)
MOVOU B5, (16*5)(ptx)
MOVOU B6, (16*6)(ptx)
MOVOU B7, (16*7)(ptx)
LEAQ 128(ptx), ptx
LEAQ 128(ctx), ctx
JMP gcmSm4DecOctetsLoop
gcmSm4DecEndOctets:
SUBQ $4, aluCTR
gcmSm4DecNibbles:
CMPQ ptxLen, $64
JBE gcmSm4DecSingles
SUBQ $64, ptxLen
MOVOU (0*16)(SP), B4
MOVOU (1*16)(SP), B5
MOVOU (2*16)(SP), B6
MOVOU (3*16)(SP), B7
SM4_4BLOCKS_WO_BS(rk, B0, T0, T1, T2, B4, B5, B6, B7)
MOVOU (16*14)(pTbl), T2
MOVOU (16*0)(ctx), B0
PXOR B0, B4
internalDecGhashRound()
increment(0)
MOVOU (16*1)(ctx), B0
PXOR B0, B5
internalDecGhashRound()
increment(1)
MOVOU (16*2)(ctx), B0
PXOR B0, B6
internalDecGhashRound()
increment(2)
MOVOU (16*3)(ctx), B0
PXOR B0, B7
internalDecGhashRound()
increment(3)
MOVOU B4, (16*0)(ptx)
MOVOU B5, (16*1)(ptx)
MOVOU B6, (16*2)(ptx)
MOVOU B7, (16*3)(ptx)
LEAQ 64(ptx), ptx
LEAQ 64(ctx), ctx
gcmSm4DecSingles:
TESTQ ptxLen, ptxLen
JE gcmSm4DecDone
MOVOU (0*16)(SP), B0
MOVOU (1*16)(SP), B1
MOVOU (2*16)(SP), B2
MOVOU (3*16)(SP), B3
SM4_4BLOCKS_WO_BS(rk, B4, T0, T1, T2, B0, B1, B2, B3)
MOVOU B0, (16*4)(SP)
MOVOU B1, (16*5)(SP)
MOVOU B2, (16*6)(SP)
MOVOU B3, (16*7)(SP)
MOVOU (16*14)(pTbl), T2
MOVQ SP, BP
ADDQ $64, BP
gcmSm4DecSinglesLoop:
CMPQ ptxLen, $16
JB gcmSm4DecTail
SUBQ $16, ptxLen
MOVOU (16*0)(BP), B1
MOVOU (ctx), T0
PXOR T0, B1
decGhashRound(0)
MOVOU B1, (ptx)
LEAQ (16*1)(ptx), ptx
LEAQ (16*1)(ctx), ctx
ADDQ $16, BP
JMP gcmSm4DecSinglesLoop
gcmSm4DecTail:
TESTQ ptxLen, ptxLen
JE gcmSm4DecDone
MOVQ ptxLen, aluTMP
SHLQ $4, aluTMP
LEAQ andMask<>(SB), aluCTR
MOVOU -16(aluCTR)(aluTMP*1), T1
MOVOU (ctx), B0 // assumes the tag follows the ciphertext, so this 16-byte read cannot overflow
PAND T1, B0
MOVOU B0, T1
internalDecGhashRound()
MOVOU (16*0)(BP), B0
PXOR T1, B0
ptxStoreLoop:
PEXTRB $0, B0, (ptx)
PSRLDQ $1, B0
LEAQ 1(ptx), ptx
DECQ ptxLen
JNE ptxStoreLoop
gcmSm4DecDone:
MOVOU ACC0, (tPtr)
RET
avxGcmSm4Dec:
VMOVDQU ·bswap_mask(SB), BSWAP
VMOVDQU gcmPoly<>(SB), POLY
VMOVDQU (tPtr), ACC0
VPXOR ACC1, ACC1, ACC1
VPXOR ACCM, ACCM, ACCM
VMOVDQU (ctrPtr), T0
VPSHUFB ·flip_mask(SB), T0, T0
VPEXTRD $3, T0, aluCTR
VMOVDQU T0, (0*16)(SP)
increment(0)
VMOVDQU T0, (1*16)(SP)
increment(1)
VMOVDQU T0, (2*16)(SP)
increment(2)
VMOVDQU T0, (3*16)(SP)
increment(3)
CMPQ ptxLen, $128
JB avxGcmSm4DecNibbles
// We have at least 8 blocks to decrypt; prepare the rest of the counters
VMOVDQU T0, (4*16)(SP)
increment(4)
VMOVDQU T0, (5*16)(SP)
increment(5)
VMOVDQU T0, (6*16)(SP)
increment(6)
VMOVDQU T0, (7*16)(SP)
increment(7)
avxGcmSm4DecOctetsLoop:
CMPQ ptxLen, $128
JB avxGcmSm4DecEndOctets
SUBQ $128, ptxLen
VMOVDQU (0*16)(SP), B0
VMOVDQU (1*16)(SP), B1
VMOVDQU (2*16)(SP), B2
VMOVDQU (3*16)(SP), B3
VMOVDQU (4*16)(SP), B4
VMOVDQU (5*16)(SP), B5
VMOVDQU (6*16)(SP), B6
VMOVDQU (7*16)(SP), B7
VMOVDQU (16*0)(ctx), T0
VPSHUFB BSWAP, T0, T0
VPXOR ACC0, T0, T0
VPSHUFD $78, T0, T1
VPXOR T0, T1, T1
VMOVDQU (16*0)(pTbl), ACC1
VMOVDQU (16*1)(pTbl), ACCM
VPCLMULQDQ $0x00, T1, ACCM, ACCM
VPCLMULQDQ $0x00, T0, ACC1, ACC0
VPCLMULQDQ $0x11, T0, ACC1, ACC1
avxDecMulRound(1)
increment(0)
avxDecMulRound(2)
increment(1)
avxDecMulRound(3)
increment(2)
avxDecMulRound(4)
increment(3)
avxDecMulRound(5)
increment(4)
avxDecMulRound(6)
increment(5)
avxDecMulRound(7)
increment(6)
VPXOR ACC0, ACCM, ACCM
VPXOR ACC1, ACCM, ACCM
VPSLLDQ $8, ACCM, T0
VPSRLDQ $8, ACCM, ACCM
VPXOR ACCM, ACC1, ACC1
VPXOR T0, ACC0, ACC0
increment(7)
avxReduceRound(ACC0)
avxReduceRound(ACC0)
VPXOR ACC1, ACC0, ACC0
AVX_SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
VPXOR (16*0)(ctx), B0, B0
VPXOR (16*1)(ctx), B1, B1
VPXOR (16*2)(ctx), B2, B2
VPXOR (16*3)(ctx), B3, B3
VPXOR (16*4)(ctx), B4, B4
VPXOR (16*5)(ctx), B5, B5
VPXOR (16*6)(ctx), B6, B6
VPXOR (16*7)(ctx), B7, B7
VMOVDQU B0, (16*0)(ptx)
VMOVDQU B1, (16*1)(ptx)
VMOVDQU B2, (16*2)(ptx)
VMOVDQU B3, (16*3)(ptx)
VMOVDQU B4, (16*4)(ptx)
VMOVDQU B5, (16*5)(ptx)
VMOVDQU B6, (16*6)(ptx)
VMOVDQU B7, (16*7)(ptx)
LEAQ 128(ptx), ptx
LEAQ 128(ctx), ctx
JMP avxGcmSm4DecOctetsLoop
avxGcmSm4DecEndOctets:
SUBQ $4, aluCTR
avxGcmSm4DecNibbles:
CMPQ ptxLen, $64
JBE avxGcmSm4DecSingles
SUBQ $64, ptxLen
VMOVDQU (0*16)(SP), B4
VMOVDQU (1*16)(SP), B5
VMOVDQU (2*16)(SP), B6
VMOVDQU (3*16)(SP), B7
AVX_SM4_4BLOCKS_WO_BS(rk, B0, B1, T1, T2, B4, B5, B6, B7)
VMOVDQU (16*14)(pTbl), T2
VMOVDQU (16*0)(ctx), B0
VPXOR B0, B4, B4
internalAvxDecGhashRound()
increment(0)
VMOVDQU (16*1)(ctx), B0
VPXOR B0, B5, B5
internalAvxDecGhashRound()
increment(1)
VMOVDQU (16*2)(ctx), B0
VPXOR B0, B6, B6
internalAvxDecGhashRound()
increment(2)
VMOVDQU (16*3)(ctx), B0
VPXOR B0, B7, B7
internalAvxDecGhashRound()
increment(3)
VMOVDQU B4, (16*0)(ptx)
VMOVDQU B5, (16*1)(ptx)
VMOVDQU B6, (16*2)(ptx)
VMOVDQU B7, (16*3)(ptx)
LEAQ 64(ptx), ptx
LEAQ 64(ctx), ctx
avxGcmSm4DecSingles:
TESTQ ptxLen, ptxLen
JE avxGcmSm4DecDone
VMOVDQU (0*16)(SP), B0
VMOVDQU (1*16)(SP), B1
VMOVDQU (2*16)(SP), B2
VMOVDQU (3*16)(SP), B3
AVX_SM4_4BLOCKS_WO_BS(rk, B7, B6, B5, B4, B0, B1, B2, B3)
VMOVDQU B0, (16*4)(SP)
VMOVDQU B1, (16*5)(SP)
VMOVDQU B2, (16*6)(SP)
VMOVDQU B3, (16*7)(SP)
VMOVDQU (16*14)(pTbl), T2
MOVQ SP, BP
ADDQ $64, BP
avxGcmSm4DecSinglesLoop:
CMPQ ptxLen, $16
JB avxGcmSm4DecTail
SUBQ $16, ptxLen
VMOVDQU (16*0)(BP), T0
VMOVDQU (ctx), B0
VPXOR T0, B0, T0
VMOVDQU T0, (ptx)
internalAvxDecGhashRound()
LEAQ (16*1)(ptx), ptx
LEAQ (16*1)(ctx), ctx
ADDQ $16, BP
JMP avxGcmSm4DecSinglesLoop
avxGcmSm4DecTail:
TESTQ ptxLen, ptxLen
JE avxGcmSm4DecDone
MOVQ ptxLen, aluTMP
SHLQ $4, aluTMP
LEAQ andMask<>(SB), aluCTR
VMOVDQU -16(aluCTR)(aluTMP*1), T1
VMOVDQU (ctx), B0 // assumes the tag follows the ciphertext, so this 16-byte read cannot overflow
VPAND T1, B0, B0
VMOVDQU B0, T1
internalAvxDecGhashRound()
VMOVDQU (16*0)(BP), B0
VPXOR T1, B0, B0
avxPtxStoreLoop:
VPEXTRB $0, B0, (ptx)
VPSRLDQ $1, B0, B0
LEAQ 1(ptx), ptx
DECQ ptxLen
JNE avxPtxStoreLoop
avxGcmSm4DecDone:
VMOVDQU ACC0, (tPtr)
RET
avx2GcmSm4Dec:
VMOVDQU ·bswap_mask(SB), BSWAP
VMOVDQU gcmPoly<>(SB), POLY
VMOVDQU (tPtr), ACC0
VPXOR ACC1, ACC1, ACC1
VPXOR ACCM, ACCM, ACCM
VMOVDQU (ctrPtr), T0
VPSHUFB ·flip_mask(SB), T0, T0
VPEXTRD $3, T0, aluCTR
VINSERTI128 $1, T0, Y11, Y11
VMOVDQU Y11, (0*32)(SP)
increment(0)
increment(1)
VMOVDQU Y11, (1*32)(SP)
increment(2)
increment(3)
CMPQ ptxLen, $128
JB avx2GcmSm4DecNibbles
// We have at least 8 blocks to decrypt; prepare the rest of the counters
VMOVDQU Y11, (2*32)(SP)
increment(4)
increment(5)
VMOVDQU Y11, (3*32)(SP)
increment(6)
increment(7)
VBROADCASTI128 ·bswap_mask(SB), DWBSWAP
VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
avx2GcmSm4DecOctetsLoop:
CMPQ ptxLen, $128
JB avx2GcmSm4DecEndOctets
SUBQ $128, ptxLen
// load 8 ctrs for encryption
VMOVDQU (0*32)(SP), DWB0
VMOVDQU (1*32)(SP), DWB1
VMOVDQU (2*32)(SP), DWB2
VMOVDQU (3*32)(SP), DWB3
VMOVDQU (16*0)(ctx), T0
VPSHUFB BSWAP, T0, T0
VPXOR ACC0, T0, T0
VPSHUFD $78, T0, T1
VPXOR T0, T1, T1
VMOVDQU (16*0)(pTbl), ACC1
VMOVDQU (16*1)(pTbl), ACCM
VPCLMULQDQ $0x00, T1, ACCM, ACCM
VPCLMULQDQ $0x00, T0, ACC1, ACC0
VPCLMULQDQ $0x11, T0, ACC1, ACC1
avxDecMulRound(1)
increment(0)
avxDecMulRound(2)
increment(1)
avxDecMulRound(3)
increment(2)
avxDecMulRound(4)
increment(3)
avxDecMulRound(5)
increment(4)
avxDecMulRound(6)
increment(5)
avxDecMulRound(7)
increment(6)
VPXOR ACC0, ACCM, ACCM
VPXOR ACC1, ACCM, ACCM
VPSLLDQ $8, ACCM, T0
VPSRLDQ $8, ACCM, ACCM
VPXOR ACCM, ACC1, ACC1
VPXOR T0, ACC0, ACC0
increment(7)
avxReduceRound(ACC0)
avxReduceRound(ACC0)
VPXOR ACC1, ACC0, ACC0
// Transpose the 4x4 matrix of 32-bit words
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)
AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP0, DWB0, DWB1, DWB2, DWB3)
// Transpose the 4x4 matrix of 32-bit words
TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)
VPSHUFB DWBSWAP, DWB0, DWB0
VPSHUFB DWBSWAP, DWB1, DWB1
VPSHUFB DWBSWAP, DWB2, DWB2
VPSHUFB DWBSWAP, DWB3, DWB3
VPXOR (32*0)(ctx), DWB0, DWB0
VPXOR (32*1)(ctx), DWB1, DWB1
VPXOR (32*2)(ctx), DWB2, DWB2
VPXOR (32*3)(ctx), DWB3, DWB3
VMOVDQU DWB0, (32*0)(ptx)
VMOVDQU DWB1, (32*1)(ptx)
VMOVDQU DWB2, (32*2)(ptx)
VMOVDQU DWB3, (32*3)(ptx)
LEAQ 128(ptx), ptx
LEAQ 128(ctx), ctx
JMP avx2GcmSm4DecOctetsLoop
avx2GcmSm4DecEndOctets:
SUBQ $4, aluCTR
avx2GcmSm4DecNibbles:
CMPQ ptxLen, $64
JBE avx2GcmSm4DecSingles
SUBQ $64, ptxLen
VMOVDQU (0*16)(SP), B4
VMOVDQU (1*16)(SP), B1
VMOVDQU (2*16)(SP), B2
VMOVDQU (3*16)(SP), B3
AVX_SM4_4BLOCKS_WO_BS(rk, B0, B5, B6, B7, B4, B1, B2, B3)
VMOVDQU (16*14)(pTbl), T2
VMOVDQU (16*0)(ctx), B0
VPXOR B0, B4, B4
increment(0)
internalAvxDecGhashRound()
VMOVDQU (16*1)(ctx), B0
VPXOR B0, B1, B1
increment(1)
internalAvxDecGhashRound()
VMOVDQU (16*2)(ctx), B0
VPXOR B0, B2, B2
increment(2)
internalAvxDecGhashRound()
VMOVDQU (16*3)(ctx), B0
VPXOR B0, B3, B3
increment(3)
internalAvxDecGhashRound()
VMOVDQU B4, (16*0)(ptx)
VMOVDQU B1, (16*1)(ptx)
VMOVDQU B2, (16*2)(ptx)
VMOVDQU B3, (16*3)(ptx)
LEAQ 64(ptx), ptx
LEAQ 64(ctx), ctx
avx2GcmSm4DecSingles:
TESTQ ptxLen, ptxLen
JE avx2GcmSm4DecDone
VMOVDQU (0*16)(SP), B0
VMOVDQU (1*16)(SP), B1
VMOVDQU (2*16)(SP), B2
VMOVDQU (3*16)(SP), B3
AVX_SM4_4BLOCKS_WO_BS(rk, B4, B5, B6, B7, B0, B1, B2, B3)
VMOVDQU B0, (16*4)(SP)
VMOVDQU B1, (16*5)(SP)
VMOVDQU B2, (16*6)(SP)
VMOVDQU B3, (16*7)(SP)
VMOVDQU (16*14)(pTbl), T2
MOVQ SP, BP
ADDQ $64, BP
avx2GcmSm4DecSinglesLoop:
CMPQ ptxLen, $16
JB avx2GcmSm4DecTail
SUBQ $16, ptxLen
VMOVDQU (16*0)(BP), T0
VMOVDQU (ctx), B0
VPXOR T0, B0, T0
VMOVDQU T0, (ptx)
internalAvxDecGhashRound()
LEAQ (16*1)(ptx), ptx
LEAQ (16*1)(ctx), ctx
ADDQ $16, BP
JMP avx2GcmSm4DecSinglesLoop
avx2GcmSm4DecTail:
TESTQ ptxLen, ptxLen
JE avx2GcmSm4DecDone
MOVQ ptxLen, aluTMP
SHLQ $4, aluTMP
LEAQ andMask<>(SB), aluCTR
VMOVDQU -16(aluCTR)(aluTMP*1), T1 // fetch the and-mask corresponding to ptxLen
VMOVDQU (ctx), B0 // assumes the tag follows the ciphertext, so this 16-byte read cannot overflow
VPAND T1, B0, B0 // keep only ptxLen bytes; the rest become zero
VMOVDQU B0, T1
internalAvxDecGhashRound()
VMOVDQU (16*0)(BP), B0
VPXOR T1, B0, B0
avx2PtxStoreLoop:
VPEXTRB $0, B0, (ptx)
VPSRLDQ $1, B0, B0
LEAQ 1(ptx), ptx
DECQ ptxLen
JNE avx2PtxStoreLoop
avx2GcmSm4DecDone:
VMOVDQU ACC0, (tPtr)
VZEROUPPER
RET
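// The SM4-NI entry points below are deliberately empty on amd64: there are no
// SM4 hardware instructions here, and the Go wrappers are expected to select
// these routines only when the corresponding CPU feature is present
// (assumption based on the build layout), so the stubs exist just to satisfy
// the linker.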
// func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0
RET
// func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niDec(SB),NOSPLIT,$0
RET