zuc: amd64 update comments

This commit is contained in:
Sun Yimin 2023-09-06 08:16:38 +08:00 committed by GitHub
parent 5301412e48
commit dffaf4fcad
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 69 additions and 53 deletions

View File

@ -1,4 +1,6 @@
// Referenced https://github.com/intel/intel-ipsec-mb/ // Referenced Intel(R) Multi-Buffer Crypto for IPsec
// https://github.com/intel/intel-ipsec-mb/
// https://gist.github.com/emmansun/15d2fce6659ab97ffaf7ab66e278caee
//go:build amd64 && !purego //go:build amd64 && !purego
// +build amd64,!purego // +build amd64,!purego
@ -56,10 +58,6 @@ DATA Cancel_aes<>+0x00(SB)/8, $0x6363636363636363
DATA Cancel_aes<>+0x08(SB)/8, $0x6363636363636363 DATA Cancel_aes<>+0x08(SB)/8, $0x6363636363636363
GLOBL Cancel_aes<>(SB), RODATA, $16 GLOBL Cancel_aes<>(SB), RODATA, $16
DATA Const_comb_matrix<>+0x00(SB)/8, $0x5555555555555555
DATA Const_comb_matrix<>+0x08(SB)/8, $0x5555555555555555
GLOBL Const_comb_matrix<>(SB), RODATA, $16
DATA CombMatrix<>+0x00(SB)/8, $0x3C1A99B2AD1ED43A DATA CombMatrix<>+0x00(SB)/8, $0x3C1A99B2AD1ED43A
DATA CombMatrix<>+0x08(SB)/8, $0x3C1A99B2AD1ED43A DATA CombMatrix<>+0x08(SB)/8, $0x3C1A99B2AD1ED43A
GLOBL CombMatrix<>(SB), RODATA, $16 GLOBL CombMatrix<>(SB), RODATA, $16
@ -284,7 +282,7 @@ GLOBL flip_mask<>(SB), RODATA, $16
\ // LFSR_S16 = (LFSR_S15++) = AX \ // LFSR_S16 = (LFSR_S15++) = AX
MOVL AX, (((0 + idx) % 16)*4)(SI) MOVL AX, (((0 + idx) % 16)*4)(SI)
#define NONLIN_FUN() \ #define NONLIN_FUN \
MOVL BRC_X0, AX \ MOVL BRC_X0, AX \
XORL F_R1, AX \ // F_R1 xor BRC_X0 XORL F_R1, AX \ // F_R1 xor BRC_X0
ADDL F_R2, AX \ // W = (F_R1 xor BRC_X0) + F_R2 ADDL F_R2, AX \ // W = (F_R1 xor BRC_X0) + F_R2
@ -329,8 +327,8 @@ GLOBL flip_mask<>(SB), RODATA, $16
// return // return
// W in AX // W in AX
// updated F_R1, F_R2 // updated F_R1, F_R2
#define NONLIN_FUN_SSE() \ #define NONLIN_FUN_SSE \
NONLIN_FUN() \ NONLIN_FUN \
MOVQ DX, X0 \ MOVQ DX, X0 \
MOVOU X0, X1 \ MOVOU X0, X1 \
S0_comput_SSE(X1, X2, X3) \ S0_comput_SSE(X1, X2, X3) \
@ -344,7 +342,7 @@ GLOBL flip_mask<>(SB), RODATA, $16
PEXTRD $1, X0, F_R2 PEXTRD $1, X0, F_R2
// RESTORE_LFSR_0 appends the first 4 bytes to the end. // RESTORE_LFSR_0 appends the first 4 bytes to the end.
#define RESTORE_LFSR_0() \ #define RESTORE_LFSR_0 \
MOVL (0*4)(SI), AX \ // first 4-bytes MOVL (0*4)(SI), AX \ // first 4-bytes
MOVUPS (4)(SI), X0 \ MOVUPS (4)(SI), X0 \
MOVUPS (20)(SI), X1 \ MOVUPS (20)(SI), X1 \
@ -360,7 +358,7 @@ GLOBL flip_mask<>(SB), RODATA, $16
MOVL AX, (60)(SI) MOVL AX, (60)(SI)
// RESTORE_LFSR_2 appends the first 8 bytes to the end. // RESTORE_LFSR_2 appends the first 8 bytes to the end.
#define RESTORE_LFSR_2() \ #define RESTORE_LFSR_2 \
MOVQ (0)(SI), AX \ // first 8-bytes MOVQ (0)(SI), AX \ // first 8-bytes
MOVUPS (8)(SI), X0 \ MOVUPS (8)(SI), X0 \
MOVUPS (24)(SI), X1 \ MOVUPS (24)(SI), X1 \
@ -374,7 +372,7 @@ GLOBL flip_mask<>(SB), RODATA, $16
MOVQ AX, (56)(SI) MOVQ AX, (56)(SI)
// RESTORE_LFSR_4 appends the first 16 bytes to the end. // RESTORE_LFSR_4 appends the first 16 bytes to the end.
#define RESTORE_LFSR_4() \ #define RESTORE_LFSR_4 \
MOVUPS (0)(SI), X0 \ // first 16 bytes MOVUPS (0)(SI), X0 \ // first 16 bytes
MOVUPS (16)(SI), X1 \ MOVUPS (16)(SI), X1 \
MOVUPS (32)(SI), X2 \ MOVUPS (32)(SI), X2 \
@ -386,7 +384,7 @@ GLOBL flip_mask<>(SB), RODATA, $16
MOVUPS X0, (48)(SI) MOVUPS X0, (48)(SI)
// RESTORE_LFSR_8 appends the first 32 bytes to the end. // RESTORE_LFSR_8 appends the first 32 bytes to the end.
#define RESTORE_LFSR_8() \ #define RESTORE_LFSR_8 \
MOVUPS (0)(SI), X0 \ MOVUPS (0)(SI), X0 \
MOVUPS (16)(SI), X1 \ MOVUPS (16)(SI), X1 \
MOVUPS (32)(SI), X2 \ MOVUPS (32)(SI), X2 \
@ -404,8 +402,8 @@ GLOBL flip_mask<>(SB), RODATA, $16
// return // return
// W in AX // W in AX
// updated F_R1, F_R2 // updated F_R1, F_R2
#define NONLIN_FUN_AVX() \ #define NONLIN_FUN_AVX \
NONLIN_FUN() \ NONLIN_FUN \
VMOVQ DX, X0 \ VMOVQ DX, X0 \
VMOVDQA X0, X1 \ VMOVDQA X0, X1 \
S0_comput_AVX(X1, X2, X3) \ S0_comput_AVX(X1, X2, X3) \
@ -418,7 +416,7 @@ GLOBL flip_mask<>(SB), RODATA, $16
MOVL X0, R10 \ // F_R1 MOVL X0, R10 \ // F_R1
VPEXTRD $1, X0, R11 VPEXTRD $1, X0, R11
#define LOAD_STATE() \ #define LOAD_STATE \
MOVL OFFSET_FR1(SI), F_R1 \ MOVL OFFSET_FR1(SI), F_R1 \
MOVL OFFSET_FR2(SI), F_R2 \ MOVL OFFSET_FR2(SI), F_R2 \
MOVL OFFSET_BRC_X0(SI), BRC_X0 \ MOVL OFFSET_BRC_X0(SI), BRC_X0 \
@ -426,7 +424,7 @@ GLOBL flip_mask<>(SB), RODATA, $16
MOVL OFFSET_BRC_X2(SI), BRC_X2 \ MOVL OFFSET_BRC_X2(SI), BRC_X2 \
MOVL OFFSET_BRC_X3(SI), BRC_X3 MOVL OFFSET_BRC_X3(SI), BRC_X3
#define SAVE_STATE() \ #define SAVE_STATE \
MOVL F_R1, OFFSET_FR1(SI) \ MOVL F_R1, OFFSET_FR1(SI) \
MOVL F_R2, OFFSET_FR2(SI) \ MOVL F_R2, OFFSET_FR2(SI) \
MOVL BRC_X0, OFFSET_BRC_X0(SI) \ MOVL BRC_X0, OFFSET_BRC_X0(SI) \
@ -438,14 +436,14 @@ GLOBL flip_mask<>(SB), RODATA, $16
TEXT ·genKeywordAsm(SB),NOSPLIT,$0 TEXT ·genKeywordAsm(SB),NOSPLIT,$0
MOVQ pState+0(FP), SI MOVQ pState+0(FP), SI
LOAD_STATE() LOAD_STATE
BITS_REORG(0) BITS_REORG(0)
CMPB ·useAVX(SB), $1 CMPB ·useAVX(SB), $1
JE avx JE avx
sse: sse:
NONLIN_FUN_SSE() NONLIN_FUN_SSE
// (BRC_X3 xor W) as result // (BRC_X3 xor W) as result
XORL BRC_X3, AX XORL BRC_X3, AX
@ -455,13 +453,13 @@ sse:
XORQ AX, AX XORQ AX, AX
LFSR_UPDT(0) LFSR_UPDT(0)
SAVE_STATE() SAVE_STATE
RESTORE_LFSR_0() RESTORE_LFSR_0
RET RET
avx: avx:
NONLIN_FUN_AVX() NONLIN_FUN_AVX
// (BRC_X3 xor W) as result // (BRC_X3 xor W) as result
XORL BRC_X3, AX XORL BRC_X3, AX
@ -471,14 +469,14 @@ avx:
XORQ AX, AX XORQ AX, AX
LFSR_UPDT(0) LFSR_UPDT(0)
SAVE_STATE() SAVE_STATE
RESTORE_LFSR_0() RESTORE_LFSR_0
RET RET
#define ROUND_SSE(idx) \ #define ROUND_SSE(idx) \
BITS_REORG(idx) \ BITS_REORG(idx) \
NONLIN_FUN_SSE() \ NONLIN_FUN_SSE \
XORL R15, AX \ XORL R15, AX \
MOVL AX, (idx*4)(DI) \ MOVL AX, (idx*4)(DI) \
XORQ AX, AX \ XORQ AX, AX \
@ -486,7 +484,7 @@ avx:
#define ROUND_AVX(idx) \ #define ROUND_AVX(idx) \
BITS_REORG(idx) \ BITS_REORG(idx) \
NONLIN_FUN_AVX() \ NONLIN_FUN_AVX \
XORL R15, AX \ XORL R15, AX \
MOVL AX, (idx*4)(DI) \ MOVL AX, (idx*4)(DI) \
XORQ AX, AX \ XORQ AX, AX \
@ -494,7 +492,7 @@ avx:
#define ROUND_REV32_SSE(idx) \ #define ROUND_REV32_SSE(idx) \
BITS_REORG(idx) \ BITS_REORG(idx) \
NONLIN_FUN_SSE() \ NONLIN_FUN_SSE \
XORL R15, AX \ XORL R15, AX \
BSWAPL AX \ BSWAPL AX \
MOVL AX, (idx*4)(DI) \ MOVL AX, (idx*4)(DI) \
@ -503,7 +501,7 @@ avx:
#define ROUND_REV32_AVX(idx) \ #define ROUND_REV32_AVX(idx) \
BITS_REORG(idx) \ BITS_REORG(idx) \
NONLIN_FUN_AVX() \ NONLIN_FUN_AVX \
XORL R15, AX \ XORL R15, AX \
BSWAPL AX \ BSWAPL AX \
MOVL AX, (idx*4)(DI) \ MOVL AX, (idx*4)(DI) \
@ -516,7 +514,7 @@ TEXT ·genKeyStreamAsm(SB),NOSPLIT,$0
MOVQ ks_len+8(FP), BP MOVQ ks_len+8(FP), BP
MOVQ pState+24(FP), SI MOVQ pState+24(FP), SI
LOAD_STATE() LOAD_STATE
CMPB ·useAVX(SB), $1 CMPB ·useAVX(SB), $1
JE avxZucSixteens JE avxZucSixteens
@ -557,7 +555,8 @@ sseZucOctet:
ROUND_SSE(6) ROUND_SSE(6)
ROUND_SSE(7) ROUND_SSE(7)
LEAQ 32(DI), DI LEAQ 32(DI), DI
RESTORE_LFSR_8() RESTORE_LFSR_8
sseZucNibble: sseZucNibble:
CMPQ BP, $4 CMPQ BP, $4
JB sseZucDouble JB sseZucDouble
@ -567,7 +566,8 @@ sseZucNibble:
ROUND_SSE(2) ROUND_SSE(2)
ROUND_SSE(3) ROUND_SSE(3)
LEAQ 16(DI), DI LEAQ 16(DI), DI
RESTORE_LFSR_4() RESTORE_LFSR_4
sseZucDouble: sseZucDouble:
CMPQ BP, $2 CMPQ BP, $2
JB sseZucSingle JB sseZucSingle
@ -575,14 +575,16 @@ sseZucDouble:
ROUND_SSE(0) ROUND_SSE(0)
ROUND_SSE(1) ROUND_SSE(1)
LEAQ 8(DI), DI LEAQ 8(DI), DI
RESTORE_LFSR_2() RESTORE_LFSR_2
sseZucSingle: sseZucSingle:
TESTQ BP, BP TESTQ BP, BP
JE sseZucRet JE sseZucRet
ROUND_SSE(0) ROUND_SSE(0)
RESTORE_LFSR_0() RESTORE_LFSR_0
sseZucRet: sseZucRet:
SAVE_STATE() SAVE_STATE
RET RET
avxZucSixteens: avxZucSixteens:
@ -621,7 +623,8 @@ avxZucOctet:
ROUND_AVX(6) ROUND_AVX(6)
ROUND_AVX(7) ROUND_AVX(7)
LEAQ 32(DI), DI LEAQ 32(DI), DI
RESTORE_LFSR_8() RESTORE_LFSR_8
avxZucNibble: avxZucNibble:
CMPQ BP, $4 CMPQ BP, $4
JB avxZucDouble JB avxZucDouble
@ -631,7 +634,8 @@ avxZucNibble:
ROUND_AVX(2) ROUND_AVX(2)
ROUND_AVX(3) ROUND_AVX(3)
LEAQ 16(DI), DI LEAQ 16(DI), DI
RESTORE_LFSR_4() RESTORE_LFSR_4
avxZucDouble: avxZucDouble:
CMPQ BP, $2 CMPQ BP, $2
JB avxZucSingle JB avxZucSingle
@ -639,14 +643,16 @@ avxZucDouble:
ROUND_AVX(0) ROUND_AVX(0)
ROUND_AVX(1) ROUND_AVX(1)
LEAQ 8(DI), DI LEAQ 8(DI), DI
RESTORE_LFSR_2() RESTORE_LFSR_2
avxZucSingle: avxZucSingle:
TESTQ BP, BP TESTQ BP, BP
JE avxZucRet JE avxZucRet
ROUND_AVX(0) ROUND_AVX(0)
RESTORE_LFSR_0() RESTORE_LFSR_0
avxZucRet: avxZucRet:
SAVE_STATE() SAVE_STATE
RET RET
// func genKeyStreamRev32Asm(keyStream []byte, pState *zucState32) // func genKeyStreamRev32Asm(keyStream []byte, pState *zucState32)
@ -657,7 +663,7 @@ TEXT ·genKeyStreamRev32Asm(SB),NOSPLIT,$0
SHRQ $2, BP SHRQ $2, BP
LOAD_STATE() LOAD_STATE
CMPB ·useAVX(SB), $1 CMPB ·useAVX(SB), $1
JE avxZucSixteens JE avxZucSixteens
@ -698,7 +704,8 @@ sseZucOctet:
ROUND_REV32_SSE(6) ROUND_REV32_SSE(6)
ROUND_REV32_SSE(7) ROUND_REV32_SSE(7)
LEAQ 32(DI), DI LEAQ 32(DI), DI
RESTORE_LFSR_8() RESTORE_LFSR_8
sseZucNibble: sseZucNibble:
CMPQ BP, $4 CMPQ BP, $4
JB sseZucDouble JB sseZucDouble
@ -708,7 +715,8 @@ sseZucNibble:
ROUND_REV32_SSE(2) ROUND_REV32_SSE(2)
ROUND_REV32_SSE(3) ROUND_REV32_SSE(3)
LEAQ 16(DI), DI LEAQ 16(DI), DI
RESTORE_LFSR_4() RESTORE_LFSR_4
sseZucDouble: sseZucDouble:
CMPQ BP, $2 CMPQ BP, $2
JB sseZucSingle JB sseZucSingle
@ -716,14 +724,16 @@ sseZucDouble:
ROUND_REV32_SSE(0) ROUND_REV32_SSE(0)
ROUND_REV32_SSE(1) ROUND_REV32_SSE(1)
LEAQ 8(DI), DI LEAQ 8(DI), DI
RESTORE_LFSR_2() RESTORE_LFSR_2
sseZucSingle: sseZucSingle:
TESTQ BP, BP TESTQ BP, BP
JE sseZucRet JE sseZucRet
ROUND_REV32_SSE(0) ROUND_REV32_SSE(0)
RESTORE_LFSR_0() RESTORE_LFSR_0
sseZucRet: sseZucRet:
SAVE_STATE() SAVE_STATE
RET RET
avxZucSixteens: avxZucSixteens:
@ -762,7 +772,8 @@ avxZucOctet:
ROUND_REV32_AVX(6) ROUND_REV32_AVX(6)
ROUND_REV32_AVX(7) ROUND_REV32_AVX(7)
LEAQ 32(DI), DI LEAQ 32(DI), DI
RESTORE_LFSR_8() RESTORE_LFSR_8
avxZucNibble: avxZucNibble:
CMPQ BP, $4 CMPQ BP, $4
JB avxZucDouble JB avxZucDouble
@ -772,7 +783,8 @@ avxZucNibble:
ROUND_REV32_AVX(2) ROUND_REV32_AVX(2)
ROUND_REV32_AVX(3) ROUND_REV32_AVX(3)
LEAQ 16(DI), DI LEAQ 16(DI), DI
RESTORE_LFSR_4() RESTORE_LFSR_4
avxZucDouble: avxZucDouble:
CMPQ BP, $2 CMPQ BP, $2
JB avxZucSingle JB avxZucSingle
@ -780,12 +792,14 @@ avxZucDouble:
ROUND_REV32_AVX(0) ROUND_REV32_AVX(0)
ROUND_REV32_AVX(1) ROUND_REV32_AVX(1)
LEAQ 8(DI), DI LEAQ 8(DI), DI
RESTORE_LFSR_2() RESTORE_LFSR_2
avxZucSingle: avxZucSingle:
TESTQ BP, BP TESTQ BP, BP
JE avxZucRet JE avxZucRet
ROUND_REV32_AVX(0) ROUND_REV32_AVX(0)
RESTORE_LFSR_0() RESTORE_LFSR_0
avxZucRet: avxZucRet:
SAVE_STATE() SAVE_STATE
RET RET

View File

@ -1,4 +1,5 @@
// Referenced https://github.com/intel/intel-ipsec-mb/ // Referenced Intel(R) Multi-Buffer Crypto for IPsec
// https://github.com/intel/intel-ipsec-mb/
//go:build amd64 && !purego //go:build amd64 && !purego
// +build amd64,!purego // +build amd64,!purego

View File

@ -1,4 +1,5 @@
// Referenced https://github.com/intel/intel-ipsec-mb/ // Referenced Intel(R) Multi-Buffer Crypto for IPsec
// https://github.com/intel/intel-ipsec-mb/
//go:build amd64 && !purego //go:build amd64 && !purego
// +build amd64,!purego // +build amd64,!purego