zuc: minor optimization

This commit is contained in:
Sun Yimin 2024-11-08 11:03:43 +08:00 committed by GitHub
parent a33c2ae118
commit b721bed0cc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 31 additions and 41 deletions

View File

@ -17,10 +17,6 @@ DATA Low_nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA Low_nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL Low_nibble_mask<>(SB), RODATA, $16
DATA High_nibble_mask<>+0x00(SB)/8, $0xF0F0F0F0F0F0F0F0
DATA High_nibble_mask<>+0x08(SB)/8, $0xF0F0F0F0F0F0F0F0
GLOBL High_nibble_mask<>(SB), RODATA, $16
DATA P1<>+0x00(SB)/8, $0x0A020F0F0E000F09
DATA P1<>+0x08(SB)/8, $0x090305070C000400
GLOBL P1<>(SB), RODATA, $16
@ -99,10 +95,9 @@ GLOBL flip_mask<>(SB), RODATA, $16
#define S0_comput_SSE(IN_OUT, XTMP1, XTMP2) \
MOVOU IN_OUT, XTMP1 \
\
PAND Low_nibble_mask<>(SB), IN_OUT \ // x2
\
PAND High_nibble_mask<>(SB), XTMP1 \
PSRLQ $4, XTMP1 \ // x1
PAND Low_nibble_mask<>(SB), XTMP1 \
PAND Low_nibble_mask<>(SB), IN_OUT \ // x2
\
MOVOU P1<>(SB), XTMP2 \
PSHUFB IN_OUT, XTMP2 \ // P1[x2]
@ -124,16 +119,15 @@ GLOBL flip_mask<>(SB), RODATA, $16
// for high and low nibble of each input byte, SSE version.
#define MUL_PSHUFB_SSE(XIN, XLO, XHI_OUT, XTMP) \
\ // Get low nibble of input data
MOVOU Low_nibble_mask<>(SB), XTMP \
PAND XIN, XTMP \
MOVOU XIN, XTMP \
PAND Low_nibble_mask<>(SB), XTMP \
\ // Get low nibble of output
PSHUFB XTMP, XLO \
\ // Get high nibble of input data
MOVOU High_nibble_mask<>(SB), XTMP \
PAND XIN, XTMP \
PSRLQ $4, XTMP \
PSRLQ $4, XIN \
PAND Low_nibble_mask<>(SB), XIN \
\ // Get high nibble of output
PSHUFB XTMP, XHI_OUT \
PSHUFB XIN, XHI_OUT \
\ // XOR high and low nibbles to get full bytes
PXOR XLO, XHI_OUT
@ -160,9 +154,8 @@ GLOBL flip_mask<>(SB), RODATA, $16
// Compute 16 S0 box values from 16 bytes, AVX version.
#define S0_comput_AVX(IN_OUT, XTMP1, XTMP2) \
VPAND High_nibble_mask<>(SB), IN_OUT, XTMP1 \
VPSRLQ $4, XTMP1, XTMP1 \ // x1
\
VPSRLQ $4, IN_OUT, XTMP1 \ // x1
VPAND Low_nibble_mask<>(SB), XTMP1, XTMP1 \
VPAND Low_nibble_mask<>(SB), IN_OUT, IN_OUT \ // x2
\
VMOVDQU P1<>(SB), XTMP2 \
@ -189,8 +182,8 @@ GLOBL flip_mask<>(SB), RODATA, $16
\ // Get low nibble of output
VPSHUFB XTMP, XLO, XLO \
\ // Get high nibble of input data
VPAND High_nibble_mask<>(SB), XIN, XTMP \
VPSRLQ $4, XTMP, XTMP \
VPSRLQ $4, XIN, XTMP \
VPAND Low_nibble_mask<>(SB), XTMP, XTMP \
\ // Get high nibble of output
VPSHUFB XTMP, XHI_OUT, XHI_OUT \
\ // XOR high and low nibbles to get full bytes

View File

@ -94,9 +94,7 @@ GLOBL mask_S01<>(SB), RODATA, $32
VORR XTMP0.B16, XDATA.B16, XDATA.B16
#define S0_comput(IN_OUT, XTMP1, XTMP2) \
VUSHR $4, IN_OUT.S4, XTMP1.S4 \
VAND NIBBLE_MASK.B16, XTMP1.B16, XTMP1.B16 \
\
VUSHR $4, IN_OUT.B16, XTMP1.B16 \
VAND NIBBLE_MASK.B16, IN_OUT.B16, IN_OUT.B16 \
\
VTBL IN_OUT.B16, [P1.B16], XTMP2.B16 \

View File

@ -65,9 +65,8 @@ GLOBL rcon<>(SB), RODATA, $160
LXVD2X (R4)(R5), S1_MASK
#define S0_comput(IN_OUT, V_FOUR, XTMP1, XTMP2) \
VSRW IN_OUT, V_FOUR, XTMP1; \
VAND XTMP1, NIBBLE_MASK, XTMP1; \
VAND IN_OUT, NIBBLE_MASK, IN_OUT; \
VSRB IN_OUT, V_FOUR, XTMP1; \ // XTMP1 = hi 4 bits of IN_OUT
VAND IN_OUT, NIBBLE_MASK, IN_OUT; \ // low 4 bits of IN_OUT
VPERM P1, P1, IN_OUT, XTMP2; \
VXOR XTMP1, XTMP2, XTMP2; \
VPERM P2, P2, XTMP2, XTMP1; \
@ -87,8 +86,6 @@ GLOBL rcon<>(SB), RODATA, $160
// zuc sbox function
// parameters:
// - x: 128-bit register used as sbox input/output data
// - y: 128 bits temp register
// - z: 128 bits temp register
#define S1_comput(x, y, z) \
VPERMXOR M1H, M1L, x, x; \
VSBOX x, x; \

View File

@ -2,6 +2,11 @@
package zuc
import (
"github.com/emmansun/gmsm/internal/cpuid"
"golang.org/x/sys/cpu"
)
// Generate single keyword, 4 bytes.
//
//go:noescape
@ -12,6 +17,9 @@ func genKeywordAsm(s *zucState32) uint32
//go:noescape
func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
var supportsAES = cpuid.HasAES
var useAVX = cpu.X86.HasAVX
func genKeyStream(keyStream []uint32, pState *zucState32) {
if supportsAES {
genKeyStreamAsm(keyStream, pState)

View File

@ -42,8 +42,7 @@ TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
// Reverse data bytes
VLD1 (CX), [XDATA.B16]
VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
VUSHR $4, XDATA.S4, XTMP1.S4
VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16
VUSHR $4, XDATA.B16, XTMP1.B16
VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
@ -115,8 +114,7 @@ TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
// Reverse data bytes
VLD1 (CX), [XDATA.B16]
VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
VUSHR $4, XDATA.S4, XTMP1.S4
VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16
VUSHR $4, XDATA.B16, XTMP1.B16
VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16

View File

@ -4,12 +4,9 @@ package zuc
import (
"github.com/emmansun/gmsm/internal/cpuid"
"golang.org/x/sys/cpu"
)
var supportsAES = cpuid.HasAES
var supportsGFMUL = cpuid.HasGFMUL || cpuid.HasVPMSUMD
var useAVX = cpu.X86.HasAVX
//go:noescape
func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)

View File

@ -53,8 +53,7 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
// Reverse data bytes
VLD1 (CX), [XDATA.B16]
VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
VUSHR $4, XDATA.S4, XTMP1.S4
VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16
VUSHR $4, XDATA.B16, XTMP1.B16
VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16