mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-26 20:26:19 +08:00
zuc: minor optimization
This commit is contained in:
parent
a33c2ae118
commit
b721bed0cc
@ -17,10 +17,6 @@ DATA Low_nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
|
||||
DATA Low_nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
|
||||
GLOBL Low_nibble_mask<>(SB), RODATA, $16
|
||||
|
||||
DATA High_nibble_mask<>+0x00(SB)/8, $0xF0F0F0F0F0F0F0F0
|
||||
DATA High_nibble_mask<>+0x08(SB)/8, $0xF0F0F0F0F0F0F0F0
|
||||
GLOBL High_nibble_mask<>(SB), RODATA, $16
|
||||
|
||||
DATA P1<>+0x00(SB)/8, $0x0A020F0F0E000F09
|
||||
DATA P1<>+0x08(SB)/8, $0x090305070C000400
|
||||
GLOBL P1<>(SB), RODATA, $16
|
||||
@ -99,10 +95,9 @@ GLOBL flip_mask<>(SB), RODATA, $16
|
||||
#define S0_comput_SSE(IN_OUT, XTMP1, XTMP2) \
|
||||
MOVOU IN_OUT, XTMP1 \
|
||||
\
|
||||
PAND Low_nibble_mask<>(SB), IN_OUT \ // x2
|
||||
\
|
||||
PAND High_nibble_mask<>(SB), XTMP1 \
|
||||
PSRLQ $4, XTMP1 \ // x1
|
||||
PAND Low_nibble_mask<>(SB), XTMP1 \
|
||||
PAND Low_nibble_mask<>(SB), IN_OUT \ // x2
|
||||
\
|
||||
MOVOU P1<>(SB), XTMP2 \
|
||||
PSHUFB IN_OUT, XTMP2 \ // P1[x2]
|
||||
@ -124,16 +119,15 @@ GLOBL flip_mask<>(SB), RODATA, $16
|
||||
// for high and low nibble of each input byte, SSE version.
|
||||
#define MUL_PSHUFB_SSE(XIN, XLO, XHI_OUT, XTMP) \
|
||||
\ // Get low nibble of input data
|
||||
MOVOU Low_nibble_mask<>(SB), XTMP \
|
||||
PAND XIN, XTMP \
|
||||
MOVOU XIN, XTMP \
|
||||
PAND Low_nibble_mask<>(SB), XTMP \
|
||||
\ // Get low nibble of output
|
||||
PSHUFB XTMP, XLO \
|
||||
\ // Get high nibble of input data
|
||||
MOVOU High_nibble_mask<>(SB), XTMP \
|
||||
PAND XIN, XTMP \
|
||||
PSRLQ $4, XTMP \
|
||||
PSRLQ $4, XIN \
|
||||
PAND Low_nibble_mask<>(SB), XIN \
|
||||
\ // Get high nibble of output
|
||||
PSHUFB XTMP, XHI_OUT \
|
||||
PSHUFB XIN, XHI_OUT \
|
||||
\ // XOR high and low nibbles to get full bytes
|
||||
PXOR XLO, XHI_OUT
|
||||
|
||||
@ -160,9 +154,8 @@ GLOBL flip_mask<>(SB), RODATA, $16
|
||||
|
||||
// Compute 16 S0 box values from 16 bytes, AVX version.
|
||||
#define S0_comput_AVX(IN_OUT, XTMP1, XTMP2) \
|
||||
VPAND High_nibble_mask<>(SB), IN_OUT, XTMP1 \
|
||||
VPSRLQ $4, XTMP1, XTMP1 \ // x1
|
||||
\
|
||||
VPSRLQ $4, IN_OUT, XTMP1 \ // x1
|
||||
VPAND Low_nibble_mask<>(SB), XTMP1, XTMP1 \
|
||||
VPAND Low_nibble_mask<>(SB), IN_OUT, IN_OUT \ // x2
|
||||
\
|
||||
VMOVDQU P1<>(SB), XTMP2 \
|
||||
@ -189,8 +182,8 @@ GLOBL flip_mask<>(SB), RODATA, $16
|
||||
\ // Get low nibble of output
|
||||
VPSHUFB XTMP, XLO, XLO \
|
||||
\ // Get high nibble of input data
|
||||
VPAND High_nibble_mask<>(SB), XIN, XTMP \
|
||||
VPSRLQ $4, XTMP, XTMP \
|
||||
VPSRLQ $4, XIN, XTMP \
|
||||
VPAND Low_nibble_mask<>(SB), XTMP, XTMP \
|
||||
\ // Get high nibble of output
|
||||
VPSHUFB XTMP, XHI_OUT, XHI_OUT \
|
||||
\ // XOR high and low nibbles to get full bytes
|
||||
|
@ -94,9 +94,7 @@ GLOBL mask_S01<>(SB), RODATA, $32
|
||||
VORR XTMP0.B16, XDATA.B16, XDATA.B16
|
||||
|
||||
#define S0_comput(IN_OUT, XTMP1, XTMP2) \
|
||||
VUSHR $4, IN_OUT.S4, XTMP1.S4 \
|
||||
VAND NIBBLE_MASK.B16, XTMP1.B16, XTMP1.B16 \
|
||||
\
|
||||
VUSHR $4, IN_OUT.B16, XTMP1.B16 \
|
||||
VAND NIBBLE_MASK.B16, IN_OUT.B16, IN_OUT.B16 \
|
||||
\
|
||||
VTBL IN_OUT.B16, [P1.B16], XTMP2.B16 \
|
||||
|
@ -65,9 +65,8 @@ GLOBL rcon<>(SB), RODATA, $160
|
||||
LXVD2X (R4)(R5), S1_MASK
|
||||
|
||||
#define S0_comput(IN_OUT, V_FOUR, XTMP1, XTMP2) \
|
||||
VSRW IN_OUT, V_FOUR, XTMP1; \
|
||||
VAND XTMP1, NIBBLE_MASK, XTMP1; \
|
||||
VAND IN_OUT, NIBBLE_MASK, IN_OUT; \
|
||||
VSRB IN_OUT, V_FOUR, XTMP1; \ // XTMP1 = hi 4 bits of IN_OUT
|
||||
VAND IN_OUT, NIBBLE_MASK, IN_OUT; \ // low 4 bits of IN_OUT
|
||||
VPERM P1, P1, IN_OUT, XTMP2; \
|
||||
VXOR XTMP1, XTMP2, XTMP2; \
|
||||
VPERM P2, P2, XTMP2, XTMP1; \
|
||||
@ -87,8 +86,6 @@ GLOBL rcon<>(SB), RODATA, $160
|
||||
// zuc sbox function
|
||||
// parameters:
|
||||
// - x: 128 bits register as sbox input/output data
|
||||
// - y: 128 bits temp register
|
||||
// - z: 128 bits temp register
|
||||
#define S1_comput(x, y, z) \
|
||||
VPERMXOR M1H, M1L, x, x; \
|
||||
VSBOX x, x; \
|
||||
|
@ -2,6 +2,11 @@
|
||||
|
||||
package zuc
|
||||
|
||||
import (
|
||||
"github.com/emmansun/gmsm/internal/cpuid"
|
||||
"golang.org/x/sys/cpu"
|
||||
)
|
||||
|
||||
// Generate single keyword, 4 bytes.
|
||||
//
|
||||
//go:noescape
|
||||
@ -12,6 +17,9 @@ func genKeywordAsm(s *zucState32) uint32
|
||||
//go:noescape
|
||||
func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
|
||||
|
||||
var supportsAES = cpuid.HasAES
|
||||
var useAVX = cpu.X86.HasAVX
|
||||
|
||||
func genKeyStream(keyStream []uint32, pState *zucState32) {
|
||||
if supportsAES {
|
||||
genKeyStreamAsm(keyStream, pState)
|
||||
|
@ -42,8 +42,7 @@ TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
|
||||
// Reverse data bytes
|
||||
VLD1 (CX), [XDATA.B16]
|
||||
VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
|
||||
VUSHR $4, XDATA.S4, XTMP1.S4
|
||||
VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16
|
||||
VUSHR $4, XDATA.B16, XTMP1.B16
|
||||
|
||||
VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
|
||||
VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
|
||||
@ -115,8 +114,7 @@ TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
|
||||
// Reverse data bytes
|
||||
VLD1 (CX), [XDATA.B16]
|
||||
VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
|
||||
VUSHR $4, XDATA.S4, XTMP1.S4
|
||||
VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16
|
||||
VUSHR $4, XDATA.B16, XTMP1.B16
|
||||
|
||||
VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
|
||||
VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
|
||||
|
@ -4,12 +4,9 @@ package zuc
|
||||
|
||||
import (
|
||||
"github.com/emmansun/gmsm/internal/cpuid"
|
||||
"golang.org/x/sys/cpu"
|
||||
)
|
||||
|
||||
var supportsAES = cpuid.HasAES
|
||||
var supportsGFMUL = cpuid.HasGFMUL || cpuid.HasVPMSUMD
|
||||
var useAVX = cpu.X86.HasAVX
|
||||
|
||||
//go:noescape
|
||||
func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
|
||||
|
@ -53,8 +53,7 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
|
||||
// Reverse data bytes
|
||||
VLD1 (CX), [XDATA.B16]
|
||||
VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
|
||||
VUSHR $4, XDATA.S4, XTMP1.S4
|
||||
VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16
|
||||
VUSHR $4, XDATA.B16, XTMP1.B16
|
||||
|
||||
VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
|
||||
VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
|
||||
|
Loading…
x
Reference in New Issue
Block a user