mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-26 04:06:18 +08:00
zuc: EIA performance improvement
This commit is contained in:
parent
87f6f6a736
commit
39274df2bd
@ -48,10 +48,18 @@ func (s *zucState32) f32(x0, x1, x2 uint32) uint32 {
|
|||||||
fmt.Println()
|
fmt.Println()
|
||||||
```
|
```
|
||||||
|
|
||||||
## Performance with AMD64 SIMD & AESNI:
|
## EEA Performance with AMD64 SIMD & AESNI:
|
||||||
goos: windows
|
goos: windows
|
||||||
goarch: amd64
|
goarch: amd64
|
||||||
pkg: github.com/emmansun/gmsm/zuc
|
pkg: github.com/emmansun/gmsm/zuc
|
||||||
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
|
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
|
||||||
BenchmarkEncrypt1K-6 409755 2802 ns/op 363.62 MB/s
|
BenchmarkEncrypt1K-6 409755 2802 ns/op 363.62 MB/s
|
||||||
BenchmarkEncrypt8K-6 54120 22413 ns/op 365.28 MB/s
|
BenchmarkEncrypt8K-6 54120 22413 ns/op 365.28 MB/s
|
||||||
|
|
||||||
|
## EIA Performance with AMD64 SIMD & AESNI & CLMUL:
|
||||||
|
goos: windows
|
||||||
|
goarch: amd64
|
||||||
|
pkg: github.com/emmansun/gmsm/zuc
|
||||||
|
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
|
||||||
|
BenchmarkHash1K-6 317750 3833 ns/op 267.13 MB/s
|
||||||
|
BenchmarkHash8K-6 40460 28921 ns/op 283.26 MB/s
|
||||||
|
1064
zuc/asm_amd64.s
1064
zuc/asm_amd64.s
File diff suppressed because it is too large
Load Diff
608
zuc/asm_arm64.s
608
zuc/asm_arm64.s
@ -127,35 +127,35 @@ GLOBL mask_S1<>(SB), RODATA, $16
|
|||||||
VMOV R1, INVERSE_SHIFT_ROWS.D[1]
|
VMOV R1, INVERSE_SHIFT_ROWS.D[1]
|
||||||
|
|
||||||
#define SHLDL(a, b, n) \ // NO SHLDL in GOLANG now
|
#define SHLDL(a, b, n) \ // NO SHLDL in GOLANG now
|
||||||
LSLW n, a \
|
LSLW n, a \
|
||||||
LSRW n, b \
|
LSRW n, b \
|
||||||
ORRW b, a
|
ORRW b, a
|
||||||
|
|
||||||
#define Rotl_5(XDATA, XTMP0) \
|
#define Rotl_5(XDATA, XTMP0) \
|
||||||
VSHL $5, XDATA.S4, XTMP0.S4 \
|
VSHL $5, XDATA.S4, XTMP0.S4 \
|
||||||
VUSHR $3, XDATA.S4, XDATA.S4 \
|
VUSHR $3, XDATA.S4, XDATA.S4 \
|
||||||
VAND TOP3_BITS.B16, XTMP0.B16, XTMP0.B16 \
|
VAND TOP3_BITS.B16, XTMP0.B16, XTMP0.B16 \
|
||||||
VAND BOTTOM5_BITS.B16, XDATA.B16, XDATA.B16 \
|
VAND BOTTOM5_BITS.B16, XDATA.B16, XDATA.B16 \
|
||||||
VORR XTMP0.B16, XDATA.B16, XDATA.B16
|
VORR XTMP0.B16, XDATA.B16, XDATA.B16
|
||||||
|
|
||||||
#define S0_comput(IN_OUT, XTMP1, XTMP2) \
|
#define S0_comput(IN_OUT, XTMP1, XTMP2) \
|
||||||
VUSHR $4, IN_OUT.S4, XTMP1.S4 \
|
VUSHR $4, IN_OUT.S4, XTMP1.S4 \
|
||||||
VAND NIBBLE_MASK.B16, XTMP1.B16, XTMP1.B16 \
|
VAND NIBBLE_MASK.B16, XTMP1.B16, XTMP1.B16 \
|
||||||
\
|
\
|
||||||
VAND NIBBLE_MASK.B16, IN_OUT.B16, IN_OUT.B16 \
|
VAND NIBBLE_MASK.B16, IN_OUT.B16, IN_OUT.B16 \
|
||||||
\
|
\
|
||||||
VTBL IN_OUT.B16, [P1.B16], XTMP2.B16 \
|
VTBL IN_OUT.B16, [P1.B16], XTMP2.B16 \
|
||||||
VEOR XTMP1.B16, XTMP2.B16, XTMP2.B16 \
|
VEOR XTMP1.B16, XTMP2.B16, XTMP2.B16 \
|
||||||
\
|
\
|
||||||
VTBL XTMP2.B16, [P2.B16], XTMP1.B16 \
|
VTBL XTMP2.B16, [P2.B16], XTMP1.B16 \
|
||||||
VEOR IN_OUT.B16, XTMP1.B16, XTMP1.B16 \
|
VEOR IN_OUT.B16, XTMP1.B16, XTMP1.B16 \
|
||||||
\
|
\
|
||||||
VTBL XTMP1.B16, [P3.B16], IN_OUT.B16 \
|
VTBL XTMP1.B16, [P3.B16], IN_OUT.B16 \
|
||||||
VEOR XTMP2.B16, IN_OUT.B16, IN_OUT.B16 \
|
VEOR XTMP2.B16, IN_OUT.B16, IN_OUT.B16 \
|
||||||
\
|
\
|
||||||
VSHL $4, IN_OUT.S4, IN_OUT.S4 \
|
VSHL $4, IN_OUT.S4, IN_OUT.S4 \
|
||||||
VEOR XTMP1.B16, IN_OUT.B16, IN_OUT.B16 \
|
VEOR XTMP1.B16, IN_OUT.B16, IN_OUT.B16 \
|
||||||
Rotl_5(IN_OUT, XTMP1)
|
Rotl_5(IN_OUT, XTMP1)
|
||||||
|
|
||||||
#define S1_comput(x, XTMP1, XTMP2) \
|
#define S1_comput(x, XTMP1, XTMP2) \
|
||||||
VAND x.B16, NIBBLE_MASK.B16, XTMP1.B16; \
|
VAND x.B16, NIBBLE_MASK.B16, XTMP1.B16; \
|
||||||
@ -174,335 +174,335 @@ GLOBL mask_S1<>(SB), RODATA, $16
|
|||||||
VEOR XTMP2.B16, XTMP1.B16, x.B16
|
VEOR XTMP2.B16, XTMP1.B16, x.B16
|
||||||
|
|
||||||
#define BITS_REORG(idx) \
|
#define BITS_REORG(idx) \
|
||||||
MOVW (((15 + idx) % 16)*4)(SI), R12 \
|
MOVW (((15 + idx) % 16)*4)(SI), R12 \
|
||||||
MOVW (((14 + idx) % 16)*4)(SI), AX \
|
MOVW (((14 + idx) % 16)*4)(SI), AX \
|
||||||
MOVW (((11 + idx) % 16)*4)(SI), R13 \
|
MOVW (((11 + idx) % 16)*4)(SI), R13 \
|
||||||
MOVW (((9 + idx) % 16)*4)(SI), BX \
|
MOVW (((9 + idx) % 16)*4)(SI), BX \
|
||||||
MOVW (((7 + idx) % 16)*4)(SI), R14 \
|
MOVW (((7 + idx) % 16)*4)(SI), R14 \
|
||||||
MOVW (((5 + idx) % 16)*4)(SI), CX \
|
MOVW (((5 + idx) % 16)*4)(SI), CX \
|
||||||
MOVW (((2 + idx) % 16)*4)(SI), R15 \
|
MOVW (((2 + idx) % 16)*4)(SI), R15 \
|
||||||
MOVW (((0 + idx) % 16)*4)(SI), DX \
|
MOVW (((0 + idx) % 16)*4)(SI), DX \
|
||||||
LSRW $15, R12 \
|
LSRW $15, R12 \
|
||||||
LSLW $16, AX \
|
LSLW $16, AX \
|
||||||
LSLW $1, BX \
|
LSLW $1, BX \
|
||||||
LSLW $1, CX \
|
LSLW $1, CX \
|
||||||
LSLW $1, DX \
|
LSLW $1, DX \
|
||||||
SHLDL(R12, AX, $16) \
|
SHLDL(R12, AX, $16) \
|
||||||
SHLDL(R13, BX, $16) \
|
SHLDL(R13, BX, $16) \
|
||||||
SHLDL(R14, CX, $16) \
|
SHLDL(R14, CX, $16) \
|
||||||
SHLDL(R15, DX, $16)
|
SHLDL(R15, DX, $16)
|
||||||
|
|
||||||
#define LFSR_UPDT(idx) \
|
#define LFSR_UPDT(idx) \
|
||||||
MOVW (((0 + idx) % 16)*4)(SI), BX \
|
MOVW (((0 + idx) % 16)*4)(SI), BX \
|
||||||
MOVW (((4 + idx) % 16)*4)(SI), CX \
|
MOVW (((4 + idx) % 16)*4)(SI), CX \
|
||||||
MOVW (((10 + idx) % 16)*4)(SI), DX \
|
MOVW (((10 + idx) % 16)*4)(SI), DX \
|
||||||
MOVW (((13 + idx) % 16)*4)(SI), R8 \
|
MOVW (((13 + idx) % 16)*4)(SI), R8 \
|
||||||
MOVW (((15 + idx) % 16)*4)(SI), R9 \
|
MOVW (((15 + idx) % 16)*4)(SI), R9 \
|
||||||
ADD BX, AX \
|
ADD BX, AX \
|
||||||
LSL $8, BX \
|
LSL $8, BX \
|
||||||
LSL $20, CX \
|
LSL $20, CX \
|
||||||
LSL $21, DX \
|
LSL $21, DX \
|
||||||
LSL $17, R8 \
|
LSL $17, R8 \
|
||||||
LSL $15, R9 \
|
LSL $15, R9 \
|
||||||
ADD BX, AX \
|
ADD BX, AX \
|
||||||
ADD CX, AX \
|
ADD CX, AX \
|
||||||
ADD DX, AX \
|
ADD DX, AX \
|
||||||
ADD R8, AX \
|
ADD R8, AX \
|
||||||
ADD R9, AX \
|
ADD R9, AX \
|
||||||
\
|
\
|
||||||
MOVD AX, BX \
|
MOVD AX, BX \
|
||||||
AND $0x7FFFFFFF, AX \
|
AND $0x7FFFFFFF, AX \
|
||||||
LSR $31, BX \
|
LSR $31, BX \
|
||||||
ADD BX, AX \
|
ADD BX, AX \
|
||||||
\
|
\
|
||||||
SUBS $0x7FFFFFFF, AX, BX \
|
SUBS $0x7FFFFFFF, AX, BX \
|
||||||
CSEL CS, BX, AX, AX \
|
CSEL CS, BX, AX, AX \
|
||||||
\
|
\
|
||||||
MOVW AX, (((0 + idx) % 16)*4)(SI)
|
MOVW AX, (((0 + idx) % 16)*4)(SI)
|
||||||
|
|
||||||
#define NONLIN_FUN() \
|
#define NONLIN_FUN() \
|
||||||
MOVW R12, AX \
|
MOVW R12, AX \
|
||||||
EORW R10, AX \
|
EORW R10, AX \
|
||||||
ADDW R11, AX \
|
ADDW R11, AX \
|
||||||
ADDW R13, R10 \ // W1= F_R1 + BRC_X1
|
ADDW R13, R10 \ // W1= F_R1 + BRC_X1
|
||||||
EORW R14, R11 \ // W2= F_R2 ^ BRC_X2
|
EORW R14, R11 \ // W2= F_R2 ^ BRC_X2
|
||||||
\
|
\
|
||||||
MOVW R10, DX \
|
MOVW R10, DX \
|
||||||
MOVW R11, CX \
|
MOVW R11, CX \
|
||||||
SHLDL(DX, CX, $16) \ // P = (W1 << 16) | (W2 >> 16)
|
SHLDL(DX, CX, $16) \ // P = (W1 << 16) | (W2 >> 16)
|
||||||
SHLDL(R11, R10, $16) \ // Q = (W2 << 16) | (W1 >> 16)
|
SHLDL(R11, R10, $16) \ // Q = (W2 << 16) | (W1 >> 16)
|
||||||
MOVW DX, BX \
|
MOVW DX, BX \
|
||||||
MOVW DX, CX \
|
MOVW DX, CX \
|
||||||
MOVW DX, R8 \
|
MOVW DX, R8 \
|
||||||
MOVW DX, R9 \
|
MOVW DX, R9 \
|
||||||
RORW $30, BX \
|
RORW $30, BX \
|
||||||
RORW $22, CX \
|
RORW $22, CX \
|
||||||
RORW $14, R8 \
|
RORW $14, R8 \
|
||||||
RORW $8, R9 \
|
RORW $8, R9 \
|
||||||
EORW BX, DX \
|
EORW BX, DX \
|
||||||
EORW CX, DX \
|
EORW CX, DX \
|
||||||
EORW R8, DX \
|
EORW R8, DX \
|
||||||
EORW R9, DX \ // U = L1(P) = EDX, hi(RDX)=0
|
EORW R9, DX \ // U = L1(P) = EDX, hi(RDX)=0
|
||||||
MOVW R11, BX \
|
MOVW R11, BX \
|
||||||
MOVW R11, CX \
|
MOVW R11, CX \
|
||||||
MOVW R11, R8 \
|
MOVW R11, R8 \
|
||||||
MOVW R11, R9 \
|
MOVW R11, R9 \
|
||||||
RORW $24, BX \
|
RORW $24, BX \
|
||||||
RORW $18, CX \
|
RORW $18, CX \
|
||||||
RORW $10, R8 \
|
RORW $10, R8 \
|
||||||
RORW $2, R9 \
|
RORW $2, R9 \
|
||||||
EORW BX, R11 \
|
EORW BX, R11 \
|
||||||
EORW CX, R11 \
|
EORW CX, R11 \
|
||||||
EORW R8, R11 \
|
EORW R8, R11 \
|
||||||
EORW R9, R11 \ // V = L2(Q) = R11D, hi(R11)=0
|
EORW R9, R11 \ // V = L2(Q) = R11D, hi(R11)=0
|
||||||
LSL $32, R11 \
|
LSL $32, R11 \
|
||||||
EOR R11, DX \
|
EOR R11, DX \
|
||||||
VMOV DX, V0.D2 \
|
VMOV DX, V0.D2 \
|
||||||
VMOV V0.B16, V1.B16 \
|
VMOV V0.B16, V1.B16 \
|
||||||
S0_comput(V1, V2, V3) \
|
S0_comput(V1, V2, V3) \
|
||||||
S1_comput(V0, V2, V3) \
|
S1_comput(V0, V2, V3) \
|
||||||
\
|
\
|
||||||
VAND S1_MASK.B16, V0.B16, V0.B16 \
|
VAND S1_MASK.B16, V0.B16, V0.B16 \
|
||||||
VAND S0_MASK.B16, V1.B16, V1.B16 \
|
VAND S0_MASK.B16, V1.B16, V1.B16 \
|
||||||
VEOR V1.B16, V0.B16, V0.B16 \
|
VEOR V1.B16, V0.B16, V0.B16 \
|
||||||
\
|
\
|
||||||
VMOV V0.S[0], R10 \ // F_R1
|
VMOV V0.S[0], R10 \ // F_R1
|
||||||
VMOV V0.S[1], R11
|
VMOV V0.S[1], R11
|
||||||
|
|
||||||
#define RESTORE_LFSR_0() \
|
#define RESTORE_LFSR_0() \
|
||||||
MOVW.P 4(SI), AX \
|
MOVW.P 4(SI), AX \
|
||||||
VLD1 (SI), [V0.B16, V1.B16, V2.B16] \
|
VLD1 (SI), [V0.B16, V1.B16, V2.B16] \
|
||||||
SUB $4, SI \
|
SUB $4, SI \
|
||||||
MOVD (52)(SI), BX \
|
MOVD (52)(SI), BX \
|
||||||
MOVW (60)(SI), CX \
|
MOVW (60)(SI), CX \
|
||||||
\
|
\
|
||||||
VST1 [V0.B16, V1.B16, V2.B16], (SI) \
|
VST1 [V0.B16, V1.B16, V2.B16], (SI) \
|
||||||
MOVD BX, (48)(SI) \
|
MOVD BX, (48)(SI) \
|
||||||
MOVW CX, (56)(SI) \
|
MOVW CX, (56)(SI) \
|
||||||
MOVW AX, (60)(SI)
|
MOVW AX, (60)(SI)
|
||||||
|
|
||||||
#define RESTORE_LFSR_2() \
|
#define RESTORE_LFSR_2() \
|
||||||
MOVD.P 8(SI), AX \
|
MOVD.P 8(SI), AX \
|
||||||
VLD1 (SI), [V0.B16, V1.B16, V2.B16] \
|
VLD1 (SI), [V0.B16, V1.B16, V2.B16] \
|
||||||
SUB $8, SI \
|
SUB $8, SI \
|
||||||
MOVD (56)(SI), BX \
|
MOVD (56)(SI), BX \
|
||||||
\
|
\
|
||||||
VST1 [V0.B16, V1.B16, V2.B16], (SI) \
|
VST1 [V0.B16, V1.B16, V2.B16], (SI) \
|
||||||
MOVD BX, (48)(SI) \
|
MOVD BX, (48)(SI) \
|
||||||
MOVD AX, (56)(SI)
|
MOVD AX, (56)(SI)
|
||||||
|
|
||||||
#define RESTORE_LFSR_4() \
|
#define RESTORE_LFSR_4() \
|
||||||
VLD1 (SI), [V0.B16, V1.B16, V2.B16, V3.B16] \
|
VLD1 (SI), [V0.B16, V1.B16, V2.B16, V3.B16] \
|
||||||
\
|
\
|
||||||
VST1.P [V1.B16, V2.B16, V3.B16], 48(SI) \
|
VST1.P [V1.B16, V2.B16, V3.B16], 48(SI) \
|
||||||
VST1 [V0.B16], (SI) \
|
VST1 [V0.B16], (SI) \
|
||||||
SUB $48, SI
|
SUB $48, SI
|
||||||
|
|
||||||
#define RESTORE_LFSR_8() \
|
#define RESTORE_LFSR_8() \
|
||||||
VLD1 (SI), [V0.B16, V1.B16, V2.B16, V3.B16] \
|
VLD1 (SI), [V0.B16, V1.B16, V2.B16, V3.B16] \
|
||||||
\
|
\
|
||||||
VST1.P [V2.B16, V3.B16], 32(SI) \
|
VST1.P [V2.B16, V3.B16], 32(SI) \
|
||||||
VST1 [V0.B16, V1.B16], (SI) \
|
VST1 [V0.B16, V1.B16], (SI) \
|
||||||
SUB $32, SI
|
SUB $32, SI
|
||||||
|
|
||||||
#define LOAD_STATE(r) \
|
#define LOAD_STATE(r) \
|
||||||
MOVW 64+r, R10 \
|
MOVW 64+r, R10 \
|
||||||
MOVW 68+r, R11 \
|
MOVW 68+r, R11 \
|
||||||
MOVW 72+r, R12 \
|
MOVW 72+r, R12 \
|
||||||
MOVW 76+r, R13 \
|
MOVW 76+r, R13 \
|
||||||
MOVW 80+r, R14 \
|
MOVW 80+r, R14 \
|
||||||
MOVW 84+r, R15
|
MOVW 84+r, R15
|
||||||
|
|
||||||
#define SAVE_STATE(r) \
|
#define SAVE_STATE(r) \
|
||||||
MOVW R10, 64+r \
|
MOVW R10, 64+r \
|
||||||
MOVW R11, 68+r \
|
MOVW R11, 68+r \
|
||||||
MOVW R12, 72+r \
|
MOVW R12, 72+r \
|
||||||
MOVW R13, 76+r \
|
MOVW R13, 76+r \
|
||||||
MOVW R14, 80+r \
|
MOVW R14, 80+r \
|
||||||
MOVW R15, 84+r
|
MOVW R15, 84+r
|
||||||
|
|
||||||
// func genKeywordAsm(s *zucState32) uint32
|
// func genKeywordAsm(s *zucState32) uint32
|
||||||
TEXT ·genKeywordAsm(SB),NOSPLIT,$0
|
TEXT ·genKeywordAsm(SB),NOSPLIT,$0
|
||||||
LOAD_GLOBAL_DATA()
|
LOAD_GLOBAL_DATA()
|
||||||
VEOR ZERO.B16, ZERO.B16, ZERO.B16
|
VEOR ZERO.B16, ZERO.B16, ZERO.B16
|
||||||
|
|
||||||
MOVD pState+0(FP), SI
|
MOVD pState+0(FP), SI
|
||||||
LOAD_STATE(0(SI))
|
LOAD_STATE(0(SI))
|
||||||
|
|
||||||
BITS_REORG(0)
|
BITS_REORG(0)
|
||||||
NONLIN_FUN()
|
NONLIN_FUN()
|
||||||
|
|
||||||
EORW R15, AX
|
EORW R15, AX
|
||||||
MOVW AX, ret+8(FP)
|
MOVW AX, ret+8(FP)
|
||||||
EOR AX, AX
|
EOR AX, AX
|
||||||
LFSR_UPDT(0)
|
LFSR_UPDT(0)
|
||||||
SAVE_STATE(0(SI))
|
SAVE_STATE(0(SI))
|
||||||
RESTORE_LFSR_0()
|
RESTORE_LFSR_0()
|
||||||
|
|
||||||
RET
|
RET
|
||||||
|
|
||||||
#define ONEROUND(idx) \
|
#define ONEROUND(idx) \
|
||||||
BITS_REORG(idx) \
|
BITS_REORG(idx) \
|
||||||
NONLIN_FUN() \
|
NONLIN_FUN() \
|
||||||
EORW R15, AX \
|
EORW R15, AX \
|
||||||
MOVW AX, (idx*4)(DI) \
|
MOVW AX, (idx*4)(DI) \
|
||||||
EOR AX, AX \
|
EOR AX, AX \
|
||||||
LFSR_UPDT(idx)
|
LFSR_UPDT(idx)
|
||||||
|
|
||||||
#define ROUND_REV32(idx) \
|
#define ROUND_REV32(idx) \
|
||||||
BITS_REORG(idx) \
|
BITS_REORG(idx) \
|
||||||
NONLIN_FUN() \
|
NONLIN_FUN() \
|
||||||
EORW R15, AX \
|
EORW R15, AX \
|
||||||
REVW AX, AX \
|
REVW AX, AX \
|
||||||
MOVW AX, (idx*4)(DI) \
|
MOVW AX, (idx*4)(DI) \
|
||||||
EOR AX, AX \
|
EOR AX, AX \
|
||||||
LFSR_UPDT(idx)
|
LFSR_UPDT(idx)
|
||||||
|
|
||||||
// func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
|
// func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
|
||||||
TEXT ·genKeyStreamAsm(SB),NOSPLIT,$0
|
TEXT ·genKeyStreamAsm(SB),NOSPLIT,$0
|
||||||
LOAD_GLOBAL_DATA()
|
LOAD_GLOBAL_DATA()
|
||||||
VEOR ZERO.B16, ZERO.B16, ZERO.B16
|
VEOR ZERO.B16, ZERO.B16, ZERO.B16
|
||||||
|
|
||||||
MOVD ks+0(FP), DI
|
MOVD ks+0(FP), DI
|
||||||
MOVD ks_len+8(FP), BP
|
MOVD ks_len+8(FP), BP
|
||||||
MOVD pState+24(FP), SI
|
MOVD pState+24(FP), SI
|
||||||
|
|
||||||
LOAD_STATE(0(SI))
|
LOAD_STATE(0(SI))
|
||||||
|
|
||||||
zucSixteens:
|
zucSixteens:
|
||||||
CMP $16, BP
|
CMP $16, BP
|
||||||
BLT zucOctet
|
BLT zucOctet
|
||||||
SUB $16, BP
|
SUB $16, BP
|
||||||
ONEROUND(0)
|
ONEROUND(0)
|
||||||
ONEROUND(1)
|
ONEROUND(1)
|
||||||
ONEROUND(2)
|
ONEROUND(2)
|
||||||
ONEROUND(3)
|
ONEROUND(3)
|
||||||
ONEROUND(4)
|
ONEROUND(4)
|
||||||
ONEROUND(5)
|
ONEROUND(5)
|
||||||
ONEROUND(6)
|
ONEROUND(6)
|
||||||
ONEROUND(7)
|
ONEROUND(7)
|
||||||
ONEROUND(8)
|
ONEROUND(8)
|
||||||
ONEROUND(9)
|
ONEROUND(9)
|
||||||
ONEROUND(10)
|
ONEROUND(10)
|
||||||
ONEROUND(11)
|
ONEROUND(11)
|
||||||
ONEROUND(12)
|
ONEROUND(12)
|
||||||
ONEROUND(13)
|
ONEROUND(13)
|
||||||
ONEROUND(14)
|
ONEROUND(14)
|
||||||
ONEROUND(15)
|
ONEROUND(15)
|
||||||
ADD $4*16, DI
|
ADD $4*16, DI
|
||||||
B zucSixteens
|
B zucSixteens
|
||||||
|
|
||||||
zucOctet:
|
zucOctet:
|
||||||
CMP $8, BP
|
CMP $8, BP
|
||||||
BLT zucNibble
|
BLT zucNibble
|
||||||
SUB $8, BP
|
SUB $8, BP
|
||||||
ONEROUND(0)
|
ONEROUND(0)
|
||||||
ONEROUND(1)
|
ONEROUND(1)
|
||||||
ONEROUND(2)
|
ONEROUND(2)
|
||||||
ONEROUND(3)
|
ONEROUND(3)
|
||||||
ONEROUND(4)
|
ONEROUND(4)
|
||||||
ONEROUND(5)
|
ONEROUND(5)
|
||||||
ONEROUND(6)
|
ONEROUND(6)
|
||||||
ONEROUND(7)
|
ONEROUND(7)
|
||||||
ADD $2*16, DI
|
ADD $2*16, DI
|
||||||
RESTORE_LFSR_8()
|
RESTORE_LFSR_8()
|
||||||
zucNibble:
|
zucNibble:
|
||||||
CMP $4, BP
|
CMP $4, BP
|
||||||
BLT zucDouble
|
BLT zucDouble
|
||||||
SUB $4, BP
|
SUB $4, BP
|
||||||
ONEROUND(0)
|
ONEROUND(0)
|
||||||
ONEROUND(1)
|
ONEROUND(1)
|
||||||
ONEROUND(2)
|
ONEROUND(2)
|
||||||
ONEROUND(3)
|
ONEROUND(3)
|
||||||
ADD $1*16, DI
|
ADD $1*16, DI
|
||||||
RESTORE_LFSR_4()
|
RESTORE_LFSR_4()
|
||||||
zucDouble:
|
zucDouble:
|
||||||
CMP $2, BP
|
CMP $2, BP
|
||||||
BLT zucSingle
|
BLT zucSingle
|
||||||
SUB $2, BP
|
SUB $2, BP
|
||||||
ONEROUND(0)
|
ONEROUND(0)
|
||||||
ONEROUND(1)
|
ONEROUND(1)
|
||||||
ADD $8, DI
|
ADD $8, DI
|
||||||
RESTORE_LFSR_2()
|
RESTORE_LFSR_2()
|
||||||
zucSingle:
|
zucSingle:
|
||||||
TBZ $0, BP, zucRet
|
TBZ $0, BP, zucRet
|
||||||
ONEROUND(0)
|
ONEROUND(0)
|
||||||
RESTORE_LFSR_0()
|
RESTORE_LFSR_0()
|
||||||
zucRet:
|
zucRet:
|
||||||
SAVE_STATE(0(SI))
|
SAVE_STATE(0(SI))
|
||||||
RET
|
RET
|
||||||
|
|
||||||
// func genKeyStreamRev32Asm(keyStream []byte, pState *zucState32)
|
// func genKeyStreamRev32Asm(keyStream []byte, pState *zucState32)
|
||||||
TEXT ·genKeyStreamRev32Asm(SB),NOSPLIT,$0
|
TEXT ·genKeyStreamRev32Asm(SB),NOSPLIT,$0
|
||||||
LOAD_GLOBAL_DATA()
|
LOAD_GLOBAL_DATA()
|
||||||
VEOR ZERO.B16, ZERO.B16, ZERO.B16
|
VEOR ZERO.B16, ZERO.B16, ZERO.B16
|
||||||
|
|
||||||
MOVD ks+0(FP), DI
|
MOVD ks+0(FP), DI
|
||||||
MOVD ks_len+8(FP), BP
|
MOVD ks_len+8(FP), BP
|
||||||
MOVD pState+24(FP), SI
|
MOVD pState+24(FP), SI
|
||||||
|
|
||||||
LSR $2, BP
|
LSR $2, BP
|
||||||
LOAD_STATE(0(SI))
|
LOAD_STATE(0(SI))
|
||||||
|
|
||||||
zucSixteens:
|
zucSixteens:
|
||||||
CMP $16, BP
|
CMP $16, BP
|
||||||
BLT zucOctet
|
BLT zucOctet
|
||||||
SUB $16, BP
|
SUB $16, BP
|
||||||
ROUND_REV32(0)
|
ROUND_REV32(0)
|
||||||
ROUND_REV32(1)
|
ROUND_REV32(1)
|
||||||
ROUND_REV32(2)
|
ROUND_REV32(2)
|
||||||
ROUND_REV32(3)
|
ROUND_REV32(3)
|
||||||
ROUND_REV32(4)
|
ROUND_REV32(4)
|
||||||
ROUND_REV32(5)
|
ROUND_REV32(5)
|
||||||
ROUND_REV32(6)
|
ROUND_REV32(6)
|
||||||
ROUND_REV32(7)
|
ROUND_REV32(7)
|
||||||
ROUND_REV32(8)
|
ROUND_REV32(8)
|
||||||
ROUND_REV32(9)
|
ROUND_REV32(9)
|
||||||
ROUND_REV32(10)
|
ROUND_REV32(10)
|
||||||
ROUND_REV32(11)
|
ROUND_REV32(11)
|
||||||
ROUND_REV32(12)
|
ROUND_REV32(12)
|
||||||
ROUND_REV32(13)
|
ROUND_REV32(13)
|
||||||
ROUND_REV32(14)
|
ROUND_REV32(14)
|
||||||
ROUND_REV32(15)
|
ROUND_REV32(15)
|
||||||
ADD $4*16, DI
|
ADD $4*16, DI
|
||||||
B zucSixteens
|
B zucSixteens
|
||||||
|
|
||||||
zucOctet:
|
zucOctet:
|
||||||
CMP $8, BP
|
CMP $8, BP
|
||||||
BLT zucNibble
|
BLT zucNibble
|
||||||
SUB $8, BP
|
SUB $8, BP
|
||||||
ROUND_REV32(0)
|
ROUND_REV32(0)
|
||||||
ROUND_REV32(1)
|
ROUND_REV32(1)
|
||||||
ROUND_REV32(2)
|
ROUND_REV32(2)
|
||||||
ROUND_REV32(3)
|
ROUND_REV32(3)
|
||||||
ROUND_REV32(4)
|
ROUND_REV32(4)
|
||||||
ROUND_REV32(5)
|
ROUND_REV32(5)
|
||||||
ROUND_REV32(6)
|
ROUND_REV32(6)
|
||||||
ROUND_REV32(7)
|
ROUND_REV32(7)
|
||||||
ADD $2*16, DI
|
ADD $2*16, DI
|
||||||
RESTORE_LFSR_8()
|
RESTORE_LFSR_8()
|
||||||
zucNibble:
|
zucNibble:
|
||||||
CMP $4, BP
|
CMP $4, BP
|
||||||
BLT zucDouble
|
BLT zucDouble
|
||||||
SUB $4, BP
|
SUB $4, BP
|
||||||
ROUND_REV32(0)
|
ROUND_REV32(0)
|
||||||
ROUND_REV32(1)
|
ROUND_REV32(1)
|
||||||
ROUND_REV32(2)
|
ROUND_REV32(2)
|
||||||
ROUND_REV32(3)
|
ROUND_REV32(3)
|
||||||
ADD $16, DI
|
ADD $16, DI
|
||||||
RESTORE_LFSR_4()
|
RESTORE_LFSR_4()
|
||||||
zucDouble:
|
zucDouble:
|
||||||
CMP $2, BP
|
CMP $2, BP
|
||||||
BLT zucSingle
|
BLT zucSingle
|
||||||
SUB $2, BP
|
SUB $2, BP
|
||||||
ROUND_REV32(0)
|
ROUND_REV32(0)
|
||||||
ROUND_REV32(1)
|
ROUND_REV32(1)
|
||||||
ADD $8, DI
|
ADD $8, DI
|
||||||
RESTORE_LFSR_2()
|
RESTORE_LFSR_2()
|
||||||
zucSingle:
|
zucSingle:
|
||||||
TBZ $0, BP, zucRet
|
TBZ $0, BP, zucRet
|
||||||
ROUND_REV32(0)
|
ROUND_REV32(0)
|
||||||
RESTORE_LFSR_0()
|
RESTORE_LFSR_0()
|
||||||
zucRet:
|
zucRet:
|
||||||
SAVE_STATE(0(SI))
|
SAVE_STATE(0(SI))
|
||||||
RET
|
RET
|
||||||
|
16
zuc/eia.go
16
zuc/eia.go
@ -1,7 +1,5 @@
|
|||||||
package zuc
|
package zuc
|
||||||
|
|
||||||
// Just for reference, no performance advantage due to the block size / chunk are 4 bytes only!
|
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/binary"
|
"encoding/binary"
|
||||||
"fmt"
|
"fmt"
|
||||||
@ -29,6 +27,7 @@ func NewHash(key, iv []byte) (*ZUC128Mac, error) {
|
|||||||
ivLen := len(iv)
|
ivLen := len(iv)
|
||||||
mac := &ZUC128Mac{}
|
mac := &ZUC128Mac{}
|
||||||
mac.tagSize = 4
|
mac.tagSize = 4
|
||||||
|
|
||||||
switch k {
|
switch k {
|
||||||
default:
|
default:
|
||||||
return nil, fmt.Errorf("zuc/eia: invalid key size %d, expect 16 in bytes", k)
|
return nil, fmt.Errorf("zuc/eia: invalid key size %d, expect 16 in bytes", k)
|
||||||
@ -38,6 +37,7 @@ func NewHash(key, iv []byte) (*ZUC128Mac, error) {
|
|||||||
}
|
}
|
||||||
mac.loadKeyIV16(key, iv)
|
mac.loadKeyIV16(key, iv)
|
||||||
}
|
}
|
||||||
|
|
||||||
// initialization
|
// initialization
|
||||||
for i := 0; i < 32; i++ {
|
for i := 0; i < 32; i++ {
|
||||||
mac.bitReorganization()
|
mac.bitReorganization()
|
||||||
@ -89,10 +89,10 @@ func (m *ZUC128Mac) Reset() {
|
|||||||
m.r1 = m.initState.r1
|
m.r1 = m.initState.r1
|
||||||
m.r2 = m.initState.r2
|
m.r2 = m.initState.r2
|
||||||
copy(m.lfsr[:], m.initState.lfsr[:])
|
copy(m.lfsr[:], m.initState.lfsr[:])
|
||||||
m.genKeywords(m.k0[:4])
|
m.genKeywords(m.k0[:len(m.k0)/2])
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *ZUC128Mac) block(p []byte) {
|
func blockGeneric(m *ZUC128Mac, p []byte) {
|
||||||
var k64, t64 uint64
|
var k64, t64 uint64
|
||||||
t64 = uint64(m.t) << 32
|
t64 = uint64(m.t) << 32
|
||||||
for len(p) >= chunk {
|
for len(p) >= chunk {
|
||||||
@ -121,14 +121,14 @@ func (m *ZUC128Mac) Write(p []byte) (nn int, err error) {
|
|||||||
n := copy(m.x[m.nx:], p)
|
n := copy(m.x[m.nx:], p)
|
||||||
m.nx += n
|
m.nx += n
|
||||||
if m.nx == chunk {
|
if m.nx == chunk {
|
||||||
m.block(m.x[:])
|
block(m, m.x[:])
|
||||||
m.nx = 0
|
m.nx = 0
|
||||||
}
|
}
|
||||||
p = p[n:]
|
p = p[n:]
|
||||||
}
|
}
|
||||||
if len(p) >= chunk {
|
if len(p) >= chunk {
|
||||||
n := len(p) &^ (chunk - 1)
|
n := len(p) &^ (chunk - 1)
|
||||||
m.block(p[:n])
|
block(m, p[:n])
|
||||||
p = p[n:]
|
p = p[n:]
|
||||||
}
|
}
|
||||||
if len(p) > 0 {
|
if len(p) > 0 {
|
||||||
@ -139,7 +139,7 @@ func (m *ZUC128Mac) Write(p []byte) (nn int, err error) {
|
|||||||
|
|
||||||
func (m *ZUC128Mac) checkSum(additionalBits int, b byte) [4]byte {
|
func (m *ZUC128Mac) checkSum(additionalBits int, b byte) [4]byte {
|
||||||
if m.nx >= chunk {
|
if m.nx >= chunk {
|
||||||
panic("m.nx >= 16")
|
panic("m.nx >= chunk")
|
||||||
}
|
}
|
||||||
kIdx := 0
|
kIdx := 0
|
||||||
if m.nx > 0 || additionalBits > 0 {
|
if m.nx > 0 || additionalBits > 0 {
|
||||||
@ -147,7 +147,7 @@ func (m *ZUC128Mac) checkSum(additionalBits int, b byte) [4]byte {
|
|||||||
t64 = uint64(m.t) << 32
|
t64 = uint64(m.t) << 32
|
||||||
m.x[m.nx] = b
|
m.x[m.nx] = b
|
||||||
nRemainBits := 8*m.nx + additionalBits
|
nRemainBits := 8*m.nx + additionalBits
|
||||||
if nRemainBits > 64 {
|
if nRemainBits > 2*32 {
|
||||||
m.genKeywords(m.k0[4:6])
|
m.genKeywords(m.k0[4:6])
|
||||||
}
|
}
|
||||||
words := (nRemainBits + 31) / 32
|
words := (nRemainBits + 31) / 32
|
||||||
|
24
zuc/eia_asm.go
Normal file
24
zuc/eia_asm.go
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
//go:build (amd64 && !generic)
|
||||||
|
// +build amd64,!generic
|
||||||
|
|
||||||
|
package zuc
|
||||||
|
|
||||||
|
import "golang.org/x/sys/cpu"
|
||||||
|
|
||||||
|
var supportsGFMUL = cpu.X86.HasPCLMULQDQ || cpu.ARM64.HasPMULL
|
||||||
|
var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
|
||||||
|
|
||||||
|
// eia3Round16B is implemented in assembly (eia_asm_amd64.s). Per call it
// reads 16 bytes at p and the keystream words at keyStream, XORs a 32-bit
// value into *t, and shifts the keystream buffer down by 16 bytes.
// NOTE(review): tagSize is passed through but the assembly shown in this
// commit does not read it — confirm whether it is reserved for later use.
//go:noescape
|
||||||
|
func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
|
||||||
|
|
||||||
|
// block absorbs the leading whole chunks of p into the MAC state m.
//
// When supportsGFMUL is set, each chunk is processed by refilling m.k0[4:]
// with fresh keystream words (m.genKeywords) and handing the tag, the
// keystream buffer and the chunk to the assembly routine eia3Round16B.
// Otherwise the work is delegated to blockGeneric.
// On the assembly path any trailing bytes shorter than chunk are left
// unconsumed.
// NOTE(review): eia3Round16B reads 16 bytes per call; this assumes
// chunk == 16 — TODO confirm against the chunk constant.
func block(m *ZUC128Mac, p []byte) {
|
||||||
|
if supportsGFMUL {
|
||||||
|
for len(p) >= chunk {
|
||||||
|
m.genKeywords(m.k0[4:])
|
||||||
|
eia3Round16B(&m.t, &m.k0[0], &p[0], m.tagSize)
|
||||||
|
p = p[chunk:]
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
blockGeneric(m, p)
|
||||||
|
}
|
||||||
|
}
|
153
zuc/eia_asm_amd64.s
Normal file
153
zuc/eia_asm_amd64.s
Normal file
@ -0,0 +1,153 @@
|
|||||||
|
// Referenced https://github.com/intel/intel-ipsec-mb/
|
||||||
|
//go:build amd64 && !generic
|
||||||
|
// +build amd64,!generic
|
||||||
|
|
||||||
|
#include "textflag.h"
|
||||||
|
|
||||||
|
DATA bit_reverse_table_l<>+0x00(SB)/8, $0x0e060a020c040800
|
||||||
|
DATA bit_reverse_table_l<>+0x08(SB)/8, $0x0f070b030d050901
|
||||||
|
GLOBL bit_reverse_table_l<>(SB), RODATA, $16
|
||||||
|
|
||||||
|
DATA bit_reverse_table_h<>+0x00(SB)/8, $0xe060a020c0408000
|
||||||
|
DATA bit_reverse_table_h<>+0x08(SB)/8, $0xf070b030d0509010
|
||||||
|
GLOBL bit_reverse_table_h<>(SB), RODATA, $16
|
||||||
|
|
||||||
|
DATA bit_reverse_and_table<>+0x00(SB)/8, $0x0f0f0f0f0f0f0f0f
|
||||||
|
DATA bit_reverse_and_table<>+0x08(SB)/8, $0x0f0f0f0f0f0f0f0f
|
||||||
|
GLOBL bit_reverse_and_table<>(SB), RODATA, $16
|
||||||
|
|
||||||
|
DATA shuf_mask_dw0_0_dw1_0<>+0x00(SB)/8, $0xffffffff03020100
|
||||||
|
DATA shuf_mask_dw0_0_dw1_0<>+0x08(SB)/8, $0xffffffff07060504
|
||||||
|
GLOBL shuf_mask_dw0_0_dw1_0<>(SB), RODATA, $16
|
||||||
|
|
||||||
|
DATA shuf_mask_dw2_0_dw3_0<>+0x00(SB)/8, $0xffffffff0b0a0908
|
||||||
|
DATA shuf_mask_dw2_0_dw3_0<>+0x08(SB)/8, $0xffffffff0f0e0d0c
|
||||||
|
GLOBL shuf_mask_dw2_0_dw3_0<>(SB), RODATA, $16
|
||||||
|
|
||||||
|
#define XTMP1 X1
|
||||||
|
#define XTMP2 X2
|
||||||
|
#define XTMP3 X3
|
||||||
|
#define XTMP4 X4
|
||||||
|
#define XTMP5 X5
|
||||||
|
#define XTMP6 X6
|
||||||
|
#define XDATA X7
|
||||||
|
#define XDIGEST X8
|
||||||
|
#define KS_L X9
|
||||||
|
#define KS_M1 X10
|
||||||
|
#define KS_M2 X11
|
||||||
|
#define KS_H X12
|
||||||
|
|
||||||
|
// eia3Round16B folds one 16-byte message block into the tag accumulator.
//
// It loads 16 bytes from p, bit-reverses every byte, and carry-less
// multiplies each of the four 32-bit data words by a 64-bit value built
// from two adjacent keystream words (pairs starting at keystream words
// 0, 1, 2 and 3). The four products are XORed together and bits 63:32 of
// the result are XORed into *t. Finally keystream bytes 16..31 are copied
// over bytes 0..15.
//
// Two equivalent code paths exist: a legacy-SSE encoded one and a
// VEX-encoded one selected when ·useAVX2 is set.
//
// func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
|
||||||
|
TEXT ·eia3Round16B(SB),NOSPLIT,$0
|
||||||
|
MOVQ t+0(FP), AX
|
||||||
|
MOVQ ks+8(FP), BX // NOTE(review): the Go declaration names this argument keyStream; confirm go vet (asmdecl) accepts the name ks
|
||||||
|
MOVQ p+16(FP), CX
|
||||||
|
MOVQ tagSize+24(FP), DX // NOTE(review): DX is not read anywhere below
|
||||||
|
|
||||||
|
CMPB ·useAVX2(SB), $1
|
||||||
|
JE avx2
|
||||||
|
|
||||||
|
// Bit-reverse each data byte (nibble split + table lookup)
|
||||||
|
MOVUPS (0)(CX), XDATA
|
||||||
|
MOVOU bit_reverse_and_table<>(SB), XTMP4
|
||||||
|
MOVOU XDATA, XTMP2
|
||||||
|
PAND XTMP4, XTMP2
|
||||||
|
|
||||||
|
PANDN XDATA, XTMP4
|
||||||
|
PSRLQ $4, XTMP4
|
||||||
|
|
||||||
|
MOVOU bit_reverse_table_h<>(SB), XTMP3
|
||||||
|
PSHUFB XTMP2, XTMP3
|
||||||
|
|
||||||
|
MOVOU bit_reverse_table_l<>(SB), XTMP1
|
||||||
|
PSHUFB XTMP4, XTMP1
|
||||||
|
|
||||||
|
PXOR XTMP1, XTMP3 // XTMP3 - bit reverse data bytes
|
||||||
|
|
||||||
|
// ZUC authentication part, 4x32 data bits
|
||||||
|
// setup KS
|
||||||
|
MOVUPS (0*4)(BX), XTMP1
|
||||||
|
MOVUPS (2*4)(BX), XTMP2
|
||||||
|
PSHUFD $0x61, XTMP1, KS_L // KS bits [63:32 31:0 95:64 63:32]
|
||||||
|
PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]
|
||||||
|
|
||||||
|
// setup DATA
|
||||||
|
MOVOU XTMP3, XTMP1
|
||||||
|
PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
|
||||||
|
MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]
|
||||||
|
|
||||||
|
PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP3
|
||||||
|
MOVOU XTMP3, XDIGEST // XDIGEST/XTMP3 - Data bits [95:64 0s 127:96 0s]
|
||||||
|
|
||||||
|
// clmul
|
||||||
|
// xor the results from 4 32-bit words together
|
||||||
|
// Calculate lower 32 bits of tag
|
||||||
|
PCLMULQDQ $0x00, KS_L, XTMP1
|
||||||
|
PCLMULQDQ $0x11, KS_L, XTMP2
|
||||||
|
PCLMULQDQ $0x00, KS_M1, XDIGEST
|
||||||
|
PCLMULQDQ $0x11, KS_M1, XTMP3
|
||||||
|
|
||||||
|
// XOR all products and move bits 63:32 down to the lower 32 bits
|
||||||
|
PXOR XTMP1, XTMP2
|
||||||
|
PXOR XTMP3, XDIGEST
|
||||||
|
PXOR XTMP2, XDIGEST
|
||||||
|
PSRLDQ $4, XDIGEST
|
||||||
|
|
||||||
|
// Update tag
|
||||||
|
MOVL XDIGEST, R10
|
||||||
|
XORL R10, (AX)
|
||||||
|
|
||||||
|
// Copy last 16 bytes of KS to the front
|
||||||
|
MOVUPS (4*4)(BX), XTMP1
|
||||||
|
MOVUPS XTMP1, (0*4)(BX)
|
||||||
|
|
||||||
|
RET
|
||||||
|
|
||||||
|
// VEX-encoded variant of the path above, taken when ·useAVX2 is set
avx2:
|
||||||
|
VMOVDQU (0)(CX), XDATA
|
||||||
|
|
||||||
|
// Bit-reverse each data byte (nibble split + table lookup)
|
||||||
|
VMOVDQU bit_reverse_and_table<>(SB), XTMP1
|
||||||
|
VPAND XTMP1, XDATA, XTMP2
|
||||||
|
VPANDN XDATA, XTMP1, XTMP3
|
||||||
|
VPSRLD $4, XTMP3, XTMP3
|
||||||
|
|
||||||
|
VMOVDQU bit_reverse_table_h<>(SB), XTMP1
|
||||||
|
VPSHUFB XTMP2, XTMP1, XTMP4
|
||||||
|
VMOVDQU bit_reverse_table_l<>(SB), XTMP1
|
||||||
|
VPSHUFB XTMP3, XTMP1, XTMP1
|
||||||
|
VPOR XTMP1, XTMP4, XTMP4
|
||||||
|
|
||||||
|
// ZUC authentication part, 4x32 data bits
|
||||||
|
// setup KS
|
||||||
|
VPSHUFD $0x61, (0*4)(BX), KS_L // KS bits [63:32 31:0 95:64 63:32]
|
||||||
|
VPSHUFD $0x61, (2*4)(BX), KS_M1 // KS bits [127:96 95:64 159:128 127:96]
|
||||||
|
|
||||||
|
// setup DATA
|
||||||
|
// Data bits [31:0 0s 63:32 0s]
|
||||||
|
VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP4, XTMP1
|
||||||
|
// Data bits [95:64 0s 127:96 0s]
|
||||||
|
VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP4, XTMP2
|
||||||
|
|
||||||
|
// clmul
|
||||||
|
// xor the results from 4 32-bit words together
|
||||||
|
// Calculate lower 32 bits of tag
|
||||||
|
VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
|
||||||
|
VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
|
||||||
|
VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
|
||||||
|
VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6
|
||||||
|
|
||||||
|
VPXOR XTMP3, XTMP4, XTMP3
|
||||||
|
VPXOR XTMP5, XTMP6, XTMP5
|
||||||
|
VPXOR XTMP3, XTMP5, XDIGEST
|
||||||
|
|
||||||
|
VMOVQ XDIGEST, R10
|
||||||
|
SHRQ $32, R10
|
||||||
|
XORL R10, (AX)
|
||||||
|
|
||||||
|
// Copy last 16 bytes of KS to the front
|
||||||
|
VMOVDQU (4*4)(BX), XTMP1
|
||||||
|
VMOVDQU XTMP1, (0*4)(BX)
|
||||||
|
|
||||||
|
VZEROUPPER
|
||||||
|
RET
|
8
zuc/eia_generic.go
Normal file
8
zuc/eia_generic.go
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
//go:build !amd64 || generic
|
||||||
|
// +build !amd64 generic
|
||||||
|
|
||||||
|
package zuc
|
||||||
|
|
||||||
|
// block is the portable implementation selected by this file's build
// constraints; it forwards unchanged to blockGeneric.
func block(m *ZUC128Mac, p []byte) {
|
||||||
|
blockGeneric(m, p)
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user