zuc: EIA performance improvement

Sun Yimin 2022-07-13 11:51:44 +08:00 committed by GitHub
parent 87f6f6a736
commit 39274df2bd
7 changed files with 1038 additions and 845 deletions


@@ -48,10 +48,18 @@ func (s *zucState32) f32(x0, x1, x2 uint32) uint32 {
fmt.Println()
```
-## Performance with AMD64 SIMD & AESNI:
+## EEA Performance with AMD64 SIMD & AESNI:
goos: windows
goarch: amd64
pkg: github.com/emmansun/gmsm/zuc
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
BenchmarkEncrypt1K-6 409755 2802 ns/op 363.62 MB/s
BenchmarkEncrypt8K-6 54120 22413 ns/op 365.28 MB/s
+## EIA Performance with AMD64 SIMD & AESNI & CLMUL:
+goos: windows
+goarch: amd64
+pkg: github.com/emmansun/gmsm/zuc
+cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
+BenchmarkHash1K-6 317750 3833 ns/op 267.13 MB/s
+BenchmarkHash8K-6 40460 28921 ns/op 283.26 MB/s
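
The EEA numbers measure keystream encryption; the new EIA numbers measure the 128-EIA3 MAC. For orientation, a minimal usage sketch of the hash side, assuming the `ZUC128Mac` returned by this commit's `NewHash` exposes the usual `hash.Hash`-style `Write`/`Sum` methods:

```go
package main

import (
	"fmt"

	"github.com/emmansun/gmsm/zuc"
)

func main() {
	key := make([]byte, 16) // 128-bit integrity key (IK)
	iv := make([]byte, 16)  // 128-bit IV (COUNT/BEARER/DIRECTION packed per EIA3)
	h, err := zuc.NewHash(key, iv)
	if err != nil {
		panic(err)
	}
	h.Write([]byte("message to authenticate"))
	fmt.Printf("MAC-I: %x\n", h.Sum(nil)) // 4-byte tag (mac.tagSize = 4)
}
```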

File diff suppressed because it is too large.


@@ -127,35 +127,35 @@ GLOBL mask_S1<>(SB), RODATA, $16
VMOV R1, INVERSE_SHIFT_ROWS.D[1]
#define SHLDL(a, b, n) \ // the Go assembler has no SHLDL instruction
LSLW n, a \
LSRW n, b \
ORRW b, a
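
A minimal Go model of the macro, for readers following the register math. Note that unlike x86's SHLD, which shifts the second operand by 32−n, both operands here are shifted by the same n; callers such as BITS_REORG pre-shift their inputs to compensate.

```go
// shldl models the SHLDL macro above: a = (a << n) | (b >> n).
// NONLIN_FUN invokes it with n = 16 to build P = (W1 << 16) | (W2 >> 16).
func shldl(a, b uint32, n uint) uint32 {
	return a<<n | b>>n
}
```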
#define Rotl_5(XDATA, XTMP0) \
VSHL $5, XDATA.S4, XTMP0.S4 \
VUSHR $3, XDATA.S4, XDATA.S4 \
VAND TOP3_BITS.B16, XTMP0.B16, XTMP0.B16 \
VAND BOTTOM5_BITS.B16, XDATA.B16, XDATA.B16 \
VORR XTMP0.B16, XDATA.B16, XDATA.B16
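
Although the shifts operate on 32-bit lanes, the shift-and-mask pair amounts to rotating every byte left by 5 bits — assuming TOP3_BITS and BOTTOM5_BITS are the per-byte masks 0xE0 and 0x1F (they are defined earlier in the file, outside this hunk). A scalar model:

```go
// rotl5 models Rotl_5 on a single byte: the cross-byte bits that the
// 32-bit lane shifts drag in are discarded by the two masks.
func rotl5(b byte) byte {
	return b<<5 | b>>3
}
```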
#define S0_comput(IN_OUT, XTMP1, XTMP2) \
VUSHR $4, IN_OUT.S4, XTMP1.S4 \
VAND NIBBLE_MASK.B16, XTMP1.B16, XTMP1.B16 \
\
VAND NIBBLE_MASK.B16, IN_OUT.B16, IN_OUT.B16 \
\
VTBL IN_OUT.B16, [P1.B16], XTMP2.B16 \
VEOR XTMP1.B16, XTMP2.B16, XTMP2.B16 \
\
VTBL XTMP2.B16, [P2.B16], XTMP1.B16 \
VEOR IN_OUT.B16, XTMP1.B16, XTMP1.B16 \
\
VTBL XTMP1.B16, [P3.B16], IN_OUT.B16 \
VEOR XTMP2.B16, IN_OUT.B16, IN_OUT.B16 \
\
VSHL $4, IN_OUT.S4, IN_OUT.S4 \
VEOR XTMP1.B16, IN_OUT.B16, IN_OUT.B16 \
Rotl_5(IN_OUT, XTMP1)
#define S1_comput(x, XTMP1, XTMP2) \
VAND x.B16, NIBBLE_MASK.B16, XTMP1.B16; \
@@ -174,335 +174,335 @@ GLOBL mask_S1<>(SB), RODATA, $16
VEOR XTMP2.B16, XTMP1.B16, x.B16
#define BITS_REORG(idx) \
MOVW (((15 + idx) % 16)*4)(SI), R12 \
MOVW (((14 + idx) % 16)*4)(SI), AX \
MOVW (((11 + idx) % 16)*4)(SI), R13 \
MOVW (((9 + idx) % 16)*4)(SI), BX \
MOVW (((7 + idx) % 16)*4)(SI), R14 \
MOVW (((5 + idx) % 16)*4)(SI), CX \
MOVW (((2 + idx) % 16)*4)(SI), R15 \
MOVW (((0 + idx) % 16)*4)(SI), DX \
LSRW $15, R12 \
LSLW $16, AX \
LSLW $1, BX \
LSLW $1, CX \
LSLW $1, DX \
SHLDL(R12, AX, $16) \
SHLDL(R13, BX, $16) \
SHLDL(R14, CX, $16) \
SHLDL(R15, DX, $16)
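
This is the bit-reorganization step from the ZUC specification: four 32-bit words X0..X3 (here R12..R15) stitched from the top and bottom halves of 31-bit LFSR cells. Equivalently, in Go:

```go
// bitsReorg models BITS_REORG for idx = 0; the macro's % 16
// arithmetic merely re-bases the indices for unrolled rounds.
// The spec form (s[15] & 0x7FFF8000) << 1 equals the assembly's
// (s[15] >> 15) << 16 because each cell holds only 31 bits.
func bitsReorg(s *[16]uint32) (x0, x1, x2, x3 uint32) {
	x0 = (s[15]&0x7FFF8000)<<1 | s[14]&0xFFFF
	x1 = s[11]<<16 | s[9]>>15
	x2 = s[7]<<16 | s[5]>>15
	x3 = s[2]<<16 | s[0]>>15
	return
}
```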
#define LFSR_UPDT(idx) \
MOVW (((0 + idx) % 16)*4)(SI), BX \
MOVW (((4 + idx) % 16)*4)(SI), CX \
MOVW (((10 + idx) % 16)*4)(SI), DX \
MOVW (((13 + idx) % 16)*4)(SI), R8 \
MOVW (((15 + idx) % 16)*4)(SI), R9 \
ADD BX, AX \
LSL $8, BX \
LSL $20, CX \
LSL $21, DX \
LSL $17, R8 \
LSL $15, R9 \
ADD BX, AX \
ADD CX, AX \
ADD DX, AX \
ADD R8, AX \
ADD R9, AX \
\
MOVD AX, BX \
AND $0x7FFFFFFF, AX \
LSR $31, BX \
ADD BX, AX \
\
SUBS $0x7FFFFFFF, AX, BX \
CSEL CS, BX, AX, AX \
\
MOVW AX, (((0 + idx) % 16)*4)(SI)
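
The adds accumulate the LFSR feedback s16 = (2^15·s15 + 2^17·s13 + 2^21·s10 + 2^20·s4 + (1 + 2^8)·s0 + u) mod (2^31 − 1), where u is 0 in work mode (AX is zeroed before the macro) or W>>1 during key initialization (not part of this hunk). The AND/LSR/ADD fold plus the SUBS/CSEL conditional subtract replace an expensive modulo. A Go model:

```go
// lfsrNext models LFSR_UPDT: 64-bit accumulation, one fold using
// a*2^31 + b == a + b (mod 2^31-1), then one conditional subtract,
// which suffices because the folded value is below 2*(2^31-1).
func lfsrNext(s *[16]uint32, u uint32) uint32 {
	v := uint64(u) + uint64(s[0]) + uint64(s[0])<<8 +
		uint64(s[4])<<20 + uint64(s[10])<<21 +
		uint64(s[13])<<17 + uint64(s[15])<<15
	v = (v & 0x7FFFFFFF) + (v >> 31) // the AND/LSR/ADD fold
	if v >= 0x7FFFFFFF {             // matches SUBS + CSEL CS
		v -= 0x7FFFFFFF
	}
	return uint32(v) // written back over s[idx % 16]
}
```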
#define NONLIN_FUN() \
MOVW R12, AX \
EORW R10, AX \
ADDW R11, AX \ // W = (BRC_X0 ^ F_R1) + F_R2
ADDW R13, R10 \ // W1 = F_R1 + BRC_X1
EORW R14, R11 \ // W2 = F_R2 ^ BRC_X2
\
MOVW R10, DX \
MOVW R11, CX \
SHLDL(DX, CX, $16) \ // P = (W1 << 16) | (W2 >> 16)
SHLDL(R11, R10, $16) \ // Q = (W2 << 16) | (W1 >> 16)
MOVW DX, BX \
MOVW DX, CX \
MOVW DX, R8 \
MOVW DX, R9 \
RORW $30, BX \
RORW $22, CX \
RORW $14, R8 \
RORW $8, R9 \
EORW BX, DX \
EORW CX, DX \
EORW R8, DX \
EORW R9, DX \ // U = L1(P) in DX, upper 32 bits cleared
MOVW R11, BX \
MOVW R11, CX \
MOVW R11, R8 \
MOVW R11, R9 \
RORW $24, BX \
RORW $18, CX \
RORW $10, R8 \
RORW $2, R9 \
EORW BX, R11 \
EORW CX, R11 \
EORW R8, R11 \
EORW R9, R11 \ // V = L2(Q) in R11, upper 32 bits cleared
LSL $32, R11 \
EOR R11, DX \
VMOV DX, V0.D2 \
VMOV V0.B16, V1.B16 \
S0_comput(V1, V2, V3) \
S1_comput(V0, V2, V3) \
\
VAND S1_MASK.B16, V0.B16, V0.B16 \
VAND S0_MASK.B16, V1.B16, V1.B16 \
VEOR V1.B16, V0.B16, V0.B16 \
\
VMOV V0.S[0], R10 \ // F_R1
VMOV V0.S[1], R11
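
The two RORW/EORW ladders are the spec's linear transforms (RORW $n rotates right by n, so $30 is a rotate left by 2): L1(X) = X ⊕ (X≪2) ⊕ (X≪10) ⊕ (X≪18) ⊕ (X≪24) and L2(X) = X ⊕ (X≪8) ⊕ (X≪14) ⊕ (X≪22) ⊕ (X≪30). Equivalent Go:

```go
import "math/bits"

// l1 and l2 model the rotate/XOR ladders; the new F_R1/F_R2 are then
// S(L1(P)) and S(L2(Q)), with the S-box applied per byte by the vector
// code above (S0_comput/S1_comput results blended via S0_MASK/S1_MASK).
func l1(x uint32) uint32 {
	return x ^ bits.RotateLeft32(x, 2) ^ bits.RotateLeft32(x, 10) ^
		bits.RotateLeft32(x, 18) ^ bits.RotateLeft32(x, 24)
}

func l2(x uint32) uint32 {
	return x ^ bits.RotateLeft32(x, 8) ^ bits.RotateLeft32(x, 14) ^
		bits.RotateLeft32(x, 22) ^ bits.RotateLeft32(x, 30)
}
```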
#define RESTORE_LFSR_0() \
MOVW.P 4(SI), AX \
VLD1 (SI), [V0.B16, V1.B16, V2.B16] \
SUB $4, SI \
MOVD (52)(SI), BX \
MOVW (60)(SI), CX \
\
VST1 [V0.B16, V1.B16, V2.B16], (SI) \
MOVD BX, (48)(SI) \
MOVW CX, (56)(SI) \
MOVW AX, (60)(SI)
#define RESTORE_LFSR_2() \
MOVD.P 8(SI), AX \
VLD1 (SI), [V0.B16, V1.B16, V2.B16] \
SUB $8, SI \
MOVD (56)(SI), BX \
\
VST1 [V0.B16, V1.B16, V2.B16], (SI) \
MOVD BX, (48)(SI) \
MOVD AX, (56)(SI)
#define RESTORE_LFSR_4() \
VLD1 (SI), [V0.B16, V1.B16, V2.B16, V3.B16] \
\
VST1.P [V1.B16, V2.B16, V3.B16], 48(SI) \
VST1 [V0.B16], (SI) \
SUB $48, SI
#define RESTORE_LFSR_8() \
VLD1 (SI), [V0.B16, V1.B16, V2.B16, V3.B16] \
\
VST1.P [V2.B16, V3.B16], 32(SI) \
VST1 [V0.B16, V1.B16], (SI) \
SUB $32, SI
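
LFSR_UPDT writes the new word at index idx mod 16, so after n unrolled rounds the state array is left rotated by n. The RESTORE_LFSR macros undo this (the _8/_4/_2 suffixes match the batch sizes below; RESTORE_LFSR_0 handles a single leftover round, and a full 16-round batch wraps around on its own). Conceptually:

```go
// rotateLFSR models what RESTORE_LFSR_n restores after n rounds:
// a left rotation of the 16-word state by n positions.
func rotateLFSR(s *[16]uint32, n int) {
	var tmp [16]uint32
	copy(tmp[:], s[n:])
	copy(tmp[16-n:], s[:n])
	copy(s[:], tmp[:])
}
```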
#define LOAD_STATE(r) \
MOVW 64+r, R10 \
MOVW 68+r, R11 \
MOVW 72+r, R12 \
MOVW 76+r, R13 \
MOVW 80+r, R14 \
MOVW 84+r, R15
#define SAVE_STATE(r) \
MOVW R10, 64+r \
MOVW R11, 68+r \
MOVW R12, 72+r \
MOVW R13, 76+r \
MOVW R14, 80+r \
MOVW R15, 84+r
// func genKeywordAsm(s *zucState32) uint32
TEXT ·genKeywordAsm(SB),NOSPLIT,$0
LOAD_GLOBAL_DATA()
VEOR ZERO.B16, ZERO.B16, ZERO.B16
MOVD pState+0(FP), SI
LOAD_STATE(0(SI))
BITS_REORG(0)
NONLIN_FUN()
EORW R15, AX
MOVW AX, ret+8(FP)
EOR AX, AX
LFSR_UPDT(0)
SAVE_STATE(0(SI))
RESTORE_LFSR_0()
RET
#define ONEROUND(idx) \
BITS_REORG(idx) \
NONLIN_FUN() \
EORW R15, AX \
MOVW AX, (idx*4)(DI) \
EOR AX, AX \
LFSR_UPDT(idx)
#define ROUND_REV32(idx) \
BITS_REORG(idx) \
NONLIN_FUN() \
EORW R15, AX \
REVW AX, AX \
MOVW AX, (idx*4)(DI) \
EOR AX, AX \
LFSR_UPDT(idx)
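
ROUND_REV32 is ONEROUND plus a REVW byte swap, so the keyword Z = W ⊕ X3 produced by each round is stored big-endian into the byte-oriented keystream buffer. In Go terms:

```go
import "encoding/binary"

// storeRev32 models ROUND_REV32's store on a little-endian machine:
// z is the keyword from one round, byte-swapped by REVW before MOVW.
func storeRev32(ks []byte, idx int, z uint32) {
	binary.BigEndian.PutUint32(ks[4*idx:], z)
}
```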
// func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
TEXT ·genKeyStreamAsm(SB),NOSPLIT,$0
LOAD_GLOBAL_DATA()
VEOR ZERO.B16, ZERO.B16, ZERO.B16
MOVD ks+0(FP), DI
MOVD ks_len+8(FP), BP
MOVD pState+24(FP), SI
LOAD_STATE(0(SI))
zucSixteens:
CMP $16, BP
BLT zucOctet
SUB $16, BP
ONEROUND(0)
ONEROUND(1)
ONEROUND(2)
ONEROUND(3)
ONEROUND(4)
ONEROUND(5)
ONEROUND(6)
ONEROUND(7)
ONEROUND(8)
ONEROUND(9)
ONEROUND(10)
ONEROUND(11)
ONEROUND(12)
ONEROUND(13)
ONEROUND(14)
ONEROUND(15)
ADD $4*16, DI
B zucSixteens
zucOctet:
CMP $8, BP
BLT zucNibble
SUB $8, BP
ONEROUND(0)
ONEROUND(1)
ONEROUND(2)
ONEROUND(3)
ONEROUND(4)
ONEROUND(5)
ONEROUND(6)
ONEROUND(7)
ADD $2*16, DI
RESTORE_LFSR_8()
zucNibble:
CMP $4, BP
BLT zucDouble
SUB $4, BP
ONEROUND(0)
ONEROUND(1)
ONEROUND(2)
ONEROUND(3)
ADD $1*16, DI
RESTORE_LFSR_4()
zucDouble:
CMP $2, BP
BLT zucSingle
SUB $2, BP
ONEROUND(0)
ONEROUND(1)
ADD $8, DI
RESTORE_LFSR_2()
zucSingle:
TBZ $0, BP, zucRet
ONEROUND(0)
RESTORE_LFSR_0()
zucRet:
SAVE_STATE(0(SI))
RET
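
The label ladder consumes keywords in descending power-of-two batches so the hot path is the fully unrolled 16-round loop. The same control flow in Go, where oneRound is a hypothetical stand-in for BITS_REORG + NONLIN_FUN + LFSR_UPDT and rotateLFSR is the sketch given earlier:

```go
// Control-flow model of genKeyStreamAsm: largest batches first;
// only partial batches need the LFSR rotated back into place.
func genKeyStream(ks []uint32, s *[16]uint32) {
	for len(ks) >= 16 {
		for i := 0; i < 16; i++ {
			ks[i] = oneRound(s, i) // fully unrolled in the assembly
		}
		ks = ks[16:] // 16 rounds wrap the state; no restore needed
	}
	for _, n := range []int{8, 4, 2, 1} {
		if len(ks) >= n {
			for i := 0; i < n; i++ {
				ks[i] = oneRound(s, i)
			}
			rotateLFSR(s, n) // RESTORE_LFSR_8/_4/_2/_0
			ks = ks[n:]
		}
	}
}
```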
// func genKeyStreamRev32Asm(keyStream []byte, pState *zucState32)
TEXT ·genKeyStreamRev32Asm(SB),NOSPLIT,$0
LOAD_GLOBAL_DATA()
VEOR ZERO.B16, ZERO.B16, ZERO.B16
MOVD ks+0(FP), DI
MOVD ks_len+8(FP), BP
MOVD pState+24(FP), SI
LSR $2, BP
LOAD_STATE(0(SI))
zucSixteens:
CMP $16, BP
BLT zucOctet
SUB $16, BP
ROUND_REV32(0)
ROUND_REV32(1)
ROUND_REV32(2)
ROUND_REV32(3)
ROUND_REV32(4)
ROUND_REV32(5)
ROUND_REV32(6)
ROUND_REV32(7)
ROUND_REV32(8)
ROUND_REV32(9)
ROUND_REV32(10)
ROUND_REV32(11)
ROUND_REV32(12)
ROUND_REV32(13)
ROUND_REV32(14)
ROUND_REV32(15)
ADD $4*16, DI
B zucSixteens
zucOctet:
CMP $8, BP
BLT zucNibble
SUB $8, BP
ROUND_REV32(0)
ROUND_REV32(1)
ROUND_REV32(2)
ROUND_REV32(3)
ROUND_REV32(4)
ROUND_REV32(5)
ROUND_REV32(6)
ROUND_REV32(7)
ADD $2*16, DI
RESTORE_LFSR_8()
zucNibble:
CMP $4, BP
BLT zucDouble
SUB $4, BP
ROUND_REV32(0)
ROUND_REV32(1)
ROUND_REV32(2)
ROUND_REV32(3)
ADD $16, DI
RESTORE_LFSR_4()
zucDouble:
CMP $2, BP
BLT zucSingle
SUB $2, BP
ROUND_REV32(0)
ROUND_REV32(1)
ADD $8, DI
RESTORE_LFSR_2()
zucSingle:
TBZ $0, BP, zucRet
ROUND_REV32(0)
RESTORE_LFSR_0()
zucRet:
SAVE_STATE(0(SI))
RET


@@ -1,7 +1,5 @@
package zuc
-// Just for reference, no performance advantage due to the block size / chunk are 4 bytes only!
import (
"encoding/binary"
"fmt"
@@ -29,6 +27,7 @@ func NewHash(key, iv []byte) (*ZUC128Mac, error) {
ivLen := len(iv)
mac := &ZUC128Mac{}
+mac.tagSize = 4
switch k {
default:
return nil, fmt.Errorf("zuc/eia: invalid key size %d, expect 16 in bytes", k)
@@ -38,6 +37,7 @@ func NewHash(key, iv []byte) (*ZUC128Mac, error) {
}
mac.loadKeyIV16(key, iv)
}
// initialization
for i := 0; i < 32; i++ {
mac.bitReorganization()
@@ -89,10 +89,10 @@ func (m *ZUC128Mac) Reset() {
m.r1 = m.initState.r1
m.r2 = m.initState.r2
copy(m.lfsr[:], m.initState.lfsr[:])
-m.genKeywords(m.k0[:4])
+m.genKeywords(m.k0[:len(m.k0)/2])
}
-func (m *ZUC128Mac) block(p []byte) {
+func blockGeneric(m *ZUC128Mac, p []byte) {
var k64, t64 uint64
t64 = uint64(m.t) << 32
for len(p) >= chunk {
@@ -121,14 +121,14 @@ func (m *ZUC128Mac) Write(p []byte) (nn int, err error) {
n := copy(m.x[m.nx:], p)
m.nx += n
if m.nx == chunk {
-m.block(m.x[:])
+block(m, m.x[:])
m.nx = 0
}
p = p[n:]
}
if len(p) >= chunk {
n := len(p) &^ (chunk - 1)
-m.block(p[:n])
+block(m, p[:n])
p = p[n:]
}
if len(p) > 0 {
@@ -139,7 +139,7 @@ func (m *ZUC128Mac) Write(p []byte) (nn int, err error) {
func (m *ZUC128Mac) checkSum(additionalBits int, b byte) [4]byte {
if m.nx >= chunk {
-panic("m.nx >= 16")
+panic("m.nx >= chunk")
}
kIdx := 0
if m.nx > 0 || additionalBits > 0 {
@@ -147,7 +147,7 @@ func (m *ZUC128Mac) checkSum(additionalBits int, b byte) [4]byte {
t64 = uint64(m.t) << 32
m.x[m.nx] = b
nRemainBits := 8*m.nx + additionalBits
-if nRemainBits > 64 {
+if nRemainBits > 2*32 {
m.genKeywords(m.k0[4:6])
}
words := (nRemainBits + 31) / 32

zuc/eia_asm.go (new file, 24 lines)

@@ -0,0 +1,24 @@
//go:build (amd64 && !generic)
// +build amd64,!generic
package zuc
import "golang.org/x/sys/cpu"
var supportsGFMUL = cpu.X86.HasPCLMULQDQ || cpu.ARM64.HasPMULL
var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
//go:noescape
func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
func block(m *ZUC128Mac, p []byte) {
if supportsGFMUL {
for len(p) >= chunk {
m.genKeywords(m.k0[4:])
eia3Round16B(&m.t, &m.k0[0], &p[0], m.tagSize)
p = p[chunk:]
}
} else {
blockGeneric(m, p)
}
}
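
For orientation: m.k0 holds eight keystream words and chunk is 16 bytes. One block's tag update reads keystream words 0..5 (the fold for the last data word reaches into bits [128,160)), so each iteration tops up the upper half of the window, and the kernel itself slides words 4..7 back to the front — the "Copy last 16 bytes of KS to the front" step in the assembly below. An illustrative, annotated copy of the fast path (not part of the commit):

```go
// blockAnnotated spells out the keystream-window bookkeeping around
// eia3Round16B; the tag update itself happens inside the kernel.
func blockAnnotated(m *ZUC128Mac, p []byte) {
	for len(p) >= chunk { // chunk == 16 bytes == 128 message bits
		// k0[0:4] holds the current 128 KS bits; refill k0[4:8]
		// so the fold can read past the block boundary.
		m.genKeywords(m.k0[4:])
		eia3Round16B(&m.t, &m.k0[0], &p[0], m.tagSize)
		// the kernel ends by copying k0[4:8] over k0[0:4], so the
		// next block's window again starts at keystream word 0.
		p = p[chunk:]
	}
}
```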

zuc/eia_asm_amd64.s (new file, 153 lines)

@@ -0,0 +1,153 @@
// Referenced https://github.com/intel/intel-ipsec-mb/
//go:build amd64 && !generic
// +build amd64,!generic
#include "textflag.h"
DATA bit_reverse_table_l<>+0x00(SB)/8, $0x0e060a020c040800
DATA bit_reverse_table_l<>+0x08(SB)/8, $0x0f070b030d050901
GLOBL bit_reverse_table_l<>(SB), RODATA, $16
DATA bit_reverse_table_h<>+0x00(SB)/8, $0xe060a020c0408000
DATA bit_reverse_table_h<>+0x08(SB)/8, $0xf070b030d0509010
GLOBL bit_reverse_table_h<>(SB), RODATA, $16
DATA bit_reverse_and_table<>+0x00(SB)/8, $0x0f0f0f0f0f0f0f0f
DATA bit_reverse_and_table<>+0x08(SB)/8, $0x0f0f0f0f0f0f0f0f
GLOBL bit_reverse_and_table<>(SB), RODATA, $16
DATA shuf_mask_dw0_0_dw1_0<>+0x00(SB)/8, $0xffffffff03020100
DATA shuf_mask_dw0_0_dw1_0<>+0x08(SB)/8, $0xffffffff07060504
GLOBL shuf_mask_dw0_0_dw1_0<>(SB), RODATA, $16
DATA shuf_mask_dw2_0_dw3_0<>+0x00(SB)/8, $0xffffffff0b0a0908
DATA shuf_mask_dw2_0_dw3_0<>+0x08(SB)/8, $0xffffffff0f0e0d0c
GLOBL shuf_mask_dw2_0_dw3_0<>(SB), RODATA, $16
#define XTMP1 X1
#define XTMP2 X2
#define XTMP3 X3
#define XTMP4 X4
#define XTMP5 X5
#define XTMP6 X6
#define XDATA X7
#define XDIGEST X8
#define KS_L X9
#define KS_M1 X10
#define KS_M2 X11
#define KS_H X12
// func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
TEXT ·eia3Round16B(SB),NOSPLIT,$0
MOVQ t+0(FP), AX
MOVQ ks+8(FP), BX
MOVQ p+16(FP), CX
MOVQ tagSize+24(FP), DX
CMPB ·useAVX2(SB), $1
JE avx2
// Reverse data bytes
MOVUPS (0)(CX), XDATA
MOVOU bit_reverse_and_table<>(SB), XTMP4
MOVOU XDATA, XTMP2
PAND XTMP4, XTMP2
PANDN XDATA, XTMP4
PSRLQ $4, XTMP4
MOVOU bit_reverse_table_h<>(SB), XTMP3
PSHUFB XTMP2, XTMP3
MOVOU bit_reverse_table_l<>(SB), XTMP1
PSHUFB XTMP4, XTMP1
PXOR XTMP1, XTMP3 // XTMP3 - bit reverse data bytes
// ZUC authentication part, 4x32 data bits
// setup KS
MOVUPS (0*4)(BX), XTMP1
MOVUPS (2*4)(BX), XTMP2
PSHUFD $0x61, XTMP1, KS_L // KS bits [63:32 31:0 95:64 63:32]
PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]
// setup DATA
MOVOU XTMP3, XTMP1
PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]
PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP3
MOVOU XTMP3, XDIGEST // XDIGEST/XTMP3 - Data bits [95:64 0s 127:96 0s]
// clmul
// xor the results from 4 32-bit words together
// Calculate lower 32 bits of tag
PCLMULQDQ $0x00, KS_L, XTMP1
PCLMULQDQ $0x11, KS_L, XTMP2
PCLMULQDQ $0x00, KS_M1, XDIGEST
PCLMULQDQ $0x11, KS_M1, XTMP3
// XOR all products and move 32-bits to lower 32 bits
PXOR XTMP1, XTMP2
PXOR XTMP3, XDIGEST
PXOR XTMP2, XDIGEST
PSRLDQ $4, XDIGEST
// Update tag
MOVL XDIGEST, R10
XORL R10, (AX)
// Copy last 16 bytes of KS to the front
MOVUPS (4*4)(BX), XTMP1
MOVUPS XTMP1, (0*4)(BX)
RET
avx2:
VMOVDQU (0)(CX), XDATA
// Reverse data bytes
VMOVDQU bit_reverse_and_table<>(SB), XTMP1
VPAND XTMP1, XDATA, XTMP2
VPANDN XDATA, XTMP1, XTMP3
VPSRLD $4, XTMP3, XTMP3
VMOVDQU bit_reverse_table_h<>(SB), XTMP1
VPSHUFB XTMP2, XTMP1, XTMP4
VMOVDQU bit_reverse_table_l<>(SB), XTMP1
VPSHUFB XTMP3, XTMP1, XTMP1
VPOR XTMP1, XTMP4, XTMP4
// ZUC authentication part, 4x32 data bits
// setup KS
VPSHUFD $0x61, (0*4)(BX), KS_L // KS bits [63:32 31:0 95:64 63:32]
VPSHUFD $0x61, (2*4)(BX), KS_M1 // KS bits [127:96 95:64 159:128 127:96]
// setup DATA
// Data bytes [31:0 0s 63:32 0s]
VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP4, XTMP1
// Data bytes [95:64 0s 127:96 0s]
VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP4, XTMP2
// clmul
// xor the results from 4 32-bit words together
// Calculate lower 32 bits of tag
VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6
VPXOR XTMP3, XTMP4, XTMP3
VPXOR XTMP5, XTMP6, XTMP5
VPXOR XTMP3, XTMP5, XDIGEST
VMOVQ XDIGEST, R10
SHRQ $32, R10
XORL R10, (AX)
// Copy last 16 bytes of KS to the front
VMOVDQU (4*4)(BX), XTMP1
VMOVDQU XTMP1, (0*4)(BX)
VZEROUPPER
RET
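
Both paths begin by reversing the bits within every data byte (the PSHUFB pairs against bit_reverse_table_h/l): roughly speaking, EIA3 numbers message bits MSB-first while a carry-less multiply indexes product terms from the low end, so reversing once up front lets a single PCLMULQDQ line the two up. A scalar model of the table-driven reversal:

```go
// reverseByte models the nibble-table bit reversal above:
// bit_reverse_table_l holds rev(n) and bit_reverse_table_h holds
// rev(n)<<4 for each nibble n, so the reversed low nibble becomes
// the high nibble and vice versa.
func reverseByte(b byte) byte {
	rev4 := func(v byte) byte {
		return (v&1)<<3 | (v&2)<<1 | (v&4)>>1 | (v&8)>>3
	}
	return rev4(b&0x0F)<<4 | rev4(b>>4)
}
```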

zuc/eia_generic.go (new file, 8 lines)

@@ -0,0 +1,8 @@
//go:build !amd64 || generic
// +build !amd64 generic
package zuc
func block(m *ZUC128Mac, p []byte) {
blockGeneric(m, p)
}
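
For reference, the scalar path behind blockGeneric folds the tag bit by bit, which is exactly the XOR-fold the CLMUL kernel computes 128 message bits at a time. A simplified model of the per-word update (not the literal eia.go code):

```go
import "encoding/binary"

// eia3UpdateGeneric: every set message bit at offset i XORs the
// 32-bit keystream window starting at bit i into the tag. ks must
// supply two words per 32-bit data word, since windows straddle
// word boundaries.
func eia3UpdateGeneric(t *uint32, ks []uint32, p []byte) {
	for n := 0; n+4 <= len(p); n += 4 {
		w := binary.BigEndian.Uint32(p[n:])
		k64 := uint64(ks[n/4])<<32 | uint64(ks[n/4+1])
		for b := 0; b < 32; b++ {
			if w&(uint32(1)<<(31-b)) != 0 {
				*t ^= uint32(k64 >> 32) // KS bits [32(n/4)+b, ...+32)
			}
			k64 <<= 1
		}
	}
}
```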