From 4734c4a6703312e659be4cc65dbad4b340c31e42 Mon Sep 17 00:00:00 2001
From: Sun Yimin
Date: Wed, 6 Jul 2022 10:50:57 +0800
Subject: [PATCH] zuc: init arm64

---
 zuc/asm_arm64.s     | 509 ++++++++++++++++++++++++++++++++++++++++++++
 zuc/core_asm.go     |   4 +-
 zuc/core_generic.go |   4 +-
 zuc/eea_asm.go      |   4 +-
 zuc/eea_generic.go  |   4 +-
 5 files changed, 517 insertions(+), 8 deletions(-)
 create mode 100644 zuc/asm_arm64.s

diff --git a/zuc/asm_arm64.s b/zuc/asm_arm64.s
new file mode 100644
index 0000000..d32d032
--- /dev/null
+++ b/zuc/asm_arm64.s
@@ -0,0 +1,509 @@
+//go:build arm64 && !generic
+// +build arm64,!generic
+
+#include "textflag.h"
+
+DATA Top3_bits_of_the_byte<>+0x00(SB)/8, $0xe0e0e0e0e0e0e0e0
+DATA Top3_bits_of_the_byte<>+0x08(SB)/8, $0xe0e0e0e0e0e0e0e0
+GLOBL Top3_bits_of_the_byte<>(SB), RODATA, $16
+
+DATA Bottom5_bits_of_the_byte<>+0x00(SB)/8, $0x1f1f1f1f1f1f1f1f
+DATA Bottom5_bits_of_the_byte<>+0x08(SB)/8, $0x1f1f1f1f1f1f1f1f
+GLOBL Bottom5_bits_of_the_byte<>(SB), RODATA, $16
+
+DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
+DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
+GLOBL nibble_mask<>(SB), RODATA, $16
+
+DATA P1_data<>+0x00(SB)/8, $0x0A020F0F0E000F09
+DATA P1_data<>+0x08(SB)/8, $0x090305070C000400
+GLOBL P1_data<>(SB), RODATA, $16
+
+DATA P2_data<>+0x00(SB)/8, $0x040C000705060D08
+DATA P2_data<>+0x08(SB)/8, $0x0209030F0A0E010B
+GLOBL P2_data<>(SB), RODATA, $16
+
+DATA P3_data<>+0x00(SB)/8, $0x0F0A0D00060A0602
+DATA P3_data<>+0x08(SB)/8, $0x0D0C0900050D0303
+GLOBL P3_data<>(SB), RODATA, $16
+
+DATA Aes_to_Zuc_mul_low_nibble<>+0x00(SB)/8, $0x1D1C9F9E83820100
+DATA Aes_to_Zuc_mul_low_nibble<>+0x08(SB)/8, $0x3938BBBAA7A62524
+GLOBL Aes_to_Zuc_mul_low_nibble<>(SB), RODATA, $16
+
+DATA Aes_to_Zuc_mul_high_nibble<>+0x00(SB)/8, $0xA174A97CDD08D500
+DATA Aes_to_Zuc_mul_high_nibble<>+0x08(SB)/8, $0x3DE835E04194499C
+GLOBL Aes_to_Zuc_mul_high_nibble<>(SB), RODATA, $16
+
+DATA Comb_matrix_mul_low_nibble<>+0x00(SB)/8, $0xA8BC0216D9CD7367
+DATA Comb_matrix_mul_low_nibble<>+0x08(SB)/8, $0x1F0BB5A16E7AC4D0
+GLOBL Comb_matrix_mul_low_nibble<>(SB), RODATA, $16
+
+DATA Comb_matrix_mul_high_nibble<>+0x00(SB)/8, $0x638CFA1523CCBA55
+DATA Comb_matrix_mul_high_nibble<>+0x08(SB)/8, $0x3FD0A6497F90E609
+GLOBL Comb_matrix_mul_high_nibble<>(SB), RODATA, $16
+
+DATA Shuf_mask<>+0x00(SB)/8, $0x0B0E0104070A0D00
+DATA Shuf_mask<>+0x08(SB)/8, $0x0306090C0F020508
+GLOBL Shuf_mask<>(SB), RODATA, $16
+
+DATA mask_S0<>+0x00(SB)/8, $0xff00ff00ff00ff00
+DATA mask_S0<>+0x08(SB)/8, $0xff00ff00ff00ff00
+GLOBL mask_S0<>(SB), RODATA, $16
+
+DATA mask_S1<>+0x00(SB)/8, $0x00ff00ff00ff00ff
+DATA mask_S1<>+0x08(SB)/8, $0x00ff00ff00ff00ff
+GLOBL mask_S1<>(SB), RODATA, $16
+
+#define SI R0
+#define DI R1
+#define BP R2
+#define AX R3
+#define BX R4
+#define CX R5
+#define DX R6
+
+#define ZERO V16
+#define TOP3_BITS V19
+#define BOTTOM5_BITS V20
+#define NIBBLE_MASK V21
+#define INVERSE_SHIFT_ROWS V22
+#define M1L V23
+#define M1H V24
+#define M2L V25
+#define M2H V26
+#define P1 V27
+#define P2 V28
+#define P3 V29
+#define S0_MASK V30
+#define S1_MASK V31
+
+#define OFFSET_FR1    (16*4)
+#define OFFSET_FR2    (17*4)
+#define OFFSET_BRC_X0 (18*4)
+#define OFFSET_BRC_X1 (19*4)
+#define OFFSET_BRC_X2 (20*4)
+#define OFFSET_BRC_X3 (21*4)
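+
+// Notes on the tables above:
+//   - P1_data/P2_data/P3_data: nibble lookup tables used by S0_comput to
+//     evaluate the ZUC S0 S-box without a 256-byte table.
+//   - Aes_to_Zuc_mul_*: per-nibble affine transform mapping the input into
+//     the AES S-box domain so that S1 can be computed with AESE.
+//   - Comb_matrix_mul_*: per-nibble affine transform mapping the AES S-box
+//     output back to the ZUC S1 value.
+//   - Shuf_mask (loaded into INVERSE_SHIFT_ROWS): undoes the ShiftRows step
+//     that AESE performs, leaving only SubBytes (the round key is ZERO).
+//   - mask_S0/mask_S1: select the alternating byte lanes of each 32-bit
+//     word to which S0 and S1 respectively apply.
+//
+// State layout (see the OFFSET_* constants): 16 LFSR words, then F_R1,
+// F_R2 and BRC_X0..BRC_X3, each 4 bytes.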
+
+#define LOAD_GLOBAL_DATA() \
+    LDP nibble_mask<>(SB), (R0, R1) \
+    VMOV R0, NIBBLE_MASK.D[0] \
+    VMOV R1, NIBBLE_MASK.D[1] \
+    LDP Top3_bits_of_the_byte<>(SB), (R0, R1) \
+    VMOV R0, TOP3_BITS.D[0] \
+    VMOV R1, TOP3_BITS.D[1] \
+    LDP Bottom5_bits_of_the_byte<>(SB), (R0, R1) \
+    VMOV R0, BOTTOM5_BITS.D[0] \
+    VMOV R1, BOTTOM5_BITS.D[1] \
+    LDP Aes_to_Zuc_mul_low_nibble<>(SB), (R0, R1) \
+    VMOV R0, M1L.D[0] \
+    VMOV R1, M1L.D[1] \
+    LDP Aes_to_Zuc_mul_high_nibble<>(SB), (R0, R1) \
+    VMOV R0, M1H.D[0] \
+    VMOV R1, M1H.D[1] \
+    LDP Comb_matrix_mul_low_nibble<>(SB), (R0, R1) \
+    VMOV R0, M2L.D[0] \
+    VMOV R1, M2L.D[1] \
+    LDP Comb_matrix_mul_high_nibble<>(SB), (R0, R1) \
+    VMOV R0, M2H.D[0] \
+    VMOV R1, M2H.D[1] \
+    LDP P1_data<>(SB), (R0, R1) \
+    VMOV R0, P1.D[0] \
+    VMOV R1, P1.D[1] \
+    LDP P2_data<>(SB), (R0, R1) \
+    VMOV R0, P2.D[0] \
+    VMOV R1, P2.D[1] \
+    LDP P3_data<>(SB), (R0, R1) \
+    VMOV R0, P3.D[0] \
+    VMOV R1, P3.D[1] \
+    LDP mask_S0<>(SB), (R0, R1) \
+    VMOV R0, S0_MASK.D[0] \
+    VMOV R1, S0_MASK.D[1] \
+    LDP mask_S1<>(SB), (R0, R1) \
+    VMOV R0, S1_MASK.D[0] \
+    VMOV R1, S1_MASK.D[1] \
+    LDP Shuf_mask<>(SB), (R0, R1) \
+    VMOV R0, INVERSE_SHIFT_ROWS.D[0] \
+    VMOV R1, INVERSE_SHIFT_ROWS.D[1]
+
+// Go's assembler has no SHLDL; this emulates the x86 double shift for the
+// n = 16 case used below: a = (a << 16) | (b >> 16).
+#define SHLDL(a, b, n) \
+    LSLW n, a \
+    LSRW n, b \
+    ORRW b, a
+
+#define Rotl_5(XDATA, XTMP0) \
+    VSHL $5, XDATA.S4, XTMP0.S4 \
+    VUSHR $3, XDATA.S4, XDATA.S4 \
+    VAND TOP3_BITS.B16, XTMP0.B16, XTMP0.B16 \
+    VAND BOTTOM5_BITS.B16, XDATA.B16, XDATA.B16 \
+    VORR XTMP0.B16, XDATA.B16, XDATA.B16
+
+#define S0_comput(IN_OUT, XTMP1, XTMP2) \
+    VUSHR $4, IN_OUT.S4, XTMP1.S4 \
+    VAND NIBBLE_MASK.B16, XTMP1.B16, XTMP1.B16 \
+    \
+    VAND NIBBLE_MASK.B16, IN_OUT.B16, IN_OUT.B16 \
+    \
+    VTBL IN_OUT.B16, [P1.B16], XTMP2.B16 \
+    VEOR XTMP1.B16, XTMP2.B16, XTMP2.B16 \
+    \
+    VTBL XTMP2.B16, [P2.B16], XTMP1.B16 \
+    VEOR IN_OUT.B16, XTMP1.B16, XTMP1.B16 \
+    \
+    VTBL XTMP1.B16, [P3.B16], IN_OUT.B16 \
+    VEOR XTMP2.B16, IN_OUT.B16, IN_OUT.B16 \
+    \
+    VSHL $4, IN_OUT.S4, IN_OUT.S4 \
+    VEOR XTMP1.B16, IN_OUT.B16, IN_OUT.B16 \
+    Rotl_5(IN_OUT, XTMP1)
+
+#define S1_comput(x, XTMP1, XTMP2) \
+    VAND x.B16, NIBBLE_MASK.B16, XTMP1.B16; \
+    VTBL XTMP1.B16, [M1L.B16], XTMP2.B16; \
+    VUSHR $4, x.D2, x.D2; \
+    VAND x.B16, NIBBLE_MASK.B16, XTMP1.B16; \
+    VTBL XTMP1.B16, [M1H.B16], XTMP1.B16; \
+    VEOR XTMP2.B16, XTMP1.B16, x.B16; \
+    VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16; \
+    AESE ZERO.B16, x.B16; \
+    VAND x.B16, NIBBLE_MASK.B16, XTMP1.B16; \
+    VTBL XTMP1.B16, [M2L.B16], XTMP2.B16; \
+    VUSHR $4, x.D2, x.D2; \
+    VAND x.B16, NIBBLE_MASK.B16, XTMP1.B16; \
+    VTBL XTMP1.B16, [M2H.B16], XTMP1.B16; \
+    VEOR XTMP2.B16, XTMP1.B16, x.B16
+
+#define BITS_REORG(idx) \
+    MOVW (((15 + idx) % 16)*4)(SI), R12 \
+    MOVW (((14 + idx) % 16)*4)(SI), AX \
+    MOVW (((11 + idx) % 16)*4)(SI), R13 \
+    MOVW (((9 + idx) % 16)*4)(SI), BX \
+    MOVW (((7 + idx) % 16)*4)(SI), R14 \
+    MOVW (((5 + idx) % 16)*4)(SI), CX \
+    MOVW (((2 + idx) % 16)*4)(SI), R15 \
+    MOVW (((0 + idx) % 16)*4)(SI), DX \
+    LSRW $15, R12 \
+    LSLW $16, AX \
+    LSLW $1, BX \
+    LSLW $1, CX \
+    LSLW $1, DX \
+    SHLDL(R12, AX, $16) \ // BRC_X0 = s15[30:15] || s14[15:0]
+    SHLDL(R13, BX, $16) \ // BRC_X1 = s11[15:0] || s9[30:15]
+    SHLDL(R14, CX, $16) \ // BRC_X2 = s7[15:0] || s5[30:15]
+    SHLDL(R15, DX, $16)   // BRC_X3 = s2[15:0] || s0[30:15]
+
+#define LFSR_UPDT(idx) \
+    MOVW (((0 + idx) % 16)*4)(SI), BX \
+    MOVW (((4 + idx) % 16)*4)(SI), CX \
+    MOVW (((10 + idx) % 16)*4)(SI), DX \
+    MOVW (((13 + idx) % 16)*4)(SI), R8 \
+    MOVW (((15 + idx) % 16)*4)(SI), R9 \
+    ADD BX, AX \
+    LSL $8, BX \
+    LSL $20, CX \
+    LSL $21, DX \
+    LSL $17, R8 \
+    LSL $15, R9 \
+    ADD BX, AX \
+    ADD CX, AX \
+    ADD DX, AX \
+    ADD R8, AX \
+    ADD R9, AX \
+    \
+    MOVD AX, BX \
+    AND $0x7FFFFFFF, AX \
+    LSR $31, BX \
+    ADD BX, AX \
+    \
+    SUBS $0x7FFFFFFF, AX, BX \
+    CSEL CS, BX, AX, AX \
+    \
+    MOVW AX, (((0 + idx) % 16)*4)(SI)
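+
+// LFSR_UPDT computes, with AX cleared by the caller in these routines
+// (work mode, so no feedback word is added):
+//
+//	v   = 2^15*s15 + 2^17*s13 + 2^21*s10 + 2^20*s4 + (1 + 2^8)*s0 + AX
+//	s16 = v mod (2^31 - 1)
+//
+// The reduction folds the carries once, v = (v & 0x7FFFFFFF) + (v >> 31),
+// then conditionally subtracts 0x7FFFFFFF (SUBS/CSEL) if the folded value
+// still reaches the modulus.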
+
+#define NONLIN_FUN() \
+    MOVW R12, AX \
+    EORW R10, AX \
+    ADDW R11, AX \ // W = (BRC_X0 ^ F_R1) + F_R2
+    ADDW R13, R10 \ // W1 = F_R1 + BRC_X1
+    EORW R14, R11 \ // W2 = F_R2 ^ BRC_X2
+    \
+    MOVW R10, DX \
+    MOVW R11, CX \
+    SHLDL(DX, CX, $16) \ // P = (W1 << 16) | (W2 >> 16)
+    SHLDL(R11, R10, $16) \ // Q = (W2 << 16) | (W1 >> 16)
+    MOVW DX, BX \
+    MOVW DX, CX \
+    MOVW DX, R8 \
+    MOVW DX, R9 \
+    RORW $30, BX \
+    RORW $22, CX \
+    RORW $14, R8 \
+    RORW $8, R9 \
+    EORW BX, DX \
+    EORW CX, DX \
+    EORW R8, DX \
+    EORW R9, DX \ // U = L1(P); upper 32 bits of DX are zero
+    MOVW R11, BX \
+    MOVW R11, CX \
+    MOVW R11, R8 \
+    MOVW R11, R9 \
+    RORW $24, BX \
+    RORW $18, CX \
+    RORW $10, R8 \
+    RORW $2, R9 \
+    EORW BX, R11 \
+    EORW CX, R11 \
+    EORW R8, R11 \
+    EORW R9, R11 \ // V = L2(Q); upper 32 bits of R11 are zero
+    LSL $32, R11 \
+    EOR R11, DX \ // DX = V || U
+    VMOV DX, V0.D2 \
+    VMOV V0.B16, V1.B16 \
+    S0_comput(V1, V2, V3) \
+    S1_comput(V0, V2, V3) \
+    \
+    VAND S1_MASK.B16, V0.B16, V0.B16 \
+    VAND S0_MASK.B16, V1.B16, V1.B16 \
+    VEOR V1.B16, V0.B16, V0.B16 \
+    \
+    VMOV V0.S[0], R10 \ // F_R1
+    VMOV V0.S[1], R11   // F_R2
+
+#define RESTORE_LFSR_0() \
+    MOVW.P 4(SI), AX \
+    VLD1 (SI), [V0.B16, V1.B16, V2.B16] \
+    SUB $4, SI \
+    MOVD (52)(SI), BX \
+    MOVW (60)(SI), CX \
+    \
+    VST1 [V0.B16, V1.B16, V2.B16], (SI) \
+    MOVD BX, (48)(SI) \
+    MOVW CX, (56)(SI) \
+    MOVW AX, (60)(SI)
+
+#define RESTORE_LFSR_2() \
+    MOVD.P 8(SI), AX \
+    VLD1 (SI), [V0.B16, V1.B16, V2.B16] \
+    SUB $8, SI \
+    MOVD (56)(SI), BX \
+    \
+    VST1 [V0.B16, V1.B16, V2.B16], (SI) \
+    MOVD BX, (48)(SI) \
+    MOVD AX, (56)(SI)
+
+#define RESTORE_LFSR_4() \
+    VLD1 (SI), [V0.B16, V1.B16, V2.B16, V3.B16] \
+    \
+    VST1.P [V1.B16, V2.B16, V3.B16], 48(SI) \
+    VST1 [V0.B16], (SI) \
+    SUB $48, SI
+
+#define RESTORE_LFSR_8() \
+    VLD1 (SI), [V0.B16, V1.B16, V2.B16, V3.B16] \
+    \
+    VST1.P [V2.B16, V3.B16], 32(SI) \
+    VST1 [V0.B16, V1.B16], (SI) \
+    SUB $32, SI
+
+#define LOAD_STATE() \
+    MOVW OFFSET_FR1(SI), R10 \
+    MOVW OFFSET_FR2(SI), R11 \
+    MOVW OFFSET_BRC_X0(SI), R12 \
+    MOVW OFFSET_BRC_X1(SI), R13 \
+    MOVW OFFSET_BRC_X2(SI), R14 \
+    MOVW OFFSET_BRC_X3(SI), R15
+
+#define SAVE_STATE() \
+    MOVW R10, OFFSET_FR1(SI) \
+    MOVW R11, OFFSET_FR2(SI) \
+    MOVW R12, OFFSET_BRC_X0(SI) \
+    MOVW R13, OFFSET_BRC_X1(SI) \
+    MOVW R14, OFFSET_BRC_X2(SI) \
+    MOVW R15, OFFSET_BRC_X3(SI)
+
+// func genKeywordAsm(s *zucState32) uint32
+TEXT ·genKeywordAsm(SB),NOSPLIT,$0
+    MOVD pState+0(FP), SI
+
+    LOAD_GLOBAL_DATA()
+    VEOR ZERO.B16, ZERO.B16, ZERO.B16
+
+    LOAD_STATE()
+
+    BITS_REORG(0)
+    NONLIN_FUN()
+
+    EORW R15, AX // Z = W ^ BRC_X3
+    MOVW AX, ret+8(FP)
+    EOR AX, AX   // work mode: no feedback word into LFSR_UPDT
+    LFSR_UPDT(0)
+    SAVE_STATE()
+    RESTORE_LFSR_0()
+
+    RET
+
+#define ONEROUND(idx) \
+    BITS_REORG(idx) \
+    NONLIN_FUN() \
+    EORW R15, AX \
+    MOVW AX, (idx*4)(DI) \
+    EOR AX, AX \
+    LFSR_UPDT(idx)
+
+// Same as ONEROUND, but byte-swaps each keystream word before storing it,
+// so the []byte output holds big-endian words.
+#define ROUND_REV32(idx) \
+    BITS_REORG(idx) \
+    NONLIN_FUN() \
+    EORW R15, AX \
+    REVW AX, AX \
+    MOVW AX, (idx*4)(DI) \
+    EOR AX, AX \
+    LFSR_UPDT(idx)
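+
+// The bulk routines below unroll rounds in blocks of 16, 8, 4, 2 and 1.
+// BITS_REORG/LFSR_UPDT address the LFSR at fixed (i+idx)%16 offsets rather
+// than shifting it every round, so after a partial block the RESTORE_LFSR_n
+// macros rotate the 16-word LFSR array left by n words to bring s0 back to
+// offset 0 (a full 16-round block is a rotation by 16, i.e. a no-op).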
+
+// func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
+TEXT ·genKeyStreamAsm(SB),NOSPLIT,$0
+    MOVD ks+0(FP), DI
+    MOVD ks_len+8(FP), BP
+    MOVD pState+24(FP), SI
+
+    LOAD_GLOBAL_DATA()
+    VEOR ZERO.B16, ZERO.B16, ZERO.B16
+
+    LOAD_STATE()
+
+zucSixteens:
+    CMP $16, BP
+    BLT zucOctet
+    SUB $16, BP
+    ONEROUND(0)
+    ONEROUND(1)
+    ONEROUND(2)
+    ONEROUND(3)
+    ONEROUND(4)
+    ONEROUND(5)
+    ONEROUND(6)
+    ONEROUND(7)
+    ONEROUND(8)
+    ONEROUND(9)
+    ONEROUND(10)
+    ONEROUND(11)
+    ONEROUND(12)
+    ONEROUND(13)
+    ONEROUND(14)
+    ONEROUND(15)
+    ADD $4*16, DI
+    B zucSixteens
+
+zucOctet:
+    CMP $8, BP
+    BLT zucNibble
+    SUB $8, BP
+    ONEROUND(0)
+    ONEROUND(1)
+    ONEROUND(2)
+    ONEROUND(3)
+    ONEROUND(4)
+    ONEROUND(5)
+    ONEROUND(6)
+    ONEROUND(7)
+    ADD $2*16, DI
+    RESTORE_LFSR_8()
+zucNibble:
+    CMP $4, BP
+    BLT zucDouble
+    SUB $4, BP
+    ONEROUND(0)
+    ONEROUND(1)
+    ONEROUND(2)
+    ONEROUND(3)
+    ADD $1*16, DI
+    RESTORE_LFSR_4()
+zucDouble:
+    CMP $2, BP
+    BLT zucSingle
+    SUB $2, BP
+    ONEROUND(0)
+    ONEROUND(1)
+    ADD $8, DI
+    RESTORE_LFSR_2()
+zucSingle:
+    TBZ $0, BP, zucRet
+    ONEROUND(0)
+    RESTORE_LFSR_0()
+zucRet:
+    SAVE_STATE()
+    RET
+
+// func genKeyStreamRev32Asm(keyStream []byte, pState *zucState32)
+TEXT ·genKeyStreamRev32Asm(SB),NOSPLIT,$0
+    MOVD ks+0(FP), DI
+    MOVD ks_len+8(FP), BP
+    MOVD pState+24(FP), SI
+
+    LOAD_GLOBAL_DATA()
+    VEOR ZERO.B16, ZERO.B16, ZERO.B16
+    LSR $2, BP // byte length -> word count
+
+    LOAD_STATE()
+
+zucSixteens:
+    CMP $16, BP
+    BLT zucOctet
+    SUB $16, BP
+    ROUND_REV32(0)
+    ROUND_REV32(1)
+    ROUND_REV32(2)
+    ROUND_REV32(3)
+    ROUND_REV32(4)
+    ROUND_REV32(5)
+    ROUND_REV32(6)
+    ROUND_REV32(7)
+    ROUND_REV32(8)
+    ROUND_REV32(9)
+    ROUND_REV32(10)
+    ROUND_REV32(11)
+    ROUND_REV32(12)
+    ROUND_REV32(13)
+    ROUND_REV32(14)
+    ROUND_REV32(15)
+    ADD $4*16, DI
+    B zucSixteens
+
+zucOctet:
+    CMP $8, BP
+    BLT zucNibble
+    SUB $8, BP
+    ROUND_REV32(0)
+    ROUND_REV32(1)
+    ROUND_REV32(2)
+    ROUND_REV32(3)
+    ROUND_REV32(4)
+    ROUND_REV32(5)
+    ROUND_REV32(6)
+    ROUND_REV32(7)
+    ADD $2*16, DI
+    RESTORE_LFSR_8()
+zucNibble:
+    CMP $4, BP
+    BLT zucDouble
+    SUB $4, BP
+    ROUND_REV32(0)
+    ROUND_REV32(1)
+    ROUND_REV32(2)
+    ROUND_REV32(3)
+    ADD $16, DI
+    RESTORE_LFSR_4()
+zucDouble:
+    CMP $2, BP
+    BLT zucSingle
+    SUB $2, BP
+    ROUND_REV32(0)
+    ROUND_REV32(1)
+    ADD $8, DI
+    RESTORE_LFSR_2()
+zucSingle:
+    TBZ $0, BP, zucRet
+    ROUND_REV32(0)
+    RESTORE_LFSR_0()
+zucRet:
+    SAVE_STATE()
+    RET
diff --git a/zuc/core_asm.go b/zuc/core_asm.go
index c630751..8c93a89 100644
--- a/zuc/core_asm.go
+++ b/zuc/core_asm.go
@@ -1,5 +1,5 @@
-//go:build (amd64 && !generic)
-// +build amd64,!generic
+//go:build (amd64 && !generic) || (arm64 && !generic)
+// +build amd64,!generic arm64,!generic
 
 package zuc
diff --git a/zuc/core_generic.go b/zuc/core_generic.go
index f91a358..ba7adb8 100644
--- a/zuc/core_generic.go
+++ b/zuc/core_generic.go
@@ -1,5 +1,5 @@
-//go:build !amd64 || generic
-// +build !amd64 generic
+//go:build !amd64 && !arm64 || generic
+// +build !amd64,!arm64 generic
 
 package zuc
diff --git a/zuc/eea_asm.go b/zuc/eea_asm.go
index f8bfc02..4f5ee95 100644
--- a/zuc/eea_asm.go
+++ b/zuc/eea_asm.go
@@ -1,5 +1,5 @@
-//go:build (amd64 && !generic)
-// +build amd64,!generic
+//go:build (amd64 && !generic) || (arm64 && !generic)
+// +build amd64,!generic arm64,!generic
 
 package zuc
diff --git a/zuc/eea_generic.go b/zuc/eea_generic.go
index e7135c5..8754497 100644
--- a/zuc/eea_generic.go
+++ b/zuc/eea_generic.go
@@ -1,5 +1,5 @@
-//go:build !amd64 || generic
-// +build !amd64 generic
+//go:build !amd64 && !arm64 || generic
+// +build !amd64,!arm64 generic
 
 package zuc
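
For reference, a minimal Go-side sketch of the declarations behind the TEXT
symbols above. This file is not part of the patch; the zucState32 field names
are assumptions inferred from the OFFSET_* constants and the FP offsets in
the assembly, which any real definition must agree with.

    //go:build (amd64 && !generic) || (arm64 && !generic)

    package zuc

    // zucState32 must match the layout the assembly assumes: 16 LFSR words
    // at offset 0, then F_R1, F_R2 and the bit-reorganization words
    // X0..X3 (offsets 16*4 through 21*4). Field names are illustrative.
    type zucState32 struct {
        lfsr           [16]uint32
        r1, r2         uint32
        x0, x1, x2, x3 uint32
    }

    // genKeywordAsm generates one 32-bit keystream word.
    //go:noescape
    func genKeywordAsm(s *zucState32) uint32

    // genKeyStreamAsm fills keyStream with len(keyStream) words.
    //go:noescape
    func genKeyStreamAsm(keyStream []uint32, pState *zucState32)

    // genKeyStreamRev32Asm fills keyStream (length a multiple of 4 bytes)
    // with big-endian keystream words.
    //go:noescape
    func genKeyStreamRev32Asm(keyStream []byte, pState *zucState32)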