diff --git a/sm4/arm64_verify.s b/sm4/arm64_verify.s
deleted file mode 100644
index c6f3dc5..0000000
--- a/sm4/arm64_verify.s
+++ /dev/null
@@ -1,15 +0,0 @@
-#include "textflag.h"
-
-// func tblAsm(in, imm, out *byte)
-TEXT ·tblAsm(SB),NOSPLIT,$0
-	MOVD in+0(FP), R8
-	MOVD imm+8(FP), R9
-	MOVD out+16(FP), R10
-
-	VLD1 (R8), [V0.B16]
-	VLD1 (R9), [V1.B16]
-
-	VTBL V1.B16, [V0.B16], V2.B16
-	VST1 [V2.B16], (R10)
-	RET
-	
\ No newline at end of file
diff --git a/sm4/arm64_verify_test.go b/sm4/arm64_verify_test.go
deleted file mode 100644
index 7c756c8..0000000
--- a/sm4/arm64_verify_test.go
+++ /dev/null
@@ -1,25 +0,0 @@
-//go:build arm64
-// +build arm64
-
-package sm4
-
-import (
-	"reflect"
-	"testing"
-)
-
-//go:noescape
-func tblAsm(in, imm, out *byte)
-
-func TestTblAsm(t *testing.T) {
-	in := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
-	imm := []byte{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}
-
-	out := make([]byte, 16)
-	tblAsm(&in[0], &imm[0], &out[0])
-	if !reflect.DeepEqual(out, imm) {
-		t.Errorf("expected=%v, result=%v\n", imm, out)
-	}
-
-	imm = []byte{0, 7, 2, 3}
-}
diff --git a/sm4/asm_arm64.s b/sm4/asm_arm64.s
index bce05ed..afb56a0 100644
--- a/sm4/asm_arm64.s
+++ b/sm4/asm_arm64.s
@@ -114,6 +114,44 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
 	VEOR XTMP7.B16, y.B16, y.B16;  \
 	VEOR x.B16, y.B16, x.B16
 
+#define load_global_data_1() \
+	LDP flip_mask<>(SB), (R0, R1)  \
+	VMOV R0, FLIP_MASK.D[0]  \
+	VMOV R1, FLIP_MASK.D[1]  \
+	LDP nibble_mask<>(SB), (R0, R1)  \
+	VMOV R0, NIBBLE_MASK.D[0]  \
+	VMOV R1, NIBBLE_MASK.D[1]  \
+	LDP m1_low<>(SB), (R0, R1)  \
+	VMOV R0, M1L.D[0]  \
+	VMOV R1, M1L.D[1]  \
+	LDP m1_high<>(SB), (R0, R1)  \
+	VMOV R0, M1H.D[0]  \
+	VMOV R1, M1H.D[1]  \
+	LDP m2_low<>(SB), (R0, R1)  \
+	VMOV R0, M2L.D[0]  \
+	VMOV R1, M2L.D[1]  \
+	LDP m2_high<>(SB), (R0, R1)  \
+	VMOV R0, M2H.D[0]  \
+	VMOV R1, M2H.D[1]  \
+	LDP fk_mask<>(SB), (R0, R1)  \
+	VMOV R0, FK_MASK.D[0]  \
+	VMOV R1, FK_MASK.D[1]  \
+	LDP inverse_shift_rows<>(SB), (R0, R1)  \
+	VMOV R0, INVERSE_SHIFT_ROWS.D[0]  \
+	VMOV R1, INVERSE_SHIFT_ROWS.D[1]
+
+#define load_global_data_2() \
+	load_global_data_1() \
+	LDP r08_mask<>(SB), (R0, R1)  \
+	VMOV R0, R08_MASK.D[0]  \
+	VMOV R1, R08_MASK.D[1]  \
+	LDP r16_mask<>(SB), (R0, R1)  \
+	VMOV R0, R16_MASK.D[0]  \
+	VMOV R1, R16_MASK.D[1]  \
+	LDP r24_mask<>(SB), (R0, R1)  \
+	VMOV R0, R24_MASK.D[0]  \
+	VMOV R1, R24_MASK.D[1]
+
 // func expandKeyAsm(key *byte, ck, enc, dec *uint32)
 TEXT ·expandKeyAsm(SB),NOSPLIT,$0
 	MOVD key+0(FP), R8
@@ -121,37 +159,7 @@ TEXT ·expandKeyAsm(SB),NOSPLIT,$0
 	MOVD enc+16(FP), R10
 	MOVD dec+24(FP), R11
 
-	LDP flip_mask<>(SB), (R0, R1)
-	VMOV R0, FLIP_MASK.D[0]
-	VMOV R1, FLIP_MASK.D[1]
-
-	LDP nibble_mask<>(SB), (R0, R1)
-	VMOV R0, NIBBLE_MASK.D[0]
-	VMOV R1, NIBBLE_MASK.D[1]
-
-	LDP m1_low<>(SB), (R0, R1)
-	VMOV R0, M1L.D[0]
-	VMOV R1, M1L.D[1]
-
-	LDP m1_high<>(SB), (R0, R1)
-	VMOV R0, M1H.D[0]
-	VMOV R1, M1H.D[1]
-
-	LDP m2_low<>(SB), (R0, R1)
-	VMOV R0, M2L.D[0]
-	VMOV R1, M2L.D[1]
-
-	LDP m2_high<>(SB), (R0, R1)
-	VMOV R0, M2H.D[0]
-	VMOV R1, M2H.D[1]
-
-	LDP fk_mask<>(SB), (R0, R1)
-	VMOV R0, FK_MASK.D[0]
-	VMOV R1, FK_MASK.D[1]
-
-	LDP inverse_shift_rows<>(SB), (R0, R1)
-	VMOV R0, INVERSE_SHIFT_ROWS.D[0]
-	VMOV R1, INVERSE_SHIFT_ROWS.D[1]
+	load_global_data_1()
 
 	VLD1 (R8), [t0.B16]
 	VTBL FLIP_MASK.B16, [t0.B16], t0.B16
@@ -249,49 +257,7 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
 	VMOV R21, t2.S[3]
 	VMOV R22, t3.S[3]
 
-	LDP flip_mask<>(SB), (R0, R1)
-	VMOV R0, FLIP_MASK.D[0]
-	VMOV R1, FLIP_MASK.D[1]
-
-	LDP nibble_mask<>(SB), (R0, R1)
-	VMOV R0, NIBBLE_MASK.D[0]
-	VMOV R1, NIBBLE_MASK.D[1]
-
-	LDP m1_low<>(SB), (R0, R1)
-	VMOV R0, M1L.D[0]
-	VMOV R1, M1L.D[1]
-
-	LDP m1_high<>(SB), (R0, R1)
-	VMOV R0, M1H.D[0]
-	VMOV R1, M1H.D[1]
-
-	LDP m2_low<>(SB), (R0, R1)
-	VMOV R0, M2L.D[0]
-	VMOV R1, M2L.D[1]
-
-	LDP m2_high<>(SB), (R0, R1)
-	VMOV R0, M2H.D[0]
-	VMOV R1, M2H.D[1]
-
-	LDP fk_mask<>(SB), (R0, R1)
-	VMOV R0, FK_MASK.D[0]
-	VMOV R1, FK_MASK.D[1]
-
-	LDP inverse_shift_rows<>(SB), (R0, R1)
-	VMOV R0, INVERSE_SHIFT_ROWS.D[0]
-	VMOV R1, INVERSE_SHIFT_ROWS.D[1]
-
-	LDP r08_mask<>(SB), (R0, R1)
-	VMOV R0, R08_MASK.D[0]
-	VMOV R1, R08_MASK.D[1]
-
-	LDP r16_mask<>(SB), (R0, R1)
-	VMOV R0, R16_MASK.D[0]
-	VMOV R1, R16_MASK.D[1]
-
-	LDP r24_mask<>(SB), (R0, R1)
-	VMOV R0, R24_MASK.D[0]
-	VMOV R1, R24_MASK.D[1]
+	load_global_data_2()
 
 	VTBL FLIP_MASK.B16, [t0.B16], t0.B16
 	VTBL FLIP_MASK.B16, [t1.B16], t1.B16
@@ -399,49 +365,7 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
 	VMOV R21, t2.S[0]
 	VMOV R22, t3.S[0]
 
-	LDP flip_mask<>(SB), (R0, R1)
-	VMOV R0, FLIP_MASK.D[0]
-	VMOV R1, FLIP_MASK.D[1]
-
-	LDP nibble_mask<>(SB), (R0, R1)
-	VMOV R0, NIBBLE_MASK.D[0]
-	VMOV R1, NIBBLE_MASK.D[1]
-
-	LDP m1_low<>(SB), (R0, R1)
-	VMOV R0, M1L.D[0]
-	VMOV R1, M1L.D[1]
-
-	LDP m1_high<>(SB), (R0, R1)
-	VMOV R0, M1H.D[0]
-	VMOV R1, M1H.D[1]
-
-	LDP m2_low<>(SB), (R0, R1)
-	VMOV R0, M2L.D[0]
-	VMOV R1, M2L.D[1]
-
-	LDP m2_high<>(SB), (R0, R1)
-	VMOV R0, M2H.D[0]
-	VMOV R1, M2H.D[1]
-
-	LDP fk_mask<>(SB), (R0, R1)
-	VMOV R0, FK_MASK.D[0]
-	VMOV R1, FK_MASK.D[1]
-
-	LDP inverse_shift_rows<>(SB), (R0, R1)
-	VMOV R0, INVERSE_SHIFT_ROWS.D[0]
-	VMOV R1, INVERSE_SHIFT_ROWS.D[1]
-
-	LDP r08_mask<>(SB), (R0, R1)
-	VMOV R0, R08_MASK.D[0]
-	VMOV R1, R08_MASK.D[1]
-
-	LDP r16_mask<>(SB), (R0, R1)
-	VMOV R0, R16_MASK.D[0]
-	VMOV R1, R16_MASK.D[1]
-
-	LDP r24_mask<>(SB), (R0, R1)
-	VMOV R0, R24_MASK.D[0]
-	VMOV R1, R24_MASK.D[1]
+	load_global_data_2()
 
 	VEOR ZERO.B16, ZERO.B16, ZERO.B16
 	EOR R0, R0
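
Note (not part of the patch): the deleted arm64_verify.s / arm64_verify_test.go pair only sanity-checked the single-register VTBL byte shuffle. If that check is ever wanted again, the same semantics can be modelled in pure Go and compared against the assembly output. A minimal sketch follows; tblRef is a hypothetical name introduced here for illustration and does not exist in the repository.

```go
package sm4

// tblRef models the single-table-register ARM64 TBL/VTBL lookup that the
// removed tblAsm helper exercised: out[i] = in[imm[i]] when imm[i] < 16,
// and 0 for out-of-range index bytes.
func tblRef(in, imm, out []byte) {
	for i, idx := range imm {
		if int(idx) < len(in) {
			out[i] = in[idx]
		} else {
			out[i] = 0
		}
	}
}
```

With the reversed-index table from the deleted TestTblAsm, tblRef produces the reversed input block, which is the expected value that test compared against the assembly result.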