From c71d5ccd965ca4d0358cf4be54dbe48d463392bf Mon Sep 17 00:00:00 2001 From: Emman Date: Fri, 29 Apr 2022 17:29:08 +0800 Subject: [PATCH] Fix compile error --- sm3/gen_sm3block_ni.go | 40 +-- sm3/sm3blockni_arm64.s | 550 ++++++++++++++++++++--------------------- sm4/asm_arm64.s | 75 +++--- sm4/gcm_amd64.s | 1 - sm4/gcm_arm64.s | 16 +- sm4/gcm_sm4ni_arm64.s | 129 +++++----- sm4/gen_arm64_ni.go | 36 +-- 7 files changed, 424 insertions(+), 423 deletions(-) diff --git a/sm3/gen_sm3block_ni.go b/sm3/gen_sm3block_ni.go index 2985f91..64d2e37 100644 --- a/sm3/gen_sm3block_ni.go +++ b/sm3/gen_sm3block_ni.go @@ -58,18 +58,18 @@ func sm3tt2b(Vd, Vn, Vm, imm2 byte) uint32 { // Used v5 as temp register func roundA(buf *bytes.Buffer, i, t, st1, st2, w, wt byte) { - fmt.Fprintf(buf, "\tWORD 0x%08x //SM3SS1 V%d.4S, V%d.4S, V%d.4S, V%d.4S\n", sm3ss1(5, st1, t, st2), 5, st1, t, st2) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM3SS1 V%d.4S, V%d.4S, V%d.4S, V%d.4S\n", sm3ss1(5, st1, t, st2), 5, st1, t, st2) fmt.Fprintf(buf, "\tVSHL $1, V%d.S4, V%d.S4\n", t, t) - fmt.Fprintf(buf, "\tWORD 0x%08x //SM3TT1A V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt1a(st1, 5, wt, i), st1, 5, wt, i) - fmt.Fprintf(buf, "\tWORD 0x%08x //SM3TT2A V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt2a(st2, 5, w, i), st2, 5, w, i) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM3TT1A V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt1a(st1, 5, wt, i), st1, 5, wt, i) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM3TT2A V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt2a(st2, 5, w, i), st2, 5, w, i) } // Used v5 as temp register func roundB(buf *bytes.Buffer, i, t, st1, st2, w, wt byte) { - fmt.Fprintf(buf, "\tWORD 0x%08x //SM3SS1 V%d.4S, V%d.4S, V%d.4S, V%d.4S\n", sm3ss1(5, st1, t, st2), 5, st1, t, st2) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM3SS1 V%d.4S, V%d.4S, V%d.4S, V%d.4S\n", sm3ss1(5, st1, t, st2), 5, st1, t, st2) fmt.Fprintf(buf, "\tVSHL $1, V%d.S4, V%d.S4\n", t, t) - fmt.Fprintf(buf, "\tWORD 0x%08x //SM3TT1B V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt1b(st1, 5, wt, i), st1, 5, wt, i) - fmt.Fprintf(buf, "\tWORD 0x%08x //SM3TT2B V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt2b(st2, 5, w, i), st2, 5, w, i) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM3TT1B V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt1b(st1, 5, wt, i), st1, 5, wt, i) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM3TT2B V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt2b(st2, 5, w, i), st2, 5, w, i) } // Compress 4 words and generate 4 words, use v6, v7, v10 as temp registers @@ -82,12 +82,12 @@ func roundB(buf *bytes.Buffer, i, t, st1, st2, w, wt byte) { // st1, st2, sm3 state func qroundA(buf *bytes.Buffer, t, st1, st2, s0, s1, s2, s3, s4 byte) { fmt.Fprintf(buf, "\t// Extension\n") - fmt.Fprintf(buf, "\tVEXT 3, V%d, V%d, V%d\n", s2, s1, s4) - fmt.Fprintf(buf, "\tVEXT 3, V%d, V%d, V%d\n", s1, s0, 6) - fmt.Fprintf(buf, "\tVEXT 2, V%d, V%d, V%d\n", s3, s2, 7) - fmt.Fprintf(buf, "\tWORD 0x%08x //SM3PARTW1 V%d.4S, V%d.4S, V%d.4S\n", sm3partw1(s4, s0, s3), s4, s0, s3) - fmt.Fprintf(buf, "\tWORD 0x%08x //SM3PARTW2 V%d.4S, V%d.4S, V%d.4S\n", sm3partw2(s4, 7, 6), s4, 7, 6) - fmt.Fprintf(buf, "\tVEOR V%d, V%d, V10\n", s1, s0) + fmt.Fprintf(buf, "\tVEXT $3, V%d.B16, V%d.B16, V%d.B16\n", s2, s1, s4) + fmt.Fprintf(buf, "\tVEXT $3, V%d.B16, V%d.B16, V%d.B16\n", s1, s0, 6) + fmt.Fprintf(buf, "\tVEXT $2, V%d.B16, V%d.B16, V%d.B16\n", s3, s2, 7) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM3PARTW1 V%d.4S, V%d.4S, V%d.4S\n", sm3partw1(s4, s0, s3), s4, s0, s3) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM3PARTW2 V%d.4S, V%d.4S, V%d.4S\n", sm3partw2(s4, 7, 6), s4, 7, 6) + fmt.Fprintf(buf, "\tVEOR V%d.B16, V%d.B16, V10.B16\n", s1, s0) fmt.Fprintf(buf, "\t// Compression\n") roundA(buf, 0, t, st1, st2, s0, 10) roundA(buf, 1, t, st1, st2, s0, 10) @@ -100,13 +100,13 @@ func qroundA(buf *bytes.Buffer, t, st1, st2, s0, s1, s2, s3, s4 byte) { func qroundB(buf *bytes.Buffer, t, st1, st2, s0, s1, s2, s3, s4 byte) { if s4 != 0xff { fmt.Fprintf(buf, "\t// Extension\n") - fmt.Fprintf(buf, "\tVEXT 3, V%d, V%d, V%d\n", s2, s1, s4) - fmt.Fprintf(buf, "\tVEXT 3, V%d, V%d, V%d\n", s1, s0, 6) - fmt.Fprintf(buf, "\tVEXT 2, V%d, V%d, V%d\n", s3, s2, 7) - fmt.Fprintf(buf, "\tWORD 0x%08x //SM3PARTW1 V%d.4S, V%d.4S, V%d.4S\n", sm3partw1(s4, s0, s3), s4, s0, s3) - fmt.Fprintf(buf, "\tWORD 0x%08x //SM3PARTW2 V%d.4S, V%d.4S, V%d.4S\n", sm3partw2(s4, 7, 6), s4, 7, 6) + fmt.Fprintf(buf, "\tVEXT $3, V%d.B16, V%d.B16, V%d.B16\n", s2, s1, s4) + fmt.Fprintf(buf, "\tVEXT $3, V%d.B16, V%d.B16, V%d.B16\n", s1, s0, 6) + fmt.Fprintf(buf, "\tVEXT $2, V%d.B16, V%d.B16, V%d.B16\n", s3, s2, 7) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM3PARTW1 V%d.4S, V%d.4S, V%d.4S\n", sm3partw1(s4, s0, s3), s4, s0, s3) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM3PARTW2 V%d.4S, V%d.4S, V%d.4S\n", sm3partw2(s4, 7, 6), s4, 7, 6) } - fmt.Fprintf(buf, "\tVEOR V%d, V%d, V10\n", s1, s0) + fmt.Fprintf(buf, "\tVEOR V%d.B16, V%d.B16, V10.B16\n", s1, s0) fmt.Fprintf(buf, "\t// Compression\n") roundB(buf, 0, t, st1, st2, s0, 10) roundB(buf, 1, t, st1, st2, s0, 10) @@ -165,8 +165,8 @@ blockloop: fmt.Fprint(buf, ` SUB $64, R3, R3 // message length - 64bytes, then compare with 64bytes - VEOR V8.S4, V15.S4, V8.S4 - VEOR V9.S4, V16.S4, V9.S4 + VEOR V8.B16, V15.B16, V8.B16 + VEOR V9.B16, V16.B16, V9.B16 CBNZ R3, blockloop sm3ret: diff --git a/sm3/sm3blockni_arm64.s b/sm3/sm3blockni_arm64.s index de1355b..4884d37 100644 --- a/sm3/sm3blockni_arm64.s +++ b/sm3/sm3blockni_arm64.s @@ -23,392 +23,392 @@ blockloop: // first 16 rounds VMOV R5, V11.S[3] // Extension - VEXT 3, V2, V1, V4 - VEXT 3, V1, V0, V6 - VEXT 2, V3, V2, V7 - WORD 0x04c063ce //SM3PARTW1 V4.4S, V0.4S, V3.4S - WORD 0xe4c466ce //SM3PARTW2 V4.4S, V7.4S, V6.4S - VEOR V1, V0, V10 + VEXT $3, V2.B16, V1.B16, V4.B16 + VEXT $3, V1.B16, V0.B16, V6.B16 + VEXT $2, V3.B16, V2.B16, V7.B16 + WORD $0x04c063ce //SM3PARTW1 V4.4S, V0.4S, V3.4S + WORD $0xe4c466ce //SM3PARTW2 V4.4S, V7.4S, V6.4S + VEOR V1.B16, V0.B16, V10.B16 // Compression - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0 - WORD 0xa98840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 0 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1 - WORD 0xa99840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 1 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2 - WORD 0xa9a840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 2 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9a840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3 - WORD 0xa9b840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 3 + WORD $0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9b840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 3 // Extension - VEXT 3, V3, V2, V0 - VEXT 3, V2, V1, V6 - VEXT 2, V4, V3, V7 - WORD 0x20c064ce //SM3PARTW1 V0.4S, V1.4S, V4.4S - WORD 0xe0c466ce //SM3PARTW2 V0.4S, V7.4S, V6.4S - VEOR V2, V1, V10 + VEXT $3, V3.B16, V2.B16, V0.B16 + VEXT $3, V2.B16, V1.B16, V6.B16 + VEXT $2, V4.B16, V3.B16, V7.B16 + WORD $0x20c064ce //SM3PARTW1 V0.4S, V1.4S, V4.4S + WORD $0xe0c466ce //SM3PARTW2 V0.4S, V7.4S, V6.4S + VEOR V2.B16, V1.B16, V10.B16 // Compression - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0 - WORD 0xa98841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 0 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1 - WORD 0xa99841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 1 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2 - WORD 0xa9a841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 2 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9a841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3 - WORD 0xa9b841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 3 + WORD $0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9b841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 3 // Extension - VEXT 3, V4, V3, V1 - VEXT 3, V3, V2, V6 - VEXT 2, V0, V4, V7 - WORD 0x41c060ce //SM3PARTW1 V1.4S, V2.4S, V0.4S - WORD 0xe1c466ce //SM3PARTW2 V1.4S, V7.4S, V6.4S - VEOR V3, V2, V10 + VEXT $3, V4.B16, V3.B16, V1.B16 + VEXT $3, V3.B16, V2.B16, V6.B16 + VEXT $2, V0.B16, V4.B16, V7.B16 + WORD $0x41c060ce //SM3PARTW1 V1.4S, V2.4S, V0.4S + WORD $0xe1c466ce //SM3PARTW2 V1.4S, V7.4S, V6.4S + VEOR V3.B16, V2.B16, V10.B16 // Compression - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0 - WORD 0xa98842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 0 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1 - WORD 0xa99842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 1 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2 - WORD 0xa9a842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 2 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9a842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3 - WORD 0xa9b842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 3 + WORD $0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9b842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 3 // Extension - VEXT 3, V0, V4, V2 - VEXT 3, V4, V3, V6 - VEXT 2, V1, V0, V7 - WORD 0x62c061ce //SM3PARTW1 V2.4S, V3.4S, V1.4S - WORD 0xe2c466ce //SM3PARTW2 V2.4S, V7.4S, V6.4S - VEOR V4, V3, V10 + VEXT $3, V0.B16, V4.B16, V2.B16 + VEXT $3, V4.B16, V3.B16, V6.B16 + VEXT $2, V1.B16, V0.B16, V7.B16 + WORD $0x62c061ce //SM3PARTW1 V2.4S, V3.4S, V1.4S + WORD $0xe2c466ce //SM3PARTW2 V2.4S, V7.4S, V6.4S + VEOR V4.B16, V3.B16, V10.B16 // Compression - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0 - WORD 0xa98843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 0 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1 - WORD 0xa99843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 1 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2 - WORD 0xa9a843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 2 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9a843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3 - WORD 0xa9b843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 3 + WORD $0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9b843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 3 // second 48 rounds VMOV R6, V11.S[3] // Extension - VEXT 3, V1, V0, V3 - VEXT 3, V0, V4, V6 - VEXT 2, V2, V1, V7 - WORD 0x83c062ce //SM3PARTW1 V3.4S, V4.4S, V2.4S - WORD 0xe3c466ce //SM3PARTW2 V3.4S, V7.4S, V6.4S - VEOR V0, V4, V10 + VEXT $3, V1.B16, V0.B16, V3.B16 + VEXT $3, V0.B16, V4.B16, V6.B16 + VEXT $2, V2.B16, V1.B16, V7.B16 + WORD $0x83c062ce //SM3PARTW1 V3.4S, V4.4S, V2.4S + WORD $0xe3c466ce //SM3PARTW2 V3.4S, V7.4S, V6.4S + VEOR V0.B16, V4.B16, V10.B16 // Compression - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 - WORD 0xa98c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 0 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 - WORD 0xa99c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 1 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 - WORD 0xa9ac44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 2 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 - WORD 0xa9bc44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 3 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 3 // Extension - VEXT 3, V2, V1, V4 - VEXT 3, V1, V0, V6 - VEXT 2, V3, V2, V7 - WORD 0x04c063ce //SM3PARTW1 V4.4S, V0.4S, V3.4S - WORD 0xe4c466ce //SM3PARTW2 V4.4S, V7.4S, V6.4S - VEOR V1, V0, V10 + VEXT $3, V2.B16, V1.B16, V4.B16 + VEXT $3, V1.B16, V0.B16, V6.B16 + VEXT $2, V3.B16, V2.B16, V7.B16 + WORD $0x04c063ce //SM3PARTW1 V4.4S, V0.4S, V3.4S + WORD $0xe4c466ce //SM3PARTW2 V4.4S, V7.4S, V6.4S + VEOR V1.B16, V0.B16, V10.B16 // Compression - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 - WORD 0xa98c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 0 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 - WORD 0xa99c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 1 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 - WORD 0xa9ac40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 2 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 - WORD 0xa9bc40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 3 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 3 // Extension - VEXT 3, V3, V2, V0 - VEXT 3, V2, V1, V6 - VEXT 2, V4, V3, V7 - WORD 0x20c064ce //SM3PARTW1 V0.4S, V1.4S, V4.4S - WORD 0xe0c466ce //SM3PARTW2 V0.4S, V7.4S, V6.4S - VEOR V2, V1, V10 + VEXT $3, V3.B16, V2.B16, V0.B16 + VEXT $3, V2.B16, V1.B16, V6.B16 + VEXT $2, V4.B16, V3.B16, V7.B16 + WORD $0x20c064ce //SM3PARTW1 V0.4S, V1.4S, V4.4S + WORD $0xe0c466ce //SM3PARTW2 V0.4S, V7.4S, V6.4S + VEOR V2.B16, V1.B16, V10.B16 // Compression - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 - WORD 0xa98c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 0 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 - WORD 0xa99c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 1 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 - WORD 0xa9ac41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 2 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 - WORD 0xa9bc41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 3 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 3 // Extension - VEXT 3, V4, V3, V1 - VEXT 3, V3, V2, V6 - VEXT 2, V0, V4, V7 - WORD 0x41c060ce //SM3PARTW1 V1.4S, V2.4S, V0.4S - WORD 0xe1c466ce //SM3PARTW2 V1.4S, V7.4S, V6.4S - VEOR V3, V2, V10 + VEXT $3, V4.B16, V3.B16, V1.B16 + VEXT $3, V3.B16, V2.B16, V6.B16 + VEXT $2, V0.B16, V4.B16, V7.B16 + WORD $0x41c060ce //SM3PARTW1 V1.4S, V2.4S, V0.4S + WORD $0xe1c466ce //SM3PARTW2 V1.4S, V7.4S, V6.4S + VEOR V3.B16, V2.B16, V10.B16 // Compression - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 - WORD 0xa98c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 0 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 - WORD 0xa99c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 1 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 - WORD 0xa9ac42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 2 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 - WORD 0xa9bc42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 3 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 3 // Extension - VEXT 3, V0, V4, V2 - VEXT 3, V4, V3, V6 - VEXT 2, V1, V0, V7 - WORD 0x62c061ce //SM3PARTW1 V2.4S, V3.4S, V1.4S - WORD 0xe2c466ce //SM3PARTW2 V2.4S, V7.4S, V6.4S - VEOR V4, V3, V10 + VEXT $3, V0.B16, V4.B16, V2.B16 + VEXT $3, V4.B16, V3.B16, V6.B16 + VEXT $2, V1.B16, V0.B16, V7.B16 + WORD $0x62c061ce //SM3PARTW1 V2.4S, V3.4S, V1.4S + WORD $0xe2c466ce //SM3PARTW2 V2.4S, V7.4S, V6.4S + VEOR V4.B16, V3.B16, V10.B16 // Compression - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 - WORD 0xa98c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 0 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 - WORD 0xa99c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 1 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 - WORD 0xa9ac43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 2 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 - WORD 0xa9bc43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 3 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 3 // Extension - VEXT 3, V1, V0, V3 - VEXT 3, V0, V4, V6 - VEXT 2, V2, V1, V7 - WORD 0x83c062ce //SM3PARTW1 V3.4S, V4.4S, V2.4S - WORD 0xe3c466ce //SM3PARTW2 V3.4S, V7.4S, V6.4S - VEOR V0, V4, V10 + VEXT $3, V1.B16, V0.B16, V3.B16 + VEXT $3, V0.B16, V4.B16, V6.B16 + VEXT $2, V2.B16, V1.B16, V7.B16 + WORD $0x83c062ce //SM3PARTW1 V3.4S, V4.4S, V2.4S + WORD $0xe3c466ce //SM3PARTW2 V3.4S, V7.4S, V6.4S + VEOR V0.B16, V4.B16, V10.B16 // Compression - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 - WORD 0xa98c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 0 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 - WORD 0xa99c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 1 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 - WORD 0xa9ac44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 2 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 - WORD 0xa9bc44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 3 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 3 // Extension - VEXT 3, V2, V1, V4 - VEXT 3, V1, V0, V6 - VEXT 2, V3, V2, V7 - WORD 0x04c063ce //SM3PARTW1 V4.4S, V0.4S, V3.4S - WORD 0xe4c466ce //SM3PARTW2 V4.4S, V7.4S, V6.4S - VEOR V1, V0, V10 + VEXT $3, V2.B16, V1.B16, V4.B16 + VEXT $3, V1.B16, V0.B16, V6.B16 + VEXT $2, V3.B16, V2.B16, V7.B16 + WORD $0x04c063ce //SM3PARTW1 V4.4S, V0.4S, V3.4S + WORD $0xe4c466ce //SM3PARTW2 V4.4S, V7.4S, V6.4S + VEOR V1.B16, V0.B16, V10.B16 // Compression - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 - WORD 0xa98c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 0 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 - WORD 0xa99c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 1 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 - WORD 0xa9ac40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 2 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 - WORD 0xa9bc40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 3 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 3 // Extension - VEXT 3, V3, V2, V0 - VEXT 3, V2, V1, V6 - VEXT 2, V4, V3, V7 - WORD 0x20c064ce //SM3PARTW1 V0.4S, V1.4S, V4.4S - WORD 0xe0c466ce //SM3PARTW2 V0.4S, V7.4S, V6.4S - VEOR V2, V1, V10 + VEXT $3, V3.B16, V2.B16, V0.B16 + VEXT $3, V2.B16, V1.B16, V6.B16 + VEXT $2, V4.B16, V3.B16, V7.B16 + WORD $0x20c064ce //SM3PARTW1 V0.4S, V1.4S, V4.4S + WORD $0xe0c466ce //SM3PARTW2 V0.4S, V7.4S, V6.4S + VEOR V2.B16, V1.B16, V10.B16 // Compression - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 - WORD 0xa98c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 0 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 - WORD 0xa99c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 1 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 - WORD 0xa9ac41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 2 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 - WORD 0xa9bc41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 3 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 3 // Extension - VEXT 3, V4, V3, V1 - VEXT 3, V3, V2, V6 - VEXT 2, V0, V4, V7 - WORD 0x41c060ce //SM3PARTW1 V1.4S, V2.4S, V0.4S - WORD 0xe1c466ce //SM3PARTW2 V1.4S, V7.4S, V6.4S - VEOR V3, V2, V10 + VEXT $3, V4.B16, V3.B16, V1.B16 + VEXT $3, V3.B16, V2.B16, V6.B16 + VEXT $2, V0.B16, V4.B16, V7.B16 + WORD $0x41c060ce //SM3PARTW1 V1.4S, V2.4S, V0.4S + WORD $0xe1c466ce //SM3PARTW2 V1.4S, V7.4S, V6.4S + VEOR V3.B16, V2.B16, V10.B16 // Compression - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 - WORD 0xa98c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 0 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 - WORD 0xa99c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 1 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 - WORD 0xa9ac42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 2 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 - WORD 0xa9bc42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 3 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 3 - VEOR V4, V3, V10 + VEOR V4.B16, V3.B16, V10.B16 // Compression - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 - WORD 0xa98c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 0 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 - WORD 0xa99c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 1 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 - WORD 0xa9ac43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 2 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 - WORD 0xa9bc43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 3 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 3 - VEOR V0, V4, V10 + VEOR V0.B16, V4.B16, V10.B16 // Compression - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 - WORD 0xa98c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 0 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 - WORD 0xa99c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 1 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 - WORD 0xa9ac44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 2 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 - WORD 0xa9bc44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 3 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 3 - VEOR V1, V0, V10 + VEOR V1.B16, V0.B16, V10.B16 // Compression - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 - WORD 0xa98c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 0 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 - WORD 0xa99c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 1 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 - WORD 0xa9ac40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 2 - WORD 0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S VSHL $1, V11.S4, V11.S4 - WORD 0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 - WORD 0xa9bc40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 3 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 3 SUB $64, R3, R3 // message length - 64bytes, then compare with 64bytes - VEOR V8.S4, V15.S4, V8.S4 - VEOR V9.S4, V16.S4, V9.S4 + VEOR V8.B16, V15.B16, V8.B16 + VEOR V9.B16, V16.B16, V9.B16 CBNZ R3, blockloop sm3ret: diff --git a/sm4/asm_arm64.s b/sm4/asm_arm64.s index 99ca4b9..9e26b04 100644 --- a/sm4/asm_arm64.s +++ b/sm4/asm_arm64.s @@ -165,28 +165,29 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16 VMOV R1, R24_MASK.D[1] #define SM4EKEY_EXPORT_KEYS() \ - VMOV V8.S[3], V10.S[0] \ - VMOV V8.S[2], V10.S[1] \ - VMOV V8.S[1], V10.S[2] \ - VMOV V8.S[0], V10.S[3] \ - VMOV V9.S[3], V11.S[0] \ - VMOV V9.S[2], V11.S[1] \ - VMOV V9.S[1], V11.S[2] \ - VMOV V9.S[0], V11.S[3] \ - VST1.P [V9.S4, V8.S4], 32(R10) \ - VST1.P [V10.S4, V11.S4], -32(R11) + VMOV V9.S[3], V10.S[0] \ + VMOV V9.S[2], V10.S[1] \ + VMOV V9.S[1], V10.S[2] \ + VMOV V9.S[0], V10.S[3] \ + VMOV V8.S[3], V11.S[0] \ + VMOV V8.S[2], V11.S[1] \ + VMOV V8.S[1], V11.S[2] \ + VMOV V8.S[0], V11.S[3] \ + VST1.P [V8.S4, V9.S4], 32(R10) \ + VST1 [V10.S4, V11.S4], (R11) \ + SUB $32, R11, R11 #define SM4E_ROUND() \ VLD1.P 16(R10), [V8.B16] \ VREV32 V8.B16, V8.B16 \ - WORD 0x0884c0ce \ - WORD 0x2884c0ce \ - WORD 0x4884c0ce \ - WORD 0x6884c0ce \ - WORD 0x8884c0ce \ - WORD 0xa884c0ce \ - WORD 0xc884c0ce \ - WORD 0xe884c0ce \ + WORD $0x0884c0ce \ + WORD $0x2884c0ce \ + WORD $0x4884c0ce \ + WORD $0x6884c0ce \ + WORD $0x8884c0ce \ + WORD $0xa884c0ce \ + WORD $0xc884c0ce \ + WORD $0xe884c0ce \ VREV32 V8.B16, V8.B16 \ VST1.P [V8.B16], 16(R9) @@ -229,27 +230,27 @@ sm4ekey: LDP fk_mask<>(SB), (R0, R1) VMOV R0, FK_MASK.D[0] VMOV R1, FK_MASK.D[1] - VLD1 (R8), [V8.B16] - VREV32 V8.B16, V8.B16 - VEOR FK_MASK, V8, V8 + VLD1 (R8), [V9.B16] + VREV32 V9.B16, V9.B16 + VEOR FK_MASK.B16, V9.B16, V9.B16 ADD $96, R11 VLD1.P 64(R9), [V0.S4, V1.S4, V2.S4, V3.S4] - WORD 0x09c960ce //SM4EKEY V9.4S, V8.4S, V0.4S - WORD 0x28c961ce //SM4EKEY V8.4S, V9.4S, V1.4S + WORD $0x28c960ce //SM4EKEY V8.4S, V9.4S, V0.4S + WORD $0x09c961ce //SM4EKEY V9.4S, V8.4S, V1.4S SM4EKEY_EXPORT_KEYS() - WORD 0x09c962ce //SM4EKEY V9.4S, V8.4S, V2.4S - WORD 0x28c963ce //SM4EKEY V8.4S, V9.4S, V3.4S + WORD $0x28c962ce //SM4EKEY V8.4S, V9.4S, V2.4S + WORD $0x09c963ce //SM4EKEY V9.4S, V8.4S, V3.4S SM4EKEY_EXPORT_KEYS() VLD1.P 64(R9), [V0.S4, V1.S4, V2.S4, V3.S4] - WORD 0x09c960ce //SM4EKEY V9.4S, V8.4S, V0.4S - WORD 0x28c961ce //SM4EKEY V8.4S, V9.4S, V1.4S + WORD $0x28c960ce //SM4EKEY V8.4S, V9.4S, V0.4S + WORD $0x09c961ce //SM4EKEY V9.4S, V8.4S, V1.4S SM4EKEY_EXPORT_KEYS() - WORD 0x09c962ce //SM4EKEY V9.4S, V8.4S, V2.4S - WORD 0x28c963ce //SM4EKEY V8.4S, V9.4S, V3.4S + WORD $0x28c962ce //SM4EKEY V8.4S, V9.4S, V2.4S + WORD $0x09c963ce //SM4EKEY V9.4S, V8.4S, V3.4S SM4EKEY_EXPORT_KEYS() RET @@ -391,15 +392,15 @@ sm4niblock: VLD1 (R10), [V8.B16] VREV32 V8.B16, V8.B16 VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4] - WORD 0x0884c0ce //SM4E V8.4S, V0.4S - WORD 0x2884c0ce //SM4E V8.4S, V1.4S - WORD 0x4884c0ce //SM4E V8.4S, V2.4S - WORD 0x6884c0ce //SM4E V8.4S, V3.4S + WORD $0x0884c0ce //SM4E V8.4S, V0.4S + WORD $0x2884c0ce //SM4E V8.4S, V1.4S + WORD $0x4884c0ce //SM4E V8.4S, V2.4S + WORD $0x6884c0ce //SM4E V8.4S, V3.4S VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4] - WORD 0x0884c0ce //SM4E V8.4S, V0.4S - WORD 0x2884c0ce //SM4E V8.4S, V1.4S - WORD 0x4884c0ce //SM4E V8.4S, V2.4S - WORD 0x6884c0ce //SM4E V8.4S, V3.4S + WORD $0x0884c0ce //SM4E V8.4S, V0.4S + WORD $0x2884c0ce //SM4E V8.4S, V1.4S + WORD $0x4884c0ce //SM4E V8.4S, V2.4S + WORD $0x6884c0ce //SM4E V8.4S, V3.4S VREV32 V8.B16, V8.B16 VST1 [V8.B16], (R9) RET diff --git a/sm4/gcm_amd64.s b/sm4/gcm_amd64.s index 83b47e1..a89df14 100644 --- a/sm4/gcm_amd64.s +++ b/sm4/gcm_amd64.s @@ -2209,4 +2209,3 @@ TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0 // func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) TEXT ·gcmSm4niDec(SB),NOSPLIT,$0 RET - \ No newline at end of file diff --git a/sm4/gcm_arm64.s b/sm4/gcm_arm64.s index 00ef81d..ad496a5 100644 --- a/sm4/gcm_arm64.s +++ b/sm4/gcm_arm64.s @@ -298,15 +298,15 @@ sm4InitEncLoop: sm4InitSM4E: VEOR B0.B16, B0.B16, B0.B16 VLD1.P 64(RK), [T0.S4, T1.S4, T2.S4, T3.S4] - WORD 0x6085c0ce //SM4E V0.4S, V11.4S - WORD 0x8085c0ce //SM4E V0.4S, V12.4S - WORD 0xa085c0ce //SM4E V0.4S, V13.4S - WORD 0xc085c0ce //SM4E V0.4S, V14.4S + WORD $0x6085c0ce //SM4E V0.4S, V11.4S + WORD $0x8085c0ce //SM4E V0.4S, V12.4S + WORD $0xa085c0ce //SM4E V0.4S, V13.4S + WORD $0xc085c0ce //SM4E V0.4S, V14.4S VLD1.P 64(RK), [T0.S4, T1.S4, T2.S4, T3.S4] - WORD 0x6085c0ce //SM4E V0.4S, V11.4S - WORD 0x8085c0ce //SM4E V0.4S, V12.4S - WORD 0xa085c0ce //SM4E V0.4S, V13.4S - WORD 0xc085c0ce //SM4E V0.4S, V14.4S + WORD $0x6085c0ce //SM4E V0.4S, V11.4S + WORD $0x8085c0ce //SM4E V0.4S, V12.4S + WORD $0xa085c0ce //SM4E V0.4S, V13.4S + WORD $0xc085c0ce //SM4E V0.4S, V14.4S VREV32 B0.B16, B0.B16 VREV64 B0.B16, B0.B16 sm4InitEncDone: diff --git a/sm4/gcm_sm4ni_arm64.s b/sm4/gcm_sm4ni_arm64.s index 5312c73..dbde380 100644 --- a/sm4/gcm_sm4ni_arm64.s +++ b/sm4/gcm_sm4ni_arm64.s @@ -60,73 +60,73 @@ VEOR T3.B16, ACCM.B16, ACCM.B16 #define sm4eEnc1block() \ - WORD 0x6086c0ce \ //SM4E V0.4S, V19.4S - WORD 0x8086c0ce \ //SM4E V0.4S, V20.4S - WORD 0xa086c0ce \ //SM4E V0.4S, V21.4S - WORD 0xc086c0ce \ //SM4E V0.4S, V22.4S - WORD 0xe086c0ce \ //SM4E V0.4S, V23.4S - WORD 0x0087c0ce \ //SM4E V0.4S, V24.4S - WORD 0x2087c0ce \ //SM4E V0.4S, V25.4S - WORD 0x4087c0ce //SM4E V0.4S, V26.4S + WORD $0x6086c0ce \ //SM4E V0.4S, V19.4S + WORD $0x8086c0ce \ //SM4E V0.4S, V20.4S + WORD $0xa086c0ce \ //SM4E V0.4S, V21.4S + WORD $0xc086c0ce \ //SM4E V0.4S, V22.4S + WORD $0xe086c0ce \ //SM4E V0.4S, V23.4S + WORD $0x0087c0ce \ //SM4E V0.4S, V24.4S + WORD $0x2087c0ce \ //SM4E V0.4S, V25.4S + WORD $0x4087c0ce //SM4E V0.4S, V26.4S #define sm4eEnc8blocks() \ sm4eEnc1block() \ - WORD 0x6186c0ce \ //SM4E V1.4S, V19.4S - WORD 0x8186c0ce \ //SM4E V1.4S, V20.4S - WORD 0xa186c0ce \ //SM4E V1.4S, V21.4S - WORD 0xc186c0ce \ //SM4E V1.4S, V22.4S - WORD 0xe186c0ce \ //SM4E V1.4S, V23.4S - WORD 0x0187c0ce \ //SM4E V1.4S, V24.4S - WORD 0x2187c0ce \ //SM4E V1.4S, V25.4S - WORD 0x4187c0ce \ //SM4E V1.4S, V26.4S - WORD 0x6286c0ce \ //SM4E V2.4S, V19.4S - WORD 0x8286c0ce \ //SM4E V2.4S, V20.4S - WORD 0xa286c0ce \ //SM4E V2.4S, V21.4S - WORD 0xc286c0ce \ //SM4E V2.4S, V22.4S - WORD 0xe286c0ce \ //SM4E V2.4S, V23.4S - WORD 0x0287c0ce \ //SM4E V2.4S, V24.4S - WORD 0x2287c0ce \ //SM4E V2.4S, V25.4S - WORD 0x4287c0ce \ //SM4E V2.4S, V26.4S - WORD 0x6386c0ce \ //SM4E V3.4S, V19.4S - WORD 0x8386c0ce \ //SM4E V3.4S, V20.4S - WORD 0xa386c0ce \ //SM4E V3.4S, V21.4S - WORD 0xc386c0ce \ //SM4E V3.4S, V22.4S - WORD 0xe386c0ce \ //SM4E V3.4S, V23.4S - WORD 0x0387c0ce \ //SM4E V3.4S, V24.4S - WORD 0x2387c0ce \ //SM4E V3.4S, V25.4S - WORD 0x4387c0ce \ //SM4E V3.4S, V26.4S - WORD 0x6486c0ce \ //SM4E V4.4S, V19.4S - WORD 0x8486c0ce \ //SM4E V4.4S, V20.4S - WORD 0xa486c0ce \ //SM4E V4.4S, V21.4S - WORD 0xc486c0ce \ //SM4E V4.4S, V22.4S - WORD 0xe486c0ce \ //SM4E V4.4S, V23.4S - WORD 0x0487c0ce \ //SM4E V4.4S, V24.4S - WORD 0x2487c0ce \ //SM4E V4.4S, V25.4S - WORD 0x4487c0ce \ //SM4E V4.4S, V26.4S - WORD 0x6586c0ce \ //SM4E V5.4S, V19.4S - WORD 0x8586c0ce \ //SM4E V5.4S, V20.4S - WORD 0xa586c0ce \ //SM4E V5.4S, V21.4S - WORD 0xc586c0ce \ //SM4E V5.4S, V22.4S - WORD 0xe586c0ce \ //SM4E V5.4S, V23.4S - WORD 0x0587c0ce \ //SM4E V5.4S, V24.4S - WORD 0x2587c0ce \ //SM4E V5.4S, V25.4S - WORD 0x4587c0ce \ //SM4E V5.4S, V26.4S - WORD 0x6686c0ce \ //SM4E V6.4S, V19.4S - WORD 0x8686c0ce \ //SM4E V6.4S, V20.4S - WORD 0xa686c0ce \ //SM4E V6.4S, V21.4S - WORD 0xc686c0ce \ //SM4E V6.4S, V22.4S - WORD 0xe686c0ce \ //SM4E V6.4S, V23.4S - WORD 0x0687c0ce \ //SM4E V6.4S, V24.4S - WORD 0x2687c0ce \ //SM4E V6.4S, V25.4S - WORD 0x4687c0ce \ //SM4E V6.4S, V26.4S - WORD 0x6786c0ce \ //SM4E V7.4S, V19.4S - WORD 0x8786c0ce \ //SM4E V7.4S, V20.4S - WORD 0xa786c0ce \ //SM4E V7.4S, V21.4S - WORD 0xc786c0ce \ //SM4E V7.4S, V22.4S - WORD 0xe786c0ce \ //SM4E V7.4S, V23.4S - WORD 0x0787c0ce \ //SM4E V7.4S, V24.4S - WORD 0x2787c0ce \ //SM4E V7.4S, V25.4S - WORD 0x4787c0ce //SM4E V7.4S, V26.4S + WORD $0x6186c0ce \ //SM4E V1.4S, V19.4S + WORD $0x8186c0ce \ //SM4E V1.4S, V20.4S + WORD $0xa186c0ce \ //SM4E V1.4S, V21.4S + WORD $0xc186c0ce \ //SM4E V1.4S, V22.4S + WORD $0xe186c0ce \ //SM4E V1.4S, V23.4S + WORD $0x0187c0ce \ //SM4E V1.4S, V24.4S + WORD $0x2187c0ce \ //SM4E V1.4S, V25.4S + WORD $0x4187c0ce \ //SM4E V1.4S, V26.4S + WORD $0x6286c0ce \ //SM4E V2.4S, V19.4S + WORD $0x8286c0ce \ //SM4E V2.4S, V20.4S + WORD $0xa286c0ce \ //SM4E V2.4S, V21.4S + WORD $0xc286c0ce \ //SM4E V2.4S, V22.4S + WORD $0xe286c0ce \ //SM4E V2.4S, V23.4S + WORD $0x0287c0ce \ //SM4E V2.4S, V24.4S + WORD $0x2287c0ce \ //SM4E V2.4S, V25.4S + WORD $0x4287c0ce \ //SM4E V2.4S, V26.4S + WORD $0x6386c0ce \ //SM4E V3.4S, V19.4S + WORD $0x8386c0ce \ //SM4E V3.4S, V20.4S + WORD $0xa386c0ce \ //SM4E V3.4S, V21.4S + WORD $0xc386c0ce \ //SM4E V3.4S, V22.4S + WORD $0xe386c0ce \ //SM4E V3.4S, V23.4S + WORD $0x0387c0ce \ //SM4E V3.4S, V24.4S + WORD $0x2387c0ce \ //SM4E V3.4S, V25.4S + WORD $0x4387c0ce \ //SM4E V3.4S, V26.4S + WORD $0x6486c0ce \ //SM4E V4.4S, V19.4S + WORD $0x8486c0ce \ //SM4E V4.4S, V20.4S + WORD $0xa486c0ce \ //SM4E V4.4S, V21.4S + WORD $0xc486c0ce \ //SM4E V4.4S, V22.4S + WORD $0xe486c0ce \ //SM4E V4.4S, V23.4S + WORD $0x0487c0ce \ //SM4E V4.4S, V24.4S + WORD $0x2487c0ce \ //SM4E V4.4S, V25.4S + WORD $0x4487c0ce \ //SM4E V4.4S, V26.4S + WORD $0x6586c0ce \ //SM4E V5.4S, V19.4S + WORD $0x8586c0ce \ //SM4E V5.4S, V20.4S + WORD $0xa586c0ce \ //SM4E V5.4S, V21.4S + WORD $0xc586c0ce \ //SM4E V5.4S, V22.4S + WORD $0xe586c0ce \ //SM4E V5.4S, V23.4S + WORD $0x0587c0ce \ //SM4E V5.4S, V24.4S + WORD $0x2587c0ce \ //SM4E V5.4S, V25.4S + WORD $0x4587c0ce \ //SM4E V5.4S, V26.4S + WORD $0x6686c0ce \ //SM4E V6.4S, V19.4S + WORD $0x8686c0ce \ //SM4E V6.4S, V20.4S + WORD $0xa686c0ce \ //SM4E V6.4S, V21.4S + WORD $0xc686c0ce \ //SM4E V6.4S, V22.4S + WORD $0xe686c0ce \ //SM4E V6.4S, V23.4S + WORD $0x0687c0ce \ //SM4E V6.4S, V24.4S + WORD $0x2687c0ce \ //SM4E V6.4S, V25.4S + WORD $0x4687c0ce \ //SM4E V6.4S, V26.4S + WORD $0x6786c0ce \ //SM4E V7.4S, V19.4S + WORD $0x8786c0ce \ //SM4E V7.4S, V20.4S + WORD $0xa786c0ce \ //SM4E V7.4S, V21.4S + WORD $0xc786c0ce \ //SM4E V7.4S, V22.4S + WORD $0xe786c0ce \ //SM4E V7.4S, V23.4S + WORD $0x0787c0ce \ //SM4E V7.4S, V24.4S + WORD $0x2787c0ce \ //SM4E V7.4S, V25.4S + WORD $0x4787c0ce //SM4E V7.4S, V26.4S // func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0 @@ -142,6 +142,7 @@ TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0 #define H0 R9 #define H1 R10 #define pTblSave R11 +#define rkSave R12 MOVD productTable+0(FP), pTbl MOVD dst+8(FP), dstPtr MOVD src_base+32(FP), srcPtr diff --git a/sm4/gen_arm64_ni.go b/sm4/gen_arm64_ni.go index 864d40a..0db7724 100644 --- a/sm4/gen_arm64_ni.go +++ b/sm4/gen_arm64_ni.go @@ -27,11 +27,11 @@ func sm4ekey(Vd, Vn, Vm byte) uint32 { } func sm4ekeyRound(buf *bytes.Buffer, d, n, m byte) { - fmt.Fprintf(buf, "\tWORD 0x%08x //SM4EKEY V%d.4S, V%d.4S, V%d.4S\n", sm4ekey(d, n, m), d, n, m) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM4EKEY V%d.4S, V%d.4S, V%d.4S\n", sm4ekey(d, n, m), d, n, m) } func sm4eRound(buf *bytes.Buffer, d, n byte) { - fmt.Fprintf(buf, "\tWORD 0x%08x //SM4E V%d.4S, V%d.4S\n", sm4e(d, n), d, n) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM4E V%d.4S, V%d.4S\n", sm4e(d, n), d, n) } func main() { @@ -48,27 +48,27 @@ TEXT ·expandKeySM4E(SB),NOSPLIT,$0 MOVD ck+16(FP), R10 MOVD enc+24(FP), R11 - VLD1 (R8), [V8.B16] - VREV32 V8.B16, V8.B16 - VLD1 (R9), [V9.S4] - VEOR V9, V8, V8 + VLD1 (R8), [V9.B16] + VREV32 V9.B16, V9.B16 + VLD1 (R9), [V8.S4] + VEOR V9, V8, V9 VLD1.P 64(R10), [V0.S4, V1.S4, V2.S4, V3.S4] `[1:]) - sm4ekeyRound(buf, 9, 8, 0) - sm4ekeyRound(buf, 8, 9, 1) - fmt.Fprintf(buf, "\tVST1.P [V9.S4, V8.S4], 32(R11)\n") - sm4ekeyRound(buf, 9, 8, 2) - sm4ekeyRound(buf, 8, 9, 3) - fmt.Fprintf(buf, "\tVST1.P [V9.S4, V8.S4], 32(R11)\n") + sm4ekeyRound(buf, 8, 9, 0) + sm4ekeyRound(buf, 9, 8, 1) + fmt.Fprintf(buf, "\tVST1.P [V8.S4, V9.S4], 32(R11)\n") + sm4ekeyRound(buf, 8, 9, 2) + sm4ekeyRound(buf, 9, 8, 3) + fmt.Fprintf(buf, "\tVST1.P [V8.S4, V9.S4], 32(R11)\n") fmt.Fprintf(buf, "\tVLD1.P 64(R10), [V0.S4, V1.S4, V2.S4, V3.S4]\n") - sm4ekeyRound(buf, 9, 8, 0) - sm4ekeyRound(buf, 8, 9, 1) - fmt.Fprintf(buf, "\tVST1.P [V9.S4, V8.S4], 32(R11)\n") - sm4ekeyRound(buf, 9, 8, 2) - sm4ekeyRound(buf, 8, 9, 3) + sm4ekeyRound(buf, 8, 9, 0) + sm4ekeyRound(buf, 9, 8, 1) + fmt.Fprintf(buf, "\tVST1.P [V8.S4, V9.S4], 32(R11)\n") + sm4ekeyRound(buf, 8, 9, 2) + sm4ekeyRound(buf, 9, 8, 3) fmt.Fprintf(buf, ` - VST1.P [V9.S4, V8.S4], 32(R11) + VST1.P [V8.S4, V9.S4], 32(R11) RET `[1:]) fmt.Fprint(buf, `