diff --git a/.github/workflows/sm3_sm4_ni.ci.yml b/.github/workflows/sm3_sm4_ni.ci.yml new file mode 100644 index 0000000..95dc5a4 --- /dev/null +++ b/.github/workflows/sm3_sm4_ni.ci.yml @@ -0,0 +1,39 @@ +name: ci + +on: + push: + branches: [ sm3_sm4_ni ] + pull_request: + branches: [ sm3_sm4_ni ] + +jobs: + + build: + runs-on: ubuntu-latest + strategy: + matrix: + goVer: ['1.15', '1.16', '1.17'] + steps: + - name: Checkout Repo + uses: actions/checkout@v2 + + - name: Set up Go + uses: actions/setup-go@v2 + with: + go-version: ${{ matrix.goVer }} + + - name: Setup Environment + run: | + echo "GOPATH=$(go env GOPATH)" >> $GITHUB_ENV + echo "$(go env GOPATH)/bin" >> $GITHUB_PATH + + - name: Module cache + uses: actions/cache@v2.1.7 + env: + cache-name: go-mod-cache + with: + path: ~/go/pkg/mod + key: ${{ runner.os }}-${{ env.cache-name }}-${{ hashFiles('**/go.sum') }} + + - name: Test + run: go test -v ./... diff --git a/sm3/gen_sm3block_ni.go b/sm3/gen_sm3block_ni.go index 2985f91..64d2e37 100644 --- a/sm3/gen_sm3block_ni.go +++ b/sm3/gen_sm3block_ni.go @@ -58,18 +58,18 @@ func sm3tt2b(Vd, Vn, Vm, imm2 byte) uint32 { // Used v5 as temp register func roundA(buf *bytes.Buffer, i, t, st1, st2, w, wt byte) { - fmt.Fprintf(buf, "\tWORD 0x%08x //SM3SS1 V%d.4S, V%d.4S, V%d.4S, V%d.4S\n", sm3ss1(5, st1, t, st2), 5, st1, t, st2) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM3SS1 V%d.4S, V%d.4S, V%d.4S, V%d.4S\n", sm3ss1(5, st1, t, st2), 5, st1, t, st2) fmt.Fprintf(buf, "\tVSHL $1, V%d.S4, V%d.S4\n", t, t) - fmt.Fprintf(buf, "\tWORD 0x%08x //SM3TT1A V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt1a(st1, 5, wt, i), st1, 5, wt, i) - fmt.Fprintf(buf, "\tWORD 0x%08x //SM3TT2A V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt2a(st2, 5, w, i), st2, 5, w, i) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM3TT1A V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt1a(st1, 5, wt, i), st1, 5, wt, i) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM3TT2A V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt2a(st2, 5, w, i), st2, 5, w, i) } // Used v5 as temp register func roundB(buf *bytes.Buffer, i, t, st1, st2, w, wt byte) { - fmt.Fprintf(buf, "\tWORD 0x%08x //SM3SS1 V%d.4S, V%d.4S, V%d.4S, V%d.4S\n", sm3ss1(5, st1, t, st2), 5, st1, t, st2) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM3SS1 V%d.4S, V%d.4S, V%d.4S, V%d.4S\n", sm3ss1(5, st1, t, st2), 5, st1, t, st2) fmt.Fprintf(buf, "\tVSHL $1, V%d.S4, V%d.S4\n", t, t) - fmt.Fprintf(buf, "\tWORD 0x%08x //SM3TT1B V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt1b(st1, 5, wt, i), st1, 5, wt, i) - fmt.Fprintf(buf, "\tWORD 0x%08x //SM3TT2B V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt2b(st2, 5, w, i), st2, 5, w, i) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM3TT1B V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt1b(st1, 5, wt, i), st1, 5, wt, i) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM3TT2B V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt2b(st2, 5, w, i), st2, 5, w, i) } // Compress 4 words and generate 4 words, use v6, v7, v10 as temp registers @@ -82,12 +82,12 @@ func roundB(buf *bytes.Buffer, i, t, st1, st2, w, wt byte) { // st1, st2, sm3 state func qroundA(buf *bytes.Buffer, t, st1, st2, s0, s1, s2, s3, s4 byte) { fmt.Fprintf(buf, "\t// Extension\n") - fmt.Fprintf(buf, "\tVEXT 3, V%d, V%d, V%d\n", s2, s1, s4) - fmt.Fprintf(buf, "\tVEXT 3, V%d, V%d, V%d\n", s1, s0, 6) - fmt.Fprintf(buf, "\tVEXT 2, V%d, V%d, V%d\n", s3, s2, 7) - fmt.Fprintf(buf, "\tWORD 0x%08x //SM3PARTW1 V%d.4S, V%d.4S, V%d.4S\n", sm3partw1(s4, s0, s3), s4, s0, s3) - fmt.Fprintf(buf, "\tWORD 0x%08x //SM3PARTW2 V%d.4S, V%d.4S, V%d.4S\n", sm3partw2(s4, 7, 6), s4, 7, 6) - fmt.Fprintf(buf, "\tVEOR V%d, V%d, V10\n", s1, s0) + 
fmt.Fprintf(buf, "\tVEXT $3, V%d.B16, V%d.B16, V%d.B16\n", s2, s1, s4) + fmt.Fprintf(buf, "\tVEXT $3, V%d.B16, V%d.B16, V%d.B16\n", s1, s0, 6) + fmt.Fprintf(buf, "\tVEXT $2, V%d.B16, V%d.B16, V%d.B16\n", s3, s2, 7) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM3PARTW1 V%d.4S, V%d.4S, V%d.4S\n", sm3partw1(s4, s0, s3), s4, s0, s3) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM3PARTW2 V%d.4S, V%d.4S, V%d.4S\n", sm3partw2(s4, 7, 6), s4, 7, 6) + fmt.Fprintf(buf, "\tVEOR V%d.B16, V%d.B16, V10.B16\n", s1, s0) fmt.Fprintf(buf, "\t// Compression\n") roundA(buf, 0, t, st1, st2, s0, 10) roundA(buf, 1, t, st1, st2, s0, 10) @@ -100,13 +100,13 @@ func qroundA(buf *bytes.Buffer, t, st1, st2, s0, s1, s2, s3, s4 byte) { func qroundB(buf *bytes.Buffer, t, st1, st2, s0, s1, s2, s3, s4 byte) { if s4 != 0xff { fmt.Fprintf(buf, "\t// Extension\n") - fmt.Fprintf(buf, "\tVEXT 3, V%d, V%d, V%d\n", s2, s1, s4) - fmt.Fprintf(buf, "\tVEXT 3, V%d, V%d, V%d\n", s1, s0, 6) - fmt.Fprintf(buf, "\tVEXT 2, V%d, V%d, V%d\n", s3, s2, 7) - fmt.Fprintf(buf, "\tWORD 0x%08x //SM3PARTW1 V%d.4S, V%d.4S, V%d.4S\n", sm3partw1(s4, s0, s3), s4, s0, s3) - fmt.Fprintf(buf, "\tWORD 0x%08x //SM3PARTW2 V%d.4S, V%d.4S, V%d.4S\n", sm3partw2(s4, 7, 6), s4, 7, 6) + fmt.Fprintf(buf, "\tVEXT $3, V%d.B16, V%d.B16, V%d.B16\n", s2, s1, s4) + fmt.Fprintf(buf, "\tVEXT $3, V%d.B16, V%d.B16, V%d.B16\n", s1, s0, 6) + fmt.Fprintf(buf, "\tVEXT $2, V%d.B16, V%d.B16, V%d.B16\n", s3, s2, 7) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM3PARTW1 V%d.4S, V%d.4S, V%d.4S\n", sm3partw1(s4, s0, s3), s4, s0, s3) + fmt.Fprintf(buf, "\tWORD $0x%08x //SM3PARTW2 V%d.4S, V%d.4S, V%d.4S\n", sm3partw2(s4, 7, 6), s4, 7, 6) } - fmt.Fprintf(buf, "\tVEOR V%d, V%d, V10\n", s1, s0) + fmt.Fprintf(buf, "\tVEOR V%d.B16, V%d.B16, V10.B16\n", s1, s0) fmt.Fprintf(buf, "\t// Compression\n") roundB(buf, 0, t, st1, st2, s0, 10) roundB(buf, 1, t, st1, st2, s0, 10) @@ -165,8 +165,8 @@ blockloop: fmt.Fprint(buf, ` SUB $64, R3, R3 // message length - 64bytes, then compare with 64bytes - VEOR V8.S4, V15.S4, V8.S4 - VEOR V9.S4, V16.S4, V9.S4 + VEOR V8.B16, V15.B16, V8.B16 + VEOR V9.B16, V16.B16, V9.B16 CBNZ R3, blockloop sm3ret: diff --git a/sm3/sm3block_arm64.go b/sm3/sm3block_arm64.go index 1076e8e..2bb89b9 100644 --- a/sm3/sm3block_arm64.go +++ b/sm3/sm3block_arm64.go @@ -19,10 +19,10 @@ func blockARM64(dig *digest, p []byte) func blockSM3NI(h []uint32, p []byte, t []uint32) func block(dig *digest, p []byte) { - //if !useSM3NI { + if !useSM3NI { blockARM64(dig, p) - //} else { - // h := dig.h[:] - // blockSM3NI(h, p, t) - //} + } else { + h := dig.h[:] + blockSM3NI(h, p, t) + } } diff --git a/sm3/sm3blockni_arm64.s b/sm3/sm3blockni_arm64.s new file mode 100644 index 0000000..4884d37 --- /dev/null +++ b/sm3/sm3blockni_arm64.s @@ -0,0 +1,416 @@ +// Generated by gen_sm3block_ni.go. DO NOT EDIT. 
+ +#include "textflag.h" + +// func blockSM3NI(h []uint32, p []byte, t []uint32) +TEXT ·blockSM3NI(SB), 0, $0 + MOVD h_base+0(FP), R0 // Hash value first address + MOVD p_base+24(FP), R1 // message first address + MOVD p_len+32(FP), R3 // message length + MOVD t_base+48(FP), R2 // t constants first address + + VLD1 (R0), [V8.S4, V9.S4] // load h(a,b,c,d,e,f,g,h) + LDPW (0*8)(R2), (R5, R6) // load t constants + +blockloop: + VLD1.P 64(R1), [V0.B16, V1.B16, V2.B16, V3.B16] // load 64bytes message + VMOV V8.B16, V15.B16 // backup: V8 h(dcba) + VMOV V9.B16, V16.B16 // backup: V9 h(hgfe) + VREV32 V0.B16, V0.B16 // prepare for using message in Byte format + VREV32 V1.B16, V1.B16 + VREV32 V2.B16, V2.B16 + VREV32 V3.B16, V3.B16 + // first 16 rounds + VMOV R5, V11.S[3] + // Extension + VEXT $3, V2.B16, V1.B16, V4.B16 + VEXT $3, V1.B16, V0.B16, V6.B16 + VEXT $2, V3.B16, V2.B16, V7.B16 + WORD $0x04c063ce //SM3PARTW1 V4.4S, V0.4S, V3.4S + WORD $0xe4c466ce //SM3PARTW2 V4.4S, V7.4S, V6.4S + VEOR V1.B16, V0.B16, V10.B16 + // Compression + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9a840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9b840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 3 + + // Extension + VEXT $3, V3.B16, V2.B16, V0.B16 + VEXT $3, V2.B16, V1.B16, V6.B16 + VEXT $2, V4.B16, V3.B16, V7.B16 + WORD $0x20c064ce //SM3PARTW1 V0.4S, V1.4S, V4.4S + WORD $0xe0c466ce //SM3PARTW2 V0.4S, V7.4S, V6.4S + VEOR V2.B16, V1.B16, V10.B16 + // Compression + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9a841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9b841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 3 + + // Extension + VEXT $3, V4.B16, V3.B16, V1.B16 + VEXT $3, V3.B16, V2.B16, V6.B16 + VEXT $2, V0.B16, V4.B16, V7.B16 + WORD $0x41c060ce //SM3PARTW1 V1.4S, V2.4S, V0.4S + WORD $0xe1c466ce //SM3PARTW2 V1.4S, V7.4S, V6.4S + VEOR V3.B16, V2.B16, V10.B16 + // Compression + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8a04ace 
//SM3TT1A V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9a842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9b842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 3 + + // Extension + VEXT $3, V0.B16, V4.B16, V2.B16 + VEXT $3, V4.B16, V3.B16, V6.B16 + VEXT $2, V1.B16, V0.B16, V7.B16 + WORD $0x62c061ce //SM3PARTW1 V2.4S, V3.4S, V1.4S + WORD $0xe2c466ce //SM3PARTW2 V2.4S, V7.4S, V6.4S + VEOR V4.B16, V3.B16, V10.B16 + // Compression + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9a843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9b843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 3 + + // second 48 rounds + VMOV R6, V11.S[3] + // Extension + VEXT $3, V1.B16, V0.B16, V3.B16 + VEXT $3, V0.B16, V4.B16, V6.B16 + VEXT $2, V2.B16, V1.B16, V7.B16 + WORD $0x83c062ce //SM3PARTW1 V3.4S, V4.4S, V2.4S + WORD $0xe3c466ce //SM3PARTW2 V3.4S, V7.4S, V6.4S + VEOR V0.B16, V4.B16, V10.B16 + // Compression + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 3 + + // Extension + VEXT $3, V2.B16, V1.B16, V4.B16 + VEXT $3, V1.B16, V0.B16, V6.B16 + VEXT $2, V3.B16, V2.B16, V7.B16 + WORD $0x04c063ce //SM3PARTW1 V4.4S, V0.4S, V3.4S + WORD $0xe4c466ce //SM3PARTW2 V4.4S, V7.4S, V6.4S + VEOR V1.B16, V0.B16, V10.B16 + // Compression + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 3 + + // Extension + VEXT $3, V3.B16, V2.B16, V0.B16 + VEXT $3, V2.B16, V1.B16, V6.B16 + VEXT $2, V4.B16, V3.B16, V7.B16 + WORD $0x20c064ce //SM3PARTW1 V0.4S, V1.4S, V4.4S + WORD $0xe0c466ce 
//SM3PARTW2 V0.4S, V7.4S, V6.4S + VEOR V2.B16, V1.B16, V10.B16 + // Compression + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 3 + + // Extension + VEXT $3, V4.B16, V3.B16, V1.B16 + VEXT $3, V3.B16, V2.B16, V6.B16 + VEXT $2, V0.B16, V4.B16, V7.B16 + WORD $0x41c060ce //SM3PARTW1 V1.4S, V2.4S, V0.4S + WORD $0xe1c466ce //SM3PARTW2 V1.4S, V7.4S, V6.4S + VEOR V3.B16, V2.B16, V10.B16 + // Compression + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 3 + + // Extension + VEXT $3, V0.B16, V4.B16, V2.B16 + VEXT $3, V4.B16, V3.B16, V6.B16 + VEXT $2, V1.B16, V0.B16, V7.B16 + WORD $0x62c061ce //SM3PARTW1 V2.4S, V3.4S, V1.4S + WORD $0xe2c466ce //SM3PARTW2 V2.4S, V7.4S, V6.4S + VEOR V4.B16, V3.B16, V10.B16 + // Compression + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 3 + + // Extension + VEXT $3, V1.B16, V0.B16, V3.B16 + VEXT $3, V0.B16, V4.B16, V6.B16 + VEXT $2, V2.B16, V1.B16, V7.B16 + WORD $0x83c062ce //SM3PARTW1 V3.4S, V4.4S, V2.4S + WORD $0xe3c466ce //SM3PARTW2 V3.4S, V7.4S, V6.4S + VEOR V0.B16, V4.B16, V10.B16 + // Compression + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, 
V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 3 + + // Extension + VEXT $3, V2.B16, V1.B16, V4.B16 + VEXT $3, V1.B16, V0.B16, V6.B16 + VEXT $2, V3.B16, V2.B16, V7.B16 + WORD $0x04c063ce //SM3PARTW1 V4.4S, V0.4S, V3.4S + WORD $0xe4c466ce //SM3PARTW2 V4.4S, V7.4S, V6.4S + VEOR V1.B16, V0.B16, V10.B16 + // Compression + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 3 + + // Extension + VEXT $3, V3.B16, V2.B16, V0.B16 + VEXT $3, V2.B16, V1.B16, V6.B16 + VEXT $2, V4.B16, V3.B16, V7.B16 + WORD $0x20c064ce //SM3PARTW1 V0.4S, V1.4S, V4.4S + WORD $0xe0c466ce //SM3PARTW2 V0.4S, V7.4S, V6.4S + VEOR V2.B16, V1.B16, V10.B16 + // Compression + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 3 + + // Extension + VEXT $3, V4.B16, V3.B16, V1.B16 + VEXT $3, V3.B16, V2.B16, V6.B16 + VEXT $2, V0.B16, V4.B16, V7.B16 + WORD $0x41c060ce //SM3PARTW1 V1.4S, V2.4S, V0.4S + WORD $0xe1c466ce //SM3PARTW2 V1.4S, V7.4S, V6.4S + VEOR V3.B16, V2.B16, V10.B16 + // Compression + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 3 + + VEOR V4.B16, V3.B16, V10.B16 + // Compression + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, 
V10.S, 0 + WORD $0xa98c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 3 + + VEOR V0.B16, V4.B16, V10.B16 + // Compression + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 3 + + VEOR V1.B16, V0.B16, V10.B16 + // Compression + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0 + WORD $0xa98c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 0 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1 + WORD $0xa99c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 1 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2 + WORD $0xa9ac40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 2 + WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S + VSHL $1, V11.S4, V11.S4 + WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3 + WORD $0xa9bc40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 3 + + SUB $64, R3, R3 // message length - 64bytes, then compare with 64bytes + VEOR V8.B16, V15.B16, V8.B16 + VEOR V9.B16, V16.B16, V9.B16 + CBNZ R3, blockloop + +sm3ret: + VST1 [V8.S4, V9.S4], (R0) // store hash value H + RET diff --git a/sm4/asm_amd64.s b/sm4/asm_amd64.s index 929a1c8..3c80ecd 100644 --- a/sm4/asm_amd64.s +++ b/sm4/asm_amd64.s @@ -290,7 +290,7 @@ GLOBL fk_mask<>(SB), RODATA, $16 AVX_SM4_TAO_L1(x, y); \ VPXOR x, t0, t0 -// func expandKeyAsm(key *byte, ck, enc, dec *uint32) +// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int) TEXT ·expandKeyAsm(SB),NOSPLIT,$0 MOVQ key+0(FP), AX MOVQ ck+8(FP), BX @@ -321,7 +321,7 @@ loop: expand_end: RET -// func encryptBlocksAsm(xk *uint32, dst, src []byte) +// func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int) TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0 MOVQ xk+0(FP), AX MOVQ dst+8(FP), BX @@ -497,7 +497,7 @@ avx2_sm4_done: VZEROUPPER RET -// func encryptBlockAsm(xk *uint32, dst, src *byte) +// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int) TEXT ·encryptBlockAsm(SB),NOSPLIT,$0 MOVQ xk+0(FP), AX MOVQ dst+8(FP), BX diff --git a/sm4/asm_arm64.s b/sm4/asm_arm64.s index 97f0163..9e26b04 100644 --- a/sm4/asm_arm64.s +++ b/sm4/asm_arm64.s @@ -164,13 +164,44 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16 VMOV R0, R24_MASK.D[0] \ VMOV R1, 
R24_MASK.D[1] -// func expandKeyAsm(key *byte, ck, enc, dec *uint32) +#define SM4EKEY_EXPORT_KEYS() \ + VMOV V9.S[3], V10.S[0] \ + VMOV V9.S[2], V10.S[1] \ + VMOV V9.S[1], V10.S[2] \ + VMOV V9.S[0], V10.S[3] \ + VMOV V8.S[3], V11.S[0] \ + VMOV V8.S[2], V11.S[1] \ + VMOV V8.S[1], V11.S[2] \ + VMOV V8.S[0], V11.S[3] \ + VST1.P [V8.S4, V9.S4], 32(R10) \ + VST1 [V10.S4, V11.S4], (R11) \ + SUB $32, R11, R11 + +#define SM4E_ROUND() \ + VLD1.P 16(R10), [V8.B16] \ + VREV32 V8.B16, V8.B16 \ + WORD $0x0884c0ce \ + WORD $0x2884c0ce \ + WORD $0x4884c0ce \ + WORD $0x6884c0ce \ + WORD $0x8884c0ce \ + WORD $0xa884c0ce \ + WORD $0xc884c0ce \ + WORD $0xe884c0ce \ + VREV32 V8.B16, V8.B16 \ + VST1.P [V8.B16], 16(R9) + +// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int) TEXT ·expandKeyAsm(SB),NOSPLIT,$0 MOVD key+0(FP), R8 MOVD ck+8(FP), R9 MOVD enc+16(FP), R10 MOVD dec+24(FP), R11 - + MOVD inst+32(FP), R12 + + CMP $1, R12 + BEQ sm4ekey + load_global_data_1() VLD1 (R8), [t0.B16] @@ -193,14 +224,46 @@ ksLoop: ADD $16, R0 CMP $128, R0 BNE ksLoop - RET -// func encryptBlocksAsm(xk *uint32, dst, src []byte) +sm4ekey: + LDP fk_mask<>(SB), (R0, R1) + VMOV R0, FK_MASK.D[0] + VMOV R1, FK_MASK.D[1] + VLD1 (R8), [V9.B16] + VREV32 V9.B16, V9.B16 + VEOR FK_MASK.B16, V9.B16, V9.B16 + ADD $96, R11 + + VLD1.P 64(R9), [V0.S4, V1.S4, V2.S4, V3.S4] + WORD $0x28c960ce //SM4EKEY V8.4S, V9.4S, V0.4S + WORD $0x09c961ce //SM4EKEY V9.4S, V8.4S, V1.4S + SM4EKEY_EXPORT_KEYS() + + WORD $0x28c962ce //SM4EKEY V8.4S, V9.4S, V2.4S + WORD $0x09c963ce //SM4EKEY V9.4S, V8.4S, V3.4S + SM4EKEY_EXPORT_KEYS() + + VLD1.P 64(R9), [V0.S4, V1.S4, V2.S4, V3.S4] + WORD $0x28c960ce //SM4EKEY V8.4S, V9.4S, V0.4S + WORD $0x09c961ce //SM4EKEY V9.4S, V8.4S, V1.4S + SM4EKEY_EXPORT_KEYS() + + WORD $0x28c962ce //SM4EKEY V8.4S, V9.4S, V2.4S + WORD $0x09c963ce //SM4EKEY V9.4S, V8.4S, V3.4S + SM4EKEY_EXPORT_KEYS() + RET + +// func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int) TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0 MOVD xk+0(FP), R8 MOVD dst+8(FP), R9 MOVD src+32(FP), R10 + MOVD src_len+40(FP), R12 + MOVD inst+56(FP), R11 + + CMP $1, R11 + BEQ sm4niblocks VLD1 (R10), [V5.S4, V6.S4, V7.S4, V8.S4] VMOV V5.S[0], t0.S[0] @@ -271,15 +334,26 @@ encryptBlocksLoop: VMOV t1.S[3], V8.S[2] VMOV t0.S[3], V8.S[3] VST1 [V8.B16], (R9) - RET +sm4niblocks: + VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4] + VLD1.P 64(R8), [V4.S4, V5.S4, V6.S4, V7.S4] +sm4niblockloop: + SM4E_ROUND() + SUB $16, R12, R12 // message length - 16bytes, then compare with 16bytes + CBNZ R12, sm4niblockloop + RET -// func encryptBlockAsm(xk *uint32, dst, src *byte) +// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int) TEXT ·encryptBlockAsm(SB),NOSPLIT,$0 MOVD xk+0(FP), R8 MOVD dst+8(FP), R9 MOVD src+16(FP), R10 + MOVD inst+24(FP), R11 + + CMP $1, R11 + BEQ sm4niblock VLD1 (R10), [t0.S4] VREV32 t0.B16, t0.B16 @@ -312,5 +386,21 @@ encryptBlockLoop: VMOV t1.S[0], V8.S[2] VMOV t0.S[0], V8.S[3] VST1 [V8.B16], (R9) + RET +sm4niblock: + VLD1 (R10), [V8.B16] + VREV32 V8.B16, V8.B16 + VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4] + WORD $0x0884c0ce //SM4E V8.4S, V0.4S + WORD $0x2884c0ce //SM4E V8.4S, V1.4S + WORD $0x4884c0ce //SM4E V8.4S, V2.4S + WORD $0x6884c0ce //SM4E V8.4S, V3.4S + VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4] + WORD $0x0884c0ce //SM4E V8.4S, V0.4S + WORD $0x2884c0ce //SM4E V8.4S, V1.4S + WORD $0x4884c0ce //SM4E V8.4S, V2.4S + WORD $0x6884c0ce //SM4E V8.4S, V3.4S + VREV32 V8.B16, V8.B16 + VST1 [V8.B16], (R9) RET diff --git a/sm4/cipher_asm.go b/sm4/cipher_asm.go index 
ad08b5e..b05775a 100644 --- a/sm4/cipher_asm.go +++ b/sm4/cipher_asm.go @@ -15,14 +15,19 @@ var supportsAES = cpu.X86.HasAES || cpu.ARM64.HasAES var supportsGFMUL = cpu.X86.HasPCLMULQDQ || cpu.ARM64.HasPMULL var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2 -//go:noescape -func encryptBlocksAsm(xk *uint32, dst, src []byte) +const ( + INST_AES int = iota + INST_SM4 +) //go:noescape -func encryptBlockAsm(xk *uint32, dst, src *byte) +func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int) //go:noescape -func expandKeyAsm(key *byte, ck, enc, dec *uint32) +func encryptBlockAsm(xk *uint32, dst, src *byte, inst int) + +//go:noescape +func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int) type sm4CipherAsm struct { sm4Cipher @@ -30,24 +35,66 @@ type sm4CipherAsm struct { blocksSize int } +type sm4CipherNI struct { + sm4Cipher +} + +func newCipherNI(key []byte) (cipher.Block, error) { + c := &sm4CipherNI{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}} + expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0], INST_SM4) + if supportsGFMUL { + return &sm4CipherNIGCM{c}, nil + } + return c, nil +} + +func (c *sm4CipherNI) Encrypt(dst, src []byte) { + if len(src) < BlockSize { + panic("sm4: input not full block") + } + if len(dst) < BlockSize { + panic("sm4: output not full block") + } + if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) { + panic("sm4: invalid buffer overlap") + } + encryptBlockAsm(&c.enc[0], &dst[0], &src[0], INST_SM4) +} + +func (c *sm4CipherNI) Decrypt(dst, src []byte) { + if len(src) < BlockSize { + panic("sm4: input not full block") + } + if len(dst) < BlockSize { + panic("sm4: output not full block") + } + if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) { + panic("sm4: invalid buffer overlap") + } + encryptBlockAsm(&c.dec[0], &dst[0], &src[0], INST_SM4) +} + func newCipher(key []byte) (cipher.Block, error) { + if supportSM4 { + return newCipherNI(key) + } + if !supportsAES { return newCipherGeneric(key) } + blocks := 4 if useAVX2 { blocks = 8 } - c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, blocks, blocks * BlockSize} - expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0]) - if supportsAES && supportsGFMUL { + c := &sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, blocks, blocks * BlockSize} + expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0], INST_AES) + if supportsGFMUL { return &sm4CipherGCM{c}, nil } - return &c, nil + return c, nil } -func (c *sm4CipherAsm) BlockSize() int { return BlockSize } - func (c *sm4CipherAsm) Concurrency() int { return c.batchBlocks } func (c *sm4CipherAsm) Encrypt(dst, src []byte) { @@ -60,7 +107,7 @@ func (c *sm4CipherAsm) Encrypt(dst, src []byte) { if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) { panic("sm4: invalid buffer overlap") } - encryptBlockAsm(&c.enc[0], &dst[0], &src[0]) + encryptBlockAsm(&c.enc[0], &dst[0], &src[0], INST_AES) } func (c *sm4CipherAsm) EncryptBlocks(dst, src []byte) { @@ -73,7 +120,7 @@ func (c *sm4CipherAsm) EncryptBlocks(dst, src []byte) { if subtle.InexactOverlap(dst[:c.blocksSize], src[:c.blocksSize]) { panic("sm4: invalid buffer overlap") } - encryptBlocksAsm(&c.enc[0], dst, src) + encryptBlocksAsm(&c.enc[0], dst, src, INST_AES) } func (c *sm4CipherAsm) Decrypt(dst, src []byte) { @@ -86,7 +133,7 @@ func (c *sm4CipherAsm) Decrypt(dst, src []byte) { if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) { panic("sm4: invalid buffer overlap") } - encryptBlockAsm(&c.dec[0], &dst[0], &src[0]) + 
encryptBlockAsm(&c.dec[0], &dst[0], &src[0], INST_AES) } func (c *sm4CipherAsm) DecryptBlocks(dst, src []byte) { @@ -99,14 +146,16 @@ func (c *sm4CipherAsm) DecryptBlocks(dst, src []byte) { if subtle.InexactOverlap(dst[:c.blocksSize], src[:c.blocksSize]) { panic("sm4: invalid buffer overlap") } - encryptBlocksAsm(&c.dec[0], dst, src) + encryptBlocksAsm(&c.dec[0], dst, src, INST_AES) } // expandKey is used by BenchmarkExpand to ensure that the asm implementation // of key expansion is used for the benchmark when it is available. func expandKey(key []byte, enc, dec []uint32) { - if supportsAES { - expandKeyAsm(&key[0], &ck[0], &enc[0], &dec[0]) + if supportSM4 { + expandKeyAsm(&key[0], &ck[0], &enc[0], &dec[0], INST_SM4) + } else if supportsAES { + expandKeyAsm(&key[0], &ck[0], &enc[0], &dec[0], INST_AES) } else { expandKeyGo(key, enc, dec) } diff --git a/sm4/cipher_asm_fuzzy_test.go b/sm4/cipher_asm_fuzzy_test.go index 22c3821..b3894cd 100644 --- a/sm4/cipher_asm_fuzzy_test.go +++ b/sm4/cipher_asm_fuzzy_test.go @@ -34,7 +34,7 @@ func TestExpandKey(t *testing.T) { } io.ReadFull(rand.Reader, key) expandKeyGo(key, encRes1, decRes1) - expandKeyAsm(&key[0], &ck[0], &encRes2[0], &decRes2[0]) + expandKey(key, encRes2, decRes2) if !reflect.DeepEqual(encRes1, encRes2) { t.Errorf("expected=%v, result=%v\n", encRes1, encRes2) } diff --git a/sm4/gcm_amd64.s b/sm4/gcm_amd64.s index 9a4235c..a89df14 100644 --- a/sm4/gcm_amd64.s +++ b/sm4/gcm_amd64.s @@ -2201,3 +2201,11 @@ avx2GcmSm4DecDone: VMOVDQU ACC0, (tPtr) VZEROUPPER RET + +// func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) +TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0 + RET + +// func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) +TEXT ·gcmSm4niDec(SB),NOSPLIT,$0 + RET diff --git a/sm4/gcm_arm64.s b/sm4/gcm_arm64.s index 2915314..ad496a5 100644 --- a/sm4/gcm_arm64.s +++ b/sm4/gcm_arm64.s @@ -252,7 +252,7 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0 SM4_TAO_L1(x, y, z); \ VEOR x.B16, t0.B16, t0.B16 -// func gcmSm4Init(productTable *[256]byte, rk []uint32) +// func gcmSm4Init(productTable *[256]byte, rk []uint32, inst int) TEXT ·gcmSm4Init(SB),NOSPLIT,$0 #define pTbl R0 #define RK R1 @@ -260,6 +260,7 @@ TEXT ·gcmSm4Init(SB),NOSPLIT,$0 MOVD productTable+0(FP), pTbl MOVD rk+8(FP), RK + MOVD inst+16(FP), R5 MOVD $0xC2, I LSL $56, I @@ -269,6 +270,9 @@ TEXT ·gcmSm4Init(SB),NOSPLIT,$0 VEOR ZERO.B16, ZERO.B16, ZERO.B16 // Encrypt block 0 with the SM4 keys to generate the hash key H + CMP $1, R5 + BEQ sm4InitSM4E + LOAD_SM4_AESNI_CONSTS() VEOR B0.B16, B0.B16, B0.B16 VEOR B1.B16, B1.B16, B1.B16 @@ -290,7 +294,22 @@ sm4InitEncLoop: VMOV B1.S[0], B0.S[3] VMOV B2.S[0], B0.S[0] VMOV B3.S[0], B0.S[1] - + B sm4InitEncDone +sm4InitSM4E: + VEOR B0.B16, B0.B16, B0.B16 + VLD1.P 64(RK), [T0.S4, T1.S4, T2.S4, T3.S4] + WORD $0x6085c0ce //SM4E V0.4S, V11.4S + WORD $0x8085c0ce //SM4E V0.4S, V12.4S + WORD $0xa085c0ce //SM4E V0.4S, V13.4S + WORD $0xc085c0ce //SM4E V0.4S, V14.4S + VLD1.P 64(RK), [T0.S4, T1.S4, T2.S4, T3.S4] + WORD $0x6085c0ce //SM4E V0.4S, V11.4S + WORD $0x8085c0ce //SM4E V0.4S, V12.4S + WORD $0xa085c0ce //SM4E V0.4S, V13.4S + WORD $0xc085c0ce //SM4E V0.4S, V14.4S + VREV32 B0.B16, B0.B16 + VREV64 B0.B16, B0.B16 +sm4InitEncDone: // Multiply by 2 modulo P VMOV B0.D[0], I ASR $63, I @@ -547,6 +566,7 @@ TEXT ·gcmSm4Enc(SB),NOSPLIT,$0 VMOV H0, INC.S[3] VREV32 CTR.B16, CTR.B16 VADD CTR.S4, INC.S4, CTR.S4 + // Skip to <8 blocks loop CMP $128, srcPtrLen @@ -587,7 +607,7 @@ encOctetsEnc4Blocks1: VREV32 
B2.B16, B2.B16 VREV32 B3.B16, B3.B16 TRANSPOSE_MATRIX(B0, B1, B2, B3, K0) - // encryption first 4 blocks + // encryption second 4 blocks PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0) MOVD rkSave, rk @@ -880,7 +900,7 @@ decOctetsEnc4Blocks1: VREV32 B3.B16, B3.B16 TRANSPOSE_MATRIX(T1, T2, B2, B3, K0) - // encryption first 4 blocks + // encryption second 4 blocks PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0) MOVD rkSave, rk diff --git a/sm4/gcm_sm4ni_arm64.s b/sm4/gcm_sm4ni_arm64.s new file mode 100644 index 0000000..dbde380 --- /dev/null +++ b/sm4/gcm_sm4ni_arm64.s @@ -0,0 +1,525 @@ +#include "textflag.h" + +#define B0 V0 +#define B1 V1 +#define B2 V2 +#define B3 V3 +#define B4 V4 +#define B5 V5 +#define B6 V6 +#define B7 V7 + +#define ACC0 V8 +#define ACC1 V9 +#define ACCM V10 + +#define T0 V11 +#define T1 V12 +#define T2 V13 +#define T3 V14 + +#define POLY V15 +#define ZERO V16 +#define INC V17 +#define CTR V18 + +#define K0 V19 +#define K1 V20 +#define K2 V21 +#define K3 V22 +#define K4 V23 +#define K5 V24 +#define K6 V25 +#define K7 V26 + +#define reduce() \ + VEOR ACC0.B16, ACCM.B16, ACCM.B16 \ + VEOR ACC1.B16, ACCM.B16, ACCM.B16 \ + VEXT $8, ZERO.B16, ACCM.B16, T0.B16 \ + VEXT $8, ACCM.B16, ZERO.B16, ACCM.B16 \ + VEOR ACCM.B16, ACC0.B16, ACC0.B16 \ + VEOR T0.B16, ACC1.B16, ACC1.B16 \ + VPMULL POLY.D1, ACC0.D1, T0.Q1 \ + VEXT $8, ACC0.B16, ACC0.B16, ACC0.B16 \ + VEOR T0.B16, ACC0.B16, ACC0.B16 \ + VPMULL POLY.D1, ACC0.D1, T0.Q1 \ + VEOR T0.B16, ACC1.B16, ACC1.B16 \ + VEXT $8, ACC1.B16, ACC1.B16, ACC1.B16 \ + VEOR ACC1.B16, ACC0.B16, ACC0.B16 \ + +#define mulRound(X) \ + VLD1.P 32(pTbl), [T1.B16, T2.B16] \ + VREV64 X.B16, X.B16 \ + VEXT $8, X.B16, X.B16, T0.B16 \ + VEOR X.B16, T0.B16, T0.B16 \ + VPMULL X.D1, T1.D1, T3.Q1 \ + VEOR T3.B16, ACC1.B16, ACC1.B16 \ + VPMULL2 X.D2, T1.D2, T3.Q1 \ + VEOR T3.B16, ACC0.B16, ACC0.B16 \ + VPMULL T0.D1, T2.D1, T3.Q1 \ + VEOR T3.B16, ACCM.B16, ACCM.B16 + +#define sm4eEnc1block() \ + WORD $0x6086c0ce \ //SM4E V0.4S, V19.4S + WORD $0x8086c0ce \ //SM4E V0.4S, V20.4S + WORD $0xa086c0ce \ //SM4E V0.4S, V21.4S + WORD $0xc086c0ce \ //SM4E V0.4S, V22.4S + WORD $0xe086c0ce \ //SM4E V0.4S, V23.4S + WORD $0x0087c0ce \ //SM4E V0.4S, V24.4S + WORD $0x2087c0ce \ //SM4E V0.4S, V25.4S + WORD $0x4087c0ce //SM4E V0.4S, V26.4S + +#define sm4eEnc8blocks() \ + sm4eEnc1block() \ + WORD $0x6186c0ce \ //SM4E V1.4S, V19.4S + WORD $0x8186c0ce \ //SM4E V1.4S, V20.4S + WORD $0xa186c0ce \ //SM4E V1.4S, V21.4S + WORD $0xc186c0ce \ //SM4E V1.4S, V22.4S + WORD $0xe186c0ce \ //SM4E V1.4S, V23.4S + WORD $0x0187c0ce \ //SM4E V1.4S, V24.4S + WORD $0x2187c0ce \ //SM4E V1.4S, V25.4S + WORD $0x4187c0ce \ //SM4E V1.4S, V26.4S + WORD $0x6286c0ce \ //SM4E V2.4S, V19.4S + WORD $0x8286c0ce \ //SM4E V2.4S, V20.4S + WORD $0xa286c0ce \ //SM4E V2.4S, V21.4S + WORD $0xc286c0ce \ //SM4E V2.4S, V22.4S + WORD $0xe286c0ce \ //SM4E V2.4S, V23.4S + WORD $0x0287c0ce \ //SM4E V2.4S, V24.4S + WORD $0x2287c0ce \ //SM4E V2.4S, V25.4S + WORD $0x4287c0ce \ //SM4E V2.4S, V26.4S + WORD $0x6386c0ce \ //SM4E V3.4S, V19.4S + WORD $0x8386c0ce \ //SM4E V3.4S, V20.4S + WORD $0xa386c0ce \ //SM4E V3.4S, V21.4S + WORD $0xc386c0ce \ //SM4E V3.4S, V22.4S + WORD $0xe386c0ce \ //SM4E V3.4S, V23.4S + WORD $0x0387c0ce \ //SM4E V3.4S, V24.4S + WORD $0x2387c0ce \ //SM4E V3.4S, V25.4S + WORD $0x4387c0ce \ //SM4E V3.4S, V26.4S + WORD $0x6486c0ce \ //SM4E V4.4S, V19.4S + WORD $0x8486c0ce \ //SM4E V4.4S, V20.4S + WORD $0xa486c0ce \ //SM4E V4.4S, V21.4S + WORD $0xc486c0ce \ //SM4E V4.4S, V22.4S + WORD $0xe486c0ce \ //SM4E V4.4S, V23.4S + WORD 
$0x0487c0ce \ //SM4E V4.4S, V24.4S + WORD $0x2487c0ce \ //SM4E V4.4S, V25.4S + WORD $0x4487c0ce \ //SM4E V4.4S, V26.4S + WORD $0x6586c0ce \ //SM4E V5.4S, V19.4S + WORD $0x8586c0ce \ //SM4E V5.4S, V20.4S + WORD $0xa586c0ce \ //SM4E V5.4S, V21.4S + WORD $0xc586c0ce \ //SM4E V5.4S, V22.4S + WORD $0xe586c0ce \ //SM4E V5.4S, V23.4S + WORD $0x0587c0ce \ //SM4E V5.4S, V24.4S + WORD $0x2587c0ce \ //SM4E V5.4S, V25.4S + WORD $0x4587c0ce \ //SM4E V5.4S, V26.4S + WORD $0x6686c0ce \ //SM4E V6.4S, V19.4S + WORD $0x8686c0ce \ //SM4E V6.4S, V20.4S + WORD $0xa686c0ce \ //SM4E V6.4S, V21.4S + WORD $0xc686c0ce \ //SM4E V6.4S, V22.4S + WORD $0xe686c0ce \ //SM4E V6.4S, V23.4S + WORD $0x0687c0ce \ //SM4E V6.4S, V24.4S + WORD $0x2687c0ce \ //SM4E V6.4S, V25.4S + WORD $0x4687c0ce \ //SM4E V6.4S, V26.4S + WORD $0x6786c0ce \ //SM4E V7.4S, V19.4S + WORD $0x8786c0ce \ //SM4E V7.4S, V20.4S + WORD $0xa786c0ce \ //SM4E V7.4S, V21.4S + WORD $0xc786c0ce \ //SM4E V7.4S, V22.4S + WORD $0xe786c0ce \ //SM4E V7.4S, V23.4S + WORD $0x0787c0ce \ //SM4E V7.4S, V24.4S + WORD $0x2787c0ce \ //SM4E V7.4S, V25.4S + WORD $0x4787c0ce //SM4E V7.4S, V26.4S + +// func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) +TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0 +#define pTbl R0 +#define dstPtr R1 +#define ctrPtr R2 +#define srcPtr R3 +#define rk R4 +#define tPtr R5 +#define srcPtrLen R6 +#define aluCTR R7 +#define aluTMP R8 +#define H0 R9 +#define H1 R10 +#define pTblSave R11 +#define rkSave R12 + MOVD productTable+0(FP), pTbl + MOVD dst+8(FP), dstPtr + MOVD src_base+32(FP), srcPtr + MOVD src_len+40(FP), srcPtrLen + MOVD ctr+56(FP), ctrPtr + MOVD T+64(FP), tPtr + MOVD rk_base+72(FP), rk + + MOVD $0xC2, H1 + LSL $56, H1 + MOVD $1, H0 + VMOV H1, POLY.D[0] + VMOV H0, POLY.D[1] + VEOR ZERO.B16, ZERO.B16, ZERO.B16 + + MOVD pTbl, pTblSave + // Current tag, after AAD + VLD1 (tPtr), [ACC0.B16] + VEOR ACC1.B16, ACC1.B16, ACC1.B16 + VEOR ACCM.B16, ACCM.B16, ACCM.B16 + // Prepare initial counter, and the increment vector + VLD1 (ctrPtr), [CTR.B16] + VEOR INC.B16, INC.B16, INC.B16 + MOVD $1, H0 + VMOV H0, INC.S[3] + VREV32 CTR.B16, CTR.B16 + VADD CTR.S4, INC.S4, CTR.S4 + + // Skip to <8 blocks loop + CMP $128, srcPtrLen + + MOVD rk, H0 + // For SM4 round keys are stored in: K0 .. 
K7 + VLD1.P 64(H0), [K0.S4, K1.S4, K2.S4, K3.S4] + VLD1.P 64(H0), [K4.S4, K5.S4, K6.S4, K7.S4] + + BLT startSingles +octetsLoop: + SUB $128, srcPtrLen + // Prepare 8 counters + VMOV CTR.B16, B0.B16 + VADD B0.S4, INC.S4, B1.S4 + VADD B1.S4, INC.S4, B2.S4 + VADD B2.S4, INC.S4, B3.S4 + VADD B3.S4, INC.S4, B4.S4 + VADD B4.S4, INC.S4, B5.S4 + VADD B5.S4, INC.S4, B6.S4 + VADD B6.S4, INC.S4, B7.S4 + VADD B7.S4, INC.S4, CTR.S4 + + sm4eEnc8blocks() + VREV32 B0.B16, B0.B16 + VREV32 B1.B16, B1.B16 + VREV32 B2.B16, B2.B16 + VREV32 B3.B16, B3.B16 + VREV32 B4.B16, B4.B16 + VREV32 B5.B16, B5.B16 + VREV32 B6.B16, B6.B16 + VREV32 B7.B16, B7.B16 + + // XOR plaintext and store ciphertext + VLD1.P 32(srcPtr), [T1.B16, T2.B16] + VEOR B0.B16, T1.B16, B0.B16 + VEOR B1.B16, T2.B16, B1.B16 + VST1.P [B0.B16, B1.B16], 32(dstPtr) + VLD1.P 32(srcPtr), [T1.B16, T2.B16] + VEOR B2.B16, T1.B16, B2.B16 + VEOR B3.B16, T2.B16, B3.B16 + VST1.P [B2.B16, B3.B16], 32(dstPtr) + VLD1.P 32(srcPtr), [T1.B16, T2.B16] + VEOR B4.B16, T1.B16, B4.B16 + VEOR B5.B16, T2.B16, B5.B16 + VST1.P [B4.B16, B5.B16], 32(dstPtr) + VLD1.P 32(srcPtr), [T1.B16, T2.B16] + VEOR B6.B16, T1.B16, B6.B16 + VEOR B7.B16, T2.B16, B7.B16 + VST1.P [B6.B16, B7.B16], 32(dstPtr) + + VLD1.P 32(pTbl), [T1.B16, T2.B16] + VREV64 B0.B16, B0.B16 + VEOR ACC0.B16, B0.B16, B0.B16 + VEXT $8, B0.B16, B0.B16, T0.B16 + VEOR B0.B16, T0.B16, T0.B16 + VPMULL B0.D1, T1.D1, ACC1.Q1 + VPMULL2 B0.D2, T1.D2, ACC0.Q1 + VPMULL T0.D1, T2.D1, ACCM.Q1 + + mulRound(B1) + mulRound(B2) + mulRound(B3) + mulRound(B4) + mulRound(B5) + mulRound(B6) + mulRound(B7) + MOVD pTblSave, pTbl + reduce() + + CMP $128, srcPtrLen + BGE octetsLoop + +startSingles: + CBZ srcPtrLen, done + ADD $14*16, pTbl + // Preload H and its Karatsuba precomp + VLD1.P (pTbl), [T1.B16, T2.B16] + +singlesLoop: + CMP $16, srcPtrLen + BLT tail + SUB $16, srcPtrLen + + VMOV CTR.B16, B0.B16 + VADD CTR.S4, INC.S4, CTR.S4 + sm4eEnc1block() + VREV32 B0.B16, B0.B16 + +singlesLast: + VLD1.P 16(srcPtr), [T0.B16] + VEOR T0.B16, B0.B16, B0.B16 + +encReduce: + VST1.P [B0.B16], 16(dstPtr) + + VREV64 B0.B16, B0.B16 + VEOR ACC0.B16, B0.B16, B0.B16 + + VEXT $8, B0.B16, B0.B16, T0.B16 + VEOR B0.B16, T0.B16, T0.B16 + VPMULL B0.D1, T1.D1, ACC1.Q1 + VPMULL2 B0.D2, T1.D2, ACC0.Q1 + VPMULL T0.D1, T2.D1, ACCM.Q1 + + reduce() + + B singlesLoop +tail: + CBZ srcPtrLen, done + + VEOR T0.B16, T0.B16, T0.B16 + VEOR T3.B16, T3.B16, T3.B16 + MOVD $0, H1 + SUB $1, H1 + ADD srcPtrLen, srcPtr + + TBZ $3, srcPtrLen, ld4 + MOVD.W -8(srcPtr), H0 + VMOV H0, T0.D[0] + VMOV H1, T3.D[0] + +ld4: + TBZ $2, srcPtrLen, ld2 + MOVW.W -4(srcPtr), H0 + VEXT $12, T0.B16, ZERO.B16, T0.B16 + VEXT $12, T3.B16, ZERO.B16, T3.B16 + VMOV H0, T0.S[0] + VMOV H1, T3.S[0] +ld2: + TBZ $1, srcPtrLen, ld1 + MOVH.W -2(srcPtr), H0 + VEXT $14, T0.B16, ZERO.B16, T0.B16 + VEXT $14, T3.B16, ZERO.B16, T3.B16 + VMOV H0, T0.H[0] + VMOV H1, T3.H[0] +ld1: + TBZ $0, srcPtrLen, ld0 + MOVB.W -1(srcPtr), H0 + VEXT $15, T0.B16, ZERO.B16, T0.B16 + VEXT $15, T3.B16, ZERO.B16, T3.B16 + VMOV H0, T0.B[0] + VMOV H1, T3.B[0] +ld0: + MOVD ZR, srcPtrLen + VMOV CTR.B16, B0.B16 + sm4eEnc1block() + VREV32 B0.B16, B0.B16 + +tailLast: + VEOR T0.B16, B0.B16, B0.B16 + VAND T3.B16, B0.B16, B0.B16 + B encReduce + +done: + VST1 [ACC0.B16], (tPtr) + RET + +// func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) +TEXT ·gcmSm4niDec(SB),NOSPLIT,$0 + MOVD productTable+0(FP), pTbl + MOVD dst+8(FP), dstPtr + MOVD src_base+32(FP), srcPtr + MOVD src_len+40(FP), srcPtrLen + MOVD ctr+56(FP), ctrPtr 
+ MOVD T+64(FP), tPtr + MOVD rk_base+72(FP), rk + + MOVD $0xC2, H1 + LSL $56, H1 + MOVD $1, H0 + VMOV H1, POLY.D[0] + VMOV H0, POLY.D[1] + VEOR ZERO.B16, ZERO.B16, ZERO.B16 + + MOVD pTbl, pTblSave + MOVD rk, rkSave + // Current tag, after AAD + VLD1 (tPtr), [ACC0.B16] + VEOR ACC1.B16, ACC1.B16, ACC1.B16 + VEOR ACCM.B16, ACCM.B16, ACCM.B16 + // Prepare initial counter, and the increment vector + VLD1 (ctrPtr), [CTR.B16] + VEOR INC.B16, INC.B16, INC.B16 + MOVD $1, H0 + VMOV H0, INC.S[3] + VREV32 CTR.B16, CTR.B16 + VADD CTR.S4, INC.S4, CTR.S4 + + // Skip to <8 blocks loop + CMP $128, srcPtrLen + + MOVD rk, H0 + // For SM4 round keys are stored in: K0 .. K7 + VLD1.P 64(H0), [K0.S4, K1.S4, K2.S4, K3.S4] + VLD1.P 64(H0), [K4.S4, K5.S4, K6.S4, K7.S4] + + BLT startSingles +octetsLoop: + SUB $128, srcPtrLen + + VMOV CTR.B16, B0.B16 + VADD B0.S4, INC.S4, B1.S4 + VADD B1.S4, INC.S4, B2.S4 + VADD B2.S4, INC.S4, B3.S4 + VADD B3.S4, INC.S4, B4.S4 + VADD B4.S4, INC.S4, B5.S4 + VADD B5.S4, INC.S4, B6.S4 + VADD B6.S4, INC.S4, B7.S4 + VADD B7.S4, INC.S4, CTR.S4 + + sm4eEnc8blocks() + VREV32 B0.B16, T1.B16 + VREV32 B1.B16, T2.B16 + VREV32 B2.B16, B2.B16 + VREV32 B3.B16, B3.B16 + VREV32 B4.B16, B4.B16 + VREV32 B5.B16, B5.B16 + VREV32 B6.B16, B6.B16 + VREV32 B7.B16, B7.B16 + + VLD1.P 32(srcPtr), [B0.B16, B1.B16] + VEOR B0.B16, T1.B16, T1.B16 + VEOR B1.B16, T2.B16, T2.B16 + VST1.P [T1.B16, T2.B16], 32(dstPtr) + + VLD1.P 32(pTbl), [T1.B16, T2.B16] + VREV64 B0.B16, B0.B16 + VEOR ACC0.B16, B0.B16, B0.B16 + VEXT $8, B0.B16, B0.B16, T0.B16 + VEOR B0.B16, T0.B16, T0.B16 + VPMULL B0.D1, T1.D1, ACC1.Q1 + VPMULL2 B0.D2, T1.D2, ACC0.Q1 + VPMULL T0.D1, T2.D1, ACCM.Q1 + mulRound(B1) + + VLD1.P 32(srcPtr), [B0.B16, B1.B16] + VEOR B2.B16, B0.B16, T1.B16 + VEOR B3.B16, B1.B16, T2.B16 + VST1.P [T1.B16, T2.B16], 32(dstPtr) + mulRound(B0) + mulRound(B1) + + VLD1.P 32(srcPtr), [B0.B16, B1.B16] + VEOR B4.B16, B0.B16, T1.B16 + VEOR B5.B16, B1.B16, T2.B16 + VST1.P [T1.B16, T2.B16], 32(dstPtr) + mulRound(B0) + mulRound(B1) + + VLD1.P 32(srcPtr), [B0.B16, B1.B16] + VEOR B6.B16, B0.B16, T1.B16 + VEOR B7.B16, B1.B16, T2.B16 + VST1.P [T1.B16, T2.B16], 32(dstPtr) + mulRound(B0) + mulRound(B1) + + MOVD pTblSave, pTbl + reduce() + + CMP $128, srcPtrLen + BGE octetsLoop + +startSingles: + CBZ srcPtrLen, done + ADD $14*16, pTbl + // Preload H and its Karatsuba precomp + VLD1.P (pTbl), [T1.B16, T2.B16] + +singlesLoop: + CMP $16, srcPtrLen + BLT tail + SUB $16, srcPtrLen + + VLD1.P 16(srcPtr), [T0.B16] + VREV64 T0.B16, B5.B16 + + VMOV CTR.B16, B0.B16 + VADD CTR.S4, INC.S4, CTR.S4 + sm4eEnc1block() + VREV32 B0.B16, B0.B16 + +singlesLast: + VEOR T0.B16, B0.B16, B0.B16 + VST1.P [B0.B16], 16(dstPtr) + + VEOR ACC0.B16, B5.B16, B5.B16 + VEXT $8, B5.B16, B5.B16, T0.B16 + VEOR B5.B16, T0.B16, T0.B16 + VPMULL B5.D1, T1.D1, ACC1.Q1 + VPMULL2 B5.D2, T1.D2, ACC0.Q1 + VPMULL T0.D1, T2.D1, ACCM.Q1 + reduce() + + B singlesLoop +tail: + CBZ srcPtrLen, done + VMOV CTR.B16, B0.B16 + VADD CTR.S4, INC.S4, CTR.S4 + sm4eEnc1block() + VREV32 B0.B16, B0.B16 +tailLast: + // Assuming it is safe to load past dstPtr due to the presence of the tag + // B5 stored last ciphertext + VLD1 (srcPtr), [B5.B16] + + VEOR B5.B16, B0.B16, B0.B16 + + VEOR T3.B16, T3.B16, T3.B16 + MOVD $0, H1 + SUB $1, H1 + + TBZ $3, srcPtrLen, ld4 // Test if srcPtrLen < 8, if yes, goto ld4 + VMOV B0.D[0], H0 + MOVD.P H0, 8(dstPtr) + VMOV H1, T3.D[0] + VEXT $8, ZERO.B16, B0.B16, B0.B16 +ld4: + TBZ $2, srcPtrLen, ld2 // Test if srcPtrLen < 4, if yes, goto ld2 + VMOV B0.S[0], H0 + MOVW.P H0, 4(dstPtr) + 
VEXT $12, T3.B16, ZERO.B16, T3.B16 + VMOV H1, T3.S[0] + VEXT $4, ZERO.B16, B0.B16, B0.B16 +ld2: + TBZ $1, srcPtrLen, ld1 // Test if srcPtrLen < 2, if yes, goto ld1 + VMOV B0.H[0], H0 + MOVH.P H0, 2(dstPtr) + VEXT $14, T3.B16, ZERO.B16, T3.B16 + VMOV H1, T3.H[0] + VEXT $2, ZERO.B16, B0.B16, B0.B16 +ld1: + TBZ $0, srcPtrLen, ld0 // Test if srcPtrLen < 1, if yes, goto ld0 + VMOV B0.B[0], H0 + MOVB.P H0, 1(dstPtr) + VEXT $15, T3.B16, ZERO.B16, T3.B16 + VMOV H1, T3.B[0] +ld0: + + VAND T3.B16, B5.B16, B5.B16 + VREV64 B5.B16, B5.B16 + + VEOR ACC0.B16, B5.B16, B5.B16 + VEXT $8, B5.B16, B5.B16, T0.B16 + VEOR B5.B16, T0.B16, T0.B16 + VPMULL B5.D1, T1.D1, ACC1.Q1 + VPMULL2 B5.D2, T1.D2, ACC0.Q1 + VPMULL T0.D1, T2.D1, ACCM.Q1 + reduce() +done: + VST1 [ACC0.B16], (tPtr) + + RET diff --git a/sm4/gen_arm64_ni.go b/sm4/gen_arm64_ni.go new file mode 100644 index 0000000..0db7724 --- /dev/null +++ b/sm4/gen_arm64_ni.go @@ -0,0 +1,137 @@ +// Not used yet!!! +// go run gen_arm64_ni.go + +//go:build ignore +// +build ignore + +package main + +import ( + "bytes" + "fmt" + "log" + "math/bits" + "os" +) + +//SM4E .4S, .4S +func sm4e(Vd, Vn byte) uint32 { + inst := uint32(0xcec08400) | uint32(Vd&0x1f) | uint32(Vn&0x1f)<<5 + return bits.ReverseBytes32(inst) +} + +//SM4EKEY .4S, .4S, .4S +func sm4ekey(Vd, Vn, Vm byte) uint32 { + inst := uint32(0xce60c800) | uint32(Vd&0x1f) | uint32(Vn&0x1f)<<5 | (uint32(Vm&0x1f) << 16) + return bits.ReverseBytes32(inst) +} + +func sm4ekeyRound(buf *bytes.Buffer, d, n, m byte) { + fmt.Fprintf(buf, "\tWORD $0x%08x //SM4EKEY V%d.4S, V%d.4S, V%d.4S\n", sm4ekey(d, n, m), d, n, m) +} + +func sm4eRound(buf *bytes.Buffer, d, n byte) { + fmt.Fprintf(buf, "\tWORD $0x%08x //SM4E V%d.4S, V%d.4S\n", sm4e(d, n), d, n) +} + +func main() { + buf := new(bytes.Buffer) + fmt.Fprint(buf, ` +// Generated by gen_arm64_ni.go. DO NOT EDIT. 
+ +#include "textflag.h" + +// func expandKeySM4E(key *byte, fk, ck, enc *uint32) +TEXT ·expandKeySM4E(SB),NOSPLIT,$0 + MOVD key+0(FP), R8 + MOVD fk+8(FP), R9 + MOVD ck+16(FP), R10 + MOVD enc+24(FP), R11 + + VLD1 (R8), [V9.B16] + VREV32 V9.B16, V9.B16 + VLD1 (R9), [V8.S4] + VEOR V9, V8, V9 + VLD1.P 64(R10), [V0.S4, V1.S4, V2.S4, V3.S4] +`[1:]) + + sm4ekeyRound(buf, 8, 9, 0) + sm4ekeyRound(buf, 9, 8, 1) + fmt.Fprintf(buf, "\tVST1.P [V8.S4, V9.S4], 32(R11)\n") + sm4ekeyRound(buf, 8, 9, 2) + sm4ekeyRound(buf, 9, 8, 3) + fmt.Fprintf(buf, "\tVST1.P [V8.S4, V9.S4], 32(R11)\n") + fmt.Fprintf(buf, "\tVLD1.P 64(R10), [V0.S4, V1.S4, V2.S4, V3.S4]\n") + sm4ekeyRound(buf, 8, 9, 0) + sm4ekeyRound(buf, 9, 8, 1) + fmt.Fprintf(buf, "\tVST1.P [V8.S4, V9.S4], 32(R11)\n") + sm4ekeyRound(buf, 8, 9, 2) + sm4ekeyRound(buf, 9, 8, 3) + fmt.Fprintf(buf, ` + VST1.P [V8.S4, V9.S4], 32(R11) + RET +`[1:]) + fmt.Fprint(buf, ` + +// func encryptBlockSM4E(xk *uint32, dst, src *byte) +TEXT ·encryptBlockSM4E(SB),NOSPLIT,$0 + MOVD xk+0(FP), R8 + MOVD dst+8(FP), R9 + MOVD src+16(FP), R10 + + VLD1 (R10), [V8.B16] + VREV32 V8.B16, V8.B16 + VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4] +`[1:]) + sm4eRound(buf, 8, 0) + sm4eRound(buf, 8, 1) + sm4eRound(buf, 8, 2) + sm4eRound(buf, 8, 3) + fmt.Fprintf(buf, "\tVLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]\n") + sm4eRound(buf, 8, 0) + sm4eRound(buf, 8, 1) + sm4eRound(buf, 8, 2) + sm4eRound(buf, 8, 3) + fmt.Fprintf(buf, ` + VREV32 V8.B16, V8.B16 + VST1 [V8.B16], (R9) + RET +`[1:]) + + fmt.Fprint(buf, ` + +// func encryptBlocksSM4E(xk *uint32, dst, src *byte) +TEXT ·encryptBlocksSM4E(SB),NOSPLIT,$0 + MOVD xk+0(FP), R8 + MOVD dst+8(FP), R9 + MOVD src+16(FP), R10 + + VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4] + VLD1.P 64(R8), [V4.S4, V5.S4, V6.S4, V7.S4] + +`[1:]) + for i := 0; i < 4; i++ { + fmt.Fprintf(buf, "\tVLD1.P 16(R10), [V8.B16]\n") + fmt.Fprintf(buf, "\tVREV32 V8.B16, V8.B16\n") + sm4eRound(buf, 8, 0) + sm4eRound(buf, 8, 1) + sm4eRound(buf, 8, 2) + sm4eRound(buf, 8, 3) + sm4eRound(buf, 8, 4) + sm4eRound(buf, 8, 5) + sm4eRound(buf, 8, 6) + sm4eRound(buf, 8, 7) + fmt.Fprintf(buf, "\tVREV32 V8.B16, V8.B16\n") + fmt.Fprintf(buf, "\tVST1.P [V8.B16], 16(R9)\n\n") + } + fmt.Fprintf(buf, ` + RET +`[1:]) + + src := buf.Bytes() + // fmt.Println(string(src)) + err := os.WriteFile("sm4e_arm64.s", src, 0644) + if err != nil { + log.Fatal(err) + } +} diff --git a/sm4/sm4_gcm_asm.go b/sm4/sm4_gcm_asm.go index 515f754..74c5481 100644 --- a/sm4/sm4_gcm_asm.go +++ b/sm4/sm4_gcm_asm.go @@ -12,16 +12,16 @@ import ( // sm4CipherGCM implements crypto/cipher.gcmAble so that crypto/cipher.NewGCM // will use the optimised implementation in this file when possible. Instances -// of this type only exist when hasGCMAsm returns true. +// of this type only exist when hasGCMAsm and hasAES returns true. type sm4CipherGCM struct { - sm4CipherAsm + *sm4CipherAsm } // Assert that sm4CipherGCM implements the gcmAble interface. 
var _ gcmAble = (*sm4CipherGCM)(nil) //go:noescape -func gcmSm4Init(productTable *[256]byte, rk []uint32) +func gcmSm4Init(productTable *[256]byte, rk []uint32, inst int) //go:noescape func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) @@ -35,6 +35,33 @@ func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte) //go:noescape func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64) +// gcmSm4InitInst is used for test +func gcmSm4InitInst(productTable *[256]byte, rk []uint32) { + if supportSM4 { + gcmSm4Init(productTable, rk, INST_SM4) + } else { + gcmSm4Init(productTable, rk, INST_AES) + } +} + +// gcmSm4EncInst is used for test +func gcmSm4EncInst(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) { + if supportSM4 { + gcmSm4niEnc(productTable, dst, src, ctr, T, rk) + } else { + gcmSm4Enc(productTable, dst, src, ctr, T, rk) + } +} + +// gcmSm4DecInst is used for test +func gcmSm4DecInst(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) { + if supportSM4 { + gcmSm4niDec(productTable, dst, src, ctr, T, rk) + } else { + gcmSm4Dec(productTable, dst, src, ctr, T, rk) + } +} + type gcmAsm struct { gcm bytesProductTable [256]byte @@ -44,10 +71,10 @@ type gcmAsm struct { // called by crypto/cipher.NewGCM via the gcmAble interface. func (c *sm4CipherGCM) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) { g := &gcmAsm{} - g.cipher = &c.sm4CipherAsm + g.cipher = c.sm4CipherAsm g.nonceSize = nonceSize g.tagSize = tagSize - gcmSm4Init(&g.bytesProductTable, g.cipher.enc) + gcmSm4Init(&g.bytesProductTable, g.cipher.enc, INST_AES) return g, nil } diff --git a/sm4/sm4_gcm_test.go b/sm4/sm4_gcm_test.go index 219d8ac..36e30bc 100644 --- a/sm4/sm4_gcm_test.go +++ b/sm4/sm4_gcm_test.go @@ -11,12 +11,12 @@ import ( func genPrecomputeTable() *gcmAsm { key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10} - c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, 4, 64} - expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0]) + c := &sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, 4, 64} + expandKey(key, c.enc, c.dec) c1 := &sm4CipherGCM{c} g := &gcmAsm{} - g.cipher = &c1.sm4CipherAsm - gcmSm4Init(&g.bytesProductTable, g.cipher.enc) + g.cipher = c1.sm4CipherAsm + gcmSm4InitInst(&g.bytesProductTable, g.cipher.enc) return g } @@ -145,13 +145,13 @@ func TestBothDataPlaintext(t *testing.T) { func createGcm() *gcmAsm { key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10} - c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, 4, 64} - expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0]) + c := &sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, 4, 64} + expandKey(key, c.enc, c.dec) c1 := &sm4CipherGCM{c} g := &gcmAsm{} - g.cipher = &c1.sm4CipherAsm + g.cipher = c1.sm4CipherAsm g.tagSize = 16 - gcmSm4Init(&g.bytesProductTable, g.cipher.enc) + gcmSm4InitInst(&g.bytesProductTable, g.cipher.enc) return g } @@ -214,7 +214,7 @@ func TestGcmSm4Enc(t *testing.T) { out2 := make([]byte, len(test.plaintext)+gcm.tagSize) gcmSm4Data(&gcm.bytesProductTable, []byte("emmansun"), &tagOut2) - gcmSm4Enc(&gcm.bytesProductTable, out2, []byte(test.plaintext), &counter2, &tagOut2, gcm.cipher.enc) + gcmSm4EncInst(&gcm.bytesProductTable, out2, []byte(test.plaintext), &counter2, &tagOut2, gcm.cipher.enc) if hex.EncodeToString(out1) 
!= hex.EncodeToString(out2) { t.Errorf("#%d: out expected %s, got %s", i, hex.EncodeToString(out1), hex.EncodeToString(out2)) } @@ -244,7 +244,7 @@ func TestGcmSm4Dec(t *testing.T) { out2 := make([]byte, len(test.plaintext)+gcm.tagSize) gcmSm4Data(&gcm.bytesProductTable, []byte("emmansun"), &tagOut2) - gcmSm4Dec(&gcm.bytesProductTable, out2, out1, &counter2, &tagOut2, gcm.cipher.enc) + gcmSm4DecInst(&gcm.bytesProductTable, out2, out1, &counter2, &tagOut2, gcm.cipher.enc) if hex.EncodeToString([]byte(test.plaintext)) != hex.EncodeToString(out2[:len(test.plaintext)]) { t.Errorf("#%d: out expected %s, got %s", i, hex.EncodeToString([]byte(test.plaintext)), hex.EncodeToString(out2[:len(test.plaintext)])) diff --git a/sm4/sm4ni_gcm_asm.go b/sm4/sm4ni_gcm_asm.go new file mode 100644 index 0000000..7832378 --- /dev/null +++ b/sm4/sm4ni_gcm_asm.go @@ -0,0 +1,152 @@ +//go:build amd64 || arm64 +// +build amd64 arm64 + +package sm4 + +import ( + "crypto/cipher" + goSubtle "crypto/subtle" + + "github.com/emmansun/gmsm/internal/subtle" +) + +//go:noescape +func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) + +//go:noescape +func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) + +// sm4CipherNIGCM implements crypto/cipher.gcmAble so that crypto/cipher.NewGCM +// will use the optimised implementation in this file when possible. Instances +// of this type only exist when hasGCMAsm and hasSM4 returns true. +type sm4CipherNIGCM struct { + *sm4CipherNI +} + +// Assert that sm4CipherNIGCM implements the gcmAble interface. +var _ gcmAble = (*sm4CipherNIGCM)(nil) + +type gcmNI struct { + cipher *sm4CipherNI + nonceSize int + tagSize int + bytesProductTable [256]byte +} + +func (g *gcmNI) NonceSize() int { + return g.nonceSize +} + +func (g *gcmNI) Overhead() int { + return g.tagSize +} + +// NewGCM returns the SM4 cipher wrapped in Galois Counter Mode. This is only +// called by crypto/cipher.NewGCM via the gcmAble interface. +func (c *sm4CipherNIGCM) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) { + g := &gcmNI{} + g.cipher = c.sm4CipherNI + g.nonceSize = nonceSize + g.tagSize = tagSize + gcmSm4Init(&g.bytesProductTable, g.cipher.enc, INST_SM4) + return g, nil +} + +// Seal encrypts and authenticates plaintext. See the cipher.AEAD interface for +// details. 
+func (g *gcmNI) Seal(dst, nonce, plaintext, data []byte) []byte { + if len(nonce) != g.nonceSize { + panic("cipher: incorrect nonce length given to GCM") + } + if uint64(len(plaintext)) > ((1<<32)-2)*BlockSize { + panic("cipher: message too large for GCM") + } + + var counter, tagMask [gcmBlockSize]byte + + if len(nonce) == gcmStandardNonceSize { + // Init counter to nonce||1 + copy(counter[:], nonce) + counter[gcmBlockSize-1] = 1 + } else { + // Otherwise counter = GHASH(nonce) + gcmSm4Data(&g.bytesProductTable, nonce, &counter) + gcmSm4Finish(&g.bytesProductTable, &tagMask, &counter, uint64(len(nonce)), uint64(0)) + } + + g.cipher.Encrypt(tagMask[:], counter[:]) + + var tagOut [gcmTagSize]byte + gcmSm4Data(&g.bytesProductTable, data, &tagOut) + + ret, out := subtle.SliceForAppend(dst, len(plaintext)+g.tagSize) + if subtle.InexactOverlap(out[:len(plaintext)], plaintext) { + panic("cipher: invalid buffer overlap") + } + + if len(plaintext) > 0 { + gcmSm4niEnc(&g.bytesProductTable, out, plaintext, &counter, &tagOut, g.cipher.enc) + } + gcmSm4Finish(&g.bytesProductTable, &tagMask, &tagOut, uint64(len(plaintext)), uint64(len(data))) + copy(out[len(plaintext):], tagOut[:]) + + return ret +} + +// Open authenticates and decrypts ciphertext. See the cipher.AEAD interface +// for details. +func (g *gcmNI) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) { + if len(nonce) != g.nonceSize { + panic("cipher: incorrect nonce length given to GCM") + } + // Sanity check to prevent the authentication from always succeeding if an implementation + // leaves tagSize uninitialized, for example. + if g.tagSize < gcmMinimumTagSize { + panic("cipher: incorrect GCM tag size") + } + + if len(ciphertext) < g.tagSize { + return nil, errOpen + } + if uint64(len(ciphertext)) > ((1<<32)-2)*uint64(BlockSize)+uint64(g.tagSize) { + return nil, errOpen + } + + tag := ciphertext[len(ciphertext)-g.tagSize:] + ciphertext = ciphertext[:len(ciphertext)-g.tagSize] + + // See GCM spec, section 7.1. + var counter, tagMask [gcmBlockSize]byte + + if len(nonce) == gcmStandardNonceSize { + // Init counter to nonce||1 + copy(counter[:], nonce) + counter[gcmBlockSize-1] = 1 + } else { + // Otherwise counter = GHASH(nonce) + gcmSm4Data(&g.bytesProductTable, nonce, &counter) + gcmSm4Finish(&g.bytesProductTable, &tagMask, &counter, uint64(len(nonce)), uint64(0)) + } + + g.cipher.Encrypt(tagMask[:], counter[:]) + + var expectedTag [gcmTagSize]byte + gcmSm4Data(&g.bytesProductTable, data, &expectedTag) + + ret, out := subtle.SliceForAppend(dst, len(ciphertext)) + if subtle.InexactOverlap(out, ciphertext) { + panic("cipher: invalid buffer overlap") + } + if len(ciphertext) > 0 { + gcmSm4niDec(&g.bytesProductTable, out, ciphertext, &counter, &expectedTag, g.cipher.enc) + } + gcmSm4Finish(&g.bytesProductTable, &tagMask, &expectedTag, uint64(len(ciphertext)), uint64(len(data))) + + if goSubtle.ConstantTimeCompare(expectedTag[:g.tagSize], tag) != 1 { + for i := range out { + out[i] = 0 + } + return nil, errOpen + } + return ret, nil +}
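
For context, a minimal usage sketch (not part of this change, shown under the assumption of the package's existing public API): the new inst-based dispatch stays behind sm4.NewCipher, so callers are unchanged. On an arm64 CPU with the SM4 extension, the sm4CipherNI/sm4CipherNIGCM path added above is selected automatically; otherwise the AES-NI or generic code is used.

package main

import (
	"crypto/cipher"
	"crypto/rand"
	"fmt"

	"github.com/emmansun/gmsm/sm4"
)

func main() {
	key := make([]byte, 16)   // SM4 uses a 128-bit key
	nonce := make([]byte, 12) // standard GCM nonce size
	rand.Read(key)
	rand.Read(nonce)

	block, err := sm4.NewCipher(key) // dispatches to SM4 NI, AES-NI or generic code
	if err != nil {
		panic(err)
	}
	aead, err := cipher.NewGCM(block) // uses the gcmAble fast path when available
	if err != nil {
		panic(err)
	}
	ciphertext := aead.Seal(nil, nonce, []byte("hello sm4-gcm"), nil)
	fmt.Printf("ciphertext+tag: %x\n", ciphertext)
}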