mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-26 04:06:18 +08:00
commit
9a2d7123f8
39
.github/workflows/sm3_sm4_ni.ci.yml
vendored
Normal file
39
.github/workflows/sm3_sm4_ni.ci.yml
vendored
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
name: ci
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: [ sm3_sm4_ni ]
|
||||||
|
pull_request:
|
||||||
|
branches: [ sm3_sm4_ni ]
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
|
||||||
|
build:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
goVer: ['1.15', '1.16', '1.17']
|
||||||
|
steps:
|
||||||
|
- name: Checkout Repo
|
||||||
|
uses: actions/checkout@v2
|
||||||
|
|
||||||
|
- name: Set up Go
|
||||||
|
uses: actions/setup-go@v2
|
||||||
|
with:
|
||||||
|
go-version: ${{ matrix.goVer }}
|
||||||
|
|
||||||
|
- name: Setup Environment
|
||||||
|
run: |
|
||||||
|
echo "GOPATH=$(go env GOPATH)" >> $GITHUB_ENV
|
||||||
|
echo "$(go env GOPATH)/bin" >> $GITHUB_PATH
|
||||||
|
|
||||||
|
- name: Module cache
|
||||||
|
uses: actions/cache@v2.1.7
|
||||||
|
env:
|
||||||
|
cache-name: go-mod-cache
|
||||||
|
with:
|
||||||
|
path: ~/go/pkg/mod
|
||||||
|
key: ${{ runner.os }}-${{ env.cache-name }}-${{ hashFiles('**/go.sum') }}
|
||||||
|
|
||||||
|
- name: Test
|
||||||
|
run: go test -v ./...
|
@ -58,18 +58,18 @@ func sm3tt2b(Vd, Vn, Vm, imm2 byte) uint32 {
|
|||||||
|
|
||||||
// Used v5 as temp register
|
// Used v5 as temp register
|
||||||
func roundA(buf *bytes.Buffer, i, t, st1, st2, w, wt byte) {
|
func roundA(buf *bytes.Buffer, i, t, st1, st2, w, wt byte) {
|
||||||
fmt.Fprintf(buf, "\tWORD 0x%08x //SM3SS1 V%d.4S, V%d.4S, V%d.4S, V%d.4S\n", sm3ss1(5, st1, t, st2), 5, st1, t, st2)
|
fmt.Fprintf(buf, "\tWORD $0x%08x //SM3SS1 V%d.4S, V%d.4S, V%d.4S, V%d.4S\n", sm3ss1(5, st1, t, st2), 5, st1, t, st2)
|
||||||
fmt.Fprintf(buf, "\tVSHL $1, V%d.S4, V%d.S4\n", t, t)
|
fmt.Fprintf(buf, "\tVSHL $1, V%d.S4, V%d.S4\n", t, t)
|
||||||
fmt.Fprintf(buf, "\tWORD 0x%08x //SM3TT1A V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt1a(st1, 5, wt, i), st1, 5, wt, i)
|
fmt.Fprintf(buf, "\tWORD $0x%08x //SM3TT1A V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt1a(st1, 5, wt, i), st1, 5, wt, i)
|
||||||
fmt.Fprintf(buf, "\tWORD 0x%08x //SM3TT2A V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt2a(st2, 5, w, i), st2, 5, w, i)
|
fmt.Fprintf(buf, "\tWORD $0x%08x //SM3TT2A V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt2a(st2, 5, w, i), st2, 5, w, i)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Used v5 as temp register
|
// Used v5 as temp register
|
||||||
func roundB(buf *bytes.Buffer, i, t, st1, st2, w, wt byte) {
|
func roundB(buf *bytes.Buffer, i, t, st1, st2, w, wt byte) {
|
||||||
fmt.Fprintf(buf, "\tWORD 0x%08x //SM3SS1 V%d.4S, V%d.4S, V%d.4S, V%d.4S\n", sm3ss1(5, st1, t, st2), 5, st1, t, st2)
|
fmt.Fprintf(buf, "\tWORD $0x%08x //SM3SS1 V%d.4S, V%d.4S, V%d.4S, V%d.4S\n", sm3ss1(5, st1, t, st2), 5, st1, t, st2)
|
||||||
fmt.Fprintf(buf, "\tVSHL $1, V%d.S4, V%d.S4\n", t, t)
|
fmt.Fprintf(buf, "\tVSHL $1, V%d.S4, V%d.S4\n", t, t)
|
||||||
fmt.Fprintf(buf, "\tWORD 0x%08x //SM3TT1B V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt1b(st1, 5, wt, i), st1, 5, wt, i)
|
fmt.Fprintf(buf, "\tWORD $0x%08x //SM3TT1B V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt1b(st1, 5, wt, i), st1, 5, wt, i)
|
||||||
fmt.Fprintf(buf, "\tWORD 0x%08x //SM3TT2B V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt2b(st2, 5, w, i), st2, 5, w, i)
|
fmt.Fprintf(buf, "\tWORD $0x%08x //SM3TT2B V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt2b(st2, 5, w, i), st2, 5, w, i)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compress 4 words and generate 4 words, use v6, v7, v10 as temp registers
|
// Compress 4 words and generate 4 words, use v6, v7, v10 as temp registers
|
||||||
@ -82,12 +82,12 @@ func roundB(buf *bytes.Buffer, i, t, st1, st2, w, wt byte) {
|
|||||||
// st1, st2, sm3 state
|
// st1, st2, sm3 state
|
||||||
func qroundA(buf *bytes.Buffer, t, st1, st2, s0, s1, s2, s3, s4 byte) {
|
func qroundA(buf *bytes.Buffer, t, st1, st2, s0, s1, s2, s3, s4 byte) {
|
||||||
fmt.Fprintf(buf, "\t// Extension\n")
|
fmt.Fprintf(buf, "\t// Extension\n")
|
||||||
fmt.Fprintf(buf, "\tVEXT 3, V%d, V%d, V%d\n", s2, s1, s4)
|
fmt.Fprintf(buf, "\tVEXT $3, V%d.B16, V%d.B16, V%d.B16\n", s2, s1, s4)
|
||||||
fmt.Fprintf(buf, "\tVEXT 3, V%d, V%d, V%d\n", s1, s0, 6)
|
fmt.Fprintf(buf, "\tVEXT $3, V%d.B16, V%d.B16, V%d.B16\n", s1, s0, 6)
|
||||||
fmt.Fprintf(buf, "\tVEXT 2, V%d, V%d, V%d\n", s3, s2, 7)
|
fmt.Fprintf(buf, "\tVEXT $2, V%d.B16, V%d.B16, V%d.B16\n", s3, s2, 7)
|
||||||
fmt.Fprintf(buf, "\tWORD 0x%08x //SM3PARTW1 V%d.4S, V%d.4S, V%d.4S\n", sm3partw1(s4, s0, s3), s4, s0, s3)
|
fmt.Fprintf(buf, "\tWORD $0x%08x //SM3PARTW1 V%d.4S, V%d.4S, V%d.4S\n", sm3partw1(s4, s0, s3), s4, s0, s3)
|
||||||
fmt.Fprintf(buf, "\tWORD 0x%08x //SM3PARTW2 V%d.4S, V%d.4S, V%d.4S\n", sm3partw2(s4, 7, 6), s4, 7, 6)
|
fmt.Fprintf(buf, "\tWORD $0x%08x //SM3PARTW2 V%d.4S, V%d.4S, V%d.4S\n", sm3partw2(s4, 7, 6), s4, 7, 6)
|
||||||
fmt.Fprintf(buf, "\tVEOR V%d, V%d, V10\n", s1, s0)
|
fmt.Fprintf(buf, "\tVEOR V%d.B16, V%d.B16, V10.B16\n", s1, s0)
|
||||||
fmt.Fprintf(buf, "\t// Compression\n")
|
fmt.Fprintf(buf, "\t// Compression\n")
|
||||||
roundA(buf, 0, t, st1, st2, s0, 10)
|
roundA(buf, 0, t, st1, st2, s0, 10)
|
||||||
roundA(buf, 1, t, st1, st2, s0, 10)
|
roundA(buf, 1, t, st1, st2, s0, 10)
|
||||||
@ -100,13 +100,13 @@ func qroundA(buf *bytes.Buffer, t, st1, st2, s0, s1, s2, s3, s4 byte) {
|
|||||||
func qroundB(buf *bytes.Buffer, t, st1, st2, s0, s1, s2, s3, s4 byte) {
|
func qroundB(buf *bytes.Buffer, t, st1, st2, s0, s1, s2, s3, s4 byte) {
|
||||||
if s4 != 0xff {
|
if s4 != 0xff {
|
||||||
fmt.Fprintf(buf, "\t// Extension\n")
|
fmt.Fprintf(buf, "\t// Extension\n")
|
||||||
fmt.Fprintf(buf, "\tVEXT 3, V%d, V%d, V%d\n", s2, s1, s4)
|
fmt.Fprintf(buf, "\tVEXT $3, V%d.B16, V%d.B16, V%d.B16\n", s2, s1, s4)
|
||||||
fmt.Fprintf(buf, "\tVEXT 3, V%d, V%d, V%d\n", s1, s0, 6)
|
fmt.Fprintf(buf, "\tVEXT $3, V%d.B16, V%d.B16, V%d.B16\n", s1, s0, 6)
|
||||||
fmt.Fprintf(buf, "\tVEXT 2, V%d, V%d, V%d\n", s3, s2, 7)
|
fmt.Fprintf(buf, "\tVEXT $2, V%d.B16, V%d.B16, V%d.B16\n", s3, s2, 7)
|
||||||
fmt.Fprintf(buf, "\tWORD 0x%08x //SM3PARTW1 V%d.4S, V%d.4S, V%d.4S\n", sm3partw1(s4, s0, s3), s4, s0, s3)
|
fmt.Fprintf(buf, "\tWORD $0x%08x //SM3PARTW1 V%d.4S, V%d.4S, V%d.4S\n", sm3partw1(s4, s0, s3), s4, s0, s3)
|
||||||
fmt.Fprintf(buf, "\tWORD 0x%08x //SM3PARTW2 V%d.4S, V%d.4S, V%d.4S\n", sm3partw2(s4, 7, 6), s4, 7, 6)
|
fmt.Fprintf(buf, "\tWORD $0x%08x //SM3PARTW2 V%d.4S, V%d.4S, V%d.4S\n", sm3partw2(s4, 7, 6), s4, 7, 6)
|
||||||
}
|
}
|
||||||
fmt.Fprintf(buf, "\tVEOR V%d, V%d, V10\n", s1, s0)
|
fmt.Fprintf(buf, "\tVEOR V%d.B16, V%d.B16, V10.B16\n", s1, s0)
|
||||||
fmt.Fprintf(buf, "\t// Compression\n")
|
fmt.Fprintf(buf, "\t// Compression\n")
|
||||||
roundB(buf, 0, t, st1, st2, s0, 10)
|
roundB(buf, 0, t, st1, st2, s0, 10)
|
||||||
roundB(buf, 1, t, st1, st2, s0, 10)
|
roundB(buf, 1, t, st1, st2, s0, 10)
|
||||||
@ -165,8 +165,8 @@ blockloop:
|
|||||||
|
|
||||||
fmt.Fprint(buf, `
|
fmt.Fprint(buf, `
|
||||||
SUB $64, R3, R3 // message length - 64bytes, then compare with 64bytes
|
SUB $64, R3, R3 // message length - 64bytes, then compare with 64bytes
|
||||||
VEOR V8.S4, V15.S4, V8.S4
|
VEOR V8.B16, V15.B16, V8.B16
|
||||||
VEOR V9.S4, V16.S4, V9.S4
|
VEOR V9.B16, V16.B16, V9.B16
|
||||||
CBNZ R3, blockloop
|
CBNZ R3, blockloop
|
||||||
|
|
||||||
sm3ret:
|
sm3ret:
|
||||||
|
@ -19,10 +19,10 @@ func blockARM64(dig *digest, p []byte)
|
|||||||
func blockSM3NI(h []uint32, p []byte, t []uint32)
|
func blockSM3NI(h []uint32, p []byte, t []uint32)
|
||||||
|
|
||||||
func block(dig *digest, p []byte) {
|
func block(dig *digest, p []byte) {
|
||||||
//if !useSM3NI {
|
if !useSM3NI {
|
||||||
blockARM64(dig, p)
|
blockARM64(dig, p)
|
||||||
//} else {
|
} else {
|
||||||
// h := dig.h[:]
|
h := dig.h[:]
|
||||||
// blockSM3NI(h, p, t)
|
blockSM3NI(h, p, t)
|
||||||
//}
|
}
|
||||||
}
|
}
|
||||||
|
416
sm3/sm3blockni_arm64.s
Normal file
416
sm3/sm3blockni_arm64.s
Normal file
@ -0,0 +1,416 @@
|
|||||||
|
// Generated by gen_sm3block_ni.go. DO NOT EDIT.
|
||||||
|
|
||||||
|
#include "textflag.h"
|
||||||
|
|
||||||
|
// func blockSM3NI(h []uint32, p []byte, t []uint32)
|
||||||
|
TEXT ·blockSM3NI(SB), 0, $0
|
||||||
|
MOVD h_base+0(FP), R0 // Hash value first address
|
||||||
|
MOVD p_base+24(FP), R1 // message first address
|
||||||
|
MOVD p_len+32(FP), R3 // message length
|
||||||
|
MOVD t_base+48(FP), R2 // t constants first address
|
||||||
|
|
||||||
|
VLD1 (R0), [V8.S4, V9.S4] // load h(a,b,c,d,e,f,g,h)
|
||||||
|
LDPW (0*8)(R2), (R5, R6) // load t constants
|
||||||
|
|
||||||
|
blockloop:
|
||||||
|
VLD1.P 64(R1), [V0.B16, V1.B16, V2.B16, V3.B16] // load 64bytes message
|
||||||
|
VMOV V8.B16, V15.B16 // backup: V8 h(dcba)
|
||||||
|
VMOV V9.B16, V16.B16 // backup: V9 h(hgfe)
|
||||||
|
VREV32 V0.B16, V0.B16 // prepare for using message in Byte format
|
||||||
|
VREV32 V1.B16, V1.B16
|
||||||
|
VREV32 V2.B16, V2.B16
|
||||||
|
VREV32 V3.B16, V3.B16
|
||||||
|
// first 16 rounds
|
||||||
|
VMOV R5, V11.S[3]
|
||||||
|
// Extension
|
||||||
|
VEXT $3, V2.B16, V1.B16, V4.B16
|
||||||
|
VEXT $3, V1.B16, V0.B16, V6.B16
|
||||||
|
VEXT $2, V3.B16, V2.B16, V7.B16
|
||||||
|
WORD $0x04c063ce //SM3PARTW1 V4.4S, V0.4S, V3.4S
|
||||||
|
WORD $0xe4c466ce //SM3PARTW2 V4.4S, V7.4S, V6.4S
|
||||||
|
VEOR V1.B16, V0.B16, V10.B16
|
||||||
|
// Compression
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0
|
||||||
|
WORD $0xa98840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 0
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1
|
||||||
|
WORD $0xa99840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 1
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2
|
||||||
|
WORD $0xa9a840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 2
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3
|
||||||
|
WORD $0xa9b840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 3
|
||||||
|
|
||||||
|
// Extension
|
||||||
|
VEXT $3, V3.B16, V2.B16, V0.B16
|
||||||
|
VEXT $3, V2.B16, V1.B16, V6.B16
|
||||||
|
VEXT $2, V4.B16, V3.B16, V7.B16
|
||||||
|
WORD $0x20c064ce //SM3PARTW1 V0.4S, V1.4S, V4.4S
|
||||||
|
WORD $0xe0c466ce //SM3PARTW2 V0.4S, V7.4S, V6.4S
|
||||||
|
VEOR V2.B16, V1.B16, V10.B16
|
||||||
|
// Compression
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0
|
||||||
|
WORD $0xa98841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 0
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1
|
||||||
|
WORD $0xa99841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 1
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2
|
||||||
|
WORD $0xa9a841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 2
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3
|
||||||
|
WORD $0xa9b841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 3
|
||||||
|
|
||||||
|
// Extension
|
||||||
|
VEXT $3, V4.B16, V3.B16, V1.B16
|
||||||
|
VEXT $3, V3.B16, V2.B16, V6.B16
|
||||||
|
VEXT $2, V0.B16, V4.B16, V7.B16
|
||||||
|
WORD $0x41c060ce //SM3PARTW1 V1.4S, V2.4S, V0.4S
|
||||||
|
WORD $0xe1c466ce //SM3PARTW2 V1.4S, V7.4S, V6.4S
|
||||||
|
VEOR V3.B16, V2.B16, V10.B16
|
||||||
|
// Compression
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0
|
||||||
|
WORD $0xa98842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 0
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1
|
||||||
|
WORD $0xa99842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 1
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2
|
||||||
|
WORD $0xa9a842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 2
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3
|
||||||
|
WORD $0xa9b842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 3
|
||||||
|
|
||||||
|
// Extension
|
||||||
|
VEXT $3, V0.B16, V4.B16, V2.B16
|
||||||
|
VEXT $3, V4.B16, V3.B16, V6.B16
|
||||||
|
VEXT $2, V1.B16, V0.B16, V7.B16
|
||||||
|
WORD $0x62c061ce //SM3PARTW1 V2.4S, V3.4S, V1.4S
|
||||||
|
WORD $0xe2c466ce //SM3PARTW2 V2.4S, V7.4S, V6.4S
|
||||||
|
VEOR V4.B16, V3.B16, V10.B16
|
||||||
|
// Compression
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0
|
||||||
|
WORD $0xa98843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 0
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1
|
||||||
|
WORD $0xa99843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 1
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2
|
||||||
|
WORD $0xa9a843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 2
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3
|
||||||
|
WORD $0xa9b843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 3
|
||||||
|
|
||||||
|
// second 48 rounds
|
||||||
|
VMOV R6, V11.S[3]
|
||||||
|
// Extension
|
||||||
|
VEXT $3, V1.B16, V0.B16, V3.B16
|
||||||
|
VEXT $3, V0.B16, V4.B16, V6.B16
|
||||||
|
VEXT $2, V2.B16, V1.B16, V7.B16
|
||||||
|
WORD $0x83c062ce //SM3PARTW1 V3.4S, V4.4S, V2.4S
|
||||||
|
WORD $0xe3c466ce //SM3PARTW2 V3.4S, V7.4S, V6.4S
|
||||||
|
VEOR V0.B16, V4.B16, V10.B16
|
||||||
|
// Compression
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
|
||||||
|
WORD $0xa98c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 0
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
|
||||||
|
WORD $0xa99c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 1
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
|
||||||
|
WORD $0xa9ac44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 2
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
|
||||||
|
WORD $0xa9bc44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 3
|
||||||
|
|
||||||
|
// Extension
|
||||||
|
VEXT $3, V2.B16, V1.B16, V4.B16
|
||||||
|
VEXT $3, V1.B16, V0.B16, V6.B16
|
||||||
|
VEXT $2, V3.B16, V2.B16, V7.B16
|
||||||
|
WORD $0x04c063ce //SM3PARTW1 V4.4S, V0.4S, V3.4S
|
||||||
|
WORD $0xe4c466ce //SM3PARTW2 V4.4S, V7.4S, V6.4S
|
||||||
|
VEOR V1.B16, V0.B16, V10.B16
|
||||||
|
// Compression
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
|
||||||
|
WORD $0xa98c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 0
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
|
||||||
|
WORD $0xa99c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 1
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
|
||||||
|
WORD $0xa9ac40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 2
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
|
||||||
|
WORD $0xa9bc40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 3
|
||||||
|
|
||||||
|
// Extension
|
||||||
|
VEXT $3, V3.B16, V2.B16, V0.B16
|
||||||
|
VEXT $3, V2.B16, V1.B16, V6.B16
|
||||||
|
VEXT $2, V4.B16, V3.B16, V7.B16
|
||||||
|
WORD $0x20c064ce //SM3PARTW1 V0.4S, V1.4S, V4.4S
|
||||||
|
WORD $0xe0c466ce //SM3PARTW2 V0.4S, V7.4S, V6.4S
|
||||||
|
VEOR V2.B16, V1.B16, V10.B16
|
||||||
|
// Compression
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
|
||||||
|
WORD $0xa98c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 0
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
|
||||||
|
WORD $0xa99c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 1
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
|
||||||
|
WORD $0xa9ac41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 2
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
|
||||||
|
WORD $0xa9bc41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 3
|
||||||
|
|
||||||
|
// Extension
|
||||||
|
VEXT $3, V4.B16, V3.B16, V1.B16
|
||||||
|
VEXT $3, V3.B16, V2.B16, V6.B16
|
||||||
|
VEXT $2, V0.B16, V4.B16, V7.B16
|
||||||
|
WORD $0x41c060ce //SM3PARTW1 V1.4S, V2.4S, V0.4S
|
||||||
|
WORD $0xe1c466ce //SM3PARTW2 V1.4S, V7.4S, V6.4S
|
||||||
|
VEOR V3.B16, V2.B16, V10.B16
|
||||||
|
// Compression
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
|
||||||
|
WORD $0xa98c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 0
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
|
||||||
|
WORD $0xa99c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 1
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
|
||||||
|
WORD $0xa9ac42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 2
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
|
||||||
|
WORD $0xa9bc42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 3
|
||||||
|
|
||||||
|
// Extension
|
||||||
|
VEXT $3, V0.B16, V4.B16, V2.B16
|
||||||
|
VEXT $3, V4.B16, V3.B16, V6.B16
|
||||||
|
VEXT $2, V1.B16, V0.B16, V7.B16
|
||||||
|
WORD $0x62c061ce //SM3PARTW1 V2.4S, V3.4S, V1.4S
|
||||||
|
WORD $0xe2c466ce //SM3PARTW2 V2.4S, V7.4S, V6.4S
|
||||||
|
VEOR V4.B16, V3.B16, V10.B16
|
||||||
|
// Compression
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
|
||||||
|
WORD $0xa98c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 0
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
|
||||||
|
WORD $0xa99c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 1
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
|
||||||
|
WORD $0xa9ac43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 2
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
|
||||||
|
WORD $0xa9bc43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 3
|
||||||
|
|
||||||
|
// Extension
|
||||||
|
VEXT $3, V1.B16, V0.B16, V3.B16
|
||||||
|
VEXT $3, V0.B16, V4.B16, V6.B16
|
||||||
|
VEXT $2, V2.B16, V1.B16, V7.B16
|
||||||
|
WORD $0x83c062ce //SM3PARTW1 V3.4S, V4.4S, V2.4S
|
||||||
|
WORD $0xe3c466ce //SM3PARTW2 V3.4S, V7.4S, V6.4S
|
||||||
|
VEOR V0.B16, V4.B16, V10.B16
|
||||||
|
// Compression
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
|
||||||
|
WORD $0xa98c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 0
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
|
||||||
|
WORD $0xa99c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 1
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
|
||||||
|
WORD $0xa9ac44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 2
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
|
||||||
|
WORD $0xa9bc44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 3
|
||||||
|
|
||||||
|
// Extension
|
||||||
|
VEXT $3, V2.B16, V1.B16, V4.B16
|
||||||
|
VEXT $3, V1.B16, V0.B16, V6.B16
|
||||||
|
VEXT $2, V3.B16, V2.B16, V7.B16
|
||||||
|
WORD $0x04c063ce //SM3PARTW1 V4.4S, V0.4S, V3.4S
|
||||||
|
WORD $0xe4c466ce //SM3PARTW2 V4.4S, V7.4S, V6.4S
|
||||||
|
VEOR V1.B16, V0.B16, V10.B16
|
||||||
|
// Compression
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
|
||||||
|
WORD $0xa98c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 0
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
|
||||||
|
WORD $0xa99c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 1
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
|
||||||
|
WORD $0xa9ac40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 2
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
|
||||||
|
WORD $0xa9bc40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 3
|
||||||
|
|
||||||
|
// Extension
|
||||||
|
VEXT $3, V3.B16, V2.B16, V0.B16
|
||||||
|
VEXT $3, V2.B16, V1.B16, V6.B16
|
||||||
|
VEXT $2, V4.B16, V3.B16, V7.B16
|
||||||
|
WORD $0x20c064ce //SM3PARTW1 V0.4S, V1.4S, V4.4S
|
||||||
|
WORD $0xe0c466ce //SM3PARTW2 V0.4S, V7.4S, V6.4S
|
||||||
|
VEOR V2.B16, V1.B16, V10.B16
|
||||||
|
// Compression
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
|
||||||
|
WORD $0xa98c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 0
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
|
||||||
|
WORD $0xa99c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 1
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
|
||||||
|
WORD $0xa9ac41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 2
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
|
||||||
|
WORD $0xa9bc41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 3
|
||||||
|
|
||||||
|
// Extension
|
||||||
|
VEXT $3, V4.B16, V3.B16, V1.B16
|
||||||
|
VEXT $3, V3.B16, V2.B16, V6.B16
|
||||||
|
VEXT $2, V0.B16, V4.B16, V7.B16
|
||||||
|
WORD $0x41c060ce //SM3PARTW1 V1.4S, V2.4S, V0.4S
|
||||||
|
WORD $0xe1c466ce //SM3PARTW2 V1.4S, V7.4S, V6.4S
|
||||||
|
VEOR V3.B16, V2.B16, V10.B16
|
||||||
|
// Compression
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
|
||||||
|
WORD $0xa98c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 0
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
|
||||||
|
WORD $0xa99c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 1
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
|
||||||
|
WORD $0xa9ac42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 2
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
|
||||||
|
WORD $0xa9bc42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 3
|
||||||
|
|
||||||
|
VEOR V4.B16, V3.B16, V10.B16
|
||||||
|
// Compression
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
|
||||||
|
WORD $0xa98c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 0
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
|
||||||
|
WORD $0xa99c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 1
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
|
||||||
|
WORD $0xa9ac43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 2
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
|
||||||
|
WORD $0xa9bc43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 3
|
||||||
|
|
||||||
|
VEOR V0.B16, V4.B16, V10.B16
|
||||||
|
// Compression
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
|
||||||
|
WORD $0xa98c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 0
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
|
||||||
|
WORD $0xa99c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 1
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
|
||||||
|
WORD $0xa9ac44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 2
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
|
||||||
|
WORD $0xa9bc44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 3
|
||||||
|
|
||||||
|
VEOR V1.B16, V0.B16, V10.B16
|
||||||
|
// Compression
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
|
||||||
|
WORD $0xa98c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 0
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
|
||||||
|
WORD $0xa99c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 1
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
|
||||||
|
WORD $0xa9ac40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 2
|
||||||
|
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
|
||||||
|
VSHL $1, V11.S4, V11.S4
|
||||||
|
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
|
||||||
|
WORD $0xa9bc40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 3
|
||||||
|
|
||||||
|
SUB $64, R3, R3 // message length - 64bytes, then compare with 64bytes
|
||||||
|
VEOR V8.B16, V15.B16, V8.B16
|
||||||
|
VEOR V9.B16, V16.B16, V9.B16
|
||||||
|
CBNZ R3, blockloop
|
||||||
|
|
||||||
|
sm3ret:
|
||||||
|
VST1 [V8.S4, V9.S4], (R0) // store hash value H
|
||||||
|
RET
|
@ -290,7 +290,7 @@ GLOBL fk_mask<>(SB), RODATA, $16
|
|||||||
AVX_SM4_TAO_L1(x, y); \
|
AVX_SM4_TAO_L1(x, y); \
|
||||||
VPXOR x, t0, t0
|
VPXOR x, t0, t0
|
||||||
|
|
||||||
// func expandKeyAsm(key *byte, ck, enc, dec *uint32)
|
// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
|
||||||
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
|
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
|
||||||
MOVQ key+0(FP), AX
|
MOVQ key+0(FP), AX
|
||||||
MOVQ ck+8(FP), BX
|
MOVQ ck+8(FP), BX
|
||||||
@ -321,7 +321,7 @@ loop:
|
|||||||
expand_end:
|
expand_end:
|
||||||
RET
|
RET
|
||||||
|
|
||||||
// func encryptBlocksAsm(xk *uint32, dst, src []byte)
|
// func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
|
||||||
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
|
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
|
||||||
MOVQ xk+0(FP), AX
|
MOVQ xk+0(FP), AX
|
||||||
MOVQ dst+8(FP), BX
|
MOVQ dst+8(FP), BX
|
||||||
@ -497,7 +497,7 @@ avx2_sm4_done:
|
|||||||
VZEROUPPER
|
VZEROUPPER
|
||||||
RET
|
RET
|
||||||
|
|
||||||
// func encryptBlockAsm(xk *uint32, dst, src *byte)
|
// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
|
||||||
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
|
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
|
||||||
MOVQ xk+0(FP), AX
|
MOVQ xk+0(FP), AX
|
||||||
MOVQ dst+8(FP), BX
|
MOVQ dst+8(FP), BX
|
||||||
|
102
sm4/asm_arm64.s
102
sm4/asm_arm64.s
@ -164,13 +164,44 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
|
|||||||
VMOV R0, R24_MASK.D[0] \
|
VMOV R0, R24_MASK.D[0] \
|
||||||
VMOV R1, R24_MASK.D[1]
|
VMOV R1, R24_MASK.D[1]
|
||||||
|
|
||||||
// func expandKeyAsm(key *byte, ck, enc, dec *uint32)
|
#define SM4EKEY_EXPORT_KEYS() \
|
||||||
|
VMOV V9.S[3], V10.S[0] \
|
||||||
|
VMOV V9.S[2], V10.S[1] \
|
||||||
|
VMOV V9.S[1], V10.S[2] \
|
||||||
|
VMOV V9.S[0], V10.S[3] \
|
||||||
|
VMOV V8.S[3], V11.S[0] \
|
||||||
|
VMOV V8.S[2], V11.S[1] \
|
||||||
|
VMOV V8.S[1], V11.S[2] \
|
||||||
|
VMOV V8.S[0], V11.S[3] \
|
||||||
|
VST1.P [V8.S4, V9.S4], 32(R10) \
|
||||||
|
VST1 [V10.S4, V11.S4], (R11) \
|
||||||
|
SUB $32, R11, R11
|
||||||
|
|
||||||
|
#define SM4E_ROUND() \
|
||||||
|
VLD1.P 16(R10), [V8.B16] \
|
||||||
|
VREV32 V8.B16, V8.B16 \
|
||||||
|
WORD $0x0884c0ce \
|
||||||
|
WORD $0x2884c0ce \
|
||||||
|
WORD $0x4884c0ce \
|
||||||
|
WORD $0x6884c0ce \
|
||||||
|
WORD $0x8884c0ce \
|
||||||
|
WORD $0xa884c0ce \
|
||||||
|
WORD $0xc884c0ce \
|
||||||
|
WORD $0xe884c0ce \
|
||||||
|
VREV32 V8.B16, V8.B16 \
|
||||||
|
VST1.P [V8.B16], 16(R9)
|
||||||
|
|
||||||
|
// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
|
||||||
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
|
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
|
||||||
MOVD key+0(FP), R8
|
MOVD key+0(FP), R8
|
||||||
MOVD ck+8(FP), R9
|
MOVD ck+8(FP), R9
|
||||||
MOVD enc+16(FP), R10
|
MOVD enc+16(FP), R10
|
||||||
MOVD dec+24(FP), R11
|
MOVD dec+24(FP), R11
|
||||||
|
MOVD inst+32(FP), R12
|
||||||
|
|
||||||
|
CMP $1, R12
|
||||||
|
BEQ sm4ekey
|
||||||
|
|
||||||
load_global_data_1()
|
load_global_data_1()
|
||||||
|
|
||||||
VLD1 (R8), [t0.B16]
|
VLD1 (R8), [t0.B16]
|
||||||
@ -193,14 +224,46 @@ ksLoop:
|
|||||||
ADD $16, R0
|
ADD $16, R0
|
||||||
CMP $128, R0
|
CMP $128, R0
|
||||||
BNE ksLoop
|
BNE ksLoop
|
||||||
|
|
||||||
RET
|
RET
|
||||||
|
|
||||||
// func encryptBlocksAsm(xk *uint32, dst, src []byte)
|
sm4ekey:
|
||||||
|
LDP fk_mask<>(SB), (R0, R1)
|
||||||
|
VMOV R0, FK_MASK.D[0]
|
||||||
|
VMOV R1, FK_MASK.D[1]
|
||||||
|
VLD1 (R8), [V9.B16]
|
||||||
|
VREV32 V9.B16, V9.B16
|
||||||
|
VEOR FK_MASK.B16, V9.B16, V9.B16
|
||||||
|
ADD $96, R11
|
||||||
|
|
||||||
|
VLD1.P 64(R9), [V0.S4, V1.S4, V2.S4, V3.S4]
|
||||||
|
WORD $0x28c960ce //SM4EKEY V8.4S, V9.4S, V0.4S
|
||||||
|
WORD $0x09c961ce //SM4EKEY V9.4S, V8.4S, V1.4S
|
||||||
|
SM4EKEY_EXPORT_KEYS()
|
||||||
|
|
||||||
|
WORD $0x28c962ce //SM4EKEY V8.4S, V9.4S, V2.4S
|
||||||
|
WORD $0x09c963ce //SM4EKEY V9.4S, V8.4S, V3.4S
|
||||||
|
SM4EKEY_EXPORT_KEYS()
|
||||||
|
|
||||||
|
VLD1.P 64(R9), [V0.S4, V1.S4, V2.S4, V3.S4]
|
||||||
|
WORD $0x28c960ce //SM4EKEY V8.4S, V9.4S, V0.4S
|
||||||
|
WORD $0x09c961ce //SM4EKEY V9.4S, V8.4S, V1.4S
|
||||||
|
SM4EKEY_EXPORT_KEYS()
|
||||||
|
|
||||||
|
WORD $0x28c962ce //SM4EKEY V8.4S, V9.4S, V2.4S
|
||||||
|
WORD $0x09c963ce //SM4EKEY V9.4S, V8.4S, V3.4S
|
||||||
|
SM4EKEY_EXPORT_KEYS()
|
||||||
|
RET
|
||||||
|
|
||||||
|
// func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
|
||||||
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
|
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
|
||||||
MOVD xk+0(FP), R8
|
MOVD xk+0(FP), R8
|
||||||
MOVD dst+8(FP), R9
|
MOVD dst+8(FP), R9
|
||||||
MOVD src+32(FP), R10
|
MOVD src+32(FP), R10
|
||||||
|
MOVD src_len+40(FP), R12
|
||||||
|
MOVD inst+56(FP), R11
|
||||||
|
|
||||||
|
CMP $1, R11
|
||||||
|
BEQ sm4niblocks
|
||||||
|
|
||||||
VLD1 (R10), [V5.S4, V6.S4, V7.S4, V8.S4]
|
VLD1 (R10), [V5.S4, V6.S4, V7.S4, V8.S4]
|
||||||
VMOV V5.S[0], t0.S[0]
|
VMOV V5.S[0], t0.S[0]
|
||||||
@ -271,15 +334,26 @@ encryptBlocksLoop:
|
|||||||
VMOV t1.S[3], V8.S[2]
|
VMOV t1.S[3], V8.S[2]
|
||||||
VMOV t0.S[3], V8.S[3]
|
VMOV t0.S[3], V8.S[3]
|
||||||
VST1 [V8.B16], (R9)
|
VST1 [V8.B16], (R9)
|
||||||
|
|
||||||
RET
|
RET
|
||||||
|
|
||||||
|
sm4niblocks:
|
||||||
|
VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
|
||||||
|
VLD1.P 64(R8), [V4.S4, V5.S4, V6.S4, V7.S4]
|
||||||
|
sm4niblockloop:
|
||||||
|
SM4E_ROUND()
|
||||||
|
SUB $16, R12, R12 // remaining message length -= 16 bytes; loop again while it is non-zero
|
||||||
|
CBNZ R12, sm4niblockloop
|
||||||
|
RET
|
||||||
|
|
||||||
// func encryptBlockAsm(xk *uint32, dst, src *byte)
|
// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
|
||||||
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
|
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
|
||||||
MOVD xk+0(FP), R8
|
MOVD xk+0(FP), R8
|
||||||
MOVD dst+8(FP), R9
|
MOVD dst+8(FP), R9
|
||||||
MOVD src+16(FP), R10
|
MOVD src+16(FP), R10
|
||||||
|
MOVD inst+24(FP), R11
|
||||||
|
|
||||||
|
CMP $1, R11
|
||||||
|
BEQ sm4niblock
|
||||||
|
|
||||||
VLD1 (R10), [t0.S4]
|
VLD1 (R10), [t0.S4]
|
||||||
VREV32 t0.B16, t0.B16
|
VREV32 t0.B16, t0.B16
|
||||||
@ -312,5 +386,21 @@ encryptBlockLoop:
|
|||||||
VMOV t1.S[0], V8.S[2]
|
VMOV t1.S[0], V8.S[2]
|
||||||
VMOV t0.S[0], V8.S[3]
|
VMOV t0.S[0], V8.S[3]
|
||||||
VST1 [V8.B16], (R9)
|
VST1 [V8.B16], (R9)
|
||||||
|
RET
|
||||||
|
|
||||||
|
sm4niblock:
|
||||||
|
VLD1 (R10), [V8.B16]
|
||||||
|
VREV32 V8.B16, V8.B16
|
||||||
|
VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
|
||||||
|
WORD $0x0884c0ce //SM4E V8.4S, V0.4S
|
||||||
|
WORD $0x2884c0ce //SM4E V8.4S, V1.4S
|
||||||
|
WORD $0x4884c0ce //SM4E V8.4S, V2.4S
|
||||||
|
WORD $0x6884c0ce //SM4E V8.4S, V3.4S
|
||||||
|
VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
|
||||||
|
WORD $0x0884c0ce //SM4E V8.4S, V0.4S
|
||||||
|
WORD $0x2884c0ce //SM4E V8.4S, V1.4S
|
||||||
|
WORD $0x4884c0ce //SM4E V8.4S, V2.4S
|
||||||
|
WORD $0x6884c0ce //SM4E V8.4S, V3.4S
|
||||||
|
VREV32 V8.B16, V8.B16
|
||||||
|
VST1 [V8.B16], (R9)
|
||||||
RET
|
RET
|
||||||
|
@ -15,14 +15,19 @@ var supportsAES = cpu.X86.HasAES || cpu.ARM64.HasAES
|
|||||||
var supportsGFMUL = cpu.X86.HasPCLMULQDQ || cpu.ARM64.HasPMULL
|
var supportsGFMUL = cpu.X86.HasPCLMULQDQ || cpu.ARM64.HasPMULL
|
||||||
var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
|
var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
|
||||||
|
|
||||||
//go:noescape
|
const (
|
||||||
func encryptBlocksAsm(xk *uint32, dst, src []byte)
|
INST_AES int = iota
|
||||||
|
INST_SM4
|
||||||
|
)
|
||||||
|
|
||||||
//go:noescape
|
//go:noescape
|
||||||
func encryptBlockAsm(xk *uint32, dst, src *byte)
|
func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
|
||||||
|
|
||||||
//go:noescape
|
//go:noescape
|
||||||
func expandKeyAsm(key *byte, ck, enc, dec *uint32)
|
func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
|
||||||
|
|
||||||
|
//go:noescape
|
||||||
|
func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
|
||||||
|
|
||||||
type sm4CipherAsm struct {
|
type sm4CipherAsm struct {
|
||||||
sm4Cipher
|
sm4Cipher
|
||||||
@ -30,24 +35,66 @@ type sm4CipherAsm struct {
|
|||||||
blocksSize int
|
blocksSize int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type sm4CipherNI struct {
|
||||||
|
sm4Cipher
|
||||||
|
}
|
||||||
|
|
||||||
|
func newCipherNI(key []byte) (cipher.Block, error) {
|
||||||
|
c := &sm4CipherNI{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}}
|
||||||
|
expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0], INST_SM4)
|
||||||
|
if supportsGFMUL {
|
||||||
|
return &sm4CipherNIGCM{c}, nil
|
||||||
|
}
|
||||||
|
return c, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *sm4CipherNI) Encrypt(dst, src []byte) {
|
||||||
|
if len(src) < BlockSize {
|
||||||
|
panic("sm4: input not full block")
|
||||||
|
}
|
||||||
|
if len(dst) < BlockSize {
|
||||||
|
panic("sm4: output not full block")
|
||||||
|
}
|
||||||
|
if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
|
||||||
|
panic("sm4: invalid buffer overlap")
|
||||||
|
}
|
||||||
|
encryptBlockAsm(&c.enc[0], &dst[0], &src[0], INST_SM4)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *sm4CipherNI) Decrypt(dst, src []byte) {
|
||||||
|
if len(src) < BlockSize {
|
||||||
|
panic("sm4: input not full block")
|
||||||
|
}
|
||||||
|
if len(dst) < BlockSize {
|
||||||
|
panic("sm4: output not full block")
|
||||||
|
}
|
||||||
|
if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
|
||||||
|
panic("sm4: invalid buffer overlap")
|
||||||
|
}
|
||||||
|
encryptBlockAsm(&c.dec[0], &dst[0], &src[0], INST_SM4)
|
||||||
|
}
|
||||||
|
|
||||||
func newCipher(key []byte) (cipher.Block, error) {
|
func newCipher(key []byte) (cipher.Block, error) {
|
||||||
|
if supportSM4 {
|
||||||
|
return newCipherNI(key)
|
||||||
|
}
|
||||||
|
|
||||||
if !supportsAES {
|
if !supportsAES {
|
||||||
return newCipherGeneric(key)
|
return newCipherGeneric(key)
|
||||||
}
|
}
|
||||||
|
|
||||||
blocks := 4
|
blocks := 4
|
||||||
if useAVX2 {
|
if useAVX2 {
|
||||||
blocks = 8
|
blocks = 8
|
||||||
}
|
}
|
||||||
c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, blocks, blocks * BlockSize}
|
c := &sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, blocks, blocks * BlockSize}
|
||||||
expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0])
|
expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0], INST_AES)
|
||||||
if supportsAES && supportsGFMUL {
|
if supportsGFMUL {
|
||||||
return &sm4CipherGCM{c}, nil
|
return &sm4CipherGCM{c}, nil
|
||||||
}
|
}
|
||||||
return &c, nil
|
return c, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *sm4CipherAsm) BlockSize() int { return BlockSize }
|
|
||||||
|
|
||||||
func (c *sm4CipherAsm) Concurrency() int { return c.batchBlocks }
|
func (c *sm4CipherAsm) Concurrency() int { return c.batchBlocks }
|
||||||
|
|
||||||
func (c *sm4CipherAsm) Encrypt(dst, src []byte) {
|
func (c *sm4CipherAsm) Encrypt(dst, src []byte) {
|
||||||
@ -60,7 +107,7 @@ func (c *sm4CipherAsm) Encrypt(dst, src []byte) {
|
|||||||
if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
|
if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
|
||||||
panic("sm4: invalid buffer overlap")
|
panic("sm4: invalid buffer overlap")
|
||||||
}
|
}
|
||||||
encryptBlockAsm(&c.enc[0], &dst[0], &src[0])
|
encryptBlockAsm(&c.enc[0], &dst[0], &src[0], INST_AES)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *sm4CipherAsm) EncryptBlocks(dst, src []byte) {
|
func (c *sm4CipherAsm) EncryptBlocks(dst, src []byte) {
|
||||||
@ -73,7 +120,7 @@ func (c *sm4CipherAsm) EncryptBlocks(dst, src []byte) {
|
|||||||
if subtle.InexactOverlap(dst[:c.blocksSize], src[:c.blocksSize]) {
|
if subtle.InexactOverlap(dst[:c.blocksSize], src[:c.blocksSize]) {
|
||||||
panic("sm4: invalid buffer overlap")
|
panic("sm4: invalid buffer overlap")
|
||||||
}
|
}
|
||||||
encryptBlocksAsm(&c.enc[0], dst, src)
|
encryptBlocksAsm(&c.enc[0], dst, src, INST_AES)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *sm4CipherAsm) Decrypt(dst, src []byte) {
|
func (c *sm4CipherAsm) Decrypt(dst, src []byte) {
|
||||||
@ -86,7 +133,7 @@ func (c *sm4CipherAsm) Decrypt(dst, src []byte) {
|
|||||||
if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
|
if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
|
||||||
panic("sm4: invalid buffer overlap")
|
panic("sm4: invalid buffer overlap")
|
||||||
}
|
}
|
||||||
encryptBlockAsm(&c.dec[0], &dst[0], &src[0])
|
encryptBlockAsm(&c.dec[0], &dst[0], &src[0], INST_AES)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *sm4CipherAsm) DecryptBlocks(dst, src []byte) {
|
func (c *sm4CipherAsm) DecryptBlocks(dst, src []byte) {
|
||||||
@ -99,14 +146,16 @@ func (c *sm4CipherAsm) DecryptBlocks(dst, src []byte) {
|
|||||||
if subtle.InexactOverlap(dst[:c.blocksSize], src[:c.blocksSize]) {
|
if subtle.InexactOverlap(dst[:c.blocksSize], src[:c.blocksSize]) {
|
||||||
panic("sm4: invalid buffer overlap")
|
panic("sm4: invalid buffer overlap")
|
||||||
}
|
}
|
||||||
encryptBlocksAsm(&c.dec[0], dst, src)
|
encryptBlocksAsm(&c.dec[0], dst, src, INST_AES)
|
||||||
}
|
}
|
||||||
|
|
||||||
// expandKey is used by BenchmarkExpand to ensure that the asm implementation
|
// expandKey is used by BenchmarkExpand to ensure that the asm implementation
|
||||||
// of key expansion is used for the benchmark when it is available.
|
// of key expansion is used for the benchmark when it is available.
|
||||||
func expandKey(key []byte, enc, dec []uint32) {
|
func expandKey(key []byte, enc, dec []uint32) {
|
||||||
if supportsAES {
|
if supportSM4 {
|
||||||
expandKeyAsm(&key[0], &ck[0], &enc[0], &dec[0])
|
expandKeyAsm(&key[0], &ck[0], &enc[0], &dec[0], INST_SM4)
|
||||||
|
} else if supportsAES {
|
||||||
|
expandKeyAsm(&key[0], &ck[0], &enc[0], &dec[0], INST_AES)
|
||||||
} else {
|
} else {
|
||||||
expandKeyGo(key, enc, dec)
|
expandKeyGo(key, enc, dec)
|
||||||
}
|
}
|
||||||
|
@ -34,7 +34,7 @@ func TestExpandKey(t *testing.T) {
|
|||||||
}
|
}
|
||||||
io.ReadFull(rand.Reader, key)
|
io.ReadFull(rand.Reader, key)
|
||||||
expandKeyGo(key, encRes1, decRes1)
|
expandKeyGo(key, encRes1, decRes1)
|
||||||
expandKeyAsm(&key[0], &ck[0], &encRes2[0], &decRes2[0])
|
expandKey(key, encRes2, decRes2)
|
||||||
if !reflect.DeepEqual(encRes1, encRes2) {
|
if !reflect.DeepEqual(encRes1, encRes2) {
|
||||||
t.Errorf("expected=%v, result=%v\n", encRes1, encRes2)
|
t.Errorf("expected=%v, result=%v\n", encRes1, encRes2)
|
||||||
}
|
}
|
||||||
|
@ -2201,3 +2201,11 @@ avx2GcmSm4DecDone:
|
|||||||
VMOVDQU ACC0, (tPtr)
|
VMOVDQU ACC0, (tPtr)
|
||||||
VZEROUPPER
|
VZEROUPPER
|
||||||
RET
|
RET
|
||||||
|
|
||||||
|
// func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
|
||||||
|
TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0
|
||||||
|
RET
|
||||||
|
|
||||||
|
// func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
|
||||||
|
TEXT ·gcmSm4niDec(SB),NOSPLIT,$0
|
||||||
|
RET
|
||||||
|
@ -252,7 +252,7 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
|
|||||||
SM4_TAO_L1(x, y, z); \
|
SM4_TAO_L1(x, y, z); \
|
||||||
VEOR x.B16, t0.B16, t0.B16
|
VEOR x.B16, t0.B16, t0.B16
|
||||||
|
|
||||||
// func gcmSm4Init(productTable *[256]byte, rk []uint32)
|
// func gcmSm4Init(productTable *[256]byte, rk []uint32, inst int)
|
||||||
TEXT ·gcmSm4Init(SB),NOSPLIT,$0
|
TEXT ·gcmSm4Init(SB),NOSPLIT,$0
|
||||||
#define pTbl R0
|
#define pTbl R0
|
||||||
#define RK R1
|
#define RK R1
|
||||||
@ -260,6 +260,7 @@ TEXT ·gcmSm4Init(SB),NOSPLIT,$0
|
|||||||
|
|
||||||
MOVD productTable+0(FP), pTbl
|
MOVD productTable+0(FP), pTbl
|
||||||
MOVD rk+8(FP), RK
|
MOVD rk+8(FP), RK
|
||||||
|
MOVD inst+16(FP), R5
|
||||||
|
|
||||||
MOVD $0xC2, I
|
MOVD $0xC2, I
|
||||||
LSL $56, I
|
LSL $56, I
|
||||||
@ -269,6 +270,9 @@ TEXT ·gcmSm4Init(SB),NOSPLIT,$0
|
|||||||
VEOR ZERO.B16, ZERO.B16, ZERO.B16
|
VEOR ZERO.B16, ZERO.B16, ZERO.B16
|
||||||
|
|
||||||
// Encrypt block 0 with the SM4 keys to generate the hash key H
|
// Encrypt block 0 with the SM4 keys to generate the hash key H
|
||||||
|
CMP $1, R5
|
||||||
|
BEQ sm4InitSM4E
|
||||||
|
|
||||||
LOAD_SM4_AESNI_CONSTS()
|
LOAD_SM4_AESNI_CONSTS()
|
||||||
VEOR B0.B16, B0.B16, B0.B16
|
VEOR B0.B16, B0.B16, B0.B16
|
||||||
VEOR B1.B16, B1.B16, B1.B16
|
VEOR B1.B16, B1.B16, B1.B16
|
||||||
@ -290,7 +294,22 @@ sm4InitEncLoop:
|
|||||||
VMOV B1.S[0], B0.S[3]
|
VMOV B1.S[0], B0.S[3]
|
||||||
VMOV B2.S[0], B0.S[0]
|
VMOV B2.S[0], B0.S[0]
|
||||||
VMOV B3.S[0], B0.S[1]
|
VMOV B3.S[0], B0.S[1]
|
||||||
|
B sm4InitEncDone
|
||||||
|
sm4InitSM4E:
|
||||||
|
VEOR B0.B16, B0.B16, B0.B16
|
||||||
|
VLD1.P 64(RK), [T0.S4, T1.S4, T2.S4, T3.S4]
|
||||||
|
WORD $0x6085c0ce //SM4E V0.4S, V11.4S
|
||||||
|
WORD $0x8085c0ce //SM4E V0.4S, V12.4S
|
||||||
|
WORD $0xa085c0ce //SM4E V0.4S, V13.4S
|
||||||
|
WORD $0xc085c0ce //SM4E V0.4S, V14.4S
|
||||||
|
VLD1.P 64(RK), [T0.S4, T1.S4, T2.S4, T3.S4]
|
||||||
|
WORD $0x6085c0ce //SM4E V0.4S, V11.4S
|
||||||
|
WORD $0x8085c0ce //SM4E V0.4S, V12.4S
|
||||||
|
WORD $0xa085c0ce //SM4E V0.4S, V13.4S
|
||||||
|
WORD $0xc085c0ce //SM4E V0.4S, V14.4S
|
||||||
|
VREV32 B0.B16, B0.B16
|
||||||
|
VREV64 B0.B16, B0.B16
|
||||||
|
sm4InitEncDone:
|
||||||
// Multiply by 2 modulo P
|
// Multiply by 2 modulo P
|
||||||
VMOV B0.D[0], I
|
VMOV B0.D[0], I
|
||||||
ASR $63, I
|
ASR $63, I
|
||||||
@ -547,6 +566,7 @@ TEXT ·gcmSm4Enc(SB),NOSPLIT,$0
|
|||||||
VMOV H0, INC.S[3]
|
VMOV H0, INC.S[3]
|
||||||
VREV32 CTR.B16, CTR.B16
|
VREV32 CTR.B16, CTR.B16
|
||||||
VADD CTR.S4, INC.S4, CTR.S4
|
VADD CTR.S4, INC.S4, CTR.S4
|
||||||
|
|
||||||
// Skip to <8 blocks loop
|
// Skip to <8 blocks loop
|
||||||
CMP $128, srcPtrLen
|
CMP $128, srcPtrLen
|
||||||
|
|
||||||
@ -587,7 +607,7 @@ encOctetsEnc4Blocks1:
|
|||||||
VREV32 B2.B16, B2.B16
|
VREV32 B2.B16, B2.B16
|
||||||
VREV32 B3.B16, B3.B16
|
VREV32 B3.B16, B3.B16
|
||||||
TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
|
TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
|
||||||
// encryption first 4 blocks
|
// encryption second 4 blocks
|
||||||
PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
|
PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
|
||||||
MOVD rkSave, rk
|
MOVD rkSave, rk
|
||||||
|
|
||||||
@ -880,7 +900,7 @@ decOctetsEnc4Blocks1:
|
|||||||
VREV32 B3.B16, B3.B16
|
VREV32 B3.B16, B3.B16
|
||||||
TRANSPOSE_MATRIX(T1, T2, B2, B3, K0)
|
TRANSPOSE_MATRIX(T1, T2, B2, B3, K0)
|
||||||
|
|
||||||
// encryption first 4 blocks
|
// encryption second 4 blocks
|
||||||
PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
|
PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
|
||||||
MOVD rkSave, rk
|
MOVD rkSave, rk
|
||||||
|
|
||||||
|
525
sm4/gcm_sm4ni_arm64.s
Normal file
525
sm4/gcm_sm4ni_arm64.s
Normal file
@ -0,0 +1,525 @@
|
|||||||
|
#include "textflag.h"
|
||||||
|
|
||||||
|
#define B0 V0
|
||||||
|
#define B1 V1
|
||||||
|
#define B2 V2
|
||||||
|
#define B3 V3
|
||||||
|
#define B4 V4
|
||||||
|
#define B5 V5
|
||||||
|
#define B6 V6
|
||||||
|
#define B7 V7
|
||||||
|
|
||||||
|
#define ACC0 V8
|
||||||
|
#define ACC1 V9
|
||||||
|
#define ACCM V10
|
||||||
|
|
||||||
|
#define T0 V11
|
||||||
|
#define T1 V12
|
||||||
|
#define T2 V13
|
||||||
|
#define T3 V14
|
||||||
|
|
||||||
|
#define POLY V15
|
||||||
|
#define ZERO V16
|
||||||
|
#define INC V17
|
||||||
|
#define CTR V18
|
||||||
|
|
||||||
|
#define K0 V19
|
||||||
|
#define K1 V20
|
||||||
|
#define K2 V21
|
||||||
|
#define K3 V22
|
||||||
|
#define K4 V23
|
||||||
|
#define K5 V24
|
||||||
|
#define K6 V25
|
||||||
|
#define K7 V26
|
||||||
|
|
||||||
|
#define reduce() \
|
||||||
|
VEOR ACC0.B16, ACCM.B16, ACCM.B16 \
|
||||||
|
VEOR ACC1.B16, ACCM.B16, ACCM.B16 \
|
||||||
|
VEXT $8, ZERO.B16, ACCM.B16, T0.B16 \
|
||||||
|
VEXT $8, ACCM.B16, ZERO.B16, ACCM.B16 \
|
||||||
|
VEOR ACCM.B16, ACC0.B16, ACC0.B16 \
|
||||||
|
VEOR T0.B16, ACC1.B16, ACC1.B16 \
|
||||||
|
VPMULL POLY.D1, ACC0.D1, T0.Q1 \
|
||||||
|
VEXT $8, ACC0.B16, ACC0.B16, ACC0.B16 \
|
||||||
|
VEOR T0.B16, ACC0.B16, ACC0.B16 \
|
||||||
|
VPMULL POLY.D1, ACC0.D1, T0.Q1 \
|
||||||
|
VEOR T0.B16, ACC1.B16, ACC1.B16 \
|
||||||
|
VEXT $8, ACC1.B16, ACC1.B16, ACC1.B16 \
|
||||||
|
VEOR ACC1.B16, ACC0.B16, ACC0.B16 \
|
||||||
|
|
||||||
|
#define mulRound(X) \
|
||||||
|
VLD1.P 32(pTbl), [T1.B16, T2.B16] \
|
||||||
|
VREV64 X.B16, X.B16 \
|
||||||
|
VEXT $8, X.B16, X.B16, T0.B16 \
|
||||||
|
VEOR X.B16, T0.B16, T0.B16 \
|
||||||
|
VPMULL X.D1, T1.D1, T3.Q1 \
|
||||||
|
VEOR T3.B16, ACC1.B16, ACC1.B16 \
|
||||||
|
VPMULL2 X.D2, T1.D2, T3.Q1 \
|
||||||
|
VEOR T3.B16, ACC0.B16, ACC0.B16 \
|
||||||
|
VPMULL T0.D1, T2.D1, T3.Q1 \
|
||||||
|
VEOR T3.B16, ACCM.B16, ACCM.B16
|
||||||
|
|
||||||
|
#define sm4eEnc1block() \
|
||||||
|
WORD $0x6086c0ce \ //SM4E V0.4S, V19.4S
|
||||||
|
WORD $0x8086c0ce \ //SM4E V0.4S, V20.4S
|
||||||
|
WORD $0xa086c0ce \ //SM4E V0.4S, V21.4S
|
||||||
|
WORD $0xc086c0ce \ //SM4E V0.4S, V22.4S
|
||||||
|
WORD $0xe086c0ce \ //SM4E V0.4S, V23.4S
|
||||||
|
WORD $0x0087c0ce \ //SM4E V0.4S, V24.4S
|
||||||
|
WORD $0x2087c0ce \ //SM4E V0.4S, V25.4S
|
||||||
|
WORD $0x4087c0ce //SM4E V0.4S, V26.4S
|
||||||
|
|
||||||
|
#define sm4eEnc8blocks() \
|
||||||
|
sm4eEnc1block() \
|
||||||
|
WORD $0x6186c0ce \ //SM4E V1.4S, V19.4S
|
||||||
|
WORD $0x8186c0ce \ //SM4E V1.4S, V20.4S
|
||||||
|
WORD $0xa186c0ce \ //SM4E V1.4S, V21.4S
|
||||||
|
WORD $0xc186c0ce \ //SM4E V1.4S, V22.4S
|
||||||
|
WORD $0xe186c0ce \ //SM4E V1.4S, V23.4S
|
||||||
|
WORD $0x0187c0ce \ //SM4E V1.4S, V24.4S
|
||||||
|
WORD $0x2187c0ce \ //SM4E V1.4S, V25.4S
|
||||||
|
WORD $0x4187c0ce \ //SM4E V1.4S, V26.4S
|
||||||
|
WORD $0x6286c0ce \ //SM4E V2.4S, V19.4S
|
||||||
|
WORD $0x8286c0ce \ //SM4E V2.4S, V20.4S
|
||||||
|
WORD $0xa286c0ce \ //SM4E V2.4S, V21.4S
|
||||||
|
WORD $0xc286c0ce \ //SM4E V2.4S, V22.4S
|
||||||
|
WORD $0xe286c0ce \ //SM4E V2.4S, V23.4S
|
||||||
|
WORD $0x0287c0ce \ //SM4E V2.4S, V24.4S
|
||||||
|
WORD $0x2287c0ce \ //SM4E V2.4S, V25.4S
|
||||||
|
WORD $0x4287c0ce \ //SM4E V2.4S, V26.4S
|
||||||
|
WORD $0x6386c0ce \ //SM4E V3.4S, V19.4S
|
||||||
|
WORD $0x8386c0ce \ //SM4E V3.4S, V20.4S
|
||||||
|
WORD $0xa386c0ce \ //SM4E V3.4S, V21.4S
|
||||||
|
WORD $0xc386c0ce \ //SM4E V3.4S, V22.4S
|
||||||
|
WORD $0xe386c0ce \ //SM4E V3.4S, V23.4S
|
||||||
|
WORD $0x0387c0ce \ //SM4E V3.4S, V24.4S
|
||||||
|
WORD $0x2387c0ce \ //SM4E V3.4S, V25.4S
|
||||||
|
WORD $0x4387c0ce \ //SM4E V3.4S, V26.4S
|
||||||
|
WORD $0x6486c0ce \ //SM4E V4.4S, V19.4S
|
||||||
|
WORD $0x8486c0ce \ //SM4E V4.4S, V20.4S
|
||||||
|
WORD $0xa486c0ce \ //SM4E V4.4S, V21.4S
|
||||||
|
WORD $0xc486c0ce \ //SM4E V4.4S, V22.4S
|
||||||
|
WORD $0xe486c0ce \ //SM4E V4.4S, V23.4S
|
||||||
|
WORD $0x0487c0ce \ //SM4E V4.4S, V24.4S
|
||||||
|
WORD $0x2487c0ce \ //SM4E V4.4S, V25.4S
|
||||||
|
WORD $0x4487c0ce \ //SM4E V4.4S, V26.4S
|
||||||
|
WORD $0x6586c0ce \ //SM4E V5.4S, V19.4S
|
||||||
|
WORD $0x8586c0ce \ //SM4E V5.4S, V20.4S
|
||||||
|
WORD $0xa586c0ce \ //SM4E V5.4S, V21.4S
|
||||||
|
WORD $0xc586c0ce \ //SM4E V5.4S, V22.4S
|
||||||
|
WORD $0xe586c0ce \ //SM4E V5.4S, V23.4S
|
||||||
|
WORD $0x0587c0ce \ //SM4E V5.4S, V24.4S
|
||||||
|
WORD $0x2587c0ce \ //SM4E V5.4S, V25.4S
|
||||||
|
WORD $0x4587c0ce \ //SM4E V5.4S, V26.4S
|
||||||
|
WORD $0x6686c0ce \ //SM4E V6.4S, V19.4S
|
||||||
|
WORD $0x8686c0ce \ //SM4E V6.4S, V20.4S
|
||||||
|
WORD $0xa686c0ce \ //SM4E V6.4S, V21.4S
|
||||||
|
WORD $0xc686c0ce \ //SM4E V6.4S, V22.4S
|
||||||
|
WORD $0xe686c0ce \ //SM4E V6.4S, V23.4S
|
||||||
|
WORD $0x0687c0ce \ //SM4E V6.4S, V24.4S
|
||||||
|
WORD $0x2687c0ce \ //SM4E V6.4S, V25.4S
|
||||||
|
WORD $0x4687c0ce \ //SM4E V6.4S, V26.4S
|
||||||
|
WORD $0x6786c0ce \ //SM4E V7.4S, V19.4S
|
||||||
|
WORD $0x8786c0ce \ //SM4E V7.4S, V20.4S
|
||||||
|
WORD $0xa786c0ce \ //SM4E V7.4S, V21.4S
|
||||||
|
WORD $0xc786c0ce \ //SM4E V7.4S, V22.4S
|
||||||
|
WORD $0xe786c0ce \ //SM4E V7.4S, V23.4S
|
||||||
|
WORD $0x0787c0ce \ //SM4E V7.4S, V24.4S
|
||||||
|
WORD $0x2787c0ce \ //SM4E V7.4S, V25.4S
|
||||||
|
WORD $0x4787c0ce //SM4E V7.4S, V26.4S
|
||||||
|
|
||||||
|
// func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
|
||||||
|
TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0
|
||||||
|
#define pTbl R0
|
||||||
|
#define dstPtr R1
|
||||||
|
#define ctrPtr R2
|
||||||
|
#define srcPtr R3
|
||||||
|
#define rk R4
|
||||||
|
#define tPtr R5
|
||||||
|
#define srcPtrLen R6
|
||||||
|
#define aluCTR R7
|
||||||
|
#define aluTMP R8
|
||||||
|
#define H0 R9
|
||||||
|
#define H1 R10
|
||||||
|
#define pTblSave R11
|
||||||
|
#define rkSave R12
|
||||||
|
MOVD productTable+0(FP), pTbl
|
||||||
|
MOVD dst+8(FP), dstPtr
|
||||||
|
MOVD src_base+32(FP), srcPtr
|
||||||
|
MOVD src_len+40(FP), srcPtrLen
|
||||||
|
MOVD ctr+56(FP), ctrPtr
|
||||||
|
MOVD T+64(FP), tPtr
|
||||||
|
MOVD rk_base+72(FP), rk
|
||||||
|
|
||||||
|
MOVD $0xC2, H1
|
||||||
|
LSL $56, H1
|
||||||
|
MOVD $1, H0
|
||||||
|
VMOV H1, POLY.D[0]
|
||||||
|
VMOV H0, POLY.D[1]
|
||||||
|
VEOR ZERO.B16, ZERO.B16, ZERO.B16
|
||||||
|
|
||||||
|
MOVD pTbl, pTblSave
|
||||||
|
// Current tag, after AAD
|
||||||
|
VLD1 (tPtr), [ACC0.B16]
|
||||||
|
VEOR ACC1.B16, ACC1.B16, ACC1.B16
|
||||||
|
VEOR ACCM.B16, ACCM.B16, ACCM.B16
|
||||||
|
// Prepare initial counter, and the increment vector
|
||||||
|
VLD1 (ctrPtr), [CTR.B16]
|
||||||
|
VEOR INC.B16, INC.B16, INC.B16
|
||||||
|
MOVD $1, H0
|
||||||
|
VMOV H0, INC.S[3]
|
||||||
|
VREV32 CTR.B16, CTR.B16
|
||||||
|
VADD CTR.S4, INC.S4, CTR.S4
|
||||||
|
|
||||||
|
// Skip to <8 blocks loop
|
||||||
|
CMP $128, srcPtrLen
|
||||||
|
|
||||||
|
MOVD rk, H0
|
||||||
|
// The SM4 round keys are loaded into K0 .. K7
|
||||||
|
VLD1.P 64(H0), [K0.S4, K1.S4, K2.S4, K3.S4]
|
||||||
|
VLD1.P 64(H0), [K4.S4, K5.S4, K6.S4, K7.S4]
|
||||||
|
|
||||||
|
BLT startSingles
|
||||||
|
octetsLoop:
|
||||||
|
SUB $128, srcPtrLen
|
||||||
|
// Prepare 8 counters
|
||||||
|
VMOV CTR.B16, B0.B16
|
||||||
|
VADD B0.S4, INC.S4, B1.S4
|
||||||
|
VADD B1.S4, INC.S4, B2.S4
|
||||||
|
VADD B2.S4, INC.S4, B3.S4
|
||||||
|
VADD B3.S4, INC.S4, B4.S4
|
||||||
|
VADD B4.S4, INC.S4, B5.S4
|
||||||
|
VADD B5.S4, INC.S4, B6.S4
|
||||||
|
VADD B6.S4, INC.S4, B7.S4
|
||||||
|
VADD B7.S4, INC.S4, CTR.S4
|
||||||
|
|
||||||
|
sm4eEnc8blocks()
|
||||||
|
VREV32 B0.B16, B0.B16
|
||||||
|
VREV32 B1.B16, B1.B16
|
||||||
|
VREV32 B2.B16, B2.B16
|
||||||
|
VREV32 B3.B16, B3.B16
|
||||||
|
VREV32 B4.B16, B4.B16
|
||||||
|
VREV32 B5.B16, B5.B16
|
||||||
|
VREV32 B6.B16, B6.B16
|
||||||
|
VREV32 B7.B16, B7.B16
|
||||||
|
|
||||||
|
// XOR plaintext and store ciphertext
|
||||||
|
VLD1.P 32(srcPtr), [T1.B16, T2.B16]
|
||||||
|
VEOR B0.B16, T1.B16, B0.B16
|
||||||
|
VEOR B1.B16, T2.B16, B1.B16
|
||||||
|
VST1.P [B0.B16, B1.B16], 32(dstPtr)
|
||||||
|
VLD1.P 32(srcPtr), [T1.B16, T2.B16]
|
||||||
|
VEOR B2.B16, T1.B16, B2.B16
|
||||||
|
VEOR B3.B16, T2.B16, B3.B16
|
||||||
|
VST1.P [B2.B16, B3.B16], 32(dstPtr)
|
||||||
|
VLD1.P 32(srcPtr), [T1.B16, T2.B16]
|
||||||
|
VEOR B4.B16, T1.B16, B4.B16
|
||||||
|
VEOR B5.B16, T2.B16, B5.B16
|
||||||
|
VST1.P [B4.B16, B5.B16], 32(dstPtr)
|
||||||
|
VLD1.P 32(srcPtr), [T1.B16, T2.B16]
|
||||||
|
VEOR B6.B16, T1.B16, B6.B16
|
||||||
|
VEOR B7.B16, T2.B16, B7.B16
|
||||||
|
VST1.P [B6.B16, B7.B16], 32(dstPtr)
|
||||||
|
|
||||||
|
VLD1.P 32(pTbl), [T1.B16, T2.B16]
|
||||||
|
VREV64 B0.B16, B0.B16
|
||||||
|
VEOR ACC0.B16, B0.B16, B0.B16
|
||||||
|
VEXT $8, B0.B16, B0.B16, T0.B16
|
||||||
|
VEOR B0.B16, T0.B16, T0.B16
|
||||||
|
VPMULL B0.D1, T1.D1, ACC1.Q1
|
||||||
|
VPMULL2 B0.D2, T1.D2, ACC0.Q1
|
||||||
|
VPMULL T0.D1, T2.D1, ACCM.Q1
|
||||||
|
|
||||||
|
mulRound(B1)
|
||||||
|
mulRound(B2)
|
||||||
|
mulRound(B3)
|
||||||
|
mulRound(B4)
|
||||||
|
mulRound(B5)
|
||||||
|
mulRound(B6)
|
||||||
|
mulRound(B7)
|
||||||
|
MOVD pTblSave, pTbl
|
||||||
|
reduce()
|
||||||
|
|
||||||
|
CMP $128, srcPtrLen
|
||||||
|
BGE octetsLoop
|
||||||
|
|
||||||
|
startSingles:
|
||||||
|
CBZ srcPtrLen, done
|
||||||
|
ADD $14*16, pTbl
|
||||||
|
// Preload H and its Karatsuba precomp
|
||||||
|
VLD1.P (pTbl), [T1.B16, T2.B16]
|
||||||
|
|
||||||
|
singlesLoop:
|
||||||
|
CMP $16, srcPtrLen
|
||||||
|
BLT tail
|
||||||
|
SUB $16, srcPtrLen
|
||||||
|
|
||||||
|
VMOV CTR.B16, B0.B16
|
||||||
|
VADD CTR.S4, INC.S4, CTR.S4
|
||||||
|
sm4eEnc1block()
|
||||||
|
VREV32 B0.B16, B0.B16
|
||||||
|
|
||||||
|
singlesLast:
|
||||||
|
VLD1.P 16(srcPtr), [T0.B16]
|
||||||
|
VEOR T0.B16, B0.B16, B0.B16
|
||||||
|
|
||||||
|
encReduce:
|
||||||
|
VST1.P [B0.B16], 16(dstPtr)
|
||||||
|
|
||||||
|
VREV64 B0.B16, B0.B16
|
||||||
|
VEOR ACC0.B16, B0.B16, B0.B16
|
||||||
|
|
||||||
|
VEXT $8, B0.B16, B0.B16, T0.B16
|
||||||
|
VEOR B0.B16, T0.B16, T0.B16
|
||||||
|
VPMULL B0.D1, T1.D1, ACC1.Q1
|
||||||
|
VPMULL2 B0.D2, T1.D2, ACC0.Q1
|
||||||
|
VPMULL T0.D1, T2.D1, ACCM.Q1
|
||||||
|
|
||||||
|
reduce()
|
||||||
|
|
||||||
|
B singlesLoop
|
||||||
|
tail:
|
||||||
|
CBZ srcPtrLen, done
|
||||||
|
|
||||||
|
VEOR T0.B16, T0.B16, T0.B16
|
||||||
|
VEOR T3.B16, T3.B16, T3.B16
|
||||||
|
MOVD $0, H1
|
||||||
|
SUB $1, H1
|
||||||
|
ADD srcPtrLen, srcPtr
|
||||||
|
|
||||||
|
TBZ $3, srcPtrLen, ld4
|
||||||
|
MOVD.W -8(srcPtr), H0
|
||||||
|
VMOV H0, T0.D[0]
|
||||||
|
VMOV H1, T3.D[0]
|
||||||
|
|
||||||
|
ld4:
|
||||||
|
TBZ $2, srcPtrLen, ld2
|
||||||
|
MOVW.W -4(srcPtr), H0
|
||||||
|
VEXT $12, T0.B16, ZERO.B16, T0.B16
|
||||||
|
VEXT $12, T3.B16, ZERO.B16, T3.B16
|
||||||
|
VMOV H0, T0.S[0]
|
||||||
|
VMOV H1, T3.S[0]
|
||||||
|
ld2:
|
||||||
|
TBZ $1, srcPtrLen, ld1
|
||||||
|
MOVH.W -2(srcPtr), H0
|
||||||
|
VEXT $14, T0.B16, ZERO.B16, T0.B16
|
||||||
|
VEXT $14, T3.B16, ZERO.B16, T3.B16
|
||||||
|
VMOV H0, T0.H[0]
|
||||||
|
VMOV H1, T3.H[0]
|
||||||
|
ld1:
|
||||||
|
TBZ $0, srcPtrLen, ld0
|
||||||
|
MOVB.W -1(srcPtr), H0
|
||||||
|
VEXT $15, T0.B16, ZERO.B16, T0.B16
|
||||||
|
VEXT $15, T3.B16, ZERO.B16, T3.B16
|
||||||
|
VMOV H0, T0.B[0]
|
||||||
|
VMOV H1, T3.B[0]
|
||||||
|
ld0:
|
||||||
|
MOVD ZR, srcPtrLen
|
||||||
|
VMOV CTR.B16, B0.B16
|
||||||
|
sm4eEnc1block()
|
||||||
|
VREV32 B0.B16, B0.B16
|
||||||
|
|
||||||
|
tailLast:
|
||||||
|
VEOR T0.B16, B0.B16, B0.B16
|
||||||
|
VAND T3.B16, B0.B16, B0.B16
|
||||||
|
B encReduce
|
||||||
|
|
||||||
|
done:
|
||||||
|
VST1 [ACC0.B16], (tPtr)
|
||||||
|
RET
|
||||||
|
|
||||||
|
// func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
|
||||||
|
TEXT ·gcmSm4niDec(SB),NOSPLIT,$0
|
||||||
|
MOVD productTable+0(FP), pTbl
|
||||||
|
MOVD dst+8(FP), dstPtr
|
||||||
|
MOVD src_base+32(FP), srcPtr
|
||||||
|
MOVD src_len+40(FP), srcPtrLen
|
||||||
|
MOVD ctr+56(FP), ctrPtr
|
||||||
|
MOVD T+64(FP), tPtr
|
||||||
|
MOVD rk_base+72(FP), rk
|
||||||
|
|
||||||
|
MOVD $0xC2, H1
|
||||||
|
LSL $56, H1
|
||||||
|
MOVD $1, H0
|
||||||
|
VMOV H1, POLY.D[0]
|
||||||
|
VMOV H0, POLY.D[1]
|
||||||
|
VEOR ZERO.B16, ZERO.B16, ZERO.B16
|
||||||
|
|
||||||
|
MOVD pTbl, pTblSave
|
||||||
|
MOVD rk, rkSave
|
||||||
|
// Current tag, after AAD
|
||||||
|
VLD1 (tPtr), [ACC0.B16]
|
||||||
|
VEOR ACC1.B16, ACC1.B16, ACC1.B16
|
||||||
|
VEOR ACCM.B16, ACCM.B16, ACCM.B16
|
||||||
|
// Prepare initial counter, and the increment vector
|
||||||
|
VLD1 (ctrPtr), [CTR.B16]
|
||||||
|
VEOR INC.B16, INC.B16, INC.B16
|
||||||
|
MOVD $1, H0
|
||||||
|
VMOV H0, INC.S[3]
|
||||||
|
VREV32 CTR.B16, CTR.B16
|
||||||
|
VADD CTR.S4, INC.S4, CTR.S4
|
||||||
|
|
||||||
|
// Skip to <8 blocks loop
|
||||||
|
CMP $128, srcPtrLen
|
||||||
|
|
||||||
|
MOVD rk, H0
|
||||||
|
// The SM4 round keys are loaded into K0 .. K7
|
||||||
|
VLD1.P 64(H0), [K0.S4, K1.S4, K2.S4, K3.S4]
|
||||||
|
VLD1.P 64(H0), [K4.S4, K5.S4, K6.S4, K7.S4]
|
||||||
|
|
||||||
|
BLT startSingles
|
||||||
|
octetsLoop:
|
||||||
|
SUB $128, srcPtrLen
|
||||||
|
|
||||||
|
VMOV CTR.B16, B0.B16
|
||||||
|
VADD B0.S4, INC.S4, B1.S4
|
||||||
|
VADD B1.S4, INC.S4, B2.S4
|
||||||
|
VADD B2.S4, INC.S4, B3.S4
|
||||||
|
VADD B3.S4, INC.S4, B4.S4
|
||||||
|
VADD B4.S4, INC.S4, B5.S4
|
||||||
|
VADD B5.S4, INC.S4, B6.S4
|
||||||
|
VADD B6.S4, INC.S4, B7.S4
|
||||||
|
VADD B7.S4, INC.S4, CTR.S4
|
||||||
|
|
||||||
|
sm4eEnc8blocks()
|
||||||
|
VREV32 B0.B16, T1.B16
|
||||||
|
VREV32 B1.B16, T2.B16
|
||||||
|
VREV32 B2.B16, B2.B16
|
||||||
|
VREV32 B3.B16, B3.B16
|
||||||
|
VREV32 B4.B16, B4.B16
|
||||||
|
VREV32 B5.B16, B5.B16
|
||||||
|
VREV32 B6.B16, B6.B16
|
||||||
|
VREV32 B7.B16, B7.B16
|
||||||
|
|
||||||
|
VLD1.P 32(srcPtr), [B0.B16, B1.B16]
|
||||||
|
VEOR B0.B16, T1.B16, T1.B16
|
||||||
|
VEOR B1.B16, T2.B16, T2.B16
|
||||||
|
VST1.P [T1.B16, T2.B16], 32(dstPtr)
|
||||||
|
|
||||||
|
VLD1.P 32(pTbl), [T1.B16, T2.B16]
|
||||||
|
VREV64 B0.B16, B0.B16
|
||||||
|
VEOR ACC0.B16, B0.B16, B0.B16
|
||||||
|
VEXT $8, B0.B16, B0.B16, T0.B16
|
||||||
|
VEOR B0.B16, T0.B16, T0.B16
|
||||||
|
VPMULL B0.D1, T1.D1, ACC1.Q1
|
||||||
|
VPMULL2 B0.D2, T1.D2, ACC0.Q1
|
||||||
|
VPMULL T0.D1, T2.D1, ACCM.Q1
|
||||||
|
mulRound(B1)
|
||||||
|
|
||||||
|
VLD1.P 32(srcPtr), [B0.B16, B1.B16]
|
||||||
|
VEOR B2.B16, B0.B16, T1.B16
|
||||||
|
VEOR B3.B16, B1.B16, T2.B16
|
||||||
|
VST1.P [T1.B16, T2.B16], 32(dstPtr)
|
||||||
|
mulRound(B0)
|
||||||
|
mulRound(B1)
|
||||||
|
|
||||||
|
VLD1.P 32(srcPtr), [B0.B16, B1.B16]
|
||||||
|
VEOR B4.B16, B0.B16, T1.B16
|
||||||
|
VEOR B5.B16, B1.B16, T2.B16
|
||||||
|
VST1.P [T1.B16, T2.B16], 32(dstPtr)
|
||||||
|
mulRound(B0)
|
||||||
|
mulRound(B1)
|
||||||
|
|
||||||
|
VLD1.P 32(srcPtr), [B0.B16, B1.B16]
|
||||||
|
VEOR B6.B16, B0.B16, T1.B16
|
||||||
|
VEOR B7.B16, B1.B16, T2.B16
|
||||||
|
VST1.P [T1.B16, T2.B16], 32(dstPtr)
|
||||||
|
mulRound(B0)
|
||||||
|
mulRound(B1)
|
||||||
|
|
||||||
|
MOVD pTblSave, pTbl
|
||||||
|
reduce()
|
||||||
|
|
||||||
|
CMP $128, srcPtrLen
|
||||||
|
BGE octetsLoop
|
||||||
|
|
||||||
|
startSingles:
|
||||||
|
CBZ srcPtrLen, done
|
||||||
|
ADD $14*16, pTbl
|
||||||
|
// Preload H and its Karatsuba precomp
|
||||||
|
VLD1.P (pTbl), [T1.B16, T2.B16]
|
||||||
|
|
||||||
|
singlesLoop:
|
||||||
|
CMP $16, srcPtrLen
|
||||||
|
BLT tail
|
||||||
|
SUB $16, srcPtrLen
|
||||||
|
|
||||||
|
VLD1.P 16(srcPtr), [T0.B16]
|
||||||
|
VREV64 T0.B16, B5.B16
|
||||||
|
|
||||||
|
VMOV CTR.B16, B0.B16
|
||||||
|
VADD CTR.S4, INC.S4, CTR.S4
|
||||||
|
sm4eEnc1block()
|
||||||
|
VREV32 B0.B16, B0.B16
|
||||||
|
|
||||||
|
singlesLast:
|
||||||
|
VEOR T0.B16, B0.B16, B0.B16
|
||||||
|
VST1.P [B0.B16], 16(dstPtr)
|
||||||
|
|
||||||
|
VEOR ACC0.B16, B5.B16, B5.B16
|
||||||
|
VEXT $8, B5.B16, B5.B16, T0.B16
|
||||||
|
VEOR B5.B16, T0.B16, T0.B16
|
||||||
|
VPMULL B5.D1, T1.D1, ACC1.Q1
|
||||||
|
VPMULL2 B5.D2, T1.D2, ACC0.Q1
|
||||||
|
VPMULL T0.D1, T2.D1, ACCM.Q1
|
||||||
|
reduce()
|
||||||
|
|
||||||
|
B singlesLoop
|
||||||
|
tail:
|
||||||
|
CBZ srcPtrLen, done
|
||||||
|
VMOV CTR.B16, B0.B16
|
||||||
|
VADD CTR.S4, INC.S4, CTR.S4
|
||||||
|
sm4eEnc1block()
|
||||||
|
VREV32 B0.B16, B0.B16
|
||||||
|
tailLast:
|
||||||
|
// Assuming it is safe to load past srcPtr due to the presence of the tag
|
||||||
|
// B5 stored last ciphertext
|
||||||
|
VLD1 (srcPtr), [B5.B16]
|
||||||
|
|
||||||
|
VEOR B5.B16, B0.B16, B0.B16
|
||||||
|
|
||||||
|
VEOR T3.B16, T3.B16, T3.B16
|
||||||
|
MOVD $0, H1
|
||||||
|
SUB $1, H1
|
||||||
|
|
||||||
|
TBZ $3, srcPtrLen, ld4 // Test if srcPtrLen < 8, if yes, goto ld4
|
||||||
|
VMOV B0.D[0], H0
|
||||||
|
MOVD.P H0, 8(dstPtr)
|
||||||
|
VMOV H1, T3.D[0]
|
||||||
|
VEXT $8, ZERO.B16, B0.B16, B0.B16
|
||||||
|
ld4:
|
||||||
|
TBZ $2, srcPtrLen, ld2 // Test if srcPtrLen < 4, if yes, goto ld2
|
||||||
|
VMOV B0.S[0], H0
|
||||||
|
MOVW.P H0, 4(dstPtr)
|
||||||
|
VEXT $12, T3.B16, ZERO.B16, T3.B16
|
||||||
|
VMOV H1, T3.S[0]
|
||||||
|
VEXT $4, ZERO.B16, B0.B16, B0.B16
|
||||||
|
ld2:
|
||||||
|
TBZ $1, srcPtrLen, ld1 // Test if srcPtrLen < 2, if yes, goto ld1
|
||||||
|
VMOV B0.H[0], H0
|
||||||
|
MOVH.P H0, 2(dstPtr)
|
||||||
|
VEXT $14, T3.B16, ZERO.B16, T3.B16
|
||||||
|
VMOV H1, T3.H[0]
|
||||||
|
VEXT $2, ZERO.B16, B0.B16, B0.B16
|
||||||
|
ld1:
|
||||||
|
TBZ $0, srcPtrLen, ld0 // Test if srcPtrLen < 1, if yes, goto ld0
|
||||||
|
VMOV B0.B[0], H0
|
||||||
|
MOVB.P H0, 1(dstPtr)
|
||||||
|
VEXT $15, T3.B16, ZERO.B16, T3.B16
|
||||||
|
VMOV H1, T3.B[0]
|
||||||
|
ld0:
|
||||||
|
|
||||||
|
VAND T3.B16, B5.B16, B5.B16
|
||||||
|
VREV64 B5.B16, B5.B16
|
||||||
|
|
||||||
|
VEOR ACC0.B16, B5.B16, B5.B16
|
||||||
|
VEXT $8, B5.B16, B5.B16, T0.B16
|
||||||
|
VEOR B5.B16, T0.B16, T0.B16
|
||||||
|
VPMULL B5.D1, T1.D1, ACC1.Q1
|
||||||
|
VPMULL2 B5.D2, T1.D2, ACC0.Q1
|
||||||
|
VPMULL T0.D1, T2.D1, ACCM.Q1
|
||||||
|
reduce()
|
||||||
|
done:
|
||||||
|
VST1 [ACC0.B16], (tPtr)
|
||||||
|
|
||||||
|
RET
|
137
sm4/gen_arm64_ni.go
Normal file
137
sm4/gen_arm64_ni.go
Normal file
@ -0,0 +1,137 @@
|
|||||||
|
// Not used yet!!!
|
||||||
|
// go run gen_arm64_ni.go
|
||||||
|
|
||||||
|
//go:build ignore
|
||||||
|
// +build ignore
|
||||||
|
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"math/bits"
|
||||||
|
"os"
|
||||||
|
)
|
||||||
|
|
||||||
|
// sm4e builds the 32-bit word for "SM4E <Vd>.4S, <Vn>.4S": the base pattern
// 0xcec08400 with Vd in bits 0-4 and Vn in bits 5-9 (both register numbers
// are truncated to 5 bits). The assembled word is returned byte-swapped.
//
// NOTE(review): the result is printed after a WORD directive by sm4eRound;
// confirm that the byte swap matches how the Go arm64 assembler lays out
// WORD operands.
func sm4e(Vd, Vn byte) uint32 {
	const base uint32 = 0xcec08400

	rd := uint32(Vd & 0x1f)
	rn := uint32(Vn&0x1f) << 5
	return bits.ReverseBytes32(base | rd | rn)
}
|
||||||
|
|
||||||
|
// sm4ekey builds the 32-bit word for "SM4EKEY <Vd>.4S, <Vn>.4S, <Vm>.4S": the
// base pattern 0xce60c800 with Vd in bits 0-4, Vn in bits 5-9 and Vm in bits
// 16-20 (register numbers are truncated to 5 bits). The assembled word is
// returned byte-swapped.
//
// NOTE(review): the result is printed after a WORD directive by
// sm4ekeyRound; confirm that the byte swap matches how the Go arm64 assembler
// lays out WORD operands.
func sm4ekey(Vd, Vn, Vm byte) uint32 {
	const base uint32 = 0xce60c800

	rd := uint32(Vd & 0x1f)
	rn := uint32(Vn&0x1f) << 5
	rm := uint32(Vm&0x1f) << 16
	return bits.ReverseBytes32(base | rd | rn | rm)
}
|
||||||
|
|
||||||
|
func sm4ekeyRound(buf *bytes.Buffer, d, n, m byte) {
|
||||||
|
fmt.Fprintf(buf, "\tWORD $0x%08x //SM4EKEY V%d.4S, V%d.4S, V%d.4S\n", sm4ekey(d, n, m), d, n, m)
|
||||||
|
}
|
||||||
|
|
||||||
|
func sm4eRound(buf *bytes.Buffer, d, n byte) {
|
||||||
|
fmt.Fprintf(buf, "\tWORD $0x%08x //SM4E V%d.4S, V%d.4S\n", sm4e(d, n), d, n)
|
||||||
|
}
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
buf := new(bytes.Buffer)
|
||||||
|
fmt.Fprint(buf, `
|
||||||
|
// Generated by gen_arm64_ni.go. DO NOT EDIT.
|
||||||
|
|
||||||
|
#include "textflag.h"
|
||||||
|
|
||||||
|
// func expandKeySM4E(key *byte, fk, ck, enc *uint32)
|
||||||
|
TEXT ·expandKeySM4E(SB),NOSPLIT,$0
|
||||||
|
MOVD key+0(FP), R8
|
||||||
|
MOVD fk+8(FP), R9
|
||||||
|
MOVD ck+16(FP), R10
|
||||||
|
MOVD enc+24(FP), R11
|
||||||
|
|
||||||
|
VLD1 (R8), [V9.B16]
|
||||||
|
VREV32 V9.B16, V9.B16
|
||||||
|
VLD1 (R9), [V8.S4]
|
||||||
|
VEOR V9, V8, V9
|
||||||
|
VLD1.P 64(R10), [V0.S4, V1.S4, V2.S4, V3.S4]
|
||||||
|
`[1:])
|
||||||
|
|
||||||
|
sm4ekeyRound(buf, 8, 9, 0)
|
||||||
|
sm4ekeyRound(buf, 9, 8, 1)
|
||||||
|
fmt.Fprintf(buf, "\tVST1.P [V8.S4, V9.S4], 32(R11)\n")
|
||||||
|
sm4ekeyRound(buf, 8, 9, 2)
|
||||||
|
sm4ekeyRound(buf, 9, 8, 3)
|
||||||
|
fmt.Fprintf(buf, "\tVST1.P [V8.S4, V9.S4], 32(R11)\n")
|
||||||
|
fmt.Fprintf(buf, "\tVLD1.P 64(R10), [V0.S4, V1.S4, V2.S4, V3.S4]\n")
|
||||||
|
sm4ekeyRound(buf, 8, 9, 0)
|
||||||
|
sm4ekeyRound(buf, 9, 8, 1)
|
||||||
|
fmt.Fprintf(buf, "\tVST1.P [V8.S4, V9.S4], 32(R11)\n")
|
||||||
|
sm4ekeyRound(buf, 8, 9, 2)
|
||||||
|
sm4ekeyRound(buf, 9, 8, 3)
|
||||||
|
fmt.Fprintf(buf, `
|
||||||
|
VST1.P [V8.S4, V9.S4], 32(R11)
|
||||||
|
RET
|
||||||
|
`[1:])
|
||||||
|
fmt.Fprint(buf, `
|
||||||
|
|
||||||
|
// func encryptBlockSM4E(xk *uint32, dst, src *byte)
|
||||||
|
TEXT ·encryptBlockSM4E(SB),NOSPLIT,$0
|
||||||
|
MOVD xk+0(FP), R8
|
||||||
|
MOVD dst+8(FP), R9
|
||||||
|
MOVD src+16(FP), R10
|
||||||
|
|
||||||
|
VLD1 (R10), [V8.B16]
|
||||||
|
VREV32 V8.B16, V8.B16
|
||||||
|
VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
|
||||||
|
`[1:])
|
||||||
|
sm4eRound(buf, 8, 0)
|
||||||
|
sm4eRound(buf, 8, 1)
|
||||||
|
sm4eRound(buf, 8, 2)
|
||||||
|
sm4eRound(buf, 8, 3)
|
||||||
|
fmt.Fprintf(buf, "\tVLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]\n")
|
||||||
|
sm4eRound(buf, 8, 0)
|
||||||
|
sm4eRound(buf, 8, 1)
|
||||||
|
sm4eRound(buf, 8, 2)
|
||||||
|
sm4eRound(buf, 8, 3)
|
||||||
|
fmt.Fprintf(buf, `
|
||||||
|
VREV32 V8.B16, V8.B16
|
||||||
|
VST1 [V8.B16], (R9)
|
||||||
|
RET
|
||||||
|
`[1:])
|
||||||
|
|
||||||
|
fmt.Fprint(buf, `
|
||||||
|
|
||||||
|
// func encryptBlocksSM4E(xk *uint32, dst, src *byte)
|
||||||
|
TEXT ·encryptBlocksSM4E(SB),NOSPLIT,$0
|
||||||
|
MOVD xk+0(FP), R8
|
||||||
|
MOVD dst+8(FP), R9
|
||||||
|
MOVD src+16(FP), R10
|
||||||
|
|
||||||
|
VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
|
||||||
|
VLD1.P 64(R8), [V4.S4, V5.S4, V6.S4, V7.S4]
|
||||||
|
|
||||||
|
`[1:])
|
||||||
|
for i := 0; i < 4; i++ {
|
||||||
|
fmt.Fprintf(buf, "\tVLD1.P 16(R10), [V8.B16]\n")
|
||||||
|
fmt.Fprintf(buf, "\tVREV32 V8.B16, V8.B16\n")
|
||||||
|
sm4eRound(buf, 8, 0)
|
||||||
|
sm4eRound(buf, 8, 1)
|
||||||
|
sm4eRound(buf, 8, 2)
|
||||||
|
sm4eRound(buf, 8, 3)
|
||||||
|
sm4eRound(buf, 8, 4)
|
||||||
|
sm4eRound(buf, 8, 5)
|
||||||
|
sm4eRound(buf, 8, 6)
|
||||||
|
sm4eRound(buf, 8, 7)
|
||||||
|
fmt.Fprintf(buf, "\tVREV32 V8.B16, V8.B16\n")
|
||||||
|
fmt.Fprintf(buf, "\tVST1.P [V8.B16], 16(R9)\n\n")
|
||||||
|
}
|
||||||
|
fmt.Fprintf(buf, `
|
||||||
|
RET
|
||||||
|
`[1:])
|
||||||
|
|
||||||
|
src := buf.Bytes()
|
||||||
|
// fmt.Println(string(src))
|
||||||
|
err := os.WriteFile("sm4e_arm64.s", src, 0644)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
}
|
@ -12,16 +12,16 @@ import (
|
|||||||
|
|
||||||
// sm4CipherGCM implements crypto/cipher.gcmAble so that crypto/cipher.NewGCM
|
// sm4CipherGCM implements crypto/cipher.gcmAble so that crypto/cipher.NewGCM
|
||||||
// will use the optimised implementation in this file when possible. Instances
|
// will use the optimised implementation in this file when possible. Instances
|
||||||
// of this type only exist when hasGCMAsm returns true.
|
// of this type only exist when hasGCMAsm and hasAES returns true.
|
||||||
type sm4CipherGCM struct {
|
type sm4CipherGCM struct {
|
||||||
sm4CipherAsm
|
*sm4CipherAsm
|
||||||
}
|
}
|
||||||
|
|
||||||
// Assert that sm4CipherGCM implements the gcmAble interface.
|
// Assert that sm4CipherGCM implements the gcmAble interface.
|
||||||
var _ gcmAble = (*sm4CipherGCM)(nil)
|
var _ gcmAble = (*sm4CipherGCM)(nil)
|
||||||
|
|
||||||
//go:noescape
|
//go:noescape
|
||||||
func gcmSm4Init(productTable *[256]byte, rk []uint32)
|
func gcmSm4Init(productTable *[256]byte, rk []uint32, inst int)
|
||||||
|
|
||||||
//go:noescape
|
//go:noescape
|
||||||
func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
|
func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
|
||||||
@ -35,6 +35,33 @@ func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)
|
|||||||
//go:noescape
|
//go:noescape
|
||||||
func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
|
func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
|
||||||
|
|
||||||
|
// gcmSm4InitInst is used for test
|
||||||
|
func gcmSm4InitInst(productTable *[256]byte, rk []uint32) {
|
||||||
|
if supportSM4 {
|
||||||
|
gcmSm4Init(productTable, rk, INST_SM4)
|
||||||
|
} else {
|
||||||
|
gcmSm4Init(productTable, rk, INST_AES)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// gcmSm4EncInst is used for test
|
||||||
|
func gcmSm4EncInst(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) {
|
||||||
|
if supportSM4 {
|
||||||
|
gcmSm4niEnc(productTable, dst, src, ctr, T, rk)
|
||||||
|
} else {
|
||||||
|
gcmSm4Enc(productTable, dst, src, ctr, T, rk)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// gcmSm4DecInst is used for test
|
||||||
|
func gcmSm4DecInst(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) {
|
||||||
|
if supportSM4 {
|
||||||
|
gcmSm4niDec(productTable, dst, src, ctr, T, rk)
|
||||||
|
} else {
|
||||||
|
gcmSm4Dec(productTable, dst, src, ctr, T, rk)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
type gcmAsm struct {
|
type gcmAsm struct {
|
||||||
gcm
|
gcm
|
||||||
bytesProductTable [256]byte
|
bytesProductTable [256]byte
|
||||||
@ -44,10 +71,10 @@ type gcmAsm struct {
|
|||||||
// called by crypto/cipher.NewGCM via the gcmAble interface.
|
// called by crypto/cipher.NewGCM via the gcmAble interface.
|
||||||
func (c *sm4CipherGCM) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
|
func (c *sm4CipherGCM) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
|
||||||
g := &gcmAsm{}
|
g := &gcmAsm{}
|
||||||
g.cipher = &c.sm4CipherAsm
|
g.cipher = c.sm4CipherAsm
|
||||||
g.nonceSize = nonceSize
|
g.nonceSize = nonceSize
|
||||||
g.tagSize = tagSize
|
g.tagSize = tagSize
|
||||||
gcmSm4Init(&g.bytesProductTable, g.cipher.enc)
|
gcmSm4Init(&g.bytesProductTable, g.cipher.enc, INST_AES)
|
||||||
return g, nil
|
return g, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -11,12 +11,12 @@ import (
|
|||||||
|
|
||||||
func genPrecomputeTable() *gcmAsm {
|
func genPrecomputeTable() *gcmAsm {
|
||||||
key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10}
|
key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10}
|
||||||
c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, 4, 64}
|
c := &sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, 4, 64}
|
||||||
expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0])
|
expandKey(key, c.enc, c.dec)
|
||||||
c1 := &sm4CipherGCM{c}
|
c1 := &sm4CipherGCM{c}
|
||||||
g := &gcmAsm{}
|
g := &gcmAsm{}
|
||||||
g.cipher = &c1.sm4CipherAsm
|
g.cipher = c1.sm4CipherAsm
|
||||||
gcmSm4Init(&g.bytesProductTable, g.cipher.enc)
|
gcmSm4InitInst(&g.bytesProductTable, g.cipher.enc)
|
||||||
return g
|
return g
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -145,13 +145,13 @@ func TestBothDataPlaintext(t *testing.T) {
|
|||||||
|
|
||||||
func createGcm() *gcmAsm {
|
func createGcm() *gcmAsm {
|
||||||
key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10}
|
key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10}
|
||||||
c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, 4, 64}
|
c := &sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, 4, 64}
|
||||||
expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0])
|
expandKey(key, c.enc, c.dec)
|
||||||
c1 := &sm4CipherGCM{c}
|
c1 := &sm4CipherGCM{c}
|
||||||
g := &gcmAsm{}
|
g := &gcmAsm{}
|
||||||
g.cipher = &c1.sm4CipherAsm
|
g.cipher = c1.sm4CipherAsm
|
||||||
g.tagSize = 16
|
g.tagSize = 16
|
||||||
gcmSm4Init(&g.bytesProductTable, g.cipher.enc)
|
gcmSm4InitInst(&g.bytesProductTable, g.cipher.enc)
|
||||||
return g
|
return g
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -214,7 +214,7 @@ func TestGcmSm4Enc(t *testing.T) {
|
|||||||
|
|
||||||
out2 := make([]byte, len(test.plaintext)+gcm.tagSize)
|
out2 := make([]byte, len(test.plaintext)+gcm.tagSize)
|
||||||
gcmSm4Data(&gcm.bytesProductTable, []byte("emmansun"), &tagOut2)
|
gcmSm4Data(&gcm.bytesProductTable, []byte("emmansun"), &tagOut2)
|
||||||
gcmSm4Enc(&gcm.bytesProductTable, out2, []byte(test.plaintext), &counter2, &tagOut2, gcm.cipher.enc)
|
gcmSm4EncInst(&gcm.bytesProductTable, out2, []byte(test.plaintext), &counter2, &tagOut2, gcm.cipher.enc)
|
||||||
if hex.EncodeToString(out1) != hex.EncodeToString(out2) {
|
if hex.EncodeToString(out1) != hex.EncodeToString(out2) {
|
||||||
t.Errorf("#%d: out expected %s, got %s", i, hex.EncodeToString(out1), hex.EncodeToString(out2))
|
t.Errorf("#%d: out expected %s, got %s", i, hex.EncodeToString(out1), hex.EncodeToString(out2))
|
||||||
}
|
}
|
||||||
@ -244,7 +244,7 @@ func TestGcmSm4Dec(t *testing.T) {
|
|||||||
|
|
||||||
out2 := make([]byte, len(test.plaintext)+gcm.tagSize)
|
out2 := make([]byte, len(test.plaintext)+gcm.tagSize)
|
||||||
gcmSm4Data(&gcm.bytesProductTable, []byte("emmansun"), &tagOut2)
|
gcmSm4Data(&gcm.bytesProductTable, []byte("emmansun"), &tagOut2)
|
||||||
gcmSm4Dec(&gcm.bytesProductTable, out2, out1, &counter2, &tagOut2, gcm.cipher.enc)
|
gcmSm4DecInst(&gcm.bytesProductTable, out2, out1, &counter2, &tagOut2, gcm.cipher.enc)
|
||||||
|
|
||||||
if hex.EncodeToString([]byte(test.plaintext)) != hex.EncodeToString(out2[:len(test.plaintext)]) {
|
if hex.EncodeToString([]byte(test.plaintext)) != hex.EncodeToString(out2[:len(test.plaintext)]) {
|
||||||
t.Errorf("#%d: out expected %s, got %s", i, hex.EncodeToString([]byte(test.plaintext)), hex.EncodeToString(out2[:len(test.plaintext)]))
|
t.Errorf("#%d: out expected %s, got %s", i, hex.EncodeToString([]byte(test.plaintext)), hex.EncodeToString(out2[:len(test.plaintext)]))
|
||||||
|
152
sm4/sm4ni_gcm_asm.go
Normal file
152
sm4/sm4ni_gcm_asm.go
Normal file
@ -0,0 +1,152 @@
|
|||||||
|
//go:build amd64 || arm64
|
||||||
|
// +build amd64 arm64
|
||||||
|
|
||||||
|
package sm4
|
||||||
|
|
||||||
|
import (
|
||||||
|
"crypto/cipher"
|
||||||
|
goSubtle "crypto/subtle"
|
||||||
|
|
||||||
|
"github.com/emmansun/gmsm/internal/subtle"
|
||||||
|
)
|
||||||
|
|
||||||
|
// gcmSm4niEnc is declared without a Go body. Seal calls it with the output
// buffer, the plaintext, the counter block, the running tag and the cipher's
// enc round keys.
// NOTE(review): presumably implemented in assembly using the SM4
// instructions, updating ctr and T in place; confirm against the .s file.
//
//go:noescape
func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)

// gcmSm4niDec is declared without a Go body. Open calls it with the output
// buffer, the ciphertext (tag removed), the counter block, the running tag
// and the cipher's enc round keys.
// NOTE(review): presumably the decrypting counterpart of gcmSm4niEnc;
// confirm against the .s file.
//
//go:noescape
func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
|
||||||
|
|
||||||
|
// sm4CipherNIGCM implements crypto/cipher.gcmAble so that crypto/cipher.NewGCM
|
||||||
|
// will use the optimised implementation in this file when possible. Instances
|
||||||
|
// of this type only exist when hasGCMAsm and hasSM4 returns true.
|
||||||
|
type sm4CipherNIGCM struct {
|
||||||
|
*sm4CipherNI
|
||||||
|
}
|
||||||
|
|
||||||
|
// Assert that sm4CipherNIGCM implements the gcmAble interface.
|
||||||
|
var _ gcmAble = (*sm4CipherNIGCM)(nil)
|
||||||
|
|
||||||
|
type gcmNI struct {
|
||||||
|
cipher *sm4CipherNI
|
||||||
|
nonceSize int
|
||||||
|
tagSize int
|
||||||
|
bytesProductTable [256]byte
|
||||||
|
}
|
||||||
|
|
||||||
|
func (g *gcmNI) NonceSize() int {
|
||||||
|
return g.nonceSize
|
||||||
|
}
|
||||||
|
|
||||||
|
func (g *gcmNI) Overhead() int {
|
||||||
|
return g.tagSize
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewGCM returns the SM4 cipher wrapped in Galois Counter Mode. This is only
|
||||||
|
// called by crypto/cipher.NewGCM via the gcmAble interface.
|
||||||
|
func (c *sm4CipherNIGCM) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
|
||||||
|
g := &gcmNI{}
|
||||||
|
g.cipher = c.sm4CipherNI
|
||||||
|
g.nonceSize = nonceSize
|
||||||
|
g.tagSize = tagSize
|
||||||
|
gcmSm4Init(&g.bytesProductTable, g.cipher.enc, INST_SM4)
|
||||||
|
return g, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Seal encrypts and authenticates plaintext. See the cipher.AEAD interface for
|
||||||
|
// details.
|
||||||
|
func (g *gcmNI) Seal(dst, nonce, plaintext, data []byte) []byte {
|
||||||
|
if len(nonce) != g.nonceSize {
|
||||||
|
panic("cipher: incorrect nonce length given to GCM")
|
||||||
|
}
|
||||||
|
if uint64(len(plaintext)) > ((1<<32)-2)*BlockSize {
|
||||||
|
panic("cipher: message too large for GCM")
|
||||||
|
}
|
||||||
|
|
||||||
|
var counter, tagMask [gcmBlockSize]byte
|
||||||
|
|
||||||
|
if len(nonce) == gcmStandardNonceSize {
|
||||||
|
// Init counter to nonce||1
|
||||||
|
copy(counter[:], nonce)
|
||||||
|
counter[gcmBlockSize-1] = 1
|
||||||
|
} else {
|
||||||
|
// Otherwise counter = GHASH(nonce)
|
||||||
|
gcmSm4Data(&g.bytesProductTable, nonce, &counter)
|
||||||
|
gcmSm4Finish(&g.bytesProductTable, &tagMask, &counter, uint64(len(nonce)), uint64(0))
|
||||||
|
}
|
||||||
|
|
||||||
|
g.cipher.Encrypt(tagMask[:], counter[:])
|
||||||
|
|
||||||
|
var tagOut [gcmTagSize]byte
|
||||||
|
gcmSm4Data(&g.bytesProductTable, data, &tagOut)
|
||||||
|
|
||||||
|
ret, out := subtle.SliceForAppend(dst, len(plaintext)+g.tagSize)
|
||||||
|
if subtle.InexactOverlap(out[:len(plaintext)], plaintext) {
|
||||||
|
panic("cipher: invalid buffer overlap")
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(plaintext) > 0 {
|
||||||
|
gcmSm4niEnc(&g.bytesProductTable, out, plaintext, &counter, &tagOut, g.cipher.enc)
|
||||||
|
}
|
||||||
|
gcmSm4Finish(&g.bytesProductTable, &tagMask, &tagOut, uint64(len(plaintext)), uint64(len(data)))
|
||||||
|
copy(out[len(plaintext):], tagOut[:])
|
||||||
|
|
||||||
|
return ret
|
||||||
|
}
|
||||||
|
|
||||||
|
// Open authenticates and decrypts ciphertext. See the cipher.AEAD interface
|
||||||
|
// for details.
|
||||||
|
func (g *gcmNI) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) {
|
||||||
|
if len(nonce) != g.nonceSize {
|
||||||
|
panic("cipher: incorrect nonce length given to GCM")
|
||||||
|
}
|
||||||
|
// Sanity check to prevent the authentication from always succeeding if an implementation
|
||||||
|
// leaves tagSize uninitialized, for example.
|
||||||
|
if g.tagSize < gcmMinimumTagSize {
|
||||||
|
panic("cipher: incorrect GCM tag size")
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(ciphertext) < g.tagSize {
|
||||||
|
return nil, errOpen
|
||||||
|
}
|
||||||
|
if uint64(len(ciphertext)) > ((1<<32)-2)*uint64(BlockSize)+uint64(g.tagSize) {
|
||||||
|
return nil, errOpen
|
||||||
|
}
|
||||||
|
|
||||||
|
tag := ciphertext[len(ciphertext)-g.tagSize:]
|
||||||
|
ciphertext = ciphertext[:len(ciphertext)-g.tagSize]
|
||||||
|
|
||||||
|
// See GCM spec, section 7.1.
|
||||||
|
var counter, tagMask [gcmBlockSize]byte
|
||||||
|
|
||||||
|
if len(nonce) == gcmStandardNonceSize {
|
||||||
|
// Init counter to nonce||1
|
||||||
|
copy(counter[:], nonce)
|
||||||
|
counter[gcmBlockSize-1] = 1
|
||||||
|
} else {
|
||||||
|
// Otherwise counter = GHASH(nonce)
|
||||||
|
gcmSm4Data(&g.bytesProductTable, nonce, &counter)
|
||||||
|
gcmSm4Finish(&g.bytesProductTable, &tagMask, &counter, uint64(len(nonce)), uint64(0))
|
||||||
|
}
|
||||||
|
|
||||||
|
g.cipher.Encrypt(tagMask[:], counter[:])
|
||||||
|
|
||||||
|
var expectedTag [gcmTagSize]byte
|
||||||
|
gcmSm4Data(&g.bytesProductTable, data, &expectedTag)
|
||||||
|
|
||||||
|
ret, out := subtle.SliceForAppend(dst, len(ciphertext))
|
||||||
|
if subtle.InexactOverlap(out, ciphertext) {
|
||||||
|
panic("cipher: invalid buffer overlap")
|
||||||
|
}
|
||||||
|
if len(ciphertext) > 0 {
|
||||||
|
gcmSm4niDec(&g.bytesProductTable, out, ciphertext, &counter, &expectedTag, g.cipher.enc)
|
||||||
|
}
|
||||||
|
gcmSm4Finish(&g.bytesProductTable, &tagMask, &expectedTag, uint64(len(ciphertext)), uint64(len(data)))
|
||||||
|
|
||||||
|
if goSubtle.ConstantTimeCompare(expectedTag[:g.tagSize], tag) != 1 {
|
||||||
|
for i := range out {
|
||||||
|
out[i] = 0
|
||||||
|
}
|
||||||
|
return nil, errOpen
|
||||||
|
}
|
||||||
|
return ret, nil
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user