Merge pull request #52 from emmansun/sm3_sm4_ni

Sm3 sm4 ni
commit 9a2d7123f8 by Sun Yimin, 2022-05-01 16:19:35 +08:00 (committed by GitHub)
15 changed files with 1533 additions and 70 deletions

.github/workflows/sm3_sm4_ni.ci.yml (new file)

@@ -0,0 +1,39 @@
name: ci
on:
push:
branches: [ sm3_sm4_ni ]
pull_request:
branches: [ sm3_sm4_ni ]
jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
goVer: ['1.15', '1.16', '1.17']
steps:
- name: Checkout Repo
uses: actions/checkout@v2
- name: Set up Go
uses: actions/setup-go@v2
with:
go-version: ${{ matrix.goVer }}
- name: Setup Environment
run: |
echo "GOPATH=$(go env GOPATH)" >> $GITHUB_ENV
echo "$(go env GOPATH)/bin" >> $GITHUB_PATH
- name: Module cache
uses: actions/cache@v2.1.7
env:
cache-name: go-mod-cache
with:
path: ~/go/pkg/mod
key: ${{ runner.os }}-${{ env.cache-name }}-${{ hashFiles('**/go.sum') }}
- name: Test
run: go test -v ./...


@@ -58,18 +58,18 @@ func sm3tt2b(Vd, Vn, Vm, imm2 byte) uint32 {
// Used v5 as temp register
func roundA(buf *bytes.Buffer, i, t, st1, st2, w, wt byte) {
-fmt.Fprintf(buf, "\tWORD 0x%08x //SM3SS1 V%d.4S, V%d.4S, V%d.4S, V%d.4S\n", sm3ss1(5, st1, t, st2), 5, st1, t, st2)
+fmt.Fprintf(buf, "\tWORD $0x%08x //SM3SS1 V%d.4S, V%d.4S, V%d.4S, V%d.4S\n", sm3ss1(5, st1, t, st2), 5, st1, t, st2)
fmt.Fprintf(buf, "\tVSHL $1, V%d.S4, V%d.S4\n", t, t)
-fmt.Fprintf(buf, "\tWORD 0x%08x //SM3TT1A V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt1a(st1, 5, wt, i), st1, 5, wt, i)
-fmt.Fprintf(buf, "\tWORD 0x%08x //SM3TT2A V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt2a(st2, 5, w, i), st2, 5, w, i)
+fmt.Fprintf(buf, "\tWORD $0x%08x //SM3TT1A V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt1a(st1, 5, wt, i), st1, 5, wt, i)
+fmt.Fprintf(buf, "\tWORD $0x%08x //SM3TT2A V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt2a(st2, 5, w, i), st2, 5, w, i)
}
// Used v5 as temp register
func roundB(buf *bytes.Buffer, i, t, st1, st2, w, wt byte) {
fmt.Fprintf(buf, "\tWORD 0x%08x //SM3SS1 V%d.4S, V%d.4S, V%d.4S, V%d.4S\n", sm3ss1(5, st1, t, st2), 5, st1, t, st2)
fmt.Fprintf(buf, "\tWORD $0x%08x //SM3SS1 V%d.4S, V%d.4S, V%d.4S, V%d.4S\n", sm3ss1(5, st1, t, st2), 5, st1, t, st2)
fmt.Fprintf(buf, "\tVSHL $1, V%d.S4, V%d.S4\n", t, t)
fmt.Fprintf(buf, "\tWORD 0x%08x //SM3TT1B V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt1b(st1, 5, wt, i), st1, 5, wt, i)
fmt.Fprintf(buf, "\tWORD 0x%08x //SM3TT2B V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt2b(st2, 5, w, i), st2, 5, w, i)
fmt.Fprintf(buf, "\tWORD $0x%08x //SM3TT1B V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt1b(st1, 5, wt, i), st1, 5, wt, i)
fmt.Fprintf(buf, "\tWORD $0x%08x //SM3TT2B V%dd.4S, V%d.4S, V%d.S, %d\n", sm3tt2b(st2, 5, w, i), st2, 5, w, i)
}
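For reference, the encoder helpers called above (sm3ss1, sm3tt1a, and so on) are outside this hunk; they follow the same pattern as the SM4 encoders added later in this PR: assemble the Armv8.2 SM3 instruction word, then byte-swap it so the assembler's little-endian WORD directive emits it correctly. A sketch of sm3ss1 under that assumption (it reproduces the 0x05254bce-style constants in the generated file below; assumes import "math/bits"):

	// SM3SS1 <Vd>.4S, <Vn>.4S, <Vm>.4S, <Va>.4S
	// Base opcode 0xce400000 with Vm at bit 16, Va at bit 10, Vn at bit 5, Vd at bit 0;
	// bits.ReverseBytes32 yields the little-endian form the WORD directive expects.
	func sm3ss1(Vd, Vn, Vm, Va byte) uint32 {
		inst := uint32(0xce400000) | uint32(Vm&0x1f)<<16 | uint32(Va&0x1f)<<10 |
			uint32(Vn&0x1f)<<5 | uint32(Vd&0x1f)
		return bits.ReverseBytes32(inst)
	}

For example, sm3ss1(5, 8, 11, 9) yields 0x05254bce, matching the WORD for SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S in sm3blockni_arm64.s.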
// Compress 4 words and generate 4 words, use v6, v7, v10 as temp registers
@@ -82,12 +82,12 @@ func roundB(buf *bytes.Buffer, i, t, st1, st2, w, wt byte) {
// st1, st2, sm3 state
func qroundA(buf *bytes.Buffer, t, st1, st2, s0, s1, s2, s3, s4 byte) {
fmt.Fprintf(buf, "\t// Extension\n")
-fmt.Fprintf(buf, "\tVEXT 3, V%d, V%d, V%d\n", s2, s1, s4)
-fmt.Fprintf(buf, "\tVEXT 3, V%d, V%d, V%d\n", s1, s0, 6)
-fmt.Fprintf(buf, "\tVEXT 2, V%d, V%d, V%d\n", s3, s2, 7)
-fmt.Fprintf(buf, "\tWORD 0x%08x //SM3PARTW1 V%d.4S, V%d.4S, V%d.4S\n", sm3partw1(s4, s0, s3), s4, s0, s3)
-fmt.Fprintf(buf, "\tWORD 0x%08x //SM3PARTW2 V%d.4S, V%d.4S, V%d.4S\n", sm3partw2(s4, 7, 6), s4, 7, 6)
-fmt.Fprintf(buf, "\tVEOR V%d, V%d, V10\n", s1, s0)
+fmt.Fprintf(buf, "\tVEXT $3, V%d.B16, V%d.B16, V%d.B16\n", s2, s1, s4)
+fmt.Fprintf(buf, "\tVEXT $3, V%d.B16, V%d.B16, V%d.B16\n", s1, s0, 6)
+fmt.Fprintf(buf, "\tVEXT $2, V%d.B16, V%d.B16, V%d.B16\n", s3, s2, 7)
+fmt.Fprintf(buf, "\tWORD $0x%08x //SM3PARTW1 V%d.4S, V%d.4S, V%d.4S\n", sm3partw1(s4, s0, s3), s4, s0, s3)
+fmt.Fprintf(buf, "\tWORD $0x%08x //SM3PARTW2 V%d.4S, V%d.4S, V%d.4S\n", sm3partw2(s4, 7, 6), s4, 7, 6)
+fmt.Fprintf(buf, "\tVEOR V%d.B16, V%d.B16, V10.B16\n", s1, s0)
fmt.Fprintf(buf, "\t// Compression\n")
roundA(buf, 0, t, st1, st2, s0, 10)
roundA(buf, 1, t, st1, st2, s0, 10)
@@ -100,13 +100,13 @@ func qroundA(buf *bytes.Buffer, t, st1, st2, s0, s1, s2, s3, s4 byte) {
func qroundB(buf *bytes.Buffer, t, st1, st2, s0, s1, s2, s3, s4 byte) {
if s4 != 0xff {
fmt.Fprintf(buf, "\t// Extension\n")
-fmt.Fprintf(buf, "\tVEXT 3, V%d, V%d, V%d\n", s2, s1, s4)
-fmt.Fprintf(buf, "\tVEXT 3, V%d, V%d, V%d\n", s1, s0, 6)
-fmt.Fprintf(buf, "\tVEXT 2, V%d, V%d, V%d\n", s3, s2, 7)
-fmt.Fprintf(buf, "\tWORD 0x%08x //SM3PARTW1 V%d.4S, V%d.4S, V%d.4S\n", sm3partw1(s4, s0, s3), s4, s0, s3)
-fmt.Fprintf(buf, "\tWORD 0x%08x //SM3PARTW2 V%d.4S, V%d.4S, V%d.4S\n", sm3partw2(s4, 7, 6), s4, 7, 6)
+fmt.Fprintf(buf, "\tVEXT $3, V%d.B16, V%d.B16, V%d.B16\n", s2, s1, s4)
+fmt.Fprintf(buf, "\tVEXT $3, V%d.B16, V%d.B16, V%d.B16\n", s1, s0, 6)
+fmt.Fprintf(buf, "\tVEXT $2, V%d.B16, V%d.B16, V%d.B16\n", s3, s2, 7)
+fmt.Fprintf(buf, "\tWORD $0x%08x //SM3PARTW1 V%d.4S, V%d.4S, V%d.4S\n", sm3partw1(s4, s0, s3), s4, s0, s3)
+fmt.Fprintf(buf, "\tWORD $0x%08x //SM3PARTW2 V%d.4S, V%d.4S, V%d.4S\n", sm3partw2(s4, 7, 6), s4, 7, 6)
}
fmt.Fprintf(buf, "\tVEOR V%d, V%d, V10\n", s1, s0)
fmt.Fprintf(buf, "\tVEOR V%d.B16, V%d.B16, V10.B16\n", s1, s0)
fmt.Fprintf(buf, "\t// Compression\n")
roundB(buf, 0, t, st1, st2, s0, 10)
roundB(buf, 1, t, st1, st2, s0, 10)
@@ -165,8 +165,8 @@ blockloop:
fmt.Fprint(buf, `
SUB $64, R3, R3 // message length - 64bytes, then compare with 64bytes
-VEOR V8.S4, V15.S4, V8.S4
-VEOR V9.S4, V16.S4, V9.S4
+VEOR V8.B16, V15.B16, V8.B16
+VEOR V9.B16, V16.B16, V9.B16
CBNZ R3, blockloop
sm3ret:


@@ -19,10 +19,10 @@ func blockARM64(dig *digest, p []byte)
func blockSM3NI(h []uint32, p []byte, t []uint32)
func block(dig *digest, p []byte) {
-//if !useSM3NI {
+if !useSM3NI {
blockARM64(dig, p)
-//} else {
-// h := dig.h[:]
-// blockSM3NI(h, p, t)
-//}
+} else {
+h := dig.h[:]
+blockSM3NI(h, p, t)
+}
}
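Note that useSM3NI itself is not part of this hunk; presumably it is a CPU feature gate along these lines (an assumption, mirroring how supportsAES is declared in the sm4 package via golang.org/x/sys/cpu):

	// hypothetical declaration; the real flag may live in another sm3 arm64 file
	import "golang.org/x/sys/cpu"

	var useSM3NI = cpu.ARM64.HasSM3 // true only when the CPU exposes the SM3 instructions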

sm3/sm3blockni_arm64.s (new file)

@@ -0,0 +1,416 @@
// Generated by gen_sm3block_ni.go. DO NOT EDIT.
#include "textflag.h"
// func blockSM3NI(h []uint32, p []byte, t []uint32)
TEXT ·blockSM3NI(SB), 0, $0
MOVD h_base+0(FP), R0 // Hash value first address
MOVD p_base+24(FP), R1 // message first address
MOVD p_len+32(FP), R3 // message length
MOVD t_base+48(FP), R2 // t constants first address
VLD1 (R0), [V8.S4, V9.S4] // load h(a,b,c,d,e,f,g,h)
LDPW (0*8)(R2), (R5, R6) // load t constants
blockloop:
VLD1.P 64(R1), [V0.B16, V1.B16, V2.B16, V3.B16] // load 64bytes message
VMOV V8.B16, V15.B16 // backup: V8 h(dcba)
VMOV V9.B16, V16.B16 // backup: V9 h(hgfe)
VREV32 V0.B16, V0.B16 // prepare for using message in Byte format
VREV32 V1.B16, V1.B16
VREV32 V2.B16, V2.B16
VREV32 V3.B16, V3.B16
// first 16 rounds
VMOV R5, V11.S[3]
// Extension
VEXT $3, V2.B16, V1.B16, V4.B16
VEXT $3, V1.B16, V0.B16, V6.B16
VEXT $2, V3.B16, V2.B16, V7.B16
WORD $0x04c063ce //SM3PARTW1 V4.4S, V0.4S, V3.4S
WORD $0xe4c466ce //SM3PARTW2 V4.4S, V7.4S, V6.4S
VEOR V1.B16, V0.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0
WORD $0xa98840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1
WORD $0xa99840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2
WORD $0xa9a840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3
WORD $0xa9b840ce //SM3TT2A V9d.4S, V5.4S, V0.S, 3
// Extension
VEXT $3, V3.B16, V2.B16, V0.B16
VEXT $3, V2.B16, V1.B16, V6.B16
VEXT $2, V4.B16, V3.B16, V7.B16
WORD $0x20c064ce //SM3PARTW1 V0.4S, V1.4S, V4.4S
WORD $0xe0c466ce //SM3PARTW2 V0.4S, V7.4S, V6.4S
VEOR V2.B16, V1.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0
WORD $0xa98841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1
WORD $0xa99841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2
WORD $0xa9a841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3
WORD $0xa9b841ce //SM3TT2A V9d.4S, V5.4S, V1.S, 3
// Extension
VEXT $3, V4.B16, V3.B16, V1.B16
VEXT $3, V3.B16, V2.B16, V6.B16
VEXT $2, V0.B16, V4.B16, V7.B16
WORD $0x41c060ce //SM3PARTW1 V1.4S, V2.4S, V0.4S
WORD $0xe1c466ce //SM3PARTW2 V1.4S, V7.4S, V6.4S
VEOR V3.B16, V2.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0
WORD $0xa98842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1
WORD $0xa99842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2
WORD $0xa9a842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3
WORD $0xa9b842ce //SM3TT2A V9d.4S, V5.4S, V2.S, 3
// Extension
VEXT $3, V0.B16, V4.B16, V2.B16
VEXT $3, V4.B16, V3.B16, V6.B16
VEXT $2, V1.B16, V0.B16, V7.B16
WORD $0x62c061ce //SM3PARTW1 V2.4S, V3.4S, V1.4S
WORD $0xe2c466ce //SM3PARTW2 V2.4S, V7.4S, V6.4S
VEOR V4.B16, V3.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8804ace //SM3TT1A V8d.4S, V5.4S, V10.S, 0
WORD $0xa98843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8904ace //SM3TT1A V8d.4S, V5.4S, V10.S, 1
WORD $0xa99843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 2
WORD $0xa9a843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b04ace //SM3TT1A V8d.4S, V5.4S, V10.S, 3
WORD $0xa9b843ce //SM3TT2A V9d.4S, V5.4S, V3.S, 3
// second 48 rounds
VMOV R6, V11.S[3]
// Extension
VEXT $3, V1.B16, V0.B16, V3.B16
VEXT $3, V0.B16, V4.B16, V6.B16
VEXT $2, V2.B16, V1.B16, V7.B16
WORD $0x83c062ce //SM3PARTW1 V3.4S, V4.4S, V2.4S
WORD $0xe3c466ce //SM3PARTW2 V3.4S, V7.4S, V6.4S
VEOR V0.B16, V4.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 3
// Extension
VEXT $3, V2.B16, V1.B16, V4.B16
VEXT $3, V1.B16, V0.B16, V6.B16
VEXT $2, V3.B16, V2.B16, V7.B16
WORD $0x04c063ce //SM3PARTW1 V4.4S, V0.4S, V3.4S
WORD $0xe4c466ce //SM3PARTW2 V4.4S, V7.4S, V6.4S
VEOR V1.B16, V0.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 3
// Extension
VEXT $3, V3.B16, V2.B16, V0.B16
VEXT $3, V2.B16, V1.B16, V6.B16
VEXT $2, V4.B16, V3.B16, V7.B16
WORD $0x20c064ce //SM3PARTW1 V0.4S, V1.4S, V4.4S
WORD $0xe0c466ce //SM3PARTW2 V0.4S, V7.4S, V6.4S
VEOR V2.B16, V1.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 3
// Extension
VEXT $3, V4.B16, V3.B16, V1.B16
VEXT $3, V3.B16, V2.B16, V6.B16
VEXT $2, V0.B16, V4.B16, V7.B16
WORD $0x41c060ce //SM3PARTW1 V1.4S, V2.4S, V0.4S
WORD $0xe1c466ce //SM3PARTW2 V1.4S, V7.4S, V6.4S
VEOR V3.B16, V2.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 3
// Extension
VEXT $3, V0.B16, V4.B16, V2.B16
VEXT $3, V4.B16, V3.B16, V6.B16
VEXT $2, V1.B16, V0.B16, V7.B16
WORD $0x62c061ce //SM3PARTW1 V2.4S, V3.4S, V1.4S
WORD $0xe2c466ce //SM3PARTW2 V2.4S, V7.4S, V6.4S
VEOR V4.B16, V3.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 3
// Extension
VEXT $3, V1.B16, V0.B16, V3.B16
VEXT $3, V0.B16, V4.B16, V6.B16
VEXT $2, V2.B16, V1.B16, V7.B16
WORD $0x83c062ce //SM3PARTW1 V3.4S, V4.4S, V2.4S
WORD $0xe3c466ce //SM3PARTW2 V3.4S, V7.4S, V6.4S
VEOR V0.B16, V4.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 3
// Extension
VEXT $3, V2.B16, V1.B16, V4.B16
VEXT $3, V1.B16, V0.B16, V6.B16
VEXT $2, V3.B16, V2.B16, V7.B16
WORD $0x04c063ce //SM3PARTW1 V4.4S, V0.4S, V3.4S
WORD $0xe4c466ce //SM3PARTW2 V4.4S, V7.4S, V6.4S
VEOR V1.B16, V0.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 3
// Extension
VEXT $3, V3.B16, V2.B16, V0.B16
VEXT $3, V2.B16, V1.B16, V6.B16
VEXT $2, V4.B16, V3.B16, V7.B16
WORD $0x20c064ce //SM3PARTW1 V0.4S, V1.4S, V4.4S
WORD $0xe0c466ce //SM3PARTW2 V0.4S, V7.4S, V6.4S
VEOR V2.B16, V1.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc41ce //SM3TT2B V9d.4S, V5.4S, V1.S, 3
// Extension
VEXT $3, V4.B16, V3.B16, V1.B16
VEXT $3, V3.B16, V2.B16, V6.B16
VEXT $2, V0.B16, V4.B16, V7.B16
WORD $0x41c060ce //SM3PARTW1 V1.4S, V2.4S, V0.4S
WORD $0xe1c466ce //SM3PARTW2 V1.4S, V7.4S, V6.4S
VEOR V3.B16, V2.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc42ce //SM3TT2B V9d.4S, V5.4S, V2.S, 3
VEOR V4.B16, V3.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc43ce //SM3TT2B V9d.4S, V5.4S, V3.S, 3
VEOR V0.B16, V4.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc44ce //SM3TT2B V9d.4S, V5.4S, V4.S, 3
VEOR V1.B16, V0.B16, V10.B16
// Compression
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8844ace //SM3TT1B V8d.4S, V5.4S, V10.S, 0
WORD $0xa98c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 0
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8944ace //SM3TT1B V8d.4S, V5.4S, V10.S, 1
WORD $0xa99c40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 1
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8a44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 2
WORD $0xa9ac40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 2
WORD $0x05254bce //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
VSHL $1, V11.S4, V11.S4
WORD $0xa8b44ace //SM3TT1B V8d.4S, V5.4S, V10.S, 3
WORD $0xa9bc40ce //SM3TT2B V9d.4S, V5.4S, V0.S, 3
SUB $64, R3, R3 // message length - 64bytes, then compare with 64bytes
VEOR V8.B16, V15.B16, V8.B16
VEOR V9.B16, V16.B16, V9.B16
CBNZ R3, blockloop
sm3ret:
VST1 [V8.S4, V9.S4], (R0) // store hash value H
RET
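The two VEORs before CBNZ implement SM3's feed-forward: unlike SHA-2, which adds the saved state back, SM3 defines V(i+1) = CF(V(i), B(i)) XOR V(i), so the compressed state in V8/V9 is XORed with the copy saved in V15/V16 at loop entry. In Go terms, roughly:

	// conceptual feed-forward per 64-byte block (a sketch, not code from this PR);
	// h holds the compressed state, saved the state captured at block entry
	for i := range h {
		h[i] ^= saved[i]
	}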


@@ -290,7 +290,7 @@ GLOBL fk_mask<>(SB), RODATA, $16
AVX_SM4_TAO_L1(x, y); \
VPXOR x, t0, t0
-// func expandKeyAsm(key *byte, ck, enc, dec *uint32)
+// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
MOVQ key+0(FP), AX
MOVQ ck+8(FP), BX
@@ -321,7 +321,7 @@ loop:
expand_end:
RET
-// func encryptBlocksAsm(xk *uint32, dst, src []byte)
+// func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
MOVQ xk+0(FP), AX
MOVQ dst+8(FP), BX
@@ -497,7 +497,7 @@ avx2_sm4_done:
VZEROUPPER
RET
-// func encryptBlockAsm(xk *uint32, dst, src *byte)
+// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
MOVQ xk+0(FP), AX
MOVQ dst+8(FP), BX


@@ -164,13 +164,44 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
VMOV R0, R24_MASK.D[0] \
VMOV R1, R24_MASK.D[1]
-// func expandKeyAsm(key *byte, ck, enc, dec *uint32)
#define SM4EKEY_EXPORT_KEYS() \
VMOV V9.S[3], V10.S[0] \
VMOV V9.S[2], V10.S[1] \
VMOV V9.S[1], V10.S[2] \
VMOV V9.S[0], V10.S[3] \
VMOV V8.S[3], V11.S[0] \
VMOV V8.S[2], V11.S[1] \
VMOV V8.S[1], V11.S[2] \
VMOV V8.S[0], V11.S[3] \
VST1.P [V8.S4, V9.S4], 32(R10) \
VST1 [V10.S4, V11.S4], (R11) \
SUB $32, R11, R11
#define SM4E_ROUND() \
VLD1.P 16(R10), [V8.B16] \
VREV32 V8.B16, V8.B16 \
WORD $0x0884c0ce \
WORD $0x2884c0ce \
WORD $0x4884c0ce \
WORD $0x6884c0ce \
WORD $0x8884c0ce \
WORD $0xa884c0ce \
WORD $0xc884c0ce \
WORD $0xe884c0ce \
VREV32 V8.B16, V8.B16 \
VST1.P [V8.B16], 16(R9)
+// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
MOVD key+0(FP), R8
MOVD ck+8(FP), R9
MOVD enc+16(FP), R10
MOVD dec+24(FP), R11
MOVD inst+32(FP), R12
CMP $1, R12
BEQ sm4ekey
load_global_data_1()
VLD1 (R8), [t0.B16]
@@ -193,14 +224,46 @@ ksLoop:
ADD $16, R0
CMP $128, R0
BNE ksLoop
RET
-// func encryptBlocksAsm(xk *uint32, dst, src []byte)
sm4ekey:
LDP fk_mask<>(SB), (R0, R1)
VMOV R0, FK_MASK.D[0]
VMOV R1, FK_MASK.D[1]
VLD1 (R8), [V9.B16]
VREV32 V9.B16, V9.B16
VEOR FK_MASK.B16, V9.B16, V9.B16
ADD $96, R11
VLD1.P 64(R9), [V0.S4, V1.S4, V2.S4, V3.S4]
WORD $0x28c960ce //SM4EKEY V8.4S, V9.4S, V0.4S
WORD $0x09c961ce //SM4EKEY V9.4S, V8.4S, V1.4S
SM4EKEY_EXPORT_KEYS()
WORD $0x28c962ce //SM4EKEY V8.4S, V9.4S, V2.4S
WORD $0x09c963ce //SM4EKEY V9.4S, V8.4S, V3.4S
SM4EKEY_EXPORT_KEYS()
VLD1.P 64(R9), [V0.S4, V1.S4, V2.S4, V3.S4]
WORD $0x28c960ce //SM4EKEY V8.4S, V9.4S, V0.4S
WORD $0x09c961ce //SM4EKEY V9.4S, V8.4S, V1.4S
SM4EKEY_EXPORT_KEYS()
WORD $0x28c962ce //SM4EKEY V8.4S, V9.4S, V2.4S
WORD $0x09c963ce //SM4EKEY V9.4S, V8.4S, V3.4S
SM4EKEY_EXPORT_KEYS()
RET
+// func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
MOVD xk+0(FP), R8
MOVD dst+8(FP), R9
MOVD src+32(FP), R10
MOVD src_len+40(FP), R12
MOVD inst+56(FP), R11
CMP $1, R11
BEQ sm4niblocks
VLD1 (R10), [V5.S4, V6.S4, V7.S4, V8.S4]
VMOV V5.S[0], t0.S[0]
@@ -271,15 +334,26 @@ encryptBlocksLoop:
VMOV t1.S[3], V8.S[2]
VMOV t0.S[3], V8.S[3]
VST1 [V8.B16], (R9)
RET
sm4niblocks:
VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
VLD1.P 64(R8), [V4.S4, V5.S4, V6.S4, V7.S4]
sm4niblockloop:
SM4E_ROUND()
SUB $16, R12, R12 // message length - 16bytes, then compare with 16bytes
CBNZ R12, sm4niblockloop
RET
-// func encryptBlockAsm(xk *uint32, dst, src *byte)
+// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
MOVD xk+0(FP), R8
MOVD dst+8(FP), R9
MOVD src+16(FP), R10
MOVD inst+24(FP), R11
CMP $1, R11
BEQ sm4niblock
VLD1 (R10), [t0.S4]
VREV32 t0.B16, t0.B16
@@ -312,5 +386,21 @@ encryptBlockLoop:
VMOV t1.S[0], V8.S[2]
VMOV t0.S[0], V8.S[3]
VST1 [V8.B16], (R9)
RET
sm4niblock:
VLD1 (R10), [V8.B16]
VREV32 V8.B16, V8.B16
VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
WORD $0x0884c0ce //SM4E V8.4S, V0.4S
WORD $0x2884c0ce //SM4E V8.4S, V1.4S
WORD $0x4884c0ce //SM4E V8.4S, V2.4S
WORD $0x6884c0ce //SM4E V8.4S, V3.4S
VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
WORD $0x0884c0ce //SM4E V8.4S, V0.4S
WORD $0x2884c0ce //SM4E V8.4S, V1.4S
WORD $0x4884c0ce //SM4E V8.4S, V2.4S
WORD $0x6884c0ce //SM4E V8.4S, V3.4S
VREV32 V8.B16, V8.B16
VST1 [V8.B16], (R9)
RET
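A note on SM4EKEY_EXPORT_KEYS above: it stores each fresh pair of key vectors forward through R10 (enc) and word-reversed backward through R11 (dec), because SM4 decryption is the same transform with the 32 round keys consumed in reverse order. Conceptually:

	// sketch of the enc/dec key relationship SM4EKEY_EXPORT_KEYS materializes
	for i := 0; i < 32; i++ {
		dec[i] = enc[31-i]
	}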


@@ -15,14 +15,19 @@ var supportsAES = cpu.X86.HasAES || cpu.ARM64.HasAES
var supportsGFMUL = cpu.X86.HasPCLMULQDQ || cpu.ARM64.HasPMULL
var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
//go:noescape
-func encryptBlocksAsm(xk *uint32, dst, src []byte)
+const (
+INST_AES int = iota
+INST_SM4
+)
//go:noescape
-func encryptBlockAsm(xk *uint32, dst, src *byte)
+func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
//go:noescape
-func expandKeyAsm(key *byte, ck, enc, dec *uint32)
+func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
+//go:noescape
+func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
type sm4CipherAsm struct {
sm4Cipher
@@ -30,24 +35,66 @@
blocksSize int
}
type sm4CipherNI struct {
sm4Cipher
}
func newCipherNI(key []byte) (cipher.Block, error) {
c := &sm4CipherNI{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}}
expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0], INST_SM4)
if supportsGFMUL {
return &sm4CipherNIGCM{c}, nil
}
return c, nil
}
func (c *sm4CipherNI) Encrypt(dst, src []byte) {
if len(src) < BlockSize {
panic("sm4: input not full block")
}
if len(dst) < BlockSize {
panic("sm4: output not full block")
}
if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
panic("sm4: invalid buffer overlap")
}
encryptBlockAsm(&c.enc[0], &dst[0], &src[0], INST_SM4)
}
func (c *sm4CipherNI) Decrypt(dst, src []byte) {
if len(src) < BlockSize {
panic("sm4: input not full block")
}
if len(dst) < BlockSize {
panic("sm4: output not full block")
}
if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
panic("sm4: invalid buffer overlap")
}
encryptBlockAsm(&c.dec[0], &dst[0], &src[0], INST_SM4)
}
func newCipher(key []byte) (cipher.Block, error) {
if supportSM4 {
return newCipherNI(key)
}
if !supportsAES {
return newCipherGeneric(key)
}
blocks := 4
if useAVX2 {
blocks = 8
}
-c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, blocks, blocks * BlockSize}
-expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0])
-if supportsAES && supportsGFMUL {
+c := &sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, blocks, blocks * BlockSize}
+expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0], INST_AES)
+if supportsGFMUL {
return &sm4CipherGCM{c}, nil
}
-return &c, nil
+return c, nil
}
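newCipher now selects among three paths: the SM4-NI cipher when supportSM4 is set, the AES-NI-based assembly otherwise, and the generic Go implementation as a last resort. A minimal caller's view, assuming the package's exported NewCipher wraps newCipher as elsewhere in gmsm:

	package main

	import (
		"fmt"

		"github.com/emmansun/gmsm/sm4"
	)

	func main() {
		key := []byte("0123456789abcdef") // SM4 uses a 16-byte key
		block, err := sm4.NewCipher(key)  // dispatches to NI, AES-NI, or generic
		if err != nil {
			panic(err)
		}
		dst := make([]byte, sm4.BlockSize)
		block.Encrypt(dst, []byte("exampleplaintext")) // exactly one 16-byte block
		fmt.Printf("%x\n", dst)
	}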
func (c *sm4CipherAsm) BlockSize() int { return BlockSize }
func (c *sm4CipherAsm) Concurrency() int { return c.batchBlocks }
func (c *sm4CipherAsm) Encrypt(dst, src []byte) {
@@ -60,7 +107,7 @@ func (c *sm4CipherAsm) Encrypt(dst, src []byte) {
if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
panic("sm4: invalid buffer overlap")
}
-encryptBlockAsm(&c.enc[0], &dst[0], &src[0])
+encryptBlockAsm(&c.enc[0], &dst[0], &src[0], INST_AES)
}
func (c *sm4CipherAsm) EncryptBlocks(dst, src []byte) {
@@ -73,7 +120,7 @@ func (c *sm4CipherAsm) EncryptBlocks(dst, src []byte) {
if subtle.InexactOverlap(dst[:c.blocksSize], src[:c.blocksSize]) {
panic("sm4: invalid buffer overlap")
}
-encryptBlocksAsm(&c.enc[0], dst, src)
+encryptBlocksAsm(&c.enc[0], dst, src, INST_AES)
}
func (c *sm4CipherAsm) Decrypt(dst, src []byte) {
@@ -86,7 +133,7 @@ func (c *sm4CipherAsm) Decrypt(dst, src []byte) {
if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
panic("sm4: invalid buffer overlap")
}
-encryptBlockAsm(&c.dec[0], &dst[0], &src[0])
+encryptBlockAsm(&c.dec[0], &dst[0], &src[0], INST_AES)
}
func (c *sm4CipherAsm) DecryptBlocks(dst, src []byte) {
@@ -99,14 +146,16 @@ func (c *sm4CipherAsm) DecryptBlocks(dst, src []byte) {
if subtle.InexactOverlap(dst[:c.blocksSize], src[:c.blocksSize]) {
panic("sm4: invalid buffer overlap")
}
-encryptBlocksAsm(&c.dec[0], dst, src)
+encryptBlocksAsm(&c.dec[0], dst, src, INST_AES)
}
// expandKey is used by BenchmarkExpand to ensure that the asm implementation
// of key expansion is used for the benchmark when it is available.
func expandKey(key []byte, enc, dec []uint32) {
-if supportsAES {
-expandKeyAsm(&key[0], &ck[0], &enc[0], &dec[0])
+if supportSM4 {
+expandKeyAsm(&key[0], &ck[0], &enc[0], &dec[0], INST_SM4)
+} else if supportsAES {
+expandKeyAsm(&key[0], &ck[0], &enc[0], &dec[0], INST_AES)
} else {
expandKeyGo(key, enc, dec)
}


@@ -34,7 +34,7 @@ func TestExpandKey(t *testing.T) {
}
io.ReadFull(rand.Reader, key)
expandKeyGo(key, encRes1, decRes1)
-expandKeyAsm(&key[0], &ck[0], &encRes2[0], &decRes2[0])
+expandKey(key, encRes2, decRes2)
if !reflect.DeepEqual(encRes1, encRes2) {
t.Errorf("expected=%v, result=%v\n", encRes1, encRes2)
}


@@ -2201,3 +2201,11 @@ avx2GcmSm4DecDone:
VMOVDQU ACC0, (tPtr)
VZEROUPPER
RET
// func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0
RET
// func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niDec(SB),NOSPLIT,$0
RET


@@ -252,7 +252,7 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
SM4_TAO_L1(x, y, z); \
VEOR x.B16, t0.B16, t0.B16
-// func gcmSm4Init(productTable *[256]byte, rk []uint32)
+// func gcmSm4Init(productTable *[256]byte, rk []uint32, inst int)
TEXT ·gcmSm4Init(SB),NOSPLIT,$0
#define pTbl R0
#define RK R1
@@ -260,6 +260,7 @@ TEXT ·gcmSm4Init(SB),NOSPLIT,$0
MOVD productTable+0(FP), pTbl
MOVD rk+8(FP), RK
MOVD inst+16(FP), R5
MOVD $0xC2, I
LSL $56, I
@@ -269,6 +270,9 @@ TEXT ·gcmSm4Init(SB),NOSPLIT,$0
VEOR ZERO.B16, ZERO.B16, ZERO.B16
// Encrypt block 0 with the SM4 keys to generate the hash key H
CMP $1, R5
BEQ sm4InitSM4E
LOAD_SM4_AESNI_CONSTS()
VEOR B0.B16, B0.B16, B0.B16
VEOR B1.B16, B1.B16, B1.B16
@@ -290,7 +294,22 @@ sm4InitEncLoop:
VMOV B1.S[0], B0.S[3]
VMOV B2.S[0], B0.S[0]
VMOV B3.S[0], B0.S[1]
B sm4InitEncDone
sm4InitSM4E:
VEOR B0.B16, B0.B16, B0.B16
VLD1.P 64(RK), [T0.S4, T1.S4, T2.S4, T3.S4]
WORD $0x6085c0ce //SM4E V0.4S, V11.4S
WORD $0x8085c0ce //SM4E V0.4S, V12.4S
WORD $0xa085c0ce //SM4E V0.4S, V13.4S
WORD $0xc085c0ce //SM4E V0.4S, V14.4S
VLD1.P 64(RK), [T0.S4, T1.S4, T2.S4, T3.S4]
WORD $0x6085c0ce //SM4E V0.4S, V11.4S
WORD $0x8085c0ce //SM4E V0.4S, V12.4S
WORD $0xa085c0ce //SM4E V0.4S, V13.4S
WORD $0xc085c0ce //SM4E V0.4S, V14.4S
VREV32 B0.B16, B0.B16
VREV64 B0.B16, B0.B16
sm4InitEncDone:
// Multiply by 2 modulo P
VMOV B0.D[0], I
ASR $63, I
@@ -547,6 +566,7 @@ TEXT ·gcmSm4Enc(SB),NOSPLIT,$0
VMOV H0, INC.S[3]
VREV32 CTR.B16, CTR.B16
VADD CTR.S4, INC.S4, CTR.S4
// Skip to <8 blocks loop
CMP $128, srcPtrLen
@@ -587,7 +607,7 @@ encOctetsEnc4Blocks1:
VREV32 B2.B16, B2.B16
VREV32 B3.B16, B3.B16
TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
-// encryption first 4 blocks
+// encryption second 4 blocks
PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
MOVD rkSave, rk
@@ -880,7 +900,7 @@ decOctetsEnc4Blocks1:
VREV32 B3.B16, B3.B16
TRANSPOSE_MATRIX(T1, T2, B2, B3, K0)
-// encryption first 4 blocks
+// encryption second 4 blocks
PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
MOVD rkSave, rk

sm4/gcm_sm4ni_arm64.s (new file)

@@ -0,0 +1,525 @@
#include "textflag.h"
#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7
#define ACC0 V8
#define ACC1 V9
#define ACCM V10
#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14
#define POLY V15
#define ZERO V16
#define INC V17
#define CTR V18
#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22
#define K4 V23
#define K5 V24
#define K6 V25
#define K7 V26
#define reduce() \
VEOR ACC0.B16, ACCM.B16, ACCM.B16 \
VEOR ACC1.B16, ACCM.B16, ACCM.B16 \
VEXT $8, ZERO.B16, ACCM.B16, T0.B16 \
VEXT $8, ACCM.B16, ZERO.B16, ACCM.B16 \
VEOR ACCM.B16, ACC0.B16, ACC0.B16 \
VEOR T0.B16, ACC1.B16, ACC1.B16 \
VPMULL POLY.D1, ACC0.D1, T0.Q1 \
VEXT $8, ACC0.B16, ACC0.B16, ACC0.B16 \
VEOR T0.B16, ACC0.B16, ACC0.B16 \
VPMULL POLY.D1, ACC0.D1, T0.Q1 \
VEOR T0.B16, ACC1.B16, ACC1.B16 \
VEXT $8, ACC1.B16, ACC1.B16, ACC1.B16 \
VEOR ACC1.B16, ACC0.B16, ACC0.B16
#define mulRound(X) \
VLD1.P 32(pTbl), [T1.B16, T2.B16] \
VREV64 X.B16, X.B16 \
VEXT $8, X.B16, X.B16, T0.B16 \
VEOR X.B16, T0.B16, T0.B16 \
VPMULL X.D1, T1.D1, T3.Q1 \
VEOR T3.B16, ACC1.B16, ACC1.B16 \
VPMULL2 X.D2, T1.D2, T3.Q1 \
VEOR T3.B16, ACC0.B16, ACC0.B16 \
VPMULL T0.D1, T2.D1, T3.Q1 \
VEOR T3.B16, ACCM.B16, ACCM.B16
#define sm4eEnc1block() \
WORD $0x6086c0ce \ //SM4E V0.4S, V19.4S
WORD $0x8086c0ce \ //SM4E V0.4S, V20.4S
WORD $0xa086c0ce \ //SM4E V0.4S, V21.4S
WORD $0xc086c0ce \ //SM4E V0.4S, V22.4S
WORD $0xe086c0ce \ //SM4E V0.4S, V23.4S
WORD $0x0087c0ce \ //SM4E V0.4S, V24.4S
WORD $0x2087c0ce \ //SM4E V0.4S, V25.4S
WORD $0x4087c0ce //SM4E V0.4S, V26.4S
#define sm4eEnc8blocks() \
sm4eEnc1block() \
WORD $0x6186c0ce \ //SM4E V1.4S, V19.4S
WORD $0x8186c0ce \ //SM4E V1.4S, V20.4S
WORD $0xa186c0ce \ //SM4E V1.4S, V21.4S
WORD $0xc186c0ce \ //SM4E V1.4S, V22.4S
WORD $0xe186c0ce \ //SM4E V1.4S, V23.4S
WORD $0x0187c0ce \ //SM4E V1.4S, V24.4S
WORD $0x2187c0ce \ //SM4E V1.4S, V25.4S
WORD $0x4187c0ce \ //SM4E V1.4S, V26.4S
WORD $0x6286c0ce \ //SM4E V2.4S, V19.4S
WORD $0x8286c0ce \ //SM4E V2.4S, V20.4S
WORD $0xa286c0ce \ //SM4E V2.4S, V21.4S
WORD $0xc286c0ce \ //SM4E V2.4S, V22.4S
WORD $0xe286c0ce \ //SM4E V2.4S, V23.4S
WORD $0x0287c0ce \ //SM4E V2.4S, V24.4S
WORD $0x2287c0ce \ //SM4E V2.4S, V25.4S
WORD $0x4287c0ce \ //SM4E V2.4S, V26.4S
WORD $0x6386c0ce \ //SM4E V3.4S, V19.4S
WORD $0x8386c0ce \ //SM4E V3.4S, V20.4S
WORD $0xa386c0ce \ //SM4E V3.4S, V21.4S
WORD $0xc386c0ce \ //SM4E V3.4S, V22.4S
WORD $0xe386c0ce \ //SM4E V3.4S, V23.4S
WORD $0x0387c0ce \ //SM4E V3.4S, V24.4S
WORD $0x2387c0ce \ //SM4E V3.4S, V25.4S
WORD $0x4387c0ce \ //SM4E V3.4S, V26.4S
WORD $0x6486c0ce \ //SM4E V4.4S, V19.4S
WORD $0x8486c0ce \ //SM4E V4.4S, V20.4S
WORD $0xa486c0ce \ //SM4E V4.4S, V21.4S
WORD $0xc486c0ce \ //SM4E V4.4S, V22.4S
WORD $0xe486c0ce \ //SM4E V4.4S, V23.4S
WORD $0x0487c0ce \ //SM4E V4.4S, V24.4S
WORD $0x2487c0ce \ //SM4E V4.4S, V25.4S
WORD $0x4487c0ce \ //SM4E V4.4S, V26.4S
WORD $0x6586c0ce \ //SM4E V5.4S, V19.4S
WORD $0x8586c0ce \ //SM4E V5.4S, V20.4S
WORD $0xa586c0ce \ //SM4E V5.4S, V21.4S
WORD $0xc586c0ce \ //SM4E V5.4S, V22.4S
WORD $0xe586c0ce \ //SM4E V5.4S, V23.4S
WORD $0x0587c0ce \ //SM4E V5.4S, V24.4S
WORD $0x2587c0ce \ //SM4E V5.4S, V25.4S
WORD $0x4587c0ce \ //SM4E V5.4S, V26.4S
WORD $0x6686c0ce \ //SM4E V6.4S, V19.4S
WORD $0x8686c0ce \ //SM4E V6.4S, V20.4S
WORD $0xa686c0ce \ //SM4E V6.4S, V21.4S
WORD $0xc686c0ce \ //SM4E V6.4S, V22.4S
WORD $0xe686c0ce \ //SM4E V6.4S, V23.4S
WORD $0x0687c0ce \ //SM4E V6.4S, V24.4S
WORD $0x2687c0ce \ //SM4E V6.4S, V25.4S
WORD $0x4687c0ce \ //SM4E V6.4S, V26.4S
WORD $0x6786c0ce \ //SM4E V7.4S, V19.4S
WORD $0x8786c0ce \ //SM4E V7.4S, V20.4S
WORD $0xa786c0ce \ //SM4E V7.4S, V21.4S
WORD $0xc786c0ce \ //SM4E V7.4S, V22.4S
WORD $0xe786c0ce \ //SM4E V7.4S, V23.4S
WORD $0x0787c0ce \ //SM4E V7.4S, V24.4S
WORD $0x2787c0ce \ //SM4E V7.4S, V25.4S
WORD $0x4787c0ce //SM4E V7.4S, V26.4S
// func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define rk R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define H0 R9
#define H1 R10
#define pTblSave R11
#define rkSave R12
MOVD productTable+0(FP), pTbl
MOVD dst+8(FP), dstPtr
MOVD src_base+32(FP), srcPtr
MOVD src_len+40(FP), srcPtrLen
MOVD ctr+56(FP), ctrPtr
MOVD T+64(FP), tPtr
MOVD rk_base+72(FP), rk
MOVD $0xC2, H1
LSL $56, H1
MOVD $1, H0
VMOV H1, POLY.D[0]
VMOV H0, POLY.D[1]
VEOR ZERO.B16, ZERO.B16, ZERO.B16
MOVD pTbl, pTblSave
// Current tag, after AAD
VLD1 (tPtr), [ACC0.B16]
VEOR ACC1.B16, ACC1.B16, ACC1.B16
VEOR ACCM.B16, ACCM.B16, ACCM.B16
// Prepare initial counter, and the increment vector
VLD1 (ctrPtr), [CTR.B16]
VEOR INC.B16, INC.B16, INC.B16
MOVD $1, H0
VMOV H0, INC.S[3]
VREV32 CTR.B16, CTR.B16
VADD CTR.S4, INC.S4, CTR.S4
// Skip to <8 blocks loop
CMP $128, srcPtrLen
MOVD rk, H0
// For SM4 round keys are stored in: K0 .. K7
VLD1.P 64(H0), [K0.S4, K1.S4, K2.S4, K3.S4]
VLD1.P 64(H0), [K4.S4, K5.S4, K6.S4, K7.S4]
BLT startSingles
octetsLoop:
SUB $128, srcPtrLen
// Prepare 8 counters
VMOV CTR.B16, B0.B16
VADD B0.S4, INC.S4, B1.S4
VADD B1.S4, INC.S4, B2.S4
VADD B2.S4, INC.S4, B3.S4
VADD B3.S4, INC.S4, B4.S4
VADD B4.S4, INC.S4, B5.S4
VADD B5.S4, INC.S4, B6.S4
VADD B6.S4, INC.S4, B7.S4
VADD B7.S4, INC.S4, CTR.S4
sm4eEnc8blocks()
VREV32 B0.B16, B0.B16
VREV32 B1.B16, B1.B16
VREV32 B2.B16, B2.B16
VREV32 B3.B16, B3.B16
VREV32 B4.B16, B4.B16
VREV32 B5.B16, B5.B16
VREV32 B6.B16, B6.B16
VREV32 B7.B16, B7.B16
// XOR plaintext and store ciphertext
VLD1.P 32(srcPtr), [T1.B16, T2.B16]
VEOR B0.B16, T1.B16, B0.B16
VEOR B1.B16, T2.B16, B1.B16
VST1.P [B0.B16, B1.B16], 32(dstPtr)
VLD1.P 32(srcPtr), [T1.B16, T2.B16]
VEOR B2.B16, T1.B16, B2.B16
VEOR B3.B16, T2.B16, B3.B16
VST1.P [B2.B16, B3.B16], 32(dstPtr)
VLD1.P 32(srcPtr), [T1.B16, T2.B16]
VEOR B4.B16, T1.B16, B4.B16
VEOR B5.B16, T2.B16, B5.B16
VST1.P [B4.B16, B5.B16], 32(dstPtr)
VLD1.P 32(srcPtr), [T1.B16, T2.B16]
VEOR B6.B16, T1.B16, B6.B16
VEOR B7.B16, T2.B16, B7.B16
VST1.P [B6.B16, B7.B16], 32(dstPtr)
VLD1.P 32(pTbl), [T1.B16, T2.B16]
VREV64 B0.B16, B0.B16
VEOR ACC0.B16, B0.B16, B0.B16
VEXT $8, B0.B16, B0.B16, T0.B16
VEOR B0.B16, T0.B16, T0.B16
VPMULL B0.D1, T1.D1, ACC1.Q1
VPMULL2 B0.D2, T1.D2, ACC0.Q1
VPMULL T0.D1, T2.D1, ACCM.Q1
mulRound(B1)
mulRound(B2)
mulRound(B3)
mulRound(B4)
mulRound(B5)
mulRound(B6)
mulRound(B7)
MOVD pTblSave, pTbl
reduce()
CMP $128, srcPtrLen
BGE octetsLoop
startSingles:
CBZ srcPtrLen, done
ADD $14*16, pTbl
// Preload H and its Karatsuba precomp
VLD1.P (pTbl), [T1.B16, T2.B16]
singlesLoop:
CMP $16, srcPtrLen
BLT tail
SUB $16, srcPtrLen
VMOV CTR.B16, B0.B16
VADD CTR.S4, INC.S4, CTR.S4
sm4eEnc1block()
VREV32 B0.B16, B0.B16
singlesLast:
VLD1.P 16(srcPtr), [T0.B16]
VEOR T0.B16, B0.B16, B0.B16
encReduce:
VST1.P [B0.B16], 16(dstPtr)
VREV64 B0.B16, B0.B16
VEOR ACC0.B16, B0.B16, B0.B16
VEXT $8, B0.B16, B0.B16, T0.B16
VEOR B0.B16, T0.B16, T0.B16
VPMULL B0.D1, T1.D1, ACC1.Q1
VPMULL2 B0.D2, T1.D2, ACC0.Q1
VPMULL T0.D1, T2.D1, ACCM.Q1
reduce()
B singlesLoop
tail:
CBZ srcPtrLen, done
VEOR T0.B16, T0.B16, T0.B16
VEOR T3.B16, T3.B16, T3.B16
MOVD $0, H1
SUB $1, H1
ADD srcPtrLen, srcPtr
TBZ $3, srcPtrLen, ld4
MOVD.W -8(srcPtr), H0
VMOV H0, T0.D[0]
VMOV H1, T3.D[0]
ld4:
TBZ $2, srcPtrLen, ld2
MOVW.W -4(srcPtr), H0
VEXT $12, T0.B16, ZERO.B16, T0.B16
VEXT $12, T3.B16, ZERO.B16, T3.B16
VMOV H0, T0.S[0]
VMOV H1, T3.S[0]
ld2:
TBZ $1, srcPtrLen, ld1
MOVH.W -2(srcPtr), H0
VEXT $14, T0.B16, ZERO.B16, T0.B16
VEXT $14, T3.B16, ZERO.B16, T3.B16
VMOV H0, T0.H[0]
VMOV H1, T3.H[0]
ld1:
TBZ $0, srcPtrLen, ld0
MOVB.W -1(srcPtr), H0
VEXT $15, T0.B16, ZERO.B16, T0.B16
VEXT $15, T3.B16, ZERO.B16, T3.B16
VMOV H0, T0.B[0]
VMOV H1, T3.B[0]
ld0:
MOVD ZR, srcPtrLen
VMOV CTR.B16, B0.B16
sm4eEnc1block()
VREV32 B0.B16, B0.B16
tailLast:
VEOR T0.B16, B0.B16, B0.B16
VAND T3.B16, B0.B16, B0.B16
B encReduce
done:
VST1 [ACC0.B16], (tPtr)
RET
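The counter setup above follows GCM: VREV32 brings the counter into big-endian lane order and the VADD with INC (which holds 1 in its last word) bumps only the final 32-bit word. A conceptual equivalent (assumes import "encoding/binary"):

	// gcmInc32 is a sketch of the VREV32/VADD counter bump: GCM increments
	// only the last 32-bit word of the 16-byte counter block, big-endian.
	func gcmInc32(counter *[16]byte) {
		n := binary.BigEndian.Uint32(counter[12:])
		binary.BigEndian.PutUint32(counter[12:], n+1)
	}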
// func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niDec(SB),NOSPLIT,$0
MOVD productTable+0(FP), pTbl
MOVD dst+8(FP), dstPtr
MOVD src_base+32(FP), srcPtr
MOVD src_len+40(FP), srcPtrLen
MOVD ctr+56(FP), ctrPtr
MOVD T+64(FP), tPtr
MOVD rk_base+72(FP), rk
MOVD $0xC2, H1
LSL $56, H1
MOVD $1, H0
VMOV H1, POLY.D[0]
VMOV H0, POLY.D[1]
VEOR ZERO.B16, ZERO.B16, ZERO.B16
MOVD pTbl, pTblSave
MOVD rk, rkSave
// Current tag, after AAD
VLD1 (tPtr), [ACC0.B16]
VEOR ACC1.B16, ACC1.B16, ACC1.B16
VEOR ACCM.B16, ACCM.B16, ACCM.B16
// Prepare initial counter, and the increment vector
VLD1 (ctrPtr), [CTR.B16]
VEOR INC.B16, INC.B16, INC.B16
MOVD $1, H0
VMOV H0, INC.S[3]
VREV32 CTR.B16, CTR.B16
VADD CTR.S4, INC.S4, CTR.S4
// Skip to <8 blocks loop
CMP $128, srcPtrLen
MOVD rk, H0
// For SM4 round keys are stored in: K0 .. K7
VLD1.P 64(H0), [K0.S4, K1.S4, K2.S4, K3.S4]
VLD1.P 64(H0), [K4.S4, K5.S4, K6.S4, K7.S4]
BLT startSingles
octetsLoop:
SUB $128, srcPtrLen
VMOV CTR.B16, B0.B16
VADD B0.S4, INC.S4, B1.S4
VADD B1.S4, INC.S4, B2.S4
VADD B2.S4, INC.S4, B3.S4
VADD B3.S4, INC.S4, B4.S4
VADD B4.S4, INC.S4, B5.S4
VADD B5.S4, INC.S4, B6.S4
VADD B6.S4, INC.S4, B7.S4
VADD B7.S4, INC.S4, CTR.S4
sm4eEnc8blocks()
VREV32 B0.B16, T1.B16
VREV32 B1.B16, T2.B16
VREV32 B2.B16, B2.B16
VREV32 B3.B16, B3.B16
VREV32 B4.B16, B4.B16
VREV32 B5.B16, B5.B16
VREV32 B6.B16, B6.B16
VREV32 B7.B16, B7.B16
VLD1.P 32(srcPtr), [B0.B16, B1.B16]
VEOR B0.B16, T1.B16, T1.B16
VEOR B1.B16, T2.B16, T2.B16
VST1.P [T1.B16, T2.B16], 32(dstPtr)
VLD1.P 32(pTbl), [T1.B16, T2.B16]
VREV64 B0.B16, B0.B16
VEOR ACC0.B16, B0.B16, B0.B16
VEXT $8, B0.B16, B0.B16, T0.B16
VEOR B0.B16, T0.B16, T0.B16
VPMULL B0.D1, T1.D1, ACC1.Q1
VPMULL2 B0.D2, T1.D2, ACC0.Q1
VPMULL T0.D1, T2.D1, ACCM.Q1
mulRound(B1)
VLD1.P 32(srcPtr), [B0.B16, B1.B16]
VEOR B2.B16, B0.B16, T1.B16
VEOR B3.B16, B1.B16, T2.B16
VST1.P [T1.B16, T2.B16], 32(dstPtr)
mulRound(B0)
mulRound(B1)
VLD1.P 32(srcPtr), [B0.B16, B1.B16]
VEOR B4.B16, B0.B16, T1.B16
VEOR B5.B16, B1.B16, T2.B16
VST1.P [T1.B16, T2.B16], 32(dstPtr)
mulRound(B0)
mulRound(B1)
VLD1.P 32(srcPtr), [B0.B16, B1.B16]
VEOR B6.B16, B0.B16, T1.B16
VEOR B7.B16, B1.B16, T2.B16
VST1.P [T1.B16, T2.B16], 32(dstPtr)
mulRound(B0)
mulRound(B1)
MOVD pTblSave, pTbl
reduce()
CMP $128, srcPtrLen
BGE octetsLoop
startSingles:
CBZ srcPtrLen, done
ADD $14*16, pTbl
// Preload H and its Karatsuba precomp
VLD1.P (pTbl), [T1.B16, T2.B16]
singlesLoop:
CMP $16, srcPtrLen
BLT tail
SUB $16, srcPtrLen
VLD1.P 16(srcPtr), [T0.B16]
VREV64 T0.B16, B5.B16
VMOV CTR.B16, B0.B16
VADD CTR.S4, INC.S4, CTR.S4
sm4eEnc1block()
VREV32 B0.B16, B0.B16
singlesLast:
VEOR T0.B16, B0.B16, B0.B16
VST1.P [B0.B16], 16(dstPtr)
VEOR ACC0.B16, B5.B16, B5.B16
VEXT $8, B5.B16, B5.B16, T0.B16
VEOR B5.B16, T0.B16, T0.B16
VPMULL B5.D1, T1.D1, ACC1.Q1
VPMULL2 B5.D2, T1.D2, ACC0.Q1
VPMULL T0.D1, T2.D1, ACCM.Q1
reduce()
B singlesLoop
tail:
CBZ srcPtrLen, done
VMOV CTR.B16, B0.B16
VADD CTR.S4, INC.S4, CTR.S4
sm4eEnc1block()
VREV32 B0.B16, B0.B16
tailLast:
// Assuming it is safe to load past dstPtr due to the presence of the tag
// B5 stored last ciphertext
VLD1 (srcPtr), [B5.B16]
VEOR B5.B16, B0.B16, B0.B16
VEOR T3.B16, T3.B16, T3.B16
MOVD $0, H1
SUB $1, H1
TBZ $3, srcPtrLen, ld4 // Test if srcPtrLen < 8, if yes, goto ld4
VMOV B0.D[0], H0
MOVD.P H0, 8(dstPtr)
VMOV H1, T3.D[0]
VEXT $8, ZERO.B16, B0.B16, B0.B16
ld4:
TBZ $2, srcPtrLen, ld2 // Test if srcPtrLen < 4, if yes, goto ld2
VMOV B0.S[0], H0
MOVW.P H0, 4(dstPtr)
VEXT $12, T3.B16, ZERO.B16, T3.B16
VMOV H1, T3.S[0]
VEXT $4, ZERO.B16, B0.B16, B0.B16
ld2:
TBZ $1, srcPtrLen, ld1 // Test if srcPtrLen < 2, if yes, goto ld1
VMOV B0.H[0], H0
MOVH.P H0, 2(dstPtr)
VEXT $14, T3.B16, ZERO.B16, T3.B16
VMOV H1, T3.H[0]
VEXT $2, ZERO.B16, B0.B16, B0.B16
ld1:
TBZ $0, srcPtrLen, ld0 // Test if srcPtrLen < 1, if yes, goto ld0
VMOV B0.B[0], H0
MOVB.P H0, 1(dstPtr)
VEXT $15, T3.B16, ZERO.B16, T3.B16
VMOV H1, T3.B[0]
ld0:
VAND T3.B16, B5.B16, B5.B16
VREV64 B5.B16, B5.B16
VEOR ACC0.B16, B5.B16, B5.B16
VEXT $8, B5.B16, B5.B16, T0.B16
VEOR B5.B16, T0.B16, T0.B16
VPMULL B5.D1, T1.D1, ACC1.Q1
VPMULL2 B5.D2, T1.D2, ACC0.Q1
VPMULL T0.D1, T2.D1, ACCM.Q1
reduce()
done:
VST1 [ACC0.B16], (tPtr)
RET

sm4/gen_arm64_ni.go (new file)

@@ -0,0 +1,137 @@
// Not used yet!!!
// go run gen_arm64_ni.go
//go:build ignore
// +build ignore
package main
import (
"bytes"
"fmt"
"log"
"math/bits"
"os"
)
//SM4E <Vd>.4S, <Vn>.4S
func sm4e(Vd, Vn byte) uint32 {
inst := uint32(0xcec08400) | uint32(Vd&0x1f) | uint32(Vn&0x1f)<<5
return bits.ReverseBytes32(inst)
}
//SM4EKEY <Vd>.4S, <Vn>.4S, <Vm>.4S
func sm4ekey(Vd, Vn, Vm byte) uint32 {
inst := uint32(0xce60c800) | uint32(Vd&0x1f) | uint32(Vn&0x1f)<<5 | (uint32(Vm&0x1f) << 16)
return bits.ReverseBytes32(inst)
}
func sm4ekeyRound(buf *bytes.Buffer, d, n, m byte) {
fmt.Fprintf(buf, "\tWORD $0x%08x //SM4EKEY V%d.4S, V%d.4S, V%d.4S\n", sm4ekey(d, n, m), d, n, m)
}
func sm4eRound(buf *bytes.Buffer, d, n byte) {
fmt.Fprintf(buf, "\tWORD $0x%08x //SM4E V%d.4S, V%d.4S\n", sm4e(d, n), d, n)
}
func main() {
buf := new(bytes.Buffer)
fmt.Fprint(buf, `
// Generated by gen_arm64_ni.go. DO NOT EDIT.
#include "textflag.h"
// func expandKeySM4E(key *byte, fk, ck, enc *uint32)
TEXT ·expandKeySM4E(SB),NOSPLIT,$0
MOVD key+0(FP), R8
MOVD fk+8(FP), R9
MOVD ck+16(FP), R10
MOVD enc+24(FP), R11
VLD1 (R8), [V9.B16]
VREV32 V9.B16, V9.B16
VLD1 (R9), [V8.S4]
VEOR V9, V8, V9
VLD1.P 64(R10), [V0.S4, V1.S4, V2.S4, V3.S4]
`[1:])
sm4ekeyRound(buf, 8, 9, 0)
sm4ekeyRound(buf, 9, 8, 1)
fmt.Fprintf(buf, "\tVST1.P [V8.S4, V9.S4], 32(R11)\n")
sm4ekeyRound(buf, 8, 9, 2)
sm4ekeyRound(buf, 9, 8, 3)
fmt.Fprintf(buf, "\tVST1.P [V8.S4, V9.S4], 32(R11)\n")
fmt.Fprintf(buf, "\tVLD1.P 64(R10), [V0.S4, V1.S4, V2.S4, V3.S4]\n")
sm4ekeyRound(buf, 8, 9, 0)
sm4ekeyRound(buf, 9, 8, 1)
fmt.Fprintf(buf, "\tVST1.P [V8.S4, V9.S4], 32(R11)\n")
sm4ekeyRound(buf, 8, 9, 2)
sm4ekeyRound(buf, 9, 8, 3)
fmt.Fprintf(buf, `
VST1.P [V8.S4, V9.S4], 32(R11)
RET
`[1:])
fmt.Fprint(buf, `
// func encryptBlockSM4E(xk *uint32, dst, src *byte)
TEXT ·encryptBlockSM4E(SB),NOSPLIT,$0
MOVD xk+0(FP), R8
MOVD dst+8(FP), R9
MOVD src+16(FP), R10
VLD1 (R10), [V8.B16]
VREV32 V8.B16, V8.B16
VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
`[1:])
sm4eRound(buf, 8, 0)
sm4eRound(buf, 8, 1)
sm4eRound(buf, 8, 2)
sm4eRound(buf, 8, 3)
fmt.Fprintf(buf, "\tVLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]\n")
sm4eRound(buf, 8, 0)
sm4eRound(buf, 8, 1)
sm4eRound(buf, 8, 2)
sm4eRound(buf, 8, 3)
fmt.Fprintf(buf, `
VREV32 V8.B16, V8.B16
VST1 [V8.B16], (R9)
RET
`[1:])
fmt.Fprint(buf, `
// func encryptBlocksSM4E(xk *uint32, dst, src *byte)
TEXT ·encryptBlocksSM4E(SB),NOSPLIT,$0
MOVD xk+0(FP), R8
MOVD dst+8(FP), R9
MOVD src+16(FP), R10
VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
VLD1.P 64(R8), [V4.S4, V5.S4, V6.S4, V7.S4]
`[1:])
for i := 0; i < 4; i++ {
fmt.Fprintf(buf, "\tVLD1.P 16(R10), [V8.B16]\n")
fmt.Fprintf(buf, "\tVREV32 V8.B16, V8.B16\n")
sm4eRound(buf, 8, 0)
sm4eRound(buf, 8, 1)
sm4eRound(buf, 8, 2)
sm4eRound(buf, 8, 3)
sm4eRound(buf, 8, 4)
sm4eRound(buf, 8, 5)
sm4eRound(buf, 8, 6)
sm4eRound(buf, 8, 7)
fmt.Fprintf(buf, "\tVREV32 V8.B16, V8.B16\n")
fmt.Fprintf(buf, "\tVST1.P [V8.B16], 16(R9)\n\n")
}
fmt.Fprintf(buf, `
RET
`[1:])
src := buf.Bytes()
// fmt.Println(string(src))
err := os.WriteFile("sm4e_arm64.s", src, 0644)
if err != nil {
log.Fatal(err)
}
}
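As a sanity check, these encoders reproduce the WORD constants hard-coded in the hand-written assembly earlier in this PR; a hypothetical test (not part of the commit, needs import "testing", and the file's build-ignore tag would have to be lifted to run it):

	func TestInstEncodings(t *testing.T) {
		if got := sm4e(8, 0); got != 0x0884c0ce {
			t.Errorf("SM4E V8.4S, V0.4S: got %#08x, want 0x0884c0ce", got)
		}
		if got := sm4ekey(8, 9, 0); got != 0x28c960ce {
			t.Errorf("SM4EKEY V8.4S, V9.4S, V0.4S: got %#08x, want 0x28c960ce", got)
		}
	}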


@@ -12,16 +12,16 @@ import (
// sm4CipherGCM implements crypto/cipher.gcmAble so that crypto/cipher.NewGCM
// will use the optimised implementation in this file when possible. Instances
-// of this type only exist when hasGCMAsm returns true.
+// of this type only exist when hasGCMAsm and hasAES returns true.
type sm4CipherGCM struct {
-sm4CipherAsm
+*sm4CipherAsm
}
// Assert that sm4CipherGCM implements the gcmAble interface.
var _ gcmAble = (*sm4CipherGCM)(nil)
//go:noescape
-func gcmSm4Init(productTable *[256]byte, rk []uint32)
+func gcmSm4Init(productTable *[256]byte, rk []uint32, inst int)
//go:noescape
func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
@@ -35,6 +35,33 @@ func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)
//go:noescape
func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
// gcmSm4InitInst is used for test
func gcmSm4InitInst(productTable *[256]byte, rk []uint32) {
if supportSM4 {
gcmSm4Init(productTable, rk, INST_SM4)
} else {
gcmSm4Init(productTable, rk, INST_AES)
}
}
// gcmSm4EncInst is used for test
func gcmSm4EncInst(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) {
if supportSM4 {
gcmSm4niEnc(productTable, dst, src, ctr, T, rk)
} else {
gcmSm4Enc(productTable, dst, src, ctr, T, rk)
}
}
// gcmSm4DecInst is used for test
func gcmSm4DecInst(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) {
if supportSM4 {
gcmSm4niDec(productTable, dst, src, ctr, T, rk)
} else {
gcmSm4Dec(productTable, dst, src, ctr, T, rk)
}
}
type gcmAsm struct {
gcm
bytesProductTable [256]byte
@@ -44,10 +71,10 @@ type gcmAsm struct {
// called by crypto/cipher.NewGCM via the gcmAble interface.
func (c *sm4CipherGCM) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
g := &gcmAsm{}
-g.cipher = &c.sm4CipherAsm
+g.cipher = c.sm4CipherAsm
g.nonceSize = nonceSize
g.tagSize = tagSize
-gcmSm4Init(&g.bytesProductTable, g.cipher.enc)
+gcmSm4Init(&g.bytesProductTable, g.cipher.enc, INST_AES)
return g, nil
}


@@ -11,12 +11,12 @@ import (
func genPrecomputeTable() *gcmAsm {
key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10}
-c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, 4, 64}
-expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0])
+c := &sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, 4, 64}
+expandKey(key, c.enc, c.dec)
c1 := &sm4CipherGCM{c}
g := &gcmAsm{}
-g.cipher = &c1.sm4CipherAsm
-gcmSm4Init(&g.bytesProductTable, g.cipher.enc)
+g.cipher = c1.sm4CipherAsm
+gcmSm4InitInst(&g.bytesProductTable, g.cipher.enc)
return g
}
@@ -145,13 +145,13 @@ func TestBothDataPlaintext(t *testing.T) {
func createGcm() *gcmAsm {
key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10}
-c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, 4, 64}
-expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0])
+c := &sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, 4, 64}
+expandKey(key, c.enc, c.dec)
c1 := &sm4CipherGCM{c}
g := &gcmAsm{}
-g.cipher = &c1.sm4CipherAsm
+g.cipher = c1.sm4CipherAsm
g.tagSize = 16
-gcmSm4Init(&g.bytesProductTable, g.cipher.enc)
+gcmSm4InitInst(&g.bytesProductTable, g.cipher.enc)
return g
}
@@ -214,7 +214,7 @@ func TestGcmSm4Enc(t *testing.T) {
out2 := make([]byte, len(test.plaintext)+gcm.tagSize)
gcmSm4Data(&gcm.bytesProductTable, []byte("emmansun"), &tagOut2)
-gcmSm4Enc(&gcm.bytesProductTable, out2, []byte(test.plaintext), &counter2, &tagOut2, gcm.cipher.enc)
+gcmSm4EncInst(&gcm.bytesProductTable, out2, []byte(test.plaintext), &counter2, &tagOut2, gcm.cipher.enc)
if hex.EncodeToString(out1) != hex.EncodeToString(out2) {
t.Errorf("#%d: out expected %s, got %s", i, hex.EncodeToString(out1), hex.EncodeToString(out2))
}
@@ -244,7 +244,7 @@ func TestGcmSm4Dec(t *testing.T) {
out2 := make([]byte, len(test.plaintext)+gcm.tagSize)
gcmSm4Data(&gcm.bytesProductTable, []byte("emmansun"), &tagOut2)
-gcmSm4Dec(&gcm.bytesProductTable, out2, out1, &counter2, &tagOut2, gcm.cipher.enc)
+gcmSm4DecInst(&gcm.bytesProductTable, out2, out1, &counter2, &tagOut2, gcm.cipher.enc)
if hex.EncodeToString([]byte(test.plaintext)) != hex.EncodeToString(out2[:len(test.plaintext)]) {
t.Errorf("#%d: out expected %s, got %s", i, hex.EncodeToString([]byte(test.plaintext)), hex.EncodeToString(out2[:len(test.plaintext)]))

sm4/sm4ni_gcm_asm.go (new file)

@@ -0,0 +1,152 @@
//go:build amd64 || arm64
// +build amd64 arm64
package sm4
import (
"crypto/cipher"
goSubtle "crypto/subtle"
"github.com/emmansun/gmsm/internal/subtle"
)
//go:noescape
func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
//go:noescape
func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
// sm4CipherNIGCM implements crypto/cipher.gcmAble so that crypto/cipher.NewGCM
// will use the optimised implementation in this file when possible. Instances
// of this type only exist when hasGCMAsm and hasSM4 returns true.
type sm4CipherNIGCM struct {
*sm4CipherNI
}
// Assert that sm4CipherNIGCM implements the gcmAble interface.
var _ gcmAble = (*sm4CipherNIGCM)(nil)
type gcmNI struct {
cipher *sm4CipherNI
nonceSize int
tagSize int
bytesProductTable [256]byte
}
func (g *gcmNI) NonceSize() int {
return g.nonceSize
}
func (g *gcmNI) Overhead() int {
return g.tagSize
}
// NewGCM returns the SM4 cipher wrapped in Galois Counter Mode. This is only
// called by crypto/cipher.NewGCM via the gcmAble interface.
func (c *sm4CipherNIGCM) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
g := &gcmNI{}
g.cipher = c.sm4CipherNI
g.nonceSize = nonceSize
g.tagSize = tagSize
gcmSm4Init(&g.bytesProductTable, g.cipher.enc, INST_SM4)
return g, nil
}
// Seal encrypts and authenticates plaintext. See the cipher.AEAD interface for
// details.
func (g *gcmNI) Seal(dst, nonce, plaintext, data []byte) []byte {
if len(nonce) != g.nonceSize {
panic("cipher: incorrect nonce length given to GCM")
}
if uint64(len(plaintext)) > ((1<<32)-2)*BlockSize {
panic("cipher: message too large for GCM")
}
var counter, tagMask [gcmBlockSize]byte
if len(nonce) == gcmStandardNonceSize {
// Init counter to nonce||1
copy(counter[:], nonce)
counter[gcmBlockSize-1] = 1
} else {
// Otherwise counter = GHASH(nonce)
gcmSm4Data(&g.bytesProductTable, nonce, &counter)
gcmSm4Finish(&g.bytesProductTable, &tagMask, &counter, uint64(len(nonce)), uint64(0))
}
g.cipher.Encrypt(tagMask[:], counter[:])
var tagOut [gcmTagSize]byte
gcmSm4Data(&g.bytesProductTable, data, &tagOut)
ret, out := subtle.SliceForAppend(dst, len(plaintext)+g.tagSize)
if subtle.InexactOverlap(out[:len(plaintext)], plaintext) {
panic("cipher: invalid buffer overlap")
}
if len(plaintext) > 0 {
gcmSm4niEnc(&g.bytesProductTable, out, plaintext, &counter, &tagOut, g.cipher.enc)
}
gcmSm4Finish(&g.bytesProductTable, &tagMask, &tagOut, uint64(len(plaintext)), uint64(len(data)))
copy(out[len(plaintext):], tagOut[:])
return ret
}
// Open authenticates and decrypts ciphertext. See the cipher.AEAD interface
// for details.
func (g *gcmNI) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) {
if len(nonce) != g.nonceSize {
panic("cipher: incorrect nonce length given to GCM")
}
// Sanity check to prevent the authentication from always succeeding if an implementation
// leaves tagSize uninitialized, for example.
if g.tagSize < gcmMinimumTagSize {
panic("cipher: incorrect GCM tag size")
}
if len(ciphertext) < g.tagSize {
return nil, errOpen
}
if uint64(len(ciphertext)) > ((1<<32)-2)*uint64(BlockSize)+uint64(g.tagSize) {
return nil, errOpen
}
tag := ciphertext[len(ciphertext)-g.tagSize:]
ciphertext = ciphertext[:len(ciphertext)-g.tagSize]
// See GCM spec, section 7.1.
var counter, tagMask [gcmBlockSize]byte
if len(nonce) == gcmStandardNonceSize {
// Init counter to nonce||1
copy(counter[:], nonce)
counter[gcmBlockSize-1] = 1
} else {
// Otherwise counter = GHASH(nonce)
gcmSm4Data(&g.bytesProductTable, nonce, &counter)
gcmSm4Finish(&g.bytesProductTable, &tagMask, &counter, uint64(len(nonce)), uint64(0))
}
g.cipher.Encrypt(tagMask[:], counter[:])
var expectedTag [gcmTagSize]byte
gcmSm4Data(&g.bytesProductTable, data, &expectedTag)
ret, out := subtle.SliceForAppend(dst, len(ciphertext))
if subtle.InexactOverlap(out, ciphertext) {
panic("cipher: invalid buffer overlap")
}
if len(ciphertext) > 0 {
gcmSm4niDec(&g.bytesProductTable, out, ciphertext, &counter, &expectedTag, g.cipher.enc)
}
gcmSm4Finish(&g.bytesProductTable, &tagMask, &expectedTag, uint64(len(ciphertext)), uint64(len(data)))
if goSubtle.ConstantTimeCompare(expectedTag[:g.tagSize], tag) != 1 {
for i := range out {
out[i] = 0
}
return nil, errOpen
}
return ret, nil
}
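End to end, callers reach this NI path transparently through crypto/cipher; a usage sketch:

	package main

	import (
		"crypto/cipher"
		"crypto/rand"
		"fmt"

		"github.com/emmansun/gmsm/sm4"
	)

	func main() {
		key := make([]byte, 16)
		rand.Read(key)
		block, _ := sm4.NewCipher(key)  // sm4CipherNIGCM when SM4 NI and PMULL exist
		aead, _ := cipher.NewGCM(block) // NewGCM discovers the gcmAble fast path
		nonce := make([]byte, aead.NonceSize())
		rand.Read(nonce)
		ct := aead.Seal(nil, nonce, []byte("hello sm4-gcm"), nil)
		pt, err := aead.Open(nil, nonce, ct, nil)
		fmt.Println(string(pt), err)
	}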