sm4: xts sm4ni arm64 #151

This commit is contained in:
Sun Yimin 2023-08-25 13:13:43 +08:00 committed by GitHub
parent 16c5556655
commit 1019226803
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 702 additions and 160 deletions

View File

@ -62,90 +62,7 @@
VPMULL T0.D1, T2.D1, T3.Q1 \ VPMULL T0.D1, T2.D1, T3.Q1 \
VEOR T3.B16, ACCM.B16, ACCM.B16 VEOR T3.B16, ACCM.B16, ACCM.B16
#define sm4eEnc1block() \ #include "sm4ni_macros_arm64.s"
WORD $0xcec08660 \ //SM4E V0.4S, V19.4S
WORD $0xcec08680 \ //SM4E V0.4S, V20.4S
WORD $0xcec086a0 \ //SM4E V0.4S, V21.4S
WORD $0xcec086c0 \ //SM4E V0.4S, V22.4S
WORD $0xcec086e0 \ //SM4E V0.4S, V23.4S
WORD $0xcec08700 \ //SM4E V0.4S, V24.4S
WORD $0xcec08720 \ //SM4E V0.4S, V25.4S
WORD $0xcec08740 \ //SM4E V0.4S, V26.4S
VREV64 V0.B16, V0.B16 \
VEXT $8, V0.B16, V0.B16, V0.B16
#define sm4eEnc8blocks() \
sm4eEnc1block() \
WORD $0xcec08661 \ //SM4E V1.4S, V19.4S
WORD $0xcec08681 \ //SM4E V1.4S, V20.4S
WORD $0xcec086a1 \ //SM4E V1.4S, V21.4S
WORD $0xcec086c1 \ //SM4E V1.4S, V22.4S
WORD $0xcec086e1 \ //SM4E V1.4S, V23.4S
WORD $0xcec08701 \ //SM4E V1.4S, V24.4S
WORD $0xcec08721 \ //SM4E V1.4S, V25.4S
WORD $0xcec08741 \ //SM4E V1.4S, V26.4S
VREV64 V1.B16, V1.B16 \
VEXT $8, V1.B16, V1.B16, V1.B16 \
WORD $0xcec08662 \ //SM4E V2.4S, V19.4S
WORD $0xcec08682 \ //SM4E V2.4S, V20.4S
WORD $0xcec086a2 \ //SM4E V2.4S, V21.4S
WORD $0xcec086c2 \ //SM4E V2.4S, V22.4S
WORD $0xcec086e2 \ //SM4E V2.4S, V23.4S
WORD $0xcec08702 \ //SM4E V2.4S, V24.4S
WORD $0xcec08722 \ //SM4E V2.4S, V25.4S
WORD $0xcec08742 \ //SM4E V2.4S, V26.4S
VREV64 V2.B16, V2.B16 \
VEXT $8, V2.B16, V2.B16, V2.B16 \
WORD $0xcec08663 \ //SM4E V3.4S, V19.4S
WORD $0xcec08683 \ //SM4E V3.4S, V20.4S
WORD $0xcec086a3 \ //SM4E V3.4S, V21.4S
WORD $0xcec086c3 \ //SM4E V3.4S, V22.4S
WORD $0xcec086e3 \ //SM4E V3.4S, V23.4S
WORD $0xcec08703 \ //SM4E V3.4S, V24.4S
WORD $0xcec08723 \ //SM4E V3.4S, V25.4S
WORD $0xcec08743 \ //SM4E V3.4S, V26.4S
VREV64 V3.B16, V3.B16 \
VEXT $8, V3.B16, V3.B16, V3.B16 \
WORD $0xcec08664 \ //SM4E V4.4S, V19.4S
WORD $0xcec08684 \ //SM4E V4.4S, V20.4S
WORD $0xcec086a4 \ //SM4E V4.4S, V21.4S
WORD $0xcec086c4 \ //SM4E V4.4S, V22.4S
WORD $0xcec086e4 \ //SM4E V4.4S, V23.4S
WORD $0xcec08704 \ //SM4E V4.4S, V24.4S
WORD $0xcec08724 \ //SM4E V4.4S, V25.4S
WORD $0xcec08744 \ //SM4E V4.4S, V26.4S
VREV64 V4.B16, V4.B16 \
VEXT $8, V4.B16, V4.B16, V4.B16 \
WORD $0xcec08665 \ //SM4E V5.4S, V19.4S
WORD $0xcec08685 \ //SM4E V5.4S, V20.4S
WORD $0xcec086a5 \ //SM4E V5.4S, V21.4S
WORD $0xcec086c5 \ //SM4E V5.4S, V22.4S
WORD $0xcec086e5 \ //SM4E V5.4S, V23.4S
WORD $0xcec08705 \ //SM4E V5.4S, V24.4S
WORD $0xcec08725 \ //SM4E V5.4S, V25.4S
WORD $0xcec08745 \ //SM4E V5.4S, V26.4S
VREV64 V5.B16, V5.B16 \
VEXT $8, V5.B16, V5.B16, V5.B16 \
WORD $0xcec08666 \ //SM4E V6.4S, V19.4S
WORD $0xcec08686 \ //SM4E V6.4S, V20.4S
WORD $0xcec086a6 \ //SM4E V6.4S, V21.4S
WORD $0xcec086c6 \ //SM4E V6.4S, V22.4S
WORD $0xcec086e6 \ //SM4E V6.4S, V23.4S
WORD $0xcec08706 \ //SM4E V6.4S, V24.4S
WORD $0xcec08726 \ //SM4E V6.4S, V25.4S
WORD $0xcec08746 \ //SM4E V6.4S, V26.4S
VREV64 V6.B16, V6.B16 \
VEXT $8, V6.B16, V6.B16, V6.B16 \
WORD $0xcec08667 \ //SM4E V7.4S, V19.4S
WORD $0xcec08687 \ //SM4E V7.4S, V20.4S
WORD $0xcec086a7 \ //SM4E V7.4S, V21.4S
WORD $0xcec086c7 \ //SM4E V7.4S, V22.4S
WORD $0xcec086e7 \ //SM4E V7.4S, V23.4S
WORD $0xcec08707 \ //SM4E V7.4S, V24.4S
WORD $0xcec08727 \ //SM4E V7.4S, V25.4S
WORD $0xcec08747 \ //SM4E V7.4S, V26.4S
VREV64 V7.B16, V7.B16 \
VEXT $8, V7.B16, V7.B16, V7.B16
// func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) // func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0 TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0

View File

@ -55,7 +55,7 @@ func decryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
//go:noescape //go:noescape
func decryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte) func decryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
func (x *xts) CryptBlocks(dst, src []byte) { func validateXtsInput(dst, src []byte) {
if len(dst) < len(src) { if len(dst) < len(src) {
panic("xts: dst is smaller than src") panic("xts: dst is smaller than src")
} }
@ -65,6 +65,10 @@ func (x *xts) CryptBlocks(dst, src []byte) {
if alias.InexactOverlap(dst[:len(src)], src) { if alias.InexactOverlap(dst[:len(src)], src) {
panic("xts: invalid buffer overlap") panic("xts: invalid buffer overlap")
} }
}
func (x *xts) CryptBlocks(dst, src []byte) {
validateXtsInput(dst, src)
if x.enc == xtsEncrypt { if x.enc == xtsEncrypt {
if x.isGB { if x.isGB {
encryptSm4XtsGB(&x.b.enc[0], &x.tweak, dst, src) encryptSm4XtsGB(&x.b.enc[0], &x.tweak, dst, src)

84
sm4/sm4ni_macros_arm64.s Normal file
View File

@ -0,0 +1,84 @@
// sm4eEnc1block runs all 32 SM4 rounds on the single block held in V0
// using the SM4E instruction (WORD-encoded; each SM4E consumes the four
// round keys in one of V19-V26). The trailing VREV64/VEXT pair reorders
// the result words into the layout the callers expect.
#define sm4eEnc1block() \
	WORD $0xcec08660 \ //SM4E V0.4S, V19.4S
	WORD $0xcec08680 \ //SM4E V0.4S, V20.4S
	WORD $0xcec086a0 \ //SM4E V0.4S, V21.4S
	WORD $0xcec086c0 \ //SM4E V0.4S, V22.4S
	WORD $0xcec086e0 \ //SM4E V0.4S, V23.4S
	WORD $0xcec08700 \ //SM4E V0.4S, V24.4S
	WORD $0xcec08720 \ //SM4E V0.4S, V25.4S
	WORD $0xcec08740 \ //SM4E V0.4S, V26.4S
	VREV64 V0.B16, V0.B16 \
	VEXT $8, V0.B16, V0.B16, V0.B16
// sm4eEnc8blocks encrypts the eight blocks held in V0-V7: it reuses
// sm4eEnc1block for V0 and repeats the identical SM4E round sequence
// (keys in V19-V26) plus the VREV64/VEXT fix-up for V1-V7.
#define sm4eEnc8blocks() \
	sm4eEnc1block() \
	WORD $0xcec08661 \ //SM4E V1.4S, V19.4S
	WORD $0xcec08681 \ //SM4E V1.4S, V20.4S
	WORD $0xcec086a1 \ //SM4E V1.4S, V21.4S
	WORD $0xcec086c1 \ //SM4E V1.4S, V22.4S
	WORD $0xcec086e1 \ //SM4E V1.4S, V23.4S
	WORD $0xcec08701 \ //SM4E V1.4S, V24.4S
	WORD $0xcec08721 \ //SM4E V1.4S, V25.4S
	WORD $0xcec08741 \ //SM4E V1.4S, V26.4S
	VREV64 V1.B16, V1.B16 \
	VEXT $8, V1.B16, V1.B16, V1.B16 \
	WORD $0xcec08662 \ //SM4E V2.4S, V19.4S
	WORD $0xcec08682 \ //SM4E V2.4S, V20.4S
	WORD $0xcec086a2 \ //SM4E V2.4S, V21.4S
	WORD $0xcec086c2 \ //SM4E V2.4S, V22.4S
	WORD $0xcec086e2 \ //SM4E V2.4S, V23.4S
	WORD $0xcec08702 \ //SM4E V2.4S, V24.4S
	WORD $0xcec08722 \ //SM4E V2.4S, V25.4S
	WORD $0xcec08742 \ //SM4E V2.4S, V26.4S
	VREV64 V2.B16, V2.B16 \
	VEXT $8, V2.B16, V2.B16, V2.B16 \
	WORD $0xcec08663 \ //SM4E V3.4S, V19.4S
	WORD $0xcec08683 \ //SM4E V3.4S, V20.4S
	WORD $0xcec086a3 \ //SM4E V3.4S, V21.4S
	WORD $0xcec086c3 \ //SM4E V3.4S, V22.4S
	WORD $0xcec086e3 \ //SM4E V3.4S, V23.4S
	WORD $0xcec08703 \ //SM4E V3.4S, V24.4S
	WORD $0xcec08723 \ //SM4E V3.4S, V25.4S
	WORD $0xcec08743 \ //SM4E V3.4S, V26.4S
	VREV64 V3.B16, V3.B16 \
	VEXT $8, V3.B16, V3.B16, V3.B16 \
	WORD $0xcec08664 \ //SM4E V4.4S, V19.4S
	WORD $0xcec08684 \ //SM4E V4.4S, V20.4S
	WORD $0xcec086a4 \ //SM4E V4.4S, V21.4S
	WORD $0xcec086c4 \ //SM4E V4.4S, V22.4S
	WORD $0xcec086e4 \ //SM4E V4.4S, V23.4S
	WORD $0xcec08704 \ //SM4E V4.4S, V24.4S
	WORD $0xcec08724 \ //SM4E V4.4S, V25.4S
	WORD $0xcec08744 \ //SM4E V4.4S, V26.4S
	VREV64 V4.B16, V4.B16 \
	VEXT $8, V4.B16, V4.B16, V4.B16 \
	WORD $0xcec08665 \ //SM4E V5.4S, V19.4S
	WORD $0xcec08685 \ //SM4E V5.4S, V20.4S
	WORD $0xcec086a5 \ //SM4E V5.4S, V21.4S
	WORD $0xcec086c5 \ //SM4E V5.4S, V22.4S
	WORD $0xcec086e5 \ //SM4E V5.4S, V23.4S
	WORD $0xcec08705 \ //SM4E V5.4S, V24.4S
	WORD $0xcec08725 \ //SM4E V5.4S, V25.4S
	WORD $0xcec08745 \ //SM4E V5.4S, V26.4S
	VREV64 V5.B16, V5.B16 \
	VEXT $8, V5.B16, V5.B16, V5.B16 \
	WORD $0xcec08666 \ //SM4E V6.4S, V19.4S
	WORD $0xcec08686 \ //SM4E V6.4S, V20.4S
	WORD $0xcec086a6 \ //SM4E V6.4S, V21.4S
	WORD $0xcec086c6 \ //SM4E V6.4S, V22.4S
	WORD $0xcec086e6 \ //SM4E V6.4S, V23.4S
	WORD $0xcec08706 \ //SM4E V6.4S, V24.4S
	WORD $0xcec08726 \ //SM4E V6.4S, V25.4S
	WORD $0xcec08746 \ //SM4E V6.4S, V26.4S
	VREV64 V6.B16, V6.B16 \
	VEXT $8, V6.B16, V6.B16, V6.B16 \
	WORD $0xcec08667 \ //SM4E V7.4S, V19.4S
	WORD $0xcec08687 \ //SM4E V7.4S, V20.4S
	WORD $0xcec086a7 \ //SM4E V7.4S, V21.4S
	WORD $0xcec086c7 \ //SM4E V7.4S, V22.4S
	WORD $0xcec086e7 \ //SM4E V7.4S, V23.4S
	WORD $0xcec08707 \ //SM4E V7.4S, V24.4S
	WORD $0xcec08727 \ //SM4E V7.4S, V25.4S
	WORD $0xcec08747 \ //SM4E V7.4S, V26.4S
	VREV64 V7.B16, V7.B16 \
	VEXT $8, V7.B16, V7.B16, V7.B16

56
sm4/sm4ni_xts.go Normal file
View File

@ -0,0 +1,56 @@
//go:build (amd64 && !purego) || (arm64 && !purego)
// +build amd64,!purego arm64,!purego
package sm4
import (
"crypto/cipher"
)
// Assert that sm4CipherNI implements the xtsEncAble and xtsDecAble interfaces.
var _ xtsEncAble = (*sm4CipherNI)(nil)
var _ xtsDecAble = (*sm4CipherNI)(nil)
// xtsNI is the XTS BlockMode state for the SM4 cipher backed by the SM4E
// instruction set. It is created by NewXTSEncrypter / NewXTSDecrypter.
type xtsNI struct {
	b     *sm4CipherNI    // underlying cipher; enc/dec hold the expanded round keys
	tweak [BlockSize]byte // current tweak, updated in place by the assembly routines
	isGB  bool            // if true, follows GB/T 17964-2021
	enc   int             // xtsEncrypt or xtsDecrypt
}
// NewXTSEncrypter returns a cipher.BlockMode that encrypts in XTS mode
// starting from the given encrypted tweak. When isGB is true the tweak
// update follows GB/T 17964-2021.
func (b *sm4CipherNI) NewXTSEncrypter(encryptedTweak *[BlockSize]byte, isGB bool) cipher.BlockMode {
	mode := &xtsNI{
		b:    b,
		isGB: isGB,
		enc:  xtsEncrypt,
	}
	copy(mode.tweak[:], encryptedTweak[:])
	return mode
}
// NewXTSDecrypter returns a cipher.BlockMode that decrypts in XTS mode
// starting from the given encrypted tweak. When isGB is true the tweak
// update follows GB/T 17964-2021.
func (b *sm4CipherNI) NewXTSDecrypter(encryptedTweak *[BlockSize]byte, isGB bool) cipher.BlockMode {
	mode := &xtsNI{
		b:    b,
		isGB: isGB,
		enc:  xtsDecrypt,
	}
	copy(mode.tweak[:], encryptedTweak[:])
	return mode
}
// BlockSize returns the SM4 block size in bytes.
func (x *xtsNI) BlockSize() int {
	return BlockSize
}
// CryptBlocks encrypts or decrypts src into dst in XTS mode, advancing
// the stored tweak as it goes. Invalid buffer sizes or overlap panic via
// validateXtsInput.
func (x *xtsNI) CryptBlocks(dst, src []byte) {
	validateXtsInput(dst, src)
	switch {
	case x.enc == xtsEncrypt && x.isGB:
		encryptSm4XtsGB(&x.b.enc[0], &x.tweak, dst, src)
	case x.enc == xtsEncrypt:
		encryptSm4Xts(&x.b.enc[0], &x.tweak, dst, src)
	case x.isGB:
		decryptSm4XtsGB(&x.b.dec[0], &x.tweak, dst, src)
	default:
		decryptSm4Xts(&x.b.dec[0], &x.tweak, dst, src)
	}
}

View File

@ -41,79 +41,7 @@
#define R24_MASK V31 #define R24_MASK V31
#include "aesni_macros_arm64.s" #include "aesni_macros_arm64.s"
#include "xts_macros_arm64.s"
#define mul2Inline \
VMOV TW.D[1], I; \
ASR $63, I; \
VMOV I, K0.D2; \
VAND POLY.B16, K0.B16, K0.B16; \
\
VUSHR $63, TW.D2, K1.D2; \
VEXT $8, K1.B16, ZERO.B16, K1.B16; \
VSHL $1, TW.D2, TW.D2; \
VEOR K0.B16, TW.B16, TW.B16; \
VEOR K1.B16, TW.B16, TW.B16
#define mul2GBInline \
VREV64 TW.B16, TW.B16; \
VEXT $8, TW.B16, TW.B16, TW.B16; \
\
VMOV TW.D[0], I; \
LSL $63, I; \
ASR $63, I; \
VMOV I, K0.D2; \
VAND POLY.B16, K0.B16, K0.B16; \
\
VSHL $63, TW.D2, K1.D2; \
VEXT $8, ZERO.B16, K1.B16, K1.B16; \
VUSHR $1, TW.D2, TW.D2; \
VEOR K0.B16, TW.B16, TW.B16; \
VEOR K1.B16, TW.B16, TW.B16; \
\
VEXT $8, TW.B16, TW.B16, TW.B16; \
VREV64 TW.B16, TW.B16
#define prepare4Tweaks \
VMOV TW.B16, T0.B16; \
mul2Inline; \
VMOV TW.B16, T1.B16; \
mul2Inline; \
VMOV TW.B16, T2.B16; \
mul2Inline; \
VMOV TW.B16, T3.B16; \
mul2Inline
#define prepare8Tweaks \
prepare4Tweaks; \
VMOV TW.B16, T4.B16; \
mul2Inline; \
VMOV TW.B16, T5.B16; \
mul2Inline; \
VMOV TW.B16, T6.B16; \
mul2Inline; \
VMOV TW.B16, T7.B16; \
mul2Inline
#define prepareGB4Tweaks \
VMOV TW.B16, T0.B16; \
mul2GBInline; \
VMOV TW.B16, T1.B16; \
mul2GBInline; \
VMOV TW.B16, T2.B16; \
mul2GBInline; \
VMOV TW.B16, T3.B16; \
mul2GBInline
#define prepareGB8Tweaks \
prepareGB4Tweaks; \
VMOV TW.B16, T4.B16; \
mul2GBInline; \
VMOV TW.B16, T5.B16; \
mul2GBInline; \
VMOV TW.B16, T6.B16; \
mul2GBInline; \
VMOV TW.B16, T7.B16; \
mul2GBInline
#define load8blocks \ #define load8blocks \
VLD1.P 64(srcPtr), [B0.S4, B1.S4, B2.S4, B3.S4]; \ VLD1.P 64(srcPtr), [B0.S4, B1.S4, B2.S4, B3.S4]; \
@ -635,7 +563,6 @@ decLastCompleteBlockLoop:
VMOV B4.B16, TW.B16 VMOV B4.B16, TW.B16
VST1 [B3.B16], (RSP) VST1 [B3.B16], (RSP)
SUB $16, srcPtrLen
SUB $16, dstPtr, R7 SUB $16, dstPtr, R7
MOVD R7, R9 MOVD R7, R9
MOVD RSP, R8 MOVD RSP, R8
@ -831,7 +758,6 @@ decLastCompleteBlockLoop:
VMOV B4.B16, TW.B16 VMOV B4.B16, TW.B16
VST1 [B3.B16], (RSP) VST1 [B3.B16], (RSP)
SUB $16, srcPtrLen
SUB $16, dstPtr, R7 SUB $16, dstPtr, R7
MOVD R7, R9 MOVD R7, R9
MOVD RSP, R8 MOVD RSP, R8

72
sm4/xts_macros_arm64.s Normal file
View File

@ -0,0 +1,72 @@
// mul2Inline advances the XTS tweak: TW = TW * x in GF(2^128) using the
// standard convention (carry out of the top bit folds back in via POLY,
// whose low doubleword the caller sets to 0x87). I is clobbered; K0 and
// K1 are used as vector scratch.
// NOTE(review): in xts_sm4ni_arm64.s the names K0/K1 alias V19/V20, which
// there hold SM4 round keys — verify these scratch registers do not
// overlap live state in every including file.
#define mul2Inline \
	VMOV TW.D[1], I; \
	ASR $63, I; \
	VMOV I, K0.D2; \
	VAND POLY.B16, K0.B16, K0.B16; \
	\
	VUSHR $63, TW.D2, K1.D2; \
	VEXT $8, K1.B16, ZERO.B16, K1.B16; \
	VSHL $1, TW.D2, TW.D2; \
	VEOR K0.B16, TW.B16, TW.B16; \
	VEOR K1.B16, TW.B16, TW.B16
// mul2GBInline advances the tweak per the GB/T 17964-2021 variant: the
// tweak is byte/word reversed into the opposite bit order, shifted right
// by one with the feedback constant (POLY.D[1] = 0xE1<<56, set by the
// caller) folded in when bit 0 is set, then reversed back. I, K0 and K1
// are clobbered as scratch (see the register-overlap note on mul2Inline).
#define mul2GBInline \
	VREV64 TW.B16, TW.B16; \
	VEXT $8, TW.B16, TW.B16, TW.B16; \
	\
	VMOV TW.D[0], I; \
	LSL $63, I; \
	ASR $63, I; \
	VMOV I, K0.D2; \
	VAND POLY.B16, K0.B16, K0.B16; \
	\
	VSHL $63, TW.D2, K1.D2; \
	VEXT $8, ZERO.B16, K1.B16, K1.B16; \
	VUSHR $1, TW.D2, TW.D2; \
	VEOR K0.B16, TW.B16, TW.B16; \
	VEOR K1.B16, TW.B16, TW.B16; \
	\
	VEXT $8, TW.B16, TW.B16, TW.B16; \
	VREV64 TW.B16, TW.B16
// prepare4Tweaks snapshots the running tweak into T0..T3, doubling TW
// after each copy, so four consecutive blocks get consecutive tweaks.
#define prepare4Tweaks \
	VMOV TW.B16, T0.B16; \
	mul2Inline; \
	VMOV TW.B16, T1.B16; \
	mul2Inline; \
	VMOV TW.B16, T2.B16; \
	mul2Inline; \
	VMOV TW.B16, T3.B16; \
	mul2Inline
// prepare8Tweaks extends prepare4Tweaks to T0..T7 for the 8-block path.
#define prepare8Tweaks \
	prepare4Tweaks; \
	VMOV TW.B16, T4.B16; \
	mul2Inline; \
	VMOV TW.B16, T5.B16; \
	mul2Inline; \
	VMOV TW.B16, T6.B16; \
	mul2Inline; \
	VMOV TW.B16, T7.B16; \
	mul2Inline
// prepareGB4Tweaks is prepare4Tweaks with the GB/T 17964-2021 doubling.
#define prepareGB4Tweaks \
	VMOV TW.B16, T0.B16; \
	mul2GBInline; \
	VMOV TW.B16, T1.B16; \
	mul2GBInline; \
	VMOV TW.B16, T2.B16; \
	mul2GBInline; \
	VMOV TW.B16, T3.B16; \
	mul2GBInline
// prepareGB8Tweaks extends prepareGB4Tweaks to T0..T7 for the 8-block path.
#define prepareGB8Tweaks \
	prepareGB4Tweaks; \
	VMOV TW.B16, T4.B16; \
	mul2GBInline; \
	VMOV TW.B16, T5.B16; \
	mul2GBInline; \
	VMOV TW.B16, T6.B16; \
	mul2GBInline; \
	VMOV TW.B16, T7.B16; \
	mul2GBInline

483
sm4/xts_sm4ni_arm64.s Normal file
View File

@ -0,0 +1,483 @@
//go:build arm64 && !purego
// +build arm64,!purego
#include "textflag.h"
// Vector register allocation:
//   B0-B7 - the eight data blocks in flight
//   POLY  - GF(2^128) feedback constant for tweak doubling
//   ZERO  - all-zero scratch vector
//   TW    - the running tweak
//   T0-T7 - per-block tweak copies for the 8-block path
//   K0-K7 - the 32 SM4 round keys, four per register (V19-V26, matching
//           the hard-coded SM4E encodings in sm4ni_macros_arm64.s)
// NOTE(review): the shared xts_macros_arm64.s tweak macros use K0/K1 as
// scratch; with K0/K1 = V19/V20 (round keys) here, confirm mul2Inline /
// mul2GBInline cannot corrupt the key schedule.
#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7
#define POLY V8
#define ZERO V9
#define TW V10
#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14
#define T4 V15
#define T5 V16
#define T6 V17
#define T7 V18
#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22
#define K4 V23
#define K5 V24
#define K6 V25
#define K7 V26
#include "sm4ni_macros_arm64.s"
#include "xts_macros_arm64.s"
// load8blocks loads eight 16-byte blocks from srcPtr (post-incremented)
// and XORs each with its per-block tweak T0..T7 (XTS pre-whitening).
#define load8blocks \
	VLD1.P 64(srcPtr), [B0.S4, B1.S4, B2.S4, B3.S4]; \
	VEOR T0.B16, B0.B16, B0.B16; \
	VEOR T1.B16, B1.B16, B1.B16; \
	VEOR T2.B16, B2.B16, B2.B16; \
	VEOR T3.B16, B3.B16, B3.B16; \
	\
	VLD1.P 64(srcPtr), [B4.S4, B5.S4, B6.S4, B7.S4]; \
	VEOR T4.B16, B4.B16, B4.B16; \
	VEOR T5.B16, B5.B16, B5.B16; \
	VEOR T6.B16, B6.B16, B6.B16; \
	VEOR T7.B16, B7.B16, B7.B16
// store8blocks XORs the eight processed blocks with their tweaks again
// (XTS post-whitening) and stores them to dstPtr (post-incremented).
#define store8blocks \
	VEOR T0.B16, B0.B16, B0.B16; \
	VEOR T1.B16, B1.B16, B1.B16; \
	VEOR T2.B16, B2.B16, B2.B16; \
	VEOR T3.B16, B3.B16, B3.B16; \
	VEOR T4.B16, B4.B16, B4.B16; \
	VEOR T5.B16, B5.B16, B5.B16; \
	VEOR T6.B16, B6.B16, B6.B16; \
	VEOR T7.B16, B7.B16, B7.B16; \
	\
	VST1.P [B0.S4, B1.S4, B2.S4, B3.S4], 64(dstPtr); \
	VST1.P [B4.S4, B5.S4, B6.S4, B7.S4], 64(dstPtr)
// General-purpose register aliases for the assembly function arguments.
#define dstPtr R2
#define srcPtr R3
#define rk R0
#define twPtr R1
#define srcPtrLen R4
#define I R5 // integer scratch, also used inside the tweak macros
// func encryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
//
// XTS-SM4 encryption via SM4E: 8 blocks at a time, then single blocks,
// with a 1..15-byte tail handled by ciphertext stealing. The advanced
// tweak is written back to *tweak on return.
TEXT ·encryptSm4Xts(SB),0,$128-64
	MOVD xk+0(FP), rk
	MOVD tweak+8(FP), twPtr
	MOVD dst+16(FP), dstPtr
	MOVD src+40(FP), srcPtr
	MOVD src_len+48(FP), srcPtrLen
	VEOR POLY.B16, POLY.B16, POLY.B16
	VEOR ZERO.B16, ZERO.B16, ZERO.B16
	// Standard XTS feedback constant 0x87 in the low doubleword of POLY.
	MOVD $0x87, I
	VMOV I, POLY.D[0]
	// For SM4 round keys are stored in: K0 .. K7
	VLD1.P 64(rk), [K0.S4, K1.S4, K2.S4, K3.S4]
	VLD1.P 64(rk), [K4.S4, K5.S4, K6.S4, K7.S4]
	VLD1 (twPtr), [TW.B16]
// Main loop: 8 blocks (128 bytes) per iteration.
xtsSm4EncOctets:
	CMP $128, srcPtrLen
	BLT xtsSm4EncSingles
	SUB $128, srcPtrLen
	prepare8Tweaks
	load8blocks
	sm4eEnc8blocks()
	store8blocks
	B xtsSm4EncOctets
// One full block per iteration for the remaining 16..127 bytes.
xtsSm4EncSingles:
	CMP $16, srcPtrLen
	BLT xtsSm4EncTail
	SUB $16, srcPtrLen
	VLD1.P 16(srcPtr), [B0.S4]
	VEOR TW.B16, B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1.P [B0.S4], 16(dstPtr)
	mul2Inline
	B xtsSm4EncSingles
// Ciphertext stealing: assemble the final input block on the stack from
// the tail bytes plus the tail of the previous ciphertext block (at
// dstPtr-16), while moving the stolen ciphertext bytes to the end of dst.
xtsSm4EncTail:
	CBZ srcPtrLen, xtsSm4EncDone
	SUB $16, dstPtr, R7
	MOVD R7, R9
	MOVD RSP, R8
	VLD1 (R7), [B0.B16]
	VST1 [B0.B16], (R8)
	// Copy 8/4/2/1 bytes at a time, driven by the bits of srcPtrLen.
	TBZ $3, srcPtrLen, less_than8
	MOVD.P 8(srcPtr), R11
	MOVD.P R11, 8(R8)
	MOVD.P 8(R7), R12
	MOVD.P R12, 8(dstPtr)
less_than8:
	TBZ $2, srcPtrLen, less_than4
	MOVWU.P 4(srcPtr), R11
	MOVWU.P R11, 4(R8)
	MOVWU.P 4(R7), R12
	MOVWU.P R12, 4(dstPtr)
less_than4:
	TBZ $1, srcPtrLen, less_than2
	MOVHU.P 2(srcPtr), R11
	MOVHU.P R11, 2(R8)
	MOVHU.P 2(R7), R12
	MOVHU.P R12, 2(dstPtr)
less_than2:
	TBZ $0, srcPtrLen, xtsSm4EncTailEnc
	MOVBU (srcPtr), R11
	MOVBU R11, (R8)
	MOVBU (R7), R12
	MOVBU R12, (dstPtr)
// Encrypt the reassembled block and overwrite the previous ciphertext block.
xtsSm4EncTailEnc:
	VLD1 (RSP), [B0.B16]
	VEOR TW.B16, B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1 [B0.B16], (R9)
xtsSm4EncDone:
	VST1 [TW.B16], (twPtr)
	RET
// func encryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
//
// GB/T 17964-2021 XTS-SM4 encryption. Identical structure to
// encryptSm4Xts, but the tweak is doubled with mul2GBInline /
// prepareGB8Tweaks and the feedback constant is the bit-reversed
// polynomial 0xE1<<56 held in the high doubleword of POLY.
TEXT ·encryptSm4XtsGB(SB),0,$128-64
	MOVD xk+0(FP), rk
	MOVD tweak+8(FP), twPtr
	MOVD dst+16(FP), dstPtr
	MOVD src+40(FP), srcPtr
	MOVD src_len+48(FP), srcPtrLen
	VEOR POLY.B16, POLY.B16, POLY.B16
	VEOR ZERO.B16, ZERO.B16, ZERO.B16
	// GB variant feedback constant: 0xE1 << 56 in POLY.D[1].
	MOVD $0xE1, I
	LSL $56, I
	VMOV I, POLY.D[1]
	// For SM4 round keys are stored in: K0 .. K7
	VLD1.P 64(rk), [K0.S4, K1.S4, K2.S4, K3.S4]
	VLD1.P 64(rk), [K4.S4, K5.S4, K6.S4, K7.S4]
	VLD1 (twPtr), [TW.B16]
// Main loop: 8 blocks (128 bytes) per iteration.
xtsSm4EncOctets:
	CMP $128, srcPtrLen
	BLT xtsSm4EncSingles
	SUB $128, srcPtrLen
	prepareGB8Tweaks
	load8blocks
	sm4eEnc8blocks()
	store8blocks
	B xtsSm4EncOctets
// One full block per iteration for the remaining 16..127 bytes.
xtsSm4EncSingles:
	CMP $16, srcPtrLen
	BLT xtsSm4EncTail
	SUB $16, srcPtrLen
	VLD1.P 16(srcPtr), [B0.S4]
	VEOR TW.B16, B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1.P [B0.S4], 16(dstPtr)
	mul2GBInline
	B xtsSm4EncSingles
// Ciphertext stealing for a 1..15-byte tail (see encryptSm4Xts).
xtsSm4EncTail:
	CBZ srcPtrLen, xtsSm4EncDone
	SUB $16, dstPtr, R7
	MOVD R7, R9
	MOVD RSP, R8
	VLD1 (R7), [B0.B16]
	VST1 [B0.B16], (R8)
	// Copy 8/4/2/1 bytes at a time, driven by the bits of srcPtrLen.
	TBZ $3, srcPtrLen, less_than8
	MOVD.P 8(srcPtr), R11
	MOVD.P R11, 8(R8)
	MOVD.P 8(R7), R12
	MOVD.P R12, 8(dstPtr)
less_than8:
	TBZ $2, srcPtrLen, less_than4
	MOVWU.P 4(srcPtr), R11
	MOVWU.P R11, 4(R8)
	MOVWU.P 4(R7), R12
	MOVWU.P R12, 4(dstPtr)
less_than4:
	TBZ $1, srcPtrLen, less_than2
	MOVHU.P 2(srcPtr), R11
	MOVHU.P R11, 2(R8)
	MOVHU.P 2(R7), R12
	MOVHU.P R12, 2(dstPtr)
less_than2:
	TBZ $0, srcPtrLen, xtsSm4EncTailEnc
	MOVBU (srcPtr), R11
	MOVBU R11, (R8)
	MOVBU (R7), R12
	MOVBU R12, (dstPtr)
// Encrypt the reassembled block and overwrite the previous ciphertext block.
xtsSm4EncTailEnc:
	VLD1 (RSP), [B0.B16]
	VEOR TW.B16, B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1 [B0.B16], (R9)
xtsSm4EncDone:
	VST1 [TW.B16], (twPtr)
	RET
// func decryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
//
// XTS-SM4 decryption via SM4E. The caller supplies round keys in
// decryption order (x.b.dec), so the same SM4E sequence performs
// decryption. For ciphertext stealing, the single-block loop stops while
// at least 32 bytes remain so the final full block is processed with the
// swapped tweak order required by XTS CTS.
// NOTE(review): the 8-block loop runs whenever >=128 bytes remain; for a
// non-block-multiple total length in (128, 144) this would leave a
// sub-16-byte tail with no preceding block reserved for stealing —
// confirm the loop bound or caller-side length handling covers this case.
TEXT ·decryptSm4Xts(SB),0,$128-64
	MOVD xk+0(FP), rk
	MOVD tweak+8(FP), twPtr
	MOVD dst+16(FP), dstPtr
	MOVD src+40(FP), srcPtr
	MOVD src_len+48(FP), srcPtrLen
	VEOR POLY.B16, POLY.B16, POLY.B16
	VEOR ZERO.B16, ZERO.B16, ZERO.B16
	// Standard XTS feedback constant 0x87 in the low doubleword of POLY.
	MOVD $0x87, I
	VMOV I, POLY.D[0]
	// For SM4 round keys are stored in: K0 .. K7
	VLD1.P 64(rk), [K0.S4, K1.S4, K2.S4, K3.S4]
	VLD1.P 64(rk), [K4.S4, K5.S4, K6.S4, K7.S4]
	VLD1 (twPtr), [TW.B16]
// Main loop: 8 blocks (128 bytes) per iteration.
xtsSm4DecOctets:
	CMP $128, srcPtrLen
	BLT xtsSm4DecSingles
	SUB $128, srcPtrLen
	prepare8Tweaks
	load8blocks
	sm4eEnc8blocks()
	store8blocks
	B xtsSm4DecOctets
// Single blocks while >=32 bytes remain: the last 16..31 bytes are
// reserved for the ciphertext-stealing tail below.
xtsSm4DecSingles:
	CMP $32, srcPtrLen
	BLT xtsSm4DecTail
	SUB $16, srcPtrLen
	VLD1.P 16(srcPtr), [B0.S4]
	VEOR TW.B16, B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1.P [B0.S4], 16(dstPtr)
	mul2Inline
	B xtsSm4DecSingles
// Tail: either one exact block, or a full block plus a 1..15-byte
// remainder handled with ciphertext stealing. CTS decrypt swaps the
// tweak order: the penultimate ciphertext block is decrypted with the
// *next* tweak (TW after mul2Inline), and the reassembled final block
// with the saved tweak (kept in B4).
xtsSm4DecTail:
	CBZ srcPtrLen, xtsSm4DecDone
	CMP $16, srcPtrLen
	BEQ xtsSm4DecLastBlock
	VMOV TW.B16, B4.B16
	mul2Inline
	VLD1.P 16(srcPtr), [B0.S4]
	VEOR TW.B16, B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1.P [B0.S4], 16(dstPtr)
	VMOV B4.B16, TW.B16
	VST1 [B0.B16], (RSP)
	SUB $16, dstPtr, R7
	MOVD R7, R9
	MOVD RSP, R8
	// Copy 8/4/2/1 bytes at a time, driven by the bits of srcPtrLen:
	// tail ciphertext -> stack block, stolen plaintext tail -> end of dst.
	TBZ $3, srcPtrLen, less_than8
	MOVD.P 8(srcPtr), R11
	MOVD.P R11, 8(R8)
	MOVD.P 8(R7), R12
	MOVD.P R12, 8(dstPtr)
less_than8:
	TBZ $2, srcPtrLen, less_than4
	MOVWU.P 4(srcPtr), R11
	MOVWU.P R11, 4(R8)
	MOVWU.P 4(R7), R12
	MOVWU.P R12, 4(dstPtr)
less_than4:
	TBZ $1, srcPtrLen, less_than2
	MOVHU.P 2(srcPtr), R11
	MOVHU.P R11, 2(R8)
	MOVHU.P 2(R7), R12
	MOVHU.P R12, 2(dstPtr)
less_than2:
	TBZ $0, srcPtrLen, xtsSm4DecTailDec
	MOVBU (srcPtr), R11
	MOVBU R11, (R8)
	MOVBU (R7), R12
	MOVBU R12, (dstPtr)
// Decrypt the reassembled block with the saved tweak and overwrite the
// penultimate output block.
xtsSm4DecTailDec:
	VLD1 (RSP), [B0.B16]
	VEOR TW.B16, B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1 [B0.B16], (R9)
	B xtsSm4DecDone
// Exact multiple of the block size: no stealing needed.
xtsSm4DecLastBlock:
	VLD1.P 16(srcPtr), [B0.S4]
	VEOR TW.B16, B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1.P [B0.S4], 16(dstPtr)
	mul2Inline
xtsSm4DecDone:
	VST1 [TW.B16], (twPtr)
	RET
// func decryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
//
// GB/T 17964-2021 XTS-SM4 decryption. Identical structure to
// decryptSm4Xts, but tweak doubling uses mul2GBInline /
// prepareGB8Tweaks with the bit-reversed constant 0xE1<<56 in POLY.D[1].
// NOTE(review): same >=128 loop-bound concern as decryptSm4Xts for
// non-block-multiple lengths in (128, 144).
TEXT ·decryptSm4XtsGB(SB),0,$128-64
	MOVD xk+0(FP), rk
	MOVD tweak+8(FP), twPtr
	MOVD dst+16(FP), dstPtr
	MOVD src+40(FP), srcPtr
	MOVD src_len+48(FP), srcPtrLen
	VEOR POLY.B16, POLY.B16, POLY.B16
	VEOR ZERO.B16, ZERO.B16, ZERO.B16
	// GB variant feedback constant: 0xE1 << 56 in POLY.D[1].
	MOVD $0xE1, I
	LSL $56, I
	VMOV I, POLY.D[1]
	// For SM4 round keys are stored in: K0 .. K7
	VLD1.P 64(rk), [K0.S4, K1.S4, K2.S4, K3.S4]
	VLD1.P 64(rk), [K4.S4, K5.S4, K6.S4, K7.S4]
	VLD1 (twPtr), [TW.B16]
// Main loop: 8 blocks (128 bytes) per iteration.
xtsSm4DecOctets:
	CMP $128, srcPtrLen
	BLT xtsSm4DecSingles
	SUB $128, srcPtrLen
	prepareGB8Tweaks
	load8blocks
	sm4eEnc8blocks()
	store8blocks
	B xtsSm4DecOctets
// Single blocks while >=32 bytes remain: the last 16..31 bytes are
// reserved for the ciphertext-stealing tail below.
xtsSm4DecSingles:
	CMP $32, srcPtrLen
	BLT xtsSm4DecTail
	SUB $16, srcPtrLen
	VLD1.P 16(srcPtr), [B0.S4]
	VEOR TW.B16, B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1.P [B0.S4], 16(dstPtr)
	mul2GBInline
	B xtsSm4DecSingles
// Tail: CTS decrypt with swapped tweak order — the penultimate
// ciphertext block uses the advanced tweak, the reassembled final block
// uses the saved tweak (kept in B4). See decryptSm4Xts.
xtsSm4DecTail:
	CBZ srcPtrLen, xtsSm4DecDone
	CMP $16, srcPtrLen
	BEQ xtsSm4DecLastBlock
	VMOV TW.B16, B4.B16
	mul2GBInline
	VLD1.P 16(srcPtr), [B0.S4]
	VEOR TW.B16, B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1.P [B0.S4], 16(dstPtr)
	VMOV B4.B16, TW.B16
	VST1 [B0.B16], (RSP)
	SUB $16, dstPtr, R7
	MOVD R7, R9
	MOVD RSP, R8
	// Copy 8/4/2/1 bytes at a time, driven by the bits of srcPtrLen.
	TBZ $3, srcPtrLen, less_than8
	MOVD.P 8(srcPtr), R11
	MOVD.P R11, 8(R8)
	MOVD.P 8(R7), R12
	MOVD.P R12, 8(dstPtr)
less_than8:
	TBZ $2, srcPtrLen, less_than4
	MOVWU.P 4(srcPtr), R11
	MOVWU.P R11, 4(R8)
	MOVWU.P 4(R7), R12
	MOVWU.P R12, 4(dstPtr)
less_than4:
	TBZ $1, srcPtrLen, less_than2
	MOVHU.P 2(srcPtr), R11
	MOVHU.P R11, 2(R8)
	MOVHU.P 2(R7), R12
	MOVHU.P R12, 2(dstPtr)
less_than2:
	TBZ $0, srcPtrLen, xtsSm4DecTailDec
	MOVBU (srcPtr), R11
	MOVBU R11, (R8)
	MOVBU (R7), R12
	MOVBU R12, (dstPtr)
// Decrypt the reassembled block with the saved tweak and overwrite the
// penultimate output block.
xtsSm4DecTailDec:
	VLD1 (RSP), [B0.B16]
	VEOR TW.B16, B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1 [B0.B16], (R9)
	B xtsSm4DecDone
// Exact multiple of the block size: no stealing needed.
xtsSm4DecLastBlock:
	VLD1.P 16(srcPtr), [B0.S4]
	VEOR TW.B16, B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1.P [B0.S4], 16(dstPtr)
	mul2GBInline
xtsSm4DecDone:
	VST1 [TW.B16], (twPtr)
	RET