mirror of https://github.com/emmansun/gmsm.git (synced 2025-04-26 04:06:18 +08:00)

sm4: xts sm4ni arm64 #151

parent 16c5556655
commit 1019226803
@@ -62,90 +62,7 @@
 	VPMULL T0.D1, T2.D1, T3.Q1 \
 	VEOR T3.B16, ACCM.B16, ACCM.B16
 
-#define sm4eEnc1block() \
-	WORD $0xcec08660 \ //SM4E V0.4S, V19.4S
-	WORD $0xcec08680 \ //SM4E V0.4S, V20.4S
-	WORD $0xcec086a0 \ //SM4E V0.4S, V21.4S
-	WORD $0xcec086c0 \ //SM4E V0.4S, V22.4S
-	WORD $0xcec086e0 \ //SM4E V0.4S, V23.4S
-	WORD $0xcec08700 \ //SM4E V0.4S, V24.4S
-	WORD $0xcec08720 \ //SM4E V0.4S, V25.4S
-	WORD $0xcec08740 \ //SM4E V0.4S, V26.4S
-	VREV64 V0.B16, V0.B16 \
-	VEXT $8, V0.B16, V0.B16, V0.B16
-
-#define sm4eEnc8blocks() \
-	sm4eEnc1block() \
-	WORD $0xcec08661 \ //SM4E V1.4S, V19.4S
-	WORD $0xcec08681 \ //SM4E V1.4S, V20.4S
-	WORD $0xcec086a1 \ //SM4E V1.4S, V21.4S
-	WORD $0xcec086c1 \ //SM4E V1.4S, V22.4S
-	WORD $0xcec086e1 \ //SM4E V1.4S, V23.4S
-	WORD $0xcec08701 \ //SM4E V1.4S, V24.4S
-	WORD $0xcec08721 \ //SM4E V1.4S, V25.4S
-	WORD $0xcec08741 \ //SM4E V1.4S, V26.4S
-	VREV64 V1.B16, V1.B16 \
-	VEXT $8, V1.B16, V1.B16, V1.B16 \
-	WORD $0xcec08662 \ //SM4E V2.4S, V19.4S
-	WORD $0xcec08682 \ //SM4E V2.4S, V20.4S
-	WORD $0xcec086a2 \ //SM4E V2.4S, V21.4S
-	WORD $0xcec086c2 \ //SM4E V2.4S, V22.4S
-	WORD $0xcec086e2 \ //SM4E V2.4S, V23.4S
-	WORD $0xcec08702 \ //SM4E V2.4S, V24.4S
-	WORD $0xcec08722 \ //SM4E V2.4S, V25.4S
-	WORD $0xcec08742 \ //SM4E V2.4S, V26.4S
-	VREV64 V2.B16, V2.B16 \
-	VEXT $8, V2.B16, V2.B16, V2.B16 \
-	WORD $0xcec08663 \ //SM4E V3.4S, V19.4S
-	WORD $0xcec08683 \ //SM4E V3.4S, V20.4S
-	WORD $0xcec086a3 \ //SM4E V3.4S, V21.4S
-	WORD $0xcec086c3 \ //SM4E V3.4S, V22.4S
-	WORD $0xcec086e3 \ //SM4E V3.4S, V23.4S
-	WORD $0xcec08703 \ //SM4E V3.4S, V24.4S
-	WORD $0xcec08723 \ //SM4E V3.4S, V25.4S
-	WORD $0xcec08743 \ //SM4E V3.4S, V26.4S
-	VREV64 V3.B16, V3.B16 \
-	VEXT $8, V3.B16, V3.B16, V3.B16 \
-	WORD $0xcec08664 \ //SM4E V4.4S, V19.4S
-	WORD $0xcec08684 \ //SM4E V4.4S, V20.4S
-	WORD $0xcec086a4 \ //SM4E V4.4S, V21.4S
-	WORD $0xcec086c4 \ //SM4E V4.4S, V22.4S
-	WORD $0xcec086e4 \ //SM4E V4.4S, V23.4S
-	WORD $0xcec08704 \ //SM4E V4.4S, V24.4S
-	WORD $0xcec08724 \ //SM4E V4.4S, V25.4S
-	WORD $0xcec08744 \ //SM4E V4.4S, V26.4S
-	VREV64 V4.B16, V4.B16 \
-	VEXT $8, V4.B16, V4.B16, V4.B16 \
-	WORD $0xcec08665 \ //SM4E V5.4S, V19.4S
-	WORD $0xcec08685 \ //SM4E V5.4S, V20.4S
-	WORD $0xcec086a5 \ //SM4E V5.4S, V21.4S
-	WORD $0xcec086c5 \ //SM4E V5.4S, V22.4S
-	WORD $0xcec086e5 \ //SM4E V5.4S, V23.4S
-	WORD $0xcec08705 \ //SM4E V5.4S, V24.4S
-	WORD $0xcec08725 \ //SM4E V5.4S, V25.4S
-	WORD $0xcec08745 \ //SM4E V5.4S, V26.4S
-	VREV64 V5.B16, V5.B16 \
-	VEXT $8, V5.B16, V5.B16, V5.B16 \
-	WORD $0xcec08666 \ //SM4E V6.4S, V19.4S
-	WORD $0xcec08686 \ //SM4E V6.4S, V20.4S
-	WORD $0xcec086a6 \ //SM4E V6.4S, V21.4S
-	WORD $0xcec086c6 \ //SM4E V6.4S, V22.4S
-	WORD $0xcec086e6 \ //SM4E V6.4S, V23.4S
-	WORD $0xcec08706 \ //SM4E V6.4S, V24.4S
-	WORD $0xcec08726 \ //SM4E V6.4S, V25.4S
-	WORD $0xcec08746 \ //SM4E V6.4S, V26.4S
-	VREV64 V6.B16, V6.B16 \
-	VEXT $8, V6.B16, V6.B16, V6.B16 \
-	WORD $0xcec08667 \ //SM4E V7.4S, V19.4S
-	WORD $0xcec08687 \ //SM4E V7.4S, V20.4S
-	WORD $0xcec086a7 \ //SM4E V7.4S, V21.4S
-	WORD $0xcec086c7 \ //SM4E V7.4S, V22.4S
-	WORD $0xcec086e7 \ //SM4E V7.4S, V23.4S
-	WORD $0xcec08707 \ //SM4E V7.4S, V24.4S
-	WORD $0xcec08727 \ //SM4E V7.4S, V25.4S
-	WORD $0xcec08747 \ //SM4E V7.4S, V26.4S
-	VREV64 V7.B16, V7.B16 \
-	VEXT $8, V7.B16, V7.B16, V7.B16
+#include "sm4ni_macros_arm64.s"
 
 // func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
 TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0
@@ -55,7 +55,7 @@ func decryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
 //go:noescape
 func decryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
 
-func (x *xts) CryptBlocks(dst, src []byte) {
+func validateXtsInput(dst, src []byte) {
 	if len(dst) < len(src) {
 		panic("xts: dst is smaller than src")
 	}
@@ -65,6 +65,10 @@ func (x *xts) CryptBlocks(dst, src []byte) {
 	if alias.InexactOverlap(dst[:len(src)], src) {
 		panic("xts: invalid buffer overlap")
 	}
+}
+
+func (x *xts) CryptBlocks(dst, src []byte) {
+	validateXtsInput(dst, src)
 	if x.enc == xtsEncrypt {
 		if x.isGB {
 			encryptSm4XtsGB(&x.b.enc[0], &x.tweak, dst, src)
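Side note: validateXtsInput is now shared by the existing NEON-based xts mode and the new SM4-instruction-based xtsNI mode introduced below, so both paths enforce identical preconditions. A quick illustration of the overlap rule (a sketch, not package code; x stands for any XTS cipher.BlockMode produced by this package):

	buf := make([]byte, 64)
	x.CryptBlocks(buf, buf)              // allowed: dst and src coincide exactly (in-place)
	// x.CryptBlocks(buf[16:], buf[:48]) // panics: dst and src overlap, but not exactly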
sm4/sm4ni_macros_arm64.s (new file, 84 lines)
@@ -0,0 +1,84 @@
+#define sm4eEnc1block() \
+	WORD $0xcec08660 \ //SM4E V0.4S, V19.4S
+	WORD $0xcec08680 \ //SM4E V0.4S, V20.4S
+	WORD $0xcec086a0 \ //SM4E V0.4S, V21.4S
+	WORD $0xcec086c0 \ //SM4E V0.4S, V22.4S
+	WORD $0xcec086e0 \ //SM4E V0.4S, V23.4S
+	WORD $0xcec08700 \ //SM4E V0.4S, V24.4S
+	WORD $0xcec08720 \ //SM4E V0.4S, V25.4S
+	WORD $0xcec08740 \ //SM4E V0.4S, V26.4S
+	VREV64 V0.B16, V0.B16 \
+	VEXT $8, V0.B16, V0.B16, V0.B16
+
+#define sm4eEnc8blocks() \
+	sm4eEnc1block() \
+	WORD $0xcec08661 \ //SM4E V1.4S, V19.4S
+	WORD $0xcec08681 \ //SM4E V1.4S, V20.4S
+	WORD $0xcec086a1 \ //SM4E V1.4S, V21.4S
+	WORD $0xcec086c1 \ //SM4E V1.4S, V22.4S
+	WORD $0xcec086e1 \ //SM4E V1.4S, V23.4S
+	WORD $0xcec08701 \ //SM4E V1.4S, V24.4S
+	WORD $0xcec08721 \ //SM4E V1.4S, V25.4S
+	WORD $0xcec08741 \ //SM4E V1.4S, V26.4S
+	VREV64 V1.B16, V1.B16 \
+	VEXT $8, V1.B16, V1.B16, V1.B16 \
+	WORD $0xcec08662 \ //SM4E V2.4S, V19.4S
+	WORD $0xcec08682 \ //SM4E V2.4S, V20.4S
+	WORD $0xcec086a2 \ //SM4E V2.4S, V21.4S
+	WORD $0xcec086c2 \ //SM4E V2.4S, V22.4S
+	WORD $0xcec086e2 \ //SM4E V2.4S, V23.4S
+	WORD $0xcec08702 \ //SM4E V2.4S, V24.4S
+	WORD $0xcec08722 \ //SM4E V2.4S, V25.4S
+	WORD $0xcec08742 \ //SM4E V2.4S, V26.4S
+	VREV64 V2.B16, V2.B16 \
+	VEXT $8, V2.B16, V2.B16, V2.B16 \
+	WORD $0xcec08663 \ //SM4E V3.4S, V19.4S
+	WORD $0xcec08683 \ //SM4E V3.4S, V20.4S
+	WORD $0xcec086a3 \ //SM4E V3.4S, V21.4S
+	WORD $0xcec086c3 \ //SM4E V3.4S, V22.4S
+	WORD $0xcec086e3 \ //SM4E V3.4S, V23.4S
+	WORD $0xcec08703 \ //SM4E V3.4S, V24.4S
+	WORD $0xcec08723 \ //SM4E V3.4S, V25.4S
+	WORD $0xcec08743 \ //SM4E V3.4S, V26.4S
+	VREV64 V3.B16, V3.B16 \
+	VEXT $8, V3.B16, V3.B16, V3.B16 \
+	WORD $0xcec08664 \ //SM4E V4.4S, V19.4S
+	WORD $0xcec08684 \ //SM4E V4.4S, V20.4S
+	WORD $0xcec086a4 \ //SM4E V4.4S, V21.4S
+	WORD $0xcec086c4 \ //SM4E V4.4S, V22.4S
+	WORD $0xcec086e4 \ //SM4E V4.4S, V23.4S
+	WORD $0xcec08704 \ //SM4E V4.4S, V24.4S
+	WORD $0xcec08724 \ //SM4E V4.4S, V25.4S
+	WORD $0xcec08744 \ //SM4E V4.4S, V26.4S
+	VREV64 V4.B16, V4.B16 \
+	VEXT $8, V4.B16, V4.B16, V4.B16 \
+	WORD $0xcec08665 \ //SM4E V5.4S, V19.4S
+	WORD $0xcec08685 \ //SM4E V5.4S, V20.4S
+	WORD $0xcec086a5 \ //SM4E V5.4S, V21.4S
+	WORD $0xcec086c5 \ //SM4E V5.4S, V22.4S
+	WORD $0xcec086e5 \ //SM4E V5.4S, V23.4S
+	WORD $0xcec08705 \ //SM4E V5.4S, V24.4S
+	WORD $0xcec08725 \ //SM4E V5.4S, V25.4S
+	WORD $0xcec08745 \ //SM4E V5.4S, V26.4S
+	VREV64 V5.B16, V5.B16 \
+	VEXT $8, V5.B16, V5.B16, V5.B16 \
+	WORD $0xcec08666 \ //SM4E V6.4S, V19.4S
+	WORD $0xcec08686 \ //SM4E V6.4S, V20.4S
+	WORD $0xcec086a6 \ //SM4E V6.4S, V21.4S
+	WORD $0xcec086c6 \ //SM4E V6.4S, V22.4S
+	WORD $0xcec086e6 \ //SM4E V6.4S, V23.4S
+	WORD $0xcec08706 \ //SM4E V6.4S, V24.4S
+	WORD $0xcec08726 \ //SM4E V6.4S, V25.4S
+	WORD $0xcec08746 \ //SM4E V6.4S, V26.4S
+	VREV64 V6.B16, V6.B16 \
+	VEXT $8, V6.B16, V6.B16, V6.B16 \
+	WORD $0xcec08667 \ //SM4E V7.4S, V19.4S
+	WORD $0xcec08687 \ //SM4E V7.4S, V20.4S
+	WORD $0xcec086a7 \ //SM4E V7.4S, V21.4S
+	WORD $0xcec086c7 \ //SM4E V7.4S, V22.4S
+	WORD $0xcec086e7 \ //SM4E V7.4S, V23.4S
+	WORD $0xcec08707 \ //SM4E V7.4S, V24.4S
+	WORD $0xcec08727 \ //SM4E V7.4S, V25.4S
+	WORD $0xcec08747 \ //SM4E V7.4S, V26.4S
+	VREV64 V7.B16, V7.B16 \
+	VEXT $8, V7.B16, V7.B16, V7.B16
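The Go assembler has no SM4E mnemonic, so this macro file emits raw instruction words. A small generator sketch (assuming the A64 encoding of SM4E: base opcode 0xcec08400, source register in bits [9:5], destination register in bits [4:0]) shows how the constants above are derived and can be used to audit or regenerate them:

	package main

	import "fmt"

	// sm4e returns the A64 encoding of "SM4E Vd.4S, Vn.4S".
	func sm4e(vd, vn uint32) uint32 {
		return 0xcec08400 | vn<<5 | vd
	}

	func main() {
		// First row of sm4eEnc1block: SM4E V0.4S against round keys in V19..V26.
		for vn := uint32(19); vn <= 26; vn++ {
			fmt.Printf("WORD $0x%08x \\ //SM4E V0.4S, V%d.4S\n", sm4e(0, vn), vn)
		}
	}

Running it prints 0xcec08660 through 0xcec08740, matching the macro body above.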
sm4/sm4ni_xts.go (new file, 56 lines)
@@ -0,0 +1,56 @@
+//go:build (amd64 && !purego) || (arm64 && !purego)
+// +build amd64,!purego arm64,!purego
+
+package sm4
+
+import (
+	"crypto/cipher"
+)
+
+// Assert that sm4CipherNI implements the xtsEncAble and xtsDecAble interfaces.
+var _ xtsEncAble = (*sm4CipherNI)(nil)
+var _ xtsDecAble = (*sm4CipherNI)(nil)
+
+type xtsNI struct {
+	b     *sm4CipherNI
+	tweak [BlockSize]byte
+	isGB  bool // if true, follows GB/T 17964-2021
+	enc   int
+}
+
+func (b *sm4CipherNI) NewXTSEncrypter(encryptedTweak *[BlockSize]byte, isGB bool) cipher.BlockMode {
+	var c xtsNI
+	c.b = b
+	c.enc = xtsEncrypt
+	c.isGB = isGB
+	copy(c.tweak[:], encryptedTweak[:])
+	return &c
+}
+
+func (b *sm4CipherNI) NewXTSDecrypter(encryptedTweak *[BlockSize]byte, isGB bool) cipher.BlockMode {
+	var c xtsNI
+	c.b = b
+	c.enc = xtsDecrypt
+	c.isGB = isGB
+	copy(c.tweak[:], encryptedTweak[:])
+	return &c
+}
+
+func (x *xtsNI) BlockSize() int { return BlockSize }
+
+func (x *xtsNI) CryptBlocks(dst, src []byte) {
+	validateXtsInput(dst, src)
+	if x.enc == xtsEncrypt {
+		if x.isGB {
+			encryptSm4XtsGB(&x.b.enc[0], &x.tweak, dst, src)
+		} else {
+			encryptSm4Xts(&x.b.enc[0], &x.tweak, dst, src)
+		}
+	} else {
+		if x.isGB {
+			decryptSm4XtsGB(&x.b.dec[0], &x.tweak, dst, src)
+		} else {
+			decryptSm4Xts(&x.b.dec[0], &x.tweak, dst, src)
+		}
+	}
+}
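The interface assertions above are what let a generic XTS constructor pick this fused implementation without naming the concrete type. A minimal sketch of such dispatch (hypothetical surrounding function; the real wiring lives elsewhere in the package):

	func newXTSEncrypter(b cipher.Block, encryptedTweak *[BlockSize]byte, isGB bool) (cipher.BlockMode, error) {
		// Prefer the fused hardware path when the block cipher advertises it.
		if xtsable, ok := b.(xtsEncAble); ok {
			return xtsable.NewXTSEncrypter(encryptedTweak, isGB), nil
		}
		// A generic tweak-per-block fallback would go here.
		return nil, errors.New("xts: cipher does not expose a fast path (sketch)")
	}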
@@ -41,79 +41,7 @@
 #define R24_MASK V31
 
 #include "aesni_macros_arm64.s"
-
-#define mul2Inline \
-	VMOV TW.D[1], I; \
-	ASR $63, I; \
-	VMOV I, K0.D2; \
-	VAND POLY.B16, K0.B16, K0.B16; \
-	\
-	VUSHR $63, TW.D2, K1.D2; \
-	VEXT $8, K1.B16, ZERO.B16, K1.B16; \
-	VSHL $1, TW.D2, TW.D2; \
-	VEOR K0.B16, TW.B16, TW.B16; \
-	VEOR K1.B16, TW.B16, TW.B16
-
-#define mul2GBInline \
-	VREV64 TW.B16, TW.B16; \
-	VEXT $8, TW.B16, TW.B16, TW.B16; \
-	\
-	VMOV TW.D[0], I; \
-	LSL $63, I; \
-	ASR $63, I; \
-	VMOV I, K0.D2; \
-	VAND POLY.B16, K0.B16, K0.B16; \
-	\
-	VSHL $63, TW.D2, K1.D2; \
-	VEXT $8, ZERO.B16, K1.B16, K1.B16; \
-	VUSHR $1, TW.D2, TW.D2; \
-	VEOR K0.B16, TW.B16, TW.B16; \
-	VEOR K1.B16, TW.B16, TW.B16; \
-	\
-	VEXT $8, TW.B16, TW.B16, TW.B16; \
-	VREV64 TW.B16, TW.B16
-
-#define prepare4Tweaks \
-	VMOV TW.B16, T0.B16; \
-	mul2Inline; \
-	VMOV TW.B16, T1.B16; \
-	mul2Inline; \
-	VMOV TW.B16, T2.B16; \
-	mul2Inline; \
-	VMOV TW.B16, T3.B16; \
-	mul2Inline
-
-#define prepare8Tweaks \
-	prepare4Tweaks; \
-	VMOV TW.B16, T4.B16; \
-	mul2Inline; \
-	VMOV TW.B16, T5.B16; \
-	mul2Inline; \
-	VMOV TW.B16, T6.B16; \
-	mul2Inline; \
-	VMOV TW.B16, T7.B16; \
-	mul2Inline
-
-#define prepareGB4Tweaks \
-	VMOV TW.B16, T0.B16; \
-	mul2GBInline; \
-	VMOV TW.B16, T1.B16; \
-	mul2GBInline; \
-	VMOV TW.B16, T2.B16; \
-	mul2GBInline; \
-	VMOV TW.B16, T3.B16; \
-	mul2GBInline
-
-#define prepareGB8Tweaks \
-	prepareGB4Tweaks; \
-	VMOV TW.B16, T4.B16; \
-	mul2GBInline; \
-	VMOV TW.B16, T5.B16; \
-	mul2GBInline; \
-	VMOV TW.B16, T6.B16; \
-	mul2GBInline; \
-	VMOV TW.B16, T7.B16; \
-	mul2GBInline
+#include "xts_macros_arm64.s"
 
 #define load8blocks \
 	VLD1.P 64(srcPtr), [B0.S4, B1.S4, B2.S4, B3.S4]; \
@@ -635,7 +563,6 @@ decLastCompleteBlockLoop:
 	VMOV B4.B16, TW.B16
 	VST1 [B3.B16], (RSP)
 
-	SUB $16, srcPtrLen
 	SUB $16, dstPtr, R7
 	MOVD R7, R9
 	MOVD RSP, R8
@@ -831,7 +758,6 @@ decLastCompleteBlockLoop:
 	VMOV B4.B16, TW.B16
 	VST1 [B3.B16], (RSP)
 
-	SUB $16, srcPtrLen
 	SUB $16, dstPtr, R7
 	MOVD R7, R9
 	MOVD RSP, R8
sm4/xts_macros_arm64.s (new file, 72 lines)
@@ -0,0 +1,72 @@
+#define mul2Inline \
+	VMOV TW.D[1], I; \
+	ASR $63, I; \
+	VMOV I, K0.D2; \
+	VAND POLY.B16, K0.B16, K0.B16; \
+	\
+	VUSHR $63, TW.D2, K1.D2; \
+	VEXT $8, K1.B16, ZERO.B16, K1.B16; \
+	VSHL $1, TW.D2, TW.D2; \
+	VEOR K0.B16, TW.B16, TW.B16; \
+	VEOR K1.B16, TW.B16, TW.B16
+
+#define mul2GBInline \
+	VREV64 TW.B16, TW.B16; \
+	VEXT $8, TW.B16, TW.B16, TW.B16; \
+	\
+	VMOV TW.D[0], I; \
+	LSL $63, I; \
+	ASR $63, I; \
+	VMOV I, K0.D2; \
+	VAND POLY.B16, K0.B16, K0.B16; \
+	\
+	VSHL $63, TW.D2, K1.D2; \
+	VEXT $8, ZERO.B16, K1.B16, K1.B16; \
+	VUSHR $1, TW.D2, TW.D2; \
+	VEOR K0.B16, TW.B16, TW.B16; \
+	VEOR K1.B16, TW.B16, TW.B16; \
+	\
+	VEXT $8, TW.B16, TW.B16, TW.B16; \
+	VREV64 TW.B16, TW.B16
+
+#define prepare4Tweaks \
+	VMOV TW.B16, T0.B16; \
+	mul2Inline; \
+	VMOV TW.B16, T1.B16; \
+	mul2Inline; \
+	VMOV TW.B16, T2.B16; \
+	mul2Inline; \
+	VMOV TW.B16, T3.B16; \
+	mul2Inline
+
+#define prepare8Tweaks \
+	prepare4Tweaks; \
+	VMOV TW.B16, T4.B16; \
+	mul2Inline; \
+	VMOV TW.B16, T5.B16; \
+	mul2Inline; \
+	VMOV TW.B16, T6.B16; \
+	mul2Inline; \
+	VMOV TW.B16, T7.B16; \
+	mul2Inline
+
+#define prepareGB4Tweaks \
+	VMOV TW.B16, T0.B16; \
+	mul2GBInline; \
+	VMOV TW.B16, T1.B16; \
+	mul2GBInline; \
+	VMOV TW.B16, T2.B16; \
+	mul2GBInline; \
+	VMOV TW.B16, T3.B16; \
+	mul2GBInline
+
+#define prepareGB8Tweaks \
+	prepareGB4Tweaks; \
+	VMOV TW.B16, T4.B16; \
+	mul2GBInline; \
+	VMOV TW.B16, T5.B16; \
+	mul2GBInline; \
+	VMOV TW.B16, T6.B16; \
+	mul2GBInline; \
+	VMOV TW.B16, T7.B16; \
+	mul2GBInline
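For reference, here is what the two doubling macros compute, written as plain Go over the 16-byte tweak (a sketch derived from the instruction sequences above, not package code). mul2Inline is the usual XTS multiply-by-x in GF(2^128) with the 0x87 feedback byte loaded into POLY.D[0]; mul2GBInline byte-reverses the tweak, performs the multiply in the opposite (GCM-style) bit order with the 0xE1 feedback byte in POLY.D[1], and reverses back:

	// mul2: NIST XTS tweak update; bits shift left across the 16 bytes,
	// and the carry out of byte 15 folds 0x87 back into byte 0.
	func mul2(tw *[16]byte) {
		var carry byte
		for i := 0; i < 16; i++ {
			next := tw[i] >> 7
			tw[i] = tw[i]<<1 | carry
			carry = next
		}
		if carry != 0 {
			tw[0] ^= 0x87
		}
	}

	// mul2GB: GB/T 17964-2021 variant; bits shift right instead,
	// and the carry out of byte 15's low bit folds 0xE1 back into byte 0.
	func mul2GB(tw *[16]byte) {
		var carry byte
		for i := 0; i < 16; i++ {
			next := tw[i] & 1
			tw[i] = tw[i]>>1 | carry<<7
			carry = next
		}
		if carry != 0 {
			tw[0] ^= 0xE1
		}
	}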
sm4/xts_sm4ni_arm64.s (new file, 483 lines)
@@ -0,0 +1,483 @@
+//go:build arm64 && !purego
+// +build arm64,!purego
+
+#include "textflag.h"
+
+#define B0 V0
+#define B1 V1
+#define B2 V2
+#define B3 V3
+#define B4 V4
+#define B5 V5
+#define B6 V6
+#define B7 V7
+
+#define POLY V8
+#define ZERO V9
+#define TW V10
+
+#define T0 V11
+#define T1 V12
+#define T2 V13
+#define T3 V14
+#define T4 V15
+#define T5 V16
+#define T6 V17
+#define T7 V18
+
+#define K0 V19
+#define K1 V20
+#define K2 V21
+#define K3 V22
+#define K4 V23
+#define K5 V24
+#define K6 V25
+#define K7 V26
+
+#include "sm4ni_macros_arm64.s"
+#include "xts_macros_arm64.s"
+
+#define load8blocks \
+	VLD1.P 64(srcPtr), [B0.S4, B1.S4, B2.S4, B3.S4]; \
+	VEOR T0.B16, B0.B16, B0.B16; \
+	VEOR T1.B16, B1.B16, B1.B16; \
+	VEOR T2.B16, B2.B16, B2.B16; \
+	VEOR T3.B16, B3.B16, B3.B16; \
+	\
+	VLD1.P 64(srcPtr), [B4.S4, B5.S4, B6.S4, B7.S4]; \
+	VEOR T4.B16, B4.B16, B4.B16; \
+	VEOR T5.B16, B5.B16, B5.B16; \
+	VEOR T6.B16, B6.B16, B6.B16; \
+	VEOR T7.B16, B7.B16, B7.B16
+
+#define store8blocks \
+	VEOR T0.B16, B0.B16, B0.B16; \
+	VEOR T1.B16, B1.B16, B1.B16; \
+	VEOR T2.B16, B2.B16, B2.B16; \
+	VEOR T3.B16, B3.B16, B3.B16; \
+	VEOR T4.B16, B4.B16, B4.B16; \
+	VEOR T5.B16, B5.B16, B5.B16; \
+	VEOR T6.B16, B6.B16, B6.B16; \
+	VEOR T7.B16, B7.B16, B7.B16; \
+	\
+	VST1.P [B0.S4, B1.S4, B2.S4, B3.S4], 64(dstPtr); \
+	VST1.P [B4.S4, B5.S4, B6.S4, B7.S4], 64(dstPtr)
+
+#define dstPtr R2
+#define srcPtr R3
+#define rk R0
+#define twPtr R1
+#define srcPtrLen R4
+#define I R5
+
+// func encryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
+TEXT ·encryptSm4Xts(SB),0,$128-64
+	MOVD xk+0(FP), rk
+	MOVD tweak+8(FP), twPtr
+	MOVD dst+16(FP), dstPtr
+	MOVD src+40(FP), srcPtr
+	MOVD src_len+48(FP), srcPtrLen
+
+	VEOR POLY.B16, POLY.B16, POLY.B16
+	VEOR ZERO.B16, ZERO.B16, ZERO.B16
+
+	MOVD $0x87, I
+	VMOV I, POLY.D[0]
+
+	// For SM4 round keys are stored in: K0 .. K7
+	VLD1.P 64(rk), [K0.S4, K1.S4, K2.S4, K3.S4]
+	VLD1.P 64(rk), [K4.S4, K5.S4, K6.S4, K7.S4]
+
+	VLD1 (twPtr), [TW.B16]
+
+xtsSm4EncOctets:
+	CMP $128, srcPtrLen
+	BLT xtsSm4EncSingles
+	SUB $128, srcPtrLen
+	prepare8Tweaks
+	load8blocks
+	sm4eEnc8blocks()
+	store8blocks
+
+	B xtsSm4EncOctets
+
+xtsSm4EncSingles:
+	CMP $16, srcPtrLen
+	BLT xtsSm4EncTail
+	SUB $16, srcPtrLen
+
+	VLD1.P 16(srcPtr), [B0.S4]
+	VEOR TW.B16, B0.B16, B0.B16
+	sm4eEnc1block()
+	VEOR TW.B16, B0.B16, B0.B16
+	VST1.P [B0.S4], 16(dstPtr)
+
+	mul2Inline
+	B xtsSm4EncSingles
+
+xtsSm4EncTail:
+	CBZ srcPtrLen, xtsSm4EncDone
+	SUB $16, dstPtr, R7
+	MOVD R7, R9
+	MOVD RSP, R8
+	VLD1 (R7), [B0.B16]
+	VST1 [B0.B16], (R8)
+
+	TBZ $3, srcPtrLen, less_than8
+	MOVD.P 8(srcPtr), R11
+	MOVD.P R11, 8(R8)
+	MOVD.P 8(R7), R12
+	MOVD.P R12, 8(dstPtr)
+
+less_than8:
+	TBZ $2, srcPtrLen, less_than4
+	MOVWU.P 4(srcPtr), R11
+	MOVWU.P R11, 4(R8)
+	MOVWU.P 4(R7), R12
+	MOVWU.P R12, 4(dstPtr)
+
+less_than4:
+	TBZ $1, srcPtrLen, less_than2
+	MOVHU.P 2(srcPtr), R11
+	MOVHU.P R11, 2(R8)
+	MOVHU.P 2(R7), R12
+	MOVHU.P R12, 2(dstPtr)
+
+less_than2:
+	TBZ $0, srcPtrLen, xtsSm4EncTailEnc
+	MOVBU (srcPtr), R11
+	MOVBU R11, (R8)
+	MOVBU (R7), R12
+	MOVBU R12, (dstPtr)
+
+xtsSm4EncTailEnc:
+	VLD1 (RSP), [B0.B16]
+	VEOR TW.B16, B0.B16, B0.B16
+	sm4eEnc1block()
+	VEOR TW.B16, B0.B16, B0.B16
+	VST1 [B0.B16], (R9)
+
+xtsSm4EncDone:
+	VST1 [TW.B16], (twPtr)
+	RET
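The xtsSm4EncTail path above is XTS ciphertext stealing: the 1..15 trailing plaintext bytes are spliced over the front of the previous ciphertext block (staged on the stack), the displaced ciphertext prefix becomes the short final block, and the combined block is re-encrypted in place under the current tweak. Roughly, in Go (a sketch with a hypothetical single-block encrypt helper, not the package's code):

	func stealTailEncrypt(encrypt func(block *[16]byte), dst, src []byte) {
		n := len(src) % 16   // tail length, 1..15
		full := len(src) - n // offset of the partial block
		var buf [16]byte
		copy(buf[:], dst[full-16:full]) // previous ciphertext block C_{m-1}
		copy(dst[full:], buf[:n])       // its first n bytes become the short final block
		copy(buf[:n], src[full:])       // overwrite that prefix with the plaintext tail
		encrypt(&buf)                   // XEX under the current tweak, as in the asm
		copy(dst[full-16:full], buf[:]) // re-encrypted block replaces C_{m-1}
	}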
+// func encryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
+TEXT ·encryptSm4XtsGB(SB),0,$128-64
+	MOVD xk+0(FP), rk
+	MOVD tweak+8(FP), twPtr
+	MOVD dst+16(FP), dstPtr
+	MOVD src+40(FP), srcPtr
+	MOVD src_len+48(FP), srcPtrLen
+
+	VEOR POLY.B16, POLY.B16, POLY.B16
+	VEOR ZERO.B16, ZERO.B16, ZERO.B16
+
+	MOVD $0xE1, I
+	LSL $56, I
+	VMOV I, POLY.D[1]
+
+	// For SM4 round keys are stored in: K0 .. K7
+	VLD1.P 64(rk), [K0.S4, K1.S4, K2.S4, K3.S4]
+	VLD1.P 64(rk), [K4.S4, K5.S4, K6.S4, K7.S4]
+
+	VLD1 (twPtr), [TW.B16]
+
+xtsSm4EncOctets:
+	CMP $128, srcPtrLen
+	BLT xtsSm4EncSingles
+	SUB $128, srcPtrLen
+	prepareGB8Tweaks
+	load8blocks
+	sm4eEnc8blocks()
+	store8blocks
+
+	B xtsSm4EncOctets
+
+xtsSm4EncSingles:
+	CMP $16, srcPtrLen
+	BLT xtsSm4EncTail
+	SUB $16, srcPtrLen
+
+	VLD1.P 16(srcPtr), [B0.S4]
+	VEOR TW.B16, B0.B16, B0.B16
+	sm4eEnc1block()
+	VEOR TW.B16, B0.B16, B0.B16
+	VST1.P [B0.S4], 16(dstPtr)
+
+	mul2GBInline
+	B xtsSm4EncSingles
+
+xtsSm4EncTail:
+	CBZ srcPtrLen, xtsSm4EncDone
+	SUB $16, dstPtr, R7
+	MOVD R7, R9
+	MOVD RSP, R8
+	VLD1 (R7), [B0.B16]
+	VST1 [B0.B16], (R8)
+
+	TBZ $3, srcPtrLen, less_than8
+	MOVD.P 8(srcPtr), R11
+	MOVD.P R11, 8(R8)
+	MOVD.P 8(R7), R12
+	MOVD.P R12, 8(dstPtr)
+
+less_than8:
+	TBZ $2, srcPtrLen, less_than4
+	MOVWU.P 4(srcPtr), R11
+	MOVWU.P R11, 4(R8)
+	MOVWU.P 4(R7), R12
+	MOVWU.P R12, 4(dstPtr)
+
+less_than4:
+	TBZ $1, srcPtrLen, less_than2
+	MOVHU.P 2(srcPtr), R11
+	MOVHU.P R11, 2(R8)
+	MOVHU.P 2(R7), R12
+	MOVHU.P R12, 2(dstPtr)
+
+less_than2:
+	TBZ $0, srcPtrLen, xtsSm4EncTailEnc
+	MOVBU (srcPtr), R11
+	MOVBU R11, (R8)
+	MOVBU (R7), R12
+	MOVBU R12, (dstPtr)
+
+xtsSm4EncTailEnc:
+	VLD1 (RSP), [B0.B16]
+	VEOR TW.B16, B0.B16, B0.B16
+	sm4eEnc1block()
+	VEOR TW.B16, B0.B16, B0.B16
+	VST1 [B0.B16], (R9)
+
+xtsSm4EncDone:
+	VST1 [TW.B16], (twPtr)
+	RET
+
+// func decryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
+TEXT ·decryptSm4Xts(SB),0,$128-64
+	MOVD xk+0(FP), rk
+	MOVD tweak+8(FP), twPtr
+	MOVD dst+16(FP), dstPtr
+	MOVD src+40(FP), srcPtr
+	MOVD src_len+48(FP), srcPtrLen
+
+	VEOR POLY.B16, POLY.B16, POLY.B16
+	VEOR ZERO.B16, ZERO.B16, ZERO.B16
+
+	MOVD $0x87, I
+	VMOV I, POLY.D[0]
+
+	// For SM4 round keys are stored in: K0 .. K7
+	VLD1.P 64(rk), [K0.S4, K1.S4, K2.S4, K3.S4]
+	VLD1.P 64(rk), [K4.S4, K5.S4, K6.S4, K7.S4]
+
+	VLD1 (twPtr), [TW.B16]
+
+xtsSm4DecOctets:
+	CMP $128, srcPtrLen
+	BLT xtsSm4DecSingles
+	SUB $128, srcPtrLen
+
+	prepare8Tweaks
+	load8blocks
+	sm4eEnc8blocks()
+	store8blocks
+
+	B xtsSm4DecOctets
+
+xtsSm4DecSingles:
+	CMP $32, srcPtrLen
+	BLT xtsSm4DecTail
+	SUB $16, srcPtrLen
+
+	VLD1.P 16(srcPtr), [B0.S4]
+	VEOR TW.B16, B0.B16, B0.B16
+	sm4eEnc1block()
+	VEOR TW.B16, B0.B16, B0.B16
+	VST1.P [B0.S4], 16(dstPtr)
+
+	mul2Inline
+	B xtsSm4DecSingles
+
+xtsSm4DecTail:
+	CBZ srcPtrLen, xtsSm4DecDone
+
+	CMP $16, srcPtrLen
+	BEQ xtsSm4DecLastBlock
+
+	VMOV TW.B16, B4.B16
+	mul2Inline
+	VLD1.P 16(srcPtr), [B0.S4]
+	VEOR TW.B16, B0.B16, B0.B16
+	sm4eEnc1block()
+	VEOR TW.B16, B0.B16, B0.B16
+	VST1.P [B0.S4], 16(dstPtr)
+	VMOV B4.B16, TW.B16
+	VST1 [B0.B16], (RSP)
+
+	SUB $16, dstPtr, R7
+	MOVD R7, R9
+	MOVD RSP, R8
+
+	TBZ $3, srcPtrLen, less_than8
+	MOVD.P 8(srcPtr), R11
+	MOVD.P R11, 8(R8)
+	MOVD.P 8(R7), R12
+	MOVD.P R12, 8(dstPtr)
+
+less_than8:
+	TBZ $2, srcPtrLen, less_than4
+	MOVWU.P 4(srcPtr), R11
+	MOVWU.P R11, 4(R8)
+	MOVWU.P 4(R7), R12
+	MOVWU.P R12, 4(dstPtr)
+
+less_than4:
+	TBZ $1, srcPtrLen, less_than2
+	MOVHU.P 2(srcPtr), R11
+	MOVHU.P R11, 2(R8)
+	MOVHU.P 2(R7), R12
+	MOVHU.P R12, 2(dstPtr)
+
+less_than2:
+	TBZ $0, srcPtrLen, xtsSm4DecTailDec
+	MOVBU (srcPtr), R11
+	MOVBU R11, (R8)
+	MOVBU (R7), R12
+	MOVBU R12, (dstPtr)
+
+xtsSm4DecTailDec:
+	VLD1 (RSP), [B0.B16]
+	VEOR TW.B16, B0.B16, B0.B16
+	sm4eEnc1block()
+	VEOR TW.B16, B0.B16, B0.B16
+	VST1 [B0.B16], (R9)
+
+	B xtsSm4DecDone
+
+xtsSm4DecLastBlock:
+	VLD1.P 16(srcPtr), [B0.S4]
+	VEOR TW.B16, B0.B16, B0.B16
+	sm4eEnc1block()
+	VEOR TW.B16, B0.B16, B0.B16
+	VST1.P [B0.S4], 16(dstPtr)
+	mul2Inline
+
+xtsSm4DecDone:
+	VST1 [TW.B16], (twPtr)
+	RET
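Note the tweak ordering in the decrypt tail above: the singles loop stops while at least two blocks remain (CMP $32), and when a partial tail exists the last full ciphertext block is decrypted under the next tweak (TW after mul2Inline) while the saved tweak in B4 is kept for the stolen, combined block. In Go terms (a sketch reusing the mul2 helper sketched earlier and a hypothetical single-block decrypt, not the package's code):

	func stealTailDecrypt(decrypt func(tweak, block *[16]byte), tw *[16]byte, dst, src []byte) {
		n := len(src) % 16
		full := len(src) - n
		next := *tw
		mul2(&next) // C_{m-1} is decrypted under the *next* tweak
		var pp [16]byte
		copy(pp[:], src[full-16:full])
		decrypt(&next, &pp)      // yields plaintext tail (n bytes) + stolen ciphertext suffix
		copy(dst[full:], pp[:n]) // the prefix is the final partial plaintext
		copy(pp[:n], src[full:]) // splice the short ciphertext block over that prefix
		decrypt(tw, &pp)         // combined block under the saved tweak
		copy(dst[full-16:full], pp[:])
	}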
+// func decryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
+TEXT ·decryptSm4XtsGB(SB),0,$128-64
+	MOVD xk+0(FP), rk
+	MOVD tweak+8(FP), twPtr
+	MOVD dst+16(FP), dstPtr
+	MOVD src+40(FP), srcPtr
+	MOVD src_len+48(FP), srcPtrLen
+
+	VEOR POLY.B16, POLY.B16, POLY.B16
+	VEOR ZERO.B16, ZERO.B16, ZERO.B16
+
+	MOVD $0xE1, I
+	LSL $56, I
+	VMOV I, POLY.D[1]
+
+	// For SM4 round keys are stored in: K0 .. K7
+	VLD1.P 64(rk), [K0.S4, K1.S4, K2.S4, K3.S4]
+	VLD1.P 64(rk), [K4.S4, K5.S4, K6.S4, K7.S4]
+
+	VLD1 (twPtr), [TW.B16]
+
+xtsSm4DecOctets:
+	CMP $128, srcPtrLen
+	BLT xtsSm4DecSingles
+	SUB $128, srcPtrLen
+
+	prepareGB8Tweaks
+	load8blocks
+	sm4eEnc8blocks()
+	store8blocks
+
+	B xtsSm4DecOctets
+
+xtsSm4DecSingles:
+	CMP $32, srcPtrLen
+	BLT xtsSm4DecTail
+	SUB $16, srcPtrLen
+
+	VLD1.P 16(srcPtr), [B0.S4]
+	VEOR TW.B16, B0.B16, B0.B16
+	sm4eEnc1block()
+	VEOR TW.B16, B0.B16, B0.B16
+	VST1.P [B0.S4], 16(dstPtr)
+
+	mul2GBInline
+	B xtsSm4DecSingles
+
+xtsSm4DecTail:
+	CBZ srcPtrLen, xtsSm4DecDone
+
+	CMP $16, srcPtrLen
+	BEQ xtsSm4DecLastBlock
+
+	VMOV TW.B16, B4.B16
+	mul2GBInline
+	VLD1.P 16(srcPtr), [B0.S4]
+	VEOR TW.B16, B0.B16, B0.B16
+	sm4eEnc1block()
+	VEOR TW.B16, B0.B16, B0.B16
+	VST1.P [B0.S4], 16(dstPtr)
+	VMOV B4.B16, TW.B16
+	VST1 [B0.B16], (RSP)
+
+	SUB $16, dstPtr, R7
+	MOVD R7, R9
+	MOVD RSP, R8
+
+	TBZ $3, srcPtrLen, less_than8
+	MOVD.P 8(srcPtr), R11
+	MOVD.P R11, 8(R8)
+	MOVD.P 8(R7), R12
+	MOVD.P R12, 8(dstPtr)
+
+less_than8:
+	TBZ $2, srcPtrLen, less_than4
+	MOVWU.P 4(srcPtr), R11
+	MOVWU.P R11, 4(R8)
+	MOVWU.P 4(R7), R12
+	MOVWU.P R12, 4(dstPtr)
+
+less_than4:
+	TBZ $1, srcPtrLen, less_than2
+	MOVHU.P 2(srcPtr), R11
+	MOVHU.P R11, 2(R8)
+	MOVHU.P 2(R7), R12
+	MOVHU.P R12, 2(dstPtr)
+
+less_than2:
+	TBZ $0, srcPtrLen, xtsSm4DecTailDec
+	MOVBU (srcPtr), R11
+	MOVBU R11, (R8)
+	MOVBU (R7), R12
+	MOVBU R12, (dstPtr)
+
+xtsSm4DecTailDec:
+	VLD1 (RSP), [B0.B16]
+	VEOR TW.B16, B0.B16, B0.B16
+	sm4eEnc1block()
+	VEOR TW.B16, B0.B16, B0.B16
+	VST1 [B0.B16], (R9)
+
+	B xtsSm4DecDone
+
+xtsSm4DecLastBlock:
+	VLD1.P 16(srcPtr), [B0.S4]
+	VEOR TW.B16, B0.B16, B0.B16
+	sm4eEnc1block()
+	VEOR TW.B16, B0.B16, B0.B16
+	VST1.P [B0.S4], 16(dstPtr)
+	mul2GBInline
+
+xtsSm4DecDone:
+	VST1 [TW.B16], (twPtr)
+	RET