From 10192268039c9dcf85df367dcc3203f51170f067 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Fri, 25 Aug 2023 13:13:43 +0800 Subject: [PATCH] sm4: xts sm4ni arm64 #151 --- sm4/gcm_sm4ni_arm64.s | 85 +------ sm4/sm4_xts.go | 6 +- sm4/sm4ni_macros_arm64.s | 84 +++++++ sm4/sm4ni_xts.go | 56 +++++ sm4/xts_arm64.s | 76 +----- sm4/xts_macros_arm64.s | 72 ++++++ sm4/xts_sm4ni_arm64.s | 483 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 702 insertions(+), 160 deletions(-) create mode 100644 sm4/sm4ni_macros_arm64.s create mode 100644 sm4/sm4ni_xts.go create mode 100644 sm4/xts_macros_arm64.s create mode 100644 sm4/xts_sm4ni_arm64.s diff --git a/sm4/gcm_sm4ni_arm64.s b/sm4/gcm_sm4ni_arm64.s index cc9b1b2..33bb412 100644 --- a/sm4/gcm_sm4ni_arm64.s +++ b/sm4/gcm_sm4ni_arm64.s @@ -62,90 +62,7 @@ VPMULL T0.D1, T2.D1, T3.Q1 \ VEOR T3.B16, ACCM.B16, ACCM.B16 -#define sm4eEnc1block() \ - WORD $0xcec08660 \ //SM4E V0.4S, V19.4S - WORD $0xcec08680 \ //SM4E V0.4S, V20.4S - WORD $0xcec086a0 \ //SM4E V0.4S, V21.4S - WORD $0xcec086c0 \ //SM4E V0.4S, V22.4S - WORD $0xcec086e0 \ //SM4E V0.4S, V23.4S - WORD $0xcec08700 \ //SM4E V0.4S, V24.4S - WORD $0xcec08720 \ //SM4E V0.4S, V25.4S - WORD $0xcec08740 \ //SM4E V0.4S, V26.4S - VREV64 V0.B16, V0.B16 \ - VEXT $8, V0.B16, V0.B16, V0.B16 - -#define sm4eEnc8blocks() \ - sm4eEnc1block() \ - WORD $0xcec08661 \ //SM4E V1.4S, V19.4S - WORD $0xcec08681 \ //SM4E V1.4S, V20.4S - WORD $0xcec086a1 \ //SM4E V1.4S, V21.4S - WORD $0xcec086c1 \ //SM4E V1.4S, V22.4S - WORD $0xcec086e1 \ //SM4E V1.4S, V23.4S - WORD $0xcec08701 \ //SM4E V1.4S, V24.4S - WORD $0xcec08721 \ //SM4E V1.4S, V25.4S - WORD $0xcec08741 \ //SM4E V1.4S, V26.4S - VREV64 V1.B16, V1.B16 \ - VEXT $8, V1.B16, V1.B16, V1.B16 \ - WORD $0xcec08662 \ //SM4E V2.4S, V19.4S - WORD $0xcec08682 \ //SM4E V2.4S, V20.4S - WORD $0xcec086a2 \ //SM4E V2.4S, V21.4S - WORD $0xcec086c2 \ //SM4E V2.4S, V22.4S - WORD $0xcec086e2 \ //SM4E V2.4S, V23.4S - WORD $0xcec08702 \ //SM4E V2.4S, V24.4S - WORD $0xcec08722 \ //SM4E V2.4S, V25.4S - WORD $0xcec08742 \ //SM4E V2.4S, V26.4S - VREV64 V2.B16, V2.B16 \ - VEXT $8, V2.B16, V2.B16, V2.B16 \ - WORD $0xcec08663 \ //SM4E V3.4S, V19.4S - WORD $0xcec08683 \ //SM4E V3.4S, V20.4S - WORD $0xcec086a3 \ //SM4E V3.4S, V21.4S - WORD $0xcec086c3 \ //SM4E V3.4S, V22.4S - WORD $0xcec086e3 \ //SM4E V3.4S, V23.4S - WORD $0xcec08703 \ //SM4E V3.4S, V24.4S - WORD $0xcec08723 \ //SM4E V3.4S, V25.4S - WORD $0xcec08743 \ //SM4E V3.4S, V26.4S - VREV64 V3.B16, V3.B16 \ - VEXT $8, V3.B16, V3.B16, V3.B16 \ - WORD $0xcec08664 \ //SM4E V4.4S, V19.4S - WORD $0xcec08684 \ //SM4E V4.4S, V20.4S - WORD $0xcec086a4 \ //SM4E V4.4S, V21.4S - WORD $0xcec086c4 \ //SM4E V4.4S, V22.4S - WORD $0xcec086e4 \ //SM4E V4.4S, V23.4S - WORD $0xcec08704 \ //SM4E V4.4S, V24.4S - WORD $0xcec08724 \ //SM4E V4.4S, V25.4S - WORD $0xcec08744 \ //SM4E V4.4S, V26.4S - VREV64 V4.B16, V4.B16 \ - VEXT $8, V4.B16, V4.B16, V4.B16 \ - WORD $0xcec08665 \ //SM4E V5.4S, V19.4S - WORD $0xcec08685 \ //SM4E V5.4S, V20.4S - WORD $0xcec086a5 \ //SM4E V5.4S, V21.4S - WORD $0xcec086c5 \ //SM4E V5.4S, V22.4S - WORD $0xcec086e5 \ //SM4E V5.4S, V23.4S - WORD $0xcec08705 \ //SM4E V5.4S, V24.4S - WORD $0xcec08725 \ //SM4E V5.4S, V25.4S - WORD $0xcec08745 \ //SM4E V5.4S, V26.4S - VREV64 V5.B16, V5.B16 \ - VEXT $8, V5.B16, V5.B16, V5.B16 \ - WORD $0xcec08666 \ //SM4E V6.4S, V19.4S - WORD $0xcec08686 \ //SM4E V6.4S, V20.4S - WORD $0xcec086a6 \ //SM4E V6.4S, V21.4S - WORD $0xcec086c6 \ //SM4E V6.4S, V22.4S - WORD $0xcec086e6 \ //SM4E V6.4S, V23.4S - WORD 
$0xcec08706 \ //SM4E V6.4S, V24.4S - WORD $0xcec08726 \ //SM4E V6.4S, V25.4S - WORD $0xcec08746 \ //SM4E V6.4S, V26.4S - VREV64 V6.B16, V6.B16 \ - VEXT $8, V6.B16, V6.B16, V6.B16 \ - WORD $0xcec08667 \ //SM4E V7.4S, V19.4S - WORD $0xcec08687 \ //SM4E V7.4S, V20.4S - WORD $0xcec086a7 \ //SM4E V7.4S, V21.4S - WORD $0xcec086c7 \ //SM4E V7.4S, V22.4S - WORD $0xcec086e7 \ //SM4E V7.4S, V23.4S - WORD $0xcec08707 \ //SM4E V7.4S, V24.4S - WORD $0xcec08727 \ //SM4E V7.4S, V25.4S - WORD $0xcec08747 \ //SM4E V7.4S, V26.4S - VREV64 V7.B16, V7.B16 \ - VEXT $8, V7.B16, V7.B16, V7.B16 +#include "sm4ni_macros_arm64.s" // func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0 diff --git a/sm4/sm4_xts.go b/sm4/sm4_xts.go index 8ca879b..112cbbb 100644 --- a/sm4/sm4_xts.go +++ b/sm4/sm4_xts.go @@ -55,7 +55,7 @@ func decryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte) //go:noescape func decryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte) -func (x *xts) CryptBlocks(dst, src []byte) { +func validateXtsInput(dst, src []byte) { if len(dst) < len(src) { panic("xts: dst is smaller than src") } @@ -65,6 +65,10 @@ func (x *xts) CryptBlocks(dst, src []byte) { if alias.InexactOverlap(dst[:len(src)], src) { panic("xts: invalid buffer overlap") } +} + +func (x *xts) CryptBlocks(dst, src []byte) { + validateXtsInput(dst, src) if x.enc == xtsEncrypt { if x.isGB { encryptSm4XtsGB(&x.b.enc[0], &x.tweak, dst, src) diff --git a/sm4/sm4ni_macros_arm64.s b/sm4/sm4ni_macros_arm64.s new file mode 100644 index 0000000..615b15b --- /dev/null +++ b/sm4/sm4ni_macros_arm64.s @@ -0,0 +1,84 @@ +#define sm4eEnc1block() \ + WORD $0xcec08660 \ //SM4E V0.4S, V19.4S + WORD $0xcec08680 \ //SM4E V0.4S, V20.4S + WORD $0xcec086a0 \ //SM4E V0.4S, V21.4S + WORD $0xcec086c0 \ //SM4E V0.4S, V22.4S + WORD $0xcec086e0 \ //SM4E V0.4S, V23.4S + WORD $0xcec08700 \ //SM4E V0.4S, V24.4S + WORD $0xcec08720 \ //SM4E V0.4S, V25.4S + WORD $0xcec08740 \ //SM4E V0.4S, V26.4S + VREV64 V0.B16, V0.B16 \ + VEXT $8, V0.B16, V0.B16, V0.B16 + +#define sm4eEnc8blocks() \ + sm4eEnc1block() \ + WORD $0xcec08661 \ //SM4E V1.4S, V19.4S + WORD $0xcec08681 \ //SM4E V1.4S, V20.4S + WORD $0xcec086a1 \ //SM4E V1.4S, V21.4S + WORD $0xcec086c1 \ //SM4E V1.4S, V22.4S + WORD $0xcec086e1 \ //SM4E V1.4S, V23.4S + WORD $0xcec08701 \ //SM4E V1.4S, V24.4S + WORD $0xcec08721 \ //SM4E V1.4S, V25.4S + WORD $0xcec08741 \ //SM4E V1.4S, V26.4S + VREV64 V1.B16, V1.B16 \ + VEXT $8, V1.B16, V1.B16, V1.B16 \ + WORD $0xcec08662 \ //SM4E V2.4S, V19.4S + WORD $0xcec08682 \ //SM4E V2.4S, V20.4S + WORD $0xcec086a2 \ //SM4E V2.4S, V21.4S + WORD $0xcec086c2 \ //SM4E V2.4S, V22.4S + WORD $0xcec086e2 \ //SM4E V2.4S, V23.4S + WORD $0xcec08702 \ //SM4E V2.4S, V24.4S + WORD $0xcec08722 \ //SM4E V2.4S, V25.4S + WORD $0xcec08742 \ //SM4E V2.4S, V26.4S + VREV64 V2.B16, V2.B16 \ + VEXT $8, V2.B16, V2.B16, V2.B16 \ + WORD $0xcec08663 \ //SM4E V3.4S, V19.4S + WORD $0xcec08683 \ //SM4E V3.4S, V20.4S + WORD $0xcec086a3 \ //SM4E V3.4S, V21.4S + WORD $0xcec086c3 \ //SM4E V3.4S, V22.4S + WORD $0xcec086e3 \ //SM4E V3.4S, V23.4S + WORD $0xcec08703 \ //SM4E V3.4S, V24.4S + WORD $0xcec08723 \ //SM4E V3.4S, V25.4S + WORD $0xcec08743 \ //SM4E V3.4S, V26.4S + VREV64 V3.B16, V3.B16 \ + VEXT $8, V3.B16, V3.B16, V3.B16 \ + WORD $0xcec08664 \ //SM4E V4.4S, V19.4S + WORD $0xcec08684 \ //SM4E V4.4S, V20.4S + WORD $0xcec086a4 \ //SM4E V4.4S, V21.4S + WORD $0xcec086c4 \ //SM4E V4.4S, V22.4S + WORD $0xcec086e4 \ //SM4E V4.4S, V23.4S + WORD 
$0xcec08704 \ //SM4E V4.4S, V24.4S + WORD $0xcec08724 \ //SM4E V4.4S, V25.4S + WORD $0xcec08744 \ //SM4E V4.4S, V26.4S + VREV64 V4.B16, V4.B16 \ + VEXT $8, V4.B16, V4.B16, V4.B16 \ + WORD $0xcec08665 \ //SM4E V5.4S, V19.4S + WORD $0xcec08685 \ //SM4E V5.4S, V20.4S + WORD $0xcec086a5 \ //SM4E V5.4S, V21.4S + WORD $0xcec086c5 \ //SM4E V5.4S, V22.4S + WORD $0xcec086e5 \ //SM4E V5.4S, V23.4S + WORD $0xcec08705 \ //SM4E V5.4S, V24.4S + WORD $0xcec08725 \ //SM4E V5.4S, V25.4S + WORD $0xcec08745 \ //SM4E V5.4S, V26.4S + VREV64 V5.B16, V5.B16 \ + VEXT $8, V5.B16, V5.B16, V5.B16 \ + WORD $0xcec08666 \ //SM4E V6.4S, V19.4S + WORD $0xcec08686 \ //SM4E V6.4S, V20.4S + WORD $0xcec086a6 \ //SM4E V6.4S, V21.4S + WORD $0xcec086c6 \ //SM4E V6.4S, V22.4S + WORD $0xcec086e6 \ //SM4E V6.4S, V23.4S + WORD $0xcec08706 \ //SM4E V6.4S, V24.4S + WORD $0xcec08726 \ //SM4E V6.4S, V25.4S + WORD $0xcec08746 \ //SM4E V6.4S, V26.4S + VREV64 V6.B16, V6.B16 \ + VEXT $8, V6.B16, V6.B16, V6.B16 \ + WORD $0xcec08667 \ //SM4E V7.4S, V19.4S + WORD $0xcec08687 \ //SM4E V7.4S, V20.4S + WORD $0xcec086a7 \ //SM4E V7.4S, V21.4S + WORD $0xcec086c7 \ //SM4E V7.4S, V22.4S + WORD $0xcec086e7 \ //SM4E V7.4S, V23.4S + WORD $0xcec08707 \ //SM4E V7.4S, V24.4S + WORD $0xcec08727 \ //SM4E V7.4S, V25.4S + WORD $0xcec08747 \ //SM4E V7.4S, V26.4S + VREV64 V7.B16, V7.B16 \ + VEXT $8, V7.B16, V7.B16, V7.B16 diff --git a/sm4/sm4ni_xts.go b/sm4/sm4ni_xts.go new file mode 100644 index 0000000..86a5d41 --- /dev/null +++ b/sm4/sm4ni_xts.go @@ -0,0 +1,56 @@ +//go:build (amd64 && !purego) || (arm64 && !purego) +// +build amd64,!purego arm64,!purego + +package sm4 + +import ( + "crypto/cipher" +) + +// Assert that sm4CipherAsm implements the xtsEncAble and xtsDecAble interfaces. +var _ xtsEncAble = (*sm4CipherNI)(nil) +var _ xtsDecAble = (*sm4CipherNI)(nil) + +type xtsNI struct { + b *sm4CipherNI + tweak [BlockSize]byte + isGB bool // if true, follows GB/T 17964-2021 + enc int +} + +func (b *sm4CipherNI) NewXTSEncrypter(encryptedTweak *[BlockSize]byte, isGB bool) cipher.BlockMode { + var c xtsNI + c.b = b + c.enc = xtsEncrypt + c.isGB = isGB + copy(c.tweak[:], encryptedTweak[:]) + return &c +} + +func (b *sm4CipherNI) NewXTSDecrypter(encryptedTweak *[BlockSize]byte, isGB bool) cipher.BlockMode { + var c xtsNI + c.b = b + c.enc = xtsDecrypt + c.isGB = isGB + copy(c.tweak[:], encryptedTweak[:]) + return &c +} + +func (x *xtsNI) BlockSize() int { return BlockSize } + +func (x *xtsNI) CryptBlocks(dst, src []byte) { + validateXtsInput(dst, src) + if x.enc == xtsEncrypt { + if x.isGB { + encryptSm4XtsGB(&x.b.enc[0], &x.tweak, dst, src) + } else { + encryptSm4Xts(&x.b.enc[0], &x.tweak, dst, src) + } + } else { + if x.isGB { + decryptSm4XtsGB(&x.b.dec[0], &x.tweak, dst, src) + } else { + decryptSm4Xts(&x.b.dec[0], &x.tweak, dst, src) + } + } +} diff --git a/sm4/xts_arm64.s b/sm4/xts_arm64.s index bbb9de7..8215e30 100644 --- a/sm4/xts_arm64.s +++ b/sm4/xts_arm64.s @@ -41,79 +41,7 @@ #define R24_MASK V31 #include "aesni_macros_arm64.s" - -#define mul2Inline \ - VMOV TW.D[1], I; \ - ASR $63, I; \ - VMOV I, K0.D2; \ - VAND POLY.B16, K0.B16, K0.B16; \ - \ - VUSHR $63, TW.D2, K1.D2; \ - VEXT $8, K1.B16, ZERO.B16, K1.B16; \ - VSHL $1, TW.D2, TW.D2; \ - VEOR K0.B16, TW.B16, TW.B16; \ - VEOR K1.B16, TW.B16, TW.B16 - -#define mul2GBInline \ - VREV64 TW.B16, TW.B16; \ - VEXT $8, TW.B16, TW.B16, TW.B16; \ - \ - VMOV TW.D[0], I; \ - LSL $63, I; \ - ASR $63, I; \ - VMOV I, K0.D2; \ - VAND POLY.B16, K0.B16, K0.B16; \ - \ - VSHL $63, TW.D2, K1.D2; \ - VEXT $8, ZERO.B16, K1.B16, 
K1.B16; \ - VUSHR $1, TW.D2, TW.D2; \ - VEOR K0.B16, TW.B16, TW.B16; \ - VEOR K1.B16, TW.B16, TW.B16; \ - \ - VEXT $8, TW.B16, TW.B16, TW.B16; \ - VREV64 TW.B16, TW.B16 - -#define prepare4Tweaks \ - VMOV TW.B16, T0.B16; \ - mul2Inline; \ - VMOV TW.B16, T1.B16; \ - mul2Inline; \ - VMOV TW.B16, T2.B16; \ - mul2Inline; \ - VMOV TW.B16, T3.B16; \ - mul2Inline - -#define prepare8Tweaks \ - prepare4Tweaks; \ - VMOV TW.B16, T4.B16; \ - mul2Inline; \ - VMOV TW.B16, T5.B16; \ - mul2Inline; \ - VMOV TW.B16, T6.B16; \ - mul2Inline; \ - VMOV TW.B16, T7.B16; \ - mul2Inline - -#define prepareGB4Tweaks \ - VMOV TW.B16, T0.B16; \ - mul2GBInline; \ - VMOV TW.B16, T1.B16; \ - mul2GBInline; \ - VMOV TW.B16, T2.B16; \ - mul2GBInline; \ - VMOV TW.B16, T3.B16; \ - mul2GBInline - -#define prepareGB8Tweaks \ - prepareGB4Tweaks; \ - VMOV TW.B16, T4.B16; \ - mul2GBInline; \ - VMOV TW.B16, T5.B16; \ - mul2GBInline; \ - VMOV TW.B16, T6.B16; \ - mul2GBInline; \ - VMOV TW.B16, T7.B16; \ - mul2GBInline +#include "xts_macros_arm64.s" #define load8blocks \ VLD1.P 64(srcPtr), [B0.S4, B1.S4, B2.S4, B3.S4]; \ @@ -635,7 +563,6 @@ decLastCompleteBlockLoop: VMOV B4.B16, TW.B16 VST1 [B3.B16], (RSP) - SUB $16, srcPtrLen SUB $16, dstPtr, R7 MOVD R7, R9 MOVD RSP, R8 @@ -831,7 +758,6 @@ decLastCompleteBlockLoop: VMOV B4.B16, TW.B16 VST1 [B3.B16], (RSP) - SUB $16, srcPtrLen SUB $16, dstPtr, R7 MOVD R7, R9 MOVD RSP, R8 diff --git a/sm4/xts_macros_arm64.s b/sm4/xts_macros_arm64.s new file mode 100644 index 0000000..b74aeba --- /dev/null +++ b/sm4/xts_macros_arm64.s @@ -0,0 +1,72 @@ +#define mul2Inline \ + VMOV TW.D[1], I; \ + ASR $63, I; \ + VMOV I, K0.D2; \ + VAND POLY.B16, K0.B16, K0.B16; \ + \ + VUSHR $63, TW.D2, K1.D2; \ + VEXT $8, K1.B16, ZERO.B16, K1.B16; \ + VSHL $1, TW.D2, TW.D2; \ + VEOR K0.B16, TW.B16, TW.B16; \ + VEOR K1.B16, TW.B16, TW.B16 + +#define mul2GBInline \ + VREV64 TW.B16, TW.B16; \ + VEXT $8, TW.B16, TW.B16, TW.B16; \ + \ + VMOV TW.D[0], I; \ + LSL $63, I; \ + ASR $63, I; \ + VMOV I, K0.D2; \ + VAND POLY.B16, K0.B16, K0.B16; \ + \ + VSHL $63, TW.D2, K1.D2; \ + VEXT $8, ZERO.B16, K1.B16, K1.B16; \ + VUSHR $1, TW.D2, TW.D2; \ + VEOR K0.B16, TW.B16, TW.B16; \ + VEOR K1.B16, TW.B16, TW.B16; \ + \ + VEXT $8, TW.B16, TW.B16, TW.B16; \ + VREV64 TW.B16, TW.B16 + +#define prepare4Tweaks \ + VMOV TW.B16, T0.B16; \ + mul2Inline; \ + VMOV TW.B16, T1.B16; \ + mul2Inline; \ + VMOV TW.B16, T2.B16; \ + mul2Inline; \ + VMOV TW.B16, T3.B16; \ + mul2Inline + +#define prepare8Tweaks \ + prepare4Tweaks; \ + VMOV TW.B16, T4.B16; \ + mul2Inline; \ + VMOV TW.B16, T5.B16; \ + mul2Inline; \ + VMOV TW.B16, T6.B16; \ + mul2Inline; \ + VMOV TW.B16, T7.B16; \ + mul2Inline + +#define prepareGB4Tweaks \ + VMOV TW.B16, T0.B16; \ + mul2GBInline; \ + VMOV TW.B16, T1.B16; \ + mul2GBInline; \ + VMOV TW.B16, T2.B16; \ + mul2GBInline; \ + VMOV TW.B16, T3.B16; \ + mul2GBInline + +#define prepareGB8Tweaks \ + prepareGB4Tweaks; \ + VMOV TW.B16, T4.B16; \ + mul2GBInline; \ + VMOV TW.B16, T5.B16; \ + mul2GBInline; \ + VMOV TW.B16, T6.B16; \ + mul2GBInline; \ + VMOV TW.B16, T7.B16; \ + mul2GBInline diff --git a/sm4/xts_sm4ni_arm64.s b/sm4/xts_sm4ni_arm64.s new file mode 100644 index 0000000..58fc493 --- /dev/null +++ b/sm4/xts_sm4ni_arm64.s @@ -0,0 +1,483 @@ +//go:build arm64 && !purego +// +build arm64,!purego + +#include "textflag.h" + +#define B0 V0 +#define B1 V1 +#define B2 V2 +#define B3 V3 +#define B4 V4 +#define B5 V5 +#define B6 V6 +#define B7 V7 + +#define POLY V8 +#define ZERO V9 +#define TW V10 + +#define T0 V11 +#define T1 V12 +#define T2 V13 
+#define T3 V14 +#define T4 V15 +#define T5 V16 +#define T6 V17 +#define T7 V18 + +#define K0 V19 +#define K1 V20 +#define K2 V21 +#define K3 V22 +#define K4 V23 +#define K5 V24 +#define K6 V25 +#define K7 V26 + +#include "sm4ni_macros_arm64.s" +#include "xts_macros_arm64.s" + +#define load8blocks \ + VLD1.P 64(srcPtr), [B0.S4, B1.S4, B2.S4, B3.S4]; \ + VEOR T0.B16, B0.B16, B0.B16; \ + VEOR T1.B16, B1.B16, B1.B16; \ + VEOR T2.B16, B2.B16, B2.B16; \ + VEOR T3.B16, B3.B16, B3.B16; \ + \ + VLD1.P 64(srcPtr), [B4.S4, B5.S4, B6.S4, B7.S4]; \ + VEOR T4.B16, B4.B16, B4.B16; \ + VEOR T5.B16, B5.B16, B5.B16; \ + VEOR T6.B16, B6.B16, B6.B16; \ + VEOR T7.B16, B7.B16, B7.B16 + +#define store8blocks \ + VEOR T0.B16, B0.B16, B0.B16; \ + VEOR T1.B16, B1.B16, B1.B16; \ + VEOR T2.B16, B2.B16, B2.B16; \ + VEOR T3.B16, B3.B16, B3.B16; \ + VEOR T4.B16, B4.B16, B4.B16; \ + VEOR T5.B16, B5.B16, B5.B16; \ + VEOR T6.B16, B6.B16, B6.B16; \ + VEOR T7.B16, B7.B16, B7.B16; \ + \ + VST1.P [B0.S4, B1.S4, B2.S4, B3.S4], 64(dstPtr); \ + VST1.P [B4.S4, B5.S4, B6.S4, B7.S4], 64(dstPtr) + +#define dstPtr R2 +#define srcPtr R3 +#define rk R0 +#define twPtr R1 +#define srcPtrLen R4 +#define I R5 + +// func encryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte) +TEXT ·encryptSm4Xts(SB),0,$128-64 + MOVD xk+0(FP), rk + MOVD tweak+8(FP), twPtr + MOVD dst+16(FP), dstPtr + MOVD src+40(FP), srcPtr + MOVD src_len+48(FP), srcPtrLen + + VEOR POLY.B16, POLY.B16, POLY.B16 + VEOR ZERO.B16, ZERO.B16, ZERO.B16 + + MOVD $0x87, I + VMOV I, POLY.D[0] + + // For SM4 round keys are stored in: K0 .. K7 + VLD1.P 64(rk), [K0.S4, K1.S4, K2.S4, K3.S4] + VLD1.P 64(rk), [K4.S4, K5.S4, K6.S4, K7.S4] + + VLD1 (twPtr), [TW.B16] + +xtsSm4EncOctets: + CMP $128, srcPtrLen + BLT xtsSm4EncSingles + SUB $128, srcPtrLen + prepare8Tweaks + load8blocks + sm4eEnc8blocks() + store8blocks + + B xtsSm4EncOctets + +xtsSm4EncSingles: + CMP $16, srcPtrLen + BLT xtsSm4EncTail + SUB $16, srcPtrLen + + VLD1.P 16(srcPtr), [B0.S4] + VEOR TW.B16, B0.B16, B0.B16 + sm4eEnc1block() + VEOR TW.B16, B0.B16, B0.B16 + VST1.P [B0.S4], 16(dstPtr) + + mul2Inline + B xtsSm4EncSingles + +xtsSm4EncTail: + CBZ srcPtrLen, xtsSm4EncDone + SUB $16, dstPtr, R7 + MOVD R7, R9 + MOVD RSP, R8 + VLD1 (R7), [B0.B16] + VST1 [B0.B16], (R8) + + TBZ $3, srcPtrLen, less_than8 + MOVD.P 8(srcPtr), R11 + MOVD.P R11, 8(R8) + MOVD.P 8(R7), R12 + MOVD.P R12, 8(dstPtr) + +less_than8: + TBZ $2, srcPtrLen, less_than4 + MOVWU.P 4(srcPtr), R11 + MOVWU.P R11, 4(R8) + MOVWU.P 4(R7), R12 + MOVWU.P R12, 4(dstPtr) + +less_than4: + TBZ $1, srcPtrLen, less_than2 + MOVHU.P 2(srcPtr), R11 + MOVHU.P R11, 2(R8) + MOVHU.P 2(R7), R12 + MOVHU.P R12, 2(dstPtr) + +less_than2: + TBZ $0, srcPtrLen, xtsSm4EncTailEnc + MOVBU (srcPtr), R11 + MOVBU R11, (R8) + MOVBU (R7), R12 + MOVBU R12, (dstPtr) + +xtsSm4EncTailEnc: + VLD1 (RSP), [B0.B16] + VEOR TW.B16, B0.B16, B0.B16 + sm4eEnc1block() + VEOR TW.B16, B0.B16, B0.B16 + VST1 [B0.B16], (R9) + +xtsSm4EncDone: + VST1 [TW.B16], (twPtr) + RET + +// func encryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte) +TEXT ·encryptSm4XtsGB(SB),0,$128-64 + MOVD xk+0(FP), rk + MOVD tweak+8(FP), twPtr + MOVD dst+16(FP), dstPtr + MOVD src+40(FP), srcPtr + MOVD src_len+48(FP), srcPtrLen + + VEOR POLY.B16, POLY.B16, POLY.B16 + VEOR ZERO.B16, ZERO.B16, ZERO.B16 + + MOVD $0xE1, I + LSL $56, I + VMOV I, POLY.D[1] + + // For SM4 round keys are stored in: K0 .. 
K7 + VLD1.P 64(rk), [K0.S4, K1.S4, K2.S4, K3.S4] + VLD1.P 64(rk), [K4.S4, K5.S4, K6.S4, K7.S4] + + VLD1 (twPtr), [TW.B16] + +xtsSm4EncOctets: + CMP $128, srcPtrLen + BLT xtsSm4EncSingles + SUB $128, srcPtrLen + prepareGB8Tweaks + load8blocks + sm4eEnc8blocks() + store8blocks + + B xtsSm4EncOctets + +xtsSm4EncSingles: + CMP $16, srcPtrLen + BLT xtsSm4EncTail + SUB $16, srcPtrLen + + VLD1.P 16(srcPtr), [B0.S4] + VEOR TW.B16, B0.B16, B0.B16 + sm4eEnc1block() + VEOR TW.B16, B0.B16, B0.B16 + VST1.P [B0.S4], 16(dstPtr) + + mul2GBInline + B xtsSm4EncSingles + +xtsSm4EncTail: + CBZ srcPtrLen, xtsSm4EncDone + SUB $16, dstPtr, R7 + MOVD R7, R9 + MOVD RSP, R8 + VLD1 (R7), [B0.B16] + VST1 [B0.B16], (R8) + + TBZ $3, srcPtrLen, less_than8 + MOVD.P 8(srcPtr), R11 + MOVD.P R11, 8(R8) + MOVD.P 8(R7), R12 + MOVD.P R12, 8(dstPtr) + +less_than8: + TBZ $2, srcPtrLen, less_than4 + MOVWU.P 4(srcPtr), R11 + MOVWU.P R11, 4(R8) + MOVWU.P 4(R7), R12 + MOVWU.P R12, 4(dstPtr) + +less_than4: + TBZ $1, srcPtrLen, less_than2 + MOVHU.P 2(srcPtr), R11 + MOVHU.P R11, 2(R8) + MOVHU.P 2(R7), R12 + MOVHU.P R12, 2(dstPtr) + +less_than2: + TBZ $0, srcPtrLen, xtsSm4EncTailEnc + MOVBU (srcPtr), R11 + MOVBU R11, (R8) + MOVBU (R7), R12 + MOVBU R12, (dstPtr) + +xtsSm4EncTailEnc: + VLD1 (RSP), [B0.B16] + VEOR TW.B16, B0.B16, B0.B16 + sm4eEnc1block() + VEOR TW.B16, B0.B16, B0.B16 + VST1 [B0.B16], (R9) + +xtsSm4EncDone: + VST1 [TW.B16], (twPtr) + RET + +// func decryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte) +TEXT ·decryptSm4Xts(SB),0,$128-64 + MOVD xk+0(FP), rk + MOVD tweak+8(FP), twPtr + MOVD dst+16(FP), dstPtr + MOVD src+40(FP), srcPtr + MOVD src_len+48(FP), srcPtrLen + + VEOR POLY.B16, POLY.B16, POLY.B16 + VEOR ZERO.B16, ZERO.B16, ZERO.B16 + + MOVD $0x87, I + VMOV I, POLY.D[0] + + // For SM4 round keys are stored in: K0 .. 
K7 + VLD1.P 64(rk), [K0.S4, K1.S4, K2.S4, K3.S4] + VLD1.P 64(rk), [K4.S4, K5.S4, K6.S4, K7.S4] + + VLD1 (twPtr), [TW.B16] + +xtsSm4DecOctets: + CMP $128, srcPtrLen + BLT xtsSm4DecSingles + SUB $128, srcPtrLen + + prepare8Tweaks + load8blocks + sm4eEnc8blocks() + store8blocks + + B xtsSm4DecOctets + +xtsSm4DecSingles: + CMP $32, srcPtrLen + BLT xtsSm4DecTail + SUB $16, srcPtrLen + + VLD1.P 16(srcPtr), [B0.S4] + VEOR TW.B16, B0.B16, B0.B16 + sm4eEnc1block() + VEOR TW.B16, B0.B16, B0.B16 + VST1.P [B0.S4], 16(dstPtr) + + mul2Inline + B xtsSm4DecSingles + +xtsSm4DecTail: + CBZ srcPtrLen, xtsSm4DecDone + + CMP $16, srcPtrLen + BEQ xtsSm4DecLastBlock + + VMOV TW.B16, B4.B16 + mul2Inline + VLD1.P 16(srcPtr), [B0.S4] + VEOR TW.B16, B0.B16, B0.B16 + sm4eEnc1block() + VEOR TW.B16, B0.B16, B0.B16 + VST1.P [B0.S4], 16(dstPtr) + VMOV B4.B16, TW.B16 + VST1 [B0.B16], (RSP) + + SUB $16, dstPtr, R7 + MOVD R7, R9 + MOVD RSP, R8 + + TBZ $3, srcPtrLen, less_than8 + MOVD.P 8(srcPtr), R11 + MOVD.P R11, 8(R8) + MOVD.P 8(R7), R12 + MOVD.P R12, 8(dstPtr) + +less_than8: + TBZ $2, srcPtrLen, less_than4 + MOVWU.P 4(srcPtr), R11 + MOVWU.P R11, 4(R8) + MOVWU.P 4(R7), R12 + MOVWU.P R12, 4(dstPtr) + +less_than4: + TBZ $1, srcPtrLen, less_than2 + MOVHU.P 2(srcPtr), R11 + MOVHU.P R11, 2(R8) + MOVHU.P 2(R7), R12 + MOVHU.P R12, 2(dstPtr) + +less_than2: + TBZ $0, srcPtrLen, xtsSm4DecTailDec + MOVBU (srcPtr), R11 + MOVBU R11, (R8) + MOVBU (R7), R12 + MOVBU R12, (dstPtr) + +xtsSm4DecTailDec: + VLD1 (RSP), [B0.B16] + VEOR TW.B16, B0.B16, B0.B16 + sm4eEnc1block() + VEOR TW.B16, B0.B16, B0.B16 + VST1 [B0.B16], (R9) + + B xtsSm4DecDone + +xtsSm4DecLastBlock: + VLD1.P 16(srcPtr), [B0.S4] + VEOR TW.B16, B0.B16, B0.B16 + sm4eEnc1block() + VEOR TW.B16, B0.B16, B0.B16 + VST1.P [B0.S4], 16(dstPtr) + mul2Inline + +xtsSm4DecDone: + VST1 [TW.B16], (twPtr) + RET + +// func decryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte) +TEXT ·decryptSm4XtsGB(SB),0,$128-64 + MOVD xk+0(FP), rk + MOVD tweak+8(FP), twPtr + MOVD dst+16(FP), dstPtr + MOVD src+40(FP), srcPtr + MOVD src_len+48(FP), srcPtrLen + + VEOR POLY.B16, POLY.B16, POLY.B16 + VEOR ZERO.B16, ZERO.B16, ZERO.B16 + + MOVD $0xE1, I + LSL $56, I + VMOV I, POLY.D[1] + + // For SM4 round keys are stored in: K0 .. 
K7 + VLD1.P 64(rk), [K0.S4, K1.S4, K2.S4, K3.S4] + VLD1.P 64(rk), [K4.S4, K5.S4, K6.S4, K7.S4] + + VLD1 (twPtr), [TW.B16] + +xtsSm4DecOctets: + CMP $128, srcPtrLen + BLT xtsSm4DecSingles + SUB $128, srcPtrLen + + prepareGB8Tweaks + load8blocks + sm4eEnc8blocks() + store8blocks + + B xtsSm4DecOctets + +xtsSm4DecSingles: + CMP $32, srcPtrLen + BLT xtsSm4DecTail + SUB $16, srcPtrLen + + VLD1.P 16(srcPtr), [B0.S4] + VEOR TW.B16, B0.B16, B0.B16 + sm4eEnc1block() + VEOR TW.B16, B0.B16, B0.B16 + VST1.P [B0.S4], 16(dstPtr) + + mul2GBInline + B xtsSm4DecSingles + +xtsSm4DecTail: + CBZ srcPtrLen, xtsSm4DecDone + + CMP $16, srcPtrLen + BEQ xtsSm4DecLastBlock + + VMOV TW.B16, B4.B16 + mul2GBInline + VLD1.P 16(srcPtr), [B0.S4] + VEOR TW.B16, B0.B16, B0.B16 + sm4eEnc1block() + VEOR TW.B16, B0.B16, B0.B16 + VST1.P [B0.S4], 16(dstPtr) + VMOV B4.B16, TW.B16 + VST1 [B0.B16], (RSP) + + SUB $16, dstPtr, R7 + MOVD R7, R9 + MOVD RSP, R8 + + TBZ $3, srcPtrLen, less_than8 + MOVD.P 8(srcPtr), R11 + MOVD.P R11, 8(R8) + MOVD.P 8(R7), R12 + MOVD.P R12, 8(dstPtr) + +less_than8: + TBZ $2, srcPtrLen, less_than4 + MOVWU.P 4(srcPtr), R11 + MOVWU.P R11, 4(R8) + MOVWU.P 4(R7), R12 + MOVWU.P R12, 4(dstPtr) + +less_than4: + TBZ $1, srcPtrLen, less_than2 + MOVHU.P 2(srcPtr), R11 + MOVHU.P R11, 2(R8) + MOVHU.P 2(R7), R12 + MOVHU.P R12, 2(dstPtr) + +less_than2: + TBZ $0, srcPtrLen, xtsSm4DecTailDec + MOVBU (srcPtr), R11 + MOVBU R11, (R8) + MOVBU (R7), R12 + MOVBU R12, (dstPtr) + +xtsSm4DecTailDec: + VLD1 (RSP), [B0.B16] + VEOR TW.B16, B0.B16, B0.B16 + sm4eEnc1block() + VEOR TW.B16, B0.B16, B0.B16 + VST1 [B0.B16], (R9) + + B xtsSm4DecDone + +xtsSm4DecLastBlock: + VLD1.P 16(srcPtr), [B0.S4] + VEOR TW.B16, B0.B16, B0.B16 + sm4eEnc1block() + VEOR TW.B16, B0.B16, B0.B16 + VST1.P [B0.S4], 16(dstPtr) + mul2GBInline + +xtsSm4DecDone: + VST1 [TW.B16], (twPtr) + RET
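
For reference, mul2Inline implements the standard IEEE P1619 XTS tweak update: the 128-bit tweak is treated as a little-endian element of GF(2^128) and multiplied by x, with the carry out of the top bit folded back in through the 0x87 reduction constant loaded into POLY.D[0]. mul2GBInline performs the analogous doubling on the byte-reversed tweak with the 0xE1 constant, matching the GB/T 17964-2021 variant selected by isGB. A minimal Go sketch of one non-GB doubling step, equivalent to a single mul2Inline (mul2Ref is an illustrative name, not part of this change):

func mul2Ref(tweak *[16]byte) {
	// Multiply the little-endian tweak by x in GF(2^128):
	// shift the whole 16-byte value left by one bit.
	var carry byte
	for i := 0; i < 16; i++ {
		next := tweak[i] >> 7
		tweak[i] = tweak[i]<<1 | carry
		carry = next
	}
	// Reduce with x^128 = x^7 + x^2 + x + 1 (0x87) when a bit falls off
	// the top, matching the VAND POLY / VEOR steps of the macro.
	if carry != 0 {
		tweak[0] ^= 0x87
	}
}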
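
The new sm4ni_xts.go mirrors the existing xts type for the SM4-NI cipher: sm4CipherNI now satisfies xtsEncAble and xtsDecAble, so callers reach the SM4E-based assembly through the same cipher.BlockMode interface. A minimal in-package sketch of driving it (exampleXTSEncrypt is an illustrative helper, not part of this change; it assumes NewCipher returns the *sm4CipherNI implementation on CPUs with the SM4 instructions, and it imports "errors"):

func exampleXTSEncrypt(key []byte, encryptedTweak [BlockSize]byte, plaintext []byte) ([]byte, error) {
	block, err := NewCipher(key)
	if err != nil {
		return nil, err
	}
	enc, ok := block.(xtsEncAble)
	if !ok {
		return nil, errors.New("sm4: cipher does not support XTS")
	}
	// false selects the IEEE P1619 tweak update; true follows GB/T 17964-2021.
	mode := enc.NewXTSEncrypter(&encryptedTweak, false)
	// XTS needs at least one full block of input; a trailing partial block
	// is handled by the ciphertext-stealing tail code in the assembly.
	dst := make([]byte, len(plaintext))
	mode.CryptBlocks(dst, plaintext)
	return dst, nil
}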