diff --git a/cipher/xts.go b/cipher/xts.go index fe23a73..50fbeb7 100644 --- a/cipher/xts.go +++ b/cipher/xts.go @@ -296,13 +296,19 @@ func (c *xtsDecrypter) CryptBlocks(plaintext, ciphertext []byte) { copy(x[remain:], plaintext[remain:blockSize]) //Copy the final plaintext bytes copy(plaintext[blockSize:], plaintext) + + subtle.XORBytes(x[:], x[:], c.tweak[:]) + c.b.Decrypt(x[:], x[:]) + subtle.XORBytes(plaintext, x[:], c.tweak[:]) } else { //The last block contains exactly 128 bits - copy(x[:], ciphertext) + subtle.XORBytes(plaintext, ciphertext, c.tweak[:]) + c.b.Decrypt(plaintext, plaintext) + subtle.XORBytes(plaintext, plaintext, c.tweak[:]) + // Maybe there are still ciphertext + mul2(&c.tweak, c.isGB) } - subtle.XORBytes(x[:], x[:], c.tweak[:]) - c.b.Decrypt(x[:], x[:]) - subtle.XORBytes(plaintext, x[:], c.tweak[:]) + } } diff --git a/cipher/xts_sm4_test.go b/cipher/xts_sm4_test.go index 1d0076f..30af5a8 100644 --- a/cipher/xts_sm4_test.go +++ b/cipher/xts_sm4_test.go @@ -114,7 +114,7 @@ var xtsGBTestVectors = []struct { }, } -func TestXTS_GB(t *testing.T) { +func TestGBXTSSample(t *testing.T) { for i, test := range xtsGBTestVectors { key := fromHex(test.key) tweak := fromHex(test.tweak) @@ -145,3 +145,85 @@ func TestXTS_GB(t *testing.T) { } } } + +var gbXtsTestVectors = []struct { + key string + sector uint64 + plaintext string + ciphertext string +}{ + { // XTS-SM4-128 applied for a data unit of 32 bytes + "0000000000000000000000000000000000000000000000000000000000000000", + 0, + "0000000000000000000000000000000000000000000000000000000000000000", + "d9b421f731c894fdc35b77291fe4e3b0e58e55e613a862b4d2b0f1073b4b4fd0", + }, { + "1111111111111111111111111111111122222222222222222222222222222222", + 0x3333333333, + "4444444444444444444444444444444444444444444444444444444444444444", + "a74d726c11196a32be04e001ff29d0c7724feef81d666ae5afdfe4649544fcf5", + }, { + "fffefdfcfbfaf9f8f7f6f5f4f3f2f1f022222222222222222222222222222222", + 0x3333333333, + "4444444444444444444444444444444444444444444444444444444444444444", + "7f76088effadf70c02ea9f95da0628d3ef2d6a77004beaa9016001d6789dd5a0", + }, { // XTS-SM4-128 applied for a data unit of 512 bytes + "2718281828459045235360287471352631415926535897932384626433832795", + 0, + "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9fa0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebfc0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedfe0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9fa0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebfc0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedfe0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff", + 
"54dd65b6326faea8fad1a83c63614af398bddb6824735dab93ec4e75734215d463f67daf53742fb2a2847d5fde3984f8882cfd5fa9d6642e1e871c155202440044251465211628ba86f8d2998387a685edde23c07610b7388aab17f205aa5dada33c0a8a4225bc114254c796f800c638e016d199cd21dc28e92dc2b8587545509a8e1d659c596d3f6c8c225a27bdb2b02fe5a0c0183a592b396d32765fe733afb438a6ffb305ae1377c56d872badcebbd37812ff79f0571b3f977537570a1f76b9a50c49ab8d867fa024ea4483a25f7947b07885bb839e777abe76af11adf3108d1195933f96b7949b0664bdb89beb3bc48fb5f5d2109d32332f17c9a6ddea55441d1bbf43280ec7e75791e234d651a0716209eb21ae06061e33a72b0c530cb15fe0b55016b188dad75c4c50232dce1f5df61911c79bee60397b64bb914c0f26efcfb6ffab2bb33bdfd8db98c44debbd4ca865d41cbe1d0801b01aba2603cbea599b32c836789deeb9a3c18f3cae977b42ec81f1dfef6e098dd9e9dd6c1822bb938b08641bb72461f8d38c1724a43ae1254b9223e2270cf9f7d71a6bf093df2079fd2cc2fe87e846d799de30483f80164c31e65d8aae5f72d6dc71118932a008df547c712bee45ddebcdce098d673ef5ede91edfd45d17cb90963d3e2e2e2508a376a7b1af4d69e756ea5df52ac440791d57d56b5e057ad00e077d2df5009416", + }, { // Vector 5 + "2718281828459045235360287471352631415926535897932384626433832795", + 1, + "27a7479befa1d476489f308cd4cfa6e2a96e4bbe3208ff25287dd3819616e89cc78cf7f5e543445f8333d8fa7f56000005279fa5d8b5e4ad40e736ddb4d35412328063fd2aab53e5ea1e0a9f332500a5df9487d07a5c92cc512c8866c7e860ce93fdf166a24912b422976146ae20ce846bb7dc9ba94a767aaef20c0d61ad02655ea92dc4c4e41a8952c651d33174be51a10c421110e6d81588ede82103a252d8a750e8768defffed9122810aaeb99f9172af82b604dc4b8e51bcb08235a6f4341332e4ca60482a4ba1a03b3e65008fc5da76b70bf1690db4eae29c5f1badd03c5ccf2a55d705ddcd86d449511ceb7ec30bf12b1fa35b913f9f747a8afd1b130e94bff94effd01a91735ca1726acd0b197c4e5b03393697e126826fb6bbde8ecc1e08298516e2c9ed03ff3c1b7860f6de76d4cecd94c8119855ef5297ca67e9f3e7ff72b1e99785ca0a7e7720c5b36dc6d72cac9574c8cbbc2f801e23e56fd344b07f22154beba0f08ce8891e643ed995c94d9a69c9f1b5f499027a78572aeebd74d20cc39881c213ee770b1010e4bea718846977ae119f7a023ab58cca0ad752afe656bb3c17256a9f6e9bf19fdd5a38fc82bbe872c5539edb609ef4f79c203ebb140f2e583cb2ad15b4aa5b655016a8449277dbd477ef2c8d6c017db738b18deb4a427d1923ce3ff262735779a418f20a282df920147beabe421ee5319d0568", + "0e36ba273dd121afc77e0d8c00aa4a665f801f3607af61b61058b2f5d007310822200eaaeef759d515ebd032dad4235f5cd2dc735b57b56e003bce3f56890618877db69aa4519edcf681c6fc19c9c4a5655372d1549148c759efba00140275b46b6a5f6522de1702c48ff209a1dd7d1f56e775252796a09c20f903bfb3935bc79c0cdcaa9d2f30e616160e0662fb35311676e86e18d7d90d4203bc6862a9187b8657162143ce914750a86f984cf660311917e00fcf450ee188f088b4222522276bf3391e94de4fdad4134dfc7d08113c65e1b103bd3ad75fb13bba7f842451f9023ed21f1d23bc1c57d593932e021548bbff61ea9a24f359b4f7a8f2a998587495b726411f84734b189f65c4e79f09c7875f9c924b32e5bf2785a9935854e08ce86f5a4a399af6731099a13e10db0b32b888865b4416d69014a8cdb28b3912ec0b832835df7b59637d0687747815ba7cf9efae862dd6e80763acb50898fe1b3ba13a39d81b20d6d50613fbb5fbdcae2a7a87b9377eec455a8bae5102d5e6a7bea9b6b77d3f9895b277a55a524721cd0e59ce35e915de622480c5e0d31d153282dd832278fd2b795933f5dc591c17bd6d7f38fcd6afce551e8485109673881519d2845395ce9ceaea6306e38a73f9bb990931323a3136d18ee76c3e727cfb07cf386519313e1c44adcc50ae79bfac6952e3b98948206fb3dc3ebaed556bf27f16", + }, { // XTS-SM4-128 applied for a data unit that is not a multiple of 16 bytes, but should be a complte byte + "c46acc2e7e013cb71cdbf750cf76b000249fbf4fb6cd17607773c23ffa2c4330", + 94, + "7e9c2289cba460e470222953439cdaa892a5433d4dab2a3f67", + "4d5501ea41cf6b6532b4b7129c6f6ee74605d9fb66f1f12c0c", + }, { + 
"56ffcc9bbbdf413f0fc0f888f44b7493bb1925a39b8adf02d9009bb16db0a887", + 144, + "9a839cc14363bafcfc0cc93b14f8e769d35b94cc98267438e3", + "f04f3f16b354cccdc39fc664ec7f8db010a83bcacbc5c96353", + }, + { + "7454a43b87b1cf0dec95032c22873be3cace3bb795568854c1a008c07c5813f3", + 108, + "41088fa15195b2733fe824d2c1fdc8306080863945fb2a73cf", + "791a9469ed5a22d8195ac37c43c1b0377dc15126349bed1465", + }, +} + +func TestGBXTS(t *testing.T) { + for i, test := range gbXtsTestVectors { + key := fromHex(test.key) + + encrypter, err := cipher.NewGBXTSEncrypterWithSector(sm4.NewCipher, key[:len(key)/2], key[len(key)/2:], test.sector) + if err != nil { + t.Errorf("#%d: failed to create encrypter: %s", i, err) + continue + } + decrypter, err := cipher.NewGBXTSDecrypterWithSector(sm4.NewCipher, key[:len(key)/2], key[len(key)/2:], test.sector) + if err != nil { + t.Errorf("#%d: failed to create decrypter: %s", i, err) + continue + } + plaintext := fromHex(test.plaintext) + ciphertext := make([]byte, len(plaintext)) + + encrypter.CryptBlocks(ciphertext, plaintext) + expectedCiphertext := fromHex(test.ciphertext) + if !bytes.Equal(ciphertext, expectedCiphertext) { + t.Errorf("#%d: encrypted failed, got: %x, want: %x", i, ciphertext, expectedCiphertext) + continue + } + + decrypted := make([]byte, len(ciphertext)) + decrypter.CryptBlocks(decrypted, ciphertext) + if !bytes.Equal(decrypted, plaintext) { + t.Errorf("#%d: decryption failed, got: %x, want: %x", i, decrypted, plaintext) + } + } +} diff --git a/sm4/aesni_macros_amd64.s b/sm4/aesni_macros_amd64.s index eb7fc5b..798bc42 100644 --- a/sm4/aesni_macros_amd64.s +++ b/sm4/aesni_macros_amd64.s @@ -239,6 +239,33 @@ GLOBL r24_mask256<>(SB), 8, $32 PSHUFD $0xFF, rk128, x; \ SM4_ONE_ROUND_SSE(x, y, z, t3, t0, t1, t2); \ +#define SM4_SINGLE_BLOCK(RK, rk128, x, y, z, t0, t1, t2, t3) \ + PSHUFB flip_mask<>(SB), t0; \ + PSHUFD $1, t0, t1; \ + PSHUFD $2, t0, t2; \ + PSHUFD $3, t0, t3; \ + MOVOU (0*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \ + MOVOU (1*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \ + MOVOU (2*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \ + MOVOU (3*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \ + MOVOU (4*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \ + MOVOU (5*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \ + MOVOU (6*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \ + MOVOU (7*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \ + PALIGNR $4, t3, t3; \ + PALIGNR $4, t3, t2; \ + PALIGNR $4, t2, t1; \ + PALIGNR $4, t1, t0; \ + PSHUFB flip_mask<>(SB), t0 + #define SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \ PSHUFB flip_mask<>(SB), t0; \ PSHUFB flip_mask<>(SB), t1; \ diff --git a/sm4/cbc_amd64.s b/sm4/cbc_amd64.s index b415f5a..5c60592 100644 --- a/sm4/cbc_amd64.s +++ b/sm4/cbc_amd64.s @@ -230,7 +230,7 @@ cbCSm4Single: cbcSm4Single16: MOVOU -16(DX), XWORD0 - SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + SM4_SINGLE_BLOCK(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) PXOR 0(SI), XWORD0 diff --git a/sm4/ecb_amd64.s b/sm4/ecb_amd64.s index d3643bd..f637841 100644 --- a/sm4/ecb_amd64.s +++ b/sm4/ecb_amd64.s @@ -117,7 +117,7 @@ ecbSm4Single: JEQ ecbSm4Single32 CMPQ DI, $48 JEQ ecbSm4Single48 - SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + SM4_SINGLE_BLOCK(AX, 
XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) MOVUPS XWORD0, 0(BX) JMP ecbSm4Done diff --git a/sm4/gcm_amd64.s b/sm4/gcm_amd64.s index 1232fc1..60ac3d9 100644 --- a/sm4/gcm_amd64.s +++ b/sm4/gcm_amd64.s @@ -41,10 +41,6 @@ #define NIBBLE_MASK Y11 #define X_NIBBLE_MASK X11 - -DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f -DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607 - DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001 DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000 @@ -79,7 +75,6 @@ DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff -GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16 GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16 GLOBL andMask<>(SB), (NOPTR+RODATA), $240 @@ -102,7 +97,7 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0 MOVOU (tPtr), ACC0 MOVOU (tMsk), T2 - MOVOU bswapMask<>(SB), BSWAP + MOVOU bswap_mask<>(SB), BSWAP MOVOU gcmPoly<>(SB), POLY SHLQ $3, plen @@ -277,7 +272,7 @@ TEXT ·gcmSm4Data(SB),NOSPLIT,$0 PXOR ACC0, ACC0 // MOVOU (tPtr), ACC0 // originally we passed in tag initial value - MOVOU bswapMask<>(SB), BSWAP + MOVOU bswap_mask<>(SB), BSWAP MOVOU gcmPoly<>(SB), POLY TESTQ autLen, autLen @@ -525,7 +520,7 @@ TEXT ·gcmSm4Enc(SB),0,$256-96 CMPB ·useAVX(SB), $1 JE avxGcmSm4Enc - MOVOU bswapMask<>(SB), BSWAP + MOVOU bswap_mask<>(SB), BSWAP MOVOU gcmPoly<>(SB), POLY MOVOU (tPtr), ACC0 @@ -868,7 +863,7 @@ gcmSm4EncDone: RET avxGcmSm4Enc: - VMOVDQU bswapMask<>(SB), BSWAP + VMOVDQU bswap_mask<>(SB), BSWAP VMOVDQU gcmPoly<>(SB), POLY VMOVDQU (tPtr), ACC0 @@ -1196,7 +1191,7 @@ avxGcmSm4EncDone: RET avx2GcmSm4Enc: - VMOVDQU bswapMask<>(SB), BSWAP + VMOVDQU bswap_mask<>(SB), BSWAP VMOVDQU gcmPoly<>(SB), POLY VMOVDQU (tPtr), ACC0 @@ -1229,7 +1224,7 @@ avx2GcmSm4Enc: VMOVDQU T0, (8*16 + 7*16)(SP) increment(7) - VBROADCASTI128 bswapMask<>(SB), DWBSWAP + VBROADCASTI128 bswap_mask<>(SB), DWBSWAP // load 8 ctrs for encryption VMOVDQU (4*32 + 0*32)(SP), DWB0 VMOVDQU (4*32 + 1*32)(SP), DWB1 @@ -1631,7 +1626,7 @@ TEXT ·gcmSm4Dec(SB),0,$128-96 CMPB ·useAVX(SB), $1 JE avxGcmSm4Dec - MOVOU bswapMask<>(SB), BSWAP + MOVOU bswap_mask<>(SB), BSWAP MOVOU gcmPoly<>(SB), POLY MOVOU (tPtr), ACC0 @@ -1859,7 +1854,7 @@ gcmSm4DecDone: RET avxGcmSm4Dec: - VMOVDQU bswapMask<>(SB), BSWAP + VMOVDQU bswap_mask<>(SB), BSWAP VMOVDQU gcmPoly<>(SB), POLY VMOVDQU (tPtr), ACC0 @@ -2082,7 +2077,7 @@ avxGcmSm4DecDone: RET avx2GcmSm4Dec: - VMOVDQU bswapMask<>(SB), BSWAP + VMOVDQU bswap_mask<>(SB), BSWAP VMOVDQU gcmPoly<>(SB), POLY VMOVDQU (tPtr), ACC0 @@ -2114,7 +2109,7 @@ avx2GcmSm4Dec: VMOVDQU T0, (7*16)(SP) increment(7) - VBROADCASTI128 bswapMask<>(SB), DWBSWAP + VBROADCASTI128 bswap_mask<>(SB), DWBSWAP avx2GcmSm4DecOctetsLoop: CMPQ ptxLen, $128 diff --git a/sm4/modes.go b/sm4/modes.go index 9993f69..743949c 100644 --- a/sm4/modes.go +++ b/sm4/modes.go @@ -43,3 +43,19 @@ type ctrAble interface { type gcmAble interface { NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) } + +// xtsEncAble is an interface implemented by ciphers that have a specific +// optimized implementation of XTS encryption, like sm4. +// NewXTSEncrypter will check for this interface and return the specific +// BlockMode if found. +type xtsEncAble interface { + NewXTSEncrypter(encryptedTweak *[BlockSize]byte, isGB bool) cipher.BlockMode +} + +// xtsDecAble is an interface implemented by ciphers that have a specific +// optimized implementation of XTS encryption, like sm4. 
+// NewXTSDecrypter will check for this interface and return the specific +// BlockMode if found. +type xtsDecAble interface { + NewXTSDecrypter(encryptedTweak *[BlockSize]byte, isGB bool) cipher.BlockMode +} diff --git a/sm4/sm4_xts.go b/sm4/sm4_xts.go new file mode 100644 index 0000000..4da3151 --- /dev/null +++ b/sm4/sm4_xts.go @@ -0,0 +1,81 @@ +//go:build amd64 && !purego +// +build amd64,!purego + +package sm4 + +import ( + "crypto/cipher" + + "github.com/emmansun/gmsm/internal/alias" +) + +// Assert that sm4CipherAsm implements the xtsEncAble and xtsDecAble interfaces. +var _ xtsEncAble = (*sm4CipherAsm)(nil) +var _ xtsDecAble = (*sm4CipherAsm)(nil) + +const xtsEncrypt = 1 +const xtsDecrypt = 0 + +type xts struct { + b *sm4CipherAsm + tweak [BlockSize]byte + isGB bool // if true, follows GB/T 17964-2021 + enc int +} + +func (b *sm4CipherAsm) NewXTSEncrypter(encryptedTweak *[BlockSize]byte, isGB bool) cipher.BlockMode { + var c xts + c.b = b + c.enc = xtsEncrypt + c.isGB = isGB + copy(c.tweak[:], encryptedTweak[:]) + return &c +} + +func (b *sm4CipherAsm) NewXTSDecrypter(encryptedTweak *[BlockSize]byte, isGB bool) cipher.BlockMode { + var c xts + c.b = b + c.enc = xtsDecrypt + c.isGB = isGB + copy(c.tweak[:], encryptedTweak[:]) + return &c +} + +func (x *xts) BlockSize() int { return BlockSize } + +//go:noescape +func encryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte) + +//go:noescape +func encryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte) + +//go:noescape +func decryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte) + +//go:noescape +func decryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte) + +func (x *xts) CryptBlocks(dst, src []byte) { + if len(dst) < len(src) { + panic("xts: dst is smaller than src") + } + if len(src) < BlockSize { + panic("xts: src length is smaller than the block size") + } + if alias.InexactOverlap(dst[:len(src)], src) { + panic("xts: invalid buffer overlap") + } + if x.enc == xtsEncrypt { + if x.isGB { + encryptSm4XtsGB(&x.b.enc[0], &x.tweak, dst, src) + } else { + encryptSm4Xts(&x.b.enc[0], &x.tweak, dst, src) + } + } else { + if x.isGB { + decryptSm4XtsGB(&x.b.dec[0], &x.tweak, dst, src) + } else { + decryptSm4Xts(&x.b.dec[0], &x.tweak, dst, src) + } + } +} diff --git a/sm4/xts_amd64.s b/sm4/xts_amd64.s new file mode 100644 index 0000000..6f848ff --- /dev/null +++ b/sm4/xts_amd64.s @@ -0,0 +1,1889 @@ +//go:build amd64 && !purego +// +build amd64,!purego + +#include "textflag.h" + +#define B0 X0 +#define B1 X1 +#define B2 X2 +#define B3 X3 +#define B4 X4 +#define B5 X5 +#define B6 X6 +#define B7 X7 + +#define TW X10 + +#define T0 X11 +#define T1 X12 +#define T2 X13 +#define POLY X14 +#define NIBBLE_MASK Y13 +#define X_NIBBLE_MASK X13 +#define BSWAP X15 +#define DWBSWAP Y15 + +DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000087 +DATA gcmPoly<>+0x08(SB)/8, $0x0000000000000000 + +DATA gbGcmPoly<>+0x00(SB)/8, $0x0000000000000000 +DATA gbGcmPoly<>+0x08(SB)/8, $0xe100000000000000 + +GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16 +GLOBL gbGcmPoly<>(SB), (NOPTR+RODATA), $16 + +#include "aesni_macros_amd64.s" + +#define mul2GBInline \ + PSHUFB BSWAP, TW; \ + \// TW * 2 + MOVOU TW, T0; \ + PSHUFD $0, TW, T1; \ + PSRLQ $1, TW; \ + PSLLQ $63, T0; \ + PSRLDQ $8, T0; \ + POR T0, TW; \ + \// reduction + PSLLL $31, T1; \ + PSRAL $31, T1; \ + PAND POLY, T1; \ + PXOR T1, TW; \ + PSHUFB BSWAP, TW + +#define avxMul2GBInline \ + VPSHUFB BSWAP, TW, TW; \ + \// TW * 2 + VPSLLQ $63, TW, T0; \ + VPSHUFD $0, TW, T1; \ + VPSRLQ $1, TW, 
TW; \ + VPSRLDQ $8, T0, T0; \ + VPOR T0, TW, TW; \ + \// reduction + VPSLLD $31, T1, T1; \ + VPSRAD $31, T1, T1; \ + VPAND POLY, T1, T1; \ + VPXOR T1, TW, TW; \ + VPSHUFB BSWAP, TW, TW + +#define prepareGB4Tweaks \ + MOVOU TW, (16*0)(SP); \ + mul2GBInline; \ + MOVOU TW, (16*1)(SP); \ + mul2GBInline; \ + MOVOU TW, (16*2)(SP); \ + mul2GBInline; \ + MOVOU TW, (16*3)(SP); \ + mul2GBInline + +#define prepareGB8Tweaks \ + prepareGB4Tweaks; \ + MOVOU TW, (16*4)(SP); \ + mul2GBInline; \ + MOVOU TW, (16*5)(SP); \ + mul2GBInline; \ + MOVOU TW, (16*6)(SP); \ + mul2GBInline; \ + MOVOU TW, (16*7)(SP); \ + mul2GBInline + +#define avxPrepareGB4Tweaks \ + VMOVDQU TW, (16*0)(SP); \ + avxMul2GBInline; \ + VMOVDQU TW, (16*1)(SP); \ + avxMul2GBInline; \ + VMOVDQU TW, (16*2)(SP); \ + avxMul2GBInline; \ + VMOVDQU TW, (16*3)(SP); \ + avxMul2GBInline + +#define avxPrepareGB8Tweaks \ + avxPrepareGB4Tweaks; \ + VMOVDQU TW, (16*4)(SP); \ + avxMul2GBInline; \ + VMOVDQU TW, (16*5)(SP); \ + avxMul2GBInline; \ + VMOVDQU TW, (16*6)(SP); \ + avxMul2GBInline; \ + VMOVDQU TW, (16*7)(SP); \ + avxMul2GBInline + +#define avxPrepareGB16Tweaks \ + avxPrepareGB8Tweaks; \ + VMOVDQU TW, (16*8)(SP); \ + avxMul2GBInline; \ + VMOVDQU TW, (16*9)(SP); \ + avxMul2GBInline; \ + VMOVDQU TW, (16*10)(SP); \ + avxMul2GBInline; \ + VMOVDQU TW, (16*11)(SP); \ + avxMul2GBInline; \ + VMOVDQU TW, (16*12)(SP); \ + avxMul2GBInline; \ + VMOVDQU TW, (16*13)(SP); \ + avxMul2GBInline; \ + VMOVDQU TW, (16*14)(SP); \ + avxMul2GBInline; \ + VMOVDQU TW, (16*15)(SP); \ + avxMul2GBInline + +#define mul2Inline \ + PSHUFD $0xff, TW, T0; \ + MOVOU TW, T1; \ + PSRAL $31, T0; \ + PAND POLY, T0; \ + PSRLL $31, T1; \ + PSLLDQ $4, T1; \ + PSLLL $1, TW; \ + PXOR T0, TW; \ + PXOR T1, TW + +#define avxMul2Inline \ + VPSHUFD $0xff, TW, T0; \ + VPSRLD $31, TW, T1; \ + VPSRAD $31, T0, T0; \ + VPAND POLY, T0, T0; \ + VPSLLDQ $4, T1, T1; \ + VPSLLD $1, TW, TW; \ + VPXOR T0, TW, TW; \ + VPXOR T1, TW, TW + +#define prepare4Tweaks \ + MOVOU TW, (16*0)(SP); \ + mul2Inline; \ + MOVOU TW, (16*1)(SP); \ + mul2Inline; \ + MOVOU TW, (16*2)(SP); \ + mul2Inline; \ + MOVOU TW, (16*3)(SP); \ + mul2Inline + +#define prepare8Tweaks \ + prepare4Tweaks; \ + MOVOU TW, (16*4)(SP); \ + mul2Inline; \ + MOVOU TW, (16*5)(SP); \ + mul2Inline; \ + MOVOU TW, (16*6)(SP); \ + mul2Inline; \ + MOVOU TW, (16*7)(SP); \ + mul2Inline + +#define avxPrepare4Tweaks \ + VMOVDQU TW, (16*0)(SP); \ + avxMul2Inline; \ + VMOVDQU TW, (16*1)(SP); \ + avxMul2Inline; \ + VMOVDQU TW, (16*2)(SP); \ + avxMul2Inline; \ + VMOVDQU TW, (16*3)(SP); \ + avxMul2Inline + +#define avxPrepare8Tweaks \ + prepare4Tweaks; \ + VMOVDQU TW, (16*4)(SP); \ + avxMul2Inline; \ + VMOVDQU TW, (16*5)(SP); \ + avxMul2Inline; \ + VMOVDQU TW, (16*6)(SP); \ + avxMul2Inline; \ + VMOVDQU TW, (16*7)(SP); \ + avxMul2Inline + +#define avxPrepare16Tweaks \ + prepare8Tweaks; \ + VMOVDQU TW, (16*8)(SP); \ + avxMul2Inline; \ + VMOVDQU TW, (16*9)(SP); \ + avxMul2Inline; \ + VMOVDQU TW, (16*10)(SP); \ + avxMul2Inline; \ + VMOVDQU TW, (16*11)(SP); \ + avxMul2Inline; \ + VMOVDQU TW, (16*12)(SP); \ + avxMul2Inline; \ + VMOVDQU TW, (16*13)(SP); \ + avxMul2Inline; \ + VMOVDQU TW, (16*14)(SP); \ + avxMul2Inline; \ + VMOVDQU TW, (16*15)(SP); \ + avxMul2Inline + +#define sseLoad4Blocks \ + MOVOU (16*0)(DX), B0; \ + PXOR (16*0)(SP), B0; \ + MOVOU (16*1)(DX), B1; \ + PXOR (16*1)(SP), B1; \ + MOVOU (16*2)(DX), B2; \ + PXOR (16*2)(SP), B2; \ + MOVOU (16*3)(DX), B3; \ + PXOR (16*3)(SP), B3 + +#define sseStore4Blocks \ + PXOR (16*0)(SP), B0; \ + MOVOU B0, (16*0)(CX); \ + 
PXOR (16*1)(SP), B1; \ + MOVOU B1, (16*1)(CX); \ + PXOR (16*2)(SP), B2; \ + MOVOU B2, (16*2)(CX); \ + PXOR (16*3)(SP), B3; \ + MOVOU B3, (16*3)(CX) + +#define sseLoad8Blocks \ + sseLoad4Blocks; \ + MOVOU (16*4)(DX), B4; \ + PXOR (16*4)(SP), B4; \ + MOVOU (16*5)(DX), B5; \ + PXOR (16*5)(SP), B5; \ + MOVOU (16*6)(DX), B6; \ + PXOR (16*6)(SP), B6; \ + MOVOU (16*7)(DX), B7; \ + PXOR (16*7)(SP), B7 + +#define sseStore8Blocks \ + sseStore4Blocks; \ + PXOR (16*4)(SP), B4; \ + MOVOU B4, (16*4)(CX); \ + PXOR (16*5)(SP), B5; \ + MOVOU B5, (16*5)(CX); \ + PXOR (16*6)(SP), B6; \ + MOVOU B6, (16*6)(CX); \ + PXOR (16*7)(SP), B7; \ + MOVOU B7, (16*7)(CX) + +#define avxLoad4Blocks \ + VMOVDQU (16*0)(DX), B0; \ + VPXOR (16*0)(SP), B0, B0; \ + VMOVDQU (16*1)(DX), B1; \ + VPXOR (16*1)(SP), B1, B1; \ + VMOVDQU (16*2)(DX), B2; \ + VPXOR (16*2)(SP), B2, B2; \ + VMOVDQU (16*3)(DX), B3; \ + VPXOR (16*3)(SP), B3, B3 + +#define avxStore4Blocks \ + VPXOR (16*0)(SP), B0, B0; \ + VMOVDQU B0, (16*0)(CX); \ + VPXOR (16*1)(SP), B1, B1; \ + VMOVDQU B1, (16*1)(CX); \ + VPXOR (16*2)(SP), B2, B2; \ + VMOVDQU B2, (16*2)(CX); \ + VPXOR (16*3)(SP), B3, B3; \ + VMOVDQU B3, (16*3)(CX) + +#define avxLoad8Blocks \ + avxLoad4Blocks; \ + VMOVDQU (16*4)(DX), B4; \ + VPXOR (16*4)(SP), B4, B4; \ + VMOVDQU (16*5)(DX), B5; \ + VPXOR (16*5)(SP), B5, B5; \ + VMOVDQU (16*6)(DX), B6; \ + VPXOR (16*6)(SP), B6, B6; \ + VMOVDQU (16*7)(DX), B7; \ + VPXOR (16*7)(SP), B7, B7 + +#define avxStore8Blocks \ + avxStore4Blocks; \ + VPXOR (16*4)(SP), B4, B4; \ + VMOVDQU B4, (16*4)(CX); \ + VPXOR (16*5)(SP), B5, B5; \ + VMOVDQU B5, (16*5)(CX); \ + VPXOR (16*6)(SP), B6, B6; \ + VMOVDQU B6, (16*6)(CX); \ + VPXOR (16*7)(SP), B7, B7; \ + VMOVDQU B7, (16*7)(CX) + +#define avx2Load8Blocks \ + VMOVDQU (32*0)(DX), Y0; \ + VPXOR (32*0)(SP), Y0, Y0; \ + VMOVDQU (32*1)(DX), Y1; \ + VPXOR (32*1)(SP), Y1, Y1; \ + VMOVDQU (32*2)(DX), Y2; \ + VPXOR (32*2)(SP), Y2, Y2; \ + VMOVDQU (32*3)(DX), Y3; \ + VPXOR (32*3)(SP), Y3, Y3 + +#define avx2Load16Blocks \ + avx2Load8Blocks; \ + VMOVDQU (32*4)(DX), Y4; \ + VPXOR (32*4)(SP), Y4, Y4; \ + VMOVDQU (32*5)(DX), Y5; \ + VPXOR (32*5)(SP), Y5, Y5; \ + VMOVDQU (32*6)(DX), Y6; \ + VPXOR (32*6)(SP), Y6, Y6; \ + VMOVDQU (32*7)(DX), Y7; \ + VPXOR (32*7)(SP), Y7, Y7 + +#define avx2LE2BE8Blocks \ + VBROADCASTI128 flip_mask<>(SB), Y11; \ + VPSHUFB Y11, Y0, Y0; \ + VPSHUFB Y11, Y1, Y1; \ + VPSHUFB Y11, Y2, Y2; \ + VPSHUFB Y11, Y3, Y3; \ + +#define avx2LE2BE16Blocks \ + avx2LE2BE8Blocks; \ + VPSHUFB Y11, Y4, Y4; \ + VPSHUFB Y11, Y5, Y5; \ + VPSHUFB Y11, Y6, Y6; \ + VPSHUFB Y11, Y7, Y7 + +#define avx2Store8Blocks \ + VPXOR (32*0)(SP), Y0, Y0; \ + VMOVDQU Y0, (32*0)(CX); \ + VPXOR (32*1)(SP), Y1, Y1; \ + VMOVDQU Y1, (32*1)(CX); \ + VPXOR (32*2)(SP), Y2, Y2; \ + VMOVDQU Y2, (32*2)(CX); \ + VPXOR (32*3)(SP), Y3, Y3; \ + VMOVDQU Y3, (32*3)(CX); \ + +#define avx2Store16Blocks \ + avx2Store8Blocks; \ + VPXOR (32*4)(SP), Y4, Y4; \ + VMOVDQU Y4, (32*4)(CX); \ + VPXOR (32*5)(SP), Y5, Y5; \ + VMOVDQU Y5, (32*5)(CX); \ + VPXOR (32*6)(SP), Y6, Y6; \ + VMOVDQU Y6, (32*6)(CX); \ + VPXOR (32*7)(SP), Y7, Y7; \ + VMOVDQU Y7, (32*7)(CX) + +#define avx2ByteSwap8Blocks \ + VPSHUFB DWBSWAP, Y0, Y0; \ + VPSHUFB DWBSWAP, Y1, Y1; \ + VPSHUFB DWBSWAP, Y2, Y2; \ + VPSHUFB DWBSWAP, Y3, Y3; \ + +#define avx2ByteSwap16Blocks \ + avx2ByteSwap8Blocks; \ + VPSHUFB DWBSWAP, Y4, Y4; \ + VPSHUFB DWBSWAP, Y5, Y5; \ + VPSHUFB DWBSWAP, Y6, Y6; \ + VPSHUFB DWBSWAP, Y7, Y7 + +// func encryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte) +TEXT 
·encryptSm4Xts(SB),0,$256-64 + MOVQ xk+0(FP), AX + MOVQ tweak+8(FP), BX + MOVQ dst+16(FP), CX + MOVQ src+40(FP), DX + MOVQ src_len+48(FP), DI + + CMPB ·useAVX2(SB), $1 + JE avx2XtsSm4Enc + + CMPB ·useAVX(SB), $1 + JE avxXtsSm4Enc + + MOVOU gcmPoly<>(SB), POLY + + MOVOU (0*16)(BX), TW + +xtsSm4EncOctets: + CMPQ DI, $128 + JB xtsSm4EncNibbles + SUBQ $128, DI + + // prepare tweaks + prepare8Tweaks + // load 8 blocks for encryption + sseLoad8Blocks + + SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7) + + sseStore8Blocks + + LEAQ 128(DX), DX + LEAQ 128(CX), CX + + JMP xtsSm4EncOctets + +xtsSm4EncNibbles: + CMPQ DI, $64 + JB xtsSm4EncSingles + SUBQ $64, DI + + // prepare tweaks + prepare4Tweaks + // load 4 blocks for encryption + sseLoad4Blocks + + SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3) + + sseStore4Blocks + + LEAQ 64(DX), DX + LEAQ 64(CX), CX + +xtsSm4EncSingles: + CMPQ DI, $16 + JB xtsSm4EncTail + SUBQ $16, DI + + // load 1 block for encryption + MOVOU (16*0)(DX), B0 + + PXOR TW, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + PXOR TW, B0 + MOVOU B0, (16*0)(CX) + mul2Inline + + LEAQ 16(DX), DX + LEAQ 16(CX), CX + + JMP xtsSm4EncSingles + +xtsSm4EncTail: + TESTQ DI, DI + JE xtsSm4EncDone + + LEAQ -16(CX), R8 + MOVOU (16*0)(R8), B0 + MOVOU B0, (16*0)(SP) + + CMPQ DI, $8 + JB loop_1b + SUBQ $8, DI + MOVQ (DX)(DI*1), R9 + MOVQ (SP)(DI*1), R10 + MOVQ R9, (SP)(DI*1) + MOVQ R10, (CX)(DI*1) + + TESTQ DI, DI + JE xtsSm4EncTailEnc + +loop_1b: + SUBQ $1, DI + MOVB (DX)(DI*1), R9 + MOVB (SP)(DI*1), R10 + MOVB R9, (SP)(DI*1) + MOVB R10, (CX)(DI*1) + TESTQ DI, DI + JNE loop_1b + +xtsSm4EncTailEnc: + MOVOU (16*0)(SP), B0 + PXOR TW, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + PXOR TW, B0 + MOVOU B0, (16*0)(R8) + +xtsSm4EncDone: + MOVOU TW, (16*0)(BX) + RET + +avxXtsSm4Enc: + VMOVDQU gcmPoly<>(SB), POLY + VMOVDQU (0*16)(BX), TW + +avxXtsSm4EncOctets: + CMPQ DI, $128 + JB avxXtsSm4EncNibbles + SUBQ $128, DI + + // prepare tweaks + avxPrepare8Tweaks + // load 8 blocks for encryption + avxLoad8Blocks + + AVX_SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7) + + avxStore8Blocks + + LEAQ 128(DX), DX + LEAQ 128(CX), CX + + JMP avxXtsSm4EncOctets + +avxXtsSm4EncNibbles: + CMPQ DI, $64 + JB avxXtsSm4EncSingles + SUBQ $64, DI + + // prepare tweaks + avxPrepare4Tweaks + // load 4 blocks for encryption + avxLoad4Blocks + + AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3) + + avxStore4Blocks + + LEAQ 64(DX), DX + LEAQ 64(CX), CX + +avxXtsSm4EncSingles: + CMPQ DI, $16 + JB avxXtsSm4EncTail + SUBQ $16, DI + + // load 1 block for encryption + VMOVDQU (16*0)(DX), B0 + + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(CX) + avxMul2Inline + + LEAQ 16(DX), DX + LEAQ 16(CX), CX + + JMP avxXtsSm4EncSingles + +avxXtsSm4EncTail: + TESTQ DI, DI + JE avxXtsSm4EncDone + + LEAQ -16(CX), R8 + VMOVDQU (16*0)(R8), B0 + VMOVDQU B0, (16*0)(SP) + + CMPQ DI, $8 + JB avx_loop_1b + SUBQ $8, DI + MOVQ (DX)(DI*1), R9 + MOVQ (SP)(DI*1), R10 + MOVQ R9, (SP)(DI*1) + MOVQ R10, (CX)(DI*1) + + TESTQ DI, DI + JE avxXtsSm4EncTailEnc + +avx_loop_1b: + SUBQ $1, DI + MOVB (DX)(DI*1), R9 + MOVB (SP)(DI*1), R10 + MOVB R9, (SP)(DI*1) + MOVB R10, (CX)(DI*1) + TESTQ DI, DI + JNE avx_loop_1b + +avxXtsSm4EncTailEnc: + VMOVDQU (16*0)(SP), B0 + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(R8) + +avxXtsSm4EncDone: + VMOVDQU TW, (16*0)(BX) + RET + +avx2XtsSm4Enc: + VMOVDQU 
gcmPoly<>(SB), POLY + VMOVDQU (0*16)(BX), TW + VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK + VBROADCASTI128 bswap_mask<>(SB), DWBSWAP + +avx2XtsSm4Enc16Blocks: + CMPQ DI, $256 + JB avx2XtsSm4EncOctets + SUBQ $256, DI + + // prepare tweaks + avxPrepare16Tweaks + // load 16 blocks for encryption + avx2Load16Blocks + // Apply Byte Flip Mask: LE -> BE + avx2LE2BE16Blocks + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9) + TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9) + + AVX2_SM4_16BLOCKS(AX, Y8, Y9, X8, X9, Y11, Y12, Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7) + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9) + TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9) + avx2ByteSwap16Blocks + avx2Store16Blocks + + LEAQ 256(DX), DX + LEAQ 256(CX), CX + JMP avx2XtsSm4Enc16Blocks + +avx2XtsSm4EncOctets: + CMPQ DI, $128 + JB avx2XtsSm4EncNibbles + SUBQ $128, DI + + // prepare tweaks + avxPrepare8Tweaks + // load 8 blocks for encryption + avx2Load8Blocks + // Apply Byte Flip Mask: LE -> BE + avx2LE2BE8Blocks + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9) + + AVX2_SM4_8BLOCKS(AX, Y8, Y9, X8, X9, Y7, Y0, Y1, Y2, Y3) + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9) + avx2ByteSwap8Blocks + avx2Store8Blocks + + LEAQ 128(DX), DX + LEAQ 128(CX), CX + +avx2XtsSm4EncNibbles: + CMPQ DI, $64 + JB avx2XtsSm4EncSingles + SUBQ $64, DI + + // prepare tweaks + avxPrepare4Tweaks + + // load 4 blocks for encryption + avxLoad4Blocks + + AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3) + + avxStore4Blocks + + LEAQ 64(DX), DX + LEAQ 64(CX), CX + +avx2XtsSm4EncSingles: + CMPQ DI, $16 + JB avx2XtsSm4EncTail + SUBQ $16, DI + + // load 1 block for encryption + VMOVDQU (16*0)(DX), B0 + + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(CX) + avxMul2Inline + + LEAQ 16(DX), DX + LEAQ 16(CX), CX + + JMP avx2XtsSm4EncSingles + +avx2XtsSm4EncTail: + TESTQ DI, DI + JE avx2XtsSm4EncDone + + LEAQ -16(CX), R8 + VMOVDQU (16*0)(R8), B0 + VMOVDQU B0, (16*0)(SP) + + CMPQ DI, $8 + JB avx2_loop_1b + SUBQ $8, DI + MOVQ (DX)(DI*1), R9 + MOVQ (SP)(DI*1), R10 + MOVQ R9, (SP)(DI*1) + MOVQ R10, (CX)(DI*1) + + TESTQ DI, DI + JE avx2XtsSm4EncTailEnc + +avx2_loop_1b: + SUBQ $1, DI + MOVB (DX)(DI*1), R9 + MOVB (SP)(DI*1), R10 + MOVB R9, (SP)(DI*1) + MOVB R10, (CX)(DI*1) + TESTQ DI, DI + JNE avx2_loop_1b + +avx2XtsSm4EncTailEnc: + VMOVDQU (16*0)(SP), B0 + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(R8) + +avx2XtsSm4EncDone: + VMOVDQU TW, (16*0)(BX) + VZEROUPPER + RET + +// func encryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte) +TEXT ·encryptSm4XtsGB(SB),0,$256-64 + MOVQ xk+0(FP), AX + MOVQ tweak+8(FP), BX + MOVQ dst+16(FP), CX + MOVQ src+40(FP), DX + MOVQ src_len+48(FP), DI + + CMPB ·useAVX2(SB), $1 + JE avx2XtsSm4Enc + + CMPB ·useAVX(SB), $1 + JE avxXtsSm4Enc + + MOVOU gbGcmPoly<>(SB), POLY + MOVOU bswap_mask<>(SB), BSWAP + MOVOU (0*16)(BX), TW + +xtsSm4EncOctets: + CMPQ DI, $128 + JB xtsSm4EncNibbles + SUBQ $128, DI + + // prepare tweaks + prepareGB8Tweaks + // load 8 blocks for encryption + sseLoad8Blocks + + SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7) + + sseStore8Blocks + + LEAQ 128(DX), DX + LEAQ 128(CX), CX + + JMP xtsSm4EncOctets + +xtsSm4EncNibbles: + CMPQ DI, $64 + JB xtsSm4EncSingles + SUBQ $64, DI + + // prepare tweaks + prepareGB4Tweaks + // load 4 blocks for 
encryption + sseLoad4Blocks + + SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3) + + sseStore4Blocks + + LEAQ 64(DX), DX + LEAQ 64(CX), CX + +xtsSm4EncSingles: + CMPQ DI, $16 + JB xtsSm4EncTail + SUBQ $16, DI + + // load 1 block for encryption + MOVOU (16*0)(DX), B0 + + PXOR TW, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + PXOR TW, B0 + MOVOU B0, (16*0)(CX) + mul2GBInline + + LEAQ 16(DX), DX + LEAQ 16(CX), CX + + JMP xtsSm4EncSingles + +xtsSm4EncTail: + TESTQ DI, DI + JE xtsSm4EncDone + + LEAQ -16(CX), R8 + MOVOU (16*0)(R8), B0 + MOVOU B0, (16*0)(SP) + + CMPQ DI, $8 + JB loop_1b + SUBQ $8, DI + MOVQ (DX)(DI*1), R9 + MOVQ (SP)(DI*1), R10 + MOVQ R9, (SP)(DI*1) + MOVQ R10, (CX)(DI*1) + + TESTQ DI, DI + JE xtsSm4EncTailEnc + +loop_1b: + SUBQ $1, DI + MOVB (DX)(DI*1), R9 + MOVB (SP)(DI*1), R10 + MOVB R9, (SP)(DI*1) + MOVB R10, (CX)(DI*1) + TESTQ DI, DI + JNE loop_1b + +xtsSm4EncTailEnc: + MOVOU (16*0)(SP), B0 + PXOR TW, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + PXOR TW, B0 + MOVOU B0, (16*0)(R8) + +xtsSm4EncDone: + MOVOU TW, (16*0)(BX) + RET + +avxXtsSm4Enc: + VMOVDQU gbGcmPoly<>(SB), POLY + VMOVDQU bswap_mask<>(SB), BSWAP + VMOVDQU (0*16)(BX), TW + +avxXtsSm4EncOctets: + CMPQ DI, $128 + JB avxXtsSm4EncNibbles + SUBQ $128, DI + + // prepare tweaks + avxPrepareGB8Tweaks + // load 8 blocks for encryption + avxLoad8Blocks + + AVX_SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7) + + avxStore8Blocks + + LEAQ 128(DX), DX + LEAQ 128(CX), CX + + JMP avxXtsSm4EncOctets + +avxXtsSm4EncNibbles: + CMPQ DI, $64 + JB avxXtsSm4EncSingles + SUBQ $64, DI + + // prepare tweaks + avxPrepareGB4Tweaks + // load 4 blocks for encryption + avxLoad4Blocks + + AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3) + + avxStore4Blocks + + LEAQ 64(DX), DX + LEAQ 64(CX), CX + +avxXtsSm4EncSingles: + CMPQ DI, $16 + JB avxXtsSm4EncTail + SUBQ $16, DI + + // load 1 block for encryption + VMOVDQU (16*0)(DX), B0 + + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(CX) + avxMul2GBInline + + LEAQ 16(DX), DX + LEAQ 16(CX), CX + + JMP avxXtsSm4EncSingles + +avxXtsSm4EncTail: + TESTQ DI, DI + JE avxXtsSm4EncDone + + LEAQ -16(CX), R8 + VMOVDQU (16*0)(R8), B0 + VMOVDQU B0, (16*0)(SP) + + CMPQ DI, $8 + JB avx_loop_1b + SUBQ $8, DI + MOVQ (DX)(DI*1), R9 + MOVQ (SP)(DI*1), R10 + MOVQ R9, (SP)(DI*1) + MOVQ R10, (CX)(DI*1) + + TESTQ DI, DI + JE avxXtsSm4EncTailEnc + +avx_loop_1b: + SUBQ $1, DI + MOVB (DX)(DI*1), R9 + MOVB (SP)(DI*1), R10 + MOVB R9, (SP)(DI*1) + MOVB R10, (CX)(DI*1) + TESTQ DI, DI + JNE avx_loop_1b + +avxXtsSm4EncTailEnc: + VMOVDQU (16*0)(SP), B0 + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(R8) + +avxXtsSm4EncDone: + VMOVDQU TW, (16*0)(BX) + RET + +avx2XtsSm4Enc: + VMOVDQU gbGcmPoly<>(SB), POLY + VMOVDQU (0*16)(BX), TW + VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK + VBROADCASTI128 bswap_mask<>(SB), DWBSWAP + +avx2XtsSm4Enc16Blocks: + CMPQ DI, $256 + JB avx2XtsSm4EncOctets + SUBQ $256, DI + + // prepare tweaks + avxPrepareGB16Tweaks + // load 16 blocks for encryption + avx2Load16Blocks + // Apply Byte Flip Mask: LE -> BE + avx2LE2BE16Blocks + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9) + TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9) + + AVX2_SM4_16BLOCKS(AX, Y8, Y9, X8, X9, Y11, Y12, Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7) + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9) + 
TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9) + avx2ByteSwap16Blocks + avx2Store16Blocks + + LEAQ 256(DX), DX + LEAQ 256(CX), CX + JMP avx2XtsSm4Enc16Blocks + +avx2XtsSm4EncOctets: + CMPQ DI, $128 + JB avx2XtsSm4EncNibbles + SUBQ $128, DI + + // prepare tweaks + avxPrepareGB8Tweaks + // load 8 blocks for encryption + avx2Load8Blocks + // Apply Byte Flip Mask: LE -> BE + avx2LE2BE8Blocks + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9) + + AVX2_SM4_8BLOCKS(AX, Y8, Y9, X8, X9, Y7, Y0, Y1, Y2, Y3) + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9) + avx2ByteSwap8Blocks + avx2Store8Blocks + + LEAQ 128(DX), DX + LEAQ 128(CX), CX + +avx2XtsSm4EncNibbles: + CMPQ DI, $64 + JB avx2XtsSm4EncSingles + SUBQ $64, DI + + // prepare tweaks + avxPrepareGB4Tweaks + // load 4 blocks for encryption + avxLoad4Blocks + + AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3) + + avxStore4Blocks + + LEAQ 64(DX), DX + LEAQ 64(CX), CX + +avx2XtsSm4EncSingles: + CMPQ DI, $16 + JB avx2XtsSm4EncTail + SUBQ $16, DI + + // load 1 block for encryption + VMOVDQU (16*0)(DX), B0 + + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(CX) + avxMul2GBInline + + LEAQ 16(DX), DX + LEAQ 16(CX), CX + + JMP avx2XtsSm4EncSingles + +avx2XtsSm4EncTail: + TESTQ DI, DI + JE avx2XtsSm4EncDone + + LEAQ -16(CX), R8 + VMOVDQU (16*0)(R8), B0 + VMOVDQU B0, (16*0)(SP) + + CMPQ DI, $8 + JB avx2_loop_1b + SUBQ $8, DI + MOVQ (DX)(DI*1), R9 + MOVQ (SP)(DI*1), R10 + MOVQ R9, (SP)(DI*1) + MOVQ R10, (CX)(DI*1) + + TESTQ DI, DI + JE avx2XtsSm4EncTailEnc + +avx2_loop_1b: + SUBQ $1, DI + MOVB (DX)(DI*1), R9 + MOVB (SP)(DI*1), R10 + MOVB R9, (SP)(DI*1) + MOVB R10, (CX)(DI*1) + TESTQ DI, DI + JNE avx2_loop_1b + +avx2XtsSm4EncTailEnc: + VMOVDQU (16*0)(SP), B0 + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(R8) + +avx2XtsSm4EncDone: + VMOVDQU TW, (16*0)(BX) + VZEROUPPER + RET + +// func decryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte) +TEXT ·decryptSm4Xts(SB),0,$256-64 + MOVQ xk+0(FP), AX + MOVQ tweak+8(FP), BX + MOVQ dst+16(FP), CX + MOVQ src+40(FP), DX + MOVQ src_len+48(FP), DI + + CMPB ·useAVX2(SB), $1 + JE avx2XtsSm4Dec + + CMPB ·useAVX(SB), $1 + JE avxXtsSm4Dec + + MOVOU gcmPoly<>(SB), POLY + MOVOU (0*16)(BX), TW + +xtsSm4DecOctets: + CMPQ DI, $128 + JB xtsSm4DecNibbles + SUBQ $128, DI + + // prepare tweaks + prepare8Tweaks + // load 8 blocks for decryption + sseLoad8Blocks + + SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7) + + sseStore8Blocks + + LEAQ 128(DX), DX + LEAQ 128(CX), CX + + JMP xtsSm4DecOctets + +xtsSm4DecNibbles: + CMPQ DI, $64 + JB xtsSm4DecSingles + SUBQ $64, DI + + // prepare tweaks + prepare4Tweaks + // load 4 blocks for decryption + sseLoad4Blocks + + SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3) + + sseStore4Blocks + + LEAQ 64(DX), DX + LEAQ 64(CX), CX + +xtsSm4DecSingles: + CMPQ DI, $32 + JB xtsSm4DecTail + SUBQ $16, DI + + // load 1 block for decryption + MOVOU (16*0)(DX), B0 + + PXOR TW, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + PXOR TW, B0 + MOVOU B0, (16*0)(CX) + mul2Inline + + LEAQ 16(DX), DX + LEAQ 16(CX), CX + + JMP xtsSm4DecSingles + +xtsSm4DecTail: + TESTQ DI, DI + JE xtsSm4DecDone + + CMPQ DI, $16 + JE xtsSm4DecLastBlock + + // length > 16 + // load 1 block for decryption + MOVOU (16*0)(DX), B0 + MOVOU TW, B5 + mul2Inline + PXOR TW, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, 
B2, B3) + PXOR TW, B0 + MOVOU B0, (16*0)(CX) + MOVOU B5, TW + + SUBQ $16, DI + LEAQ 16(DX), DX + LEAQ 16(CX), CX + LEAQ -16(CX), R8 + MOVOU B0, (16*0)(SP) + + CMPQ DI, $8 + JB loop_1b + SUBQ $8, DI + MOVQ (DX)(DI*1), R9 + MOVQ (SP)(DI*1), R10 + MOVQ R9, (SP)(DI*1) + MOVQ R10, (CX)(DI*1) + + TESTQ DI, DI + JE xtsSm4DecTailDec + +loop_1b: + SUBQ $1, DI + MOVB (DX)(DI*1), R9 + MOVB (SP)(DI*1), R10 + MOVB R9, (SP)(DI*1) + MOVB R10, (CX)(DI*1) + TESTQ DI, DI + JNE loop_1b + +xtsSm4DecTailDec: + MOVOU (16*0)(SP), B0 + PXOR TW, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + PXOR TW, B0 + MOVOU B0, (16*0)(R8) + JMP xtsSm4DecDone + +xtsSm4DecLastBlock: + MOVOU (16*0)(DX), B0 + PXOR TW, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + PXOR TW, B0 + MOVOU B0, (16*0)(CX) + mul2Inline + +xtsSm4DecDone: + MOVOU TW, (16*0)(BX) + RET + +avxXtsSm4Dec: + VMOVDQU gcmPoly<>(SB), POLY + VMOVDQU (0*16)(BX), TW + +avxXtsSm4DecOctets: + CMPQ DI, $128 + JB avxXtsSm4DecNibbles + SUBQ $128, DI + + // prepare tweaks + avxPrepare8Tweaks + + // load 8 blocks for decryption + avxLoad8Blocks + + AVX_SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7) + + avxStore8Blocks + + LEAQ 128(DX), DX + LEAQ 128(CX), CX + + JMP avxXtsSm4DecOctets + +avxXtsSm4DecNibbles: + CMPQ DI, $64 + JB avxXtsSm4DecSingles + SUBQ $64, DI + + // prepare tweaks + avxPrepare4Tweaks + // load 4 blocks for decryption + avxLoad4Blocks + + AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3) + + avxStore4Blocks + + LEAQ 64(DX), DX + LEAQ 64(CX), CX + +avxXtsSm4DecSingles: + CMPQ DI, $32 + JB avxXtsSm4DecTail + SUBQ $16, DI + + // load 1 block for decryption + VMOVDQU (16*0)(DX), B0 + + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(CX) + avxMul2Inline + + LEAQ 16(DX), DX + LEAQ 16(CX), CX + + JMP avxXtsSm4DecSingles + +avxXtsSm4DecTail: + TESTQ DI, DI + JE avxXtsSm4DecDone + + CMPQ DI, $16 + JE avxXtsSm4DecLastBlock + + // length > 16 + // load 1 block for decryption + VMOVDQU (16*0)(DX), B0 + VMOVDQU TW, B5 + avxMul2Inline + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(CX) + VMOVDQU B5, TW + + SUBQ $16, DI + LEAQ 16(DX), DX + LEAQ 16(CX), CX + LEAQ -16(CX), R8 + VMOVDQU B0, (16*0)(SP) + + CMPQ DI, $8 + JB avx_loop_1b + SUBQ $8, DI + MOVQ (DX)(DI*1), R9 + MOVQ (SP)(DI*1), R10 + MOVQ R9, (SP)(DI*1) + MOVQ R10, (CX)(DI*1) + + TESTQ DI, DI + JE avxXtsSm4DecTailDec + +avx_loop_1b: + SUBQ $1, DI + MOVB (DX)(DI*1), R9 + MOVB (SP)(DI*1), R10 + MOVB R9, (SP)(DI*1) + MOVB R10, (CX)(DI*1) + TESTQ DI, DI + JNE avx_loop_1b + +avxXtsSm4DecTailDec: + VMOVDQU (16*0)(SP), B0 + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(R8) + JMP avxXtsSm4DecDone + +avxXtsSm4DecLastBlock: + VMOVDQU (16*0)(DX), B0 + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(CX) + avxMul2Inline + +avxXtsSm4DecDone: + VMOVDQU TW, (16*0)(BX) + RET + +avx2XtsSm4Dec: + VMOVDQU gcmPoly<>(SB), POLY + VMOVDQU (0*16)(BX), TW + VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK + VBROADCASTI128 bswap_mask<>(SB), DWBSWAP + +avx2XtsSm4Dec16Blocks: + CMPQ DI, $256 + JB avx2XtsSm4DecOctets + SUBQ $256, DI + + // prepare tweaks + avxPrepare16Tweaks + // load 16 blocks for encryption + avx2Load16Blocks + // Apply Byte Flip Mask: LE -> BE + avx2LE2BE16Blocks + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(Y0, Y1, 
Y2, Y3, Y8, Y9) + TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9) + + AVX2_SM4_16BLOCKS(AX, Y8, Y9, X8, X9, Y11, Y12, Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7) + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9) + TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9) + avx2ByteSwap16Blocks + avx2Store16Blocks + + LEAQ 256(DX), DX + LEAQ 256(CX), CX + + JMP avx2XtsSm4Dec16Blocks + +avx2XtsSm4DecOctets: + CMPQ DI, $128 + JB avx2XtsSm4DecNibbles + SUBQ $128, DI + + // prepare tweaks + avxPrepare8Tweaks + // load 8 blocks for encryption + avx2Load8Blocks + // Apply Byte Flip Mask: LE -> BE + avx2LE2BE8Blocks + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9) + + AVX2_SM4_8BLOCKS(AX, Y8, Y9, X8, X9, Y7, Y0, Y1, Y2, Y3) + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9) + avx2ByteSwap8Blocks + avx2Store8Blocks + + LEAQ 128(DX), DX + LEAQ 128(CX), CX + +avx2XtsSm4DecNibbles: + CMPQ DI, $64 + JB avxXtsSm4DecSingles + SUBQ $64, DI + + // prepare tweaks + avxPrepare4Tweaks + // load 4 blocks for decryption + avxLoad4Blocks + + AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3) + + avxStore4Blocks + + LEAQ 64(DX), DX + LEAQ 64(CX), CX + +avx2XtsSm4DecSingles: + CMPQ DI, $32 + JB avx2XtsSm4DecTail + SUBQ $16, DI + + // load 1 block for decryption + VMOVDQU (16*0)(DX), B0 + + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(CX) + avxMul2Inline + + LEAQ 16(DX), DX + LEAQ 16(CX), CX + + JMP avx2XtsSm4DecSingles + +avx2XtsSm4DecTail: + TESTQ DI, DI + JE avx2XtsSm4DecDone + + CMPQ DI, $16 + JE avx2XtsSm4DecLastBlock + + // length > 16 + // load 1 block for decryption + VMOVDQU (16*0)(DX), B0 + VMOVDQU TW, B5 + avxMul2Inline + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(CX) + VMOVDQU B5, TW + + SUBQ $16, DI + LEAQ 16(DX), DX + LEAQ 16(CX), CX + LEAQ -16(CX), R8 + VMOVDQU B0, (16*0)(SP) + + CMPQ DI, $8 + JB avx2_loop_1b + SUBQ $8, DI + MOVQ (DX)(DI*1), R9 + MOVQ (SP)(DI*1), R10 + MOVQ R9, (SP)(DI*1) + MOVQ R10, (CX)(DI*1) + + TESTQ DI, DI + JE avx2XtsSm4DecTailDec + +avx2_loop_1b: + SUBQ $1, DI + MOVB (DX)(DI*1), R9 + MOVB (SP)(DI*1), R10 + MOVB R9, (SP)(DI*1) + MOVB R10, (CX)(DI*1) + TESTQ DI, DI + JNE avx2_loop_1b + +avx2XtsSm4DecTailDec: + VMOVDQU (16*0)(SP), B0 + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(R8) + JMP avx2XtsSm4DecDone + +avx2XtsSm4DecLastBlock: + VMOVDQU (16*0)(DX), B0 + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(CX) + avxMul2Inline + +avx2XtsSm4DecDone: + VMOVDQU TW, (16*0)(BX) + VZEROUPPER + RET + +// func decryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte) +TEXT ·decryptSm4XtsGB(SB),0,$256-64 + MOVQ xk+0(FP), AX + MOVQ tweak+8(FP), BX + MOVQ dst+16(FP), CX + MOVQ src+40(FP), DX + MOVQ src_len+48(FP), DI + + CMPB ·useAVX2(SB), $1 + JE avx2XtsSm4Dec + + CMPB ·useAVX(SB), $1 + JE avxXtsSm4Dec + + MOVOU gbGcmPoly<>(SB), POLY + MOVOU bswap_mask<>(SB), BSWAP + MOVOU (0*16)(BX), TW + +xtsSm4DecOctets: + CMPQ DI, $128 + JB xtsSm4DecNibbles + SUBQ $128, DI + + // prepare tweaks + prepareGB8Tweaks + // load 8 blocks for decryption + sseLoad8Blocks + + SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7) + + sseStore8Blocks + + LEAQ 128(DX), DX + LEAQ 128(CX), CX + + JMP xtsSm4DecOctets + +xtsSm4DecNibbles: + CMPQ DI, $64 + JB 
xtsSm4DecSingles + SUBQ $64, DI + + // prepare tweaks + prepareGB4Tweaks + // load 4 blocks for decryption + sseLoad4Blocks + + SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3) + + sseStore4Blocks + + LEAQ 64(DX), DX + LEAQ 64(CX), CX + +xtsSm4DecSingles: + CMPQ DI, $32 + JB xtsSm4DecTail + SUBQ $16, DI + + // load 1 block for decryption + MOVOU (16*0)(DX), B0 + + PXOR TW, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + PXOR TW, B0 + MOVOU B0, (16*0)(CX) + mul2GBInline + + LEAQ 16(DX), DX + LEAQ 16(CX), CX + + JMP xtsSm4DecSingles + +xtsSm4DecTail: + TESTQ DI, DI + JE xtsSm4DecDone + + CMPQ DI, $16 + JE xtsSm4DecLastBlock + + // length > 16 + // load 1 block for decryption + MOVOU (16*0)(DX), B0 + MOVOU TW, B5 + mul2GBInline + PXOR TW, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + PXOR TW, B0 + MOVOU B0, (16*0)(CX) + MOVOU B5, TW + + SUBQ $16, DI + LEAQ 16(DX), DX + LEAQ 16(CX), CX + LEAQ -16(CX), R8 + MOVOU B0, (16*0)(SP) + + CMPQ DI, $8 + JB loop_1b + SUBQ $8, DI + MOVQ (DX)(DI*1), R9 + MOVQ (SP)(DI*1), R10 + MOVQ R9, (SP)(DI*1) + MOVQ R10, (CX)(DI*1) + + TESTQ DI, DI + JE xtsSm4DecTailDec + +loop_1b: + SUBQ $1, DI + MOVB (DX)(DI*1), R9 + MOVB (SP)(DI*1), R10 + MOVB R9, (SP)(DI*1) + MOVB R10, (CX)(DI*1) + TESTQ DI, DI + JNE loop_1b + +xtsSm4DecTailDec: + MOVOU (16*0)(SP), B0 + PXOR TW, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + PXOR TW, B0 + MOVOU B0, (16*0)(R8) + JMP xtsSm4DecDone + +xtsSm4DecLastBlock: + MOVOU (16*0)(DX), B0 + PXOR TW, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + PXOR TW, B0 + MOVOU B0, (16*0)(CX) + mul2GBInline + +xtsSm4DecDone: + MOVOU TW, (16*0)(BX) + RET + +avxXtsSm4Dec: + VMOVDQU gbGcmPoly<>(SB), POLY + VMOVDQU bswap_mask<>(SB), BSWAP + VMOVDQU (0*16)(BX), TW + +avxXtsSm4DecOctets: + CMPQ DI, $128 + JB avxXtsSm4DecNibbles + SUBQ $128, DI + + // prepare tweaks + avxPrepareGB8Tweaks + // load 8 blocks for decryption + avxLoad8Blocks + + AVX_SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7) + + avxStore8Blocks + + LEAQ 128(DX), DX + LEAQ 128(CX), CX + + JMP avxXtsSm4DecOctets + +avxXtsSm4DecNibbles: + CMPQ DI, $64 + JB avxXtsSm4DecSingles + SUBQ $64, DI + + // prepare tweaks + avxPrepareGB4Tweaks + // load 4 blocks for decryption + avxLoad4Blocks + + AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3) + + avxStore4Blocks + + LEAQ 64(DX), DX + LEAQ 64(CX), CX + +avxXtsSm4DecSingles: + CMPQ DI, $32 + JB avxXtsSm4DecTail + SUBQ $16, DI + + // load 1 block for decryption + VMOVDQU (16*0)(DX), B0 + + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(CX) + avxMul2GBInline + + LEAQ 16(DX), DX + LEAQ 16(CX), CX + + JMP avxXtsSm4DecSingles + +avxXtsSm4DecTail: + TESTQ DI, DI + JE avxXtsSm4DecDone + + CMPQ DI, $16 + JE avxXtsSm4DecLastBlock + + // length > 16 + // load 1 block for decryption + VMOVDQU (16*0)(DX), B0 + VMOVDQU TW, B5 + avxMul2GBInline + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(CX) + VMOVDQU B5, TW + + SUBQ $16, DI + LEAQ 16(DX), DX + LEAQ 16(CX), CX + LEAQ -16(CX), R8 + VMOVDQU B0, (16*0)(SP) + + CMPQ DI, $8 + JB avx_loop_1b + SUBQ $8, DI + MOVQ (DX)(DI*1), R9 + MOVQ (SP)(DI*1), R10 + MOVQ R9, (SP)(DI*1) + MOVQ R10, (CX)(DI*1) + + TESTQ DI, DI + JE avxXtsSm4DecTailDec + +avx_loop_1b: + SUBQ $1, DI + MOVB (DX)(DI*1), R9 + MOVB (SP)(DI*1), R10 + MOVB R9, (SP)(DI*1) + MOVB R10, (CX)(DI*1) + TESTQ DI, DI + JNE avx_loop_1b + +avxXtsSm4DecTailDec: + VMOVDQU 
(16*0)(SP), B0 + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(R8) + JMP avxXtsSm4DecDone + +avxXtsSm4DecLastBlock: + VMOVDQU (16*0)(DX), B0 + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(CX) + avxMul2GBInline + +avxXtsSm4DecDone: + VMOVDQU TW, (16*0)(BX) + RET + +avx2XtsSm4Dec: + VMOVDQU gbGcmPoly<>(SB), POLY + VMOVDQU (0*16)(BX), TW + VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK + VBROADCASTI128 bswap_mask<>(SB), DWBSWAP + +avx2XtsSm4Dec16Blocks: + CMPQ DI, $256 + JB avx2XtsSm4DecOctets + SUBQ $256, DI + + // prepare tweaks + avxPrepareGB16Tweaks + // load 16 blocks for encryption + avx2Load16Blocks + // Apply Byte Flip Mask: LE -> BE + avx2LE2BE16Blocks + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9) + TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9) + + AVX2_SM4_16BLOCKS(AX, Y8, Y9, X8, X9, Y11, Y12, Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7) + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9) + TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9) + avx2ByteSwap16Blocks + avx2Store16Blocks + + LEAQ 256(DX), DX + LEAQ 256(CX), CX + + JMP avx2XtsSm4Dec16Blocks + +avx2XtsSm4DecOctets: + CMPQ DI, $128 + JB avx2XtsSm4DecNibbles + SUBQ $128, DI + + // prepare tweaks + avxPrepareGB8Tweaks + // load 8 blocks for encryption + avx2Load8Blocks + // Apply Byte Flip Mask: LE -> BE + avx2LE2BE8Blocks + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9) + + AVX2_SM4_8BLOCKS(AX, Y8, Y9, X8, X9, Y7, Y0, Y1, Y2, Y3) + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9) + avx2ByteSwap8Blocks + avx2Store8Blocks + + LEAQ 128(DX), DX + LEAQ 128(CX), CX + +avx2XtsSm4DecNibbles: + CMPQ DI, $64 + JB avxXtsSm4DecSingles + SUBQ $64, DI + + // prepare tweaks + avxPrepareGB4Tweaks + // load 4 blocks for decryption + avxLoad4Blocks + + AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3) + + avxStore4Blocks + + LEAQ 64(DX), DX + LEAQ 64(CX), CX + +avx2XtsSm4DecSingles: + CMPQ DI, $32 + JB avx2XtsSm4DecTail + SUBQ $16, DI + + // load 1 block for decryption + VMOVDQU (16*0)(DX), B0 + + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(CX) + avxMul2Inline + + LEAQ 16(DX), DX + LEAQ 16(CX), CX + + JMP avx2XtsSm4DecSingles + +avx2XtsSm4DecTail: + TESTQ DI, DI + JE avx2XtsSm4DecDone + + CMPQ DI, $16 + JE avx2XtsSm4DecLastBlock + + // length > 16 + // load 1 block for decryption + VMOVDQU (16*0)(DX), B0 + VMOVDQU TW, B5 + avxMul2GBInline + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(CX) + VMOVDQU B5, TW + + SUBQ $16, DI + LEAQ 16(DX), DX + LEAQ 16(CX), CX + LEAQ -16(CX), R8 + VMOVDQU B0, (16*0)(SP) + + CMPQ DI, $8 + JB avx2_loop_1b + SUBQ $8, DI + MOVQ (DX)(DI*1), R9 + MOVQ (SP)(DI*1), R10 + MOVQ R9, (SP)(DI*1) + MOVQ R10, (CX)(DI*1) + + TESTQ DI, DI + JE avx2XtsSm4DecTailDec + +avx2_loop_1b: + SUBQ $1, DI + MOVB (DX)(DI*1), R9 + MOVB (SP)(DI*1), R10 + MOVB R9, (SP)(DI*1) + MOVB R10, (CX)(DI*1) + TESTQ DI, DI + JNE avx2_loop_1b + +avx2XtsSm4DecTailDec: + VMOVDQU (16*0)(SP), B0 + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU B0, (16*0)(R8) + JMP avx2XtsSm4DecDone + +avx2XtsSm4DecLastBlock: + VMOVDQU (16*0)(DX), B0 + VPXOR TW, B0, B0 + SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3) + VPXOR TW, B0, B0 + VMOVDQU 
B0, (16*0)(CX) + avxMul2GBInline + +avx2XtsSm4DecDone: + VMOVDQU TW, (16*0)(BX) + VZEROUPPER + RET
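
Usage sketch: the snippet below shows how the GB/T 17964-2021 XTS mode wired up in this patch (cipher/xts.go, sm4/sm4_xts.go and sm4/xts_amd64.s) can be driven end to end. The constructor names NewGBXTSEncrypterWithSector / NewGBXTSDecrypterWithSector, the sm4.NewCipher argument, the key split into two 16-byte halves, and the CryptBlocks calls are taken directly from TestGBXTS above; the import paths and the main wrapper are assumptions inferred from the module path visible in sm4/sm4_xts.go, so treat this as an illustrative sketch rather than canonical API documentation.

	package main

	import (
		"encoding/hex"
		"fmt"

		"github.com/emmansun/gmsm/cipher" // assumed package path for the cipher/ directory in this repo
		"github.com/emmansun/gmsm/sm4"
	)

	func main() {
		// 32-byte key: the first half keys SM4 itself, the second half keys the tweak,
		// mirroring the key[:len(key)/2] / key[len(key)/2:] split used in TestGBXTS.
		key, _ := hex.DecodeString("1111111111111111111111111111111122222222222222222222222222222222")
		plaintext, _ := hex.DecodeString("4444444444444444444444444444444444444444444444444444444444444444")
		sector := uint64(0x3333333333)

		// The sector number is turned into the initial tweak by the constructor, as in the test.
		enc, err := cipher.NewGBXTSEncrypterWithSector(sm4.NewCipher, key[:16], key[16:], sector)
		if err != nil {
			panic(err)
		}
		ciphertext := make([]byte, len(plaintext))
		enc.CryptBlocks(ciphertext, plaintext)

		dec, err := cipher.NewGBXTSDecrypterWithSector(sm4.NewCipher, key[:16], key[16:], sector)
		if err != nil {
			panic(err)
		}
		recovered := make([]byte, len(ciphertext))
		dec.CryptBlocks(recovered, ciphertext)

		fmt.Printf("ciphertext: %x\nrecovered:  %x\n", ciphertext, recovered)
	}

On amd64 builds without the purego tag, the CryptBlocks calls above would dispatch through the xtsEncAble/xtsDecAble interfaces added in sm4/modes.go into the new assembly routines; on other platforms the generic cipher/xts.go path is used.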