From 4eacdccbf65cbff2a38c9279b07c3f97419f89ea Mon Sep 17 00:00:00 2001
From: Sun Yimin
Date: Fri, 18 Aug 2023 17:49:57 +0800
Subject: [PATCH] cipher: implement double tweak amd64 asm #149

---
 cipher/xts.go          |  19 ++---
 cipher/xts_amd64.s     | 157 +++++++++++++++++++++++++++++++++++++++++
 cipher/xts_asm.go      |  10 +++
 cipher/xts_asm_test.go |  79 +++++++++++++++++++++
 cipher/xts_generic.go  |  16 +++++
 5 files changed, 268 insertions(+), 13 deletions(-)
 create mode 100644 cipher/xts_amd64.s
 create mode 100644 cipher/xts_asm.go
 create mode 100644 cipher/xts_asm_test.go
 create mode 100644 cipher/xts_generic.go

diff --git a/cipher/xts.go b/cipher/xts.go
index 6a72650..dd21de5 100644
--- a/cipher/xts.go
+++ b/cipher/xts.go
@@ -19,8 +19,7 @@ type concurrentBlocks interface {
     DecryptBlocks(dst, src []byte)
 }
 
-// Cipher contains an expanded key structure. It is safe for concurrent use if
-// the underlying block cipher is safe for concurrent use.
+// Cipher contains an expanded key structure. It is unsafe for concurrent use.
 type xts struct {
     b     _cipher.Block
     tweak [blockSize]byte
@@ -198,12 +197,8 @@ func (c *xtsEncrypter) CryptBlocks(ciphertext, plaintext []byte) {
     if concCipher, ok := c.b.(concurrentBlocks); ok {
         batchSize := concCipher.Concurrency() * blockSize
         var tweaks []byte = make([]byte, batchSize)
-
         for len(plaintext) >= batchSize {
-            for i := 0; i < concCipher.Concurrency(); i++ {
-                copy(tweaks[blockSize*i:], c.tweak[:])
-                mul2(&c.tweak, c.isGB)
-            }
+            doubleTweaks(&c.tweak, tweaks, c.isGB)
             subtle.XORBytes(ciphertext, plaintext, tweaks)
             concCipher.EncryptBlocks(ciphertext, ciphertext)
             subtle.XORBytes(ciphertext, ciphertext, tweaks)
@@ -212,6 +207,7 @@ func (c *xtsEncrypter) CryptBlocks(ciphertext, plaintext []byte) {
             ciphertext = ciphertext[batchSize:]
         }
     }
+
     for len(plaintext) >= blockSize {
         subtle.XORBytes(ciphertext, plaintext, c.tweak[:])
         c.b.Encrypt(ciphertext, ciphertext)
@@ -262,10 +258,7 @@ func (c *xtsDecrypter) CryptBlocks(plaintext, ciphertext []byte) {
         var tweaks []byte = make([]byte, batchSize)
 
         for len(ciphertext) >= batchSize {
-            for i := 0; i < concCipher.Concurrency(); i++ {
-                copy(tweaks[blockSize*i:], c.tweak[:])
-                mul2(&c.tweak, c.isGB)
-            }
+            doubleTweaks(&c.tweak, tweaks, c.isGB)
             subtle.XORBytes(plaintext, ciphertext, tweaks)
             concCipher.DecryptBlocks(plaintext, plaintext)
             subtle.XORBytes(plaintext, plaintext, tweaks)
@@ -313,9 +306,9 @@ func (c *xtsDecrypter) CryptBlocks(plaintext, ciphertext []byte) {
     }
 }
 
-// mul2 multiplies tweak by 2 in GF(2¹²⁸) with an irreducible polynomial of
+// mul2Generic multiplies tweak by 2 in GF(2¹²⁸) with an irreducible polynomial of
 // x¹²⁸ + x⁷ + x² + x + 1.
-func mul2(tweak *[blockSize]byte, isGB bool) {
+func mul2Generic(tweak *[blockSize]byte, isGB bool) {
     var carryIn byte
     if !isGB {
         // tweak[0] represents the coefficients of {x^7, x^6, ..., x^0}
diff --git a/cipher/xts_amd64.s b/cipher/xts_amd64.s
new file mode 100644
index 0000000..8ad4a10
--- /dev/null
+++ b/cipher/xts_amd64.s
@@ -0,0 +1,157 @@
+//go:build amd64 && !purego
+// +build amd64,!purego
+
+#include "textflag.h"
+
+DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
+DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
+
+DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000087
+DATA gcmPoly<>+0x08(SB)/8, $0x0000000000000000
+
+DATA gbGcmPoly<>+0x00(SB)/8, $0x0000000000000000
+DATA gbGcmPoly<>+0x08(SB)/8, $0xe100000000000000
+
+DATA one<>+0x00(SB)/8, $0x0000000000000001
+DATA one<>+0x08(SB)/8, $0x0000000000000000
+
+GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
+GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
+GLOBL gbGcmPoly<>(SB), (NOPTR+RODATA), $16
+GLOBL one<>(SB), (NOPTR+RODATA), $16
+
+
+#define POLY X0
+#define BSWAP X1
+#define ONE X2
+#define B0 X3
+#define T0 X4
+#define T1 X5
+
+// func mul2(tweak *[blockSize]byte, isGB bool)
+TEXT ·mul2(SB),NOSPLIT,$0
+    MOVQ tweak+0(FP), DI
+    MOVB isGB+8(FP), AX
+
+    MOVOU (0*16)(DI), B0
+
+    CMPB AX, $1
+    JE gb_alg
+
+    MOVOU gcmPoly<>(SB), POLY
+
+    // B0 * 2
+    PSHUFD $0xff, B0, T0
+    MOVOU B0, T1
+    PSRAL $31, T0       // T0 for reduction
+    PAND POLY, T0
+    PSRLL $31, T1
+    PSLLDQ $4, T1
+    PSLLL $1, B0
+    PXOR T0, B0
+    PXOR T1, B0
+
+    MOVOU B0, (0*16)(DI)
+
+    RET
+
+gb_alg:
+    MOVOU bswapMask<>(SB), BSWAP
+    MOVOU gbGcmPoly<>(SB), POLY
+    MOVOU one<>(SB), ONE
+    PXOR X6, X6
+
+    PSHUFB BSWAP, B0
+
+    // B0 * 2
+    MOVOU B0, T0
+    MOVOU B0, T1
+    PSRLQ $1, B0
+    PSLLQ $63, T0
+    PSRLDQ $8, T0
+    POR T0, B0
+
+    // reduction
+    PAND ONE, T1
+    PSHUFD $0, T1, T1
+    PCMPEQL X6, T1
+    PANDN POLY, T1
+    PXOR T1, B0
+
+    PSHUFB BSWAP, B0
+    MOVOU B0, (0*16)(DI)
+    RET
+
+// func doubleTweaks(tweak *[blockSize]byte, tweaks []byte, isGB bool)
+TEXT ·doubleTweaks(SB),NOSPLIT,$0
+    MOVQ tweak+0(FP), DI
+    MOVQ tweaks+8(FP), AX
+    MOVQ tweaks_len+16(FP), BX
+    MOVB isGB+32(FP), CX
+
+    MOVOU (0*16)(DI), B0
+
+    SHRQ $4, BX
+    XORQ DX, DX
+
+    CMPB CX, $1
+    JE dt_gb_alg
+
+    MOVOU gcmPoly<>(SB), POLY
+
+loop:
+    MOVOU B0, (0*16)(AX)
+    LEAQ 16(AX), AX
+
+    // B0 * 2
+    PSHUFD $0xff, B0, T0
+    MOVOU B0, T1
+    PSRAL $31, T0       // T0 for reduction
+    PAND POLY, T0
+    PSRLL $31, T1
+    PSLLDQ $4, T1
+    PSLLL $1, B0
+    PXOR T0, B0
+    PXOR T1, B0
+
+    ADDQ $1, DX
+    CMPQ DX, BX
+    JB loop
+
+    MOVOU B0, (0*16)(DI)
+    RET
+
+dt_gb_alg:
+    MOVOU bswapMask<>(SB), BSWAP
+    MOVOU gbGcmPoly<>(SB), POLY
+    MOVOU one<>(SB), ONE
+    PXOR X6, X6
+
+gb_loop:
+    MOVOU B0, (0*16)(AX)
+    LEAQ 16(AX), AX
+
+    PSHUFB BSWAP, B0
+
+    // B0 * 2
+    MOVOU B0, T0
+    MOVOU B0, T1
+    PSRLQ $1, B0
+    PSLLQ $63, T0
+    PSRLDQ $8, T0
+    POR T0, B0
+
+    // reduction
+    PAND ONE, T1
+    PSHUFD $0, T1, T1
+    PCMPEQL X6, T1
+    PANDN POLY, T1
+    PXOR T1, B0
+
+    PSHUFB BSWAP, B0
+    ADDQ $1, DX
+    CMPQ DX, BX
+    JB gb_loop
+
+    MOVOU B0, (0*16)(DI)
+    RET
diff --git a/cipher/xts_asm.go b/cipher/xts_asm.go
new file mode 100644
index 0000000..2b824a5
--- /dev/null
+++ b/cipher/xts_asm.go
@@ -0,0 +1,10 @@
+//go:build amd64 && !purego
+// +build amd64,!purego
+
+package cipher
+
+//go:noescape
+func mul2(tweak *[blockSize]byte, isGB bool)
+
+//go:noescape
+func doubleTweaks(tweak *[blockSize]byte, tweaks []byte, isGB bool)
diff --git a/cipher/xts_asm_test.go b/cipher/xts_asm_test.go
new file mode 100644
index 0000000..d14be04
--- /dev/null
+++ b/cipher/xts_asm_test.go
@@ -0,0 +1,79 @@
+//go:build amd64 && !purego
+// +build amd64,!purego
+
+package cipher
+
+import (
+    "bytes"
+    "encoding/hex"
+    "testing"
+)
+
+var testTweakVector = []string{
+    "F0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF",
+    "66e94bd4ef8a2c3b884cfa59ca342b2e",
+    "3f803bcd0d7fd2b37558419f59d5cda6",
+    "6dcfba212f5d82bf525ee9793cfa505a",
+    "c172964cd58be2b8d8e09d9c5e9cfe36",
+    "1a267577a90caad6ae988e22714a2b8b",
+    "33fab707493702e77ff8d66ba9e6c6fe",
+    "23fb188b0f87f6ee2ec0803a99771341",
+    "e8de0a4188b7efbc1ac3979eb906cf36",
+}
+
+func testDoubleTweak(t *testing.T, isGB bool) {
+    for _, tk := range testTweakVector {
+        tweak, _ := hex.DecodeString(tk)
+
+        var t1, t2 [16]byte
+        copy(t1[:], tweak)
+        copy(t2[:], tweak)
+        mul2(&t1, isGB)
+        mul2Generic(&t2, isGB)
+
+        if !bytes.Equal(t1[:], t2[:]) {
+            t.Errorf("tweak %v, expected %x, got %x", tk, t2[:], t1[:])
+        }
+    }
+}
+
+func TestDoubleTweak(t *testing.T) {
+    testDoubleTweak(t, false)
+}
+
+func TestDoubleTweakGB(t *testing.T) {
+    testDoubleTweak(t, true)
+}
+
+func testDoubleTweaks(t *testing.T, isGB bool) {
+    for _, tk := range testTweakVector {
+        tweak, _ := hex.DecodeString(tk)
+
+        var t1, t2 [16]byte
+        var t11, t12 [128]byte
+        copy(t1[:], tweak)
+        copy(t2[:], tweak)
+
+        for i := 0; i < 8; i++ {
+            copy(t12[16*i:], t2[:])
+            mul2Generic(&t2, isGB)
+        }
+
+        doubleTweaks(&t1, t11[:], isGB)
+
+        if !bytes.Equal(t1[:], t2[:]) {
+            t.Errorf("1 tweak %v, expected %x, got %x", tk, t2[:], t1[:])
+        }
+        if !bytes.Equal(t11[:], t12[:]) {
+            t.Errorf("2 tweak %v, expected %x, got %x", tk, t12[:], t11[:])
+        }
+    }
+}
+
+func TestDoubleTweaks(t *testing.T) {
+    testDoubleTweaks(t, false)
+}
+
+func TestDoubleTweaksGB(t *testing.T) {
+    testDoubleTweaks(t, true)
+}
diff --git a/cipher/xts_generic.go b/cipher/xts_generic.go
new file mode 100644
index 0000000..01c812d
--- /dev/null
+++ b/cipher/xts_generic.go
@@ -0,0 +1,16 @@
+//go:build !amd64 || purego
+// +build !amd64 purego
+
+package cipher
+
+func mul2(tweak *[blockSize]byte, isGB bool) {
+    mul2Generic(tweak, isGB)
+}
+
+func doubleTweaks(tweak *[blockSize]byte, tweaks []byte, isGB bool) {
+    count := len(tweaks) >> 4
+    for i := 0; i < count; i++ {
+        copy(tweaks[blockSize*i:], tweak[:])
+        mul2(tweak, isGB)
+    }
+}
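
Note for reviewers (not part of the patch): the assembly fast path and the generic fallback
implement the same contract — doubleTweaks(&tweak, tweaks, isGB) fills tweaks with
len(tweaks)/16 consecutive tweak values and leaves the tweak for the next batch in *tweak,
exactly as calling mul2 in a loop would, which is what the new tests assert. The standalone
sketch below reproduces that behaviour for the standard (isGB == false) branch so it can be
checked without an amd64 machine. The names mul2Std and doubleTweakBatch are illustrative
and do not exist in the package, and the byte-reversed GB branch is omitted for brevity.

package main

import "fmt"

const blockSize = 16

// mul2Std doubles tweak in GF(2^128) for standard XTS: the block is read as a
// little-endian polynomial and reduced modulo x^128 + x^7 + x^2 + x + 1, so a
// carry out of the top bit wraps around as 0x87 XORed into byte 0.
// Illustrative only; mirrors mul2Generic with isGB == false.
func mul2Std(tweak *[blockSize]byte) {
    var carryIn byte
    for j := range tweak {
        carryOut := tweak[j] >> 7
        tweak[j] = tweak[j]<<1 | carryIn
        carryIn = carryOut
    }
    if carryIn != 0 {
        tweak[0] ^= 0x87
    }
}

// doubleTweakBatch fills tweaks with len(tweaks)/16 consecutive tweak values and
// advances *tweak past the batch — the contract the doubleTweaks assembly implements.
func doubleTweakBatch(tweak *[blockSize]byte, tweaks []byte) {
    for i := 0; i < len(tweaks)/blockSize; i++ {
        copy(tweaks[blockSize*i:], tweak[:])
        mul2Std(tweak)
    }
}

func main() {
    var tweak [blockSize]byte
    tweak[0] = 1 // start from the polynomial "1"
    batch := make([]byte, 4*blockSize)
    doubleTweakBatch(&tweak, batch)
    for i := 0; i < 4; i++ {
        fmt.Printf("tweak %d: %x\n", i, batch[blockSize*i:blockSize*(i+1)])
    }
    fmt.Printf("next:    %x\n", tweak[:])
}

The assembly pays off because the whole batch keeps the running tweak in an XMM register:
one SSE shift-and-reduce sequence per 16-byte tweak replaces the per-byte carry loop and
the per-block mul2 call of the previous Go code.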