sm4: improve throughput #146
This commit is contained in:
parent fe84641340
commit 4bc3c5d27b
@@ -25,6 +25,12 @@ func BenchmarkSM4EBCEncrypt1K(b *testing.B) {
 	benchmarkEBCEncrypt1K(b, c)
 }
 
+func BenchmarkAES128EBCEncrypt1K(b *testing.B) {
+	var key [16]byte
+	c, _ := aes.NewCipher(key[:])
+	benchmarkEBCEncrypt1K(b, c)
+}
+
 func benchmarkCBCEncrypt1K(b *testing.B, block cipher.Block) {
 	buf := make([]byte, 1024)
 	b.SetBytes(int64(len(buf)))
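The `benchmarkEBCEncrypt1K` helper the new AES-128 benchmark reuses is not shown in this hunk. As a hedged sketch only (the helper's real body lives elsewhere in the package; the alias `smcipher`, the `Sketch` names, and the body below are my assumptions — only the helper name, the `cipher.Block` argument, and the gmsm `NewECBEncrypter` constructor come from the diff):

```go
package cipher_test

import (
	"crypto/aes"
	"crypto/cipher"
	"testing"

	smcipher "github.com/emmansun/gmsm/cipher"
)

// benchmarkEBCEncrypt1KSketch is a hypothetical stand-in for the
// benchmarkEBCEncrypt1K helper referenced in the hunk above: it
// repeatedly ECB-encrypts a 1 KiB buffer with the given block cipher.
func benchmarkEBCEncrypt1KSketch(b *testing.B, block cipher.Block) {
	buf := make([]byte, 1024) // 64 blocks of 16 bytes
	b.SetBytes(int64(len(buf)))
	enc := smcipher.NewECBEncrypter(block)
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		enc.CryptBlocks(buf, buf)
	}
}

// Usage mirrors the added benchmark: any cipher.Block can be passed in.
func BenchmarkAES128EBCEncrypt1KSketch(b *testing.B) {
	var key [16]byte
	c, _ := aes.NewCipher(key[:])
	benchmarkEBCEncrypt1KSketch(b, c)
}
```

Pairing the SM4 benchmark with an AES-128 one over the same helper gives a like-for-like throughput baseline on the same machine.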
@@ -2,6 +2,8 @@ package cipher_test
 
 import (
 	"bytes"
+	"crypto/rand"
+	"io"
 	"testing"
 
 	"github.com/emmansun/gmsm/cipher"
@@ -63,6 +65,11 @@ var ecbSM4Tests = []struct {
 		[]byte("0123456789ABCDEF"),
 		[]byte("exampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintext"),
 	},
+	{
+		"18 same blocks",
+		[]byte("0123456789ABCDEF"),
+		[]byte("exampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintext"),
+	},
 }
 
 func TestECBBasic(t *testing.T) {
@@ -80,11 +87,30 @@ func TestECBBasic(t *testing.T) {
 		decrypter := cipher.NewECBDecrypter(c)
 		decrypter.CryptBlocks(plaintext, ciphertext)
 		if !bytes.Equal(test.in, plaintext) {
-			t.Errorf("%s: ECB encrypt/decrypt failed", test.name)
+			t.Errorf("%s: ECB encrypt/decrypt failed, %s", test.name, string(plaintext))
 		}
 	}
 }
 
+func TestECBRandom(t *testing.T) {
+	key := []byte("0123456789ABCDEF")
+	plaintext := make([]byte, 448)
+	ciphertext := make([]byte, 448)
+	io.ReadFull(rand.Reader, plaintext)
+	c, err := sm4.NewCipher(key)
+	if err != nil {
+		t.Fatal(err)
+	}
+	encrypter := cipher.NewECBEncrypter(c)
+	encrypter.CryptBlocks(ciphertext, plaintext)
+	result := make([]byte, 448)
+	decrypter := cipher.NewECBDecrypter(c)
+	decrypter.CryptBlocks(result, ciphertext)
+	if !bytes.Equal(result, plaintext) {
+		t.Error("ECB encrypt/decrypt failed")
+	}
+}
+
 func shouldPanic(t *testing.T, f func()) {
 	t.Helper()
 	defer func() { _ = recover() }()
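The new "18 same blocks" vector (288 bytes) and the 448-byte random round trip give the widened assembly batch paths something longer than four blocks to exercise. As a hedged illustration only (assuming the same imports as the test file above; the size list, the local `blockSize` constant, and the helper name are mine, not the commit's), a table-driven round trip across several block counts would look like:

```go
// TestECBRoundTripSizesSketch is a hypothetical extension of the coverage
// above: it round-trips buffers whose lengths land on the 4-, 8- and
// 16-block batch paths as well as sizes that leave a tail.
func TestECBRoundTripSizesSketch(t *testing.T) {
	const blockSize = 16 // SM4 block size in bytes
	key := []byte("0123456789ABCDEF")
	c, err := sm4.NewCipher(key)
	if err != nil {
		t.Fatal(err)
	}
	for _, blocks := range []int{1, 4, 8, 16, 18, 28} {
		n := blocks * blockSize
		plaintext := make([]byte, n)
		io.ReadFull(rand.Reader, plaintext)
		ciphertext := make([]byte, n)
		cipher.NewECBEncrypter(c).CryptBlocks(ciphertext, plaintext)
		result := make([]byte, n)
		cipher.NewECBDecrypter(c).CryptBlocks(result, ciphertext)
		if !bytes.Equal(result, plaintext) {
			t.Errorf("%d blocks: ECB encrypt/decrypt failed", blocks)
		}
	}
}
```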
@@ -15,7 +15,7 @@ GLOBL nibble_mask<>(SB), 8, $16
 
 // inverse shift rows
 DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
 DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
 GLOBL inverse_shift_rows<>(SB), 8, $16
 
 // Affine transform 1 (low and high hibbles)
@@ -24,7 +24,7 @@ DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
 GLOBL m1_low<>(SB), 8, $16
 
 DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
 DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
 GLOBL m1_high<>(SB), 8, $16
 
 // Affine transform 2 (low and high hibbles)
@@ -38,21 +38,46 @@ GLOBL m2_high<>(SB), 8, $16
 
 // left rotations of 32-bit words by 8-bit increments
 DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
 DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
 GLOBL r08_mask<>(SB), 8, $16
 
 DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
 DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
 GLOBL r16_mask<>(SB), 8, $16
 
 DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
 DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
 GLOBL r24_mask<>(SB), 8, $16
 
 DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
 DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
 GLOBL fk_mask<>(SB), 8, $16
 
+// inverse shift rows
+DATA inverse_shift_rows256<>+0x00(SB)/8, $0x0B0E0104070A0D00
+DATA inverse_shift_rows256<>+0x08(SB)/8, $0x0306090C0F020508
+DATA inverse_shift_rows256<>+0x10(SB)/8, $0x0B0E0104070A0D00
+DATA inverse_shift_rows256<>+0x18(SB)/8, $0x0306090C0F020508
+GLOBL inverse_shift_rows256<>(SB), 8, $32
+
+DATA r08_mask256<>+0x00(SB)/8, $0x0605040702010003
+DATA r08_mask256<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
+DATA r08_mask256<>+0x10(SB)/8, $0x0605040702010003
+DATA r08_mask256<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
+GLOBL r08_mask256<>(SB), 8, $32
+
+DATA r16_mask256<>+0x00(SB)/8, $0x0504070601000302
+DATA r16_mask256<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
+DATA r16_mask256<>+0x10(SB)/8, $0x0504070601000302
+DATA r16_mask256<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
+GLOBL r16_mask256<>(SB), 8, $32
+
+DATA r24_mask256<>+0x00(SB)/8, $0x0407060500030201
+DATA r24_mask256<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
+DATA r24_mask256<>+0x10(SB)/8, $0x0407060500030201
+DATA r24_mask256<>+0x18(SB)/8, $0x0C0F0E0D080B0A09
+GLOBL r24_mask256<>(SB), 8, $32
+
 // Transpose matrix without PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions, bad performance!
 // input: from high to low
 // r0 = [w3, w2, w1, w0]
@@ -189,7 +214,7 @@ GLOBL fk_mask<>(SB), 8, $16
 // - t2: 128 bits register for data
 // - t3: 128 bits register for data
 #define SM4_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
-    PINSRD $0, (index * 4)(RK)(IND*1), x; \
+    MOVL (index * 4)(RK)(IND*1), x; \
     PSHUFD $0, x, x; \
     PXOR t1, x; \
     PXOR t2, x; \
@@ -197,29 +222,128 @@ GLOBL fk_mask<>(SB), 8, $16
     SM4_TAO_L1(x, y, z); \
     PXOR x, t0
 
+#define SM4_ONE_ROUND_SSE(x, y, z, t0, t1, t2, t3) \
+    PXOR t1, x; \
+    PXOR t2, x; \
+    PXOR t3, x; \
+    SM4_TAO_L1(x, y, z); \
+    PXOR x, t0 \
+
+#define SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3) \
+    PSHUFD $0, rk128, x; \
+    SM4_ONE_ROUND_SSE(x, y, z, t0, t1, t2, t3); \
+    PSHUFD $0x55, rk128, x; \
+    SM4_ONE_ROUND_SSE(x, y, z, t1, t2, t3, t0); \
+    PSHUFD $0xAA, rk128, x; \
+    SM4_ONE_ROUND_SSE(x, y, z, t2, t3, t0, t1); \
+    PSHUFD $0xFF, rk128, x; \
+    SM4_ONE_ROUND_SSE(x, y, z, t3, t0, t1, t2); \
+
+#define SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \
+    PSHUFB flip_mask<>(SB), t0; \
+    PSHUFB flip_mask<>(SB), t1; \
+    PSHUFB flip_mask<>(SB), t2; \
+    PSHUFB flip_mask<>(SB), t3; \
+    SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
+    MOVOU (0*16)(RK), rk128; \
+    SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
+    MOVOU (1*16)(RK), rk128; \
+    SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
+    MOVOU (2*16)(RK), rk128; \
+    SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
+    MOVOU (3*16)(RK), rk128; \
+    SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
+    MOVOU (4*16)(RK), rk128; \
+    SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
+    MOVOU (5*16)(RK), rk128; \
+    SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
+    MOVOU (6*16)(RK), rk128; \
+    SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
+    MOVOU (7*16)(RK), rk128; \
+    SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
+    SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
+    PSHUFB bswap_mask<>(SB), t3; \
+    PSHUFB bswap_mask<>(SB), t2; \
+    PSHUFB bswap_mask<>(SB), t1; \
+    PSHUFB bswap_mask<>(SB), t0
+
+#define SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
+    PSHUFD $0, rk128, x; \
+    SM4_ONE_ROUND_SSE(x, y, z, t0, t1, t2, t3); \
+    PSHUFD $0, rk128, x; \
+    SM4_ONE_ROUND_SSE(x, y, z, t4, t5, t6, t7); \
+    PSHUFD $0x55, rk128, x; \
+    SM4_ONE_ROUND_SSE(x, y, z, t1, t2, t3, t0); \
+    PSHUFD $0x55, rk128, x; \
+    SM4_ONE_ROUND_SSE(x, y, z, t5, t6, t7, t4); \
+    PSHUFD $0xAA, rk128, x; \
+    SM4_ONE_ROUND_SSE(x, y, z, t2, t3, t0, t1); \
+    PSHUFD $0xAA, rk128, x; \
+    SM4_ONE_ROUND_SSE(x, y, z, t6, t7, t4, t5); \
+    PSHUFD $0xFF, rk128, x; \
+    SM4_ONE_ROUND_SSE(x, y, z, t3, t0, t1, t2); \
+    PSHUFD $0xFF, rk128, x; \
+    SM4_ONE_ROUND_SSE(x, y, z, t7, t4, t5, t6); \
+
+#define SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
+    PSHUFB flip_mask<>(SB), t0; \
+    PSHUFB flip_mask<>(SB), t1; \
+    PSHUFB flip_mask<>(SB), t2; \
+    PSHUFB flip_mask<>(SB), t3; \
+    PSHUFB flip_mask<>(SB), t4; \
+    PSHUFB flip_mask<>(SB), t5; \
+    PSHUFB flip_mask<>(SB), t6; \
+    PSHUFB flip_mask<>(SB), t7; \
+    SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
+    SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \
+    MOVOU (0*16)(RK), rk128; \
+    SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+    MOVOU (1*16)(RK), rk128; \
+    SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+    MOVOU (2*16)(RK), rk128; \
+    SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+    MOVOU (3*16)(RK), rk128; \
+    SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+    MOVOU (4*16)(RK), rk128; \
+    SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+    MOVOU (5*16)(RK), rk128; \
+    SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+    MOVOU (6*16)(RK), rk128; \
+    SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+    MOVOU (7*16)(RK), rk128; \
+    SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+    SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
+    SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \
+    PSHUFB bswap_mask<>(SB), t3; \
+    PSHUFB bswap_mask<>(SB), t2; \
+    PSHUFB bswap_mask<>(SB), t1; \
+    PSHUFB bswap_mask<>(SB), t0; \
+    PSHUFB bswap_mask<>(SB), t7; \
+    PSHUFB bswap_mask<>(SB), t6; \
+    PSHUFB bswap_mask<>(SB), t5; \
+    PSHUFB bswap_mask<>(SB), t4
+
 // SM4 sbox function, AVX version
 // parameters:
 // - x: 128 bits register as sbox input/output data
 // - y: 128 bits temp register
-// - X_NIBBLE_MASK: 128 bits register stored nibble mask, should be loaded earlier.
 // - tmp: 128 bits temp register
-#define AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp) \
-    VPAND X_NIBBLE_MASK, x, tmp; \
+#define AVX_SM4_SBOX(x, y, tmp) \
+    VPAND nibble_mask<>(SB), x, tmp; \
     VMOVDQU m1_low<>(SB), y; \
     VPSHUFB tmp, y, y; \
     VPSRLQ $4, x, x; \
-    VPAND X_NIBBLE_MASK, x, x; \
+    VPAND nibble_mask<>(SB), x, x; \
     VMOVDQU m1_high<>(SB), tmp; \
     VPSHUFB x, tmp, x; \
     VPXOR y, x, x; \
-    VMOVDQU inverse_shift_rows<>(SB), tmp; \
-    VPSHUFB tmp, x, x; \
-    VAESENCLAST X_NIBBLE_MASK, x, x; \
-    VPANDN X_NIBBLE_MASK, x, tmp; \
+    VPSHUFB inverse_shift_rows<>(SB), x, x; \
+    VAESENCLAST nibble_mask<>(SB), x, x; \
+    VPANDN nibble_mask<>(SB), x, tmp; \
     VMOVDQU m2_low<>(SB), y; \
     VPSHUFB tmp, y, y; \
     VPSRLQ $4, x, x; \
-    VPAND X_NIBBLE_MASK, x, x; \
+    VPAND nibble_mask<>(SB), x, x; \
     VMOVDQU m2_high<>(SB), tmp; \
     VPSHUFB x, tmp, x; \
     VPXOR y, x, x
@@ -228,21 +352,17 @@ GLOBL fk_mask<>(SB), 8, $16
 // parameters:
 // - x: 128 bits register as sbox input/output data
 // - y: 128 bits temp register
-// - xNibbleMask: 128 bits register stored nibble mask, should be loaded earlier.
 // - tmp: 128 bits temp register
-#define AVX_SM4_TAO_L1(x, y, xNibbleMask, tmp) \
-    AVX_SM4_SBOX(x, y, xNibbleMask, tmp); \
-    VMOVDQU r08_mask<>(SB), tmp; \
-    VPSHUFB tmp, x, y; \
+#define AVX_SM4_TAO_L1(x, y, tmp) \
+    AVX_SM4_SBOX(x, y, tmp); \
+    VPSHUFB r08_mask<>(SB), x, y; \
     VPXOR x, y, y; \
-    VMOVDQU r16_mask<>(SB), tmp; \
-    VPSHUFB tmp, x, tmp; \
+    VPSHUFB r16_mask<>(SB), x, tmp; \
     VPXOR tmp, y, y; \
     VPSLLD $2, y, tmp; \
    VPSRLD $30, y, y; \
     VPXOR tmp, y, y; \
-    VMOVDQU r24_mask<>(SB), tmp; \
-    VPSHUFB tmp, x, tmp; \
+    VPSHUFB r24_mask<>(SB), x, tmp; \
     VPXOR y, x, x; \
     VPXOR x, tmp, x
 
@@ -280,9 +400,115 @@ GLOBL fk_mask<>(SB), 8, $16
     VPXOR t1, x, x; \
     VPXOR t2, x, x; \
     VPXOR t3, x, x; \
-    AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp); \
+    AVX_SM4_TAO_L1(x, y, tmp); \
     VPXOR x, t0, t0
 
+#define SM4_ONE_ROUND_AVX(x, y, z, t0, t1, t2, t3) \
+    VPXOR t1, x, x; \
+    VPXOR t2, x, x; \
+    VPXOR t3, x, x; \
+    AVX_SM4_TAO_L1(x, y, z); \
+    VPXOR x, t0, t0 \
+
+#define SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3) \
+    VPSHUFD $0, rk128, x; \
+    SM4_ONE_ROUND_AVX(x, y, z, t0, t1, t2, t3); \
+    VPSHUFD $0x55, rk128, x; \
+    SM4_ONE_ROUND_AVX(x, y, z, t1, t2, t3, t0); \
+    VPSHUFD $0xAA, rk128, x; \
+    SM4_ONE_ROUND_AVX(x, y, z, t2, t3, t0, t1); \
+    VPSHUFD $0xFF, rk128, x; \
+    SM4_ONE_ROUND_AVX(x, y, z, t3, t0, t1, t2); \
+
+#define AVX_SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \
+    VPSHUFB flip_mask<>(SB), t0, t0 \
+    VPSHUFB flip_mask<>(SB), t1, t1 \
+    VPSHUFB flip_mask<>(SB), t2, t2 \
+    VPSHUFB flip_mask<>(SB), t3, t3 \
+    ; \
+    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
+    VMOVDQU (0*16)(RK), rk128; \
+    SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
+    VMOVDQU (1*16)(RK), rk128; \
+    SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
+    VMOVDQU (2*16)(RK), rk128; \
+    SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
+    VMOVDQU (3*16)(RK), rk128; \
+    SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
+    VMOVDQU (4*16)(RK), rk128; \
+    SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
+    VMOVDQU (5*16)(RK), rk128; \
+    SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
+    VMOVDQU (6*16)(RK), rk128; \
+    SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
+    VMOVDQU (7*16)(RK), rk128; \
+    SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
+    ; \ // Transpose matrix 4 x 4 32bits word
+    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
+    VPSHUFB bswap_mask<>(SB), t0, t0 \
+    VPSHUFB bswap_mask<>(SB), t1, t1 \
+    VPSHUFB bswap_mask<>(SB), t2, t2 \
+    VPSHUFB bswap_mask<>(SB), t3, t3 \
+
+#define SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
+    VPSHUFD $0, rk128, x; \
+    SM4_ONE_ROUND_AVX(x, y, z, t0, t1, t2, t3); \
+    VPSHUFD $0, rk128, x; \
+    SM4_ONE_ROUND_AVX(x, y, z, t4, t5, t6, t7); \
+    VPSHUFD $0x55, rk128, x; \
+    SM4_ONE_ROUND_AVX(x, y, z, t1, t2, t3, t0); \
+    VPSHUFD $0x55, rk128, x; \
+    SM4_ONE_ROUND_AVX(x, y, z, t5, t6, t7, t4); \
+    VPSHUFD $0xAA, rk128, x; \
+    SM4_ONE_ROUND_AVX(x, y, z, t2, t3, t0, t1); \
+    VPSHUFD $0xAA, rk128, x; \
+    SM4_ONE_ROUND_AVX(x, y, z, t6, t7, t4, t5); \
+    VPSHUFD $0xFF, rk128, x; \
+    SM4_ONE_ROUND_AVX(x, y, z, t3, t0, t1, t2); \
+    VPSHUFD $0xFF, rk128, x; \
+    SM4_ONE_ROUND_AVX(x, y, z, t7, t4, t5, t6); \
+
+#define AVX_SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
+    VPSHUFB flip_mask<>(SB), t0, t0 \
+    VPSHUFB flip_mask<>(SB), t1, t1 \
+    VPSHUFB flip_mask<>(SB), t2, t2 \
+    VPSHUFB flip_mask<>(SB), t3, t3 \
+    VPSHUFB flip_mask<>(SB), t4, t4 \
+    VPSHUFB flip_mask<>(SB), t5, t5 \
+    VPSHUFB flip_mask<>(SB), t6, t6 \
+    VPSHUFB flip_mask<>(SB), t7, t7 \
+    ; \
+    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
+    TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \
+    VMOVDQU (0*16)(RK), rk128; \
+    SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+    VMOVDQU (1*16)(RK), rk128; \
+    SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+    VMOVDQU (2*16)(RK), rk128; \
+    SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+    VMOVDQU (3*16)(RK), rk128; \
+    SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+    VMOVDQU (4*16)(RK), rk128; \
+    SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+    VMOVDQU (5*16)(RK), rk128; \
+    SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+    VMOVDQU (6*16)(RK), rk128; \
+    SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+    VMOVDQU (7*16)(RK), rk128; \
+    SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+    ; \ // Transpose matrix 4 x 4 32bits word
+    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
+    TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \
+    VPSHUFB bswap_mask<>(SB), t0, t0 \
+    VPSHUFB bswap_mask<>(SB), t1, t1 \
+    VPSHUFB bswap_mask<>(SB), t2, t2 \
+    VPSHUFB bswap_mask<>(SB), t3, t3 \
+    VPSHUFB bswap_mask<>(SB), t4, t4 \
+    VPSHUFB bswap_mask<>(SB), t5, t5 \
+    VPSHUFB bswap_mask<>(SB), t6, t6 \
+    VPSHUFB bswap_mask<>(SB), t7, t7 \
+
 // SM4 sbox function, AVX2 version
 // parameters:
 // - x: 256 bits register as sbox input/output data
@@ -301,8 +527,7 @@ GLOBL fk_mask<>(SB), 8, $16
     VBROADCASTI128 m1_high<>(SB), z; \
     VPSHUFB x, z, x; \
     VPXOR y, x, x; \
-    VBROADCASTI128 inverse_shift_rows<>(SB), z; \
-    VPSHUFB z, x, x; \
+    VPSHUFB inverse_shift_rows256<>(SB), x, x; \
     VEXTRACTI128 $1, x, yw \
     VAESENCLAST xNibbleMask, xw, xw; \
     VAESENCLAST xNibbleMask, yw, yw; \
@@ -327,17 +552,14 @@ GLOBL fk_mask<>(SB), 8, $16
 // - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier.
 #define AVX2_SM4_TAO_L1(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
     AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask); \
-    VBROADCASTI128 r08_mask<>(SB), z; \
-    VPSHUFB z, x, y; \
+    VPSHUFB r08_mask256<>(SB), x, y; \
     VPXOR x, y, y; \
-    VBROADCASTI128 r16_mask<>(SB), z; \
-    VPSHUFB z, x, z; \
+    VPSHUFB r16_mask256<>(SB), x, z; \
     VPXOR z, y, y; \
     VPSLLD $2, y, z; \
     VPSRLD $30, y, y; \
     VPXOR z, y, y; \
-    VBROADCASTI128 r24_mask<>(SB), z; \
-    VPSHUFB z, x, z; \
+    VPSHUFB r24_mask256<>(SB), x, z; \
     VPXOR y, x, x; \
     VPXOR x, z, x
 
@@ -359,6 +581,24 @@ GLOBL fk_mask<>(SB), 8, $16
     AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
     VPXOR x, t0, t0
 
+// SM4 round function, AVX2 version, handle 256 bits
+// t0 ^= tao_l1(t1^t2^t3^xk)
+// parameters:
+// - index: round key index immediate number
+// - x: 256 bits temp register, MUST use XDWORD!
+// - y: 256 bits temp register, MUST use YDWORD!
+// - t0: 256 bits register for data as result
+// - t1: 256 bits register for data
+// - t2: 256 bits register for data
+// - t3: 256 bits register for data
+#define AVX2_SM4_ROUND2(index, RK, x, y, xw, yw, tmp, t0, t1, t2, t3) \
+    VPBROADCASTD (index * 4)(RK), x; \
+    VPXOR t1, x, x; \
+    VPXOR t2, x, x; \
+    VPXOR t3, x, x; \
+    AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
+    VPXOR x, t0, t0
+
 // SM4 round function, AVX version, handle 128 bits
 // t0 ^= tao_l1(t1^t2^t3^xk)
 // parameters:
@@ -371,9 +611,100 @@ GLOBL fk_mask<>(SB), 8, $16
 // - t3: 128 bits register for data
 #define AVX2_SM4_ROUND_4BLOCKS(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \
     VPBROADCASTD (index * 4)(RK)(IND*1), x; \
-    VPSHUFD $0, x, x; \
     VPXOR t1, x, x; \
     VPXOR t2, x, x; \
     VPXOR t3, x, x; \
-    AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp); \
+    AVX_SM4_TAO_L1(x, y, tmp); \
     VPXOR x, t0, t0
+
+#define AVX2_SM4_8BLOCKS(RK, x, y, xw, yw, tmp, t0, t1, t2, t3) \
+    AVX2_SM4_ROUND2(0, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
+    AVX2_SM4_ROUND2(1, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
+    AVX2_SM4_ROUND2(2, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
+    AVX2_SM4_ROUND2(3, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
+    AVX2_SM4_ROUND2(4, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
+    AVX2_SM4_ROUND2(5, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
+    AVX2_SM4_ROUND2(6, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
+    AVX2_SM4_ROUND2(7, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
+    AVX2_SM4_ROUND2(8, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
+    AVX2_SM4_ROUND2(9, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
+    AVX2_SM4_ROUND2(10, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
+    AVX2_SM4_ROUND2(11, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
+    AVX2_SM4_ROUND2(12, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
+    AVX2_SM4_ROUND2(13, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
+    AVX2_SM4_ROUND2(14, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
+    AVX2_SM4_ROUND2(15, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
+    AVX2_SM4_ROUND2(16, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
+    AVX2_SM4_ROUND2(17, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
+    AVX2_SM4_ROUND2(18, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
+    AVX2_SM4_ROUND2(19, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
+    AVX2_SM4_ROUND2(20, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
+    AVX2_SM4_ROUND2(21, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
+    AVX2_SM4_ROUND2(22, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
+    AVX2_SM4_ROUND2(23, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
+    AVX2_SM4_ROUND2(24, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
+    AVX2_SM4_ROUND2(25, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
+    AVX2_SM4_ROUND2(26, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
+    AVX2_SM4_ROUND2(27, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
+    AVX2_SM4_ROUND2(28, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
+    AVX2_SM4_ROUND2(29, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
+    AVX2_SM4_ROUND2(30, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
+    AVX2_SM4_ROUND2(31, RK, x, y, xw, yw, tmp, t3, t0, t1, t2)
+
+// SM4 round function, AVX2 version, handle 256 bits
+// t0 ^= tao_l1(t1^t2^t3^xk)
+// parameters:
+// - index: round key index immediate number
+// - x: 256 bits temp register, MUST use XDWORD!
+// - y: 256 bits temp register, MUST use YDWORD!
+// - t0: 256 bits register for data as result
+// - t1: 256 bits register for data
+// - t2: 256 bits register for data
+// - t3: 256 bits register for data
+#define AVX2_SM4_16BLOCKS_ROUND(index, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7) \
+    VPBROADCASTD (index * 4)(RK), tmp1; \
+    VPXOR t1, tmp1, x; \
+    VPXOR t2, x, x; \
+    VPXOR t3, x, x; \
+    AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
+    VPXOR x, t0, t0; \
+    ;\
+    VPXOR t5, tmp1, x; \
+    VPXOR t6, x, x; \
+    VPXOR t7, x, x; \
+    AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
+    VPXOR x, t4, t4; \
+
+#define AVX2_SM4_16BLOCKS(RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7) \
+    AVX2_SM4_16BLOCKS_ROUND(0, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
+    AVX2_SM4_16BLOCKS_ROUND(1, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
+    AVX2_SM4_16BLOCKS_ROUND(2, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
+    AVX2_SM4_16BLOCKS_ROUND(3, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
+    AVX2_SM4_16BLOCKS_ROUND(4, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
+    AVX2_SM4_16BLOCKS_ROUND(5, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
+    AVX2_SM4_16BLOCKS_ROUND(6, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
+    AVX2_SM4_16BLOCKS_ROUND(7, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
+    AVX2_SM4_16BLOCKS_ROUND(8, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
+    AVX2_SM4_16BLOCKS_ROUND(9, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
+    AVX2_SM4_16BLOCKS_ROUND(10, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
+    AVX2_SM4_16BLOCKS_ROUND(11, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
+    AVX2_SM4_16BLOCKS_ROUND(12, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
+    AVX2_SM4_16BLOCKS_ROUND(13, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
+    AVX2_SM4_16BLOCKS_ROUND(14, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
+    AVX2_SM4_16BLOCKS_ROUND(15, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
+    AVX2_SM4_16BLOCKS_ROUND(16, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
+    AVX2_SM4_16BLOCKS_ROUND(17, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
+    AVX2_SM4_16BLOCKS_ROUND(18, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
+    AVX2_SM4_16BLOCKS_ROUND(19, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
+    AVX2_SM4_16BLOCKS_ROUND(20, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
+    AVX2_SM4_16BLOCKS_ROUND(21, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
+    AVX2_SM4_16BLOCKS_ROUND(22, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
+    AVX2_SM4_16BLOCKS_ROUND(23, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
+    AVX2_SM4_16BLOCKS_ROUND(24, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
+    AVX2_SM4_16BLOCKS_ROUND(25, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
+    AVX2_SM4_16BLOCKS_ROUND(26, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
+    AVX2_SM4_16BLOCKS_ROUND(27, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
+    AVX2_SM4_16BLOCKS_ROUND(28, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
+    AVX2_SM4_16BLOCKS_ROUND(29, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
+    AVX2_SM4_16BLOCKS_ROUND(30, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
+    AVX2_SM4_16BLOCKS_ROUND(31, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6)
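All of these macros batch the same scalar round shown in their comments, t0 ^= tao_l1(t1^t2^t3^xk): an S-box layer (built here from AESENCLAST plus the m1/m2 affine-transform constants) followed by SM4's linear transform L. As a reference point only — a plain-Go sketch, not code from this commit, with the S-box layer deliberately left as a parameter — the transform that the r08/r16/r24 shuffle masks and the VPSLLD/VPSRLD pair realise per 32-bit lane is:

```go
package sm4ref // hypothetical reference package, not part of the commit

import "math/bits"

// lTransform is SM4's linear transform applied after the S-box layer:
// L(b) = b ^ (b<<<2) ^ (b<<<10) ^ (b<<<18) ^ (b<<<24).
// In the macros above, the byte-granular rotations (<<<8, <<<16, <<<24) are
// done with the r08/r16/r24 PSHUFB masks, and the remaining <<<2 with a
// shift-left-2 / shift-right-30 pair, so each SIMD lane carries the same
// word of an independent block.
func lTransform(b uint32) uint32 {
	return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
		bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
}

// round is the scalar shape of the round the SIMD code parallelises:
// t0 ^= L(tau(t1 ^ t2 ^ t3 ^ rk)), where tau (the byte-wise S-box layer)
// is passed in rather than spelled out here.
func round(tau func(uint32) uint32, rk, t0, t1, t2, t3 uint32) uint32 {
	return t0 ^ lTransform(tau(t1^t2^t3^rk))
}
```

The 4-, 8- and 16-block variants simply apply this round to 4, 8 or 16 transposed blocks per loaded round key, which is where the throughput gain comes from.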
@@ -171,3 +171,30 @@ GLOBL fk_mask<>(SB), (16+8), $16
     VEOR t3.B16, x.B16, x.B16; \
     SM4_TAO_L1(x, y, z); \
     VEOR x.B16, t0.B16, t0.B16
+
+// SM4 round function
+// t0 ^= tao_l1(t1^t2^t3^xk)
+// parameters:
+// - RK: round key register
+// - tmp32: temp 32/64 bits register
+// - x: 128 bits temp register
+// - y: 128 bits temp register
+// - z: 128 bits temp register
+// - t0: 128 bits register for data as result
+// - t1: 128 bits register for data
+// - t2: 128 bits register for data
+// - t3: 128 bits register for data
+#define SM4_8BLOCKS_ROUND(RK, tmp32, x, y, z, tmp, t0, t1, t2, t3, t4, t5, t6, t7) \
+    MOVW.P 4(RK), tmp32; \
+    VMOV tmp32, tmp.S4; \
+    VEOR t1.B16, tmp.B16, x.B16; \
+    VEOR t2.B16, x.B16, x.B16; \
+    VEOR t3.B16, x.B16, x.B16; \
+    SM4_TAO_L1(x, y, z); \
+    VEOR x.B16, t0.B16, t0.B16; \
+    ; \
+    VEOR t1.B16, tmp.B16, x.B16; \
+    VEOR t2.B16, x.B16, x.B16; \
+    VEOR t3.B16, x.B16, x.B16; \
+    SM4_TAO_L1(x, y, z); \
+    VEOR x.B16, t0.B16, t0.B16
245 sm4/asm_amd64.s
@@ -4,15 +4,15 @@
 
 #include "textflag.h"
 
-#define x X0
-#define y X1
-#define t0 X2
-#define t1 X3
-#define t2 X4
-#define t3 X5
+#define t0 X0
+#define t1 X1
+#define t2 X2
+#define t3 X3
 
-#define XTMP6 X6
-#define XTMP7 X7
+#define x X8
+#define y X9
+#define XTMP6 X10
+#define XTMP7 X11
 
 #include "aesni_macros_amd64.s"
 
@@ -48,7 +48,7 @@
 // - t2: 128 bits register for data
 // - t3: 128 bits register for data
 #define SM4_EXPANDKEY_ROUND(index, x, y, t0, t1, t2, t3) \
-    PINSRD $0, (index * 4)(BX)(CX*1), x; \
+    MOVL (index * 4)(BX)(CX*1), x; \
     PXOR t1, x; \
     PXOR t2, x; \
     PXOR t3, x; \
@@ -68,6 +68,16 @@
 #define XWORD2 X6
 #define XWORD3 X7
 
+#define XDWORD4 Y10
+#define XDWORD5 Y11
+#define XDWORD6 Y12
+#define XDWORD7 Y14
+
+#define XWORD4 X10
+#define XWORD5 X11
+#define XWORD6 X12
+#define XWORD7 X14
+
 #define XDWTMP0 Y0
 #define XDWTMP1 Y1
 #define XDWTMP2 Y2
@@ -133,91 +143,93 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
     JE avx
 
 non_avx2_start:
-    MOVOU 0(DX), t0
-    MOVOU 16(DX), t1
-    MOVOU 32(DX), t2
-    MOVOU 48(DX), t3
-    PSHUFB flip_mask<>(SB), t0
-    PSHUFB flip_mask<>(SB), t1
-    PSHUFB flip_mask<>(SB), t2
-    PSHUFB flip_mask<>(SB), t3
-    SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y)
+    CMPQ DI, $128
+    JEQ sse_8blocks
 
-    XORL CX, CX
+    MOVOU 0(DX), XWORD0
+    MOVOU 16(DX), XWORD1
+    MOVOU 32(DX), XWORD2
+    MOVOU 48(DX), XWORD3
 
-loop:
-    SM4_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3)
-    SM4_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0)
-    SM4_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1)
-    SM4_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2)
-
-    ADDL $16, CX
-    CMPL CX, $4*32
-    JB loop
-
-    SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y);
-    PSHUFB bswap_mask<>(SB), t3
-    PSHUFB bswap_mask<>(SB), t2
-    PSHUFB bswap_mask<>(SB), t1
-    PSHUFB bswap_mask<>(SB), t0
+    SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
 
-    MOVOU t0, 0(BX)
-    MOVOU t1, 16(BX)
-    MOVOU t2, 32(BX)
-    MOVOU t3, 48(BX)
+    MOVOU XWORD0, 0(BX)
+    MOVOU XWORD1, 16(BX)
+    MOVOU XWORD2, 32(BX)
+    MOVOU XWORD3, 48(BX)
 
+    RET
+
+sse_8blocks:
+    MOVOU 0(DX), XWORD0
+    MOVOU 16(DX), XWORD1
+    MOVOU 32(DX), XWORD2
+    MOVOU 48(DX), XWORD3
+    MOVOU 64(DX), XWORD4
+    MOVOU 80(DX), XWORD5
+    MOVOU 96(DX), XWORD6
+    MOVOU 112(DX), XWORD7
+
+    SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
+
+    MOVOU XWORD0, 0(BX)
+    MOVOU XWORD1, 16(BX)
+    MOVOU XWORD2, 32(BX)
+    MOVOU XWORD3, 48(BX)
+    MOVOU XWORD4, 64(BX)
+    MOVOU XWORD5, 80(BX)
+    MOVOU XWORD6, 96(BX)
+    MOVOU XWORD7, 112(BX)
+
 done_sm4:
     RET
 
 avx:
+    CMPQ DI, $128
+    JEQ avx_8blocks
+
     VMOVDQU 0(DX), XWORD0
     VMOVDQU 16(DX), XWORD1
     VMOVDQU 32(DX), XWORD2
     VMOVDQU 48(DX), XWORD3
 
-    VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
-    VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK
-
-    VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
-    VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
-    VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
-    VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
-
-    // Transpose matrix 4 x 4 32bits word
-    TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
-
-    XORL CX, CX
-
-avx_loop:
-    AVX_SM4_ROUND(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3)
-    AVX_SM4_ROUND(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0)
-    AVX_SM4_ROUND(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1)
-    AVX_SM4_ROUND(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2)
-
-    ADDL $16, CX
-    CMPL CX, $4*32
-    JB avx_loop
-
-    // Transpose matrix 4 x 4 32bits word
-    TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
-
-    VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK
-    VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
-    VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
-    VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
-    VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
+    AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
 
     VMOVDQU XWORD0, 0(BX)
     VMOVDQU XWORD1, 16(BX)
     VMOVDQU XWORD2, 32(BX)
     VMOVDQU XWORD3, 48(BX)
 
+    RET
+
+avx_8blocks:
+    VMOVDQU 0(DX), XWORD0
+    VMOVDQU 16(DX), XWORD1
+    VMOVDQU 32(DX), XWORD2
+    VMOVDQU 48(DX), XWORD3
+    VMOVDQU 64(DX), XWORD4
+    VMOVDQU 80(DX), XWORD5
+    VMOVDQU 96(DX), XWORD6
+    VMOVDQU 112(DX), XWORD7
+
+    AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
+
+    VMOVDQU XWORD0, 0(BX)
+    VMOVDQU XWORD1, 16(BX)
+    VMOVDQU XWORD2, 32(BX)
+    VMOVDQU XWORD3, 48(BX)
+    VMOVDQU XWORD4, 64(BX)
+    VMOVDQU XWORD5, 80(BX)
+    VMOVDQU XWORD6, 96(BX)
+    VMOVDQU XWORD7, 112(BX)
+
+avx_done_sm4:
     RET
 
 avx2:
     VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
-    CMPQ DI, $64
-    JBE avx2_4blocks
+    CMPQ DI, $256
+    JEQ avx2_16blocks
 
 avx2_8blocks:
     VMOVDQU 0(DX), XDWORD0
@@ -235,17 +247,7 @@ avx2_8blocks:
     // Transpose matrix 4 x 4 32bits word
     TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
 
-    XORL CX, CX
-
-avx2_loop:
-    AVX2_SM4_ROUND(0, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
-    AVX2_SM4_ROUND(1, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
-    AVX2_SM4_ROUND(2, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
-    AVX2_SM4_ROUND(3, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
-
-    ADDL $16, CX
-    CMPL CX, $4*32
-    JB avx2_loop
+    AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
 
     // Transpose matrix 4 x 4 32bits word
     TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
@@ -260,49 +262,60 @@ avx2_loop:
     VMOVDQU XDWORD1, 32(BX)
     VMOVDQU XDWORD2, 64(BX)
     VMOVDQU XDWORD3, 96(BX)
-    JMP avx2_sm4_done
 
-avx2_4blocks:
-    VMOVDQU 0(DX), XWORD0
-    VMOVDQU 16(DX), XWORD1
-    VMOVDQU 32(DX), XWORD2
-    VMOVDQU 48(DX), XWORD3
+    VZEROUPPER
+    RET
 
-    VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK
+avx2_16blocks:
+    VMOVDQU 0(DX), XDWORD0
+    VMOVDQU 32(DX), XDWORD1
+    VMOVDQU 64(DX), XDWORD2
+    VMOVDQU 96(DX), XDWORD3
+    VMOVDQU 128(DX), XDWORD4
+    VMOVDQU 160(DX), XDWORD5
+    VMOVDQU 192(DX), XDWORD6
+    VMOVDQU 224(DX), XDWORD7
 
-    VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
-    VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
-    VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
-    VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
+    VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
+
+    // Apply Byte Flip Mask: LE -> BE
+    VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
+    VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
+    VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
+    VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
+    VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
+    VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
+    VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
+    VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7
 
     // Transpose matrix 4 x 4 32bits word
-    TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
+    TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
+    TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)
 
-    XORL CX, CX
-
-avx2_4blocks_loop:
-    AVX2_SM4_ROUND_4BLOCKS(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3)
-    AVX2_SM4_ROUND_4BLOCKS(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0)
-    AVX2_SM4_ROUND_4BLOCKS(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1)
-    AVX2_SM4_ROUND_4BLOCKS(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2)
-
-    ADDL $16, CX
-    CMPL CX, $4*32
-    JB avx2_4blocks_loop
+    AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7)
 
     // Transpose matrix 4 x 4 32bits word
-    TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
+    TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
+    TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)
 
-    VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK
-    VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
-    VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
-    VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
-    VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
-
-    VMOVDQU XWORD0, 0(BX)
-    VMOVDQU XWORD1, 16(BX)
-    VMOVDQU XWORD2, 32(BX)
-    VMOVDQU XWORD3, 48(BX)
+    VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
+    VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
+    VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
+    VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
+    VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
+    VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
+    VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
+    VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
+    VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7
+
+    VMOVDQU XDWORD0, 0(BX)
+    VMOVDQU XDWORD1, 32(BX)
+    VMOVDQU XDWORD2, 64(BX)
+    VMOVDQU XDWORD3, 96(BX)
+    VMOVDQU XDWORD4, 128(BX)
+    VMOVDQU XDWORD5, 160(BX)
+    VMOVDQU XDWORD6, 192(BX)
+    VMOVDQU XDWORD7, 224(BX)
 
 avx2_sm4_done:
     VZEROUPPER
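The rewritten `encryptBlocksAsm` no longer loops over round-key indices four at a time; instead it inspects the source length (already held in DI) and jumps to a fixed-width batch. As a hedged Go-side illustration only (the function and constant names below are hypothetical; the byte thresholds $128 and $256 come from the comparisons in the assembly above):

```go
// blocksPerCall sketches the dispatch the amd64 routines perform on the
// src byte length: 256 bytes hit the 16-block AVX2 path, 128 bytes the
// 8-block SSE/AVX paths, anything else falls back to the 4-block batch.
func blocksPerCall(srcLen int, useAVX2 bool) int {
	switch {
	case useAVX2 && srcLen == 256:
		return 16 // avx2_16blocks
	case useAVX2:
		return 8 // avx2_8blocks
	case srcLen == 128:
		return 8 // sse_8blocks / avx_8blocks
	default:
		return 4 // original four-block batch
	}
}
```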
@@ -9,6 +9,10 @@
 #define t1 V3
 #define t2 V4
 #define t3 V5
+#define t4 V8
+#define t5 V9
+#define t6 V10
+#define t7 V11
 #define ZERO V16
 #define NIBBLE_MASK V20
 #define INVERSE_SHIFT_ROWS V21
@@ -184,6 +188,9 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
     CMP $1, R11
     BEQ sm4niblocks
 
+    CMP $128, R12
+    BEQ double_enc
+
     VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
     VREV32 t0.B16, t0.B16
     VREV32 t1.B16, t1.B16
@@ -215,6 +222,51 @@ encryptBlocksLoop:
     VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R9)
     RET
 
+double_enc:
+    VLD1.P 64(R10), [t0.S4, t1.S4, t2.S4, t3.S4]
+    VLD1.P 64(R10), [t4.S4, t5.S4, t6.S4, t7.S4]
+    VREV32 t0.B16, t0.B16
+    VREV32 t1.B16, t1.B16
+    VREV32 t2.B16, t2.B16
+    VREV32 t3.B16, t3.B16
+    VREV32 t4.B16, t4.B16
+    VREV32 t5.B16, t5.B16
+    VREV32 t6.B16, t6.B16
+    VREV32 t7.B16, t7.B16
+    PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+    PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
+
+    load_global_data_2()
+
+    VEOR ZERO.B16, ZERO.B16, ZERO.B16
+    EOR R0, R0
+
+encrypt8BlocksLoop:
+    SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
+    SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
+    SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
+    SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)
+
+    ADD $16, R0
+    CMP $128, R0
+    BNE encrypt8BlocksLoop
+
+    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+    TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
+    VREV32 t0.B16, t0.B16
+    VREV32 t1.B16, t1.B16
+    VREV32 t2.B16, t2.B16
+    VREV32 t3.B16, t3.B16
+    VREV32 t4.B16, t4.B16
+    VREV32 t5.B16, t5.B16
+    VREV32 t6.B16, t6.B16
+    VREV32 t7.B16, t7.B16
+
+    VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(R9)
+    VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(R9)
+
+    RET
+
 sm4niblocks:
     VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
     VLD1.P 64(R8), [V4.S4, V5.S4, V6.S4, V7.S4]
@@ -74,11 +74,18 @@ func (x *cbc) CryptBlocks(dst, src []byte) {
 	// Copy the last block of ciphertext in preparation as the new iv.
 	copy(x.tmp, src[end-BlockSize:end])
 
-	start := end - x.b.blocksSize
-	var temp []byte = make([]byte, x.b.blocksSize)
-	var batchSrc []byte = make([]byte, x.b.blocksSize+BlockSize)
-
 	decKeyPtr := &x.b.dec[0]
+
+	start := end - 2*x.b.blocksSize
+	for start > 0 {
+		decryptBlocksChain(decKeyPtr, dst[start:end], src[start:end], &src[start-BlockSize])
+		end = start
+		start -= 2*x.b.blocksSize
+	}
+
+	start = end - x.b.blocksSize
 	for start > 0 {
 		decryptBlocksChain(decKeyPtr, dst[start:end], src[start:end], &src[start-BlockSize])
 		end = start
@@ -86,6 +93,8 @@ func (x *cbc) CryptBlocks(dst, src []byte) {
 	}
 
 	// Handle remain first blocks
+	var temp []byte = make([]byte, x.b.blocksSize)
+	var batchSrc []byte = make([]byte, x.b.blocksSize+BlockSize)
 	copy(batchSrc, x.iv)
 	copy(batchSrc[BlockSize:], src[:end])
 	decryptBlocksChain(decKeyPtr, temp, batchSrc[BlockSize:], &batchSrc[0])
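CBC decryption of a span of blocks only needs the single ciphertext block just before that span as its IV, which is why the rewritten loop can peel double-width batches off the tail before finishing with single-width batches and the stored IV for the head. A hedged sketch of that batching pattern (the names below are mine, not the package's):

```go
// planBatches illustrates the tail-first schedule used above: peel off
// double-width batches from the end of the buffer, then single-width
// batches, then the head, which chains to the saved IV. total and batch
// are in bytes and total is assumed to be a multiple of the block size.
func planBatches(total, batch int) (spans [][2]int) {
	end := total
	for start := end - 2*batch; start > 0; start -= 2 * batch {
		spans = append(spans, [2]int{start, end}) // double-width batch
		end = start
	}
	for start := end - batch; start > 0; start -= batch {
		spans = append(spans, [2]int{start, end}) // single-width batch
		end = start
	}
	spans = append(spans, [2]int{0, end}) // head, decrypted against the saved IV
	return spans
}
```

Each span [start, end) is decrypted with `&src[start-BlockSize]` as its chaining value, so the batches can be processed independently of each other's plaintext.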
@ -85,6 +85,11 @@ done_sm4:
|
|||||||
#define XDWORD2 Y6
|
#define XDWORD2 Y6
|
||||||
#define XDWORD3 Y7
|
#define XDWORD3 Y7
|
||||||
|
|
||||||
|
#define XDWORD4 Y10
|
||||||
|
#define XDWORD5 Y11
|
||||||
|
#define XDWORD6 Y12
|
||||||
|
#define XDWORD7 Y14
|
||||||
|
|
||||||
#define XWTMP0 X0
|
#define XWTMP0 X0
|
||||||
#define XWTMP1 X1
|
#define XWTMP1 X1
|
||||||
#define XWTMP2 X2
|
#define XWTMP2 X2
|
||||||
@ -94,6 +99,11 @@ done_sm4:
|
|||||||
#define XWORD2 X6
|
#define XWORD2 X6
|
||||||
#define XWORD3 X7
|
#define XWORD3 X7
|
||||||
|
|
||||||
|
#define XWORD4 X10
|
||||||
|
#define XWORD5 X11
|
||||||
|
#define XWORD6 X12
|
||||||
|
#define XWORD7 X14
|
||||||
|
|
||||||
#define NIBBLE_MASK Y3
|
#define NIBBLE_MASK Y3
|
||||||
#define X_NIBBLE_MASK X3
|
#define X_NIBBLE_MASK X3
|
||||||
|
|
||||||
@ -111,6 +121,7 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
|
|||||||
MOVQ xk+0(FP), AX
|
MOVQ xk+0(FP), AX
|
||||||
MOVQ dst+8(FP), BX
|
MOVQ dst+8(FP), BX
|
||||||
MOVQ src+32(FP), DX
|
MOVQ src+32(FP), DX
|
||||||
|
MOVQ src_len+40(FP), DI
|
||||||
MOVQ iv+56(FP), SI
|
MOVQ iv+56(FP), SI
|
||||||
|
|
||||||
CMPB ·useAVX2(SB), $1
|
CMPB ·useAVX2(SB), $1
|
||||||
@ -120,84 +131,71 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
|
|||||||
JE avx
|
JE avx
|
||||||
|
|
||||||
non_avx2_start:
|
non_avx2_start:
|
||||||
MOVOU 0(DX), t0
|
CMPQ DI, $128
|
||||||
MOVOU 16(DX), t1
|
JEQ sse_8blocks
|
||||||
MOVOU 32(DX), t2
|
|
||||||
MOVOU 48(DX), t3
|
|
||||||
PSHUFB flip_mask<>(SB), t0
|
|
||||||
PSHUFB flip_mask<>(SB), t1
|
|
||||||
PSHUFB flip_mask<>(SB), t2
|
|
||||||
PSHUFB flip_mask<>(SB), t3
|
|
||||||
SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y)
|
|
||||||
|
|
||||||
XORL CX, CX
|
MOVOU 0(DX), XWORD0
|
||||||
|
MOVOU 16(DX), XWORD1
|
||||||
|
MOVOU 32(DX), XWORD2
|
||||||
|
MOVOU 48(DX), XWORD3
|
||||||
|
|
||||||
loop:
|
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||||
SM4_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3)
|
|
||||||
SM4_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0)
|
|
||||||
SM4_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1)
|
|
||||||
SM4_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2)
|
|
||||||
|
|
||||||
ADDL $16, CX
|
PXOR 0(SI), XWORD0
|
||||||
CMPL CX, $4*32
|
PXOR 16(SI), XWORD1
|
||||||
JB loop
|
PXOR 32(SI), XWORD2
|
||||||
|
PXOR 48(SI), XWORD3
|
||||||
|
|
||||||
SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y);
|
MOVUPS XWORD0, 0(BX)
|
||||||
PSHUFB bswap_mask<>(SB), t3
|
MOVUPS XWORD1, 16(BX)
|
||||||
PSHUFB bswap_mask<>(SB), t2
|
MOVUPS XWORD2, 32(BX)
|
||||||
PSHUFB bswap_mask<>(SB), t1
|
MOVUPS XWORD3, 48(BX)
|
||||||
PSHUFB bswap_mask<>(SB), t0
|
|
||||||
|
|
||||||
PXOR 0(SI), t0
|
RET
|
||||||
PXOR 16(SI), t1
|
|
||||||
PXOR 32(SI), t2
|
|
||||||
PXOR 48(SI), t3
|
|
||||||
|
|
||||||
MOVUPS t0, 0(BX)
|
sse_8blocks:
|
||||||
MOVUPS t1, 16(BX)
|
MOVOU 0(DX), XWORD0
|
||||||
MOVUPS t2, 32(BX)
|
MOVOU 16(DX), XWORD1
|
||||||
MOVUPS t3, 48(BX)
|
MOVOU 32(DX), XWORD2
|
||||||
|
MOVOU 48(DX), XWORD3
|
||||||
|
MOVOU 64(DX), XWORD4
|
||||||
|
MOVOU 80(DX), XWORD5
|
||||||
|
MOVOU 96(DX), XWORD6
|
||||||
|
MOVOU 112(DX), XWORD7
|
||||||
|
|
||||||
|
SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
|
||||||
|
|
||||||
|
PXOR 0(SI), XWORD0
|
||||||
|
PXOR 16(SI), XWORD1
|
||||||
|
PXOR 32(SI), XWORD2
|
||||||
|
PXOR 48(SI), XWORD3
|
||||||
|
PXOR 64(SI), XWORD4
|
||||||
|
PXOR 80(SI), XWORD5
|
||||||
|
PXOR 96(SI), XWORD6
|
||||||
|
PXOR 112(SI), XWORD7
|
||||||
|
|
||||||
|
MOVOU XWORD0, 0(BX)
|
||||||
|
MOVOU XWORD1, 16(BX)
|
||||||
|
MOVOU XWORD2, 32(BX)
|
||||||
|
MOVOU XWORD3, 48(BX)
|
||||||
|
MOVOU XWORD4, 64(BX)
|
||||||
|
MOVOU XWORD5, 80(BX)
|
||||||
|
MOVOU XWORD6, 96(BX)
|
||||||
|
MOVOU XWORD7, 112(BX)
|
||||||
|
|
||||||
done_sm4:
|
done_sm4:
|
||||||
RET
|
RET
|
||||||
|
|
||||||
avx:
|
avx:
|
||||||
|
CMPQ DI, $128
|
||||||
|
JEQ avx_8blocks
|
||||||
|
|
||||||
VMOVDQU 0(DX), XWORD0
|
VMOVDQU 0(DX), XWORD0
|
||||||
VMOVDQU 16(DX), XWORD1
|
VMOVDQU 16(DX), XWORD1
|
||||||
VMOVDQU 32(DX), XWORD2
|
VMOVDQU 32(DX), XWORD2
|
||||||
VMOVDQU 48(DX), XWORD3
|
VMOVDQU 48(DX), XWORD3
|
||||||
|
|
||||||
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
|
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
|
||||||
VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK
|
|
||||||
|
|
||||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
|
|
||||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
|
|
||||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
|
|
||||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
|
|
||||||
|
|
||||||
// Transpose matrix 4 x 4 32bits word
|
|
||||||
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
|
|
||||||
|
|
||||||
XORL CX, CX
|
|
||||||
|
|
||||||
avx_loop:
|
|
||||||
AVX_SM4_ROUND(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3)
|
|
||||||
AVX_SM4_ROUND(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0)
|
|
||||||
AVX_SM4_ROUND(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1)
|
|
||||||
AVX_SM4_ROUND(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2)
|
|
||||||
|
|
||||||
ADDL $16, CX
|
|
||||||
CMPL CX, $4*32
|
|
||||||
JB avx_loop
|
|
||||||
|
|
||||||
// Transpose matrix 4 x 4 32bits word
|
|
||||||
TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
|
|
||||||
|
|
||||||
VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK
|
|
||||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
|
|
||||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
|
|
||||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
|
|
||||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
|
|
||||||
|
|
||||||
VPXOR 0(SI), XWORD0, XWORD0
|
VPXOR 0(SI), XWORD0, XWORD0
|
||||||
VPXOR 16(SI), XWORD1, XWORD1
|
VPXOR 16(SI), XWORD1, XWORD1
|
||||||
@@ -208,11 +206,45 @@ avx_loop:
     VMOVDQU XWORD1, 16(BX)
     VMOVDQU XWORD2, 32(BX)
     VMOVDQU XWORD3, 48(BX)
+    RET
+
+avx_8blocks:
+    VMOVDQU 0(DX), XWORD0
+    VMOVDQU 16(DX), XWORD1
+    VMOVDQU 32(DX), XWORD2
+    VMOVDQU 48(DX), XWORD3
+    VMOVDQU 64(DX), XWORD4
+    VMOVDQU 80(DX), XWORD5
+    VMOVDQU 96(DX), XWORD6
+    VMOVDQU 112(DX), XWORD7
+
+    AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
+
+    VPXOR 0(SI), XWORD0, XWORD0
+    VPXOR 16(SI), XWORD1, XWORD1
+    VPXOR 32(SI), XWORD2, XWORD2
+    VPXOR 48(SI), XWORD3, XWORD3
+    VPXOR 64(SI), XWORD4, XWORD4
+    VPXOR 80(SI), XWORD5, XWORD5
+    VPXOR 96(SI), XWORD6, XWORD6
+    VPXOR 112(SI), XWORD7, XWORD7
+
+    VMOVDQU XWORD0, 0(BX)
+    VMOVDQU XWORD1, 16(BX)
+    VMOVDQU XWORD2, 32(BX)
+    VMOVDQU XWORD3, 48(BX)
+    VMOVDQU XWORD4, 64(BX)
+    VMOVDQU XWORD5, 80(BX)
+    VMOVDQU XWORD6, 96(BX)
+    VMOVDQU XWORD7, 112(BX)
+
+avx_sm4_done:
     RET

 avx2:
     VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
+    CMPQ DI, $256
+    JEQ avx2_16blocks
+
 avx2_8blocks:
     VMOVDQU 0(DX), XDWORD0
@@ -230,17 +262,7 @@ avx2_8blocks:
     // Transpose matrix 4 x 4 32bits word
     TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)

-    XORL CX, CX
-
-avx2_loop:
-    AVX2_SM4_ROUND(0, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
-    AVX2_SM4_ROUND(1, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
-    AVX2_SM4_ROUND(2, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
-    AVX2_SM4_ROUND(3, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
-
-    ADDL $16, CX
-    CMPL CX, $4*32
-    JB avx2_loop
+    AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

     // Transpose matrix 4 x 4 32bits word
     TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
@@ -261,6 +283,68 @@ avx2_loop:
     VMOVDQU XDWORD2, 64(BX)
     VMOVDQU XDWORD3, 96(BX)

+    VZEROUPPER
+    RET
+
+avx2_16blocks:
+    VMOVDQU 0(DX), XDWORD0
+    VMOVDQU 32(DX), XDWORD1
+    VMOVDQU 64(DX), XDWORD2
+    VMOVDQU 96(DX), XDWORD3
+    VMOVDQU 128(DX), XDWORD4
+    VMOVDQU 160(DX), XDWORD5
+    VMOVDQU 192(DX), XDWORD6
+    VMOVDQU 224(DX), XDWORD7
+
+    VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
+
+    // Apply Byte Flip Mask: LE -> BE
+    VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
+    VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
+    VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
+    VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
+    VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
+    VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
+    VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
+    VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7
+
+    // Transpose matrix 4 x 4 32bits word
+    TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
+    TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)
+
+    AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7)
+
+    // Transpose matrix 4 x 4 32bits word
+    TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
+    TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)
+
+    VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
+    VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
+    VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
+    VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
+    VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
+    VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
+    VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
+    VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
+    VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7
+
+    VPXOR 0(SI), XDWORD0, XDWORD0
+    VPXOR 32(SI), XDWORD1, XDWORD1
+    VPXOR 64(SI), XDWORD2, XDWORD2
+    VPXOR 96(SI), XDWORD3, XDWORD3
+    VPXOR 128(SI), XDWORD4, XDWORD4
+    VPXOR 160(SI), XDWORD5, XDWORD5
+    VPXOR 192(SI), XDWORD6, XDWORD6
+    VPXOR 224(SI), XDWORD7, XDWORD7
+
+    VMOVDQU XDWORD0, 0(BX)
+    VMOVDQU XDWORD1, 32(BX)
+    VMOVDQU XDWORD2, 64(BX)
+    VMOVDQU XDWORD3, 96(BX)
+    VMOVDQU XDWORD4, 128(BX)
+    VMOVDQU XDWORD5, 160(BX)
+    VMOVDQU XDWORD6, 192(BX)
+    VMOVDQU XDWORD7, 224(BX)
+
 avx2_sm4_done:
     VZEROUPPER
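For orientation, the dispatch that the new length checks add can be read as a small Go sketch: a 128-byte input takes the 8-block route, a 256-byte input takes the AVX2 16-block route, and anything else stays on the original 4-block body (the matching SSE check sits above this excerpt). The helper below is purely illustrative; its name and string results are not part of the package, it only restates the CMPQ/JEQ structure shown above.

package main

import "fmt"

// pathFor names the label the length checks dispatch to, per call:
// 128 bytes is 8 SM4 blocks, 256 bytes is 16. Illustrative only.
func pathFor(isa string, nbytes int) string {
	switch {
	case isa == "avx2" && nbytes == 256:
		return "avx2_16blocks"
	case isa == "avx2":
		return "avx2_8blocks"
	case isa == "avx" && nbytes == 128:
		return "avx_8blocks"
	case isa == "avx":
		return "avx (4-block body)"
	case nbytes == 128:
		return "sse_8blocks"
	default:
		return "sse (4-block body)"
	}
}

func main() {
	fmt.Println(pathFor("avx", 128))  // avx_8blocks
	fmt.Println(pathFor("avx2", 256)) // avx2_16blocks
}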
@@ -88,6 +88,10 @@ done_sm4:
 #undef rkSave

 #define XTMP7 V7
+#define t4 V10
+#define t5 V11
+#define t6 V12
+#define t7 V13

 // func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
 TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
@@ -99,6 +103,8 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
     MOVD src_len+40(FP), R12
     MOVD iv+56(FP), R11

+    CMP $128, R12
+    BEQ double_dec
+
     VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
     VREV32 t0.B16, t0.B16
@@ -135,3 +141,57 @@ encryptBlocksLoop:

     VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R9)
     RET
+
+double_dec:
+    VLD1.P 64(R10), [t0.S4, t1.S4, t2.S4, t3.S4]
+    VLD1.P 64(R10), [t4.S4, t5.S4, t6.S4, t7.S4]
+    VREV32 t0.B16, t0.B16
+    VREV32 t1.B16, t1.B16
+    VREV32 t2.B16, t2.B16
+    VREV32 t3.B16, t3.B16
+    VREV32 t4.B16, t4.B16
+    VREV32 t5.B16, t5.B16
+    VREV32 t6.B16, t6.B16
+    VREV32 t7.B16, t7.B16
+    PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+    PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
+
+    VEOR ZERO.B16, ZERO.B16, ZERO.B16
+    EOR R0, R0
+
+decrypt8BlocksLoop:
+    SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
+    SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
+    SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
+    SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)
+
+    ADD $16, R0
+    CMP $128, R0
+    BNE decrypt8BlocksLoop
+
+    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+    TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
+    VREV32 t0.B16, t0.B16
+    VREV32 t1.B16, t1.B16
+    VREV32 t2.B16, t2.B16
+    VREV32 t3.B16, t3.B16
+    VREV32 t4.B16, t4.B16
+    VREV32 t5.B16, t5.B16
+    VREV32 t6.B16, t6.B16
+    VREV32 t7.B16, t7.B16
+
+    VLD1.P 64(R11), [V6.S4, V7.S4, V8.S4, V9.S4]
+    VEOR V6.B16, t0.B16, t0.B16
+    VEOR V7.B16, t1.B16, t1.B16
+    VEOR V8.B16, t2.B16, t2.B16
+    VEOR V9.B16, t3.B16, t3.B16
+    VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(R9)
+
+    VLD1.P 64(R11), [V6.S4, V7.S4, V8.S4, V9.S4]
+    VEOR V6.B16, t4.B16, t4.B16
+    VEOR V7.B16, t5.B16, t5.B16
+    VEOR V8.B16, t6.B16, t6.B16
+    VEOR V9.B16, t7.B16, t7.B16
+    VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(R9)
+
+    RET
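The added double_dec path decrypts eight blocks per pass: two register quartets, t0-t3 and t4-t7, run through the same 32 rounds, and each result is XORed with a block loaded from the iv pointer before being stored. For readability, here is a plain Go reference of the CBC-style chaining relation (plaintext = Decrypt(ciphertext) XOR previous ciphertext) that such a batched routine has to preserve; it assumes the iv buffer supplies the preceding ciphertext blocks and is only a sketch, not the package's implementation.

package main

import (
	"crypto/aes"
	"crypto/cipher"
	"fmt"
)

// decryptChained is a block-at-a-time reference for CBC-style
// decryption: each plaintext block is Decrypt(ciphertext block)
// XORed with the previous ciphertext block (the IV for the first
// block). dst and src must not overlap in this sketch.
func decryptChained(b cipher.Block, iv, dst, src []byte) {
	bs := b.BlockSize()
	prev := iv
	for len(src) >= bs {
		b.Decrypt(dst[:bs], src[:bs])
		for i := 0; i < bs; i++ {
			dst[i] ^= prev[i]
		}
		prev = src[:bs]
		dst, src = dst[bs:], src[bs:]
	}
}

func main() {
	b, _ := aes.NewCipher(make([]byte, 16)) // AES stands in for SM4 here
	iv := make([]byte, 16)
	ct := make([]byte, 128) // eight blocks, the size double_dec handles
	pt := make([]byte, len(ct))
	decryptChained(b, iv, pt, ct)
	fmt.Printf("%x\n", pt[:8])
}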
@@ -54,6 +54,15 @@ func (x *ecb) CryptBlocks(dst, src []byte) {
 	if len(src) == 0 {
 		return
 	}
+	for len(src) >= 2*x.b.blocksSize {
+		if x.enc == ecbEncrypt {
+			x.b.EncryptBlocks(dst[:2*x.b.blocksSize], src[:2*x.b.blocksSize])
+		} else {
+			x.b.DecryptBlocks(dst[:2*x.b.blocksSize], src[:2*x.b.blocksSize])
+		}
+		src = src[2*x.b.blocksSize:]
+		dst = dst[2*x.b.blocksSize:]
+	}
 	for len(src) >= x.b.blocksSize {
 		if x.enc == ecbEncrypt {
 			x.b.EncryptBlocks(dst[:x.b.blocksSize], src[:x.b.blocksSize])
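This is the Go-side half of the throughput change: while at least two batches remain, CryptBlocks hands the backend a double-width slice, which is what lets the assembly take its 8-block routes. Below is a standalone sketch of the same loop shape; the wideBlock interface, encryptAll helper, and toy cipher are hypothetical names used only for illustration.

package main

import "fmt"

// wideBlock is a hypothetical stand-in for a cipher backend that can
// process a whole batch of blocks per call, as the SM4 assembly does.
type wideBlock interface {
	EncryptBlocks(dst, src []byte)
}

// encryptAll walks src double-batch first, then single-batch, which is
// the loop structure added to (*ecb).CryptBlocks above.
func encryptAll(b wideBlock, batchBytes int, dst, src []byte) {
	for len(src) >= 2*batchBytes {
		b.EncryptBlocks(dst[:2*batchBytes], src[:2*batchBytes])
		src, dst = src[2*batchBytes:], dst[2*batchBytes:]
	}
	for len(src) >= batchBytes {
		b.EncryptBlocks(dst[:batchBytes], src[:batchBytes])
		src, dst = src[batchBytes:], dst[batchBytes:]
	}
}

// xorBatch is a toy "cipher" for the demo: it just XORs a constant.
type xorBatch byte

func (x xorBatch) EncryptBlocks(dst, src []byte) {
	for i := range src {
		dst[i] = src[i] ^ byte(x)
	}
}

func main() {
	src := make([]byte, 192) // three 64-byte batches
	dst := make([]byte, len(src))
	encryptAll(xorBatch(0x42), 64, dst, src)
	fmt.Printf("%x\n", dst[:4])
}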
378    sm4/gcm_amd64.s
@@ -155,114 +155,6 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
 #undef plen
 #undef dlen

-#define AVX_SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3) \
-    VMOVDQU flip_mask<>(SB), x \
-    VPSHUFB x, t0, t0 \
-    VPSHUFB x, t1, t1 \
-    VPSHUFB x, t2, t2 \
-    VPSHUFB x, t3, t3 \
-    ; \
-    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
-    XORL IND, IND \
-    VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK \
-    AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
-    AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
-    AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
-    AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
-    ADDL $16, IND; \
-    AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
-    AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
-    AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
-    AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
-    ADDL $16, IND; \
-    AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
-    AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
-    AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
-    AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
-    ADDL $16, IND; \
-    AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
-    AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
-    AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
-    AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
-    ADDL $16, IND; \
-    AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
-    AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
-    AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
-    AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
-    ADDL $16, IND; \
-    AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
-    AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
-    AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
-    AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
-    ADDL $16, IND; \
-    AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
-    AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
-    AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
-    AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
-    ADDL $16, IND; \
-    AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
-    AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
-    AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
-    AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
-    ; \ // Transpose matrix 4 x 4 32bits word
-    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
-    VPSHUFB BSWAP, t0, t0 \
-    VPSHUFB BSWAP, t1, t1 \
-    VPSHUFB BSWAP, t2, t2 \
-    VPSHUFB BSWAP, t3, t3 \
-
-#define SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3) \
-    PSHUFB flip_mask<>(SB), t0; \
-    PSHUFB flip_mask<>(SB), t1; \
-    PSHUFB flip_mask<>(SB), t2; \
-    PSHUFB flip_mask<>(SB), t3; \
-    SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
-    XORL IND, IND; \
-    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
-    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
-    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
-    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
-    ADDL $16, IND; \
-    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
-    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
-    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
-    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
-    ADDL $16, IND; \
-    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
-    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
-    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
-    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
-    ADDL $16, IND; \
-    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
-    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
-    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
-    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
-    ADDL $16, IND; \
-    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
-    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
-    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
-    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
-    ADDL $16, IND; \
-    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
-    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
-    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
-    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
-    ADDL $16, IND; \
-    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
-    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
-    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
-    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
-    ADDL $16, IND; \
-    SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
-    SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
-    SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
-    SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
-    SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
-    PSHUFB BSWAP, t3; \
-    PSHUFB BSWAP, t2; \
-    PSHUFB BSWAP, t1; \
-    PSHUFB BSWAP, t0
-
 // func gcmSm4Init(productTable *[256]byte, rk []uint32)
 TEXT ·gcmSm4Init(SB),NOSPLIT,$0
 #define dst DI
@@ -676,12 +568,12 @@ TEXT ·gcmSm4Enc(SB),0,$256-96
     MOVOU (8*16 + 6*16)(SP), B6
     MOVOU (8*16 + 7*16)(SP), B7

-    SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
+    SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
+    PXOR ACC1, ACC1
     increment(0)
     increment(1)
     increment(2)
     increment(3)
-    SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
     increment(4)
     increment(5)
     increment(6)
@@ -762,7 +654,6 @@ gcmSm4EncOctetsLoop:
     PCLMULQDQ $0x00, T0, ACC0
     PCLMULQDQ $0x11, T0, ACC1

-    SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
     mulRound(1)
     increment(0)
     mulRound(2)
@@ -771,7 +662,6 @@ gcmSm4EncOctetsLoop:
     increment(2)
     mulRound(4)
     increment(3)
-    SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
     mulRound(5)
     increment(4)
     mulRound(6)
@@ -791,6 +681,8 @@ gcmSm4EncOctetsLoop:
     reduceRound(ACC0)
     PXOR ACC1, ACC0

+    SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
+
     MOVOU (16*0)(ptx), T0
     PXOR T0, B0
     MOVOU (16*1)(ptx), T0
@@ -886,7 +778,7 @@ gcmSm4EncNibbles:
     MOVOU (8*16 + 2*16)(SP), B2
     MOVOU (8*16 + 3*16)(SP), B3

-    SM4_4BLOCKS(AX, BX, T0, T1, T2, B0, B1, B2, B3)
+    SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
     MOVOU (16*0)(ptx), T0
     PXOR T0, B0
     MOVOU (16*1)(ptx), T0
@@ -922,7 +814,7 @@ gcmSm4EncSingles:
     MOVOU (8*16 + 2*16)(SP), B2
     MOVOU (8*16 + 3*16)(SP), B3

-    SM4_4BLOCKS(AX, BX, T0, T1, T2, B0, B1, B2, B3)
+    SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
     MOVOU B0, (16*0)(SP)
     MOVOU B1, (16*1)(SP)
     MOVOU B2, (16*2)(SP)
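In the octets loop above, the two SM4_4BLOCKS calls that were interleaved with the GHASH multiplications are replaced by one SM4_8BLOCKS call after the reduction, so all eight counter blocks become key stream in a single pass. A hedged Go sketch of that counter-block batching follows, with a generic cipher.Block (AES here) standing in for the SM4 assembly; keystream8 is an illustrative name, not an API of this package.

package main

import (
	"crypto/aes"
	"crypto/cipher"
	"encoding/binary"
	"fmt"
)

// keystream8 fills and encrypts eight successive counter blocks in one
// batch, which is the shape the SM4_8BLOCKS macro gives the GCM loop.
// The real code keeps counters and state in registers; this sketch only
// illustrates the batching and is not the package's implementation.
func keystream8(b cipher.Block, nonce [12]byte, ctr uint32, out *[8][16]byte) {
	for i := range out {
		copy(out[i][:12], nonce[:])
		binary.BigEndian.PutUint32(out[i][12:], ctr+uint32(i))
		b.Encrypt(out[i][:], out[i][:]) // one block of key stream
	}
}

func main() {
	b, _ := aes.NewCipher(make([]byte, 16)) // AES stands in for SM4
	var ks [8][16]byte
	keystream8(b, [12]byte{}, 2, &ks) // GCM payload counters start at 2
	fmt.Printf("%x\n", ks[0][:8])
}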
@@ -1014,17 +906,30 @@ avxGcmSm4Enc:
     VMOVDQU (8*16 + 1*16)(SP), B1
     VMOVDQU (8*16 + 2*16)(SP), B2
     VMOVDQU (8*16 + 3*16)(SP), B3
+    VMOVDQU (8*16 + 4*16)(SP), B4
+    VMOVDQU (8*16 + 5*16)(SP), B5
+    VMOVDQU (8*16 + 6*16)(SP), B6
+    VMOVDQU (8*16 + 7*16)(SP), B7

-    AVX_SM4_4BLOCKS(rk, BX, B7, T1, T2, B0, B1, B2, B3)
+    AVX_SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
+    VPXOR ACC1, ACC1, ACC1 // clean ACC1
     increment(0)
     increment(1)
     increment(2)
     increment(3)
+    increment(4)
+    increment(5)
+    increment(6)
+    increment(7)
     // XOR plaintext
     VPXOR (16*0)(ptx), B0, B0
     VPXOR (16*1)(ptx), B1, B1
     VPXOR (16*2)(ptx), B2, B2
     VPXOR (16*3)(ptx), B3, B3
+    VPXOR (16*4)(ptx), B4, B4
+    VPXOR (16*5)(ptx), B5, B5
+    VPXOR (16*6)(ptx), B6, B6
+    VPXOR (16*7)(ptx), B7, B7
     // Store ciphertext
     VMOVDQU B0, (16*0)(ctx)
     VPSHUFB BSWAP, B0, B0
@@ -1034,31 +939,6 @@ avxGcmSm4Enc:
     VPSHUFB BSWAP, B2, B2
     VMOVDQU B3, (16*3)(ctx)
     VPSHUFB BSWAP, B3, B3
-    VPXOR ACC0, B0, B0
-
-    VMOVDQU B0, (16*0)(SP)
-    VMOVDQU B1, (16*1)(SP)
-    VMOVDQU B2, (16*2)(SP)
-    VMOVDQU B3, (16*3)(SP)
-
-    // load 4 ctrs for encryption
-    VMOVDQU (8*16 + 4*16)(SP), B4
-    VMOVDQU (8*16 + 5*16)(SP), B5
-    VMOVDQU (8*16 + 6*16)(SP), B6
-    VMOVDQU (8*16 + 7*16)(SP), B7
-    AVX_SM4_4BLOCKS(rk, BX, B0, T1, T2, B4, B5, B6, B7)
-    increment(4)
-    increment(5)
-    increment(6)
-    increment(7)
-
-    // XOR plaintext
-    VPXOR (16*4)(ptx), B4, B4
-    VPXOR (16*5)(ptx), B5, B5
-    VPXOR (16*6)(ptx), B6, B6
-    VPXOR (16*7)(ptx), B7, B7
-
-    // Store ciphertext
     VMOVDQU B4, (16*4)(ctx)
     VPSHUFB BSWAP, B4, B4
     VMOVDQU B5, (16*5)(ctx)
@@ -1068,6 +948,12 @@ avxGcmSm4Enc:
     VMOVDQU B7, (16*7)(ctx)
     VPSHUFB BSWAP, B7, B7

+    VPXOR ACC0, B0, B0
+
+    VMOVDQU B0, (16*0)(SP)
+    VMOVDQU B1, (16*1)(SP)
+    VMOVDQU B2, (16*2)(SP)
+    VMOVDQU B3, (16*3)(SP)
     VMOVDQU B4, (16*4)(SP)
     VMOVDQU B5, (16*5)(SP)
     VMOVDQU B6, (16*6)(SP)
@@ -1129,12 +1015,16 @@ avxGcmSm4EncOctetsLoop:
     avxReduceRound(ACC0)
     VPXOR ACC1, ACC0, ACC0

-    AVX_SM4_4BLOCKS(rk, BX, ACC1, T1, T2, B0, B1, B2, B3)
+    AVX_SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
     // XOR plaintext
     VPXOR (16*0)(ptx), B0, B0
     VPXOR (16*1)(ptx), B1, B1
     VPXOR (16*2)(ptx), B2, B2
     VPXOR (16*3)(ptx), B3, B3
+    VPXOR (16*4)(ptx), B4, B4
+    VPXOR (16*5)(ptx), B5, B5
+    VPXOR (16*6)(ptx), B6, B6
+    VPXOR (16*7)(ptx), B7, B7

     // Store ciphertext
     VMOVDQU B0, (16*0)(ctx)
@@ -1145,21 +1035,6 @@ avxGcmSm4EncOctetsLoop:
     VPSHUFB BSWAP, B2, B2
     VMOVDQU B3, (16*3)(ctx)
     VPSHUFB BSWAP, B3, B3
-
-    VPXOR ACC0, B0, B0
-    VMOVDQU B0, (16*0)(SP)
-    VMOVDQU B1, (16*1)(SP)
-    VMOVDQU B2, (16*2)(SP)
-    VMOVDQU B3, (16*3)(SP)
-
-    AVX_SM4_4BLOCKS(rk, BX, B0, T1, T2, B4, B5, B6, B7)
-    // XOR plaintext
-    VPXOR (16*4)(ptx), B4, B4
-    VPXOR (16*5)(ptx), B5, B5
-    VPXOR (16*6)(ptx), B6, B6
-    VPXOR (16*7)(ptx), B7, B7
-
-    // Store ciphertext
     VMOVDQU B4, (16*4)(ctx)
     VPSHUFB BSWAP, B4, B4
     VMOVDQU B5, (16*5)(ctx)
@@ -1169,6 +1044,11 @@ avxGcmSm4EncOctetsLoop:
     VMOVDQU B7, (16*7)(ctx)
     VPSHUFB BSWAP, B7, B7

+    VPXOR ACC0, B0, B0
+    VMOVDQU B0, (16*0)(SP)
+    VMOVDQU B1, (16*1)(SP)
+    VMOVDQU B2, (16*2)(SP)
+    VMOVDQU B3, (16*3)(SP)
     VMOVDQU B4, (16*4)(SP)
     VMOVDQU B5, (16*5)(SP)
     VMOVDQU B6, (16*6)(SP)
@@ -1226,7 +1106,7 @@ avxGcmSm4EncNibbles:
     VMOVDQU (8*16 + 2*16)(SP), B2
     VMOVDQU (8*16 + 3*16)(SP), B3

-    AVX_SM4_4BLOCKS(rk, BX, B7, T1, T2, B0, B1, B2, B3)
+    AVX_SM4_4BLOCKS(rk, B6, B7, T1, T2, B0, B1, B2, B3)
     // XOR plaintext
     VPXOR (16*0)(ptx), B0, B0
     VPXOR (16*1)(ptx), B1, B1
@@ -1261,7 +1141,7 @@ avxGcmSm4EncSingles:
     VMOVDQU (8*16 + 2*16)(SP), B2
     VMOVDQU (8*16 + 3*16)(SP), B3

-    AVX_SM4_4BLOCKS(rk, BX, B7, T1, T2, B0, B1, B2, B3)
+    AVX_SM4_4BLOCKS(rk, B6, B7, T1, T2, B0, B1, B2, B3)
     VMOVDQU B0, (16*0)(SP)
     VMOVDQU B1, (16*1)(SP)
     VMOVDQU B2, (16*2)(SP)
@@ -1364,18 +1244,9 @@ avx2GcmSm4Enc:

     // Transpose matrix 4 x 4 32bits word
     TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
-    XORL BX, BX
     VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK

-avx2GcmSm4Enc8Loop1:
-    AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
-    AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0)
-    AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1)
-    AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2)
-
-    ADDL $16, BX
-    CMPL BX, $4*32
-    JB avx2GcmSm4Enc8Loop1
+    AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)

     // Transpose matrix 4 x 4 32bits word
     TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
@@ -1458,18 +1329,9 @@ avx2GcmSm4EncOctetsLoop:

     // Transpose matrix 4 x 4 32bits word
     TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
-    XORL BX, BX
     VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK

-avx2GcmSm4Enc8Loop2:
-    AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
-    AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0)
-    AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1)
-    AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2)
-
-    ADDL $16, BX
-    CMPL BX, $4*32
-    JB avx2GcmSm4Enc8Loop2
+    AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)

     // Transpose matrix 4 x 4 32bits word
     TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
@@ -1578,7 +1440,6 @@ avx2GcmSm4EncOctetsEnd:
     SUBQ $4, aluCTR

 avx2GcmSm4EncNibbles:
-    VMOVDQU flip_mask<>(SB), B7
     CMPQ ptxLen, $64
     JBE avx2GcmSm4EncSingles
     SUBQ $64, ptxLen
@@ -1588,31 +1449,7 @@ avx2GcmSm4EncNibbles:
     VMOVDQU (8*16 + 2*16)(SP), B2
     VMOVDQU (8*16 + 3*16)(SP), B3

-    VPSHUFB B7, B0, B0
-    VPSHUFB B7, B1, B1
-    VPSHUFB B7, B2, B2
-    VPSHUFB B7, B3, B3
-
-    TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
-    XORL BX, BX
-    VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
-
-avx2GcmSm4Enc4Loop2:
-    AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
-    AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
-    AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
-    AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
-
-    ADDL $16, BX
-    CMPL BX, $4*32
-    JB avx2GcmSm4Enc4Loop2
-
-    // Transpose matrix 4 x 4 32bits word
-    TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
-    VPSHUFB BSWAP, B0, B0
-    VPSHUFB BSWAP, B1, B1
-    VPSHUFB BSWAP, B2, B2
-    VPSHUFB BSWAP, B3, B3
+    AVX_SM4_4BLOCKS(rk, B4, B5, B6, B7, B0, B1, B2, B3)

     VMOVDQU (16*0)(ptx), T0
     VPXOR T0, B0, B0
@@ -1650,31 +1487,7 @@ avx2GcmSm4EncSingles:
     VMOVDQU (8*16 + 2*16)(SP), B2
     VMOVDQU (8*16 + 3*16)(SP), B3

-    VPSHUFB B7, B0, B0
-    VPSHUFB B7, B1, B1
-    VPSHUFB B7, B2, B2
-    VPSHUFB B7, B3, B3
-
-    TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
-    XORL BX, BX
-    VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
-
-avx2GcmSm4Enc4Loop1:
-    AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
-    AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
-    AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
-    AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
-
-    ADDL $16, BX
-    CMPL BX, $4*32
-    JB avx2GcmSm4Enc4Loop1
-
-    // Transpose matrix 4 x 4 32bits word
-    TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
-    VPSHUFB BSWAP, B0, B0
-    VPSHUFB BSWAP, B1, B1
-    VPSHUFB BSWAP, B2, B2
-    VPSHUFB BSWAP, B3, B3
+    AVX_SM4_4BLOCKS(rk, B4, B5, B6, B7, B0, B1, B2, B3)

     VMOVDQU B0, (16*0)(SP)
     VMOVDQU B1, (16*1)(SP)
@@ -1890,7 +1703,6 @@ gcmSm4DecOctetsLoop:
     PCLMULQDQ $0x00, T0, ACC0
     PCLMULQDQ $0x11, T0, ACC1

-    SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
     decMulRound(1)
     increment(0)
     decMulRound(2)
@@ -1899,7 +1711,6 @@ gcmSm4DecOctetsLoop:
     increment(2)
     decMulRound(4)
     increment(3)
-    SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
     decMulRound(5)
     increment(4)
     decMulRound(6)
@@ -1920,6 +1731,8 @@ gcmSm4DecOctetsLoop:
     reduceRound(ACC0)
     PXOR ACC1, ACC0

+    SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
+
     MOVOU (16*0)(ctx), T0
     PXOR T0, B0
     MOVOU (16*1)(ctx), T0
@@ -1964,7 +1777,7 @@ gcmSm4DecNibbles:
     MOVOU (2*16)(SP), B6
     MOVOU (3*16)(SP), B7

-    SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
+    SM4_4BLOCKS(rk, B0, T0, T1, T2, B4, B5, B6, B7)
     MOVOU (16*14)(pTbl), T2
     MOVOU (16*0)(ctx), T0
     PXOR T0, B4
@@ -2000,7 +1813,7 @@ gcmSm4DecSingles:
     MOVOU (2*16)(SP), B2
     MOVOU (3*16)(SP), B3

-    SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
+    SM4_4BLOCKS(rk, B4, T0, T1, T2, B0, B1, B2, B3)
     MOVOU B0, (16*4)(SP)
     MOVOU B1, (16*5)(SP)
     MOVOU B2, (16*6)(SP)
@@ -2145,25 +1958,21 @@ avxGcmSm4DecOctetsLoop:
     avxReduceRound(ACC0)
     VPXOR ACC1, ACC0, ACC0

-    AVX_SM4_4BLOCKS(rk, BX, ACC1, T1, T2, B0, B1, B2, B3)
+    AVX_SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)

     VPXOR (16*0)(ctx), B0, B0
     VPXOR (16*1)(ctx), B1, B1
     VPXOR (16*2)(ctx), B2, B2
     VPXOR (16*3)(ctx), B3, B3

-    VMOVDQU B0, (16*0)(ptx)
-    VMOVDQU B1, (16*1)(ptx)
-    VMOVDQU B2, (16*2)(ptx)
-    VMOVDQU B3, (16*3)(ptx)
-
-    AVX_SM4_4BLOCKS(rk, BX, B0, T1, T2, B4, B5, B6, B7)
-
     VPXOR (16*4)(ctx), B4, B4
     VPXOR (16*5)(ctx), B5, B5
     VPXOR (16*6)(ctx), B6, B6
     VPXOR (16*7)(ctx), B7, B7

+    VMOVDQU B0, (16*0)(ptx)
+    VMOVDQU B1, (16*1)(ptx)
+    VMOVDQU B2, (16*2)(ptx)
+    VMOVDQU B3, (16*3)(ptx)
     VMOVDQU B4, (16*4)(ptx)
     VMOVDQU B5, (16*5)(ptx)
     VMOVDQU B6, (16*6)(ptx)
@@ -2187,7 +1996,7 @@ avxGcmSm4DecNibbles:
     VMOVDQU (2*16)(SP), B6
     VMOVDQU (3*16)(SP), B7

-    AVX_SM4_4BLOCKS(rk, BX, B0, T1, T2, B4, B5, B6, B7)
+    AVX_SM4_4BLOCKS(rk, B0, B1, T1, T2, B4, B5, B6, B7)

     VMOVDQU (16*14)(pTbl), T2
     VMOVDQU (16*0)(ctx), B0
@@ -2227,7 +2036,7 @@ avxGcmSm4DecSingles:
     VMOVDQU (2*16)(SP), B2
     VMOVDQU (3*16)(SP), B3

-    AVX_SM4_4BLOCKS(rk, BX, B7, B6, B5, B0, B1, B2, B3)
+    AVX_SM4_4BLOCKS(rk, B7, B6, B5, B4, B0, B1, B2, B3)
     VMOVDQU B0, (16*4)(SP)
     VMOVDQU B1, (16*5)(SP)
     VMOVDQU B2, (16*6)(SP)
@@ -2328,13 +2137,6 @@ avx2GcmSm4DecOctetsLoop:
     VMOVDQU (2*32)(SP), DWB2
     VMOVDQU (3*32)(SP), DWB3

-    VBROADCASTI128 flip_mask<>(SB), XDWTMP0
-    // Apply Byte Flip Mask: LE -> BE
-    VPSHUFB XDWTMP0, DWB0, DWB0
-    VPSHUFB XDWTMP0, DWB1, DWB1
-    VPSHUFB XDWTMP0, DWB2, DWB2
-    VPSHUFB XDWTMP0, DWB3, DWB3
-
     VMOVDQU (16*0)(ctx), T0
     VPSHUFB BSWAP, T0, T0
     VPXOR ACC0, T0, T0
@@ -2348,20 +2150,18 @@ avx2GcmSm4DecOctetsLoop:
     VPCLMULQDQ $0x00, T0, ACC1, ACC0
     VPCLMULQDQ $0x11, T0, ACC1, ACC1

+    VBROADCASTI128 flip_mask<>(SB), XDWTMP0
+    // Apply Byte Flip Mask: LE -> BE
+    VPSHUFB XDWTMP0, DWB0, DWB0
+    VPSHUFB XDWTMP0, DWB1, DWB1
+    VPSHUFB XDWTMP0, DWB2, DWB2
+    VPSHUFB XDWTMP0, DWB3, DWB3
+
     // Transpose matrix 4 x 4 32bits word
     TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
-    XORL BX, BX
     VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK

-avx2GcmSm4Dec8Loop2:
-    AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
-    AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0)
-    AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1)
-    AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2)
-
-    ADDL $16, BX
-    CMPL BX, $4*32
-    JB avx2GcmSm4Dec8Loop2
+    AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)

     // Transpose matrix 4 x 4 32bits word
     TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
@@ -2374,8 +2174,8 @@ avx2GcmSm4Dec8Loop2:

     VMOVDQU (32*0)(ctx), XDWTMP0
     VPXOR XDWTMP0, DWB0, DWB0
-    VPSHUFB DWBSWAP, XDWTMP0, XDWTMP0
     VEXTRACTI128 $1, XDWTMP0, T0
+    VPSHUFB BSWAP, T0, T0
     internalAvxDecMulRound(1)
     increment(0)

@@ -2436,7 +2236,6 @@ avx2GcmSm4DecEndOctets:
     SUBQ $4, aluCTR

 avx2GcmSm4DecNibbles:
-    VMOVDQU flip_mask<>(SB), B7 // DO NOT CHANGE B7
     CMPQ ptxLen, $64
     JBE avx2GcmSm4DecSingles
     SUBQ $64, ptxLen
@@ -2446,31 +2245,7 @@ avx2GcmSm4DecNibbles:
     VMOVDQU (2*16)(SP), B2
     VMOVDQU (3*16)(SP), B3

-    VPSHUFB B7, B0, B0
-    VPSHUFB B7, B1, B1
-    VPSHUFB B7, B2, B2
-    VPSHUFB B7, B3, B3
-
-    TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
-    XORL BX, BX
-    VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
-
-avx2GcmSm4Dec4Loop2:
-    AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
-    AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
-    AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
-    AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
-
-    ADDL $16, BX
-    CMPL BX, $4*32
-    JB avx2GcmSm4Dec4Loop2
-
-    // Transpose matrix 4 x 4 32bits word
-    TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
-    VPSHUFB BSWAP, B0, B4
-    VPSHUFB BSWAP, B1, B1
-    VPSHUFB BSWAP, B2, B2
-    VPSHUFB BSWAP, B3, B3
+    AVX_SM4_4BLOCKS(rk, B4, B5, B6, B7, B0, B1, B2, B3)

     VMOVDQU (16*14)(pTbl), T2
     VMOVDQU (16*0)(ctx), B0
@@ -2511,32 +2286,7 @@ avx2GcmSm4DecSingles:
     VMOVDQU (2*16)(SP), B2
     VMOVDQU (3*16)(SP), B3

-    VPSHUFB B7, B0, B0
-    VPSHUFB B7, B1, B1
-    VPSHUFB B7, B2, B2
-    VPSHUFB B7, B3, B3
-
-    TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
-
-    XORL BX, BX
-    VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
-
-avx2GcmSm4Dec4Loop1:
-    AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
-    AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
-    AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
-    AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
-
-    ADDL $16, BX
-    CMPL BX, $4*32
-    JB avx2GcmSm4Dec4Loop1
-
-    // Transpose matrix 4 x 4 32bits word
-    TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
-    VPSHUFB BSWAP, B0, B0
-    VPSHUFB BSWAP, B1, B1
-    VPSHUFB BSWAP, B2, B2
-    VPSHUFB BSWAP, B3, B3
+    AVX_SM4_4BLOCKS(rk, B4, B5, B6, B7, B0, B1, B2, B3)

     VMOVDQU B0, (16*4)(SP)
     VMOVDQU B1, (16*5)(SP)
@@ -449,36 +449,24 @@ encOctetsLoop:

     // encryption first 4 blocks
     PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
+    PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
     EOR R13, R13
     MOVD rkSave, rk

-encOctetsEnc4Blocks1:
-    SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
-    SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
-    SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
-    SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
+encOctetsEnc8Blocks:
+    SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
+    SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
+    SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
+    SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)

     ADD $1, R13
     CMP $8, R13
-    BNE encOctetsEnc4Blocks1
+    BNE encOctetsEnc8Blocks
     VREV32 B0.B16, B0.B16
     VREV32 B1.B16, B1.B16
     VREV32 B2.B16, B2.B16
     VREV32 B3.B16, B3.B16
     TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
-    // encryption second 4 blocks
-    PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
-    MOVD rkSave, rk
-
-encOctetsEnc4Blocks2:
-    SM4_ROUND(rk, R19, K0, K1, K2, B4, B5, B6, B7)
-    SM4_ROUND(rk, R19, K0, K1, K2, B5, B6, B7, B4)
-    SM4_ROUND(rk, R19, K0, K1, K2, B6, B7, B4, B5)
-    SM4_ROUND(rk, R19, K0, K1, K2, B7, B4, B5, B6)
-
-    ADD $1, R13
-    CMP $16, R13
-    BNE encOctetsEnc4Blocks2
     VREV32 B4.B16, B4.B16
     VREV32 B5.B16, B5.B16
     VREV32 B6.B16, B6.B16
@@ -741,41 +729,28 @@ decOctetsLoop:

     // encryption first 4 blocks
     PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
+    PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
     EOR R13, R13
     MOVD rkSave, rk

-decOctetsEnc4Blocks1:
-    SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
-    SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
-    SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
-    SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
+decOctetsEnc8Blocks:
+    SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
+    SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
+    SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
+    SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)

     ADD $1, R13
     CMP $8, R13
-    BNE decOctetsEnc4Blocks1
+    BNE decOctetsEnc8Blocks
     VREV32 B0.B16, T1.B16
     VREV32 B1.B16, T2.B16
     VREV32 B2.B16, B2.B16
     VREV32 B3.B16, B3.B16
     TRANSPOSE_MATRIX(T1, T2, B2, B3, K0, K1, K2, K3)

-    // encryption second 4 blocks
-    PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
-    MOVD rkSave, rk
-
-decOctetsEnc4Blocks2:
-    SM4_ROUND(rk, R19, K0, K1, K2, B4, B5, B6, B7)
-    SM4_ROUND(rk, R19, K0, K1, K2, B5, B6, B7, B4)
-    SM4_ROUND(rk, R19, K0, K1, K2, B6, B7, B4, B5)
-    SM4_ROUND(rk, R19, K0, K1, K2, B7, B4, B5, B6)
-
-    ADD $1, R13
-    CMP $16, R13
-    BNE decOctetsEnc4Blocks2
     VREV32 B4.B16, B4.B16
     VREV32 B5.B16, B5.B16
     VREV32 B6.B16, B6.B16
     VREV32 B7.B16, B7.B16
     TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)

     VLD1.P 32(srcPtr), [B0.B16, B1.B16]
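The arm64 loops now rotate two register quartets, B0-B3 and B4-B7, through the same round keys, so a single pass over the 32-round key schedule covers eight blocks. The Go sketch below only illustrates that scheduling pattern; roundFn is a placeholder for what SM4_8BLOCKS_ROUND does and is not its real definition.

package main

import "fmt"

// roundFn stands in for one SM4-style round applied to a quartet of
// state words: it combines the word being replaced with the other
// three and a round key, and returns the new word.
type roundFn func(rk, a, b, c, d uint32) uint32

// eightBlockRounds mirrors the loop structure in the diff: every
// iteration consumes four round keys, the roles of the state words
// rotate, and two independent quartets (lo ~ B0-B3, hi ~ B4-B7)
// advance in lockstep so eight blocks share one pass over the keys.
func eightBlockRounds(rks [32]uint32, f roundFn, lo, hi *[4]uint32) {
	for r := 0; r < 32; r += 4 {
		for j := 0; j < 4; j++ {
			lo[j] = f(rks[r+j], lo[j], lo[(j+1)%4], lo[(j+2)%4], lo[(j+3)%4])
			hi[j] = f(rks[r+j], hi[j], hi[(j+1)%4], hi[(j+2)%4], hi[(j+3)%4])
		}
	}
}

func main() {
	toy := func(rk, a, b, c, d uint32) uint32 { return a ^ b ^ c ^ d ^ rk } // toy round, not SM4
	var rks [32]uint32
	lo, hi := [4]uint32{1, 2, 3, 4}, [4]uint32{5, 6, 7, 8}
	eightBlockRounds(rks, toy, &lo, &hi)
	fmt.Println(lo, hi)
}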