diff --git a/cipher/benchmark_test.go b/cipher/benchmark_test.go index 59848e9..334a2b3 100644 --- a/cipher/benchmark_test.go +++ b/cipher/benchmark_test.go @@ -25,6 +25,12 @@ func BenchmarkSM4EBCEncrypt1K(b *testing.B) { benchmarkEBCEncrypt1K(b, c) } +func BenchmarkAES128EBCEncrypt1K(b *testing.B) { + var key [16]byte + c, _ := aes.NewCipher(key[:]) + benchmarkEBCEncrypt1K(b, c) +} + func benchmarkCBCEncrypt1K(b *testing.B, block cipher.Block) { buf := make([]byte, 1024) b.SetBytes(int64(len(buf))) diff --git a/cipher/ecb_sm4_test.go b/cipher/ecb_sm4_test.go index e898a3c..cc29bee 100644 --- a/cipher/ecb_sm4_test.go +++ b/cipher/ecb_sm4_test.go @@ -2,6 +2,8 @@ package cipher_test import ( "bytes" + "crypto/rand" + "io" "testing" "github.com/emmansun/gmsm/cipher" @@ -63,6 +65,11 @@ var ecbSM4Tests = []struct { []byte("0123456789ABCDEF"), []byte("exampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintext"), }, + { + "18 same blocks", + []byte("0123456789ABCDEF"), + []byte("exampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintextexampleplaintext"), + }, } func TestECBBasic(t *testing.T) { @@ -80,11 +87,30 @@ func TestECBBasic(t *testing.T) { decrypter := cipher.NewECBDecrypter(c) decrypter.CryptBlocks(plaintext, ciphertext) if !bytes.Equal(test.in, plaintext) { - t.Errorf("%s: ECB encrypt/decrypt failed", test.name) + t.Errorf("%s: ECB encrypt/decrypt failed, %s", test.name, string(plaintext)) } } } +func TestECBRandom(t *testing.T) { + key := []byte("0123456789ABCDEF") + plaintext := make([]byte, 448) + ciphertext := make([]byte, 448) + io.ReadFull(rand.Reader, plaintext) + c, err := sm4.NewCipher(key) + if err != nil { + t.Fatal(err) + } + encrypter := cipher.NewECBEncrypter(c) + encrypter.CryptBlocks(ciphertext, plaintext) + result := make([]byte, 448) + decrypter := cipher.NewECBDecrypter(c) + decrypter.CryptBlocks(result, ciphertext) + if !bytes.Equal(result, plaintext) { + t.Error("ECB encrypt/decrypt failed") + } +} + func shouldPanic(t *testing.T, f func()) { t.Helper() defer func() { _ = recover() }() diff --git a/sm4/aesni_macros_amd64.s b/sm4/aesni_macros_amd64.s index 1e4d334..eb7fc5b 100644 --- a/sm4/aesni_macros_amd64.s +++ b/sm4/aesni_macros_amd64.s @@ -15,7 +15,7 @@ GLOBL nibble_mask<>(SB), 8, $16 // inverse shift rows DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00 -DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508 +DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508 GLOBL inverse_shift_rows<>(SB), 8, $16 // Affine transform 1 (low and high hibbles) @@ -24,7 +24,7 @@ DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653 GLOBL m1_low<>(SB), 8, $16 DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800 -DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB +DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB GLOBL m1_high<>(SB), 8, $16 // Affine transform 2 (low and high hibbles) @@ -38,21 +38,46 @@ GLOBL m2_high<>(SB), 8, $16 // left rotations of 32-bit words by 8-bit increments DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003 -DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B +DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B GLOBL r08_mask<>(SB), 8, $16 DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302 -DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A +DATA r16_mask<>+0x08(SB)/8, 
$0x0D0C0F0E09080B0A GLOBL r16_mask<>(SB), 8, $16 DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201 -DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09 +DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09 GLOBL r24_mask<>(SB), 8, $16 DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6 DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197 GLOBL fk_mask<>(SB), 8, $16 +// inverse shift rows +DATA inverse_shift_rows256<>+0x00(SB)/8, $0x0B0E0104070A0D00 +DATA inverse_shift_rows256<>+0x08(SB)/8, $0x0306090C0F020508 +DATA inverse_shift_rows256<>+0x10(SB)/8, $0x0B0E0104070A0D00 +DATA inverse_shift_rows256<>+0x18(SB)/8, $0x0306090C0F020508 +GLOBL inverse_shift_rows256<>(SB), 8, $32 + +DATA r08_mask256<>+0x00(SB)/8, $0x0605040702010003 +DATA r08_mask256<>+0x08(SB)/8, $0x0E0D0C0F0A09080B +DATA r08_mask256<>+0x10(SB)/8, $0x0605040702010003 +DATA r08_mask256<>+0x18(SB)/8, $0x0E0D0C0F0A09080B +GLOBL r08_mask256<>(SB), 8, $32 + +DATA r16_mask256<>+0x00(SB)/8, $0x0504070601000302 +DATA r16_mask256<>+0x08(SB)/8, $0x0D0C0F0E09080B0A +DATA r16_mask256<>+0x10(SB)/8, $0x0504070601000302 +DATA r16_mask256<>+0x18(SB)/8, $0x0D0C0F0E09080B0A +GLOBL r16_mask256<>(SB), 8, $32 + +DATA r24_mask256<>+0x00(SB)/8, $0x0407060500030201 +DATA r24_mask256<>+0x08(SB)/8, $0x0C0F0E0D080B0A09 +DATA r24_mask256<>+0x10(SB)/8, $0x0407060500030201 +DATA r24_mask256<>+0x18(SB)/8, $0x0C0F0E0D080B0A09 +GLOBL r24_mask256<>(SB), 8, $32 + // Transpose matrix without PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions, bad performance! // input: from high to low // r0 = [w3, w2, w1, w0] @@ -189,7 +214,7 @@ GLOBL fk_mask<>(SB), 8, $16 // - t2: 128 bits register for data // - t3: 128 bits register for data #define SM4_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \ - PINSRD $0, (index * 4)(RK)(IND*1), x; \ + MOVL (index * 4)(RK)(IND*1), x; \ PSHUFD $0, x, x; \ PXOR t1, x; \ PXOR t2, x; \ @@ -197,29 +222,128 @@ GLOBL fk_mask<>(SB), 8, $16 SM4_TAO_L1(x, y, z); \ PXOR x, t0 +#define SM4_ONE_ROUND_SSE(x, y, z, t0, t1, t2, t3) \ + PXOR t1, x; \ + PXOR t2, x; \ + PXOR t3, x; \ + SM4_TAO_L1(x, y, z); \ + PXOR x, t0 \ + +#define SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3) \ + PSHUFD $0, rk128, x; \ + SM4_ONE_ROUND_SSE(x, y, z, t0, t1, t2, t3); \ + PSHUFD $0x55, rk128, x; \ + SM4_ONE_ROUND_SSE(x, y, z, t1, t2, t3, t0); \ + PSHUFD $0xAA, rk128, x; \ + SM4_ONE_ROUND_SSE(x, y, z, t2, t3, t0, t1); \ + PSHUFD $0xFF, rk128, x; \ + SM4_ONE_ROUND_SSE(x, y, z, t3, t0, t1, t2); \ + +#define SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \ + PSHUFB flip_mask<>(SB), t0; \ + PSHUFB flip_mask<>(SB), t1; \ + PSHUFB flip_mask<>(SB), t2; \ + PSHUFB flip_mask<>(SB), t3; \ + SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \ + MOVOU (0*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \ + MOVOU (1*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \ + MOVOU (2*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \ + MOVOU (3*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \ + MOVOU (4*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \ + MOVOU (5*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \ + MOVOU (6*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \ + MOVOU (7*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \ + SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \ + PSHUFB bswap_mask<>(SB), t3; \ + PSHUFB bswap_mask<>(SB), t2; \ + PSHUFB bswap_mask<>(SB), t1; \ + PSHUFB bswap_mask<>(SB), t0 + 
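The SM4_4BLOCKS / SM4_4BLOCKS_4ROUNDS macros above replace the old per-round MOVL/PINSRD key fetch with a single MOVOU of four consecutive round keys, each of which is then broadcast to every lane with PSHUFD $0/$0x55/$0xAA/$0xFF. The following plain-Go sketch illustrates only the data flow those macros vectorize, not the library's actual code path: the state is kept column-major after the transpose (word i of all four blocks in one "register"), and one 128-bit key load drives four rounds. `sm4Sbox` is a hypothetical placeholder for the standard SM4 S-box table (the assembly computes the S-box via AES-NI affine transforms instead of a table), and the package/function names are illustrative only.

```go
package sm4sketch

import "math/bits"

// sm4Sbox stands in for the standard SM4 S-box lookup; the real table is
// omitted here. In the assembly it is realized with AESENCLAST plus affine
// transforms (m1_low/m1_high, m2_low/m2_high).
func sm4Sbox(b byte) byte { return b } // placeholder only

// tau applies the S-box to each byte of a 32-bit word.
func tau(x uint32) uint32 {
	return uint32(sm4Sbox(byte(x>>24)))<<24 |
		uint32(sm4Sbox(byte(x>>16)))<<16 |
		uint32(sm4Sbox(byte(x>>8)))<<8 |
		uint32(sm4Sbox(byte(x)))
}

// l is the linear transform L(B) = B ^ (B<<<2) ^ (B<<<10) ^ (B<<<18) ^ (B<<<24),
// which corresponds to the r08/r16/r24 byte-shuffle masks plus the
// VPSLLD $2 / VPSRLD $30 pair in SM4_TAO_L1.
func l(b uint32) uint32 {
	return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
		bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
}

// fourRoundsFourBlocks mirrors SM4_4BLOCKS_4ROUNDS: state[i][lane] holds word i
// of block `lane` (the layout produced by the matrix transpose), and rk holds
// the four consecutive round keys fetched by one MOVOU. Each key is applied to
// all four lanes, just as PSHUFD broadcasts it across the XMM register.
func fourRoundsFourBlocks(state *[4][4]uint32, rk [4]uint32) {
	for r := 0; r < 4; r++ {
		for lane := 0; lane < 4; lane++ {
			x := state[(r+1)%4][lane] ^ state[(r+2)%4][lane] ^ state[(r+3)%4][lane] ^ rk[r]
			state[r%4][lane] ^= l(tau(x)) // t0 ^= tao_l1(t1^t2^t3^xk)
		}
	}
}
```

Calling this helper eight times, with rk taken from offsets (0*16) through (7*16) of the expanded key, walks through all 32 rounds in the same order that SM4_4BLOCKS unrolls them.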
+#define SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \ + PSHUFD $0, rk128, x; \ + SM4_ONE_ROUND_SSE(x, y, z, t0, t1, t2, t3); \ + PSHUFD $0, rk128, x; \ + SM4_ONE_ROUND_SSE(x, y, z, t4, t5, t6, t7); \ + PSHUFD $0x55, rk128, x; \ + SM4_ONE_ROUND_SSE(x, y, z, t1, t2, t3, t0); \ + PSHUFD $0x55, rk128, x; \ + SM4_ONE_ROUND_SSE(x, y, z, t5, t6, t7, t4); \ + PSHUFD $0xAA, rk128, x; \ + SM4_ONE_ROUND_SSE(x, y, z, t2, t3, t0, t1); \ + PSHUFD $0xAA, rk128, x; \ + SM4_ONE_ROUND_SSE(x, y, z, t6, t7, t4, t5); \ + PSHUFD $0xFF, rk128, x; \ + SM4_ONE_ROUND_SSE(x, y, z, t3, t0, t1, t2); \ + PSHUFD $0xFF, rk128, x; \ + SM4_ONE_ROUND_SSE(x, y, z, t7, t4, t5, t6); \ + +#define SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \ + PSHUFB flip_mask<>(SB), t0; \ + PSHUFB flip_mask<>(SB), t1; \ + PSHUFB flip_mask<>(SB), t2; \ + PSHUFB flip_mask<>(SB), t3; \ + PSHUFB flip_mask<>(SB), t4; \ + PSHUFB flip_mask<>(SB), t5; \ + PSHUFB flip_mask<>(SB), t6; \ + PSHUFB flip_mask<>(SB), t7; \ + SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \ + SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \ + MOVOU (0*16)(RK), rk128; \ + SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \ + MOVOU (1*16)(RK), rk128; \ + SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \ + MOVOU (2*16)(RK), rk128; \ + SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \ + MOVOU (3*16)(RK), rk128; \ + SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \ + MOVOU (4*16)(RK), rk128; \ + SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \ + MOVOU (5*16)(RK), rk128; \ + SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \ + MOVOU (6*16)(RK), rk128; \ + SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \ + MOVOU (7*16)(RK), rk128; \ + SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \ + SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \ + SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \ + PSHUFB bswap_mask<>(SB), t3; \ + PSHUFB bswap_mask<>(SB), t2; \ + PSHUFB bswap_mask<>(SB), t1; \ + PSHUFB bswap_mask<>(SB), t0; \ + PSHUFB bswap_mask<>(SB), t7; \ + PSHUFB bswap_mask<>(SB), t6; \ + PSHUFB bswap_mask<>(SB), t5; \ + PSHUFB bswap_mask<>(SB), t4 + // SM4 sbox function, AVX version // parameters: // - x: 128 bits register as sbox input/output data // - y: 128 bits temp register -// - X_NIBBLE_MASK: 128 bits register stored nibble mask, should be loaded earlier. 
// - tmp: 128 bits temp register -#define AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp) \ - VPAND X_NIBBLE_MASK, x, tmp; \ +#define AVX_SM4_SBOX(x, y, tmp) \ + VPAND nibble_mask<>(SB), x, tmp; \ VMOVDQU m1_low<>(SB), y; \ VPSHUFB tmp, y, y; \ VPSRLQ $4, x, x; \ - VPAND X_NIBBLE_MASK, x, x; \ + VPAND nibble_mask<>(SB), x, x; \ VMOVDQU m1_high<>(SB), tmp; \ VPSHUFB x, tmp, x; \ VPXOR y, x, x; \ - VMOVDQU inverse_shift_rows<>(SB), tmp; \ - VPSHUFB tmp, x, x; \ - VAESENCLAST X_NIBBLE_MASK, x, x; \ - VPANDN X_NIBBLE_MASK, x, tmp; \ + VPSHUFB inverse_shift_rows<>(SB), x, x; \ + VAESENCLAST nibble_mask<>(SB), x, x; \ + VPANDN nibble_mask<>(SB), x, tmp; \ VMOVDQU m2_low<>(SB), y; \ VPSHUFB tmp, y, y; \ VPSRLQ $4, x, x; \ - VPAND X_NIBBLE_MASK, x, x; \ + VPAND nibble_mask<>(SB), x, x; \ VMOVDQU m2_high<>(SB), tmp; \ VPSHUFB x, tmp, x; \ VPXOR y, x, x @@ -228,21 +352,17 @@ GLOBL fk_mask<>(SB), 8, $16 // parameters: // - x: 128 bits register as sbox input/output data // - y: 128 bits temp register -// - xNibbleMask: 128 bits register stored nibble mask, should be loaded earlier. // - tmp: 128 bits temp register -#define AVX_SM4_TAO_L1(x, y, xNibbleMask, tmp) \ - AVX_SM4_SBOX(x, y, xNibbleMask, tmp); \ - VMOVDQU r08_mask<>(SB), tmp; \ - VPSHUFB tmp, x, y; \ +#define AVX_SM4_TAO_L1(x, y, tmp) \ + AVX_SM4_SBOX(x, y, tmp); \ + VPSHUFB r08_mask<>(SB), x, y; \ VPXOR x, y, y; \ - VMOVDQU r16_mask<>(SB), tmp; \ - VPSHUFB tmp, x, tmp; \ + VPSHUFB r16_mask<>(SB), x, tmp; \ VPXOR tmp, y, y; \ VPSLLD $2, y, tmp; \ VPSRLD $30, y, y; \ VPXOR tmp, y, y; \ - VMOVDQU r24_mask<>(SB), tmp; \ - VPSHUFB tmp, x, tmp; \ + VPSHUFB r24_mask<>(SB), x, tmp; \ VPXOR y, x, x; \ VPXOR x, tmp, x @@ -280,9 +400,115 @@ GLOBL fk_mask<>(SB), 8, $16 VPXOR t1, x, x; \ VPXOR t2, x, x; \ VPXOR t3, x, x; \ - AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp); \ + AVX_SM4_TAO_L1(x, y, tmp); \ VPXOR x, t0, t0 + +#define SM4_ONE_ROUND_AVX(x, y, z, t0, t1, t2, t3) \ + VPXOR t1, x, x; \ + VPXOR t2, x, x; \ + VPXOR t3, x, x; \ + AVX_SM4_TAO_L1(x, y, z); \ + VPXOR x, t0, t0 \ + +#define SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3) \ + VPSHUFD $0, rk128, x; \ + SM4_ONE_ROUND_AVX(x, y, z, t0, t1, t2, t3); \ + VPSHUFD $0x55, rk128, x; \ + SM4_ONE_ROUND_AVX(x, y, z, t1, t2, t3, t0); \ + VPSHUFD $0xAA, rk128, x; \ + SM4_ONE_ROUND_AVX(x, y, z, t2, t3, t0, t1); \ + VPSHUFD $0xFF, rk128, x; \ + SM4_ONE_ROUND_AVX(x, y, z, t3, t0, t1, t2); \ + +#define AVX_SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \ + VPSHUFB flip_mask<>(SB), t0, t0 \ + VPSHUFB flip_mask<>(SB), t1, t1 \ + VPSHUFB flip_mask<>(SB), t2, t2 \ + VPSHUFB flip_mask<>(SB), t3, t3 \ + ; \ + TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \ + VMOVDQU (0*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \ + VMOVDQU (1*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \ + VMOVDQU (2*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \ + VMOVDQU (3*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \ + VMOVDQU (4*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \ + VMOVDQU (5*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \ + VMOVDQU (6*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \ + VMOVDQU (7*16)(RK), rk128; \ + SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \ + ; \ // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \ + VPSHUFB bswap_mask<>(SB), t0, t0 \ + VPSHUFB 
bswap_mask<>(SB), t1, t1 \ + VPSHUFB bswap_mask<>(SB), t2, t2 \ + VPSHUFB bswap_mask<>(SB), t3, t3 \ + +#define SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \ + VPSHUFD $0, rk128, x; \ + SM4_ONE_ROUND_AVX(x, y, z, t0, t1, t2, t3); \ + VPSHUFD $0, rk128, x; \ + SM4_ONE_ROUND_AVX(x, y, z, t4, t5, t6, t7); \ + VPSHUFD $0x55, rk128, x; \ + SM4_ONE_ROUND_AVX(x, y, z, t1, t2, t3, t0); \ + VPSHUFD $0x55, rk128, x; \ + SM4_ONE_ROUND_AVX(x, y, z, t5, t6, t7, t4); \ + VPSHUFD $0xAA, rk128, x; \ + SM4_ONE_ROUND_AVX(x, y, z, t2, t3, t0, t1); \ + VPSHUFD $0xAA, rk128, x; \ + SM4_ONE_ROUND_AVX(x, y, z, t6, t7, t4, t5); \ + VPSHUFD $0xFF, rk128, x; \ + SM4_ONE_ROUND_AVX(x, y, z, t3, t0, t1, t2); \ + VPSHUFD $0xFF, rk128, x; \ + SM4_ONE_ROUND_AVX(x, y, z, t7, t4, t5, t6); \ + +#define AVX_SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \ + VPSHUFB flip_mask<>(SB), t0, t0 \ + VPSHUFB flip_mask<>(SB), t1, t1 \ + VPSHUFB flip_mask<>(SB), t2, t2 \ + VPSHUFB flip_mask<>(SB), t3, t3 \ + VPSHUFB flip_mask<>(SB), t4, t4 \ + VPSHUFB flip_mask<>(SB), t5, t5 \ + VPSHUFB flip_mask<>(SB), t6, t6 \ + VPSHUFB flip_mask<>(SB), t7, t7 \ + ; \ + TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \ + TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \ + VMOVDQU (0*16)(RK), rk128; \ + SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \ + VMOVDQU (1*16)(RK), rk128; \ + SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \ + VMOVDQU (2*16)(RK), rk128; \ + SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \ + VMOVDQU (3*16)(RK), rk128; \ + SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \ + VMOVDQU (4*16)(RK), rk128; \ + SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \ + VMOVDQU (5*16)(RK), rk128; \ + SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \ + VMOVDQU (6*16)(RK), rk128; \ + SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \ + VMOVDQU (7*16)(RK), rk128; \ + SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \ + ; \ // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \ + TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \ + VPSHUFB bswap_mask<>(SB), t0, t0 \ + VPSHUFB bswap_mask<>(SB), t1, t1 \ + VPSHUFB bswap_mask<>(SB), t2, t2 \ + VPSHUFB bswap_mask<>(SB), t3, t3 \ + VPSHUFB bswap_mask<>(SB), t4, t4 \ + VPSHUFB bswap_mask<>(SB), t5, t5 \ + VPSHUFB bswap_mask<>(SB), t6, t6 \ + VPSHUFB bswap_mask<>(SB), t7, t7 \ + // SM4 sbox function, AVX2 version // parameters: // - x: 256 bits register as sbox input/output data @@ -301,8 +527,7 @@ GLOBL fk_mask<>(SB), 8, $16 VBROADCASTI128 m1_high<>(SB), z; \ VPSHUFB x, z, x; \ VPXOR y, x, x; \ - VBROADCASTI128 inverse_shift_rows<>(SB), z; \ - VPSHUFB z, x, x; \ + VPSHUFB inverse_shift_rows256<>(SB), x, x; \ VEXTRACTI128 $1, x, yw \ VAESENCLAST xNibbleMask, xw, xw; \ VAESENCLAST xNibbleMask, yw, yw; \ @@ -327,17 +552,14 @@ GLOBL fk_mask<>(SB), 8, $16 // - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier. 
#define AVX2_SM4_TAO_L1(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \ AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask); \ - VBROADCASTI128 r08_mask<>(SB), z; \ - VPSHUFB z, x, y; \ + VPSHUFB r08_mask256<>(SB), x, y; \ VPXOR x, y, y; \ - VBROADCASTI128 r16_mask<>(SB), z; \ - VPSHUFB z, x, z; \ + VPSHUFB r16_mask256<>(SB), x, z; \ VPXOR z, y, y; \ VPSLLD $2, y, z; \ VPSRLD $30, y, y; \ VPXOR z, y, y; \ - VBROADCASTI128 r24_mask<>(SB), z; \ - VPSHUFB z, x, z; \ + VPSHUFB r24_mask256<>(SB), x, z; \ VPXOR y, x, x; \ VPXOR x, z, x @@ -359,6 +581,24 @@ GLOBL fk_mask<>(SB), 8, $16 AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \ VPXOR x, t0, t0 +// SM4 round function, AVX2 version, handle 256 bits +// t0 ^= tao_l1(t1^t2^t3^xk) +// parameters: +// - index: round key index immediate number +// - x: 256 bits temp register, MUST use XDWORD! +// - y: 256 bits temp register, MUST use YDWORD! +// - t0: 256 bits register for data as result +// - t1: 256 bits register for data +// - t2: 256 bits register for data +// - t3: 256 bits register for data +#define AVX2_SM4_ROUND2(index, RK, x, y, xw, yw, tmp, t0, t1, t2, t3) \ + VPBROADCASTD (index * 4)(RK), x; \ + VPXOR t1, x, x; \ + VPXOR t2, x, x; \ + VPXOR t3, x, x; \ + AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \ + VPXOR x, t0, t0 + // SM4 round function, AVX version, handle 128 bits // t0 ^= tao_l1(t1^t2^t3^xk) // parameters: @@ -371,9 +611,100 @@ GLOBL fk_mask<>(SB), 8, $16 // - t3: 128 bits register for data #define AVX2_SM4_ROUND_4BLOCKS(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \ VPBROADCASTD (index * 4)(RK)(IND*1), x; \ - VPSHUFD $0, x, x; \ VPXOR t1, x, x; \ VPXOR t2, x, x; \ VPXOR t3, x, x; \ - AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp); \ + AVX_SM4_TAO_L1(x, y, tmp); \ VPXOR x, t0, t0 + +#define AVX2_SM4_8BLOCKS(RK, x, y, xw, yw, tmp, t0, t1, t2, t3) \ + AVX2_SM4_ROUND2(0, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \ + AVX2_SM4_ROUND2(1, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \ + AVX2_SM4_ROUND2(2, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \ + AVX2_SM4_ROUND2(3, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \ + AVX2_SM4_ROUND2(4, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \ + AVX2_SM4_ROUND2(5, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \ + AVX2_SM4_ROUND2(6, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \ + AVX2_SM4_ROUND2(7, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \ + AVX2_SM4_ROUND2(8, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \ + AVX2_SM4_ROUND2(9, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \ + AVX2_SM4_ROUND2(10, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \ + AVX2_SM4_ROUND2(11, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \ + AVX2_SM4_ROUND2(12, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \ + AVX2_SM4_ROUND2(13, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \ + AVX2_SM4_ROUND2(14, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \ + AVX2_SM4_ROUND2(15, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \ + AVX2_SM4_ROUND2(16, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \ + AVX2_SM4_ROUND2(17, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \ + AVX2_SM4_ROUND2(18, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \ + AVX2_SM4_ROUND2(19, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \ + AVX2_SM4_ROUND2(20, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \ + AVX2_SM4_ROUND2(21, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \ + AVX2_SM4_ROUND2(22, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \ + AVX2_SM4_ROUND2(23, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \ + AVX2_SM4_ROUND2(24, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \ + AVX2_SM4_ROUND2(25, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \ + AVX2_SM4_ROUND2(26, RK, x, y, 
xw, yw, tmp, t2, t3, t0, t1); \ + AVX2_SM4_ROUND2(27, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \ + AVX2_SM4_ROUND2(28, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \ + AVX2_SM4_ROUND2(29, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \ + AVX2_SM4_ROUND2(30, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \ + AVX2_SM4_ROUND2(31, RK, x, y, xw, yw, tmp, t3, t0, t1, t2) + +// SM4 round function, AVX2 version, handle 256 bits +// t0 ^= tao_l1(t1^t2^t3^xk) +// parameters: +// - index: round key index immediate number +// - x: 256 bits temp register, MUST use XDWORD! +// - y: 256 bits temp register, MUST use YDWORD! +// - t0: 256 bits register for data as result +// - t1: 256 bits register for data +// - t2: 256 bits register for data +// - t3: 256 bits register for data +#define AVX2_SM4_16BLOCKS_ROUND(index, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7) \ + VPBROADCASTD (index * 4)(RK), tmp1; \ + VPXOR t1, tmp1, x; \ + VPXOR t2, x, x; \ + VPXOR t3, x, x; \ + AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \ + VPXOR x, t0, t0; \ + ;\ + VPXOR t5, tmp1, x; \ + VPXOR t6, x, x; \ + VPXOR t7, x, x; \ + AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \ + VPXOR x, t4, t4; \ + +#define AVX2_SM4_16BLOCKS(RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7) \ + AVX2_SM4_16BLOCKS_ROUND(0, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \ + AVX2_SM4_16BLOCKS_ROUND(1, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \ + AVX2_SM4_16BLOCKS_ROUND(2, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \ + AVX2_SM4_16BLOCKS_ROUND(3, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \ + AVX2_SM4_16BLOCKS_ROUND(4, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \ + AVX2_SM4_16BLOCKS_ROUND(5, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \ + AVX2_SM4_16BLOCKS_ROUND(6, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \ + AVX2_SM4_16BLOCKS_ROUND(7, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \ + AVX2_SM4_16BLOCKS_ROUND(8, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \ + AVX2_SM4_16BLOCKS_ROUND(9, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \ + AVX2_SM4_16BLOCKS_ROUND(10, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \ + AVX2_SM4_16BLOCKS_ROUND(11, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \ + AVX2_SM4_16BLOCKS_ROUND(12, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \ + AVX2_SM4_16BLOCKS_ROUND(13, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \ + AVX2_SM4_16BLOCKS_ROUND(14, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \ + AVX2_SM4_16BLOCKS_ROUND(15, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \ + AVX2_SM4_16BLOCKS_ROUND(16, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \ + AVX2_SM4_16BLOCKS_ROUND(17, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \ + AVX2_SM4_16BLOCKS_ROUND(18, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \ + AVX2_SM4_16BLOCKS_ROUND(19, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \ + AVX2_SM4_16BLOCKS_ROUND(20, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \ + AVX2_SM4_16BLOCKS_ROUND(21, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \ + AVX2_SM4_16BLOCKS_ROUND(22, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \ + AVX2_SM4_16BLOCKS_ROUND(23, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \ + 
AVX2_SM4_16BLOCKS_ROUND(24, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \ + AVX2_SM4_16BLOCKS_ROUND(25, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \ + AVX2_SM4_16BLOCKS_ROUND(26, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \ + AVX2_SM4_16BLOCKS_ROUND(27, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \ + AVX2_SM4_16BLOCKS_ROUND(28, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \ + AVX2_SM4_16BLOCKS_ROUND(29, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \ + AVX2_SM4_16BLOCKS_ROUND(30, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \ + AVX2_SM4_16BLOCKS_ROUND(31, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6) diff --git a/sm4/aesni_macros_arm64.s b/sm4/aesni_macros_arm64.s index caa9ca1..1a9fccb 100644 --- a/sm4/aesni_macros_arm64.s +++ b/sm4/aesni_macros_arm64.s @@ -171,3 +171,30 @@ GLOBL fk_mask<>(SB), (16+8), $16 VEOR t3.B16, x.B16, x.B16; \ SM4_TAO_L1(x, y, z); \ VEOR x.B16, t0.B16, t0.B16 + +// SM4 round function +// t0 ^= tao_l1(t1^t2^t3^xk) +// parameters: +// - RK: round key register +// - tmp32: temp 32/64 bits register +// - x: 128 bits temp register +// - y: 128 bits temp register +// - z: 128 bits temp register +// - t0: 128 bits register for data as result +// - t1: 128 bits register for data +// - t2: 128 bits register for data +// - t3: 128 bits register for data +#define SM4_8BLOCKS_ROUND(RK, tmp32, x, y, z, tmp, t0, t1, t2, t3, t4, t5, t6, t7) \ + MOVW.P 4(RK), tmp32; \ + VMOV tmp32, tmp.S4; \ + VEOR t1.B16, tmp.B16, x.B16; \ + VEOR t2.B16, x.B16, x.B16; \ + VEOR t3.B16, x.B16, x.B16; \ + SM4_TAO_L1(x, y, z); \ + VEOR x.B16, t0.B16, t0.B16; \ + ; \ + VEOR t1.B16, tmp.B16, x.B16; \ + VEOR t2.B16, x.B16, x.B16; \ + VEOR t3.B16, x.B16, x.B16; \ + SM4_TAO_L1(x, y, z); \ + VEOR x.B16, t0.B16, t0.B16 diff --git a/sm4/asm_amd64.s b/sm4/asm_amd64.s index c17630e..f0d0909 100644 --- a/sm4/asm_amd64.s +++ b/sm4/asm_amd64.s @@ -4,15 +4,15 @@ #include "textflag.h" -#define x X0 -#define y X1 -#define t0 X2 -#define t1 X3 -#define t2 X4 -#define t3 X5 +#define t0 X0 +#define t1 X1 +#define t2 X2 +#define t3 X3 -#define XTMP6 X6 -#define XTMP7 X7 +#define x X8 +#define y X9 +#define XTMP6 X10 +#define XTMP7 X11 #include "aesni_macros_amd64.s" @@ -48,7 +48,7 @@ // - t2: 128 bits register for data // - t3: 128 bits register for data #define SM4_EXPANDKEY_ROUND(index, x, y, t0, t1, t2, t3) \ - PINSRD $0, (index * 4)(BX)(CX*1), x; \ + MOVL (index * 4)(BX)(CX*1), x; \ PXOR t1, x; \ PXOR t2, x; \ PXOR t3, x; \ @@ -68,6 +68,16 @@ #define XWORD2 X6 #define XWORD3 X7 +#define XDWORD4 Y10 +#define XDWORD5 Y11 +#define XDWORD6 Y12 +#define XDWORD7 Y14 + +#define XWORD4 X10 +#define XWORD5 X11 +#define XWORD6 X12 +#define XWORD7 X14 + #define XDWTMP0 Y0 #define XDWTMP1 Y1 #define XDWTMP2 Y2 @@ -133,91 +143,93 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0 JE avx non_avx2_start: - MOVOU 0(DX), t0 - MOVOU 16(DX), t1 - MOVOU 32(DX), t2 - MOVOU 48(DX), t3 - PSHUFB flip_mask<>(SB), t0 - PSHUFB flip_mask<>(SB), t1 - PSHUFB flip_mask<>(SB), t2 - PSHUFB flip_mask<>(SB), t3 - SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) + CMPQ DI, $128 + JEQ sse_8blocks - XORL CX, CX + MOVOU 0(DX), XWORD0 + MOVOU 16(DX), XWORD1 + MOVOU 32(DX), XWORD2 + MOVOU 48(DX), XWORD3 -loop: - SM4_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3) - SM4_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0) - SM4_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1) - SM4_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2) - - ADDL $16, CX - 
CMPL CX, $4*32 - JB loop - - SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); - PSHUFB bswap_mask<>(SB), t3 - PSHUFB bswap_mask<>(SB), t2 - PSHUFB bswap_mask<>(SB), t1 - PSHUFB bswap_mask<>(SB), t0 + SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) - MOVOU t0, 0(BX) - MOVOU t1, 16(BX) - MOVOU t2, 32(BX) - MOVOU t3, 48(BX) + MOVOU XWORD0, 0(BX) + MOVOU XWORD1, 16(BX) + MOVOU XWORD2, 32(BX) + MOVOU XWORD3, 48(BX) + RET + +sse_8blocks: + MOVOU 0(DX), XWORD0 + MOVOU 16(DX), XWORD1 + MOVOU 32(DX), XWORD2 + MOVOU 48(DX), XWORD3 + MOVOU 64(DX), XWORD4 + MOVOU 80(DX), XWORD5 + MOVOU 96(DX), XWORD6 + MOVOU 112(DX), XWORD7 + + SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7) + + MOVOU XWORD0, 0(BX) + MOVOU XWORD1, 16(BX) + MOVOU XWORD2, 32(BX) + MOVOU XWORD3, 48(BX) + MOVOU XWORD4, 64(BX) + MOVOU XWORD5, 80(BX) + MOVOU XWORD6, 96(BX) + MOVOU XWORD7, 112(BX) done_sm4: RET avx: + CMPQ DI, $128 + JEQ avx_8blocks + VMOVDQU 0(DX), XWORD0 VMOVDQU 16(DX), XWORD1 VMOVDQU 32(DX), XWORD2 VMOVDQU 48(DX), XWORD3 - VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK - VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK - - VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0 - VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1 - VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2 - VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3 - - // Transpose matrix 4 x 4 32bits word - TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2) - - XORL CX, CX - -avx_loop: - AVX_SM4_ROUND(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3) - AVX_SM4_ROUND(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0) - AVX_SM4_ROUND(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1) - AVX_SM4_ROUND(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2) - - ADDL $16, CX - CMPL CX, $4*32 - JB avx_loop - - // Transpose matrix 4 x 4 32bits word - TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2) - - VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK - VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0 - VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1 - VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2 - VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3 + AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) VMOVDQU XWORD0, 0(BX) VMOVDQU XWORD1, 16(BX) VMOVDQU XWORD2, 32(BX) VMOVDQU XWORD3, 48(BX) + + RET +avx_8blocks: + VMOVDQU 0(DX), XWORD0 + VMOVDQU 16(DX), XWORD1 + VMOVDQU 32(DX), XWORD2 + VMOVDQU 48(DX), XWORD3 + VMOVDQU 64(DX), XWORD4 + VMOVDQU 80(DX), XWORD5 + VMOVDQU 96(DX), XWORD6 + VMOVDQU 112(DX), XWORD7 + + AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7) + + VMOVDQU XWORD0, 0(BX) + VMOVDQU XWORD1, 16(BX) + VMOVDQU XWORD2, 32(BX) + VMOVDQU XWORD3, 48(BX) + VMOVDQU XWORD4, 64(BX) + VMOVDQU XWORD5, 80(BX) + VMOVDQU XWORD6, 96(BX) + VMOVDQU XWORD7, 112(BX) + +avx_done_sm4: RET avx2: VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK - CMPQ DI, $64 - JBE avx2_4blocks + + CMPQ DI, $256 + JEQ avx2_16blocks avx2_8blocks: VMOVDQU 0(DX), XDWORD0 @@ -235,17 +247,7 @@ avx2_8blocks: // Transpose matrix 4 x 4 32bits word TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2) - XORL CX, CX - -avx2_loop: - AVX2_SM4_ROUND(0, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - AVX2_SM4_ROUND(1, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD1, XDWORD2, XDWORD3, XDWORD0) - AVX2_SM4_ROUND(2, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD2, XDWORD3, 
XDWORD0, XDWORD1) - AVX2_SM4_ROUND(3, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD3, XDWORD0, XDWORD1, XDWORD2) - - ADDL $16, CX - CMPL CX, $4*32 - JB avx2_loop + AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3) // Transpose matrix 4 x 4 32bits word TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2) @@ -260,49 +262,60 @@ avx2_loop: VMOVDQU XDWORD1, 32(BX) VMOVDQU XDWORD2, 64(BX) VMOVDQU XDWORD3, 96(BX) - JMP avx2_sm4_done -avx2_4blocks: - VMOVDQU 0(DX), XWORD0 - VMOVDQU 16(DX), XWORD1 - VMOVDQU 32(DX), XWORD2 - VMOVDQU 48(DX), XWORD3 + VZEROUPPER + RET - VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK +avx2_16blocks: + VMOVDQU 0(DX), XDWORD0 + VMOVDQU 32(DX), XDWORD1 + VMOVDQU 64(DX), XDWORD2 + VMOVDQU 96(DX), XDWORD3 + VMOVDQU 128(DX), XDWORD4 + VMOVDQU 160(DX), XDWORD5 + VMOVDQU 192(DX), XDWORD6 + VMOVDQU 224(DX), XDWORD7 - VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0 - VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1 - VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2 - VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3 + VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK + + // Apply Byte Flip Mask: LE -> BE + VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0 + VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1 + VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2 + VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3 + VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4 + VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5 + VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6 + VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7 // Transpose matrix 4 x 4 32bits word - TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2) + TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2) + TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2) - XORL CX, CX - -avx2_4blocks_loop: - AVX2_SM4_ROUND_4BLOCKS(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3) - AVX2_SM4_ROUND_4BLOCKS(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0) - AVX2_SM4_ROUND_4BLOCKS(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1) - AVX2_SM4_ROUND_4BLOCKS(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2) - - ADDL $16, CX - CMPL CX, $4*32 - JB avx2_4blocks_loop + AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7) // Transpose matrix 4 x 4 32bits word - TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2) + TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2) + TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2) - VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK - VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0 - VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1 - VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2 - VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3 - - VMOVDQU XWORD0, 0(BX) - VMOVDQU XWORD1, 16(BX) - VMOVDQU XWORD2, 32(BX) - VMOVDQU XWORD3, 48(BX) + VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK + VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0 + VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1 + VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2 + VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3 + VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4 + VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5 + VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6 + VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7 + + VMOVDQU XDWORD0, 0(BX) + VMOVDQU XDWORD1, 32(BX) + VMOVDQU XDWORD2, 64(BX) + VMOVDQU XDWORD3, 96(BX) + VMOVDQU XDWORD4, 128(BX) + VMOVDQU XDWORD5, 160(BX) + VMOVDQU XDWORD6, 192(BX) + VMOVDQU XDWORD7, 224(BX) avx2_sm4_done: 
VZEROUPPER diff --git a/sm4/asm_arm64.s b/sm4/asm_arm64.s index abf35f6..03bdac0 100644 --- a/sm4/asm_arm64.s +++ b/sm4/asm_arm64.s @@ -9,6 +9,10 @@ #define t1 V3 #define t2 V4 #define t3 V5 +#define t4 V8 +#define t5 V9 +#define t6 V10 +#define t7 V11 #define ZERO V16 #define NIBBLE_MASK V20 #define INVERSE_SHIFT_ROWS V21 @@ -184,6 +188,9 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0 CMP $1, R11 BEQ sm4niblocks + CMP $128, R12 + BEQ double_enc + VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4] VREV32 t0.B16, t0.B16 VREV32 t1.B16, t1.B16 @@ -215,6 +222,51 @@ encryptBlocksLoop: VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R9) RET +double_enc: + VLD1.P 64(R10), [t0.S4, t1.S4, t2.S4, t3.S4] + VLD1.P 64(R10), [t4.S4, t5.S4, t6.S4, t7.S4] + VREV32 t0.B16, t0.B16 + VREV32 t1.B16, t1.B16 + VREV32 t2.B16, t2.B16 + VREV32 t3.B16, t3.B16 + VREV32 t4.B16, t4.B16 + VREV32 t5.B16, t5.B16 + VREV32 t6.B16, t6.B16 + VREV32 t7.B16, t7.B16 + PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) + PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7) + + load_global_data_2() + + VEOR ZERO.B16, ZERO.B16, ZERO.B16 + EOR R0, R0 + +encrypt8BlocksLoop: + SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7) + SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4) + SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5) + SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6) + + ADD $16, R0 + CMP $128, R0 + BNE encrypt8BlocksLoop + + TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) + TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7) + VREV32 t0.B16, t0.B16 + VREV32 t1.B16, t1.B16 + VREV32 t2.B16, t2.B16 + VREV32 t3.B16, t3.B16 + VREV32 t4.B16, t4.B16 + VREV32 t5.B16, t5.B16 + VREV32 t6.B16, t6.B16 + VREV32 t7.B16, t7.B16 + + VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(R9) + VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(R9) + + RET + sm4niblocks: VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4] VLD1.P 64(R8), [V4.S4, V5.S4, V6.S4, V7.S4] diff --git a/sm4/cbc_cipher_asm.go b/sm4/cbc_cipher_asm.go index b539a4c..9d9f7c8 100644 --- a/sm4/cbc_cipher_asm.go +++ b/sm4/cbc_cipher_asm.go @@ -74,11 +74,18 @@ func (x *cbc) CryptBlocks(dst, src []byte) { // Copy the last block of ciphertext in preparation as the new iv. 
copy(x.tmp, src[end-BlockSize:end]) - start := end - x.b.blocksSize - var temp []byte = make([]byte, x.b.blocksSize) - var batchSrc []byte = make([]byte, x.b.blocksSize+BlockSize) + decKeyPtr := &x.b.dec[0] + + start := end - 2*x.b.blocksSize + for start > 0 { + decryptBlocksChain(decKeyPtr, dst[start:end], src[start:end], &src[start-BlockSize]) + end = start + start -= 2*x.b.blocksSize + } + + start = end - x.b.blocksSize for start > 0 { decryptBlocksChain(decKeyPtr, dst[start:end], src[start:end], &src[start-BlockSize]) end = start @@ -86,6 +93,8 @@ func (x *cbc) CryptBlocks(dst, src []byte) { } // Handle remain first blocks + var temp []byte = make([]byte, x.b.blocksSize) + var batchSrc []byte = make([]byte, x.b.blocksSize+BlockSize) copy(batchSrc, x.iv) copy(batchSrc[BlockSize:], src[:end]) decryptBlocksChain(decKeyPtr, temp, batchSrc[BlockSize:], &batchSrc[0]) diff --git a/sm4/cbc_cipher_asm_amd64.s b/sm4/cbc_cipher_asm_amd64.s index c1aa62e..3357c3b 100644 --- a/sm4/cbc_cipher_asm_amd64.s +++ b/sm4/cbc_cipher_asm_amd64.s @@ -85,6 +85,11 @@ done_sm4: #define XDWORD2 Y6 #define XDWORD3 Y7 +#define XDWORD4 Y10 +#define XDWORD5 Y11 +#define XDWORD6 Y12 +#define XDWORD7 Y14 + #define XWTMP0 X0 #define XWTMP1 X1 #define XWTMP2 X2 @@ -94,6 +99,11 @@ done_sm4: #define XWORD2 X6 #define XWORD3 X7 +#define XWORD4 X10 +#define XWORD5 X11 +#define XWORD6 X12 +#define XWORD7 X14 + #define NIBBLE_MASK Y3 #define X_NIBBLE_MASK X3 @@ -111,6 +121,7 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0 MOVQ xk+0(FP), AX MOVQ dst+8(FP), BX MOVQ src+32(FP), DX + MOVQ src_len+40(FP), DI MOVQ iv+56(FP), SI CMPB ·useAVX2(SB), $1 @@ -120,84 +131,71 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0 JE avx non_avx2_start: - MOVOU 0(DX), t0 - MOVOU 16(DX), t1 - MOVOU 32(DX), t2 - MOVOU 48(DX), t3 - PSHUFB flip_mask<>(SB), t0 - PSHUFB flip_mask<>(SB), t1 - PSHUFB flip_mask<>(SB), t2 - PSHUFB flip_mask<>(SB), t3 - SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) + CMPQ DI, $128 + JEQ sse_8blocks - XORL CX, CX + MOVOU 0(DX), XWORD0 + MOVOU 16(DX), XWORD1 + MOVOU 32(DX), XWORD2 + MOVOU 48(DX), XWORD3 -loop: - SM4_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3) - SM4_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0) - SM4_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1) - SM4_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2) + SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) - ADDL $16, CX - CMPL CX, $4*32 - JB loop + PXOR 0(SI), XWORD0 + PXOR 16(SI), XWORD1 + PXOR 32(SI), XWORD2 + PXOR 48(SI), XWORD3 - SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); - PSHUFB bswap_mask<>(SB), t3 - PSHUFB bswap_mask<>(SB), t2 - PSHUFB bswap_mask<>(SB), t1 - PSHUFB bswap_mask<>(SB), t0 + MOVUPS XWORD0, 0(BX) + MOVUPS XWORD1, 16(BX) + MOVUPS XWORD2, 32(BX) + MOVUPS XWORD3, 48(BX) - PXOR 0(SI), t0 - PXOR 16(SI), t1 - PXOR 32(SI), t2 - PXOR 48(SI), t3 + RET - MOVUPS t0, 0(BX) - MOVUPS t1, 16(BX) - MOVUPS t2, 32(BX) - MOVUPS t3, 48(BX) +sse_8blocks: + MOVOU 0(DX), XWORD0 + MOVOU 16(DX), XWORD1 + MOVOU 32(DX), XWORD2 + MOVOU 48(DX), XWORD3 + MOVOU 64(DX), XWORD4 + MOVOU 80(DX), XWORD5 + MOVOU 96(DX), XWORD6 + MOVOU 112(DX), XWORD7 + + SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7) + + PXOR 0(SI), XWORD0 + PXOR 16(SI), XWORD1 + PXOR 32(SI), XWORD2 + PXOR 48(SI), XWORD3 + PXOR 64(SI), XWORD4 + PXOR 80(SI), XWORD5 + PXOR 96(SI), XWORD6 + PXOR 112(SI), XWORD7 + + MOVOU XWORD0, 0(BX) + MOVOU XWORD1, 16(BX) + MOVOU XWORD2, 32(BX) + MOVOU XWORD3, 48(BX) + MOVOU XWORD4, 64(BX) + MOVOU XWORD5, 
80(BX) + MOVOU XWORD6, 96(BX) + MOVOU XWORD7, 112(BX) done_sm4: RET avx: + CMPQ DI, $128 + JEQ avx_8blocks + VMOVDQU 0(DX), XWORD0 VMOVDQU 16(DX), XWORD1 VMOVDQU 32(DX), XWORD2 VMOVDQU 48(DX), XWORD3 - VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK - VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK - - VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0 - VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1 - VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2 - VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3 - - // Transpose matrix 4 x 4 32bits word - TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2) - - XORL CX, CX - -avx_loop: - AVX_SM4_ROUND(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3) - AVX_SM4_ROUND(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0) - AVX_SM4_ROUND(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1) - AVX_SM4_ROUND(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2) - - ADDL $16, CX - CMPL CX, $4*32 - JB avx_loop - - // Transpose matrix 4 x 4 32bits word - TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2) - - VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK - VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0 - VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1 - VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2 - VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3 + AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) VPXOR 0(SI), XWORD0, XWORD0 VPXOR 16(SI), XWORD1, XWORD1 @@ -208,11 +206,45 @@ avx_loop: VMOVDQU XWORD1, 16(BX) VMOVDQU XWORD2, 32(BX) VMOVDQU XWORD3, 48(BX) + RET +avx_8blocks: + VMOVDQU 0(DX), XWORD0 + VMOVDQU 16(DX), XWORD1 + VMOVDQU 32(DX), XWORD2 + VMOVDQU 48(DX), XWORD3 + VMOVDQU 64(DX), XWORD4 + VMOVDQU 80(DX), XWORD5 + VMOVDQU 96(DX), XWORD6 + VMOVDQU 112(DX), XWORD7 + + AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7) + + VPXOR 0(SI), XWORD0, XWORD0 + VPXOR 16(SI), XWORD1, XWORD1 + VPXOR 32(SI), XWORD2, XWORD2 + VPXOR 48(SI), XWORD3, XWORD3 + VPXOR 64(SI), XWORD4, XWORD4 + VPXOR 80(SI), XWORD5, XWORD5 + VPXOR 96(SI), XWORD6, XWORD6 + VPXOR 112(SI), XWORD7, XWORD7 + + VMOVDQU XWORD0, 0(BX) + VMOVDQU XWORD1, 16(BX) + VMOVDQU XWORD2, 32(BX) + VMOVDQU XWORD3, 48(BX) + VMOVDQU XWORD4, 64(BX) + VMOVDQU XWORD5, 80(BX) + VMOVDQU XWORD6, 96(BX) + VMOVDQU XWORD7, 112(BX) + +avx_sm4_done: RET avx2: VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK + CMPQ DI, $256 + JEQ avx2_16blocks avx2_8blocks: VMOVDQU 0(DX), XDWORD0 @@ -230,17 +262,7 @@ avx2_8blocks: // Transpose matrix 4 x 4 32bits word TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2) - XORL CX, CX - -avx2_loop: - AVX2_SM4_ROUND(0, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - AVX2_SM4_ROUND(1, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD1, XDWORD2, XDWORD3, XDWORD0) - AVX2_SM4_ROUND(2, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD2, XDWORD3, XDWORD0, XDWORD1) - AVX2_SM4_ROUND(3, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD3, XDWORD0, XDWORD1, XDWORD2) - - ADDL $16, CX - CMPL CX, $4*32 - JB avx2_loop + AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3) // Transpose matrix 4 x 4 32bits word TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2) @@ -261,6 +283,68 @@ avx2_loop: VMOVDQU XDWORD2, 64(BX) VMOVDQU XDWORD3, 96(BX) + VZEROUPPER + RET + +avx2_16blocks: + VMOVDQU 0(DX), XDWORD0 + VMOVDQU 32(DX), XDWORD1 + VMOVDQU 64(DX), XDWORD2 + VMOVDQU 96(DX), XDWORD3 + VMOVDQU 
128(DX), XDWORD4 + VMOVDQU 160(DX), XDWORD5 + VMOVDQU 192(DX), XDWORD6 + VMOVDQU 224(DX), XDWORD7 + + VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK + + // Apply Byte Flip Mask: LE -> BE + VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0 + VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1 + VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2 + VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3 + VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4 + VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5 + VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6 + VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7 + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2) + TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2) + + AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7) + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2) + TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2) + + VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK + VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0 + VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1 + VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2 + VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3 + VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4 + VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5 + VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6 + VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7 + + VPXOR 0(SI), XDWORD0, XDWORD0 + VPXOR 32(SI), XDWORD1, XDWORD1 + VPXOR 64(SI), XDWORD2, XDWORD2 + VPXOR 96(SI), XDWORD3, XDWORD3 + VPXOR 128(SI), XDWORD4, XDWORD4 + VPXOR 160(SI), XDWORD5, XDWORD5 + VPXOR 192(SI), XDWORD6, XDWORD6 + VPXOR 224(SI), XDWORD7, XDWORD7 + + VMOVDQU XDWORD0, 0(BX) + VMOVDQU XDWORD1, 32(BX) + VMOVDQU XDWORD2, 64(BX) + VMOVDQU XDWORD3, 96(BX) + VMOVDQU XDWORD4, 128(BX) + VMOVDQU XDWORD5, 160(BX) + VMOVDQU XDWORD6, 192(BX) + VMOVDQU XDWORD7, 224(BX) avx2_sm4_done: VZEROUPPER diff --git a/sm4/cbc_cipher_asm_arm64.s b/sm4/cbc_cipher_asm_arm64.s index 0fe466b..b5070cc 100644 --- a/sm4/cbc_cipher_asm_arm64.s +++ b/sm4/cbc_cipher_asm_arm64.s @@ -88,6 +88,10 @@ done_sm4: #undef rkSave #define XTMP7 V7 +#define t4 V10 +#define t5 V11 +#define t6 V12 +#define t7 V13 // func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte) TEXT ·decryptBlocksChain(SB),NOSPLIT,$0 @@ -99,6 +103,8 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0 MOVD src_len+40(FP), R12 MOVD iv+56(FP), R11 + CMP $128, R12 + BEQ double_dec VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4] VREV32 t0.B16, t0.B16 @@ -135,3 +141,57 @@ encryptBlocksLoop: VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R9) RET + +double_dec: + VLD1.P 64(R10), [t0.S4, t1.S4, t2.S4, t3.S4] + VLD1.P 64(R10), [t4.S4, t5.S4, t6.S4, t7.S4] + VREV32 t0.B16, t0.B16 + VREV32 t1.B16, t1.B16 + VREV32 t2.B16, t2.B16 + VREV32 t3.B16, t3.B16 + VREV32 t4.B16, t4.B16 + VREV32 t5.B16, t5.B16 + VREV32 t6.B16, t6.B16 + VREV32 t7.B16, t7.B16 + PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) + PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7) + + VEOR ZERO.B16, ZERO.B16, ZERO.B16 + EOR R0, R0 + +decrypt8BlocksLoop: + SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7) + SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4) + SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5) + SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6) + + ADD $16, R0 + CMP $128, R0 + BNE decrypt8BlocksLoop + + TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, 
XTMP6, XTMP7) + TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7) + VREV32 t0.B16, t0.B16 + VREV32 t1.B16, t1.B16 + VREV32 t2.B16, t2.B16 + VREV32 t3.B16, t3.B16 + VREV32 t4.B16, t4.B16 + VREV32 t5.B16, t5.B16 + VREV32 t6.B16, t6.B16 + VREV32 t7.B16, t7.B16 + + VLD1.P 64(R11), [V6.S4, V7.S4, V8.S4, V9.S4] + VEOR V6.B16, t0.B16, t0.B16 + VEOR V7.B16, t1.B16, t1.B16 + VEOR V8.B16, t2.B16, t2.B16 + VEOR V9.B16, t3.B16, t3.B16 + VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(R9) + + VLD1.P 64(R11), [V6.S4, V7.S4, V8.S4, V9.S4] + VEOR V6.B16, t4.B16, t4.B16 + VEOR V7.B16, t5.B16, t5.B16 + VEOR V8.B16, t6.B16, t6.B16 + VEOR V9.B16, t7.B16, t7.B16 + VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(R9) + + RET diff --git a/sm4/ecb_cipher_asm.go b/sm4/ecb_cipher_asm.go index d4975e5..5cb136d 100644 --- a/sm4/ecb_cipher_asm.go +++ b/sm4/ecb_cipher_asm.go @@ -54,6 +54,15 @@ func (x *ecb) CryptBlocks(dst, src []byte) { if len(src) == 0 { return } + for len(src) >= 2*x.b.blocksSize { + if x.enc == ecbEncrypt { + x.b.EncryptBlocks(dst[:2*x.b.blocksSize], src[:2*x.b.blocksSize]) + } else { + x.b.DecryptBlocks(dst[:2*x.b.blocksSize], src[:2*x.b.blocksSize]) + } + src = src[2*x.b.blocksSize:] + dst = dst[2*x.b.blocksSize:] + } for len(src) >= x.b.blocksSize { if x.enc == ecbEncrypt { x.b.EncryptBlocks(dst[:x.b.blocksSize], src[:x.b.blocksSize]) diff --git a/sm4/gcm_amd64.s b/sm4/gcm_amd64.s index 07646c5..5d1699d 100644 --- a/sm4/gcm_amd64.s +++ b/sm4/gcm_amd64.s @@ -155,114 +155,6 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0 #undef plen #undef dlen -#define AVX_SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3) \ - VMOVDQU flip_mask<>(SB), x \ - VPSHUFB x, t0, t0 \ - VPSHUFB x, t1, t1 \ - VPSHUFB x, t2, t2 \ - VPSHUFB x, t3, t3 \ - ; \ - TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \ - XORL IND, IND \ - VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK \ - AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ - AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ - AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ - AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ - ADDL $16, IND; \ - AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ - AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ - AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ - AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ - ADDL $16, IND; \ - AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ - AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ - AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ - AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ - ADDL $16, IND; \ - AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ - AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ - AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ - AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ - ADDL $16, IND; \ - AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ - AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ - AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ - AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ - ADDL $16, IND; \ - AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ - AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ - AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ - AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ - ADDL $16, IND; \ - AVX_SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ - AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ - AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ - AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ - ADDL $16, IND; \ - AVX_SM4_ROUND(0, RK, 
IND, x, y, z, t0, t1, t2, t3); \
- AVX_SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
- AVX_SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
- AVX_SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
- ; \ // Transpose matrix 4 x 4 32bits word
- TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
- VPSHUFB BSWAP, t0, t0 \
- VPSHUFB BSWAP, t1, t1 \
- VPSHUFB BSWAP, t2, t2 \
- VPSHUFB BSWAP, t3, t3 \
-
-#define SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3) \
- PSHUFB flip_mask<>(SB), t0; \
- PSHUFB flip_mask<>(SB), t1; \
- PSHUFB flip_mask<>(SB), t2; \
- PSHUFB flip_mask<>(SB), t3; \
- SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
- XORL IND, IND; \
- SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
- SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
- SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
- SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
- ADDL $16, IND; \
- SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
- SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
- SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
- SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
- ADDL $16, IND; \
- SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
- SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
- SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
- SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
- ADDL $16, IND; \
- SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
- SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
- SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
- SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
- ADDL $16, IND; \
- SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
- SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
- SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
- SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
- ADDL $16, IND; \
- SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
- SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
- SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
- SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
- ADDL $16, IND; \
- SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
- SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
- SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
- SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
- ADDL $16, IND; \
- SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \
- SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \
- SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \
- SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \
- SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
- PSHUFB BSWAP, t3; \
- PSHUFB BSWAP, t2; \
- PSHUFB BSWAP, t1; \
- PSHUFB BSWAP, t0
-
 // func gcmSm4Init(productTable *[256]byte, rk []uint32)
 TEXT ·gcmSm4Init(SB),NOSPLIT,$0
 #define dst DI
@@ -676,12 +568,12 @@ TEXT ·gcmSm4Enc(SB),0,$256-96
 MOVOU (8*16 + 6*16)(SP), B6
 MOVOU (8*16 + 7*16)(SP), B7
- SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
+ SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
+ PXOR ACC1, ACC1
 increment(0)
 increment(1)
 increment(2)
 increment(3)
- SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
 increment(4)
 increment(5)
 increment(6)
@@ -762,7 +654,6 @@ gcmSm4EncOctetsLoop:
 PCLMULQDQ $0x00, T0, ACC0
 PCLMULQDQ $0x11, T0, ACC1
- SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
 mulRound(1)
 increment(0)
 mulRound(2)
@@ -771,7 +662,6 @@ gcmSm4EncOctetsLoop:
 increment(2)
 mulRound(4)
 increment(3)
- SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
 mulRound(5)
 increment(4)
 mulRound(6)
@@ -791,6 +681,8 @@ gcmSm4EncOctetsLoop:
 reduceRound(ACC0)
 PXOR ACC1, ACC0
+ SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
+
 MOVOU (16*0)(ptx), T0
 PXOR T0, B0
 MOVOU (16*1)(ptx), T0
@@ -886,7 +778,7 @@ gcmSm4EncNibbles:
 MOVOU (8*16 + 2*16)(SP), B2
 MOVOU (8*16 + 3*16)(SP), B3
- SM4_4BLOCKS(AX, BX, T0, T1, T2, B0, B1, B2, B3)
+ SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
 MOVOU (16*0)(ptx), T0
 PXOR T0, B0
 MOVOU (16*1)(ptx), T0
@@ -922,7 +814,7 @@ gcmSm4EncSingles:
 MOVOU (8*16 + 2*16)(SP), B2
 MOVOU (8*16 + 3*16)(SP), B3
- SM4_4BLOCKS(AX, BX, T0, T1, T2, B0, B1, B2, B3)
+ SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
 MOVOU B0, (16*0)(SP)
 MOVOU B1, (16*1)(SP)
 MOVOU B2, (16*2)(SP)
@@ -1014,17 +906,30 @@ avxGcmSm4Enc:
 VMOVDQU (8*16 + 1*16)(SP), B1
 VMOVDQU (8*16 + 2*16)(SP), B2
 VMOVDQU (8*16 + 3*16)(SP), B3
+ VMOVDQU (8*16 + 4*16)(SP), B4
+ VMOVDQU (8*16 + 5*16)(SP), B5
+ VMOVDQU (8*16 + 6*16)(SP), B6
+ VMOVDQU (8*16 + 7*16)(SP), B7
- AVX_SM4_4BLOCKS(rk, BX, B7, T1, T2, B0, B1, B2, B3)
+ AVX_SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
+ VPXOR ACC1, ACC1, ACC1 // clean ACC1
 increment(0)
 increment(1)
 increment(2)
 increment(3)
+ increment(4)
+ increment(5)
+ increment(6)
+ increment(7)
 // XOR plaintext
 VPXOR (16*0)(ptx), B0, B0
 VPXOR (16*1)(ptx), B1, B1
 VPXOR (16*2)(ptx), B2, B2
 VPXOR (16*3)(ptx), B3, B3
+ VPXOR (16*4)(ptx), B4, B4
+ VPXOR (16*5)(ptx), B5, B5
+ VPXOR (16*6)(ptx), B6, B6
+ VPXOR (16*7)(ptx), B7, B7
 // Store ciphertext
 VMOVDQU B0, (16*0)(ctx)
 VPSHUFB BSWAP, B0, B0
@@ -1034,31 +939,6 @@ avxGcmSm4Enc:
 VPSHUFB BSWAP, B2, B2
 VMOVDQU B3, (16*3)(ctx)
 VPSHUFB BSWAP, B3, B3
- VPXOR ACC0, B0, B0
-
- VMOVDQU B0, (16*0)(SP)
- VMOVDQU B1, (16*1)(SP)
- VMOVDQU B2, (16*2)(SP)
- VMOVDQU B3, (16*3)(SP)
-
- // load 4 ctrs for encryption
- VMOVDQU (8*16 + 4*16)(SP), B4
- VMOVDQU (8*16 + 5*16)(SP), B5
- VMOVDQU (8*16 + 6*16)(SP), B6
- VMOVDQU (8*16 + 7*16)(SP), B7
- AVX_SM4_4BLOCKS(rk, BX, B0, T1, T2, B4, B5, B6, B7)
- increment(4)
- increment(5)
- increment(6)
- increment(7)
-
- // XOR plaintext
- VPXOR (16*4)(ptx), B4, B4
- VPXOR (16*5)(ptx), B5, B5
- VPXOR (16*6)(ptx), B6, B6
- VPXOR (16*7)(ptx), B7, B7
-
- // Store ciphertext
 VMOVDQU B4, (16*4)(ctx)
 VPSHUFB BSWAP, B4, B4
 VMOVDQU B5, (16*5)(ctx)
@@ -1068,6 +948,12 @@ avxGcmSm4Enc:
 VMOVDQU B7, (16*7)(ctx)
 VPSHUFB BSWAP, B7, B7
+ VPXOR ACC0, B0, B0
+
+ VMOVDQU B0, (16*0)(SP)
+ VMOVDQU B1, (16*1)(SP)
+ VMOVDQU B2, (16*2)(SP)
+ VMOVDQU B3, (16*3)(SP)
 VMOVDQU B4, (16*4)(SP)
 VMOVDQU B5, (16*5)(SP)
 VMOVDQU B6, (16*6)(SP)
@@ -1129,12 +1015,16 @@ avxGcmSm4EncOctetsLoop:
 avxReduceRound(ACC0)
 VPXOR ACC1, ACC0, ACC0
- AVX_SM4_4BLOCKS(rk, BX, ACC1, T1, T2, B0, B1, B2, B3)
+ AVX_SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
 // XOR plaintext
 VPXOR (16*0)(ptx), B0, B0
 VPXOR (16*1)(ptx), B1, B1
 VPXOR (16*2)(ptx), B2, B2
 VPXOR (16*3)(ptx), B3, B3
+ VPXOR (16*4)(ptx), B4, B4
+ VPXOR (16*5)(ptx), B5, B5
+ VPXOR (16*6)(ptx), B6, B6
+ VPXOR (16*7)(ptx), B7, B7
 // Store ciphertext
 VMOVDQU B0, (16*0)(ctx)
@@ -1145,21 +1035,6 @@ avxGcmSm4EncOctetsLoop:
 VPSHUFB BSWAP, B2, B2
 VMOVDQU B3, (16*3)(ctx)
 VPSHUFB BSWAP, B3, B3
-
- VPXOR ACC0, B0, B0
- VMOVDQU B0, (16*0)(SP)
- VMOVDQU B1, (16*1)(SP)
- VMOVDQU B2, (16*2)(SP)
- VMOVDQU B3, (16*3)(SP)
-
- AVX_SM4_4BLOCKS(rk, BX, B0, T1, T2, B4, B5, B6, B7)
- // XOR plaintext
- VPXOR (16*4)(ptx), B4, B4
- VPXOR (16*5)(ptx), B5, B5
- VPXOR (16*6)(ptx), B6, B6
- VPXOR (16*7)(ptx), B7, B7
-
- // Store ciphertext
 VMOVDQU B4, (16*4)(ctx)
 VPSHUFB BSWAP, B4, B4
 VMOVDQU B5, (16*5)(ctx)
@@ -1169,6 +1044,11 @@ avxGcmSm4EncOctetsLoop:
 VMOVDQU B7, (16*7)(ctx)
 VPSHUFB BSWAP, B7, B7
+ VPXOR ACC0, B0, B0
+ VMOVDQU B0, (16*0)(SP)
+ VMOVDQU B1, (16*1)(SP)
+ VMOVDQU B2, (16*2)(SP)
+ VMOVDQU B3, (16*3)(SP)
 VMOVDQU B4, (16*4)(SP)
 VMOVDQU B5, (16*5)(SP)
 VMOVDQU B6, (16*6)(SP)
@@ -1226,7 +1106,7 @@ avxGcmSm4EncNibbles:
 VMOVDQU (8*16 + 2*16)(SP), B2
 VMOVDQU (8*16 + 3*16)(SP), B3
- AVX_SM4_4BLOCKS(rk, BX, B7, T1, T2, B0, B1, B2, B3)
+ AVX_SM4_4BLOCKS(rk, B6, B7, T1, T2, B0, B1, B2, B3)
 // XOR plaintext
 VPXOR (16*0)(ptx), B0, B0
 VPXOR (16*1)(ptx), B1, B1
@@ -1261,7 +1141,7 @@ avxGcmSm4EncSingles:
 VMOVDQU (8*16 + 2*16)(SP), B2
 VMOVDQU (8*16 + 3*16)(SP), B3
- AVX_SM4_4BLOCKS(rk, BX, B7, T1, T2, B0, B1, B2, B3)
+ AVX_SM4_4BLOCKS(rk, B6, B7, T1, T2, B0, B1, B2, B3)
 VMOVDQU B0, (16*0)(SP)
 VMOVDQU B1, (16*1)(SP)
 VMOVDQU B2, (16*2)(SP)
@@ -1364,18 +1244,9 @@ avx2GcmSm4Enc:
 // Transpose matrix 4 x 4 32bits word
 TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
- XORL BX, BX
 VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
-avx2GcmSm4Enc8Loop1:
- AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
- AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0)
- AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1)
- AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2)
-
- ADDL $16, BX
- CMPL BX, $4*32
- JB avx2GcmSm4Enc8Loop1
+ AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
 // Transpose matrix 4 x 4 32bits word
 TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
@@ -1458,18 +1329,9 @@ avx2GcmSm4EncOctetsLoop:
 // Transpose matrix 4 x 4 32bits word
 TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
- XORL BX, BX
 VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
-avx2GcmSm4Enc8Loop2:
- AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
- AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0)
- AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1)
- AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2)
-
- ADDL $16, BX
- CMPL BX, $4*32
- JB avx2GcmSm4Enc8Loop2
+ AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
 // Transpose matrix 4 x 4 32bits word
 TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
@@ -1578,7 +1440,6 @@ avx2GcmSm4EncOctetsEnd:
 SUBQ $4, aluCTR
 avx2GcmSm4EncNibbles:
- VMOVDQU flip_mask<>(SB), B7
 CMPQ ptxLen, $64
 JBE avx2GcmSm4EncSingles
 SUBQ $64, ptxLen
@@ -1588,31 +1449,7 @@ avx2GcmSm4EncNibbles:
 VMOVDQU (8*16 + 2*16)(SP), B2
 VMOVDQU (8*16 + 3*16)(SP), B3
- VPSHUFB B7, B0, B0
- VPSHUFB B7, B1, B1
- VPSHUFB B7, B2, B2
- VPSHUFB B7, B3, B3
-
- TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
- XORL BX, BX
- VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
-
-avx2GcmSm4Enc4Loop2:
- AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
- AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
- AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
- AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
-
- ADDL $16, BX
- CMPL BX, $4*32
- JB avx2GcmSm4Enc4Loop2
-
- // Transpose matrix 4 x 4 32bits word
- TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
- VPSHUFB BSWAP, B0, B0
- VPSHUFB BSWAP, B1, B1
- VPSHUFB BSWAP, B2, B2
- VPSHUFB BSWAP, B3, B3
+ AVX_SM4_4BLOCKS(rk, B4, B5, B6, B7, B0, B1, B2, B3)
 VMOVDQU (16*0)(ptx), T0
 VPXOR T0, B0, B0
@@ -1650,31 +1487,7 @@ avx2GcmSm4EncSingles:
 VMOVDQU (8*16 + 2*16)(SP), B2
 VMOVDQU (8*16 + 3*16)(SP), B3
- VPSHUFB B7, B0, B0
- VPSHUFB B7, B1, B1
- VPSHUFB B7, B2, B2
- VPSHUFB B7, B3, B3
-
- TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
- XORL BX, BX
- VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
-
-avx2GcmSm4Enc4Loop1:
- AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
- AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
- AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
- AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
-
- ADDL $16, BX
- CMPL BX, $4*32
- JB avx2GcmSm4Enc4Loop1
-
- // Transpose matrix 4 x 4 32bits word
- TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
- VPSHUFB BSWAP, B0, B0
- VPSHUFB BSWAP, B1, B1
- VPSHUFB BSWAP, B2, B2
- VPSHUFB BSWAP, B3, B3
+ AVX_SM4_4BLOCKS(rk, B4, B5, B6, B7, B0, B1, B2, B3)
 VMOVDQU B0, (16*0)(SP)
 VMOVDQU B1, (16*1)(SP)
@@ -1890,7 +1703,6 @@ gcmSm4DecOctetsLoop:
 PCLMULQDQ $0x00, T0, ACC0
 PCLMULQDQ $0x11, T0, ACC1
- SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
 decMulRound(1)
 increment(0)
 decMulRound(2)
@@ -1899,7 +1711,6 @@ gcmSm4DecOctetsLoop:
 increment(2)
 decMulRound(4)
 increment(3)
- SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
 decMulRound(5)
 increment(4)
 decMulRound(6)
@@ -1920,6 +1731,8 @@ gcmSm4DecOctetsLoop:
 reduceRound(ACC0)
 PXOR ACC1, ACC0
+ SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
+
 MOVOU (16*0)(ctx), T0
 PXOR T0, B0
 MOVOU (16*1)(ctx), T0
@@ -1964,7 +1777,7 @@ gcmSm4DecNibbles:
 MOVOU (2*16)(SP), B6
 MOVOU (3*16)(SP), B7
- SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
+ SM4_4BLOCKS(rk, B0, T0, T1, T2, B4, B5, B6, B7)
 MOVOU (16*14)(pTbl), T2
 MOVOU (16*0)(ctx), T0
 PXOR T0, B4
@@ -2000,7 +1813,7 @@ gcmSm4DecSingles:
 MOVOU (2*16)(SP), B2
 MOVOU (3*16)(SP), B3
- SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
+ SM4_4BLOCKS(rk, B4, T0, T1, T2, B0, B1, B2, B3)
 MOVOU B0, (16*4)(SP)
 MOVOU B1, (16*5)(SP)
 MOVOU B2, (16*6)(SP)
@@ -2145,25 +1958,21 @@ avxGcmSm4DecOctetsLoop:
 avxReduceRound(ACC0)
 VPXOR ACC1, ACC0, ACC0
- AVX_SM4_4BLOCKS(rk, BX, ACC1, T1, T2, B0, B1, B2, B3)
+ AVX_SM4_8BLOCKS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
 VPXOR (16*0)(ctx), B0, B0
 VPXOR (16*1)(ctx), B1, B1
 VPXOR (16*2)(ctx), B2, B2
 VPXOR (16*3)(ctx), B3, B3
-
- VMOVDQU B0, (16*0)(ptx)
- VMOVDQU B1, (16*1)(ptx)
- VMOVDQU B2, (16*2)(ptx)
- VMOVDQU B3, (16*3)(ptx)
-
- AVX_SM4_4BLOCKS(rk, BX, B0, T1, T2, B4, B5, B6, B7)
-
 VPXOR (16*4)(ctx), B4, B4
 VPXOR (16*5)(ctx), B5, B5
 VPXOR (16*6)(ctx), B6, B6
 VPXOR (16*7)(ctx), B7, B7
+ VMOVDQU B0, (16*0)(ptx)
+ VMOVDQU B1, (16*1)(ptx)
+ VMOVDQU B2, (16*2)(ptx)
+ VMOVDQU B3, (16*3)(ptx)
 VMOVDQU B4, (16*4)(ptx)
 VMOVDQU B5, (16*5)(ptx)
 VMOVDQU B6, (16*6)(ptx)
@@ -2187,7 +1996,7 @@ avxGcmSm4DecNibbles:
 VMOVDQU (2*16)(SP), B6
 VMOVDQU (3*16)(SP), B7
- AVX_SM4_4BLOCKS(rk, BX, B0, T1, T2, B4, B5, B6, B7)
+ AVX_SM4_4BLOCKS(rk, B0, B1, T1, T2, B4, B5, B6, B7)
 VMOVDQU (16*14)(pTbl), T2
 VMOVDQU (16*0)(ctx), B0
@@ -2227,7 +2036,7 @@ avxGcmSm4DecSingles:
 VMOVDQU (2*16)(SP), B2
 VMOVDQU (3*16)(SP), B3
- AVX_SM4_4BLOCKS(rk, BX, B7, B6, B5, B0, B1, B2, B3)
+ AVX_SM4_4BLOCKS(rk, B7, B6, B5, B4, B0, B1, B2, B3)
 VMOVDQU B0, (16*4)(SP)
 VMOVDQU B1, (16*5)(SP)
 VMOVDQU B2, (16*6)(SP)
@@ -2328,13 +2137,6 @@ avx2GcmSm4DecOctetsLoop:
 VMOVDQU (2*32)(SP), DWB2
 VMOVDQU (3*32)(SP), DWB3
- VBROADCASTI128 flip_mask<>(SB), XDWTMP0
- // Apply Byte Flip Mask: LE -> BE
- VPSHUFB XDWTMP0, DWB0, DWB0
- VPSHUFB XDWTMP0, DWB1, DWB1
- VPSHUFB XDWTMP0, DWB2, DWB2
- VPSHUFB XDWTMP0, DWB3, DWB3
-
 VMOVDQU (16*0)(ctx), T0
 VPSHUFB BSWAP, T0, T0
 VPXOR ACC0, T0, T0
@@ -2348,20 +2150,18 @@ avx2GcmSm4DecOctetsLoop:
 VPCLMULQDQ $0x00, T0, ACC1, ACC0
 VPCLMULQDQ $0x11, T0, ACC1, ACC1
+ VBROADCASTI128 flip_mask<>(SB), XDWTMP0
+ // Apply Byte Flip Mask: LE -> BE
+ VPSHUFB XDWTMP0, DWB0, DWB0
+ VPSHUFB XDWTMP0, DWB1, DWB1
+ VPSHUFB XDWTMP0, DWB2, DWB2
+ VPSHUFB XDWTMP0, DWB3, DWB3
+
 // Transpose matrix 4 x 4 32bits word
 TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
- XORL BX, BX
 VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
-avx2GcmSm4Dec8Loop2:
- AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
- AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0)
- AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1)
- AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2)
-
- ADDL $16, BX
- CMPL BX, $4*32
- JB avx2GcmSm4Dec8Loop2
+ AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3)
 // Transpose matrix 4 x 4 32bits word
 TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
@@ -2374,8 +2174,8 @@ avx2GcmSm4Dec8Loop2:
 VMOVDQU (32*0)(ctx), XDWTMP0
 VPXOR XDWTMP0, DWB0, DWB0
- VPSHUFB DWBSWAP, XDWTMP0, XDWTMP0
 VEXTRACTI128 $1, XDWTMP0, T0
+ VPSHUFB BSWAP, T0, T0
 internalAvxDecMulRound(1)
 increment(0)
@@ -2436,7 +2236,6 @@ avx2GcmSm4DecEndOctets:
 SUBQ $4, aluCTR
 avx2GcmSm4DecNibbles:
- VMOVDQU flip_mask<>(SB), B7 // DO NOT CHANGE B7
 CMPQ ptxLen, $64
 JBE avx2GcmSm4DecSingles
 SUBQ $64, ptxLen
@@ -2446,31 +2245,7 @@ avx2GcmSm4DecNibbles:
 VMOVDQU (2*16)(SP), B2
 VMOVDQU (3*16)(SP), B3
- VPSHUFB B7, B0, B0
- VPSHUFB B7, B1, B1
- VPSHUFB B7, B2, B2
- VPSHUFB B7, B3, B3
-
- TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
- XORL BX, BX
- VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
-
-avx2GcmSm4Dec4Loop2:
- AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
- AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
- AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
- AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
-
- ADDL $16, BX
- CMPL BX, $4*32
- JB avx2GcmSm4Dec4Loop2
-
- // Transpose matrix 4 x 4 32bits word
- TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
- VPSHUFB BSWAP, B0, B4
- VPSHUFB BSWAP, B1, B1
- VPSHUFB BSWAP, B2, B2
- VPSHUFB BSWAP, B3, B3
+ AVX_SM4_4BLOCKS(rk, B4, B5, B6, B7, B0, B1, B2, B3)
 VMOVDQU (16*14)(pTbl), T2
 VMOVDQU (16*0)(ctx), B0
@@ -2511,32 +2286,7 @@ avx2GcmSm4DecSingles:
 VMOVDQU (2*16)(SP), B2
 VMOVDQU (3*16)(SP), B3
- VPSHUFB B7, B0, B0
- VPSHUFB B7, B1, B1
- VPSHUFB B7, B2, B2
- VPSHUFB B7, B3, B3
-
- TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
-
- XORL BX, BX
- VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK
-
-avx2GcmSm4Dec4Loop1:
- AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
- AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
- AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
- AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
-
- ADDL $16, BX
- CMPL BX, $4*32
- JB avx2GcmSm4Dec4Loop1
-
- // Transpose matrix 4 x 4 32bits word
- TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
- VPSHUFB BSWAP, B0, B0
- VPSHUFB BSWAP, B1, B1
- VPSHUFB BSWAP, B2, B2
- VPSHUFB BSWAP, B3, B3
+ AVX_SM4_4BLOCKS(rk, B4, B5, B6, B7, B0, B1, B2, B3)
 VMOVDQU B0, (16*4)(SP)
 VMOVDQU B1, (16*5)(SP)
diff --git a/sm4/gcm_arm64.s b/sm4/gcm_arm64.s
index 21ffda6..81431e1 100644
--- a/sm4/gcm_arm64.s
+++ b/sm4/gcm_arm64.s
@@ -449,36 +449,24 @@ encOctetsLoop:
 // encryption first 4 blocks
 PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
+ PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
 EOR R13, R13
 MOVD rkSave, rk
-encOctetsEnc4Blocks1:
- SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
- SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
- SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
- SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
+encOctetsEnc8Blocks:
+ SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
+ SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
+ SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
+ SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)
 ADD $1, R13
 CMP $8, R13
- BNE encOctetsEnc4Blocks1
+ BNE encOctetsEnc8Blocks
 VREV32 B0.B16, B0.B16
 VREV32 B1.B16, B1.B16
 VREV32 B2.B16, B2.B16
 VREV32 B3.B16, B3.B16
 TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
- // encryption second 4 blocks
- PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
- MOVD rkSave, rk
-
-encOctetsEnc4Blocks2:
- SM4_ROUND(rk, R19, K0, K1, K2, B4, B5, B6, B7)
- SM4_ROUND(rk, R19, K0, K1, K2, B5, B6, B7, B4)
- SM4_ROUND(rk, R19, K0, K1, K2, B6, B7, B4, B5)
- SM4_ROUND(rk, R19, K0, K1, K2, B7, B4, B5, B6)
-
- ADD $1, R13
- CMP $16, R13
- BNE encOctetsEnc4Blocks2
 VREV32 B4.B16, B4.B16
 VREV32 B5.B16, B5.B16
 VREV32 B6.B16, B6.B16
@@ -741,41 +729,28 @@ decOctetsLoop:
 // encryption first 4 blocks
 PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
+ PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
 EOR R13, R13
 MOVD rkSave, rk
-decOctetsEnc4Blocks1:
- SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
- SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
- SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
- SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
+decOctetsEnc8Blocks:
+ SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
+ SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
+ SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
+ SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)
 ADD $1, R13
 CMP $8, R13
- BNE decOctetsEnc4Blocks1
+ BNE decOctetsEnc8Blocks
 VREV32 B0.B16, T1.B16
 VREV32 B1.B16, T2.B16
 VREV32 B2.B16, B2.B16
 VREV32 B3.B16, B3.B16
 TRANSPOSE_MATRIX(T1, T2, B2, B3, K0, K1, K2, K3)
-
- // encryption second 4 blocks
- PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
- MOVD rkSave, rk
-
-decOctetsEnc4Blocks2:
- SM4_ROUND(rk, R19, K0, K1, K2, B4, B5, B6, B7)
- SM4_ROUND(rk, R19, K0, K1, K2, B5, B6, B7, B4)
- SM4_ROUND(rk, R19, K0, K1, K2, B6, B7, B4, B5)
- SM4_ROUND(rk, R19, K0, K1, K2, B7, B4, B5, B6)
-
- ADD $1, R13
- CMP $16, R13
- BNE decOctetsEnc4Blocks2
 VREV32 B4.B16, B4.B16
 VREV32 B5.B16, B5.B16
 VREV32 B6.B16, B6.B16
- VREV32 B7.B16, B7.B16
+ VREV32 B7.B16, B7.B16
 TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
 VLD1.P 32(srcPtr), [B0.B16, B1.B16]
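The hunks above replace the two separate 4-block SM4 passes in the GCM encrypt/decrypt octet loops with single 8-block macros (SM4_8BLOCKS, AVX_SM4_8BLOCKS, AVX2_SM4_8BLOCKS, SM4_8BLOCKS_ROUND). The sketch below is not part of this patch; it is a round-trip check in the spirit of TestECBRandom, written against the sm4 package's NewCipher and the standard crypto/cipher.NewGCM wiring. Whether the optimized gcmSm4Enc/gcmSm4Dec paths are actually selected depends on the block cipher advertising its own GCM implementation to crypto/cipher, which this sketch assumes but does not verify; the test name and package placement are hypothetical. The 128-byte plaintext is chosen so the eight-block loop runs at least once.

package sm4_test // hypothetical placement

import (
	"bytes"
	"crypto/cipher"
	"crypto/rand"
	"io"
	"testing"

	"github.com/emmansun/gmsm/sm4"
)

func TestGCMSM4RoundTrip(t *testing.T) {
	key := []byte("0123456789ABCDEF")
	c, err := sm4.NewCipher(key)
	if err != nil {
		t.Fatal(err)
	}
	aead, err := cipher.NewGCM(c)
	if err != nil {
		t.Fatal(err)
	}
	nonce := make([]byte, aead.NonceSize())
	io.ReadFull(rand.Reader, nonce)
	// 8 blocks of 16 bytes, so an eight-block (octets) iteration is exercised once.
	plaintext := make([]byte, 128)
	io.ReadFull(rand.Reader, plaintext)
	ct := aead.Seal(nil, nonce, plaintext, nil)
	pt, err := aead.Open(nil, nonce, ct, nil)
	if err != nil {
		t.Fatal(err)
	}
	if !bytes.Equal(pt, plaintext) {
		t.Error("GCM-SM4 encrypt/decrypt round trip failed")
	}
}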