From 5b3dfb6fa7d4c6397b431cff2b5686dffcd982e6 Mon Sep 17 00:00:00 2001
From: Sun Yimin
Date: Thu, 28 Sep 2023 13:19:52 +0800
Subject: [PATCH] sm4: amd64, reduce VBROADCASTI128 usage

---
 cipher/benchmark_test.go |  2 +-
 sm4/aesni_macros_amd64.s | 53 ++++++++++++++++++++--------------------
 2 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/cipher/benchmark_test.go b/cipher/benchmark_test.go
index 8fed97a..9d79b2e 100644
--- a/cipher/benchmark_test.go
+++ b/cipher/benchmark_test.go
@@ -25,7 +25,7 @@ func BenchmarkSM4ECBEncrypt1K(b *testing.B) {
 	benchmarkECBEncrypt1K(b, c)
 }
 
-func BenchmarkAES128EBCEncrypt1K(b *testing.B) {
+func BenchmarkAES128ECBEncrypt1K(b *testing.B) {
 	var key [16]byte
 	c, _ := aes.NewCipher(key[:])
 	benchmarkECBEncrypt1K(b, c)
diff --git a/sm4/aesni_macros_amd64.s b/sm4/aesni_macros_amd64.s
index 91c4c6e..cd0f39b 100644
--- a/sm4/aesni_macros_amd64.s
+++ b/sm4/aesni_macros_amd64.s
@@ -16,48 +16,47 @@ GLOBL nibble_mask<>(SB), 8, $16
 // inverse shift rows
 DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
 DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
-GLOBL inverse_shift_rows<>(SB), 8, $16
+DATA inverse_shift_rows<>+0x10(SB)/8, $0x0B0E0104070A0D00
+DATA inverse_shift_rows<>+0x18(SB)/8, $0x0306090C0F020508
+GLOBL inverse_shift_rows<>(SB), 8, $32
 
 // Affine transform 1 (low and high hibbles)
 DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
 DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
-GLOBL m1_low<>(SB), 8, $16
+DATA m1_low<>+0x10(SB)/8, $0x0A7FC3B6D5A01C69
+DATA m1_low<>+0x18(SB)/8, $0x3045F98CEF9A2653
+GLOBL m1_low<>(SB), 8, $32
 
 DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
 DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
-GLOBL m1_high<>(SB), 8, $16
+DATA m1_high<>+0x10(SB)/8, $0xC35BF46CAF379800
+DATA m1_high<>+0x18(SB)/8, $0x68F05FC7049C33AB
+GLOBL m1_high<>(SB), 8, $32
 
 // Affine transform 2 (low and high hibbles)
 DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
 DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
-GLOBL m2_low<>(SB), 8, $16
+DATA m2_low<>+0x10(SB)/8, $0x9A950A05FEF16E61
+DATA m2_low<>+0x18(SB)/8, $0x0E019E916A65FAF5
+GLOBL m2_low<>(SB), 8, $32
 
 DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400
 DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5
-GLOBL m2_high<>(SB), 8, $16
+DATA m2_high<>+0x10(SB)/8, $0x892D69CD44E0A400
+DATA m2_high<>+0x18(SB)/8, $0x2C88CC68E14501A5
+GLOBL m2_high<>(SB), 8, $32
 
 // left rotations of 32-bit words by 8-bit increments
 DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
 DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
-GLOBL r08_mask<>(SB), 8, $16
+DATA r08_mask<>+0x10(SB)/8, $0x0605040702010003
+DATA r08_mask<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
+GLOBL r08_mask<>(SB), 8, $32
 
 DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
 DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
 GLOBL fk_mask<>(SB), 8, $16
 
-// inverse shift rows
-DATA inverse_shift_rows256<>+0x00(SB)/8, $0x0B0E0104070A0D00
-DATA inverse_shift_rows256<>+0x08(SB)/8, $0x0306090C0F020508
-DATA inverse_shift_rows256<>+0x10(SB)/8, $0x0B0E0104070A0D00
-DATA inverse_shift_rows256<>+0x18(SB)/8, $0x0306090C0F020508
-GLOBL inverse_shift_rows256<>(SB), 8, $32
-
-DATA r08_mask256<>+0x00(SB)/8, $0x0605040702010003
-DATA r08_mask256<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
-DATA r08_mask256<>+0x10(SB)/8, $0x0605040702010003
-DATA r08_mask256<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
-GLOBL r08_mask256<>(SB), 8, $32
-
 // Transpose matrix without PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions, bad performance!
 // input: from high to low
 // r0 = [w3, w2, w1, w0]
@@ -539,24 +538,24 @@ GLOBL r08_mask256<>(SB), 8, $32
 // - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier.
 #define AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
 	VPAND yNibbleMask, x, z; \
-	VBROADCASTI128 m1_low<>(SB), y; \
+	VMOVDQU m1_low<>(SB), y; \
 	VPSHUFB z, y, y; \
 	VPSRLQ $4, x, x; \
 	VPAND yNibbleMask, x, x; \
-	VBROADCASTI128 m1_high<>(SB), z; \
+	VMOVDQU m1_high<>(SB), z; \
 	VPSHUFB x, z, x; \
 	VPXOR y, x, x; \
-	VPSHUFB inverse_shift_rows256<>(SB), x, x; \
+	VPSHUFB inverse_shift_rows<>(SB), x, x; \
 	VEXTRACTI128 $1, x, yw \
 	VAESENCLAST xNibbleMask, xw, xw; \
 	VAESENCLAST xNibbleMask, yw, yw; \
 	VINSERTI128 $1, yw, x, x; \
 	VPANDN yNibbleMask, x, z; \
-	VBROADCASTI128 m2_low<>(SB), y; \
+	VMOVDQU m2_low<>(SB), y; \
 	VPSHUFB z, y, y; \
 	VPSRLQ $4, x, x; \
 	VPAND yNibbleMask, x, x; \
-	VBROADCASTI128 m2_high<>(SB), z; \
+	VMOVDQU m2_high<>(SB), z; \
 	VPSHUFB x, z, x; \
 	VPXOR y, x, x
 
@@ -571,11 +570,11 @@ GLOBL r08_mask256<>(SB), 8, $32
 // - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier.
 #define AVX2_SM4_TAO_L1(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
 	AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask); \
-	VPSHUFB r08_mask256<>(SB), x, y; \ // y = x <<< 8
-	VPSHUFB r08_mask256<>(SB), y, z; \ // z = x <<< 16
+	VPSHUFB r08_mask<>(SB), x, y; \ // y = x <<< 8
+	VPSHUFB r08_mask<>(SB), y, z; \ // z = x <<< 16
 	VPXOR x, y, y; \ // y = x ^ (x <<< 8)
 	VPXOR z, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16)
-	VPSHUFB r08_mask256<>(SB), z, z; \ // z = x <<< 24
+	VPSHUFB r08_mask<>(SB), z, z; \ // z = x <<< 24
 	VPXOR x, z, x; \ // x = x ^ (x <<< 24)
 	VPSLLD $2, y, z; \
 	VPSRLD $30, y, y; \
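
Editor's sketch (not part of the patch): the widened tables above simply repeat the 16-byte pattern in the upper 128 bits, so a plain 256-bit VMOVDQU load now yields what VBROADCASTI128 used to produce at run time. The standalone Go program below, using only the r08_mask constants from the DATA lines, simulates PSHUFB on one 128-bit lane and checks that the mask indeed rotates each little-endian 32-bit word left by 8 bits, as the "left rotations of 32-bit words by 8-bit increments" comment states; the test values are arbitrary.

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

func main() {
	// r08_mask, byte for byte, as laid out by the two little-endian DATA/8 quadwords above.
	var mask [16]byte
	binary.LittleEndian.PutUint64(mask[0:], 0x0605040702010003)
	binary.LittleEndian.PutUint64(mask[8:], 0x0E0D0C0F0A09080B)

	// Four arbitrary 32-bit state words, stored little-endian the way a vector register holds them.
	words := [4]uint32{0x00112233, 0x44556677, 0x8899AABB, 0xCCDDEEFF}
	var src, dst [16]byte
	for i, w := range words {
		binary.LittleEndian.PutUint32(src[4*i:], w)
	}

	// PSHUFB semantics within one lane: dst[i] = src[mask[i]&0x0F]
	// (no mask byte has its high bit set, so nothing is zeroed).
	for i := range dst {
		dst[i] = src[mask[i]&0x0F]
	}

	// Each shuffled word should equal the original rotated left by 8 bits.
	for i, w := range words {
		got := binary.LittleEndian.Uint32(dst[4*i:])
		want := bits.RotateLeft32(w, 8)
		fmt.Printf("word %d: %08x %08x match=%v\n", i, got, want, got == want)
	}
}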
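A second sketch, also editorial rather than the author's code: the scalar equivalent of the rotate/XOR chain that AVX2_SM4_TAO_L1 builds from these shuffles. Per the macro's own comments it accumulates x ^ (x <<< 8) ^ (x <<< 16) and x ^ (x <<< 24); assuming the shift/XOR steps after the lines shown in the hunk combine them with a final rotate-left-by-2 (the VPSLLD $2 / VPSRLD $30 pair), the result is the SM4 linear transform L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24). Function names here are hypothetical.

package main

import (
	"fmt"
	"math/bits"
)

// l1Direct is the textbook SM4 linear transform L used in the encryption rounds.
func l1Direct(x uint32) uint32 {
	return x ^ bits.RotateLeft32(x, 2) ^ bits.RotateLeft32(x, 10) ^
		bits.RotateLeft32(x, 18) ^ bits.RotateLeft32(x, 24)
}

// l1MacroOrder mirrors the intermediate values named in the macro comments:
// three successive <<< 8 rotations (the VPSHUFB r08_mask steps), two XOR
// accumulations, and an assumed final rotate-left-by-2 plus XOR.
func l1MacroOrder(x uint32) uint32 {
	y := bits.RotateLeft32(x, 8) // y = x <<< 8
	z := bits.RotateLeft32(y, 8) // z = x <<< 16
	y = x ^ y                    // y = x ^ (x <<< 8)
	y = y ^ z                    // y = x ^ (x <<< 8) ^ (x <<< 16)
	z = bits.RotateLeft32(z, 8)  // z = x <<< 24
	x = x ^ z                    // x = x ^ (x <<< 24)
	return x ^ bits.RotateLeft32(y, 2)
}

func main() {
	for _, w := range []uint32{0x00000001, 0x12345678, 0xDEADBEEF} {
		fmt.Printf("%08x %08x match=%v\n", l1Direct(w), l1MacroOrder(w), l1Direct(w) == l1MacroOrder(w))
	}
}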