From 45584ccaa6771e9f6cd373dd5d4287c8d29520b9 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Wed, 28 Aug 2024 13:18:42 +0800 Subject: [PATCH] internal/subtle: apply VLM/VSTM in non-loop code --- internal/subtle/xor_s390x.s | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/internal/subtle/xor_s390x.s b/internal/subtle/xor_s390x.s index 202520c..2c9a320 100644 --- a/internal/subtle/xor_s390x.s +++ b/internal/subtle/xor_s390x.s @@ -13,13 +13,22 @@ TEXT ·xorBytes(SB),NOSPLIT,$0-32 CMPBLT R4, $64, tail loop_64: - VLM 0(R2)(R5*1), V0, V3 - VLM 0(R3)(R5*1), V4, V7 + VL 0(R2)(R5*1), V0 + VL 16(R2)(R5*1), V1 + VL 32(R2)(R5*1), V2 + VL 48(R2)(R5*1), V3 + VL 0(R3)(R5*1), V4 + VL 16(R3)(R5*1), V5 + VL 32(R3)(R5*1), V6 + VL 48(R3)(R5*1), V7 VX V0, V4, V4 VX V1, V5, V5 VX V2, V6, V6 VX V3, V7, V7 - VSTM V4, V7, 0(R1)(R5*1) + VST V4, 0(R1)(R5*1) + VST V5, 16(R1)(R5*1) + VST V6, 32(R1)(R5*1) + VST V7, 48(R1)(R5*1) LAY 64(R5), R5 SUB $64, R4 CMPBGE R4, $64, loop_64 @@ -27,14 +36,14 @@ loop_64: tail: CMPBEQ R4, $0, done CMPBLT R4, $32, less_than32 - VL 0(R2)(R5*1), V0 - VL 16(R2)(R5*1), V1 - VL 0(R3)(R5*1), V2 - VL 16(R3)(R5*1), V3 + VLM 0(R2)(R5*1), V0, V1 + //VL 16(R2)(R5*1), V1 + VLM 0(R3)(R5*1), V2, V3 + //VL 16(R3)(R5*1), V3 VX V0, V2, V2 VX V1, V3, V3 - VST V2, 0(R1)(R5*1) - VST V3, 16(R1)(R5*1) + VSTM V2, V3, 0(R1)(R5*1) + //VST V3, 16(R1)(R5*1) LAY 32(R5), R5 SUB $32, R4