diff mbox series

[7/7] RISC-V: Disable by pieces for vector setmem length > UNITS_PER_WORD

Message ID 20241018131300.1150819-8-craig.blackmore@embecosm.com
State New
Headers show
Series RISC-V: Vector memcpy/memset fixes and improvements | expand

Commit Message

Craig Blackmore Oct. 18, 2024, 1:13 p.m. UTC
For fast unaligned access targets, by pieces uses up to UNITS_PER_WORD
size pieces resulting in more store instructions than needed.  For
example gcc.target/riscv/rvv/base/setmem-1.c:f1 built with
`-O3 -march=rv64gcv -mtune=thead-c906`:
```
f1:
        vsetivli        zero,8,e8,mf2,ta,ma
        vmv.v.x v1,a1
        vsetivli        zero,0,e32,mf2,ta,ma
        sb      a1,14(a0)
        vmv.x.s a4,v1
        vsetivli        zero,8,e16,m1,ta,ma
        vmv.x.s a5,v1
        vse8.v  v1,0(a0)
        sw      a4,8(a0)
        sh      a5,12(a0)
        ret
```

The slow unaligned access version built with `-O3 -march=rv64gcv` used
15 sb instructions:
```
f1:
        sb      a1,0(a0)
        sb      a1,1(a0)
        sb      a1,2(a0)
        sb      a1,3(a0)
        sb      a1,4(a0)
        sb      a1,5(a0)
        sb      a1,6(a0)
        sb      a1,7(a0)
        sb      a1,8(a0)
        sb      a1,9(a0)
        sb      a1,10(a0)
        sb      a1,11(a0)
        sb      a1,12(a0)
        sb      a1,13(a0)
        sb      a1,14(a0)
        ret
```

After this patch, the following is generated in both cases:
```
f1:
        vsetivli        zero,15,e8,m1,ta,ma
        vmv.v.x v1,a1
        vse8.v  v1,0(a0)
        ret
```

gcc/ChangeLog:

	* config/riscv/riscv.cc (riscv_use_by_pieces_infrastructure_p):
	New function.
	(TARGET_USE_BY_PIECES_INFRASTRUCTURE_P): Define.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/pr113469.c: Expect mf2 setmem.
	* gcc.target/riscv/rvv/base/setmem-2.c: Update f1 to expect
	straight-line vector memset.
	* gcc.target/riscv/rvv/base/setmem-3.c: Likewise.
---
 gcc/config/riscv/riscv.cc                     | 19 +++++++++++++++++++
 .../gcc.target/riscv/rvv/autovec/pr113469.c   |  3 ++-
 .../gcc.target/riscv/rvv/base/setmem-2.c      | 12 +++++++-----
 .../gcc.target/riscv/rvv/base/setmem-3.c      | 12 +++++++-----
 4 files changed, 35 insertions(+), 11 deletions(-)
diff mbox series

Patch

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index e111cb07284..c008b2da3b7 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -12583,6 +12583,22 @@  riscv_stack_clash_protection_alloca_probe_range (void)
   return STACK_CLASH_CALLER_GUARD;
 }
 
+static bool
+riscv_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
+				      unsigned alignment,
+				      enum by_pieces_operation op, bool speed_p)
+{
+  /* For set/clear with size > UNITS_PER_WORD, by pieces uses vector broadcasts
+     with UNITS_PER_WORD size pieces.  Use setmem<mode> instead which can use
+     bigger chunks.  */
+  if (TARGET_VECTOR && stringop_strategy & STRATEGY_VECTOR
+      && (op == CLEAR_BY_PIECES || op == SET_BY_PIECES)
+      && speed_p && size > UNITS_PER_WORD)
+    return false;
+
+  return default_use_by_pieces_infrastructure_p (size, alignment, op, speed_p);
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
 #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -12948,6 +12964,9 @@  riscv_stack_clash_protection_alloca_probe_range (void)
 #undef TARGET_C_MODE_FOR_FLOATING_TYPE
 #define TARGET_C_MODE_FOR_FLOATING_TYPE riscv_c_mode_for_floating_type
 
+#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
+#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P riscv_use_by_pieces_infrastructure_p
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-riscv.h"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c
index d1c118c02d6..f86084bdb40 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c
@@ -51,4 +51,5 @@  void p(int buf, __builtin_va_list ab, int q) {
  } while (k);
 }
 
-/* { dg-final { scan-assembler-times {vsetivli\tzero,\s*4,\s*e8,\s*mf4,\s*t[au],\s*m[au]} 2 } } */
+/* { dg-final { scan-assembler-times {vsetivli\tzero,\s*4,\s*e8,\s*mf4,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetivli\tzero,\s*8,\s*e8,\s*mf2,\s*t[au],\s*m[au]} 1 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-2.c b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-2.c
index 9da1c9309d8..67d62f7193e 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-2.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-2.c
@@ -5,15 +5,17 @@ 
 
 #define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
 
-/* Small memsets shouldn't be vectorised.
+/* Vectorise with no loop.
 ** f1:
 **  (
-**  sb\s+a1,0\(a0\)
-**  ...
+**  vsetivli\s+zero,\d+,e8,m1,ta,ma
 **  |
-**  li\s+a2,\d+
-**  tail\s+memset
+**  li\s+a\d+,\d+
+**  vsetvli\s+zero,a\d+,e8,m1,ta,ma
 **  )
+**  vmv\.v\.x\s+v\d+,a1
+**  vse8\.v\s+v\d+,0\(a0\)
+**  ret
 */
 void *
 f1 (void *a, int const b)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-3.c b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-3.c
index 2111a139ad4..7ade7ef415b 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-3.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-3.c
@@ -5,15 +5,17 @@ 
 
 #define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
 
-/* Small memsets shouldn't be vectorised.
+/* Vectorise with no loop.
 ** f1:
 **  (
-**  sb\s+a1,0\(a0\)
-**  ...
+**  vsetivli\s+zero,\d+,e8,m1,ta,ma
 **  |
-**  li\s+a2,\d+
-**  tail\s+memset
+**  li\s+a\d+,\d+
+**  vsetvli\s+zero,a\d+,e8,m1,ta,ma
 **  )
+**  vmv\.v\.x\s+v\d+,a1
+**  vse8\.v\s+v\d+,0\(a0\)
+**  ret
 */
 void *
 f1 (void *a, int const b)