diff mbox series

[6/7] RISC-V: Make vectorized memset handle more cases

Message ID 20241018131300.1150819-7-craig.blackmore@embecosm.com
State New
Headers show
Series RISC-V: Vector memcpy/memset fixes and improvements | expand

Commit Message

Craig Blackmore Oct. 18, 2024, 1:12 p.m. UTC
`expand_vec_setmem` only generated vectorized memset if it fitted into a
single vector store.  Extend it to generate a loop for longer and
unknown lengths.

The test cases now use -O1 so that they are not sensitive to scheduling.

gcc/ChangeLog:

	* config/riscv/riscv-string.cc
	(use_vector_stringop_p): Add comment.
	(expand_vec_setmem): Use use_vector_stringop_p instead of
	check_vectorise_memory_operation.  Add loop generation.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/base/setmem-1.c: Use -O1.  Expect a loop
	instead of a libcall.  Add test for unknown length.
	* gcc.target/riscv/rvv/base/setmem-2.c: Likewise.
	* gcc.target/riscv/rvv/base/setmem-3.c: Likewise and expect smaller
	lmul.
---
 gcc/config/riscv/riscv-string.cc              | 83 ++++++++++++++-----
 .../gcc.target/riscv/rvv/base/setmem-1.c      | 37 ++++++++-
 .../gcc.target/riscv/rvv/base/setmem-2.c      | 37 ++++++++-
 .../gcc.target/riscv/rvv/base/setmem-3.c      | 41 +++++++--
 4 files changed, 160 insertions(+), 38 deletions(-)
diff mbox series

Patch

diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index 118c02a4021..91b0ec03118 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -1062,6 +1062,9 @@  struct stringop_info {
 
    MAX_EW is the maximum element width that the caller wants to use and
    LENGTH_IN is the length of the stringop in bytes.
+
+   This is currently used for cpymem and setmem.  If expand_vec_cmpmem switches
+   to using it too then check_vectorise_memory_operation can be removed.
 */
 
 static bool
@@ -1600,41 +1603,75 @@  check_vectorise_memory_operation (rtx length_in, HOST_WIDE_INT &lmul_out)
 bool
 expand_vec_setmem (rtx dst_in, rtx length_in, rtx fill_value_in)
 {
-  HOST_WIDE_INT lmul;
+  stringop_info info;
+
   /* Check we are able and allowed to vectorise this operation;
      bail if not.  */
-  if (!check_vectorise_memory_operation (length_in, lmul))
+  if (!use_vector_stringop_p (info, 1, length_in))
     return false;
 
-  machine_mode vmode
-      = riscv_vector::get_vector_mode (QImode, BYTES_PER_RISCV_VECTOR * lmul)
-	    .require ();
+  /* avl holds the (remaining) length of the required set.
+     cnt holds the length we set with the current store.  */
+  rtx cnt = info.avl;
   rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0));
-  rtx dst = change_address (dst_in, vmode, dst_addr);
+  rtx dst = change_address (dst_in, info.vmode, dst_addr);
 
-  rtx fill_value = gen_reg_rtx (vmode);
+  rtx fill_value = gen_reg_rtx (info.vmode);
   rtx broadcast_ops[] = { fill_value, fill_value_in };
 
-  /* If the length is exactly vlmax for the selected mode, do that.
-     Otherwise, use a predicated store.  */
-  if (known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
+  rtx label = NULL_RTX;
+  rtx mask = NULL_RTX;
+
+  /* If we don't need a loop and the length is exactly vlmax for the selected
+     mode, do a broadcast and store; otherwise use a predicated store.  */
+  if (!info.need_loop
+      && known_eq (GET_MODE_SIZE (info.vmode), INTVAL (length_in)))
     {
-      emit_vlmax_insn (code_for_pred_broadcast (vmode), UNARY_OP,
-			  broadcast_ops);
+      emit_vlmax_insn (code_for_pred_broadcast (info.vmode), UNARY_OP,
+		       broadcast_ops);
       emit_move_insn (dst, fill_value);
+      return true;
     }
-  else
+
+  machine_mode mask_mode
+    = riscv_vector::get_vector_mode (BImode,
+				     GET_MODE_NUNITS (info.vmode)).require ();
+  mask =  CONSTM1_RTX (mask_mode);
+  if (!satisfies_constraint_K (cnt))
+    cnt = force_reg (Pmode, cnt);
+
+  if (info.need_loop)
     {
-      if (!satisfies_constraint_K (length_in))
-	      length_in = force_reg (Pmode, length_in);
-      emit_nonvlmax_insn (code_for_pred_broadcast (vmode), UNARY_OP,
-			  broadcast_ops, length_in);
-      machine_mode mask_mode
-	      = riscv_vector::get_vector_mode (BImode, GET_MODE_NUNITS (vmode))
-		      .require ();
-      rtx mask = CONSTM1_RTX (mask_mode);
-      emit_insn (gen_pred_store (vmode, dst, mask, fill_value, length_in,
-			  get_avl_type_rtx (riscv_vector::NONVLMAX)));
+      info.avl = copy_to_mode_reg (Pmode, info.avl);
+      cnt = gen_reg_rtx (Pmode);
+      emit_insn (riscv_vector::gen_no_side_effects_vsetvl_rtx (info.vmode, cnt,
+							       info.avl));
+    }
+
+  emit_nonvlmax_insn (code_for_pred_broadcast (info.vmode),
+		      riscv_vector::UNARY_OP, broadcast_ops, cnt);
+
+  if (info.need_loop)
+    {
+      label = gen_label_rtx ();
+
+      emit_label (label);
+      emit_insn (riscv_vector::gen_no_side_effects_vsetvl_rtx (info.vmode, cnt,
+							       info.avl));
+    }
+
+  emit_insn (gen_pred_store (info.vmode, dst, mask, fill_value, cnt,
+			     get_avl_type_rtx (riscv_vector::NONVLMAX)));
+
+  if (info.need_loop)
+    {
+      emit_insn (gen_rtx_SET (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, cnt)));
+      emit_insn (gen_rtx_SET (info.avl, gen_rtx_MINUS (Pmode, info.avl, cnt)));
+
+      /* Emit the loop condition.  */
+      rtx test = gen_rtx_NE (VOIDmode, info.avl, const0_rtx);
+      emit_jump_insn (gen_cbranch4 (Pmode, test, info.avl, const0_rtx, label));
+      emit_insn (gen_nop ());
     }
 
   return true;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c
index 22844ff348c..32d85ea4f14 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c
@@ -1,6 +1,6 @@ 
 /* { dg-do compile } */
 /* { dg-add-options riscv_v } */
-/* { dg-additional-options "-O3 -mrvv-max-lmul=dynamic" } */
+/* { dg-additional-options "-O1 -mrvv-max-lmul=dynamic" } */
 /* { dg-final { check-function-bodies "**" "" } } */
 
 #define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
@@ -91,13 +91,42 @@  f6 (void *a, int const b)
   return __builtin_memset (a, b, MIN_VECTOR_BYTES * 8);
 }
 
-/* Don't vectorise if the move is too large for one operation.
+/* Vectorise with loop for larger lengths.
 ** f7:
-**  li\s+a2,\d+
-**  tail\s+memset
+**  mv\s+[ta][0-7],a0
+**  li\s+[ta][0-7],129
+**  vsetvli\s+zero,[ta][0-7],e8,m8,ta,ma
+**  vmv.v.x\s+v8,a1
+XX \.L\d+:
+**  vsetvli\s+[ta][0-7],[ta][0-7],e8,m8,ta,ma
+**  vse8.v\s+v8,0\(a[0-9]\)
+**  add\s+[ta][0-7],[ta][0-7],[ta][0-7]
+**  sub\s+[ta][0-7],[ta][0-7],[ta][0-7]
+**  bne\s+[ta][0-7],zero,\.L\d+
+**  ret
 */
 void *
 f7 (void *a, int const b)
 {
   return __builtin_memset (a, b, MIN_VECTOR_BYTES * 8 + 1);
 }
+
+/* Vectorise with loop for unknown length.
+** f8:
+**  mv\s+[ta][0-7],a0
+**  mv\s+[ta][0-7],a2
+**  vsetvli\s+zero,[ta][0-7],e8,m8,ta,ma
+**  vmv.v.x\s+v8,a1
+XX \.L\d+:
+**  vsetvli\s+[ta][0-7],[ta][0-7],e8,m8,ta,ma
+**  vse8.v\s+v8,0\(a[0-9]\)
+**  add\s+[ta][0-7],[ta][0-7],[ta][0-7]
+**  sub\s+[ta][0-7],[ta][0-7],[ta][0-7]
+**  bne\s+[ta][0-7],zero,\.L\d+
+**  ret
+*/
+void *
+f8 (void *a, int const b, int n)
+{
+  return __builtin_memset (a, b, n);
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-2.c b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-2.c
index faea442a4bd..9da1c9309d8 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-2.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-2.c
@@ -1,6 +1,6 @@ 
 /* { dg-do compile } */
 /* { dg-add-options riscv_v } */
-/* { dg-additional-options "-O3 -mrvv-max-lmul=m1" } */
+/* { dg-additional-options "-O1 -mrvv-max-lmul=m1" } */
 /* { dg-final { check-function-bodies "**" "" } } */
 
 #define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
@@ -39,13 +39,42 @@  f2 (void *a, int const b)
   return __builtin_memset (a, b, MIN_VECTOR_BYTES);
 }
 
-/* Don't vectorise if the move is too large for requested lmul.
+/* Vectorise with loop for larger lengths.
 ** f3:
-**  li\s+a2,\d+
-**  tail\s+memset
+**  mv\s+[ta][0-7],a0
+**  li\s+[ta][0-7],17
+**  vsetvli\s+zero,[ta][0-7],e8,m1,ta,ma
+**  vmv.v.x\s+v1,a1
+XX \.L\d+:
+**  vsetvli\s+[ta][0-7],[ta][0-7],e8,m1,ta,ma
+**  vse8.v\s+v1,0\(a[0-9]\)
+**  add\s+[ta][0-7],[ta][0-7],[ta][0-7]
+**  sub\s+[ta][0-7],[ta][0-7],[ta][0-7]
+**  bne\s+[ta][0-7],zero,\.L\d+
+**  ret
 */
 void *
 f3 (void *a, int const b)
 {
   return __builtin_memset (a, b, MIN_VECTOR_BYTES + 1);
 }
+
+/* Vectorise with loop for unknown length.
+** f4:
+**  mv\s+[ta][0-7],a0
+**  mv\s+[ta][0-7],a2
+**  vsetvli\s+zero,[ta][0-7],e8,m1,ta,ma
+**  vmv.v.x\s+v1,a1
+XX \.L\d+:
+**  vsetvli\s+[ta][0-7],[ta][0-7],e8,m1,ta,ma
+**  vse8.v\s+v1,0\(a[0-9]\)
+**  add\s+[ta][0-7],[ta][0-7],[ta][0-7]
+**  sub\s+[ta][0-7],[ta][0-7],[ta][0-7]
+**  bne\s+[ta][0-7],zero,\.L\d+
+**  ret
+*/
+void *
+f4 (void *a, int const b, int n)
+{
+  return __builtin_memset (a, b, n);
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-3.c b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-3.c
index 25be694d248..2111a139ad4 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-3.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-3.c
@@ -1,6 +1,6 @@ 
 /* { dg-do compile } */
 /* { dg-add-options riscv_v } */
-/* { dg-additional-options "-O3 -mrvv-max-lmul=m8" } */
+/* { dg-additional-options "-O1 -mrvv-max-lmul=m8" } */
 /* { dg-final { check-function-bodies "**" "" } } */
 
 #define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
@@ -21,13 +21,13 @@  f1 (void *a, int const b)
   return __builtin_memset (a, b, MIN_VECTOR_BYTES - 1);
 }
 
-/* Vectorise+inline minimum vector register width using requested lmul.
+/* Vectorised code should use the smallest lmul known to fit the length.
 ** f2:
 **  (
-**  vsetivli\s+zero,\d+,e8,m8,ta,ma
+**  vsetivli\s+zero,\d+,e8,m1,ta,ma
 **  |
 **  li\s+a\d+,\d+
-**  vsetvli\s+zero,a\d+,e8,m8,ta,ma
+**  vsetvli\s+zero,a\d+,e8,m1,ta,ma
 **  )
 **  vmv\.v\.x\s+v\d+,a1
 **  vse8\.v\s+v\d+,0\(a0\)
@@ -57,13 +57,40 @@  f3 (void *a, int const b)
   return __builtin_memset (a, b, MIN_VECTOR_BYTES * 8);
 }
 
-/* Don't vectorise if the move is too large for requested lmul.
+/* Vectorise with loop for larger lengths.
 ** f4:
-**  li\s+a2,\d+
-**  tail\s+memset
+**  mv\s+[ta][0-7],a0
+**  li\s+[ta][0-7],129
+**  vsetvli\s+zero,[ta][0-7],e8,m8,ta,ma
+**  vmv.v.x\s+v8,a1
+**  vsetvli\s+[ta][0-7],[ta][0-7],e8,m8,ta,ma
+**  vse8.v\s+v8,0\(a[0-9]\)
+**  add\s+[ta][0-7],[ta][0-7],[ta][0-7]
+**  sub\s+[ta][0-7],[ta][0-7],[ta][0-7]
+**  bne\s+[ta][0-7],zero,\.L\d+
+**  ret
 */
 void *
 f4 (void *a, int const b)
 {
   return __builtin_memset (a, b, MIN_VECTOR_BYTES * 8 + 1);
 }
+
+/* Vectorise with loop for unknown length.
+** f5:
+**  mv\s+[ta][0-7],a0
+**  mv\s+[ta][0-7],a2
+**  vsetvli\s+zero,[ta][0-7],e8,m8,ta,ma
+**  vmv.v.x\s+v8,a1
+**  vsetvli\s+[ta][0-7],[ta][0-7],e8,m8,ta,ma
+**  vse8.v\s+v8,0\(a[0-9]\)
+**  add\s+[ta][0-7],[ta][0-7],[ta][0-7]
+**  sub\s+[ta][0-7],[ta][0-7],[ta][0-7]
+**  bne\s+[ta][0-7],zero,\.L\d+
+**  ret
+*/
+void *
+f5 (void *a, int const b, int n)
+{
+  return __builtin_memset (a, b, n);
+}