Enhance gather fallback for PR65518 with SLP

Message ID 20241016090956.0A5503858D3C@sourceware.org

Commit Message

Richard Biener Oct. 16, 2024, 9:09 a.m. UTC
With SLP forced we fail to use gathers for PR65518 on RISC-V as
expected because peeling for gaps is judged not effective.  The
following moves the memory_access_type adjustment before all of the
overrun checking since using VMAT_ELEMENTWISE means there is no
overrun.
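
To illustrate (a hand-written sketch, not actual vectorizer output;
the helper name and the element count of 16 for struct giga are made
up for this example): with the single-element interleaving in PR65518
only g[0] of each struct is live, so elementwise access assembles the
vector from scalar loads and never touches the gap elements, which is
why no overrun checking applies once VMAT_ELEMENTWISE is chosen.

typedef unsigned int v4si __attribute__ ((vector_size (16)));

struct giga { unsigned int g[16]; };

/* VMAT_ELEMENTWISE in spirit: each vector lane comes from its own
   scalar load of gptr[i].g[0], so only the elements the scalar loop
   reads are accessed - nothing in the gap between gptr[i].g[0] and
   gptr[i + 1].g[0], hence no overrun to peel for.  */
static inline v4si
load_first_elementwise (const struct giga *gptr)
{
  return (v4si) { gptr[0].g[0], gptr[1].g[0],
		  gptr[2].g[0], gptr[3].g[0] };
}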

Bootstrapped and tested on x86_64-unknown-linux-gnu.

	* tree-vect-stmts.cc (get_group_load_store_type): Move
	VMAT_ELEMENTWISE fallback for single-element interleaving
	of too large groups before overrun checking.

	* gcc.dg/vect/pr65518.c: Adjust.
---
 gcc/testsuite/gcc.dg/vect/pr65518.c | 109 ++++++++++++++--------------
 gcc/tree-vect-stmts.cc              |  58 ++++++++-------
 2 files changed, 85 insertions(+), 82 deletions(-)

Patch

diff --git a/gcc/testsuite/gcc.dg/vect/pr65518.c b/gcc/testsuite/gcc.dg/vect/pr65518.c
index 189a65534f6..6d851506169 100644
--- a/gcc/testsuite/gcc.dg/vect/pr65518.c
+++ b/gcc/testsuite/gcc.dg/vect/pr65518.c
@@ -1,54 +1,55 @@ 
-#include "tree-vect.h"
-
-#if VECTOR_BITS > 256
-#define NINTS (VECTOR_BITS / 32)
-#else
-#define NINTS 8
-#endif
-
-#define N (NINTS * 2)
-#define RESULT (NINTS * (NINTS - 1) / 2 * N + NINTS)
-
-extern void abort (void);
-
-typedef struct giga
-{
-  unsigned int g[N];
-} giga;
-
-unsigned long __attribute__((noinline,noclone))
-addfst(giga const *gptr, int num)
-{
-  unsigned int retval = 0;
-  int i;
-  for (i = 0; i < num; i++)
-    retval += gptr[i].g[0];
-  return retval;
-}
-
-int main ()
-{
-  struct giga g[NINTS];
-  unsigned int n = 1;
-  int i, j;
-  check_vect ();
-  for (i = 0; i < NINTS; ++i)
-    for (j = 0; j < N; ++j)
-      {
-	g[i].g[j] = n++;
-	__asm__ volatile ("");
-      }
-  if (addfst (g, NINTS) != RESULT)
-    abort ();
-  return 0;
-}
-
-/* We don't want to vectorize the single-element interleaving in the way
-   we currently do that (without ignoring not needed vectors in the
-   gap between gptr[0].g[0] and gptr[1].g[0]), because that's very
-   sub-optimal and causes memory explosion (even though the cost model
-   should reject that in the end).  */
-
-/* { dg-final { scan-tree-dump-times "vectorized 0 loops in function" 2 "vect" { target {! riscv*-*-* } } } } */
-/* We end up using gathers for the strided load on RISC-V which would be OK.  */
-/* { dg-final { scan-tree-dump "using gather/scatter for strided/grouped access" "vect" { target { riscv*-*-* } } } } */
+#include "tree-vect.h"
+
+#if VECTOR_BITS > 256
+#define NINTS (VECTOR_BITS / 32)
+#else
+#define NINTS 8
+#endif
+
+#define N (NINTS * 2)
+#define RESULT (NINTS * (NINTS - 1) / 2 * N + NINTS)
+
+extern void abort (void);
+
+typedef struct giga
+{
+  unsigned int g[N];
+} giga;
+
+unsigned long __attribute__((noinline,noclone))
+addfst(giga const *gptr, int num)
+{
+  unsigned int retval = 0;
+  int i;
+  for (i = 0; i < num; i++)
+    retval += gptr[i].g[0];
+  return retval;
+}
+
+int main ()
+{
+  struct giga g[NINTS];
+  unsigned int n = 1;
+  int i, j;
+  check_vect ();
+  for (i = 0; i < NINTS; ++i)
+    for (j = 0; j < N; ++j)
+      {
+	g[i].g[j] = n++;
+	__asm__ volatile ("");
+      }
+  if (addfst (g, NINTS) != RESULT)
+    abort ();
+  return 0;
+}
+
+/* We don't want to vectorize the single-element interleaving in the way
+   we currently do that (without ignoring not needed vectors in the
+   gap between gptr[0].g[0] and gptr[1].g[0]), because that's very
+   sub-optimal and causes memory explosion (even though the cost model
+   should reject that in the end).  */
+
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops in function" 2 "vect" { target {! riscv*-*-* } } } } */
+/* We should end up using gathers for the strided load on RISC-V.  */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 1 "vect" { target { riscv*-*-* } } } } */
+/* { dg-final { scan-tree-dump "using gather/scatter for strided/grouped access" "vect" { target { riscv*-*-* } } } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 13a825319ca..14723c4dbac 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2081,6 +2081,35 @@  get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 	  else
 	    *memory_access_type = VMAT_CONTIGUOUS;
 
+	  /* If this is single-element interleaving with an element
+	     distance that leaves unused vector loads around punt - we
+	     at least create very sub-optimal code in that case (and
+	     blow up memory, see PR65518).  */
+	  if (loop_vinfo
+	      && *memory_access_type == VMAT_CONTIGUOUS
+	      && single_element_p
+	      && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
+	    {
+	      if (SLP_TREE_LANES (slp_node) == 1)
+		{
+		  *memory_access_type = VMAT_ELEMENTWISE;
+		  overrun_p = false;
+		  if (dump_enabled_p ())
+		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				     "single-element interleaving not supported "
+				     "for not adjacent vector loads, using "
+				     "elementwise access\n");
+		}
+	      else
+		{
+		  if (dump_enabled_p ())
+		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				     "single-element interleaving not supported "
+				     "for not adjacent vector loads\n");
+		  return false;
+		}
+	    }
+
 	  overrun_p = loop_vinfo && gap != 0;
 	  if (overrun_p && vls_type != VLS_LOAD)
 	    {
@@ -2149,6 +2178,7 @@  get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 				 "Peeling for outer loop is not supported\n");
 	      return false;
 	    }
+
 	  /* Peeling for gaps assumes that a single scalar iteration
 	     is enough to make sure the last vector iteration doesn't
 	     access excess elements.  */
@@ -2179,34 +2209,6 @@  get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 		  return false;
 		}
 	    }
-
-	  /* If this is single-element interleaving with an element
-	     distance that leaves unused vector loads around punt - we
-	     at least create very sub-optimal code in that case (and
-	     blow up memory, see PR65518).  */
-	  if (loop_vinfo
-	      && *memory_access_type == VMAT_CONTIGUOUS
-	      && single_element_p
-	      && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
-	    {
-	      if (SLP_TREE_LANES (slp_node) == 1)
-		{
-		  *memory_access_type = VMAT_ELEMENTWISE;
-		  if (dump_enabled_p ())
-		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				     "single-element interleaving not supported "
-				     "for not adjacent vector loads, using "
-				     "elementwise access\n");
-		}
-	      else
-		{
-		  if (dump_enabled_p ())
-		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				     "single-element interleaving not supported "
-				     "for not adjacent vector loads\n");
-		  return false;
-		}
-	    }
 	}
     }
   else