diff mbox series

[3/5] Handle gaps in SLP load permutation lowering

Message ID 20240703132524.336CB386101B@sourceware.org
State New
Headers show
Series [1/5] lower SLP load permutation to interleaving | expand

Commit Message

Richard Biener July 3, 2024, 1:23 p.m. UTC
The following adds handling of gaps in load groups by representing them
with NULL entries in SLP_TREE_SCALAR_STMTS for the unpermuted load node.

The SLP discovery changes could be elided if we built the load node
manually instead of relying on SLP discovery to create it.

	* tree-vect-slp.cc (vect_build_slp_tree_1): Handle NULL stmt.
	(vect_build_slp_tree_2): Likewise.  Release load permutation
	when there's a NULL in SLP_TREE_SCALAR_STMTS and assert there's
	no actual permutation in that case.
	(vect_lower_load_permutations): Handle gaps in loads.

	* gcc.dg/vect/slp-51.c: New testcase.
---
 gcc/testsuite/gcc.dg/vect/slp-51.c | 17 +++++++++++
 gcc/tree-vect-slp.cc               | 49 ++++++++++++++++++------------
 2 files changed, 47 insertions(+), 19 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/slp-51.c
diff mbox series

Patch

diff --git a/gcc/testsuite/gcc.dg/vect/slp-51.c b/gcc/testsuite/gcc.dg/vect/slp-51.c
new file mode 100644
index 00000000000..91ae763be30
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-51.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+
+void foo (int * __restrict x, int *y)
+{
+  x = __builtin_assume_aligned (x, __BIGGEST_ALIGNMENT__);
+  y = __builtin_assume_aligned (y, __BIGGEST_ALIGNMENT__);
+  for (int i = 0; i < 1024; ++i)
+    {
+      x[4*i+0] = y[4*i+0];
+      x[4*i+1] = y[4*i+2] * 2;
+      x[4*i+2] = y[4*i+0] + 3;
+      x[4*i+3] = y[4*i+2] * 2 - 5;
+    }
+}
+
+/* Check we can handle SLP with gaps and an interleaving scheme.  */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target { vect_int && vect_int_mult } } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 6f3822af950..fdefee90e92 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1080,10 +1080,15 @@  vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
   stmt_vec_info stmt_info;
   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
     {
-      gimple *stmt = stmt_info->stmt;
       swap[i] = 0;
       matches[i] = false;
+      if (!stmt_info)
+	{
+	  matches[i] = true;
+	  continue;
+	}
 
+      gimple *stmt = stmt_info->stmt;
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
 
@@ -1984,10 +1989,16 @@  vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
 	  stmt_vec_info first_stmt_info
 	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
 	  bool any_permute = false;
+	  bool any_null = false;
 	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
 	    {
 	      int load_place;
-	      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+	      if (! load_info)
+		{
+		  load_place = j;
+		  any_null = true;
+		}
+	      else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 		load_place = vect_get_place_in_interleaving_chain
 		    (load_info, first_stmt_info);
 	      else
@@ -1996,6 +2007,11 @@  vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
 	      any_permute |= load_place != j;
 	      load_permutation.quick_push (load_place);
 	    }
+	  if (any_null)
+	    {
+	      gcc_assert (!any_permute);
+	      load_permutation.release ();
+	    }
 
 	  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
 	    {
@@ -3978,24 +3994,11 @@  vect_lower_load_permutations (loop_vec_info loop_vinfo,
   stmt_vec_info first
     = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
 
-  /* ???  In principle we have to consider a gap up to the next full
-     vector, but we have to actually represent a scalar stmt for the
-     gaps value so delay handling this.  The same is true for
-     inbetween gaps which the load places in the load-permutation
-     represent.  It's probably not worth trying an intermediate packing
-     to vectors without gap even if that might handle some more cases.
-     Instead get the gap case correct in some way.  */
-  unsigned group_lanes = 0;
-  for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
-    {
-      if ((s == first && DR_GROUP_GAP (s) != 0)
-	  || (s != first && DR_GROUP_GAP (s) != 1))
-	return;
-      group_lanes++;
-    }
   /* Only a power-of-two number of lanes matches interleaving with N levels.
+     The non-SLP path also supports DR_GROUP_SIZE == 3.
      ???  An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
      at each step.  */
+  unsigned group_lanes = DR_GROUP_SIZE (first);
   if (exact_log2 (group_lanes) == -1)
     return;
 
@@ -4017,11 +4020,19 @@  vect_lower_load_permutations (loop_vec_info loop_vinfo,
 	continue;
 
       /* First build (and possibly re-use) a load node for the
-	 unpermuted group.  */
+	 unpermuted group.  Gaps in the middle and at the end are
+	 represented with NULL stmts.  */
       vec<stmt_vec_info> stmts;
       stmts.create (group_lanes);
       for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
-	stmts.quick_push (s);
+	{
+	  if (s != first)
+	    for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
+	      stmts.quick_push (NULL);
+	  stmts.quick_push (s);
+	}
+      for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
+	stmts.quick_push (NULL);
       poly_uint64 max_nunits;
       bool *matches = XALLOCAVEC (bool, group_lanes);
       unsigned limit = 1;