diff mbox

[0/4] Support more VLA SLP permutations

Message ID 20241004104054.2653382-1-richard.sandiford@arm.com
State New
Headers show

Commit Message

Richard Sandiford Oct. 4, 2024, 10:40 a.m. UTC
This series should fix the target-independent parts of PR116583.
(We also need some target-specific patches, to be posted separately.)

The explanations are in the individual commit messages, but I've
attached a -b diff below in case my attempt to split the patch up
has just obfuscated things instead.

Tested on aarch64-linux-gnu (with and without SVE enabled by default)
and x86_64-linux-gnu.  Also tested by running the vect testsuite
with vect-force-slp=1.

Richard Sandiford (4):
  vect: Variable lane indices in vectorizable_slp_permutation_1
  vect: Restructure repeating_p case for SLP permutations
  vect: Support more VLA SLP permutations
  vect: Add more dump messages for VLA SLP permutation

 gcc/testsuite/gcc.dg/vect/slp-13-big-array.c |   2 +-
 gcc/testsuite/gcc.dg/vect/slp-13.c           |   2 +-
 gcc/tree-vect-slp.cc                         | 190 +++++++++++++------
 3 files changed, 134 insertions(+), 60 deletions(-)
diff mbox

Patch

diff --git a/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c b/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c
index ca70856c1dd..e45f8aab133 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c
@@ -137,4 +137,4 @@  int main (void)
 /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { { vect_interleave && vect_extract_even_odd } && { ! vect_pack_trunc } } } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { ! vect_pack_trunc } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { { vect_interleave && vect_extract_even_odd } && vect_pack_trunc } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target vect_pack_trunc xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target vect_pack_trunc } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-13.c b/gcc/testsuite/gcc.dg/vect/slp-13.c
index b7f947e6dbe..d6346aef978 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-13.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-13.c
@@ -131,4 +131,4 @@  int main (void)
 /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { { vect_interleave && vect_extract_even_odd } && { ! vect_pack_trunc } } } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { ! vect_pack_trunc } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { { vect_interleave && vect_extract_even_odd } && vect_pack_trunc } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target vect_pack_trunc xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target vect_pack_trunc } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 482b9d50496..56fb55cb628 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -10194,6 +10194,13 @@  vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
   unsigned i;
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
+  /* True if we're permuting a single input of 2N vectors down
+     to N vectors.  This case doesn't generalize beyond 2 since
+     VEC_PERM_EXPR only takes 2 inputs.  */
+  bool pack_p = false;
+  /* If we're permuting inputs of N vectors each into X*N outputs,
+     this is the value of X, otherwise it is 1.  */
+  unsigned int unpack_factor = 1;
   tree op_vectype = NULL_TREE;
   FOR_EACH_VEC_ELT (children, i, child)
     if (SLP_TREE_VECTYPE (child))
@@ -10215,7 +10222,20 @@  vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
 			     "Unsupported vector types in lane permutation\n");
 	  return -1;
 	}
-      if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
+      auto op_nunits = TYPE_VECTOR_SUBPARTS (op_vectype);
+      unsigned int this_unpack_factor;
+      /* Check whether the input has twice as many lanes per vector.  */
+      if (children.length () == 1
+	  && known_eq (SLP_TREE_LANES (child) * nunits,
+		       SLP_TREE_LANES (node) * op_nunits * 2))
+	pack_p = true;
+      /* Check whether the output has N times as many lanes per vector.  */
+      else if (constant_multiple_p (SLP_TREE_LANES (node) * op_nunits,
+				    SLP_TREE_LANES (child) * nunits,
+				    &this_unpack_factor)
+	       && (i == 0 || unpack_factor == this_unpack_factor))
+	unpack_factor = this_unpack_factor;
+      else
 	repeating_p = false;
     }
 
@@ -10243,29 +10263,47 @@  vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
       return 1;
     }
 
-  /* REPEATING_P is true if every output vector is guaranteed to use the
-     same permute vector.  We can handle that case for both variable-length
-     and constant-length vectors, but we only handle other cases for
-     constant-length vectors.
+  /* Set REPEATING_P to true if the permutations are cylical wrt UNPACK_FACTOR
+     and if we can generate the vectors in a vector-length agnostic way.
+     This requires UNPACK_STEP == NUNITS / UNPACK_FACTOR to be known at
+     compile time.
+
+     The significance of UNPACK_STEP is that, when PACK_P is false,
+     output vector I operates on a window of UNPACK_STEP elements from each
+     input, starting at lane UNPACK_STEP * (I % UNPACK_FACTOR).  For example,
+     when UNPACK_FACTOR is 2, the first output vector operates on lanes
+     [0, NUNITS / 2 - 1] of each input vector and the second output vector
+     operates on lanes [NUNITS / 2, NUNITS - 1] of each input vector.
+
+     When REPEATING_P is true, NOUTPUTS holds the total number of outputs
+     that we actually need to generate.  */
+  uint64_t noutputs = 0;
+  poly_uint64 unpack_step = 0;
+  loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
+  if (!linfo
+      || !multiple_p (nunits, unpack_factor, &unpack_step)
+      || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo)
+			       * SLP_TREE_LANES (node), nunits, &noutputs))
+    repeating_p = false;
+
+  /* We can handle the conditions described for REPEATING_P above for
+     both variable- and constant-length vectors.  The fallback requires
+     us to generate every element of every permute vector explicitly,
+     which is only possible for constant-length permute vectors.
 
      Set:
 
      - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
-       mask vector that we want to build.
+       mask vectors that we want to build.
 
      - NCOPIES to the number of copies of PERM that we need in order
-       to build the necessary permute mask vectors.
-
-     - NOUTPUTS_PER_MASK to the number of output vectors we want to create
-       for each permute mask vector.  This is only relevant when GSI is
-       nonnull.  */
+       to build the necessary permute mask vectors.  */
   uint64_t npatterns;
   unsigned nelts_per_pattern;
   uint64_t ncopies;
-  unsigned noutputs_per_mask;
   if (repeating_p)
     {
-      /* We need a single permute mask vector that has the form:
+      /* We need permute mask vectors that have the form:
 
 	   { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
 
@@ -10274,7 +10312,6 @@  vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
 	 that we use for permutes requires 3n elements.  */
       npatterns = SLP_TREE_LANES (node);
       nelts_per_pattern = ncopies = 3;
-      noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
     }
   else
     {
@@ -10282,24 +10319,40 @@  vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
 	 instead of relying on the pattern described above.  */
       if (!nunits.is_constant (&npatterns)
 	  || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
+	{
+	  if (dump_p)
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "unsupported permutation %p on variable-length"
+			     " vectors\n", (void *) node);
 	  return -1;
+	}
       nelts_per_pattern = ncopies = 1;
-      if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
-	if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
+      if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
+	{
+	  if (dump_p)
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "unsupported permutation %p for variable VF\n",
+			     (void *) node);
 	  return -1;
-      noutputs_per_mask = 1;
 	}
-  unsigned olanes = ncopies * SLP_TREE_LANES (node);
+      pack_p = false;
+      unpack_factor = 1;
+    }
+  unsigned olanes = unpack_factor * ncopies * SLP_TREE_LANES (node);
   gcc_assert (repeating_p || multiple_p (olanes, nunits));
 
   /* Compute the { { SLP operand, vector index}, lane } permutation sequence
      from the { SLP operand, scalar lane } permutation as recorded in the
      SLP node as intermediate step.  This part should already work
      with SLP children with arbitrary number of lanes.  */
-  auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
-  auto_vec<unsigned> active_lane;
+  auto_vec<std::pair<std::pair<unsigned, unsigned>, poly_uint64>> vperm;
+  auto_vec<poly_uint64> active_lane;
   vperm.create (olanes);
   active_lane.safe_grow_cleared (children.length (), true);
+  for (unsigned int ui = 0; ui < unpack_factor; ++ui)
+    {
+      for (unsigned j = 0; j < children.length (); ++j)
+	active_lane[j] = ui * unpack_step;
       for (unsigned i = 0; i < ncopies; ++i)
 	{
 	  for (unsigned pi = 0; pi < perm.length (); ++pi)
@@ -10307,13 +10360,16 @@  vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
 	      std::pair<unsigned, unsigned> p = perm[pi];
 	      tree vtype = SLP_TREE_VECTYPE (children[p.first]);
 	      if (repeating_p)
-	    vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
+		vperm.quick_push ({{p.first, 0},
+				   p.second + active_lane[p.first]});
 	      else
 		{
 		  /* We checked above that the vectors are constant-length.  */
-	      unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
-	      unsigned vi = (active_lane[p.first] + p.second) / vnunits;
-	      unsigned vl = (active_lane[p.first] + p.second) % vnunits;
+		  unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype)
+		    .to_constant ();
+		  unsigned lane = active_lane[p.first].to_constant ();
+		  unsigned vi = (lane + p.second) / vnunits;
+		  unsigned vl = (lane + p.second) % vnunits;
 		  vperm.quick_push ({{p.first, vi}, vl});
 		}
 	    }
@@ -10321,6 +10377,7 @@  vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
 	  for (unsigned j = 0; j < children.length (); ++j)
 	    active_lane[j] += SLP_TREE_LANES (children[j]);
 	}
+    }
 
   if (dump_p)
     {
@@ -10339,9 +10396,10 @@  vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
 		  ? multiple_p (i, npatterns)
 		  : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
 	    dump_printf (MSG_NOTE, ",");
-	  dump_printf (MSG_NOTE, " vops%u[%u][%u]",
-		       vperm[i].first.first, vperm[i].first.second,
-		       vperm[i].second);
+	  dump_printf (MSG_NOTE, " vops%u[%u][",
+		       vperm[i].first.first, vperm[i].first.second);
+	  dump_dec (MSG_NOTE, vperm[i].second);
+	  dump_printf (MSG_NOTE, "]");
 	}
       dump_printf (MSG_NOTE, "\n");
     }
@@ -10362,16 +10420,37 @@  vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
   mask.quick_grow (count);
   vec_perm_indices indices;
   unsigned nperms = 0;
-  for (unsigned i = 0; i < vperm.length (); ++i)
-    {
-      mask_element = vperm[i].second;
-      if (first_vec.first == -1U
-	  || first_vec == vperm[i].first)
-	first_vec = vperm[i].first;
+  /* When REPEATING_P is true, we only have UNPACK_FACTOR unique permute
+     vectors to check during analysis, but we need to generate NOUTPUTS
+     vectors during transformation.  */
+  unsigned total_nelts = olanes;
+  if (repeating_p && gsi)
+    total_nelts = (total_nelts / unpack_factor) * noutputs;
+  for (unsigned i = 0; i < total_nelts; ++i)
+    {
+      /* VI is the input vector index when generating code for REPEATING_P.  */
+      unsigned vi = i / olanes * (pack_p ? 2 : 1);
+      unsigned ei = i % olanes;
+      mask_element = vperm[ei].second;
+      if (pack_p)
+	{
+	  /* In this case, we have N outputs and the single child provides 2N
+	     inputs.  Output X permutes inputs 2X and 2X+1.
+
+	     The mask indices are taken directly from the SLP permutation node.
+	     Index X selects from the first vector if (X / NUNITS) % 2 == 0;
+	     X selects from the second vector otherwise.  These conditions
+	     are only known at compile time for constant-length vectors.  */
+	  first_vec = std::make_pair (0, 0);
+	  second_vec = std::make_pair (0, 1);
+	}
+      else if (first_vec.first == -1U
+	       || first_vec == vperm[ei].first)
+	first_vec = vperm[ei].first;
       else if (second_vec.first == -1U
-	       || second_vec == vperm[i].first)
+	       || second_vec == vperm[ei].first)
 	{
-	  second_vec = vperm[i].first;
+	  second_vec = vperm[ei].first;
 	  mask_element += nunits;
 	}
       else
@@ -10435,18 +10514,13 @@  vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
 	      if (!identity_p)
 		mask_vec = vect_gen_perm_mask_checked (vectype, indices);
 
-	      for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
-		{
 	      tree first_def
-		    = vect_get_slp_vect_def (first_node,
-					     first_vec.second + vi);
+		= vect_get_slp_vect_def (first_node, first_vec.second + vi);
 	      tree second_def
-		    = vect_get_slp_vect_def (second_node,
-					     second_vec.second + vi);
+		= vect_get_slp_vect_def (second_node, second_vec.second + vi);
 	      vect_add_slp_permutation (vinfo, gsi, node, first_def,
 					second_def, mask_vec, mask[0]);
 	    }
-	    }
 
 	  index = 0;
 	  first_vec = std::make_pair (-1U, -1U);