[4/4] RISC-V: Re-implement SLP single-lane rediscovery for SLP

Message ID 20240829133320.DAE4C3861009@sourceware.org
State New
Series [1/4] lower SLP load permutation to interleaving

Commit Message

Richard Biener Aug. 29, 2024, 1:32 p.m. UTC
This re-implements the scrapping of SLP in case load/store-lanes
can be used, by re-discovering the SLP instance with forced
single-lane splits so that the SLP load/store-lanes scheme can be
used.
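
As a hypothetical illustration (not taken from the patch or its
testsuite), a loop of the kind this targets: the loads carry a
non-trivial SLP load permutation, which load/store-lanes
(e.g. AArch64 ld2/st2 or RVV vlseg2e32/vsseg2e32) can absorb once
each lane is a single-lane SLP instance.

/* Swap adjacent pairs; the grouped loads get the SLP load
   permutation { 1, 0 }, which lane instructions make unnecessary.  */
void
f (int *restrict out, int *restrict a, int n)
{
  for (int i = 0; i < n; ++i)
    {
      out[2 * i]     = a[2 * i + 1];
      out[2 * i + 1] = a[2 * i];
    }
}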

This is now done after SLP discovery and SLP pattern recognition
are complete so as not to disturb the latter, and it is done per
SLP instance instead of being a global decision on the whole loop.
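
To illustrate the per-instance scope (again a hypothetical example,
not from the patch), consider a loop with two store groups: the one
with permuted loads can be re-discovered with single lanes and use
load/store-lanes, while the other remains a regular multi-lane SLP
instance; the old global decision would have cancelled SLP for both.

void
g (int *restrict out1, int *restrict out2, int *restrict a, int n)
{
  for (int i = 0; i < n; ++i)
    {
      out1[2 * i]     = a[2 * i + 1];     /* permuted -> lanes  */
      out1[2 * i + 1] = a[2 * i];
      out2[2 * i]     = a[2 * i] + 1;     /* identity -> plain SLP  */
      out2[2 * i + 1] = a[2 * i + 1] + 1;
    }
}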

	* tree-vect-slp.cc (vect_analyze_slp): After SLP pattern
	recog is finished, see if there are any SLP instances
	that would benefit from using load/store-lanes and
	re-discover those with forced single lanes.
---
 gcc/tree-vect-slp.cc | 117 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 108 insertions(+), 9 deletions(-)

Patch

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index bfecbc87f29..852937cdc34 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3481,7 +3481,8 @@  static bool
 vect_analyze_slp_instance (vec_info *vinfo,
 			   scalar_stmts_to_slp_tree_map_t *bst_map,
 			   stmt_vec_info stmt_info, slp_instance_kind kind,
-			   unsigned max_tree_size, unsigned *limit);
+			   unsigned max_tree_size, unsigned *limit,
+			   bool force_single_lane = false);
 
 /* Build an interleaving scheme for the store sources RHS_NODES from
    SCALAR_STMTS.  */
@@ -3676,7 +3677,8 @@  vect_build_slp_instance (vec_info *vinfo,
 			 unsigned max_tree_size, unsigned *limit,
 			 scalar_stmts_to_slp_tree_map_t *bst_map,
 			 /* ???  We need stmt_info for group splitting.  */
-			 stmt_vec_info stmt_info_)
+			 stmt_vec_info stmt_info_,
+			 bool force_single_lane = false)
 {
   /* If there's no budget left bail out early.  */
   if (*limit == 0)
@@ -3705,9 +3707,17 @@  vect_build_slp_instance (vec_info *vinfo,
   poly_uint64 max_nunits = 1;
   unsigned tree_size = 0;
   unsigned i;
-  slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
-				       &max_nunits, matches, limit,
-				       &tree_size, bst_map);
+
+  slp_tree node = NULL;
+  if (force_single_lane)
+    {
+      matches[0] = true;
+      matches[1] = false;
+    }
+  else
+    node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
+				&max_nunits, matches, limit,
+				&tree_size, bst_map);
   if (node != NULL)
     {
       /* Calculate the unrolling factor based on the smallest type.  */
@@ -3922,7 +3932,7 @@  vect_build_slp_instance (vec_info *vinfo,
 	       && compare_step_with_zero (vinfo, stmt_info) > 0
 	       && vect_slp_prefer_store_lanes_p (vinfo, stmt_info,
 						 group_size, 1));
-	  if (want_store_lanes)
+	  if (want_store_lanes || force_single_lane)
 	    i = 1;
 
 	  if (dump_enabled_p ())
@@ -3958,7 +3968,7 @@  vect_build_slp_instance (vec_info *vinfo,
 					       (max_nunits, end - start));
 		  rhs_nodes.safe_push (node);
 		  start = end;
-		  if (want_store_lanes)
+		  if (want_store_lanes || force_single_lane)
 		    end = start + 1;
 		  else
 		    end = group_size;
@@ -4086,7 +4096,8 @@  vect_analyze_slp_instance (vec_info *vinfo,
 			   scalar_stmts_to_slp_tree_map_t *bst_map,
 			   stmt_vec_info stmt_info,
 			   slp_instance_kind kind,
-			   unsigned max_tree_size, unsigned *limit)
+			   unsigned max_tree_size, unsigned *limit,
+			   bool force_single_lane)
 {
   vec<stmt_vec_info> scalar_stmts;
 
@@ -4131,7 +4142,7 @@  vect_analyze_slp_instance (vec_info *vinfo,
 				      roots, remain,
 				      max_tree_size, limit, bst_map,
 				      kind == slp_inst_kind_store
-				      ? stmt_info : NULL);
+				      ? stmt_info : NULL, force_single_lane);
 
   /* ???  If this is slp_inst_kind_store and the above succeeded here's
      where we should do store group splitting.  */
@@ -4662,6 +4673,94 @@  vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 	}
     }
 
+  /* Check whether we should force some SLP instances to use load/store-lanes
+     and do so by forcing SLP re-discovery with single lanes.  We used
+     to cancel SLP when this applied to all instances in a loop but now
+     we decide this per SLP instance.  It's important to do this only
+     after SLP pattern recognition.  */
+  if (is_a <loop_vec_info> (vinfo))
+    FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
+      if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
+	  && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
+	{
+	  slp_tree slp_root = SLP_INSTANCE_TREE (instance);
+	  int group_size = SLP_TREE_LANES (slp_root);
+	  tree vectype = SLP_TREE_VECTYPE (slp_root);
+
+	  auto_vec<slp_tree> loads;
+	  hash_set<slp_tree> visited;
+	  vect_gather_slp_loads (loads, slp_root, visited);
+
+	  /* Check whether any load in the SLP instance is possibly
+	     permuted.  */
+	  bool loads_permuted = false;
+	  slp_tree load_node;
+	  unsigned j;
+	  FOR_EACH_VEC_ELT (loads, j, load_node)
+	    {
+	      if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
+		continue;
+	      unsigned k;
+	      stmt_vec_info load_info;
+	      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
+		if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
+		  {
+		    loads_permuted = true;
+		    break;
+		  }
+	    }
+
+	  /* If the loads and stores can use load/store-lanes force re-discovery
+	     with single lanes.  */
+	  if (loads_permuted
+	      && !slp_root->ldst_lanes
+	      && vect_store_lanes_supported (vectype, group_size, false)
+	      != IFN_LAST)
+	    {
+	      bool can_use_lanes = true;
+	      FOR_EACH_VEC_ELT (loads, j, load_node)
+		if (STMT_VINFO_GROUPED_ACCESS
+		      (SLP_TREE_REPRESENTATIVE (load_node)))
+		  {
+		    stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
+			(SLP_TREE_REPRESENTATIVE (load_node));
+		    /* Use SLP for strided accesses (or if we can't
+		       load-lanes).  */
+		    if (STMT_VINFO_STRIDED_P (stmt_vinfo)
+			|| compare_step_with_zero (vinfo, stmt_vinfo) <= 0
+			|| vect_load_lanes_supported
+			     (STMT_VINFO_VECTYPE (stmt_vinfo),
+			      DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
+		      {
+			can_use_lanes = false;
+			break;
+		      }
+		  }
+
+	      if (can_use_lanes)
+		{
+		  if (dump_enabled_p ())
+		    dump_printf_loc (MSG_NOTE, vect_location,
+				     "SLP instance %p can use load/store-lanes,"
+				     " re-discovering with single-lanes\n",
+				     (void *) instance);
+
+		  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
+
+		  vect_free_slp_instance (instance);
+		  limit = max_tree_size;
+		  bool res = vect_analyze_slp_instance (vinfo, bst_map,
+							stmt_info,
+							slp_inst_kind_store,
+							max_tree_size, &limit,
+							true);
+		  gcc_assert (res);
+		  auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
+		  LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
+		}
+	    }
+	}
+
   /* When we end up with load permutations that we cannot possibly handle,
      like those requiring three vector inputs, lower them using interleaving
      like schemes.  */