RISC-V: Also lower SLP grouped loads with just one consumer

Message ID 20240903093312.5D41A13A52@imap1.dmz-prg2.suse.org
State New

Commit Message

Richard Biener Sept. 3, 2024, 9:32 a.m. UTC
This makes sure to produce interleaving schemes or load-lanes
for single-element interleaving and other permutes that otherwise
would use more than three vectors.
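
Not part of the patch, just for illustration: a kernel of roughly this
shape has a single-element interleaved load whose group is consumed by
just one SLP load node, which the lowering now also processes:

  /* Hypothetical example; only every fourth element of the load
     group is used, so there is a single SLP load node carrying a
     load permutation that previously was left alone.  */
  void
  foo (int *__restrict out, int *in, int n)
  {
    for (int i = 0; i < n; ++i)
      out[i] = in[4 * i];
  }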

It exposes the latent issue that single-element interleaving with
large gaps can be inefficient - the mitigation in get_group_load_store_type
doesn't trigger when we clear the load permutation.
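
For example (illustrative only, not a testcase from the patch), a
single-element interleaved load whose group size well exceeds the
number of vector lanes is the kind of access that limit is meant to
catch:

  /* Hypothetical example; a group of 16 ints with only one element
     used, which should fall back to elementwise loads rather than
     loading the whole group.  */
  struct s { int key; int pad[15]; };
  void
  bar (int *__restrict out, struct s *in, int n)
  {
    for (int i = 0; i < n; ++i)
      out[i] = in[i].key;
  }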

It also exposes the fact that not all permutes can be lowered
optimally in a vector-length agnostic way, so I've added an
exception to keep power-of-two sized, contiguous, aligned chunks
unlowered (unless we want load-lanes).  The optimal handling
of load/store vectorization is going to continue to be a learning
process.
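
As a made-up example of the exception: if an eight-lane load group is
consumed by a single SLP node that reads only the first four lanes,
that node covers a contiguous, aligned, power-of-two sized chunk and
is left unlowered, since re-loading in[8*i+0..3] as one vector is
likely cheaper than extracting it from an interleaving scheme built
for the whole group:

  /* Hypothetical example; the load group from 'in' has eight lanes,
     but the SLP load node only uses the contiguous aligned chunk
     of lanes 0..3.  */
  void
  baz (int *__restrict out, int *in, int n)
  {
    for (int i = 0; i < n; ++i)
      {
        out[4 * i + 0] = in[8 * i + 0];
        out[4 * i + 1] = in[8 * i + 1];
        out[4 * i + 2] = in[8 * i + 2];
        out[4 * i + 3] = in[8 * i + 3];
      }
  }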

Bootstrap and regtest running on x86_64-unknown-linux-gnu.

	* tree-vect-slp.cc (vect_lower_load_permutations): Also
	process single-use grouped loads.
	Avoid lowering contiguous, aligned, power-of-two sized
	chunks; those are better handled by the vector-size
	specific SLP code generation.
	* tree-vect-stmts.cc (get_group_load_store_type): Drop
	the unrelated requirement of a load permutation for the
	single-element interleaving limit.
---
 gcc/tree-vect-slp.cc   | 54 ++++++++++++++++++++++++++++--------------
 gcc/tree-vect-stmts.cc |  1 -
 2 files changed, 36 insertions(+), 19 deletions(-)

Patch

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 1342913affa..1dc5888e92a 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -4312,6 +4312,35 @@  vect_lower_load_permutations (loop_vec_info loop_vinfo,
 	  && ld_lanes_lanes == 0)
 	continue;
 
+      /* Build the permute to get the original load permutation order.  */
+      bool contiguous = true;
+      lane_permutation_t final_perm;
+      final_perm.create (SLP_TREE_LANES (load));
+      for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
+	{
+	  final_perm.quick_push
+	    (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
+	  if (i != 0
+	      && (SLP_TREE_LOAD_PERMUTATION (load)[i]
+		  != SLP_TREE_LOAD_PERMUTATION (load)[i-1] + 1))
+	    contiguous = false;
+	}
+
+      /* When the load permutation accesses a contiguous unpermuted,
+	 power-of-two aligned and sized chunk leave the load alone.
+	 We can likely (re-)load it more efficiently rather than
+	 extracting it from the larger load.
+	 ???  Long-term some of the lowering should move to where
+	 the vector types involved are fixed.  */
+      if (ld_lanes_lanes == 0
+	  && pow2p_hwi (SLP_TREE_LANES (load))
+	  && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
+	  && group_lanes % SLP_TREE_LANES (load) == 0)
+	{
+	  final_perm.release ();
+	  continue;
+	}
+
       /* First build (and possibly re-use) a load node for the
 	 unpermuted group.  Gaps in the middle and on the end are
 	 represented with NULL stmts.  */
@@ -4335,13 +4364,6 @@  vect_lower_load_permutations (loop_vec_info loop_vinfo,
 					 &max_nunits, matches, &limit,
 					 &tree_size, bst_map);
 
-      /* Build the permute to get the original load permutation order.  */
-      lane_permutation_t final_perm;
-      final_perm.create (SLP_TREE_LANES (load));
-      for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
-	final_perm.quick_push
-	  (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
-
       if (ld_lanes_lanes != 0)
 	{
 	  /* ???  If this is not in sync with what get_load_store_type
@@ -4500,20 +4522,16 @@  vect_lower_load_permutations (loop_vec_info loop_vinfo,
 	  && STMT_VINFO_GROUPED_ACCESS (b0)
 	  && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
 	continue;
-      /* Just one SLP load of a possible group, leave those alone.  */
-      if (i == firsti + 1)
-	{
-	  firsti = i;
-	  continue;
-	}
-      /* Now we have multiple SLP loads of the same group from
+      /* Now we have one or multiple SLP loads of the same group from
 	 firsti to i - 1.  */
-      vect_lower_load_permutations (loop_vinfo, bst_map,
-				    make_array_slice (&loads[firsti],
-						      i - firsti));
+      if (STMT_VINFO_GROUPED_ACCESS (a0))
+	vect_lower_load_permutations (loop_vinfo, bst_map,
+				      make_array_slice (&loads[firsti],
+							i - firsti));
       firsti = i;
     }
-  if (firsti < loads.length () - 1)
+  if (firsti < loads.length ()
+      && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
     vect_lower_load_permutations (loop_vinfo, bst_map,
 				  make_array_slice (&loads[firsti],
 						    loads.length () - firsti));
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index ace1c8eaa0d..e509ab506ef 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2186,7 +2186,6 @@  get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 	     blow up memory, see PR65518).  */
 	  if (loop_vinfo
 	      && *memory_access_type == VMAT_CONTIGUOUS
-	      && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
 	      && single_element_p
 	      && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
 	    {