diff mbox series

[3/3] Avoid using SLP_TREE_LOAD_PERMUTATION for non-grouped SLP loads

Message ID 20241015120504.A0C62385800F@sourceware.org
State New
Headers show
Series [1/3] Remove SLP_INSTANCE_UNROLLING_FACTOR, compute VF in vect_make_slp_decision | expand

Commit Message

Richard Biener Oct. 15, 2024, 12:04 p.m. UTC
The following makes sure to use a VEC_PERM SLP node to produce
lane duplications for non-grouped SLP loads as those are later
not lowered by load permutation lowering.

For some reason gcc.dg/vect/pr106081.c now fails permute optimization;
in particular, the vector reversal for the reduction is no longer elided.

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

	* tree-vect-slp.cc (vect_build_slp_tree_2): Use a VEC_PERM
	SLP node to duplicate lanes for non-grouped loads.

	* gcc.dg/vect/pr106081.c: Adjust.
---
 gcc/testsuite/gcc.dg/vect/pr106081.c |  2 +-
 gcc/tree-vect-slp.cc                 | 38 +++++++++++++++++++++++++++-
 2 files changed, 38 insertions(+), 2 deletions(-)
diff mbox series

Patch

diff --git a/gcc/testsuite/gcc.dg/vect/pr106081.c b/gcc/testsuite/gcc.dg/vect/pr106081.c
index 8f97af2d642..1864320c803 100644
--- a/gcc/testsuite/gcc.dg/vect/pr106081.c
+++ b/gcc/testsuite/gcc.dg/vect/pr106081.c
@@ -30,4 +30,4 @@  test(double *k)
 }
 
 /* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */
-/* { dg-final { scan-tree-dump-times "VEC_PERM" 4 "optimized" { target x86_64-*-* i?86-*-* } } } */
+/* { dg-final { scan-tree-dump-times "VEC_PERM" 5 "optimized" { target x86_64-*-* i?86-*-* } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index af00c5e35dd..b34064103bd 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -2088,7 +2088,43 @@  vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
 	    }
 	  else
 	    {
-	      SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
+	      if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
+		{
+		  /* Do not use SLP_TREE_LOAD_PERMUTATION for non-grouped
+		     accesses.  Instead, when lanes are duplicated, do so
+		     via a VEC_PERM node.  */
+		  if (!any_permute)
+		    load_permutation.release ();
+		  else
+		    {
+		      gcc_assert (group_size != 1);
+		      vec<stmt_vec_info> stmts2;
+		      stmts2.create (1);
+		      stmts2.quick_push (stmt_info);
+		      bool matches2;
+		      slp_tree unperm_load
+			= vect_build_slp_tree (vinfo, stmts2, 1,
+					       &this_max_nunits, &matches2,
+					       limit, &this_tree_size, bst_map);
+		      gcc_assert (unperm_load);
+		      lane_permutation_t lperm;
+		      lperm.create (group_size);
+		      for (unsigned j = 0; j < load_permutation.length (); ++j)
+			{
+			  gcc_assert (load_permutation[j] == 0);
+			  lperm.quick_push (std::make_pair (0, 0));
+			}
+		      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
+		      SLP_TREE_CHILDREN (node).safe_push (unperm_load);
+		      SLP_TREE_LANE_PERMUTATION (node) = lperm;
+		      load_permutation.release ();
+		      *max_nunits = this_max_nunits;
+		      (*tree_size)++;
+		      return node;
+		    }
+		}
+	      else
+		SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
 	      return node;
 	    }
 	}