diff mbox series

[3/3] middle-end: maintain LCSSA throughout loop peeling

Message ID ZRp0VD9Z6w4st7GM@arm.com
State New
Headers show
Series [1/3] middle-end: Refactor vectorizer loop conditionals and separate out IV to new variables | expand

Commit Message

Tamar Christina Oct. 2, 2023, 7:42 a.m. UTC
Hi All,

This final patch updates peeling to maintain LCSSA all the way through.

It's significantly easier to maintain it during peeling while we still know
where all new edges connect rather than touching it up later as is currently
being done.

This allows us to remove many of the helper functions that touch up the loops
at various parts.  The only complication is for loop distribution where we
should be able to use the same,  however ldist depending on whether
redirect_lc_phi_defs is true or not will either try to maintain a limited LCSSA
form itself or removes are non-virtual phis.

The problem here is that if we maintain LCSSA then in some cases the blocks
connecting the two loops get PHIs to keep the loop IV up to date.

However there is no loop, the guard condition is rewritten as 0 != 0, to the
"loop" always exits.   However due to the PHI nodes the probabilities get
completely wrong.  It seems to think that the impossible exit is the likely
edge.  This causes incorrect warnings and the presence of the PHIs prevent the
blocks to be simplified.

While it may be possible to make ldist work with LCSSA form, doing so seems more
work than not.  For that reason the peeling code has an additional parameter
used by only ldist to not connect the two loops during peeling.

This preserves the current behaviour from ldist until I can dive into the
implementation more.  Hopefully that's ok for now.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-linux-gnu, and
no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-loop-distribution.cc (copy_loop_before): Request no LCSSA.
	* tree-vect-loop-manip.cc (adjust_phi_and_debug_stmts): Add additional
	asserts.
	(slpeel_tree_duplicate_loop_to_edge_cfg): Keep LCSSA during peeling.
	(find_guard_arg): Look value up through explicit edge and original defs.
	(vect_do_peeling): Use it.
	(slpeel_update_phi_nodes_for_guard2): Take explicit exit edge.
	(slpeel_update_phi_nodes_for_lcssa, slpeel_update_phi_nodes_for_loops):
	Remove.
	* tree-vect-loop.cc (vect_create_epilog_for_reduction): Initialize phi.
	* tree-vectorizer.h (slpeel_tree_duplicate_loop_to_edge_cfg): Add
	optional param to turn off LCSSA mode.

--- inline copy of patch -- 
diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc
index 902edc49ab588152a5b845f2c8a42a7e2a1d6080..14fb884d3e91d79785867debaee4956a2d5b0bb1 100644




--
diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc
index 902edc49ab588152a5b845f2c8a42a7e2a1d6080..14fb884d3e91d79785867debaee4956a2d5b0bb1 100644
--- a/gcc/tree-loop-distribution.cc
+++ b/gcc/tree-loop-distribution.cc
@@ -950,7 +950,7 @@ copy_loop_before (class loop *loop, bool redirect_lc_phi_defs)
 
   initialize_original_copy_tables ();
   res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, single_exit (loop), NULL,
-						NULL, preheader, NULL);
+						NULL, preheader, NULL, false);
   gcc_assert (res != NULL);
 
   /* When a not last partition is supposed to keep the LC PHIs computed
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 77f8e668bcc8beca99ba4052e1b12e0d17300262..0e8c0be5384aab2399ed93966e7bf4918f6c87a5 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -252,6 +252,9 @@ adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
 {
   tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
 
+  gcc_assert (TREE_CODE (orig_def) != SSA_NAME
+	      || orig_def != new_def);
+
   SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
 
   if (MAY_HAVE_DEBUG_BIND_STMTS)
@@ -1445,12 +1448,19 @@ slpeel_duplicate_current_defs_from_edges (edge from, edge to)
    on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
    basic blocks from SCALAR_LOOP instead of LOOP, but to either the
-   entry or exit of LOOP.  */
+   entry or exit of LOOP.  If FLOW_LOOPS then connect LOOP to SCALAR_LOOP as a
+   continuation.  This is correct for cases where one loop continues from the
+   other like in the vectorizer, but not true for uses in e.g. loop distribution
+   where the loop is duplicated and then modified.
+
+   If UPDATED_DOMS is not NULL it is update with the list of basic blocks whoms
+   dominators were updated during the peeling.  */
 
 class loop *
 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 					class loop *scalar_loop,
-					edge scalar_exit, edge e, edge *new_e)
+					edge scalar_exit, edge e, edge *new_e,
+					bool flow_loops)
 {
   class loop *new_loop;
   basic_block *new_bbs, *bbs, *pbbs;
@@ -1481,6 +1491,8 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 	    scalar_exit	= exit;
 	    break;
 	  }
+
+      gcc_assert (scalar_exit);
     }
 
   bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
@@ -1513,6 +1525,8 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
   exit = loop_exit;
   basic_block new_preheader = new_bbs[0];
 
+  gcc_assert (new_exit);
+
   /* Record the new loop exit information.  new_loop doesn't have SCEV data and
      so we must initialize the exit information.  */
   if (new_e)
@@ -1551,6 +1565,19 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
   for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
     rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
 
+  /* Rename the exit uses.  */
+  for (edge exit : get_loop_exit_edges (new_loop))
+    for (auto gsi = gsi_start_phis (exit->dest);
+	 !gsi_end_p (gsi); gsi_next (&gsi))
+      {
+	tree orig_def = PHI_ARG_DEF_FROM_EDGE (gsi.phi (), exit);
+	rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), exit));
+	if (MAY_HAVE_DEBUG_BIND_STMTS)
+	  adjust_debug_stmts (orig_def, PHI_RESULT (gsi.phi ()), exit->dest);
+      }
+
+  /* This condition happens when the loop has been versioned. e.g. due to ifcvt
+     versioning the loop.  */
   if (scalar_loop != loop)
     {
       /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
@@ -1564,28 +1591,83 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 						EDGE_SUCC (loop->latch, 0));
     }
 
+  auto loop_exits = get_loop_exit_edges (loop);
+  auto_vec<basic_block> doms;
+
   if (at_exit) /* Add the loop copy at exit.  */
     {
-      if (scalar_loop != loop)
+      if (scalar_loop != loop && new_exit->dest != exit_dest)
 	{
-	  gphi_iterator gsi;
 	  new_exit = redirect_edge_and_branch (new_exit, exit_dest);
+	  flush_pending_stmts (new_exit);
+	}
 
-	  for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
-	       gsi_next (&gsi))
-	    {
-	      gphi *phi = gsi.phi ();
-	      tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
-	      location_t orig_locus
-		= gimple_phi_arg_location_from_edge (phi, e);
+      auto_vec <gimple *> new_phis;
+      hash_map <tree, tree> new_phi_args;
+      /* First create the empty phi nodes so that when we flush the
+	 statements they can be filled in.   However because there is no order
+	 between the PHI nodes in the exits and the loop headers we need to
+	 order them base on the order of the two headers.  First record the new
+	 phi nodes.  */
+      for (auto gsi_from = gsi_start_phis (scalar_exit->dest);
+	   !gsi_end_p (gsi_from); gsi_next (&gsi_from))
+	{
+	  gimple *from_phi = gsi_stmt (gsi_from);
+	  tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
+	  gphi *res = create_phi_node (new_res, new_preheader);
+	  new_phis.safe_push (res);
+	}
 
-	      add_phi_arg (phi, orig_arg, new_exit, orig_locus);
+      /* Then redirect the edges and flush the changes.  This writes out the new
+	 SSA names.  */
+      for (edge exit : loop_exits)
+	{
+	  edge e = redirect_edge_and_branch (exit, new_preheader);
+	  flush_pending_stmts (e);
+	}
+
+      /* Record the new SSA names in the cache so that we can skip materializing
+	 them again when we fill in the rest of the LCSSA variables.  */
+      for (auto phi : new_phis)
+	{
+	  tree new_arg = gimple_phi_arg (phi, 0)->def;
+	  new_phi_args.put (new_arg, gimple_phi_result (phi));
+	}
+
+      /* Copy the current loop LC PHI nodes between the original loop exit
+	 block and the new loop header.  This allows us to later split the
+	 preheader block and still find the right LC nodes.  */
+      edge latch_new = single_succ_edge (new_preheader);
+      for (auto gsi_from = gsi_start_phis (loop->header),
+	   gsi_to = gsi_start_phis (new_loop->header);
+	   flow_loops && !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
+	   gsi_next (&gsi_from), gsi_next (&gsi_to))
+	{
+	  gimple *from_phi = gsi_stmt (gsi_from);
+	  gimple *to_phi = gsi_stmt (gsi_to);
+	  tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
+						loop_latch_edge (loop));
+
+	  /* Check if we've already created a new phi node during edge
+	     redirection.  If we have, only propagate the value downwards.  */
+	  if (tree *res = new_phi_args.get (new_arg))
+	    {
+	      adjust_phi_and_debug_stmts (to_phi, latch_new, *res);
+	      continue;
 	    }
+
+	  tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
+	  gphi *lcssa_phi = create_phi_node (new_res, e->dest);
+
+	  /* Main loop exit should use the final iter value.  */
+	  add_phi_arg (lcssa_phi, new_arg, loop_exit, UNKNOWN_LOCATION);
+
+	  adjust_phi_and_debug_stmts (to_phi, latch_new, new_res);
 	}
-      redirect_edge_and_branch_force (e, new_preheader);
-      flush_pending_stmts (e);
+
       set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
-      if (was_imm_dom || duplicate_outer_loop)
+
+      if ((was_imm_dom || duplicate_outer_loop))
 	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
 
       /* And remove the non-necessary forwarder again.  Keep the other
@@ -1598,6 +1680,22 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
     }
   else /* Add the copy at entry.  */
     {
+      /* Copy the current loop LC PHI nodes between the original loop exit
+	 block and the new loop header.  This allows us to later split the
+	 preheader block and still find the right LC nodes.  */
+      for (auto gsi_from = gsi_start_phis (new_loop->header),
+	   gsi_to = gsi_start_phis (loop->header);
+	   flow_loops && !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
+	   gsi_next (&gsi_from), gsi_next (&gsi_to))
+	{
+	  gimple *from_phi = gsi_stmt (gsi_from);
+	  gimple *to_phi = gsi_stmt (gsi_to);
+	  tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
+						loop_latch_edge (new_loop));
+	  adjust_phi_and_debug_stmts (to_phi, loop_preheader_edge (loop),
+				      new_arg);
+	}
+
       if (scalar_loop != loop)
 	{
 	  /* Remove the non-necessary forwarder of scalar_loop again.  */
@@ -1627,29 +1725,6 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 			       loop_preheader_edge (new_loop)->src);
     }
 
-  if (scalar_loop != loop)
-    {
-      /* Update new_loop->header PHIs, so that on the preheader
-	 edge they are the ones from loop rather than scalar_loop.  */
-      gphi_iterator gsi_orig, gsi_new;
-      edge orig_e = loop_preheader_edge (loop);
-      edge new_e = loop_preheader_edge (new_loop);
-
-      for (gsi_orig = gsi_start_phis (loop->header),
-	   gsi_new = gsi_start_phis (new_loop->header);
-	   !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
-	   gsi_next (&gsi_orig), gsi_next (&gsi_new))
-	{
-	  gphi *orig_phi = gsi_orig.phi ();
-	  gphi *new_phi = gsi_new.phi ();
-	  tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
-	  location_t orig_locus
-	    = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
-
-	  add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
-	}
-    }
-
   free (new_bbs);
   free (bbs);
 
@@ -2579,139 +2654,36 @@ vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
 
 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
    this function searches for the corresponding lcssa phi node in exit
-   bb of LOOP.  If it is found, return the phi result; otherwise return
-   NULL.  */
+   bb of LOOP following the LCSSA_EDGE to the exit node.  If it is found,
+   return the phi result; otherwise return NULL.  */
 
 static tree
 find_guard_arg (class loop *loop ATTRIBUTE_UNUSED,
 		class loop *epilog ATTRIBUTE_UNUSED,
-		const_edge e, gphi *lcssa_phi)
+		const_edge e, gphi *lcssa_phi, int lcssa_edge = 0)
 {
   gphi_iterator gsi;
 
-  gcc_assert (single_pred_p (e->dest));
   for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
     {
       gphi *phi = gsi.phi ();
-      if (operand_equal_p (PHI_ARG_DEF (phi, 0),
-			   PHI_ARG_DEF (lcssa_phi, 0), 0))
-	return PHI_RESULT (phi);
-    }
-  return NULL_TREE;
-}
-
-/* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates FIRST/SECOND
-   from SECOND/FIRST and puts it at the original loop's preheader/exit
-   edge, the two loops are arranged as below:
-
-       preheader_a:
-     first_loop:
-       header_a:
-	 i_1 = PHI<i_0, i_2>;
-	 ...
-	 i_2 = i_1 + 1;
-	 if (cond_a)
-	   goto latch_a;
-	 else
-	   goto between_bb;
-       latch_a:
-	 goto header_a;
-
-       between_bb:
-	 ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
-
-     second_loop:
-       header_b:
-	 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
-				 or with i_2 if no LCSSA phi is created
-				 under condition of CREATE_LCSSA_FOR_IV_PHIS.
-	 ...
-	 i_4 = i_3 + 1;
-	 if (cond_b)
-	   goto latch_b;
-	 else
-	   goto exit_bb;
-       latch_b:
-	 goto header_b;
-
-       exit_bb:
-
-   This function creates loop closed SSA for the first loop; update the
-   second loop's PHI nodes by replacing argument on incoming edge with the
-   result of newly created lcssa PHI nodes.  IF CREATE_LCSSA_FOR_IV_PHIS
-   is false, Loop closed ssa phis will only be created for non-iv phis for
-   the first loop.
-
-   This function assumes exit bb of the first loop is preheader bb of the
-   second loop, i.e, between_bb in the example code.  With PHIs updated,
-   the second loop will execute rest iterations of the first.  */
-
-static void
-slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
-				   class loop *first, edge first_loop_e,
-				   class loop *second, edge second_loop_e,
-				   bool create_lcssa_for_iv_phis)
-{
-  gphi_iterator gsi_update, gsi_orig;
-  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-
-  edge first_latch_e = EDGE_SUCC (first->latch, 0);
-  edge second_preheader_e = loop_preheader_edge (second);
-  basic_block between_bb = first_loop_e->dest;
-
-  gcc_assert (between_bb == second_preheader_e->src);
-  gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
-  /* Either the first loop or the second is the loop to be vectorized.  */
-  gcc_assert (loop == first || loop == second);
-
-  for (gsi_orig = gsi_start_phis (first->header),
-       gsi_update = gsi_start_phis (second->header);
-       !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
-       gsi_next (&gsi_orig), gsi_next (&gsi_update))
-    {
-      gphi *orig_phi = gsi_orig.phi ();
-      gphi *update_phi = gsi_update.phi ();
-
-      tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
-      /* Generate lcssa PHI node for the first loop.  */
-      gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
-      stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
-      if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
+      /* Nested loops with multiple exits can have different no# phi node
+	arguments between the main loop and epilog as epilog falls to the
+	second loop.  */
+      if (gimple_phi_num_args (phi) > e->dest_idx)
 	{
-	  tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
-	  gphi *lcssa_phi = create_phi_node (new_res, between_bb);
-	  add_phi_arg (lcssa_phi, arg, first_loop_e, UNKNOWN_LOCATION);
-	  arg = new_res;
-	}
-
-      /* Update PHI node in the second loop by replacing arg on the loop's
-	 incoming edge.  */
-      adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
-    }
-
-  /* For epilogue peeling we have to make sure to copy all LC PHIs
-     for correct vectorization of live stmts.  */
-  if (loop == first)
-    {
-      basic_block orig_exit = second_loop_e->dest;
-      for (gsi_orig = gsi_start_phis (orig_exit);
-	   !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
-	{
-	  gphi *orig_phi = gsi_orig.phi ();
-	  tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
-	  if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p  (orig_arg))
-	    continue;
-
-	  const_edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
-	  /* Already created in the above loop.   */
-	  if (find_guard_arg (first, second, exit_e, orig_phi))
+	 tree var = PHI_ARG_DEF (phi, e->dest_idx);
+	 if (TREE_CODE (var) != SSA_NAME)
 	    continue;
-
-	  tree new_res = copy_ssa_name (orig_arg);
-	  gphi *lcphi = create_phi_node (new_res, between_bb);
-	  add_phi_arg (lcphi, orig_arg, first_loop_e, UNKNOWN_LOCATION);
+	 tree def = get_current_def (var);
+	 if (!def)
+	   continue;
+	 if (operand_equal_p (def,
+			      PHI_ARG_DEF (lcssa_phi, lcssa_edge), 0))
+	   return PHI_RESULT (phi);
 	}
     }
+  return NULL_TREE;
 }
 
 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
@@ -2796,11 +2768,11 @@ slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
     }
 }
 
-/* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
-   from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
-   point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
-   and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
-   The CFG looks like:
+/* LOOP and EPILOG are two consecutive loops in CFG connected by LOOP_EXIT edge
+   and EPILOG is copied from LOOP.  Function slpeel_add_loop_guard adds guard
+   skipping from a point between the two loops to the end of EPILOG.  Edges
+   GUARD_EDGE and MERGE_EDGE are the two pred edges of merge_bb at the end of
+   EPILOG.  The CFG looks like:
 
      loop:
        header_a:
@@ -2851,6 +2823,7 @@ slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
 
 static void
 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
+				    const_edge loop_exit,
 				    edge guard_edge, edge merge_edge)
 {
   gphi_iterator gsi;
@@ -2859,13 +2832,11 @@ slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
   gcc_assert (single_succ_p (merge_bb));
   edge e = single_succ_edge (merge_bb);
   basic_block exit_bb = e->dest;
-  gcc_assert (single_pred_p (exit_bb));
-  gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
 
   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
     {
       gphi *update_phi = gsi.phi ();
-      tree old_arg = PHI_ARG_DEF (update_phi, 0);
+      tree old_arg = PHI_ARG_DEF (update_phi, e->dest_idx);
 
       tree merge_arg = NULL_TREE;
 
@@ -2877,8 +2848,8 @@ slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
       if (!merge_arg)
 	merge_arg = old_arg;
 
-      tree guard_arg
-	= find_guard_arg (loop, epilog, single_exit (loop), update_phi);
+      tree guard_arg = find_guard_arg (loop, epilog, loop_exit,
+				       update_phi, e->dest_idx);
       /* If the var is live after loop but not a reduction, we simply
 	 use the old arg.  */
       if (!guard_arg)
@@ -2898,21 +2869,6 @@ slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
     }
 }
 
-/* EPILOG loop is duplicated from the original loop for vectorizing,
-   the arg of its loop closed ssa PHI needs to be updated.  */
-
-static void
-slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
-{
-  gphi_iterator gsi;
-  basic_block exit_bb = single_exit (epilog)->dest;
-
-  gcc_assert (single_pred_p (exit_bb));
-  edge e = EDGE_PRED (exit_bb, 0);
-  for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
-    rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
-}
-
 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
    Return a value that equals:
 
@@ -3255,8 +3211,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 						       e, &prolog_e);
       gcc_assert (prolog);
       prolog->force_vectorize = false;
-      slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, prolog_e, loop,
-					 exit_e, true);
+
       first_loop = prolog;
       reset_original_copy_tables ();
 
@@ -3336,8 +3291,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
       LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo) = new_epilog_e;
       gcc_assert (epilog);
       epilog->force_vectorize = false;
-      slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, e, epilog,
-					 new_epilog_e, false);
       bb_before_epilog = loop_preheader_edge (epilog)->src;
 
       /* Scalar version loop may be preferred.  In this case, add guard
@@ -3430,7 +3383,9 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 					   irred_flag);
 	  if (vect_epilogues)
 	    epilogue_vinfo->skip_this_loop_edge = guard_e;
-	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e, epilog_e);
+	  edge main_iv = LOOP_VINFO_IV_EXIT (loop_vinfo);
+	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, main_iv, guard_e,
+					      epilog_e);
 	  /* Only need to handle basic block before epilog loop if it's not
 	     the guard_bb, which is the case when skip_vector is true.  */
 	  if (guard_bb != bb_before_epilog)
@@ -3441,8 +3396,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 	    }
 	  scale_loop_profile (epilog, prob_epilog, -1);
 	}
-      else
-	slpeel_update_phi_nodes_for_lcssa (epilog);
 
       unsigned HOST_WIDE_INT bound;
       if (bound_scalar.is_constant (&bound))
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index f1caa5f207d3b13da58c3a313b11d1ef98374349..327cab0f736da7f1bd3e024d666df46ef9208107 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -5877,7 +5877,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   basic_block exit_bb;
   tree scalar_dest;
   tree scalar_type;
-  gimple *new_phi = NULL, *phi;
+  gimple *new_phi = NULL, *phi = NULL;
   gimple_stmt_iterator exit_gsi;
   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
   gimple *epilog_stmt = NULL;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 55b6771b271d5072fa1327d595e1dddb112cfdf6..25ceb6600673d71fd6012443403997e921066483 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2183,7 +2183,7 @@ extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge,
 					 const_edge);
 class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *, edge,
 						    class loop *, edge,
-						    edge, edge *);
+						    edge, edge *, bool = true);
 class loop *vect_loop_versioning (loop_vec_info, gimple *);
 extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
 				    tree *, tree *, tree *, int, bool, bool,

Comments

Richard Biener Oct. 10, 2023, 12:59 p.m. UTC | #1
On Mon, 2 Oct 2023, Tamar Christina wrote:

> Hi All,
> 
> This final patch updates peeling to maintain LCSSA all the way through.
> 
> It's significantly easier to maintain it during peeling while we still know
> where all new edges connect rather than touching it up later as is currently
> being done.
> 
> This allows us to remove many of the helper functions that touch up the loops
> at various parts.  The only complication is for loop distribution where we
> should be able to use the same,  however ldist depending on whether
> redirect_lc_phi_defs is true or not will either try to maintain a limited LCSSA
> form itself or removes are non-virtual phis.
> 
> The problem here is that if we maintain LCSSA then in some cases the blocks
> connecting the two loops get PHIs to keep the loop IV up to date.
> 
> However there is no loop, the guard condition is rewritten as 0 != 0, to the
> "loop" always exits.   However due to the PHI nodes the probabilities get
> completely wrong.  It seems to think that the impossible exit is the likely
> edge.  This causes incorrect warnings and the presence of the PHIs prevent the
> blocks to be simplified.
> 
> While it may be possible to make ldist work with LCSSA form, doing so seems more
> work than not.  For that reason the peeling code has an additional parameter
> used by only ldist to not connect the two loops during peeling.
> 
> This preserves the current behaviour from ldist until I can dive into the
> implementation more.  Hopefully that's ok for now.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-linux-gnu, and
> no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* tree-loop-distribution.cc (copy_loop_before): Request no LCSSA.
> 	* tree-vect-loop-manip.cc (adjust_phi_and_debug_stmts): Add additional
> 	asserts.
> 	(slpeel_tree_duplicate_loop_to_edge_cfg): Keep LCSSA during peeling.
> 	(find_guard_arg): Look value up through explicit edge and original defs.
> 	(vect_do_peeling): Use it.
> 	(slpeel_update_phi_nodes_for_guard2): Take explicit exit edge.
> 	(slpeel_update_phi_nodes_for_lcssa, slpeel_update_phi_nodes_for_loops):
> 	Remove.
> 	* tree-vect-loop.cc (vect_create_epilog_for_reduction): Initialize phi.
> 	* tree-vectorizer.h (slpeel_tree_duplicate_loop_to_edge_cfg): Add
> 	optional param to turn off LCSSA mode.
> 
> --- inline copy of patch -- 
> diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc
> index 902edc49ab588152a5b845f2c8a42a7e2a1d6080..14fb884d3e91d79785867debaee4956a2d5b0bb1 100644
> --- a/gcc/tree-loop-distribution.cc
> +++ b/gcc/tree-loop-distribution.cc
> @@ -950,7 +950,7 @@ copy_loop_before (class loop *loop, bool redirect_lc_phi_defs)
>  
>    initialize_original_copy_tables ();
>    res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, single_exit (loop), NULL,
> -						NULL, preheader, NULL);
> +						NULL, preheader, NULL, false);
>    gcc_assert (res != NULL);
>  
>    /* When a not last partition is supposed to keep the LC PHIs computed
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index 77f8e668bcc8beca99ba4052e1b12e0d17300262..0e8c0be5384aab2399ed93966e7bf4918f6c87a5 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -252,6 +252,9 @@ adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
>  {
>    tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
>  
> +  gcc_assert (TREE_CODE (orig_def) != SSA_NAME
> +	      || orig_def != new_def);
> +
>    SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
>  
>    if (MAY_HAVE_DEBUG_BIND_STMTS)
> @@ -1445,12 +1448,19 @@ slpeel_duplicate_current_defs_from_edges (edge from, edge to)
>     on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
>     non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
>     basic blocks from SCALAR_LOOP instead of LOOP, but to either the
> -   entry or exit of LOOP.  */
> +   entry or exit of LOOP.  If FLOW_LOOPS then connect LOOP to SCALAR_LOOP as a
> +   continuation.  This is correct for cases where one loop continues from the
> +   other like in the vectorizer, but not true for uses in e.g. loop distribution
> +   where the loop is duplicated and then modified.

But for loop distribution the other loop also "continues" from the other,
maybe better say ", but not true for uses in e.g. loop distribution where
the contents of the loop body are split but the iteration space of both
copies remains the same."  It's an implementation limitation in loop
distribution that it for example doesn't support producing reductions
as the first loop (aka it cannot handle LC PHI nodes "inbetween").

> +
> +   If UPDATED_DOMS is not NULL it is update with the list of basic blocks whoms
> +   dominators were updated during the peeling.  */
>  
>  class loop *
>  slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>  					class loop *scalar_loop,
> -					edge scalar_exit, edge e, edge *new_e)
> +					edge scalar_exit, edge e, edge *new_e,
> +					bool flow_loops)
>  {
>    class loop *new_loop;
>    basic_block *new_bbs, *bbs, *pbbs;
> @@ -1481,6 +1491,8 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>  	    scalar_exit	= exit;
>  	    break;
>  	  }
> +
> +      gcc_assert (scalar_exit);
>      }
>  
>    bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
> @@ -1513,6 +1525,8 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>    exit = loop_exit;
>    basic_block new_preheader = new_bbs[0];
>  
> +  gcc_assert (new_exit);
> +
>    /* Record the new loop exit information.  new_loop doesn't have SCEV data and
>       so we must initialize the exit information.  */
>    if (new_e)
> @@ -1551,6 +1565,19 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>    for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
>      rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
>  
> +  /* Rename the exit uses.  */
> +  for (edge exit : get_loop_exit_edges (new_loop))
> +    for (auto gsi = gsi_start_phis (exit->dest);
> +	 !gsi_end_p (gsi); gsi_next (&gsi))
> +      {
> +	tree orig_def = PHI_ARG_DEF_FROM_EDGE (gsi.phi (), exit);
> +	rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), exit));
> +	if (MAY_HAVE_DEBUG_BIND_STMTS)
> +	  adjust_debug_stmts (orig_def, PHI_RESULT (gsi.phi ()), exit->dest);
> +      }
> +
> +  /* This condition happens when the loop has been versioned. e.g. due to ifcvt
> +     versioning the loop.  */
>    if (scalar_loop != loop)
>      {
>        /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
> @@ -1564,28 +1591,83 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>  						EDGE_SUCC (loop->latch, 0));
>      }
>  
> +  auto loop_exits = get_loop_exit_edges (loop);
> +  auto_vec<basic_block> doms;
> +
>    if (at_exit) /* Add the loop copy at exit.  */
>      {
> -      if (scalar_loop != loop)
> +      if (scalar_loop != loop && new_exit->dest != exit_dest)
>  	{
> -	  gphi_iterator gsi;
>  	  new_exit = redirect_edge_and_branch (new_exit, exit_dest);
> +	  flush_pending_stmts (new_exit);
> +	}
>  
> -	  for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
> -	       gsi_next (&gsi))
> -	    {
> -	      gphi *phi = gsi.phi ();
> -	      tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
> -	      location_t orig_locus
> -		= gimple_phi_arg_location_from_edge (phi, e);
> +      auto_vec <gimple *> new_phis;
> +      hash_map <tree, tree> new_phi_args;
> +      /* First create the empty phi nodes so that when we flush the
> +	 statements they can be filled in.   However because there is no order
> +	 between the PHI nodes in the exits and the loop headers we need to
> +	 order them base on the order of the two headers.  First record the new
> +	 phi nodes.  */
> +      for (auto gsi_from = gsi_start_phis (scalar_exit->dest);
> +	   !gsi_end_p (gsi_from); gsi_next (&gsi_from))
> +	{
> +	  gimple *from_phi = gsi_stmt (gsi_from);
> +	  tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
> +	  gphi *res = create_phi_node (new_res, new_preheader);
> +	  new_phis.safe_push (res);
> +	}
>  
> -	      add_phi_arg (phi, orig_arg, new_exit, orig_locus);
> +      /* Then redirect the edges and flush the changes.  This writes out the new
> +	 SSA names.  */
> +      for (edge exit : loop_exits)

I realize at the moment it's the same, but we are redirecting
multiple exit edges here and from the walk above expect them
all to have the same set of PHI nodes - that looks a bit fragile?
Does this need adjustments later for the early exit vectorization?

This also somewhat confuses the original redirection of 'e', the main
exit with the later (*)

> +	{
> +	  edge e = redirect_edge_and_branch (exit, new_preheader);
> +	  flush_pending_stmts (e);
> +	}
> +
> +      /* Record the new SSA names in the cache so that we can skip materializing
> +	 them again when we fill in the rest of the LCSSA variables.  */
> +      for (auto phi : new_phis)
> +	{
> +	  tree new_arg = gimple_phi_arg (phi, 0)->def;

and here you look at the (for now) single edge we redirected ...

> +	  new_phi_args.put (new_arg, gimple_phi_result (phi));
> +	}
> +
> +      /* Copy the current loop LC PHI nodes between the original loop exit
> +	 block and the new loop header.  This allows us to later split the
> +	 preheader block and still find the right LC nodes.  */
> +      edge latch_new = single_succ_edge (new_preheader);

odd name - the single successor of a loop preheader is the loop
header and the corresponding edge is the loop entry edge, not the latch?

> +      for (auto gsi_from = gsi_start_phis (loop->header),
> +	   gsi_to = gsi_start_phis (new_loop->header);
> +	   flow_loops && !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);

Eh, can we have

  if (flow_loops)
    for  (auto ...)

please, even if that indents more?

> +	   gsi_next (&gsi_from), gsi_next (&gsi_to))
> +	{
> +	  gimple *from_phi = gsi_stmt (gsi_from);
> +	  gimple *to_phi = gsi_stmt (gsi_to);
> +	  tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
> +						loop_latch_edge (loop));
> +
> +	  /* Check if we've already created a new phi node during edge
> +	     redirection.  If we have, only propagate the value downwards.  */
> +	  if (tree *res = new_phi_args.get (new_arg))
> +	    {
> +	      adjust_phi_and_debug_stmts (to_phi, latch_new, *res);
> +	      continue;
>  	    }
> +
> +	  tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
> +	  gphi *lcssa_phi = create_phi_node (new_res, e->dest);
> +
> +	  /* Main loop exit should use the final iter value.  */
> +	  add_phi_arg (lcssa_phi, new_arg, loop_exit, UNKNOWN_LOCATION);

For all other edges into the loop besides 'e' there's missing
PHI arguments?  You are using 'e' here again, but also use that as
temporary in for blocks, shadowing the parameter - that makes it
difficult to read.  Also it's sometimes 'e->dest' and sometimes
new_preheader - I think you want to use new_preheader here as well
(in create_phi_node) for consistency and ease of understanding.

ISTR when early break vectorization lands we're going to redirect
the alternate exits away again "fixing" the missing PHI args.

> +
> +	  adjust_phi_and_debug_stmts (to_phi, latch_new, new_res);
>  	}
> -      redirect_edge_and_branch_force (e, new_preheader);
> -      flush_pending_stmts (e);
> +
>        set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
> -      if (was_imm_dom || duplicate_outer_loop)
> +
> +      if ((was_imm_dom || duplicate_outer_loop))

extra ()s

>  	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
>  
>        /* And remove the non-necessary forwarder again.  Keep the other
> @@ -1598,6 +1680,22 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>      }
>    else /* Add the copy at entry.  */
>      {
> +      /* Copy the current loop LC PHI nodes between the original loop exit
> +	 block and the new loop header.  This allows us to later split the
> +	 preheader block and still find the right LC nodes.  */
> +      for (auto gsi_from = gsi_start_phis (new_loop->header),
> +	   gsi_to = gsi_start_phis (loop->header);
> +	   flow_loops && !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);

same if (flow_loops)

> +	   gsi_next (&gsi_from), gsi_next (&gsi_to))
> +	{
> +	  gimple *from_phi = gsi_stmt (gsi_from);
> +	  gimple *to_phi = gsi_stmt (gsi_to);
> +	  tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
> +						loop_latch_edge (new_loop));

this looks wrong?  IMHO it should be the PHI_RESULT, no?  Note this
only triggers for alignment peeling ...

Otherwise looks OK.

Thanks,
Richard.


> +	  adjust_phi_and_debug_stmts (to_phi, loop_preheader_edge (loop),
> +				      new_arg);
> +	}
> +
>        if (scalar_loop != loop)
>  	{
>  	  /* Remove the non-necessary forwarder of scalar_loop again.  */
> @@ -1627,29 +1725,6 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>  			       loop_preheader_edge (new_loop)->src);
>      }
>  
> -  if (scalar_loop != loop)
> -    {
> -      /* Update new_loop->header PHIs, so that on the preheader
> -	 edge they are the ones from loop rather than scalar_loop.  */
> -      gphi_iterator gsi_orig, gsi_new;
> -      edge orig_e = loop_preheader_edge (loop);
> -      edge new_e = loop_preheader_edge (new_loop);
> -
> -      for (gsi_orig = gsi_start_phis (loop->header),
> -	   gsi_new = gsi_start_phis (new_loop->header);
> -	   !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
> -	   gsi_next (&gsi_orig), gsi_next (&gsi_new))
> -	{
> -	  gphi *orig_phi = gsi_orig.phi ();
> -	  gphi *new_phi = gsi_new.phi ();
> -	  tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
> -	  location_t orig_locus
> -	    = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
> -
> -	  add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
> -	}
> -    }
> -
>    free (new_bbs);
>    free (bbs);
>  
> @@ -2579,139 +2654,36 @@ vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
>  
>  /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
>     this function searches for the corresponding lcssa phi node in exit
> -   bb of LOOP.  If it is found, return the phi result; otherwise return
> -   NULL.  */
> +   bb of LOOP following the LCSSA_EDGE to the exit node.  If it is found,
> +   return the phi result; otherwise return NULL.  */
>  
>  static tree
>  find_guard_arg (class loop *loop ATTRIBUTE_UNUSED,
>  		class loop *epilog ATTRIBUTE_UNUSED,
> -		const_edge e, gphi *lcssa_phi)
> +		const_edge e, gphi *lcssa_phi, int lcssa_edge = 0)
>  {
>    gphi_iterator gsi;
>  
> -  gcc_assert (single_pred_p (e->dest));
>    for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
>      {
>        gphi *phi = gsi.phi ();
> -      if (operand_equal_p (PHI_ARG_DEF (phi, 0),
> -			   PHI_ARG_DEF (lcssa_phi, 0), 0))
> -	return PHI_RESULT (phi);
> -    }
> -  return NULL_TREE;
> -}
> -
> -/* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates FIRST/SECOND
> -   from SECOND/FIRST and puts it at the original loop's preheader/exit
> -   edge, the two loops are arranged as below:
> -
> -       preheader_a:
> -     first_loop:
> -       header_a:
> -	 i_1 = PHI<i_0, i_2>;
> -	 ...
> -	 i_2 = i_1 + 1;
> -	 if (cond_a)
> -	   goto latch_a;
> -	 else
> -	   goto between_bb;
> -       latch_a:
> -	 goto header_a;
> -
> -       between_bb:
> -	 ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
> -
> -     second_loop:
> -       header_b:
> -	 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
> -				 or with i_2 if no LCSSA phi is created
> -				 under condition of CREATE_LCSSA_FOR_IV_PHIS.
> -	 ...
> -	 i_4 = i_3 + 1;
> -	 if (cond_b)
> -	   goto latch_b;
> -	 else
> -	   goto exit_bb;
> -       latch_b:
> -	 goto header_b;
> -
> -       exit_bb:
> -
> -   This function creates loop closed SSA for the first loop; update the
> -   second loop's PHI nodes by replacing argument on incoming edge with the
> -   result of newly created lcssa PHI nodes.  IF CREATE_LCSSA_FOR_IV_PHIS
> -   is false, Loop closed ssa phis will only be created for non-iv phis for
> -   the first loop.
> -
> -   This function assumes exit bb of the first loop is preheader bb of the
> -   second loop, i.e, between_bb in the example code.  With PHIs updated,
> -   the second loop will execute rest iterations of the first.  */
> -
> -static void
> -slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
> -				   class loop *first, edge first_loop_e,
> -				   class loop *second, edge second_loop_e,
> -				   bool create_lcssa_for_iv_phis)
> -{
> -  gphi_iterator gsi_update, gsi_orig;
> -  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> -
> -  edge first_latch_e = EDGE_SUCC (first->latch, 0);
> -  edge second_preheader_e = loop_preheader_edge (second);
> -  basic_block between_bb = first_loop_e->dest;
> -
> -  gcc_assert (between_bb == second_preheader_e->src);
> -  gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
> -  /* Either the first loop or the second is the loop to be vectorized.  */
> -  gcc_assert (loop == first || loop == second);
> -
> -  for (gsi_orig = gsi_start_phis (first->header),
> -       gsi_update = gsi_start_phis (second->header);
> -       !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
> -       gsi_next (&gsi_orig), gsi_next (&gsi_update))
> -    {
> -      gphi *orig_phi = gsi_orig.phi ();
> -      gphi *update_phi = gsi_update.phi ();
> -
> -      tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
> -      /* Generate lcssa PHI node for the first loop.  */
> -      gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
> -      stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
> -      if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
> +      /* Nested loops with multiple exits can have different no# phi node
> +	arguments between the main loop and epilog as epilog falls to the
> +	second loop.  */
> +      if (gimple_phi_num_args (phi) > e->dest_idx)
>  	{
> -	  tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
> -	  gphi *lcssa_phi = create_phi_node (new_res, between_bb);
> -	  add_phi_arg (lcssa_phi, arg, first_loop_e, UNKNOWN_LOCATION);
> -	  arg = new_res;
> -	}
> -
> -      /* Update PHI node in the second loop by replacing arg on the loop's
> -	 incoming edge.  */
> -      adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
> -    }
> -
> -  /* For epilogue peeling we have to make sure to copy all LC PHIs
> -     for correct vectorization of live stmts.  */
> -  if (loop == first)
> -    {
> -      basic_block orig_exit = second_loop_e->dest;
> -      for (gsi_orig = gsi_start_phis (orig_exit);
> -	   !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
> -	{
> -	  gphi *orig_phi = gsi_orig.phi ();
> -	  tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
> -	  if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p  (orig_arg))
> -	    continue;
> -
> -	  const_edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
> -	  /* Already created in the above loop.   */
> -	  if (find_guard_arg (first, second, exit_e, orig_phi))
> +	 tree var = PHI_ARG_DEF (phi, e->dest_idx);
> +	 if (TREE_CODE (var) != SSA_NAME)
>  	    continue;
> -
> -	  tree new_res = copy_ssa_name (orig_arg);
> -	  gphi *lcphi = create_phi_node (new_res, between_bb);
> -	  add_phi_arg (lcphi, orig_arg, first_loop_e, UNKNOWN_LOCATION);
> +	 tree def = get_current_def (var);
> +	 if (!def)
> +	   continue;
> +	 if (operand_equal_p (def,
> +			      PHI_ARG_DEF (lcssa_phi, lcssa_edge), 0))
> +	   return PHI_RESULT (phi);
>  	}
>      }
> +  return NULL_TREE;
>  }
>  
>  /* Function slpeel_add_loop_guard adds guard skipping from the beginning
> @@ -2796,11 +2768,11 @@ slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
>      }
>  }
>  
> -/* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
> -   from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
> -   point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
> -   and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
> -   The CFG looks like:
> +/* LOOP and EPILOG are two consecutive loops in CFG connected by LOOP_EXIT edge
> +   and EPILOG is copied from LOOP.  Function slpeel_add_loop_guard adds guard
> +   skipping from a point between the two loops to the end of EPILOG.  Edges
> +   GUARD_EDGE and MERGE_EDGE are the two pred edges of merge_bb at the end of
> +   EPILOG.  The CFG looks like:
>  
>       loop:
>         header_a:
> @@ -2851,6 +2823,7 @@ slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
>  
>  static void
>  slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
> +				    const_edge loop_exit,
>  				    edge guard_edge, edge merge_edge)
>  {
>    gphi_iterator gsi;
> @@ -2859,13 +2832,11 @@ slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
>    gcc_assert (single_succ_p (merge_bb));
>    edge e = single_succ_edge (merge_bb);
>    basic_block exit_bb = e->dest;
> -  gcc_assert (single_pred_p (exit_bb));
> -  gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
>  
>    for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
>      {
>        gphi *update_phi = gsi.phi ();
> -      tree old_arg = PHI_ARG_DEF (update_phi, 0);
> +      tree old_arg = PHI_ARG_DEF (update_phi, e->dest_idx);
>  
>        tree merge_arg = NULL_TREE;
>  
> @@ -2877,8 +2848,8 @@ slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
>        if (!merge_arg)
>  	merge_arg = old_arg;
>  
> -      tree guard_arg
> -	= find_guard_arg (loop, epilog, single_exit (loop), update_phi);
> +      tree guard_arg = find_guard_arg (loop, epilog, loop_exit,
> +				       update_phi, e->dest_idx);
>        /* If the var is live after loop but not a reduction, we simply
>  	 use the old arg.  */
>        if (!guard_arg)
> @@ -2898,21 +2869,6 @@ slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
>      }
>  }
>  
> -/* EPILOG loop is duplicated from the original loop for vectorizing,
> -   the arg of its loop closed ssa PHI needs to be updated.  */
> -
> -static void
> -slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
> -{
> -  gphi_iterator gsi;
> -  basic_block exit_bb = single_exit (epilog)->dest;
> -
> -  gcc_assert (single_pred_p (exit_bb));
> -  edge e = EDGE_PRED (exit_bb, 0);
> -  for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
> -    rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
> -}
> -
>  /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
>     Return a value that equals:
>  
> @@ -3255,8 +3211,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
>  						       e, &prolog_e);
>        gcc_assert (prolog);
>        prolog->force_vectorize = false;
> -      slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, prolog_e, loop,
> -					 exit_e, true);
> +
>        first_loop = prolog;
>        reset_original_copy_tables ();
>  
> @@ -3336,8 +3291,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
>        LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo) = new_epilog_e;
>        gcc_assert (epilog);
>        epilog->force_vectorize = false;
> -      slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, e, epilog,
> -					 new_epilog_e, false);
>        bb_before_epilog = loop_preheader_edge (epilog)->src;
>  
>        /* Scalar version loop may be preferred.  In this case, add guard
> @@ -3430,7 +3383,9 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
>  					   irred_flag);
>  	  if (vect_epilogues)
>  	    epilogue_vinfo->skip_this_loop_edge = guard_e;
> -	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e, epilog_e);
> +	  edge main_iv = LOOP_VINFO_IV_EXIT (loop_vinfo);
> +	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, main_iv, guard_e,
> +					      epilog_e);
>  	  /* Only need to handle basic block before epilog loop if it's not
>  	     the guard_bb, which is the case when skip_vector is true.  */
>  	  if (guard_bb != bb_before_epilog)
> @@ -3441,8 +3396,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
>  	    }
>  	  scale_loop_profile (epilog, prob_epilog, -1);
>  	}
> -      else
> -	slpeel_update_phi_nodes_for_lcssa (epilog);
>  
>        unsigned HOST_WIDE_INT bound;
>        if (bound_scalar.is_constant (&bound))
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index f1caa5f207d3b13da58c3a313b11d1ef98374349..327cab0f736da7f1bd3e024d666df46ef9208107 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -5877,7 +5877,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>    basic_block exit_bb;
>    tree scalar_dest;
>    tree scalar_type;
> -  gimple *new_phi = NULL, *phi;
> +  gimple *new_phi = NULL, *phi = NULL;
>    gimple_stmt_iterator exit_gsi;
>    tree new_temp = NULL_TREE, new_name, new_scalar_dest;
>    gimple *epilog_stmt = NULL;
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 55b6771b271d5072fa1327d595e1dddb112cfdf6..25ceb6600673d71fd6012443403997e921066483 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2183,7 +2183,7 @@ extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge,
>  					 const_edge);
>  class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *, edge,
>  						    class loop *, edge,
> -						    edge, edge *);
> +						    edge, edge *, bool = true);
>  class loop *vect_loop_versioning (loop_vec_info, gimple *);
>  extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
>  				    tree *, tree *, tree *, int, bool, bool,
> 
> 
> 
> 
>
Tamar Christina Oct. 11, 2023, 11:16 a.m. UTC | #2
> > +  auto loop_exits = get_loop_exit_edges (loop);
> > + auto_vec<basic_block> doms;
> > +
> >    if (at_exit) /* Add the loop copy at exit.  */
> >      {
> > -      if (scalar_loop != loop)
> > +      if (scalar_loop != loop && new_exit->dest != exit_dest)
> >  	{
> > -	  gphi_iterator gsi;
> >  	  new_exit = redirect_edge_and_branch (new_exit, exit_dest);
> > +	  flush_pending_stmts (new_exit);
> > +	}
> >
> > -	  for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
> > -	       gsi_next (&gsi))
> > -	    {
> > -	      gphi *phi = gsi.phi ();
> > -	      tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
> > -	      location_t orig_locus
> > -		= gimple_phi_arg_location_from_edge (phi, e);
> > +      auto_vec <gimple *> new_phis;
> > +      hash_map <tree, tree> new_phi_args;
> > +      /* First create the empty phi nodes so that when we flush the
> > +	 statements they can be filled in.   However because there is no order
> > +	 between the PHI nodes in the exits and the loop headers we need to
> > +	 order them base on the order of the two headers.  First record the
> new
> > +	 phi nodes.  */
> > +      for (auto gsi_from = gsi_start_phis (scalar_exit->dest);
> > +	   !gsi_end_p (gsi_from); gsi_next (&gsi_from))
> > +	{
> > +	  gimple *from_phi = gsi_stmt (gsi_from);
> > +	  tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
> > +	  gphi *res = create_phi_node (new_res, new_preheader);
> > +	  new_phis.safe_push (res);
> > +	}
> >
> > -	      add_phi_arg (phi, orig_arg, new_exit, orig_locus);
> > +      /* Then redirect the edges and flush the changes.  This writes out the
> new
> > +	 SSA names.  */
> > +      for (edge exit : loop_exits)
> 
> I realize at the moment it's the same, but we are redirecting multiple exit edges
> here and from the walk above expect them all to have the same set of PHI
> nodes - that looks a bit fragile?

No, it only expects the two preheaders to have the same PHI nodes.  Since one loop
is copied from the other we know that to be true.

Now of course there are cases where your exit blocks have more PHI nodes than the
headers (e.g. live values) but those are handled later in the hunk below (with new_phi_args).

For the flush_pending_stmts to work I had to make sure the order of the phi nodes are the
same as the original.  This is why I can't iterate over the values in the exit block instead and
need to handle it in two steps.

> Does this need adjustments later for the early exit vectorization?
> 

I believe (need to finish the rebase) that the only adjustment I'll need here for multiple exits
is the updates of the dominators.  I don't think I'll need more.  I had issues with live values that
I had to handle specially before, but I think this new approach should deal with it already.

> This also somewhat confuses the original redirection of 'e', the main exit with
> the later (*)
> 
> > +	{
> > +	  edge e = redirect_edge_and_branch (exit, new_preheader);
> > +	  flush_pending_stmts (e);
> > +	}
> > +
> > +      /* Record the new SSA names in the cache so that we can skip
> materializing
> > +	 them again when we fill in the rest of the LCSSA variables.  */
> > +      for (auto phi : new_phis)
> > +	{
> > +	  tree new_arg = gimple_phi_arg (phi, 0)->def;
> 
> and here you look at the (for now) single edge we redirected ...
> 
> > +	  new_phi_args.put (new_arg, gimple_phi_result (phi));
> > +	}
> > +
> > +      /* Copy the current loop LC PHI nodes between the original loop exit
> > +	 block and the new loop header.  This allows us to later split the
> > +	 preheader block and still find the right LC nodes.  */
> > +      edge latch_new = single_succ_edge (new_preheader);
> 
> odd name - the single successor of a loop preheader is the loop header and the
> corresponding edge is the loop entry edge, not the latch?
> 
> > +      for (auto gsi_from = gsi_start_phis (loop->header),
> > +	   gsi_to = gsi_start_phis (new_loop->header);
> > +	   flow_loops && !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
> 
> Eh, can we have
> 
>   if (flow_loops)
>     for  (auto ...)
> 
> please, even if that indents more?
> 
> > +	   gsi_next (&gsi_from), gsi_next (&gsi_to))
> > +	{
> > +	  gimple *from_phi = gsi_stmt (gsi_from);
> > +	  gimple *to_phi = gsi_stmt (gsi_to);
> > +	  tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
> > +						loop_latch_edge (loop));
> > +
> > +	  /* Check if we've already created a new phi node during edge
> > +	     redirection.  If we have, only propagate the value downwards.  */
> > +	  if (tree *res = new_phi_args.get (new_arg))
> > +	    {
> > +	      adjust_phi_and_debug_stmts (to_phi, latch_new, *res);
> > +	      continue;
> >  	    }
> > +
> > +	  tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
> > +	  gphi *lcssa_phi = create_phi_node (new_res, e->dest);
> > +
> > +	  /* Main loop exit should use the final iter value.  */
> > +	  add_phi_arg (lcssa_phi, new_arg, loop_exit, UNKNOWN_LOCATION);
> 
> For all other edges into the loop besides 'e' there's missing PHI arguments?
> You are using 'e' here again, but also use that as temporary in for blocks,
> shadowing the parameter - that makes it difficult to read.  Also it's sometimes
> 'e->dest' and sometimes new_preheader - I think you want to use
> new_preheader here as well (in create_phi_node) for consistency and ease of
> understanding.
> 
> ISTR when early break vectorization lands we're going to redirect the alternate
> exits away again "fixing" the missing PHI args.
> 

We indeed had a discussion about this, and I'll expand more on the reasoning in the
patch for early breaks.  But I think not redirecting the edges away for early break makes
more sense as It treats early break, alignment peeling and epilogue vectorization the same
way and the only difference is in the statement inside the guard blocks.

But also more importantly this representation also makes it easier to implement First-Faulting
Loads support.  For FFL we'll copy the main loop and at the "fault" check we branch to a new
Loop remainder that has the same sequences as the remainder of the main vector loop but
with different predicates.  The reason for this is to remove the predicate mangling from the
optimal/likely loop body which is critical for performance.

Now since FFL is intended to pair naturally with early break having the early exit edges all
lead into the same block makes the flow a lot easier to manage.

But I'll make sure to include a diagram in the early break peeling patch.

Thanks,
Tamar

> > +
> > +	  adjust_phi_and_debug_stmts (to_phi, latch_new, new_res);
> >  	}
> > -      redirect_edge_and_branch_force (e, new_preheader);
> > -      flush_pending_stmts (e);
> > +
> >        set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
> > -      if (was_imm_dom || duplicate_outer_loop)
> > +
> > +      if ((was_imm_dom || duplicate_outer_loop))
> 
> extra ()s
> 
> >  	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit-
> >src);
> >
> >        /* And remove the non-necessary forwarder again.  Keep the
> > other @@ -1598,6 +1680,22 @@ slpeel_tree_duplicate_loop_to_edge_cfg
> (class loop *loop, edge loop_exit,
> >      }
> >    else /* Add the copy at entry.  */
> >      {
> > +      /* Copy the current loop LC PHI nodes between the original loop exit
> > +	 block and the new loop header.  This allows us to later split the
> > +	 preheader block and still find the right LC nodes.  */
> > +      for (auto gsi_from = gsi_start_phis (new_loop->header),
> > +	   gsi_to = gsi_start_phis (loop->header);
> > +	   flow_loops && !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
> 
> same if (flow_loops)
> 
> > +	   gsi_next (&gsi_from), gsi_next (&gsi_to))
> > +	{
> > +	  gimple *from_phi = gsi_stmt (gsi_from);
> > +	  gimple *to_phi = gsi_stmt (gsi_to);
> > +	  tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
> > +						loop_latch_edge (new_loop));
> 
> this looks wrong?  IMHO it should be the PHI_RESULT, no?  Note this only
> triggers for alignment peeling ...
> 
> Otherwise looks OK.
> 
> Thanks,
> Richard.
> 
> 
> > +	  adjust_phi_and_debug_stmts (to_phi, loop_preheader_edge (loop),
> > +				      new_arg);
> > +	}
> > +
> >        if (scalar_loop != loop)
> >  	{
> >  	  /* Remove the non-necessary forwarder of scalar_loop again.  */ @@
> > -1627,29 +1725,6 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop
> *loop, edge loop_exit,
> >  			       loop_preheader_edge (new_loop)->src);
> >      }
> >
> > -  if (scalar_loop != loop)
> > -    {
> > -      /* Update new_loop->header PHIs, so that on the preheader
> > -	 edge they are the ones from loop rather than scalar_loop.  */
> > -      gphi_iterator gsi_orig, gsi_new;
> > -      edge orig_e = loop_preheader_edge (loop);
> > -      edge new_e = loop_preheader_edge (new_loop);
> > -
> > -      for (gsi_orig = gsi_start_phis (loop->header),
> > -	   gsi_new = gsi_start_phis (new_loop->header);
> > -	   !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
> > -	   gsi_next (&gsi_orig), gsi_next (&gsi_new))
> > -	{
> > -	  gphi *orig_phi = gsi_orig.phi ();
> > -	  gphi *new_phi = gsi_new.phi ();
> > -	  tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
> > -	  location_t orig_locus
> > -	    = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
> > -
> > -	  add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
> > -	}
> > -    }
> > -
> >    free (new_bbs);
> >    free (bbs);
> >
> > @@ -2579,139 +2654,36 @@ vect_gen_vector_loop_niters_mult_vf
> > (loop_vec_info loop_vinfo,
> >
> >  /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
> >     this function searches for the corresponding lcssa phi node in exit
> > -   bb of LOOP.  If it is found, return the phi result; otherwise return
> > -   NULL.  */
> > +   bb of LOOP following the LCSSA_EDGE to the exit node.  If it is found,
> > +   return the phi result; otherwise return NULL.  */
> >
> >  static tree
> >  find_guard_arg (class loop *loop ATTRIBUTE_UNUSED,
> >  		class loop *epilog ATTRIBUTE_UNUSED,
> > -		const_edge e, gphi *lcssa_phi)
> > +		const_edge e, gphi *lcssa_phi, int lcssa_edge = 0)
> >  {
> >    gphi_iterator gsi;
> >
> > -  gcc_assert (single_pred_p (e->dest));
> >    for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
> >      {
> >        gphi *phi = gsi.phi ();
> > -      if (operand_equal_p (PHI_ARG_DEF (phi, 0),
> > -			   PHI_ARG_DEF (lcssa_phi, 0), 0))
> > -	return PHI_RESULT (phi);
> > -    }
> > -  return NULL_TREE;
> > -}
> > -
> > -/* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates
> FIRST/SECOND
> > -   from SECOND/FIRST and puts it at the original loop's preheader/exit
> > -   edge, the two loops are arranged as below:
> > -
> > -       preheader_a:
> > -     first_loop:
> > -       header_a:
> > -	 i_1 = PHI<i_0, i_2>;
> > -	 ...
> > -	 i_2 = i_1 + 1;
> > -	 if (cond_a)
> > -	   goto latch_a;
> > -	 else
> > -	   goto between_bb;
> > -       latch_a:
> > -	 goto header_a;
> > -
> > -       between_bb:
> > -	 ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
> > -
> > -     second_loop:
> > -       header_b:
> > -	 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
> > -				 or with i_2 if no LCSSA phi is created
> > -				 under condition of
> CREATE_LCSSA_FOR_IV_PHIS.
> > -	 ...
> > -	 i_4 = i_3 + 1;
> > -	 if (cond_b)
> > -	   goto latch_b;
> > -	 else
> > -	   goto exit_bb;
> > -       latch_b:
> > -	 goto header_b;
> > -
> > -       exit_bb:
> > -
> > -   This function creates loop closed SSA for the first loop; update the
> > -   second loop's PHI nodes by replacing argument on incoming edge with the
> > -   result of newly created lcssa PHI nodes.  IF CREATE_LCSSA_FOR_IV_PHIS
> > -   is false, Loop closed ssa phis will only be created for non-iv phis for
> > -   the first loop.
> > -
> > -   This function assumes exit bb of the first loop is preheader bb of the
> > -   second loop, i.e, between_bb in the example code.  With PHIs updated,
> > -   the second loop will execute rest iterations of the first.  */
> > -
> > -static void
> > -slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
> > -				   class loop *first, edge first_loop_e,
> > -				   class loop *second, edge second_loop_e,
> > -				   bool create_lcssa_for_iv_phis)
> > -{
> > -  gphi_iterator gsi_update, gsi_orig;
> > -  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> > -
> > -  edge first_latch_e = EDGE_SUCC (first->latch, 0);
> > -  edge second_preheader_e = loop_preheader_edge (second);
> > -  basic_block between_bb = first_loop_e->dest;
> > -
> > -  gcc_assert (between_bb == second_preheader_e->src);
> > -  gcc_assert (single_pred_p (between_bb) && single_succ_p
> > (between_bb));
> > -  /* Either the first loop or the second is the loop to be
> > vectorized.  */
> > -  gcc_assert (loop == first || loop == second);
> > -
> > -  for (gsi_orig = gsi_start_phis (first->header),
> > -       gsi_update = gsi_start_phis (second->header);
> > -       !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
> > -       gsi_next (&gsi_orig), gsi_next (&gsi_update))
> > -    {
> > -      gphi *orig_phi = gsi_orig.phi ();
> > -      gphi *update_phi = gsi_update.phi ();
> > -
> > -      tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
> > -      /* Generate lcssa PHI node for the first loop.  */
> > -      gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
> > -      stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
> > -      if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
> > +      /* Nested loops with multiple exits can have different no# phi node
> > +	arguments between the main loop and epilog as epilog falls to the
> > +	second loop.  */
> > +      if (gimple_phi_num_args (phi) > e->dest_idx)
> >  	{
> > -	  tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
> > -	  gphi *lcssa_phi = create_phi_node (new_res, between_bb);
> > -	  add_phi_arg (lcssa_phi, arg, first_loop_e, UNKNOWN_LOCATION);
> > -	  arg = new_res;
> > -	}
> > -
> > -      /* Update PHI node in the second loop by replacing arg on the loop's
> > -	 incoming edge.  */
> > -      adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
> > -    }
> > -
> > -  /* For epilogue peeling we have to make sure to copy all LC PHIs
> > -     for correct vectorization of live stmts.  */
> > -  if (loop == first)
> > -    {
> > -      basic_block orig_exit = second_loop_e->dest;
> > -      for (gsi_orig = gsi_start_phis (orig_exit);
> > -	   !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
> > -	{
> > -	  gphi *orig_phi = gsi_orig.phi ();
> > -	  tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
> > -	  if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p
> (orig_arg))
> > -	    continue;
> > -
> > -	  const_edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
> > -	  /* Already created in the above loop.   */
> > -	  if (find_guard_arg (first, second, exit_e, orig_phi))
> > +	 tree var = PHI_ARG_DEF (phi, e->dest_idx);
> > +	 if (TREE_CODE (var) != SSA_NAME)
> >  	    continue;
> > -
> > -	  tree new_res = copy_ssa_name (orig_arg);
> > -	  gphi *lcphi = create_phi_node (new_res, between_bb);
> > -	  add_phi_arg (lcphi, orig_arg, first_loop_e, UNKNOWN_LOCATION);
> > +	 tree def = get_current_def (var);
> > +	 if (!def)
> > +	   continue;
> > +	 if (operand_equal_p (def,
> > +			      PHI_ARG_DEF (lcssa_phi, lcssa_edge), 0))
> > +	   return PHI_RESULT (phi);
> >  	}
> >      }
> > +  return NULL_TREE;
> >  }
> >
> >  /* Function slpeel_add_loop_guard adds guard skipping from the
> > beginning @@ -2796,11 +2768,11 @@
> slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
> >      }
> >  }
> >
> > -/* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
> > -   from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
> > -   point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
> > -   and MERGE_EDGE are the two pred edges of merge_bb at the end of
> EPILOG.
> > -   The CFG looks like:
> > +/* LOOP and EPILOG are two consecutive loops in CFG connected by
> LOOP_EXIT edge
> > +   and EPILOG is copied from LOOP.  Function slpeel_add_loop_guard adds
> guard
> > +   skipping from a point between the two loops to the end of EPILOG.  Edges
> > +   GUARD_EDGE and MERGE_EDGE are the two pred edges of merge_bb at
> the end of
> > +   EPILOG.  The CFG looks like:
> >
> >       loop:
> >         header_a:
> > @@ -2851,6 +2823,7 @@ slpeel_update_phi_nodes_for_guard1 (class loop
> > *skip_loop,
> >
> >  static void
> >  slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop
> > *epilog,
> > +				    const_edge loop_exit,
> >  				    edge guard_edge, edge merge_edge)  {
> >    gphi_iterator gsi;
> > @@ -2859,13 +2832,11 @@ slpeel_update_phi_nodes_for_guard2 (class
> loop *loop, class loop *epilog,
> >    gcc_assert (single_succ_p (merge_bb));
> >    edge e = single_succ_edge (merge_bb);
> >    basic_block exit_bb = e->dest;
> > -  gcc_assert (single_pred_p (exit_bb));
> > -  gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
> >
> >    for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
> >      {
> >        gphi *update_phi = gsi.phi ();
> > -      tree old_arg = PHI_ARG_DEF (update_phi, 0);
> > +      tree old_arg = PHI_ARG_DEF (update_phi, e->dest_idx);
> >
> >        tree merge_arg = NULL_TREE;
> >
> > @@ -2877,8 +2848,8 @@ slpeel_update_phi_nodes_for_guard2 (class loop
> *loop, class loop *epilog,
> >        if (!merge_arg)
> >  	merge_arg = old_arg;
> >
> > -      tree guard_arg
> > -	= find_guard_arg (loop, epilog, single_exit (loop), update_phi);
> > +      tree guard_arg = find_guard_arg (loop, epilog, loop_exit,
> > +				       update_phi, e->dest_idx);
> >        /* If the var is live after loop but not a reduction, we simply
> >  	 use the old arg.  */
> >        if (!guard_arg)
> > @@ -2898,21 +2869,6 @@ slpeel_update_phi_nodes_for_guard2 (class
> loop *loop, class loop *epilog,
> >      }
> >  }
> >
> > -/* EPILOG loop is duplicated from the original loop for vectorizing,
> > -   the arg of its loop closed ssa PHI needs to be updated.  */
> > -
> > -static void
> > -slpeel_update_phi_nodes_for_lcssa (class loop *epilog) -{
> > -  gphi_iterator gsi;
> > -  basic_block exit_bb = single_exit (epilog)->dest;
> > -
> > -  gcc_assert (single_pred_p (exit_bb));
> > -  edge e = EDGE_PRED (exit_bb, 0);
> > -  for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
> > -    rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
> > -}
> > -
> >  /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be
> skipped.
> >     Return a value that equals:
> >
> > @@ -3255,8 +3211,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree
> niters, tree nitersm1,
> >  						       e, &prolog_e);
> >        gcc_assert (prolog);
> >        prolog->force_vectorize = false;
> > -      slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, prolog_e, loop,
> > -					 exit_e, true);
> > +
> >        first_loop = prolog;
> >        reset_original_copy_tables ();
> >
> > @@ -3336,8 +3291,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree
> niters, tree nitersm1,
> >        LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo) = new_epilog_e;
> >        gcc_assert (epilog);
> >        epilog->force_vectorize = false;
> > -      slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, e, epilog,
> > -					 new_epilog_e, false);
> >        bb_before_epilog = loop_preheader_edge (epilog)->src;
> >
> >        /* Scalar version loop may be preferred.  In this case, add
> > guard @@ -3430,7 +3383,9 @@ vect_do_peeling (loop_vec_info
> loop_vinfo, tree niters, tree nitersm1,
> >  					   irred_flag);
> >  	  if (vect_epilogues)
> >  	    epilogue_vinfo->skip_this_loop_edge = guard_e;
> > -	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
> epilog_e);
> > +	  edge main_iv = LOOP_VINFO_IV_EXIT (loop_vinfo);
> > +	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, main_iv,
> guard_e,
> > +					      epilog_e);
> >  	  /* Only need to handle basic block before epilog loop if it's not
> >  	     the guard_bb, which is the case when skip_vector is true.  */
> >  	  if (guard_bb != bb_before_epilog)
> > @@ -3441,8 +3396,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree
> niters, tree nitersm1,
> >  	    }
> >  	  scale_loop_profile (epilog, prob_epilog, -1);
> >  	}
> > -      else
> > -	slpeel_update_phi_nodes_for_lcssa (epilog);
> >
> >        unsigned HOST_WIDE_INT bound;
> >        if (bound_scalar.is_constant (&bound)) diff --git
> > a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index
> >
> f1caa5f207d3b13da58c3a313b11d1ef98374349..327cab0f736da7f1bd3e0
> 24d666d
> > f46ef9208107 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -5877,7 +5877,7 @@ vect_create_epilog_for_reduction (loop_vec_info
> loop_vinfo,
> >    basic_block exit_bb;
> >    tree scalar_dest;
> >    tree scalar_type;
> > -  gimple *new_phi = NULL, *phi;
> > +  gimple *new_phi = NULL, *phi = NULL;
> >    gimple_stmt_iterator exit_gsi;
> >    tree new_temp = NULL_TREE, new_name, new_scalar_dest;
> >    gimple *epilog_stmt = NULL;
> > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index
> >
> 55b6771b271d5072fa1327d595e1dddb112cfdf6..25ceb6600673d71fd601
> 24434039
> > 97e921066483 100644
> > --- a/gcc/tree-vectorizer.h
> > +++ b/gcc/tree-vectorizer.h
> > @@ -2183,7 +2183,7 @@ extern bool slpeel_can_duplicate_loop_p (const
> class loop *, const_edge,
> >  					 const_edge);
> >  class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *, edge,
> >  						    class loop *, edge,
> > -						    edge, edge *);
> > +						    edge, edge *, bool = true);
> >  class loop *vect_loop_versioning (loop_vec_info, gimple *);  extern
> > class loop *vect_do_peeling (loop_vec_info, tree, tree,
> >  				    tree *, tree *, tree *, int, bool, bool,
> >
> >
> >
> >
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> Nuernberg)
Richard Biener Oct. 11, 2023, 12:09 p.m. UTC | #3
On Wed, 11 Oct 2023, Tamar Christina wrote:

> > > +  auto loop_exits = get_loop_exit_edges (loop);
> > > + auto_vec<basic_block> doms;
> > > +
> > >    if (at_exit) /* Add the loop copy at exit.  */
> > >      {
> > > -      if (scalar_loop != loop)
> > > +      if (scalar_loop != loop && new_exit->dest != exit_dest)
> > >  	{
> > > -	  gphi_iterator gsi;
> > >  	  new_exit = redirect_edge_and_branch (new_exit, exit_dest);
> > > +	  flush_pending_stmts (new_exit);
> > > +	}
> > >
> > > -	  for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
> > > -	       gsi_next (&gsi))
> > > -	    {
> > > -	      gphi *phi = gsi.phi ();
> > > -	      tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
> > > -	      location_t orig_locus
> > > -		= gimple_phi_arg_location_from_edge (phi, e);
> > > +      auto_vec <gimple *> new_phis;
> > > +      hash_map <tree, tree> new_phi_args;
> > > +      /* First create the empty phi nodes so that when we flush the
> > > +	 statements they can be filled in.   However because there is no order
> > > +	 between the PHI nodes in the exits and the loop headers we need to
> > > +	 order them base on the order of the two headers.  First record the
> > new
> > > +	 phi nodes.  */
> > > +      for (auto gsi_from = gsi_start_phis (scalar_exit->dest);
> > > +	   !gsi_end_p (gsi_from); gsi_next (&gsi_from))
> > > +	{
> > > +	  gimple *from_phi = gsi_stmt (gsi_from);
> > > +	  tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
> > > +	  gphi *res = create_phi_node (new_res, new_preheader);
> > > +	  new_phis.safe_push (res);
> > > +	}
> > >
> > > -	      add_phi_arg (phi, orig_arg, new_exit, orig_locus);
> > > +      /* Then redirect the edges and flush the changes.  This writes out the
> > new
> > > +	 SSA names.  */
> > > +      for (edge exit : loop_exits)
> > 
> > I realize at the moment it's the same, but we are redirecting multiple exit edges
> > here and from the walk above expect them all to have the same set of PHI
> > nodes - that looks a bit fragile?
> 
> No, it only expects the two preheaders to have the same PHI nodes.  Since one loop
> is copied from the other we know that to be true.
> 
> Now of course there are cases where your exit blocks have more PHI nodes than the
> headers (e.g. live values) but those are handled later in the hunk below (with new_phi_args).
> 
> For the flush_pending_stmts to work I had to make sure the order of the phi nodes are the
> same as the original.  This is why I can't iterate over the values in the exit block instead and
> need to handle it in two steps.
> 
> > Does this need adjustments later for the early exit vectorization?
> > 
> 
> I believe (need to finish the rebase) that the only adjustment I'll need here for multiple exits
> is the updates of the dominators.  I don't think I'll need more.  I had issues with live values that
> I had to handle specially before, but I think this new approach should deal with it already.

OK.

> > This also somewhat confuses the original redirection of 'e', the main exit with
> > the later (*)
> > 
> > > +	{
> > > +	  edge e = redirect_edge_and_branch (exit, new_preheader);
> > > +	  flush_pending_stmts (e);
> > > +	}
> > > +
> > > +      /* Record the new SSA names in the cache so that we can skip
> > materializing
> > > +	 them again when we fill in the rest of the LCSSA variables.  */
> > > +      for (auto phi : new_phis)
> > > +	{
> > > +	  tree new_arg = gimple_phi_arg (phi, 0)->def;
> > 
> > and here you look at the (for now) single edge we redirected ...
> > 
> > > +	  new_phi_args.put (new_arg, gimple_phi_result (phi));
> > > +	}
> > > +
> > > +      /* Copy the current loop LC PHI nodes between the original loop exit
> > > +	 block and the new loop header.  This allows us to later split the
> > > +	 preheader block and still find the right LC nodes.  */
> > > +      edge latch_new = single_succ_edge (new_preheader);
> > 
> > odd name - the single successor of a loop preheader is the loop header and the
> > corresponding edge is the loop entry edge, not the latch?
> > 
> > > +      for (auto gsi_from = gsi_start_phis (loop->header),
> > > +	   gsi_to = gsi_start_phis (new_loop->header);
> > > +	   flow_loops && !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
> > 
> > Eh, can we have
> > 
> >   if (flow_loops)
> >     for  (auto ...)
> > 
> > please, even if that indents more?
> > 
> > > +	   gsi_next (&gsi_from), gsi_next (&gsi_to))
> > > +	{
> > > +	  gimple *from_phi = gsi_stmt (gsi_from);
> > > +	  gimple *to_phi = gsi_stmt (gsi_to);
> > > +	  tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
> > > +						loop_latch_edge (loop));
> > > +
> > > +	  /* Check if we've already created a new phi node during edge
> > > +	     redirection.  If we have, only propagate the value downwards.  */
> > > +	  if (tree *res = new_phi_args.get (new_arg))
> > > +	    {
> > > +	      adjust_phi_and_debug_stmts (to_phi, latch_new, *res);
> > > +	      continue;
> > >  	    }
> > > +
> > > +	  tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
> > > +	  gphi *lcssa_phi = create_phi_node (new_res, e->dest);
> > > +
> > > +	  /* Main loop exit should use the final iter value.  */
> > > +	  add_phi_arg (lcssa_phi, new_arg, loop_exit, UNKNOWN_LOCATION);
> > 
> > For all other edges into the loop besides 'e' there's missing PHI arguments?
> > You are using 'e' here again, but also use that as temporary in for blocks,
> > shadowing the parameter - that makes it difficult to read.  Also it's sometimes
> > 'e->dest' and sometimes new_preheader - I think you want to use
> > new_preheader here as well (in create_phi_node) for consistency and ease of
> > understanding.
> > 
> > ISTR when early break vectorization lands we're going to redirect the alternate
> > exits away again "fixing" the missing PHI args.
> > 
> 
> We indeed had a discussion about this, and I'll expand more on the reasoning in the
> patch for early breaks.  But I think not redirecting the edges away for early break makes
> more sense as It treats early break, alignment peeling and epilogue vectorization the same
> way and the only difference is in the statement inside the guard blocks.
> 
> But also more importantly this representation also makes it easier to implement First-Faulting
> Loads support.  For FFL we'll copy the main loop and at the "fault" check we branch to a new
> Loop remainder that has the same sequences as the remainder of the main vector loop but
> with different predicates.  The reason for this is to remove the predicate mangling from the
> optimal/likely loop body which is critical for performance.
> 
> Now since FFL is intended to pair naturally with early break having the early exit edges all
> lead into the same block makes the flow a lot easier to manage.
> 
> But I'll make sure to include a diagram in the early break peeling patch.

Thanks.

So with the minor pending adjustments this series should be OK.

Thanks,
Richard.

> Thanks,
> Tamar
> 
> > > +
> > > +	  adjust_phi_and_debug_stmts (to_phi, latch_new, new_res);
> > >  	}
> > > -      redirect_edge_and_branch_force (e, new_preheader);
> > > -      flush_pending_stmts (e);
> > > +
> > >        set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
> > > -      if (was_imm_dom || duplicate_outer_loop)
> > > +
> > > +      if ((was_imm_dom || duplicate_outer_loop))
> > 
> > extra ()s
> > 
> > >  	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit-
> > >src);
> > >
> > >        /* And remove the non-necessary forwarder again.  Keep the
> > > other @@ -1598,6 +1680,22 @@ slpeel_tree_duplicate_loop_to_edge_cfg
> > (class loop *loop, edge loop_exit,
> > >      }
> > >    else /* Add the copy at entry.  */
> > >      {
> > > +      /* Copy the current loop LC PHI nodes between the original loop exit
> > > +	 block and the new loop header.  This allows us to later split the
> > > +	 preheader block and still find the right LC nodes.  */
> > > +      for (auto gsi_from = gsi_start_phis (new_loop->header),
> > > +	   gsi_to = gsi_start_phis (loop->header);
> > > +	   flow_loops && !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
> > 
> > same if (flow_loops)
> > 
> > > +	   gsi_next (&gsi_from), gsi_next (&gsi_to))
> > > +	{
> > > +	  gimple *from_phi = gsi_stmt (gsi_from);
> > > +	  gimple *to_phi = gsi_stmt (gsi_to);
> > > +	  tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
> > > +						loop_latch_edge (new_loop));
> > 
> > this looks wrong?  IMHO it should be the PHI_RESULT, no?  Note this only
> > triggers for alignment peeling ...
> > 
> > Otherwise looks OK.
> > 
> > Thanks,
> > Richard.
> > 
> > 
> > > +	  adjust_phi_and_debug_stmts (to_phi, loop_preheader_edge (loop),
> > > +				      new_arg);
> > > +	}
> > > +
> > >        if (scalar_loop != loop)
> > >  	{
> > >  	  /* Remove the non-necessary forwarder of scalar_loop again.  */ @@
> > > -1627,29 +1725,6 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop
> > *loop, edge loop_exit,
> > >  			       loop_preheader_edge (new_loop)->src);
> > >      }
> > >
> > > -  if (scalar_loop != loop)
> > > -    {
> > > -      /* Update new_loop->header PHIs, so that on the preheader
> > > -	 edge they are the ones from loop rather than scalar_loop.  */
> > > -      gphi_iterator gsi_orig, gsi_new;
> > > -      edge orig_e = loop_preheader_edge (loop);
> > > -      edge new_e = loop_preheader_edge (new_loop);
> > > -
> > > -      for (gsi_orig = gsi_start_phis (loop->header),
> > > -	   gsi_new = gsi_start_phis (new_loop->header);
> > > -	   !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
> > > -	   gsi_next (&gsi_orig), gsi_next (&gsi_new))
> > > -	{
> > > -	  gphi *orig_phi = gsi_orig.phi ();
> > > -	  gphi *new_phi = gsi_new.phi ();
> > > -	  tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
> > > -	  location_t orig_locus
> > > -	    = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
> > > -
> > > -	  add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
> > > -	}
> > > -    }
> > > -
> > >    free (new_bbs);
> > >    free (bbs);
> > >
> > > @@ -2579,139 +2654,36 @@ vect_gen_vector_loop_niters_mult_vf
> > > (loop_vec_info loop_vinfo,
> > >
> > >  /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
> > >     this function searches for the corresponding lcssa phi node in exit
> > > -   bb of LOOP.  If it is found, return the phi result; otherwise return
> > > -   NULL.  */
> > > +   bb of LOOP following the LCSSA_EDGE to the exit node.  If it is found,
> > > +   return the phi result; otherwise return NULL.  */
> > >
> > >  static tree
> > >  find_guard_arg (class loop *loop ATTRIBUTE_UNUSED,
> > >  		class loop *epilog ATTRIBUTE_UNUSED,
> > > -		const_edge e, gphi *lcssa_phi)
> > > +		const_edge e, gphi *lcssa_phi, int lcssa_edge = 0)
> > >  {
> > >    gphi_iterator gsi;
> > >
> > > -  gcc_assert (single_pred_p (e->dest));
> > >    for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
> > >      {
> > >        gphi *phi = gsi.phi ();
> > > -      if (operand_equal_p (PHI_ARG_DEF (phi, 0),
> > > -			   PHI_ARG_DEF (lcssa_phi, 0), 0))
> > > -	return PHI_RESULT (phi);
> > > -    }
> > > -  return NULL_TREE;
> > > -}
> > > -
> > > -/* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates
> > FIRST/SECOND
> > > -   from SECOND/FIRST and puts it at the original loop's preheader/exit
> > > -   edge, the two loops are arranged as below:
> > > -
> > > -       preheader_a:
> > > -     first_loop:
> > > -       header_a:
> > > -	 i_1 = PHI<i_0, i_2>;
> > > -	 ...
> > > -	 i_2 = i_1 + 1;
> > > -	 if (cond_a)
> > > -	   goto latch_a;
> > > -	 else
> > > -	   goto between_bb;
> > > -       latch_a:
> > > -	 goto header_a;
> > > -
> > > -       between_bb:
> > > -	 ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
> > > -
> > > -     second_loop:
> > > -       header_b:
> > > -	 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
> > > -				 or with i_2 if no LCSSA phi is created
> > > -				 under condition of
> > CREATE_LCSSA_FOR_IV_PHIS.
> > > -	 ...
> > > -	 i_4 = i_3 + 1;
> > > -	 if (cond_b)
> > > -	   goto latch_b;
> > > -	 else
> > > -	   goto exit_bb;
> > > -       latch_b:
> > > -	 goto header_b;
> > > -
> > > -       exit_bb:
> > > -
> > > -   This function creates loop closed SSA for the first loop; update the
> > > -   second loop's PHI nodes by replacing argument on incoming edge with the
> > > -   result of newly created lcssa PHI nodes.  IF CREATE_LCSSA_FOR_IV_PHIS
> > > -   is false, Loop closed ssa phis will only be created for non-iv phis for
> > > -   the first loop.
> > > -
> > > -   This function assumes exit bb of the first loop is preheader bb of the
> > > -   second loop, i.e, between_bb in the example code.  With PHIs updated,
> > > -   the second loop will execute rest iterations of the first.  */
> > > -
> > > -static void
> > > -slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
> > > -				   class loop *first, edge first_loop_e,
> > > -				   class loop *second, edge second_loop_e,
> > > -				   bool create_lcssa_for_iv_phis)
> > > -{
> > > -  gphi_iterator gsi_update, gsi_orig;
> > > -  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> > > -
> > > -  edge first_latch_e = EDGE_SUCC (first->latch, 0);
> > > -  edge second_preheader_e = loop_preheader_edge (second);
> > > -  basic_block between_bb = first_loop_e->dest;
> > > -
> > > -  gcc_assert (between_bb == second_preheader_e->src);
> > > -  gcc_assert (single_pred_p (between_bb) && single_succ_p
> > > (between_bb));
> > > -  /* Either the first loop or the second is the loop to be
> > > vectorized.  */
> > > -  gcc_assert (loop == first || loop == second);
> > > -
> > > -  for (gsi_orig = gsi_start_phis (first->header),
> > > -       gsi_update = gsi_start_phis (second->header);
> > > -       !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
> > > -       gsi_next (&gsi_orig), gsi_next (&gsi_update))
> > > -    {
> > > -      gphi *orig_phi = gsi_orig.phi ();
> > > -      gphi *update_phi = gsi_update.phi ();
> > > -
> > > -      tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
> > > -      /* Generate lcssa PHI node for the first loop.  */
> > > -      gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
> > > -      stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
> > > -      if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
> > > +      /* Nested loops with multiple exits can have different no# phi node
> > > +	arguments between the main loop and epilog as epilog falls to the
> > > +	second loop.  */
> > > +      if (gimple_phi_num_args (phi) > e->dest_idx)
> > >  	{
> > > -	  tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
> > > -	  gphi *lcssa_phi = create_phi_node (new_res, between_bb);
> > > -	  add_phi_arg (lcssa_phi, arg, first_loop_e, UNKNOWN_LOCATION);
> > > -	  arg = new_res;
> > > -	}
> > > -
> > > -      /* Update PHI node in the second loop by replacing arg on the loop's
> > > -	 incoming edge.  */
> > > -      adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
> > > -    }
> > > -
> > > -  /* For epilogue peeling we have to make sure to copy all LC PHIs
> > > -     for correct vectorization of live stmts.  */
> > > -  if (loop == first)
> > > -    {
> > > -      basic_block orig_exit = second_loop_e->dest;
> > > -      for (gsi_orig = gsi_start_phis (orig_exit);
> > > -	   !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
> > > -	{
> > > -	  gphi *orig_phi = gsi_orig.phi ();
> > > -	  tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
> > > -	  if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p
> > (orig_arg))
> > > -	    continue;
> > > -
> > > -	  const_edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
> > > -	  /* Already created in the above loop.   */
> > > -	  if (find_guard_arg (first, second, exit_e, orig_phi))
> > > +	 tree var = PHI_ARG_DEF (phi, e->dest_idx);
> > > +	 if (TREE_CODE (var) != SSA_NAME)
> > >  	    continue;
> > > -
> > > -	  tree new_res = copy_ssa_name (orig_arg);
> > > -	  gphi *lcphi = create_phi_node (new_res, between_bb);
> > > -	  add_phi_arg (lcphi, orig_arg, first_loop_e, UNKNOWN_LOCATION);
> > > +	 tree def = get_current_def (var);
> > > +	 if (!def)
> > > +	   continue;
> > > +	 if (operand_equal_p (def,
> > > +			      PHI_ARG_DEF (lcssa_phi, lcssa_edge), 0))
> > > +	   return PHI_RESULT (phi);
> > >  	}
> > >      }
> > > +  return NULL_TREE;
> > >  }
> > >
> > >  /* Function slpeel_add_loop_guard adds guard skipping from the
> > > beginning @@ -2796,11 +2768,11 @@
> > slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
> > >      }
> > >  }
> > >
> > > -/* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
> > > -   from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
> > > -   point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
> > > -   and MERGE_EDGE are the two pred edges of merge_bb at the end of
> > EPILOG.
> > > -   The CFG looks like:
> > > +/* LOOP and EPILOG are two consecutive loops in CFG connected by
> > LOOP_EXIT edge
> > > +   and EPILOG is copied from LOOP.  Function slpeel_add_loop_guard adds
> > guard
> > > +   skipping from a point between the two loops to the end of EPILOG.  Edges
> > > +   GUARD_EDGE and MERGE_EDGE are the two pred edges of merge_bb at
> > the end of
> > > +   EPILOG.  The CFG looks like:
> > >
> > >       loop:
> > >         header_a:
> > > @@ -2851,6 +2823,7 @@ slpeel_update_phi_nodes_for_guard1 (class loop
> > > *skip_loop,
> > >
> > >  static void
> > >  slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop
> > > *epilog,
> > > +				    const_edge loop_exit,
> > >  				    edge guard_edge, edge merge_edge)  {
> > >    gphi_iterator gsi;
> > > @@ -2859,13 +2832,11 @@ slpeel_update_phi_nodes_for_guard2 (class
> > loop *loop, class loop *epilog,
> > >    gcc_assert (single_succ_p (merge_bb));
> > >    edge e = single_succ_edge (merge_bb);
> > >    basic_block exit_bb = e->dest;
> > > -  gcc_assert (single_pred_p (exit_bb));
> > > -  gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
> > >
> > >    for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
> > >      {
> > >        gphi *update_phi = gsi.phi ();
> > > -      tree old_arg = PHI_ARG_DEF (update_phi, 0);
> > > +      tree old_arg = PHI_ARG_DEF (update_phi, e->dest_idx);
> > >
> > >        tree merge_arg = NULL_TREE;
> > >
> > > @@ -2877,8 +2848,8 @@ slpeel_update_phi_nodes_for_guard2 (class loop
> > *loop, class loop *epilog,
> > >        if (!merge_arg)
> > >  	merge_arg = old_arg;
> > >
> > > -      tree guard_arg
> > > -	= find_guard_arg (loop, epilog, single_exit (loop), update_phi);
> > > +      tree guard_arg = find_guard_arg (loop, epilog, loop_exit,
> > > +				       update_phi, e->dest_idx);
> > >        /* If the var is live after loop but not a reduction, we simply
> > >  	 use the old arg.  */
> > >        if (!guard_arg)
> > > @@ -2898,21 +2869,6 @@ slpeel_update_phi_nodes_for_guard2 (class
> > loop *loop, class loop *epilog,
> > >      }
> > >  }
> > >
> > > -/* EPILOG loop is duplicated from the original loop for vectorizing,
> > > -   the arg of its loop closed ssa PHI needs to be updated.  */
> > > -
> > > -static void
> > > -slpeel_update_phi_nodes_for_lcssa (class loop *epilog) -{
> > > -  gphi_iterator gsi;
> > > -  basic_block exit_bb = single_exit (epilog)->dest;
> > > -
> > > -  gcc_assert (single_pred_p (exit_bb));
> > > -  edge e = EDGE_PRED (exit_bb, 0);
> > > -  for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
> > > -    rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
> > > -}
> > > -
> > >  /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be
> > skipped.
> > >     Return a value that equals:
> > >
> > > @@ -3255,8 +3211,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree
> > niters, tree nitersm1,
> > >  						       e, &prolog_e);
> > >        gcc_assert (prolog);
> > >        prolog->force_vectorize = false;
> > > -      slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, prolog_e, loop,
> > > -					 exit_e, true);
> > > +
> > >        first_loop = prolog;
> > >        reset_original_copy_tables ();
> > >
> > > @@ -3336,8 +3291,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree
> > niters, tree nitersm1,
> > >        LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo) = new_epilog_e;
> > >        gcc_assert (epilog);
> > >        epilog->force_vectorize = false;
> > > -      slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, e, epilog,
> > > -					 new_epilog_e, false);
> > >        bb_before_epilog = loop_preheader_edge (epilog)->src;
> > >
> > >        /* Scalar version loop may be preferred.  In this case, add
> > > guard @@ -3430,7 +3383,9 @@ vect_do_peeling (loop_vec_info
> > loop_vinfo, tree niters, tree nitersm1,
> > >  					   irred_flag);
> > >  	  if (vect_epilogues)
> > >  	    epilogue_vinfo->skip_this_loop_edge = guard_e;
> > > -	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
> > epilog_e);
> > > +	  edge main_iv = LOOP_VINFO_IV_EXIT (loop_vinfo);
> > > +	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, main_iv,
> > guard_e,
> > > +					      epilog_e);
> > >  	  /* Only need to handle basic block before epilog loop if it's not
> > >  	     the guard_bb, which is the case when skip_vector is true.  */
> > >  	  if (guard_bb != bb_before_epilog)
> > > @@ -3441,8 +3396,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree
> > niters, tree nitersm1,
> > >  	    }
> > >  	  scale_loop_profile (epilog, prob_epilog, -1);
> > >  	}
> > > -      else
> > > -	slpeel_update_phi_nodes_for_lcssa (epilog);
> > >
> > >        unsigned HOST_WIDE_INT bound;
> > >        if (bound_scalar.is_constant (&bound)) diff --git
> > > a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index
> > >
> > f1caa5f207d3b13da58c3a313b11d1ef98374349..327cab0f736da7f1bd3e0
> > 24d666d
> > > f46ef9208107 100644
> > > --- a/gcc/tree-vect-loop.cc
> > > +++ b/gcc/tree-vect-loop.cc
> > > @@ -5877,7 +5877,7 @@ vect_create_epilog_for_reduction (loop_vec_info
> > loop_vinfo,
> > >    basic_block exit_bb;
> > >    tree scalar_dest;
> > >    tree scalar_type;
> > > -  gimple *new_phi = NULL, *phi;
> > > +  gimple *new_phi = NULL, *phi = NULL;
> > >    gimple_stmt_iterator exit_gsi;
> > >    tree new_temp = NULL_TREE, new_name, new_scalar_dest;
> > >    gimple *epilog_stmt = NULL;
> > > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index
> > >
> > 55b6771b271d5072fa1327d595e1dddb112cfdf6..25ceb6600673d71fd601
> > 24434039
> > > 97e921066483 100644
> > > --- a/gcc/tree-vectorizer.h
> > > +++ b/gcc/tree-vectorizer.h
> > > @@ -2183,7 +2183,7 @@ extern bool slpeel_can_duplicate_loop_p (const
> > class loop *, const_edge,
> > >  					 const_edge);
> > >  class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *, edge,
> > >  						    class loop *, edge,
> > > -						    edge, edge *);
> > > +						    edge, edge *, bool = true);
> > >  class loop *vect_loop_versioning (loop_vec_info, gimple *);  extern
> > > class loop *vect_do_peeling (loop_vec_info, tree, tree,
> > >  				    tree *, tree *, tree *, int, bool, bool,
> > >
> > >
> > >
> > >
> > >
> > 
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH,
> > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > Nuernberg)
>
diff mbox series

Patch

--- a/gcc/tree-loop-distribution.cc
+++ b/gcc/tree-loop-distribution.cc
@@ -950,7 +950,7 @@  copy_loop_before (class loop *loop, bool redirect_lc_phi_defs)
 
   initialize_original_copy_tables ();
   res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, single_exit (loop), NULL,
-						NULL, preheader, NULL);
+						NULL, preheader, NULL, false);
   gcc_assert (res != NULL);
 
   /* When a not last partition is supposed to keep the LC PHIs computed
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 77f8e668bcc8beca99ba4052e1b12e0d17300262..0e8c0be5384aab2399ed93966e7bf4918f6c87a5 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -252,6 +252,9 @@  adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
 {
   tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
 
+  gcc_assert (TREE_CODE (orig_def) != SSA_NAME
+	      || orig_def != new_def);
+
   SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
 
   if (MAY_HAVE_DEBUG_BIND_STMTS)
@@ -1445,12 +1448,19 @@  slpeel_duplicate_current_defs_from_edges (edge from, edge to)
    on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
    basic blocks from SCALAR_LOOP instead of LOOP, but to either the
-   entry or exit of LOOP.  */
+   entry or exit of LOOP.  If FLOW_LOOPS then connect LOOP to SCALAR_LOOP as a
+   continuation.  This is correct for cases where one loop continues from the
+   other like in the vectorizer, but not true for uses in e.g. loop distribution
+   where the loop is duplicated and then modified.
+
+   If UPDATED_DOMS is not NULL it is update with the list of basic blocks whoms
+   dominators were updated during the peeling.  */
 
 class loop *
 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 					class loop *scalar_loop,
-					edge scalar_exit, edge e, edge *new_e)
+					edge scalar_exit, edge e, edge *new_e,
+					bool flow_loops)
 {
   class loop *new_loop;
   basic_block *new_bbs, *bbs, *pbbs;
@@ -1481,6 +1491,8 @@  slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 	    scalar_exit	= exit;
 	    break;
 	  }
+
+      gcc_assert (scalar_exit);
     }
 
   bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
@@ -1513,6 +1525,8 @@  slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
   exit = loop_exit;
   basic_block new_preheader = new_bbs[0];
 
+  gcc_assert (new_exit);
+
   /* Record the new loop exit information.  new_loop doesn't have SCEV data and
      so we must initialize the exit information.  */
   if (new_e)
@@ -1551,6 +1565,19 @@  slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
   for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
     rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
 
+  /* Rename the exit uses.  */
+  for (edge exit : get_loop_exit_edges (new_loop))
+    for (auto gsi = gsi_start_phis (exit->dest);
+	 !gsi_end_p (gsi); gsi_next (&gsi))
+      {
+	tree orig_def = PHI_ARG_DEF_FROM_EDGE (gsi.phi (), exit);
+	rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), exit));
+	if (MAY_HAVE_DEBUG_BIND_STMTS)
+	  adjust_debug_stmts (orig_def, PHI_RESULT (gsi.phi ()), exit->dest);
+      }
+
+  /* This condition happens when the loop has been versioned. e.g. due to ifcvt
+     versioning the loop.  */
   if (scalar_loop != loop)
     {
       /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
@@ -1564,28 +1591,83 @@  slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 						EDGE_SUCC (loop->latch, 0));
     }
 
+  auto loop_exits = get_loop_exit_edges (loop);
+  auto_vec<basic_block> doms;
+
   if (at_exit) /* Add the loop copy at exit.  */
     {
-      if (scalar_loop != loop)
+      if (scalar_loop != loop && new_exit->dest != exit_dest)
 	{
-	  gphi_iterator gsi;
 	  new_exit = redirect_edge_and_branch (new_exit, exit_dest);
+	  flush_pending_stmts (new_exit);
+	}
 
-	  for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
-	       gsi_next (&gsi))
-	    {
-	      gphi *phi = gsi.phi ();
-	      tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
-	      location_t orig_locus
-		= gimple_phi_arg_location_from_edge (phi, e);
+      auto_vec <gimple *> new_phis;
+      hash_map <tree, tree> new_phi_args;
+      /* First create the empty phi nodes so that when we flush the
+	 statements they can be filled in.   However because there is no order
+	 between the PHI nodes in the exits and the loop headers we need to
+	 order them base on the order of the two headers.  First record the new
+	 phi nodes.  */
+      for (auto gsi_from = gsi_start_phis (scalar_exit->dest);
+	   !gsi_end_p (gsi_from); gsi_next (&gsi_from))
+	{
+	  gimple *from_phi = gsi_stmt (gsi_from);
+	  tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
+	  gphi *res = create_phi_node (new_res, new_preheader);
+	  new_phis.safe_push (res);
+	}
 
-	      add_phi_arg (phi, orig_arg, new_exit, orig_locus);
+      /* Then redirect the edges and flush the changes.  This writes out the new
+	 SSA names.  */
+      for (edge exit : loop_exits)
+	{
+	  edge e = redirect_edge_and_branch (exit, new_preheader);
+	  flush_pending_stmts (e);
+	}
+
+      /* Record the new SSA names in the cache so that we can skip materializing
+	 them again when we fill in the rest of the LCSSA variables.  */
+      for (auto phi : new_phis)
+	{
+	  tree new_arg = gimple_phi_arg (phi, 0)->def;
+	  new_phi_args.put (new_arg, gimple_phi_result (phi));
+	}
+
+      /* Copy the current loop LC PHI nodes between the original loop exit
+	 block and the new loop header.  This allows us to later split the
+	 preheader block and still find the right LC nodes.  */
+      edge latch_new = single_succ_edge (new_preheader);
+      for (auto gsi_from = gsi_start_phis (loop->header),
+	   gsi_to = gsi_start_phis (new_loop->header);
+	   flow_loops && !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
+	   gsi_next (&gsi_from), gsi_next (&gsi_to))
+	{
+	  gimple *from_phi = gsi_stmt (gsi_from);
+	  gimple *to_phi = gsi_stmt (gsi_to);
+	  tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
+						loop_latch_edge (loop));
+
+	  /* Check if we've already created a new phi node during edge
+	     redirection.  If we have, only propagate the value downwards.  */
+	  if (tree *res = new_phi_args.get (new_arg))
+	    {
+	      adjust_phi_and_debug_stmts (to_phi, latch_new, *res);
+	      continue;
 	    }
+
+	  tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
+	  gphi *lcssa_phi = create_phi_node (new_res, e->dest);
+
+	  /* Main loop exit should use the final iter value.  */
+	  add_phi_arg (lcssa_phi, new_arg, loop_exit, UNKNOWN_LOCATION);
+
+	  adjust_phi_and_debug_stmts (to_phi, latch_new, new_res);
 	}
-      redirect_edge_and_branch_force (e, new_preheader);
-      flush_pending_stmts (e);
+
       set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
-      if (was_imm_dom || duplicate_outer_loop)
+
+      if ((was_imm_dom || duplicate_outer_loop))
 	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
 
       /* And remove the non-necessary forwarder again.  Keep the other
@@ -1598,6 +1680,22 @@  slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
     }
   else /* Add the copy at entry.  */
     {
+      /* Copy the current loop LC PHI nodes between the original loop exit
+	 block and the new loop header.  This allows us to later split the
+	 preheader block and still find the right LC nodes.  */
+      for (auto gsi_from = gsi_start_phis (new_loop->header),
+	   gsi_to = gsi_start_phis (loop->header);
+	   flow_loops && !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
+	   gsi_next (&gsi_from), gsi_next (&gsi_to))
+	{
+	  gimple *from_phi = gsi_stmt (gsi_from);
+	  gimple *to_phi = gsi_stmt (gsi_to);
+	  tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
+						loop_latch_edge (new_loop));
+	  adjust_phi_and_debug_stmts (to_phi, loop_preheader_edge (loop),
+				      new_arg);
+	}
+
       if (scalar_loop != loop)
 	{
 	  /* Remove the non-necessary forwarder of scalar_loop again.  */
@@ -1627,29 +1725,6 @@  slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 			       loop_preheader_edge (new_loop)->src);
     }
 
-  if (scalar_loop != loop)
-    {
-      /* Update new_loop->header PHIs, so that on the preheader
-	 edge they are the ones from loop rather than scalar_loop.  */
-      gphi_iterator gsi_orig, gsi_new;
-      edge orig_e = loop_preheader_edge (loop);
-      edge new_e = loop_preheader_edge (new_loop);
-
-      for (gsi_orig = gsi_start_phis (loop->header),
-	   gsi_new = gsi_start_phis (new_loop->header);
-	   !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
-	   gsi_next (&gsi_orig), gsi_next (&gsi_new))
-	{
-	  gphi *orig_phi = gsi_orig.phi ();
-	  gphi *new_phi = gsi_new.phi ();
-	  tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
-	  location_t orig_locus
-	    = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
-
-	  add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
-	}
-    }
-
   free (new_bbs);
   free (bbs);
 
@@ -2579,139 +2654,36 @@  vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
 
 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
    this function searches for the corresponding lcssa phi node in exit
-   bb of LOOP.  If it is found, return the phi result; otherwise return
-   NULL.  */
+   bb of LOOP following the LCSSA_EDGE to the exit node.  If it is found,
+   return the phi result; otherwise return NULL.  */
 
 static tree
 find_guard_arg (class loop *loop ATTRIBUTE_UNUSED,
 		class loop *epilog ATTRIBUTE_UNUSED,
-		const_edge e, gphi *lcssa_phi)
+		const_edge e, gphi *lcssa_phi, int lcssa_edge = 0)
 {
   gphi_iterator gsi;
 
-  gcc_assert (single_pred_p (e->dest));
   for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
     {
       gphi *phi = gsi.phi ();
-      if (operand_equal_p (PHI_ARG_DEF (phi, 0),
-			   PHI_ARG_DEF (lcssa_phi, 0), 0))
-	return PHI_RESULT (phi);
-    }
-  return NULL_TREE;
-}
-
-/* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates FIRST/SECOND
-   from SECOND/FIRST and puts it at the original loop's preheader/exit
-   edge, the two loops are arranged as below:
-
-       preheader_a:
-     first_loop:
-       header_a:
-	 i_1 = PHI<i_0, i_2>;
-	 ...
-	 i_2 = i_1 + 1;
-	 if (cond_a)
-	   goto latch_a;
-	 else
-	   goto between_bb;
-       latch_a:
-	 goto header_a;
-
-       between_bb:
-	 ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
-
-     second_loop:
-       header_b:
-	 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
-				 or with i_2 if no LCSSA phi is created
-				 under condition of CREATE_LCSSA_FOR_IV_PHIS.
-	 ...
-	 i_4 = i_3 + 1;
-	 if (cond_b)
-	   goto latch_b;
-	 else
-	   goto exit_bb;
-       latch_b:
-	 goto header_b;
-
-       exit_bb:
-
-   This function creates loop closed SSA for the first loop; update the
-   second loop's PHI nodes by replacing argument on incoming edge with the
-   result of newly created lcssa PHI nodes.  IF CREATE_LCSSA_FOR_IV_PHIS
-   is false, Loop closed ssa phis will only be created for non-iv phis for
-   the first loop.
-
-   This function assumes exit bb of the first loop is preheader bb of the
-   second loop, i.e, between_bb in the example code.  With PHIs updated,
-   the second loop will execute rest iterations of the first.  */
-
-static void
-slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
-				   class loop *first, edge first_loop_e,
-				   class loop *second, edge second_loop_e,
-				   bool create_lcssa_for_iv_phis)
-{
-  gphi_iterator gsi_update, gsi_orig;
-  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-
-  edge first_latch_e = EDGE_SUCC (first->latch, 0);
-  edge second_preheader_e = loop_preheader_edge (second);
-  basic_block between_bb = first_loop_e->dest;
-
-  gcc_assert (between_bb == second_preheader_e->src);
-  gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
-  /* Either the first loop or the second is the loop to be vectorized.  */
-  gcc_assert (loop == first || loop == second);
-
-  for (gsi_orig = gsi_start_phis (first->header),
-       gsi_update = gsi_start_phis (second->header);
-       !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
-       gsi_next (&gsi_orig), gsi_next (&gsi_update))
-    {
-      gphi *orig_phi = gsi_orig.phi ();
-      gphi *update_phi = gsi_update.phi ();
-
-      tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
-      /* Generate lcssa PHI node for the first loop.  */
-      gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
-      stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
-      if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
+      /* Nested loops with multiple exits can have different no# phi node
+	arguments between the main loop and epilog as epilog falls to the
+	second loop.  */
+      if (gimple_phi_num_args (phi) > e->dest_idx)
 	{
-	  tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
-	  gphi *lcssa_phi = create_phi_node (new_res, between_bb);
-	  add_phi_arg (lcssa_phi, arg, first_loop_e, UNKNOWN_LOCATION);
-	  arg = new_res;
-	}
-
-      /* Update PHI node in the second loop by replacing arg on the loop's
-	 incoming edge.  */
-      adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
-    }
-
-  /* For epilogue peeling we have to make sure to copy all LC PHIs
-     for correct vectorization of live stmts.  */
-  if (loop == first)
-    {
-      basic_block orig_exit = second_loop_e->dest;
-      for (gsi_orig = gsi_start_phis (orig_exit);
-	   !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
-	{
-	  gphi *orig_phi = gsi_orig.phi ();
-	  tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
-	  if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p  (orig_arg))
-	    continue;
-
-	  const_edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
-	  /* Already created in the above loop.   */
-	  if (find_guard_arg (first, second, exit_e, orig_phi))
+	 tree var = PHI_ARG_DEF (phi, e->dest_idx);
+	 if (TREE_CODE (var) != SSA_NAME)
 	    continue;
-
-	  tree new_res = copy_ssa_name (orig_arg);
-	  gphi *lcphi = create_phi_node (new_res, between_bb);
-	  add_phi_arg (lcphi, orig_arg, first_loop_e, UNKNOWN_LOCATION);
+	 tree def = get_current_def (var);
+	 if (!def)
+	   continue;
+	 if (operand_equal_p (def,
+			      PHI_ARG_DEF (lcssa_phi, lcssa_edge), 0))
+	   return PHI_RESULT (phi);
 	}
     }
+  return NULL_TREE;
 }
 
 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
@@ -2796,11 +2768,11 @@  slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
     }
 }
 
-/* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
-   from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
-   point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
-   and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
-   The CFG looks like:
+/* LOOP and EPILOG are two consecutive loops in CFG connected by LOOP_EXIT edge
+   and EPILOG is copied from LOOP.  Function slpeel_add_loop_guard adds guard
+   skipping from a point between the two loops to the end of EPILOG.  Edges
+   GUARD_EDGE and MERGE_EDGE are the two pred edges of merge_bb at the end of
+   EPILOG.  The CFG looks like:
 
      loop:
        header_a:
@@ -2851,6 +2823,7 @@  slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
 
 static void
 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
+				    const_edge loop_exit,
 				    edge guard_edge, edge merge_edge)
 {
   gphi_iterator gsi;
@@ -2859,13 +2832,11 @@  slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
   gcc_assert (single_succ_p (merge_bb));
   edge e = single_succ_edge (merge_bb);
   basic_block exit_bb = e->dest;
-  gcc_assert (single_pred_p (exit_bb));
-  gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
 
   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
     {
       gphi *update_phi = gsi.phi ();
-      tree old_arg = PHI_ARG_DEF (update_phi, 0);
+      tree old_arg = PHI_ARG_DEF (update_phi, e->dest_idx);
 
       tree merge_arg = NULL_TREE;
 
@@ -2877,8 +2848,8 @@  slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
       if (!merge_arg)
 	merge_arg = old_arg;
 
-      tree guard_arg
-	= find_guard_arg (loop, epilog, single_exit (loop), update_phi);
+      tree guard_arg = find_guard_arg (loop, epilog, loop_exit,
+				       update_phi, e->dest_idx);
       /* If the var is live after loop but not a reduction, we simply
 	 use the old arg.  */
       if (!guard_arg)
@@ -2898,21 +2869,6 @@  slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
     }
 }
 
-/* EPILOG loop is duplicated from the original loop for vectorizing,
-   the arg of its loop closed ssa PHI needs to be updated.  */
-
-static void
-slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
-{
-  gphi_iterator gsi;
-  basic_block exit_bb = single_exit (epilog)->dest;
-
-  gcc_assert (single_pred_p (exit_bb));
-  edge e = EDGE_PRED (exit_bb, 0);
-  for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
-    rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
-}
-
 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
    Return a value that equals:
 
@@ -3255,8 +3211,7 @@  vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 						       e, &prolog_e);
       gcc_assert (prolog);
       prolog->force_vectorize = false;
-      slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, prolog_e, loop,
-					 exit_e, true);
+
       first_loop = prolog;
       reset_original_copy_tables ();
 
@@ -3336,8 +3291,6 @@  vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
       LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo) = new_epilog_e;
       gcc_assert (epilog);
       epilog->force_vectorize = false;
-      slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, e, epilog,
-					 new_epilog_e, false);
       bb_before_epilog = loop_preheader_edge (epilog)->src;
 
       /* Scalar version loop may be preferred.  In this case, add guard
@@ -3430,7 +3383,9 @@  vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 					   irred_flag);
 	  if (vect_epilogues)
 	    epilogue_vinfo->skip_this_loop_edge = guard_e;
-	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e, epilog_e);
+	  edge main_iv = LOOP_VINFO_IV_EXIT (loop_vinfo);
+	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, main_iv, guard_e,
+					      epilog_e);
 	  /* Only need to handle basic block before epilog loop if it's not
 	     the guard_bb, which is the case when skip_vector is true.  */
 	  if (guard_bb != bb_before_epilog)
@@ -3441,8 +3396,6 @@  vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 	    }
 	  scale_loop_profile (epilog, prob_epilog, -1);
 	}
-      else
-	slpeel_update_phi_nodes_for_lcssa (epilog);
 
       unsigned HOST_WIDE_INT bound;
       if (bound_scalar.is_constant (&bound))
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index f1caa5f207d3b13da58c3a313b11d1ef98374349..327cab0f736da7f1bd3e024d666df46ef9208107 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -5877,7 +5877,7 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   basic_block exit_bb;
   tree scalar_dest;
   tree scalar_type;
-  gimple *new_phi = NULL, *phi;
+  gimple *new_phi = NULL, *phi = NULL;
   gimple_stmt_iterator exit_gsi;
   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
   gimple *epilog_stmt = NULL;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 55b6771b271d5072fa1327d595e1dddb112cfdf6..25ceb6600673d71fd6012443403997e921066483 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2183,7 +2183,7 @@  extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge,
 					 const_edge);
 class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *, edge,
 						    class loop *, edge,
-						    edge, edge *);
+						    edge, edge *, bool = true);
 class loop *vect_loop_versioning (loop_vec_info, gimple *);
 extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
 				    tree *, tree *, tree *, int, bool, bool,