diff mbox series

[7/21] middle-end: update IV update code to support early breaks and arbitrary exits

Message ID ZUiYG7mRzlNStIYa@arm.com
State New
Headers show
Series None | expand

Commit Message

Tamar Christina Nov. 6, 2023, 7:39 a.m. UTC
Hi All,

This changes the PHI node updates to support early breaks.
It has to support both the case where the loop's exit matches the normal loop
exit and one where the early exit is "inverted", i.e. it's an early exit edge.

In the latter case we must always restart the loop for VF iterations.  For an
early exit the reason is obvious, but there are cases where the "normal" exit
is located before the early one.  This exit then does a check on ivtmp resulting
in us leaving the loop since it thinks we're done.

In these cases we may still have side-effects to perform so we also go to the
scalar loop.

For the "normal" exit niters has already been adjusted for peeling; for the
early exits we must find out how many iterations we actually did.  So we have
to recalculate the new position for each exit.

This works, however ./gcc/testsuite/gcc.dg/vect/vect-early-break_76.c is
currently giving me a runtime failure, but I cannot seem to tell why.

The generated control flow looks correct to me; see loop 1:
https://gist.github.com/Mistuke/78b439de05e303ac6de5438dd83f079b

Any help in pointing out the mistake is appreciated.

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-loop-manip.cc (vect_set_loop_condition_normal): Hide unused.
	(vect_is_loop_exit_latch_pred): Mark inline.
	(vect_update_ivs_after_vectorizer): Support early break.
	(vect_do_peeling): Use it.
	(find_guard_arg): Keep the same value.

--- inline copy of patch -- 
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 58b4b9c11d8b844ee86156cdfcba7f838030a7c2..abd905b78f3661f80168c3866d7c3e68a9c15521 100644




--
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 58b4b9c11d8b844ee86156cdfcba7f838030a7c2..abd905b78f3661f80168c3866d7c3e68a9c15521 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -1187,7 +1187,7 @@ vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
    loop handles exactly VF scalars per iteration.  */
 
 static gcond *
-vect_set_loop_condition_normal (loop_vec_info loop_vinfo, edge exit_edge,
+vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
 				class loop *loop, tree niters, tree step,
 				tree final_iv, bool niters_maybe_zero,
 				gimple_stmt_iterator loop_cond_gsi)
@@ -1452,7 +1452,7 @@ slpeel_duplicate_current_defs_from_edges (edge from, edge to)
    When this happens we need to flip the understanding of main and other
    exits by peeling and IV updates.  */
 
-bool
+bool inline
 vect_is_loop_exit_latch_pred (edge loop_exit, class loop *loop)
 {
   return single_pred (loop->latch) == loop_exit->src;
@@ -2193,6 +2193,7 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
      Input:
      - LOOP - a loop that is going to be vectorized. The last few iterations
               of LOOP were peeled.
+     - VF   - The chosen vectorization factor for LOOP.
      - NITERS - the number of iterations that LOOP executes (before it is
                 vectorized). i.e, the number of times the ivs should be bumped.
      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
@@ -2203,6 +2204,9 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
                   The phi args associated with the edge UPDATE_E in the bb
                   UPDATE_E->dest are updated accordingly.
 
+     - MAIN_EXIT_P - Indicates whether UPDATE_E is what the vectorizer
+		     considers the main loop exit.
+
      Assumption 1: Like the rest of the vectorizer, this function assumes
      a single loop exit that has a single predecessor.
 
@@ -2220,18 +2224,21 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
  */
 
 static void
-vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
-				  tree niters, edge update_e)
+vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, poly_uint64 vf,
+				  tree niters, edge update_e, bool main_exit_p)
 {
   gphi_iterator gsi, gsi1;
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   basic_block update_bb = update_e->dest;
+  bool inversed_iv
+	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT (loop_vinfo),
+					 LOOP_VINFO_LOOP (loop_vinfo));
 
-  basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
-
-  /* Make sure there exists a single-predecessor exit bb:  */
-  gcc_assert (single_pred_p (exit_bb));
-  gcc_assert (single_succ_edge (exit_bb) == update_e);
+  edge loop_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
+  gcond *cond = get_loop_exit_condition (loop_e);
+  basic_block exit_bb = loop_e->dest;
+  basic_block iv_block = NULL;
+  gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
 
   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
@@ -2241,7 +2248,6 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
       tree step_expr, off;
       tree type;
       tree var, ni, ni_name;
-      gimple_stmt_iterator last_gsi;
 
       gphi *phi = gsi.phi ();
       gphi *phi1 = gsi1.phi ();
@@ -2273,11 +2279,52 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
       enum vect_induction_op_type induction_type
 	= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
 
-      if (induction_type == vect_step_op_add)
+      tree iv_var = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
+      /* create_iv always places it on the LHS.  Alternatively we can set a
+	 property during create_iv to identify it.  */
+      bool ivtemp = gimple_cond_lhs (cond) == iv_var;
+      if ((!main_exit_p || inversed_iv) && ivtemp)
+	{
+	  step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
+	  type = TREE_TYPE (gimple_phi_result (phi));
+	  ni = build_int_cst (type, vf);
+	}
+      else if (!main_exit_p && inversed_iv)
+	continue;
+      else if (induction_type == vect_step_op_add)
 	{
+
 	  tree stype = TREE_TYPE (step_expr);
-	  off = fold_build2 (MULT_EXPR, stype,
-			     fold_convert (stype, niters), step_expr);
+
+	  /* Early exits always use last iter value not niters. */
+	  if (!main_exit_p || (main_exit_p && inversed_iv))
+	    {
+	      /* Live statements in the non-main exit shouldn't be adjusted.  We
+		 normally didn't have this problem with a single exit as live
+		 values would be in the exit block.  However when dealing with
+		 multiple exits all exits are redirected to the merge block
+		 and we restart the iteration.  */
+	      if (STMT_VINFO_LIVE_P (phi_info))
+		continue;
+
+	      /* For early break the final loop IV is:
+		 init + (final - init) * vf which takes into account peeling
+		 values and non-single steps.  The main exit can use niters
+		 since if you exit from the main exit you've done all vector
+		 iterations.  For an early exit we don't know when we exit so we
+		 must re-calculate this on the exit.  */
+	      tree start_expr = gimple_phi_result (phi);
+	      off = fold_build2 (MINUS_EXPR, stype,
+				 fold_convert (stype, start_expr),
+				 fold_convert (stype, init_expr));
+	      /* Now adjust for VF to get the final iteration value.  */
+	      off = fold_build2 (MULT_EXPR, stype, off,
+				 build_int_cst (stype, vf));
+	    }
+	  else
+	    off = fold_build2 (MULT_EXPR, stype,
+			       fold_convert (stype, niters), step_expr);
+
 	  if (POINTER_TYPE_P (type))
 	    ni = fold_build_pointer_plus (init_expr, off);
 	  else
@@ -2289,6 +2336,8 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
       /* Don't bother call vect_peel_nonlinear_iv_init.  */
       else if (induction_type == vect_step_op_neg)
 	ni = init_expr;
+      else if (!main_exit_p)
+	continue;
       else
 	ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
 					  niters, step_expr,
@@ -2296,9 +2345,20 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
 
       var = create_tmp_var (type, "tmp");
 
-      last_gsi = gsi_last_bb (exit_bb);
       gimple_seq new_stmts = NULL;
       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
+
+      /* For non-main exit create an intermediate edge to get any updated iv
+	 calculations.  */
+      if (!main_exit_p
+	  && !iv_block
+	  && (!gimple_seq_empty_p (stmts) || !gimple_seq_empty_p (new_stmts)))
+	{
+	  iv_block = split_edge (update_e);
+	  update_e = single_succ_edge (update_e->dest);
+	  last_gsi = gsi_last_bb (iv_block);
+	}
+
       /* Exit_bb shouldn't be empty.  */
       if (!gsi_end_p (last_gsi))
 	{
@@ -2836,12 +2896,18 @@ find_guard_arg (class loop *loop ATTRIBUTE_UNUSED, const_edge loop_e,
 	 tree var = PHI_ARG_DEF (phi, loop_e->dest_idx);
 	 if (TREE_CODE (var) != SSA_NAME)
 	    continue;
-	 tree def = get_current_def (var);
-	 if (!def)
-	   continue;
-	 if (operand_equal_p (def,
-			      PHI_ARG_DEF (lcssa_phi, lcssa_edge), 0))
-	   return PHI_RESULT (phi);
+
+	  /* The value could be carried all the way from the loop version block
+	     in which case we wouldn't have kept the value if it's not used in
+	     the loop.  In such cases get_current_def returns null as the value
+	     is already current.  */
+	  tree orig_var = get_current_def (var);
+	  if (!orig_var)
+	    orig_var = var;
+
+	  if (operand_equal_p (orig_var,
+			       PHI_ARG_DEF (lcssa_phi, lcssa_edge), 0))
+	    return PHI_RESULT (phi);
 	}
     }
   return NULL_TREE;
@@ -3528,8 +3594,21 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 	 niters_vector_mult_vf steps.  */
       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
       update_e = skip_vector ? e : loop_preheader_edge (epilog);
-      vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
-					update_e);
+      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	update_e = single_succ_edge (e->dest);
+      bool inversed_iv
+	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT (loop_vinfo),
+					 LOOP_VINFO_LOOP (loop_vinfo));
+
+      for (auto exit : get_loop_exit_edges (loop))
+	{
+	  bool main_exit_p = vect_is_loop_exit_latch_pred (exit, loop);
+	  edge exit_e = main_exit_p ? update_e : exit;
+	  vect_update_ivs_after_vectorizer (loop_vinfo, vf,
+					    niters_vector_mult_vf, exit_e,
+					    main_exit_p);
+
+	}
 
       if (skip_epilog)
 	{

Comments

Tamar Christina Nov. 15, 2023, 12:03 a.m. UTC | #1
Patch updated to latest trunk:

Hi All,

This changes the PHI node updates to support early breaks.
It has to support both the case where the loop's exit matches the normal loop
exit and one where the early exit is "inverted", i.e. it's an early exit edge.

In the latter case we must always restart the loop for VF iterations.  For an
early exit the reason is obvious, but there are cases where the "normal" exit
is located before the early one.  This exit then does a check on ivtmp resulting
in us leaving the loop since it thinks we're done.

In these cases we may still have side-effects to perform so we also go to the
scalar loop.

For the "normal" exit niters has already been adjusted for peeling; for the
early exits we must find out how many iterations we actually did.  So we have
to recalculate the new position for each exit.

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-loop-manip.cc (vect_set_loop_condition_normal): Hide unused.
	(vect_update_ivs_after_vectorizer): Support early break.
	(vect_do_peeling): Use it.

--- inline copy of patch ---

diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index d3fa8699271c4d7f404d648a38a95beabeabc99a..e1d210ab4617c894dab3d2654cf1c842baac58f5 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -1200,7 +1200,7 @@ vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
    loop handles exactly VF scalars per iteration.  */
 
 static gcond *
-vect_set_loop_condition_normal (loop_vec_info loop_vinfo, edge exit_edge,
+vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
 				class loop *loop, tree niters, tree step,
 				tree final_iv, bool niters_maybe_zero,
 				gimple_stmt_iterator loop_cond_gsi)
@@ -1412,7 +1412,7 @@ vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo
    When this happens we need to flip the understanding of main and other
    exits by peeling and IV updates.  */
 
-bool inline
+bool
 vect_is_loop_exit_latch_pred (edge loop_exit, class loop *loop)
 {
   return single_pred (loop->latch) == loop_exit->src;
@@ -2142,6 +2142,7 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
      Input:
      - LOOP - a loop that is going to be vectorized. The last few iterations
               of LOOP were peeled.
+     - VF   - The chosen vectorization factor for LOOP.
      - NITERS - the number of iterations that LOOP executes (before it is
                 vectorized). i.e, the number of times the ivs should be bumped.
      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
@@ -2152,6 +2153,9 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
                   The phi args associated with the edge UPDATE_E in the bb
                   UPDATE_E->dest are updated accordingly.
 
+     - restart_loop - Indicates whether the scalar loop needs to restart the
+		      iteration count where the vector loop began.
+
      Assumption 1: Like the rest of the vectorizer, this function assumes
      a single loop exit that has a single predecessor.
 
@@ -2169,18 +2173,22 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
  */
 
 static void
-vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
-				  tree niters, edge update_e)
+vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, poly_uint64 vf,
+				  tree niters, edge update_e, bool restart_loop)
 {
   gphi_iterator gsi, gsi1;
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   basic_block update_bb = update_e->dest;
-
-  basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
-
-  /* Make sure there exists a single-predecessor exit bb:  */
-  gcc_assert (single_pred_p (exit_bb));
-  gcc_assert (single_succ_edge (exit_bb) == update_e);
+  bool inversed_iv
+	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT (loop_vinfo),
+					 LOOP_VINFO_LOOP (loop_vinfo));
+  bool needs_interm_block = LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
+			    && flow_bb_inside_loop_p (loop, update_e->src);
+  edge loop_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
+  gcond *cond = get_loop_exit_condition (loop_e);
+  basic_block exit_bb = loop_e->dest;
+  basic_block iv_block = NULL;
+  gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
 
   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
@@ -2190,7 +2198,6 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
       tree step_expr, off;
       tree type;
       tree var, ni, ni_name;
-      gimple_stmt_iterator last_gsi;
 
       gphi *phi = gsi.phi ();
       gphi *phi1 = gsi1.phi ();
@@ -2222,11 +2229,52 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
       enum vect_induction_op_type induction_type
 	= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
 
-      if (induction_type == vect_step_op_add)
+      tree iv_var = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
+      /* create_iv always places it on the LHS.  Alternatively we can set a
+	 property during create_iv to identify it.  */
+      bool ivtemp = gimple_cond_lhs (cond) == iv_var;
+      if (restart_loop && ivtemp)
 	{
+	  type = TREE_TYPE (gimple_phi_result (phi));
+	  ni = build_int_cst (type, vf);
+	  if (inversed_iv)
+	    ni = fold_build2 (MINUS_EXPR, type, ni,
+			      fold_convert (type, step_expr));
+	}
+      else if (induction_type == vect_step_op_add)
+	{
+
 	  tree stype = TREE_TYPE (step_expr);
-	  off = fold_build2 (MULT_EXPR, stype,
-			     fold_convert (stype, niters), step_expr);
+
+	  /* Early exits always use last iter value not niters. */
+	  if (restart_loop)
+	    {
+	      /* Live statements in the non-main exit shouldn't be adjusted.  We
+		 normally didn't have this problem with a single exit as live
+		 values would be in the exit block.  However when dealing with
+		 multiple exits all exits are redirected to the merge block
+		 and we restart the iteration.  */
+	      if (STMT_VINFO_LIVE_P (phi_info))
+		continue;
+
+	      /* For early break the final loop IV is:
+		 init + (final - init) * vf which takes into account peeling
+		 values and non-single steps.  The main exit can use niters
+		 since if you exit from the main exit you've done all vector
+		 iterations.  For an early exit we don't know when we exit so we
+		 must re-calculate this on the exit.  */
+	      tree start_expr = gimple_phi_result (phi);
+	      off = fold_build2 (MINUS_EXPR, stype,
+				 fold_convert (stype, start_expr),
+				 fold_convert (stype, init_expr));
+	      /* Now adjust for VF to get the final iteration value.  */
+	      off = fold_build2 (MULT_EXPR, stype, off,
+				 build_int_cst (stype, vf));
+	    }
+	  else
+	    off = fold_build2 (MULT_EXPR, stype,
+			       fold_convert (stype, niters), step_expr);
+
 	  if (POINTER_TYPE_P (type))
 	    ni = fold_build_pointer_plus (init_expr, off);
 	  else
@@ -2238,6 +2286,8 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
       /* Don't bother call vect_peel_nonlinear_iv_init.  */
       else if (induction_type == vect_step_op_neg)
 	ni = init_expr;
+      else if (restart_loop)
+	continue;
       else
 	ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
 					  niters, step_expr,
@@ -2245,9 +2295,20 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
 
       var = create_tmp_var (type, "tmp");
 
-      last_gsi = gsi_last_bb (exit_bb);
       gimple_seq new_stmts = NULL;
       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
+
+      /* For non-main exit create an intermediate edge to get any updated iv
+	 calculations.  */
+      if (needs_interm_block
+	  && !iv_block
+	  && (!gimple_seq_empty_p (stmts) || !gimple_seq_empty_p (new_stmts)))
+	{
+	  iv_block = split_edge (update_e);
+	  update_e = single_succ_edge (update_e->dest);
+	  last_gsi = gsi_last_bb (iv_block);
+	}
+
       /* Exit_bb shouldn't be empty.  */
       if (!gsi_end_p (last_gsi))
 	{
@@ -3342,8 +3403,26 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 	 niters_vector_mult_vf steps.  */
       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
       update_e = skip_vector ? e : loop_preheader_edge (epilog);
-      vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
-					update_e);
+      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	update_e = single_succ_edge (e->dest);
+      bool inversed_iv
+	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT (loop_vinfo),
+					 LOOP_VINFO_LOOP (loop_vinfo));
+
+      /* Update the main exit first.  */
+      vect_update_ivs_after_vectorizer (loop_vinfo, vf, niters_vector_mult_vf,
+					update_e, inversed_iv);
+
+      /* And then update the early exits.  */
+      for (auto exit : get_loop_exit_edges (loop))
+	{
+	  if (exit == LOOP_VINFO_IV_EXIT (loop_vinfo))
+	    continue;
+
+	  vect_update_ivs_after_vectorizer (loop_vinfo, vf,
+					    niters_vector_mult_vf,
+					    exit, true);
+	}
 
       if (skip_epilog)
 	{
Richard Biener Nov. 15, 2023, 1:01 p.m. UTC | #2
On Wed, 15 Nov 2023, Tamar Christina wrote:

> Patch updated to latest trunk:
> 
> Hi All,
> 
> This changes the PHI node updates to support early breaks.
> It has to support both the case where the loop's exit matches the normal loop
> exit and one where the early exit is "inverted", i.e. it's an early exit edge.
> 
> In the latter case we must always restart the loop for VF iterations.  For an
> early exit the reason is obvious, but there are cases where the "normal" exit
> is located before the early one.  This exit then does a check on ivtmp resulting
> in us leaving the loop since it thinks we're done.
> 
> In these case we may still have side-effects to perform so we also go to the
> scalar loop.
> 
> For the "normal" exit niters has already been adjusted for peeling, for the
> early exits we must find out how many iterations we actually did.  So we have
> to recalculate the new position for each exit.
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* tree-vect-loop-manip.cc (vect_set_loop_condition_normal): Hide unused.
> 	(vect_update_ivs_after_vectorizer): Support early break.
> 	(vect_do_peeling): Use it.
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index d3fa8699271c4d7f404d648a38a95beabeabc99a..e1d210ab4617c894dab3d2654cf1c842baac58f5 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -1200,7 +1200,7 @@ vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
>     loop handles exactly VF scalars per iteration.  */
>  
>  static gcond *
> -vect_set_loop_condition_normal (loop_vec_info loop_vinfo, edge exit_edge,
> +vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
>  				class loop *loop, tree niters, tree step,
>  				tree final_iv, bool niters_maybe_zero,
>  				gimple_stmt_iterator loop_cond_gsi)
> @@ -1412,7 +1412,7 @@ vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo
>     When this happens we need to flip the understanding of main and other
>     exits by peeling and IV updates.  */
>  
> -bool inline
> +bool
>  vect_is_loop_exit_latch_pred (edge loop_exit, class loop *loop)
>  {
>    return single_pred (loop->latch) == loop_exit->src;
> @@ -2142,6 +2142,7 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
>       Input:
>       - LOOP - a loop that is going to be vectorized. The last few iterations
>                of LOOP were peeled.
> +     - VF   - The chosen vectorization factor for LOOP.
>       - NITERS - the number of iterations that LOOP executes (before it is
>                  vectorized). i.e, the number of times the ivs should be bumped.
>       - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path

the comment on this is now a bit misleading, can you try to update it
and/or move the comment bits to the docs on EARLY_EXIT?

> @@ -2152,6 +2153,9 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
>                    The phi args associated with the edge UPDATE_E in the bb
>                    UPDATE_E->dest are updated accordingly.
>  
> +     - restart_loop - Indicates whether the scalar loop needs to restart the

params are ALL_CAPS

> +		      iteration count where the vector loop began.
> +
>       Assumption 1: Like the rest of the vectorizer, this function assumes
>       a single loop exit that has a single predecessor.
>  
> @@ -2169,18 +2173,22 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
>   */
>  
>  static void
> -vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
> -				  tree niters, edge update_e)
> +vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, poly_uint64 vf,

LOOP_VINFO_VECT_FACTOR?

> +				  tree niters, edge update_e, bool restart_loop)

I think 'bool early_exit' is better here?  I wonder: if we have an "early"
exit after the main exit, we can probably be sure there are no side-effects
to re-execute and could avoid this restarting?

>  {
>    gphi_iterator gsi, gsi1;
>    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
>    basic_block update_bb = update_e->dest;
> -
> -  basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> -
> -  /* Make sure there exists a single-predecessor exit bb:  */
> -  gcc_assert (single_pred_p (exit_bb));
> -  gcc_assert (single_succ_edge (exit_bb) == update_e);
> +  bool inversed_iv
> +	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT (loop_vinfo),
> +					 LOOP_VINFO_LOOP (loop_vinfo));
> +  bool needs_interm_block = LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> +			    && flow_bb_inside_loop_p (loop, update_e->src);
> +  edge loop_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
> +  gcond *cond = get_loop_exit_condition (loop_e);
> +  basic_block exit_bb = loop_e->dest;
> +  basic_block iv_block = NULL;
> +  gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
>  
>    for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
>         !gsi_end_p (gsi) && !gsi_end_p (gsi1);
> @@ -2190,7 +2198,6 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
>        tree step_expr, off;
>        tree type;
>        tree var, ni, ni_name;
> -      gimple_stmt_iterator last_gsi;
>  
>        gphi *phi = gsi.phi ();
>        gphi *phi1 = gsi1.phi ();
> @@ -2222,11 +2229,52 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
>        enum vect_induction_op_type induction_type
>  	= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
>  
> -      if (induction_type == vect_step_op_add)
> +      tree iv_var = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> +      /* create_iv always places it on the LHS.  Alternatively we can set a
> +	 property during create_iv to identify it.  */
> +      bool ivtemp = gimple_cond_lhs (cond) == iv_var;
> +      if (restart_loop && ivtemp)
>  	{
> +	  type = TREE_TYPE (gimple_phi_result (phi));
> +	  ni = build_int_cst (type, vf);
> +	  if (inversed_iv)
> +	    ni = fold_build2 (MINUS_EXPR, type, ni,
> +			      fold_convert (type, step_expr));
> +	}
> +      else if (induction_type == vect_step_op_add)
> +	{
> +
>  	  tree stype = TREE_TYPE (step_expr);
> -	  off = fold_build2 (MULT_EXPR, stype,
> -			     fold_convert (stype, niters), step_expr);
> +
> +	  /* Early exits always use last iter value not niters. */
> +	  if (restart_loop)
> +	    {
> +	      /* Live statements in the non-main exit shouldn't be adjusted.  We
> +		 normally didn't have this problem with a single exit as live
> +		 values would be in the exit block.  However when dealing with
> +		 multiple exits all exits are redirected to the merge block
> +		 and we restart the iteration.  */

Hmm, I fail to see how this works - we're either using the value to 
continue the induction or not, independent of STMT_VINFO_LIVE_P.

> +	      if (STMT_VINFO_LIVE_P (phi_info))
> +		continue;
> +
> +	      /* For early break the final loop IV is:
> +		 init + (final - init) * vf which takes into account peeling
> +		 values and non-single steps.  The main exit can use niters
> +		 since if you exit from the main exit you've done all vector
> +		 iterations.  For an early exit we don't know when we exit so we
> +		 must re-calculate this on the exit.  */
> +	      tree start_expr = gimple_phi_result (phi);
> +	      off = fold_build2 (MINUS_EXPR, stype,
> +				 fold_convert (stype, start_expr),
> +				 fold_convert (stype, init_expr));
> +	      /* Now adjust for VF to get the final iteration value.  */
> +	      off = fold_build2 (MULT_EXPR, stype, off,
> +				 build_int_cst (stype, vf));
> +	    }
> +	  else
> +	    off = fold_build2 (MULT_EXPR, stype,
> +			       fold_convert (stype, niters), step_expr);
> +
>  	  if (POINTER_TYPE_P (type))
>  	    ni = fold_build_pointer_plus (init_expr, off);
>  	  else
> @@ -2238,6 +2286,8 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
>        /* Don't bother call vect_peel_nonlinear_iv_init.  */
>        else if (induction_type == vect_step_op_neg)
>  	ni = init_expr;
> +      else if (restart_loop)
> +	continue;

This looks all a bit complicated - why wouldn't we simply always
use the PHI result when 'restart_loop'?  Isn't that the correct old start
value in all cases?

>        else
>  	ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
>  					  niters, step_expr,
> @@ -2245,9 +2295,20 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
>  
>        var = create_tmp_var (type, "tmp");
>  
> -      last_gsi = gsi_last_bb (exit_bb);
>        gimple_seq new_stmts = NULL;
>        ni_name = force_gimple_operand (ni, &new_stmts, false, var);
> +
> +      /* For non-main exit create an intermediat edge to get any updated iv
> +	 calculations.  */
> +      if (needs_interm_block
> +	  && !iv_block
> +	  && (!gimple_seq_empty_p (stmts) || !gimple_seq_empty_p (new_stmts)))
> +	{
> +	  iv_block = split_edge (update_e);
> +	  update_e = single_succ_edge (update_e->dest);
> +	  last_gsi = gsi_last_bb (iv_block);
> +	}
> +
>        /* Exit_bb shouldn't be empty.  */
>        if (!gsi_end_p (last_gsi))
>  	{
> @@ -3342,8 +3403,26 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
>  	 niters_vector_mult_vf steps.  */
>        gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
>        update_e = skip_vector ? e : loop_preheader_edge (epilog);
> -      vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
> -					update_e);
> +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> +	update_e = single_succ_edge (e->dest);
> +      bool inversed_iv
> +	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT (loop_vinfo),
> +					 LOOP_VINFO_LOOP (loop_vinfo));

You are computing this here and in vect_update_ivs_after_vectorizer?

> +
> +      /* Update the main exit first.  */
> +      vect_update_ivs_after_vectorizer (loop_vinfo, vf, niters_vector_mult_vf,
> +					update_e, inversed_iv);
> +
> +      /* And then update the early exits.  */
> +      for (auto exit : get_loop_exit_edges (loop))
> +	{
> +	  if (exit == LOOP_VINFO_IV_EXIT (loop_vinfo))
> +	    continue;
> +
> +	  vect_update_ivs_after_vectorizer (loop_vinfo, vf,
> +					    niters_vector_mult_vf,
> +					    exit, true);

... why does the same not work here?  Wouldn't the proper condition
be !dominated_by_p (CDI_DOMINATORS, exit->src, LOOP_VINFO_IV_EXIT 
(loop_vinfo)->src) or similar?  That is, whether the exit is at or
after the main IV exit?  (consider having two)

> +	}
>  
>        if (skip_epilog)
>  	{
>
Tamar Christina Nov. 15, 2023, 1:09 p.m. UTC | #3
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Wednesday, November 15, 2023 1:01 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> Subject: RE: [PATCH 7/21]middle-end: update IV update code to support early
> breaks and arbitrary exits
> 
> On Wed, 15 Nov 2023, Tamar Christina wrote:
> 
> > Patch updated to latest trunk:
> >
> > Hi All,
> >
> > This changes the PHI node updates to support early breaks.
> > It has to support both the case where the loop's exit matches the
> > normal loop exit and one where the early exit is "inverted", i.e. it's an early
> exit edge.
> >
> > In the latter case we must always restart the loop for VF iterations.
> > For an early exit the reason is obvious, but there are cases where the
> > "normal" exit is located before the early one.  This exit then does a
> > check on ivtmp resulting in us leaving the loop since it thinks we're done.
> >
> > In these case we may still have side-effects to perform so we also go
> > to the scalar loop.
> >
> > For the "normal" exit niters has already been adjusted for peeling,
> > for the early exits we must find out how many iterations we actually
> > did.  So we have to recalculate the new position for each exit.
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* tree-vect-loop-manip.cc (vect_set_loop_condition_normal): Hide
> unused.
> > 	(vect_update_ivs_after_vectorizer): Support early break.
> > 	(vect_do_peeling): Use it.
> >
> > --- inline copy of patch ---
> >
> > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> > index
> >
> d3fa8699271c4d7f404d648a38a95beabeabc99a..e1d210ab4617c894dab3
> d2654cf1
> > c842baac58f5 100644
> > --- a/gcc/tree-vect-loop-manip.cc
> > +++ b/gcc/tree-vect-loop-manip.cc
> > @@ -1200,7 +1200,7 @@ vect_set_loop_condition_partial_vectors_avx512
> (class loop *loop,
> >     loop handles exactly VF scalars per iteration.  */
> >
> >  static gcond *
> > -vect_set_loop_condition_normal (loop_vec_info loop_vinfo, edge
> > exit_edge,
> > +vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge
> > +exit_edge,
> >  				class loop *loop, tree niters, tree step,
> >  				tree final_iv, bool niters_maybe_zero,
> >  				gimple_stmt_iterator loop_cond_gsi) @@ -
> 1412,7 +1412,7 @@
> > vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info
> loop_vinfo
> >     When this happens we need to flip the understanding of main and other
> >     exits by peeling and IV updates.  */
> >
> > -bool inline
> > +bool
> >  vect_is_loop_exit_latch_pred (edge loop_exit, class loop *loop)  {
> >    return single_pred (loop->latch) == loop_exit->src; @@ -2142,6
> > +2142,7 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
> >       Input:
> >       - LOOP - a loop that is going to be vectorized. The last few iterations
> >                of LOOP were peeled.
> > +     - VF   - The chosen vectorization factor for LOOP.
> >       - NITERS - the number of iterations that LOOP executes (before it is
> >                  vectorized). i.e, the number of times the ivs should be bumped.
> >       - UPDATE_E - a successor edge of LOOP->exit that is on the
> > (only) path
> 
> the comment on this is now a bit misleading, can you try to update it and/or
> move the comment bits to the docs on EARLY_EXIT?
> 
> > @@ -2152,6 +2153,9 @@ vect_can_advance_ivs_p (loop_vec_info
> loop_vinfo)
> >                    The phi args associated with the edge UPDATE_E in the bb
> >                    UPDATE_E->dest are updated accordingly.
> >
> > +     - restart_loop - Indicates whether the scalar loop needs to
> > + restart the
> 
> params are ALL_CAPS
> 
> > +		      iteration count where the vector loop began.
> > +
> >       Assumption 1: Like the rest of the vectorizer, this function assumes
> >       a single loop exit that has a single predecessor.
> >
> > @@ -2169,18 +2173,22 @@ vect_can_advance_ivs_p (loop_vec_info
> loop_vinfo)
> >   */
> >
> >  static void
> > -vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
> > -				  tree niters, edge update_e)
> > +vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
> > +poly_uint64 vf,
> 
> LOOP_VINFO_VECT_FACTOR?
> 
> > +				  tree niters, edge update_e, bool
> restart_loop)
> 
> I think 'bool early_exit' is better here?  I wonder if we have an "early"
> exit after the main exit we are probably sure there are no side-effects to re-
> execute and could avoid this restarting?

Side effects yes, but the actual check may not have been performed yet.
If you remember https://gist.github.com/Mistuke/66f14fe5c1be32b91ce149bd9b8bb35f
in the clz loop there, when leaving through the "main" exit you still have to check
that the iteration did not contain the entry.  This is because the loop counter is
incremented before you iterate.

> 
> >  {
> >    gphi_iterator gsi, gsi1;
> >    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> >    basic_block update_bb = update_e->dest;
> > -
> > -  basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> > -
> > -  /* Make sure there exists a single-predecessor exit bb:  */
> > -  gcc_assert (single_pred_p (exit_bb));
> > -  gcc_assert (single_succ_edge (exit_bb) == update_e);
> > +  bool inversed_iv
> > +	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT (loop_vinfo),
> > +					 LOOP_VINFO_LOOP (loop_vinfo));
> > +  bool needs_interm_block = LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> > +			    && flow_bb_inside_loop_p (loop, update_e->src);
> > +  edge loop_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
> > +  gcond *cond = get_loop_exit_condition (loop_e);
> > +  basic_block exit_bb = loop_e->dest;
> > +  basic_block iv_block = NULL;
> > +  gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
> >
> >    for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
> >         !gsi_end_p (gsi) && !gsi_end_p (gsi1); @@ -2190,7 +2198,6 @@
> > vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
> >        tree step_expr, off;
> >        tree type;
> >        tree var, ni, ni_name;
> > -      gimple_stmt_iterator last_gsi;
> >
> >        gphi *phi = gsi.phi ();
> >        gphi *phi1 = gsi1.phi ();
> > @@ -2222,11 +2229,52 @@ vect_update_ivs_after_vectorizer
> (loop_vec_info loop_vinfo,
> >        enum vect_induction_op_type induction_type
> >  	= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
> >
> > -      if (induction_type == vect_step_op_add)
> > +      tree iv_var = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> > +      /* create_iv always places it on the LHS.  Alternatively we can set a
> > +	 property during create_iv to identify it.  */
> > +      bool ivtemp = gimple_cond_lhs (cond) == iv_var;
> > +      if (restart_loop && ivtemp)
> >  	{
> > +	  type = TREE_TYPE (gimple_phi_result (phi));
> > +	  ni = build_int_cst (type, vf);
> > +	  if (inversed_iv)
> > +	    ni = fold_build2 (MINUS_EXPR, type, ni,
> > +			      fold_convert (type, step_expr));
> > +	}
> > +      else if (induction_type == vect_step_op_add)
> > +	{
> > +
> >  	  tree stype = TREE_TYPE (step_expr);
> > -	  off = fold_build2 (MULT_EXPR, stype,
> > -			     fold_convert (stype, niters), step_expr);
> > +
> > +	  /* Early exits always use last iter value not niters. */
> > +	  if (restart_loop)
> > +	    {
> > +	      /* Live statements in the non-main exit shouldn't be adjusted.  We
> > +		 normally didn't have this problem with a single exit as live
> > +		 values would be in the exit block.  However when dealing with
> > +		 multiple exits all exits are redirected to the merge block
> > +		 and we restart the iteration.  */
> 
> Hmm, I fail to see how this works - we're either using the value to continue the
> induction or not, independent of STMT_VINFO_LIVE_P.

That becomes clear in the patch to update live reductions.  Essentially, any live
reduction inside an alternative exit will reduce to the first element
rather than the last, and use that as the seed for the scalar loop.

It has to do this since you still have to perform the side effects for the
non-matching elements.

Regards,
Tamar

> 
> > +	      if (STMT_VINFO_LIVE_P (phi_info))
> > +		continue;
> > +
> > +	      /* For early break the final loop IV is:
> > +		 init + (final - init) * vf which takes into account peeling
> > +		 values and non-single steps.  The main exit can use niters
> > +		 since if you exit from the main exit you've done all vector
> > +		 iterations.  For an early exit we don't know when we exit so
> we
> > +		 must re-calculate this on the exit.  */
> > +	      tree start_expr = gimple_phi_result (phi);
> > +	      off = fold_build2 (MINUS_EXPR, stype,
> > +				 fold_convert (stype, start_expr),
> > +				 fold_convert (stype, init_expr));
> > +	      /* Now adjust for VF to get the final iteration value.  */
> > +	      off = fold_build2 (MULT_EXPR, stype, off,
> > +				 build_int_cst (stype, vf));
> > +	    }
> > +	  else
> > +	    off = fold_build2 (MULT_EXPR, stype,
> > +			       fold_convert (stype, niters), step_expr);
> > +
> >  	  if (POINTER_TYPE_P (type))
> >  	    ni = fold_build_pointer_plus (init_expr, off);
> >  	  else
> > @@ -2238,6 +2286,8 @@ vect_update_ivs_after_vectorizer (loop_vec_info
> loop_vinfo,
> >        /* Don't bother call vect_peel_nonlinear_iv_init.  */
> >        else if (induction_type == vect_step_op_neg)
> >  	ni = init_expr;
> > +      else if (restart_loop)
> > +	continue;
> 
> This looks all a bit complicated - why wouldn't we simply always use the PHI
> result when 'restart_loop'?  Isn't that the correct old start value in all cases?
> 
> >        else
> >  	ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
> >  					  niters, step_expr,
> > @@ -2245,9 +2295,20 @@ vect_update_ivs_after_vectorizer
> (loop_vec_info
> > loop_vinfo,
> >
> >        var = create_tmp_var (type, "tmp");
> >
> > -      last_gsi = gsi_last_bb (exit_bb);
> >        gimple_seq new_stmts = NULL;
> >        ni_name = force_gimple_operand (ni, &new_stmts, false, var);
> > +
> > +      /* For non-main exit create an intermediat edge to get any updated iv
> > +	 calculations.  */
> > +      if (needs_interm_block
> > +	  && !iv_block
> > +	  && (!gimple_seq_empty_p (stmts) || !gimple_seq_empty_p
> (new_stmts)))
> > +	{
> > +	  iv_block = split_edge (update_e);
> > +	  update_e = single_succ_edge (update_e->dest);
> > +	  last_gsi = gsi_last_bb (iv_block);
> > +	}
> > +
> >        /* Exit_bb shouldn't be empty.  */
> >        if (!gsi_end_p (last_gsi))
> >  	{
> > @@ -3342,8 +3403,26 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree
> niters, tree nitersm1,
> >  	 niters_vector_mult_vf steps.  */
> >        gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
> >        update_e = skip_vector ? e : loop_preheader_edge (epilog);
> > -      vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
> > -					update_e);
> > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > +	update_e = single_succ_edge (e->dest);
> > +      bool inversed_iv
> > +	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT (loop_vinfo),
> > +					 LOOP_VINFO_LOOP (loop_vinfo));
> 
> You are computing this here and in vect_update_ivs_after_vectorizer?
> 
> > +
> > +      /* Update the main exit first.  */
> > +      vect_update_ivs_after_vectorizer (loop_vinfo, vf, niters_vector_mult_vf,
> > +					update_e, inversed_iv);
> > +
> > +      /* And then update the early exits.  */
> > +      for (auto exit : get_loop_exit_edges (loop))
> > +	{
> > +	  if (exit == LOOP_VINFO_IV_EXIT (loop_vinfo))
> > +	    continue;
> > +
> > +	  vect_update_ivs_after_vectorizer (loop_vinfo, vf,
> > +					    niters_vector_mult_vf,
> > +					    exit, true);
> 
> ... why does the same not work here?  Wouldn't the proper condition be
> !dominated_by_p (CDI_DOMINATORS, exit->src, LOOP_VINFO_IV_EXIT
> (loop_vinfo)->src) or similar?  That is, whether the exit is at or after the main IV
> exit?  (consider having two)
> 
> > +	}
> >
> >        if (skip_epilog)
> >  	{
> >
Richard Biener Nov. 15, 2023, 1:22 p.m. UTC | #4
On Wed, 15 Nov 2023, Tamar Christina wrote:

> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Wednesday, November 15, 2023 1:01 PM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> > Subject: RE: [PATCH 7/21]middle-end: update IV update code to support early
> > breaks and arbitrary exits
> > 
> > On Wed, 15 Nov 2023, Tamar Christina wrote:
> > 
> > > Patch updated to latest trunk:
> > >
> > > Hi All,
> > >
> > > This changes the PHI node updates to support early breaks.
> > > It has to support both the case where the loop's exit matches the
> > > normal loop exit and one where the early exit is "inverted", i.e. it's an early
> > exit edge.
> > >
> > > In the latter case we must always restart the loop for VF iterations.
> > > For an early exit the reason is obvious, but there are cases where the
> > > "normal" exit is located before the early one.  This exit then does a
> > > check on ivtmp resulting in us leaving the loop since it thinks we're done.
> > >
> > > In these case we may still have side-effects to perform so we also go
> > > to the scalar loop.
> > >
> > > For the "normal" exit niters has already been adjusted for peeling,
> > > for the early exits we must find out how many iterations we actually
> > > did.  So we have to recalculate the new position for each exit.
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > > 	* tree-vect-loop-manip.cc (vect_set_loop_condition_normal): Hide
> > unused.
> > > 	(vect_update_ivs_after_vectorizer): Support early break.
> > > 	(vect_do_peeling): Use it.
> > >
> > > --- inline copy of patch ---
> > >
> > > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> > > index
> > >
> > d3fa8699271c4d7f404d648a38a95beabeabc99a..e1d210ab4617c894dab3
> > d2654cf1
> > > c842baac58f5 100644
> > > --- a/gcc/tree-vect-loop-manip.cc
> > > +++ b/gcc/tree-vect-loop-manip.cc
> > > @@ -1200,7 +1200,7 @@ vect_set_loop_condition_partial_vectors_avx512
> > (class loop *loop,
> > >     loop handles exactly VF scalars per iteration.  */
> > >
> > >  static gcond *
> > > -vect_set_loop_condition_normal (loop_vec_info loop_vinfo, edge
> > > exit_edge,
> > > +vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge
> > > +exit_edge,
> > >  				class loop *loop, tree niters, tree step,
> > >  				tree final_iv, bool niters_maybe_zero,
> > >  				gimple_stmt_iterator loop_cond_gsi) @@ -
> > 1412,7 +1412,7 @@
> > > vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info
> > loop_vinfo
> > >     When this happens we need to flip the understanding of main and other
> > >     exits by peeling and IV updates.  */
> > >
> > > -bool inline
> > > +bool
> > >  vect_is_loop_exit_latch_pred (edge loop_exit, class loop *loop)  {
> > >    return single_pred (loop->latch) == loop_exit->src; @@ -2142,6
> > > +2142,7 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
> > >       Input:
> > >       - LOOP - a loop that is going to be vectorized. The last few iterations
> > >                of LOOP were peeled.
> > > +     - VF   - The chosen vectorization factor for LOOP.
> > >       - NITERS - the number of iterations that LOOP executes (before it is
> > >                  vectorized). i.e, the number of times the ivs should be bumped.
> > >       - UPDATE_E - a successor edge of LOOP->exit that is on the
> > > (only) path
> > 
> > the comment on this is now a bit misleading, can you try to update it and/or
> > move the comment bits to the docs on EARLY_EXIT?
> > 
> > > @@ -2152,6 +2153,9 @@ vect_can_advance_ivs_p (loop_vec_info
> > loop_vinfo)
> > >                    The phi args associated with the edge UPDATE_E in the bb
> > >                    UPDATE_E->dest are updated accordingly.
> > >
> > > +     - restart_loop - Indicates whether the scalar loop needs to
> > > + restart the
> > 
> > params are ALL_CAPS
> > 
> > > +		      iteration count where the vector loop began.
> > > +
> > >       Assumption 1: Like the rest of the vectorizer, this function assumes
> > >       a single loop exit that has a single predecessor.
> > >
> > > @@ -2169,18 +2173,22 @@ vect_can_advance_ivs_p (loop_vec_info
> > loop_vinfo)
> > >   */
> > >
> > >  static void
> > > -vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
> > > -				  tree niters, edge update_e)
> > > +vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
> > > +poly_uint64 vf,
> > 
> > LOOP_VINFO_VECT_FACTOR?
> > 
> > > +				  tree niters, edge update_e, bool
> > restart_loop)
> > 
> > I think 'bool early_exit' is better here?  I wonder if we have an "early"
> > exit after the main exit we are probably sure there are no side-effects to re-
> > execute and could avoid this restarting?
> 
> Side effects yes, but the actual check may not have been performed yet.
> If you remember https://gist.github.com/Mistuke/66f14fe5c1be32b91ce149bd9b8bb35f
> There in the clz loop through the "main" exit you still have to see if that iteration
> did not contain the entry.  This is because the loop counter is incremented
> before you iterate.
> 
> > 
> > >  {
> > >    gphi_iterator gsi, gsi1;
> > >    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> > >    basic_block update_bb = update_e->dest;
> > > -
> > > -  basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> > > -
> > > -  /* Make sure there exists a single-predecessor exit bb:  */
> > > -  gcc_assert (single_pred_p (exit_bb));
> > > -  gcc_assert (single_succ_edge (exit_bb) == update_e);
> > > +  bool inversed_iv
> > > +	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT (loop_vinfo),
> > > +					 LOOP_VINFO_LOOP (loop_vinfo));
> > > +  bool needs_interm_block = LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> > > +			    && flow_bb_inside_loop_p (loop, update_e->src);
> > > +  edge loop_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
> > > +  gcond *cond = get_loop_exit_condition (loop_e);
> > > +  basic_block exit_bb = loop_e->dest;
> > > +  basic_block iv_block = NULL;
> > > +  gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
> > >
> > >    for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
> > >         !gsi_end_p (gsi) && !gsi_end_p (gsi1); @@ -2190,7 +2198,6 @@
> > > vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
> > >        tree step_expr, off;
> > >        tree type;
> > >        tree var, ni, ni_name;
> > > -      gimple_stmt_iterator last_gsi;
> > >
> > >        gphi *phi = gsi.phi ();
> > >        gphi *phi1 = gsi1.phi ();
> > > @@ -2222,11 +2229,52 @@ vect_update_ivs_after_vectorizer
> > (loop_vec_info loop_vinfo,
> > >        enum vect_induction_op_type induction_type
> > >  	= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
> > >
> > > -      if (induction_type == vect_step_op_add)
> > > +      tree iv_var = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> > > +      /* create_iv always places it on the LHS.  Alternatively we can set a
> > > +	 property during create_iv to identify it.  */
> > > +      bool ivtemp = gimple_cond_lhs (cond) == iv_var;
> > > +      if (restart_loop && ivtemp)
> > >  	{
> > > +	  type = TREE_TYPE (gimple_phi_result (phi));
> > > +	  ni = build_int_cst (type, vf);
> > > +	  if (inversed_iv)
> > > +	    ni = fold_build2 (MINUS_EXPR, type, ni,
> > > +			      fold_convert (type, step_expr));
> > > +	}
> > > +      else if (induction_type == vect_step_op_add)
> > > +	{
> > > +
> > >  	  tree stype = TREE_TYPE (step_expr);
> > > -	  off = fold_build2 (MULT_EXPR, stype,
> > > -			     fold_convert (stype, niters), step_expr);
> > > +
> > > +	  /* Early exits always use last iter value not niters. */
> > > +	  if (restart_loop)
> > > +	    {
> > > +	      /* Live statements in the non-main exit shouldn't be adjusted.  We
> > > +		 normally didn't have this problem with a single exit as live
> > > +		 values would be in the exit block.  However when dealing with
> > > +		 multiple exits all exits are redirected to the merge block
> > > +		 and we restart the iteration.  */
> > 
> > Hmm, I fail to see how this works - we're either using the value to continue the
> > induction or not, independent of STMT_VINFO_LIVE_P.
> 
> That becomes clear in the patch to update live reductions.  Essentially any live
> Reductions inside an alternative exit will reduce to the first element
> rather than the last and use that as the seed for the scalar loop.

Hum.  Reductions are vectorized as N separate reductions.  I don't think
you can simply change the reduction between the lanes to "skip"
part of the vector iteration.  But you can use the value of the vector
from before the vector iteration - the loop header PHI result - and
fully reduce that to get the proper value.

> It has to do this since you have to perform the side effects for the non-matching
> elements still.
> 
> Regards,
> Tamar
> 
> > 
> > > +	      if (STMT_VINFO_LIVE_P (phi_info))
> > > +		continue;
> > > +
> > > +	      /* For early break the final loop IV is:
> > > +		 init + (final - init) * vf which takes into account peeling
> > > +		 values and non-single steps.  The main exit can use niters
> > > +		 since if you exit from the main exit you've done all vector
> > > +		 iterations.  For an early exit we don't know when we exit so
> > we
> > > +		 must re-calculate this on the exit.  */
> > > +	      tree start_expr = gimple_phi_result (phi);
> > > +	      off = fold_build2 (MINUS_EXPR, stype,
> > > +				 fold_convert (stype, start_expr),
> > > +				 fold_convert (stype, init_expr));
> > > +	      /* Now adjust for VF to get the final iteration value.  */
> > > +	      off = fold_build2 (MULT_EXPR, stype, off,
> > > +				 build_int_cst (stype, vf));
> > > +	    }
> > > +	  else
> > > +	    off = fold_build2 (MULT_EXPR, stype,
> > > +			       fold_convert (stype, niters), step_expr);
> > > +
> > >  	  if (POINTER_TYPE_P (type))
> > >  	    ni = fold_build_pointer_plus (init_expr, off);
> > >  	  else
> > > @@ -2238,6 +2286,8 @@ vect_update_ivs_after_vectorizer (loop_vec_info
> > loop_vinfo,
> > >        /* Don't bother call vect_peel_nonlinear_iv_init.  */
> > >        else if (induction_type == vect_step_op_neg)
> > >  	ni = init_expr;
> > > +      else if (restart_loop)
> > > +	continue;
> > 
> > This looks all a bit complicated - why wouldn't we simply always use the PHI
> > result when 'restart_loop'?  Isn't that the correct old start value in all cases?
> > 
> > >        else
> > >  	ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
> > >  					  niters, step_expr,
> > > @@ -2245,9 +2295,20 @@ vect_update_ivs_after_vectorizer
> > (loop_vec_info
> > > loop_vinfo,
> > >
> > >        var = create_tmp_var (type, "tmp");
> > >
> > > -      last_gsi = gsi_last_bb (exit_bb);
> > >        gimple_seq new_stmts = NULL;
> > >        ni_name = force_gimple_operand (ni, &new_stmts, false, var);
> > > +
> > > +      /* For non-main exit create an intermediat edge to get any updated iv
> > > +	 calculations.  */
> > > +      if (needs_interm_block
> > > +	  && !iv_block
> > > +	  && (!gimple_seq_empty_p (stmts) || !gimple_seq_empty_p
> > (new_stmts)))
> > > +	{
> > > +	  iv_block = split_edge (update_e);
> > > +	  update_e = single_succ_edge (update_e->dest);
> > > +	  last_gsi = gsi_last_bb (iv_block);
> > > +	}
> > > +
> > >        /* Exit_bb shouldn't be empty.  */
> > >        if (!gsi_end_p (last_gsi))
> > >  	{
> > > @@ -3342,8 +3403,26 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree
> > niters, tree nitersm1,
> > >  	 niters_vector_mult_vf steps.  */
> > >        gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
> > >        update_e = skip_vector ? e : loop_preheader_edge (epilog);
> > > -      vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
> > > -					update_e);
> > > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > > +	update_e = single_succ_edge (e->dest);
> > > +      bool inversed_iv
> > > +	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT (loop_vinfo),
> > > +					 LOOP_VINFO_LOOP (loop_vinfo));
> > 
> > You are computing this here and in vect_update_ivs_after_vectorizer?
> > 
> > > +
> > > +      /* Update the main exit first.  */
> > > +      vect_update_ivs_after_vectorizer (loop_vinfo, vf, niters_vector_mult_vf,
> > > +					update_e, inversed_iv);
> > > +
> > > +      /* And then update the early exits.  */
> > > +      for (auto exit : get_loop_exit_edges (loop))
> > > +	{
> > > +	  if (exit == LOOP_VINFO_IV_EXIT (loop_vinfo))
> > > +	    continue;
> > > +
> > > +	  vect_update_ivs_after_vectorizer (loop_vinfo, vf,
> > > +					    niters_vector_mult_vf,
> > > +					    exit, true);
> > 
> > ... why does the same not work here?  Wouldn't the proper condition be
> > !dominated_by_p (CDI_DOMINATORS, exit->src, LOOP_VINFO_IV_EXIT
> > (loop_vinfo)->src) or similar?  That is, whether the exit is at or after the main IV
> > exit?  (consider having two)
> > 
> > > +	}
> > >
> > >        if (skip_epilog)
> > >  	{
> > >
>
Tamar Christina Nov. 15, 2023, 2:14 p.m. UTC | #5
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Wednesday, November 15, 2023 1:23 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> Subject: RE: [PATCH 7/21]middle-end: update IV update code to support early
> breaks and arbitrary exits
> 
> On Wed, 15 Nov 2023, Tamar Christina wrote:
> 
> > > -----Original Message-----
> > > From: Richard Biener <rguenther@suse.de>
> > > Sent: Wednesday, November 15, 2023 1:01 PM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>;
> jlaw@ventanamicro.com
> > > Subject: RE: [PATCH 7/21]middle-end: update IV update code to
> > > support early breaks and arbitrary exits
> > >
> > > On Wed, 15 Nov 2023, Tamar Christina wrote:
> > >
> > > > Patch updated to latest trunk:
> > > >
> > > > Hi All,
> > > >
> > > > This changes the PHI node updates to support early breaks.
> > > > It has to support both the case where the loop's exit matches the
> > > > normal loop exit and one where the early exit is "inverted", i.e.
> > > > it's an early
> > > exit edge.
> > > >
> > > > In the latter case we must always restart the loop for VF iterations.
> > > > For an early exit the reason is obvious, but there are cases where
> > > > the "normal" exit is located before the early one.  This exit then
> > > > does a check on ivtmp resulting in us leaving the loop since it thinks we're
> done.
> > > >
> > > > In these case we may still have side-effects to perform so we also
> > > > go to the scalar loop.
> > > >
> > > > For the "normal" exit niters has already been adjusted for
> > > > peeling, for the early exits we must find out how many iterations
> > > > we actually did.  So we have to recalculate the new position for each exit.
> > > >
> > > > Thanks,
> > > > Tamar
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > 	* tree-vect-loop-manip.cc (vect_set_loop_condition_normal): Hide
> > > unused.
> > > > 	(vect_update_ivs_after_vectorizer): Support early break.
> > > > 	(vect_do_peeling): Use it.
> > > >
> > > > --- inline copy of patch ---
> > > >
> > > > diff --git a/gcc/tree-vect-loop-manip.cc
> > > > b/gcc/tree-vect-loop-manip.cc index
> > > >
> > >
> d3fa8699271c4d7f404d648a38a95beabeabc99a..e1d210ab4617c894dab3
> > > d2654cf1
> > > > c842baac58f5 100644
> > > > --- a/gcc/tree-vect-loop-manip.cc
> > > > +++ b/gcc/tree-vect-loop-manip.cc
> > > > @@ -1200,7 +1200,7 @@
> > > > vect_set_loop_condition_partial_vectors_avx512
> > > (class loop *loop,
> > > >     loop handles exactly VF scalars per iteration.  */
> > > >
> > > >  static gcond *
> > > > -vect_set_loop_condition_normal (loop_vec_info loop_vinfo, edge
> > > > exit_edge,
> > > > +vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */,
> > > > +edge exit_edge,
> > > >  				class loop *loop, tree niters, tree step,
> > > >  				tree final_iv, bool niters_maybe_zero,
> > > >  				gimple_stmt_iterator loop_cond_gsi) @@ -
> > > 1412,7 +1412,7 @@
> > > > vect_set_loop_condition (class loop *loop, edge loop_e,
> > > > loop_vec_info
> > > loop_vinfo
> > > >     When this happens we need to flip the understanding of main and
> other
> > > >     exits by peeling and IV updates.  */
> > > >
> > > > -bool inline
> > > > +bool
> > > >  vect_is_loop_exit_latch_pred (edge loop_exit, class loop *loop)  {
> > > >    return single_pred (loop->latch) == loop_exit->src; @@ -2142,6
> > > > +2142,7 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
> > > >       Input:
> > > >       - LOOP - a loop that is going to be vectorized. The last few iterations
> > > >                of LOOP were peeled.
> > > > +     - VF   - The chosen vectorization factor for LOOP.
> > > >       - NITERS - the number of iterations that LOOP executes (before it is
> > > >                  vectorized). i.e, the number of times the ivs should be bumped.
> > > >       - UPDATE_E - a successor edge of LOOP->exit that is on the
> > > > (only) path
> > >
> > > the comment on this is now a bit misleading, can you try to update
> > > it and/or move the comment bits to the docs on EARLY_EXIT?
> > >
> > > > @@ -2152,6 +2153,9 @@ vect_can_advance_ivs_p (loop_vec_info
> > > loop_vinfo)
> > > >                    The phi args associated with the edge UPDATE_E in the bb
> > > >                    UPDATE_E->dest are updated accordingly.
> > > >
> > > > +     - restart_loop - Indicates whether the scalar loop needs to
> > > > + restart the
> > >
> > > params are ALL_CAPS
> > >
> > > > +		      iteration count where the vector loop began.
> > > > +
> > > >       Assumption 1: Like the rest of the vectorizer, this function assumes
> > > >       a single loop exit that has a single predecessor.
> > > >
> > > > @@ -2169,18 +2173,22 @@ vect_can_advance_ivs_p (loop_vec_info
> > > loop_vinfo)
> > > >   */
> > > >
> > > >  static void
> > > > -vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
> > > > -				  tree niters, edge update_e)
> > > > +vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
> > > > +poly_uint64 vf,
> > >
> > > LOOP_VINFO_VECT_FACTOR?
> > >
> > > > +				  tree niters, edge update_e, bool
> > > restart_loop)
> > >
> > > I think 'bool early_exit' is better here?  I wonder if we have an "early"
> > > exit after the main exit we are probably sure there are no
> > > side-effects to re- execute and could avoid this restarting?
> >
> > Side effects yes, but the actual check may not have been performed yet.
> > If you remember
> > https://gist.github.com/Mistuke/66f14fe5c1be32b91ce149bd9b8bb35f
> > There in the clz loop through the "main" exit you still have to see if
> > that iteration did not contain the entry.  This is because the loop
> > counter is incremented before you iterate.
> >
> > >
> > > >  {
> > > >    gphi_iterator gsi, gsi1;
> > > >    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> > > >    basic_block update_bb = update_e->dest;
> > > > -
> > > > -  basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> > > > -
> > > > -  /* Make sure there exists a single-predecessor exit bb:  */
> > > > -  gcc_assert (single_pred_p (exit_bb));
> > > > -  gcc_assert (single_succ_edge (exit_bb) == update_e);
> > > > +  bool inversed_iv
> > > > +	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT (loop_vinfo),
> > > > +					 LOOP_VINFO_LOOP (loop_vinfo));
> > > > +  bool needs_interm_block = LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> > > > +			    && flow_bb_inside_loop_p (loop, update_e->src);
> > > > +  edge loop_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
> > > > +  gcond *cond = get_loop_exit_condition (loop_e);
> > > > +  basic_block exit_bb = loop_e->dest;
> > > > +  basic_block iv_block = NULL;
> > > > +  gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
> > > >
> > > >    for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis
> (update_bb);
> > > >         !gsi_end_p (gsi) && !gsi_end_p (gsi1); @@ -2190,7 +2198,6
> > > > @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
> > > >        tree step_expr, off;
> > > >        tree type;
> > > >        tree var, ni, ni_name;
> > > > -      gimple_stmt_iterator last_gsi;
> > > >
> > > >        gphi *phi = gsi.phi ();
> > > >        gphi *phi1 = gsi1.phi ();
> > > > @@ -2222,11 +2229,52 @@ vect_update_ivs_after_vectorizer
> > > (loop_vec_info loop_vinfo,
> > > >        enum vect_induction_op_type induction_type
> > > >  	= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
> > > >
> > > > -      if (induction_type == vect_step_op_add)
> > > > +      tree iv_var = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge
> (loop));
> > > > +      /* create_iv always places it on the LHS.  Alternatively we can set a
> > > > +	 property during create_iv to identify it.  */
> > > > +      bool ivtemp = gimple_cond_lhs (cond) == iv_var;
> > > > +      if (restart_loop && ivtemp)
> > > >  	{
> > > > +	  type = TREE_TYPE (gimple_phi_result (phi));
> > > > +	  ni = build_int_cst (type, vf);
> > > > +	  if (inversed_iv)
> > > > +	    ni = fold_build2 (MINUS_EXPR, type, ni,
> > > > +			      fold_convert (type, step_expr));
> > > > +	}
> > > > +      else if (induction_type == vect_step_op_add)
> > > > +	{
> > > > +
> > > >  	  tree stype = TREE_TYPE (step_expr);
> > > > -	  off = fold_build2 (MULT_EXPR, stype,
> > > > -			     fold_convert (stype, niters), step_expr);
> > > > +
> > > > +	  /* Early exits always use last iter value not niters. */
> > > > +	  if (restart_loop)
> > > > +	    {
> > > > +	      /* Live statements in the non-main exit shouldn't be adjusted.  We
> > > > +		 normally didn't have this problem with a single exit as live
> > > > +		 values would be in the exit block.  However when dealing with
> > > > +		 multiple exits all exits are redirected to the merge block
> > > > +		 and we restart the iteration.  */
> > >
> > > Hmm, I fail to see how this works - we're either using the value to
> > > continue the induction or not, independent of STMT_VINFO_LIVE_P.
> >
> > That becomes clear in the patch to update live reductions.
> > Essentially any live Reductions inside an alternative exit will reduce
> > to the first element rather than the last and use that as the seed for the
> scalar loop.
> 
> Hum.  Reductions are vectorized as N separate reductions.  I don't think you
> can simply change the reduction between the lanes to "skip"
> part of the vector iteration.  But you can use the value of the vector from
> before the vector iteration - the loop header PHI result, and fully reduce that
> to get at the proper value.

That's what it's supposed to be doing though.  The reason live operations
are skipped here is that, if we don't skip them, we'll re-adjust the IV even
though the value will already be correct after vectorization.

Remember that this code only gets so far for IV PHI nodes.

The loop header PHI result itself can be live, e.g. see testcases
vect-early-break_70.c to vect-early-break_75.c

you have i_15 = PHI <i_14 (6), 1(2)>

we use i_15 in the early exit. This should not be adjusted because when it's
vectorized the value at 0[lane 0] is already correct.  This is why for any PHI
inside the early exits it uses the value 0[0] instead of N[lane_max].

Perhaps I'm missing something here?

Regards,
Tamar
> 
> > It has to do this since you have to perform the side effects for the
> > non-matching elements still.
> >
> > Regards,
> > Tamar
> >
> > >
> > > > +	      if (STMT_VINFO_LIVE_P (phi_info))
> > > > +		continue;
> > > > +
> > > > +	      /* For early break the final loop IV is:
> > > > +		 init + (final - init) * vf which takes into account peeling
> > > > +		 values and non-single steps.  The main exit can use niters
> > > > +		 since if you exit from the main exit you've done all vector
> > > > +		 iterations.  For an early exit we don't know when we exit so
> > > we
> > > > +		 must re-calculate this on the exit.  */
> > > > +	      tree start_expr = gimple_phi_result (phi);
> > > > +	      off = fold_build2 (MINUS_EXPR, stype,
> > > > +				 fold_convert (stype, start_expr),
> > > > +				 fold_convert (stype, init_expr));
> > > > +	      /* Now adjust for VF to get the final iteration value.  */
> > > > +	      off = fold_build2 (MULT_EXPR, stype, off,
> > > > +				 build_int_cst (stype, vf));
> > > > +	    }
> > > > +	  else
> > > > +	    off = fold_build2 (MULT_EXPR, stype,
> > > > +			       fold_convert (stype, niters), step_expr);
> > > > +
> > > >  	  if (POINTER_TYPE_P (type))
> > > >  	    ni = fold_build_pointer_plus (init_expr, off);
> > > >  	  else
> > > > @@ -2238,6 +2286,8 @@ vect_update_ivs_after_vectorizer
> > > > (loop_vec_info
> > > loop_vinfo,
> > > >        /* Don't bother call vect_peel_nonlinear_iv_init.  */
> > > >        else if (induction_type == vect_step_op_neg)
> > > >  	ni = init_expr;
> > > > +      else if (restart_loop)
> > > > +	continue;
> > >
> > > This looks all a bit complicated - why wouldn't we simply always use
> > > the PHI result when 'restart_loop'?  Isn't that the correct old start value in
> all cases?
> > >
> > > >        else
> > > >  	ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
> > > >  					  niters, step_expr,
> > > > @@ -2245,9 +2295,20 @@ vect_update_ivs_after_vectorizer
> > > (loop_vec_info
> > > > loop_vinfo,
> > > >
> > > >        var = create_tmp_var (type, "tmp");
> > > >
> > > > -      last_gsi = gsi_last_bb (exit_bb);
> > > >        gimple_seq new_stmts = NULL;
> > > >        ni_name = force_gimple_operand (ni, &new_stmts, false,
> > > > var);
> > > > +
> > > > +      /* For non-main exit create an intermediat edge to get any updated iv
> > > > +	 calculations.  */
> > > > +      if (needs_interm_block
> > > > +	  && !iv_block
> > > > +	  && (!gimple_seq_empty_p (stmts) || !gimple_seq_empty_p
> > > (new_stmts)))
> > > > +	{
> > > > +	  iv_block = split_edge (update_e);
> > > > +	  update_e = single_succ_edge (update_e->dest);
> > > > +	  last_gsi = gsi_last_bb (iv_block);
> > > > +	}
> > > > +
> > > >        /* Exit_bb shouldn't be empty.  */
> > > >        if (!gsi_end_p (last_gsi))
> > > >  	{
> > > > @@ -3342,8 +3403,26 @@ vect_do_peeling (loop_vec_info loop_vinfo,
> > > > tree
> > > niters, tree nitersm1,
> > > >  	 niters_vector_mult_vf steps.  */
> > > >        gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
> > > >        update_e = skip_vector ? e : loop_preheader_edge (epilog);
> > > > -      vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
> > > > -					update_e);
> > > > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > > > +	update_e = single_succ_edge (e->dest);
> > > > +      bool inversed_iv
> > > > +	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT (loop_vinfo),
> > > > +					 LOOP_VINFO_LOOP (loop_vinfo));
> > >
> > > You are computing this here and in vect_update_ivs_after_vectorizer?
> > >
> > > > +
> > > > +      /* Update the main exit first.  */
> > > > +      vect_update_ivs_after_vectorizer (loop_vinfo, vf,
> niters_vector_mult_vf,
> > > > +					update_e, inversed_iv);
> > > > +
> > > > +      /* And then update the early exits.  */
> > > > +      for (auto exit : get_loop_exit_edges (loop))
> > > > +	{
> > > > +	  if (exit == LOOP_VINFO_IV_EXIT (loop_vinfo))
> > > > +	    continue;
> > > > +
> > > > +	  vect_update_ivs_after_vectorizer (loop_vinfo, vf,
> > > > +					    niters_vector_mult_vf,
> > > > +					    exit, true);
> > >
> > > ... why does the same not work here?  Wouldn't the proper condition
> > > be !dominated_by_p (CDI_DOMINATORS, exit->src, LOOP_VINFO_IV_EXIT
> > > (loop_vinfo)->src) or similar?  That is, whether the exit is at or
> > > after the main IV exit?  (consider having two)
> > >
> > > > +	}
> > > >
> > > >        if (skip_epilog)
> > > >  	{
> > > >
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> Nuernberg)
Richard Biener Nov. 16, 2023, 10:40 a.m. UTC | #6
On Wed, 15 Nov 2023, Tamar Christina wrote:

> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Wednesday, November 15, 2023 1:23 PM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> > Subject: RE: [PATCH 7/21]middle-end: update IV update code to support early
> > breaks and arbitrary exits
> > 
> > On Wed, 15 Nov 2023, Tamar Christina wrote:
> > 
> > > > -----Original Message-----
> > > > From: Richard Biener <rguenther@suse.de>
> > > > Sent: Wednesday, November 15, 2023 1:01 PM
> > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>;
> > jlaw@ventanamicro.com
> > > > Subject: RE: [PATCH 7/21]middle-end: update IV update code to
> > > > support early breaks and arbitrary exits
> > > >
> > > > On Wed, 15 Nov 2023, Tamar Christina wrote:
> > > >
> > > > > Patch updated to latest trunk:
> > > > >
> > > > > Hi All,
> > > > >
> > > > > This changes the PHI node updates to support early breaks.
> > > > > It has to support both the case where the loop's exit matches the
> > > > > normal loop exit and one where the early exit is "inverted", i.e.
> > > > > it's an early
> > > > exit edge.
> > > > >
> > > > > In the latter case we must always restart the loop for VF iterations.
> > > > > For an early exit the reason is obvious, but there are cases where
> > > > > the "normal" exit is located before the early one.  This exit then
> > > > > does a check on ivtmp resulting in us leaving the loop since it thinks we're
> > done.
> > > > >
> > > > > In these case we may still have side-effects to perform so we also
> > > > > go to the scalar loop.
> > > > >
> > > > > For the "normal" exit niters has already been adjusted for
> > > > > peeling, for the early exits we must find out how many iterations
> > > > > we actually did.  So we have to recalculate the new position for each exit.
> > > > >
> > > > > Thanks,
> > > > > Tamar
> > > > >
> > > > > gcc/ChangeLog:
> > > > >
> > > > > 	* tree-vect-loop-manip.cc (vect_set_loop_condition_normal): Hide
> > > > unused.
> > > > > 	(vect_update_ivs_after_vectorizer): Support early break.
> > > > > 	(vect_do_peeling): Use it.
> > > > >
> > > > > --- inline copy of patch ---
> > > > >
> > > > > diff --git a/gcc/tree-vect-loop-manip.cc
> > > > > b/gcc/tree-vect-loop-manip.cc index
> > > > >
> > > >
> > d3fa8699271c4d7f404d648a38a95beabeabc99a..e1d210ab4617c894dab3
> > > > d2654cf1
> > > > > c842baac58f5 100644
> > > > > --- a/gcc/tree-vect-loop-manip.cc
> > > > > +++ b/gcc/tree-vect-loop-manip.cc
> > > > > @@ -1200,7 +1200,7 @@
> > > > > vect_set_loop_condition_partial_vectors_avx512
> > > > (class loop *loop,
> > > > >     loop handles exactly VF scalars per iteration.  */
> > > > >
> > > > >  static gcond *
> > > > > -vect_set_loop_condition_normal (loop_vec_info loop_vinfo, edge
> > > > > exit_edge,
> > > > > +vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */,
> > > > > +edge exit_edge,
> > > > >  				class loop *loop, tree niters, tree step,
> > > > >  				tree final_iv, bool niters_maybe_zero,
> > > > >  				gimple_stmt_iterator loop_cond_gsi) @@ -
> > > > 1412,7 +1412,7 @@
> > > > > vect_set_loop_condition (class loop *loop, edge loop_e,
> > > > > loop_vec_info
> > > > loop_vinfo
> > > > >     When this happens we need to flip the understanding of main and
> > other
> > > > >     exits by peeling and IV updates.  */
> > > > >
> > > > > -bool inline
> > > > > +bool
> > > > >  vect_is_loop_exit_latch_pred (edge loop_exit, class loop *loop)  {
> > > > >    return single_pred (loop->latch) == loop_exit->src; @@ -2142,6
> > > > > +2142,7 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
> > > > >       Input:
> > > > >       - LOOP - a loop that is going to be vectorized. The last few iterations
> > > > >                of LOOP were peeled.
> > > > > +     - VF   - The chosen vectorization factor for LOOP.
> > > > >       - NITERS - the number of iterations that LOOP executes (before it is
> > > > >                  vectorized). i.e, the number of times the ivs should be bumped.
> > > > >       - UPDATE_E - a successor edge of LOOP->exit that is on the
> > > > > (only) path
> > > >
> > > > the comment on this is now a bit misleading, can you try to update
> > > > it and/or move the comment bits to the docs on EARLY_EXIT?
> > > >
> > > > > @@ -2152,6 +2153,9 @@ vect_can_advance_ivs_p (loop_vec_info
> > > > loop_vinfo)
> > > > >                    The phi args associated with the edge UPDATE_E in the bb
> > > > >                    UPDATE_E->dest are updated accordingly.
> > > > >
> > > > > +     - restart_loop - Indicates whether the scalar loop needs to
> > > > > + restart the
> > > >
> > > > params are ALL_CAPS
> > > >
> > > > > +		      iteration count where the vector loop began.
> > > > > +
> > > > >       Assumption 1: Like the rest of the vectorizer, this function assumes
> > > > >       a single loop exit that has a single predecessor.
> > > > >
> > > > > @@ -2169,18 +2173,22 @@ vect_can_advance_ivs_p (loop_vec_info
> > > > loop_vinfo)
> > > > >   */
> > > > >
> > > > >  static void
> > > > > -vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
> > > > > -				  tree niters, edge update_e)
> > > > > +vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
> > > > > +poly_uint64 vf,
> > > >
> > > > LOOP_VINFO_VECT_FACTOR?
> > > >
> > > > > +				  tree niters, edge update_e, bool
> > > > restart_loop)
> > > >
> > > > I think 'bool early_exit' is better here?  I wonder if we have an "early"
> > > > exit after the main exit we are probably sure there are no
> > > > side-effects to re- execute and could avoid this restarting?
> > >
> > > Side effects yes, but the actual check may not have been performed yet.
> > > If you remember
> > > https://gist.github.com/Mistuke/66f14fe5c1be32b91ce149bd9b8bb35f
> > > There in the clz loop through the "main" exit you still have to see if
> > > that iteration did not contain the entry.  This is because the loop
> > > counter is incremented before you iterate.
> > >
> > > >
> > > > >  {
> > > > >    gphi_iterator gsi, gsi1;
> > > > >    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> > > > >    basic_block update_bb = update_e->dest;
> > > > > -
> > > > > -  basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> > > > > -
> > > > > -  /* Make sure there exists a single-predecessor exit bb:  */
> > > > > -  gcc_assert (single_pred_p (exit_bb));
> > > > > -  gcc_assert (single_succ_edge (exit_bb) == update_e);
> > > > > +  bool inversed_iv
> > > > > +	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT (loop_vinfo),
> > > > > +					 LOOP_VINFO_LOOP (loop_vinfo));
> > > > > +  bool needs_interm_block = LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> > > > > +			    && flow_bb_inside_loop_p (loop, update_e->src);
> > > > > +  edge loop_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
> > > > > +  gcond *cond = get_loop_exit_condition (loop_e);
> > > > > +  basic_block exit_bb = loop_e->dest;
> > > > > +  basic_block iv_block = NULL;
> > > > > +  gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
> > > > >
> > > > >    for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis
> > (update_bb);
> > > > >         !gsi_end_p (gsi) && !gsi_end_p (gsi1); @@ -2190,7 +2198,6
> > > > > @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
> > > > >        tree step_expr, off;
> > > > >        tree type;
> > > > >        tree var, ni, ni_name;
> > > > > -      gimple_stmt_iterator last_gsi;
> > > > >
> > > > >        gphi *phi = gsi.phi ();
> > > > >        gphi *phi1 = gsi1.phi ();
> > > > > @@ -2222,11 +2229,52 @@ vect_update_ivs_after_vectorizer
> > > > (loop_vec_info loop_vinfo,
> > > > >        enum vect_induction_op_type induction_type
> > > > >  	= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
> > > > >
> > > > > -      if (induction_type == vect_step_op_add)
> > > > > +      tree iv_var = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge
> > (loop));
> > > > > +      /* create_iv always places it on the LHS.  Alternatively we can set a
> > > > > +	 property during create_iv to identify it.  */
> > > > > +      bool ivtemp = gimple_cond_lhs (cond) == iv_var;
> > > > > +      if (restart_loop && ivtemp)
> > > > >  	{
> > > > > +	  type = TREE_TYPE (gimple_phi_result (phi));
> > > > > +	  ni = build_int_cst (type, vf);
> > > > > +	  if (inversed_iv)
> > > > > +	    ni = fold_build2 (MINUS_EXPR, type, ni,
> > > > > +			      fold_convert (type, step_expr));
> > > > > +	}
> > > > > +      else if (induction_type == vect_step_op_add)
> > > > > +	{
> > > > > +
> > > > >  	  tree stype = TREE_TYPE (step_expr);
> > > > > -	  off = fold_build2 (MULT_EXPR, stype,
> > > > > -			     fold_convert (stype, niters), step_expr);
> > > > > +
> > > > > +	  /* Early exits always use last iter value not niters. */
> > > > > +	  if (restart_loop)
> > > > > +	    {
> > > > > +	      /* Live statements in the non-main exit shouldn't be adjusted.  We
> > > > > +		 normally didn't have this problem with a single exit as live
> > > > > +		 values would be in the exit block.  However when dealing with
> > > > > +		 multiple exits all exits are redirected to the merge block
> > > > > +		 and we restart the iteration.  */
> > > >
> > > > Hmm, I fail to see how this works - we're either using the value to
> > > > continue the induction or not, independent of STMT_VINFO_LIVE_P.
> > >
> > > That becomes clear in the patch to update live reductions.
> > > Essentially any live Reductions inside an alternative exit will reduce
> > > to the first element rather than the last and use that as the seed for the
> > scalar loop.
> > 
> > Hum.  Reductions are vectorized as N separate reductions.  I don't think you
> > can simply change the reduction between the lanes to "skip"
> > part of the vector iteration.  But you can use the value of the vector from
> > before the vector iteration - the loop header PHI result, and fully reduce that
> > to get at the proper value.
> 
> That's what It's supposed to be doing though.  The reason live operations
> are skipped here is that if we don't we'll re-adjust the IV even though the value
> will already be correct after vectorization.
> 
> Remember that this code only gets so far for IV PHI nodes.
> 
> The loop phi header result itself can be live, i.e. see testcases
> vect-early-break_70.c to vect-early-break_75.c
> 
> you have i_15 = PHI <i_14 (6), 1(2)>
> 
> we use i_15 in the early exit. This should not be adjusted because when it's
> vectorized the value at 0[lane 0] is already correct.  This is why for any PHI
> inside the early exits it uses the value 0[0] instead of N[lane_max].
> 
> Perhaps I'm missing something here?

OK, so I refreshed my memory of what vect_update_ivs_after_vectorizer does.

I still do not understand the (complexity of the) patch.  Basically
the function computes the new value of the IV "from scratch" based
on the number of scalar iterations of the vector loop, the 'niter'
argument.  I would have expected that for the early exits we either
pass in a different 'niter' or alternatively a 'niter_adjustment'.

It seems your change handles different kinds of inductions differently.
Specifically

      bool ivtemp = gimple_cond_lhs (cond) == iv_var;
      if (restart_loop && ivtemp)
        {
          type = TREE_TYPE (gimple_phi_result (phi));
          ni = build_int_cst (type, vf);
          if (inversed_iv)
            ni = fold_build2 (MINUS_EXPR, type, ni,
                              fold_convert (type, step_expr));
        }

it looks like for the exit test IV we use either 'VF' or 'VF - step'
as the new value.  That seems to be very odd special casing for
unknown reasons.  And while you adjust vect_step_op_add, you don't
adjust vect_peel_nonlinear_iv_init (maybe not supported - better
assert here).

Also the vect_step_op_add case will keep the original scalar IV
live even when it is a vectorized induction.  The code
recomputing the value from scratch avoids this.

      /* For non-main exit create an intermediat edge to get any updated iv
         calculations.  */
      if (needs_interm_block
          && !iv_block
          && (!gimple_seq_empty_p (stmts) || !gimple_seq_empty_p (new_stmts)))
        {
          iv_block = split_edge (update_e);
          update_e = single_succ_edge (update_e->dest);
          last_gsi = gsi_last_bb (iv_block);
        }

this is also odd, can we adjust the API instead?  I suppose this
is because your computation uses the original loop IV, if you
based the computation off the initial value only this might not
be necessary?

That said, I wonder why we cannot simply pass in an adjusted niter
which would be niters_vector_mult_vf - vf and be done with that?

Thanks,
Richard.


> Regards,
> Tamar
> > 
> > > It has to do this since you have to perform the side effects for the
> > > non-matching elements still.
> > >
> > > Regards,
> > > Tamar
> > >
> > > >
> > > > > +	      if (STMT_VINFO_LIVE_P (phi_info))
> > > > > +		continue;
> > > > > +
> > > > > +	      /* For early break the final loop IV is:
> > > > > +		 init + (final - init) * vf which takes into account peeling
> > > > > +		 values and non-single steps.  The main exit can use niters
> > > > > +		 since if you exit from the main exit you've done all vector
> > > > > +		 iterations.  For an early exit we don't know when we exit so
> > > > we
> > > > > +		 must re-calculate this on the exit.  */
> > > > > +	      tree start_expr = gimple_phi_result (phi);
> > > > > +	      off = fold_build2 (MINUS_EXPR, stype,
> > > > > +				 fold_convert (stype, start_expr),
> > > > > +				 fold_convert (stype, init_expr));
> > > > > +	      /* Now adjust for VF to get the final iteration value.  */
> > > > > +	      off = fold_build2 (MULT_EXPR, stype, off,
> > > > > +				 build_int_cst (stype, vf));
> > > > > +	    }
> > > > > +	  else
> > > > > +	    off = fold_build2 (MULT_EXPR, stype,
> > > > > +			       fold_convert (stype, niters), step_expr);
> > > > > +
> > > > >  	  if (POINTER_TYPE_P (type))
> > > > >  	    ni = fold_build_pointer_plus (init_expr, off);
> > > > >  	  else
> > > > > @@ -2238,6 +2286,8 @@ vect_update_ivs_after_vectorizer
> > > > > (loop_vec_info
> > > > loop_vinfo,
> > > > >        /* Don't bother call vect_peel_nonlinear_iv_init.  */
> > > > >        else if (induction_type == vect_step_op_neg)
> > > > >  	ni = init_expr;
> > > > > +      else if (restart_loop)
> > > > > +	continue;
> > > >
> > > > This looks all a bit complicated - why wouldn't we simply always use
> > > > the PHI result when 'restart_loop'?  Isn't that the correct old start value in
> > all cases?
> > > >
> > > > >        else
> > > > >  	ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
> > > > >  					  niters, step_expr,
> > > > > @@ -2245,9 +2295,20 @@ vect_update_ivs_after_vectorizer
> > > > (loop_vec_info
> > > > > loop_vinfo,
> > > > >
> > > > >        var = create_tmp_var (type, "tmp");
> > > > >
> > > > > -      last_gsi = gsi_last_bb (exit_bb);
> > > > >        gimple_seq new_stmts = NULL;
> > > > >        ni_name = force_gimple_operand (ni, &new_stmts, false,
> > > > > var);
> > > > > +
> > > > > +      /* For non-main exit create an intermediat edge to get any updated iv
> > > > > +	 calculations.  */
> > > > > +      if (needs_interm_block
> > > > > +	  && !iv_block
> > > > > +	  && (!gimple_seq_empty_p (stmts) || !gimple_seq_empty_p
> > > > (new_stmts)))
> > > > > +	{
> > > > > +	  iv_block = split_edge (update_e);
> > > > > +	  update_e = single_succ_edge (update_e->dest);
> > > > > +	  last_gsi = gsi_last_bb (iv_block);
> > > > > +	}
> > > > > +
> > > > >        /* Exit_bb shouldn't be empty.  */
> > > > >        if (!gsi_end_p (last_gsi))
> > > > >  	{
> > > > > @@ -3342,8 +3403,26 @@ vect_do_peeling (loop_vec_info loop_vinfo,
> > > > > tree
> > > > niters, tree nitersm1,
> > > > >  	 niters_vector_mult_vf steps.  */
> > > > >        gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
> > > > >        update_e = skip_vector ? e : loop_preheader_edge (epilog);
> > > > > -      vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
> > > > > -					update_e);
> > > > > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > > > > +	update_e = single_succ_edge (e->dest);
> > > > > +      bool inversed_iv
> > > > > +	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT (loop_vinfo),
> > > > > +					 LOOP_VINFO_LOOP (loop_vinfo));
> > > >
> > > > You are computing this here and in vect_update_ivs_after_vectorizer?
> > > >
> > > > > +
> > > > > +      /* Update the main exit first.  */
> > > > > +      vect_update_ivs_after_vectorizer (loop_vinfo, vf,
> > niters_vector_mult_vf,
> > > > > +					update_e, inversed_iv);
> > > > > +
> > > > > +      /* And then update the early exits.  */
> > > > > +      for (auto exit : get_loop_exit_edges (loop))
> > > > > +	{
> > > > > +	  if (exit == LOOP_VINFO_IV_EXIT (loop_vinfo))
> > > > > +	    continue;
> > > > > +
> > > > > +	  vect_update_ivs_after_vectorizer (loop_vinfo, vf,
> > > > > +					    niters_vector_mult_vf,
> > > > > +					    exit, true);
> > > >
> > > > ... why does the same not work here?  Wouldn't the proper condition
> > > > be !dominated_by_p (CDI_DOMINATORS, exit->src, LOOP_VINFO_IV_EXIT
> > > > (loop_vinfo)->src) or similar?  That is, whether the exit is at or
> > > > after the main IV exit?  (consider having two)
> > > >
> > > > > +	}
> > > > >
> > > > >        if (skip_epilog)
> > > > >  	{
> > > > >
> > >
> > 
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH,
> > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > Nuernberg)
>
Tamar Christina Nov. 16, 2023, 11:08 a.m. UTC | #7
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Thursday, November 16, 2023 10:40 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> Subject: RE: [PATCH 7/21]middle-end: update IV update code to support early
> breaks and arbitrary exits
> 
> On Wed, 15 Nov 2023, Tamar Christina wrote:
> 
> > > -----Original Message-----
> > > From: Richard Biener <rguenther@suse.de>
> > > Sent: Wednesday, November 15, 2023 1:23 PM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>;
> jlaw@ventanamicro.com
> > > Subject: RE: [PATCH 7/21]middle-end: update IV update code to
> > > support early breaks and arbitrary exits
> > >
> > > On Wed, 15 Nov 2023, Tamar Christina wrote:
> > >
> > > > > -----Original Message-----
> > > > > From: Richard Biener <rguenther@suse.de>
> > > > > Sent: Wednesday, November 15, 2023 1:01 PM
> > > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>;
> > > jlaw@ventanamicro.com
> > > > > Subject: RE: [PATCH 7/21]middle-end: update IV update code to
> > > > > support early breaks and arbitrary exits
> > > > >
> > > > > On Wed, 15 Nov 2023, Tamar Christina wrote:
> > > > >
> > > > > > Patch updated to latest trunk:
> > > > > >
> > > > > > Hi All,
> > > > > >
> > > > > > This changes the PHI node updates to support early breaks.
> > > > > > It has to support both the case where the loop's exit matches
> > > > > > the normal loop exit and one where the early exit is "inverted", i.e.
> > > > > > it's an early
> > > > > exit edge.
> > > > > >
> > > > > > In the latter case we must always restart the loop for VF iterations.
> > > > > > For an early exit the reason is obvious, but there are cases
> > > > > > where the "normal" exit is located before the early one.  This
> > > > > > exit then does a check on ivtmp resulting in us leaving the
> > > > > > loop since it thinks we're
> > > done.
> > > > > >
> > > > > > In these case we may still have side-effects to perform so we
> > > > > > also go to the scalar loop.
> > > > > >
> > > > > > For the "normal" exit niters has already been adjusted for
> > > > > > peeling, for the early exits we must find out how many
> > > > > > iterations we actually did.  So we have to recalculate the new position
> for each exit.
> > > > > >
> > > > > > Thanks,
> > > > > > Tamar
> > > > > >
> > > > > > gcc/ChangeLog:
> > > > > >
> > > > > > 	* tree-vect-loop-manip.cc (vect_set_loop_condition_normal):
> > > > > > Hide
> > > > > unused.
> > > > > > 	(vect_update_ivs_after_vectorizer): Support early break.
> > > > > > 	(vect_do_peeling): Use it.
> > > > > >
> > > > > > --- inline copy of patch ---
> > > > > >
> > > > > > diff --git a/gcc/tree-vect-loop-manip.cc
> > > > > > b/gcc/tree-vect-loop-manip.cc index
> > > > > >
> > > > >
> > >
> d3fa8699271c4d7f404d648a38a95beabeabc99a..e1d210ab4617c894dab3
> > > > > d2654cf1
> > > > > > c842baac58f5 100644
> > > > > > --- a/gcc/tree-vect-loop-manip.cc
> > > > > > +++ b/gcc/tree-vect-loop-manip.cc
> > > > > > @@ -1200,7 +1200,7 @@
> > > > > > vect_set_loop_condition_partial_vectors_avx512
> > > > > (class loop *loop,
> > > > > >     loop handles exactly VF scalars per iteration.  */
> > > > > >
> > > > > >  static gcond *
> > > > > > -vect_set_loop_condition_normal (loop_vec_info loop_vinfo,
> > > > > > edge exit_edge,
> > > > > > +vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo
> > > > > > +*/, edge exit_edge,
> > > > > >  				class loop *loop, tree niters, tree step,
> > > > > >  				tree final_iv, bool niters_maybe_zero,
> > > > > >  				gimple_stmt_iterator loop_cond_gsi)
> @@ -
> > > > > 1412,7 +1412,7 @@
> > > > > > vect_set_loop_condition (class loop *loop, edge loop_e,
> > > > > > loop_vec_info
> > > > > loop_vinfo
> > > > > >     When this happens we need to flip the understanding of
> > > > > > main and
> > > other
> > > > > >     exits by peeling and IV updates.  */
> > > > > >
> > > > > > -bool inline
> > > > > > +bool
> > > > > >  vect_is_loop_exit_latch_pred (edge loop_exit, class loop *loop)  {
> > > > > >    return single_pred (loop->latch) == loop_exit->src; @@
> > > > > > -2142,6
> > > > > > +2142,7 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
> > > > > >       Input:
> > > > > >       - LOOP - a loop that is going to be vectorized. The last few
> iterations
> > > > > >                of LOOP were peeled.
> > > > > > +     - VF   - The chosen vectorization factor for LOOP.
> > > > > >       - NITERS - the number of iterations that LOOP executes (before it is
> > > > > >                  vectorized). i.e, the number of times the ivs should be
> bumped.
> > > > > >       - UPDATE_E - a successor edge of LOOP->exit that is on
> > > > > > the
> > > > > > (only) path
> > > > >
> > > > > the comment on this is now a bit misleading, can you try to
> > > > > update it and/or move the comment bits to the docs on EARLY_EXIT?
> > > > >
> > > > > > @@ -2152,6 +2153,9 @@ vect_can_advance_ivs_p (loop_vec_info
> > > > > loop_vinfo)
> > > > > >                    The phi args associated with the edge UPDATE_E in the bb
> > > > > >                    UPDATE_E->dest are updated accordingly.
> > > > > >
> > > > > > +     - restart_loop - Indicates whether the scalar loop needs
> > > > > > + to restart the
> > > > >
> > > > > params are ALL_CAPS
> > > > >
> > > > > > +		      iteration count where the vector loop began.
> > > > > > +
> > > > > >       Assumption 1: Like the rest of the vectorizer, this function assumes
> > > > > >       a single loop exit that has a single predecessor.
> > > > > >
> > > > > > @@ -2169,18 +2173,22 @@ vect_can_advance_ivs_p (loop_vec_info
> > > > > loop_vinfo)
> > > > > >   */
> > > > > >
> > > > > >  static void
> > > > > > -vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
> > > > > > -				  tree niters, edge update_e)
> > > > > > +vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
> > > > > > +poly_uint64 vf,
> > > > >
> > > > > LOOP_VINFO_VECT_FACTOR?
> > > > >
> > > > > > +				  tree niters, edge update_e, bool
> > > > > restart_loop)
> > > > >
> > > > > I think 'bool early_exit' is better here?  I wonder if we have an "early"
> > > > > exit after the main exit we are probably sure there are no
> > > > > side-effects to re- execute and could avoid this restarting?
> > > >
> > > > Side effects yes, but the actual check may not have been performed yet.
> > > > If you remember
> > > >
> https://gist.github.com/Mistuke/66f14fe5c1be32b91ce149bd9b8bb35f
> > > > There in the clz loop through the "main" exit you still have to
> > > > see if that iteration did not contain the entry.  This is because
> > > > the loop counter is incremented before you iterate.
> > > >
> > > > >
> > > > > >  {
> > > > > >    gphi_iterator gsi, gsi1;
> > > > > >    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> > > > > >    basic_block update_bb = update_e->dest;
> > > > > > -
> > > > > > -  basic_block exit_bb = LOOP_VINFO_IV_EXIT
> > > > > > (loop_vinfo)->dest;
> > > > > > -
> > > > > > -  /* Make sure there exists a single-predecessor exit bb:  */
> > > > > > -  gcc_assert (single_pred_p (exit_bb));
> > > > > > -  gcc_assert (single_succ_edge (exit_bb) == update_e);
> > > > > > +  bool inversed_iv
> > > > > > +	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT
> (loop_vinfo),
> > > > > > +					 LOOP_VINFO_LOOP
> (loop_vinfo));
> > > > > > +  bool needs_interm_block = LOOP_VINFO_EARLY_BREAKS
> (loop_vinfo)
> > > > > > +			    && flow_bb_inside_loop_p (loop,
> update_e->src);
> > > > > > +  edge loop_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
> > > > > > +  gcond *cond = get_loop_exit_condition (loop_e);
> > > > > > +  basic_block exit_bb = loop_e->dest;
> > > > > > +  basic_block iv_block = NULL;
> > > > > > +  gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
> > > > > >
> > > > > >    for (gsi = gsi_start_phis (loop->header), gsi1 =
> > > > > > gsi_start_phis
> > > (update_bb);
> > > > > >         !gsi_end_p (gsi) && !gsi_end_p (gsi1); @@ -2190,7
> > > > > > +2198,6 @@ vect_update_ivs_after_vectorizer (loop_vec_info
> loop_vinfo,
> > > > > >        tree step_expr, off;
> > > > > >        tree type;
> > > > > >        tree var, ni, ni_name;
> > > > > > -      gimple_stmt_iterator last_gsi;
> > > > > >
> > > > > >        gphi *phi = gsi.phi ();
> > > > > >        gphi *phi1 = gsi1.phi (); @@ -2222,11 +2229,52 @@
> > > > > > vect_update_ivs_after_vectorizer
> > > > > (loop_vec_info loop_vinfo,
> > > > > >        enum vect_induction_op_type induction_type
> > > > > >  	= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
> > > > > >
> > > > > > -      if (induction_type == vect_step_op_add)
> > > > > > +      tree iv_var = PHI_ARG_DEF_FROM_EDGE (phi,
> > > > > > + loop_latch_edge
> > > (loop));
> > > > > > +      /* create_iv always places it on the LHS.  Alternatively we can set a
> > > > > > +	 property during create_iv to identify it.  */
> > > > > > +      bool ivtemp = gimple_cond_lhs (cond) == iv_var;
> > > > > > +      if (restart_loop && ivtemp)
> > > > > >  	{
> > > > > > +	  type = TREE_TYPE (gimple_phi_result (phi));
> > > > > > +	  ni = build_int_cst (type, vf);
> > > > > > +	  if (inversed_iv)
> > > > > > +	    ni = fold_build2 (MINUS_EXPR, type, ni,
> > > > > > +			      fold_convert (type, step_expr));
> > > > > > +	}
> > > > > > +      else if (induction_type == vect_step_op_add)
> > > > > > +	{
> > > > > > +
> > > > > >  	  tree stype = TREE_TYPE (step_expr);
> > > > > > -	  off = fold_build2 (MULT_EXPR, stype,
> > > > > > -			     fold_convert (stype, niters), step_expr);
> > > > > > +
> > > > > > +	  /* Early exits always use last iter value not niters. */
> > > > > > +	  if (restart_loop)
> > > > > > +	    {
> > > > > > +	      /* Live statements in the non-main exit shouldn't be
> adjusted.  We
> > > > > > +		 normally didn't have this problem with a single exit as
> live
> > > > > > +		 values would be in the exit block.  However when
> dealing with
> > > > > > +		 multiple exits all exits are redirected to the merge
> block
> > > > > > +		 and we restart the iteration.  */
> > > > >
> > > > > Hmm, I fail to see how this works - we're either using the value
> > > > > to continue the induction or not, independent of STMT_VINFO_LIVE_P.
> > > >
> > > > That becomes clear in the patch to update live reductions.
> > > > Essentially any live Reductions inside an alternative exit will
> > > > reduce to the first element rather than the last and use that as
> > > > the seed for the
> > > scalar loop.
> > >
> > > Hum.  Reductions are vectorized as N separate reductions.  I don't
> > > think you can simply change the reduction between the lanes to "skip"
> > > part of the vector iteration.  But you can use the value of the
> > > vector from before the vector iteration - the loop header PHI
> > > result, and fully reduce that to get at the proper value.
> >
> > That's what It's supposed to be doing though.  The reason live
> > operations are skipped here is that if we don't we'll re-adjust the IV
> > even though the value will already be correct after vectorization.
> >
> > Remember that this code only gets so far for IV PHI nodes.
> >
> > The loop phi header result itself can be live, i.e. see testcases
> > vect-early-break_70.c to vect-early-break_75.c
> >
> > you have i_15 = PHI <i_14 (6), 1(2)>
> >
> > we use i_15 in the early exit. This should not be adjusted because
> > when it's vectorized the value at 0[lane 0] is already correct.  This
> > is why for any PHI inside the early exits it uses the value 0[0] instead of
> N[lane_max].
> >
> > Perhaps I'm missing something here?
> 
> OK, so I refreshed my mind of what vect_update_ivs_after_vectorizer does.
> 
> I still do not understand the (complexity of the) patch.  Basically the function
> computes the new value of the IV "from scratch" based on the number of
> scalar iterations of the vector loop, the 'niter'
> argument.  I would have expected that for the early exits we either pass in a
> different 'niter' or alternatively a 'niter_adjustment'.

But for an early exit there's no static value for adjusted niter, since you don't know
which iteration you exited from.  This is unlike the normal exit, where you know that
if you get there you've done all possible iterations.

So you must compute the scalar iteration count on the exit itself.

> 
> It seems your change handles different kinds of inductions differently.
> Specifically
> 
>       bool ivtemp = gimple_cond_lhs (cond) == iv_var;
>       if (restart_loop && ivtemp)
>         {
>           type = TREE_TYPE (gimple_phi_result (phi));
>           ni = build_int_cst (type, vf);
>           if (inversed_iv)
>             ni = fold_build2 (MINUS_EXPR, type, ni,
>                               fold_convert (type, step_expr));
>         }
> 
> it looks like for the exit test IV we use either 'VF' or 'VF - step'
> as the new value.  That seems to be very odd special casing for unknown
> reasons.  And while you adjust vec_step_op_add, you don't adjust
> vect_peel_nonlinear_iv_init (maybe not supported - better assert here).

The VF case is for a normal "non-inverted" loop, where if you take an early exit
you know that you have to do at most VF iterations.  The VF - step is to account
for the inverted loop control flow where you exit after adjusting the IV already
by + step.

Peeling doesn't matter here, since you know you were able to do a vector iteration
so it's safe to do VF iterations.  So having peeled doesn't affect the remaining
iters count.

> 
> Also the vec_step_op_add case will keep the original scalar IV live even when it
> is a vectorized induction.  The code recomputing the value from scratch avoids
> this.
> 
>       /* For non-main exit create an intermediat edge to get any updated iv
>          calculations.  */
>       if (needs_interm_block
>           && !iv_block
>           && (!gimple_seq_empty_p (stmts) || !gimple_seq_empty_p
> (new_stmts)))
>         {
>           iv_block = split_edge (update_e);
>           update_e = single_succ_edge (update_e->dest);
>           last_gsi = gsi_last_bb (iv_block);
>         }
> 
> this is also odd, can we adjust the API instead?  I suppose this is because your
> computation uses the original loop IV, if you based the computation off the
> initial value only this might not be necessary?

No, on the main exit the code updates the value in the loop header and puts the
calculation in the merge block.  This works because it only needs to consume PHI
nodes in the merge block and things like niters are adjusted in the guard block.

For an early exit, we don't have a guard block, only the merge block.  We have to
update the PHI nodes in that block, but can't do so since you can't produce a value
and consume it in a PHI node in the same BB.  So, because there's no "guard" block
for early exits, we need to create a new block to hold the values for use in the
merge block.

The API can be adjusted by always creating the empty block during peeling.
That would prevent us from having to do anything special here.  Would that work
better?  Or I can do it in the loop that iterates over the exits, just before the call
to vect_update_ivs_after_vectorizer, which I think might be more consistent.

> 
> That said, I wonder why we cannot simply pass in an adjusted niter which
> would be niters_vector_mult_vf - vf and be done with that?
> 

We can of course not have this and recompute it from niters itself, however this does
affect the epilog code layout.  In particular, knowing the static number of iterations left
usually causes it to unroll the loop and share some of the computations, i.e. the scalar
code is often more efficient.

The computation would be niters_vector_mult_vf - iters_done * vf, since the value put
here is the remaining iteration count.  It's static for early exits.

But can do whatever you prefer here.  Let me know what you prefer for the above.

Thanks,
Tamar

> Thanks,
> Richard.
> 
> 
> > Regards,
> > Tamar
> > >
> > > > It has to do this since you have to perform the side effects for
> > > > the non-matching elements still.
> > > >
> > > > Regards,
> > > > Tamar
> > > >
> > > > >
> > > > > > +	      if (STMT_VINFO_LIVE_P (phi_info))
> > > > > > +		continue;
> > > > > > +
> > > > > > +	      /* For early break the final loop IV is:
> > > > > > +		 init + (final - init) * vf which takes into account peeling
> > > > > > +		 values and non-single steps.  The main exit can use
> niters
> > > > > > +		 since if you exit from the main exit you've done all
> vector
> > > > > > +		 iterations.  For an early exit we don't know when we
> exit
> > > > > > +so
> > > > > we
> > > > > > +		 must re-calculate this on the exit.  */
> > > > > > +	      tree start_expr = gimple_phi_result (phi);
> > > > > > +	      off = fold_build2 (MINUS_EXPR, stype,
> > > > > > +				 fold_convert (stype, start_expr),
> > > > > > +				 fold_convert (stype, init_expr));
> > > > > > +	      /* Now adjust for VF to get the final iteration value.  */
> > > > > > +	      off = fold_build2 (MULT_EXPR, stype, off,
> > > > > > +				 build_int_cst (stype, vf));
> > > > > > +	    }
> > > > > > +	  else
> > > > > > +	    off = fold_build2 (MULT_EXPR, stype,
> > > > > > +			       fold_convert (stype, niters), step_expr);
> > > > > > +
> > > > > >  	  if (POINTER_TYPE_P (type))
> > > > > >  	    ni = fold_build_pointer_plus (init_expr, off);
> > > > > >  	  else
> > > > > > @@ -2238,6 +2286,8 @@ vect_update_ivs_after_vectorizer
> > > > > > (loop_vec_info
> > > > > loop_vinfo,
> > > > > >        /* Don't bother call vect_peel_nonlinear_iv_init.  */
> > > > > >        else if (induction_type == vect_step_op_neg)
> > > > > >  	ni = init_expr;
> > > > > > +      else if (restart_loop)
> > > > > > +	continue;
> > > > >
> > > > > This looks all a bit complicated - why wouldn't we simply always
> > > > > use the PHI result when 'restart_loop'?  Isn't that the correct
> > > > > old start value in
> > > all cases?
> > > > >
> > > > > >        else
> > > > > >  	ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
> > > > > >  					  niters, step_expr,
> > > > > > @@ -2245,9 +2295,20 @@ vect_update_ivs_after_vectorizer
> > > > > (loop_vec_info
> > > > > > loop_vinfo,
> > > > > >
> > > > > >        var = create_tmp_var (type, "tmp");
> > > > > >
> > > > > > -      last_gsi = gsi_last_bb (exit_bb);
> > > > > >        gimple_seq new_stmts = NULL;
> > > > > >        ni_name = force_gimple_operand (ni, &new_stmts, false,
> > > > > > var);
> > > > > > +
> > > > > > +      /* For non-main exit create an intermediat edge to get any
> updated iv
> > > > > > +	 calculations.  */
> > > > > > +      if (needs_interm_block
> > > > > > +	  && !iv_block
> > > > > > +	  && (!gimple_seq_empty_p (stmts) || !gimple_seq_empty_p
> > > > > (new_stmts)))
> > > > > > +	{
> > > > > > +	  iv_block = split_edge (update_e);
> > > > > > +	  update_e = single_succ_edge (update_e->dest);
> > > > > > +	  last_gsi = gsi_last_bb (iv_block);
> > > > > > +	}
> > > > > > +
> > > > > >        /* Exit_bb shouldn't be empty.  */
> > > > > >        if (!gsi_end_p (last_gsi))
> > > > > >  	{
> > > > > > @@ -3342,8 +3403,26 @@ vect_do_peeling (loop_vec_info
> > > > > > loop_vinfo, tree
> > > > > niters, tree nitersm1,
> > > > > >  	 niters_vector_mult_vf steps.  */
> > > > > >        gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
> > > > > >        update_e = skip_vector ? e : loop_preheader_edge (epilog);
> > > > > > -      vect_update_ivs_after_vectorizer (loop_vinfo,
> niters_vector_mult_vf,
> > > > > > -					update_e);
> > > > > > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > > > > > +	update_e = single_succ_edge (e->dest);
> > > > > > +      bool inversed_iv
> > > > > > +	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT
> (loop_vinfo),
> > > > > > +					 LOOP_VINFO_LOOP
> (loop_vinfo));
> > > > >
> > > > > You are computing this here and in vect_update_ivs_after_vectorizer?
> > > > >
> > > > > > +
> > > > > > +      /* Update the main exit first.  */
> > > > > > +      vect_update_ivs_after_vectorizer (loop_vinfo, vf,
> > > niters_vector_mult_vf,
> > > > > > +					update_e, inversed_iv);
> > > > > > +
> > > > > > +      /* And then update the early exits.  */
> > > > > > +      for (auto exit : get_loop_exit_edges (loop))
> > > > > > +	{
> > > > > > +	  if (exit == LOOP_VINFO_IV_EXIT (loop_vinfo))
> > > > > > +	    continue;
> > > > > > +
> > > > > > +	  vect_update_ivs_after_vectorizer (loop_vinfo, vf,
> > > > > > +					    niters_vector_mult_vf,
> > > > > > +					    exit, true);
> > > > >
> > > > > ... why does the same not work here?  Wouldn't the proper
> > > > > condition be !dominated_by_p (CDI_DOMINATORS, exit->src,
> > > > > LOOP_VINFO_IV_EXIT
> > > > > (loop_vinfo)->src) or similar?  That is, whether the exit is at
> > > > > or after the main IV exit?  (consider having two)
> > > > >
> > > > > > +	}
> > > > > >
> > > > > >        if (skip_epilog)
> > > > > >  	{
> > > > > >
> > > >
> > >
> > > --
> > > Richard Biener <rguenther@suse.de>
> > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > Nuernberg, Germany;
> > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > Nuernberg)
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> Nuernberg)
Richard Biener Nov. 16, 2023, 11:27 a.m. UTC | #8
On Thu, 16 Nov 2023, Tamar Christina wrote:

> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Thursday, November 16, 2023 10:40 AM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> > Subject: RE: [PATCH 7/21]middle-end: update IV update code to support early
> > breaks and arbitrary exits
> > 
> > On Wed, 15 Nov 2023, Tamar Christina wrote:
> > 
> > > > -----Original Message-----
> > > > From: Richard Biener <rguenther@suse.de>
> > > > Sent: Wednesday, November 15, 2023 1:23 PM
> > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>;
> > jlaw@ventanamicro.com
> > > > Subject: RE: [PATCH 7/21]middle-end: update IV update code to
> > > > support early breaks and arbitrary exits
> > > >
> > > > On Wed, 15 Nov 2023, Tamar Christina wrote:
> > > >
> > > > > > -----Original Message-----
> > > > > > From: Richard Biener <rguenther@suse.de>
> > > > > > Sent: Wednesday, November 15, 2023 1:01 PM
> > > > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>;
> > > > jlaw@ventanamicro.com
> > > > > > Subject: RE: [PATCH 7/21]middle-end: update IV update code to
> > > > > > support early breaks and arbitrary exits
> > > > > >
> > > > > > On Wed, 15 Nov 2023, Tamar Christina wrote:
> > > > > >
> > > > > > > Patch updated to latest trunk:
> > > > > > >
> > > > > > > Hi All,
> > > > > > >
> > > > > > > This changes the PHI node updates to support early breaks.
> > > > > > > It has to support both the case where the loop's exit matches
> > > > > > > the normal loop exit and one where the early exit is "inverted", i.e.
> > > > > > > it's an early
> > > > > > exit edge.
> > > > > > >
> > > > > > > In the latter case we must always restart the loop for VF iterations.
> > > > > > > For an early exit the reason is obvious, but there are cases
> > > > > > > where the "normal" exit is located before the early one.  This
> > > > > > > exit then does a check on ivtmp resulting in us leaving the
> > > > > > > loop since it thinks we're
> > > > done.
> > > > > > >
> > > > > > > In these case we may still have side-effects to perform so we
> > > > > > > also go to the scalar loop.
> > > > > > >
> > > > > > > For the "normal" exit niters has already been adjusted for
> > > > > > > peeling, for the early exits we must find out how many
> > > > > > > iterations we actually did.  So we have to recalculate the new position
> > for each exit.
> > > > > > >
> > > > > > > Thanks,
> > > > > > > Tamar
> > > > > > >
> > > > > > > gcc/ChangeLog:
> > > > > > >
> > > > > > > 	* tree-vect-loop-manip.cc (vect_set_loop_condition_normal):
> > > > > > > Hide
> > > > > > unused.
> > > > > > > 	(vect_update_ivs_after_vectorizer): Support early break.
> > > > > > > 	(vect_do_peeling): Use it.
> > > > > > >
> > > > > > > --- inline copy of patch ---
> > > > > > >
> > > > > > > diff --git a/gcc/tree-vect-loop-manip.cc
> > > > > > > b/gcc/tree-vect-loop-manip.cc index
> > > > > > >
> > > > > >
> > > >
> > d3fa8699271c4d7f404d648a38a95beabeabc99a..e1d210ab4617c894dab3
> > > > > > d2654cf1
> > > > > > > c842baac58f5 100644
> > > > > > > --- a/gcc/tree-vect-loop-manip.cc
> > > > > > > +++ b/gcc/tree-vect-loop-manip.cc
> > > > > > > @@ -1200,7 +1200,7 @@
> > > > > > > vect_set_loop_condition_partial_vectors_avx512
> > > > > > (class loop *loop,
> > > > > > >     loop handles exactly VF scalars per iteration.  */
> > > > > > >
> > > > > > >  static gcond *
> > > > > > > -vect_set_loop_condition_normal (loop_vec_info loop_vinfo,
> > > > > > > edge exit_edge,
> > > > > > > +vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo
> > > > > > > +*/, edge exit_edge,
> > > > > > >  				class loop *loop, tree niters, tree step,
> > > > > > >  				tree final_iv, bool niters_maybe_zero,
> > > > > > >  				gimple_stmt_iterator loop_cond_gsi)
> > @@ -
> > > > > > 1412,7 +1412,7 @@
> > > > > > > vect_set_loop_condition (class loop *loop, edge loop_e,
> > > > > > > loop_vec_info
> > > > > > loop_vinfo
> > > > > > >     When this happens we need to flip the understanding of
> > > > > > > main and
> > > > other
> > > > > > >     exits by peeling and IV updates.  */
> > > > > > >
> > > > > > > -bool inline
> > > > > > > +bool
> > > > > > >  vect_is_loop_exit_latch_pred (edge loop_exit, class loop *loop)  {
> > > > > > >    return single_pred (loop->latch) == loop_exit->src; @@
> > > > > > > -2142,6
> > > > > > > +2142,7 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
> > > > > > >       Input:
> > > > > > >       - LOOP - a loop that is going to be vectorized. The last few
> > iterations
> > > > > > >                of LOOP were peeled.
> > > > > > > +     - VF   - The chosen vectorization factor for LOOP.
> > > > > > >       - NITERS - the number of iterations that LOOP executes (before it is
> > > > > > >                  vectorized). i.e, the number of times the ivs should be
> > bumped.
> > > > > > >       - UPDATE_E - a successor edge of LOOP->exit that is on
> > > > > > > the
> > > > > > > (only) path
> > > > > >
> > > > > > the comment on this is now a bit misleading, can you try to
> > > > > > update it and/or move the comment bits to the docs on EARLY_EXIT?
> > > > > >
> > > > > > > @@ -2152,6 +2153,9 @@ vect_can_advance_ivs_p (loop_vec_info
> > > > > > loop_vinfo)
> > > > > > >                    The phi args associated with the edge UPDATE_E in the bb
> > > > > > >                    UPDATE_E->dest are updated accordingly.
> > > > > > >
> > > > > > > +     - restart_loop - Indicates whether the scalar loop needs
> > > > > > > + to restart the
> > > > > >
> > > > > > params are ALL_CAPS
> > > > > >
> > > > > > > +		      iteration count where the vector loop began.
> > > > > > > +
> > > > > > >       Assumption 1: Like the rest of the vectorizer, this function assumes
> > > > > > >       a single loop exit that has a single predecessor.
> > > > > > >
> > > > > > > @@ -2169,18 +2173,22 @@ vect_can_advance_ivs_p (loop_vec_info
> > > > > > loop_vinfo)
> > > > > > >   */
> > > > > > >
> > > > > > >  static void
> > > > > > > -vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
> > > > > > > -				  tree niters, edge update_e)
> > > > > > > +vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
> > > > > > > +poly_uint64 vf,
> > > > > >
> > > > > > LOOP_VINFO_VECT_FACTOR?
> > > > > >
> > > > > > > +				  tree niters, edge update_e, bool
> > > > > > restart_loop)
> > > > > >
> > > > > > I think 'bool early_exit' is better here?  I wonder if we have an "early"
> > > > > > exit after the main exit we are probably sure there are no
> > > > > > side-effects to re- execute and could avoid this restarting?
> > > > >
> > > > > Side effects yes, but the actual check may not have been performed yet.
> > > > > If you remember
> > > > >
> > https://gist.github.com/Mistuke/66f14fe5c1be32b91ce149bd9b8bb35f
> > > > > There in the clz loop through the "main" exit you still have to
> > > > > see if that iteration did not contain the entry.  This is because
> > > > > the loop counter is incremented before you iterate.
> > > > >
> > > > > >
> > > > > > >  {
> > > > > > >    gphi_iterator gsi, gsi1;
> > > > > > >    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> > > > > > >    basic_block update_bb = update_e->dest;
> > > > > > > -
> > > > > > > -  basic_block exit_bb = LOOP_VINFO_IV_EXIT
> > > > > > > (loop_vinfo)->dest;
> > > > > > > -
> > > > > > > -  /* Make sure there exists a single-predecessor exit bb:  */
> > > > > > > -  gcc_assert (single_pred_p (exit_bb));
> > > > > > > -  gcc_assert (single_succ_edge (exit_bb) == update_e);
> > > > > > > +  bool inversed_iv
> > > > > > > +	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT
> > (loop_vinfo),
> > > > > > > +					 LOOP_VINFO_LOOP
> > (loop_vinfo));
> > > > > > > +  bool needs_interm_block = LOOP_VINFO_EARLY_BREAKS
> > (loop_vinfo)
> > > > > > > +			    && flow_bb_inside_loop_p (loop,
> > update_e->src);
> > > > > > > +  edge loop_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
> > > > > > > +  gcond *cond = get_loop_exit_condition (loop_e);
> > > > > > > +  basic_block exit_bb = loop_e->dest;
> > > > > > > +  basic_block iv_block = NULL;
> > > > > > > +  gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
> > > > > > >
> > > > > > >    for (gsi = gsi_start_phis (loop->header), gsi1 =
> > > > > > > gsi_start_phis
> > > > (update_bb);
> > > > > > >         !gsi_end_p (gsi) && !gsi_end_p (gsi1); @@ -2190,7
> > > > > > > +2198,6 @@ vect_update_ivs_after_vectorizer (loop_vec_info
> > loop_vinfo,
> > > > > > >        tree step_expr, off;
> > > > > > >        tree type;
> > > > > > >        tree var, ni, ni_name;
> > > > > > > -      gimple_stmt_iterator last_gsi;
> > > > > > >
> > > > > > >        gphi *phi = gsi.phi ();
> > > > > > >        gphi *phi1 = gsi1.phi (); @@ -2222,11 +2229,52 @@
> > > > > > > vect_update_ivs_after_vectorizer
> > > > > > (loop_vec_info loop_vinfo,
> > > > > > >        enum vect_induction_op_type induction_type
> > > > > > >  	= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
> > > > > > >
> > > > > > > -      if (induction_type == vect_step_op_add)
> > > > > > > +      tree iv_var = PHI_ARG_DEF_FROM_EDGE (phi,
> > > > > > > + loop_latch_edge
> > > > (loop));
> > > > > > > +      /* create_iv always places it on the LHS.  Alternatively we can set a
> > > > > > > +	 property during create_iv to identify it.  */
> > > > > > > +      bool ivtemp = gimple_cond_lhs (cond) == iv_var;
> > > > > > > +      if (restart_loop && ivtemp)
> > > > > > >  	{
> > > > > > > +	  type = TREE_TYPE (gimple_phi_result (phi));
> > > > > > > +	  ni = build_int_cst (type, vf);
> > > > > > > +	  if (inversed_iv)
> > > > > > > +	    ni = fold_build2 (MINUS_EXPR, type, ni,
> > > > > > > +			      fold_convert (type, step_expr));
> > > > > > > +	}
> > > > > > > +      else if (induction_type == vect_step_op_add)
> > > > > > > +	{
> > > > > > > +
> > > > > > >  	  tree stype = TREE_TYPE (step_expr);
> > > > > > > -	  off = fold_build2 (MULT_EXPR, stype,
> > > > > > > -			     fold_convert (stype, niters), step_expr);
> > > > > > > +
> > > > > > > +	  /* Early exits always use last iter value not niters. */
> > > > > > > +	  if (restart_loop)
> > > > > > > +	    {
> > > > > > > +	      /* Live statements in the non-main exit shouldn't be
> > adjusted.  We
> > > > > > > +		 normally didn't have this problem with a single exit as
> > live
> > > > > > > +		 values would be in the exit block.  However when
> > dealing with
> > > > > > > +		 multiple exits all exits are redirected to the merge
> > block
> > > > > > > +		 and we restart the iteration.  */
> > > > > >
> > > > > > Hmm, I fail to see how this works - we're either using the value
> > > > > > to continue the induction or not, independent of STMT_VINFO_LIVE_P.
> > > > >
> > > > > That becomes clear in the patch to update live reductions.
> > > > > Essentially any live Reductions inside an alternative exit will
> > > > > reduce to the first element rather than the last and use that as
> > > > > the seed for the
> > > > scalar loop.
> > > >
> > > > Hum.  Reductions are vectorized as N separate reductions.  I don't
> > > > think you can simply change the reduction between the lanes to "skip"
> > > > part of the vector iteration.  But you can use the value of the
> > > > vector from before the vector iteration - the loop header PHI
> > > > result, and fully reduce that to get at the proper value.
> > >
> > > That's what It's supposed to be doing though.  The reason live
> > > operations are skipped here is that if we don't we'll re-adjust the IV
> > > even though the value will already be correct after vectorization.
> > >
> > > Remember that this code only gets so far for IV PHI nodes.
> > >
> > > The loop phi header result itself can be live, i.e. see testcases
> > > vect-early-break_70.c to vect-early-break_75.c
> > >
> > > you have i_15 = PHI <i_14 (6), 1(2)>
> > >
> > > we use i_15 in the early exit. This should not be adjusted because
> > > when it's vectorized the value at 0[lane 0] is already correct.  This
> > > is why for any PHI inside the early exits it uses the value 0[0] instead of
> > N[lane_max].
> > >
> > > Perhaps I'm missing something here?
> > 
> > OK, so I refreshed my mind of what vect_update_ivs_after_vectorizer does.
> > 
> > I still do not understand the (complexity of the) patch.  Basically the function
> > computes the new value of the IV "from scratch" based on the number of
> > scalar iterations of the vector loop, the 'niter'
> > argument.  I would have expected that for the early exits we either pass in a
> > different 'niter' or alternatively a 'niter_adjustment'.
> 
> But for an early exit there's no static value for adjusted niter, since you don't know
> which iteration you exited from. Unlike the normal exit when you know if you get
> there you've done all possible iterations.
> 
> So you must compute the scalar iteration count on the exit itself.

?  You do not need the actual scalar iteration you exited (you don't
compute that either), you need the scalar iteration the vector iteration
started with when it exited prematurely and that's readily available?

> > 
> > It seems your change handles different kinds of inductions differently.
> > Specifically
> > 
> >       bool ivtemp = gimple_cond_lhs (cond) == iv_var;
> >       if (restart_loop && ivtemp)
> >         {
> >           type = TREE_TYPE (gimple_phi_result (phi));
> >           ni = build_int_cst (type, vf);
> >           if (inversed_iv)
> >             ni = fold_build2 (MINUS_EXPR, type, ni,
> >                               fold_convert (type, step_expr));
> >         }
> > 
> > it looks like for the exit test IV we use either 'VF' or 'VF - step'
> > as the new value.  That seems to be very odd special casing for unknown
> > reasons.  And while you adjust vec_step_op_add, you don't adjust
> > vect_peel_nonlinear_iv_init (maybe not supported - better assert here).
> 
> The VF case is for a normal "non-inverted" loop, where if you take an early exit
> you know that you have to do at most VF iterations.  The VF - step is to account
> for the inverted loop control flow where you exit after adjusting the IV already
> by + step.

But doesn't that assume the IV counts from niter to zero?  I don't
see this special case is actually necessary, no?

> 
> Peeling doesn't matter here, since you know you were able to do a vector iteration
> so it's safe to do VF iterations.  So having peeled doesn't affect the remaining
> iters count.
> 
> > 
> > Also the vec_step_op_add case will keep the original scalar IV live even when it
> > is a vectorized induction.  The code recomputing the value from scratch avoids
> > this.
> > 
> >       /* For non-main exit create an intermediat edge to get any updated iv
> >          calculations.  */
> >       if (needs_interm_block
> >           && !iv_block
> >           && (!gimple_seq_empty_p (stmts) || !gimple_seq_empty_p
> > (new_stmts)))
> >         {
> >           iv_block = split_edge (update_e);
> >           update_e = single_succ_edge (update_e->dest);
> >           last_gsi = gsi_last_bb (iv_block);
> >         }
> > 
> > this is also odd, can we adjust the API instead?  I suppose this is because your
> > computation uses the original loop IV, if you based the computation off the
> > initial value only this might not be necessary?
> 
> No, on the main exit the code updates the value in the loop header and puts the
> Calculation in the merge block.  This works because it only needs to consume PHI
> nodes in the merge block and things like niters are adjusted in the guard block.
> 
> For an early exit, we don't have a guard block, only the merge block. We have to
> update the PHI nodes in that block,  but can't do so since you can't produce a value
> and consume it in a PHI node in the same BB.  So we need to create the block to put
> the values in for use in the merge block.  Because there's no "guard" block for early
> exits.

?  then compute niters in that block as well.

> The API can be adjusted by always creating the empty block either during peeling.
> That would prevent us from having to do anything special here.  Would that work
> better?  Or I can do it in the loop that iterates over the exits to before the call
> to vect_update_ivs_after_vectorizer, which I think might be more consistent.
> 
> > 
> > That said, I wonder why we cannot simply pass in an adjusted niter which
> > would be niters_vector_mult_vf - vf and be done with that?
> > 
> 
> We can ofcourse not have this and recompute it from niters itself, however this does
> affect the epilog code layout. Particularly knowing the static number if iterations left
> causes it to usually unroll the loop and share some of the computations.  i.e. the scalar
> code is often more efficient.
> 
> The computation would be niters_vector_mult_vf - iters_done * vf, since the value put
> Here is the remaining iteration count.  It's static for early exits.

Well, it might be "static" in that it doesn't really matter what you
use for the epilog main IV initial value as long as you are sure
you're not going to take that exit as you are sure we're going to
take one of the early exits.  So yeah, the special code is probably
OK, but it needs a better comment and as said the structure of
vect_update_ivs_after_vectorizer is a bit hard to follow now.

As said an important part for optimization is to not keep the scalar
IVs live in the vector loop.

> But can do whatever you prefer here.  Let me know what you prefer for the above.
> 
> Thanks,
> Tamar
> 
> > Thanks,
> > Richard.
> > 
> > 
> > > Regards,
> > > Tamar
> > > >
> > > > > It has to do this since you have to perform the side effects for
> > > > > the non-matching elements still.
> > > > >
> > > > > Regards,
> > > > > Tamar
> > > > >
> > > > > >
> > > > > > > +	      if (STMT_VINFO_LIVE_P (phi_info))
> > > > > > > +		continue;
> > > > > > > +
> > > > > > > +	      /* For early break the final loop IV is:
> > > > > > > +		 init + (final - init) * vf which takes into account peeling
> > > > > > > +		 values and non-single steps.  The main exit can use
> > niters
> > > > > > > +		 since if you exit from the main exit you've done all
> > vector
> > > > > > > +		 iterations.  For an early exit we don't know when we
> > exit
> > > > > > > +so
> > > > > > we
> > > > > > > +		 must re-calculate this on the exit.  */
> > > > > > > +	      tree start_expr = gimple_phi_result (phi);
> > > > > > > +	      off = fold_build2 (MINUS_EXPR, stype,
> > > > > > > +				 fold_convert (stype, start_expr),
> > > > > > > +				 fold_convert (stype, init_expr));
> > > > > > > +	      /* Now adjust for VF to get the final iteration value.  */
> > > > > > > +	      off = fold_build2 (MULT_EXPR, stype, off,
> > > > > > > +				 build_int_cst (stype, vf));
> > > > > > > +	    }
> > > > > > > +	  else
> > > > > > > +	    off = fold_build2 (MULT_EXPR, stype,
> > > > > > > +			       fold_convert (stype, niters), step_expr);
> > > > > > > +
> > > > > > >  	  if (POINTER_TYPE_P (type))
> > > > > > >  	    ni = fold_build_pointer_plus (init_expr, off);
> > > > > > >  	  else
> > > > > > > @@ -2238,6 +2286,8 @@ vect_update_ivs_after_vectorizer
> > > > > > > (loop_vec_info
> > > > > > loop_vinfo,
> > > > > > >        /* Don't bother call vect_peel_nonlinear_iv_init.  */
> > > > > > >        else if (induction_type == vect_step_op_neg)
> > > > > > >  	ni = init_expr;
> > > > > > > +      else if (restart_loop)
> > > > > > > +	continue;
> > > > > >
> > > > > > This looks all a bit complicated - why wouldn't we simply always
> > > > > > use the PHI result when 'restart_loop'?  Isn't that the correct
> > > > > > old start value in
> > > > all cases?
> > > > > >
> > > > > > >        else
> > > > > > >  	ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
> > > > > > >  					  niters, step_expr,
> > > > > > > @@ -2245,9 +2295,20 @@ vect_update_ivs_after_vectorizer
> > > > > > (loop_vec_info
> > > > > > > loop_vinfo,
> > > > > > >
> > > > > > >        var = create_tmp_var (type, "tmp");
> > > > > > >
> > > > > > > -      last_gsi = gsi_last_bb (exit_bb);
> > > > > > >        gimple_seq new_stmts = NULL;
> > > > > > >        ni_name = force_gimple_operand (ni, &new_stmts, false,
> > > > > > > var);
> > > > > > > +
> > > > > > > +      /* For non-main exit create an intermediat edge to get any
> > updated iv
> > > > > > > +	 calculations.  */
> > > > > > > +      if (needs_interm_block
> > > > > > > +	  && !iv_block
> > > > > > > +	  && (!gimple_seq_empty_p (stmts) || !gimple_seq_empty_p
> > > > > > (new_stmts)))
> > > > > > > +	{
> > > > > > > +	  iv_block = split_edge (update_e);
> > > > > > > +	  update_e = single_succ_edge (update_e->dest);
> > > > > > > +	  last_gsi = gsi_last_bb (iv_block);
> > > > > > > +	}
> > > > > > > +
> > > > > > >        /* Exit_bb shouldn't be empty.  */
> > > > > > >        if (!gsi_end_p (last_gsi))
> > > > > > >  	{
> > > > > > > @@ -3342,8 +3403,26 @@ vect_do_peeling (loop_vec_info
> > > > > > > loop_vinfo, tree
> > > > > > niters, tree nitersm1,
> > > > > > >  	 niters_vector_mult_vf steps.  */
> > > > > > >        gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
> > > > > > >        update_e = skip_vector ? e : loop_preheader_edge (epilog);
> > > > > > > -      vect_update_ivs_after_vectorizer (loop_vinfo,
> > niters_vector_mult_vf,
> > > > > > > -					update_e);
> > > > > > > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > > > > > > +	update_e = single_succ_edge (e->dest);
> > > > > > > +      bool inversed_iv
> > > > > > > +	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT
> > (loop_vinfo),
> > > > > > > +					 LOOP_VINFO_LOOP
> > (loop_vinfo));
> > > > > >
> > > > > > You are computing this here and in vect_update_ivs_after_vectorizer?
> > > > > >
> > > > > > > +
> > > > > > > +      /* Update the main exit first.  */
> > > > > > > +      vect_update_ivs_after_vectorizer (loop_vinfo, vf,
> > > > niters_vector_mult_vf,
> > > > > > > +					update_e, inversed_iv);
> > > > > > > +
> > > > > > > +      /* And then update the early exits.  */
> > > > > > > +      for (auto exit : get_loop_exit_edges (loop))
> > > > > > > +	{
> > > > > > > +	  if (exit == LOOP_VINFO_IV_EXIT (loop_vinfo))
> > > > > > > +	    continue;
> > > > > > > +
> > > > > > > +	  vect_update_ivs_after_vectorizer (loop_vinfo, vf,
> > > > > > > +					    niters_vector_mult_vf,
> > > > > > > +					    exit, true);
> > > > > >
> > > > > > ... why does the same not work here?  Wouldn't the proper
> > > > > > condition be !dominated_by_p (CDI_DOMINATORS, exit->src,
> > > > > > LOOP_VINFO_IV_EXIT
> > > > > > (loop_vinfo)->src) or similar?  That is, whether the exit is at
> > > > > > or after the main IV exit?  (consider having two)
> > > > > >
> > > > > > > +	}
> > > > > > >
> > > > > > >        if (skip_epilog)
> > > > > > >  	{
> > > > > > >
> > > > >
> > > >
> > > > --
> > > > Richard Biener <rguenther@suse.de>
> > > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > > Nuernberg, Germany;
> > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > > Nuernberg)
> > >
> > 
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH,
> > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > Nuernberg)
>
Tamar Christina Nov. 16, 2023, 12:01 p.m. UTC | #9
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Thursday, November 16, 2023 11:28 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> Subject: RE: [PATCH 7/21]middle-end: update IV update code to support early
> breaks and arbitrary exits
> 
> On Thu, 16 Nov 2023, Tamar Christina wrote:
> 
> > > -----Original Message-----
> > > From: Richard Biener <rguenther@suse.de>
> > > Sent: Thursday, November 16, 2023 10:40 AM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>;
> jlaw@ventanamicro.com
> > > Subject: RE: [PATCH 7/21]middle-end: update IV update code to
> > > support early breaks and arbitrary exits
> > >
> > > On Wed, 15 Nov 2023, Tamar Christina wrote:
> > >
> > > > > -----Original Message-----
> > > > > From: Richard Biener <rguenther@suse.de>
> > > > > Sent: Wednesday, November 15, 2023 1:23 PM
> > > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>;
> > > jlaw@ventanamicro.com
> > > > > Subject: RE: [PATCH 7/21]middle-end: update IV update code to
> > > > > support early breaks and arbitrary exits
> > > > >
> > > > > On Wed, 15 Nov 2023, Tamar Christina wrote:
> > > > >
> > > > > > > -----Original Message-----
> > > > > > > From: Richard Biener <rguenther@suse.de>
> > > > > > > Sent: Wednesday, November 15, 2023 1:01 PM
> > > > > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>;
> > > > > jlaw@ventanamicro.com
> > > > > > > Subject: RE: [PATCH 7/21]middle-end: update IV update code
> > > > > > > to support early breaks and arbitrary exits
> > > > > > >
> > > > > > > On Wed, 15 Nov 2023, Tamar Christina wrote:
> > > > > > >
> > > > > > > > Patch updated to latest trunk:
> > > > > > > >
> > > > > > > > Hi All,
> > > > > > > >
> > > > > > > > This changes the PHI node updates to support early breaks.
> > > > > > > > It has to support both the case where the loop's exit
> > > > > > > > matches the normal loop exit and one where the early exit is
> "inverted", i.e.
> > > > > > > > it's an early
> > > > > > > exit edge.
> > > > > > > >
> > > > > > > > In the latter case we must always restart the loop for VF iterations.
> > > > > > > > For an early exit the reason is obvious, but there are
> > > > > > > > cases where the "normal" exit is located before the early
> > > > > > > > one.  This exit then does a check on ivtmp resulting in us
> > > > > > > > leaving the loop since it thinks we're
> > > > > done.
> > > > > > > >
> > > > > > > > In these case we may still have side-effects to perform so
> > > > > > > > we also go to the scalar loop.
> > > > > > > >
> > > > > > > > For the "normal" exit niters has already been adjusted for
> > > > > > > > peeling, for the early exits we must find out how many
> > > > > > > > iterations we actually did.  So we have to recalculate the
> > > > > > > > new position
> > > for each exit.
> > > > > > > >
> > > > > > > > Thanks,
> > > > > > > > Tamar
> > > > > > > >
> > > > > > > > gcc/ChangeLog:
> > > > > > > >
> > > > > > > > 	* tree-vect-loop-manip.cc (vect_set_loop_condition_normal):
> > > > > > > > Hide
> > > > > > > unused.
> > > > > > > > 	(vect_update_ivs_after_vectorizer): Support early break.
> > > > > > > > 	(vect_do_peeling): Use it.
> > > > > > > >
> > > > > > > > --- inline copy of patch ---
> > > > > > > >
> > > > > > > > diff --git a/gcc/tree-vect-loop-manip.cc
> > > > > > > > b/gcc/tree-vect-loop-manip.cc index
> > > > > > > >
> > > > > > >
> > > > >
> > >
> d3fa8699271c4d7f404d648a38a95beabeabc99a..e1d210ab4617c894dab3
> > > > > > > d2654cf1
> > > > > > > > c842baac58f5 100644
> > > > > > > > --- a/gcc/tree-vect-loop-manip.cc
> > > > > > > > +++ b/gcc/tree-vect-loop-manip.cc
> > > > > > > > @@ -1200,7 +1200,7 @@
> > > > > > > > vect_set_loop_condition_partial_vectors_avx512
> > > > > > > (class loop *loop,
> > > > > > > >     loop handles exactly VF scalars per iteration.  */
> > > > > > > >
> > > > > > > >  static gcond *
> > > > > > > > -vect_set_loop_condition_normal (loop_vec_info loop_vinfo,
> > > > > > > > edge exit_edge,
> > > > > > > > +vect_set_loop_condition_normal (loop_vec_info /*
> > > > > > > > +loop_vinfo */, edge exit_edge,
> > > > > > > >  				class loop *loop, tree niters, tree step,
> > > > > > > >  				tree final_iv, bool niters_maybe_zero,
> > > > > > > >  				gimple_stmt_iterator loop_cond_gsi)
> > > @@ -
> > > > > > > 1412,7 +1412,7 @@
> > > > > > > > vect_set_loop_condition (class loop *loop, edge loop_e,
> > > > > > > > loop_vec_info
> > > > > > > loop_vinfo
> > > > > > > >     When this happens we need to flip the understanding of
> > > > > > > > main and
> > > > > other
> > > > > > > >     exits by peeling and IV updates.  */
> > > > > > > >
> > > > > > > > -bool inline
> > > > > > > > +bool
> > > > > > > >  vect_is_loop_exit_latch_pred (edge loop_exit, class loop *loop)  {
> > > > > > > >    return single_pred (loop->latch) == loop_exit->src; @@
> > > > > > > > -2142,6
> > > > > > > > +2142,7 @@ vect_can_advance_ivs_p (loop_vec_info
> > > > > > > > +loop_vinfo)
> > > > > > > >       Input:
> > > > > > > >       - LOOP - a loop that is going to be vectorized. The
> > > > > > > > last few
> > > iterations
> > > > > > > >                of LOOP were peeled.
> > > > > > > > +     - VF   - The chosen vectorization factor for LOOP.
> > > > > > > >       - NITERS - the number of iterations that LOOP executes (before
> it is
> > > > > > > >                  vectorized). i.e, the number of times the
> > > > > > > > ivs should be
> > > bumped.
> > > > > > > >       - UPDATE_E - a successor edge of LOOP->exit that is
> > > > > > > > on the
> > > > > > > > (only) path
> > > > > > >
> > > > > > > the comment on this is now a bit misleading, can you try to
> > > > > > > update it and/or move the comment bits to the docs on
> EARLY_EXIT?
> > > > > > >
> > > > > > > > @@ -2152,6 +2153,9 @@ vect_can_advance_ivs_p
> > > > > > > > (loop_vec_info
> > > > > > > loop_vinfo)
> > > > > > > >                    The phi args associated with the edge UPDATE_E in the
> bb
> > > > > > > >                    UPDATE_E->dest are updated accordingly.
> > > > > > > >
> > > > > > > > +     - restart_loop - Indicates whether the scalar loop
> > > > > > > > + needs to restart the
> > > > > > >
> > > > > > > params are ALL_CAPS
> > > > > > >
> > > > > > > > +		      iteration count where the vector loop began.
> > > > > > > > +
> > > > > > > >       Assumption 1: Like the rest of the vectorizer, this function
> assumes
> > > > > > > >       a single loop exit that has a single predecessor.
> > > > > > > >
> > > > > > > > @@ -2169,18 +2173,22 @@ vect_can_advance_ivs_p
> > > > > > > > (loop_vec_info
> > > > > > > loop_vinfo)
> > > > > > > >   */
> > > > > > > >
> > > > > > > >  static void
> > > > > > > > -vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
> > > > > > > > -				  tree niters, edge update_e)
> > > > > > > > +vect_update_ivs_after_vectorizer (loop_vec_info
> > > > > > > > +loop_vinfo,
> > > > > > > > +poly_uint64 vf,
> > > > > > >
> > > > > > > LOOP_VINFO_VECT_FACTOR?
> > > > > > >
> > > > > > > > +				  tree niters, edge update_e, bool
> > > > > > > restart_loop)
> > > > > > >
> > > > > > > I think 'bool early_exit' is better here?  I wonder if we have an "early"
> > > > > > > exit after the main exit we are probably sure there are no
> > > > > > > side-effects to re- execute and could avoid this restarting?
> > > > > >
> > > > > > Side effects yes, but the actual check may not have been performed
> yet.
> > > > > > If you remember
> > > > > >
> > > https://gist.github.com/Mistuke/66f14fe5c1be32b91ce149bd9b8bb35f
> > > > > > There in the clz loop through the "main" exit you still have
> > > > > > to see if that iteration did not contain the entry.  This is
> > > > > > because the loop counter is incremented before you iterate.
> > > > > >
> > > > > > >
> > > > > > > >  {
> > > > > > > >    gphi_iterator gsi, gsi1;
> > > > > > > >    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> > > > > > > >    basic_block update_bb = update_e->dest;
> > > > > > > > -
> > > > > > > > -  basic_block exit_bb = LOOP_VINFO_IV_EXIT
> > > > > > > > (loop_vinfo)->dest;
> > > > > > > > -
> > > > > > > > -  /* Make sure there exists a single-predecessor exit bb:
> > > > > > > > */
> > > > > > > > -  gcc_assert (single_pred_p (exit_bb));
> > > > > > > > -  gcc_assert (single_succ_edge (exit_bb) == update_e);
> > > > > > > > +  bool inversed_iv
> > > > > > > > +	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT
> > > (loop_vinfo),
> > > > > > > > +					 LOOP_VINFO_LOOP
> > > (loop_vinfo));
> > > > > > > > +  bool needs_interm_block = LOOP_VINFO_EARLY_BREAKS
> > > (loop_vinfo)
> > > > > > > > +			    && flow_bb_inside_loop_p (loop,
> > > update_e->src);
> > > > > > > > +  edge loop_e = LOOP_VINFO_IV_EXIT (loop_vinfo);  gcond
> > > > > > > > + *cond = get_loop_exit_condition (loop_e);  basic_block
> > > > > > > > + exit_bb = loop_e->dest;  basic_block iv_block = NULL;
> > > > > > > > + gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
> > > > > > > >
> > > > > > > >    for (gsi = gsi_start_phis (loop->header), gsi1 =
> > > > > > > > gsi_start_phis
> > > > > (update_bb);
> > > > > > > >         !gsi_end_p (gsi) && !gsi_end_p (gsi1); @@ -2190,7
> > > > > > > > +2198,6 @@ vect_update_ivs_after_vectorizer (loop_vec_info
> > > loop_vinfo,
> > > > > > > >        tree step_expr, off;
> > > > > > > >        tree type;
> > > > > > > >        tree var, ni, ni_name;
> > > > > > > > -      gimple_stmt_iterator last_gsi;
> > > > > > > >
> > > > > > > >        gphi *phi = gsi.phi ();
> > > > > > > >        gphi *phi1 = gsi1.phi (); @@ -2222,11 +2229,52 @@
> > > > > > > > vect_update_ivs_after_vectorizer
> > > > > > > (loop_vec_info loop_vinfo,
> > > > > > > >        enum vect_induction_op_type induction_type
> > > > > > > >  	= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
> > > > > > > >
> > > > > > > > -      if (induction_type == vect_step_op_add)
> > > > > > > > +      tree iv_var = PHI_ARG_DEF_FROM_EDGE (phi,
> > > > > > > > + loop_latch_edge
> > > > > (loop));
> > > > > > > > +      /* create_iv always places it on the LHS.  Alternatively we can
> set a
> > > > > > > > +	 property during create_iv to identify it.  */
> > > > > > > > +      bool ivtemp = gimple_cond_lhs (cond) == iv_var;
> > > > > > > > +      if (restart_loop && ivtemp)
> > > > > > > >  	{
> > > > > > > > +	  type = TREE_TYPE (gimple_phi_result (phi));
> > > > > > > > +	  ni = build_int_cst (type, vf);
> > > > > > > > +	  if (inversed_iv)
> > > > > > > > +	    ni = fold_build2 (MINUS_EXPR, type, ni,
> > > > > > > > +			      fold_convert (type, step_expr));
> > > > > > > > +	}
> > > > > > > > +      else if (induction_type == vect_step_op_add)
> > > > > > > > +	{
> > > > > > > > +
> > > > > > > >  	  tree stype = TREE_TYPE (step_expr);
> > > > > > > > -	  off = fold_build2 (MULT_EXPR, stype,
> > > > > > > > -			     fold_convert (stype, niters), step_expr);
> > > > > > > > +
> > > > > > > > +	  /* Early exits always use last iter value not niters. */
> > > > > > > > +	  if (restart_loop)
> > > > > > > > +	    {
> > > > > > > > +	      /* Live statements in the non-main exit shouldn't
> > > > > > > > +be
> > > adjusted.  We
> > > > > > > > +		 normally didn't have this problem with a single exit
> > > > > > > > +as
> > > live
> > > > > > > > +		 values would be in the exit block.  However when
> > > dealing with
> > > > > > > > +		 multiple exits all exits are redirected to the merge
> > > block
> > > > > > > > +		 and we restart the iteration.  */
> > > > > > >
> > > > > > > Hmm, I fail to see how this works - we're either using the
> > > > > > > value to continue the induction or not, independent of
> STMT_VINFO_LIVE_P.
> > > > > >
> > > > > > That becomes clear in the patch to update live reductions.
> > > > > > Essentially any live Reductions inside an alternative exit
> > > > > > will reduce to the first element rather than the last and use
> > > > > > that as the seed for the
> > > > > scalar loop.
> > > > >
> > > > > Hum.  Reductions are vectorized as N separate reductions.  I
> > > > > don't think you can simply change the reduction between the lanes to
> "skip"
> > > > > part of the vector iteration.  But you can use the value of the
> > > > > vector from before the vector iteration - the loop header PHI
> > > > > result, and fully reduce that to get at the proper value.
> > > >
> > > > That's what It's supposed to be doing though.  The reason live
> > > > operations are skipped here is that if we don't we'll re-adjust
> > > > the IV even though the value will already be correct after vectorization.
> > > >
> > > > Remember that this code only gets so far for IV PHI nodes.
> > > >
> > > > The loop phi header result itself can be live, i.e. see testcases
> > > > vect-early-break_70.c to vect-early-break_75.c
> > > >
> > > > you have i_15 = PHI <i_14 (6), 1(2)>
> > > >
> > > > we use i_15 in the early exit. This should not be adjusted because
> > > > when it's vectorized the value at 0[lane 0] is already correct.
> > > > This is why for any PHI inside the early exits it uses the value
> > > > 0[0] instead of
> > > N[lane_max].
> > > >
> > > > Perhaps I'm missing something here?
> > >
> > > OK, so I refreshed my mind of what vect_update_ivs_after_vectorizer
> does.
> > >
> > > I still do not understand the (complexity of the) patch.  Basically
> > > the function computes the new value of the IV "from scratch" based
> > > on the number of scalar iterations of the vector loop, the 'niter'
> > > argument.  I would have expected that for the early exits we either
> > > pass in a different 'niter' or alternatively a 'niter_adjustment'.
> >
> > But for an early exit there's no static value for adjusted niter,
> > since you don't know which iteration you exited from. Unlike the
> > normal exit when you know if you get there you've done all possible
> iterations.
> >
> > So you must compute the scalar iteration count on the exit itself.
> 
> ?  You do not need the actual scalar iteration you exited (you don't compute
> that either), you need the scalar iteration the vector iteration started with
> when it exited prematurely and that's readily available?

For a normal exit yes, but not for an early exit, no?  niters_vector_mult_vf is only
valid for the main exit.

There's the unadjusted scalar count, which is what it's using to adjust it to
the final count.  Unless I'm missing something?

> 
> > >
> > > It seems your change handles different kinds of inductions differently.
> > > Specifically
> > >
> > >       bool ivtemp = gimple_cond_lhs (cond) == iv_var;
> > >       if (restart_loop && ivtemp)
> > >         {
> > >           type = TREE_TYPE (gimple_phi_result (phi));
> > >           ni = build_int_cst (type, vf);
> > >           if (inversed_iv)
> > >             ni = fold_build2 (MINUS_EXPR, type, ni,
> > >                               fold_convert (type, step_expr));
> > >         }
> > >
> > > it looks like for the exit test IV we use either 'VF' or 'VF - step'
> > > as the new value.  That seems to be very odd special casing for
> > > unknown reasons.  And while you adjust vec_step_op_add, you don't
> > > adjust vect_peel_nonlinear_iv_init (maybe not supported - better assert
> here).
> >
> > The VF case is for a normal "non-inverted" loop, where if you take an
> > early exit you know that you have to do at most VF iterations.  The VF
> > - step is to account for the inverted loop control flow where you exit
> > after adjusting the IV already by + step.
> 
> But doesn't that assume the IV counts from niter to zero?  I don't see this
> special case is actually necessary, no?
> 

I needed it because otherwise the scalar loop iterates one iteration too few,
so I got a miscompile with the inverted loop stuff.  I'll look at it again; perhaps
it can be solved differently.

> >
> > Peeling doesn't matter here, since you know you were able to do a
> > vector iteration so it's safe to do VF iterations.  So having peeled
> > doesn't affect the remaining iters count.
> >
> > >
> > > Also the vec_step_op_add case will keep the original scalar IV live
> > > even when it is a vectorized induction.  The code recomputing the
> > > value from scratch avoids this.
> > >
> > >       /* For non-main exit create an intermediat edge to get any updated iv
> > >          calculations.  */
> > >       if (needs_interm_block
> > >           && !iv_block
> > >           && (!gimple_seq_empty_p (stmts) || !gimple_seq_empty_p
> > > (new_stmts)))
> > >         {
> > >           iv_block = split_edge (update_e);
> > >           update_e = single_succ_edge (update_e->dest);
> > >           last_gsi = gsi_last_bb (iv_block);
> > >         }
> > >
> > > this is also odd, can we adjust the API instead?  I suppose this is
> > > because your computation uses the original loop IV, if you based the
> > > computation off the initial value only this might not be necessary?
> >
> > No, on the main exit the code updates the value in the loop header and
> > puts the Calculation in the merge block.  This works because it only
> > needs to consume PHI nodes in the merge block and things like niters are
> adjusted in the guard block.
> >
> > For an early exit, we don't have a guard block, only the merge block.
> > We have to update the PHI nodes in that block,  but can't do so since
> > you can't produce a value and consume it in a PHI node in the same BB.
> > So we need to create the block to put the values in for use in the
> > merge block.  Because there's no "guard" block for early exits.
> 
> ?  then compute niters in that block as well.

We can't since it'll not be reachable through the right edge.  What we can
do if you want is slightly change peeling, we currently peel as:

  \        \             /
  E1     E2        Normal exit
    \       |          |
       \    |          Guard
          \ |          |
         Merge block
                  |
             Pre Header

If we instead peel as:


  \        \             /
  E1     E2        Normal exit
    \       |          |
       Exit join   Guard
          \ |          |
         Merge block
                  |
             Pre Header

We can use the exit join block.  This would also mean vect_update_ivs_after_vectorizer
doesn't need to iterate over all exits and only really needs to adjust the phi nodes
coming out of the exit join and guard block.

Does this work for you?

Thanks,
Tamar
> 
> > The API can be adjusted by always creating the empty block either during
> peeling.
> > That would prevent us from having to do anything special here.  Would
> > that work better?  Or I can do it in the loop that iterates over the
> > exits to before the call to vect_update_ivs_after_vectorizer, which I think
> might be more consistent.
> >
> > >
> > > That said, I wonder why we cannot simply pass in an adjusted niter
> > > which would be niters_vector_mult_vf - vf and be done with that?
> > >
> >
> > We can of course not have this and recompute it from niters itself,
> > however this does affect the epilog code layout. Particularly knowing
> > the static number of iterations left causes it to usually unroll the
> > loop and share some of the computations.  i.e. the scalar code is often more
> efficient.
> >
> > The computation would be niters_vector_mult_vf - iters_done * vf,
> > since the value put here is the remaining iteration count.  It's static for early
> exits.
> 
> Well, it might be "static" in that it doesn't really matter what you use for the
> epilog main IV initial value as long as you are sure you're not going to take that
> exit as you are sure we're going to take one of the early exits.  So yeah, the
> special code is probably OK, but it needs a better comment and as said the
> structure of vect_update_ivs_after_vectorizer is a bit hard to follow now.
> 
> As said an important part for optimization is to not keep the scalar IVs live in
> the vector loop.
> 
> > But can do whatever you prefer here.  Let me know what you prefer for the
> above.
> >
> > Thanks,
> > Tamar
> >
> > > Thanks,
> > > Richard.
> > >
> > >
> > > > Regards,
> > > > Tamar
> > > > >
> > > > > > It has to do this since you have to perform the side effects
> > > > > > for the non-matching elements still.
> > > > > >
> > > > > > Regards,
> > > > > > Tamar
> > > > > >
> > > > > > >
> > > > > > > > +	      if (STMT_VINFO_LIVE_P (phi_info))
> > > > > > > > +		continue;
> > > > > > > > +
> > > > > > > > +	      /* For early break the final loop IV is:
> > > > > > > > +		 init + (final - init) * vf which takes into account peeling
> > > > > > > > +		 values and non-single steps.  The main exit can use
> > > niters
> > > > > > > > +		 since if you exit from the main exit you've done all
> > > vector
> > > > > > > > +		 iterations.  For an early exit we don't know when we
> > > exit
> > > > > > > > +so
> > > > > > > we
> > > > > > > > +		 must re-calculate this on the exit.  */
> > > > > > > > +	      tree start_expr = gimple_phi_result (phi);
> > > > > > > > +	      off = fold_build2 (MINUS_EXPR, stype,
> > > > > > > > +				 fold_convert (stype, start_expr),
> > > > > > > > +				 fold_convert (stype, init_expr));
> > > > > > > > +	      /* Now adjust for VF to get the final iteration value.  */
> > > > > > > > +	      off = fold_build2 (MULT_EXPR, stype, off,
> > > > > > > > +				 build_int_cst (stype, vf));
> > > > > > > > +	    }
> > > > > > > > +	  else
> > > > > > > > +	    off = fold_build2 (MULT_EXPR, stype,
> > > > > > > > +			       fold_convert (stype, niters), step_expr);
> > > > > > > > +
> > > > > > > >  	  if (POINTER_TYPE_P (type))
> > > > > > > >  	    ni = fold_build_pointer_plus (init_expr, off);
> > > > > > > >  	  else
> > > > > > > > @@ -2238,6 +2286,8 @@ vect_update_ivs_after_vectorizer
> > > > > > > > (loop_vec_info
> > > > > > > loop_vinfo,
> > > > > > > >        /* Don't bother call vect_peel_nonlinear_iv_init.  */
> > > > > > > >        else if (induction_type == vect_step_op_neg)
> > > > > > > >  	ni = init_expr;
> > > > > > > > +      else if (restart_loop)
> > > > > > > > +	continue;
> > > > > > >
> > > > > > > This looks all a bit complicated - why wouldn't we simply
> > > > > > > always use the PHI result when 'restart_loop'?  Isn't that
> > > > > > > the correct old start value in
> > > > > all cases?
> > > > > > >
> > > > > > > >        else
> > > > > > > >  	ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
> > > > > > > >  					  niters, step_expr, @@ -
> 2245,9 +2295,20 @@
> > > > > > > > vect_update_ivs_after_vectorizer
> > > > > > > (loop_vec_info
> > > > > > > > loop_vinfo,
> > > > > > > >
> > > > > > > >        var = create_tmp_var (type, "tmp");
> > > > > > > >
> > > > > > > > -      last_gsi = gsi_last_bb (exit_bb);
> > > > > > > >        gimple_seq new_stmts = NULL;
> > > > > > > >        ni_name = force_gimple_operand (ni, &new_stmts,
> > > > > > > > false, var);
> > > > > > > > +
> > > > > > > > +      /* For non-main exit create an intermediat edge to
> > > > > > > > + get any
> > > updated iv
> > > > > > > > +	 calculations.  */
> > > > > > > > +      if (needs_interm_block
> > > > > > > > +	  && !iv_block
> > > > > > > > +	  && (!gimple_seq_empty_p (stmts) || !gimple_seq_empty_p
> > > > > > > (new_stmts)))
> > > > > > > > +	{
> > > > > > > > +	  iv_block = split_edge (update_e);
> > > > > > > > +	  update_e = single_succ_edge (update_e->dest);
> > > > > > > > +	  last_gsi = gsi_last_bb (iv_block);
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > >        /* Exit_bb shouldn't be empty.  */
> > > > > > > >        if (!gsi_end_p (last_gsi))
> > > > > > > >  	{
> > > > > > > > @@ -3342,8 +3403,26 @@ vect_do_peeling (loop_vec_info
> > > > > > > > loop_vinfo, tree
> > > > > > > niters, tree nitersm1,
> > > > > > > >  	 niters_vector_mult_vf steps.  */
> > > > > > > >        gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
> > > > > > > >        update_e = skip_vector ? e : loop_preheader_edge (epilog);
> > > > > > > > -      vect_update_ivs_after_vectorizer (loop_vinfo,
> > > niters_vector_mult_vf,
> > > > > > > > -					update_e);
> > > > > > > > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > > > > > > > +	update_e = single_succ_edge (e->dest);
> > > > > > > > +      bool inversed_iv
> > > > > > > > +	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT
> > > (loop_vinfo),
> > > > > > > > +					 LOOP_VINFO_LOOP
> > > (loop_vinfo));
> > > > > > >
> > > > > > > You are computing this here and in
> vect_update_ivs_after_vectorizer?
> > > > > > >
> > > > > > > > +
> > > > > > > > +      /* Update the main exit first.  */
> > > > > > > > +      vect_update_ivs_after_vectorizer (loop_vinfo, vf,
> > > > > niters_vector_mult_vf,
> > > > > > > > +					update_e, inversed_iv);
> > > > > > > > +
> > > > > > > > +      /* And then update the early exits.  */
> > > > > > > > +      for (auto exit : get_loop_exit_edges (loop))
> > > > > > > > +	{
> > > > > > > > +	  if (exit == LOOP_VINFO_IV_EXIT (loop_vinfo))
> > > > > > > > +	    continue;
> > > > > > > > +
> > > > > > > > +	  vect_update_ivs_after_vectorizer (loop_vinfo, vf,
> > > > > > > > +					    niters_vector_mult_vf,
> > > > > > > > +					    exit, true);
> > > > > > >
> > > > > > > ... why does the same not work here?  Wouldn't the proper
> > > > > > > condition be !dominated_by_p (CDI_DOMINATORS, exit->src,
> > > > > > > LOOP_VINFO_IV_EXIT
> > > > > > > (loop_vinfo)->src) or similar?  That is, whether the exit is
> > > > > > > at or after the main IV exit?  (consider having two)
> > > > > > >
> > > > > > > > +	}
> > > > > > > >
> > > > > > > >        if (skip_epilog)
> > > > > > > >  	{
> > > > > > > >
> > > > > >
> > > > >
> > > > > --
> > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions
> > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > > > Nuernberg)
> > > >
> > >
> > > --
> > > Richard Biener <rguenther@suse.de>
> > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > Nuernberg, Germany;
> > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > Nuernberg)
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> Nuernberg)
Richard Biener Nov. 16, 2023, 12:30 p.m. UTC | #10
On Thu, 16 Nov 2023, Tamar Christina wrote:

> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Thursday, November 16, 2023 11:28 AM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> > Subject: RE: [PATCH 7/21]middle-end: update IV update code to support early
> > breaks and arbitrary exits
> > 
> > On Thu, 16 Nov 2023, Tamar Christina wrote:
> > 
> > > > -----Original Message-----
> > > > From: Richard Biener <rguenther@suse.de>
> > > > Sent: Thursday, November 16, 2023 10:40 AM
> > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>;
> > jlaw@ventanamicro.com
> > > > Subject: RE: [PATCH 7/21]middle-end: update IV update code to
> > > > support early breaks and arbitrary exits
> > > >
> > > > On Wed, 15 Nov 2023, Tamar Christina wrote:
> > > >
> > > > > > -----Original Message-----
> > > > > > From: Richard Biener <rguenther@suse.de>
> > > > > > Sent: Wednesday, November 15, 2023 1:23 PM
> > > > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>;
> > > > jlaw@ventanamicro.com
> > > > > > Subject: RE: [PATCH 7/21]middle-end: update IV update code to
> > > > > > support early breaks and arbitrary exits
> > > > > >
> > > > > > On Wed, 15 Nov 2023, Tamar Christina wrote:
> > > > > >
> > > > > > > > -----Original Message-----
> > > > > > > > From: Richard Biener <rguenther@suse.de>
> > > > > > > > Sent: Wednesday, November 15, 2023 1:01 PM
> > > > > > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > > > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>;
> > > > > > jlaw@ventanamicro.com
> > > > > > > > Subject: RE: [PATCH 7/21]middle-end: update IV update code
> > > > > > > > to support early breaks and arbitrary exits
> > > > > > > >
> > > > > > > > On Wed, 15 Nov 2023, Tamar Christina wrote:
> > > > > > > >
> > > > > > > > > Patch updated to latest trunk:
> > > > > > > > >
> > > > > > > > > Hi All,
> > > > > > > > >
> > > > > > > > > This changes the PHI node updates to support early breaks.
> > > > > > > > > It has to support both the case where the loop's exit
> > > > > > > > > matches the normal loop exit and one where the early exit is
> > "inverted", i.e.
> > > > > > > > > it's an early
> > > > > > > > exit edge.
> > > > > > > > >
> > > > > > > > > In the latter case we must always restart the loop for VF iterations.
> > > > > > > > > For an early exit the reason is obvious, but there are
> > > > > > > > > cases where the "normal" exit is located before the early
> > > > > > > > > one.  This exit then does a check on ivtmp resulting in us
> > > > > > > > > leaving the loop since it thinks we're
> > > > > > done.
> > > > > > > > >
> > > > > > > > > In these case we may still have side-effects to perform so
> > > > > > > > > we also go to the scalar loop.
> > > > > > > > >
> > > > > > > > > For the "normal" exit niters has already been adjusted for
> > > > > > > > > peeling, for the early exits we must find out how many
> > > > > > > > > iterations we actually did.  So we have to recalculate the
> > > > > > > > > new position
> > > > for each exit.
> > > > > > > > >
> > > > > > > > > Thanks,
> > > > > > > > > Tamar
> > > > > > > > >
> > > > > > > > > gcc/ChangeLog:
> > > > > > > > >
> > > > > > > > > 	* tree-vect-loop-manip.cc (vect_set_loop_condition_normal):
> > > > > > > > > Hide
> > > > > > > > unused.
> > > > > > > > > 	(vect_update_ivs_after_vectorizer): Support early break.
> > > > > > > > > 	(vect_do_peeling): Use it.
> > > > > > > > >
> > > > > > > > > --- inline copy of patch ---
> > > > > > > > >
> > > > > > > > > diff --git a/gcc/tree-vect-loop-manip.cc
> > > > > > > > > b/gcc/tree-vect-loop-manip.cc index
> > > > > > > > >
> > > > > > > >
> > > > > >
> > > >
> > d3fa8699271c4d7f404d648a38a95beabeabc99a..e1d210ab4617c894dab3
> > > > > > > > d2654cf1
> > > > > > > > > c842baac58f5 100644
> > > > > > > > > --- a/gcc/tree-vect-loop-manip.cc
> > > > > > > > > +++ b/gcc/tree-vect-loop-manip.cc
> > > > > > > > > @@ -1200,7 +1200,7 @@
> > > > > > > > > vect_set_loop_condition_partial_vectors_avx512
> > > > > > > > (class loop *loop,
> > > > > > > > >     loop handles exactly VF scalars per iteration.  */
> > > > > > > > >
> > > > > > > > >  static gcond *
> > > > > > > > > -vect_set_loop_condition_normal (loop_vec_info loop_vinfo,
> > > > > > > > > edge exit_edge,
> > > > > > > > > +vect_set_loop_condition_normal (loop_vec_info /*
> > > > > > > > > +loop_vinfo */, edge exit_edge,
> > > > > > > > >  				class loop *loop, tree niters, tree step,
> > > > > > > > >  				tree final_iv, bool niters_maybe_zero,
> > > > > > > > >  				gimple_stmt_iterator loop_cond_gsi)
> > > > @@ -
> > > > > > > > 1412,7 +1412,7 @@
> > > > > > > > > vect_set_loop_condition (class loop *loop, edge loop_e,
> > > > > > > > > loop_vec_info
> > > > > > > > loop_vinfo
> > > > > > > > >     When this happens we need to flip the understanding of
> > > > > > > > > main and
> > > > > > other
> > > > > > > > >     exits by peeling and IV updates.  */
> > > > > > > > >
> > > > > > > > > -bool inline
> > > > > > > > > +bool
> > > > > > > > >  vect_is_loop_exit_latch_pred (edge loop_exit, class loop *loop)  {
> > > > > > > > >    return single_pred (loop->latch) == loop_exit->src; @@
> > > > > > > > > -2142,6
> > > > > > > > > +2142,7 @@ vect_can_advance_ivs_p (loop_vec_info
> > > > > > > > > +loop_vinfo)
> > > > > > > > >       Input:
> > > > > > > > >       - LOOP - a loop that is going to be vectorized. The
> > > > > > > > > last few
> > > > iterations
> > > > > > > > >                of LOOP were peeled.
> > > > > > > > > +     - VF   - The chosen vectorization factor for LOOP.
> > > > > > > > >       - NITERS - the number of iterations that LOOP executes (before
> > it is
> > > > > > > > >                  vectorized). i.e, the number of times the
> > > > > > > > > ivs should be
> > > > bumped.
> > > > > > > > >       - UPDATE_E - a successor edge of LOOP->exit that is
> > > > > > > > > on the
> > > > > > > > > (only) path
> > > > > > > >
> > > > > > > > the comment on this is now a bit misleading, can you try to
> > > > > > > > update it and/or move the comment bits to the docs on
> > EARLY_EXIT?
> > > > > > > >
> > > > > > > > > @@ -2152,6 +2153,9 @@ vect_can_advance_ivs_p
> > > > > > > > > (loop_vec_info
> > > > > > > > loop_vinfo)
> > > > > > > > >                    The phi args associated with the edge UPDATE_E in the
> > bb
> > > > > > > > >                    UPDATE_E->dest are updated accordingly.
> > > > > > > > >
> > > > > > > > > +     - restart_loop - Indicates whether the scalar loop
> > > > > > > > > + needs to restart the
> > > > > > > >
> > > > > > > > params are ALL_CAPS
> > > > > > > >
> > > > > > > > > +		      iteration count where the vector loop began.
> > > > > > > > > +
> > > > > > > > >       Assumption 1: Like the rest of the vectorizer, this function
> > assumes
> > > > > > > > >       a single loop exit that has a single predecessor.
> > > > > > > > >
> > > > > > > > > @@ -2169,18 +2173,22 @@ vect_can_advance_ivs_p
> > > > > > > > > (loop_vec_info
> > > > > > > > loop_vinfo)
> > > > > > > > >   */
> > > > > > > > >
> > > > > > > > >  static void
> > > > > > > > > -vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
> > > > > > > > > -				  tree niters, edge update_e)
> > > > > > > > > +vect_update_ivs_after_vectorizer (loop_vec_info
> > > > > > > > > +loop_vinfo,
> > > > > > > > > +poly_uint64 vf,
> > > > > > > >
> > > > > > > > LOOP_VINFO_VECT_FACTOR?
> > > > > > > >
> > > > > > > > > +				  tree niters, edge update_e, bool
> > > > > > > > restart_loop)
> > > > > > > >
> > > > > > > > I think 'bool early_exit' is better here?  I wonder if we have an "early"
> > > > > > > > exit after the main exit we are probably sure there are no
> > > > > > > > side-effects to re- execute and could avoid this restarting?
> > > > > > >
> > > > > > > Side effects yes, but the actual check may not have been performed
> > yet.
> > > > > > > If you remember
> > > > > > >
> > > > https://gist.github.com/Mistuke/66f14fe5c1be32b91ce149bd9b8bb35f
> > > > > > > There in the clz loop through the "main" exit you still have
> > > > > > > to see if that iteration did not contain the entry.  This is
> > > > > > > because the loop counter is incremented before you iterate.
> > > > > > >
> > > > > > > >
> > > > > > > > >  {
> > > > > > > > >    gphi_iterator gsi, gsi1;
> > > > > > > > >    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> > > > > > > > >    basic_block update_bb = update_e->dest;
> > > > > > > > > -
> > > > > > > > > -  basic_block exit_bb = LOOP_VINFO_IV_EXIT
> > > > > > > > > (loop_vinfo)->dest;
> > > > > > > > > -
> > > > > > > > > -  /* Make sure there exists a single-predecessor exit bb:
> > > > > > > > > */
> > > > > > > > > -  gcc_assert (single_pred_p (exit_bb));
> > > > > > > > > -  gcc_assert (single_succ_edge (exit_bb) == update_e);
> > > > > > > > > +  bool inversed_iv
> > > > > > > > > +	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT
> > > > (loop_vinfo),
> > > > > > > > > +					 LOOP_VINFO_LOOP
> > > > (loop_vinfo));
> > > > > > > > > +  bool needs_interm_block = LOOP_VINFO_EARLY_BREAKS
> > > > (loop_vinfo)
> > > > > > > > > +			    && flow_bb_inside_loop_p (loop,
> > > > update_e->src);
> > > > > > > > > +  edge loop_e = LOOP_VINFO_IV_EXIT (loop_vinfo);  gcond
> > > > > > > > > + *cond = get_loop_exit_condition (loop_e);  basic_block
> > > > > > > > > + exit_bb = loop_e->dest;  basic_block iv_block = NULL;
> > > > > > > > > + gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
> > > > > > > > >
> > > > > > > > >    for (gsi = gsi_start_phis (loop->header), gsi1 =
> > > > > > > > > gsi_start_phis
> > > > > > (update_bb);
> > > > > > > > >         !gsi_end_p (gsi) && !gsi_end_p (gsi1); @@ -2190,7
> > > > > > > > > +2198,6 @@ vect_update_ivs_after_vectorizer (loop_vec_info
> > > > loop_vinfo,
> > > > > > > > >        tree step_expr, off;
> > > > > > > > >        tree type;
> > > > > > > > >        tree var, ni, ni_name;
> > > > > > > > > -      gimple_stmt_iterator last_gsi;
> > > > > > > > >
> > > > > > > > >        gphi *phi = gsi.phi ();
> > > > > > > > >        gphi *phi1 = gsi1.phi (); @@ -2222,11 +2229,52 @@
> > > > > > > > > vect_update_ivs_after_vectorizer
> > > > > > > > (loop_vec_info loop_vinfo,
> > > > > > > > >        enum vect_induction_op_type induction_type
> > > > > > > > >  	= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
> > > > > > > > >
> > > > > > > > > -      if (induction_type == vect_step_op_add)
> > > > > > > > > +      tree iv_var = PHI_ARG_DEF_FROM_EDGE (phi,
> > > > > > > > > + loop_latch_edge
> > > > > > (loop));
> > > > > > > > > +      /* create_iv always places it on the LHS.  Alternatively we can
> > set a
> > > > > > > > > +	 property during create_iv to identify it.  */
> > > > > > > > > +      bool ivtemp = gimple_cond_lhs (cond) == iv_var;
> > > > > > > > > +      if (restart_loop && ivtemp)
> > > > > > > > >  	{
> > > > > > > > > +	  type = TREE_TYPE (gimple_phi_result (phi));
> > > > > > > > > +	  ni = build_int_cst (type, vf);
> > > > > > > > > +	  if (inversed_iv)
> > > > > > > > > +	    ni = fold_build2 (MINUS_EXPR, type, ni,
> > > > > > > > > +			      fold_convert (type, step_expr));
> > > > > > > > > +	}
> > > > > > > > > +      else if (induction_type == vect_step_op_add)
> > > > > > > > > +	{
> > > > > > > > > +
> > > > > > > > >  	  tree stype = TREE_TYPE (step_expr);
> > > > > > > > > -	  off = fold_build2 (MULT_EXPR, stype,
> > > > > > > > > -			     fold_convert (stype, niters), step_expr);
> > > > > > > > > +
> > > > > > > > > +	  /* Early exits always use last iter value not niters. */
> > > > > > > > > +	  if (restart_loop)
> > > > > > > > > +	    {
> > > > > > > > > +	      /* Live statements in the non-main exit shouldn't
> > > > > > > > > +be
> > > > adjusted.  We
> > > > > > > > > +		 normally didn't have this problem with a single exit
> > > > > > > > > +as
> > > > live
> > > > > > > > > +		 values would be in the exit block.  However when
> > > > dealing with
> > > > > > > > > +		 multiple exits all exits are redirected to the merge
> > > > block
> > > > > > > > > +		 and we restart the iteration.  */
> > > > > > > >
> > > > > > > > Hmm, I fail to see how this works - we're either using the
> > > > > > > > value to continue the induction or not, independent of
> > STMT_VINFO_LIVE_P.
> > > > > > >
> > > > > > > That becomes clear in the patch to update live reductions.
> > > > > > > Essentially any live Reductions inside an alternative exit
> > > > > > > will reduce to the first element rather than the last and use
> > > > > > > that as the seed for the
> > > > > > scalar loop.
> > > > > >
> > > > > > Hum.  Reductions are vectorized as N separate reductions.  I
> > > > > > don't think you can simply change the reduction between the lanes to
> > "skip"
> > > > > > part of the vector iteration.  But you can use the value of the
> > > > > > vector from before the vector iteration - the loop header PHI
> > > > > > result, and fully reduce that to get at the proper value.
> > > > >
> > > > > That's what It's supposed to be doing though.  The reason live
> > > > > operations are skipped here is that if we don't we'll re-adjust
> > > > > the IV even though the value will already be correct after vectorization.
> > > > >
> > > > > Remember that this code only gets so far for IV PHI nodes.
> > > > >
> > > > > The loop phi header result itself can be live, i.e. see testcases
> > > > > vect-early-break_70.c to vect-early-break_75.c
> > > > >
> > > > > you have i_15 = PHI <i_14 (6), 1(2)>
> > > > >
> > > > > we use i_15 in the early exit. This should not be adjusted because
> > > > > when it's vectorized the value at 0[lane 0] is already correct.
> > > > > This is why for any PHI inside the early exits it uses the value
> > > > > 0[0] instead of
> > > > N[lane_max].
> > > > >
> > > > > Perhaps I'm missing something here?
> > > >
> > > > OK, so I refreshed my mind of what vect_update_ivs_after_vectorizer
> > does.
> > > >
> > > > I still do not understand the (complexity of the) patch.  Basically
> > > > the function computes the new value of the IV "from scratch" based
> > > > on the number of scalar iterations of the vector loop, the 'niter'
> > > > argument.  I would have expected that for the early exits we either
> > > > pass in a different 'niter' or alternatively a 'niter_adjustment'.
> > >
> > > But for an early exit there's no static value for adjusted niter,
> > > since you don't know which iteration you exited from. Unlike the
> > > normal exit when you know if you get there you've done all possible
> > iterations.
> > >
> > > So you must compute the scalar iteration count on the exit itself.
> > 
> > ?  You do not need the actual scalar iteration you exited (you don't compute
> > that either), you need the scalar iteration the vector iteration started with
> > when it exited prematurely and that's readily available?
> 
> For a normal exit yes, not for an early exit no? niters_vector_mult_vf is only
> valid for the main exit.
> 
> There's the unadjusted scalar count, which is what it's using to adjust it to
> the final count.  Unless I'm missing something?

Ah, of course - niters_vector_mult_vf is for the countable exit.  For
the early exits we can't precompute the scalar iteration value.  But that
then means we should compute the appropriate "continuation" as live
value of the vectorized IVs even when they were not originally used
outside of the loop.  I don't see how we can express this in terms
of the scalar IVs in the (not yet) vectorized loop - similar to the
reduction case you are going to end up with the wrong values here.

That said, I've for a long time wanted to preserve the original
control IV also for the vector code (leaving any "optimization"
to IVOPTs there), that would enable us to compute the correct
"niters_vector_mult_vf" based on that IV.

So given we cannot use the scalar IVs you have to handle all
inductions (besides the main exit control IV) in
vectorizable_live_operation I think.

Or for now disable early-break for inductions that are not
the main exit control IV (in vect_can_advance_ivs_p)?

> > > >
> > > > It seems your change handles different kinds of inductions differently.
> > > > Specifically
> > > >
> > > >       bool ivtemp = gimple_cond_lhs (cond) == iv_var;
> > > >       if (restart_loop && ivtemp)
> > > >         {
> > > >           type = TREE_TYPE (gimple_phi_result (phi));
> > > >           ni = build_int_cst (type, vf);
> > > >           if (inversed_iv)
> > > >             ni = fold_build2 (MINUS_EXPR, type, ni,
> > > >                               fold_convert (type, step_expr));
> > > >         }
> > > >
> > > > it looks like for the exit test IV we use either 'VF' or 'VF - step'
> > > > as the new value.  That seems to be very odd special casing for
> > > > unknown reasons.  And while you adjust vec_step_op_add, you don't
> > > > adjust vect_peel_nonlinear_iv_init (maybe not supported - better assert
> > here).
> > >
> > > The VF case is for a normal "non-inverted" loop, where if you take an
> > > early exit you know that you have to do at most VF iterations.  The VF
> > > - step is to account for the inverted loop control flow where you exit
> > > after adjusting the IV already by + step.
> > 
> > But doesn't that assume the IV counts from niter to zero?  I don't see this
> > special case is actually necessary, no?
> > 
> 
> I needed it because otherwise the scalar loop runs one iteration too few,
> so I got a miscompile with the inverted loop stuff.  I'll look at it again; perhaps
> it can be solved differently.
> 
> > >
> > > Peeling doesn't matter here, since you know you were able to do a
> > > vector iteration so it's safe to do VF iterations.  So having peeled
> > > doesn't affect the remaining iters count.
> > >
> > > >
> > > > Also the vec_step_op_add case will keep the original scalar IV live
> > > > even when it is a vectorized induction.  The code recomputing the
> > > > value from scratch avoids this.
> > > >
> > > >       /* For non-main exit create an intermediat edge to get any updated iv
> > > >          calculations.  */
> > > >       if (needs_interm_block
> > > >           && !iv_block
> > > >           && (!gimple_seq_empty_p (stmts) || !gimple_seq_empty_p
> > > > (new_stmts)))
> > > >         {
> > > >           iv_block = split_edge (update_e);
> > > >           update_e = single_succ_edge (update_e->dest);
> > > >           last_gsi = gsi_last_bb (iv_block);
> > > >         }
> > > >
> > > > this is also odd, can we adjust the API instead?  I suppose this is
> > > > because your computation uses the original loop IV, if you based the
> > > > computation off the initial value only this might not be necessary?
> > >
> > > No, on the main exit the code updates the value in the loop header and
> > > puts the Calculation in the merge block.  This works because it only
> > > needs to consume PHI nodes in the merge block and things like niters are
> > adjusted in the guard block.
> > >
> > > For an early exit, we don't have a guard block, only the merge block.
> > > We have to update the PHI nodes in that block,  but can't do so since
> > > you can't produce a value and consume it in a PHI node in the same BB.
> > > So we need to create the block to put the values in for use in the
> > > merge block.  Because there's no "guard" block for early exits.
> > 
> > ?  then compute niters in that block as well.
> 
> We can't since it'll not be reachable through the right edge.  What we can
> do if you want is slightly change peeling, we currently peel as:
> 
>   \        \             /
>   E1     E2        Normal exit
>     \       |          |
>        \    |          Guard
>           \ |          |
>          Merge block
>                   |
>              Pre Header
> 
> If we instead peel as:
> 
> 
>   \        \             /
>   E1     E2        Normal exit
>     \       |          |
>        Exit join   Guard
>           \ |          |
>          Merge block
>                   |
>              Pre Header
> 
> We can use the exit join block.  This would also mean vect_update_ivs_after_vectorizer
> doesn't need to iterate over all exits and only really needs to adjust the phi nodes
> coming out of the exit join and guard block.
> 
> Does this work for you?
> 
> Thanks,
> Tamar
> > 
> > > The API can be adjusted by always creating the empty block either during
> > peeling.
> > > That would prevent us from having to do anything special here.  Would
> > > that work better?  Or I can do it in the loop that iterates over the
> > > exits to before the call to vect_update_ivs_after_vectorizer, which I think
> > might be more consistent.
> > >
> > > >
> > > > That said, I wonder why we cannot simply pass in an adjusted niter
> > > > which would be niters_vector_mult_vf - vf and be done with that?
> > > >
> > >
> > > We can of course not have this and recompute it from niters itself,
> > > however this does affect the epilog code layout. Particularly knowing
> > > the static number of iterations left causes it to usually unroll the
> > > loop and share some of the computations.  i.e. the scalar code is often more
> > efficient.
> > >
> > > The computation would be niters_vector_mult_vf - iters_done * vf,
> > > since the value put Here is the remaining iteration count.  It's static for early
> > exits.
> > 
> > Well, it might be "static" in that it doesn't really matter what you use for the
> > epilog main IV initial value as long as you are sure you're not going to take that
> > exit as you are sure we're going to take one of the early exits.  So yeah, the
> > special code is probably OK, but it needs a better comment and as said the
> > structure of vect_update_ivs_after_vectorizer is a bit hard to follow now.
> > 
> > As said an important part for optimization is to not keep the scalar IVs live in
> > the vector loop.
> > 
> > > But can do whatever you prefer here.  Let me know what you prefer for the
> > above.
> > >
> > > Thanks,
> > > Tamar
> > >
> > > > Thanks,
> > > > Richard.
> > > >
> > > >
> > > > > Regards,
> > > > > Tamar
> > > > > >
> > > > > > > It has to do this since you have to perform the side effects
> > > > > > > for the non-matching elements still.
> > > > > > >
> > > > > > > Regards,
> > > > > > > Tamar
> > > > > > >
> > > > > > > >
> > > > > > > > > +	      if (STMT_VINFO_LIVE_P (phi_info))
> > > > > > > > > +		continue;
> > > > > > > > > +
> > > > > > > > > +	      /* For early break the final loop IV is:
> > > > > > > > > +		 init + (final - init) * vf which takes into account peeling
> > > > > > > > > +		 values and non-single steps.  The main exit can use
> > > > niters
> > > > > > > > > +		 since if you exit from the main exit you've done all
> > > > vector
> > > > > > > > > +		 iterations.  For an early exit we don't know when we
> > > > exit
> > > > > > > > > +so
> > > > > > > > we
> > > > > > > > > +		 must re-calculate this on the exit.  */
> > > > > > > > > +	      tree start_expr = gimple_phi_result (phi);
> > > > > > > > > +	      off = fold_build2 (MINUS_EXPR, stype,
> > > > > > > > > +				 fold_convert (stype, start_expr),
> > > > > > > > > +				 fold_convert (stype, init_expr));
> > > > > > > > > +	      /* Now adjust for VF to get the final iteration value.  */
> > > > > > > > > +	      off = fold_build2 (MULT_EXPR, stype, off,
> > > > > > > > > +				 build_int_cst (stype, vf));
> > > > > > > > > +	    }
> > > > > > > > > +	  else
> > > > > > > > > +	    off = fold_build2 (MULT_EXPR, stype,
> > > > > > > > > +			       fold_convert (stype, niters), step_expr);
> > > > > > > > > +
> > > > > > > > >  	  if (POINTER_TYPE_P (type))
> > > > > > > > >  	    ni = fold_build_pointer_plus (init_expr, off);
> > > > > > > > >  	  else
> > > > > > > > > @@ -2238,6 +2286,8 @@ vect_update_ivs_after_vectorizer
> > > > > > > > > (loop_vec_info
> > > > > > > > loop_vinfo,
> > > > > > > > >        /* Don't bother call vect_peel_nonlinear_iv_init.  */
> > > > > > > > >        else if (induction_type == vect_step_op_neg)
> > > > > > > > >  	ni = init_expr;
> > > > > > > > > +      else if (restart_loop)
> > > > > > > > > +	continue;
> > > > > > > >
> > > > > > > > This looks all a bit complicated - why wouldn't we simply
> > > > > > > > always use the PHI result when 'restart_loop'?  Isn't that
> > > > > > > > the correct old start value in
> > > > > > all cases?
> > > > > > > >
> > > > > > > > >        else
> > > > > > > > >  	ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
> > > > > > > > >  					  niters, step_expr, @@ -
> > 2245,9 +2295,20 @@
> > > > > > > > > vect_update_ivs_after_vectorizer
> > > > > > > > (loop_vec_info
> > > > > > > > > loop_vinfo,
> > > > > > > > >
> > > > > > > > >        var = create_tmp_var (type, "tmp");
> > > > > > > > >
> > > > > > > > > -      last_gsi = gsi_last_bb (exit_bb);
> > > > > > > > >        gimple_seq new_stmts = NULL;
> > > > > > > > >        ni_name = force_gimple_operand (ni, &new_stmts,
> > > > > > > > > false, var);
> > > > > > > > > +
> > > > > > > > > +      /* For non-main exit create an intermediat edge to
> > > > > > > > > + get any
> > > > updated iv
> > > > > > > > > +	 calculations.  */
> > > > > > > > > +      if (needs_interm_block
> > > > > > > > > +	  && !iv_block
> > > > > > > > > +	  && (!gimple_seq_empty_p (stmts) || !gimple_seq_empty_p
> > > > > > > > (new_stmts)))
> > > > > > > > > +	{
> > > > > > > > > +	  iv_block = split_edge (update_e);
> > > > > > > > > +	  update_e = single_succ_edge (update_e->dest);
> > > > > > > > > +	  last_gsi = gsi_last_bb (iv_block);
> > > > > > > > > +	}
> > > > > > > > > +
> > > > > > > > >        /* Exit_bb shouldn't be empty.  */
> > > > > > > > >        if (!gsi_end_p (last_gsi))
> > > > > > > > >  	{
> > > > > > > > > @@ -3342,8 +3403,26 @@ vect_do_peeling (loop_vec_info
> > > > > > > > > loop_vinfo, tree
> > > > > > > > niters, tree nitersm1,
> > > > > > > > >  	 niters_vector_mult_vf steps.  */
> > > > > > > > >        gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
> > > > > > > > >        update_e = skip_vector ? e : loop_preheader_edge (epilog);
> > > > > > > > > -      vect_update_ivs_after_vectorizer (loop_vinfo,
> > > > niters_vector_mult_vf,
> > > > > > > > > -					update_e);
> > > > > > > > > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > > > > > > > > +	update_e = single_succ_edge (e->dest);
> > > > > > > > > +      bool inversed_iv
> > > > > > > > > +	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT
> > > > (loop_vinfo),
> > > > > > > > > +					 LOOP_VINFO_LOOP
> > > > (loop_vinfo));
> > > > > > > >
> > > > > > > > You are computing this here and in
> > vect_update_ivs_after_vectorizer?
> > > > > > > >
> > > > > > > > > +
> > > > > > > > > +      /* Update the main exit first.  */
> > > > > > > > > +      vect_update_ivs_after_vectorizer (loop_vinfo, vf,
> > > > > > niters_vector_mult_vf,
> > > > > > > > > +					update_e, inversed_iv);
> > > > > > > > > +
> > > > > > > > > +      /* And then update the early exits.  */
> > > > > > > > > +      for (auto exit : get_loop_exit_edges (loop))
> > > > > > > > > +	{
> > > > > > > > > +	  if (exit == LOOP_VINFO_IV_EXIT (loop_vinfo))
> > > > > > > > > +	    continue;
> > > > > > > > > +
> > > > > > > > > +	  vect_update_ivs_after_vectorizer (loop_vinfo, vf,
> > > > > > > > > +					    niters_vector_mult_vf,
> > > > > > > > > +					    exit, true);
> > > > > > > >
> > > > > > > > ... why does the same not work here?  Wouldn't the proper
> > > > > > > > condition be !dominated_by_p (CDI_DOMINATORS, exit->src,
> > > > > > > > LOOP_VINFO_IV_EXIT
> > > > > > > > (loop_vinfo)->src) or similar?  That is, whether the exit is
> > > > > > > > at or after the main IV exit?  (consider having two)
> > > > > > > >
> > > > > > > > > +	}
> > > > > > > > >
> > > > > > > > >        if (skip_epilog)
> > > > > > > > >  	{
> > > > > > > > >
> > > > > > >
> > > > > >
> > > > > > --
> > > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions
> > > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > > > > Nuernberg)
> > > > >
> > > >
> > > > --
> > > > Richard Biener <rguenther@suse.de>
> > > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > > Nuernberg, Germany;
> > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > > Nuernberg)
> > >
> > 
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH,
> > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > Nuernberg)
>
Tamar Christina Nov. 16, 2023, 1:22 p.m. UTC | #11
> > > > > >
> > > > > > Perhaps I'm missing something here?
> > > > >
> > > > > OK, so I refreshed my mind of what
> > > > > vect_update_ivs_after_vectorizer
> > > does.
> > > > >
> > > > > I still do not understand the (complexity of the) patch.
> > > > > Basically the function computes the new value of the IV "from
> > > > > scratch" based on the number of scalar iterations of the vector loop,
> the 'niter'
> > > > > argument.  I would have expected that for the early exits we
> > > > > either pass in a different 'niter' or alternatively a 'niter_adjustment'.
> > > >
> > > > But for an early exit there's no static value for adjusted niter,
> > > > since you don't know which iteration you exited from. Unlike the
> > > > normal exit when you know if you get there you've done all
> > > > possible
> > > iterations.
> > > >
> > > > So you must compute the scalar iteration count on the exit itself.
> > >
> > > ?  You do not need the actual scalar iteration you exited (you don't
> > > compute that either), you need the scalar iteration the vector
> > > iteration started with when it exited prematurely and that's readily
> available?
> >
> > For a normal exit yes, not for an early exit no? niters_vector_mult_vf
> > is only valid for the main exit.
> >
> > There's the unadjusted scalar count, which is what it's using to
> > adjust it to the final count.  Unless I'm missing something?
> 
> Ah, of course - niters_vector_mult_vf is for the countable exit.  For the early
> exits we can't precompute the scalar iteration value.  But that then means we
> should compute the appropriate "continuation" as live value of the vectorized
> IVs even when they were not originally used outside of the loop.  I don't see
> how we can express this in terms of the scalar IVs in the (not yet) vectorized
> loop - similar to the reduction case you are going to end up with the wrong
> values here.
> 
> That said, I've for a long time wanted to preserve the original control IV also for
> the vector code (leaving any "optimization"
> to IVOPTs there), that would enable us to compute the correct
> "niters_vector_mult_vf" based on that IV.
> 
> So given we cannot use the scalar IVs you have to handle all inductions
> (besides the main exit control IV) in vectorizable_live_operation I think.
> 

That's what I currently do, that's why there was the
	      if (STMT_VINFO_LIVE_P (phi_info))
		continue;

I don't understand why we use the scalar count, though.  I suppose the reasoning
is that we don't really want to keep it around, and referencing it forces it to be kept?

At the moment it just does `init + (final - init) * vf`, which is correct, no?

Also, you missed the question below about how to avoid the creation of the block.
Are you OK with changing that?

Thanks,
Tamar

> Or for now disable early-break for inductions that are not the main exit control
> IV (in vect_can_advance_ivs_p)?
> 
> > > > >
> > > > > It seems your change handles different kinds of inductions differently.
> > > > > Specifically
> > > > >
> > > > >       bool ivtemp = gimple_cond_lhs (cond) == iv_var;
> > > > >       if (restart_loop && ivtemp)
> > > > >         {
> > > > >           type = TREE_TYPE (gimple_phi_result (phi));
> > > > >           ni = build_int_cst (type, vf);
> > > > >           if (inversed_iv)
> > > > >             ni = fold_build2 (MINUS_EXPR, type, ni,
> > > > >                               fold_convert (type, step_expr));
> > > > >         }
> > > > >
> > > > > it looks like for the exit test IV we use either 'VF' or 'VF - step'
> > > > > as the new value.  That seems to be very odd special casing for
> > > > > unknown reasons.  And while you adjust vec_step_op_add, you
> > > > > don't adjust vect_peel_nonlinear_iv_init (maybe not supported -
> > > > > better assert
> > > here).
> > > >
> > > > The VF case is for a normal "non-inverted" loop, where if you take
> > > > an early exit you know that you have to do at most VF iterations.
> > > > The VF
> > > > - step is to account for the inverted loop control flow where you
> > > > exit after adjusting the IV already by + step.
> > >
> > > But doesn't that assume the IV counts from niter to zero?  I don't
> > > see this special case is actually necessary, no?
> > >
> >
> > I needed it because otherwise the scalar loop iterates one iteration
> > too little So I got a miscompile with the inverter loop stuff.  I'll
> > look at it again perhaps It can be solved differently.
> >
> > > >
> > > > Peeling doesn't matter here, since you know you were able to do a
> > > > vector iteration so it's safe to do VF iterations.  So having
> > > > peeled doesn't affect the remaining iters count.
> > > >
> > > > >
> > > > > Also the vec_step_op_add case will keep the original scalar IV
> > > > > live even when it is a vectorized induction.  The code
> > > > > recomputing the value from scratch avoids this.
> > > > >
> > > > >       /* For non-main exit create an intermediat edge to get any updated
> iv
> > > > >          calculations.  */
> > > > >       if (needs_interm_block
> > > > >           && !iv_block
> > > > >           && (!gimple_seq_empty_p (stmts) || !gimple_seq_empty_p
> > > > > (new_stmts)))
> > > > >         {
> > > > >           iv_block = split_edge (update_e);
> > > > >           update_e = single_succ_edge (update_e->dest);
> > > > >           last_gsi = gsi_last_bb (iv_block);
> > > > >         }
> > > > >
> > > > > this is also odd, can we adjust the API instead?  I suppose this
> > > > > is because your computation uses the original loop IV, if you
> > > > > based the computation off the initial value only this might not be
> necessary?
> > > >
> > > > No, on the main exit the code updates the value in the loop header
> > > > and puts the Calculation in the merge block.  This works because
> > > > it only needs to consume PHI nodes in the merge block and things
> > > > like niters are
> > > adjusted in the guard block.
> > > >
> > > > For an early exit, we don't have a guard block, only the merge block.
> > > > We have to update the PHI nodes in that block,  but can't do so
> > > > since you can't produce a value and consume it in a PHI node in the same
> BB.
> > > > So we need to create the block to put the values in for use in the
> > > > merge block.  Because there's no "guard" block for early exits.
> > >
> > > ?  then compute niters in that block as well.
> >
> > We can't since it'll not be reachable through the right edge.  What we
> > can do if you want is slightly change peeling, we currently peel as:
> >
> >   \        \             /
> >   E1     E2        Normal exit
> >     \       |          |
> >        \    |          Guard
> >           \ |          |
> >          Merge block
> >                   |
> >              Pre Header
> >
> > If we instead peel as:
> >
> >
> >   \        \             /
> >   E1     E2        Normal exit
> >     \       |          |
> >        Exit join   Guard
> >           \ |          |
> >          Merge block
> >                   |
> >              Pre Header
> >
> > We can use the exit join block.  This would also mean
> > vect_update_ivs_after_vectorizer Doesn't need to iterate over all
> > exits and only really needs to adjust the phi nodes Coming out of the exit join
> and guard block.
> >
> > Does this work for you?
> >
> > Thanks,
> > Tamar
> > >
> > > > The API can be adjusted by always creating the empty block either
> > > > during
> > > peeling.
> > > > That would prevent us from having to do anything special here.
> > > > Would that work better?  Or I can do it in the loop that iterates
> > > > over the exits to before the call to
> > > > vect_update_ivs_after_vectorizer, which I think
> > > might be more consistent.
> > > >
> > > > >
> > > > > That said, I wonder why we cannot simply pass in an adjusted
> > > > > niter which would be niters_vector_mult_vf - vf and be done with that?
> > > > >
> > > >
> > > > We can ofcourse not have this and recompute it from niters itself,
> > > > however this does affect the epilog code layout. Particularly
> > > > knowing the static number if iterations left causes it to usually
> > > > unroll the loop and share some of the computations.  i.e. the
> > > > scalar code is often more
> > > efficient.
> > > >
> > > > The computation would be niters_vector_mult_vf - iters_done * vf,
> > > > since the value put Here is the remaining iteration count.  It's
> > > > static for early
> > > exits.
> > >
> > > Well, it might be "static" in that it doesn't really matter what you
> > > use for the epilog main IV initial value as long as you are sure
> > > you're not going to take that exit as you are sure we're going to
> > > take one of the early exits.  So yeah, the special code is probably
> > > OK, but it needs a better comment and as said the structure of
> vect_update_ivs_after_vectorizer is a bit hard to follow now.
> > >
> > > As said an important part for optimization is to not keep the scalar
> > > IVs live in the vector loop.
> > >
> > > > But can do whatever you prefer here.  Let me know what you prefer
> > > > for the
> > > above.
> > > >
> > > > Thanks,
> > > > Tamar
> > > >
> > > > > Thanks,
> > > > > Richard.
> > > > >
> > > > >
> > > > > > Regards,
> > > > > > Tamar
> > > > > > >
> > > > > > > > It has to do this since you have to perform the side
> > > > > > > > effects for the non-matching elements still.
> > > > > > > >
> > > > > > > > Regards,
> > > > > > > > Tamar
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > +	      if (STMT_VINFO_LIVE_P (phi_info))
> > > > > > > > > > +		continue;
> > > > > > > > > > +
> > > > > > > > > > +	      /* For early break the final loop IV is:
> > > > > > > > > > +		 init + (final - init) * vf which takes into account
> peeling
> > > > > > > > > > +		 values and non-single steps.  The main exit
> can
> > > > > > > > > > +use
> > > > > niters
> > > > > > > > > > +		 since if you exit from the main exit you've
> done
> > > > > > > > > > +all
> > > > > vector
> > > > > > > > > > +		 iterations.  For an early exit we don't know
> when
> > > > > > > > > > +we
> > > > > exit
> > > > > > > > > > +so
> > > > > > > > > we
> > > > > > > > > > +		 must re-calculate this on the exit.  */
> > > > > > > > > > +	      tree start_expr = gimple_phi_result (phi);
> > > > > > > > > > +	      off = fold_build2 (MINUS_EXPR, stype,
> > > > > > > > > > +				 fold_convert (stype,
> start_expr),
> > > > > > > > > > +				 fold_convert (stype,
> init_expr));
> > > > > > > > > > +	      /* Now adjust for VF to get the final iteration value.
> */
> > > > > > > > > > +	      off = fold_build2 (MULT_EXPR, stype, off,
> > > > > > > > > > +				 build_int_cst (stype, vf));
> > > > > > > > > > +	    }
> > > > > > > > > > +	  else
> > > > > > > > > > +	    off = fold_build2 (MULT_EXPR, stype,
> > > > > > > > > > +			       fold_convert (stype, niters),
> step_expr);
> > > > > > > > > > +
> > > > > > > > > >  	  if (POINTER_TYPE_P (type))
> > > > > > > > > >  	    ni = fold_build_pointer_plus (init_expr, off);
> > > > > > > > > >  	  else
> > > > > > > > > > @@ -2238,6 +2286,8 @@ vect_update_ivs_after_vectorizer
> > > > > > > > > > (loop_vec_info
> > > > > > > > > loop_vinfo,
> > > > > > > > > >        /* Don't bother call vect_peel_nonlinear_iv_init.  */
> > > > > > > > > >        else if (induction_type == vect_step_op_neg)
> > > > > > > > > >  	ni = init_expr;
> > > > > > > > > > +      else if (restart_loop)
> > > > > > > > > > +	continue;
> > > > > > > > >
> > > > > > > > > This looks all a bit complicated - why wouldn't we
> > > > > > > > > simply always use the PHI result when 'restart_loop'?
> > > > > > > > > Isn't that the correct old start value in
> > > > > > > all cases?
> > > > > > > > >
> > > > > > > > > >        else
> > > > > > > > > >  	ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
> > > > > > > > > >  					  niters, step_expr, @@ -
> > > 2245,9 +2295,20 @@
> > > > > > > > > > vect_update_ivs_after_vectorizer
> > > > > > > > > (loop_vec_info
> > > > > > > > > > loop_vinfo,
> > > > > > > > > >
> > > > > > > > > >        var = create_tmp_var (type, "tmp");
> > > > > > > > > >
> > > > > > > > > > -      last_gsi = gsi_last_bb (exit_bb);
> > > > > > > > > >        gimple_seq new_stmts = NULL;
> > > > > > > > > >        ni_name = force_gimple_operand (ni, &new_stmts,
> > > > > > > > > > false, var);
> > > > > > > > > > +
> > > > > > > > > > +      /* For non-main exit create an intermediat edge
> > > > > > > > > > + to get any
> > > > > updated iv
> > > > > > > > > > +	 calculations.  */
> > > > > > > > > > +      if (needs_interm_block
> > > > > > > > > > +	  && !iv_block
> > > > > > > > > > +	  && (!gimple_seq_empty_p (stmts) ||
> > > > > > > > > > +!gimple_seq_empty_p
> > > > > > > > > (new_stmts)))
> > > > > > > > > > +	{
> > > > > > > > > > +	  iv_block = split_edge (update_e);
> > > > > > > > > > +	  update_e = single_succ_edge (update_e->dest);
> > > > > > > > > > +	  last_gsi = gsi_last_bb (iv_block);
> > > > > > > > > > +	}
> > > > > > > > > > +
> > > > > > > > > >        /* Exit_bb shouldn't be empty.  */
> > > > > > > > > >        if (!gsi_end_p (last_gsi))
> > > > > > > > > >  	{
> > > > > > > > > > @@ -3342,8 +3403,26 @@ vect_do_peeling (loop_vec_info
> > > > > > > > > > loop_vinfo, tree
> > > > > > > > > niters, tree nitersm1,
> > > > > > > > > >  	 niters_vector_mult_vf steps.  */
> > > > > > > > > >        gcc_checking_assert (vect_can_advance_ivs_p
> (loop_vinfo));
> > > > > > > > > >        update_e = skip_vector ? e : loop_preheader_edge (epilog);
> > > > > > > > > > -      vect_update_ivs_after_vectorizer (loop_vinfo,
> > > > > niters_vector_mult_vf,
> > > > > > > > > > -					update_e);
> > > > > > > > > > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > > > > > > > > > +	update_e = single_succ_edge (e->dest);
> > > > > > > > > > +      bool inversed_iv
> > > > > > > > > > +	= !vect_is_loop_exit_latch_pred
> (LOOP_VINFO_IV_EXIT
> > > > > (loop_vinfo),
> > > > > > > > > > +					 LOOP_VINFO_LOOP
> > > > > (loop_vinfo));
> > > > > > > > >
> > > > > > > > > You are computing this here and in
> > > vect_update_ivs_after_vectorizer?
> > > > > > > > >
> > > > > > > > > > +
> > > > > > > > > > +      /* Update the main exit first.  */
> > > > > > > > > > +      vect_update_ivs_after_vectorizer (loop_vinfo,
> > > > > > > > > > + vf,
> > > > > > > niters_vector_mult_vf,
> > > > > > > > > > +					update_e,
> inversed_iv);
> > > > > > > > > > +
> > > > > > > > > > +      /* And then update the early exits.  */
> > > > > > > > > > +      for (auto exit : get_loop_exit_edges (loop))
> > > > > > > > > > +	{
> > > > > > > > > > +	  if (exit == LOOP_VINFO_IV_EXIT (loop_vinfo))
> > > > > > > > > > +	    continue;
> > > > > > > > > > +
> > > > > > > > > > +	  vect_update_ivs_after_vectorizer (loop_vinfo, vf,
> > > > > > > > > > +
> niters_vector_mult_vf,
> > > > > > > > > > +					    exit, true);
> > > > > > > > >
> > > > > > > > > ... why does the same not work here?  Wouldn't the
> > > > > > > > > proper condition be !dominated_by_p (CDI_DOMINATORS,
> > > > > > > > > exit->src, LOOP_VINFO_IV_EXIT
> > > > > > > > > (loop_vinfo)->src) or similar?  That is, whether the
> > > > > > > > > exit is at or after the main IV exit?  (consider having
> > > > > > > > > two)
> > > > > > > > >
> > > > > > > > > > +	}
> > > > > > > > > >
> > > > > > > > > >        if (skip_epilog)
> > > > > > > > > >  	{
> > > > > > > > > >
> > > > > > > >
> > > > > > >
> > > > > > > --
> > > > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions
> > > > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809,
> > > > > > > AG
> > > > > > > Nuernberg)
> > > > > >
> > > > >
> > > > > --
> > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions
> > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > > > Nuernberg)
> > > >
> > >
> > > --
> > > Richard Biener <rguenther@suse.de>
> > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > Nuernberg, Germany;
> > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > Nuernberg)
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> Nuernberg)
Richard Biener Nov. 16, 2023, 1:35 p.m. UTC | #12
On Thu, 16 Nov 2023, Tamar Christina wrote:

> > > > > > >
> > > > > > > Perhaps I'm missing something here?
> > > > > >
> > > > > > OK, so I refreshed my mind of what
> > > > > > vect_update_ivs_after_vectorizer
> > > > does.
> > > > > >
> > > > > > I still do not understand the (complexity of the) patch.
> > > > > > Basically the function computes the new value of the IV "from
> > > > > > scratch" based on the number of scalar iterations of the vector loop,
> > the 'niter'
> > > > > > argument.  I would have expected that for the early exits we
> > > > > > either pass in a different 'niter' or alternatively a 'niter_adjustment'.
> > > > >
> > > > > But for an early exit there's no static value for adjusted niter,
> > > > > since you don't know which iteration you exited from. Unlike the
> > > > > normal exit when you know if you get there you've done all
> > > > > possible
> > > > iterations.
> > > > >
> > > > > So you must compute the scalar iteration count on the exit itself.
> > > >
> > > > ?  You do not need the actual scalar iteration you exited (you don't
> > > > compute that either), you need the scalar iteration the vector
> > > > iteration started with when it exited prematurely and that's readily
> > available?
> > >
> > > For a normal exit yes, not for an early exit no? niters_vector_mult_vf
> > > is only valid for the main exit.
> > >
> > > There's the unadjusted scalar count, which is what it's using to
> > > adjust it to the final count.  Unless I'm missing something?
> > 
> > Ah, of course - niters_vector_mult_vf is for the countable exit.  For the early
> > exits we can't precompute the scalar iteration value.  But that then means we
> > should compute the appropriate "continuation" as live value of the vectorized
> > IVs even when they were not originally used outside of the loop.  I don't see
> > how we can express this in terms of the scalar IVs in the (not yet) vectorized
> > loop - similar to the reduction case you are going to end up with the wrong
> > values here.
> > 
> > That said, I've for a long time wanted to preserve the original control IV also for
> > the vector code (leaving any "optimization"
> > to IVOPTs there), that would enable us to compute the correct
> > "niters_vector_mult_vf" based on that IV.
> > 
> > So given we cannot use the scalar IVs you have to handle all inductions
> > (besides the main exit control IV) in vectorizable_live_operation I think.
> > 
> 
> That's what I currently do, that's why there was the
> 	      if (STMT_VINFO_LIVE_P (phi_info))
> 		continue;

Yes, but that only works for the inductions marked so.  We'd need to
mark the others as well, but only for the early exits.

> although I don't understand why we use the scalar count,  I suppose the reasoning
> is that we don't really want to keep it around, and referencing it forces it to be kept?

Referencing it will cause the scalar compute to be retained, but since
we do not adjust the scalar compute during vectorization (but expect
it to be dead) the scalar compute will compute the wrong thing (as
shown by the reduction example - I suspect inductions will suffer from
the same problem).

> At the moment it just does `init + (final - init) * vf` which is correct no?

The issue is that 'final' is not computed correctly in the vectorized
loop.  This formula might work for affine evolutions of course.

Extracting the correct value from the vectorized induction would be
the preferred solution.

> Also you missed the question below about how to avoid the creation of the block,
> You ok with changing that?
> 
> Thanks,
> Tamar
> 
> > Or for now disable early-break for inductions that are not the main exit control
> > IV (in vect_can_advance_ivs_p)?
> > 
> > > > > >
> > > > > > It seems your change handles different kinds of inductions differently.
> > > > > > Specifically
> > > > > >
> > > > > >       bool ivtemp = gimple_cond_lhs (cond) == iv_var;
> > > > > >       if (restart_loop && ivtemp)
> > > > > >         {
> > > > > >           type = TREE_TYPE (gimple_phi_result (phi));
> > > > > >           ni = build_int_cst (type, vf);
> > > > > >           if (inversed_iv)
> > > > > >             ni = fold_build2 (MINUS_EXPR, type, ni,
> > > > > >                               fold_convert (type, step_expr));
> > > > > >         }
> > > > > >
> > > > > > it looks like for the exit test IV we use either 'VF' or 'VF - step'
> > > > > > as the new value.  That seems to be very odd special casing for
> > > > > > unknown reasons.  And while you adjust vec_step_op_add, you
> > > > > > don't adjust vect_peel_nonlinear_iv_init (maybe not supported -
> > > > > > better assert
> > > > here).
> > > > >
> > > > > The VF case is for a normal "non-inverted" loop, where if you take
> > > > > an early exit you know that you have to do at most VF iterations.
> > > > > The VF
> > > > > - step is to account for the inverted loop control flow where you
> > > > > exit after adjusting the IV already by + step.
> > > >
> > > > But doesn't that assume the IV counts from niter to zero?  I don't
> > > > see this special case is actually necessary, no?
> > > >
> > >
> > > I needed it because otherwise the scalar loop iterates one iteration
> > > too little So I got a miscompile with the inverter loop stuff.  I'll
> > > look at it again perhaps It can be solved differently.
> > >
> > > > >
> > > > > Peeling doesn't matter here, since you know you were able to do a
> > > > > vector iteration so it's safe to do VF iterations.  So having
> > > > > peeled doesn't affect the remaining iters count.
> > > > >
> > > > > >
> > > > > > Also the vec_step_op_add case will keep the original scalar IV
> > > > > > live even when it is a vectorized induction.  The code
> > > > > > recomputing the value from scratch avoids this.
> > > > > >
> > > > > >       /* For non-main exit create an intermediat edge to get any updated
> > iv
> > > > > >          calculations.  */
> > > > > >       if (needs_interm_block
> > > > > >           && !iv_block
> > > > > >           && (!gimple_seq_empty_p (stmts) || !gimple_seq_empty_p
> > > > > > (new_stmts)))
> > > > > >         {
> > > > > >           iv_block = split_edge (update_e);
> > > > > >           update_e = single_succ_edge (update_e->dest);
> > > > > >           last_gsi = gsi_last_bb (iv_block);
> > > > > >         }
> > > > > >
> > > > > > this is also odd, can we adjust the API instead?  I suppose this
> > > > > > is because your computation uses the original loop IV, if you
> > > > > > based the computation off the initial value only this might not be
> > necessary?
> > > > >
> > > > > No, on the main exit the code updates the value in the loop header
> > > > > and puts the Calculation in the merge block.  This works because
> > > > > it only needs to consume PHI nodes in the merge block and things
> > > > > like niters are
> > > > adjusted in the guard block.
> > > > >
> > > > > For an early exit, we don't have a guard block, only the merge block.
> > > > > We have to update the PHI nodes in that block,  but can't do so
> > > > > since you can't produce a value and consume it in a PHI node in the same
> > BB.
> > > > > So we need to create the block to put the values in for use in the
> > > > > merge block.  Because there's no "guard" block for early exits.
> > > >
> > > > ?  then compute niters in that block as well.
> > >
> > > We can't since it'll not be reachable through the right edge.  What we
> > > can do if you want is slightly change peeling, we currently peel as:
> > >
> > >   \        \             /
> > >   E1     E2        Normal exit
> > >     \       |          |
> > >        \    |          Guard
> > >           \ |          |
> > >          Merge block
> > >                   |
> > >              Pre Header
> > >
> > > If we instead peel as:
> > >
> > >
> > >   \        \             /
> > >   E1     E2        Normal exit
> > >     \       |          |
> > >        Exit join   Guard
> > >           \ |          |
> > >          Merge block
> > >                   |
> > >              Pre Header
> > >
> > > We can use the exit join block.  This would also mean
> > > vect_update_ivs_after_vectorizer Doesn't need to iterate over all
> > > exits and only really needs to adjust the phi nodes Coming out of the exit join
> > and guard block.
> > >
> > > Does this work for you?

Yeah, I think that would work.  But I'd like to sort out the
correctness details of the IV update itself before sorting out
this code placement detail.

Richard.

> > > Thanks,
> > > Tamar
> > > >
> > > > > The API can be adjusted by always creating the empty block either
> > > > > during
> > > > peeling.
> > > > > That would prevent us from having to do anything special here.
> > > > > Would that work better?  Or I can do it in the loop that iterates
> > > > > over the exits to before the call to
> > > > > vect_update_ivs_after_vectorizer, which I think
> > > > might be more consistent.
> > > > >
> > > > > >
> > > > > > That said, I wonder why we cannot simply pass in an adjusted
> > > > > > niter which would be niters_vector_mult_vf - vf and be done with that?
> > > > > >
> > > > >
> > > > > We can ofcourse not have this and recompute it from niters itself,
> > > > > however this does affect the epilog code layout. Particularly
> > > > > knowing the static number if iterations left causes it to usually
> > > > > unroll the loop and share some of the computations.  i.e. the
> > > > > scalar code is often more
> > > > efficient.
> > > > >
> > > > > The computation would be niters_vector_mult_vf - iters_done * vf,
> > > > > since the value put Here is the remaining iteration count.  It's
> > > > > static for early
> > > > exits.
> > > >
> > > > Well, it might be "static" in that it doesn't really matter what you
> > > > use for the epilog main IV initial value as long as you are sure
> > > > you're not going to take that exit as you are sure we're going to
> > > > take one of the early exits.  So yeah, the special code is probably
> > > > OK, but it needs a better comment and as said the structure of
> > vect_update_ivs_after_vectorizer is a bit hard to follow now.
> > > >
> > > > As said an important part for optimization is to not keep the scalar
> > > > IVs live in the vector loop.
> > > >
> > > > > But can do whatever you prefer here.  Let me know what you prefer
> > > > > for the
> > > > above.
> > > > >
> > > > > Thanks,
> > > > > Tamar
> > > > >
> > > > > > Thanks,
> > > > > > Richard.
> > > > > >
> > > > > >
> > > > > > > Regards,
> > > > > > > Tamar
> > > > > > > >
> > > > > > > > > It has to do this since you have to perform the side
> > > > > > > > > effects for the non-matching elements still.
> > > > > > > > >
> > > > > > > > > Regards,
> > > > > > > > > Tamar
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > +	      if (STMT_VINFO_LIVE_P (phi_info))
> > > > > > > > > > > +		continue;
> > > > > > > > > > > +
> > > > > > > > > > > +	      /* For early break the final loop IV is:
> > > > > > > > > > > +		 init + (final - init) * vf which takes into account
> > peeling
> > > > > > > > > > > +		 values and non-single steps.  The main exit
> > can
> > > > > > > > > > > +use
> > > > > > niters
> > > > > > > > > > > +		 since if you exit from the main exit you've
> > done
> > > > > > > > > > > +all
> > > > > > vector
> > > > > > > > > > > +		 iterations.  For an early exit we don't know
> > when
> > > > > > > > > > > +we
> > > > > > exit
> > > > > > > > > > > +so
> > > > > > > > > > we
> > > > > > > > > > > +		 must re-calculate this on the exit.  */
> > > > > > > > > > > +	      tree start_expr = gimple_phi_result (phi);
> > > > > > > > > > > +	      off = fold_build2 (MINUS_EXPR, stype,
> > > > > > > > > > > +				 fold_convert (stype,
> > start_expr),
> > > > > > > > > > > +				 fold_convert (stype,
> > init_expr));
> > > > > > > > > > > +	      /* Now adjust for VF to get the final iteration value.
> > */
> > > > > > > > > > > +	      off = fold_build2 (MULT_EXPR, stype, off,
> > > > > > > > > > > +				 build_int_cst (stype, vf));
> > > > > > > > > > > +	    }
> > > > > > > > > > > +	  else
> > > > > > > > > > > +	    off = fold_build2 (MULT_EXPR, stype,
> > > > > > > > > > > +			       fold_convert (stype, niters),
> > step_expr);
> > > > > > > > > > > +
> > > > > > > > > > >  	  if (POINTER_TYPE_P (type))
> > > > > > > > > > >  	    ni = fold_build_pointer_plus (init_expr, off);
> > > > > > > > > > >  	  else
> > > > > > > > > > > @@ -2238,6 +2286,8 @@ vect_update_ivs_after_vectorizer
> > > > > > > > > > > (loop_vec_info
> > > > > > > > > > loop_vinfo,
> > > > > > > > > > >        /* Don't bother call vect_peel_nonlinear_iv_init.  */
> > > > > > > > > > >        else if (induction_type == vect_step_op_neg)
> > > > > > > > > > >  	ni = init_expr;
> > > > > > > > > > > +      else if (restart_loop)
> > > > > > > > > > > +	continue;
> > > > > > > > > >
> > > > > > > > > > This looks all a bit complicated - why wouldn't we
> > > > > > > > > > simply always use the PHI result when 'restart_loop'?
> > > > > > > > > > Isn't that the correct old start value in
> > > > > > > > all cases?
> > > > > > > > > >
> > > > > > > > > > >        else
> > > > > > > > > > >  	ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
> > > > > > > > > > >  					  niters, step_expr, @@ -
> > > > 2245,9 +2295,20 @@
> > > > > > > > > > > vect_update_ivs_after_vectorizer
> > > > > > > > > > (loop_vec_info
> > > > > > > > > > > loop_vinfo,
> > > > > > > > > > >
> > > > > > > > > > >        var = create_tmp_var (type, "tmp");
> > > > > > > > > > >
> > > > > > > > > > > -      last_gsi = gsi_last_bb (exit_bb);
> > > > > > > > > > >        gimple_seq new_stmts = NULL;
> > > > > > > > > > >        ni_name = force_gimple_operand (ni, &new_stmts,
> > > > > > > > > > > false, var);
> > > > > > > > > > > +
> > > > > > > > > > > +      /* For non-main exit create an intermediat edge
> > > > > > > > > > > + to get any
> > > > > > updated iv
> > > > > > > > > > > +	 calculations.  */
> > > > > > > > > > > +      if (needs_interm_block
> > > > > > > > > > > +	  && !iv_block
> > > > > > > > > > > +	  && (!gimple_seq_empty_p (stmts) ||
> > > > > > > > > > > +!gimple_seq_empty_p
> > > > > > > > > > (new_stmts)))
> > > > > > > > > > > +	{
> > > > > > > > > > > +	  iv_block = split_edge (update_e);
> > > > > > > > > > > +	  update_e = single_succ_edge (update_e->dest);
> > > > > > > > > > > +	  last_gsi = gsi_last_bb (iv_block);
> > > > > > > > > > > +	}
> > > > > > > > > > > +
> > > > > > > > > > >        /* Exit_bb shouldn't be empty.  */
> > > > > > > > > > >        if (!gsi_end_p (last_gsi))
> > > > > > > > > > >  	{
> > > > > > > > > > > @@ -3342,8 +3403,26 @@ vect_do_peeling (loop_vec_info
> > > > > > > > > > > loop_vinfo, tree
> > > > > > > > > > niters, tree nitersm1,
> > > > > > > > > > >  	 niters_vector_mult_vf steps.  */
> > > > > > > > > > >        gcc_checking_assert (vect_can_advance_ivs_p
> > (loop_vinfo));
> > > > > > > > > > >        update_e = skip_vector ? e : loop_preheader_edge (epilog);
> > > > > > > > > > > -      vect_update_ivs_after_vectorizer (loop_vinfo,
> > > > > > niters_vector_mult_vf,
> > > > > > > > > > > -					update_e);
> > > > > > > > > > > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > > > > > > > > > > +	update_e = single_succ_edge (e->dest);
> > > > > > > > > > > +      bool inversed_iv
> > > > > > > > > > > +	= !vect_is_loop_exit_latch_pred
> > (LOOP_VINFO_IV_EXIT
> > > > > > (loop_vinfo),
> > > > > > > > > > > +					 LOOP_VINFO_LOOP
> > > > > > (loop_vinfo));
> > > > > > > > > >
> > > > > > > > > > You are computing this here and in
> > > > vect_update_ivs_after_vectorizer?
> > > > > > > > > >
> > > > > > > > > > > +
> > > > > > > > > > > +      /* Update the main exit first.  */
> > > > > > > > > > > +      vect_update_ivs_after_vectorizer (loop_vinfo,
> > > > > > > > > > > + vf,
> > > > > > > > niters_vector_mult_vf,
> > > > > > > > > > > +					update_e,
> > inversed_iv);
> > > > > > > > > > > +
> > > > > > > > > > > +      /* And then update the early exits.  */
> > > > > > > > > > > +      for (auto exit : get_loop_exit_edges (loop))
> > > > > > > > > > > +	{
> > > > > > > > > > > +	  if (exit == LOOP_VINFO_IV_EXIT (loop_vinfo))
> > > > > > > > > > > +	    continue;
> > > > > > > > > > > +
> > > > > > > > > > > +	  vect_update_ivs_after_vectorizer (loop_vinfo, vf,
> > > > > > > > > > > +
> > niters_vector_mult_vf,
> > > > > > > > > > > +					    exit, true);
> > > > > > > > > >
> > > > > > > > > > ... why does the same not work here?  Wouldn't the
> > > > > > > > > > proper condition be !dominated_by_p (CDI_DOMINATORS,
> > > > > > > > > > exit->src, LOOP_VINFO_IV_EXIT
> > > > > > > > > > (loop_vinfo)->src) or similar?  That is, whether the
> > > > > > > > > > exit is at or after the main IV exit?  (consider having
> > > > > > > > > > two)
> > > > > > > > > >
> > > > > > > > > > > +	}
> > > > > > > > > > >
> > > > > > > > > > >        if (skip_epilog)
> > > > > > > > > > >  	{
> > > > > > > > > > >
> > > > > > > > >
> > > > > > > >
> > > > > > > > --
> > > > > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions
> > > > > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809,
> > > > > > > > AG
> > > > > > > > Nuernberg)
> > > > > > >
> > > > > >
> > > > > > --
> > > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions
> > > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > > > > Nuernberg)
> > > > >
> > > >
> > > > --
> > > > Richard Biener <rguenther@suse.de>
> > > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > > Nuernberg, Germany;
> > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > > Nuernberg)
> > >
> > 
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH,
> > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > Nuernberg)
>
Tamar Christina Nov. 16, 2023, 2:14 p.m. UTC | #13
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Thursday, November 16, 2023 1:36 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> Subject: RE: [PATCH 7/21]middle-end: update IV update code to support early
> breaks and arbitrary exits
> 
> On Thu, 16 Nov 2023, Tamar Christina wrote:
> 
> > > > > > > >
> > > > > > > > Perhaps I'm missing something here?
> > > > > > >
> > > > > > > OK, so I refreshed my mind of what
> > > > > > > vect_update_ivs_after_vectorizer
> > > > > does.
> > > > > > >
> > > > > > > I still do not understand the (complexity of the) patch.
> > > > > > > Basically the function computes the new value of the IV
> > > > > > > "from scratch" based on the number of scalar iterations of
> > > > > > > the vector loop,
> > > the 'niter'
> > > > > > > argument.  I would have expected that for the early exits we
> > > > > > > either pass in a different 'niter' or alternatively a 'niter_adjustment'.
> > > > > >
> > > > > > But for an early exit there's no static value for adjusted
> > > > > > niter, since you don't know which iteration you exited from.
> > > > > > Unlike the normal exit when you know if you get there you've
> > > > > > done all possible
> > > > > iterations.
> > > > > >
> > > > > > So you must compute the scalar iteration count on the exit itself.
> > > > >
> > > > > ?  You do not need the actual scalar iteration you exited (you
> > > > > don't compute that either), you need the scalar iteration the
> > > > > vector iteration started with when it exited prematurely and
> > > > > that's readily
> > > available?
> > > >
> > > > For a normal exit yes, not for an early exit no?
> > > > niters_vector_mult_vf is only valid for the main exit.
> > > >
> > > > There's the unadjusted scalar count, which is what it's using to
> > > > adjust it to the final count.  Unless I'm missing something?
> > >
> > > Ah, of course - niters_vector_mult_vf is for the countable exit.
> > > For the early exits we can't precompute the scalar iteration value.
> > > But that then means we should compute the appropriate "continuation"
> > > as live value of the vectorized IVs even when they were not
> > > originally used outside of the loop.  I don't see how we can express
> > > this in terms of the scalar IVs in the (not yet) vectorized loop -
> > > similar to the reduction case you are going to end up with the wrong values
> here.
> > >
> > > That said, I've for a long time wanted to preserve the original
> > > control IV also for the vector code (leaving any "optimization"
> > > to IVOPTs there), that would enable us to compute the correct
> > > "niters_vector_mult_vf" based on that IV.
> > >
> > > So given we cannot use the scalar IVs you have to handle all
> > > inductions (besides the main exit control IV) in vectorizable_live_operation
> I think.
> > >
> >
> > That's what I currently do, that's why there was the
> > 	      if (STMT_VINFO_LIVE_P (phi_info))
> > 		continue;
> 
> Yes, but that only works for the inductions marked so.  We'd need to mark the
> others as well, but only for the early exits.
> 
> > although I don't understand why we use the scalar count,  I suppose
> > the reasoning is that we don't really want to keep it around, and referencing
> it forces it to be kept?
> 
> Referencing it will cause the scalar compute to be retained, but since we do not
> adjust the scalar compute during vectorization (but expect it to be dead) the
> scalar compute will compute the wrong thing (as shown by the reduction
> example - I suspect inductions will suffer from the same problem).
> 
> > At the moment it just does `init + (final - init) * vf` which is correct no?
> 
> The issue is that 'final' is not computed correctly in the vectorized loop.  This
> formula might work for affine evolutions of course.
> 
> Extracting the correct value from the vectorized induction would be the
> preferred solution.

Ok, so I should be able to just mark IVs as live during process_use if there are
multiple exits, right?  It's just going to be unused on the main exit, since we
use niters there?

Since it's the PHI inside the loop that needs to be marked live, I can't
just do it for a specific exit, no?

If I create a copy of the PHI node during peeling for use in early exits and mark
it live, it won't work, no?

Tamar
> 
> > Also you missed the question below about how to avoid the creation of
> > the block, You ok with changing that?
> >
> > Thanks,
> > Tamar
> >
> > > Or for now disable early-break for inductions that are not the main
> > > exit control IV (in vect_can_advance_ivs_p)?
> > >
> > > > > > >
> > > > > > > It seems your change handles different kinds of inductions
> differently.
> > > > > > > Specifically
> > > > > > >
> > > > > > >       bool ivtemp = gimple_cond_lhs (cond) == iv_var;
> > > > > > >       if (restart_loop && ivtemp)
> > > > > > >         {
> > > > > > >           type = TREE_TYPE (gimple_phi_result (phi));
> > > > > > >           ni = build_int_cst (type, vf);
> > > > > > >           if (inversed_iv)
> > > > > > >             ni = fold_build2 (MINUS_EXPR, type, ni,
> > > > > > >                               fold_convert (type, step_expr));
> > > > > > >         }
> > > > > > >
> > > > > > > it looks like for the exit test IV we use either 'VF' or 'VF - step'
> > > > > > > as the new value.  That seems to be very odd special casing
> > > > > > > for unknown reasons.  And while you adjust vec_step_op_add,
> > > > > > > you don't adjust vect_peel_nonlinear_iv_init (maybe not
> > > > > > > supported - better assert
> > > > > here).
> > > > > >
> > > > > > The VF case is for a normal "non-inverted" loop, where if you
> > > > > > take an early exit you know that you have to do at most VF iterations.
> > > > > > The VF
> > > > > > - step is to account for the inverted loop control flow where
> > > > > > you exit after adjusting the IV already by + step.
> > > > >
> > > > > But doesn't that assume the IV counts from niter to zero?  I
> > > > > don't see this special case is actually necessary, no?
> > > > >
> > > >
> > > > I needed it because otherwise the scalar loop iterates one
> > > > iteration too few, so I got a miscompile with the inverted loop
> > > > stuff.  I'll look at it again; perhaps it can be solved differently.
> > > >
> > > > > >
> > > > > > Peeling doesn't matter here, since you know you were able to
> > > > > > do a vector iteration so it's safe to do VF iterations.  So
> > > > > > having peeled doesn't affect the remaining iters count.
> > > > > >
> > > > > > >
> > > > > > > Also the vec_step_op_add case will keep the original scalar
> > > > > > > IV live even when it is a vectorized induction.  The code
> > > > > > > recomputing the value from scratch avoids this.
> > > > > > >
> > > > > > >       /* For non-main exit create an intermediat edge to get
> > > > > > > any updated
> > > iv
> > > > > > >          calculations.  */
> > > > > > >       if (needs_interm_block
> > > > > > >           && !iv_block
> > > > > > >           && (!gimple_seq_empty_p (stmts) ||
> > > > > > > !gimple_seq_empty_p
> > > > > > > (new_stmts)))
> > > > > > >         {
> > > > > > >           iv_block = split_edge (update_e);
> > > > > > >           update_e = single_succ_edge (update_e->dest);
> > > > > > >           last_gsi = gsi_last_bb (iv_block);
> > > > > > >         }
> > > > > > >
> > > > > > > this is also odd, can we adjust the API instead?  I suppose
> > > > > > > this is because your computation uses the original loop IV,
> > > > > > > if you based the computation off the initial value only this
> > > > > > > might not be
> > > necessary?
> > > > > >
> > > > > > No, on the main exit the code updates the value in the loop
> > > > > > header and puts the Calculation in the merge block.  This
> > > > > > works because it only needs to consume PHI nodes in the merge
> > > > > > block and things like niters are
> > > > > adjusted in the guard block.
> > > > > >
> > > > > > For an early exit, we don't have a guard block, only the merge block.
> > > > > > We have to update the PHI nodes in that block,  but can't do
> > > > > > so since you can't produce a value and consume it in a PHI
> > > > > > node in the same
> > > BB.
> > > > > > So we need to create the block to put the values in for use in
> > > > > > the merge block.  Because there's no "guard" block for early exits.
> > > > >
> > > > > ?  then compute niters in that block as well.
> > > >
> > > > We can't since it'll not be reachable through the right edge.
> > > > What we can do if you want is slightly change peeling, we currently peel
> as:
> > > >
> > > >   \        \             /
> > > >   E1     E2        Normal exit
> > > >     \       |          |
> > > >        \    |          Guard
> > > >           \ |          |
> > > >          Merge block
> > > >                   |
> > > >              Pre Header
> > > >
> > > > If we instead peel as:
> > > >
> > > >
> > > >   \        \             /
> > > >   E1     E2        Normal exit
> > > >     \       |          |
> > > >        Exit join   Guard
> > > >           \ |          |
> > > >          Merge block
> > > >                   |
> > > >              Pre Header
> > > >
> > > > We can use the exit join block.  This would also mean
> > > > vect_update_ivs_after_vectorizer Doesn't need to iterate over all
> > > > exits and only really needs to adjust the phi nodes Coming out of
> > > > the exit join
> > > and guard block.
> > > >
> > > > Does this work for you?
> 
> Yeah, I think that would work.  But I'd like to sort out the correctness details of
> the IV update itself before sorting out this code placement detail.
> 
> Richard.
> 
> > > > Thanks,
> > > > Tamar
> > > > >
> > > > > > The API can be adjusted by always creating the empty block
> > > > > > either during
> > > > > peeling.
> > > > > > That would prevent us from having to do anything special here.
> > > > > > Would that work better?  Or I can do it in the loop that
> > > > > > iterates over the exits to before the call to
> > > > > > vect_update_ivs_after_vectorizer, which I think
> > > > > might be more consistent.
> > > > > >
> > > > > > >
> > > > > > > That said, I wonder why we cannot simply pass in an adjusted
> > > > > > > niter which would be niters_vector_mult_vf - vf and be done with
> that?
> > > > > > >
> > > > > >
> > > > > > We can of course not have this and recompute it from niters
> > > > > > itself, however this does affect the epilog code layout.
> > > > > > Particularly knowing the static number of iterations left
> > > > > > causes it to usually unroll the loop and share some of the
> > > > > > computations.  i.e. the scalar code is often more
> > > > > efficient.
> > > > > >
> > > > > > The computation would be niters_vector_mult_vf - iters_done *
> > > > > > vf, since the value put Here is the remaining iteration count.
> > > > > > It's static for early
> > > > > exits.
> > > > >
> > > > > Well, it might be "static" in that it doesn't really matter what
> > > > > you use for the epilog main IV initial value as long as you are
> > > > > sure you're not going to take that exit as you are sure we're
> > > > > going to take one of the early exits.  So yeah, the special code
> > > > > is probably OK, but it needs a better comment and as said the
> > > > > structure of
> > > vect_update_ivs_after_vectorizer is a bit hard to follow now.
> > > > >
> > > > > As said an important part for optimization is to not keep the
> > > > > scalar IVs live in the vector loop.
> > > > >
> > > > > > But can do whatever you prefer here.  Let me know what you
> > > > > > prefer for the
> > > > > above.
> > > > > >
> > > > > > Thanks,
> > > > > > Tamar
> > > > > >
> > > > > > > Thanks,
> > > > > > > Richard.
> > > > > > >
> > > > > > >
> > > > > > > > Regards,
> > > > > > > > Tamar
> > > > > > > > >
> > > > > > > > > > It has to do this since you have to perform the side
> > > > > > > > > > effects for the non-matching elements still.
> > > > > > > > > >
> > > > > > > > > > Regards,
> > > > > > > > > > Tamar
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > +	      if (STMT_VINFO_LIVE_P (phi_info))
> > > > > > > > > > > > +		continue;
> > > > > > > > > > > > +
> > > > > > > > > > > > +	      /* For early break the final loop IV is:
> > > > > > > > > > > > +		 init + (final - init) * vf which takes into
> > > > > > > > > > > > +account
> > > peeling
> > > > > > > > > > > > +		 values and non-single steps.  The main exit
> > > can
> > > > > > > > > > > > +use
> > > > > > > niters
> > > > > > > > > > > > +		 since if you exit from the main exit you've
> > > done
> > > > > > > > > > > > +all
> > > > > > > vector
> > > > > > > > > > > > +		 iterations.  For an early exit we don't know
> > > when
> > > > > > > > > > > > +we
> > > > > > > exit
> > > > > > > > > > > > +so
> > > > > > > > > > > we
> > > > > > > > > > > > +		 must re-calculate this on the exit.  */
> > > > > > > > > > > > +	      tree start_expr = gimple_phi_result (phi);
> > > > > > > > > > > > +	      off = fold_build2 (MINUS_EXPR, stype,
> > > > > > > > > > > > +				 fold_convert (stype,
> > > start_expr),
> > > > > > > > > > > > +				 fold_convert (stype,
> > > init_expr));
> > > > > > > > > > > > +	      /* Now adjust for VF to get the final iteration value.
> > > */
> > > > > > > > > > > > +	      off = fold_build2 (MULT_EXPR, stype, off,
> > > > > > > > > > > > +				 build_int_cst (stype, vf));
> > > > > > > > > > > > +	    }
> > > > > > > > > > > > +	  else
> > > > > > > > > > > > +	    off = fold_build2 (MULT_EXPR, stype,
> > > > > > > > > > > > +			       fold_convert (stype, niters),
> > > step_expr);
> > > > > > > > > > > > +
> > > > > > > > > > > >  	  if (POINTER_TYPE_P (type))
> > > > > > > > > > > >  	    ni = fold_build_pointer_plus (init_expr, off);
> > > > > > > > > > > >  	  else
> > > > > > > > > > > > @@ -2238,6 +2286,8 @@
> > > > > > > > > > > > vect_update_ivs_after_vectorizer (loop_vec_info
> > > > > > > > > > > loop_vinfo,
> > > > > > > > > > > >        /* Don't bother call vect_peel_nonlinear_iv_init.  */
> > > > > > > > > > > >        else if (induction_type == vect_step_op_neg)
> > > > > > > > > > > >  	ni = init_expr;
> > > > > > > > > > > > +      else if (restart_loop)
> > > > > > > > > > > > +	continue;
> > > > > > > > > > >
> > > > > > > > > > > This looks all a bit complicated - why wouldn't we
> > > > > > > > > > > simply always use the PHI result when 'restart_loop'?
> > > > > > > > > > > Isn't that the correct old start value in
> > > > > > > > > all cases?
> > > > > > > > > > >
> > > > > > > > > > > >        else
> > > > > > > > > > > >  	ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
> > > > > > > > > > > >  					  niters, step_expr,
> @@ -
> > > > > 2245,9 +2295,20 @@
> > > > > > > > > > > > vect_update_ivs_after_vectorizer
> > > > > > > > > > > (loop_vec_info
> > > > > > > > > > > > loop_vinfo,
> > > > > > > > > > > >
> > > > > > > > > > > >        var = create_tmp_var (type, "tmp");
> > > > > > > > > > > >
> > > > > > > > > > > > -      last_gsi = gsi_last_bb (exit_bb);
> > > > > > > > > > > >        gimple_seq new_stmts = NULL;
> > > > > > > > > > > >        ni_name = force_gimple_operand (ni,
> > > > > > > > > > > > &new_stmts, false, var);
> > > > > > > > > > > > +
> > > > > > > > > > > > +      /* For non-main exit create an intermediat
> > > > > > > > > > > > + edge to get any
> > > > > > > updated iv
> > > > > > > > > > > > +	 calculations.  */
> > > > > > > > > > > > +      if (needs_interm_block
> > > > > > > > > > > > +	  && !iv_block
> > > > > > > > > > > > +	  && (!gimple_seq_empty_p (stmts) ||
> > > > > > > > > > > > +!gimple_seq_empty_p
> > > > > > > > > > > (new_stmts)))
> > > > > > > > > > > > +	{
> > > > > > > > > > > > +	  iv_block = split_edge (update_e);
> > > > > > > > > > > > +	  update_e = single_succ_edge (update_e->dest);
> > > > > > > > > > > > +	  last_gsi = gsi_last_bb (iv_block);
> > > > > > > > > > > > +	}
> > > > > > > > > > > > +
> > > > > > > > > > > >        /* Exit_bb shouldn't be empty.  */
> > > > > > > > > > > >        if (!gsi_end_p (last_gsi))
> > > > > > > > > > > >  	{
> > > > > > > > > > > > @@ -3342,8 +3403,26 @@ vect_do_peeling
> > > > > > > > > > > > (loop_vec_info loop_vinfo, tree
> > > > > > > > > > > niters, tree nitersm1,
> > > > > > > > > > > >  	 niters_vector_mult_vf steps.  */
> > > > > > > > > > > >        gcc_checking_assert (vect_can_advance_ivs_p
> > > (loop_vinfo));
> > > > > > > > > > > >        update_e = skip_vector ? e : loop_preheader_edge
> (epilog);
> > > > > > > > > > > > -      vect_update_ivs_after_vectorizer (loop_vinfo,
> > > > > > > niters_vector_mult_vf,
> > > > > > > > > > > > -					update_e);
> > > > > > > > > > > > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > > > > > > > > > > > +	update_e = single_succ_edge (e->dest);
> > > > > > > > > > > > +      bool inversed_iv
> > > > > > > > > > > > +	= !vect_is_loop_exit_latch_pred
> > > (LOOP_VINFO_IV_EXIT
> > > > > > > (loop_vinfo),
> > > > > > > > > > > > +					 LOOP_VINFO_LOOP
> > > > > > > (loop_vinfo));
> > > > > > > > > > >
> > > > > > > > > > > You are computing this here and in
> > > > > vect_update_ivs_after_vectorizer?
> > > > > > > > > > >
> > > > > > > > > > > > +
> > > > > > > > > > > > +      /* Update the main exit first.  */
> > > > > > > > > > > > +      vect_update_ivs_after_vectorizer
> > > > > > > > > > > > + (loop_vinfo, vf,
> > > > > > > > > niters_vector_mult_vf,
> > > > > > > > > > > > +					update_e,
> > > inversed_iv);
> > > > > > > > > > > > +
> > > > > > > > > > > > +      /* And then update the early exits.  */
> > > > > > > > > > > > +      for (auto exit : get_loop_exit_edges (loop))
> > > > > > > > > > > > +	{
> > > > > > > > > > > > +	  if (exit == LOOP_VINFO_IV_EXIT (loop_vinfo))
> > > > > > > > > > > > +	    continue;
> > > > > > > > > > > > +
> > > > > > > > > > > > +	  vect_update_ivs_after_vectorizer (loop_vinfo,
> > > > > > > > > > > > +vf,
> > > > > > > > > > > > +
> > > niters_vector_mult_vf,
> > > > > > > > > > > > +					    exit, true);
> > > > > > > > > > >
> > > > > > > > > > > ... why does the same not work here?  Wouldn't the
> > > > > > > > > > > proper condition be !dominated_by_p (CDI_DOMINATORS,
> > > > > > > > > > > exit->src, LOOP_VINFO_IV_EXIT
> > > > > > > > > > > (loop_vinfo)->src) or similar?  That is, whether the
> > > > > > > > > > > exit is at or after the main IV exit?  (consider
> > > > > > > > > > > having
> > > > > > > > > > > two)
> > > > > > > > > > >
> > > > > > > > > > > > +	}
> > > > > > > > > > > >
> > > > > > > > > > > >        if (skip_epilog)
> > > > > > > > > > > >  	{
> > > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > --
> > > > > > > > > Richard Biener <rguenther@suse.de> SUSE Software
> > > > > > > > > Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > > > > > > > Nuernberg, Germany;
> > > > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB
> > > > > > > > > 36809, AG
> > > > > > > > > Nuernberg)
> > > > > > > >
> > > > > > >
> > > > > > > --
> > > > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions
> > > > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809,
> > > > > > > AG
> > > > > > > Nuernberg)
> > > > > >
> > > > >
> > > > > --
> > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions
> > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > > > Nuernberg)
> > > >
> > >
> > > --
> > > Richard Biener <rguenther@suse.de>
> > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > Nuernberg, Germany;
> > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > Nuernberg)
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> Nuernberg)
Richard Biener Nov. 16, 2023, 2:17 p.m. UTC | #14
On Thu, 16 Nov 2023, Tamar Christina wrote:

> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Thursday, November 16, 2023 1:36 PM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> > Subject: RE: [PATCH 7/21]middle-end: update IV update code to support early
> > breaks and arbitrary exits
> > 
> > On Thu, 16 Nov 2023, Tamar Christina wrote:
> > 
> > > > > > > > >
> > > > > > > > > Perhaps I'm missing something here?
> > > > > > > >
> > > > > > > > OK, so I refreshed my mind of what
> > > > > > > > vect_update_ivs_after_vectorizer
> > > > > > does.
> > > > > > > >
> > > > > > > > I still do not understand the (complexity of the) patch.
> > > > > > > > Basically the function computes the new value of the IV
> > > > > > > > "from scratch" based on the number of scalar iterations of
> > > > > > > > the vector loop,
> > > > the 'niter'
> > > > > > > > argument.  I would have expected that for the early exits we
> > > > > > > > either pass in a different 'niter' or alternatively a 'niter_adjustment'.
> > > > > > >
> > > > > > > But for an early exit there's no static value for adjusted
> > > > > > > niter, since you don't know which iteration you exited from.
> > > > > > > Unlike the normal exit when you know if you get there you've
> > > > > > > done all possible
> > > > > > iterations.
> > > > > > >
> > > > > > > So you must compute the scalar iteration count on the exit itself.
> > > > > >
> > > > > > ?  You do not need the actual scalar iteration you exited (you
> > > > > > don't compute that either), you need the scalar iteration the
> > > > > > vector iteration started with when it exited prematurely and
> > > > > > that's readily
> > > > available?
> > > > >
> > > > > For a normal exit yes, not for an early exit no?
> > > > > niters_vector_mult_vf is only valid for the main exit.
> > > > >
> > > > > There's the unadjusted scalar count, which is what it's using to
> > > > > adjust it to the final count.  Unless I'm missing something?
> > > >
> > > > Ah, of course - niters_vector_mult_vf is for the countable exit.
> > > > For the early exits we can't precompute the scalar iteration value.
> > > > But that then means we should compute the appropriate "continuation"
> > > > as live value of the vectorized IVs even when they were not
> > > > originally used outside of the loop.  I don't see how we can express
> > > > this in terms of the scalar IVs in the (not yet) vectorized loop -
> > > > similar to the reduction case you are going to end up with the wrong values
> > here.
> > > >
> > > > That said, I've for a long time wanted to preserve the original
> > > > control IV also for the vector code (leaving any "optimization"
> > > > to IVOPTs there), that would enable us to compute the correct
> > > > "niters_vector_mult_vf" based on that IV.
> > > >
> > > > So given we cannot use the scalar IVs you have to handle all
> > > > inductions (besides the main exit control IV) in vectorizable_live_operation
> > I think.
> > > >
> > >
> > > That's what I currently do, that's why there was the
> > > 	      if (STMT_VINFO_LIVE_P (phi_info))
> > > 		continue;
> > 
> > Yes, but that only works for the inductions marked so.  We'd need to mark the
> > others as well, but only for the early exits.
> > 
> > > although I don't understand why we use the scalar count,  I suppose
> > > the reasoning is that we don't really want to keep it around, and referencing
> > it forces it to be kept?
> > 
> > Referencing it will cause the scalar compute to be retained, but since we do not
> > adjust the scalar compute during vectorization (but expect it to be dead) the
> > scalar compute will compute the wrong thing (as shown by the reduction
> > example - I suspect inductions will suffer from the same problem).
> > 
> > > At the moment it just does `init + (final - init) * vf` which is correct no?
> > 
> > The issue is that 'final' is not computed correctly in the vectorized loop.  This
> > formula might work for affine evolutions of course.
> > 
> > Extracting the correct value from the vectorized induction would be the
> > preferred solution.
> 
> Ok, so I should be able to just mark IVs as live during process_use if there are
> multiple exits, right?  It's just going to be unused on the main exit, since we
> use niters there?
> 
> Since it's the PHI inside the loop that needs to be marked live, I can't
> just do it for a specific exit, no?
> 
> If I create a copy of the PHI node during peeling for use in early exits and mark
> it live, it won't work, no?

I guess I wouldn't actually mark it STMT_VINFO_LIVE_P but somehow
arrange for vectorizable_live_operation to be called, possibly adding
an edge argument to that as well.

Maybe the thing to do for the moment is to reject vectorization with
early breaks if there's any (non-STMT_VINFO_LIVE_P?) induction or
reduction besides the main counting IV, which you can already
special-case?

Richard.

> Tamar
> > 
> > > Also you missed the question below about how to avoid the creation of
> > > the block, You ok with changing that?
> > >
> > > Thanks,
> > > Tamar
> > >
> > > > Or for now disable early-break for inductions that are not the main
> > > > exit control IV (in vect_can_advance_ivs_p)?
> > > >
> > > > > > > >
> > > > > > > > It seems your change handles different kinds of inductions
> > differently.
> > > > > > > > Specifically
> > > > > > > >
> > > > > > > >       bool ivtemp = gimple_cond_lhs (cond) == iv_var;
> > > > > > > >       if (restart_loop && ivtemp)
> > > > > > > >         {
> > > > > > > >           type = TREE_TYPE (gimple_phi_result (phi));
> > > > > > > >           ni = build_int_cst (type, vf);
> > > > > > > >           if (inversed_iv)
> > > > > > > >             ni = fold_build2 (MINUS_EXPR, type, ni,
> > > > > > > >                               fold_convert (type, step_expr));
> > > > > > > >         }
> > > > > > > >
> > > > > > > > it looks like for the exit test IV we use either 'VF' or 'VF - step'
> > > > > > > > as the new value.  That seems to be very odd special casing
> > > > > > > > for unknown reasons.  And while you adjust vec_step_op_add,
> > > > > > > > you don't adjust vect_peel_nonlinear_iv_init (maybe not
> > > > > > > > supported - better assert
> > > > > > here).
> > > > > > >
> > > > > > > The VF case is for a normal "non-inverted" loop, where if you
> > > > > > > take an early exit you know that you have to do at most VF iterations.
> > > > > > > The VF
> > > > > > > - step is to account for the inverted loop control flow where
> > > > > > > you exit after adjusting the IV already by + step.
> > > > > >
> > > > > > But doesn't that assume the IV counts from niter to zero?  I
> > > > > > don't see this special case is actually necessary, no?
> > > > > >
> > > > >
> > > > > > I needed it because otherwise the scalar loop iterates one
> > > > > > iteration too few, so I got a miscompile with the inverted loop
> > > > > > stuff.  I'll look at it again; perhaps it can be solved differently.
> > > > >
> > > > > > >
> > > > > > > Peeling doesn't matter here, since you know you were able to
> > > > > > > do a vector iteration so it's safe to do VF iterations.  So
> > > > > > > having peeled doesn't affect the remaining iters count.
> > > > > > >
> > > > > > > >
> > > > > > > > Also the vec_step_op_add case will keep the original scalar
> > > > > > > > IV live even when it is a vectorized induction.  The code
> > > > > > > > recomputing the value from scratch avoids this.
> > > > > > > >
> > > > > > > >       /* For non-main exit create an intermediat edge to get
> > > > > > > > any updated
> > > > iv
> > > > > > > >          calculations.  */
> > > > > > > >       if (needs_interm_block
> > > > > > > >           && !iv_block
> > > > > > > >           && (!gimple_seq_empty_p (stmts) ||
> > > > > > > > !gimple_seq_empty_p
> > > > > > > > (new_stmts)))
> > > > > > > >         {
> > > > > > > >           iv_block = split_edge (update_e);
> > > > > > > >           update_e = single_succ_edge (update_e->dest);
> > > > > > > >           last_gsi = gsi_last_bb (iv_block);
> > > > > > > >         }
> > > > > > > >
> > > > > > > > this is also odd, can we adjust the API instead?  I suppose
> > > > > > > > this is because your computation uses the original loop IV,
> > > > > > > > if you based the computation off the initial value only this
> > > > > > > > might not be
> > > > necessary?
> > > > > > >
> > > > > > > No, on the main exit the code updates the value in the loop
> > > > > > > header and puts the Calculation in the merge block.  This
> > > > > > > works because it only needs to consume PHI nodes in the merge
> > > > > > > block and things like niters are
> > > > > > adjusted in the guard block.
> > > > > > >
> > > > > > > For an early exit, we don't have a guard block, only the merge block.
> > > > > > > We have to update the PHI nodes in that block,  but can't do
> > > > > > > so since you can't produce a value and consume it in a PHI
> > > > > > > node in the same
> > > > BB.
> > > > > > > So we need to create the block to put the values in for use in
> > > > > > > the merge block.  Because there's no "guard" block for early exits.
> > > > > >
> > > > > > ?  then compute niters in that block as well.
> > > > >
> > > > > We can't since it'll not be reachable through the right edge.
> > > > > What we can do if you want is slightly change peeling, we currently peel
> > as:
> > > > >
> > > > >   \        \             /
> > > > >   E1     E2        Normal exit
> > > > >     \       |          |
> > > > >        \    |          Guard
> > > > >           \ |          |
> > > > >          Merge block
> > > > >                   |
> > > > >              Pre Header
> > > > >
> > > > > If we instead peel as:
> > > > >
> > > > >
> > > > >   \        \             /
> > > > >   E1     E2        Normal exit
> > > > >     \       |          |
> > > > >        Exit join   Guard
> > > > >           \ |          |
> > > > >          Merge block
> > > > >                   |
> > > > >              Pre Header
> > > > >
> > > > > We can use the exit join block.  This would also mean
> > > > > vect_update_ivs_after_vectorizer Doesn't need to iterate over all
> > > > > exits and only really needs to adjust the phi nodes Coming out of
> > > > > the exit join
> > > > and guard block.
> > > > >
> > > > > Does this work for you?
> > 
> > Yeah, I think that would work.  But I'd like to sort out the correctness details of
> > the IV update itself before sorting out this code placement detail.
> > 
> > Richard.
> > 
> > > > > Thanks,
> > > > > Tamar
> > > > > >
> > > > > > > The API can be adjusted by always creating the empty block
> > > > > > > either during
> > > > > > peeling.
> > > > > > > That would prevent us from having to do anything special here.
> > > > > > > Would that work better?  Or I can do it in the loop that
> > > > > > > iterates over the exits to before the call to
> > > > > > > vect_update_ivs_after_vectorizer, which I think
> > > > > > might be more consistent.
> > > > > > >
> > > > > > > >
> > > > > > > > That said, I wonder why we cannot simply pass in an adjusted
> > > > > > > > niter which would be niters_vector_mult_vf - vf and be done with
> > that?
> > > > > > > >
> > > > > > >
> > > > > > > We can of course not have this and recompute it from niters
> > > > > > > itself, however this does affect the epilog code layout.
> > > > > > > Particularly knowing the static number of iterations left
> > > > > > > causes it to usually unroll the loop and share some of the
> > > > > > > computations.  i.e. the scalar code is often more
> > > > > > efficient.
> > > > > > >
> > > > > > > The computation would be niters_vector_mult_vf - iters_done *
> > > > > > > vf, since the value put Here is the remaining iteration count.
> > > > > > > It's static for early
> > > > > > exits.
> > > > > >
> > > > > > Well, it might be "static" in that it doesn't really matter what
> > > > > > you use for the epilog main IV initial value as long as you are
> > > > > > sure you're not going to take that exit as you are sure we're
> > > > > > going to take one of the early exits.  So yeah, the special code
> > > > > > is probably OK, but it needs a better comment and as said the
> > > > > > structure of
> > > > vect_update_ivs_after_vectorizer is a bit hard to follow now.
> > > > > >
> > > > > > As said an important part for optimization is to not keep the
> > > > > > scalar IVs live in the vector loop.
> > > > > >
> > > > > > > But can do whatever you prefer here.  Let me know what you
> > > > > > > prefer for the
> > > > > > above.
> > > > > > >
> > > > > > > Thanks,
> > > > > > > Tamar
> > > > > > >
> > > > > > > > Thanks,
> > > > > > > > Richard.
> > > > > > > >
> > > > > > > >
> > > > > > > > > Regards,
> > > > > > > > > Tamar
> > > > > > > > > >
> > > > > > > > > > > It has to do this since you have to perform the side
> > > > > > > > > > > effects for the non-matching elements still.
> > > > > > > > > > >
> > > > > > > > > > > Regards,
> > > > > > > > > > > Tamar
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > +	      if (STMT_VINFO_LIVE_P (phi_info))
> > > > > > > > > > > > > +		continue;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +	      /* For early break the final loop IV is:
> > > > > > > > > > > > > +		 init + (final - init) * vf which takes into
> > > > > > > > > > > > > +account
> > > > peeling
> > > > > > > > > > > > > +		 values and non-single steps.  The main exit
> > > > can
> > > > > > > > > > > > > +use
> > > > > > > > niters
> > > > > > > > > > > > > +		 since if you exit from the main exit you've
> > > > done
> > > > > > > > > > > > > +all
> > > > > > > > vector
> > > > > > > > > > > > > +		 iterations.  For an early exit we don't know
> > > > when
> > > > > > > > > > > > > +we
> > > > > > > > exit
> > > > > > > > > > > > > +so
> > > > > > > > > > > > we
> > > > > > > > > > > > > +		 must re-calculate this on the exit.  */
> > > > > > > > > > > > > +	      tree start_expr = gimple_phi_result (phi);
> > > > > > > > > > > > > +	      off = fold_build2 (MINUS_EXPR, stype,
> > > > > > > > > > > > > +				 fold_convert (stype,
> > > > start_expr),
> > > > > > > > > > > > > +				 fold_convert (stype,
> > > > init_expr));
> > > > > > > > > > > > > +	      /* Now adjust for VF to get the final iteration value.
> > > > */
> > > > > > > > > > > > > +	      off = fold_build2 (MULT_EXPR, stype, off,
> > > > > > > > > > > > > +				 build_int_cst (stype, vf));
> > > > > > > > > > > > > +	    }
> > > > > > > > > > > > > +	  else
> > > > > > > > > > > > > +	    off = fold_build2 (MULT_EXPR, stype,
> > > > > > > > > > > > > +			       fold_convert (stype, niters),
> > > > step_expr);
> > > > > > > > > > > > > +
> > > > > > > > > > > > >  	  if (POINTER_TYPE_P (type))
> > > > > > > > > > > > >  	    ni = fold_build_pointer_plus (init_expr, off);
> > > > > > > > > > > > >  	  else
> > > > > > > > > > > > > @@ -2238,6 +2286,8 @@
> > > > > > > > > > > > > vect_update_ivs_after_vectorizer (loop_vec_info
> > > > > > > > > > > > loop_vinfo,
> > > > > > > > > > > > >        /* Don't bother call vect_peel_nonlinear_iv_init.  */
> > > > > > > > > > > > >        else if (induction_type == vect_step_op_neg)
> > > > > > > > > > > > >  	ni = init_expr;
> > > > > > > > > > > > > +      else if (restart_loop)
> > > > > > > > > > > > > +	continue;
> > > > > > > > > > > >
> > > > > > > > > > > > This looks all a bit complicated - why wouldn't we
> > > > > > > > > > > > simply always use the PHI result when 'restart_loop'?
> > > > > > > > > > > > Isn't that the correct old start value in
> > > > > > > > > > all cases?
> > > > > > > > > > > >
> > > > > > > > > > > > >        else
> > > > > > > > > > > > >  	ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
> > > > > > > > > > > > >  					  niters, step_expr,
> > @@ -
> > > > > > 2245,9 +2295,20 @@
> > > > > > > > > > > > > vect_update_ivs_after_vectorizer
> > > > > > > > > > > > (loop_vec_info
> > > > > > > > > > > > > loop_vinfo,
> > > > > > > > > > > > >
> > > > > > > > > > > > >        var = create_tmp_var (type, "tmp");
> > > > > > > > > > > > >
> > > > > > > > > > > > > -      last_gsi = gsi_last_bb (exit_bb);
> > > > > > > > > > > > >        gimple_seq new_stmts = NULL;
> > > > > > > > > > > > >        ni_name = force_gimple_operand (ni,
> > > > > > > > > > > > > &new_stmts, false, var);
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +      /* For non-main exit create an intermediat
> > > > > > > > > > > > > + edge to get any
> > > > > > > > updated iv
> > > > > > > > > > > > > +	 calculations.  */
> > > > > > > > > > > > > +      if (needs_interm_block
> > > > > > > > > > > > > +	  && !iv_block
> > > > > > > > > > > > > +	  && (!gimple_seq_empty_p (stmts) ||
> > > > > > > > > > > > > +!gimple_seq_empty_p
> > > > > > > > > > > > (new_stmts)))
> > > > > > > > > > > > > +	{
> > > > > > > > > > > > > +	  iv_block = split_edge (update_e);
> > > > > > > > > > > > > +	  update_e = single_succ_edge (update_e->dest);
> > > > > > > > > > > > > +	  last_gsi = gsi_last_bb (iv_block);
> > > > > > > > > > > > > +	}
> > > > > > > > > > > > > +
> > > > > > > > > > > > >        /* Exit_bb shouldn't be empty.  */
> > > > > > > > > > > > >        if (!gsi_end_p (last_gsi))
> > > > > > > > > > > > >  	{
> > > > > > > > > > > > > @@ -3342,8 +3403,26 @@ vect_do_peeling
> > > > > > > > > > > > > (loop_vec_info loop_vinfo, tree
> > > > > > > > > > > > niters, tree nitersm1,
> > > > > > > > > > > > >  	 niters_vector_mult_vf steps.  */
> > > > > > > > > > > > >        gcc_checking_assert (vect_can_advance_ivs_p
> > > > (loop_vinfo));
> > > > > > > > > > > > >        update_e = skip_vector ? e : loop_preheader_edge
> > (epilog);
> > > > > > > > > > > > > -      vect_update_ivs_after_vectorizer (loop_vinfo,
> > > > > > > > niters_vector_mult_vf,
> > > > > > > > > > > > > -					update_e);
> > > > > > > > > > > > > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > > > > > > > > > > > > +	update_e = single_succ_edge (e->dest);
> > > > > > > > > > > > > +      bool inversed_iv
> > > > > > > > > > > > > +	= !vect_is_loop_exit_latch_pred
> > > > (LOOP_VINFO_IV_EXIT
> > > > > > > > (loop_vinfo),
> > > > > > > > > > > > > +					 LOOP_VINFO_LOOP
> > > > > > > > (loop_vinfo));
> > > > > > > > > > > >
> > > > > > > > > > > > You are computing this here and in
> > > > > > vect_update_ivs_after_vectorizer?
> > > > > > > > > > > >
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +      /* Update the main exit first.  */
> > > > > > > > > > > > > +      vect_update_ivs_after_vectorizer
> > > > > > > > > > > > > + (loop_vinfo, vf,
> > > > > > > > > > niters_vector_mult_vf,
> > > > > > > > > > > > > +					update_e,
> > > > inversed_iv);
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +      /* And then update the early exits.  */
> > > > > > > > > > > > > +      for (auto exit : get_loop_exit_edges (loop))
> > > > > > > > > > > > > +	{
> > > > > > > > > > > > > +	  if (exit == LOOP_VINFO_IV_EXIT (loop_vinfo))
> > > > > > > > > > > > > +	    continue;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +	  vect_update_ivs_after_vectorizer (loop_vinfo,
> > > > > > > > > > > > > +vf,
> > > > > > > > > > > > > +
> > > > niters_vector_mult_vf,
> > > > > > > > > > > > > +					    exit, true);
> > > > > > > > > > > >
> > > > > > > > > > > > ... why does the same not work here?  Wouldn't the
> > > > > > > > > > > > proper condition be !dominated_by_p (CDI_DOMINATORS,
> > > > > > > > > > > > exit->src, LOOP_VINFO_IV_EXIT
> > > > > > > > > > > > (loop_vinfo)->src) or similar?  That is, whether the
> > > > > > > > > > > > exit is at or after the main IV exit?  (consider
> > > > > > > > > > > > having
> > > > > > > > > > > > two)
> > > > > > > > > > > >
> > > > > > > > > > > > > +	}
> > > > > > > > > > > > >
> > > > > > > > > > > > >        if (skip_epilog)
> > > > > > > > > > > > >  	{
> > > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > --
> > > > > > > > > > Richard Biener <rguenther@suse.de> SUSE Software
> > > > > > > > > > Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > > > > > > > > Nuernberg, Germany;
> > > > > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB
> > > > > > > > > > 36809, AG
> > > > > > > > > > Nuernberg)
> > > > > > > > >
> > > > > > > >
> > > > > > > > --
> > > > > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions
> > > > > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809,
> > > > > > > > AG
> > > > > > > > Nuernberg)
> > > > > > >
> > > > > >
> > > > > > --
> > > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions
> > > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > > > > Nuernberg)
> > > > >
> > > >
> > > > --
> > > > Richard Biener <rguenther@suse.de>
> > > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > > Nuernberg, Germany;
> > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > > Nuernberg)
> > >
> > 
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH,
> > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > Nuernberg)
>
Tamar Christina Nov. 16, 2023, 3:19 p.m. UTC | #15
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Thursday, November 16, 2023 2:18 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> Subject: RE: [PATCH 7/21]middle-end: update IV update code to support early
> breaks and arbitrary exits
> 
> On Thu, 16 Nov 2023, Tamar Christina wrote:
> 
> > > -----Original Message-----
> > > From: Richard Biener <rguenther@suse.de>
> > > Sent: Thursday, November 16, 2023 1:36 PM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>;
> jlaw@ventanamicro.com
> > > Subject: RE: [PATCH 7/21]middle-end: update IV update code to
> > > support early breaks and arbitrary exits
> > >
> > > On Thu, 16 Nov 2023, Tamar Christina wrote:
> > >
> > > > > > > > > >
> > > > > > > > > > Perhaps I'm missing something here?
> > > > > > > > >
> > > > > > > > > OK, so I refreshed my mind of what
> > > > > > > > > vect_update_ivs_after_vectorizer
> > > > > > > does.
> > > > > > > > >
> > > > > > > > > I still do not understand the (complexity of the) patch.
> > > > > > > > > Basically the function computes the new value of the IV
> > > > > > > > > "from scratch" based on the number of scalar iterations
> > > > > > > > > of the vector loop,
> > > > > the 'niter'
> > > > > > > > > argument.  I would have expected that for the early
> > > > > > > > > exits we either pass in a different 'niter' or alternatively a
> 'niter_adjustment'.
> > > > > > > >
> > > > > > > > But for an early exit there's no static value for adjusted
> > > > > > > > niter, since you don't know which iteration you exited from.
> > > > > > > > Unlike the normal exit when you know if you get there
> > > > > > > > you've done all possible
> > > > > > > iterations.
> > > > > > > >
> > > > > > > > So you must compute the scalar iteration count on the exit itself.
> > > > > > >
> > > > > > > ?  You do not need the actual scalar iteration you exited
> > > > > > > (you don't compute that either), you need the scalar
> > > > > > > iteration the vector iteration started with when it exited
> > > > > > > prematurely and that's readily
> > > > > available?
> > > > > >
> > > > > > For a normal exit yes, not for an early exit no?
> > > > > > niters_vector_mult_vf is only valid for the main exit.
> > > > > >
> > > > > > There's the unadjusted scalar count, which is what it's using
> > > > > > to adjust it to the final count.  Unless I'm missing something?
> > > > >
> > > > > Ah, of course - niters_vector_mult_vf is for the countable exit.
> > > > > For the early exits we can't precompute the scalar iteration value.
> > > > > But that then means we should compute the appropriate
> "continuation"
> > > > > as live value of the vectorized IVs even when they were not
> > > > > originally used outside of the loop.  I don't see how we can
> > > > > express this in terms of the scalar IVs in the (not yet)
> > > > > vectorized loop - similar to the reduction case you are going to
> > > > > end up with the wrong values
> > > here.
> > > > >
> > > > > That said, I've for a long time wanted to preserve the original
> > > > > control IV also for the vector code (leaving any "optimization"
> > > > > to IVOPTs there), that would enable us to compute the correct
> > > > > "niters_vector_mult_vf" based on that IV.
> > > > >
> > > > > So given we cannot use the scalar IVs you have to handle all
> > > > > inductions (besides the main exit control IV) in
> > > > > vectorizable_live_operation
> > > I think.
> > > > >
> > > >
> > > > That's what I currently do, that's why there was the
> > > > 	      if (STMT_VINFO_LIVE_P (phi_info))
> > > > 		continue;
> > >
> > > Yes, but that only works for the inductions marked so.  We'd need to
> > > mark the others as well, but only for the early exits.
> > >
> > > > although I don't understand why we use the scalar count,  I
> > > > suppose the reasoning is that we don't really want to keep it
> > > > around, and referencing
> > > it forces it to be kept?
> > >
> > > Referencing it will cause the scalar compute to be retained, but
> > > since we do not adjust the scalar compute during vectorization (but
> > > expect it to be dead) the scalar compute will compute the wrong
> > > thing (as shown by the reduction example - I suspect inductions will suffer
> from the same problem).
> > >
> > > > At the moment it just does `init + (final - init) * vf` which is correct no?
> > >
> > > The issue is that 'final' is not computed correctly in the
> > > vectorized loop.  This formula might work for affine evolutions of course.
> > >
> > > Extracting the correct value from the vectorized induction would be
> > > the preferred solution.
> >
> > Ok, so I should be able to just mark IVs as live during process_use if
> > there are multiple exits right? Since it's just gonna be unused on the
> > main exit since we use niters?
> >
> > Since it's the PHI inside the loop that needs to be marked
> > live, I can't just do it for specific exits, no?
> >
> > If I create a copy of the PHI node during peeling for use in early
> > exits and mark it live it won't work no?
> 
> I guess I wouldn't actually mark it STMT_VINFO_LIVE_P but somehow arrange
> vectorizable_live_operation to be called, possibly adding a edge argument to
> that as well.
> 
> Maybe the thing to do for the moment is to reject vectorization with early
> breaks if there's any (non-STMT_VINFO_LIVE_P?) induction or reduction
> besides the main counting IV one you can already special-case?

Ok so I did a quick hack with:

      if (!virtual_operand_p (PHI_RESULT (phi))
	  && !STMT_VINFO_LIVE_P (phi_info))
	{
	  use_operand_p use_p;
	  imm_use_iterator imm_iter;
	  bool non_exit_use = false;
	  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, PHI_RESULT (phi))
	    if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
	      for (auto exit : get_loop_exit_edges (loop))
		{
		  if (exit == LOOP_VINFO_IV_EXIT (loop_vinfo))
		    continue;

		  if (gimple_bb (USE_STMT (use_p)) != exit->dest)
		    {
		      non_exit_use = true;
		      goto fail;
		    }  
		}
fail:
	  if (non_exit_use)
	    return false;
	}

And it does seem to still allow all the cases I want.  I've placed this in vect_can_advance_ivs_p.

Does this cover what you meant?

Thanks,
Tamar

> 
> Richard.
> 
> > Tamar
> > >
> > > > Also, you missed the question below about how to avoid the creation
> > > > of the block.  Are you OK with changing that?
> > > >
> > > > Thanks,
> > > > Tamar
> > > >
> > > > > Or for now disable early-break for inductions that are not the
> > > > > main exit control IV (in vect_can_advance_ivs_p)?
> > > > >
> > > > > > > > >
> > > > > > > > > It seems your change handles different kinds of
> > > > > > > > > inductions
> > > differently.
> > > > > > > > > Specifically
> > > > > > > > >
> > > > > > > > >       bool ivtemp = gimple_cond_lhs (cond) == iv_var;
> > > > > > > > >       if (restart_loop && ivtemp)
> > > > > > > > >         {
> > > > > > > > >           type = TREE_TYPE (gimple_phi_result (phi));
> > > > > > > > >           ni = build_int_cst (type, vf);
> > > > > > > > >           if (inversed_iv)
> > > > > > > > >             ni = fold_build2 (MINUS_EXPR, type, ni,
> > > > > > > > >                               fold_convert (type, step_expr));
> > > > > > > > >         }
> > > > > > > > >
> > > > > > > > > it looks like for the exit test IV we use either 'VF' or 'VF - step'
> > > > > > > > > as the new value.  That seems to be very odd special
> > > > > > > > > casing for unknown reasons.  And while you adjust
> > > > > > > > > vec_step_op_add, you don't adjust
> > > > > > > > > vect_peel_nonlinear_iv_init (maybe not supported -
> > > > > > > > > better assert
> > > > > > > here).
> > > > > > > >
> > > > > > > > The VF case is for a normal "non-inverted" loop, where if
> > > > > > > > you take an early exit you know that you have to do at most VF
> iterations.
> > > > > > > > The VF
> > > > > > > > - step is to account for the inverted loop control flow
> > > > > > > > where you exit after adjusting the IV already by + step.
> > > > > > >
> > > > > > > But doesn't that assume the IV counts from niter to zero?  I
> > > > > > > don't see this special case is actually necessary, no?
> > > > > > >
> > > > > >
> > > > > > I needed it because otherwise the scalar loop iterates one
> > > > > > iteration too few, so I got a miscompile with the inverted
> > > > > > loop stuff.  I'll look at it again; perhaps it can be solved differently.
> > > > > >
> > > > > > > >
> > > > > > > > Peeling doesn't matter here, since you know you were able
> > > > > > > > to do a vector iteration so it's safe to do VF iterations.
> > > > > > > > So having peeled doesn't affect the remaining iters count.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > Also the vec_step_op_add case will keep the original
> > > > > > > > > scalar IV live even when it is a vectorized induction.
> > > > > > > > > The code recomputing the value from scratch avoids this.
> > > > > > > > >
> > > > > > > > >       /* For non-main exit create an intermediat edge to
> > > > > > > > > get any updated
> > > > > iv
> > > > > > > > >          calculations.  */
> > > > > > > > >       if (needs_interm_block
> > > > > > > > >           && !iv_block
> > > > > > > > >           && (!gimple_seq_empty_p (stmts) ||
> > > > > > > > > !gimple_seq_empty_p
> > > > > > > > > (new_stmts)))
> > > > > > > > >         {
> > > > > > > > >           iv_block = split_edge (update_e);
> > > > > > > > >           update_e = single_succ_edge (update_e->dest);
> > > > > > > > >           last_gsi = gsi_last_bb (iv_block);
> > > > > > > > >         }
> > > > > > > > >
> > > > > > > > > this is also odd, can we adjust the API instead?  I
> > > > > > > > > suppose this is because your computation uses the
> > > > > > > > > original loop IV, if you based the computation off the
> > > > > > > > > initial value only this might not be
> > > > > necessary?
> > > > > > > >
> > > > > > > > No, on the main exit the code updates the value in the
> > > > > > > > loop header and puts the Calculation in the merge block.
> > > > > > > > This works because it only needs to consume PHI nodes in
> > > > > > > > the merge block and things like niters are
> > > > > > > adjusted in the guard block.
> > > > > > > >
> > > > > > > > For an early exit, we don't have a guard block, only the merge
> block.
> > > > > > > > We have to update the PHI nodes in that block,  but can't
> > > > > > > > do so since you can't produce a value and consume it in a
> > > > > > > > PHI node in the same
> > > > > BB.
> > > > > > > > So we need to create the block to put the values in for
> > > > > > > > use in the merge block.  Because there's no "guard" block for early
> exits.
> > > > > > >
> > > > > > > ?  then compute niters in that block as well.
> > > > > >
> > > > > > We can't since it'll not be reachable through the right edge.
> > > > > > What we can do if you want is slightly change peeling, we
> > > > > > currently peel
> > > as:
> > > > > >
> > > > > >   \        \             /
> > > > > >   E1     E2        Normal exit
> > > > > >     \       |          |
> > > > > >        \    |          Guard
> > > > > >           \ |          |
> > > > > >          Merge block
> > > > > >                   |
> > > > > >              Pre Header
> > > > > >
> > > > > > If we instead peel as:
> > > > > >
> > > > > >
> > > > > >   \        \             /
> > > > > >   E1     E2        Normal exit
> > > > > >     \       |          |
> > > > > >        Exit join   Guard
> > > > > >           \ |          |
> > > > > >          Merge block
> > > > > >                   |
> > > > > >              Pre Header
> > > > > >
> > > > > > We can use the exit join block.  This would also mean
> > > > > > vect_update_ivs_after_vectorizer Doesn't need to iterate over
> > > > > > all exits and only really needs to adjust the phi nodes Coming
> > > > > > out of the exit join
> > > > > and guard block.
> > > > > >
> > > > > > Does this work for you?
> > >
> > > Yeah, I think that would work.  But I'd like to sort out the
> > > correctness details of the IV update itself before sorting out this code
> placement detail.
> > >
> > > Richard.
> > >
> > > > > > Thanks,
> > > > > > Tamar
> > > > > > >
> > > > > > > > The API can be adjusted by always creating the empty block
> > > > > > > > either during
> > > > > > > peeling.
> > > > > > > > That would prevent us from having to do anything special here.
> > > > > > > > Would that work better?  Or I can do it in the loop that
> > > > > > > > iterates over the exits to before the call to
> > > > > > > > vect_update_ivs_after_vectorizer, which I think
> > > > > > > might be more consistent.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > That said, I wonder why we cannot simply pass in an
> > > > > > > > > adjusted niter which would be niters_vector_mult_vf - vf
> > > > > > > > > and be done with
> > > that?
> > > > > > > > >
> > > > > > > >
> > > > > > > > We can of course not have this and recompute it from niters
> > > > > > > > itself, however this does affect the epilog code layout.
> > > > > > > > Particularly knowing the static number of iterations left
> > > > > > > > causes it to usually unroll the loop and share some of the
> > > > > > > > computations.  i.e. the scalar code is often more
> > > > > > > efficient.
> > > > > > > >
> > > > > > > > The computation would be niters_vector_mult_vf -
> > > > > > > > iters_done * vf, since the value put Here is the remaining iteration
> count.
> > > > > > > > It's static for early
> > > > > > > exits.
> > > > > > >
> > > > > > > Well, it might be "static" in that it doesn't really matter
> > > > > > > what you use for the epilog main IV initial value as long as
> > > > > > > you are sure you're not going to take that exit as you are
> > > > > > > sure we're going to take one of the early exits.  So yeah,
> > > > > > > the special code is probably OK, but it needs a better
> > > > > > > comment and as said the structure of
> > > > > vect_update_ivs_after_vectorizer is a bit hard to follow now.
> > > > > > >
> > > > > > > As said an important part for optimization is to not keep
> > > > > > > the scalar IVs live in the vector loop.
> > > > > > >
> > > > > > > > But can do whatever you prefer here.  Let me know what you
> > > > > > > > prefer for the
> > > > > > > above.
> > > > > > > >
> > > > > > > > Thanks,
> > > > > > > > Tamar
> > > > > > > >
> > > > > > > > > Thanks,
> > > > > > > > > Richard.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > Regards,
> > > > > > > > > > Tamar
> > > > > > > > > > >
> > > > > > > > > > > > It has to do this since you have to perform the
> > > > > > > > > > > > side effects for the non-matching elements still.
> > > > > > > > > > > >
> > > > > > > > > > > > Regards,
> > > > > > > > > > > > Tamar
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > +	      if (STMT_VINFO_LIVE_P (phi_info))
> > > > > > > > > > > > > > +		continue;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +	      /* For early break the final loop IV is:
> > > > > > > > > > > > > > +		 init + (final - init) * vf which takes
> > > > > > > > > > > > > > +into account
> > > > > peeling
> > > > > > > > > > > > > > +		 values and non-single steps.  The main
> > > > > > > > > > > > > > +exit
> > > > > can
> > > > > > > > > > > > > > +use
> > > > > > > > > niters
> > > > > > > > > > > > > > +		 since if you exit from the main exit
> > > > > > > > > > > > > > +you've
> > > > > done
> > > > > > > > > > > > > > +all
> > > > > > > > > vector
> > > > > > > > > > > > > > +		 iterations.  For an early exit we don't
> > > > > > > > > > > > > > +know
> > > > > when
> > > > > > > > > > > > > > +we
> > > > > > > > > exit
> > > > > > > > > > > > > > +so
> > > > > > > > > > > > > we
> > > > > > > > > > > > > > +		 must re-calculate this on the exit.  */
> > > > > > > > > > > > > > +	      tree start_expr = gimple_phi_result (phi);
> > > > > > > > > > > > > > +	      off = fold_build2 (MINUS_EXPR, stype,
> > > > > > > > > > > > > > +				 fold_convert (stype,
> > > > > start_expr),
> > > > > > > > > > > > > > +				 fold_convert (stype,
> > > > > init_expr));
> > > > > > > > > > > > > > +	      /* Now adjust for VF to get the final iteration value.
> > > > > */
> > > > > > > > > > > > > > +	      off = fold_build2 (MULT_EXPR, stype, off,
> > > > > > > > > > > > > > +				 build_int_cst (stype, vf));
> > > > > > > > > > > > > > +	    }
> > > > > > > > > > > > > > +	  else
> > > > > > > > > > > > > > +	    off = fold_build2 (MULT_EXPR, stype,
> > > > > > > > > > > > > > +			       fold_convert (stype, niters),
> > > > > step_expr);
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > >  	  if (POINTER_TYPE_P (type))
> > > > > > > > > > > > > >  	    ni = fold_build_pointer_plus (init_expr, off);
> > > > > > > > > > > > > >  	  else
> > > > > > > > > > > > > > @@ -2238,6 +2286,8 @@
> > > > > > > > > > > > > > vect_update_ivs_after_vectorizer
> > > > > > > > > > > > > > (loop_vec_info
> > > > > > > > > > > > > loop_vinfo,
> > > > > > > > > > > > > >        /* Don't bother call vect_peel_nonlinear_iv_init.  */
> > > > > > > > > > > > > >        else if (induction_type == vect_step_op_neg)
> > > > > > > > > > > > > >  	ni = init_expr;
> > > > > > > > > > > > > > +      else if (restart_loop)
> > > > > > > > > > > > > > +	continue;
> > > > > > > > > > > > >
> > > > > > > > > > > > > This looks all a bit complicated - why wouldn't
> > > > > > > > > > > > > we simply always use the PHI result when 'restart_loop'?
> > > > > > > > > > > > > Isn't that the correct old start value in
> > > > > > > > > > > all cases?
> > > > > > > > > > > > >
> > > > > > > > > > > > > >        else
> > > > > > > > > > > > > >  	ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
> > > > > > > > > > > > > >  					  niters, step_expr,
> > > @@ -
> > > > > > > 2245,9 +2295,20 @@
> > > > > > > > > > > > > > vect_update_ivs_after_vectorizer
> > > > > > > > > > > > > (loop_vec_info
> > > > > > > > > > > > > > loop_vinfo,
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >        var = create_tmp_var (type, "tmp");
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > -      last_gsi = gsi_last_bb (exit_bb);
> > > > > > > > > > > > > >        gimple_seq new_stmts = NULL;
> > > > > > > > > > > > > >        ni_name = force_gimple_operand (ni,
> > > > > > > > > > > > > > &new_stmts, false, var);
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +      /* For non-main exit create an
> > > > > > > > > > > > > > + intermediat edge to get any
> > > > > > > > > updated iv
> > > > > > > > > > > > > > +	 calculations.  */
> > > > > > > > > > > > > > +      if (needs_interm_block
> > > > > > > > > > > > > > +	  && !iv_block
> > > > > > > > > > > > > > +	  && (!gimple_seq_empty_p (stmts) ||
> > > > > > > > > > > > > > +!gimple_seq_empty_p
> > > > > > > > > > > > > (new_stmts)))
> > > > > > > > > > > > > > +	{
> > > > > > > > > > > > > > +	  iv_block = split_edge (update_e);
> > > > > > > > > > > > > > +	  update_e = single_succ_edge (update_e->dest);
> > > > > > > > > > > > > > +	  last_gsi = gsi_last_bb (iv_block);
> > > > > > > > > > > > > > +	}
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > >        /* Exit_bb shouldn't be empty.  */
> > > > > > > > > > > > > >        if (!gsi_end_p (last_gsi))
> > > > > > > > > > > > > >  	{
> > > > > > > > > > > > > > @@ -3342,8 +3403,26 @@ vect_do_peeling
> > > > > > > > > > > > > > (loop_vec_info loop_vinfo, tree
> > > > > > > > > > > > > niters, tree nitersm1,
> > > > > > > > > > > > > >  	 niters_vector_mult_vf steps.  */
> > > > > > > > > > > > > >        gcc_checking_assert
> > > > > > > > > > > > > > (vect_can_advance_ivs_p
> > > > > (loop_vinfo));
> > > > > > > > > > > > > >        update_e = skip_vector ? e :
> > > > > > > > > > > > > > loop_preheader_edge
> > > (epilog);
> > > > > > > > > > > > > > -      vect_update_ivs_after_vectorizer (loop_vinfo,
> > > > > > > > > niters_vector_mult_vf,
> > > > > > > > > > > > > > -					update_e);
> > > > > > > > > > > > > > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > > > > > > > > > > > > > +	update_e = single_succ_edge (e->dest);
> > > > > > > > > > > > > > +      bool inversed_iv
> > > > > > > > > > > > > > +	= !vect_is_loop_exit_latch_pred
> > > > > (LOOP_VINFO_IV_EXIT
> > > > > > > > > (loop_vinfo),
> > > > > > > > > > > > > > +					 LOOP_VINFO_LOOP
> > > > > > > > > (loop_vinfo));
> > > > > > > > > > > > >
> > > > > > > > > > > > > You are computing this here and in
> > > > > > > vect_update_ivs_after_vectorizer?
> > > > > > > > > > > > >
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +      /* Update the main exit first.  */
> > > > > > > > > > > > > > +      vect_update_ivs_after_vectorizer
> > > > > > > > > > > > > > + (loop_vinfo, vf,
> > > > > > > > > > > niters_vector_mult_vf,
> > > > > > > > > > > > > > +					update_e,
> > > > > inversed_iv);
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +      /* And then update the early exits.  */
> > > > > > > > > > > > > > +      for (auto exit : get_loop_exit_edges (loop))
> > > > > > > > > > > > > > +	{
> > > > > > > > > > > > > > +	  if (exit == LOOP_VINFO_IV_EXIT (loop_vinfo))
> > > > > > > > > > > > > > +	    continue;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +	  vect_update_ivs_after_vectorizer
> > > > > > > > > > > > > > +(loop_vinfo, vf,
> > > > > > > > > > > > > > +
> > > > > niters_vector_mult_vf,
> > > > > > > > > > > > > > +					    exit, true);
> > > > > > > > > > > > >
> > > > > > > > > > > > > ... why does the same not work here?  Wouldn't
> > > > > > > > > > > > > the proper condition be !dominated_by_p
> > > > > > > > > > > > > (CDI_DOMINATORS,
> > > > > > > > > > > > > exit->src, LOOP_VINFO_IV_EXIT
> > > > > > > > > > > > > (loop_vinfo)->src) or similar?  That is, whether
> > > > > > > > > > > > > the exit is at or after the main IV exit?
> > > > > > > > > > > > > (consider having
> > > > > > > > > > > > > two)
> > > > > > > > > > > > >
> > > > > > > > > > > > > > +	}
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >        if (skip_epilog)
> > > > > > > > > > > > > >  	{
> > > > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > --
> > > > > > > > > > > Richard Biener <rguenther@suse.de> SUSE Software
> > > > > > > > > > > Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > > > > > > > > > Nuernberg, Germany;
> > > > > > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich;
> > > > > > > > > > > (HRB 36809, AG
> > > > > > > > > > > Nuernberg)
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > --
> > > > > > > > > Richard Biener <rguenther@suse.de> SUSE Software
> > > > > > > > > Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > > > > > > > Nuernberg, Germany;
> > > > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB
> > > > > > > > > 36809, AG
> > > > > > > > > Nuernberg)
> > > > > > > >
> > > > > > >
> > > > > > > --
> > > > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions
> > > > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809,
> > > > > > > AG
> > > > > > > Nuernberg)
> > > > > >
> > > > >
> > > > > --
> > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions
> > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > > > Nuernberg)
> > > >
> > >
> > > --
> > > Richard Biener <rguenther@suse.de>
> > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > Nuernberg, Germany;
> > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > Nuernberg)
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> Nuernberg)
Tamar Christina Nov. 16, 2023, 6:41 p.m. UTC | #16
> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: Thursday, November 16, 2023 3:19 PM
> To: Richard Biener <rguenther@suse.de>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> Subject: RE: [PATCH 7/21]middle-end: update IV update code to support early
> breaks and arbitrary exits
> 
> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Thursday, November 16, 2023 2:18 PM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> > Subject: RE: [PATCH 7/21]middle-end: update IV update code to support
> > early breaks and arbitrary exits
> >
> > On Thu, 16 Nov 2023, Tamar Christina wrote:
> >
> > > > -----Original Message-----
> > > > From: Richard Biener <rguenther@suse.de>
> > > > Sent: Thursday, November 16, 2023 1:36 PM
> > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>;
> > jlaw@ventanamicro.com
> > > > Subject: RE: [PATCH 7/21]middle-end: update IV update code to
> > > > support early breaks and arbitrary exits
> > > >
> > > > On Thu, 16 Nov 2023, Tamar Christina wrote:
> > > >
> > > > > > > > > > >
> > > > > > > > > > > Perhaps I'm missing something here?
> > > > > > > > > >
> > > > > > > > > > OK, so I refreshed my mind of what
> > > > > > > > > > vect_update_ivs_after_vectorizer
> > > > > > > > does.
> > > > > > > > > >
> > > > > > > > > > I still do not understand the (complexity of the) patch.
> > > > > > > > > > Basically the function computes the new value of the
> > > > > > > > > > IV "from scratch" based on the number of scalar
> > > > > > > > > > iterations of the vector loop,
> > > > > > the 'niter'
> > > > > > > > > > argument.  I would have expected that for the early
> > > > > > > > > > exits we either pass in a different 'niter' or
> > > > > > > > > > alternatively a
> > 'niter_adjustment'.
> > > > > > > > >
> > > > > > > > > But for an early exit there's no static value for
> > > > > > > > > adjusted niter, since you don't know which iteration you exited
> from.
> > > > > > > > > Unlike the normal exit when you know if you get there
> > > > > > > > > you've done all possible
> > > > > > > > iterations.
> > > > > > > > >
> > > > > > > > > So you must compute the scalar iteration count on the exit itself.
> > > > > > > >
> > > > > > > > ?  You do not need the actual scalar iteration you exited
> > > > > > > > (you don't compute that either), you need the scalar
> > > > > > > > iteration the vector iteration started with when it exited
> > > > > > > > prematurely and that's readily
> > > > > > available?
> > > > > > >
> > > > > > > For a normal exit yes, not for an early exit no?
> > > > > > > niters_vector_mult_vf is only valid for the main exit.
> > > > > > >
> > > > > > > There's the unadjusted scalar count, which is what it's
> > > > > > > using to adjust it to the final count.  Unless I'm missing something?
> > > > > >
> > > > > > Ah, of course - niters_vector_mult_vf is for the countable exit.
> > > > > > For the early exits we can't precompute the scalar iteration value.
> > > > > > But that then means we should compute the appropriate
> > "continuation"
> > > > > > as live value of the vectorized IVs even when they were not
> > > > > > originally used outside of the loop.  I don't see how we can
> > > > > > express this in terms of the scalar IVs in the (not yet)
> > > > > > vectorized loop - similar to the reduction case you are going
> > > > > > to end up with the wrong values
> > > > here.
> > > > > >
> > > > > > That said, I've for a long time wanted to preserve the
> > > > > > original control IV also for the vector code (leaving any "optimization"
> > > > > > to IVOPTs there), that would enable us to compute the correct
> > > > > > "niters_vector_mult_vf" based on that IV.
> > > > > >
> > > > > > So given we cannot use the scalar IVs you have to handle all
> > > > > > inductions (besides the main exit control IV) in
> > > > > > vectorizable_live_operation
> > > > I think.
> > > > > >
> > > > >
> > > > > That's what I currently do, that's why there was the
> > > > > 	      if (STMT_VINFO_LIVE_P (phi_info))
> > > > > 		continue;
> > > >
> > > > Yes, but that only works for the inductions marked so.  We'd need
> > > > to mark the others as well, but only for the early exits.
> > > >
> > > > > although I don't understand why we use the scalar count,  I
> > > > > suppose the reasoning is that we don't really want to keep it
> > > > > around, and referencing
> > > > it forces it to be kept?
> > > >
> > > > Referencing it will cause the scalar compute to be retained, but
> > > > since we do not adjust the scalar compute during vectorization
> > > > (but expect it to be dead) the scalar compute will compute the
> > > > wrong thing (as shown by the reduction example - I suspect
> > > > inductions will suffer
> > from the same problem).
> > > >
> > > > > At the moment it just does `init + (final - init) * vf` which is correct no?
> > > >
> > > > The issue is that 'final' is not computed correctly in the
> > > > vectorized loop.  This formula might work for affine evolutions of course.
> > > >
> > > > Extracting the correct value from the vectorized induction would
> > > > be the preferred solution.
> > >
> > > Ok, so I should be able to just mark IVs as live during process_use
> > > if there are multiple exits right? Since it's just gonna be unused
> > > on the main exit since we use niters?
> > >
> > > Because since it's the PHI inside the loop that needs to be marked
> > > live I can't just do it for a specific exits no?
> > >
> > > If I create a copy of the PHI node during peeling for use in early
> > > exits and mark it live it won't work no?
> >
> > I guess I wouldn't actually mark it STMT_VINFO_LIVE_P but somehow
> > arrange vectorizable_live_operation to be called, possibly adding a
> > edge argument to that as well.
> >
> > Maybe the thing to do for the moment is to reject vectorization with
> > early breaks if there's any (non-STMT_VINFO_LIVE_P?) induction or
> > reduction besides the main counting IV one you can already special-case?
> 
> Ok so I did a quick hack with:
> 
>       if (!virtual_operand_p (PHI_RESULT (phi))
> 	  && !STMT_VINFO_LIVE_P (phi_info))
> 	{
> 	  use_operand_p use_p;
> 	  imm_use_iterator imm_iter;
> 	  bool non_exit_use = false;
> 	  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, PHI_RESULT (phi))
> 	    if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
> 	      for (auto exit : get_loop_exit_edges (loop))
> 		{
> 		  if (exit == LOOP_VINFO_IV_EXIT (loop_vinfo))
> 		    continue;
> 
> 		  if (gimple_bb (USE_STMT (use_p)) != exit->dest)
> 		    {
> 		      non_exit_use = true;
> 		      goto fail;
> 		    }
> 		}
> fail:
> 	  if (non_exit_use)
> 	    return false;
> 	}
> 
> And it does seem to still allow all the cases I want.  I've placed this in
> vect_can_advance_ivs_p.
> 
> Does this cover what you meant?
> 

Ok, I've rewritten this in a nicer form, but doesn't this mean we now block any loop where the index is not live?
i.e. we block such simple loops like

#ifndef N
#define N 800
#endif
unsigned vect_a[N];

unsigned test4(unsigned x)
{
 unsigned ret = 0;
 for (int i = 0; i < N; i++)
 {
   if (vect_a[i]*2 != x)
     break;
   vect_a[i] = x;
 }
 return ret;
}

because it does a simple `break`.  If I force it to be live it works, but then I need to differentiate between
the counter and the IV.

# i_15 = PHI <i_12(6), 0(2)>
# ivtmp_7 = PHI <ivtmp_14(6), 803(2)>

It seems like if we don't want to keep i_15 around (at the moment it is kept: because of its use in the
exit block it won't be DCEd) then we need to mark it live early during analysis.

Most likely if we do this I don't need to care about the "inverted" workflow here at all. What do you think?

Yes that doesn't work for SLP, but I don't think I can get SLP working in the remaining time anyway..

I'll fix reduction and multiple exit live values in the meantime.

Thanks,
Tamar
> Thanks,
> Tamar
> 
> >
> > Richard.
> >
> > > Tamar
> > > >
> > > > > Also you missed the question below about how to avoid the
> > > > > creation of the block, You ok with changing that?
> > > > >
> > > > > Thanks,
> > > > > Tamar
> > > > >
> > > > > > Or for now disable early-break for inductions that are not the
> > > > > > main exit control IV (in vect_can_advance_ivs_p)?
> > > > > >
> > > > > > > > > >
> > > > > > > > > > It seems your change handles different kinds of
> > > > > > > > > > inductions
> > > > differently.
> > > > > > > > > > Specifically
> > > > > > > > > >
> > > > > > > > > >       bool ivtemp = gimple_cond_lhs (cond) == iv_var;
> > > > > > > > > >       if (restart_loop && ivtemp)
> > > > > > > > > >         {
> > > > > > > > > >           type = TREE_TYPE (gimple_phi_result (phi));
> > > > > > > > > >           ni = build_int_cst (type, vf);
> > > > > > > > > >           if (inversed_iv)
> > > > > > > > > >             ni = fold_build2 (MINUS_EXPR, type, ni,
> > > > > > > > > >                               fold_convert (type, step_expr));
> > > > > > > > > >         }
> > > > > > > > > >
> > > > > > > > > > it looks like for the exit test IV we use either 'VF' or 'VF - step'
> > > > > > > > > > as the new value.  That seems to be very odd special
> > > > > > > > > > casing for unknown reasons.  And while you adjust
> > > > > > > > > > vec_step_op_add, you don't adjust
> > > > > > > > > > vect_peel_nonlinear_iv_init (maybe not supported -
> > > > > > > > > > better assert
> > > > > > > > here).
> > > > > > > > >
> > > > > > > > > The VF case is for a normal "non-inverted" loop, where
> > > > > > > > > if you take an early exit you know that you have to do
> > > > > > > > > at most VF
> > iterations.
> > > > > > > > > The VF
> > > > > > > > > - step is to account for the inverted loop control flow
> > > > > > > > > where you exit after adjusting the IV already by + step.
> > > > > > > >
> > > > > > > > But doesn't that assume the IV counts from niter to zero?
> > > > > > > > I don't see this special case is actually necessary, no?
> > > > > > > >
> > > > > > >
> > > > > > > I needed it because otherwise the scalar loop iterates one
> > > > > > > iteration too little So I got a miscompile with the inverter
> > > > > > > loop stuff.  I'll look at it again perhaps It can be solved differently.
> > > > > > >
> > > > > > > > >
> > > > > > > > > Peeling doesn't matter here, since you know you were
> > > > > > > > > able to do a vector iteration so it's safe to do VF iterations.
> > > > > > > > > So having peeled doesn't affect the remaining iters count.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Also the vec_step_op_add case will keep the original
> > > > > > > > > > scalar IV live even when it is a vectorized induction.
> > > > > > > > > > The code recomputing the value from scratch avoids this.
> > > > > > > > > >
> > > > > > > > > >       /* For non-main exit create an intermediat edge
> > > > > > > > > > to get any updated
> > > > > > iv
> > > > > > > > > >          calculations.  */
> > > > > > > > > >       if (needs_interm_block
> > > > > > > > > >           && !iv_block
> > > > > > > > > >           && (!gimple_seq_empty_p (stmts) ||
> > > > > > > > > > !gimple_seq_empty_p
> > > > > > > > > > (new_stmts)))
> > > > > > > > > >         {
> > > > > > > > > >           iv_block = split_edge (update_e);
> > > > > > > > > >           update_e = single_succ_edge (update_e->dest);
> > > > > > > > > >           last_gsi = gsi_last_bb (iv_block);
> > > > > > > > > >         }
> > > > > > > > > >
> > > > > > > > > > this is also odd, can we adjust the API instead?  I
> > > > > > > > > > suppose this is because your computation uses the
> > > > > > > > > > original loop IV, if you based the computation off the
> > > > > > > > > > initial value only this might not be
> > > > > > necessary?
> > > > > > > > >
> > > > > > > > > No, on the main exit the code updates the value in the
> > > > > > > > > loop header and puts the Calculation in the merge block.
> > > > > > > > > This works because it only needs to consume PHI nodes in
> > > > > > > > > the merge block and things like niters are
> > > > > > > > adjusted in the guard block.
> > > > > > > > >
> > > > > > > > > For an early exit, we don't have a guard block, only the
> > > > > > > > > merge
> > block.
> > > > > > > > > We have to update the PHI nodes in that block,  but
> > > > > > > > > can't do so since you can't produce a value and consume
> > > > > > > > > it in a PHI node in the same
> > > > > > BB.
> > > > > > > > > So we need to create the block to put the values in for
> > > > > > > > > use in the merge block.  Because there's no "guard"
> > > > > > > > > block for early
> > exits.
> > > > > > > >
> > > > > > > > ?  then compute niters in that block as well.
> > > > > > >
> > > > > > > We can't since it'll not be reachable through the right edge.
> > > > > > > What we can do if you want is slightly change peeling, we
> > > > > > > currently peel
> > > > as:
> > > > > > >
> > > > > > >   \        \             /
> > > > > > >   E1     E2        Normal exit
> > > > > > >     \       |          |
> > > > > > >        \    |          Guard
> > > > > > >           \ |          |
> > > > > > >          Merge block
> > > > > > >                   |
> > > > > > >              Pre Header
> > > > > > >
> > > > > > > If we instead peel as:
> > > > > > >
> > > > > > >
> > > > > > >   \        \             /
> > > > > > >   E1     E2        Normal exit
> > > > > > >     \       |          |
> > > > > > >        Exit join   Guard
> > > > > > >           \ |          |
> > > > > > >          Merge block
> > > > > > >                   |
> > > > > > >              Pre Header
> > > > > > >
> > > > > > > We can use the exit join block.  This would also mean
> > > > > > > vect_update_ivs_after_vectorizer Doesn't need to iterate
> > > > > > > over all exits and only really needs to adjust the phi nodes
> > > > > > > Coming out of the exit join
> > > > > > and guard block.
> > > > > > >
> > > > > > > Does this work for you?
> > > >
> > > > Yeah, I think that would work.  But I'd like to sort out the
> > > > correctness details of the IV update itself before sorting out
> > > > this code
> > placement detail.
> > > >
> > > > Richard.
> > > >
> > > > > > > Thanks,
> > > > > > > Tamar
> > > > > > > >
> > > > > > > > > The API can be adjusted by always creating the empty
> > > > > > > > > block either during
> > > > > > > > peeling.
> > > > > > > > > That would prevent us from having to do anything special here.
> > > > > > > > > Would that work better?  Or I can do it in the loop that
> > > > > > > > > iterates over the exits to before the call to
> > > > > > > > > vect_update_ivs_after_vectorizer, which I think
> > > > > > > > might be more consistent.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > That said, I wonder why we cannot simply pass in an
> > > > > > > > > > adjusted niter which would be niters_vector_mult_vf -
> > > > > > > > > > vf and be done with
> > > > that?
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > We can ofcourse not have this and recompute it from
> > > > > > > > > niters itself, however this does affect the epilog code layout.
> > > > > > > > > Particularly knowing the static number if iterations
> > > > > > > > > left causes it to usually unroll the loop and share some
> > > > > > > > > of the computations.  i.e. the scalar code is often more
> > > > > > > > efficient.
> > > > > > > > >
> > > > > > > > > The computation would be niters_vector_mult_vf -
> > > > > > > > > iters_done * vf, since the value put Here is the
> > > > > > > > > remaining iteration
> > count.
> > > > > > > > > It's static for early
> > > > > > > > exits.
> > > > > > > >
> > > > > > > > Well, it might be "static" in that it doesn't really
> > > > > > > > matter what you use for the epilog main IV initial value
> > > > > > > > as long as you are sure you're not going to take that exit
> > > > > > > > as you are sure we're going to take one of the early
> > > > > > > > exits.  So yeah, the special code is probably OK, but it
> > > > > > > > needs a better comment and as said the structure of
> > > > > > vect_update_ivs_after_vectorizer is a bit hard to follow now.
> > > > > > > >
> > > > > > > > As said an important part for optimization is to not keep
> > > > > > > > the scalar IVs live in the vector loop.
> > > > > > > >
> > > > > > > > > But can do whatever you prefer here.  Let me know what
> > > > > > > > > you prefer for the
> > > > > > > > above.
> > > > > > > > >
> > > > > > > > > Thanks,
> > > > > > > > > Tamar
> > > > > > > > >
> > > > > > > > > > Thanks,
> > > > > > > > > > Richard.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > Regards,
> > > > > > > > > > > Tamar
> > > > > > > > > > > >
> > > > > > > > > > > > > It has to do this since you have to perform the
> > > > > > > > > > > > > side effects for the non-matching elements still.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Regards,
> > > > > > > > > > > > > Tamar
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +	      if (STMT_VINFO_LIVE_P (phi_info))
> > > > > > > > > > > > > > > +		continue;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +	      /* For early break the final loop IV is:
> > > > > > > > > > > > > > > +		 init + (final - init) * vf which takes
> > > > > > > > > > > > > > > +into account
> > > > > > peeling
> > > > > > > > > > > > > > > +		 values and non-single steps.  The
> main
> > > > > > > > > > > > > > > +exit
> > > > > > can
> > > > > > > > > > > > > > > +use
> > > > > > > > > > niters
> > > > > > > > > > > > > > > +		 since if you exit from the main exit
> > > > > > > > > > > > > > > +you've
> > > > > > done
> > > > > > > > > > > > > > > +all
> > > > > > > > > > vector
> > > > > > > > > > > > > > > +		 iterations.  For an early exit we don't
> > > > > > > > > > > > > > > +know
> > > > > > when
> > > > > > > > > > > > > > > +we
> > > > > > > > > > exit
> > > > > > > > > > > > > > > +so
> > > > > > > > > > > > > > we
> > > > > > > > > > > > > > > +		 must re-calculate this on the exit.  */
> > > > > > > > > > > > > > > +	      tree start_expr = gimple_phi_result (phi);
> > > > > > > > > > > > > > > +	      off = fold_build2 (MINUS_EXPR, stype,
> > > > > > > > > > > > > > > +				 fold_convert (stype,
> > > > > > start_expr),
> > > > > > > > > > > > > > > +				 fold_convert (stype,
> > > > > > init_expr));
> > > > > > > > > > > > > > > +	      /* Now adjust for VF to get the final
> iteration value.
> > > > > > */
> > > > > > > > > > > > > > > +	      off = fold_build2 (MULT_EXPR, stype, off,
> > > > > > > > > > > > > > > +				 build_int_cst (stype,
> vf));
> > > > > > > > > > > > > > > +	    }
> > > > > > > > > > > > > > > +	  else
> > > > > > > > > > > > > > > +	    off = fold_build2 (MULT_EXPR, stype,
> > > > > > > > > > > > > > > +			       fold_convert (stype,
> niters),
> > > > > > step_expr);
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > >  	  if (POINTER_TYPE_P (type))
> > > > > > > > > > > > > > >  	    ni = fold_build_pointer_plus (init_expr, off);
> > > > > > > > > > > > > > >  	  else
> > > > > > > > > > > > > > > @@ -2238,6 +2286,8 @@
> > > > > > > > > > > > > > > vect_update_ivs_after_vectorizer
> > > > > > > > > > > > > > > (loop_vec_info
> > > > > > > > > > > > > > loop_vinfo,
> > > > > > > > > > > > > > >        /* Don't bother call vect_peel_nonlinear_iv_init.  */
> > > > > > > > > > > > > > >        else if (induction_type == vect_step_op_neg)
> > > > > > > > > > > > > > >  	ni = init_expr;
> > > > > > > > > > > > > > > +      else if (restart_loop)
> > > > > > > > > > > > > > > +	continue;
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > This looks all a bit complicated - why
> > > > > > > > > > > > > > wouldn't we simply always use the PHI result when
> 'restart_loop'?
> > > > > > > > > > > > > > Isn't that the correct old start value in
> > > > > > > > > > > > all cases?
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > >        else
> > > > > > > > > > > > > > >  	ni = vect_peel_nonlinear_iv_init (&stmts,
> init_expr,
> > > > > > > > > > > > > > >  					  niters,
> step_expr,
> > > > @@ -
> > > > > > > > 2245,9 +2295,20 @@
> > > > > > > > > > > > > > > vect_update_ivs_after_vectorizer
> > > > > > > > > > > > > > (loop_vec_info
> > > > > > > > > > > > > > > loop_vinfo,
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >        var = create_tmp_var (type, "tmp");
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > -      last_gsi = gsi_last_bb (exit_bb);
> > > > > > > > > > > > > > >        gimple_seq new_stmts = NULL;
> > > > > > > > > > > > > > >        ni_name = force_gimple_operand (ni,
> > > > > > > > > > > > > > > &new_stmts, false, var);
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +      /* For non-main exit create an
> > > > > > > > > > > > > > > + intermediat edge to get any
> > > > > > > > > > updated iv
> > > > > > > > > > > > > > > +	 calculations.  */
> > > > > > > > > > > > > > > +      if (needs_interm_block
> > > > > > > > > > > > > > > +	  && !iv_block
> > > > > > > > > > > > > > > +	  && (!gimple_seq_empty_p (stmts) ||
> > > > > > > > > > > > > > > +!gimple_seq_empty_p
> > > > > > > > > > > > > > (new_stmts)))
> > > > > > > > > > > > > > > +	{
> > > > > > > > > > > > > > > +	  iv_block = split_edge (update_e);
> > > > > > > > > > > > > > > +	  update_e = single_succ_edge (update_e-
> >dest);
> > > > > > > > > > > > > > > +	  last_gsi = gsi_last_bb (iv_block);
> > > > > > > > > > > > > > > +	}
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > >        /* Exit_bb shouldn't be empty.  */
> > > > > > > > > > > > > > >        if (!gsi_end_p (last_gsi))
> > > > > > > > > > > > > > >  	{
> > > > > > > > > > > > > > > @@ -3342,8 +3403,26 @@ vect_do_peeling
> > > > > > > > > > > > > > > (loop_vec_info loop_vinfo, tree
> > > > > > > > > > > > > > niters, tree nitersm1,
> > > > > > > > > > > > > > >  	 niters_vector_mult_vf steps.  */
> > > > > > > > > > > > > > >        gcc_checking_assert
> > > > > > > > > > > > > > > (vect_can_advance_ivs_p
> > > > > > (loop_vinfo));
> > > > > > > > > > > > > > >        update_e = skip_vector ? e :
> > > > > > > > > > > > > > > loop_preheader_edge
> > > > (epilog);
> > > > > > > > > > > > > > > -      vect_update_ivs_after_vectorizer (loop_vinfo,
> > > > > > > > > > niters_vector_mult_vf,
> > > > > > > > > > > > > > > -					update_e);
> > > > > > > > > > > > > > > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > > > > > > > > > > > > > > +	update_e = single_succ_edge (e->dest);
> > > > > > > > > > > > > > > +      bool inversed_iv
> > > > > > > > > > > > > > > +	= !vect_is_loop_exit_latch_pred
> > > > > > (LOOP_VINFO_IV_EXIT
> > > > > > > > > > (loop_vinfo),
> > > > > > > > > > > > > > > +
> LOOP_VINFO_LOOP
> > > > > > > > > > (loop_vinfo));
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > You are computing this here and in
> > > > > > > > vect_update_ivs_after_vectorizer?
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +      /* Update the main exit first.  */
> > > > > > > > > > > > > > > +      vect_update_ivs_after_vectorizer
> > > > > > > > > > > > > > > + (loop_vinfo, vf,
> > > > > > > > > > > > niters_vector_mult_vf,
> > > > > > > > > > > > > > > +					update_e,
> > > > > > inversed_iv);
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +      /* And then update the early exits.  */
> > > > > > > > > > > > > > > +      for (auto exit : get_loop_exit_edges (loop))
> > > > > > > > > > > > > > > +	{
> > > > > > > > > > > > > > > +	  if (exit == LOOP_VINFO_IV_EXIT
> (loop_vinfo))
> > > > > > > > > > > > > > > +	    continue;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +	  vect_update_ivs_after_vectorizer
> > > > > > > > > > > > > > > +(loop_vinfo, vf,
> > > > > > > > > > > > > > > +
> > > > > > niters_vector_mult_vf,
> > > > > > > > > > > > > > > +					    exit, true);
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > ... why does the same not work here?  Wouldn't
> > > > > > > > > > > > > > the proper condition be !dominated_by_p
> > > > > > > > > > > > > > (CDI_DOMINATORS,
> > > > > > > > > > > > > > exit->src, LOOP_VINFO_IV_EXIT
> > > > > > > > > > > > > > (loop_vinfo)->src) or similar?  That is,
> > > > > > > > > > > > > > whether the exit is at or after the main IV exit?
> > > > > > > > > > > > > > (consider having
> > > > > > > > > > > > > > two)
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +	}
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >        if (skip_epilog)
> > > > > > > > > > > > > > >  	{
> > > > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > --
> > > > > > > > > > > > Richard Biener <rguenther@suse.de> SUSE Software
> > > > > > > > > > > > Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > > > > > > > > > > Nuernberg, Germany;
> > > > > > > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich;
> > > > > > > > > > > > (HRB 36809, AG
> > > > > > > > > > > > Nuernberg)
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > --
> > > > > > > > > > Richard Biener <rguenther@suse.de> SUSE Software
> > > > > > > > > > Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > > > > > > > > Nuernberg, Germany;
> > > > > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB
> > > > > > > > > > 36809, AG
> > > > > > > > > > Nuernberg)
> > > > > > > > >
> > > > > > > >
> > > > > > > > --
> > > > > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions
> > > > > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
> > > > > > > > Germany;
> > > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB
> > > > > > > > 36809, AG
> > > > > > > > Nuernberg)
> > > > > > >
> > > > > >
> > > > > > --
> > > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions
> > > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809,
> > > > > > AG
> > > > > > Nuernberg)
> > > > >
> > > >
> > > > --
> > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions Germany
> > > > GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > > Nuernberg)
> > >
> >
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH,
> > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > Nuernberg)
Tamar Christina Nov. 17, 2023, 10:40 a.m. UTC | #17
> > > > > Yes, but that only works for the inductions marked so.  We'd
> > > > > need to mark the others as well, but only for the early exits.
> > > > >
> > > > > > although I don't understand why we use the scalar count,  I
> > > > > > suppose the reasoning is that we don't really want to keep it
> > > > > > around, and referencing
> > > > > it forces it to be kept?
> > > > >
> > > > > Referencing it will cause the scalar compute to be retained, but
> > > > > since we do not adjust the scalar compute during vectorization
> > > > > (but expect it to be dead) the scalar compute will compute the
> > > > > wrong thing (as shown by the reduction example - I suspect
> > > > > inductions will suffer
> > > from the same problem).
> > > > >
> > > > > > At the moment it just does `init + (final - init) * vf` which is correct no?
> > > > >
> > > > > The issue is that 'final' is not computed correctly in the
> > > > > vectorized loop.  This formula might work for affine evolutions of
> course.
> > > > >
> > > > > Extracting the correct value from the vectorized induction would
> > > > > be the preferred solution.
> > > >
> > > > Ok, so I should be able to just mark IVs as live during
> > > > process_use if there are multiple exits right? Since it's just
> > > > gonna be unused on the main exit since we use niters?
> > > >
> > > > Because since it's the PHI inside the loop that needs to be marked
> > > > live I can't just do it for a specific exits no?
> > > >
> > > > If I create a copy of the PHI node during peeling for use in early
> > > > exits and mark it live it won't work no?
> > >
> > > I guess I wouldn't actually mark it STMT_VINFO_LIVE_P but somehow
> > > arrange vectorizable_live_operation to be called, possibly adding a
> > > edge argument to that as well.
> > >
> > > Maybe the thing to do for the moment is to reject vectorization with
> > > early breaks if there's any (non-STMT_VINFO_LIVE_P?) induction or
> > > reduction besides the main counting IV one you can already special-case?
> >
> > Ok so I did a quick hack with:
> >
> >       if (!virtual_operand_p (PHI_RESULT (phi))
> > 	  && !STMT_VINFO_LIVE_P (phi_info))
> > 	{
> > 	  use_operand_p use_p;
> > 	  imm_use_iterator imm_iter;
> > 	  bool non_exit_use = false;
> > 	  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, PHI_RESULT (phi))
> > 	    if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
> > 	      for (auto exit : get_loop_exit_edges (loop))
> > 		{
> > 		  if (exit == LOOP_VINFO_IV_EXIT (loop_vinfo))
> > 		    continue;
> >
> > 		  if (gimple_bb (USE_STMT (use_p)) != exit->dest)
> > 		    {
> > 		      non_exit_use = true;
> > 		      goto fail;
> > 		    }
> > 		}
> > fail:
> > 	  if (non_exit_use)
> > 	    return false;
> > 	}
> >
> > And it does seem to still allow all the cases I want.  I've placed
> > this in vect_can_advance_ivs_p.
> >
> > Does this cover what you meant?
> >
> 
> Ok, I've rewritten this in a nicer form, but doesn't this mean we now block any
> loop where the index is not live?
> i.e. we block such simple loops like
> 
> #ifndef N
> #define N 800
> #endif
> unsigned vect_a[N];
> 
> unsigned test4(unsigned x)
> {
>  unsigned ret = 0;
>  for (int i = 0; i < N; i++)
>  {
>    if (vect_a[i]*2 != x)
>      break;
>    vect_a[i] = x;
>  }
>  return ret;
> }
> 
> because it does a simple `break`.  If I force it to be live it works, but then I need
> to differentiate between the counter and the IV.
> 
> # i_15 = PHI <i_12(6), 0(2)>
> # ivtmp_7 = PHI <ivtmp_14(6), 803(2)>
> 
> It seems like if we don't want to keep i_15 around (at the moment it will be kept
> because of its usage in the exit block it won't be DCEd) then we need to mark it
> live early during analysis.
> 
> Most likely if we do this I don't need to care about the "inverted" workflow
> here at all. What do you think?
> 
> Yes that doesn't work for SLP, but I don't think I can get SLP working in the
> remaining time anyway..
> 
> I'll fix reduction and multiple exit live values in the mean time.
> 

Ok, so I currently have the following solution.  Let me know if you agree with it
and I'll polish it up today and tomorrow and respin things.

1. During vect_update_ivs_after_vectorizer we no longer touch any PHIs aside from
     just updating IVtemps with the expected remaining iteration count.
2. During vect_transform_loop, after vectorizing any induction or reduction, I call vectorizable_live_operation
     for any PHI node that still has any uses in the early exit merge block.
3. vectorizable_live_operation is taught to materialize the same PHI in multiple exits.
4. vectorizable_reduction or maybe vect_create_epilog_for_reduction needs to be modified to materialize,
    for early exits, the previous iteration value.

This seems to work and now produces the following for the simple loop above:

.L2:
        str     q27, [x1, x3]
        str     q29, [x2, x1]
        add     x1, x1, 16
        cmp     x1, 3200
        beq     .L11
.L4:
        ldr     q31, [x2, x1]
        mov     v28.16b, v30.16b
        add     v30.4s, v30.4s, v26.4s
        shl     v31.4s, v31.4s, 1
        add     v27.4s, v28.4s, v29.4s
        cmeq    v31.4s, v31.4s, v29.4s
        not     v31.16b, v31.16b
        umaxp   v31.4s, v31.4s, v31.4s
        fmov    x4, d31
        cbz     x4, .L2
        fmov    w1, s28
        mov     w6, 4
.L3:

so now the scalar index is no longer kept and it reduces the value from the vector IV in the exit:

fmov    w1, s28

Does this work as you expected?

Thanks,
Tamar

> Thanks,
> Tamar
> > Thanks,
> > Tamar
> >
> > >
> > > Richard.
> > >
> > > > Tamar
> > > > >
> > > > > > Also you missed the question below about how to avoid the
> > > > > > creation of the block, You ok with changing that?
> > > > > >
> > > > > > Thanks,
> > > > > > Tamar
> > > > > >
> > > > > > > Or for now disable early-break for inductions that are not
> > > > > > > the main exit control IV (in vect_can_advance_ivs_p)?
> > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > It seems your change handles different kinds of
> > > > > > > > > > > inductions
> > > > > differently.
> > > > > > > > > > > Specifically
> > > > > > > > > > >
> > > > > > > > > > >       bool ivtemp = gimple_cond_lhs (cond) == iv_var;
> > > > > > > > > > >       if (restart_loop && ivtemp)
> > > > > > > > > > >         {
> > > > > > > > > > >           type = TREE_TYPE (gimple_phi_result (phi));
> > > > > > > > > > >           ni = build_int_cst (type, vf);
> > > > > > > > > > >           if (inversed_iv)
> > > > > > > > > > >             ni = fold_build2 (MINUS_EXPR, type, ni,
> > > > > > > > > > >                               fold_convert (type, step_expr));
> > > > > > > > > > >         }
> > > > > > > > > > >
> > > > > > > > > > > it looks like for the exit test IV we use either 'VF' or 'VF - step'
> > > > > > > > > > > as the new value.  That seems to be very odd special
> > > > > > > > > > > casing for unknown reasons.  And while you adjust
> > > > > > > > > > > vec_step_op_add, you don't adjust
> > > > > > > > > > > vect_peel_nonlinear_iv_init (maybe not supported -
> > > > > > > > > > > better assert
> > > > > > > > > here).
> > > > > > > > > >
> > > > > > > > > > The VF case is for a normal "non-inverted" loop, where
> > > > > > > > > > if you take an early exit you know that you have to do
> > > > > > > > > > at most VF
> > > iterations.
> > > > > > > > > > The VF
> > > > > > > > > > - step is to account for the inverted loop control
> > > > > > > > > > flow where you exit after adjusting the IV already by + step.
> > > > > > > > >
> > > > > > > > > But doesn't that assume the IV counts from niter to zero?
> > > > > > > > > I don't see this special case is actually necessary, no?
> > > > > > > > >
> > > > > > > >
> > > > > > > > I needed it because otherwise the scalar loop iterates one
> > > > > > > > iteration too little So I got a miscompile with the
> > > > > > > > inverter loop stuff.  I'll look at it again perhaps It can be solved
> differently.
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Peeling doesn't matter here, since you know you were
> > > > > > > > > > able to do a vector iteration so it's safe to do VF iterations.
> > > > > > > > > > So having peeled doesn't affect the remaining iters count.
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Also the vec_step_op_add case will keep the original
> > > > > > > > > > > scalar IV live even when it is a vectorized induction.
> > > > > > > > > > > The code recomputing the value from scratch avoids this.
> > > > > > > > > > >
> > > > > > > > > > >       /* For non-main exit create an intermediat
> > > > > > > > > > > edge to get any updated
> > > > > > > iv
> > > > > > > > > > >          calculations.  */
> > > > > > > > > > >       if (needs_interm_block
> > > > > > > > > > >           && !iv_block
> > > > > > > > > > >           && (!gimple_seq_empty_p (stmts) ||
> > > > > > > > > > > !gimple_seq_empty_p
> > > > > > > > > > > (new_stmts)))
> > > > > > > > > > >         {
> > > > > > > > > > >           iv_block = split_edge (update_e);
> > > > > > > > > > >           update_e = single_succ_edge (update_e->dest);
> > > > > > > > > > >           last_gsi = gsi_last_bb (iv_block);
> > > > > > > > > > >         }
> > > > > > > > > > >
> > > > > > > > > > > this is also odd, can we adjust the API instead?  I
> > > > > > > > > > > suppose this is because your computation uses the
> > > > > > > > > > > original loop IV, if you based the computation off
> > > > > > > > > > > the initial value only this might not be
> > > > > > > necessary?
> > > > > > > > > >
> > > > > > > > > > No, on the main exit the code updates the value in the
> > > > > > > > > > loop header and puts the Calculation in the merge block.
> > > > > > > > > > This works because it only needs to consume PHI nodes
> > > > > > > > > > in the merge block and things like niters are
> > > > > > > > > adjusted in the guard block.
> > > > > > > > > >
> > > > > > > > > > For an early exit, we don't have a guard block, only
> > > > > > > > > > the merge
> > > block.
> > > > > > > > > > We have to update the PHI nodes in that block,  but
> > > > > > > > > > can't do so since you can't produce a value and
> > > > > > > > > > consume it in a PHI node in the same
> > > > > > > BB.
> > > > > > > > > > So we need to create the block to put the values in
> > > > > > > > > > for use in the merge block.  Because there's no "guard"
> > > > > > > > > > block for early
> > > exits.
> > > > > > > > >
> > > > > > > > > ?  then compute niters in that block as well.
> > > > > > > >
> > > > > > > > We can't since it'll not be reachable through the right edge.
> > > > > > > > What we can do if you want is slightly change peeling, we
> > > > > > > > currently peel
> > > > > as:
> > > > > > > >
> > > > > > > >   \        \             /
> > > > > > > >   E1     E2        Normal exit
> > > > > > > >     \       |          |
> > > > > > > >        \    |          Guard
> > > > > > > >           \ |          |
> > > > > > > >          Merge block
> > > > > > > >                   |
> > > > > > > >              Pre Header
> > > > > > > >
> > > > > > > > If we instead peel as:
> > > > > > > >
> > > > > > > >
> > > > > > > >   \        \             /
> > > > > > > >   E1     E2        Normal exit
> > > > > > > >     \       |          |
> > > > > > > >        Exit join   Guard
> > > > > > > >           \ |          |
> > > > > > > >          Merge block
> > > > > > > >                   |
> > > > > > > >              Pre Header
> > > > > > > >
> > > > > > > > We can use the exit join block.  This would also mean
> > > > > > > > vect_update_ivs_after_vectorizer Doesn't need to iterate
> > > > > > > > over all exits and only really needs to adjust the phi
> > > > > > > > nodes Coming out of the exit join
> > > > > > > and guard block.
> > > > > > > >
> > > > > > > > Does this work for you?
> > > > >
> > > > > Yeah, I think that would work.  But I'd like to sort out the
> > > > > correctness details of the IV update itself before sorting out
> > > > > this code
> > > placement detail.
> > > > >
> > > > > Richard.
> > > > >
> > > > > > > > Thanks,
> > > > > > > > Tamar
> > > > > > > > >
> > > > > > > > > > The API can be adjusted by always creating the empty
> > > > > > > > > > block either during
> > > > > > > > > peeling.
> > > > > > > > > > That would prevent us from having to do anything special here.
> > > > > > > > > > Would that work better?  Or I can do it in the loop
> > > > > > > > > > that iterates over the exits to before the call to
> > > > > > > > > > vect_update_ivs_after_vectorizer, which I think
> > > > > > > > > might be more consistent.
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > That said, I wonder why we cannot simply pass in an
> > > > > > > > > > > adjusted niter which would be niters_vector_mult_vf
> > > > > > > > > > > - vf and be done with
> > > > > that?
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > We can ofcourse not have this and recompute it from
> > > > > > > > > > niters itself, however this does affect the epilog code layout.
> > > > > > > > > > Particularly knowing the static number if iterations
> > > > > > > > > > left causes it to usually unroll the loop and share
> > > > > > > > > > some of the computations.  i.e. the scalar code is
> > > > > > > > > > often more
> > > > > > > > > efficient.
> > > > > > > > > >
> > > > > > > > > > The computation would be niters_vector_mult_vf -
> > > > > > > > > > iters_done * vf, since the value put Here is the
> > > > > > > > > > remaining iteration
> > > count.
> > > > > > > > > > It's static for early
> > > > > > > > > exits.
> > > > > > > > >
> > > > > > > > > Well, it might be "static" in that it doesn't really
> > > > > > > > > matter what you use for the epilog main IV initial value
> > > > > > > > > as long as you are sure you're not going to take that
> > > > > > > > > exit as you are sure we're going to take one of the
> > > > > > > > > early exits.  So yeah, the special code is probably OK,
> > > > > > > > > but it needs a better comment and as said the structure
> > > > > > > > > of
> > > > > > > vect_update_ivs_after_vectorizer is a bit hard to follow now.
> > > > > > > > >
> > > > > > > > > As said an important part for optimization is to not
> > > > > > > > > keep the scalar IVs live in the vector loop.
> > > > > > > > >
> > > > > > > > > > But can do whatever you prefer here.  Let me know what
> > > > > > > > > > you prefer for the
> > > > > > > > > above.
> > > > > > > > > >
> > > > > > > > > > Thanks,
> > > > > > > > > > Tamar
> > > > > > > > > >
> > > > > > > > > > > Thanks,
> > > > > > > > > > > Richard.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > Regards,
> > > > > > > > > > > > Tamar
> > > > > > > > > > > > >
> > > > > > > > > > > > > > It has to do this since you have to perform
> > > > > > > > > > > > > > the side effects for the non-matching elements still.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Regards,
> > > > > > > > > > > > > > Tamar
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > +	      if (STMT_VINFO_LIVE_P (phi_info))
> > > > > > > > > > > > > > > > +		continue;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +	      /* For early break the final loop IV is:
> > > > > > > > > > > > > > > > +		 init + (final - init) * vf which takes
> > > > > > > > > > > > > > > > +into account
> > > > > > > peeling
> > > > > > > > > > > > > > > > +		 values and non-single steps.  The
> > main
> > > > > > > > > > > > > > > > +exit
> > > > > > > can
> > > > > > > > > > > > > > > > +use
> > > > > > > > > > > niters
> > > > > > > > > > > > > > > > +		 since if you exit from the main exit
> > > > > > > > > > > > > > > > +you've
> > > > > > > done
> > > > > > > > > > > > > > > > +all
> > > > > > > > > > > vector
> > > > > > > > > > > > > > > > +		 iterations.  For an early exit we
> > > > > > > > > > > > > > > > +don't know
> > > > > > > when
> > > > > > > > > > > > > > > > +we
> > > > > > > > > > > exit
> > > > > > > > > > > > > > > > +so
> > > > > > > > > > > > > > > we
> > > > > > > > > > > > > > > > +		 must re-calculate this on the exit.  */
> > > > > > > > > > > > > > > > +	      tree start_expr = gimple_phi_result (phi);
> > > > > > > > > > > > > > > > +	      off = fold_build2 (MINUS_EXPR, stype,
> > > > > > > > > > > > > > > > +				 fold_convert (stype,
> > > > > > > start_expr),
> > > > > > > > > > > > > > > > +				 fold_convert (stype,
> > > > > > > init_expr));
> > > > > > > > > > > > > > > > +	      /* Now adjust for VF to get the
> > > > > > > > > > > > > > > > +final
> > iteration value.
> > > > > > > */
> > > > > > > > > > > > > > > > +	      off = fold_build2 (MULT_EXPR, stype, off,
> > > > > > > > > > > > > > > > +				 build_int_cst (stype,
> > vf));
> > > > > > > > > > > > > > > > +	    }
> > > > > > > > > > > > > > > > +	  else
> > > > > > > > > > > > > > > > +	    off = fold_build2 (MULT_EXPR, stype,
> > > > > > > > > > > > > > > > +			       fold_convert (stype,
> > niters),
> > > > > > > step_expr);
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > >  	  if (POINTER_TYPE_P (type))
> > > > > > > > > > > > > > > >  	    ni = fold_build_pointer_plus (init_expr, off);
> > > > > > > > > > > > > > > >  	  else
> > > > > > > > > > > > > > > > @@ -2238,6 +2286,8 @@
> > > > > > > > > > > > > > > > vect_update_ivs_after_vectorizer
> > > > > > > > > > > > > > > > (loop_vec_info
> > > > > > > > > > > > > > > loop_vinfo,
> > > > > > > > > > > > > > > >        /* Don't bother call vect_peel_nonlinear_iv_init.
> */
> > > > > > > > > > > > > > > >        else if (induction_type == vect_step_op_neg)
> > > > > > > > > > > > > > > >  	ni = init_expr;
> > > > > > > > > > > > > > > > +      else if (restart_loop)
> > > > > > > > > > > > > > > > +	continue;
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > This looks all a bit complicated - why
> > > > > > > > > > > > > > > wouldn't we simply always use the PHI result
> > > > > > > > > > > > > > > when
> > 'restart_loop'?
> > > > > > > > > > > > > > > Isn't that the correct old start value in
> > > > > > > > > > > > > all cases?
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >        else
> > > > > > > > > > > > > > > >  	ni = vect_peel_nonlinear_iv_init
> > > > > > > > > > > > > > > > (&stmts,
> > init_expr,
> > > > > > > > > > > > > > > >  					  niters,
> > step_expr,
> > > > > @@ -
> > > > > > > > > 2245,9 +2295,20 @@
> > > > > > > > > > > > > > > > vect_update_ivs_after_vectorizer
> > > > > > > > > > > > > > > (loop_vec_info
> > > > > > > > > > > > > > > > loop_vinfo,
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >        var = create_tmp_var (type, "tmp");
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > -      last_gsi = gsi_last_bb (exit_bb);
> > > > > > > > > > > > > > > >        gimple_seq new_stmts = NULL;
> > > > > > > > > > > > > > > >        ni_name = force_gimple_operand (ni,
> > > > > > > > > > > > > > > > &new_stmts, false, var);
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +      /* For non-main exit create an
> > > > > > > > > > > > > > > > + intermediat edge to get any
> > > > > > > > > > > updated iv
> > > > > > > > > > > > > > > > +	 calculations.  */
> > > > > > > > > > > > > > > > +      if (needs_interm_block
> > > > > > > > > > > > > > > > +	  && !iv_block
> > > > > > > > > > > > > > > > +	  && (!gimple_seq_empty_p (stmts) ||
> > > > > > > > > > > > > > > > +!gimple_seq_empty_p
> > > > > > > > > > > > > > > (new_stmts)))
> > > > > > > > > > > > > > > > +	{
> > > > > > > > > > > > > > > > +	  iv_block = split_edge (update_e);
> > > > > > > > > > > > > > > > +	  update_e = single_succ_edge (update_e-
> > >dest);
> > > > > > > > > > > > > > > > +	  last_gsi = gsi_last_bb (iv_block);
> > > > > > > > > > > > > > > > +	}
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > >        /* Exit_bb shouldn't be empty.  */
> > > > > > > > > > > > > > > >        if (!gsi_end_p (last_gsi))
> > > > > > > > > > > > > > > >  	{
> > > > > > > > > > > > > > > > @@ -3342,8 +3403,26 @@ vect_do_peeling
> > > > > > > > > > > > > > > > (loop_vec_info loop_vinfo, tree
> > > > > > > > > > > > > > > niters, tree nitersm1,
> > > > > > > > > > > > > > > >  	 niters_vector_mult_vf steps.  */
> > > > > > > > > > > > > > > >        gcc_checking_assert
> > > > > > > > > > > > > > > > (vect_can_advance_ivs_p
> > > > > > > (loop_vinfo));
> > > > > > > > > > > > > > > >        update_e = skip_vector ? e :
> > > > > > > > > > > > > > > > loop_preheader_edge
> > > > > (epilog);
> > > > > > > > > > > > > > > > -      vect_update_ivs_after_vectorizer (loop_vinfo,
> > > > > > > > > > > niters_vector_mult_vf,
> > > > > > > > > > > > > > > > -					update_e);
> > > > > > > > > > > > > > > > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > > > > > > > > > > > > > > > +	update_e = single_succ_edge (e->dest);
> > > > > > > > > > > > > > > > +      bool inversed_iv
> > > > > > > > > > > > > > > > +	= !vect_is_loop_exit_latch_pred
> > > > > > > (LOOP_VINFO_IV_EXIT
> > > > > > > > > > > (loop_vinfo),
> > > > > > > > > > > > > > > > +
> > LOOP_VINFO_LOOP
> > > > > > > > > > > (loop_vinfo));
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > You are computing this here and in
> > > > > > > > > vect_update_ivs_after_vectorizer?
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +      /* Update the main exit first.  */
> > > > > > > > > > > > > > > > +      vect_update_ivs_after_vectorizer
> > > > > > > > > > > > > > > > + (loop_vinfo, vf,
> > > > > > > > > > > > > niters_vector_mult_vf,
> > > > > > > > > > > > > > > > +					update_e,
> > > > > > > inversed_iv);
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +      /* And then update the early exits.  */
> > > > > > > > > > > > > > > > +      for (auto exit : get_loop_exit_edges (loop))
> > > > > > > > > > > > > > > > +	{
> > > > > > > > > > > > > > > > +	  if (exit == LOOP_VINFO_IV_EXIT
> > (loop_vinfo))
> > > > > > > > > > > > > > > > +	    continue;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +	  vect_update_ivs_after_vectorizer
> > > > > > > > > > > > > > > > +(loop_vinfo, vf,
> > > > > > > > > > > > > > > > +
> > > > > > > niters_vector_mult_vf,
> > > > > > > > > > > > > > > > +					    exit, true);
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > ... why does the same not work here?
> > > > > > > > > > > > > > > Wouldn't the proper condition be
> > > > > > > > > > > > > > > !dominated_by_p (CDI_DOMINATORS,
> > > > > > > > > > > > > > > exit->src, LOOP_VINFO_IV_EXIT
> > > > > > > > > > > > > > > (loop_vinfo)->src) or similar?  That is,
> > > > > > > > > > > > > > > whether the exit is at or after the main IV exit?
> > > > > > > > > > > > > > > (consider having
> > > > > > > > > > > > > > > two)
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > +	}
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >        if (skip_epilog)
> > > > > > > > > > > > > > > >  	{
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > --
> > > > > > > > > > > > > Richard Biener <rguenther@suse.de> SUSE Software
> > > > > > > > > > > > > Solutions Germany GmbH, Frankenstrasse 146,
> > > > > > > > > > > > > 90461 Nuernberg, Germany;
> > > > > > > > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich;
> > > > > > > > > > > > > (HRB 36809, AG
> > > > > > > > > > > > > Nuernberg)
> > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > --
> > > > > > > > > > > Richard Biener <rguenther@suse.de> SUSE Software
> > > > > > > > > > > Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > > > > > > > > > Nuernberg, Germany;
> > > > > > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich;
> > > > > > > > > > > (HRB 36809, AG
> > > > > > > > > > > Nuernberg)
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > --
> > > > > > > > > Richard Biener <rguenther@suse.de> SUSE Software
> > > > > > > > > Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > > > > > > > Nuernberg, Germany;
> > > > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB
> > > > > > > > > 36809, AG
> > > > > > > > > Nuernberg)
> > > > > > > >
> > > > > > >
> > > > > > > --
> > > > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions
> > > > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809,
> > > > > > > AG
> > > > > > > Nuernberg)
> > > > > >
> > > > >
> > > > > --
> > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions
> > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > > > Nuernberg)
> > > >
> > >
> > > --
> > > Richard Biener <rguenther@suse.de>
> > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > Nuernberg, Germany;
> > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > Nuernberg)
Richard Biener Nov. 17, 2023, 12:13 p.m. UTC | #18
On Fri, 17 Nov 2023, Tamar Christina wrote:

> > > > > > Yes, but that only works for the inductions marked so.  We'd
> > > > > > need to mark the others as well, but only for the early exits.
> > > > > >
> > > > > > > although I don't understand why we use the scalar count,  I
> > > > > > > suppose the reasoning is that we don't really want to keep it
> > > > > > > around, and referencing
> > > > > > it forces it to be kept?
> > > > > >
> > > > > > Referencing it will cause the scalar compute to be retained, but
> > > > > > since we do not adjust the scalar compute during vectorization
> > > > > > (but expect it to be dead) the scalar compute will compute the
> > > > > > wrong thing (as shown by the reduction example - I suspect
> > > > > > inductions will suffer
> > > > from the same problem).
> > > > > >
> > > > > > > At the moment it just does `init + (final - init) * vf` which is correct no?
> > > > > >
> > > > > > The issue is that 'final' is not computed correctly in the
> > > > > > vectorized loop.  This formula might work for affine evolutions of
> > course.
> > > > > >
> > > > > > Extracting the correct value from the vectorized induction would
> > > > > > be the preferred solution.
> > > > >
> > > > > Ok, so I should be able to just mark IVs as live during
> > > > > process_use if there are multiple exits right? Since it's just
> > > > > gonna be unused on the main exit since we use niters?
> > > > >
> > > > > Because since it's the PHI inside the loop that needs to be marked
> > > > > live I can't just do it for a specific exits no?
> > > > >
> > > > > If I create a copy of the PHI node during peeling for use in early
> > > > > exits and mark it live it won't work no?
> > > >
> > > > I guess I wouldn't actually mark it STMT_VINFO_LIVE_P but somehow
> > > > arrange vectorizable_live_operation to be called, possibly adding a
> > > > edge argument to that as well.
> > > >
> > > > Maybe the thing to do for the moment is to reject vectorization with
> > > > early breaks if there's any (non-STMT_VINFO_LIVE_P?) induction or
> > > > reduction besides the main counting IV one you can already special-case?
> > >
> > > Ok so I did a quick hack with:
> > >
> > >       if (!virtual_operand_p (PHI_RESULT (phi))
> > > 	  && !STMT_VINFO_LIVE_P (phi_info))
> > > 	{
> > > 	  use_operand_p use_p;
> > > 	  imm_use_iterator imm_iter;
> > > 	  bool non_exit_use = false;
> > > 	  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, PHI_RESULT (phi))
> > > 	    if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
> > > 	      for (auto exit : get_loop_exit_edges (loop))
> > > 		{
> > > 		  if (exit == LOOP_VINFO_IV_EXIT (loop_vinfo))
> > > 		    continue;
> > >
> > > 		  if (gimple_bb (USE_STMT (use_p)) != exit->dest)
> > > 		    {
> > > 		      non_exit_use = true;
> > > 		      goto fail;
> > > 		    }
> > > 		}
> > > fail:
> > > 	  if (non_exit_use)
> > > 	    return false;
> > > 	}
> > >
> > > And it does seem to still allow all the cases I want.  I've placed
> > > this in vect_can_advance_ivs_p.
> > >
> > > Does this cover what you meant?
> > >
> > 
> > Ok, I've rewritten this in a nicer form, but doesn't this mean we now block any
> > loop where the index is not live?
> > i.e. we block such simple loops like
> > 
> > #ifndef N
> > #define N 800
> > #endif
> > unsigned vect_a[N];
> > 
> > unsigned test4(unsigned x)
> > {
> >  unsigned ret = 0;
> >  for (int i = 0; i < N; i++)
> >  {
> >    if (vect_a[i]*2 != x)
> >      break;
> >    vect_a[i] = x;
> >  }
> >  return ret;
> > }
> > 
> > because it does a simple `break`.  If I force it to be live it works, but then I need
> > to differentiate between the counter and the IV.
> > 
> > # i_15 = PHI <i_12(6), 0(2)>
> > # ivtmp_7 = PHI <ivtmp_14(6), 803(2)>
> > 
> > It seems like if we don't want to keep i_15 around (at the moment it will be kept
> > because of its usage in the exit block it won't be DCEd) then we need to mark it
> > live early during analysis.
> > 
> > Most likely if we do this I don't need to care about the "inverted" workflow
> > here at all. What do you think?
> > 
> > Yes that doesn't work for SLP, but I don't think I can get SLP working in the
> > remaining time anyway..
> > 
> > I'll fix reduction and multiple exit live values in the mean time.
> > 
> 
> Ok, so I currently have the following solution.  Let me know if you agree with it
> and I'll polish it up today and tomorrow and respin things.
> 
> 1. During vect_update_ivs_after_vectorizer we no longer touch any PHIs aside from
>      Just updating IVtemps with the expected remaining iteration count.

OK

> 2. During vect_transform_loop after vectorizing any induction or reduction I call vectorizable_live_operation
>      For any phi node that still has any usages in the early exit merge block.

OK, I suppose you need to amend the vectorizable_live_operation API to
tell it whether it works on the early exits or the main exit (and not complain
when !STMT_VINFO_LIVE_P for the early exit case).

> 3. vectorizable_live_operation is taught to have to materialize the same PHI in multiple exits

For the main exit you'd get here via STMT_VINFO_LIVE_P handling and
vect_update_ivs_after_vectorizer would handle the rest.  For the
early exits I think you only have to materialize once (in the merge 
block)?

> 4. vectorizable_reduction or maybe vect_create_epilog_for_reduction need to be modified to for early exits materialize
>     The previous iteration value.

I think you only need to touch vect_create_epilog_for_reduction; the
early exit merge block needs another reduction epilog.  Well, in theory
it is just another vector to reduce, but I'm not sure if the control flow
supports having the same actual epilog for both the main and the early exits.

Richard.

> This seems to work and produces now for the simple loop above:
> 
> .L2:
>         str     q27, [x1, x3]
>         str     q29, [x2, x1]
>         add     x1, x1, 16
>         cmp     x1, 3200
>         beq     .L11
> .L4:
>         ldr     q31, [x2, x1]
>         mov     v28.16b, v30.16b
>         add     v30.4s, v30.4s, v26.4s
>         shl     v31.4s, v31.4s, 1
>         add     v27.4s, v28.4s, v29.4s
>         cmeq    v31.4s, v31.4s, v29.4s
>         not     v31.16b, v31.16b
>         umaxp   v31.4s, v31.4s, v31.4s
>         fmov    x4, d31
>         cbz     x4, .L2
>         fmov    w1, s28
>         mov     w6, 4
> .L3:
> 
> so now the scalar index is no longer kept and it reduces the value from the vector IV in the exit:
> 
> fmov    w1, s28
> 
> Does this work as you expected?
> 
> Thanks,
> Tamar
> 
> > Thanks,
> > Tamar
> > > Thanks,
> > > Tamar
> > >
> > > >
> > > > Richard.
> > > >
> > > > > Tamar
> > > > > >
> > > > > > > Also you missed the question below about how to avoid the
> > > > > > > creation of the block, You ok with changing that?
> > > > > > >
> > > > > > > Thanks,
> > > > > > > Tamar
> > > > > > >
> > > > > > > > Or for now disable early-break for inductions that are not
> > > > > > > > the main exit control IV (in vect_can_advance_ivs_p)?
> > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > It seems your change handles different kinds of
> > > > > > > > > > > > inductions
> > > > > > differently.
> > > > > > > > > > > > Specifically
> > > > > > > > > > > >
> > > > > > > > > > > >       bool ivtemp = gimple_cond_lhs (cond) == iv_var;
> > > > > > > > > > > >       if (restart_loop && ivtemp)
> > > > > > > > > > > >         {
> > > > > > > > > > > >           type = TREE_TYPE (gimple_phi_result (phi));
> > > > > > > > > > > >           ni = build_int_cst (type, vf);
> > > > > > > > > > > >           if (inversed_iv)
> > > > > > > > > > > >             ni = fold_build2 (MINUS_EXPR, type, ni,
> > > > > > > > > > > >                               fold_convert (type, step_expr));
> > > > > > > > > > > >         }
> > > > > > > > > > > >
> > > > > > > > > > > > it looks like for the exit test IV we use either 'VF' or 'VF - step'
> > > > > > > > > > > > as the new value.  That seems to be very odd special
> > > > > > > > > > > > casing for unknown reasons.  And while you adjust
> > > > > > > > > > > > vec_step_op_add, you don't adjust
> > > > > > > > > > > > vect_peel_nonlinear_iv_init (maybe not supported -
> > > > > > > > > > > > better assert
> > > > > > > > > > here).
> > > > > > > > > > >
> > > > > > > > > > > The VF case is for a normal "non-inverted" loop, where
> > > > > > > > > > > if you take an early exit you know that you have to do
> > > > > > > > > > > at most VF
> > > > iterations.
> > > > > > > > > > > The VF
> > > > > > > > > > > - step is to account for the inverted loop control
> > > > > > > > > > > flow where you exit after adjusting the IV already by + step.
> > > > > > > > > >
> > > > > > > > > > But doesn't that assume the IV counts from niter to zero?
> > > > > > > > > > I don't see this special case is actually necessary, no?
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > I needed it because otherwise the scalar loop iterates one
> > > > > > > > > iteration too little So I got a miscompile with the
> > > > > > > > > inverter loop stuff.  I'll look at it again perhaps It can be solved
> > differently.
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Peeling doesn't matter here, since you know you were
> > > > > > > > > > > able to do a vector iteration so it's safe to do VF iterations.
> > > > > > > > > > > So having peeled doesn't affect the remaining iters count.
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Also the vec_step_op_add case will keep the original
> > > > > > > > > > > > scalar IV live even when it is a vectorized induction.
> > > > > > > > > > > > The code recomputing the value from scratch avoids this.
> > > > > > > > > > > >
> > > > > > > > > > > >       /* For non-main exit create an intermediat
> > > > > > > > > > > > edge to get any updated
> > > > > > > > iv
> > > > > > > > > > > >          calculations.  */
> > > > > > > > > > > >       if (needs_interm_block
> > > > > > > > > > > >           && !iv_block
> > > > > > > > > > > >           && (!gimple_seq_empty_p (stmts) ||
> > > > > > > > > > > > !gimple_seq_empty_p
> > > > > > > > > > > > (new_stmts)))
> > > > > > > > > > > >         {
> > > > > > > > > > > >           iv_block = split_edge (update_e);
> > > > > > > > > > > >           update_e = single_succ_edge (update_e->dest);
> > > > > > > > > > > >           last_gsi = gsi_last_bb (iv_block);
> > > > > > > > > > > >         }
> > > > > > > > > > > >
> > > > > > > > > > > > this is also odd, can we adjust the API instead?  I
> > > > > > > > > > > > suppose this is because your computation uses the
> > > > > > > > > > > > original loop IV, if you based the computation off
> > > > > > > > > > > > the initial value only this might not be
> > > > > > > > necessary?
> > > > > > > > > > >
> > > > > > > > > > > No, on the main exit the code updates the value in the
> > > > > > > > > > > loop header and puts the Calculation in the merge block.
> > > > > > > > > > > This works because it only needs to consume PHI nodes
> > > > > > > > > > > in the merge block and things like niters are
> > > > > > > > > > adjusted in the guard block.
> > > > > > > > > > >
> > > > > > > > > > > For an early exit, we don't have a guard block, only
> > > > > > > > > > > the merge
> > > > block.
> > > > > > > > > > > We have to update the PHI nodes in that block,  but
> > > > > > > > > > > can't do so since you can't produce a value and
> > > > > > > > > > > consume it in a PHI node in the same
> > > > > > > > BB.
> > > > > > > > > > > So we need to create the block to put the values in
> > > > > > > > > > > for use in the merge block.  Because there's no "guard"
> > > > > > > > > > > block for early
> > > > exits.
> > > > > > > > > >
> > > > > > > > > > ?  then compute niters in that block as well.
> > > > > > > > >
> > > > > > > > > We can't since it'll not be reachable through the right edge.
> > > > > > > > > What we can do if you want is slightly change peeling, we
> > > > > > > > > currently peel
> > > > > > as:
> > > > > > > > >
> > > > > > > > >   \        \             /
> > > > > > > > >   E1     E2        Normal exit
> > > > > > > > >     \       |          |
> > > > > > > > >        \    |          Guard
> > > > > > > > >           \ |          |
> > > > > > > > >          Merge block
> > > > > > > > >                   |
> > > > > > > > >              Pre Header
> > > > > > > > >
> > > > > > > > > If we instead peel as:
> > > > > > > > >
> > > > > > > > >
> > > > > > > > >   \        \             /
> > > > > > > > >   E1     E2        Normal exit
> > > > > > > > >     \       |          |
> > > > > > > > >        Exit join   Guard
> > > > > > > > >           \ |          |
> > > > > > > > >          Merge block
> > > > > > > > >                   |
> > > > > > > > >              Pre Header
> > > > > > > > >
> > > > > > > > > We can use the exit join block.  This would also mean
> > > > > > > > > vect_update_ivs_after_vectorizer Doesn't need to iterate
> > > > > > > > > over all exits and only really needs to adjust the phi
> > > > > > > > > nodes Coming out of the exit join
> > > > > > > > and guard block.
> > > > > > > > >
> > > > > > > > > Does this work for you?
> > > > > >
> > > > > > Yeah, I think that would work.  But I'd like to sort out the
> > > > > > correctness details of the IV update itself before sorting out
> > > > > > this code
> > > > placement detail.
> > > > > >
> > > > > > Richard.
> > > > > >
> > > > > > > > > Thanks,
> > > > > > > > > Tamar
> > > > > > > > > >
> > > > > > > > > > > The API can be adjusted by always creating the empty
> > > > > > > > > > > block either during
> > > > > > > > > > peeling.
> > > > > > > > > > > That would prevent us from having to do anything special here.
> > > > > > > > > > > Would that work better?  Or I can do it in the loop
> > > > > > > > > > > that iterates over the exits to before the call to
> > > > > > > > > > > vect_update_ivs_after_vectorizer, which I think
> > > > > > > > > > might be more consistent.
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > That said, I wonder why we cannot simply pass in an
> > > > > > > > > > > > adjusted niter which would be niters_vector_mult_vf
> > > > > > > > > > > > - vf and be done with
> > > > > > that?
> > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > We can ofcourse not have this and recompute it from
> > > > > > > > > > > niters itself, however this does affect the epilog code layout.
> > > > > > > > > > > Particularly knowing the static number if iterations
> > > > > > > > > > > left causes it to usually unroll the loop and share
> > > > > > > > > > > some of the computations.  i.e. the scalar code is
> > > > > > > > > > > often more
> > > > > > > > > > efficient.
> > > > > > > > > > >
> > > > > > > > > > > The computation would be niters_vector_mult_vf -
> > > > > > > > > > > iters_done * vf, since the value put Here is the
> > > > > > > > > > > remaining iteration
> > > > count.
> > > > > > > > > > > It's static for early
> > > > > > > > > > exits.
> > > > > > > > > >
> > > > > > > > > > Well, it might be "static" in that it doesn't really
> > > > > > > > > > matter what you use for the epilog main IV initial value
> > > > > > > > > > as long as you are sure you're not going to take that
> > > > > > > > > > exit as you are sure we're going to take one of the
> > > > > > > > > > early exits.  So yeah, the special code is probably OK,
> > > > > > > > > > but it needs a better comment and as said the structure
> > > > > > > > > > of
> > > > > > > > vect_update_ivs_after_vectorizer is a bit hard to follow now.
> > > > > > > > > >
> > > > > > > > > > As said an important part for optimization is to not
> > > > > > > > > > keep the scalar IVs live in the vector loop.
> > > > > > > > > >
> > > > > > > > > > > But can do whatever you prefer here.  Let me know what
> > > > > > > > > > > you prefer for the
> > > > > > > > > > above.
> > > > > > > > > > >
> > > > > > > > > > > Thanks,
> > > > > > > > > > > Tamar
> > > > > > > > > > >
> > > > > > > > > > > > Thanks,
> > > > > > > > > > > > Richard.
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > Regards,
> > > > > > > > > > > > > Tamar
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > It has to do this since you have to perform
> > > > > > > > > > > > > > > the side effects for the non-matching elements still.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Regards,
> > > > > > > > > > > > > > > Tamar
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > +	      if (STMT_VINFO_LIVE_P (phi_info))
> > > > > > > > > > > > > > > > > +		continue;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +	      /* For early break the final loop IV is:
> > > > > > > > > > > > > > > > > +		 init + (final - init) * vf which takes
> > > > > > > > > > > > > > > > > +into account
> > > > > > > > peeling
> > > > > > > > > > > > > > > > > +		 values and non-single steps.  The
> > > main
> > > > > > > > > > > > > > > > > +exit
> > > > > > > > can
> > > > > > > > > > > > > > > > > +use
> > > > > > > > > > > > niters
> > > > > > > > > > > > > > > > > +		 since if you exit from the main exit
> > > > > > > > > > > > > > > > > +you've
> > > > > > > > done
> > > > > > > > > > > > > > > > > +all
> > > > > > > > > > > > vector
> > > > > > > > > > > > > > > > > +		 iterations.  For an early exit we
> > > > > > > > > > > > > > > > > +don't know
> > > > > > > > when
> > > > > > > > > > > > > > > > > +we
> > > > > > > > > > > > exit
> > > > > > > > > > > > > > > > > +so
> > > > > > > > > > > > > > > > we
> > > > > > > > > > > > > > > > > +		 must re-calculate this on the exit.  */
> > > > > > > > > > > > > > > > > +	      tree start_expr = gimple_phi_result (phi);
> > > > > > > > > > > > > > > > > +	      off = fold_build2 (MINUS_EXPR, stype,
> > > > > > > > > > > > > > > > > +				 fold_convert (stype,
> > > > > > > > start_expr),
> > > > > > > > > > > > > > > > > +				 fold_convert (stype,
> > > > > > > > init_expr));
> > > > > > > > > > > > > > > > > +	      /* Now adjust for VF to get the
> > > > > > > > > > > > > > > > > +final
> > > iteration value.
> > > > > > > > */
> > > > > > > > > > > > > > > > > +	      off = fold_build2 (MULT_EXPR, stype, off,
> > > > > > > > > > > > > > > > > +				 build_int_cst (stype,
> > > vf));
> > > > > > > > > > > > > > > > > +	    }
> > > > > > > > > > > > > > > > > +	  else
> > > > > > > > > > > > > > > > > +	    off = fold_build2 (MULT_EXPR, stype,
> > > > > > > > > > > > > > > > > +			       fold_convert (stype,
> > > niters),
> > > > > > > > step_expr);
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > >  	  if (POINTER_TYPE_P (type))
> > > > > > > > > > > > > > > > >  	    ni = fold_build_pointer_plus (init_expr, off);
> > > > > > > > > > > > > > > > >  	  else
> > > > > > > > > > > > > > > > > @@ -2238,6 +2286,8 @@
> > > > > > > > > > > > > > > > > vect_update_ivs_after_vectorizer
> > > > > > > > > > > > > > > > > (loop_vec_info
> > > > > > > > > > > > > > > > loop_vinfo,
> > > > > > > > > > > > > > > > >        /* Don't bother call vect_peel_nonlinear_iv_init.
> > */
> > > > > > > > > > > > > > > > >        else if (induction_type == vect_step_op_neg)
> > > > > > > > > > > > > > > > >  	ni = init_expr;
> > > > > > > > > > > > > > > > > +      else if (restart_loop)
> > > > > > > > > > > > > > > > > +	continue;
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > This looks all a bit complicated - why
> > > > > > > > > > > > > > > > wouldn't we simply always use the PHI result
> > > > > > > > > > > > > > > > when
> > > 'restart_loop'?
> > > > > > > > > > > > > > > > Isn't that the correct old start value in
> > > > > > > > > > > > > > all cases?
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > >        else
> > > > > > > > > > > > > > > > >  	ni = vect_peel_nonlinear_iv_init
> > > > > > > > > > > > > > > > > (&stmts,
> > > init_expr,
> > > > > > > > > > > > > > > > >  					  niters,
> > > step_expr,
> > > > > > @@ -
> > > > > > > > > > 2245,9 +2295,20 @@
> > > > > > > > > > > > > > > > > vect_update_ivs_after_vectorizer
> > > > > > > > > > > > > > > > (loop_vec_info
> > > > > > > > > > > > > > > > > loop_vinfo,
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > >        var = create_tmp_var (type, "tmp");
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > -      last_gsi = gsi_last_bb (exit_bb);
> > > > > > > > > > > > > > > > >        gimple_seq new_stmts = NULL;
> > > > > > > > > > > > > > > > >        ni_name = force_gimple_operand (ni,
> > > > > > > > > > > > > > > > > &new_stmts, false, var);
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +      /* For non-main exit create an
> > > > > > > > > > > > > > > > > + intermediat edge to get any
> > > > > > > > > > > > updated iv
> > > > > > > > > > > > > > > > > +	 calculations.  */
> > > > > > > > > > > > > > > > > +      if (needs_interm_block
> > > > > > > > > > > > > > > > > +	  && !iv_block
> > > > > > > > > > > > > > > > > +	  && (!gimple_seq_empty_p (stmts) ||
> > > > > > > > > > > > > > > > > +!gimple_seq_empty_p
> > > > > > > > > > > > > > > > (new_stmts)))
> > > > > > > > > > > > > > > > > +	{
> > > > > > > > > > > > > > > > > +	  iv_block = split_edge (update_e);
> > > > > > > > > > > > > > > > > +	  update_e = single_succ_edge (update_e-
> > > >dest);
> > > > > > > > > > > > > > > > > +	  last_gsi = gsi_last_bb (iv_block);
> > > > > > > > > > > > > > > > > +	}
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > >        /* Exit_bb shouldn't be empty.  */
> > > > > > > > > > > > > > > > >        if (!gsi_end_p (last_gsi))
> > > > > > > > > > > > > > > > >  	{
> > > > > > > > > > > > > > > > > @@ -3342,8 +3403,26 @@ vect_do_peeling
> > > > > > > > > > > > > > > > > (loop_vec_info loop_vinfo, tree
> > > > > > > > > > > > > > > > niters, tree nitersm1,
> > > > > > > > > > > > > > > > >  	 niters_vector_mult_vf steps.  */
> > > > > > > > > > > > > > > > >        gcc_checking_assert
> > > > > > > > > > > > > > > > > (vect_can_advance_ivs_p
> > > > > > > > (loop_vinfo));
> > > > > > > > > > > > > > > > >        update_e = skip_vector ? e :
> > > > > > > > > > > > > > > > > loop_preheader_edge
> > > > > > (epilog);
> > > > > > > > > > > > > > > > > -      vect_update_ivs_after_vectorizer (loop_vinfo,
> > > > > > > > > > > > niters_vector_mult_vf,
> > > > > > > > > > > > > > > > > -					update_e);
> > > > > > > > > > > > > > > > > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > > > > > > > > > > > > > > > > +	update_e = single_succ_edge (e->dest);
> > > > > > > > > > > > > > > > > +      bool inversed_iv
> > > > > > > > > > > > > > > > > +	= !vect_is_loop_exit_latch_pred
> > > > > > > > (LOOP_VINFO_IV_EXIT
> > > > > > > > > > > > (loop_vinfo),
> > > > > > > > > > > > > > > > > +
> > > LOOP_VINFO_LOOP
> > > > > > > > > > > > (loop_vinfo));
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > You are computing this here and in
> > > > > > > > > > vect_update_ivs_after_vectorizer?
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +      /* Update the main exit first.  */
> > > > > > > > > > > > > > > > > +      vect_update_ivs_after_vectorizer
> > > > > > > > > > > > > > > > > + (loop_vinfo, vf,
> > > > > > > > > > > > > > niters_vector_mult_vf,
> > > > > > > > > > > > > > > > > +					update_e,
> > > > > > > > inversed_iv);
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +      /* And then update the early exits.  */
> > > > > > > > > > > > > > > > > +      for (auto exit : get_loop_exit_edges (loop))
> > > > > > > > > > > > > > > > > +	{
> > > > > > > > > > > > > > > > > +	  if (exit == LOOP_VINFO_IV_EXIT
> > > (loop_vinfo))
> > > > > > > > > > > > > > > > > +	    continue;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +	  vect_update_ivs_after_vectorizer
> > > > > > > > > > > > > > > > > +(loop_vinfo, vf,
> > > > > > > > > > > > > > > > > +
> > > > > > > > niters_vector_mult_vf,
> > > > > > > > > > > > > > > > > +					    exit, true);
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > ... why does the same not work here?
> > > > > > > > > > > > > > > > Wouldn't the proper condition be
> > > > > > > > > > > > > > > > !dominated_by_p (CDI_DOMINATORS,
> > > > > > > > > > > > > > > > exit->src, LOOP_VINFO_IV_EXIT
> > > > > > > > > > > > > > > > (loop_vinfo)->src) or similar?  That is,
> > > > > > > > > > > > > > > > whether the exit is at or after the main IV exit?
> > > > > > > > > > > > > > > > (consider having
> > > > > > > > > > > > > > > > two)
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > +	}
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > >        if (skip_epilog)
> > > > > > > > > > > > > > > > >  	{
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > --
> > > > > > > > > > > > > > Richard Biener <rguenther@suse.de> SUSE Software
> > > > > > > > > > > > > > Solutions Germany GmbH, Frankenstrasse 146,
> > > > > > > > > > > > > > 90461 Nuernberg, Germany;
> > > > > > > > > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich;
> > > > > > > > > > > > > > (HRB 36809, AG
> > > > > > > > > > > > > > Nuernberg)
> > > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > --
> > > > > > > > > > > > Richard Biener <rguenther@suse.de> SUSE Software
> > > > > > > > > > > > Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > > > > > > > > > > Nuernberg, Germany;
> > > > > > > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich;
> > > > > > > > > > > > (HRB 36809, AG
> > > > > > > > > > > > Nuernberg)
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > --
> > > > > > > > > > Richard Biener <rguenther@suse.de> SUSE Software
> > > > > > > > > > Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > > > > > > > > Nuernberg, Germany;
> > > > > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB
> > > > > > > > > > 36809, AG
> > > > > > > > > > Nuernberg)
> > > > > > > > >
> > > > > > > >
> > > > > > > > --
> > > > > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions
> > > > > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809,
> > > > > > > > AG
> > > > > > > > Nuernberg)
> > > > > > >
> > > > > >
> > > > > > --
> > > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions
> > > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > > > > Nuernberg)
> > > > >
> > > >
> > > > --
> > > > Richard Biener <rguenther@suse.de>
> > > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > > Nuernberg, Germany;
> > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > > Nuernberg)
>
Tamar Christina Nov. 20, 2023, 9:54 p.m. UTC | #19
> >
> > Ok, so I currently have the following solution.  Let me know if you
> > agree with it and I'll polish it up today and tomorrow and respin things.
> >
> > 1. During vect_update_ivs_after_vectorizer we no longer touch any PHIs
> aside from
> >      Just updating IVtemps with the expected remaining iteration count.
> 
> OK
> 
> > 2. During vect_transform_loop after vectorizing any induction or reduction I
> call vectorizable_live_operation
> >      For any phi node that still has any usages in the early exit merge block.
> 
> OK, I suppose you need to amend the vectorizable_live_operation API to tell it
> it works for the early exits or the main exit (and not complain when
> !STMT_VINFO_LIVE_P for the early exit case).
> 
> > 3. vectorizable_live_operation is taught to have to materialize the
> > same PHI in multiple exits
> 
> For the main exit you'd get here via STMT_VINFO_LIVE_P handling and
> vect_update_ivs_after_vectorizer would handle the rest.  For the early exits I
> think you only have to materialize once (in the merge block)?
> 
> > 4. vectorizable_reduction or maybe vect_create_epilog_for_reduction need
> to be modified to for early exits materialize
> >     The previous iteration value.
> 
> I think you need to only touch vect_create_epilog_for_reduction, the early exit
> merge block needs another reduction epilog.  Well, in theory just another
> vector to reduce but not sure if the control flow supports having the same
> actual epilog for both the main and the early exits.
> 
> Richard.

Good morning,

Here's the much cleaner respun patch:

This changes the PHI node updates to support early breaks.
It has to support both the case where the loop's exit matches the normal loop
exit and one where the early exit is "inverted", i.e. it's an early exit edge.

In the latter case we must always restart the loop for VF iterations.  For an
early exit the reason is obvious, but there are cases where the "normal" exit
is located before the early one.  This exit then does a check on ivtmp resulting
in us leaving the loop since it thinks we're done.

In these cases we may still have side-effects to perform, so we also go to the
scalar loop.

For the "normal" exit niters has already been adjusted for peeling, for the
early exits we must find out how many iterations we actually did.  So we have
to recalculate the new position for each exit.

For the "inverse" case I know what to do, but I wanted to ask where you wanted
it.  For inverted cases like ./gcc/testsuite/gcc.dg/vect/vect-early-break_70.c

the requirement is that any PHI value aside from the IV needs to be the value
of the early exit. i.e. the value of the incomplete exit as there's no iteration
that is "complete".

The IV should become:  niters - (((niters / vf) - 1) * vf)

So e.g. on a loop with niters = 17 and VF 4 it becomes
17 - (((17 / 4) - 1) * 4) = 5.  This addresses the odd +step you had commented
on before.

To do these two I can either modify vect_update_ivs_after_vectorizer, or add
a smaller utility function that patches up this case if we want to keep
vect_update_ivs_after_vectorizer simple.

Which do you prefer?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-loop-manip.cc (vect_set_loop_condition_normal): Hide unused.
	(vect_update_ivs_after_vectorizer): Support early break.
	(vect_do_peeling): Use it.
	(vect_is_loop_exit_latch_pred): New.
	* tree-vectorizer.h (vect_is_loop_exit_latch_pred): New.

--- inline copy of patch ---

diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 5ab883fdeebf1917979fe44eb16356aaef637df7..5751aa6295ca052534cef1984a26c65994a57389 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -1200,7 +1200,7 @@ vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
    loop handles exactly VF scalars per iteration.  */
 
 static gcond *
-vect_set_loop_condition_normal (loop_vec_info loop_vinfo, edge exit_edge,
+vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
 				class loop *loop, tree niters, tree step,
 				tree final_iv, bool niters_maybe_zero,
 				gimple_stmt_iterator loop_cond_gsi)
@@ -1407,6 +1407,17 @@ vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo
 		     (gimple *) cond_stmt);
 }
 
+/* Determine if the exit chosen by the loop vectorizer differs from the
+   natural loop exit.  i.e. if the exit leads to the loop latch or not.
+   When this happens we need to flip the understanding of main and other
+   exits by peeling and IV updates.  */
+
+bool
+vect_is_loop_exit_latch_pred (edge loop_exit, class loop *loop)
+{
+  return single_pred (loop->latch) == loop_exit->src;
+}
+
 /* Given LOOP this function generates a new copy of it and puts it
    on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
@@ -2134,6 +2145,10 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
                   The phi args associated with the edge UPDATE_E in the bb
                   UPDATE_E->dest are updated accordingly.
 
+     - MULTIPLE_EXIT - Indicates whether the scalar loop needs to restart the
+		       iteration count where the vector loop began.
+     - EXIT_BB - The basic block to insert any new statement for UPDATE_E into.
+
      Assumption 1: Like the rest of the vectorizer, this function assumes
      a single loop exit that has a single predecessor.
 
@@ -2152,17 +2167,14 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
 
 static void
 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
-				  tree niters, edge update_e)
+				  tree niters, edge update_e,
+				  bool multiple_exit, basic_block exit_bb)
 {
   gphi_iterator gsi, gsi1;
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   basic_block update_bb = update_e->dest;
-
-  basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
-
-  /* Make sure there exists a single-predecessor exit bb:  */
-  gcc_assert (single_pred_p (exit_bb));
-  gcc_assert (single_succ_edge (exit_bb) == update_e);
+  gcond *cond = get_loop_exit_condition (LOOP_VINFO_IV_EXIT (loop_vinfo));
+  gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
 
   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
@@ -2172,7 +2184,6 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
       tree step_expr, off;
       tree type;
       tree var, ni, ni_name;
-      gimple_stmt_iterator last_gsi;
 
       gphi *phi = gsi.phi ();
       gphi *phi1 = gsi1.phi ();
@@ -2204,11 +2215,27 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
       enum vect_induction_op_type induction_type
 	= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
 
-      if (induction_type == vect_step_op_add)
+      tree iv_var = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
+      /* create_iv always places it on the LHS.  Alternatively we can set a
+	 property during create_iv to identify it.  */
+      bool ivtemp = gimple_cond_lhs (cond) == iv_var;
+      if (multiple_exit && ivtemp)
+	{
+	  type = TREE_TYPE (gimple_phi_result (phi));
+	  ni = build_int_cst (type, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+	}
+      else if (induction_type == vect_step_op_add)
 	{
+
 	  tree stype = TREE_TYPE (step_expr);
-	  off = fold_build2 (MULT_EXPR, stype,
-			     fold_convert (stype, niters), step_expr);
+
+	  /* Early exits always use last iter value not niters. */
+	  if (multiple_exit)
+	    continue;
+	  else
+	    off = fold_build2 (MULT_EXPR, stype,
+			       fold_convert (stype, niters), step_expr);
+
 	  if (POINTER_TYPE_P (type))
 	    ni = fold_build_pointer_plus (init_expr, off);
 	  else
@@ -2227,9 +2254,9 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
 
       var = create_tmp_var (type, "tmp");
 
-      last_gsi = gsi_last_bb (exit_bb);
       gimple_seq new_stmts = NULL;
       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
+
       /* Exit_bb shouldn't be empty.  */
       if (!gsi_end_p (last_gsi))
 	{
@@ -3324,8 +3351,31 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 	 niters_vector_mult_vf steps.  */
       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
       update_e = skip_vector ? e : loop_preheader_edge (epilog);
+      edge alt_exit;
+      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	{
+	  for (auto exit : get_loop_exit_edges (loop))
+	    if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
+	      {
+		alt_exit = single_succ_edge (exit->dest);
+		break;
+	      }
+	  update_e = single_succ_edge (e->dest);
+	}
+      bool inversed_iv
+	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT (loop_vinfo),
+					 LOOP_VINFO_LOOP (loop_vinfo));
+
+      /* Update the main exit first.  */
       vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
-					update_e);
+					update_e, inversed_iv,
+					LOOP_VINFO_IV_EXIT (loop_vinfo)->dest);
+
+      /* And then update the early exits, we only need to update the alt exit
+	 merge edge, but have to find it first.  */
+      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
+					  alt_exit, true, alt_exit->src);
 
       if (skip_epilog)
 	{
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 39aa4d1250efe308acccf484d370f8adfd1ba843..22a8c3d384d7ae1ca93079b64f2d40821b4a3c56 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2224,6 +2224,7 @@ extern dump_user_location_t find_loop_location (class loop *);
 extern bool vect_can_advance_ivs_p (loop_vec_info);
 extern void vect_update_inits_of_drs (loop_vec_info, tree, tree_code);
 extern edge vec_init_loop_exit_info (class loop *);
+extern bool vect_is_loop_exit_latch_pred (edge, class loop *);
 
 /* In tree-vect-stmts.cc.  */
 extern tree get_related_vectype_for_scalar_type (machine_mode, tree,
Tamar Christina Nov. 24, 2023, 10:18 a.m. UTC | #20
Hi,

Having simplified peeling this patch becomes smaller as well:

This changes the PHI node updates to support early breaks.
It has to support both the case where the loop's exit matches the normal loop
exit and one where the early exit is "inverted", i.e. it's an early exit edge.

In the latter case we must always restart the loop for VF iterations.  For an
early exit the reason is obvious, but there are cases where the "normal" exit
is located before the early one.  This exit then does a check on ivtmp resulting
in us leaving the loop since it thinks we're done.

In these cases we may still have side-effects to perform, so we also go to the
scalar loop.

For the "normal" exit niters has already been adjusted for peeling, for the
early exits we must find out how many iterations we actually did.  So we have
to recalculate the new position for each exit.

For the "inverse" case we essentially peel a vector iteration *after* the vector
loop has finished. i.e. conceptually it's the same as vect epilogue peeling but
without generating code for the peeled iteration.  That'll be handled by the
scalar loop.

To do this we just adjust niters_vector_mult_vf by subtracting one VF, and for
masked cases we do the same with final_iv.

The normal IV update code will then generate the correct values for us.
Eventually VRP will simplify the constant bounds and we get the proper scalar
unrolling.  This means we don't have to make any changes at all to
vect_update_ivs_after_vectorizer other than dropping some asserts.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-loop-manip.cc (vect_set_loop_condition_partial_vectors,
	vect_set_loop_condition_partial_vectors_avx512,
	vect_gen_vector_loop_niters_mult_vf): Support peeling a vector
	iteration.
	(vect_update_ivs_after_vectorizer): Drop asserts.
	(vect_do_peeling): Skip forwarder edge.
	(vect_is_loop_exit_latch_pred): New.
	* tree-vectorizer.h (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED): New.

--- inline copy of patch ---

diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index d61d7c3a189b279fc3bcbb58c3c0e32521db3cf8..476be8a0bb6da2d06c4ca7052cb07bacecca60b1 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -951,7 +951,18 @@ vect_set_loop_condition_partial_vectors (class loop *loop, edge exit_edge,
 
   if (final_iv)
     {
-      gassign *assign = gimple_build_assign (final_iv, orig_niters);
+      gassign *assign;
+      /* If vectorizing an inverted early break loop we have to restart the
+	 scalar loop at niters - vf.  This matches what we do in
+	 vect_gen_vector_loop_niters_mult_vf for non-masked loops.  */
+      if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
+	{
+	  tree ftype = TREE_TYPE (orig_niters);
+	  tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+	  assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
+	}
+       else
+	assign = gimple_build_assign (final_iv, orig_niters);
       gsi_insert_on_edge_immediate (exit_edge, assign);
     }
 
@@ -1188,8 +1199,19 @@ vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
 
   if (final_iv)
     {
-      gassign *assign = gimple_build_assign (final_iv, orig_niters);
-      gsi_insert_on_edge_immediate (single_exit (loop), assign);
+      gassign *assign;
+      /* If vectorizing an inverted early break loop we have to restart the
+	 scalar loop at niters - vf.  This matches what we do in
+	 vect_gen_vector_loop_niters_mult_vf for non-masked loops.  */
+      if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
+	{
+	  tree ftype = TREE_TYPE (orig_niters);
+	  tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+	  assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
+	}
+       else
+	assign = gimple_build_assign (final_iv, orig_niters);
+      gsi_insert_on_edge_immediate (exit_edge, assign);
     }
 
   return cond_stmt;
@@ -2157,11 +2179,8 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
   gphi_iterator gsi, gsi1;
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   basic_block update_bb = update_e->dest;
-
   basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
-
-  /* Make sure there exists a single-predecessor exit bb:  */
-  gcc_assert (single_pred_p (exit_bb));
+  gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
 
   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
@@ -2171,7 +2190,6 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
       tree step_expr, off;
       tree type;
       tree var, ni, ni_name;
-      gimple_stmt_iterator last_gsi;
 
       gphi *phi = gsi.phi ();
       gphi *phi1 = gsi1.phi ();
@@ -2207,7 +2225,8 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
 	{
 	  tree stype = TREE_TYPE (step_expr);
 	  off = fold_build2 (MULT_EXPR, stype,
-			     fold_convert (stype, niters), step_expr);
+			       fold_convert (stype, niters), step_expr);
+
 	  if (POINTER_TYPE_P (type))
 	    ni = fold_build_pointer_plus (init_expr, off);
 	  else
@@ -2226,9 +2245,9 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
 
       var = create_tmp_var (type, "tmp");
 
-      last_gsi = gsi_last_bb (exit_bb);
       gimple_seq new_stmts = NULL;
       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
+
       /* Exit_bb shouldn't be empty.  */
       if (!gsi_end_p (last_gsi))
 	{
@@ -2726,11 +2745,19 @@ vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
   tree type = TREE_TYPE (niters_vector);
   tree log_vf = build_int_cst (type, exact_log2 (vf));
+  tree tree_vf = build_int_cst (type, vf);
   basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
 
   gcc_assert (niters_vector_mult_vf_ptr != NULL);
   tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
 					    niters_vector, log_vf);
+
+  /* If we've peeled a vector iteration then subtract one full vector
+     iteration.  */
+  if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
+    niters_vector_mult_vf = fold_build2 (MINUS_EXPR, type,
+					 niters_vector_mult_vf, tree_vf);
+
   if (!is_gimple_val (niters_vector_mult_vf))
     {
       tree var = create_tmp_var (type, "niters_vector_mult_vf");
@@ -3328,6 +3355,10 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 	 niters_vector_mult_vf steps.  */
       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
       update_e = skip_vector ? e : loop_preheader_edge (epilog);
+      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	update_e = single_succ_edge (e->dest);
+
+      /* Update the main exit.  */
       vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
 					update_e);
 
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 39aa4d1250efe308acccf484d370f8adfd1ba843..de60da31e2a3030a7fbc302d3f676af9683fd019 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1016,6 +1016,8 @@ public:
 #define LOOP_VINFO_PEELING_FOR_GAPS(L)     (L)->peeling_for_gaps
 #define LOOP_VINFO_PEELING_FOR_NITER(L)    (L)->peeling_for_niter
 #define LOOP_VINFO_EARLY_BREAKS(L)         (L)->early_breaks
+#define LOOP_VINFO_EARLY_BREAKS_VECT_PEELED(L)  \
+  (single_pred ((L)->loop->latch) != (L)->vec_loop_iv_exit->src)
 #define LOOP_VINFO_EARLY_BRK_CONFLICT_STMTS(L) (L)->early_break_conflict
 #define LOOP_VINFO_EARLY_BRK_DEST_BB(L)    (L)->early_break_dest_bb
 #define LOOP_VINFO_EARLY_BRK_VUSES(L)      (L)->early_break_vuses
@@ -2224,6 +2226,7 @@ extern dump_user_location_t find_loop_location (class loop *);
 extern bool vect_can_advance_ivs_p (loop_vec_info);
 extern void vect_update_inits_of_drs (loop_vec_info, tree, tree_code);
 extern edge vec_init_loop_exit_info (class loop *);
+extern void vect_iv_increment_position (edge, gimple_stmt_iterator *, bool *);
 
 /* In tree-vect-stmts.cc.  */
 extern tree get_related_vectype_for_scalar_type (machine_mode, tree,
Richard Biener Nov. 24, 2023, 12:41 p.m. UTC | #21
On Fri, 24 Nov 2023, Tamar Christina wrote:

> Hi,
> 
> Having simplified peeling this patch becomes smaller as well:
> 
> This changes the PHI node updates to support early breaks.
> It has to support both the case where the loop's exit matches the normal loop
> exit and one where the early exit is "inverted", i.e. it's an early exit edge.
> 
> In the latter case we must always restart the loop for VF iterations.  For an
> early exit the reason is obvious, but there are cases where the "normal" exit
> is located before the early one.  This exit then does a check on ivtmp resulting
> in us leaving the loop since it thinks we're done.
> 
> In these cases we may still have side-effects to perform so we also go to the
> scalar loop.
> 
> For the "normal" exit niters has already been adjusted for peeling, for the
> early exits we must find out how many iterations we actually did.  So we have
> to recalculate the new position for each exit.
> 
> For the "inverse" case we essentially peel a vector iteration *after* the vector
> loop has finished. i.e. conceptually it's the same as vect epilogue peeling but
> without generating code for the peeled iteration.  That'll be handled by the
> scalar loop.
> 
> To do this we just adjust niters_vector_mult_vf by removing one VF, and for
> masked cases we do the same with final_iv.
> 
> The normal IV update code will then generate the correct values for us.
> Eventually VRP will simplify the constant bounds and we get the proper scalar
> unrolling.  This means we don't have to make any changes at all to
> vect_update_ivs_after_vectorizer other than dropping some asserts.
> 
> Ok for master?

Nice.  OK.

Thanks,
Richard.

> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* tree-vect-loop-manip.cc (vect_set_loop_condition_partial_vectors,
> 	vect_set_loop_condition_partial_vectors_avx512,
> 	vect_gen_vector_loop_niters_mult_vf): Support peeling a vector
> 	iteration.
> 	(vect_update_ivs_after_vectorizer): Drop asserts.
> 	(vect_do_peeling): Skip forwarder edge.
> 	(vect_is_loop_exit_latch_pred): New.
> 	* tree-vectorizer.h (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED): New.
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index d61d7c3a189b279fc3bcbb58c3c0e32521db3cf8..476be8a0bb6da2d06c4ca7052cb07bacecca60b1 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -951,7 +951,18 @@ vect_set_loop_condition_partial_vectors (class loop *loop, edge exit_edge,
>  
>    if (final_iv)
>      {
> -      gassign *assign = gimple_build_assign (final_iv, orig_niters);
> +      gassign *assign;
> +      /* If vectorizing an inverted early break loop we have to restart the
> +	 scalar loop at niters - vf.  This matches what we do in
> +	 vect_gen_vector_loop_niters_mult_vf for non-masked loops.  */
> +      if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
> +	{
> +	  tree ftype = TREE_TYPE (orig_niters);
> +	  tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
> +	  assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
> +	}
> +       else
> +	assign = gimple_build_assign (final_iv, orig_niters);
>        gsi_insert_on_edge_immediate (exit_edge, assign);
>      }
>  
> @@ -1188,8 +1199,19 @@ vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
>  
>    if (final_iv)
>      {
> -      gassign *assign = gimple_build_assign (final_iv, orig_niters);
> -      gsi_insert_on_edge_immediate (single_exit (loop), assign);
> +      gassign *assign;
> +      /* If vectorizing an inverted early break loop we have to restart the
> +	 scalar loop at niters - vf.  This matches what we do in
> +	 vect_gen_vector_loop_niters_mult_vf for non-masked loops.  */
> +      if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
> +	{
> +	  tree ftype = TREE_TYPE (orig_niters);
> +	  tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
> +	  assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
> +	}
> +       else
> +	assign = gimple_build_assign (final_iv, orig_niters);
> +      gsi_insert_on_edge_immediate (exit_edge, assign);
>      }
>  
>    return cond_stmt;
> @@ -2157,11 +2179,8 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
>    gphi_iterator gsi, gsi1;
>    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
>    basic_block update_bb = update_e->dest;
> -
>    basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> -
> -  /* Make sure there exists a single-predecessor exit bb:  */
> -  gcc_assert (single_pred_p (exit_bb));
> +  gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
>  
>    for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
>         !gsi_end_p (gsi) && !gsi_end_p (gsi1);
> @@ -2171,7 +2190,6 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
>        tree step_expr, off;
>        tree type;
>        tree var, ni, ni_name;
> -      gimple_stmt_iterator last_gsi;
>  
>        gphi *phi = gsi.phi ();
>        gphi *phi1 = gsi1.phi ();
> @@ -2207,7 +2225,8 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
>  	{
>  	  tree stype = TREE_TYPE (step_expr);
>  	  off = fold_build2 (MULT_EXPR, stype,
> -			     fold_convert (stype, niters), step_expr);
> +			       fold_convert (stype, niters), step_expr);
> +
>  	  if (POINTER_TYPE_P (type))
>  	    ni = fold_build_pointer_plus (init_expr, off);
>  	  else
> @@ -2226,9 +2245,9 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
>  
>        var = create_tmp_var (type, "tmp");
>  
> -      last_gsi = gsi_last_bb (exit_bb);
>        gimple_seq new_stmts = NULL;
>        ni_name = force_gimple_operand (ni, &new_stmts, false, var);
> +
>        /* Exit_bb shouldn't be empty.  */
>        if (!gsi_end_p (last_gsi))
>  	{
> @@ -2726,11 +2745,19 @@ vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
>    int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
>    tree type = TREE_TYPE (niters_vector);
>    tree log_vf = build_int_cst (type, exact_log2 (vf));
> +  tree tree_vf = build_int_cst (type, vf);
>    basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
>  
>    gcc_assert (niters_vector_mult_vf_ptr != NULL);
>    tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
>  					    niters_vector, log_vf);
> +
> +  /* If we've peeled a vector iteration then subtract one full vector
> +     iteration.  */
> +  if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
> +    niters_vector_mult_vf = fold_build2 (MINUS_EXPR, type,
> +					 niters_vector_mult_vf, tree_vf);
> +
>    if (!is_gimple_val (niters_vector_mult_vf))
>      {
>        tree var = create_tmp_var (type, "niters_vector_mult_vf");
> @@ -3328,6 +3355,10 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
>  	 niters_vector_mult_vf steps.  */
>        gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
>        update_e = skip_vector ? e : loop_preheader_edge (epilog);
> +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> +	update_e = single_succ_edge (e->dest);
> +
> +      /* Update the main exit.  */
>        vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
>  					update_e);
>  
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 39aa4d1250efe308acccf484d370f8adfd1ba843..de60da31e2a3030a7fbc302d3f676af9683fd019 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -1016,6 +1016,8 @@ public:
>  #define LOOP_VINFO_PEELING_FOR_GAPS(L)     (L)->peeling_for_gaps
>  #define LOOP_VINFO_PEELING_FOR_NITER(L)    (L)->peeling_for_niter
>  #define LOOP_VINFO_EARLY_BREAKS(L)         (L)->early_breaks
> +#define LOOP_VINFO_EARLY_BREAKS_VECT_PEELED(L)  \
> +  (single_pred ((L)->loop->latch) != (L)->vec_loop_iv_exit->src)
>  #define LOOP_VINFO_EARLY_BRK_CONFLICT_STMTS(L) (L)->early_break_conflict
>  #define LOOP_VINFO_EARLY_BRK_DEST_BB(L)    (L)->early_break_dest_bb
>  #define LOOP_VINFO_EARLY_BRK_VUSES(L)      (L)->early_break_vuses
> @@ -2224,6 +2226,7 @@ extern dump_user_location_t find_loop_location (class loop *);
>  extern bool vect_can_advance_ivs_p (loop_vec_info);
>  extern void vect_update_inits_of_drs (loop_vec_info, tree, tree_code);
>  extern edge vec_init_loop_exit_info (class loop *);
> +extern void vect_iv_increment_position (edge, gimple_stmt_iterator *, bool *);
>  
>  /* In tree-vect-stmts.cc.  */
>  extern tree get_related_vectype_for_scalar_type (machine_mode, tree,
>
diff mbox series

Patch

--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -1187,7 +1187,7 @@  vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
    loop handles exactly VF scalars per iteration.  */
 
 static gcond *
-vect_set_loop_condition_normal (loop_vec_info loop_vinfo, edge exit_edge,
+vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
 				class loop *loop, tree niters, tree step,
 				tree final_iv, bool niters_maybe_zero,
 				gimple_stmt_iterator loop_cond_gsi)
@@ -1452,7 +1452,7 @@  slpeel_duplicate_current_defs_from_edges (edge from, edge to)
    When this happens we need to flip the understanding of main and other
    exits by peeling and IV updates.  */
 
-bool
+bool inline
 vect_is_loop_exit_latch_pred (edge loop_exit, class loop *loop)
 {
   return single_pred (loop->latch) == loop_exit->src;
@@ -2193,6 +2193,7 @@  vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
      Input:
      - LOOP - a loop that is going to be vectorized. The last few iterations
               of LOOP were peeled.
+     - VF   - The chosen vectorization factor for LOOP.
      - NITERS - the number of iterations that LOOP executes (before it is
                 vectorized). i.e, the number of times the ivs should be bumped.
      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
@@ -2203,6 +2204,9 @@  vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
                   The phi args associated with the edge UPDATE_E in the bb
                   UPDATE_E->dest are updated accordingly.
 
+     - MAIN_EXIT_P - Indicates whether UPDATE_E is what the vectorizer
+		     considers the main loop exit.
+
      Assumption 1: Like the rest of the vectorizer, this function assumes
      a single loop exit that has a single predecessor.
 
@@ -2220,18 +2224,21 @@  vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
  */
 
 static void
-vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
-				  tree niters, edge update_e)
+vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, poly_uint64 vf,
+				  tree niters, edge update_e, bool main_exit_p)
 {
   gphi_iterator gsi, gsi1;
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   basic_block update_bb = update_e->dest;
+  bool inversed_iv
+	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT (loop_vinfo),
+					 LOOP_VINFO_LOOP (loop_vinfo));
 
-  basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
-
-  /* Make sure there exists a single-predecessor exit bb:  */
-  gcc_assert (single_pred_p (exit_bb));
-  gcc_assert (single_succ_edge (exit_bb) == update_e);
+  edge loop_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
+  gcond *cond = get_loop_exit_condition (loop_e);
+  basic_block exit_bb = loop_e->dest;
+  basic_block iv_block = NULL;
+  gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
 
   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
@@ -2241,7 +2248,6 @@  vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
       tree step_expr, off;
       tree type;
       tree var, ni, ni_name;
-      gimple_stmt_iterator last_gsi;
 
       gphi *phi = gsi.phi ();
       gphi *phi1 = gsi1.phi ();
@@ -2273,11 +2279,52 @@  vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
       enum vect_induction_op_type induction_type
 	= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
 
-      if (induction_type == vect_step_op_add)
+      tree iv_var = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
+      /* create_iv always places it on the LHS.  Alternatively we can set a
+	 property during create_iv to identify it.  */
+      bool ivtemp = gimple_cond_lhs (cond) == iv_var;
+      if ((!main_exit_p || inversed_iv) && ivtemp)
+	{
+	  step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
+	  type = TREE_TYPE (gimple_phi_result (phi));
+	  ni = build_int_cst (type, vf);
+	}
+      else if (!main_exit_p && inversed_iv)
+	continue;
+      else if (induction_type == vect_step_op_add)
 	{
+
 	  tree stype = TREE_TYPE (step_expr);
-	  off = fold_build2 (MULT_EXPR, stype,
-			     fold_convert (stype, niters), step_expr);
+
+	  /* Early exits always use last iter value not niters. */
+	  if (!main_exit_p || (main_exit_p && inversed_iv))
+	    {
+	      /* Live statements in the non-main exit shouldn't be adjusted.  We
+		 normally didn't have this problem with a single exit as live
+		 values would be in the exit block.  However when dealing with
+		 multiple exits all exits are redirected to the merge block
+		 and we restart the iteration.  */
+	      if (STMT_VINFO_LIVE_P (phi_info))
+		continue;
+
+	      /* For early break the final loop IV is:
+		 init + (final - init) * vf which takes into account peeling
+		 values and non-single steps.  The main exit can use niters
+		 since if you exit from the main exit you've done all vector
+		 iterations.  For an early exit we don't know when we exit so we
+		 must re-calculate this on the exit.  */
+	      tree start_expr = gimple_phi_result (phi);
+	      off = fold_build2 (MINUS_EXPR, stype,
+				 fold_convert (stype, start_expr),
+				 fold_convert (stype, init_expr));
+	      /* Now adjust for VF to get the final iteration value.  */
+	      off = fold_build2 (MULT_EXPR, stype, off,
+				 build_int_cst (stype, vf));
+	    }
+	  else
+	    off = fold_build2 (MULT_EXPR, stype,
+			       fold_convert (stype, niters), step_expr);
+
 	  if (POINTER_TYPE_P (type))
 	    ni = fold_build_pointer_plus (init_expr, off);
 	  else
@@ -2289,6 +2336,8 @@  vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
       /* Don't bother call vect_peel_nonlinear_iv_init.  */
       else if (induction_type == vect_step_op_neg)
 	ni = init_expr;
+      else if (!main_exit_p)
+	continue;
       else
 	ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
 					  niters, step_expr,
@@ -2296,9 +2345,20 @@  vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
 
       var = create_tmp_var (type, "tmp");
 
-      last_gsi = gsi_last_bb (exit_bb);
       gimple_seq new_stmts = NULL;
       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
+
+      /* For non-main exit create an intermediate edge to get any updated iv
+	 calculations.  */
+      if (!main_exit_p
+	  && !iv_block
+	  && (!gimple_seq_empty_p (stmts) || !gimple_seq_empty_p (new_stmts)))
+	{
+	  iv_block = split_edge (update_e);
+	  update_e = single_succ_edge (update_e->dest);
+	  last_gsi = gsi_last_bb (iv_block);
+	}
+
       /* Exit_bb shouldn't be empty.  */
       if (!gsi_end_p (last_gsi))
 	{
@@ -2836,12 +2896,18 @@  find_guard_arg (class loop *loop ATTRIBUTE_UNUSED, const_edge loop_e,
 	 tree var = PHI_ARG_DEF (phi, loop_e->dest_idx);
 	 if (TREE_CODE (var) != SSA_NAME)
 	    continue;
-	 tree def = get_current_def (var);
-	 if (!def)
-	   continue;
-	 if (operand_equal_p (def,
-			      PHI_ARG_DEF (lcssa_phi, lcssa_edge), 0))
-	   return PHI_RESULT (phi);
+
+	  /* The value could be carried all the way from the loop version block
+	     in which case we wouldn't have kept the value if it's not used in
+	     the loop.  In such cases get_current_def returns null as the value
+	     is already current.  */
+	  tree orig_var = get_current_def (var);
+	  if (!orig_var)
+	    orig_var = var;
+
+	  if (operand_equal_p (orig_var,
+			       PHI_ARG_DEF (lcssa_phi, lcssa_edge), 0))
+	    return PHI_RESULT (phi);
 	}
     }
   return NULL_TREE;
@@ -3528,8 +3594,21 @@  vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 	 niters_vector_mult_vf steps.  */
       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
       update_e = skip_vector ? e : loop_preheader_edge (epilog);
-      vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
-					update_e);
+      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	update_e = single_succ_edge (e->dest);
+      bool inversed_iv
+	= !vect_is_loop_exit_latch_pred (LOOP_VINFO_IV_EXIT (loop_vinfo),
+					 LOOP_VINFO_LOOP (loop_vinfo));
+
+      for (auto exit : get_loop_exit_edges (loop))
+	{
+	  bool main_exit_p = vect_is_loop_exit_latch_pred (exit, loop);
+	  edge exit_e = main_exit_p ? update_e : exit;
+	  vect_update_ivs_after_vectorizer (loop_vinfo, vf,
+					    niters_vector_mult_vf, exit_e,
+					    main_exit_p);
+
+	}
 
       if (skip_epilog)
 	{