diff mbox series

Fix profile updates in copy-header

Message ID ZKAJV9rjbWo3XFWb@kam.mff.cuni.cz
State New
Headers show
Series Fix profile updates in copy-header | expand

Commit Message

Jan Hubicka July 1, 2023, 11:09 a.m. UTC
Hi,
most common source of profile mismatches is now copyheader pass.  The reason is that
in comon case the duplicated header condition will become constant true and that needs
changes in the loop exit condition probability.

While this can be done by jump threading it is not, since it gives up on loops.
Copy header pass now has logic to prove that first exit will become true, so this
patch adds necessary pumbing to the profile updating.
This is done in gimple_duplicate_sese_region in a way that is specific for this
particular case.  I think general case is kind-of unsolvable and loop-ch is the
only user of the infrastructure.  If we later invent some new users, maybe we
can export the region and region_copy arrays and let user to do the update.

With the patch we now get:

Pass dump id and name            |static mismat|dynamic mismatch          
                                 |in count     |in count                  
107t cunrolli                    |      3    +3|        19237       +19237
127t ch                          |     13   +10|        19237             
131t dom                         |     39   +26|        19237             
133t isolate-paths               |     47    +8|        19237             
134t reassoc                     |     49    +2|        19237             
136t forwprop                    |     53    +4|       226943      +207706
159t cddce                       |     61    +8|       242222       +15279
161t ldist                       |     62    +1|       242222             
172t ifcvt                       |     66    +4|       415472      +173250
173t vect                        |    143   +77|     10859784    +10444312
176t cunroll                     |    294  +151|    150357763   +139497979
183t loopdone                    |    291    -3|    150289533       -68230
194t tracer                      |    322   +31|    153230990     +2941457
195t fre                         |    317    -5|    153230990             
197t dom                         |    286   -31|    154448079     +1217089
199t threadfull                  |    293    +7|    154724763      +276684
200t vrp                         |    297    +4|    155042448      +317685
204t dce                         |    294    -3|    155017073       -25375
206t sink                        |    292    -2|    155017073             
211t cddce                       |    298    +6|    155018657        +1584
255t optimized                   |    296    -2|    155018657             
256r expand                      |    273   -23|    154592622      -426035
258r into_cfglayout              |    268    -5|    154592661          +39
275r loop2_unroll                |    272    +4|    159701866     +5109205
291r ce2                         |    270    -2|    159723509             
312r pro_and_epilogue            |    290   +20|    159792505       +68996
315r jump2                       |    296    +6|    164234016     +4441511
323r bbro                        |    294    -2|    159385430     -4848586

So ch introduces 10 new mismatches while originally it did 308.  At bbro the
number of mismatches dropped from 432 to 294.
Most offender is now cunroll pass. I think it is the case where loop has multiple
exits and one of exits becomes to be false in all but last peeled iteration.

This is another case where non-trivial loop update is needed.

Honza

gcc/ChangeLog:

	* tree-cfg.cc (gimple_duplicate_sese_region): Add elliminated_edge
	parmaeter; update profile.
	* tree-cfg.h (gimple_duplicate_sese_region): Update prototype.
	* tree-ssa-loop-ch.cc (entry_loop_condition_is_static): Rename to ...
	(static_loop_exit): ... this; return the edge to be elliminated.
	(ch_base::copy_headers): Handle profile updating for eliminated exits.

gcc/testsuite/ChangeLog:

	* gcc.dg/tree-ssa/ifc-20040816-1.c: Reduce number of mismatches
	from 2 to 1.
	* gcc.dg/tree-ssa/loop-ch-profile-1.c: New test.
	* gcc.dg/tree-ssa/loop-ch-profile-2.c: New test.
diff mbox series

Patch

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ifc-20040816-1.c b/gcc/testsuite/gcc.dg/tree-ssa/ifc-20040816-1.c
index f8a6495cbaa..b55a533e374 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ifc-20040816-1.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ifc-20040816-1.c
@@ -39,4 +39,4 @@  int main1 ()
    which is folded by vectorizer.  Both outgoing edges must have probability
    100% so the resulting profile match after folding.  */
 /* { dg-final { scan-tree-dump-times "Invalid sum of outgoing probabilities 200.0" 1 "ifcvt" } } */
-/* { dg-final { scan-tree-dump-times "Invalid sum of incoming counts" 2 "ifcvt" } } */
+/* { dg-final { scan-tree-dump-times "Invalid sum of incoming counts" 1 "ifcvt" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-ch-profile-1.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-ch-profile-1.c
new file mode 100644
index 00000000000..e8bab62b0d9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-ch-profile-1.c
@@ -0,0 +1,10 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O1 -fdump-tree-ch2-blocks-details -fdump-tree-optimized" } */
+void foo ();
+void test(int v, int q)
+{
+	for (int i = 0; i < 10 && v/q; i++)
+		foo ();
+}
+/* { dg-final { scan-tree-dump-not "Invalid sum" "ch2"} } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized"} } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-ch-profile-2.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-ch-profile-2.c
new file mode 100644
index 00000000000..99d22ba6213
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-ch-profile-2.c
@@ -0,0 +1,13 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O1 -fdump-tree-ch2-blocks-details -fdump-tree-optimized" } */
+void foo ();
+void test()
+{
+	for (int i = 0; i < 10; i++)
+		foo ();
+}
+/* We should figure out that after header dulication loop iterates 9 times.  */
+/* { dg-final { scan-tree-dump "90.00" "ch2"} } */
+/* { dg-final { scan-tree-dump "10.00" "ch2"} } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "ch2"} } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized"} } */
diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc
index 30f26af69f2..4989906706c 100644
--- a/gcc/tree-cfg.cc
+++ b/gcc/tree-cfg.cc
@@ -6658,13 +6658,17 @@  add_phi_args_after_copy (basic_block *region_copy, unsigned n_region,
    blocks are stored to REGION_COPY in the same order as they had in REGION,
    provided that REGION_COPY is not NULL.
    The function returns false if it is unable to copy the region,
-   true otherwise.  */
+   true otherwise.
+
+   ELIMINATED_EDGE is an edge that is known to be removed in the dupicated
+   region.  */
 
 bool
 gimple_duplicate_sese_region (edge entry, edge exit,
-			    basic_block *region, unsigned n_region,
-			    basic_block *region_copy,
-			    bool update_dominance)
+			      basic_block *region, unsigned n_region,
+			      basic_block *region_copy,
+			      bool update_dominance,
+			      edge eliminated_edge)
 {
   unsigned i;
   bool free_region_copy = false, copying_header = false;
@@ -6743,11 +6747,92 @@  gimple_duplicate_sese_region (edge entry, edge exit,
 	    split_edge_bb_loc (entry), update_dominance);
   if (total_count.initialized_p () && entry_count.initialized_p ())
     {
-      scale_bbs_frequencies_profile_count (region, n_region,
-				           total_count - entry_count,
-				           total_count);
-      scale_bbs_frequencies_profile_count (region_copy, n_region, entry_count,
-				           total_count);
+      if (!eliminated_edge)
+	{
+	  scale_bbs_frequencies_profile_count (region, n_region,
+					       total_count - entry_count,
+					       total_count);
+	  scale_bbs_frequencies_profile_count (region_copy, n_region,
+					       entry_count, total_count);
+	}
+      else
+	{
+	  /* We only support only case where eliminated_edge is one and it
+	     exists first BB.  We also assume that the duplicated region is
+	     acyclic.  So we expect the following:
+
+	       // region_copy_start entry will be scaled to entry_count
+		 if (cond1)         <- this condition will become false
+				       and we update probabilities
+		   goto loop_exit;
+		 if (cond2)
+		   goto loop_exit;
+		 goto loop_header   <- this will be redirected to loop.
+	       // region_copy_end
+	     loop:
+		       <body>
+	       // region start
+	     loop_header:
+		       if (cond1)   <- we need to update probabbility here
+			 goto loop_exit;
+		       if (cond2)   <- and determine scaling factor here.
+			 goto loop_exit;
+		       else
+			 goto loop;
+	       // region end
+
+	     Adding support for more exits can be done similarly,
+	     but only consumer so far is tree-ssa-loop-ch and it uses only this
+	     to handle the common case of peeling headers which have
+	     conditionals known to be always true upon entry.  */
+	  gcc_assert (eliminated_edge->src == region[0]
+		      && EDGE_COUNT (region[0]->succs) == 2
+		      && copying_header);
+
+	  edge e, e_copy, eliminated_edge_copy;
+	  if (EDGE_SUCC (region[0], 0) == eliminated_edge)
+	    {
+	      e = EDGE_SUCC (region[0], 1);
+	      e_copy = EDGE_SUCC (region_copy[0], 1);
+	      eliminated_edge_copy = EDGE_SUCC (region_copy[0], 0);
+	    }
+	  else
+	    {
+	      e = EDGE_SUCC (region[0], 0);
+	      e_copy = EDGE_SUCC (region_copy[0], 0);
+	      eliminated_edge_copy = EDGE_SUCC (region_copy[0], 1);
+	    }
+	  gcc_checking_assert (e != e_copy
+			       && eliminated_edge_copy != eliminated_edge
+			       && eliminated_edge_copy->dest
+				  == eliminated_edge->dest);
+
+
+	  /* Handle first basic block in duplicated region as in the
+	     non-eliminating case.  */
+	  scale_bbs_frequencies_profile_count (region_copy, n_region,
+					       entry_count, total_count);
+	  /* Now update redirecting eliminated edge to the other edge.
+	     Actual CFG update is done by caller.  */
+	  e_copy->probability = profile_probability::always ();
+	  eliminated_edge_copy->probability = profile_probability::never ();
+	  /* Header copying is a special case of jump threading, so use
+	     common code to update loop body exit condition.  */
+	  update_bb_profile_for_threading (region[0], e_copy->count (), e);
+	  /* If we duplicated more conditionals first scale the profile of
+	     rest of the preheader.  Then work out the probability of
+	     entering the loop and scale rest of the loop.  */
+	  if (n_region > 1)
+	    {
+	      scale_bbs_frequencies_profile_count (region_copy + 1,
+						   n_region - 1,
+						   e_copy->count (),
+						   region_copy[1]->count);
+	      scale_bbs_frequencies_profile_count (region + 1, n_region - 1,
+						   e->count (),
+						   region[1]->count);
+	    }
+	}
     }
 
   if (copying_header)
diff --git a/gcc/tree-cfg.h b/gcc/tree-cfg.h
index 57865c58c90..b9ccd58c3db 100644
--- a/gcc/tree-cfg.h
+++ b/gcc/tree-cfg.h
@@ -70,7 +70,7 @@  extern void add_phi_args_after_copy_bb (basic_block);
 extern void add_phi_args_after_copy (basic_block *, unsigned, edge);
 extern basic_block split_edge_bb_loc (edge);
 extern bool gimple_duplicate_sese_region (edge, edge, basic_block *, unsigned,
-					basic_block *, bool);
+					basic_block *, bool, edge);
 extern bool gimple_duplicate_sese_tail (edge, edge, basic_block *, unsigned,
 				      basic_block *);
 extern void gather_blocks_in_sese_region (basic_block entry, basic_block exit,
diff --git a/gcc/tree-ssa-loop-ch.cc b/gcc/tree-ssa-loop-ch.cc
index 22252bee135..291f2dbcab9 100644
--- a/gcc/tree-ssa-loop-ch.cc
+++ b/gcc/tree-ssa-loop-ch.cc
@@ -59,17 +59,18 @@  edge_range_query (irange &r, edge e, gcond *cond, gimple_ranger &ranger)
     r.set_varying (boolean_type_node);
 }
 
-/* Return true if the condition on the first iteration of the loop can
-   be statically determined.  */
+/* Return edge that is true in the first iteration of the loop
+   and NULL otherwise.  */
 
-static bool
-entry_loop_condition_is_static (class loop *l, gimple_ranger *ranger)
+static edge
+static_loop_exit (class loop *l, gimple_ranger *ranger)
 {
   edge e = loop_preheader_edge (l);
   gcond *last = safe_dyn_cast <gcond *> (*gsi_last_bb (e->dest));
+  edge ret_e;
 
   if (!last)
-    return false;
+    return NULL;
 
   edge true_e, false_e;
   extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
@@ -77,17 +78,23 @@  entry_loop_condition_is_static (class loop *l, gimple_ranger *ranger)
   /* If neither edge is the exit edge, this is not a case we'd like to
      special-case.  */
   if (!loop_exit_edge_p (l, true_e) && !loop_exit_edge_p (l, false_e))
-    return false;
+    return NULL;
 
   int_range<1> desired_static_range;
   if (loop_exit_edge_p (l, true_e))
-    desired_static_range = range_false ();
+   {
+      desired_static_range = range_false ();
+      ret_e = true_e;
+   }
   else
+  {
     desired_static_range = range_true ();
+    ret_e = false_e;
+  }
 
   int_range<2> r;
   edge_range_query (r, e, last, *ranger);
-  return r == desired_static_range;
+  return r == desired_static_range ? ret_e : NULL;
 }
 
 /* Check whether we should duplicate HEADER of LOOP.  At most *LIMIT
@@ -394,7 +401,8 @@  ch_base::copy_headers (function *fun)
   copied_bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (fun));
   bbs_size = n_basic_blocks_for_fn (fun);
 
-  auto_vec<loop_p> candidates;
+  struct candidate {class loop *loop; edge static_exit;};
+  auto_vec<struct candidate> candidates;
   auto_vec<std::pair<edge, loop_p> > copied;
   auto_vec<class loop *> loops_to_unloop;
   auto_vec<int> loops_to_unloop_nunroll;
@@ -427,12 +435,14 @@  ch_base::copy_headers (function *fun)
 	  || !process_loop_p (loop))
 	continue;
 
+      edge static_exit = static_loop_exit (loop, ranger);
+
       /* Avoid loop header copying when optimizing for size unless we can
 	 determine that the loop condition is static in the first
 	 iteration.  */
       if (optimize_loop_for_size_p (loop)
 	  && !loop->force_vectorize
-	  && !entry_loop_condition_is_static (loop, ranger))
+	  && !static_exit)
 	{
 	  if (dump_file && (dump_flags & TDF_DETAILS))
 	    fprintf (dump_file,
@@ -442,13 +452,14 @@  ch_base::copy_headers (function *fun)
 	}
 
       if (should_duplicate_loop_header_p (header, loop, &remaining_limit))
-	candidates.safe_push (loop);
+	candidates.safe_push ({loop, static_exit});
     }
   /* Do not use ranger after we change the IL and not have updated SSA.  */
   delete ranger;
 
-  for (auto loop : candidates)
+  for (auto candidate : candidates)
     {
+      class loop *loop = candidate.loop;
       int initial_limit = param_max_loop_header_insns;
       int remaining_limit = initial_limit;
       if (dump_file && (dump_flags & TDF_DETAILS))
@@ -501,11 +512,17 @@  ch_base::copy_headers (function *fun)
 	continue;
 
       if (dump_file && (dump_flags & TDF_DETAILS))
-	fprintf (dump_file,
-		 "Duplicating header of the loop %d up to edge %d->%d,"
-		 " %i insns.\n",
-		 loop->num, exit->src->index, exit->dest->index,
-		 initial_limit - remaining_limit);
+	{
+	  fprintf (dump_file,
+		   "Duplicating header of the loop %d up to edge %d->%d,"
+		   " %i insns.\n",
+		   loop->num, exit->src->index, exit->dest->index,
+		   initial_limit - remaining_limit);
+	  if (candidate.static_exit)
+	    fprintf (dump_file,
+		     "  Will eliminate peeled conditional in bb %d.\n",
+		     candidate.static_exit->src->index);
+	}
 
       /* Ensure that the header will have just the latch as a predecessor
 	 inside the loop.  */
@@ -519,7 +536,7 @@  ch_base::copy_headers (function *fun)
 
       propagate_threaded_block_debug_into (exit->dest, entry->dest);
       if (!gimple_duplicate_sese_region (entry, exit, bbs, n_bbs, copied_bbs,
-					 true))
+					 true, candidate.static_exit))
 	{
 	  if (dump_file && (dump_flags & TDF_DETAILS))
 	    fprintf (dump_file, "Duplication failed.\n");