
Turn TODO_rebuild_frequencies to a pass

Message ID ZLFa4p55G/H78vE2@kam.mff.cuni.cz
State New
Series Turn TODO_rebuild_frequencies to a pass

Commit Message

Jan Hubicka July 14, 2023, 2:25 p.m. UTC
Hi,
currently we rebuild profile_counts from profile_probability after inlining,
because producing large loop nests can lead to unrealistically large
profile_count values.  This has been much less of a concern since we switched
to the new profile_count representation a while back.

This propagation can also compensate for profile inconsistencies caused by
optimization passes.  Since the inliner is followed by basic cleanup passes
that do not use the profile, we get a more realistic profile by delaying the
recomputation until the basic optimizations exposed by inlining have finished.

This does not fit into the TODO machinery, so I turned the rebuild into a
standalone pass and scheduled it before the first consumer of the profile in
the optimization queue.

I also added logic that avoids repropagating when the CFG profile is
consistent and not too close to overflow.  Propagation visits every basic
block loop_depth times, so it is not linear, and avoiding it may help a bit.
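
Roughly, the check added to rebuild_frequencies works as in the following
condensed sketch (the real code in the patch below additionally tracks
uninitialized counts and probabilities and special-cases profile feedback):

  bool need_repropagation = false;
  basic_block bb;

  cfun->cfg->count_max = profile_count::uninitialized ();
  FOR_EACH_BB_FN (bb, cfun)
    {
      edge e;
      edge_iterator ei;
      profile_count sum = profile_count::zero ();

      cfun->cfg->count_max = cfun->cfg->count_max.max (bb->count);
      /* The sum of incoming edge counts should match the block count.  */
      FOR_EACH_EDGE (e, ei, bb->preds)
        sum += e->count ();
      if (sum.differs_from_p (bb->count))
        need_repropagation = true;
    }
  /* Also repropagate when counts get close to the overflow range.  */
  if (cfun->cfg->count_max.very_large_p ())
    need_repropagation = true;

If nothing is wrong, the pass returns early; otherwise counts are
re-propagated from the branch probabilities as before.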

On tramp3d we get 14 functions repropagated and 916 that are OK.  The
repropagated functions are the RB-tree ones, where recursive inlining produces
crazy loop nests.  This is something to fix independently.

Bootstrapped/regtested x86_64-linux.  I plan to commit it later today
if there are no complaints.

Honza

gcc/ChangeLog:

	* passes.cc (execute_function_todo): Remove
	TODO_rebuild_frequencies.
	* passes.def: Add rebuild_frequencies pass.
	* predict.cc (estimate_bb_frequencies): Drop
	force parameter.
	(tree_estimate_probability): Update call of
	estimate_bb_frequencies.
	(rebuild_frequencies): Turn into a pass; verify CFG profile consistency
	first and do not rebuild if not necessary.
	(class pass_rebuild_frequencies): New.
	(make_pass_rebuild_frequencies): New.
	* profile-count.h (profile_count::very_large_p): New.
	* tree-inline.cc (optimize_inline_calls): Do not return
	TODO_rebuild_frequencies.
	* tree-pass.h (TODO_rebuild_frequencies): Remove.
	(make_pass_rebuild_frequencies): Declare.

Patch

diff --git a/gcc/passes.cc b/gcc/passes.cc
index 2f0e378b8b2..d7b0ad271a1 100644
--- a/gcc/passes.cc
+++ b/gcc/passes.cc
@@ -2075,9 +2075,6 @@  execute_function_todo (function *fn, void *data)
   if (flags & TODO_remove_unused_locals)
     remove_unused_locals ();
 
-  if (flags & TODO_rebuild_frequencies)
-    rebuild_frequencies ();
-
   if (flags & TODO_rebuild_cgraph_edges)
     cgraph_edge::rebuild_edges ();
 
diff --git a/gcc/passes.def b/gcc/passes.def
index faa5208b26b..f2893ae8a8b 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -206,6 +206,10 @@  along with GCC; see the file COPYING3.  If not see
       NEXT_PASS (pass_post_ipa_warn);
       /* Must run before loop unrolling.  */
       NEXT_PASS (pass_warn_access, /*early=*/true);
+      /* Profile count may overflow as a result of inlining very large
+         loop nests.  This pass should run before any late pass that makes
+	 use of profile.  */
+      NEXT_PASS (pass_rebuild_frequencies);
       NEXT_PASS (pass_complete_unrolli);
       NEXT_PASS (pass_backprop);
       NEXT_PASS (pass_phiprop);
@@ -395,6 +399,10 @@  along with GCC; see the file COPYING3.  If not see
          to forward object-size and builtin folding results properly.  */
       NEXT_PASS (pass_copy_prop);
       NEXT_PASS (pass_dce);
+      /* Profile count may overflow as a result of inlining very large
+         loop nests.  This pass should run before any late pass that makes
+	 use of profile.  */
+      NEXT_PASS (pass_rebuild_frequencies);
       NEXT_PASS (pass_sancov);
       NEXT_PASS (pass_asan);
       NEXT_PASS (pass_tsan);
diff --git a/gcc/predict.cc b/gcc/predict.cc
index 1aa4c25eb70..26f9f3f6a88 100644
--- a/gcc/predict.cc
+++ b/gcc/predict.cc
@@ -89,7 +89,7 @@  static void predict_paths_leading_to_edge (edge, enum br_predictor,
 static bool can_predict_insn_p (const rtx_insn *);
 static HOST_WIDE_INT get_predictor_value (br_predictor, HOST_WIDE_INT);
 static void determine_unlikely_bbs ();
-static void estimate_bb_frequencies (bool force);
+static void estimate_bb_frequencies ();
 
 /* Information we hold about each branch predictor.
    Filled using information from predict.def.  */
@@ -3169,8 +3169,9 @@  tree_estimate_probability (bool dry_run)
   delete bb_predictions;
   bb_predictions = NULL;
 
-  if (!dry_run)
-    estimate_bb_frequencies (false);
+  if (!dry_run
+      && profile_status_for_fn (cfun) != PROFILE_READ)
+    estimate_bb_frequencies ();
   free_dominance_info (CDI_POST_DOMINATORS);
   remove_fake_exit_edges ();
 }
@@ -3923,103 +3924,97 @@  determine_unlikely_bbs ()
 }
 
 /* Estimate and propagate basic block frequencies using the given branch
-   probabilities.  If FORCE is true, the frequencies are used to estimate
-   the counts even when there are already non-zero profile counts.  */
+   probabilities.  */
 
 static void
-estimate_bb_frequencies (bool force)
+estimate_bb_frequencies ()
 {
   basic_block bb;
   sreal freq_max;
 
   determine_unlikely_bbs ();
 
-  if (force || profile_status_for_fn (cfun) != PROFILE_READ
-      || !update_max_bb_count ())
-    {
+  mark_dfs_back_edges ();
 
-      mark_dfs_back_edges ();
+  single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun))->probability =
+     profile_probability::always ();
 
-      single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun))->probability =
-	 profile_probability::always ();
+  /* Set up block info for each basic block.  */
+  alloc_aux_for_blocks (sizeof (block_info));
+  alloc_aux_for_edges (sizeof (edge_prob_info));
+  FOR_BB_BETWEEN (bb, ENTRY_BLOCK_PTR_FOR_FN (cfun), NULL, next_bb)
+    {
+      edge e;
+      edge_iterator ei;
 
-      /* Set up block info for each basic block.  */
-      alloc_aux_for_blocks (sizeof (block_info));
-      alloc_aux_for_edges (sizeof (edge_prob_info));
-      FOR_BB_BETWEEN (bb, ENTRY_BLOCK_PTR_FOR_FN (cfun), NULL, next_bb)
+      FOR_EACH_EDGE (e, ei, bb->succs)
 	{
-	  edge e;
-	  edge_iterator ei;
-
-	  FOR_EACH_EDGE (e, ei, bb->succs)
-	    {
-	      /* FIXME: Graphite is producing edges with no profile. Once
-		 this is fixed, drop this.  */
-	      if (e->probability.initialized_p ())
-	        EDGE_INFO (e)->back_edge_prob
-		   = e->probability.to_sreal ();
-	      else
-		/* back_edge_prob = 0.5 */
-		EDGE_INFO (e)->back_edge_prob = sreal (1, -1);
-	    }
+	  /* FIXME: Graphite is producing edges with no profile. Once
+	     this is fixed, drop this.  */
+	  if (e->probability.initialized_p ())
+	    EDGE_INFO (e)->back_edge_prob
+	       = e->probability.to_sreal ();
+	  else
+	    /* back_edge_prob = 0.5 */
+	    EDGE_INFO (e)->back_edge_prob = sreal (1, -1);
 	}
+    }
 
-      /* First compute frequencies locally for each loop from innermost
-         to outermost to examine frequencies for back edges.  */
-      estimate_loops ();
-
-      freq_max = 0;
-      FOR_EACH_BB_FN (bb, cfun)
-	if (freq_max < BLOCK_INFO (bb)->frequency)
-	  freq_max = BLOCK_INFO (bb)->frequency;
-
-      /* Scaling frequencies up to maximal profile count may result in
-	 frequent overflows especially when inlining loops.
-	 Small scalling results in unnecesary precision loss.  Stay in
-	 the half of the (exponential) range.  */
-      freq_max = (sreal (1) << (profile_count::n_bits / 2)) / freq_max;
-      if (freq_max < 16)
-	freq_max = 16;
-      profile_count ipa_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count.ipa ();
-      cfun->cfg->count_max = profile_count::uninitialized ();
-      FOR_BB_BETWEEN (bb, ENTRY_BLOCK_PTR_FOR_FN (cfun), NULL, next_bb)
+  /* First compute frequencies locally for each loop from innermost
+     to outermost to examine frequencies for back edges.  */
+  estimate_loops ();
+
+  freq_max = 0;
+  FOR_EACH_BB_FN (bb, cfun)
+    if (freq_max < BLOCK_INFO (bb)->frequency)
+      freq_max = BLOCK_INFO (bb)->frequency;
+
+  /* Scaling frequencies up to maximal profile count may result in
+     frequent overflows especially when inlining loops.
+     Small scaling results in unnecessary precision loss.  Stay in
+     the half of the (exponential) range.  */
+  freq_max = (sreal (1) << (profile_count::n_bits / 2)) / freq_max;
+  if (freq_max < 16)
+    freq_max = 16;
+  profile_count ipa_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count.ipa ();
+  cfun->cfg->count_max = profile_count::uninitialized ();
+  FOR_BB_BETWEEN (bb, ENTRY_BLOCK_PTR_FOR_FN (cfun), NULL, next_bb)
+    {
+      sreal tmp = BLOCK_INFO (bb)->frequency;
+      if (tmp >= 1)
 	{
-	  sreal tmp = BLOCK_INFO (bb)->frequency;
-	  if (tmp >= 1)
-	    {
-	      gimple_stmt_iterator gsi;
-	      tree decl;
-
-	      /* Self recursive calls can not have frequency greater than 1
-		 or program will never terminate.  This will result in an
-		 inconsistent bb profile but it is better than greatly confusing
-		 IPA cost metrics.  */
-	      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
-		if (is_gimple_call (gsi_stmt (gsi))
-		    && (decl = gimple_call_fndecl (gsi_stmt (gsi))) != NULL
-		    && recursive_call_p (current_function_decl, decl))
-		  {
-		    if (dump_file)
-		      fprintf (dump_file, "Dropping frequency of recursive call"
-			       " in bb %i from %f\n", bb->index,
-			       tmp.to_double ());
-		    tmp = (sreal)9 / (sreal)10;
-		    break;
-		  }
-	    }
-	  tmp = tmp * freq_max + sreal (1, -1);
-	  profile_count count = profile_count::from_gcov_type (tmp.to_int ());	
-
-	  /* If we have profile feedback in which this function was never
-	     executed, then preserve this info.  */
-	  if (!(bb->count == profile_count::zero ()))
-	    bb->count = count.guessed_local ().combine_with_ipa_count (ipa_count);
-          cfun->cfg->count_max = cfun->cfg->count_max.max (bb->count);
+	  gimple_stmt_iterator gsi;
+	  tree decl;
+
+	  /* Self recursive calls can not have frequency greater than 1
+	     or program will never terminate.  This will result in an
+	     inconsistent bb profile but it is better than greatly confusing
+	     IPA cost metrics.  */
+	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+	    if (is_gimple_call (gsi_stmt (gsi))
+		&& (decl = gimple_call_fndecl (gsi_stmt (gsi))) != NULL
+		&& recursive_call_p (current_function_decl, decl))
+	      {
+		if (dump_file)
+		  fprintf (dump_file, "Dropping frequency of recursive call"
+			   " in bb %i from %f\n", bb->index,
+			   tmp.to_double ());
+		tmp = (sreal)9 / (sreal)10;
+		break;
+	      }
 	}
+      tmp = tmp * freq_max + sreal (1, -1);
+      profile_count count = profile_count::from_gcov_type (tmp.to_int ());
 
-      free_aux_for_blocks ();
-      free_aux_for_edges ();
+      /* If we have profile feedback in which this function was never
+	 executed, then preserve this info.  */
+      if (!(bb->count == profile_count::zero ()))
+	bb->count = count.guessed_local ().combine_with_ipa_count (ipa_count);
+      cfun->cfg->count_max = cfun->cfg->count_max.max (bb->count);
     }
+
+  free_aux_for_blocks ();
+  free_aux_for_edges ();
   compute_function_frequency ();
 }
 
@@ -4307,38 +4302,162 @@  make_pass_strip_predict_hints (gcc::context *ctxt)
 void
 rebuild_frequencies (void)
 {
-  timevar_push (TV_REBUILD_FREQUENCIES);
-
-  /* When the max bb count in the function is small, there is a higher
-     chance that there were truncation errors in the integer scaling
-     of counts by inlining and other optimizations. This could lead
-     to incorrect classification of code as being cold when it isn't.
-     In that case, force the estimation of bb counts/frequencies from the
-     branch probabilities, rather than computing frequencies from counts,
-     which may also lead to frequencies incorrectly reduced to 0. There
-     is less precision in the probabilities, so we only do this for small
-     max counts.  */
-  cfun->cfg->count_max = profile_count::uninitialized ();
+  /* If we have no profile, do nothing.  Note that after inlining
+     profile_status_for_fn may not represent the actual presence/absence of
+     profile.  */
+  if (profile_status_for_fn (cfun) == PROFILE_ABSENT
+      && !ENTRY_BLOCK_PTR_FOR_FN (cfun)->count.initialized_p ())
+    return;
+
+
+  /* See if everything is OK and update count_max.  */
   basic_block bb;
+  bool inconsistency_found = false;
+  bool uninitialized_probability_found = false;
+  bool uninitialized_count_found = false;
+
+  cfun->cfg->count_max = profile_count::uninitialized ();
   FOR_BB_BETWEEN (bb, ENTRY_BLOCK_PTR_FOR_FN (cfun), NULL, next_bb)
-    cfun->cfg->count_max = cfun->cfg->count_max.max (bb->count);
+    {
+      cfun->cfg->count_max = cfun->cfg->count_max.max (bb->count);
+      /* Uninitialized count may be a result of inlining or an omission in an
+         optimization pass.  */
+      if (!bb->count.initialized_p ())
+	{
+	  uninitialized_count_found = true;
+	  if (dump_file)
+	    fprintf (dump_file, "BB %i has uninitialized count\n",
+		     bb->index);
+	}
+      if (bb != ENTRY_BLOCK_PTR_FOR_FN (cfun)
+	  && (!uninitialized_probability_found || !inconsistency_found)
+        {
+	  profile_count sum = profile_count::zero ();
+	  edge e;
+	  edge_iterator ei;
 
-  if (profile_status_for_fn (cfun) == PROFILE_GUESSED)
+	  FOR_EACH_EDGE (e, ei, bb->preds)
+	    {
+	      sum += e->count ();
+	      /* Uninitialized probability may be a result of inlining or an
+	         omission in an optimization pass.  */
+	      if (!e->probability.initialized_p ())
+	        {
+		  if (dump_file)
+		    fprintf (dump_file,
+			     "Edge %i->%i has uninitialized probability\n",
+			     e->src->index, e->dest->index);
+	        }
+	    }
+	  if (sum.differs_from_p (bb->count))
+	    {
+	      if (dump_file)
+		fprintf (dump_file,
+			 "BB %i has invalid sum of incoming counts\n",
+			 bb->index);
+	      inconsistency_found = true;
+	    }
+	}
+    }
+
+  /* If everything is OK, do not re-propagate frequencies.  */
+  if (!inconsistency_found
+      && (!uninitialized_count_found || uninitialized_probability_found)
+      && !cfun->cfg->count_max.very_large_p ())
     {
-      loop_optimizer_init (LOOPS_HAVE_MARKED_IRREDUCIBLE_REGIONS);
-      connect_infinite_loops_to_exit ();
-      estimate_bb_frequencies (true);
-      remove_fake_exit_edges ();
-      loop_optimizer_finalize ();
+      if (dump_file)
+	fprintf (dump_file, "Profile is consistent\n");
+      return;
     }
-  else if (profile_status_for_fn (cfun) == PROFILE_READ)
-    update_max_bb_count ();
-  else if (profile_status_for_fn (cfun) == PROFILE_ABSENT
-	   && !flag_guess_branch_prob)
-    ;
-  else
-    gcc_unreachable ();
-  timevar_pop (TV_REBUILD_FREQUENCIES);
+  /* Do not re-propagate if we have profile feedback.  Even if the profile is
+     inconsistent from previous transformations, it is probably more realistic
+     for the hot parts of the program than the result of repropagating.
+
+     Consider an example where we previously had
+
+     if (test)
+       then [large probability for true]
+
+     and we later proved that test is always 0.  In this case, if the profile
+     was read correctly, we must have duplicated the conditional (for example
+     by inlining) into a context where test is false.  From profile feedback
+     we know that in most executions the conditional was true, so the
+     important copy is not the one we are looking at.
+
+     Propagating from probabilities would make the profile look consistent,
+     but because probabilities after code duplication may not be representative
+     for a given run, we would only propagate the error further.  */
+  if (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count.ipa ().nonzero_p ()
+      && !uninitialized_count_found)
+    {
+      if (dump_file)
+	fprintf (dump_file,
+	    "Profile is inconsistent but read from profile feedback;"
+	    " not rebuilding\n");
+      return;
+    }
+
+  loop_optimizer_init (LOOPS_HAVE_MARKED_IRREDUCIBLE_REGIONS);
+  connect_infinite_loops_to_exit ();
+  estimate_bb_frequencies ();
+  remove_fake_exit_edges ();
+  loop_optimizer_finalize ();
+  if (dump_file)
+    fprintf (dump_file, "Rebuilt basic block counts\n");
+
+  return;
+}
+
+namespace {
+
+const pass_data pass_data_rebuild_frequencies =
+{
+  GIMPLE_PASS, /* type */
+  "rebuild_frequencies", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_REBUILD_FREQUENCIES, /* tv_id */
+  PROP_cfg, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  0, /* todo_flags_finish */
+};
+
+class pass_rebuild_frequencies : public gimple_opt_pass
+{
+public:
+  pass_rebuild_frequencies (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_rebuild_frequencies, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  opt_pass * clone () final override
+  {
+    return new pass_rebuild_frequencies (m_ctxt);
+  }
+  void set_pass_param (unsigned int n, bool param) final override
+    {
+      gcc_assert (n == 0);
+      early_p = param;
+    }
+
+  unsigned int execute (function *) final override
+  {
+    rebuild_frequencies ();
+    return 0;
+  }
+
+private:
+  bool early_p;
+
+}; // class pass_rebuild_frequencies
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_rebuild_frequencies (gcc::context *ctxt)
+{
+  return new pass_rebuild_frequencies (ctxt);
 }
 
 /* Perform a dry run of the branch prediction pass and report comparsion of
diff --git a/gcc/profile-count.h b/gcc/profile-count.h
index 99416d9a463..bf1136782a3 100644
--- a/gcc/profile-count.h
+++ b/gcc/profile-count.h
@@ -1276,6 +1276,16 @@  public:
       return ret;
     }
 
+  /* Return true if profile count is very large, so we risk overflows
+     with loop transformations.  */
+  bool
+  very_large_p ()
+  {
+    if (!initialized_p ())
+      return false;
+    return m_val > max_count / 65536;
+  }
+
   int to_frequency (struct function *fun) const;
   int to_cgraph_frequency (profile_count entry_bb_count) const;
   sreal to_sreal_scale (profile_count in, bool *known = NULL) const;
diff --git a/gcc/tree-inline.cc b/gcc/tree-inline.cc
index 99efddc36c8..954b39ae1c6 100644
--- a/gcc/tree-inline.cc
+++ b/gcc/tree-inline.cc
@@ -5526,9 +5526,7 @@  optimize_inline_calls (tree fn)
   return (TODO_update_ssa
 	  | TODO_cleanup_cfg
 	  | (gimple_in_ssa_p (cfun) ? TODO_remove_unused_locals : 0)
-	  | (gimple_in_ssa_p (cfun) ? TODO_update_address_taken : 0)
-	  | (profile_status_for_fn (cfun) != PROFILE_ABSENT
-	     ? TODO_rebuild_frequencies : 0));
+	  | (gimple_in_ssa_p (cfun) ? TODO_update_address_taken : 0));
 }
 
 /* Passed to walk_tree.  Copies the node pointed to, if appropriate.  */
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 6cdaed7d4b2..57865cdfc42 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -239,7 +239,6 @@  protected:
 #define TODO_verify_il			(1 << 6)
 #define TODO_dump_symtab		(1 << 7)
 #define TODO_remove_functions		(1 << 8)
-#define TODO_rebuild_frequencies	(1 << 9)
 
 /* To-do flags for calls to update_ssa.  */
 
@@ -418,6 +417,7 @@  extern gimple_opt_pass *make_pass_pre (gcc::context *ctxt);
 extern unsigned int tail_merge_optimize (bool);
 extern gimple_opt_pass *make_pass_profile (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_strip_predict_hints (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_rebuild_frequencies (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_lower_complex_O0 (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_lower_complex (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_lower_switch (gcc::context *ctxt);