Message ID | 96160a5131c9e5eb302fb9f4db43c5d8b4cfe042.1629805719.git.mjambor@suse.cz |
---|---|
State | New |
Headers | show |
Series | IPA-CP profile feedback handling fixes | expand |
> 2021-08-23 Martin Jambor <mjambor@suse.cz> > > * params.opt (param_ipa_cp_profile_count_base): New parameter. > * ipa-cp.c (max_count): Replace with base_count, replace all > occurrences too, unless otherwise stated. > (ipcp_cloning_candidate_p): identify mostly-directly called > functions based on their counts, not max_count. > (compare_edge_profile_counts): New function. > (ipcp_propagate_stage): Instead of setting max_count, find the > appropriate edge count in a sorted vector of counts of eligible > edges and make it the base_count. > --- > gcc/ipa-cp.c | 82 +++++++++++++++++++++++++++++++++++++++++++++----- > gcc/params.opt | 4 +++ > 2 files changed, 78 insertions(+), 8 deletions(-) > > diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c > index 53cca7aa804..6ab74f61e83 100644 > --- a/gcc/ipa-cp.c > +++ b/gcc/ipa-cp.c > @@ -400,9 +400,9 @@ object_allocator<ipcp_value_source<tree> > ipcp_sources_pool > object_allocator<ipcp_agg_lattice> ipcp_agg_lattice_pool > ("IPA_CP aggregate lattices"); > > -/* Maximal count found in program. */ > +/* Base count to use in heuristics when using profile feedback. */ > > -static profile_count max_count; > +static profile_count base_count; > > /* Original overall size of the program. */ > > @@ -809,7 +809,8 @@ ipcp_cloning_candidate_p (struct cgraph_node *node) > /* When profile is available and function is hot, propagate into it even if > calls seems cold; constant propagation can improve function's speed > significantly. 
*/ > - if (max_count > profile_count::zero ()) > + if (stats.count_sum > profile_count::zero () > + && node->count.ipa ().initialized_p ()) > { > if (stats.count_sum > node->count.ipa ().apply_scale (90, 100)) > { > @@ -3310,10 +3311,10 @@ good_cloning_opportunity_p (struct cgraph_node *node, sreal time_benefit, > > ipa_node_params *info = ipa_node_params_sum->get (node); > int eval_threshold = opt_for_fn (node->decl, param_ipa_cp_eval_threshold); > - if (max_count > profile_count::zero ()) > + if (base_count > profile_count::zero ()) > { > > - sreal factor = count_sum.probability_in (max_count).to_sreal (); > + sreal factor = count_sum.probability_in (base_count).to_sreal (); If you have part of program built with -fprofile-use and part built without this will disable cloning for functions called only from places w/o profile. I think we want to count frequencies when ipa profile is uninitialized and then pass the cloning oportunity if either count or freqs says it is good idea. > sreal evaluation = (time_benefit * factor) / size_cost; > evaluation = incorporate_penalties (node, info, evaluation); > evaluation *= 1000; > @@ -3950,6 +3951,21 @@ value_topo_info<valtype>::propagate_effects () > } > } > > +/* Callback for qsort to sort counts of all edges. */ > + > +static int > +compare_edge_profile_counts (const void *a, const void *b) > +{ > + const profile_count *cnt1 = (const profile_count *) a; > + const profile_count *cnt2 = (const profile_count *) b; > + > + if (*cnt1 < *cnt2) > + return 1; > + if (*cnt1 > *cnt2) > + return -1; > + return 0; > +} > + > > /* Propagate constants, polymorphic contexts and their effects from the > summaries interprocedurally. 
*/ > @@ -3962,8 +3978,10 @@ ipcp_propagate_stage (class ipa_topo_info *topo) > if (dump_file) > fprintf (dump_file, "\n Propagating constants:\n\n"); > > - max_count = profile_count::uninitialized (); > + base_count = profile_count::uninitialized (); > > + bool compute_count_base = false; > + unsigned base_count_pos_percent = 0; > FOR_EACH_DEFINED_FUNCTION (node) > { > if (node->has_gimple_body_p () > @@ -3981,9 +3999,57 @@ ipcp_propagate_stage (class ipa_topo_info *topo) > ipa_size_summary *s = ipa_size_summaries->get (node); > if (node->definition && !node->alias && s != NULL) > overall_size += s->self_size; > - max_count = max_count.max (node->count.ipa ()); > + if (node->count.ipa ().initialized_p ()) > + { > + compute_count_base = true; > + unsigned pos_percent = opt_for_fn (node->decl, > + param_ipa_cp_profile_count_base); > + base_count_pos_percent = MAX (base_count_pos_percent, pos_percent); > + } > } > > + if (compute_count_base) > + { > + auto_vec<profile_count> all_edge_counts; > + all_edge_counts.reserve_exact (symtab->edges_count); > + FOR_EACH_DEFINED_FUNCTION (node) > + for (cgraph_edge *cs = node->callees; cs; cs = cs->next_callee) > + { > + profile_count count = cs->count.ipa (); > + if (!(count > profile_count::zero ())) > + continue; > + > + enum availability avail; > + cgraph_node *tgt > + = cs->callee->function_or_virtual_thunk_symbol (&avail); > + ipa_node_params *info = ipa_node_params_sum->get (tgt); > + if (info && info->versionable) > + all_edge_counts.quick_push (count); > + } I wonder how big those tables gets for whole program, but probably it is safe since it is heap allocatd and temporary. Not sure if reserving exact is going to give good idea what the usual count is - I think if program is built w/o profile feedback we may end up with few functions with zero or 1 count (called once and unlikely). 
> + > + if (!all_edge_counts.is_empty ()) > + { > + gcc_assert (base_count_pos_percent <= 100); > + all_edge_counts.qsort (compare_edge_profile_counts); > + > + unsigned base_count_pos > + = ((all_edge_counts.length () * (base_count_pos_percent)) / 100); > + base_count = all_edge_counts[base_count_pos]; It may make more sense to sum all the counts and then do the given percentile of function invocations, but we can see how well this fares. Patch is OK (it is an improvement over what we have) but we should get the combined FDO/nonFDO case right. It also matters when we lose profile feedback, e.g. due to comdat function merging. Honza
Hi, On Wed, Oct 06 2021, Jan Hubicka wrote: >> 2021-08-23 Martin Jambor <mjambor@suse.cz> >> >> * params.opt (param_ipa_cp_profile_count_base): New parameter. >> * ipa-cp.c (max_count): Replace with base_count, replace all >> occurrences too, unless otherwise stated. >> (ipcp_cloning_candidate_p): identify mostly-directly called >> functions based on their counts, not max_count. >> (compare_edge_profile_counts): New function. >> (ipcp_propagate_stage): Instead of setting max_count, find the >> appropriate edge count in a sorted vector of counts of eligible >> edges and make it the base_count. >> --- >> gcc/ipa-cp.c | 82 +++++++++++++++++++++++++++++++++++++++++++++----- >> gcc/params.opt | 4 +++ >> 2 files changed, 78 insertions(+), 8 deletions(-) >> >> diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c >> index 53cca7aa804..6ab74f61e83 100644 >> --- a/gcc/ipa-cp.c >> +++ b/gcc/ipa-cp.c >> @@ -400,9 +400,9 @@ object_allocator<ipcp_value_source<tree> > ipcp_sources_pool >> object_allocator<ipcp_agg_lattice> ipcp_agg_lattice_pool >> ("IPA_CP aggregate lattices"); >> >> -/* Maximal count found in program. */ >> +/* Base count to use in heuristics when using profile feedback. */ >> >> -static profile_count max_count; >> +static profile_count base_count; >> >> /* Original overall size of the program. */ >> >> @@ -809,7 +809,8 @@ ipcp_cloning_candidate_p (struct cgraph_node *node) >> /* When profile is available and function is hot, propagate into it even if >> calls seems cold; constant propagation can improve function's speed >> significantly. 
*/ >> - if (max_count > profile_count::zero ()) >> + if (stats.count_sum > profile_count::zero () >> + && node->count.ipa ().initialized_p ()) >> { >> if (stats.count_sum > node->count.ipa ().apply_scale (90, 100)) >> { >> @@ -3310,10 +3311,10 @@ good_cloning_opportunity_p (struct cgraph_node *node, sreal time_benefit, >> >> ipa_node_params *info = ipa_node_params_sum->get (node); >> int eval_threshold = opt_for_fn (node->decl, param_ipa_cp_eval_threshold); >> - if (max_count > profile_count::zero ()) >> + if (base_count > profile_count::zero ()) >> { >> >> - sreal factor = count_sum.probability_in (max_count).to_sreal (); >> + sreal factor = count_sum.probability_in (base_count).to_sreal (); > > If you have part of program built with -fprofile-use and part built without this will > disable cloning for functions called only from places w/o profile. > I think we want to count frequencies when ipa profile is uninitialized > and then pass the cloning oportunity if either count or freqs says it is > good idea. OK, I would like to address that by a separate follow-up patch below. > >> sreal evaluation = (time_benefit * factor) / size_cost; >> evaluation = incorporate_penalties (node, info, evaluation); >> evaluation *= 1000; >> @@ -3950,6 +3951,21 @@ value_topo_info<valtype>::propagate_effects () >> } >> } >> >> +/* Callback for qsort to sort counts of all edges. */ >> + >> +static int >> +compare_edge_profile_counts (const void *a, const void *b) >> +{ >> + const profile_count *cnt1 = (const profile_count *) a; >> + const profile_count *cnt2 = (const profile_count *) b; >> + >> + if (*cnt1 < *cnt2) >> + return 1; >> + if (*cnt1 > *cnt2) >> + return -1; >> + return 0; >> +} >> + >> >> /* Propagate constants, polymorphic contexts and their effects from the >> summaries interprocedurally. 
*/ >> @@ -3962,8 +3978,10 @@ ipcp_propagate_stage (class ipa_topo_info *topo) >> if (dump_file) >> fprintf (dump_file, "\n Propagating constants:\n\n"); >> >> - max_count = profile_count::uninitialized (); >> + base_count = profile_count::uninitialized (); >> >> + bool compute_count_base = false; >> + unsigned base_count_pos_percent = 0; >> FOR_EACH_DEFINED_FUNCTION (node) >> { >> if (node->has_gimple_body_p () >> @@ -3981,9 +3999,57 @@ ipcp_propagate_stage (class ipa_topo_info *topo) >> ipa_size_summary *s = ipa_size_summaries->get (node); >> if (node->definition && !node->alias && s != NULL) >> overall_size += s->self_size; >> - max_count = max_count.max (node->count.ipa ()); >> + if (node->count.ipa ().initialized_p ()) >> + { >> + compute_count_base = true; >> + unsigned pos_percent = opt_for_fn (node->decl, >> + param_ipa_cp_profile_count_base); >> + base_count_pos_percent = MAX (base_count_pos_percent, pos_percent); >> + } >> } >> >> + if (compute_count_base) >> + { >> + auto_vec<profile_count> all_edge_counts; >> + all_edge_counts.reserve_exact (symtab->edges_count); >> + FOR_EACH_DEFINED_FUNCTION (node) >> + for (cgraph_edge *cs = node->callees; cs; cs = cs->next_callee) >> + { >> + profile_count count = cs->count.ipa (); >> + if (!(count > profile_count::zero ())) >> + continue; >> + >> + enum availability avail; >> + cgraph_node *tgt >> + = cs->callee->function_or_virtual_thunk_symbol (&avail); >> + ipa_node_params *info = ipa_node_params_sum->get (tgt); >> + if (info && info->versionable) >> + all_edge_counts.quick_push (count); >> + } > > I wonder how big those tables gets for whole program, but probably it is > safe since it is heap allocatd and temporary. > Not sure if reserving exact is going to give good idea what the usual > count is - I think if program is built w/o profile feedback we may end > up with few functions with zero or 1 count (called once and unlikely). 
My reasoning was that rather than iterating over all edges twice (once to count how big an array I need and once to fill it in), I'd just allocate the known maximum. If I happen to be wrong, it is easy to change the code not to allocate an element of the array for edges with zero or one count. I can of course change it before pushing to master. >> + >> + if (!all_edge_counts.is_empty ()) >> + { >> + gcc_assert (base_count_pos_percent <= 100); >> + all_edge_counts.qsort (compare_edge_profile_counts); >> + >> + unsigned base_count_pos >> + = ((all_edge_counts.length () * (base_count_pos_percent)) / 100); >> + base_count = all_edge_counts[base_count_pos]; > > It may make more sense to sum all the counts and then do the given > percentile of function invocations, but we can see how well this fares. I hope my approach will have similar results for big applications and also work for small programs (and, well, benchmarks) where very few or even one edge dominate the program. Below is the aforementioned follow-up fix, which has passed LTO profiled-bootstrap on x86_64-linux. Is it OK too? Thanks, Martin Subject: [PATCH 4/4] ipa-cp: Use profile counters (or not) based on local availability This is a follow-up small patch to address Honza's review of my previous patch to select saner profile count to base heuristics on. Currently the IPA-CP heuristics switch to PGO-mode only if there are PGO counters available for any part of the call graph. This change makes it to switch to the PGO mode only if any of the incoming edges bringing in the constant in question had any ipa-quality counts on them. Consequently, if a part of the program is built with -fprofile-use and another part without, IPA-CP will use estimated-frequency-based heuristics for the latter. I still wonder whether this should only happen with flag_profile_partial_training on. It seems like we're behaving as if it was always on. 
gcc/ChangeLog: 2021-10-18 Martin Jambor <mjambor@suse.cz> * ipa-cp.c (good_cloning_opportunity_p): Decide whether to use profile feedback depending on their local availability. --- gcc/ipa-cp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c index 4d07a6d0a58..703541d15cc 100644 --- a/gcc/ipa-cp.c +++ b/gcc/ipa-cp.c @@ -3311,9 +3311,9 @@ good_cloning_opportunity_p (struct cgraph_node *node, sreal time_benefit, ipa_node_params *info = ipa_node_params_sum->get (node); int eval_threshold = opt_for_fn (node->decl, param_ipa_cp_eval_threshold); - if (base_count > profile_count::zero ()) + if (count_sum > profile_count::zero ()) { - + gcc_assert (base_count > profile_count::zero ()); sreal factor = count_sum.probability_in (base_count).to_sreal (); sreal evaluation = (time_benefit * factor) / size_cost; evaluation = incorporate_penalties (node, info, evaluation);
Hi, On Mon, Aug 23 2021, Martin Jambor wrote: > When profile feedback is available, IPA-CP takes the count of the > hottest node and then evaluates all call contexts relative to it. > This means that typically almost no clones for specialized contexts > are ever created because the maximum is some special function, called > from everywhere (that is likely to get inlined anyway) and all the > examined edges look cold compared to it. > > This patch changes the selection. It simply sorts counts of all edges > eligible for cloning in a vector and then picks the count in 90th > percentile (the actual number is configurable via a parameter). > > I also tried more complex approaches which were summing the counts and > picking the edge which together with all hotter edges accounted for a > given portion of the total sum of all edge counts. But first it was > not apparently clear to me that they make more logical sense than the > simple method and practically I always also had to ignore a few > percent of the hottest edges with really extreme counts (looking at > bash and python). And when I had to do that anyway, it seemed simpler > to just "ignore" more and take the first non-ignored count as the > base. > > Nevertheless, if people think some more sophisticated method should be > used anyway, I am willing to be persuaded. But this patch is a clear > improvement over the current situation. > > gcc/ChangeLog: > > 2021-08-23 Martin Jambor <mjambor@suse.cz> > > * params.opt (param_ipa_cp_profile_count_base): New parameter. > * ipa-cp.c (max_count): Replace with base_count, replace all > occurrences too, unless otherwise stated. > (ipcp_cloning_candidate_p): identify mostly-directly called > functions based on their counts, not max_count. > (compare_edge_profile_counts): New function. > (ipcp_propagate_stage): Instead of setting max_count, find the > appropriate edge count in a sorted vector of counts of eligible > edges and make it the base_count. 
Honza approved this patch in a private conversation but then I noticed I forgot to add an entry for the new parameter into invoke.texi, so I fixed that problem (and checked the result with make info and make pdf) and pushed the patch to master as commit ab1008255e37b5b51a433ed69e04c06300543799. Thanks, Martin
Hi, On Mon, Oct 18 2021, Martin Jambor wrote: > [...] > > > This is a follow-up small patch to address Honza's review of my > previous patch to select saner profile count to base heuristics on. > Currently the IPA-CP heuristics switch to PGO-mode only if there are > PGO counters available for any part of the call graph. This change > makes it to switch to the PGO mode only if any of the incoming edges > bringing in the constant in question had any ipa-quality counts on > them. Consequently, if a part of the program is built with > -fprofile-use and another part without, IPA-CP will use > estimated-frequency-based heuristics for the latter. > > I still wonder whether this should only happen with > flag_profile_partial_training on. It seems like we're behaving as if > it was always on. > Honza approved this patch in a private conversation and so I have pushed it to master as commit ab810952eb7c061e37054ddd1dfe0aa033365131. Thanks, Martin
diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c index 53cca7aa804..6ab74f61e83 100644 --- a/gcc/ipa-cp.c +++ b/gcc/ipa-cp.c @@ -400,9 +400,9 @@ object_allocator<ipcp_value_source<tree> > ipcp_sources_pool object_allocator<ipcp_agg_lattice> ipcp_agg_lattice_pool ("IPA_CP aggregate lattices"); -/* Maximal count found in program. */ +/* Base count to use in heuristics when using profile feedback. */ -static profile_count max_count; +static profile_count base_count; /* Original overall size of the program. */ @@ -809,7 +809,8 @@ ipcp_cloning_candidate_p (struct cgraph_node *node) /* When profile is available and function is hot, propagate into it even if calls seems cold; constant propagation can improve function's speed significantly. */ - if (max_count > profile_count::zero ()) + if (stats.count_sum > profile_count::zero () + && node->count.ipa ().initialized_p ()) { if (stats.count_sum > node->count.ipa ().apply_scale (90, 100)) { @@ -3310,10 +3311,10 @@ good_cloning_opportunity_p (struct cgraph_node *node, sreal time_benefit, ipa_node_params *info = ipa_node_params_sum->get (node); int eval_threshold = opt_for_fn (node->decl, param_ipa_cp_eval_threshold); - if (max_count > profile_count::zero ()) + if (base_count > profile_count::zero ()) { - sreal factor = count_sum.probability_in (max_count).to_sreal (); + sreal factor = count_sum.probability_in (base_count).to_sreal (); sreal evaluation = (time_benefit * factor) / size_cost; evaluation = incorporate_penalties (node, info, evaluation); evaluation *= 1000; @@ -3950,6 +3951,21 @@ value_topo_info<valtype>::propagate_effects () } } +/* Callback for qsort to sort counts of all edges. 
*/ + +static int +compare_edge_profile_counts (const void *a, const void *b) +{ + const profile_count *cnt1 = (const profile_count *) a; + const profile_count *cnt2 = (const profile_count *) b; + + if (*cnt1 < *cnt2) + return 1; + if (*cnt1 > *cnt2) + return -1; + return 0; +} + /* Propagate constants, polymorphic contexts and their effects from the summaries interprocedurally. */ @@ -3962,8 +3978,10 @@ ipcp_propagate_stage (class ipa_topo_info *topo) if (dump_file) fprintf (dump_file, "\n Propagating constants:\n\n"); - max_count = profile_count::uninitialized (); + base_count = profile_count::uninitialized (); + bool compute_count_base = false; + unsigned base_count_pos_percent = 0; FOR_EACH_DEFINED_FUNCTION (node) { if (node->has_gimple_body_p () @@ -3981,9 +3999,57 @@ ipcp_propagate_stage (class ipa_topo_info *topo) ipa_size_summary *s = ipa_size_summaries->get (node); if (node->definition && !node->alias && s != NULL) overall_size += s->self_size; - max_count = max_count.max (node->count.ipa ()); + if (node->count.ipa ().initialized_p ()) + { + compute_count_base = true; + unsigned pos_percent = opt_for_fn (node->decl, + param_ipa_cp_profile_count_base); + base_count_pos_percent = MAX (base_count_pos_percent, pos_percent); + } } + if (compute_count_base) + { + auto_vec<profile_count> all_edge_counts; + all_edge_counts.reserve_exact (symtab->edges_count); + FOR_EACH_DEFINED_FUNCTION (node) + for (cgraph_edge *cs = node->callees; cs; cs = cs->next_callee) + { + profile_count count = cs->count.ipa (); + if (!(count > profile_count::zero ())) + continue; + + enum availability avail; + cgraph_node *tgt + = cs->callee->function_or_virtual_thunk_symbol (&avail); + ipa_node_params *info = ipa_node_params_sum->get (tgt); + if (info && info->versionable) + all_edge_counts.quick_push (count); + } + + if (!all_edge_counts.is_empty ()) + { + gcc_assert (base_count_pos_percent <= 100); + all_edge_counts.qsort (compare_edge_profile_counts); + + unsigned base_count_pos + = 
((all_edge_counts.length () * (base_count_pos_percent)) / 100); + base_count = all_edge_counts[base_count_pos]; + + if (dump_file) + { + fprintf (dump_file, "\nSelected base_count from %u edges at " + "position %u, arriving at: ", all_edge_counts.length (), + base_count_pos); + base_count.dump (dump_file); + fprintf (dump_file, "\n"); + } + } + else if (dump_file) + fprintf (dump_file, "\nNo candidates with non-zero call count found, " + "continuing as if without profile feedback.\n"); + } + orig_overall_size = overall_size; if (dump_file) @@ -6576,7 +6642,7 @@ make_pass_ipa_cp (gcc::context *ctxt) void ipa_cp_c_finalize (void) { - max_count = profile_count::uninitialized (); + base_count = profile_count::uninitialized (); overall_size = 0; orig_overall_size = 0; ipcp_free_transformation_sum (); diff --git a/gcc/params.opt b/gcc/params.opt index 8d772309407..5223f784bf0 100644 --- a/gcc/params.opt +++ b/gcc/params.opt @@ -290,6 +290,10 @@ The size of translation unit that IPA-CP pass considers large. Common Joined UInteger Var(param_ipa_cp_value_list_size) Init(8) Param Optimization Maximum size of a list of values associated with each parameter for interprocedural constant propagation. +-param=ipa-cp-profile-count-base= +Common Joined UInteger Var(param_ipa_cp_profile_count_base) Init(10) IntegerRange(0, 100) Param Optimization +When using profile feedback, use the edge at this percentage position in frequency histogram as the basis for IPA-CP heuristics. + -param=ipa-jump-function-lookups= Common Joined UInteger Var(param_ipa_jump_function_lookups) Init(8) Param Optimization Maximum number of statements visited during jump function offset discovery.