Fix profile update in tree_transform_and_unroll_loop

Message ID	ZMJ9eeM5DRrlLsTv@kam.mff.cuni.cz
State	New
Headers	show Return-Path: <gcc-patches-bounces+incoming=patchwork.ozlabs.org@gcc.gnu.org> DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org BA1353854C7C Date: Thu, 27 Jul 2023 16:21:45 +0200 To: gcc-patches@gcc.gnu.org Subject: Fix profile update in tree_transform_and_unroll_loop Message-ID: <ZMJ9eeM5DRrlLsTv@kam.mff.cuni.cz> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline Precedence: list From: Jan Hubicka via Gcc-patches <gcc-patches@gcc.gnu.org> Reply-To: Jan Hubicka <hubicka@ucw.cz> Errors-To: gcc-patches-bounces+incoming=patchwork.ozlabs.org@gcc.gnu.org Sender: "Gcc-patches" <gcc-patches-bounces+incoming=patchwork.ozlabs.org@gcc.gnu.org>
Series	Fix profile update in tree_transform_and_unroll_loop \| expand Fix profile update in tree_transform_and_unroll_loop

diff --git a/gcc/cfgloopmanip.cc b/gcc/cfgloopmanip.cc index 3012a8d60f7..c3d292d0dd4 100644 --- a/gcc/cfgloopmanip.cc +++ b/gcc/cfgloopmanip.cc @@ -499,6 +499,32 @@ scale_loop_frequencies (class loop *loop, profile_probability p) free (bbs); } +/* Scales the frequencies of all basic blocks in LOOP that are strictly + dominated by BB by NUM/DEN. */ + +void +scale_dominated_blocks_in_loop (class loop *loop, basic_block bb, + profile_count num, profile_count den) +{ + basic_block son; + + if (!den.nonzero_p () && !(num == profile_count::zero ())) + return; + auto_vec <basic_block, 8> worklist; + worklist.safe_push (bb); + + while (!worklist.is_empty ()) + for (son = first_dom_son (CDI_DOMINATORS, worklist.pop ()); + son; + son = next_dom_son (CDI_DOMINATORS, son)) + { + if (!flow_bb_inside_loop_p (loop, son)) + continue; + son->count = son->count.apply_scale (num, den); + worklist.safe_push (son); + } +} + /* Scale profile in LOOP by P. If ITERATION_BOUND is not -1, scale even further if loop is predicted to iterate too many times. @@ -649,19 +675,9 @@ scale_loop_profile (class loop *loop, profile_probability p, if (other_edge && other_edge->dest == loop->latch) loop->latch->count -= new_exit_count - old_exit_count; else - { - basic_block *body = get_loop_body (loop); - profile_count new_count = exit_edge->src->count - new_exit_count; - profile_count old_count = exit_edge->src->count - old_exit_count; - - for (unsigned int i = 0; i < loop->num_nodes; i++) - if (body[i] != exit_edge->src - && dominated_by_p (CDI_DOMINATORS, body[i], exit_edge->src)) - body[i]->count = body[i]->count.apply_scale (new_count, - old_count); - - free (body); - } + scale_dominated_blocks_in_loop (loop, exit_edge->src, + exit_edge->src->count - new_exit_count, + exit_edge->src->count - old_exit_count); } else if (dump_file && (dump_flags & TDF_DETAILS)) { @@ -1237,6 +1253,7 @@ duplicate_loop_body_to_header_edge (class loop *loop, edge e, should've managed the flags so all except for original loop has won't exist set. */ scale_act = wanted_count.probability_in (count_in); + /* Now simulate the duplication adjustments and compute header frequency of the last copy. */ for (i = 0; i < ndupl; i++) @@ -1252,16 +1269,21 @@ duplicate_loop_body_to_header_edge (class loop *loop, edge e, profile_probability prob_pass_main = bitmap_bit_p (wont_exit, 0) ? prob_pass_wont_exit : prob_pass_thru; - profile_probability p = prob_pass_main; - profile_count scale_main_den = count_in; - for (i = 0; i < ndupl; i++) + if (!(flags & DLTHE_FLAG_FLAT_PROFILE)) { - scale_main_den += count_in.apply_probability (p); - p = p * scale_step[i]; + profile_probability p = prob_pass_main; + profile_count scale_main_den = count_in; + for (i = 0; i < ndupl; i++) + { + scale_main_den += count_in.apply_probability (p); + p = p * scale_step[i]; + } + /* If original loop is executed COUNT_IN times, the unrolled + loop will account SCALE_MAIN_DEN times. */ + scale_main = count_in.probability_in (scale_main_den); } - /* If original loop is executed COUNT_IN times, the unrolled - loop will account SCALE_MAIN_DEN times. */ - scale_main = count_in.probability_in (scale_main_den); + else + scale_main = profile_probability::always (); scale_act = scale_main * prob_pass_main; } else diff --git a/gcc/cfgloopmanip.h b/gcc/cfgloopmanip.h index 75b2a5e9b75..af6a29f70c4 100644 --- a/gcc/cfgloopmanip.h +++ b/gcc/cfgloopmanip.h @@ -32,6 +32,8 @@ enum field of newly create BB. */ #define DLTHE_FLAG_COMPLETTE_PEEL 4 /* Update frequencies expecting a complete peeling. */ +#define DLTHE_FLAG_FLAT_PROFILE 8 /* Profile is flat; do not reduce + count by unroll factor. */ extern edge mfb_kj_edge; extern bool remove_path (edge, bool * = NULL, bitmap = NULL); @@ -64,5 +66,7 @@ class loop * loop_version (class loop *, void *, profile_probability, profile_probability, profile_probability, profile_probability, bool); void adjust_loop_info_after_peeling (class loop *loop, int npeel, bool precise); +void scale_dominated_blocks_in_loop (class loop *loop, basic_block bb, + profile_count num, profile_count den); #endif /* GCC_CFGLOOPMANIP_H */ diff --git a/gcc/predict.cc b/gcc/predict.cc index 6777e6cb9c5..5a1a561cc24 100644 --- a/gcc/predict.cc +++ b/gcc/predict.cc @@ -790,7 +790,7 @@ dump_prediction (FILE *file, enum br_predictor predictor, int probability, { fprintf (file, " exec "); bb->count.dump (file); - if (e) + if (e && e->count ().initialized_p () && bb->count.to_gcov_type ()) { fprintf (file, " hit "); e->count ().dump (file); @@ -4634,43 +4634,6 @@ force_edge_cold (edge e, bool impossible) } } -/* Change E's probability to NEW_E_PROB, redistributing the probabilities - of other outgoing edges proportionally. - - Note that this function does not change the profile counts of any - basic blocks. The caller must do that instead, using whatever - information it has about the region that needs updating. */ - -void -change_edge_frequency (edge e, profile_probability new_e_prob) -{ - profile_probability old_e_prob = e->probability; - profile_probability old_other_prob = old_e_prob.invert (); - profile_probability new_other_prob = new_e_prob.invert (); - - e->probability = new_e_prob; - profile_probability cumulative_prob = new_e_prob; - - unsigned int num_other = EDGE_COUNT (e->src->succs) - 1; - edge other_e; - edge_iterator ei; - FOR_EACH_EDGE (other_e, ei, e->src->succs) - if (other_e != e) - { - num_other -= 1; - if (num_other == 0) - /* Ensure that the probabilities add up to 1 without - rounding error. */ - other_e->probability = cumulative_prob.invert (); - else - { - other_e->probability /= old_other_prob; - other_e->probability *= new_other_prob; - cumulative_prob += other_e->probability; - } - } -} - #if CHECKING_P namespace selftest { diff --git a/gcc/predict.h b/gcc/predict.h index 4864b7d7113..42b7c00c46a 100644 --- a/gcc/predict.h +++ b/gcc/predict.h @@ -100,7 +100,6 @@ extern void rebuild_frequencies (void); extern void report_predictor_hitrates (void); extern void force_edge_cold (edge, bool); extern void propagate_unlikely_bbs_forward (void); -extern void change_edge_frequency (edge, profile_probability); extern void add_reg_br_prob_note (rtx_insn *, profile_probability); diff --git a/gcc/testsuite/gcc.dg/pr102385.c b/gcc/testsuite/gcc.dg/pr102385.c index 1339540c722..bdccc9e43b1 100644 --- a/gcc/testsuite/gcc.dg/pr102385.c +++ b/gcc/testsuite/gcc.dg/pr102385.c @@ -1,4 +1,4 @@ -/* { dg-options "-Wall -Wextra -O2 -fno-toplevel-reorder -fno-tree-ch -fno-tree-dce -fno-tree-dominator-opts -fno-tree-dse -fno-tree-loop-ivcanon -fpredictive-commoning" } */ +/* { dg-options "-Wall -Wextra -O2 -fno-toplevel-reorder -fno-tree-ch -fno-tree-dce -fno-tree-dominator-opts -fno-tree-dse -fno-tree-loop-ivcanon -fpredictive-commoning -fdump-tree-pcom-details-blocks -fdump-tree-lim-details-blocks" } */ short a, b; int c[9]; @@ -12,3 +12,5 @@ void e() { } } int main() {return 0;} +/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "lim2" } } */ diff --git a/gcc/testsuite/gcc.dg/pr96931.c b/gcc/testsuite/gcc.dg/pr96931.c index 94b8a1128ee..660391588ff 100644 --- a/gcc/testsuite/gcc.dg/pr96931.c +++ b/gcc/testsuite/gcc.dg/pr96931.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O1 -fpredictive-commoning -fno-tree-loop-im" } */ +/* { dg-options "-O1 -fpredictive-commoning -fno-tree-loop-im -fdump-tree-pcom-details-blocks" } */ int bl; @@ -17,3 +17,4 @@ ie (void) ie (); } +/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/predcom-1.c b/gcc/testsuite/gcc.dg/tree-ssa/predcom-1.c index 8c3d9a4fc58..d0a5318e351 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/predcom-1.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/predcom-1.c @@ -1,6 +1,6 @@ /* { dg-do compile } */ /* { dg-do run } */ -/* { dg-options "-O2 -fno-tree-vectorize -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details" } */ +/* { dg-options "-O2 -fno-tree-vectorize -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */ void abort (void); @@ -47,3 +47,4 @@ int main(void) /* Also check that we undid the transformation previously made by PRE. */ /* { dg-final { scan-tree-dump-times "looparound ref" 1 "pcom" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/predcom-2.c b/gcc/testsuite/gcc.dg/tree-ssa/predcom-2.c index 1c54679c022..f19edd4cd74 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/predcom-2.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/predcom-2.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details -fno-tree-pre" } */ +/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details-blocks -fno-tree-pre" } */ /* { dg-additional-options "-fno-tree-vectorize" { target amdgcn-*-* } } */ void abort (void); @@ -44,3 +44,4 @@ int main(void) /* Verify that both loops were transformed and unrolled. */ /* { dg-final { scan-tree-dump-times "Unrolling 2 times." 2 "pcom"} } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/predcom-3.c b/gcc/testsuite/gcc.dg/tree-ssa/predcom-3.c index 9abbe6c1380..aaee3b7ff9e 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/predcom-3.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/predcom-3.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details -fno-tree-pre -fno-tree-loop-vectorize" } */ +/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details-blocks -fno-tree-pre -fno-tree-loop-vectorize" } */ int a[1000], b[1000]; @@ -13,3 +13,4 @@ void test(void) /* Verify that we used 3 temporary variables for the loop. */ /* { dg-final { scan-tree-dump-times "Unrolling 3 times." 1 "pcom"} } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/predcom-4.c b/gcc/testsuite/gcc.dg/tree-ssa/predcom-4.c index 382a464cef7..af9ae0e0f3d 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/predcom-4.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/predcom-4.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details" } */ +/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */ /* Test for predictive commoning of expressions, without reassociation. */ @@ -26,3 +26,4 @@ int main(void) /* { dg-final { scan-tree-dump-times "Combination" 1 "pcom"} } */ /* { dg-final { scan-tree-dump-times "Unrolling 3 times." 1 "pcom"} } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/predcom-5.c b/gcc/testsuite/gcc.dg/tree-ssa/predcom-5.c index a3ee1d946a7..52adb59d669 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/predcom-5.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/predcom-5.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details" } */ +/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */ /* Test for predictive commoning of expressions, with reassociation. */ @@ -26,3 +26,4 @@ int main(void) /* { dg-final { scan-tree-dump-times "Combination" 2 "pcom"} } */ /* { dg-final { scan-tree-dump-times "Unrolling 3 times." 1 "pcom"} } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/predcom-7.c b/gcc/testsuite/gcc.dg/tree-ssa/predcom-7.c index 683fb9b5d35..99939767562 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/predcom-7.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/predcom-7.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O3 -fdump-tree-pcom-details" } */ +/* { dg-options "-O3 -fdump-tree-pcom-details-blocks" } */ int b, f, d[5][2]; unsigned int c; @@ -15,3 +15,7 @@ main () } /* { dg-final { scan-tree-dump "Executing predictive commoning" "pcom" } } */ +/* dom pass introduces one mismatch after simplfying mispredicted conditional + on c being non-zero on first iteration. This happens since c is global variable + and needs alias analysis. */ +/* { dg-final { scan-tree-dump-times "Invalid sum" 1 "pcom" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/predcom-8.c b/gcc/testsuite/gcc.dg/tree-ssa/predcom-8.c index c4562768398..dcddf573145 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/predcom-8.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/predcom-8.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O3 -fdump-tree-pcom-details" } */ +/* { dg-options "-O3 -fdump-tree-pcom-details-blocks" } */ int is_sorted(int *a, int n) { @@ -10,3 +10,4 @@ int is_sorted(int *a, int n) } /* { dg-final { scan-tree-dump "Executing predictive commoning without unrolling" "pcom" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-1.c b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-1.c index b61b651fca8..a0a04a08c61 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-1.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-1.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details" } */ +/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */ int arr[105] = {2, 3, 5, 7, 11}; int result0[10] = {2, 3, 5, 7, 11}; @@ -60,3 +60,4 @@ int main (void) return 0; } /* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-10.c b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-10.c index bd5575d9502..f770a8ad812 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-10.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-10.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */ +/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */ int arr[105] = {2, 3, 5, 7, 11}; int result0[10] = {2, 3, 5, 7, 11}; @@ -41,4 +41,4 @@ int main (void) return 0; } /* { dg-final { scan-tree-dump-not "Store-stores chain" "pcom"} } */ - +/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-11.c b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-11.c index 9e496f68a12..ed2b96a0d1a 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-11.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-11.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */ +/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */ int arr[105] = {2, 3, 5, 7, 11}; int x[105] = {2, 3, 5, 7, 11}; @@ -48,4 +48,5 @@ int main (void) } /* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */ /* { dg-final { scan-tree-dump "Store-loads chain" "pcom"} } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-12.c b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-12.c index 510c600f574..2487c1c8205 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-12.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-12.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */ +/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */ int arr[105] = {2, 3, 5, 7, 11}; int result0[10] = {2, 3, 5, 7, 11}; @@ -65,3 +65,4 @@ int main (void) return 0; } /* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-2.c b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-2.c index 7f959a833f4..020ca705790 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-2.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-2.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details" } */ +/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */ int arr[105] = {2, 3, 5, 7, 11}; int result0[10] = {2, 3, 5, 7, 11}; @@ -60,3 +60,4 @@ int main (void) return 0; } /* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-3.c b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-3.c index 1fc8f089345..667cc333d9f 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-3.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-3.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O2 -fno-tree-vectorize -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details" } */ +/* { dg-options "-O2 -fno-tree-vectorize -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */ int arr1[105] = {2, 3, 5, 7, 11, 13, 0}; int arr2[105] = {2, 3, 5, 7, 11, 13, 0}; @@ -106,3 +106,4 @@ int main (void) return 0; } /* { dg-final { scan-tree-dump-times "Store-stores chain" 4 "pcom"} } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-4.c b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-4.c index f97d32f4b41..8118461af0b 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-4.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-4.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details" } */ +/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */ int arr[105] = {2, 3, 5, 7, 11}; int result0[10] = {2, 3, 5, 7, 11}; @@ -59,3 +59,4 @@ int main (void) return 0; } /* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-5.c b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-5.c index a13d56098be..03fa646661e 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-5.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-5.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */ +/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */ int arr[105] = {2, 3, 5, 7, 11}; int result0[10] = {2, 3, 5, 7, 11}; @@ -61,3 +61,4 @@ int main (void) return 0; } /* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-6.c b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-6.c index 63d6c8f33b0..ab2fd403d30 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-6.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-6.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */ +/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */ int arr[105] = {2, 3, 5, 7, 11, 13, 17, 19}; int result0[10] = {2, 3, 5, 7, 11, 13, 17, 19}; @@ -63,3 +63,4 @@ int main (void) return 0; } /* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-7.c b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-7.c index 0bde6e6dced..c746ebd7155 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-7.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-7.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */ +/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */ int arr[105] = {2, 3, 5, 7, 11, 13, 17, 19}; int result0[10] = {2, 3, 5, 7, 11, 13, 17, 19}; @@ -61,3 +61,4 @@ int main (void) return 0; } /* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-8.c b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-8.c index 45ffd25c424..6c4e9afa487 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-8.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-8.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */ +/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */ int arr[105] = {2, 3, 5, 7, 11, 13, 17, 19}; int result0[10] = {2, 3, 5, 7, 11, 13, 17, 19}; @@ -58,3 +58,4 @@ int main (void) return 0; } +/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-9.c b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-9.c index 1c4e3140309..9c5e8ca9a79 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-9.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-9.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */ +/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */ int arr1[105] = {2, 3, 5, 7, 11, 13, 17, 19}; int arr2[105] = {2, 3, 5, 7, 11, 13, 17, 19}; @@ -88,3 +88,4 @@ int main (void) return 0; } /* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/update-unroll-1.c b/gcc/testsuite/gcc.dg/tree-ssa/update-unroll-1.c index 138448bac43..23dd5c0d570 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/update-unroll-1.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/update-unroll-1.c @@ -16,5 +16,4 @@ int foo(unsigned n) /* We used to make the probability that the body of the loop (unrolled to enable prefetching) is entered 0, which is not correct. */ -/* { dg-final { scan-tree-dump-not "Invalid sum" "aprefetch" { xfail *-*-* }} } */ /* { dg-final { scan-tree-dump-not "SUCC: 7 .100.0%" "aprefetch"} } */ diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc index b2d3dcf406d..8e3b1057b6f 100644 --- a/gcc/tree-ssa-loop-manip.cc +++ b/gcc/tree-ssa-loop-manip.cc @@ -1040,71 +1040,6 @@ determine_exit_conditions (class loop *loop, class tree_niter_desc *desc, *exit_bound = bound; } -/* Scales the frequencies of all basic blocks in LOOP that are strictly - dominated by BB by NUM/DEN. */ - -static void -scale_dominated_blocks_in_loop (class loop *loop, basic_block bb, - profile_count num, profile_count den) -{ - basic_block son; - - if (!den.nonzero_p () && !(num == profile_count::zero ())) - return; - - for (son = first_dom_son (CDI_DOMINATORS, bb); - son; - son = next_dom_son (CDI_DOMINATORS, son)) - { - if (!flow_bb_inside_loop_p (loop, son)) - continue; - scale_bbs_frequencies_profile_count (&son, 1, num, den); - scale_dominated_blocks_in_loop (loop, son, num, den); - } -} - -/* Return estimated niter for LOOP after unrolling by FACTOR times. */ - -gcov_type -niter_for_unrolled_loop (class loop *loop, unsigned factor) -{ - gcc_assert (factor != 0); - bool profile_p = false; - gcov_type est_niter = expected_loop_iterations_unbounded (loop, &profile_p); - /* Note that this is really CEIL (est_niter + 1, factor) - 1, where the - "+ 1" converts latch iterations to loop iterations and the "- 1" - converts back. */ - gcov_type new_est_niter = est_niter / factor; - - if (est_niter == -1) - return -1; - - /* Without profile feedback, loops for which we do not know a better estimate - are assumed to roll 10 times. When we unroll such loop, it appears to - roll too little, and it may even seem to be cold. To avoid this, we - ensure that the created loop appears to roll at least 5 times (but at - most as many times as before unrolling). Don't do adjustment if profile - feedback is present. */ - if (new_est_niter < 5 && !profile_p) - { - if (est_niter < 5) - new_est_niter = est_niter; - else - new_est_niter = 5; - } - - if (loop->any_upper_bound) - { - /* As above, this is really CEIL (upper_bound + 1, factor) - 1. */ - widest_int bound = wi::udiv_floor (loop->nb_iterations_upper_bound, - factor); - if (wi::ltu_p (bound, new_est_niter)) - new_est_niter = bound.to_uhwi (); - } - - return new_est_niter; -} - /* Unroll LOOP FACTOR times. LOOP is known to have a single exit edge whose source block dominates the latch. DESC describes the number of iterations of LOOP. @@ -1169,47 +1104,39 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor, transform_callback transform, void *data) { - gcov_type new_est_niter = niter_for_unrolled_loop (loop, factor); unsigned irr = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP; enum tree_code exit_cmp; tree enter_main_cond, exit_base, exit_step, exit_bound; + bool flat = maybe_flat_loop_profile (loop); determine_exit_conditions (loop, desc, factor, &enter_main_cond, &exit_base, &exit_step, &exit_cmp, &exit_bound); bool single_loop_p = !exit_base; - /* Let us assume that the unrolled loop is quite likely to be entered. */ - profile_probability prob_entry; - if (integer_nonzerop (enter_main_cond)) - prob_entry = profile_probability::always (); - else - prob_entry = profile_probability::guessed_always () - .apply_scale (PROB_UNROLLED_LOOP_ENTERED, 100); - gcond *exit_if = nullptr; class loop *new_loop = nullptr; edge new_exit; if (!single_loop_p) { - edge exit = single_dom_exit (loop); + profile_count entry_count = loop_preheader_edge (loop)->src->count; + /* Let us assume that the unrolled loop is quite likely to be entered. */ + profile_probability prob_entry; + if (integer_nonzerop (enter_main_cond)) + prob_entry = profile_probability::always (); + else + prob_entry = profile_probability::guessed_always () + .apply_scale (PROB_UNROLLED_LOOP_ENTERED, 100); + /* The values for scales should keep profile consistent, and somewhat - close to correct. - - TODO: The current value of SCALE_REST makes it appear that the loop - that is created by splitting the remaining iterations of the unrolled - loop is executed the same number of times as the original loop, and - with the same frequencies, which is obviously wrong. This does not - appear to cause problems, so we do not bother with fixing it for now. - To make the profile correct, we would need to change the probability - of the exit edge of the loop, and recompute the distribution of - frequencies in its body because of this change (scale the frequencies - of blocks before and after the exit by appropriate factors). */ - profile_probability scale_unrolled = prob_entry; + close to correct. */ new_loop = loop_version (loop, enter_main_cond, NULL, prob_entry, - prob_entry.invert (), scale_unrolled, - profile_probability::guessed_always (), + prob_entry.invert (), + prob_entry, + /* We will later redirect exit from vectorized + loop to new_loop. */ + profile_probability::always (), true); gcc_assert (new_loop != NULL); update_ssa (TODO_update_ssa_no_phi); @@ -1220,18 +1147,16 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor, edge precond_edge = single_pred_edge (rest); split_edge (loop_latch_edge (loop)); basic_block exit_bb = single_pred (loop->latch); + edge exit = single_dom_exit (loop); /* Since the exit edge will be removed, the frequency of all the blocks - in the loop that are dominated by it must be scaled by - 1 / (1 - exit->probability). */ + in the loop that are dominated by it must be scaled. */ if (exit->probability.initialized_p ()) scale_dominated_blocks_in_loop (loop, exit->src, /* We are scaling up here so probability does not fit. */ - loop->header->count, - loop->header->count - - loop->header->count.apply_probability - (exit->probability)); + exit->src->count, + exit->src->count - exit->count ()); gimple_stmt_iterator bsi = gsi_last_bb (exit_bb); exit_if = gimple_build_cond (EQ_EXPR, integer_zero_node, @@ -1243,14 +1168,14 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor, rescan_loop_exit (new_exit, true, false); /* Set the probability of new exit to the same of the old one. Fix - the frequency of the latch block, by scaling it back by - 1 - exit->probability. */ + the count of the latch block. */ new_exit->probability = exit->probability; edge new_nonexit = single_pred_edge (loop->latch); new_nonexit->probability = exit->probability.invert (); new_nonexit->flags = EDGE_TRUE_VALUE; - if (new_nonexit->probability.initialized_p ()) - scale_bbs_frequencies (&loop->latch, 1, new_nonexit->probability); + set_edge_probability_and_rescale_others + (exit, profile_probability::never ()); + loop->latch->count = new_nonexit->count (); edge old_entry = loop_preheader_edge (loop); edge new_entry = loop_preheader_edge (new_loop); @@ -1296,12 +1221,21 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor, } remove_path (exit); + /* We will later redirect exit from vectorized loop to new_loop. */ + loop_preheader_edge (new_loop)->src->count = entry_count; /* The epilog loop latch executes at most factor - 1 times. Since the epilog is entered unconditionally it will need to handle up to factor executions of its body. */ - new_loop->any_upper_bound = 1; + new_loop->any_upper_bound = true; new_loop->nb_iterations_upper_bound = factor - 1; + /* We do not really know estimate on number of iterations, since we do not + track any estimates modulo unroll factor. + Drop estimate from loop_info and scale loop profile. + It may be more realistic to scale loop profile to factor / 2 - 1, + but vectorizer also uses factor - 1. */ + new_loop->any_estimate = false; + scale_loop_profile (new_loop, profile_probability::always (), factor - 1); } else new_exit = single_dom_exit (loop); @@ -1318,10 +1252,10 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor, auto_vec<edge> to_remove; bool ok - = gimple_duplicate_loop_body_to_header_edge (loop, loop_latch_edge (loop), - factor - 1, wont_exit, - new_exit, &to_remove, - DLTHE_FLAG_UPDATE_FREQ); + = gimple_duplicate_loop_body_to_header_edge + (loop, loop_latch_edge (loop), factor - 1, wont_exit, + new_exit, &to_remove, + DLTHE_FLAG_UPDATE_FREQ | (flat ? DLTHE_FLAG_FLAT_PROFILE : 0)); gcc_assert (ok); for (edge e : to_remove) @@ -1332,36 +1266,25 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor, update_ssa (TODO_update_ssa); new_exit = single_dom_exit (loop); + + /* gimple_duplicate_loop_body_to_header_edge depending on + DLTHE_FLAG_UPDATE_FREQ either keeps original frequency of the loop header + or scales it down accordingly. + However exit edge probability is kept as original. Fix it if needed + and compensate. */ + profile_probability new_prob + = loop_preheader_edge + (loop)->count ().probability_in (new_exit->src->count); + if (!(new_prob == new_exit->probability)) + { + profile_count old_count = new_exit->src->count - new_exit->count (); + set_edge_probability_and_rescale_others (new_exit, new_prob); + profile_count new_count = new_exit->src->count - new_exit->count (); + scale_dominated_blocks_in_loop (loop, new_exit->src, + new_count, old_count); + } if (!single_loop_p) { - /* Ensure that the frequencies in the loop match the new estimated - number of iterations, and change the probability of the new - exit edge. */ - - profile_count freq_h = loop->header->count; - profile_count freq_e = (loop_preheader_edge (loop))->count (); - if (freq_h.nonzero_p ()) - { - /* Avoid dropping loop body profile counter to 0 because of zero - count in loop's preheader. */ - if (freq_h.nonzero_p () && !(freq_e == profile_count::zero ())) - freq_e = freq_e.force_nonzero (); - scale_loop_frequencies (loop, freq_e.probability_in (freq_h)); - } - - basic_block rest = new_exit->dest; - new_exit->probability - = (profile_probability::always () / (new_est_niter + 1)); - - rest->count += new_exit->count (); - - edge new_nonexit = single_pred_edge (loop->latch); - profile_probability prob = new_nonexit->probability; - new_nonexit->probability = new_exit->probability.invert (); - prob = new_nonexit->probability / prob; - if (prob.initialized_p ()) - scale_bbs_frequencies (&loop->latch, 1, prob); - /* Finally create the new counter for number of iterations and add the new exit instruction. */ tree ctr_before, ctr_after; @@ -1374,66 +1297,15 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor, gimple_cond_set_rhs (exit_if, exit_bound); update_stmt (exit_if); } - else - { - /* gimple_duplicate_loop_to_header_edge has adjusted the loop body's - original profile counts in line with the unroll factor. However, - the old counts might not have been consistent with the old - iteration count. - - Therefore, if the iteration count is known exactly, make sure that the - profile counts of the loop header (and any other blocks that might be - executed in the final iteration) are consistent with the combination - of (a) the incoming profile count and (b) the new iteration count. */ - profile_count in_count = loop_preheader_edge (loop)->count (); - profile_count old_header_count = loop->header->count; - if (in_count.nonzero_p () - && old_header_count.nonzero_p () - && TREE_CODE (desc->niter) == INTEGER_CST) - { - /* The + 1 converts latch counts to iteration counts. */ - profile_count new_header_count = in_count * (new_est_niter + 1); - basic_block *body = get_loop_body (loop); - scale_bbs_frequencies_profile_count (body, loop->num_nodes, - new_header_count, - old_header_count); - free (body); - } - - /* gimple_duplicate_loop_to_header_edge discarded FACTOR - 1 - exit edges and adjusted the loop body's profile counts for the - new probabilities of the remaining non-exit edges. However, - the remaining exit edge still has the same probability as it - did before, even though it is now more likely. - - Therefore, all blocks executed after a failed exit test now have - a profile count that is too high, and the sum of the profile counts - for the header's incoming edges is greater than the profile count - of the header itself. - - Adjust the profile counts of all code in the loop body after - the exit test so that the sum of the counts on entry to the - header agree. */ - profile_count old_latch_count = loop_latch_edge (loop)->count (); - profile_count new_latch_count = loop->header->count - in_count; - if (old_latch_count.nonzero_p () && new_latch_count.nonzero_p ()) - scale_dominated_blocks_in_loop (loop, new_exit->src, new_latch_count, - old_latch_count); - - /* Set the probability of the exit edge based on NEW_EST_NITER - (which estimates latch counts rather than iteration counts). - Update the probabilities of other edges to match. - - If the profile counts are large enough to give the required - precision, the updates above will have made - - e->dest->count / e->src->count ~= new e->probability - - for every outgoing edge e of NEW_EXIT->src. */ - profile_probability new_exit_prob - = profile_probability::always () / (new_est_niter + 1); - change_edge_frequency (new_exit, new_exit_prob); - } + if (loop->any_upper_bound) + loop->nb_iterations_upper_bound = wi::udiv_floor + (loop->nb_iterations_upper_bound + 1, factor) - 1; + if (loop->any_likely_upper_bound) + loop->nb_iterations_likely_upper_bound = wi::udiv_floor + (loop->nb_iterations_likely_upper_bound + 1, factor) - 1; + if (loop->any_estimate) + loop->nb_iterations_estimate = wi::udiv_floor + (loop->nb_iterations_estimate + 1, factor) - 1; checking_verify_flow_info (); checking_verify_loop_structure ();

Fix profile update in tree_transform_and_unroll_loop

Commit Message

Patch