===================================================================
@@ -795,6 +795,104 @@ gcov_sort_topn_counter_arrays (const struct gcov_i
}
}
+/* Used by qsort to sort gcov values in descending order. */
+
+static int
+sort_by_reverse_gcov_value (const void *pa, const void *pb)
+{
+ const gcov_type a = *(gcov_type const *)pa;
+ const gcov_type b = *(gcov_type const *)pb;
+
+ if (b > a)
+ return 1;
+ else if (b == a)
+ return 0;
+ else
+ return -1;
+}
+
+/* Determines the number of counters required to cover a given percentage
+ of the total sum of execution counts in the summary, which is then also
+ recorded in SUM. */
+
+static void
+gcov_compute_cutoff_values (struct gcov_summary *sum)
+{
+ struct gcov_info *gi_ptr;
+ const struct gcov_fn_info *gfi_ptr;
+ const struct gcov_ctr_info *ci_ptr;
+ struct gcov_ctr_summary *cs_ptr;
+ unsigned t_ix, f_ix, i, ctr_info_ix, index;
+ gcov_unsigned_t c_num;
+ gcov_type *value_array;
+ gcov_type cum, cum_cutoff;
+ char *cutoff_str;
+ unsigned cutoff_perc;
+
+#define CUM_CUTOFF_PERCENT_TIMES_10 999
+ cutoff_str = getenv ("GCOV_HOTCODE_CUTOFF_TIMES_10");
+ if (cutoff_str && strlen (cutoff_str))
+ cutoff_perc = atoi (cutoff_str);
+ else
+ cutoff_perc = CUM_CUTOFF_PERCENT_TIMES_10;
+
+ for (t_ix = 0; t_ix < GCOV_COUNTERS_SUMMABLE; t_ix++)
+ {
+ /* First check if there are any counts recorded for this counter. */
+ cs_ptr = &(sum->ctrs[t_ix]);
+ if (!cs_ptr->num)
+ continue;
+
+ /* Determine the cumulative counter value at the specified cutoff
+ percentage and record the percentage for use by gcov consumers. */
+ cum_cutoff = (cs_ptr->sum_all * cutoff_perc)/1000;
+ cs_ptr->sum_cutoff_percent = cutoff_perc;
+
+ /* Next, walk through all the per-object structures and save each of
+ the count values in value_array. */
+ index = 0;
+ value_array = (gcov_type *) malloc (sizeof(gcov_type)*cs_ptr->num);
+ for (gi_ptr = __gcov_list; gi_ptr; gi_ptr = gi_ptr->next)
+ {
+ /* Find the appropriate index into the gcov_ctr_info array
+ for the counter we are currently working on based on the
+ existence of the merge function pointer for this object. */
+ for (i = 0, ctr_info_ix = 0; i < t_ix; i++)
+ {
+ if (gi_ptr->merge[i])
+ ctr_info_ix++;
+ }
+ for (f_ix = 0; f_ix != gi_ptr->n_functions; f_ix++)
+ {
+ gfi_ptr = gi_ptr->functions[f_ix];
+
+ if (!gfi_ptr || gfi_ptr->key != gi_ptr)
+ continue;
+
+ ci_ptr = &gfi_ptr->ctrs[ctr_info_ix];
+	  /* Sanity check that there are enough entries in value_array
+ for this function's counters. */
+ gcc_assert (index + ci_ptr->num <= cs_ptr->num);
+ /* Copy over this function's counter values. */
+ memcpy (&value_array[index], ci_ptr->values,
+ sizeof (gcov_type) * ci_ptr->num);
+ index += ci_ptr->num;
+ }
+ }
+
+ /* Sort all the counter values by descending value and finally
+ accumulate the values from hottest on down until reaching
+ the cutoff value computed earlier. */
+ qsort (value_array, cs_ptr->num, sizeof (gcov_type),
+ sort_by_reverse_gcov_value);
+ for (cum = 0, c_num = 0; c_num < cs_ptr->num && cum < cum_cutoff; c_num++)
+ cum += value_array[c_num];
+ /* Record the number of counters required to reach the cutoff value. */
+ cs_ptr->num_to_cutoff = c_num;
+ free (value_array);
+ }
+}
+
/* Compute object summary recored in gcov_info INFO. The result is
stored in OBJ_SUM. Note that the caller is responsible for
zeroing out OBJ_SUM, otherwise the summary is accumulated. */
@@ -994,7 +1092,11 @@ rewrite:;
if (gi_ptr->merge[t_ix])
{
if (!cs_prg->runs++)
- cs_prg->num = cs_tprg->num;
+ {
+ cs_prg->num = cs_tprg->num;
+ cs_prg->sum_cutoff_percent = cs_tprg->sum_cutoff_percent;
+ cs_prg->num_to_cutoff = cs_tprg->num_to_cutoff;
+ }
cs_prg->sum_all += cs_tprg->sum_all;
if (cs_prg->run_max < cs_tprg->run_max)
cs_prg->run_max = cs_tprg->run_max;
@@ -1160,6 +1262,7 @@ gcov_exit_init (void)
is FDO/LIPO. */
dump_module_info |= gi_ptr->mod_info->is_primary;
}
+ gcov_compute_cutoff_values (&this_program);
gcov_alloc_filename ();
===================================================================
@@ -392,7 +392,7 @@ Objective-C and Objective-C++ Dialects}.
-fno-sched-interblock -fno-sched-spec -fno-signed-zeros @gol
-fno-toplevel-reorder -fno-trapping-math -fno-zero-initialized-in-bss @gol
-fomit-frame-pointer -foptimize-register-move -foptimize-sibling-calls @gol
--fpartial-inlining -fpeel-loops -fpredictive-commoning @gol
+-fpartial-inlining -fpeel-codesize-limit -fpeel-loops -fpredictive-commoning @gol
-fprefetch-loop-arrays @gol
-fprofile-correction -fprofile-dir=@var{path} -fprofile-generate @gol
-fprofile-generate=@var{path} -fprofile-generate-sampling @gol
@@ -404,8 +404,8 @@ Objective-C and Objective-C++ Dialects}.
-freorder-blocks-and-partition -freorder-functions @gol
-frerun-cse-after-loop -freschedule-modulo-scheduled-loops @gol
-fripa -fripa-disallow-asm-modules -fripa-disallow-opt-mismatch @gol
--fripa-inc-path-sub=@var{path_mapping} -fripa-no-promote-always-inline-func -fripa-verbose @gol
--fripa-peel-size-limit -fripa-unroll-size-limit -frounding-math @gol
+-fripa-inc-path-sub=@var{path_mapping} -fripa-no-promote-always-inline-func @gol
+-fripa-verbose -frounding-math @gol
-fsched2-use-superblocks -fsched-pressure @gol
-fsched-spec-load -fsched-spec-load-dangerous @gol
-fsched-stalled-insns-dep[=@var{n}] -fsched-stalled-insns[=@var{n}] @gol
@@ -428,7 +428,7 @@ Objective-C and Objective-C++ Dialects}.
-ftree-parallelize-loops=@var{n} -ftree-pre -ftree-pta -ftree-reassoc @gol
-ftree-sink -ftree-sra -ftree-switch-conversion -ftree-tail-merge @gol
-ftree-ter -ftree-vect-loop-version -ftree-vectorize -ftree-vrp @gol
--funit-at-a-time -funroll-all-loops -funroll-loops @gol
+-funit-at-a-time -funroll-all-loops -funroll-loops -funroll-codesize-limit @gol
-funsafe-loop-optimizations -funsafe-math-optimizations -funswitch-loops @gol
-fvariable-expansion-in-unroller -fvect-cost-model -fvpt -fweb @gol
-fwhole-program -fwpa -fuse-ld -fuse-linker-plugin @gol
@@ -8424,20 +8424,6 @@ Do not promote static functions with always inline
Enable printing of verbose information about dynamic inter-procedural optimizations.
This is used in conjunction with the @option{-fripa}.
-@item -fripa-peel-size-limit
-@opindex fripa-peel-size-limit
-Limit loop peeling of non-const non-FP loops in a LIPO compilation under estimates
-of a large code footprint. Enabled by default under @option{-fripa}. Code size
-estimation and thresholds are controlled by the @option{codesize-hotness-threshold}
-and @option{unrollpeel-codesize-threshold} parameters.
-
-@item -fripa-unroll-size-limit
-@opindex fripa-unroll-size-limit
-Limit loop unrolling of non-const non-FP loops in a LIPO compilation under estimates
-of a large code footprint. Enabled by default under @option{-fripa}. Code size
-estimation and thresholds are controlled by the @option{codesize-hotness-threshold}
-and @option{unrollpeel-codesize-threshold} parameters.
-
@item -fcallgraph-profiles-sections
@opindex fcallgraph-profiles-sections
Emit call graph edge profile counts in .note.callgraph.text sections. This is
@@ -8774,6 +8760,14 @@ the loop is entered. This usually makes programs
@option{-funroll-all-loops} implies the same options as
@option{-funroll-loops}.
+@item -funroll-codesize-limit
+@opindex funroll-codesize-limit
+Limit loop unrolling of non-const non-FP loops in a profile feedback compilation
+under estimates of a large code footprint. Enabled by default with
+@option{-fprofile-use}. Code size and execution weight thresholds are controlled
+by the @option{unrollpeel-codesize-threshold} and
+@option{unrollpeel-hotness-threshold} parameters.
+
@item -fpeel-loops
@opindex fpeel-loops
Peels loops for which there is enough information that they do not
@@ -8782,6 +8776,14 @@ roll much (from profile feedback). It also turns
Enabled with @option{-fprofile-use}.
+@item -fpeel-codesize-limit
+@opindex fpeel-codesize-limit
+Limit loop peeling of non-const non-FP loops in a profile feedback compilation
+under estimates of a large code footprint. Enabled by default with
+@option{-fprofile-use}. Code size and execution weight thresholds are controlled
+by the @option{unrollpeel-codesize-threshold} and
+@option{unrollpeel-hotness-threshold} parameters.
+
@item -fmove-loop-invariants
@opindex fmove-loop-invariants
Enables the loop invariant motion pass in the RTL loop optimizer. Enabled
@@ -9168,13 +9170,15 @@ hot loops. Its default value is 16.
@item max-completely-peel-loop-nest-depth
The maximum depth of a loop nest suitable for complete peeling.
-@item codesize-hotness-threshold
-The minimum profile count of basic blocks to look at when estimating
-the code size footprint of the call graph in a LIPO compile.
-
@item unrollpeel-codesize-threshold
-Maximum LIPO code size footprint estimate for loop unrolling and peeling.
+Maximum profile-based code size footprint estimate for loop unrolling and
+peeling.
+@item unrollpeel-hotness-threshold
+Maximum ratio of total execution count to loop entry block count under which
+most profile-based code size estimates will be ignored for loop unrolling and
+peeling.
+
@item max-unswitch-insns
The maximum number of insns of an unswitched loop.
===================================================================
@@ -520,6 +520,8 @@ gcov_write_summary (gcov_unsigned_t tag, const str
for (csum = summary->ctrs, ix = GCOV_COUNTERS_SUMMABLE; ix--; csum++)
{
gcov_write_unsigned (csum->num);
+ gcov_write_unsigned (csum->sum_cutoff_percent);
+ gcov_write_unsigned (csum->num_to_cutoff);
gcov_write_unsigned (csum->runs);
gcov_write_counter (csum->sum_all);
gcov_write_counter (csum->run_max);
@@ -637,6 +639,8 @@ gcov_read_summary (struct gcov_summary *summary)
for (csum = summary->ctrs, ix = GCOV_COUNTERS_SUMMABLE; ix--; csum++)
{
csum->num = gcov_read_unsigned ();
+ csum->sum_cutoff_percent = gcov_read_unsigned ();
+ csum->num_to_cutoff = gcov_read_unsigned ();
csum->runs = gcov_read_unsigned ();
csum->sum_all = gcov_read_counter ();
csum->run_max = gcov_read_counter ();
===================================================================
@@ -440,7 +440,7 @@ typedef HOST_WIDEST_INT gcov_type;
#define GCOV_TAG_OBJECT_SUMMARY ((gcov_unsigned_t)0xa1000000) /* Obsolete */
#define GCOV_TAG_PROGRAM_SUMMARY ((gcov_unsigned_t)0xa3000000)
#define GCOV_TAG_SUMMARY_LENGTH \
- (1 + GCOV_COUNTERS_SUMMABLE * (2 + 3 * 2))
+ (1 + GCOV_COUNTERS_SUMMABLE * (4 + 3 * 2))
#define GCOV_TAG_MODULE_INFO ((gcov_unsigned_t)0xa4000000)
#define GCOV_TAG_PMU_LOAD_LATENCY_INFO ((gcov_unsigned_t)0xa5000000)
#define GCOV_TAG_PMU_LOAD_LATENCY_LENGTH(filename) \
@@ -541,6 +541,9 @@ typedef HOST_WIDEST_INT gcov_type;
struct gcov_ctr_summary
{
gcov_unsigned_t num; /* number of counters. */
+ gcov_unsigned_t sum_cutoff_percent;/* sum_all cutoff percentage computed
+ (times 10). */
+ gcov_unsigned_t num_to_cutoff;/* number of counters to reach above cutoff. */
gcov_unsigned_t runs; /* number of program runs */
gcov_type sum_all; /* sum of all counters accumulated. */
gcov_type run_max; /* maximum value on a single run. */
===================================================================
@@ -35,6 +35,7 @@ along with GCC; see the file COPYING3. If not see
#include "recog.h"
#include "target.h"
#include "diagnostic.h"
+#include "gcov-io.h"
/* This pass performs loop unrolling and peeling. We only perform these
optimizations on innermost loops (with single exception) because
@@ -128,12 +129,12 @@ struct opt_info
static void decide_unrolling_and_peeling (int);
static void peel_loops_completely (int);
-static void decide_peel_simple (struct loop *, int);
+static void decide_peel_simple (struct loop *, int, int);
static void decide_peel_once_rolling (struct loop *, int);
static void decide_peel_completely (struct loop *, int);
-static void decide_unroll_stupid (struct loop *, int);
+static void decide_unroll_stupid (struct loop *, int, int);
static void decide_unroll_constant_iterations (struct loop *, int);
-static void decide_unroll_runtime_iterations (struct loop *, int);
+static void decide_unroll_runtime_iterations (struct loop *, int, int);
static void peel_loop_simple (struct loop *);
static void peel_loop_completely (struct loop *);
static void unroll_loop_stupid (struct loop *);
@@ -189,36 +190,75 @@ typedef enum {
LIMIT_BOTH = 0x3
} limit_type;
-extern int cgraph_codesize_estimate;
-
/* Determine whether LOOP unrolling/peeling should be constrained based
- on code footprint estimates. */
+ on code footprint estimates. If so, sets *CODESIZE_DIVISOR to the
+ codesize-based amount divided into the max instructions in an unrolled
+ or peeled loop. */
static limit_type
-limit_code_size(struct loop *loop)
+limit_code_size(struct loop *loop, int *codesize_divisor)
{
unsigned size_threshold;
limit_type result = NO_LIMIT;
int result_int = 0;
struct niter_desc *desc = get_simple_loop_desc (loop);
+ gcov_type sum_to_header_ratio;
+ int hotness_ratio_threshold;
- if (!flag_dyn_ipa)
+ /* The codesize-based unroll factor divisor is initialized to 1 so that
+ unroll/peel factors are not reduced by default. */
+ *codesize_divisor = 1;
+
+ if (!flag_unroll_codesize_limit && !flag_peel_codesize_limit)
return NO_LIMIT;
- gcc_assert (cgraph_codesize_estimate >= 0);
+ /* First check if the application has a large codesize footprint.
+ This is estimated from FDO profile summary information for the
+ program, where the num_to_cutoff indicates the number of hottest
+ counters (blocks) that compose most of the execution time of
+ the program. A large value would indicate a large flat execution
+ profile where icache misses may be a concern. */
+ size_threshold = PARAM_VALUE (PARAM_UNROLLPEEL_CODESIZE_THRESHOLD);
+ if (!profile_info
+ || profile_info->num_to_cutoff <= size_threshold)
+ return NO_LIMIT;
+ /* Next, exclude some loops where unrolling/peeling may be more
+ important to overall performance. */
+
/* Ignore FP loops, which are more likely to benefit heavily from
unrolling. */
if (desc->has_fp)
return NO_LIMIT;
- size_threshold = PARAM_VALUE (PARAM_UNROLLPEEL_CODESIZE_THRESHOLD);
- if (cgraph_codesize_estimate <= (int)size_threshold)
- return NO_LIMIT;
+ gcc_assert(profile_info->sum_all > 0);
+ /* Next, set the value of the codesize-based unroll factor divisor which in
+ most loops will need to be set to a value that will reduce or eliminate
+ unrolling/peeling. */
+ if (profile_info->num_to_cutoff < size_threshold*2) {
+    /* For applications that are less than twice the codesize limit, allow
+ limited unrolling for very hot loops. */
+ sum_to_header_ratio = profile_info->sum_all / loop->header->count;
+ hotness_ratio_threshold = PARAM_VALUE (PARAM_UNROLLPEEL_HOTNESS_THRESHOLD);
+ /* When the profile count sum to loop entry header ratio is smaller than
+       the threshold (i.e. the loop entry is hot enough), the divisor is left
+ at its default of 1 so the unroll/peel factor is not reduced. When it
+ is bigger than the ratio, increase the divisor by the amount this ratio
+ is over the threshold, which will quickly reduce the unroll/peel
+ factor to zero as the loop's hotness reduces. */
+ if (sum_to_header_ratio > hotness_ratio_threshold) {
+ *codesize_divisor = sum_to_header_ratio / hotness_ratio_threshold;
+ gcc_assert (*codesize_divisor >= 1);
+ }
+ }
+ else
+    /* For applications that are at least twice the codesize limit, set
+ the divisor to a large value that will force the unroll factor to 0. */
+ *codesize_divisor = INT_MAX;
- if (flag_ripa_peel_size_limit)
+ if (flag_peel_codesize_limit)
result_int |= LIMIT_PEEL;
- if (flag_ripa_unroll_size_limit)
+ if (flag_unroll_codesize_limit)
result_int |= LIMIT_UNROLL;
result = (limit_type)result_int;
@@ -443,6 +483,7 @@ decide_unrolling_and_peeling (int flags)
loop_iterator li;
location_t locus;
limit_type limit;
+ int codesize_divisor = 1;
/* Scan the loops, inner ones first. */
FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
@@ -484,22 +525,23 @@ decide_unrolling_and_peeling (int flags)
loop->av_ninsns = average_num_loop_insns (loop);
/* Determine whether to limit code size growth from unrolling and
- peeling. This is currently enabled only under LIPO (dynamic IPA)
- where we have a partial call graph. It is not applied to loops
+ peeling, using FDO profile summary information that gives an
+ estimated number of executed blocks. It is not applied to loops
with constant trip counts, as it is easier to determine the
profitability of unrolling and peeling such loops. */
- limit = limit_code_size(loop);
+ limit = limit_code_size(loop, &codesize_divisor);
if (limit != NO_LIMIT)
{
if (dump_file)
{
fprintf (dump_file, ";; Due to large code size footprint estimate, limit ");
if (limit == (LIMIT_UNROLL|LIMIT_PEEL))
- fprintf (dump_file, "unrolling and peeling\n");
+ fprintf (dump_file, "unrolling and peeling");
else if (limit == LIMIT_UNROLL)
- fprintf (dump_file, "unrolling\n");
+ fprintf (dump_file, "unrolling");
else
- fprintf (dump_file, "peeling\n");
+ fprintf (dump_file, "peeling");
+ fprintf (dump_file, " max insns by divisor %d\n", codesize_divisor);
}
}
@@ -507,15 +549,15 @@ decide_unrolling_and_peeling (int flags)
priority. */
decide_unroll_constant_iterations (loop, flags);
- if (loop->lpt_decision.decision == LPT_NONE
- && !(limit & LIMIT_UNROLL))
- decide_unroll_runtime_iterations (loop, flags);
- if (loop->lpt_decision.decision == LPT_NONE
- && !(limit & LIMIT_UNROLL))
- decide_unroll_stupid (loop, flags);
- if (loop->lpt_decision.decision == LPT_NONE
- && !(limit & LIMIT_PEEL))
- decide_peel_simple (loop, flags);
+ if (loop->lpt_decision.decision == LPT_NONE)
+ decide_unroll_runtime_iterations (loop, flags, (limit & LIMIT_UNROLL)
+ ? codesize_divisor : 1);
+ if (loop->lpt_decision.decision == LPT_NONE)
+ decide_unroll_stupid (loop, flags, (limit & LIMIT_UNROLL)
+ ? codesize_divisor : 1);
+ if (loop->lpt_decision.decision == LPT_NONE)
+ decide_peel_simple (loop, flags, (limit & LIMIT_PEEL)
+ ? codesize_divisor : 1);
if (flag_opt_info >= OPT_INFO_MIN
&& loop->lpt_decision.decision != LPT_NONE)
@@ -1049,7 +1091,8 @@ unroll_loop_constant_iterations (struct loop *loop
/* Decide whether to unroll LOOP iterating runtime computable number of times
and how much. */
static void
-decide_unroll_runtime_iterations (struct loop *loop, int flags)
+decide_unroll_runtime_iterations (struct loop *loop, int flags,
+ int codesize_divisor)
{
unsigned nunroll, nunroll_by_av, nunroll_branches, i;
struct niter_desc *desc;
@@ -1067,8 +1110,10 @@ static void
/* nunroll = total number of copies of the original loop body in
unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
- nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
- nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
+ nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / codesize_divisor
+ / loop->ninsns;
+ nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS)
+ / codesize_divisor / loop->av_ninsns;
if (nunroll > nunroll_by_av)
nunroll = nunroll_by_av;
if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
@@ -1455,7 +1500,7 @@ unroll_loop_runtime_iterations (struct loop *loop)
/* Decide whether to simply peel LOOP and how much. */
static void
-decide_peel_simple (struct loop *loop, int flags)
+decide_peel_simple (struct loop *loop, int flags, int codesize_divisor)
{
unsigned npeel;
struct niter_desc *desc;
@@ -1470,7 +1515,8 @@ static void
fprintf (dump_file, "\n;; Considering simply peeling loop\n");
/* npeel = number of iterations to peel. */
- npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
+ npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / codesize_divisor
+ / loop->ninsns;
if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
@@ -1608,7 +1654,7 @@ peel_loop_simple (struct loop *loop)
/* Decide whether to unroll LOOP stupidly and how much. */
static void
-decide_unroll_stupid (struct loop *loop, int flags)
+decide_unroll_stupid (struct loop *loop, int flags, int codesize_divisor)
{
unsigned nunroll, nunroll_by_av, i;
struct niter_desc *desc;
@@ -1624,9 +1670,10 @@ static void
/* nunroll = total number of copies of the original loop body in
unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
- nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
- nunroll_by_av
- = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
+ nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / codesize_divisor
+ / loop->ninsns;
+ nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS)
+ / codesize_divisor / loop->av_ninsns;
if (nunroll > nunroll_by_av)
nunroll = nunroll_by_av;
if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
===================================================================
@@ -468,6 +468,9 @@ read_counts_file (const char *da_file_name, unsign
gcov_read_summary (&sum);
for (ix = 0; ix != GCOV_COUNTERS_SUMMABLE; ix++)
{
+ summary.ctrs[ix].sum_cutoff_percent
+ = sum.ctrs[ix].sum_cutoff_percent;
+ summary.ctrs[ix].num_to_cutoff += sum.ctrs[ix].num_to_cutoff;
summary.ctrs[ix].runs += sum.ctrs[ix].runs;
summary.ctrs[ix].sum_all += sum.ctrs[ix].sum_all;
if (summary.ctrs[ix].run_max < sum.ctrs[ix].run_max)
===================================================================
@@ -1139,14 +1139,6 @@ Common Report Var(flag_ripa_no_promote_always_inli
Don't promote always inline static functions assuming they
will be inlined and no copy is needed.
-fripa-peel-size-limit
-Common Report Var(flag_ripa_peel_size_limit) Init(1) Optimization
-Limit non-const non-FP loop peeling under dynamic IPA estimates of large code footprint
-
-fripa-unroll-size-limit
-Common Report Var(flag_ripa_unroll_size_limit) Init(1) Optimization
-Limit non-const non-FP loop unrolling under dynamic IPA estimates of large code footprint
-
fripa-inc-path-sub=
Common Joined RejectNegative Var(lipo_inc_path_pattern)
Substitute substring in include paths with a new string to allow reuse profile data
@@ -1639,6 +1631,10 @@ fpcc-struct-return
Common Report Var(flag_pcc_struct_return,1) Init(DEFAULT_PCC_STRUCT_RETURN)
Return small aggregates in memory, not registers
+fpeel-codesize-limit
+Common Report Var(flag_peel_codesize_limit) Init(1) Optimization
+Limit non-const non-FP loop peeling under profile estimates of large code footprint
+
fpeel-loops
Common Report Var(flag_peel_loops) Optimization
Perform loop peeling
@@ -2216,6 +2212,10 @@ funroll-all-loops
Common Report Var(flag_unroll_all_loops) Optimization
Perform loop unrolling for all loops
+funroll-codesize-limit
+Common Report Var(flag_unroll_codesize_limit) Init(1) Optimization
+Limit non-const non-FP loop unrolling under profile estimates of large code footprint
+
; Nonzero means that loop optimizer may assume that the induction variables
; that control loops do not overflow and that the loops with nontrivial
; exit condition are not infinite
===================================================================
@@ -46,7 +46,6 @@ along with GCC; see the file COPYING3. If not see
#include "except.h"
#include "plugin.h"
#include "regset.h" /* FIXME: For reg_obstack. */
-#include "params.h"
/* Gate: execute, or not, all of the non-trivial optimizations. */
@@ -149,49 +148,6 @@ struct gimple_opt_pass pass_all_early_optimization
}
};
-int cgraph_codesize_estimate = -1;
-
-/* Estimate the code size from the dynamic IPA call graph. */
-static void
-compute_codesize_estimate(void)
-{
- struct cgraph_node *node;
- basic_block bb;
- gimple_stmt_iterator bsi;
- struct function *my_function;
-
- if (!flag_dyn_ipa)
- return;
-
- if (cgraph_codesize_estimate >= 0)
- return;
- cgraph_codesize_estimate = 0;
-
- for (node = cgraph_nodes; node; node = node->next)
- {
- if (node->count == 0)
- continue;
-
- if (!gimple_has_body_p(node->decl))
- continue;
-
- my_function = DECL_STRUCT_FUNCTION (node->decl);
-
- FOR_EACH_BB_FN (bb, my_function)
- {
- if (bb->count < PARAM_VALUE (PARAM_CODESIZE_HOTNESS_THRESHOLD))
- continue;
- for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
- {
- gimple stmt = gsi_stmt (bsi);
- cgraph_codesize_estimate += estimate_num_insns (stmt, &eni_size_weights);
- }
- }
- }
- if (dump_file)
- fprintf (dump_file, "Code size estimate from cgraph: %d\n", cgraph_codesize_estimate);
-}
-
/* Pass: cleanup the CFG just before expanding trees to RTL.
This is just a round of label cleanups and case node grouping
because after the tree optimizers have run such cleanups may
@@ -202,8 +158,6 @@ execute_cleanup_cfg_post_optimizing (void)
{
unsigned int todo = 0;
- /* Estimate the code footprint for hot BBs before we enter RTL */
- compute_codesize_estimate();
if (cleanup_tree_cfg ())
todo |= TODO_update_ssa;
maybe_remove_unreachable_handlers ();
===================================================================
@@ -337,21 +337,23 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
"max-completely-peel-loop-nest-depth",
"The maximum depth of a loop nest we completely peel",
8, 0, 0)
-/* The minimum profile count of basic blocks to look at when estimating
- * the code size footprint of the call graph in a dynamic IPA compile. */
-DEFPARAM(PARAM_CODESIZE_HOTNESS_THRESHOLD,
- "codesize-hotness-threshold",
- "Minimum profile count of basic blocks counted towards dynamic IPA "
- "code size footprint estimate",
- 10000, 0, 0)
/* The maximum code size estimate under which loop unrolling and peeling
- * is allowed in a dynamic IPA compile. This currently applies to loops with
- * non-constant iteration counts and no floating point computations. */
+ * is allowed in a profile feedback compile. This currently applies to loops
+ * with non-constant iteration counts and no floating point computations. */
DEFPARAM(PARAM_UNROLLPEEL_CODESIZE_THRESHOLD,
"unrollpeel-codesize-threshold",
- "Maximum dynamic IPA code size footprint estimate for loop unrolling "
+ "Maximum profile-based code size footprint estimate for loop unrolling "
"and peeling",
- 10000, 0, 0)
+ 15000, 0, 0)
+/* The maximum ratio of total profiled execution counts to loop entry block
+ count that must be exceeded to ignore most code size limits when unrolling
+ and peeling. */
+DEFPARAM(PARAM_UNROLLPEEL_HOTNESS_THRESHOLD,
+ "unrollpeel-hotness-threshold",
+ "Maximum ratio of total profiled execution count to loop entry "
+ "block count under which most codesize limits for unrolling and "
+ "peeling will be ignored",
+ 100, 1, 0)
DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
"min-iter-unroll-with-branches",
===================================================================
@@ -526,8 +526,11 @@ tag_summary (const char *filename ATTRIBUTE_UNUSED
{
printf ("\n");
print_prefix (filename, 0, 0);
- printf ("\t\tcounts=%u, runs=%u",
- summary.ctrs[ix].num, summary.ctrs[ix].runs);
+ printf ("\t\tcounts=%u (cutoff pct*10=%u, num to cutoff=%u), runs=%u",
+ summary.ctrs[ix].num,
+ summary.ctrs[ix].sum_cutoff_percent,
+ summary.ctrs[ix].num_to_cutoff,
+ summary.ctrs[ix].runs);
printf (", sum_all=" HOST_WIDEST_INT_PRINT_DEC,
(HOST_WIDEST_INT)summary.ctrs[ix].sum_all);