===================================================================
@@ -795,6 +795,104 @@ gcov_sort_topn_counter_arrays (const struct gcov_i
}
}
+/* Used by qsort to sort gcov values in descending order. */
+
+static int
+sort_by_reverse_gcov_value (const void *pa, const void *pb)
+{
+ const gcov_type a = *(gcov_type const *)pa;
+ const gcov_type b = *(gcov_type const *)pb;
+
+ if (b > a)
+ return 1;
+ else if (b == a)
+ return 0;
+ else
+ return -1;
+}
+
+/* Determines the number of counters required to cover a given percentage
+ of the total sum of execution counts in the summary, which is then also
+ recorded in SUM. */
+
+static void
+gcov_compute_cutoff_values (struct gcov_summary *sum)
+{
+ struct gcov_info *gi_ptr;
+ const struct gcov_fn_info *gfi_ptr;
+ const struct gcov_ctr_info *ci_ptr;
+ struct gcov_ctr_summary *cs_ptr;
+ unsigned t_ix, f_ix, i, ctr_info_ix, index;
+ gcov_unsigned_t c_num;
+ gcov_type *value_array;
+ gcov_type cum, cum_cutoff;
+ char *cutoff_str;
+ unsigned cutoff_perc;
+
+#define CUM_CUTOFF_PERCENT_TIMES_10 999
+ cutoff_str = getenv ("GCOV_HOTCODE_CUTOFF_TIMES_10");
+ if (cutoff_str && strlen (cutoff_str))
+ cutoff_perc = atoi (cutoff_str);
+ else
+ cutoff_perc = CUM_CUTOFF_PERCENT_TIMES_10;
+
+ for (t_ix = 0; t_ix < GCOV_COUNTERS_SUMMABLE; t_ix++)
+ {
+ /* First check if there are any counts recorded for this counter. */
+ cs_ptr = &(sum->ctrs[t_ix]);
+ if (!cs_ptr->num)
+ continue;
+
+ /* Determine the cumulative counter value at the specified cutoff
+ percentage and record the percentage for use by gcov consumers. */
+ cum_cutoff = (cs_ptr->sum_all * cutoff_perc)/1000;
+ cs_ptr->sum_cutoff_percent = cutoff_perc;
+
+ /* Next, walk through all the per-object structures and save each of
+ the count values in value_array. */
+ index = 0;
+ value_array = (gcov_type *) malloc (sizeof(gcov_type)*cs_ptr->num);
+ for (gi_ptr = __gcov_list; gi_ptr; gi_ptr = gi_ptr->next)
+ {
+ /* Find the appropriate index into the gcov_ctr_info array
+ for the counter we are currently working on based on the
+ existence of the merge function pointer for this object. */
+ for (i = 0, ctr_info_ix = 0; i < t_ix; i++)
+ {
+ if (gi_ptr->merge[i])
+ ctr_info_ix++;
+ }
+ for (f_ix = 0; f_ix != gi_ptr->n_functions; f_ix++)
+ {
+ gfi_ptr = gi_ptr->functions[f_ix];
+
+ if (!gfi_ptr || gfi_ptr->key != gi_ptr)
+ continue;
+
+ ci_ptr = &gfi_ptr->ctrs[ctr_info_ix];
+	  /* Sanity check that there are enough entries in value_array
+ for this function's counters. */
+ gcc_assert (index + ci_ptr->num <= cs_ptr->num);
+ /* Copy over this function's counter values. */
+ memcpy (&value_array[index], ci_ptr->values,
+ sizeof (gcov_type) * ci_ptr->num);
+ index += ci_ptr->num;
+ }
+ }
+
+ /* Sort all the counter values by descending value and finally
+ accumulate the values from hottest on down until reaching
+ the cutoff value computed earlier. */
+ qsort (value_array, cs_ptr->num, sizeof (gcov_type),
+ sort_by_reverse_gcov_value);
+ for (cum = 0, c_num = 0; c_num < cs_ptr->num && cum < cum_cutoff; c_num++)
+ cum += value_array[c_num];
+ /* Record the number of counters required to reach the cutoff value. */
+ cs_ptr->num_to_cutoff = c_num;
+ free (value_array);
+ }
+}
+
/* Compute object summary recored in gcov_info INFO. The result is
stored in OBJ_SUM. Note that the caller is responsible for
zeroing out OBJ_SUM, otherwise the summary is accumulated. */
@@ -994,7 +1092,11 @@ rewrite:;
if (gi_ptr->merge[t_ix])
{
if (!cs_prg->runs++)
- cs_prg->num = cs_tprg->num;
+ {
+ cs_prg->num = cs_tprg->num;
+ cs_prg->sum_cutoff_percent = cs_tprg->sum_cutoff_percent;
+ cs_prg->num_to_cutoff = cs_tprg->num_to_cutoff;
+ }
cs_prg->sum_all += cs_tprg->sum_all;
if (cs_prg->run_max < cs_tprg->run_max)
cs_prg->run_max = cs_tprg->run_max;
@@ -1160,6 +1262,7 @@ gcov_exit_init (void)
is FDO/LIPO. */
dump_module_info |= gi_ptr->mod_info->is_primary;
}
+ gcov_compute_cutoff_values (&this_program);
gcov_alloc_filename ();
===================================================================
@@ -392,7 +392,7 @@ Objective-C and Objective-C++ Dialects}.
-fno-sched-interblock -fno-sched-spec -fno-signed-zeros @gol
-fno-toplevel-reorder -fno-trapping-math -fno-zero-initialized-in-bss @gol
-fomit-frame-pointer -foptimize-register-move -foptimize-sibling-calls @gol
--fpartial-inlining -fpeel-loops -fpredictive-commoning @gol
+-fpartial-inlining -fpeel-codesize-limit -fpeel-loops -fpredictive-commoning @gol
-fprefetch-loop-arrays @gol
-fprofile-correction -fprofile-dir=@var{path} -fprofile-generate @gol
-fprofile-generate=@var{path} -fprofile-generate-sampling @gol
@@ -404,8 +404,8 @@ Objective-C and Objective-C++ Dialects}.
-freorder-blocks-and-partition -freorder-functions @gol
-frerun-cse-after-loop -freschedule-modulo-scheduled-loops @gol
-fripa -fripa-disallow-asm-modules -fripa-disallow-opt-mismatch @gol
--fripa-inc-path-sub=@var{path_mapping} -fripa-no-promote-always-inline-func -fripa-verbose @gol
--fripa-peel-size-limit -fripa-unroll-size-limit -frounding-math @gol
+-fripa-inc-path-sub=@var{path_mapping} -fripa-no-promote-always-inline-func @gol
+-fripa-verbose -frounding-math @gol
-fsched2-use-superblocks -fsched-pressure @gol
-fsched-spec-load -fsched-spec-load-dangerous @gol
-fsched-stalled-insns-dep[=@var{n}] -fsched-stalled-insns[=@var{n}] @gol
@@ -428,7 +428,7 @@ Objective-C and Objective-C++ Dialects}.
-ftree-parallelize-loops=@var{n} -ftree-pre -ftree-pta -ftree-reassoc @gol
-ftree-sink -ftree-sra -ftree-switch-conversion -ftree-tail-merge @gol
-ftree-ter -ftree-vect-loop-version -ftree-vectorize -ftree-vrp @gol
--funit-at-a-time -funroll-all-loops -funroll-loops @gol
+-funit-at-a-time -funroll-all-loops -funroll-loops -funroll-codesize-limit @gol
-funsafe-loop-optimizations -funsafe-math-optimizations -funswitch-loops @gol
-fvariable-expansion-in-unroller -fvect-cost-model -fvpt -fweb @gol
-fwhole-program -fwpa -fuse-ld -fuse-linker-plugin @gol
@@ -8424,20 +8424,6 @@ Do not promote static functions with always inline
Enable printing of verbose information about dynamic inter-procedural optimizations.
This is used in conjunction with the @option{-fripa}.
-@item -fripa-peel-size-limit
-@opindex fripa-peel-size-limit
-Limit loop peeling of non-const non-FP loops in a LIPO compilation under estimates
-of a large code footprint. Enabled by default under @option{-fripa}. Code size
-estimation and thresholds are controlled by the @option{codesize-hotness-threshold}
-and @option{unrollpeel-codesize-threshold} parameters.
-
-@item -fripa-unroll-size-limit
-@opindex fripa-unroll-size-limit
-Limit loop unrolling of non-const non-FP loops in a LIPO compilation under estimates
-of a large code footprint. Enabled by default under @option{-fripa}. Code size
-estimation and thresholds are controlled by the @option{codesize-hotness-threshold}
-and @option{unrollpeel-codesize-threshold} parameters.
-
@item -fcallgraph-profiles-sections
@opindex fcallgraph-profiles-sections
Emit call graph edge profile counts in .note.callgraph.text sections. This is
@@ -8774,6 +8760,14 @@ the loop is entered. This usually makes programs
@option{-funroll-all-loops} implies the same options as
@option{-funroll-loops}.
+@item -funroll-codesize-limit
+@opindex funroll-codesize-limit
+Limit loop unrolling of non-const non-FP loops in a profile feedback compilation
+under estimates of a large code footprint. Enabled by default with
+@option{-fprofile-use}. Code size and execution weight thresholds are controlled
+by the @option{unrollpeel-codesize-threshold} and
+@option{unrollpeel-hotness-threshold} parameters.
+
@item -fpeel-loops
@opindex fpeel-loops
Peels loops for which there is enough information that they do not
@@ -8782,6 +8776,14 @@ roll much (from profile feedback). It also turns
Enabled with @option{-fprofile-use}.
+@item -fpeel-codesize-limit
+@opindex fpeel-codesize-limit
+Limit loop peeling of non-const non-FP loops in a profile feedback compilation
+under estimates of a large code footprint. Enabled by default with
+@option{-fprofile-use}. Code size and execution weight thresholds are controlled
+by the @option{unrollpeel-codesize-threshold} and
+@option{unrollpeel-hotness-threshold} parameters.
+
@item -fmove-loop-invariants
@opindex fmove-loop-invariants
Enables the loop invariant motion pass in the RTL loop optimizer. Enabled
@@ -9168,13 +9170,15 @@ hot loops. Its default value is 16.
@item max-completely-peel-loop-nest-depth
The maximum depth of a loop nest suitable for complete peeling.
-@item codesize-hotness-threshold
-The minimum profile count of basic blocks to look at when estimating
-the code size footprint of the call graph in a LIPO compile.
-
@item unrollpeel-codesize-threshold
-Maximum LIPO code size footprint estimate for loop unrolling and peeling.
+Maximum profile-based code size footprint estimate for loop unrolling and
+peeling.
+@item unrollpeel-hotness-threshold
+Maximum ratio of total execution count to loop entry block count under which
+most profile-based code size estimates will be ignored for loop unrolling and
+peeling.
+
@item max-unswitch-insns
The maximum number of insns of an unswitched loop.
===================================================================
@@ -520,6 +520,8 @@ gcov_write_summary (gcov_unsigned_t tag, const str
for (csum = summary->ctrs, ix = GCOV_COUNTERS_SUMMABLE; ix--; csum++)
{
gcov_write_unsigned (csum->num);
+ gcov_write_unsigned (csum->sum_cutoff_percent);
+ gcov_write_unsigned (csum->num_to_cutoff);
gcov_write_unsigned (csum->runs);
gcov_write_counter (csum->sum_all);
gcov_write_counter (csum->run_max);
@@ -637,6 +639,8 @@ gcov_read_summary (struct gcov_summary *summary)
for (csum = summary->ctrs, ix = GCOV_COUNTERS_SUMMABLE; ix--; csum++)
{
csum->num = gcov_read_unsigned ();
+ csum->sum_cutoff_percent = gcov_read_unsigned ();
+ csum->num_to_cutoff = gcov_read_unsigned ();
csum->runs = gcov_read_unsigned ();
csum->sum_all = gcov_read_counter ();
csum->run_max = gcov_read_counter ();
===================================================================
@@ -440,7 +440,7 @@ typedef HOST_WIDEST_INT gcov_type;
#define GCOV_TAG_OBJECT_SUMMARY ((gcov_unsigned_t)0xa1000000) /* Obsolete */
#define GCOV_TAG_PROGRAM_SUMMARY ((gcov_unsigned_t)0xa3000000)
#define GCOV_TAG_SUMMARY_LENGTH \
- (1 + GCOV_COUNTERS_SUMMABLE * (2 + 3 * 2))
+ (1 + GCOV_COUNTERS_SUMMABLE * (4 + 3 * 2))
#define GCOV_TAG_MODULE_INFO ((gcov_unsigned_t)0xa4000000)
#define GCOV_TAG_PMU_LOAD_LATENCY_INFO ((gcov_unsigned_t)0xa5000000)
#define GCOV_TAG_PMU_LOAD_LATENCY_LENGTH(filename) \
@@ -541,6 +541,9 @@ typedef HOST_WIDEST_INT gcov_type;
struct gcov_ctr_summary
{
gcov_unsigned_t num; /* number of counters. */
+ gcov_unsigned_t sum_cutoff_percent;/* sum_all cutoff percentage computed
+ (times 10). */
+ gcov_unsigned_t num_to_cutoff;/* number of counters to reach above cutoff. */
gcov_unsigned_t runs; /* number of program runs */
gcov_type sum_all; /* sum of all counters accumulated. */
gcov_type run_max; /* maximum value on a single run. */
===================================================================
@@ -35,6 +35,7 @@ along with GCC; see the file COPYING3. If not see
#include "recog.h"
#include "target.h"
#include "diagnostic.h"
+#include "gcov-io.h"
/* This pass performs loop unrolling and peeling. We only perform these
optimizations on innermost loops (with single exception) because
@@ -128,12 +129,12 @@ struct opt_info
static void decide_unrolling_and_peeling (int);
static void peel_loops_completely (int);
-static void decide_peel_simple (struct loop *, int);
+static void decide_peel_simple (struct loop *, int, int);
static void decide_peel_once_rolling (struct loop *, int);
static void decide_peel_completely (struct loop *, int);
-static void decide_unroll_stupid (struct loop *, int);
+static void decide_unroll_stupid (struct loop *, int, int);
static void decide_unroll_constant_iterations (struct loop *, int);
-static void decide_unroll_runtime_iterations (struct loop *, int);
+static void decide_unroll_runtime_iterations (struct loop *, int, int);
static void peel_loop_simple (struct loop *);
static void peel_loop_completely (struct loop *);
static void unroll_loop_stupid (struct loop *);
@@ -189,36 +190,75 @@ typedef enum {
LIMIT_BOTH = 0x3
} limit_type;
-extern int cgraph_codesize_estimate;
-
/* Determine whether LOOP unrolling/peeling should be constrained based
- on code footprint estimates. */
+ on code footprint estimates. If so, sets *CODESIZE_DIVISOR to the
+ codesize-based amount divided into the max instructions in an unrolled
+ or peeled loop. */
static limit_type
-limit_code_size(struct loop *loop)
+limit_code_size(struct loop *loop, int *codesize_divisor)
{
unsigned size_threshold;
limit_type result = NO_LIMIT;
int result_int = 0;
struct niter_desc *desc = get_simple_loop_desc (loop);
+ gcov_type sum_to_header_ratio;
+ int hotness_ratio_threshold;
- if (!flag_dyn_ipa)
+ /* The codesize-based unroll factor divisor is initialized to 1 so that
+ unroll/peel factors are not reduced by default. */
+ *codesize_divisor = 1;
+
+ if (!flag_unroll_codesize_limit && !flag_peel_codesize_limit)
return NO_LIMIT;
- gcc_assert (cgraph_codesize_estimate >= 0);
+ /* First check if the application has a large codesize footprint.
+ This is estimated from FDO profile summary information for the
+ program, where the num_to_cutoff indicates the number of hottest
+ counters (blocks) that compose most of the execution time of
+ the program. A large value would indicate a large flat execution
+ profile where icache misses may be a concern. */
+ size_threshold = PARAM_VALUE (PARAM_UNROLLPEEL_CODESIZE_THRESHOLD);
+ if (!profile_info
+ || profile_info->num_to_cutoff <= size_threshold)
+ return NO_LIMIT;
+ /* Next, exclude some loops where unrolling/peeling may be more
+ important to overall performance. */
+
/* Ignore FP loops, which are more likely to benefit heavily from
unrolling. */
if (desc->has_fp)
return NO_LIMIT;
- size_threshold = PARAM_VALUE (PARAM_UNROLLPEEL_CODESIZE_THRESHOLD);
- if (cgraph_codesize_estimate <= (int)size_threshold)
- return NO_LIMIT;
+ gcc_assert(profile_info->sum_all > 0);
+ /* Next, set the value of the codesize-based unroll factor divisor which in
+ most loops will need to be set to a value that will reduce or eliminate
+ unrolling/peeling. */
+ if (profile_info->num_to_cutoff < size_threshold*2) {
+    /* For applications that are less than twice the codesize limit, allow
+ limited unrolling for very hot loops. */
+ sum_to_header_ratio = profile_info->sum_all / loop->header->count;
+ hotness_ratio_threshold = PARAM_VALUE (PARAM_UNROLLPEEL_HOTNESS_THRESHOLD);
+ /* When the profile count sum to loop entry header ratio is smaller than
+       the threshold (i.e. the loop entry is hot enough), the divisor is left
+ at its default of 1 so the unroll/peel factor is not reduced. When it
+ is bigger than the ratio, increase the divisor by the amount this ratio
+ is over the threshold, which will quickly reduce the unroll/peel
+ factor to zero as the loop's hotness reduces. */
+ if (sum_to_header_ratio > hotness_ratio_threshold) {
+ *codesize_divisor = sum_to_header_ratio / hotness_ratio_threshold;
+ gcc_assert (*codesize_divisor >= 1);
+ }
+ }
+ else
+    /* For applications that are at least twice the codesize limit, set
+ the divisor to a large value that will force the unroll factor to 0. */
+ *codesize_divisor = INT_MAX;
- if (flag_ripa_peel_size_limit)
+ if (flag_peel_codesize_limit)
result_int |= LIMIT_PEEL;
- if (flag_ripa_unroll_size_limit)
+ if (flag_unroll_codesize_limit)
result_int |= LIMIT_UNROLL;
result = (limit_type)result_int;
@@ -443,6 +483,7 @@ decide_unrolling_and_peeling (int flags)
loop_iterator li;
location_t locus;
limit_type limit;
+ int codesize_divisor = 1;
/* Scan the loops, inner ones first. */
FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
@@ -484,22 +525,23 @@ decide_unrolling_and_peeling (int flags)
loop->av_ninsns = average_num_loop_insns (loop);
/* Determine whether to limit code size growth from unrolling and
- peeling. This is currently enabled only under LIPO (dynamic IPA)
- where we have a partial call graph. It is not applied to loops
+ peeling, using FDO profile summary information that gives an
+ estimated number of executed blocks. It is not applied to loops
with constant trip counts, as it is easier to determine the
profitability of unrolling and peeling such loops. */
- limit = limit_code_size(loop);
+ limit = limit_code_size(loop, &codesize_divisor);
if (limit != NO_LIMIT)
{
if (dump_file)
{
fprintf (dump_file, ";; Due to large code size footprint estimate, limit ");
if (limit == (LIMIT_UNROLL|LIMIT_PEEL))
- fprintf (dump_file, "unrolling and peeling\n");
+ fprintf (dump_file, "unrolling and peeling");
else if (limit == LIMIT_UNROLL)
- fprintf (dump_file, "unrolling\n");
+ fprintf (dump_file, "unrolling");
else
- fprintf (dump_file, "peeling\n");
+ fprintf (dump_file, "peeling");
+ fprintf (dump_file, " max insns by divisor %d\n", codesize_divisor);
}
}
@@ -507,15 +549,15 @@ decide_unrolling_and_peeling (int flags)
priority. */
decide_unroll_constant_iterations (loop, flags);
- if (loop->lpt_decision.decision == LPT_NONE
- && !(limit & LIMIT_UNROLL))
- decide_unroll_runtime_iterations (loop, flags);
- if (loop->lpt_decision.decision == LPT_NONE
- && !(limit & LIMIT_UNROLL))
- decide_unroll_stupid (loop, flags);
- if (loop->lpt_decision.decision == LPT_NONE
- && !(limit & LIMIT_PEEL))
- decide_peel_simple (loop, flags);
+ if (loop->lpt_decision.decision == LPT_NONE)
+ decide_unroll_runtime_iterations (loop, flags, (limit & LIMIT_UNROLL)
+ ? codesize_divisor : 1);
+ if (loop->lpt_decision.decision == LPT_NONE)
+ decide_unroll_stupid (loop, flags, (limit & LIMIT_UNROLL)
+ ? codesize_divisor : 1);
+ if (loop->lpt_decision.decision == LPT_NONE)
+ decide_peel_simple (loop, flags, (limit & LIMIT_PEEL)
+ ? codesize_divisor : 1);
if (flag_opt_info >= OPT_INFO_MIN
&& loop->lpt_decision.decision != LPT_NONE)
@@ -1049,7 +1091,8 @@ unroll_loop_constant_iterations (struct loop *loop
/* Decide whether to unroll LOOP iterating runtime computable number of times
and how much. */
static void
-decide_unroll_runtime_iterations (struct loop *loop, int flags)
+decide_unroll_runtime_iterations (struct loop *loop, int flags,
+ int codesize_divisor)
{
unsigned nunroll, nunroll_by_av, nunroll_branches, i;
struct niter_desc *desc;
@@ -1067,8 +1110,10 @@ static void
/* nunroll = total number of copies of the original loop body in
unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
- nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
- nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
+ nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / codesize_divisor
+ / loop->ninsns;
+ nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS)
+ / codesize_divisor / loop->av_ninsns;
if (nunroll > nunroll_by_av)
nunroll = nunroll_by_av;
if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
@@ -1455,7 +1500,7 @@ unroll_loop_runtime_iterations (struct loop *loop)
/* Decide whether to simply peel LOOP and how much. */
static void
-decide_peel_simple (struct loop *loop, int flags)
+decide_peel_simple (struct loop *loop, int flags, int codesize_divisor)
{
unsigned npeel;
struct niter_desc *desc;
@@ -1470,7 +1515,8 @@ static void
fprintf (dump_file, "\n;; Considering simply peeling loop\n");
/* npeel = number of iterations to peel. */
- npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
+ npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / codesize_divisor
+ / loop->ninsns;
if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
@@ -1608,7 +1654,7 @@ peel_loop_simple (struct loop *loop)
/* Decide whether to unroll LOOP stupidly and how much. */
static void
-decide_unroll_stupid (struct loop *loop, int flags)
+decide_unroll_stupid (struct loop *loop, int flags, int codesize_divisor)
{
unsigned nunroll, nunroll_by_av, i;
struct niter_desc *desc;
@@ -1624,9 +1670,10 @@ static void
/* nunroll = total number of copies of the original loop body in
unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
- nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
- nunroll_by_av
- = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
+ nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / codesize_divisor
+ / loop->ninsns;
+ nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS)
+ / codesize_divisor / loop->av_ninsns;
if (nunroll > nunroll_by_av)
nunroll = nunroll_by_av;
if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
===================================================================
@@ -468,6 +468,9 @@ read_counts_file (const char *da_file_name, unsign
gcov_read_summary (&sum);
for (ix = 0; ix != GCOV_COUNTERS_SUMMABLE; ix++)
{
+ summary.ctrs[ix].sum_cutoff_percent
+ = sum.ctrs[ix].sum_cutoff_percent;
+ summary.ctrs[ix].num_to_cutoff += sum.ctrs[ix].num_to_cutoff;
summary.ctrs[ix].runs += sum.ctrs[ix].runs;
summary.ctrs[ix].sum_all += sum.ctrs[ix].sum_all;
if (summary.ctrs[ix].run_max < sum.ctrs[ix].run_max)
===================================================================
@@ -1139,14 +1139,6 @@ Common Report Var(flag_ripa_no_promote_always_inli
Don't promote always inline static functions assuming they
will be inlined and no copy is needed.
-fripa-peel-size-limit
-Common Report Var(flag_ripa_peel_size_limit) Init(1) Optimization
-Limit non-const non-FP loop peeling under dynamic IPA estimates of large code footprint
-
-fripa-unroll-size-limit
-Common Report Var(flag_ripa_unroll_size_limit) Init(1) Optimization
-Limit non-const non-FP loop unrolling under dynamic IPA estimates of large code footprint
-
fripa-inc-path-sub=
Common Joined RejectNegative Var(lipo_inc_path_pattern)
Substitute substring in include paths with a new string to allow reuse profile data
@@ -1639,6 +1631,10 @@ fpcc-struct-return
Common Report Var(flag_pcc_struct_return,1) Init(DEFAULT_PCC_STRUCT_RETURN)
Return small aggregates in memory, not registers
+fpeel-codesize-limit
+Common Report Var(flag_peel_codesize_limit) Init(1) Optimization
+Limit non-const non-FP loop peeling under profile estimates of large code footprint
+
fpeel-loops
Common Report Var(flag_peel_loops) Optimization
Perform loop peeling
@@ -2216,6 +2212,10 @@ funroll-all-loops
Common Report Var(flag_unroll_all_loops) Optimization
Perform loop unrolling for all loops
+funroll-codesize-limit
+Common Report Var(flag_unroll_codesize_limit) Init(1) Optimization
+Limit non-const non-FP loop unrolling under profile estimates of large code footprint
+
; Nonzero means that loop optimizer may assume that the induction variables
; that control loops do not overflow and that the loops with nontrivial
; exit condition are not infinite
===================================================================
@@ -46,7 +46,6 @@ along with GCC; see the file COPYING3. If not see
#include "except.h"
#include "plugin.h"
#include "regset.h" /* FIXME: For reg_obstack. */
-#include "params.h"
/* Gate: execute, or not, all of the non-trivial optimizations. */
@@ -149,49 +148,6 @@ struct gimple_opt_pass pass_all_early_optimization
}
};
-int cgraph_codesize_estimate = -1;
-
-/* Estimate the code size from the dynamic IPA call graph. */
-static void
-compute_codesize_estimate(void)
-{
- struct cgraph_node *node;
- basic_block bb;
- gimple_stmt_iterator bsi;
- struct function *my_function;
-
- if (!flag_dyn_ipa)
- return;
-
- if (cgraph_codesize_estimate >= 0)
- return;
- cgraph_codesize_estimate = 0;
-
- for (node = cgraph_nodes; node; node = node->next)
- {
- if (node->count == 0)
- continue;
-
- if (!gimple_has_body_p(node->decl))
- continue;
-
- my_function = DECL_STRUCT_FUNCTION (node->decl);
-
- FOR_EACH_BB_FN (bb, my_function)
- {
- if (bb->count < PARAM_VALUE (PARAM_CODESIZE_HOTNESS_THRESHOLD))
- continue;
- for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
- {
- gimple stmt = gsi_stmt (bsi);
- cgraph_codesize_estimate += estimate_num_insns (stmt, &eni_size_weights);
- }
- }
- }
- if (dump_file)
- fprintf (dump_file, "Code size estimate from cgraph: %d\n", cgraph_codesize_estimate);
-}
-
/* Pass: cleanup the CFG just before expanding trees to RTL.
This is just a round of label cleanups and case node grouping
because after the tree optimizers have run such cleanups may
@@ -202,8 +158,6 @@ execute_cleanup_cfg_post_optimizing (void)
{
unsigned int todo = 0;
- /* Estimate the code footprint for hot BBs before we enter RTL */
- compute_codesize_estimate();
if (cleanup_tree_cfg ())
todo |= TODO_update_ssa;
maybe_remove_unreachable_handlers ();
===================================================================
@@ -337,21 +337,23 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
"max-completely-peel-loop-nest-depth",
"The maximum depth of a loop nest we completely peel",
8, 0, 0)
-/* The minimum profile count of basic blocks to look at when estimating
- * the code size footprint of the call graph in a dynamic IPA compile. */
-DEFPARAM(PARAM_CODESIZE_HOTNESS_THRESHOLD,
- "codesize-hotness-threshold",
- "Minimum profile count of basic blocks counted towards dynamic IPA "
- "code size footprint estimate",
- 10000, 0, 0)
/* The maximum code size estimate under which loop unrolling and peeling
- * is allowed in a dynamic IPA compile. This currently applies to loops with
- * non-constant iteration counts and no floating point computations. */
+ * is allowed in a profile feedback compile. This currently applies to loops
+ * with non-constant iteration counts and no floating point computations. */
DEFPARAM(PARAM_UNROLLPEEL_CODESIZE_THRESHOLD,
"unrollpeel-codesize-threshold",
- "Maximum dynamic IPA code size footprint estimate for loop unrolling "
+ "Maximum profile-based code size footprint estimate for loop unrolling "
"and peeling",
- 10000, 0, 0)
+ 15000, 0, 0)
+/* The maximum ratio of total profiled execution counts to loop entry block
+ count that must be exceeded to ignore most code size limits when unrolling
+ and peeling. */
+DEFPARAM(PARAM_UNROLLPEEL_HOTNESS_THRESHOLD,
+ "unrollpeel-hotness-threshold",
+ "Maximum ratio of total profiled execution count to loop entry "
+ "block count under which most codesize limits for unrolling and "
+ "peeling will be ignored",
+ 100, 1, 0)
DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
"min-iter-unroll-with-branches",
===================================================================
@@ -526,8 +526,11 @@ tag_summary (const char *filename ATTRIBUTE_UNUSED
{
printf ("\n");
print_prefix (filename, 0, 0);
- printf ("\t\tcounts=%u, runs=%u",
- summary.ctrs[ix].num, summary.ctrs[ix].runs);
+ printf ("\t\tcounts=%u (cutoff pct*10=%u, num to cutoff=%u), runs=%u",
+ summary.ctrs[ix].num,
+ summary.ctrs[ix].sum_cutoff_percent,
+ summary.ctrs[ix].num_to_cutoff,
+ summary.ctrs[ix].runs);
printf (", sum_all=" HOST_WIDEST_INT_PRINT_DEC,
(HOST_WIDEST_INT)summary.ctrs[ix].sum_all);