===================================================================
@@ -8842,6 +8842,12 @@ The maximum number of insns of an unswitched loop.
@item max-unswitch-level
The maximum number of branches unswitched in a single loop.
+@item min-iter-unroll-with-branches
+Minimum iteration count to ignore branch effects when unrolling.
+
+@item unroll-outer-loop-branch-budget
+Maximum number of branches allowed in hot outer loop region after unroll.
+
@item lim-expensive
The minimum cost of an expensive expression in the loop invariant motion.
===================================================================
@@ -152,6 +152,99 @@ static void combine_var_copies_in_loop_exit (struc
basic_block);
static rtx get_expansion (struct var_to_expand *);
+/* Compute the maximum number of times LOOP can be unrolled without exceeding
+ a branch budget, which can increase branch mispredictions. The number of
+ branches is computed by weighting each branch with its expected execution
+ probability through the loop based on profile data. If no profile feedback
+ data exists, simply return the current NUNROLL factor. */
+
+static unsigned
+max_unroll_with_branches(struct loop *loop, unsigned nunroll)
+{
+ struct loop *outer;
+ struct loop_desc *outer_desc = 0;
+ int outer_niters = 1;
+ int frequent_iteration_threshold;
+ unsigned branch_budget;
+ struct loop_desc *desc = get_simple_loop_desc (loop);
+
+ /* Ignore loops with FP computation as these tend to benefit much more
+ consistently from unrolling. */
+ if (desc->has_fp)
+ return nunroll;
+
+ frequent_iteration_threshold = PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES);
+ if (expected_loop_iterations (loop) >= (unsigned) frequent_iteration_threshold)
+ return nunroll;
+
+ /* If there was no profile feedback data, av_num_branches will be 0
+ and we won't limit unrolling. If the av_num_branches is at most 1,
+ also don't limit unrolling as the back-edge branch will not be duplicated. */
+ if (desc->av_num_branches <= 1)
+ return nunroll;
+
+ /* Walk up the loop tree until we find a hot outer loop in which the current
+ loop is nested. At that point we will compute the number of times the
+ current loop can be unrolled based on the number of branches in the hot
+ outer loop. */
+ outer = loop_outer (loop);
+ /* The loop structure contains a fake outermost loop, so this should always
+ be non-NULL for our current loop. */
+ gcc_assert (outer);
+
+ /* Walk up the loop tree until we either find a hot outer loop or hit the
+ fake outermost loop at the root. */
+ while (true)
+ {
+ outer_desc = get_simple_loop_desc (outer);
+
+ /* Stop if we hit the fake outermost loop at the root of the tree,
+ which includes the whole procedure. */
+ if (!loop_outer (outer))
+ break;
+
+ if (outer_desc->const_iter)
+ outer_niters *= outer_desc->niter;
+ else if (outer->header->count)
+ outer_niters *= expected_loop_iterations (outer);
+
+ /* If the outer loop has enough iterations to be considered hot, then
+ we can stop our upwards loop tree traversal and examine the current
+ outer loop. */
+ if (outer_niters >= frequent_iteration_threshold)
+ break;
+
+ outer = loop_outer (outer);
+ }
+
+ gcc_assert(outer);
+
+ /* Assume that any call will cause the branch budget to be exceeded,
+ and that we can't unroll the current loop without increasing
+ mispredicts. */
+ if (outer_desc->has_call)
+ return 0;
+
+ /* Otherwise, compute the maximum number of times current loop can be
+ unrolled without exceeding our branch budget. First we subtract
+ off the outer loop's average branch count from the budget. Note
+ that this includes the branches in the current loop. This yields
+ the number of branches left in the budget for the unrolled copies.
+ We divide this by the number of branches in the current loop that
+ must be duplicated when we unroll, which is the total average
+ number of branches minus the back-edge branch. This yields the
+ number of new loop body copies that can be created by unrolling
+ without exceeding the budget, to which we add 1 to get the unroll
+ factor. Note that the "outermost loop" may be the whole procedure
+ if we did not find a hot enough enclosing loop. */
+ branch_budget = PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET);
+ if (outer_desc->av_num_branches > branch_budget)
+ return 0;
+ /* We already returned early if desc->av_num_branches <= 1. */
+ return (branch_budget - outer_desc->av_num_branches)
+ / (desc->av_num_branches - 1) + 1;
+}
+
/* Unroll and/or peel (depending on FLAGS) LOOPS. */
void
unroll_and_peel_loops (int flags)
@@ -211,7 +304,7 @@ unroll_and_peel_loops (int flags)
static bool
loop_exit_at_end_p (struct loop *loop)
{
- struct niter_desc *desc = get_simple_loop_desc (loop);
+ struct loop_desc *desc = get_simple_loop_desc (loop);
rtx insn;
if (desc->in_edge->dest != loop->latch)
@@ -321,7 +414,7 @@ decide_unrolling_and_peeling (int flags)
static void
decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
{
- struct niter_desc *desc;
+ struct loop_desc *desc;
if (dump_file)
fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
@@ -361,7 +454,7 @@ static void
decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
{
unsigned npeel;
- struct niter_desc *desc;
+ struct loop_desc *desc;
if (dump_file)
fprintf (dump_file, "\n;; Considering peeling completely\n");
@@ -459,7 +552,7 @@ peel_loop_completely (struct loop *loop)
unsigned i;
VEC (edge, heap) *remove_edges;
edge ein;
- struct niter_desc *desc = get_simple_loop_desc (loop);
+ struct loop_desc *desc = get_simple_loop_desc (loop);
struct opt_info *opt_info = NULL;
npeel = desc->niter;
@@ -522,7 +615,8 @@ static void
decide_unroll_constant_iterations (struct loop *loop, int flags)
{
unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
- struct niter_desc *desc;
+ unsigned nunroll_branches;
+ struct loop_desc *desc;
if (!(flags & UAP_UNROLL))
{
@@ -565,6 +659,21 @@ decide_unroll_constant_iterations (struct loop *lo
return;
}
+ /* Be careful when unrolling loops with branches inside -- it can increase
+ the number of mispredicts. */
+ if (desc->num_branches > 1)
+ {
+ nunroll_branches = max_unroll_with_branches (loop, nunroll);
+ if (nunroll > nunroll_branches)
+ nunroll = nunroll_branches;
+ if (nunroll <= 1)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; Not unrolling, contains branches\n");
+ return;
+ }
+ }
+
/* Check whether the loop rolls enough to consider. */
if (desc->niter < 2 * nunroll)
{
@@ -644,7 +753,7 @@ unroll_loop_constant_iterations (struct loop *loop
VEC (edge, heap) *remove_edges;
edge e;
unsigned max_unroll = loop->lpt_decision.times;
- struct niter_desc *desc = get_simple_loop_desc (loop);
+ struct loop_desc *desc = get_simple_loop_desc (loop);
bool exit_at_end = loop_exit_at_end_p (loop);
struct opt_info *opt_info = NULL;
bool ok;
@@ -802,8 +911,8 @@ unroll_loop_constant_iterations (struct loop *loop
static void
decide_unroll_runtime_iterations (struct loop *loop, int flags)
{
- unsigned nunroll, nunroll_by_av, i;
- struct niter_desc *desc;
+ unsigned nunroll, nunroll_by_av, nunroll_branches, i;
+ struct loop_desc *desc;
if (!(flags & UAP_UNROLL))
{
@@ -856,6 +965,21 @@ decide_unroll_runtime_iterations (struct loop *loo
return;
}
+ /* Be careful when unrolling loops with branches inside -- it can increase
+ the number of mispredicts. */
+ if (desc->num_branches > 1)
+ {
+ nunroll_branches = max_unroll_with_branches (loop, nunroll);
+ if (nunroll > nunroll_branches)
+ nunroll = nunroll_branches;
+ if (nunroll <= 1)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; Not unrolling, contains branches\n");
+ return;
+ }
+ }
+
/* If we have profile feedback, check whether the loop rolls. */
if ((loop->header->count
&& expected_loop_iterations (loop) < 2 * nunroll)
@@ -973,7 +1097,7 @@ unroll_loop_runtime_iterations (struct loop *loop)
edge e;
bool extra_zero_check, last_may_exit;
unsigned max_unroll = loop->lpt_decision.times;
- struct niter_desc *desc = get_simple_loop_desc (loop);
+ struct loop_desc *desc = get_simple_loop_desc (loop);
bool exit_at_end = loop_exit_at_end_p (loop);
struct opt_info *opt_info = NULL;
bool ok;
@@ -1196,7 +1320,7 @@ static void
decide_peel_simple (struct loop *loop, int flags)
{
unsigned npeel;
- struct niter_desc *desc;
+ struct loop_desc *desc;
if (!(flags & UAP_PEEL))
{
@@ -1233,7 +1357,7 @@ decide_peel_simple (struct loop *loop, int flags)
/* Do not simply peel loops with branches inside -- it increases number
of mispredicts. */
- if (num_loop_branches (loop) > 1)
+ if (desc->num_branches > 1)
{
if (dump_file)
fprintf (dump_file, ";; Not peeling, contains branches\n");
@@ -1295,7 +1419,7 @@ peel_loop_simple (struct loop *loop)
{
sbitmap wont_exit;
unsigned npeel = loop->lpt_decision.times;
- struct niter_desc *desc = get_simple_loop_desc (loop);
+ struct loop_desc *desc = get_simple_loop_desc (loop);
struct opt_info *opt_info = NULL;
bool ok;
@@ -1349,7 +1473,7 @@ static void
decide_unroll_stupid (struct loop *loop, int flags)
{
unsigned nunroll, nunroll_by_av, i;
- struct niter_desc *desc;
+ struct loop_desc *desc;
if (!(flags & UAP_UNROLL_ALL))
{
@@ -1394,7 +1518,7 @@ decide_unroll_stupid (struct loop *loop, int flags
/* Do not unroll loops with branches inside -- it increases number
of mispredicts. */
- if (num_loop_branches (loop) > 1)
+ if (desc->num_branches > 1)
{
if (dump_file)
fprintf (dump_file, ";; Not unrolling, contains branches\n");
@@ -1448,7 +1572,7 @@ unroll_loop_stupid (struct loop *loop)
{
sbitmap wont_exit;
unsigned nunroll = loop->lpt_decision.times;
- struct niter_desc *desc = get_simple_loop_desc (loop);
+ struct loop_desc *desc = get_simple_loop_desc (loop);
struct opt_info *opt_info = NULL;
bool ok;
===================================================================
@@ -259,7 +259,7 @@ doloop_condition_get (rtx doloop_pat)
describes the number of iterations of the loop. */
static bool
-doloop_valid_p (struct loop *loop, struct niter_desc *desc)
+doloop_valid_p (struct loop *loop, struct loop_desc *desc)
{
basic_block *body = get_loop_body (loop), bb;
rtx insn;
@@ -397,7 +397,7 @@ add_test (rtx cond, edge *e, basic_block dest)
DOLOOP_SEQ. COUNT is the number of iterations of the LOOP. */
static void
-doloop_modify (struct loop *loop, struct niter_desc *desc,
+doloop_modify (struct loop *loop, struct loop_desc *desc,
rtx doloop_seq, rtx condition, rtx count)
{
rtx counter_reg;
@@ -605,7 +605,7 @@ doloop_optimize (struct loop *loop)
rtx condition;
unsigned level, est_niter;
int max_cost;
- struct niter_desc *desc;
+ struct loop_desc *desc;
unsigned word_mode_size;
unsigned HOST_WIDE_INT word_mode_max;
@@ -751,4 +751,3 @@ doloop_optimize_loops (void)
#endif
}
#endif /* HAVE_doloop_end */
-
===================================================================
@@ -2022,7 +2022,7 @@ simplify_using_initial_values (struct loop *loop,
static void
shorten_into_mode (struct rtx_iv *iv, enum machine_mode mode,
- enum rtx_code cond, bool signed_p, struct niter_desc *desc)
+ enum rtx_code cond, bool signed_p, struct loop_desc *desc)
{
rtx mmin, mmax, cond_over, cond_under;
@@ -2081,7 +2081,7 @@ shorten_into_mode (struct rtx_iv *iv, enum machine
static bool
canonicalize_iv_subregs (struct rtx_iv *iv0, struct rtx_iv *iv1,
- enum rtx_code cond, struct niter_desc *desc)
+ enum rtx_code cond, struct loop_desc *desc)
{
enum machine_mode comp_mode;
bool signed_p;
@@ -2196,7 +2196,7 @@ canonicalize_iv_subregs (struct rtx_iv *iv0, struc
expression for the number of iterations, before we tried to simplify it. */
static unsigned HOST_WIDEST_INT
-determine_max_iter (struct loop *loop, struct niter_desc *desc, rtx old_niter)
+determine_max_iter (struct loop *loop, struct loop_desc *desc, rtx old_niter)
{
rtx niter = desc->niter_expr;
rtx mmin, mmax, cmp;
@@ -2244,7 +2244,7 @@ static unsigned HOST_WIDEST_INT
static void
iv_number_of_iterations (struct loop *loop, rtx insn, rtx condition,
- struct niter_desc *desc)
+ struct loop_desc *desc)
{
rtx op0, op1, delta, step, bound, may_xform, tmp, tmp0, tmp1;
struct rtx_iv iv0, iv1, tmp_iv;
@@ -2803,7 +2803,7 @@ fail:
into DESC. */
static void
-check_simple_exit (struct loop *loop, edge e, struct niter_desc *desc)
+check_simple_exit (struct loop *loop, edge e, struct loop_desc *desc)
{
basic_block exit_bb;
rtx condition, at;
@@ -2850,12 +2850,12 @@ static void
/* Finds a simple exit of LOOP and stores its description into DESC. */
void
-find_simple_exit (struct loop *loop, struct niter_desc *desc)
+find_simple_exit (struct loop *loop, struct loop_desc *desc)
{
unsigned i;
basic_block *body;
edge e;
- struct niter_desc act;
+ struct loop_desc act;
bool any = false;
edge_iterator ei;
@@ -2937,19 +2937,23 @@ void
/* Creates a simple loop description of LOOP if it was not computed
already. */
-struct niter_desc *
+struct loop_desc *
get_simple_loop_desc (struct loop *loop)
{
- struct niter_desc *desc = simple_loop_desc (loop);
+ struct loop_desc *desc = simple_loop_desc (loop);
if (desc)
return desc;
/* At least desc->infinite is not always initialized by
find_simple_loop_exit. */
- desc = XCNEW (struct niter_desc);
- iv_analysis_loop_init (loop);
- find_simple_exit (loop, desc);
+ desc = XCNEW (struct loop_desc);
+ if (loop->latch != EXIT_BLOCK_PTR)
+ {
+ iv_analysis_loop_init (loop);
+ find_simple_exit (loop, desc);
+ }
+ analyze_loop_insns (loop, desc);
loop->aux = desc;
if (desc->simple_p && (desc->assumptions || desc->infinite))
@@ -2995,7 +2999,7 @@ get_simple_loop_desc (struct loop *loop)
void
free_simple_loop_desc (struct loop *loop)
{
- struct niter_desc *desc = simple_loop_desc (loop);
+ struct loop_desc *desc = simple_loop_desc (loop);
if (!desc)
return;
===================================================================
@@ -1156,24 +1156,98 @@ get_loop_exit_edges (const struct loop *loop)
return edges;
}
-/* Counts the number of conditional branches inside LOOP. */
+/* Determine if INSN is a floating point set. */
-unsigned
-num_loop_branches (const struct loop *loop)
+static bool
+insn_has_fp_set(rtx insn)
{
- unsigned i, n;
- basic_block * body;
+ int i;
+ rtx pat = PATTERN(insn);
+ if (GET_CODE (pat) == SET)
+ return (FLOAT_MODE_P (GET_MODE (SET_DEST (pat))));
+ else if (GET_CODE (pat) == PARALLEL)
+ {
+ for (i = 0; i < XVECLEN (pat, 0); i++)
+ {
+ rtx sub = XVECEXP (pat, 0, i);
+ if (GET_CODE (sub) == SET)
+ return (FLOAT_MODE_P (GET_MODE (SET_DEST (sub))));
+ }
+ }
+ return false;
+}
- gcc_assert (loop->latch != EXIT_BLOCK_PTR);
+/* Analyzes the instructions inside LOOP, updating the DESC. Currently counts
+ the number of conditional branch instructions, calls and fp instructions,
+ as well as the average number of branches executed per iteration. */
+void
+analyze_loop_insns (const struct loop *loop, struct loop_desc *desc)
+{
+ unsigned i, nbranch;
+ gcov_type weighted_nbranch;
+ bool has_call, has_fp;
+ basic_block * body, bb;
+ rtx insn;
+ gcov_type header_count = loop->header->count;
+
+ nbranch = weighted_nbranch = 0;
+ has_call = has_fp = false;
+
body = get_loop_body (loop);
- n = 0;
for (i = 0; i < loop->num_nodes; i++)
- if (EDGE_COUNT (body[i]->succs) >= 2)
- n++;
+ {
+ bb = body[i];
+
+ if (EDGE_COUNT (bb->succs) >= 2)
+ {
+ nbranch++;
+
+ /* If this block is executed less frequently than the header (loop
+ entry), then it is weighted based on its execution count, which
+ will be turned into a ratio compared to the loop header below. */
+ if (bb->count < header_count)
+ weighted_nbranch += bb->count;
+
+ /* When it is executed more frequently than the header (i.e. it is
+ in a nested inner loop), simply weight the branch the same as the
+ header execution count, so that it will contribute 1 branch to
+ the ratio computed below. */
+ else
+ weighted_nbranch += header_count;
+ }
+
+ /* No need to iterate through the instructions below if
+ both flags have already been set. */
+ if (has_call && has_fp)
+ continue;
+
+ FOR_BB_INSNS (bb, insn)
+ {
+ if (!INSN_P (insn))
+ continue;
+
+ if (!has_call)
+ has_call = CALL_P (insn);
+
+ if (!has_fp)
+ has_fp = insn_has_fp_set (insn);
+ }
+ }
free (body);
- return n;
+ desc->num_branches = nbranch;
+ /* Now divide the weights computed above by the loop header execution count,
+ to compute the average number of branches through the loop. By adding
+ header_count/2 to the numerator we round to nearest with integer
+ division. */
+ if (header_count != 0)
+ desc->av_num_branches
+ = (weighted_nbranch + header_count/2) / header_count;
+ else
+ desc->av_num_branches = 0;
+ desc->has_call = has_call;
+ desc->has_fp = has_fp;
}
/* Adds basic block BB to LOOP. */
===================================================================
@@ -249,7 +249,6 @@ extern basic_block *get_loop_body_in_custom_order
extern VEC (edge, heap) *get_loop_exit_edges (const struct loop *);
edge single_exit (const struct loop *);
-extern unsigned num_loop_branches (const struct loop *);
extern edge loop_preheader_edge (const struct loop *);
extern edge loop_latch_edge (const struct loop *);
@@ -359,9 +358,10 @@ struct rtx_iv
};
/* The description of an exit from the loop and of the number of iterations
- till we take the exit. */
+ till we take the exit. Also includes other information used primarily
+ by the loop unroller. */
-struct niter_desc
+struct loop_desc
{
/* The edge out of the loop. */
edge out_edge;
@@ -400,6 +400,18 @@ struct rtx_iv
/* The number of iterations of the loop. */
rtx niter_expr;
+
+ /* The number of branches in the loop. */
+ unsigned num_branches;
+
+ /* The number of executed branches per iteration. */
+ unsigned av_num_branches;
+
+ /* Whether the loop contains a call instruction. */
+ bool has_call;
+
+ /* Whether the loop contains fp instructions. */
+ bool has_fp;
};
extern void iv_analysis_loop_init (struct loop *);
@@ -408,16 +420,17 @@ extern bool iv_analyze_result (rtx, rtx, struct rt
extern bool iv_analyze_expr (rtx, rtx, enum machine_mode, struct rtx_iv *);
extern rtx get_iv_value (struct rtx_iv *, rtx);
extern bool biv_p (rtx, rtx);
-extern void find_simple_exit (struct loop *, struct niter_desc *);
+extern void find_simple_exit (struct loop *, struct loop_desc *);
extern void iv_analysis_done (void);
-extern struct niter_desc *get_simple_loop_desc (struct loop *loop);
+extern struct loop_desc *get_simple_loop_desc (struct loop *loop);
extern void free_simple_loop_desc (struct loop *loop);
+void analyze_loop_insns (const struct loop *, struct loop_desc *desc);
-static inline struct niter_desc *
+static inline struct loop_desc *
simple_loop_desc (struct loop *loop)
{
- return (struct niter_desc *) loop->aux;
+ return (struct loop_desc *) loop->aux;
}
/* Accessors for the loop structures. */
===================================================================
@@ -312,6 +312,15 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
"The maximum depth of a loop nest we completely peel",
8, 0, 0)
+DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
+ "min-iter-unroll-with-branches",
+ "Minimum iteration count to ignore branch effects when unrolling",
+ 50, 0, 0)
+DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET,
+ "unroll-outer-loop-branch-budget",
+ "Maximum number of branches allowed in hot outer loop region after unroll",
+ 25, 0, 0)
+
/* The maximum number of insns of an unswitched loop. */
DEFPARAM(PARAM_MAX_UNSWITCH_INSNS,
"max-unswitch-insns",