From 3aba91fb4d1a0dcfa04b8c8ead1941679cd46ae3 Mon Sep 17 00:00:00 2001
From: Andre Simoes Dias Vieira <andsim01@dsgdps-gold-1.cambridge.arm.com>
Date: Fri, 18 Oct 2019 17:49:58 +0100
Subject: [PATCH 2/3] [PATCH 1/2][vect]PR 88915: Vectorize epilogues when
versioning loops
---
gcc/tree-ssa-loop-niter.c | 6 +-
gcc/tree-ssa-loop-niter.h | 4 +-
gcc/tree-ssa-sccvn.c | 6 +-
gcc/tree-vect-loop-manip.c | 203 ++++++++++++++++++++---
gcc/tree-vect-loop.c | 332 ++++++++++++++++++++++++++++++-------
gcc/tree-vectorizer.c | 25 ++-
gcc/tree-vectorizer.h | 13 +-
7 files changed, 490 insertions(+), 99 deletions(-)
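
Notes (illustrative only, not part of the patch): the reworked
simplify_replace_tree interface threads a caller-supplied context pointer
through the valueize hook, so callers can pass state such as a hash_map
without resorting to globals. The sketch below uses hypothetical names; the
patch's own find_in_mapping and vn_valueize_wrapper follow the same pattern.

    /* Hypothetical callback: look T up in the hash_map passed as DATA.  */
    static tree
    replace_from_map (tree t, void *data)
    {
      hash_map<tree, tree> *map = (hash_map<tree, tree> *) data;
      tree *val = map->get (t);
      return val ? *val : t;
    }

    /* Call site threading MAP through as the context; with OLD and NEW
       passed as NULL_TREE only the valueization is performed.  */
    expr = simplify_replace_tree (expr, NULL_TREE, NULL_TREE,
                                  &replace_from_map, &map);
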
@@ -1935,7 +1935,7 @@ number_of_iterations_cond (class loop *loop,
tree
simplify_replace_tree (tree expr, tree old, tree new_tree,
- tree (*valueize) (tree))
+ tree (*valueize) (tree, void*), void *context)
{
unsigned i, n;
tree ret = NULL_TREE, e, se;
@@ -1951,7 +1951,7 @@ simplify_replace_tree (tree expr, tree old, tree new_tree,
{
if (TREE_CODE (expr) == SSA_NAME)
{
- new_tree = valueize (expr);
+ new_tree = valueize (expr, context);
if (new_tree != expr)
return new_tree;
}
@@ -1967,7 +1967,7 @@ simplify_replace_tree (tree expr, tree old, tree new_tree,
for (i = 0; i < n; i++)
{
e = TREE_OPERAND (expr, i);
- se = simplify_replace_tree (e, old, new_tree, valueize);
+ se = simplify_replace_tree (e, old, new_tree, valueize, context);
if (e == se)
continue;
@@ -53,7 +53,9 @@ extern bool scev_probably_wraps_p (tree, tree, tree, gimple *,
class loop *, bool);
extern void free_numbers_of_iterations_estimates (class loop *);
extern void free_numbers_of_iterations_estimates (function *);
-extern tree simplify_replace_tree (tree, tree, tree, tree (*)(tree) = NULL);
+extern tree simplify_replace_tree (tree, tree,
+ tree, tree (*)(tree, void *) = NULL,
+ void * = NULL);
extern void substitute_in_loop_info (class loop *, tree, tree);
#endif /* GCC_TREE_SSA_LOOP_NITER_H */
@@ -309,6 +309,10 @@ static vn_tables_t valid_info;
/* Valueization hook. Valueize NAME if it is an SSA name, otherwise
just return it. */
tree (*vn_valueize) (tree);
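+
+/* Wrapper around vn_valueize with the callback signature expected by
+   simplify_replace_tree; the context argument is unused.  */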
+tree vn_valueize_wrapper (tree t, void* context ATTRIBUTE_UNUSED)
+{
+ return vn_valueize (t);
+}
/* This represents the top of the VN lattice, which is the universal
@@ -6407,7 +6411,7 @@ process_bb (rpo_elim &avail, basic_block bb,
if (bb->loop_father->nb_iterations)
bb->loop_father->nb_iterations
= simplify_replace_tree (bb->loop_father->nb_iterations,
- NULL_TREE, NULL_TREE, vn_valueize);
+ NULL_TREE, NULL_TREE, &vn_valueize_wrapper);
}
/* Value-number all defs in the basic-block. */
@@ -1726,7 +1726,7 @@ vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
CODE and NITERS are as for vect_update_inits_of_dr. */
-static void
+void
vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
tree_code code)
{
@@ -1736,21 +1736,12 @@ vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
- /* Adjust niters to sizetype and insert stmts on loop preheader edge. */
+ /* Adjust niters to sizetype.  We used to insert the stmts on the loop
+ preheader edge here, but since we might use these niters to update the
+ epilogue's niters and data references, we can't insert them here as this
+ definition might not always dominate its uses. */
if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
- {
- gimple_seq seq;
- edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
- tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
-
- niters = fold_convert (sizetype, niters);
- niters = force_gimple_operand (niters, &seq, false, var);
- if (seq)
- {
- basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
- gcc_assert (!new_bb);
- }
- }
+ niters = fold_convert (sizetype, niters);
FOR_EACH_VEC_ELT (datarefs, i, dr)
{
@@ -2393,7 +2384,22 @@ slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
Note this function peels prolog and epilog only if it's necessary,
as well as guards.
- Returns created epilogue or NULL.
+ This function returns the epilogue loop if a decision was made to vectorize
+ it, otherwise NULL.
+
+ The analysis resulting in this epilogue loop's loop_vec_info was performed
+ in the same vect_analyze_loop call as the main loop's. At that time
+ vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
+ vectorization factors than the main loop. This list is stored in the main
+ loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
+ vectorize the epilogue loop for a lower vectorization factor, the
+ loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
+ updated and linked to the epilogue loop. This is later used to vectorize
+ the epilogue. The reason the loop_vec_info needs updating is that it was
+ constructed based on the original main loop, and the epilogue loop is a
+ copy of this loop, so all links pointing to statements in the original loop
+ need updating. Furthermore, these loop_vec_infos share the
+ data_reference records, which will also need to be updated.
TODO: Guard for prefer_scalar_loop should be emitted along with
versioning conditions if loop versioning is needed. */
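/* Schematic life-cycle (illustrative summary, not part of the patch):
   vect_analyze_loop fills the main loop_vec_info's epilogue_vinfos list with
   accepted loop_vec_infos for smaller vector sizes; vect_do_peeling pops the
   first one that still fits the remaining iterations and attaches it to the
   peeled epilogue via epilog->aux; try_vectorize_loop_1 then re-enters with
   that loop_vec_info already set instead of re-analyzing the epilogue.  */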
@@ -2403,7 +2409,8 @@ class loop *
vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
tree *niters_vector, tree *step_vector,
tree *niters_vector_mult_vf_var, int th,
- bool check_profitability, bool niters_no_overflow)
+ bool check_profitability, bool niters_no_overflow,
+ tree *advance, drs_init_vec &orig_drs_init)
{
edge e, guard_e;
tree type = TREE_TYPE (niters), guard_cond;
@@ -2411,6 +2418,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
profile_probability prob_prolog, prob_vector, prob_epilog;
int estimated_vf;
int prolog_peeling = 0;
+ bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
/* We currently do not support prolog peeling if the target alignment is not
known at compile time. 'vect_gen_prolog_loop_niters' depends on the
target alignment being constant. */
@@ -2464,19 +2472,73 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
int bound_prolog = 0;
if (prolog_peeling)
niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
- &bound_prolog);
+ &bound_prolog);
else
niters_prolog = build_int_cst (type, 0);
+ loop_vec_info epilogue_vinfo = NULL;
+ if (vect_epilogues)
+ {
+ epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
+ loop_vinfo->epilogue_vinfos.ordered_remove (0);
+ }
+
+ tree niters_vector_mult_vf = NULL_TREE;
+ /* Save NITERS before the loop, as it may be changed by the prologue. */
+ tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
+ edge update_e = NULL, skip_e = NULL;
+ unsigned int lowest_vf = constant_lower_bound (vf);
+ /* If we know the number of scalar iterations for the main loop we should
+ check whether after the main loop there are enough iterations left over
+ for the epilogue. */
+ if (vect_epilogues
+ && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && prolog_peeling >= 0
+ && known_eq (vf, lowest_vf))
+ {
+ unsigned HOST_WIDE_INT eiters
+ = (LOOP_VINFO_INT_NITERS (loop_vinfo)
+ - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
+
+ eiters -= prolog_peeling;
+ eiters
+ = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
+
+ unsigned int ratio;
+ while (!(constant_multiple_p (loop_vinfo->vector_size,
+ epilogue_vinfo->vector_size, &ratio)
+ && eiters >= lowest_vf / ratio))
+ {
+ delete epilogue_vinfo;
+ epilogue_vinfo = NULL;
+ if (loop_vinfo->epilogue_vinfos.length () == 0)
+ {
+ vect_epilogues = false;
+ break;
+ }
+ epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
+ loop_vinfo->epilogue_vinfos.ordered_remove (0);
+ }
+ }
/* Prolog loop may be skipped. */
bool skip_prolog = (prolog_peeling != 0);
- /* Skip to epilog if scalar loop may be preferred. It's only needed
- when we peel for epilog loop and when it hasn't been checked with
- loop versioning. */
+ /* Skip this loop to epilog when there are not enough iterations to enter this
+ vectorized loop. If true we should perform runtime checks on the NITERS
+ to check whether we should skip the current vectorized loop. If we know
+ the number of scalar iterations we only add such a check when that number
+ may be smaller than the number of iterations required to enter this loop;
+ for this we use the upper bounds on the prolog and epilog peeling. When we
+ don't know the number of iterations and don't require versioning it is
+ because we have asserted that there are enough scalar iterations to enter
+ the main loop, so this skip is not necessary. When we are versioning then
+ we only add such a skip if we have chosen to vectorize the epilogue. */
bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
bound_prolog + bound_epilog)
- : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
+ : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+ || vect_epilogues));
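/* For instance (illustrative numbers only, not part of the patch): with
   known NITERS = 40, BOUND_PROLOG = 3 and BOUND_EPILOG = 16 the test
   40 < 3 + 16 is false, so the vector loop is always reached and no guard
   is needed; with unknown NITERS the guard is emitted either when the loop
   is not versioned or when an epilogue will be vectorized.  */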
/* Epilog loop must be executed if the number of iterations for epilog
loop is known at compile time, otherwise we need to add a check at
the end of vector loop and skip to the end of epilog loop. */
@@ -2506,6 +2568,12 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
dump_user_location_t loop_loc = find_loop_location (loop);
class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
+ if (vect_epilogues)
+ /* Make sure to set the epilogue's scalar loop, such that we can use the
+ original scalar loop as the remaining epilogue if necessary. */
+ LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
+ = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
+
if (prolog_peeling)
{
e = loop_preheader_edge (loop);
@@ -2552,6 +2620,15 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
scale_loop_profile (prolog, prob_prolog, bound_prolog);
}
+
+ /* Save original inits for each data_reference before advancing them with
+ NITERS_PROLOG. */
+ unsigned int i;
+ struct data_reference *dr;
+ vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
+ FOR_EACH_VEC_ELT (datarefs, i, dr)
+ orig_drs_init.safe_push (std::make_pair (dr, DR_OFFSET (dr)));
+
/* Update init address of DRs. */
vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
/* Update niters for vector loop. */
@@ -2586,8 +2663,15 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
"loop can't be duplicated to exit edge.\n");
gcc_unreachable ();
}
- /* Peel epilog and put it on exit edge of loop. */
- epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
+ /* Peel epilog and put it on exit edge of loop. If we are vectorizing
+ said epilog then we should use a copy of the main loop as a starting
+ point. This loop may have already had some preliminary transformations
+ to allow for more optimal vectorization, for example if-conversion.
+ If we are not vectorizing the epilog then we should use the scalar loop
+ as the transformations mentioned above make little or no sense when not
+ vectorizing. */
+ epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
+ epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
if (!epilog)
{
dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
@@ -2616,6 +2700,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
guard_to, guard_bb,
prob_vector.invert (),
irred_flag);
+ skip_e = guard_e;
e = EDGE_PRED (guard_to, 0);
e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
@@ -2637,7 +2722,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
}
basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
- tree niters_vector_mult_vf;
/* If loop is peeled for non-zero constant times, now niters refers to
orig_niters - prolog_peeling, it won't overflow even the orig_niters
overflows. */
@@ -2660,7 +2744,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
/* Update IVs of original loop as if they were advanced by
niters_vector_mult_vf steps. */
gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
- edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
+ update_e = skip_vector ? e : loop_preheader_edge (epilog);
vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
update_e);
@@ -2701,10 +2785,75 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
adjust_vec_debug_stmts ();
scev_reset ();
}
+
+ if (vect_epilogues)
+ {
+ epilog->aux = epilogue_vinfo;
+ LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
+
+ loop_constraint_clear (epilog, LOOP_C_INFINITE);
+
+ /* We now must calculate the number of iterations performed by the previous
+ loop (stored in NITERS) and the number EPILOGUE_NITERS still to be
+ performed by the epilogue. */
+ tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
+ niters_prolog, niters_vector_mult_vf);
+
+ /* If the previous loop may be skipped (skip_vector), insert a phi-node to
+ determine whether we reach this point from the previous vectorized loop
+ via the update_e edge or from the skip_vector basic block via the
+ skip_e edge. */
+ if (skip_vector)
+ {
+ gcc_assert (update_e != NULL && skip_e != NULL);
+ gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
+ update_e->dest);
+ tree new_ssa = make_ssa_name (TREE_TYPE (niters));
+ gimple *stmt = gimple_build_assign (new_ssa, niters);
+ gimple_stmt_iterator gsi;
+ if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
+ && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
+ {
+ gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
+ gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
+ }
+ else
+ {
+ gsi = gsi_last_bb (update_e->src);
+ gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
+ }
+
+ niters = new_ssa;
+ add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
+ add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
+ UNKNOWN_LOCATION);
+ niters = PHI_RESULT (new_phi);
+ }
+
+ /* Subtract the number of iterations performed by the vectorized loop
+ from the number of total iterations. */
+ tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
+ before_loop_niters,
+ niters);
+
+ LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
+ LOOP_VINFO_NITERSM1 (epilogue_vinfo)
+ = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
+ epilogue_niters,
+ build_one_cst (TREE_TYPE (epilogue_niters)));
+
+ /* Set ADVANCE to the number of iterations performed by the previous
+ loop and its prologue. */
+ *advance = niters;
+
+ /* Redo the peeling-for-niter analysis, as the NITERS and alignment
+ may have been updated to take the main loop into account. */
+ determine_peel_for_niter (epilogue_vinfo);
+ }
+
adjust_vec.release ();
free_original_copy_tables ();
- return epilog;
+ return vect_epilogues ? epilog : NULL;
}
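/* Worked example for the bookkeeping above (illustrative numbers only, not
   part of the patch, assuming the target also offers vectors of half the
   main loop's size): with 110 scalar iterations, prolog peeling of 3 and a
   main-loop VF of 16, the eiters check keeps the half-width epilogue_vinfo
   since (110 - 3) % 16 = 11 >= 16 / 2.  The vector loop then executes
   niters_vector_mult_vf = (107 / 16) * 16 = 96 iterations, so
   *ADVANCE = 3 + 96 = 99 and LOOP_VINFO_NITERS of the epilogue becomes
   110 - 99 = 11, of which a VF-8 epilogue vectorizes 8, leaving 3 for the
   remaining scalar loop.  */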
/* Function vect_create_cond_for_niters_checks.
@@ -885,6 +885,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
}
}
}
+
+ epilogue_vinfos.create (6);
}
/* Free all levels of MASKS. */
@@ -909,6 +911,7 @@ _loop_vec_info::~_loop_vec_info ()
release_vec_loop_masks (&masks);
delete ivexpr_map;
delete scan_map;
+ epilogue_vinfos.release ();
loop->aux = NULL;
}
@@ -1682,9 +1685,20 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
return 0;
}
- HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
- if (estimated_niter == -1)
- estimated_niter = likely_max_stmt_executions_int (loop);
+ HOST_WIDE_INT estimated_niter;
+
+ /* If we are vectorizing an epilogue then we know the maximum number of
+ scalar iterations it will cover is at most the vectorization factor
+ of the main loop minus one. */
+ if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
+ estimated_niter
+ = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
+ else
+ {
+ estimated_niter = estimated_stmt_executions_int (loop);
+ if (estimated_niter == -1)
+ estimated_niter = likely_max_stmt_executions_int (loop);
+ }
if (estimated_niter != -1
&& ((unsigned HOST_WIDE_INT) estimated_niter
< MAX (th, (unsigned) min_profitable_estimate)))
@@ -1871,6 +1885,15 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
int res;
unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
poly_uint64 min_vf = 2;
+ loop_vec_info orig_loop_vinfo = NULL;
+
+ /* If we are dealing with an epilogue then orig_loop_vinfo points to the
+ loop_vec_info of the first vectorized loop. */
+ if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
+ orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
+ else
+ orig_loop_vinfo = loop_vinfo;
+ gcc_assert (orig_loop_vinfo);
/* The first group of checks is independent of the vector size. */
fatal = true;
@@ -2150,8 +2173,18 @@ start_over:
/* During peeling, we need to check if number of loop iterations is
enough for both peeled prolog loop and vector loop. This check
can be merged along with threshold check of loop versioning, so
- increase threshold for this case if necessary. */
- if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
+ increase threshold for this case if necessary.
+
+ If we are analyzing an epilogue we still want to check what its
+ versioning threshold would be. If we decide to vectorize the epilogues we
+ will want to use the lowest versioning threshold of all epilogues and the
+ main loop. This will enable us to enter a vectorized epilogue even when
+ versioning the loop. We can't simply check whether the epilogue requires
+ versioning though since we may have skipped some versioning checks when
+ analyzing the epilogue. For instance, checks for alias versioning will be
+ skipped when dealing with epilogues as we assume we already checked them
+ for the main loop. So instead we always check the 'orig_loop_vinfo'. */
+ if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
{
poly_uint64 niters_th = 0;
unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
@@ -2344,6 +2377,14 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
poly_uint64 autodetected_vector_size = 0;
opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
poly_uint64 next_vector_size = 0;
+ poly_uint64 lowest_th = 0;
+ unsigned vectorized_loops = 0;
+
+ /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is enabled, this
+ is not a simd loop, and it is the innermost loop. */
+ bool vect_epilogues
+ = !loop->simdlen && loop->inner == NULL
+ && PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK);
while (1)
{
/* Check the CFG characteristics of the loop (nesting, entry/exit). */
@@ -2363,6 +2404,8 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
if (orig_loop_vinfo)
LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
+ else if (vect_epilogues && first_loop_vinfo)
+ LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
if (next_size == 0)
@@ -2371,18 +2414,43 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
if (res)
{
LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
+ vectorized_loops++;
- if (loop->simdlen
- && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
- (unsigned HOST_WIDE_INT) loop->simdlen))
+ if ((loop->simdlen
+ && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+ (unsigned HOST_WIDE_INT) loop->simdlen))
+ || vect_epilogues)
{
if (first_loop_vinfo == NULL)
{
first_loop_vinfo = loop_vinfo;
+ lowest_th
+ = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
loop->aux = NULL;
}
else
- delete loop_vinfo;
+ {
+ /* Keep track of vector sizes that we know we can vectorize
+ the epilogue with. Only vectorize the first epilogue. */
+ if (vect_epilogues
+ && first_loop_vinfo->epilogue_vinfos.is_empty ())
+ {
+ loop->aux = NULL;
+ first_loop_vinfo->epilogue_vinfos.reserve (1);
+ first_loop_vinfo->epilogue_vinfos.quick_push (loop_vinfo);
+ LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
+ poly_uint64 th
+ = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
+ gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+ || maybe_ne (lowest_th, 0U));
+ /* Keep track of the known smallest versioning
+ threshold. */
+ if (ordered_p (lowest_th, th))
+ lowest_th = ordered_min (lowest_th, th);
+ }
+ else
+ delete loop_vinfo;
+ }
}
else
{
@@ -2416,6 +2484,8 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
dump_dec (MSG_NOTE, first_loop_vinfo->vector_size);
dump_printf (MSG_NOTE, "\n");
}
+ LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
+
return first_loop_vinfo;
}
else
@@ -7925,6 +7995,186 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
*seen_store = stmt_info;
}
+/* Helper function to pass to simplify_replace_tree, to enable replacing trees
+ that are keys in the hash_map CONTEXT with their corresponding values. */
+
+static tree
+find_in_mapping (tree t, void *context)
+{
+ hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
+
+ tree *value = mapping->get (t);
+ return value ? *value : t;
+}
+
+/* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
+ original loop that has now been vectorized.
+
+ The inits of the data_references need to be advanced with the number of
+ iterations of the main loop. This has been computed in vect_do_peeling and
+ is stored in parameter ADVANCE. We first restore the data_references'
+ initial offsets with the values recorded in ORIG_DRS_INIT.
+
+ Since the loop_vec_info of this EPILOGUE was constructed for the original
+ loop, its stmt_vec_infos all point to the original statements. These need
+ to be updated to point to their corresponding copies, as do the SSA_NAMEs
+ in their PATTERN_DEF_SEQs and RELATED_STMTs.
+
+ The data_references' connections also need to be updated: their
+ corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
+ stmt_vec_infos, and their statements need to point to their corresponding
+ copies. If they are gather loads or scatter stores, their references need
+ to be updated to point to the corresponding copies as well. Finally we set
+ 'base_misaligned' to false, as we have already peeled for alignment in the
+ prologue of the main loop. */
+
+static void
+update_epilogue_loop_vinfo (class loop *epilogue, tree advance,
+ drs_init_vec &orig_drs_init)
+{
+ loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
+ auto_vec<gimple *> stmt_worklist;
+ hash_map<tree,tree> mapping;
+ gimple *orig_stmt, *new_stmt;
+ gimple_stmt_iterator epilogue_gsi;
+ gphi_iterator epilogue_phi_gsi;
+ stmt_vec_info stmt_vinfo = NULL, related_vinfo;
+ basic_block *epilogue_bbs = get_loop_body (epilogue);
+
+ LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
+
+ /* Restore each data_reference's original offset, i.e. the offset it had
+ before the previous loop and its prologue were accounted for. */
+ std::pair<data_reference*, tree> *dr_init;
+ unsigned i;
+ for (i = 0; orig_drs_init.iterate (i, &dr_init); i++)
+ DR_OFFSET (dr_init->first) = dr_init->second;
+
+ /* Advance the data_references by the number of iterations of the previous
+ loop and its prologue. */
+ vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
+
+
+ /* The EPILOGUE loop is a copy of the original loop so they share the same
+ gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
+ point to the copied statements. We also create a mapping from each LHS in
+ the original loop to the corresponding LHS in the EPILOGUE and build
+ worklists to update the STMT_VINFO_PATTERN_DEF_SEQs and
+ STMT_VINFO_RELATED_STMTs. */
+ for (unsigned i = 0; i < epilogue->num_nodes; ++i)
+ {
+ for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
+ !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
+ {
+ new_stmt = epilogue_phi_gsi.phi ();
+
+ gcc_assert (gimple_uid (new_stmt) > 0);
+ stmt_vinfo
+ = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
+
+ orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
+ STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
+
+ mapping.put (gimple_phi_result (orig_stmt),
+ gimple_phi_result (new_stmt));
+ /* PHI nodes cannot have patterns or related statements. */
+ gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
+ && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
+ }
+
+ for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
+ !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
+ {
+ new_stmt = gsi_stmt (epilogue_gsi);
+
+ gcc_assert (gimple_uid (new_stmt) > 0);
+ stmt_vinfo
+ = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
+
+ orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
+ STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
+
+ if (tree old_lhs = gimple_get_lhs (orig_stmt))
+ mapping.put (old_lhs, gimple_get_lhs (new_stmt));
+
+ if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
+ {
+ gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
+ for (gimple_stmt_iterator gsi = gsi_start (seq);
+ !gsi_end_p (gsi); gsi_next (&gsi))
+ stmt_worklist.safe_push (gsi_stmt (gsi));
+ }
+
+ related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
+ if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
+ {
+ gimple *stmt = STMT_VINFO_STMT (related_vinfo);
+ stmt_worklist.safe_push (stmt);
+ /* Set BB such that the assert in
+ 'get_initial_def_for_reduction' is able to determine that
+ the BB of the related stmt is inside this loop. */
+ gimple_set_bb (stmt, gimple_bb (new_stmt));
+ related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
+ gcc_assert (related_vinfo == NULL
+ || related_vinfo == stmt_vinfo);
+ }
+ }
+ }
+
+ /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
+ using the original main loop and thus need to be updated to refer to the
+ cloned variables used in the epilogue. */
+ for (unsigned i = 0; i < stmt_worklist.length (); ++i)
+ {
+ gimple *stmt = stmt_worklist[i];
+ tree *new_op;
+
+ for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
+ {
+ tree op = gimple_op (stmt, j);
+ if ((new_op = mapping.get(op)))
+ gimple_set_op (stmt, j, *new_op);
+ else
+ {
+ op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
+ &find_in_mapping, &mapping);
+ gimple_set_op (stmt, j, op);
+ }
+ }
+ }
+
+ struct data_reference *dr;
+ vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs;
+ FOR_EACH_VEC_ELT (datarefs, i, dr)
+ {
+ orig_stmt = DR_STMT (dr);
+ gcc_assert (gimple_uid (orig_stmt) > 0);
+ stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
+ /* Data references for gather loads and scatter stores do not use the
+ updated offset we set using ADVANCE. Instead we have to make sure the
+ references in the data_references point to the corresponding copies of
+ the originals in the epilogue. */
+ if (STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo))
+ {
+ DR_REF (dr)
+ = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
+ &find_in_mapping, &mapping);
+ DR_BASE_ADDRESS (dr)
+ = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
+ &find_in_mapping, &mapping);
+ }
+ DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
+ stmt_vinfo->dr_aux.stmt = stmt_vinfo;
+ /* The vector size of the epilogue is smaller than that of the main loop,
+ so its alignment requirement is either the same or lower. This means the
+ DR is by definition aligned. */
+ STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
+ }
+
+ epilogue_vinfo->shared->datarefs_copy.release ();
+ epilogue_vinfo->shared->save_datarefs ();
+}
+
/* Function vect_transform_loop.
The analysis phase has determined that the loop is vectorizable.
@@ -7962,11 +8212,11 @@ vect_transform_loop (loop_vec_info loop_vinfo)
if (th >= vect_vf_for_cost (loop_vinfo)
&& !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
{
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "Profitability threshold is %d loop iterations.\n",
- th);
- check_profitability = true;
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Profitability threshold is %d loop iterations.\n",
+ th);
+ check_profitability = true;
}
/* Make sure there exists a single-predecessor exit bb. Do this before
@@ -8010,9 +8260,14 @@ vect_transform_loop (loop_vec_info loop_vinfo)
LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
+ tree advance;
+ drs_init_vec orig_drs_init;
+
epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
&step_vector, &niters_vector_mult_vf, th,
- check_profitability, niters_no_overflow);
+ check_profitability, niters_no_overflow,
+ &advance, orig_drs_init);
+
if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
&& LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
@@ -8271,57 +8526,14 @@ vect_transform_loop (loop_vec_info loop_vinfo)
since vectorized loop can have loop-carried dependencies. */
loop->safelen = 0;
- /* Don't vectorize epilogue for epilogue. */
- if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
- epilogue = NULL;
-
- if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
- epilogue = NULL;
-
if (epilogue)
{
- auto_vector_sizes vector_sizes;
- targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
- unsigned int next_size = 0;
-
- /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
- on niters already ajusted for the iterations of the prologue. */
- if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- && known_eq (vf, lowest_vf))
- {
- unsigned HOST_WIDE_INT eiters
- = (LOOP_VINFO_INT_NITERS (loop_vinfo)
- - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
- eiters
- = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
- epilogue->nb_iterations_upper_bound = eiters - 1;
- epilogue->any_upper_bound = true;
-
- unsigned int ratio;
- while (next_size < vector_sizes.length ()
- && !(constant_multiple_p (loop_vinfo->vector_size,
- vector_sizes[next_size], &ratio)
- && eiters >= lowest_vf / ratio))
- next_size += 1;
- }
- else
- while (next_size < vector_sizes.length ()
- && maybe_lt (loop_vinfo->vector_size, vector_sizes[next_size]))
- next_size += 1;
+ update_epilogue_loop_vinfo (epilogue, advance, orig_drs_init);
- if (next_size == vector_sizes.length ())
- epilogue = NULL;
- }
-
- if (epilogue)
- {
+ epilogue->simduid = loop->simduid;
epilogue->force_vectorize = loop->force_vectorize;
epilogue->safelen = loop->safelen;
epilogue->dont_vectorize = false;
-
- /* We may need to if-convert epilogue to vectorize it. */
- if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
- tree_if_conversion (epilogue);
}
return epilogue;
@@ -874,6 +874,7 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
vec_info_shared shared;
auto_purge_vect_location sentinel;
vect_location = find_loop_location (loop);
+
if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
&& dump_enabled_p ())
dump_printf (MSG_NOTE | MSG_PRIORITY_INTERNALS,
@@ -881,10 +882,17 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
LOCATION_FILE (vect_location.get_location_t ()),
LOCATION_LINE (vect_location.get_location_t ()));
- /* Try to analyze the loop, retaining an opt_problem if dump_enabled_p. */
- opt_loop_vec_info loop_vinfo
- = vect_analyze_loop (loop, orig_loop_vinfo, &shared);
- loop->aux = loop_vinfo;
+ opt_loop_vec_info loop_vinfo = opt_loop_vec_info::success (NULL);
+ /* In the case of epilogue vectorization the loop already has its
+ loop_vec_info set; we do not need to re-analyze the loop in this case. */
+ if (loop_vec_info vinfo = loop_vec_info_for_loop (loop))
+ loop_vinfo = opt_loop_vec_info::success (vinfo);
+ else
+ {
+ /* Try to analyze the loop, retaining an opt_problem if dump_enabled_p. */
+ loop_vinfo = vect_analyze_loop (loop, orig_loop_vinfo, &shared);
+ loop->aux = loop_vinfo;
+ }
if (!loop_vinfo)
if (dump_enabled_p ())
@@ -1012,8 +1020,13 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
/* Epilogue of vectorized loop must be vectorized too. */
if (new_loop)
- ret |= try_vectorize_loop_1 (simduid_to_vf_htab, num_vectorized_loops,
- new_loop, loop_vinfo, NULL, NULL);
+ {
+ /* Don't include vectorized epilogues in the "vectorized loops"
+ count. */
+ unsigned dont_count = *num_vectorized_loops;
+ ret |= try_vectorize_loop_1 (simduid_to_vf_htab, &dont_count,
+ new_loop, loop_vinfo, NULL, NULL);
+ }
return ret;
}
@@ -26,6 +26,7 @@ typedef class _stmt_vec_info *stmt_vec_info;
#include "tree-data-ref.h"
#include "tree-hash-traits.h"
#include "target.h"
+#include <utility>
/* Used for naming of new temporaries. */
enum vect_var_kind {
@@ -456,6 +457,8 @@ struct rgroup_masks {
typedef auto_vec<rgroup_masks> vec_loop_masks;
+typedef auto_vec<std::pair<data_reference*, tree> > drs_init_vec;
+
/*-----------------------------------------------------------------*/
/* Info on vectorized loops. */
/*-----------------------------------------------------------------*/
@@ -639,6 +642,10 @@ public:
this points to the original vectorized loop. Otherwise NULL. */
_loop_vec_info *orig_loop_info;
+ /* Used to store loop_vec_infos of epilogues of this loop during
+ analysis. */
+ vec<_loop_vec_info *> epilogue_vinfos;
+
} *loop_vec_info;
/* Access Functions. */
@@ -1589,10 +1596,12 @@ class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *,
class loop *, edge);
class loop *vect_loop_versioning (loop_vec_info);
extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
- tree *, tree *, tree *, int, bool, bool);
+ tree *, tree *, tree *, int, bool, bool,
+ tree *, drs_init_vec &);
extern void vect_prepare_for_masked_peels (loop_vec_info);
extern dump_user_location_t find_loop_location (class loop *);
extern bool vect_can_advance_ivs_p (loop_vec_info);
+extern void vect_update_inits_of_drs (loop_vec_info, tree, tree_code);
/* In tree-vect-stmts.c. */
extern tree get_vectype_for_scalar_type (vec_info *, tree);
@@ -1700,6 +1709,8 @@ extern tree vect_create_addr_base_for_vector_ref (stmt_vec_info, gimple_seq *,
/* In tree-vect-loop.c. */
extern widest_int vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo);
+/* Used in tree-vect-loop-manip.c. */
+extern void determine_peel_for_niter (loop_vec_info);
/* Used in gimple-loop-interchange.c and tree-parloops.c. */
extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree,
enum tree_code);
--
2.17.1