@@ -3481,7 +3481,8 @@ static bool
 vect_analyze_slp_instance (vec_info *vinfo,
                            scalar_stmts_to_slp_tree_map_t *bst_map,
                            stmt_vec_info stmt_info, slp_instance_kind kind,
-                           unsigned max_tree_size, unsigned *limit);
+                           unsigned max_tree_size, unsigned *limit,
+                           bool force_single_lane = false);
 
 /* Build an interleaving scheme for the store sources RHS_NODES from
    SCALAR_STMTS.  */
@@ -3676,7 +3677,8 @@ vect_build_slp_instance (vec_info *vinfo,
                          unsigned max_tree_size, unsigned *limit,
                          scalar_stmts_to_slp_tree_map_t *bst_map,
                          /* ???  We need stmt_info for group splitting.  */
-                         stmt_vec_info stmt_info_)
+                         stmt_vec_info stmt_info_,
+                         bool force_single_lane = false)
 {
   /* If there's no budget left, bail out early.  */
   if (*limit == 0)
@@ -3705,9 +3707,20 @@ vect_build_slp_instance (vec_info *vinfo,
   poly_uint64 max_nunits = 1;
   unsigned tree_size = 0;
   unsigned i;
-  slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
-                                       &max_nunits, matches, limit,
-                                       &tree_size, bst_map);
+
+  slp_tree node = NULL;
+  if (force_single_lane)
+    {
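+      /* Pretend that discovery matched only the first lane; this routes
+         us into the group-splitting code below, which splits the store
+         group into single-lane pieces.  */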
+      matches[0] = true;
+      matches[1] = false;
+    }
+  else
+    node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
+                                &max_nunits, matches, limit,
+                                &tree_size, bst_map);
   if (node != NULL)
     {
       /* Calculate the unrolling factor based on the smallest type.  */
@@ -3922,7 +3935,9 @@ vect_build_slp_instance (vec_info *vinfo,
                && compare_step_with_zero (vinfo, stmt_info) > 0
                && vect_slp_prefer_store_lanes_p (vinfo, stmt_info,
                                                  group_size, 1));
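+          /* Both store-lanes and forced single-lane discovery split
+             off one lane at a time.  */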
-          if (want_store_lanes)
+          if (want_store_lanes || force_single_lane)
             i = 1;
 
           if (dump_enabled_p ())
@@ -3958,7 +3973,7 @@ vect_build_slp_instance (vec_info *vinfo,
                                 (max_nunits, end - start));
               rhs_nodes.safe_push (node);
               start = end;
-              if (want_store_lanes)
+              if (want_store_lanes || force_single_lane)
                 end = start + 1;
               else
                 end = group_size;
@@ -4086,7 +4101,8 @@ vect_analyze_slp_instance (vec_info *vinfo,
                            scalar_stmts_to_slp_tree_map_t *bst_map,
                            stmt_vec_info stmt_info,
                            slp_instance_kind kind,
-                           unsigned max_tree_size, unsigned *limit)
+                           unsigned max_tree_size, unsigned *limit,
+                           bool force_single_lane)
 {
   vec<stmt_vec_info> scalar_stmts;
 
@@ -4131,7 +4147,7 @@ vect_analyze_slp_instance (vec_info *vinfo,
                                      roots, remain,
                                      max_tree_size, limit, bst_map,
                                      kind == slp_inst_kind_store
-                                     ? stmt_info : NULL);
+                                     ? stmt_info : NULL, force_single_lane);
 
   /* ???  If this is slp_inst_kind_store and the above succeeded, here's
      where we should do store group splitting.  */
@@ -4662,6 +4678,97 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
       }
     }
 
+  /* Check whether we should force some SLP instances to use load/store-lanes
+     and do so by forcing SLP re-discovery with single lanes.  We used
+     to cancel SLP when this applied to all instances in a loop, but now
+     we decide this per SLP instance.  It's important to do this only
+     after SLP pattern recognition.  */
+  if (is_a <loop_vec_info> (vinfo))
+    FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
+      if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
+          && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
+        {
+          slp_tree slp_root = SLP_INSTANCE_TREE (instance);
+          int group_size = SLP_TREE_LANES (slp_root);
+          tree vectype = SLP_TREE_VECTYPE (slp_root);
+
+          auto_vec<slp_tree> loads;
+          hash_set<slp_tree> visited;
+          vect_gather_slp_loads (loads, slp_root, visited);
+
+          /* Check whether any load in the SLP instance is possibly
+             permuted.  */
+          bool loads_permuted = false;
+          slp_tree load_node;
+          unsigned j;
+          FOR_EACH_VEC_ELT (loads, j, load_node)
+            {
+              if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
+                continue;
+              unsigned k;
+              stmt_vec_info load_info;
+              FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
+                if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
+                  {
+                    loads_permuted = true;
+                    break;
+                  }
+            }
+
+          /* If the loads and stores can use load/store-lanes, force
+             re-discovery with single lanes.  */
+          if (loads_permuted
+              && !slp_root->ldst_lanes
+              && vect_store_lanes_supported (vectype, group_size, false)
+                   != IFN_LAST)
+            {
+              bool can_use_lanes = true;
+              FOR_EACH_VEC_ELT (loads, j, load_node)
+                if (STMT_VINFO_GROUPED_ACCESS
+                      (SLP_TREE_REPRESENTATIVE (load_node)))
+                  {
+                    stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
+                      (SLP_TREE_REPRESENTATIVE (load_node));
+                    /* Use SLP for strided accesses (or if we can't use
+                       load-lanes).  */
+                    if (STMT_VINFO_STRIDED_P (stmt_vinfo)
+                        || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
+                        || vect_load_lanes_supported
+                             (STMT_VINFO_VECTYPE (stmt_vinfo),
+                              DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
+                      {
+                        can_use_lanes = false;
+                        break;
+                      }
+                  }
+
+              if (can_use_lanes)
+                {
+                  if (dump_enabled_p ())
+                    dump_printf_loc (MSG_NOTE, vect_location,
+                                     "SLP instance %p can use load/store-lanes,"
+                                     " re-discovering with single-lanes\n",
+                                     (void *) instance);
+
+                  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
+
+                  vect_free_slp_instance (instance);
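+                  /* Re-discovery starts with a fresh tree-size budget.  */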
+                  limit = max_tree_size;
+                  bool res = vect_analyze_slp_instance (vinfo, bst_map,
+                                                        stmt_info,
+                                                        slp_inst_kind_store,
+                                                        max_tree_size, &limit,
+                                                        true);
+                  gcc_assert (res);
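+                  /* The re-discovered instance was pushed last; move it
+                     into the slot of the instance we just released.  */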
+                  auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
+                  LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
+                }
+            }
+        }
+
   /* When we end up with load permutations that we cannot possibly handle,
      like those requiring three vector inputs, lower them using
      interleaving-like schemes.  */