@@ -81,9 +81,8 @@ int main (int argc, const char* argv[])
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_int && {! vect_load_lanes } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
-/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_int || vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump "can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } } } } */
/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
@@ -55,8 +55,6 @@ int main (int argc, const char* argv[])
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm && {! vect_load_lanes } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
-/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm || vect_load_lanes } } } } */
/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
@@ -68,9 +68,7 @@ int main (int argc, const char* argv[])
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm && {! vect_load_lanes } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
-/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm || vect_load_lanes } } } } */
/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
@@ -115,4 +115,4 @@ int main (int argc, const char* argv[])
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
/* { dg-final { scan-tree-dump-times "gaps requires scalar epilogue loop" 0 "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! { vect_load_lanes && vect_strided5 } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */
@@ -106,5 +106,5 @@ int main (int argc, const char* argv[])
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm3_int } } } */
/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } xfail *-*-* } } } */
-/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes xfail vect_perm3_int } } } */
-/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes xfail vect_perm3_int } } } */
+/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
@@ -58,7 +58,5 @@ int main (int argc, const char* argv[])
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { { vect_perm_short || vect32 } || vect_load_lanes } } } } */
/* We don't try permutes with a group size of 3 for variable-length
vectors. */
-/* { dg-final { scan-tree-dump "permutation requires at least three vectors" "vect" { target { vect_perm_short && { ! vect_perm3_short } } xfail vect_variable_length } } } */
-/* { dg-final { scan-tree-dump-not "permutation requires at least three vectors" "vect" { target vect_perm3_short } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { ! { vect_perm3_short || { vect32 || vect_load_lanes } } } } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_short || { vect32 || vect_load_lanes } } } } } */
@@ -2958,82 +2958,6 @@ start_over:
"unsupported SLP instances\n");
goto again;
}
-
- /* Check whether any load in ALL SLP instances is possibly permuted. */
- slp_tree load_node, slp_root;
- unsigned i, x;
- slp_instance instance;
- bool can_use_lanes = true;
- FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
- {
- slp_root = SLP_INSTANCE_TREE (instance);
- int group_size = SLP_TREE_LANES (slp_root);
- tree vectype = SLP_TREE_VECTYPE (slp_root);
- bool loads_permuted = false;
- FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
- {
- if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
- continue;
- unsigned j;
- stmt_vec_info load_info;
- FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
- if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
- {
- loads_permuted = true;
- break;
- }
- }
-
- /* If the loads and stores can be handled with load/store-lane
- instructions record it and move on to the next instance. */
- if (loads_permuted
- && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
- && vect_store_lanes_supported (vectype, group_size, false)
- != IFN_LAST)
- {
- FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
- if (STMT_VINFO_GROUPED_ACCESS
- (SLP_TREE_REPRESENTATIVE (load_node)))
- {
- stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
- (SLP_TREE_REPRESENTATIVE (load_node));
- /* Use SLP for strided accesses (or if we can't
- load-lanes). */
- if (STMT_VINFO_STRIDED_P (stmt_vinfo)
- || vect_load_lanes_supported
- (STMT_VINFO_VECTYPE (stmt_vinfo),
- DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
- break;
- }
-
- can_use_lanes
- = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
-
- if (can_use_lanes && dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "SLP instance %p can use load/store-lanes\n",
- (void *) instance);
- }
- else
- {
- can_use_lanes = false;
- break;
- }
- }
-
- /* If all SLP instances can use load/store-lanes abort SLP and try again
- with SLP disabled. */
- if (can_use_lanes)
- {
- ok = opt_result::failure_at (vect_location,
- "Built SLP cancelled: can use "
- "load/store-lanes\n");
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "Built SLP cancelled: all SLP instances support "
- "load/store-lanes\n");
- goto again;
- }
}
/* Dissolve SLP-only groups. */
@@ -3484,7 +3484,8 @@ static bool
vect_analyze_slp_instance (vec_info *vinfo,
scalar_stmts_to_slp_tree_map_t *bst_map,
stmt_vec_info stmt_info, slp_instance_kind kind,
- unsigned max_tree_size, unsigned *limit);
+ unsigned max_tree_size, unsigned *limit,
+ bool force_single_lane = false);
/* Build an interleaving scheme for the store sources RHS_NODES from
SCALAR_STMTS. */
@@ -3679,7 +3680,8 @@ vect_build_slp_instance (vec_info *vinfo,
unsigned max_tree_size, unsigned *limit,
scalar_stmts_to_slp_tree_map_t *bst_map,
/* ??? We need stmt_info for group splitting. */
- stmt_vec_info stmt_info_)
+ stmt_vec_info stmt_info_,
+ bool force_single_lane = false)
{
/* If there's no budget left bail out early. */
if (*limit == 0)
@@ -3708,9 +3710,17 @@ vect_build_slp_instance (vec_info *vinfo,
poly_uint64 max_nunits = 1;
unsigned tree_size = 0;
unsigned i;
- slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
- &max_nunits, matches, limit,
- &tree_size, bst_map);
+
+ slp_tree node = NULL;
+ if (force_single_lane)
+ {
+ matches[0] = true;
+ matches[1] = false;
+ }
+ else
+ node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
+ &max_nunits, matches, limit,
+ &tree_size, bst_map);
if (node != NULL)
{
/* Calculate the unrolling factor based on the smallest type. */
@@ -3925,7 +3935,7 @@ vect_build_slp_instance (vec_info *vinfo,
&& compare_step_with_zero (vinfo, stmt_info) > 0
&& vect_slp_prefer_store_lanes_p (vinfo, stmt_info,
group_size, 1));
- if (want_store_lanes)
+ if (want_store_lanes || force_single_lane)
i = 1;
/* A fatal discovery fail doesn't always mean single-lane SLP
@@ -3966,7 +3976,7 @@ vect_build_slp_instance (vec_info *vinfo,
(max_nunits, end - start));
rhs_nodes.safe_push (node);
start = end;
- if (want_store_lanes)
+ if (want_store_lanes || force_single_lane)
end = start + 1;
else
end = group_size;
@@ -4094,7 +4104,8 @@ vect_analyze_slp_instance (vec_info *vinfo,
scalar_stmts_to_slp_tree_map_t *bst_map,
stmt_vec_info stmt_info,
slp_instance_kind kind,
- unsigned max_tree_size, unsigned *limit)
+ unsigned max_tree_size, unsigned *limit,
+ bool force_single_lane)
{
vec<stmt_vec_info> scalar_stmts;
@@ -4139,7 +4150,7 @@ vect_analyze_slp_instance (vec_info *vinfo,
roots, remain,
max_tree_size, limit, bst_map,
kind == slp_inst_kind_store
- ? stmt_info : NULL);
+ ? stmt_info : NULL, force_single_lane);
/* ??? If this is slp_inst_kind_store and the above succeeded here's
where we should do store group splitting. */
@@ -4670,6 +4681,94 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
}
}
+ /* Check whether we should force some SLP instances to use load/store-lanes
+ and do so by forcing SLP re-discovery with single lanes. We used
+ to cancel SLP when this applied to all instances in a loop but now
+ we decide this per SLP instance. It's important to do this only
+ after SLP pattern recognition. */
+ if (is_a <loop_vec_info> (vinfo))
+ FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
+ if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
+ && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
+ {
+ slp_tree slp_root = SLP_INSTANCE_TREE (instance);
+ int group_size = SLP_TREE_LANES (slp_root);
+ tree vectype = SLP_TREE_VECTYPE (slp_root);
+
+ auto_vec<slp_tree> loads;
+ hash_set<slp_tree> visited;
+ vect_gather_slp_loads (loads, slp_root, visited);
+
+ /* Check whether any load in the SLP instance is possibly
+ permuted. */
+ bool loads_permuted = false;
+ slp_tree load_node;
+ unsigned j;
+ FOR_EACH_VEC_ELT (loads, j, load_node)
+ {
+ if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
+ continue;
+ unsigned k;
+ stmt_vec_info load_info;
+ FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
+ if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
+ {
+ loads_permuted = true;
+ break;
+ }
+ }
+
+ /* If the loads and stores can use load/store-lanes force re-discovery
+ with single lanes. */
+ if (loads_permuted
+ && !slp_root->ldst_lanes
+ && vect_store_lanes_supported (vectype, group_size, false)
+ != IFN_LAST)
+ {
+ bool can_use_lanes = true;
+ FOR_EACH_VEC_ELT (loads, j, load_node)
+ if (STMT_VINFO_GROUPED_ACCESS
+ (SLP_TREE_REPRESENTATIVE (load_node)))
+ {
+ stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
+ (SLP_TREE_REPRESENTATIVE (load_node));
+ /* Use SLP for strided accesses, for accesses whose step
+ is not positive, or if we can't load-lanes. */
+ if (STMT_VINFO_STRIDED_P (stmt_vinfo)
+ || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
+ || vect_load_lanes_supported
+ (STMT_VINFO_VECTYPE (stmt_vinfo),
+ DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
+ {
+ can_use_lanes = false;
+ break;
+ }
+ }
+
+ if (can_use_lanes)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "SLP instance %p can use load/store-lanes,"
+ " re-discovering with single-lanes\n",
+ (void *) instance);
+
+ stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
+
+ vect_free_slp_instance (instance);
+ limit = max_tree_size;
+ bool res = vect_analyze_slp_instance (vinfo, bst_map,
+ stmt_info,
+ slp_inst_kind_store,
+ max_tree_size, &limit,
+ true);
+ gcc_assert (res);
+ auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
+ LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
+ }
+ }
+ }
+
/* When we end up with load permutations that we cannot possibly handle,
like those requiring three vector inputs, lower them using interleaving
like schemes. */