@@ -72,4 +72,4 @@ int main (void)
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_strided8 && vect_int_mult } } } } */
/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target { ! { vect_strided8 && vect_int_mult } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */
@@ -80,5 +80,5 @@ int main (void)
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_strided8 && vect_int_mult } } } } */
/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target { ! { vect_strided8 && vect_int_mult } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { { vect_strided8 && {! vect_load_lanes } } && vect_int_mult } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { { vect_strided8 && {! vect_load_lanes } } && vect_int_mult } } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { ! { vect_strided8 && vect_int_mult } } } } } */
@@ -3993,7 +3993,9 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
return;
group_lanes++;
}
- /* Only a power-of-two number of lanes matches interleaving. */
+ /* Only a power-of-two number of lanes matches interleaving with N levels.
+ ??? An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
+ at each step. */
if (exact_log2 (group_lanes) == -1)
return;
@@ -4003,28 +4005,17 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
if (!SLP_TREE_CHILDREN (load).is_empty ())
continue;
+ /* We want to pattern-match special cases here and keep those
+ alone. Candidates are splats and load-lane. */
+
/* We need to lower only loads of less than half of the groups
- lanes, including duplicate lanes. */
+ lanes, including duplicate lanes. Note this leaves nodes
+ with a non-1:1 load permutation around instead of canonicalizing
+ those into a load and a permute node. Removing this early
+ check would do such canonicalization. */
if (SLP_TREE_LANES (load) >= group_lanes / 2)
continue;
- /* Lower by reducing the group to half its size using an
- interleaving scheme. For this try to compute whether all
- elements needed for this load are in even or odd elements of
- an even/odd decomposition with N consecutive elements.
- Thus { e, e, o, o, e, e, o, o } woud be an even/odd decomposition
- with N == 2. */
- unsigned even = (1 << ceil_log2 (DR_GROUP_SIZE (first))) - 1;
- unsigned odd = even;
- for (unsigned l : SLP_TREE_LOAD_PERMUTATION (load))
- {
- even &= ~l;
- odd &= l;
- }
- /* Give up when this doesn't match up with an interleaving scheme. */
- if (!even && !odd)
- continue;
-
/* First build (and possibly re-use) a load node for the
unpermuted group. */
vec<stmt_vec_info> stmts;
@@ -4047,66 +4038,105 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
final_perm.quick_push
(std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
- /* Now build an even or odd extraction from the unpermuted load. */
- lane_permutation_t perm;
- perm.create (group_lanes / 2);
- unsigned level;
- if (even
- && ((level = 1 << ctz_hwi (even)), true)
- && group_lanes % (2 * level) == 0)
- {
- /* { 0, 1, ... 4, 5 ..., } */
- unsigned level = 1 << ctz_hwi (even);
- for (unsigned i = 0; i < group_lanes / 2 / level; ++i)
- for (unsigned j = 0; j < level; ++j)
- perm.quick_push (std::make_pair (0, 2 * i * level + j));
- }
- else if (odd)
- {
- /* { ..., 2, 3, ... 6, 7 } */
- unsigned level = 1 << ctz_hwi (odd);
- gcc_assert (group_lanes % (2 * level) == 0);
- for (unsigned i = 0; i < group_lanes / 2 / level; ++i)
- for (unsigned j = 0; j < level; ++j)
- perm.quick_push (std::make_pair (0, (2 * i + 1) * level + j));
- }
- else
- gcc_unreachable ();
-
- /* And update final_perm. */
- for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
+ while (1)
{
- unsigned l = final_perm[i].second;
- unsigned j;
- for (j = 0; j < perm.length (); ++j)
- if (perm[j].second == l)
- {
- final_perm[i].second = j;
- break;
- }
- gcc_assert (j < perm.length ());
- }
+ unsigned group_lanes = SLP_TREE_LANES (l0);
+ if (SLP_TREE_LANES (load) >= group_lanes / 2)
+ break;
- /* And create scalar stmts. */
- vec<stmt_vec_info> perm_stmts;
- perm_stmts.create (perm.length ());
- for (unsigned i = 0; i < perm.length (); ++i)
- perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);
-
- slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
- SLP_TREE_CHILDREN (p).quick_push (l0);
- SLP_TREE_LANE_PERMUTATION (p) = perm;
- SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
- SLP_TREE_LANES (p) = perm.length ();
- SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
- /* ??? As we have scalar stmts for this intermediate permute we
- could CSE it via bst_map but we do not want to pick up
- another SLP node with a load permutation. We instead should
- have a "local" CSE map here. */
- SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
- /* ??? We need to iterate if group_lanes / 2 is still too large. */
- /* ??? Ideally pick the best even/odd scheme usable for
- most of the loads. -> do a multi-step scheme? */
+ /* Try to lower by reducing the group to half its size using an
+ interleaving scheme. For this try to compute whether all
+ elements needed for this load are in even or odd elements of
+ an even/odd decomposition with N consecutive elements.
+      Thus { e, e, o, o, e, e, o, o } would be an even/odd decomposition
+ with N == 2. */
+   /* ??? Only an even number of lanes can be handled this way, but the
+ fallback below could work for any number. */
+ gcc_assert ((group_lanes & 1) == 0);
+ unsigned even = (1 << ceil_log2 (group_lanes)) - 1;
+ unsigned odd = even;
+ for (auto l : final_perm)
+ {
+ even &= ~l.second;
+ odd &= l.second;
+ }
+
+ /* Now build an even or odd extraction from the unpermuted load. */
+ lane_permutation_t perm;
+ perm.create (group_lanes / 2);
+ unsigned level;
+ if (even
+ && ((level = 1 << ctz_hwi (even)), true)
+ && group_lanes % (2 * level) == 0)
+ {
+ /* { 0, 1, ... 4, 5 ..., } */
+ unsigned level = 1 << ctz_hwi (even);
+ for (unsigned i = 0; i < group_lanes / 2 / level; ++i)
+ for (unsigned j = 0; j < level; ++j)
+ perm.quick_push (std::make_pair (0, 2 * i * level + j));
+ }
+ else if (odd)
+ {
+ /* { ..., 2, 3, ... 6, 7 } */
+ unsigned level = 1 << ctz_hwi (odd);
+ gcc_assert (group_lanes % (2 * level) == 0);
+ for (unsigned i = 0; i < group_lanes / 2 / level; ++i)
+ for (unsigned j = 0; j < level; ++j)
+ perm.quick_push (std::make_pair (0, (2 * i + 1) * level + j));
+ }
+ else
+ {
+ /* As fallback extract all used lanes and fill to half the
+ group size by repeating the last element.
+       ??? This is quite a bad strategy for re-use - we could
+ brute force our way to find more optimal filling lanes to
+ maximize re-use when looking at all loads from the group. */
+ auto_bitmap l;
+ for (auto p : final_perm)
+ bitmap_set_bit (l, p.second);
+ unsigned i = 0;
+ bitmap_iterator bi;
+ EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
+ perm.quick_push (std::make_pair (0, i));
+ while (perm.length () < group_lanes / 2)
+ perm.quick_push (perm.last ());
+ }
+
+ /* Update final_perm with the intermediate permute. */
+ for (unsigned i = 0; i < final_perm.length (); ++i)
+ {
+ unsigned l = final_perm[i].second;
+ unsigned j;
+ for (j = 0; j < perm.length (); ++j)
+ if (perm[j].second == l)
+ {
+ final_perm[i].second = j;
+ break;
+ }
+ gcc_assert (j < perm.length ());
+ }
+
+ /* And create scalar stmts. */
+ vec<stmt_vec_info> perm_stmts;
+ perm_stmts.create (perm.length ());
+ for (unsigned i = 0; i < perm.length (); ++i)
+ perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);
+
+ slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
+ SLP_TREE_CHILDREN (p).quick_push (l0);
+ SLP_TREE_LANE_PERMUTATION (p) = perm;
+ SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
+ SLP_TREE_LANES (p) = perm.length ();
+ SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
+ /* ??? As we have scalar stmts for this intermediate permute we
+ could CSE it via bst_map but we do not want to pick up
+ another SLP node with a load permutation. We instead should
+ have a "local" CSE map here. */
+ SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
+
+ /* We now have a node for group_lanes / 2 lanes. */
+ l0 = p;
+ }
/* And finally from the ordered reduction node create the
permute to shuffle the lanes into the original load-permutation
@@ -4115,7 +4145,7 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
SLP_TREE_LOAD_PERMUTATION (load).release ();
SLP_TREE_LANE_PERMUTATION (load) = final_perm;
SLP_TREE_CHILDREN (load).create (1);
- SLP_TREE_CHILDREN (load).quick_push (p);
+ SLP_TREE_CHILDREN (load).quick_push (l0);
}
}
@@ -4315,7 +4345,18 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
like those requiring three vector inputs, lower them using interleaving
like schemes. */
if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
- vect_lower_load_permutations (loop_vinfo, bst_map);
+ {
+ vect_lower_load_permutations (loop_vinfo, bst_map);
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "SLP graph after lowering permutations:\n");
+ hash_set<slp_tree> visited;
+ FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
+ vect_print_slp_graph (MSG_NOTE, vect_location,
+ SLP_INSTANCE_TREE (instance), visited);
+ }
+ }
hash_set<slp_tree> visited_patterns;
slp_tree_to_load_perm_map_t perm_cache;