@@ -3889,6 +3889,287 @@ vect_analyze_slp_instance (vec_info *vinfo,
return res;
}
+/* qsort comparator ordering SLP load nodes. */
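+/* Illustrative ordering: loads of the same group sort with larger lane
+ counts first and, for equal counts, unpermuted or in-order lanes before
+ permuted ones; different groups sort by the UID of their first element. */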
+
+static int
+vllp_cmp (const void *a_, const void *b_)
+{
+ const slp_tree a = *(const slp_tree *)a_;
+ const slp_tree b = *(const slp_tree *)b_;
+ stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
+ stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
+ if (STMT_VINFO_GROUPED_ACCESS (a0)
+ && STMT_VINFO_GROUPED_ACCESS (b0)
+ && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
+ {
+ /* Same group, order by the number of lanes used, more lanes first. */
+ if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
+ return 1;
+ else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
+ return -1;
+ else
+ {
+ /* Try to order loads using the same lanes next to each other,
+ breaking the tie at the first lane that differs. */
+ if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
+ && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
+ return 0;
+ else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
+ && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
+ return 1;
+ else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
+ && SLP_TREE_LOAD_PERMUTATION (b).exists ())
+ return -1;
+ else
+ {
+ for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
+ if (SLP_TREE_LOAD_PERMUTATION (a)[i]
+ != SLP_TREE_LOAD_PERMUTATION (b)[i])
+ {
+ /* An in-order lane sorts first, matching what the case above
+ does for no permutation. */
+ if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
+ return -1;
+ else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
+ return 1;
+ else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
+ < SLP_TREE_LOAD_PERMUTATION (b)[i])
+ return -1;
+ else
+ return 1;
+ }
+ return 0;
+ }
+ }
+ }
+ else /* Different groups or non-groups. */
+ {
+ /* Order groups by their first element to keep them together. */
+ if (STMT_VINFO_GROUPED_ACCESS (a0))
+ a0 = DR_GROUP_FIRST_ELEMENT (a0);
+ if (STMT_VINFO_GROUPED_ACCESS (b0))
+ b0 = DR_GROUP_FIRST_ELEMENT (b0);
+ if (a0 == b0)
+ return 0;
+ /* Otherwise break the tie using the stmt UID. */
+ else if (gimple_uid (STMT_VINFO_STMT (a0))
+ < gimple_uid (STMT_VINFO_STMT (b0)))
+ return -1;
+ else
+ {
+ gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
+ != gimple_uid (STMT_VINFO_STMT (b0)));
+ return 1;
+ }
+ }
+}
+
+/* Process the set of LOADS that are all from the same dataref group. */
+
+static void
+vect_lower_load_permutations (loop_vec_info loop_vinfo,
+ scalar_stmts_to_slp_tree_map_t *bst_map,
+ const array_slice<slp_tree> &loads)
+{
+ /* At this point we want to lower without a fixed VF or vector
+ size in mind, which means we cannot actually compute whether we
+ need three or more vectors for a load permutation yet. So always
+ lower. */
+ stmt_vec_info first
+ = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
+
+ /* ??? In principle we have to consider a gap up to the next full
+ vector, but we'd have to actually represent a scalar stmt for the
+ gap's value, so delay handling this. The same is true for gaps
+ in between loads, which the positions in the load permutation
+ represent. It's probably not worth trying an intermediate packing
+ to vectors without gaps even if that might handle some more cases.
+ Instead get the gap case correct in some way. */
+ unsigned group_lanes = 0;
+ for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
+ {
+ if ((s == first && DR_GROUP_GAP (s) != 0)
+ || (s != first && DR_GROUP_GAP (s) != 1))
+ return;
+ group_lanes++;
+ }
+ /* Only a power-of-two number of lanes matches interleaving. */
+ if (exact_log2 (group_lanes) == -1)
+ return;
+
+ for (slp_tree load : loads)
+ {
+ /* Leave masked or gather loads alone for now. */
+ if (!SLP_TREE_CHILDREN (load).is_empty ())
+ continue;
+
+ /* We only need to lower loads of fewer than half of the group's
+ lanes, duplicate lanes included. */
+ if (SLP_TREE_LANES (load) >= group_lanes / 2)
+ continue;
+
+ /* Lower by reducing the group to half its size using an
+ interleaving scheme. For this try to compute whether all
+ elements needed for this load are in even or odd elements of
+ an even/odd decomposition with N consecutive elements.
+ Thus { e, e, o, o, e, e, o, o } would be an even/odd decomposition
+ with N == 2. */
+ unsigned even = (1 << ceil_log2 (DR_GROUP_SIZE (first))) - 1;
+ unsigned odd = even;
+ for (unsigned l : SLP_TREE_LOAD_PERMUTATION (load))
+ {
+ even &= ~l;
+ odd &= l;
+ }
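+ /* E.g. for a group of 8 lanes and a load permutation { 1, 5 } this
+ computes even == 2 (bit 1 is clear in all used lanes, so the pairs
+ starting at 0 and 4 cover them) and odd == 1 (bit 0 is set in all
+ used lanes). */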
+ /* Give up when this doesn't match up with an interleaving scheme. */
+ if (!even && !odd)
+ continue;
+
+ /* First build (and possibly re-use) a load node for the
+ unpermuted group. */
+ vec<stmt_vec_info> stmts;
+ stmts.create (group_lanes);
+ for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
+ stmts.quick_push (s);
+ poly_uint64 max_nunits;
+ bool *matches = XALLOCAVEC (bool, group_lanes);
+ unsigned limit = 1;
+ unsigned tree_size = 0;
+ slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
+ group_lanes,
+ &max_nunits, matches, &limit,
+ &tree_size, bst_map);
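+ /* Via bst_map this should CSE to a single unpermuted load node
+ shared by all loads of the same group. */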
+
+ /* Build the permute to get the original load permutation order. */
+ lane_permutation_t final_perm;
+ final_perm.create (SLP_TREE_LANES (load));
+ for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
+ final_perm.quick_push
+ (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
+
+ /* Now build an even or odd extraction from the unpermuted load. */
+ lane_permutation_t perm;
+ perm.create (group_lanes / 2);
+ unsigned level;
+ if (even
+ && ((level = 1 << ctz_hwi (even)), true)
+ && group_lanes % (2 * level) == 0)
+ {
+ /* { 0, 1, ..., 4, 5, ... } */
+ for (unsigned i = 0; i < group_lanes / 2 / level; ++i)
+ for (unsigned j = 0; j < level; ++j)
+ perm.quick_push (std::make_pair (0, 2 * i * level + j));
+ }
+ else if (odd)
+ {
+ /* { ..., 2, 3, ..., 6, 7 } */
+ level = 1 << ctz_hwi (odd);
+ gcc_assert (group_lanes % (2 * level) == 0);
+ for (unsigned i = 0; i < group_lanes / 2 / level; ++i)
+ for (unsigned j = 0; j < level; ++j)
+ perm.quick_push (std::make_pair (0, (2 * i + 1) * level + j));
+ }
+ else
+ gcc_unreachable ();
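+ /* For the { 1, 5 } example above the even scheme applies with
+ level == 2 and yields perm = { 0, 1, 4, 5 }. */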
+
+ /* And update final_perm. */
+ for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
+ {
+ unsigned l = final_perm[i].second;
+ unsigned j;
+ for (j = 0; j < perm.length (); ++j)
+ if (perm[j].second == l)
+ {
+ final_perm[i].second = j;
+ break;
+ }
+ gcc_assert (j < perm.length ());
+ }
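+ /* This turns the example's final_perm from lanes { 1, 5 } into
+ positions { 1, 3 } within the extraction { 0, 1, 4, 5 }. */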
+
+ /* And create scalar stmts. */
+ vec<stmt_vec_info> perm_stmts;
+ perm_stmts.create (perm.length ());
+ for (unsigned i = 0; i < perm.length (); ++i)
+ perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);
+
+ slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
+ SLP_TREE_CHILDREN (p).quick_push (l0);
+ SLP_TREE_LANE_PERMUTATION (p) = perm;
+ SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
+ SLP_TREE_LANES (p) = perm.length ();
+ SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
+ /* ??? As we have scalar stmts for this intermediate permute we
+ could CSE it via bst_map but we do not want to pick up
+ another SLP node with a load permutation. We instead should
+ have a "local" CSE map here. */
+ SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
+ /* ??? We need to iterate if group_lanes / 2 is still too large. */
+ /* ??? Ideally pick the best even/odd scheme usable for
+ most of the loads. -> do a multi-step scheme? */
+
+ /* And finally from the group-reducing permute node create the
+ permute to shuffle the lanes into the original load-permutation
+ order. We replace the original load node with this. */
+ SLP_TREE_CODE (load) = VEC_PERM_EXPR;
+ SLP_TREE_LOAD_PERMUTATION (load).release ();
+ SLP_TREE_LANE_PERMUTATION (load) = final_perm;
+ SLP_TREE_CHILDREN (load).create (1);
+ SLP_TREE_CHILDREN (load).quick_push (p);
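+ /* The original load node now computes
+ VEC_PERM <final_perm> (p), p = VEC_PERM <perm> (l0)
+ with l0 the unpermuted load of the whole group. */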
+ }
+}
+
+/* Transform SLP loads in the SLP graph created by SLP discovery:
+ group loads from the same dataref group and lower load permutations
+ that are unlikely to be supported into a series of permutes.
+ In the degenerate case of having only single-lane SLP instances
+ this should result in a series of permute nodes emulating an
+ interleaving scheme. */
+
+static void
+vect_lower_load_permutations (loop_vec_info loop_vinfo,
+ scalar_stmts_to_slp_tree_map_t *bst_map)
+{
+ /* Gather and sort loads across all instances. */
+ hash_set<slp_tree> visited;
+ auto_vec<slp_tree> loads;
+ for (auto inst : loop_vinfo->slp_instances)
+ vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
+ if (loads.is_empty ())
+ return;
+ loads.qsort (vllp_cmp);
+
+ /* Now process each dataref group separately. */
+ unsigned firsti = 0;
+ for (unsigned i = 1; i < loads.length (); ++i)
+ {
+ slp_tree first = loads[firsti];
+ slp_tree next = loads[i];
+ stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
+ stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
+ if (STMT_VINFO_GROUPED_ACCESS (a0)
+ && STMT_VINFO_GROUPED_ACCESS (b0)
+ && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
+ continue;
+ /* Just one SLP load of a possible group, leave those alone. */
+ if (i == firsti + 1)
+ {
+ firsti = i;
+ continue;
+ }
+ /* Now we have multiple SLP loads of the same group from
+ firsti to i - 1. */
+ vect_lower_load_permutations (loop_vinfo, bst_map,
+ make_array_slice (&loads[firsti],
+ i - firsti));
+ firsti = i;
+ }
+ if (firsti < loads.length () - 1)
+ vect_lower_load_permutations (loop_vinfo, bst_map,
+ make_array_slice (&loads[firsti],
+ loads.length () - firsti));
+}
+
/* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
trees of packed scalar stmts if SLP is possible. */
@@ -4030,6 +4311,12 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
}
}
+ /* When we end up with load permutations that we cannot possibly handle,
+ like those requiring three vector inputs, lower them using
+ interleaving-like schemes. */
+ if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
+ vect_lower_load_permutations (loop_vinfo, bst_map);
+
hash_set<slp_tree> visited_patterns;
slp_tree_to_load_perm_map_t perm_cache;
slp_compat_nodes_map_t compat_cache;