@@ -137,4 +137,4 @@ int main (void)
/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { { vect_interleave && vect_extract_even_odd } && { ! vect_pack_trunc } } } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { ! vect_pack_trunc } } } } */
/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { { vect_interleave && vect_extract_even_odd } && vect_pack_trunc } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target vect_pack_trunc xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target vect_pack_trunc } } } */
@@ -131,4 +131,4 @@ int main (void)
/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { { vect_interleave && vect_extract_even_odd } && { ! vect_pack_trunc } } } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { ! vect_pack_trunc } } } } */
/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { { vect_interleave && vect_extract_even_odd } && vect_pack_trunc } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target vect_pack_trunc xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target vect_pack_trunc } } } */
@@ -10194,6 +10194,13 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
unsigned i;
poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
+ /* True if we're permuting a single input of 2N vectors down
+ to N vectors. This case doesn't generalize beyond 2 since
+ VEC_PERM_EXPR only takes 2 inputs. */
+ bool pack_p = false;
+ /* If we're permuting inputs of N vectors each into X*N outputs,
+ this is the value of X, otherwise it is 1. */
+ unsigned int unpack_factor = 1;
tree op_vectype = NULL_TREE;
FOR_EACH_VEC_ELT (children, i, child)
if (SLP_TREE_VECTYPE (child))
@@ -10215,7 +10222,20 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
"Unsupported vector types in lane permutation\n");
return -1;
}
- if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
+ auto op_nunits = TYPE_VECTOR_SUBPARTS (op_vectype);
+ unsigned int this_unpack_factor;
+ /* Check whether the input has twice as many lanes per vector. */
+ if (children.length () == 1
+ && known_eq (SLP_TREE_LANES (child) * nunits,
+ SLP_TREE_LANES (node) * op_nunits * 2))
+ pack_p = true;
+ /* Check whether the output has N times as many lanes per vector. */
+ else if (constant_multiple_p (SLP_TREE_LANES (node) * op_nunits,
+ SLP_TREE_LANES (child) * nunits,
+ &this_unpack_factor)
+ && (i == 0 || unpack_factor == this_unpack_factor))
+ unpack_factor = this_unpack_factor;
+ else
repeating_p = false;
}
@@ -10243,29 +10263,47 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
return 1;
}
- /* REPEATING_P is true if every output vector is guaranteed to use the
- same permute vector. We can handle that case for both variable-length
- and constant-length vectors, but we only handle other cases for
- constant-length vectors.
+ /* Set REPEATING_P to true if the permutations are cylical wrt UNPACK_FACTOR
+ and if we can generate the vectors in a vector-length agnostic way.
+ This requires UNPACK_STEP == NUNITS / UNPACK_FACTOR to be known at
+ compile time.
+
+ The significance of UNPACK_STEP is that, when PACK_P is false,
+ output vector I operates on a window of UNPACK_STEP elements from each
+ input, starting at lane UNPACK_STEP * (I % UNPACK_FACTOR). For example,
+ when UNPACK_FACTOR is 2, the first output vector operates on lanes
+ [0, NUNITS / 2 - 1] of each input vector and the second output vector
+ operates on lanes [NUNITS / 2, NUNITS - 1] of each input vector.
+
+ When REPEATING_P is true, NOUTPUTS holds the total number of outputs
+ that we actually need to generate. */
+ uint64_t noutputs = 0;
+ poly_uint64 unpack_step = 0;
+ loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
+ if (!linfo
+ || !multiple_p (nunits, unpack_factor, &unpack_step)
+ || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo)
+ * SLP_TREE_LANES (node), nunits, &noutputs))
+ repeating_p = false;
+
+ /* We can handle the conditions described for REPEATING_P above for
+ both variable- and constant-length vectors. The fallback requires
+ us to generate every element of every permute vector explicitly,
+ which is only possible for constant-length permute vectors.
Set:
- NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
- mask vector that we want to build.
+ mask vectors that we want to build.
- NCOPIES to the number of copies of PERM that we need in order
- to build the necessary permute mask vectors.
-
- - NOUTPUTS_PER_MASK to the number of output vectors we want to create
- for each permute mask vector. This is only relevant when GSI is
- nonnull. */
+ to build the necessary permute mask vectors. */
uint64_t npatterns;
unsigned nelts_per_pattern;
uint64_t ncopies;
- unsigned noutputs_per_mask;
if (repeating_p)
{
- /* We need a single permute mask vector that has the form:
+ /* We need permute mask vectors that have the form:
{ X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
@@ -10274,7 +10312,6 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
that we use for permutes requires 3n elements. */
npatterns = SLP_TREE_LANES (node);
nelts_per_pattern = ncopies = 3;
- noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
}
else
{
@@ -10282,24 +10319,40 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
instead of relying on the pattern described above. */
if (!nunits.is_constant (&npatterns)
|| !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
+ {
+ if (dump_p)
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "unsupported permutation %p on variable-length"
+ " vectors\n", (void *) node);
return -1;
+ }
nelts_per_pattern = ncopies = 1;
- if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
- if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
+ if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
+ {
+ if (dump_p)
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "unsupported permutation %p for variable VF\n",
+ (void *) node);
return -1;
- noutputs_per_mask = 1;
}
- unsigned olanes = ncopies * SLP_TREE_LANES (node);
+ pack_p = false;
+ unpack_factor = 1;
+ }
+ unsigned olanes = unpack_factor * ncopies * SLP_TREE_LANES (node);
gcc_assert (repeating_p || multiple_p (olanes, nunits));
/* Compute the { { SLP operand, vector index}, lane } permutation sequence
from the { SLP operand, scalar lane } permutation as recorded in the
SLP node as intermediate step. This part should already work
with SLP children with arbitrary number of lanes. */
- auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
- auto_vec<unsigned> active_lane;
+ auto_vec<std::pair<std::pair<unsigned, unsigned>, poly_uint64>> vperm;
+ auto_vec<poly_uint64> active_lane;
vperm.create (olanes);
active_lane.safe_grow_cleared (children.length (), true);
+ for (unsigned int ui = 0; ui < unpack_factor; ++ui)
+ {
+ for (unsigned j = 0; j < children.length (); ++j)
+ active_lane[j] = ui * unpack_step;
for (unsigned i = 0; i < ncopies; ++i)
{
for (unsigned pi = 0; pi < perm.length (); ++pi)
@@ -10307,13 +10360,16 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
std::pair<unsigned, unsigned> p = perm[pi];
tree vtype = SLP_TREE_VECTYPE (children[p.first]);
if (repeating_p)
- vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
+ vperm.quick_push ({{p.first, 0},
+ p.second + active_lane[p.first]});
else
{
/* We checked above that the vectors are constant-length. */
- unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
- unsigned vi = (active_lane[p.first] + p.second) / vnunits;
- unsigned vl = (active_lane[p.first] + p.second) % vnunits;
+ unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype)
+ .to_constant ();
+ unsigned lane = active_lane[p.first].to_constant ();
+ unsigned vi = (lane + p.second) / vnunits;
+ unsigned vl = (lane + p.second) % vnunits;
vperm.quick_push ({{p.first, vi}, vl});
}
}
@@ -10321,6 +10377,7 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
for (unsigned j = 0; j < children.length (); ++j)
active_lane[j] += SLP_TREE_LANES (children[j]);
}
+ }
if (dump_p)
{
@@ -10339,9 +10396,10 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
? multiple_p (i, npatterns)
: multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
dump_printf (MSG_NOTE, ",");
- dump_printf (MSG_NOTE, " vops%u[%u][%u]",
- vperm[i].first.first, vperm[i].first.second,
- vperm[i].second);
+ dump_printf (MSG_NOTE, " vops%u[%u][",
+ vperm[i].first.first, vperm[i].first.second);
+ dump_dec (MSG_NOTE, vperm[i].second);
+ dump_printf (MSG_NOTE, "]");
}
dump_printf (MSG_NOTE, "\n");
}
@@ -10362,16 +10420,37 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
mask.quick_grow (count);
vec_perm_indices indices;
unsigned nperms = 0;
- for (unsigned i = 0; i < vperm.length (); ++i)
- {
- mask_element = vperm[i].second;
- if (first_vec.first == -1U
- || first_vec == vperm[i].first)
- first_vec = vperm[i].first;
+ /* When REPEATING_P is true, we only have UNPACK_FACTOR unique permute
+ vectors to check during analysis, but we need to generate NOUTPUTS
+ vectors during transformation. */
+ unsigned total_nelts = olanes;
+ if (repeating_p && gsi)
+ total_nelts = (total_nelts / unpack_factor) * noutputs;
+ for (unsigned i = 0; i < total_nelts; ++i)
+ {
+ /* VI is the input vector index when generating code for REPEATING_P. */
+ unsigned vi = i / olanes * (pack_p ? 2 : 1);
+ unsigned ei = i % olanes;
+ mask_element = vperm[ei].second;
+ if (pack_p)
+ {
+ /* In this case, we have N outputs and the single child provides 2N
+ inputs. Output X permutes inputs 2X and 2X+1.
+
+ The mask indices are taken directly from the SLP permutation node.
+ Index X selects from the first vector if (X / NUNITS) % 2 == 0;
+ X selects from the second vector otherwise. These conditions
+ are only known at compile time for constant-length vectors. */
+ first_vec = std::make_pair (0, 0);
+ second_vec = std::make_pair (0, 1);
+ }
+ else if (first_vec.first == -1U
+ || first_vec == vperm[ei].first)
+ first_vec = vperm[ei].first;
else if (second_vec.first == -1U
- || second_vec == vperm[i].first)
+ || second_vec == vperm[ei].first)
{
- second_vec = vperm[i].first;
+ second_vec = vperm[ei].first;
mask_element += nunits;
}
else
@@ -10435,18 +10514,13 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
if (!identity_p)
mask_vec = vect_gen_perm_mask_checked (vectype, indices);
- for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
- {
tree first_def
- = vect_get_slp_vect_def (first_node,
- first_vec.second + vi);
+ = vect_get_slp_vect_def (first_node, first_vec.second + vi);
tree second_def
- = vect_get_slp_vect_def (second_node,
- second_vec.second + vi);
+ = vect_get_slp_vect_def (second_node, second_vec.second + vi);
vect_add_slp_permutation (vinfo, gsi, node, first_def,
second_def, mask_vec, mask[0]);
}
- }
index = 0;
first_vec = std::make_pair (-1U, -1U);