new file mode 100644
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* Four stores per iteration fed by loads at y[3*i+0..2]; y[3*i+2] is read twice.  */
+void foo (int * __restrict x, int *y)
+{
+ for (int i = 0; i < 1024; ++i)
+ {
+ x[4*i+0] = y[3*i+0]; /* plain copy of the first load */
+ x[4*i+1] = y[3*i+1] * 2; /* second load, scaled */
+ x[4*i+2] = y[3*i+2] + 3; /* third load, offset */
+ x[4*i+3] = y[3*i+2] * 2 - 5; /* third load again, so only three distinct y elements are read */
+ }
+}
+
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target { vect_int && vect_int_mult } } } } */
@@ -3718,7 +3718,8 @@ vect_build_slp_instance (vec_info *vinfo,
with the least number of lanes to one and then repeat until
we end up with two inputs. That scheme makes sure we end
up with permutes satisfying the restriction of requiring at
- most two vector inputs to produce a single vector output. */
+ most two vector inputs to produce a single vector output
+ when the number of lanes is even. */
while (SLP_TREE_CHILDREN (perm).length () > 2)
{
/* Pick the two nodes with the least number of lanes,
@@ -3995,11 +3996,10 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
= DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
/* Only a power-of-two number of lanes matches interleaving with N levels.
- The non-SLP path also supports DR_GROUP_SIZE == 3.
??? An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
at each step. */
unsigned group_lanes = DR_GROUP_SIZE (first);
- if (exact_log2 (group_lanes) == -1)
+ if (exact_log2 (group_lanes) == -1 && group_lanes != 3)
return;
for (slp_tree load : loads)
@@ -4016,7 +4016,7 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
with a non-1:1 load permutation around instead of canonicalizing
those into a load and a permute node. Removing this early
check would do such canonicalization. */
- if (SLP_TREE_LANES (load) >= group_lanes / 2)
+ if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
continue;
/* First build (and possibly re-use) a load node for the
@@ -4052,7 +4052,7 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
while (1)
{
unsigned group_lanes = SLP_TREE_LANES (l0);
- if (SLP_TREE_LANES (load) >= group_lanes / 2)
+ if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
break;
/* Try to lower by reducing the group to half its size using an
@@ -4062,19 +4062,24 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
Thus { e, e, o, o, e, e, o, o } woud be an even/odd decomposition
with N == 2. */
/* ??? Only an even number of lanes can be handed this way, but the
- fallback below could work for any number. */
- gcc_assert ((group_lanes & 1) == 0);
- unsigned even = (1 << ceil_log2 (group_lanes)) - 1;
- unsigned odd = even;
- for (auto l : final_perm)
+ fallback below could work for any number. We have to make sure
+ to round up in that case. */
+ gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
+ unsigned even = 0, odd = 0;
+ if ((group_lanes & 1) == 0)
{
- even &= ~l.second;
- odd &= l.second;
+ even = (1 << ceil_log2 (group_lanes)) - 1;
+ odd = even;
+ for (auto l : final_perm)
+ {
+ even &= ~l.second;
+ odd &= l.second;
+ }
}
/* Now build an even or odd extraction from the unpermuted load. */
lane_permutation_t perm;
- perm.create (group_lanes / 2);
+ perm.create ((group_lanes + 1) / 2);
unsigned level;
if (even
&& ((level = 1 << ctz_hwi (even)), true)
@@ -4109,7 +4114,7 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
bitmap_iterator bi;
EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
perm.quick_push (std::make_pair (0, i));
- while (perm.length () < group_lanes / 2)
+ while (perm.length () < (group_lanes + 1) / 2)
perm.quick_push (perm.last ());
}
@@ -4145,7 +4150,7 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
have a "local" CSE map here. */
SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
- /* We now have a node for group_lanes / 2 lanes. */
+ /* We now have a node for (group_lanes + 1) / 2 lanes. */
l0 = p;
}