@@ -92,4 +92,4 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_align && ilp32 } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail { vect_no_align && ilp32 } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { xfail { vect_no_align && ilp32 } } } } */
@@ -78,4 +78,4 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_align && ilp32 } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail { vect_no_align && ilp32 } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { xfail { vect_no_align && ilp32 } } } } */
@@ -45,6 +45,5 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail { vect_no_int_add || { ! { vect_unpack || vect_strided2 } } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
/* { dg-final { scan-tree-dump-times "different interleaving chains in one node" 1 "vect" { target { ! vect_no_int_add } } } } */
@@ -6504,7 +6504,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
/* 2.3 Create the reduction code, using one of the three schemes described
above. In SLP we simply need to extract all the elements from the
vector (without reducing them), so we use scalar shifts. */
- else if (reduc_fn != IFN_LAST && !slp_reduc)
+ else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
{
tree tmp;
tree vec_elem_type;
@@ -6674,7 +6674,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
reduc_inputs[0] = new_temp;
- if (reduce_with_shift && !slp_reduc)
+ if (reduce_with_shift && (!slp_reduc || group_size == 1))
{
int element_bitsize = tree_to_uhwi (bitsize);
/* Enforced by vectorizable_reduction, which disallows SLP reductions
@@ -1911,7 +1911,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
/* Reduction chain backedge defs are filled manually.
??? Need a better way to identify a SLP reduction chain PHI.
Or a better overall way to SLP match those. */
- if (all_same && def_type == vect_reduction_def)
+ if (stmts.length () > 1
+ && all_same && def_type == vect_reduction_def)
skip_args[loop_latch_edge (loop)->dest_idx] = true;
}
else if (def_type != vect_internal_def)
@@ -3909,9 +3910,10 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
}
/* Find SLP sequences starting from groups of reductions. */
- if (loop_vinfo->reductions.length () > 1)
+ if (loop_vinfo->reductions.length () > 0)
{
- /* Collect reduction statements. */
+ /* Collect reduction statements we can combine into
+ a SLP reduction. */
vec<stmt_vec_info> scalar_stmts;
scalar_stmts.create (loop_vinfo->reductions.length ());
for (auto next_info : loop_vinfo->reductions)
@@ -3924,23 +3926,58 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
reduction path. In that case we'd have to reverse
engineer that conversion stmt following the chain using
reduc_idx and from the PHI using reduc_def. */
- && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
- /* Do not discover SLP reductions for lane-reducing ops, that
- will fail later. */
- && (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT (next_info)))
- || !lane_reducing_op_p (gimple_assign_rhs_code (g))))
- scalar_stmts.quick_push (next_info);
+ && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
+ {
+	      /* Do not discover SLP reductions combining lane-reducing
+		 ops, as that will fail later.  */
+ if (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT (next_info)))
+ || !lane_reducing_op_p (gimple_assign_rhs_code (g)))
+ scalar_stmts.quick_push (next_info);
+ else
+ {
+ /* Do SLP discovery for single-lane reductions. */
+ vec<stmt_vec_info> stmts;
+ vec<stmt_vec_info> roots = vNULL;
+ vec<tree> remain = vNULL;
+ stmts.create (1);
+ stmts.quick_push (next_info);
+ vect_build_slp_instance (vinfo,
+ slp_inst_kind_reduc_group,
+ stmts, roots, remain,
+ max_tree_size, &limit,
+ bst_map, NULL);
+ }
+ }
}
- if (scalar_stmts.length () > 1)
+ /* Save for re-processing on failure. */
+ vec<stmt_vec_info> saved_stmts = scalar_stmts.copy ();
+ vec<stmt_vec_info> roots = vNULL;
+ vec<tree> remain = vNULL;
+ if (scalar_stmts.length () <= 1
+ || !vect_build_slp_instance (loop_vinfo,
+ slp_inst_kind_reduc_group,
+ scalar_stmts, roots, remain,
+ max_tree_size, &limit, bst_map,
+ NULL))
{
- vec<stmt_vec_info> roots = vNULL;
- vec<tree> remain = vNULL;
- vect_build_slp_instance (loop_vinfo, slp_inst_kind_reduc_group,
- scalar_stmts, roots, remain,
- max_tree_size, &limit, bst_map, NULL);
+ if (scalar_stmts.length () <= 1)
+ scalar_stmts.release ();
+ /* Do SLP discovery for single-lane reductions. */
+ for (auto stmt_info : saved_stmts)
+ {
+ vec<stmt_vec_info> stmts;
+ vec<stmt_vec_info> roots = vNULL;
+ vec<tree> remain = vNULL;
+ stmts.create (1);
+ stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
+ vect_build_slp_instance (vinfo,
+ slp_inst_kind_reduc_group,
+ stmts, roots, remain,
+ max_tree_size, &limit,
+ bst_map, NULL);
+ }
+ saved_stmts.release ();
}
- else
- scalar_stmts.release ();
}
}
@@ -2160,6 +2160,23 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
}
overrun_p = true;
}
+
+	  /* If this is single-element interleaving with an element
+	     distance that leaves unused vector loads around, punt - we
+	     at least create very sub-optimal code in that case (and
+	     blow up memory, see PR65518).  */
+ if (loop_vinfo
+ && *memory_access_type == VMAT_CONTIGUOUS
+ && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
+ && single_element_p
+ && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "single-element interleaving not supported "
+ "for not adjacent vector loads\n");
+ return false;
+ }
}
}
else
@@ -8202,7 +8219,9 @@ vectorizable_store (vec_info *vinfo,
gcc_assert (ncopies >= 1);
/* FORNOW. This restriction should be relaxed. */
- if (loop && nested_in_vect_loop_p (loop, stmt_info) && ncopies > 1)
+ if (loop
+ && nested_in_vect_loop_p (loop, stmt_info)
+ && (ncopies > 1 || (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -9945,7 +9964,8 @@ vectorizable_load (vec_info *vinfo,
gcc_assert (ncopies >= 1);
/* FORNOW. This restriction should be relaxed. */
- if (nested_in_vect_loop && ncopies > 1)
+ if (nested_in_vect_loop
+ && (ncopies > 1 || (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,