diff mbox series

[v2] Support single def-use cycle optimization for SLP reduction vectorization

Message ID 20240614090815.7CCF913AAF@imap1.dmz-prg2.suse.org
State New
Headers show
Series [v2] Support single def-use cycle optimization for SLP reduction vectorization | expand

Commit Message

Richard Biener June 14, 2024, 9:08 a.m. UTC
We can at least mimic single def-use cycle optimization when doing
single-lane SLP reductions and that's required to avoid regressing
compared to non-SLP.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

	* tree-vect-loop.cc (vectorizable_reduction): Allow
	single-def-use cycles with SLP.
	(vect_transform_reduction): Handle SLP single def-use cycles.
	(vect_transform_cycle_phi): Likewise.

	* gcc.dg/vect/slp-reduc-12.c: New testcase.
---
 gcc/testsuite/gcc.dg/vect/slp-reduc-12.c | 18 ++++++++++
 gcc/tree-vect-loop.cc                    | 45 ++++++++++++++----------
 2 files changed, 45 insertions(+), 18 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/slp-reduc-12.c

Comments

Thomas Schwinge June 25, 2024, 11:10 a.m. UTC | #1
Hi!

On 2024-06-14T11:08:15+0200, Richard Biener <rguenther@suse.de> wrote:
> We can at least mimic single def-use cycle optimization when doing
> single-lane SLP reductions and that's required to avoid regressing
> compared to non-SLP.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.
>
> 	* tree-vect-loop.cc (vectorizable_reduction): Allow
> 	single-def-use cycles with SLP.
> 	(vect_transform_reduction): Handle SLP single def-use cycles.
> 	(vect_transform_cycle_phi): Likewise.
>
> 	* gcc.dg/vect/slp-reduc-12.c: New testcase.

For GCN target (tested '-march=gfx908' on current sources), I see:

    +PASS: gcc.dg/vect/slp-reduc-12.c (test for excess errors)
    +FAIL: gcc.dg/vect/slp-reduc-12.c scan-tree-dump vect "using single def-use cycle for reduction"

..., where we've got (see attached):

    [...]
    [...]/gcc.dg/vect/slp-reduc-12.c:10:21: optimized: loop vectorized using 256 byte vectors
    [...]
    [...]/gcc.dg/vect/slp-reduc-12.c:10:21: note:   Reduce using direct vector reduction.
    [...]/gcc.dg/vect/slp-reduc-12.c:10:21: note:   vectorizing stmts using SLP.
    [...]

How to address?


Grüße
 Thomas


>  gcc/testsuite/gcc.dg/vect/slp-reduc-12.c | 18 ++++++++++
>  gcc/tree-vect-loop.cc                    | 45 ++++++++++++++----------
>  2 files changed, 45 insertions(+), 18 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/slp-reduc-12.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-12.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-12.c
> new file mode 100644
> index 00000000000..625f8097c54
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-12.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_double } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-require-effective-target vect_hw_misalign } */
> +/* { dg-additional-options "-Ofast" } */
> +
> +double foo (double *x, int * __restrict a, int n)
> +{
> +  double r = 0.;
> +  for (int i = 0; i < n; ++i)
> +    {
> +      a[i] = a[i] + i;
> +      r += x[i];
> +    }
> +  return r;
> +}
> +
> +/* { dg-final { scan-tree-dump "using single def-use cycle for reduction" "vect" } } */
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index bbd5d261907..d9a2ad69484 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -8320,7 +8320,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>     participating.  When unrolling we want each unrolled iteration to have its
>     own reduction accumulator since one of the main goals of unrolling a
>     reduction is to reduce the aggregate loop-carried latency.  */
> -  if (ncopies > 1
> +  if ((ncopies > 1
> +       || (slp_node
> +	   && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
> +	   && SLP_TREE_LANES (slp_node) == 1
> +	   && vect_get_num_copies (loop_vinfo, vectype_in) > 1))
>        && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
>        && reduc_chain_length == 1
>        && loop_vinfo->suggested_unroll_factor == 1)
> @@ -8373,6 +8377,10 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>  	    single_defuse_cycle = false;
>  	}
>      }
> +  if (dump_enabled_p () && single_defuse_cycle)
> +    dump_printf_loc (MSG_NOTE, vect_location,
> +		     "using single def-use cycle for reduction by reducing "
> +		     "multiple vectors to one in the loop body\n");
>    STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
>  
>    /* If the reduction stmt is one of the patterns that have lane
> @@ -8528,9 +8536,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  {
>    tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
>    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> -  int i;
> -  int ncopies;
> -  int vec_num;
> +  unsigned ncopies;
> +  unsigned vec_num;
>  
>    stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
>    gcc_assert (reduc_info->is_reduc_info);
> @@ -8577,7 +8584,6 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    auto_vec<tree> vec_oprnds0;
>    auto_vec<tree> vec_oprnds1;
>    auto_vec<tree> vec_oprnds2;
> -  tree def0;
>  
>    if (dump_enabled_p ())
>      dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
> @@ -8652,20 +8658,21 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>       definition.  */
>    if (single_defuse_cycle)
>      {
> -      gcc_assert (!slp_node);
> -      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
> -				     op.ops[reduc_index],
> -				     reduc_index == 0 ? &vec_oprnds0
> -				     : (reduc_index == 1 ? &vec_oprnds1
> -					: &vec_oprnds2));
> +      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, 1,
> +			 reduc_index == 0 ? op.ops[0] : NULL_TREE, &vec_oprnds0,
> +			 reduc_index == 1 ? op.ops[1] : NULL_TREE, &vec_oprnds1,
> +			 reduc_index == 2 ? op.ops[2] : NULL_TREE,
> +			 &vec_oprnds2);
>      }
>  
>    bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
>  
> -  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
> +  unsigned num = (reduc_index == 0
> +		  ? vec_oprnds1.length () : vec_oprnds0.length ());
> +  for (unsigned i = 0; i < num; ++i)
>      {
>        gimple *new_stmt;
> -      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
> +      tree vop[3] = { vec_oprnds0[i], vec_oprnds1[i], NULL_TREE };
>        if (masked_loop_p && !mask_by_cond_expr)
>  	{
>  	  /* No conditional ifns have been defined for dot-product yet.  */
> @@ -8720,10 +8727,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
>  	}
>  
> -      if (slp_node)
> -	slp_node->push_vec_def (new_stmt);
> -      else if (single_defuse_cycle
> -	       && i < ncopies - 1)
> +      if (single_defuse_cycle && i < num - 1)
>  	{
>  	  if (reduc_index == 0)
>  	    vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
> @@ -8732,6 +8736,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  	  else if (reduc_index == 2)
>  	    vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
>  	}
> +      else if (slp_node)
> +	slp_node->push_vec_def (new_stmt);
>        else
>  	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>      }
> @@ -8795,7 +8801,10 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
>    /* Check whether we should use a single PHI node and accumulate
>       vectors to one before the backedge.  */
>    if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
> -    ncopies = 1;
> +    {
> +      ncopies = 1;
> +      vec_num = 1;
> +    }
>  
>    /* Create the destination vector  */
>    gphi *phi = as_a <gphi *> (stmt_info->stmt);
> -- 
> 2.35.3
Richard Biener June 25, 2024, 12:58 p.m. UTC | #2
On Tue, 25 Jun 2024, Thomas Schwinge wrote:

> Hi!
> 
> On 2024-06-14T11:08:15+0200, Richard Biener <rguenther@suse.de> wrote:
> > We can at least mimic single def-use cycle optimization when doing
> > single-lane SLP reductions and that's required to avoid regressing
> > compared to non-SLP.
> >
> > Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.
> >
> > 	* tree-vect-loop.cc (vectorizable_reduction): Allow
> > 	single-def-use cycles with SLP.
> > 	(vect_transform_reduction): Handle SLP single def-use cycles.
> > 	(vect_transform_cycle_phi): Likewise.
> >
> > 	* gcc.dg/vect/slp-reduc-12.c: New testcase.
> 
> For GCN target (tested '-march=gfx908' on current sources), I see:
> 
>     +PASS: gcc.dg/vect/slp-reduc-12.c (test for excess errors)
>     +FAIL: gcc.dg/vect/slp-reduc-12.c scan-tree-dump vect "using single def-use cycle for reduction"
> 
> ..., where we've got (see attached):
> 
>     [...]
>     [...]/gcc.dg/vect/slp-reduc-12.c:10:21: optimized: loop vectorized using 256 byte vectors
>     [...]
>     [...]/gcc.dg/vect/slp-reduc-12.c:10:21: note:   Reduce using direct vector reduction.
>     [...]/gcc.dg/vect/slp-reduc-12.c:10:21: note:   vectorizing stmts using SLP.
>     [...]
> 
> How to address?

The testcase works on the premise that we have VnDF and VmSI with
n != m but for GCN both are 64.  I'm not sure how to gate the dump
scanning properly but there must be precedence?

Richard.

> 
> Grüße
>  Thomas
> 
> 
> >  gcc/testsuite/gcc.dg/vect/slp-reduc-12.c | 18 ++++++++++
> >  gcc/tree-vect-loop.cc                    | 45 ++++++++++++++----------
> >  2 files changed, 45 insertions(+), 18 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.dg/vect/slp-reduc-12.c
> >
> > diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-12.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-12.c
> > new file mode 100644
> > index 00000000000..625f8097c54
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-12.c
> > @@ -0,0 +1,18 @@
> > +/* { dg-do compile } */
> > +/* { dg-require-effective-target vect_double } */
> > +/* { dg-require-effective-target vect_int } */
> > +/* { dg-require-effective-target vect_hw_misalign } */
> > +/* { dg-additional-options "-Ofast" } */
> > +
> > +double foo (double *x, int * __restrict a, int n)
> > +{
> > +  double r = 0.;
> > +  for (int i = 0; i < n; ++i)
> > +    {
> > +      a[i] = a[i] + i;
> > +      r += x[i];
> > +    }
> > +  return r;
> > +}
> > +
> > +/* { dg-final { scan-tree-dump "using single def-use cycle for reduction" "vect" } } */
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index bbd5d261907..d9a2ad69484 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -8320,7 +8320,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >     participating.  When unrolling we want each unrolled iteration to have its
> >     own reduction accumulator since one of the main goals of unrolling a
> >     reduction is to reduce the aggregate loop-carried latency.  */
> > -  if (ncopies > 1
> > +  if ((ncopies > 1
> > +       || (slp_node
> > +	   && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
> > +	   && SLP_TREE_LANES (slp_node) == 1
> > +	   && vect_get_num_copies (loop_vinfo, vectype_in) > 1))
> >        && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
> >        && reduc_chain_length == 1
> >        && loop_vinfo->suggested_unroll_factor == 1)
> > @@ -8373,6 +8377,10 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >  	    single_defuse_cycle = false;
> >  	}
> >      }
> > +  if (dump_enabled_p () && single_defuse_cycle)
> > +    dump_printf_loc (MSG_NOTE, vect_location,
> > +		     "using single def-use cycle for reduction by reducing "
> > +		     "multiple vectors to one in the loop body\n");
> >    STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
> >  
> >    /* If the reduction stmt is one of the patterns that have lane
> > @@ -8528,9 +8536,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >  {
> >    tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
> >    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> > -  int i;
> > -  int ncopies;
> > -  int vec_num;
> > +  unsigned ncopies;
> > +  unsigned vec_num;
> >  
> >    stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
> >    gcc_assert (reduc_info->is_reduc_info);
> > @@ -8577,7 +8584,6 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >    auto_vec<tree> vec_oprnds0;
> >    auto_vec<tree> vec_oprnds1;
> >    auto_vec<tree> vec_oprnds2;
> > -  tree def0;
> >  
> >    if (dump_enabled_p ())
> >      dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
> > @@ -8652,20 +8658,21 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >       definition.  */
> >    if (single_defuse_cycle)
> >      {
> > -      gcc_assert (!slp_node);
> > -      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
> > -				     op.ops[reduc_index],
> > -				     reduc_index == 0 ? &vec_oprnds0
> > -				     : (reduc_index == 1 ? &vec_oprnds1
> > -					: &vec_oprnds2));
> > +      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, 1,
> > +			 reduc_index == 0 ? op.ops[0] : NULL_TREE, &vec_oprnds0,
> > +			 reduc_index == 1 ? op.ops[1] : NULL_TREE, &vec_oprnds1,
> > +			 reduc_index == 2 ? op.ops[2] : NULL_TREE,
> > +			 &vec_oprnds2);
> >      }
> >  
> >    bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
> >  
> > -  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
> > +  unsigned num = (reduc_index == 0
> > +		  ? vec_oprnds1.length () : vec_oprnds0.length ());
> > +  for (unsigned i = 0; i < num; ++i)
> >      {
> >        gimple *new_stmt;
> > -      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
> > +      tree vop[3] = { vec_oprnds0[i], vec_oprnds1[i], NULL_TREE };
> >        if (masked_loop_p && !mask_by_cond_expr)
> >  	{
> >  	  /* No conditional ifns have been defined for dot-product yet.  */
> > @@ -8720,10 +8727,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >  	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
> >  	}
> >  
> > -      if (slp_node)
> > -	slp_node->push_vec_def (new_stmt);
> > -      else if (single_defuse_cycle
> > -	       && i < ncopies - 1)
> > +      if (single_defuse_cycle && i < num - 1)
> >  	{
> >  	  if (reduc_index == 0)
> >  	    vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
> > @@ -8732,6 +8736,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >  	  else if (reduc_index == 2)
> >  	    vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
> >  	}
> > +      else if (slp_node)
> > +	slp_node->push_vec_def (new_stmt);
> >        else
> >  	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> >      }
> > @@ -8795,7 +8801,10 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
> >    /* Check whether we should use a single PHI node and accumulate
> >       vectors to one before the backedge.  */
> >    if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
> > -    ncopies = 1;
> > +    {
> > +      ncopies = 1;
> > +      vec_num = 1;
> > +    }
> >  
> >    /* Create the destination vector  */
> >    gphi *phi = as_a <gphi *> (stmt_info->stmt);
> > -- 
> > 2.35.3
> 
> 
>
diff mbox series

Patch

diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-12.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-12.c
new file mode 100644
index 00000000000..625f8097c54
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-12.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_double } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_hw_misalign } */
+/* { dg-additional-options "-Ofast" } */
+
+double foo (double *x, int * __restrict a, int n)
+{
+  double r = 0.;
+  for (int i = 0; i < n; ++i)
+    {
+      a[i] = a[i] + i;
+      r += x[i];
+    }
+  return r;
+}
+
+/* { dg-final { scan-tree-dump "using single def-use cycle for reduction" "vect" } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index bbd5d261907..d9a2ad69484 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -8320,7 +8320,11 @@  vectorizable_reduction (loop_vec_info loop_vinfo,
    participating.  When unrolling we want each unrolled iteration to have its
    own reduction accumulator since one of the main goals of unrolling a
    reduction is to reduce the aggregate loop-carried latency.  */
-  if (ncopies > 1
+  if ((ncopies > 1
+       || (slp_node
+	   && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
+	   && SLP_TREE_LANES (slp_node) == 1
+	   && vect_get_num_copies (loop_vinfo, vectype_in) > 1))
       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
       && reduc_chain_length == 1
       && loop_vinfo->suggested_unroll_factor == 1)
@@ -8373,6 +8377,10 @@  vectorizable_reduction (loop_vec_info loop_vinfo,
 	    single_defuse_cycle = false;
 	}
     }
+  if (dump_enabled_p () && single_defuse_cycle)
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "using single def-use cycle for reduction by reducing "
+		     "multiple vectors to one in the loop body\n");
   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
 
   /* If the reduction stmt is one of the patterns that have lane
@@ -8528,9 +8536,8 @@  vect_transform_reduction (loop_vec_info loop_vinfo,
 {
   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-  int i;
-  int ncopies;
-  int vec_num;
+  unsigned ncopies;
+  unsigned vec_num;
 
   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
   gcc_assert (reduc_info->is_reduc_info);
@@ -8577,7 +8584,6 @@  vect_transform_reduction (loop_vec_info loop_vinfo,
   auto_vec<tree> vec_oprnds0;
   auto_vec<tree> vec_oprnds1;
   auto_vec<tree> vec_oprnds2;
-  tree def0;
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
@@ -8652,20 +8658,21 @@  vect_transform_reduction (loop_vec_info loop_vinfo,
      definition.  */
   if (single_defuse_cycle)
     {
-      gcc_assert (!slp_node);
-      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
-				     op.ops[reduc_index],
-				     reduc_index == 0 ? &vec_oprnds0
-				     : (reduc_index == 1 ? &vec_oprnds1
-					: &vec_oprnds2));
+      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, 1,
+			 reduc_index == 0 ? op.ops[0] : NULL_TREE, &vec_oprnds0,
+			 reduc_index == 1 ? op.ops[1] : NULL_TREE, &vec_oprnds1,
+			 reduc_index == 2 ? op.ops[2] : NULL_TREE,
+			 &vec_oprnds2);
     }
 
   bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
 
-  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
+  unsigned num = (reduc_index == 0
+		  ? vec_oprnds1.length () : vec_oprnds0.length ());
+  for (unsigned i = 0; i < num; ++i)
     {
       gimple *new_stmt;
-      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
+      tree vop[3] = { vec_oprnds0[i], vec_oprnds1[i], NULL_TREE };
       if (masked_loop_p && !mask_by_cond_expr)
 	{
 	  /* No conditional ifns have been defined for dot-product yet.  */
@@ -8720,10 +8727,7 @@  vect_transform_reduction (loop_vec_info loop_vinfo,
 	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
 	}
 
-      if (slp_node)
-	slp_node->push_vec_def (new_stmt);
-      else if (single_defuse_cycle
-	       && i < ncopies - 1)
+      if (single_defuse_cycle && i < num - 1)
 	{
 	  if (reduc_index == 0)
 	    vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
@@ -8732,6 +8736,8 @@  vect_transform_reduction (loop_vec_info loop_vinfo,
 	  else if (reduc_index == 2)
 	    vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
 	}
+      else if (slp_node)
+	slp_node->push_vec_def (new_stmt);
       else
 	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
     }
@@ -8795,7 +8801,10 @@  vect_transform_cycle_phi (loop_vec_info loop_vinfo,
   /* Check whether we should use a single PHI node and accumulate
      vectors to one before the backedge.  */
   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
-    ncopies = 1;
+    {
+      ncopies = 1;
+      vec_num = 1;
+    }
 
   /* Create the destination vector  */
   gphi *phi = as_a <gphi *> (stmt_info->stmt);