Message ID | 20240614090815.7CCF913AAF@imap1.dmz-prg2.suse.org
---|---
State | New
Series | [v2] Support single def-use cycle optimization for SLP reduction vectorization
Hi!

On 2024-06-14T11:08:15+0200, Richard Biener <rguenther@suse.de> wrote:
> We can at least mimic single def-use cycle optimization when doing
> single-lane SLP reductions and that's required to avoid regressing
> compared to non-SLP.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.
>
>         * tree-vect-loop.cc (vectorizable_reduction): Allow
>         single-def-use cycles with SLP.
>         (vect_transform_reduction): Handle SLP single def-use cycles.
>         (vect_transform_cycle_phi): Likewise.
>
>         * gcc.dg/vect/slp-reduc-12.c: New testcase.

For the GCN target (tested '-march=gfx908' on current sources), I see:

+PASS: gcc.dg/vect/slp-reduc-12.c (test for excess errors)
+FAIL: gcc.dg/vect/slp-reduc-12.c scan-tree-dump vect "using single def-use cycle for reduction"

..., where we've got (see attached):

[...]
[...]/gcc.dg/vect/slp-reduc-12.c:10:21: optimized: loop vectorized using 256 byte vectors
[...]
[...]/gcc.dg/vect/slp-reduc-12.c:10:21: note: Reduce using direct vector reduction.
[...]/gcc.dg/vect/slp-reduc-12.c:10:21: note: vectorizing stmts using SLP.
[...]

How to address?

Grüße
 Thomas

> gcc/testsuite/gcc.dg/vect/slp-reduc-12.c | 18 ++++++++++
> gcc/tree-vect-loop.cc                    | 45 ++++++++++++++----------
> 2 files changed, 45 insertions(+), 18 deletions(-)
> create mode 100644 gcc/testsuite/gcc.dg/vect/slp-reduc-12.c
>
> [...]
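For context, here is a minimal sketch of the two reduction shapes under discussion, written with GNU C vector extensions. The 4-double vector width, the function names, and the use of exactly two copies (ncopies == 2) are illustrative assumptions, not taken from the patch.

typedef double v4df __attribute__ ((vector_size (32))); /* 4 doubles */

/* One vector accumulator: the "single def-use cycle", reducing
   multiple vectors to one in the loop body.  */
double
sum_single_cycle (const v4df *x, int nvec) /* nvec assumed even */
{
  v4df acc = { 0., 0., 0., 0. };            /* one vector PHI */
  for (int i = 0; i < nvec; i += 2)
    {
      acc += x[i];     /* first copy ...  */
      acc += x[i + 1]; /* ... and second copy reuse one accumulator */
    }
  return acc[0] + acc[1] + acc[2] + acc[3]; /* one final reduction */
}

/* Two vector accumulators: each copy carries its own dependence
   chain; the accumulators are combined only after the loop.  */
double
sum_two_chains (const v4df *x, int nvec) /* nvec assumed even */
{
  v4df acc0 = { 0., 0., 0., 0. };
  v4df acc1 = { 0., 0., 0., 0. };
  for (int i = 0; i < nvec; i += 2)
    {
      acc0 += x[i];
      acc1 += x[i + 1];
    }
  v4df acc = acc0 + acc1;
  return acc[0] + acc[1] + acc[2] + acc[3];
}

As the comment quoted in the patch notes, the second shape is what unrolling wants, since separate accumulators reduce the aggregate loop-carried latency; the patch makes the first shape available for single-lane SLP reductions too, as it already was for non-SLP reductions.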
On Tue, 25 Jun 2024, Thomas Schwinge wrote:

> Hi!
>
> On 2024-06-14T11:08:15+0200, Richard Biener <rguenther@suse.de> wrote:
> > We can at least mimic single def-use cycle optimization when doing
> > single-lane SLP reductions and that's required to avoid regressing
> > compared to non-SLP.
> >
> > Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.
> >
> >         * tree-vect-loop.cc (vectorizable_reduction): Allow
> >         single-def-use cycles with SLP.
> >         (vect_transform_reduction): Handle SLP single def-use cycles.
> >         (vect_transform_cycle_phi): Likewise.
> >
> >         * gcc.dg/vect/slp-reduc-12.c: New testcase.
>
> For the GCN target (tested '-march=gfx908' on current sources), I see:
>
> +PASS: gcc.dg/vect/slp-reduc-12.c (test for excess errors)
> +FAIL: gcc.dg/vect/slp-reduc-12.c scan-tree-dump vect "using single def-use cycle for reduction"
>
> ..., where we've got (see attached):
>
> [...]
> [...]/gcc.dg/vect/slp-reduc-12.c:10:21: optimized: loop vectorized using 256 byte vectors
> [...]
> [...]/gcc.dg/vect/slp-reduc-12.c:10:21: note: Reduce using direct vector reduction.
> [...]/gcc.dg/vect/slp-reduc-12.c:10:21: note: vectorizing stmts using SLP.
> [...]
>
> How to address?

The testcase works on the premise that we have VnDF and VmSI with n != m,
but for GCN both are 64.  I'm not sure how to gate the dump scanning
properly, but there must be precedent?

Richard.

> Grüße
>  Thomas
>
> [...]
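To make the premise concrete (the GCN modes are as Richard describes; the comparison target is an illustrative assumption): on GCN both V64SI and V64DF have 64 lanes, so the int statement and the double reduction each need exactly one vector copy, vect_get_num_copies (loop_vinfo, vectype_in) is 1, and the single def-use cycle path, and with it the dump line, is never reached. On a target with V8SI and V4DF, by contrast, the vectorization factor of 8 needs 8/4 = 2 copies of the double reduction, and the path fires.

One conceivable gating, sketched under the assumption that keying on the target triplet is acceptable (the target-selector form of scan-tree-dump is standard DejaGnu usage in the GCC testsuite; an effective-target keyword may well be preferred instead):

/* Hypothetical gating: skip the scan where int and double vectors
   have the same number of lanes, as on GCN.  */
/* { dg-final { scan-tree-dump "using single def-use cycle for reduction" "vect" { target { ! amdgcn*-*-* } } } } */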
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-12.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-12.c
new file mode 100644
index 00000000000..625f8097c54
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-12.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_double } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_hw_misalign } */
+/* { dg-additional-options "-Ofast" } */
+
+double foo (double *x, int * __restrict a, int n)
+{
+  double r = 0.;
+  for (int i = 0; i < n; ++i)
+    {
+      a[i] = a[i] + i;
+      r += x[i];
+    }
+  return r;
+}
+
+/* { dg-final { scan-tree-dump "using single def-use cycle for reduction" "vect" } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index bbd5d261907..d9a2ad69484 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -8320,7 +8320,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
      participating.  When unrolling we want each unrolled iteration to have its
      own reduction accumulator since one of the main goals of unrolling a
      reduction is to reduce the aggregate loop-carried latency.  */
-  if (ncopies > 1
+  if ((ncopies > 1
+       || (slp_node
+           && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
+           && SLP_TREE_LANES (slp_node) == 1
+           && vect_get_num_copies (loop_vinfo, vectype_in) > 1))
       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
       && reduc_chain_length == 1
       && loop_vinfo->suggested_unroll_factor == 1)
@@ -8373,6 +8377,10 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	  single_defuse_cycle = false;
 	}
     }
+  if (dump_enabled_p () && single_defuse_cycle)
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "using single def-use cycle for reduction by reducing "
+		     "multiple vectors to one in the loop body\n");
   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
 
   /* If the reduction stmt is one of the patterns that have lane
@@ -8528,9 +8536,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 {
   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-  int i;
-  int ncopies;
-  int vec_num;
+  unsigned ncopies;
+  unsigned vec_num;
 
   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
   gcc_assert (reduc_info->is_reduc_info);
@@ -8577,7 +8584,6 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   auto_vec<tree> vec_oprnds0;
   auto_vec<tree> vec_oprnds1;
   auto_vec<tree> vec_oprnds2;
-  tree def0;
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
@@ -8652,20 +8658,21 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
      definition.  */
   if (single_defuse_cycle)
     {
-      gcc_assert (!slp_node);
-      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
-				     op.ops[reduc_index],
-				     reduc_index == 0 ? &vec_oprnds0
-				     : (reduc_index == 1 ? &vec_oprnds1
-					: &vec_oprnds2));
+      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, 1,
+			 reduc_index == 0 ? op.ops[0] : NULL_TREE, &vec_oprnds0,
+			 reduc_index == 1 ? op.ops[1] : NULL_TREE, &vec_oprnds1,
+			 reduc_index == 2 ? op.ops[2] : NULL_TREE,
+			 &vec_oprnds2);
     }
 
   bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
 
-  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
+  unsigned num = (reduc_index == 0
+		  ? vec_oprnds1.length () : vec_oprnds0.length ());
+  for (unsigned i = 0; i < num; ++i)
     {
       gimple *new_stmt;
-      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
+      tree vop[3] = { vec_oprnds0[i], vec_oprnds1[i], NULL_TREE };
       if (masked_loop_p && !mask_by_cond_expr)
 	{
 	  /* No conditional ifns have been defined for dot-product yet.  */
@@ -8720,10 +8727,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
 	}
 
-      if (slp_node)
-	slp_node->push_vec_def (new_stmt);
-      else if (single_defuse_cycle
-	       && i < ncopies - 1)
+      if (single_defuse_cycle && i < num - 1)
 	{
 	  if (reduc_index == 0)
 	    vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
@@ -8732,6 +8736,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 	  else if (reduc_index == 2)
 	    vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
 	}
+      else if (slp_node)
+	slp_node->push_vec_def (new_stmt);
       else
 	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
     }
@@ -8795,7 +8801,10 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
   /* Check whether we should use a single PHI node and accumulate
      vectors to one before the backedge.  */
   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
-    ncopies = 1;
+    {
+      ncopies = 1;
+      vec_num = 1;
+    }
 
   /* Create the destination vector */
   gphi *phi = as_a <gphi *> (stmt_info->stmt);
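For reference, the dump that the testcase's directive scans can be reproduced by hand roughly as follows; a sketch assuming the testcase file is in the current directory (the numeric part of the dump file name varies):

$ gcc -c -Ofast -fdump-tree-vect-details slp-reduc-12.c
$ grep "using single def-use cycle for reduction" slp-reduc-12.c.*t.vect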