Message ID | 2adef8b10433859b6642282b03a11df33c732d11.1694657494.git.linkw@linux.ibm.com |
---|---|
State | New |
Series | vect: Move costing next to the transform for vect store |
On Thu, Sep 14, 2023 at 5:12 AM Kewen Lin <linkw@linux.ibm.com> wrote:
>
> This patch adjusts the cost handling for VMAT_ELEMENTWISE
> and VMAT_STRIDED_SLP in function vectorizable_store.  We
> no longer call function vect_model_store_cost for them.
>
> Like what we improved for PR82255 on the load side, this change
> helps us get rid of unnecessary vec_to_scalar costing for some
> cases with VMAT_STRIDED_SLP.  One typical test case,
> gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c, is added
> for it.  It also makes the costing consistent for some
> previously inconsistent cases.
>
> Besides, this also special-cases the interleaving stores for
> these two affected memory access types: since for interleaving
> stores the whole chain is vectorized when the last store in the
> chain is reached and the other stores in the group are skipped,
> and to stay consistent with the transform handling that iterates
> over the whole group, we only cost the first store in the group.
> Ideally we would cost only the last one, but finding it is not
> trivial, and using the first one is equivalent.

OK

> gcc/ChangeLog:
>
>         * tree-vect-stmts.cc (vect_model_store_cost): Assert it won't get
>         VMAT_ELEMENTWISE and VMAT_STRIDED_SLP any more, and remove their
>         related handlings.
>         (vectorizable_store): Adjust the cost handling on VMAT_ELEMENTWISE
>         and VMAT_STRIDED_SLP without calling vect_model_store_cost.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c: New test.
> ---
>  .../costmodel/ppc/costmodel-vect-store-1.c |  23 +++
>  gcc/tree-vect-stmts.cc                     | 160 +++++++++++-------
>  2 files changed, 120 insertions(+), 63 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c
> new file mode 100644
> index 00000000000..ab5f3301492
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-O3" } */
> +
> +/* This test case is partially extracted from case
> +   gcc.dg/vect/vect-avg-16.c, it's to verify we don't
> +   cost a store with vec_to_scalar when we shouldn't.  */
> +
> +void
> +test (signed char *restrict a, signed char *restrict b, signed char *restrict c,
> +      int n)
> +{
> +  for (int j = 0; j < n; ++j)
> +    {
> +      for (int i = 0; i < 16; ++i)
> +        a[i] = (b[i] + c[i]) >> 1;
> +      a += 20;
> +      b += 20;
> +      c += 20;
> +    }
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vec_to_scalar" 0 "vect" } } */
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 048c14d291c..3d01168080a 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -964,7 +964,9 @@ vect_model_store_cost (vec_info *vinfo, stmt_vec_info stmt_info, int ncopies,
>                         vec_load_store_type vls_type, slp_tree slp_node,
>                         stmt_vector_for_cost *cost_vec)
>  {
> -  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER);
> +  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
> +              && memory_access_type != VMAT_ELEMENTWISE
> +              && memory_access_type != VMAT_STRIDED_SLP);
>    unsigned int inside_cost = 0, prologue_cost = 0;
>    stmt_vec_info first_stmt_info = stmt_info;
>    bool grouped_access_p = STMT_VINFO_GROUPED_ACCESS (stmt_info);
> @@ -1010,29 +1012,9 @@ vect_model_store_cost (vec_info *vinfo, stmt_vec_info stmt_info, int ncopies,
>                                         group_size);
>      }
>
> -  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
>    /* Costs of the stores.  */
> -  if (memory_access_type == VMAT_ELEMENTWISE)
> -    {
> -      unsigned int assumed_nunits = vect_nunits_for_cost (vectype);
> -      /* N scalar stores plus extracting the elements.  */
> -      inside_cost += record_stmt_cost (cost_vec,
> -                                       ncopies * assumed_nunits,
> -                                       scalar_store, stmt_info, 0, vect_body);
> -    }
> -  else
> -    vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
> -                         misalignment, &inside_cost, cost_vec);
> -
> -  if (memory_access_type == VMAT_ELEMENTWISE
> -      || memory_access_type == VMAT_STRIDED_SLP)
> -    {
> -      /* N scalar stores plus extracting the elements.  */
> -      unsigned int assumed_nunits = vect_nunits_for_cost (vectype);
> -      inside_cost += record_stmt_cost (cost_vec,
> -                                       ncopies * assumed_nunits,
> -                                       vec_to_scalar, stmt_info, 0, vect_body);
> -    }
> +  vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
> +                       misalignment, &inside_cost, cost_vec);
>
>    /* When vectorizing a store into the function result assign
>       a penalty if the function returns in a multi-register location.
> @@ -8416,6 +8398,18 @@ vectorizable_store (vec_info *vinfo,
>                           "Vectorizing an unaligned access.\n");
>
>        STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
> +
> +      /* As function vect_transform_stmt shows, for interleaving stores
> +         the whole chain is vectorized when the last store in the chain
> +         is reached, the other stores in the group are skipped.  So we
> +         want to only cost the last one here, but it's not trivial to
> +         get the last, as it's equivalent to use the first one for
> +         costing, use the first one instead.  */
> +      if (grouped_store
> +          && !slp
> +          && first_stmt_info != stmt_info
> +          && memory_access_type == VMAT_ELEMENTWISE)
> +        return true;
>      }
>    gcc_assert (memory_access_type == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
>
> @@ -8488,14 +8482,7 @@ vectorizable_store (vec_info *vinfo,
>    if (memory_access_type == VMAT_ELEMENTWISE
>        || memory_access_type == VMAT_STRIDED_SLP)
>      {
> -      if (costing_p)
> -        {
> -          vect_model_store_cost (vinfo, stmt_info, ncopies, memory_access_type,
> -                                 alignment_support_scheme, misalignment,
> -                                 vls_type, slp_node, cost_vec);
> -          return true;
> -        }
> -
> +      unsigned inside_cost = 0, prologue_cost = 0;
>        gimple_stmt_iterator incr_gsi;
>        bool insert_after;
>        gimple *incr;
> @@ -8503,7 +8490,7 @@ vectorizable_store (vec_info *vinfo,
>        tree ivstep;
>        tree running_off;
>        tree stride_base, stride_step, alias_off;
> -      tree vec_oprnd;
> +      tree vec_oprnd = NULL_TREE;
>        tree dr_offset;
>        unsigned int g;
>        /* Checked by get_load_store_type.  */
> @@ -8609,26 +8596,30 @@ vectorizable_store (vec_info *vinfo,
>                    lnel = const_nunits;
>                    ltype = vectype;
>                    lvectype = vectype;
> +                  alignment_support_scheme = dr_align;
> +                  misalignment = mis_align;
>                  }
>              }
>            ltype = build_aligned_type (ltype, TYPE_ALIGN (elem_type));
>            ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
>          }
>
> -      ivstep = stride_step;
> -      ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep,
> -                            build_int_cst (TREE_TYPE (ivstep), vf));
> +      if (!costing_p)
> +        {
> +          ivstep = stride_step;
> +          ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep,
> +                                build_int_cst (TREE_TYPE (ivstep), vf));
>
> -      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> +          standard_iv_increment_position (loop, &incr_gsi, &insert_after);
>
> -      stride_base = cse_and_gimplify_to_preheader (loop_vinfo, stride_base);
> -      ivstep = cse_and_gimplify_to_preheader (loop_vinfo, ivstep);
> -      create_iv (stride_base, PLUS_EXPR, ivstep, NULL,
> -                 loop, &incr_gsi, insert_after,
> -                 &offvar, NULL);
> -      incr = gsi_stmt (incr_gsi);
> +          stride_base = cse_and_gimplify_to_preheader (loop_vinfo, stride_base);
> +          ivstep = cse_and_gimplify_to_preheader (loop_vinfo, ivstep);
> +          create_iv (stride_base, PLUS_EXPR, ivstep, NULL, loop, &incr_gsi,
> +                     insert_after, &offvar, NULL);
> +          incr = gsi_stmt (incr_gsi);
>
> -      stride_step = cse_and_gimplify_to_preheader (loop_vinfo, stride_step);
> +          stride_step = cse_and_gimplify_to_preheader (loop_vinfo, stride_step);
> +        }
>
>        alias_off = build_int_cst (ref_type, 0);
>        stmt_vec_info next_stmt_info = first_stmt_info;
> @@ -8636,39 +8627,76 @@ vectorizable_store (vec_info *vinfo,
>        for (g = 0; g < group_size; g++)
>          {
>            running_off = offvar;
> -          if (g)
> +          if (!costing_p)
>              {
> -              tree size = TYPE_SIZE_UNIT (ltype);
> -              tree pos = fold_build2 (MULT_EXPR, sizetype, size_int (g),
> -                                      size);
> -              tree newoff = copy_ssa_name (running_off, NULL);
> -              incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
> -                                          running_off, pos);
> -              vect_finish_stmt_generation (vinfo, stmt_info, incr, gsi);
> -              running_off = newoff;
> +              if (g)
> +                {
> +                  tree size = TYPE_SIZE_UNIT (ltype);
> +                  tree pos
> +                    = fold_build2 (MULT_EXPR, sizetype, size_int (g), size);
> +                  tree newoff = copy_ssa_name (running_off, NULL);
> +                  incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
> +                                              running_off, pos);
> +                  vect_finish_stmt_generation (vinfo, stmt_info, incr, gsi);
> +                  running_off = newoff;
> +                }
>              }
>            if (!slp)
>              op = vect_get_store_rhs (next_stmt_info);
> -          vect_get_vec_defs (vinfo, next_stmt_info, slp_node, ncopies,
> -                             op, &vec_oprnds);
> +          if (!costing_p)
> +            vect_get_vec_defs (vinfo, next_stmt_info, slp_node, ncopies, op,
> +                               &vec_oprnds);
> +          else if (!slp)
> +            {
> +              enum vect_def_type cdt;
> +              gcc_assert (vect_is_simple_use (op, vinfo, &cdt));
> +              if (cdt == vect_constant_def || cdt == vect_external_def)
> +                prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
> +                                                   stmt_info, 0, vect_prologue);
> +            }
>            unsigned int group_el = 0;
>            unsigned HOST_WIDE_INT
>              elsz = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
>            for (j = 0; j < ncopies; j++)
>              {
> -              vec_oprnd = vec_oprnds[j];
> -              /* Pun the vector to extract from if necessary.  */
> -              if (lvectype != vectype)
> +              if (!costing_p)
>                  {
> -                  tree tem = make_ssa_name (lvectype);
> -                  gimple *pun
> -                    = gimple_build_assign (tem, build1 (VIEW_CONVERT_EXPR,
> -                                                        lvectype, vec_oprnd));
> -                  vect_finish_stmt_generation (vinfo, stmt_info, pun, gsi);
> -                  vec_oprnd = tem;
> +                  vec_oprnd = vec_oprnds[j];
> +                  /* Pun the vector to extract from if necessary.  */
> +                  if (lvectype != vectype)
> +                    {
> +                      tree tem = make_ssa_name (lvectype);
> +                      tree cvt
> +                        = build1 (VIEW_CONVERT_EXPR, lvectype, vec_oprnd);
> +                      gimple *pun = gimple_build_assign (tem, cvt);
> +                      vect_finish_stmt_generation (vinfo, stmt_info, pun, gsi);
> +                      vec_oprnd = tem;
> +                    }
>                  }
>                for (i = 0; i < nstores; i++)
>                  {
> +                  if (costing_p)
> +                    {
> +                      /* Only need vector extracting when there are more
> +                         than one stores.  */
> +                      if (nstores > 1)
> +                        inside_cost
> +                          += record_stmt_cost (cost_vec, 1, vec_to_scalar,
> +                                               stmt_info, 0, vect_body);
> +                      /* Take a single lane vector type store as scalar
> +                         store to avoid ICE like 110776.  */
> +                      if (VECTOR_TYPE_P (ltype)
> +                          && known_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
> +                        vect_get_store_cost (vinfo, stmt_info, 1,
> +                                             alignment_support_scheme,
> +                                             misalignment, &inside_cost,
> +                                             cost_vec);
> +                      else
> +                        inside_cost
> +                          += record_stmt_cost (cost_vec, 1, scalar_store,
> +                                               stmt_info, 0, vect_body);
> +                      continue;
> +                    }
>                    tree newref, newoff;
>                    gimple *incr, *assign;
>                    tree size = TYPE_SIZE (ltype);
> @@ -8719,6 +8747,12 @@ vectorizable_store (vec_info *vinfo,
>            break;
>          }
>
> +      if (costing_p && dump_enabled_p ())
> +        dump_printf_loc (MSG_NOTE, vect_location,
> +                         "vect_model_store_cost: inside_cost = %d, "
> +                         "prologue_cost = %d .\n",
> +                         inside_cost, prologue_cost);
> +
>        return true;
>      }
>
> --
> 2.31.1
>
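
A note on the grouped-store special case discussed in the description above: for an interleaving store group, vect_transform_stmt generates code for the whole chain only once, when the last member of the chain is reached, so costing every member separately would count the group several times. Below is a minimal standalone sketch in plain C++ -- not GCC code; the type and helper names are made up for illustration -- of how costing only one representative member keeps the accounting consistent with that transform behaviour.

#include <cstdio>
#include <vector>

// Toy stand-in for a vectorizer statement record: each member of an
// interleaving group points at the group leader (the DR_GROUP_FIRST_ELEMENT
// analogue).
struct stmt_info
{
  const stmt_info *first_in_group;
};

// Hypothetical per-statement costing hook, analogous to the costing path
// in vectorizable_store.
static int
cost_store (const stmt_info &s, int group_size)
{
  // Mirror the patch: skip every member except the group leader, because
  // the transform emits code for the whole chain exactly once.
  if (s.first_in_group != &s)
    return 0;
  // Pretend each member of the group contributes one "store" cost unit.
  return group_size;
}

int
main ()
{
  std::vector<stmt_info> group (4);
  for (auto &s : group)
    s.first_in_group = &group[0];

  int total = 0;
  for (const auto &s : group)
    total += cost_store (s, (int) group.size ());

  // The group is costed exactly once, no matter how many members it has.
  printf ("total store cost units: %d\n", total);   /* prints 4 */
  return 0;
}

The early return the patch adds to vectorizable_store (grouped_store && !slp && first_stmt_info != stmt_info) plays the role of the leader check here; as the description notes, using the first member instead of the last is equivalent, because either way exactly one member ends up carrying the whole group's cost.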
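
The block removed from vect_model_store_cost charged ncopies * assumed_nunits vec_to_scalar entries for every VMAT_ELEMENTWISE and VMAT_STRIDED_SLP store, even when the transform emits full-vector stores and never extracts a lane -- which is exactly what the new test's scan for zero vec_to_scalar guards against. The costing now inlined into vectorizable_store instead follows the transform's own group_size / ncopies / nstores loop structure. Here is a standalone model of that counting in plain C++ (not GCC code; it assumes one representative group and ignores the scalar_to_vec prologue cost and the target-specific cost values):

#include <cstdio>

struct store_costs
{
  int vec_to_scalar;
  int scalar_store;
  int vector_store;
};

// group_size/ncopies/nstores mirror the loop bounds in the patch;
// multi_lane_ltype stands for "VECTOR_TYPE_P (ltype) and more than one
// subpart", i.e. each stored piece is itself a vector.
static store_costs
count_store_costs (int group_size, int ncopies, int nstores,
                   bool multi_lane_ltype)
{
  store_costs c = {0, 0, 0};
  for (int g = 0; g < group_size; g++)
    for (int j = 0; j < ncopies; j++)
      for (int i = 0; i < nstores; i++)
        {
          /* Lane extraction is only needed when one vector def feeds
             several stored pieces.  */
          if (nstores > 1)
            c.vec_to_scalar++;
          /* A multi-lane piece is costed as a vector store, a single-lane
             (or scalar) piece as a scalar store.  */
          if (multi_lane_ltype)
            c.vector_store++;
          else
            c.scalar_store++;
        }
  return c;
}

int
main ()
{
  /* One full-vector piece per copy, e.g. a whole V16QI store.  */
  store_costs full = count_store_costs (1, 1, 1, true);
  printf ("full-vector piece: vec_to_scalar=%d scalar_store=%d vector_store=%d\n",
          full.vec_to_scalar, full.scalar_store, full.vector_store);

  /* A genuinely elementwise store of a 4-lane vector: each lane pays for
     its extraction plus a scalar store.  */
  store_costs elem = count_store_costs (1, 1, 4, false);
  printf ("elementwise: vec_to_scalar=%d scalar_store=%d vector_store=%d\n",
          elem.vec_to_scalar, elem.scalar_store, elem.vector_store);
  return 0;
}

Assuming the 16 contiguous char stores in costmodel-vect-store-1.c are emitted as one V16QI store per copy (nstores == 1), the first call records no vec_to_scalar at all, matching the dg-final scan; the second call models the elementwise case, where the per-lane extraction cost is still charged.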
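
One further detail of the inlined costing: when the statement is not part of an SLP node and the stored value is a constant or external (loop-invariant) definition, the patch records one scalar_to_vec in the prologue per group member, modelling the vector build that can be hoisted out of the loop. A tiny standalone illustration (plain C++, hypothetical names, not GCC internals):

#include <cstdio>

enum def_kind { def_constant, def_external, def_internal };

// Mirrors the new costing branch:
//   if (cdt == vect_constant_def || cdt == vect_external_def)
//     prologue_cost += record_stmt_cost (..., scalar_to_vec, ..., vect_prologue);
static int
prologue_cost_for_rhs (def_kind kind)
{
  return (kind == def_constant || kind == def_external) ? 1 : 0;
}

int
main ()
{
  printf ("constant rhs : %d scalar_to_vec in prologue\n",
          prologue_cost_for_rhs (def_constant));
  printf ("loop-varying : %d scalar_to_vec in prologue\n",
          prologue_cost_for_rhs (def_internal));
  return 0;
}

A loop-varying operand produces its vector inside the loop and is costed when its own definition statement is analyzed, so no prologue entry is added for it here.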