Message ID | 1328907521.18863.6.camel@gnopaine |
---|---|
State | New |
Headers | show |
Greetings, Given the recent discussion on getting 4.6 cleaned up, I thought I'd check back on this one. Thanks! Bill On Fri, 2012-02-10 at 14:58 -0600, William J. Schmidt wrote: > This patch backports the two recent trunk fixes for powerpc64 > vectorization degradations. The fixes are largely identical to their > 4.7 counterparts except that (a) the logic for > STMT_VINFO_PATTERN_DEF_SEQ does not apply in 4.6, and (b) the changes to > vectorizable_conversion in 4.7 correspond to changes in > vectorizable_type_demotion and vectorizable_type_promotion in 4.6. > > Bootstrapped and tested for regressions and performance for > powerpc64-linux. OK to commit after the trunk patch has a few days of > burn-in? > > Thanks, > Bill > > > 2012-02-10 Bill Schmidt <wschmidt@linux.vnet.ibm.com> > Ira Rosen <irar@il.ibm.com> > > PR tree-optimization/50031 > PR tree-optimization/50969 > * targhooks.c (default_builtin_vectorization_cost): Handle > vec_promote_demote. > * target.h (enum vect_cost_for_stmt): Add vec_promote_demote. > * tree-vect-loop.c (vect_get_single_scalar_iteraion_cost): Handle > all types of reduction and pattern statements. > (vect_estimate_min_profitable_iters): Likewise. > * tree-vect-stmts.c (vect_model_promotion_demotion_cost): New function. > (vect_model_store_cost): Use vec_perm rather than vector_stmt for > statement cost. > (vect_model_load_cost): Likewise. > (vect_get_load_cost): Likewise; add dump logic for explicit realigns. > (vectorizable_type_demotion): Call vect_model_promotion_demotion_cost. > (vectorizable_type_promotion): Likewise. > * config/spu/spu.c (spu_builtin_vectorization_cost): Handle > vec_promote_demote. > * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise. > * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): Update > vec_perm for VSX and handle vec_promote_demote. > > > Index: gcc/targhooks.c > =================================================================== > --- gcc/targhooks.c (revision 184047) > +++ gcc/targhooks.c (working copy) > @@ -529,6 +529,7 @@ default_builtin_vectorization_cost (enum vect_cost > case scalar_to_vec: > case cond_branch_not_taken: > case vec_perm: > + case vec_promote_demote: > return 1; > > case unaligned_load: > Index: gcc/target.h > =================================================================== > --- gcc/target.h (revision 184047) > +++ gcc/target.h (working copy) > @@ -128,7 +128,8 @@ enum vect_cost_for_stmt > scalar_to_vec, > cond_branch_not_taken, > cond_branch_taken, > - vec_perm > + vec_perm, > + vec_promote_demote > }; > > /* Sets of optimization levels at which an option may be enabled by > Index: gcc/tree-vect-loop.c > =================================================================== > --- gcc/tree-vect-loop.c (revision 184047) > +++ gcc/tree-vect-loop.c (working copy) > @@ -2104,7 +2104,8 @@ vect_get_single_scalar_iteraion_cost (loop_vec_inf > if (stmt_info > && !STMT_VINFO_RELEVANT_P (stmt_info) > && (!STMT_VINFO_LIVE_P (stmt_info) > - || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) > + || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) > + && !STMT_VINFO_IN_PATTERN_P (stmt_info)) > continue; > > if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))) > @@ -2251,11 +2252,19 @@ vect_estimate_min_profitable_iters (loop_vec_info > { > gimple stmt = gsi_stmt (si); > stmt_vec_info stmt_info = vinfo_for_stmt (stmt); > + > + if (STMT_VINFO_IN_PATTERN_P (stmt_info)) > + { > + stmt = STMT_VINFO_RELATED_STMT (stmt_info); > + stmt_info = vinfo_for_stmt (stmt); > + } > + > /* Skip stmts that are not vectorized inside the loop. */ > if (!STMT_VINFO_RELEVANT_P (stmt_info) > && (!STMT_VINFO_LIVE_P (stmt_info) > - || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) > + || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))) > continue; > + > vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor; > /* FIXME: for stmts in the inner-loop in outer-loop vectorization, > some of the "outside" costs are generated inside the outer-loop. */ > Index: gcc/tree-vect-stmts.c > =================================================================== > --- gcc/tree-vect-stmts.c (revision 184047) > +++ gcc/tree-vect-stmts.c (working copy) > @@ -623,6 +623,46 @@ vect_model_simple_cost (stmt_vec_info stmt_info, i > } > > > +/* Model cost for type demotion and promotion operations. PWR is normally > + zero for single-step promotions and demotions. It will be one if > + two-step promotion/demotion is required, and so on. Each additional > + step doubles the number of instructions required. */ > + > +static void > +vect_model_promotion_demotion_cost (stmt_vec_info stmt_info, > + enum vect_def_type *dt, int pwr) > +{ > + int i, tmp; > + int inside_cost = 0, outside_cost = 0, single_stmt_cost; > + > + /* The SLP costs were already calculated during SLP tree build. */ > + if (PURE_SLP_STMT (stmt_info)) > + return; > + > + single_stmt_cost = vect_get_stmt_cost (vec_promote_demote); > + for (i = 0; i < pwr + 1; i++) > + { > + tmp = (STMT_VINFO_TYPE (stmt_info) == type_promotion_vec_info_type) ? > + (i + 1) : i; > + inside_cost += vect_pow2 (tmp) * single_stmt_cost; > + } > + > + /* FORNOW: Assuming maximum 2 args per stmts. */ > + for (i = 0; i < 2; i++) > + { > + if (dt[i] == vect_constant_def || dt[i] == vect_external_def) > + outside_cost += vect_get_stmt_cost (vector_stmt); > + } > + > + if (vect_print_dump_info (REPORT_COST)) > + fprintf (vect_dump, "vect_model_promotion_demotion_cost: inside_cost = %d, " > + "outside_cost = %d .", inside_cost, outside_cost); > + > + /* Set the costs in STMT_INFO. */ > + stmt_vinfo_set_inside_of_loop_cost (stmt_info, NULL, inside_cost); > + stmt_vinfo_set_outside_of_loop_cost (stmt_info, NULL, outside_cost); > +} > + > /* Function vect_cost_strided_group_size > > For strided load or store, return the group_size only if it is the first > @@ -691,7 +731,7 @@ vect_model_store_cost (stmt_vec_info stmt_info, in > { > /* Uses a high and low interleave operation for each needed permute. */ > inside_cost = ncopies * exact_log2(group_size) * group_size > - * vect_get_stmt_cost (vector_stmt); > + * vect_get_stmt_cost (vec_perm); > > if (vect_print_dump_info (REPORT_COST)) > fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .", > @@ -795,7 +835,7 @@ vect_model_load_cost (stmt_vec_info stmt_info, int > { > /* Uses an even and odd extract operations for each needed permute. */ > inside_cost = ncopies * exact_log2(group_size) * group_size > - * vect_get_stmt_cost (vector_stmt); > + * vect_get_stmt_cost (vec_perm); > > if (vect_print_dump_info (REPORT_COST)) > fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .", > @@ -855,7 +895,7 @@ vect_get_load_cost (struct data_reference *dr, int > case dr_explicit_realign: > { > *inside_cost += ncopies * (2 * vect_get_stmt_cost (vector_load) > - + vect_get_stmt_cost (vector_stmt)); > + + vect_get_stmt_cost (vec_perm)); > > /* FIXME: If the misalignment remains fixed across the iterations of > the containing loop, the following cost should be added to the > @@ -863,6 +903,9 @@ vect_get_load_cost (struct data_reference *dr, int > if (targetm.vectorize.builtin_mask_for_load) > *inside_cost += vect_get_stmt_cost (vector_stmt); > > + if (vect_print_dump_info (REPORT_COST)) > + fprintf (vect_dump, "vect_model_load_cost: explicit realign"); > + > break; > } > case dr_explicit_realign_optimized: > @@ -886,7 +929,12 @@ vect_get_load_cost (struct data_reference *dr, int > } > > *inside_cost += ncopies * (vect_get_stmt_cost (vector_load) > - + vect_get_stmt_cost (vector_stmt)); > + + vect_get_stmt_cost (vec_perm)); > + > + if (vect_print_dump_info (REPORT_COST)) > + fprintf (vect_dump, > + "vect_model_load_cost: explicit realign optimized"); > + > break; > } > > @@ -2919,7 +2967,7 @@ vectorizable_type_demotion (gimple stmt, gimple_st > STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type; > if (vect_print_dump_info (REPORT_DETAILS)) > fprintf (vect_dump, "=== vectorizable_demotion ==="); > - vect_model_simple_cost (stmt_info, ncopies, dt, NULL); > + vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt); > return true; > } > > @@ -3217,7 +3265,7 @@ vectorizable_type_promotion (gimple stmt, gimple_s > STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type; > if (vect_print_dump_info (REPORT_DETAILS)) > fprintf (vect_dump, "=== vectorizable_promotion ==="); > - vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL); > + vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt); > return true; > } > > Index: gcc/config/spu/spu.c > =================================================================== > --- gcc/config/spu/spu.c (revision 184047) > +++ gcc/config/spu/spu.c (working copy) > @@ -6794,6 +6794,7 @@ spu_builtin_vectorization_cost (enum vect_cost_for > case scalar_to_vec: > case cond_branch_not_taken: > case vec_perm: > + case vec_promote_demote: > return 1; > > case scalar_store: > Index: gcc/config/i386/i386.c > =================================================================== > --- gcc/config/i386/i386.c (revision 184047) > +++ gcc/config/i386/i386.c (working copy) > @@ -32816,7 +32816,8 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo > return ix86_cost->cond_not_taken_branch_cost; > > case vec_perm: > - return 1; > + case vec_promote_demote: > + return ix86_cost->vec_stmt_cost; > > default: > gcc_unreachable (); > Index: gcc/config/rs6000/rs6000.c > =================================================================== > --- gcc/config/rs6000/rs6000.c (revision 184047) > +++ gcc/config/rs6000/rs6000.c (working copy) > @@ -3695,12 +3695,23 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ > case vec_to_scalar: > case scalar_to_vec: > case cond_branch_not_taken: > - case vec_perm: > return 1; > > case cond_branch_taken: > return 3; > > + case vec_perm: > + if (TARGET_VSX) > + return 4; > + else > + return 1; > + > + case vec_promote_demote: > + if (TARGET_VSX) > + return 5; > + else > + return 1; > + > case unaligned_load: > if (TARGET_VSX && TARGET_ALLOW_MOVMISALIGN) > { >
On Thu, Feb 16, 2012 at 2:17 PM, William J. Schmidt <wschmidt@linux.vnet.ibm.com> wrote: > Greetings, > > Given the recent discussion on getting 4.6 cleaned up, I thought I'd > check back on this one. Thanks! Ok. Thanks, Richard. > Bill > > On Fri, 2012-02-10 at 14:58 -0600, William J. Schmidt wrote: >> This patch backports the two recent trunk fixes for powerpc64 >> vectorization degradations. The fixes are largely identical to their >> 4.7 counterparts except that (a) the logic for >> STMT_VINFO_PATTERN_DEF_SEQ does not apply in 4.6, and (b) the changes to >> vectorizable_conversion in 4.7 correspond to changes in >> vectorizable_type_demotion and vectorizable_type_promotion in 4.6. >> >> Bootstrapped and tested for regressions and performance for >> powerpc64-linux. OK to commit after the trunk patch has a few days of >> burn-in? >> >> Thanks, >> Bill >> >> >> 2012-02-10 Bill Schmidt <wschmidt@linux.vnet.ibm.com> >> Ira Rosen <irar@il.ibm.com> >> >> PR tree-optimization/50031 >> PR tree-optimization/50969 >> * targhooks.c (default_builtin_vectorization_cost): Handle >> vec_promote_demote. >> * target.h (enum vect_cost_for_stmt): Add vec_promote_demote. >> * tree-vect-loop.c (vect_get_single_scalar_iteraion_cost): Handle >> all types of reduction and pattern statements. >> (vect_estimate_min_profitable_iters): Likewise. >> * tree-vect-stmts.c (vect_model_promotion_demotion_cost): New function. >> (vect_model_store_cost): Use vec_perm rather than vector_stmt for >> statement cost. >> (vect_model_load_cost): Likewise. >> (vect_get_load_cost): Likewise; add dump logic for explicit realigns. >> (vectorizable_type_demotion): Call vect_model_promotion_demotion_cost. >> (vectorizable_type_promotion): Likewise. >> * config/spu/spu.c (spu_builtin_vectorization_cost): Handle >> vec_promote_demote. >> * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise. >> * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): Update >> vec_perm for VSX and handle vec_promote_demote. >> >> >> Index: gcc/targhooks.c >> =================================================================== >> --- gcc/targhooks.c (revision 184047) >> +++ gcc/targhooks.c (working copy) >> @@ -529,6 +529,7 @@ default_builtin_vectorization_cost (enum vect_cost >> case scalar_to_vec: >> case cond_branch_not_taken: >> case vec_perm: >> + case vec_promote_demote: >> return 1; >> >> case unaligned_load: >> Index: gcc/target.h >> =================================================================== >> --- gcc/target.h (revision 184047) >> +++ gcc/target.h (working copy) >> @@ -128,7 +128,8 @@ enum vect_cost_for_stmt >> scalar_to_vec, >> cond_branch_not_taken, >> cond_branch_taken, >> - vec_perm >> + vec_perm, >> + vec_promote_demote >> }; >> >> /* Sets of optimization levels at which an option may be enabled by >> Index: gcc/tree-vect-loop.c >> =================================================================== >> --- gcc/tree-vect-loop.c (revision 184047) >> +++ gcc/tree-vect-loop.c (working copy) >> @@ -2104,7 +2104,8 @@ vect_get_single_scalar_iteraion_cost (loop_vec_inf >> if (stmt_info >> && !STMT_VINFO_RELEVANT_P (stmt_info) >> && (!STMT_VINFO_LIVE_P (stmt_info) >> - || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) >> + || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) >> + && !STMT_VINFO_IN_PATTERN_P (stmt_info)) >> continue; >> >> if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))) >> @@ -2251,11 +2252,19 @@ vect_estimate_min_profitable_iters (loop_vec_info >> { >> gimple stmt = gsi_stmt (si); >> stmt_vec_info stmt_info = vinfo_for_stmt (stmt); >> + >> + if (STMT_VINFO_IN_PATTERN_P (stmt_info)) >> + { >> + stmt = STMT_VINFO_RELATED_STMT (stmt_info); >> + stmt_info = vinfo_for_stmt (stmt); >> + } >> + >> /* Skip stmts that are not vectorized inside the loop. */ >> if (!STMT_VINFO_RELEVANT_P (stmt_info) >> && (!STMT_VINFO_LIVE_P (stmt_info) >> - || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) >> + || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))) >> continue; >> + >> vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor; >> /* FIXME: for stmts in the inner-loop in outer-loop vectorization, >> some of the "outside" costs are generated inside the outer-loop. */ >> Index: gcc/tree-vect-stmts.c >> =================================================================== >> --- gcc/tree-vect-stmts.c (revision 184047) >> +++ gcc/tree-vect-stmts.c (working copy) >> @@ -623,6 +623,46 @@ vect_model_simple_cost (stmt_vec_info stmt_info, i >> } >> >> >> +/* Model cost for type demotion and promotion operations. PWR is normally >> + zero for single-step promotions and demotions. It will be one if >> + two-step promotion/demotion is required, and so on. Each additional >> + step doubles the number of instructions required. */ >> + >> +static void >> +vect_model_promotion_demotion_cost (stmt_vec_info stmt_info, >> + enum vect_def_type *dt, int pwr) >> +{ >> + int i, tmp; >> + int inside_cost = 0, outside_cost = 0, single_stmt_cost; >> + >> + /* The SLP costs were already calculated during SLP tree build. */ >> + if (PURE_SLP_STMT (stmt_info)) >> + return; >> + >> + single_stmt_cost = vect_get_stmt_cost (vec_promote_demote); >> + for (i = 0; i < pwr + 1; i++) >> + { >> + tmp = (STMT_VINFO_TYPE (stmt_info) == type_promotion_vec_info_type) ? >> + (i + 1) : i; >> + inside_cost += vect_pow2 (tmp) * single_stmt_cost; >> + } >> + >> + /* FORNOW: Assuming maximum 2 args per stmts. */ >> + for (i = 0; i < 2; i++) >> + { >> + if (dt[i] == vect_constant_def || dt[i] == vect_external_def) >> + outside_cost += vect_get_stmt_cost (vector_stmt); >> + } >> + >> + if (vect_print_dump_info (REPORT_COST)) >> + fprintf (vect_dump, "vect_model_promotion_demotion_cost: inside_cost = %d, " >> + "outside_cost = %d .", inside_cost, outside_cost); >> + >> + /* Set the costs in STMT_INFO. */ >> + stmt_vinfo_set_inside_of_loop_cost (stmt_info, NULL, inside_cost); >> + stmt_vinfo_set_outside_of_loop_cost (stmt_info, NULL, outside_cost); >> +} >> + >> /* Function vect_cost_strided_group_size >> >> For strided load or store, return the group_size only if it is the first >> @@ -691,7 +731,7 @@ vect_model_store_cost (stmt_vec_info stmt_info, in >> { >> /* Uses a high and low interleave operation for each needed permute. */ >> inside_cost = ncopies * exact_log2(group_size) * group_size >> - * vect_get_stmt_cost (vector_stmt); >> + * vect_get_stmt_cost (vec_perm); >> >> if (vect_print_dump_info (REPORT_COST)) >> fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .", >> @@ -795,7 +835,7 @@ vect_model_load_cost (stmt_vec_info stmt_info, int >> { >> /* Uses an even and odd extract operations for each needed permute. */ >> inside_cost = ncopies * exact_log2(group_size) * group_size >> - * vect_get_stmt_cost (vector_stmt); >> + * vect_get_stmt_cost (vec_perm); >> >> if (vect_print_dump_info (REPORT_COST)) >> fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .", >> @@ -855,7 +895,7 @@ vect_get_load_cost (struct data_reference *dr, int >> case dr_explicit_realign: >> { >> *inside_cost += ncopies * (2 * vect_get_stmt_cost (vector_load) >> - + vect_get_stmt_cost (vector_stmt)); >> + + vect_get_stmt_cost (vec_perm)); >> >> /* FIXME: If the misalignment remains fixed across the iterations of >> the containing loop, the following cost should be added to the >> @@ -863,6 +903,9 @@ vect_get_load_cost (struct data_reference *dr, int >> if (targetm.vectorize.builtin_mask_for_load) >> *inside_cost += vect_get_stmt_cost (vector_stmt); >> >> + if (vect_print_dump_info (REPORT_COST)) >> + fprintf (vect_dump, "vect_model_load_cost: explicit realign"); >> + >> break; >> } >> case dr_explicit_realign_optimized: >> @@ -886,7 +929,12 @@ vect_get_load_cost (struct data_reference *dr, int >> } >> >> *inside_cost += ncopies * (vect_get_stmt_cost (vector_load) >> - + vect_get_stmt_cost (vector_stmt)); >> + + vect_get_stmt_cost (vec_perm)); >> + >> + if (vect_print_dump_info (REPORT_COST)) >> + fprintf (vect_dump, >> + "vect_model_load_cost: explicit realign optimized"); >> + >> break; >> } >> >> @@ -2919,7 +2967,7 @@ vectorizable_type_demotion (gimple stmt, gimple_st >> STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type; >> if (vect_print_dump_info (REPORT_DETAILS)) >> fprintf (vect_dump, "=== vectorizable_demotion ==="); >> - vect_model_simple_cost (stmt_info, ncopies, dt, NULL); >> + vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt); >> return true; >> } >> >> @@ -3217,7 +3265,7 @@ vectorizable_type_promotion (gimple stmt, gimple_s >> STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type; >> if (vect_print_dump_info (REPORT_DETAILS)) >> fprintf (vect_dump, "=== vectorizable_promotion ==="); >> - vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL); >> + vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt); >> return true; >> } >> >> Index: gcc/config/spu/spu.c >> =================================================================== >> --- gcc/config/spu/spu.c (revision 184047) >> +++ gcc/config/spu/spu.c (working copy) >> @@ -6794,6 +6794,7 @@ spu_builtin_vectorization_cost (enum vect_cost_for >> case scalar_to_vec: >> case cond_branch_not_taken: >> case vec_perm: >> + case vec_promote_demote: >> return 1; >> >> case scalar_store: >> Index: gcc/config/i386/i386.c >> =================================================================== >> --- gcc/config/i386/i386.c (revision 184047) >> +++ gcc/config/i386/i386.c (working copy) >> @@ -32816,7 +32816,8 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo >> return ix86_cost->cond_not_taken_branch_cost; >> >> case vec_perm: >> - return 1; >> + case vec_promote_demote: >> + return ix86_cost->vec_stmt_cost; >> >> default: >> gcc_unreachable (); >> Index: gcc/config/rs6000/rs6000.c >> =================================================================== >> --- gcc/config/rs6000/rs6000.c (revision 184047) >> +++ gcc/config/rs6000/rs6000.c (working copy) >> @@ -3695,12 +3695,23 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ >> case vec_to_scalar: >> case scalar_to_vec: >> case cond_branch_not_taken: >> - case vec_perm: >> return 1; >> >> case cond_branch_taken: >> return 3; >> >> + case vec_perm: >> + if (TARGET_VSX) >> + return 4; >> + else >> + return 1; >> + >> + case vec_promote_demote: >> + if (TARGET_VSX) >> + return 5; >> + else >> + return 1; >> + >> case unaligned_load: >> if (TARGET_VSX && TARGET_ALLOW_MOVMISALIGN) >> { >> >
Index: gcc/targhooks.c =================================================================== --- gcc/targhooks.c (revision 184047) +++ gcc/targhooks.c (working copy) @@ -529,6 +529,7 @@ default_builtin_vectorization_cost (enum vect_cost case scalar_to_vec: case cond_branch_not_taken: case vec_perm: + case vec_promote_demote: return 1; case unaligned_load: Index: gcc/target.h =================================================================== --- gcc/target.h (revision 184047) +++ gcc/target.h (working copy) @@ -128,7 +128,8 @@ enum vect_cost_for_stmt scalar_to_vec, cond_branch_not_taken, cond_branch_taken, - vec_perm + vec_perm, + vec_promote_demote }; /* Sets of optimization levels at which an option may be enabled by Index: gcc/tree-vect-loop.c =================================================================== --- gcc/tree-vect-loop.c (revision 184047) +++ gcc/tree-vect-loop.c (working copy) @@ -2104,7 +2104,8 @@ vect_get_single_scalar_iteraion_cost (loop_vec_inf if (stmt_info && !STMT_VINFO_RELEVANT_P (stmt_info) && (!STMT_VINFO_LIVE_P (stmt_info) - || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) + || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) + && !STMT_VINFO_IN_PATTERN_P (stmt_info)) continue; if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))) @@ -2251,11 +2252,19 @@ vect_estimate_min_profitable_iters (loop_vec_info { gimple stmt = gsi_stmt (si); stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + + if (STMT_VINFO_IN_PATTERN_P (stmt_info)) + { + stmt = STMT_VINFO_RELATED_STMT (stmt_info); + stmt_info = vinfo_for_stmt (stmt); + } + /* Skip stmts that are not vectorized inside the loop. */ if (!STMT_VINFO_RELEVANT_P (stmt_info) && (!STMT_VINFO_LIVE_P (stmt_info) - || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) + || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))) continue; + vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor; /* FIXME: for stmts in the inner-loop in outer-loop vectorization, some of the "outside" costs are generated inside the outer-loop. */ Index: gcc/tree-vect-stmts.c =================================================================== --- gcc/tree-vect-stmts.c (revision 184047) +++ gcc/tree-vect-stmts.c (working copy) @@ -623,6 +623,46 @@ vect_model_simple_cost (stmt_vec_info stmt_info, i } +/* Model cost for type demotion and promotion operations. PWR is normally + zero for single-step promotions and demotions. It will be one if + two-step promotion/demotion is required, and so on. Each additional + step doubles the number of instructions required. */ + +static void +vect_model_promotion_demotion_cost (stmt_vec_info stmt_info, + enum vect_def_type *dt, int pwr) +{ + int i, tmp; + int inside_cost = 0, outside_cost = 0, single_stmt_cost; + + /* The SLP costs were already calculated during SLP tree build. */ + if (PURE_SLP_STMT (stmt_info)) + return; + + single_stmt_cost = vect_get_stmt_cost (vec_promote_demote); + for (i = 0; i < pwr + 1; i++) + { + tmp = (STMT_VINFO_TYPE (stmt_info) == type_promotion_vec_info_type) ? + (i + 1) : i; + inside_cost += vect_pow2 (tmp) * single_stmt_cost; + } + + /* FORNOW: Assuming maximum 2 args per stmts. */ + for (i = 0; i < 2; i++) + { + if (dt[i] == vect_constant_def || dt[i] == vect_external_def) + outside_cost += vect_get_stmt_cost (vector_stmt); + } + + if (vect_print_dump_info (REPORT_COST)) + fprintf (vect_dump, "vect_model_promotion_demotion_cost: inside_cost = %d, " + "outside_cost = %d .", inside_cost, outside_cost); + + /* Set the costs in STMT_INFO. */ + stmt_vinfo_set_inside_of_loop_cost (stmt_info, NULL, inside_cost); + stmt_vinfo_set_outside_of_loop_cost (stmt_info, NULL, outside_cost); +} + /* Function vect_cost_strided_group_size For strided load or store, return the group_size only if it is the first @@ -691,7 +731,7 @@ vect_model_store_cost (stmt_vec_info stmt_info, in { /* Uses a high and low interleave operation for each needed permute. */ inside_cost = ncopies * exact_log2(group_size) * group_size - * vect_get_stmt_cost (vector_stmt); + * vect_get_stmt_cost (vec_perm); if (vect_print_dump_info (REPORT_COST)) fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .", @@ -795,7 +835,7 @@ vect_model_load_cost (stmt_vec_info stmt_info, int { /* Uses an even and odd extract operations for each needed permute. */ inside_cost = ncopies * exact_log2(group_size) * group_size - * vect_get_stmt_cost (vector_stmt); + * vect_get_stmt_cost (vec_perm); if (vect_print_dump_info (REPORT_COST)) fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .", @@ -855,7 +895,7 @@ vect_get_load_cost (struct data_reference *dr, int case dr_explicit_realign: { *inside_cost += ncopies * (2 * vect_get_stmt_cost (vector_load) - + vect_get_stmt_cost (vector_stmt)); + + vect_get_stmt_cost (vec_perm)); /* FIXME: If the misalignment remains fixed across the iterations of the containing loop, the following cost should be added to the @@ -863,6 +903,9 @@ vect_get_load_cost (struct data_reference *dr, int if (targetm.vectorize.builtin_mask_for_load) *inside_cost += vect_get_stmt_cost (vector_stmt); + if (vect_print_dump_info (REPORT_COST)) + fprintf (vect_dump, "vect_model_load_cost: explicit realign"); + break; } case dr_explicit_realign_optimized: @@ -886,7 +929,12 @@ vect_get_load_cost (struct data_reference *dr, int } *inside_cost += ncopies * (vect_get_stmt_cost (vector_load) - + vect_get_stmt_cost (vector_stmt)); + + vect_get_stmt_cost (vec_perm)); + + if (vect_print_dump_info (REPORT_COST)) + fprintf (vect_dump, + "vect_model_load_cost: explicit realign optimized"); + break; } @@ -2919,7 +2967,7 @@ vectorizable_type_demotion (gimple stmt, gimple_st STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type; if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "=== vectorizable_demotion ==="); - vect_model_simple_cost (stmt_info, ncopies, dt, NULL); + vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt); return true; } @@ -3217,7 +3265,7 @@ vectorizable_type_promotion (gimple stmt, gimple_s STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type; if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "=== vectorizable_promotion ==="); - vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL); + vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt); return true; } Index: gcc/config/spu/spu.c =================================================================== --- gcc/config/spu/spu.c (revision 184047) +++ gcc/config/spu/spu.c (working copy) @@ -6794,6 +6794,7 @@ spu_builtin_vectorization_cost (enum vect_cost_for case scalar_to_vec: case cond_branch_not_taken: case vec_perm: + case vec_promote_demote: return 1; case scalar_store: Index: gcc/config/i386/i386.c =================================================================== --- gcc/config/i386/i386.c (revision 184047) +++ gcc/config/i386/i386.c (working copy) @@ -32816,7 +32816,8 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo return ix86_cost->cond_not_taken_branch_cost; case vec_perm: - return 1; + case vec_promote_demote: + return ix86_cost->vec_stmt_cost; default: gcc_unreachable (); Index: gcc/config/rs6000/rs6000.c =================================================================== --- gcc/config/rs6000/rs6000.c (revision 184047) +++ gcc/config/rs6000/rs6000.c (working copy) @@ -3695,12 +3695,23 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ case vec_to_scalar: case scalar_to_vec: case cond_branch_not_taken: - case vec_perm: return 1; case cond_branch_taken: return 3; + case vec_perm: + if (TARGET_VSX) + return 4; + else + return 1; + + case vec_promote_demote: + if (TARGET_VSX) + return 5; + else + return 1; + case unaligned_load: if (TARGET_VSX && TARGET_ALLOW_MOVMISALIGN) {