Message ID | 1328806575.2799.49.camel@gnopaine |
---|---|
State | New |
Headers | show |
On Thu, Feb 9, 2012 at 5:56 PM, William J. Schmidt <wschmidt@linux.vnet.ibm.com> wrote: > Following is a revision of yesterday's PR50031 patch submission, > modified per Richard's comments. Bootstrapped and tested with no > regressions on powerpc64-linux. I've confirmed the same performance > improvements in SPEC. OK for trunk? Some more questions - maybe Jakub can clarify as well given he worked on patterns recently ... > Thanks, > Bill > > > 2012-02-09 Bill Schmidt <wschmidt@linux.vnet.ibm.com> > Ira Rosen <irar@il.ibm.com> > > PR tree-optimization/50031 > * targhooks.c (default_builtin_vectorization_cost): Handle > vec_promote_demote. > * target.h (enum vect_cost_for_stmt): Add vec_promote_demote. > * tree-vect-loop.c (vect_get_single_scalar_iteraion_cost): Handle > all types of reduction and pattern statements. > (vect_estimate_min_profitable_iters): Likewise. > * tree-vect-stmts.c (vect_model_promotion_demotion_cost): New function. > (vect_get_load_cost): Use vec_perm for permutations; add dump logic > for explicit realigns. > (vectorizable_conversion): Call vect_model_promotion_demotion_cost. > * config/spu/spu.c (spu_builtin_vectorization_cost): Handle > vec_promote_demote. > * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise. > * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): Update > vec_perm for VSX and handle vec_promote_demote. > > > Index: gcc/targhooks.c > =================================================================== > --- gcc/targhooks.c (revision 183944) > +++ gcc/targhooks.c (working copy) > @@ -514,6 +514,7 @@ default_builtin_vectorization_cost (enum vect_cost > case scalar_to_vec: > case cond_branch_not_taken: > case vec_perm: > + case vec_promote_demote: > return 1; > > case unaligned_load: > Index: gcc/target.h > =================================================================== > --- gcc/target.h (revision 183944) > +++ gcc/target.h (working copy) > @@ -145,7 +145,8 @@ enum vect_cost_for_stmt > scalar_to_vec, > cond_branch_not_taken, > cond_branch_taken, > - vec_perm > + vec_perm, > + vec_promote_demote > }; > > /* The target structure. This holds all the backend hooks. */ > Index: gcc/tree-vect-loop.c > =================================================================== > --- gcc/tree-vect-loop.c (revision 183944) > +++ gcc/tree-vect-loop.c (working copy) > @@ -2417,7 +2417,8 @@ vect_get_single_scalar_iteraion_cost (loop_vec_inf > if (stmt_info > && !STMT_VINFO_RELEVANT_P (stmt_info) > && (!STMT_VINFO_LIVE_P (stmt_info) > - || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) > + || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) > + && !STMT_VINFO_IN_PATTERN_P (stmt_info)) > continue; Why would we exclude !relevant stmts when they are part of a pattern? We are looking at a scalar iteration cost, so all stmts that are not "dead" count, no? > > if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))) > @@ -2564,15 +2565,48 @@ vect_estimate_min_profitable_iters (loop_vec_info > { > gimple stmt = gsi_stmt (si); > stmt_vec_info stmt_info = vinfo_for_stmt (stmt); > + > + /* Translate the last statement in a pattern into the > + related replacement statement. */ > + if (STMT_VINFO_IN_PATTERN_P (stmt_info)) > + { > + stmt = STMT_VINFO_RELATED_STMT (stmt_info); > + stmt_info = vinfo_for_stmt (stmt); > + } So here we are tanslating stmt to the "main" scalar pattern stmt - and thus count it as many times as we have stmts in that pattern? That looks wrong. More like if (STMT_VINFO_IN_PATTERN_P (stmt_info) && STMT_VINFO_RELATED_STMT (stmt_info) != stmt) continue; ? Does the main stmt has the flag set and points to itself? > /* Skip stmts that are not vectorized inside the loop. */ > if (!STMT_VINFO_RELEVANT_P (stmt_info) > && (!STMT_VINFO_LIVE_P (stmt_info) > - || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) > + || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))) > continue; > + ... and then, what does STMT_VINFO_INSIDE_OF_LOOP_COST of that "main" pattern stmt represent? Shouldn't it represent the cost of the whole sequence, and thus ... > vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor; > /* FIXME: for stmts in the inner-loop in outer-loop vectorization, > some of the "outside" costs are generated inside the outer-loop. */ > vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info); > + if (is_pattern_stmt_p (stmt_info) > + && STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)) > + { > + gimple_stmt_iterator gsi; The following is excessive? Especially as you probably count the "main" stmt twice? Thus, if the main stmt cost does not cover the sequence then you should skip adding its cost above? Thanks, Richard. > + for (gsi = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)); > + !gsi_end_p (gsi); gsi_next (&gsi)) > + { > + gimple pattern_def_stmt = gsi_stmt (gsi); > + stmt_vec_info pattern_def_stmt_info > + = vinfo_for_stmt (pattern_def_stmt); > + if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info) > + || STMT_VINFO_LIVE_P (pattern_def_stmt_info)) > + { > + vec_inside_cost > + += STMT_VINFO_INSIDE_OF_LOOP_COST > + (pattern_def_stmt_info) * factor; > + vec_outside_cost > + += STMT_VINFO_OUTSIDE_OF_LOOP_COST > + (pattern_def_stmt_info); > + } > + } > + } > } > } > > Index: gcc/tree-vect-stmts.c > =================================================================== > --- gcc/tree-vect-stmts.c (revision 183944) > +++ gcc/tree-vect-stmts.c (working copy) > @@ -811,6 +811,46 @@ vect_model_simple_cost (stmt_vec_info stmt_info, i > } > > > +/* Model cost for type demotion and promotion operations. PWR is normally > + zero for single-step promotions and demotions. It will be one if > + two-step promotion/demotion is required, and so on. Each additional > + step doubles the number of instructions required. */ > + > +static void > +vect_model_promotion_demotion_cost (stmt_vec_info stmt_info, > + enum vect_def_type *dt, int pwr) > +{ > + int i, tmp; > + int inside_cost = 0, outside_cost = 0, single_stmt_cost; > + > + /* The SLP costs were already calculated during SLP tree build. */ > + if (PURE_SLP_STMT (stmt_info)) > + return; > + > + single_stmt_cost = vect_get_stmt_cost (vec_promote_demote); > + for (i = 0; i < pwr + 1; i++) > + { > + tmp = (STMT_VINFO_TYPE (stmt_info) == type_promotion_vec_info_type) ? > + (i + 1) : i; > + inside_cost += vect_pow2 (tmp) * single_stmt_cost; > + } > + > + /* FORNOW: Assuming maximum 2 args per stmts. */ > + for (i = 0; i < 2; i++) > + { > + if (dt[i] == vect_constant_def || dt[i] == vect_external_def) > + outside_cost += vect_get_stmt_cost (vector_stmt); > + } > + > + if (vect_print_dump_info (REPORT_COST)) > + fprintf (vect_dump, "vect_model_promotion_demotion_cost: inside_cost = %d, " > + "outside_cost = %d .", inside_cost, outside_cost); > + > + /* Set the costs in STMT_INFO. */ > + stmt_vinfo_set_inside_of_loop_cost (stmt_info, NULL, inside_cost); > + stmt_vinfo_set_outside_of_loop_cost (stmt_info, NULL, outside_cost); > +} > + > /* Function vect_cost_strided_group_size > > For strided load or store, return the group_size only if it is the first > @@ -887,7 +927,6 @@ vect_model_store_cost (stmt_vec_info stmt_info, in > if (vect_print_dump_info (REPORT_COST)) > fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .", > group_size); > - > } > > /* Costs of the stores. */ > @@ -1049,7 +1088,7 @@ vect_get_load_cost (struct data_reference *dr, int > case dr_explicit_realign: > { > *inside_cost += ncopies * (2 * vect_get_stmt_cost (vector_load) > - + vect_get_stmt_cost (vector_stmt)); > + + vect_get_stmt_cost (vec_perm)); > > /* FIXME: If the misalignment remains fixed across the iterations of > the containing loop, the following cost should be added to the > @@ -1057,6 +1096,9 @@ vect_get_load_cost (struct data_reference *dr, int > if (targetm.vectorize.builtin_mask_for_load) > *inside_cost += vect_get_stmt_cost (vector_stmt); > > + if (vect_print_dump_info (REPORT_COST)) > + fprintf (vect_dump, "vect_model_load_cost: explicit realign"); > + > break; > } > case dr_explicit_realign_optimized: > @@ -1080,7 +1122,12 @@ vect_get_load_cost (struct data_reference *dr, int > } > > *inside_cost += ncopies * (vect_get_stmt_cost (vector_load) > - + vect_get_stmt_cost (vector_stmt)); > + + vect_get_stmt_cost (vec_perm)); > + > + if (vect_print_dump_info (REPORT_COST)) > + fprintf (vect_dump, > + "vect_model_load_cost: explicit realign optimized"); > + > break; > } > > @@ -2392,16 +2439,19 @@ vectorizable_conversion (gimple stmt, gimple_stmt_ > if (vect_print_dump_info (REPORT_DETAILS)) > fprintf (vect_dump, "=== vectorizable_conversion ==="); > if (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR) > - STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type; > + { > + STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type; > + vect_model_simple_cost (stmt_info, ncopies, dt, NULL); > + } > else if (modifier == NARROW) > { > STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type; > - vect_model_simple_cost (stmt_info, ncopies, dt, NULL); > + vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt); > } > else > { > STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type; > - vect_model_simple_cost (stmt_info, 2 * ncopies, dt, NULL); > + vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt); > } > VEC_free (tree, heap, interm_types); > return true; > Index: gcc/config/spu/spu.c > =================================================================== > --- gcc/config/spu/spu.c (revision 183944) > +++ gcc/config/spu/spu.c (working copy) > @@ -6920,6 +6920,7 @@ spu_builtin_vectorization_cost (enum vect_cost_for > case scalar_to_vec: > case cond_branch_not_taken: > case vec_perm: > + case vec_promote_demote: > return 1; > > case scalar_store: > Index: gcc/config/i386/i386.c > =================================================================== > --- gcc/config/i386/i386.c (revision 183944) > +++ gcc/config/i386/i386.c (working copy) > @@ -35336,6 +35336,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo > return ix86_cost->cond_not_taken_branch_cost; > > case vec_perm: > + case vec_promote_demote: > return ix86_cost->vec_stmt_cost; > > default: > Index: gcc/config/rs6000/rs6000.c > =================================================================== > --- gcc/config/rs6000/rs6000.c (revision 183944) > +++ gcc/config/rs6000/rs6000.c (working copy) > @@ -3543,10 +3543,17 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ > return 1; > > case vec_perm: > - if (!TARGET_VSX) > + if (TARGET_VSX) > + return 4; > + else > return 1; > - return 2; > > + case vec_promote_demote: > + if (TARGET_VSX) > + return 5; > + else > + return 1; > + > case cond_branch_taken: > return 3; > > >
Richard, thanks. I can answer most of your questions, but for the last one I will have to ask Ira to weigh in. On Fri, 2012-02-10 at 13:06 +0100, Richard Guenther wrote: > On Thu, Feb 9, 2012 at 5:56 PM, William J. Schmidt > <wschmidt@linux.vnet.ibm.com> wrote: > > Following is a revision of yesterday's PR50031 patch submission, > > modified per Richard's comments. Bootstrapped and tested with no > > regressions on powerpc64-linux. I've confirmed the same performance > > improvements in SPEC. OK for trunk? > > Some more questions - maybe Jakub can clarify as well given he worked > on patterns recently ... > > > Thanks, > > Bill > > > > > > 2012-02-09 Bill Schmidt <wschmidt@linux.vnet.ibm.com> > > Ira Rosen <irar@il.ibm.com> > > > > PR tree-optimization/50031 > > * targhooks.c (default_builtin_vectorization_cost): Handle > > vec_promote_demote. > > * target.h (enum vect_cost_for_stmt): Add vec_promote_demote. > > * tree-vect-loop.c (vect_get_single_scalar_iteraion_cost): Handle > > all types of reduction and pattern statements. > > (vect_estimate_min_profitable_iters): Likewise. > > * tree-vect-stmts.c (vect_model_promotion_demotion_cost): New function. > > (vect_get_load_cost): Use vec_perm for permutations; add dump logic > > for explicit realigns. > > (vectorizable_conversion): Call vect_model_promotion_demotion_cost. > > * config/spu/spu.c (spu_builtin_vectorization_cost): Handle > > vec_promote_demote. > > * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise. > > * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): Update > > vec_perm for VSX and handle vec_promote_demote. > > > > > > Index: gcc/targhooks.c > > =================================================================== > > --- gcc/targhooks.c (revision 183944) > > +++ gcc/targhooks.c (working copy) > > @@ -514,6 +514,7 @@ default_builtin_vectorization_cost (enum vect_cost > > case scalar_to_vec: > > case cond_branch_not_taken: > > case vec_perm: > > + case vec_promote_demote: > > return 1; > > > > case unaligned_load: > > Index: gcc/target.h > > =================================================================== > > --- gcc/target.h (revision 183944) > > +++ gcc/target.h (working copy) > > @@ -145,7 +145,8 @@ enum vect_cost_for_stmt > > scalar_to_vec, > > cond_branch_not_taken, > > cond_branch_taken, > > - vec_perm > > + vec_perm, > > + vec_promote_demote > > }; > > > > /* The target structure. This holds all the backend hooks. */ > > Index: gcc/tree-vect-loop.c > > =================================================================== > > --- gcc/tree-vect-loop.c (revision 183944) > > +++ gcc/tree-vect-loop.c (working copy) > > @@ -2417,7 +2417,8 @@ vect_get_single_scalar_iteraion_cost (loop_vec_inf > > if (stmt_info > > && !STMT_VINFO_RELEVANT_P (stmt_info) > > && (!STMT_VINFO_LIVE_P (stmt_info) > > - || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) > > + || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) > > + && !STMT_VINFO_IN_PATTERN_P (stmt_info)) > > continue; > > Why would we exclude !relevant stmts when they are part of a pattern? > We are looking at a scalar iteration cost, so all stmts that are not "dead" > count, no? As I understand it, we're at a point where a statement replacing the pattern exists but has not yet been inserted in the code stream. All of the statements in the pattern are marked irrelevant, but the related statement of the main (last) statement of the pattern is relevant. Thus we need to allow the main statement through this check so the replacement statement can be found and counted. > > > > > if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))) > > @@ -2564,15 +2565,48 @@ vect_estimate_min_profitable_iters (loop_vec_info > > { > > gimple stmt = gsi_stmt (si); > > stmt_vec_info stmt_info = vinfo_for_stmt (stmt); > > + > > + /* Translate the last statement in a pattern into the > > + related replacement statement. */ > > + if (STMT_VINFO_IN_PATTERN_P (stmt_info)) > > + { > > + stmt = STMT_VINFO_RELATED_STMT (stmt_info); > > + stmt_info = vinfo_for_stmt (stmt); > > + } > > So here we are tanslating stmt to the "main" scalar pattern stmt - and thus > count it as many times as we have stmts in that pattern? That looks wrong. > More like > > if (STMT_VINFO_IN_PATTERN_P (stmt_info) > && STMT_VINFO_RELATED_STMT (stmt_info) != stmt) > continue; > > ? Does the main stmt has the flag set and points to itself? From the commentary at the end of tree-vect-patterns.c, only the main statement in the pattern (the last one) is flagged as STMT_VINFO_IN_PATTERN_P. So this is finding the new replacement statement which has been created but not yet inserted in the code. It only gets counted once. > > > /* Skip stmts that are not vectorized inside the loop. */ > > if (!STMT_VINFO_RELEVANT_P (stmt_info) > > && (!STMT_VINFO_LIVE_P (stmt_info) > > - || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) > > + || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))) > > continue; > > + > > ... and then, what does STMT_VINFO_INSIDE_OF_LOOP_COST of > that "main" pattern stmt represent? Shouldn't it represent the cost > of the whole sequence, and thus ... > > > vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor; > > /* FIXME: for stmts in the inner-loop in outer-loop vectorization, > > some of the "outside" costs are generated inside the outer-loop. */ > > vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info); > > + if (is_pattern_stmt_p (stmt_info) > > + && STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)) > > + { > > + gimple_stmt_iterator gsi; > > The following is excessive? Especially as you probably count the "main" > stmt twice? Thus, if the main stmt cost does not cover the sequence > then you should skip adding its cost above? This is what I believe is going on here. Note that for the main pattern statement, we have converted it so that stmt_info now points to its related (replacement) statement. It is apparently possible for this replacement statement to also be recognized as part of a new pattern (I've seen references to such possibilities elsewhere in the vectorizer code). When that happens, this code is counting the costs of that pattern. Ira, can you please weigh in on this part? I'm not 100% certain of my explanation. In any case, it's clear we need more comments in the code. :) Note that STMT_VINFO_PATTERN_DEF_SEQ doesn't exist in the 4.6 branch, so this section has to be omitted if we backport it (which is desirable since the degradation was introduced in 4.6). Removing it apparently does not affect the sphinx3 degradation. > > Thanks, > Richard. > > > + for (gsi = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)); > > + !gsi_end_p (gsi); gsi_next (&gsi)) > > + { > > + gimple pattern_def_stmt = gsi_stmt (gsi); > > + stmt_vec_info pattern_def_stmt_info > > + = vinfo_for_stmt (pattern_def_stmt); > > + if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info) > > + || STMT_VINFO_LIVE_P (pattern_def_stmt_info)) > > + { > > + vec_inside_cost > > + += STMT_VINFO_INSIDE_OF_LOOP_COST > > + (pattern_def_stmt_info) * factor; > > + vec_outside_cost > > + += STMT_VINFO_OUTSIDE_OF_LOOP_COST > > + (pattern_def_stmt_info); > > + } > > + } > > + } > > } > > } > > > > Index: gcc/tree-vect-stmts.c > > =================================================================== > > --- gcc/tree-vect-stmts.c (revision 183944) > > +++ gcc/tree-vect-stmts.c (working copy) > > @@ -811,6 +811,46 @@ vect_model_simple_cost (stmt_vec_info stmt_info, i > > } > > > > > > +/* Model cost for type demotion and promotion operations. PWR is normally > > + zero for single-step promotions and demotions. It will be one if > > + two-step promotion/demotion is required, and so on. Each additional > > + step doubles the number of instructions required. */ > > + > > +static void > > +vect_model_promotion_demotion_cost (stmt_vec_info stmt_info, > > + enum vect_def_type *dt, int pwr) > > +{ > > + int i, tmp; > > + int inside_cost = 0, outside_cost = 0, single_stmt_cost; > > + > > + /* The SLP costs were already calculated during SLP tree build. */ > > + if (PURE_SLP_STMT (stmt_info)) > > + return; > > + > > + single_stmt_cost = vect_get_stmt_cost (vec_promote_demote); > > + for (i = 0; i < pwr + 1; i++) > > + { > > + tmp = (STMT_VINFO_TYPE (stmt_info) == type_promotion_vec_info_type) ? > > + (i + 1) : i; > > + inside_cost += vect_pow2 (tmp) * single_stmt_cost; > > + } > > + > > + /* FORNOW: Assuming maximum 2 args per stmts. */ > > + for (i = 0; i < 2; i++) > > + { > > + if (dt[i] == vect_constant_def || dt[i] == vect_external_def) > > + outside_cost += vect_get_stmt_cost (vector_stmt); > > + } > > + > > + if (vect_print_dump_info (REPORT_COST)) > > + fprintf (vect_dump, "vect_model_promotion_demotion_cost: inside_cost = %d, " > > + "outside_cost = %d .", inside_cost, outside_cost); > > + > > + /* Set the costs in STMT_INFO. */ > > + stmt_vinfo_set_inside_of_loop_cost (stmt_info, NULL, inside_cost); > > + stmt_vinfo_set_outside_of_loop_cost (stmt_info, NULL, outside_cost); > > +} > > + > > /* Function vect_cost_strided_group_size > > > > For strided load or store, return the group_size only if it is the first > > @@ -887,7 +927,6 @@ vect_model_store_cost (stmt_vec_info stmt_info, in > > if (vect_print_dump_info (REPORT_COST)) > > fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .", > > group_size); > > - > > } > > > > /* Costs of the stores. */ > > @@ -1049,7 +1088,7 @@ vect_get_load_cost (struct data_reference *dr, int > > case dr_explicit_realign: > > { > > *inside_cost += ncopies * (2 * vect_get_stmt_cost (vector_load) > > - + vect_get_stmt_cost (vector_stmt)); > > + + vect_get_stmt_cost (vec_perm)); > > > > /* FIXME: If the misalignment remains fixed across the iterations of > > the containing loop, the following cost should be added to the > > @@ -1057,6 +1096,9 @@ vect_get_load_cost (struct data_reference *dr, int > > if (targetm.vectorize.builtin_mask_for_load) > > *inside_cost += vect_get_stmt_cost (vector_stmt); > > > > + if (vect_print_dump_info (REPORT_COST)) > > + fprintf (vect_dump, "vect_model_load_cost: explicit realign"); > > + > > break; > > } > > case dr_explicit_realign_optimized: > > @@ -1080,7 +1122,12 @@ vect_get_load_cost (struct data_reference *dr, int > > } > > > > *inside_cost += ncopies * (vect_get_stmt_cost (vector_load) > > - + vect_get_stmt_cost (vector_stmt)); > > + + vect_get_stmt_cost (vec_perm)); > > + > > + if (vect_print_dump_info (REPORT_COST)) > > + fprintf (vect_dump, > > + "vect_model_load_cost: explicit realign optimized"); > > + > > break; > > } > > > > @@ -2392,16 +2439,19 @@ vectorizable_conversion (gimple stmt, gimple_stmt_ > > if (vect_print_dump_info (REPORT_DETAILS)) > > fprintf (vect_dump, "=== vectorizable_conversion ==="); > > if (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR) > > - STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type; > > + { > > + STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type; > > + vect_model_simple_cost (stmt_info, ncopies, dt, NULL); > > + } > > else if (modifier == NARROW) > > { > > STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type; > > - vect_model_simple_cost (stmt_info, ncopies, dt, NULL); > > + vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt); > > } > > else > > { > > STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type; > > - vect_model_simple_cost (stmt_info, 2 * ncopies, dt, NULL); > > + vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt); > > } > > VEC_free (tree, heap, interm_types); > > return true; > > Index: gcc/config/spu/spu.c > > =================================================================== > > --- gcc/config/spu/spu.c (revision 183944) > > +++ gcc/config/spu/spu.c (working copy) > > @@ -6920,6 +6920,7 @@ spu_builtin_vectorization_cost (enum vect_cost_for > > case scalar_to_vec: > > case cond_branch_not_taken: > > case vec_perm: > > + case vec_promote_demote: > > return 1; > > > > case scalar_store: > > Index: gcc/config/i386/i386.c > > =================================================================== > > --- gcc/config/i386/i386.c (revision 183944) > > +++ gcc/config/i386/i386.c (working copy) > > @@ -35336,6 +35336,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo > > return ix86_cost->cond_not_taken_branch_cost; > > > > case vec_perm: > > + case vec_promote_demote: > > return ix86_cost->vec_stmt_cost; > > > > default: > > Index: gcc/config/rs6000/rs6000.c > > =================================================================== > > --- gcc/config/rs6000/rs6000.c (revision 183944) > > +++ gcc/config/rs6000/rs6000.c (working copy) > > @@ -3543,10 +3543,17 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ > > return 1; > > > > case vec_perm: > > - if (!TARGET_VSX) > > + if (TARGET_VSX) > > + return 4; > > + else > > return 1; > > - return 2; > > > > + case vec_promote_demote: > > + if (TARGET_VSX) > > + return 5; > > + else > > + return 1; > > + > > case cond_branch_taken: > > return 3; > > > > > > >
On Fri, 2012-02-10 at 07:31 -0600, William J. Schmidt wrote: > Richard, thanks. I can answer most of your questions, but for the last > one I will have to ask Ira to weigh in. > > On Fri, 2012-02-10 at 13:06 +0100, Richard Guenther wrote: > > On Thu, Feb 9, 2012 at 5:56 PM, William J. Schmidt > > <wschmidt@linux.vnet.ibm.com> wrote: > > > Following is a revision of yesterday's PR50031 patch submission, > > > modified per Richard's comments. Bootstrapped and tested with no > > > regressions on powerpc64-linux. I've confirmed the same performance > > > improvements in SPEC. OK for trunk? > > > > Some more questions - maybe Jakub can clarify as well given he worked > > on patterns recently ... > > > > > Thanks, > > > Bill > > > > > > > > > 2012-02-09 Bill Schmidt <wschmidt@linux.vnet.ibm.com> > > > Ira Rosen <irar@il.ibm.com> > > > > > > PR tree-optimization/50031 > > > * targhooks.c (default_builtin_vectorization_cost): Handle > > > vec_promote_demote. > > > * target.h (enum vect_cost_for_stmt): Add vec_promote_demote. > > > * tree-vect-loop.c (vect_get_single_scalar_iteraion_cost): Handle > > > all types of reduction and pattern statements. > > > (vect_estimate_min_profitable_iters): Likewise. > > > * tree-vect-stmts.c (vect_model_promotion_demotion_cost): New function. > > > (vect_get_load_cost): Use vec_perm for permutations; add dump logic > > > for explicit realigns. > > > (vectorizable_conversion): Call vect_model_promotion_demotion_cost. > > > * config/spu/spu.c (spu_builtin_vectorization_cost): Handle > > > vec_promote_demote. > > > * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise. > > > * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): Update > > > vec_perm for VSX and handle vec_promote_demote. > > > > > > > > > Index: gcc/targhooks.c > > > =================================================================== > > > --- gcc/targhooks.c (revision 183944) > > > +++ gcc/targhooks.c (working copy) > > > @@ -514,6 +514,7 @@ default_builtin_vectorization_cost (enum vect_cost > > > case scalar_to_vec: > > > case cond_branch_not_taken: > > > case vec_perm: > > > + case vec_promote_demote: > > > return 1; > > > > > > case unaligned_load: > > > Index: gcc/target.h > > > =================================================================== > > > --- gcc/target.h (revision 183944) > > > +++ gcc/target.h (working copy) > > > @@ -145,7 +145,8 @@ enum vect_cost_for_stmt > > > scalar_to_vec, > > > cond_branch_not_taken, > > > cond_branch_taken, > > > - vec_perm > > > + vec_perm, > > > + vec_promote_demote > > > }; > > > > > > /* The target structure. This holds all the backend hooks. */ > > > Index: gcc/tree-vect-loop.c > > > =================================================================== > > > --- gcc/tree-vect-loop.c (revision 183944) > > > +++ gcc/tree-vect-loop.c (working copy) > > > @@ -2417,7 +2417,8 @@ vect_get_single_scalar_iteraion_cost (loop_vec_inf > > > if (stmt_info > > > && !STMT_VINFO_RELEVANT_P (stmt_info) > > > && (!STMT_VINFO_LIVE_P (stmt_info) > > > - || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) > > > + || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) > > > + && !STMT_VINFO_IN_PATTERN_P (stmt_info)) > > > continue; > > > > Why would we exclude !relevant stmts when they are part of a pattern? > > We are looking at a scalar iteration cost, so all stmts that are not "dead" > > count, no? > > As I understand it, we're at a point where a statement replacing the > pattern exists but has not yet been inserted in the code stream. All of > the statements in the pattern are marked irrelevant, but the related > statement of the main (last) statement of the pattern is relevant. Thus > we need to allow the main statement through this check so the > replacement statement can be found and counted. > > > > > > > > > if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))) > > > @@ -2564,15 +2565,48 @@ vect_estimate_min_profitable_iters (loop_vec_info > > > { > > > gimple stmt = gsi_stmt (si); > > > stmt_vec_info stmt_info = vinfo_for_stmt (stmt); > > > + > > > + /* Translate the last statement in a pattern into the > > > + related replacement statement. */ > > > + if (STMT_VINFO_IN_PATTERN_P (stmt_info)) > > > + { > > > + stmt = STMT_VINFO_RELATED_STMT (stmt_info); > > > + stmt_info = vinfo_for_stmt (stmt); > > > + } > > > > So here we are tanslating stmt to the "main" scalar pattern stmt - and thus > > count it as many times as we have stmts in that pattern? That looks wrong. > > More like > > > > if (STMT_VINFO_IN_PATTERN_P (stmt_info) > > && STMT_VINFO_RELATED_STMT (stmt_info) != stmt) > > continue; > > > > ? Does the main stmt has the flag set and points to itself? > > From the commentary at the end of tree-vect-patterns.c, only the main > statement in the pattern (the last one) is flagged as > STMT_VINFO_IN_PATTERN_P. So this is finding the new replacement > statement which has been created but not yet inserted in the code. It > only gets counted once. > > > > > > /* Skip stmts that are not vectorized inside the loop. */ > > > if (!STMT_VINFO_RELEVANT_P (stmt_info) > > > && (!STMT_VINFO_LIVE_P (stmt_info) > > > - || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) > > > + || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))) > > > continue; > > > + > > > > ... and then, what does STMT_VINFO_INSIDE_OF_LOOP_COST of > > that "main" pattern stmt represent? Shouldn't it represent the cost > > of the whole sequence, and thus ... > > > > > vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor; > > > /* FIXME: for stmts in the inner-loop in outer-loop vectorization, > > > some of the "outside" costs are generated inside the outer-loop. */ > > > vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info); > > > + if (is_pattern_stmt_p (stmt_info) > > > + && STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)) > > > + { > > > + gimple_stmt_iterator gsi; > > > > The following is excessive? Especially as you probably count the "main" > > stmt twice? Thus, if the main stmt cost does not cover the sequence > > then you should skip adding its cost above? > > This is what I believe is going on here. Note that for the main pattern > statement, we have converted it so that stmt_info now points to its > related (replacement) statement. It is apparently possible for this > replacement statement to also be recognized as part of a new pattern > (I've seen references to such possibilities elsewhere in the vectorizer > code). When that happens, this code is counting the costs of that > pattern. > > Ira, can you please weigh in on this part? I'm not 100% certain of my > explanation. Also, I forgot to mention that I think Richard is right that the replacement statement may be getting counted twice in this scenario. > > In any case, it's clear we need more comments in the code. :) > > Note that STMT_VINFO_PATTERN_DEF_SEQ doesn't exist in the 4.6 branch, so > this section has to be omitted if we backport it (which is desirable > since the degradation was introduced in 4.6). Removing it apparently > does not affect the sphinx3 degradation. > > > > > Thanks, > > Richard. > > > > > + for (gsi = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)); > > > + !gsi_end_p (gsi); gsi_next (&gsi)) > > > + { > > > + gimple pattern_def_stmt = gsi_stmt (gsi); > > > + stmt_vec_info pattern_def_stmt_info > > > + = vinfo_for_stmt (pattern_def_stmt); > > > + if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info) > > > + || STMT_VINFO_LIVE_P (pattern_def_stmt_info)) > > > + { > > > + vec_inside_cost > > > + += STMT_VINFO_INSIDE_OF_LOOP_COST > > > + (pattern_def_stmt_info) * factor; > > > + vec_outside_cost > > > + += STMT_VINFO_OUTSIDE_OF_LOOP_COST > > > + (pattern_def_stmt_info); > > > + } > > > + } > > > + } > > > } > > > } > > > > > > Index: gcc/tree-vect-stmts.c > > > =================================================================== > > > --- gcc/tree-vect-stmts.c (revision 183944) > > > +++ gcc/tree-vect-stmts.c (working copy) > > > @@ -811,6 +811,46 @@ vect_model_simple_cost (stmt_vec_info stmt_info, i > > > } > > > > > > > > > +/* Model cost for type demotion and promotion operations. PWR is normally > > > + zero for single-step promotions and demotions. It will be one if > > > + two-step promotion/demotion is required, and so on. Each additional > > > + step doubles the number of instructions required. */ > > > + > > > +static void > > > +vect_model_promotion_demotion_cost (stmt_vec_info stmt_info, > > > + enum vect_def_type *dt, int pwr) > > > +{ > > > + int i, tmp; > > > + int inside_cost = 0, outside_cost = 0, single_stmt_cost; > > > + > > > + /* The SLP costs were already calculated during SLP tree build. */ > > > + if (PURE_SLP_STMT (stmt_info)) > > > + return; > > > + > > > + single_stmt_cost = vect_get_stmt_cost (vec_promote_demote); > > > + for (i = 0; i < pwr + 1; i++) > > > + { > > > + tmp = (STMT_VINFO_TYPE (stmt_info) == type_promotion_vec_info_type) ? > > > + (i + 1) : i; > > > + inside_cost += vect_pow2 (tmp) * single_stmt_cost; > > > + } > > > + > > > + /* FORNOW: Assuming maximum 2 args per stmts. */ > > > + for (i = 0; i < 2; i++) > > > + { > > > + if (dt[i] == vect_constant_def || dt[i] == vect_external_def) > > > + outside_cost += vect_get_stmt_cost (vector_stmt); > > > + } > > > + > > > + if (vect_print_dump_info (REPORT_COST)) > > > + fprintf (vect_dump, "vect_model_promotion_demotion_cost: inside_cost = %d, " > > > + "outside_cost = %d .", inside_cost, outside_cost); > > > + > > > + /* Set the costs in STMT_INFO. */ > > > + stmt_vinfo_set_inside_of_loop_cost (stmt_info, NULL, inside_cost); > > > + stmt_vinfo_set_outside_of_loop_cost (stmt_info, NULL, outside_cost); > > > +} > > > + > > > /* Function vect_cost_strided_group_size > > > > > > For strided load or store, return the group_size only if it is the first > > > @@ -887,7 +927,6 @@ vect_model_store_cost (stmt_vec_info stmt_info, in > > > if (vect_print_dump_info (REPORT_COST)) > > > fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .", > > > group_size); > > > - > > > } > > > > > > /* Costs of the stores. */ > > > @@ -1049,7 +1088,7 @@ vect_get_load_cost (struct data_reference *dr, int > > > case dr_explicit_realign: > > > { > > > *inside_cost += ncopies * (2 * vect_get_stmt_cost (vector_load) > > > - + vect_get_stmt_cost (vector_stmt)); > > > + + vect_get_stmt_cost (vec_perm)); > > > > > > /* FIXME: If the misalignment remains fixed across the iterations of > > > the containing loop, the following cost should be added to the > > > @@ -1057,6 +1096,9 @@ vect_get_load_cost (struct data_reference *dr, int > > > if (targetm.vectorize.builtin_mask_for_load) > > > *inside_cost += vect_get_stmt_cost (vector_stmt); > > > > > > + if (vect_print_dump_info (REPORT_COST)) > > > + fprintf (vect_dump, "vect_model_load_cost: explicit realign"); > > > + > > > break; > > > } > > > case dr_explicit_realign_optimized: > > > @@ -1080,7 +1122,12 @@ vect_get_load_cost (struct data_reference *dr, int > > > } > > > > > > *inside_cost += ncopies * (vect_get_stmt_cost (vector_load) > > > - + vect_get_stmt_cost (vector_stmt)); > > > + + vect_get_stmt_cost (vec_perm)); > > > + > > > + if (vect_print_dump_info (REPORT_COST)) > > > + fprintf (vect_dump, > > > + "vect_model_load_cost: explicit realign optimized"); > > > + > > > break; > > > } > > > > > > @@ -2392,16 +2439,19 @@ vectorizable_conversion (gimple stmt, gimple_stmt_ > > > if (vect_print_dump_info (REPORT_DETAILS)) > > > fprintf (vect_dump, "=== vectorizable_conversion ==="); > > > if (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR) > > > - STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type; > > > + { > > > + STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type; > > > + vect_model_simple_cost (stmt_info, ncopies, dt, NULL); > > > + } > > > else if (modifier == NARROW) > > > { > > > STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type; > > > - vect_model_simple_cost (stmt_info, ncopies, dt, NULL); > > > + vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt); > > > } > > > else > > > { > > > STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type; > > > - vect_model_simple_cost (stmt_info, 2 * ncopies, dt, NULL); > > > + vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt); > > > } > > > VEC_free (tree, heap, interm_types); > > > return true; > > > Index: gcc/config/spu/spu.c > > > =================================================================== > > > --- gcc/config/spu/spu.c (revision 183944) > > > +++ gcc/config/spu/spu.c (working copy) > > > @@ -6920,6 +6920,7 @@ spu_builtin_vectorization_cost (enum vect_cost_for > > > case scalar_to_vec: > > > case cond_branch_not_taken: > > > case vec_perm: > > > + case vec_promote_demote: > > > return 1; > > > > > > case scalar_store: > > > Index: gcc/config/i386/i386.c > > > =================================================================== > > > --- gcc/config/i386/i386.c (revision 183944) > > > +++ gcc/config/i386/i386.c (working copy) > > > @@ -35336,6 +35336,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo > > > return ix86_cost->cond_not_taken_branch_cost; > > > > > > case vec_perm: > > > + case vec_promote_demote: > > > return ix86_cost->vec_stmt_cost; > > > > > > default: > > > Index: gcc/config/rs6000/rs6000.c > > > =================================================================== > > > --- gcc/config/rs6000/rs6000.c (revision 183944) > > > +++ gcc/config/rs6000/rs6000.c (working copy) > > > @@ -3543,10 +3543,17 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ > > > return 1; > > > > > > case vec_perm: > > > - if (!TARGET_VSX) > > > + if (TARGET_VSX) > > > + return 4; > > > + else > > > return 1; > > > - return 2; > > > > > > + case vec_promote_demote: > > > + if (TARGET_VSX) > > > + return 5; > > > + else > > > + return 1; > > > + > > > case cond_branch_taken: > > > return 3; > > > > > > > > > > >
On Fri, Feb 10, 2012 at 07:31:01AM -0600, William J. Schmidt wrote: > >From the commentary at the end of tree-vect-patterns.c, only the main > statement in the pattern (the last one) is flagged as > STMT_VINFO_IN_PATTERN_P. So this is finding the new replacement > statement which has been created but not yet inserted in the code. It > only gets counted once. STMT_VINFO_IN_PATTERN_P is set on the vinfo of the original stmt (and not just the last, but all original stmts that are being replaced by the pattern). Each of these has STMT_VINFO_RELATED_STMT set to a pattern stmt (last one corresponding to that original stmt). If needed, further pattern stmts for that original stmts are put into STMT_VINFO_PATTERN_DEF_SEQ. The pattern matcher could e.g. match 3 original stmts, and stick a single pattern stmt into STMT_VINFO_RELATED_STMT on the first original stmt, then say 2 pattern stmts into STMT_VINFO_PATTERN_DEF_SEQ of the second original stmt plus one STMT_VINFO_RELATED_STMT and finally one pattern stmt through STMT_VINFO_RELATED_STMT on the third original stmt. > Note that STMT_VINFO_PATTERN_DEF_SEQ doesn't exist in the 4.6 branch, so > this section has to be omitted if we backport it (which is desirable > since the degradation was introduced in 4.6). Removing it apparently > does not affect the sphinx3 degradation. Yeah, when backporting you can basically assume that STMT_VINFO_PATTERN_DEF_SEQ is always NULL in 4.6 and just write NULL instead of itt (and simplify), because no pattern recognizers needed more than one pattern stmt for each original stmt back then. Jakub
Jakub, thanks! Based on this, I believe the patch is correct in its handling of the STMT_VINFO_PATTERN_DEF_SEQ logic, without any double counting. I misinterpreted what the commentary for vect_pattern_recog was saying: I thought that all replacements were hung off of the last pattern statement, but this was just true for the particular example, where only one replacement statement was created. Sorry for any confusion! Based on the discussion, is the patch OK for trunk? Thanks, Bill On Fri, 2012-02-10 at 14:44 +0100, Jakub Jelinek wrote: > On Fri, Feb 10, 2012 at 07:31:01AM -0600, William J. Schmidt wrote: > > >From the commentary at the end of tree-vect-patterns.c, only the main > > statement in the pattern (the last one) is flagged as > > STMT_VINFO_IN_PATTERN_P. So this is finding the new replacement > > statement which has been created but not yet inserted in the code. It > > only gets counted once. > > STMT_VINFO_IN_PATTERN_P is set on the vinfo of the original stmt (and not > just the last, but all original stmts that are being replaced by the > pattern). Each of these has STMT_VINFO_RELATED_STMT set to a pattern stmt > (last one corresponding to that original stmt). If needed, further > pattern stmts for that original stmts are put into > STMT_VINFO_PATTERN_DEF_SEQ. The pattern matcher could e.g. match > 3 original stmts, and stick a single pattern stmt into STMT_VINFO_RELATED_STMT > on the first original stmt, then say 2 pattern stmts into > STMT_VINFO_PATTERN_DEF_SEQ of the second original stmt plus one > STMT_VINFO_RELATED_STMT and finally one pattern stmt through > STMT_VINFO_RELATED_STMT on the third original stmt. > > > Note that STMT_VINFO_PATTERN_DEF_SEQ doesn't exist in the 4.6 branch, so > > this section has to be omitted if we backport it (which is desirable > > since the degradation was introduced in 4.6). Removing it apparently > > does not affect the sphinx3 degradation. > > Yeah, when backporting you can basically assume that > STMT_VINFO_PATTERN_DEF_SEQ is always NULL in 4.6 and just write NULL instead > of itt (and simplify), because no pattern recognizers needed more than one > pattern stmt for each original stmt back then. > > Jakub >
On Fri, Feb 10, 2012 at 3:39 PM, William J. Schmidt <wschmidt@linux.vnet.ibm.com> wrote: > Jakub, thanks! Based on this, I believe the patch is correct in its > handling of the STMT_VINFO_PATTERN_DEF_SEQ logic, without any double > counting. > > I misinterpreted what the commentary for vect_pattern_recog was saying: > I thought that all replacements were hung off of the last pattern > statement, but this was just true for the particular example, where only > one replacement statement was created. Sorry for any confusion! > > Based on the discussion, is the patch OK for trunk? But you still count for each of the matched scalar stmts the cost of the whole pattern sequence. No? Richard. > Thanks, > Bill > > On Fri, 2012-02-10 at 14:44 +0100, Jakub Jelinek wrote: >> On Fri, Feb 10, 2012 at 07:31:01AM -0600, William J. Schmidt wrote: >> > >From the commentary at the end of tree-vect-patterns.c, only the main >> > statement in the pattern (the last one) is flagged as >> > STMT_VINFO_IN_PATTERN_P. So this is finding the new replacement >> > statement which has been created but not yet inserted in the code. It >> > only gets counted once. >> >> STMT_VINFO_IN_PATTERN_P is set on the vinfo of the original stmt (and not >> just the last, but all original stmts that are being replaced by the >> pattern). Each of these has STMT_VINFO_RELATED_STMT set to a pattern stmt >> (last one corresponding to that original stmt). If needed, further >> pattern stmts for that original stmts are put into >> STMT_VINFO_PATTERN_DEF_SEQ. The pattern matcher could e.g. match >> 3 original stmts, and stick a single pattern stmt into STMT_VINFO_RELATED_STMT >> on the first original stmt, then say 2 pattern stmts into >> STMT_VINFO_PATTERN_DEF_SEQ of the second original stmt plus one >> STMT_VINFO_RELATED_STMT and finally one pattern stmt through >> STMT_VINFO_RELATED_STMT on the third original stmt. >> >> > Note that STMT_VINFO_PATTERN_DEF_SEQ doesn't exist in the 4.6 branch, so >> > this section has to be omitted if we backport it (which is desirable >> > since the degradation was introduced in 4.6). Removing it apparently >> > does not affect the sphinx3 degradation. >> >> Yeah, when backporting you can basically assume that >> STMT_VINFO_PATTERN_DEF_SEQ is always NULL in 4.6 and just write NULL instead >> of itt (and simplify), because no pattern recognizers needed more than one >> pattern stmt for each original stmt back then. >> >> Jakub >> >
On Fri, 2012-02-10 at 16:22 +0100, Richard Guenther wrote: > On Fri, Feb 10, 2012 at 3:39 PM, William J. Schmidt > <wschmidt@linux.vnet.ibm.com> wrote: > > Jakub, thanks! Based on this, I believe the patch is correct in its > > handling of the STMT_VINFO_PATTERN_DEF_SEQ logic, without any double > > counting. > > > > I misinterpreted what the commentary for vect_pattern_recog was saying: > > I thought that all replacements were hung off of the last pattern > > statement, but this was just true for the particular example, where only > > one replacement statement was created. Sorry for any confusion! > > > > Based on the discussion, is the patch OK for trunk? > > But you still count for each of the matched scalar stmts the cost of the > whole pattern sequence. No? I need to change the commentary to make this more clear now. My comment: "Translate the last statement..." is incorrect; this is done for each statement in a pattern. Per Jakub's explanation, the replacement statements are distributed over the original pattern statements. Visiting STMT_VINFO_RELATED_STMT for a statement marked STMT_VINFO_IN_PATTERN_P will find zero or one replacement statements that should be examined. Visiting STMT_VINFO_PATTERN_DEF_SEQ may pick up some leftover replacement statements that didn't fit in the 1-1 mapping. The point is that all of these related statements and pattern-def-seq statements are disjoint, and Ira's logic is ensuring that they all get examined once. It's not a very clean way to represent the replacement of a pattern -- not how you'd do it if designing from scratch -- but I guess I can see how it got this way when the STMT_VINFO_PATTERN_DEF_SEQ was glued onto the existing 1-1 mapping. Bill > > Richard. > > > Thanks, > > Bill > > > > On Fri, 2012-02-10 at 14:44 +0100, Jakub Jelinek wrote: > >> On Fri, Feb 10, 2012 at 07:31:01AM -0600, William J. Schmidt wrote: > >> > >From the commentary at the end of tree-vect-patterns.c, only the main > >> > statement in the pattern (the last one) is flagged as > >> > STMT_VINFO_IN_PATTERN_P. So this is finding the new replacement > >> > statement which has been created but not yet inserted in the code. It > >> > only gets counted once. > >> > >> STMT_VINFO_IN_PATTERN_P is set on the vinfo of the original stmt (and not > >> just the last, but all original stmts that are being replaced by the > >> pattern). Each of these has STMT_VINFO_RELATED_STMT set to a pattern stmt > >> (last one corresponding to that original stmt). If needed, further > >> pattern stmts for that original stmts are put into > >> STMT_VINFO_PATTERN_DEF_SEQ. The pattern matcher could e.g. match > >> 3 original stmts, and stick a single pattern stmt into STMT_VINFO_RELATED_STMT > >> on the first original stmt, then say 2 pattern stmts into > >> STMT_VINFO_PATTERN_DEF_SEQ of the second original stmt plus one > >> STMT_VINFO_RELATED_STMT and finally one pattern stmt through > >> STMT_VINFO_RELATED_STMT on the third original stmt. > >> > >> > Note that STMT_VINFO_PATTERN_DEF_SEQ doesn't exist in the 4.6 branch, so > >> > this section has to be omitted if we backport it (which is desirable > >> > since the degradation was introduced in 4.6). Removing it apparently > >> > does not affect the sphinx3 degradation. > >> > >> Yeah, when backporting you can basically assume that > >> STMT_VINFO_PATTERN_DEF_SEQ is always NULL in 4.6 and just write NULL instead > >> of itt (and simplify), because no pattern recognizers needed more than one > >> pattern stmt for each original stmt back then. > >> > >> Jakub > >> > > >
On Fri, Feb 10, 2012 at 04:22:32PM +0100, Richard Guenther wrote: > On Fri, Feb 10, 2012 at 3:39 PM, William J. Schmidt > <wschmidt@linux.vnet.ibm.com> wrote: > > Jakub, thanks! Based on this, I believe the patch is correct in its > > handling of the STMT_VINFO_PATTERN_DEF_SEQ logic, without any double > > counting. > > > > I misinterpreted what the commentary for vect_pattern_recog was saying: > > I thought that all replacements were hung off of the last pattern > > statement, but this was just true for the particular example, where only > > one replacement statement was created. Sorry for any confusion! > > > > Based on the discussion, is the patch OK for trunk? > > But you still count for each of the matched scalar stmts the cost of the > whole pattern sequence. No? For each stmt you should count the original stmt if it is relevant (I think there are cases where both the original stmt and pattern stmt for it are both relevant and emitted, didn't understand it exactly though, but usually only the orig or only the pattern stmts are relevant), and if STMT_VINFO_IN_PATTERN_P, count the STMT_VINFO_RELATED_STMT (if any) and all STMT_VINFO_PATTERN_DEF_SEQ sequence stmts (if any), again, not sure if STMT_VINFO_RELEVANT needs to be tested for each or not. I guess look at what we actually emit if the cost model says it should be emitted. Neither the STMT_VINFO_PATTERN_DEF_SEQ nor STMT_VINFO_RELATED_STMT are shared in between any orig stmts, each has its own set of stmts. I believe if some original stmt should map to no pattern stmts, it isn't marked as STMT_VINFO_IN_PATTERN_P at all, but as nothing uses its lhs with the pattern stmts in place, it won't be relevant and will be ignored. Jakub
On Fri, Feb 10, 2012 at 09:36:01AM -0600, William J. Schmidt wrote: > Per Jakub's explanation, the replacement statements are distributed over > the original pattern statements. Visiting STMT_VINFO_RELATED_STMT for a > statement marked STMT_VINFO_IN_PATTERN_P will find zero or one > replacement statements that should be examined. Visiting > STMT_VINFO_PATTERN_DEF_SEQ may pick up some leftover replacement > statements that didn't fit in the 1-1 mapping. The point is that all of > these related statements and pattern-def-seq statements are disjoint, > and Ira's logic is ensuring that they all get examined once. > > It's not a very clean way to represent the replacement of a pattern -- > not how you'd do it if designing from scratch -- but I guess I can see > how it got this way when the STMT_VINFO_PATTERN_DEF_SEQ was glued onto > the existing 1-1 mapping. Still in 4.6 we have just a 1-1 or 1-0 mapping (as I wrote in the previous mail, I think those orig stmts that don't need any replacements, e.g. if you have 2 orig stmts mapping to just one pattern stmt, are just marked as not relevant). Then 4.7 first added (Ira's changes) STMT_VINFO_PATTERN_DEF_STMT, i.e. in addition to 1-1 and 1-0 there was a possibility for 1-2 mapping. And then I needed more than 2, but it was too late in the game to do large changes, so we end up with 1-1+N scheme by changing the DEF_STMT into DEF_SEQ. For 4.8 we definitely at least should remove STMT_VINFO_RELATED_STMT as a way to represent pattern stmts, even for 1-1 mapping the pattern stmt should go into a sequence. Jakub
On Fri, Feb 10, 2012 at 4:36 PM, William J. Schmidt <wschmidt@linux.vnet.ibm.com> wrote: > > > On Fri, 2012-02-10 at 16:22 +0100, Richard Guenther wrote: >> On Fri, Feb 10, 2012 at 3:39 PM, William J. Schmidt >> <wschmidt@linux.vnet.ibm.com> wrote: >> > Jakub, thanks! Based on this, I believe the patch is correct in its >> > handling of the STMT_VINFO_PATTERN_DEF_SEQ logic, without any double >> > counting. >> > >> > I misinterpreted what the commentary for vect_pattern_recog was saying: >> > I thought that all replacements were hung off of the last pattern >> > statement, but this was just true for the particular example, where only >> > one replacement statement was created. Sorry for any confusion! >> > >> > Based on the discussion, is the patch OK for trunk? >> >> But you still count for each of the matched scalar stmts the cost of the >> whole pattern sequence. No? > > I need to change the commentary to make this more clear now. My > comment: "Translate the last statement..." is incorrect; this is done > for each statement in a pattern. > > Per Jakub's explanation, the replacement statements are distributed over > the original pattern statements. Ok, looking at the code in tree-vect-patterns.c it seems that STMT_VINFO_IN_PATTERN_P is only set for the stmt that contains the first replacement stmt in STMT_VINFO_RELATED_STMT and further stmts in that stmts STMT_VINFO_PATTERN_DEF_SEQ. Other stmts that were matched do not have STMT_VINFO_IN_PATTERN_P set (but their STMT_VINFO_RELATED_STMT points to the stmt that has STMT_VINFO_IN_PATTERN_P set). Possibly the other scalar stmts participating in the pattern are marked as not relevant (couldn't spot this quickly). So it seems that your patch should indeed work as-is. Thus, ok, if Jakub doesn't spot another error. Thanks, Richard. > Visiting STMT_VINFO_RELATED_STMT for a > statement marked STMT_VINFO_IN_PATTERN_P will find zero or one > replacement statements that should be examined. Visiting > STMT_VINFO_PATTERN_DEF_SEQ may pick up some leftover replacement > statements that didn't fit in the 1-1 mapping. The point is that all of > these related statements and pattern-def-seq statements are disjoint, > and Ira's logic is ensuring that they all get examined once. > > It's not a very clean way to represent the replacement of a pattern -- > not how you'd do it if designing from scratch -- but I guess I can see > how it got this way when the STMT_VINFO_PATTERN_DEF_SEQ was glued onto > the existing 1-1 mapping. > > Bill > >> >> Richard. >> >> > Thanks, >> > Bill >> > >> > On Fri, 2012-02-10 at 14:44 +0100, Jakub Jelinek wrote: >> >> On Fri, Feb 10, 2012 at 07:31:01AM -0600, William J. Schmidt wrote: >> >> > >From the commentary at the end of tree-vect-patterns.c, only the main >> >> > statement in the pattern (the last one) is flagged as >> >> > STMT_VINFO_IN_PATTERN_P. So this is finding the new replacement >> >> > statement which has been created but not yet inserted in the code. It >> >> > only gets counted once. >> >> >> >> STMT_VINFO_IN_PATTERN_P is set on the vinfo of the original stmt (and not >> >> just the last, but all original stmts that are being replaced by the >> >> pattern). Each of these has STMT_VINFO_RELATED_STMT set to a pattern stmt >> >> (last one corresponding to that original stmt). If needed, further >> >> pattern stmts for that original stmts are put into >> >> STMT_VINFO_PATTERN_DEF_SEQ. The pattern matcher could e.g. match >> >> 3 original stmts, and stick a single pattern stmt into STMT_VINFO_RELATED_STMT >> >> on the first original stmt, then say 2 pattern stmts into >> >> STMT_VINFO_PATTERN_DEF_SEQ of the second original stmt plus one >> >> STMT_VINFO_RELATED_STMT and finally one pattern stmt through >> >> STMT_VINFO_RELATED_STMT on the third original stmt. >> >> >> >> > Note that STMT_VINFO_PATTERN_DEF_SEQ doesn't exist in the 4.6 branch, so >> >> > this section has to be omitted if we backport it (which is desirable >> >> > since the degradation was introduced in 4.6). Removing it apparently >> >> > does not affect the sphinx3 degradation. >> >> >> >> Yeah, when backporting you can basically assume that >> >> STMT_VINFO_PATTERN_DEF_SEQ is always NULL in 4.6 and just write NULL instead >> >> of itt (and simplify), because no pattern recognizers needed more than one >> >> pattern stmt for each original stmt back then. >> >> >> >> Jakub >> >> >> > >> >
Index: gcc/targhooks.c =================================================================== --- gcc/targhooks.c (revision 183944) +++ gcc/targhooks.c (working copy) @@ -514,6 +514,7 @@ default_builtin_vectorization_cost (enum vect_cost case scalar_to_vec: case cond_branch_not_taken: case vec_perm: + case vec_promote_demote: return 1; case unaligned_load: Index: gcc/target.h =================================================================== --- gcc/target.h (revision 183944) +++ gcc/target.h (working copy) @@ -145,7 +145,8 @@ enum vect_cost_for_stmt scalar_to_vec, cond_branch_not_taken, cond_branch_taken, - vec_perm + vec_perm, + vec_promote_demote }; /* The target structure. This holds all the backend hooks. */ Index: gcc/tree-vect-loop.c =================================================================== --- gcc/tree-vect-loop.c (revision 183944) +++ gcc/tree-vect-loop.c (working copy) @@ -2417,7 +2417,8 @@ vect_get_single_scalar_iteraion_cost (loop_vec_inf if (stmt_info && !STMT_VINFO_RELEVANT_P (stmt_info) && (!STMT_VINFO_LIVE_P (stmt_info) - || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) + || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) + && !STMT_VINFO_IN_PATTERN_P (stmt_info)) continue; if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))) @@ -2564,15 +2565,48 @@ vect_estimate_min_profitable_iters (loop_vec_info { gimple stmt = gsi_stmt (si); stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + + /* Translate the last statement in a pattern into the + related replacement statement. */ + if (STMT_VINFO_IN_PATTERN_P (stmt_info)) + { + stmt = STMT_VINFO_RELATED_STMT (stmt_info); + stmt_info = vinfo_for_stmt (stmt); + } + /* Skip stmts that are not vectorized inside the loop. */ if (!STMT_VINFO_RELEVANT_P (stmt_info) && (!STMT_VINFO_LIVE_P (stmt_info) - || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) + || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))) continue; + vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor; /* FIXME: for stmts in the inner-loop in outer-loop vectorization, some of the "outside" costs are generated inside the outer-loop. */ vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info); + if (is_pattern_stmt_p (stmt_info) + && STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)) + { + gimple_stmt_iterator gsi; + + for (gsi = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)); + !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple pattern_def_stmt = gsi_stmt (gsi); + stmt_vec_info pattern_def_stmt_info + = vinfo_for_stmt (pattern_def_stmt); + if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info) + || STMT_VINFO_LIVE_P (pattern_def_stmt_info)) + { + vec_inside_cost + += STMT_VINFO_INSIDE_OF_LOOP_COST + (pattern_def_stmt_info) * factor; + vec_outside_cost + += STMT_VINFO_OUTSIDE_OF_LOOP_COST + (pattern_def_stmt_info); + } + } + } } } Index: gcc/tree-vect-stmts.c =================================================================== --- gcc/tree-vect-stmts.c (revision 183944) +++ gcc/tree-vect-stmts.c (working copy) @@ -811,6 +811,46 @@ vect_model_simple_cost (stmt_vec_info stmt_info, i } +/* Model cost for type demotion and promotion operations. PWR is normally + zero for single-step promotions and demotions. It will be one if + two-step promotion/demotion is required, and so on. Each additional + step doubles the number of instructions required. */ + +static void +vect_model_promotion_demotion_cost (stmt_vec_info stmt_info, + enum vect_def_type *dt, int pwr) +{ + int i, tmp; + int inside_cost = 0, outside_cost = 0, single_stmt_cost; + + /* The SLP costs were already calculated during SLP tree build. */ + if (PURE_SLP_STMT (stmt_info)) + return; + + single_stmt_cost = vect_get_stmt_cost (vec_promote_demote); + for (i = 0; i < pwr + 1; i++) + { + tmp = (STMT_VINFO_TYPE (stmt_info) == type_promotion_vec_info_type) ? + (i + 1) : i; + inside_cost += vect_pow2 (tmp) * single_stmt_cost; + } + + /* FORNOW: Assuming maximum 2 args per stmts. */ + for (i = 0; i < 2; i++) + { + if (dt[i] == vect_constant_def || dt[i] == vect_external_def) + outside_cost += vect_get_stmt_cost (vector_stmt); + } + + if (vect_print_dump_info (REPORT_COST)) + fprintf (vect_dump, "vect_model_promotion_demotion_cost: inside_cost = %d, " + "outside_cost = %d .", inside_cost, outside_cost); + + /* Set the costs in STMT_INFO. */ + stmt_vinfo_set_inside_of_loop_cost (stmt_info, NULL, inside_cost); + stmt_vinfo_set_outside_of_loop_cost (stmt_info, NULL, outside_cost); +} + /* Function vect_cost_strided_group_size For strided load or store, return the group_size only if it is the first @@ -887,7 +927,6 @@ vect_model_store_cost (stmt_vec_info stmt_info, in if (vect_print_dump_info (REPORT_COST)) fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .", group_size); - } /* Costs of the stores. */ @@ -1049,7 +1088,7 @@ vect_get_load_cost (struct data_reference *dr, int case dr_explicit_realign: { *inside_cost += ncopies * (2 * vect_get_stmt_cost (vector_load) - + vect_get_stmt_cost (vector_stmt)); + + vect_get_stmt_cost (vec_perm)); /* FIXME: If the misalignment remains fixed across the iterations of the containing loop, the following cost should be added to the @@ -1057,6 +1096,9 @@ vect_get_load_cost (struct data_reference *dr, int if (targetm.vectorize.builtin_mask_for_load) *inside_cost += vect_get_stmt_cost (vector_stmt); + if (vect_print_dump_info (REPORT_COST)) + fprintf (vect_dump, "vect_model_load_cost: explicit realign"); + break; } case dr_explicit_realign_optimized: @@ -1080,7 +1122,12 @@ vect_get_load_cost (struct data_reference *dr, int } *inside_cost += ncopies * (vect_get_stmt_cost (vector_load) - + vect_get_stmt_cost (vector_stmt)); + + vect_get_stmt_cost (vec_perm)); + + if (vect_print_dump_info (REPORT_COST)) + fprintf (vect_dump, + "vect_model_load_cost: explicit realign optimized"); + break; } @@ -2392,16 +2439,19 @@ vectorizable_conversion (gimple stmt, gimple_stmt_ if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "=== vectorizable_conversion ==="); if (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR) - STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type; + { + STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type; + vect_model_simple_cost (stmt_info, ncopies, dt, NULL); + } else if (modifier == NARROW) { STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type; - vect_model_simple_cost (stmt_info, ncopies, dt, NULL); + vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt); } else { STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type; - vect_model_simple_cost (stmt_info, 2 * ncopies, dt, NULL); + vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt); } VEC_free (tree, heap, interm_types); return true; Index: gcc/config/spu/spu.c =================================================================== --- gcc/config/spu/spu.c (revision 183944) +++ gcc/config/spu/spu.c (working copy) @@ -6920,6 +6920,7 @@ spu_builtin_vectorization_cost (enum vect_cost_for case scalar_to_vec: case cond_branch_not_taken: case vec_perm: + case vec_promote_demote: return 1; case scalar_store: Index: gcc/config/i386/i386.c =================================================================== --- gcc/config/i386/i386.c (revision 183944) +++ gcc/config/i386/i386.c (working copy) @@ -35336,6 +35336,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo return ix86_cost->cond_not_taken_branch_cost; case vec_perm: + case vec_promote_demote: return ix86_cost->vec_stmt_cost; default: Index: gcc/config/rs6000/rs6000.c =================================================================== --- gcc/config/rs6000/rs6000.c (revision 183944) +++ gcc/config/rs6000/rs6000.c (working copy) @@ -3543,10 +3543,17 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ return 1; case vec_perm: - if (!TARGET_VSX) + if (TARGET_VSX) + return 4; + else return 1; - return 2; + case vec_promote_demote: + if (TARGET_VSX) + return 5; + else + return 1; + case cond_branch_taken: return 3;