Message ID | 20240812141039.902343-1-hongtao.liu@intel.com |
---|---|
State | New |
Headers | show |
Series | Move ix86_align_loops into a separate pass and insert the pass after pass_endbr_and_patchable_area. | expand |
On Mon, Aug 12, 2024 at 10:10 PM liuhongt <hongtao.liu@intel.com> wrote: > > > Are there any assumptions that BB_HEAD must be a note or label? > > Maybe we should move ix86_align_loops into a separate pass and insert > > the pass just before pass_final. > The patch inserts .p2align after endbr pass, it can also fix the issue. > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Any comments? Committed > > gcc/ChangeLog: > > PR target/116174 > * config/i386/i386.cc (ix86_align_loops): Move this to .. > * config/i386/i386-features.cc (ix86_align_loops): .. here. > (class pass_align_tight_loops): New class. > (make_pass_align_tight_loops): New function. > * config/i386/i386-passes.def: Insert pass_align_tight_loops > after pass_insert_endbr_and_patchable_area. > * config/i386/i386-protos.h (make_pass_align_tight_loops): New > declare. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/pr116174.c: New test. > --- > gcc/config/i386/i386-features.cc | 190 +++++++++++++++++++++++ > gcc/config/i386/i386-passes.def | 3 + > gcc/config/i386/i386-protos.h | 1 + > gcc/config/i386/i386.cc | 146 ----------------- > gcc/testsuite/gcc.target/i386/pr116174.c | 12 ++ > 5 files changed, 206 insertions(+), 146 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr116174.c > > diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc > index c36d181f2d6..7e80e7b0103 100644 > --- a/gcc/config/i386/i386-features.cc > +++ b/gcc/config/i386/i386-features.cc > @@ -3417,6 +3417,196 @@ make_pass_apx_nf_convert (gcc::context *ctxt) > return new pass_apx_nf_convert (ctxt); > } > > +/* When a hot loop can be fit into one cacheline, > + force align the loop without considering the max skip. */ > +static void > +ix86_align_loops () > +{ > + basic_block bb; > + > + /* Don't do this when we don't know cache line size. */ > + if (ix86_cost->prefetch_block == 0) > + return; > + > + loop_optimizer_init (AVOID_CFG_MODIFICATIONS); > + profile_count count_threshold = cfun->cfg->count_max / param_align_threshold; > + FOR_EACH_BB_FN (bb, cfun) > + { > + rtx_insn *label = BB_HEAD (bb); > + bool has_fallthru = 0; > + edge e; > + edge_iterator ei; > + > + if (!LABEL_P (label)) > + continue; > + > + profile_count fallthru_count = profile_count::zero (); > + profile_count branch_count = profile_count::zero (); > + > + FOR_EACH_EDGE (e, ei, bb->preds) > + { > + if (e->flags & EDGE_FALLTHRU) > + has_fallthru = 1, fallthru_count += e->count (); > + else > + branch_count += e->count (); > + } > + > + if (!fallthru_count.initialized_p () || !branch_count.initialized_p ()) > + continue; > + > + if (bb->loop_father > + && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun) > + && (has_fallthru > + ? (!(single_succ_p (bb) > + && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun)) > + && optimize_bb_for_speed_p (bb) > + && branch_count + fallthru_count > count_threshold > + && (branch_count > fallthru_count * param_align_loop_iterations)) > + /* In case there'no fallthru for the loop. > + Nops inserted won't be executed. */ > + : (branch_count > count_threshold > + || (bb->count > bb->prev_bb->count * 10 > + && (bb->prev_bb->count > + <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2))))) > + { > + rtx_insn* insn, *end_insn; > + HOST_WIDE_INT size = 0; > + bool padding_p = true; > + basic_block tbb = bb; > + unsigned cond_branch_num = 0; > + bool detect_tight_loop_p = false; > + > + for (unsigned int i = 0; i != bb->loop_father->num_nodes; > + i++, tbb = tbb->next_bb) > + { > + /* Only handle continuous cfg layout. */ > + if (bb->loop_father != tbb->loop_father) > + { > + padding_p = false; > + break; > + } > + > + FOR_BB_INSNS (tbb, insn) > + { > + if (!NONDEBUG_INSN_P (insn)) > + continue; > + size += ix86_min_insn_size (insn); > + > + /* We don't know size of inline asm. > + Don't align loop for call. */ > + if (asm_noperands (PATTERN (insn)) >= 0 > + || CALL_P (insn)) > + { > + size = -1; > + break; > + } > + } > + > + if (size == -1 || size > ix86_cost->prefetch_block) > + { > + padding_p = false; > + break; > + } > + > + FOR_EACH_EDGE (e, ei, tbb->succs) > + { > + /* It could be part of the loop. */ > + if (e->dest == bb) > + { > + detect_tight_loop_p = true; > + break; > + } > + } > + > + if (detect_tight_loop_p) > + break; > + > + end_insn = BB_END (tbb); > + if (JUMP_P (end_insn)) > + { > + /* For decoded icache: > + 1. Up to two branches are allowed per Way. > + 2. A non-conditional branch is the last micro-op in a Way. > + */ > + if (onlyjump_p (end_insn) > + && (any_uncondjump_p (end_insn) > + || single_succ_p (tbb))) > + { > + padding_p = false; > + break; > + } > + else if (++cond_branch_num >= 2) > + { > + padding_p = false; > + break; > + } > + } > + > + } > + > + if (padding_p && detect_tight_loop_p) > + { > + emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)), > + GEN_INT (0)), label); > + /* End of function. */ > + if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun)) > + break; > + /* Skip bb which already fits into one cacheline. */ > + bb = tbb; > + } > + } > + } > + > + loop_optimizer_finalize (); > + free_dominance_info (CDI_DOMINATORS); > +} > + > +namespace { > + > +const pass_data pass_data_align_tight_loops = > +{ > + RTL_PASS, /* type */ > + "align_tight_loops", /* name */ > + OPTGROUP_NONE, /* optinfo_flags */ > + TV_MACH_DEP, /* tv_id */ > + 0, /* properties_required */ > + 0, /* properties_provided */ > + 0, /* properties_destroyed */ > + 0, /* todo_flags_start */ > + 0, /* todo_flags_finish */ > +}; > + > +class pass_align_tight_loops : public rtl_opt_pass > +{ > +public: > + pass_align_tight_loops (gcc::context *ctxt) > + : rtl_opt_pass (pass_data_align_tight_loops, ctxt) > + {} > + > + /* opt_pass methods: */ > + bool gate (function *) final override > + { > + return optimize && optimize_function_for_speed_p (cfun); > + } > + > + unsigned int execute (function *) final override > + { > + timevar_push (TV_MACH_DEP); > +#ifdef ASM_OUTPUT_MAX_SKIP_ALIGN > + ix86_align_loops (); > +#endif > + timevar_pop (TV_MACH_DEP); > + return 0; > + } > +}; // class pass_align_tight_loops > + > +} // anon namespace > + > +rtl_opt_pass * > +make_pass_align_tight_loops (gcc::context *ctxt) > +{ > + return new pass_align_tight_loops (ctxt); > +} > > /* This compares the priority of target features in function DECL1 > and DECL2. It returns positive value if DECL1 is higher priority, > diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def > index 99fc8805b22..a9d350dcfca 100644 > --- a/gcc/config/i386/i386-passes.def > +++ b/gcc/config/i386/i386-passes.def > @@ -31,6 +31,9 @@ along with GCC; see the file COPYING3. If not see > INSERT_PASS_BEFORE (pass_cse2, 1, pass_stv, true /* timode_p */); > > INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area); > + /* pass_align_tight_loops must be after pass_insert_endbr_and_patchable_area. > + PR116174. */ > + INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops); > > INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_partial_avx_dependency); > INSERT_PASS_AFTER (pass_rtl_ifcvt, 1, pass_apx_nf_convert); > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > index a80432b3742..3a7bc949e56 100644 > --- a/gcc/config/i386/i386-protos.h > +++ b/gcc/config/i386/i386-protos.h > @@ -425,6 +425,7 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area > extern rtl_opt_pass *make_pass_remove_partial_avx_dependency > (gcc::context *); > extern rtl_opt_pass *make_pass_apx_nf_convert (gcc::context *); > +extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *); > > extern bool ix86_has_no_direct_extern_access; > extern bool ix86_rpad_gate (); > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > index f044826269c..0721e38ab2a 100644 > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -23403,150 +23403,6 @@ ix86_split_stlf_stall_load () > } > } > > -/* When a hot loop can be fit into one cacheline, > - force align the loop without considering the max skip. */ > -static void > -ix86_align_loops () > -{ > - basic_block bb; > - > - /* Don't do this when we don't know cache line size. */ > - if (ix86_cost->prefetch_block == 0) > - return; > - > - loop_optimizer_init (AVOID_CFG_MODIFICATIONS); > - profile_count count_threshold = cfun->cfg->count_max / param_align_threshold; > - FOR_EACH_BB_FN (bb, cfun) > - { > - rtx_insn *label = BB_HEAD (bb); > - bool has_fallthru = 0; > - edge e; > - edge_iterator ei; > - > - if (!LABEL_P (label)) > - continue; > - > - profile_count fallthru_count = profile_count::zero (); > - profile_count branch_count = profile_count::zero (); > - > - FOR_EACH_EDGE (e, ei, bb->preds) > - { > - if (e->flags & EDGE_FALLTHRU) > - has_fallthru = 1, fallthru_count += e->count (); > - else > - branch_count += e->count (); > - } > - > - if (!fallthru_count.initialized_p () || !branch_count.initialized_p ()) > - continue; > - > - if (bb->loop_father > - && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun) > - && (has_fallthru > - ? (!(single_succ_p (bb) > - && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun)) > - && optimize_bb_for_speed_p (bb) > - && branch_count + fallthru_count > count_threshold > - && (branch_count > fallthru_count * param_align_loop_iterations)) > - /* In case there'no fallthru for the loop. > - Nops inserted won't be executed. */ > - : (branch_count > count_threshold > - || (bb->count > bb->prev_bb->count * 10 > - && (bb->prev_bb->count > - <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2))))) > - { > - rtx_insn* insn, *end_insn; > - HOST_WIDE_INT size = 0; > - bool padding_p = true; > - basic_block tbb = bb; > - unsigned cond_branch_num = 0; > - bool detect_tight_loop_p = false; > - > - for (unsigned int i = 0; i != bb->loop_father->num_nodes; > - i++, tbb = tbb->next_bb) > - { > - /* Only handle continuous cfg layout. */ > - if (bb->loop_father != tbb->loop_father) > - { > - padding_p = false; > - break; > - } > - > - FOR_BB_INSNS (tbb, insn) > - { > - if (!NONDEBUG_INSN_P (insn)) > - continue; > - size += ix86_min_insn_size (insn); > - > - /* We don't know size of inline asm. > - Don't align loop for call. */ > - if (asm_noperands (PATTERN (insn)) >= 0 > - || CALL_P (insn)) > - { > - size = -1; > - break; > - } > - } > - > - if (size == -1 || size > ix86_cost->prefetch_block) > - { > - padding_p = false; > - break; > - } > - > - FOR_EACH_EDGE (e, ei, tbb->succs) > - { > - /* It could be part of the loop. */ > - if (e->dest == bb) > - { > - detect_tight_loop_p = true; > - break; > - } > - } > - > - if (detect_tight_loop_p) > - break; > - > - end_insn = BB_END (tbb); > - if (JUMP_P (end_insn)) > - { > - /* For decoded icache: > - 1. Up to two branches are allowed per Way. > - 2. A non-conditional branch is the last micro-op in a Way. > - */ > - if (onlyjump_p (end_insn) > - && (any_uncondjump_p (end_insn) > - || single_succ_p (tbb))) > - { > - padding_p = false; > - break; > - } > - else if (++cond_branch_num >= 2) > - { > - padding_p = false; > - break; > - } > - } > - > - } > - > - if (padding_p && detect_tight_loop_p) > - { > - emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)), > - GEN_INT (0)), label); > - /* End of function. */ > - if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun)) > - break; > - /* Skip bb which already fits into one cacheline. */ > - bb = tbb; > - } > - } > - } > - > - loop_optimizer_finalize (); > - free_dominance_info (CDI_DOMINATORS); > -} > - > /* Implement machine specific optimizations. We implement padding of returns > for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */ > static void > @@ -23570,8 +23426,6 @@ ix86_reorg (void) > #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN > if (TARGET_FOUR_JUMP_LIMIT) > ix86_avoid_jump_mispredicts (); > - > - ix86_align_loops (); > #endif > } > } > diff --git a/gcc/testsuite/gcc.target/i386/pr116174.c b/gcc/testsuite/gcc.target/i386/pr116174.c > new file mode 100644 > index 00000000000..8877d0b51af > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr116174.c > @@ -0,0 +1,12 @@ > +/* { dg-do compile { target *-*-linux* } } */ > +/* { dg-options "-O2 -fcf-protection=branch" } */ > + > +char * > +foo (char *dest, const char *src) > +{ > + while ((*dest++ = *src++) != '\0') > + /* nothing */; > + return --dest; > +} > + > +/* { dg-final { scan-assembler "\t\.cfi_startproc\n\tendbr(32|64)\n" } } */ > -- > 2.31.1 >
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index c36d181f2d6..7e80e7b0103 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -3417,6 +3417,196 @@ make_pass_apx_nf_convert (gcc::context *ctxt) return new pass_apx_nf_convert (ctxt); } +/* When a hot loop can be fit into one cacheline, + force align the loop without considering the max skip. */ +static void +ix86_align_loops () +{ + basic_block bb; + + /* Don't do this when we don't know cache line size. */ + if (ix86_cost->prefetch_block == 0) + return; + + loop_optimizer_init (AVOID_CFG_MODIFICATIONS); + profile_count count_threshold = cfun->cfg->count_max / param_align_threshold; + FOR_EACH_BB_FN (bb, cfun) + { + rtx_insn *label = BB_HEAD (bb); + bool has_fallthru = 0; + edge e; + edge_iterator ei; + + if (!LABEL_P (label)) + continue; + + profile_count fallthru_count = profile_count::zero (); + profile_count branch_count = profile_count::zero (); + + FOR_EACH_EDGE (e, ei, bb->preds) + { + if (e->flags & EDGE_FALLTHRU) + has_fallthru = 1, fallthru_count += e->count (); + else + branch_count += e->count (); + } + + if (!fallthru_count.initialized_p () || !branch_count.initialized_p ()) + continue; + + if (bb->loop_father + && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun) + && (has_fallthru + ? (!(single_succ_p (bb) + && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun)) + && optimize_bb_for_speed_p (bb) + && branch_count + fallthru_count > count_threshold + && (branch_count > fallthru_count * param_align_loop_iterations)) + /* In case there'no fallthru for the loop. + Nops inserted won't be executed. */ + : (branch_count > count_threshold + || (bb->count > bb->prev_bb->count * 10 + && (bb->prev_bb->count + <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2))))) + { + rtx_insn* insn, *end_insn; + HOST_WIDE_INT size = 0; + bool padding_p = true; + basic_block tbb = bb; + unsigned cond_branch_num = 0; + bool detect_tight_loop_p = false; + + for (unsigned int i = 0; i != bb->loop_father->num_nodes; + i++, tbb = tbb->next_bb) + { + /* Only handle continuous cfg layout. */ + if (bb->loop_father != tbb->loop_father) + { + padding_p = false; + break; + } + + FOR_BB_INSNS (tbb, insn) + { + if (!NONDEBUG_INSN_P (insn)) + continue; + size += ix86_min_insn_size (insn); + + /* We don't know size of inline asm. + Don't align loop for call. */ + if (asm_noperands (PATTERN (insn)) >= 0 + || CALL_P (insn)) + { + size = -1; + break; + } + } + + if (size == -1 || size > ix86_cost->prefetch_block) + { + padding_p = false; + break; + } + + FOR_EACH_EDGE (e, ei, tbb->succs) + { + /* It could be part of the loop. */ + if (e->dest == bb) + { + detect_tight_loop_p = true; + break; + } + } + + if (detect_tight_loop_p) + break; + + end_insn = BB_END (tbb); + if (JUMP_P (end_insn)) + { + /* For decoded icache: + 1. Up to two branches are allowed per Way. + 2. A non-conditional branch is the last micro-op in a Way. + */ + if (onlyjump_p (end_insn) + && (any_uncondjump_p (end_insn) + || single_succ_p (tbb))) + { + padding_p = false; + break; + } + else if (++cond_branch_num >= 2) + { + padding_p = false; + break; + } + } + + } + + if (padding_p && detect_tight_loop_p) + { + emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)), + GEN_INT (0)), label); + /* End of function. */ + if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun)) + break; + /* Skip bb which already fits into one cacheline. */ + bb = tbb; + } + } + } + + loop_optimizer_finalize (); + free_dominance_info (CDI_DOMINATORS); +} + +namespace { + +const pass_data pass_data_align_tight_loops = +{ + RTL_PASS, /* type */ + "align_tight_loops", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_MACH_DEP, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + 0, /* todo_flags_finish */ +}; + +class pass_align_tight_loops : public rtl_opt_pass +{ +public: + pass_align_tight_loops (gcc::context *ctxt) + : rtl_opt_pass (pass_data_align_tight_loops, ctxt) + {} + + /* opt_pass methods: */ + bool gate (function *) final override + { + return optimize && optimize_function_for_speed_p (cfun); + } + + unsigned int execute (function *) final override + { + timevar_push (TV_MACH_DEP); +#ifdef ASM_OUTPUT_MAX_SKIP_ALIGN + ix86_align_loops (); +#endif + timevar_pop (TV_MACH_DEP); + return 0; + } +}; // class pass_align_tight_loops + +} // anon namespace + +rtl_opt_pass * +make_pass_align_tight_loops (gcc::context *ctxt) +{ + return new pass_align_tight_loops (ctxt); +} /* This compares the priority of target features in function DECL1 and DECL2. It returns positive value if DECL1 is higher priority, diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def index 99fc8805b22..a9d350dcfca 100644 --- a/gcc/config/i386/i386-passes.def +++ b/gcc/config/i386/i386-passes.def @@ -31,6 +31,9 @@ along with GCC; see the file COPYING3. If not see INSERT_PASS_BEFORE (pass_cse2, 1, pass_stv, true /* timode_p */); INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area); + /* pass_align_tight_loops must be after pass_insert_endbr_and_patchable_area. + PR116174. */ + INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops); INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_partial_avx_dependency); INSERT_PASS_AFTER (pass_rtl_ifcvt, 1, pass_apx_nf_convert); diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index a80432b3742..3a7bc949e56 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -425,6 +425,7 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area extern rtl_opt_pass *make_pass_remove_partial_avx_dependency (gcc::context *); extern rtl_opt_pass *make_pass_apx_nf_convert (gcc::context *); +extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *); extern bool ix86_has_no_direct_extern_access; extern bool ix86_rpad_gate (); diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index f044826269c..0721e38ab2a 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -23403,150 +23403,6 @@ ix86_split_stlf_stall_load () } } -/* When a hot loop can be fit into one cacheline, - force align the loop without considering the max skip. */ -static void -ix86_align_loops () -{ - basic_block bb; - - /* Don't do this when we don't know cache line size. */ - if (ix86_cost->prefetch_block == 0) - return; - - loop_optimizer_init (AVOID_CFG_MODIFICATIONS); - profile_count count_threshold = cfun->cfg->count_max / param_align_threshold; - FOR_EACH_BB_FN (bb, cfun) - { - rtx_insn *label = BB_HEAD (bb); - bool has_fallthru = 0; - edge e; - edge_iterator ei; - - if (!LABEL_P (label)) - continue; - - profile_count fallthru_count = profile_count::zero (); - profile_count branch_count = profile_count::zero (); - - FOR_EACH_EDGE (e, ei, bb->preds) - { - if (e->flags & EDGE_FALLTHRU) - has_fallthru = 1, fallthru_count += e->count (); - else - branch_count += e->count (); - } - - if (!fallthru_count.initialized_p () || !branch_count.initialized_p ()) - continue; - - if (bb->loop_father - && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun) - && (has_fallthru - ? (!(single_succ_p (bb) - && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun)) - && optimize_bb_for_speed_p (bb) - && branch_count + fallthru_count > count_threshold - && (branch_count > fallthru_count * param_align_loop_iterations)) - /* In case there'no fallthru for the loop. - Nops inserted won't be executed. */ - : (branch_count > count_threshold - || (bb->count > bb->prev_bb->count * 10 - && (bb->prev_bb->count - <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2))))) - { - rtx_insn* insn, *end_insn; - HOST_WIDE_INT size = 0; - bool padding_p = true; - basic_block tbb = bb; - unsigned cond_branch_num = 0; - bool detect_tight_loop_p = false; - - for (unsigned int i = 0; i != bb->loop_father->num_nodes; - i++, tbb = tbb->next_bb) - { - /* Only handle continuous cfg layout. */ - if (bb->loop_father != tbb->loop_father) - { - padding_p = false; - break; - } - - FOR_BB_INSNS (tbb, insn) - { - if (!NONDEBUG_INSN_P (insn)) - continue; - size += ix86_min_insn_size (insn); - - /* We don't know size of inline asm. - Don't align loop for call. */ - if (asm_noperands (PATTERN (insn)) >= 0 - || CALL_P (insn)) - { - size = -1; - break; - } - } - - if (size == -1 || size > ix86_cost->prefetch_block) - { - padding_p = false; - break; - } - - FOR_EACH_EDGE (e, ei, tbb->succs) - { - /* It could be part of the loop. */ - if (e->dest == bb) - { - detect_tight_loop_p = true; - break; - } - } - - if (detect_tight_loop_p) - break; - - end_insn = BB_END (tbb); - if (JUMP_P (end_insn)) - { - /* For decoded icache: - 1. Up to two branches are allowed per Way. - 2. A non-conditional branch is the last micro-op in a Way. - */ - if (onlyjump_p (end_insn) - && (any_uncondjump_p (end_insn) - || single_succ_p (tbb))) - { - padding_p = false; - break; - } - else if (++cond_branch_num >= 2) - { - padding_p = false; - break; - } - } - - } - - if (padding_p && detect_tight_loop_p) - { - emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)), - GEN_INT (0)), label); - /* End of function. */ - if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun)) - break; - /* Skip bb which already fits into one cacheline. */ - bb = tbb; - } - } - } - - loop_optimizer_finalize (); - free_dominance_info (CDI_DOMINATORS); -} - /* Implement machine specific optimizations. We implement padding of returns for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */ static void @@ -23570,8 +23426,6 @@ ix86_reorg (void) #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN if (TARGET_FOUR_JUMP_LIMIT) ix86_avoid_jump_mispredicts (); - - ix86_align_loops (); #endif } } diff --git a/gcc/testsuite/gcc.target/i386/pr116174.c b/gcc/testsuite/gcc.target/i386/pr116174.c new file mode 100644 index 00000000000..8877d0b51af --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr116174.c @@ -0,0 +1,12 @@ +/* { dg-do compile { target *-*-linux* } } */ +/* { dg-options "-O2 -fcf-protection=branch" } */ + +char * +foo (char *dest, const char *src) +{ + while ((*dest++ = *src++) != '\0') + /* nothing */; + return --dest; +} + +/* { dg-final { scan-assembler "\t\.cfi_startproc\n\tendbr(32|64)\n" } } */