Move ix86_align_loops into a separate pass and insert the pass after pass_endbr_and_patchable_area.

Message ID 20240812141039.902343-1-hongtao.liu@intel.com
State New
Series Move ix86_align_loops into a separate pass and insert the pass after pass_endbr_and_patchable_area.

Commit Message

liuhongt Aug. 12, 2024, 2:10 p.m. UTC
> Are there any assumptions that BB_HEAD must be a note or label?
> Maybe we should move ix86_align_loops into a separate pass and insert
> the pass just before pass_final.
The patch instead inserts .p2align after the endbr pass; this also fixes the issue.
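
With the pass reordered, endbr64 remains the first instruction after the
function label, and the tight-loop .p2align is emitted in front of the
loop label rather than ahead of the endbr.  For the new test, the output
is expected to look roughly like this (an illustrative sketch only; the
loop label, instruction selection and alignment value are hypothetical
and depend on tuning):

foo:
	.cfi_startproc
	endbr64			# must directly follow .cfi_startproc
	...
	.p2align 6		# loop alignment lands after endbr insertion
.L2:				# hypothetical loop label
	movzbl	(%rsi), %eax	# load *src
	addq	$1, %rsi	# src++
	movb	%al, (%rdi)	# *dest = loaded byte
	addq	$1, %rdi	# dest++
	testb	%al, %al	# stop once '\0' has been copied
	jne	.L2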

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Any comments?

gcc/ChangeLog:

	PR target/116174
	* config/i386/i386.cc (ix86_align_loops): Move this to ..
	* config/i386/i386-features.cc (ix86_align_loops): .. here.
	(class pass_align_tight_loops): New class.
	(make_pass_align_tight_loops): New function.
	* config/i386/i386-passes.def: Insert pass_align_tight_loops
	after pass_insert_endbr_and_patchable_area.
	* config/i386/i386-protos.h (make_pass_align_tight_loops): New
	declaration.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr116174.c: New test.
---
 gcc/config/i386/i386-features.cc         | 190 +++++++++++++++++++++++
 gcc/config/i386/i386-passes.def          |   3 +
 gcc/config/i386/i386-protos.h            |   1 +
 gcc/config/i386/i386.cc                  | 146 -----------------
 gcc/testsuite/gcc.target/i386/pr116174.c |  12 ++
 5 files changed, 206 insertions(+), 146 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr116174.c

Comments

Hongtao Liu Aug. 14, 2024, 1:39 a.m. UTC | #1
On Mon, Aug 12, 2024 at 10:10 PM liuhongt <hongtao.liu@intel.com> wrote:
>
> > Are there any assumptions that BB_HEAD must be a note or label?
> > Maybe we should move ix86_align_loops into a separate pass and insert
> > the pass just before pass_final.
> The patch instead inserts .p2align after the endbr pass; this also fixes the issue.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Any comments?
Committed

Patch

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index c36d181f2d6..7e80e7b0103 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3417,6 +3417,196 @@ make_pass_apx_nf_convert (gcc::context *ctxt)
   return new pass_apx_nf_convert (ctxt);
 }
 
+/* When a hot loop can be fit into one cacheline,
+   force align the loop without considering the max skip.  */
+static void
+ix86_align_loops ()
+{
+  basic_block bb;
+
+  /* Don't do this when we don't know cache line size.  */
+  if (ix86_cost->prefetch_block == 0)
+    return;
+
+  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+  profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      rtx_insn *label = BB_HEAD (bb);
+      bool has_fallthru = 0;
+      edge e;
+      edge_iterator ei;
+
+      if (!LABEL_P (label))
+	continue;
+
+      profile_count fallthru_count = profile_count::zero ();
+      profile_count branch_count = profile_count::zero ();
+
+      FOR_EACH_EDGE (e, ei, bb->preds)
+	{
+	  if (e->flags & EDGE_FALLTHRU)
+	    has_fallthru = 1, fallthru_count += e->count ();
+	  else
+	    branch_count += e->count ();
+	}
+
+      if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
+	continue;
+
+      if (bb->loop_father
+	  && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
+	  && (has_fallthru
+	      ? (!(single_succ_p (bb)
+		   && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
+		 && optimize_bb_for_speed_p (bb)
+		 && branch_count + fallthru_count > count_threshold
+		 && (branch_count > fallthru_count * param_align_loop_iterations))
+	      /* In case there'no fallthru for the loop.
+		 Nops inserted won't be executed.  */
+	      : (branch_count > count_threshold
+		 || (bb->count > bb->prev_bb->count * 10
+		     && (bb->prev_bb->count
+			 <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)))))
+	{
+	  rtx_insn* insn, *end_insn;
+	  HOST_WIDE_INT size = 0;
+	  bool padding_p = true;
+	  basic_block tbb = bb;
+	  unsigned cond_branch_num = 0;
+	  bool detect_tight_loop_p = false;
+
+	  for (unsigned int i = 0; i != bb->loop_father->num_nodes;
+	       i++, tbb = tbb->next_bb)
+	    {
+	      /* Only handle continuous cfg layout. */
+	      if (bb->loop_father != tbb->loop_father)
+		{
+		  padding_p = false;
+		  break;
+		}
+
+	      FOR_BB_INSNS (tbb, insn)
+		{
+		  if (!NONDEBUG_INSN_P (insn))
+		    continue;
+		  size += ix86_min_insn_size (insn);
+
+		  /* We don't know size of inline asm.
+		     Don't align loop for call.  */
+		  if (asm_noperands (PATTERN (insn)) >= 0
+		      || CALL_P (insn))
+		    {
+		      size = -1;
+		      break;
+		    }
+		}
+
+	      if (size == -1 || size > ix86_cost->prefetch_block)
+		{
+		  padding_p = false;
+		  break;
+		}
+
+	      FOR_EACH_EDGE (e, ei, tbb->succs)
+		{
+		  /* It could be part of the loop.  */
+		  if (e->dest == bb)
+		    {
+		      detect_tight_loop_p = true;
+		      break;
+		    }
+		}
+
+	      if (detect_tight_loop_p)
+		break;
+
+	      end_insn = BB_END (tbb);
+	      if (JUMP_P (end_insn))
+		{
+		  /* For decoded icache:
+		     1. Up to two branches are allowed per Way.
+		     2. A non-conditional branch is the last micro-op in a Way.
+		  */
+		  if (onlyjump_p (end_insn)
+		      && (any_uncondjump_p (end_insn)
+			  || single_succ_p (tbb)))
+		    {
+		      padding_p = false;
+		      break;
+		    }
+		  else if (++cond_branch_num >= 2)
+		    {
+		      padding_p = false;
+		      break;
+		    }
+		}
+
+	    }
+
+	  if (padding_p && detect_tight_loop_p)
+	    {
+	      emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)),
+						    GEN_INT (0)), label);
+	      /* End of function.  */
+	      if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun))
+		break;
+	      /* Skip bb which already fits into one cacheline.  */
+	      bb = tbb;
+	    }
+	}
+    }
+
+  loop_optimizer_finalize ();
+  free_dominance_info (CDI_DOMINATORS);
+}
+
+namespace {
+
+const pass_data pass_data_align_tight_loops =
+{
+  RTL_PASS, /* type */
+  "align_tight_loops", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_MACH_DEP, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  0, /* todo_flags_finish */
+};
+
+class pass_align_tight_loops : public rtl_opt_pass
+{
+public:
+  pass_align_tight_loops (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_align_tight_loops, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  bool gate (function *) final override
+    {
+      return optimize && optimize_function_for_speed_p (cfun);
+    }
+
+  unsigned int execute (function *) final override
+    {
+      timevar_push (TV_MACH_DEP);
+#ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
+      ix86_align_loops ();
+#endif
+      timevar_pop (TV_MACH_DEP);
+      return 0;
+    }
+}; // class pass_align_tight_loops
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_align_tight_loops (gcc::context *ctxt)
+{
+  return new pass_align_tight_loops (ctxt);
+}
 
 /* This compares the priority of target features in function DECL1
    and DECL2.  It returns positive value if DECL1 is higher priority,
diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
index 99fc8805b22..a9d350dcfca 100644
--- a/gcc/config/i386/i386-passes.def
+++ b/gcc/config/i386/i386-passes.def
@@ -31,6 +31,9 @@ along with GCC; see the file COPYING3.  If not see
   INSERT_PASS_BEFORE (pass_cse2, 1, pass_stv, true /* timode_p */);
 
   INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area);
+  /* pass_align_tight_loops must be after pass_insert_endbr_and_patchable_area.
+     PR116174.  */
+  INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops);
 
   INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_partial_avx_dependency);
   INSERT_PASS_AFTER (pass_rtl_ifcvt, 1, pass_apx_nf_convert);
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index a80432b3742..3a7bc949e56 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -425,6 +425,7 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area
 extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
   (gcc::context *);
 extern rtl_opt_pass *make_pass_apx_nf_convert (gcc::context *);
+extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *);
 
 extern bool ix86_has_no_direct_extern_access;
 extern bool ix86_rpad_gate ();
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index f044826269c..0721e38ab2a 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -23403,150 +23403,6 @@ ix86_split_stlf_stall_load ()
     }
 }
 
-/* When a hot loop can be fit into one cacheline,
-   force align the loop without considering the max skip.  */
-static void
-ix86_align_loops ()
-{
-  basic_block bb;
-
-  /* Don't do this when we don't know cache line size.  */
-  if (ix86_cost->prefetch_block == 0)
-    return;
-
-  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
-  profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
-  FOR_EACH_BB_FN (bb, cfun)
-    {
-      rtx_insn *label = BB_HEAD (bb);
-      bool has_fallthru = 0;
-      edge e;
-      edge_iterator ei;
-
-      if (!LABEL_P (label))
-	continue;
-
-      profile_count fallthru_count = profile_count::zero ();
-      profile_count branch_count = profile_count::zero ();
-
-      FOR_EACH_EDGE (e, ei, bb->preds)
-	{
-	  if (e->flags & EDGE_FALLTHRU)
-	    has_fallthru = 1, fallthru_count += e->count ();
-	  else
-	    branch_count += e->count ();
-	}
-
-      if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
-	continue;
-
-      if (bb->loop_father
-	  && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
-	  && (has_fallthru
-	      ? (!(single_succ_p (bb)
-		   && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
-		 && optimize_bb_for_speed_p (bb)
-		 && branch_count + fallthru_count > count_threshold
-		 && (branch_count > fallthru_count * param_align_loop_iterations))
-	      /* In case there'no fallthru for the loop.
-		 Nops inserted won't be executed.  */
-	      : (branch_count > count_threshold
-		 || (bb->count > bb->prev_bb->count * 10
-		     && (bb->prev_bb->count
-			 <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)))))
-	{
-	  rtx_insn* insn, *end_insn;
-	  HOST_WIDE_INT size = 0;
-	  bool padding_p = true;
-	  basic_block tbb = bb;
-	  unsigned cond_branch_num = 0;
-	  bool detect_tight_loop_p = false;
-
-	  for (unsigned int i = 0; i != bb->loop_father->num_nodes;
-	       i++, tbb = tbb->next_bb)
-	    {
-	      /* Only handle continuous cfg layout. */
-	      if (bb->loop_father != tbb->loop_father)
-		{
-		  padding_p = false;
-		  break;
-		}
-
-	      FOR_BB_INSNS (tbb, insn)
-		{
-		  if (!NONDEBUG_INSN_P (insn))
-		    continue;
-		  size += ix86_min_insn_size (insn);
-
-		  /* We don't know size of inline asm.
-		     Don't align loop for call.  */
-		  if (asm_noperands (PATTERN (insn)) >= 0
-		      || CALL_P (insn))
-		    {
-		      size = -1;
-		      break;
-		    }
-		}
-
-	      if (size == -1 || size > ix86_cost->prefetch_block)
-		{
-		  padding_p = false;
-		  break;
-		}
-
-	      FOR_EACH_EDGE (e, ei, tbb->succs)
-		{
-		  /* It could be part of the loop.  */
-		  if (e->dest == bb)
-		    {
-		      detect_tight_loop_p = true;
-		      break;
-		    }
-		}
-
-	      if (detect_tight_loop_p)
-		break;
-
-	      end_insn = BB_END (tbb);
-	      if (JUMP_P (end_insn))
-		{
-		  /* For decoded icache:
-		     1. Up to two branches are allowed per Way.
-		     2. A non-conditional branch is the last micro-op in a Way.
-		  */
-		  if (onlyjump_p (end_insn)
-		      && (any_uncondjump_p (end_insn)
-			  || single_succ_p (tbb)))
-		    {
-		      padding_p = false;
-		      break;
-		    }
-		  else if (++cond_branch_num >= 2)
-		    {
-		      padding_p = false;
-		      break;
-		    }
-		}
-
-	    }
-
-	  if (padding_p && detect_tight_loop_p)
-	    {
-	      emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)),
-						    GEN_INT (0)), label);
-	      /* End of function.  */
-	      if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun))
-		break;
-	      /* Skip bb which already fits into one cacheline.  */
-	      bb = tbb;
-	    }
-	}
-    }
-
-  loop_optimizer_finalize ();
-  free_dominance_info (CDI_DOMINATORS);
-}
-
 /* Implement machine specific optimizations.  We implement padding of returns
    for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
 static void
@@ -23570,8 +23426,6 @@ ix86_reorg (void)
 #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
       if (TARGET_FOUR_JUMP_LIMIT)
 	ix86_avoid_jump_mispredicts ();
-
-      ix86_align_loops ();
 #endif
     }
 }
diff --git a/gcc/testsuite/gcc.target/i386/pr116174.c b/gcc/testsuite/gcc.target/i386/pr116174.c
new file mode 100644
index 00000000000..8877d0b51af
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116174.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target *-*-linux* } } */
+/* { dg-options "-O2 -fcf-protection=branch" } */
+
+char *
+foo (char *dest, const char *src)
+{
+  while ((*dest++ = *src++) != '\0')
+    /* nothing */;
+  return --dest;
+}
+
+/* { dg-final { scan-assembler "\t\.cfi_startproc\n\tendbr(32|64)\n" } } */