diff mbox series

[v8,07/12] Enable musttail tail conversion even when not optimizing

Message ID 20240622185557.1589179-8-ak@linux.intel.com
State New
Headers show
Series [v8,01/12] Improve must tail in RTL backend | expand

Commit Message

Andi Kleen June 22, 2024, 6:54 p.m. UTC
Enable the tailcall optimization for non optimizing builds,
but in this case only checks calls that have the musttail attribute set.
This makes musttail work without optimization.

This is done with a new late musttail pass that is only active when
not optimizing. The new pass relies on tree-cfg to discover musttails.
This avoids a ~0.8% compiler run time penalty at -O0.

gcc/ChangeLog:

	* function.h (struct function): Add has_musttail.
	* lto-streamer-in.cc (input_struct_function_base): Stream
	has_musttail.
	* lto-streamer-out.cc (output_struct_function_base): Dito.
	* passes.def (pass_musttail): Add.
	* tree-cfg.cc (notice_special_calls): Record has_musttail.
	(clear_special_calls): Clear has_musttail.
	* tree-pass.h (make_pass_musttail): Add.
	* tree-tailcall.cc (find_tail_calls): Handle only_musttail
	  argument.
	(tree_optimize_tail_calls_1): Pass on only_musttail.
	(execute_tail_calls): Pass only_musttail as false.
	(class pass_musttail): Add.
	(make_pass_musttail): Add.
---
 gcc/function.h          |  3 ++
 gcc/lto-streamer-in.cc  |  1 +
 gcc/lto-streamer-out.cc |  1 +
 gcc/passes.def          |  1 +
 gcc/tree-cfg.cc         |  3 ++
 gcc/tree-pass.h         |  1 +
 gcc/tree-tailcall.cc    | 66 +++++++++++++++++++++++++++++++++++------
 7 files changed, 67 insertions(+), 9 deletions(-)

Comments

Richard Biener July 5, 2024, 10:38 a.m. UTC | #1
On Sat, Jun 22, 2024 at 8:59 PM Andi Kleen <ak@linux.intel.com> wrote:
>
> Enable the tailcall optimization for non optimizing builds,
> but in this case only checks calls that have the musttail attribute set.
> This makes musttail work without optimization.
>
> This is done with a new late musttail pass that is only active when
> not optimizing. The new pass relies on tree-cfg to discover musttails.
> This avoids a ~0.8% compiler run time penalty at -O0.
>
> gcc/ChangeLog:
>
>         * function.h (struct function): Add has_musttail.
>         * lto-streamer-in.cc (input_struct_function_base): Stream
>         has_musttail.
>         * lto-streamer-out.cc (output_struct_function_base): Dito.
>         * passes.def (pass_musttail): Add.
>         * tree-cfg.cc (notice_special_calls): Record has_musttail.
>         (clear_special_calls): Clear has_musttail.
>         * tree-pass.h (make_pass_musttail): Add.
>         * tree-tailcall.cc (find_tail_calls): Handle only_musttail
>           argument.
>         (tree_optimize_tail_calls_1): Pass on only_musttail.
>         (execute_tail_calls): Pass only_musttail as false.
>         (class pass_musttail): Add.
>         (make_pass_musttail): Add.
> ---
>  gcc/function.h          |  3 ++
>  gcc/lto-streamer-in.cc  |  1 +
>  gcc/lto-streamer-out.cc |  1 +
>  gcc/passes.def          |  1 +
>  gcc/tree-cfg.cc         |  3 ++
>  gcc/tree-pass.h         |  1 +
>  gcc/tree-tailcall.cc    | 66 +++++++++++++++++++++++++++++++++++------
>  7 files changed, 67 insertions(+), 9 deletions(-)
>
> diff --git a/gcc/function.h b/gcc/function.h
> index c0ba6cc1531a..fbeadeaf4104 100644
> --- a/gcc/function.h
> +++ b/gcc/function.h
> @@ -430,6 +430,9 @@ struct GTY(()) function {
>    /* Nonzero when the tail call has been identified.  */
>    unsigned int tail_call_marked : 1;
>
> +  /* Has musttail marked calls.  */
> +  unsigned int has_musttail : 1;
> +
>    /* Nonzero if the current function contains a #pragma GCC unroll.  */
>    unsigned int has_unroll : 1;
>
> diff --git a/gcc/lto-streamer-in.cc b/gcc/lto-streamer-in.cc
> index ad0ca24007a0..2e592be80823 100644
> --- a/gcc/lto-streamer-in.cc
> +++ b/gcc/lto-streamer-in.cc
> @@ -1325,6 +1325,7 @@ input_struct_function_base (struct function *fn, class data_in *data_in,
>    fn->calls_eh_return = bp_unpack_value (&bp, 1);
>    fn->has_force_vectorize_loops = bp_unpack_value (&bp, 1);
>    fn->has_simduid_loops = bp_unpack_value (&bp, 1);
> +  fn->has_musttail = bp_unpack_value (&bp, 1);
>    fn->assume_function = bp_unpack_value (&bp, 1);
>    fn->va_list_fpr_size = bp_unpack_value (&bp, 8);
>    fn->va_list_gpr_size = bp_unpack_value (&bp, 8);
> diff --git a/gcc/lto-streamer-out.cc b/gcc/lto-streamer-out.cc
> index d4f728094ed5..0be381abbd96 100644
> --- a/gcc/lto-streamer-out.cc
> +++ b/gcc/lto-streamer-out.cc
> @@ -2290,6 +2290,7 @@ output_struct_function_base (struct output_block *ob, struct function *fn)
>    bp_pack_value (&bp, fn->calls_eh_return, 1);
>    bp_pack_value (&bp, fn->has_force_vectorize_loops, 1);
>    bp_pack_value (&bp, fn->has_simduid_loops, 1);
> +  bp_pack_value (&bp, fn->has_musttail, 1);
>    bp_pack_value (&bp, fn->assume_function, 1);
>    bp_pack_value (&bp, fn->va_list_fpr_size, 8);
>    bp_pack_value (&bp, fn->va_list_gpr_size, 8);
> diff --git a/gcc/passes.def b/gcc/passes.def
> index 041229e47a68..5b5390e6ac0b 100644
> --- a/gcc/passes.def
> +++ b/gcc/passes.def
> @@ -444,6 +444,7 @@ along with GCC; see the file COPYING3.  If not see
>    NEXT_PASS (pass_tsan_O0);
>    NEXT_PASS (pass_sanopt);
>    NEXT_PASS (pass_cleanup_eh);
> +  NEXT_PASS (pass_musttail);
>    NEXT_PASS (pass_lower_resx);
>    NEXT_PASS (pass_nrv);
>    NEXT_PASS (pass_gimple_isel);
> diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc
> index 7fb7b92966be..e6fd1294b958 100644
> --- a/gcc/tree-cfg.cc
> +++ b/gcc/tree-cfg.cc
> @@ -2290,6 +2290,8 @@ notice_special_calls (gcall *call)
>      cfun->calls_alloca = true;
>    if (flags & ECF_RETURNS_TWICE)
>      cfun->calls_setjmp = true;
> +  if (gimple_call_must_tail_p (call))
> +    cfun->has_musttail = true;
>  }
>
>
> @@ -2301,6 +2303,7 @@ clear_special_calls (void)
>  {
>    cfun->calls_alloca = false;
>    cfun->calls_setjmp = false;
> +  cfun->has_musttail = false;
>  }
>
>  /* Remove PHI nodes associated with basic block BB and all edges out of BB.  */
> diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
> index edebb2be245d..59e53558034f 100644
> --- a/gcc/tree-pass.h
> +++ b/gcc/tree-pass.h
> @@ -368,6 +368,7 @@ extern gimple_opt_pass *make_pass_sra (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_sra_early (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_tail_recursion (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_tail_calls (gcc::context *ctxt);
> +extern gimple_opt_pass *make_pass_musttail (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_fix_loops (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_tree_loop (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_tree_no_loop (gcc::context *ctxt);
> diff --git a/gcc/tree-tailcall.cc b/gcc/tree-tailcall.cc
> index e9f7f8a12b3a..0c6df10e64f7 100644
> --- a/gcc/tree-tailcall.cc
> +++ b/gcc/tree-tailcall.cc
> @@ -408,10 +408,10 @@ static live_vars_map *live_vars;
>  static vec<bitmap_head> live_vars_vec;
>
>  /* Finds tailcalls falling into basic block BB. The list of found tailcalls is
> -   added to the start of RET.  */
> +   added to the start of RET. When ONLY_MUSTTAIL is set only handle musttail.  */
>
>  static void
> -find_tail_calls (basic_block bb, struct tailcall **ret)
> +find_tail_calls (basic_block bb, struct tailcall **ret, bool only_musttail)
>  {
>    tree ass_var = NULL_TREE, ret_var, func, param;
>    gimple *stmt;
> @@ -445,6 +445,9 @@ find_tail_calls (basic_block bb, struct tailcall **ret)
>        if (is_gimple_call (stmt))
>         {
>           call = as_a <gcall *> (stmt);
> +         /* Handle only musttail calls when not optimizing.  */
> +         if (only_musttail && !gimple_call_must_tail_p (call))
> +           return;
>           ass_var = gimple_call_lhs (call);
>           break;
>         }
> @@ -467,7 +470,7 @@ find_tail_calls (basic_block bb, struct tailcall **ret)
>        edge_iterator ei;
>        /* Recurse to the predecessors.  */
>        FOR_EACH_EDGE (e, ei, bb->preds)
> -       find_tail_calls (e->src, ret);
> +       find_tail_calls (e->src, ret, only_musttail);
>
>        return;
>      }
> @@ -528,7 +531,8 @@ find_tail_calls (basic_block bb, struct tailcall **ret)
>    func = gimple_call_fndecl (call);
>    if (func
>        && !fndecl_built_in_p (func)
> -      && recursive_call_p (current_function_decl, func))
> +      && recursive_call_p (current_function_decl, func)
> +      && !only_musttail)
>      {
>        tree arg;
>
> @@ -1094,10 +1098,11 @@ create_tailcall_accumulator (const char *label, basic_block bb, tree init)
>  }
>
>  /* Optimizes tail calls in the function, turning the tail recursion
> -   into iteration.  */
> +   into iteration. When ONLY_MUSTCALL is true only optimize mustcall
> +   marked calls.  */
>
>  static unsigned int
> -tree_optimize_tail_calls_1 (bool opt_tailcalls)
> +tree_optimize_tail_calls_1 (bool opt_tailcalls, bool only_mustcall)
>  {
>    edge e;
>    bool phis_constructed = false;
> @@ -1117,7 +1122,7 @@ tree_optimize_tail_calls_1 (bool opt_tailcalls)
>        /* Only traverse the normal exits, i.e. those that end with return
>          statement.  */
>        if (safe_is_a <greturn *> (*gsi_last_bb (e->src)))
> -       find_tail_calls (e->src, &tailcalls);
> +       find_tail_calls (e->src, &tailcalls, only_mustcall);
>      }
>
>    if (live_vars)
> @@ -1228,7 +1233,7 @@ gate_tail_calls (void)
>  static unsigned int
>  execute_tail_calls (void)
>  {
> -  return tree_optimize_tail_calls_1 (true);
> +  return tree_optimize_tail_calls_1 (true, false);
>  }
>
>  namespace {
> @@ -1261,7 +1266,7 @@ public:
>    bool gate (function *) final override { return gate_tail_calls (); }
>    unsigned int execute (function *) final override
>      {
> -      return tree_optimize_tail_calls_1 (false);
> +      return tree_optimize_tail_calls_1 (false, false);
>      }
>
>  }; // class pass_tail_recursion
> @@ -1312,3 +1317,46 @@ make_pass_tail_calls (gcc::context *ctxt)
>  {
>    return new pass_tail_calls (ctxt);
>  }
> +
> +namespace {
> +
> +const pass_data pass_data_musttail =
> +{
> +  GIMPLE_PASS, /* type */
> +  "musttail", /* name */
> +  OPTGROUP_NONE, /* optinfo_flags */
> +  TV_NONE, /* tv_id */
> +  ( PROP_cfg | PROP_ssa ), /* properties_required */
> +  0, /* properties_provided */
> +  0, /* properties_destroyed */
> +  0, /* todo_flags_start */
> +  0, /* todo_flags_finish */
> +};
> +
> +class pass_musttail : public gimple_opt_pass
> +{
> +public:
> +  pass_musttail (gcc::context *ctxt)
> +    : gimple_opt_pass (pass_data_musttail, ctxt)
> +  {}
> +
> +  /* opt_pass methods: */
> +  /* This pass is only used when not optimizing to make [[musttail]] still
> +     work.  */
> +  bool gate (function *) final override { return !flag_optimize_sibling_calls; }

Shouldn't this check f->has_musttail only?  That is, I would expect
-fno-optimize-sibling-calls to still tail-call [[musttail]]?  The comment says
the pass only runs when not optimizing - so maybe you wanted to do
return optimize == 0;?

Otherwise this patch looks OK.

> +  unsigned int execute (function *f) final override
> +  {
> +    if (!f->has_musttail)
> +      return 0;
> +    return tree_optimize_tail_calls_1 (true, true);
> +  }
> +
> +}; // class pass_musttail
> +
> +} // anon namespace
> +
> +gimple_opt_pass *
> +make_pass_musttail (gcc::context *ctxt)
> +{
> +  return new pass_musttail (ctxt);
> +}
> --
> 2.45.2
>
Andi Kleen July 6, 2024, 4:07 p.m. UTC | #2
> > +class pass_musttail : public gimple_opt_pass
> > +{
> > +public:
> > +  pass_musttail (gcc::context *ctxt)
> > +    : gimple_opt_pass (pass_data_musttail, ctxt)
> > +  {}
> > +
> > +  /* opt_pass methods: */
> > +  /* This pass is only used when not optimizing to make [[musttail]] still
> > +     work.  */
> > +  bool gate (function *) final override { return !flag_optimize_sibling_calls; }
> 
> Shouldn't this check f->has_musttail only?  That is, I would expect
> -fno-optimize-sibling-calls to still tail-call [[musttail]]?  The comment says
> the pass only runs when not optimizing - so maybe you wanted to do
> return optimize == 0;?

When flag_optimize_sibling_call is set the other tailcall pass will 
take care of the musttails. It is only needed when that one doesn't run.
So I think looking at that flag is correct.

But I should move the f->has_musttail check into the gate (done) and
clarified the comment because it is not specific to optimizing.

Thanks,
-Andi
Richard Biener July 8, 2024, 6:53 a.m. UTC | #3
On Sat, Jul 6, 2024 at 6:07 PM Andi Kleen <ak@linux.intel.com> wrote:
>
> > > +class pass_musttail : public gimple_opt_pass
> > > +{
> > > +public:
> > > +  pass_musttail (gcc::context *ctxt)
> > > +    : gimple_opt_pass (pass_data_musttail, ctxt)
> > > +  {}
> > > +
> > > +  /* opt_pass methods: */
> > > +  /* This pass is only used when not optimizing to make [[musttail]] still
> > > +     work.  */
> > > +  bool gate (function *) final override { return !flag_optimize_sibling_calls; }
> >
> > Shouldn't this check f->has_musttail only?  That is, I would expect
> > -fno-optimize-sibling-calls to still tail-call [[musttail]]?  The comment says
> > the pass only runs when not optimizing - so maybe you wanted to do
> > return optimize == 0;?
>
> When flag_optimize_sibling_call is set the other tailcall pass will
> take care of the musttails. It is only needed when that one doesn't run.
> So I think looking at that flag is correct.

Ah, I see.  So this pass is responsible for both -O0 and
-fno-optimized-sibling-calls.
But I'm quite sure the other pass doesn't run with -O0
-foptimize-sibling-calls, does it?

> But I should move the f->has_musttail check into the gate (done) and
> clarified the comment because it is not specific to optimizing.

Yeah, clarifying the comment would be nice.

Thanks,
Richard.

> Thanks,
> -Andi
Andi Kleen July 8, 2024, 3:22 p.m. UTC | #4
On Mon, Jul 08, 2024 at 08:53:27AM +0200, Richard Biener wrote:
> Ah, I see.  So this pass is responsible for both -O0 and
> -fno-optimized-sibling-calls.
> But I'm quite sure the other pass doesn't run with -O0
> -foptimize-sibling-calls, does it?

It does run:

./cc1 -O0 -fdump-passes -foptimize-sibling-calls t.c 2>&1 | grep tail
         tree-tailr1                                   :  ON
      tree-tailr2                                      :  ON
      tree-tailc                                       :  ON

But I suspect without the earlier expand patch to adjust the cfg rebuild it may
ICE on some of the targets.

-Andi
Richard Biener July 8, 2024, 3:27 p.m. UTC | #5
> Am 08.07.2024 um 17:22 schrieb Andi Kleen <ak@linux.intel.com>:
> 
> On Mon, Jul 08, 2024 at 08:53:27AM +0200, Richard Biener wrote:
>> Ah, I see.  So this pass is responsible for both -O0 and
>> -fno-optimized-sibling-calls.
>> But I'm quite sure the other pass doesn't run with -O0
>> -foptimize-sibling-calls, does it?
> 
> It does run:
> 
> ./cc1 -O0 -fdump-passes -foptimize-sibling-calls t.c 2>&1 | grep tail
>         tree-tailr1                                   :  ON
>      tree-tailr2                                      :  ON
>      tree-tailc                                       :  ON

I would not trust -fdump-passes, IIRC that just executes the gate functions.

> But I suspect without the earlier expand patch to adjust the cfg rebuild it may
> ICE on some of the targets.
> 
> -Andi
Andi Kleen July 8, 2024, 3:46 p.m. UTC | #6
On Mon, Jul 08, 2024 at 05:27:53PM +0200, Richard Biener wrote:
> 
> 
> > Am 08.07.2024 um 17:22 schrieb Andi Kleen <ak@linux.intel.com>:
> > 
> > On Mon, Jul 08, 2024 at 08:53:27AM +0200, Richard Biener wrote:
> >> Ah, I see.  So this pass is responsible for both -O0 and
> >> -fno-optimized-sibling-calls.
> >> But I'm quite sure the other pass doesn't run with -O0
> >> -foptimize-sibling-calls, does it?
> > 
> > It does run:
> > 
> > ./cc1 -O0 -fdump-passes -foptimize-sibling-calls t.c 2>&1 | grep tail
> >         tree-tailr1                                   :  ON
> >      tree-tailr2                                      :  ON
> >      tree-tailc                                       :  ON
> 
> I would not trust -fdump-passes, IIRC that just executes the gate functions.

You're right it's not invoked according to gdb.

So the musttail gate needs a || optimize == 0.

-Andi
diff mbox series

Patch

diff --git a/gcc/function.h b/gcc/function.h
index c0ba6cc1531a..fbeadeaf4104 100644
--- a/gcc/function.h
+++ b/gcc/function.h
@@ -430,6 +430,9 @@  struct GTY(()) function {
   /* Nonzero when the tail call has been identified.  */
   unsigned int tail_call_marked : 1;
 
+  /* Has musttail marked calls.  */
+  unsigned int has_musttail : 1;
+
   /* Nonzero if the current function contains a #pragma GCC unroll.  */
   unsigned int has_unroll : 1;
 
diff --git a/gcc/lto-streamer-in.cc b/gcc/lto-streamer-in.cc
index ad0ca24007a0..2e592be80823 100644
--- a/gcc/lto-streamer-in.cc
+++ b/gcc/lto-streamer-in.cc
@@ -1325,6 +1325,7 @@  input_struct_function_base (struct function *fn, class data_in *data_in,
   fn->calls_eh_return = bp_unpack_value (&bp, 1);
   fn->has_force_vectorize_loops = bp_unpack_value (&bp, 1);
   fn->has_simduid_loops = bp_unpack_value (&bp, 1);
+  fn->has_musttail = bp_unpack_value (&bp, 1);
   fn->assume_function = bp_unpack_value (&bp, 1);
   fn->va_list_fpr_size = bp_unpack_value (&bp, 8);
   fn->va_list_gpr_size = bp_unpack_value (&bp, 8);
diff --git a/gcc/lto-streamer-out.cc b/gcc/lto-streamer-out.cc
index d4f728094ed5..0be381abbd96 100644
--- a/gcc/lto-streamer-out.cc
+++ b/gcc/lto-streamer-out.cc
@@ -2290,6 +2290,7 @@  output_struct_function_base (struct output_block *ob, struct function *fn)
   bp_pack_value (&bp, fn->calls_eh_return, 1);
   bp_pack_value (&bp, fn->has_force_vectorize_loops, 1);
   bp_pack_value (&bp, fn->has_simduid_loops, 1);
+  bp_pack_value (&bp, fn->has_musttail, 1);
   bp_pack_value (&bp, fn->assume_function, 1);
   bp_pack_value (&bp, fn->va_list_fpr_size, 8);
   bp_pack_value (&bp, fn->va_list_gpr_size, 8);
diff --git a/gcc/passes.def b/gcc/passes.def
index 041229e47a68..5b5390e6ac0b 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -444,6 +444,7 @@  along with GCC; see the file COPYING3.  If not see
   NEXT_PASS (pass_tsan_O0);
   NEXT_PASS (pass_sanopt);
   NEXT_PASS (pass_cleanup_eh);
+  NEXT_PASS (pass_musttail);
   NEXT_PASS (pass_lower_resx);
   NEXT_PASS (pass_nrv);
   NEXT_PASS (pass_gimple_isel);
diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc
index 7fb7b92966be..e6fd1294b958 100644
--- a/gcc/tree-cfg.cc
+++ b/gcc/tree-cfg.cc
@@ -2290,6 +2290,8 @@  notice_special_calls (gcall *call)
     cfun->calls_alloca = true;
   if (flags & ECF_RETURNS_TWICE)
     cfun->calls_setjmp = true;
+  if (gimple_call_must_tail_p (call))
+    cfun->has_musttail = true;
 }
 
 
@@ -2301,6 +2303,7 @@  clear_special_calls (void)
 {
   cfun->calls_alloca = false;
   cfun->calls_setjmp = false;
+  cfun->has_musttail = false;
 }
 
 /* Remove PHI nodes associated with basic block BB and all edges out of BB.  */
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index edebb2be245d..59e53558034f 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -368,6 +368,7 @@  extern gimple_opt_pass *make_pass_sra (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_sra_early (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_tail_recursion (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_tail_calls (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_musttail (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_fix_loops (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_tree_loop (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_tree_no_loop (gcc::context *ctxt);
diff --git a/gcc/tree-tailcall.cc b/gcc/tree-tailcall.cc
index e9f7f8a12b3a..0c6df10e64f7 100644
--- a/gcc/tree-tailcall.cc
+++ b/gcc/tree-tailcall.cc
@@ -408,10 +408,10 @@  static live_vars_map *live_vars;
 static vec<bitmap_head> live_vars_vec;
 
 /* Finds tailcalls falling into basic block BB. The list of found tailcalls is
-   added to the start of RET.  */
+   added to the start of RET. When ONLY_MUSTTAIL is set only handle musttail.  */
 
 static void
-find_tail_calls (basic_block bb, struct tailcall **ret)
+find_tail_calls (basic_block bb, struct tailcall **ret, bool only_musttail)
 {
   tree ass_var = NULL_TREE, ret_var, func, param;
   gimple *stmt;
@@ -445,6 +445,9 @@  find_tail_calls (basic_block bb, struct tailcall **ret)
       if (is_gimple_call (stmt))
 	{
 	  call = as_a <gcall *> (stmt);
+	  /* Handle only musttail calls when not optimizing.  */
+	  if (only_musttail && !gimple_call_must_tail_p (call))
+	    return;
 	  ass_var = gimple_call_lhs (call);
 	  break;
 	}
@@ -467,7 +470,7 @@  find_tail_calls (basic_block bb, struct tailcall **ret)
       edge_iterator ei;
       /* Recurse to the predecessors.  */
       FOR_EACH_EDGE (e, ei, bb->preds)
-	find_tail_calls (e->src, ret);
+	find_tail_calls (e->src, ret, only_musttail);
 
       return;
     }
@@ -528,7 +531,8 @@  find_tail_calls (basic_block bb, struct tailcall **ret)
   func = gimple_call_fndecl (call);
   if (func
       && !fndecl_built_in_p (func)
-      && recursive_call_p (current_function_decl, func))
+      && recursive_call_p (current_function_decl, func)
+      && !only_musttail)
     {
       tree arg;
 
@@ -1094,10 +1098,11 @@  create_tailcall_accumulator (const char *label, basic_block bb, tree init)
 }
 
 /* Optimizes tail calls in the function, turning the tail recursion
-   into iteration.  */
+   into iteration. When ONLY_MUSTCALL is true only optimize mustcall
+   marked calls.  */
 
 static unsigned int
-tree_optimize_tail_calls_1 (bool opt_tailcalls)
+tree_optimize_tail_calls_1 (bool opt_tailcalls, bool only_mustcall)
 {
   edge e;
   bool phis_constructed = false;
@@ -1117,7 +1122,7 @@  tree_optimize_tail_calls_1 (bool opt_tailcalls)
       /* Only traverse the normal exits, i.e. those that end with return
 	 statement.  */
       if (safe_is_a <greturn *> (*gsi_last_bb (e->src)))
-	find_tail_calls (e->src, &tailcalls);
+	find_tail_calls (e->src, &tailcalls, only_mustcall);
     }
 
   if (live_vars)
@@ -1228,7 +1233,7 @@  gate_tail_calls (void)
 static unsigned int
 execute_tail_calls (void)
 {
-  return tree_optimize_tail_calls_1 (true);
+  return tree_optimize_tail_calls_1 (true, false);
 }
 
 namespace {
@@ -1261,7 +1266,7 @@  public:
   bool gate (function *) final override { return gate_tail_calls (); }
   unsigned int execute (function *) final override
     {
-      return tree_optimize_tail_calls_1 (false);
+      return tree_optimize_tail_calls_1 (false, false);
     }
 
 }; // class pass_tail_recursion
@@ -1312,3 +1317,46 @@  make_pass_tail_calls (gcc::context *ctxt)
 {
   return new pass_tail_calls (ctxt);
 }
+
+namespace {
+
+const pass_data pass_data_musttail =
+{
+  GIMPLE_PASS, /* type */
+  "musttail", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  ( PROP_cfg | PROP_ssa ), /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  0, /* todo_flags_finish */
+};
+
+class pass_musttail : public gimple_opt_pass
+{
+public:
+  pass_musttail (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_musttail, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  /* This pass is only used when not optimizing to make [[musttail]] still
+     work.  */
+  bool gate (function *) final override { return !flag_optimize_sibling_calls; }
+  unsigned int execute (function *f) final override
+  {
+    if (!f->has_musttail)
+      return 0;
+    return tree_optimize_tail_calls_1 (true, true);
+  }
+
+}; // class pass_musttail
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_musttail (gcc::context *ctxt)
+{
+  return new pass_musttail (ctxt);
+}