Initial shrink-wrapping patch

Message ID 4E5E7342.9050103@codesourcery.com
State New

Commit Message

Bernd Schmidt Aug. 31, 2011, 5:45 p.m. UTC
This is a new version of the original 4/6 shrink-wrapping patch, minus
the preliminary bits that were already approved, and with some extra bug
fixes that were discovered in the meantime. It is now mostly confined
to function.c. I'll resubmit the other parts (i.e. exposing more
shrink-wrapping opportunities through code movement and copy
propagation) later, after retesting them.

This requires the two patches I sent earlier today, for mips epilogue
unwinding and copying the frame_related bit.  Together with these, it was
bootstrapped and regression tested on i686-linux, and sim-tested on
mips64-elf.  ARM tests are still running, so I've dropped those
target-specific bits for now.

Experience with testing on mips shows that at the very least the
consistency checks in dwarf2cfi are working well.


Bernd

	* doc/tm.texi (RETURN_ADDR_REGNUM): Document.
	* doc/invoke.texi (-fshrink-wrap): Document.
	* opts.c (default_options_table): Add it.
	* common.opt (fshrink-wrap): Add.
	* function.c (emit_return_into_block): Remove useless declaration.
	(record_hard_reg_uses_1, record_hard_reg_uses, frame_required_for_rtx,
	requires_stack_frame_p, gen_return_pattern): New static functions.
	(emit_return_into_block): New arg simple_p.  All callers changed.
	Use gen_return_pattern.
	(thread_prologue_and_epilogue_insns): Implement shrink-wrapping.
	* config/i386/i386.md (returns): New code_iterator.
	(return_str, return_cond): New code_attrs.
	(<return_str>return, <return_str>return_internal,
	<return_str>return_internal_long, <return_str>return_pop_internal,
	<return_str>return_indirect_internal): New patterns, replacing...
	(return, return_internal, return_internal_long, return_pop_internal,
	return_indirect_internal): ... these.
	* config/i386/i386.c (ix86_expand_epilogue): Adjust to expand
	simple returns.
	(ix86_pad_returns): Likewise.

Comments

Richard Sandiford Sept. 1, 2011, 1:57 p.m. UTC | #1
Bernd Schmidt <bernds@codesourcery.com> writes:
> +/* A for_each_rtx subroutine of record_hard_reg_sets.  */
> +static int
> +record_hard_reg_uses_1 (rtx *px, void *data)
> +{
> +  rtx x = *px;
> +  HARD_REG_SET *pused = (HARD_REG_SET *)data;
> +
> +  if (REG_P (x) && REGNO (x) < FIRST_PSEUDO_REGISTER)
> +    {
> +      int nregs = hard_regno_nregs[REGNO (x)][GET_MODE (x)];
> +      while (nregs-- > 0)
> +	SET_HARD_REG_BIT (*pused, REGNO (x) + nregs);
> +    }

add_to_hard_reg_set (pused, GET_MODE (x), REGNO (x));
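That is, the whole body could presumably collapse to something like this
(untested sketch):

  static int
  record_hard_reg_uses_1 (rtx *px, void *data)
  {
    rtx x = *px;
    HARD_REG_SET *pused = (HARD_REG_SET *) data;

    /* add_to_hard_reg_set adds every hard register spanned by
       (REGNO (x), GET_MODE (x)), so the explicit nregs loop isn't
       needed.  */
    if (REG_P (x) && REGNO (x) < FIRST_PSEUDO_REGISTER)
      add_to_hard_reg_set (pused, GET_MODE (x), REGNO (x));
    return 0;
  }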

> +/* A subroutine of requires_stack_frame_p, called via for_each_rtx.
> +   If any change is made, set CHANGED
> +   to true.  */

Strange line break, and comment doesn't match code (no "changed" variable).
Probably moot though because:

> +/* Return true if INSN requires the stack frame to be set up.
> +   PROLOGUE_USED contains the hard registers used in the function
> +   prologue.  */
> +static bool
> +requires_stack_frame_p (rtx insn, HARD_REG_SET prologue_used)
> +{
>[...]
> +  if (for_each_rtx (&PATTERN (insn), frame_required_for_rtx, NULL))
> +    return true;
> +  CLEAR_HARD_REG_SET (hardregs);
> +  note_stores (PATTERN (insn), record_hard_reg_sets, &hardregs);
> +  if (hard_reg_set_intersect_p (hardregs, prologue_used))
> +    return true;
> +  AND_COMPL_HARD_REG_SET (hardregs, call_used_reg_set);
> +  for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
> +    if (TEST_HARD_REG_BIT (hardregs, regno)
> +	&& df_regs_ever_live_p (regno))
> +      return true;

...I suppose this ought to use DF instead.
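For what it's worth, one (untested) way the register part might look if
we walk the DF def records instead, just as a sketch:

  df_ref *def_rec;

  for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
    {
      unsigned int regno = DF_REF_REGNO (*def_rec);

      if (regno < FIRST_PSEUDO_REGISTER
          && (TEST_HARD_REG_BIT (prologue_used, regno)
              || (!call_used_regs[regno] && df_regs_ever_live_p (regno))))
        return true;
    }

The details would obviously depend on which DF information we decide to
rely on here.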

Does something guarantee that non-local gotos are marked as
needing a frame?

Why do we need to check specifically for pic_offset_table_rtx?  Isn't it
handled by the generic "live registers set by the prologue" test?
Reason for asking is that pic_offset_table_rtx can be a global value,
such as for mips*-elf.

> -/* Insert gen_return at the end of block BB.  This also means updating
> -   block_for_insn appropriately.  */
> +
> +static rtx
> +gen_return_pattern (bool simple_p)
> +{
> +#ifdef HAVE_simple_return
> +  return simple_p ? gen_simple_return () : gen_return ();
> +#else
> +  gcc_assert (!simple_p);
> +  return gen_return ();
> +#endif
> +}

Missing function comment.
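E.g. something along the lines of:

  /* Generate a return pattern: a simple_return if SIMPLE_P is true,
     otherwise an ordinary return.  */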

> +
> +/* Insert an appropriate return pattern at the end of block BB.  This
> +   also means updating block_for_insn appropriately.  */
>  
>  static void
> -emit_return_into_block (basic_block bb)
> +emit_return_into_block (bool simple_p, basic_block bb)

Pedantic, but the comment should reference SIMPLE_P.

> +#ifdef HAVE_simple_return
> +  /* Try to perform a kind of shrink-wrapping, making sure the
> +     prologue/epilogue is emitted only around those parts of the
> +     function that require it.  */
> +
> +  if (flag_shrink_wrap && HAVE_simple_return && !flag_non_call_exceptions
> +      && HAVE_prologue && !crtl->calls_eh_return)
> +    {

Maybe it should be obvious, but could you add a comment explaining why
flag_non_call_exceptions and crtl->calls_eh_return need to be checked
explicitly?  I can see why we don't want to optimise functions that
call eh_return, but I was curious why it wasn't handled by the general
insn-level analysis.

Would checking prologue_seq != NULL be better than HAVE_prologue?

> +      HARD_REG_SET prologue_clobbered, prologue_used, live_on_edge;
> +      rtx p_insn;
> +
> +      VEC(basic_block, heap) *vec;
> +      basic_block bb;
> +      bitmap_head bb_antic_flags;
> +      bitmap_head bb_on_list;
> +
> +      /* Compute the registers set and used in the prologue.  */
> +      CLEAR_HARD_REG_SET (prologue_clobbered);
> +      CLEAR_HARD_REG_SET (prologue_used);
> +      for (p_insn = prologue_seq; p_insn; p_insn = NEXT_INSN (p_insn))
> +	{
> +	  HARD_REG_SET this_used;
> +	  if (!NONDEBUG_INSN_P (p_insn))
> +	    continue;
> +
> +	  CLEAR_HARD_REG_SET (this_used);
> +	  note_uses (&PATTERN (p_insn), record_hard_reg_uses,
> +		     &this_used);
> +	  AND_COMPL_HARD_REG_SET (this_used, prologue_clobbered);
> +	  IOR_HARD_REG_SET (prologue_used, this_used);
> +	  note_stores (PATTERN (p_insn), record_hard_reg_sets,
> +		       &prologue_clobbered);
> +	}

Should this iterate over split_prologue_seq as well?

Could we combine prologue_seq and split_prologue_seq into a single sequence?

> +      bitmap_initialize (&bb_antic_flags, &bitmap_default_obstack);
> +      bitmap_initialize (&bb_on_list, &bitmap_default_obstack);
> +
> +      vec = VEC_alloc (basic_block, heap, n_basic_blocks);
> +
> +      FOR_EACH_BB (bb)
> +	{
> +	  rtx insn;
> +	  FOR_BB_INSNS (bb, insn)
> +	    {
> +	      if (requires_stack_frame_p (insn, prologue_used))
> +		{
> +		  bitmap_set_bit (&bb_flags, bb->index);
> +		  VEC_quick_push (basic_block, vec, bb);
> +		  break;
> +		}
> +	    }

Excess { and } in for loop.

> +	}
> +
> +      /* For every basic block that needs a prologue, mark all blocks
> +	 reachable from it, so as to ensure they are also seen as
> +	 requiring a prologue.  */
> +      while (!VEC_empty (basic_block, vec))
> +	{
> +	  basic_block tmp_bb = VEC_pop (basic_block, vec);
> +	  edge e;
> +	  edge_iterator ei;
> +	  FOR_EACH_EDGE (e, ei, tmp_bb->succs)
> +	    {
> +	      if (e->dest == EXIT_BLOCK_PTR
> +		  || bitmap_bit_p (&bb_flags, e->dest->index))
> +		continue;
> +	      bitmap_set_bit (&bb_flags, e->dest->index);
> +	      VEC_quick_push (basic_block, vec, e->dest);
> +	    }

Don't know which is the preferred style, but:

    if (e->dest != EXIT_BLOCK_PTR
        && bitmap_set_bit (&bb_flags, e->dest->index))
      VEC_quick_push (basic_block, vec, e->dest);

should be a little more efficient, since bitmap_set_bit returns whether
the bit was previously clear, so the membership test and the set become
a single bitmap operation.

> +      /* Now walk backwards from every block that is marked as needing
> +	 a prologue to compute the bb_antic_flags bitmap.  */
> +      bitmap_copy (&bb_antic_flags, &bb_flags);
> +      FOR_EACH_BB (bb)
> +	{
> +	  edge e;
> +	  edge_iterator ei;
> +	  if (!bitmap_bit_p (&bb_flags, bb->index))
> +	    continue;
> +	  FOR_EACH_EDGE (e, ei, bb->preds)
> +	    if (!bitmap_bit_p (&bb_antic_flags, e->src->index))
> +	      {
> +		VEC_quick_push (basic_block, vec, e->src);
> +		bitmap_set_bit (&bb_on_list, e->src->index);
> +	      }

Here too I think we want:

	  FOR_EACH_EDGE (e, ei, bb->preds)
	    if (!bitmap_bit_p (&bb_antic_flags, e->src->index)
		&& bitmap_set_bit (&bb_on_list, e->src->index))
	      VEC_quick_push (basic_block, vec, e->src);

in this case to avoid pushing the same thing twice.

> +	  bitmap_clear_bit (&bb_on_list, tmp_bb->index);
> +	  FOR_EACH_EDGE (e, ei, tmp_bb->succs)
> +	    {
> +	      if (!bitmap_bit_p (&bb_antic_flags, e->dest->index))
> +		{
> +		  all_set = false;
> +		  break;
> +		}
> +	    }

Excess { and } in for loop.

> +	  if (all_set)
> +	    {
> +	      bitmap_set_bit (&bb_antic_flags, tmp_bb->index);
> +	      FOR_EACH_EDGE (e, ei, tmp_bb->preds)
> +		if (!bitmap_bit_p (&bb_antic_flags, e->src->index))
> +		  {
> +		    VEC_quick_push (basic_block, vec, e->src);
> +		    bitmap_set_bit (&bb_on_list, e->src->index);
> +		  }
> +	    }

Same:

	  FOR_EACH_EDGE (e, ei, tmp_bb->preds)
	    if (!bitmap_bit_p (&bb_antic_flags, e->src->index)
		&& bitmap_set_bit (&bb_on_list, e->src->index))
	      VEC_quick_push (basic_block, vec, e->src);

same comment as above: it avoids pushing the same block twice.

> +      /* Test whether the prologue is known to clobber any register
> +	 (other than FP or SP) which are live on the edge.  */
> +      CLEAR_HARD_REG_SET (prologue_clobbered);
> +      for (p_insn = prologue_seq; p_insn; p_insn = NEXT_INSN (p_insn))
> +	if (NONDEBUG_INSN_P (p_insn))
> +	  note_stores (PATTERN (p_insn), record_hard_reg_sets,
> +		       &prologue_clobbered);
> +      for (p_insn = split_prologue_seq; p_insn; p_insn = NEXT_INSN (p_insn))
> +	if (NONDEBUG_INSN_P (p_insn))
> +	  note_stores (PATTERN (p_insn), record_hard_reg_sets,
> +		       &prologue_clobbered);
> +      CLEAR_HARD_REG_BIT (prologue_clobbered, STACK_POINTER_REGNUM);
> +      if (frame_pointer_needed)
> +	CLEAR_HARD_REG_BIT (prologue_clobbered, HARD_FRAME_POINTER_REGNUM);

Seems like we should be able to reuse the insn walk from the beginning
of the enclosing if statement.

> +#ifdef HAVE_simple_return
> +	  simple_p = (entry_edge != orig_entry_edge
> +		      ? !bitmap_bit_p (&bb_flags, bb->index) : false);
> +#else
> +	  simple_p = false;
> +#endif

Why is this bb_flags rather than bb_antic_flags?

	  simple_p = (entry_edge != orig_entry_edge
		      && !bitmap_bit_p (&bb_flags, bb->index));

seems more readable, but I suppose that's personal taste.

> +	      gcc_assert (simple_p);
> +	      new_bb = split_edge (e);
> +	      emit_barrier_after (BB_END (new_bb));
> +	      emit_return_into_block (simple_p, new_bb);
> +#ifdef HAVE_simple_return
> +	      if (simple_p)

Check is redundant given the gcc_assert.


> +	      /* If this block has only one successor, it both jumps
> +		 and falls through to the fallthru block, so we can't
> +		 delete the edge.  */
> +	      if (single_succ_p (bb))
> +		continue;

Seems odd that this could happen in optimised code, but if it did,
wouldn't it invalidate the simple_p transformation?  Seems like
the non-fallthrough edge would use a simple return but the
fallthrough one wouldn't.

> -      start_sequence ();
> -      emit_note (NOTE_INSN_EPILOGUE_BEG);
> -      emit_insn (gen_sibcall_epilogue ());
> -      seq = get_insns ();
> -      end_sequence ();
> +      ep_seq = gen_sibcall_epilogue ();
> +      if (ep_seq)
> +	{

Why the new null check?

Richard

Patch

Index: gcc/doc/tm.texi
===================================================================
--- gcc/doc/tm.texi	(revision 178135)
+++ gcc/doc/tm.texi	(working copy)
@@ -3208,6 +3208,12 @@  Define this if the return address of a p
 from the frame pointer of the previous stack frame.
 @end defmac
 
+@defmac RETURN_ADDR_REGNUM
+If defined, a C expression whose value is the register number of the return
+address for the current function.  Targets that pass the return address on
+the stack should not define this macro.
+@end defmac
+
 @defmac INCOMING_RETURN_ADDR_RTX
 A C expression whose value is RTL representing the location of the
 incoming return address at the beginning of any function, before the
Index: gcc/doc/invoke.texi
===================================================================
--- gcc/doc/invoke.texi	(revision 178135)
+++ gcc/doc/invoke.texi	(working copy)
@@ -395,10 +395,10 @@  Objective-C and Objective-C++ Dialects}.
 -fschedule-insns -fschedule-insns2 -fsection-anchors @gol
 -fselective-scheduling -fselective-scheduling2 @gol
 -fsel-sched-pipelining -fsel-sched-pipelining-outer-loops @gol
--fsignaling-nans -fsingle-precision-constant -fsplit-ivs-in-unroller @gol
--fsplit-wide-types -fstack-protector -fstack-protector-all @gol
--fstrict-aliasing -fstrict-overflow -fthread-jumps -ftracer @gol
--ftree-bit-ccp @gol
+-fshrink-wrap -fsignaling-nans -fsingle-precision-constant @gol
+-fsplit-ivs-in-unroller -fsplit-wide-types -fstack-protector @gol
+-fstack-protector-all -fstrict-aliasing -fstrict-overflow @gol
+-fthread-jumps -ftracer -ftree-bit-ccp @gol
 -ftree-builtin-call-dce -ftree-ccp -ftree-ch -ftree-copy-prop @gol
 -ftree-copyrename -ftree-dce -ftree-dominator-opts -ftree-dse @gol
 -ftree-forwprop -ftree-fre -ftree-loop-if-convert @gol
@@ -6872,6 +6872,12 @@  This option has no effect until one of @
 When pipelining loops during selective scheduling, also pipeline outer loops.
 This option has no effect until @option{-fsel-sched-pipelining} is turned on.
 
+@item -fshrink-wrap
+@opindex fshrink-wrap
+Emit function prologues only before parts of the function that need it,
+rather than at the top of the function.  This flag is enabled by default at
+@option{-O} and higher.
+
 @item -fcaller-saves
 @opindex fcaller-saves
 Enable values to be allocated in registers that will be clobbered by
Index: gcc/opts.c
===================================================================
--- gcc/opts.c	(revision 178135)
+++ gcc/opts.c	(working copy)
@@ -434,6 +434,7 @@  static const struct default_options defa
     { OPT_LEVELS_1_PLUS, OPT_fipa_reference, NULL, 1 },
     { OPT_LEVELS_1_PLUS, OPT_fipa_profile, NULL, 1 },
     { OPT_LEVELS_1_PLUS, OPT_fmerge_constants, NULL, 1 },
+    { OPT_LEVELS_1_PLUS, OPT_fshrink_wrap, NULL, 1 },
     { OPT_LEVELS_1_PLUS, OPT_fsplit_wide_types, NULL, 1 },
     { OPT_LEVELS_1_PLUS, OPT_ftree_ccp, NULL, 1 },
     { OPT_LEVELS_1_PLUS, OPT_ftree_bit_ccp, NULL, 1 },
Index: gcc/function.c
===================================================================
--- gcc/function.c	(revision 178135)
+++ gcc/function.c	(working copy)
@@ -147,9 +147,6 @@  extern tree debug_find_var_in_block_tree
    can always export `prologue_epilogue_contains'.  */
 static void record_insns (rtx, rtx, htab_t *) ATTRIBUTE_UNUSED;
 static bool contains (const_rtx, htab_t);
-#ifdef HAVE_return
-static void emit_return_into_block (basic_block);
-#endif
 static void prepare_function_start (void);
 static void do_clobber_return_reg (rtx, void *);
 static void do_use_return_reg (rtx, void *);
@@ -5285,6 +5282,77 @@  prologue_epilogue_contains (const_rtx in
   return 0;
 }
 
+#ifdef HAVE_simple_return
+
+/* A for_each_rtx subroutine of record_hard_reg_sets.  */
+static int
+record_hard_reg_uses_1 (rtx *px, void *data)
+{
+  rtx x = *px;
+  HARD_REG_SET *pused = (HARD_REG_SET *)data;
+
+  if (REG_P (x) && REGNO (x) < FIRST_PSEUDO_REGISTER)
+    {
+      int nregs = hard_regno_nregs[REGNO (x)][GET_MODE (x)];
+      while (nregs-- > 0)
+	SET_HARD_REG_BIT (*pused, REGNO (x) + nregs);
+    }
+  return 0;
+}
+
+/* Like record_hard_reg_sets, but called through note_uses.  */
+static void
+record_hard_reg_uses (rtx *px, void *data)
+{
+  for_each_rtx (px, record_hard_reg_uses_1, data);
+}
+
+/* A subroutine of requires_stack_frame_p, called via for_each_rtx.
+   If any change is made, set CHANGED
+   to true.  */
+
+static int
+frame_required_for_rtx (rtx *loc, void *data ATTRIBUTE_UNUSED)
+{
+  rtx x = *loc;
+  if (x == stack_pointer_rtx || x == hard_frame_pointer_rtx
+      || x == arg_pointer_rtx || x == pic_offset_table_rtx
+#ifdef RETURN_ADDR_REGNUM
+      || (REG_P (x) && REGNO (x) == RETURN_ADDR_REGNUM)
+#endif
+      )
+    return 1;
+  return 0;
+}
+
+/* Return true if INSN requires the stack frame to be set up.
+   PROLOGUE_USED contains the hard registers used in the function
+   prologue.  */
+static bool
+requires_stack_frame_p (rtx insn, HARD_REG_SET prologue_used)
+{
+  HARD_REG_SET hardregs;
+  unsigned regno;
+
+  if (!INSN_P (insn) || DEBUG_INSN_P (insn))
+    return false;
+  if (CALL_P (insn))
+    return !SIBLING_CALL_P (insn);
+  if (for_each_rtx (&PATTERN (insn), frame_required_for_rtx, NULL))
+    return true;
+  CLEAR_HARD_REG_SET (hardregs);
+  note_stores (PATTERN (insn), record_hard_reg_sets, &hardregs);
+  if (hard_reg_set_intersect_p (hardregs, prologue_used))
+    return true;
+  AND_COMPL_HARD_REG_SET (hardregs, call_used_reg_set);
+  for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+    if (TEST_HARD_REG_BIT (hardregs, regno)
+	&& df_regs_ever_live_p (regno))
+      return true;
+  return false;
+}
+#endif
+
 #ifdef HAVE_return
 /* Insert use of return register before the end of BB.  */
 
@@ -5299,45 +5367,141 @@  emit_use_return_register_into_block (bas
   emit_insn_before (seq, BB_END (bb));
 }
 
-/* Insert gen_return at the end of block BB.  This also means updating
-   block_for_insn appropriately.  */
+
+static rtx
+gen_return_pattern (bool simple_p)
+{
+#ifdef HAVE_simple_return
+  return simple_p ? gen_simple_return () : gen_return ();
+#else
+  gcc_assert (!simple_p);
+  return gen_return ();
+#endif
+}
+
+/* Insert an appropriate return pattern at the end of block BB.  This
+   also means updating block_for_insn appropriately.  */
 
 static void
-emit_return_into_block (basic_block bb)
+emit_return_into_block (bool simple_p, basic_block bb)
 {
-  rtx jump = emit_jump_insn_after (gen_return (), BB_END (bb));
-  rtx pat = PATTERN (jump);
+  rtx jump, pat;
+  jump = emit_jump_insn_after (gen_return_pattern (simple_p), BB_END (bb));
+  pat = PATTERN (jump);
   if (GET_CODE (pat) == PARALLEL)
     pat = XVECEXP (pat, 0, 0);
   gcc_assert (ANY_RETURN_P (pat));
   JUMP_LABEL (jump) = pat;
 }
-#endif /* HAVE_return */
+#endif
 
 /* Generate the prologue and epilogue RTL if the machine supports it.  Thread
    this into place with notes indicating where the prologue ends and where
-   the epilogue begins.  Update the basic block information when possible.  */
+   the epilogue begins.  Update the basic block information when possible.
+
+   Notes on epilogue placement:
+   There are several kinds of edges to the exit block:
+   * a single fallthru edge from LAST_BB
+   * possibly, edges from blocks containing sibcalls
+   * possibly, fake edges from infinite loops
+
+   The epilogue is always emitted on the fallthru edge from the last basic
+   block in the function, LAST_BB, into the exit block.
+
+   If LAST_BB is empty except for a label, it is the target of every
+   other basic block in the function that ends in a return.  If a
+   target has a return or simple_return pattern (possibly with
+   conditional variants), these basic blocks can be changed so that a
+   return insn is emitted into them, and their target is adjusted to
+   the real exit block.
+
+   Notes on shrink wrapping: We implement a fairly conservative
+   version of shrink-wrapping rather than the textbook one.  We only
+   generate a single prologue and a single epilogue.  This is
+   sufficient to catch a number of interesting cases involving early
+   exits.
+
+   First, we identify the blocks that require the prologue to occur before
+   them.  These are the ones that modify a call-saved register, or reference
+   any of the stack or frame pointer registers.  To simplify things, we then
+   mark everything reachable from these blocks as also requiring a prologue.
+   This takes care of loops automatically, and avoids the need to examine
+   whether MEMs reference the frame, since it is sufficient to check for
+   occurrences of the stack or frame pointer.
+
+   We then compute the set of blocks for which the need for a prologue
+   is anticipatable (borrowing terminology from the shrink-wrapping
+   description in Muchnick's book).  These are the blocks which either
+   require a prologue themselves, or those that have only successors
+   where the prologue is anticipatable.  The prologue needs to be
+   inserted on all edges from BB1->BB2 where BB2 is in ANTIC and BB1
+   is not.  For the moment, we ensure that only one such edge exists.
+
+   The epilogue is placed as described above, but we make a
+   distinction between inserting return and simple_return patterns
+   when modifying other blocks that end in a return.  Blocks that end
+   in a sibcall omit the sibcall_epilogue if the block is not in
+   ANTIC.  */
 
 static void
 thread_prologue_and_epilogue_insns (void)
 {
   bool inserted;
+  basic_block last_bb;
+  bool last_bb_active;
+#ifdef HAVE_simple_return
+  bool unconverted_simple_returns = false;
+  basic_block simple_return_block_hot = NULL;
+  basic_block simple_return_block_cold = NULL;
+#endif
+  rtx returnjump ATTRIBUTE_UNUSED;
   rtx seq ATTRIBUTE_UNUSED, epilogue_end ATTRIBUTE_UNUSED;
-  edge entry_edge, e;
+  rtx prologue_seq ATTRIBUTE_UNUSED, split_prologue_seq ATTRIBUTE_UNUSED;
+  edge e, entry_edge, orig_entry_edge, exit_fallthru_edge;
   edge_iterator ei;
+  bitmap_head bb_flags;
+
+  df_analyze ();
 
   rtl_profile_for_bb (ENTRY_BLOCK_PTR);
 
   inserted = false;
   seq = NULL_RTX;
+  prologue_seq = NULL_RTX;
   epilogue_end = NULL_RTX;
+  returnjump = NULL_RTX;
 
   /* Can't deal with multiple successors of the entry block at the
      moment.  Function should always have at least one entry
      point.  */
   gcc_assert (single_succ_p (ENTRY_BLOCK_PTR));
   entry_edge = single_succ_edge (ENTRY_BLOCK_PTR);
+  orig_entry_edge = entry_edge;
 
+  exit_fallthru_edge = find_fallthru_edge (EXIT_BLOCK_PTR->preds);
+  if (exit_fallthru_edge != NULL)
+    {
+      rtx label;
+
+      last_bb = exit_fallthru_edge->src;
+      /* Test whether there are active instructions in the last block.  */
+      label = BB_END (last_bb);
+      while (label && !LABEL_P (label))
+	{
+	  if (active_insn_p (label))
+	    break;
+	  label = PREV_INSN (label);
+	}
+
+      last_bb_active = BB_HEAD (last_bb) != label || !LABEL_P (label);
+    }
+  else
+    {
+      last_bb = NULL;
+      last_bb_active = false;
+    }
+
+  split_prologue_seq = NULL_RTX;
   if (flag_split_stack
       && (lookup_attribute ("no_split_stack", DECL_ATTRIBUTES (cfun->decl))
 	  == NULL))
@@ -5349,17 +5513,15 @@  thread_prologue_and_epilogue_insns (void
 
       start_sequence ();
       emit_insn (gen_split_stack_prologue ());
-      seq = get_insns ();
+      split_prologue_seq = get_insns ();
       end_sequence ();
 
-      record_insns (seq, NULL, &prologue_insn_hash);
-      set_insn_locators (seq, prologue_locator);
-
-      insert_insn_on_edge (seq, entry_edge);
-      inserted = true;
+      record_insns (split_prologue_seq, NULL, &prologue_insn_hash);
+      set_insn_locators (split_prologue_seq, prologue_locator);
 #endif
     }
 
+  prologue_seq = NULL_RTX;
 #ifdef HAVE_prologue
   if (HAVE_prologue)
     {
@@ -5382,15 +5544,196 @@  thread_prologue_and_epilogue_insns (void
       if (!targetm.profile_before_prologue () && crtl->profile)
         emit_insn (gen_blockage ());
 
-      seq = get_insns ();
+      prologue_seq = get_insns ();
       end_sequence ();
-      set_insn_locators (seq, prologue_locator);
+      set_insn_locators (prologue_seq, prologue_locator);
+    }
+#endif
 
-      insert_insn_on_edge (seq, entry_edge);
-      inserted = true;
+  bitmap_initialize (&bb_flags, &bitmap_default_obstack);
+
+#ifdef HAVE_simple_return
+  /* Try to perform a kind of shrink-wrapping, making sure the
+     prologue/epilogue is emitted only around those parts of the
+     function that require it.  */
+
+  if (flag_shrink_wrap && HAVE_simple_return && !flag_non_call_exceptions
+      && HAVE_prologue && !crtl->calls_eh_return)
+    {
+      HARD_REG_SET prologue_clobbered, prologue_used, live_on_edge;
+      rtx p_insn;
+
+      VEC(basic_block, heap) *vec;
+      basic_block bb;
+      bitmap_head bb_antic_flags;
+      bitmap_head bb_on_list;
+
+      /* Compute the registers set and used in the prologue.  */
+      CLEAR_HARD_REG_SET (prologue_clobbered);
+      CLEAR_HARD_REG_SET (prologue_used);
+      for (p_insn = prologue_seq; p_insn; p_insn = NEXT_INSN (p_insn))
+	{
+	  HARD_REG_SET this_used;
+	  if (!NONDEBUG_INSN_P (p_insn))
+	    continue;
+
+	  CLEAR_HARD_REG_SET (this_used);
+	  note_uses (&PATTERN (p_insn), record_hard_reg_uses,
+		     &this_used);
+	  AND_COMPL_HARD_REG_SET (this_used, prologue_clobbered);
+	  IOR_HARD_REG_SET (prologue_used, this_used);
+	  note_stores (PATTERN (p_insn), record_hard_reg_sets,
+		       &prologue_clobbered);
+	}
+
+      bitmap_initialize (&bb_antic_flags, &bitmap_default_obstack);
+      bitmap_initialize (&bb_on_list, &bitmap_default_obstack);
+
+      vec = VEC_alloc (basic_block, heap, n_basic_blocks);
+
+      FOR_EACH_BB (bb)
+	{
+	  rtx insn;
+	  FOR_BB_INSNS (bb, insn)
+	    {
+	      if (requires_stack_frame_p (insn, prologue_used))
+		{
+		  bitmap_set_bit (&bb_flags, bb->index);
+		  VEC_quick_push (basic_block, vec, bb);
+		  break;
+		}
+	    }
+	}
+
+      /* For every basic block that needs a prologue, mark all blocks
+	 reachable from it, so as to ensure they are also seen as
+	 requiring a prologue.  */
+      while (!VEC_empty (basic_block, vec))
+	{
+	  basic_block tmp_bb = VEC_pop (basic_block, vec);
+	  edge e;
+	  edge_iterator ei;
+	  FOR_EACH_EDGE (e, ei, tmp_bb->succs)
+	    {
+	      if (e->dest == EXIT_BLOCK_PTR
+		  || bitmap_bit_p (&bb_flags, e->dest->index))
+		continue;
+	      bitmap_set_bit (&bb_flags, e->dest->index);
+	      VEC_quick_push (basic_block, vec, e->dest);
+	    }
+	}
+      /* If the last basic block contains only a label, we'll be able
+	 to convert jumps to it to (potentially conditional) return
+	 insns later.  This means we don't necessarily need a prologue
+	 for paths reaching it.  */
+      if (last_bb)
+	{
+	  if (!last_bb_active)
+	    bitmap_clear_bit (&bb_flags, last_bb->index);
+	  else if (!bitmap_bit_p (&bb_flags, last_bb->index))
+	    goto fail_shrinkwrap;
+	}
+
+      /* Now walk backwards from every block that is marked as needing
+	 a prologue to compute the bb_antic_flags bitmap.  */
+      bitmap_copy (&bb_antic_flags, &bb_flags);
+      FOR_EACH_BB (bb)
+	{
+	  edge e;
+	  edge_iterator ei;
+	  if (!bitmap_bit_p (&bb_flags, bb->index))
+	    continue;
+	  FOR_EACH_EDGE (e, ei, bb->preds)
+	    if (!bitmap_bit_p (&bb_antic_flags, e->src->index))
+	      {
+		VEC_quick_push (basic_block, vec, e->src);
+		bitmap_set_bit (&bb_on_list, e->src->index);
+	      }
+	}
+      while (!VEC_empty (basic_block, vec))
+	{
+	  basic_block tmp_bb = VEC_pop (basic_block, vec);
+	  edge e;
+	  edge_iterator ei;
+	  bool all_set = true;
+
+	  bitmap_clear_bit (&bb_on_list, tmp_bb->index);
+	  FOR_EACH_EDGE (e, ei, tmp_bb->succs)
+	    {
+	      if (!bitmap_bit_p (&bb_antic_flags, e->dest->index))
+		{
+		  all_set = false;
+		  break;
+		}
+	    }
+	  if (all_set)
+	    {
+	      bitmap_set_bit (&bb_antic_flags, tmp_bb->index);
+	      FOR_EACH_EDGE (e, ei, tmp_bb->preds)
+		if (!bitmap_bit_p (&bb_antic_flags, e->src->index))
+		  {
+		    VEC_quick_push (basic_block, vec, e->src);
+		    bitmap_set_bit (&bb_on_list, e->src->index);
+		  }
+	    }
+	}
+      /* Find exactly one edge that leads to a block in ANTIC from
+	 a block that isn't.  */
+      if (!bitmap_bit_p (&bb_antic_flags, entry_edge->dest->index))
+	FOR_EACH_BB (bb)
+	  {
+	    if (!bitmap_bit_p (&bb_antic_flags, bb->index))
+	      continue;
+	    FOR_EACH_EDGE (e, ei, bb->preds)
+	      if (!bitmap_bit_p (&bb_antic_flags, e->src->index))
+		{
+		  if (entry_edge != orig_entry_edge)
+		    {
+		      entry_edge = orig_entry_edge;
+		      goto fail_shrinkwrap;
+		    }
+		  entry_edge = e;
+		}
+	  }
+
+      /* Test whether the prologue is known to clobber any register
+	 (other than FP or SP) which are live on the edge.  */
+      CLEAR_HARD_REG_SET (prologue_clobbered);
+      for (p_insn = prologue_seq; p_insn; p_insn = NEXT_INSN (p_insn))
+	if (NONDEBUG_INSN_P (p_insn))
+	  note_stores (PATTERN (p_insn), record_hard_reg_sets,
+		       &prologue_clobbered);
+      for (p_insn = split_prologue_seq; p_insn; p_insn = NEXT_INSN (p_insn))
+	if (NONDEBUG_INSN_P (p_insn))
+	  note_stores (PATTERN (p_insn), record_hard_reg_sets,
+		       &prologue_clobbered);
+      CLEAR_HARD_REG_BIT (prologue_clobbered, STACK_POINTER_REGNUM);
+      if (frame_pointer_needed)
+	CLEAR_HARD_REG_BIT (prologue_clobbered, HARD_FRAME_POINTER_REGNUM);
+      CLEAR_HARD_REG_SET (live_on_edge);
+      reg_set_to_hard_reg_set (&live_on_edge,
+			       df_get_live_in (entry_edge->dest));
+      if (hard_reg_set_intersect_p (live_on_edge, prologue_clobbered))
+	entry_edge = orig_entry_edge;
+
+    fail_shrinkwrap:
+      bitmap_clear (&bb_antic_flags);
+      bitmap_clear (&bb_on_list);
+      VEC_free (basic_block, heap, vec);
     }
 #endif
 
+  if (split_prologue_seq != NULL_RTX)
+    {
+      insert_insn_on_edge (split_prologue_seq, entry_edge);
+      inserted = true;
+    }
+  if (prologue_seq != NULL_RTX)
+    {
+      insert_insn_on_edge (prologue_seq, entry_edge);
+      inserted = true;
+    }
+
   /* If the exit block has no non-fake predecessors, we don't need
      an epilogue.  */
   FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
@@ -5400,110 +5743,148 @@  thread_prologue_and_epilogue_insns (void
     goto epilogue_done;
 
   rtl_profile_for_bb (EXIT_BLOCK_PTR);
+
 #ifdef HAVE_return
-  if (optimize && HAVE_return)
+  /* If we're allowed to generate a simple return instruction, then by
+     definition we don't need a full epilogue.  If the last basic
+     block before the exit block does not contain active instructions,
+     examine its predecessors and try to emit (conditional) return
+     instructions.  */
+  if (optimize && !last_bb_active
+      && (HAVE_return || entry_edge != orig_entry_edge))
     {
-      /* If we're allowed to generate a simple return instruction,
-	 then by definition we don't need a full epilogue.  Examine
-	 the block that falls through to EXIT.   If it does not
-	 contain any code, examine its predecessors and try to
-	 emit (conditional) return instructions.  */
-
-      basic_block last;
+      edge_iterator ei2;
+      int i;
+      basic_block bb;
       rtx label;
+      VEC(basic_block,heap) *src_bbs;
 
-      e = find_fallthru_edge (EXIT_BLOCK_PTR->preds);
-      if (e == NULL)
+      if (exit_fallthru_edge == NULL)
 	goto epilogue_done;
-      last = e->src;
+      label = BB_HEAD (last_bb);
 
-      /* Verify that there are no active instructions in the last block.  */
-      label = BB_END (last);
-      while (label && !LABEL_P (label))
-	{
-	  if (active_insn_p (label))
-	    break;
-	  label = PREV_INSN (label);
-	}
+      src_bbs = VEC_alloc (basic_block, heap, EDGE_COUNT (last_bb->preds));
+      FOR_EACH_EDGE (e, ei2, last_bb->preds)
+	if (e->src != ENTRY_BLOCK_PTR)
+	  VEC_quick_push (basic_block, src_bbs, e->src);
 
-      if (BB_HEAD (last) == label && LABEL_P (label))
+      FOR_EACH_VEC_ELT (basic_block, src_bbs, i, bb)
 	{
-	  edge_iterator ei2;
+	  bool simple_p;
+	  rtx jump;
+	  e = find_edge (bb, last_bb);
 
-	  for (ei2 = ei_start (last->preds); (e = ei_safe_edge (ei2)); )
-	    {
-	      basic_block bb = e->src;
-	      rtx jump;
+	  jump = BB_END (bb);
 
-	      if (bb == ENTRY_BLOCK_PTR)
-		{
-		  ei_next (&ei2);
-		  continue;
-		}
+#ifdef HAVE_simple_return
+	  simple_p = (entry_edge != orig_entry_edge
+		      ? !bitmap_bit_p (&bb_flags, bb->index) : false);
+#else
+	  simple_p = false;
+#endif
 
-	      jump = BB_END (bb);
-	      if (!JUMP_P (jump) || JUMP_LABEL (jump) != label)
-		{
-		  ei_next (&ei2);
-		  continue;
-		}
+	  if (!simple_p
+	      && (!HAVE_return || !JUMP_P (jump)
+		  || JUMP_LABEL (jump) != label))
+	    continue;
 
-	      /* If we have an unconditional jump, we can replace that
-		 with a simple return instruction.  */
-	      if (simplejump_p (jump))
-		{
-		  /* The use of the return register might be present in the exit
-		     fallthru block.  Either:
-		     - removing the use is safe, and we should remove the use in
-		       the exit fallthru block, or
-		     - removing the use is not safe, and we should add it here.
-		     For now, we conservatively choose the latter.  Either of the
-		     2 helps in crossjumping.  */
-		  emit_use_return_register_into_block (bb);
+	  /* If we have an unconditional jump, we can replace that
+	     with a simple return instruction.  */
+	  if (!JUMP_P (jump))
+	    {
+	      emit_barrier_after (BB_END (bb));
+	      emit_return_into_block (simple_p, bb);
+	    }
+	  else if (simplejump_p (jump))
+	    {
+	      /* The use of the return register might be present in the exit
+	         fallthru block.  Either:
+	         - removing the use is safe, and we should remove the use in
+	           the exit fallthru block, or
+	         - removing the use is not safe, and we should add it here.
+	         For now, we conservatively choose the latter.  Either of the
+	         2 helps in crossjumping.  */
+	      emit_use_return_register_into_block (bb);
 
-		  emit_return_into_block (bb);
-		  delete_insn (jump);
-		}
+	      emit_return_into_block (simple_p, bb);
+	      delete_insn (jump);
+	    }
+	  else if (condjump_p (jump) && JUMP_LABEL (jump) != label)
+	    {
+	      basic_block new_bb;
+	      edge new_e;
 
-	      /* If we have a conditional jump, we can try to replace
-		 that with a conditional return instruction.  */
-	      else if (condjump_p (jump))
+	      gcc_assert (simple_p);
+	      new_bb = split_edge (e);
+	      emit_barrier_after (BB_END (new_bb));
+	      emit_return_into_block (simple_p, new_bb);
+#ifdef HAVE_simple_return
+	      if (simple_p)
 		{
-		  if (! redirect_jump (jump, ret_rtx, 0))
-		    {
-		      ei_next (&ei2);
-		      continue;
-		    }
-
-		  /* See comment in simple_jump_p case above.  */
-		  emit_use_return_register_into_block (bb);
-
-		  /* If this block has only one successor, it both jumps
-		     and falls through to the fallthru block, so we can't
-		     delete the edge.  */
-		  if (single_succ_p (bb))
-		    {
-		      ei_next (&ei2);
-		      continue;
-		    }
+		  if (BB_PARTITION (new_bb) == BB_HOT_PARTITION)
+		    simple_return_block_hot = new_bb;
+		  else
+		    simple_return_block_cold = new_bb;
 		}
+#endif
+	      new_e = single_succ_edge (new_bb);
+	      redirect_edge_succ (new_e, EXIT_BLOCK_PTR);
+
+	      continue;
+	    }
+	  /* If we have a conditional jump branching to the last
+	     block, we can try to replace that with a conditional
+	     return instruction.  */
+	  else if (condjump_p (jump))
+	    {
+	      rtx dest;
+	      if (simple_p)
+		dest = simple_return_rtx;
 	      else
+		dest = ret_rtx;
+	      if (! redirect_jump (jump, dest, 0))
 		{
-		  ei_next (&ei2);
+#ifdef HAVE_simple_return
+		  if (simple_p)
+		    unconverted_simple_returns = true;
+#endif
 		  continue;
 		}
 
-	      /* Fix up the CFG for the successful change we just made.  */
-	      redirect_edge_succ (e, EXIT_BLOCK_PTR);
+	      /* See comment in simple_jump_p case above.  */
+	      emit_use_return_register_into_block (bb);
+
+	      /* If this block has only one successor, it both jumps
+		 and falls through to the fallthru block, so we can't
+		 delete the edge.  */
+	      if (single_succ_p (bb))
+		continue;
+	    }
+	  else
+	    {
+#ifdef HAVE_simple_return
+	      if (simple_p)
+		unconverted_simple_returns = true;
+#endif
+	      continue;
 	    }
 
+	  /* Fix up the CFG for the successful change we just made.  */
+	  redirect_edge_succ (e, EXIT_BLOCK_PTR);
+	}
+      VEC_free (basic_block, heap, src_bbs);
+
+      if (HAVE_return)
+	{
 	  /* Emit a return insn for the exit fallthru block.  Whether
 	     this is still reachable will be determined later.  */
 
-	  emit_barrier_after (BB_END (last));
-	  emit_return_into_block (last);
-	  epilogue_end = BB_END (last);
-	  single_succ_edge (last)->flags &= ~EDGE_FALLTHRU;
+	  emit_barrier_after (BB_END (last_bb));
+	  emit_return_into_block (false, last_bb);
+	  epilogue_end = BB_END (last_bb);
+	  if (JUMP_P (epilogue_end))
+	    JUMP_LABEL (epilogue_end) = ret_rtx;
+	  single_succ_edge (last_bb)->flags &= ~EDGE_FALLTHRU;
 	  goto epilogue_done;
 	}
     }
@@ -5540,20 +5921,15 @@  thread_prologue_and_epilogue_insns (void
     }
 #endif
 
-  /* Find the edge that falls through to EXIT.  Other edges may exist
-     due to RETURN instructions, but those don't need epilogues.
-     There really shouldn't be a mixture -- either all should have
-     been converted or none, however...  */
+  /* If nothing falls through into the exit block, we don't need an
+     epilogue.  */
 
-  e = find_fallthru_edge (EXIT_BLOCK_PTR->preds);
-  if (e == NULL)
+  if (exit_fallthru_edge == NULL)
     goto epilogue_done;
 
 #ifdef HAVE_epilogue
   if (HAVE_epilogue)
     {
-      rtx returnjump;
-
       start_sequence ();
       epilogue_end = emit_note (NOTE_INSN_EPILOGUE_BEG);
       seq = gen_epilogue ();
@@ -5564,11 +5940,11 @@  thread_prologue_and_epilogue_insns (void
       record_insns (seq, NULL, &epilogue_insn_hash);
       set_insn_locators (seq, epilogue_locator);
 
-      returnjump = get_last_insn ();
       seq = get_insns ();
+      returnjump = get_last_insn ();
       end_sequence ();
 
-      insert_insn_on_edge (seq, e);
+      insert_insn_on_edge (seq, exit_fallthru_edge);
       inserted = true;
 
       if (JUMP_P (returnjump))
@@ -5581,23 +5957,21 @@  thread_prologue_and_epilogue_insns (void
 	  else
 	    JUMP_LABEL (returnjump) = ret_rtx;
 	}
-      else
-	returnjump = NULL_RTX;
     }
   else
 #endif
     {
       basic_block cur_bb;
 
-      if (! next_active_insn (BB_END (e->src)))
+      if (! next_active_insn (BB_END (exit_fallthru_edge->src)))
 	goto epilogue_done;
       /* We have a fall-through edge to the exit block, the source is not
          at the end of the function, and there will be an assembler epilogue
          at the end of the function.
          We can't use force_nonfallthru here, because that would try to
-         use return.  Inserting a jump 'by hand' is extremely messy, so
+	 use return.  Inserting a jump 'by hand' is extremely messy, so
 	 we take advantage of cfg_layout_finalize using
-	fixup_fallthru_exit_predecessor.  */
+	 fixup_fallthru_exit_predecessor.  */
       cfg_layout_initialize (0);
       FOR_EACH_BB (cur_bb)
 	if (cur_bb->index >= NUM_FIXED_BLOCKS
@@ -5607,6 +5981,7 @@  thread_prologue_and_epilogue_insns (void
     }
 
 epilogue_done:
+
   default_rtl_profile ();
 
   if (inserted)
@@ -5632,33 +6007,102 @@  epilogue_done:
 	}
     }
 
+#ifdef HAVE_simple_return
+  /* If there were branches to an empty LAST_BB which we tried to
+     convert to conditional simple_returns, but couldn't for some
+     reason, create a block to hold a simple_return insn and redirect
+     those remaining edges.  */
+  if (unconverted_simple_returns)
+    {
+      edge_iterator ei2;
+      basic_block exit_pred = EXIT_BLOCK_PTR->prev_bb;
+
+      gcc_assert (entry_edge != orig_entry_edge);
+
+      /* See if we can reuse the last insn that was emitted for the
+	 epilogue.  */
+      if (returnjump != NULL_RTX
+	  && JUMP_LABEL (returnjump) == simple_return_rtx)
+	{
+	  edge e = split_block (exit_fallthru_edge->src,
+				PREV_INSN (returnjump));
+	  if (BB_PARTITION (e->src) == BB_HOT_PARTITION)
+	    simple_return_block_hot = e->dest;
+	  else
+	    simple_return_block_cold = e->dest;
+	}
+
+    restart_scan:
+      for (ei2 = ei_start (last_bb->preds); (e = ei_safe_edge (ei2)); )
+	{
+	  basic_block bb = e->src;
+	  basic_block *pdest_bb;
+
+	  if (bb == ENTRY_BLOCK_PTR
+	      || bitmap_bit_p (&bb_flags, bb->index))
+	    {
+	      ei_next (&ei2);
+	      continue;
+	    }
+	  if (BB_PARTITION (e->src) == BB_HOT_PARTITION)
+	    pdest_bb = &simple_return_block_hot;
+	  else
+	    pdest_bb = &simple_return_block_cold;
+	  if (*pdest_bb == NULL)
+	    {
+	      basic_block bb;
+	      rtx start;
+
+	      bb = create_basic_block (NULL, NULL, exit_pred);
+	      BB_COPY_PARTITION (bb, e->src);
+	      start = emit_jump_insn_after (gen_simple_return (),
+					    BB_END (bb));
+	      JUMP_LABEL (start) = simple_return_rtx;
+	      emit_barrier_after (start);
+
+	      *pdest_bb = bb;
+	      make_edge (bb, EXIT_BLOCK_PTR, 0);
+	    }
+	  redirect_edge_and_branch_force (e, *pdest_bb);
+	  goto restart_scan;
+	}
+    }
+#endif
+
 #ifdef HAVE_sibcall_epilogue
   /* Emit sibling epilogues before any sibling call sites.  */
   for (ei = ei_start (EXIT_BLOCK_PTR->preds); (e = ei_safe_edge (ei)); )
     {
       basic_block bb = e->src;
       rtx insn = BB_END (bb);
+      rtx ep_seq;
 
       if (!CALL_P (insn)
-	  || ! SIBLING_CALL_P (insn))
+	  || ! SIBLING_CALL_P (insn)
+	  || (entry_edge != orig_entry_edge
+	      && !bitmap_bit_p (&bb_flags, bb->index)))
 	{
 	  ei_next (&ei);
 	  continue;
 	}
 
-      start_sequence ();
-      emit_note (NOTE_INSN_EPILOGUE_BEG);
-      emit_insn (gen_sibcall_epilogue ());
-      seq = get_insns ();
-      end_sequence ();
+      ep_seq = gen_sibcall_epilogue ();
+      if (ep_seq)
+	{
+	  start_sequence ();
+	  emit_note (NOTE_INSN_EPILOGUE_BEG);
+	  emit_insn (ep_seq);
+	  seq = get_insns ();
+	  end_sequence ();
 
-      /* Retain a map of the epilogue insns.  Used in life analysis to
-	 avoid getting rid of sibcall epilogue insns.  Do this before we
-	 actually emit the sequence.  */
-      record_insns (seq, NULL, &epilogue_insn_hash);
-      set_insn_locators (seq, epilogue_locator);
+	  /* Retain a map of the epilogue insns.  Used in life analysis to
+	     avoid getting rid of sibcall epilogue insns.  Do this before we
+	     actually emit the sequence.  */
+	  record_insns (seq, NULL, &epilogue_insn_hash);
+	  set_insn_locators (seq, epilogue_locator);
 
-      emit_insn_before (seq, insn);
+	  emit_insn_before (seq, insn);
+	}
       ei_next (&ei);
     }
 #endif
@@ -5683,6 +6127,8 @@  epilogue_done:
     }
 #endif
 
+  bitmap_clear (&bb_flags);
+
   /* Threading the prologue and epilogue changes the artificial refs
      in the entry and exit blocks.  */
   epilogue_completed = 1;
Index: gcc/common.opt
===================================================================
--- gcc/common.opt	(revision 178135)
+++ gcc/common.opt	(working copy)
@@ -1768,6 +1768,11 @@  fshow-column
 Common Report Var(flag_show_column) Init(1)
 Show column numbers in diagnostics, when available.  Default on
 
+fshrink-wrap
+Common Report Var(flag_shrink_wrap) Optimization
+Emit function prologues only before parts of the function that need it,
+rather than at the top of the function.
+
 fsignaling-nans
 Common Report Var(flag_signaling_nans) Optimization SetByCombined
 Disable optimizations observable by IEEE signaling NaNs
Index: gcc/config/i386/i386.md
===================================================================
--- gcc/config/i386/i386.md	(revision 178135)
+++ gcc/config/i386/i386.md	(working copy)
@@ -11686,24 +11686,29 @@  (define_insn "prologue_use"
   ""
   [(set_attr "length" "0")])
 
+(define_code_iterator returns [return simple_return])
+(define_code_attr return_str [(return "") (simple_return "simple_")])
+(define_code_attr return_cond [(return "ix86_can_use_return_insn_p ()")
+			       (simple_return "")])
+
 ;; Insn emitted into the body of a function to return from a function.
 ;; This is only done if the function's epilogue is known to be simple.
 ;; See comments for ix86_can_use_return_insn_p in i386.c.
 
-(define_expand "return"
-  [(return)]
-  "ix86_can_use_return_insn_p ()"
+(define_expand "<return_str>return"
+  [(returns)]
+  "<return_cond>"
 {
   if (crtl->args.pops_args)
     {
       rtx popc = GEN_INT (crtl->args.pops_args);
-      emit_jump_insn (gen_return_pop_internal (popc));
+      emit_jump_insn (gen_<return_str>return_pop_internal (popc));
       DONE;
     }
 })
 
-(define_insn "return_internal"
-  [(return)]
+(define_insn "<return_str>return_internal"
+  [(returns)]
   "reload_completed"
   "ret"
   [(set_attr "length" "1")
@@ -11714,8 +11719,8 @@  (define_insn "return_internal"
 ;; Used by x86_machine_dependent_reorg to avoid penalty on single byte RET
 ;; instruction Athlon and K8 have.
 
-(define_insn "return_internal_long"
-  [(return)
+(define_insn "<return_str>return_internal_long"
+  [(returns)
    (unspec [(const_int 0)] UNSPEC_REP)]
   "reload_completed"
   "rep\;ret"
@@ -11725,8 +11730,8 @@  (define_insn "return_internal_long"
    (set_attr "prefix_rep" "1")
    (set_attr "modrm" "0")])
 
-(define_insn "return_pop_internal"
-  [(return)
+(define_insn "<return_str>return_pop_internal"
+  [(returns)
    (use (match_operand:SI 0 "const_int_operand" ""))]
   "reload_completed"
   "ret\t%0"
@@ -11735,8 +11740,8 @@  (define_insn "return_pop_internal"
    (set_attr "length_immediate" "2")
    (set_attr "modrm" "0")])
 
-(define_insn "return_indirect_internal"
-  [(return)
+(define_insn "<return_str>return_indirect_internal"
+  [(returns)
    (use (match_operand:SI 0 "register_operand" "r"))]
   "reload_completed"
   "jmp\t%A0"
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c	(revision 178135)
+++ gcc/config/i386/i386.c	(working copy)
@@ -10754,13 +10754,13 @@  ix86_expand_epilogue (int style)
 
 	  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
 				     popc, -1, true);
-	  emit_jump_insn (gen_return_indirect_internal (ecx));
+	  emit_jump_insn (gen_simple_return_indirect_internal (ecx));
 	}
       else
-	emit_jump_insn (gen_return_pop_internal (popc));
+	emit_jump_insn (gen_simple_return_pop_internal (popc));
     }
   else
-    emit_jump_insn (gen_return_internal ());
+    emit_jump_insn (gen_simple_return_internal ());
 
   /* Restore the state back to the state from the prologue,
      so that it's correct for the next epilogue.  */
@@ -30615,7 +30615,10 @@  ix86_pad_returns (void)
 	}
       if (replace)
 	{
-	  emit_jump_insn_before (gen_return_internal_long (), ret);
+	  if (PATTERN (ret) == ret_rtx)
+	    emit_jump_insn_before (gen_return_internal_long (), ret);
+	  else
+	    emit_jump_insn_before (gen_simple_return_internal_long (), ret);
 	  delete_insn (ret);
 	}
     }