diff mbox

PATCH: Pad short functions for Atom

Message ID AANLkTimgbTto91dDEc_H0hEFXBTphSCK6yLE0YbKqtGU@mail.gmail.com
State New
Headers show

Commit Message

H.J. Lu Sept. 16, 2010, 8:55 p.m. UTC
On Thu, Sep 16, 2010 at 1:38 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Thu, Sep 16, 2010 at 1:13 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>> On Thu, Sep 16, 2010 at 11:19 AM, Richard Henderson <rth@redhat.com> wrote:
>>> On 09/16/2010 10:52 AM, H.J. Lu wrote:
>>>> On Thu, Sep 16, 2010 at 8:47 AM, Richard Henderson <rth@redhat.com> wrote:
>>>>> On 09/16/2010 08:34 AM, H.J. Lu wrote:
>>>>>>> Surely this is overkill.  Why not assume that any function
>>>>>>> with more than a single basic block is not short?  You can
>>>>>>> then significantly simplify these two functions.
>>> ...
>>>> There are 3 basic blocks. But one code path has only 3 instructions.
>>>>
>>>>       testl   %edi, %edi
>>>>       movl    %edi, %eax
>>>>       je      .L2
>>>> ...
>>>> .L2:
>>>>        ret
>>>>
>>>> I want to check if any code path with fewer than 3 basic blocks
>>>> has fewer than 4 instructions.  How can I do it?
>>>
>>> As I said: you simply ignore that case.
>>>
>>> Honestly, is this *really* worth it?  As far as I can see,
>>> it's a large complication to the code for what looks like
>>> zero gain.  It's not like you're truly counting cycles,
>>> you're just guessing based on insn counts.
>>>
>>
>> I just want to make sure that I should also ignore
>>
>> ---
>> extern void bar ();
>>
>> void
>> foo (int x)
>> {
>>  if (x)
>>    bar ();
>> }
>> ----
>>
>> foo:
>>        testl   %edi, %edi
>>        jne     .L4
>>        ret
>> .L4:
>>        xorl    %eax, %eax
>>        jmp     bar
>>
>>
>
> Padding 4 nops speeds it up by 37%.
>
>

Here is the updated patch just for reference. I can remove
the extra optimization by removing one FOR_EACH_EDGE.

Thanks.

Comments

Jakub Jelinek Sept. 16, 2010, 9:05 p.m. UTC | #1
On Thu, Sep 16, 2010 at 01:55:24PM -0700, H.J. Lu wrote:
> @@ -8024,6 +8027,11 @@ ix86_code_end (void)
>  
>        xops[0] = gen_rtx_REG (Pmode, regno);
>        xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
> +      /* Pad stack IP move with 4 instructions.  2 NOPs count as 1
> +         instruction.  */
> +      if (TARGET_PAD_SHORT_FUNCTION)
> +	output_asm_insn ("nop; nop; nop; nop; nop; nop; nop; nop",
> +			 xops);
>        output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
>        output_asm_insn ("ret", xops);
>        final_end_function ();

Doesn't the movl %esp, %ebx before ret count as one (or half) insn,
thus wouldn't it be enough to have just 6 or 7 nops instead of 8?

	Jakub
H.J. Lu Sept. 16, 2010, 9:55 p.m. UTC | #2
On Thu, Sep 16, 2010 at 2:05 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Thu, Sep 16, 2010 at 01:55:24PM -0700, H.J. Lu wrote:
>> @@ -8024,6 +8027,11 @@ ix86_code_end (void)
>>
>>        xops[0] = gen_rtx_REG (Pmode, regno);
>>        xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
>> +      /* Pad stack IP move with 4 instructions.  2 NOPs count as 1
>> +         instruction.  */
>> +      if (TARGET_PAD_SHORT_FUNCTION)
>> +     output_asm_insn ("nop; nop; nop; nop; nop; nop; nop; nop",
>> +                      xops);
>>        output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
>>        output_asm_insn ("ret", xops);
>>        final_end_function ();
>
> Doesn't the movl %esp, %ebx before ret count as one (or half) insn,
> thus wouldn't it be enough to have just 6 or 7 nops instead of 8?
>

It is a load from the stack, not a register-to-register move:

__i686.get_pc_thunk.bx:
	movl	(%esp), %ebx <<=== Take return address
	ret

The return address won't be ready within 4 cycles.
Richard Henderson Sept. 17, 2010, 7:27 p.m. UTC | #3
> +  int insn_count = ix86_count_insn_bb (bb);
> +  int min_insn_count;
> +
> +  if (insn_count >= 4)
> +    return insn_count;
> +
> +  /* This block has less than 4 instructions.  Count predecessor
> +     edges of this block.  */
> +  min_insn_count = insn_count;
> +  FOR_EACH_EDGE (e, ei, bb->preds)

So this is the exit block we're looking at now...

> +      int count = insn_count + ix86_count_insn_bb (e->src);

(You may be counting insns in ENTRY_BLOCK here.)

> +
> +      if (count < 4)
> +	{
> +	  /* This block plus its predecessor have less than 4
> +	     instructions.  Check predecessor edges.  */
> +	  edge prev_e;
> +	  edge_iterator prev_ei;
> +	  int old_count = count;
> +	  bool has_prev_bb = false;
> +
> +	  count = 4;
> +	  FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
> +	    {
> +	      /* Check if the predecessor is entry point.  Not a short
> +		 function if it has more than 2 basic blocks.  */
> +	      if (prev_e->src == ENTRY_BLOCK_PTR)

... and this is the predecessor.

I suspect that it would be quicker to verify the CFG form
before counting instructions.  At least it'll be clearer.
E.g.

  /* Only bother counting instructions along paths with no
     more than 2 basic blocks between entry and exit.  Given
     that BB has an edge to exit, determine if a predecessor
     of BB has an edge from entry.  If so, compute the number
     of instructions in the predecessor block.  If there 
     happen to be multiple such blocks, compute the minimum.  */
  min_prev_count = 4;
  FOR_EACH_EDGE (e, ei, bb->preds)
    {
      if (e->src == ENTRY_BLOCK_PTR)
	{
	  min_prev_count = 0;
	  break;
 	}
      FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
 	{
	  if (prev_e->src == ENTRY_BLOCK_PTR)
	    {
	      c = ix86_count_insn_bb (e->src);
	      if (c < min_prev_count)
 		min_prev_count = c;
	      break;		
	    }
	}

  if (min_prev_count < 4)
    min_prev_count += ix86_count_insn_bb (bb);
  return min_prev_count;



r~
diff mbox

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index dcf8875..260f65d 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1576,6 +1576,9 @@  static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
   /* X86_TUNE_PAD_RETURNS */
   m_AMD_MULTIPLE | m_CORE2 | m_GENERIC,
 
+  /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short function.  */
+  m_ATOM,
+
   /* X86_TUNE_EXT_80387_CONSTANTS */
   m_K6_GEODE | m_ATHLON_K8 | m_ATOM | m_PENT4 | m_NOCONA | m_PPRO
   | m_CORE2 | m_GENERIC,
@@ -8024,6 +8027,11 @@  ix86_code_end (void)
 
       xops[0] = gen_rtx_REG (Pmode, regno);
       xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
+      /* Pad stack IP move with 4 instructions.  2 NOPs count as 1
+         instruction.  */
+      if (TARGET_PAD_SHORT_FUNCTION)
+	output_asm_insn ("nop; nop; nop; nop; nop; nop; nop; nop",
+			 xops);
       output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
       output_asm_insn ("ret", xops);
       final_end_function ();
@@ -27885,6 +27893,131 @@  ix86_pad_returns (void)
     }
 }
 
+/* Count the minimum number of instructions in BB.  Return 4 if the
+   number of instructions >= 4.  */
+
+static int 
+ix86_count_insn_bb (basic_block bb)
+{
+  rtx insn;
+  int insn_count = 0;
+
+  /* Count number of instructions in this block.  Return 4 if the number
+     of instructions >= 4.  */
+  FOR_BB_INSNS (bb, insn)
+    {
+      /* This only happens in exit blocks.  */
+      if (JUMP_P (insn)
+	  && GET_CODE (PATTERN (insn)) == RETURN)
+	break;
+
+      if (NONDEBUG_INSN_P (insn)
+	  && GET_CODE (PATTERN (insn)) != USE
+	  && GET_CODE (PATTERN (insn)) != CLOBBER)
+	{
+	  insn_count++;
+	  if (insn_count >= 4)
+	    return insn_count;
+	}
+    }
+
+  return insn_count;
+}
+
+
+/* Count the minimum number of instructions on a code path ending in BB.
+   Any result >= 4 means that BB needs no padding.  */
+
+static int 
+ix86_count_insn (basic_block bb)
+{
+  edge e;
+  edge_iterator ei;
+  int insn_count = ix86_count_insn_bb (bb);
+  int min_insn_count;
+
+  if (insn_count >= 4)
+    return insn_count;
+
+  /* This block has less than 4 instructions.  Count predecessor
+     edges of this block.  */
+  min_insn_count = insn_count;
+  FOR_EACH_EDGE (e, ei, bb->preds)
+    {
+      int count = insn_count + ix86_count_insn_bb (e->src);
+
+      if (count < 4)
+	{
+	  /* This block plus its predecessor have less than 4
+	     instructions.  Check predecessor edges.  */
+	  edge prev_e;
+	  edge_iterator prev_ei;
+	  int old_count = count;
+	  bool has_prev_bb = false;
+
+	  count = 4;
+	  FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
+	    {
+	      /* Check if the predecessor is entry point.  Not a short
+		 function if it has more than 2 basic blocks.  */
+	      if (prev_e->src == ENTRY_BLOCK_PTR)
+		{
+		  has_prev_bb = false;
+		  break;
+		}
+	      has_prev_bb = true;
+	    }
+
+	  if (!has_prev_bb)
+	    count = old_count;
+	}
+
+      if (min_insn_count == insn_count)
+	min_insn_count = count;
+      else if (count < min_insn_count)
+	min_insn_count = count;
+    }
+
+  return min_insn_count;
+}
+
+/* Pad short function to 4 instructions.  */
+
+static void
+ix86_pad_short_function (void)
+{
+  edge e;
+  edge_iterator ei;
+
+  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
+    {
+      rtx ret = BB_END (e->src);
+      if (JUMP_P (ret) && GET_CODE (PATTERN (ret)) == RETURN)
+	{
+	  int insn_count = ix86_count_insn (e->src);
+
+	  /* Pad short function.  */
+	  if (insn_count < 4)
+	    {
+	      rtx insn = ret;
+
+	      /* Find epilogue.  */
+	      while (insn
+		     && (!NOTE_P (insn)
+			 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
+		insn = PREV_INSN (insn);
+
+	      if (!insn)
+		insn = ret;
+
+	      /* Two NOPs are counted as one instruction.  */
+	      insn_count = 2 * (4  - insn_count);
+	      emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
+	    }
+	}
+    }
+}
+
 /* Implement machine specific optimizations.  We implement padding of returns
    for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
 static void
@@ -27892,7 +28025,9 @@  ix86_reorg (void)
 {
   if (optimize && optimize_function_for_speed_p (cfun))
     {
-      if (TARGET_PAD_RETURNS)
+      if (TARGET_PAD_SHORT_FUNCTION)
+	ix86_pad_short_function ();
+      else if (TARGET_PAD_RETURNS)
 	ix86_pad_returns ();
 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
       if (TARGET_FOUR_JUMP_LIMIT)
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 22dd02b..aa246c6 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -299,6 +299,7 @@  enum ix86_tune_indices {
   X86_TUNE_USE_BT,
   X86_TUNE_USE_INCDEC,
   X86_TUNE_PAD_RETURNS,
+  X86_TUNE_PAD_SHORT_FUNCTION,
   X86_TUNE_EXT_80387_CONSTANTS,
   X86_TUNE_SHORTEN_X87_SSE,
   X86_TUNE_AVOID_VECTOR_DECODE,
@@ -385,6 +386,8 @@  extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 #define TARGET_USE_BT		ix86_tune_features[X86_TUNE_USE_BT]
 #define TARGET_USE_INCDEC	ix86_tune_features[X86_TUNE_USE_INCDEC]
 #define TARGET_PAD_RETURNS	ix86_tune_features[X86_TUNE_PAD_RETURNS]
+#define TARGET_PAD_SHORT_FUNCTION \
+	ix86_tune_features[X86_TUNE_PAD_SHORT_FUNCTION]
 #define TARGET_EXT_80387_CONSTANTS \
 	ix86_tune_features[X86_TUNE_EXT_80387_CONSTANTS]
 #define TARGET_SHORTEN_X87_SSE	ix86_tune_features[X86_TUNE_SHORTEN_X87_SSE]
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 7ca64a3..03980e2 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -104,6 +104,7 @@ 
   UNSPEC_LD_MPIC	; load_macho_picbase
   UNSPEC_TRUNC_NOOP
   UNSPEC_DIV_ALREADY_SPLIT
+  UNSPEC_NOPS
 
   ;; For SSE/MMX support:
   UNSPEC_FIX_NOTRUNC
@@ -11465,6 +11466,39 @@ 
    (set_attr "length_immediate" "0")
    (set_attr "modrm" "0")])
 
+;; Generate nops.  Operand 0 is the number of nops, up to 8.
+(define_insn "nops"
+  [(unspec [(match_operand 0 "const_int_operand" "")]
+	   UNSPEC_NOPS)]
+  "reload_completed"
+{
+  switch (INTVAL (operands[0]))
+    {
+    case 1:
+      return "nop";
+    case 2:
+      return "nop; nop";
+    case 3:
+      return "nop; nop; nop";
+    case 4:
+      return "nop; nop; nop; nop";
+    case 5:
+      return "nop; nop; nop; nop; nop";
+    case 6:
+      return "nop; nop; nop; nop; nop; nop";
+    case 7:
+      return "nop; nop; nop; nop; nop; nop; nop";
+    case 8:
+      return "nop; nop; nop; nop; nop; nop; nop; nop";
+    default:
+      gcc_unreachable ();
+      break;
+  }
+}
+  [(set (attr "length") (symbol_ref "INTVAL (operands[0])"))
+   (set_attr "length_immediate" "0")
+   (set_attr "modrm" "0")])
+
 ;; Pad to 16-byte boundary, max skip in op0.  Used to avoid
 ;; branch prediction penalty for the third jump in a 16-byte
diff --git a/gcc/testsuite/gcc.target/i386/pad-1.c b/gcc/testsuite/gcc.target/i386/pad-1.c
new file mode 100644
index 0000000..87a9d6c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-1.c
@@ -0,0 +1,9 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fomit-frame-pointer -mtune=generic -S" } */
+/* { dg-final { scan-assembler "rep" } } */
+/* { dg-final { scan-assembler-not "nop" } } */
+
+void
+foo ()
+{
+}
diff --git a/gcc/testsuite/gcc.target/i386/pad-10.c b/gcc/testsuite/gcc.target/i386/pad-10.c
new file mode 100644
index 0000000..6ba3b78
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-10.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */
+/* { dg-final { scan-assembler-not "nop" } } */
+/* { dg-final { scan-assembler-not "rep" } } */
+
+extern void bar ();
+
+int
+foo2 (int z, int x)
+{
+  if (x == 1)
+    {
+      bar ();
+      return z;
+    }
+  else
+    return x + z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pad-2.c b/gcc/testsuite/gcc.target/i386/pad-2.c
new file mode 100644
index 0000000..964547c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-2.c
@@ -0,0 +1,9 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */
+/* { dg-final { scan-assembler-times "nop; nop; nop; nop; nop; nop; nop; nop" 1 } } */
+/* { dg-final { scan-assembler-not "rep" } } */
+
+void
+foo ()
+{
+}
diff --git a/gcc/testsuite/gcc.target/i386/pad-3.c b/gcc/testsuite/gcc.target/i386/pad-3.c
new file mode 100644
index 0000000..52442b4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-3.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */
+/* { dg-final { scan-assembler-not "nop" } } */
+/* { dg-final { scan-assembler-not "rep" } } */
+
+int s[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+int d[8] = {11, 22, 33, 44, 55, 66, 77, 88};
+
+void
+foo ()
+{
+  int i;
+  for (i = 0; i < 8; i++)
+    d[i] = s[i] + 0x1000;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pad-4.c b/gcc/testsuite/gcc.target/i386/pad-4.c
new file mode 100644
index 0000000..a7033fa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-4.c
@@ -0,0 +1,13 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target ilp32 } */
+/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S -fPIC" } */
+/* { dg-final { scan-assembler-times "nop; nop; nop; nop; nop; nop; nop; nop" 1 } } */
+/* { dg-final { scan-assembler-not "rep" } } */
+
+extern int bar;
+
+int
+foo ()
+{
+  return bar;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pad-5a.c b/gcc/testsuite/gcc.target/i386/pad-5a.c
new file mode 100644
index 0000000..9d0aa2a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-5a.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target ilp32 } */
+/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */
+/* { dg-final { scan-assembler-times "nop; nop" 1 } } */
+/* { dg-final { scan-assembler-not "nop; nop; nop" } } */
+/* { dg-final { scan-assembler-not "rep" } } */
+
+int
+foo (int x, int y, int z)
+{
+   return x + y + z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pad-5b.c b/gcc/testsuite/gcc.target/i386/pad-5b.c
new file mode 100644
index 0000000..2e1cf12
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-5b.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */
+/* { dg-final { scan-assembler-times "nop; nop; nop; nop" 1 } } */
+/* { dg-final { scan-assembler-not "nop; nop; nop; nop; nop" } } */
+/* { dg-final { scan-assembler-not "rep" } } */
+
+int
+foo (int x, int y, int z)
+{
+   return x + y + z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pad-6a.c b/gcc/testsuite/gcc.target/i386/pad-6a.c
new file mode 100644
index 0000000..e865967
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-6a.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target ilp32 } */
+/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */
+/* { dg-final { scan-assembler-times "nop; nop; nop; nop" 1 } } */
+/* { dg-final { scan-assembler-not "nop; nop; nop; nop; nop" } } */
+/* { dg-final { scan-assembler-not "rep" } } */
+
+int
+foo (int x, int y)
+{
+   return x + y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pad-6b.c b/gcc/testsuite/gcc.target/i386/pad-6b.c
new file mode 100644
index 0000000..41aeaee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-6b.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */
+/* { dg-final { scan-assembler-times "nop; nop; nop; nop; nop; nop" 1 } } */
+/* { dg-final { scan-assembler-not "nop; nop; nop; nop; nop; nop; nop" } } */
+/* { dg-final { scan-assembler-not "rep" } } */
+
+int
+foo (int x, int y)
+{
+   return x + y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pad-7.c b/gcc/testsuite/gcc.target/i386/pad-7.c
new file mode 100644
index 0000000..7a7493d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-7.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target ilp32 } */
+/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */
+/* { dg-final { scan-assembler-not "nop" } } */
+/* { dg-final { scan-assembler-not "rep" } } */
+
+int
+foo (int x, int y, int z)
+{
+   return x + y + z + y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pad-8.c b/gcc/testsuite/gcc.target/i386/pad-8.c
new file mode 100644
index 0000000..873a0a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-8.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */
+/* { dg-final { scan-assembler-times "nop; nop; nop; nop; nop; nop" 1 } } */
+/* { dg-final { scan-assembler-not "nop; nop; nop; nop; nop; nop; nop" } } */
+/* { dg-final { scan-assembler-not "rep" } } */
+
+int
+foo (int x, int y)
+{
+   return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pad-9.c b/gcc/testsuite/gcc.target/i386/pad-9.c
new file mode 100644
index 0000000..3d68805
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-9.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */
+/* { dg-final { scan-assembler-times "nop; nop; nop; nop" 1 } } */
+/* { dg-final { scan-assembler-not "nop; nop; nop; nop; nop" } } */
+/* { dg-final { scan-assembler-not "rep" } } */
+
+extern void bar (void);
+
+void
+foo (int x)
+{
+  if (x)
+    bar ();
+}