diff mbox

PATCH: PR target/46519: Missing vzeroupper

Message ID AANLkTikmEmv51OOhquhHfgTf5+xGvGN=i49zYmbPxHAd@mail.gmail.com
State New
Headers show

Commit Message

H.J. Lu Jan. 16, 2011, 1:08 a.m. UTC
On Fri, Jan 14, 2011 at 8:02 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Fri, Jan 14, 2011 at 7:36 AM, Richard Henderson <rth@redhat.com> wrote:
>> On 01/13/2011 10:20 AM, H.J. Lu wrote:
>>> We have to scan its successors even if the exit state of the current block
>>> is unchanged since one predecessor exit state is just one factor which
>>> impacts the exit state and vzeroupper optimization.
>>
>> Then you're not interpreting the "state" of a block broadly enough.
>>
>> You certainly can consider the exit state of a block to be a function
>> of its input state and whatever it does locally.  Indeed, this is
>> exactly the formulation of the problem that you will need in order to
>> re-use the LCM infrastructure for 4.7.  But it doesn't hurt to think
>> about the problem in those terms now.
>>
>
> vzeroupper works this way:
>
> 1. We add vzeroupper where there may be AVX->SSE transition
> at function call and return RTL expansion.  It won't affect
> correctness of the program.
> 2. We also add a special vzeroupper to function call which returns
> or passes AVX registers. This will affect correctness of the program
> if it isn't removed.
>
> In vzeroupper optimization pass, we use the special vzeroupper to
> propagate the AVX register state and remove it afterward.  We
> also remove any redundant vzeroupper, which depends on the
> the AVX register state at basic block entry.
>
> Because #2, we have to scan a basic block at least once.
> I can add more checks to avoid unnecessary rescans.
>

Here is the new patch. It rechecks outgoing edges only if the exit
tate is changed. It generates the identical SPEC CPU 2K/2006
executables as trunk.  Compiler difference is less than 0.3%.
OK for trunk?

Thanks.

Comments

Richard Henderson Jan. 24, 2011, 5:17 p.m. UTC | #1
On 01/15/2011 05:08 PM, H.J. Lu wrote:
> Here is the new patch. It rechecks outgoing edges only if the exit
> tate is changed. It generates the identical SPEC CPU 2K/2006
> executables as trunk.  Compiler difference is less than 0.3%.
> OK for trunk?

This is ok.  I'm looking forward to seeing this cleaned up
for 4.7 though, I must say...


r~
H.J. Lu Jan. 24, 2011, 5:26 p.m. UTC | #2
On Mon, Jan 24, 2011 at 9:17 AM, Richard Henderson <rth@redhat.com> wrote:
> On 01/15/2011 05:08 PM, H.J. Lu wrote:
>> Here is the new patch. It rechecks outgoing edges only if the exit
>> tate is changed. It generates the identical SPEC CPU 2K/2006
>> executables as trunk.  Compiler difference is less than 0.3%.
>> OK for trunk?
>
> This is ok.  I'm looking forward to seeing this cleaned up
> for 4.7 though, I must say...
>

I opened:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=47440

Thanks.
diff mbox

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index a26314b..450cddf 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -56,6 +56,8 @@  along with GCC; see the file COPYING3.  If not see
 #include "debug.h"
 #include "dwarf2out.h"
 #include "sched-int.h"
+#include "sbitmap.h"
+#include "fibheap.h"
 
 enum upper_128bits_state
 {
@@ -73,6 +75,10 @@  typedef struct block_info_def
   bool unchanged;
   /* TRUE if block has been processed.  */
   bool processed;
+  /* TRUE if block has been scanned.  */
+  bool scanned;
+  /* Previous state of the upper 128bits of AVX registers at entry.  */
+  enum upper_128bits_state prev;
 } *block_info;
 
 #define BLOCK_INFO(B)   ((block_info) (B)->aux)
@@ -135,6 +141,16 @@  move_or_delete_vzeroupper_2 (basic_block bb,
       return;
     }
 
+  if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
+    {
+      if (dump_file)
+	fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
+		 bb->index, BLOCK_INFO (bb)->state);
+      return;
+    }
+
+  BLOCK_INFO (bb)->prev = state;
+
   if (dump_file)
     fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
 	     bb->index, state);
@@ -264,6 +280,7 @@  move_or_delete_vzeroupper_2 (basic_block bb,
 
   BLOCK_INFO (bb)->state = state;
   BLOCK_INFO (bb)->unchanged = unchanged;
+  BLOCK_INFO (bb)->scanned = true;
 
   if (dump_file)
     fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
@@ -273,9 +290,10 @@  move_or_delete_vzeroupper_2 (basic_block bb,
 
 /* Helper function for move_or_delete_vzeroupper.  Process vzeroupper
    in BLOCK and check its predecessor blocks.  Treat UNKNOWN state
-   as USED if UNKNOWN_IS_UNUSED is true.  */
+   as USED if UNKNOWN_IS_UNUSED is true.  Return TRUE if the exit
+   state is changed.  */
 
-static void
+static bool
 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
 {
   edge e;
@@ -288,7 +306,7 @@  move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
 	     block->index, BLOCK_INFO (block)->processed);
 
   if (BLOCK_INFO (block)->processed)
-    return;
+    return false;
 
   state = unused;
 
@@ -324,8 +342,14 @@  done:
 
   /* Need to rescan if the upper 128bits of AVX registers are changed
      to USED at exit.  */
-  if (new_state != old_state && new_state == used)
-    cfun->machine->rescan_vzeroupper_p = 1;
+  if (new_state != old_state)
+    {
+      if (new_state == used)
+	cfun->machine->rescan_vzeroupper_p = 1;
+      return true;
+    }
+  else
+    return false;
 }
 
 /* Go through the instruction stream looking for vzeroupper.  Delete
@@ -338,14 +362,18 @@  move_or_delete_vzeroupper (void)
   edge e;
   edge_iterator ei;
   basic_block bb;
-  unsigned int count;
+  fibheap_t worklist, pending, fibheap_swap;
+  sbitmap visited, in_worklist, in_pending, sbitmap_swap;
+  int *bb_order;
+  int *rc_order;
+  int i;
 
   /* Set up block info for each basic block.  */
   alloc_aux_for_blocks (sizeof (struct block_info_def));
 
-  /* Process successor blocks of all entry points.  */
+  /* Process outgoing edges of entry point.  */
   if (dump_file)
-    fprintf (dump_file, "Process all entry points\n");
+    fprintf (dump_file, "Process outgoing edges of entry point\n");
 
   FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
     {
@@ -355,25 +383,102 @@  move_or_delete_vzeroupper (void)
       BLOCK_INFO (e->dest)->processed = true;
     }
 
-  /* Process all basic blocks.  */
-  count = 0;
-  do
+  /* Compute reverse completion order of depth first search of the CFG
+     so that the data-flow runs faster.  */
+  rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
+  bb_order = XNEWVEC (int, last_basic_block);
+  pre_and_rev_post_order_compute (NULL, rc_order, false);
+  for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
+    bb_order[rc_order[i]] = i;
+  free (rc_order);
+
+  worklist = fibheap_new ();
+  pending = fibheap_new ();
+  visited = sbitmap_alloc (last_basic_block);
+  in_worklist = sbitmap_alloc (last_basic_block);
+  in_pending = sbitmap_alloc (last_basic_block);
+  sbitmap_zero (in_worklist);
+
+  /* Don't check outgoing edges of entry point.  */
+  sbitmap_ones (in_pending);
+  FOR_EACH_BB (bb)
+    if (BLOCK_INFO (bb)->processed)
+      RESET_BIT (in_pending, bb->index);
+    else
+      {
+	move_or_delete_vzeroupper_1 (bb, false);
+	fibheap_insert (pending, bb_order[bb->index], bb);
+      }
+
+  if (dump_file)
+    fprintf (dump_file, "Check remaining basic blocks\n");
+
+  while (!fibheap_empty (pending))
     {
-      if (dump_file)
-	fprintf (dump_file, "Process all basic blocks: trip %d\n",
-		 count);
+      fibheap_swap = pending;
+      pending = worklist;
+      worklist = fibheap_swap;
+      sbitmap_swap = in_pending;
+      in_pending = in_worklist;
+      in_worklist = sbitmap_swap;
+
+      sbitmap_zero (visited);
+
       cfun->machine->rescan_vzeroupper_p = 0;
-      FOR_EACH_BB (bb)
-	move_or_delete_vzeroupper_1 (bb, false);
+
+      while (!fibheap_empty (worklist))
+	{
+	  bb = (basic_block) fibheap_extract_min (worklist);
+	  RESET_BIT (in_worklist, bb->index);
+	  gcc_assert (!TEST_BIT (visited, bb->index));
+	  if (!TEST_BIT (visited, bb->index))
+	    {
+	      edge_iterator ei;
+
+	      SET_BIT (visited, bb->index);
+
+	      if (move_or_delete_vzeroupper_1 (bb, false))
+		FOR_EACH_EDGE (e, ei, bb->succs)
+		  {
+		    if (e->dest == EXIT_BLOCK_PTR
+			|| BLOCK_INFO (e->dest)->processed)
+		      continue;
+
+		    if (TEST_BIT (visited, e->dest->index))
+		      {
+			if (!TEST_BIT (in_pending, e->dest->index))
+			  {
+			    /* Send E->DEST to next round.  */
+			    SET_BIT (in_pending, e->dest->index);
+			    fibheap_insert (pending,
+					    bb_order[e->dest->index],
+					    e->dest);
+			  }
+		      }
+		    else if (!TEST_BIT (in_worklist, e->dest->index))
+		      {
+			/* Add E->DEST to current round.  */
+			SET_BIT (in_worklist, e->dest->index);
+			fibheap_insert (worklist, bb_order[e->dest->index],
+					e->dest);
+		      }
+		  }
+	    }
+	}
+
+      if (!cfun->machine->rescan_vzeroupper_p)
+	break;
     }
-  while (cfun->machine->rescan_vzeroupper_p && count++ < 20);
 
-  /* FIXME: Is 20 big enough?  */
-  if (count >= 20)
-    gcc_unreachable ();
+  free (bb_order);
+  fibheap_delete (worklist);
+  fibheap_delete (pending);
+  sbitmap_free (visited);
+  sbitmap_free (in_worklist);
+  sbitmap_free (in_pending);
 
   if (dump_file)
-    fprintf (dump_file, "Process all basic blocks\n");
+    fprintf (dump_file, "Process remaining basic blocks\n");
 
   FOR_EACH_BB (bb)
     move_or_delete_vzeroupper_1 (bb, true);
diff --git a/gcc/config/i386/t-i386 b/gcc/config/i386/t-i386
index 6c801a5..1c658a1 100644
--- a/gcc/config/i386/t-i386
+++ b/gcc/config/i386/t-i386
@@ -23,7 +23,7 @@  i386.o: $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
   $(RECOG_H) $(EXPR_H) $(OPTABS_H) toplev.h $(BASIC_BLOCK_H) \
   $(GGC_H) $(TARGET_H) $(TARGET_DEF_H) langhooks.h $(CGRAPH_H) \
   $(TREE_GIMPLE_H) $(DWARF2_H) $(DF_H) tm-constrs.h $(PARAMS_H) \
-  i386-builtin-types.inc debug.h dwarf2out.h
+  i386-builtin-types.inc debug.h dwarf2out.h sbitmap.h $(FIBHEAP_H)
 
 i386-c.o: $(srcdir)/config/i386/i386-c.c \
   $(srcdir)/config/i386/i386-protos.h $(CONFIG_H) $(SYSTEM_H) coretypes.h \
diff --git a/gcc/testsuite/ChangeLog.vzero b/gcc/testsuite/ChangeLog.vzero
new file mode 100644
index 0000000..fedac70
--- /dev/null
+++ b/gcc/testsuite/ChangeLog.vzero
@@ -0,0 +1,115 @@ 
+2010-12-17  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* gfortran.dg/pr46519-2.f90: New.
+
+2010-11-24  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* gfortran.dg/pr46519-1.f: Replace -mtune=generic with
+	-mvzeroupper.
+
+2010-11-24  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* gfortran.dg/pr46519-1.f: New.
+
+2010-11-20  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* gcc.target/i386/avx-vzeroupper-14.c: Replace -O0 with -O2.
+	* gcc.target/i386/avx-vzeroupper-15.c: Likewise.
+	* gcc.target/i386/avx-vzeroupper-16.c: Likewise.
+	* gcc.target/i386/avx-vzeroupper-17.c: Likewise.
+
+	* gcc.target/i386/avx-vzeroupper-25.c: New.
+	* gcc.target/i386/avx-vzeroupper-26.c: Likewise.
+
+2010-11-18  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* gcc.target/i386/avx-vzeroupper-24.c: New.
+
+2010-11-18  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* gcc.target/i386/avx-vzeroupper-21.c: New.
+	* gcc.target/i386/avx-vzeroupper-22.c: Likewise.
+	* gcc.target/i386/avx-vzeroupper-23.c: Likewise.
+
+2010-11-17  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* gcc.target/i386/avx-vzeroupper-10.c: Expect no avx_vzeroupper.
+	* gcc.target/i386/avx-vzeroupper-11.c: Likewise.
+
+2010-11-16  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* gcc.target/i386/avx-vzeroupper-20.c: New.
+
+2010-11-04  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* gcc.target/i386/avx-vzeroupper-19.c: New.
+
+2010-11-03  H.J. Lu  <hongjiu.lu@intel.com>
+
+	PR target/46295
+	* gcc.target/i386/pr46295.c: New.
+
+2010-11-03  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* gcc.target/i386/pr46285.c: Remove -dp.
+
+2010-11-02  H.J. Lu  <hongjiu.lu@intel.com>
+
+	PR target/46285
+	* gcc.target/i386/pr46285.c.
+
+2010-11-02  H.J. Lu  <hongjiu.lu@intel.com>
+
+	PR target/46253
+	* gcc.target/i386/pr46253.c: New.
+
+2010-11-02  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* gcc.target/i386/avx-vzeroupper-16.c: New.
+	* gcc.target/i386/avx-vzeroupper-17.c: Likewise.
+	* gcc.target/i386/avx-vzeroupper-18.c: Likewise.
+
+2010-11-02  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* gcc.target/i386/avx-vzeroupper-15.c: New.
+
+2010-10-26  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* gcc.target/i386/avx-vzeroupper-4.c: Don't scan
+	avx_vzeroupper_nop.  Scan avx_vzeroupper instead of
+	*avx_vzeroupper.
+	* gcc.target/i386/avx-vzeroupper-10.c: Likewise.
+	* gcc.target/i386/avx-vzeroupper-11.c: Likewise.
+	* gcc.target/i386/avx-vzeroupper-12.c: Likewise.
+	* gcc.target/i386/avx-vzeroupper-13.c: Likewise.
+	* gcc.target/i386/avx-vzeroupper-14.c: Likewise.
+
+2010-10-19  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* gcc.target/i386/avx-vzeroupper-1.c: Add -mtune=generic.
+	* gcc.target/i386/avx-vzeroupper-1.c: Likewise.
+
+	* gcc.target/i386/avx-vzeroupper-13.c: New.
+	* gcc.target/i386/avx-vzeroupper-14.c: Likewise.
+
+2010-10-12  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* gcc.target/i386/avx-vzeroupper-9.c: New.
+	* gcc.target/i386/avx-vzeroupper-10.c: Likewise.
+	* gcc.target/i386/avx-vzeroupper-11.c: Likewise.
+	* gcc.target/i386/avx-vzeroupper-12.c: Likewise.
+
+2010-10-12  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* gcc.target/i386/avx-vzeroupper-5.c: New.
+	* gcc.target/i386/avx-vzeroupper-6.c: Likewise.
+	* gcc.target/i386/avx-vzeroupper-7.c: Likewise.
+	* gcc.target/i386/avx-vzeroupper-8.c: Likewise.
+
+2010-10-12  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* gcc.target/i386/avx-vzeroupper-4.c: New.
+
+2010-06-15  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* gcc.target/i386/avx-vzeroupper-3.c: New.