diff mbox

PATCH: Disable double precision vectorizer for Atom

Message ID 20100913134740.GA20178@intel.com
State New
Headers show

Commit Message

H.J. Lu Sept. 13, 2010, 1:47 p.m. UTC
Hi,

Double precision vector instructions are much slower than double
precision scalar instructions on Atom.  This patch disables double
precision vectorizer for Atom.  It improves SPEC CPU 2K FP geomean by
7% on 64bit and 3% on 32bit.  OK for trunk?

Thanks.


H.J.
----
gcc/

2010-09-13  H.J. Lu  <hongjiu.lu@intel.com>

	* config/i386/i386.c (initial_ix86_tune_features): Add
	X86_TUNE_VECTORIZE_DOUBLE.
	* config/i386/i386.h (ix86_tune_indices): Likewise.
	(TARGET_VECTORIZE_DOUBLE): New.
	(UNITS_PER_SIMD_WORD): Return UNITS_PER_WORD for DFmode if
	TARGET_VECTORIZE_DOUBLE is false.

gcc/testsuite/

2010-09-13  H.J. Lu  <hongjiu.lu@intel.com>

	* gcc.target/i386/fma4-256-vector.c: Add -mtune=generic.
	* gcc.target/i386/fma4-vector.c: Likewise.
	* gcc.target/i386/vectorize2.c: Likewise.
	* gcc.target/i386/vectorize4.c: Likewise.
	* gcc.target/i386/vectorize5.c: Likewise.
	* gcc.target/i386/vectorize6.c: Likewise.
	* gcc.target/i386/vectorize8.c: Likewise.

	* gcc.target/i386/vect-double-1.c: New.
	* gcc.target/i386/vect-double-1a.c: Likewise.
	* gcc.target/i386/vect-double-2.c: Likewise.
	* gcc.target/i386/vect-double-2a.c: Likewise.

	* lib/target-supports.exp (check_effective_target_vect_double):
	Set et_vect_double_saved to 0 when tuning for Atom.

Comments

Uros Bizjak Sept. 13, 2010, 6:51 p.m. UTC | #1
On Mon, Sep 13, 2010 at 3:47 PM, H.J. Lu <hongjiu.lu@intel.com> wrote:

> Double precision vector instructions are much slower than double
> precision scalar instructions on Atom.  This patch disables double
> precision vectorizer for Atom.  It improves SPEC CPU 2K FP geomean by
> 7% on 64bit and 3% on 32bit.  OK for trunk?
>
> Thanks.
>
>
> H.J.
> ----
> gcc/
>
> 2010-09-13  H.J. Lu  <hongjiu.lu@intel.com>
>
>        * config/i386/i386.c (initial_ix86_tune_features): Add
>        X86_TUNE_VECTORIZE_DOUBLE.
>        * config/i386/i386.h (ix86_tune_indices): Likewise.
>        (TARGET_VECTORIZE_DOUBLE): New.
>        (UNITS_PER_SIMD_WORD): Return UNITS_PER_WORD for DFmode if
>        TARGET_VECTORIZE_DOUBLE is false.
>
> gcc/testsuite/
>
> 2010-09-13  H.J. Lu  <hongjiu.lu@intel.com>
>
>        * gcc.target/i386/fma4-256-vector.c: Add -mtune=generic.
>        * gcc.target/i386/fma4-vector.c: Likewise.
>        * gcc.target/i386/vectorize2.c: Likewise.
>        * gcc.target/i386/vectorize4.c: Likewise.
>        * gcc.target/i386/vectorize5.c: Likewise.
>        * gcc.target/i386/vectorize6.c: Likewise.
>        * gcc.target/i386/vectorize8.c: Likewise.
>
>        * gcc.target/i386/vect-double-1.c: New.
>        * gcc.target/i386/vect-double-1a.c: Likewise.
>        * gcc.target/i386/vect-double-2.c: Likewise.
>        * gcc.target/i386/vect-double-2a.c: Likewise.
>
>        * lib/target-supports.exp (check_effective_target_vect_double):
>        Set et_vect_double_saved to 0 when tuning for Atom.

OK, but see comments below ...

> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 1d79a18..7d165bb 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -1627,6 +1627,10 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
>   /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
>      will impact LEA instruction selection. */
>   m_ATOM,
> +
> +  /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
> +     instructions.  */
> +  ~m_ATOM,
>  };
>
>  /* Feature tests against the various architecture variations.  */
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 91238d5..2acf60a 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -312,6 +312,7 @@ enum ix86_tune_indices {
>   X86_TUNE_USE_VECTOR_CONVERTS,
>   X86_TUNE_FUSE_CMP_AND_BRANCH,
>   X86_TUNE_OPT_AGU,
> +  X86_TUNE_VECTORIZE_DOUBLE,
>
>   X86_TUNE_LAST
>  };
> @@ -404,6 +405,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
>  #define TARGET_FUSE_CMP_AND_BRANCH \
>        ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH]
>  #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
> +#define TARGET_VECTORIZE_DOUBLE \
> +       ix86_tune_features[X86_TUNE_VECTORIZE_DOUBLE]
>
>  /* Feature tests against the various architecture variations.  */
>  enum ix86_arch_indices {
> @@ -1037,8 +1040,10 @@ enum target_cpu_default
>    different sizes for integer and floating point vectors.  We limit
>    vector size to 16byte.  */
>  #define UNITS_PER_SIMD_WORD(MODE)                                      \
> -  (TARGET_AVX ? (((MODE) == DFmode || (MODE) == SFmode) ? 16 : 16)     \
> -             : (TARGET_SSE ? 16 : UNITS_PER_WORD))
> +  ((MODE) == DFmode && !TARGET_VECTORIZE_DOUBLE                                \
> +   ? UNITS_PER_WORD                                                    \
> +   : (TARGET_AVX ? (((MODE) == DFmode || (MODE) == SFmode) ? 16 : 16)  \
> +                : (TARGET_SSE ? 16 : UNITS_PER_WORD)))

Please rewrite this macro as a helper function using a switch
statement. I must admit I'm not able to parse this mess.

Thanks,
Uros.
diff mbox

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 1d79a18..7d165bb 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1627,6 +1627,10 @@  static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
   /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
      will impact LEA instruction selection. */
   m_ATOM,
+
+  /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
+     instructions.  */
+  ~m_ATOM,
 };
 
 /* Feature tests against the various architecture variations.  */
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 91238d5..2acf60a 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -312,6 +312,7 @@  enum ix86_tune_indices {
   X86_TUNE_USE_VECTOR_CONVERTS,
   X86_TUNE_FUSE_CMP_AND_BRANCH,
   X86_TUNE_OPT_AGU,
+  X86_TUNE_VECTORIZE_DOUBLE,
 
   X86_TUNE_LAST
 };
@@ -404,6 +405,8 @@  extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 #define TARGET_FUSE_CMP_AND_BRANCH \
 	ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH]
 #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
+#define TARGET_VECTORIZE_DOUBLE \
+	ix86_tune_features[X86_TUNE_VECTORIZE_DOUBLE]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
@@ -1037,8 +1040,10 @@  enum target_cpu_default
    different sizes for integer and floating point vectors.  We limit
    vector size to 16byte.  */
 #define UNITS_PER_SIMD_WORD(MODE)					\
-  (TARGET_AVX ? (((MODE) == DFmode || (MODE) == SFmode) ? 16 : 16)	\
-   	      : (TARGET_SSE ? 16 : UNITS_PER_WORD))
+  ((MODE) == DFmode && !TARGET_VECTORIZE_DOUBLE				\
+   ? UNITS_PER_WORD							\
+   : (TARGET_AVX ? (((MODE) == DFmode || (MODE) == SFmode) ? 16 : 16)	\
+		 : (TARGET_SSE ? 16 : UNITS_PER_WORD)))
 
 #define VALID_DFP_MODE_P(MODE) \
   ((MODE) == SDmode || (MODE) == DDmode || (MODE) == TDmode)
diff --git a/gcc/testsuite/gcc.target/i386/fma4-256-vector.c b/gcc/testsuite/gcc.target/i386/fma4-256-vector.c
index 714b743..1bd2ce4 100644
--- a/gcc/testsuite/gcc.target/i386/fma4-256-vector.c
+++ b/gcc/testsuite/gcc.target/i386/fma4-256-vector.c
@@ -3,7 +3,7 @@ 
 
 /* { dg-do compile } */
 /* { dg-require-effective-target lp64 } */
-/* { dg-options "-O2 -mfma4 -ftree-vectorize" } */
+/* { dg-options "-O2 -mfma4 -ftree-vectorize -mtune=generic" } */
 
 extern void exit (int);
 
diff --git a/gcc/testsuite/gcc.target/i386/fma4-vector.c b/gcc/testsuite/gcc.target/i386/fma4-vector.c
index df8463e..da12780 100644
--- a/gcc/testsuite/gcc.target/i386/fma4-vector.c
+++ b/gcc/testsuite/gcc.target/i386/fma4-vector.c
@@ -3,7 +3,7 @@ 
 
 /* { dg-do compile } */
 /* { dg-require-effective-target lp64 } */
-/* { dg-options "-O2 -mfma4 -ftree-vectorize" } */
+/* { dg-options "-O2 -mfma4 -ftree-vectorize -mtune=generic" } */
 
 extern void exit (int);
 
diff --git a/gcc/testsuite/gcc.target/i386/vect-double-1.c b/gcc/testsuite/gcc.target/i386/vect-double-1.c
new file mode 100644
index 0000000..87e5fe9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-double-1.c
@@ -0,0 +1,35 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -mfpmath=sse -march=core2 -fdump-tree-vect-stats" } */
+
+extern void abort (void);
+
+#ifndef STATIC
+#define STATIC
+#endif
+
+#define N 16
+ 
+double cb[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
+double ca[N];
+
+STATIC void
+__attribute__ ((noinline))
+sse2_test (void)
+{  
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      ca[i] = cb[i];
+    }
+
+  /* check results:  */
+  for (i = 0; i < N; i++)
+    {
+      if (ca[i] != cb[i])
+        abort ();
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-double-1a.c b/gcc/testsuite/gcc.target/i386/vect-double-1a.c
new file mode 100644
index 0000000..a62c939
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-double-1a.c
@@ -0,0 +1,8 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target sse2 } */
+/* { dg-options "-O2 -ftree-vectorize -mfpmath=sse -msse2 -mtune=core2" } */
+
+#define STATIC static
+
+#include "vect-double-1.c"
+#include "sse2-check.h"
diff --git a/gcc/testsuite/gcc.target/i386/vect-double-2.c b/gcc/testsuite/gcc.target/i386/vect-double-2.c
new file mode 100644
index 0000000..a76dcb4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-double-2.c
@@ -0,0 +1,35 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -mfpmath=sse -msse2 -mtune=atom -fdump-tree-vect-stats" } */
+
+extern void abort (void);
+
+#ifndef STATIC
+#define STATIC
+#endif
+
+#define N 16
+ 
+double cb[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
+double ca[N];
+
+STATIC void
+__attribute__ ((noinline))
+sse2_test (void)
+{  
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      ca[i] = cb[i];
+    }
+
+  /* check results:  */
+  for (i = 0; i < N; i++)
+    {
+      if (ca[i] != cb[i])
+        abort ();
+    }
+}
+
+/* { dg-final { scan-tree-dump-not "vectorized 1 loops" "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-double-2a.c b/gcc/testsuite/gcc.target/i386/vect-double-2a.c
new file mode 100644
index 0000000..94f8062
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-double-2a.c
@@ -0,0 +1,8 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target sse2 } */
+/* { dg-options "-O2 -ftree-vectorize -mfpmath=sse -msse2 -mtune=atom" } */
+
+#define STATIC static
+
+#include "vect-double-2.c"
+#include "sse2-check.h"
diff --git a/gcc/testsuite/gcc.target/i386/vectorize2.c b/gcc/testsuite/gcc.target/i386/vectorize2.c
index 4196487..427e2d4 100644
--- a/gcc/testsuite/gcc.target/i386/vectorize2.c
+++ b/gcc/testsuite/gcc.target/i386/vectorize2.c
@@ -1,6 +1,6 @@ 
 /* { dg-do compile } */
 /* { dg-require-effective-target ilp32 } */
-/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 -mfpmath=sse" } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 -mfpmath=sse -mtune=generic" } */
 
 double a[256];
 int b[256];
diff --git a/gcc/testsuite/gcc.target/i386/vectorize4.c b/gcc/testsuite/gcc.target/i386/vectorize4.c
index f3d605e..557d0a2 100644
--- a/gcc/testsuite/gcc.target/i386/vectorize4.c
+++ b/gcc/testsuite/gcc.target/i386/vectorize4.c
@@ -1,6 +1,6 @@ 
 /* { dg-do compile } */
 /* { dg-require-effective-target ilp32 } */
-/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 --param ggc-min-expand=0 --param ggc-min-heapsize=0" } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 -mtune=generic --param ggc-min-expand=0 --param ggc-min-heapsize=0" } */
 /* This test, tests two thing, we vectorize square root and also we don't crash due to a GC issue.  */
 
 
diff --git a/gcc/testsuite/gcc.target/i386/vectorize5.c b/gcc/testsuite/gcc.target/i386/vectorize5.c
index 3894240..04f044f 100644
--- a/gcc/testsuite/gcc.target/i386/vectorize5.c
+++ b/gcc/testsuite/gcc.target/i386/vectorize5.c
@@ -1,6 +1,6 @@ 
 /* { dg-do compile } */
 /* { dg-require-effective-target lp64 } */
-/* { dg-options "-O2 -ftree-vectorize -mveclibabi=acml -ffast-math" } */
+/* { dg-options "-O2 -ftree-vectorize -mveclibabi=acml -ffast-math -mtune=generic" } */
 
 double x[256];
 
diff --git a/gcc/testsuite/gcc.target/i386/vectorize6.c b/gcc/testsuite/gcc.target/i386/vectorize6.c
index 78ec53d..d299a15 100644
--- a/gcc/testsuite/gcc.target/i386/vectorize6.c
+++ b/gcc/testsuite/gcc.target/i386/vectorize6.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-O2 -msse2 -ftree-vectorize -mveclibabi=svml -ffast-math" } */
+/* { dg-options "-O2 -msse2 -ftree-vectorize -mveclibabi=svml -ffast-math -mtune=generic" } */
 
 double x[256];
 
diff --git a/gcc/testsuite/gcc.target/i386/vectorize8.c b/gcc/testsuite/gcc.target/i386/vectorize8.c
index ed1517b..a194bb0 100644
--- a/gcc/testsuite/gcc.target/i386/vectorize8.c
+++ b/gcc/testsuite/gcc.target/i386/vectorize8.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize -msse2" } */
+/* { dg-options "-O2 -ftree-vectorize -msse2 -mtune=generic" } */
 
 unsigned int a[256];
 double b[256];
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index fc24b78..de9f21e 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -2309,8 +2309,17 @@  proc check_effective_target_vect_double { } {
     } else {
 	set et_vect_double_saved 0
 	if { [istarget i?86-*-*]
-	      || [istarget x86_64-*-*] 
-	      || [istarget spu-*-*] } {
+	      || [istarget x86_64-*-*] } {
+	   if { [check_no_compiler_messages vect_double assembly {
+		 #ifdef __tune_atom__
+		 # error No double vectorizer support.
+		 #endif
+		}] } {
+		set et_vect_double_saved 1
+	    } else {
+		set et_vect_double_saved 0
+	    }
+	} elseif { [istarget spu-*-*] } {
 	   set et_vect_double_saved 1
 	}
     }