diff mbox series

Make ix86_align_loops uarch-specific tune.

Message ID 20241107011732.242945-1-hongtao.liu@intel.com
State New
Headers show
Series Make ix86_align_loops uarch-specific tune. | expand

Commit Message

liuhongt Nov. 7, 2024, 1:17 a.m. UTC
Disable the tune for Zhaoxin/CLX/SKX since it could hurt performance
for the inner loop.

According to the latest round of testing, aligning loops helps SPEC2017 performance on EMR and Znver4.
So I'll still keep the tune enabled for the generic part.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Any comment?

gcc/ChangeLog:

	PR target/117438
	* config/i386/i386-features.cc (pass_align_tight_loops::gate):
	Add TARGET_ALIGN_TIGHT_LOOPS to the predicate.
	* config/i386/i386.h (TARGET_ALIGN_TIGHT_LOOPS): New macro.
	* config/i386/x86-tune.def (X86_TUNE_ALIGN_TIGHT_LOOPS): New
	tune.
---
 gcc/config/i386/i386-features.cc | 3 ++-
 gcc/config/i386/i386.h           | 2 ++
 gcc/config/i386/x86-tune.def     | 8 +++++++-
 3 files changed, 11 insertions(+), 2 deletions(-)
diff mbox series

Patch

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index e2e85212a4f..70bda4bc021 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3620,7 +3620,8 @@  public:
   /* opt_pass methods: */
   bool gate (function *) final override
     {
-      return optimize && optimize_function_for_speed_p (cfun);
+      return TARGET_ALIGN_TIGHT_LOOPS
+	&& optimize && optimize_function_for_speed_p (cfun);
     }
 
   unsigned int execute (function *) final override
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 51934400951..2c6dbf6dfdc 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -466,6 +466,8 @@  extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 #define TARGET_USE_RCR ix86_tune_features[X86_TUNE_USE_RCR]
 #define TARGET_SSE_MOVCC_USE_BLENDV \
 	ix86_tune_features[X86_TUNE_SSE_MOVCC_USE_BLENDV]
+#define TARGET_ALIGN_TIGHT_LOOPS \
+	ix86_tune_features[X86_TUNE_ALIGN_TIGHT_LOOPS]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 6ebb2fd3414..8afa0cd9823 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -214,7 +214,7 @@  DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT
 DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT | m_LAKEMONT)
 
 /*****************************************************************************/
-/* Branch predictor tuning  		                                     */
+/* Branch predictor and front-end tuning                                     */
 /*****************************************************************************/
 
 /* X86_TUNE_PAD_SHORT_FUNCTION: Make every function to be at least 4
@@ -235,6 +235,12 @@  DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
 	  m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_GOLDMONT
 	  | m_GOLDMONT_PLUS | m_INTEL | m_ATHLON_K8 | m_AMDFAM10)
 
+/* X86_TUNE_ALIGN_TIGHT_LOOPS: For tight loops whose size is
+   smaller than prefetch_block, align them to ceil_log2 (loop_size).  The
+   tune overrides -falign-loops=N.  */
+DEF_TUNE (X86_TUNE_ALIGN_TIGHT_LOOPS, "align_tight_loops",
+	 ~(m_ZHAOXIN | m_CASCADELAKE | m_SKYLAKE_AVX512))
+
 /*****************************************************************************/
 /* Integer instruction selection tuning                                      */
 /*****************************************************************************/