Message ID | 20241107022246.418240-1-MayShao-oc@zhaoxin.com |
---|---|
State | New |
Headers | show |
Series | [x86_64] Add microarchitecture tunable for pass_align_tight_loops | expand |
On Thu, Nov 7, 2024 at 10:29 AM MayShao-oc <MayShao-oc@zhaoxin.com> wrote: > > Hi all: > For zhaoxin, I find no improvement when enable pass_align_tight_loops, > and have performance drop in some cases. > This patch add a new tunable to bypass pass_align_tight_loops in zhaoxin. > > Bootstrapped X86_64. > Ok for trunk? > BR > Mayshao > gcc/ChangeLog: > > * config/i386/i386-features.cc (TARGET_ALIGN_TIGHT_LOOPS): > default true in all processors except for zhaoxin. > * config/i386/i386.h (TARGET_ALIGN_TIGHT_LOOPS): New Macro. > * config/i386/x86-tune.def (X86_TUNE_ALIGN_TIGHT_LOOPS): > New tune > --- > gcc/config/i386/i386-features.cc | 4 +++- > gcc/config/i386/i386.h | 3 +++ > gcc/config/i386/x86-tune.def | 4 ++++ > 3 files changed, 10 insertions(+), 1 deletion(-) > > diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc > index e2e85212a4f..d9fd92964fe 100644 > --- a/gcc/config/i386/i386-features.cc > +++ b/gcc/config/i386/i386-features.cc > @@ -3620,7 +3620,9 @@ public: > /* opt_pass methods: */ > bool gate (function *) final override > { > - return optimize && optimize_function_for_speed_p (cfun); > + return TARGET_ALIGN_TIGHT_LOOPS > + && optimize > + && optimize_function_for_speed_p (cfun); > } > > unsigned int execute (function *) final override > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > index 2dcd8803a08..7f9010246c2 100644 > --- a/gcc/config/i386/i386.h > +++ b/gcc/config/i386/i386.h > @@ -466,6 +466,9 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; > #define TARGET_USE_RCR ix86_tune_features[X86_TUNE_USE_RCR] > #define TARGET_SSE_MOVCC_USE_BLENDV \ > ix86_tune_features[X86_TUNE_SSE_MOVCC_USE_BLENDV] > +#define TARGET_ALIGN_TIGHT_LOOPS \ > + ix86_tune_features[X86_TUNE_ALIGN_TIGHT_LOOPS] > + > > /* Feature tests against the various architecture variations. 
*/ > enum ix86_arch_indices { > diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def > index 6ebb2fd3414..bd4fa8b3eee 100644 > --- a/gcc/config/i386/x86-tune.def > +++ b/gcc/config/i386/x86-tune.def > @@ -542,6 +542,10 @@ DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD, > DEF_TUNE (X86_TUNE_SSE_MOVCC_USE_BLENDV, > "sse_movcc_use_blendv", ~m_CORE_ATOM) > > +/* X86_TUNE_ALIGN_TIGHT_LOOPS: if false, tight loops are not aligned. */ > +DEF_TUNE (X86_TUNE_ALIGN_TIGHT_LOOPS, "align_tight_loops", > + ~(m_ZHAOXIN)) Please also add ~(m_ZHAOXIN | m_CASCADELAKE | m_SKYLAKE_AVX512)) And could you put it under the section of /*****************************************************************************/ -/* Branch predictor tuning */ +/* Branch predictor and The Front-end tuning */ /*****************************************************************************/ > + > /*****************************************************************************/ > /* AVX instruction selection tuning (some of SSE flags affects AVX, too) */ > /*****************************************************************************/ > -- > 2.27.0 > -- BR, Hongtao
> > On Thu, Nov 7, 2024 at 10:29?AM MayShao-oc <MayShao-oc@zhaoxin.com> wrote: > > > > Hi all: > > For zhaoxin, I find no improvement when enable pass_align_tight_loops, > > and have performance drop in some cases. > > This patch add a new tunable to bypass pass_align_tight_loops in zhaoxin. > > > > Bootstrapped X86_64. > > Ok for trunk? > > BR > > Mayshao > > gcc/ChangeLog: > > > > * config/i386/i386-features.cc (TARGET_ALIGN_TIGHT_LOOPS): > > default true in all processors except for zhaoxin. > > * config/i386/i386.h (TARGET_ALIGN_TIGHT_LOOPS): New Macro. > > * config/i386/x86-tune.def (X86_TUNE_ALIGN_TIGHT_LOOPS): > > New tune > > --- > > gcc/config/i386/i386-features.cc | 4 +++- > > gcc/config/i386/i386.h | 3 +++ > > gcc/config/i386/x86-tune.def | 4 ++++ > > 3 files changed, 10 insertions(+), 1 deletion(-) > > > > diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc > > index e2e85212a4f..d9fd92964fe 100644 > > --- a/gcc/config/i386/i386-features.cc > > +++ b/gcc/config/i386/i386-features.cc > > @@ -3620,7 +3620,9 @@ public: > > /* opt_pass methods: */ > > bool gate (function *) final override > > { > > - return optimize && optimize_function_for_speed_p (cfun); > > + return TARGET_ALIGN_TIGHT_LOOPS > > + && optimize > > + && optimize_function_for_speed_p (cfun); > > } > > > > unsigned int execute (function *) final override > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > > index 2dcd8803a08..7f9010246c2 100644 > > --- a/gcc/config/i386/i386.h > > +++ b/gcc/config/i386/i386.h > > @@ -466,6 +466,9 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; > > #define TARGET_USE_RCR ix86_tune_features[X86_TUNE_USE_RCR] > > #define TARGET_SSE_MOVCC_USE_BLENDV \ > > ix86_tune_features[X86_TUNE_SSE_MOVCC_USE_BLENDV] > > +#define TARGET_ALIGN_TIGHT_LOOPS \ > > + ix86_tune_features[X86_TUNE_ALIGN_TIGHT_LOOPS] > > + > > > > /* Feature tests against the various architecture variations. 
*/ > > enum ix86_arch_indices { > > diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def > > index 6ebb2fd3414..bd4fa8b3eee 100644 > > --- a/gcc/config/i386/x86-tune.def > > +++ b/gcc/config/i386/x86-tune.def > > @@ -542,6 +542,10 @@ DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD, > > DEF_TUNE (X86_TUNE_SSE_MOVCC_USE_BLENDV, > > "sse_movcc_use_blendv", ~m_CORE_ATOM) > > > > +/* X86_TUNE_ALIGN_TIGHT_LOOPS: if false, tight loops are not aligned. */ > > +DEF_TUNE (X86_TUNE_ALIGN_TIGHT_LOOPS, "align_tight_loops", > > + ~(m_ZHAOXIN)) > Please also add ~(m_ZHAOXIN | m_CASCADELAKE | m_SKYLAKE_AVX512)) > And could you put it under the section of > > /*****************************************************************************/ > -/* Branch predictor tuning */ > +/* Branch predictor and The Front-end tuning > */ > /*****************************************************************************/ > > + > > /*****************************************************************************/ > > /* AVX instruction selection tuning (some of SSE flags affects AVX, too) */ > > /*****************************************************************************/ > > -- > > 2.27.0 > > > > > -- > BR, > Hongtao Ok BR Mayshao
> -----Original Message----- > From: Mayshao-oc <Mayshao-oc@zhaoxin.com> > Sent: Thursday, November 7, 2024 11:13 AM > To: Hongtao Liu <crazylht@gmail.com> > Cc: gcc-patches@gcc.gnu.org; hubicka@ucw.cz; Liu, Hongtao > <hongtao.liu@intel.com>; ubizjak@gmail.com; richard.guenther@gmail.com; > Tim Hu(WH-RD) <TimHu@zhaoxin.com>; Silvia Zhao(BJ-RD) > <SilviaZhao@zhaoxin.com>; Louis Qi(BJ-RD) <LouisQi@zhaoxin.com>; Cobe > Chen(BJ-RD) <CobeChen@zhaoxin.com> > Subject: Re: [PATCH] [x86_64] Add microarchtecture tunable for > pass_align_tight_loops > > > > On Thu, Nov 7, 2024 at 10:29?AM MayShao-oc <MayShao- > oc@zhaoxin.com> wrote: > > > > > > Hi all: > > > For zhaoxin, I find no improvement when enable > > > pass_align_tight_loops, and have performance drop in some cases. > > > This patch add a new tunable to bypass pass_align_tight_loops in > zhaoxin. > > > > > > Bootstrapped X86_64. > > > Ok for trunk? LGTM. > > > BR > > > Mayshao > > > gcc/ChangeLog: > > > > > > * config/i386/i386-features.cc (TARGET_ALIGN_TIGHT_LOOPS): > > > default true in all processors except for zhaoxin. > > > * config/i386/i386.h (TARGET_ALIGN_TIGHT_LOOPS): New Macro. 
> > > * config/i386/x86-tune.def (X86_TUNE_ALIGN_TIGHT_LOOPS): > > > New tune > > > --- > > > gcc/config/i386/i386-features.cc | 4 +++- > > > gcc/config/i386/i386.h | 3 +++ > > > gcc/config/i386/x86-tune.def | 4 ++++ > > > 3 files changed, 10 insertions(+), 1 deletion(-) > > > > > > diff --git a/gcc/config/i386/i386-features.cc > > > b/gcc/config/i386/i386-features.cc > > > index e2e85212a4f..d9fd92964fe 100644 > > > --- a/gcc/config/i386/i386-features.cc > > > +++ b/gcc/config/i386/i386-features.cc > > > @@ -3620,7 +3620,9 @@ public: > > > /* opt_pass methods: */ > > > bool gate (function *) final override > > > { > > > - return optimize && optimize_function_for_speed_p (cfun); > > > + return TARGET_ALIGN_TIGHT_LOOPS > > > + && optimize > > > + && optimize_function_for_speed_p (cfun); > > > } > > > > > > unsigned int execute (function *) final override diff --git > > > a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index > > > 2dcd8803a08..7f9010246c2 100644 > > > --- a/gcc/config/i386/i386.h > > > +++ b/gcc/config/i386/i386.h > > > @@ -466,6 +466,9 @@ extern unsigned char > > > ix86_tune_features[X86_TUNE_LAST]; > > > #define TARGET_USE_RCR ix86_tune_features[X86_TUNE_USE_RCR] > > > #define TARGET_SSE_MOVCC_USE_BLENDV \ > > > ix86_tune_features[X86_TUNE_SSE_MOVCC_USE_BLENDV] > > > +#define TARGET_ALIGN_TIGHT_LOOPS \ > > > + ix86_tune_features[X86_TUNE_ALIGN_TIGHT_LOOPS] > > > + > > > > > > /* Feature tests against the various architecture variations. */ > > > enum ix86_arch_indices { diff --git a/gcc/config/i386/x86-tune.def > > > b/gcc/config/i386/x86-tune.def index 6ebb2fd3414..bd4fa8b3eee > 100644 > > > --- a/gcc/config/i386/x86-tune.def > > > +++ b/gcc/config/i386/x86-tune.def > > > @@ -542,6 +542,10 @@ DEF_TUNE > > > (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD, > > > DEF_TUNE (X86_TUNE_SSE_MOVCC_USE_BLENDV, > > > "sse_movcc_use_blendv", ~m_CORE_ATOM) > > > > > > +/* X86_TUNE_ALIGN_TIGHT_LOOPS: if false, tight loops are not > > > +aligned. 
*/ DEF_TUNE (X86_TUNE_ALIGN_TIGHT_LOOPS, > "align_tight_loops", > > > + ~(m_ZHAOXIN)) > > Please also add ~(m_ZHAOXIN | m_CASCADELAKE | > m_SKYLAKE_AVX512)) > > And could you put it under the section of > > > > > /****************************************************************** > ***********/ > > -/* Branch predictor tuning */ > > +/* Branch predictor and The Front-end tuning > > */ > > > > > /****************************************************************** > *** > > ********/ > > > + > > > > /****************************************************************** > ***********/ > > > /* AVX instruction selection tuning (some of SSE flags affects AVX, too) > */ > > > > > > > /****************************************************************** > * > > > **********/ > > > -- > > > 2.27.0 > > > > > > > > > -- > > BR, > > Hongtao > > Ok > > BR > Mayshao
On Thu, 2024-11-07 at 04:58 +0000, Liu, Hongtao wrote: > > > > Hi all: > > > > For zhaoxin, I find no improvement when enable > > > > pass_align_tight_loops, and have performance drop in some cases. > > > > This patch add a new tunable to bypass > > > > pass_align_tight_loops in > > zhaoxin. > > > > > > > > Bootstrapped X86_64. > > > > Ok for trunk? > LGTM. I'd suggest adding a reference to PR 117438 to the subject and ChangeLog.
> -----Original Message----- > From: Xi Ruoyao <xry111@xry111.site> > Sent: Thursday, November 7, 2024 1:12 PM > To: Liu, Hongtao <hongtao.liu@intel.com>; Mayshao-oc <Mayshao- > oc@zhaoxin.com>; Hongtao Liu <crazylht@gmail.com> > Cc: gcc-patches@gcc.gnu.org; hubicka@ucw.cz; ubizjak@gmail.com; > richard.guenther@gmail.com; Tim Hu(WH-RD) <TimHu@zhaoxin.com>; Silvia > Zhao(BJ-RD) <SilviaZhao@zhaoxin.com>; Louis Qi(BJ-RD) > <LouisQi@zhaoxin.com>; Cobe Chen(BJ-RD) <CobeChen@zhaoxin.com> > Subject: Re: [PATCH] [x86_64] Add microarchtecture tunable for > pass_align_tight_loops > > On Thu, 2024-11-07 at 04:58 +0000, Liu, Hongtao wrote: > > > > > Hi all: > > > > > For zhaoxin, I find no improvement when enable > > > > > pass_align_tight_loops, and have performance drop in some cases. > > > > > This patch add a new tunable to bypass > > > > > pass_align_tight_loops in > > > zhaoxin. > > > > > > > > > > Bootstrapped X86_64. > > > > > Ok for trunk? > > LGTM. > > I'd suggest to add the reference to PR 117438 into the subject and ChangeLog. Yes, thanks. > > -- > Xi Ruoyao <xry111@xry111.site> > School of Aerospace Science and Technology, Xidian University
> > -----Original Message----- > > From: Xi Ruoyao <xry111@xry111.site> > > Sent: Thursday, November 7, 2024 1:12 PM > > To: Liu, Hongtao <hongtao.liu@intel.com>; Mayshao-oc <Mayshao- > > oc@zhaoxin.com>; Hongtao Liu <crazylht@gmail.com> > > Cc: gcc-patches@gcc.gnu.org; hubicka@ucw.cz; ubizjak@gmail.com; > > richard.guenther@gmail.com; Tim Hu(WH-RD) <TimHu@zhaoxin.com>; Silvia > > Zhao(BJ-RD) <SilviaZhao@zhaoxin.com>; Louis Qi(BJ-RD) > > <LouisQi@zhaoxin.com>; Cobe Chen(BJ-RD) <CobeChen@zhaoxin.com> > > Subject: Re: [PATCH] [x86_64] Add microarchtecture tunable for > > pass_align_tight_loops > > On Thu, 2024-11-07 at 04:58 +0000, Liu, Hongtao wrote: > > > > > > Hi all: > > > > > > For zhaoxin, I find no improvement when enable > > > > > > pass_align_tight_loops, and have performance drop in some cases. > > > > > > This patch add a new tunable to bypass > > > > > > pass_align_tight_loops in > > > > zhaoxin. > > > > > > > > > > > > Bootstrapped X86_64. > > > > > > Ok for trunk? > > > LGTM. > > > > I'd suggest to add the reference to PR 117438 into the subject and ChangeLog. > Yes, thanks. Add PR 117438 into the subject and ChangeLog. > > > > -- > > Xi Ruoyao <xry111@xry111.site> > > School of Aerospace Science and Technology, Xidian University BR Mayshao
On Fri, Nov 8, 2024 at 10:21 AM Mayshao-oc <Mayshao-oc@zhaoxin.com> wrote: > > > > -----Original Message----- > > > From: Xi Ruoyao <xry111@xry111.site> > > > Sent: Thursday, November 7, 2024 1:12 PM > > > To: Liu, Hongtao <hongtao.liu@intel.com>; Mayshao-oc <Mayshao- > > > oc@zhaoxin.com>; Hongtao Liu <crazylht@gmail.com> > > > Cc: gcc-patches@gcc.gnu.org; hubicka@ucw.cz; ubizjak@gmail.com; > > > richard.guenther@gmail.com; Tim Hu(WH-RD) <TimHu@zhaoxin.com>; Silvia > > > Zhao(BJ-RD) <SilviaZhao@zhaoxin.com>; Louis Qi(BJ-RD) > > > <LouisQi@zhaoxin.com>; Cobe Chen(BJ-RD) <CobeChen@zhaoxin.com> > > > Subject: Re: [PATCH] [x86_64] Add microarchtecture tunable for > > > pass_align_tight_loops > > > On Thu, 2024-11-07 at 04:58 +0000, Liu, Hongtao wrote: > > > > > > > Hi all: > > > > > > > For zhaoxin, I find no improvement when enable > > > > > > > pass_align_tight_loops, and have performance drop in some cases. > > > > > > > This patch add a new tunable to bypass > > > > > > > pass_align_tight_loops in > > > > > zhaoxin. > > > > > > > > > > > > > > Bootstrapped X86_64. > > > > > > > Ok for trunk? > > > > LGTM. > > > > > > I'd suggest to add the reference to PR 117438 into the subject and ChangeLog. > > Yes, thanks. > Add PR 117438 into the subject and ChangeLog. PR target/117438 Others LGTM. > > > > > > -- > > > Xi Ruoyao <xry111@xry111.site> > > > School of Aerospace Science and Technology, Xidian University > BR > Mayshao
> On Fri, Nov 8, 2024 at 10:21 AM Mayshao-oc <Mayshao-oc@zhaoxin.com> wrote: > > > > -----Original Message----- > > > > From: Xi Ruoyao <xry111@xry111.site> > > > > Sent: Thursday, November 7, 2024 1:12 PM > > > > To: Liu, Hongtao <hongtao.liu@intel.com>; Mayshao-oc <Mayshao- > > > > oc@zhaoxin.com>; Hongtao Liu <crazylht@gmail.com> > > > > Cc: gcc-patches@gcc.gnu.org; hubicka@ucw.cz; ubizjak@gmail.com; > > > > richard.guenther@gmail.com; Tim Hu(WH-RD) <TimHu@zhaoxin.com>; Silvia > > > > Zhao(BJ-RD) <SilviaZhao@zhaoxin.com>; Louis Qi(BJ-RD) > > > > <LouisQi@zhaoxin.com>; Cobe Chen(BJ-RD) <CobeChen@zhaoxin.com> > > > > Subject: Re: [PATCH] [x86_64] Add microarchtecture tunable for > > > > pass_align_tight_loops > > > > On Thu, 2024-11-07 at 04:58 +0000, Liu, Hongtao wrote: > > > > > > > > Hi all: > > > > > > > > For zhaoxin, I find no improvement when enable > > > > > > > > pass_align_tight_loops, and have performance drop in some cases. > > > > > > > > This patch add a new tunable to bypass > > > > > > > > pass_align_tight_loops in > > > > > > zhaoxin. > > > > > > > > > > > > > > > > Bootstrapped X86_64. > > > > > > > > Ok for trunk? > > > > > LGTM. > > > > > > > > I'd suggest to add the reference to PR 117438 into the subject and ChangeLog. > > > Yes, thanks. > > Add PR 117438 into the subject and ChangeLog. > PR target/117438 > Others LGTM. Update this in ChangeLog. I should report the PR in bugzilla in target category in the first place. Thanks. > > > > > > > > -- > > > > Xi Ruoyao <xry111@xry111.site> > > > > School of Aerospace Science and Technology, Xidian University > > BR > > Mayshao > > > > -- > BR, > Hongtao BR Mayshao
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index e2e85212a4f..d9fd92964fe 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -3620,7 +3620,9 @@ public: /* opt_pass methods: */ bool gate (function *) final override { - return optimize && optimize_function_for_speed_p (cfun); + return TARGET_ALIGN_TIGHT_LOOPS + && optimize + && optimize_function_for_speed_p (cfun); } unsigned int execute (function *) final override diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 2dcd8803a08..7f9010246c2 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -466,6 +466,9 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; #define TARGET_USE_RCR ix86_tune_features[X86_TUNE_USE_RCR] #define TARGET_SSE_MOVCC_USE_BLENDV \ ix86_tune_features[X86_TUNE_SSE_MOVCC_USE_BLENDV] +#define TARGET_ALIGN_TIGHT_LOOPS \ + ix86_tune_features[X86_TUNE_ALIGN_TIGHT_LOOPS] + /* Feature tests against the various architecture variations. */ enum ix86_arch_indices { diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 6ebb2fd3414..bd4fa8b3eee 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -542,6 +542,10 @@ DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD, DEF_TUNE (X86_TUNE_SSE_MOVCC_USE_BLENDV, "sse_movcc_use_blendv", ~m_CORE_ATOM) +/* X86_TUNE_ALIGN_TIGHT_LOOPS: if false, tight loops are not aligned. */ +DEF_TUNE (X86_TUNE_ALIGN_TIGHT_LOOPS, "align_tight_loops", + ~(m_ZHAOXIN)) + /*****************************************************************************/ /* AVX instruction selection tuning (some of SSE flags affects AVX, too) */ /*****************************************************************************/