Message ID | CAOvf_xwBqY++PGRC_+1=rzHO18jvc+TP5mmR=PjYvGgm=63NuA@mail.gmail.com |
---|---|
State | New |
Headers | show |
On Fri, Oct 10, 2014 at 5:40 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote: > Hi, > > The patch increase PARAM_MAX_COMPLETELY_PEELED_INSNS for CPUs with > high branch cost. > Bootstrap and make check are in progress. > The patch boosts (up to 2,5 times improve) several benchmarks compiled > with "-Ofast" on Silvermont > Spec2000: > +5% gain on 173.applu > +1% gain on 255.vortex > > Is it ok for trunk when pass bootstrap and make check? This is only a 20% increase - from 100 to 120. I would instead suggest to explore doing this change unconditionally if it helps that much. Richard. > Thanks, > Evgeny > > 2014-10-10 Evgeny Stupachenko <evstupac@gmail.com> > * config/i386/i386.c (ix86_option_override_internal): Increase > PARAM_MAX_COMPLETELY_PEELED_INSNS for CPUs with high branch cost. > * config/i386/i386.h (TARGET_HIGH_BRANCH_COST): New. > * config/i386/x86-tune.def (X86_TUNE_HIGH_BRANCH_COST): Indicates > CPUs with high branch cost. > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index 6337aa5..5ac10eb 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -4081,6 +4081,14 @@ ix86_option_override_internal (bool main_args_p, > opts->x_param_values, > opts_set->x_param_values); > > + /* Extend full peel max insns parameter for CPUs with high branch cost. */ > + if (TARGET_HIGH_BRANCH_COST) > + maybe_set_param_value (PARAM_MAX_COMPLETELY_PEELED_INSNS, > + 120, > + opts->x_param_values, > + opts_set->x_param_values); > + > + > /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful. 
*/ > if (opts->x_flag_prefetch_loop_arrays < 0 > && HAVE_prefetch > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > index 2c64162..da0c57b 100644 > --- a/gcc/config/i386/i386.h > +++ b/gcc/config/i386/i386.h > @@ -415,6 +415,7 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; > #define TARGET_INTER_UNIT_CONVERSIONS \ > ix86_tune_features[X86_TUNE_INTER_UNIT_CONVERSIONS] > #define TARGET_FOUR_JUMP_LIMIT ix86_tune_features[X86_TUNE_FOUR_JUMP_LIMIT] > +#define TARGET_HIGH_BRANCH_COST > ix86_tune_features[X86_TUNE_HIGH_BRANCH_COST] > #define TARGET_SCHEDULE ix86_tune_features[X86_TUNE_SCHEDULE] > #define TARGET_USE_BT ix86_tune_features[X86_TUNE_USE_BT] > #define TARGET_USE_INCDEC ix86_tune_features[X86_TUNE_USE_INCDEC] > diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def > index b6b210e..04d8bf8 100644 > --- a/gcc/config/i386/x86-tune.def > +++ b/gcc/config/i386/x86-tune.def > @@ -208,6 +208,11 @@ DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit", > m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_INTEL | > m_ATHLON_K8 | m_AMDFAM10) > > +/* X86_TUNE_HIGH_BRANCH_COST: Some CPUs have higher branch cost. This could be > + used to tune unroll, if-cvt, inline... heuristics. */ > +DEF_TUNE (X86_TUNE_HIGH_BRANCH_COST, "high_branch_cost", > + m_BONNELL | m_SILVERMONT | m_INTEL) > + > /*****************************************************************************/ > /* Integer instruction selection tuning */ > /*****************************************************************************/
I need to collect data from Haswell, but the patch should not help its performance much, just increase code size. On Mon, Oct 13, 2014 at 12:01 PM, Richard Biener <richard.guenther@gmail.com> wrote: > On Fri, Oct 10, 2014 at 5:40 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote: >> Hi, >> >> The patch increase PARAM_MAX_COMPLETELY_PEELED_INSNS for CPUs with >> high branch cost. >> Bootstrap and make check are in progress. >> The patch boosts (up to 2,5 times improve) several benchmarks compiled >> with "-Ofast" on Silvermont >> Spec2000: >> +5% gain on 173.applu >> +1% gain on 255.vortex >> >> Is it ok for trunk when pass bootstrap and make check? > > This is only a 20% increase - from 100 to 120. I would instead suggest > to explore doing this change unconditionally if it helps that much. > > Richard. > >> Thanks, >> Evgeny >> >> 2014-10-10 Evgeny Stupachenko <evstupac@gmail.com> >> * config/i386/i386.c (ix86_option_override_internal): Increase >> PARAM_MAX_COMPLETELY_PEELED_INSNS for CPUs with high branch cost. >> * config/i386/i386.h (TARGET_HIGH_BRANCH_COST): New. >> * config/i386/x86-tune.def (X86_TUNE_HIGH_BRANCH_COST): Indicates >> CPUs with high branch cost. >> >> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c >> index 6337aa5..5ac10eb 100644 >> --- a/gcc/config/i386/i386.c >> +++ b/gcc/config/i386/i386.c >> @@ -4081,6 +4081,14 @@ ix86_option_override_internal (bool main_args_p, >> opts->x_param_values, >> opts_set->x_param_values); >> >> + /* Extend full peel max insns parameter for CPUs with high branch cost. */ >> + if (TARGET_HIGH_BRANCH_COST) >> + maybe_set_param_value (PARAM_MAX_COMPLETELY_PEELED_INSNS, >> + 120, >> + opts->x_param_values, >> + opts_set->x_param_values); >> + >> + >> /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful. 
*/ >> if (opts->x_flag_prefetch_loop_arrays < 0 >> && HAVE_prefetch >> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h >> index 2c64162..da0c57b 100644 >> --- a/gcc/config/i386/i386.h >> +++ b/gcc/config/i386/i386.h >> @@ -415,6 +415,7 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; >> #define TARGET_INTER_UNIT_CONVERSIONS \ >> ix86_tune_features[X86_TUNE_INTER_UNIT_CONVERSIONS] >> #define TARGET_FOUR_JUMP_LIMIT ix86_tune_features[X86_TUNE_FOUR_JUMP_LIMIT] >> +#define TARGET_HIGH_BRANCH_COST >> ix86_tune_features[X86_TUNE_HIGH_BRANCH_COST] >> #define TARGET_SCHEDULE ix86_tune_features[X86_TUNE_SCHEDULE] >> #define TARGET_USE_BT ix86_tune_features[X86_TUNE_USE_BT] >> #define TARGET_USE_INCDEC ix86_tune_features[X86_TUNE_USE_INCDEC] >> diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def >> index b6b210e..04d8bf8 100644 >> --- a/gcc/config/i386/x86-tune.def >> +++ b/gcc/config/i386/x86-tune.def >> @@ -208,6 +208,11 @@ DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit", >> m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_INTEL | >> m_ATHLON_K8 | m_AMDFAM10) >> >> +/* X86_TUNE_HIGH_BRANCH_COST: Some CPUs have higher branch cost. This could be >> + used to tune unroll, if-cvt, inline... heuristics. */ >> +DEF_TUNE (X86_TUNE_HIGH_BRANCH_COST, "high_branch_cost", >> + m_BONNELL | m_SILVERMONT | m_INTEL) >> + >> /*****************************************************************************/ >> /* Integer instruction selection tuning */ >> /*****************************************************************************/
> On Fri, Oct 10, 2014 at 5:40 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote: > > Hi, > > > > The patch increase PARAM_MAX_COMPLETELY_PEELED_INSNS for CPUs with > > high branch cost. > > Bootstrap and make check are in progress. > > The patch boosts (up to 2,5 times improve) several benchmarks compiled > > with "-Ofast" on Silvermont > > Spec2000: > > +5% gain on 173.applu > > +1% gain on 255.vortex > > > > Is it ok for trunk when pass bootstrap and make check? > > This is only a 20% increase - from 100 to 120. I would instead suggest > to explore doing this change unconditionally if it helps that much. Agreed, I think the value of 100 was set a decade ago by Zdenek and me completely artificially. I do not recall any serious tuning of this flag. Note that I plan to update https://gcc.gnu.org/ml/gcc-patches/2013-11/msg02270.html to current tree so PARAM_MAX_COMPLETELY_PEELED_INSNS will be used at gimple level rather than tree, changing its meaning somewhat. Perhaps I could try to find time this or next week to update the patch so we do not need to do the tuning twice. Honza > > Richard. > > > Thanks, > > Evgeny > > > > 2014-10-10 Evgeny Stupachenko <evstupac@gmail.com> > > * config/i386/i386.c (ix86_option_override_internal): Increase > > PARAM_MAX_COMPLETELY_PEELED_INSNS for CPUs with high branch cost. > > * config/i386/i386.h (TARGET_HIGH_BRANCH_COST): New. > > * config/i386/x86-tune.def (X86_TUNE_HIGH_BRANCH_COST): Indicates > > CPUs with high branch cost. > > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > > index 6337aa5..5ac10eb 100644 > > --- a/gcc/config/i386/i386.c > > +++ b/gcc/config/i386/i386.c > > @@ -4081,6 +4081,14 @@ ix86_option_override_internal (bool main_args_p, > > opts->x_param_values, > > opts_set->x_param_values); > > > > + /* Extend full peel max insns parameter for CPUs with high branch cost. 
*/ > > + if (TARGET_HIGH_BRANCH_COST) > > + maybe_set_param_value (PARAM_MAX_COMPLETELY_PEELED_INSNS, > > + 120, > > + opts->x_param_values, > > + opts_set->x_param_values); > > + > > + > > /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful. */ > > if (opts->x_flag_prefetch_loop_arrays < 0 > > && HAVE_prefetch > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > > index 2c64162..da0c57b 100644 > > --- a/gcc/config/i386/i386.h > > +++ b/gcc/config/i386/i386.h > > @@ -415,6 +415,7 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; > > #define TARGET_INTER_UNIT_CONVERSIONS \ > > ix86_tune_features[X86_TUNE_INTER_UNIT_CONVERSIONS] > > #define TARGET_FOUR_JUMP_LIMIT ix86_tune_features[X86_TUNE_FOUR_JUMP_LIMIT] > > +#define TARGET_HIGH_BRANCH_COST > > ix86_tune_features[X86_TUNE_HIGH_BRANCH_COST] > > #define TARGET_SCHEDULE ix86_tune_features[X86_TUNE_SCHEDULE] > > #define TARGET_USE_BT ix86_tune_features[X86_TUNE_USE_BT] > > #define TARGET_USE_INCDEC ix86_tune_features[X86_TUNE_USE_INCDEC] > > diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def > > index b6b210e..04d8bf8 100644 > > --- a/gcc/config/i386/x86-tune.def > > +++ b/gcc/config/i386/x86-tune.def > > @@ -208,6 +208,11 @@ DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit", > > m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_INTEL | > > m_ATHLON_K8 | m_AMDFAM10) > > > > +/* X86_TUNE_HIGH_BRANCH_COST: Some CPUs have higher branch cost. This could be > > + used to tune unroll, if-cvt, inline... heuristics. */ > > +DEF_TUNE (X86_TUNE_HIGH_BRANCH_COST, "high_branch_cost", > > + m_BONNELL | m_SILVERMONT | m_INTEL) > > + > > /*****************************************************************************/ > > /* Integer instruction selection tuning */ > > /*****************************************************************************/
> Agreed, I think the value of 100 was set decade ago by Zdenek and me > completely artifically. I do not recall any serious tuning of this flag. Are you talking about PARAM_MAX_COMPLETELY_PEELED_INSNS here? If so, see: https://gcc.gnu.org/ml/gcc-patches/2012-11/msg01193.html We have experienced performance regressions because of this arbitrary change and bumped it back to 200 unconditionally.
I've measured spec2000 and spec2006 as well, and additionally EEMBC for Silvermont. The 100->120 change gives a gain for Silvermont; the results on Haswell are flat. On Fri, Oct 31, 2014 at 3:14 PM, Eric Botcazou <ebotcazou@adacore.com> wrote: >> Agreed, I think the value of 100 was set decade ago by Zdenek and me >> completely artifically. I do not recall any serious tuning of this flag. > > Are you talking bout PARAM_MAX_COMPLETELY_PEELED_INSNS here? If so, see: > https://gcc.gnu.org/ml/gcc-patches/2012-11/msg01193.html > > We have experienced performance regressions because of this arbitrary change > and bumped it back to 200 unconditionally. > > -- > Eric Botcazou
So are there any objections to enable this (PARAM_MAX_COMPLETELY_PEELED_INSNS increase from 100 to 120) for x86? On Fri, Oct 31, 2014 at 7:52 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote: > I've measured spec2000, spec2006 as well and EEMBC for Silvermont in addition. > 100->120 change gives gain for Silvermont, the results on Haswell are flat. > > On Fri, Oct 31, 2014 at 3:14 PM, Eric Botcazou <ebotcazou@adacore.com> wrote: >>> Agreed, I think the value of 100 was set decade ago by Zdenek and me >>> completely artifically. I do not recall any serious tuning of this flag. >> >> Are you talking bout PARAM_MAX_COMPLETELY_PEELED_INSNS here? If so, see: >> https://gcc.gnu.org/ml/gcc-patches/2012-11/msg01193.html >> >> We have experienced performance regressions because of this arbitrary change >> and bumped it back to 200 unconditionally. >> >> -- >> Eric Botcazou
150 and 200 make Silvermont performance better on 173.applu (+8%) and 183.equake (+3%); Haswell spec2006 performance stays almost unchanged. A higher value of 300 leaves the performance of the mentioned tests unchanged, but adds some regressions on other benchmarks. So I like 200 as well as 120 and 150, but can confirm performance gains only for x86. On Fri, Nov 7, 2014 at 6:37 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote: > So are there any objections to enable this > (PARAM_MAX_COMPLETELY_PEELED_INSNS increase from 100 to 120) for x86? > > On Fri, Oct 31, 2014 at 7:52 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote: >> I've measured spec2000, spec2006 as well and EEMBC for Silvermont in addition. >> 100->120 change gives gain for Silvermont, the results on Haswell are flat. >> >> On Fri, Oct 31, 2014 at 3:14 PM, Eric Botcazou <ebotcazou@adacore.com> wrote: >>>> Agreed, I think the value of 100 was set decade ago by Zdenek and me >>>> completely artifically. I do not recall any serious tuning of this flag. >>> >>> Are you talking bout PARAM_MAX_COMPLETELY_PEELED_INSNS here? If so, see: >>> https://gcc.gnu.org/ml/gcc-patches/2012-11/msg01193.html >>> >>> We have experienced performance regressions because of this arbitrary change >>> and bumped it back to 200 unconditionally. >>> >>> -- >>> Eric Botcazou
> 150 and 200 make Silvermont performance better on 173.applu (+8%) and > 183.equake (+3%); Haswell spec2006 performance stays almost unchanged. > Higher value of 300 leave the performance of mentioned tests > unchanged, but add some regressions on other benchmarks. > > So I like 200 as well as 120 and 150, but can confirm performance > gains only for x86. IMO it's either 150 or 200. We chose 200 for our 4.9-based compiler because this gave the performance boost without affecting the code size (on x86-64) and because this was previously 400, but it's your call.
> > 150 and 200 make Silvermont performance better on 173.applu (+8%) and > > 183.equake (+3%); Haswell spec2006 performance stays almost unchanged. > > Higher value of 300 leave the performance of mentioned tests > > unchanged, but add some regressions on other benchmarks. > > > > So I like 200 as well as 120 and 150, but can confirm performance > > gains only for x86. > > IMO it's either 150 or 200. We chose 200 for our 4.9-based compiler because > this gave the performance boost without affecting the code size (on x86-64) > and because this was previously 400, but it's your call. Both 150 and 200 globally work for me if there is not too much code size bloat (I did not see code size mentioned here). What I did before decreasing the bounds was strengthening the loop iteration count bounds and adding logic that predicts constant propagation enabled by unrolling. For this reason 400 became too large, as we did a lot more complete unrolling than before. Also, 400 in older compilers is not really 400 in newer ones. Because I saw performance drop only with values below 50, I went for 100. It would be very interesting to actually analyze what happens for those two benchmarks (that should not be too hard with perf). Honza
Code size for spec2000 is almost unchanged (many benchmarks have the same binaries). For those that are changed we have the following numbers (200 vs 100, both dynamic build -Ofast -funroll-loops -flto): 183.equake +10% 164.gzip, 173.applu +3,5% 187.facerec, 191.fma3d +2,5% 200.sixstrack +2% 177.mesa, 178.galgel +1% On Wed, Nov 12, 2014 at 2:51 AM, Jan Hubicka <hubicka@ucw.cz> wrote: >> > 150 and 200 make Silvermont performance better on 173.applu (+8%) and >> > 183.equake (+3%); Haswell spec2006 performance stays almost unchanged. >> > Higher value of 300 leave the performance of mentioned tests >> > unchanged, but add some regressions on other benchmarks. >> > >> > So I like 200 as well as 120 and 150, but can confirm performance >> > gains only for x86. >> >> IMO it's either 150 or 200. We chose 200 for our 4.9-based compiler because >> this gave the performance boost without affecting the code size (on x86-64) >> and because this was previously 400, but it's your call. > > Both 150 or 200 globally work for me if there is not too much of code size > bloat (did not see code size mentioned here). > > What I did before decreasing the bounds was strenghtening the loop iteraton > count bounds and adding logic the predicts constant propagation enabled by > unrolling. For this reason 400 became too large as we did a lot more complete > unrolling than before. Also 400 in older compilers is not really 400 in newer. > > Because I saw performance to drop only with values bellow 50, I went for 100. > It would be very interesting to actually analyze what happends for those two > benchmarks (that should not be too hard with perf). > > Honza
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 6337aa5..5ac10eb 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -4081,6 +4081,14 @@ ix86_option_override_internal (bool main_args_p, opts->x_param_values, opts_set->x_param_values); + /* Extend full peel max insns parameter for CPUs with high branch cost. */ + if (TARGET_HIGH_BRANCH_COST) + maybe_set_param_value (PARAM_MAX_COMPLETELY_PEELED_INSNS, + 120, + opts->x_param_values, + opts_set->x_param_values); + + /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful. */ if (opts->x_flag_prefetch_loop_arrays < 0 && HAVE_prefetch diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 2c64162..da0c57b 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -415,6 +415,7 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; #define TARGET_INTER_UNIT_CONVERSIONS \ ix86_tune_features[X86_TUNE_INTER_UNIT_CONVERSIONS] #define TARGET_FOUR_JUMP_LIMIT ix86_tune_features[X86_TUNE_FOUR_JUMP_LIMIT] +#define TARGET_HIGH_BRANCH_COST ix86_tune_features[X86_TUNE_HIGH_BRANCH_COST] #define TARGET_SCHEDULE ix86_tune_features[X86_TUNE_SCHEDULE] #define TARGET_USE_BT ix86_tune_features[X86_TUNE_USE_BT] #define TARGET_USE_INCDEC ix86_tune_features[X86_TUNE_USE_INCDEC] diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index b6b210e..04d8bf8 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -208,6 +208,11 @@ DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit", m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_INTEL | m_ATHLON_K8 | m_AMDFAM10) +/* X86_TUNE_HIGH_BRANCH_COST: Some CPUs have higher branch cost. This could be + used to tune unroll, if-cvt, inline... heuristics. 
*/ +DEF_TUNE (X86_TUNE_HIGH_BRANCH_COST, "high_branch_cost", + m_BONNELL | m_SILVERMONT | m_INTEL) + /*****************************************************************************/ /* Integer instruction selection tuning */ /*****************************************************************************/