Message ID | d9230de2-d3d3-c960-f39a-4f81b6a094bc@linux.ibm.com |
---|---|
State | New |
Headers | show |
Series | [v2] rs6000: Don't use optimize_function_for_speed_p too early [PR108184] | expand |
Hi, Gentle ping this: https://gcc.gnu.org/pipermail/gcc-patches/2023-January/609993.html BR, Kewen on 2023/1/16 17:08, Kewen.Lin via Gcc-patches wrote: > Hi, > > As Honza pointed out in [1], the current uses of function > optimize_function_for_speed_p in rs6000_option_override_internal > are too early, since the query results from the functions > optimize_function_for_{speed,size}_p could be changed later due > to profile feedback and some function attributes handlings etc. > > This patch is to move optimize_function_for_speed_p to all the > use places of the corresponding flags, which follows the existing > practices. Maybe we can cache it somewhere at an appropriate > timing, but that's another thing. > > Comparing with v1[2], this version added one test case for > SAVE_TOC_INDIRECT as Segher questioned and suggested, and it > also considered the possibility of explicit option (see test > cases pr108184-2.c and pr108184-4.c). I believe that excepting > for the intentional change on optimize_function_for_{speed, > size}_p, there is no other function change. > > [1] https://gcc.gnu.org/pipermail/gcc-patches/2022-November/607527.html > [2] https://gcc.gnu.org/pipermail/gcc-patches/2023-January/609379.html > > Bootstrapped and regtested on powerpc64-linux-gnu P8, > powerpc64le-linux-gnu P{9,10} and powerpc-ibm-aix. > > Is it ok for trunk? > > BR, > Kewen > ----- > gcc/ChangeLog: > > * config/rs6000/rs6000.cc (rs6000_option_override_internal): Remove > all optimize_function_for_speed_p uses. > (fusion_gpr_load_p): Call optimize_function_for_speed_p along > with TARGET_P8_FUSION_SIGN. > (expand_fusion_gpr_load): Likewise. > (rs6000_call_aix): Call optimize_function_for_speed_p along with > TARGET_SAVE_TOC_INDIRECT. > * config/rs6000/predicates.md (fusion_gpr_mem_load): Call > optimize_function_for_speed_p along with TARGET_P8_FUSION_SIGN. > > gcc/testsuite/ChangeLog: > > * gcc.target/powerpc/pr108184-1.c: New test. > * gcc.target/powerpc/pr108184-2.c: New test. > * gcc.target/powerpc/pr108184-3.c: New test. > * gcc.target/powerpc/pr108184-4.c: New test. > --- > gcc/config/rs6000/predicates.md | 5 +++- > gcc/config/rs6000/rs6000.cc | 19 +++++++++----- > gcc/testsuite/gcc.target/powerpc/pr108184-1.c | 16 ++++++++++++ > gcc/testsuite/gcc.target/powerpc/pr108184-2.c | 15 +++++++++++ > gcc/testsuite/gcc.target/powerpc/pr108184-3.c | 25 +++++++++++++++++++ > gcc/testsuite/gcc.target/powerpc/pr108184-4.c | 24 ++++++++++++++++++ > 6 files changed, 97 insertions(+), 7 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-1.c > create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-2.c > create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-3.c > create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-4.c > > diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md > index a1764018545..9f84468db84 100644 > --- a/gcc/config/rs6000/predicates.md > +++ b/gcc/config/rs6000/predicates.md > @@ -1878,7 +1878,10 @@ (define_predicate "fusion_gpr_mem_load" > > /* Handle sign/zero extend. */ > if (GET_CODE (op) == ZERO_EXTEND > - || (TARGET_P8_FUSION_SIGN && GET_CODE (op) == SIGN_EXTEND)) > + || (TARGET_P8_FUSION_SIGN > + && GET_CODE (op) == SIGN_EXTEND > + && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN > + || optimize_function_for_speed_p (cfun)))) > { > op = XEXP (op, 0); > mode = GET_MODE (op); > diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc > index 6ac3adcec6b..f47d21980a9 100644 > --- a/gcc/config/rs6000/rs6000.cc > +++ b/gcc/config/rs6000/rs6000.cc > @@ -3997,8 +3997,7 @@ rs6000_option_override_internal (bool global_init_p) > /* If we can shrink-wrap the TOC register save separately, then use > -msave-toc-indirect unless explicitly disabled. */ > if ((rs6000_isa_flags_explicit & OPTION_MASK_SAVE_TOC_INDIRECT) == 0 > - && flag_shrink_wrap_separate > - && optimize_function_for_speed_p (cfun)) > + && flag_shrink_wrap_separate) > rs6000_isa_flags |= OPTION_MASK_SAVE_TOC_INDIRECT; > > /* Enable power8 fusion if we are tuning for power8, even if we aren't > @@ -4032,7 +4031,6 @@ rs6000_option_override_internal (bool global_init_p) > zero extending load, and an explicit sign extension. */ > if (TARGET_P8_FUSION > && !(rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN) > - && optimize_function_for_speed_p (cfun) > && optimize >= 3) > rs6000_isa_flags |= OPTION_MASK_P8_FUSION_SIGN; > > @@ -25690,7 +25688,10 @@ rs6000_call_aix (rtx value, rtx func_desc, rtx tlsarg, rtx cookie) > > /* Can we optimize saving the TOC in the prologue or > do we need to do it at every call? */ > - if (TARGET_SAVE_TOC_INDIRECT && !cfun->calls_alloca) > + if (TARGET_SAVE_TOC_INDIRECT > + && !cfun->calls_alloca > + && (rs6000_isa_flags_explicit & OPTION_MASK_SAVE_TOC_INDIRECT > + || optimize_function_for_speed_p (cfun))) > cfun->machine->save_toc_in_prologue = true; > else > { > @@ -27557,7 +27558,10 @@ fusion_gpr_load_p (rtx addis_reg, /* register set via addis. */ > > /* Allow sign/zero extension. */ > if (GET_CODE (mem) == ZERO_EXTEND > - || (GET_CODE (mem) == SIGN_EXTEND && TARGET_P8_FUSION_SIGN)) > + || (GET_CODE (mem) == SIGN_EXTEND > + && TARGET_P8_FUSION_SIGN > + && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN > + || optimize_function_for_speed_p (cfun)))) > mem = XEXP (mem, 0); > > if (!MEM_P (mem)) > @@ -27621,7 +27625,10 @@ expand_fusion_gpr_load (rtx *operands) > enum rtx_code extend = UNKNOWN; > > if (GET_CODE (orig_mem) == ZERO_EXTEND > - || (TARGET_P8_FUSION_SIGN && GET_CODE (orig_mem) == SIGN_EXTEND)) > + || (TARGET_P8_FUSION_SIGN > + && GET_CODE (orig_mem) == SIGN_EXTEND > + && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN > + || optimize_function_for_speed_p (cfun)))) > { > extend = GET_CODE (orig_mem); > orig_mem = XEXP (orig_mem, 0); > diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-1.c b/gcc/testsuite/gcc.target/powerpc/pr108184-1.c > new file mode 100644 > index 00000000000..eae3bb9b7c4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-1.c > @@ -0,0 +1,16 @@ > +/* Only possible to fuse sign extended loads with the addis when > + optimize >= 3 and Power8 fusion takes effects. -mno-prefixed > + ensures test point isn't broken (due to prefixed plha). */ > +/* { dg-options "-mdejagnu-tune=power8 -O3 -mno-prefixed" } */ > + > +/* Verify it doesn't optimize this function for speed as it's cold, > + by checking it doesn't try to fuse sign extended loads with addis, > + which results in a zero extended load and a sign extension. */ > + > +__attribute__ ((cold)) int > +fusion_short (short *p) > +{ > + return p[0x12345]; > +} > + > +/* { dg-final { scan-assembler-not {\mextsh\M} } } */ > diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-2.c b/gcc/testsuite/gcc.target/powerpc/pr108184-2.c > new file mode 100644 > index 00000000000..9f703f7fc6b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-2.c > @@ -0,0 +1,15 @@ > +/* -mno-prefixed ensures test point isn't broken (due to prefixed plha). */ > +/* { dg-options "-mdejagnu-tune=power8 -O2 -mpower8-fusion-sign -mno-prefixed" } */ > + > +/* Verify the explicit option -mpower8-fusion-sign can still fuse > + sign extended loads with addis, which results in a zero extended > + load and a sign extension. It means the cold attribute which > + indicates to optimize for size doesn't affect. */ > + > +__attribute__ ((cold)) int > +fusion_short (short *p) > +{ > + return p[0x12345]; > +} > + > +/* { dg-final { scan-assembler-times {\mextsh\M} 1 } } */ > diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-3.c b/gcc/testsuite/gcc.target/powerpc/pr108184-3.c > new file mode 100644 > index 00000000000..ceaa96e4421 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-3.c > @@ -0,0 +1,25 @@ > +/* { dg-do compile { target { powerpc*-*-aix* powerpc*-*-linux* } } } */ > +/* { dg-require-effective-target fpic } */ > +/* { dg-options "-fpic -mno-pcrel -O2" } */ > + > +/* Verify it doesn't imply -msave-toc-indirect, so > + it doesn't take effect and we have two separated > + toc savings for these two long calls. */ > + > +void foo (void) __attribute__((__longcall__)); > +int baz (void) __attribute__((__longcall__)); > + > +__attribute__ ((cold)) int > +bar (void) > +{ > + foo (); > + return baz () + 1; > +} > + > +/* Linux LE */ > +/* { dg-final { scan-assembler-times {\mstd 2,24\(1\)} 2 { target powerpc_elfv2 } } } */ > +/* Linux BE 64 bit or AIX 64 bit */ > +/* { dg-final { scan-assembler-times {\mstd 2,40\(1\)} 2 { target { lp64 && { ! powerpc_elfv2 } } } } } */ > +/* AIX 32 bit */ > +/* { dg-final { scan-assembler-times {\mstw 2,20\(1\)} 2 { target { ilp32 && powerpc*-*-aix* } } } } */ > + > diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-4.c b/gcc/testsuite/gcc.target/powerpc/pr108184-4.c > new file mode 100644 > index 00000000000..2a70a603047 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-4.c > @@ -0,0 +1,24 @@ > +/* { dg-do compile { target { powerpc*-*-aix* powerpc*-*-linux* } } } */ > +/* { dg-require-effective-target fpic } */ > +/* { dg-options "-fpic -mno-pcrel -O2 -msave-toc-indirect" } */ > + > +/* Verify the explicit -msave-toc-indirect always > + takes effect, so there is only one toc saving. */ > + > +void foo (void) __attribute__((__longcall__)); > +int baz (void) __attribute__((__longcall__)); > + > +__attribute__ ((cold)) int > +bar (void) > +{ > + foo (); > + return baz () + 1; > +} > + > +/* Linux LE */ > +/* { dg-final { scan-assembler-times {\mstd 2,24\(1\)} 1 { target powerpc_elfv2 } } } */ > +/* Linux BE 64 bit or AIX 64 bit */ > +/* { dg-final { scan-assembler-times {\mstd 2,40\(1\)} 1 { target { lp64 && { ! powerpc_elfv2 } } } } } */ > +/* AIX 32 bit */ > +/* { dg-final { scan-assembler-times {\mstw 2,20\(1\)} 1 { target { ilp32 && powerpc*-*-aix* } } } } */ > + > -- > 2.37.0
Hi, Gentle ping this: https://gcc.gnu.org/pipermail/gcc-patches/2023-January/609993.html BR, Kewen > on 2023/1/16 17:08, Kewen.Lin via Gcc-patches wrote: >> Hi, >> >> As Honza pointed out in [1], the current uses of function >> optimize_function_for_speed_p in rs6000_option_override_internal >> are too early, since the query results from the functions >> optimize_function_for_{speed,size}_p could be changed later due >> to profile feedback and some function attributes handlings etc. >> >> This patch is to move optimize_function_for_speed_p to all the >> use places of the corresponding flags, which follows the existing >> practices. Maybe we can cache it somewhere at an appropriate >> timing, but that's another thing. >> >> Comparing with v1[2], this version added one test case for >> SAVE_TOC_INDIRECT as Segher questioned and suggested, and it >> also considered the possibility of explicit option (see test >> cases pr108184-2.c and pr108184-4.c). I believe that excepting >> for the intentional change on optimize_function_for_{speed, >> size}_p, there is no other function change. >> >> [1] https://gcc.gnu.org/pipermail/gcc-patches/2022-November/607527.html >> [2] https://gcc.gnu.org/pipermail/gcc-patches/2023-January/609379.html >> >> Bootstrapped and regtested on powerpc64-linux-gnu P8, >> powerpc64le-linux-gnu P{9,10} and powerpc-ibm-aix. >> >> Is it ok for trunk? >> >> BR, >> Kewen >> ----- >> gcc/ChangeLog: >> >> * config/rs6000/rs6000.cc (rs6000_option_override_internal): Remove >> all optimize_function_for_speed_p uses. >> (fusion_gpr_load_p): Call optimize_function_for_speed_p along >> with TARGET_P8_FUSION_SIGN. >> (expand_fusion_gpr_load): Likewise. >> (rs6000_call_aix): Call optimize_function_for_speed_p along with >> TARGET_SAVE_TOC_INDIRECT. >> * config/rs6000/predicates.md (fusion_gpr_mem_load): Call >> optimize_function_for_speed_p along with TARGET_P8_FUSION_SIGN. >> >> gcc/testsuite/ChangeLog: >> >> * gcc.target/powerpc/pr108184-1.c: New test. >> * gcc.target/powerpc/pr108184-2.c: New test. >> * gcc.target/powerpc/pr108184-3.c: New test. >> * gcc.target/powerpc/pr108184-4.c: New test. >> --- >> gcc/config/rs6000/predicates.md | 5 +++- >> gcc/config/rs6000/rs6000.cc | 19 +++++++++----- >> gcc/testsuite/gcc.target/powerpc/pr108184-1.c | 16 ++++++++++++ >> gcc/testsuite/gcc.target/powerpc/pr108184-2.c | 15 +++++++++++ >> gcc/testsuite/gcc.target/powerpc/pr108184-3.c | 25 +++++++++++++++++++ >> gcc/testsuite/gcc.target/powerpc/pr108184-4.c | 24 ++++++++++++++++++ >> 6 files changed, 97 insertions(+), 7 deletions(-) >> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-1.c >> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-2.c >> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-3.c >> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-4.c >> >> diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md >> index a1764018545..9f84468db84 100644 >> --- a/gcc/config/rs6000/predicates.md >> +++ b/gcc/config/rs6000/predicates.md >> @@ -1878,7 +1878,10 @@ (define_predicate "fusion_gpr_mem_load" >> >> /* Handle sign/zero extend. */ >> if (GET_CODE (op) == ZERO_EXTEND >> - || (TARGET_P8_FUSION_SIGN && GET_CODE (op) == SIGN_EXTEND)) >> + || (TARGET_P8_FUSION_SIGN >> + && GET_CODE (op) == SIGN_EXTEND >> + && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN >> + || optimize_function_for_speed_p (cfun)))) >> { >> op = XEXP (op, 0); >> mode = GET_MODE (op); >> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc >> index 6ac3adcec6b..f47d21980a9 100644 >> --- a/gcc/config/rs6000/rs6000.cc >> +++ b/gcc/config/rs6000/rs6000.cc >> @@ -3997,8 +3997,7 @@ rs6000_option_override_internal (bool global_init_p) >> /* If we can shrink-wrap the TOC register save separately, then use >> -msave-toc-indirect unless explicitly disabled. */ >> if ((rs6000_isa_flags_explicit & OPTION_MASK_SAVE_TOC_INDIRECT) == 0 >> - && flag_shrink_wrap_separate >> - && optimize_function_for_speed_p (cfun)) >> + && flag_shrink_wrap_separate) >> rs6000_isa_flags |= OPTION_MASK_SAVE_TOC_INDIRECT; >> >> /* Enable power8 fusion if we are tuning for power8, even if we aren't >> @@ -4032,7 +4031,6 @@ rs6000_option_override_internal (bool global_init_p) >> zero extending load, and an explicit sign extension. */ >> if (TARGET_P8_FUSION >> && !(rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN) >> - && optimize_function_for_speed_p (cfun) >> && optimize >= 3) >> rs6000_isa_flags |= OPTION_MASK_P8_FUSION_SIGN; >> >> @@ -25690,7 +25688,10 @@ rs6000_call_aix (rtx value, rtx func_desc, rtx tlsarg, rtx cookie) >> >> /* Can we optimize saving the TOC in the prologue or >> do we need to do it at every call? */ >> - if (TARGET_SAVE_TOC_INDIRECT && !cfun->calls_alloca) >> + if (TARGET_SAVE_TOC_INDIRECT >> + && !cfun->calls_alloca >> + && (rs6000_isa_flags_explicit & OPTION_MASK_SAVE_TOC_INDIRECT >> + || optimize_function_for_speed_p (cfun))) >> cfun->machine->save_toc_in_prologue = true; >> else >> { >> @@ -27557,7 +27558,10 @@ fusion_gpr_load_p (rtx addis_reg, /* register set via addis. */ >> >> /* Allow sign/zero extension. */ >> if (GET_CODE (mem) == ZERO_EXTEND >> - || (GET_CODE (mem) == SIGN_EXTEND && TARGET_P8_FUSION_SIGN)) >> + || (GET_CODE (mem) == SIGN_EXTEND >> + && TARGET_P8_FUSION_SIGN >> + && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN >> + || optimize_function_for_speed_p (cfun)))) >> mem = XEXP (mem, 0); >> >> if (!MEM_P (mem)) >> @@ -27621,7 +27625,10 @@ expand_fusion_gpr_load (rtx *operands) >> enum rtx_code extend = UNKNOWN; >> >> if (GET_CODE (orig_mem) == ZERO_EXTEND >> - || (TARGET_P8_FUSION_SIGN && GET_CODE (orig_mem) == SIGN_EXTEND)) >> + || (TARGET_P8_FUSION_SIGN >> + && GET_CODE (orig_mem) == SIGN_EXTEND >> + && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN >> + || optimize_function_for_speed_p (cfun)))) >> { >> extend = GET_CODE (orig_mem); >> orig_mem = XEXP (orig_mem, 0); >> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-1.c b/gcc/testsuite/gcc.target/powerpc/pr108184-1.c >> new file mode 100644 >> index 00000000000..eae3bb9b7c4 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-1.c >> @@ -0,0 +1,16 @@ >> +/* Only possible to fuse sign extended loads with the addis when >> + optimize >= 3 and Power8 fusion takes effects. -mno-prefixed >> + ensures test point isn't broken (due to prefixed plha). */ >> +/* { dg-options "-mdejagnu-tune=power8 -O3 -mno-prefixed" } */ >> + >> +/* Verify it doesn't optimize this function for speed as it's cold, >> + by checking it doesn't try to fuse sign extended loads with addis, >> + which results in a zero extended load and a sign extension. */ >> + >> +__attribute__ ((cold)) int >> +fusion_short (short *p) >> +{ >> + return p[0x12345]; >> +} >> + >> +/* { dg-final { scan-assembler-not {\mextsh\M} } } */ >> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-2.c b/gcc/testsuite/gcc.target/powerpc/pr108184-2.c >> new file mode 100644 >> index 00000000000..9f703f7fc6b >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-2.c >> @@ -0,0 +1,15 @@ >> +/* -mno-prefixed ensures test point isn't broken (due to prefixed plha). */ >> +/* { dg-options "-mdejagnu-tune=power8 -O2 -mpower8-fusion-sign -mno-prefixed" } */ >> + >> +/* Verify the explicit option -mpower8-fusion-sign can still fuse >> + sign extended loads with addis, which results in a zero extended >> + load and a sign extension. It means the cold attribute which >> + indicates to optimize for size doesn't affect. */ >> + >> +__attribute__ ((cold)) int >> +fusion_short (short *p) >> +{ >> + return p[0x12345]; >> +} >> + >> +/* { dg-final { scan-assembler-times {\mextsh\M} 1 } } */ >> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-3.c b/gcc/testsuite/gcc.target/powerpc/pr108184-3.c >> new file mode 100644 >> index 00000000000..ceaa96e4421 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-3.c >> @@ -0,0 +1,25 @@ >> +/* { dg-do compile { target { powerpc*-*-aix* powerpc*-*-linux* } } } */ >> +/* { dg-require-effective-target fpic } */ >> +/* { dg-options "-fpic -mno-pcrel -O2" } */ >> + >> +/* Verify it doesn't imply -msave-toc-indirect, so >> + it doesn't take effect and we have two separated >> + toc savings for these two long calls. */ >> + >> +void foo (void) __attribute__((__longcall__)); >> +int baz (void) __attribute__((__longcall__)); >> + >> +__attribute__ ((cold)) int >> +bar (void) >> +{ >> + foo (); >> + return baz () + 1; >> +} >> + >> +/* Linux LE */ >> +/* { dg-final { scan-assembler-times {\mstd 2,24\(1\)} 2 { target powerpc_elfv2 } } } */ >> +/* Linux BE 64 bit or AIX 64 bit */ >> +/* { dg-final { scan-assembler-times {\mstd 2,40\(1\)} 2 { target { lp64 && { ! powerpc_elfv2 } } } } } */ >> +/* AIX 32 bit */ >> +/* { dg-final { scan-assembler-times {\mstw 2,20\(1\)} 2 { target { ilp32 && powerpc*-*-aix* } } } } */ >> + >> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-4.c b/gcc/testsuite/gcc.target/powerpc/pr108184-4.c >> new file mode 100644 >> index 00000000000..2a70a603047 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-4.c >> @@ -0,0 +1,24 @@ >> +/* { dg-do compile { target { powerpc*-*-aix* powerpc*-*-linux* } } } */ >> +/* { dg-require-effective-target fpic } */ >> +/* { dg-options "-fpic -mno-pcrel -O2 -msave-toc-indirect" } */ >> + >> +/* Verify the explicit -msave-toc-indirect always >> + takes effect, so there is only one toc saving. */ >> + >> +void foo (void) __attribute__((__longcall__)); >> +int baz (void) __attribute__((__longcall__)); >> + >> +__attribute__ ((cold)) int >> +bar (void) >> +{ >> + foo (); >> + return baz () + 1; >> +} >> + >> +/* Linux LE */ >> +/* { dg-final { scan-assembler-times {\mstd 2,24\(1\)} 1 { target powerpc_elfv2 } } } */ >> +/* Linux BE 64 bit or AIX 64 bit */ >> +/* { dg-final { scan-assembler-times {\mstd 2,40\(1\)} 1 { target { lp64 && { ! powerpc_elfv2 } } } } } */ >> +/* AIX 32 bit */ >> +/* { dg-final { scan-assembler-times {\mstw 2,20\(1\)} 1 { target { ilp32 && powerpc*-*-aix* } } } } */ >> + >> -- >> 2.37.0 > BR, Kewen
Hi, Gentle ping this: https://gcc.gnu.org/pipermail/gcc-patches/2023-January/609993.html BR, Kewen >> on 2023/1/16 17:08, Kewen.Lin via Gcc-patches wrote: >>> Hi, >>> >>> As Honza pointed out in [1], the current uses of function >>> optimize_function_for_speed_p in rs6000_option_override_internal >>> are too early, since the query results from the functions >>> optimize_function_for_{speed,size}_p could be changed later due >>> to profile feedback and some function attributes handlings etc. >>> >>> This patch is to move optimize_function_for_speed_p to all the >>> use places of the corresponding flags, which follows the existing >>> practices. Maybe we can cache it somewhere at an appropriate >>> timing, but that's another thing. >>> >>> Comparing with v1[2], this version added one test case for >>> SAVE_TOC_INDIRECT as Segher questioned and suggested, and it >>> also considered the possibility of explicit option (see test >>> cases pr108184-2.c and pr108184-4.c). I believe that excepting >>> for the intentional change on optimize_function_for_{speed, >>> size}_p, there is no other function change. >>> >>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2022-November/607527.html >>> [2] https://gcc.gnu.org/pipermail/gcc-patches/2023-January/609379.html >>> >>> Bootstrapped and regtested on powerpc64-linux-gnu P8, >>> powerpc64le-linux-gnu P{9,10} and powerpc-ibm-aix. >>> >>> Is it ok for trunk? >>> >>> BR, >>> Kewen >>> ----- >>> gcc/ChangeLog: >>> >>> * config/rs6000/rs6000.cc (rs6000_option_override_internal): Remove >>> all optimize_function_for_speed_p uses. >>> (fusion_gpr_load_p): Call optimize_function_for_speed_p along >>> with TARGET_P8_FUSION_SIGN. >>> (expand_fusion_gpr_load): Likewise. >>> (rs6000_call_aix): Call optimize_function_for_speed_p along with >>> TARGET_SAVE_TOC_INDIRECT. >>> * config/rs6000/predicates.md (fusion_gpr_mem_load): Call >>> optimize_function_for_speed_p along with TARGET_P8_FUSION_SIGN. >>> >>> gcc/testsuite/ChangeLog: >>> >>> * gcc.target/powerpc/pr108184-1.c: New test. >>> * gcc.target/powerpc/pr108184-2.c: New test. >>> * gcc.target/powerpc/pr108184-3.c: New test. >>> * gcc.target/powerpc/pr108184-4.c: New test. >>> --- >>> gcc/config/rs6000/predicates.md | 5 +++- >>> gcc/config/rs6000/rs6000.cc | 19 +++++++++----- >>> gcc/testsuite/gcc.target/powerpc/pr108184-1.c | 16 ++++++++++++ >>> gcc/testsuite/gcc.target/powerpc/pr108184-2.c | 15 +++++++++++ >>> gcc/testsuite/gcc.target/powerpc/pr108184-3.c | 25 +++++++++++++++++++ >>> gcc/testsuite/gcc.target/powerpc/pr108184-4.c | 24 ++++++++++++++++++ >>> 6 files changed, 97 insertions(+), 7 deletions(-) >>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-1.c >>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-2.c >>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-3.c >>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-4.c >>> >>> diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md >>> index a1764018545..9f84468db84 100644 >>> --- a/gcc/config/rs6000/predicates.md >>> +++ b/gcc/config/rs6000/predicates.md >>> @@ -1878,7 +1878,10 @@ (define_predicate "fusion_gpr_mem_load" >>> >>> /* Handle sign/zero extend. */ >>> if (GET_CODE (op) == ZERO_EXTEND >>> - || (TARGET_P8_FUSION_SIGN && GET_CODE (op) == SIGN_EXTEND)) >>> + || (TARGET_P8_FUSION_SIGN >>> + && GET_CODE (op) == SIGN_EXTEND >>> + && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN >>> + || optimize_function_for_speed_p (cfun)))) >>> { >>> op = XEXP (op, 0); >>> mode = GET_MODE (op); >>> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc >>> index 6ac3adcec6b..f47d21980a9 100644 >>> --- a/gcc/config/rs6000/rs6000.cc >>> +++ b/gcc/config/rs6000/rs6000.cc >>> @@ -3997,8 +3997,7 @@ rs6000_option_override_internal (bool global_init_p) >>> /* If we can shrink-wrap the TOC register save separately, then use >>> -msave-toc-indirect unless explicitly disabled. */ >>> if ((rs6000_isa_flags_explicit & OPTION_MASK_SAVE_TOC_INDIRECT) == 0 >>> - && flag_shrink_wrap_separate >>> - && optimize_function_for_speed_p (cfun)) >>> + && flag_shrink_wrap_separate) >>> rs6000_isa_flags |= OPTION_MASK_SAVE_TOC_INDIRECT; >>> >>> /* Enable power8 fusion if we are tuning for power8, even if we aren't >>> @@ -4032,7 +4031,6 @@ rs6000_option_override_internal (bool global_init_p) >>> zero extending load, and an explicit sign extension. */ >>> if (TARGET_P8_FUSION >>> && !(rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN) >>> - && optimize_function_for_speed_p (cfun) >>> && optimize >= 3) >>> rs6000_isa_flags |= OPTION_MASK_P8_FUSION_SIGN; >>> >>> @@ -25690,7 +25688,10 @@ rs6000_call_aix (rtx value, rtx func_desc, rtx tlsarg, rtx cookie) >>> >>> /* Can we optimize saving the TOC in the prologue or >>> do we need to do it at every call? */ >>> - if (TARGET_SAVE_TOC_INDIRECT && !cfun->calls_alloca) >>> + if (TARGET_SAVE_TOC_INDIRECT >>> + && !cfun->calls_alloca >>> + && (rs6000_isa_flags_explicit & OPTION_MASK_SAVE_TOC_INDIRECT >>> + || optimize_function_for_speed_p (cfun))) >>> cfun->machine->save_toc_in_prologue = true; >>> else >>> { >>> @@ -27557,7 +27558,10 @@ fusion_gpr_load_p (rtx addis_reg, /* register set via addis. */ >>> >>> /* Allow sign/zero extension. */ >>> if (GET_CODE (mem) == ZERO_EXTEND >>> - || (GET_CODE (mem) == SIGN_EXTEND && TARGET_P8_FUSION_SIGN)) >>> + || (GET_CODE (mem) == SIGN_EXTEND >>> + && TARGET_P8_FUSION_SIGN >>> + && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN >>> + || optimize_function_for_speed_p (cfun)))) >>> mem = XEXP (mem, 0); >>> >>> if (!MEM_P (mem)) >>> @@ -27621,7 +27625,10 @@ expand_fusion_gpr_load (rtx *operands) >>> enum rtx_code extend = UNKNOWN; >>> >>> if (GET_CODE (orig_mem) == ZERO_EXTEND >>> - || (TARGET_P8_FUSION_SIGN && GET_CODE (orig_mem) == SIGN_EXTEND)) >>> + || (TARGET_P8_FUSION_SIGN >>> + && GET_CODE (orig_mem) == SIGN_EXTEND >>> + && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN >>> + || optimize_function_for_speed_p (cfun)))) >>> { >>> extend = GET_CODE (orig_mem); >>> orig_mem = XEXP (orig_mem, 0); >>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-1.c b/gcc/testsuite/gcc.target/powerpc/pr108184-1.c >>> new file mode 100644 >>> index 00000000000..eae3bb9b7c4 >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-1.c >>> @@ -0,0 +1,16 @@ >>> +/* Only possible to fuse sign extended loads with the addis when >>> + optimize >= 3 and Power8 fusion takes effects. -mno-prefixed >>> + ensures test point isn't broken (due to prefixed plha). */ >>> +/* { dg-options "-mdejagnu-tune=power8 -O3 -mno-prefixed" } */ >>> + >>> +/* Verify it doesn't optimize this function for speed as it's cold, >>> + by checking it doesn't try to fuse sign extended loads with addis, >>> + which results in a zero extended load and a sign extension. */ >>> + >>> +__attribute__ ((cold)) int >>> +fusion_short (short *p) >>> +{ >>> + return p[0x12345]; >>> +} >>> + >>> +/* { dg-final { scan-assembler-not {\mextsh\M} } } */ >>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-2.c b/gcc/testsuite/gcc.target/powerpc/pr108184-2.c >>> new file mode 100644 >>> index 00000000000..9f703f7fc6b >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-2.c >>> @@ -0,0 +1,15 @@ >>> +/* -mno-prefixed ensures test point isn't broken (due to prefixed plha). */ >>> +/* { dg-options "-mdejagnu-tune=power8 -O2 -mpower8-fusion-sign -mno-prefixed" } */ >>> + >>> +/* Verify the explicit option -mpower8-fusion-sign can still fuse >>> + sign extended loads with addis, which results in a zero extended >>> + load and a sign extension. It means the cold attribute which >>> + indicates to optimize for size doesn't affect. */ >>> + >>> +__attribute__ ((cold)) int >>> +fusion_short (short *p) >>> +{ >>> + return p[0x12345]; >>> +} >>> + >>> +/* { dg-final { scan-assembler-times {\mextsh\M} 1 } } */ >>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-3.c b/gcc/testsuite/gcc.target/powerpc/pr108184-3.c >>> new file mode 100644 >>> index 00000000000..ceaa96e4421 >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-3.c >>> @@ -0,0 +1,25 @@ >>> +/* { dg-do compile { target { powerpc*-*-aix* powerpc*-*-linux* } } } */ >>> +/* { dg-require-effective-target fpic } */ >>> +/* { dg-options "-fpic -mno-pcrel -O2" } */ >>> + >>> +/* Verify it doesn't imply -msave-toc-indirect, so >>> + it doesn't take effect and we have two separated >>> + toc savings for these two long calls. */ >>> + >>> +void foo (void) __attribute__((__longcall__)); >>> +int baz (void) __attribute__((__longcall__)); >>> + >>> +__attribute__ ((cold)) int >>> +bar (void) >>> +{ >>> + foo (); >>> + return baz () + 1; >>> +} >>> + >>> +/* Linux LE */ >>> +/* { dg-final { scan-assembler-times {\mstd 2,24\(1\)} 2 { target powerpc_elfv2 } } } */ >>> +/* Linux BE 64 bit or AIX 64 bit */ >>> +/* { dg-final { scan-assembler-times {\mstd 2,40\(1\)} 2 { target { lp64 && { ! powerpc_elfv2 } } } } } */ >>> +/* AIX 32 bit */ >>> +/* { dg-final { scan-assembler-times {\mstw 2,20\(1\)} 2 { target { ilp32 && powerpc*-*-aix* } } } } */ >>> + >>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-4.c b/gcc/testsuite/gcc.target/powerpc/pr108184-4.c >>> new file mode 100644 >>> index 00000000000..2a70a603047 >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-4.c >>> @@ -0,0 +1,24 @@ >>> +/* { dg-do compile { target { powerpc*-*-aix* powerpc*-*-linux* } } } */ >>> +/* { dg-require-effective-target fpic } */ >>> +/* { dg-options "-fpic -mno-pcrel -O2 -msave-toc-indirect" } */ >>> + >>> +/* Verify the explicit -msave-toc-indirect always >>> + takes effect, so there is only one toc saving. */ >>> + >>> +void foo (void) __attribute__((__longcall__)); >>> +int baz (void) __attribute__((__longcall__)); >>> + >>> +__attribute__ ((cold)) int >>> +bar (void) >>> +{ >>> + foo (); >>> + return baz () + 1; >>> +} >>> + >>> +/* Linux LE */ >>> +/* { dg-final { scan-assembler-times {\mstd 2,24\(1\)} 1 { target powerpc_elfv2 } } } */ >>> +/* Linux BE 64 bit or AIX 64 bit */ >>> +/* { dg-final { scan-assembler-times {\mstd 2,40\(1\)} 1 { target { lp64 && { ! powerpc_elfv2 } } } } } */ >>> +/* AIX 32 bit */ >>> +/* { dg-final { scan-assembler-times {\mstw 2,20\(1\)} 1 { target { ilp32 && powerpc*-*-aix* } } } } */ >>> + >>> -- >>> 2.37.0
Hi, Gentle ping this: https://gcc.gnu.org/pipermail/gcc-patches/2023-January/609993.html BR, Kewen >>> on 2023/1/16 17:08, Kewen.Lin via Gcc-patches wrote: >>>> Hi, >>>> >>>> As Honza pointed out in [1], the current uses of function >>>> optimize_function_for_speed_p in rs6000_option_override_internal >>>> are too early, since the query results from the functions >>>> optimize_function_for_{speed,size}_p could be changed later due >>>> to profile feedback and some function attributes handlings etc. >>>> >>>> This patch is to move optimize_function_for_speed_p to all the >>>> use places of the corresponding flags, which follows the existing >>>> practices. Maybe we can cache it somewhere at an appropriate >>>> timing, but that's another thing. >>>> >>>> Comparing with v1[2], this version added one test case for >>>> SAVE_TOC_INDIRECT as Segher questioned and suggested, and it >>>> also considered the possibility of explicit option (see test >>>> cases pr108184-2.c and pr108184-4.c). I believe that excepting >>>> for the intentional change on optimize_function_for_{speed, >>>> size}_p, there is no other function change. >>>> >>>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2022-November/607527.html >>>> [2] https://gcc.gnu.org/pipermail/gcc-patches/2023-January/609379.html >>>> >>>> Bootstrapped and regtested on powerpc64-linux-gnu P8, >>>> powerpc64le-linux-gnu P{9,10} and powerpc-ibm-aix. >>>> >>>> Is it ok for trunk? >>>> >>>> BR, >>>> Kewen >>>> ----- >>>> gcc/ChangeLog: >>>> >>>> * config/rs6000/rs6000.cc (rs6000_option_override_internal): Remove >>>> all optimize_function_for_speed_p uses. >>>> (fusion_gpr_load_p): Call optimize_function_for_speed_p along >>>> with TARGET_P8_FUSION_SIGN. >>>> (expand_fusion_gpr_load): Likewise. >>>> (rs6000_call_aix): Call optimize_function_for_speed_p along with >>>> TARGET_SAVE_TOC_INDIRECT. >>>> * config/rs6000/predicates.md (fusion_gpr_mem_load): Call >>>> optimize_function_for_speed_p along with TARGET_P8_FUSION_SIGN. >>>> >>>> gcc/testsuite/ChangeLog: >>>> >>>> * gcc.target/powerpc/pr108184-1.c: New test. >>>> * gcc.target/powerpc/pr108184-2.c: New test. >>>> * gcc.target/powerpc/pr108184-3.c: New test. >>>> * gcc.target/powerpc/pr108184-4.c: New test. >>>> --- >>>> gcc/config/rs6000/predicates.md | 5 +++- >>>> gcc/config/rs6000/rs6000.cc | 19 +++++++++----- >>>> gcc/testsuite/gcc.target/powerpc/pr108184-1.c | 16 ++++++++++++ >>>> gcc/testsuite/gcc.target/powerpc/pr108184-2.c | 15 +++++++++++ >>>> gcc/testsuite/gcc.target/powerpc/pr108184-3.c | 25 +++++++++++++++++++ >>>> gcc/testsuite/gcc.target/powerpc/pr108184-4.c | 24 ++++++++++++++++++ >>>> 6 files changed, 97 insertions(+), 7 deletions(-) >>>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-1.c >>>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-2.c >>>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-3.c >>>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-4.c >>>> >>>> diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md >>>> index a1764018545..9f84468db84 100644 >>>> --- a/gcc/config/rs6000/predicates.md >>>> +++ b/gcc/config/rs6000/predicates.md >>>> @@ -1878,7 +1878,10 @@ (define_predicate "fusion_gpr_mem_load" >>>> >>>> /* Handle sign/zero extend. */ >>>> if (GET_CODE (op) == ZERO_EXTEND >>>> - || (TARGET_P8_FUSION_SIGN && GET_CODE (op) == SIGN_EXTEND)) >>>> + || (TARGET_P8_FUSION_SIGN >>>> + && GET_CODE (op) == SIGN_EXTEND >>>> + && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN >>>> + || optimize_function_for_speed_p (cfun)))) >>>> { >>>> op = XEXP (op, 0); >>>> mode = GET_MODE (op); >>>> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc >>>> index 6ac3adcec6b..f47d21980a9 100644 >>>> --- a/gcc/config/rs6000/rs6000.cc >>>> +++ b/gcc/config/rs6000/rs6000.cc >>>> @@ -3997,8 +3997,7 @@ rs6000_option_override_internal (bool global_init_p) >>>> /* If we can shrink-wrap the TOC register save separately, then use >>>> -msave-toc-indirect unless explicitly disabled. */ >>>> if ((rs6000_isa_flags_explicit & OPTION_MASK_SAVE_TOC_INDIRECT) == 0 >>>> - && flag_shrink_wrap_separate >>>> - && optimize_function_for_speed_p (cfun)) >>>> + && flag_shrink_wrap_separate) >>>> rs6000_isa_flags |= OPTION_MASK_SAVE_TOC_INDIRECT; >>>> >>>> /* Enable power8 fusion if we are tuning for power8, even if we aren't >>>> @@ -4032,7 +4031,6 @@ rs6000_option_override_internal (bool global_init_p) >>>> zero extending load, and an explicit sign extension. */ >>>> if (TARGET_P8_FUSION >>>> && !(rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN) >>>> - && optimize_function_for_speed_p (cfun) >>>> && optimize >= 3) >>>> rs6000_isa_flags |= OPTION_MASK_P8_FUSION_SIGN; >>>> >>>> @@ -25690,7 +25688,10 @@ rs6000_call_aix (rtx value, rtx func_desc, rtx tlsarg, rtx cookie) >>>> >>>> /* Can we optimize saving the TOC in the prologue or >>>> do we need to do it at every call? */ >>>> - if (TARGET_SAVE_TOC_INDIRECT && !cfun->calls_alloca) >>>> + if (TARGET_SAVE_TOC_INDIRECT >>>> + && !cfun->calls_alloca >>>> + && (rs6000_isa_flags_explicit & OPTION_MASK_SAVE_TOC_INDIRECT >>>> + || optimize_function_for_speed_p (cfun))) >>>> cfun->machine->save_toc_in_prologue = true; >>>> else >>>> { >>>> @@ -27557,7 +27558,10 @@ fusion_gpr_load_p (rtx addis_reg, /* register set via addis. */ >>>> >>>> /* Allow sign/zero extension. */ >>>> if (GET_CODE (mem) == ZERO_EXTEND >>>> - || (GET_CODE (mem) == SIGN_EXTEND && TARGET_P8_FUSION_SIGN)) >>>> + || (GET_CODE (mem) == SIGN_EXTEND >>>> + && TARGET_P8_FUSION_SIGN >>>> + && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN >>>> + || optimize_function_for_speed_p (cfun)))) >>>> mem = XEXP (mem, 0); >>>> >>>> if (!MEM_P (mem)) >>>> @@ -27621,7 +27625,10 @@ expand_fusion_gpr_load (rtx *operands) >>>> enum rtx_code extend = UNKNOWN; >>>> >>>> if (GET_CODE (orig_mem) == ZERO_EXTEND >>>> - || (TARGET_P8_FUSION_SIGN && GET_CODE (orig_mem) == SIGN_EXTEND)) >>>> + || (TARGET_P8_FUSION_SIGN >>>> + && GET_CODE (orig_mem) == SIGN_EXTEND >>>> + && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN >>>> + || optimize_function_for_speed_p (cfun)))) >>>> { >>>> extend = GET_CODE (orig_mem); >>>> orig_mem = XEXP (orig_mem, 0); >>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-1.c b/gcc/testsuite/gcc.target/powerpc/pr108184-1.c >>>> new file mode 100644 >>>> index 00000000000..eae3bb9b7c4 >>>> --- /dev/null >>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-1.c >>>> @@ -0,0 +1,16 @@ >>>> +/* Only possible to fuse sign extended loads with the addis when >>>> + optimize >= 3 and Power8 fusion takes effects. -mno-prefixed >>>> + ensures test point isn't broken (due to prefixed plha). */ >>>> +/* { dg-options "-mdejagnu-tune=power8 -O3 -mno-prefixed" } */ >>>> + >>>> +/* Verify it doesn't optimize this function for speed as it's cold, >>>> + by checking it doesn't try to fuse sign extended loads with addis, >>>> + which results in a zero extended load and a sign extension. */ >>>> + >>>> +__attribute__ ((cold)) int >>>> +fusion_short (short *p) >>>> +{ >>>> + return p[0x12345]; >>>> +} >>>> + >>>> +/* { dg-final { scan-assembler-not {\mextsh\M} } } */ >>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-2.c b/gcc/testsuite/gcc.target/powerpc/pr108184-2.c >>>> new file mode 100644 >>>> index 00000000000..9f703f7fc6b >>>> --- /dev/null >>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-2.c >>>> @@ -0,0 +1,15 @@ >>>> +/* -mno-prefixed ensures test point isn't broken (due to prefixed plha). */ >>>> +/* { dg-options "-mdejagnu-tune=power8 -O2 -mpower8-fusion-sign -mno-prefixed" } */ >>>> + >>>> +/* Verify the explicit option -mpower8-fusion-sign can still fuse >>>> + sign extended loads with addis, which results in a zero extended >>>> + load and a sign extension. It means the cold attribute which >>>> + indicates to optimize for size doesn't affect. */ >>>> + >>>> +__attribute__ ((cold)) int >>>> +fusion_short (short *p) >>>> +{ >>>> + return p[0x12345]; >>>> +} >>>> + >>>> +/* { dg-final { scan-assembler-times {\mextsh\M} 1 } } */ >>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-3.c b/gcc/testsuite/gcc.target/powerpc/pr108184-3.c >>>> new file mode 100644 >>>> index 00000000000..ceaa96e4421 >>>> --- /dev/null >>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-3.c >>>> @@ -0,0 +1,25 @@ >>>> +/* { dg-do compile { target { powerpc*-*-aix* powerpc*-*-linux* } } } */ >>>> +/* { dg-require-effective-target fpic } */ >>>> +/* { dg-options "-fpic -mno-pcrel -O2" } */ >>>> + >>>> +/* Verify it doesn't imply -msave-toc-indirect, so >>>> + it doesn't take effect and we have two separated >>>> + toc savings for these two long calls. */ >>>> + >>>> +void foo (void) __attribute__((__longcall__)); >>>> +int baz (void) __attribute__((__longcall__)); >>>> + >>>> +__attribute__ ((cold)) int >>>> +bar (void) >>>> +{ >>>> + foo (); >>>> + return baz () + 1; >>>> +} >>>> + >>>> +/* Linux LE */ >>>> +/* { dg-final { scan-assembler-times {\mstd 2,24\(1\)} 2 { target powerpc_elfv2 } } } */ >>>> +/* Linux BE 64 bit or AIX 64 bit */ >>>> +/* { dg-final { scan-assembler-times {\mstd 2,40\(1\)} 2 { target { lp64 && { ! powerpc_elfv2 } } } } } */ >>>> +/* AIX 32 bit */ >>>> +/* { dg-final { scan-assembler-times {\mstw 2,20\(1\)} 2 { target { ilp32 && powerpc*-*-aix* } } } } */ >>>> + >>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-4.c b/gcc/testsuite/gcc.target/powerpc/pr108184-4.c >>>> new file mode 100644 >>>> index 00000000000..2a70a603047 >>>> --- /dev/null >>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-4.c >>>> @@ -0,0 +1,24 @@ >>>> +/* { dg-do compile { target { powerpc*-*-aix* powerpc*-*-linux* } } } */ >>>> +/* { dg-require-effective-target fpic } */ >>>> +/* { dg-options "-fpic -mno-pcrel -O2 -msave-toc-indirect" } */ >>>> + >>>> +/* Verify the explicit -msave-toc-indirect always >>>> + takes effect, so there is only one toc saving. */ >>>> + >>>> +void foo (void) __attribute__((__longcall__)); >>>> +int baz (void) __attribute__((__longcall__)); >>>> + >>>> +__attribute__ ((cold)) int >>>> +bar (void) >>>> +{ >>>> + foo (); >>>> + return baz () + 1; >>>> +} >>>> + >>>> +/* Linux LE */ >>>> +/* { dg-final { scan-assembler-times {\mstd 2,24\(1\)} 1 { target powerpc_elfv2 } } } */ >>>> +/* Linux BE 64 bit or AIX 64 bit */ >>>> +/* { dg-final { scan-assembler-times {\mstd 2,40\(1\)} 1 { target { lp64 && { ! powerpc_elfv2 } } } } } */ >>>> +/* AIX 32 bit */ >>>> +/* { dg-final { scan-assembler-times {\mstw 2,20\(1\)} 1 { target { ilp32 && powerpc*-*-aix* } } } } */ >>>> + >>>> -- >>>> 2.37.0
Hi, Gentle ping this: https://gcc.gnu.org/pipermail/gcc-patches/2023-January/609993.html BR, Kewen > >>>> on 2023/1/16 17:08, Kewen.Lin via Gcc-patches wrote: >>>>> Hi, >>>>> >>>>> As Honza pointed out in [1], the current uses of function >>>>> optimize_function_for_speed_p in rs6000_option_override_internal >>>>> are too early, since the query results from the functions >>>>> optimize_function_for_{speed,size}_p could be changed later due >>>>> to profile feedback and some function attributes handlings etc. >>>>> >>>>> This patch is to move optimize_function_for_speed_p to all the >>>>> use places of the corresponding flags, which follows the existing >>>>> practices. Maybe we can cache it somewhere at an appropriate >>>>> timing, but that's another thing. >>>>> >>>>> Comparing with v1[2], this version added one test case for >>>>> SAVE_TOC_INDIRECT as Segher questioned and suggested, and it >>>>> also considered the possibility of explicit option (see test >>>>> cases pr108184-2.c and pr108184-4.c). I believe that excepting >>>>> for the intentional change on optimize_function_for_{speed, >>>>> size}_p, there is no other function change. >>>>> >>>>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2022-November/607527.html >>>>> [2] https://gcc.gnu.org/pipermail/gcc-patches/2023-January/609379.html >>>>> >>>>> Bootstrapped and regtested on powerpc64-linux-gnu P8, >>>>> powerpc64le-linux-gnu P{9,10} and powerpc-ibm-aix. >>>>> >>>>> Is it ok for trunk? >>>>> >>>>> BR, >>>>> Kewen >>>>> ----- >>>>> gcc/ChangeLog: >>>>> >>>>> * config/rs6000/rs6000.cc (rs6000_option_override_internal): Remove >>>>> all optimize_function_for_speed_p uses. >>>>> (fusion_gpr_load_p): Call optimize_function_for_speed_p along >>>>> with TARGET_P8_FUSION_SIGN. >>>>> (expand_fusion_gpr_load): Likewise. >>>>> (rs6000_call_aix): Call optimize_function_for_speed_p along with >>>>> TARGET_SAVE_TOC_INDIRECT. >>>>> * config/rs6000/predicates.md (fusion_gpr_mem_load): Call >>>>> optimize_function_for_speed_p along with TARGET_P8_FUSION_SIGN. >>>>> >>>>> gcc/testsuite/ChangeLog: >>>>> >>>>> * gcc.target/powerpc/pr108184-1.c: New test. >>>>> * gcc.target/powerpc/pr108184-2.c: New test. >>>>> * gcc.target/powerpc/pr108184-3.c: New test. >>>>> * gcc.target/powerpc/pr108184-4.c: New test. >>>>> --- >>>>> gcc/config/rs6000/predicates.md | 5 +++- >>>>> gcc/config/rs6000/rs6000.cc | 19 +++++++++----- >>>>> gcc/testsuite/gcc.target/powerpc/pr108184-1.c | 16 ++++++++++++ >>>>> gcc/testsuite/gcc.target/powerpc/pr108184-2.c | 15 +++++++++++ >>>>> gcc/testsuite/gcc.target/powerpc/pr108184-3.c | 25 +++++++++++++++++++ >>>>> gcc/testsuite/gcc.target/powerpc/pr108184-4.c | 24 ++++++++++++++++++ >>>>> 6 files changed, 97 insertions(+), 7 deletions(-) >>>>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-1.c >>>>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-2.c >>>>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-3.c >>>>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-4.c >>>>> >>>>> diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md >>>>> index a1764018545..9f84468db84 100644 >>>>> --- a/gcc/config/rs6000/predicates.md >>>>> +++ b/gcc/config/rs6000/predicates.md >>>>> @@ -1878,7 +1878,10 @@ (define_predicate "fusion_gpr_mem_load" >>>>> >>>>> /* Handle sign/zero extend. */ >>>>> if (GET_CODE (op) == ZERO_EXTEND >>>>> - || (TARGET_P8_FUSION_SIGN && GET_CODE (op) == SIGN_EXTEND)) >>>>> + || (TARGET_P8_FUSION_SIGN >>>>> + && GET_CODE (op) == SIGN_EXTEND >>>>> + && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN >>>>> + || optimize_function_for_speed_p (cfun)))) >>>>> { >>>>> op = XEXP (op, 0); >>>>> mode = GET_MODE (op); >>>>> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc >>>>> index 6ac3adcec6b..f47d21980a9 100644 >>>>> --- a/gcc/config/rs6000/rs6000.cc >>>>> +++ b/gcc/config/rs6000/rs6000.cc >>>>> @@ -3997,8 +3997,7 @@ rs6000_option_override_internal (bool global_init_p) >>>>> /* If we can shrink-wrap the TOC register save separately, then use >>>>> -msave-toc-indirect unless explicitly disabled. */ >>>>> if ((rs6000_isa_flags_explicit & OPTION_MASK_SAVE_TOC_INDIRECT) == 0 >>>>> - && flag_shrink_wrap_separate >>>>> - && optimize_function_for_speed_p (cfun)) >>>>> + && flag_shrink_wrap_separate) >>>>> rs6000_isa_flags |= OPTION_MASK_SAVE_TOC_INDIRECT; >>>>> >>>>> /* Enable power8 fusion if we are tuning for power8, even if we aren't >>>>> @@ -4032,7 +4031,6 @@ rs6000_option_override_internal (bool global_init_p) >>>>> zero extending load, and an explicit sign extension. */ >>>>> if (TARGET_P8_FUSION >>>>> && !(rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN) >>>>> - && optimize_function_for_speed_p (cfun) >>>>> && optimize >= 3) >>>>> rs6000_isa_flags |= OPTION_MASK_P8_FUSION_SIGN; >>>>> >>>>> @@ -25690,7 +25688,10 @@ rs6000_call_aix (rtx value, rtx func_desc, rtx tlsarg, rtx cookie) >>>>> >>>>> /* Can we optimize saving the TOC in the prologue or >>>>> do we need to do it at every call? */ >>>>> - if (TARGET_SAVE_TOC_INDIRECT && !cfun->calls_alloca) >>>>> + if (TARGET_SAVE_TOC_INDIRECT >>>>> + && !cfun->calls_alloca >>>>> + && (rs6000_isa_flags_explicit & OPTION_MASK_SAVE_TOC_INDIRECT >>>>> + || optimize_function_for_speed_p (cfun))) >>>>> cfun->machine->save_toc_in_prologue = true; >>>>> else >>>>> { >>>>> @@ -27557,7 +27558,10 @@ fusion_gpr_load_p (rtx addis_reg, /* register set via addis. */ >>>>> >>>>> /* Allow sign/zero extension. */ >>>>> if (GET_CODE (mem) == ZERO_EXTEND >>>>> - || (GET_CODE (mem) == SIGN_EXTEND && TARGET_P8_FUSION_SIGN)) >>>>> + || (GET_CODE (mem) == SIGN_EXTEND >>>>> + && TARGET_P8_FUSION_SIGN >>>>> + && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN >>>>> + || optimize_function_for_speed_p (cfun)))) >>>>> mem = XEXP (mem, 0); >>>>> >>>>> if (!MEM_P (mem)) >>>>> @@ -27621,7 +27625,10 @@ expand_fusion_gpr_load (rtx *operands) >>>>> enum rtx_code extend = UNKNOWN; >>>>> >>>>> if (GET_CODE (orig_mem) == ZERO_EXTEND >>>>> - || (TARGET_P8_FUSION_SIGN && GET_CODE (orig_mem) == SIGN_EXTEND)) >>>>> + || (TARGET_P8_FUSION_SIGN >>>>> + && GET_CODE (orig_mem) == SIGN_EXTEND >>>>> + && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN >>>>> + || optimize_function_for_speed_p (cfun)))) >>>>> { >>>>> extend = GET_CODE (orig_mem); >>>>> orig_mem = XEXP (orig_mem, 0); >>>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-1.c b/gcc/testsuite/gcc.target/powerpc/pr108184-1.c >>>>> new file mode 100644 >>>>> index 00000000000..eae3bb9b7c4 >>>>> --- /dev/null >>>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-1.c >>>>> @@ -0,0 +1,16 @@ >>>>> +/* Only possible to fuse sign extended loads with the addis when >>>>> + optimize >= 3 and Power8 fusion takes effects. -mno-prefixed >>>>> + ensures test point isn't broken (due to prefixed plha). */ >>>>> +/* { dg-options "-mdejagnu-tune=power8 -O3 -mno-prefixed" } */ >>>>> + >>>>> +/* Verify it doesn't optimize this function for speed as it's cold, >>>>> + by checking it doesn't try to fuse sign extended loads with addis, >>>>> + which results in a zero extended load and a sign extension. */ >>>>> + >>>>> +__attribute__ ((cold)) int >>>>> +fusion_short (short *p) >>>>> +{ >>>>> + return p[0x12345]; >>>>> +} >>>>> + >>>>> +/* { dg-final { scan-assembler-not {\mextsh\M} } } */ >>>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-2.c b/gcc/testsuite/gcc.target/powerpc/pr108184-2.c >>>>> new file mode 100644 >>>>> index 00000000000..9f703f7fc6b >>>>> --- /dev/null >>>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-2.c >>>>> @@ -0,0 +1,15 @@ >>>>> +/* -mno-prefixed ensures test point isn't broken (due to prefixed plha). */ >>>>> +/* { dg-options "-mdejagnu-tune=power8 -O2 -mpower8-fusion-sign -mno-prefixed" } */ >>>>> + >>>>> +/* Verify the explicit option -mpower8-fusion-sign can still fuse >>>>> + sign extended loads with addis, which results in a zero extended >>>>> + load and a sign extension. It means the cold attribute which >>>>> + indicates to optimize for size doesn't affect. */ >>>>> + >>>>> +__attribute__ ((cold)) int >>>>> +fusion_short (short *p) >>>>> +{ >>>>> + return p[0x12345]; >>>>> +} >>>>> + >>>>> +/* { dg-final { scan-assembler-times {\mextsh\M} 1 } } */ >>>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-3.c b/gcc/testsuite/gcc.target/powerpc/pr108184-3.c >>>>> new file mode 100644 >>>>> index 00000000000..ceaa96e4421 >>>>> --- /dev/null >>>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-3.c >>>>> @@ -0,0 +1,25 @@ >>>>> +/* { dg-do compile { target { powerpc*-*-aix* powerpc*-*-linux* } } } */ >>>>> +/* { dg-require-effective-target fpic } */ >>>>> +/* { dg-options "-fpic -mno-pcrel -O2" } */ >>>>> + >>>>> +/* Verify it doesn't imply -msave-toc-indirect, so >>>>> + it doesn't take effect and we have two separated >>>>> + toc savings for these two long calls. */ >>>>> + >>>>> +void foo (void) __attribute__((__longcall__)); >>>>> +int baz (void) __attribute__((__longcall__)); >>>>> + >>>>> +__attribute__ ((cold)) int >>>>> +bar (void) >>>>> +{ >>>>> + foo (); >>>>> + return baz () + 1; >>>>> +} >>>>> + >>>>> +/* Linux LE */ >>>>> +/* { dg-final { scan-assembler-times {\mstd 2,24\(1\)} 2 { target powerpc_elfv2 } } } */ >>>>> +/* Linux BE 64 bit or AIX 64 bit */ >>>>> +/* { dg-final { scan-assembler-times {\mstd 2,40\(1\)} 2 { target { lp64 && { ! powerpc_elfv2 } } } } } */ >>>>> +/* AIX 32 bit */ >>>>> +/* { dg-final { scan-assembler-times {\mstw 2,20\(1\)} 2 { target { ilp32 && powerpc*-*-aix* } } } } */ >>>>> + >>>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-4.c b/gcc/testsuite/gcc.target/powerpc/pr108184-4.c >>>>> new file mode 100644 >>>>> index 00000000000..2a70a603047 >>>>> --- /dev/null >>>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-4.c >>>>> @@ -0,0 +1,24 @@ >>>>> +/* { dg-do compile { target { powerpc*-*-aix* powerpc*-*-linux* } } } */ >>>>> +/* { dg-require-effective-target fpic } */ >>>>> +/* { dg-options "-fpic -mno-pcrel -O2 -msave-toc-indirect" } */ >>>>> + >>>>> +/* Verify the explicit -msave-toc-indirect always >>>>> + takes effect, so there is only one toc saving. */ >>>>> + >>>>> +void foo (void) __attribute__((__longcall__)); >>>>> +int baz (void) __attribute__((__longcall__)); >>>>> + >>>>> +__attribute__ ((cold)) int >>>>> +bar (void) >>>>> +{ >>>>> + foo (); >>>>> + return baz () + 1; >>>>> +} >>>>> + >>>>> +/* Linux LE */ >>>>> +/* { dg-final { scan-assembler-times {\mstd 2,24\(1\)} 1 { target powerpc_elfv2 } } } */ >>>>> +/* Linux BE 64 bit or AIX 64 bit */ >>>>> +/* { dg-final { scan-assembler-times {\mstd 2,40\(1\)} 1 { target { lp64 && { ! powerpc_elfv2 } } } } } */ >>>>> +/* AIX 32 bit */ >>>>> +/* { dg-final { scan-assembler-times {\mstw 2,20\(1\)} 1 { target { ilp32 && powerpc*-*-aix* } } } } */ >>>>> + >>>>> -- >>>>> 2.37.0
Hi, Gentle ping this: https://gcc.gnu.org/pipermail/gcc-patches/2023-January/609993.html BR, Kewen >>>>> on 2023/1/16 17:08, Kewen.Lin via Gcc-patches wrote: >>>>>> Hi, >>>>>> >>>>>> As Honza pointed out in [1], the current uses of function >>>>>> optimize_function_for_speed_p in rs6000_option_override_internal >>>>>> are too early, since the query results from the functions >>>>>> optimize_function_for_{speed,size}_p could be changed later due >>>>>> to profile feedback and some function attributes handlings etc. >>>>>> >>>>>> This patch is to move optimize_function_for_speed_p to all the >>>>>> use places of the corresponding flags, which follows the existing >>>>>> practices. Maybe we can cache it somewhere at an appropriate >>>>>> timing, but that's another thing. >>>>>> >>>>>> Comparing with v1[2], this version added one test case for >>>>>> SAVE_TOC_INDIRECT as Segher questioned and suggested, and it >>>>>> also considered the possibility of explicit option (see test >>>>>> cases pr108184-2.c and pr108184-4.c). I believe that excepting >>>>>> for the intentional change on optimize_function_for_{speed, >>>>>> size}_p, there is no other function change. >>>>>> >>>>>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2022-November/607527.html >>>>>> [2] https://gcc.gnu.org/pipermail/gcc-patches/2023-January/609379.html >>>>>> >>>>>> Bootstrapped and regtested on powerpc64-linux-gnu P8, >>>>>> powerpc64le-linux-gnu P{9,10} and powerpc-ibm-aix. >>>>>> >>>>>> Is it ok for trunk? >>>>>> >>>>>> BR, >>>>>> Kewen >>>>>> ----- >>>>>> gcc/ChangeLog: >>>>>> >>>>>> * config/rs6000/rs6000.cc (rs6000_option_override_internal): Remove >>>>>> all optimize_function_for_speed_p uses. >>>>>> (fusion_gpr_load_p): Call optimize_function_for_speed_p along >>>>>> with TARGET_P8_FUSION_SIGN. >>>>>> (expand_fusion_gpr_load): Likewise. >>>>>> (rs6000_call_aix): Call optimize_function_for_speed_p along with >>>>>> TARGET_SAVE_TOC_INDIRECT. >>>>>> * config/rs6000/predicates.md (fusion_gpr_mem_load): Call >>>>>> optimize_function_for_speed_p along with TARGET_P8_FUSION_SIGN. >>>>>> >>>>>> gcc/testsuite/ChangeLog: >>>>>> >>>>>> * gcc.target/powerpc/pr108184-1.c: New test. >>>>>> * gcc.target/powerpc/pr108184-2.c: New test. >>>>>> * gcc.target/powerpc/pr108184-3.c: New test. >>>>>> * gcc.target/powerpc/pr108184-4.c: New test. >>>>>> --- >>>>>> gcc/config/rs6000/predicates.md | 5 +++- >>>>>> gcc/config/rs6000/rs6000.cc | 19 +++++++++----- >>>>>> gcc/testsuite/gcc.target/powerpc/pr108184-1.c | 16 ++++++++++++ >>>>>> gcc/testsuite/gcc.target/powerpc/pr108184-2.c | 15 +++++++++++ >>>>>> gcc/testsuite/gcc.target/powerpc/pr108184-3.c | 25 +++++++++++++++++++ >>>>>> gcc/testsuite/gcc.target/powerpc/pr108184-4.c | 24 ++++++++++++++++++ >>>>>> 6 files changed, 97 insertions(+), 7 deletions(-) >>>>>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-1.c >>>>>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-2.c >>>>>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-3.c >>>>>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-4.c >>>>>> >>>>>> diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md >>>>>> index a1764018545..9f84468db84 100644 >>>>>> --- a/gcc/config/rs6000/predicates.md >>>>>> +++ b/gcc/config/rs6000/predicates.md >>>>>> @@ -1878,7 +1878,10 @@ (define_predicate "fusion_gpr_mem_load" >>>>>> >>>>>> /* Handle sign/zero extend. */ >>>>>> if (GET_CODE (op) == ZERO_EXTEND >>>>>> - || (TARGET_P8_FUSION_SIGN && GET_CODE (op) == SIGN_EXTEND)) >>>>>> + || (TARGET_P8_FUSION_SIGN >>>>>> + && GET_CODE (op) == SIGN_EXTEND >>>>>> + && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN >>>>>> + || optimize_function_for_speed_p (cfun)))) >>>>>> { >>>>>> op = XEXP (op, 0); >>>>>> mode = GET_MODE (op); >>>>>> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc >>>>>> index 6ac3adcec6b..f47d21980a9 100644 >>>>>> --- a/gcc/config/rs6000/rs6000.cc >>>>>> +++ b/gcc/config/rs6000/rs6000.cc >>>>>> @@ -3997,8 +3997,7 @@ rs6000_option_override_internal (bool global_init_p) >>>>>> /* If we can shrink-wrap the TOC register save separately, then use >>>>>> -msave-toc-indirect unless explicitly disabled. */ >>>>>> if ((rs6000_isa_flags_explicit & OPTION_MASK_SAVE_TOC_INDIRECT) == 0 >>>>>> - && flag_shrink_wrap_separate >>>>>> - && optimize_function_for_speed_p (cfun)) >>>>>> + && flag_shrink_wrap_separate) >>>>>> rs6000_isa_flags |= OPTION_MASK_SAVE_TOC_INDIRECT; >>>>>> >>>>>> /* Enable power8 fusion if we are tuning for power8, even if we aren't >>>>>> @@ -4032,7 +4031,6 @@ rs6000_option_override_internal (bool global_init_p) >>>>>> zero extending load, and an explicit sign extension. */ >>>>>> if (TARGET_P8_FUSION >>>>>> && !(rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN) >>>>>> - && optimize_function_for_speed_p (cfun) >>>>>> && optimize >= 3) >>>>>> rs6000_isa_flags |= OPTION_MASK_P8_FUSION_SIGN; >>>>>> >>>>>> @@ -25690,7 +25688,10 @@ rs6000_call_aix (rtx value, rtx func_desc, rtx tlsarg, rtx cookie) >>>>>> >>>>>> /* Can we optimize saving the TOC in the prologue or >>>>>> do we need to do it at every call? */ >>>>>> - if (TARGET_SAVE_TOC_INDIRECT && !cfun->calls_alloca) >>>>>> + if (TARGET_SAVE_TOC_INDIRECT >>>>>> + && !cfun->calls_alloca >>>>>> + && (rs6000_isa_flags_explicit & OPTION_MASK_SAVE_TOC_INDIRECT >>>>>> + || optimize_function_for_speed_p (cfun))) >>>>>> cfun->machine->save_toc_in_prologue = true; >>>>>> else >>>>>> { >>>>>> @@ -27557,7 +27558,10 @@ fusion_gpr_load_p (rtx addis_reg, /* register set via addis. */ >>>>>> >>>>>> /* Allow sign/zero extension. */ >>>>>> if (GET_CODE (mem) == ZERO_EXTEND >>>>>> - || (GET_CODE (mem) == SIGN_EXTEND && TARGET_P8_FUSION_SIGN)) >>>>>> + || (GET_CODE (mem) == SIGN_EXTEND >>>>>> + && TARGET_P8_FUSION_SIGN >>>>>> + && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN >>>>>> + || optimize_function_for_speed_p (cfun)))) >>>>>> mem = XEXP (mem, 0); >>>>>> >>>>>> if (!MEM_P (mem)) >>>>>> @@ -27621,7 +27625,10 @@ expand_fusion_gpr_load (rtx *operands) >>>>>> enum rtx_code extend = UNKNOWN; >>>>>> >>>>>> if (GET_CODE (orig_mem) == ZERO_EXTEND >>>>>> - || (TARGET_P8_FUSION_SIGN && GET_CODE (orig_mem) == SIGN_EXTEND)) >>>>>> + || (TARGET_P8_FUSION_SIGN >>>>>> + && GET_CODE (orig_mem) == SIGN_EXTEND >>>>>> + && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN >>>>>> + || optimize_function_for_speed_p (cfun)))) >>>>>> { >>>>>> extend = GET_CODE (orig_mem); >>>>>> orig_mem = XEXP (orig_mem, 0); >>>>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-1.c b/gcc/testsuite/gcc.target/powerpc/pr108184-1.c >>>>>> new file mode 100644 >>>>>> index 00000000000..eae3bb9b7c4 >>>>>> --- /dev/null >>>>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-1.c >>>>>> @@ -0,0 +1,16 @@ >>>>>> +/* Only possible to fuse sign extended loads with the addis when >>>>>> + optimize >= 3 and Power8 fusion takes effects. -mno-prefixed >>>>>> + ensures test point isn't broken (due to prefixed plha). */ >>>>>> +/* { dg-options "-mdejagnu-tune=power8 -O3 -mno-prefixed" } */ >>>>>> + >>>>>> +/* Verify it doesn't optimize this function for speed as it's cold, >>>>>> + by checking it doesn't try to fuse sign extended loads with addis, >>>>>> + which results in a zero extended load and a sign extension. */ >>>>>> + >>>>>> +__attribute__ ((cold)) int >>>>>> +fusion_short (short *p) >>>>>> +{ >>>>>> + return p[0x12345]; >>>>>> +} >>>>>> + >>>>>> +/* { dg-final { scan-assembler-not {\mextsh\M} } } */ >>>>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-2.c b/gcc/testsuite/gcc.target/powerpc/pr108184-2.c >>>>>> new file mode 100644 >>>>>> index 00000000000..9f703f7fc6b >>>>>> --- /dev/null >>>>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-2.c >>>>>> @@ -0,0 +1,15 @@ >>>>>> +/* -mno-prefixed ensures test point isn't broken (due to prefixed plha). */ >>>>>> +/* { dg-options "-mdejagnu-tune=power8 -O2 -mpower8-fusion-sign -mno-prefixed" } */ >>>>>> + >>>>>> +/* Verify the explicit option -mpower8-fusion-sign can still fuse >>>>>> + sign extended loads with addis, which results in a zero extended >>>>>> + load and a sign extension. It means the cold attribute which >>>>>> + indicates to optimize for size doesn't affect. */ >>>>>> + >>>>>> +__attribute__ ((cold)) int >>>>>> +fusion_short (short *p) >>>>>> +{ >>>>>> + return p[0x12345]; >>>>>> +} >>>>>> + >>>>>> +/* { dg-final { scan-assembler-times {\mextsh\M} 1 } } */ >>>>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-3.c b/gcc/testsuite/gcc.target/powerpc/pr108184-3.c >>>>>> new file mode 100644 >>>>>> index 00000000000..ceaa96e4421 >>>>>> --- /dev/null >>>>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-3.c >>>>>> @@ -0,0 +1,25 @@ >>>>>> +/* { dg-do compile { target { powerpc*-*-aix* powerpc*-*-linux* } } } */ >>>>>> +/* { dg-require-effective-target fpic } */ >>>>>> +/* { dg-options "-fpic -mno-pcrel -O2" } */ >>>>>> + >>>>>> +/* Verify it doesn't imply -msave-toc-indirect, so >>>>>> + it doesn't take effect and we have two separated >>>>>> + toc savings for these two long calls. */ >>>>>> + >>>>>> +void foo (void) __attribute__((__longcall__)); >>>>>> +int baz (void) __attribute__((__longcall__)); >>>>>> + >>>>>> +__attribute__ ((cold)) int >>>>>> +bar (void) >>>>>> +{ >>>>>> + foo (); >>>>>> + return baz () + 1; >>>>>> +} >>>>>> + >>>>>> +/* Linux LE */ >>>>>> +/* { dg-final { scan-assembler-times {\mstd 2,24\(1\)} 2 { target powerpc_elfv2 } } } */ >>>>>> +/* Linux BE 64 bit or AIX 64 bit */ >>>>>> +/* { dg-final { scan-assembler-times {\mstd 2,40\(1\)} 2 { target { lp64 && { ! powerpc_elfv2 } } } } } */ >>>>>> +/* AIX 32 bit */ >>>>>> +/* { dg-final { scan-assembler-times {\mstw 2,20\(1\)} 2 { target { ilp32 && powerpc*-*-aix* } } } } */ >>>>>> + >>>>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-4.c b/gcc/testsuite/gcc.target/powerpc/pr108184-4.c >>>>>> new file mode 100644 >>>>>> index 00000000000..2a70a603047 >>>>>> --- /dev/null >>>>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-4.c >>>>>> @@ -0,0 +1,24 @@ >>>>>> +/* { dg-do compile { target { powerpc*-*-aix* powerpc*-*-linux* } } } */ >>>>>> +/* { dg-require-effective-target fpic } */ >>>>>> +/* { dg-options "-fpic -mno-pcrel -O2 -msave-toc-indirect" } */ >>>>>> + >>>>>> +/* Verify the explicit -msave-toc-indirect always >>>>>> + takes effect, so there is only one toc saving. */ >>>>>> + >>>>>> +void foo (void) __attribute__((__longcall__)); >>>>>> +int baz (void) __attribute__((__longcall__)); >>>>>> + >>>>>> +__attribute__ ((cold)) int >>>>>> +bar (void) >>>>>> +{ >>>>>> + foo (); >>>>>> + return baz () + 1; >>>>>> +} >>>>>> + >>>>>> +/* Linux LE */ >>>>>> +/* { dg-final { scan-assembler-times {\mstd 2,24\(1\)} 1 { target powerpc_elfv2 } } } */ >>>>>> +/* Linux BE 64 bit or AIX 64 bit */ >>>>>> +/* { dg-final { scan-assembler-times {\mstd 2,40\(1\)} 1 { target { lp64 && { ! powerpc_elfv2 } } } } } */ >>>>>> +/* AIX 32 bit */ >>>>>> +/* { dg-final { scan-assembler-times {\mstw 2,20\(1\)} 1 { target { ilp32 && powerpc*-*-aix* } } } } */ >>>>>> + >>>>>> -- >>>>>> 2.37.0
diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index a1764018545..9f84468db84 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -1878,7 +1878,10 @@ (define_predicate "fusion_gpr_mem_load" /* Handle sign/zero extend. */ if (GET_CODE (op) == ZERO_EXTEND - || (TARGET_P8_FUSION_SIGN && GET_CODE (op) == SIGN_EXTEND)) + || (TARGET_P8_FUSION_SIGN + && GET_CODE (op) == SIGN_EXTEND + && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN + || optimize_function_for_speed_p (cfun)))) { op = XEXP (op, 0); mode = GET_MODE (op); diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index 6ac3adcec6b..f47d21980a9 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -3997,8 +3997,7 @@ rs6000_option_override_internal (bool global_init_p) /* If we can shrink-wrap the TOC register save separately, then use -msave-toc-indirect unless explicitly disabled. */ if ((rs6000_isa_flags_explicit & OPTION_MASK_SAVE_TOC_INDIRECT) == 0 - && flag_shrink_wrap_separate - && optimize_function_for_speed_p (cfun)) + && flag_shrink_wrap_separate) rs6000_isa_flags |= OPTION_MASK_SAVE_TOC_INDIRECT; /* Enable power8 fusion if we are tuning for power8, even if we aren't @@ -4032,7 +4031,6 @@ rs6000_option_override_internal (bool global_init_p) zero extending load, and an explicit sign extension. */ if (TARGET_P8_FUSION && !(rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN) - && optimize_function_for_speed_p (cfun) && optimize >= 3) rs6000_isa_flags |= OPTION_MASK_P8_FUSION_SIGN; @@ -25690,7 +25688,10 @@ rs6000_call_aix (rtx value, rtx func_desc, rtx tlsarg, rtx cookie) /* Can we optimize saving the TOC in the prologue or do we need to do it at every call? */ - if (TARGET_SAVE_TOC_INDIRECT && !cfun->calls_alloca) + if (TARGET_SAVE_TOC_INDIRECT + && !cfun->calls_alloca + && (rs6000_isa_flags_explicit & OPTION_MASK_SAVE_TOC_INDIRECT + || optimize_function_for_speed_p (cfun))) cfun->machine->save_toc_in_prologue = true; else { @@ -27557,7 +27558,10 @@ fusion_gpr_load_p (rtx addis_reg, /* register set via addis. */ /* Allow sign/zero extension. */ if (GET_CODE (mem) == ZERO_EXTEND - || (GET_CODE (mem) == SIGN_EXTEND && TARGET_P8_FUSION_SIGN)) + || (GET_CODE (mem) == SIGN_EXTEND + && TARGET_P8_FUSION_SIGN + && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN + || optimize_function_for_speed_p (cfun)))) mem = XEXP (mem, 0); if (!MEM_P (mem)) @@ -27621,7 +27625,10 @@ expand_fusion_gpr_load (rtx *operands) enum rtx_code extend = UNKNOWN; if (GET_CODE (orig_mem) == ZERO_EXTEND - || (TARGET_P8_FUSION_SIGN && GET_CODE (orig_mem) == SIGN_EXTEND)) + || (TARGET_P8_FUSION_SIGN + && GET_CODE (orig_mem) == SIGN_EXTEND + && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN + || optimize_function_for_speed_p (cfun)))) { extend = GET_CODE (orig_mem); orig_mem = XEXP (orig_mem, 0); diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-1.c b/gcc/testsuite/gcc.target/powerpc/pr108184-1.c new file mode 100644 index 00000000000..eae3bb9b7c4 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-1.c @@ -0,0 +1,16 @@ +/* Only possible to fuse sign extended loads with the addis when + optimize >= 3 and Power8 fusion takes effects. -mno-prefixed + ensures test point isn't broken (due to prefixed plha). */ +/* { dg-options "-mdejagnu-tune=power8 -O3 -mno-prefixed" } */ + +/* Verify it doesn't optimize this function for speed as it's cold, + by checking it doesn't try to fuse sign extended loads with addis, + which results in a zero extended load and a sign extension. */ + +__attribute__ ((cold)) int +fusion_short (short *p) +{ + return p[0x12345]; +} + +/* { dg-final { scan-assembler-not {\mextsh\M} } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-2.c b/gcc/testsuite/gcc.target/powerpc/pr108184-2.c new file mode 100644 index 00000000000..9f703f7fc6b --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-2.c @@ -0,0 +1,15 @@ +/* -mno-prefixed ensures test point isn't broken (due to prefixed plha). */ +/* { dg-options "-mdejagnu-tune=power8 -O2 -mpower8-fusion-sign -mno-prefixed" } */ + +/* Verify the explicit option -mpower8-fusion-sign can still fuse + sign extended loads with addis, which results in a zero extended + load and a sign extension. It means the cold attribute which + indicates to optimize for size doesn't affect. */ + +__attribute__ ((cold)) int +fusion_short (short *p) +{ + return p[0x12345]; +} + +/* { dg-final { scan-assembler-times {\mextsh\M} 1 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-3.c b/gcc/testsuite/gcc.target/powerpc/pr108184-3.c new file mode 100644 index 00000000000..ceaa96e4421 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-3.c @@ -0,0 +1,25 @@ +/* { dg-do compile { target { powerpc*-*-aix* powerpc*-*-linux* } } } */ +/* { dg-require-effective-target fpic } */ +/* { dg-options "-fpic -mno-pcrel -O2" } */ + +/* Verify it doesn't imply -msave-toc-indirect, so + it doesn't take effect and we have two separated + toc savings for these two long calls. */ + +void foo (void) __attribute__((__longcall__)); +int baz (void) __attribute__((__longcall__)); + +__attribute__ ((cold)) int +bar (void) +{ + foo (); + return baz () + 1; +} + +/* Linux LE */ +/* { dg-final { scan-assembler-times {\mstd 2,24\(1\)} 2 { target powerpc_elfv2 } } } */ +/* Linux BE 64 bit or AIX 64 bit */ +/* { dg-final { scan-assembler-times {\mstd 2,40\(1\)} 2 { target { lp64 && { ! powerpc_elfv2 } } } } } */ +/* AIX 32 bit */ +/* { dg-final { scan-assembler-times {\mstw 2,20\(1\)} 2 { target { ilp32 && powerpc*-*-aix* } } } } */ + diff --git a/gcc/testsuite/gcc.target/powerpc/pr108184-4.c b/gcc/testsuite/gcc.target/powerpc/pr108184-4.c new file mode 100644 index 00000000000..2a70a603047 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr108184-4.c @@ -0,0 +1,24 @@ +/* { dg-do compile { target { powerpc*-*-aix* powerpc*-*-linux* } } } */ +/* { dg-require-effective-target fpic } */ +/* { dg-options "-fpic -mno-pcrel -O2 -msave-toc-indirect" } */ + +/* Verify the explicit -msave-toc-indirect always + takes effect, so there is only one toc saving. */ + +void foo (void) __attribute__((__longcall__)); +int baz (void) __attribute__((__longcall__)); + +__attribute__ ((cold)) int +bar (void) +{ + foo (); + return baz () + 1; +} + +/* Linux LE */ +/* { dg-final { scan-assembler-times {\mstd 2,24\(1\)} 1 { target powerpc_elfv2 } } } */ +/* Linux BE 64 bit or AIX 64 bit */ +/* { dg-final { scan-assembler-times {\mstd 2,40\(1\)} 1 { target { lp64 && { ! powerpc_elfv2 } } } } } */ +/* AIX 32 bit */ +/* { dg-final { scan-assembler-times {\mstw 2,20\(1\)} 1 { target { ilp32 && powerpc*-*-aix* } } } } */ +