Message ID | d8cb03b5-2392-d38c-d89e-712965358cc7@linux.ibm.com |
---|---|
State | New |
Headers | show |
Series | [rs6000] Add store fusion support for Power10 | expand |
Hi Pat, Good stuff! Comments below. On 8/2/21 3:19 PM, Pat Haugen via Gcc-patches wrote: > Enable store fusion on Power10. > > Use the SCHED_REORDER hook to implement Power10 specific ready list reordering. > As of now, pairing stores for store fusion is the only function being > performed. > > Bootstrap/regtest on powerpc64le(Power10) with no new regressions. Ok for > master? > > -Pat > > > 2021-08-02 Pat Haugen <pthaugen@linux.ibm.com> > > gcc/ChangeLog: > > * config/rs6000/rs6000-cpus.def (ISA_3_1_MASKS_SERVER): Add new flag. > (POWERPC_MASKS): Likewise. > * config/rs6000/rs6000.c (rs6000_option_override_internal): Enable > store fusion for Power10. > (is_load_insn1): Verify destination is a register. > (is_store_insn1): Verify source is a register. > (is_fusable_store): New. > (power10_sched_reorder): Likewise. > (rs6000_sched_reorder): Do Power10 specific reordering. > (rs6000_sched_reorder2): Likewise. > * config/rs6000/rs6000.opt: Add new option. > > > > diff --git a/gcc/config/rs6000/rs6000-cpus.def b/gcc/config/rs6000/rs6000-cpus.def > index 6758296c0fd..f5812da0184 100644 > --- a/gcc/config/rs6000/rs6000-cpus.def > +++ b/gcc/config/rs6000/rs6000-cpus.def > @@ -90,7 +90,8 @@ > | OPTION_MASK_P10_FUSION_2LOGICAL \ > | OPTION_MASK_P10_FUSION_LOGADD \ > | OPTION_MASK_P10_FUSION_ADDLOG \ > - | OPTION_MASK_P10_FUSION_2ADD) > + | OPTION_MASK_P10_FUSION_2ADD \ > + | OPTION_MASK_P10_FUSION_2STORE) This is all fine for now; as we've discussed elsewhere, we probably want to eventually consolidate all these fusion flags into one. > > /* Flags that need to be turned off if -mno-power9-vector. */ > #define OTHER_P9_VECTOR_MASKS (OPTION_MASK_FLOAT128_HW \ > @@ -143,6 +144,7 @@ > | OPTION_MASK_P10_FUSION_LOGADD \ > | OPTION_MASK_P10_FUSION_ADDLOG \ > | OPTION_MASK_P10_FUSION_2ADD \ > + | OPTION_MASK_P10_FUSION_2STORE \ > | OPTION_MASK_HTM \ > | OPTION_MASK_ISEL \ > | OPTION_MASK_MFCRF \ > diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c > index 279f00cc648..1460a0d7c5c 100644 > --- a/gcc/config/rs6000/rs6000.c > +++ b/gcc/config/rs6000/rs6000.c > @@ -4490,6 +4490,10 @@ rs6000_option_override_internal (bool global_init_p) > && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2ADD) == 0) > rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2ADD; > > + if (TARGET_POWER10 > + && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2STORE) == 0) > + rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2STORE; > + > /* Turn off vector pair/mma options on non-power10 systems. */ > else if (!TARGET_POWER10 && TARGET_MMA) > { > @@ -18357,7 +18361,7 @@ is_load_insn1 (rtx pat, rtx *load_mem) > if (!pat || pat == NULL_RTX) > return false; > > - if (GET_CODE (pat) == SET) > + if (GET_CODE (pat) == SET && REG_P (SET_DEST (pat))) > return find_mem_ref (SET_SRC (pat), load_mem); Looks like this is just an optimization to quickly discard stores, right? > if (GET_CODE (pat) == PARALLEL) > @@ -18394,7 +18398,8 @@ is_store_insn1 (rtx pat, rtx *str_mem) > if (!pat || pat == NULL_RTX) > return false; > > - if (GET_CODE (pat) == SET) > + if (GET_CODE (pat) == SET > + && (REG_P (SET_SRC (pat)) || SUBREG_P (SET_SRC (pat)))) > return find_mem_ref (SET_DEST (pat), str_mem); Similar question. > > if (GET_CODE (pat) == PARALLEL) > @@ -18859,6 +18864,96 @@ power9_sched_reorder2 (rtx_insn **ready, int lastpos) > return cached_can_issue_more; > } > > +/* Determine if INSN is a store to memory that can be fused with a similar > + adjacent store. */ > + > +static bool > +is_fusable_store (rtx_insn *insn, rtx *str_mem) > +{ > + /* Exit early if not doing store fusion. */ > + if (!(TARGET_P10_FUSION && TARGET_P10_FUSION_2STORE)) > + return false; > + > + /* Insn must be a non-prefixed base+disp form store. */ > + if (is_store_insn (insn, str_mem) > + && get_attr_prefixed (insn) == PREFIXED_NO > + && get_attr_update (insn) == UPDATE_NO > + && get_attr_indexed (insn) == INDEXED_NO) > + { > + /* Further restictions by mode and size. */ > + machine_mode mode = GET_MODE (*str_mem); > + HOST_WIDE_INT size; > + if MEM_SIZE_KNOWN_P (*str_mem) > + size = MEM_SIZE (*str_mem); > + else > + return false; > + > + if INTEGRAL_MODE_P (mode) > + { > + /* Must be word or dword size. */ > + return (size == 4 || size == 8); > + } > + else if FLOAT_MODE_P (mode) > + { > + /* Must be dword size. */ > + return (size == 8); > + } > + } > + > + return false; > +} > + > +/* Do Power10 specific reordering of the ready list. */ > + > +static int > +power10_sched_reorder (rtx_insn **ready, int lastpos) > +{ > + int pos; > + rtx mem1, mem2; > + > + /* Do store fusion during sched2 only. */ > + if (!reload_completed) > + return cached_can_issue_more; > + > + /* If the prior insn finished off a store fusion pair then simply > + reset the counter and return, nothing more to do. */ Good comments throughout, thanks! > + if (load_store_pendulum != 0) > + { > + load_store_pendulum = 0; > + return cached_can_issue_more; > + } > + > + /* Try to pair certain store insns to adjacent memory locations > + so that the hardware will fuse them to a single operation. */ > + if (is_fusable_store (last_scheduled_insn, &mem1)) > + { > + /* A fusable store was just scheduled. Scan the ready list for another > + store that it can fuse with. */ > + pos = lastpos; > + while (pos >= 0) > + { > + /* GPR stores can be ascending or descending offsets, FPR/VSR stores VSR? I don't see how that applies here. > + must be ascending only. */ > + if (is_fusable_store (ready[pos], &mem2) > + && ((INTEGRAL_MODE_P (GET_MODE (mem1)) > + && adjacent_mem_locations (mem1, mem2)) > + || (FLOAT_MODE_P (GET_MODE (mem1)) > + && (adjacent_mem_locations (mem1, mem2) == mem1)))) > + { > + /* Found a fusable store. Move it to the end of the ready list > + so it is scheduled next. */ > + move_to_end_of_ready (ready, pos, lastpos); > + > + load_store_pendulum = -1; > + break; > + } > + pos--; > + } > + } > + > + return cached_can_issue_more; > +} > + > /* We are about to begin issuing insns for this clock cycle. */ > > static int > @@ -18885,6 +18980,10 @@ rs6000_sched_reorder (FILE *dump ATTRIBUTE_UNUSED, int sched_verbose, > if (rs6000_tune == PROCESSOR_POWER6) > load_store_pendulum = 0; > > + /* Do Power10 dependent reordering. */ > + if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn) > + power10_sched_reorder (ready, *pn_ready - 1); > + I happened to notice that pn_ready is marked as ATTRIBUTE_UNUSED. This predates your patch, but maybe you could clean that up too. > return rs6000_issue_rate (); > } > > @@ -18906,6 +19005,10 @@ rs6000_sched_reorder2 (FILE *dump, int sched_verbose, rtx_insn **ready, > && recog_memoized (last_scheduled_insn) >= 0) > return power9_sched_reorder2 (ready, *pn_ready - 1); > > + /* Do Power10 dependent reordering. */ > + if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn) > + return power10_sched_reorder (ready, *pn_ready - 1); > + > return cached_can_issue_more; > } > > diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt > index 0538db387dc..3753de19557 100644 > --- a/gcc/config/rs6000/rs6000.opt > +++ b/gcc/config/rs6000/rs6000.opt > @@ -514,6 +514,10 @@ mpower10-fusion-2add > Target Undocumented Mask(P10_FUSION_2ADD) Var(rs6000_isa_flags) > Fuse dependent pairs of add or vaddudm instructions for better performance on power10. > > +mpower10-fusion-2store > +Target Undocumented Mask(P10_FUSION_2STORE) Var(rs6000_isa_flags) > +Fuse certain store operations together for better performance on power10. > + > mcrypto > Target Mask(CRYPTO) Var(rs6000_isa_flags) > Use ISA 2.07 Category:Vector.AES and Category:Vector.SHA2 instructions. Can you think of any test cases we can use to demonstrate store fusion? Otherwise looks good to me. Thanks! Bill
On Wed, Aug 04, 2021 at 09:23:13AM -0500, Bill Schmidt wrote: > On 8/2/21 3:19 PM, Pat Haugen via Gcc-patches wrote: (I reviewed this elsewhere instead of on the list... Not good, since the patch was on the list already. Sorry.) > >@@ -18885,6 +18980,10 @@ rs6000_sched_reorder (FILE *dump > >ATTRIBUTE_UNUSED, int sched_verbose, > > if (rs6000_tune == PROCESSOR_POWER6) > > load_store_pendulum = 0; > > > >+ /* Do Power10 dependent reordering. */ > >+ if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn) > >+ power10_sched_reorder (ready, *pn_ready - 1); > >+ > > I happened to notice that pn_ready is marked as ATTRIBUTE_UNUSED. This > predates your patch, but maybe you could clean that up too. *All* ATTRIBUTE_UNUSED instances should go away (or almost all, only those that really mean "maybe unused" can stay). The preferred way to in C++ say some argument is unused is by not naming it (or only in a comment). So instead of void f (int a ATTRIBUTE_UNUSED, int b ATTRIBUTE_UNUSED) { ... } you can say void f (int, int) { ... } or void f (int/*a*/, int/*b*/) { ... } > Can you think of any test cases we can use to demonstrate store fusion? It may be possible to make that reliable by doing a *really* simple testcase? But then it isn't testing very much :-( Segher
On 8/4/21 9:23 AM, Bill Schmidt wrote: > Hi Pat, > > Good stuff! Comments below. > > On 8/2/21 3:19 PM, Pat Haugen via Gcc-patches wrote: >> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c >> index 279f00cc648..1460a0d7c5c 100644 >> --- a/gcc/config/rs6000/rs6000.c >> +++ b/gcc/config/rs6000/rs6000.c >> @@ -4490,6 +4490,10 @@ rs6000_option_override_internal (bool global_init_p) >> && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2ADD) == 0) >> rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2ADD; >> >> + if (TARGET_POWER10 >> + && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2STORE) == 0) >> + rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2STORE; >> + >> /* Turn off vector pair/mma options on non-power10 systems. */ >> else if (!TARGET_POWER10 && TARGET_MMA) >> { >> @@ -18357,7 +18361,7 @@ is_load_insn1 (rtx pat, rtx *load_mem) >> if (!pat || pat == NULL_RTX) >> return false; >> >> - if (GET_CODE (pat) == SET) >> + if (GET_CODE (pat) == SET && REG_P (SET_DEST (pat))) >> return find_mem_ref (SET_SRC (pat), load_mem); > Looks like this is just an optimization to quickly discard stores, right? Additional verification check to make sure destination is REG, yes. This will become a separate patch. >> if (GET_CODE (pat) == PARALLEL) >> @@ -18394,7 +18398,8 @@ is_store_insn1 (rtx pat, rtx *str_mem) >> if (!pat || pat == NULL_RTX) >> return false; >> >> - if (GET_CODE (pat) == SET) >> + if (GET_CODE (pat) == SET >> + && (REG_P (SET_SRC (pat)) || SUBREG_P (SET_SRC (pat)))) >> return find_mem_ref (SET_DEST (pat), str_mem); > > > Similar question. Similar answer. :) > >> >> if (GET_CODE (pat) == PARALLEL) >> @@ -18859,6 +18864,96 @@ power9_sched_reorder2 (rtx_insn **ready, int lastpos) >> return cached_can_issue_more; >> } >> >> +/* Determine if INSN is a store to memory that can be fused with a similar >> + adjacent store. */ >> + >> +static bool >> +is_fusable_store (rtx_insn *insn, rtx *str_mem) >> +{ >> + /* Exit early if not doing store fusion. */ >> + if (!(TARGET_P10_FUSION && TARGET_P10_FUSION_2STORE)) >> + return false; >> + >> + /* Insn must be a non-prefixed base+disp form store. */ >> + if (is_store_insn (insn, str_mem) >> + && get_attr_prefixed (insn) == PREFIXED_NO >> + && get_attr_update (insn) == UPDATE_NO >> + && get_attr_indexed (insn) == INDEXED_NO) >> + { >> + /* Further restictions by mode and size. */ >> + machine_mode mode = GET_MODE (*str_mem); >> + HOST_WIDE_INT size; >> + if MEM_SIZE_KNOWN_P (*str_mem) >> + size = MEM_SIZE (*str_mem); >> + else >> + return false; >> + >> + if INTEGRAL_MODE_P (mode) >> + { >> + /* Must be word or dword size. */ >> + return (size == 4 || size == 8); >> + } >> + else if FLOAT_MODE_P (mode) >> + { >> + /* Must be dword size. */ >> + return (size == 8); >> + } >> + } >> + >> + return false; >> +} >> + >> +/* Do Power10 specific reordering of the ready list. */ >> + >> +static int >> +power10_sched_reorder (rtx_insn **ready, int lastpos) >> +{ >> + int pos; >> + rtx mem1, mem2; >> + >> + /* Do store fusion during sched2 only. */ >> + if (!reload_completed) >> + return cached_can_issue_more; >> + >> + /* If the prior insn finished off a store fusion pair then simply >> + reset the counter and return, nothing more to do. */ > > > Good comments throughout, thanks! > >> + if (load_store_pendulum != 0) >> + { >> + load_store_pendulum = 0; >> + return cached_can_issue_more; >> + } >> + >> + /* Try to pair certain store insns to adjacent memory locations >> + so that the hardware will fuse them to a single operation. */ >> + if (is_fusable_store (last_scheduled_insn, &mem1)) >> + { >> + /* A fusable store was just scheduled. Scan the ready list for another >> + store that it can fuse with. */ >> + pos = lastpos; >> + while (pos >= 0) >> + { >> + /* GPR stores can be ascending or descending offsets, FPR/VSR stores > VSR? I don't see how that applies here. Scalar floating point store from VSX reg (i.e. stxsd). >> + must be ascending only. */ >> + if (is_fusable_store (ready[pos], &mem2) >> + && ((INTEGRAL_MODE_P (GET_MODE (mem1)) >> + && adjacent_mem_locations (mem1, mem2)) >> + || (FLOAT_MODE_P (GET_MODE (mem1)) >> + && (adjacent_mem_locations (mem1, mem2) == mem1)))) >> + { >> + /* Found a fusable store. Move it to the end of the ready list >> + so it is scheduled next. */ >> + move_to_end_of_ready (ready, pos, lastpos); >> + >> + load_store_pendulum = -1; >> + break; >> + } >> + pos--; >> + } >> + } >> + >> + return cached_can_issue_more; >> +} >> + >> /* We are about to begin issuing insns for this clock cycle. */ >> >> static int >> @@ -18885,6 +18980,10 @@ rs6000_sched_reorder (FILE *dump ATTRIBUTE_UNUSED, int sched_verbose, >> if (rs6000_tune == PROCESSOR_POWER6) >> load_store_pendulum = 0; >> >> + /* Do Power10 dependent reordering. */ >> + if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn) >> + power10_sched_reorder (ready, *pn_ready - 1); >> + > > > I happened to notice that pn_ready is marked as ATTRIBUTE_UNUSED. This predates your patch, but maybe you could clean that up too. Will do as a separate cleanup. > >> return rs6000_issue_rate (); >> } >> >> @@ -18906,6 +19005,10 @@ rs6000_sched_reorder2 (FILE *dump, int sched_verbose, rtx_insn **ready, >> && recog_memoized (last_scheduled_insn) >= 0) >> return power9_sched_reorder2 (ready, *pn_ready - 1); >> >> + /* Do Power10 dependent reordering. */ >> + if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn) >> + return power10_sched_reorder (ready, *pn_ready - 1); >> + >> return cached_can_issue_more; >> } >> >> diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt >> index 0538db387dc..3753de19557 100644 >> --- a/gcc/config/rs6000/rs6000.opt >> +++ b/gcc/config/rs6000/rs6000.opt >> @@ -514,6 +514,10 @@ mpower10-fusion-2add >> Target Undocumented Mask(P10_FUSION_2ADD) Var(rs6000_isa_flags) >> Fuse dependent pairs of add or vaddudm instructions for better performance on power10. >> >> +mpower10-fusion-2store >> +Target Undocumented Mask(P10_FUSION_2STORE) Var(rs6000_isa_flags) >> +Fuse certain store operations together for better performance on power10. >> + >> mcrypto >> Target Mask(CRYPTO) Var(rs6000_isa_flags) >> Use ISA 2.07 Category:Vector.AES and Category:Vector.SHA2 instructions. > > Can you think of any test cases we can use to demonstrate store fusion? Yeah, should be able to come up with something to verify two adjacent stores. Thanks for the review. > > Otherwise looks good to me. Thanks! > > Bill >
Hi! On Wed, Aug 04, 2021 at 04:16:45PM -0500, Pat Haugen wrote: > On 8/4/21 9:23 AM, Bill Schmidt wrote: > >> + /* GPR stores can be ascending or descending offsets, FPR/VSR stores > > VSR? I don't see how that applies here. Almost all scalar FP insns have a VR alternative, but unfortunately not (always) with a unified or consistent syntax. > > Can you think of any test cases we can use to demonstrate store fusion? > > Yeah, should be able to come up with something to verify two adjacent stores. The testcase should also stop working when the fusion does. This might mean you need more than one testcase. The number one goal of our testsuite is to warn of regressions, so it really should be more than just a single two-liner scanning for one assembler insn. Maybe do a handful of cases? :-) Segher
diff --git a/gcc/config/rs6000/rs6000-cpus.def b/gcc/config/rs6000/rs6000-cpus.def index 6758296c0fd..f5812da0184 100644 --- a/gcc/config/rs6000/rs6000-cpus.def +++ b/gcc/config/rs6000/rs6000-cpus.def @@ -90,7 +90,8 @@ | OPTION_MASK_P10_FUSION_2LOGICAL \ | OPTION_MASK_P10_FUSION_LOGADD \ | OPTION_MASK_P10_FUSION_ADDLOG \ - | OPTION_MASK_P10_FUSION_2ADD) + | OPTION_MASK_P10_FUSION_2ADD \ + | OPTION_MASK_P10_FUSION_2STORE) /* Flags that need to be turned off if -mno-power9-vector. */ #define OTHER_P9_VECTOR_MASKS (OPTION_MASK_FLOAT128_HW \ @@ -143,6 +144,7 @@ | OPTION_MASK_P10_FUSION_LOGADD \ | OPTION_MASK_P10_FUSION_ADDLOG \ | OPTION_MASK_P10_FUSION_2ADD \ + | OPTION_MASK_P10_FUSION_2STORE \ | OPTION_MASK_HTM \ | OPTION_MASK_ISEL \ | OPTION_MASK_MFCRF \ diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 279f00cc648..1460a0d7c5c 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -4490,6 +4490,10 @@ rs6000_option_override_internal (bool global_init_p) && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2ADD) == 0) rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2ADD; + if (TARGET_POWER10 + && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2STORE) == 0) + rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2STORE; + /* Turn off vector pair/mma options on non-power10 systems. */ else if (!TARGET_POWER10 && TARGET_MMA) { @@ -18357,7 +18361,7 @@ is_load_insn1 (rtx pat, rtx *load_mem) if (!pat || pat == NULL_RTX) return false; - if (GET_CODE (pat) == SET) + if (GET_CODE (pat) == SET && REG_P (SET_DEST (pat))) return find_mem_ref (SET_SRC (pat), load_mem); if (GET_CODE (pat) == PARALLEL) @@ -18394,7 +18398,8 @@ is_store_insn1 (rtx pat, rtx *str_mem) if (!pat || pat == NULL_RTX) return false; - if (GET_CODE (pat) == SET) + if (GET_CODE (pat) == SET + && (REG_P (SET_SRC (pat)) || SUBREG_P (SET_SRC (pat)))) return find_mem_ref (SET_DEST (pat), str_mem); if (GET_CODE (pat) == PARALLEL) @@ -18859,6 +18864,96 @@ power9_sched_reorder2 (rtx_insn **ready, int lastpos) return cached_can_issue_more; } +/* Determine if INSN is a store to memory that can be fused with a similar + adjacent store. */ + +static bool +is_fusable_store (rtx_insn *insn, rtx *str_mem) +{ + /* Exit early if not doing store fusion. */ + if (!(TARGET_P10_FUSION && TARGET_P10_FUSION_2STORE)) + return false; + + /* Insn must be a non-prefixed base+disp form store. */ + if (is_store_insn (insn, str_mem) + && get_attr_prefixed (insn) == PREFIXED_NO + && get_attr_update (insn) == UPDATE_NO + && get_attr_indexed (insn) == INDEXED_NO) + { + /* Further restictions by mode and size. */ + machine_mode mode = GET_MODE (*str_mem); + HOST_WIDE_INT size; + if MEM_SIZE_KNOWN_P (*str_mem) + size = MEM_SIZE (*str_mem); + else + return false; + + if INTEGRAL_MODE_P (mode) + { + /* Must be word or dword size. */ + return (size == 4 || size == 8); + } + else if FLOAT_MODE_P (mode) + { + /* Must be dword size. */ + return (size == 8); + } + } + + return false; +} + +/* Do Power10 specific reordering of the ready list. */ + +static int +power10_sched_reorder (rtx_insn **ready, int lastpos) +{ + int pos; + rtx mem1, mem2; + + /* Do store fusion during sched2 only. */ + if (!reload_completed) + return cached_can_issue_more; + + /* If the prior insn finished off a store fusion pair then simply + reset the counter and return, nothing more to do. */ + if (load_store_pendulum != 0) + { + load_store_pendulum = 0; + return cached_can_issue_more; + } + + /* Try to pair certain store insns to adjacent memory locations + so that the hardware will fuse them to a single operation. */ + if (is_fusable_store (last_scheduled_insn, &mem1)) + { + /* A fusable store was just scheduled. Scan the ready list for another + store that it can fuse with. */ + pos = lastpos; + while (pos >= 0) + { + /* GPR stores can be ascending or descending offsets, FPR/VSR stores + must be ascending only. */ + if (is_fusable_store (ready[pos], &mem2) + && ((INTEGRAL_MODE_P (GET_MODE (mem1)) + && adjacent_mem_locations (mem1, mem2)) + || (FLOAT_MODE_P (GET_MODE (mem1)) + && (adjacent_mem_locations (mem1, mem2) == mem1)))) + { + /* Found a fusable store. Move it to the end of the ready list + so it is scheduled next. */ + move_to_end_of_ready (ready, pos, lastpos); + + load_store_pendulum = -1; + break; + } + pos--; + } + } + + return cached_can_issue_more; +} + /* We are about to begin issuing insns for this clock cycle. */ static int @@ -18885,6 +18980,10 @@ rs6000_sched_reorder (FILE *dump ATTRIBUTE_UNUSED, int sched_verbose, if (rs6000_tune == PROCESSOR_POWER6) load_store_pendulum = 0; + /* Do Power10 dependent reordering. */ + if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn) + power10_sched_reorder (ready, *pn_ready - 1); + return rs6000_issue_rate (); } @@ -18906,6 +19005,10 @@ rs6000_sched_reorder2 (FILE *dump, int sched_verbose, rtx_insn **ready, && recog_memoized (last_scheduled_insn) >= 0) return power9_sched_reorder2 (ready, *pn_ready - 1); + /* Do Power10 dependent reordering. */ + if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn) + return power10_sched_reorder (ready, *pn_ready - 1); + return cached_can_issue_more; } diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt index 0538db387dc..3753de19557 100644 --- a/gcc/config/rs6000/rs6000.opt +++ b/gcc/config/rs6000/rs6000.opt @@ -514,6 +514,10 @@ mpower10-fusion-2add Target Undocumented Mask(P10_FUSION_2ADD) Var(rs6000_isa_flags) Fuse dependent pairs of add or vaddudm instructions for better performance on power10. +mpower10-fusion-2store +Target Undocumented Mask(P10_FUSION_2STORE) Var(rs6000_isa_flags) +Fuse certain store operations together for better performance on power10. + mcrypto Target Mask(CRYPTO) Var(rs6000_isa_flags) Use ISA 2.07 Category:Vector.AES and Category:Vector.SHA2 instructions.