Message ID | AM5PR0802MB26102C1AB3F6B5364538F4E883100@AM5PR0802MB2610.eurprd08.prod.outlook.com |
---|---|
State | New |
Headers | show |
On 27/04/17 18:38, Wilco Dijkstra wrote: > The aarch_forward_to_shift_is_not_shifted_reg bypass always returns true > on AArch64 shifted instructions. This causes the bypass to activate in > too many cases, resulting in slower execution on Cortex-A53 like reported > in PR79665. > > This patch uses the arm_no_early_alu_shift_dep condition instead which > improves the example in PR79665 by ~7%. Given it is no longer used, > remove aarch_forward_to_shift_is_not_shifted_reg. > > Passes AArch64 bootstrap and regress. OK for commit? > > ChangeLog: > 2017-04-27 Wilco Dijkstra <wdijkstr@arm.com> > > PR target/79665 > * config/arm/aarch-common.c (arm_no_early_alu_shift_dep): > Remove redundant if. > (aarch_forward_to_shift_is_not_shifted_reg): Remove. > * config/arm/aarch-common-protos.h > (aarch_forward_to_shift_is_not_shifted_re): Remove. > * config/arm/cortex-a53.md: Use arm_no_early_alu_shift_dep in bypass. > > -- > > diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h > index 7c2bb4c2ed93728efcbd9e2811c09dddd04b37fe..4350d975abbbbd2cda55ac31e0d47971b40fcde5 100644 > --- a/gcc/config/arm/aarch-common-protos.h > +++ b/gcc/config/arm/aarch-common-protos.h > @@ -25,7 +25,6 @@ > > extern int aarch_accumulator_forwarding (rtx_insn *, rtx_insn *); > extern int aarch_crypto_can_dual_issue (rtx_insn *, rtx_insn *); > -extern int aarch_forward_to_shift_is_not_shifted_reg (rtx_insn *, rtx_insn *); > extern bool aarch_rev16_p (rtx); > extern bool aarch_rev16_shleft_mask_imm_p (rtx, machine_mode); > extern bool aarch_rev16_shright_mask_imm_p (rtx, machine_mode); > diff --git a/gcc/config/arm/aarch-common.c b/gcc/config/arm/aarch-common.c > index 742d2ff4c7b779ae07b92f8a800e4667e32c44fb..9da2e382b2a1ecabd56acccc57997dbf626da513 100644 > --- a/gcc/config/arm/aarch-common.c > +++ b/gcc/config/arm/aarch-common.c > @@ -254,12 +254,7 @@ arm_no_early_alu_shift_dep (rtx producer, rtx consumer) > return 0; > > if ((early_op = arm_find_shift_sub_rtx (op))) > - { > - if (REG_P (early_op)) > - early_op = op; > - > - return !reg_overlap_mentioned_p (value, early_op); > - } > + return !reg_overlap_mentioned_p (value, early_op); > > return 0; > } This function is used by several aarch32 pipeline description models. What testing have you given it there. Are the changes appropriate for those cores as well? R. > @@ -472,38 +467,6 @@ aarch_accumulator_forwarding (rtx_insn *producer, rtx_insn *consumer) > return (REGNO (dest) == REGNO (accumulator)); > } > > -/* Return nonzero if the CONSUMER instruction is some sort of > - arithmetic or logic + shift operation, and the register we are > - writing in PRODUCER is not used in a register shift by register > - operation. */ > - > -int > -aarch_forward_to_shift_is_not_shifted_reg (rtx_insn *producer, > - rtx_insn *consumer) > -{ > - rtx value, op; > - rtx early_op; > - > - if (!arm_get_set_operands (producer, consumer, &value, &op)) > - return 0; > - > - if ((early_op = arm_find_shift_sub_rtx (op))) > - { > - if (REG_P (early_op)) > - early_op = op; > - > - /* Any other canonicalisation of a shift is a shift-by-constant > - so we don't care. */ > - if (GET_CODE (early_op) == ASHIFT) > - return (!REG_P (XEXP (early_op, 0)) > - || !REG_P (XEXP (early_op, 1))); > - else > - return 1; > - } > - > - return 0; > -} > - > /* Return non-zero if the consumer (a multiply-accumulate instruction) > has an accumulator dependency on the result of the producer (a > multiplication instruction) and no other dependency on that result. */ > diff --git a/gcc/config/arm/cortex-a53.md b/gcc/config/arm/cortex-a53.md > index 7cf5fc5a0cd1d59efd0be3310b78303018138547..5bd0e62a108241ca56b01315908426e3f095fa81 100644 > --- a/gcc/config/arm/cortex-a53.md > +++ b/gcc/config/arm/cortex-a53.md > @@ -211,7 +211,7 @@ (define_bypass 1 "cortex_a53_alu*" > > (define_bypass 1 "cortex_a53_alu*" > "cortex_a53_alu_shift*" > - "aarch_forward_to_shift_is_not_shifted_reg") > + "arm_no_early_alu_shift_dep") > > (define_bypass 2 "cortex_a53_alu*" > "cortex_a53_alu_*,cortex_a53_shift*") >
Richard Earnshaw (lists) wrote: > --- a/gcc/config/arm/aarch-common.c > +++ b/gcc/config/arm/aarch-common.c > @@ -254,12 +254,7 @@ arm_no_early_alu_shift_dep (rtx producer, rtx consumer) > return 0; > > if ((early_op = arm_find_shift_sub_rtx (op))) > - { > - if (REG_P (early_op)) > - early_op = op; > - > - return !reg_overlap_mentioned_p (value, early_op); > - } > + return !reg_overlap_mentioned_p (value, early_op); > > return 0; > } > This function is used by several aarch32 pipeline description models. > What testing have you given it there. Are the changes appropriate for > those cores as well? arm_find_shift_sub_rtx can only ever return NULL_RTX or a shift rtx, so the check for REG_P is dead code. Bootstrap passes on ARM too of course. Wilco
ping Richard Earnshaw (lists) wrote: > --- a/gcc/config/arm/aarch-common.c > +++ b/gcc/config/arm/aarch-common.c > @@ -254,12 +254,7 @@ arm_no_early_alu_shift_dep (rtx producer, rtx consumer) > return 0; > > if ((early_op = arm_find_shift_sub_rtx (op))) > - { > - if (REG_P (early_op)) > - early_op = op; > - > - return !reg_overlap_mentioned_p (value, early_op); > - } > + return !reg_overlap_mentioned_p (value, early_op); > > return 0; > } > This function is used by several aarch32 pipeline description models. > What testing have you given it there. Are the changes appropriate for > those cores as well? arm_find_shift_sub_rtx can only ever return NULL_RTX or a shift rtx, so the check for REG_P is dead code. Bootstrap passes on ARM too of course. Wilco
On Fri, May 05, 2017 at 05:02:46PM +0100, Wilco Dijkstra wrote: > Richard Earnshaw (lists) wrote: > > > --- a/gcc/config/arm/aarch-common.c > > +++ b/gcc/config/arm/aarch-common.c > > @@ -254,12 +254,7 @@ arm_no_early_alu_shift_dep (rtx producer, rtx consumer) > > return 0; > > > > if ((early_op = arm_find_shift_sub_rtx (op))) > > - { > > - if (REG_P (early_op)) > > - early_op = op; > > - > > - return !reg_overlap_mentioned_p (value, early_op); > > - } > > + return !reg_overlap_mentioned_p (value, early_op); > > > > return 0; > > } > > > This function is used by several aarch32 pipeline description models. > > What testing have you given it there. Are the changes appropriate for > > those cores as well? > > arm_find_shift_sub_rtx can only ever return NULL_RTX or a shift rtx, so the > check for REG_P is dead code. Bootstrap passes on ARM too of course. This took me a bit of head-scratching to get right... arm_find_shift_sub_rtx calls arm_find_sub_rtx_with_code, looking for ASHIFT, with find_any_shift set to TRUE. There, we're going to run through the subRTX of pattern, if the code of the subrtx is one of the shift-like patterns, we return X, otherwise we return NULL_RTX. Thus > > - if (REG_P (early_op)) > > - early_op = op; is not needed, and the code can be reduced to: if ((early_op = arm_find_shift_sub_rtx (op))) return !reg_overlap_mentioned_p (value, early_op); return 0; So, this looks fine to me from an AArch64 perspective - but you'll need an ARM OK too as this is shared code. Cheers, James
ping On Fri, May 05, 2017 at 05:02:46PM +0100, Wilco Dijkstra wrote: > Richard Earnshaw (lists) wrote: > > > --- a/gcc/config/arm/aarch-common.c > > +++ b/gcc/config/arm/aarch-common.c > > @@ -254,12 +254,7 @@ arm_no_early_alu_shift_dep (rtx producer, rtx consumer) > > return 0; > > > > if ((early_op = arm_find_shift_sub_rtx (op))) > > - { > > - if (REG_P (early_op)) > > - early_op = op; > > - > > - return !reg_overlap_mentioned_p (value, early_op); > > - } > > + return !reg_overlap_mentioned_p (value, early_op); > > > > return 0; > > } > > > This function is used by several aarch32 pipeline description models. > > What testing have you given it there. Are the changes appropriate for > > those cores as well? > > arm_find_shift_sub_rtx can only ever return NULL_RTX or a shift rtx, so the > check for REG_P is dead code. Bootstrap passes on ARM too of course. This took me a bit of head-scratching to get right... arm_find_shift_sub_rtx calls arm_find_sub_rtx_with_code, looking for ASHIFT, with find_any_shift set to TRUE. There, we're going to run through the subRTX of pattern, if the code of the subrtx is one of the shift-like patterns, we return X, otherwise we return NULL_RTX. Thus > > - if (REG_P (early_op)) > > - early_op = op; is not needed, and the code can be reduced to: if ((early_op = arm_find_shift_sub_rtx (op))) return !reg_overlap_mentioned_p (value, early_op); return 0; So, this looks fine to me from an AArch64 perspective - but you'll need an ARM OK too as this is shared code. Cheers, James
On Wed, Jun 14, 2017 at 2:55 PM, James Greenhalgh <james.greenhalgh@arm.com> wrote: > On Fri, May 05, 2017 at 05:02:46PM +0100, Wilco Dijkstra wrote: >> Richard Earnshaw (lists) wrote: >> >> > --- a/gcc/config/arm/aarch-common.c >> > +++ b/gcc/config/arm/aarch-common.c >> > @@ -254,12 +254,7 @@ arm_no_early_alu_shift_dep (rtx producer, rtx consumer) >> > return 0; >> > >> > if ((early_op = arm_find_shift_sub_rtx (op))) >> > - { >> > - if (REG_P (early_op)) >> > - early_op = op; >> > - >> > - return !reg_overlap_mentioned_p (value, early_op); >> > - } >> > + return !reg_overlap_mentioned_p (value, early_op); >> > >> > return 0; >> > } >> >> > This function is used by several aarch32 pipeline description models. >> > What testing have you given it there. Are the changes appropriate for >> > those cores as well? >> >> arm_find_shift_sub_rtx can only ever return NULL_RTX or a shift rtx, so the >> check for REG_P is dead code. Bootstrap passes on ARM too of course. > > This took me a bit of head-scratching to get right... > > arm_find_shift_sub_rtx calls arm_find_sub_rtx_with_code, looking for > ASHIFT, with find_any_shift set to TRUE. There, we're going to run > through the subRTX of pattern, if the code of the subrtx is one of the > shift-like patterns, we return X, otherwise we return NULL_RTX. > > Thus > >> > - if (REG_P (early_op)) >> > - early_op = op; > > is not needed, and the code can be reduced to: > > if ((early_op = arm_find_shift_sub_rtx (op))) > return !reg_overlap_mentioned_p (value, early_op); > return 0; > > So, this looks fine to me from an AArch64 perspective - but you'll need an > ARM OK too as this is shared code. I'm about to run home for the day but this came in from https://gcc.gnu.org/ml/gcc-patches/2013-09/msg02109.html and James said in that email that this was put in to ensure no segfaults on cortex-a15 / cortex-a7 tuning. I'll try and look at it later this week. Ramana > > Cheers, > James >
Ramana Radhakrishnan wrote: > > I'm about to run home for the day but this came in from > https://gcc.gnu.org/ml/gcc-patches/2013-09/msg02109.html and James > said in that email that this was put in to ensure no segfaults on > cortex-a15 / cortex-a7 tuning. The code is historical - an older version didn't specifically look just for shifts. Given we can only return shift rtx's now it's completely redundant. A bootstrap on armhf with --with-cpu=cortex-a15 passed. Wilco
On 6/28/17 1:49 PM, Wilco Dijkstra wrote: > Ramana Radhakrishnan wrote: >> >> I'm about to run home for the day but this came in from >> https://gcc.gnu.org/ml/gcc-patches/2013-09/msg02109.html and James >> said in that email that this was put in to ensure no segfaults on >> cortex-a15 / cortex-a7 tuning. > > The code is historical - an older version didn't specifically look just for > shifts. Given we can only return shift rtx's now it's completely redundant. > A bootstrap on armhf with --with-cpu=cortex-a15 passed. > In which case - ok . thanks, regards Ramana > Wilco > >
On Wed, Jun 28, 2017 at 01:49:26PM +0100, Wilco Dijkstra wrote: > Ramana Radhakrishnan wrote: > > > > I'm about to run home for the day but this came in from > > https://gcc.gnu.org/ml/gcc-patches/2013-09/msg02109.html and James > > said in that email that this was put in to ensure no segfaults on > > cortex-a15 / cortex-a7 tuning. > > The code is historical - an older version didn't specifically look just for > shifts. Given we can only return shift rtx's now it's completely redundant. > A bootstrap on armhf with --with-cpu=cortex-a15 passed. The final commit was: https://gcc.gnu.org/ml/gcc-patches/2013-11/msg00533.html That ends up being a more involved rewrite of the code in arm_no_early_alu_shift_dep . The check Wilco deletes was in their before the clean-up. Previosuly we were manually walking the RTX for the consumer, assuming that after stripping COND_EXEC and PARALLEL we'd have either: OP = (some_shift (register) (*)) or OP = (some_arithmetic_code (some_shift (register) (*)) (*)) That is, we're either looking at a shift, or an arithmetic operation that contains a shift. The early_op = XEXP (op, 0); would give you either: EARLY_OP = (regiser) or EARLY_OP = (some_shift (register) (*)) But, we're interested in getting to the whole shift. So, we check if we are now looking at a register, and if we are, use op instead: if (REG_P (early_op)) early_op = op; So we end up with either: EARLY_OP = (some_shift (register) (*)) or EARLY_OP = (some_shift (register) (*)) Which is exactly what we wanted. After the refactoring, arm_find_shift_sub_rtx will always return you (some_shift (*) (*)) - that's good we're now more resilient, and always operate on a full shift, but it does mean the check for REG_P can never match. I wrote this a fair while ago, but it looks like a simple oversight. So, this code is safely dead and can be cleaned up as Wilco suggests with no issue. Hope that helps! Thanks, James
diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h index 7c2bb4c2ed93728efcbd9e2811c09dddd04b37fe..4350d975abbbbd2cda55ac31e0d47971b40fcde5 100644 --- a/gcc/config/arm/aarch-common-protos.h +++ b/gcc/config/arm/aarch-common-protos.h @@ -25,7 +25,6 @@ extern int aarch_accumulator_forwarding (rtx_insn *, rtx_insn *); extern int aarch_crypto_can_dual_issue (rtx_insn *, rtx_insn *); -extern int aarch_forward_to_shift_is_not_shifted_reg (rtx_insn *, rtx_insn *); extern bool aarch_rev16_p (rtx); extern bool aarch_rev16_shleft_mask_imm_p (rtx, machine_mode); extern bool aarch_rev16_shright_mask_imm_p (rtx, machine_mode); diff --git a/gcc/config/arm/aarch-common.c b/gcc/config/arm/aarch-common.c index 742d2ff4c7b779ae07b92f8a800e4667e32c44fb..9da2e382b2a1ecabd56acccc57997dbf626da513 100644 --- a/gcc/config/arm/aarch-common.c +++ b/gcc/config/arm/aarch-common.c @@ -254,12 +254,7 @@ arm_no_early_alu_shift_dep (rtx producer, rtx consumer) return 0; if ((early_op = arm_find_shift_sub_rtx (op))) - { - if (REG_P (early_op)) - early_op = op; - - return !reg_overlap_mentioned_p (value, early_op); - } + return !reg_overlap_mentioned_p (value, early_op); return 0; } @@ -472,38 +467,6 @@ aarch_accumulator_forwarding (rtx_insn *producer, rtx_insn *consumer) return (REGNO (dest) == REGNO (accumulator)); } -/* Return nonzero if the CONSUMER instruction is some sort of - arithmetic or logic + shift operation, and the register we are - writing in PRODUCER is not used in a register shift by register - operation. */ - -int -aarch_forward_to_shift_is_not_shifted_reg (rtx_insn *producer, - rtx_insn *consumer) -{ - rtx value, op; - rtx early_op; - - if (!arm_get_set_operands (producer, consumer, &value, &op)) - return 0; - - if ((early_op = arm_find_shift_sub_rtx (op))) - { - if (REG_P (early_op)) - early_op = op; - - /* Any other canonicalisation of a shift is a shift-by-constant - so we don't care. */ - if (GET_CODE (early_op) == ASHIFT) - return (!REG_P (XEXP (early_op, 0)) - || !REG_P (XEXP (early_op, 1))); - else - return 1; - } - - return 0; -} - /* Return non-zero if the consumer (a multiply-accumulate instruction) has an accumulator dependency on the result of the producer (a multiplication instruction) and no other dependency on that result. */ diff --git a/gcc/config/arm/cortex-a53.md b/gcc/config/arm/cortex-a53.md index 7cf5fc5a0cd1d59efd0be3310b78303018138547..5bd0e62a108241ca56b01315908426e3f095fa81 100644 --- a/gcc/config/arm/cortex-a53.md +++ b/gcc/config/arm/cortex-a53.md @@ -211,7 +211,7 @@ (define_bypass 1 "cortex_a53_alu*" (define_bypass 1 "cortex_a53_alu*" "cortex_a53_alu_shift*" - "aarch_forward_to_shift_is_not_shifted_reg") + "arm_no_early_alu_shift_dep") (define_bypass 2 "cortex_a53_alu*" "cortex_a53_alu_*,cortex_a53_shift*")