Message ID | ri68r0qya8w.fsf@virgil.suse.cz |
---|---|
State | New |
Headers | show |
Series | sra: Do not leave work for DSE (that it can sometimes not perform) | expand |
On Fri, 3 May 2024, Martin Jambor wrote: > Hi, > > when looking again at the g++.dg/tree-ssa/pr109849.C testcase we > discovered that it generates terrible store-to-load forwarding stalls > because SRA was leaving behind aggregate loads but all the stores were > by scalar parts and DSE failed to remove the useless load. SRA has > all the knowledge to remove the statement even now, so this small > patch makes it do so. > > With this patch, the g++.dg/tree-ssa/pr109849.C micro-benchmark runs 9 > times faster (on an AMD EPYC 75F3 machine). > > Bootstrapped and tested on x86_64. OK for master? OK. > Given that the patch is simple but can sometimes have large benefit, > could it possibly be backported to gcc-14 branch even if it is not a > regression (at least not in the last decade) in a few weeks? Sounds reasonable. We have some more leeway for X.2 releases. Thanks, Richard. > Thanks, > > Martin > > > gcc/ChangeLog: > > 2024-04-18 Martin Jambor <mjambor@suse.cz> > > * tree-sra.cc (sra_modify_assign): Remove the original statement > also when dealing with a store to a fully covered aggregate from a > non-candidate. > > gcc/testsuite/ChangeLog: > > 2024-04-23 Martin Jambor <mjambor@suse.cz> > > * g++.dg/tree-ssa/pr109849.C: Also check that the aggeegate store > to cur disappears. > * gcc.dg/tree-ssa/ssa-dse-26.c: Instead of relying on DSE, > check that the unwanted stores were removed at early SRA time. > --- > gcc/testsuite/g++.dg/tree-ssa/pr109849.C | 3 ++- > gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-26.c | 6 +++--- > gcc/tree-sra.cc | 14 ++++++++++++-- > 3 files changed, 17 insertions(+), 6 deletions(-) > > diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr109849.C b/gcc/testsuite/g++.dg/tree-ssa/pr109849.C > index cd348c0f590..d06dbb10482 100644 > --- a/gcc/testsuite/g++.dg/tree-ssa/pr109849.C > +++ b/gcc/testsuite/g++.dg/tree-ssa/pr109849.C > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-options "-O2 -fdump-tree-sra" } */ > +/* { dg-options "-O2 -fdump-tree-sra -fdump-tree-optimized" } */ > > #include <vector> > typedef unsigned int uint32_t; > @@ -29,3 +29,4 @@ main() > } > > /* { dg-final { scan-tree-dump "Created a replacement for stack offset" "sra"} } */ > +/* { dg-final { scan-tree-dump-not "cur = MEM" "optimized"} } */ > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-26.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-26.c > index 43152de5616..1d01392c595 100644 > --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-26.c > +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-26.c > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-options "-O2 -fdump-tree-dse1-details -fno-short-enums -fno-tree-fre" } */ > +/* { dg-options "-O2 -fdump-tree-esra -fno-short-enums -fno-tree-fre" } */ > /* { dg-skip-if "we want a BIT_FIELD_REF from fold_truth_andor" { ! lp64 } } */ > /* { dg-skip-if "temporary variable names are not x and y" { mmix-knuth-mmixware } } */ > > @@ -31,5 +31,5 @@ constraint_equal (struct constraint a, struct constraint b) > && constraint_expr_equal (a.rhs, b.rhs); > } > > -/* { dg-final { scan-tree-dump-times "Deleted dead store: x = " 2 "dse1" } } */ > -/* { dg-final { scan-tree-dump-times "Deleted dead store: y = " 2 "dse1" } } */ > +/* { dg-final { scan-tree-dump-not "x = " "esra" } } */ > +/* { dg-final { scan-tree-dump-not "y = " "esra" } } */ > diff --git a/gcc/tree-sra.cc b/gcc/tree-sra.cc > index 32fa28911f2..8040b0c5645 100644 > --- a/gcc/tree-sra.cc > +++ b/gcc/tree-sra.cc > @@ -4854,8 +4854,18 @@ sra_modify_assign (gimple *stmt, gimple_stmt_iterator *gsi) > But use the RHS aggregate to load from to expose more > optimization opportunities. */ > if (access_has_children_p (lacc)) > - generate_subtree_copies (lacc->first_child, rhs, lacc->offset, > - 0, 0, gsi, true, true, loc); > + { > + generate_subtree_copies (lacc->first_child, rhs, lacc->offset, > + 0, 0, gsi, true, true, loc); > + if (lacc->grp_covered) > + { > + unlink_stmt_vdef (stmt); > + gsi_remove (& orig_gsi, true); > + release_defs (stmt); > + sra_stats.deleted++; > + return SRA_AM_REMOVED; > + } > + } > } > > return SRA_AM_NONE; >
diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr109849.C b/gcc/testsuite/g++.dg/tree-ssa/pr109849.C index cd348c0f590..d06dbb10482 100644 --- a/gcc/testsuite/g++.dg/tree-ssa/pr109849.C +++ b/gcc/testsuite/g++.dg/tree-ssa/pr109849.C @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-sra" } */ +/* { dg-options "-O2 -fdump-tree-sra -fdump-tree-optimized" } */ #include <vector> typedef unsigned int uint32_t; @@ -29,3 +29,4 @@ main() } /* { dg-final { scan-tree-dump "Created a replacement for stack offset" "sra"} } */ +/* { dg-final { scan-tree-dump-not "cur = MEM" "optimized"} } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-26.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-26.c index 43152de5616..1d01392c595 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-26.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-26.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-dse1-details -fno-short-enums -fno-tree-fre" } */ +/* { dg-options "-O2 -fdump-tree-esra -fno-short-enums -fno-tree-fre" } */ /* { dg-skip-if "we want a BIT_FIELD_REF from fold_truth_andor" { ! lp64 } } */ /* { dg-skip-if "temporary variable names are not x and y" { mmix-knuth-mmixware } } */ @@ -31,5 +31,5 @@ constraint_equal (struct constraint a, struct constraint b) && constraint_expr_equal (a.rhs, b.rhs); } -/* { dg-final { scan-tree-dump-times "Deleted dead store: x = " 2 "dse1" } } */ -/* { dg-final { scan-tree-dump-times "Deleted dead store: y = " 2 "dse1" } } */ +/* { dg-final { scan-tree-dump-not "x = " "esra" } } */ +/* { dg-final { scan-tree-dump-not "y = " "esra" } } */ diff --git a/gcc/tree-sra.cc b/gcc/tree-sra.cc index 32fa28911f2..8040b0c5645 100644 --- a/gcc/tree-sra.cc +++ b/gcc/tree-sra.cc @@ -4854,8 +4854,18 @@ sra_modify_assign (gimple *stmt, gimple_stmt_iterator *gsi) But use the RHS aggregate to load from to expose more optimization opportunities. */ if (access_has_children_p (lacc)) - generate_subtree_copies (lacc->first_child, rhs, lacc->offset, - 0, 0, gsi, true, true, loc); + { + generate_subtree_copies (lacc->first_child, rhs, lacc->offset, + 0, 0, gsi, true, true, loc); + if (lacc->grp_covered) + { + unlink_stmt_vdef (stmt); + gsi_remove (& orig_gsi, true); + release_defs (stmt); + sra_stats.deleted++; + return SRA_AM_REMOVED; + } + } } return SRA_AM_NONE;