Message ID | AM3PR08MB00881BE3867DF3FC5B5B7530836C0@AM3PR08MB0088.eurprd08.prod.outlook.com |
---|---|
State | New |
Headers | show |
On Tue, Apr 19, 2016 at 6:32 PM, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote: > Richard Biener wrote: >> >> This folding should be added to gimple-fold.c:gimple_fold_builtin instead, >> the builtins.c foldings are purerly for folding to constants nowadays. > > So is this better? It's a lot more verbose for something so simple... > Unfortunately match.pd doesn't support this kind of thing either. Better - comments below. Jakub objections to the usefulness of the transform remain - we do have the strlen pass that uses some global knowledge to decide on profitability. One could argue that for -Os doing the reverse transform is profitable? Yes, match.pd doesn't support this (yet). It may be possible to teach it simple cases like this - I plan to revisit this at some point. The issue is one of side-effects which are tricky to handle if you consider patterns that do not only match a single call (as this one would). So a baby-step towards getting this supported in match.pd is to handle matching toplevel calls specially. Richard. > Wilco > > > ChangeLog: > 2016-04-19 Wilco Dijkstra <wdijkstr@arm.com> > > gcc/ > * gcc/gimple-fold.c (gimple_fold_builtin_strchr): > New function to optimize strchr (s, 0) to strlen. > (gimple_fold_builtin): Add BUILT_IN_STRCHR case. > > testsuite/ > * gcc/testsuite/gcc.dg/strlenopt-20.c: Update test. > * gcc/testsuite/gcc.dg/strlenopt-21.c: Likewise. > * gcc/testsuite/gcc.dg/strlenopt-22.c: Likewise. > * gcc/testsuite/gcc.dg/strlenopt-26.c: Likewise. > * gcc/testsuite/gcc.dg/strlenopt-5.c: Likewise. > * gcc/testsuite/gcc.dg/strlenopt-7.c: Likewise. > * gcc/testsuite/gcc.dg/strlenopt-9.c: Likewise. > > -- > > diff --git a/gcc/gimple-fold.c b/gcc/gimple-fold.c > index eb130d048469f0b8196e565fed9a40de74b098bd..11dcf69fc919f066362f4f713db392d14b39764e 100644 > --- a/gcc/gimple-fold.c > +++ b/gcc/gimple-fold.c > @@ -1380,6 +1380,59 @@ gimple_fold_builtin_strncpy (gimple_stmt_iterator *gsi, > return true; > } > > +/* Simplify strchr (str, 0) into str + strlen (str). > + In general strlen is significantly faster than strchr > + due to being a simpler operation. */ > +static bool > +gimple_fold_builtin_strchr (gimple_stmt_iterator *gsi) > +{ > + gimple *stmt = gsi_stmt (*gsi); > + tree str = gimple_call_arg (stmt, 0); > + tree c = gimple_call_arg (stmt, 1); > + location_t loc = gimple_location (stmt); > + > + if (optimize_function_for_size_p (cfun)) > + return false; Hmm, I think we'd want a optimize_stmt_for_size_p (stmt) which does the right thing for the case we have a CFG (look at the BB) or when not (look at the function). > + if (!integer_zerop (c) || !gimple_call_lhs (stmt)) > + return false; > + > + tree newstr; > + tree strlen_fn = builtin_decl_implicit (BUILT_IN_STRLEN); > + > + if (!strlen_fn) > + return false; > + > + /* Create newstr = strlen (str). */ > + gimple_seq stmts = NULL, stmts2; > + gimple *repl = gimple_build_call (strlen_fn, 1, str); > + gimple_set_location (repl, loc); > + if (gimple_in_ssa_p (cfun)) > + newstr = make_ssa_name (size_type_node); > + else > + newstr = create_tmp_reg (size_type_node); > + gimple_call_set_lhs (repl, newstr); > + gimple_seq_add_stmt_without_update (&stmts, repl); > + > + /* Create (str p+ strlen (str)). */ > + newstr = fold_build_pointer_plus_loc (loc, str, newstr); > + newstr = force_gimple_operand (newstr, &stmts2, true, NULL_TREE); I think you want to build a gimple_assign directly here, otherwise ... > + gimple_seq_add_seq_without_update (&stmts, stmts2); > + > + repl = gimple_build_assign (gimple_call_lhs (stmt), newstr); > + gimple_seq_add_stmt_without_update (&stmts, repl); > + gsi_replace_with_seq_vops (gsi, stmts); > + /* gsi now points at the assignment to the lhs, get a > + stmt iterator to the strlen. > + ??? We can't use gsi_for_stmt as that doesn't work when the > + CFG isn't built yet. */ > + gimple_stmt_iterator gsi2 = *gsi; > + gsi_prev (&gsi2); > + gsi_prev (&gsi2); ... this may not reliably end up at the call stmt. > + fold_stmt (&gsi2); > + return true; > +} > + > /* Simplify a call to the strcat builtin. DST and SRC are the arguments > to the call. > > @@ -2821,6 +2874,8 @@ gimple_fold_builtin (gimple_stmt_iterator *gsi) > gimple_call_arg (stmt, 1)); > case BUILT_IN_STRNCAT: > return gimple_fold_builtin_strncat (gsi); > + case BUILT_IN_STRCHR: > + return gimple_fold_builtin_strchr (gsi); > case BUILT_IN_FPUTS: > return gimple_fold_builtin_fputs (gsi, gimple_call_arg (stmt, 0), > gimple_call_arg (stmt, 1), false); > diff --git a/gcc/testsuite/gcc.dg/strlenopt-20.c b/gcc/testsuite/gcc.dg/strlenopt-20.c > index a83e845c26d88e5acdcabf142f7b319136663488..7b483eaeac1aa47278111a92148a16f00b2aaa2d 100644 > --- a/gcc/testsuite/gcc.dg/strlenopt-20.c > +++ b/gcc/testsuite/gcc.dg/strlenopt-20.c > @@ -86,9 +86,9 @@ main () > return 0; > } > > -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */ > +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "memcpy \\(" 4 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ > -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */ > +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ > diff --git a/gcc/testsuite/gcc.dg/strlenopt-21.c b/gcc/testsuite/gcc.dg/strlenopt-21.c > index e22fa9fca9ba14354db2cd5f602283b64bd8dcac..05b85a49dde0a7f5d269174fd4269e40be910dbd 100644 > --- a/gcc/testsuite/gcc.dg/strlenopt-21.c > +++ b/gcc/testsuite/gcc.dg/strlenopt-21.c > @@ -57,9 +57,9 @@ main () > return 0; > } > > -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */ > +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "memcpy \\(" 3 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ > -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */ > +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ > diff --git a/gcc/testsuite/gcc.dg/strlenopt-22.c b/gcc/testsuite/gcc.dg/strlenopt-22.c > index aa55f5ebd6a2d4803ee9a7fd60fc538d86f47124..b4ef772f0e59252f10a5419ede6837b3c8ca8265 100644 > --- a/gcc/testsuite/gcc.dg/strlenopt-22.c > +++ b/gcc/testsuite/gcc.dg/strlenopt-22.c > @@ -31,9 +31,9 @@ main () > return 0; > } > > -/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */ > +/* { dg-final { scan-tree-dump-times "strlen \\(" 4 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "memcpy \\(" 1 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ > -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */ > +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ > diff --git a/gcc/testsuite/gcc.dg/strlenopt-26.c b/gcc/testsuite/gcc.dg/strlenopt-26.c > index 4bd54bef540806ca90d2b9bcfc33eb531991c967..da2f465a5b5003fa5dca05f3a6ee00e97b98b5dd 100644 > --- a/gcc/testsuite/gcc.dg/strlenopt-26.c > +++ b/gcc/testsuite/gcc.dg/strlenopt-26.c > @@ -21,4 +21,5 @@ main (void) > return 0; > } > > -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */ > +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */ > +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ > diff --git a/gcc/testsuite/gcc.dg/strlenopt-5.c b/gcc/testsuite/gcc.dg/strlenopt-5.c > index 1b006a93045599a75bdb10a39e86ffa59b475c83..a24aea44e8b00ff7b35a907aaa941b4c509642c4 100644 > --- a/gcc/testsuite/gcc.dg/strlenopt-5.c > +++ b/gcc/testsuite/gcc.dg/strlenopt-5.c > @@ -48,9 +48,9 @@ main () > return 0; > } > > -/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */ > +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ > -/* { dg-final { scan-tree-dump-times "strchr \\(" 2 "strlen" } } */ > +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ > diff --git a/gcc/testsuite/gcc.dg/strlenopt-7.c b/gcc/testsuite/gcc.dg/strlenopt-7.c > index 3ae1e2cb3f0d08c8b05107c4e65b67bdb39cf7ab..aa53d7e75254dfe56c93172afc49f95e5b7901e6 100644 > --- a/gcc/testsuite/gcc.dg/strlenopt-7.c > +++ b/gcc/testsuite/gcc.dg/strlenopt-7.c > @@ -40,11 +40,11 @@ main () > return 0; > } > > -/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */ > +/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ > -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */ > +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "\\*r_\[0-9\]* = 0;" 1 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "return 3;" 1 "optimized" } } */ > diff --git a/gcc/testsuite/gcc.dg/strlenopt-9.c b/gcc/testsuite/gcc.dg/strlenopt-9.c > index b0406b162d48fca375883035043b0c50b9db61a1..e5e276210ba4b7d75867605f1ecf5c06eb970ef5 100644 > --- a/gcc/testsuite/gcc.dg/strlenopt-9.c > +++ b/gcc/testsuite/gcc.dg/strlenopt-9.c > @@ -98,10 +98,10 @@ main () > return 0; > } > > -/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */ > +/* { dg-final { scan-tree-dump-times "strlen \\(" 5 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "memcpy \\(" 6 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ > -/* { dg-final { scan-tree-dump-times "strchr \\(" 3 "strlen" } } */ > +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ > /* { dg-final { scan-tree-dump-times "return 4;" 1 "optimized" } } */ >
On Wed, Apr 20, 2016 at 10:45 AM, Richard Biener <richard.guenther@gmail.com> wrote: > On Tue, Apr 19, 2016 at 6:32 PM, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote: >> Richard Biener wrote: >>> >>> This folding should be added to gimple-fold.c:gimple_fold_builtin instead, >>> the builtins.c foldings are purerly for folding to constants nowadays. >> >> So is this better? It's a lot more verbose for something so simple... >> Unfortunately match.pd doesn't support this kind of thing either. > > Better - comments below. Jakub objections to the usefulness of the transform > remain - we do have the strlen pass that uses some global knowledge to decide > on profitability. One could argue that for -Os doing the reverse transform is > profitable? > > Yes, match.pd doesn't support this (yet). It may be possible to teach it simple > cases like this - I plan to revisit this at some point. The issue is one of > side-effects which are tricky to handle if you consider patterns that do not > only match a single call (as this one would). So a baby-step towards getting > this supported in match.pd is to handle matching toplevel calls specially. Ok, just gave it a stab in a different way - see attached. This makes (simplify (BUILT_IN_STRCHR @0 integer_zerop) (pointer_plus @0 (BUILT_IN_STRLEN:size_type_node @0))) work. Note how the SSA following predicate needs to be aware of side-effects to guard against bogus applies of complicated patterns (none of those exist yet). Richard. > Richard. > >> Wilco >> >> >> ChangeLog: >> 2016-04-19 Wilco Dijkstra <wdijkstr@arm.com> >> >> gcc/ >> * gcc/gimple-fold.c (gimple_fold_builtin_strchr): >> New function to optimize strchr (s, 0) to strlen. >> (gimple_fold_builtin): Add BUILT_IN_STRCHR case. >> >> testsuite/ >> * gcc/testsuite/gcc.dg/strlenopt-20.c: Update test. >> * gcc/testsuite/gcc.dg/strlenopt-21.c: Likewise. >> * gcc/testsuite/gcc.dg/strlenopt-22.c: Likewise. >> * gcc/testsuite/gcc.dg/strlenopt-26.c: Likewise. >> * gcc/testsuite/gcc.dg/strlenopt-5.c: Likewise. >> * gcc/testsuite/gcc.dg/strlenopt-7.c: Likewise. >> * gcc/testsuite/gcc.dg/strlenopt-9.c: Likewise. >> >> -- >> >> diff --git a/gcc/gimple-fold.c b/gcc/gimple-fold.c >> index eb130d048469f0b8196e565fed9a40de74b098bd..11dcf69fc919f066362f4f713db392d14b39764e 100644 >> --- a/gcc/gimple-fold.c >> +++ b/gcc/gimple-fold.c >> @@ -1380,6 +1380,59 @@ gimple_fold_builtin_strncpy (gimple_stmt_iterator *gsi, >> return true; >> } >> >> +/* Simplify strchr (str, 0) into str + strlen (str). >> + In general strlen is significantly faster than strchr >> + due to being a simpler operation. */ >> +static bool >> +gimple_fold_builtin_strchr (gimple_stmt_iterator *gsi) >> +{ >> + gimple *stmt = gsi_stmt (*gsi); >> + tree str = gimple_call_arg (stmt, 0); >> + tree c = gimple_call_arg (stmt, 1); >> + location_t loc = gimple_location (stmt); >> + >> + if (optimize_function_for_size_p (cfun)) >> + return false; > > Hmm, I think we'd want a optimize_stmt_for_size_p (stmt) which > does the right thing for the case we have a CFG (look at the BB) > or when not (look at the function). > >> + if (!integer_zerop (c) || !gimple_call_lhs (stmt)) >> + return false; >> + >> + tree newstr; >> + tree strlen_fn = builtin_decl_implicit (BUILT_IN_STRLEN); >> + >> + if (!strlen_fn) >> + return false; >> + >> + /* Create newstr = strlen (str). */ >> + gimple_seq stmts = NULL, stmts2; >> + gimple *repl = gimple_build_call (strlen_fn, 1, str); >> + gimple_set_location (repl, loc); >> + if (gimple_in_ssa_p (cfun)) >> + newstr = make_ssa_name (size_type_node); >> + else >> + newstr = create_tmp_reg (size_type_node); >> + gimple_call_set_lhs (repl, newstr); >> + gimple_seq_add_stmt_without_update (&stmts, repl); >> + >> + /* Create (str p+ strlen (str)). */ >> + newstr = fold_build_pointer_plus_loc (loc, str, newstr); >> + newstr = force_gimple_operand (newstr, &stmts2, true, NULL_TREE); > > I think you want to build a gimple_assign directly here, otherwise ... > >> + gimple_seq_add_seq_without_update (&stmts, stmts2); >> + >> + repl = gimple_build_assign (gimple_call_lhs (stmt), newstr); >> + gimple_seq_add_stmt_without_update (&stmts, repl); >> + gsi_replace_with_seq_vops (gsi, stmts); >> + /* gsi now points at the assignment to the lhs, get a >> + stmt iterator to the strlen. >> + ??? We can't use gsi_for_stmt as that doesn't work when the >> + CFG isn't built yet. */ >> + gimple_stmt_iterator gsi2 = *gsi; >> + gsi_prev (&gsi2); >> + gsi_prev (&gsi2); > > ... this may not reliably end up at the call stmt. > >> + fold_stmt (&gsi2); >> + return true; >> +} >> + >> /* Simplify a call to the strcat builtin. DST and SRC are the arguments >> to the call. >> >> @@ -2821,6 +2874,8 @@ gimple_fold_builtin (gimple_stmt_iterator *gsi) >> gimple_call_arg (stmt, 1)); >> case BUILT_IN_STRNCAT: >> return gimple_fold_builtin_strncat (gsi); >> + case BUILT_IN_STRCHR: >> + return gimple_fold_builtin_strchr (gsi); >> case BUILT_IN_FPUTS: >> return gimple_fold_builtin_fputs (gsi, gimple_call_arg (stmt, 0), >> gimple_call_arg (stmt, 1), false); >> diff --git a/gcc/testsuite/gcc.dg/strlenopt-20.c b/gcc/testsuite/gcc.dg/strlenopt-20.c >> index a83e845c26d88e5acdcabf142f7b319136663488..7b483eaeac1aa47278111a92148a16f00b2aaa2d 100644 >> --- a/gcc/testsuite/gcc.dg/strlenopt-20.c >> +++ b/gcc/testsuite/gcc.dg/strlenopt-20.c >> @@ -86,9 +86,9 @@ main () >> return 0; >> } >> >> -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */ >> +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "memcpy \\(" 4 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ >> -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */ >> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ >> diff --git a/gcc/testsuite/gcc.dg/strlenopt-21.c b/gcc/testsuite/gcc.dg/strlenopt-21.c >> index e22fa9fca9ba14354db2cd5f602283b64bd8dcac..05b85a49dde0a7f5d269174fd4269e40be910dbd 100644 >> --- a/gcc/testsuite/gcc.dg/strlenopt-21.c >> +++ b/gcc/testsuite/gcc.dg/strlenopt-21.c >> @@ -57,9 +57,9 @@ main () >> return 0; >> } >> >> -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */ >> +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "memcpy \\(" 3 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ >> -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */ >> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ >> diff --git a/gcc/testsuite/gcc.dg/strlenopt-22.c b/gcc/testsuite/gcc.dg/strlenopt-22.c >> index aa55f5ebd6a2d4803ee9a7fd60fc538d86f47124..b4ef772f0e59252f10a5419ede6837b3c8ca8265 100644 >> --- a/gcc/testsuite/gcc.dg/strlenopt-22.c >> +++ b/gcc/testsuite/gcc.dg/strlenopt-22.c >> @@ -31,9 +31,9 @@ main () >> return 0; >> } >> >> -/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */ >> +/* { dg-final { scan-tree-dump-times "strlen \\(" 4 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "memcpy \\(" 1 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ >> -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */ >> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ >> diff --git a/gcc/testsuite/gcc.dg/strlenopt-26.c b/gcc/testsuite/gcc.dg/strlenopt-26.c >> index 4bd54bef540806ca90d2b9bcfc33eb531991c967..da2f465a5b5003fa5dca05f3a6ee00e97b98b5dd 100644 >> --- a/gcc/testsuite/gcc.dg/strlenopt-26.c >> +++ b/gcc/testsuite/gcc.dg/strlenopt-26.c >> @@ -21,4 +21,5 @@ main (void) >> return 0; >> } >> >> -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */ >> +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */ >> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ >> diff --git a/gcc/testsuite/gcc.dg/strlenopt-5.c b/gcc/testsuite/gcc.dg/strlenopt-5.c >> index 1b006a93045599a75bdb10a39e86ffa59b475c83..a24aea44e8b00ff7b35a907aaa941b4c509642c4 100644 >> --- a/gcc/testsuite/gcc.dg/strlenopt-5.c >> +++ b/gcc/testsuite/gcc.dg/strlenopt-5.c >> @@ -48,9 +48,9 @@ main () >> return 0; >> } >> >> -/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */ >> +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ >> -/* { dg-final { scan-tree-dump-times "strchr \\(" 2 "strlen" } } */ >> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ >> diff --git a/gcc/testsuite/gcc.dg/strlenopt-7.c b/gcc/testsuite/gcc.dg/strlenopt-7.c >> index 3ae1e2cb3f0d08c8b05107c4e65b67bdb39cf7ab..aa53d7e75254dfe56c93172afc49f95e5b7901e6 100644 >> --- a/gcc/testsuite/gcc.dg/strlenopt-7.c >> +++ b/gcc/testsuite/gcc.dg/strlenopt-7.c >> @@ -40,11 +40,11 @@ main () >> return 0; >> } >> >> -/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */ >> +/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ >> -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */ >> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "\\*r_\[0-9\]* = 0;" 1 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "return 3;" 1 "optimized" } } */ >> diff --git a/gcc/testsuite/gcc.dg/strlenopt-9.c b/gcc/testsuite/gcc.dg/strlenopt-9.c >> index b0406b162d48fca375883035043b0c50b9db61a1..e5e276210ba4b7d75867605f1ecf5c06eb970ef5 100644 >> --- a/gcc/testsuite/gcc.dg/strlenopt-9.c >> +++ b/gcc/testsuite/gcc.dg/strlenopt-9.c >> @@ -98,10 +98,10 @@ main () >> return 0; >> } >> >> -/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */ >> +/* { dg-final { scan-tree-dump-times "strlen \\(" 5 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "memcpy \\(" 6 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ >> -/* { dg-final { scan-tree-dump-times "strchr \\(" 3 "strlen" } } */ >> +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ >> /* { dg-final { scan-tree-dump-times "return 4;" 1 "optimized" } } */ >>
On Wed, Apr 20, 2016 at 11:44:08AM +0200, Richard Biener wrote: > (simplify > (BUILT_IN_STRCHR @0 integer_zerop) > (pointer_plus @0 (BUILT_IN_STRLEN:size_type_node @0))) I still don't like this transformation and would very much prefer to see using rawmemchr instead on targets that provide it, and also this is something that IMHO should be done in the tree-ssa-strlen.c pass together with the other optimizations in there. Similarly to stpcpy, which is also non-standard (in POSIX, but not in C), we should just look at headers if rawmemchr is defined with compatible prototype. Also, strrchr (s, 0) should be folded to strchr (s, 0) or handled the same like that one. And, while x = strchr (s, 0) to x = rawmemchr (s, 0) is a reasonable -Os transformation, x = s + strlen (s) is not, it makes code usually larger (especially because it increases register pressure across the call). Jakub
On Wed, Apr 20, 2016 at 12:33 PM, Jakub Jelinek <jakub@redhat.com> wrote: > On Wed, Apr 20, 2016 at 11:44:08AM +0200, Richard Biener wrote: >> (simplify >> (BUILT_IN_STRCHR @0 integer_zerop) >> (pointer_plus @0 (BUILT_IN_STRLEN:size_type_node @0))) > > I still don't like this transformation and would very much prefer to see > using rawmemchr instead on targets that provide it, and also this is > something that IMHO should be done in the tree-ssa-strlen.c pass together > with the other optimizations in there. Similarly to stpcpy, which is also > non-standard (in POSIX, but not in C), we should just look at headers if > rawmemchr is defined with compatible prototype. > Also, strrchr (s, 0) should be folded to strchr (s, 0) or handled the same > like that one. > And, while x = strchr (s, 0) to x = rawmemchr (s, 0) is a reasonable -Os > transformation, x = s + strlen (s) is not, it makes code usually larger > (especially because it increases register pressure across the call). Sure - agreed. So with the patch (simplify (BUILT_IN_STRRCHR @0 integer_zerop@1) (BUILT_IN_STRCHR @0 @1)) is possible at least ;) Richard. > Jakub
Jakub Jelinek wrote: > I still don't like this transformation and would very much prefer to see > using rawmemchr instead on targets that provide it, and also this is > something that IMHO should be done in the tree-ssa-strlen.c pass together > with the other optimizations in there. Similarly to stpcpy, which is also > non-standard (in POSIX, but not in C), we should just look at headers if > rawmemchr is defined with compatible prototype. Can you quantify "don't like"? I benchmarked rawmemchr on a few targets and it's slower than strlen, so it's hard to guess what you don't like about it. Several targets don't even have an assembly implementation of rawmemchr, so looking at the header would not be sufficient to determine rawmemchr is fast, let alone as fast as strlen. The tree-ssa-strlen pass seems to optimize repeated calls to strlen, or strcpy after a strlen, so I'm not sure how this is related - this is a local transformation like the foldings in builtin.c/gimple-fold.c. > Also, strrchr (s, 0) should be folded to strchr (s, 0) or handled the same > like that one. GCC converts strrchr (s, 0) to strchr (s, 0) which then gets optimized. I checked this happens as expected with both versions of my patch. > And, while x = strchr (s, 0) to x = rawmemchr (s, 0) is a reasonable -Os > transformation, x = s + strlen (s) is not, it makes code usually larger > (especially because it increases register pressure across the call). Indeed, that's why my transformation is disabled with -Os. Wilco
Richard Biener wrote: > Better - comments below. Jakub objections to the usefulness of the transform > remain - we do have the strlen pass that uses some global knowledge to decide > on profitability. One could argue that for -Os doing the reverse transform is > profitable? In what way would it get more info to decide on profitability? The transform is profitable unless you messed up your strlen implementation badly. For -Os one could do the reverse, but I don't think it is going to give a substantial codesize gain compared to other simple improvements, so unlikely worth it. >> + if (optimize_function_for_size_p (cfun)) >> + return false; > > Hmm, I think we'd want a optimize_stmt_for_size_p (stmt) which > does the right thing for the case we have a CFG (look at the BB) > or when not (look at the function). Does that use the often incorrect BB probabilities? I used the function variant on purpose to avoid it making the wrong decision. A typical example I see is that GCC inlines a return sequence into an if marked with __builtin_expect (c, 0) but not in the hot code that follows... > I think you want to build a gimple_assign directly here, otherwise ... >... this may not reliably end up at the call stmt. OK, I revisit that once we've agreed how to proceed with this patch - we now have 3 variants... Wilco
On Wed, Apr 20, 2016 at 1:56 PM, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote: > Richard Biener wrote: >> Better - comments below. Jakub objections to the usefulness of the transform >> remain - we do have the strlen pass that uses some global knowledge to decide >> on profitability. One could argue that for -Os doing the reverse transform is >> profitable? > > In what way would it get more info to decide on profitability? The transform is > profitable unless you messed up your strlen implementation badly. > > For -Os one could do the reverse, but I don't think it is going to give a substantial > codesize gain compared to other simple improvements, so unlikely worth it. > >>> + if (optimize_function_for_size_p (cfun)) >>> + return false; >> >> Hmm, I think we'd want a optimize_stmt_for_size_p (stmt) which >> does the right thing for the case we have a CFG (look at the BB) >> or when not (look at the function). > > Does that use the often incorrect BB probabilities? I used the function variant on > purpose to avoid it making the wrong decision. A typical example I see is that GCC > inlines a return sequence into an if marked with __builtin_expect (c, 0) but not in the > hot code that follows... > >> I think you want to build a gimple_assign directly here, otherwise ... > >>... this may not reliably end up at the call stmt. > > OK, I revisit that once we've agreed how to proceed with this patch - we now have > 3 variants... Yeah ;) I'm currently bootstrapping/testing the patch that makes it possible to write all this in match.pd. Richard. > Wilco >
On Wed, Apr 20, 2016 at 11:17:06AM +0000, Wilco Dijkstra wrote: > Can you quantify "don't like"? I benchmarked rawmemchr on a few targets > and it's slower than strlen, so it's hard to guess what you don't like about it. This is the same stuff as has been discussed for mempcpy, rawmemchr is the API meant to use for getting pointer to the terminating '\0', if there are deficiencies on the library side, they should be fixed. If you hardcode in GCC emitting worse sequence at the caller (which s + strlen (s) is), then even once the library deficiency is fixed, you still don't get benefit from it. I wonder how you work around the define strchr(s, c) \ (__extension__ (__builtin_constant_p (c) && !__builtin_constant_p (s) \ && (c) == '\0' \ ? (char *) __rawmemchr (s, c) \ : __builtin_strchr (s, c))) in glibc headers anyway. Another thing is for the cases where strlen is desirable to be expanded inline, in that case rawmemchr (x, 0) or strchr (x, 0) is likely useful to be expanded inline as well and then this decision should be done at expansion time. Jakub
Jakub Jelinek wrote: > On Wed, Apr 20, 2016 at 11:17:06AM +0000, Wilco Dijkstra wrote: >> Can you quantify "don't like"? I benchmarked rawmemchr on a few targets >> and it's slower than strlen, so it's hard to guess what you don't like about it. > > This is the same stuff as has been discussed for mempcpy, rawmemchr is the > API meant to use for getting pointer to the terminating '\0', if there are > deficiencies on the library side, they should be fixed. About mempcpy, GLIBC nowadays expands it into memcpy (p, q, n) + n by default in string.h. Generally after a lot of discussion on this last year, the consensus is that these functions don't provide a useful gain and are often detrimental to performance even if optimized assembly implementations happen to be available due to I-cache pressure. Emitting rawmemchr/mempcpy/stpcpy automatically as a result of optimization is a bad idea for most targets given libraries often have inefficient default implementations. I fixed the GLIBC mempcpy and stpcpy C implementations to use memcpy and strlen so at least for these performance is no longer absolutely terrible. Saying that all C libraries should be forced to provide highly optimized assembler versions for these functions is onerous since they are not frequently used in code (a quick grep of SPEC resulted in one use of mempcpy, 0 uses of rawmemchr, strchrnul and stpcpy). > If you hardcode in > GCC emitting worse sequence at the caller (which s + strlen (s) is), then > even once the library deficiency is fixed, you still don't get benefit from > it. What benefit exactly? Rawmemchr cannot ever beat strlen. There is a trick that can make a good strlen faster than rawmemchr, but even ignoring that, an integer based rawmemchr needs to do extra operations in its inner loop. A SIMD version could use similar inner loops although rawmemchr still has a higher cost. You could special case searching for '\0' and jump to strlen (I have patches for that), but that also adds cost... >I wonder how you work around the > define strchr(s, c) \ > .. > in glibc headers anyway. That should either be removed or changed to use strlen (I have patches for both options out for review). > Another thing is for the cases where strlen is desirable to be expanded > inline, in that case rawmemchr (x, 0) or strchr (x, 0) is likely useful to be > expanded inline as well and then this decision should be done at expansion > time. I'm not sure I'm following you here - that's an argument to expand into strlen early as strlen is better optimized in GCC... Wilco
Richard Biener wrote: > > Yeah ;) I'm currently bootstrapping/testing the patch that makes it possible to > write all this in match.pd. So did that pass bootstrap? It would be good to decide how to proceed with this. Wilco
Richard Biener wrote: > > Yeah ;) I'm currently bootstrapping/testing the patch that makes it possible to > write all this in match.pd. So what was the conclusion? Improving match.pd to be able to handle more cases like this seems like a nice thing. Wilco
On Wed, May 18, 2016 at 2:29 PM, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote: > Richard Biener wrote: >> >> Yeah ;) I'm currently bootstrapping/testing the patch that makes it possible to >> write all this in match.pd. > > So what was the conclusion? Improving match.pd to be able to handle more cases > like this seems like a nice thing. I'm stuck with fallout and making this work requires some serious thought. Don't hold your breath here :/ The restricted case of strchr (a, 0) -> strlen () can be made working more easily but I didn't yet try to implement a restriction only allowing the cases that would work. Meanwhile the strlenopt pass would be an appropriate place to handle this transform (well, if we now agree on its usefulness). Richard. > > Wilco >
Richard Biener <richard.guenther@gmail.com> wrote: > On Wed, May 18, 2016 at 2:29 PM, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote: >> Richard Biener wrote: >> >>> Yeah ;) I'm currently bootstrapping/testing the patch that makes it possible to >>> write all this in match.pd. >> >> So what was the conclusion? Improving match.pd to be able to handle more cases >> like this seems like a nice thing. > > I'm stuck with fallout and making this work requires some serious > thought. Don't > hold your breath here :/ > > The restricted case of strchr (a, 0) -> strlen () can be made working > more easily > but I didn't yet try to implement a restriction only allowing the > cases that would work. > > Meanwhile the strlenopt pass would be an appropriate place to handle > this transform > (well, if we now agree on its usefulness). I'd like to pick this up again so we can make GCC7 a bit less inefficient for this case. (original thread: https://gcc.gnu.org/ml/gcc-patches/2016-04/msg00870.html) We've seen several different proposals for where/how to do this simplification, why did you say strlenopt is best? It would be an unconditional strchr (a, 0) -> a + strlen (a) rewrite, ie. completely unrelated to what strlenopt does. We do all the other simplifications based on constant arguments in builtins.c and gimple-fold.c, why is strchr (s, 0) different? Wilco
On Tue, 2016-09-13 at 12:36 +0000, Wilco Dijkstra wrote: > Richard Biener <richard.guenther@gmail.com> wrote: > > > > On Wed, May 18, 2016 at 2:29 PM, Wilco Dijkstra <Wilco.Dijkstra@arm > > .com> wrote: > > > > > > Richard Biener wrote: > > > > > > > > > > > Yeah ;) I'm currently bootstrapping/testing the patch that > > > > makes it possible to > > > > write all this in match.pd. > > > So what was the conclusion? Improving match.pd to be able to > > > handle more cases > > > like this seems like a nice thing. > > I'm stuck with fallout and making this work requires some serious > > thought. Don't > > hold your breath here :/ > > > > The restricted case of strchr (a, 0) -> strlen () can be made > > working > > more easily > > but I didn't yet try to implement a restriction only allowing the > > cases that would work. > > > > Meanwhile the strlenopt pass would be an appropriate place to > > handle > > this transform > > (well, if we now agree on its usefulness). > I'd like to pick this up again so we can make GCC7 a bit less > inefficient for this case. > (original thread: https://gcc.gnu.org/ml/gcc-patches/2016-04/msg00870 > .html) > > We've seen several different proposals for where/how to do this > simplification, why did you > say strlenopt is best? It would be an unconditional strchr (a, 0) -> > a + strlen (a) rewrite, > ie. completely unrelated to what strlenopt does. We do all the other > simplifications based > on constant arguments in builtins.c and gimple-fold.c, why is strchr > (s, 0) different? > BTW, there are two PRs for this: 61056 and 32650. Please take them into account when committing something for this issue. Cheers, Oleg
On Tue, Sep 13, 2016 at 2:36 PM, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote: > Richard Biener <richard.guenther@gmail.com> wrote: >> On Wed, May 18, 2016 at 2:29 PM, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote: >>> Richard Biener wrote: >>> >>>> Yeah ;) I'm currently bootstrapping/testing the patch that makes it possible to >>>> write all this in match.pd. >>> >>> So what was the conclusion? Improving match.pd to be able to handle more cases >>> like this seems like a nice thing. >> >> I'm stuck with fallout and making this work requires some serious >> thought. Don't >> hold your breath here :/ >> >> The restricted case of strchr (a, 0) -> strlen () can be made working >> more easily >> but I didn't yet try to implement a restriction only allowing the >> cases that would work. >> >> Meanwhile the strlenopt pass would be an appropriate place to handle >> this transform >> (well, if we now agree on its usefulness). > > I'd like to pick this up again so we can make GCC7 a bit less inefficient for this case. > (original thread: https://gcc.gnu.org/ml/gcc-patches/2016-04/msg00870.html) > > We've seen several different proposals for where/how to do this simplification, why did you > say strlenopt is best? It would be an unconditional strchr (a, 0) -> a + strlen (a) rewrite, > ie. completely unrelated to what strlenopt does. We do all the other simplifications based > on constant arguments in builtins.c and gimple-fold.c, why is strchr (s, 0) different? I was thinking about the case where strlen opt already knows strlen (a). But sure, gimple-fold.c works as well. Richard. > Wilco > >
On Wed, Sep 14, 2016 at 03:41:33PM +0200, Richard Biener wrote: > > We've seen several different proposals for where/how to do this simplification, why did you > > say strlenopt is best? It would be an unconditional strchr (a, 0) -> a + strlen (a) rewrite, > > ie. completely unrelated to what strlenopt does. We do all the other simplifications based > > on constant arguments in builtins.c and gimple-fold.c, why is strchr (s, 0) different? > > I was thinking about the case where strlen opt already knows strlen > (a). But sure, gimple-fold.c > works as well. I think for the middle-end, using strchr (a, 0) as canonical instead of a + strlen (a) is better, and at expansion time we can decide what to use (a + strlen (a) if you'd expand strlen inline, rather than as a function call, or __rawmemchr (which if libc is sane should be fastest), or strchr, or a + strlen (a)). Jakub
On 14/09/16 14:45, Jakub Jelinek wrote: > I think for the middle-end, using strchr (a, 0) as canonical instead of a + strlen (a) > is better, and at expansion time we can decide what to use (a + strlen (a) > if you'd expand strlen inline, rather than as a function call, or > __rawmemchr (which if libc is sane should be fastest), or strchr, or a + strlen (a)). > i think the compiler should move away from generating calls to nonstandard functions, __rawmemchr is much less used than strlen so it's less likely to be optimized in certain target/libc combination.
On Wed, Sep 14, 2016 at 3:45 PM, Jakub Jelinek <jakub@redhat.com> wrote: > On Wed, Sep 14, 2016 at 03:41:33PM +0200, Richard Biener wrote: >> > We've seen several different proposals for where/how to do this simplification, why did you >> > say strlenopt is best? It would be an unconditional strchr (a, 0) -> a + strlen (a) rewrite, >> > ie. completely unrelated to what strlenopt does. We do all the other simplifications based >> > on constant arguments in builtins.c and gimple-fold.c, why is strchr (s, 0) different? >> >> I was thinking about the case where strlen opt already knows strlen >> (a). But sure, gimple-fold.c >> works as well. > > I think for the middle-end, using strchr (a, 0) as canonical instead of a + strlen (a) > is better, and at expansion time we can decide what to use (a + strlen (a) > if you'd expand strlen inline, rather than as a function call, or > __rawmemchr (which if libc is sane should be fastest), or strchr, or a + strlen (a)). OTOH that then argues for doing it in strlenopt because that knows whether we maybe already computed strlen (a) (which might have other uses than adding to a). Richard. > Jakub
On Thu, Sep 15, 2016 at 09:52:34AM +0200, Richard Biener wrote: > On Wed, Sep 14, 2016 at 3:45 PM, Jakub Jelinek <jakub@redhat.com> wrote: > > On Wed, Sep 14, 2016 at 03:41:33PM +0200, Richard Biener wrote: > >> > We've seen several different proposals for where/how to do this simplification, why did you > >> > say strlenopt is best? It would be an unconditional strchr (a, 0) -> a + strlen (a) rewrite, > >> > ie. completely unrelated to what strlenopt does. We do all the other simplifications based > >> > on constant arguments in builtins.c and gimple-fold.c, why is strchr (s, 0) different? > >> > >> I was thinking about the case where strlen opt already knows strlen > >> (a). But sure, gimple-fold.c > >> works as well. > > > > I think for the middle-end, using strchr (a, 0) as canonical instead of a + strlen (a) > > is better, and at expansion time we can decide what to use (a + strlen (a) > > if you'd expand strlen inline, rather than as a function call, or > > __rawmemchr (which if libc is sane should be fastest), or strchr, or a + strlen (a)). > > OTOH that then argues for doing it in strlenopt because that knows > whether we maybe > already computed strlen (a) (which might have other uses than adding to a). Sure. Jakub
Richard Biener wrote: > On Wed, Sep 14, 2016 at 3:45 PM, Jakub Jelinek <jakub@redhat.com> wrote: > > On Wed, Sep 14, 2016 at 03:41:33PM +0200, Richard Biener wrote: > >> > We've seen several different proposals for where/how to do this simplification, why did you > >> > say strlenopt is best? It would be an unconditional strchr (a, 0) -> a + strlen (a) rewrite, > >> > ie. completely unrelated to what strlenopt does. We do all the other simplifications based > >> > on constant arguments in builtins.c and gimple-fold.c, why is strchr (s, 0) different? >>> > >> I was thinking about the case where strlen opt already knows strlen > >> (a). But sure, gimple-fold.c > >> works as well. > > > > I think for the middle-end, using strchr (a, 0) as canonical instead of a + strlen (a) > > is better, and at expansion time we can decide what to use (a + strlen (a) > > if you'd expand strlen inline, rather than as a function call, or > > __rawmemchr (which if libc is sane should be fastest), or strchr, or a + strlen (a)). __rawmemchr is not the fastest on any target I tried, including x86, so GLIBC's current default behaviour of always using __rawmemchr is inefficient. GCC doesn't support __rawmemchr at all, so the strlen optimization can't optimize it. strchr is significantly slower than strlen, so generating that is a bad idea too. So the only reasonable optimization is to always emit a + strlen (a). > OTOH that then argues for doing it in strlenopt because that knows > whether we maybe > already computed strlen (a) (which might have other uses than adding to a). strlenopt can already change strchr (a, 0) into strlen (a) + a when strlen has been called before it, so that part is already done. However it doesn't optimize a strlen after a strchr, so if the expansion is done late you'd end up with a redundant strlen. That means the expansion would need to happen either before or very early in strlenopt (likely an extra pass at init time to avoid upsetting the strlen optimization - we want to treat any strchr as a real strlen). Wilco
On Thu, Sep 15, 2016 at 01:38:45PM +0000, Wilco Dijkstra wrote: > __rawmemchr is not the fastest on any target I tried, including x86, so GLIBC's > current default behaviour of always using __rawmemchr is inefficient. GCC doesn't > support __rawmemchr at all, so the strlen optimization can't optimize it. Why? It knows you are targeting glibc, and if it sees rawmemchr (or __rawmemchr) in the headers as well, it can emit that. As for speed, there is no inherent reason why rawmemchr should be slower than strlen, on the contrary. So, if on some target rawmemchr is slower than strlen, most likely it has never been implemented there properly, or somebody improved strlen without bothering to improve rawmemchr at the same time. > strchr is significantly slower than strlen, so generating that is a bad idea too. > > So the only reasonable optimization is to always emit a + strlen (a). I disagree. Another, on glibc more reasonable, optimization is to make sure rawmemchr is fast enough. Jakub
On 09/15/2016 03:38 PM, Wilco Dijkstra wrote: > __rawmemchr is not the fastest on any target I tried, including x86, Interesting. Care to share your test program? I just looked at the libc sources and strlen/rawmemchr are practically identical code so I'd expect any difference to be lost in the noise. Of course there might be inlines interfering with the comparison. > So the only reasonable optimization is to always emit a + strlen (a). Not sure about "only reasonable" but on the whole I'd agree that it's reasonable and we shouldn't let the perfect be the enemy of the good here. I'm sure we can come up with lots of different ways to do this but let's just pick one and if the one Wilco submitted looks decent let's just put it in. Out of curiousity, is there real-world code that this is intended to optimize? Bernd
On 15/09/16 14:49, Jakub Jelinek wrote: > On Thu, Sep 15, 2016 at 01:38:45PM +0000, Wilco Dijkstra wrote: >> __rawmemchr is not the fastest on any target I tried, including x86, so GLIBC's >> current default behaviour of always using __rawmemchr is inefficient. GCC doesn't >> support __rawmemchr at all, so the strlen optimization can't optimize it. > > Why? It knows you are targeting glibc, and if it sees rawmemchr (or > __rawmemchr) in the headers as well, it can emit that. > As for speed, there is no inherent reason why rawmemchr should be slower > than strlen, on the contrary. So, if on some target rawmemchr is slower > than strlen, most likely it has never been implemented there properly, or > somebody improved strlen without bothering to improve rawmemchr at the same > time. > from libc point of view, rawmemchr is a rarely used nonstandard function that should be optimized for size. (glibc does not do this now, but it should in my opinion.) in case of static linking strlen is most likely linked into your binary already, but rawmemchr is not, so emitting calls to it increases code size. >> strchr is significantly slower than strlen, so generating that is a bad idea too. >> >> So the only reasonable optimization is to always emit a + strlen (a). > > I disagree. Another, on glibc more reasonable, optimization is to make sure > rawmemchr is fast enough. > libc should not imlpement n variants of similar functions, it is a maintainance problem and it makes the code bloated. (glibc even has asm implementations, which history tells is a bad idea: string functions had bugs, performance regressions and other problems because the asm is not future proof and it is hard to guarantee consistent behaviour across targets e.g. memchr on x86_64 is broken right now BZ 19387)
On Thu, Sep 15, 2016 at 03:16:52PM +0100, Szabolcs Nagy wrote: > On 15/09/16 14:49, Jakub Jelinek wrote: > > On Thu, Sep 15, 2016 at 01:38:45PM +0000, Wilco Dijkstra wrote: > >> __rawmemchr is not the fastest on any target I tried, including x86, so GLIBC's > >> current default behaviour of always using __rawmemchr is inefficient. GCC doesn't > >> support __rawmemchr at all, so the strlen optimization can't optimize it. > > > > Why? It knows you are targeting glibc, and if it sees rawmemchr (or > > __rawmemchr) in the headers as well, it can emit that. > > As for speed, there is no inherent reason why rawmemchr should be slower > > than strlen, on the contrary. So, if on some target rawmemchr is slower > > than strlen, most likely it has never been implemented there properly, or > > somebody improved strlen without bothering to improve rawmemchr at the same > > time. > > > > from libc point of view, rawmemchr is a rarely used > nonstandard function that should be optimized for size. > (glibc does not do this now, but it should in my opinion.) rawmemchr with 0 is to strlen conceptually like stpcpy is to strcpy. Are you arguing that glibc should implement strcpy using stpcpy, or vice versa? rawmemchr is certainly not rarely used, strchr (p, 0) is optimized to __rawmemchr by the glibc header macros, so it is very frequently used. tree-ssa-strlen.c should be tought to handle these. I'm not asking musl to implement rawmemchr, but glibc already has it and should provide an efficient implementation of it (if it doesn't, I really haven't seen any benchmark results that would say it isn't). > libc should not imlpement n variants of similar functions, > it is a maintainance problem and it makes the code bloated. > > (glibc even has asm implementations, which history tells > is a bad idea: string functions had bugs, performance > regressions and other problems because the asm is not > future proof and it is hard to guarantee consistent > behaviour across targets e.g. memchr on x86_64 is broken > right now BZ 19387) I disagree with that. For some routines like string ops, writing them in assembly is highly desirable. Jakub
Jakub Jelinek wrote: On Thu, Sep 15, 2016 at 03:16:52PM +0100, Szabolcs Nagy wrote: > > > > from libc point of view, rawmemchr is a rarely used > > nonstandard function that should be optimized for size. > > (glibc does not do this now, but it should in my opinion.) > > rawmemchr with 0 is to strlen conceptually like stpcpy is to strcpy. > Are you arguing that glibc should implement strcpy using stpcpy, or vice > versa? stpcpy is not conceptually the same, but for mempcpy, yes. By default it's converted into memcpy in the GLIBC headers and the generic implementation. stpcpy uses strlen and memcpy which is generally the most efficient version (it even beat several assembler implementations). > rawmemchr is certainly not rarely used, strchr (p, 0) is optimized to >__rawmemchr by the glibc header macros, so it is very frequently used. That's pretty much its only use, so not an argument for rawmemchr really but for optimizing strchr (p, 0) better. Wilco
On Thu, Sep 15, 2016 at 02:55:48PM +0000, Wilco Dijkstra wrote: > Jakub Jelinek wrote: > On Thu, Sep 15, 2016 at 03:16:52PM +0100, Szabolcs Nagy wrote: > > > > > > from libc point of view, rawmemchr is a rarely used > > > nonstandard function that should be optimized for size. > > > (glibc does not do this now, but it should in my opinion.) > > > > rawmemchr with 0 is to strlen conceptually like stpcpy is to strcpy. > > Are you arguing that glibc should implement strcpy using stpcpy, or vice > > versa? > > stpcpy is not conceptually the same, but for mempcpy, yes. By default > it's converted into memcpy in the GLIBC headers and the generic implementation. > > stpcpy uses strlen and memcpy which is generally the most efficient version > (it even beat several assembler implementations). ?? I certainly see something completely different, at least on the arches I've looked at. Jakub
From: Jakub Jelinek <jakub@redhat.com> > On Thu, Sep 15, 2016 at 02:55:48PM +0000, Wilco Dijkstra wrote: >> stpcpy is not conceptually the same, but for mempcpy, yes. By default >> it's converted into memcpy in the GLIBC headers and the generic implementation. >> >> stpcpy uses strlen and memcpy which is generally the most efficient version >> (it even beat several assembler implementations). > > ?? I certainly see something completely different, at least on the arches > I've looked at. glibc/string/string.h contains: #if defined __USE_GNU && defined __OPTIMIZE__ \ && defined __extern_always_inline && __GNUC_PREREQ (3,2) # if !defined _FORCE_INLINES && !defined _HAVE_STRING_ARCH_mempcpy #define mempcpy(dest, src, n) __mempcpy_inline (dest, src, n) #define __mempcpy(dest, src, n) __mempcpy_inline (dest, src, n) __extern_always_inline void * __mempcpy_inline (void *__restrict __dest, const void *__restrict __src, size_t __n) { return (char *) memcpy (__dest, __src, __n) + __n; } That's best done GCC as a general optimization as currently mempcpy is not handled efficiently (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70140), and it avoids having to repeat this for every C library out there... glibc/string/mempcpy.c: void * MEMPCPY (void *dest, const void *src, size_t len) { return memcpy (dest, src, len) + len; } And glibc/string/stpcpy.c: char * STPCPY (char *dest, const char *src) { size_t len = strlen (src); return memcpy (dest, src, len + 1) + len; } This means that without having to write any assembly code, by default mempcpy, stpcpy etc are as efficient as possible (memcpy and strlen are optimized well on all targets, that's not true for mempcpy, stpcpy and similar functions, and to make matters worse, the generic code used to be very inefficient). Wilco
On Thu, Sep 15, 2016 at 03:27:52PM +0000, Wilco Dijkstra wrote: > That's best done GCC as a general optimization as currently mempcpy is not > handled efficiently (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70140), > and it avoids having to repeat this for every C library out there... > > glibc/string/mempcpy.c: > > void * > MEMPCPY (void *dest, const void *src, size_t len) > { > return memcpy (dest, src, len) + len; > } > > And glibc/string/stpcpy.c: > > char * > STPCPY (char *dest, const char *src) > { > size_t len = strlen (src); > return memcpy (dest, src, len + 1) + len; > } Those are the generic definitions, all targets that care about performance obviously should replace them with assembly code. Jakub
Jakub Jelinek <jakub@redhat.com> > > Those are the generic definitions, all targets that care about performance > obviously should replace them with assembly code. No, that's exactly my point, it is not true that it is always best to write assembly code. For example there is absolutely no benefit in writing an optimized mempcpy. At best it is as fast as memcpy, and therefore expanding mempcpy into memcpy (p, q, n) + n would have the same performance. In actual use memcpy will then be slightly faster due to lower I-cache pressure. Wilco
Bernd Schmidt wrote: > On 09/15/2016 03:38 PM, Wilco Dijkstra wrote: > > __rawmemchr is not the fastest on any target I tried, including x86, > > Interesting. Care to share your test program? I just looked at the libc > sources and strlen/rawmemchr are practically identical code so I'd > expect any difference to be lost in the noise. Of course there might be > inlines interfering with the comparison. It's glibc/benchtests/bench-strlen.c slightly modified to compare strlen, rawmemchr and strchr. Even if they appear identical the inner loop of strlen is much faster than strchr and rawmemchr at larger sizes: strchr rawmemchr strlen Length 4096, alignment 12: 3.35132e+06 2.39842e+06 1.88962e+06 > > So the only reasonable optimization is to always emit a + strlen (a). > > Not sure about "only reasonable" but on the whole I'd agree that it's > reasonable and we shouldn't let the perfect be the enemy of the good > here. I'm sure we can come up with lots of different ways to do this but > let's just pick one and if the one Wilco submitted looks decent let's > just put it in. > > Out of curiousity, is there real-world code that this is intended to > optimize? I noticed rawmemchr taking non-trivial amounts of time in various profiles despite no use of rawmemchr in any of the source code. It's apparently a common idiom to use strchr (s, 0) to find the end of a string. Given strchr is slower than strlen, it is changed to rawmemchr by GLIBC headers. However this makes things even slower since few targets have an optimized rawmemchr, and for targets that do, strlen is faster. So this is one of many improvements to ensure GCC/GLIBC by default do optimizations in a way that is best for most targets. If a particular target wants to do something different that is always possible of course. Wilco
On 09/16/2016 07:28 AM, Wilco Dijkstra wrote: > > I noticed rawmemchr taking non-trivial amounts of time in various profiles > despite no use of rawmemchr in any of the source code. It's apparently a > common idiom to use strchr (s, 0) to find the end of a string. A bit of a surprise, but it is what it is I guess. jeff
diff --git a/gcc/gimple-fold.c b/gcc/gimple-fold.c index eb130d048469f0b8196e565fed9a40de74b098bd..11dcf69fc919f066362f4f713db392d14b39764e 100644 --- a/gcc/gimple-fold.c +++ b/gcc/gimple-fold.c @@ -1380,6 +1380,59 @@ gimple_fold_builtin_strncpy (gimple_stmt_iterator *gsi, return true; } +/* Simplify strchr (str, 0) into str + strlen (str). + In general strlen is significantly faster than strchr + due to being a simpler operation. */ +static bool +gimple_fold_builtin_strchr (gimple_stmt_iterator *gsi) +{ + gimple *stmt = gsi_stmt (*gsi); + tree str = gimple_call_arg (stmt, 0); + tree c = gimple_call_arg (stmt, 1); + location_t loc = gimple_location (stmt); + + if (optimize_function_for_size_p (cfun)) + return false; + + if (!integer_zerop (c) || !gimple_call_lhs (stmt)) + return false; + + tree newstr; + tree strlen_fn = builtin_decl_implicit (BUILT_IN_STRLEN); + + if (!strlen_fn) + return false; + + /* Create newstr = strlen (str). */ + gimple_seq stmts = NULL, stmts2; + gimple *repl = gimple_build_call (strlen_fn, 1, str); + gimple_set_location (repl, loc); + if (gimple_in_ssa_p (cfun)) + newstr = make_ssa_name (size_type_node); + else + newstr = create_tmp_reg (size_type_node); + gimple_call_set_lhs (repl, newstr); + gimple_seq_add_stmt_without_update (&stmts, repl); + + /* Create (str p+ strlen (str)). */ + newstr = fold_build_pointer_plus_loc (loc, str, newstr); + newstr = force_gimple_operand (newstr, &stmts2, true, NULL_TREE); + gimple_seq_add_seq_without_update (&stmts, stmts2); + + repl = gimple_build_assign (gimple_call_lhs (stmt), newstr); + gimple_seq_add_stmt_without_update (&stmts, repl); + gsi_replace_with_seq_vops (gsi, stmts); + /* gsi now points at the assignment to the lhs, get a + stmt iterator to the strlen. + ??? We can't use gsi_for_stmt as that doesn't work when the + CFG isn't built yet. */ + gimple_stmt_iterator gsi2 = *gsi; + gsi_prev (&gsi2); + gsi_prev (&gsi2); + fold_stmt (&gsi2); + return true; +} + /* Simplify a call to the strcat builtin. DST and SRC are the arguments to the call. @@ -2821,6 +2874,8 @@ gimple_fold_builtin (gimple_stmt_iterator *gsi) gimple_call_arg (stmt, 1)); case BUILT_IN_STRNCAT: return gimple_fold_builtin_strncat (gsi); + case BUILT_IN_STRCHR: + return gimple_fold_builtin_strchr (gsi); case BUILT_IN_FPUTS: return gimple_fold_builtin_fputs (gsi, gimple_call_arg (stmt, 0), gimple_call_arg (stmt, 1), false); diff --git a/gcc/testsuite/gcc.dg/strlenopt-20.c b/gcc/testsuite/gcc.dg/strlenopt-20.c index a83e845c26d88e5acdcabf142f7b319136663488..7b483eaeac1aa47278111a92148a16f00b2aaa2d 100644 --- a/gcc/testsuite/gcc.dg/strlenopt-20.c +++ b/gcc/testsuite/gcc.dg/strlenopt-20.c @@ -86,9 +86,9 @@ main () return 0; } -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */ /* { dg-final { scan-tree-dump-times "memcpy \\(" 4 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ diff --git a/gcc/testsuite/gcc.dg/strlenopt-21.c b/gcc/testsuite/gcc.dg/strlenopt-21.c index e22fa9fca9ba14354db2cd5f602283b64bd8dcac..05b85a49dde0a7f5d269174fd4269e40be910dbd 100644 --- a/gcc/testsuite/gcc.dg/strlenopt-21.c +++ b/gcc/testsuite/gcc.dg/strlenopt-21.c @@ -57,9 +57,9 @@ main () return 0; } -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */ /* { dg-final { scan-tree-dump-times "memcpy \\(" 3 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ diff --git a/gcc/testsuite/gcc.dg/strlenopt-22.c b/gcc/testsuite/gcc.dg/strlenopt-22.c index aa55f5ebd6a2d4803ee9a7fd60fc538d86f47124..b4ef772f0e59252f10a5419ede6837b3c8ca8265 100644 --- a/gcc/testsuite/gcc.dg/strlenopt-22.c +++ b/gcc/testsuite/gcc.dg/strlenopt-22.c @@ -31,9 +31,9 @@ main () return 0; } -/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strlen \\(" 4 "strlen" } } */ /* { dg-final { scan-tree-dump-times "memcpy \\(" 1 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ diff --git a/gcc/testsuite/gcc.dg/strlenopt-26.c b/gcc/testsuite/gcc.dg/strlenopt-26.c index 4bd54bef540806ca90d2b9bcfc33eb531991c967..da2f465a5b5003fa5dca05f3a6ee00e97b98b5dd 100644 --- a/gcc/testsuite/gcc.dg/strlenopt-26.c +++ b/gcc/testsuite/gcc.dg/strlenopt-26.c @@ -21,4 +21,5 @@ main (void) return 0; } -/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ diff --git a/gcc/testsuite/gcc.dg/strlenopt-5.c b/gcc/testsuite/gcc.dg/strlenopt-5.c index 1b006a93045599a75bdb10a39e86ffa59b475c83..a24aea44e8b00ff7b35a907aaa941b4c509642c4 100644 --- a/gcc/testsuite/gcc.dg/strlenopt-5.c +++ b/gcc/testsuite/gcc.dg/strlenopt-5.c @@ -48,9 +48,9 @@ main () return 0; } -/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */ /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ -/* { dg-final { scan-tree-dump-times "strchr \\(" 2 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ diff --git a/gcc/testsuite/gcc.dg/strlenopt-7.c b/gcc/testsuite/gcc.dg/strlenopt-7.c index 3ae1e2cb3f0d08c8b05107c4e65b67bdb39cf7ab..aa53d7e75254dfe56c93172afc49f95e5b7901e6 100644 --- a/gcc/testsuite/gcc.dg/strlenopt-7.c +++ b/gcc/testsuite/gcc.dg/strlenopt-7.c @@ -40,11 +40,11 @@ main () return 0; } -/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */ /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ -/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "\\*r_\[0-9\]* = 0;" 1 "strlen" } } */ /* { dg-final { scan-tree-dump-times "return 3;" 1 "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/strlenopt-9.c b/gcc/testsuite/gcc.dg/strlenopt-9.c index b0406b162d48fca375883035043b0c50b9db61a1..e5e276210ba4b7d75867605f1ecf5c06eb970ef5 100644 --- a/gcc/testsuite/gcc.dg/strlenopt-9.c +++ b/gcc/testsuite/gcc.dg/strlenopt-9.c @@ -98,10 +98,10 @@ main () return 0; } -/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strlen \\(" 5 "strlen" } } */ /* { dg-final { scan-tree-dump-times "memcpy \\(" 6 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */ /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */ -/* { dg-final { scan-tree-dump-times "strchr \\(" 3 "strlen" } } */ +/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */ /* { dg-final { scan-tree-dump-times "return 4;" 1 "optimized" } } */