Message ID | orcyvfambh.fsf@lxoliva.fsfla.org |
---|---|
State | New |
Headers | show |
Series | [v2] -finline-stringops: check base blksize for memset [PR112778] | expand |
On Sat, Dec 9, 2023 at 8:05 AM Alexandre Oliva <oliva@adacore.com> wrote: > > Scratch the previous one, the "slightly different version" I had before > it was not entirely broken due to unnecessary, suboptimal and incorrect > use of ctz. Here I have yet another implementation of that loop that > should perform better and even work correctly ;-) > > > This one has so far regstrapped on x86_64-linux-gnu (v1 failed in > regression testing, sorry), and bootstrapped with -finline-stringops on > ppc64le-linux-gnu (still ongoing on x86-64-linux-gnu and > aarch64-linux-gnu). Ok to install? OK > > The recently-added logic for -finline-stringops=memset introduced an > assumption that doesn't necessarily hold, namely, that > can_store_by_pieces of a larger size implies can_store_by_pieces by > smaller sizes. Checks for all sizes the by-multiple-pieces machinery > might use before committing to an expansion pattern. > > > for gcc/ChangeLog > > PR target/112778 > * builtins.cc (can_store_by_multiple_pieces): New. > (try_store_by_multiple_pieces): Call it. > > for gcc/testsuite/ChangeLog > > PR target/112778 > * gcc.dg/inline-mem-cmp-pr112778.c: New. > --- > gcc/builtins.cc | 57 ++++++++++++++++++++---- > gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c | 10 ++++ > 2 files changed, 58 insertions(+), 9 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c > > diff --git a/gcc/builtins.cc b/gcc/builtins.cc > index 12a535d313f12..f6c96498f0783 100644 > --- a/gcc/builtins.cc > +++ b/gcc/builtins.cc > @@ -4284,6 +4284,40 @@ expand_builtin_memset (tree exp, rtx target, machine_mode mode) > return expand_builtin_memset_args (dest, val, len, target, mode, exp); > } > > +/* Check that store_by_pieces allows BITS + LEN (so that we don't > + expand something too unreasonably long), and every power of 2 in > + BITS. It is assumed that LEN has already been tested by > + itself. */ > +static bool > +can_store_by_multiple_pieces (unsigned HOST_WIDE_INT bits, > + by_pieces_constfn constfun, > + void *constfundata, unsigned int align, > + bool memsetp, > + unsigned HOST_WIDE_INT len) > +{ > + if (bits > + && !can_store_by_pieces (bits + len, constfun, constfundata, > + align, memsetp)) > + return false; > + > + /* BITS set are expected to be generally in the low range and > + contiguous. We do NOT want to repeat the test above in case BITS > + has a single bit set, so we terminate the loop when BITS == BIT. > + In the unlikely case that BITS has the MSB set, also terminate in > + case BIT gets shifted out. */ > + for (unsigned HOST_WIDE_INT bit = 1; bit < bits && bit; bit <<= 1) > + { > + if ((bits & bit) == 0) > + continue; > + > + if (!can_store_by_pieces (bit, constfun, constfundata, > + align, memsetp)) > + return false; > + } > + > + return true; > +} > + > /* Try to store VAL (or, if NULL_RTX, VALC) in LEN bytes starting at TO. > Return TRUE if successful, FALSE otherwise. TO is assumed to be > aligned at an ALIGN-bits boundary. LEN must be a multiple of > @@ -4341,7 +4375,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len, > else > /* Huh, max_len < min_len? Punt. See pr100843.c. */ > return false; > - if (min_len >= blksize) > + if (min_len >= blksize > + /* ??? Maybe try smaller fixed-prefix blksizes before > + punting? */ > + && can_store_by_pieces (blksize, builtin_memset_read_str, > + &valc, align, true)) > { > min_len -= blksize; > min_bits = floor_log2 (min_len); > @@ -4367,8 +4405,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len, > happen because of the way max_bits and blksize are related, but > it doesn't hurt to test. */ > if (blksize > xlenest > - || !can_store_by_pieces (xlenest, builtin_memset_read_str, > - &valc, align, true)) > + || !can_store_by_multiple_pieces (xlenest - blksize, > + builtin_memset_read_str, > + &valc, align, true, blksize)) > { > if (!(flag_inline_stringops & ILSOP_MEMSET)) > return false; > @@ -4386,17 +4425,17 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len, > of overflow. */ > if (max_bits < orig_max_bits > && xlenest + blksize >= xlenest > - && can_store_by_pieces (xlenest + blksize, > - builtin_memset_read_str, > - &valc, align, true)) > + && can_store_by_multiple_pieces (xlenest, > + builtin_memset_read_str, > + &valc, align, true, blksize)) > { > max_loop = true; > break; > } > if (blksize > - && can_store_by_pieces (xlenest, > - builtin_memset_read_str, > - &valc, align, true)) > + && can_store_by_multiple_pieces (xlenest, > + builtin_memset_read_str, > + &valc, align, true, 0)) > { > max_len += blksize; > min_len += blksize; > diff --git a/gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c b/gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c > new file mode 100644 > index 0000000000000..fdfc5b6f28c8e > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c > @@ -0,0 +1,10 @@ > +/* { dg-do compile } */ > +/* { dg-options "-finline-stringops" } */ > + > +char buf[3]; > + > +int > +f () > +{ > + __builtin_memset (buf, 'v', 3); > +} > > > -- > Alexandre Oliva, happy hacker https://FSFLA.org/blogs/lxo/ > Free Software Activist GNU Toolchain Engineer > More tolerance and less prejudice are key for inclusion and diversity > Excluding neuro-others for not behaving ""normal"" is *not* inclusive
diff --git a/gcc/builtins.cc b/gcc/builtins.cc index 12a535d313f12..f6c96498f0783 100644 --- a/gcc/builtins.cc +++ b/gcc/builtins.cc @@ -4284,6 +4284,40 @@ expand_builtin_memset (tree exp, rtx target, machine_mode mode) return expand_builtin_memset_args (dest, val, len, target, mode, exp); } +/* Check that store_by_pieces allows BITS + LEN (so that we don't + expand something too unreasonably long), and every power of 2 in + BITS. It is assumed that LEN has already been tested by + itself. */ +static bool +can_store_by_multiple_pieces (unsigned HOST_WIDE_INT bits, + by_pieces_constfn constfun, + void *constfundata, unsigned int align, + bool memsetp, + unsigned HOST_WIDE_INT len) +{ + if (bits + && !can_store_by_pieces (bits + len, constfun, constfundata, + align, memsetp)) + return false; + + /* BITS set are expected to be generally in the low range and + contiguous. We do NOT want to repeat the test above in case BITS + has a single bit set, so we terminate the loop when BITS == BIT. + In the unlikely case that BITS has the MSB set, also terminate in + case BIT gets shifted out. */ + for (unsigned HOST_WIDE_INT bit = 1; bit < bits && bit; bit <<= 1) + { + if ((bits & bit) == 0) + continue; + + if (!can_store_by_pieces (bit, constfun, constfundata, + align, memsetp)) + return false; + } + + return true; +} + /* Try to store VAL (or, if NULL_RTX, VALC) in LEN bytes starting at TO. Return TRUE if successful, FALSE otherwise. TO is assumed to be aligned at an ALIGN-bits boundary. LEN must be a multiple of @@ -4341,7 +4375,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len, else /* Huh, max_len < min_len? Punt. See pr100843.c. */ return false; - if (min_len >= blksize) + if (min_len >= blksize + /* ??? Maybe try smaller fixed-prefix blksizes before + punting? */ + && can_store_by_pieces (blksize, builtin_memset_read_str, + &valc, align, true)) { min_len -= blksize; min_bits = floor_log2 (min_len); @@ -4367,8 +4405,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len, happen because of the way max_bits and blksize are related, but it doesn't hurt to test. */ if (blksize > xlenest - || !can_store_by_pieces (xlenest, builtin_memset_read_str, - &valc, align, true)) + || !can_store_by_multiple_pieces (xlenest - blksize, + builtin_memset_read_str, + &valc, align, true, blksize)) { if (!(flag_inline_stringops & ILSOP_MEMSET)) return false; @@ -4386,17 +4425,17 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len, of overflow. */ if (max_bits < orig_max_bits && xlenest + blksize >= xlenest - && can_store_by_pieces (xlenest + blksize, - builtin_memset_read_str, - &valc, align, true)) + && can_store_by_multiple_pieces (xlenest, + builtin_memset_read_str, + &valc, align, true, blksize)) { max_loop = true; break; } if (blksize - && can_store_by_pieces (xlenest, - builtin_memset_read_str, - &valc, align, true)) + && can_store_by_multiple_pieces (xlenest, + builtin_memset_read_str, + &valc, align, true, 0)) { max_len += blksize; min_len += blksize; diff --git a/gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c b/gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c new file mode 100644 index 0000000000000..fdfc5b6f28c8e --- /dev/null +++ b/gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-finline-stringops" } */ + +char buf[3]; + +int +f () +{ + __builtin_memset (buf, 'v', 3); +}