diff mbox series

New finish_compare_by_pieces target hook (for x86).

Message ID 001001d99d36$a38111c0$ea833540$@nextmovesoftware.com
State New
Headers show
Series New finish_compare_by_pieces target hook (for x86). | expand

Commit Message

Roger Sayle June 12, 2023, 2:03 p.m. UTC
The following simple test case, from PR 104610, shows that memcmp () == 0
can result in some bizarre code sequences on x86.

int foo(char *a)
{
    static const char t[] = "0123456789012345678901234567890";
    return __builtin_memcmp(a, &t[0], sizeof(t)) == 0;
}

with -O2 currently contains both:
        xorl    %eax, %eax
        xorl    $1, %eax
and also
        movl    $1, %eax
        xorl    $1, %eax

Changing the return type of foo to _Bool results in the equally
bizarre:
        xorl    %eax, %eax
        testl   %eax, %eax
        sete    %al
and also
        movl    $1, %eax
        testl   %eax, %eax
        sete    %al

All these sequences set the result to a constant, but this optimization
opportunity only occurs very late during compilation, by basic block
duplication in the 322r.bbro pass, too late for CSE or peephole2 to
do anything about it.  The problem is that the idiom expanded by
compare_by_pieces for __builtin_memcmp_eq contains basic blocks that
can't easily be optimized by if-conversion due to the multiple
incoming edges on the fail block.

In summary, compare_by_pieces generates code that looks like:

        if (x[0] != y[0]) goto fail_label;
        if (x[1] != y[1]) goto fail_label;
        ...
        if (x[n] != y[n]) goto fail_label;
        result = 1;
        goto end_label;
fail_label:
        result = 0;
end_label:

In theory, the RTL if-conversion pass could be enhanced to tackle
arbitrarily complex if-then-else graphs, but the solution proposed
here is to allow suitable targets to perform if-conversion during
compare_by_pieces.  The x86, for example, can take advantage that
all of the above comparisons set and test the zero flag (ZF), which
can then be used in combination with sete.  Hence compare_by_pieces
could instead generate:

        if (x[0] != y[0]) goto fail_label;
        if (x[1] != y[1]) goto fail_label;
        ...
        if (x[n] != y[n]) goto fail_label;
fail_label:
        sete result

which requires one less basic block, and the redundant conditional
branch to a label immediately after is cleaned up by GCC's existing
RTL optimizations.

For the test case above, where -O2 -msse4 previously generated:

foo:    movdqu  (%rdi), %xmm0
        pxor    .LC0(%rip), %xmm0
        ptest   %xmm0, %xmm0
        je      .L5
.L2:    movl    $1, %eax
        xorl    $1, %eax
        ret
.L5:    movdqu  16(%rdi), %xmm0
        pxor    .LC1(%rip), %xmm0
        ptest   %xmm0, %xmm0
        jne     .L2
        xorl    %eax, %eax
        xorl    $1, %eax
        ret

we now generate:

foo:    movdqu  (%rdi), %xmm0
        pxor    .LC0(%rip), %xmm0
        ptest   %xmm0, %xmm0
        jne     .L2
        movdqu  16(%rdi), %xmm0
        pxor    .LC1(%rip), %xmm0
        ptest   %xmm0, %xmm0
.L2:    sete    %al
        movzbl  %al, %eax
        ret

Using a target hook allows the large amount of intelligence already in
compare_by_pieces to be re-used by the i386 backend, but this can also
help other backends with condition flags where the equality result can
be materialized.

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures.  Ok for mainline?


2023-06-12  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
        * config/i386/i386.cc (ix86_finish_compare_by_pieces): New
        function to provide a backend specific implementation.
        (TARGET_FINISH_COMPARE_BY_PIECES): Use the above function.

        * doc/tm.texi.in (TARGET_FINISH_COMPARE_BY_PIECES): New @hook.
        * doc/tm.texi: Regenerate.

        * expr.cc (compare_by_pieces): Call finish_compare_by_pieces in
        targetm to finalize the RTL expansion.  Move the current
        implementation to a default target hook.
        * target.def (finish_compare_by_pieces): New target hook to allow
        compare_by_pieces to be customized by the target.
        * targhooks.cc (default_finish_compare_by_pieces): Default
        implementation moved here from expr.cc's compare_by_pieces.
        * targhooks.h (default_finish_compare_by_pieces): Prototype.

gcc/testsuite/ChangeLog
        * gcc.target/i386/pieces-memcmp-1.c: New test case.


Thanks in advance,
Roger
--

Comments

Uros Bizjak June 12, 2023, 6:46 p.m. UTC | #1
On Mon, Jun 12, 2023 at 4:03 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> The following simple test case, from PR 104610, shows that memcmp () == 0
> can result in some bizarre code sequences on x86.
>
> int foo(char *a)
> {
>     static const char t[] = "0123456789012345678901234567890";
>     return __builtin_memcmp(a, &t[0], sizeof(t)) == 0;
> }
>
> with -O2 currently contains both:
>         xorl    %eax, %eax
>         xorl    $1, %eax
> and also
>         movl    $1, %eax
>         xorl    $1, %eax
>
> Changing the return type of foo to _Bool results in the equally
> bizarre:
>         xorl    %eax, %eax
>         testl   %eax, %eax
>         sete    %al
> and also
>         movl    $1, %eax
>         testl   %eax, %eax
>         sete    %al
>
> All these sequences set the result to a constant, but this optimization
> opportunity only occurs very late during compilation, by basic block
> duplication in the 322r.bbro pass, too late for CSE or peephole2 to
> do anything about it.  The problem is that the idiom expanded by
> compare_by_pieces for __builtin_memcmp_eq contains basic blocks that
> can't easily be optimized by if-conversion due to the multiple
> incoming edges on the fail block.
>
> In summary, compare_by_pieces generates code that looks like:
>
>         if (x[0] != y[0]) goto fail_label;
>         if (x[1] != y[1]) goto fail_label;
>         ...
>         if (x[n] != y[n]) goto fail_label;
>         result = 1;
>         goto end_label;
> fail_label:
>         result = 0;
> end_label:
>
> In theory, the RTL if-conversion pass could be enhanced to tackle
> arbitrarily complex if-then-else graphs, but the solution proposed
> here is to allow suitable targets to perform if-conversion during
> compare_by_pieces.  The x86, for example, can take advantage that
> all of the above comparisons set and test the zero flag (ZF), which
> can then be used in combination with sete.  Hence compare_by_pieces
> could instead generate:
>
>         if (x[0] != y[0]) goto fail_label;
>         if (x[1] != y[1]) goto fail_label;
>         ...
>         if (x[n] != y[n]) goto fail_label;
> fail_label:
>         sete result
>
> which requires one less basic block, and the redundant conditional
> branch to a label immediately after is cleaned up by GCC's existing
> RTL optimizations.
>
> For the test case above, where -O2 -msse4 previously generated:
>
> foo:    movdqu  (%rdi), %xmm0
>         pxor    .LC0(%rip), %xmm0
>         ptest   %xmm0, %xmm0
>         je      .L5
> .L2:    movl    $1, %eax
>         xorl    $1, %eax
>         ret
> .L5:    movdqu  16(%rdi), %xmm0
>         pxor    .LC1(%rip), %xmm0
>         ptest   %xmm0, %xmm0
>         jne     .L2
>         xorl    %eax, %eax
>         xorl    $1, %eax
>         ret
>
> we now generate:
>
> foo:    movdqu  (%rdi), %xmm0
>         pxor    .LC0(%rip), %xmm0
>         ptest   %xmm0, %xmm0
>         jne     .L2
>         movdqu  16(%rdi), %xmm0
>         pxor    .LC1(%rip), %xmm0
>         ptest   %xmm0, %xmm0
> .L2:    sete    %al
>         movzbl  %al, %eax
>         ret
>
> Using a target hook allows the large amount of intelligence already in
> compare_by_pieces to be re-used by the i386 backend, but this can also
> help other backends with condition flags where the equality result can
> be materialized.
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}
> with no new failures.  Ok for mainline?
>
>
> 2023-06-12  Roger Sayle  <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
>         * config/i386/i386.cc (ix86_finish_compare_by_pieces): New
>         function to provide a backend specific implementation.
>         (TARGET_FINISH_COMPARE_BY_PIECES): Use the above function.
>
>         * doc/tm.texi.in (TARGET_FINISH_COMPARE_BY_PIECES): New @hook.
>         * doc/tm.texi: Regenerate.
>
>         * expr.cc (compare_by_pieces): Call finish_compare_by_pieces in
>         targetm to finalize the RTL expansion.  Move the current
>         implementation to a default target hook.
>         * target.def (finish_compare_by_pieces): New target hook to allow
>         compare_by_pieces to be customized by the target.
>         * targhooks.cc (default_finish_compare_by_pieces): Default
>         implementation moved here from expr.cc's compare_by_pieces.
>         * targhooks.h (default_finish_compare_by_pieces): Prototype.
>
> gcc/testsuite/ChangeLog
>         * gcc.target/i386/pieces-memcmp-1.c: New test case.

This patch needs middle-end approval first.

Uros.

>
>
> Thanks in advance,
> Roger
> --
>
Richard Biener June 13, 2023, 11:02 a.m. UTC | #2
On Mon, Jun 12, 2023 at 4:04 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> The following simple test case, from PR 104610, shows that memcmp () == 0
> can result in some bizarre code sequences on x86.
>
> int foo(char *a)
> {
>     static const char t[] = "0123456789012345678901234567890";
>     return __builtin_memcmp(a, &t[0], sizeof(t)) == 0;
> }
>
> with -O2 currently contains both:
>         xorl    %eax, %eax
>         xorl    $1, %eax
> and also
>         movl    $1, %eax
>         xorl    $1, %eax
>
> Changing the return type of foo to _Bool results in the equally
> bizarre:
>         xorl    %eax, %eax
>         testl   %eax, %eax
>         sete    %al
> and also
>         movl    $1, %eax
>         testl   %eax, %eax
>         sete    %al
>
> All these sequences set the result to a constant, but this optimization
> opportunity only occurs very late during compilation, by basic block
> duplication in the 322r.bbro pass, too late for CSE or peephole2 to
> do anything about it.  The problem is that the idiom expanded by
> compare_by_pieces for __builtin_memcmp_eq contains basic blocks that
> can't easily be optimized by if-conversion due to the multiple
> incoming edges on the fail block.
>
> In summary, compare_by_pieces generates code that looks like:
>
>         if (x[0] != y[0]) goto fail_label;
>         if (x[1] != y[1]) goto fail_label;
>         ...
>         if (x[n] != y[n]) goto fail_label;
>         result = 1;
>         goto end_label;
> fail_label:
>         result = 0;
> end_label:
>
> In theory, the RTL if-conversion pass could be enhanced to tackle
> arbitrarily complex if-then-else graphs, but the solution proposed
> here is to allow suitable targets to perform if-conversion during
> compare_by_pieces.  The x86, for example, can take advantage that
> all of the above comparisons set and test the zero flag (ZF), which
> can then be used in combination with sete.  Hence compare_by_pieces
> could instead generate:
>
>         if (x[0] != y[0]) goto fail_label;
>         if (x[1] != y[1]) goto fail_label;
>         ...
>         if (x[n] != y[n]) goto fail_label;
> fail_label:
>         sete result
>
> which requires one less basic block, and the redundant conditional
> branch to a label immediately after is cleaned up by GCC's existing
> RTL optimizations.
>
> For the test case above, where -O2 -msse4 previously generated:
>
> foo:    movdqu  (%rdi), %xmm0
>         pxor    .LC0(%rip), %xmm0
>         ptest   %xmm0, %xmm0
>         je      .L5
> .L2:    movl    $1, %eax
>         xorl    $1, %eax
>         ret
> .L5:    movdqu  16(%rdi), %xmm0
>         pxor    .LC1(%rip), %xmm0
>         ptest   %xmm0, %xmm0
>         jne     .L2
>         xorl    %eax, %eax
>         xorl    $1, %eax
>         ret
>
> we now generate:
>
> foo:    movdqu  (%rdi), %xmm0
>         pxor    .LC0(%rip), %xmm0
>         ptest   %xmm0, %xmm0
>         jne     .L2
>         movdqu  16(%rdi), %xmm0
>         pxor    .LC1(%rip), %xmm0
>         ptest   %xmm0, %xmm0
> .L2:    sete    %al
>         movzbl  %al, %eax
>         ret
>
> Using a target hook allows the large amount of intelligence already in
> compare_by_pieces to be re-used by the i386 backend, but this can also
> help other backends with condition flags where the equality result can
> be materialized.
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}
> with no new failures.  Ok for mainline?

What's the guarantee that the zero flag is appropriately set on all
edges incoming now and forever?  Does this require target specific
knowledge on how do_compare_rtx_and_jump is emitting RTL?

Do you see matching this in ifcvt to be unreasonable?  I'm thinking
of "reducing" the incoming edges pairwise without actually looking
at the ifcvt code.

Thanks,
Richard.

>
> 2023-06-12  Roger Sayle  <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
>         * config/i386/i386.cc (ix86_finish_compare_by_pieces): New
>         function to provide a backend specific implementation.
>         (TARGET_FINISH_COMPARE_BY_PIECES): Use the above function.
>
>         * doc/tm.texi.in (TARGET_FINISH_COMPARE_BY_PIECES): New @hook.
>         * doc/tm.texi: Regenerate.
>
>         * expr.cc (compare_by_pieces): Call finish_compare_by_pieces in
>         targetm to finalize the RTL expansion.  Move the current
>         implementation to a default target hook.
>         * target.def (finish_compare_by_pieces): New target hook to allow
>         compare_by_pieces to be customized by the target.
>         * targhooks.cc (default_finish_compare_by_pieces): Default
>         implementation moved here from expr.cc's compare_by_pieces.
>         * targhooks.h (default_finish_compare_by_pieces): Prototype.
>
> gcc/testsuite/ChangeLog
>         * gcc.target/i386/pieces-memcmp-1.c: New test case.
>
>
> Thanks in advance,
> Roger
> --
>
Roger Sayle June 25, 2023, 5:39 a.m. UTC | #3
On Tue, 13 June 2023 12:02, Richard Biener wrote:
> On Mon, Jun 12, 2023 at 4:04 PM Roger Sayle <roger@nextmovesoftware.com>
> wrote:
> > The following simple test case, from PR 104610, shows that memcmp ()
> > == 0 can result in some bizarre code sequences on x86.
> >
> > int foo(char *a)
> > {
> >     static const char t[] = "0123456789012345678901234567890";
> >     return __builtin_memcmp(a, &t[0], sizeof(t)) == 0; }
> >
> > with -O2 currently contains both:
> >         xorl    %eax, %eax
> >         xorl    $1, %eax
> > and also
> >         movl    $1, %eax
> >         xorl    $1, %eax
> >
> > Changing the return type of foo to _Bool results in the equally
> > bizarre:
> >         xorl    %eax, %eax
> >         testl   %eax, %eax
> >         sete    %al
> > and also
> >         movl    $1, %eax
> >         testl   %eax, %eax
> >         sete    %al
> >
> > All these sequences set the result to a constant, but this
> > optimization opportunity only occurs very late during compilation, by
> > basic block duplication in the 322r.bbro pass, too late for CSE or
> > peephole2 to do anything about it.  The problem is that the idiom
> > expanded by compare_by_pieces for __builtin_memcmp_eq contains basic
> > blocks that can't easily be optimized by if-conversion due to the
> > multiple incoming edges on the fail block.
> >
> > In summary, compare_by_pieces generates code that looks like:
> >
> >         if (x[0] != y[0]) goto fail_label;
> >         if (x[1] != y[1]) goto fail_label;
> >         ...
> >         if (x[n] != y[n]) goto fail_label;
> >         result = 1;
> >         goto end_label;
> > fail_label:
> >         result = 0;
> > end_label:
> >
> > In theory, the RTL if-conversion pass could be enhanced to tackle
> > arbitrarily complex if-then-else graphs, but the solution proposed
> > here is to allow suitable targets to perform if-conversion during
> > compare_by_pieces.  The x86, for example, can take advantage that all
> > of the above comparisons set and test the zero flag (ZF), which can
> > then be used in combination with sete.  Hence compare_by_pieces could
> > instead generate:
> >
> >         if (x[0] != y[0]) goto fail_label;
> >         if (x[1] != y[1]) goto fail_label;
> >         ...
> >         if (x[n] != y[n]) goto fail_label;
> > fail_label:
> >         sete result
> >
> > which requires one less basic block, and the redundant conditional
> > branch to a label immediately after is cleaned up by GCC's existing
> > RTL optimizations.
> >
> > For the test case above, where -O2 -msse4 previously generated:
> >
> > foo:    movdqu  (%rdi), %xmm0
> >         pxor    .LC0(%rip), %xmm0
> >         ptest   %xmm0, %xmm0
> >         je      .L5
> > .L2:    movl    $1, %eax
> >         xorl    $1, %eax
> >         ret
> > .L5:    movdqu  16(%rdi), %xmm0
> >         pxor    .LC1(%rip), %xmm0
> >         ptest   %xmm0, %xmm0
> >         jne     .L2
> >         xorl    %eax, %eax
> >         xorl    $1, %eax
> >         ret
> >
> > we now generate:
> >
> > foo:    movdqu  (%rdi), %xmm0
> >         pxor    .LC0(%rip), %xmm0
> >         ptest   %xmm0, %xmm0
> >         jne     .L2
> >         movdqu  16(%rdi), %xmm0
> >         pxor    .LC1(%rip), %xmm0
> >         ptest   %xmm0, %xmm0
> > .L2:    sete    %al
> >         movzbl  %al, %eax
> >         ret
> >
> > Using a target hook allows the large amount of intelligence already in
> > compare_by_pieces to be re-used by the i386 backend, but this can also
> > help other backends with condition flags where the equality result can
> > be materialized.
> >
> > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> > and make -k check, both with and without --target_board=unix{-m32}
> > with no new failures.  Ok for mainline?
> 
> What's the guarantee that the zero flag is appropriately set on all edges incoming
> now and forever?

Is there any reason why this target hook can't be removed (in future) should it stop
being useful?  It's completely optional and not required for the correct functioning
of the compiler.

> Does this require target specific knowledge on how do_compare_rtx_and_jump
> is emitting RTL?

Yes.  Each backend can decide how best to implement finish_compare_by_pieces
given its internal knowledge of how do_compare_rtx_and_jump works.  It's not
important to the middle-end how the underlying invariants are guaranteed, just
that they are and the backend produces correct code.  A backend may store flags
on the target label, or maintain state in cfun.  Future changes to the i386 backend
might cause it to revert to the default finish_compare_by_pieces, or provide an
alternate implementation, but at the moment this patch improves the code that
GCC generates.  Very little (in software like GCC) is forever.

> Do you see matching this in ifcvt to be unreasonable?  I'm thinking of "reducing"
> the incoming edges pairwise without actually looking at the ifcvt code.

There's nothing about the proposed patch that prevents or blocks improvements
to ifcvt, which I agree completely could be improved.  But even (in future) if later
RTL passes could clean things up, that's no reason for RTL expansion to initially
generate poor/inefficient code.  I'm not sure that a (hypothetical) ifcvt improvement
would be sufficient reason to revert/remove enhancements to compare_by_pieces.

Is it that there's not enough (bootstrap and) testsuite coverage of compare_by_pieces
to make you feel comfortable with this change?  The proposed implementation should
be "obvious enough" to future generations what the intended behaviour should be.
And the x86 target hook implementation (i.e. the change) is only four lines long, a
fraction of the size of the new documentation and comments.

Thanks in advance.
Roger


> > 2023-06-12  Roger Sayle  <roger@nextmovesoftware.com>
> >
> > gcc/ChangeLog
> >         * config/i386/i386.cc (ix86_finish_compare_by_pieces): New
> >         function to provide a backend specific implementation.
> >         (TARGET_FINISH_COMPARE_BY_PIECES): Use the above function.
> >
> >         * doc/tm.texi.in (TARGET_FINISH_COMPARE_BY_PIECES): New @hook.
> >         * doc/tm.texi: Regenerate.
> >
> >         * expr.cc (compare_by_pieces): Call finish_compare_by_pieces in
> >         targetm to finalize the RTL expansion.  Move the current
> >         implementation to a default target hook.
> >         * target.def (finish_compare_by_pieces): New target hook to allow
> >         compare_by_pieces to be customized by the target.
> >         * targhooks.cc (default_finish_compare_by_pieces): Default
> >         implementation moved here from expr.cc's compare_by_pieces.
> >         * targhooks.h (default_finish_compare_by_pieces): Prototype.
> >
> > gcc/testsuite/ChangeLog
> >         * gcc.target/i386/pieces-memcmp-1.c: New test case.
> >
Richard Biener June 26, 2023, 7:18 a.m. UTC | #4
On Sun, Jun 25, 2023 at 7:39 AM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> On Tue, 13 June 2023 12:02, Richard Biener wrote:
> > On Mon, Jun 12, 2023 at 4:04 PM Roger Sayle <roger@nextmovesoftware.com>
> > wrote:
> > > The following simple test case, from PR 104610, shows that memcmp ()
> > > == 0 can result in some bizarre code sequences on x86.
> > >
> > > int foo(char *a)
> > > {
> > >     static const char t[] = "0123456789012345678901234567890";
> > >     return __builtin_memcmp(a, &t[0], sizeof(t)) == 0; }
> > >
> > > with -O2 currently contains both:
> > >         xorl    %eax, %eax
> > >         xorl    $1, %eax
> > > and also
> > >         movl    $1, %eax
> > >         xorl    $1, %eax
> > >
> > > Changing the return type of foo to _Bool results in the equally
> > > bizarre:
> > >         xorl    %eax, %eax
> > >         testl   %eax, %eax
> > >         sete    %al
> > > and also
> > >         movl    $1, %eax
> > >         testl   %eax, %eax
> > >         sete    %al
> > >
> > > All these sequences set the result to a constant, but this
> > > optimization opportunity only occurs very late during compilation, by
> > > basic block duplication in the 322r.bbro pass, too late for CSE or
> > > peephole2 to do anything about it.  The problem is that the idiom
> > > expanded by compare_by_pieces for __builtin_memcmp_eq contains basic
> > > blocks that can't easily be optimized by if-conversion due to the
> > > multiple incoming edges on the fail block.
> > >
> > > In summary, compare_by_pieces generates code that looks like:
> > >
> > >         if (x[0] != y[0]) goto fail_label;
> > >         if (x[1] != y[1]) goto fail_label;
> > >         ...
> > >         if (x[n] != y[n]) goto fail_label;
> > >         result = 1;
> > >         goto end_label;
> > > fail_label:
> > >         result = 0;
> > > end_label:
> > >
> > > In theory, the RTL if-conversion pass could be enhanced to tackle
> > > arbitrarily complex if-then-else graphs, but the solution proposed
> > > here is to allow suitable targets to perform if-conversion during
> > > compare_by_pieces.  The x86, for example, can take advantage that all
> > > of the above comparisons set and test the zero flag (ZF), which can
> > > then be used in combination with sete.  Hence compare_by_pieces could
> > > instead generate:
> > >
> > >         if (x[0] != y[0]) goto fail_label;
> > >         if (x[1] != y[1]) goto fail_label;
> > >         ...
> > >         if (x[n] != y[n]) goto fail_label;
> > > fail_label:
> > >         sete result
> > >
> > > which requires one less basic block, and the redundant conditional
> > > branch to a label immediately after is cleaned up by GCC's existing
> > > RTL optimizations.
> > >
> > > For the test case above, where -O2 -msse4 previously generated:
> > >
> > > foo:    movdqu  (%rdi), %xmm0
> > >         pxor    .LC0(%rip), %xmm0
> > >         ptest   %xmm0, %xmm0
> > >         je      .L5
> > > .L2:    movl    $1, %eax
> > >         xorl    $1, %eax
> > >         ret
> > > .L5:    movdqu  16(%rdi), %xmm0
> > >         pxor    .LC1(%rip), %xmm0
> > >         ptest   %xmm0, %xmm0
> > >         jne     .L2
> > >         xorl    %eax, %eax
> > >         xorl    $1, %eax
> > >         ret
> > >
> > > we now generate:
> > >
> > > foo:    movdqu  (%rdi), %xmm0
> > >         pxor    .LC0(%rip), %xmm0
> > >         ptest   %xmm0, %xmm0
> > >         jne     .L2
> > >         movdqu  16(%rdi), %xmm0
> > >         pxor    .LC1(%rip), %xmm0
> > >         ptest   %xmm0, %xmm0
> > > .L2:    sete    %al
> > >         movzbl  %al, %eax
> > >         ret
> > >
> > > Using a target hook allows the large amount of intelligence already in
> > > compare_by_pieces to be re-used by the i386 backend, but this can also
> > > help other backends with condition flags where the equality result can
> > > be materialized.
> > >
> > > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> > > and make -k check, both with and without --target_board=unix{-m32}
> > > with no new failures.  Ok for mainline?
> >
> > What's the guarantee that the zero flag is appropriately set on all edges incoming
> > now and forever?
>
> Is there any reason why this target hook can't be removed (in future) should it stop
> being useful?  It's completely optional and not required for the correct functioning
> of the compiler.
>
> > Does this require target specific knowledge on how do_compare_rtx_and_jump
> > is emitting RTL?
>
> Yes.  Each backend can decide how best to implement finish_compare_by_pieces
> given its internal knowledge of how do_compare_rtx_and_jump works.  It's not
> important to the middle-end how the underlying invariants are guaranteed, just
> that they are and the backend produces correct code.  A backend may store flags
> on the target label, or maintain state in cfun.  Future changes to the i386 backend
> might cause it to revert to the default finish_compare_by_pieces, or provide an
> alternate implementation, but at the moment this patch improves the code that
> GCC generates.  Very little (in software like GCC) is forever.
>
> > Do you see matching this in ifcvt to be unreasonable?  I'm thinking of "reducing"
> > the incoming edges pairwise without actually looking at the ifcvt code.
>
> There's nothing about the proposed patch that prevents or blocks improvements
> to ifcvt, which I agree completely could be improved.  But even (in future) if later
> RTL passes could clean things up, that's no reason for RTL expansion to initially
> generate poor/inefficient code.  I'm not sure that a (hypothetical) ifcvt improvement
> would be sufficient reason to revert/remove enhancements to compare_by_pieces.
>
> Is it that there's not enough (bootstrap and) testsuite coverage of compare_by_pieces
> to make you feel comfortable with this change?  The proposed implementation should
> be "obvious enough" to future generations what the intended behaviour should be.
> And the x86 target hook implementation (i.e. the change) is only four lines long, a
> fraction of the size of the new documentation and comments.

My main concern was that we are communicating "implicit" dependences between
the target hook and expand RTL code generation - we don't seem to pass down
enough info for example to have the target verify constraints and
excuse itself if
they do not hold.  They also seem to be poorly documented and the
compare_by_pieces (and all _by_pieces) stuff has been extended to be quite
"configurable" and thus is probably prone to emit vastly different RTL in
some special cases?

That said, I'm not familiar enough with the compare_by_pices logic to say that
this looks obviously safe, but maybe it is.

Yes - all target hooks have this kind of "implicit" dependence on how
the calling
code works, so I guess the main thing is that it's not obvious how the calling
code works in this case?

Anyway, I'm not standing in the way of somebody approving this change - the code
generation improvements are definitely worth it.

Maybe it's the abstraction boundary that makes me worry - can we make the
target simply indicate whether a do_compare_rtx_and_jump sets a flag
(maybe query the CC mode used?) and have the generic code then emit
the setcc?  The calling code would then know it emitted suitable CC producing
compare and branch and it would emit the corresponding setcc with the proper
CCmode?

Richard.

> Thanks in advance.
> Roger
>
>
> > > 2023-06-12  Roger Sayle  <roger@nextmovesoftware.com>
> > >
> > > gcc/ChangeLog
> > >         * config/i386/i386.cc (ix86_finish_compare_by_pieces): New
> > >         function to provide a backend specific implementation.
> > >         (TARGET_FINISH_COMPARE_BY_PIECES): Use the above function.
> > >
> > >         * doc/tm.texi.in (TARGET_FINISH_COMPARE_BY_PIECES): New @hook.
> > >         * doc/tm.texi: Regenerate.
> > >
> > >         * expr.cc (compare_by_pieces): Call finish_compare_by_pieces in
> > >         targetm to finalize the RTL expansion.  Move the current
> > >         implementation to a default target hook.
> > >         * target.def (finish_compare_by_pieces): New target hook to allow
> > >         compare_by_pieces to be customized by the target.
> > >         * targhooks.cc (default_finish_compare_by_pieces): Default
> > >         implementation moved here from expr.cc's compare_by_pieces.
> > >         * targhooks.h (default_finish_compare_by_pieces): Prototype.
> > >
> > > gcc/testsuite/ChangeLog
> > >         * gcc.target/i386/pieces-memcmp-1.c: New test case.
> > >
>
>
Richard Sandiford June 26, 2023, 6:24 p.m. UTC | #5
Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> On Sun, Jun 25, 2023 at 7:39 AM Roger Sayle <roger@nextmovesoftware.com> wrote:
>>
>>
>> On Tue, 13 June 2023 12:02, Richard Biener wrote:
>> > On Mon, Jun 12, 2023 at 4:04 PM Roger Sayle <roger@nextmovesoftware.com>
>> > wrote:
>> > > The following simple test case, from PR 104610, shows that memcmp ()
>> > > == 0 can result in some bizarre code sequences on x86.
>> > >
>> > > int foo(char *a)
>> > > {
>> > >     static const char t[] = "0123456789012345678901234567890";
>> > >     return __builtin_memcmp(a, &t[0], sizeof(t)) == 0; }
>> > >
>> > > with -O2 currently contains both:
>> > >         xorl    %eax, %eax
>> > >         xorl    $1, %eax
>> > > and also
>> > >         movl    $1, %eax
>> > >         xorl    $1, %eax
>> > >
>> > > Changing the return type of foo to _Bool results in the equally
>> > > bizarre:
>> > >         xorl    %eax, %eax
>> > >         testl   %eax, %eax
>> > >         sete    %al
>> > > and also
>> > >         movl    $1, %eax
>> > >         testl   %eax, %eax
>> > >         sete    %al
>> > >
>> > > All these sequences set the result to a constant, but this
>> > > optimization opportunity only occurs very late during compilation, by
>> > > basic block duplication in the 322r.bbro pass, too late for CSE or
>> > > peephole2 to do anything about it.  The problem is that the idiom
>> > > expanded by compare_by_pieces for __builtin_memcmp_eq contains basic
>> > > blocks that can't easily be optimized by if-conversion due to the
>> > > multiple incoming edges on the fail block.
>> > >
>> > > In summary, compare_by_pieces generates code that looks like:
>> > >
>> > >         if (x[0] != y[0]) goto fail_label;
>> > >         if (x[1] != y[1]) goto fail_label;
>> > >         ...
>> > >         if (x[n] != y[n]) goto fail_label;
>> > >         result = 1;
>> > >         goto end_label;
>> > > fail_label:
>> > >         result = 0;
>> > > end_label:
>> > >
>> > > In theory, the RTL if-conversion pass could be enhanced to tackle
>> > > arbitrarily complex if-then-else graphs, but the solution proposed
>> > > here is to allow suitable targets to perform if-conversion during
>> > > compare_by_pieces.  The x86, for example, can take advantage that all
>> > > of the above comparisons set and test the zero flag (ZF), which can
>> > > then be used in combination with sete.  Hence compare_by_pieces could
>> > > instead generate:
>> > >
>> > >         if (x[0] != y[0]) goto fail_label;
>> > >         if (x[1] != y[1]) goto fail_label;
>> > >         ...
>> > >         if (x[n] != y[n]) goto fail_label;
>> > > fail_label:
>> > >         sete result
>> > >
>> > > which requires one less basic block, and the redundant conditional
>> > > branch to a label immediately after is cleaned up by GCC's existing
>> > > RTL optimizations.
>> > >
>> > > For the test case above, where -O2 -msse4 previously generated:
>> > >
>> > > foo:    movdqu  (%rdi), %xmm0
>> > >         pxor    .LC0(%rip), %xmm0
>> > >         ptest   %xmm0, %xmm0
>> > >         je      .L5
>> > > .L2:    movl    $1, %eax
>> > >         xorl    $1, %eax
>> > >         ret
>> > > .L5:    movdqu  16(%rdi), %xmm0
>> > >         pxor    .LC1(%rip), %xmm0
>> > >         ptest   %xmm0, %xmm0
>> > >         jne     .L2
>> > >         xorl    %eax, %eax
>> > >         xorl    $1, %eax
>> > >         ret
>> > >
>> > > we now generate:
>> > >
>> > > foo:    movdqu  (%rdi), %xmm0
>> > >         pxor    .LC0(%rip), %xmm0
>> > >         ptest   %xmm0, %xmm0
>> > >         jne     .L2
>> > >         movdqu  16(%rdi), %xmm0
>> > >         pxor    .LC1(%rip), %xmm0
>> > >         ptest   %xmm0, %xmm0
>> > > .L2:    sete    %al
>> > >         movzbl  %al, %eax
>> > >         ret
>> > >
>> > > Using a target hook allows the large amount of intelligence already in
>> > > compare_by_pieces to be re-used by the i386 backend, but this can also
>> > > help other backends with condition flags where the equality result can
>> > > be materialized.
>> > >
>> > > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
>> > > and make -k check, both with and without --target_board=unix{-m32}
>> > > with no new failures.  Ok for mainline?
>> >
>> > What's the guarantee that the zero flag is appropriately set on all edges incoming
>> > now and forever?
>>
>> Is there any reason why this target hook can't be removed (in future) should it stop
>> being useful?  It's completely optional and not required for the correct functioning
>> of the compiler.
>>
>> > Does this require target specific knowledge on how do_compare_rtx_and_jump
>> > is emitting RTL?
>>
>> Yes.  Each backend can decide how best to implement finish_compare_by_pieces
>> given its internal knowledge of how do_compare_rtx_and_jump works.  It's not
>> important to the middle-end how the underlying invariants are guaranteed, just
>> that they are and the backend produces correct code.  A backend may store flags
>> on the target label, or maintain state in cfun.  Future changes to the i386 backend
>> might cause it to revert to the default finish_compare_by_pieces, or provide an
>> alternate implementation, but at the moment this patch improves the code that
>> GCC generates.  Very little (in software like GCC) is forever.
>>
>> > Do you see matching this in ifcvt to be unreasonable?  I'm thinking of "reducing"
>> > the incoming edges pairwise without actually looking at the ifcvt code.
>>
>> There's nothing about the proposed patch that prevents or blocks improvements
>> to ifcvt, which I agree completely could be improved.  But even (in future) if later
>> RTL passes could clean things up, that's no reason for RTL expansion to initially
>> generate poor/inefficient code.  I'm not sure that a (hypothetical) ifcvt improvement
>> would be sufficient reason to revert/remove enhancements to compare_by_pieces.
>>
>> Is it that there's not enough (bootstrap and) testsuite coverage of compare_by_pieces
>> to make you feel comfortable with this change?  The proposed implementation should
>> be "obvious enough" to future generations what the intended behaviour should be.
>> And the x86 target hook implementation (i.e. the change) is only four lines long, a
>> fraction of the size of the new documentation and comments.
>
> My main concern was that we are communicating "implicit" dependences between
> the target hook and expand RTL code generation - we don't seem to pass down
> enough info for example to have the target verify constraints and
> excuse itself if
> they do not hold.  They also seem to be poorly documented and the
> compare_by_pieces (and all _by_pieces) stuff has been extended to be quite
> "configurable" and thus is probably prone to emit vastly different RTL in
> some special cases?

FWIW, I share these concerns, especially about the target hook not
being able to verify its assumptions.  Maybe it wouldn't matter too
much if the worst outcome was a missed optimisation, but I don't think
the mechanism is reliable enough for us to rely on it for correctness.

Richard
diff mbox series

Patch

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 3a1444d..509c0ee 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -16146,6 +16146,20 @@  ix86_fp_compare_code_to_integer (enum rtx_code code)
     }
 }
 
+/* Override compare_by_pieces' default implementation using the state
+   of the CCZmode FLAGS_REG and sete instruction.  TARGET is the integral
+   mode result, and FAIL_LABEL is the branch target of mismatched
+   comparisons.  */
+
+void
+ix86_finish_compare_by_pieces (rtx target, rtx_code_label *fail_label)
+{
+  rtx tmp = gen_reg_rtx (QImode);
+  emit_label (fail_label);
+  ix86_expand_setcc (tmp, NE, gen_rtx_REG (CCZmode, FLAGS_REG), const0_rtx);
+  convert_move (target, tmp, 1);
+}
+
 /* Zero extend possibly SImode EXP to Pmode register.  */
 rtx
 ix86_zero_extend_to_Pmode (rtx exp)
@@ -25127,6 +25141,8 @@  ix86_run_selftests (void)
 
 #undef TARGET_OVERLAP_OP_BY_PIECES_P
 #define TARGET_OVERLAP_OP_BY_PIECES_P hook_bool_void_true
+#undef TARGET_FINISH_COMPARE_BY_PIECES
+#define TARGET_FINISH_COMPARE_BY_PIECES ix86_finish_compare_by_pieces
 
 #undef TARGET_FLAGS_REGNUM
 #define TARGET_FLAGS_REGNUM FLAGS_REG
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 95ba56e..28d8361 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6922,6 +6922,18 @@  particular mode from being used for block comparisons by returning a
 negative number from this hook.
 @end deftypefn
 
+@deftypefn {Target Hook} void TARGET_FINISH_COMPARE_BY_PIECES (rtx @var{target}, rtx_code_label *@var{fail_label})
+Allow targets with a zero flag and suitable setcc instruction to provide
+an alternate implementation for @code{compare_by_pieces}.  The function
+@code{compare_by_pieces} generates a sequence of equality tests that
+branch to @var{failure_label} on mismatches, and fall through on success.
+By default, this hook assigns @code{const1_rtx} to @var{target} in the
+current basic block and @code{const0_rtx} to @var{target} in a new
+@var{fail_label} basic block.  Targets like x86 can take advantage
+of the property that the condition codes/zero flag are appropriately
+set to avoid introducing a new basic block.
+@end deftypefn
+
 @defmac MOVE_MAX_PIECES
 A C expression used by @code{move_by_pieces} to determine the largest unit
 a load or store used to copy memory is.  Defaults to @code{MOVE_MAX}.
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 4ac96dc..45711db 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4529,6 +4529,8 @@  If you don't define this, a reasonable default is used.
 
 @hook TARGET_COMPARE_BY_PIECES_BRANCH_RATIO
 
+@hook TARGET_FINISH_COMPARE_BY_PIECES
+
 @defmac MOVE_MAX_PIECES
 A C expression used by @code{move_by_pieces} to determine the largest unit
 a load or store used to copy memory is.  Defaults to @code{MOVE_MAX}.
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 868fa6e..25fca17 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -1923,7 +1923,6 @@  compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
 		   by_pieces_constfn a1_cfn, void *a1_cfn_data)
 {
   rtx_code_label *fail_label = gen_label_rtx ();
-  rtx_code_label *end_label = gen_label_rtx ();
 
   if (target == NULL_RTX
       || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
@@ -1934,12 +1933,8 @@  compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
 
   data.run ();
 
-  emit_move_insn (target, const0_rtx);
-  emit_jump (end_label);
-  emit_barrier ();
-  emit_label (fail_label);
-  emit_move_insn (target, const1_rtx);
-  emit_label (end_label);
+  /* Allow the backend to override the default implementation.  */
+  targetm.finish_compare_by_pieces (target, fail_label);
 
   return target;
 }
diff --git a/gcc/target.def b/gcc/target.def
index 7d68429..0593917 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -3749,6 +3749,20 @@  negative number from this hook.",
  default_compare_by_pieces_branch_ratio)
 
 DEFHOOK
+(finish_compare_by_pieces,
+ "Allow targets with a zero flag and suitable setcc instruction to provide\n\
+an alternate implementation for @code{compare_by_pieces}.  The function\n\
+@code{compare_by_pieces} generates a sequence of equality tests that\n\
+branch to @var{failure_label} on mismatches, and fall through on success.\n\
+By default, this hook assigns @code{const1_rtx} to @var{target} in the\n\
+current basic block and @code{const0_rtx} to @var{target} in a new\n\
+@var{fail_label} basic block.  Targets like x86 can take advantage\n\
+of the property that the condition codes/zero flag are appropriately\n\
+set to avoid introducing a new basic block.",
+  void, (rtx target, rtx_code_label *fail_label),
+  default_finish_compare_by_pieces)
+
+DEFHOOK
 (slow_unaligned_access,
  "This hook returns true if memory accesses described by the\n\
 @var{mode} and @var{alignment} parameters have a cost many times greater\n\
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index e190369..767fe38 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -2094,6 +2094,24 @@  default_compare_by_pieces_branch_ratio (machine_mode)
   return 1;
 }
 
+/* This hook allows the backend to modify/override code generation for
+   compare_by_pieces.  Targets with a zero flag and a suitable setcc
+   function can use them instead of the default "compare ? 0 : 1"
+   implementation below.  TARGET is the integral mode result and
+   FAIL_LABEL is the destination for comparison mismatches.  */
+
+void
+default_finish_compare_by_pieces (rtx target, rtx_code_label *fail_label)
+{
+  rtx_code_label *end_label = gen_label_rtx ();
+  emit_move_insn (target, const0_rtx);
+  emit_jump (end_label);
+  emit_barrier ();
+  emit_label (fail_label);
+  emit_move_insn (target, const1_rtx);
+  emit_label (end_label);
+}
+
 /* Write PATCH_AREA_SIZE NOPs into the asm outfile FILE around a function
    entry.  If RECORD_P is true and the target supports named sections,
    the location of the NOPs will be recorded in a special object section
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 1a0db8d..b99fff6 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -237,6 +237,7 @@  extern bool default_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT,
 						    enum by_pieces_operation,
 						    bool);
 extern int default_compare_by_pieces_branch_ratio (machine_mode);
+extern void default_finish_compare_by_pieces (rtx, rtx_code_label *);
 
 extern void default_print_patchable_function_entry (FILE *,
 						    unsigned HOST_WIDE_INT,
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcmp-1.c b/gcc/testsuite/gcc.target/i386/pieces-memcmp-1.c
new file mode 100644
index 0000000..de1d82f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcmp-1.c
@@ -0,0 +1,10 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+int foo(char *a)
+{
+    static const char t[] = "0123456789012345678901234567890";
+    return __builtin_memcmp(a, &t[0], sizeof(t)) == 0;
+}
+
+/* { dg-final { scan-assembler-not "xorl\[ \\t]*\\\$1," } } */