Message ID | 013101d9a91b$eb84cb60$c28e6220$@nextmovesoftware.com |
---|---|
State | New |
Headers | show |
Series | [x86] Add cbranchti4 pattern to i386.md (for -m32 compare_by_pieces). | expand |
On Tue, Jun 27, 2023 at 7:22 PM Roger Sayle <roger@nextmovesoftware.com> wrote: > > > This patch fixes some very odd (unanticipated) code generation by > compare_by_pieces with -m32 -mavx, since the recent addition of the > cbranchoi4 pattern. The issue is that cbranchoi4 is available with > TARGET_AVX, but cbranchti4 is currently conditional on TARGET_64BIT > which results in the odd behaviour (thanks to OPTAB_WIDEN) that with > -m32 -mavx, compare_by_pieces ends up (inefficiently) widening 128-bit > comparisons to 256-bits before performing PTEST. > > This patch fixes this by providing a cbranchti4 pattern that's available > with either TARGET_64BIT or TARGET_SSE4_1. > > For the test case below (again from PR 104610): > > int foo(char *a) > { > static const char t[] = "0123456789012345678901234567890"; > return __builtin_memcmp(a, &t[0], sizeof(t)) == 0; > } > > GCC with -m32 -O2 -mavx currently produces the bonkers: > > foo: pushl %ebp > movl %esp, %ebp > andl $-32, %esp > subl $64, %esp > movl 8(%ebp), %eax > vmovdqa .LC0, %xmm4 > movl $0, 48(%esp) > vmovdqu (%eax), %xmm2 > movl $0, 52(%esp) > movl $0, 56(%esp) > movl $0, 60(%esp) > movl $0, 16(%esp) > movl $0, 20(%esp) > movl $0, 24(%esp) > movl $0, 28(%esp) > vmovdqa %xmm2, 32(%esp) > vmovdqa %xmm4, (%esp) > vmovdqa (%esp), %ymm5 > vpxor 32(%esp), %ymm5, %ymm0 > vptest %ymm0, %ymm0 > jne .L2 > vmovdqu 16(%eax), %xmm7 > movl $0, 48(%esp) > movl $0, 52(%esp) > vmovdqa %xmm7, 32(%esp) > vmovdqa .LC1, %xmm7 > movl $0, 56(%esp) > movl $0, 60(%esp) > movl $0, 16(%esp) > movl $0, 20(%esp) > movl $0, 24(%esp) > movl $0, 28(%esp) > vmovdqa %xmm7, (%esp) > vmovdqa (%esp), %ymm1 > vpxor 32(%esp), %ymm1, %ymm0 > vptest %ymm0, %ymm0 > je .L6 > .L2: movl $1, %eax > xorl $1, %eax > vzeroupper > leave > ret > .L6: xorl %eax, %eax > xorl $1, %eax > vzeroupper > leave > ret > > with this patch, we now generate the (slightly) more sensible: > > foo: vmovdqa .LC0, %xmm0 > movl 4(%esp), %eax > vpxor (%eax), %xmm0, %xmm0 > vptest %xmm0, %xmm0 > jne .L2 > vmovdqa .LC1, %xmm0 > vpxor 16(%eax), %xmm0, %xmm0 > vptest %xmm0, %xmm0 > je .L5 > .L2: movl $1, %eax > xorl $1, %eax > ret > .L5: xorl %eax, %eax > xorl $1, %eax > ret > > > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap > and make -k check, both with and without --target_board=unix{-m32} > with no new failures. Ok for mainline? > > > 2023-06-27 Roger Sayle <roger@nextmovesoftware.com> > > gcc/ChangeLog > * config/i386/i386-expand.cc (ix86_expand_branch): Also use ptest > for TImode comparisons on 32-bit architectures. > * config/i386/i386.md (cbranch<mode>4): Change from SDWIM to > SWIM1248x to exclude/avoid TImode being conditional on -m64. > (cbranchti4): New define_expand for TImode on both TARGET_64BIT > and/or with TARGET_SSE4_1. > * config/i386/predicates.md (ix86_timode_comparison_operator): > New predicate that depends upon TARGET_64BIT. > (ix86_timode_comparison_operand): Likewise. > > gcc/testsuite/ChangeLog > * gcc.target/i386/pieces-memcmp-2.c: New test case. OK with a small fix. Thanks, Uros. +;; Return true if this is a valid second operand for a TImode comparison. +(define_predicate "ix86_timode_comparison_operand" + (if_then_else (match_test "TARGET_64BIT") + (match_operand 0 "x86_64_general_operand") + (match_operand 0 "nonimmediate_operand"))) + + Please remove the duplicate blank line above.
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 9a8d244..567248d 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -2365,6 +2365,7 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label) /* Handle special case - vector comparsion with boolean result, transform it using ptest instruction. */ if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT + || (mode == TImode && !TARGET_64BIT) || mode == OImode) { rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG); @@ -2372,7 +2373,7 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label) gcc_assert (code == EQ || code == NE); - if (mode == OImode) + if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT) { op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode); op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode); diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index b50d82b..dcf0ba6 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -1352,8 +1352,8 @@ (define_expand "cbranch<mode>4" [(set (reg:CC FLAGS_REG) - (compare:CC (match_operand:SDWIM 1 "nonimmediate_operand") - (match_operand:SDWIM 2 "<general_operand>"))) + (compare:CC (match_operand:SWIM1248x 1 "nonimmediate_operand") + (match_operand:SWIM1248x 2 "<general_operand>"))) (set (pc) (if_then_else (match_operator 0 "ordered_comparison_operator" [(reg:CC FLAGS_REG) (const_int 0)]) @@ -1368,6 +1368,22 @@ DONE; }) +(define_expand "cbranchti4" + [(set (reg:CC FLAGS_REG) + (compare:CC (match_operand:TI 1 "nonimmediate_operand") + (match_operand:TI 2 "ix86_timode_comparison_operand"))) + (set (pc) (if_then_else + (match_operator 0 "ix86_timode_comparison_operator" + [(reg:CC FLAGS_REG) (const_int 0)]) + (label_ref (match_operand 3)) + (pc)))] + "TARGET_64BIT || TARGET_SSE4_1" +{ + ix86_expand_branch (GET_CODE (operands[0]), + operands[1], operands[2], operands[3]); + DONE; +}) + (define_expand "cbranchoi4" [(set (reg:CC FLAGS_REG) (compare:CC (match_operand:OI 1 "nonimmediate_operand") diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index fb07707..2d50cbf 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1641,6 +1641,19 @@ (match_operand 0 "comparison_operator") (match_operand 0 "ix86_trivial_fp_comparison_operator"))) +;; Return true if we can perform this comparison on TImode operands. +(define_predicate "ix86_timode_comparison_operator" + (if_then_else (match_test "TARGET_64BIT") + (match_operand 0 "ordered_comparison_operator") + (match_operand 0 "bt_comparison_operator"))) + +;; Return true if this is a valid second operand for a TImode comparison. +(define_predicate "ix86_timode_comparison_operand" + (if_then_else (match_test "TARGET_64BIT") + (match_operand 0 "x86_64_general_operand") + (match_operand 0 "nonimmediate_operand"))) + + ;; Nearly general operand, but accept any const_double, since we wish ;; to be able to drop them into memory rather than have them get pulled ;; into registers. diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcmp-2.c b/gcc/testsuite/gcc.target/i386/pieces-memcmp-2.c new file mode 100644 index 0000000..6f996fa --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pieces-memcmp-2.c @@ -0,0 +1,13 @@ +/* { dg-do compile { target ia32 } } */ +/* { dg-options "-O2 -mavx2" } */ + +int foo(char *a) +{ + static const char t[] = "0123456789012345678901234567890"; + return __builtin_memcmp(a, &t[0], sizeof(t)) == 0; +} + +/* { dg-final { scan-assembler-not "movl\[ \\t]*\\\$0," } } */ +/* { dg-final { scan-assembler-not "vptest\[ \\t]*%ymm" } } */ +/* { dg-final { scan-assembler-times "vptest\[ \\t]*%xmm" 2 } } */ +