diff mbox series

expand: Allow widdening optab when expanding popcount==1 [PR116508]

Message ID 20240829013557.3827912-1-quic_apinski@quicinc.com
State New
Headers show
Series expand: Allow widdening optab when expanding popcount==1 [PR116508] | expand

Commit Message

Andrew Pinski Aug. 29, 2024, 1:35 a.m. UTC
After adding popcount{qi,hi}2 to the aarch64 backend, I noticed that
the expansion for popcount==1 was no longer trying to do the trick
of handling popcount==1 as `(arg ^ (arg - 1)) > arg - 1`. The problem
is the expansion was using OPTAB_DIRECT, when using OPTAB_WIDEN
will allow modes which are smaller than SImode (in the aarch64 case).

Note QImode's cost still needs some improvements so part of popcnt-eq-1.c
is xfailed. Though there is a check to make sure the costs are compared now.

Built and tested on aarch64-linux-gnu.

	PR middle-end/116508

gcc/ChangeLog:

	* internal-fn.cc (expand_POPCOUNT): Use OPTAB_WIDEN for PLUS and
	XOR/AND expansion.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/popcnt-eq-1.c: New test.

Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
---
 gcc/internal-fn.cc                            |  4 +-
 .../gcc.target/aarch64/popcnt-eq-1.c          | 45 +++++++++++++++++++
 2 files changed, 47 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c

Comments

Richard Biener Aug. 29, 2024, 9:15 a.m. UTC | #1
On Thu, Aug 29, 2024 at 3:37 AM Andrew Pinski <quic_apinski@quicinc.com> wrote:
>
> After adding popcount{qi,hi}2 to the aarch64 backend, I noticed that
> the expansion for popcount==1 was no longer trying to do the trick
> of handling popcount==1 as `(arg ^ (arg - 1)) > arg - 1`. The problem
> is the expansion was using OPTAB_DIRECT, when using OPTAB_WIDEN
> will allow modes which are smaller than SImode (in the aarch64 case).
>
> Note QImode's cost still needs some improvements so part of popcnt-eq-1.c
> is xfailed. Though there is a check to make sure the costs are compared now.
>
> Built and tested on aarch64-linux-gnu.

OK.

Richard.

>         PR middle-end/116508
>
> gcc/ChangeLog:
>
>         * internal-fn.cc (expand_POPCOUNT): Use OPTAB_WIDEN for PLUS and
>         XOR/AND expansion.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/popcnt-eq-1.c: New test.
>
> Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
> ---
>  gcc/internal-fn.cc                            |  4 +-
>  .../gcc.target/aarch64/popcnt-eq-1.c          | 45 +++++++++++++++++++
>  2 files changed, 47 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c
>
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 78997ef056a..4e33db365ac 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -5332,11 +5332,11 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
>    start_sequence ();
>    rtx op0 = expand_normal (arg);
>    rtx argm1 = expand_simple_binop (mode, PLUS, op0, constm1_rtx, NULL_RTX,
> -                                  1, OPTAB_DIRECT);
> +                                  1, OPTAB_WIDEN);
>    if (argm1 == NULL_RTX)
>      goto fail;
>    rtx argxorargm1 = expand_simple_binop (mode, nonzero_arg ? AND : XOR, op0,
> -                                        argm1, NULL_RTX, 1, OPTAB_DIRECT);
> +                                        argm1, NULL_RTX, 1, OPTAB_WIDEN);
>    if (argxorargm1 == NULL_RTX)
>      goto fail;
>    rtx cmp;
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c b/gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c
> new file mode 100644
> index 00000000000..bb9e2bf0a54
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c
> @@ -0,0 +1,45 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-rtl-expand-details" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* PR middle-end/116508 */
> +
> +#pragma GCC target "+nocssc"
> +
> +/*
> +** h16:
> +**     sub     w([0-9]+), w0, #1
> +**     eor     w([0-9]+), w0, w\1
> +**     and     w([0-9]+), w\1, 65535
> +**     cmp     w\3, w\2, uxth
> +**     cset    w0, cc
> +**     ret
> +*/
> +
> +/* when expanding popcount == 1, should use
> +   `(arg ^ (arg - 1)) > arg - 1` as that has a lower latency
> +   than doing the popcount then comparing against 1.
> +   The popcount/addv can be costly. */
> +unsigned h16 (const unsigned short a) {
> +         return __builtin_popcountg (a) == 1;
> +}
> +
> +/* unsigned char should also do the same trick */
> +/* Currently xfailed since the cost does not take into account the
> +   moving between gprs and vector regs correctly. */
> +/*
> +** h8: { xfail *-*-* }
> +**     sub     w([0-9]+), w0, #1
> +**     eor     w([0-9]+), w0, w\1
> +**     and     w([0-9]+), w\1, 255
> +**     cmp     w\3, w\2, uxtb
> +**     cset    w0, cc
> +**     ret
> +*/
> +
> +
> +unsigned h8 (const unsigned char a) {
> +         return __builtin_popcountg (a) == 1;
> +}
> +
> +/* There should be printing out the costs for h8 and h16's popcount == 1 */
> +/* { dg-final { scan-rtl-dump-times "popcount == 1:" 2 "expand"} } */
> --
> 2.43.0
>
diff mbox series

Patch

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 78997ef056a..4e33db365ac 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -5332,11 +5332,11 @@  expand_POPCOUNT (internal_fn fn, gcall *stmt)
   start_sequence ();
   rtx op0 = expand_normal (arg);
   rtx argm1 = expand_simple_binop (mode, PLUS, op0, constm1_rtx, NULL_RTX,
-				   1, OPTAB_DIRECT);
+				   1, OPTAB_WIDEN);
   if (argm1 == NULL_RTX)
     goto fail;
   rtx argxorargm1 = expand_simple_binop (mode, nonzero_arg ? AND : XOR, op0,
-					 argm1, NULL_RTX, 1, OPTAB_DIRECT);
+					 argm1, NULL_RTX, 1, OPTAB_WIDEN);
   if (argxorargm1 == NULL_RTX)
     goto fail;
   rtx cmp;
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c b/gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c
new file mode 100644
index 00000000000..bb9e2bf0a54
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c
@@ -0,0 +1,45 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-expand-details" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* PR middle-end/116508 */
+
+#pragma GCC target "+nocssc"
+
+/*
+** h16:
+**	sub	w([0-9]+), w0, #1
+**	eor	w([0-9]+), w0, w\1
+**	and	w([0-9]+), w\1, 65535
+**	cmp	w\3, w\2, uxth
+**	cset	w0, cc
+**	ret
+*/
+
+/* when expanding popcount == 1, should use
+   `(arg ^ (arg - 1)) > arg - 1` as that has a lower latency
+   than doing the popcount then comparing against 1.
+   The popcount/addv can be costly. */
+unsigned h16 (const unsigned short a) {
+	  return __builtin_popcountg (a) == 1;
+}
+
+/* unsigned char should also do the same trick */
+/* Currently xfailed since the cost does not take into account the
+   moving between gprs and vector regs correctly. */
+/*
+** h8: { xfail *-*-* }
+**	sub	w([0-9]+), w0, #1
+**	eor	w([0-9]+), w0, w\1
+**	and	w([0-9]+), w\1, 255
+**	cmp	w\3, w\2, uxtb
+**	cset	w0, cc
+**	ret
+*/
+
+
+unsigned h8 (const unsigned char a) {
+	  return __builtin_popcountg (a) == 1;
+}
+
+/* There should be printing out the costs for h8 and h16's popcount == 1 */
+/* { dg-final { scan-rtl-dump-times "popcount == 1:" 2 "expand"} } */