diff mbox series

middle-end: also optimized `popcount(a) <= 1` [PR90693]

Message ID 20240830000813.145441-1-quic_apinski@quicinc.com
State New
Headers show
Series middle-end: also optimized `popcount(a) <= 1` [PR90693] | expand

Commit Message

Andrew Pinski Aug. 30, 2024, 12:08 a.m. UTC
This expands on optimizing `popcount(a) == 1` to also handle
`popcount(a) <= 1`. `<= 1` can be expanded as `(a & -a) == 0`
like what is done for `== 1` if we know that a was nonzero.
We have to do the optimization in 2 places due to if we have
an optab entry for popcount or not.

Built and tested for aarch64-linux-gnu.

	PR middle-end/90693

gcc/ChangeLog:

	* internal-fn.cc (expand_POPCOUNT): Handle the second argument
	being `-1` for `<= 1`.
	* tree-ssa-math-opts.cc (match_single_bit_test): Handle LE/GT
	cases.
	(math_opts_dom_walker::after_dom_children): Call match_single_bit_test
	for LE_EXPR/GT_EXPR also.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/popcnt-le-1.c: New test.
	* gcc.target/aarch64/popcnt-le-2.c: New test.
	* gcc.target/aarch64/popcnt-le-3.c: New test.

Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
---
 gcc/internal-fn.cc                            | 20 +++++++++++-
 .../gcc.target/aarch64/popcnt-le-1.c          | 29 +++++++++++++++++
 .../gcc.target/aarch64/popcnt-le-2.c          | 31 +++++++++++++++++++
 .../gcc.target/aarch64/popcnt-le-3.c          | 31 +++++++++++++++++++
 gcc/tree-ssa-math-opts.cc                     | 25 +++++++++++----
 5 files changed, 129 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-le-2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c

Comments

Richard Biener Sept. 9, 2024, 12:11 p.m. UTC | #1
On Fri, Aug 30, 2024 at 2:09 AM Andrew Pinski <quic_apinski@quicinc.com> wrote:
>
> This expands on optimizing `popcount(a) == 1` to also handle
> `popcount(a) <= 1`. `<= 1` can be expanded as `(a & -a) == 0`
> like what is done for `== 1` if we know that a was nonzero.
> We have to do the optimization in 2 places due to if we have
> an optab entry for popcount or not.
>
> Built and tested for aarch64-linux-gnu.

OK.

Thanks,
Richard.

>         PR middle-end/90693
>
> gcc/ChangeLog:
>
>         * internal-fn.cc (expand_POPCOUNT): Handle the second argument
>         being `-1` for `<= 1`.
>         * tree-ssa-math-opts.cc (match_single_bit_test): Handle LE/GT
>         cases.
>         (math_opts_dom_walker::after_dom_children): Call match_single_bit_test
>         for LE_EXPR/GT_EXPR also.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/popcnt-le-1.c: New test.
>         * gcc.target/aarch64/popcnt-le-2.c: New test.
>         * gcc.target/aarch64/popcnt-le-3.c: New test.
>
> Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
> ---
>  gcc/internal-fn.cc                            | 20 +++++++++++-
>  .../gcc.target/aarch64/popcnt-le-1.c          | 29 +++++++++++++++++
>  .../gcc.target/aarch64/popcnt-le-2.c          | 31 +++++++++++++++++++
>  .../gcc.target/aarch64/popcnt-le-3.c          | 31 +++++++++++++++++++
>  gcc/tree-ssa-math-opts.cc                     | 25 +++++++++++----
>  5 files changed, 129 insertions(+), 7 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-le-2.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c
>
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 4e33db365ac..b55f089cf56 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -5304,11 +5304,16 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
>       Use rtx costs in that case to determine if .POPCOUNT (arg) == 1
>       or (arg ^ (arg - 1)) > arg - 1 is cheaper.
>       If .POPCOUNT second argument is 0, we additionally know that arg
> -     is non-zero, so use arg & (arg - 1) == 0 instead.  */
> +     is non-zero, so use arg & (arg - 1) == 0 instead.
> +     If .POPCOUNT second argument is -1, the comparison was either `<= 1`
> +     or `> 1`.  */
>    bool speed_p = optimize_insn_for_speed_p ();
>    tree lhs = gimple_call_lhs (stmt);
>    tree arg = gimple_call_arg (stmt, 0);
>    bool nonzero_arg = integer_zerop (gimple_call_arg (stmt, 1));
> +  bool was_le = integer_minus_onep (gimple_call_arg (stmt, 1));
> +  if (was_le)
> +    nonzero_arg = true;
>    tree type = TREE_TYPE (arg);
>    machine_mode mode = TYPE_MODE (type);
>    machine_mode lhsmode = TYPE_MODE (TREE_TYPE (lhs));
> @@ -5360,10 +5365,23 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
>      emit_insn (popcount_insns);
>    else
>      {
> +      start_sequence ();
>        emit_insn (cmp_insns);
>        plhs = expand_normal (lhs);
>        if (GET_MODE (cmp) != GET_MODE (plhs))
>         cmp = convert_to_mode (GET_MODE (plhs), cmp, 1);
> +      /* For `<= 1`, we need to produce `2 - cmp` or `cmp ? 1 : 2` as that
> +        then gets compared against 1 and we need the false case to be 2.  */
> +      if (was_le)
> +       {
> +         cmp = expand_simple_binop (GET_MODE (cmp), MINUS, const2_rtx,
> +                                    cmp, NULL_RTX, 1, OPTAB_WIDEN);
> +         if (!cmp)
> +           goto fail;
> +       }
>        emit_move_insn (plhs, cmp);
> +      rtx_insn *all_insns = get_insns ();
> +      end_sequence ();
> +      emit_insn (all_insns);
>      }
>  }
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c b/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c
> new file mode 100644
> index 00000000000..b4141da982c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-rtl-expand-details" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* PR middle-end/90693 */
> +
> +#pragma GCC target "+nocssc"
> +
> +/*
> +** le32:
> +**     sub     w([0-9]+), w0, #1
> +**     tst     w0, w\1
> +**     cset    w0, eq
> +**     ret
> +*/
> +
> +unsigned le32 (const unsigned int a) {
> +  return __builtin_popcountg (a) <= 1;
> +}
> +
> +/*
> +** gt32:
> +**     sub     w([0-9]+), w0, #1
> +**     tst     w0, w\1
> +**     cset    w0, ne
> +**     ret
> +*/
> +unsigned gt32 (const unsigned int a) {
> +  return __builtin_popcountg (a) > 1;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-le-2.c b/gcc/testsuite/gcc.target/aarch64/popcnt-le-2.c
> new file mode 100644
> index 00000000000..975552ca63e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt-le-2.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mgeneral-regs-only -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump-not "POPCOUNT \\\(" "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "__builtin_popcount \\\(" "optimized" } } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* PR middle-end/90693 */
> +
> +#pragma GCC target "+nocssc"
> +
> +/*
> +** le32:
> +**     sub     w([0-9]+), w0, #1
> +**     tst     w\1, w0
> +**     cset    w0, eq
> +**     ret
> +*/
> +
> +unsigned le32 (const unsigned int a) {
> +  return __builtin_popcountg (a) <= 1;
> +}
> +
> +/*
> +** gt32:
> +**     sub     w([0-9]+), w0, #1
> +**     tst     w\1, w0
> +**     cset    w0, ne
> +**     ret
> +*/
> +unsigned gt32 (const unsigned int a) {
> +  return __builtin_popcountg (a) > 1;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c b/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c
> new file mode 100644
> index 00000000000..b811e6f6e8f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-rtl-expand-details" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* PR middle-end/90693 */
> +
> +#pragma GCC target "+nocssc"
> +
> +/*
> +** le16:
> +**     sub     w([0-9]+), w0, #1
> +**     and     w([0-9]+), w0, w\1
> +**     tst     w\2, 65535
> +**     cset    w0, eq
> +**     ret
> +*/
> +
> +unsigned le16 (const unsigned short a) {
> +  return __builtin_popcountg (a) <= 1;
> +}
> +
> +/*
> +** gt16:
> +**     sub     w([0-9]+), w0, #1
> +**     and     w([0-9]+), w0, w\1
> +**     tst     w\2, 65535
> +**     cset    w0, ne
> +**     ret
> +*/
> +unsigned gt16 (const unsigned short a) {
> +  return __builtin_popcountg (a) > 1;
> +}
> diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
> index 3c93fca5b53..d61668aacfc 100644
> --- a/gcc/tree-ssa-math-opts.cc
> +++ b/gcc/tree-ssa-math-opts.cc
> @@ -5433,13 +5433,15 @@ match_uaddc_usubc (gimple_stmt_iterator *gsi, gimple *stmt, tree_code code)
>
>  /* Replace .POPCOUNT (x) == 1 or .POPCOUNT (x) != 1 with
>     (x & (x - 1)) > x - 1 or (x & (x - 1)) <= x - 1 if .POPCOUNT
> -   isn't a direct optab.  */
> +   isn't a direct optab.  Also handle `<=`/`>` to be
> +   `x & (x - 1) !=/== x`. */
>
>  static void
>  match_single_bit_test (gimple_stmt_iterator *gsi, gimple *stmt)
>  {
>    tree clhs, crhs;
>    enum tree_code code;
> +  bool was_le = false;
>    if (gimple_code (stmt) == GIMPLE_COND)
>      {
>        clhs = gimple_cond_lhs (stmt);
> @@ -5452,8 +5454,11 @@ match_single_bit_test (gimple_stmt_iterator *gsi, gimple *stmt)
>        crhs = gimple_assign_rhs2 (stmt);
>        code = gimple_assign_rhs_code (stmt);
>      }
> -  if (code != EQ_EXPR && code != NE_EXPR)
> +  if (code != LE_EXPR && code != GT_EXPR
> +      && code != EQ_EXPR && code != NE_EXPR)
>      return;
> +  if (code == LE_EXPR || code == GT_EXPR)
> +    was_le = true;
>    if (TREE_CODE (clhs) != SSA_NAME || !integer_onep (crhs))
>      return;
>    gimple *call = SSA_NAME_DEF_STMT (clhs);
> @@ -5477,8 +5482,9 @@ match_single_bit_test (gimple_stmt_iterator *gsi, gimple *stmt)
>        /* Tell expand_POPCOUNT the popcount result is only used in equality
>          comparison with one, so that it can decide based on rtx costs.  */
>        gimple *g = gimple_build_call_internal (IFN_POPCOUNT, 2, arg,
> -                                             nonzero_arg ? integer_zero_node
> -                                             : integer_one_node);
> +                                             was_le ? integer_minus_one_node
> +                                             : (nonzero_arg ? integer_zero_node
> +                                                : integer_one_node));
>        gimple_call_set_lhs (g, gimple_call_lhs (call));
>        gimple_stmt_iterator gsi2 = gsi_for_stmt (call);
>        gsi_replace (&gsi2, g, true);
> @@ -5489,11 +5495,16 @@ match_single_bit_test (gimple_stmt_iterator *gsi, gimple *stmt)
>                                    build_int_cst (type, -1));
>    gsi_insert_before (gsi, g, GSI_SAME_STMT);
>    g = gimple_build_assign (make_ssa_name (type),
> -                          nonzero_arg ? BIT_AND_EXPR : BIT_XOR_EXPR,
> +                          (nonzero_arg || was_le) ? BIT_AND_EXPR : BIT_XOR_EXPR,
>                            arg, argm1);
>    gsi_insert_before (gsi, g, GSI_SAME_STMT);
>    tree_code cmpcode;
> -  if (nonzero_arg)
> +  if (was_le)
> +    {
> +      argm1 = build_zero_cst (type);
> +      cmpcode = code == LE_EXPR ? EQ_EXPR : NE_EXPR;
> +    }
> +  else if (nonzero_arg)
>      {
>        argm1 = build_zero_cst (type);
>        cmpcode = code;
> @@ -6188,6 +6199,8 @@ math_opts_dom_walker::after_dom_children (basic_block bb)
>
>             case EQ_EXPR:
>             case NE_EXPR:
> +           case LE_EXPR:
> +           case GT_EXPR:
>               match_single_bit_test (&gsi, stmt);
>               break;
>
> --
> 2.43.0
>
diff mbox series

Patch

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 4e33db365ac..b55f089cf56 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -5304,11 +5304,16 @@  expand_POPCOUNT (internal_fn fn, gcall *stmt)
      Use rtx costs in that case to determine if .POPCOUNT (arg) == 1
      or (arg ^ (arg - 1)) > arg - 1 is cheaper.
      If .POPCOUNT second argument is 0, we additionally know that arg
-     is non-zero, so use arg & (arg - 1) == 0 instead.  */
+     is non-zero, so use arg & (arg - 1) == 0 instead.
+     If .POPCOUNT second argument is -1, the comparison was either `<= 1`
+     or `> 1`.  */
   bool speed_p = optimize_insn_for_speed_p ();
   tree lhs = gimple_call_lhs (stmt);
   tree arg = gimple_call_arg (stmt, 0);
   bool nonzero_arg = integer_zerop (gimple_call_arg (stmt, 1));
+  bool was_le = integer_minus_onep (gimple_call_arg (stmt, 1));
+  if (was_le)
+    nonzero_arg = true;
   tree type = TREE_TYPE (arg);
   machine_mode mode = TYPE_MODE (type);
   machine_mode lhsmode = TYPE_MODE (TREE_TYPE (lhs));
@@ -5360,10 +5365,23 @@  expand_POPCOUNT (internal_fn fn, gcall *stmt)
     emit_insn (popcount_insns);
   else
     {
+      start_sequence ();
       emit_insn (cmp_insns);
       plhs = expand_normal (lhs);
       if (GET_MODE (cmp) != GET_MODE (plhs))
 	cmp = convert_to_mode (GET_MODE (plhs), cmp, 1);
+      /* For `<= 1`, we need to produce `2 - cmp` or `cmp ? 1 : 2` as that
+	 then gets compared against 1 and we need the false case to be 2.  */
+      if (was_le)
+	{
+	  cmp = expand_simple_binop (GET_MODE (cmp), MINUS, const2_rtx,
+				     cmp, NULL_RTX, 1, OPTAB_WIDEN);
+	  if (!cmp)
+	    goto fail;
+	}
       emit_move_insn (plhs, cmp);
+      rtx_insn *all_insns = get_insns ();
+      end_sequence ();
+      emit_insn (all_insns);
     }
 }
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c b/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c
new file mode 100644
index 00000000000..b4141da982c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c
@@ -0,0 +1,29 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-expand-details" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* PR middle-end/90693 */
+
+#pragma GCC target "+nocssc"
+
+/*
+** le32:
+**	sub	w([0-9]+), w0, #1
+**	tst	w0, w\1
+**	cset	w0, eq
+**	ret
+*/
+
+unsigned le32 (const unsigned int a) {
+  return __builtin_popcountg (a) <= 1;
+}
+
+/*
+** gt32:
+**	sub	w([0-9]+), w0, #1
+**	tst	w0, w\1
+**	cset	w0, ne
+**	ret
+*/
+unsigned gt32 (const unsigned int a) {
+  return __builtin_popcountg (a) > 1;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-le-2.c b/gcc/testsuite/gcc.target/aarch64/popcnt-le-2.c
new file mode 100644
index 00000000000..975552ca63e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-le-2.c
@@ -0,0 +1,31 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mgeneral-regs-only -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-not "POPCOUNT \\\(" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "__builtin_popcount \\\(" "optimized" } } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* PR middle-end/90693 */
+
+#pragma GCC target "+nocssc"
+
+/*
+** le32:
+**	sub	w([0-9]+), w0, #1
+**	tst	w\1, w0
+**	cset	w0, eq
+**	ret
+*/
+
+unsigned le32 (const unsigned int a) {
+  return __builtin_popcountg (a) <= 1;
+}
+
+/*
+** gt32:
+**	sub	w([0-9]+), w0, #1
+**	tst	w\1, w0
+**	cset	w0, ne
+**	ret
+*/
+unsigned gt32 (const unsigned int a) {
+  return __builtin_popcountg (a) > 1;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c b/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c
new file mode 100644
index 00000000000..b811e6f6e8f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c
@@ -0,0 +1,31 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-expand-details" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* PR middle-end/90693 */
+
+#pragma GCC target "+nocssc"
+
+/*
+** le16:
+**	sub	w([0-9]+), w0, #1
+**	and	w([0-9]+), w0, w\1
+**	tst	w\2, 65535
+**	cset	w0, eq
+**	ret
+*/
+
+unsigned le16 (const unsigned short a) {
+  return __builtin_popcountg (a) <= 1;
+}
+
+/*
+** gt16:
+**	sub	w([0-9]+), w0, #1
+**	and	w([0-9]+), w0, w\1
+**	tst	w\2, 65535
+**	cset	w0, ne
+**	ret
+*/
+unsigned gt16 (const unsigned short a) {
+  return __builtin_popcountg (a) > 1;
+}
diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
index 3c93fca5b53..d61668aacfc 100644
--- a/gcc/tree-ssa-math-opts.cc
+++ b/gcc/tree-ssa-math-opts.cc
@@ -5433,13 +5433,15 @@  match_uaddc_usubc (gimple_stmt_iterator *gsi, gimple *stmt, tree_code code)
 
 /* Replace .POPCOUNT (x) == 1 or .POPCOUNT (x) != 1 with
    (x & (x - 1)) > x - 1 or (x & (x - 1)) <= x - 1 if .POPCOUNT
-   isn't a direct optab.  */
+   isn't a direct optab.  Also handle `<=`/`>` to be
+   `x & (x - 1) !=/== x`. */
 
 static void
 match_single_bit_test (gimple_stmt_iterator *gsi, gimple *stmt)
 {
   tree clhs, crhs;
   enum tree_code code;
+  bool was_le = false;
   if (gimple_code (stmt) == GIMPLE_COND)
     {
       clhs = gimple_cond_lhs (stmt);
@@ -5452,8 +5454,11 @@  match_single_bit_test (gimple_stmt_iterator *gsi, gimple *stmt)
       crhs = gimple_assign_rhs2 (stmt);
       code = gimple_assign_rhs_code (stmt);
     }
-  if (code != EQ_EXPR && code != NE_EXPR)
+  if (code != LE_EXPR && code != GT_EXPR
+      && code != EQ_EXPR && code != NE_EXPR)
     return;
+  if (code == LE_EXPR || code == GT_EXPR)
+    was_le = true;
   if (TREE_CODE (clhs) != SSA_NAME || !integer_onep (crhs))
     return;
   gimple *call = SSA_NAME_DEF_STMT (clhs);
@@ -5477,8 +5482,9 @@  match_single_bit_test (gimple_stmt_iterator *gsi, gimple *stmt)
       /* Tell expand_POPCOUNT the popcount result is only used in equality
 	 comparison with one, so that it can decide based on rtx costs.  */
       gimple *g = gimple_build_call_internal (IFN_POPCOUNT, 2, arg,
-					      nonzero_arg ? integer_zero_node
-					      : integer_one_node);
+					      was_le ? integer_minus_one_node
+					      : (nonzero_arg ? integer_zero_node
+						 : integer_one_node));
       gimple_call_set_lhs (g, gimple_call_lhs (call));
       gimple_stmt_iterator gsi2 = gsi_for_stmt (call);
       gsi_replace (&gsi2, g, true);
@@ -5489,11 +5495,16 @@  match_single_bit_test (gimple_stmt_iterator *gsi, gimple *stmt)
 				   build_int_cst (type, -1));
   gsi_insert_before (gsi, g, GSI_SAME_STMT);
   g = gimple_build_assign (make_ssa_name (type),
-			   nonzero_arg ? BIT_AND_EXPR : BIT_XOR_EXPR,
+			   (nonzero_arg || was_le) ? BIT_AND_EXPR : BIT_XOR_EXPR,
 			   arg, argm1);
   gsi_insert_before (gsi, g, GSI_SAME_STMT);
   tree_code cmpcode;
-  if (nonzero_arg)
+  if (was_le)
+    {
+      argm1 = build_zero_cst (type);
+      cmpcode = code == LE_EXPR ? EQ_EXPR : NE_EXPR;
+    }
+  else if (nonzero_arg)
     {
       argm1 = build_zero_cst (type);
       cmpcode = code;
@@ -6188,6 +6199,8 @@  math_opts_dom_walker::after_dom_children (basic_block bb)
 
 	    case EQ_EXPR:
 	    case NE_EXPR:
+	    case LE_EXPR:
+	    case GT_EXPR:
 	      match_single_bit_test (&gsi, stmt);
 	      break;