diff mbox series

aarch64: Improve popcount for bytes [PR113042]

Message ID 20240610040525.73377-1-quic_apinski@quicinc.com
State New
Headers show
Series aarch64: Improve popcount for bytes [PR113042] | expand

Commit Message

Andrew Pinski June 10, 2024, 4:05 a.m. UTC
For popcount for bytes, we don't need the reduction addition
after the vector cnt instruction as we are only counting one
byte's popcount.
This implements a new define_expand to handle that.

Bootstrapped and tested on aarch64-linux-gnu with no regressions.

	PR target/113042

gcc/ChangeLog:

	* config/aarch64/aarch64.md (popcountqi2): New pattern.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/popcnt5.c: New test.

Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
---
 gcc/config/aarch64/aarch64.md              | 26 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/popcnt5.c | 19 ++++++++++++++++
 2 files changed, 45 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt5.c

Comments

Kyrylo Tkachov June 10, 2024, 7:25 a.m. UTC | #1
Hi Andrew

-----Original Message-----
From: Andrew Pinski <quic_apinski@quicinc.com <mailto:quic_apinski@quicinc.com>>
Date: Monday, 10 June 2024 at 06:05
To: "gcc-patches@gcc.gnu.org <mailto:gcc-patches@gcc.gnu.org>" <gcc-patches@gcc.gnu.org <mailto:gcc-patches@gcc.gnu.org>>
Cc: Andrew Pinski <quic_apinski@quicinc.com <mailto:quic_apinski@quicinc.com>>
Subject: [PATCH] aarch64: Improve popcount for bytes [PR113042]


For popcount for bytes, we don't need the reduction addition
after the vector cnt instruction as we are only counting one
byte's popcount.
This implements a new define_expand to handle that.


Bootstrapped and tested on aarch64-linux-gnu with no regressions.


PR target/113042


gcc/ChangeLog:


* config/aarch64/aarch64.md (popcountqi2): New pattern.


gcc/testsuite/ChangeLog:


* gcc.target/aarch64/popcnt5.c: New test.


Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com <mailto:quic_apinski@quicinc.com>>
---
gcc/config/aarch64/aarch64.md | 26 ++++++++++++++++++++++
gcc/testsuite/gcc.target/aarch64/popcnt5.c | 19 ++++++++++++++++
2 files changed, 45 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt5.c


diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 389a1906e23..ebaf7ec9970 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5358,6 +5358,32 @@ (define_expand "popcount<mode>2"
}
})


+/* Popcount for byte can remove the reduction part after the popcount.
+ For optimization reasons, enabling this for CSSC. */
+(define_expand "popcountqi2"
+ [(set (match_operand:QI 0 "register_operand" "=w")
+ (popcount:QI (match_operand:QI 1 "register_operand" "w")))]
+ "TARGET_CSSC || TARGET_SIMD"
+{
+ rtx in = operands[1];
+ rtx out = operands[0];
+ if (TARGET_CSSC)
+ {
+ rtx tmp = gen_reg_rtx (SImode);
+ rtx out1 = gen_reg_rtx (SImode);
+ emit_insn (gen_zero_extendqisi2 (tmp, in));
+ emit_insn (gen_popcountsi2 (out1, tmp));
+ emit_move_insn (out, gen_lowpart (QImode, out1));
+ DONE;
+ }
+ rtx v = gen_reg_rtx (V8QImode);
+ rtx v1 = gen_reg_rtx (V8QImode);
+ emit_move_insn (v, gen_lowpart (V8QImode, in));
+ emit_insn (gen_popcountv8qi2 (v1, v));
+ emit_move_insn (out, gen_lowpart (QImode, v1));
+ DONE;
+})

TBH I'd rather merge it with the GPI popcount pattern that looks almost identical. You could extend it with the ALLI iterator and handle HImode as well quite easily.
Thanks,
Kyrill


+
(define_insn "clrsb<mode>2"
[(set (match_operand:GPI 0 "register_operand" "=r")
(clrsb:GPI (match_operand:GPI 1 "register_operand" "r")))]
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt5.c b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
new file mode 100644
index 00000000000..406369d9b29
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* PR target/113042 */
+
+#pragma GCC target "+nocssc"
+
+/*
+** h8:
+** ldr b[0-9]+, \[x0\]
+** cnt v[0-9]+.8b, v[0-9]+.8b
+** smov w0, v[0-9]+.b\[0\]
+** ret
+*/
+/* We should not need the addv here since we only need a byte popcount. */
+
+unsigned h8 (const unsigned char *a) {
+ return __builtin_popcountg (a[0]);
+}
--
2.42.0
Andrew Pinski June 10, 2024, 7:25 p.m. UTC | #2
> -----Original Message-----
> From: Kyrylo Tkachov <ktkachov@nvidia.com>
> Sent: Monday, June 10, 2024 12:26 AM
> To: Andrew Pinski (QUIC) <quic_apinski@quicinc.com>; gcc-
> patches@gcc.gnu.org
> Subject: Re: [PATCH] aarch64: Improve popcount for bytes
> [PR113042]
> 
> Hi Andrew
> 
> -----Original Message-----
> From: Andrew Pinski <quic_apinski@quicinc.com
> <mailto:quic_apinski@quicinc.com>>
> Date: Monday, 10 June 2024 at 06:05
> To: "gcc-patches@gcc.gnu.org <mailto:gcc-
> patches@gcc.gnu.org>" <gcc-patches@gcc.gnu.org
> <mailto:gcc-patches@gcc.gnu.org>>
> Cc: Andrew Pinski <quic_apinski@quicinc.com
> <mailto:quic_apinski@quicinc.com>>
> Subject: [PATCH] aarch64: Improve popcount for bytes
> [PR113042]
> 
> 
> For popcount for bytes, we don't need the reduction addition
> after the vector cnt instruction as we are only counting one
> byte's popcount.
> This implements a new define_expand to handle that.
> 
> 
> Bootstrapped and tested on aarch64-linux-gnu with no
> regressions.
> 
> 
> PR target/113042
> 
> 
> gcc/ChangeLog:
> 
> 
> * config/aarch64/aarch64.md (popcountqi2): New pattern.
> 
> 
> gcc/testsuite/ChangeLog:
> 
> 
> * gcc.target/aarch64/popcnt5.c: New test.
> 
> 
> Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com
> <mailto:quic_apinski@quicinc.com>>
> ---
> gcc/config/aarch64/aarch64.md | 26
> ++++++++++++++++++++++
> gcc/testsuite/gcc.target/aarch64/popcnt5.c | 19
> ++++++++++++++++
> 2 files changed, 45 insertions(+)
> create mode 100644
> gcc/testsuite/gcc.target/aarch64/popcnt5.c
> 
> 
> diff --git a/gcc/config/aarch64/aarch64.md
> b/gcc/config/aarch64/aarch64.md index
> 389a1906e23..ebaf7ec9970 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -5358,6 +5358,32 @@ (define_expand
> "popcount<mode>2"
> }
> })
> 
> 
> +/* Popcount for byte can remove the reduction part after the
> popcount.
> + For optimization reasons, enabling this for CSSC. */
> (define_expand
> +"popcountqi2"
> + [(set (match_operand:QI 0 "register_operand" "=w")
> (popcount:QI
> +(match_operand:QI 1 "register_operand" "w")))]
> "TARGET_CSSC ||
> +TARGET_SIMD"
> +{
> + rtx in = operands[1];
> + rtx out = operands[0];
> + if (TARGET_CSSC)
> + {
> + rtx tmp = gen_reg_rtx (SImode);
> + rtx out1 = gen_reg_rtx (SImode);
> + emit_insn (gen_zero_extendqisi2 (tmp, in));  emit_insn
> +(gen_popcountsi2 (out1, tmp));  emit_move_insn (out,
> gen_lowpart
> +(QImode, out1));  DONE;  }  rtx v = gen_reg_rtx (V8QImode);
> rtx v1 =
> +gen_reg_rtx (V8QImode);  emit_move_insn (v, gen_lowpart
> (V8QImode,
> +in));  emit_insn (gen_popcountv8qi2 (v1, v));
> emit_move_insn (out,
> +gen_lowpart (QImode, v1));  DONE;
> +})
> 
> TBH I'd rather merge it with the GPI popcount pattern that
> looks almost identical. You could extend it with the ALLI
> iterator and handle HImode as well quite easily.

I was thinking about that beforehand, but I was trying for the simplified patch at the time.
Anyways I posted the updated version: 
https://gcc.gnu.org/pipermail/gcc-patches/2024-June/654115.html

And it includes the CSSC testcases too to make sure the generated code is correct.

Thanks,
Andrew Pinski



> Thanks,
> Kyrill
> 
> 
> +
> (define_insn "clrsb<mode>2"
> [(set (match_operand:GPI 0 "register_operand" "=r")
> (clrsb:GPI (match_operand:GPI 1 "register_operand" "r")))]
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt5.c
> b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
> new file mode 100644
> index 00000000000..406369d9b29
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* PR target/113042 */
> +
> +#pragma GCC target "+nocssc"
> +
> +/*
> +** h8:
> +** ldr b[0-9]+, \[x0\]
> +** cnt v[0-9]+.8b, v[0-9]+.8b
> +** smov w0, v[0-9]+.b\[0\]
> +** ret
> +*/
> +/* We should not need the addv here since we only need a
> byte popcount.
> +*/
> +
> +unsigned h8 (const unsigned char *a) {
> + return __builtin_popcountg (a[0]);
> +}
> --
> 2.42.0
> 
> 
> 
>
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 389a1906e23..ebaf7ec9970 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5358,6 +5358,32 @@  (define_expand "popcount<mode>2"
     }
 })
 
+/* Popcount for byte can remove the reduction part after the popcount.
+   For optimization reasons, enabling this for CSSC. */
+(define_expand "popcountqi2"
+  [(set (match_operand:QI 0 "register_operand" "=w")
+	(popcount:QI (match_operand:QI 1 "register_operand" "w")))]
+  "TARGET_CSSC || TARGET_SIMD"
+{
+  rtx in = operands[1];
+  rtx out = operands[0];
+  if (TARGET_CSSC)
+    {
+      rtx tmp = gen_reg_rtx (SImode);
+      rtx out1 = gen_reg_rtx (SImode);
+      emit_insn (gen_zero_extendqisi2 (tmp, in));
+      emit_insn (gen_popcountsi2 (out1, tmp));
+      emit_move_insn (out, gen_lowpart (QImode, out1));
+      DONE;
+    }
+  rtx v = gen_reg_rtx (V8QImode);
+  rtx v1 = gen_reg_rtx (V8QImode);
+  emit_move_insn (v, gen_lowpart (V8QImode, in));
+  emit_insn (gen_popcountv8qi2 (v1, v));
+  emit_move_insn (out, gen_lowpart (QImode, v1));
+  DONE;
+})
+
 (define_insn "clrsb<mode>2"
   [(set (match_operand:GPI 0 "register_operand" "=r")
         (clrsb:GPI (match_operand:GPI 1 "register_operand" "r")))]
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt5.c b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
new file mode 100644
index 00000000000..406369d9b29
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
@@ -0,0 +1,19 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* PR target/113042 */
+
+#pragma GCC target "+nocssc"
+
+/*
+** h8:
+**	ldr	b[0-9]+, \[x0\]
+**	cnt	v[0-9]+.8b, v[0-9]+.8b
+**	smov	w0, v[0-9]+.b\[0\]
+**	ret
+*/
+/* We should not need the addv here since we only need a byte popcount. */
+
+unsigned h8 (const unsigned char *a) {
+	  return __builtin_popcountg (a[0]);
+}