diff mbox series

[v2] aarch64: Improve popcount for bytes [PR113042]

Message ID 20240610192255.402779-1-quic_apinski@quicinc.com
State New
Headers show
Series [v2] aarch64: Improve popcount for bytes [PR113042] | expand

Commit Message

Andrew Pinski June 10, 2024, 7:22 p.m. UTC
For popcount for bytes, we don't need the reduction addition
after the vector cnt instruction as we are only counting one
byte's popcount.
This changes the popcount extend to cover all ALLI rather than GPI.

Changes since v1:
* v2 - Use ALLI iterator and combine all into one pattern.
       Add new testcases popcnt[6-8].c.

Bootstrapped and tested on aarch64-linux-gnu with no regressions.

	PR target/113042

gcc/ChangeLog:

	* config/aarch64/aarch64.md (popcount<mode>2): Update pattern
	to support ALLI modes.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/popcnt5.c: New test.
	* gcc.target/aarch64/popcnt6.c: New test.

Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
---
 gcc/config/aarch64/aarch64.md              | 52 +++++++++++++++++++---
 gcc/testsuite/gcc.target/aarch64/popcnt5.c | 19 ++++++++
 gcc/testsuite/gcc.target/aarch64/popcnt6.c | 19 ++++++++
 gcc/testsuite/gcc.target/aarch64/popcnt7.c | 18 ++++++++
 gcc/testsuite/gcc.target/aarch64/popcnt8.c | 18 ++++++++
 5 files changed, 119 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt5.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt6.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt7.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt8.c

Comments

Andrew Pinski Aug. 14, 2024, 6:13 a.m. UTC | #1
> -----Original Message-----
> From: Andrew Pinski (QUIC) <quic_apinski@quicinc.com>
> Sent: Monday, June 10, 2024 12:23 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Andrew Pinski (QUIC) <quic_apinski@quicinc.com>
> Subject: [PATCH v2] aarch64: Improve popcount for bytes
> [PR113042]
> 
> For popcount for bytes, we don't need the reduction addition
> after the vector cnt instruction as we are only counting one
> byte's popcount.
> This changes the popcount extend to cover all ALLI rather than
> GPI.

Ping? 
https://patchwork.sourceware.org/project/gcc/patch/20240610192255.402779-1-quic_apinski@quicinc.com/

Thanks,
Andrew Pinski

> 
> Changes since v1:
> * v2 - Use ALLI iterator and combine all into one pattern.
>        Add new testcases popcnt[6-8].c.
> 
> Bootstrapped and tested on aarch64-linux-gnu with no
> regressions.
> 
> 	PR target/113042
> 
> gcc/ChangeLog:
> 
> 	* config/aarch64/aarch64.md (popcount<mode>2):
> Update pattern
> 	to support ALLI modes.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/aarch64/popcnt5.c: New test.
> 	* gcc.target/aarch64/popcnt6.c: New test.
> 
> Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
> ---
>  gcc/config/aarch64/aarch64.md              | 52
> +++++++++++++++++++---
>  gcc/testsuite/gcc.target/aarch64/popcnt5.c | 19 ++++++++
> gcc/testsuite/gcc.target/aarch64/popcnt6.c | 19 ++++++++
> gcc/testsuite/gcc.target/aarch64/popcnt7.c | 18 ++++++++
> gcc/testsuite/gcc.target/aarch64/popcnt8.c | 18 ++++++++
>  5 files changed, 119 insertions(+), 7 deletions(-)  create mode
> 100644 gcc/testsuite/gcc.target/aarch64/popcnt5.c
>  create mode 100644
> gcc/testsuite/gcc.target/aarch64/popcnt6.c
>  create mode 100644
> gcc/testsuite/gcc.target/aarch64/popcnt7.c
>  create mode 100644
> gcc/testsuite/gcc.target/aarch64/popcnt8.c
> 
> diff --git a/gcc/config/aarch64/aarch64.md
> b/gcc/config/aarch64/aarch64.md index
> 389a1906e23..dd88fd891b5 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -5332,28 +5332,66 @@ (define_insn
> "*aarch64_popcount<mode>2_cssc_insn"
>  ;; MOV	w0, v2.b[0]
> 
>  (define_expand "popcount<mode>2"
> -  [(set (match_operand:GPI 0 "register_operand")
> -	(popcount:GPI (match_operand:GPI 1
> "register_operand")))]
> +  [(set (match_operand:ALLI 0 "register_operand")
> +	(popcount:ALLI (match_operand:ALLI 1
> "register_operand")))]
>    "TARGET_CSSC || TARGET_SIMD"
>  {
> +  rtx in = operands[1];
> +  rtx out = operands[0];
> +  if (TARGET_CSSC
> +      && (<MODE>mode == HImode
> +          || <MODE>mode == QImode))
> +    {
> +      rtx tmp = gen_reg_rtx (SImode);
> +      rtx out1 = gen_reg_rtx (SImode);
> +      if (<MODE>mode == HImode)
> +        emit_insn (gen_zero_extendhisi2 (tmp, in));
> +      else
> +        emit_insn (gen_zero_extendqisi2 (tmp, in));
> +      emit_insn (gen_popcountsi2 (out1, tmp));
> +      emit_move_insn (out, gen_lowpart (<MODE>mode,
> out1));
> +      DONE;
> +    }
>    if (!TARGET_CSSC)
>      {
>        rtx v = gen_reg_rtx (V8QImode);
>        rtx v1 = gen_reg_rtx (V8QImode);
>        rtx in = operands[1];
>        rtx out = operands[0];
> -      if(<MODE>mode == SImode)
> +      /* SImode and HImode should be zero extended to
> DImode. */
> +      if (<MODE>mode == SImode || <MODE>mode ==
> HImode)
>  	{
>  	  rtx tmp;
>  	  tmp = gen_reg_rtx (DImode);
> -	  /* If we have SImode, zero extend to DImode, pop count
> does
> -	     not change if we have extra zeros. */
> -	  emit_insn (gen_zero_extendsidi2 (tmp, in));
> +	  /* If we have SImode, zero extend to DImode,
> +	     pop count does not change if we have extra zeros. */
> +	  if (<MODE>mode == SImode)
> +	    emit_insn (gen_zero_extendsidi2 (tmp, in));
> +	  else
> +	    emit_insn (gen_zero_extendhidi2 (tmp, in));
>  	  in = tmp;
>  	}
>        emit_move_insn (v, gen_lowpart (V8QImode, in));
>        emit_insn (gen_popcountv8qi2 (v1, v));
> -      emit_insn
> (gen_aarch64_zero_extend<mode>_reduc_plus_v8qi (out,
> v1));
> +      /* QImode, just extract from the v8qi vector.  */
> +      if (<MODE>mode == QImode)
> +	{
> +	  emit_move_insn (out, gen_lowpart (QImode, v1));
> +	}
> +      /* HI and SI, reduction is zero extended to SImode. */
> +      else if (<MODE>mode == SImode || <MODE>mode ==
> HImode)
> +	{
> +	  rtx out1;
> +	  out1 = gen_reg_rtx (SImode);
> +	  emit_insn (gen_aarch64_zero_extendsi_reduc_plus_v8qi
> (out1, v1));
> +	  emit_move_insn (out, gen_lowpart (<MODE>mode,
> out1));
> +	}
> +      /* DImode, reduction is zero extended to DImode. */
> +      else
> +	{
> +	  gcc_assert (<MODE>mode == DImode);
> +	  emit_insn (gen_aarch64_zero_extenddi_reduc_plus_v8qi
> (out, v1));
> +	}
>        DONE;
>      }
>  })
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt5.c
> b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
> new file mode 100644
> index 00000000000..406369d9b29
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* PR target/113042 */
> +
> +#pragma GCC target "+nocssc"
> +
> +/*
> +** h8:
> +**	ldr	b[0-9]+, \[x0\]
> +**	cnt	v[0-9]+.8b, v[0-9]+.8b
> +**	smov	w0, v[0-9]+.b\[0\]
> +**	ret
> +*/
> +/* We should not need the addv here since we only need a
> byte popcount.
> +*/
> +
> +unsigned h8 (const unsigned char *a) {
> +	  return __builtin_popcountg (a[0]);
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt6.c
> b/gcc/testsuite/gcc.target/aarch64/popcnt6.c
> new file mode 100644
> index 00000000000..e882cb24126
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt6.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* PR target/113042 */
> +
> +#pragma GCC target "+nocssc"
> +
> +/*
> +** h8:
> +**	ldr	h[0-9]+, \[x0\]
> +**	cnt	v[0-9]+.8b, v[0-9]+.8b
> +**	addv	b[0-9]+, v[0-9]+.8b
> +**	fmov	w0, s[0-9]+
> +**	ret
> +*/
> +
> +unsigned h8 (const unsigned short *a) {
> +	  return __builtin_popcountg (a[0]);
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt7.c
> b/gcc/testsuite/gcc.target/aarch64/popcnt7.c
> new file mode 100644
> index 00000000000..8dfff211ae4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt7.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* PR target/113042 */
> +
> +#pragma GCC target "+cssc"
> +
> +/*
> +** h8:
> +**	ldrb	w[0-9]+, \[x0\]
> +**	cnt	w[0-9]+, w[0-9]+
> +**	ret
> +*/
> +/* We should not produce any extra zero extend for this code
> */
> +
> +unsigned h8 (const unsigned char *a) {
> +	  return __builtin_popcountg (a[0]);
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt8.c
> b/gcc/testsuite/gcc.target/aarch64/popcnt8.c
> new file mode 100644
> index 00000000000..66a88b6a929
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt8.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* PR target/113042 */
> +
> +#pragma GCC target "+cssc"
> +
> +/*
> +** h8:
> +**	ldrh	w[0-9]+, \[x0\]
> +**	cnt	w[0-9]+, w[0-9]+
> +**	ret
> +*/
> +/* We should not produce any extra zero extend for this code
> */
> +
> +unsigned h8 (const unsigned short *a) {
> +	  return __builtin_popcountg (a[0]);
> +}
> --
> 2.42.0
Richard Sandiford Aug. 14, 2024, 9:20 p.m. UTC | #2
Andrew Pinski <quic_apinski@quicinc.com> writes:
> For popcount for bytes, we don't need the reduction addition
> after the vector cnt instruction as we are only counting one
> byte's popcount.
> This changes the popcount extend to cover all ALLI rather than GPI.
>
> Changes since v1:
> * v2 - Use ALLI iterator and combine all into one pattern.
>        Add new testcases popcnt[6-8].c.
>
> Bootstrapped and tested on aarch64-linux-gnu with no regressions.
>
> 	PR target/113042
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.md (popcount<mode>2): Update pattern
> 	to support ALLI modes.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/popcnt5.c: New test.
> 	* gcc.target/aarch64/popcnt6.c: New test.
>
> Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
> ---
>  gcc/config/aarch64/aarch64.md              | 52 +++++++++++++++++++---
>  gcc/testsuite/gcc.target/aarch64/popcnt5.c | 19 ++++++++
>  gcc/testsuite/gcc.target/aarch64/popcnt6.c | 19 ++++++++
>  gcc/testsuite/gcc.target/aarch64/popcnt7.c | 18 ++++++++
>  gcc/testsuite/gcc.target/aarch64/popcnt8.c | 18 ++++++++
>  5 files changed, 119 insertions(+), 7 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt5.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt6.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt7.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt8.c

Sorry for the slow review.

> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 389a1906e23..dd88fd891b5 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -5332,28 +5332,66 @@ (define_insn "*aarch64_popcount<mode>2_cssc_insn"
>  ;; MOV	w0, v2.b[0]
>  
>  (define_expand "popcount<mode>2"
> -  [(set (match_operand:GPI 0 "register_operand")
> -	(popcount:GPI (match_operand:GPI 1 "register_operand")))]
> +  [(set (match_operand:ALLI 0 "register_operand")
> +	(popcount:ALLI (match_operand:ALLI 1 "register_operand")))]
>    "TARGET_CSSC || TARGET_SIMD"

Could we restrict this to:

  TARET_CSSC
  ? GET_MODE_BITSIZE (<MODE>mode) >= 32
  : TARGET_SIMD

>  {
> +  rtx in = operands[1];
> +  rtx out = operands[0];
> +  if (TARGET_CSSC
> +      && (<MODE>mode == HImode
> +          || <MODE>mode == QImode))
> +    {
> +      rtx tmp = gen_reg_rtx (SImode);
> +      rtx out1 = gen_reg_rtx (SImode);
> +      if (<MODE>mode == HImode)
> +        emit_insn (gen_zero_extendhisi2 (tmp, in));
> +      else
> +        emit_insn (gen_zero_extendqisi2 (tmp, in));
> +      emit_insn (gen_popcountsi2 (out1, tmp));
> +      emit_move_insn (out, gen_lowpart (<MODE>mode, out1));
> +      DONE;
> +    }

...and then skip this part (including the rtx in/out)?  It should be
what target-independent code would do.

>    if (!TARGET_CSSC)
>      {
>        rtx v = gen_reg_rtx (V8QImode);
>        rtx v1 = gen_reg_rtx (V8QImode);
>        rtx in = operands[1];
>        rtx out = operands[0];
> -      if(<MODE>mode == SImode)
> +      /* SImode and HImode should be zero extended to DImode. */
> +      if (<MODE>mode == SImode || <MODE>mode == HImode)
>  	{
>  	  rtx tmp;
>  	  tmp = gen_reg_rtx (DImode);
> -	  /* If we have SImode, zero extend to DImode, pop count does
> -	     not change if we have extra zeros. */
> -	  emit_insn (gen_zero_extendsidi2 (tmp, in));
> +	  /* If we have SImode, zero extend to DImode,
> +	     pop count does not change if we have extra zeros. */

The doubled comment seems redundant.  How about making the first one:

      /* SImode and HImode should be zero extended to DImode.
         popcount does not change if we have extra zeros.  */

and deleting the second comment?

> +	  if (<MODE>mode == SImode)
> +	    emit_insn (gen_zero_extendsidi2 (tmp, in));
> +	  else
> +	    emit_insn (gen_zero_extendhidi2 (tmp, in));
>  	  in = tmp;

I think the if body can be replaced with:

  in = convert_to_mode (DImode, in, true);

>  	}
>        emit_move_insn (v, gen_lowpart (V8QImode, in));
>        emit_insn (gen_popcountv8qi2 (v1, v));
> -      emit_insn (gen_aarch64_zero_extend<mode>_reduc_plus_v8qi (out, v1));
> +      /* QImode, just extract from the v8qi vector.  */
> +      if (<MODE>mode == QImode)
> +	{
> +	  emit_move_insn (out, gen_lowpart (QImode, v1));
> +	}

Nit: redundant braces.

> +      /* HI and SI, reduction is zero extended to SImode. */
> +      else if (<MODE>mode == SImode || <MODE>mode == HImode)
> +	{
> +	  rtx out1;
> +	  out1 = gen_reg_rtx (SImode);

Pre-existing, but: no need for two separate statements.

Thanks,
Richard

> +	  emit_insn (gen_aarch64_zero_extendsi_reduc_plus_v8qi (out1, v1));
> +	  emit_move_insn (out, gen_lowpart (<MODE>mode, out1));
> +	}
> +      /* DImode, reduction is zero extended to DImode. */
> +      else
> +	{
> +	  gcc_assert (<MODE>mode == DImode);
> +	  emit_insn (gen_aarch64_zero_extenddi_reduc_plus_v8qi (out, v1));
> +	}
>        DONE;
>      }
>  })
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt5.c b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
> new file mode 100644
> index 00000000000..406369d9b29
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* PR target/113042 */
> +
> +#pragma GCC target "+nocssc"
> +
> +/*
> +** h8:
> +**	ldr	b[0-9]+, \[x0\]
> +**	cnt	v[0-9]+.8b, v[0-9]+.8b
> +**	smov	w0, v[0-9]+.b\[0\]
> +**	ret
> +*/
> +/* We should not need the addv here since we only need a byte popcount. */
> +
> +unsigned h8 (const unsigned char *a) {
> +	  return __builtin_popcountg (a[0]);
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt6.c b/gcc/testsuite/gcc.target/aarch64/popcnt6.c
> new file mode 100644
> index 00000000000..e882cb24126
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt6.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* PR target/113042 */
> +
> +#pragma GCC target "+nocssc"
> +
> +/*
> +** h8:
> +**	ldr	h[0-9]+, \[x0\]
> +**	cnt	v[0-9]+.8b, v[0-9]+.8b
> +**	addv	b[0-9]+, v[0-9]+.8b
> +**	fmov	w0, s[0-9]+
> +**	ret
> +*/
> +
> +unsigned h8 (const unsigned short *a) {
> +	  return __builtin_popcountg (a[0]);
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt7.c b/gcc/testsuite/gcc.target/aarch64/popcnt7.c
> new file mode 100644
> index 00000000000..8dfff211ae4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt7.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* PR target/113042 */
> +
> +#pragma GCC target "+cssc"
> +
> +/*
> +** h8:
> +**	ldrb	w[0-9]+, \[x0\]
> +**	cnt	w[0-9]+, w[0-9]+
> +**	ret
> +*/
> +/* We should not produce any extra zero extend for this code */
> +
> +unsigned h8 (const unsigned char *a) {
> +	  return __builtin_popcountg (a[0]);
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt8.c b/gcc/testsuite/gcc.target/aarch64/popcnt8.c
> new file mode 100644
> index 00000000000..66a88b6a929
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt8.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* PR target/113042 */
> +
> +#pragma GCC target "+cssc"
> +
> +/*
> +** h8:
> +**	ldrh	w[0-9]+, \[x0\]
> +**	cnt	w[0-9]+, w[0-9]+
> +**	ret
> +*/
> +/* We should not produce any extra zero extend for this code */
> +
> +unsigned h8 (const unsigned short *a) {
> +	  return __builtin_popcountg (a[0]);
> +}
Andrew Pinski Aug. 15, 2024, 5:06 a.m. UTC | #3
On Wed, Aug 14, 2024 at 2:21 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Andrew Pinski <quic_apinski@quicinc.com> writes:
> > For popcount for bytes, we don't need the reduction addition
> > after the vector cnt instruction as we are only counting one
> > byte's popcount.
> > This changes the popcount extend to cover all ALLI rather than GPI.
> >
> > Changes since v1:
> > * v2 - Use ALLI iterator and combine all into one pattern.
> >        Add new testcases popcnt[6-8].c.
> >
> > Bootstrapped and tested on aarch64-linux-gnu with no regressions.
> >
> >       PR target/113042
> >
> > gcc/ChangeLog:
> >
> >       * config/aarch64/aarch64.md (popcount<mode>2): Update pattern
> >       to support ALLI modes.
> >
> > gcc/testsuite/ChangeLog:
> >
> >       * gcc.target/aarch64/popcnt5.c: New test.
> >       * gcc.target/aarch64/popcnt6.c: New test.
> >
> > Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
> > ---
> >  gcc/config/aarch64/aarch64.md              | 52 +++++++++++++++++++---
> >  gcc/testsuite/gcc.target/aarch64/popcnt5.c | 19 ++++++++
> >  gcc/testsuite/gcc.target/aarch64/popcnt6.c | 19 ++++++++
> >  gcc/testsuite/gcc.target/aarch64/popcnt7.c | 18 ++++++++
> >  gcc/testsuite/gcc.target/aarch64/popcnt8.c | 18 ++++++++
> >  5 files changed, 119 insertions(+), 7 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt5.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt6.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt7.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt8.c
>
> Sorry for the slow review.
>
> > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> > index 389a1906e23..dd88fd891b5 100644
> > --- a/gcc/config/aarch64/aarch64.md
> > +++ b/gcc/config/aarch64/aarch64.md
> > @@ -5332,28 +5332,66 @@ (define_insn "*aarch64_popcount<mode>2_cssc_insn"
> >  ;; MOV       w0, v2.b[0]
> >
> >  (define_expand "popcount<mode>2"
> > -  [(set (match_operand:GPI 0 "register_operand")
> > -     (popcount:GPI (match_operand:GPI 1 "register_operand")))]
> > +  [(set (match_operand:ALLI 0 "register_operand")
> > +     (popcount:ALLI (match_operand:ALLI 1 "register_operand")))]
> >    "TARGET_CSSC || TARGET_SIMD"
>
> Could we restrict this to:
>
>   TARET_CSSC
>   ? GET_MODE_BITSIZE (<MODE>mode) >= 32
>   : TARGET_SIMD
>
> >  {
> > +  rtx in = operands[1];
> > +  rtx out = operands[0];
> > +  if (TARGET_CSSC
> > +      && (<MODE>mode == HImode
> > +          || <MODE>mode == QImode))
> > +    {
> > +      rtx tmp = gen_reg_rtx (SImode);
> > +      rtx out1 = gen_reg_rtx (SImode);
> > +      if (<MODE>mode == HImode)
> > +        emit_insn (gen_zero_extendhisi2 (tmp, in));
> > +      else
> > +        emit_insn (gen_zero_extendqisi2 (tmp, in));
> > +      emit_insn (gen_popcountsi2 (out1, tmp));
> > +      emit_move_insn (out, gen_lowpart (<MODE>mode, out1));
> > +      DONE;
> > +    }
>
> ...and then skip this part (including the rtx in/out)?  It should be
> what target-independent code would do.
>
> >    if (!TARGET_CSSC)
> >      {
> >        rtx v = gen_reg_rtx (V8QImode);
> >        rtx v1 = gen_reg_rtx (V8QImode);
> >        rtx in = operands[1];
> >        rtx out = operands[0];
> > -      if(<MODE>mode == SImode)
> > +      /* SImode and HImode should be zero extended to DImode. */
> > +      if (<MODE>mode == SImode || <MODE>mode == HImode)
> >       {
> >         rtx tmp;
> >         tmp = gen_reg_rtx (DImode);
> > -       /* If we have SImode, zero extend to DImode, pop count does
> > -          not change if we have extra zeros. */
> > -       emit_insn (gen_zero_extendsidi2 (tmp, in));
> > +       /* If we have SImode, zero extend to DImode,
> > +          pop count does not change if we have extra zeros. */
>
> The doubled comment seems redundant.  How about making the first one:
>
>       /* SImode and HImode should be zero extended to DImode.
>          popcount does not change if we have extra zeros.  */
>
> and deleting the second comment?
>
> > +       if (<MODE>mode == SImode)
> > +         emit_insn (gen_zero_extendsidi2 (tmp, in));
> > +       else
> > +         emit_insn (gen_zero_extendhidi2 (tmp, in));
> >         in = tmp;
>
> I think the if body can be replaced with:
>
>   in = convert_to_mode (DImode, in, true);
>
> >       }
> >        emit_move_insn (v, gen_lowpart (V8QImode, in));
> >        emit_insn (gen_popcountv8qi2 (v1, v));
> > -      emit_insn (gen_aarch64_zero_extend<mode>_reduc_plus_v8qi (out, v1));
> > +      /* QImode, just extract from the v8qi vector.  */
> > +      if (<MODE>mode == QImode)
> > +     {
> > +       emit_move_insn (out, gen_lowpart (QImode, v1));
> > +     }
>
> Nit: redundant braces.
>
> > +      /* HI and SI, reduction is zero extended to SImode. */
> > +      else if (<MODE>mode == SImode || <MODE>mode == HImode)
> > +     {
> > +       rtx out1;
> > +       out1 = gen_reg_rtx (SImode);
>
> Pre-existing, but: no need for two separate statements.

Updated patch posted:
https://gcc.gnu.org/pipermail/gcc-patches/2024-August/660473.html

Thanks,
Andrew

>
> Thanks,
> Richard
>
> > +       emit_insn (gen_aarch64_zero_extendsi_reduc_plus_v8qi (out1, v1));
> > +       emit_move_insn (out, gen_lowpart (<MODE>mode, out1));
> > +     }
> > +      /* DImode, reduction is zero extended to DImode. */
> > +      else
> > +     {
> > +       gcc_assert (<MODE>mode == DImode);
> > +       emit_insn (gen_aarch64_zero_extenddi_reduc_plus_v8qi (out, v1));
> > +     }
> >        DONE;
> >      }
> >  })
> > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt5.c b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
> > new file mode 100644
> > index 00000000000..406369d9b29
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
> > @@ -0,0 +1,19 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +/* { dg-final { check-function-bodies "**" "" } } */
> > +/* PR target/113042 */
> > +
> > +#pragma GCC target "+nocssc"
> > +
> > +/*
> > +** h8:
> > +**   ldr     b[0-9]+, \[x0\]
> > +**   cnt     v[0-9]+.8b, v[0-9]+.8b
> > +**   smov    w0, v[0-9]+.b\[0\]
> > +**   ret
> > +*/
> > +/* We should not need the addv here since we only need a byte popcount. */
> > +
> > +unsigned h8 (const unsigned char *a) {
> > +       return __builtin_popcountg (a[0]);
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt6.c b/gcc/testsuite/gcc.target/aarch64/popcnt6.c
> > new file mode 100644
> > index 00000000000..e882cb24126
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt6.c
> > @@ -0,0 +1,19 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +/* { dg-final { check-function-bodies "**" "" } } */
> > +/* PR target/113042 */
> > +
> > +#pragma GCC target "+nocssc"
> > +
> > +/*
> > +** h8:
> > +**   ldr     h[0-9]+, \[x0\]
> > +**   cnt     v[0-9]+.8b, v[0-9]+.8b
> > +**   addv    b[0-9]+, v[0-9]+.8b
> > +**   fmov    w0, s[0-9]+
> > +**   ret
> > +*/
> > +
> > +unsigned h8 (const unsigned short *a) {
> > +       return __builtin_popcountg (a[0]);
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt7.c b/gcc/testsuite/gcc.target/aarch64/popcnt7.c
> > new file mode 100644
> > index 00000000000..8dfff211ae4
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt7.c
> > @@ -0,0 +1,18 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +/* { dg-final { check-function-bodies "**" "" } } */
> > +/* PR target/113042 */
> > +
> > +#pragma GCC target "+cssc"
> > +
> > +/*
> > +** h8:
> > +**   ldrb    w[0-9]+, \[x0\]
> > +**   cnt     w[0-9]+, w[0-9]+
> > +**   ret
> > +*/
> > +/* We should not produce any extra zero extend for this code */
> > +
> > +unsigned h8 (const unsigned char *a) {
> > +       return __builtin_popcountg (a[0]);
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt8.c b/gcc/testsuite/gcc.target/aarch64/popcnt8.c
> > new file mode 100644
> > index 00000000000..66a88b6a929
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt8.c
> > @@ -0,0 +1,18 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +/* { dg-final { check-function-bodies "**" "" } } */
> > +/* PR target/113042 */
> > +
> > +#pragma GCC target "+cssc"
> > +
> > +/*
> > +** h8:
> > +**   ldrh    w[0-9]+, \[x0\]
> > +**   cnt     w[0-9]+, w[0-9]+
> > +**   ret
> > +*/
> > +/* We should not produce any extra zero extend for this code */
> > +
> > +unsigned h8 (const unsigned short *a) {
> > +       return __builtin_popcountg (a[0]);
> > +}
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 389a1906e23..dd88fd891b5 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5332,28 +5332,66 @@  (define_insn "*aarch64_popcount<mode>2_cssc_insn"
 ;; MOV	w0, v2.b[0]
 
 (define_expand "popcount<mode>2"
-  [(set (match_operand:GPI 0 "register_operand")
-	(popcount:GPI (match_operand:GPI 1 "register_operand")))]
+  [(set (match_operand:ALLI 0 "register_operand")
+	(popcount:ALLI (match_operand:ALLI 1 "register_operand")))]
   "TARGET_CSSC || TARGET_SIMD"
 {
+  rtx in = operands[1];
+  rtx out = operands[0];
+  if (TARGET_CSSC
+      && (<MODE>mode == HImode
+          || <MODE>mode == QImode))
+    {
+      rtx tmp = gen_reg_rtx (SImode);
+      rtx out1 = gen_reg_rtx (SImode);
+      if (<MODE>mode == HImode)
+        emit_insn (gen_zero_extendhisi2 (tmp, in));
+      else
+        emit_insn (gen_zero_extendqisi2 (tmp, in));
+      emit_insn (gen_popcountsi2 (out1, tmp));
+      emit_move_insn (out, gen_lowpart (<MODE>mode, out1));
+      DONE;
+    }
   if (!TARGET_CSSC)
     {
       rtx v = gen_reg_rtx (V8QImode);
       rtx v1 = gen_reg_rtx (V8QImode);
       rtx in = operands[1];
       rtx out = operands[0];
-      if(<MODE>mode == SImode)
+      /* SImode and HImode should be zero extended to DImode. */
+      if (<MODE>mode == SImode || <MODE>mode == HImode)
 	{
 	  rtx tmp;
 	  tmp = gen_reg_rtx (DImode);
-	  /* If we have SImode, zero extend to DImode, pop count does
-	     not change if we have extra zeros. */
-	  emit_insn (gen_zero_extendsidi2 (tmp, in));
+	  /* If we have SImode, zero extend to DImode,
+	     pop count does not change if we have extra zeros. */
+	  if (<MODE>mode == SImode)
+	    emit_insn (gen_zero_extendsidi2 (tmp, in));
+	  else
+	    emit_insn (gen_zero_extendhidi2 (tmp, in));
 	  in = tmp;
 	}
       emit_move_insn (v, gen_lowpart (V8QImode, in));
       emit_insn (gen_popcountv8qi2 (v1, v));
-      emit_insn (gen_aarch64_zero_extend<mode>_reduc_plus_v8qi (out, v1));
+      /* QImode, just extract from the v8qi vector.  */
+      if (<MODE>mode == QImode)
+	{
+	  emit_move_insn (out, gen_lowpart (QImode, v1));
+	}
+      /* HI and SI, reduction is zero extended to SImode. */
+      else if (<MODE>mode == SImode || <MODE>mode == HImode)
+	{
+	  rtx out1;
+	  out1 = gen_reg_rtx (SImode);
+	  emit_insn (gen_aarch64_zero_extendsi_reduc_plus_v8qi (out1, v1));
+	  emit_move_insn (out, gen_lowpart (<MODE>mode, out1));
+	}
+      /* DImode, reduction is zero extended to DImode. */
+      else
+	{
+	  gcc_assert (<MODE>mode == DImode);
+	  emit_insn (gen_aarch64_zero_extenddi_reduc_plus_v8qi (out, v1));
+	}
       DONE;
     }
 })
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt5.c b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
new file mode 100644
index 00000000000..406369d9b29
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
@@ -0,0 +1,19 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* PR target/113042 */
+
+#pragma GCC target "+nocssc"
+
+/*
+** h8:
+**	ldr	b[0-9]+, \[x0\]
+**	cnt	v[0-9]+.8b, v[0-9]+.8b
+**	smov	w0, v[0-9]+.b\[0\]
+**	ret
+*/
+/* We should not need the addv here since we only need a byte popcount. */
+
+unsigned h8 (const unsigned char *a) {
+	  return __builtin_popcountg (a[0]);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt6.c b/gcc/testsuite/gcc.target/aarch64/popcnt6.c
new file mode 100644
index 00000000000..e882cb24126
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt6.c
@@ -0,0 +1,19 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* PR target/113042 */
+
+#pragma GCC target "+nocssc"
+
+/*
+** h8:
+**	ldr	h[0-9]+, \[x0\]
+**	cnt	v[0-9]+.8b, v[0-9]+.8b
+**	addv	b[0-9]+, v[0-9]+.8b
+**	fmov	w0, s[0-9]+
+**	ret
+*/
+
+unsigned h8 (const unsigned short *a) {
+	  return __builtin_popcountg (a[0]);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt7.c b/gcc/testsuite/gcc.target/aarch64/popcnt7.c
new file mode 100644
index 00000000000..8dfff211ae4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt7.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* PR target/113042 */
+
+#pragma GCC target "+cssc"
+
+/*
+** h8:
+**	ldrb	w[0-9]+, \[x0\]
+**	cnt	w[0-9]+, w[0-9]+
+**	ret
+*/
+/* We should not produce any extra zero extend for this code */
+
+unsigned h8 (const unsigned char *a) {
+	  return __builtin_popcountg (a[0]);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt8.c b/gcc/testsuite/gcc.target/aarch64/popcnt8.c
new file mode 100644
index 00000000000..66a88b6a929
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt8.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* PR target/113042 */
+
+#pragma GCC target "+cssc"
+
+/*
+** h8:
+**	ldrh	w[0-9]+, \[x0\]
+**	cnt	w[0-9]+, w[0-9]+
+**	ret
+*/
+/* We should not produce any extra zero extend for this code */
+
+unsigned h8 (const unsigned short *a) {
+	  return __builtin_popcountg (a[0]);
+}