diff mbox series

[2/2] aarch64: Implement popcountti2 pattern [PR113042]

Message ID 20240816213559.1486438-2-quic_apinski@quicinc.com
State New
Headers show
Series [1/2] builtins: Don't expand bit query builtins for __int128_t if the target supports an optab for it | expand

Commit Message

Andrew Pinski Aug. 16, 2024, 9:35 p.m. UTC
When CSSC is not enabled, 128bit popcount can be implemented
just via the vector (v16qi) cnt instruction followed by a reduction,
like how the 64bit one is currently implemented instead of
splitting into 2 64bit popcount.

Build and tested for aarch64-linux-gnu.

	PR target/113042

gcc/ChangeLog:

	* config/aarch64/aarch64.md (popcountti2): New define_expand.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/popcnt10.c: New test.
	* gcc.target/aarch64/popcnt9.c: New test.

Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
---
 gcc/config/aarch64/aarch64.md               | 16 +++++++++++++
 gcc/testsuite/gcc.target/aarch64/popcnt10.c | 25 +++++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/popcnt9.c  | 25 +++++++++++++++++++++
 3 files changed, 66 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt10.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt9.c

Comments

Richard Sandiford Aug. 20, 2024, 4:51 p.m. UTC | #1
Andrew Pinski <quic_apinski@quicinc.com> writes:
> When CSSC is not enabled, 128bit popcount can be implemented
> just via the vector (v16qi) cnt instruction followed by a reduction,
> like how the 64bit one is currently implemented instead of
> splitting into 2 64bit popcount.
>
> Build and tested for aarch64-linux-gnu.
>
> 	PR target/113042
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.md (popcountti2): New define_expand.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/popcnt10.c: New test.
> 	* gcc.target/aarch64/popcnt9.c: New test.

OK if there are no other comments in the next 24 hours.

Thanks,
Richard

>
> Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
> ---
>  gcc/config/aarch64/aarch64.md               | 16 +++++++++++++
>  gcc/testsuite/gcc.target/aarch64/popcnt10.c | 25 +++++++++++++++++++++
>  gcc/testsuite/gcc.target/aarch64/popcnt9.c  | 25 +++++++++++++++++++++
>  3 files changed, 66 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt10.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt9.c
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 12dcc16529a..73506e71f43 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -5378,6 +5378,22 @@ (define_expand "popcount<mode>2"
>      }
>  })
>  
> +(define_expand "popcountti2"
> +  [(set (match_operand:TI 0 "register_operand")
> +	(popcount:TI (match_operand:TI 1 "register_operand")))]
> +  "TARGET_SIMD && !TARGET_CSSC"
> +{
> +  rtx v = gen_reg_rtx (V16QImode);
> +  rtx v1 = gen_reg_rtx (V16QImode);
> +  emit_move_insn (v, gen_lowpart (V16QImode, operands[1]));
> +  emit_insn (gen_popcountv16qi2 (v1, v));
> +  rtx out = gen_reg_rtx (DImode);
> +  emit_insn (gen_aarch64_zero_extenddi_reduc_plus_v16qi (out, v1));
> +  out = convert_to_mode (TImode, out, true);
> +  emit_move_insn (operands[0], out);
> +  DONE;
> +})
> +
>  (define_insn "clrsb<mode>2"
>    [(set (match_operand:GPI 0 "register_operand" "=r")
>          (clrsb:GPI (match_operand:GPI 1 "register_operand" "r")))]
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt10.c b/gcc/testsuite/gcc.target/aarch64/popcnt10.c
> new file mode 100644
> index 00000000000..4d01fc67022
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt10.c
> @@ -0,0 +1,25 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* PR target/113042 */
> +
> +#pragma GCC target "+cssc"
> +
> +/*
> +** h128:
> +**	ldp	x([0-9]+), x([0-9]+), \[x0\]
> +**	cnt	x([0-9]+), x([0-9]+)
> +**	cnt	x([0-9]+), x([0-9]+)
> +**	add	w0, w([0-9]+), w([0-9]+)
> +**	ret
> +*/
> +
> +
> +unsigned h128 (const unsigned __int128 *a) {
> +  return __builtin_popcountg (a[0]);
> +}
> +
> +/* popcount with CSSC should be split into 2 sections. */
> +/* { dg-final { scan-tree-dump-not "POPCOUNT " "optimized" } } */
> +/* { dg-final { scan-tree-dump-times " __builtin_popcount" 2 "optimized" } } */
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt9.c b/gcc/testsuite/gcc.target/aarch64/popcnt9.c
> new file mode 100644
> index 00000000000..c778fc7f420
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt9.c
> @@ -0,0 +1,25 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* PR target/113042 */
> +
> +#pragma GCC target "+nocssc"
> +
> +/*
> +** h128:
> +**	ldr	q([0-9]+), \[x0\]
> +**	cnt	v([0-9]+).16b, v\1.16b
> +**	addv	b([0-9]+), v\2.16b
> +**	fmov	w0, s\3
> +**	ret
> +*/
> +
> +
> +unsigned h128 (const unsigned __int128 *a) {
> +	  return __builtin_popcountg (a[0]);
> +}
> +
> +/* There should be only one POPCOUNT. */
> +/* { dg-final { scan-tree-dump-times "POPCOUNT " 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-not " __builtin_popcount"  "optimized" } } */
> +
Richard Sandiford Aug. 20, 2024, 6:17 p.m. UTC | #2
Richard Sandiford <richard.sandiford@arm.com> writes:
> Andrew Pinski <quic_apinski@quicinc.com> writes:
>> When CSSC is not enabled, 128bit popcount can be implemented
>> just via the vector (v16qi) cnt instruction followed by a reduction,
>> like how the 64bit one is currently implemented instead of
>> splitting into 2 64bit popcount.
>>
>> Build and tested for aarch64-linux-gnu.
>>
>> 	PR target/113042
>>
>> gcc/ChangeLog:
>>
>> 	* config/aarch64/aarch64.md (popcountti2): New define_expand.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 	* gcc.target/aarch64/popcnt10.c: New test.
>> 	* gcc.target/aarch64/popcnt9.c: New test.
>
> OK if there are no other comments in the next 24 hours.

Sorry, only thought about it later, but:

>> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
>> index 12dcc16529a..73506e71f43 100644
>> --- a/gcc/config/aarch64/aarch64.md
>> +++ b/gcc/config/aarch64/aarch64.md
>> @@ -5378,6 +5378,22 @@ (define_expand "popcount<mode>2"
>>      }
>>  })
>>  
>> +(define_expand "popcountti2"
>> +  [(set (match_operand:TI 0 "register_operand")
>> +	(popcount:TI (match_operand:TI 1 "register_operand")))]

Could you try making the output :DI instead of :TI?  I'd expect
internal-fn.cc to handle that correctly and extend the result to
128 bits where needed.

That would make the dummy popcount rtx malformed, so I suppose
the pattern should just be:

  [(match_operand:DI 0 "register_operand")
   (match_operand:TI 1 "register_operand")]

>> +  "TARGET_SIMD && !TARGET_CSSC"
>> +{
>> +  rtx v = gen_reg_rtx (V16QImode);
>> +  rtx v1 = gen_reg_rtx (V16QImode);
>> +  emit_move_insn (v, gen_lowpart (V16QImode, operands[1]));
>> +  emit_insn (gen_popcountv16qi2 (v1, v));
>> +  rtx out = gen_reg_rtx (DImode);
>> +  emit_insn (gen_aarch64_zero_extenddi_reduc_plus_v16qi (out, v1));

We could then use operands[0] directly as the output here.

Thanks,
Richard

>> +  out = convert_to_mode (TImode, out, true);
>> +  emit_move_insn (operands[0], out);
>> +  DONE;
>> +})
Andrew Pinski Aug. 21, 2024, 12:24 a.m. UTC | #3
On Tue, Aug 20, 2024 at 11:18 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Sandiford <richard.sandiford@arm.com> writes:
> > Andrew Pinski <quic_apinski@quicinc.com> writes:
> >> When CSSC is not enabled, 128bit popcount can be implemented
> >> just via the vector (v16qi) cnt instruction followed by a reduction,
> >> like how the 64bit one is currently implemented instead of
> >> splitting into 2 64bit popcount.
> >>
> >> Build and tested for aarch64-linux-gnu.
> >>
> >>      PR target/113042
> >>
> >> gcc/ChangeLog:
> >>
> >>      * config/aarch64/aarch64.md (popcountti2): New define_expand.
> >>
> >> gcc/testsuite/ChangeLog:
> >>
> >>      * gcc.target/aarch64/popcnt10.c: New test.
> >>      * gcc.target/aarch64/popcnt9.c: New test.
> >
> > OK if there are no other comments in the next 24 hours.
>
> Sorry, only thought about it later, but:

Yes that is a good idea since that would be the same code in the end
anyways and it is slightly cleaner.
I was not 100% sure if you removed your approval or approved it with
the changes so I submitted a new patch here:
https://gcc.gnu.org/pipermail/gcc-patches/2024-August/660960.html

Thanks,
Andrew Pinski

>
> >> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> >> index 12dcc16529a..73506e71f43 100644
> >> --- a/gcc/config/aarch64/aarch64.md
> >> +++ b/gcc/config/aarch64/aarch64.md
> >> @@ -5378,6 +5378,22 @@ (define_expand "popcount<mode>2"
> >>      }
> >>  })
> >>
> >> +(define_expand "popcountti2"
> >> +  [(set (match_operand:TI 0 "register_operand")
> >> +    (popcount:TI (match_operand:TI 1 "register_operand")))]
>
> Could you try making the output :DI instead of :TI?  I'd expect
> internal-fn.cc to handle that correctly and extend the result to
> 128 bits where needed.
>
> That would make the dummy popcount rtx malformed, so I suppose
> the pattern should just be:
>
>   [(match_operand:DI 0 "register_operand")
>    (match_operand:TI 1 "register_operand")]
>
> >> +  "TARGET_SIMD && !TARGET_CSSC"
> >> +{
> >> +  rtx v = gen_reg_rtx (V16QImode);
> >> +  rtx v1 = gen_reg_rtx (V16QImode);
> >> +  emit_move_insn (v, gen_lowpart (V16QImode, operands[1]));
> >> +  emit_insn (gen_popcountv16qi2 (v1, v));
> >> +  rtx out = gen_reg_rtx (DImode);
> >> +  emit_insn (gen_aarch64_zero_extenddi_reduc_plus_v16qi (out, v1));
>
> We could then use operands[0] directly as the output here.
>
> Thanks,
> Richard
>
> >> +  out = convert_to_mode (TImode, out, true);
> >> +  emit_move_insn (operands[0], out);
> >> +  DONE;
> >> +})
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 12dcc16529a..73506e71f43 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5378,6 +5378,22 @@  (define_expand "popcount<mode>2"
     }
 })
 
+(define_expand "popcountti2"
+  [(set (match_operand:TI 0 "register_operand")
+	(popcount:TI (match_operand:TI 1 "register_operand")))]
+  "TARGET_SIMD && !TARGET_CSSC"
+{
+  rtx v = gen_reg_rtx (V16QImode);
+  rtx v1 = gen_reg_rtx (V16QImode);
+  emit_move_insn (v, gen_lowpart (V16QImode, operands[1]));
+  emit_insn (gen_popcountv16qi2 (v1, v));
+  rtx out = gen_reg_rtx (DImode);
+  emit_insn (gen_aarch64_zero_extenddi_reduc_plus_v16qi (out, v1));
+  out = convert_to_mode (TImode, out, true);
+  emit_move_insn (operands[0], out);
+  DONE;
+})
+
 (define_insn "clrsb<mode>2"
   [(set (match_operand:GPI 0 "register_operand" "=r")
         (clrsb:GPI (match_operand:GPI 1 "register_operand" "r")))]
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt10.c b/gcc/testsuite/gcc.target/aarch64/popcnt10.c
new file mode 100644
index 00000000000..4d01fc67022
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt10.c
@@ -0,0 +1,25 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* PR target/113042 */
+
+#pragma GCC target "+cssc"
+
+/*
+** h128:
+**	ldp	x([0-9]+), x([0-9]+), \[x0\]
+**	cnt	x([0-9]+), x([0-9]+)
+**	cnt	x([0-9]+), x([0-9]+)
+**	add	w0, w([0-9]+), w([0-9]+)
+**	ret
+*/
+
+
+unsigned h128 (const unsigned __int128 *a) {
+  return __builtin_popcountg (a[0]);
+}
+
+/* popcount with CSSC should be split into 2 sections. */
+/* { dg-final { scan-tree-dump-not "POPCOUNT " "optimized" } } */
+/* { dg-final { scan-tree-dump-times " __builtin_popcount" 2 "optimized" } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt9.c b/gcc/testsuite/gcc.target/aarch64/popcnt9.c
new file mode 100644
index 00000000000..c778fc7f420
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt9.c
@@ -0,0 +1,25 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* PR target/113042 */
+
+#pragma GCC target "+nocssc"
+
+/*
+** h128:
+**	ldr	q([0-9]+), \[x0\]
+**	cnt	v([0-9]+).16b, v\1.16b
+**	addv	b([0-9]+), v\2.16b
+**	fmov	w0, s\3
+**	ret
+*/
+
+
+unsigned h128 (const unsigned __int128 *a) {
+	  return __builtin_popcountg (a[0]);
+}
+
+/* There should be only one POPCOUNT. */
+/* { dg-final { scan-tree-dump-times "POPCOUNT " 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-not " __builtin_popcount"  "optimized" } } */
+