Message ID | 20240610192255.402779-1-quic_apinski@quicinc.com |
---|---|
State | New |
Headers | show |
Series | [v2] aarch64: Improve popcount for bytes [PR113042] | expand |
> -----Original Message----- > From: Andrew Pinski (QUIC) <quic_apinski@quicinc.com> > Sent: Monday, June 10, 2024 12:23 PM > To: gcc-patches@gcc.gnu.org > Cc: Andrew Pinski (QUIC) <quic_apinski@quicinc.com> > Subject: [PATCH v2] aarch64: Improve popcount for bytes > [PR113042] > > For popcount for bytes, we don't need the reduction addition > after the vector cnt instruction as we are only counting one > byte's popcount. > This changes the popcount extend to cover all ALLI rather than > GPI. Ping? https://patchwork.sourceware.org/project/gcc/patch/20240610192255.402779-1-quic_apinski@quicinc.com/ Thanks, Andrew Pinski > > Changes since v1: > * v2 - Use ALLI iterator and combine all into one pattern. > Add new testcases popcnt[6-8].c. > > Bootstrapped and tested on aarch64-linux-gnu with no > regressions. > > PR target/113042 > > gcc/ChangeLog: > > * config/aarch64/aarch64.md (popcount<mode>2): > Update pattern > to support ALLI modes. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/popcnt5.c: New test. > * gcc.target/aarch64/popcnt6.c: New test. > > Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com> > --- > gcc/config/aarch64/aarch64.md | 52 > +++++++++++++++++++--- > gcc/testsuite/gcc.target/aarch64/popcnt5.c | 19 ++++++++ > gcc/testsuite/gcc.target/aarch64/popcnt6.c | 19 ++++++++ > gcc/testsuite/gcc.target/aarch64/popcnt7.c | 18 ++++++++ > gcc/testsuite/gcc.target/aarch64/popcnt8.c | 18 ++++++++ > 5 files changed, 119 insertions(+), 7 deletions(-) create mode > 100644 gcc/testsuite/gcc.target/aarch64/popcnt5.c > create mode 100644 > gcc/testsuite/gcc.target/aarch64/popcnt6.c > create mode 100644 > gcc/testsuite/gcc.target/aarch64/popcnt7.c > create mode 100644 > gcc/testsuite/gcc.target/aarch64/popcnt8.c > > diff --git a/gcc/config/aarch64/aarch64.md > b/gcc/config/aarch64/aarch64.md index > 389a1906e23..dd88fd891b5 100644 > --- a/gcc/config/aarch64/aarch64.md > +++ b/gcc/config/aarch64/aarch64.md > @@ -5332,28 +5332,66 @@ (define_insn > "*aarch64_popcount<mode>2_cssc_insn" > ;; MOV w0, v2.b[0] > > (define_expand "popcount<mode>2" > - [(set (match_operand:GPI 0 "register_operand") > - (popcount:GPI (match_operand:GPI 1 > "register_operand")))] > + [(set (match_operand:ALLI 0 "register_operand") > + (popcount:ALLI (match_operand:ALLI 1 > "register_operand")))] > "TARGET_CSSC || TARGET_SIMD" > { > + rtx in = operands[1]; > + rtx out = operands[0]; > + if (TARGET_CSSC > + && (<MODE>mode == HImode > + || <MODE>mode == QImode)) > + { > + rtx tmp = gen_reg_rtx (SImode); > + rtx out1 = gen_reg_rtx (SImode); > + if (<MODE>mode == HImode) > + emit_insn (gen_zero_extendhisi2 (tmp, in)); > + else > + emit_insn (gen_zero_extendqisi2 (tmp, in)); > + emit_insn (gen_popcountsi2 (out1, tmp)); > + emit_move_insn (out, gen_lowpart (<MODE>mode, > out1)); > + DONE; > + } > if (!TARGET_CSSC) > { > rtx v = gen_reg_rtx (V8QImode); > rtx v1 = gen_reg_rtx (V8QImode); > rtx in = operands[1]; > rtx out = operands[0]; > - if(<MODE>mode == SImode) > + /* SImode and HImode should be zero extended to > DImode. */ > + if (<MODE>mode == SImode || <MODE>mode == > HImode) > { > rtx tmp; > tmp = gen_reg_rtx (DImode); > - /* If we have SImode, zero extend to DImode, pop count > does > - not change if we have extra zeros. */ > - emit_insn (gen_zero_extendsidi2 (tmp, in)); > + /* If we have SImode, zero extend to DImode, > + pop count does not change if we have extra zeros. */ > + if (<MODE>mode == SImode) > + emit_insn (gen_zero_extendsidi2 (tmp, in)); > + else > + emit_insn (gen_zero_extendhidi2 (tmp, in)); > in = tmp; > } > emit_move_insn (v, gen_lowpart (V8QImode, in)); > emit_insn (gen_popcountv8qi2 (v1, v)); > - emit_insn > (gen_aarch64_zero_extend<mode>_reduc_plus_v8qi (out, > v1)); > + /* QImode, just extract from the v8qi vector. */ > + if (<MODE>mode == QImode) > + { > + emit_move_insn (out, gen_lowpart (QImode, v1)); > + } > + /* HI and SI, reduction is zero extended to SImode. */ > + else if (<MODE>mode == SImode || <MODE>mode == > HImode) > + { > + rtx out1; > + out1 = gen_reg_rtx (SImode); > + emit_insn (gen_aarch64_zero_extendsi_reduc_plus_v8qi > (out1, v1)); > + emit_move_insn (out, gen_lowpart (<MODE>mode, > out1)); > + } > + /* DImode, reduction is zero extended to DImode. */ > + else > + { > + gcc_assert (<MODE>mode == DImode); > + emit_insn (gen_aarch64_zero_extenddi_reduc_plus_v8qi > (out, v1)); > + } > DONE; > } > }) > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt5.c > b/gcc/testsuite/gcc.target/aarch64/popcnt5.c > new file mode 100644 > index 00000000000..406369d9b29 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt5.c > @@ -0,0 +1,19 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2" } */ > +/* { dg-final { check-function-bodies "**" "" } } */ > +/* PR target/113042 */ > + > +#pragma GCC target "+nocssc" > + > +/* > +** h8: > +** ldr b[0-9]+, \[x0\] > +** cnt v[0-9]+.8b, v[0-9]+.8b > +** smov w0, v[0-9]+.b\[0\] > +** ret > +*/ > +/* We should not need the addv here since we only need a > byte popcount. > +*/ > + > +unsigned h8 (const unsigned char *a) { > + return __builtin_popcountg (a[0]); > +} > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt6.c > b/gcc/testsuite/gcc.target/aarch64/popcnt6.c > new file mode 100644 > index 00000000000..e882cb24126 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt6.c > @@ -0,0 +1,19 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2" } */ > +/* { dg-final { check-function-bodies "**" "" } } */ > +/* PR target/113042 */ > + > +#pragma GCC target "+nocssc" > + > +/* > +** h8: > +** ldr h[0-9]+, \[x0\] > +** cnt v[0-9]+.8b, v[0-9]+.8b > +** addv b[0-9]+, v[0-9]+.8b > +** fmov w0, s[0-9]+ > +** ret > +*/ > + > +unsigned h8 (const unsigned short *a) { > + return __builtin_popcountg (a[0]); > +} > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt7.c > b/gcc/testsuite/gcc.target/aarch64/popcnt7.c > new file mode 100644 > index 00000000000..8dfff211ae4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt7.c > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2" } */ > +/* { dg-final { check-function-bodies "**" "" } } */ > +/* PR target/113042 */ > + > +#pragma GCC target "+cssc" > + > +/* > +** h8: > +** ldrb w[0-9]+, \[x0\] > +** cnt w[0-9]+, w[0-9]+ > +** ret > +*/ > +/* We should not produce any extra zero extend for this code > */ > + > +unsigned h8 (const unsigned char *a) { > + return __builtin_popcountg (a[0]); > +} > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt8.c > b/gcc/testsuite/gcc.target/aarch64/popcnt8.c > new file mode 100644 > index 00000000000..66a88b6a929 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt8.c > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2" } */ > +/* { dg-final { check-function-bodies "**" "" } } */ > +/* PR target/113042 */ > + > +#pragma GCC target "+cssc" > + > +/* > +** h8: > +** ldrh w[0-9]+, \[x0\] > +** cnt w[0-9]+, w[0-9]+ > +** ret > +*/ > +/* We should not produce any extra zero extend for this code > */ > + > +unsigned h8 (const unsigned short *a) { > + return __builtin_popcountg (a[0]); > +} > -- > 2.42.0
Andrew Pinski <quic_apinski@quicinc.com> writes: > For popcount for bytes, we don't need the reduction addition > after the vector cnt instruction as we are only counting one > byte's popcount. > This changes the popcount extend to cover all ALLI rather than GPI. > > Changes since v1: > * v2 - Use ALLI iterator and combine all into one pattern. > Add new testcases popcnt[6-8].c. > > Bootstrapped and tested on aarch64-linux-gnu with no regressions. > > PR target/113042 > > gcc/ChangeLog: > > * config/aarch64/aarch64.md (popcount<mode>2): Update pattern > to support ALLI modes. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/popcnt5.c: New test. > * gcc.target/aarch64/popcnt6.c: New test. > > Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com> > --- > gcc/config/aarch64/aarch64.md | 52 +++++++++++++++++++--- > gcc/testsuite/gcc.target/aarch64/popcnt5.c | 19 ++++++++ > gcc/testsuite/gcc.target/aarch64/popcnt6.c | 19 ++++++++ > gcc/testsuite/gcc.target/aarch64/popcnt7.c | 18 ++++++++ > gcc/testsuite/gcc.target/aarch64/popcnt8.c | 18 ++++++++ > 5 files changed, 119 insertions(+), 7 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt5.c > create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt6.c > create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt7.c > create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt8.c Sorry for the slow review. > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md > index 389a1906e23..dd88fd891b5 100644 > --- a/gcc/config/aarch64/aarch64.md > +++ b/gcc/config/aarch64/aarch64.md > @@ -5332,28 +5332,66 @@ (define_insn "*aarch64_popcount<mode>2_cssc_insn" > ;; MOV w0, v2.b[0] > > (define_expand "popcount<mode>2" > - [(set (match_operand:GPI 0 "register_operand") > - (popcount:GPI (match_operand:GPI 1 "register_operand")))] > + [(set (match_operand:ALLI 0 "register_operand") > + (popcount:ALLI (match_operand:ALLI 1 "register_operand")))] > "TARGET_CSSC || TARGET_SIMD" Could we restrict this to: TARET_CSSC ? GET_MODE_BITSIZE (<MODE>mode) >= 32 : TARGET_SIMD > { > + rtx in = operands[1]; > + rtx out = operands[0]; > + if (TARGET_CSSC > + && (<MODE>mode == HImode > + || <MODE>mode == QImode)) > + { > + rtx tmp = gen_reg_rtx (SImode); > + rtx out1 = gen_reg_rtx (SImode); > + if (<MODE>mode == HImode) > + emit_insn (gen_zero_extendhisi2 (tmp, in)); > + else > + emit_insn (gen_zero_extendqisi2 (tmp, in)); > + emit_insn (gen_popcountsi2 (out1, tmp)); > + emit_move_insn (out, gen_lowpart (<MODE>mode, out1)); > + DONE; > + } ...and then skip this part (including the rtx in/out)? It should be what target-independent code would do. > if (!TARGET_CSSC) > { > rtx v = gen_reg_rtx (V8QImode); > rtx v1 = gen_reg_rtx (V8QImode); > rtx in = operands[1]; > rtx out = operands[0]; > - if(<MODE>mode == SImode) > + /* SImode and HImode should be zero extended to DImode. */ > + if (<MODE>mode == SImode || <MODE>mode == HImode) > { > rtx tmp; > tmp = gen_reg_rtx (DImode); > - /* If we have SImode, zero extend to DImode, pop count does > - not change if we have extra zeros. */ > - emit_insn (gen_zero_extendsidi2 (tmp, in)); > + /* If we have SImode, zero extend to DImode, > + pop count does not change if we have extra zeros. */ The doubled comment seems redundant. How about making the first one: /* SImode and HImode should be zero extended to DImode. popcount does not change if we have extra zeros. */ and deleting the second comment? > + if (<MODE>mode == SImode) > + emit_insn (gen_zero_extendsidi2 (tmp, in)); > + else > + emit_insn (gen_zero_extendhidi2 (tmp, in)); > in = tmp; I think the if body can be replaced with: in = convert_to_mode (DImode, in, true); > } > emit_move_insn (v, gen_lowpart (V8QImode, in)); > emit_insn (gen_popcountv8qi2 (v1, v)); > - emit_insn (gen_aarch64_zero_extend<mode>_reduc_plus_v8qi (out, v1)); > + /* QImode, just extract from the v8qi vector. */ > + if (<MODE>mode == QImode) > + { > + emit_move_insn (out, gen_lowpart (QImode, v1)); > + } Nit: redundant braces. > + /* HI and SI, reduction is zero extended to SImode. */ > + else if (<MODE>mode == SImode || <MODE>mode == HImode) > + { > + rtx out1; > + out1 = gen_reg_rtx (SImode); Pre-existing, but: no need for two separate statements. Thanks, Richard > + emit_insn (gen_aarch64_zero_extendsi_reduc_plus_v8qi (out1, v1)); > + emit_move_insn (out, gen_lowpart (<MODE>mode, out1)); > + } > + /* DImode, reduction is zero extended to DImode. */ > + else > + { > + gcc_assert (<MODE>mode == DImode); > + emit_insn (gen_aarch64_zero_extenddi_reduc_plus_v8qi (out, v1)); > + } > DONE; > } > }) > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt5.c b/gcc/testsuite/gcc.target/aarch64/popcnt5.c > new file mode 100644 > index 00000000000..406369d9b29 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt5.c > @@ -0,0 +1,19 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2" } */ > +/* { dg-final { check-function-bodies "**" "" } } */ > +/* PR target/113042 */ > + > +#pragma GCC target "+nocssc" > + > +/* > +** h8: > +** ldr b[0-9]+, \[x0\] > +** cnt v[0-9]+.8b, v[0-9]+.8b > +** smov w0, v[0-9]+.b\[0\] > +** ret > +*/ > +/* We should not need the addv here since we only need a byte popcount. */ > + > +unsigned h8 (const unsigned char *a) { > + return __builtin_popcountg (a[0]); > +} > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt6.c b/gcc/testsuite/gcc.target/aarch64/popcnt6.c > new file mode 100644 > index 00000000000..e882cb24126 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt6.c > @@ -0,0 +1,19 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2" } */ > +/* { dg-final { check-function-bodies "**" "" } } */ > +/* PR target/113042 */ > + > +#pragma GCC target "+nocssc" > + > +/* > +** h8: > +** ldr h[0-9]+, \[x0\] > +** cnt v[0-9]+.8b, v[0-9]+.8b > +** addv b[0-9]+, v[0-9]+.8b > +** fmov w0, s[0-9]+ > +** ret > +*/ > + > +unsigned h8 (const unsigned short *a) { > + return __builtin_popcountg (a[0]); > +} > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt7.c b/gcc/testsuite/gcc.target/aarch64/popcnt7.c > new file mode 100644 > index 00000000000..8dfff211ae4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt7.c > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2" } */ > +/* { dg-final { check-function-bodies "**" "" } } */ > +/* PR target/113042 */ > + > +#pragma GCC target "+cssc" > + > +/* > +** h8: > +** ldrb w[0-9]+, \[x0\] > +** cnt w[0-9]+, w[0-9]+ > +** ret > +*/ > +/* We should not produce any extra zero extend for this code */ > + > +unsigned h8 (const unsigned char *a) { > + return __builtin_popcountg (a[0]); > +} > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt8.c b/gcc/testsuite/gcc.target/aarch64/popcnt8.c > new file mode 100644 > index 00000000000..66a88b6a929 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt8.c > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2" } */ > +/* { dg-final { check-function-bodies "**" "" } } */ > +/* PR target/113042 */ > + > +#pragma GCC target "+cssc" > + > +/* > +** h8: > +** ldrh w[0-9]+, \[x0\] > +** cnt w[0-9]+, w[0-9]+ > +** ret > +*/ > +/* We should not produce any extra zero extend for this code */ > + > +unsigned h8 (const unsigned short *a) { > + return __builtin_popcountg (a[0]); > +}
On Wed, Aug 14, 2024 at 2:21 PM Richard Sandiford <richard.sandiford@arm.com> wrote: > > Andrew Pinski <quic_apinski@quicinc.com> writes: > > For popcount for bytes, we don't need the reduction addition > > after the vector cnt instruction as we are only counting one > > byte's popcount. > > This changes the popcount extend to cover all ALLI rather than GPI. > > > > Changes since v1: > > * v2 - Use ALLI iterator and combine all into one pattern. > > Add new testcases popcnt[6-8].c. > > > > Bootstrapped and tested on aarch64-linux-gnu with no regressions. > > > > PR target/113042 > > > > gcc/ChangeLog: > > > > * config/aarch64/aarch64.md (popcount<mode>2): Update pattern > > to support ALLI modes. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/aarch64/popcnt5.c: New test. > > * gcc.target/aarch64/popcnt6.c: New test. > > > > Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com> > > --- > > gcc/config/aarch64/aarch64.md | 52 +++++++++++++++++++--- > > gcc/testsuite/gcc.target/aarch64/popcnt5.c | 19 ++++++++ > > gcc/testsuite/gcc.target/aarch64/popcnt6.c | 19 ++++++++ > > gcc/testsuite/gcc.target/aarch64/popcnt7.c | 18 ++++++++ > > gcc/testsuite/gcc.target/aarch64/popcnt8.c | 18 ++++++++ > > 5 files changed, 119 insertions(+), 7 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt5.c > > create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt6.c > > create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt7.c > > create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt8.c > > Sorry for the slow review. > > > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md > > index 389a1906e23..dd88fd891b5 100644 > > --- a/gcc/config/aarch64/aarch64.md > > +++ b/gcc/config/aarch64/aarch64.md > > @@ -5332,28 +5332,66 @@ (define_insn "*aarch64_popcount<mode>2_cssc_insn" > > ;; MOV w0, v2.b[0] > > > > (define_expand "popcount<mode>2" > > - [(set (match_operand:GPI 0 "register_operand") > > - (popcount:GPI (match_operand:GPI 1 "register_operand")))] > > + [(set (match_operand:ALLI 0 "register_operand") > > + (popcount:ALLI (match_operand:ALLI 1 "register_operand")))] > > "TARGET_CSSC || TARGET_SIMD" > > Could we restrict this to: > > TARET_CSSC > ? GET_MODE_BITSIZE (<MODE>mode) >= 32 > : TARGET_SIMD > > > { > > + rtx in = operands[1]; > > + rtx out = operands[0]; > > + if (TARGET_CSSC > > + && (<MODE>mode == HImode > > + || <MODE>mode == QImode)) > > + { > > + rtx tmp = gen_reg_rtx (SImode); > > + rtx out1 = gen_reg_rtx (SImode); > > + if (<MODE>mode == HImode) > > + emit_insn (gen_zero_extendhisi2 (tmp, in)); > > + else > > + emit_insn (gen_zero_extendqisi2 (tmp, in)); > > + emit_insn (gen_popcountsi2 (out1, tmp)); > > + emit_move_insn (out, gen_lowpart (<MODE>mode, out1)); > > + DONE; > > + } > > ...and then skip this part (including the rtx in/out)? It should be > what target-independent code would do. > > > if (!TARGET_CSSC) > > { > > rtx v = gen_reg_rtx (V8QImode); > > rtx v1 = gen_reg_rtx (V8QImode); > > rtx in = operands[1]; > > rtx out = operands[0]; > > - if(<MODE>mode == SImode) > > + /* SImode and HImode should be zero extended to DImode. */ > > + if (<MODE>mode == SImode || <MODE>mode == HImode) > > { > > rtx tmp; > > tmp = gen_reg_rtx (DImode); > > - /* If we have SImode, zero extend to DImode, pop count does > > - not change if we have extra zeros. */ > > - emit_insn (gen_zero_extendsidi2 (tmp, in)); > > + /* If we have SImode, zero extend to DImode, > > + pop count does not change if we have extra zeros. */ > > The doubled comment seems redundant. How about making the first one: > > /* SImode and HImode should be zero extended to DImode. > popcount does not change if we have extra zeros. */ > > and deleting the second comment? > > > + if (<MODE>mode == SImode) > > + emit_insn (gen_zero_extendsidi2 (tmp, in)); > > + else > > + emit_insn (gen_zero_extendhidi2 (tmp, in)); > > in = tmp; > > I think the if body can be replaced with: > > in = convert_to_mode (DImode, in, true); > > > } > > emit_move_insn (v, gen_lowpart (V8QImode, in)); > > emit_insn (gen_popcountv8qi2 (v1, v)); > > - emit_insn (gen_aarch64_zero_extend<mode>_reduc_plus_v8qi (out, v1)); > > + /* QImode, just extract from the v8qi vector. */ > > + if (<MODE>mode == QImode) > > + { > > + emit_move_insn (out, gen_lowpart (QImode, v1)); > > + } > > Nit: redundant braces. > > > + /* HI and SI, reduction is zero extended to SImode. */ > > + else if (<MODE>mode == SImode || <MODE>mode == HImode) > > + { > > + rtx out1; > > + out1 = gen_reg_rtx (SImode); > > Pre-existing, but: no need for two separate statements. Updated patch posted: https://gcc.gnu.org/pipermail/gcc-patches/2024-August/660473.html Thanks, Andrew > > Thanks, > Richard > > > + emit_insn (gen_aarch64_zero_extendsi_reduc_plus_v8qi (out1, v1)); > > + emit_move_insn (out, gen_lowpart (<MODE>mode, out1)); > > + } > > + /* DImode, reduction is zero extended to DImode. */ > > + else > > + { > > + gcc_assert (<MODE>mode == DImode); > > + emit_insn (gen_aarch64_zero_extenddi_reduc_plus_v8qi (out, v1)); > > + } > > DONE; > > } > > }) > > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt5.c b/gcc/testsuite/gcc.target/aarch64/popcnt5.c > > new file mode 100644 > > index 00000000000..406369d9b29 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt5.c > > @@ -0,0 +1,19 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2" } */ > > +/* { dg-final { check-function-bodies "**" "" } } */ > > +/* PR target/113042 */ > > + > > +#pragma GCC target "+nocssc" > > + > > +/* > > +** h8: > > +** ldr b[0-9]+, \[x0\] > > +** cnt v[0-9]+.8b, v[0-9]+.8b > > +** smov w0, v[0-9]+.b\[0\] > > +** ret > > +*/ > > +/* We should not need the addv here since we only need a byte popcount. */ > > + > > +unsigned h8 (const unsigned char *a) { > > + return __builtin_popcountg (a[0]); > > +} > > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt6.c b/gcc/testsuite/gcc.target/aarch64/popcnt6.c > > new file mode 100644 > > index 00000000000..e882cb24126 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt6.c > > @@ -0,0 +1,19 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2" } */ > > +/* { dg-final { check-function-bodies "**" "" } } */ > > +/* PR target/113042 */ > > + > > +#pragma GCC target "+nocssc" > > + > > +/* > > +** h8: > > +** ldr h[0-9]+, \[x0\] > > +** cnt v[0-9]+.8b, v[0-9]+.8b > > +** addv b[0-9]+, v[0-9]+.8b > > +** fmov w0, s[0-9]+ > > +** ret > > +*/ > > + > > +unsigned h8 (const unsigned short *a) { > > + return __builtin_popcountg (a[0]); > > +} > > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt7.c b/gcc/testsuite/gcc.target/aarch64/popcnt7.c > > new file mode 100644 > > index 00000000000..8dfff211ae4 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt7.c > > @@ -0,0 +1,18 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2" } */ > > +/* { dg-final { check-function-bodies "**" "" } } */ > > +/* PR target/113042 */ > > + > > +#pragma GCC target "+cssc" > > + > > +/* > > +** h8: > > +** ldrb w[0-9]+, \[x0\] > > +** cnt w[0-9]+, w[0-9]+ > > +** ret > > +*/ > > +/* We should not produce any extra zero extend for this code */ > > + > > +unsigned h8 (const unsigned char *a) { > > + return __builtin_popcountg (a[0]); > > +} > > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt8.c b/gcc/testsuite/gcc.target/aarch64/popcnt8.c > > new file mode 100644 > > index 00000000000..66a88b6a929 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt8.c > > @@ -0,0 +1,18 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2" } */ > > +/* { dg-final { check-function-bodies "**" "" } } */ > > +/* PR target/113042 */ > > + > > +#pragma GCC target "+cssc" > > + > > +/* > > +** h8: > > +** ldrh w[0-9]+, \[x0\] > > +** cnt w[0-9]+, w[0-9]+ > > +** ret > > +*/ > > +/* We should not produce any extra zero extend for this code */ > > + > > +unsigned h8 (const unsigned short *a) { > > + return __builtin_popcountg (a[0]); > > +}
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 389a1906e23..dd88fd891b5 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -5332,28 +5332,66 @@ (define_insn "*aarch64_popcount<mode>2_cssc_insn" ;; MOV w0, v2.b[0] (define_expand "popcount<mode>2" - [(set (match_operand:GPI 0 "register_operand") - (popcount:GPI (match_operand:GPI 1 "register_operand")))] + [(set (match_operand:ALLI 0 "register_operand") + (popcount:ALLI (match_operand:ALLI 1 "register_operand")))] "TARGET_CSSC || TARGET_SIMD" { + rtx in = operands[1]; + rtx out = operands[0]; + if (TARGET_CSSC + && (<MODE>mode == HImode + || <MODE>mode == QImode)) + { + rtx tmp = gen_reg_rtx (SImode); + rtx out1 = gen_reg_rtx (SImode); + if (<MODE>mode == HImode) + emit_insn (gen_zero_extendhisi2 (tmp, in)); + else + emit_insn (gen_zero_extendqisi2 (tmp, in)); + emit_insn (gen_popcountsi2 (out1, tmp)); + emit_move_insn (out, gen_lowpart (<MODE>mode, out1)); + DONE; + } if (!TARGET_CSSC) { rtx v = gen_reg_rtx (V8QImode); rtx v1 = gen_reg_rtx (V8QImode); rtx in = operands[1]; rtx out = operands[0]; - if(<MODE>mode == SImode) + /* SImode and HImode should be zero extended to DImode. */ + if (<MODE>mode == SImode || <MODE>mode == HImode) { rtx tmp; tmp = gen_reg_rtx (DImode); - /* If we have SImode, zero extend to DImode, pop count does - not change if we have extra zeros. */ - emit_insn (gen_zero_extendsidi2 (tmp, in)); + /* If we have SImode, zero extend to DImode, + pop count does not change if we have extra zeros. */ + if (<MODE>mode == SImode) + emit_insn (gen_zero_extendsidi2 (tmp, in)); + else + emit_insn (gen_zero_extendhidi2 (tmp, in)); in = tmp; } emit_move_insn (v, gen_lowpart (V8QImode, in)); emit_insn (gen_popcountv8qi2 (v1, v)); - emit_insn (gen_aarch64_zero_extend<mode>_reduc_plus_v8qi (out, v1)); + /* QImode, just extract from the v8qi vector. */ + if (<MODE>mode == QImode) + { + emit_move_insn (out, gen_lowpart (QImode, v1)); + } + /* HI and SI, reduction is zero extended to SImode. */ + else if (<MODE>mode == SImode || <MODE>mode == HImode) + { + rtx out1; + out1 = gen_reg_rtx (SImode); + emit_insn (gen_aarch64_zero_extendsi_reduc_plus_v8qi (out1, v1)); + emit_move_insn (out, gen_lowpart (<MODE>mode, out1)); + } + /* DImode, reduction is zero extended to DImode. */ + else + { + gcc_assert (<MODE>mode == DImode); + emit_insn (gen_aarch64_zero_extenddi_reduc_plus_v8qi (out, v1)); + } DONE; } }) diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt5.c b/gcc/testsuite/gcc.target/aarch64/popcnt5.c new file mode 100644 index 00000000000..406369d9b29 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/popcnt5.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-final { check-function-bodies "**" "" } } */ +/* PR target/113042 */ + +#pragma GCC target "+nocssc" + +/* +** h8: +** ldr b[0-9]+, \[x0\] +** cnt v[0-9]+.8b, v[0-9]+.8b +** smov w0, v[0-9]+.b\[0\] +** ret +*/ +/* We should not need the addv here since we only need a byte popcount. */ + +unsigned h8 (const unsigned char *a) { + return __builtin_popcountg (a[0]); +} diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt6.c b/gcc/testsuite/gcc.target/aarch64/popcnt6.c new file mode 100644 index 00000000000..e882cb24126 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/popcnt6.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-final { check-function-bodies "**" "" } } */ +/* PR target/113042 */ + +#pragma GCC target "+nocssc" + +/* +** h8: +** ldr h[0-9]+, \[x0\] +** cnt v[0-9]+.8b, v[0-9]+.8b +** addv b[0-9]+, v[0-9]+.8b +** fmov w0, s[0-9]+ +** ret +*/ + +unsigned h8 (const unsigned short *a) { + return __builtin_popcountg (a[0]); +} diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt7.c b/gcc/testsuite/gcc.target/aarch64/popcnt7.c new file mode 100644 index 00000000000..8dfff211ae4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/popcnt7.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-final { check-function-bodies "**" "" } } */ +/* PR target/113042 */ + +#pragma GCC target "+cssc" + +/* +** h8: +** ldrb w[0-9]+, \[x0\] +** cnt w[0-9]+, w[0-9]+ +** ret +*/ +/* We should not produce any extra zero extend for this code */ + +unsigned h8 (const unsigned char *a) { + return __builtin_popcountg (a[0]); +} diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt8.c b/gcc/testsuite/gcc.target/aarch64/popcnt8.c new file mode 100644 index 00000000000..66a88b6a929 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/popcnt8.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-final { check-function-bodies "**" "" } } */ +/* PR target/113042 */ + +#pragma GCC target "+cssc" + +/* +** h8: +** ldrh w[0-9]+, \[x0\] +** cnt w[0-9]+, w[0-9]+ +** ret +*/ +/* We should not produce any extra zero extend for this code */ + +unsigned h8 (const unsigned short *a) { + return __builtin_popcountg (a[0]); +}
For popcount for bytes, we don't need the reduction addition after the vector cnt instruction as we are only counting one byte's popcount. This changes the popcount extend to cover all ALLI rather than GPI. Changes since v1: * v2 - Use ALLI iterator and combine all into one pattern. Add new testcases popcnt[6-8].c. Bootstrapped and tested on aarch64-linux-gnu with no regressions. PR target/113042 gcc/ChangeLog: * config/aarch64/aarch64.md (popcount<mode>2): Update pattern to support ALLI modes. gcc/testsuite/ChangeLog: * gcc.target/aarch64/popcnt5.c: New test. * gcc.target/aarch64/popcnt6.c: New test. Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com> --- gcc/config/aarch64/aarch64.md | 52 +++++++++++++++++++--- gcc/testsuite/gcc.target/aarch64/popcnt5.c | 19 ++++++++ gcc/testsuite/gcc.target/aarch64/popcnt6.c | 19 ++++++++ gcc/testsuite/gcc.target/aarch64/popcnt7.c | 18 ++++++++ gcc/testsuite/gcc.target/aarch64/popcnt8.c | 18 ++++++++ 5 files changed, 119 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt5.c create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt6.c create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt7.c create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt8.c