Message ID | 20240816213559.1486438-2-quic_apinski@quicinc.com |
---|---|
State | New |
Headers | show |
Series | [1/2] builtins: Don't expand bit query builtins for __int128_t if the target supports an optab for it | expand |
Andrew Pinski <quic_apinski@quicinc.com> writes: > When CSSC is not enabled, 128bit popcount can be implemented > just via the vector (v16qi) cnt instruction followed by a reduction, > like how the 64bit one is currently implemented instead of > splitting into 2 64bit popcount. > > Build and tested for aarch64-linux-gnu. > > PR target/113042 > > gcc/ChangeLog: > > * config/aarch64/aarch64.md (popcountti2): New define_expand. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/popcnt10.c: New test. > * gcc.target/aarch64/popcnt9.c: New test. OK if there are no other comments in the next 24 hours. Thanks, Richard > > Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com> > --- > gcc/config/aarch64/aarch64.md | 16 +++++++++++++ > gcc/testsuite/gcc.target/aarch64/popcnt10.c | 25 +++++++++++++++++++++ > gcc/testsuite/gcc.target/aarch64/popcnt9.c | 25 +++++++++++++++++++++ > 3 files changed, 66 insertions(+) > create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt10.c > create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt9.c > > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md > index 12dcc16529a..73506e71f43 100644 > --- a/gcc/config/aarch64/aarch64.md > +++ b/gcc/config/aarch64/aarch64.md > @@ -5378,6 +5378,22 @@ (define_expand "popcount<mode>2" > } > }) > > +(define_expand "popcountti2" > + [(set (match_operand:TI 0 "register_operand") > + (popcount:TI (match_operand:TI 1 "register_operand")))] > + "TARGET_SIMD && !TARGET_CSSC" > +{ > + rtx v = gen_reg_rtx (V16QImode); > + rtx v1 = gen_reg_rtx (V16QImode); > + emit_move_insn (v, gen_lowpart (V16QImode, operands[1])); > + emit_insn (gen_popcountv16qi2 (v1, v)); > + rtx out = gen_reg_rtx (DImode); > + emit_insn (gen_aarch64_zero_extenddi_reduc_plus_v16qi (out, v1)); > + out = convert_to_mode (TImode, out, true); > + emit_move_insn (operands[0], out); > + DONE; > +}) > + > (define_insn "clrsb<mode>2" > [(set (match_operand:GPI 0 "register_operand" "=r") > (clrsb:GPI (match_operand:GPI 1 "register_operand" "r")))] > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt10.c b/gcc/testsuite/gcc.target/aarch64/popcnt10.c > new file mode 100644 > index 00000000000..4d01fc67022 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt10.c > @@ -0,0 +1,25 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > +/* { dg-final { check-function-bodies "**" "" } } */ > +/* PR target/113042 */ > + > +#pragma GCC target "+cssc" > + > +/* > +** h128: > +** ldp x([0-9]+), x([0-9]+), \[x0\] > +** cnt x([0-9]+), x([0-9]+) > +** cnt x([0-9]+), x([0-9]+) > +** add w0, w([0-9]+), w([0-9]+) > +** ret > +*/ > + > + > +unsigned h128 (const unsigned __int128 *a) { > + return __builtin_popcountg (a[0]); > +} > + > +/* popcount with CSSC should be split into 2 sections. */ > +/* { dg-final { scan-tree-dump-not "POPCOUNT " "optimized" } } */ > +/* { dg-final { scan-tree-dump-times " __builtin_popcount" 2 "optimized" } } */ > + > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt9.c b/gcc/testsuite/gcc.target/aarch64/popcnt9.c > new file mode 100644 > index 00000000000..c778fc7f420 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt9.c > @@ -0,0 +1,25 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > +/* { dg-final { check-function-bodies "**" "" } } */ > +/* PR target/113042 */ > + > +#pragma GCC target "+nocssc" > + > +/* > +** h128: > +** ldr q([0-9]+), \[x0\] > +** cnt v([0-9]+).16b, v\1.16b > +** addv b([0-9]+), v\2.16b > +** fmov w0, s\3 > +** ret > +*/ > + > + > +unsigned h128 (const unsigned __int128 *a) { > + return __builtin_popcountg (a[0]); > +} > + > +/* There should be only one POPCOUNT. */ > +/* { dg-final { scan-tree-dump-times "POPCOUNT " 1 "optimized" } } */ > +/* { dg-final { scan-tree-dump-not " __builtin_popcount" "optimized" } } */ > +
Richard Sandiford <richard.sandiford@arm.com> writes: > Andrew Pinski <quic_apinski@quicinc.com> writes: >> When CSSC is not enabled, 128bit popcount can be implemented >> just via the vector (v16qi) cnt instruction followed by a reduction, >> like how the 64bit one is currently implemented instead of >> splitting into 2 64bit popcount. >> >> Build and tested for aarch64-linux-gnu. >> >> PR target/113042 >> >> gcc/ChangeLog: >> >> * config/aarch64/aarch64.md (popcountti2): New define_expand. >> >> gcc/testsuite/ChangeLog: >> >> * gcc.target/aarch64/popcnt10.c: New test. >> * gcc.target/aarch64/popcnt9.c: New test. > > OK if there are no other comments in the next 24 hours. Sorry, only thought about it later, but: >> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md >> index 12dcc16529a..73506e71f43 100644 >> --- a/gcc/config/aarch64/aarch64.md >> +++ b/gcc/config/aarch64/aarch64.md >> @@ -5378,6 +5378,22 @@ (define_expand "popcount<mode>2" >> } >> }) >> >> +(define_expand "popcountti2" >> + [(set (match_operand:TI 0 "register_operand") >> + (popcount:TI (match_operand:TI 1 "register_operand")))] Could you try making the output :DI instead of :TI? I'd expect internal-fn.cc to handle that correctly and extend the result to 128 bits where needed. That would make the dummy popcount rtx malformed, so I suppose the pattern should just be: [(match_operand:DI 0 "register_operand") (match_operand:TI 1 "register_operand")] >> + "TARGET_SIMD && !TARGET_CSSC" >> +{ >> + rtx v = gen_reg_rtx (V16QImode); >> + rtx v1 = gen_reg_rtx (V16QImode); >> + emit_move_insn (v, gen_lowpart (V16QImode, operands[1])); >> + emit_insn (gen_popcountv16qi2 (v1, v)); >> + rtx out = gen_reg_rtx (DImode); >> + emit_insn (gen_aarch64_zero_extenddi_reduc_plus_v16qi (out, v1)); We could then use operands[0] directly as the output here. Thanks, Richard >> + out = convert_to_mode (TImode, out, true); >> + emit_move_insn (operands[0], out); >> + DONE; >> +})
On Tue, Aug 20, 2024 at 11:18 AM Richard Sandiford <richard.sandiford@arm.com> wrote: > > Richard Sandiford <richard.sandiford@arm.com> writes: > > Andrew Pinski <quic_apinski@quicinc.com> writes: > >> When CSSC is not enabled, 128bit popcount can be implemented > >> just via the vector (v16qi) cnt instruction followed by a reduction, > >> like how the 64bit one is currently implemented instead of > >> splitting into 2 64bit popcount. > >> > >> Build and tested for aarch64-linux-gnu. > >> > >> PR target/113042 > >> > >> gcc/ChangeLog: > >> > >> * config/aarch64/aarch64.md (popcountti2): New define_expand. > >> > >> gcc/testsuite/ChangeLog: > >> > >> * gcc.target/aarch64/popcnt10.c: New test. > >> * gcc.target/aarch64/popcnt9.c: New test. > > > > OK if there are no other comments in the next 24 hours. > > Sorry, only thought about it later, but: Yes that is a good idea since that would be the same code in the end anyways and it is slightly cleaner. I was not 100% sure if you removed your approval or approved it with the changes so I submitted a new patch here: https://gcc.gnu.org/pipermail/gcc-patches/2024-August/660960.html Thanks, Andrew Pinski > > >> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md > >> index 12dcc16529a..73506e71f43 100644 > >> --- a/gcc/config/aarch64/aarch64.md > >> +++ b/gcc/config/aarch64/aarch64.md > >> @@ -5378,6 +5378,22 @@ (define_expand "popcount<mode>2" > >> } > >> }) > >> > >> +(define_expand "popcountti2" > >> + [(set (match_operand:TI 0 "register_operand") > >> + (popcount:TI (match_operand:TI 1 "register_operand")))] > > Could you try making the output :DI instead of :TI? I'd expect > internal-fn.cc to handle that correctly and extend the result to > 128 bits where needed. > > That would make the dummy popcount rtx malformed, so I suppose > the pattern should just be: > > [(match_operand:DI 0 "register_operand") > (match_operand:TI 1 "register_operand")] > > >> + "TARGET_SIMD && !TARGET_CSSC" > >> +{ > >> + rtx v = gen_reg_rtx (V16QImode); > >> + rtx v1 = gen_reg_rtx (V16QImode); > >> + emit_move_insn (v, gen_lowpart (V16QImode, operands[1])); > >> + emit_insn (gen_popcountv16qi2 (v1, v)); > >> + rtx out = gen_reg_rtx (DImode); > >> + emit_insn (gen_aarch64_zero_extenddi_reduc_plus_v16qi (out, v1)); > > We could then use operands[0] directly as the output here. > > Thanks, > Richard > > >> + out = convert_to_mode (TImode, out, true); > >> + emit_move_insn (operands[0], out); > >> + DONE; > >> +})
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 12dcc16529a..73506e71f43 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -5378,6 +5378,22 @@ (define_expand "popcount<mode>2" } }) +(define_expand "popcountti2" + [(set (match_operand:TI 0 "register_operand") + (popcount:TI (match_operand:TI 1 "register_operand")))] + "TARGET_SIMD && !TARGET_CSSC" +{ + rtx v = gen_reg_rtx (V16QImode); + rtx v1 = gen_reg_rtx (V16QImode); + emit_move_insn (v, gen_lowpart (V16QImode, operands[1])); + emit_insn (gen_popcountv16qi2 (v1, v)); + rtx out = gen_reg_rtx (DImode); + emit_insn (gen_aarch64_zero_extenddi_reduc_plus_v16qi (out, v1)); + out = convert_to_mode (TImode, out, true); + emit_move_insn (operands[0], out); + DONE; +}) + (define_insn "clrsb<mode>2" [(set (match_operand:GPI 0 "register_operand" "=r") (clrsb:GPI (match_operand:GPI 1 "register_operand" "r")))] diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt10.c b/gcc/testsuite/gcc.target/aarch64/popcnt10.c new file mode 100644 index 00000000000..4d01fc67022 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/popcnt10.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ +/* { dg-final { check-function-bodies "**" "" } } */ +/* PR target/113042 */ + +#pragma GCC target "+cssc" + +/* +** h128: +** ldp x([0-9]+), x([0-9]+), \[x0\] +** cnt x([0-9]+), x([0-9]+) +** cnt x([0-9]+), x([0-9]+) +** add w0, w([0-9]+), w([0-9]+) +** ret +*/ + + +unsigned h128 (const unsigned __int128 *a) { + return __builtin_popcountg (a[0]); +} + +/* popcount with CSSC should be split into 2 sections. */ +/* { dg-final { scan-tree-dump-not "POPCOUNT " "optimized" } } */ +/* { dg-final { scan-tree-dump-times " __builtin_popcount" 2 "optimized" } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt9.c b/gcc/testsuite/gcc.target/aarch64/popcnt9.c new file mode 100644 index 00000000000..c778fc7f420 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/popcnt9.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ +/* { dg-final { check-function-bodies "**" "" } } */ +/* PR target/113042 */ + +#pragma GCC target "+nocssc" + +/* +** h128: +** ldr q([0-9]+), \[x0\] +** cnt v([0-9]+).16b, v\1.16b +** addv b([0-9]+), v\2.16b +** fmov w0, s\3 +** ret +*/ + + +unsigned h128 (const unsigned __int128 *a) { + return __builtin_popcountg (a[0]); +} + +/* There should be only one POPCOUNT. */ +/* { dg-final { scan-tree-dump-times "POPCOUNT " 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-not " __builtin_popcount" "optimized" } } */ +
When CSSC is not enabled, 128bit popcount can be implemented just via the vector (v16qi) cnt instruction followed by a reduction, like how the 64bit one is currently implemented instead of splitting into 2 64bit popcount. Build and tested for aarch64-linux-gnu. PR target/113042 gcc/ChangeLog: * config/aarch64/aarch64.md (popcountti2): New define_expand. gcc/testsuite/ChangeLog: * gcc.target/aarch64/popcnt10.c: New test. * gcc.target/aarch64/popcnt9.c: New test. Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com> --- gcc/config/aarch64/aarch64.md | 16 +++++++++++++ gcc/testsuite/gcc.target/aarch64/popcnt10.c | 25 +++++++++++++++++++++ gcc/testsuite/gcc.target/aarch64/popcnt9.c | 25 +++++++++++++++++++++ 3 files changed, 66 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt10.c create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt9.c