| Message ID | CAMZc-bwSDvTwsiwf3QO+pJ+3pr3iciWsrGQddqw35TigVicPsg@mail.gmail.com |
|---|---|
| State | New |
| Headers | show |
| Series | [X86] Fold more shuffle builtins to VEC_PERM_EXPR. (expand) |
On Tue, Dec 15, 2020 at 06:10:57PM +0800, Hongtao Liu via Gcc-patches wrote: > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -18187,21 +18187,67 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) > } > break; > > + case IX86_BUILTIN_SHUFPD512: > + case IX86_BUILTIN_SHUFPS512: > + if (n_args > 2) > + { > + /* This is masked shuffle. Only optimize if the mask is all ones. */ > + tree argl = gimple_call_arg (stmt, n_args - 1); > + arg0 = gimple_call_arg (stmt, 0); > + if (!tree_fits_uhwi_p (argl)) > + break; > + unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl); > + unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); I think it would be better not to mix the argl and arg0 stuff. So e.g. do arg0 = gimple_call_arg (stmt, 0); unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); first and then the argl stuff, or vice versa. Furthermore, you don't really care about the upper bits of argl, so why don't punt just if (TREE_CODE (argl) != INTEGER_CST) and use mask = TREE_LOW_CST (argl); ? > + if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U) > + break; > + } > + /* Fall thru. */ > case IX86_BUILTIN_SHUFPD: > + case IX86_BUILTIN_SHUFPD256: > + case IX86_BUILTIN_SHUFPS: > + case IX86_BUILTIN_SHUFPS256: > arg2 = gimple_call_arg (stmt, 2); > if (TREE_CODE (arg2) == INTEGER_CST) > { > - location_t loc = gimple_location (stmt); > - unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2); > arg0 = gimple_call_arg (stmt, 0); > + unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > + machine_mode imode = GET_MODE_INNER (TYPE_MODE (TREE_TYPE (arg0))); > + unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2); > + > + /* Check valid imm, refer to gcc.target/i386/testimm-10.c. */ > + if (imask > 255 > + || (imask >= HOST_WIDE_INT_1U << elems > + && imode == E_DFmode)) > + return false; Why is this extra checking done only for DFmode and not for SFmode? > + tree itype = imode == E_DFmode > + ? 
long_long_integer_type_node : integer_type_node; Formatting. Should be e.g. tree itype = (imode == E_DFmode ? long_long_integer_type_node : integer_type_node); or tree itype = (imode == E_DFmode ? long_long_integer_type_node : integer_type_node); but the ? which is part of the imode == E_DFmode ... subexpression can't just be below something unrelated. > + if (imode == E_DFmode) > + sel_idx = (i & 1) * elems > + + (i >> 1 << 1) + ((imask & 1 << i) >> i); Again, formatting. Plus, i >> 1 << 1 looks too ugly/unreadable, if you mean i & ~1, write it like that, it is up to the compiler to emit it like i >> 1 << 1 if that is the best implementation. > + else > + { > + /* Imm[7:0](if VL > 128, also use Imm[7:0]) provide 4 select > + controls for each element of the destination. */ > + unsigned j = i % 4; > + sel_idx = ((i & 2) >> 1) * elems > + + (i >> 2 << 2) + ((imask & 3 << j << j) >> j >> j); Ditto. Jakub
On Tue, Dec 15, 2020 at 7:11 PM Jakub Jelinek <jakub@redhat.com> wrote: > > On Tue, Dec 15, 2020 at 06:10:57PM +0800, Hongtao Liu via Gcc-patches wrote: > > --- a/gcc/config/i386/i386.c > > +++ b/gcc/config/i386/i386.c > > @@ -18187,21 +18187,67 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) > > } > > break; > > > > + case IX86_BUILTIN_SHUFPD512: > > + case IX86_BUILTIN_SHUFPS512: > > + if (n_args > 2) > > + { > > + /* This is masked shuffle. Only optimize if the mask is all ones. */ > > + tree argl = gimple_call_arg (stmt, n_args - 1); > > + arg0 = gimple_call_arg (stmt, 0); > > + if (!tree_fits_uhwi_p (argl)) > > + break; > > + unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl); > > + unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > > I think it would be better not to mix the argl and arg0 stuff. > So e.g. do > arg0 = gimple_call_arg (stmt, 0); > unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > first and then the argl stuff, or vice versa. > Furthermore, you don't really care about the upper bits of argl, > so why don't punt just if (TREE_CODE (argl) != INTEGER_CST) > and use mask = TREE_LOW_CST (argl); > ? > Yes, and for maintenance convenience, i put these code into a new function which can be also called by masked shift @@ -18128,17 +18142,10 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) gcc_assert (n_args >= 2); arg0 = gimple_call_arg (stmt, 0); arg1 = gimple_call_arg (stmt, 1); - if (n_args > 2) - { - /* This is masked shift. Only optimize if the mask is all ones. */ - tree argl = gimple_call_arg (stmt, n_args - 1); - if (!tree_fits_uhwi_p (argl)) - break; - unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl); - unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); - if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U) - break; - } + /* For masked shift, only optimize if the mask is all ones. 
*/ + if (n_args > 2 + && !ix86_masked_all_ones (arg0, gimple_call_arg (stmt, n_args - 1))) + break; > > + if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U) > > + break; > > + } > > + /* Fall thru. */ > > case IX86_BUILTIN_SHUFPD: > > + case IX86_BUILTIN_SHUFPD256: > > + case IX86_BUILTIN_SHUFPS: > > + case IX86_BUILTIN_SHUFPS256: > > arg2 = gimple_call_arg (stmt, 2); > > if (TREE_CODE (arg2) == INTEGER_CST) > > { > > - location_t loc = gimple_location (stmt); > > - unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2); > > arg0 = gimple_call_arg (stmt, 0); > > + unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > > + machine_mode imode = GET_MODE_INNER (TYPE_MODE (TREE_TYPE (arg0))); > > + unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2); > > + > > + /* Check valid imm, refer to gcc.target/i386/testimm-10.c. */ > > + if (imask > 255 > > + || (imask >= HOST_WIDE_INT_1U << elems > > + && imode == E_DFmode)) > > + return false; > > Why is this extra checking done only for DFmode and not for SFmode? Oh, yes, delete extra checking, the instruction would ignore high bits for 128/256-bit DFmode version. > > > + tree itype = imode == E_DFmode > > + ? long_long_integer_type_node : integer_type_node; > > Formatting. Should be e.g. > tree itype > = (imode == E_DFmode > ? long_long_integer_type_node : integer_type_node); > or > tree itype = (imode == E_DFmode ? long_long_integer_type_node > : integer_type_node); > but the ? which is part of the imode == E_DFmode ... subexpression > can't just be below something unrelated. > Changed. > > + if (imode == E_DFmode) > > + sel_idx = (i & 1) * elems > > + + (i >> 1 << 1) + ((imask & 1 << i) >> i); > > Again, formatting. Plus, i >> 1 << 1 looks too ugly/unreadable, > if you mean i & ~1, write it like that, it is up to the compiler to emit > it like i >> 1 << 1 if that is the best implementation. > Changed. 
> > + else > > + { > > + /* Imm[7:0](if VL > 128, also use Imm[7:0]) provide 4 select > > + controls for each element of the destination. */ > > + unsigned j = i % 4; > > + sel_idx = ((i & 2) >> 1) * elems > > + + (i >> 2 << 2) + ((imask & 3 << j << j) >> j >> j); > > Ditto. > Changed. > Jakub > Update patch -- BR, Hongtao
On 12/16/20 3:41 AM, Hongtao Liu via Gcc-patches wrote: > On Tue, Dec 15, 2020 at 7:11 PM Jakub Jelinek <jakub@redhat.com> wrote: >> On Tue, Dec 15, 2020 at 06:10:57PM +0800, Hongtao Liu via Gcc-patches wrote: >>> --- a/gcc/config/i386/i386.c >>> +++ b/gcc/config/i386/i386.c >>> @@ -18187,21 +18187,67 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) >>> } >>> break; >>> >>> + case IX86_BUILTIN_SHUFPD512: >>> + case IX86_BUILTIN_SHUFPS512: >>> + if (n_args > 2) >>> + { >>> + /* This is masked shuffle. Only optimize if the mask is all ones. */ >>> + tree argl = gimple_call_arg (stmt, n_args - 1); >>> + arg0 = gimple_call_arg (stmt, 0); >>> + if (!tree_fits_uhwi_p (argl)) >>> + break; >>> + unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl); >>> + unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); >> I think it would be better not to mix the argl and arg0 stuff. >> So e.g. do >> arg0 = gimple_call_arg (stmt, 0); >> unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); >> first and then the argl stuff, or vice versa. >> Furthermore, you don't really care about the upper bits of argl, >> so why don't punt just if (TREE_CODE (argl) != INTEGER_CST) >> and use mask = TREE_LOW_CST (argl); >> ? >> > Yes, and for maintenance convenience, i put these code into a new > function which can be also called by masked shift > @@ -18128,17 +18142,10 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) > gcc_assert (n_args >= 2); > arg0 = gimple_call_arg (stmt, 0); > arg1 = gimple_call_arg (stmt, 1); > - if (n_args > 2) > - { > - /* This is masked shift. Only optimize if the mask is all ones. */ > - tree argl = gimple_call_arg (stmt, n_args - 1); > - if (!tree_fits_uhwi_p (argl)) > - break; > - unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl); > - unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > - if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U) > - break; > - } > + /* For masked shift, only optimize if the mask is all ones. 
*/ > + if (n_args > 2 > + && !ix86_masked_all_ones (arg0, gimple_call_arg (stmt, n_args - 1))) > + break; > > >>> + if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U) >>> + break; >>> + } >>> + /* Fall thru. */ >>> case IX86_BUILTIN_SHUFPD: >>> + case IX86_BUILTIN_SHUFPD256: >>> + case IX86_BUILTIN_SHUFPS: >>> + case IX86_BUILTIN_SHUFPS256: >>> arg2 = gimple_call_arg (stmt, 2); >>> if (TREE_CODE (arg2) == INTEGER_CST) >>> { >>> - location_t loc = gimple_location (stmt); >>> - unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2); >>> arg0 = gimple_call_arg (stmt, 0); >>> + unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); >>> + machine_mode imode = GET_MODE_INNER (TYPE_MODE (TREE_TYPE (arg0))); >>> + unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2); >>> + >>> + /* Check valid imm, refer to gcc.target/i386/testimm-10.c. */ >>> + if (imask > 255 >>> + || (imask >= HOST_WIDE_INT_1U << elems >>> + && imode == E_DFmode)) >>> + return false; >> Why is this extra checking done only for DFmode and not for SFmode? > Oh, yes, delete extra checking, the instruction would ignore high bits > for 128/256-bit DFmode version. >>> + tree itype = imode == E_DFmode >>> + ? long_long_integer_type_node : integer_type_node; >> Formatting. Should be e.g. >> tree itype >> = (imode == E_DFmode >> ? long_long_integer_type_node : integer_type_node); >> or >> tree itype = (imode == E_DFmode ? long_long_integer_type_node >> : integer_type_node); >> but the ? which is part of the imode == E_DFmode ... subexpression >> can't just be below something unrelated. >> > Changed. >>> + if (imode == E_DFmode) >>> + sel_idx = (i & 1) * elems >>> + + (i >> 1 << 1) + ((imask & 1 << i) >> i); >> Again, formatting. Plus, i >> 1 << 1 looks too ugly/unreadable, >> if you mean i & ~1, write it like that, it is up to the compiler to emit >> it like i >> 1 << 1 if that is the best implementation. >> > Changed. 
>>> + else >>> + { >>> + /* Imm[7:0](if VL > 128, also use Imm[7:0]) provide 4 select >>> + controls for each element of the destination. */ >>> + unsigned j = i % 4; >>> + sel_idx = ((i & 2) >> 1) * elems >>> + + (i >> 2 << 2) + ((imask & 3 << j << j) >> j >> j); >> Ditto. >> > Changed. >> Jakub >> > Update patch > > -- > BR, > Hongtao > > 0001-X86-Fold-more-shuffle-builtins-to-VEC_PERM_EXPR.patch > > From 1cfec402ffa25375c88fa38e783d203401f38c5e Mon Sep 17 00:00:00 2001 > From: liuhongt <hongtao.liu@intel.com> > Date: Fri, 11 Dec 2020 19:02:43 +0800 > Subject: [PATCH] [X86] Fold more shuffle builtins to VEC_PERM_EXPR. > MIME-Version: 1.0 > Content-Type: text/plain; charset=UTF-8 > Content-Transfer-Encoding: 8bit > > A follow-up to https://gcc.gnu.org/pipermail/gcc-patches/2019-May/521983.html > > gcc/ > PR target/98167 > * config/i386/i386.c (ix86_gimple_fold_builtin): Handle > IX86_BUILTIN_SHUFPD512, IX86_BUILTIN_SHUFPS512, > IX86_BUILTIN_SHUFPD256, IX86_BUILTIN_SHUFPS, > IX86_BUILTIN_SHUFPS256. > > gcc/testsuite/ > * gcc.target/i386/avx512f-vshufpd-1.c: Adjust testcase. > * gcc.target/i386/avx512f-vshufps-1.c: Adjust testcase. I think this should defer to gcc-12. jeff
From 74596b08a91dafcb29441de59544dd857a090564 Mon Sep 17 00:00:00 2001 From: liuhongt <hongtao.liu@intel.com> Date: Fri, 11 Dec 2020 19:02:43 +0800 Subject: [PATCH] [X86] Fold more shuffle builtins to VEC_PERM_EXPR. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A follow-up to https://gcc.gnu.org/pipermail/gcc-patches/2019-May/521983.html gcc/ PR target/98167 * config/i386/i386.c (ix86_gimple_fold_builtin): Handle IX86_BUILTIN_SHUFPD512, IX86_BUILTIN_SHUFPS512, IX86_BUILTIN_SHUFPD256, IX86_BUILTIN_SHUFPS, IX86_BUILTIN_SHUFPS256. gcc/testsuite/ * gcc.target/i386/avx512f-vshufpd-1.c: Adjust testcase. * gcc.target/i386/avx512f-vshufps-1.c: Adjust testcase. --- gcc/config/i386/i386.c | 64 ++++++++++++++++--- .../gcc.target/i386/avx512f-vshufpd-1.c | 3 +- .../gcc.target/i386/avx512f-vshufps-1.c | 3 +- 3 files changed, 59 insertions(+), 11 deletions(-) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 54b7e103ba2..432470a916c 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -18187,21 +18187,67 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) } break; + case IX86_BUILTIN_SHUFPD512: + case IX86_BUILTIN_SHUFPS512: + if (n_args > 2) + { + /* This is masked shuffle. Only optimize if the mask is all ones. */ + tree argl = gimple_call_arg (stmt, n_args - 1); + arg0 = gimple_call_arg (stmt, 0); + if (!tree_fits_uhwi_p (argl)) + break; + unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl); + unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); + if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U) + break; + } + /* Fall thru. 
*/ case IX86_BUILTIN_SHUFPD: + case IX86_BUILTIN_SHUFPD256: + case IX86_BUILTIN_SHUFPS: + case IX86_BUILTIN_SHUFPS256: arg2 = gimple_call_arg (stmt, 2); if (TREE_CODE (arg2) == INTEGER_CST) { - location_t loc = gimple_location (stmt); - unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2); arg0 = gimple_call_arg (stmt, 0); + unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); + machine_mode imode = GET_MODE_INNER (TYPE_MODE (TREE_TYPE (arg0))); + unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2); + + /* Check valid imm, refer to gcc.target/i386/testimm-10.c. */ + if (imask > 255 + || (imask >= HOST_WIDE_INT_1U << elems + && imode == E_DFmode)) + return false; + arg1 = gimple_call_arg (stmt, 1); - tree itype = long_long_integer_type_node; - tree vtype = build_vector_type (itype, 2); /* V2DI */ - tree_vector_builder elts (vtype, 2, 1); - /* Ignore bits other than the lowest 2. */ - elts.quick_push (build_int_cst (itype, imask & 1)); - imask >>= 1; - elts.quick_push (build_int_cst (itype, 2 + (imask & 1))); + location_t loc = gimple_location (stmt); + tree itype = imode == E_DFmode + ? long_long_integer_type_node : integer_type_node; + /* V2DI/V4DI/V8DI/V4SI/V8SI/V16SI */ + tree vtype = build_vector_type (itype, elems); + tree_vector_builder elts (vtype, elems, 1); + + for (unsigned i = 0; i != elems; i++) + { + unsigned sel_idx; + /* Imm[1:0](if VL > 128, then use Imm[3:2],Imm[5:4],Imm[7:6]) + provide 2 select constrols for each element of the + destination. */ + if (imode == E_DFmode) + sel_idx = (i & 1) * elems + + (i >> 1 << 1) + ((imask & 1 << i) >> i); + else + { + /* Imm[7:0](if VL > 128, also use Imm[7:0]) provide 4 select + controls for each element of the destination. 
*/ + unsigned j = i % 4; + sel_idx = ((i & 2) >> 1) * elems + + (i >> 2 << 2) + ((imask & 3 << j << j) >> j >> j); + } + elts.quick_push (build_int_cst (itype, sel_idx)); + } + tree omask = elts.build (); gimple *g = gimple_build_assign (gimple_call_lhs (stmt), VEC_PERM_EXPR, diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c index d1ac01e1c88..8df5b9d4441 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c @@ -7,11 +7,12 @@ #include <immintrin.h> __m512d x; +__m512d y; void extern avx512f_test (void) { - x = _mm512_shuffle_pd (x, x, 56); + x = _mm512_shuffle_pd (x, y, 56); x = _mm512_mask_shuffle_pd (x, 2, x, x, 56); x = _mm512_maskz_shuffle_pd (2, x, x, 56); } diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c index 07a63fca3ff..378ae4b7101 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c @@ -7,11 +7,12 @@ #include <immintrin.h> __m512 x; +__m512 y; void extern avx512f_test (void) { - x = _mm512_shuffle_ps (x, x, 56); + x = _mm512_shuffle_ps (x, y, 56); x = _mm512_mask_shuffle_ps (x, 2, x, x, 56); x = _mm512_maskz_shuffle_ps (2, x, x, 56); } -- 2.18.1