Message ID | 20210826045751.40630-1-hongtao.liu@intel.com |
---|---|
State | New |
Headers | show |
Series | Fold more shuffle builtins to VEC_PERM_EXPR. | expand |
On Thu, Aug 26, 2021 at 12:57 PM liuhongt <hongtao.liu@intel.com> wrote: > > This patch is a follow-up to [1]; it folds all shufps/shufpd builtins into gimple, but only for the non-masked versions or the masked versions whose mask is all ones. > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}. > > [1] https://gcc.gnu.org/pipermail/gcc-patches/2019-May/521983.html > > gcc/ > PR target/98167 > PR target/43147 > * config/i386/i386.c (ix86_gimple_fold_builtin): Fold > IX86_BUILTIN_SHUFPD512, IX86_BUILTIN_SHUFPS512, > IX86_BUILTIN_SHUFPD256, IX86_BUILTIN_SHUFPS, > IX86_BUILTIN_SHUFPS256. > (ix86_masked_all_ones): New function. > > gcc/testsuite/ > * gcc.target/i386/avx512f-vshufpd-1.c: Adjust testcase. > * gcc.target/i386/avx512f-vshufps-1.c: Adjust testcase. > * gcc.target/i386/pr43147.c: New test. > --- > gcc/config/i386/i386.c | 90 ++++++++++++++----- > .../gcc.target/i386/avx512f-vshufpd-1.c | 3 +- > .../gcc.target/i386/avx512f-vshufps-1.c | 3 +- > gcc/testsuite/gcc.target/i386/pr43147.c | 15 ++++ > 4 files changed, 87 insertions(+), 24 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr43147.c > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index ebec8668758..f3eed9f2426 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -17541,6 +17541,20 @@ ix86_vector_shift_count (tree arg1) > return NULL_TREE; > } > > +/* Return true if the low ELEMS bits of arg_mask are all ones. 
*/ > +static bool > +ix86_masked_all_ones (unsigned HOST_WIDE_INT elems, tree arg_mask) > +{ > + if (TREE_CODE (arg_mask) != INTEGER_CST) > + return false; > + > + unsigned HOST_WIDE_INT mask = TREE_INT_CST_LOW (arg_mask); > + if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U) > + return false; > + > + return true; > +} > + > static tree > ix86_fold_builtin (tree fndecl, int n_args, > tree *args, bool ignore ATTRIBUTE_UNUSED) > @@ -18026,6 +18040,7 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) > enum tree_code tcode; > unsigned HOST_WIDE_INT count; > bool is_vshift; > + unsigned HOST_WIDE_INT elems; > > switch (fn_code) > { > @@ -18349,17 +18364,11 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) > gcc_assert (n_args >= 2); > arg0 = gimple_call_arg (stmt, 0); > arg1 = gimple_call_arg (stmt, 1); > - if (n_args > 2) > - { > - /* This is masked shift. Only optimize if the mask is all ones. */ > - tree argl = gimple_call_arg (stmt, n_args - 1); > - if (!tree_fits_uhwi_p (argl)) > - break; > - unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl); > - unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > - if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U) > - break; > - } > + elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > + /* For masked shift, only optimize if the mask is all ones. */ > + if (n_args > 2 > + && !ix86_masked_all_ones (elems, gimple_call_arg (stmt, n_args - 1))) > + break; > if (is_vshift) > { > if (TREE_CODE (arg1) != VECTOR_CST) > @@ -18408,25 +18417,62 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) > } > break; > > + case IX86_BUILTIN_SHUFPD512: > + case IX86_BUILTIN_SHUFPS512: > case IX86_BUILTIN_SHUFPD: > + case IX86_BUILTIN_SHUFPD256: > + case IX86_BUILTIN_SHUFPS: > + case IX86_BUILTIN_SHUFPS256: > + arg0 = gimple_call_arg (stmt, 0); > + elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > + /* This is masked shuffle. Only optimize if the mask is all ones. 
*/ > + if (n_args > 3 > + && !ix86_masked_all_ones (elems, > + gimple_call_arg (stmt, n_args - 1))) > + break; > arg2 = gimple_call_arg (stmt, 2); > if (TREE_CODE (arg2) == INTEGER_CST) > { > + unsigned HOST_WIDE_INT shuffle_mask = TREE_INT_CST_LOW (arg2); > + /* Check valid imm, refer to gcc.target/i386/testimm-10.c. */ > + if (shuffle_mask > 255) > + return false; > + > + machine_mode imode = GET_MODE_INNER (TYPE_MODE (TREE_TYPE (arg0))); > location_t loc = gimple_location (stmt); > - unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2); > - arg0 = gimple_call_arg (stmt, 0); > + tree itype = (imode == E_DFmode > + ? long_long_integer_type_node : integer_type_node); > + tree vtype = build_vector_type (itype, elems); > + tree_vector_builder elts (vtype, elems, 1); > + > + > + /* Transform integer shuffle_mask to vector perm_mask which > + is used by vec_perm_expr, refer to shufp[sd]256/512 in sse.md. */ > + for (unsigned i = 0; i != elems; i++) > + { > + unsigned sel_idx; > + /* Imm[1:0](if VL > 128, then use Imm[3:2],Imm[5:4],Imm[7:6]) > + provide 2 select controls for each element of the > + destination. */ > + if (imode == E_DFmode) > + sel_idx = (i & 1) * elems + (i & ~1) > + + ((shuffle_mask >> i) & 1); > + else > + { > + /* Imm[7:0](if VL > 128, also use Imm[7:0]) provide 4 select > + controls for each element of the destination. */ > + unsigned j = i % 4; > + sel_idx = ((i >> 1) & 1) * elems + (i & ~3) > + + ((shuffle_mask >> 2 * j) & 3); > + } > + elts.quick_push (build_int_cst (itype, sel_idx)); > + } > + > + tree perm_mask = elts.build (); > arg1 = gimple_call_arg (stmt, 1); > - tree itype = long_long_integer_type_node; > - tree vtype = build_vector_type (itype, 2); /* V2DI */ > - tree_vector_builder elts (vtype, 2, 1); > - /* Ignore bits other than the lowest 2. 
*/ > - elts.quick_push (build_int_cst (itype, imask & 1)); > - imask >>= 1; > - elts.quick_push (build_int_cst (itype, 2 + (imask & 1))); > - tree omask = elts.build (); > gimple *g = gimple_build_assign (gimple_call_lhs (stmt), > VEC_PERM_EXPR, > - arg0, arg1, omask); > + arg0, arg1, perm_mask); > gimple_set_location (g, loc); > gsi_replace (gsi, g, false); > return true; > diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c > index d1ac01e1c88..8df5b9d4441 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c > +++ b/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c > @@ -7,11 +7,12 @@ > #include <immintrin.h> > > __m512d x; > +__m512d y; > > void extern > avx512f_test (void) > { > - x = _mm512_shuffle_pd (x, x, 56); > + x = _mm512_shuffle_pd (x, y, 56); > x = _mm512_mask_shuffle_pd (x, 2, x, x, 56); > x = _mm512_maskz_shuffle_pd (2, x, x, 56); > } > diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c > index 07a63fca3ff..378ae4b7101 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c > +++ b/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c > @@ -7,11 +7,12 @@ > #include <immintrin.h> > > __m512 x; > +__m512 y; > > void extern > avx512f_test (void) > { > - x = _mm512_shuffle_ps (x, x, 56); > + x = _mm512_shuffle_ps (x, y, 56); > x = _mm512_mask_shuffle_ps (x, 2, x, x, 56); > x = _mm512_maskz_shuffle_ps (2, x, x, 56); > } > diff --git a/gcc/testsuite/gcc.target/i386/pr43147.c b/gcc/testsuite/gcc.target/i386/pr43147.c > new file mode 100644 > index 00000000000..3c30f917c06 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr43147.c > @@ -0,0 +1,15 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse2" } */ > +/* { dg-final { scan-assembler "movaps" } } */ > +/* { dg-final { scan-assembler-not "shufps" } } */ > + > +#include <x86intrin.h> > + > +__m128 > +foo (void) > +{ > + __m128 m = _mm_set_ps(1.0f, 
2.0f, 3.0f, 4.0f); > + m = _mm_shuffle_ps(m, m, 0xC9); > + m = _mm_shuffle_ps(m, m, 0x2D); > + return m; > +} > -- > 2.18.1 >
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index ebec8668758..f3eed9f2426 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -17541,6 +17541,20 @@ ix86_vector_shift_count (tree arg1) return NULL_TREE; } +/* Return true if the low ELEMS bits of arg_mask are all ones. */ +static bool +ix86_masked_all_ones (unsigned HOST_WIDE_INT elems, tree arg_mask) +{ + if (TREE_CODE (arg_mask) != INTEGER_CST) + return false; + + unsigned HOST_WIDE_INT mask = TREE_INT_CST_LOW (arg_mask); + if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U) + return false; + + return true; +} + static tree ix86_fold_builtin (tree fndecl, int n_args, tree *args, bool ignore ATTRIBUTE_UNUSED) @@ -18026,6 +18040,7 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) enum tree_code tcode; unsigned HOST_WIDE_INT count; bool is_vshift; + unsigned HOST_WIDE_INT elems; switch (fn_code) { @@ -18349,17 +18364,11 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) gcc_assert (n_args >= 2); arg0 = gimple_call_arg (stmt, 0); arg1 = gimple_call_arg (stmt, 1); - if (n_args > 2) - { - /* This is masked shift. Only optimize if the mask is all ones. */ - tree argl = gimple_call_arg (stmt, n_args - 1); - if (!tree_fits_uhwi_p (argl)) - break; - unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl); - unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); - if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U) - break; - } + elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); + /* For masked shift, only optimize if the mask is all ones. 
*/ + if (n_args > 2 + && !ix86_masked_all_ones (elems, gimple_call_arg (stmt, n_args - 1))) + break; if (is_vshift) { if (TREE_CODE (arg1) != VECTOR_CST) @@ -18408,25 +18417,62 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) } break; + case IX86_BUILTIN_SHUFPD512: + case IX86_BUILTIN_SHUFPS512: case IX86_BUILTIN_SHUFPD: + case IX86_BUILTIN_SHUFPD256: + case IX86_BUILTIN_SHUFPS: + case IX86_BUILTIN_SHUFPS256: + arg0 = gimple_call_arg (stmt, 0); + elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); + /* This is masked shuffle. Only optimize if the mask is all ones. */ + if (n_args > 3 + && !ix86_masked_all_ones (elems, + gimple_call_arg (stmt, n_args - 1))) + break; arg2 = gimple_call_arg (stmt, 2); if (TREE_CODE (arg2) == INTEGER_CST) { + unsigned HOST_WIDE_INT shuffle_mask = TREE_INT_CST_LOW (arg2); + /* Check valid imm, refer to gcc.target/i386/testimm-10.c. */ + if (shuffle_mask > 255) + return false; + + machine_mode imode = GET_MODE_INNER (TYPE_MODE (TREE_TYPE (arg0))); location_t loc = gimple_location (stmt); - unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2); - arg0 = gimple_call_arg (stmt, 0); + tree itype = (imode == E_DFmode + ? long_long_integer_type_node : integer_type_node); + tree vtype = build_vector_type (itype, elems); + tree_vector_builder elts (vtype, elems, 1); + + + /* Transform integer shuffle_mask to vector perm_mask which + is used by vec_perm_expr, refer to shufp[sd]256/512 in sse.md. */ + for (unsigned i = 0; i != elems; i++) + { + unsigned sel_idx; + /* Imm[1:0](if VL > 128, then use Imm[3:2],Imm[5:4],Imm[7:6]) + provide 2 select controls for each element of the + destination. */ + if (imode == E_DFmode) + sel_idx = (i & 1) * elems + (i & ~1) + + ((shuffle_mask >> i) & 1); + else + { + /* Imm[7:0](if VL > 128, also use Imm[7:0]) provide 4 select + controls for each element of the destination. 
*/ + unsigned j = i % 4; + sel_idx = ((i >> 1) & 1) * elems + (i & ~3) + + ((shuffle_mask >> 2 * j) & 3); + } + elts.quick_push (build_int_cst (itype, sel_idx)); + } + + tree perm_mask = elts.build (); arg1 = gimple_call_arg (stmt, 1); - tree itype = long_long_integer_type_node; - tree vtype = build_vector_type (itype, 2); /* V2DI */ - tree_vector_builder elts (vtype, 2, 1); - /* Ignore bits other than the lowest 2. */ - elts.quick_push (build_int_cst (itype, imask & 1)); - imask >>= 1; - elts.quick_push (build_int_cst (itype, 2 + (imask & 1))); - tree omask = elts.build (); gimple *g = gimple_build_assign (gimple_call_lhs (stmt), VEC_PERM_EXPR, - arg0, arg1, omask); + arg0, arg1, perm_mask); gimple_set_location (g, loc); gsi_replace (gsi, g, false); return true; diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c index d1ac01e1c88..8df5b9d4441 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c @@ -7,11 +7,12 @@ #include <immintrin.h> __m512d x; +__m512d y; void extern avx512f_test (void) { - x = _mm512_shuffle_pd (x, x, 56); + x = _mm512_shuffle_pd (x, y, 56); x = _mm512_mask_shuffle_pd (x, 2, x, x, 56); x = _mm512_maskz_shuffle_pd (2, x, x, 56); } diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c index 07a63fca3ff..378ae4b7101 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c @@ -7,11 +7,12 @@ #include <immintrin.h> __m512 x; +__m512 y; void extern avx512f_test (void) { - x = _mm512_shuffle_ps (x, x, 56); + x = _mm512_shuffle_ps (x, y, 56); x = _mm512_mask_shuffle_ps (x, 2, x, x, 56); x = _mm512_maskz_shuffle_ps (2, x, x, 56); } diff --git a/gcc/testsuite/gcc.target/i386/pr43147.c b/gcc/testsuite/gcc.target/i386/pr43147.c new file mode 100644 index 00000000000..3c30f917c06 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/i386/pr43147.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2" } */ +/* { dg-final { scan-assembler "movaps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +#include <x86intrin.h> + +__m128 +foo (void) +{ + __m128 m = _mm_set_ps(1.0f, 2.0f, 3.0f, 4.0f); + m = _mm_shuffle_ps(m, m, 0xC9); + m = _mm_shuffle_ps(m, m, 0x2D); + return m; +}