Message ID | 20220118062642.48846-1-hongtao.liu@intel.com |
---|---|
State | New |
Headers | show |
Series | Enhance vec_pack_trunc for integral mode mask. | expand |
On Tue, Jan 18, 2022 at 7:26 AM liuhongt <hongtao.liu@intel.com> wrote: > > For testcase in PR, the patch supports QI:4 -> HI:16 pack with > multi steps(first pack QI:4 -> QI:8 through vec_pack_sbool_trunc_qi, > then pack QI:8 -> HI:16 through vec_pack_trunc_hi). > Similar for QI:2 -> HI:16 which is test4 in mask-pack-prefer-128.c. > > Bootstrapped both with and w/o '--with-arch=native --with-cpu=native' > on CLX. > Regtested for x86_64-pc-linux-gnu{-m32,} and > x86_64-pc-linux-gnu{-m32\ -march=cascadelake,\ -march=cascadelake} on CLX. > > This patch can partially fix the regression in PR since it enable > vectorization for epilogue. > I'm also working on another patch to prevent mask pack for testcase in PR, > and there will be a separate patch. > > Ok for trunk? > > gcc/ChangeLog: > > PR target/103771 > * tree-vect-stmts.c (supportable_narrowing_operation): Enhance > integral mode mask pack by multi steps which takes > vec_pack_sbool_trunc_optab as start when elements number is > less than BITS_PER_UNITS. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/mask-pack-prefer128.c: New test. > * gcc.target/i386/mask-pack-prefer128.c: New test. > * gcc.target/i386/pr103771.c: New test. > --- > .../gcc.target/i386/mask-pack-prefer128.c | 8 ++++++++ > .../gcc.target/i386/mask-pack-prefer256.c | 8 ++++++++ > gcc/testsuite/gcc.target/i386/pr103771.c | 18 ++++++++++++++++++ > gcc/tree-vect-stmts.c | 11 ++++++++--- > 4 files changed, 42 insertions(+), 3 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c > create mode 100644 gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr103771.c > > diff --git a/gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c b/gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c > new file mode 100644 > index 00000000000..c9ea37c7ed3 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c > @@ -0,0 +1,8 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=skylake-avx512 -O3 -fopenmp-simd -fdump-tree-vect-details -mprefer-vector-width=128" } */ > +/* Disabling epilogues until we find a better way to deal with scans. */ > +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 10 "vect" } } */ > +/* { dg-final { scan-assembler-not "maskmov" } } */ > + > +#include "mask-pack.c" > diff --git a/gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c b/gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c > new file mode 100644 > index 00000000000..841f51b4041 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c > @@ -0,0 +1,8 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=skylake-avx512 -O3 -fopenmp-simd -fdump-tree-vect-details -mprefer-vector-width=256" } */ > +/* Disabling epilogues until we find a better way to deal with scans. */ > +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 10 "vect" } } */ > +/* { dg-final { scan-assembler-not "maskmov" } } */ > + > +#include "mask-pack.c" > diff --git a/gcc/testsuite/gcc.target/i386/pr103771.c b/gcc/testsuite/gcc.target/i386/pr103771.c > new file mode 100644 > index 00000000000..a1a9952b6a8 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103771.c > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=cascadelake -O3 -fdump-tree-vect-details -mprefer-vector-width=128" } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ > + > +typedef unsigned char uint8_t; > + > +static uint8_t x264_clip_uint8 (int x) > +{ > + return x & (~255) ? (-x) >> 31 : x; > +} > + > +void > +mc_weight (uint8_t* __restrict dst, uint8_t* __restrict src, > + int i_width,int i_scale) > +{ > + for(int x = 0; x < i_width; x++) > + dst[x] = x264_clip_uint8 (src[x] * i_scale); > +} > diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c > index f2625a2ff40..4baf5e36127 100644 > --- a/gcc/tree-vect-stmts.c > +++ b/gcc/tree-vect-stmts.c > @@ -12111,6 +12111,7 @@ supportable_narrowing_operation (enum tree_code code, > tree intermediate_type, prev_type; > machine_mode intermediate_mode, prev_mode; > int i; > + unsigned HOST_WIDE_INT n_elts; > bool uns; > > *multi_step_cvt = 0; > @@ -12120,8 +12121,10 @@ supportable_narrowing_operation (enum tree_code code, > c1 = VEC_PACK_TRUNC_EXPR; > if (VECTOR_BOOLEAN_TYPE_P (narrow_vectype) > && VECTOR_BOOLEAN_TYPE_P (vectype) > - && TYPE_MODE (narrow_vectype) == TYPE_MODE (vectype) > - && SCALAR_INT_MODE_P (TYPE_MODE (vectype))) > + && SCALAR_INT_MODE_P (TYPE_MODE (vectype)) > + && (TYPE_MODE (narrow_vectype) == TYPE_MODE (vectype) > + || (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&n_elts) > + && n_elts < BITS_PER_UNIT))) your description above hints at that the actual modes involved in the vec_pack_sbool_trunc are the same so the TYPE_MODE (narrow_vectype) and TYPE_MODE (vectype) are not the actual modes participating. I think it would be way better to fix that. I suppose that since we know TYPE_VECTOR_SUBPARTS is a power of two it's always going to be only QImode that is of interest here so maybe a better check would be TYPE_MODE (narrow_vectype) == QImode rather than the equality check or elide the mode check completely and only retain the TYPE_VECTOR_SUBPARTS check you add? > optab1 = vec_pack_sbool_trunc_optab; > else > optab1 = optab_for_tree_code (c1, vectype, optab_default); > @@ -12213,7 +12216,9 @@ supportable_narrowing_operation (enum tree_code code, > if (VECTOR_BOOLEAN_TYPE_P (intermediate_type) > && VECTOR_BOOLEAN_TYPE_P (prev_type) > && intermediate_mode == prev_mode Likewise here. So I think the change is OK if you remove the mode equality checks. Thanks, Richard. > - && SCALAR_INT_MODE_P (prev_mode)) > + && SCALAR_INT_MODE_P (prev_mode) > + && (TYPE_VECTOR_SUBPARTS (intermediate_type).is_constant (&n_elts) > + && n_elts < BITS_PER_UNIT)) > interm_optab = vec_pack_sbool_trunc_optab; > else > interm_optab > -- > 2.18.1 >
diff --git a/gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c b/gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c new file mode 100644 index 00000000000..c9ea37c7ed3 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-march=skylake-avx512 -O3 -fopenmp-simd -fdump-tree-vect-details -mprefer-vector-width=128" } */ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 10 "vect" } } */ +/* { dg-final { scan-assembler-not "maskmov" } } */ + +#include "mask-pack.c" diff --git a/gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c b/gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c new file mode 100644 index 00000000000..841f51b4041 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-march=skylake-avx512 -O3 -fopenmp-simd -fdump-tree-vect-details -mprefer-vector-width=256" } */ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 10 "vect" } } */ +/* { dg-final { scan-assembler-not "maskmov" } } */ + +#include "mask-pack.c" diff --git a/gcc/testsuite/gcc.target/i386/pr103771.c b/gcc/testsuite/gcc.target/i386/pr103771.c new file mode 100644 index 00000000000..a1a9952b6a8 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr103771.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-march=cascadelake -O3 -fdump-tree-vect-details -mprefer-vector-width=128" } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ + +typedef unsigned char uint8_t; + +static uint8_t x264_clip_uint8 (int x) +{ + return x & (~255) ? (-x) >> 31 : x; +} + +void +mc_weight (uint8_t* __restrict dst, uint8_t* __restrict src, + int i_width,int i_scale) +{ + for(int x = 0; x < i_width; x++) + dst[x] = x264_clip_uint8 (src[x] * i_scale); +} diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index f2625a2ff40..4baf5e36127 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -12111,6 +12111,7 @@ supportable_narrowing_operation (enum tree_code code, tree intermediate_type, prev_type; machine_mode intermediate_mode, prev_mode; int i; + unsigned HOST_WIDE_INT n_elts; bool uns; *multi_step_cvt = 0; @@ -12120,8 +12121,10 @@ supportable_narrowing_operation (enum tree_code code, c1 = VEC_PACK_TRUNC_EXPR; if (VECTOR_BOOLEAN_TYPE_P (narrow_vectype) && VECTOR_BOOLEAN_TYPE_P (vectype) - && TYPE_MODE (narrow_vectype) == TYPE_MODE (vectype) - && SCALAR_INT_MODE_P (TYPE_MODE (vectype))) + && SCALAR_INT_MODE_P (TYPE_MODE (vectype)) + && (TYPE_MODE (narrow_vectype) == TYPE_MODE (vectype) + || (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&n_elts) + && n_elts < BITS_PER_UNIT))) optab1 = vec_pack_sbool_trunc_optab; else optab1 = optab_for_tree_code (c1, vectype, optab_default); @@ -12213,7 +12216,9 @@ supportable_narrowing_operation (enum tree_code code, if (VECTOR_BOOLEAN_TYPE_P (intermediate_type) && VECTOR_BOOLEAN_TYPE_P (prev_type) && intermediate_mode == prev_mode - && SCALAR_INT_MODE_P (prev_mode)) + && SCALAR_INT_MODE_P (prev_mode) + && (TYPE_VECTOR_SUBPARTS (intermediate_type).is_constant (&n_elts) + && n_elts < BITS_PER_UNIT)) interm_optab = vec_pack_sbool_trunc_optab; else interm_optab