diff mbox series

Enhance vec_pack_trunc for integral mode mask.

Message ID 20220119063119.21441-1-hongtao.liu@intel.com
State New
Headers show
Series Enhance vec_pack_trunc for integral mode mask. | expand

Commit Message

liuhongt Jan. 19, 2022, 6:31 a.m. UTC
> your description above hints at that the actual modes involved in the
> vec_pack_sbool_trunc are the same so the TYPE_MODE (narrow_vectype)
> and TYPE_MODE (vectype) are not the actual modes participating.  I think
> it would be way better to fix that.
>
> I suppose that since we know TYPE_VECTOR_SUBPARTS is a power of two
> it's always going to be only QImode that is of interest here so maybe a better
> check would be TYPE_MODE (narrow_vectype) == QImode rather than
> the equality check or elide the mode check completely and only retain
> the TYPE_VECTOR_SUBPARTS check you add?
>
> >         optab1 = vec_pack_sbool_trunc_optab;
> >        else
> >         optab1 = optab_for_tree_code (c1, vectype, optab_default);
> > @@ -12213,7 +12216,9 @@ supportable_narrowing_operation (enum tree_code code,
> >        if (VECTOR_BOOLEAN_TYPE_P (intermediate_type)
> >           && VECTOR_BOOLEAN_TYPE_P (prev_type)
> >           && intermediate_mode == prev_mode
>
> Likewise here.
>
> So I think the change is OK if you remove the mode equality checks.

Thanks for the review. Here is the updated patch; it survived bootstrap and regtest.
I'm going to check in the patch if there are no surprises for SPEC2017 on ICX.

For the testcase in the PR, the patch supports the QI:4 -> HI:16 pack in
multiple steps (first pack QI:4 -> QI:8 through vec_pack_sbool_trunc_qi,
then pack QI:8 -> HI:16 through vec_pack_trunc_hi).
The same applies to QI:2 -> HI:16, which is test4 in mask-pack-prefer128.c.

gcc/ChangeLog:

	PR target/103771
	* tree-vect-stmts.cc (supportable_narrowing_operation): Enhance
	integral mode mask pack by multiple steps, taking
	vec_pack_sbool_trunc_optab as the start when the number of elements
	is less than BITS_PER_UNIT.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/mask-pack-prefer128.c: New test.
	* gcc.target/i386/mask-pack-prefer256.c: New test.
	* gcc.target/i386/pr103771.c: New test.
---
 .../gcc.target/i386/mask-pack-prefer128.c      |  8 ++++++++
 .../gcc.target/i386/mask-pack-prefer256.c      |  8 ++++++++
 gcc/testsuite/gcc.target/i386/pr103771.c       | 18 ++++++++++++++++++
 gcc/tree-vect-stmts.cc                         | 11 +++++++----
 4 files changed, 41 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c
 create mode 100644 gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr103771.c
diff mbox series

Patch

diff --git a/gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c b/gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c
new file mode 100644
index 00000000000..c9ea37c7ed3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c
@@ -0,0 +1,8 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=skylake-avx512 -O3 -fopenmp-simd -fdump-tree-vect-details -mprefer-vector-width=128" } */
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 10 "vect" } } */
+/* { dg-final { scan-assembler-not "maskmov" } } */
+
+#include "mask-pack.c"
diff --git a/gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c b/gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c
new file mode 100644
index 00000000000..841f51b4041
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c
@@ -0,0 +1,8 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=skylake-avx512 -O3 -fopenmp-simd -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 10 "vect" } } */
+/* { dg-final { scan-assembler-not "maskmov" } } */
+
+#include "mask-pack.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr103771.c b/gcc/testsuite/gcc.target/i386/pr103771.c
new file mode 100644
index 00000000000..a1a9952b6a8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103771.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=cascadelake -O3 -fdump-tree-vect-details -mprefer-vector-width=128" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+
+typedef unsigned char uint8_t;
+
+static uint8_t x264_clip_uint8 (int x)
+{
+  return x & (~255) ? (-x) >> 31 : x;
+}
+
+void
+mc_weight (uint8_t* __restrict dst, uint8_t* __restrict src,
+	   int i_width,int i_scale)
+{
+  for(int x = 0; x < i_width; x++)
+    dst[x] = x264_clip_uint8 (src[x] * i_scale);
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 95be4f38eea..824ebb6354b 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -12124,6 +12124,7 @@  supportable_narrowing_operation (enum tree_code code,
   tree intermediate_type, prev_type;
   machine_mode intermediate_mode, prev_mode;
   int i;
+  unsigned HOST_WIDE_INT n_elts;
   bool uns;
 
   *multi_step_cvt = 0;
@@ -12133,8 +12134,9 @@  supportable_narrowing_operation (enum tree_code code,
       c1 = VEC_PACK_TRUNC_EXPR;
       if (VECTOR_BOOLEAN_TYPE_P (narrow_vectype)
 	  && VECTOR_BOOLEAN_TYPE_P (vectype)
-	  && TYPE_MODE (narrow_vectype) == TYPE_MODE (vectype)
-	  && SCALAR_INT_MODE_P (TYPE_MODE (vectype)))
+	  && SCALAR_INT_MODE_P (TYPE_MODE (vectype))
+	  && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&n_elts)
+	  && n_elts < BITS_PER_UNIT)
 	optab1 = vec_pack_sbool_trunc_optab;
       else
 	optab1 = optab_for_tree_code (c1, vectype, optab_default);
@@ -12225,8 +12227,9 @@  supportable_narrowing_operation (enum tree_code code,
 	  = lang_hooks.types.type_for_mode (intermediate_mode, uns);
       if (VECTOR_BOOLEAN_TYPE_P (intermediate_type)
 	  && VECTOR_BOOLEAN_TYPE_P (prev_type)
-	  && intermediate_mode == prev_mode
-	  && SCALAR_INT_MODE_P (prev_mode))
+	  && SCALAR_INT_MODE_P (prev_mode)
+	  && TYPE_VECTOR_SUBPARTS (intermediate_type).is_constant (&n_elts)
+	  && n_elts < BITS_PER_UNIT)
 	interm_optab = vec_pack_sbool_trunc_optab;
       else
 	interm_optab