diff mbox series

[x86] Check avx upper register for parallel.

Message ID 20240830044900.479477-1-hongtao.liu@intel.com
State New
Headers show
Series [x86] Check avx upper register for parallel. | expand

Commit Message

liuhongt Aug. 30, 2024, 4:49 a.m. UTC
> Can the above loop be a part of ix86_check_avx_upper_register, so this
> function would scan the full RTX for avx upper register?
Changed, also adjust ix86_check_avx_upper_stores and ix86_avx_u128_mode_needed
to either inline the old ix86_check_avx_upper_register or replace FOR_EACH_SUBRTX
with new ix86_check_avx_upper_register.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk and backport?

For function arguments/return, when it's BLK mode, it's put in a
parallel with an expr_list, and the expr_list contains the real mode
and registers.
Current ix86_check_avx_upper_register only checked for SSE_REG_P, and
failed to handle that. The patch extend the handle to each subrtx.

gcc/ChangeLog:

	PR target/116512
	* config/i386/i386.cc (ix86_check_avx_upper_register): Iterate
	subrtx to scan for avx upper register.
	(ix86_check_avx_upper_stores): Inline old
	ix86_check_avx_upper_register.
	(ix86_avx_u128_mode_needed): Ditto, and replace
	FOR_EACH_SUBRTX with call to new
	ix86_check_avx_upper_register.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr116512.c: New test.
---
 gcc/config/i386/i386.cc                  | 36 +++++++++++++++---------
 gcc/testsuite/gcc.target/i386/pr116512.c | 26 +++++++++++++++++
 2 files changed, 49 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr116512.c

Comments

Uros Bizjak Aug. 30, 2024, 5:50 a.m. UTC | #1
On Fri, Aug 30, 2024 at 6:49 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> > Can the above loop be a part of ix86_check_avx_upper_register, so this
> > function would scan the full RTX for avx upper register?
> Changed, also adjust ix86_check_avx_upper_stores and ix86_avx_u128_mode_needed
> to either inline the old ix86_check_avx_upper_register or replace FOR_EACH_SUBRTX
> with new ix86_check_avx_upper_register.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk and backport?
>
> For function arguments/return, when it's BLK mode, it's put in a
> parallel with an expr_list, and the expr_list contains the real mode
> and registers.
> Current ix86_check_avx_upper_register only checked for SSE_REG_P, and
> failed to handle that. The patch extend the handle to each subrtx.
>
> gcc/ChangeLog:
>
>         PR target/116512
>         * config/i386/i386.cc (ix86_check_avx_upper_register): Iterate
>         subrtx to scan for avx upper register.
>         (ix86_check_avx_upper_stores): Inline old
>         ix86_check_avx_upper_register.
>         (ix86_avx_u128_mode_needed): Ditto, and replace
>         FOR_EACH_SUBRTX with call to new
>         ix86_check_avx_upper_register.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr116512.c: New test.

OK for all branches.

Perhaps we could put the repeated condition in a macro, but this could
be an eventual follow-up patch.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386.cc                  | 36 +++++++++++++++---------
>  gcc/testsuite/gcc.target/i386/pr116512.c | 26 +++++++++++++++++
>  2 files changed, 49 insertions(+), 13 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr116512.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 224a78cc832..c40cee5b885 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -14881,9 +14881,19 @@ ix86_dirflag_mode_needed (rtx_insn *insn)
>  static bool
>  ix86_check_avx_upper_register (const_rtx exp)
>  {
> -  return (SSE_REG_P (exp)
> -         && !EXT_REX_SSE_REG_P (exp)
> -         && GET_MODE_BITSIZE (GET_MODE (exp)) > 128);
> +  /* construct_container may return a parallel with expr_list
> +     which contains the real reg and mode  */
> +  subrtx_iterator::array_type array;
> +  FOR_EACH_SUBRTX (iter, array, exp, NONCONST)
> +    {
> +      const_rtx x = *iter;
> +      if (SSE_REG_P (x)
> +         && !EXT_REX_SSE_REG_P (x)
> +         && GET_MODE_BITSIZE (GET_MODE (x)) > 128)
> +       return true;
> +    }
> +
> +  return false;
>  }
>
>  /* Check if a 256bit or 512bit AVX register is referenced in stores.   */
> @@ -14891,7 +14901,9 @@ ix86_check_avx_upper_register (const_rtx exp)
>  static void
>  ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
>  {
> -  if (ix86_check_avx_upper_register (dest))
> +  if (SSE_REG_P (dest)
> +      && !EXT_REX_SSE_REG_P (dest)
> +      && GET_MODE_BITSIZE (GET_MODE (dest)) > 128)
>      {
>        bool *used = (bool *) data;
>        *used = true;
> @@ -14950,14 +14962,14 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
>        return AVX_U128_CLEAN;
>      }
>
> -  subrtx_iterator::array_type array;
> -
>    rtx set = single_set (insn);
>    if (set)
>      {
>        rtx dest = SET_DEST (set);
>        rtx src = SET_SRC (set);
> -      if (ix86_check_avx_upper_register (dest))
> +      if (SSE_REG_P (dest)
> +         && !EXT_REX_SSE_REG_P (dest)
> +         && GET_MODE_BITSIZE (GET_MODE (dest)) > 128)
>         {
>           /* This is an YMM/ZMM load.  Return AVX_U128_DIRTY if the
>              source isn't zero.  */
> @@ -14968,9 +14980,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
>         }
>        else
>         {
> -         FOR_EACH_SUBRTX (iter, array, src, NONCONST)
> -           if (ix86_check_avx_upper_register (*iter))
> -             return AVX_U128_DIRTY;
> +         if (ix86_check_avx_upper_register (src))
> +           return AVX_U128_DIRTY;
>         }
>
>        /* This isn't YMM/ZMM load/store.  */
> @@ -14981,9 +14992,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
>       Hardware changes state only when a 256bit register is written to,
>       but we need to prevent the compiler from moving optimal insertion
>       point above eventual read from 256bit or 512 bit register.  */
> -  FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
> -    if (ix86_check_avx_upper_register (*iter))
> -      return AVX_U128_DIRTY;
> +  if (ix86_check_avx_upper_register (PATTERN (insn)))
> +    return AVX_U128_DIRTY;
>
>    return AVX_U128_ANY;
>  }
> diff --git a/gcc/testsuite/gcc.target/i386/pr116512.c b/gcc/testsuite/gcc.target/i386/pr116512.c
> new file mode 100644
> index 00000000000..c2bc6c91b64
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr116512.c
> @@ -0,0 +1,26 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64-v4 -O2" } */
> +/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
> +
> +#include <immintrin.h>
> +
> +struct B {
> +  union {
> +    __m512 f;
> +    __m512i s;
> +  };
> +};
> +
> +struct B foo(int n) {
> +  struct B res;
> +  res.s = _mm512_set1_epi32(n);
> +
> +  return res;
> +}
> +
> +__m512i bar(int n) {
> +  struct B res;
> +  res.s = _mm512_set1_epi32(n);
> +
> +  return res.s;
> +}
> --
> 2.31.1
>
diff mbox series

Patch

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 224a78cc832..c40cee5b885 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -14881,9 +14881,19 @@  ix86_dirflag_mode_needed (rtx_insn *insn)
 static bool
 ix86_check_avx_upper_register (const_rtx exp)
 {
-  return (SSE_REG_P (exp)
-	  && !EXT_REX_SSE_REG_P (exp)
-	  && GET_MODE_BITSIZE (GET_MODE (exp)) > 128);
+  /* construct_container may return a parallel with expr_list
+     which contains the real reg and mode  */
+  subrtx_iterator::array_type array;
+  FOR_EACH_SUBRTX (iter, array, exp, NONCONST)
+    {
+      const_rtx x = *iter;
+      if (SSE_REG_P (x)
+	  && !EXT_REX_SSE_REG_P (x)
+	  && GET_MODE_BITSIZE (GET_MODE (x)) > 128)
+	return true;
+    }
+
+  return false;
 }
 
 /* Check if a 256bit or 512bit AVX register is referenced in stores.   */
@@ -14891,7 +14901,9 @@  ix86_check_avx_upper_register (const_rtx exp)
 static void
 ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
 {
-  if (ix86_check_avx_upper_register (dest))
+  if (SSE_REG_P (dest)
+      && !EXT_REX_SSE_REG_P (dest)
+      && GET_MODE_BITSIZE (GET_MODE (dest)) > 128)
     {
       bool *used = (bool *) data;
       *used = true;
@@ -14950,14 +14962,14 @@  ix86_avx_u128_mode_needed (rtx_insn *insn)
       return AVX_U128_CLEAN;
     }
 
-  subrtx_iterator::array_type array;
-
   rtx set = single_set (insn);
   if (set)
     {
       rtx dest = SET_DEST (set);
       rtx src = SET_SRC (set);
-      if (ix86_check_avx_upper_register (dest))
+      if (SSE_REG_P (dest)
+	  && !EXT_REX_SSE_REG_P (dest)
+	  && GET_MODE_BITSIZE (GET_MODE (dest)) > 128)
 	{
 	  /* This is an YMM/ZMM load.  Return AVX_U128_DIRTY if the
 	     source isn't zero.  */
@@ -14968,9 +14980,8 @@  ix86_avx_u128_mode_needed (rtx_insn *insn)
 	}
       else
 	{
-	  FOR_EACH_SUBRTX (iter, array, src, NONCONST)
-	    if (ix86_check_avx_upper_register (*iter))
-	      return AVX_U128_DIRTY;
+	  if (ix86_check_avx_upper_register (src))
+	    return AVX_U128_DIRTY;
 	}
 
       /* This isn't YMM/ZMM load/store.  */
@@ -14981,9 +14992,8 @@  ix86_avx_u128_mode_needed (rtx_insn *insn)
      Hardware changes state only when a 256bit register is written to,
      but we need to prevent the compiler from moving optimal insertion
      point above eventual read from 256bit or 512 bit register.  */
-  FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
-    if (ix86_check_avx_upper_register (*iter))
-      return AVX_U128_DIRTY;
+  if (ix86_check_avx_upper_register (PATTERN (insn)))
+    return AVX_U128_DIRTY;
 
   return AVX_U128_ANY;
 }
diff --git a/gcc/testsuite/gcc.target/i386/pr116512.c b/gcc/testsuite/gcc.target/i386/pr116512.c
new file mode 100644
index 00000000000..c2bc6c91b64
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116512.c
@@ -0,0 +1,26 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
+
+#include <immintrin.h>
+
+struct B {
+  union {
+    __m512 f;
+    __m512i s;
+  };
+};
+
+struct B foo(int n) {
+  struct B res;
+  res.s = _mm512_set1_epi32(n);
+
+  return res;
+}
+
+__m512i bar(int n) {
+  struct B res;
+  res.s = _mm512_set1_epi32(n);
+
+  return res.s;
+}