diff mbox series

[v2,x86] Movement between GENERAL_REGS and SSE_REGS for TImode doesn't need secondary reload.

Message ID 20240815074009.128776-1-hongtao.liu@intel.com
State New
Headers show
Series [v2,x86] Movement between GENERAL_REGS and SSE_REGS for TImode doesn't need secondary reload. | expand

Commit Message

liuhongt Aug. 15, 2024, 7:40 a.m. UTC
It results in 2 failures for x86_64-pc-linux-gnu{\
-march=cascadelake};

gcc: gcc.target/i386/extendditi3-1.c scan-assembler cqt?o
gcc: gcc.target/i386/pr113560.c scan-assembler-times \tmulq 1

For pr113560.c, now GCC generates mulx instead of mulq with
-march=cascadelake, which should be optimal, so adjust testcase for
that.
For gcc.target/i386/extendditi2-1.c, RA happens to choose another
register instead of rax and result in

	movq	%rdi, %rbp
	movq	%rdi, %rax
	sarq	$63, %rbp
	movq	%rbp, %rdx

The patch adds a new define_peephole2 for that.

gcc/ChangeLog:

	PR target/116274
	* config/i386/i386-expand.cc (ix86_expand_vector_move):
	Restrict special case TImode to 128-bit vector conversions via
	V2DI under ix86_pre_reload_split ().
	* config/i386/i386.cc (inline_secondary_memory_needed):
	Movement between GENERAL_REGS and SSE_REGS for TImode doesn't
	need secondary reload.
	* config/i386/i386.md (*extendsidi2_rex64): Add a
	define_peephole2 after it.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr116274.c: New test.
	* gcc.target/i386/pr113560.c: Scan either mulq or mulx.
---
 gcc/config/i386/i386-expand.cc           |  2 +-
 gcc/config/i386/i386.cc                  | 18 ++++++++++++------
 gcc/config/i386/i386.md                  | 19 +++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr113560.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr116274.c | 12 ++++++++++++
 5 files changed, 45 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr116274.c

Comments

Hongtao Liu Aug. 15, 2024, 11:04 a.m. UTC | #1
On Thu, Aug 15, 2024 at 3:27 PM liuhongt <hongtao.liu@intel.com> wrote:
>
> It results in 2 failures for x86_64-pc-linux-gnu{\
> -march=cascadelake};
>
> gcc: gcc.target/i386/extendditi3-1.c scan-assembler cqt?o
> gcc: gcc.target/i386/pr113560.c scan-assembler-times \tmulq 1
>
> For pr113560.c, now GCC generates mulx instead of mulq with
> -march=cascadelake, which should be optimal, so adjust testcase for
> that.
> For gcc.target/i386/extendditi2-1.c, RA happens to choose another
> register instead of rax and result in
>
>         movq    %rdi, %rbp
>         movq    %rdi, %rax
>         sarq    $63, %rbp
>         movq    %rbp, %rdx
>
> The patch adds a new define_peephole2 for that.
>
> gcc/ChangeLog:
>
>         PR target/116274
>         * config/i386/i386-expand.cc (ix86_expand_vector_move):
>         Restrict special case TImode to 128-bit vector conversions via
>         V2DI under ix86_pre_reload_split ().
>         * config/i386/i386.cc (inline_secondary_memory_needed):
>         Movement between GENERAL_REGS and SSE_REGS for TImode doesn't
>         need secondary reload.
>         * config/i386/i386.md (*extendsidi2_rex64): Add a
>         define_peephole2 after it.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr116274.c: New test.
>         * gcc.target/i386/pr113560.c: Scan either mulq or mulx.
> ---
>  gcc/config/i386/i386-expand.cc           |  2 +-
>  gcc/config/i386/i386.cc                  | 18 ++++++++++++------
>  gcc/config/i386/i386.md                  | 19 +++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr113560.c |  2 +-
>  gcc/testsuite/gcc.target/i386/pr116274.c | 12 ++++++++++++
>  5 files changed, 45 insertions(+), 8 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr116274.c
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index bdbc1423267..ed546eeed6b 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -751,7 +751,7 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
>        && SUBREG_P (op1)
>        && GET_MODE (SUBREG_REG (op1)) == TImode
>        && TARGET_64BIT && TARGET_SSE
> -      && can_create_pseudo_p ())
> +      && ix86_pre_reload_split ())
>      {
>        rtx tmp = gen_reg_rtx (V2DImode);
>        rtx lo = gen_reg_rtx (DImode);
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index f044826269c..4821892d1e0 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -20292,6 +20292,18 @@ inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
>        if (!(INTEGER_CLASS_P (class1) || INTEGER_CLASS_P (class2)))
>         return true;
>
> +      /* If the target says that inter-unit moves are more expensive
> +        than moving through memory, then don't generate them.  */
> +      if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
> +         || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
> +       return true;
> +
> +      /* Under SSE4.1, *movti_internal supports movement between
> +        SSE_REGS and GENERAL_REGS with pinsrq and pextrq.  */
> +      if (TARGET_SSE4_1
> +         && (TARGET_64BIT ? mode == TImode : mode == DImode))
> +       return false;
> +
>        int msize = GET_MODE_SIZE (mode);
>
>        /* Between SSE and general, we have moves no larger than word size.  */
> @@ -20304,12 +20316,6 @@ inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
>
>        if (msize < minsize)
>         return true;
> -
> -      /* If the target says that inter-unit moves are more expensive
> -        than moving through memory, then don't generate them.  */
> -      if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
> -         || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
> -       return true;
>      }
>
>    return false;
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index db7789c17d2..1962a7ba5c9 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -5041,6 +5041,25 @@ (define_split
>    DONE;
>  })
>
> +(define_peephole2
> +  [(set (match_operand:DI 0 "general_reg_operand")
> +       (match_operand:DI 1 "general_reg_operand"))
> +   (parallel [(set (match_dup 0)
> +                  (ashiftrt:DI (match_dup 0)
> +                               (const_int 63)))
> +              (clobber (reg:CC FLAGS_REG))])
> +   (set (match_operand:DI 2 "general_reg_operand") (match_dup 1))
> +   (set (match_operand:DI 3 "general_reg_operand") (match_dup 0))]
> +  "(optimize_function_for_size_p (cfun) || TARGET_USE_CLTD)
> +   && REGNO (operands[2]) == AX_REG
> +   && REGNO (operands[3]) == DX_REG
> +   && peep2_reg_dead_p (4, operands[0])
> +   && !reg_mentioned_p (operands[0], operands[1])
> +   && !reg_mentioned_p (operands[2], operands[0])"
> +  [(set (match_dup 2) (match_dup 1))
> +   (parallel [(set (match_dup 3) (ashiftrt:DI (match_dup 2) (const_int 63)))
> +             (clobber (reg:CC FLAGS_REG))])])
> +
>  (define_insn "extend<mode>di2"
>    [(set (match_operand:DI 0 "register_operand" "=r")
>         (sign_extend:DI
> diff --git a/gcc/testsuite/gcc.target/i386/pr113560.c b/gcc/testsuite/gcc.target/i386/pr113560.c
> index ac2e01a4589..9431a2d1d90 100644
> --- a/gcc/testsuite/gcc.target/i386/pr113560.c
> +++ b/gcc/testsuite/gcc.target/i386/pr113560.c
> @@ -11,7 +11,7 @@ __int128 bar(__int128 x, __int128 y)
>    return (x & 1000) * (y & 1000);
>  }
>
> -/* { dg-final { scan-assembler-times "\tmulq" 1 } } */
> +/* { dg-final { scan-assembler-times "\tmul\[qx\]" 1 } } */
>  /* { dg-final { scan-assembler-times "\timulq" 1 } } */
>  /* { dg-final { scan-assembler-not "addq" } } */
>  /* { dg-final { scan-assembler-not "xorl" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr116274.c b/gcc/testsuite/gcc.target/i386/pr116274.c
> new file mode 100644
> index 00000000000..51ac3e1572d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr116274.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O3 -fno-vect-cost-model -msse4.1" } */
> +
> +struct aq { long x,y; };
> +long testq(struct aq a) { return a.x+a.y; }
> +
> +struct aw { short a0,a1,a2,a3,a4,a5,a6,a7; };
> +short testw(struct aw a) { return a.a0+a.a1+a.a2+a.a3+a.a4+a.a5+a.a6+a.a7; }
> +
> +struct ad { int x,y,z,w; };
> +int testd(struct ad a) { return a.x+a.y+a.z+a.w; }
> +/* { dg-final { scan-assembler-not "%rsp" } } */
> --
> 2.43.0
>
Uros Bizjak Aug. 15, 2024, 11:10 a.m. UTC | #2
On Thu, Aug 15, 2024 at 9:27 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> It results in 2 failures for x86_64-pc-linux-gnu{\
> -march=cascadelake};
>
> gcc: gcc.target/i386/extendditi3-1.c scan-assembler cqt?o
> gcc: gcc.target/i386/pr113560.c scan-assembler-times \tmulq 1
>
> For pr113560.c, now GCC generates mulx instead of mulq with
> -march=cascadelake, which should be optimal, so adjust testcase for
> that.
> For gcc.target/i386/extendditi2-1.c, RA happens to choose another
> register instead of rax and result in
>
>         movq    %rdi, %rbp
>         movq    %rdi, %rax
>         sarq    $63, %rbp
>         movq    %rbp, %rdx
>
> The patch adds a new define_peephole2 for that.
>
> gcc/ChangeLog:
>
>         PR target/116274
>         * config/i386/i386-expand.cc (ix86_expand_vector_move):
>         Restrict special case TImode to 128-bit vector conversions via
>         V2DI under ix86_pre_reload_split ().
>         * config/i386/i386.cc (inline_secondary_memory_needed):
>         Movement between GENERAL_REGS and SSE_REGS for TImode doesn't
>         need secondary reload.
>         * config/i386/i386.md (*extendsidi2_rex64): Add a
>         define_peephole2 after it.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr116274.c: New test.
>         * gcc.target/i386/pr113560.c: Scan either mulq or mulx.

OK, with updated comment, as proposed below.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386-expand.cc           |  2 +-
>  gcc/config/i386/i386.cc                  | 18 ++++++++++++------
>  gcc/config/i386/i386.md                  | 19 +++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr113560.c |  2 +-
>  gcc/testsuite/gcc.target/i386/pr116274.c | 12 ++++++++++++
>  5 files changed, 45 insertions(+), 8 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr116274.c
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index bdbc1423267..ed546eeed6b 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -751,7 +751,7 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
>        && SUBREG_P (op1)
>        && GET_MODE (SUBREG_REG (op1)) == TImode
>        && TARGET_64BIT && TARGET_SSE
> -      && can_create_pseudo_p ())
> +      && ix86_pre_reload_split ())
>      {
>        rtx tmp = gen_reg_rtx (V2DImode);
>        rtx lo = gen_reg_rtx (DImode);
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index f044826269c..4821892d1e0 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -20292,6 +20292,18 @@ inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
>        if (!(INTEGER_CLASS_P (class1) || INTEGER_CLASS_P (class2)))
>         return true;
>
> +      /* If the target says that inter-unit moves are more expensive
> +        than moving through memory, then don't generate them.  */
> +      if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
> +         || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
> +       return true;
> +
> +      /* Under SSE4.1, *movti_internal supports movement between
> +        SSE_REGS and GENERAL_REGS with pinsrq and pextrq.  */

WIth SSE4.1, *mov{ti,di}_internal supports moves between
SSE_REGS and GENERAL_REGS using pinsr{q,d} or pextr{q,d}.

> +      if (TARGET_SSE4_1
> +         && (TARGET_64BIT ? mode == TImode : mode == DImode))
> +       return false;
> +
>        int msize = GET_MODE_SIZE (mode);
>
>        /* Between SSE and general, we have moves no larger than word size.  */
> @@ -20304,12 +20316,6 @@ inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
>
>        if (msize < minsize)
>         return true;
> -
> -      /* If the target says that inter-unit moves are more expensive
> -        than moving through memory, then don't generate them.  */
> -      if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
> -         || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
> -       return true;
>      }
>
>    return false;
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index db7789c17d2..1962a7ba5c9 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -5041,6 +5041,25 @@ (define_split
>    DONE;
>  })
>
> +(define_peephole2
> +  [(set (match_operand:DI 0 "general_reg_operand")
> +       (match_operand:DI 1 "general_reg_operand"))
> +   (parallel [(set (match_dup 0)
> +                  (ashiftrt:DI (match_dup 0)
> +                               (const_int 63)))
> +              (clobber (reg:CC FLAGS_REG))])
> +   (set (match_operand:DI 2 "general_reg_operand") (match_dup 1))
> +   (set (match_operand:DI 3 "general_reg_operand") (match_dup 0))]
> +  "(optimize_function_for_size_p (cfun) || TARGET_USE_CLTD)
> +   && REGNO (operands[2]) == AX_REG
> +   && REGNO (operands[3]) == DX_REG
> +   && peep2_reg_dead_p (4, operands[0])
> +   && !reg_mentioned_p (operands[0], operands[1])
> +   && !reg_mentioned_p (operands[2], operands[0])"
> +  [(set (match_dup 2) (match_dup 1))
> +   (parallel [(set (match_dup 3) (ashiftrt:DI (match_dup 2) (const_int 63)))
> +             (clobber (reg:CC FLAGS_REG))])])
> +
>  (define_insn "extend<mode>di2"
>    [(set (match_operand:DI 0 "register_operand" "=r")
>         (sign_extend:DI
> diff --git a/gcc/testsuite/gcc.target/i386/pr113560.c b/gcc/testsuite/gcc.target/i386/pr113560.c
> index ac2e01a4589..9431a2d1d90 100644
> --- a/gcc/testsuite/gcc.target/i386/pr113560.c
> +++ b/gcc/testsuite/gcc.target/i386/pr113560.c
> @@ -11,7 +11,7 @@ __int128 bar(__int128 x, __int128 y)
>    return (x & 1000) * (y & 1000);
>  }
>
> -/* { dg-final { scan-assembler-times "\tmulq" 1 } } */
> +/* { dg-final { scan-assembler-times "\tmul\[qx\]" 1 } } */
>  /* { dg-final { scan-assembler-times "\timulq" 1 } } */
>  /* { dg-final { scan-assembler-not "addq" } } */
>  /* { dg-final { scan-assembler-not "xorl" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr116274.c b/gcc/testsuite/gcc.target/i386/pr116274.c
> new file mode 100644
> index 00000000000..51ac3e1572d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr116274.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O3 -fno-vect-cost-model -msse4.1" } */
> +
> +struct aq { long x,y; };
> +long testq(struct aq a) { return a.x+a.y; }
> +
> +struct aw { short a0,a1,a2,a3,a4,a5,a6,a7; };
> +short testw(struct aw a) { return a.a0+a.a1+a.a2+a.a3+a.a4+a.a5+a.a6+a.a7; }
> +
> +struct ad { int x,y,z,w; };
> +int testd(struct ad a) { return a.x+a.y+a.z+a.w; }
> +/* { dg-final { scan-assembler-not "%rsp" } } */
> --
> 2.43.0
>
diff mbox series

Patch

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index bdbc1423267..ed546eeed6b 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -751,7 +751,7 @@  ix86_expand_vector_move (machine_mode mode, rtx operands[])
       && SUBREG_P (op1)
       && GET_MODE (SUBREG_REG (op1)) == TImode
       && TARGET_64BIT && TARGET_SSE
-      && can_create_pseudo_p ())
+      && ix86_pre_reload_split ())
     {
       rtx tmp = gen_reg_rtx (V2DImode);
       rtx lo = gen_reg_rtx (DImode);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index f044826269c..4821892d1e0 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20292,6 +20292,18 @@  inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
       if (!(INTEGER_CLASS_P (class1) || INTEGER_CLASS_P (class2)))
 	return true;
 
+      /* If the target says that inter-unit moves are more expensive
+	 than moving through memory, then don't generate them.  */
+      if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
+	  || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
+	return true;
+
+      /* Under SSE4.1, *movti_internal supports movement between
+	 SSE_REGS and GENERAL_REGS with pinsrq and pextrq.  */
+      if (TARGET_SSE4_1
+	  && (TARGET_64BIT ? mode == TImode : mode == DImode))
+	return false;
+
       int msize = GET_MODE_SIZE (mode);
 
       /* Between SSE and general, we have moves no larger than word size.  */
@@ -20304,12 +20316,6 @@  inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
 
       if (msize < minsize)
 	return true;
-
-      /* If the target says that inter-unit moves are more expensive
-	 than moving through memory, then don't generate them.  */
-      if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
-	  || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
-	return true;
     }
 
   return false;
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index db7789c17d2..1962a7ba5c9 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -5041,6 +5041,25 @@  (define_split
   DONE;
 })
 
+(define_peephole2
+  [(set (match_operand:DI 0 "general_reg_operand")
+	(match_operand:DI 1 "general_reg_operand"))
+   (parallel [(set (match_dup 0)
+		   (ashiftrt:DI (match_dup 0)
+				(const_int 63)))
+	       (clobber (reg:CC FLAGS_REG))])
+   (set (match_operand:DI 2 "general_reg_operand") (match_dup 1))
+   (set (match_operand:DI 3 "general_reg_operand") (match_dup 0))]
+  "(optimize_function_for_size_p (cfun) || TARGET_USE_CLTD)
+   && REGNO (operands[2]) == AX_REG
+   && REGNO (operands[3]) == DX_REG
+   && peep2_reg_dead_p (4, operands[0])
+   && !reg_mentioned_p (operands[0], operands[1])
+   && !reg_mentioned_p (operands[2], operands[0])"
+  [(set (match_dup 2) (match_dup 1))
+   (parallel [(set (match_dup 3) (ashiftrt:DI (match_dup 2) (const_int 63)))
+	      (clobber (reg:CC FLAGS_REG))])])
+
 (define_insn "extend<mode>di2"
   [(set (match_operand:DI 0 "register_operand" "=r")
 	(sign_extend:DI
diff --git a/gcc/testsuite/gcc.target/i386/pr113560.c b/gcc/testsuite/gcc.target/i386/pr113560.c
index ac2e01a4589..9431a2d1d90 100644
--- a/gcc/testsuite/gcc.target/i386/pr113560.c
+++ b/gcc/testsuite/gcc.target/i386/pr113560.c
@@ -11,7 +11,7 @@  __int128 bar(__int128 x, __int128 y)
   return (x & 1000) * (y & 1000);
 }
 
-/* { dg-final { scan-assembler-times "\tmulq" 1 } } */
+/* { dg-final { scan-assembler-times "\tmul\[qx\]" 1 } } */
 /* { dg-final { scan-assembler-times "\timulq" 1 } } */
 /* { dg-final { scan-assembler-not "addq" } } */
 /* { dg-final { scan-assembler-not "xorl" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr116274.c b/gcc/testsuite/gcc.target/i386/pr116274.c
new file mode 100644
index 00000000000..51ac3e1572d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116274.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O3 -fno-vect-cost-model -msse4.1" } */
+
+struct aq { long x,y; };
+long testq(struct aq a) { return a.x+a.y; }
+
+struct aw { short a0,a1,a2,a3,a4,a5,a6,a7; };
+short testw(struct aw a) { return a.a0+a.a1+a.a2+a.a3+a.a4+a.a5+a.a6+a.a7; }
+
+struct ad { int x,y,z,w; };
+int testd(struct ad a) { return a.x+a.y+a.z+a.w; }
+/* { dg-final { scan-assembler-not "%rsp" } } */