diff mbox series

Fix wrong cost of MEM when addr is a lea.

Message ID 20240626060840.2836616-1-hongtao.liu@intel.com
State New
Headers show
Series Fix wrong cost of MEM when addr is a lea. | expand

Commit Message

liuhongt June 26, 2024, 6:08 a.m. UTC
416.gamess regressed 4-6% on x86_64 since my r15-882-g1d6199e5f8c1c0.
The commit adjust rtx_cost of mem to reduce cost of (add op0 disp).
But Cost of ADDR could be cheaper than XEXP (addr, 0) when it's a lea.
It is the case in the PR, the patch uses lower cost to enable more
simplication and fix the regression.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

	PR target/115462
	* config/i386/i386.cc (ix86_rtx_costs): Use cost of addr when
	it's lower than rtx_cost (XEXP (addr, 0)) + 1.

gcc/testsuite/ChangeLog:
	* gcc.target/i386/pr115462.c: New test.
---
 gcc/config/i386/i386.cc                  |  9 +++++++--
 gcc/testsuite/gcc.target/i386/pr115462.c | 22 ++++++++++++++++++++++
 2 files changed, 29 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115462.c

Comments

Richard Biener June 26, 2024, 6:51 a.m. UTC | #1
On Wed, Jun 26, 2024 at 8:09 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> 416.gamess regressed 4-6% on x86_64 since my r15-882-g1d6199e5f8c1c0.
> The commit adjust rtx_cost of mem to reduce cost of (add op0 disp).
> But Cost of ADDR could be cheaper than XEXP (addr, 0) when it's a lea.
> It is the case in the PR, the patch uses lower cost to enable more
> simplication and fix the regression.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
>         PR target/115462
>         * config/i386/i386.cc (ix86_rtx_costs): Use cost of addr when
>         it's lower than rtx_cost (XEXP (addr, 0)) + 1.
>
> gcc/testsuite/ChangeLog:
>         * gcc.target/i386/pr115462.c: New test.
> ---
>  gcc/config/i386/i386.cc                  |  9 +++++++--
>  gcc/testsuite/gcc.target/i386/pr115462.c | 22 ++++++++++++++++++++++
>  2 files changed, 29 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr115462.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index d4ccc24be6e..83dab8220dd 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -22341,8 +22341,13 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
>           if (GET_CODE (addr) == PLUS
>               && x86_64_immediate_operand (XEXP (addr, 1), Pmode))
>             {
> -             *total += 1;
> -             *total += rtx_cost (XEXP (addr, 0), Pmode, PLUS, 0, speed);
> +             /* PR115462: Cost of ADDR could be cheaper than XEXP (addr, 0)
> +                when it's a lea, use lower cost to enable more
> +                simplification.  */
> +             unsigned cost1 = rtx_cost (addr, Pmode, MEM, 0, speed);
> +             unsigned cost2 = rtx_cost (XEXP (addr, 0), Pmode,
> +                                        PLUS, 0, speed) + 1;

Just as comment - this is a bit ugly, why would we not always use the
address cost?  (and why are you using 'MEM'?)  Should this be better
handled on the insn_cost level when it's clear the PLUS is separate address
calculation (LEA) rather than address calculation in a MEM context?

> +             *total += MIN (cost1, cost2);
>               return true;
>             }
>         }
> diff --git a/gcc/testsuite/gcc.target/i386/pr115462.c b/gcc/testsuite/gcc.target/i386/pr115462.c
> new file mode 100644
> index 00000000000..ad50a6382bc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr115462.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2 -fno-tree-vectorize -fno-pic" } */
> +/* { dg-final { scan-assembler-times {(?n)movl[ \t]+.*, p1\.0\+[0-9]*\(,} 3 } } */
> +
> +int
> +foo (long indx, long indx2, long indx3, long indx4, long indx5, long indx6, long n, int* q)
> +{
> +  static int p1[10000];
> +  int* p2 = p1 + 1000;
> +  int* p3 = p1 + 4000;
> +  int* p4 = p1 + 8000;
> +
> +  for (long i = 0; i != n; i++)
> +    {
> +      /* scan for      movl    %edi, p1.0+3996(,%rax,4),
> +        p1.0+3996 should be propagted into the loop.  */
> +      p2[indx++] = q[indx++];
> +      p3[indx2++] = q[indx2++];
> +      p4[indx3++] = q[indx3++];
> +    }
> +  return p1[indx6] + p1[indx5];
> +}
> --
> 2.31.1
>
Hongtao Liu June 26, 2024, 7:13 a.m. UTC | #2
On Wed, Jun 26, 2024 at 2:52 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Wed, Jun 26, 2024 at 8:09 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > 416.gamess regressed 4-6% on x86_64 since my r15-882-g1d6199e5f8c1c0.
> > The commit adjust rtx_cost of mem to reduce cost of (add op0 disp).
> > But Cost of ADDR could be cheaper than XEXP (addr, 0) when it's a lea.
> > It is the case in the PR, the patch uses lower cost to enable more
> > simplication and fix the regression.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> >         PR target/115462
> >         * config/i386/i386.cc (ix86_rtx_costs): Use cost of addr when
> >         it's lower than rtx_cost (XEXP (addr, 0)) + 1.
> >
> > gcc/testsuite/ChangeLog:
> >         * gcc.target/i386/pr115462.c: New test.
> > ---
> >  gcc/config/i386/i386.cc                  |  9 +++++++--
> >  gcc/testsuite/gcc.target/i386/pr115462.c | 22 ++++++++++++++++++++++
> >  2 files changed, 29 insertions(+), 2 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr115462.c
> >
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index d4ccc24be6e..83dab8220dd 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -22341,8 +22341,13 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
> >           if (GET_CODE (addr) == PLUS
> >               && x86_64_immediate_operand (XEXP (addr, 1), Pmode))
> >             {
> > -             *total += 1;
> > -             *total += rtx_cost (XEXP (addr, 0), Pmode, PLUS, 0, speed);
> > +             /* PR115462: Cost of ADDR could be cheaper than XEXP (addr, 0)
> > +                when it's a lea, use lower cost to enable more
> > +                simplification.  */
> > +             unsigned cost1 = rtx_cost (addr, Pmode, MEM, 0, speed);
> > +             unsigned cost2 = rtx_cost (XEXP (addr, 0), Pmode,
> > +                                        PLUS, 0, speed) + 1;
>
> Just as comment - this is a bit ugly, why would we not always use the
> address cost?  (and why are you using 'MEM'?)  Should this be better
> handled on the insn_cost level when it's clear the PLUS is separate address
> calculation (LEA) rather than address calculation in a MEM context?
 For MEM, rtx_cost doesn't use address_cost but iterates each subrtx,
and adds up the costs,
 So for MEM (reg) and MEM (reg + 4), the former costs 5, the latter
costs 9, it is not accurate for x86.
 Ideally address_cost should be used, but it reduces cost too
much(range from 1-3).
(I've tried that, it regressed many testcases, because two many
registers are propagated into addr and increase register pressure).
 So the current solution is to make constant disp as cheap as possible
so more constant can be propagated into the address(but not
registers).

>
> > +             *total += MIN (cost1, cost2);
> >               return true;
> >             }
> >         }
> > diff --git a/gcc/testsuite/gcc.target/i386/pr115462.c b/gcc/testsuite/gcc.target/i386/pr115462.c
> > new file mode 100644
> > index 00000000000..ad50a6382bc
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr115462.c
> > @@ -0,0 +1,22 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx2 -fno-tree-vectorize -fno-pic" } */
> > +/* { dg-final { scan-assembler-times {(?n)movl[ \t]+.*, p1\.0\+[0-9]*\(,} 3 } } */
> > +
> > +int
> > +foo (long indx, long indx2, long indx3, long indx4, long indx5, long indx6, long n, int* q)
> > +{
> > +  static int p1[10000];
> > +  int* p2 = p1 + 1000;
> > +  int* p3 = p1 + 4000;
> > +  int* p4 = p1 + 8000;
> > +
> > +  for (long i = 0; i != n; i++)
> > +    {
> > +      /* scan for      movl    %edi, p1.0+3996(,%rax,4),
> > +        p1.0+3996 should be propagted into the loop.  */
> > +      p2[indx++] = q[indx++];
> > +      p3[indx2++] = q[indx2++];
> > +      p4[indx3++] = q[indx3++];
> > +    }
> > +  return p1[indx6] + p1[indx5];
> > +}
> > --
> > 2.31.1
> >
Richard Biener June 26, 2024, 8:02 a.m. UTC | #3
On Wed, Jun 26, 2024 at 9:14 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Wed, Jun 26, 2024 at 2:52 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Wed, Jun 26, 2024 at 8:09 AM liuhongt <hongtao.liu@intel.com> wrote:
> > >
> > > 416.gamess regressed 4-6% on x86_64 since my r15-882-g1d6199e5f8c1c0.
> > > The commit adjust rtx_cost of mem to reduce cost of (add op0 disp).
> > > But Cost of ADDR could be cheaper than XEXP (addr, 0) when it's a lea.
> > > It is the case in the PR, the patch uses lower cost to enable more
> > > simplication and fix the regression.
> > >
> > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > Ok for trunk?
> > >
> > > gcc/ChangeLog:
> > >
> > >         PR target/115462
> > >         * config/i386/i386.cc (ix86_rtx_costs): Use cost of addr when
> > >         it's lower than rtx_cost (XEXP (addr, 0)) + 1.
> > >
> > > gcc/testsuite/ChangeLog:
> > >         * gcc.target/i386/pr115462.c: New test.
> > > ---
> > >  gcc/config/i386/i386.cc                  |  9 +++++++--
> > >  gcc/testsuite/gcc.target/i386/pr115462.c | 22 ++++++++++++++++++++++
> > >  2 files changed, 29 insertions(+), 2 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr115462.c
> > >
> > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > > index d4ccc24be6e..83dab8220dd 100644
> > > --- a/gcc/config/i386/i386.cc
> > > +++ b/gcc/config/i386/i386.cc
> > > @@ -22341,8 +22341,13 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
> > >           if (GET_CODE (addr) == PLUS
> > >               && x86_64_immediate_operand (XEXP (addr, 1), Pmode))
> > >             {
> > > -             *total += 1;
> > > -             *total += rtx_cost (XEXP (addr, 0), Pmode, PLUS, 0, speed);
> > > +             /* PR115462: Cost of ADDR could be cheaper than XEXP (addr, 0)
> > > +                when it's a lea, use lower cost to enable more
> > > +                simplification.  */
> > > +             unsigned cost1 = rtx_cost (addr, Pmode, MEM, 0, speed);
> > > +             unsigned cost2 = rtx_cost (XEXP (addr, 0), Pmode,
> > > +                                        PLUS, 0, speed) + 1;
> >
> > Just as comment - this is a bit ugly, why would we not always use the
> > address cost?  (and why are you using 'MEM'?)  Should this be better
> > handled on the insn_cost level when it's clear the PLUS is separate address
> > calculation (LEA) rather than address calculation in a MEM context?
>  For MEM, rtx_cost doesn't use address_cost but iterates each subrtx,
> and adds up the costs,
>  So for MEM (reg) and MEM (reg + 4), the former costs 5, the latter
> costs 9, it is not accurate for x86.

But rtx_cost invokes targetm.rtx_cost which allows to avoid that
recursive processing at any level.  You're dealing with MEM [addr]
here, so why's rtx_cost (addr, Pmode, MEM, 0, speed) not always
the best way to deal with this?  Since this is the MEM [addr] case
we know it's not LEA, no?

>  Ideally address_cost should be used, but it reduces cost too
> much(range from 1-3).
> (I've tried that, it regressed many testcases, because two many
> registers are propagated into addr and increase register pressure).
>  So the current solution is to make constant disp as cheap as possible
> so more constant can be propagated into the address(but not
> registers).
>
> >
> > > +             *total += MIN (cost1, cost2);
> > >               return true;
> > >             }
> > >         }
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr115462.c b/gcc/testsuite/gcc.target/i386/pr115462.c
> > > new file mode 100644
> > > index 00000000000..ad50a6382bc
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr115462.c
> > > @@ -0,0 +1,22 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -mavx2 -fno-tree-vectorize -fno-pic" } */
> > > +/* { dg-final { scan-assembler-times {(?n)movl[ \t]+.*, p1\.0\+[0-9]*\(,} 3 } } */
> > > +
> > > +int
> > > +foo (long indx, long indx2, long indx3, long indx4, long indx5, long indx6, long n, int* q)
> > > +{
> > > +  static int p1[10000];
> > > +  int* p2 = p1 + 1000;
> > > +  int* p3 = p1 + 4000;
> > > +  int* p4 = p1 + 8000;
> > > +
> > > +  for (long i = 0; i != n; i++)
> > > +    {
> > > +      /* scan for      movl    %edi, p1.0+3996(,%rax,4),
> > > +        p1.0+3996 should be propagted into the loop.  */
> > > +      p2[indx++] = q[indx++];
> > > +      p3[indx2++] = q[indx2++];
> > > +      p4[indx3++] = q[indx3++];
> > > +    }
> > > +  return p1[indx6] + p1[indx5];
> > > +}
> > > --
> > > 2.31.1
> > >
>
>
>
> --
> BR,
> Hongtao
Hongtao Liu June 26, 2024, 8:20 a.m. UTC | #4
On Wed, Jun 26, 2024 at 4:02 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Wed, Jun 26, 2024 at 9:14 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Wed, Jun 26, 2024 at 2:52 PM Richard Biener
> > <richard.guenther@gmail.com> wrote:
> > >
> > > On Wed, Jun 26, 2024 at 8:09 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > >
> > > > 416.gamess regressed 4-6% on x86_64 since my r15-882-g1d6199e5f8c1c0.
> > > > The commit adjust rtx_cost of mem to reduce cost of (add op0 disp).
> > > > But Cost of ADDR could be cheaper than XEXP (addr, 0) when it's a lea.
> > > > It is the case in the PR, the patch uses lower cost to enable more
> > > > simplication and fix the regression.
> > > >
> > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > > Ok for trunk?
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > >         PR target/115462
> > > >         * config/i386/i386.cc (ix86_rtx_costs): Use cost of addr when
> > > >         it's lower than rtx_cost (XEXP (addr, 0)) + 1.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >         * gcc.target/i386/pr115462.c: New test.
> > > > ---
> > > >  gcc/config/i386/i386.cc                  |  9 +++++++--
> > > >  gcc/testsuite/gcc.target/i386/pr115462.c | 22 ++++++++++++++++++++++
> > > >  2 files changed, 29 insertions(+), 2 deletions(-)
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr115462.c
> > > >
> > > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > > > index d4ccc24be6e..83dab8220dd 100644
> > > > --- a/gcc/config/i386/i386.cc
> > > > +++ b/gcc/config/i386/i386.cc
> > > > @@ -22341,8 +22341,13 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
> > > >           if (GET_CODE (addr) == PLUS
> > > >               && x86_64_immediate_operand (XEXP (addr, 1), Pmode))
> > > >             {
> > > > -             *total += 1;
> > > > -             *total += rtx_cost (XEXP (addr, 0), Pmode, PLUS, 0, speed);
> > > > +             /* PR115462: Cost of ADDR could be cheaper than XEXP (addr, 0)
> > > > +                when it's a lea, use lower cost to enable more
> > > > +                simplification.  */
> > > > +             unsigned cost1 = rtx_cost (addr, Pmode, MEM, 0, speed);
> > > > +             unsigned cost2 = rtx_cost (XEXP (addr, 0), Pmode,
> > > > +                                        PLUS, 0, speed) + 1;
> > >
> > > Just as comment - this is a bit ugly, why would we not always use the
> > > address cost?  (and why are you using 'MEM'?)  Should this be better
> > > handled on the insn_cost level when it's clear the PLUS is separate address
> > > calculation (LEA) rather than address calculation in a MEM context?
> >  For MEM, rtx_cost doesn't use address_cost but iterates each subrtx,
> > and adds up the costs,
> >  So for MEM (reg) and MEM (reg + 4), the former costs 5, the latter
> > costs 9, it is not accurate for x86.
>
> But rtx_cost invokes targetm.rtx_cost which allows to avoid that
> recursive processing at any level.  You're dealing with MEM [addr]
> here, so why's rtx_cost (addr, Pmode, MEM, 0, speed) not always
Because when addr is just (plus reg cosnt_int), rtx_cost (addr, Pmode,
MEM, 0, speed) return 4.
But i think it should be equal to addr (reg) which is 0 cost.
> the best way to deal with this?  Since this is the MEM [addr] case
> we know it's not LEA, no?
Maybe the only case I need to handle is reg + disp, otherwise, they're
all lea forms.
>
> >  Ideally address_cost should be used, but it reduces cost too
> > much(range from 1-3).
> > (I've tried that, it regressed many testcases, because two many
> > registers are propagated into addr and increase register pressure).
> >  So the current solution is to make constant disp as cheap as possible
> > so more constant can be propagated into the address(but not
> > registers).
> >
> > >
> > > > +             *total += MIN (cost1, cost2);
> > > >               return true;
> > > >             }
> > > >         }
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr115462.c b/gcc/testsuite/gcc.target/i386/pr115462.c
> > > > new file mode 100644
> > > > index 00000000000..ad50a6382bc
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr115462.c
> > > > @@ -0,0 +1,22 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -mavx2 -fno-tree-vectorize -fno-pic" } */
> > > > +/* { dg-final { scan-assembler-times {(?n)movl[ \t]+.*, p1\.0\+[0-9]*\(,} 3 } } */
> > > > +
> > > > +int
> > > > +foo (long indx, long indx2, long indx3, long indx4, long indx5, long indx6, long n, int* q)
> > > > +{
> > > > +  static int p1[10000];
> > > > +  int* p2 = p1 + 1000;
> > > > +  int* p3 = p1 + 4000;
> > > > +  int* p4 = p1 + 8000;
> > > > +
> > > > +  for (long i = 0; i != n; i++)
> > > > +    {
> > > > +      /* scan for      movl    %edi, p1.0+3996(,%rax,4),
> > > > +        p1.0+3996 should be propagted into the loop.  */
> > > > +      p2[indx++] = q[indx++];
> > > > +      p3[indx2++] = q[indx2++];
> > > > +      p4[indx3++] = q[indx3++];
> > > > +    }
> > > > +  return p1[indx6] + p1[indx5];
> > > > +}
> > > > --
> > > > 2.31.1
> > > >
> >
> >
> >
> > --
> > BR,
> > Hongtao
diff mbox series

Patch

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d4ccc24be6e..83dab8220dd 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22341,8 +22341,13 @@  ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
 	  if (GET_CODE (addr) == PLUS
 	      && x86_64_immediate_operand (XEXP (addr, 1), Pmode))
 	    {
-	      *total += 1;
-	      *total += rtx_cost (XEXP (addr, 0), Pmode, PLUS, 0, speed);
+	      /* PR115462: Cost of ADDR could be cheaper than XEXP (addr, 0)
+		 when it's a lea, use lower cost to enable more
+		 simplification.  */
+	      unsigned cost1 = rtx_cost (addr, Pmode, MEM, 0, speed);
+	      unsigned cost2 = rtx_cost (XEXP (addr, 0), Pmode,
+					 PLUS, 0, speed) + 1;
+	      *total += MIN (cost1, cost2);
 	      return true;
 	    }
 	}
diff --git a/gcc/testsuite/gcc.target/i386/pr115462.c b/gcc/testsuite/gcc.target/i386/pr115462.c
new file mode 100644
index 00000000000..ad50a6382bc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115462.c
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -fno-tree-vectorize -fno-pic" } */
+/* { dg-final { scan-assembler-times {(?n)movl[ \t]+.*, p1\.0\+[0-9]*\(,} 3 } } */
+
+int
+foo (long indx, long indx2, long indx3, long indx4, long indx5, long indx6, long n, int* q)
+{
+  static int p1[10000];
+  int* p2 = p1 + 1000;
+  int* p3 = p1 + 4000;
+  int* p4 = p1 + 8000;
+
+  for (long i = 0; i != n; i++)
+    {
+      /* scan for 	movl	%edi, p1.0+3996(,%rax,4),
+	 p1.0+3996 should be propagted into the loop.  */
+      p2[indx++] = q[indx++];
+      p3[indx2++] = q[indx2++];
+      p4[indx3++] = q[indx3++];
+    }
+  return p1[indx6] + p1[indx5];
+}