[x86_64] Add flag to control tight loops alignment opt

Message ID 20241105032000.381511-1-MayShao-oc@zhaoxin.com
State: New
Series: [x86_64] Add flag to control tight loops alignment opt

Commit Message

MayShao-oc Nov. 5, 2024, 3:20 a.m. UTC
Hi all:
    This patch adds the -malign-tight-loops flag to control
pass_align_tight_loops.
    The motivation is that pass_align_tight_loops may cause performance
regressions in nested loops.

    The example code is as follows:

    #define ITER 20000
    #define ITER_O 10

    int i, j, k;
    int array[ITER];

    void loop()
    {
      int i;  /* shadows the global i */
      for (k = 0; k < ITER_O; k++)
        for (j = 0; j < ITER; j++)
          for (i = 0; i < ITER; i++)
            {
              array[i] += j;
              array[i] += i;
              array[i] += 2 * j;
              array[i] += 2 * i;
            }
    }

    When I compile it with gcc -O1 loop.c, the output assembly is as
follows.  It is not optimal because too many nops are inserted in the outer
loops: the padding before the 64-byte-aligned inner loop at 400580 executes
again on every iteration of the enclosing j and k loops (200,000 times
here).

0000000000400540 <loop>:
  400540:	48 83 ec 08          	sub    $0x8,%rsp
  400544:	bf 0a 00 00 00       	mov    $0xa,%edi
  400549:	b9 00 00 00 00       	mov    $0x0,%ecx
  40054e:	8d 34 09             	lea    (%rcx,%rcx,1),%esi
  400551:	b8 00 00 00 00       	mov    $0x0,%eax
  400556:	66 66 2e 0f 1f 84 00 	data16 nopw %cs:0x0(%rax,%rax,1)
  40055d:	00 00 00 00
  400561:	66 66 2e 0f 1f 84 00 	data16 nopw %cs:0x0(%rax,%rax,1)
  400568:	00 00 00 00
  40056c:	66 66 2e 0f 1f 84 00 	data16 nopw %cs:0x0(%rax,%rax,1)
  400573:	00 00 00 00
  400577:	66 0f 1f 84 00 00 00 	nopw   0x0(%rax,%rax,1)
  40057e:	00 00
  400580:	89 ca                	mov    %ecx,%edx
  400582:	03 14 85 60 10 60 00 	add    0x601060(,%rax,4),%edx
  400589:	01 c2                	add    %eax,%edx
  40058b:	01 f2                	add    %esi,%edx
  40058d:	8d 14 42             	lea    (%rdx,%rax,2),%edx
  400590:	89 14 85 60 10 60 00 	mov    %edx,0x601060(,%rax,4)
  400597:	48 83 c0 01          	add    $0x1,%rax
  40059b:	48 3d 20 4e 00 00    	cmp    $0x4e20,%rax
  4005a1:	75 dd                	jne    400580 <loop+0x40>

   I benchmarked this program on an Intel Xeon and found that the alignment
optimization can cause a ~40% performance regression
(6.6B cycles vs. 9.3B cycles).
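
   For reference, a rough sketch of how the cycle counts can be reproduced
(assuming Linux perf is available; exact event names may vary by kernel):

    gcc -O1 loop.c -o loop
    perf stat -e cycles ./loop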
   So I propose adding the -malign-tight-loops flag to control the
tight-loop alignment optimization and avoid this; we could disable the
optimization by default.
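   With the patch applied, the pass can then be toggled per compilation.
Since the option is declared with Var() and without RejectNegative, the
negative form should also be accepted (my reading of the usual .opt
semantics):

    gcc -O1 -malign-tight-loops loop.c      # enable the alignment pass
    gcc -O1 -mno-align-tight-loops loop.c   # disable it (proposed default)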
   Bootstrapped on x86_64.
   OK for trunk?

BR
Mayshao

gcc/ChangeLog:

	* config/i386/i386-features.cc (ix86_align_tight_loops): New flag.
	* config/i386/i386.opt (malign-tight-loops): New option.
	* doc/invoke.texi (-malign-tight-loops): Document.
---
 gcc/config/i386/i386-features.cc | 4 +++-
 gcc/config/i386/i386.opt         | 4 ++++
 gcc/doc/invoke.texi              | 7 ++++++-
 3 files changed, 13 insertions(+), 2 deletions(-)

Comments

Liu, Hongtao Nov. 5, 2024, 6:33 a.m. UTC | #1
> -----Original Message-----
> From: MayShao-oc <MayShao-oc@zhaoxin.com>
> Sent: Tuesday, November 5, 2024 11:20 AM
> To: gcc-patches@gcc.gnu.org; hubicka@ucw.cz; Liu, Hongtao
> <hongtao.liu@intel.com>; ubizjak@gmail.com
> Cc: timhu@zhaoxin.com; silviazhao@zhaoxin.com; louisqi@zhaoxin.com;
> cobechen@zhaoxin.com
> Subject: [PATCH] [x86_64] Add flag to control tight loops alignment opt
> 
> [...]
> 
> +malign-tight-loops
> +Target Var(ix86_align_tight_loops) Init(0) Optimization
> +Enable tight loop alignment.

I'd like it to be on by default, so Init (1)?

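For concreteness, a sketch of that variant (the only change is the Init
value; untested, just illustrating the suggestion):

    malign-tight-loops
    Target Var(ix86_align_tight_loops) Init(1) Optimization
    Enable tight loop alignment.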
Hongtao Liu Nov. 5, 2024, 7:25 a.m. UTC | #2
On Tue, Nov 5, 2024 at 2:34 PM Liu, Hongtao <hongtao.liu@intel.com> wrote:
>
>
> > [...]
>
> >    I benchmark this program in the intel Xeon, and find the optimization may
> > cause a 40% performance regression (6.6B cycles VS 9.3B cycles).
On SPR (Sapphire Rapids), the aligned case is 25% better than the no-align case.


Patch

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index e2e85212a4f..f9546e00b07 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3620,7 +3620,9 @@  public:
   /* opt_pass methods: */
   bool gate (function *) final override
     {
-      return optimize && optimize_function_for_speed_p (cfun);
+      return ix86_align_tight_loops
+	   && optimize
+	   && optimize_function_for_speed_p (cfun);
     }
 
   unsigned int execute (function *) final override
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 64c295d344c..ec41de192bc 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1266,6 +1266,10 @@  mlam=
 Target RejectNegative Joined Enum(lam_type) Var(ix86_lam_type) Init(lam_none)
 -mlam=[none|u48|u57] Instrument meta data position in user data pointers.
 
+malign-tight-loops
+Target Var(ix86_align_tight_loops) Init(0) Optimization
+Enable tight loop alignment.
+
 Enum
 Name(lam_type) Type(enum lam_type) UnknownError(unknown lam type %qs)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 07920e07b4d..9ec1e1f0095 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1510,7 +1510,7 @@  See RS/6000 and PowerPC Options.
 -mindirect-branch=@var{choice}  -mfunction-return=@var{choice}
 -mindirect-branch-register -mharden-sls=@var{choice}
 -mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access
--munroll-only-small-loops -mlam=@var{choice}}
+-munroll-only-small-loops -mlam=@var{choice} -malign-tight-loops}
 
 @emph{x86 Windows Options}
 
@@ -36530,6 +36530,11 @@  LAM(linear-address masking) allows special bits in the pointer to be used
 for metadata. The default is @samp{none}. With @samp{u48}, pointer bits in
 positions 62:48 can be used for metadata; With @samp{u57}, pointer bits in
 positions 62:57 can be used for metadata.
+
+@opindex malign-tight-loops
+@opindex mno-align-tight-loops
+@item -malign-tight-loops
+Control the tight loop alignment optimization (pass_align_tight_loops).
 @end table
 
 @node x86 Windows Options
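
For completeness, a hedged way to check the effect on the example above
(hypothetical session; nop counts and addresses vary by binutils version
and target):

    gcc -O1 -malign-tight-loops loop.c -o loop_aligned
    gcc -O1 -mno-align-tight-loops loop.c -o loop_noalign
    objdump -d loop_aligned | grep -c nop    # padding nops present
    objdump -d loop_noalign | grep -c nop    # fewer/no padding nops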