diff mbox series

[V2] rs6000: Refine unroll factor with target unroll_adjust hook.

Message ID 1572406482-33289-1-git-send-email-guojiufu@linux.ibm.com
State New
Headers show
Series [V2] rs6000: Refine unroll factor with target unroll_adjust hook. | expand

Commit Message

Jiufu Guo Oct. 30, 2019, 3:34 a.m. UTC
Hi,

In this patch, loop unroll adjust hook is introduced for powerpc. In this hook,
we can do target related hueristic adjustment. For this patch, we tunned for
O2 to unroll small loops with small unroll factor (2 times), for other
optimization, default unroll factor is used.

Bootstrapped and regtested on powerpc64le.  Is this ok for trunk?


Jiufu
BR.


gcc/
2019-10-30  Jiufu Guo  <guojiufu@linux.ibm.com>	    

	PR tree-optimization/88760
	* config/rs6000/rs6000.c (rs6000_option_override_internal): Remove
	code which changes PARAM_MAX_UNROLL_TIMES and PARAM_MAX_UNROLLED_INSNS.
	(TARGET_LOOP_UNROLL_ADJUST): Add loop unroll adjust hook.
	(rs6000_loop_unroll_adjust): New hook for loop unroll adjust. Unrolling
	small loop 2 times for -O2.

	
gcc.testsuite/
2019-10-29  Jiufu Guo  <guojiufu@linux.ibm.com>

	PR tree-optimization/88760
	* gcc.dg/pr59643.c: Update back to r277550.
	* gcc.dg/unroll-8.c: Update cases.
	* gcc.dg/var-expand3.c: Update cases.

---
 gcc/config/rs6000/rs6000.c         | 37 ++++++++++++++++++++++++++-----------
 gcc/testsuite/gcc.dg/pr59643.c     |  3 ---
 gcc/testsuite/gcc.dg/unroll-8.c    |  4 ++++
 gcc/testsuite/gcc.dg/var-expand3.c |  2 ++
 4 files changed, 32 insertions(+), 14 deletions(-)

Comments

Richard Biener Oct. 30, 2019, 1:57 p.m. UTC | #1
On Wed, 30 Oct 2019, Jiufu Guo wrote:

> Hi,
> 
> In this patch, loop unroll adjust hook is introduced for powerpc. In this hook,
> we can do target related hueristic adjustment. For this patch, we tunned for
> O2 to unroll small loops with small unroll factor (2 times), for other
> optimization, default unroll factor is used.
> 
> Bootstrapped and regtested on powerpc64le.  Is this ok for trunk?

This looks good to me, needs ppc maintainer approval though.

Thanks,
Richard.

> 
> Jiufu
> BR.
> 
> 
> gcc/
> 2019-10-30  Jiufu Guo  <guojiufu@linux.ibm.com>	    
> 
> 	PR tree-optimization/88760
> 	* config/rs6000/rs6000.c (rs6000_option_override_internal): Remove
> 	code which changes PARAM_MAX_UNROLL_TIMES and PARAM_MAX_UNROLLED_INSNS.
> 	(TARGET_LOOP_UNROLL_ADJUST): Add loop unroll adjust hook.
> 	(rs6000_loop_unroll_adjust): New hook for loop unroll adjust. Unrolling
> 	small loop 2 times for -O2.
> 
> 	
> gcc.testsuite/
> 2019-10-29  Jiufu Guo  <guojiufu@linux.ibm.com>
> 
> 	PR tree-optimization/88760
> 	* gcc.dg/pr59643.c: Update back to r277550.
> 	* gcc.dg/unroll-8.c: Update cases.
> 	* gcc.dg/var-expand3.c: Update cases.
> 
> ---
>  gcc/config/rs6000/rs6000.c         | 37 ++++++++++++++++++++++++++-----------
>  gcc/testsuite/gcc.dg/pr59643.c     |  3 ---
>  gcc/testsuite/gcc.dg/unroll-8.c    |  4 ++++
>  gcc/testsuite/gcc.dg/var-expand3.c |  2 ++
>  4 files changed, 32 insertions(+), 14 deletions(-)
> 
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index 9ed5151..183dceb 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -1428,6 +1428,9 @@ static const struct attribute_spec rs6000_attribute_table[] =
>  #undef TARGET_VECTORIZE_DESTROY_COST_DATA
>  #define TARGET_VECTORIZE_DESTROY_COST_DATA rs6000_destroy_cost_data
>  
> +#undef TARGET_LOOP_UNROLL_ADJUST
> +#define TARGET_LOOP_UNROLL_ADJUST rs6000_loop_unroll_adjust
> +
>  #undef TARGET_INIT_BUILTINS
>  #define TARGET_INIT_BUILTINS rs6000_init_builtins
>  #undef TARGET_BUILTIN_DECL
> @@ -4540,20 +4543,11 @@ rs6000_option_override_internal (bool global_init_p)
>  			     global_options.x_param_values,
>  			     global_options_set.x_param_values);
>  
> -      /* unroll very small loops 2 time if no -funroll-loops.  */
> +      /* If funroll-loops is implicitly enabled, do not turn fweb or
> +	 frename-registers on implicitly.  */
>        if (!global_options_set.x_flag_unroll_loops
>  	  && !global_options_set.x_flag_unroll_all_loops)
>  	{
> -	  maybe_set_param_value (PARAM_MAX_UNROLL_TIMES, 2,
> -				 global_options.x_param_values,
> -				 global_options_set.x_param_values);
> -
> -	  maybe_set_param_value (PARAM_MAX_UNROLLED_INSNS, 20,
> -				 global_options.x_param_values,
> -				 global_options_set.x_param_values);
> -
> -	  /* If fweb or frename-registers are not specificed in command-line,
> -	     do not turn them on implicitly.  */
>  	  if (!global_options_set.x_flag_web)
>  	    global_options.x_flag_web = 0;
>  	  if (!global_options_set.x_flag_rename_registers)
> @@ -5101,6 +5095,27 @@ rs6000_destroy_cost_data (void *data)
>    free (data);
>  }
>  
> +/* This target hook implementation for TARGET_LOOP_UNROLL_ADJUST calculates
> +   a new heristic number struct loop *loop should be unrolled.  */
> +
> +static unsigned
> +rs6000_loop_unroll_adjust (unsigned nunroll, struct loop * loop)
> +{
> +  /* For -O2, we only unroll small loops with small unroll factor.  */
> +  if (optimize == 2)
> +    {
> +      /* If the loop contains few insns, treated it as small loops.
> +	 TODO: Uing 10 hard code for now.  Continue to refine, For example,
> +	 if loop constians only 1-2 insns, we may unroll more times(4).
> +	 And we may use PARAM to control kinds of loop size.  */
> +      if (loop->ninsns <= 10)
> +	return MIN (2, nunroll);
> +      else
> +	return 0;
> +    }
> +  return nunroll;
> +}
> +
>  /* Handler for the Mathematical Acceleration Subsystem (mass) interface to a
>     library with vectorized intrinsics.  */
>  
> diff --git a/gcc/testsuite/gcc.dg/pr59643.c b/gcc/testsuite/gcc.dg/pr59643.c
> index 4446f6e..de78d60 100644
> --- a/gcc/testsuite/gcc.dg/pr59643.c
> +++ b/gcc/testsuite/gcc.dg/pr59643.c
> @@ -1,9 +1,6 @@
>  /* PR tree-optimization/59643 */
>  /* { dg-do compile } */
>  /* { dg-options "-O3 -fdump-tree-pcom-details" } */
> -/* { dg-additional-options "--param max-unrolled-insns=400" { target { powerpc*-*-* } } } */
> -/* Implicit threashold of max-unrolled-insn on ppc at O3 is too small for the
> -   loop of this case.  */
>  
>  void
>  foo (double *a, double *b, double *c, double d, double e, int n)
> diff --git a/gcc/testsuite/gcc.dg/unroll-8.c b/gcc/testsuite/gcc.dg/unroll-8.c
> index b16df67..b1d38a6 100644
> --- a/gcc/testsuite/gcc.dg/unroll-8.c
> +++ b/gcc/testsuite/gcc.dg/unroll-8.c
> @@ -4,6 +4,10 @@ struct a {int a[7];};
>  int t(struct a *a, int n)
>  {
>    int i;
> +
> +  /* Testing unroller message if arrary size is smaller than unroll factor.  */
> +  /* Using pragma unroll to make sure unroll factor is large enough.  */
> +  #pragma GCC unroll 8
>    for (i=0;i<n;i++)
>      a->a[i]++;
>  }
> diff --git a/gcc/testsuite/gcc.dg/var-expand3.c b/gcc/testsuite/gcc.dg/var-expand3.c
> index dce6ec1..d2ca6df 100644
> --- a/gcc/testsuite/gcc.dg/var-expand3.c
> +++ b/gcc/testsuite/gcc.dg/var-expand3.c
> @@ -21,6 +21,8 @@ foo (int n)
>  
>    vaccum = vzero;
>  
> +  /* Using pragma unroll to make sure unroll enough times.  */
> +  #pragma GCC unroll 8
>    for (i = 0; i < n; i++)
>      {
>        vp1 = vec_ld (i * 16, in1);
>
Segher Boessenkool Oct. 30, 2019, 10:07 p.m. UTC | #2
Hi!

On Wed, Oct 30, 2019 at 11:34:42AM +0800, Jiufu Guo wrote:
> In this patch, loop unroll adjust hook is introduced for powerpc. In this hook,
> we can do target related hueristic adjustment. For this patch, we tunned for
> O2 to unroll small loops with small unroll factor (2 times), for other
> optimization, default unroll factor is used.

> 2019-10-30  Jiufu Guo  <guojiufu@linux.ibm.com>	    
> 
> 	PR tree-optimization/88760

I think this should be "target"?  (Change it in bugzilla if you change it
here, of course).

> 	* config/rs6000/rs6000.c (rs6000_option_override_internal): Remove
> 	code which changes PARAM_MAX_UNROLL_TIMES and PARAM_MAX_UNROLLED_INSNS.
> 	(TARGET_LOOP_UNROLL_ADJUST): Add loop unroll adjust hook.
> 	(rs6000_loop_unroll_adjust): New hook for loop unroll adjust. Unrolling
> 	small loop 2 times for -O2.

(Two spaces after a full stop).

> +/* This target hook implementation for TARGET_LOOP_UNROLL_ADJUST calculates
> +   a new heristic number struct loop *loop should be unrolled.  */

"heuristic".

Argument names are written in ALL CAPS, and repeating the type is useless
here (you can see it just a few lines down).  But anyway, everything else
here just says things like

/*  Implement targetm.loop_unroll_adjust.  */

so just do that?

> +static unsigned
> +rs6000_loop_unroll_adjust (unsigned nunroll, struct loop * loop)
> +{
> +  /* For -O2, we only unroll small loops with small unroll factor.  */
> +  if (optimize == 2)
> +    {
> +      /* If the loop contains few insns, treated it as small loops.
> +	 TODO: Uing 10 hard code for now.  Continue to refine, For example,
> +	 if loop constians only 1-2 insns, we may unroll more times(4).
> +	 And we may use PARAM to control kinds of loop size.  */

That first line has no new information.  So maybe something like
      /* TODO: This is hardcoded to 10 right now.  It can be refined, for
	 example we may want to unroll very small loops more times (4 perhaps).
	 We also should use a PARAM for this.  */

Okay for trunk like that.  Thanks!


Segher
Jiufu Guo Oct. 31, 2019, 2:19 a.m. UTC | #3
Segher Boessenkool <segher@kernel.crashing.org> writes:

> Hi!
>
> On Wed, Oct 30, 2019 at 11:34:42AM +0800, Jiufu Guo wrote:
>> In this patch, loop unroll adjust hook is introduced for powerpc. In this hook,
>> we can do target related hueristic adjustment. For this patch, we tunned for
>> O2 to unroll small loops with small unroll factor (2 times), for other
>> optimization, default unroll factor is used.
>
>> 2019-10-30  Jiufu Guo  <guojiufu@linux.ibm.com>	    
>> 
>> 	PR tree-optimization/88760
>
> I think this should be "target"?  (Change it in bugzilla if you change it
> here, of course).
Or should I open a new bug, then PR88760 could be keep for tree-optimization?
>
>> 	* config/rs6000/rs6000.c (rs6000_option_override_internal): Remove
>> 	code which changes PARAM_MAX_UNROLL_TIMES and PARAM_MAX_UNROLLED_INSNS.
>> 	(TARGET_LOOP_UNROLL_ADJUST): Add loop unroll adjust hook.
>> 	(rs6000_loop_unroll_adjust): New hook for loop unroll adjust. Unrolling
>> 	small loop 2 times for -O2.
>
> (Two spaces after a full stop).
>
>> +/* This target hook implementation for TARGET_LOOP_UNROLL_ADJUST calculates
>> +   a new heristic number struct loop *loop should be unrolled.  */
>
> "heuristic". 
>
> Argument names are written in ALL CAPS, and repeating the type is useless
> here (you can see it just a few lines down).  But anyway, everything else
> here just says things like
>
> /*  Implement targetm.loop_unroll_adjust.  */
>
> so just do that?
>
>> +static unsigned
>> +rs6000_loop_unroll_adjust (unsigned nunroll, struct loop * loop)
>> +{
>> +  /* For -O2, we only unroll small loops with small unroll factor.  */
>> +  if (optimize == 2)
>> +    {
>> +      /* If the loop contains few insns, treated it as small loops.
>> +	 TODO: Uing 10 hard code for now.  Continue to refine, For example,
>> +	 if loop constians only 1-2 insns, we may unroll more times(4).
>> +	 And we may use PARAM to control kinds of loop size.  */
>
> That first line has no new information.  So maybe something like
>       /* TODO: This is hardcoded to 10 right now.  It can be refined, for
> 	 example we may want to unroll very small loops more times (4 perhaps).
> 	 We also should use a PARAM for this.  */
Thanks for your so detail and careful suggestions which always help me
to make patch better! 
>
> Okay for trunk like that.  Thanks!
>
>
> Segher
Segher Boessenkool Oct. 31, 2019, 5:19 p.m. UTC | #4
Hi!

On Thu, Oct 31, 2019 at 10:19:49AM +0800, Jiufu Guo wrote:
> Segher Boessenkool <segher@kernel.crashing.org> writes:
> > On Wed, Oct 30, 2019 at 11:34:42AM +0800, Jiufu Guo wrote:
> >> In this patch, loop unroll adjust hook is introduced for powerpc. In this hook,
> >> we can do target related hueristic adjustment. For this patch, we tunned for
> >> O2 to unroll small loops with small unroll factor (2 times), for other
> >> optimization, default unroll factor is used.
> >
> >> 2019-10-30  Jiufu Guo  <guojiufu@linux.ibm.com>	    
> >> 
> >> 	PR tree-optimization/88760
> >
> > I think this should be "target"?  (Change it in bugzilla if you change it
> > here, of course).
> Or should I open a new bug, then PR88760 could be keep for tree-optimization?

Oh oops, I should have looked at the actual PR.  No, just ignore me here,
that would be best :-)  (It's much too late now to usefully split off the
Power part of this into a separate PR, or much too much work).


Segher
diff mbox series

Patch

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 9ed5151..183dceb 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -1428,6 +1428,9 @@  static const struct attribute_spec rs6000_attribute_table[] =
 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
 #define TARGET_VECTORIZE_DESTROY_COST_DATA rs6000_destroy_cost_data
 
+#undef TARGET_LOOP_UNROLL_ADJUST
+#define TARGET_LOOP_UNROLL_ADJUST rs6000_loop_unroll_adjust
+
 #undef TARGET_INIT_BUILTINS
 #define TARGET_INIT_BUILTINS rs6000_init_builtins
 #undef TARGET_BUILTIN_DECL
@@ -4540,20 +4543,11 @@  rs6000_option_override_internal (bool global_init_p)
 			     global_options.x_param_values,
 			     global_options_set.x_param_values);
 
-      /* unroll very small loops 2 time if no -funroll-loops.  */
+      /* If funroll-loops is implicitly enabled, do not turn fweb or
+	 frename-registers on implicitly.  */
       if (!global_options_set.x_flag_unroll_loops
 	  && !global_options_set.x_flag_unroll_all_loops)
 	{
-	  maybe_set_param_value (PARAM_MAX_UNROLL_TIMES, 2,
-				 global_options.x_param_values,
-				 global_options_set.x_param_values);
-
-	  maybe_set_param_value (PARAM_MAX_UNROLLED_INSNS, 20,
-				 global_options.x_param_values,
-				 global_options_set.x_param_values);
-
-	  /* If fweb or frename-registers are not specificed in command-line,
-	     do not turn them on implicitly.  */
 	  if (!global_options_set.x_flag_web)
 	    global_options.x_flag_web = 0;
 	  if (!global_options_set.x_flag_rename_registers)
@@ -5101,6 +5095,27 @@  rs6000_destroy_cost_data (void *data)
   free (data);
 }
 
+/* This target hook implementation for TARGET_LOOP_UNROLL_ADJUST calculates
+   a new heristic number struct loop *loop should be unrolled.  */
+
+static unsigned
+rs6000_loop_unroll_adjust (unsigned nunroll, struct loop * loop)
+{
+  /* For -O2, we only unroll small loops with small unroll factor.  */
+  if (optimize == 2)
+    {
+      /* If the loop contains few insns, treated it as small loops.
+	 TODO: Uing 10 hard code for now.  Continue to refine, For example,
+	 if loop constians only 1-2 insns, we may unroll more times(4).
+	 And we may use PARAM to control kinds of loop size.  */
+      if (loop->ninsns <= 10)
+	return MIN (2, nunroll);
+      else
+	return 0;
+    }
+  return nunroll;
+}
+
 /* Handler for the Mathematical Acceleration Subsystem (mass) interface to a
    library with vectorized intrinsics.  */
 
diff --git a/gcc/testsuite/gcc.dg/pr59643.c b/gcc/testsuite/gcc.dg/pr59643.c
index 4446f6e..de78d60 100644
--- a/gcc/testsuite/gcc.dg/pr59643.c
+++ b/gcc/testsuite/gcc.dg/pr59643.c
@@ -1,9 +1,6 @@ 
 /* PR tree-optimization/59643 */
 /* { dg-do compile } */
 /* { dg-options "-O3 -fdump-tree-pcom-details" } */
-/* { dg-additional-options "--param max-unrolled-insns=400" { target { powerpc*-*-* } } } */
-/* Implicit threashold of max-unrolled-insn on ppc at O3 is too small for the
-   loop of this case.  */
 
 void
 foo (double *a, double *b, double *c, double d, double e, int n)
diff --git a/gcc/testsuite/gcc.dg/unroll-8.c b/gcc/testsuite/gcc.dg/unroll-8.c
index b16df67..b1d38a6 100644
--- a/gcc/testsuite/gcc.dg/unroll-8.c
+++ b/gcc/testsuite/gcc.dg/unroll-8.c
@@ -4,6 +4,10 @@  struct a {int a[7];};
 int t(struct a *a, int n)
 {
   int i;
+
+  /* Testing unroller message if arrary size is smaller than unroll factor.  */
+  /* Using pragma unroll to make sure unroll factor is large enough.  */
+  #pragma GCC unroll 8
   for (i=0;i<n;i++)
     a->a[i]++;
 }
diff --git a/gcc/testsuite/gcc.dg/var-expand3.c b/gcc/testsuite/gcc.dg/var-expand3.c
index dce6ec1..d2ca6df 100644
--- a/gcc/testsuite/gcc.dg/var-expand3.c
+++ b/gcc/testsuite/gcc.dg/var-expand3.c
@@ -21,6 +21,8 @@  foo (int n)
 
   vaccum = vzero;
 
+  /* Using pragma unroll to make sure unroll enough times.  */
+  #pragma GCC unroll 8
   for (i = 0; i < n; i++)
     {
       vp1 = vec_ld (i * 16, in1);