diff mbox series

[1/4] openmp: Tune omp_max_vf for offload targets

Message ID 20241106152722.2821586-2-ams@baylibre.com
State New
Headers show
Series [1/4] openmp: Tune omp_max_vf for offload targets | expand

Commit Message

Andrew Stubbs Nov. 6, 2024, 3:27 p.m. UTC
If requested, return the vectorization factor appropriate for the offload
device, if any.

This change gives a significant speedup in the BabelStream "dot" benchmark on
amdgcn.

The omp_adjust_chunk_size usecase is set "false", for now, but I intend to
change that in a follow-up patch.

Note that NVPTX SIMT offload does not use this code-path.

gcc/ChangeLog:

	* gimple-loop-versioning.cc (loop_versioning::loop_versioning): Set
	omp_max_vf to offload == false.
	* omp-expand.cc (omp_adjust_chunk_size): Likewise.
	* omp-general.cc (omp_max_vf): Add "offload" parameter, and detect
	amdgcn offload devices.
	* omp-general.h (omp_max_vf): Likewise.
	* omp-low.cc (lower_rec_simd_input_clauses): Pass offload state to
	omp_max_vf.
---
 gcc/gimple-loop-versioning.cc |  2 +-
 gcc/omp-expand.cc             |  2 +-
 gcc/omp-general.cc            | 17 +++++++++++++++--
 gcc/omp-general.h             |  2 +-
 gcc/omp-low.cc                |  3 ++-
 5 files changed, 20 insertions(+), 6 deletions(-)

Comments

Jakub Jelinek Nov. 6, 2024, 3:30 p.m. UTC | #1
On Wed, Nov 06, 2024 at 03:27:19PM +0000, Andrew Stubbs wrote:
> If requested, return the vectorization factor appropriate for the offload
> device, if any.
> 
> This change gives a significant speedup in the BabelStream "dot" benchmark on
> amdgcn.
> 
> The omp_adjust_chunk_size usecase is set "false", for now, but I intend to
> change that in a follow-up patch.
> 
> Note that NVPTX SIMT offload does not use this code-path.
> 
> gcc/ChangeLog:
> 
> 	* gimple-loop-versioning.cc (loop_versioning::loop_versioning): Set
> 	omp_max_vf to offload == false.
> 	* omp-expand.cc (omp_adjust_chunk_size): Likewise.
> 	* omp-general.cc (omp_max_vf): Add "offload" parameter, and detect
> 	amdgcn offload devices.
> 	* omp-general.h (omp_max_vf): Likewise.
> 	* omp-low.cc (lower_rec_simd_input_clauses): Pass offload state to
> 	omp_max_vf.

Ok.

	Jakub
Andrew Pinski Nov. 6, 2024, 5:38 p.m. UTC | #2
On Wed, Nov 6, 2024 at 7:28 AM Andrew Stubbs <ams@baylibre.com> wrote:
>
> If requested, return the vectorization factor appropriate for the offload
> device, if any.
>
> This change gives a significant speedup in the BabelStream "dot" benchmark on
> amdgcn.
>
> The omp_adjust_chunk_size usecase is set "false", for now, but I intend to
> change that in a follow-up patch.
>
> Note that NVPTX SIMT offload does not use this code-path.
>
> gcc/ChangeLog:
>
>         * gimple-loop-versioning.cc (loop_versioning::loop_versioning): Set
>         omp_max_vf to offload == false.
>         * omp-expand.cc (omp_adjust_chunk_size): Likewise.
>         * omp-general.cc (omp_max_vf): Add "offload" parameter, and detect
>         amdgcn offload devices.
>         * omp-general.h (omp_max_vf): Likewise.
>         * omp-low.cc (lower_rec_simd_input_clauses): Pass offload state to
>         omp_max_vf.
> ---
>  gcc/gimple-loop-versioning.cc |  2 +-
>  gcc/omp-expand.cc             |  2 +-
>  gcc/omp-general.cc            | 17 +++++++++++++++--
>  gcc/omp-general.h             |  2 +-
>  gcc/omp-low.cc                |  3 ++-
>  5 files changed, 20 insertions(+), 6 deletions(-)
>
> diff --git a/gcc/gimple-loop-versioning.cc b/gcc/gimple-loop-versioning.cc
> index 107b0020024..2968c929d04 100644
> --- a/gcc/gimple-loop-versioning.cc
> +++ b/gcc/gimple-loop-versioning.cc
> @@ -554,7 +554,7 @@ loop_versioning::loop_versioning (function *fn)
>       handled efficiently by scalar code.  omp_max_vf calculates the
>       maximum number of bytes in a vector, when such a value is relevant
>       to loop optimization.  */
> -  m_maximum_scale = estimated_poly_value (omp_max_vf ());
> +  m_maximum_scale = estimated_poly_value (omp_max_vf (false));
>    m_maximum_scale = MAX (m_maximum_scale, MAX_FIXED_MODE_SIZE);
>  }
>
> diff --git a/gcc/omp-expand.cc b/gcc/omp-expand.cc
> index b0b4ddf5dbc..907fd46a5b2 100644
> --- a/gcc/omp-expand.cc
> +++ b/gcc/omp-expand.cc
> @@ -212,7 +212,7 @@ omp_adjust_chunk_size (tree chunk_size, bool simd_schedule)
>    if (!simd_schedule || integer_zerop (chunk_size))
>      return chunk_size;
>
> -  poly_uint64 vf = omp_max_vf ();
> +  poly_uint64 vf = omp_max_vf (false);
>    if (known_eq (vf, 1U))
>      return chunk_size;
>
> diff --git a/gcc/omp-general.cc b/gcc/omp-general.cc
> index f74b9bf5e96..1ae575ee181 100644
> --- a/gcc/omp-general.cc
> +++ b/gcc/omp-general.cc
> @@ -987,10 +987,11 @@ find_combined_omp_for (tree *tp, int *walk_subtrees, void *data)
>    return NULL_TREE;
>  }
>
> -/* Return maximum possible vectorization factor for the target.  */
> +/* Return maximum possible vectorization factor for the target, or for
> +   the OpenMP offload target if one exists.  */
>
>  poly_uint64
> -omp_max_vf (void)
> +omp_max_vf (bool offload)
>  {
>    if (!optimize
>        || optimize_debug
> @@ -999,6 +1000,18 @@ omp_max_vf (void)
>           && OPTION_SET_P (flag_tree_loop_vectorize)))
>      return 1;
>
> +  if (ENABLE_OFFLOADING && offload)
> +    {
> +      for (const char *c = getenv ("OFFLOAD_TARGET_NAMES"); c;)
> +       {
> +         if (startswith (c, "amdgcn"))
> +           return ordered_max (64, omp_max_vf (false));

This causes a bootstrap failure for me (and others) on x86_64-linux-gnu:

In file included from ../../src-master/gcc/coretypes.h:497,
                 from ../../src-master/gcc/omp-general.cc:25:
../../src-master/gcc/poly-int.h: In instantiation of ‘typename
if_nonpoly<Cb, bool>::type maybe_lt(const poly_int<N, C>&, const Cb&)
[with unsigned int N = 1; Ca = long unsigned int; Cb = int; typename
if_nonpoly<Cb, bool>::type = bool]’:
../../src-master/gcc/poly-int.h:1440:7:   required from ‘poly_int<N,
typename poly_result<typename if_nonpoly<Ca>::type, Cb>::type>
ordered_max(const Ca&, const poly_int<N, Cb>&) [with unsigned int N =
1; Ca = int; Cb = long unsigned int; typename poly_result<typename
if_nonpoly<Ca>::type, Cb>::type = long unsigned int; typename
if_nonpoly<Ca>::type = int]’
 1342 | #define maybe_gt(A, B) maybe_lt (B, A)
      |                        ~~~~~~~~~^~~~~~
../../src-master/gcc/omp-general.cc:1008:25:   required from here
 1008 |             return ordered_max (64, omp_max_vf (false));
      |                    ~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
../../src-master/gcc/poly-int.h:1317:22: error: comparison of integer
expressions of different signedness: ‘const long unsigned int’ and
‘const int’ [-Werror=sign-compare]
 1317 |   return a.coeffs[0] < b;
      |          ~~~~~~~~~~~~^~~
../../src-master/gcc/poly-int.h: In instantiation of ‘typename
if_nonpoly<Ca, bool>::type maybe_lt(const Ca&, const poly_int<N, Cb>&)
[with unsigned int N = 1; Ca = int; Cb = long unsigned int; typename
if_nonpoly<Ca, bool>::type = bool]’:
../../src-master/gcc/poly-int.h:1445:2:   required from ‘poly_int<N,
typename poly_result<typename if_nonpoly<Ca>::type, Cb>::type>
ordered_max(const Ca&, const poly_int<N, Cb>&) [with unsigned int N =
1; Ca = int; Cb = long unsigned int; typename poly_result<typename
if_nonpoly<Ca>::type, Cb>::type = long unsigned int; typename
if_nonpoly<Ca>::type = int]’
 1342 | #define maybe_gt(A, B) maybe_lt (B, A)
      |                        ~~~~~~~~~^~~~~~
../../src-master/gcc/omp-general.cc:1008:25:   required from here
 1008 |             return ordered_max (64, omp_max_vf (false));
      |                    ~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
../../src-master/gcc/poly-int.h:1328:12: error: comparison of integer
expressions of different signedness: ‘const int’ and ‘const long
unsigned int’ [-Werror=sign-compare]
 1328 |   return a < b.coeffs[0];
      |          ~~^~~~~~~~~~~

Thanks,
Andrew Pinski


> +         else if ((c = strchr (c, ':')))
> +           c++;
> +       }
> +      /* Otherwise, fall through to host VF.  */
> +    }
> +
>    auto_vector_modes modes;
>    targetm.vectorize.autovectorize_vector_modes (&modes, true);
>    if (!modes.is_empty ())
> diff --git a/gcc/omp-general.h b/gcc/omp-general.h
> index f3778131626..70f78d2055b 100644
> --- a/gcc/omp-general.h
> +++ b/gcc/omp-general.h
> @@ -162,7 +162,7 @@ extern void omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd,
>                                   struct omp_for_data_loop *loops);
>  extern gimple *omp_build_barrier (tree lhs);
>  extern tree find_combined_omp_for (tree *, int *, void *);
> -extern poly_uint64 omp_max_vf (void);
> +extern poly_uint64 omp_max_vf (bool);
>  extern int omp_max_simt_vf (void);
>  extern const char *omp_context_name_list_prop (tree);
>  extern void omp_construct_traits_to_codes (tree, int, enum tree_code *);
> diff --git a/gcc/omp-low.cc b/gcc/omp-low.cc
> index 44c4310075b..70a2c108fbc 100644
> --- a/gcc/omp-low.cc
> +++ b/gcc/omp-low.cc
> @@ -4589,7 +4589,8 @@ lower_rec_simd_input_clauses (tree new_var, omp_context *ctx,
>  {
>    if (known_eq (sctx->max_vf, 0U))
>      {
> -      sctx->max_vf = sctx->is_simt ? omp_max_simt_vf () : omp_max_vf ();
> +      sctx->max_vf = (sctx->is_simt ? omp_max_simt_vf ()
> +                     : omp_max_vf (omp_maybe_offloaded_ctx (ctx)));
>        if (maybe_gt (sctx->max_vf, 1U))
>         {
>           tree c = omp_find_clause (gimple_omp_for_clauses (ctx->stmt),
> --
> 2.46.0
>
Jakub Jelinek Nov. 6, 2024, 5:52 p.m. UTC | #3
On Wed, Nov 06, 2024 at 09:38:21AM -0800, Andrew Pinski wrote:
> > +      for (const char *c = getenv ("OFFLOAD_TARGET_NAMES"); c;)
> > +       {
> > +         if (startswith (c, "amdgcn"))
> > +           return ordered_max (64, omp_max_vf (false));
> 
> This causes a bootstrap failure for me (and others) on x86_64-linux-gnu:
> 
> In file included from ../../src-master/gcc/coretypes.h:497,
>                  from ../../src-master/gcc/omp-general.cc:25:
> ../../src-master/gcc/poly-int.h: In instantiation of ‘typename
> if_nonpoly<Cb, bool>::type maybe_lt(const poly_int<N, C>&, const Cb&)
> [with unsigned int N = 1; Ca = long unsigned int; Cb = int; typename
> if_nonpoly<Cb, bool>::type = bool]’:
> ../../src-master/gcc/poly-int.h:1440:7:   required from ‘poly_int<N,
> typename poly_result<typename if_nonpoly<Ca>::type, Cb>::type>
> ordered_max(const Ca&, const poly_int<N, Cb>&) [with unsigned int N =
> 1; Ca = int; Cb = long unsigned int; typename poly_result<typename
> if_nonpoly<Ca>::type, Cb>::type = long unsigned int; typename
> if_nonpoly<Ca>::type = int]’
>  1342 | #define maybe_gt(A, B) maybe_lt (B, A)
>       |                        ~~~~~~~~~^~~~~~
> ../../src-master/gcc/omp-general.cc:1008:25:   required from here
>  1008 |             return ordered_max (64, omp_max_vf (false));
>       |                    ~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
> ../../src-master/gcc/poly-int.h:1317:22: error: comparison of integer
> expressions of different signedness: ‘const long unsigned int’ and
> ‘const int’ [-Werror=sign-compare]
>  1317 |   return a.coeffs[0] < b;
>       |          ~~~~~~~~~~~~^~~
> ../../src-master/gcc/poly-int.h: In instantiation of ‘typename
> if_nonpoly<Ca, bool>::type maybe_lt(const Ca&, const poly_int<N, Cb>&)
> [with unsigned int N = 1; Ca = int; Cb = long unsigned int; typename
> if_nonpoly<Ca, bool>::type = bool]’:
> ../../src-master/gcc/poly-int.h:1445:2:   required from ‘poly_int<N,
> typename poly_result<typename if_nonpoly<Ca>::type, Cb>::type>
> ordered_max(const Ca&, const poly_int<N, Cb>&) [with unsigned int N =
> 1; Ca = int; Cb = long unsigned int; typename poly_result<typename
> if_nonpoly<Ca>::type, Cb>::type = long unsigned int; typename
> if_nonpoly<Ca>::type = int]’
>  1342 | #define maybe_gt(A, B) maybe_lt (B, A)
>       |                        ~~~~~~~~~^~~~~~
> ../../src-master/gcc/omp-general.cc:1008:25:   required from here
>  1008 |             return ordered_max (64, omp_max_vf (false));
>       |                    ~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
> ../../src-master/gcc/poly-int.h:1328:12: error: comparison of integer
> expressions of different signedness: ‘const int’ and ‘const long
> unsigned int’ [-Werror=sign-compare]
>  1328 |   return a < b.coeffs[0];
>       |          ~~^~~~~~~~~~~

Guess we need ordered_max (uint64_t (64), omp_max_vf (false))
or something similar.

	Jakub
Andrew Stubbs Nov. 6, 2024, 5:53 p.m. UTC | #4
On 06/11/2024 17:38, Andrew Pinski wrote:
>> +  if (ENABLE_OFFLOADING && offload)
>> +    {
>> +      for (const char *c = getenv ("OFFLOAD_TARGET_NAMES"); c;)
>> +       {
>> +         if (startswith (c, "amdgcn"))
>> +           return ordered_max (64, omp_max_vf (false));
> 
> This causes a bootstrap failure for me (and others) on x86_64-linux-gnu:

I'm not sure why I didn't see this.

I'm testing the attached patch.

Andrew
Jakub Jelinek Nov. 6, 2024, 5:59 p.m. UTC | #5
On Wed, Nov 06, 2024 at 05:53:53PM +0000, Andrew Stubbs wrote:
> I'm not sure why I didn't see this.

Was it bootstrap tested or just built without bootstrap + tested?
Otherwise it is just a warning.

> I'm testing the attached patch.

If it makes it to stage3, this is ok for trunk.
Just 64U would most likely work as well (at least I don't get any warnings
in that case).

> From 345eb9b795d9728733bd0e472529e259ce796ff6 Mon Sep 17 00:00:00 2001
> From: Andrew Stubbs <ams@baylibre.com>
> Date: Wed, 6 Nov 2024 17:50:00 +0000
> Subject: [PATCH] openmp: Fix signed/unsigned warning
> 
> My previous patch broke things when building with Werror.
> 
> gcc/ChangeLog:
> 
> 	* omp-general.cc (omp_max_vf): Cast the constant to poly_uint64.
> ---
>  gcc/omp-general.cc | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/gcc/omp-general.cc b/gcc/omp-general.cc
> index 1ae575ee181..72fb7f92ff7 100644
> --- a/gcc/omp-general.cc
> +++ b/gcc/omp-general.cc
> @@ -1005,7 +1005,7 @@ omp_max_vf (bool offload)
>        for (const char *c = getenv ("OFFLOAD_TARGET_NAMES"); c;)
>  	{
>  	  if (startswith (c, "amdgcn"))
> -	    return ordered_max (64, omp_max_vf (false));
> +	    return ordered_max (poly_uint64 (64), omp_max_vf (false));
>  	  else if ((c = strchr (c, ':')))
>  	    c++;
>  	}

	Jakub
Andrew Stubbs Nov. 6, 2024, 6:19 p.m. UTC | #6
On 06/11/2024 17:59, Jakub Jelinek wrote:
> On Wed, Nov 06, 2024 at 05:53:53PM +0000, Andrew Stubbs wrote:
>> I'm not sure why I didn't see this.
> 
> Was it bootstrap tested or just built without bootstrap + tested?
> Otherwise it is just a warning.

Apparently I forgot to rerun the bootstrap after making the changes you 
requested. Sorry, everyone. :-(

>> I'm testing the attached patch.
> 
> If it makes it to stage3, this is ok for trunk.
> Just 64U would most likely work as well (at least I don't get any warnings
> in that case).

Thanks, pushed now.

Andrew
diff mbox series

Patch

diff --git a/gcc/gimple-loop-versioning.cc b/gcc/gimple-loop-versioning.cc
index 107b0020024..2968c929d04 100644
--- a/gcc/gimple-loop-versioning.cc
+++ b/gcc/gimple-loop-versioning.cc
@@ -554,7 +554,7 @@  loop_versioning::loop_versioning (function *fn)
      handled efficiently by scalar code.  omp_max_vf calculates the
      maximum number of bytes in a vector, when such a value is relevant
      to loop optimization.  */
-  m_maximum_scale = estimated_poly_value (omp_max_vf ());
+  m_maximum_scale = estimated_poly_value (omp_max_vf (false));
   m_maximum_scale = MAX (m_maximum_scale, MAX_FIXED_MODE_SIZE);
 }
 
diff --git a/gcc/omp-expand.cc b/gcc/omp-expand.cc
index b0b4ddf5dbc..907fd46a5b2 100644
--- a/gcc/omp-expand.cc
+++ b/gcc/omp-expand.cc
@@ -212,7 +212,7 @@  omp_adjust_chunk_size (tree chunk_size, bool simd_schedule)
   if (!simd_schedule || integer_zerop (chunk_size))
     return chunk_size;
 
-  poly_uint64 vf = omp_max_vf ();
+  poly_uint64 vf = omp_max_vf (false);
   if (known_eq (vf, 1U))
     return chunk_size;
 
diff --git a/gcc/omp-general.cc b/gcc/omp-general.cc
index f74b9bf5e96..1ae575ee181 100644
--- a/gcc/omp-general.cc
+++ b/gcc/omp-general.cc
@@ -987,10 +987,11 @@  find_combined_omp_for (tree *tp, int *walk_subtrees, void *data)
   return NULL_TREE;
 }
 
-/* Return maximum possible vectorization factor for the target.  */
+/* Return maximum possible vectorization factor for the target, or for
+   the OpenMP offload target if one exists.  */
 
 poly_uint64
-omp_max_vf (void)
+omp_max_vf (bool offload)
 {
   if (!optimize
       || optimize_debug
@@ -999,6 +1000,18 @@  omp_max_vf (void)
 	  && OPTION_SET_P (flag_tree_loop_vectorize)))
     return 1;
 
+  if (ENABLE_OFFLOADING && offload)
+    {
+      for (const char *c = getenv ("OFFLOAD_TARGET_NAMES"); c;)
+	{
+	  if (startswith (c, "amdgcn"))
+	    return ordered_max (64, omp_max_vf (false));
+	  else if ((c = strchr (c, ':')))
+	    c++;
+	}
+      /* Otherwise, fall through to host VF.  */
+    }
+
   auto_vector_modes modes;
   targetm.vectorize.autovectorize_vector_modes (&modes, true);
   if (!modes.is_empty ())
diff --git a/gcc/omp-general.h b/gcc/omp-general.h
index f3778131626..70f78d2055b 100644
--- a/gcc/omp-general.h
+++ b/gcc/omp-general.h
@@ -162,7 +162,7 @@  extern void omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd,
 				  struct omp_for_data_loop *loops);
 extern gimple *omp_build_barrier (tree lhs);
 extern tree find_combined_omp_for (tree *, int *, void *);
-extern poly_uint64 omp_max_vf (void);
+extern poly_uint64 omp_max_vf (bool);
 extern int omp_max_simt_vf (void);
 extern const char *omp_context_name_list_prop (tree);
 extern void omp_construct_traits_to_codes (tree, int, enum tree_code *);
diff --git a/gcc/omp-low.cc b/gcc/omp-low.cc
index 44c4310075b..70a2c108fbc 100644
--- a/gcc/omp-low.cc
+++ b/gcc/omp-low.cc
@@ -4589,7 +4589,8 @@  lower_rec_simd_input_clauses (tree new_var, omp_context *ctx,
 {
   if (known_eq (sctx->max_vf, 0U))
     {
-      sctx->max_vf = sctx->is_simt ? omp_max_simt_vf () : omp_max_vf ();
+      sctx->max_vf = (sctx->is_simt ? omp_max_simt_vf ()
+		      : omp_max_vf (omp_maybe_offloaded_ctx (ctx)));
       if (maybe_gt (sctx->max_vf, 1U))
 	{
 	  tree c = omp_find_clause (gimple_omp_for_clauses (ctx->stmt),