diff mbox series

[2/4] openmp: use offload max_vf for chunk_size

Message ID 20241106152722.2821586-3-ams@baylibre.com
State New
Headers show
Series [1/4] openmp: Tune omp_max_vf for offload targets | expand

Commit Message

Andrew Stubbs Nov. 6, 2024, 3:27 p.m. UTC
The chunk size for SIMD loops should be right for the current device; too big
allocates too much memory, too small is inefficient.  Getting it wrong doesn't
actually break anything though.

This patch attempts to choose the optimal setting based on the context.  Both
host-fallback and device will get the same chunk size, but device performance
is the most important in this case.

gcc/ChangeLog:

	* omp-expand.cc (is_in_offload_region): New function.
	(omp_adjust_chunk_size): Add pass-through "offload" parameter.
	(get_ws_args_for): Likewise.
	(determine_parallel_type): Use is_in_offload_region to adjust call to
	get_ws_args_for.
	(expand_omp_for_generic): Likewise.
	(expand_omp_for_static_chunk): Likewise.
---
 gcc/omp-expand.cc | 36 ++++++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 8 deletions(-)

Comments

Jakub Jelinek Nov. 6, 2024, 3:31 p.m. UTC | #1
On Wed, Nov 06, 2024 at 03:27:20PM +0000, Andrew Stubbs wrote:
> The chunk size for SIMD loops should be right for the current device; too big
> allocates too much memory, too small is inefficient.  Getting it wrong doesn't
> actually break anything though.
> 
> This patch attempts to choose the optimal setting based on the context.  Both
> host-fallback and device will get the same chunk size, but device performance
> is the most important in this case.
> 
> gcc/ChangeLog:
> 
> 	* omp-expand.cc (is_in_offload_region): New function.
> 	(omp_adjust_chunk_size): Add pass-through "offload" parameter.
> 	(get_ws_args_for): Likewise.
> 	(determine_parallel_type): Use is_in_offload_region to adjust call to
> 	get_ws_args_for.
> 	(expand_omp_for_generic): Likewise.
> 	(expand_omp_for_static_chunk): Likewise.

Ok.

	Jakub
diff mbox series

Patch

diff --git a/gcc/omp-expand.cc b/gcc/omp-expand.cc
index 907fd46a5b2..b0f9d375b6c 100644
--- a/gcc/omp-expand.cc
+++ b/gcc/omp-expand.cc
@@ -127,6 +127,23 @@  is_combined_parallel (struct omp_region *region)
   return region->is_combined_parallel;
 }
 
+/* Return true if REGION is or is contained within an offload region.  */
+
+static bool
+is_in_offload_region (struct omp_region *region)
+{
+  gimple *entry_stmt = last_nondebug_stmt (region->entry);
+  if (is_gimple_omp (entry_stmt)
+      && is_gimple_omp_offloaded (entry_stmt))
+    return true;
+  else if (region->outer)
+    return is_in_offload_region (region->outer);
+  else
+    return (lookup_attribute ("omp declare target",
+			      DECL_ATTRIBUTES (current_function_decl))
+	    != NULL);
+}
+
 /* Given two blocks PAR_ENTRY_BB and WS_ENTRY_BB such that WS_ENTRY_BB
    is the immediate dominator of PAR_ENTRY_BB, return true if there
    are no data dependencies that would prevent expanding the parallel
@@ -207,12 +224,12 @@  workshare_safe_to_combine_p (basic_block ws_entry_bb)
    presence (SIMD_SCHEDULE).  */
 
 static tree
-omp_adjust_chunk_size (tree chunk_size, bool simd_schedule)
+omp_adjust_chunk_size (tree chunk_size, bool simd_schedule, bool offload)
 {
   if (!simd_schedule || integer_zerop (chunk_size))
     return chunk_size;
 
-  poly_uint64 vf = omp_max_vf (false);
+  poly_uint64 vf = omp_max_vf (offload);
   if (known_eq (vf, 1U))
     return chunk_size;
 
@@ -228,7 +245,7 @@  omp_adjust_chunk_size (tree chunk_size, bool simd_schedule)
    expanded.  */
 
 static vec<tree, va_gc> *
-get_ws_args_for (gimple *par_stmt, gimple *ws_stmt)
+get_ws_args_for (gimple *par_stmt, gimple *ws_stmt, bool offload)
 {
   tree t;
   location_t loc = gimple_location (ws_stmt);
@@ -270,7 +287,7 @@  get_ws_args_for (gimple *par_stmt, gimple *ws_stmt)
       if (fd.chunk_size)
 	{
 	  t = fold_convert_loc (loc, long_integer_type_node, fd.chunk_size);
-	  t = omp_adjust_chunk_size (t, fd.simd_schedule);
+	  t = omp_adjust_chunk_size (t, fd.simd_schedule, offload);
 	  ws_args->quick_push (t);
 	}
 
@@ -366,7 +383,8 @@  determine_parallel_type (struct omp_region *region)
 
       region->is_combined_parallel = true;
       region->inner->is_combined_parallel = true;
-      region->ws_args = get_ws_args_for (par_stmt, ws_stmt);
+      region->ws_args = get_ws_args_for (par_stmt, ws_stmt,
+					 is_in_offload_region (region));
     }
 }
 
@@ -3929,6 +3947,7 @@  expand_omp_for_generic (struct omp_region *region,
   tree *counts = NULL;
   int i;
   bool ordered_lastprivate = false;
+  bool offload = is_in_offload_region (region);
 
   gcc_assert (!broken_loop || !in_combined_parallel);
   gcc_assert (fd->iter_type == long_integer_type_node
@@ -4196,7 +4215,7 @@  expand_omp_for_generic (struct omp_region *region,
 	  if (fd->chunk_size)
 	    {
 	      t = fold_convert (fd->iter_type, fd->chunk_size);
-	      t = omp_adjust_chunk_size (t, fd->simd_schedule);
+	      t = omp_adjust_chunk_size (t, fd->simd_schedule, offload);
 	      if (sched_arg)
 		{
 		  if (fd->ordered)
@@ -4240,7 +4259,7 @@  expand_omp_for_generic (struct omp_region *region,
 	    {
 	      tree bfn_decl = builtin_decl_explicit (start_fn);
 	      t = fold_convert (fd->iter_type, fd->chunk_size);
-	      t = omp_adjust_chunk_size (t, fd->simd_schedule);
+	      t = omp_adjust_chunk_size (t, fd->simd_schedule, offload);
 	      if (sched_arg)
 		t = build_call_expr (bfn_decl, 10, t5, t0, t1, t2, sched_arg,
 				     t, t3, t4, reductions, mem);
@@ -5937,7 +5956,8 @@  expand_omp_for_static_chunk (struct omp_region *region,
   step = force_gimple_operand_gsi (&gsi, fold_convert (itype, step),
 				   true, NULL_TREE, true, GSI_SAME_STMT);
   tree chunk_size = fold_convert (itype, fd->chunk_size);
-  chunk_size = omp_adjust_chunk_size (chunk_size, fd->simd_schedule);
+  chunk_size = omp_adjust_chunk_size (chunk_size, fd->simd_schedule,
+				      is_in_offload_region (region));
   chunk_size
     = force_gimple_operand_gsi (&gsi, chunk_size, true, NULL_TREE, true,
 				GSI_SAME_STMT);