diff mbox series

GCN: Honor OpenMP 5.1 'num_teams' lower bound (was: [PATCH] libgomp, nvptx, v3: Honor OpenMP 5.1 num_teams lower bound)

Message ID 87y163t340.fsf@euler.schwinge.ddns.net
State New
Headers show
Series GCN: Honor OpenMP 5.1 'num_teams' lower bound (was: [PATCH] libgomp, nvptx, v3: Honor OpenMP 5.1 num_teams lower bound) | expand

Commit Message

Thomas Schwinge July 15, 2024, 9:29 a.m. UTC
Hi!

On 2021-11-12T18:58:04+0100, Jakub Jelinek via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
> And finally here is a third version, [...]

... which became commit 9fa72756d90e0d9edadf6e6f5f56476029925788
"libgomp, nvptx: Honor OpenMP 5.1 num_teams lower bound".

Attached here is "GCN: Honor OpenMP 5.1 'num_teams' lower bound", which
are exactly the corresponding changes for GCN (see below Jakub's nvptx
changes for reference); OK to push?


Grüße
 Thomas


> 2021-11-12  Jakub Jelinek  <jakub@redhat.com>
>
> 	* config/nvptx/team.c (__gomp_team_num): Define as
> 	__attribute__((shared)) var.
> 	(gomp_nvptx_main): Initialize __gomp_team_num to 0.
> 	* config/nvptx/target.c (__gomp_team_num): Declare as
> 	extern __attribute__((shared)) var.
> 	(GOMP_teams4): Use __gomp_team_num as the team number instead of
> 	%ctaid.x.  If first, initialize it to %ctaid.x.  If num_teams_lower
> 	is bigger than num_blocks, use num_teams_lower teams and arrange for
> 	bumping of __gomp_team_num if !first and returning false once we run
> 	out of teams.
> 	* config/nvptx/teams.c (__gomp_team_num): Declare as
> 	extern __attribute__((shared)) var.
> 	(omp_get_team_num): Return __gomp_team_num value instead of %ctaid.x.
>
> --- libgomp/config/nvptx/team.c.jj	2021-05-25 13:43:02.793121350 +0200
> +++ libgomp/config/nvptx/team.c	2021-11-12 17:49:02.847341650 +0100
> @@ -32,6 +32,7 @@
>  #include <string.h>
>  
>  struct gomp_thread *nvptx_thrs __attribute__((shared,nocommon));
> +int __gomp_team_num __attribute__((shared));
>  
>  static void gomp_thread_start (struct gomp_thread_pool *);
>  
> @@ -57,6 +58,7 @@ gomp_nvptx_main (void (*fn) (void *), vo
>        /* Starting additional threads is not supported.  */
>        gomp_global_icv.dyn_var = true;
>  
> +      __gomp_team_num = 0;
>        nvptx_thrs = alloca (ntids * sizeof (*nvptx_thrs));
>        memset (nvptx_thrs, 0, ntids * sizeof (*nvptx_thrs));
>  
> --- libgomp/config/nvptx/target.c.jj	2021-11-12 15:57:29.400632875 +0100
> +++ libgomp/config/nvptx/target.c	2021-11-12 17:47:39.499533296 +0100
> @@ -26,28 +26,41 @@
>  #include "libgomp.h"
>  #include <limits.h>
>  
> +extern int __gomp_team_num __attribute__((shared));
> +
>  bool
>  GOMP_teams4 (unsigned int num_teams_lower, unsigned int num_teams_upper,
>  	     unsigned int thread_limit, bool first)
>  {
> +  unsigned int num_blocks, block_id;
> +  asm ("mov.u32 %0, %%nctaid.x;" : "=r" (num_blocks));
>    if (!first)
> -    return false;
> +    {
> +      unsigned int team_num;
> +      if (num_blocks > gomp_num_teams_var)
> +	return false;
> +      team_num = __gomp_team_num;
> +      if (team_num > gomp_num_teams_var - num_blocks)
> +	return false;
> +      __gomp_team_num = team_num + num_blocks;
> +      return true;
> +    }
>    if (thread_limit)
>      {
>        struct gomp_task_icv *icv = gomp_icv (true);
>        icv->thread_limit_var
>  	= thread_limit > INT_MAX ? UINT_MAX : thread_limit;
>      }
> -  unsigned int num_blocks, block_id;
> -  asm ("mov.u32 %0, %%nctaid.x;" : "=r" (num_blocks));
> -  asm ("mov.u32 %0, %%ctaid.x;" : "=r" (block_id));
> -  /* FIXME: If num_teams_lower > num_blocks, we want to loop multiple
> -     times for some CTAs.  */
> -  (void) num_teams_lower;
> -  if (!num_teams_upper || num_teams_upper >= num_blocks)
> +  if (!num_teams_upper)
>      num_teams_upper = num_blocks;
> -  else if (block_id >= num_teams_upper)
> +  else if (num_blocks < num_teams_lower)
> +    num_teams_upper = num_teams_lower;
> +  else if (num_blocks < num_teams_upper)
> +    num_teams_upper = num_blocks;
> +  asm ("mov.u32 %0, %%ctaid.x;" : "=r" (block_id));
> +  if (block_id >= num_teams_upper)
>      return false;
> +  __gomp_team_num = block_id;
>    gomp_num_teams_var = num_teams_upper - 1;
>    return true;
>  }
> --- libgomp/config/nvptx/teams.c.jj	2021-05-25 13:43:02.793121350 +0200
> +++ libgomp/config/nvptx/teams.c	2021-11-12 17:37:18.933361024 +0100
> @@ -28,6 +28,8 @@
>  
>  #include "libgomp.h"
>  
> +extern int __gomp_team_num __attribute__((shared));
> +
>  void
>  GOMP_teams_reg (void (*fn) (void *), void *data, unsigned int num_teams,
>  		unsigned int thread_limit, unsigned int flags)
> @@ -48,9 +50,7 @@ omp_get_num_teams (void)
>  int
>  omp_get_team_num (void)
>  {
> -  int ctaid;
> -  asm ("mov.u32 %0, %%ctaid.x;" : "=r" (ctaid));
> -  return ctaid;
> +  return __gomp_team_num;
>  }
>  
>  ialias (omp_get_num_teams)
>
>
> 	Jakub

Comments

Andrew Stubbs July 15, 2024, 11:16 a.m. UTC | #1
On 15/07/2024 10:29, Thomas Schwinge wrote:
> Hi!
> 
> On 2021-11-12T18:58:04+0100, Jakub Jelinek via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
>> And finally here is a third version, [...]
> 
> ... which became commit 9fa72756d90e0d9edadf6e6f5f56476029925788
> "libgomp, nvptx: Honor OpenMP 5.1 num_teams lower bound".
> 
> Attached here is "GCN: Honor OpenMP 5.1 'num_teams' lower bound", which
> are exactly the corresponding changes for GCN (see below Jakub's nvptx
> changes for reference); OK to push?
> 
> 

> --- a/libgomp/config/gcn/target.c
> +++ b/libgomp/config/gcn/target.c
> @@ -33,26 +33,37 @@ bool
>  GOMP_teams4 (unsigned int num_teams_lower, unsigned int num_teams_upper,
>  	     unsigned int thread_limit, bool first)
>  {
> +  int __lds *gomp_team_num = (int __lds *) GOMP_TEAM_NUM;
> +  unsigned int num_workgroups = __builtin_gcn_dim_size (0);
>    if (!first)
> -    return false;
> +    {
> +      unsigned int team_num;
> +      if (num_workgroups > gomp_num_teams_var)
> +	return false;
> +      team_num = *gomp_team_num;
> +      if (team_num > gomp_num_teams_var - num_workgroups)
> +	return false;
> +      *gomp_team_num = team_num + num_workgroups;
> +      return true;
> +    }
>    if (thread_limit)
>      {
>        struct gomp_task_icv *icv = gomp_icv (true);
>        icv->thread_limit_var
>  	= thread_limit > INT_MAX ? UINT_MAX : thread_limit;
>      }
> -  unsigned int num_workgroups, workgroup_id;
> -  num_workgroups = __builtin_gcn_dim_size (0);
> -  workgroup_id = __builtin_gcn_dim_pos (0);
> -  /* FIXME: If num_teams_lower > num_workgroups, we want to loop
> -     multiple times at least for some workgroups.  */
> -  (void) num_teams_lower;
> -  if (!num_teams_upper || num_teams_upper >= num_workgroups)
> +  if (!num_teams_upper)
>      num_teams_upper = ((GOMP_ADDITIONAL_ICVS.nteams > 0
>  			&& num_workgroups > GOMP_ADDITIONAL_ICVS.nteams)
>  		       ? GOMP_ADDITIONAL_ICVS.nteams : num_workgroups);
> -  else if (workgroup_id >= num_teams_upper)
> +  else if (num_workgroups < num_teams_lower)
> +    num_teams_upper = num_teams_lower;
> +  else if (num_workgroups < num_teams_upper)
> +    num_teams_upper = num_workgroups;
> +  unsigned int workgroup_id = __builtin_gcn_dim_pos (0);
> +  if (workgroup_id >= num_teams_upper)
>      return false;
> +  *gomp_team_num = workgroup_id;
>    gomp_num_teams_var = num_teams_upper - 1;
>    return true;
>  }


That's a lot of convoluted logic to drop in without a single comment!

The GCN bits look fine, and I assume you've probably thought about the 
logic here a lot, but I've no idea what you're trying to achieve, or why 
you're trying to achieve it (from the patch alone).

Can we have some comments on motivation and goals, please?

Andrew
Thomas Schwinge July 15, 2024, 3:36 p.m. UTC | #2
Hi!

On 2024-07-15T12:16:30+0100, Andrew Stubbs <ams@baylibre.com> wrote:
> On 15/07/2024 10:29, Thomas Schwinge wrote:
>> On 2021-11-12T18:58:04+0100, Jakub Jelinek via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
>>> And finally here is a third version, [...]
>> 
>> ... which became commit 9fa72756d90e0d9edadf6e6f5f56476029925788
>> "libgomp, nvptx: Honor OpenMP 5.1 num_teams lower bound".
>> 
>> Attached here is "GCN: Honor OpenMP 5.1 'num_teams' lower bound", which
>> are exactly the corresponding changes for GCN (see below Jakub's nvptx
>> changes for reference); OK to push?

> That's a lot of convoluted logic to drop in without a single comment!

Well, I'll pass that compliment over to Jakub ;-) -- my code changes just
intend to be a faithful "'s%nvptx%GCN'" of his code changes from back
then.

> The GCN bits look fine, and I assume you've probably thought about the 
> logic here a lot, but I've no idea what you're trying to achieve, or why 
> you're trying to achieve it (from the patch alone).
>
> Can we have some comments on motivation and goals, please?

Here's the original context:

  - <https://inbox.sourceware.org/20211111190313.GV2710@tucnak> "[PATCH] openmp: Honor OpenMP 5.1 num_teams lower bound"
  - <https://inbox.sourceware.org/20211112132023.GC2710@tucnak> "[PATCH] libgomp, nvptx: Honor OpenMP 5.1 num_teams lower bound"

Is that sufficient, and/or would you like to see some commentary to the
relevant libgomp generic/nvptx/GCN code added?


Grüße
 Thomas
Andrew Stubbs July 15, 2024, 4:01 p.m. UTC | #3
On 15/07/2024 16:36, Thomas Schwinge wrote:
> Hi!
> 
> On 2024-07-15T12:16:30+0100, Andrew Stubbs <ams@baylibre.com> wrote:
>> On 15/07/2024 10:29, Thomas Schwinge wrote:
>>> On 2021-11-12T18:58:04+0100, Jakub Jelinek via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
>>>> And finally here is a third version, [...]
>>>
>>> ... which became commit 9fa72756d90e0d9edadf6e6f5f56476029925788
>>> "libgomp, nvptx: Honor OpenMP 5.1 num_teams lower bound".
>>>
>>> Attached here is "GCN: Honor OpenMP 5.1 'num_teams' lower bound", which
>>> are exactly the corresponding changes for GCN (see below Jakub's nvptx
>>> changes for reference); OK to push?
> 
>> That's a lot of convoluted logic to drop in without a single comment!
> 
> Well, I'll pass that compliment over to Jakub ;-) -- my code changes just
> intend to be a faithful "'s%nvptx%GCN'" of his code changes from back
> then.
> 
>> The GCN bits look fine, and I assume you've probably thought about the
>> logic here a lot, but I've no idea what you're trying to achieve, or why
>> you're trying to achieve it (from the patch alone).
>>
>> Can we have some comments on motivation and goals, please?
> 
> Here's the original context:
> 
>    - <https://inbox.sourceware.org/20211111190313.GV2710@tucnak> "[PATCH] openmp: Honor OpenMP 5.1 num_teams lower bound"
>    - <https://inbox.sourceware.org/20211112132023.GC2710@tucnak> "[PATCH] libgomp, nvptx: Honor OpenMP 5.1 num_teams lower bound"
> 
> Is that sufficient, and/or would you like to see some commentary to the
> relevant libgomp generic/nvptx/GCN code added?

Yes, sorry if it wasn't clear; I meant *code* comments.

/* The team number is usually the same as the gcn_dim_pos(0), except 
when num_teams(N) is .....   */

The FIXME actually tells me something useful about one of the 
conditional cases, but that's being removed here.

Also, why are we returning "false" in other cases, and what effect does 
that have? Is that for "spare" teams when we launch more than we need?

Andrew
diff mbox series

Patch

From f078b635f033dcb80ce8cd48de3bf62ad5e285bf Mon Sep 17 00:00:00 2001
From: Thomas Schwinge <tschwinge@baylibre.com>
Date: Mon, 15 Jul 2024 11:19:28 +0200
Subject: [PATCH] GCN: Honor OpenMP 5.1 'num_teams' lower bound

Corresponding to commit 9fa72756d90e0d9edadf6e6f5f56476029925788
"libgomp, nvptx: Honor OpenMP 5.1 num_teams lower bound", these are the
GCN offloading changes to fix:

    PASS: libgomp.c/../libgomp.c-c++-common/teams-2.c (test for excess errors)
    [-FAIL:-]{+PASS:+} libgomp.c/../libgomp.c-c++-common/teams-2.c execution test

    PASS: libgomp.c++/../libgomp.c-c++-common/teams-2.c (test for excess errors)
    [-FAIL:-]{+PASS:+} libgomp.c++/../libgomp.c-c++-common/teams-2.c execution test

..., and omptests' 't-critical' test case.  I've cross checked that those test
cases are the ones that regress for nvptx offloading, if I locally revert the
"libgomp, nvptx: Honor OpenMP 5.1 num_teams lower bound" changes.

	libgomp/
	* config/gcn/libgomp-gcn.h (GOMP_TEAM_NUM): Inject.
	* config/gcn/target.c (GOMP_teams4): Handle.
	* config/gcn/team.c (gomp_gcn_enter_kernel): Initialize.
	* config/gcn/teams.c (omp_get_team_num): Adjust.
---
 libgomp/config/gcn/libgomp-gcn.h |  9 +++++----
 libgomp/config/gcn/target.c      | 29 ++++++++++++++++++++---------
 libgomp/config/gcn/team.c        |  3 +++
 libgomp/config/gcn/teams.c       |  5 +++--
 4 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/libgomp/config/gcn/libgomp-gcn.h b/libgomp/config/gcn/libgomp-gcn.h
index e94f0c7ae68..48a3741b04d 100644
--- a/libgomp/config/gcn/libgomp-gcn.h
+++ b/libgomp/config/gcn/libgomp-gcn.h
@@ -34,10 +34,11 @@ 
 #define DEFAULT_TEAM_ARENA_SIZE (64*1024)
 
 /* These define the LDS location of data needed by OpenMP.  */
-#define TEAM_ARENA_START 16  /* LDS offset of free pointer.  */
-#define TEAM_ARENA_FREE  24  /* LDS offset of free pointer.  */
-#define TEAM_ARENA_END   32  /* LDS offset of end pointer.  */
-#define GCN_LOWLAT_HEAP  40  /* LDS offset of the OpenMP low-latency heap.  */
+#define GOMP_TEAM_NUM    16
+#define TEAM_ARENA_START 24  /* LDS offset of free pointer.  */
+#define TEAM_ARENA_FREE  32  /* LDS offset of free pointer.  */
+#define TEAM_ARENA_END   40  /* LDS offset of end pointer.  */
+#define GCN_LOWLAT_HEAP  48  /* LDS offset of the OpenMP low-latency heap.  */
 
 struct heap
 {
diff --git a/libgomp/config/gcn/target.c b/libgomp/config/gcn/target.c
index 1d4a23cb8d2..e57d2e5f93f 100644
--- a/libgomp/config/gcn/target.c
+++ b/libgomp/config/gcn/target.c
@@ -33,26 +33,37 @@  bool
 GOMP_teams4 (unsigned int num_teams_lower, unsigned int num_teams_upper,
 	     unsigned int thread_limit, bool first)
 {
+  int __lds *gomp_team_num = (int __lds *) GOMP_TEAM_NUM;
+  unsigned int num_workgroups = __builtin_gcn_dim_size (0);
   if (!first)
-    return false;
+    {
+      unsigned int team_num;
+      if (num_workgroups > gomp_num_teams_var)
+	return false;
+      team_num = *gomp_team_num;
+      if (team_num > gomp_num_teams_var - num_workgroups)
+	return false;
+      *gomp_team_num = team_num + num_workgroups;
+      return true;
+    }
   if (thread_limit)
     {
       struct gomp_task_icv *icv = gomp_icv (true);
       icv->thread_limit_var
 	= thread_limit > INT_MAX ? UINT_MAX : thread_limit;
     }
-  unsigned int num_workgroups, workgroup_id;
-  num_workgroups = __builtin_gcn_dim_size (0);
-  workgroup_id = __builtin_gcn_dim_pos (0);
-  /* FIXME: If num_teams_lower > num_workgroups, we want to loop
-     multiple times at least for some workgroups.  */
-  (void) num_teams_lower;
-  if (!num_teams_upper || num_teams_upper >= num_workgroups)
+  if (!num_teams_upper)
     num_teams_upper = ((GOMP_ADDITIONAL_ICVS.nteams > 0
 			&& num_workgroups > GOMP_ADDITIONAL_ICVS.nteams)
 		       ? GOMP_ADDITIONAL_ICVS.nteams : num_workgroups);
-  else if (workgroup_id >= num_teams_upper)
+  else if (num_workgroups < num_teams_lower)
+    num_teams_upper = num_teams_lower;
+  else if (num_workgroups < num_teams_upper)
+    num_teams_upper = num_workgroups;
+  unsigned int workgroup_id = __builtin_gcn_dim_pos (0);
+  if (workgroup_id >= num_teams_upper)
     return false;
+  *gomp_team_num = workgroup_id;
   gomp_num_teams_var = num_teams_upper - 1;
   return true;
 }
diff --git a/libgomp/config/gcn/team.c b/libgomp/config/gcn/team.c
index bd3df448b52..aa68b3abe0b 100644
--- a/libgomp/config/gcn/team.c
+++ b/libgomp/config/gcn/team.c
@@ -68,6 +68,9 @@  gomp_gcn_enter_kernel (void)
       /* Starting additional threads is not supported.  */
       gomp_global_icv.dyn_var = true;
 
+      int __lds *gomp_team_num = (int __lds *) GOMP_TEAM_NUM;
+      *gomp_team_num = 0;
+
       /* Initialize the team arena for optimized memory allocation.
          The arena has been allocated on the host side, and the address
          passed in via the kernargs.  Each team takes a small slice of it.  */
diff --git a/libgomp/config/gcn/teams.c b/libgomp/config/gcn/teams.c
index 8a91ba8f5c1..57404184c89 100644
--- a/libgomp/config/gcn/teams.c
+++ b/libgomp/config/gcn/teams.c
@@ -44,10 +44,11 @@  omp_get_num_teams (void)
   return gomp_num_teams_var + 1;
 }
 
-int __attribute__ ((__optimize__ ("O2")))
+int
 omp_get_team_num (void)
 {
-  return __builtin_gcn_dim_pos (0);
+  int __lds *gomp_team_num = (int __lds *) GOMP_TEAM_NUM;
+  return *gomp_team_num;
 }
 
 ialias (omp_get_num_teams)
-- 
2.34.1