diff mbox series

nvptx: Make 'nvptx_uniform_warp_check' fit for non-full-warp execution, via 'vote.all.pred' (was: nvptx: Make 'nvptx_uniform_warp_check' fit for non-full-warp execution (was: [committed][nvptx] Add uniform_warp_check insn))

Message ID 87v82oeb7f.fsf@euler.schwinge.ddns.net
State New
Headers show
Series nvptx: Make 'nvptx_uniform_warp_check' fit for non-full-warp execution, via 'vote.all.pred' (was: nvptx: Make 'nvptx_uniform_warp_check' fit for non-full-warp execution (was: [committed][nvptx] Add uniform_warp_check insn)) | expand

Commit Message

Thomas Schwinge June 4, 2024, 7:53 p.m. UTC
Hi!

On 2022-12-15T19:27:08+0100, I wrote:
> First "a bit" of context; skip to "the proposed patch" if you'd like to
> see just that.

I'm not providing all the context again here; see the previous email if
necessary.

> My following discussion is about the implementation of
> 'nvptx_uniform_warp_check', originally introduced as follows:
>
> On 2022-02-01T19:31:27+0100, Tom de Vries via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
>> --- a/gcc/config/nvptx/nvptx.md
>> +++ b/gcc/config/nvptx/nvptx.md

>> +(define_insn "nvptx_uniform_warp_check"
>> +  [(unspec_volatile [(const_int 0)] UNSPECV_UNIFORM_WARP_CHECK)]
>> +  ""
>> +  {
>> +    output_asm_insn ("{", NULL);
>> +    output_asm_insn ("\\t"	 ".reg.b32"	   "\\t" "act;", NULL);
>> +    output_asm_insn ("\\t"	 "vote.ballot.b32" "\\t" "act,1;", NULL);
>> +    output_asm_insn ("\\t"	 ".reg.pred"	   "\\t" "uni;", NULL);
>> +    output_asm_insn ("\\t"	 "setp.eq.b32"	   "\\t" "uni,act,0xffffffff;",
>> +		     NULL);
>> +    output_asm_insn ("@ !uni\\t" "trap;", NULL);
>> +    output_asm_insn ("@ !uni\\t" "exit;", NULL);
>> +    output_asm_insn ("}", NULL);
>> +    return "";
>> +  }
>> +  [(set_attr "predicable" "false")])
>
> Later adjusted, but the fundamental idea is still the same.

> Now, "the proposed patch".  I'd like to make 'nvptx_uniform_warp_check'
> fit for non-full-warp execution.  For example, to be able to execute such
> code in single-threaded 'cuLaunchKernel' for execution of global
> constructors/destructors, where those may, for example, call into nvptx
> target libraries compiled with '-mgomp' (thus, '-muniform-simt').
>
> OK to push (after proper testing, and with TODO markers adjusted/removed)
> the attached
> "nvptx: Make 'nvptx_uniform_warp_check' fit for non-full-warp execution"?

> --- a/gcc/config/nvptx/nvptx.md
> +++ b/gcc/config/nvptx/nvptx.md
> @@ -2282,10 +2282,24 @@
>        "{",
>        "\\t"		  ".reg.b32"	    "\\t" "%%r_act;",
>        "%.\\t"		  "vote.ballot.b32" "\\t" "%%r_act,1;",
> +      /* For '%r_exp', we essentially need 'activemask.b32', but that is "Introduced in PTX ISA version 6.2", and this code here is used only 'if (!TARGET_PTX_6_0)'.  Thus, emulate it.
> +         TODO Is that actually correct?  Wouldn't 'activemask.b32' rather replace our 'vote.ballot.b32' given that it registers the *currently active threads*?  */
> +      /* Compute the "membermask" of all threads of the warp that are expected to be converged here.
> +      	 For OpenACC, '%ntid.x' is 'vector_length', which per 'nvptx_goacc_validate_dims' always is a multiple of 32.
> +	 For OpenMP, '%ntid.x' always is 32.
> +      	 Thus, this is typically 0xffffffff, but additionally always for the case that not all 32 threads of the warp have been launched.
> +	 This assume that lane IDs are assigned in ascending order.  */
> +      //TODO Can we rely on '1 << 32 == 0', and '0 - 1 = 0xffffffff'?
> +      //TODO https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/
> +      //TODO https://stackoverflow.com/questions/54055195/activemask-vs-ballot-sync
> +      "\\t"		  ".reg.b32"	    "\\t" "%%r_exp;",
> +      "%.\\t"		  "mov.b32"	    "\\t" "%%r_exp, %%ntid.x;",
> +      "%.\\t"		  "shl.b32"	    "\\t" "%%r_exp, 1, %%r_exp;",
> +      "%.\\t"		  "sub.u32"	    "\\t" "%%r_exp, %%r_exp, 1;",
>        "\\t"		  ".reg.pred"	    "\\t" "%%r_do_abort;",
>        "\\t"		  "mov.pred"	    "\\t" "%%r_do_abort,0;",
>        "%.\\t"		  "setp.ne.b32"	    "\\t" "%%r_do_abort,%%r_act,"
> -						  "0xffffffff;",
> +						  "%%r_exp;",
>        "@ %%r_do_abort\\t" "trap;",
>        "@ %%r_do_abort\\t" "exit;",
>        "}",

Turns out, there is a simpler way, via 'vote.all.pred'.  :-)

Unless there are any comments, I intend to soon push the attached
"nvptx: Make 'nvptx_uniform_warp_check' fit for non-full-warp execution, via 'vote.all.pred'".


Grüße
 Thomas
diff mbox series

Patch

From f7f4a20ca14761d39822e9d79cb3ac711df45b90 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge <tschwinge@baylibre.com>
Date: Fri, 10 May 2024 12:50:23 +0200
Subject: [PATCH] nvptx: Make 'nvptx_uniform_warp_check' fit for non-full-warp
 execution, via 'vote.all.pred'

For example, this allows for '-muniform-simt' code to be executed
single-threaded, which currently fails (device-side 'trap'): the '0xffffffff'
bitmask isn't correct if not all 32 threads of a warp are active.  The same
issue/fix, I suppose but have not verified, would apply if we were to allow for
OpenACC 'vector_length' smaller than 32, for example for OpenACC 'serial'.

We use 'nvptx_uniform_warp_check' only for PTX ISA version less than 6.0.
Otherwise we're using 'nvptx_warpsync', which emits 'bar.warp.sync 0xffffffff',
which evidently does the right thing.  (I've tested '-muniform-simt'
code executing single-threaded.)

The change that I proposed on 2022-12-15 was to emit PTX code to calculate
'(1 << %ntid.x) - 1' as the actual bitmask to use instead of '0xffffffff'.
This works, but the PTX JIT generates SASS code to do this computation.

In turn, this change now uses PTX 'vote.all.pred' -- which even simplifies
the original code a little bit; see the following exemplary SASS 'diff' before
vs. after this change:

    [...]
              /*[...]*/                   SYNC                                                        (*"BRANCH_TARGETS .L_x_332"*)        }
      .L_x_332:
    -         /*[...]*/                   VOTE.ANY R9, PT, PT ;
    +         /*[...]*/                   VOTE.ALL P1, PT ;
    -         /*[...]*/                   ISETP.NE.U32.AND P1, PT, R9, -0x1, PT ;
    -         /*[...]*/              @!P1 BRA `(.L_x_333) ;
    +         /*[...]*/               @P1 BRA `(.L_x_333) ;
              /*[...]*/                   BPT.TRAP 0x1 ;
      .L_x_333:
    -         /*[...]*/               @P1 EXIT ;
    +         /*[...]*/              @!P1 EXIT ;
    [...]

	gcc/
	* config/nvptx/nvptx.md (nvptx_uniform_warp_check): Make fit for
	non-full-warp execution, via 'vote.all.pred'.
	gcc/testsuite/
	* gcc.target/nvptx/nvptx.exp
	(check_effective_target_default_ptx_isa_version_at_least_6_0):
	New.
	* gcc.target/nvptx/uniform-simt-2.c: Adjust.
	* gcc.target/nvptx/uniform-simt-5.c: New.
---
 gcc/config/nvptx/nvptx.md                     | 13 ++++-----
 gcc/testsuite/gcc.target/nvptx/nvptx.exp      |  5 ++++
 .../gcc.target/nvptx/uniform-simt-2.c         |  2 +-
 .../gcc.target/nvptx/uniform-simt-5.c         | 28 +++++++++++++++++++
 4 files changed, 39 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/nvptx/uniform-simt-5.c

diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index ef7e3fb00fa..7878a3b6f09 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -2316,14 +2316,11 @@ 
   {
     const char *insns[] = {
       "{",
-      "\\t"		  ".reg.b32"	    "\\t" "%%r_act;",
-      "%.\\t"		  "vote.ballot.b32" "\\t" "%%r_act,1;",
-      "\\t"		  ".reg.pred"	    "\\t" "%%r_do_abort;",
-      "\\t"		  "mov.pred"	    "\\t" "%%r_do_abort,0;",
-      "%.\\t"		  "setp.ne.b32"	    "\\t" "%%r_do_abort,%%r_act,"
-						  "0xffffffff;",
-      "@ %%r_do_abort\\t" "trap;",
-      "@ %%r_do_abort\\t" "exit;",
+      "\\t"		".reg.pred"	"\\t" "%%r_sync;",
+      "\\t"		"mov.pred"	"\\t" "%%r_sync, 1;",
+      "%.\\t"		"vote.all.pred" "\\t" "%%r_sync, 1;",
+      "@!%%r_sync\\t"	"trap;",
+      "@!%%r_sync\\t"	"exit;",
       "}",
       NULL
     };
diff --git a/gcc/testsuite/gcc.target/nvptx/nvptx.exp b/gcc/testsuite/gcc.target/nvptx/nvptx.exp
index 97aa7ae0852..3151381f51a 100644
--- a/gcc/testsuite/gcc.target/nvptx/nvptx.exp
+++ b/gcc/testsuite/gcc.target/nvptx/nvptx.exp
@@ -49,6 +49,11 @@  proc check_effective_target_default_ptx_isa_version_at_least { major minor } {
     return $res
 }
 
+# Return 1 if code by default compiles for at least PTX ISA version 6.0.
+proc check_effective_target_default_ptx_isa_version_at_least_6_0 { } {
+    return [check_effective_target_default_ptx_isa_version_at_least 6 0]
+}
+
 # Return 1 if code with PTX ISA version major.minor or higher can be run.
 proc check_effective_target_runtime_ptx_isa_version_at_least { major minor } {
     set name runtime_ptx_isa_version_${major}_${minor}
diff --git a/gcc/testsuite/gcc.target/nvptx/uniform-simt-2.c b/gcc/testsuite/gcc.target/nvptx/uniform-simt-2.c
index b1eee0d618f..1d83c49a44b 100644
--- a/gcc/testsuite/gcc.target/nvptx/uniform-simt-2.c
+++ b/gcc/testsuite/gcc.target/nvptx/uniform-simt-2.c
@@ -17,4 +17,4 @@  f (void)
 
 /* { dg-final { scan-assembler-times "@%r\[0-9\]*\tatom.global.cas" 1 } } */
 /* { dg-final { scan-assembler-times "shfl.idx.b32" 1 } } */
-/* { dg-final { scan-assembler-times "vote.ballot.b32" 1 } } */
+/* { dg-final { scan-assembler-times "vote.all.pred" 1 } } */
diff --git a/gcc/testsuite/gcc.target/nvptx/uniform-simt-5.c b/gcc/testsuite/gcc.target/nvptx/uniform-simt-5.c
new file mode 100644
index 00000000000..cd6ea82d293
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nvptx/uniform-simt-5.c
@@ -0,0 +1,28 @@ 
+/* Verify that '-muniform-simt' code may be executed single-threaded.
+
+   { dg-do run }
+   { dg-options {-save-temps -O2 -muniform-simt} } */
+
+enum memmodel
+{
+  MEMMODEL_RELAXED = 0
+};
+
+unsigned long long int v64;
+unsigned long long int *p64 = &v64;
+
+int
+main()
+{
+  /* Trigger uniform-SIMT processing.  */
+  __atomic_fetch_add (p64, v64, MEMMODEL_RELAXED);
+
+  return 0;
+}
+
+/* Per 'omp_simt_exit':
+     - 'nvptx_warpsync'
+       { dg-final { scan-assembler-times {bar\.warp\.sync\t0xffffffff;} 1 { target default_ptx_isa_version_at_least_6_0 } } }
+     - 'nvptx_uniform_warp_check'
+       { dg-final { scan-assembler-times {vote\.all\.pred\t%r_sync, 1;} 1 { target { ! default_ptx_isa_version_at_least_6_0 } } } }
+*/
-- 
2.34.1