From 1d8df3b793fc43dd23b2679d4a31b761e6ac799c Mon Sep 17 00:00:00 2001
From: Thomas Schwinge <thomas@codesourcery.com>
Date: Mon, 12 Dec 2022 22:05:37 +0100
Subject: [PATCH] nvptx: Make 'nvptx_uniform_warp_check' fit for non-full-warp
execution
For example, this allows for '-muniform-simt' code to be executed
single-threaded, which currently fails (device-side 'trap'), as the 0xffffffff
mask isn't correct if not all 32 threads of a warp are active. The same
issue/fix, I suppose but have not verified, would apply if we were to allow for
OpenACC 'vector_length' smaller than 32, for example for OpenACC 'serial'.
We use 'nvptx_uniform_warp_check' only for PTX ISA version less than 6.0.
Otherwise we're using 'nvptx_warpsync', which emits 'bar.warp.sync 0xffffffff',
which evidently appears to do the right thing. (I've tested '-muniform-simt'
code executing single-threaded.)
gcc/
* config/nvptx/nvptx.md (nvptx_uniform_warp_check): Make fit for
non-full-warp execution.
gcc/testsuite/
* gcc.target/nvptx/nvptx.exp
(check_effective_target_default_ptx_isa_version_at_least_6_0):
New.
* gcc.target/nvptx/uniform-simt-5.c: New.
libgomp/
* plugin/plugin-nvptx.c (nvptx_exec): Assert what we know about
'blockDimX'.
---
gcc/config/nvptx/nvptx.md | 16 ++++++++++-
gcc/testsuite/gcc.target/nvptx/nvptx.exp | 5 ++++
.../gcc.target/nvptx/uniform-simt-5.c | 28 +++++++++++++++++++
libgomp/plugin/plugin-nvptx.c | 3 ++
4 files changed, 51 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/nvptx/uniform-simt-5.c
@@ -2282,10 +2282,24 @@
"{",
"\\t" ".reg.b32" "\\t" "%%r_act;",
"%.\\t" "vote.ballot.b32" "\\t" "%%r_act,1;",
+ /* For '%r_exp', we essentially need 'activemask.b32', but that is "Introduced in PTX ISA version 6.2", and this code here is used only 'if (!TARGET_PTX_6_0)'. Thus, emulate it.
+ TODO Is that actually correct? Wouldn't 'activemask.b32' rather replace our 'vote.ballot.b32' given that it registers the *currently active threads*? */
+ /* Compute the "membermask" of all threads of the warp that are expected to be converged here.
+ For OpenACC, '%ntid.x' is 'vector_length', which per 'nvptx_goacc_validate_dims' always is a multiple of 32.
+ For OpenMP, '%ntid.x' always is 32.
+ Thus, this is typically 0xffffffff, but it is additionally correct for the case that not all 32 threads of the warp have been launched.
+ This assumes that lane IDs are assigned in ascending order. */
+ //TODO Can we rely on '1 << 32 == 0', and '0 - 1 == 0xffffffff'?
+ //TODO https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/
+ //TODO https://stackoverflow.com/questions/54055195/activemask-vs-ballot-sync
+ "\\t" ".reg.b32" "\\t" "%%r_exp;",
+ "%.\\t" "mov.b32" "\\t" "%%r_exp, %%ntid.x;",
+ "%.\\t" "shl.b32" "\\t" "%%r_exp, 1, %%r_exp;",
+ "%.\\t" "sub.u32" "\\t" "%%r_exp, %%r_exp, 1;",
"\\t" ".reg.pred" "\\t" "%%r_do_abort;",
"\\t" "mov.pred" "\\t" "%%r_do_abort,0;",
"%.\\t" "setp.ne.b32" "\\t" "%%r_do_abort,%%r_act,"
- "0xffffffff;",
+ "%%r_exp;",
"@ %%r_do_abort\\t" "trap;",
"@ %%r_do_abort\\t" "exit;",
"}",
@@ -49,6 +49,11 @@ proc check_effective_target_default_ptx_isa_version_at_least { major minor } {
return $res
}
+# Return 1 if code by default compiles for at least PTX ISA version 6.0.
+proc check_effective_target_default_ptx_isa_version_at_least_6_0 { } {
+ return [check_effective_target_default_ptx_isa_version_at_least 6 0]
+}
+
# Return 1 if code with PTX ISA version major.minor or higher can be run.
proc check_effective_target_runtime_ptx_isa_version_at_least { major minor } {
set name runtime_ptx_isa_version_${major}_${minor}
new file mode 100644
@@ -0,0 +1,28 @@
+/* Verify that '-muniform-simt' code may be executed single-threaded.
+
+ { dg-do run }
+ { dg-options {-save-temps -O2 -muniform-simt} } */
+
+enum memmodel
+{
+ MEMMODEL_RELAXED = 0
+};
+
+unsigned long long int v64;
+unsigned long long int *p64 = &v64;
+
+int
+main()
+{
+ /* Trigger uniform-SIMT processing. */
+ __atomic_fetch_add (p64, v64, MEMMODEL_RELAXED);
+
+ return 0;
+}
+
+/* Per 'omp_simt_exit':
+ - 'nvptx_warpsync'
+ { dg-final { scan-assembler-times {bar\.warp\.sync\t0xffffffff;} 1 { target default_ptx_isa_version_at_least_6_0 } } }
+ - 'nvptx_uniform_warp_check'
+ { dg-final { scan-assembler-times {vote\.ballot\.b32\t%r_act,1;} 1 { target { ! default_ptx_isa_version_at_least_6_0 } } } }
+*/
@@ -984,6 +984,9 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
api_info);
}
+ /* Per 'nvptx_goacc_validate_dims'. */
+ assert (dims[GOMP_DIM_VECTOR] % warp_size == 0);
+
kargs[0] = &dp;
CUDA_CALL_ASSERT (cuLaunchKernel, function,
dims[GOMP_DIM_GANG], 1, 1,
--
2.35.1