AMD GCN: Set HSA_XNACK for USM and 'xnack+' / 'xnack-'
Code compiled with explicit 'xnack-' (-mxnack=off) or 'xnack+' (-mnack=on)
only runs when the hardware mode matches. Additionally, on GPUs that support
XNACK, the GPU has to use this mode in order that unified-shared memory access
is possible.
This commit adds code to the constructor, created by mkoffload.cc, to set the
HSA_XNACK environment variable (if not set by the user) - either to 0 for
'xnack-' or to 1 for 'xnack+' and if the code contains
'omp requires self_maps' or '... unified_shared_memory'.
There is a compile-time warning when combining 'xnack-' with USM. At runtime,
when XNACK is supported but not enabled, the GPU is excluded (host fallback).
The new setenv has been added to the 'init' constructor - and the existing
setenv code for the stack size has been moved there to reduce the pointless
larger overhead due having multiple constructors.
Note: In GCC, gfx90{0,6,8} default to 'xnack-'; hence, the constructor will
set HSA_XNACK=0 by default.
Note 2: There is probably a preexisting endian issue in handling omp_requires
if the host compiler and the target compiler (in the GCC sense) have a different
endianess, which would be fixable. [If target and offload target would have
different endianess, it would be mostly unfixable, esp. with USM.]
Co-Authored-By: Andrew Stubbs <ams@baylibre.com>
gcc/ChangeLog:
* config/gcn/mkoffload.cc (process_asm): Take omp_requires argument;
extend conditions for requiring an '#include' in the generated C file.
Remove code generation for the GCN_STACK_SIZE constructor.
(process_obj): Inside the generated 'init' constructor, set
GCN_STACK_SIZE and HSA_XNACK, if applicable.
(main): Update process_asm call.
libgomp/ChangeLog:
* libgomp.texi (nvptx): Mention 'self_maps' besides
'unified_shared_memory' clause.
(AMD GCN): Likewise; state that GCC might automatically
set HSA_XNACK.
* testsuite/libgomp.c-c++-common/requires-4.c: Add dg-prune-output
for the -mxnack=off with USM warning.
* testsuite/libgomp.c-c++-common/requires-4a.c: Likewise.
* testsuite/libgomp.c-c++-common/requires-5.c: Likewise.
* testsuite/libgomp.c-c++-common/requires-7.c: Likewise.
* testsuite/libgomp.c-c++-common/target-implicit-map-4.c: Likewise.
* testsuite/libgomp.c-c++-common/target-link-3.c: Likewise.
* testsuite/libgomp.c-c++-common/target-link-4.c: Likewise.
* testsuite/libgomp.fortran/self_maps.f90: Likewise.
gcc/config/gcn/mkoffload.cc | 67 ++++++++++++++++------
libgomp/libgomp.texi | 24 +++++---
.../testsuite/libgomp.c-c++-common/requires-4.c | 5 ++
.../testsuite/libgomp.c-c++-common/requires-4a.c | 5 ++
.../testsuite/libgomp.c-c++-common/requires-5.c | 5 ++
.../testsuite/libgomp.c-c++-common/requires-7.c | 5 ++
.../libgomp.c-c++-common/target-implicit-map-4.c | 5 ++
.../testsuite/libgomp.c-c++-common/target-link-3.c | 5 ++
.../testsuite/libgomp.c-c++-common/target-link-4.c | 5 ++
libgomp/testsuite/libgomp.fortran/self_maps.f90 | 5 ++
10 files changed, 105 insertions(+), 26 deletions(-)
@@ -140,6 +140,7 @@ uint32_t elf_arch = EF_AMDGPU_MACH_AMDGCN_GFX900; // Default GPU architecture.
uint32_t elf_flags = EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4;
static int gcn_stack_size = 0; /* Zero means use default. */
+bool xnack_supported = false;
/* Delete tempfiles. */
@@ -442,7 +443,7 @@ copy_early_debug_info (const char *infile, const char *outfile)
encoded as structured data. */
static void
-process_asm (FILE *in, FILE *out, FILE *cfile)
+process_asm (FILE *in, FILE *out, FILE *cfile, uint32_t omp_requires)
{
int fn_count = 0, var_count = 0, ind_fn_count = 0;
int dims_count = 0, regcount_count = 0;
@@ -616,7 +617,14 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
struct oaccdims *dims = XOBFINISH (&dims_os, struct oaccdims *);
struct regcount *regcounts = XOBFINISH (®counts_os, struct regcount *);
- if (gcn_stack_size)
+ /* Used in the constructor, generated in process_obj. */
+ if (gcn_stack_size
+ || TEST_XNACK_OFF (elf_flags)
+ || TEST_XNACK_ON (elf_flags)
+ || (SET_XNACK_ANY (elf_flags)
+ && (omp_requires
+ & (GOMP_REQUIRES_UNIFIED_SHARED_MEMORY
+ | GOMP_REQUIRES_SELF_MAPS))))
{
fprintf (cfile, "#include <stdlib.h>\n");
fprintf (cfile, "#include <stdbool.h>\n\n");
@@ -662,18 +670,6 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
}
fprintf (cfile, "\n};\n\n");
- /* Set the stack size if the user configured a value. */
- if (gcn_stack_size)
- fprintf (cfile,
- "static __attribute__((constructor))\n"
- "void configure_stack_size (void)\n"
- "{\n"
- " const char *val = getenv (\"GCN_STACK_SIZE\");\n"
- " if (!val || val[0] == '\\0')\n"
- " setenv (\"GCN_STACK_SIZE\", \"%d\", true);\n"
- "}\n\n",
- gcn_stack_size);
-
obstack_free (&fns_os, NULL);
for (i = 0; i < dims_count; i++)
free (dims[i].name);
@@ -736,8 +732,42 @@ process_obj (const char *fname_in, FILE *cfile, uint32_t omp_requires)
fprintf (cfile, "extern const void *const __OFFLOAD_TABLE__[];\n\n");
+ /* Create the constructor to handle stack size and xnack and to call
+ GOMP_offload_register_ver. */
+
fprintf (cfile, "static __attribute__((constructor)) void init (void)\n"
- "{\n"
+ "{\n");
+
+ /* Set the stack size if the user configured a value. */
+ if (gcn_stack_size)
+ fprintf (cfile,
+ " const char *val2 = getenv (\"GCN_STACK_SIZE\");\n"
+ " if (!val2 || val2[0] == '\\0')\n"
+ " setenv (\"GCN_STACK_SIZE\", \"%d\", true);\n",
+ gcn_stack_size);
+
+ /* If the user explicitly disabled XNACK ('xnack-', -mxnack=off), set at
+ runtime HSA_XNACK=0. Otherwise, if either USM was requested or explicitly
+ compiled for XNACK ('xnack-', -mxnack=off), set HSA_XNACK=1.
+ In either case, a user-set HSA_XNACK env var is not overridden. */
+ if (TEST_XNACK_OFF (elf_flags)
+ && (omp_requires & (GOMP_REQUIRES_UNIFIED_SHARED_MEMORY
+ | GOMP_REQUIRES_SELF_MAPS)))
+ warning (0, "%<-mxnack=off%> is incompatible with OpenMP's "
+ "%<self_maps%> and %<unified_shared_memory%> clauses");
+ if (TEST_XNACK_OFF (elf_flags)
+ || TEST_XNACK_ON (elf_flags)
+ || (SET_XNACK_ANY (elf_flags)
+ && (omp_requires
+ & (GOMP_REQUIRES_UNIFIED_SHARED_MEMORY
+ | GOMP_REQUIRES_SELF_MAPS))))
+ fprintf (cfile,
+ " const char *val = getenv (\"HSA_XNACK\");\n"
+ " if (!val || val[0] == '\\0')\n"
+ " setenv (\"HSA_XNACK\", \"%d\", true);\n",
+ TEST_XNACK_OFF (elf_flags) ? 0 : 1);
+
+ fprintf (cfile,
" GOMP_offload_register_ver (%#x, __OFFLOAD_TABLE__,"
" %d/*GCN*/, &gcn_data);\n"
"};\n",
@@ -1016,7 +1046,9 @@ main (int argc, char **argv)
gcc_unreachable ();
}
- /* Set the default ELF flags for XNACK. */
+ /* Set the default ELF flags for XNACK; additionally set
+ xnack_supported for ANY but not for those defaulting
+ to NO. */
switch (elf_arch)
{
#define GCN_DEVICE(name, NAME, ELF, ISA, XNACK, SRAM, ...) \
@@ -1024,6 +1056,7 @@ main (int argc, char **argv)
#define HSACO_ATTR_UNSUPPORTED SET_XNACK_UNSET (elf_flags)
#define HSACO_ATTR_OFF SET_XNACK_OFF (elf_flags)
#define HSACO_ATTR_ANY \
+ xnack_supported = true; \
if (TEST_XNACK_UNSET (elf_flags)) SET_XNACK_ANY (elf_flags)
#include "gcn-devices.def"
#undef HSACO_ATTR_UNSUPPORTED
@@ -1250,7 +1283,7 @@ main (int argc, char **argv)
if (!out)
fatal_error (input_location, "cannot open %qs", gcn_s2_name);
- process_asm (in, out, cfile);
+ process_asm (in, out, cfile, omp_requires);
fclose (in);
fclose (out);
@@ -6576,14 +6576,20 @@ The implementation remark:
@code{device(ancestor:1)}) are processed serially per @code{target} region
such that the next reverse offload region is only executed after the previous
one returned.
-@item OpenMP code that has a @code{requires} directive with
- @code{unified_shared_memory} is only supported if all AMD GPUs have the
- @code{HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT} property; for
- discrete GPUs, this may require setting the @code{HSA_XNACK} environment
- variable to @samp{1}; for systems with both an APU and a discrete GPU that
- does not support XNACK, consider using @code{ROCR_VISIBLE_DEVICES} to
- enable only the APU. If not supported, all AMD GPU devices are removed
+@item OpenMP code that has a @code{requires} directive with a @code{self_maps} or
+ @code{unified_shared_memory} clause is only supported if all AMD GPUs have
+ the @code{HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT} property. With
+ some discrete GPUs, this requires that the @code{HSA_XNACK} environment
+ variable is set to @samp{1}. For systems with both an APU and a discrete
+ GPU that does not support XNACK, consider using @code{ROCR_VISIBLE_DEVICES}
+ to enable only the APU. If not supported, all AMD GPU devices are removed
from the list of available devices (``host fallback'').
+@item If unset and XNACK is supported by the architecture, GCC automatically
+ sets the @code{HSA_XNACK} environment variable as follows: It is set to
+ @samp{0} when compiled with @code{-mxnack=off}; otherwise, it is set to
+ @samp{1} if either compiled with @code{-mxnack=on} or if the OpenMP
+ @code{self_maps} or @code{unified_shared_memory} requirement clause has
+ been specified. Else, it is not set.
@item The available stack size can be changed using the @code{GCN_STACK_SIZE}
environment variable; the default is 32 kiB per thread.
@item Low-latency memory (@code{omp_low_lat_mem_space}) is supported when the
@@ -6663,8 +6669,8 @@ The implementation remark:
Per device, reverse offload regions are processed serially such that
the next reverse offload region is only executed after the previous
one returned.
-@item OpenMP code that has a @code{requires} directive with
- @code{unified_shared_memory} runs on nvptx devices if and only if
+@item OpenMP code that has a @code{requires} directive with a @code{self_maps} or
+ @code{unified_shared_memory} clause runs on nvptx devices if and only if
all of those support the @code{pageableMemoryAccess} property;@footnote{
@uref{https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements}}
otherwise, all nvptx device are removed from the list of available
@@ -34,3 +34,8 @@ main (void)
foo ();
return 0;
}
+
+/* AMD GCN's gfx900/gfx906/gfx908 support in principle the XNACK flag, but the support is lurkwarm;
+ hence, GCC defaults to OFF, causing the warning
+ gcn mkoffload: warning: ‘-mxnack=off’ is incompatible with OpenMP's ‘self_maps’ and ‘unified_shared_memory’ clauses */
+/* { dg-prune-output "mxnack=off" } */
@@ -38,3 +38,8 @@ main (void)
__builtin_free (a);
return 0;
}
+
+/* AMD GCN's gfx900/gfx906/gfx908 support in principle the XNACK flag, but the support is lurkwarm;
+ hence, GCC defaults to OFF, causing the warning
+ gcn mkoffload: warning: ‘-mxnack=off’ is incompatible with OpenMP's ‘self_maps’ and ‘unified_shared_memory’ clauses */
+/* { dg-prune-output "mxnack=off" } */
@@ -28,3 +28,8 @@ main (void)
foo ();
return 0;
}
+
+/* AMD GCN's gfx900/gfx906/gfx908 support in principle the XNACK flag, but the support is lurkwarm;
+ hence, GCC defaults to OFF, causing the warning
+ gcn mkoffload: warning: ‘-mxnack=off’ is incompatible with OpenMP's ‘self_maps’ and ‘unified_shared_memory’ clauses */
+/* { dg-prune-output "mxnack=off" } */
@@ -30,3 +30,8 @@ main (void)
..., but we may still verify that the rest of the diagnostic is correct:
{ dg-note {' has 'unified_address'} {} { target *-*-* } 0 } */
/* { dg-excess-errors "Ignore messages like: errors during merging of translation units|mkoffload returned 1 exit status" } */
+
+/* AMD GCN's gfx900/gfx906/gfx908 support in principle the XNACK flag, but the support is lurkwarm;
+ hence, GCC defaults to OFF, causing the warning
+ gcn mkoffload: warning: ‘-mxnack=off’ is incompatible with OpenMP's ‘self_maps’ and ‘unified_shared_memory’ clauses */
+/* { dg-prune-output "mxnack=off" } */
@@ -157,3 +157,8 @@ main()
test_device (i);
return 0;
}
+
+/* AMD GCN's gfx900/gfx906/gfx908 support in principle the XNACK flag, but the support is lurkwarm;
+ hence, GCC defaults to OFF, causing the warning
+ gcn mkoffload: warning: ‘-mxnack=off’ is incompatible with OpenMP's ‘self_maps’ and ‘unified_shared_memory’ clauses */
+/* { dg-prune-output "mxnack=off" } */
@@ -50,3 +50,8 @@ main ()
q = 42;
}
}
+
+/* AMD GCN's gfx900/gfx906/gfx908 support in principle the XNACK flag, but the support is lurkwarm;
+ hence, GCC defaults to OFF, causing the warning
+ gcn mkoffload: warning: ‘-mxnack=off’ is incompatible with OpenMP's ‘self_maps’ and ‘unified_shared_memory’ clauses */
+/* { dg-prune-output "mxnack=off" } */
@@ -50,3 +50,8 @@ main ()
q = 42;
}
}
+
+/* AMD GCN's gfx900/gfx906/gfx908 support in principle the XNACK flag, but the support is lurkwarm;
+ hence, GCC defaults to OFF, causing the warning
+ gcn mkoffload: warning: ‘-mxnack=off’ is incompatible with OpenMP's ‘self_maps’ and ‘unified_shared_memory’ clauses */
+/* { dg-prune-output "mxnack=off" } */
@@ -47,3 +47,8 @@ do i = 0, omp_get_num_devices()
!$omp end target
end do
end
+
+! AMD GCN's gfx900/gfx906/gfx908 support in principle the XNACK flag, but the support is lurkwarm;
+! hence, GCC defaults to OFF, causing the warning
+! gcn mkoffload: warning: ‘-mxnack=off’ is incompatible with OpenMP's ‘self_maps’ and ‘unified_shared_memory’ clauses
+! { dg-prune-output "mxnack=off" }