@@ -1,3 +1,10 @@
+2019-07-31 Julian Brown <julian@codesourcery.com>
+ Andrew Stubbs <ams@codesourcery.com>
+
+ * config.gcc (amdgcn-*-*): Add default option for gfx906.
+ * config/gcn/mkoffload.c: New.
+ * config/gcn/offload.h: New.
+
2019-06-25 Kwok Cheung Yeung <kcy@codesourcery.com>
Andrew Stubbs <ams@codesourcery.com>
@@ -4126,7 +4126,7 @@ case "${target}" in
for which in arch tune; do
eval "val=\$with_$which"
case ${val} in
- "" | carrizo | fiji | gfx900 )
+ "" | carrizo | fiji | gfx900 | gfx906 )
# OK
;;
*)
new file mode 100644
@@ -0,0 +1,702 @@
+/* Offload image generation tool for AMD GCN.
+
+ Copyright (C) 2014-2019 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3, or (at your
+ option) any later version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+ License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+/* Munges GCN assembly into a C source file defining the GCN code as a
+ string.
+
+ This is not a complete assembler. We presume the source is well
+ formed from the compiler and can die horribly if it is not. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "obstack.h"
+#include "diagnostic.h"
+#include "intl.h"
+#include <libgen.h>
+#include "collect-utils.h"
+#include "gomp-constants.h"
+
+const char tool_name[] = "gcn mkoffload";
+
+#define COMMENT_PREFIX "#"
+
+/* Node of a singly linked list of names. The list heads and tail
+ pointers (func_ids/funcs_tail, var_ids/vars_tail) are declared right
+ after this type. */
+struct id_map
+{
+ /* Next node in the list. */
+ id_map *next;
+ /* Name string carried by this node. */
+ char *gcn_name;
+};
+
+static id_map *func_ids, **funcs_tail = &func_ids;
+static id_map *var_ids, **vars_tail = &var_ids;
+
+/* Files to unlink. */
+static const char *gcn_s1_name;
+static const char *gcn_s2_name;
+static const char *gcn_o_name;
+static const char *gcn_cfile_name;
+
+enum offload_abi offload_abi = OFFLOAD_ABI_UNSET;
+
+/* Remove every temporary file this tool has created so far.  Names that
+   have not been assigned yet are skipped.  FROM_SIGNAL is not used.  */
+
+void
+tool_cleanup (bool from_signal ATTRIBUTE_UNUSED)
+{
+  /* Same order as the individual checks this table replaces.  */
+  const char *const temps[] =
+    { gcn_cfile_name, gcn_s1_name, gcn_s2_name, gcn_o_name };
+
+  for (size_t k = 0; k < sizeof (temps) / sizeof (temps[0]); k++)
+    if (temps[k] != NULL)
+      maybe_unlink (temps[k]);
+}
+
+/* Exit handler (main registers it with atexit): delete the temporary
+ files through tool_cleanup, passing false for FROM_SIGNAL. */
+static void
+mkoffload_cleanup (void)
+{
+ tool_cleanup (false);
+}
+
+/* Remove FILE, unless temporaries are being kept (save_temps), in which
+   case FILE is left in place and, in verbose mode, reported on stderr.  */
+
+void
+maybe_unlink (const char *file)
+{
+  if (save_temps)
+    {
+      if (verbose)
+        fprintf (stderr, "[Leaving %s]\n", file);
+      return;
+    }
+
+  /* A file that does not exist (ENOENT) is not treated as an error.  */
+  if (unlink_if_ordinary (file) != 0 && errno != ENOENT)
+    fatal_error (input_location, "deleting file %s: %m", file);
+}
+
+/* Add or change the value of an environment variable, outputting the
+ change to standard error if in verbose mode. STRING is handed to
+ putenv unchanged, with its const qualifier cast away. */
+
+static void
+xputenv (const char *string)
+{
+ if (verbose)
+ fprintf (stderr, "%s\n", string);
+ putenv (CONST_CAST (char *, string));
+}
+
+/* Read all of STREAM into a freshly allocated buffer and return it.
+   The buffer is NUL terminated (but remember, there could also be a NUL
+   in the file itself), and the number of bytes read is stored in *PLEN.  */
+
+static const char *
+read_file (FILE *stream, size_t *plen)
+{
+  size_t capacity = 16384;
+  size_t used = 0;
+
+  /* When the stream is seekable, size the buffer from the file length.  */
+  if (fseek (stream, 0, SEEK_END) == 0)
+    {
+      long end = ftell (stream);
+      if (end >= 0)
+        capacity = end + 100;
+      fseek (stream, 0, SEEK_SET);
+    }
+
+  char *data = XNEWVEC (char, capacity);
+  size_t got;
+
+  /* Always leave one byte free for the terminator; grow when full.  */
+  while ((got = fread (data + used, 1, capacity - used - 1, stream)) != 0)
+    {
+      used += got;
+      if (used + 1 == capacity)
+        {
+          capacity *= 2;
+          data = XRESIZEVEC (char, data, capacity);
+        }
+    }
+
+  data[used] = 0;
+  *plen = used;
+  return data;
+}
+
+/* Parse STR, saving found tokens into PVALUES and return their number.
+   Tokens are assumed to be delimited by ':'.
+
+   Each token, and the array holding them, is allocated with xmalloc; the
+   caller releases them (see free_array_of_ptrs).  An empty STR yields one
+   empty token.  A NULL STR, as returned by getenv for an unset variable,
+   yields no tokens: *PVALUES is set to NULL and 0 is returned.  */
+
+static unsigned
+parse_env_var (const char *str, char ***pvalues)
+{
+  if (str == NULL)
+    {
+      *pvalues = NULL;
+      return 0;
+    }
+
+  /* One token, plus one more for every delimiter.  */
+  unsigned num = 1;
+  for (const char *p = strchr (str, ':'); p != NULL; p = strchr (p + 1, ':'))
+    num++;
+
+  char **values = (char **) xmalloc (num * sizeof (char *));
+  const char *curval = str;
+  for (unsigned i = 0; i < num; i++)
+    {
+      const char *nextval = strchr (curval, ':');
+      if (nextval == NULL)
+        nextval = curval + strlen (curval);
+
+      size_t l = nextval - curval;
+      values[i] = (char *) xmalloc (l + 1);
+      memcpy (values[i], curval, l);
+      values[i][l] = 0;
+
+      /* Step over a real delimiter only; never move past the terminating
+         NUL, which would make the next strchr read out of bounds.  */
+      if (*nextval == ':')
+        curval = nextval + 1;
+    }
+  *pvalues = values;
+  return num;
+}
+
+/* Release the first N elements of PTR and then PTR itself.  Nothing
+   happens when PTR is NULL.  Freeing of elements stops at the first NULL
+   element, so anything stored after it is not released.  */
+
+static void
+free_array_of_ptrs (void **ptr, unsigned n)
+{
+  if (ptr == NULL)
+    return;
+
+  for (unsigned k = 0; k < n && ptr[k] != NULL; k++)
+    free (ptr[k]);
+
+  free (ptr);
+}
+
+/* Like access (NAME, MODE), except that a directory is never reported
+   as executable.  Returns 0 when access is permitted and -1 otherwise.  */
+
+static int
+access_check (const char *name, int mode)
+{
+  struct stat info;
+
+  /* For X_OK, reject directories, and anything that cannot be stat'ed,
+     before asking access ().  */
+  if (mode == X_OK && (stat (name, &info) < 0 || S_ISDIR (info.st_mode)))
+    return -1;
+
+  return access (name, mode);
+}
+
+/* Parse an input assembler file, extract the offload tables etc.,
+   and output (1) the assembler code, minus the tables (which can contain
+   problematic relocations), and (2) a C file with the offload tables
+   encoded as structured data.
+
+   IN is the assembly to read, OUT receives the filtered assembly, and
+   CFILE receives the generated "vars" and "gcn_kernels" tables.  The
+   input is scanned line by line with a small state machine that tracks
+   whether the current section is ordinary code, .gnu.offload_vars or
+   .gnu.offload_funcs; only lines seen in the code state are copied to
+   OUT.  Aborts if a variable entry is not followed by its size entry.  */
+
+static void
+process_asm (FILE *in, FILE *out, FILE *cfile)
+{
+  int fn_count = 0, var_count = 0, dims_count = 0;
+  struct obstack fns_os, vars_os, varsizes_os, dims_os;
+  obstack_init (&fns_os);
+  obstack_init (&vars_os);
+  obstack_init (&varsizes_os);
+  obstack_init (&dims_os);
+
+  struct oaccdims
+  {
+    int d[3];
+    char *name;
+  } dim;
+
+  /* Always add _init_array and _fini_array as kernels.  */
+  obstack_ptr_grow (&fns_os, xstrdup ("_init_array"));
+  obstack_ptr_grow (&fns_os, xstrdup ("_fini_array"));
+  fn_count += 2;
+
+  char buf[1000];
+  enum { IN_CODE, IN_VARS, IN_FUNCS } state = IN_CODE;
+  while (fgets (buf, sizeof (buf), in))
+    {
+      switch (state)
+        {
+        case IN_CODE:
+          {
+            if (sscanf (buf, " ;; OPENACC-DIMS: %d, %d, %d : %ms\n",
+                        &dim.d[0], &dim.d[1], &dim.d[2], &dim.name) == 4)
+              {
+                obstack_grow (&dims_os, &dim, sizeof (dim));
+                dims_count++;
+              }
+            break;
+          }
+        case IN_VARS:
+          {
+            char *varname;
+            unsigned varsize;
+            /* Compare against 1: sscanf returns EOF, which is non-zero,
+               for a blank line, and VARNAME would then be used
+               uninitialized.  */
+            if (sscanf (buf, " .8byte %ms\n", &varname) == 1)
+              {
+                obstack_ptr_grow (&vars_os, varname);
+                /* The size entry must follow on the next line.  */
+                if (!fgets (buf, sizeof (buf), in)
+                    || sscanf (buf, " .8byte %u\n", &varsize) != 1)
+                  abort ();
+                obstack_int_grow (&varsizes_os, varsize);
+                var_count++;
+
+                /* The HSA Runtime cannot locate the symbol if it is not
+                   exported from the kernel.  */
+                fprintf (out, "\t.global %s\n", varname);
+              }
+            break;
+          }
+        case IN_FUNCS:
+          {
+            char *funcname;
+            /* As above, EOF must not be mistaken for a match.  */
+            if (sscanf (buf, "\t.8byte\t%ms\n", &funcname) == 1)
+              {
+                obstack_ptr_grow (&fns_os, funcname);
+                fn_count++;
+                continue;
+              }
+            break;
+          }
+        }
+
+      /* Track section changes; the trailing %c requires at least one
+         character after the directive.  */
+      char dummy;
+      if (sscanf (buf, " .section .gnu.offload_vars%c", &dummy) > 0)
+        state = IN_VARS;
+      else if (sscanf (buf, " .section .gnu.offload_funcs%c", &dummy) > 0)
+        state = IN_FUNCS;
+      else if (sscanf (buf, " .section %c", &dummy) > 0
+               || sscanf (buf, " .text%c", &dummy) > 0
+               || sscanf (buf, " .bss%c", &dummy) > 0
+               || sscanf (buf, " .data%c", &dummy) > 0
+               || sscanf (buf, " .ident %c", &dummy) > 0)
+        state = IN_CODE;
+
+      if (state == IN_CODE)
+        fputs (buf, out);
+    }
+
+  char **fns = XOBFINISH (&fns_os, char **);
+  struct oaccdims *dims = XOBFINISH (&dims_os, struct oaccdims *);
+
+  fprintf (cfile, "#include <stdlib.h>\n");
+  fprintf (cfile, "#include <stdbool.h>\n\n");
+
+  /* Dump out the variable table.  */
+  char **vars = XOBFINISH (&vars_os, char **);
+  unsigned *varsizes = XOBFINISH (&varsizes_os, unsigned *);
+  fprintf (cfile,
+           "static const struct global_var_info {\n"
+           " const char *name;\n"
+           " void *address;\n"
+           "} vars[] = {\n");
+  int i;
+  for (i = 0; i < var_count; ++i)
+    {
+      const char *sep = i < var_count - 1 ? "," : " ";
+      fprintf (cfile, " { \"%s\", NULL }%s /* size: %u */\n", vars[i], sep,
+               varsizes[i]);
+
+      /* The name was allocated by sscanf's %ms.  */
+      free (vars[i]);
+    }
+  fprintf (cfile, "};\n\n");
+
+  obstack_free (&vars_os, NULL);
+  obstack_free (&varsizes_os, NULL);
+
+  /* Dump out function idents.  */
+  fprintf (cfile, "static const struct hsa_kernel_description {\n"
+           " const char *name;\n"
+           " unsigned omp_data_size;\n"
+           " bool gridified_kernel_p;\n"
+           " unsigned kernel_dependencies_count;\n"
+           " const char **kernel_dependencies;\n"
+           " int oacc_dims[3];\n"
+           "} gcn_kernels[] = {\n ");
+  dim.d[0] = dim.d[1] = dim.d[2] = 0;
+  const char *comma;
+  for (comma = "", i = 0; i < fn_count; comma = ",\n ", i++)
+    {
+      /* Find if we recorded dimensions for this function.  */
+      int *d = dim.d; /* Previously zeroed.  */
+      for (int j = 0; j < dims_count; j++)
+        if (strcmp (fns[i], dims[j].name) == 0)
+          {
+            d = dims[j].d;
+            break;
+          }
+
+      fprintf (cfile, "%s{\"%s\", 0, 0, 0, NULL, {%d, %d, %d}}", comma,
+               fns[i], d[0], d[1], d[2]);
+
+      free (fns[i]);
+    }
+  fprintf (cfile, "\n};\n\n");
+
+  obstack_free (&fns_os, NULL);
+  for (i = 0; i < dims_count; i++)
+    free (dims[i].name);
+  obstack_free (&dims_os, NULL);
+}
+
+/* Embed an object file into a C source file.
+
+   IN is the object file to read.  CFILE receives: the raw bytes as the
+   array gcn_code, a gcn_image descriptor, a target_data record tying the
+   image to the gcn_kernels and vars tables (which process_asm writes to
+   the same file), and a constructor/destructor pair calling
+   GOMP_offload_register_ver / GOMP_offload_unregister_ver.  */
+
+static void
+process_obj (FILE *in, FILE *cfile)
+{
+  size_t len = 0;
+  const char *input = read_file (in, &len);
+
+  /* Dump out an array containing the binary.
+     FIXME: do this with objcopy.  */
+  fprintf (cfile, "static unsigned char gcn_code[] = {");
+  for (size_t i = 0; i < len; i += 17)
+    {
+      /* Seventeen bytes per output line.  */
+      fprintf (cfile, "\n\t");
+      for (size_t j = i; j < i + 17 && j < len; j++)
+        fprintf (cfile, "%3u,", (unsigned char) input[j]);
+    }
+  fprintf (cfile, "\n};\n\n");
+
+  /* The bytes have all been written; release read_file's buffer.  */
+  free (CONST_CAST (char *, input));
+
+  fprintf (cfile,
+           "static const struct gcn_image {\n"
+           " char magic[4];\n"
+           " size_t size;\n"
+           " void *image;\n"
+           "} gcn_image = {\n"
+           " \"GCN\",\n"
+           " %zu,\n"
+           " gcn_code\n"
+           "};\n\n",
+           len);
+
+  fprintf (cfile,
+           "static const struct gcn_image_desc {\n"
+           " const struct gcn_image *gcn_image;\n"
+           " unsigned kernel_count;\n"
+           " const struct hsa_kernel_description *kernel_infos;\n"
+           " unsigned global_variable_count;\n"
+           " const struct global_var_info *global_variables;\n"
+           "} target_data = {\n"
+           " &gcn_image,\n"
+           " sizeof (gcn_kernels) / sizeof (gcn_kernels[0]),\n"
+           " gcn_kernels,\n"
+           " sizeof (vars) / sizeof (vars[0]),\n"
+           " vars\n"
+           "};\n\n");
+
+  fprintf (cfile,
+           "#ifdef __cplusplus\n"
+           "extern \"C\" {\n"
+           "#endif\n"
+           "extern void GOMP_offload_register_ver"
+           " (unsigned, const void *, int, const void *);\n"
+           "extern void GOMP_offload_unregister_ver"
+           " (unsigned, const void *, int, const void *);\n"
+           "#ifdef __cplusplus\n"
+           "}\n"
+           "#endif\n\n");
+
+  fprintf (cfile, "extern const void *const __OFFLOAD_TABLE__[];\n\n");
+
+  fprintf (cfile, "static __attribute__((constructor)) void init (void)\n"
+           "{\n"
+           " GOMP_offload_register_ver (%#x, __OFFLOAD_TABLE__,"
+           " %d/*GCN*/, &target_data);\n"
+           "};\n",
+           GOMP_VERSION_PACK (GOMP_VERSION, GOMP_VERSION_GCN),
+           GOMP_DEVICE_GCN);
+
+  fprintf (cfile, "static __attribute__((destructor)) void fini (void)\n"
+           "{\n"
+           " GOMP_offload_unregister_ver (%#x, __OFFLOAD_TABLE__,"
+           " %d/*GCN*/, &target_data);\n"
+           "};\n",
+           GOMP_VERSION_PACK (GOMP_VERSION, GOMP_VERSION_GCN),
+           GOMP_DEVICE_GCN);
+}
+
+/* Run COMPILER on the C file INFILE to produce the object OUTFILE.
+   The command line carries -save-temps and -v when those modes are on,
+   then -m64 or -m32 according to offload_abi.  COLLECT_GCC_OPTIONS must
+   be present in the environment; it is a fatal error otherwise.  */
+
+static void
+compile_native (const char *infile, const char *outfile, const char *compiler)
+{
+  if (getenv ("COLLECT_GCC_OPTIONS") == NULL)
+    fatal_error (input_location,
+                 "environment variable COLLECT_GCC_OPTIONS must be set");
+
+  struct obstack args;
+  obstack_init (&args);
+  obstack_ptr_grow (&args, compiler);
+  if (save_temps)
+    obstack_ptr_grow (&args, "-save-temps");
+  if (verbose)
+    obstack_ptr_grow (&args, "-v");
+
+  /* Any ABI other than the two handled here is a bug in the caller.  */
+  if (offload_abi == OFFLOAD_ABI_LP64)
+    obstack_ptr_grow (&args, "-m64");
+  else if (offload_abi == OFFLOAD_ABI_ILP32)
+    obstack_ptr_grow (&args, "-m32");
+  else
+    gcc_unreachable ();
+
+  obstack_ptr_grow (&args, infile);
+  obstack_ptr_grow (&args, "-c");
+  obstack_ptr_grow (&args, "-o");
+  obstack_ptr_grow (&args, outfile);
+  obstack_ptr_grow (&args, NULL);
+
+  const char **cmd = XOBFINISH (&args, const char **);
+  fork_execute (cmd[0], CONST_CAST (char **, cmd), true);
+  obstack_free (&args, NULL);
+}
+
+/* Entry point. Steps, in order:
+ 1. locate the offload compiler driver (GCC_INSTALL_NAME), next to
+ COLLECT_GCC or in one of the COMPILER_PATH directories;
+ 2. scan the arguments for the ABI, -fopenmp/-fopenacc, -save-temps, -v;
+ 3. run the driver with -S to produce GCN assembly in a temp file;
+ 4. filter that assembly with process_asm, which also starts the
+ generated C file;
+ 5. run the driver again to assemble/link the filtered assembly;
+ 6. append the resulting object to the C file with process_obj;
+ 7. compile the C file with COLLECT_GCC into the file named by -o.
+ Temporary files are removed by the atexit handler. Returns 0; errors
+ are reported through fatal_error. */
+int
+main (int argc, char **argv)
+{
+ FILE *in = stdin;
+ FILE *out = stdout;
+ FILE *cfile = stdout;
+ const char *outname = 0, *offloadsrc = 0;
+
+ progname = "mkoffload";
+ diagnostic_initialize (global_dc, 0);
+
+ if (atexit (mkoffload_cleanup) != 0)
+ fatal_error (input_location, "atexit failed");
+
+ char *collect_gcc = getenv ("COLLECT_GCC");
+ if (collect_gcc == NULL)
+ fatal_error (input_location, "COLLECT_GCC must be set.");
+ /* dirname and basename each get their own copy of the string. */
+ const char *gcc_path = dirname (ASTRDUP (collect_gcc));
+ const char *gcc_exec = basename (ASTRDUP (collect_gcc));
+
+ /* Room for "<gcc_path>/<GCC_INSTALL_NAME>" plus the terminator. */
+ size_t len = (strlen (gcc_path) + 1 + strlen (GCC_INSTALL_NAME) + 1);
+ char *driver = XALLOCAVEC (char, len);
+
+ if (strcmp (gcc_exec, collect_gcc) == 0)
+ /* collect_gcc has no path, so it was found in PATH. Make sure we also
+ find accel-gcc in PATH. */
+ gcc_path = NULL;
+
+ int driver_used = 0;
+ if (gcc_path != NULL)
+ driver_used = sprintf (driver, "%s/", gcc_path);
+ sprintf (driver + driver_used, "%s", GCC_INSTALL_NAME);
+
+ bool found = false;
+ if (gcc_path == NULL)
+ found = true;
+ else if (access_check (driver, X_OK) == 0)
+ found = true;
+ else
+ {
+ /* Don't use alloca pointer with XRESIZEVEC. */
+ driver = NULL;
+ /* Look in all COMPILER_PATHs for GCC_INSTALL_NAME. */
+ char **paths = NULL;
+ unsigned n_paths;
+ /* NOTE(review): getenv may return NULL here when COMPILER_PATH is
+ unset, and that value is passed straight to parse_env_var. */
+ n_paths = parse_env_var (getenv ("COMPILER_PATH"), &paths);
+ for (unsigned i = 0; i < n_paths; i++)
+ {
+ len = strlen (paths[i]) + 1 + strlen (GCC_INSTALL_NAME) + 1;
+ driver = XRESIZEVEC (char, driver, len);
+ sprintf (driver, "%s/%s", paths[i], GCC_INSTALL_NAME);
+ if (access_check (driver, X_OK) == 0)
+ {
+ found = true;
+ break;
+ }
+ }
+ free_array_of_ptrs ((void **) paths, n_paths);
+ }
+
+ if (!found)
+ fatal_error (input_location,
+ "offload compiler %s not found", GCC_INSTALL_NAME);
+
+ /* We may be called with all the arguments stored in some file and
+ passed with @file. Expand them into argv before processing. */
+ expandargv (&argc, &argv);
+
+ /* Scan the argument vector. */
+ bool fopenmp = false;
+ bool fopenacc = false;
+ for (int i = 1; i < argc; i++)
+ {
+#define STR "-foffload-abi="
+ if (strncmp (argv[i], STR, strlen (STR)) == 0)
+ {
+ if (strcmp (argv[i] + strlen (STR), "lp64") == 0)
+ offload_abi = OFFLOAD_ABI_LP64;
+ else if (strcmp (argv[i] + strlen (STR), "ilp32") == 0)
+ offload_abi = OFFLOAD_ABI_ILP32;
+ else
+ fatal_error (input_location,
+ "unrecognizable argument of option " STR);
+ }
+#undef STR
+ else if (strcmp (argv[i], "-fopenmp") == 0)
+ fopenmp = true;
+ else if (strcmp (argv[i], "-fopenacc") == 0)
+ fopenacc = true;
+ else if (strcmp (argv[i], "-save-temps") == 0)
+ save_temps = true;
+ else if (strcmp (argv[i], "-v") == 0)
+ verbose = true;
+ }
+ /* Exactly one of the two options must have been given: both set and
+ neither set are rejected alike. */
+ if (!(fopenacc ^ fopenmp))
+ fatal_error (input_location, "either -fopenacc or -fopenmp must be set");
+
+ const char *abi;
+ switch (offload_abi)
+ {
+ case OFFLOAD_ABI_LP64:
+ abi = "-m64";
+ break;
+ case OFFLOAD_ABI_ILP32:
+ abi = "-m32";
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ /* These names are the ones tool_cleanup deletes. */
+ gcn_s1_name = make_temp_file (".mkoffload.1.s");
+ gcn_s2_name = make_temp_file (".mkoffload.2.s");
+ gcn_o_name = make_temp_file (".mkoffload.hsaco");
+ gcn_cfile_name = make_temp_file (".c");
+
+ /* Build arguments for compiler pass. */
+ struct obstack cc_argv_obstack;
+ obstack_init (&cc_argv_obstack);
+ obstack_ptr_grow (&cc_argv_obstack, driver);
+ obstack_ptr_grow (&cc_argv_obstack, "-S");
+
+ if (save_temps)
+ obstack_ptr_grow (&cc_argv_obstack, "-save-temps");
+ if (verbose)
+ obstack_ptr_grow (&cc_argv_obstack, "-v");
+ obstack_ptr_grow (&cc_argv_obstack, abi);
+ obstack_ptr_grow (&cc_argv_obstack, "-xlto");
+ if (fopenmp)
+ obstack_ptr_grow (&cc_argv_obstack, "-mgomp");
+
+ /* Forward every argument except "-o NAME", whose NAME becomes the final
+ output; the last argument not starting with '-' is remembered as the
+ offload source. */
+ for (int ix = 1; ix != argc; ix++)
+ {
+ if (!strcmp (argv[ix], "-o") && ix + 1 != argc)
+ outname = argv[++ix];
+ else
+ {
+ obstack_ptr_grow (&cc_argv_obstack, argv[ix]);
+
+ if (argv[ix][0] != '-')
+ offloadsrc = argv[ix];
+ }
+ }
+
+ obstack_ptr_grow (&cc_argv_obstack, "-o");
+ obstack_ptr_grow (&cc_argv_obstack, gcn_s1_name);
+ obstack_ptr_grow (&cc_argv_obstack,
+ concat ("-mlocal-symbol-id=", offloadsrc, NULL));
+ obstack_ptr_grow (&cc_argv_obstack, NULL);
+ const char **cc_argv = XOBFINISH (&cc_argv_obstack, const char **);
+
+ /* Build arguments for assemble/link pass. */
+ struct obstack ld_argv_obstack;
+ obstack_init (&ld_argv_obstack);
+ obstack_ptr_grow (&ld_argv_obstack, driver);
+ obstack_ptr_grow (&ld_argv_obstack, gcn_s2_name);
+ obstack_ptr_grow (&ld_argv_obstack, "-lgomp");
+
+ /* Only library, linker pass-through and -march options are forwarded
+ to this pass. */
+ for (int i = 1; i < argc; i++)
+ if (strncmp (argv[i], "-l", 2) == 0
+ || strncmp (argv[i], "-Wl", 3) == 0
+ || strncmp (argv[i], "-march", 6) == 0)
+ obstack_ptr_grow (&ld_argv_obstack, argv[i]);
+
+ obstack_ptr_grow (&ld_argv_obstack, "-o");
+ obstack_ptr_grow (&ld_argv_obstack, gcn_o_name);
+ obstack_ptr_grow (&ld_argv_obstack, NULL);
+ const char **ld_argv = XOBFINISH (&ld_argv_obstack, const char **);
+
+ /* Clean up unhelpful environment variables. The old values are kept
+ so they can be put back before the host compile below. */
+ char *execpath = getenv ("GCC_EXEC_PREFIX");
+ char *cpath = getenv ("COMPILER_PATH");
+ char *lpath = getenv ("LIBRARY_PATH");
+ unsetenv ("GCC_EXEC_PREFIX");
+ unsetenv ("COMPILER_PATH");
+ unsetenv ("LIBRARY_PATH");
+
+ /* Run the compiler pass. */
+ fork_execute (cc_argv[0], CONST_CAST (char **, cc_argv), true);
+ obstack_free (&cc_argv_obstack, NULL);
+
+ in = fopen (gcn_s1_name, "r");
+ if (!in)
+ fatal_error (input_location, "cannot open intermediate gcn asm file");
+
+ out = fopen (gcn_s2_name, "w");
+ if (!out)
+ fatal_error (input_location, "cannot open '%s'", gcn_s2_name);
+
+ cfile = fopen (gcn_cfile_name, "w");
+ if (!cfile)
+ fatal_error (input_location, "cannot open '%s'", gcn_cfile_name);
+
+ process_asm (in, out, cfile);
+
+ fclose (in);
+ fclose (out);
+
+ /* Run the assemble/link pass. */
+ fork_execute (ld_argv[0], CONST_CAST (char **, ld_argv), true);
+ obstack_free (&ld_argv_obstack, NULL);
+
+ in = fopen (gcn_o_name, "r");
+ if (!in)
+ fatal_error (input_location, "cannot open intermediate gcn obj file");
+
+ /* CFILE is still open from process_asm; the object is appended. */
+ process_obj (in, cfile);
+
+ fclose (in);
+ fclose (cfile);
+
+ /* Put the saved environment back for the host compiler. */
+ xputenv (concat ("GCC_EXEC_PREFIX=", execpath, NULL));
+ xputenv (concat ("COMPILER_PATH=", cpath, NULL));
+ xputenv (concat ("LIBRARY_PATH=", lpath, NULL));
+
+ compile_native (gcn_cfile_name, outname, collect_gcc);
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,35 @@
+/* Support for AMD GCN offloading.
+
+ Copyright (C) 2014-2019 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_GCN_OFFLOAD_H
+#define GCC_GCN_OFFLOAD_H
+
+/* Support for OpenACC acc_on_device. */
+
+#include "gomp-constants.h"
+
+/* Device code for this accelerator compiler: the GCN value defined in
+ gomp-constants.h. */
+#define ACCEL_COMPILER_acc_device GOMP_DEVICE_GCN
+
+#endif
@@ -219,7 +219,8 @@ enum gomp_map_kind
#define GOMP_DEVICE_NVIDIA_PTX 5
#define GOMP_DEVICE_INTEL_MIC 6
#define GOMP_DEVICE_HSA 7
-#define GOMP_DEVICE_CURRENT 8
+#define GOMP_DEVICE_GCN 8
+#define GOMP_DEVICE_CURRENT 9
#define GOMP_DEVICE_ICV -1
#define GOMP_DEVICE_HOST_FALLBACK -2
@@ -276,6 +277,7 @@ enum gomp_map_kind
#define GOMP_VERSION_NVIDIA_PTX 1
#define GOMP_VERSION_INTEL_MIC 0
#define GOMP_VERSION_HSA 0
+#define GOMP_VERSION_GCN 0
#define GOMP_VERSION_PACK(LIB, DEV) (((LIB) << 16) | (DEV))
#define GOMP_VERSION_LIB(PACK) (((PACK) >> 16) & 0xffff)
@@ -1,3 +1,13 @@
+2019-07-31 Julian Brown <julian@codesourcery.com>
+ Andrew Stubbs <ams@codesourcery.com>
+
+ * Makefile.in: Allow disabling of emutls.
+ * config/gcn/gomp_print.c: New.
+ * config/gcn/reduction.c: New.
+ * config/gcn/t-amdgcn (LIB2ADD): Add gomp_print.c and reduction.c.
+ Disable emutls.c.
+ * config/gcn/t-gcn-hsa: New.
+
2019-06-25 Andrew Stubbs <ams@codesourcery.com>
Backport from mainline:
@@ -430,9 +430,11 @@ LIB2ADD += enable-execute-stack.c
# While emutls.c has nothing to do with EH, it is in LIB2ADDEH*
# instead of LIB2ADD because that's the way to be sure on some targets
# (e.g. *-*-darwin*) only one copy of it is linked.
+ifneq ($(enable_emutls),no)
LIB2ADDEH += $(srcdir)/emutls.c
LIB2ADDEHSTATIC += $(srcdir)/emutls.c
LIB2ADDEHSHARED += $(srcdir)/emutls.c
+endif
# Library members defined in libgcc2.c.
lib2funcs = _muldi3 _negdi2 _lshrdi3 _ashldi3 _ashrdi3 _cmpdi2 _ucmpdi2 \
new file mode 100644
@@ -0,0 +1,101 @@
+/* Newlib may not have been built yet. */
+typedef long int64_t;
+typedef long size_t;
+extern char *strncpy (char *dst, const char *src, size_t length);
+extern void exit(int);
+
+void gomp_print_string (const char *msg, const char *value);
+void gomp_print_integer (const char *msg, int64_t value);
+void gomp_print_double (const char *msg, double value);
+
+/* This struct must match the one used by gcn-run and libgomp.
+ It holds all the data output from a kernel (besides mapping data).
+
+ The base address pointer can be found at kernargs+16.
+
+ The next_output counter must be atomically incremented for each
+ print output. Only when the print data is fully written can the
+ "written" flag be set. */
+struct output {
+ int return_value;
+ /* Number of print slots handed out so far; reserve_print_slot
+ increments it atomically. */
+ unsigned int next_output;
+ /* Ring of print records; a reserved index is reduced modulo 1024. */
+ struct printf_data {
+ /* Stored as 1, with release ordering, after the record is filled. */
+ int written;
+ /* Message text, truncated to 127 characters plus terminator. */
+ char msg[128];
+ /* Selects the union member: 0 integer, 1 double, 2 string. */
+ int type;
+ union {
+ int64_t ivalue;
+ double dvalue;
+ char text[128];
+ };
+ } queue[1024];
+ /* Read by reserve_print_slot to decide whether a slot can be reused;
+ presumably advanced by the host as it drains records (confirm against
+ gcn-run / libgomp). */
+ unsigned int consumed;
+};
+
+/* Reserve the next record in the output queue and return a pointer to
+ it. The caller fills the record and then sets its "written" flag.
+ Once 1024 or more slots have been handed out, this waits until the
+ "consumed" counter has moved past the slot about to be reused. Calls
+ exit (1) if the overflow test below triggers. */
+static struct printf_data *
+reserve_print_slot (void) {
+ /* The kernargs pointer is in s[8:9].
+ This will break if the enable_sgpr_* flags are ever changed. */
+ char *kernargs;
+ asm ("s_mov_b64 %0, s[8:9]" : "=Sg"(kernargs));
+
+ /* The output data is at kernargs[2]. */
+ struct output *data = *(struct output **)(kernargs + 16);
+
+ /* Reserve the slot. */
+ unsigned int index = __atomic_fetch_add (&data->next_output, 1,
+ __ATOMIC_ACQUIRE);
+
+ /* Spinlock while the host catches up. */
+ if (index >= 1024)
+ while (__atomic_load_n (&data->consumed, __ATOMIC_ACQUIRE)
+ <= (index - 1024))
+ asm ("s_sleep 64");
+
+ /* NOTE(review): this reads data->consumed with a plain load, unlike
+ the atomic load in the wait loop above; the exact condition meant by
+ "overflow" is not evident from this file. Confirm against the host
+ side. */
+ if ((unsigned int)(index + 1) < data->consumed)
+ {
+ /* Overflow. */
+ exit (1);
+ }
+ return &(data->queue[index%1024]);
+}
+
+/* Queue a string record: MSG is the label and VALUE the text.  Both
+   are truncated to 127 characters.  */
+void
+gomp_print_string (const char *msg, const char *value)
+{
+  struct printf_data *slot = reserve_print_slot ();
+
+  /* Copy both strings, always leaving room for the terminator.  */
+  strncpy (slot->msg, msg, sizeof (slot->msg) - 1);
+  slot->msg[sizeof (slot->msg) - 1] = '\0';
+  strncpy (slot->text, value, sizeof (slot->text) - 1);
+  slot->text[sizeof (slot->text) - 1] = '\0';
+
+  slot->type = 2; /* String.  */
+
+  /* Mark the record complete only after every field is in place.  */
+  __atomic_store_n (&slot->written, 1, __ATOMIC_RELEASE);
+}
+
+/* Queue an integer record: MSG is the label (truncated to 127
+   characters) and VALUE the number.  */
+void
+gomp_print_integer (const char *msg, int64_t value)
+{
+  struct printf_data *slot = reserve_print_slot ();
+
+  strncpy (slot->msg, msg, sizeof (slot->msg) - 1);
+  slot->msg[sizeof (slot->msg) - 1] = '\0';
+
+  slot->type = 0; /* Integer.  */
+  slot->ivalue = value;
+
+  /* Mark the record complete only after every field is in place.  */
+  __atomic_store_n (&slot->written, 1, __ATOMIC_RELEASE);
+}
+
+/* Queue a floating-point record: MSG is the label (truncated to 127
+   characters) and VALUE the number.  */
+void
+gomp_print_double (const char *msg, double value)
+{
+  struct printf_data *slot = reserve_print_slot ();
+
+  strncpy (slot->msg, msg, sizeof (slot->msg) - 1);
+  slot->msg[sizeof (slot->msg) - 1] = '\0';
+
+  slot->type = 1; /* Double.  */
+  slot->dvalue = value;
+
+  /* Mark the record complete only after every field is in place.  */
+  __atomic_store_n (&slot->written, 1, __ATOMIC_RELEASE);
+}
new file mode 100644
@@ -0,0 +1,30 @@
+/* Oversized reductions lock variable
+ Copyright (C) 2017-2019 Free Software Foundation, Inc.
+ Contributed by Mentor Graphics.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+/* We use a global lock variable for reductions on objects larger than
+ 64 bits. Until and unless proven that lock contention for
+ different reductions is a problem, a single lock will suffice.
+
+ The variable starts out as 0; presumably that is the unlocked state
+ (confirm against the code that takes the lock, which is not in this
+ file). */
+
+unsigned volatile __reduction_lock = 0;
@@ -1,7 +1,11 @@
+LIB2ADD += $(srcdir)/config/gcn/gomp_print.c
+
LIB2ADD += $(srcdir)/config/gcn/lib2-divmod.c \
$(srcdir)/config/gcn/lib2-divmod-hi.c \
$(srcdir)/config/gcn/unwind-gcn.c
+LIB2ADD += $(srcdir)/config/gcn/reduction.c
+
LIB2ADDEH=
LIB2FUNCS_EXCLUDE=__main
@@ -13,5 +17,10 @@ LIBGCC2_DEBUG_CFLAGS = -g0
crt0.o: $(srcdir)/config/gcn/crt0.c
$(crt_compile) -c $<
-# Prevent building "advanced" stuff (for example, gcov support).
+# Prevent building "advanced" stuff (for example, gcov support). We don't
+# support it, and it may cause the build to fail, because of alloca usage, for
+# example.
INHIBIT_LIBC_CFLAGS = -Dinhibit_libc
+
+# Disable emutls.c (temporarily?)
+enable_emutls = no
new file mode 100644
@@ -0,0 +1,52 @@
+# Copyright (C) 2016-2019 Free Software Foundation, Inc.
+#
+# This file is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option)
+# any later version.
+#
+# This file is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3. If not see
+# <http://www.gnu.org/licenses/>.
+
+GTM_H += $(HASH_TABLE_H)
+
+# Target-specific driver support, compiled from config/gcn/driver-gcn.c.
+driver-gcn.o: $(srcdir)/config/gcn/driver-gcn.c
+ $(COMPILE) $<
+ $(POSTCOMPILE)
+
+# mkoffload: the offload image generation tool. GCC_INSTALL_NAME is
+# passed in as a string literal for use by mkoffload.c.
+CFLAGS-mkoffload.o += $(DRIVER_DEFINES) \
+ -DGCC_INSTALL_NAME=\"$(GCC_INSTALL_NAME)\"
+mkoffload.o: $(srcdir)/config/gcn/mkoffload.c
+ $(COMPILE) $<
+ $(POSTCOMPILE)
+ALL_HOST_OBJS += mkoffload.o
+
+mkoffload$(exeext): mkoffload.o collect-utils.o libcommon-target.a \
+ $(LIBIBERTY) $(LIBDEPS)
+ +$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ \
+ mkoffload.o collect-utils.o libcommon-target.a $(LIBIBERTY) $(LIBS)
+
+# gcn-run: built as C (-x c -std=gnu11) with -fno-rtti filtered out of
+# the compile command, and linked against libdl.
+CFLAGS-gcn-run.o += -DVERSION_STRING=$(PKGVERSION_s)
+COMPILE-gcn-run.o = $(filter-out -fno-rtti,$(COMPILE))
+gcn-run.o: $(srcdir)/config/gcn/gcn-run.c
+ $(COMPILE-gcn-run.o) -x c -std=gnu11 -Wno-error=pedantic $<
+ $(POSTCOMPILE)
+ALL_HOST_OBJS += gcn-run.o
+
+gcn-run$(exeext): gcn-run.o
+ +$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ $< -ldl
+
+# One multilib per supported -march value.
+MULTILIB_OPTIONS = march=gfx900 march=gfx906
+MULTILIB_DIRNAMES = gfx900 gfx906
+
+# Extra pass definitions and the object that goes with them.
+PASSES_EXTRA += $(srcdir)/config/gcn/gcn-passes.def
+gcn-tree.o: $(srcdir)/config/gcn/gcn-tree.c
+ $(COMPILE) $<
+ $(POSTCOMPILE)
+ALL_HOST_OBJS += gcn-tree.o
@@ -1,3 +1,160 @@
+2019-07-31 Julian Brown <julian@codesourcery.com>
+ Andrew Stubbs <ams@codesourcery.com>
+
+ * Makefile.am (libgomp_la_SOURCES): Add gomp_print.c.
+ * Makefile.in: Regenerate.
+ * affinity-fmt.c: Rename calls to gomp_write_string from
+ gomp_print_string.
+ * config.h.in (PLUGIN_GCN): Add #undef.
+ * config/nvptx/libgomp-plugin.c: Rename to...
+ * config/accel/libgomp-plugin.c: ...this.
+ * config/nvptx/lock.c: Rename to...
+ * config/accel/lock.c: ...this.
+ * config/nvptx/mutex.c: Rename to...
+ * config/accel/mutex.c: ...this.
+ * config/nvptx/mutex.h: Rename to...
+ * config/accel/mutex.h: ...this.
+ * config/nvptx/oacc-async.c: Rename to...
+ * config/accel/oacc-async.c: ...this.
+ * config/nvptx/oacc-cuda.c: Rename to...
+ * config/accel/oacc-cuda.c: ...this.
+ * config/nvptx/oacc-host.c: Rename to...
+ * config/accel/oacc-host.c: ...this.
+ * config/nvptx/oacc-init.c: Rename to...
+ * config/accel/oacc-init.c: ...this.
+ * config/nvptx/oacc-mem.c: Rename to...
+ * config/accel/oacc-mem.c: ...this.
+ * config/nvptx/oacc-plugin.c: Rename to...
+ * config/accel/oacc-plugin.c: ...this.
+ * config/nvptx/omp-lock.h: Rename to...
+ * config/accel/omp-lock.h: ...this.
+ * config/nvptx/openacc.f90: Rename to...
+ * config/accel/openacc.f90: ...this. Add acc_device_hsa and
+ acc_device_gcn.
+ * config/nvptx/pool.h: Rename to...
+ * config/accel/pool.h: ...this.
+ * config/nvptx/proc.c: Rename to...
+ * config/accel/proc.c: ...this. Add omp_get_num_procs alias.
+ * config/nvptx/ptrlock.c: Rename to...
+ * config/accel/ptrlock.c: ...this.
+ * config/nvptx/ptrlock.h: Rename to...
+ * config/accel/ptrlock.h: ...this.
+ * config/nvptx/sem.c: Rename to...
+ * config/accel/sem.c: ...this.
+ * config/nvptx/sem.h: Rename to...
+ * config/accel/sem.h: ...this.
+ * config/nvptx/thread-stacksize.h: Rename to...
+ * config/accel/thread-stacksize.h: ...this.
+ * config/gcn/affinity-fmt.c: New.
+ * config/gcn/bar.c: New.
+ * config/gcn/bar.h: New.
+ * config/gcn/doacross.h: New.
+ * config/gcn/gomp_print.c: New.
+ * config/gcn/icv-device.c: New.
+ * config/gcn/simple-bar.h: New.
+ * config/gcn/target.c: New.
+ * config/gcn/task.c: New.
+ * config/gcn/team.c: New.
+ * config/gcn/time.c: New.
+ * config/linux/gomp_print.c: New.
+ * configure.ac (amdgcn*-*-*): Disable pthreads.
+ * configure: Regenerated.
+ * configure.tgt (nvptx*-*-*): Add 'accel' config_path.
+ (amdgcn*-*-*): Set config_path.
+ * fortran.c (omp_display_affinity_): Rename calls to gomp_write_string
+ from gomp_print_string.
+ * libgomp-plugin.h (enum offload_target_type): Add
+ OFFLOAD_TARGET_TYPE_GCN.
+ (GOMP_OFFLOAD_openacc_async_construct): Change parameter type to int.
+ * libgomp.h (gcn_thrs, set_gcn_thrs, gomp_thread): Add for __AMDGCN__.
+ (gomp_print_string): Rename to...
+ (gomp_write_string): ...this.
+ * libgomp.map (GOMP_4.5): Add gomp_print_string, gomp_print_integer,
+ gomp_print_double.
+ * oacc-async.c (lookup_goacc_asyncqueue): Pass target_id to async queue
+ construct function.
+ * oacc-host.c (host_openacc_async_construct): Add dummy device
+ parameter.
+ * oacc-init.c (name_of_acc_device_t): Add acc_device_gcn.
+ * oacc-int.h (goacc_thread): Add dummy implementation for __AMDGCN__.
+ * oacc-parallel.c (GOACC_enter_exit_data): Support acc_async_noval and
+ zero-length array sections.
+ * omp.h.in (gomp_print_string, gomp_print_integer, gomp_print_double):
+ Add prototypes.
+ * omp_lib.f90.in (gomp_print_string, gomp_print_integer,
+ gomp_print_double): Add interfaces.
+ * openacc.f90 (openacc_kinds): Add acc_device_gcn. Bump
+ acc_device_current code.
+ * openacc.h (acc_device_t): Add acc_device_gcn, bump acc_device_current
+ code.
+ * openacc_lib.h (acc_device_hsa, acc_device_gcn): Add.
+ * plugin/Makefrag.am (PLUGIN_GCN): Support building GCN plugin.
+ * plugin/configfrag.am (PLUGIN_GCN, PLUGIN_GCN_CPPFLAGS,
+ PLUGIN_GCN_LDFLAGS, PLUGIN_GCN_LIBS): Add. Add support for GCN plugin.
+ * plugin/plugin-gcn.c: New.
+ * target.c (stdio.h): Include unconditionally.
+ (gomp_copy_host2dev): Add function comment.
+ (copy_host2dev_immediate): New function.
+ (gomp_map_pointer, gomp_map_vars_internal): Use
+ copy_host2dev_immediate where appropriate.
+ (offload_target_to_plugin_name): Support gcn.
+ * team.c (gomp_free_pool_helper): Support gcn.
+ * testsuite/Makefile.in: Regenerated.
+ * testsuite/lib/libgomp.exp
+ (check_effective_target_openacc_amdgcn_accel_present): New.
+ (check_effective_target_openacc_amdgcn_accel_selected): New.
+ * testsuite/libgomp.c/c.exp (generate_tests, test_lists,
+ generated_tests): New.
+ (tests): Add generated tests.
+ * testsuite/libgomp.c/for-1.h: New.
+ * testsuite/libgomp.c/for-2.h: New.
+ * testsuite/libgomp.c/for-3.h: New.
+ * testsuite/libgomp.c/for-3.list: New.
+ * testsuite/libgomp.c/for-5.c: New.
+ * testsuite/libgomp.c/for-5.list: New.
+ * testsuite/libgomp.c/for-6.c: New.
+ * testsuite/libgomp.c/for-6.list: New.
+ * testsuite/libgomp.c/target-print-1.c: New.
+ * testsuite/libgomp.fortran/target-print-1.f90: New.
+ * testsuite/libgomp.oacc-c++/c++.exp (amdgcn*): Add support for AMD GCN.
+ * testsuite/libgomp.oacc-c-c++-common/atomic_capture-2.c: Adjust for
+ portability.
+ * testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Skip unsuitable
+ test for AMD GCN.
+ * testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c: Adjust for
+ portability.
+ * testsuite/libgomp.oacc-c-c++-common/loop-v-1.c: Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/loop-w-1.c: Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c: Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c: Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c: Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c: Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c: Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c: Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c: Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c: Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/routine-v-1.c: Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/routine-w-1.c: Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c: Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c: Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/serial-dims.c: Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/private-variables-2.c: New.
+ * testsuite/libgomp.oacc-c-c++-common/tile-1.c: Skip for AMD GCN.
+ * testsuite/libgomp.oacc-c/c.exp (amdgcn*): Add support for AMD GCN.
+ * testsuite/libgomp.oacc-c/offload-target-1.c: Add AMD GCN support.
+ * testsuite/libgomp.oacc-c/print-1.c: New.
+ * testsuite/libgomp.oacc-fortran/fortran.exp (amdgcn*): Add AMD GCN
+ support.
+ * testsuite/libgomp.oacc-fortran/atomic_capture-1.f90: Adjust for
+ portability.
+ * testsuite/libgomp.oacc-fortran/collapse-1.f90: Likewise.
+ * testsuite/libgomp.oacc-fortran/collapse-2.f90: Likewise.
+ * testsuite/libgomp.oacc-fortran/error_stop-1.f: Support AMD GCN.
+ * testsuite/libgomp.oacc-fortran/error_stop-2.f: Support AMD GCN.
+ * testsuite/libgomp.oacc-fortran/error_stop-3.f: Support AMD GCN.
+ * testsuite/libgomp.oacc-fortran/print-1.f90: New.
+
2019-01-23 Thomas Schwinge <thomas@codesourcery.com>
* testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c: Update.
@@ -72,7 +72,7 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c error.c \
proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c target.c \
splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c oacc-init.c \
oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
- affinity-fmt.c teams.c oacc-profiling.c \
+ affinity-fmt.c teams.c gomp_print.c oacc-profiling.c \
oacc-profiling-acc_register_library.c
include $(top_srcdir)/plugin/Makefrag.am
@@ -120,7 +120,8 @@ host_triplet = @host@
target_triplet = @target@
@PLUGIN_NVPTX_TRUE@am__append_1 = libgomp-plugin-nvptx.la
@PLUGIN_HSA_TRUE@am__append_2 = libgomp-plugin-hsa.la
-@USE_FORTRAN_TRUE@am__append_3 = openacc.f90
+@PLUGIN_GCN_TRUE@am__append_3 = libgomp-plugin-gcn.la
+@USE_FORTRAN_TRUE@am__append_4 = openacc.f90
subdir = .
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/../config/acx.m4 \
@@ -180,15 +181,26 @@ am__installdirs = "$(DESTDIR)$(toolexeclibdir)" "$(DESTDIR)$(infodir)" \
"$(DESTDIR)$(toolexeclibdir)"
LTLIBRARIES = $(toolexeclib_LTLIBRARIES)
am__DEPENDENCIES_1 =
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_DEPENDENCIES = libgomp.la \
+@PLUGIN_GCN_TRUE@ $(am__DEPENDENCIES_1)
+@PLUGIN_GCN_TRUE@am_libgomp_plugin_gcn_la_OBJECTS = \
+@PLUGIN_GCN_TRUE@ libgomp_plugin_gcn_la-plugin-gcn.lo
+libgomp_plugin_gcn_la_OBJECTS = $(am_libgomp_plugin_gcn_la_OBJECTS)
+AM_V_lt = $(am__v_lt_@AM_V@)
+am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
+am__v_lt_0 = --silent
+am__v_lt_1 =
+libgomp_plugin_gcn_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
+ $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+ --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+ $(libgomp_plugin_gcn_la_LDFLAGS) $(LDFLAGS) -o $@
+@PLUGIN_GCN_TRUE@am_libgomp_plugin_gcn_la_rpath = -rpath \
+@PLUGIN_GCN_TRUE@ $(toolexeclibdir)
@PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_DEPENDENCIES = libgomp.la \
@PLUGIN_HSA_TRUE@ $(am__DEPENDENCIES_1)
@PLUGIN_HSA_TRUE@am_libgomp_plugin_hsa_la_OBJECTS = \
@PLUGIN_HSA_TRUE@ libgomp_plugin_hsa_la-plugin-hsa.lo
libgomp_plugin_hsa_la_OBJECTS = $(am_libgomp_plugin_hsa_la_OBJECTS)
-AM_V_lt = $(am__v_lt_@AM_V@)
-am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
-am__v_lt_0 = --silent
-am__v_lt_1 =
libgomp_plugin_hsa_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
$(libgomp_plugin_hsa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
@@ -216,7 +228,7 @@ am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \
target.lo splay-tree.lo libgomp-plugin.lo oacc-parallel.lo \
oacc-host.lo oacc-init.lo oacc-mem.lo oacc-async.lo \
oacc-plugin.lo oacc-cuda.lo priority_queue.lo affinity-fmt.lo \
- teams.lo oacc-profiling.lo \
+ teams.lo gomp_print.lo oacc-profiling.lo \
oacc-profiling-acc_register_library.lo $(am__objects_1)
libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
AM_V_P = $(am__v_P_@AM_V@)
@@ -265,7 +277,8 @@ AM_V_FCLD = $(am__v_FCLD_@AM_V@)
am__v_FCLD_ = $(am__v_FCLD_@AM_DEFAULT_V@)
am__v_FCLD_0 = @echo " FCLD " $@;
am__v_FCLD_1 =
-SOURCES = $(libgomp_plugin_hsa_la_SOURCES) \
+SOURCES = $(libgomp_plugin_gcn_la_SOURCES) \
+ $(libgomp_plugin_hsa_la_SOURCES) \
$(libgomp_plugin_nvptx_la_SOURCES) $(libgomp_la_SOURCES)
AM_V_DVIPS = $(am__v_DVIPS_@AM_V@)
am__v_DVIPS_ = $(am__v_DVIPS_@AM_DEFAULT_V@)
@@ -429,6 +442,10 @@ PACKAGE_URL = @PACKAGE_URL@
PACKAGE_VERSION = @PACKAGE_VERSION@
PATH_SEPARATOR = @PATH_SEPARATOR@
PERL = @PERL@
+PLUGIN_GCN = @PLUGIN_GCN@
+PLUGIN_GCN_CPPFLAGS = @PLUGIN_GCN_CPPFLAGS@
+PLUGIN_GCN_LDFLAGS = @PLUGIN_GCN_LDFLAGS@
+PLUGIN_GCN_LIBS = @PLUGIN_GCN_LIBS@
PLUGIN_HSA = @PLUGIN_HSA@
PLUGIN_HSA_CPPFLAGS = @PLUGIN_HSA_CPPFLAGS@
PLUGIN_HSA_LDFLAGS = @PLUGIN_HSA_LDFLAGS@
@@ -502,6 +519,7 @@ pdfdir = @pdfdir@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
+runstatedir = @runstatedir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
@@ -529,7 +547,8 @@ libsubincludedir = $(libdir)/gcc/$(target_alias)/$(gcc_version)/include
AM_CPPFLAGS = $(addprefix -I, $(search_path)) $(LIBFFIINCS)
AM_CFLAGS = $(XCFLAGS)
AM_LDFLAGS = $(XLDFLAGS) $(SECTION_LDFLAGS) $(OPT_LDFLAGS)
-toolexeclib_LTLIBRARIES = libgomp.la $(am__append_1) $(am__append_2)
+toolexeclib_LTLIBRARIES = libgomp.la $(am__append_1) $(am__append_2) \
+ $(am__append_3)
nodist_toolexeclib_HEADERS = libgomp.spec
# -Wc is only a libtool option.
@@ -554,8 +573,8 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \
affinity.c target.c splay-tree.c libgomp-plugin.c \
oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \
oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
- affinity-fmt.c teams.c oacc-profiling.c \
- oacc-profiling-acc_register_library.c $(am__append_3)
+ affinity-fmt.c teams.c gomp_print.c oacc-profiling.c \
+ oacc-profiling-acc_register_library.c $(am__append_4)
# Nvidia PTX OpenACC plugin.
@PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION)
@@ -578,6 +597,18 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \
@PLUGIN_HSA_TRUE@ $(lt_host_flags) $(PLUGIN_HSA_LDFLAGS)
@PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBADD = libgomp.la $(PLUGIN_HSA_LIBS)
@PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBTOOLFLAGS = --tag=disable-static
+
+# AMD GCN plugin
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_version_info = -version-info $(libtool_VERSION)
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_CPPFLAGS = $(AM_CPPFLAGS) $(PLUGIN_GCN_CPPFLAGS) \
+@PLUGIN_GCN_TRUE@ -D_GNU_SOURCE
+
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_LDFLAGS = \
+@PLUGIN_GCN_TRUE@ $(libgomp_plugin_gcn_version_info) \
+@PLUGIN_GCN_TRUE@ $(lt_host_flags) $(PLUGIN_GCN_LDFLAGS)
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_LIBADD = libgomp.la $(PLUGIN_GCN_LIBS)
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_LIBTOOLFLAGS = --tag=disable-static
nodist_noinst_HEADERS = libgomp_f.h
nodist_libsubinclude_HEADERS = omp.h openacc.h acc_prof.h
@USE_FORTRAN_TRUE@nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \
@@ -714,6 +745,9 @@ clean-toolexeclibLTLIBRARIES:
rm -f $${locs}; \
}
+libgomp-plugin-gcn.la: $(libgomp_plugin_gcn_la_OBJECTS) $(libgomp_plugin_gcn_la_DEPENDENCIES) $(EXTRA_libgomp_plugin_gcn_la_DEPENDENCIES)
+ $(AM_V_CCLD)$(libgomp_plugin_gcn_la_LINK) $(am_libgomp_plugin_gcn_la_rpath) $(libgomp_plugin_gcn_la_OBJECTS) $(libgomp_plugin_gcn_la_LIBADD) $(LIBS)
+
libgomp-plugin-hsa.la: $(libgomp_plugin_hsa_la_OBJECTS) $(libgomp_plugin_hsa_la_DEPENDENCIES) $(EXTRA_libgomp_plugin_hsa_la_DEPENDENCIES)
$(AM_V_CCLD)$(libgomp_plugin_hsa_la_LINK) $(am_libgomp_plugin_hsa_la_rpath) $(libgomp_plugin_hsa_la_OBJECTS) $(libgomp_plugin_hsa_la_LIBADD) $(LIBS)
@@ -739,11 +773,13 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/env.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/error.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fortran.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gomp_print.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/icv-device.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/icv.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter_ull.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp-plugin.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_gcn_la-plugin-gcn.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_hsa_la-plugin-hsa.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lock.Plo@am__quote@
@@ -796,6 +832,13 @@ distclean-compile:
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
+libgomp_plugin_gcn_la-plugin-gcn.lo: plugin/plugin-gcn.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libgomp_plugin_gcn_la-plugin-gcn.lo -MD -MP -MF $(DEPDIR)/libgomp_plugin_gcn_la-plugin-gcn.Tpo -c -o libgomp_plugin_gcn_la-plugin-gcn.lo `test -f 'plugin/plugin-gcn.c' || echo '$(srcdir)/'`plugin/plugin-gcn.c
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libgomp_plugin_gcn_la-plugin-gcn.Tpo $(DEPDIR)/libgomp_plugin_gcn_la-plugin-gcn.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='plugin/plugin-gcn.c' object='libgomp_plugin_gcn_la-plugin-gcn.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libgomp_plugin_gcn_la-plugin-gcn.lo `test -f 'plugin/plugin-gcn.c' || echo '$(srcdir)/'`plugin/plugin-gcn.c
+
libgomp_plugin_hsa_la-plugin-hsa.lo: plugin/plugin-hsa.c
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_hsa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_hsa_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libgomp_plugin_hsa_la-plugin-hsa.lo -MD -MP -MF $(DEPDIR)/libgomp_plugin_hsa_la-plugin-hsa.Tpo -c -o libgomp_plugin_hsa_la-plugin-hsa.lo `test -f 'plugin/plugin-hsa.c' || echo '$(srcdir)/'`plugin/plugin-hsa.c
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libgomp_plugin_hsa_la-plugin-hsa.Tpo $(DEPDIR)/libgomp_plugin_hsa_la-plugin-hsa.Plo
@@ -38,7 +38,7 @@
#endif
void
-gomp_print_string (const char *str, size_t len)
+gomp_write_string (const char *str, size_t len)
{
fwrite (str, 1, len, stderr);
}
@@ -462,13 +462,13 @@ omp_display_affinity (const char *format)
if (ret < sizeof buf)
{
buf[ret] = '\n';
- gomp_print_string (buf, ret + 1);
+ gomp_write_string (buf, ret + 1);
return;
}
b = gomp_malloc (ret + 1);
ialias_call (omp_capture_affinity) (b, ret + 1, format);
b[ret] = '\n';
- gomp_print_string (b, ret + 1);
+ gomp_write_string (b, ret + 1);
free (b);
}
@@ -483,13 +483,13 @@ gomp_display_affinity_thread (gomp_thread_handle handle,
if (ret < sizeof buf)
{
buf[ret] = '\n';
- gomp_print_string (buf, ret + 1);
+ gomp_write_string (buf, ret + 1);
return;
}
b = gomp_malloc (ret + 1);
gomp_display_affinity (b, ret + 1, gomp_affinity_format_var,
handle, ts, place);
b[ret] = '\n';
- gomp_print_string (b, ret + 1);
+ gomp_write_string (b, ret + 1);
free (b);
}
@@ -170,6 +170,9 @@
/* Define to the version of this package. */
#undef PACKAGE_VERSION
+/* Define to 1 if the GCN plugin is built, 0 if not. */
+#undef PLUGIN_GCN
+
/* Define to 1 if the HSA plugin is built, 0 if not. */
#undef PLUGIN_HSA
similarity index 100%
rename from libgomp/config/nvptx/libgomp-plugin.c
rename to libgomp/config/accel/libgomp-plugin.c
similarity index 100%
rename from libgomp/config/nvptx/lock.c
rename to libgomp/config/accel/lock.c
similarity index 100%
rename from libgomp/config/nvptx/mutex.c
rename to libgomp/config/accel/mutex.c
similarity index 100%
rename from libgomp/config/nvptx/mutex.h
rename to libgomp/config/accel/mutex.h
similarity index 100%
rename from libgomp/config/nvptx/oacc-async.c
rename to libgomp/config/accel/oacc-async.c
similarity index 100%
rename from libgomp/config/nvptx/oacc-cuda.c
rename to libgomp/config/accel/oacc-cuda.c
similarity index 100%
rename from libgomp/config/nvptx/oacc-host.c
rename to libgomp/config/accel/oacc-host.c
similarity index 100%
rename from libgomp/config/nvptx/oacc-init.c
rename to libgomp/config/accel/oacc-init.c
similarity index 100%
rename from libgomp/config/nvptx/oacc-mem.c
rename to libgomp/config/accel/oacc-mem.c
similarity index 100%
rename from libgomp/config/nvptx/oacc-plugin.c
rename to libgomp/config/accel/oacc-plugin.c
similarity index 100%
rename from libgomp/config/nvptx/omp-lock.h
rename to libgomp/config/accel/omp-lock.h
similarity index 96%
rename from libgomp/config/nvptx/openacc.f90
rename to libgomp/config/accel/openacc.f90
@@ -51,6 +51,8 @@ module openacc_kinds
! integer (acc_device_kind), parameter :: acc_device_host_nonshm = 3 removed.
integer (acc_device_kind), parameter :: acc_device_not_host = 4
integer (acc_device_kind), parameter :: acc_device_nvidia = 5
+ integer (acc_device_kind), parameter :: acc_device_hsa = 6
+ integer (acc_device_kind), parameter :: acc_device_gcn = 7
end module
similarity index 100%
rename from libgomp/config/nvptx/pool.h
rename to libgomp/config/accel/pool.h
similarity index 98%
rename from libgomp/config/nvptx/proc.c
rename to libgomp/config/accel/proc.c
@@ -39,3 +39,4 @@ omp_get_num_procs (void)
{
return gomp_icv (false)->nthreads_var;
}
+ialias (omp_get_num_procs)
similarity index 100%
rename from libgomp/config/nvptx/ptrlock.c
rename to libgomp/config/accel/ptrlock.c
similarity index 100%
rename from libgomp/config/nvptx/ptrlock.h
rename to libgomp/config/accel/ptrlock.h
similarity index 100%
rename from libgomp/config/nvptx/sem.c
rename to libgomp/config/accel/sem.c
similarity index 100%
rename from libgomp/config/nvptx/sem.h
rename to libgomp/config/accel/sem.h
similarity index 100%
rename from libgomp/config/nvptx/thread-stacksize.h
rename to libgomp/config/accel/thread-stacksize.h
new file mode 100644
@@ -0,0 +1,51 @@
+/* Copyright (C) 2018-2019 Free Software Foundation, Inc.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "libgomp.h"
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h> /* For PRIx64. */
+#endif
+#ifdef HAVE_UNAME
+#include <sys/utsname.h>
+#endif
+
+/* The HAVE_GETPID and HAVE_GETHOSTNAME configure tests are passing for nvptx,
+ while the nvptx newlib implementation does not support those functions.
+ Override the configure test results here. */
+#undef HAVE_GETPID
+#undef HAVE_GETHOSTNAME
+
+/* The GCN newlib implementation does not support fwrite, but it does support
+ write. Map fwrite to write; STREAM is ignored and fd 1 is always used. */
+#undef fwrite
+#define fwrite(ptr, size, nmemb, stream) write (1, (ptr), (nmemb) * (size))
+
+#include "../../affinity-fmt.c"
+
new file mode 100644
@@ -0,0 +1,230 @@
+/* Copyright (C) 2015-2019 Free Software Foundation, Inc.
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This is an AMD GCN specific implementation of a barrier synchronization
+ mechanism for libgomp. This type is private to the library. This
+ implementation uses atomic instructions and s_barrier instruction. It
+ uses MEMMODEL_RELAXED here because barriers are within workgroups and
+ therefore don't need to flush caches. */
+
+#include <limits.h>
+#include "libgomp.h"
+
+/* Complete a barrier wait; gomp_barrier_wait passes STATE from *_wait_start. */
+void
+gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+ if (__builtin_expect (state & BAR_WAS_LAST, 0))
+ {
+ /* Next time we'll be awaiting TOTAL threads again. */
+ bar->awaited = bar->total;
+ __atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
+ MEMMODEL_RELAXED); /* Only the last arrival advances the generation. */
+ }
+ asm ("s_barrier" ::: "memory"); /* Executed by every caller, last or not. */
+}
+
+void
+gomp_barrier_wait (gomp_barrier_t *bar)
+{
+ gomp_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
+}
+
+/* Like gomp_barrier_wait, except that if the encountering thread
+ is not the last one to hit the barrier, it returns immediately.
+ The intended usage is that a thread which intends to gomp_barrier_destroy
+ this barrier calls gomp_barrier_wait, while all other threads
+ call gomp_barrier_wait_last. When gomp_barrier_wait returns,
+ the barrier can be safely destroyed. */
+
+void
+gomp_barrier_wait_last (gomp_barrier_t *bar)
+{
+ /* Deferring to gomp_barrier_wait does not use the optimization opportunity
+ allowed by the interface contract for all-but-last participants. The
+ original implementation in config/linux/bar.c handles this better. */
+ gomp_barrier_wait (bar);
+}
+
+void
+gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
+{
+ asm ("s_barrier" ::: "memory"); /* BAR and COUNT are not used here. */
+}
+
+void
+gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+ unsigned int generation, gen;
+
+ if (__builtin_expect (state & BAR_WAS_LAST, 0))
+ {
+ /* Next time we'll be awaiting TOTAL threads again. */
+ struct gomp_thread *thr = gomp_thread ();
+ struct gomp_team *team = thr->ts.team;
+
+ bar->awaited = bar->total;
+ team->work_share_cancelled = 0;
+ if (__builtin_expect (team->task_count, 0))
+ {
+ gomp_barrier_handle_tasks (state);
+ state &= ~BAR_WAS_LAST; /* Then fall through to the wait loop below. */
+ }
+ else
+ {
+ state &= ~BAR_CANCELLED;
+ state += BAR_INCR - BAR_WAS_LAST; /* Bump the counter, drop WAS_LAST. */
+ __atomic_store_n (&bar->generation, state, MEMMODEL_RELAXED);
+ asm ("s_barrier" ::: "memory");
+ return; /* task_count was zero: the last arrival is finished. */
+ }
+ }
+
+ generation = state;
+ state &= ~BAR_CANCELLED;
+ int retry = 100; /* Bound on s_barrier rounds before aborting. */
+ do
+ {
+ if (retry-- == 0)
+ {
+ /* It really shouldn't happen that barriers get out of sync, but
+ if they do then this will loop until they realign, so we need
+ to avoid an infinite loop where the thread just isn't there. */
+ gomp_print_string ("Barrier sync failed (another thread died?);",
+ " aborting.");
+ abort();
+ }
+
+ asm ("s_barrier" ::: "memory");
+ gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); /* NOTE(review): file header says MEMMODEL_RELAXED is used; confirm. */
+ if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
+ {
+ gomp_barrier_handle_tasks (state);
+ gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+ }
+ generation |= gen & BAR_WAITING_FOR_TASK; /* NOTE(review): GENERATION is never read in this function. */
+ }
+ while (gen != state + BAR_INCR); /* Loop until the stored generation is the next one. */
+}
+
+void
+gomp_team_barrier_wait (gomp_barrier_t *bar)
+{
+ gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
+}
+
+void
+gomp_team_barrier_wait_final (gomp_barrier_t *bar)
+{
+ gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar);
+ if (__builtin_expect (state & BAR_WAS_LAST, 0))
+ bar->awaited_final = bar->total;
+ gomp_team_barrier_wait_end (bar, state);
+}
+
+bool
+gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
+ gomp_barrier_state_t state)
+{
+ unsigned int generation, gen;
+
+ if (__builtin_expect (state & BAR_WAS_LAST, 0))
+ {
+ /* Next time we'll be awaiting TOTAL threads again. */
+ /* BAR_CANCELLED should never be set in state here, because
+ cancellation means that at least one of the threads has been
+ cancelled, thus on a cancellable barrier we should never see
+ all threads arrive. */
+ struct gomp_thread *thr = gomp_thread ();
+ struct gomp_team *team = thr->ts.team;
+
+ bar->awaited = bar->total;
+ team->work_share_cancelled = 0;
+ if (__builtin_expect (team->task_count, 0))
+ {
+ gomp_barrier_handle_tasks (state);
+ state &= ~BAR_WAS_LAST; /* Then fall through to the wait loop below. */
+ }
+ else
+ {
+ state += BAR_INCR - BAR_WAS_LAST; /* Bump the counter, drop WAS_LAST. */
+ __atomic_store_n (&bar->generation, state, MEMMODEL_RELAXED);
+ asm ("s_barrier" ::: "memory");
+ return false; /* Completed without cancellation. */
+ }
+ }
+
+ if (__builtin_expect (state & BAR_CANCELLED, 0))
+ return true; /* Cancelled before waiting; no s_barrier is executed. */
+
+ generation = state;
+ int retry = 100; /* Bound on s_barrier rounds before aborting. */
+ do
+ {
+ if (retry-- == 0)
+ {
+ /* It really shouldn't happen that barriers get out of sync, but
+ if they do then this will loop until they realign, so we need
+ to avoid an infinite loop where the thread just isn't there. */
+ gomp_print_string ("Barrier sync failed (another thread died?);",
+ " aborting.");
+ abort();
+ }
+
+ asm ("s_barrier" ::: "memory");
+ gen = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED);
+ if (__builtin_expect (gen & BAR_CANCELLED, 0))
+ return true; /* Cancellation observed while waiting. */
+ if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
+ {
+ gomp_barrier_handle_tasks (state);
+ gen = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED);
+ }
+ generation |= gen & BAR_WAITING_FOR_TASK; /* NOTE(review): GENERATION is never read in this function. */
+ }
+ while (gen != state + BAR_INCR); /* Loop until the stored generation is the next one. */
+
+ return false;
+}
+
+bool
+gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
+{
+ return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar));
+}
+
+void
+gomp_team_barrier_cancel (struct gomp_team *team)
+{
+ gomp_mutex_lock (&team->task_lock);
+ if (team->barrier.generation & BAR_CANCELLED)
+ {
+ gomp_mutex_unlock (&team->task_lock);
+ return; /* Already cancelled: nothing more to do. */
+ }
+ team->barrier.generation |= BAR_CANCELLED; /* Set while task_lock is held. */
+ gomp_mutex_unlock (&team->task_lock);
+ gomp_team_barrier_wake (&team->barrier, INT_MAX); /* Called after the lock is released. */
+}
new file mode 100644
@@ -0,0 +1,168 @@
+/* Copyright (C) 2015-2019 Free Software Foundation, Inc.
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This is an AMD GCN specific implementation of a barrier synchronization
+ mechanism for libgomp. This type is private to the library. This
+ implementation uses atomic instructions and s_barrier instruction. It
+ uses MEMMODEL_RELAXED here because barriers are within workgroups and
+ therefore don't need to flush caches. */
+
+#ifndef GOMP_BARRIER_H
+#define GOMP_BARRIER_H 1
+
+#include "mutex.h"
+
+typedef struct
+{
+ unsigned total;
+ unsigned generation;
+ unsigned awaited;
+ unsigned awaited_final;
+} gomp_barrier_t;
+
+typedef unsigned int gomp_barrier_state_t;
+
+/* The generation field contains a counter in the high bits, with a few
+ low bits dedicated to flags. Note that TASK_PENDING and WAS_LAST can
+ share space because WAS_LAST is never stored back to generation. */
+#define BAR_TASK_PENDING 1
+#define BAR_WAS_LAST 1
+#define BAR_WAITING_FOR_TASK 2
+#define BAR_CANCELLED 4
+#define BAR_INCR 8
+
+static inline void gomp_barrier_init (gomp_barrier_t *bar, unsigned count)
+{
+ bar->generation = 0; /* Counter and flag bits all start out clear. */
+ bar->awaited_final = count;
+ bar->awaited = count;
+ bar->total = count;
+}
+
+static inline void gomp_barrier_reinit (gomp_barrier_t *bar, unsigned count)
+{
+ __atomic_add_fetch (&bar->awaited, count - bar->total, MEMMODEL_RELAXED); /* Shift AWAITED by the change in TOTAL; unsigned wraparound covers a decrease. */
+ bar->total = count;
+}
+
+static inline void gomp_barrier_destroy (gomp_barrier_t *bar)
+{
+}
+
+extern void gomp_barrier_wait (gomp_barrier_t *);
+extern void gomp_barrier_wait_last (gomp_barrier_t *);
+extern void gomp_barrier_wait_end (gomp_barrier_t *, gomp_barrier_state_t);
+extern void gomp_team_barrier_wait (gomp_barrier_t *);
+extern void gomp_team_barrier_wait_final (gomp_barrier_t *);
+extern void gomp_team_barrier_wait_end (gomp_barrier_t *,
+ gomp_barrier_state_t);
+extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *);
+extern bool gomp_team_barrier_wait_cancel_end (gomp_barrier_t *,
+ gomp_barrier_state_t);
+extern void gomp_team_barrier_wake (gomp_barrier_t *, int);
+struct gomp_team;
+extern void gomp_team_barrier_cancel (struct gomp_team *);
+
+static inline gomp_barrier_state_t
+gomp_barrier_wait_start (gomp_barrier_t *bar)
+{
+ unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED);
+ ret &= -BAR_INCR | BAR_CANCELLED; /* Keep only the counter bits and BAR_CANCELLED. */
+ /* A memory barrier is needed before exiting from the various forms
+ of gomp_barrier_wait, to satisfy OpenMP API version 3.1 section
+ 2.8.6 flush Construct, which says there is an implicit flush during
+ a barrier region. NOTE(review): this decrement is MEMMODEL_RELAXED,
+ not MEMMODEL_ACQ_REL, so it does not itself provide that barrier. */
+ if (__atomic_add_fetch (&bar->awaited, -1, MEMMODEL_RELAXED) == 0)
+ ret |= BAR_WAS_LAST; /* This caller's decrement brought AWAITED to zero. */
+ return ret;
+}
+
+static inline gomp_barrier_state_t
+gomp_barrier_wait_cancel_start (gomp_barrier_t *bar)
+{
+ return gomp_barrier_wait_start (bar);
+}
+
+/* This is like gomp_barrier_wait_start, except it decrements
+ bar->awaited_final rather than bar->awaited and should be used
+ for the gomp_team_end barrier only. */
+static inline gomp_barrier_state_t
+gomp_barrier_wait_final_start (gomp_barrier_t *bar)
+{
+ unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED);
+ ret &= -BAR_INCR | BAR_CANCELLED;
+ /* See above gomp_barrier_wait_start comment. */
+ if (__atomic_add_fetch (&bar->awaited_final, -1, MEMMODEL_RELAXED) == 0)
+ ret |= BAR_WAS_LAST;
+ return ret;
+}
+
+static inline bool
+gomp_barrier_last_thread (gomp_barrier_state_t state)
+{
+ return (state & BAR_WAS_LAST) != 0; /* True when STATE carries the WAS_LAST flag. */
+}
+
+/* All the inlines below must be called with team->task_lock
+ held. */
+
+static inline void
+gomp_team_barrier_set_task_pending (gomp_barrier_t *bar)
+{
+ bar->generation |= BAR_TASK_PENDING;
+}
+
+static inline void
+gomp_team_barrier_clear_task_pending (gomp_barrier_t *bar)
+{
+ bar->generation &= ~BAR_TASK_PENDING;
+}
+
+static inline void
+gomp_team_barrier_set_waiting_for_tasks (gomp_barrier_t *bar)
+{
+ bar->generation |= BAR_WAITING_FOR_TASK;
+}
+
+static inline bool
+gomp_team_barrier_waiting_for_tasks (gomp_barrier_t *bar)
+{
+ return (bar->generation & BAR_WAITING_FOR_TASK) != 0;
+}
+
+static inline bool
+gomp_team_barrier_cancelled (gomp_barrier_t *bar)
+{
+ return __builtin_expect ((bar->generation & BAR_CANCELLED) != 0, 0);
+}
+
+static inline void
+gomp_team_barrier_done (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+ bar->generation = (state & -BAR_INCR) + BAR_INCR;
+}
+
+#endif /* GOMP_BARRIER_H */
new file mode 100644
@@ -0,0 +1,58 @@
+/* Copyright (C) 2015-2019 Free Software Foundation, Inc.
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This is the AMD GCN implementation of doacross spinning. */
+
+#ifndef GOMP_DOACROSS_H
+#define GOMP_DOACROSS_H 1
+
+#include "libgomp.h"
+
+static inline int
+cpu_relax (void)
+{
+ /* This can be implemented as just a memory barrier, but a sleep seems
+ like it should allow the wavefront to yield (maybe?)
+ Use the shortest possible sleep time of 1*64 cycles. */
+ asm volatile ("s_sleep\t1" ::: "memory");
+ return 0; /* Always zero; doacross_spin uses the result as an array index. */
+}
+
+static inline void doacross_spin (unsigned long *addr, unsigned long expected,
+ unsigned long cur)
+{
+ /* Prevent compiler from optimizing based on bounds of containing object. */
+ asm ("" : "+r" (addr));
+ do /* The incoming CUR is overwritten before it is first tested. */
+ {
+ /* An alternative implementation might use s_setprio to lower the
+ priority temporarily, and then restore it after. */
+ int i = cpu_relax ();
+ cur = addr[i]; /* cpu_relax returns 0, so this re-reads *ADDR. */
+ }
+ while (cur <= expected); /* Spin until *ADDR exceeds EXPECTED. */
+}
+
+#endif /* GOMP_DOACROSS_H */
new file mode 100644
@@ -0,0 +1,2 @@
+/* The GCN gomp_print routines live in libgcc where they are available
+ to stand-alone toolchains configured without libgomp. */
new file mode 100644
@@ -0,0 +1,72 @@
+/* Copyright (C) 2015-2019 Free Software Foundation, Inc.
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file defines OpenMP API entry points that accelerator targets are
+ expected to replace. */
+
+#include "libgomp.h"
+
+void
+omp_set_default_device (int device_num __attribute__((unused)))
+{
+}
+
+int
+omp_get_default_device (void)
+{
+ return 0;
+}
+
+int
+omp_get_num_devices (void)
+{
+ return 0;
+}
+
+int
+omp_get_num_teams (void)
+{
+ return gomp_num_teams_var + 1;
+}
+
+int __attribute__ ((__optimize__ ("O2")))
+omp_get_team_num (void)
+{
+ return __builtin_gcn_dim_pos (0);
+}
+
+int
+omp_is_initial_device (void)
+{
+ /* AMD GCN is an accelerator-only target. */
+ return 0;
+}
+
+ialias (omp_set_default_device)
+ialias (omp_get_default_device)
+ialias (omp_get_num_devices)
+ialias (omp_get_num_teams)
+ialias (omp_get_team_num)
+ialias (omp_is_initial_device)
new file mode 100644
@@ -0,0 +1,61 @@
+/* Copyright (C) 2015-2019 Free Software Foundation, Inc.
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This is a simplified barrier that is suitable for thread pool
+ synchronization. Only a subset of the full barrier API (bar.h) is exposed.
+ Here in the AMD GCN-specific implementation, we expect that thread pool
+ corresponds to the wavefronts within a work group. */
+
+#ifndef GOMP_SIMPLE_BARRIER_H
+#define GOMP_SIMPLE_BARRIER_H 1
+
+/* AMD GCN has no use for this type. */
+typedef int gomp_simple_barrier_t;
+
+/* GCN barriers block all wavefronts, so the count is not interesting.
+ Initialization is therefore a no-op: BAR and COUNT are both ignored. */
+static inline void
+gomp_simple_barrier_init (gomp_simple_barrier_t *bar, unsigned count)
+{
+}
+
+/* Destroy BAR. Nothing was set up by gomp_simple_barrier_init, so this
+ is a no-op and BAR is ignored. */
+static inline void
+gomp_simple_barrier_destroy (gomp_simple_barrier_t *bar)
+{
+}
+
+/* Wait at the barrier by issuing the "s_barrier" instruction; BAR is
+ not used. The "memory" clobber keeps the compiler from carrying
+ cached memory values across the barrier. */
+static inline void
+gomp_simple_barrier_wait (gomp_simple_barrier_t *bar)
+{
+ asm volatile ("s_barrier" ::: "memory");
+}
+
+/* Variant of the wait used by the last arriving thread. Here it is
+ identical to gomp_simple_barrier_wait; BAR is not used. */
+static inline void
+gomp_simple_barrier_wait_last (gomp_simple_barrier_t *bar)
+{
+ /* GCN has no way to signal a barrier without waiting. */
+ asm volatile ("s_barrier" ::: "memory");
+}
+
+#endif /* GOMP_SIMPLE_BARRIER_H */
new file mode 100644
@@ -0,0 +1,49 @@
+/* Copyright (C) 2017-2019 Free Software Foundation, Inc.
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "libgomp.h"
+#include <limits.h>
+
+/* Set up the teams construct for the calling work-group.
+
+   A nonzero THREAD_LIMIT is recorded in the current task's ICVs; values
+   above INT_MAX are stored as UINT_MAX.  NUM_TEAMS is capped at the
+   number of work-groups in dimension 0 (zero means "use them all").
+   Work-groups whose id is not below the resulting team count free their
+   thread data and exit.  The team count minus one is left in
+   gomp_num_teams_var.  */
+void
+GOMP_teams (unsigned int num_teams, unsigned int thread_limit)
+{
+  if (thread_limit != 0)
+    {
+      struct gomp_task_icv *icv = gomp_icv (true);
+      if (thread_limit > INT_MAX)
+        icv->thread_limit_var = UINT_MAX;
+      else
+        icv->thread_limit_var = thread_limit;
+    }
+
+  const unsigned int group_count = __builtin_gcn_dim_size (0);
+  const unsigned int group_id = __builtin_gcn_dim_pos (0);
+
+  if (num_teams == 0 || num_teams >= group_count)
+    num_teams = group_count;
+  else if (group_id >= num_teams)
+    {
+      /* This work-group is surplus to the requested team count.  */
+      gomp_free_thread (gcn_thrs ());
+      exit (0);
+    }
+
+  gomp_num_teams_var = num_teams - 1;
+}
new file mode 100644
@@ -0,0 +1,39 @@
+/* Copyright (C) 2017-2019 Free Software Foundation, Inc.
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file handles the maintenance of tasks in response to task
+ creation and termination. */
+
+#include "libgomp.h"
+
+/* AMD GCN is an accelerator-only target, so this should never be called.
+ DATA is unused; the body only marks the function as unreachable. */
+
+bool
+gomp_target_task_fn (void *data)
+{
+ __builtin_unreachable ();
+}
+
+#include "../../task.c"
new file mode 100644
@@ -0,0 +1,202 @@
+/* Copyright (C) 2017-2019 Free Software Foundation, Inc.
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file handles maintenance of threads on AMD GCN. */
+
+#include "libgomp.h"
+#include <stdlib.h>
+#include <string.h>
+
+/* Idle loop run by the non-master threads; defined later in this file. */
+static void gomp_thread_start (struct gomp_thread_pool *);
+/* Message printer defined outside this file; used here for fatal
+ diagnostics before aborting. */
+void gomp_print_string (const char *msg, const char *val);
+
+/* This externally visible function handles target region entry. It
+ sets up a per-team thread pool and transfers control by returning to
+ the kernel in the master thread or gomp_thread_start in other threads.
+
+ The name of this function is part of the interface with the compiler: for
+ each OpenMP kernel the compiler configures the stack, then calls here.
+
+ Likewise, gomp_gcn_exit_kernel is called during the kernel epilogue.
+
+ If any of the master thread's allocations fails, a message is printed
+ and the kernel is aborted. */
+
+void
+gomp_gcn_enter_kernel (void)
+{
+ int tid, ntids;
+ /* The thread index and thread count are taken from dimension 1. */
+ tid = __builtin_gcn_dim_pos (1);
+ ntids = __builtin_gcn_dim_size (1);
+ if (tid == 0)
+ {
+ gomp_global_icv.nthreads_var = ntids;
+ /* Starting additional threads is not supported. */
+ gomp_global_icv.dyn_var = true;
+
+ /* One zero-initialized gomp_thread per thread; the array pointer is
+ published through set_gcn_thrs so every thread can find it. */
+ set_gcn_thrs (calloc (ntids, sizeof (struct gomp_thread)));
+ if (gcn_thrs () == NULL)
+ goto oom;
+
+ struct gomp_thread_pool *pool = malloc (sizeof (*pool));
+ if (pool == NULL)
+ goto oom;
+
+ pool->threads = malloc (ntids * sizeof (*pool->threads));
+ if (pool->threads == NULL)
+ goto oom;
+
+ /* TID is reused as the loop index; entry I of the pool points at
+ element I of the gomp_thread array. */
+ for (tid = 0; tid < ntids; tid++)
+ pool->threads[tid] = gcn_thrs () + tid;
+ pool->threads_size = ntids;
+ pool->threads_used = ntids;
+ pool->threads_busy = 1;
+ pool->last_team = NULL;
+ gomp_simple_barrier_init (&pool->threads_dock, ntids);
+
+ gcn_thrs ()[0].thread_pool = pool;
+ /* The pool is now published; this barrier pairs with the one the
+ other threads execute in the else branch below. */
+ asm ("s_barrier" ::: "memory");
+ return; /* Return to kernel. */
+ }
+ else
+ {
+ /* Wait until thread 0 has finished setting up the pool before
+ reading it. */
+ asm ("s_barrier" ::: "memory");
+ gomp_thread_start (gcn_thrs ()[0].thread_pool);
+ /* gomp_thread_start does not return. */
+ }
+
+ /* Reached from the allocation failures in thread 0 above. */
+oom:
+ gomp_print_string ("GCN heap exhausted; try setting GCN_HEAP_SIZE.", "");
+ abort();
+}
+
+/* Kernel epilogue hook: hand the thread data found through gcn_thrs ()
+ to gomp_free_thread. */
+void
+gomp_gcn_exit_kernel (void)
+{
+ gomp_free_thread (gcn_thrs ());
+}
+
+/* Idle loop for every thread other than the master.  The thread binds
+   itself to POOL, then repeatedly waits at the pool's dock barrier and
+   runs whatever function the master assigned to it.  */
+
+static void
+gomp_thread_start (struct gomp_thread_pool *pool)
+{
+  struct gomp_thread *self = gomp_thread ();
+
+  gomp_sem_init (&self->release, 0);
+  self->thread_pool = pool;
+
+  /* The loop exits only when "fn" is assigned "gomp_free_pool_helper",
+     which contains "s_endpgm", or an infinite no-op loop is
+     suspected (this happens when the thread master crashes).  The
+     budget below counts barrier wake-ups that found no work.  */
+  int idle_wakeups_left = 99;
+  for (;;)
+    {
+      gomp_simple_barrier_wait (&pool->threads_dock);
+
+      if (self->fn == NULL)
+        {
+          if (idle_wakeups_left-- > 0)
+            continue;
+
+          gomp_print_string ("team master not responding;",
+                             " slave thread aborting");
+          abort ();
+        }
+
+      self->fn (self->data);
+      self->fn = NULL;
+
+      /* Remember the task before the final team barrier, then finish
+         it once the barrier has been passed.  */
+      struct gomp_task *done_task = self->task;
+      gomp_team_barrier_wait_final (&self->ts.team->barrier);
+      gomp_finish_task (done_task);
+    }
+}
+
+/* Launch a team.  The calling thread becomes the master (team id 0) of
+   TEAM; pool threads 1 .. NTHREADS-1 are given FN and DATA and are then
+   released through the pool's dock barrier.  TASKGROUP is attached to
+   every implicit task.  FLAGS is not used by this implementation.  */
+
+void
+gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
+                 unsigned flags, struct gomp_team *team,
+                 struct gomp_taskgroup *taskgroup)
+{
+  struct gomp_thread *master = gomp_thread ();
+  struct gomp_thread_pool *pool = master->thread_pool;
+  struct gomp_task *parent_task = master->task;
+  struct gomp_task_icv *icv
+    = parent_task != NULL ? &parent_task->icv : &gomp_global_icv;
+
+  /* Always save the previous state, even if this isn't a nested team.
+     In particular, we should save any work share state from an outer
+     orphaned work share construct.  */
+  team->prev_ts = master->ts;
+
+  master->ts.team = team;
+  master->ts.team_id = 0;
+  master->ts.level += 1;
+  if (nthreads > 1)
+    master->ts.active_level += 1;
+  master->ts.work_share = &team->work_shares[0];
+  master->ts.last_work_share = NULL;
+  master->ts.single_count = 0;
+  master->ts.static_trip = 0;
+  master->task = &team->implicit_task[0];
+
+  /* Read nthreads_var before the implicit tasks are initialized, then
+     write that same value into each implicit task's ICVs.  */
+  unsigned long saved_nthreads = icv->nthreads_var;
+  gomp_init_task (master->task, parent_task, icv);
+  team->implicit_task[0].icv.nthreads_var = saved_nthreads;
+  team->implicit_task[0].taskgroup = taskgroup;
+
+  /* A team of one needs no other thread to be woken.  */
+  if (nthreads == 1)
+    return;
+
+  /* Release existing idle threads.  */
+  for (unsigned id = 1; id < nthreads; id++)
+    {
+      struct gomp_thread *worker = pool->threads[id];
+
+      worker->ts.team = team;
+      worker->ts.work_share = &team->work_shares[0];
+      worker->ts.last_work_share = NULL;
+      worker->ts.team_id = id;
+      worker->ts.level = team->prev_ts.level + 1;
+      worker->ts.active_level = master->ts.active_level;
+      worker->ts.single_count = 0;
+      worker->ts.static_trip = 0;
+      worker->task = &team->implicit_task[id];
+      gomp_init_task (worker->task, parent_task, icv);
+      team->implicit_task[id].icv.nthreads_var = saved_nthreads;
+      team->implicit_task[id].taskgroup = taskgroup;
+      worker->fn = fn;
+      worker->data = data;
+      team->ordered_release[id] = &worker->release;
+    }
+
+  /* Meet the workers at the dock barrier so they pick up their work.  */
+  gomp_simple_barrier_wait (&pool->threads_dock);
+}
+
+#include "../../team.c"
new file mode 100644
@@ -0,0 +1,52 @@
+/* Copyright (C) 2015-2019 Free Software Foundation, Inc.
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file implements timer routines for AMD GCN. */
+
+#include "libgomp.h"
+
+/* According to AMD:
+ dGPU RTC is 27MHz
+ AGPU RTC is 100MHz
+ FIXME: DTRT on an APU. */
+#define RTC_TICKS (1.0 / 27000000.0) /* 27MHz */
+
+/* Return the elapsed wall-clock time in seconds: the counter value read
+   by "s_memrealtime", scaled by RTC_TICKS.  */
+double
+omp_get_wtime (void)
+{
+  uint64_t clock;
+  /* The asm must be volatile.  It has an output but no inputs, so
+     without the qualifier the compiler is free to treat it as a pure
+     computation and merge or hoist repeated reads of the counter.  */
+  asm volatile ("s_memrealtime %0\n\t"
+                "s_waitcnt 0" : "=r" (clock));
+  return clock * RTC_TICKS;
+}
+
+/* Return the timer resolution in seconds, i.e. the duration of one
+ tick as given by RTC_TICKS. */
+double
+omp_get_wtick (void)
+{
+ return RTC_TICKS;
+}
+
+ialias (omp_get_wtime)
+ialias (omp_get_wtick)
new file mode 100644
@@ -0,0 +1,20 @@
+#include <stdio.h>
+#include <stdint.h>
+
+/* Print MSG immediately followed by VALUE and a newline to standard
+ output. Both arguments are formatted with "%s", so each must be a
+ NUL-terminated string. */
+void
+gomp_print_string (const char *msg, const char *value)
+{
+ printf ("%s%s\n", msg, value);
+}
+
+/* Print MSG immediately followed by VALUE in decimal and a newline to
+   standard output.  MSG must be a NUL-terminated string.  */
+void
+gomp_print_integer (const char *msg, int64_t value)
+{
+  /* int64_t is "long long" rather than "long" on some ABIs, so "%ld" is
+     not a portable match for it; widen explicitly and use "%lld".  */
+  printf ("%s%lld\n", msg, (long long) value);
+}
+
+/* Print MSG immediately followed by VALUE (formatted with "%f") and a
+ newline to standard output. MSG must be a NUL-terminated string. */
+void
+gomp_print_double (const char *msg, double value)
+{
+ printf ("%s%f\n", msg, value);
+}
@@ -661,6 +661,8 @@ LIBGOMP_BUILD_VERSIONED_SHLIB_FALSE
LIBGOMP_BUILD_VERSIONED_SHLIB_TRUE
OPT_LDFLAGS
SECTION_LDFLAGS
+PLUGIN_GCN_FALSE
+PLUGIN_GCN_TRUE
PLUGIN_HSA_FALSE
PLUGIN_HSA_TRUE
PLUGIN_NVPTX_FALSE
@@ -669,6 +671,10 @@ offload_additional_lib_paths
offload_additional_options
offload_targets
offload_plugins
+PLUGIN_GCN_LIBS
+PLUGIN_GCN_LDFLAGS
+PLUGIN_GCN_CPPFLAGS
+PLUGIN_GCN
PLUGIN_HSA_LIBS
PLUGIN_HSA_LDFLAGS
PLUGIN_HSA_CPPFLAGS
@@ -11396,7 +11402,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<_LT_EOF
-#line 11399 "configure"
+#line 11405 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
@@ -11502,7 +11508,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<_LT_EOF
-#line 11505 "configure"
+#line 11511 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
@@ -15315,7 +15321,7 @@ case "$host" in
*-*-rtems*)
# RTEMS supports Pthreads, but the library is not available at GCC build time.
;;
- nvptx*-*-*)
+ nvptx*-*-* | amdgcn*-*-*)
# NVPTX does not support Pthreads, has its own code replacement.
libgomp_use_pthreads=no
# NVPTX is an accelerator-only target
@@ -15645,6 +15651,15 @@ PLUGIN_HSA_LIBS=
+PLUGIN_GCN=0
+PLUGIN_GCN_CPPFLAGS=
+PLUGIN_GCN_LDFLAGS=
+PLUGIN_GCN_LIBS=
+
+
+
+
+
# Parse '--enable-offload-targets', figure out the corresponding libgomp
# plugins, and configure to find the corresponding offload compilers.
# 'offload_plugins' and 'offload_targets' will be populated in the same order.
@@ -15756,6 +15771,29 @@ rm -f core conftest.err conftest.$ac_objext \
;;
esac
;;
+
+ amdgcn*)
+ case "${target}" in
+ x86_64-*-*)
+ case " ${CC} ${CFLAGS} " in
+ *" -m32 "*)
+ PLUGIN_GCN=0
+ ;;
+ *)
+ tgt_name=gcn
+ PLUGIN_GCN=$tgt
+ PLUGIN_GCN_CPPFLAGS=$HSA_RUNTIME_CPPFLAGS
+ PLUGIN_GCN_LDFLAGS="$HSA_RUNTIME_LDFLAGS"
+ PLUGIN_GCN_LIBS="-ldl"
+ PLUGIN_GCN=1
+ ;;
+ esac
+ ;;
+ *-*-*)
+ PLUGIN_GCN=0
+ ;;
+ esac
+ ;;
*)
as_fn_error $? "unknown offload target specified" "$LINENO" 5
;;
@@ -15820,6 +15858,19 @@ cat >>confdefs.h <<_ACEOF
#define PLUGIN_HSA $PLUGIN_HSA
_ACEOF
+ if test $PLUGIN_GCN = 1; then
+ PLUGIN_GCN_TRUE=
+ PLUGIN_GCN_FALSE='#'
+else
+ PLUGIN_GCN_TRUE='#'
+ PLUGIN_GCN_FALSE=
+fi
+
+
+cat >>confdefs.h <<_ACEOF
+#define PLUGIN_GCN $PLUGIN_GCN
+_ACEOF
+
if test "$HSA_RUNTIME_LIB" != ""; then
HSA_RUNTIME_LIB="$HSA_RUNTIME_LIB/"
@@ -17445,6 +17496,10 @@ if test -z "${PLUGIN_HSA_TRUE}" && test -z "${PLUGIN_HSA_FALSE}"; then
as_fn_error $? "conditional \"PLUGIN_HSA\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
+if test -z "${PLUGIN_GCN_TRUE}" && test -z "${PLUGIN_GCN_FALSE}"; then
+ as_fn_error $? "conditional \"PLUGIN_GCN\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
if test -z "${LIBGOMP_BUILD_VERSIONED_SHLIB_TRUE}" && test -z "${LIBGOMP_BUILD_VERSIONED_SHLIB_FALSE}"; then
as_fn_error $? "conditional \"LIBGOMP_BUILD_VERSIONED_SHLIB\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -177,7 +177,7 @@ case "$host" in
*-*-rtems*)
# RTEMS supports Pthreads, but the library is not available at GCC build time.
;;
- nvptx*-*-*)
+ nvptx*-*-* | amdgcn*-*-*)
# NVPTX does not support Pthreads, has its own code replacement.
libgomp_use_pthreads=no
# NVPTX is an accelerator-only target
@@ -154,7 +154,7 @@ case "${target}" in
;;
nvptx*-*-*)
- config_path="nvptx"
+ config_path="nvptx accel"
;;
*-*-rtems*)
@@ -164,6 +164,10 @@ case "${target}" in
fi
;;
+ amdgcn*-*-*)
+ config_path="gcn accel"
+ ;;
+
*)
;;
@@ -626,7 +626,7 @@ omp_display_affinity_ (const char *format, size_t format_len)
if (ret < sizeof buf)
{
buf[ret] = '\n';
- gomp_print_string (buf, ret + 1);
+ gomp_write_string (buf, ret + 1);
}
else
{
@@ -635,7 +635,7 @@ omp_display_affinity_ (const char *format, size_t format_len)
format_len ? fmt : gomp_affinity_format_var,
gomp_thread_self (), &thr->ts, thr->place);
b[ret] = '\n';
- gomp_print_string (b, ret + 1);
+ gomp_write_string (b, ret + 1);
free (b);
}
if (fmt && fmt != fmt_buf)
@@ -50,7 +50,8 @@ enum offload_target_type
/* OFFLOAD_TARGET_TYPE_HOST_NONSHM = 3 removed. */
OFFLOAD_TARGET_TYPE_NVIDIA_PTX = 5,
OFFLOAD_TARGET_TYPE_INTEL_MIC = 6,
- OFFLOAD_TARGET_TYPE_HSA = 7
+ OFFLOAD_TARGET_TYPE_HSA = 7,
+ OFFLOAD_TARGET_TYPE_GCN = 8
};
/* Container type for passing device properties. */
@@ -120,7 +121,7 @@ extern void GOMP_OFFLOAD_openacc_exec (void (*) (void *), size_t, void **,
void **, unsigned *, void *);
extern void *GOMP_OFFLOAD_openacc_create_thread_data (int);
extern void GOMP_OFFLOAD_openacc_destroy_thread_data (void *);
-extern struct goacc_asyncqueue *GOMP_OFFLOAD_openacc_async_construct (void);
+extern struct goacc_asyncqueue *GOMP_OFFLOAD_openacc_async_construct (int);
extern bool GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *);
extern int GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *);
extern bool GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *);
@@ -692,6 +692,24 @@ static inline struct gomp_thread *gomp_thread (void)
asm ("mov.u32 %0, %%tid.y;" : "=r" (tid));
return nvptx_thrs + tid;
}
+#elif defined __AMDGCN__
+/* Return the gomp_thread array pointer previously stored by
+ set_gcn_thrs. */
+static inline struct gomp_thread *gcn_thrs (void)
+{
+ /* The value is at the bottom of LDS (address 4 in the __lds address
+ space). */
+ struct gomp_thread * __lds *thrs = (struct gomp_thread * __lds *)4;
+ return *thrs;
+}
+/* Store VAL as the gomp_thread array pointer that gcn_thrs returns. */
+static inline void set_gcn_thrs (struct gomp_thread *val)
+{
+ /* The value is at the bottom of LDS (address 4 in the __lds address
+ space). */
+ struct gomp_thread * __lds *thrs = (struct gomp_thread * __lds *)4;
+ *thrs = val;
+}
+/* Return the calling thread's gomp_thread: the element of the gcn_thrs
+ array indexed by this thread's position in dimension 1. */
+static inline struct gomp_thread *gomp_thread (void)
+{
+ int tid = __builtin_gcn_dim_pos(1);
+ return gcn_thrs () + tid;
+}
#elif defined HAVE_TLS || defined USE_EMUTLS
extern __thread struct gomp_thread gomp_tls_data;
static inline struct gomp_thread *gomp_thread (void)
@@ -751,7 +769,7 @@ extern void gomp_display_affinity_place (char *, size_t, size_t *, int);
/* affinity-fmt.c */
-extern void gomp_print_string (const char *str, size_t len);
+extern void gomp_write_string (const char *str, size_t len);
extern void gomp_set_affinity_format (const char *, size_t);
extern void gomp_display_string (char *, size_t, size_t *, const char *,
size_t);
@@ -312,6 +312,9 @@ GOMP_4.5 {
GOMP_loop_ull_nonmonotonic_guided_start;
GOMP_parallel_loop_nonmonotonic_dynamic;
GOMP_parallel_loop_nonmonotonic_guided;
+ gomp_print_string;
+ gomp_print_integer;
+ gomp_print_double;
} GOMP_4.0.1;
GOMP_5.0 {
@@ -100,7 +100,8 @@ lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async)
if (!dev->openacc.async.asyncqueue[async])
{
- dev->openacc.async.asyncqueue[async] = dev->openacc.async.construct_func ();
+ dev->openacc.async.asyncqueue[async]
+ = dev->openacc.async.construct_func (dev->target_id);
if (!dev->openacc.async.asyncqueue[async])
{
@@ -257,7 +257,7 @@ host_openacc_async_queue_callback (struct goacc_asyncqueue *aq
}
static struct goacc_asyncqueue *
-host_openacc_async_construct (void)
+host_openacc_async_construct (int device __attribute__((unused)))
{
/* Non-NULL 0xffff... value as opaque dummy. */
return (struct goacc_asyncqueue *) -1;
@@ -110,8 +110,9 @@ name_of_acc_device_t (enum acc_device_t type)
case acc_device_host: return "host";
case acc_device_not_host: return "not_host";
case acc_device_nvidia: return "nvidia";
- case /* not supported */ _acc_device_intel_mic:
+ case acc_device_gcn: return "gcn";
case /* not supported */ _acc_device_hsa:
+ case /* not supported */ _acc_device_intel_mic:
default: gomp_fatal ("unknown device type %u", (unsigned) type);
}
}
@@ -82,7 +82,14 @@ struct goacc_thread
void *target_tls;
};
-#if defined HAVE_TLS || defined USE_EMUTLS
+#ifdef __AMDGCN__
+/* AMD GCN variant of goacc_thread: there is no per-thread OpenACC state
+ to look up, so a null pointer is always returned. */
+static inline struct goacc_thread *
+goacc_thread (void)
+{
+ /* Unused in the offload libgomp for OpenACC: return a dummy value. */
+ return 0;
+}
+#elif defined HAVE_TLS || defined USE_EMUTLS
extern __thread struct goacc_thread *goacc_tls_data;
static inline struct goacc_thread *
goacc_thread (void)
@@ -732,6 +732,8 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
|| kind == GOMP_MAP_FORCE_FROM)
finalize = true;
}
+ else if (num_waits == acc_async_noval)
+ acc_wait_all_async (async);
/* Determine if this is an "acc enter data". */
for (i = 0; i < mapnum; ++i)
@@ -749,7 +751,8 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
|| kind == GOMP_MAP_FORCE_TO
|| kind == GOMP_MAP_TO
|| kind == GOMP_MAP_ALLOC
- || kind == GOMP_MAP_DECLARE_ALLOCATE)
+ || kind == GOMP_MAP_DECLARE_ALLOCATE
+ || kind == GOMP_MAP_ZERO_LEN_ARRAY_SECTION)
{
data_enter = true;
break;
@@ -761,7 +764,8 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
|| kind == GOMP_MAP_FORCE_DETACH
|| kind == GOMP_MAP_FROM
|| kind == GOMP_MAP_FORCE_FROM
- || kind == GOMP_MAP_DECLARE_DEALLOCATE)
+ || kind == GOMP_MAP_DECLARE_DEALLOCATE
+ || kind == GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION)
break;
gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
@@ -866,6 +870,10 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
case GOMP_MAP_FORCE_ALLOC:
acc_create_async (hostaddrs[i], sizes[i], async);
break;
+ case GOMP_MAP_ZERO_LEN_ARRAY_SECTION:
+ if (hostaddrs[i] != NULL)
+ acc_create_async (hostaddrs[i], 1, async);
+ break;
case GOMP_MAP_TO:
case GOMP_MAP_FORCE_TO:
if (hostaddrs[i])
@@ -986,6 +994,15 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
acc_delete_async (hostaddrs[i], sizes[i], async);
}
break;
+ case GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION:
+ if (acc_is_present (hostaddrs[i], 1))
+ {
+ if (finalize)
+ acc_delete_finalize_async (hostaddrs[i], 1, async);
+ else
+ acc_delete_async (hostaddrs[i], 1, async);
+ }
+ break;
case GOMP_MAP_DETACH:
case GOMP_MAP_FORCE_DETACH:
case GOMP_MAP_FORCE_PRESENT:
@@ -1196,6 +1213,7 @@ GOACC_update (int flags_m, size_t mapnum,
{
case GOMP_MAP_POINTER:
case GOMP_MAP_TO_PSET:
+ case GOMP_MAP_ZERO_LEN_ARRAY_SECTION:
break;
case GOMP_MAP_ALWAYS_POINTER:
@@ -188,6 +188,18 @@ extern __SIZE_TYPE__ omp_capture_affinity (char *, __SIZE_TYPE__, const char *)
extern int omp_pause_resource (omp_pause_resource_t, int) __GOMP_NOTHROW;
extern int omp_pause_resource_all (omp_pause_resource_t) __GOMP_NOTHROW;
+/************************************************************************/
+/* Libgomp extensions. */
+
+#include <stdint.h>
+
+/* Print a message, and value, possibly from a GPU-offloaded function.
+ Primarily intended for debug messages.
+ Maximum message & value length is 128 bytes. */
+void gomp_print_string (const char *msg, const char *value);
+void gomp_print_integer (const char *msg, int64_t value);
+void gomp_print_double (const char *msg, double value);
+
#ifdef __cplusplus
}
#endif
@@ -484,4 +484,30 @@
end function
end interface
+ ! Libgomp extensions.
+
+ ! C binding for gomp_print_string. MSG and STR are passed as C
+ ! character arrays; the C routine formats both with "%s", so callers
+ ! must NUL-terminate them (e.g. by appending c_null_char).
+ interface
+ subroutine gomp_print_string (msg, str) bind(C)
+ use iso_c_binding, only: c_char
+ character (kind=c_char) :: msg(*)
+ character (kind=c_char) :: str(*)
+ end subroutine gomp_print_string
+ end interface
+
+ ! C binding for gomp_print_integer. MSG is a C character array that
+ ! the caller must NUL-terminate; I is a 64-bit integer passed by value.
+ interface
+ subroutine gomp_print_integer (msg, i) bind(C)
+ use iso_c_binding, only: c_char, c_int64_t
+ character (kind=c_char) :: msg(*)
+ integer (kind=c_int64_t), value :: i
+ end subroutine gomp_print_integer
+ end interface
+
+ ! C binding for gomp_print_double. MSG is a C character array that
+ ! the caller must NUL-terminate; D is a C double passed by value.
+ interface
+ subroutine gomp_print_double (msg, d) bind(C)
+ use iso_c_binding, only: c_char, c_double
+ character (kind=c_char) :: msg(*)
+ real (kind=c_double), value :: d
+ end subroutine gomp_print_double
+ end interface
+
end module omp_lib
@@ -37,7 +37,7 @@ module openacc_kinds
integer, parameter :: acc_device_kind = int32
public :: acc_device_none, acc_device_default, acc_device_host
- public :: acc_device_not_host, acc_device_nvidia
+ public :: acc_device_not_host, acc_device_nvidia, acc_device_gcn
! Keep in sync with include/gomp-constants.h.
integer (acc_device_kind), parameter :: acc_device_none = 0
@@ -46,7 +46,9 @@ module openacc_kinds
! integer (acc_device_kind), parameter :: acc_device_host_nonshm = 3 removed.
integer (acc_device_kind), parameter :: acc_device_not_host = 4
integer (acc_device_kind), parameter :: acc_device_nvidia = 5
- integer (acc_device_kind), parameter :: acc_device_current = 8
+ integer (acc_device_kind), parameter :: acc_device_hsa = 7
+ integer (acc_device_kind), parameter :: acc_device_gcn = 8
+ integer (acc_device_kind), parameter :: acc_device_current = 9
public :: acc_device_property
@@ -57,7 +57,8 @@ typedef enum acc_device_t {
acc_device_nvidia = 5,
/* not supported */ _acc_device_intel_mic = 6,
/* not supported */ _acc_device_hsa = 7,
- acc_device_current = 8,
+ acc_device_gcn = 8,
+ acc_device_current = 9,
_ACC_device_hwm,
/* Ensure enumeration is layout compatible with int. */
_ACC_highest = __INT_MAX__,
@@ -42,6 +42,8 @@
! removed.
integer (acc_device_kind), parameter :: acc_device_not_host = 4
integer (acc_device_kind), parameter :: acc_device_nvidia = 5
+ integer (acc_device_kind), parameter :: acc_device_hsa = 7
+ integer (acc_device_kind), parameter :: acc_device_gcn = 8
integer, parameter :: acc_handle_kind = 4
@@ -52,3 +52,17 @@ libgomp_plugin_hsa_la_LDFLAGS += $(PLUGIN_HSA_LDFLAGS)
libgomp_plugin_hsa_la_LIBADD = libgomp.la $(PLUGIN_HSA_LIBS)
libgomp_plugin_hsa_la_LIBTOOLFLAGS = --tag=disable-static
endif
+
+if PLUGIN_GCN
+# AMD GCN plugin
+libgomp_plugin_gcn_version_info = -version-info $(libtool_VERSION)
+toolexeclib_LTLIBRARIES += libgomp-plugin-gcn.la
+libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c
+libgomp_plugin_gcn_la_CPPFLAGS = $(AM_CPPFLAGS) $(PLUGIN_GCN_CPPFLAGS) \
+ -D_GNU_SOURCE
+libgomp_plugin_gcn_la_LDFLAGS = $(libgomp_plugin_gcn_version_info) \
+ $(lt_host_flags)
+libgomp_plugin_gcn_la_LDFLAGS += $(PLUGIN_GCN_LDFLAGS)
+libgomp_plugin_gcn_la_LIBADD = libgomp.la $(PLUGIN_GCN_LIBS)
+libgomp_plugin_gcn_la_LIBTOOLFLAGS = --tag=disable-static
+endif
@@ -137,6 +137,15 @@ AC_SUBST(PLUGIN_HSA_CPPFLAGS)
AC_SUBST(PLUGIN_HSA_LDFLAGS)
AC_SUBST(PLUGIN_HSA_LIBS)
+PLUGIN_GCN=0
+PLUGIN_GCN_CPPFLAGS=
+PLUGIN_GCN_LDFLAGS=
+PLUGIN_GCN_LIBS=
+AC_SUBST(PLUGIN_GCN)
+AC_SUBST(PLUGIN_GCN_CPPFLAGS)
+AC_SUBST(PLUGIN_GCN_LDFLAGS)
+AC_SUBST(PLUGIN_GCN_LIBS)
+
# Parse '--enable-offload-targets', figure out the corresponding libgomp
# plugins, and configure to find the corresponding offload compilers.
# 'offload_plugins' and 'offload_targets' will be populated in the same order.
@@ -237,6 +246,29 @@ if test x"$enable_offload_targets" != x; then
;;
esac
;;
+
+ amdgcn*)
+ case "${target}" in
+ x86_64-*-*)
+ case " ${CC} ${CFLAGS} " in
+ *" -m32 "*)
+ PLUGIN_GCN=0
+ ;;
+ *)
+ tgt_name=gcn
+ PLUGIN_GCN=$tgt
+ PLUGIN_GCN_CPPFLAGS=$HSA_RUNTIME_CPPFLAGS
+ PLUGIN_GCN_LDFLAGS="$HSA_RUNTIME_LDFLAGS"
+ PLUGIN_GCN_LIBS="-ldl"
+ PLUGIN_GCN=1
+ ;;
+ esac
+ ;;
+ *-*-*)
+ PLUGIN_GCN=0
+ ;;
+ esac
+ ;;
*)
AC_MSG_ERROR([unknown offload target specified])
;;
@@ -275,6 +307,9 @@ AC_DEFINE_UNQUOTED([PLUGIN_NVPTX_DYNAMIC], [$PLUGIN_NVPTX_DYNAMIC],
AM_CONDITIONAL([PLUGIN_HSA], [test $PLUGIN_HSA = 1])
AC_DEFINE_UNQUOTED([PLUGIN_HSA], [$PLUGIN_HSA],
[Define to 1 if the HSA plugin is built, 0 if not.])
+AM_CONDITIONAL([PLUGIN_GCN], [test $PLUGIN_GCN = 1])
+AC_DEFINE_UNQUOTED([PLUGIN_GCN], [$PLUGIN_GCN],
+ [Define to 1 if the GCN plugin is built, 0 if not.])
if test "$HSA_RUNTIME_LIB" != ""; then
HSA_RUNTIME_LIB="$HSA_RUNTIME_LIB/"
new file mode 100644
@@ -0,0 +1,3482 @@
+/* Plugin for AMD GCN execution.
+
+ Copyright (C) 2013-2019 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <hsa.h>
+#include <dlfcn.h>
+#include <signal.h>
+#include "libgomp-plugin.h"
+#include "gomp-constants.h"
+#include <elf.h>
+#include "oacc-plugin.h"
+#include "oacc-int.h"
+#include <assert.h>
+
+#define obstack_chunk_alloc GOMP_PLUGIN_malloc
+#define obstack_chunk_free free
+#include "obstack.h"
+
+/* These probably won't be in elf.h for a while. */
+#define R_AMDGPU_NONE 0
+#define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
+#define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
+#define R_AMDGPU_ABS64 3 /* S + A */
+#define R_AMDGPU_REL32 4 /* S + A - P */
+#define R_AMDGPU_REL64 5 /* S + A - P */
+#define R_AMDGPU_ABS32 6 /* S + A */
+#define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
+#define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
+#define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
+#define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
+#define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
+#define reserved 12
+#define R_AMDGPU_RELATIVE64 13 /* B + A */
+
+/* Secure getenv() which returns NULL if running as SUID/SGID. */
+#ifndef HAVE_SECURE_GETENV
+#ifdef HAVE___SECURE_GETENV
+#define secure_getenv __secure_getenv
+#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
+ && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
+
+#include <unistd.h>
+
+/* Implementation of secure_getenv() for targets where it is not provided but
+ we have at least means to test real and effective IDs. */
+
+static char *
+secure_getenv (const char *name)
+{
+  /* Only trust the environment when the real and effective user and group
+     IDs agree, i.e. when the process is not running SUID/SGID.  */
+  bool ids_match = getuid () == geteuid () && getgid () == getegid ();
+
+  return ids_match ? getenv (name) : NULL;
+}
+
+#else
+#define secure_getenv getenv
+#endif
+#endif
+
+struct gcn_thread
+{
+ int async;
+};
+
+static inline struct gcn_thread *
+gcn_thread (void)
+{
+ return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
+}
+
+/* Because the HSA runtime is dlopened, the following structure holds the
+   function pointers used by this plug-in.  */
+
+struct hsa_runtime_fn_info
+{
+ /* HSA runtime. */
+ hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
+ const char **status_string);
+ hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
+ void *value);
+ hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
+ hsa_agent_info_t attribute,
+ void *value);
+ hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
+ hsa_isa_info_t attribute,
+ uint32_t index,
+ void *value);
+ hsa_status_t (*hsa_init_fn) (void);
+ hsa_status_t (*hsa_iterate_agents_fn)
+ (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
+ hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
+ hsa_region_info_t attribute,
+ void *value);
+ hsa_status_t (*hsa_queue_create_fn)
+ (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
+ void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
+ void *data, uint32_t private_segment_size,
+ uint32_t group_segment_size, hsa_queue_t **queue);
+ hsa_status_t (*hsa_agent_iterate_regions_fn)
+ (hsa_agent_t agent,
+ hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
+ hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
+ hsa_status_t (*hsa_executable_create_fn)
+ (hsa_profile_t profile, hsa_executable_state_t executable_state,
+ const char *options, hsa_executable_t *executable);
+ hsa_status_t (*hsa_executable_global_variable_define_fn)
+ (hsa_executable_t executable, const char *variable_name, void *address);
+ hsa_status_t (*hsa_executable_load_code_object_fn)
+ (hsa_executable_t executable, hsa_agent_t agent,
+ hsa_code_object_t code_object, const char *options);
+ hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
+ const char *options);
+ hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
+ uint32_t num_consumers,
+ const hsa_agent_t *consumers,
+ hsa_signal_t *signal);
+ hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
+ void **ptr);
+ hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
+ hsa_status_t (*hsa_memory_free_fn) (void *ptr);
+ hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
+ hsa_status_t (*hsa_executable_get_symbol_fn)
+ (hsa_executable_t executable, const char *module_name,
+ const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
+ hsa_executable_symbol_t *symbol);
+ hsa_status_t (*hsa_executable_symbol_get_info_fn)
+ (hsa_executable_symbol_t executable_symbol,
+ hsa_executable_symbol_info_t attribute, void *value);
+ hsa_status_t (*hsa_executable_iterate_symbols_fn)
+ (hsa_executable_t executable,
+ hsa_status_t (*callback)(hsa_executable_t executable,
+ hsa_executable_symbol_t symbol, void *data),
+ void *data);
+ uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
+ uint64_t value);
+ uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
+ void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
+ hsa_signal_value_t value);
+ void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
+ hsa_signal_value_t value);
+ hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
+ (hsa_signal_t signal, hsa_signal_condition_t condition,
+ hsa_signal_value_t compare_value, uint64_t timeout_hint,
+ hsa_wait_state_t wait_state_hint);
+ hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
+ hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
+
+ hsa_status_t (*hsa_code_object_deserialize_fn)
+ (void *serialized_code_object, size_t serialized_code_object_size,
+ const char *options, hsa_code_object_t *code_object);
+};
+
+/* HSA runtime functions that are initialized in init_hsa_context. */
+
+static struct hsa_runtime_fn_info hsa_fns;
+
+/* Keep the following GOMP prefixed structures in sync with respective parts of
+ the compiler. */
+
+/* Structure describing the run-time and grid properties of an HSA kernel
+   launch.  */
+
+struct GOMP_kernel_launch_attributes
+{
+ /* Number of dimensions the workload has. Maximum number is 3. */
+ uint32_t ndim;
+ /* Size of the grid in the three respective dimensions. */
+ uint32_t gdims[3];
+ /* Size of work-groups in the respective dimensions. */
+ uint32_t wdims[3];
+};
+
+/* Collection of information needed for a dispatch of a kernel from a
+ kernel. */
+
+struct GOMP_hsa_kernel_dispatch
+{
+ /* Pointer to a command queue associated with a kernel dispatch agent. */
+ void *queue;
+ /* Pointer to reserved memory for OMP data struct copying. */
+ void *omp_data_memory;
+ /* Pointer to a memory space used for kernel arguments passing. */
+ void *kernarg_address;
+ /* Kernel object. */
+ uint64_t object;
+ /* Synchronization signal used for dispatch synchronization. */
+ uint64_t signal;
+ /* Private segment size. */
+ uint32_t private_segment_size;
+ /* Group segment size. */
+ uint32_t group_segment_size;
+ /* Number of children kernel dispatches. */
+ uint64_t kernel_dispatch_count;
+ /* Debug purpose argument. */
+ uint64_t debug;
+ /* Levels-var ICV. */
+ uint64_t omp_level;
+ /* Kernel dispatch structures created for children kernel dispatches. */
+ struct GOMP_hsa_kernel_dispatch **children_dispatches;
+ /* Number of threads. */
+ uint32_t omp_num_threads;
+};
+
+/* Structure of the default kernargs segment, supporting gomp_print_*.
+ This will only be used if the requested space is less than 9 bytes. */
+
+struct kernargs {
+ /* Leave space for the real kernel arguments.
+ OpenACC and OpenMP only use one pointer. */
+ int64_t dummy1;
+ int64_t dummy2;
+
+ /* A pointer to struct output, below, for console output data. */
+ int64_t out_ptr;
+
+ /* A pointer to struct heap, below. */
+ int64_t heap_ptr;
+
+ /* Output data. */
+ struct output {
+ int return_value;
+ unsigned int next_output;
+ struct printf_data {
+ int written;
+ char msg[128];
+ int type;
+ union {
+ int64_t ivalue;
+ double dvalue;
+ char text[128];
+ };
+ } queue[1024];
+ unsigned int consumed;
+ } output_data;
+};
+
+/* Heap space, allocated target-side, provided for use of newlib malloc.
+   Each module should have its own heap allocated.
+   Beware that heap usage increases with OpenMP teams.  */
+static size_t gcn_kernel_heap_size = 100*1024*1024; /* 100MB. */
+struct heap {
+ int64_t size;
+ char data[0];
+};
+
+/* GCN specific definition of asynchronous queues. */
+
+#define ASYNC_QUEUE_SIZE 64
+#define DRAIN_QUEUE_SYNCHRONOUS_P false
+#define DEBUG_QUEUES 0
+#define DEBUG_THREAD_SLEEP 0
+#define DEBUG_THREAD_SIGNAL 0
+
+struct kernel_launch
+{
+ struct kernel_info *kernel;
+ void *vars;
+ struct GOMP_kernel_launch_attributes kla;
+};
+
+struct callback
+{
+ void (*fn)(void *);
+ void *data;
+};
+
+struct queue_entry
+{
+ int type;
+ union {
+ struct kernel_launch launch;
+ struct callback callback;
+ } u;
+};
+
+struct goacc_asyncqueue
+{
+ struct agent_info *agent;
+ hsa_queue_t *hsa_queue;
+
+ pthread_t thread_drain_queue;
+ pthread_mutex_t mutex;
+ pthread_cond_t queue_cond_in;
+ pthread_cond_t queue_cond_out;
+ struct queue_entry queue[ASYNC_QUEUE_SIZE];
+ int queue_first;
+ int queue_n;
+ int drain_queue_stop;
+
+ int id;
+ struct goacc_asyncqueue *prev;
+ struct goacc_asyncqueue *next;
+};
+
+/* Part of the libgomp plugin interface. Return the name of the accelerator,
+ which is "gcn". */
+
+const char *
+GOMP_OFFLOAD_get_name (void)
+{
+  /* The name under which this accelerator is reported to libgomp.  */
+  static const char plugin_name[] = "gcn";
+
+  return plugin_name;
+}
+
+/* Part of the libgomp plugin interface.  Return the specific capabilities
+   the accelerator has.  */
+
+unsigned int
+GOMP_OFFLOAD_get_caps (void)
+{
+  /* OpenMP and OpenACC offloading are both advertised.  */
+  unsigned int caps = GOMP_OFFLOAD_CAP_OPENMP_400;
+
+  caps |= GOMP_OFFLOAD_CAP_OPENACC_200;
+
+  /* FIXME: Enable shared memory (GOMP_OFFLOAD_CAP_SHARED_MEM) for APU, but
+     not discrete GPU.  */
+  return caps;
+}
+
+/* Part of the libgomp plugin interface.  Identify as a GCN accelerator.  */
+
+int
+GOMP_OFFLOAD_get_type (void)
+{
+ return OFFLOAD_TARGET_TYPE_GCN;
+}
+
+/* Return the libgomp version number we're compatible with. There is
+ no requirement for cross-version compatibility. */
+
+unsigned
+GOMP_OFFLOAD_version (void)
+{
+ return GOMP_VERSION;
+}
+
+/* Flag to decide whether to print information about what is going on to
+   stderr.  Set in init_environment_variables depending on the environment.  */
+
+static bool debug;
+
+/* Flag to decide if the runtime should suppress a possible fallback to host
+ execution. */
+
+static bool suppress_host_fallback;
+
+/* Flag to locate HSA runtime shared library that is dlopened
+ by this plug-in. */
+
+static const char *hsa_runtime_lib;
+
+/* Flag to decide if the runtime should support also CPU devices (can be
+ a simulator). */
+
+static bool support_cpu_devices;
+
+/* Runtime dimension overrides. Zero indicates default. */
+
+static int override_x_dim = 0;
+static int override_z_dim = 0;
+
+/* Initialize the plugin's tunables from the environment:
+     GCN_DEBUG                   enables debugging output;
+     GCN_SUPPRESS_HOST_FALLBACK  sets suppress_host_fallback;
+     HSA_RUNTIME_LIB             names the HSA runtime library to dlopen;
+     GCN_SUPPORT_CPU_DEVICES     sets support_cpu_devices;
+     GCN_NUM_TEAMS, or else GCN_NUM_GANGS      override the X dimension;
+     GCN_NUM_THREADS, or else GCN_NUM_WORKERS  override the Z dimension;
+     GCN_HEAP_SIZE               overrides the kernel heap size.
+   All lookups go through secure_getenv.  */
+
+static void
+init_environment_variables (void)
+{
+  debug = secure_getenv ("GCN_DEBUG") != NULL;
+  suppress_host_fallback
+    = secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK") != NULL;
+
+  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
+  if (hsa_runtime_lib == NULL)
+    hsa_runtime_lib = HSA_RUNTIME_LIB "libhsa-runtime64.so";
+
+  support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES") != NULL;
+
+  /* GCN_NUM_TEAMS takes precedence over GCN_NUM_GANGS.  */
+  const char *x = secure_getenv ("GCN_NUM_TEAMS");
+  if (!x)
+    x = secure_getenv ("GCN_NUM_GANGS");
+  if (x)
+    override_x_dim = atoi (x);
+
+  /* GCN_NUM_THREADS takes precedence over GCN_NUM_WORKERS.  */
+  const char *z = secure_getenv ("GCN_NUM_THREADS");
+  if (!z)
+    z = secure_getenv ("GCN_NUM_WORKERS");
+  if (z)
+    override_z_dim = atoi (z);
+
+  const char *heap = secure_getenv ("GCN_HEAP_SIZE");
+  if (heap)
+    {
+      /* Keep the default for zero, unparsable and negative values: a
+         negative long stored in a size_t would turn into a huge request.  */
+      long requested = atol (heap);
+      if (requested > 0)
+        gcn_kernel_heap_size = requested;
+    }
+}
+
+/* Print a message to stderr if debugging output is enabled (the GCN_DEBUG
+   environment variable is set).  The expansion has no trailing semicolon so
+   that an invocation followed by ';' is exactly one statement and can be
+   used as the unbraced body of an if that has an else.  */
+
+#define HSA_DPRINT(...) \
+  do \
+  { \
+    if (debug) \
+      { \
+        fprintf (stderr, __VA_ARGS__); \
+      } \
+  } \
+  while (false)
+
+/* Flush stderr if GCN_DEBUG value is set to true. */
+
+#define HSA_FLUSH() \
+ do { \
+ if (debug) \
+ fflush (stderr); \
+ } while (0)
+
+/* Print a logging message with PREFIX to stderr if debugging output is
+   enabled (GCN_DEBUG is set).  */
+
+#define HSA_LOG(prefix, ...) \
+ do \
+ { \
+ HSA_DPRINT (prefix); \
+ HSA_DPRINT (__VA_ARGS__); \
+ HSA_FLUSH (); \
+ } while (false)
+
+/* Print a debugging message to stderr. */
+
+#define HSA_DEBUG(...) HSA_LOG ("GCN debug: ", __VA_ARGS__)
+
+/* Print a warning message to stderr. */
+
+#define HSA_WARNING(...) HSA_LOG ("GCN warning: ", __VA_ARGS__)
+
+/* Print HSA warning STR with an HSA STATUS code. */
+
+static void
+hsa_warn (const char *str, hsa_status_t status)
+{
+  /* Warnings are only shown when debugging output is enabled.  */
+  if (!debug)
+    return;
+
+  /* Fallback text in case the runtime does not provide a message for
+     STATUS; without it an indeterminate pointer would be printed.  */
+  const char *hsa_error_msg = "[unknown]";
+  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
+
+  fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
+           hsa_error_msg);
+}
+
+/* Report a fatal error STR together with the HSA error corresponding to STATUS
+ and terminate execution of the current process. */
+
+static void
+hsa_fatal (const char *str, hsa_status_t status)
+{
+  /* Fallback text in case the runtime does not provide a message for
+     STATUS; without it an indeterminate pointer would be printed.  */
+  const char *hsa_error_msg = "[unknown]";
+  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
+  GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
+                     hsa_error_msg);
+}
+
+/* Like hsa_fatal, except only report error message, and return FALSE
+ for propagating error processing to outside of plugin. */
+
+static bool
+hsa_error (const char *str, hsa_status_t status)
+{
+  /* Fallback text in case the runtime does not provide a message for
+     STATUS; without it an indeterminate pointer would be printed.  */
+  const char *hsa_error_msg = "[unknown]";
+  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
+  GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
+                     hsa_error_msg);
+  /* Always false, so callers can write "return hsa_error (...)".  */
+  return false;
+}
+
+struct hsa_kernel_description
+{
+ const char *name;
+ unsigned omp_data_size;
+ bool gridified_kernel_p;
+ unsigned kernel_dependencies_count;
+ const char **kernel_dependencies;
+ int oacc_dims[3]; /* Only present for GCN kernels. */
+};
+
+struct global_var_info
+{
+ const char *name;
+ void *address;
+};
+
+/* Data passed by the static initializer of a compilation unit containing GCN
+ object code to GOMP_offload_register. */
+
+struct gcn_image_desc
+{
+ union {
+ struct gcn_image {
+ char magic[4]; /* Will be "GCN" for GCN code objects. */
+ size_t size;
+ void *image;
+ } *gcn_image;
+ };
+ const unsigned kernel_count;
+ struct hsa_kernel_description *kernel_infos;
+ const unsigned global_variable_count;
+ struct global_var_info *global_variables;
+};
+
+struct agent_info;
+
+/* Information required to identify, finalize and run any given kernel. */
+
+struct kernel_info
+{
+ /* Name of the kernel, required to locate it within the GCN object-code
+ module. */
+ const char *name;
+ /* Size of memory space for OMP data. */
+ unsigned omp_data_size;
+ /* The specific agent the kernel has been or will be finalized for and run
+ on. */
+ struct agent_info *agent;
+ /* The specific module where the kernel takes place. */
+ struct module_info *module;
+ /* Mutex enforcing that at most one thread ever initializes a kernel for
+ use. A thread should have locked agent->module_rwlock for reading before
+ acquiring it. */
+ pthread_mutex_t init_mutex;
+ /* Flag indicating whether the kernel has been initialized and all fields
+ below it contain valid data. */
+ bool initialized;
+ /* Flag indicating that the kernel has a problem that blocks an execution. */
+ bool initialization_failed;
+ /* The object to be put into the dispatch queue. */
+ uint64_t object;
+ /* Required size of kernel arguments. */
+ uint32_t kernarg_segment_size;
+ /* Required size of group segment. */
+ uint32_t group_segment_size;
+ /* Required size of private segment. */
+ uint32_t private_segment_size;
+ /* List of all kernel dependencies. */
+ const char **dependencies;
+ /* Number of dependencies. */
+ unsigned dependencies_count;
+ /* Maximum OMP data size necessary for kernel from kernel dispatches. */
+ unsigned max_omp_data_size;
+ /* True if the kernel is gridified. */
+ bool gridified_kernel_p;
+};
+
+/* Information about a particular GCN module, its image and kernels. */
+
+struct module_info
+{
+ /* The description with which the program has registered the image. */
+ struct gcn_image_desc *image_desc;
+ /* GCN heap allocation. */
+ struct heap *heap;
+ /* Physical boundaries of the loaded module. */
+ Elf64_Addr phys_address_start;
+ Elf64_Addr phys_address_end;
+
+ bool constructors_run_p;
+ struct kernel_info *init_array_func, *fini_array_func;
+
+ /* Number of kernels in this module. */
+ int kernel_count;
+ /* An array of kernel_info structures describing each kernel in this
+ module. */
+ struct kernel_info kernels[];
+};
+
+/* Description of an HSA GPU agent and the program associated with it. */
+
+struct agent_info
+{
+ /* The HSA ID of the agent. Assigned when hsa_context is initialized. */
+ hsa_agent_t id;
+ /* The user-visible device number. */
+ int device_id;
+ /* Whether the agent has been initialized. The fields below are usable only
+ if it has been. */
+ bool initialized;
+ /* Precomputed check for problem architectures. */
+ bool gfx900_p;
+
+ /* Command queues of the agent. */
+ hsa_queue_t *sync_queue;
+ struct goacc_asyncqueue *async_queues, *omp_async_queue;
+ pthread_mutex_t async_queues_mutex;
+
+ /* The HSA memory region from which to allocate kernel arguments. */
+ hsa_region_t kernarg_region;
+
+ /* Read-write lock that protects kernels which are running or about to be run
+ from interference with loading and unloading of images. Needs to be
+ locked for reading while a kernel is being run, and for writing if the
+ list of modules is manipulated (and thus the HSA program invalidated). */
+ pthread_rwlock_t module_rwlock;
+
+ /* The module associated with this agent. */
+ struct module_info *module;
+
+ /* Mutex enforcing that only one thread will finalize the HSA program. A
+ thread should have locked agent->module_rwlock for reading before
+ acquiring it. */
+ pthread_mutex_t prog_mutex;
+ /* Flag whether the HSA program that consists of all the modules has been
+ finalized. */
+ bool prog_finalized;
+ /* HSA executable - the finalized program that is used to locate kernels. */
+ hsa_executable_t executable;
+};
+
+static bool create_and_finalize_hsa_program (struct agent_info *);
+
+/* Information about the whole HSA environment and all of its agents. */
+
+struct hsa_context_info
+{
+ /* Whether the structure has been initialized. */
+ bool initialized;
+ /* Number of usable GPU HSA agents in the system. */
+ int agent_count;
+ /* Array of agent_info structures describing the individual HSA agents. */
+ struct agent_info *agents;
+};
+
+/* Information about the whole HSA environment and all of its agents. */
+
+static struct hsa_context_info hsa_context;
+
+/* Open the HSA runtime library named by hsa_runtime_lib and store the
+   address of each entry point listed below in the matching member of
+   hsa_fns.  Return true if the library and every listed symbol were found,
+   false as soon as one of them is missing.  */
+
+static bool
+init_hsa_runtime_functions (void)
+{
+ /* Look up FUNCTION in the opened library, store it in
+ hsa_fns.FUNCTION_fn and make the enclosing function return false if the
+ symbol is missing. The expansion already ends in a statement, so the
+ invocations below carry no semicolon. Note that the library handle is
+ not closed on that failure path. */
+#define DLSYM_FN(function) \
+ hsa_fns.function##_fn = dlsym (handle, #function); \
+ if (hsa_fns.function##_fn == NULL) \
+ return false;
+ void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
+ if (handle == NULL)
+ return false;
+
+ /* NOTE(review): struct hsa_runtime_fn_info also declares
+ hsa_isa_get_info_fn, which is not looked up here and therefore stays
+ NULL -- confirm that nothing calls it. */
+ DLSYM_FN (hsa_status_string)
+ DLSYM_FN (hsa_system_get_info)
+ DLSYM_FN (hsa_agent_get_info)
+ DLSYM_FN (hsa_init)
+ DLSYM_FN (hsa_iterate_agents)
+ DLSYM_FN (hsa_region_get_info)
+ DLSYM_FN (hsa_queue_create)
+ DLSYM_FN (hsa_agent_iterate_regions)
+ DLSYM_FN (hsa_executable_destroy)
+ DLSYM_FN (hsa_executable_create)
+ DLSYM_FN (hsa_executable_global_variable_define)
+ DLSYM_FN (hsa_executable_load_code_object)
+ DLSYM_FN (hsa_executable_freeze)
+ DLSYM_FN (hsa_signal_create)
+ DLSYM_FN (hsa_memory_allocate)
+ DLSYM_FN (hsa_memory_copy)
+ DLSYM_FN (hsa_memory_free)
+ DLSYM_FN (hsa_signal_destroy)
+ DLSYM_FN (hsa_executable_get_symbol)
+ DLSYM_FN (hsa_executable_symbol_get_info)
+ DLSYM_FN (hsa_executable_iterate_symbols)
+ DLSYM_FN (hsa_queue_add_write_index_release)
+ DLSYM_FN (hsa_queue_load_read_index_acquire)
+ DLSYM_FN (hsa_signal_wait_acquire)
+ DLSYM_FN (hsa_signal_store_relaxed)
+ DLSYM_FN (hsa_signal_store_release)
+ DLSYM_FN (hsa_signal_load_acquire)
+ DLSYM_FN (hsa_queue_destroy)
+ DLSYM_FN (hsa_code_object_deserialize)
+ return true;
+#undef DLSYM_FN
+}
+
+/* Print the endianness of the HSA system and whether it offers the IMAGES
+   extension, using HSA_DEBUG.  A failed query is reported as FAILED.  */
+
+static void
+dump_hsa_system_info (void)
+{
+  hsa_endianness_t endianness;
+  hsa_status_t status
+    = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
+                                      &endianness);
+  if (status != HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
+  else if (endianness == HSA_ENDIANNESS_LITTLE)
+    HSA_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
+  else if (endianness == HSA_ENDIANNESS_BIG)
+    HSA_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
+  else
+    HSA_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
+
+  uint8_t extensions[128];
+  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
+                                           &extensions);
+  if (status != HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
+  else if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
+    HSA_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
+}
+
+/* Print MACHINE_MODEL symbolically with HSA_DEBUG, prefixed by the
+   attribute name S.  */
+
+static void
+dump_machine_model (hsa_machine_model_t machine_model, const char *s)
+{
+  if (machine_model == HSA_MACHINE_MODEL_SMALL)
+    HSA_DEBUG ("%s: SMALL\n", s);
+  else if (machine_model == HSA_MACHINE_MODEL_LARGE)
+    HSA_DEBUG ("%s: LARGE\n", s);
+  else
+    HSA_DEBUG ("%s: UNKNOWN\n", s);
+}
+
+/* Print PROFILE symbolically with HSA_DEBUG, prefixed by the attribute
+   name S.  */
+
+static void
+dump_profile (hsa_profile_t profile, const char *s)
+{
+  if (profile == HSA_PROFILE_FULL)
+    HSA_DEBUG ("%s: FULL\n", s);
+  else if (profile == HSA_PROFILE_BASE)
+    HSA_DEBUG ("%s: BASE\n", s);
+  else
+    HSA_DEBUG ("%s: UNKNOWN\n", s);
+}
+
+static void dump_hsa_regions (hsa_agent_t agent);
+
+/* Callback of hsa_iterate_agents.  Print the properties of AGENT (name,
+   vendor, machine model, profile, device type, wavefront size and the
+   work-group and grid limits) with HSA_DEBUG, then dump its regions.  A
+   failed query is reported as FAILED and does not stop the dump.  DATA is
+   unused.  Always returns HSA_STATUS_SUCCESS.  */
+
+static hsa_status_t
+dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
+{
+  hsa_status_t status;
+
+  char buf[64];
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
+                                          &buf);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_NAME: FAILED\n");
+
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
+                                          &buf);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
+
+  hsa_machine_model_t machine_model;
+  status
+    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
+                                     &machine_model);
+  if (status == HSA_STATUS_SUCCESS)
+    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
+
+  hsa_profile_t profile;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
+                                          &profile);
+  if (status == HSA_STATUS_SUCCESS)
+    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_PROFILE: FAILED\n");
+
+  hsa_device_type_t device_type;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
+                                          &device_type);
+  if (status == HSA_STATUS_SUCCESS)
+    {
+      switch (device_type)
+        {
+        case HSA_DEVICE_TYPE_CPU:
+          HSA_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
+          break;
+        case HSA_DEVICE_TYPE_GPU:
+          HSA_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
+          break;
+        case HSA_DEVICE_TYPE_DSP:
+          HSA_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
+          break;
+        default:
+          HSA_DEBUG ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
+          break;
+        }
+    }
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_DEVICE: FAILED\n");
+
+  uint32_t size;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
+                                          &size);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
+
+  /* The HSA runtime returns this attribute as uint16_t[3], one limit per
+     dimension; a single uint32_t is too small to receive it.  */
+  uint16_t max_dim[3];
+  status = hsa_fns.hsa_agent_get_info_fn (agent,
+                                          HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
+                                          &max_dim);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u, %u, %u\n",
+               (unsigned) max_dim[0], (unsigned) max_dim[1],
+               (unsigned) max_dim[2]);
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
+
+  uint32_t max_size;
+  status = hsa_fns.hsa_agent_get_info_fn (agent,
+                                          HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
+                                          &max_size);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
+
+  /* The HSA runtime returns this attribute as an hsa_dim3_t (three
+     uint32_t values); a single uint32_t is too small to receive it.  */
+  hsa_dim3_t grid_max_dim;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
+                                          &grid_max_dim);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u, %u, %u\n",
+               (unsigned) grid_max_dim.x, (unsigned) grid_max_dim.y,
+               (unsigned) grid_max_dim.z);
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
+
+  uint32_t grid_max_size;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
+                                          &grid_max_size);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
+
+  dump_hsa_regions (agent);
+
+  return HSA_STATUS_SUCCESS;
+}
+
+/* Return true if the agent is a GPU (or a CPU, when CPU devices are enabled)
+   that supports kernel dispatch and accepts concurrent submissions from
+   different threads.  */
+
+static bool
+suitable_hsa_agent_p (hsa_agent_t agent)
+{
+  hsa_status_t status;
+
+  hsa_device_type_t device_type;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
+                                          &device_type);
+  if (status != HSA_STATUS_SUCCESS)
+    return false;
+
+  /* GPUs always qualify; CPUs only when support_cpu_devices is set; every
+     other device type is rejected.  */
+  if (device_type == HSA_DEVICE_TYPE_CPU)
+    {
+      if (!support_cpu_devices)
+        return false;
+    }
+  else if (device_type != HSA_DEVICE_TYPE_GPU)
+    return false;
+
+  /* The agent must be able to dispatch kernels.  */
+  uint32_t features = 0;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
+                                          &features);
+  if (status != HSA_STATUS_SUCCESS)
+    return false;
+  if ((features & HSA_AGENT_FEATURE_KERNEL_DISPATCH) == 0)
+    return false;
+
+  /* Its queues must be of the MULTI type.  */
+  hsa_queue_type_t queue_type;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
+                                          &queue_type);
+  return status == HSA_STATUS_SUCCESS && queue_type == HSA_QUEUE_TYPE_MULTI;
+}
+
+/* Callback of hsa_iterate_agents, if AGENT is a GPU device, increment
+ agent_count in hsa_context. */
+
+static hsa_status_t
+count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
+{
+ if (suitable_hsa_agent_p (agent))
+ hsa_context.agent_count++;
+ return HSA_STATUS_SUCCESS;
+}
+
+/* Callback of hsa_iterate_agents, if AGENT is a GPU device, assign the agent
+ id to the describing structure in the hsa context. The index of the
+ structure is pointed to by DATA, increment it afterwards. */
+
+static hsa_status_t
+assign_agent_ids (hsa_agent_t agent, void *data)
+{
+  if (suitable_hsa_agent_p (agent))
+    {
+      int *agent_index = (int *) data;
+      /* Never write past the array sized from hsa_context.agent_count.  The
+         index is still advanced for every suitable agent, so a caller that
+         compares it with agent_count notices when more agents showed up
+         than were counted.  */
+      if (*agent_index < hsa_context.agent_count)
+        hsa_context.agents[*agent_index].id = agent;
+      ++*agent_index;
+    }
+  return HSA_STATUS_SUCCESS;
+}
+
+/* Ask the drain thread of AQ to stop, wait until drain_queue_stop becomes 2
+   and then join the thread.  Return immediately if drain_queue_stop is
+   already 2.  A pthread_join failure is fatal.  */
+
+static void
+finalize_async_thread (struct goacc_asyncqueue *aq)
+{
+ pthread_mutex_lock (&aq->mutex);
+ /* NOTE(review): 2 presumably means that the drain thread has already
+ stopped -- confirm against the code that stores this value. */
+ if (aq->drain_queue_stop == 2)
+ {
+ pthread_mutex_unlock (&aq->mutex);
+ return;
+ }
+
+ /* Request the stop and wake whoever waits on queue_cond_in. */
+ aq->drain_queue_stop = 1;
+
+ if (DEBUG_THREAD_SIGNAL)
+ HSA_DEBUG ("Signalling async thread %d:%d: cond_in\n",
+ aq->agent->device_id, aq->id);
+ pthread_cond_signal (&aq->queue_cond_in);
+
+ /* Sleep on queue_cond_out, rechecking the flag after every wake-up;
+ aq->mutex is held whenever the flag is examined. */
+ while (aq->drain_queue_stop != 2)
+ {
+ if (DEBUG_THREAD_SLEEP)
+ HSA_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
+ " to sleep\n", aq->agent->device_id, aq->id);
+ pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
+ if (DEBUG_THREAD_SLEEP)
+ HSA_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
+ aq->agent->device_id, aq->id);
+ }
+
+ HSA_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
+ aq->id);
+ pthread_mutex_unlock (&aq->mutex);
+
+ /* The mutex is released before joining. */
+ int err = pthread_join (aq->thread_drain_queue, NULL);
+ if (err != 0)
+ GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
+ aq->agent->device_id, aq->id, strerror (err));
+ HSA_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
+}
+
+/* Initialize hsa_context if it has not already been done.
+ Return TRUE on success. */
+
+static bool
+init_hsa_context (void)
+{
+  hsa_status_t status;
+  int agent_index = 0;
+
+  if (hsa_context.initialized)
+    return true;
+  init_environment_variables ();
+  if (!init_hsa_runtime_functions ())
+    {
+      HSA_DEBUG ("Run-time could not be dynamically opened\n");
+      if (suppress_host_fallback)
+        GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
+      return false;
+    }
+  status = hsa_fns.hsa_init_fn ();
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Run-time could not be initialized", status);
+  HSA_DEBUG ("HSA run-time initialized for GCN\n");
+
+  if (debug)
+    dump_hsa_system_info ();
+
+  /* count_gpu_agents only ever increments hsa_context.agent_count, so start
+     from zero in case an earlier attempt failed after counting.  */
+  hsa_context.agent_count = 0;
+  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("GCN GPU devices could not be enumerated", status);
+  HSA_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
+
+  hsa_context.agents
+    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
+                                  * sizeof (struct agent_info));
+  /* Second pass: record the HSA id of every suitable agent.  Both the
+     iteration status and the number of ids assigned must check out.  */
+  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Scanning compute agents failed", status);
+  if (agent_index != hsa_context.agent_count)
+    {
+      GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
+      return false;
+    }
+
+  if (debug)
+    {
+      /* A failure to list the agents is reported but is not fatal.  */
+      status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
+      if (status != HSA_STATUS_SUCCESS)
+        GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
+    }
+
+  hsa_context.initialized = true;
+  return true;
+}
+
+/* Verify that hsa_context has already been initialized and return the
+ agent_info structure describing device number N. Return NULL on error. */
+
+static struct agent_info *
+get_agent_info (int n)
+{
+  if (!hsa_context.initialized)
+    {
+      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
+      return NULL;
+    }
+  /* Reject negative device numbers too; they would index before the start
+     of the agents array.  */
+  if (n < 0 || n >= hsa_context.agent_count)
+    {
+      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
+      return NULL;
+    }
+  if (!hsa_context.agents[n].initialized)
+    {
+      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
+      return NULL;
+    }
+  return &hsa_context.agents[n];
+}
+
+/* Callback of dispatch queues to report errors. */
+
+static void
+queue_callback (hsa_status_t status,
+ hsa_queue_t *queue __attribute__ ((unused)),
+ void *data __attribute__ ((unused)))
+{
+ hsa_fatal ("Asynchronous queue error", status);
+}
+
+/* Print the properties of REGION through HSA_DEBUG.  The signature matches
+   the callback that dump_hsa_regions passes to
+   hsa_agent_iterate_regions_fn; DATA is unused.
+
+   Each property is queried independently and a failed query is logged as
+   "FAILED" rather than aborting the dump.  The global flags are only
+   queried for a region whose segment was successfully reported as GLOBAL,
+   and the runtime-allocation granule and alignment only for a region that
+   reports runtime allocation as allowed.  Always returns
+   HSA_STATUS_SUCCESS.  */
+
+static hsa_status_t
+dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
+{
+  hsa_status_t status;
+
+  hsa_region_segment_t segment;
+  /* SEGMENT is only meaningful when the query succeeds, so remember whether
+     the region is global instead of re-reading SEGMENT later.  */
+  bool global_p = false;
+  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
+                                           &segment);
+  if (status == HSA_STATUS_SUCCESS)
+    {
+      if (segment == HSA_REGION_SEGMENT_GLOBAL)
+        {
+          global_p = true;
+          HSA_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
+        }
+      else if (segment == HSA_REGION_SEGMENT_READONLY)
+        HSA_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
+      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
+        HSA_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
+      else if (segment == HSA_REGION_SEGMENT_GROUP)
+        HSA_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
+      else
+        HSA_DEBUG ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
+    }
+  else
+    HSA_DEBUG ("HSA_REGION_INFO_SEGMENT: FAILED\n");
+
+  if (global_p)
+    {
+      uint32_t flags;
+      status
+        = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
+                                          &flags);
+      if (status == HSA_STATUS_SUCCESS)
+        {
+          if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
+            HSA_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
+          if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
+            HSA_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
+          if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
+            HSA_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
+        }
+      else
+        HSA_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
+    }
+
+  size_t size;
+  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
+  else
+    HSA_DEBUG ("HSA_REGION_INFO_SIZE: FAILED\n");
+
+  status
+    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
+                                      &size);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
+  else
+    HSA_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
+
+  bool alloc_allowed;
+  status
+    = hsa_fns.hsa_region_get_info_fn (region,
+                                      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
+                                      &alloc_allowed);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
+  else
+    HSA_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
+
+  /* The remaining properties only apply to regions that permit runtime
+     allocation.  */
+  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
+    return HSA_STATUS_SUCCESS;
+
+  status
+    = hsa_fns.hsa_region_get_info_fn (region,
+                                      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
+                                      &size);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
+  else
+    HSA_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
+
+  size_t align;
+  status
+    = hsa_fns.hsa_region_get_info_fn (region,
+                                      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
+                                      &align);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
+  else
+    HSA_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
+
+  return HSA_STATUS_SUCCESS;
+}
+
+/* Run dump_hsa_region over every memory region of AGENT.  A failed
+   iteration is reported with hsa_error.  */
+
+static void
+dump_hsa_regions (hsa_agent_t agent)
+{
+  hsa_status_t rc
+    = hsa_fns.hsa_agent_iterate_regions_fn (agent, dump_hsa_region, NULL);
+
+  if (rc == HSA_STATUS_SUCCESS)
+    return;
+
+  hsa_error ("Dumping hsa regions failed", rc);
+}
+
+/* Return the name of SYMBOL as a freshly allocated, NUL-terminated string
+   that the caller releases with free.  On failure the problem is reported
+   with hsa_error and NULL is returned.  */
+
+static char *
+get_executable_symbol_name (hsa_executable_symbol_t symbol)
+{
+  /* First ask how long the name is...  */
+  uint32_t name_len;
+  hsa_status_t rc
+    = hsa_fns.hsa_executable_symbol_get_info_fn
+        (symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_len);
+  if (rc != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not get length of symbol name", rc);
+      return NULL;
+    }
+
+  /* ...then fetch it into a buffer with room for the terminator.  */
+  char *name = GOMP_PLUGIN_malloc (name_len + 1);
+  rc = hsa_fns.hsa_executable_symbol_get_info_fn
+         (symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
+  if (rc == HSA_STATUS_SUCCESS)
+    {
+      name[name_len] = '\0';
+      return name;
+    }
+
+  hsa_error ("Could not get symbol name", rc);
+  free (name);
+  return NULL;
+}
+
+/* Callback used by dump_executable_symbols: print the name of SYMBOL through
+   HSA_DEBUG.  A symbol whose name cannot be obtained is skipped.  Always
+   returns HSA_STATUS_SUCCESS.  */
+
+static hsa_status_t
+dump_executable_symbol (hsa_executable_t executable,
+                        hsa_executable_symbol_t symbol,
+                        void *data __attribute__((unused)))
+{
+  char *sym_name = get_executable_symbol_name (symbol);
+  if (sym_name == NULL)
+    return HSA_STATUS_SUCCESS;
+
+  HSA_DEBUG ("executable symbol: %s\n", sym_name);
+  free (sym_name);
+  return HSA_STATUS_SUCCESS;
+}
+
+/* Print every symbol of EXECUTABLE by iterating over it with
+   dump_executable_symbol.  A failed iteration is passed to hsa_fatal.  */
+
+static void
+dump_executable_symbols (hsa_executable_t executable)
+{
+  hsa_status_t rc
+    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
+                                                 dump_executable_symbol,
+                                                 NULL);
+  if (rc == HSA_STATUS_SUCCESS)
+    return;
+
+  hsa_fatal ("Could not dump HSA executable symbols", rc);
+}
+
+/* Callback used by find_executable_symbol.  Copies SYMBOL into the
+   hsa_executable_symbol_t that DATA points to and returns
+   HSA_STATUS_INFO_BREAK, so the first symbol visited is the one kept.  */
+
+static hsa_status_t
+find_executable_symbol_1 (hsa_executable_t executable,
+                          hsa_executable_symbol_t symbol,
+                          void *data)
+{
+  *(hsa_executable_symbol_t *) data = symbol;
+  return HSA_STATUS_INFO_BREAK;
+}
+
+/* Store into *SYMBOL the symbol that find_executable_symbol_1 picks while
+   iterating over EXECUTABLE, and return true.  If the iteration does not
+   end with HSA_STATUS_INFO_BREAK, report the status with hsa_error and
+   return false.  */
+
+static bool
+find_executable_symbol (hsa_executable_t executable,
+                        hsa_executable_symbol_t *symbol)
+{
+  hsa_status_t rc
+    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
+                                                 find_executable_symbol_1,
+                                                 symbol);
+  if (rc == HSA_STATUS_INFO_BREAK)
+    return true;
+
+  hsa_error ("Could not find executable symbol", rc);
+  return false;
+}
+
+/* Callback for hsa_agent_iterate_regions.  When REGION lies in the global
+   segment and carries the KERNARG flag, copy it to the hsa_region_t that
+   DATA points to and stop the iteration with HSA_STATUS_INFO_BREAK.  Any
+   other region yields HSA_STATUS_SUCCESS so the search goes on; a failed
+   property query returns that query's status.  */
+
+static hsa_status_t
+get_kernarg_memory_region (hsa_region_t region, void *data)
+{
+  hsa_region_segment_t seg;
+  hsa_status_t rc
+    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, &seg);
+  if (rc != HSA_STATUS_SUCCESS)
+    return rc;
+  if (seg != HSA_REGION_SEGMENT_GLOBAL)
+    return HSA_STATUS_SUCCESS;
+
+  uint32_t region_flags;
+  rc = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
+                                       &region_flags);
+  if (rc != HSA_STATUS_SUCCESS)
+    return rc;
+  if (!(region_flags & HSA_REGION_GLOBAL_FLAG_KERNARG))
+    return HSA_STATUS_SUCCESS;
+
+  *(hsa_region_t *) data = region;
+  return HSA_STATUS_INFO_BREAK;
+}
+
+/* Part of the libgomp plugin interface.  Report how many agents
+   hsa_context holds, or 0 when init_hsa_context fails.  */
+
+int
+GOMP_OFFLOAD_get_num_devices (void)
+{
+  return init_hsa_context () ? hsa_context.agent_count : 0;
+}
+
+/* Part of the libgomp plugin interface.  Return the value of property PROP
+   for device number DEVICE.
+
+   The result starts out zeroed and stays that way when get_agent_info
+   cannot resolve DEVICE, when PROP is not handled here, or when the
+   underlying HSA query fails.  The memory properties report the size of the
+   agent's kernarg region.  String properties point either at a string
+   literal or at a static buffer that the next call overwrites.  */
+
+union gomp_device_property_value
+GOMP_OFFLOAD_get_property (int device, int prop)
+{
+  union gomp_device_property_value propval = { .val = 0 };
+
+  /* get_agent_info has already reported the error when it returns NULL.  */
+  struct agent_info *agent = get_agent_info (device);
+  if (agent == NULL)
+    return propval;
+
+  static char buf[64];
+  buf[0] = '\0';
+  hsa_status_t status;
+
+  switch (prop)
+    {
+    case GOMP_DEVICE_PROPERTY_FREE_MEMORY:
+      /* Not known: fall through.  */
+    case GOMP_DEVICE_PROPERTY_MEMORY:
+      {
+        size_t size;
+        status = hsa_fns.hsa_region_get_info_fn (agent->kernarg_region,
+                                                 HSA_REGION_INFO_SIZE,
+                                                 &size);
+        /* SIZE is only written on success; never copy it otherwise.  */
+        if (status == HSA_STATUS_SUCCESS)
+          propval.val = size;
+      }
+      break;
+    case GOMP_DEVICE_PROPERTY_NAME:
+      status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
+                                              buf);
+      if (status == HSA_STATUS_SUCCESS)
+        propval.ptr = buf;
+      break;
+    case GOMP_DEVICE_PROPERTY_VENDOR:
+      status = hsa_fns.hsa_agent_get_info_fn (agent->id,
+                                              HSA_AGENT_INFO_VENDOR_NAME,
+                                              buf);
+      if (status == HSA_STATUS_SUCCESS)
+        propval.ptr = buf;
+      break;
+    case GOMP_DEVICE_PROPERTY_DRIVER:
+      propval.ptr = "HSA Runtime";
+      break;
+    default:
+      /* Unhandled property: return the zeroed value.  */
+      break;
+    }
+
+  return propval;
+}
+
+/* Append a kernel-launch entry (KERNEL, VARS and a copy of *KLA) to the
+   asynchronous queue AQ and signal AQ->queue_cond_in.
+
+   KERNEL must belong to the same agent as AQ (asserted).  If the queue
+   already holds ASYNC_QUEUE_SIZE entries, GOMP_PLUGIN_fatal is called.  */
+
+static void
+queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
+                   void *vars, struct GOMP_kernel_launch_attributes *kla)
+{
+  assert (aq->agent == kernel->agent);
+
+  pthread_mutex_lock (&aq->mutex);
+
+  /* Test for a full queue only while holding the mutex: queue_n is updated
+     under the same mutex by the code that drains the queue.  */
+  if (aq->queue_n == ASYNC_QUEUE_SIZE)
+    GOMP_PLUGIN_fatal ("ran out of async queue in thread %d:%d",
+                       aq->agent->device_id, aq->id);
+
+  int queue_last = ((aq->queue_first + aq->queue_n)
+                    % ASYNC_QUEUE_SIZE);
+  if (DEBUG_QUEUES)
+    HSA_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
+               aq->id, queue_last);
+
+  /* Entry type 0 is the launch kind dispatched by execute_queue_entry.  */
+  aq->queue[queue_last].type = 0;
+  aq->queue[queue_last].u.launch.kernel = kernel;
+  aq->queue[queue_last].u.launch.vars = vars;
+  aq->queue[queue_last].u.launch.kla = *kla;
+
+  aq->queue_n++;
+
+  if (DEBUG_THREAD_SIGNAL)
+    HSA_DEBUG ("signalling async thread %d:%d: cond_in\n",
+               aq->agent->device_id, aq->id);
+  pthread_cond_signal (&aq->queue_cond_in);
+
+  pthread_mutex_unlock (&aq->mutex);
+}
+
+/* Append a callback entry to the asynchronous queue AQ and signal
+   AQ->queue_cond_in.  When the entry is executed, FN is called with DATA as
+   its only argument.
+
+   If the queue already holds ASYNC_QUEUE_SIZE entries, GOMP_PLUGIN_fatal is
+   called.  */
+
+static void
+queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
+                     void *data)
+{
+  pthread_mutex_lock (&aq->mutex);
+
+  /* Test for a full queue only while holding the mutex: queue_n is updated
+     under the same mutex by the code that drains the queue.  */
+  if (aq->queue_n == ASYNC_QUEUE_SIZE)
+    GOMP_PLUGIN_fatal ("Async thread %d:%d: error: queue overflowed",
+                       aq->agent->device_id, aq->id);
+
+  int queue_last = ((aq->queue_first + aq->queue_n)
+                    % ASYNC_QUEUE_SIZE);
+  if (DEBUG_QUEUES)
+    HSA_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
+               aq->id, queue_last);
+
+  /* Entry type 1 is the callback kind dispatched by execute_queue_entry.  */
+  aq->queue[queue_last].type = 1;
+  aq->queue[queue_last].u.callback.fn = fn;
+  aq->queue[queue_last].u.callback.data = data;
+
+  aq->queue_n++;
+
+  if (DEBUG_THREAD_SIGNAL)
+    HSA_DEBUG ("signalling async thread %d:%d: cond_in\n",
+               aq->agent->device_id, aq->id);
+  pthread_cond_signal (&aq->queue_cond_in);
+
+  pthread_mutex_unlock (&aq->mutex);
+}
+
+static void run_kernel (struct kernel_info *kernel, void *vars,
+ struct GOMP_kernel_launch_attributes *kla,
+ struct goacc_asyncqueue *aq, bool module_locked);
+
+/* Execute the entry stored at position INDEX of AQ's queue.  A type 0 entry
+   launches its kernel through run_kernel; a type 1 entry invokes its
+   callback.  Any other type is passed to GOMP_PLUGIN_fatal.  */
+
+static void
+execute_queue_entry (struct goacc_asyncqueue *aq, int index)
+{
+  struct queue_entry *item = &aq->queue[index];
+
+  switch (item->type)
+    {
+    case 0:
+      if (DEBUG_QUEUES)
+        HSA_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
+                   aq->agent->device_id, aq->id, index);
+      run_kernel (item->u.launch.kernel, item->u.launch.vars,
+                  &item->u.launch.kla, aq, false);
+      if (DEBUG_QUEUES)
+        HSA_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
+                   aq->agent->device_id, aq->id, index);
+      break;
+
+    case 1:
+      if (DEBUG_QUEUES)
+        HSA_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
+                   aq->agent->device_id, aq->id, index);
+      item->u.callback.fn (item->u.callback.data);
+      if (DEBUG_QUEUES)
+        HSA_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
+                   aq->agent->device_id, aq->id, index);
+      break;
+
+    default:
+      GOMP_PLUGIN_fatal ("Unknown queue element");
+    }
+}
+
+/* Consume the entries of the asynchronous queue passed in THREAD_ARG (a
+   struct goacc_asyncqueue *) until its drain_queue_stop flag becomes
+   non-zero.  The signature and the debug messages suggest this is the entry
+   point of a queue's worker thread -- presumably started with
+   pthread_create; confirm against the caller.
+
+   When DRAIN_QUEUE_SYNCHRONOUS_P is set nothing is drained here: the
+   function just sets drain_queue_stop to 2 and returns.
+
+   Otherwise, with the queue mutex held, the loop either executes the entry
+   at queue_first (dropping the mutex for the duration of the call) and then
+   removes it and broadcasts queue_cond_out, or, when the queue is empty,
+   waits on queue_cond_in.  On exit drain_queue_stop is set to 2 and
+   queue_cond_out is broadcast one last time.  Always returns NULL.  */
+static void *
+drain_queue (void *thread_arg)
+{
+ struct goacc_asyncqueue *aq = thread_arg;
+
+ if (DRAIN_QUEUE_SYNCHRONOUS_P)
+ {
+ aq->drain_queue_stop = 2;
+ return NULL;
+ }
+
+ pthread_mutex_lock (&aq->mutex);
+
+ while (true)
+ {
+ /* The stop flag is examined with the mutex held.  */
+ if (aq->drain_queue_stop)
+ break;
+
+ if (aq->queue_n > 0)
+ {
+ /* Run the entry without holding the mutex.  The entry stays in the
+ queue (queue_first and queue_n are untouched) until it has finished.  */
+ pthread_mutex_unlock (&aq->mutex);
+ execute_queue_entry (aq, aq->queue_first);
+
+ /* Now remove the finished entry and tell waiters on queue_cond_out.  */
+ pthread_mutex_lock (&aq->mutex);
+ aq->queue_first = ((aq->queue_first + 1)
+ % ASYNC_QUEUE_SIZE);
+ aq->queue_n--;
+
+ if (DEBUG_THREAD_SIGNAL)
+ HSA_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
+ aq->agent->device_id, aq->id);
+ pthread_cond_broadcast (&aq->queue_cond_out);
+ pthread_mutex_unlock (&aq->mutex);
+
+ if (DEBUG_QUEUES)
+ HSA_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
+ aq->id);
+ /* Re-acquire the mutex so the next iteration starts with it held.  */
+ pthread_mutex_lock (&aq->mutex);
+ }
+ else
+ {
+ if (DEBUG_THREAD_SLEEP)
+ HSA_DEBUG ("Async thread %d:%d: going to sleep\n",
+ aq->agent->device_id, aq->id);
+ /* Nothing queued: wait for a signal on queue_cond_in, then loop round
+ to recheck both the stop flag and the queue.  */
+ pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
+ if (DEBUG_THREAD_SLEEP)
+ HSA_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
+ aq->agent->device_id, aq->id);
+ }
+ }
+
+ /* Still holding the mutex here: record that draining has stopped.  */
+ aq->drain_queue_stop = 2;
+ if (DEBUG_THREAD_SIGNAL)
+ HSA_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
+ aq->agent->device_id, aq->id);
+ pthread_cond_broadcast (&aq->queue_cond_out);
+ pthread_mutex_unlock (&aq->mutex);
+
+ HSA_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
+ return NULL;
+}
+
+/* Execute and remove every entry currently in AQ's queue, in order, on the
+   calling thread.  AQ->mutex is held for the whole operation.  */
+
+static void
+drain_queue_synchronous (struct goacc_asyncqueue *aq)
+{
+  pthread_mutex_lock (&aq->mutex);
+
+  for (; aq->queue_n > 0; aq->queue_n--)
+    {
+      execute_queue_entry (aq, aq->queue_first);
+      aq->queue_first = (aq->queue_first + 1) % ASYNC_QUEUE_SIZE;
+    }
+
+  pthread_mutex_unlock (&aq->mutex);
+}
+
+/* Part of the libgomp plugin interface.  Initialize agent number N so that it
+   can be used for computation.  Return TRUE on success.
+
+   Initialization sets up the agent's rwlock and mutexes, queries its
+   maximum queue size and name, creates its synchronous command queue and
+   selects the memory region used for kernel arguments.  An agent that is
+   already initialized is left untouched.  FALSE is returned, after the
+   error has been reported, when the HSA context cannot be initialized, when
+   N is outside [0, hsa_context.agent_count), or when any step fails.  */
+
+bool
+GOMP_OFFLOAD_init_device (int n)
+{
+  if (!init_hsa_context ())
+    return false;
+  /* Check the lower bound explicitly: a negative N must never be used to
+     index the agents array.  */
+  if (n < 0 || n >= hsa_context.agent_count)
+    {
+      GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
+      return false;
+    }
+  struct agent_info *agent = &hsa_context.agents[n];
+
+  if (agent->initialized)
+    return true;
+
+  agent->device_id = n;
+
+  if (pthread_rwlock_init (&agent->module_rwlock, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
+      return false;
+    }
+  if (pthread_mutex_init (&agent->prog_mutex, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
+      return false;
+    }
+  if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
+      return false;
+    }
+  agent->async_queues = NULL;
+  agent->omp_async_queue = NULL;
+
+  uint32_t queue_size;
+  hsa_status_t status;
+  status = hsa_fns.hsa_agent_get_info_fn (agent->id,
+                                          HSA_AGENT_INFO_QUEUE_MAX_SIZE,
+                                          &queue_size);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Error requesting maximum queue size of the GCN agent",
+                      status);
+
+  /* Pass the buffer itself (a char *), not the address of the array.  */
+  char buf[64];
+  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
+                                          buf);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Error querying the name of the agent", status);
+  /* Remember whether the agent name starts with "gfx900".  */
+  agent->gfx900_p = (strncmp (buf, "gfx900", 6) == 0);
+
+  status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
+                                        HSA_QUEUE_TYPE_MULTI, queue_callback,
+                                        NULL, UINT32_MAX, UINT32_MAX,
+                                        &agent->sync_queue);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Error creating command queue", status);
+
+  /* Use an all-ones handle as the "nothing found" marker: the iteration
+     callback only overwrites it when it finds a kernarg region, so the
+     iteration status itself does not need to be examined.  */
+  agent->kernarg_region.handle = (uint64_t) -1;
+  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
+                                                 get_kernarg_memory_region,
+                                                 &agent->kernarg_region);
+  if (agent->kernarg_region.handle == (uint64_t) -1)
+    {
+      GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
+                         "arguments");
+      return false;
+    }
+  HSA_DEBUG ("Selected kernel arguments memory region:\n");
+  dump_hsa_region (agent->kernarg_region, NULL);
+
+  HSA_DEBUG ("GCN agent %d initialized\n", n);
+
+  agent->initialized = true;
+  return true;
+}
+
+/* Tear down the HSA program of AGENT, if it has one.  The executable is
+   destroyed, every kernel of the agent's module is marked uninitialized, the
+   module heap is released, and agent->prog_finalized is cleared.  Return
+   TRUE on success (including when there was nothing to destroy) and the
+   result of hsa_error when the executable cannot be destroyed.  */
+
+static bool
+destroy_hsa_program (struct agent_info *agent)
+{
+  if (!agent->prog_finalized)
+    return true;
+
+  HSA_DEBUG ("Destroying the current GCN program.\n");
+
+  hsa_status_t rc = hsa_fns.hsa_executable_destroy_fn (agent->executable);
+  if (rc != HSA_STATUS_SUCCESS)
+    return hsa_error ("Could not destroy GCN executable", rc);
+
+  struct module_info *mod = agent->module;
+  if (mod != NULL)
+    {
+      for (int k = 0; k < mod->kernel_count; k++)
+        mod->kernels[k].initialized = false;
+
+      if (mod->heap != NULL)
+        {
+          hsa_fns.hsa_memory_free_fn (mod->heap);
+          mod->heap = NULL;
+        }
+    }
+
+  agent->prog_finalized = false;
+  return true;
+}
+
+/* Fill in KERNEL from the description D and record the AGENT and MODULE it
+   belongs to, then create its init_mutex.  Return true on success; report an
+   error and return false if the mutex cannot be created.  */
+
+static bool
+init_basic_kernel_info (struct kernel_info *kernel,
+                        struct hsa_kernel_description *d,
+                        struct agent_info *agent,
+                        struct module_info *module)
+{
+  /* Ownership links.  */
+  kernel->agent = agent;
+  kernel->module = module;
+
+  /* Fields copied straight from the description.  */
+  kernel->name = d->name;
+  kernel->omp_data_size = d->omp_data_size;
+  kernel->gridified_kernel_p = d->gridified_kernel_p;
+  kernel->dependencies_count = d->kernel_dependencies_count;
+  kernel->dependencies = d->kernel_dependencies;
+
+  if (pthread_mutex_init (&kernel->init_mutex, NULL) == 0)
+    return true;
+
+  GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
+  return false;
+}
+
+static void init_kernel (struct kernel_info *kernel);
+
+/* Part of the libgomp plugin interface.  Load GCN object-code module
+   described by struct gcn_image_desc in TARGET_DATA and return references to
+   kernel descriptors in TARGET_TABLE.
+
+   ORD is the device number and VERSION the offload data version.  On
+   success *TARGET_TABLE points to a newly allocated array holding one
+   addr_pair per kernel other than "_init_array"/"_fini_array", followed by
+   one per global variable, and the number of entries in that array is
+   returned.  The "_init_array" kernel, if present, is run before returning.
+   On failure -1 is returned.  */
+
+int
+GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
+                         struct addr_pair **target_table)
+{
+  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_GCN)
+    {
+      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
+                         " (expected %u, received %u)",
+                         GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
+      return -1;
+    }
+
+  struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
+  struct agent_info *agent;
+  struct addr_pair *pair;
+  struct addr_pair *table;
+  struct module_info *module;
+  struct kernel_info *kernel;
+  int kernel_count = image_desc->kernel_count;
+  unsigned var_count = image_desc->global_variable_count;
+
+  agent = get_agent_info (ord);
+  if (!agent)
+    return -1;
+
+  /* We need the magic code for a native GCN ELF kernel, not something
+     else.  Check it before taking the lock or allocating anything so that
+     rejecting the image has no side effects.  */
+  if (strcmp (image_desc->gcn_image->magic, "GCN") != 0)
+    return -1;
+
+  if (pthread_rwlock_wrlock (&agent->module_rwlock))
+    {
+      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
+      return -1;
+    }
+  if (agent->prog_finalized
+      && !destroy_hsa_program (agent))
+    {
+      /* Do not leave the rwlock write-locked on the error path.  */
+      pthread_rwlock_unlock (&agent->module_rwlock);
+      return -1;
+    }
+
+  HSA_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
+  HSA_DEBUG ("Encountered %u global variables in an image\n", var_count);
+  /* Reserve one slot per kernel and per variable.  The special kernels
+     skipped below leave their slots unused, but this is always large enough
+     whether or not the image contains them.  */
+  table = GOMP_PLUGIN_malloc (((size_t) kernel_count + var_count)
+                              * sizeof (struct addr_pair));
+  pair = table;
+  *target_table = table;
+  module = (struct module_info *)
+    GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
+                                + kernel_count * sizeof (struct kernel_info));
+  module->image_desc = image_desc;
+  module->kernel_count = kernel_count;
+  module->heap = NULL;
+  module->constructors_run_p = false;
+
+  kernel = &module->kernels[0];
+
+  /* Set up the basic information of every kernel and publish the ordinary
+     ones in the table.  */
+  for (int i = 0; i < kernel_count; i++)
+    {
+      struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
+      if (!init_basic_kernel_info (kernel, d, agent, module))
+        {
+          /* Release what was allocated above and drop the lock.  */
+          *target_table = NULL;
+          free (table);
+          free (module);
+          pthread_rwlock_unlock (&agent->module_rwlock);
+          return -1;
+        }
+      if (strcmp (d->name, "_init_array") == 0)
+        module->init_array_func = kernel;
+      else if (strcmp (d->name, "_fini_array") == 0)
+        module->fini_array_func = kernel;
+      else
+        {
+          pair->start = (uintptr_t) kernel;
+          pair->end = (uintptr_t) (kernel + 1);
+          pair++;
+        }
+      kernel++;
+    }
+
+  agent->module = module;
+  if (pthread_rwlock_unlock (&agent->module_rwlock))
+    {
+      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
+      return -1;
+    }
+
+  if (!create_and_finalize_hsa_program (agent))
+    return -1;
+
+  for (unsigned i = 0; i < var_count; i++)
+    {
+      struct global_var_info *v = &image_desc->global_variables[i];
+      HSA_DEBUG ("Looking for variable %s\n", v->name);
+
+      hsa_status_t status;
+      hsa_executable_symbol_t var_symbol;
+      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
+                                                     v->name, agent->id,
+                                                     0, &var_symbol);
+
+      if (status != HSA_STATUS_SUCCESS)
+        hsa_fatal ("Could not find symbol for variable in the code object",
+                   status);
+
+      uint64_t var_addr;
+      uint32_t var_size;
+      status = hsa_fns.hsa_executable_symbol_get_info_fn
+        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &var_addr);
+      if (status != HSA_STATUS_SUCCESS)
+        hsa_fatal ("Could not extract a variable from its symbol", status);
+      status = hsa_fns.hsa_executable_symbol_get_info_fn
+        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &var_size);
+      if (status != HSA_STATUS_SUCCESS)
+        hsa_fatal ("Could not extract a variable size from its symbol", status);
+
+      pair->start = var_addr;
+      pair->end = var_addr + var_size;
+      HSA_DEBUG ("Found variable %s at %p with size %u\n", v->name,
+                 (void *)var_addr, var_size);
+      pair++;
+    }
+
+  /* Ensure that constructors are run first.  */
+  struct GOMP_kernel_launch_attributes kla =
+    { 3,
+      /* Grid size.  */
+      { 1, 64, 1 },
+      /* Work-group size.  */
+      { 1, 64, 1 }
+    };
+
+  if (module->init_array_func)
+    {
+      init_kernel (module->init_array_func);
+      run_kernel (module->init_array_func, NULL, &kla, NULL, false);
+    }
+  module->constructors_run_p = true;
+
+  /* Report only the entries actually written to the table: the special
+     constructor/destructor kernels are not part of it.  */
+  return (int) (pair - table);
+}
+
+/* Find the load offset for MODULE, save it to *LOAD_OFFSET, and return
+   true.  If it cannot be determined, return false.
+
+   The offset is computed from one symbol of AGENT's executable (the one
+   find_executable_symbol returns): its address as reported by the HSA
+   runtime minus the st_value of the symbol with the same name in the symbol
+   table of the ELF IMAGE.  SECTIONS is the image's section header table.
+   *LOAD_OFFSET may have been overwritten even when false is returned.  */
+
+static bool
+find_load_offset (Elf64_Addr *load_offset, struct agent_info *agent,
+                  struct module_info *module, Elf64_Ehdr *image,
+                  Elf64_Shdr *sections)
+{
+  bool res = false;
+
+  hsa_status_t status;
+
+  hsa_executable_symbol_t symbol;
+  if (!find_executable_symbol (agent->executable, &symbol))
+    return false;
+
+  status = hsa_fns.hsa_executable_symbol_get_info_fn
+    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, load_offset);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not extract symbol address", status);
+      return false;
+    }
+
+  char *symbol_name = get_executable_symbol_name (symbol);
+  if (symbol_name == NULL)
+    return false;
+
+  /* Find the kernel function in ELF, and calculate actual load offset.
+     Stop at the first match so the offset is adjusted exactly once.  */
+  for (int i = 0; i < image->e_shnum && !res; i++)
+    {
+      /* Skip anything that is not a symbol table, and tables with a zero
+         entry size, which could never be stepped through.  */
+      if (sections[i].sh_type != SHT_SYMTAB || sections[i].sh_entsize == 0)
+        continue;
+
+      Elf64_Shdr *strtab = &sections[sections[i].sh_link];
+      char *strings = (char *)image + strtab->sh_offset;
+
+      for (size_t offset = 0;
+           offset < sections[i].sh_size;
+           offset += sections[i].sh_entsize)
+        {
+          Elf64_Sym *sym = (Elf64_Sym*)((char*)image
+                                        + sections[i].sh_offset
+                                        + offset);
+          if (strcmp (symbol_name, strings + sym->st_name) == 0)
+            {
+              *load_offset -= sym->st_value;
+              res = true;
+              break;
+            }
+        }
+    }
+
+  free (symbol_name);
+  return res;
+}
+
+/* Create and finalize the program consisting of all loaded modules.
+
+   With AGENT->prog_mutex held this creates AGENT->executable, loads the
+   code object of AGENT's module into it with the relocation sections hidden
+   from the HSA loader, allocates the module heap if there is none yet,
+   freezes the executable, records the module's physical address range and
+   finally applies the hidden RELA relocations by hand with
+   hsa_memory_copy_fn.  Nothing is done if AGENT->prog_finalized is already
+   set.
+
+   Return true on success and false on failure.  NOTE(review):
+   AGENT->prog_finalized is also set on the failure path, because "fail"
+   jumps to "final"; confirm that this is intended.  */
+
+static bool
+create_and_finalize_hsa_program (struct agent_info *agent)
+{
+  hsa_status_t status;
+  int reloc_count = 0;
+  bool res = true;
+  if (pthread_mutex_lock (&agent->prog_mutex))
+    {
+      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
+      return false;
+    }
+  if (agent->prog_finalized)
+    goto final;
+
+  status
+    = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
+                                        HSA_EXECUTABLE_STATE_UNFROZEN,
+                                        "", &agent->executable);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not create GCN executable", status);
+      goto fail;
+    }
+
+  struct obstack unmodified_sections_os;
+  obstack_init (&unmodified_sections_os);
+
+  /* Load any GCN modules.  */
+  struct module_info *module = agent->module;
+  if (module)
+    {
+      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
+
+      /* Hide relocations from the HSA runtime loader.
+         Keep a copy of the unmodified section headers to use later.  */
+      Elf64_Shdr *image_sections = (Elf64_Shdr *)((char *)image
+                                                  + image->e_shoff);
+      size_t shdrs_size = sizeof (Elf64_Shdr) * image->e_shnum;
+      /* Allocate with GOMP_PLUGIN_malloc, like the other allocations in
+         this file, rather than an unchecked malloc.  */
+      Elf64_Shdr *sections = GOMP_PLUGIN_malloc (shdrs_size);
+      memcpy (sections, image_sections, shdrs_size);
+      for (int i = image->e_shnum - 1; i >= 0; i--)
+        {
+          if (image_sections[i].sh_type == SHT_RELA
+              || image_sections[i].sh_type == SHT_REL)
+            /* Change section type to something harmless.  */
+            image_sections[i].sh_type = SHT_NOTE;
+        }
+      obstack_ptr_grow (&unmodified_sections_os, sections);
+
+      hsa_code_object_t co = { 0 };
+      status = hsa_fns.hsa_code_object_deserialize_fn
+        (module->image_desc->gcn_image->image,
+         module->image_desc->gcn_image->size,
+         NULL, &co);
+      if (status != HSA_STATUS_SUCCESS)
+        {
+          hsa_error ("Could not deserialize GCN code object", status);
+          goto fail;
+        }
+
+      status = hsa_fns.hsa_executable_load_code_object_fn
+        (agent->executable, agent->id, co, "");
+      if (status != HSA_STATUS_SUCCESS)
+        {
+          hsa_error ("Could not load GCN code object", status);
+          goto fail;
+        }
+
+      if (!module->heap)
+        {
+          status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
+                                                   gcn_kernel_heap_size,
+                                                   (void**)&module->heap);
+          if (status != HSA_STATUS_SUCCESS)
+            {
+              hsa_error ("Could not allocate memory for GCN heap", status);
+              goto fail;
+            }
+
+          module->heap->size = gcn_kernel_heap_size;
+        }
+
+    }
+  Elf64_Shdr **unmodified_sections = obstack_finish (&unmodified_sections_os);
+
+  if (debug)
+    dump_executable_symbols (agent->executable);
+
+  status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not freeze the GCN executable", status);
+      goto fail;
+    }
+
+  int s = 0;
+  if (agent->module)
+    {
+      struct module_info *module = agent->module;
+      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
+      Elf64_Shdr *sections = unmodified_sections[s++];
+
+      Elf64_Addr load_offset;
+      if (!find_load_offset (&load_offset, agent, module, image, sections))
+        goto fail;
+
+      /* Record the physical load address range.
+         We need this for data copies later.  */
+      Elf64_Phdr *segments = (Elf64_Phdr *)((char*)image + image->e_phoff);
+      Elf64_Addr low = ~0, high = 0;
+      for (int i = 0; i < image->e_phnum; i++)
+        if (segments[i].p_memsz > 0)
+          {
+            /* Compare like with like: HIGH is the address of a segment's
+               last byte, so test it against this segment's last byte.  */
+            Elf64_Addr seg_end = (segments[i].p_paddr
+                                  + segments[i].p_memsz - 1);
+            if (segments[i].p_paddr < low)
+              low = segments[i].p_paddr;
+            if (seg_end > high)
+              high = seg_end;
+          }
+      module->phys_address_start = low + load_offset;
+      module->phys_address_end = high + load_offset;
+
+      /* Find dynamic symbol table.  */
+      Elf64_Shdr *dynsym = NULL;
+      for (int i = 0; i < image->e_shnum; i++)
+        if (sections[i].sh_type == SHT_DYNSYM)
+          {
+            dynsym = &sections[i];
+            break;
+          }
+
+      /* Fix up relocations.  */
+      for (int i = 0; i < image->e_shnum; i++)
+        {
+          if (sections[i].sh_type == SHT_RELA)
+            for (size_t offset = 0;
+                 offset < sections[i].sh_size;
+                 offset += sections[i].sh_entsize)
+              {
+                Elf64_Rela *reloc = (Elf64_Rela*)((char*)image
+                                                  + sections[i].sh_offset
+                                                  + offset);
+                Elf64_Sym *sym =
+                  (dynsym
+                   ? (Elf64_Sym*)((char*)image
+                                  + dynsym->sh_offset
+                                  + (dynsym->sh_entsize
+                                     * ELF64_R_SYM (reloc->r_info)))
+                   : NULL);
+
+                /* S: symbol value; P: place being relocated (as loaded);
+                   A: addend; B: load offset; V: value to write.  */
+                int64_t S = (sym ? sym->st_value : 0);
+                int64_t P = reloc->r_offset + load_offset;
+                int64_t A = reloc->r_addend;
+                int64_t B = load_offset;
+                int64_t V, size;
+                switch (ELF64_R_TYPE (reloc->r_info))
+                  {
+                  case R_AMDGPU_ABS32_LO:
+                    V = (S + A) & 0xFFFFFFFF;
+                    size = 4;
+                    break;
+                  case R_AMDGPU_ABS32_HI:
+                    V = (S + A) >> 32;
+                    size = 4;
+                    break;
+                  case R_AMDGPU_ABS64:
+                    V = S + A;
+                    size = 8;
+                    break;
+                  case R_AMDGPU_REL32:
+                    V = S + A - P;
+                    size = 4;
+                    break;
+                  case R_AMDGPU_REL64:
+                    /* FIXME
+                       LLD seems to emit REL64 where the assembler has
+                       ABS64.  This is clearly wrong because it's not what the
+                       compiler is expecting.  Let's assume, for now, that
+                       it's a bug.  In any case, GCN kernels are always self
+                       contained and therefore relative relocations will have
+                       been resolved already, so this should be a safe
+                       workaround.  */
+                    V = S + A/* - P*/;
+                    size = 8;
+                    break;
+                  case R_AMDGPU_ABS32:
+                    V = S + A;
+                    size = 4;
+                    break;
+                  /* TODO R_AMDGPU_GOTPCREL */
+                  /* TODO R_AMDGPU_GOTPCREL32_LO */
+                  /* TODO R_AMDGPU_GOTPCREL32_HI */
+                  case R_AMDGPU_REL32_LO:
+                    V = (S + A - P) & 0xFFFFFFFF;
+                    size = 4;
+                    break;
+                  case R_AMDGPU_REL32_HI:
+                    V = (S + A - P) >> 32;
+                    size = 4;
+                    break;
+                  case R_AMDGPU_RELATIVE64:
+                    V = B + A;
+                    size = 8;
+                    break;
+                  default:
+                    fprintf (stderr, "Error: unsupported relocation type.\n");
+                    exit (1);
+                  }
+                status = hsa_fns.hsa_memory_copy_fn ((void*)P, &V, size);
+                if (status != HSA_STATUS_SUCCESS)
+                  {
+                    hsa_error ("Failed to fix up relocation", status);
+                    goto fail;
+                  }
+                reloc_count++;
+              }
+        }
+
+      free (sections);
+    }
+  obstack_free (&unmodified_sections_os, NULL);
+
+  HSA_DEBUG ("Loaded GCN kernels to device %d (%d relocations)\n",
+             agent->device_id, reloc_count);
+
+final:
+  agent->prog_finalized = true;
+
+  if (pthread_mutex_unlock (&agent->prog_mutex))
+    {
+      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
+      res = false;
+    }
+
+  return res;
+
+fail:
+  res = false;
+  goto final;
+}
+
+/* Build the dispatch record ("shadow") used to launch KERNEL.
+
+   The record is zero-allocated, given OMP_DATA_SIZE bytes of OMP data
+   memory (none when OMP_DATA_SIZE is 0), a fresh completion signal, and a
+   kernel-argument block allocated from the agent's kernarg region.  Kernels
+   with dependencies are not supported and end in GOMP_PLUGIN_fatal, as do
+   failures to create the signal or to allocate the argument block.  */
+
+static struct GOMP_hsa_kernel_dispatch *
+create_single_kernel_dispatch (struct kernel_info *kernel,
+                               unsigned omp_data_size)
+{
+  struct agent_info *agent = kernel->agent;
+  struct GOMP_hsa_kernel_dispatch *shadow
+    = GOMP_PLUGIN_malloc_cleared (sizeof (struct GOMP_hsa_kernel_dispatch));
+
+  if (omp_data_size > 0)
+    shadow->omp_data_memory = GOMP_PLUGIN_malloc (omp_data_size);
+  else
+    shadow->omp_data_memory = NULL;
+
+  if (kernel->dependencies_count != 0)
+    GOMP_PLUGIN_fatal ("kernel->dependencies_count != 0");
+  shadow->kernel_dispatch_count = 0;
+
+  shadow->object = kernel->object;
+
+  hsa_signal_t completion;
+  hsa_status_t rc = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion);
+  if (rc != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Error creating the GCN sync signal", rc);
+
+  shadow->signal = completion.handle;
+  shadow->private_segment_size = kernel->private_segment_size;
+  shadow->group_segment_size = kernel->group_segment_size;
+
+  /* Ensure that there is space for the gomp_print data.
+     See also gcn-run.c, in GCC.  A kernel asking for at most 8 bytes of
+     kernel arguments gets a full struct kernargs instead.  */
+  bool with_print_data = (kernel->kernarg_segment_size <= 8);
+  size_t kernarg_bytes = (with_print_data
+                          ? sizeof (struct kernargs)
+                          : kernel->kernarg_segment_size);
+
+  rc = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region, kernarg_bytes,
+                                       &shadow->kernarg_address);
+  if (rc != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not allocate memory for GCN kernel arguments", rc);
+
+  struct kernargs *kernargs = shadow->kernarg_address;
+  if (with_print_data)
+    {
+      /* Zero-initialize the output_data (minimum needed).  */
+      const unsigned slots = (sizeof (kernargs->output_data.queue)
+                              / sizeof (kernargs->output_data.queue[0]));
+
+      kernargs->out_ptr = (int64_t)&kernargs->output_data;
+      kernargs->output_data.next_output = 0;
+      for (unsigned slot = 0; slot < slots; slot++)
+        kernargs->output_data.queue[slot].written = 0;
+      kernargs->output_data.consumed = 0;
+
+      /* Pass in the heap location.  */
+      kernargs->heap_ptr = (int64_t)kernel->module->heap;
+    }
+
+  /* NOTE(review): this store happens even when only the kernel's own
+     kernarg_segment_size bytes were allocated above; confirm that
+     return_value always lies inside the allocation.  */
+  kernargs->output_data.return_value = 0xcafe0000;
+
+  return shadow;
+}
+
+/* Output any data written by gomp_print_*.
+ Only enabled when the requested kernarg_segment_size would not
+ overwrite the gomp_print data (i.e. KERNEL->kernarg_segment_size <= 8).
+ We print the entries of KERNARGS->output_data.queue from the "consumed"
+ index up to "next_output". Unless FINAL is set, printing stops at the first
+ entry without a "written" flag. The "consumed" index is advanced after
+ each entry printed, so a later call resumes from the same point.
+ If "consumed" is found to be ahead of "next_output" this is treated as a
+ buffer overflow: nothing is printed, except for a diagnostic when FINAL is
+ set. */
+static void
+gomp_print_output (struct kernel_info *kernel, struct kernargs *kernargs,
+ bool final)
+{
+ if (kernel->kernarg_segment_size <= 8)
+ {
+ /* Number of slots in the queue; indices below are taken modulo this. */
+ unsigned int limit = (sizeof (kernargs->output_data.queue)
+ / sizeof (kernargs->output_data.queue[0]));
+
+ unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
+ __ATOMIC_ACQUIRE);
+ unsigned int to = kernargs->output_data.next_output;
+
+ if (from > to)
+ {
+ /* Overflow. */
+ if (final)
+ printf ("GCN print buffer overflowed.\n");
+ return;
+ }
+
+ unsigned int i;
+ for (i = from; i < to; i++)
+ {
+ struct printf_data *data = &kernargs->output_data.queue[i%limit];
+
+ /* An entry not yet marked as written ends this pass, unless this is
+ the final call, in which case it is printed regardless. */
+ if (!data->written && !final)
+ break;
+
+ /* The entry type selects which value field accompanies the message;
+ type 3 is the only form printed without a trailing newline. */
+ switch (data->type)
+ {
+ case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
+ case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
+ case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
+ case 3: printf ("%.128s%.128s", data->msg, data->text); break;
+ default: printf ("GCN print buffer error!\n"); break;
+ }
+ /* Clear the flag first, then publish the new "consumed" index with
+ release ordering. */
+ data->written = 0;
+ __atomic_store_n (&kernargs->output_data.consumed, i+1,
+ __ATOMIC_RELEASE);
+ }
+ fflush (stdout);
+ }
+}
+
+/* Free everything owned by the dispatch record SHADOW: its kernel-argument
+   block, its signal, its OMP data memory and finally SHADOW itself.  */
+
+static void
+release_kernel_dispatch (struct GOMP_hsa_kernel_dispatch *shadow)
+{
+  HSA_DEBUG ("Released kernel dispatch: %p has value: %lu (%p)\n", shadow,
+             shadow->debug, (void *) shadow->debug);
+
+  hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
+
+  /* Rebuild the signal object from the handle stored in the record.  */
+  hsa_signal_t sig = { .handle = shadow->signal };
+  hsa_fns.hsa_signal_destroy_fn (sig);
+
+  free (shadow->omp_data_memory);
+  free (shadow);
+}
+
+/* Look up KERNEL's symbol in its agent's executable and cache the kernel
+   object handle together with the kernarg, group and private segment sizes
+   in KERNEL.  A missing symbol is reported and recorded in
+   KERNEL->initialization_failed; a failure of any later query goes through
+   hsa_fatal.  *MAX_OMP_DATA_SIZE is raised to KERNEL->omp_data_size when
+   that is larger.  */
+
+static void
+init_single_kernel (struct kernel_info *kernel, unsigned *max_omp_data_size)
+{
+  struct agent_info *agent = kernel->agent;
+  hsa_executable_symbol_t kernel_symbol;
+  hsa_status_t status
+    = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
+                                            kernel->name, agent->id, 0,
+                                            &kernel_symbol);
+
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      /* Not fatal: the caller inspects initialization_failed.  */
+      hsa_warn ("Could not find symbol for kernel in the code object", status);
+      fprintf (stderr, "not found name: '%s'\n", kernel->name);
+      dump_executable_symbols (agent->executable);
+      kernel->initialization_failed = true;
+      return;
+    }
+  HSA_DEBUG ("Located kernel %s\n", kernel->name);
+
+  status = hsa_fns.hsa_executable_symbol_get_info_fn
+    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
+     &kernel->object);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not extract a kernel object from its symbol", status);
+
+  status = hsa_fns.hsa_executable_symbol_get_info_fn
+    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
+     &kernel->kernarg_segment_size);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not get info about kernel argument size", status);
+
+  status = hsa_fns.hsa_executable_symbol_get_info_fn
+    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
+     &kernel->group_segment_size);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not get info about kernel group segment size", status);
+
+  status = hsa_fns.hsa_executable_symbol_get_info_fn
+    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
+     &kernel->private_segment_size);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not get info about kernel private segment size",
+               status);
+
+  HSA_DEBUG ("Kernel structure for %s fully initialized with "
+             "following segment sizes: \n", kernel->name);
+  HSA_DEBUG (" group_segment_size: %u\n",
+             (unsigned) kernel->group_segment_size);
+  HSA_DEBUG (" private_segment_size: %u\n",
+             (unsigned) kernel->private_segment_size);
+  HSA_DEBUG (" kernarg_segment_size: %u\n",
+             (unsigned) kernel->kernarg_segment_size);
+  HSA_DEBUG (" omp_data_size: %u\n", kernel->omp_data_size);
+  HSA_DEBUG (" gridified_kernel_p: %u\n", kernel->gridified_kernel_p);
+
+  if (*max_omp_data_size < kernel->omp_data_size)
+    *max_omp_data_size = kernel->omp_data_size;
+}
+
+/* Indent stream F by INDENT spaces.  */
+
+static void
+indent_stream (FILE *f, unsigned indent)
+{
+  /* A '*' field width consumes an argument of type int, so convert
+     explicitly instead of passing an unsigned through the varargs.  */
+  fprintf (f, "%*s", (int) indent, "");
+}
+
+/* Write a human-readable dump of DISPATCH to stderr, one field per line,
+   each line preceded by INDENT spaces, followed by one empty line.  */
+
+static void
+print_kernel_dispatch (struct GOMP_hsa_kernel_dispatch *dispatch,
+                       unsigned indent)
+{
+  FILE *out = stderr;
+
+  indent_stream (out, indent);
+  fprintf (out, "this: %p\n", dispatch);
+
+  indent_stream (out, indent);
+  fprintf (out, "queue: %p\n", dispatch->queue);
+
+  indent_stream (out, indent);
+  fprintf (out, "omp_data_memory: %p\n", dispatch->omp_data_memory);
+
+  indent_stream (out, indent);
+  fprintf (out, "kernarg_address: %p\n", dispatch->kernarg_address);
+
+  indent_stream (out, indent);
+  fprintf (out, "object: %lu\n", dispatch->object);
+
+  indent_stream (out, indent);
+  fprintf (out, "signal: %lu\n", dispatch->signal);
+
+  indent_stream (out, indent);
+  fprintf (out, "private_segment_size: %u\n", dispatch->private_segment_size);
+
+  indent_stream (out, indent);
+  fprintf (out, "group_segment_size: %u\n", dispatch->group_segment_size);
+
+  indent_stream (out, indent);
+  fprintf (out, "children dispatches: %lu\n",
+           dispatch->kernel_dispatch_count);
+
+  indent_stream (out, indent);
+  fprintf (out, "omp_num_threads: %u\n", dispatch->omp_num_threads);
+
+  fprintf (out, "\n");
+}
+
+/* Obtain a dispatch record for KERNEL from create_single_kernel_dispatch,
+   passing OMP_DATA_SIZE through, and fill in the launch defaults:
+   64 OMP threads, debug word cleared, and OMP level 1 for a gridified
+   kernel or 0 otherwise.  */
+
+static struct GOMP_hsa_kernel_dispatch *
+create_kernel_dispatch (struct kernel_info *kernel, unsigned omp_data_size)
+{
+  struct GOMP_hsa_kernel_dispatch *dispatch
+    = create_single_kernel_dispatch (kernel, omp_data_size);
+
+  dispatch->omp_num_threads = 64;
+  dispatch->debug = 0;
+  if (kernel->gridified_kernel_p)
+    dispatch->omp_level = 1;
+  else
+    dispatch->omp_level = 0;
+
+  return dispatch;
+}
+
+/* Perform the one-time setup KERNEL needs before its first launch, while
+   holding KERNEL->init_mutex.  The program is assumed to have been created,
+   finalized and frozen by create_and_finalize_hsa_program already.
+   KERNEL->initialized is only set when init_single_kernel did not flag a
+   failure.  */
+
+static void
+init_kernel (struct kernel_info *kernel)
+{
+  if (pthread_mutex_lock (&kernel->init_mutex))
+    GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
+
+  if (!kernel->initialized)
+    {
+      /* This also records the maximum OMP data size the kernel needs for a
+         dispatch operation.  */
+      init_single_kernel (kernel, &kernel->max_omp_data_size);
+
+      if (!kernel->initialization_failed)
+        {
+          HSA_DEBUG ("\n");
+          kernel->initialized = true;
+        }
+    }
+
+  if (pthread_mutex_unlock (&kernel->init_mutex))
+    GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
+                       "mutex");
+}
+
+/* Clamp THREADS, the requested number of OMP threads / OACC workers, to the
+   largest grid size used here (16) and return the result.  */
+
+static int
+limit_worker_threads (int threads)
+{
+  /* FIXME Do something more intelligent here.
+     GCN can always run 4 threads within a Compute Unit, but
+     more than that depends on register usage.  */
+  return (threads > 16) ? 16 : threads;
+}
+
+/* Parse the target attributes INPUT provided by the compiler and return true
+ if we should run anything at all. INPUT must not be NULL (that is fatal);
+ it is walked as a NULL-terminated list of argument identifiers, each
+ optionally followed by a separate value.
+
+ Three outcomes are possible:
+ - a GCN-specific kernel-attributes entry was seen: *RESULT is set to the
+ element INPUT points at when the walk stops, and false is returned if
+ any of its grid dimensions is zero;
+ - otherwise, if at least one entry matched the "all devices" test: DEF is
+ filled with a three-dimensional launch derived from the requested team
+ and thread counts and stored in *RESULT;
+ - otherwise DEF is filled with a 1x1x1 launch and stored in *RESULT.
+
+ AGENT is only consulted for the gfx900 thread-count workaround. */
+
+static bool
+parse_target_attributes (void **input,
+ struct GOMP_kernel_launch_attributes *def,
+ struct GOMP_kernel_launch_attributes **result,
+ struct agent_info *agent)
+{
+ if (!input)
+ GOMP_PLUGIN_fatal ("No target arguments provided");
+
+ bool grid_attrs_found = false;
+ bool gcn_dims_found = false;
+ int gcn_teams = 0;
+ int gcn_threads = 0;
+ while (*input)
+ {
+ intptr_t id = (intptr_t) *input++, val;
+
+ /* The value is either the following element or packed into ID. */
+ if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
+ val = (intptr_t) *input++;
+ else
+ val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
+
+ /* VAL is later stored in an int. */
+ val = (val > INT_MAX) ? INT_MAX : val;
+
+ if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
+ && ((id & GOMP_TARGET_ARG_ID_MASK)
+ == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
+ {
+ /* NOTE(review): if this entry carries
+ GOMP_TARGET_ARG_SUBSEQUENT_PARAM, INPUT has already been advanced
+ past its parameter above; confirm that *INPUT after the loop is
+ the intended attributes pointer. */
+ grid_attrs_found = true;
+ break;
+ }
+ else if ((id & GOMP_TARGET_ARG_DEVICE_ALL) == GOMP_TARGET_ARG_DEVICE_ALL)
+ {
+ gcn_dims_found = true;
+ switch (id & GOMP_TARGET_ARG_ID_MASK)
+ {
+ case GOMP_TARGET_ARG_NUM_TEAMS:
+ gcn_teams = val;
+ break;
+ case GOMP_TARGET_ARG_THREAD_LIMIT:
+ gcn_threads = limit_worker_threads (val);
+ break;
+ default:
+ ;
+ }
+ }
+ }
+
+ if (gcn_dims_found)
+ {
+ /* Only applies when no thread count was requested and the Z dimension
+ has not been overridden. */
+ if (agent->gfx900_p && gcn_threads == 0 && override_z_dim == 0)
+ {
+ gcn_threads = 4;
+ HSA_DEBUG ("VEGA BUG WORKAROUND: reducing default number of "
+ "threads to 4 per team.\n");
+ HSA_DEBUG (" - If this is not a Vega 10 device, please use "
+ "GCN_NUM_THREADS=16\n");
+ }
+
+ def->ndim = 3;
+ /* Fiji has 64 CUs. */
+ def->gdims[0] = (gcn_teams > 0) ? gcn_teams : 64;
+ /* Each thread is 64 work items wide. */
+ def->gdims[1] = 64;
+ /* A work group can have 16 wavefronts. */
+ def->gdims[2] = (gcn_threads > 0) ? gcn_threads : 16;
+ def->wdims[0] = 1; /* Single team per work-group. */
+ def->wdims[1] = 64;
+ def->wdims[2] = 16;
+ *result = def;
+ return true;
+ }
+ else if (!grid_attrs_found)
+ {
+ /* Nothing usable was supplied: launch a single work-item. */
+ def->ndim = 1;
+ def->gdims[0] = 1;
+ def->gdims[1] = 1;
+ def->gdims[2] = 1;
+ def->wdims[0] = 1;
+ def->wdims[1] = 1;
+ def->wdims[2] = 1;
+ *result = def;
+ HSA_DEBUG ("GOMP_OFFLOAD_run called with no launch attributes\n");
+ return true;
+ }
+
+ /* Explicit launch attributes: use them in place, after validation. */
+ struct GOMP_kernel_launch_attributes *kla;
+ kla = (struct GOMP_kernel_launch_attributes *) *input;
+ *result = kla;
+ if (kla->ndim == 0 || kla->ndim > 3)
+ GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);
+
+ HSA_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
+ unsigned i;
+ for (i = 0; i < kla->ndim; i++)
+ {
+ HSA_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
+ kla->gdims[i], kla->wdims[i]);
+ /* An empty grid means there is nothing to run. */
+ if (kla->gdims[i] == 0)
+ return false;
+ }
+ return true;
+}
+
+/* Choose the work-group size for one dimension.  GROUP is the requested
+   size (0 selects a default that depends on the number of grid dimensions
+   NDIM); the result never exceeds the grid size GRID.  */
+
+static uint32_t
+get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
+{
+  uint32_t size = group;
+
+  if (size == 0)
+    /* TODO: Provide a default via environment or device characteristics.  */
+    switch (ndim)
+      {
+      case 1:
+        size = 64;
+        break;
+      case 2:
+        size = 8;
+        break;
+      default:
+        size = 4;
+        break;
+      }
+
+  return (size > grid) ? grid : size;
+}
+
+/* Return true if the kernel FN_PTR (a struct kernel_info) could be
+   initialized and can therefore be launched.  On failure this is fatal when
+   host fallback has been suppressed; otherwise false is returned so that
+   the caller falls back to the host.  */
+
+bool
+GOMP_OFFLOAD_can_run (void *fn_ptr)
+{
+  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
+
+  init_kernel (kernel);
+  if (!kernel->initialization_failed)
+    return true;
+
+  if (suppress_host_fallback)
+    GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
+  HSA_DEBUG ("GCN target cannot be launched, doing a host fallback\n");
+  return false;
+}
+
+/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET,
+   with release ordering.  HEADER lands in the low 16 bits of the 32-bit
+   word and REST in the high 16 bits.  */
+
+void
+packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
+{
+  /* Widen before shifting: REST would otherwise be promoted to (signed)
+     int, and shifting a value with bit 15 set into the sign bit is
+     undefined.  */
+  uint32_t word = (uint32_t) header | ((uint32_t) rest << 16);
+
+  __atomic_store_n (packet, word, __ATOMIC_RELEASE);
+}
+
+/* Run KERNEL on its agent, pass VARS to it as arguments and take
+ launch attributes from KLA. MODULE_LOCKED indicates that the caller
+ already holds the lock and run_kernel need not lock it again.
+ If AQ is NULL then agent->sync_queue will be used.
+
+ The call is synchronous: it reserves a slot in the command queue, fills
+ in a dispatch packet, rings the doorbell and then waits on the completion
+ signal, forwarding device print output while it waits. Afterwards the
+ kernel's return value is decoded; depending on it the host process may
+ be terminated through exit or abort. */
+
+static void
+run_kernel (struct kernel_info *kernel, void *vars,
+ struct GOMP_kernel_launch_attributes *kla,
+ struct goacc_asyncqueue *aq, bool module_locked)
+{
+ HSA_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
+ (aq ? aq->id : 0));
+ HSA_DEBUG ("GCN launch attribs: gdims:[");
+ int i;
+ for (i = 0; i < kla->ndim; ++i)
+ {
+ if (i)
+ HSA_DPRINT (", ");
+ HSA_DPRINT ("%u", kla->gdims[i]);
+ }
+ HSA_DPRINT ("], normalized gdims:[");
+ /* NOTE(review): this divides by kla->wdims[i]; confirm it cannot be zero
+ on this path (get_group_size treats a zero group size as "default"). */
+ for (i = 0; i < kla->ndim; ++i)
+ {
+ if (i)
+ HSA_DPRINT (", ");
+ HSA_DPRINT ("%u", kla->gdims[i] / kla->wdims[i]);
+ }
+ HSA_DPRINT ("], wdims:[");
+ for (i = 0; i < kla->ndim; ++i)
+ {
+ if (i)
+ HSA_DPRINT (", ");
+ HSA_DPRINT ("%u", kla->wdims[i]);
+ }
+ HSA_DPRINT ("]\n");
+ HSA_FLUSH ();
+
+ struct agent_info *agent = kernel->agent;
+ if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
+ GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
+
+ if (!agent->initialized)
+ GOMP_PLUGIN_fatal ("Agent must be initialized");
+
+ if (!kernel->initialized)
+ GOMP_PLUGIN_fatal ("Called kernel must be initialized");
+
+ struct GOMP_hsa_kernel_dispatch *shadow
+ = create_kernel_dispatch (kernel, kernel->max_omp_data_size);
+
+ hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
+ shadow->queue = command_q;
+
+ if (debug)
+ {
+ fprintf (stderr, "\nKernel has following dependencies:\n");
+ print_kernel_dispatch (shadow, 2);
+ }
+
+ /* Reserve one packet slot in the queue. */
+ uint64_t index
+ = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
+ HSA_DEBUG ("Got AQL index %llu\n", (long long int) index);
+
+ /* Wait until the queue is not full before writing the packet. */
+ while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
+ >= command_q->size)
+ ;
+
+ /* Do not allow the dimensions to be overridden when running
+ constructors or destructors. */
+ struct module_info *module = kernel->module;
+ bool init_fini_p = kernel == module->init_array_func
+ || kernel == module->fini_array_func;
+ int override_x = init_fini_p ? 0 : override_x_dim;
+ int override_z = init_fini_p ? 0 : override_z_dim;
+
+ hsa_kernel_dispatch_packet_t *packet;
+ packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
+ + index % command_q->size;
+
+ /* Clear everything except the first four bytes, which are written last
+ by packet_store_release below. */
+ memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
+ packet->grid_size_x = override_x ? : kla->gdims[0];
+ packet->workgroup_size_x = get_group_size (kla->ndim,
+ packet->grid_size_x,
+ kla->wdims[0]);
+
+ if (kla->ndim >= 2)
+ {
+ packet->grid_size_y = kla->gdims[1];
+ packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
+ kla->wdims[1]);
+ }
+ else
+ {
+ packet->grid_size_y = 1;
+ packet->workgroup_size_y = 1;
+ }
+
+ if (kla->ndim == 3)
+ {
+ packet->grid_size_z = limit_worker_threads (override_z
+ ? : kla->gdims[2]);
+ packet->workgroup_size_z = get_group_size (kla->ndim,
+ packet->grid_size_z,
+ kla->wdims[2]);
+ }
+ else
+ {
+ packet->grid_size_z = 1;
+ packet->workgroup_size_z = 1;
+ }
+
+ HSA_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
+ " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
+ packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
+ packet->grid_size_x / packet->workgroup_size_x,
+ packet->grid_size_y / packet->workgroup_size_y,
+ packet->grid_size_z / packet->workgroup_size_z,
+ packet->workgroup_size_x, packet->workgroup_size_y,
+ packet->workgroup_size_z);
+
+ packet->private_segment_size = kernel->private_segment_size;
+ packet->group_segment_size = kernel->group_segment_size;
+ packet->kernel_object = kernel->object;
+ packet->kernarg_address = shadow->kernarg_address;
+ hsa_signal_t s;
+ s.handle = shadow->signal;
+ packet->completion_signal = s;
+ /* Arm the completion signal; the wait below is for it to drop below 1. */
+ hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
+ /* The first kernel argument is the VARS pointer value itself. */
+ memcpy (shadow->kernarg_address, &vars, sizeof (vars));
+
+ /* PR hsa/70337. */
+ size_t vars_size = sizeof (vars);
+ if (kernel->kernarg_segment_size > vars_size)
+ {
+ if (kernel->kernarg_segment_size != vars_size
+ + sizeof (struct hsa_kernel_runtime *))
+ GOMP_PLUGIN_fatal ("Kernel segment size has an unexpected value");
+ memcpy (packet->kernarg_address + vars_size, &shadow,
+ sizeof (struct hsa_kernel_runtime *));
+ }
+
+ HSA_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
+
+ uint16_t header;
+ header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
+ header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
+ header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
+
+ HSA_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
+ agent->device_id);
+
+ /* Publish the packet: header and setup word are stored last, with release
+ ordering, after the rest of the packet has been filled in. */
+ packet_store_release ((uint32_t *) packet, header,
+ (uint16_t) kla->ndim
+ << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
+
+ hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
+ index);
+
+ HSA_DEBUG ("Kernel dispatched, waiting for completion\n");
+
+ /* Root signal waits with 1ms timeout. */
+ while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
+ 1000 * 1000,
+ HSA_WAIT_STATE_BLOCKED) != 0)
+ {
+ gomp_print_output (kernel, shadow->kernarg_address, false);
+ }
+ gomp_print_output (kernel, shadow->kernarg_address, true);
+
+ /* Read the return value before the kernarg buffer is released. */
+ struct kernargs *kernargs = shadow->kernarg_address;
+ unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
+
+ release_kernel_dispatch (shadow);
+
+ if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
+ GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
+
+ /* The upper 16 bits of the return value are a marker; the lower 16 bits
+ carry a signal number (bits 8-15) and an exit status (bits 0-7). */
+ unsigned int upper = (return_value & ~0xffff) >> 16;
+ if (upper == 0xcafe)
+ ; // exit not called, normal termination.
+ else if (upper == 0xffff)
+ ; // exit called.
+ else
+ {
+ GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
+ " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
+ return_value);
+ abort ();
+ }
+
+ if (upper == 0xffff)
+ {
+ unsigned int signal = (return_value >> 8) & 0xff;
+
+ if (signal == SIGABRT)
+ {
+ HSA_DEBUG ("GCN Kernel aborted\n");
+ abort ();
+ }
+ else if (signal != 0)
+ {
+ HSA_DEBUG ("GCN Kernel received unknown signal\n");
+ abort ();
+ }
+
+ HSA_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
+ exit (return_value & 0xff);
+ }
+}
+
+/* Part of the libgomp plugin interface.  Synchronously run the kernel
+   described by FN_PTR (a struct kernel_info), passing it the array of
+   pointers VARS.  DEVICE must be the device the kernel belongs to; this is
+   asserted.  ARGS holds the target arguments from which the launch
+   geometry is derived; nothing is run when that geometry is empty.  */
+
+void
+GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
+{
+  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
+  struct agent_info *agent = get_agent_info (device);
+  struct GOMP_kernel_launch_attributes def;
+  struct GOMP_kernel_launch_attributes *kla;
+
+  assert (agent == kernel->agent);
+
+  if (parse_target_attributes (args, &def, &kla, agent))
+    {
+      run_kernel (kernel, vars, kla, NULL, false);
+    }
+  else
+    {
+      HSA_DEBUG ("Will not run GCN kernel because the grid size is zero\n");
+    }
+}
+
+/* Create AGENT's OpenMP async queue on first use.  There will be only one.
+   FIXME: is this thread-safe if two threads call this function?  */
+static void
+maybe_init_omp_async (struct agent_info *agent)
+{
+  if (agent->omp_async_queue)
+    return;
+
+  agent->omp_async_queue
+    = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
+}
+
+/* Part of the libgomp plugin interface.  Like GOMP_OFFLOAD_run, but the
+   launch of TGT_FN with TGT_VARS is pushed onto the agent's OpenMP async
+   queue, followed by a callback that invokes
+   GOMP_PLUGIN_target_task_completion with ASYNC_DATA.  Nothing is queued
+   when the launch geometry derived from ARGS is empty.  */
+
+void
+GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
+                        void **args, void *async_data)
+{
+  HSA_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
+
+  struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
+  struct agent_info *agent = get_agent_info (device);
+  struct GOMP_kernel_launch_attributes def;
+  struct GOMP_kernel_launch_attributes *kla;
+
+  assert (agent == kernel->agent);
+
+  bool runnable = parse_target_attributes (args, &def, &kla, agent);
+  if (!runnable)
+    {
+      HSA_DEBUG ("Will not run GCN kernel because the grid size is zero\n");
+      return;
+    }
+
+  maybe_init_omp_async (agent);
+  queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
+  queue_push_callback (agent->omp_async_queue,
+                       GOMP_PLUGIN_target_task_completion, async_data);
+}
+
+/* Shut down MODULE: run its destructor kernel, if it has one, mark its
+   constructors as not run, and destroy the initialization mutex of every
+   kernel in it.  LOCKED is forwarded to run_kernel as its MODULE_LOCKED
+   argument.  Return TRUE on success.  */
+
+static bool
+destroy_module (struct module_info *module, bool locked)
+{
+  /* Destructors run on a 1x64x1 grid made of a single 1x64x1 work-group.  */
+  struct GOMP_kernel_launch_attributes fini_kla =
+    { 3,
+      /* Grid size.  */
+      { 1, 64, 1 },
+      /* Work-group size.  */
+      { 1, 64, 1 }
+    };
+
+  /* Run destructors before destroying module.  */
+  if (module->fini_array_func)
+    {
+      init_kernel (module->fini_array_func);
+      run_kernel (module->fini_array_func, NULL, &fini_kla, NULL, locked);
+    }
+  module->constructors_run_p = false;
+
+  int k = 0;
+  while (k < module->kernel_count)
+    {
+      if (pthread_mutex_destroy (&module->kernels[k].init_mutex))
+        {
+          GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
+                             "mutex");
+          return false;
+        }
+      k++;
+    }
+
+  return true;
+}
+
+/* Part of the libgomp plugin interface.  Unload GCN object-code module
+   described by struct gcn_image_desc in TARGET_DATA from agent number N.
+   VERSION is the version word of the offload data; its device part must
+   not be newer than GOMP_VERSION_GCN.  Return TRUE on success.  The
+   agent's module rwlock is held for writing while the module is torn down
+   and is released again on every path, including failures.  */
+
+bool
+GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
+{
+  /* Check against the GCN version, which is also what the message
+     reports as expected.  */
+  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_GCN)
+    {
+      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
+                         " (expected %u, received %u)",
+                         GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
+      return false;
+    }
+
+  struct agent_info *agent;
+  agent = get_agent_info (n);
+  if (!agent)
+    return false;
+
+  if (pthread_rwlock_wrlock (&agent->module_rwlock))
+    {
+      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
+      return false;
+    }
+
+  bool ok = false;
+
+  if (!agent->module || agent->module->image_desc != target_data)
+    {
+      GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
+                         "loaded before");
+      goto unlock;
+    }
+
+  /* We already hold the module lock, so tell destroy_module not to have
+     run_kernel take it again.  */
+  if (!destroy_module (agent->module, true))
+    goto unlock;
+  free (agent->module);
+  agent->module = NULL;
+  if (!destroy_hsa_program (agent))
+    goto unlock;
+
+  ok = true;
+
+unlock:
+  /* Never leave with the write lock held, or every later user of this
+     agent would block.  */
+  if (pthread_rwlock_unlock (&agent->module_rwlock))
+    {
+      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
+      return false;
+    }
+  return ok;
+}
+
+/* Part of the libgomp plugin interface.  Release everything associated
+   with agent number N: its OpenMP async queue, its loaded module, its HSA
+   program, its synchronous command queue and its locks.  We do not attempt
+   any synchronization, assuming the user and libgomp will not attempt
+   deinitialization of a device that is in any way being used at the same
+   time.  An agent that was never initialized is left alone.  Return TRUE
+   on success.  */
+
+bool
+GOMP_OFFLOAD_fini_device (int n)
+{
+  struct agent_info *agent = get_agent_info (n);
+
+  if (agent == NULL)
+    return false;
+  if (!agent->initialized)
+    return true;
+
+  if (agent->omp_async_queue != NULL)
+    {
+      GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
+      agent->omp_async_queue = NULL;
+    }
+
+  if (agent->module != NULL)
+    {
+      if (!destroy_module (agent->module, false))
+        return false;
+      free (agent->module);
+      agent->module = NULL;
+    }
+
+  if (!destroy_hsa_program (agent))
+    return false;
+
+  /*release_agent_shared_libraries (agent);*/
+
+  hsa_status_t queue_status
+    = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
+  if (queue_status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Error destroying command queue", queue_status);
+
+  if (pthread_mutex_destroy (&agent->prog_mutex) != 0)
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
+      return false;
+    }
+
+  if (pthread_rwlock_destroy (&agent->module_rwlock) != 0)
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
+      return false;
+    }
+
+  if (pthread_mutex_destroy (&agent->async_queues_mutex) != 0)
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
+      return false;
+    }
+
+  agent->initialized = false;
+  return true;
+}
+
+/* Allocate SIZE bytes from AGENT's kernarg region and return the address,
+   or NULL (after reporting the HSA error) if the allocation fails.  */
+
+static void *
+GOMP_OFFLOAD_alloc_by_agent (struct agent_info *agent, size_t size)
+{
+  void *ptr;
+  hsa_status_t status;
+
+  HSA_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
+
+  /* Zero-size allocations are invalid, so in order to return a valid pointer
+     we need to pass a valid size.  One source of zero-size allocations is
+     kernargs for kernels that have no inputs or outputs (the kernel may
+     only use gomp_print, for example).  */
+  size_t bytes = (size == 0) ? 4 : size;
+
+  status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region, bytes,
+                                           &ptr);
+  if (status == HSA_STATUS_SUCCESS)
+    return ptr;
+
+  hsa_error ("Could not allocate device memory", status);
+  return NULL;
+}
+
+/* Allocate SIZE bytes on device number N.  Return the device address, or
+   NULL if N does not name a usable agent or the allocation fails.  */
+
+void *
+GOMP_OFFLOAD_alloc (int n, size_t size)
+{
+  struct agent_info *agent = get_agent_info (n);
+
+  /* get_agent_info can fail (other entry points check for this too);
+     do not hand a NULL agent to the allocator, which dereferences it.  */
+  if (!agent)
+    return NULL;
+
+  return GOMP_OFFLOAD_alloc_by_agent (agent, size);
+}
+
+/* Return the device memory at PTR to the HSA runtime.  DEVICE is only used
+   for the debug message.  Return true on success; on failure the HSA error
+   is reported and false is returned.  */
+
+bool
+GOMP_OFFLOAD_free (int device, void *ptr)
+{
+  HSA_DEBUG ("Freeing memory on device %d\n", device);
+
+  hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
+  if (status == HSA_STATUS_SUCCESS)
+    return true;
+
+  hsa_error ("Could not free device memory", status);
+  return false;
+}
+
+/* Return true if PTR lies within the address range (both ends inclusive)
+   recorded for the module currently loaded on AGENT, and false if it lies
+   outside or no module is loaded.  */
+
+static bool
+image_address_p (struct agent_info *agent, const void *ptr)
+{
+  struct module_info *module = agent->module;
+  Elf64_Addr addr = (Elf64_Addr)ptr;
+
+  if (!module)
+    return false;
+
+  return (addr >= module->phys_address_start
+          && addr <= module->phys_address_end);
+}
+
+/* Description of one deferred memory copy. An instance is allocated by
+ queue_push_copy and consumed (and freed) by the copy_data callback. */
+struct copy_data
+{
+ void *dst; /* Destination address. */
+ const void *src; /* Source address. */
+ size_t len; /* Number of bytes to copy. */
+ bool use_hsa_memory_copy; /* Copy with hsa_memory_copy, else memcpy. */
+ struct goacc_asyncqueue *aq; /* Queue the copy was pushed onto. */
+};
+
+/* Queue callback: perform the copy described by DATA_ (a struct copy_data
+   allocated by queue_push_copy), then free the description.  */
+
+static void
+copy_data (void *data_)
+{
+  struct copy_data *job = (struct copy_data *)data_;
+
+  HSA_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
+             job->aq->agent->device_id, job->aq->id, job->len, job->src,
+             job->dst);
+
+  if (!job->use_hsa_memory_copy)
+    memcpy (job->dst, job->src, job->len);
+  else
+    hsa_fns.hsa_memory_copy_fn (job->dst, job->src, job->len);
+
+  free (job);
+}
+
+/* Push a copy of LEN bytes from SRC to DST onto the async queue AQ.
+   USE_HSA_MEMORY_COPY selects hsa_memory_copy over plain memcpy when the
+   copy is eventually carried out by the copy_data callback, which also
+   frees the request allocated here.  */
+
+static void
+queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
+                 size_t len, bool use_hsa_memory_copy)
+{
+  struct copy_data *request;
+
+  if (DEBUG_QUEUES)
+    HSA_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
+               aq->agent->device_id, aq->id, len, src, dst);
+
+  request = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
+  request->aq = aq;
+  request->use_hsa_memory_copy = use_hsa_memory_copy;
+  request->len = len;
+  request->src = src;
+  request->dst = dst;
+
+  queue_push_callback (aq, copy_data, request);
+}
+
+/* Copy N bytes from SRC on device number DEVICE to DST on the host.
+   Return true on success; if the HSA copy fails, the error is reported
+   and false is returned.  */
+
+bool
+GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
+{
+  HSA_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
+             src, dst);
+
+  /* memcpy only works for addresses allocated with hsa_memory_allocate,
+     but hsa_memory_copy seems unable to read from .rodata variables.  */
+  if (image_address_p (get_agent_info (device), src))
+    {
+      /* Do not report success if the runtime could not do the copy.  */
+      hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
+      if (status != HSA_STATUS_SUCCESS)
+        {
+          hsa_error ("Could not copy memory from device to host", status);
+          return false;
+        }
+    }
+  else
+    memcpy (dst, src, n);
+  return true;
+}
+
+/* Copy N bytes from SRC on the host to DST on device number DEVICE.
+   Return true on success; if the HSA copy fails, the error is reported
+   and false is returned.  */
+
+bool
+GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
+{
+  HSA_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
+             device, dst);
+  /* memcpy only works for addresses allocated with hsa_memory_allocate,
+     but hsa_memory_copy seems unable to read from .rodata variables.  */
+  if (image_address_p (get_agent_info (device), dst))
+    {
+      /* Do not report success if the runtime could not do the copy.  */
+      hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
+      if (status != HSA_STATUS_SUCCESS)
+        {
+          hsa_error ("Could not copy memory from host to device", status);
+          return false;
+        }
+    }
+  else
+    memcpy (dst, src, n);
+  return true;
+}
+
+/* Part of the libgomp plugin interface.  Copy N bytes from SRC to DST, both
+   on device number DEVICE.  When the calling thread has plugin thread data
+   whose async setting is not synchronous, the copy is pushed onto the
+   agent's OpenMP async queue; otherwise it is done immediately.  Always
+   returns true.  */
+
+bool
+GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
+{
+  struct gcn_thread *thread_data = gcn_thread ();
+  bool deferred = thread_data && !async_synchronous_p (thread_data->async);
+
+  if (!deferred)
+    {
+      HSA_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n",
+                 n, device, src, device, dst);
+      /* We can assume that dev2dev moves are always within allocated
+         memory.  */
+      memcpy (dst, src, n);
+      return true;
+    }
+
+  struct agent_info *agent = get_agent_info (device);
+  maybe_init_omp_async (agent);
+  queue_push_copy (agent->omp_async_queue, dst, src, n, false);
+  return true;
+}
+
+/* Return 1 if the async queue AQ currently holds no entries and 0
+   otherwise.  The count is sampled while holding AQ's mutex.  */
+
+static int
+queue_empty (struct goacc_asyncqueue *aq)
+{
+  int empty;
+
+  pthread_mutex_lock (&aq->mutex);
+  empty = (aq->queue_n == 0);
+  pthread_mutex_unlock (&aq->mutex);
+
+  return empty;
+}
+
+/* Block until the async queue AQ holds no more entries.  When
+   DRAIN_QUEUE_SYNCHRONOUS_P is set the queue is drained by this thread
+   instead; otherwise this thread sleeps on AQ's queue_cond_out condition
+   until the entry count reaches zero.  */
+
+static void
+wait_queue (struct goacc_asyncqueue *aq)
+{
+  if (DRAIN_QUEUE_SYNCHRONOUS_P)
+    drain_queue_synchronous (aq);
+  else
+    {
+      pthread_mutex_lock (&aq->mutex);
+
+      /* Re-check the count after every wake-up.  */
+      while (aq->queue_n > 0)
+        {
+          if (DEBUG_THREAD_SLEEP)
+            HSA_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
+                       aq->agent->device_id, aq->id);
+          pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
+          if (DEBUG_THREAD_SLEEP)
+            HSA_DEBUG ("thread %d:%d woke up. Rechecking\n",
+                       aq->agent->device_id, aq->id);
+        }
+
+      pthread_mutex_unlock (&aq->mutex);
+      HSA_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id,
+                 aq->id);
+    }
+}
+
+/* Callback-shaped wrapper around GOMP_OFFLOAD_free: release the device
+   memory at PTR.  Device number 0 is passed, which GOMP_OFFLOAD_free only
+   uses in its debug message; the result is ignored.  */
+
+static void
+gomp_offload_free (void *ptr)
+{
+  HSA_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
+  (void) GOMP_OFFLOAD_free (0, ptr);
+}
+
+/* Launch the OpenACC kernel KERNEL.  MAPNUM, HOSTADDRS and DEVADDRS
+   describe the mapped arguments: for each of the MAPNUM entries the device
+   address is passed if it is non-NULL and the host address otherwise.
+   DIMS holds the requested gang/worker/vector dimensions and may be updated
+   here.  TARG_MEM_DESC is unused.  If ASYNC is false the kernel is run
+   synchronously; otherwise the launch, and the release of the argument
+   array, are pushed onto the async queue AQ.  Failing to initialize the
+   kernel or to allocate the argument array is fatal.  */
+
+static void
+gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
+          void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
+          struct goacc_asyncqueue *aq)
+{
+  if (!GOMP_OFFLOAD_can_run (kernel))
+    GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
+
+  // For some reason, devaddrs must be double-indirect on the target
+  void **ind_da = GOMP_OFFLOAD_alloc_by_agent (kernel->agent,
+                                               sizeof (void*) * mapnum);
+  /* The allocator returns NULL on failure (having reported the HSA error);
+     stop here rather than write through a NULL pointer below.  */
+  if (!ind_da)
+    GOMP_PLUGIN_fatal ("Could not allocate memory for GCN kernel arguments");
+  for (size_t i = 0; i < mapnum; i++)
+    ind_da[i] = devaddrs[i] ? devaddrs[i] : hostaddrs[i];
+
+  /* Find this kernel's description in the image; the names are compared by
+     address, not by content.  */
+  struct hsa_kernel_description *hsa_kernel_desc = NULL;
+  for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
+    {
+      struct hsa_kernel_description *d
+        = &kernel->module->image_desc->kernel_infos[i];
+      if (d->name == kernel->name)
+        {
+          hsa_kernel_desc = d;
+          break;
+        }
+    }
+
+  /* We may have statically-determined dimensions in
+     hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
+     invocation at runtime in dims[].  We allow static dimensions to take
+     priority over dynamic dimensions when present (non-zero).  If no
+     description was found there are no static dimensions to apply, and the
+     runtime dimensions are used as they are.  */
+  if (hsa_kernel_desc)
+    {
+      if (hsa_kernel_desc->oacc_dims[0] > 0)
+        dims[0] = hsa_kernel_desc->oacc_dims[0];
+      if (hsa_kernel_desc->oacc_dims[1] > 0)
+        dims[1] = hsa_kernel_desc->oacc_dims[1];
+      if (hsa_kernel_desc->oacc_dims[2] > 0)
+        dims[2] = hsa_kernel_desc->oacc_dims[2];
+    }
+
+  /* If any of the OpenACC dimensions remain 0 then we get to pick a number.
+     There isn't really a correct answer for this without a clue about the
+     problem size, so let's do a reasonable number of single-worker gangs.
+     64 gangs matches a typical Fiji device.  */
+
+  if (dims[0] == 0) dims[0] = 64; /* Gangs.  */
+  if (dims[1] == 0) dims[1] = 16; /* Workers.  */
+
+  /* The incoming dimensions are expressed in terms of gangs, workers, and
+     vectors.  The HSA dimensions are expressed in terms of "work-items",
+     which means multiples of vector lanes.
+
+     The "grid size" specifies the size of the problem space, and the
+     "work-group size" specifies how much of that we want a single compute
+     unit to chew on at once.
+
+     The three dimensions do not really correspond to hardware, but the
+     important thing is that the HSA runtime will launch as many
+     work-groups as it takes to process the entire grid, and each
+     work-group will contain as many wave-fronts as it takes to process
+     the work-items in that group.
+
+     Essentially, as long as we set the Y dimension to 64 (the number of
+     vector lanes in hardware), and the Z group size to the maximum (16),
+     then we will get the gangs (X) and workers (Z) launched as we expect.
+
+     The reason for the apparent reversal of vector and worker dimension
+     order is to do with the way the run-time distributes work-items across
+     v1 and v2.  */
+  struct GOMP_kernel_launch_attributes kla =
+    {3,
+     /* Grid size.  */
+     {dims[0], 64, dims[1]},
+     /* Work-group size.  */
+     {1, 64, 16}
+    };
+
+  if (!async)
+    {
+      run_kernel (kernel, ind_da, &kla, NULL, false);
+      gomp_offload_free (ind_da);
+    }
+  else
+    {
+      queue_push_launch (aq, kernel, ind_da, &kla);
+      if (DEBUG_QUEUES)
+        HSA_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
+                   aq->agent->device_id, aq->id, ind_da);
+      /* The argument array must outlive the queued launch, so its release
+         is queued behind it.  */
+      queue_push_callback (aq, gomp_offload_free, ind_da);
+    }
+}
+
+/* Run the OpenACC kernel FN_PTR (really a struct kernel_info) synchronously
+   with the given mappings and dimensions; see gcn_exec.  */
+
+void
+GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
+                           void **hostaddrs, void **devaddrs, unsigned *dims,
+                           void *targ_mem_desc)
+{
+  gcn_exec ((struct kernel_info *) fn_ptr, mapnum, hostaddrs, devaddrs, dims,
+            targ_mem_desc, false, NULL);
+}
+
+/* Queue the OpenACC kernel FN_PTR (really a struct kernel_info) for
+   execution on the async queue AQ with the given mappings and dimensions;
+   see gcn_exec.  */
+
+void
+GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
+                                 void **hostaddrs, void **devaddrs,
+                                 unsigned *dims, void *targ_mem_desc,
+                                 struct goacc_asyncqueue *aq)
+{
+  gcn_exec ((struct kernel_info *) fn_ptr, mapnum, hostaddrs, devaddrs, dims,
+            targ_mem_desc, true, aq);
+}
+
+/* Create a new async queue for device number DEVICE: allocate and
+   initialize the queue object, link it at the head of the agent's list of
+   async queues, create its HSA command queue and start its drain thread.
+   Return the new queue, or NULL if one of its synchronization objects could
+   not be initialized; in that case nothing is left allocated, linked or
+   locked.  Failure to create the HSA queue or the thread is fatal.  */
+
+struct goacc_asyncqueue *
+GOMP_OFFLOAD_openacc_async_construct (int device)
+{
+  struct agent_info *agent = get_agent_info (device);
+  hsa_status_t status;
+  int err;
+
+  pthread_mutex_lock (&agent->async_queues_mutex);
+
+  struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
+  aq->agent = agent;
+
+  aq->queue_first = 0;
+  aq->queue_n = 0;
+  aq->drain_queue_stop = 0;
+
+  /* Set up the synchronization objects before the queue becomes visible in
+     the agent's list, so that a failure can simply be unwound.  */
+  if (pthread_mutex_init (&aq->mutex, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
+      goto fail_free;
+    }
+  if (pthread_cond_init (&aq->queue_cond_in, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
+      goto fail_mutex;
+    }
+  if (pthread_cond_init (&aq->queue_cond_out, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
+      goto fail_cond_in;
+    }
+
+  /* Link at the head of the list; ids count up from 1.  */
+  aq->prev = NULL;
+  aq->next = agent->async_queues;
+  if (aq->next)
+    {
+      aq->next->prev = aq;
+      aq->id = aq->next->id + 1;
+    }
+  else
+    aq->id = 1;
+  agent->async_queues = aq;
+
+  status = hsa_fns.hsa_queue_create_fn (agent->id,
+                                        ASYNC_QUEUE_SIZE,
+                                        HSA_QUEUE_TYPE_MULTI,
+                                        queue_callback, NULL,
+                                        UINT32_MAX, UINT32_MAX,
+                                        &aq->hsa_queue);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Error creating command queue", status);
+
+  err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
+  if (err != 0)
+    GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
+                       strerror (err));
+  HSA_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
+             aq->id);
+
+  pthread_mutex_unlock (&agent->async_queues_mutex);
+
+  return aq;
+
+fail_cond_in:
+  pthread_cond_destroy (&aq->queue_cond_in);
+fail_mutex:
+  pthread_mutex_destroy (&aq->mutex);
+fail_free:
+  free (aq);
+  pthread_mutex_unlock (&agent->async_queues_mutex);
+  return NULL;
+}
+
+/* Destroy the async queue AQ: finalize its thread, destroy its mutex,
+   condition variables and HSA command queue, unlink it from its agent's
+   list and free it.  The steps run in that order and stop at the first
+   failure, which is reported.  Return true only if every step succeeded.  */
+
+bool
+GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+{
+  struct agent_info *agent = aq->agent;
+  bool destroyed = false;
+  int err;
+
+  finalize_async_thread (aq);
+
+  pthread_mutex_lock (&agent->async_queues_mutex);
+
+  if ((err = pthread_mutex_destroy (&aq->mutex)))
+    GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
+  else if (pthread_cond_destroy (&aq->queue_cond_in))
+    GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
+  else if (pthread_cond_destroy (&aq->queue_cond_out))
+    GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
+  else
+    {
+      hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
+
+      if (status != HSA_STATUS_SUCCESS)
+        hsa_error ("Error destroying command queue", status);
+      else
+        {
+          /* Unlink from the agent's doubly-linked list of queues.  */
+          if (aq->prev)
+            aq->prev->next = aq->next;
+          if (aq->next)
+            aq->next->prev = aq->prev;
+          if (agent->async_queues == aq)
+            agent->async_queues = aq->next;
+
+          HSA_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id,
+                     aq->id);
+
+          free (aq);
+          destroyed = true;
+        }
+    }
+
+  pthread_mutex_unlock (&agent->async_queues_mutex);
+  return destroyed;
+}
+
+/* Return 1 if the async queue AQ has no entries left and 0 otherwise.  */
+
+int
+GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
+{
+  int idle = queue_empty (aq);
+
+  return idle;
+}
+
+/* Wait until the async queue AQ has no entries left.  Always returns
+   true.  */
+
+bool
+GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+{
+  bool done = true;
+
+  wait_queue (aq);
+  return done;
+}
+
+/* Order the async queues AQ1 and AQ2 with respect to each other.  This is
+   currently done by waiting, on the calling thread, for AQ1 and then for
+   AQ2 to run out of entries.  Always returns true.  */
+
+bool
+GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
+                                      struct goacc_asyncqueue *aq2)
+{
+  struct goacc_asyncqueue *in_order[2] = { aq1, aq2 };
+
+  /* FIXME: what should happen here????  */
+  for (int k = 0; k < 2; k++)
+    wait_queue (in_order[k]);
+
+  return true;
+}
+
+/* Push a call of FN with argument DATA onto the async queue AQ.  */
+
+void
+GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
+                                           void (*fn) (void *), void *data)
+{
+  void (*callback) (void *) = fn;
+
+  queue_push_callback (aq, callback, data);
+}
+
+/* Queue, on AQ, a copy of N bytes from host address SRC to address DST on
+   device number DEVICE, which must be the device AQ belongs to (this is
+   asserted).  hsa_memory_copy is selected when DST lies inside the loaded
+   image, memcpy otherwise.  Always returns true.  */
+
+bool
+GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
+                                     size_t n, struct goacc_asyncqueue *aq)
+{
+  struct agent_info *agent = get_agent_info (device);
+
+  assert (agent == aq->agent);
+
+  bool dst_in_image = image_address_p (agent, dst);
+  queue_push_copy (aq, dst, src, n, dst_in_image);
+  return true;
+}
+
+/* Queue, on AQ, a copy of N bytes from address SRC on device number DEVICE
+   to host address DST.  DEVICE must be the device AQ belongs to (this is
+   asserted).  hsa_memory_copy is selected when SRC lies inside the loaded
+   image, memcpy otherwise.  Always returns true.  */
+
+bool
+GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
+                                     size_t n, struct goacc_asyncqueue *aq)
+{
+  struct agent_info *agent = get_agent_info (device);
+
+  assert (agent == aq->agent);
+
+  bool src_in_image = image_address_p (agent, src);
+  queue_push_copy (aq, dst, src, n, src_in_image);
+  return true;
+}
+
+/* Allocate the plugin's per-thread OpenACC data and return it.  The async
+   setting starts out as GOMP_ASYNC_SYNC.  ORD is not used.  */
+
+void *
+GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
+{
+  struct gcn_thread *fresh = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
+
+  fresh->async = GOMP_ASYNC_SYNC;
+  return (void *) fresh;
+}
+
+/* Free the per-thread data DATA that was returned by
+   GOMP_OFFLOAD_openacc_create_thread_data.  */
+
+void
+GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
+{
+  struct gcn_thread *thread_data = data;
+
+  free (thread_data);
+}
@@ -39,7 +39,7 @@
#include <string.h>
#include <assert.h>
#include <errno.h>
-#ifdef RC_CHECKING
+#if defined(RC_CHECKING)
#include <stdio.h>
#endif
@@ -302,6 +302,12 @@ gomp_to_device_kind_p (int kind)
}
}
+/* Copy host memory to an offload device. In asynchronous mode (if AQ is
+ non-NULL), this is only safe when the source memory is a global or heap
+ location (otherwise a copy may take place from a dangling pointer to an
+ expired stack frame). Use copy_host2dev_immediate for copies from stack
+ locations. */
+
attribute_hidden void
gomp_copy_host2dev (struct gomp_device_descr *devicep,
struct goacc_asyncqueue *aq,
@@ -340,6 +346,17 @@ gomp_copy_host2dev (struct gomp_device_descr *devicep,
gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
}
+/* Use this variant for host-to-device copies from stack locations that may not
+   be live at the time an asynchronous copy operation takes place.  It calls
+   gomp_copy_host2dev with a NULL async queue, so the copy of SZ bytes from
+   host address H to device address D on DEVICEP is not deferred to a queue.
+   CBUF is passed through unchanged.  */
+
+static void
+copy_host2dev_immediate (struct gomp_device_descr *devicep, void *d,
+ const void *h, size_t sz,
+ struct gomp_coalesce_buf *cbuf)
+{
+ gomp_copy_host2dev (devicep, NULL, d, h, sz, cbuf);
+}
+
attribute_hidden void
gomp_copy_dev2host (struct gomp_device_descr *devicep,
struct goacc_asyncqueue *aq,
@@ -600,10 +617,10 @@ gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
if (cur_node.host_start == (uintptr_t) NULL)
{
cur_node.tgt_offset = (uintptr_t) NULL;
- gomp_copy_host2dev (devicep, aq,
- (void *) (tgt->tgt_start + target_offset),
- (void *) &cur_node.tgt_offset,
- sizeof (void *), cbuf);
+ copy_host2dev_immediate (devicep,
+ (void *) (tgt->tgt_start + target_offset),
+ (void *) &cur_node.tgt_offset,
+ sizeof (void *), cbuf);
return;
}
/* Add bias to the pointer value. */
@@ -622,8 +639,9 @@ gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
array section. Now subtract bias to get what we want
to initialize the pointer with. */
cur_node.tgt_offset -= bias;
- gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + target_offset),
- (void *) &cur_node.tgt_offset, sizeof (void *), cbuf);
+ copy_host2dev_immediate (devicep, (void *) (tgt->tgt_start + target_offset),
+ (void *) &cur_node.tgt_offset, sizeof (void *),
+ cbuf);
}
static void
@@ -1442,13 +1460,13 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i - 1);
if (cur_node.tgt_offset)
cur_node.tgt_offset -= sizes[i];
- gomp_copy_host2dev (devicep, aq,
- (void *) (n->tgt->tgt_start
- + n->tgt_offset
- + cur_node.host_start
- - n->host_start),
- (void *) &cur_node.tgt_offset,
- sizeof (void *), cbufp);
+ copy_host2dev_immediate (devicep,
+ (void *) (n->tgt->tgt_start
+ + n->tgt_offset
+ + cur_node.host_start
+ - n->host_start),
+ (void *) &cur_node.tgt_offset,
+ sizeof (void *), cbufp);
cur_node.tgt_offset = n->tgt->tgt_start + n->tgt_offset
+ cur_node.host_start - n->host_start;
continue;
@@ -1687,8 +1705,8 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset);
/* We intentionally do not use coalescing here, as it's not
data allocated by the current call to this function. */
- gomp_copy_host2dev (devicep, aq, (void *) n->tgt_offset,
- &tgt_addr, sizeof (void *), NULL);
+ copy_host2dev_immediate (devicep, (void *) n->tgt_offset,
+ &tgt_addr, sizeof (void *), NULL);
}
array++;
}
@@ -1810,10 +1828,9 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
for (i = 0; i < mapnum; i++)
{
cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
- gomp_copy_host2dev (devicep, aq,
- (void *) (tgt->tgt_start + i * sizeof (void *)),
- (void *) &cur_node.tgt_offset, sizeof (void *),
- cbufp);
+ copy_host2dev_immediate (devicep,
+ (void *) (tgt->tgt_start + i * sizeof (void *)),
+ (void *) &cur_node.tgt_offset, sizeof (void *), cbufp);
}
}
@@ -3725,6 +3742,8 @@ offload_target_to_plugin_name (const char *offload_target)
return "nvptx";
else if (strncmp (offload_target, "hsa", 3) == 0)
return "hsa";
+ else if (strstr (offload_target, "gcn") != NULL)
+ return "gcn";
else
gomp_fatal ("Unknown offload target: %s", offload_target);
}
@@ -239,6 +239,9 @@ gomp_free_pool_helper (void *thread_pool)
pthread_exit (NULL);
#elif defined(__nvptx__)
asm ("exit;");
+#elif defined(__AMDGCN__)
+ asm ("s_dcache_wb\n\t"
+ "s_endpgm");
#else
#error gomp_free_pool_helper must terminate the thread
#endif
@@ -207,6 +207,10 @@ PACKAGE_URL = @PACKAGE_URL@
PACKAGE_VERSION = @PACKAGE_VERSION@
PATH_SEPARATOR = @PATH_SEPARATOR@
PERL = @PERL@
+PLUGIN_GCN = @PLUGIN_GCN@
+PLUGIN_GCN_CPPFLAGS = @PLUGIN_GCN_CPPFLAGS@
+PLUGIN_GCN_LDFLAGS = @PLUGIN_GCN_LDFLAGS@
+PLUGIN_GCN_LIBS = @PLUGIN_GCN_LIBS@
PLUGIN_HSA = @PLUGIN_HSA@
PLUGIN_HSA_CPPFLAGS = @PLUGIN_HSA_CPPFLAGS@
PLUGIN_HSA_LDFLAGS = @PLUGIN_HSA_LDFLAGS@
@@ -280,6 +284,7 @@ pdfdir = @pdfdir@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
+runstatedir = @runstatedir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
@@ -445,3 +445,28 @@ proc check_effective_target_hsa_offloading_selected {} {
check_effective_target_hsa_offloading_selected_nocache
}]
}
+# Return 1 if at least one AMD GCN board is present.
+#
+# The answer comes from check_runtime, which is given the small program
+# below: it exits with status 0 exactly when
+# acc_get_num_devices (acc_device_gcn) reports more than zero devices.
+
+proc check_effective_target_openacc_amdgcn_accel_present { } {
+ return [check_runtime openacc_amdgcn_accel_present {
+ #include <openacc.h>
+ int main () {
+ return !(acc_get_num_devices (acc_device_gcn) > 0);
+ }
+ } "" ]
+}
+
+# Return 1 if at least one AMD GCN board is present, and the AMD GCN device
+# type is selected by default.  Return 0 otherwise.
+
+proc check_effective_target_openacc_amdgcn_accel_selected { } {
+    global offload_target_openacc
+
+    # The second operand is only evaluated when a board is present, so
+    # offload_target_openacc is not read otherwise.
+    if { [check_effective_target_openacc_amdgcn_accel_present]
+         && [string match "amdgcn*" $offload_target_openacc] } {
+        return 1
+    }
+    return 0
+}
+
@@ -23,10 +23,81 @@ dg-init
# Turn on OpenMP.
lappend ALWAYS_CFLAGS "additional_flags=-fopenmp"
+# Generate new tests for each DO_TEST entry in TEST_LIST.
+#
+# TEST_LIST is the path of a .list file below $srcdir/$subdir; the C source
+# it belongs to has the same name with a .c extension.  For every
+# "DO_TEST (name)" entry a wrapper source is written below
+# ./generated/libgomp.c.  The wrapper repeats the dg- directive lines of the
+# C source, defines ONE_TEST to the entry name and TEST_NR to its 1-based
+# position in the list, and then includes the C source.
+#
+# Returns the list of generated file names.
+proc generate_tests { test_list } {
+    global srcdir
+    global subdir
+
+    # Get corresponding source file.  Plain path and string operations are
+    # used on purpose: the paths are not regular expressions, and a pattern
+    # written as "\.list" in double quotes reaches the regexp engine as
+    # ".list", which may match (and delete) text earlier in the path.
+    set base_file [file rootname $test_list]
+    set src_prefix "$srcdir/$subdir/"
+    if { [string first $src_prefix $base_file] == 0 } {
+        set base_file [string range $base_file [string length $src_prefix] end]
+    }
+    set c_file $base_file.c
+
+    # Get dg directives from c file.
+    set dg_lines [list]
+    set fp [open "$srcdir/$subdir/$c_file" r]
+    while {[gets $fp line] >= 0} {
+        if {[regexp -line -- "^/\\* \{ dg-" $line]} {
+            lappend dg_lines $line
+        }
+    }
+    close $fp
+    set dg_directives [join $dg_lines "\n"]
+
+    # Get list of tests: strip the DO_TEST wrappers, parentheses and
+    # newlines, leaving the test names separated by single spaces.
+    set fp [open "$test_list" r]
+    set file_data [read $fp]
+    close $fp
+    set file_data [regsub -all "DO_TEST" $file_data ""]
+    set file_data [regsub -all "\\(" $file_data ""]
+    set file_data [regsub -all "\\)" $file_data ""]
+    set file_data [regsub -all \[\n\] $file_data ""]
+    set file_data [string trimleft $file_data " "]
+    set tests [split $file_data]
+
+    # Create directory to generate files.
+    set test_dir [pwd]
+    set generated_dir $test_dir/generated/libgomp.c
+    file mkdir $generated_dir
+
+    # Generate tests.  TEST_NR follows the order of the entries in the list
+    # file, starting at 1.
+    set new_files [list]
+    set i 1
+    foreach test $tests {
+        set new_file "$generated_dir/$base_file-$test.c"
+
+        set fp [open "$new_file" w]
+        puts $fp "$dg_directives"
+        puts $fp "#define ONE_TEST $test"
+        puts $fp "#define TEST_NR $i"
+        puts $fp "#include \"$srcdir/$subdir/$c_file\""
+        close $fp
+
+        incr i
+        lappend new_files $new_file
+    }
+
+    return $new_files
+}
+
+# Generate tests for each .list file
+set test_lists [find $srcdir/$subdir *.list]
+set generated_tests []
+foreach test_list $test_lists {
+ set generated_tests [concat \
+ $generated_tests \
+ [generate_tests $test_list]]
+}
+
# Gather a list of all tests.
set tests [lsort [concat \
[find $srcdir/$subdir *.c] \
- [find $srcdir/$subdir/../libgomp.c-c++-common *.c]]]
+ [find $srcdir/$subdir/../libgomp.c-c++-common *.c] \
+ $generated_tests]]
set ld_library_path $always_ld_library_path
append ld_library_path [gcc-set-multilib-library-path $GCC_UNDER_TEST]
new file mode 100644
@@ -0,0 +1,25 @@
+/* Include for-2.h five times, once per schedule variant.  For each
+   inclusion S holds the schedule clause appended to the loop pragma (empty,
+   static with chunk 32, auto, guided with chunk 32, runtime) and N(x) names
+   the generated functions x_<G>_<variant>.  The includer must define F (the
+   OpenMP construct), G (the name infix) and the pasting macro M.  */
+#define S
+#define N(x) M(x, G, static)
+#include "for-2.h"
+#undef S
+#undef N
+#define S schedule(static, 32)
+#define N(x) M(x, G, static32)
+#include "for-2.h"
+#undef S
+#undef N
+#define S schedule(auto)
+#define N(x) M(x, G, auto)
+#include "for-2.h"
+#undef S
+#undef N
+#define S schedule(guided, 32)
+#define N(x) M(x, G, guided32)
+#include "for-2.h"
+#undef S
+#undef N
+#define S schedule(runtime)
+#define N(x) M(x, G, runtime)
+#include "for-2.h"
+#undef S
+#undef N
new file mode 100644
@@ -0,0 +1,313 @@
+#ifndef VARS
+#define VARS
+int a[1500];
+float b[10][15][10];
+/* Never returns: spins in an empty infinite loop.  The tests below call it
+   only from loop bodies that are expected to run zero times.  */
+__attribute__((noreturn)) void
+noreturn (void)
+{
+ for (;;);
+}
+#endif
+#ifndef SC
+#define SC
+#endif
+#ifndef OMPTGT
+#define OMPTGT
+#endif
+#ifndef OMPTO
+#define OMPTO(v) do {} while (0)
+#endif
+#ifndef OMPFROM
+#define OMPFROM(v) do {} while (0)
+#endif
+
+/* Simple ascending int loop: add 2 to every element of a[0..1499].  */
+__attribute__((noinline, noclone)) void
+N(f0) (void)
+{
+ int i;
+ OMPTGT
+#pragma omp F S
+ for (i = 0; i < 1500; i++)
+ a[i] += 2;
+}
+
+/* Unsigned iteration variable declared in the loop, with bounds above
+   __INT_MAX__ and a step of 2: 1500 iterations that subtract 2 from each of
+   a[0..1499].  */
+__attribute__((noinline, noclone)) void
+N(f1) (void)
+{
+ OMPTGT
+#pragma omp F S
+ for (unsigned int i = __INT_MAX__; i < 3000U + __INT_MAX__; i += 2)
+ a[(i - __INT_MAX__) >> 1] -= 2;
+}
+
+/* Descending unsigned long long loop with bounds around __LONG_LONG_MAX__
+   and a step of 3: 1500 iterations that subtract 4 from each of
+   a[0..1499].  */
+__attribute__((noinline, noclone)) void
+N(f2) (void)
+{
+ unsigned long long i;
+ OMPTGT
+#pragma omp F S
+ for (i = __LONG_LONG_MAX__ + 4500ULL - 27;
+ i > __LONG_LONG_MAX__ - 27ULL; i -= 3)
+ a[(i + 26LL - __LONG_LONG_MAX__) / 3] -= 4;
+}
+
+/* Descending long long loop whose bounds and step are run-time values:
+   I runs from N1 + 23 down while it is greater than N2 - 25, in steps of S3,
+   adding 7 to a[I + 48].  The caller must choose arguments that keep
+   I + 48 within a[].  */
+__attribute__((noinline, noclone)) void
+N(f3) (long long n1, long long n2, long long s3)
+{
+ OMPTGT
+#pragma omp F S
+ for (long long i = n1 + 23; i > n2 - 25; i -= s3)
+ a[i + 48] += 7;
+}
+
+/* Loop with constant bounds that yield zero iterations (30 < 20 is false),
+   so a[] must be left unchanged.  */
+__attribute__((noinline, noclone)) void
+N(f4) (void)
+{
+ unsigned int i;
+ OMPTGT
+#pragma omp F S
+ for (i = 30; i < 20; i += 2)
+ a[i] += 10;
+}
+
+/* collapse(3) nest of ascending int loops with run-time bounds (N?1 up to
+   but excluding N?2) and steps S1..S3; adds 2.5 to each b[v1][v2][v3]
+   visited.  SC optionally supplies a storage class for the iteration
+   variables.  */
+__attribute__((noinline, noclone)) void
+N(f5) (int n11, int n12, int n21, int n22, int n31, int n32,
+ int s1, int s2, int s3)
+{
+ SC int v1, v2, v3;
+ OMPTGT
+#pragma omp F S collapse(3)
+ for (v1 = n11; v1 < n12; v1 += s1)
+ for (v2 = n21; v2 < n22; v2 += s2)
+ for (v3 = n31; v3 < n32; v3 += s3)
+ b[v1][v2][v3] += 2.5;
+}
+
+/* collapse(3) nest of descending loops ('>' conditions, steps added, so the
+   caller passes negative steps) mixing int and long long iteration
+   variables; subtracts 4.5 from each b[v1][v2 / 2][v3] visited.  */
+__attribute__((noinline, noclone)) void
+N(f6) (int n11, int n12, int n21, int n22, long long n31, long long n32,
+ int s1, int s2, long long int s3)
+{
+ SC int v1, v2;
+ SC long long v3;
+ OMPTGT
+#pragma omp F S collapse(3)
+ for (v1 = n11; v1 > n12; v1 += s1)
+ for (v2 = n21; v2 > n22; v2 += s2)
+ for (v3 = n31; v3 > n32; v3 += s3)
+ b[v1][v2 / 2][v3] -= 4.5;
+}
+
+/* collapse(3) nest with constant bounds and mixed unsigned iteration types:
+   v1 ascends by 2, v2 descends by 3 around __LONG_LONG_MAX__ (15 values) and
+   v3 descends by 1.  The index expressions map these onto all of
+   b[0..9][0..14][0..9], and 5.5 is added to each element.  */
+__attribute__((noinline, noclone)) void
+N(f7) (void)
+{
+ SC unsigned int v1, v3;
+ SC unsigned long long v2;
+ OMPTGT
+#pragma omp F S collapse(3)
+ for (v1 = 0; v1 < 20; v1 += 2)
+ for (v2 = __LONG_LONG_MAX__ + 16ULL;
+ v2 > __LONG_LONG_MAX__ - 29ULL; v2 -= 3)
+ for (v3 = 10; v3 > 0; v3--)
+ b[v1 >> 1][(v2 - __LONG_LONG_MAX__ + 64) / 3 - 12][v3 - 1] += 5.5;
+}
+
+/* collapse(3) nest in which the two inner loops have constant bounds giving
+   zero iterations (30 < 20 and 10 < 0 are false), so the body never runs
+   and b[] must be left unchanged.  */
+__attribute__((noinline, noclone)) void
+N(f8) (void)
+{
+ SC long long v1, v2, v3;
+ OMPTGT
+#pragma omp F S collapse(3)
+ for (v1 = 0; v1 < 20; v1 += 2)
+ for (v2 = 30; v2 < 20; v2++)
+ for (v3 = 10; v3 < 0; v3--)
+ b[v1][v2][v3] += 5.5;
+}
+
+/* Zero-iteration loop (20 < 10 is false) whose body calls noreturn ().  If
+   the body ran even once the test would hang.  */
+__attribute__((noinline, noclone)) void
+N(f9) (void)
+{
+ int i;
+ OMPTGT
+#pragma omp F S
+ for (i = 20; i < 10; i++)
+ {
+ a[i] += 2;
+ noreturn ();
+ a[i] -= 4;
+ }
+}
+
+/* collapse(3) nest whose middle loop has constant bounds giving zero
+   iterations (10 < 8 is false); the body calls noreturn (), so it must
+   never execute.  */
+__attribute__((noinline, noclone)) void
+N(f10) (void)
+{
+ SC int i;
+ OMPTGT
+#pragma omp F S collapse(3)
+ for (i = 0; i < 10; i++)
+ for (int j = 10; j < 8; j++)
+ for (long k = -10; k < 10; k++)
+ {
+ b[i][j][k] += 4;
+ noreturn ();
+ b[i][j][k] -= 8;
+ }
+}
+
+/* Like f9, but the upper bound N is only known at run time.  The body calls
+   noreturn (), so callers must pass N <= 20 (N(test) passes 10) to get zero
+   iterations.  */
+__attribute__((noinline, noclone)) void
+N(f11) (int n)
+{
+ int i;
+ OMPTGT
+#pragma omp F S
+ for (i = 20; i < n; i++)
+ {
+ a[i] += 8;
+ noreturn ();
+ a[i] -= 16;
+ }
+}
+
+/* Like f10, but the middle loop starts at the run-time value N.  The body
+   calls noreturn (), so callers must pass N >= 8 (N(test) passes 12) to get
+   zero iterations.  */
+__attribute__((noinline, noclone)) void
+N(f12) (int n)
+{
+ SC int i;
+ OMPTGT
+#pragma omp F S collapse(3)
+ for (i = 0; i < 10; i++)
+ for (int j = n; j < 8; j++)
+ for (long k = -10; k < 10; k++)
+ {
+ b[i][j][k] += 16;
+ noreturn ();
+ b[i][j][k] -= 32;
+ }
+}
+
+/* Loop with a pointer iteration variable walking a[0..1499]; adds 2 to
+   every element.  */
+__attribute__((noinline, noclone)) void
+N(f13) (void)
+{
+ int *i;
+ OMPTGT
+#pragma omp F S
+ for (i = a; i < &a[1500]; i++)
+ i[0] += 2;
+}
+
+/* collapse(3) nest with pointer iteration variables: I ascends over 10
+   elements, J descends in steps of 10 elements (15 values) and K descends
+   over 10 elements.  The pointer differences are turned back into indices
+   covering all of b[0..9][0..14][0..9], and 3.5 is subtracted from each
+   element.  */
+__attribute__((noinline, noclone)) void
+N(f14) (void)
+{
+ SC float *i;
+ OMPTGT
+#pragma omp F S collapse(3)
+ for (i = &b[0][0][0]; i < &b[0][0][10]; i++)
+ for (float *j = &b[0][15][0]; j > &b[0][0][0]; j -= 10)
+ for (float *k = &b[0][0][10]; k > &b[0][0][0]; --k)
+ b[i - &b[0][0][0]][(j - &b[0][0][0]) / 10 - 1][(k - &b[0][0][0]) - 1]
+ -= 3.5;
+}
+
+/* Driver for one F/S variant: initialize a[] and b[], call f0..f14 in order
+   and compare the arrays against the values expected after each step.
+   OMPTO/OMPFROM expand to nothing or to 'target update' pragmas, depending
+   on the includer.  Returns 1 on the first mismatch and 0 if every check
+   passes.  */
+__attribute__((noinline, noclone)) int
+N(test) (void)
+{
+ int i, j, k;
+ for (i = 0; i < 1500; i++)
+ a[i] = i - 25;
+ OMPTO (a);
+ /* f0 adds 2 to every element.  */
+ N(f0) ();
+ OMPFROM (a);
+ for (i = 0; i < 1500; i++)
+ if (a[i] != i - 23)
+ return 1;
+ /* f1 subtracts 2 again.  */
+ N(f1) ();
+ OMPFROM (a);
+ for (i = 0; i < 1500; i++)
+ if (a[i] != i - 25)
+ return 1;
+ /* f2 subtracts 4.  */
+ N(f2) ();
+ OMPFROM (a);
+ for (i = 0; i < 1500; i++)
+ if (a[i] != i - 29)
+ return 1;
+ /* These arguments make f3 visit a[0..1499] exactly once, adding 7.  */
+ N(f3) (1500LL - 1 - 23 - 48, -1LL + 25 - 48, 1LL);
+ OMPFROM (a);
+ for (i = 0; i < 1500; i++)
+ if (a[i] != i - 22)
+ return 1;
+ /* Here the start is already below the limit: zero iterations.  */
+ N(f3) (1500LL - 1 - 23 - 48, 1500LL - 1, 7LL);
+ OMPFROM (a);
+ for (i = 0; i < 1500; i++)
+ if (a[i] != i - 22)
+ return 1;
+ /* f4 has zero iterations.  */
+ N(f4) ();
+ OMPFROM (a);
+ for (i = 0; i < 1500; i++)
+ if (a[i] != i - 22)
+ return 1;
+ for (i = 0; i < 10; i++)
+ for (j = 0; j < 15; j++)
+ for (k = 0; k < 10; k++)
+ b[i][j][k] = i - 2.5 + 1.5 * j - 1.5 * k;
+ OMPTO (b);
+ /* Full traversal of b[]: every element gains 2.5.  */
+ N(f5) (0, 10, 0, 15, 0, 10, 1, 1, 1);
+ OMPFROM (b);
+ for (i = 0; i < 10; i++)
+ for (j = 0; j < 15; j++)
+ for (k = 0; k < 10; k++)
+ if (b[i][j][k] != i + 1.5 * j - 1.5 * k)
+ return 1;
+ /* The middle loop is empty (30 < 15 is false): b[] must be unchanged.  */
+ N(f5) (0, 10, 30, 15, 0, 10, 4, 5, 6);
+ OMPFROM (b);
+ for (i = 0; i < 10; i++)
+ for (j = 0; j < 15; j++)
+ for (k = 0; k < 10; k++)
+ if (b[i][j][k] != i + 1.5 * j - 1.5 * k)
+ return 1;
+ /* Descending traversal of all of b[]: every element loses 4.5.  */
+ N(f6) (9, -1, 29, 0, 9, -1, -1, -2, -1);
+ OMPFROM (b);
+ for (i = 0; i < 10; i++)
+ for (j = 0; j < 15; j++)
+ for (k = 0; k < 10; k++)
+ if (b[i][j][k] != i - 4.5 + 1.5 * j - 1.5 * k)
+ return 1;
+ /* f7 adds 5.5 to every element.  */
+ N(f7) ();
+ OMPFROM (b);
+ for (i = 0; i < 10; i++)
+ for (j = 0; j < 15; j++)
+ for (k = 0; k < 10; k++)
+ if (b[i][j][k] != i + 1.0 + 1.5 * j - 1.5 * k)
+ return 1;
+ /* f8 has zero iterations.  */
+ N(f8) ();
+ OMPFROM (b);
+ for (i = 0; i < 10; i++)
+ for (j = 0; j < 15; j++)
+ for (k = 0; k < 10; k++)
+ if (b[i][j][k] != i + 1.0 + 1.5 * j - 1.5 * k)
+ return 1;
+ /* f9..f12 must not execute their bodies (which call noreturn ()), so
+    both arrays are expected to be unchanged afterwards.  */
+ N(f9) ();
+ N(f10) ();
+ N(f11) (10);
+ N(f12) (12);
+ OMPFROM (a);
+ OMPFROM (b);
+ for (i = 0; i < 1500; i++)
+ if (a[i] != i - 22)
+ return 1;
+ for (i = 0; i < 10; i++)
+ for (j = 0; j < 15; j++)
+ for (k = 0; k < 10; k++)
+ if (b[i][j][k] != i + 1.0 + 1.5 * j - 1.5 * k)
+ return 1;
+ /* Pointer-based loops: f13 adds 2 to a[], f14 subtracts 3.5 from b[].  */
+ N(f13) ();
+ N(f14) ();
+ OMPFROM (a);
+ OMPFROM (b);
+ for (i = 0; i < 1500; i++)
+ if (a[i] != i - 20)
+ return 1;
+ for (i = 0; i < 10; i++)
+ for (j = 0; j < 15; j++)
+ for (k = 0; k < 10; k++)
+ if (b[i][j][k] != i - 2.5 + 1.5 * j - 1.5 * k)
+ return 1;
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,123 @@
+/* { dg-additional-options "-std=gnu99" } */
+
+extern void abort ();
+
+#define M(x, y, z) O(x, y, z)
+#define O(x, y, z) x ## _ ## y ## _ ## z
+
+#ifndef ONE_TEST
+#define TEST_ALL 1
+#else
+#define TEST_ALL 0
+#endif
+
+#pragma omp declare target
+
+#if TEST_ALL || TEST_NR == 1
+#define F distribute
+#define G d
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 2
+#define F distribute
+#define G d_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 3
+#define F distribute simd
+#define G ds
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 4
+#define F distribute simd
+#define G ds_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (5 <= TEST_NR && TEST_NR <= 9)
+#define F distribute parallel for
+#define G dpf
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (10 <= TEST_NR && TEST_NR <= 14)
+#define F distribute parallel for dist_schedule(static, 128)
+#define G dpf_ds128
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (15 <= TEST_NR && TEST_NR <= 19)
+#define F distribute parallel for simd
+#define G dpfs
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (20 <= TEST_NR && TEST_NR <= 24)
+#define F distribute parallel for simd dist_schedule(static, 128)
+#define G dpfs_ds128
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#pragma omp end declare target
+
+/* Run the selected tests inside a single 'target teams' region.  Each test
+   result is OR-ed into ERR, which is combined across teams by the
+   reduction clause; the host aborts if any test returned nonzero.  With
+   ONE_TEST defined only that test is called, otherwise every DO_TEST entry
+   of for-3.list is.  */
+int
+main ()
+{
+ int err = 0;
+
+ #pragma omp target teams reduction(|:err)
+ {
+#define DO_TEST_1(test) \
+ do { \
+ err |= test (); \
+ } while (0)
+
+#ifdef ONE_TEST
+ DO_TEST_1 (ONE_TEST);
+#else
+#define DO_TEST(test) DO_TEST_1(test);
+#include "for-3.list"
+#undef DO_TEST
+#endif
+#undef DO_TEST_1
+ }
+
+ if (err)
+ abort ();
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,24 @@
+DO_TEST (test_d_normal)
+DO_TEST (test_d_ds128_normal)
+DO_TEST (test_ds_normal)
+DO_TEST (test_ds_ds128_normal)
+DO_TEST (test_dpf_static)
+DO_TEST (test_dpf_static32)
+DO_TEST (test_dpf_auto)
+DO_TEST (test_dpf_guided32)
+DO_TEST (test_dpf_runtime)
+DO_TEST (test_dpf_ds128_static)
+DO_TEST (test_dpf_ds128_static32)
+DO_TEST (test_dpf_ds128_auto)
+DO_TEST (test_dpf_ds128_guided32)
+DO_TEST (test_dpf_ds128_runtime)
+DO_TEST (test_dpfs_static)
+DO_TEST (test_dpfs_static32)
+DO_TEST (test_dpfs_auto)
+DO_TEST (test_dpfs_guided32)
+DO_TEST (test_dpfs_runtime)
+DO_TEST (test_dpfs_ds128_static)
+DO_TEST (test_dpfs_ds128_static32)
+DO_TEST (test_dpfs_ds128_auto)
+DO_TEST (test_dpfs_ds128_guided32)
+DO_TEST (test_dpfs_ds128_runtime)
new file mode 100644
@@ -0,0 +1,161 @@
+/* { dg-additional-options "-std=gnu99" } */
+
+extern void abort ();
+
+#define M(x, y, z) O(x, y, z)
+#define O(x, y, z) x ## _ ## y ## _ ## z
+
+#ifndef ONE_TEST
+#define TEST_ALL 1
+#else
+#define TEST_ALL 0
+#endif
+
+#pragma omp declare target
+
+#define F for
+#define G f
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#pragma omp end declare target
+
+#undef OMPFROM
+#undef OMPTO
+#define DO_PRAGMA(x) _Pragma (#x)
+#define OMPFROM(v) DO_PRAGMA (omp target update from(v))
+#define OMPTO(v) DO_PRAGMA (omp target update to(v))
+
+#if TEST_ALL || (1 <= TEST_NR && TEST_NR <= 5)
+#define F target parallel for
+#define G tpf
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 6
+#define F target simd
+#define G t_simd
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (7 <= TEST_NR && TEST_NR <= 11)
+#define F target parallel for simd
+#define G tpf_simd
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 12
+#define F target teams distribute
+#define G ttd
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 13
+#define F target teams distribute
+#define G ttd_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 14
+#define F target teams distribute simd
+#define G ttds
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 15
+#define F target teams distribute simd
+#define G ttds_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (16 <= TEST_NR && TEST_NR <= 20)
+#define F target teams distribute parallel for
+#define G ttdpf
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (21 <= TEST_NR && TEST_NR <= 25)
+#define F target teams distribute parallel for dist_schedule(static, 128)
+#define G ttdpf_ds128
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (26 <= TEST_NR && TEST_NR <= 30)
+#define F target teams distribute parallel for simd
+#define G ttdpfs
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (31 <= TEST_NR && TEST_NR <= 35)
+#define F target teams distribute parallel for simd dist_schedule(static, 128)
+#define G ttdpfs_ds128
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+/* Call the selected test functions from the host and abort as soon as one
+   of them returns nonzero.  With ONE_TEST defined only that test is called,
+   otherwise every DO_TEST entry of for-5.list is.  */
+int
+main ()
+{
+#define DO_TEST_1(test) \
+ do { \
+ if (test ()) \
+ abort (); \
+ } while (0)
+
+#ifdef ONE_TEST
+ DO_TEST_1 (ONE_TEST);
+#else
+#define DO_TEST(test) DO_TEST_1 (test);
+#include "for-5.list"
+#undef DO_TEST
+#endif
+#undef DO_TEST_1
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,35 @@
+DO_TEST (test_tpf_static)
+DO_TEST (test_tpf_static32)
+DO_TEST (test_tpf_auto)
+DO_TEST (test_tpf_guided32)
+DO_TEST (test_tpf_runtime)
+DO_TEST (test_t_simd_normal)
+DO_TEST (test_tpf_simd_static)
+DO_TEST (test_tpf_simd_static32)
+DO_TEST (test_tpf_simd_auto)
+DO_TEST (test_tpf_simd_guided32)
+DO_TEST (test_tpf_simd_runtime)
+DO_TEST (test_ttd_normal)
+DO_TEST (test_ttd_ds128_normal)
+DO_TEST (test_ttds_normal)
+DO_TEST (test_ttds_ds128_normal)
+DO_TEST (test_ttdpf_static)
+DO_TEST (test_ttdpf_static32)
+DO_TEST (test_ttdpf_auto)
+DO_TEST (test_ttdpf_guided32)
+DO_TEST (test_ttdpf_runtime)
+DO_TEST (test_ttdpf_ds128_static)
+DO_TEST (test_ttdpf_ds128_static32)
+DO_TEST (test_ttdpf_ds128_auto)
+DO_TEST (test_ttdpf_ds128_guided32)
+DO_TEST (test_ttdpf_ds128_runtime)
+DO_TEST (test_ttdpfs_static)
+DO_TEST (test_ttdpfs_static32)
+DO_TEST (test_ttdpfs_auto)
+DO_TEST (test_ttdpfs_guided32)
+DO_TEST (test_ttdpfs_runtime)
+DO_TEST (test_ttdpfs_ds128_static)
+DO_TEST (test_ttdpfs_ds128_static32)
+DO_TEST (test_ttdpfs_ds128_auto)
+DO_TEST (test_ttdpfs_ds128_guided32)
+DO_TEST (test_ttdpfs_ds128_runtime)
new file mode 100644
@@ -0,0 +1,135 @@
+/* { dg-additional-options "-std=gnu99" } */
+
+extern void abort ();
+
+#define M(x, y, z) O(x, y, z)
+#define O(x, y, z) x ## _ ## y ## _ ## z
+
+#ifndef ONE_TEST
+#define TEST_ALL 1
+#else
+#define TEST_ALL 0
+#endif
+
+#pragma omp declare target
+
+#define F for
+#define G f
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#pragma omp end declare target
+
+#undef OMPTGT
+#undef OMPFROM
+#undef OMPTO
+#define DO_PRAGMA(x) _Pragma (#x)
+#define OMPTGT DO_PRAGMA (omp target)
+#define OMPFROM(v) DO_PRAGMA (omp target update from(v))
+#define OMPTO(v) DO_PRAGMA (omp target update to(v))
+
+#if TEST_ALL || TEST_NR == 1
+#define F teams distribute
+#define G td
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 2
+#define F teams distribute
+#define G td_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 3
+#define F teams distribute simd
+#define G tds
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 4
+#define F teams distribute simd
+#define G tds_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (5 <= TEST_NR && TEST_NR <= 9)
+#define F teams distribute parallel for
+#define G tdpf
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (10 <= TEST_NR && TEST_NR <= 14)
+#define F teams distribute parallel for dist_schedule(static, 128)
+#define G tdpf_ds128
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (15 <= TEST_NR && TEST_NR <= 19)
+#define F teams distribute parallel for simd
+#define G tdpfs
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (20 <= TEST_NR && TEST_NR <= 24)
+#define F teams distribute parallel for simd dist_schedule(static, 128)
+#define G tdpfs_ds128
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+/* Call the selected test functions from the host and abort as soon as one
+   of them returns nonzero.  With ONE_TEST defined only that test is called,
+   otherwise every DO_TEST entry of for-6.list is.  */
+int
+main ()
+{
+#define DO_TEST_1(test) \
+ do { \
+ if (test ()) \
+ abort (); \
+ } while (0)
+
+#ifdef ONE_TEST
+ DO_TEST_1 (ONE_TEST);
+#else
+#define DO_TEST(test) DO_TEST_1 (test);
+#include "for-6.list"
+#undef DO_TEST
+#endif
+#undef DO_TEST_1
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,24 @@
+DO_TEST (test_td_normal)
+DO_TEST (test_td_ds128_normal)
+DO_TEST (test_tds_normal)
+DO_TEST (test_tds_ds128_normal)
+DO_TEST (test_tdpf_static)
+DO_TEST (test_tdpf_static32)
+DO_TEST (test_tdpf_auto)
+DO_TEST (test_tdpf_guided32)
+DO_TEST (test_tdpf_runtime)
+DO_TEST (test_tdpf_ds128_static)
+DO_TEST (test_tdpf_ds128_static32)
+DO_TEST (test_tdpf_ds128_auto)
+DO_TEST (test_tdpf_ds128_guided32)
+DO_TEST (test_tdpf_ds128_runtime)
+DO_TEST (test_tdpfs_static)
+DO_TEST (test_tdpfs_static32)
+DO_TEST (test_tdpfs_auto)
+DO_TEST (test_tdpfs_guided32)
+DO_TEST (test_tdpfs_runtime)
+DO_TEST (test_tdpfs_ds128_static)
+DO_TEST (test_tdpfs_ds128_static32)
+DO_TEST (test_tdpfs_ds128_auto)
+DO_TEST (test_tdpfs_ds128_guided32)
+DO_TEST (test_tdpfs_ds128_runtime)
new file mode 100644
@@ -0,0 +1,17 @@
+/* Ensure that printf on the offload device works. */
+
+/* { dg-do run } */
+/* { dg-output "The answer is 42(\n|\r\n|\r)+" } */
+
+#include <stdio.h>
+
+int var = 42;
+
+/* Print the value of the global VAR from inside a target region.  The test
+   passes when the program output matches the dg-output directive of this
+   file.  */
+int
+main ()
+{
+#pragma omp target
+ {
+ printf ("The answer is %d\n", var);
+ }
+}
new file mode 100644
@@ -0,0 +1,15 @@
+! Ensure that formatted output (write) on the offload device works.
+
+! { dg-do run }
+! { dg-output "The answer is 42(\n|\r\n|\r)+" }
+! { dg-xfail-if "no write for nvidia" { openacc_nvidia_accel_selected } }
+
+program main
+ implicit none
+ integer :: var = 42
+
+ ! Write the message to unit 0 from inside the target region.  The test
+ ! passes when the program output matches the dg-output directive of this
+ ! file.
+!$omp target
+ write (0, '("The answer is ", I2)') var
+!$omp end target
+
+end program main
@@ -111,6 +111,10 @@ if { $lang_test_file_found } {
set acc_mem_shared 0
}
+ amdgcn* {
+ set acc_mem_shared 0
+ set tagopt "-DACC_DEVICE_TYPE_gcn=\"$offload_target_openacc\""
+ }
default {
error "Unknown OpenACC device type: $openacc_device_type (offload target: $offload_target)"
}
@@ -37,11 +37,9 @@ main(int argc, char **argv)
imin = idata[i] < imin ? idata[i] : imin;
}
- if (imax != 1234 || imin != 0)
+ if (imax != 1234 || imin < 0 || imin > 1)
abort ();
- return 0;
-
igot = 0;
iexp = 32;
@@ -443,17 +441,16 @@ main(int argc, char **argv)
}
}
+ int ones = 0, zeros = 0;
+
for (i = 0; i < N; i++)
- if (i % 2 == 0)
- {
- if (idata[i] != 1)
- abort ();
- }
- else
- {
- if (idata[i] != 0)
- abort ();
- }
+ if (idata[i] == 1)
+ ones++;
+ else if (idata[i] == 0)
+ zeros++;
+
+ if (ones != N / 2 || zeros != N / 2)
+ abort ();
if (iexp != igot)
abort ();
@@ -491,17 +488,16 @@ main(int argc, char **argv)
}
}
+ ones = zeros = 0;
+
for (i = 0; i < N; i++)
- if (i % 2 == 0)
- {
- if (idata[i] != 0)
- abort ();
- }
- else
- {
- if (idata[i] != 1)
- abort ();
- }
+ if (idata[i] == 1)
+ ones++;
+ else if (idata[i] == 0)
+ zeros++;
+
+ if (ones != N / 2 || zeros != N / 2)
+ abort ();
if (iexp != igot)
abort ();
@@ -579,7 +575,7 @@ main(int argc, char **argv)
if (lexp != lgot)
abort ();
- lgot = 2LL;
+ lgot = 2LL << N;
lexp = 2LL;
#pragma acc data copy (lgot, ldata[0:N])
@@ -587,7 +583,7 @@ main(int argc, char **argv)
#pragma acc parallel loop
for (i = 0; i < N; i++)
{
- long long expr = 1LL << N;
+ long long expr = 2LL;
#pragma acc atomic capture
{ lgot = lgot / expr; ldata[i] = lgot; }
@@ -1450,17 +1446,16 @@ main(int argc, char **argv)
}
}
+ ones = zeros = 0;
+
for (i = 0; i < N; i++)
- if (i % 2 == 0)
- {
- if (fdata[i] != 1.0)
- abort ();
- }
- else
- {
- if (fdata[i] != 0.0)
- abort ();
- }
+ if (fdata[i] == 1.0)
+ ones++;
+ else if (fdata[i] == 0.0)
+ zeros++;
+
+ if (ones != N / 2 || zeros != N / 2)
+ abort ();
if (fexp != fgot)
abort ();
@@ -1498,17 +1493,16 @@ main(int argc, char **argv)
}
}
+ ones = zeros = 0;
+
for (i = 0; i < N; i++)
- if (i % 2 == 0)
- {
- if (fdata[i] != 0.0)
- abort ();
- }
- else
- {
- if (fdata[i] != 1.0)
- abort ();
- }
+ if (fdata[i] == 1.0)
+ ones++;
+ else if (fdata[i] == 0.0)
+ zeros++;
+
+ if (ones != N / 2 || zeros != N / 2)
+ abort ();
if (fexp != fgot)
abort ();
@@ -1569,7 +1563,7 @@ main(int argc, char **argv)
abort ();
fgot = 8192.0*8192.0*64.0;
- fexp = 1.0;
+ fexp = fgot;
#pragma acc data copy (fgot, fdata[0:N])
{
@@ -1586,15 +1580,15 @@ main(int argc, char **argv)
if (fexp != fgot)
abort ();
- fgot = 4.0;
- fexp = 4.0;
+ fgot = 2.0 * (1LL << N);
+ fexp = 2.0;
#pragma acc data copy (fgot, fdata[0:N])
{
#pragma acc parallel loop
for (i = 0; i < N; i++)
{
- long long expr = 1LL << N;
+ long long expr = 2LL;
#pragma acc atomic capture
{ fgot = fgot / expr; fdata[i] = fgot; }
@@ -1,3 +1,6 @@
+/* AMD GCN does not use 32-lane vectors.
+ { dg-skip-if "unsuitable dimensions" { openacc_amdgcn_accel_selected } { "*" } { "" } } */
+
/* { dg-additional-options "-fopenacc-dim=32" } */
#include <stdio.h>
@@ -9,11 +9,13 @@ int main ()
int ix;
int exit = 0;
int ondev = 0;
+ int gangsize, workersize, vectorsize;
for (ix = 0; ix < N;ix++)
ary[ix] = -1;
-#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
+ copy(ary) copy(ondev) copyout(gangsize, workersize, vectorsize)
{
#pragma acc loop gang worker vector
for (unsigned ix = 0; ix < N; ix++)
@@ -32,6 +34,10 @@ int main ()
else
ary[ix] = ix;
}
+
+ gangsize = __builtin_goacc_parlevel_size (GOMP_DIM_GANG);
+ workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+ vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
}
for (ix = 0; ix < N; ix++)
@@ -39,11 +45,12 @@ int main ()
int expected = ix;
if(ondev)
{
- int chunk_size = (N + 32*32*32 - 1) / (32*32*32);
+ int chunk_size = (N + gangsize * workersize * vectorsize - 1)
+ / (gangsize * workersize * vectorsize);
- int g = ix / (chunk_size * 32 * 32);
- int w = ix / 32 % 32;
- int v = ix % 32;
+ int g = ix / (chunk_size * workersize * vectorsize);
+ int w = (ix / vectorsize) % workersize;
+ int v = ix % vectorsize;
expected = (g << 16) | (w << 8) | v;
}
@@ -8,8 +8,10 @@ int main ()
int ix;
int ondev = 0;
int t = 0, h = 0;
-
-#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ondev)
+ int gangsize, workersize, vectorsize;
+
+#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
+ copy(ondev) copyout(gangsize, workersize, vectorsize)
{
#pragma acc loop gang worker vector reduction(+:t)
for (unsigned ix = 0; ix < N; ix++)
@@ -28,18 +30,22 @@ int main ()
}
t += val;
}
+ gangsize = __builtin_goacc_parlevel_size (GOMP_DIM_GANG);
+ workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+ vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
}
for (ix = 0; ix < N; ix++)
{
int val = ix;
- if(ondev)
+ if (ondev)
{
- int chunk_size = (N + 32*32*32 - 1) / (32*32*32);
+ int chunk_size = (N + gangsize * workersize * vectorsize - 1)
+ / (gangsize * workersize * vectorsize);
- int g = ix / (chunk_size * 32 * 32);
- int w = ix / 32 % 32;
- int v = ix % 32;
+ int g = ix / (chunk_size * vectorsize * workersize);
+ int w = ix / vectorsize % workersize;
+ int v = ix % vectorsize;
val = (g << 16) | (w << 8) | v;
}
@@ -9,8 +9,9 @@ int main ()
int ix;
int ondev = 0;
int t = 0, h = 0;
+ int vectorsize;
-#pragma acc parallel vector_length(32) copy(ondev)
+#pragma acc parallel vector_length(32) copy(ondev) copyout(vectorsize)
{
#pragma acc loop vector reduction (+:t)
for (unsigned ix = 0; ix < N; ix++)
@@ -29,6 +30,7 @@ int main ()
}
t += val;
}
+ vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
}
for (ix = 0; ix < N; ix++)
@@ -38,7 +40,7 @@ int main ()
{
int g = 0;
int w = 0;
- int v = ix % 32;
+ int v = ix % vectorsize;
val = (g << 16) | (w << 8) | v;
}
@@ -9,8 +9,9 @@ int main ()
int ix;
int ondev = 0;
int q = 0, h = 0;
+ int vectorsize;
-#pragma acc parallel vector_length(32) copy(q) copy(ondev)
+#pragma acc parallel vector_length(32) copy(q) copy(ondev) copyout(vectorsize)
{
int t = q;
@@ -32,6 +33,7 @@ int main ()
t += val;
}
q = t;
+ vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
}
for (ix = 0; ix < N; ix++)
@@ -41,7 +43,7 @@ int main ()
{
int g = 0;
int w = 0;
- int v = ix % 32;
+ int v = ix % vectorsize;
val = (g << 16) | (w << 8) | v;
}
@@ -9,8 +9,10 @@ int main ()
int ix;
int ondev = 0;
int t = 0, h = 0;
+ int workersize;
-#pragma acc parallel num_workers(32) vector_length(32) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ondev) \
+ copyout(workersize)
{
#pragma acc loop worker reduction(+:t)
for (unsigned ix = 0; ix < N; ix++)
@@ -29,6 +31,7 @@ int main ()
}
t += val;
}
+ workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
}
for (ix = 0; ix < N; ix++)
@@ -37,7 +40,7 @@ int main ()
if(ondev)
{
int g = 0;
- int w = ix % 32;
+ int w = ix % workersize;
int v = 0;
val = (g << 16) | (w << 8) | v;
@@ -9,8 +9,10 @@ int main ()
int ix;
int ondev = 0;
int q = 0, h = 0;
+ int workersize;
-#pragma acc parallel num_workers(32) vector_length(32) copy(q) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(q) copy(ondev) \
+ copyout(workersize)
{
int t = q;
@@ -32,6 +34,7 @@ int main ()
t += val;
}
q = t;
+ workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
}
for (ix = 0; ix < N; ix++)
@@ -40,7 +43,7 @@ int main ()
if(ondev)
{
int g = 0;
- int w = ix % 32;
+ int w = ix % workersize;
int v = 0;
val = (g << 16) | (w << 8) | v;
@@ -8,8 +8,10 @@ int main ()
int ix;
int ondev = 0;
int t = 0, h = 0;
+ int workersize, vectorsize;
-#pragma acc parallel num_workers(32) vector_length(32) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ondev) \
+ copyout(workersize, vectorsize)
{
#pragma acc loop worker vector reduction (+:t)
for (unsigned ix = 0; ix < N; ix++)
@@ -28,6 +30,8 @@ int main ()
}
t += val;
}
+ workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+ vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
}
for (ix = 0; ix < N; ix++)
@@ -36,8 +40,8 @@ int main ()
if(ondev)
{
int g = 0;
- int w = (ix / 32) % 32;
- int v = ix % 32;
+ int w = (ix / vectorsize) % workersize;
+ int v = ix % vectorsize;
val = (g << 16) | (w << 8) | v;
}
@@ -9,11 +9,13 @@ int main ()
int ix;
int exit = 0;
int ondev = 0;
+ int vectorsize;
for (ix = 0; ix < N;ix++)
ary[ix] = -1;
-#pragma acc parallel vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel vector_length(32) copy(ary) copy(ondev) \
+ copyout(vectorsize)
{
#pragma acc loop vector
for (unsigned ix = 0; ix < N; ix++)
@@ -31,6 +33,7 @@ int main ()
else
ary[ix] = ix;
}
+ vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
}
for (ix = 0; ix < N; ix++)
@@ -40,7 +43,7 @@ int main ()
{
int g = 0;
int w = 0;
- int v = ix % 32;
+ int v = ix % vectorsize;
expected = (g << 16) | (w << 8) | v;
}
@@ -9,12 +9,14 @@ int main ()
int ix;
int exit = 0;
int ondev = 0;
+ int workersize;
for (ix = 0; ix < N;ix++)
ary[ix] = -1;
-#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
- /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 16 } */
+#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
+ copyout(workersize)
+ /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 17 } */
{
#pragma acc loop worker
for (unsigned ix = 0; ix < N; ix++)
@@ -32,6 +34,7 @@ int main ()
else
ary[ix] = ix;
}
+ workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
}
for (ix = 0; ix < N; ix++)
@@ -40,7 +43,7 @@ int main ()
if(ondev)
{
int g = 0;
- int w = ix % 32;
+ int w = ix % workersize;
int v = 0;
expected = (g << 16) | (w << 8) | v;
@@ -9,11 +9,13 @@ int main ()
int ix;
int exit = 0;
int ondev = 0;
+ int workersize, vectorsize;
for (ix = 0; ix < N;ix++)
ary[ix] = -1;
-#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
+ copyout(workersize, vectorsize)
{
#pragma acc loop worker vector
for (unsigned ix = 0; ix < N; ix++)
@@ -31,6 +33,8 @@ int main ()
else
ary[ix] = ix;
}
+ workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+ vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
}
for (ix = 0; ix < N; ix++)
@@ -39,8 +43,8 @@ int main ()
if(ondev)
{
int g = 0;
- int w = (ix / 32) % 32;
- int v = ix % 32;
+ int w = (ix / vectorsize) % workersize;
+ int v = ix % vectorsize;
expected = (g << 16) | (w << 8) | v;
}
@@ -12,7 +12,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
{
if (acc_on_device ((int) acc_device_host))
return 0;
- else if (acc_on_device ((int) acc_device_nvidia))
+ else if (acc_on_device ((int) acc_device_nvidia)
+ || acc_on_device ((int) acc_device_gcn))
return __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
else
__builtin_abort ();
@@ -23,7 +24,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
{
if (acc_on_device ((int) acc_device_host))
return 0;
- else if (acc_on_device ((int) acc_device_nvidia))
+ else if (acc_on_device ((int) acc_device_nvidia)
+ || acc_on_device ((int) acc_device_gcn))
return __builtin_goacc_parlevel_id (GOMP_DIM_WORKER);
else
__builtin_abort ();
@@ -34,7 +36,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
{
if (acc_on_device ((int) acc_device_host))
return 0;
- else if (acc_on_device ((int) acc_device_nvidia))
+ else if (acc_on_device ((int) acc_device_nvidia)
+ || acc_on_device ((int) acc_device_gcn))
return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);
else
__builtin_abort ();
@@ -177,9 +180,8 @@ int main ()
if (vectors_actual != 32)
__builtin_abort ();
}
- else
- if (vectors_actual != 1)
- __builtin_abort ();
+ else if (vectors_actual != 1)
+ __builtin_abort ();
if (gangs_min != 0 || gangs_max != 0
|| workers_min != 0 || workers_max != 0
|| vectors_min != 0 || vectors_max != vectors_actual - 1)
@@ -325,6 +327,10 @@ int main ()
/* We're actually executing with num_workers (32). */
/* workers_actual = 32; */
}
+ else if (acc_on_device (acc_device_gcn))
+ {
+ workers_actual = 4;
+ }
else
__builtin_abort ();
#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
@@ -404,6 +410,13 @@ int main ()
/* The GCC nvptx back end enforces vector_length (32). */
vectors_actual = 32;
}
+ else if (acc_on_device (acc_device_gcn))
+ {
+ /* Because of the way vectors are implemented for GCN, a vector loop
+ containing a seq routine call will not vectorize calls to that
+ routine. Hence, we'll only get one "vector". */
+ vectors_actual = 1;
+ }
else
__builtin_abort ();
#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
@@ -430,6 +443,9 @@ int main ()
in the following case. So, limit ourselves here. */
if (acc_get_device_type () == acc_device_nvidia)
gangs = 3;
+ /* The same appears to be true for GCN. */
+ if (acc_get_device_type () == acc_device_gcn)
+ gangs = 3;
int gangs_actual = gangs;
#define WORKERS 3
int workers_actual = WORKERS;
@@ -456,6 +472,11 @@ int main ()
/* The GCC nvptx back end enforces vector_length (32). */
vectors_actual = 32;
}
+ else if (acc_on_device (acc_device_gcn))
+ {
+ /* See above comments about GCN vectors_actual. */
+ vectors_actual = 1;
+ }
else
__builtin_abort ();
#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
new file mode 100644
@@ -0,0 +1,217 @@
+#include <assert.h>
+
+/* Worker propagation: plain scalar variables (X below). */
+
+void
+worker_bcast_1 (void)
+{
+ int i, arr[32 * 32];
+
+ for (i = 0; i < 32 * 32; i++)
+ arr[i] = i; /* Seed every element with its own index. */
+
+ #pragma acc parallel copy(arr) num_gangs(32) num_workers(32)
+ {
+ #pragma acc loop gang
+ for (i = 0; i < 32; i++)
+ {
+ int j;
+ int x = (i ^ 3) * 3; /* Set outside the worker loops, once per I. */
+
+ #pragma acc loop worker
+ for (j = 0; j < 32; j++)
+ arr[i * 32 + j] += x * j; /* First addition uses the initial X. */
+
+ x = (i | 5) * 5; /* New value that the second worker loop reads. */
+
+ #pragma acc loop worker
+ for (j = 0; j < 32; j++)
+ arr[i * 32 + j] += x * j; /* Second addition uses the updated X. */
+ }
+ }
+
+ for (i = 0; i < 32; i++) /* Verify after the region. */
+ for (int j = 0; j < 32; j++)
+ {
+ int idx = i * 32 + j;
+ assert (arr[idx] == idx + (i ^ 3) * 3 * j + (i | 5) * 5 * j);
+ }
+}
+
+#pragma acc routine seq
+__attribute__((noinline)) static int
+select_var (int s, int x, int y)
+{
+  /* Return X when S is nonzero and Y otherwise.  The noinline attribute
+     keeps this an out-of-line call at each use.  */
+  int chosen = s ? x : y;
+  return chosen;
+}
+
+/* Worker propagation: scalars passed by value through function calls. */
+
+void
+worker_bcast_2 (void)
+{
+  int i, arr[32 * 32];
+
+  for (i = 0; i < 32 * 32; i++)
+    arr[i] = i;  /* Seed every element with its own index.  */
+
+  #pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
+  {
+    int j;
+
+    #pragma acc loop gang
+    for (i = 0; i < 32; i++)
+      {
+        int x, y;  /* Assigned once per I, outside the worker loops.  */
+
+        x = i * 5;
+        y = i * 7;
+
+        #pragma acc loop worker
+        for (j = 0; j < 32; j++)
+          arr[i * 32 + j] += select_var (j & 1, x, y) * j;
+
+        #pragma acc loop worker vector
+        for (j = 0; j < 32; j++)
+          arr[i * 32 + j] += select_var (j & 1, y, x) * j;
+      }
+  }
+
+  for (i = 0; i < 32; i++)  /* Verify after the region.  */
+    for (int j = 0; j < 32; j++)
+      {
+        int idx = i * 32 + j;
+        int answer = idx + ((j & 1) ? i * 5 : i * 7) * j
+                     + ((j & 1) ? i * 7 : i * 5) * j;
+        assert (arr[idx] == answer);
+      }
+}
+
+#pragma acc routine seq
+__attribute__((noinline)) static int
+select_addr (int s, int *x, int *y)
+{
+  /* Load through X when S is nonzero and through Y otherwise; only the
+     chosen pointer is dereferenced.  */
+  int *chosen = s ? x : y;
+  return *chosen;
+}
+
+/* Worker propagation: addresses of locals through function calls. */
+
+void
+worker_bcast_3 (void)
+{
+  int i, arr[32 * 32];
+
+  for (i = 0; i < 32 * 32; i++)
+    arr[i] = i;  /* Seed every element with its own index.  */
+
+  #pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
+  {
+    int j;
+
+    #pragma acc loop gang
+    for (i = 0; i < 32; i++)
+      {
+        int x, y;  /* Their addresses are taken inside the worker loops.  */
+
+        x = i * 5;
+        y = i * 7;
+
+        #pragma acc loop worker
+        for (j = 0; j < 32; j++)
+          arr[i * 32 + j] += select_addr (j & 1, &x, &y) * j;
+
+        #pragma acc loop worker vector
+        for (j = 0; j < 32; j++)
+          arr[i * 32 + j] += select_addr (j & 1, &y, &x) * j;
+      }
+  }
+
+  for (i = 0; i < 32; i++)  /* Verify after the region.  */
+    for (int j = 0; j < 32; j++)
+      {
+        int idx = i * 32 + j;
+        int answer = idx + ((j & 1) ? i * 5 : i * 7) * j
+                     + ((j & 1) ? i * 7 : i * 5) * j;
+        assert (arr[idx] == answer);
+      }
+}
+
+#pragma acc routine seq
+__attribute__((noinline)) static int *
+select_ptr (int s, int *x, int *y)
+{
+  /* Hand back X itself when S is nonzero and Y otherwise; neither
+     pointer is dereferenced here.  */
+  int *chosen = s ? x : y;
+  return chosen;
+}
+
+/* Worker propagation: writes through pointers. */
+
+void
+worker_bcast_4 (void)
+{
+  int i, arr[32 * 32];
+
+  for (i = 0; i < 32 * 32; i++)
+    arr[i] = i;  /* Seed every element with its own index.  */
+
+  #pragma acc parallel copy(arr) num_gangs(32) num_workers(32)
+  {
+    int j;
+
+    #pragma acc loop gang
+    for (i = 0; i < 32; i++)
+      {
+        int x, y;
+        int *p, *q, *r;
+
+        p = &x;
+        q = &y;
+
+        x = i * 5;
+        y = i * 7;
+        r = select_ptr (i & 1, p, q);  /* &x if I is odd, else &y.  */
+
+        #pragma acc loop worker
+        for (j = 0; j < 32; j++)
+          arr[i * 32 + j] += (x + y + 2 * (*p) + (*q)) * j;
+
+        /* This write can affect either x or y: both should be broadcast into
+           the next loop. */
+        (*r) += 20;
+
+        #pragma acc loop worker
+        for (j = 0; j < 32; j++)
+          arr[i * 32 + j] += (x + y + 2 * (*p) + (*q)) * j;
+      }
+  }
+
+  for (i = 0; i < 32; i++)  /* Verify after the region.  */
+    for (int j = 0; j < 32; j++)
+      {
+        int idx = i * 32 + j;
+        int x = i * 5, y = i * 7;
+        int answer = idx + (3 * x + 2 * y) * j
+                     + ((i & 1) ? (3 * (x + 20) + 2 * y)
+                        : (3 * x + 2 * (y + 20))) * j;
+        assert (arr[idx] == answer);
+      }
+}
+
+
+int main ()
+{
+  /* Run the scenarios in order; each one asserts on its own results.  */
+  void (*const scenarios[]) (void)
+    = { worker_bcast_1, worker_bcast_2, worker_bcast_3, worker_bcast_4 };
+  for (unsigned int n = 0; n < sizeof scenarios / sizeof scenarios[0]; n++)
+    scenarios[n] ();
+  return 0;
+}
@@ -30,14 +30,18 @@ int main ()
int ix;
int exit = 0;
int ondev = 0;
+ int gangsize, workersize, vectorsize;
for (ix = 0; ix < N;ix++)
ary[ix] = -1;
-#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ary) copy(ondev) copyout(gangsize, workersize, vectorsize)
{
ondev = acc_on_device (acc_device_not_host);
gang (ary);
+ gangsize = __builtin_goacc_parlevel_size (GOMP_DIM_GANG);
+ workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+ vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
}
for (ix = 0; ix < N; ix++)
@@ -45,11 +49,12 @@ int main ()
int expected = ix;
if(ondev)
{
- int chunk_size = (N + 32*32*32 - 1) / (32*32*32);
+ int chunk_size = (N + gangsize * workersize * vectorsize - 1)
+ / (gangsize * workersize * vectorsize);
- int g = ix / (chunk_size * 32 * 32);
- int w = ix / 32 % 32;
- int v = ix % 32;
+ int g = ix / (chunk_size * vectorsize * workersize);
+ int w = (ix / vectorsize) % workersize;
+ int v = ix % vectorsize;
expected = (g << 16) | (w << 8) | v;
}
@@ -30,14 +30,17 @@ int main ()
int ix;
int exit = 0;
int ondev = 0;
+ int vectorsize;
for (ix = 0; ix < N;ix++)
ary[ix] = -1;
-#pragma acc parallel vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel vector_length(32) copy(ary) copy(ondev) \
+ copyout(vectorsize)
{
ondev = acc_on_device (acc_device_not_host);
vector (ary);
+ vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
}
for (ix = 0; ix < N; ix++)
@@ -47,7 +50,7 @@ int main ()
{
int g = 0;
int w = 0;
- int v = ix % 32;
+ int v = ix % vectorsize;
expected = (g << 16) | (w << 8) | v;
}
@@ -31,14 +31,17 @@ int main ()
int ix;
int exit = 0;
int ondev = 0;
+ int workersize;
for (ix = 0; ix < N;ix++)
ary[ix] = -1;
-#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
+ copyout(workersize)
{
ondev = acc_on_device (acc_device_not_host);
worker (ary);
+ workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
}
for (ix = 0; ix < N; ix++)
@@ -47,7 +50,7 @@ int main ()
if(ondev)
{
int g = 0;
- int w = ix % 32;
+ int w = ix % workersize;
int v = 0;
expected = (g << 16) | (w << 8) | v;
@@ -30,14 +30,18 @@ int main ()
int ix;
int exit = 0;
int ondev = 0;
+ int workersize, vectorsize;
for (ix = 0; ix < N;ix++)
ary[ix] = -1;
-#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
+ copyout(workersize, vectorsize)
{
ondev = acc_on_device (acc_device_not_host);
worker (ary);
+ workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+ vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
}
for (ix = 0; ix < N; ix++)
@@ -46,8 +50,8 @@ int main ()
if(ondev)
{
int g = 0;
- int w = (ix / 32) % 32;
- int v = ix % 32;
+ int w = (ix / vectorsize) % workersize;
+ int v = ix % vectorsize;
expected = (g << 16) | (w << 8) | v;
}
@@ -2,8 +2,14 @@
#include <openacc.h>
#include <gomp-constants.h>
+#ifdef ACC_DEVICE_TYPE_gcn
+/* FIXME: Max. number of workers may increase for GCN in the future. */
+#define NUM_WORKERS 4
+#define NUM_VECTORS 1
+#else
#define NUM_WORKERS 16
#define NUM_VECTORS 32
+#endif
#define WIDTH 64
#define HEIGHT 32
@@ -37,7 +43,8 @@ int DoWorkVec (int nw)
ary[ix][jx] = 0xdeadbeef;
printf ("spawning %d ...", nw); fflush (stdout);
-
+
+/* { dg-warning "region contains vector partitioned code but is not vector partitioned" "vector" { target openacc_amdgcn_accel_selected } 48 } */
#pragma acc parallel num_workers(nw) vector_length (NUM_VECTORS) copy (ary)
{
WorkVec ((int *)ary, WIDTH, HEIGHT, nw, NUM_VECTORS);
@@ -11,7 +11,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
{
if (acc_on_device ((int) acc_device_host))
return 0;
- else if (acc_on_device ((int) acc_device_nvidia))
+ else if (acc_on_device ((int) acc_device_nvidia)
+ || acc_on_device ((int) acc_device_gcn))
return __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
else
__builtin_abort ();
@@ -22,7 +23,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
{
if (acc_on_device ((int) acc_device_host))
return 0;
- else if (acc_on_device ((int) acc_device_nvidia))
+ else if (acc_on_device ((int) acc_device_nvidia)
+ || acc_on_device ((int) acc_device_gcn))
return __builtin_goacc_parlevel_id (GOMP_DIM_WORKER);
else
__builtin_abort ();
@@ -33,7 +35,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
{
if (acc_on_device ((int) acc_device_host))
return 0;
- else if (acc_on_device ((int) acc_device_nvidia))
+ else if (acc_on_device ((int) acc_device_nvidia)
+ || acc_on_device ((int) acc_device_gcn))
return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);
else
__builtin_abort ();
@@ -1,3 +1,6 @@
+/* AMD GCN does not use 32-lane vectors.
+ { dg-skip-if "unsuitable dimensions" { openacc_amdgcn_accel_selected } { "*" } { "" } } */
+
/* { dg-additional-options "-fopenacc-dim=32" } */
#include <stdio.h>
@@ -72,6 +72,10 @@ foreach offload_target [concat [split $offload_targets ":"] "disable"] {
set acc_mem_shared 0
}
+ amdgcn* {
+ set acc_mem_shared 0
+ set tagopt "-DACC_DEVICE_TYPE_gcn=\"$offload_target_openacc\""
+ }
default {
error "Unknown OpenACC device type: $openacc_device_type (offload target: $offload_target)"
}
@@ -26,6 +26,9 @@ int main ()
#if defined ACC_DEVICE_TYPE_nvidia
offload_target_requested = ACC_DEVICE_TYPE_nvidia;
acc_device_type_requested = acc_device_nvidia;
+#elif defined ACC_DEVICE_TYPE_gcn
+ offload_target_requested = ACC_DEVICE_TYPE_gcn;
+ acc_device_type_requested = acc_device_gcn;
#elif defined ACC_DEVICE_TYPE_host
offload_target_requested = ACC_DEVICE_TYPE_host;
acc_device_type_requested = acc_device_host;
new file mode 100644
@@ -0,0 +1,17 @@
+/* Ensure that printf on the offload device works. */
+
+/* { dg-do run } */
+/* { dg-output "The answer is 42(\n|\r\n|\r)+" } */
+
+#include <stdio.h>
+
+int var = 42;
+
+int
+main ()
+{
+#pragma acc parallel
+ {
+ printf ("The answer is %d\n", var); /* Checked by dg-output above. */
+ }
+}
@@ -275,7 +275,7 @@ program main
if (ltmp .neqv. .not. lexp) STOP 33
if (lgot .neqv. lexp) STOP 34
- igot = 1
+ igot = 0
iexp = N
!$acc parallel loop copy (igot, itmp)
@@ -287,12 +287,16 @@ program main
end do
!$acc end parallel loop
+ itmp = 0
+ do i = 1, N
+ if (iarr(i) == 0 .and. itmp == 0) itmp = i
+ end do
do i = 1, N
- if (.not. (1 <= iarr(i) .and. iarr(i) < iexp)) STOP 35
+ if (iarr(i) == 0 .and. i /= itmp) STOP 35
end do
if (igot /= iexp) STOP 36
- igot = N
+ igot = N + 1
iexp = 1
!$acc parallel loop copy (igot, itmp)
@@ -304,8 +308,12 @@ program main
end do
!$acc end parallel loop
+ itmp = 0
do i = 1, N
- if (.not. (iarr(i) == 1 .or. iarr(i) == N)) STOP 37
+ if (iarr(i) == N + 1 .and. itmp == 0) itmp = i
+ end do
+ do i = 1, N
+ if (iarr(i) == N + 1 .and. i /= itmp) STOP 37
end do
if (igot /= iexp) STOP 38
@@ -314,7 +322,7 @@ program main
!$acc parallel loop copy (igot, itmp)
do i = 0, N - 1
- iexpr = ibclr (-2, i)
+ iexpr = ibclr (-1, i)
!$acc atomic capture
iarr(i) = igot
igot = iand (igot, iexpr)
@@ -322,9 +330,13 @@ program main
end do
!$acc end parallel loop
- do i = 1, N
- if (.not. (iarr(i - 1) < 0)) STOP 39
+ itmp = 0
+ do i = 0, N - 1
+ do j = 0, N - 1
+ if (btest (iarr(i), j)) itmp = itmp + 1
+ end do
end do
+ if (itmp /= 528) STOP 39
if (igot /= iexp) STOP 40
igot = 0
@@ -340,10 +352,14 @@ program main
end do
!$acc end parallel loop
- do i = 1, N
- if (.not. (iarr(i - 1) >= 0)) STOP 41
+ itmp = 0
+ do i = 0, N - 1
+ do j = 0, N - 1
+ if (btest (iarr(i), j)) itmp = itmp + 1
+ end do
end do
- if (igot /= iexp) STOP 42
+ if (igot /= iexp) STOP 41
+ if (itmp /= 496) STOP 42
igot = -1
iexp = 0
@@ -358,12 +374,16 @@ program main
end do
!$acc end parallel loop
- do i = 1, N
- if (.not. (iarr(i - 1) < 0)) STOP 43
+ itmp = 0
+ do i = 0, N - 1
+ do j = 0, N - 1
+ if (btest (iarr(i), j)) itmp = itmp + 1
+ end do
end do
- if (igot /= iexp) STOP 44
+ if (igot /= iexp) STOP 43
+ if (itmp /= 528) STOP 44
- igot = 1
+ igot = 0
iexp = N
!$acc parallel loop copy (igot, itmp)
@@ -375,12 +395,16 @@ program main
end do
!$acc end parallel loop
+ itmp = 0
+ do i = 1, N
+ if (iarr(i) == 0 .and. itmp == 0) itmp = i
+ end do
do i = 1, N
- if (.not. (1 <= iarr(i) .and. iarr(i) < iexp)) STOP 45
+ if (iarr(i) == 0 .and. itmp /= i) STOP 45
end do
if (igot /= iexp) STOP 46
- igot = N
+ igot = N + 1
iexp = 1
!$acc parallel loop copy (igot, itmp)
@@ -392,8 +416,12 @@ program main
end do
!$acc end parallel loop
+ itmp = 0
do i = 1, N
- if (.not. (iarr(i) == 1 .or. iarr(i) == N)) STOP 47
+ if (iarr(i) == N + 1 .and. itmp == 0) itmp = i
+ end do
+ do i = 1, N
+ if (iarr(i) == N + 1 .and. i /= itmp) STOP 47
end do
if (igot /= iexp) STOP 48
@@ -402,7 +430,7 @@ program main
!$acc parallel loop copy (igot, itmp)
do i = 0, N - 1
- iexpr = ibclr (-2, i)
+ iexpr = ibclr (-1, i)
!$acc atomic capture
iarr(i) = igot
igot = iand (iexpr, igot)
@@ -410,14 +438,18 @@ program main
end do
!$acc end parallel loop
- do i = 1, N
- if (.not. (iarr(i - 1) < 0)) STOP 49
+ itmp = 0
+ do i = 0, N - 1
+ do j = 0, N - 1
+ if (btest (iarr(i), j)) itmp = itmp + 1
+ end do
end do
+ if (itmp /= 528) STOP 49
if (igot /= iexp) STOP 50
igot = 0
iexp = -1
- !!
+
!$acc parallel loop copy (igot, itmp)
do i = 0, N - 1
iexpr = lshift (1, i)
@@ -428,10 +460,14 @@ program main
end do
!$acc end parallel loop
- do i = 1, N
- if (.not. (iarr(i - 1) >= 0)) STOP 51
+ itmp = 0
+ do i = 0, N - 1
+ do j = 0, N - 1
+ if (btest (iarr(i), j)) itmp = itmp + 1
+ end do
end do
- if (igot /= iexp) STOP 52
+ if (igot /= iexp) STOP 51
+ if (itmp /= 496) STOP 52
igot = -1
iexp = 0
@@ -446,10 +482,14 @@ program main
end do
!$acc end parallel loop
- do i = 1, N
- if (.not. (iarr(i - 1) < 0)) STOP 53
+ itmp = 0
+ do i = 0, N - 1
+ do j = 0, N - 1
+ if (btest (iarr(i), j)) itmp = itmp + 1
+ end do
end do
- if (igot /= iexp) STOP 54
+ if (igot /= iexp) STOP 53
+ if (itmp /= 528) STOP 54
fgot = 1234.0
fexp = 1266.0
@@ -720,7 +760,7 @@ program main
end do
if (igot /= iexp) STOP 88
- igot = N
+ igot = N + 1
iexp = 1
!$acc parallel loop copy (igot, itmp)
@@ -733,7 +773,7 @@ program main
!$acc end parallel loop
do i = 1, N
- if (.not. (iarr(i) == iexp)) STOP 89
+ if (iarr(i) .lt. 1 .or. iarr(i) .gt. N) STOP 89
end do
if (igot /= iexp) STOP 90
@@ -742,7 +782,7 @@ program main
!$acc parallel loop copy (igot, itmp)
do i = 0, N - 1
- iexpr = ibclr (-2, i)
+ iexpr = ibclr (-1, i)
!$acc atomic capture
igot = iand (igot, iexpr)
iarr(i) = igot
@@ -750,9 +790,13 @@ program main
end do
!$acc end parallel loop
- do i = 1, N
- if (.not. (iarr(i - 1) <= 0)) STOP 91
+ itmp = 0
+ do i = 0, N - 1
+ do j = 0, N - 1
+ if (btest (iarr(i), j)) itmp = itmp + 1
+ end do
end do
+ if (itmp /= 496) STOP 91
if (igot /= iexp) STOP 92
igot = 0
@@ -768,9 +812,13 @@ program main
end do
!$acc end parallel loop
- do i = 1, N
- if (.not. (iarr(i - 1) >= -1)) STOP 93
+ itmp = 0
+ do i = 0, N - 1
+ do j = 0, N - 1
+ if (btest (iarr(i), j)) itmp = itmp + 1
+ end do
end do
+ if (itmp /= 528) STOP 93
if (igot /= iexp) STOP 94
igot = -1
@@ -786,9 +834,13 @@ program main
end do
!$acc end parallel loop
- do i = 1, N
- if (.not. (iarr(i - 1) <= 0)) STOP 95
+ itmp = 0
+ do i = 0, N - 1
+ do j = 0, N - 1
+ if (btest (iarr(i), j)) itmp = itmp + 1
+ end do
end do
+ if (itmp /= 496) STOP 95
if (igot /= iexp) STOP 96
igot = 1
@@ -808,7 +860,7 @@ program main
end do
if (igot /= iexp) STOP 98
- igot = N
+ igot = N + 1
iexp = 1
!$acc parallel loop copy (igot, itmp)
@@ -821,7 +873,7 @@ program main
!$acc end parallel loop
do i = 1, N
- if (.not. (iarr(i) == iexp )) STOP 99
+ if (iarr(i) .lt. 1 .or. iarr(i) .gt. N) STOP 99
end do
if (igot /= iexp) STOP 100
@@ -830,7 +882,7 @@ program main
!$acc parallel loop copy (igot, itmp)
do i = 0, N - 1
- iexpr = ibclr (-2, i)
+ iexpr = ibclr (-1, i)
!$acc atomic capture
igot = iand (iexpr, igot)
iarr(i) = igot
@@ -838,9 +890,13 @@ program main
end do
!$acc end parallel loop
- do i = 1, N
- if (.not. (iarr(i - 1) <= 0)) STOP 101
+ itmp = 0
+ do i = 0, N - 1
+ do j = 0, N - 1
+ if (btest (iarr(i), j)) itmp = itmp + 1
+ end do
end do
+ if (itmp /= 496) STOP 101
if (igot /= iexp) STOP 102
igot = 0
@@ -856,9 +912,13 @@ program main
end do
!$acc end parallel loop
- do i = 1, N
- if (.not. (iarr(i - 1) >= iexp)) STOP 103
+ itmp = 0
+ do i = 0, N - 1
+ do j = 0, N - 1
+ if (btest (iarr(i), j)) itmp = itmp + 1
+ end do
end do
+ if (itmp /= 528) STOP 103
if (igot /= iexp) STOP 104
igot = -1
@@ -874,9 +934,12 @@ program main
end do
!$acc end parallel loop
- do i = 1, N
- if (.not. (iarr(i - 1) <= iexp)) STOP 105
+ itmp = 0
+ do i = 0, N - 1
+ do j = 0, N - 1
+ if (btest (iarr(i), j)) itmp = itmp + 1
+ end do
end do
+ if (itmp /= 496) STOP 105
if (igot /= iexp) STOP 106
-
end program
@@ -6,7 +6,7 @@ program collapse1
l = .false.
a(:, :, :) = 0
!$acc parallel
- !$acc loop collapse(4 - 1)
+ !$acc loop collapse(4 - 1) gang(static:*)
do i = 1, 3
do j = 4, 6
do k = 5, 7
@@ -14,7 +14,7 @@ program collapse1
end do
end do
end do
- !$acc loop collapse(2) reduction(.or.:l)
+ !$acc loop collapse(3) gang(static:*) reduction(.or.:l)
do i = 1, 3
do j = 4, 6
do k = 5, 7
@@ -7,13 +7,13 @@ program collapse2
l = .false.
a(:, :, :) = 0
!$acc parallel
- !$acc loop collapse(4 - 1)
+ !$acc loop collapse(4 - 1) gang(static:*)
do 164 i = 1, 3
do 164 j = 4, 6
do 164 k = 5, 7
a(i, j, k) = i + j + k
164 end do
- !$acc loop collapse(2) reduction(.or.:l)
+ !$acc loop collapse(3) gang(static:*) reduction(.or.:l)
firstdo: do i = 1, 3
do j = 4, 6
do k = 5, 7
@@ -15,6 +15,6 @@
! { dg-output "ERROR STOP (\n|\r\n|\r)+" }
! PR85463. The "minimal" libgfortran implementation used with nvptx
! offloading is a little bit different.
-! { dg-output "Error termination.*" { target { ! openacc_nvidia_accel_selected } } }
+! { dg-output "Error termination.*" { target { { ! openacc_nvidia_accel_selected } && { ! openacc_amdgcn_accel_selected } } } }
! { dg-output "libgomp: cuStreamSynchronize error.*" { target openacc_nvidia_accel_selected } }
! { dg-shouldfail "" }
@@ -15,6 +15,6 @@
! { dg-output "ERROR STOP 35(\n|\r\n|\r)+" }
! PR85463. The "minimal" libgfortran implementation used with nvptx
! offloading is a little bit different.
-! { dg-output "Error termination.*" { target { ! openacc_nvidia_accel_selected } } }
+! { dg-output "Error termination.*" { target { { ! openacc_nvidia_accel_selected } && { ! openacc_amdgcn_accel_selected } } } }
! { dg-output "libgomp: cuStreamSynchronize error.*" { target openacc_nvidia_accel_selected } }
! { dg-shouldfail "" }
@@ -15,6 +15,6 @@
! { dg-output "ERROR STOP SiGN(\n|\r\n|\r)+" }
! PR85463. The "minimal" libgfortran implementation used with nvptx
! offloading is a little bit different.
-! { dg-output "Error termination.*" { target { ! openacc_nvidia_accel_selected } } }
+! { dg-output "Error termination.*" { target { { ! openacc_nvidia_accel_selected } && { ! openacc_amdgcn_accel_selected } } } }
! { dg-output "libgomp: cuStreamSynchronize error.*" { target openacc_nvidia_accel_selected } }
! { dg-shouldfail "" }
@@ -90,6 +90,10 @@ if { $lang_test_file_found } {
set acc_mem_shared 0
}
+ amdgcn* {
+ set acc_mem_shared 0
+ set tagopt "-DACC_DEVICE_TYPE_gcn=\"$offload_target_openacc\""
+ }
default {
error "Unknown OpenACC device type: $openacc_device_type (offload target: $offload_target)"
}
new file mode 100644
@@ -0,0 +1,15 @@
+! Ensure that formatted output (write) on the offload device works.
+
+! { dg-do run }
+! { dg-output "The answer is 42(\n|\r\n|\r)+" }
+! { dg-xfail-if "no write for nvidia" { openacc_nvidia_accel_selected } }
+
+program main
+ implicit none
+ integer :: var = 42 ! Value written from inside the parallel region
+
+!$acc parallel
+ write (0, '("The answer is ", I2)') var ! Text is matched by dg-output above
+!$acc end parallel
+
+end program main