commit 48ae7eecfbca988d1bd85e28d2ee52bb2ebb7e27
Author: Julian Brown <julian@codesourcery.com>
Date: Thu Nov 13 04:21:00 2014 -0800
OpenACC support for libgomp.
xxxx-xx-xx Nathan Sidwell <nathan@codesourcery.com>
James Norris <jnorris@codesourcery.com>
Thomas Schwinge <thomas@codesourcery.com>
Tom de Vries <tom@codesourcery.com>
Julian Brown <julian@codesourcery.com>
Bernd Schmidt <bernds@codesourcery.com>
Cesar Philippidis <cesar@codesourcery.com>
include/
* gomp-constants.h: New file.
libgomp/
* Makefile.am (search_path): Search in $(top_srcidr)/../include also.
(libgomp_la_SOURCES): Add oacc-parallel.c, splay-tree.c,
oacc-fortran.c, oacc-host.c, oacc-init.c, oacc-mem.c,
oacc-async.c, oacc-plugin.c, oacc-cuda.c, libgomp-plugin.c.
(Makefrag.am): Include.
(libgomp_la_SOURCES): Add openacc.f90 if USE_FORTRAN is true.
(nodist_libsubinclude_HEADERS): Add openacc.h, ../include/gomp-constants.h.
(nodist_finclude_HEADERS): Add openacc_lib.h, openacc.f90, openacc.mod,
openacc_kinds.mod.
* configure.ac (plugin_support): Add check for accelerators if attempting
to build plugins.
(plugin/configfrag.ac): Include.
(offload_targets): Add host_nonshm target by default, nvptx target
conditionally if the corresponding offload target is enabled.
(testsuite/libgomp-test-support.exp): Add to AC_CONFIG_FILES.
* env.c (libgomp_target.h, oacc-int.h): Include.
(goacc_notify_var, goacc_device_num, goacc_device_type): New globals.
(goacc_parse_device_type): New functions.
(initialize_env): Parse GCC_ACC_NOTIFY, ACC_DEVICE_TYPE, ACC_DEVICE_NUM
environment variables. Call ACC_runtime_initialize.
* error.c (gomp_verror): Make global.
(gomp_vfatal, gomp_vnotify, gomp_notify): New functions.
(gomp_fatal): Use gomp_vfatal instead of gomp_verror.
* libgomp.h (stdarg.h): Include.
(struct gomp_memory_mapping): Forward declaration.
(goacc_notify_var, goacc_device_num, goacc_device_type): Add extern
declarations.
(gomp_vnotify, gomp_notify, gomp_verror, gomp_vfatal): Add
prototypes.
(gomp_init_targets_once): Add prototype.
* libgomp.map (OACC_2.0): New symbol version. Add public acc_*
interface functions.
(PLUGIN_1.0): New symbol version. Add gomp plugin interface functions.
* libgomp_g.h (GOACC_data_start, GOACC_data_end, GOACC_kernels)
(GOACC_parallel, GOACC_wait): Add prototypes.
* libgomp_target.h (gomp-constants.h, splay-tree.h): Include.
(offload_target_type): Set enumeration values from constants in
gomp-constants.h. Add OFFLOAD_TARGET_TYPE_HOST_NONSHM and
OFFLOAD_TARGET_TYPE_NVIDIA_PTX.
(struct target_mem_desc): Move to here.
(TARGET_CAP_SHARED_MEM, TARGET_CAP_NATIVE_EXEC, TARGET_CAP_OPENMP_400)
(TARGET_CAP_OPENACC_200): Define macros.
(struct gomp_memory_mapping): New.
(struct ACC_dispatch_t): New.
(struct gomp_device_descr): Move here. Add offload_regions_registered,
openacc dispatch functions, target_data.
(gomp_map_vars, gomp_copy_from_async, gomp_unmap_vars, gomp_init_device)
(gomp_init_tables, gomp_fini_device, gomp_free_memmap): Add prototypes.
* target.c (oacc-plugin.h, gomp-constants.h, oacc-int.h, stdio.h)
(assert.h): Include.
(splay_tree_node, splay_tree, splay_tree_key, target_mem_desc)
(splay_tree_key_s, gomp_device_descr): Don't declare here.
(splay_compare): Change linkage to hidden not static.
(gomp_init_targets_once): New function.
(gomp_get_num_devices): Use above.
(get_kind): New function.
(gomp_map_vars): Add is_openacc parameter. Change KINDS to void *. Use lock
from memory map not device. Use macros from gomp-constants.h instead of
hard-coded values. Support OpenACC-specific mappings.
(gomp_copy_from_async): New function.
(gomp_unmap_vars): Add DO_COPYFROM argument. Only copy memory
back from device if it is true. Use lock from memory map not
device.
(gomp_update): Add is_openacc parameter. Use lock from memory map not
device. Use macros from gomp-constants.h instead of hard-coded values.
(gomp_register_image_for_device): Add forward declaration.
(GOMP_offload_register): Check realloc result.
(gomp_init_device): Change linkage to hidden not static.
(gomp_init_tables, gomp_init_dev_tables, gomp_free_memmap)
(gomp_fini_device): New function.
(GOMP_target): Adjust lazy initialization, check target
capabilities for OpenMP 4.0 support. Update call to gomp_map_vars,
gomp_unmap_vars.
(GOMP_target_data): Adjust lazy initialization. Update call to
gomp_map_vars.
(GOMP_target_end_data): Update call to gomp_unmap_vars.
(GOMP_target_update): Tweak lazy initialization. Add new args to
gomp_update call.
(gomp_load_plugin_for_device): Initialize get_name, get_caps, device_fini
and OpenACC-specific plugin hooks.
(gomp_register_images_for_device): Rename to...
(gomp_register_image_for_device): This, and register a single
device only, and only if it has not already had images
registered.
(gomp_find_available_plugins): Initialize OpenACC-specific bits, offload
image registration, and other new device member data. Prefer device with
TARGET_CAP_OPENMP_400 if more than one plugin is available.
* libgomp-plugin.c: New file.
* libgomp-plugin.h: New file.
* oacc-async.c: New file.
* oacc-cuda.c: New file.
* oacc-fortran.c: New file.
* oacc-host.c: New file.
* oacc-init.c: New file.
* oacc-int.h: New file.
* oacc-mem.c: New file.
* oacc-parallel.c: New file.
* oacc-plugin.c: New file.
* oacc-plugin.h: New file.
* openacc.f90: New file.
* openacc.h: New file.
* openacc_lib.h: New file.
* splay-tree.h: Move bulk of implementation to...
* splay-tree.c: New file.
* Makefile.in: Regenerate.
* config.h.in: Regenerate.
* configure: Regenerate.
* plugin/Makefrag.am: New file.
* plugin/configfrag.am: New file.
* plugin/plugin-host.c: New file.
* plugin/plugin-nvptx.c: New file.
* testsuite/libgomp-test-support.exp.in: New file.
add --enable-libgomp-verbose to compile-time disable notify calls
__builtin_expect for gomp_notify, when enabled
new file mode 100644
@@ -0,0 +1,45 @@
+#ifndef GOMP_CONSTANTS_H
+#define GOMP_CONSTANTS_H 1
+
+/* Enumerated variable mapping types used to communicate between GCC and
+ libgomp. These values are used for both OpenMP and OpenACC. */
+
+#define GOMP_MAP_ALLOC 0x00
+#define GOMP_MAP_ALLOC_TO 0x01
+#define GOMP_MAP_ALLOC_FROM 0x02
+#define GOMP_MAP_ALLOC_TOFROM 0x03
+#define GOMP_MAP_POINTER 0x04
+#define GOMP_MAP_TO_PSET 0x05
+#define GOMP_MAP_FORCE_ALLOC 0x08
+#define GOMP_MAP_FORCE_TO 0x09
+#define GOMP_MAP_FORCE_FROM 0x0a
+#define GOMP_MAP_FORCE_TOFROM 0x0b
+#define GOMP_MAP_FORCE_PRESENT 0x0c
+#define GOMP_MAP_FORCE_DEALLOC 0x0d
+#define GOMP_MAP_FORCE_DEVICEPTR 0x0e
+#define GOMP_MAP_FORCE_PRIVATE 0x18
+#define GOMP_MAP_FORCE_FIRSTPRIVATE 0x19
+
+#define GOMP_MAP_COPYTO_P(X) \
+ ((X) == GOMP_MAP_ALLOC_TO || (X) == GOMP_MAP_FORCE_TO)
+
+#define GOMP_MAP_COPYFROM_P(X) \
+ ((X) == GOMP_MAP_ALLOC_FROM || (X) == GOMP_MAP_FORCE_FROM)
+
+#define GOMP_MAP_TOFROM_P(X) \
+ ((X) == GOMP_MAP_ALLOC_TOFROM || (X) == GOMP_MAP_FORCE_TOFROM)
+
+#define GOMP_MAP_POINTER_P(X) \
+ ((X) == GOMP_MAP_POINTER)
+
+#define GOMP_IF_CLAUSE_FALSE -2
+
+/* Canonical list of target type codes for OpenMP/OpenACC. */
+#define GOMP_TARGET_NONE 0
+#define GOMP_TARGET_HOST 2
+#define GOMP_TARGET_HOST_NONSHM 3
+#define GOMP_TARGET_NOT_HOST 4
+#define GOMP_TARGET_NVIDIA_PTX 5
+#define GOMP_TARGET_INTEL_MIC 6
+
+#endif
@@ -7,7 +7,8 @@ SUBDIRS = testsuite
gcc_version := $(shell cat $(top_srcdir)/../gcc/BASE-VER)
config_path = @config_path@
-search_path = $(addprefix $(top_srcdir)/config/, $(config_path)) $(top_srcdir)
+search_path = $(addprefix $(top_srcdir)/config/, $(config_path)) $(top_srcdir) \
+ $(top_srcdir)/../include
fincludedir = $(libdir)/gcc/$(target_alias)/$(gcc_version)/finclude
libsubincludedir = $(libdir)/gcc/$(target_alias)/$(gcc_version)/include
@@ -18,6 +19,10 @@ AM_CPPFLAGS = $(addprefix -I, $(search_path))
AM_CFLAGS = $(XCFLAGS)
AM_LDFLAGS = $(XLDFLAGS) $(SECTION_LDFLAGS) $(OPT_LDFLAGS)
+if LIBGOMP_VERBOSE
+AM_CPPFLAGS += -DLIBGOMP_VERBOSE
+endif
+
toolexeclib_LTLIBRARIES = libgomp.la
nodist_toolexeclib_HEADERS = libgomp.spec
@@ -60,12 +65,21 @@ libgomp_la_LINK = $(LINK) $(libgomp_la_LDFLAGS)
libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \
iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c single.c \
task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \
- time.c fortran.c affinity.c target.c
+ time.c fortran.c affinity.c target.c oacc-parallel.c splay-tree.c \
+ oacc-host.c oacc-init.c oacc-mem.c oacc-async.c oacc-plugin.c \
+ oacc-cuda.c libgomp-plugin.c
+
+include $(top_srcdir)/plugin/Makefrag.am
+
+if USE_FORTRAN
+libgomp_la_SOURCES += openacc.f90
+endif
nodist_noinst_HEADERS = libgomp_f.h
-nodist_libsubinclude_HEADERS = omp.h
+nodist_libsubinclude_HEADERS = omp.h openacc.h ../include/gomp-constants.h
if USE_FORTRAN
-nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod
+nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \
+ openacc_lib.h openacc.f90 openacc.mod openacc_kinds.mod
endif
LTLDFLAGS = $(shell $(SHELL) $(top_srcdir)/../libtool-ldflags $(LDFLAGS))
@@ -2,7 +2,7 @@
# aclocal -I ../config && autoconf && autoheader && automake
AC_PREREQ(2.64)
-AC_INIT([GNU OpenMP Runtime Library], 1.0,,[libgomp])
+AC_INIT([GNU Offloading and Multi Processing Runtime Library], 1.0,,[libgomp])
AC_CONFIG_HEADER(config.h)
# -------
@@ -28,7 +28,6 @@ LIBGOMP_ENABLE(generated-files-in-srcdir, no, ,
AC_MSG_RESULT($enable_generated_files_in_srcdir)
AM_CONDITIONAL(GENINSRC, test "$enable_generated_files_in_srcdir" = yes)
-
# -------
# -------
@@ -193,13 +192,28 @@ AC_LINK_IFELSE(
[],
[AC_MSG_ERROR([Pthreads are required to build libgomp])])])
+# Enable --enable-libgomp-verbose
+AC_ARG_ENABLE(libgomp-verbose,
+[AS_HELP_STRING([--enable-libgomp-verbose],
+ [enable verbose debugging output for libgomp])],
+[case "${enableval}" in
+ yes) libgomp_verbose=true ;;
+ no) libgomp_verbose=false ;;
+ *) AC_MSG_ERROR([bad value ${enableval} for --enable-libgomp-verbose]) ;;
+esac], [libgomp_verbose=false])
+AM_CONDITIONAL([LIBGOMP_VERBOSE], [test x$libgomp_verbose = xtrue])
+
plugin_support=yes
AC_CHECK_LIB(dl, dlsym, , [plugin_support=no])
if test x"$plugin_support" = xyes; then
AC_DEFINE(PLUGIN_SUPPORT, 1,
[Define if all infrastructure, needed for plugins, is supported.])
+elif test "x$enable_accelerator" != xno; then
+ AC_MSG_ERROR([Can't have support for accelerators without support for plugins])
fi
+m4_include([plugin/configfrag.ac])
+
# Check for functions needed.
AC_CHECK_FUNCS(getloadavg clock_gettime strtoull)
@@ -283,7 +297,7 @@ fi
# Get accel target and path to install tree of accel compiler
offload_additional_options=
offload_additional_lib_paths=
-offload_targets=
+offload_targets=host_nonshm
if test x"$enable_offload_targets" != x; then
for tgt in `echo $enable_offload_targets | sed -e 's#,# #g'`; do
tgt_dir=`echo $tgt | grep '=' | sed 's/.*=//'`
@@ -291,6 +305,8 @@ if test x"$enable_offload_targets" != x; then
case $tgt in
*-intelmic-* | *-intelmicemul-*)
tgt_name="intelmic" ;;
+ nvptx-*)
+ tgt_name="nvptx" ;;
*)
AC_MSG_ERROR([unknown offload target specified]) ;;
esac
@@ -388,4 +404,5 @@ CFLAGS="$save_CFLAGS"
AC_CONFIG_FILES(omp.h omp_lib.h omp_lib.f90 libgomp_f.h)
AC_CONFIG_FILES(Makefile testsuite/Makefile libgomp.spec)
+AC_CONFIG_FILES([testsuite/libgomp-test-support.exp])
AC_OUTPUT
@@ -27,6 +27,8 @@
#include "libgomp.h"
#include "libgomp_f.h"
+#include "libgomp_target.h"
+#include "oacc-int.h"
#include <ctype.h>
#include <stdlib.h>
#include <stdio.h>
@@ -77,6 +79,10 @@ unsigned long gomp_bind_var_list_len;
void **gomp_places_list;
unsigned long gomp_places_list_len;
+int goacc_notify_var;
+int goacc_device_num;
+char* goacc_device_type;
+
/* Parse the OMP_SCHEDULE environment variable. */
static void
@@ -1011,6 +1017,16 @@ parse_affinity (bool ignore)
return false;
}
+static void
+goacc_parse_device_type (void)
+{
+ const char *env = getenv ("ACC_DEVICE_TYPE");
+
+ if (env && *env != '\0')
+ goacc_device_type = strdup (env);
+ else
+ goacc_device_type = NULL;
+}
static void
handle_omp_display_env (unsigned long stacksize, int wait_policy)
@@ -1181,6 +1197,7 @@ initialize_env (void)
gomp_global_icv.thread_limit_var
= thread_limit_var > INT_MAX ? UINT_MAX : thread_limit_var;
}
+ parse_int ("GOACC_NOTIFY", &goacc_notify_var, true);
#ifndef HAVE_SYNC_BUILTINS
gomp_mutex_init (&gomp_managed_threads_lock);
#endif
@@ -1271,6 +1288,15 @@ initialize_env (void)
}
handle_omp_display_env (stacksize, wait_policy);
+
+ /* Look for OpenACC-specific environment variables. */
+ if (!parse_int ("ACC_DEVICE_NUM", &goacc_device_num, true))
+ goacc_device_num = 0;
+
+ goacc_parse_device_type ();
+
+ /* Initialize OpenACC-specific internal state. */
+ goacc_runtime_initialize ();
}
@@ -35,7 +35,7 @@
#include <stdlib.h>
-static void
+void
gomp_verror (const char *fmt, va_list list)
{
fputs ("\nlibgomp: ", stderr);
@@ -54,13 +54,40 @@ gomp_error (const char *fmt, ...)
}
void
+gomp_vfatal (const char *fmt, va_list list)
+{
+ gomp_verror (fmt, list);
+ exit (EXIT_FAILURE);
+}
+
+void
gomp_fatal (const char *fmt, ...)
{
va_list list;
va_start (list, fmt);
- gomp_verror (fmt, list);
+ gomp_vfatal (fmt, list);
va_end (list);
+}
- exit (EXIT_FAILURE);
+#ifdef LIBGOMP_VERBOSE
+
+#undef gomp_vnotify
+void
+gomp_vnotify (const char *msg, va_list list)
+{
+ if (goacc_notify_var)
+ vfprintf (stderr, msg, list);
+}
+
+#undef gomp_notify
+void
+gomp_notify (const char *msg, ...)
+{
+ va_list list;
+
+ va_start (list, msg);
+ gomp_vnotify (msg, list);
+ va_end (list);
}
+#endif
new file mode 100644
@@ -0,0 +1,107 @@
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Exported (non-hidden) functions exposing libgomp interface for plugins. */
+
+#include <stdlib.h>
+
+#include "libgomp.h"
+#include "libgomp-plugin.h"
+#include "target.h"
+
+void *
+GOMP_PLUGIN_malloc (size_t size)
+{
+ return gomp_malloc (size);
+}
+
+void *
+GOMP_PLUGIN_malloc_cleared (size_t size)
+{
+ return gomp_malloc_cleared (size);
+}
+
+void *
+GOMP_PLUGIN_realloc (void *ptr, size_t size)
+{
+ return gomp_realloc (ptr, size);
+}
+
+void
+GOMP_PLUGIN_error (const char *msg, ...)
+{
+ va_list ap;
+
+ va_start (ap, msg);
+ gomp_verror (msg, ap);
+ va_end (ap);
+}
+
+void
+GOMP_PLUGIN_notify (const char *msg, ...)
+{
+ va_list ap;
+
+ va_start (ap, msg);
+ gomp_vnotify (msg, ap);
+ va_end (ap);
+}
+
+void
+GOMP_PLUGIN_fatal (const char *msg, ...)
+{
+ va_list ap;
+
+ va_start (ap, msg);
+ gomp_vfatal (msg, ap);
+ va_end (ap);
+
+ /* Unreachable. */
+ abort ();
+}
+
+void
+GOMP_PLUGIN_mutex_init (gomp_mutex_t *mutex)
+{
+ gomp_mutex_init (mutex);
+}
+
+void
+GOMP_PLUGIN_mutex_destroy (gomp_mutex_t *mutex)
+{
+ gomp_mutex_destroy (mutex);
+}
+
+void
+GOMP_PLUGIN_mutex_lock (gomp_mutex_t *mutex)
+{
+ gomp_mutex_lock (mutex);
+}
+
+void
+GOMP_PLUGIN_mutex_unlock (gomp_mutex_t *mutex)
+{
+ gomp_mutex_unlock (mutex);
+}
new file mode 100644
@@ -0,0 +1,54 @@
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* An interface to various libgomp-internal functions for use by plugins. */
+
+#ifndef LIBGOMP_PLUGIN_H
+#define LIBGOMP_PLUGIN_H 1
+
+#include "mutex.h"
+
+/* alloc.c */
+
+extern void *GOMP_PLUGIN_malloc (size_t) __attribute__((malloc));
+extern void *GOMP_PLUGIN_malloc_cleared (size_t) __attribute__((malloc));
+extern void *GOMP_PLUGIN_realloc (void *, size_t);
+
+/* error.c */
+
+extern void GOMP_PLUGIN_notify(const char *msg, ...);
+extern void GOMP_PLUGIN_error (const char *, ...)
+ __attribute__((format (printf, 1, 2)));
+extern void GOMP_PLUGIN_fatal (const char *, ...)
+ __attribute__((noreturn, format (printf, 1, 2)));
+
+/* mutex.c */
+
+extern void GOMP_PLUGIN_mutex_init (gomp_mutex_t *mutex);
+extern void GOMP_PLUGIN_mutex_destroy (gomp_mutex_t *mutex);
+extern void GOMP_PLUGIN_mutex_lock (gomp_mutex_t *mutex);
+extern void GOMP_PLUGIN_mutex_unlock (gomp_mutex_t *mutex);
+
+#endif
@@ -40,6 +40,7 @@
#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>
+#include <stdarg.h>
#ifdef HAVE_ATTRIBUTE_VISIBILITY
# pragma GCC visibility push(hidden)
@@ -220,6 +221,7 @@ struct gomp_team_state
};
struct target_mem_desc;
+struct gomp_memory_mapping;
/* These are the OpenMP 4.0 Internal Control Variables described in
section 2.3.1. Those described as having one copy per task are
@@ -254,6 +256,10 @@ extern unsigned long gomp_bind_var_list_len;
extern void **gomp_places_list;
extern unsigned long gomp_places_list_len;
+extern int goacc_notify_var;
+extern int goacc_device_num;
+extern char* goacc_device_type;
+
enum gomp_task_kind
{
GOMP_TASK_IMPLICIT,
@@ -532,8 +538,29 @@ extern void *gomp_realloc (void *, size_t);
/* error.c */
+#ifdef LIBGOMP_VERBOSE
+extern void gomp_vnotify (const char *, va_list);
+extern void gomp_notify (const char *msg, ...)
+ __attribute__((format (printf, 1, 2)));
+#define gomp_notify(...) \
+ do { \
+ if (__builtin_expect (goacc_notify_var, 0)) \
+ (gomp_notify) (__VA_ARGS__); \
+ } while (0)
+#define gomp_vnotify(FMT, VALIST) \
+ do { \
+ if (__builtin_expect (goacc_notify_var, 0)) \
+ (gomp_vnotify) ((FMT), (VALIST)); \
+ } while (0)
+#else
+#define gomp_vnotify(FMT, VALIST)
+#define gomp_notify(FMT, ...)
+#endif
+extern void gomp_verror (const char *, va_list);
extern void gomp_error (const char *, ...)
__attribute__((format (printf, 1, 2)));
+extern void gomp_vfatal (const char *, va_list)
+ __attribute__((noreturn));
extern void gomp_fatal (const char *, ...)
__attribute__((noreturn, format (printf, 1, 2)));
@@ -606,6 +633,7 @@ extern void gomp_free_thread (void *);
/* target.c */
+extern void gomp_init_targets_once (void);
extern int gomp_get_num_devices (void);
/* work.c */
@@ -232,3 +232,98 @@ GOMP_4.0.1 {
global:
GOMP_offload_register;
} GOMP_4.0;
+
+OACC_2.0 {
+ global:
+ acc_get_num_devices;
+ acc_set_device_type;
+ acc_get_device_type;
+ acc_set_device_num;
+ acc_get_device_num;
+ acc_async_test;
+ acc_async_test_h_;
+ acc_async_test_all;
+ acc_async_test_all_h_;
+ acc_wait;
+ acc_wait_async;
+ acc_wait_all;
+ acc_wait_all_async;
+ acc_init;
+ acc_shutdown;
+ acc_on_device;
+ acc_on_device_h_;
+ acc_malloc;
+ acc_free;
+ acc_copyin;
+ acc_copyin_32_h_;
+ acc_copyin_64_h_;
+ acc_copyin_array_h_;
+ acc_present_or_copyin;
+ acc_present_or_copyin_32_h_;
+ acc_present_or_copyin_64_h_;
+ acc_present_or_copyin_array_h_;
+ acc_create;
+ acc_create_32_h_;
+ acc_create_64_h_;
+ acc_create_array_h_;
+ acc_present_or_create;
+ acc_present_or_create_32_h_;
+ acc_present_or_create_64_h_;
+ acc_present_or_create_array_h_;
+ acc_copyout;
+ acc_copyout_32_h_;
+ acc_copyout_64_h_;
+ acc_copyout_array_h_;
+ acc_delete;
+ acc_delete_32_h_;
+ acc_delete_64_h_;
+ acc_delete_array_h_;
+ acc_update_device;
+ acc_update_device_32_h_;
+ acc_update_device_64_h_;
+ acc_update_device_array_h_;
+ acc_update_self;
+ acc_update_self_32_h_;
+ acc_update_self_64_h_;
+ acc_update_self_array_h_;
+ acc_map_data;
+ acc_unmap_data;
+ acc_deviceptr;
+ acc_hostptr;
+ acc_is_present;
+ acc_is_present_32_h_;
+ acc_is_present_64_h_;
+ acc_is_present_array_h_;
+ acc_memcpy_to_device;
+ acc_memcpy_from_device;
+ acc_get_current_cuda_device;
+ acc_get_current_cuda_context;
+ acc_get_cuda_stream;
+ acc_set_cuda_stream;
+};
+
+GOACC_2.0 {
+ global:
+ GOACC_data_end;
+ GOACC_data_start;
+ GOACC_kernels;
+ GOACC_parallel;
+ GOACC_update;
+ GOACC_wait;
+};
+
+GOMP_PLUGIN_1.0 {
+ global:
+ GOMP_PLUGIN_malloc;
+ GOMP_PLUGIN_malloc_cleared;
+ GOMP_PLUGIN_realloc;
+ GOMP_PLUGIN_error;
+ GOMP_PLUGIN_notify;
+ GOMP_PLUGIN_fatal;
+ GOMP_PLUGIN_mutex_init;
+ GOMP_PLUGIN_mutex_destroy;
+ GOMP_PLUGIN_mutex_lock;
+ GOMP_PLUGIN_mutex_unlock;
+ GOMP_PLUGIN_async_unmap_vars;
+ GOMP_PLUGIN_acc_thread;
+};
@@ -214,4 +214,17 @@ extern void GOMP_target_update (int, const void *,
size_t, void **, size_t *, unsigned char *);
extern void GOMP_teams (unsigned int, unsigned int);
+/* oacc-parallel.c */
+
+extern void GOACC_data_start (int, const void *,
+ size_t, void **, size_t *, unsigned short *);
+extern void GOACC_data_end (void);
+extern void GOACC_kernels (int, void (*) (void *), const void *,
+ size_t, void **, size_t *, unsigned short *,
+ int, int, int, int, int, ...);
+extern void GOACC_parallel (int, void (*) (void *), const void *,
+ size_t, void **, size_t *, unsigned short *,
+ int, int, int, int, int, ...);
+extern void GOACC_wait (int, int, ...);
+
#endif /* LIBGOMP_G_H */
@@ -24,11 +24,15 @@
#ifndef LIBGOMP_TARGET_H
#define LIBGOMP_TARGET_H 1
-/* Type of offload target device. */
+#include "gomp-constants.h"
+
+/* Type of offload target device. Keep in sync with openacc.h:acc_device_t. */
enum offload_target_type
{
- OFFLOAD_TARGET_TYPE_HOST,
- OFFLOAD_TARGET_TYPE_INTEL_MIC
+ OFFLOAD_TARGET_TYPE_HOST = GOMP_TARGET_HOST,
+ OFFLOAD_TARGET_TYPE_HOST_NONSHM = GOMP_TARGET_HOST_NONSHM,
+ OFFLOAD_TARGET_TYPE_NVIDIA_PTX = GOMP_TARGET_NVIDIA_PTX,
+ OFFLOAD_TARGET_TYPE_INTEL_MIC = GOMP_TARGET_INTEL_MIC
};
/* Auxiliary struct, used for transferring a host-target address range mapping
@@ -41,4 +45,177 @@ struct mapping_table
uintptr_t tgt_end;
};
+#include "splay-tree.h"
+
+struct target_mem_desc {
+ /* Reference count. */
+ uintptr_t refcount;
+ /* All the splay nodes allocated together. */
+ splay_tree_node array;
+ /* Start of the target region. */
+ uintptr_t tgt_start;
+ /* End of the targer region. */
+ uintptr_t tgt_end;
+ /* Handle to free. */
+ void *to_free;
+ /* Previous target_mem_desc. */
+ struct target_mem_desc *prev;
+ /* Number of items in following list. */
+ size_t list_count;
+
+ /* Corresponding target device descriptor. */
+ struct gomp_device_descr *device_descr;
+
+ /* Memory mapping info for the thread that created this descriptor. */
+ struct gomp_memory_mapping *mem_map;
+
+ /* List of splay keys to remove (or decrease refcount)
+ at the end of region. */
+ splay_tree_key list[];
+};
+
+#define TARGET_CAP_SHARED_MEM 1
+#define TARGET_CAP_NATIVE_EXEC 2
+#define TARGET_CAP_OPENMP_400 4
+#define TARGET_CAP_OPENACC_200 8
+
+/* Information about mapped memory regions (per device/context). */
+
+struct gomp_memory_mapping
+{
+ /* Splay tree containing information about mapped memory regions. */
+ struct splay_tree_s splay_tree;
+
+ /* Mutex for operating with the splay tree and other shared structures. */
+ gomp_mutex_t lock;
+
+ /* True when tables have been added to this memory map. */
+ bool is_initialized;
+};
+
+typedef struct acc_dispatch_t
+{
+ /* This is a linked list of data mapped using the
+ acc_map_data/acc_unmap_data or "acc enter data"/"acc exit data" pragmas
+ (TODO). Unlike mapped_data in the goacc_thread struct, unmapping can
+ happen out-of-order with respect to mapping. */
+ struct target_mem_desc *data_environ;
+
+ /* Open or close a device instance. */
+ void *(*open_device_func) (int n);
+ int (*close_device_func) (void *h);
+
+ /* Set or get the device number. */
+ int (*get_device_num_func) (void);
+ void (*set_device_num_func) (int);
+
+ /* Execute. */
+ void (*exec_func) (void (*) (void *), size_t, void **, void **, size_t *,
+ unsigned short *, int, int, int, int, void *);
+
+ /* Async cleanup callback registration. */
+ void (*register_async_cleanup_func) (void *);
+
+ /* Asynchronous routines. */
+ int (*async_test_func) (int);
+ int (*async_test_all_func) (void);
+ void (*async_wait_func) (int);
+ void (*async_wait_async_func) (int, int);
+ void (*async_wait_all_func) (void);
+ void (*async_wait_all_async_func) (int);
+ void (*async_set_async_func) (int);
+
+ /* Create/destroy TLS data. */
+ void *(*create_thread_data_func) (void *);
+ void (*destroy_thread_data_func) (void *);
+
+ /* NVIDIA target specific routines. */
+ struct {
+ void *(*get_current_device_func) (void);
+ void *(*get_current_context_func) (void);
+ void *(*get_stream_func) (int);
+ int (*set_stream_func) (int, void *);
+ } cuda;
+} acc_dispatch_t;
+
+/* This structure describes accelerator device.
+ It contains name of the corresponding libgomp plugin, function handlers for
+ interaction with the device, ID-number of the device, and information about
+ mapped memory. */
+struct gomp_device_descr
+{
+ /* The name of the device. */
+ const char *name;
+
+ /* Capabilities of device (supports OpenACC, OpenMP). */
+ unsigned int capabilities;
+
+ /* This is the ID number of device. It could be specified in DEVICE-clause of
+ TARGET construct. */
+ int id;
+
+ /* This is the ID number of device among devices of the same type. */
+ int target_id;
+
+ /* This is the TYPE of device. */
+ enum offload_target_type type;
+
+ /* Set to true when device is initialized. */
+ bool is_initialized;
+
+ /* True when offload regions have been registered with this device. */
+ bool offload_regions_registered;
+
+ /* Plugin file handler. */
+ void *plugin_handle;
+
+ /* Function handlers. */
+ const char *(*get_name_func) (void);
+ unsigned int (*get_caps_func) (void);
+ int (*get_type_func) (void);
+ int (*get_num_devices_func) (void);
+ void (*register_image_func) (void *, void *);
+ void (*init_device_func) (int);
+ void (*fini_device_func) (int);
+ int (*get_table_func) (int, struct mapping_table **);
+ void *(*alloc_func) (int, size_t);
+ void (*free_func) (int, void *);
+ void *(*dev2host_func) (int, void *, const void *, size_t);
+ void *(*host2dev_func) (int, void *, const void *, size_t);
+ void (*run_func) (int, void *, void *);
+
+ /* OpenACC-specific functions. */
+ acc_dispatch_t openacc;
+
+ /* Memory-mapping info for this device instance. */
+ struct gomp_memory_mapping mem_map;
+
+ /* Extra information required for a device instance by a given target. */
+ void *target_data;
+};
+
+extern struct target_mem_desc *
+gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
+ void **hostaddrs, void **devaddrs, size_t *sizes, void *kinds,
+ bool is_openacc, bool is_target);
+
+extern void
+gomp_copy_from_async (struct target_mem_desc *tgt);
+
+extern void
+gomp_unmap_vars (struct target_mem_desc *tgt, bool);
+
+extern attribute_hidden void
+gomp_init_device (struct gomp_device_descr *devicep);
+
+extern attribute_hidden void
+gomp_init_tables (const struct gomp_device_descr *devicep,
+ struct gomp_memory_mapping *mm);
+
+extern attribute_hidden void
+gomp_fini_device (struct gomp_device_descr *devicep);
+
+extern attribute_hidden void
+gomp_free_memmap (struct gomp_device_descr *devicep);
+
#endif /* LIBGOMP_TARGET_H */
new file mode 100644
@@ -0,0 +1,77 @@
+/* OpenACC Runtime Library Definitions.
+
+ Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#include "openacc.h"
+#include "libgomp.h"
+#include "libgomp_target.h"
+#include "oacc-int.h"
+
+int
+acc_async_test (int async)
+{
+ if (async < acc_async_sync)
+ gomp_fatal ("invalid async argument: %d", async);
+
+ return base_dev->openacc.async_test_func (async);
+}
+
+int
+acc_async_test_all (void)
+{
+ return base_dev->openacc.async_test_all_func ();
+}
+
+void
+acc_wait (int async)
+{
+ if (async < acc_async_sync)
+ gomp_fatal ("invalid async argument: %d", async);
+
+ base_dev->openacc.async_wait_func (async);
+}
+
+void
+acc_wait_async (int async1, int async2)
+{
+ base_dev->openacc.async_wait_async_func (async1, async2);
+}
+
+void
+acc_wait_all (void)
+{
+ base_dev->openacc.async_wait_all_func ();
+}
+
+void
+acc_wait_all_async (int async)
+{
+ if (async < acc_async_sync)
+ gomp_fatal ("invalid async argument: %d", async);
+
+ base_dev->openacc.async_wait_all_async_func (async);
+}
new file mode 100644
@@ -0,0 +1,84 @@
+/* OpenACC Runtime Library: CUDA support glue.
+
+ Copyright (C) 2014 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "openacc.h"
+#include "config.h"
+#include "libgomp.h"
+#include "libgomp_target.h"
+#include "oacc-int.h"
+
+void *
+acc_get_current_cuda_device (void)
+{
+ void *p = NULL;
+
+ if (base_dev && base_dev->openacc.cuda.get_current_device_func)
+ p = base_dev->openacc.cuda.get_current_device_func ();
+
+ return p;
+}
+
+void *
+acc_get_current_cuda_context (void)
+{
+ void *p = NULL;
+
+ if (base_dev && base_dev->openacc.cuda.get_current_context_func)
+ p = base_dev->openacc.cuda.get_current_context_func ();
+
+ return p;
+}
+
+void *
+acc_get_cuda_stream (int async)
+{
+ void *p = NULL;
+
+ if (async < 0)
+ return p;
+
+ if (base_dev && base_dev->openacc.cuda.get_stream_func)
+ p = base_dev->openacc.cuda.get_stream_func (async);
+
+ return p;
+}
+
+int
+acc_set_cuda_stream (int async, void *stream)
+{
+ int s = -1;
+
+ if (async < 0 || stream == NULL)
+ return 0;
+
+ goacc_lazy_initialize ();
+
+ if (base_dev && base_dev->openacc.cuda.set_stream_func)
+ s = base_dev->openacc.cuda.set_stream_func (async, stream);
+
+ return s;
+}
new file mode 100644
@@ -0,0 +1,99 @@
+/* OpenACC Runtime Library: acc_device_host, acc_device_host_nonshm.
+
+ Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This shares much of the implementation of the plugin-host.c "host_nonshm"
+ plugin. */
+#include "plugin/plugin-host.c"
+
+static struct gomp_device_descr host_dispatch =
+ {
+ .name = "host",
+
+ .type = OFFLOAD_TARGET_TYPE_HOST,
+ .capabilities = TARGET_CAP_OPENACC_200 | TARGET_CAP_NATIVE_EXEC
+ | TARGET_CAP_SHARED_MEM,
+ .id = 0,
+
+ .is_initialized = false,
+ .offload_regions_registered = false,
+
+ .get_name_func = GOMP_OFFLOAD_get_name,
+ .get_type_func = GOMP_OFFLOAD_get_type,
+ .get_caps_func = GOMP_OFFLOAD_get_caps,
+
+ .init_device_func = GOMP_OFFLOAD_init_device,
+ .fini_device_func = GOMP_OFFLOAD_fini_device,
+ .get_num_devices_func = GOMP_OFFLOAD_get_num_devices,
+ .register_image_func = GOMP_OFFLOAD_register_image,
+ .get_table_func = GOMP_OFFLOAD_get_table,
+
+ .alloc_func = GOMP_OFFLOAD_alloc,
+ .free_func = GOMP_OFFLOAD_free,
+ .host2dev_func = GOMP_OFFLOAD_host2dev,
+ .dev2host_func = GOMP_OFFLOAD_dev2host,
+
+ .run_func = GOMP_OFFLOAD_run,
+
+ .openacc = {
+ .open_device_func = GOMP_OFFLOAD_openacc_open_device,
+ .close_device_func = GOMP_OFFLOAD_openacc_close_device,
+
+ .get_device_num_func = GOMP_OFFLOAD_openacc_get_device_num,
+ .set_device_num_func = GOMP_OFFLOAD_openacc_set_device_num,
+
+ .exec_func = GOMP_OFFLOAD_openacc_parallel,
+
+ .register_async_cleanup_func
+ = GOMP_OFFLOAD_openacc_register_async_cleanup,
+
+ .async_set_async_func = GOMP_OFFLOAD_openacc_async_set_async,
+ .async_test_func = GOMP_OFFLOAD_openacc_async_test,
+ .async_test_all_func = GOMP_OFFLOAD_openacc_async_test_all,
+ .async_wait_func = GOMP_OFFLOAD_openacc_async_wait,
+ .async_wait_async_func = GOMP_OFFLOAD_openacc_async_wait_async,
+ .async_wait_all_func = GOMP_OFFLOAD_openacc_async_wait_all,
+ .async_wait_all_async_func = GOMP_OFFLOAD_openacc_async_wait_all_async,
+
+ .create_thread_data_func = GOMP_OFFLOAD_openacc_create_thread_data,
+ .destroy_thread_data_func = GOMP_OFFLOAD_openacc_destroy_thread_data,
+
+ .cuda = {
+ .get_current_device_func = NULL,
+ .get_current_context_func = NULL,
+ .get_stream_func = NULL,
+ .set_stream_func = NULL,
+ }
+ }
+ };
+
+/* Register this device type. */
+static __attribute__ ((constructor))
+void goacc_host_init (void)
+{
+ gomp_mutex_init (&host_dispatch.mem_map.lock);
+ goacc_register (&host_dispatch);
+}
new file mode 100644
@@ -0,0 +1,613 @@
+/* OpenACC Runtime initialization routines
+
+ Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "libgomp.h"
+#include "libgomp_target.h"
+#include "oacc-int.h"
+#include "openacc.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+static gomp_mutex_t acc_device_lock;
+
+/* The dispatch table for the current accelerator device. This is global, so
+ you can only have one type of device open at any given time in a program.
+ This is the "base" device in that several devices that use the same
+ dispatch table may be active concurrently: this one (the "zeroth") is used
+ for overall initialisation/shutdown, and other instances -- not necessarily
+ including this one -- may be opened and closed once the base device has
+ been initialized. */
+struct gomp_device_descr const *base_dev;
+
+#ifdef HAVE_TLS
+__thread struct goacc_thread *goacc_tls_data;
+#else
+pthread_key_t goacc_tls_key;
+#endif
+static pthread_key_t goacc_cleanup_key;
+
+/* Current dispatcher, and how it was initialized */
+static acc_device_t init_key = _ACC_device_hwm;
+
+static struct goacc_thread *goacc_threads;
+static gomp_mutex_t goacc_thread_lock;
+
+/* An array of dispatchers for device types, indexed by the type. This array
+ only references "base" devices, and other instances of the same type are
+ found by simply indexing from each such device (which are stored linearly,
+ grouped by device in target.c:devices). */
+static struct gomp_device_descr const *dispatchers[_ACC_device_hwm] = { 0 };
+
+attribute_hidden void
+goacc_register (struct gomp_device_descr const *disp)
+{
+ /* Only register the 0th device here. */
+ if (disp->target_id != 0)
+ return;
+
+ gomp_mutex_lock (&acc_device_lock);
+
+ assert (acc_device_type (disp->type) != acc_device_none
+ && acc_device_type (disp->type) != acc_device_default
+ && acc_device_type (disp->type) != acc_device_not_host);
+ assert (!dispatchers[disp->type]);
+ dispatchers[disp->type] = disp;
+
+ gomp_mutex_unlock (&acc_device_lock);
+}
+
+static struct gomp_device_descr const *
+resolve_device (acc_device_t d)
+{
+ acc_device_t d_arg = d;
+
+ switch (d)
+ {
+ case acc_device_default:
+ {
+ if (goacc_device_type)
+ {
+ /* Lookup the named device. */
+ while (++d != _ACC_device_hwm)
+ if (dispatchers[d]
+ && !strcasecmp (goacc_device_type, dispatchers[d]->name)
+ && dispatchers[d]->get_num_devices_func () > 0)
+ goto found;
+
+ gomp_fatal ("device type %s not supported", goacc_device_type);
+ }
+
+ /* No default device specified, so start scanning for any non-host
+ device that is available. */
+ d = acc_device_not_host;
+ }
+ /* FALLTHROUGH */
+
+ case acc_device_not_host:
+ /* Find the first available device after acc_device_not_host. */
+ while (++d != _ACC_device_hwm)
+ if (dispatchers[d] && dispatchers[d]->get_num_devices_func () > 0)
+ goto found;
+ if (d_arg == acc_device_default)
+ {
+ d = acc_device_host;
+ goto found;
+ }
+ gomp_fatal ("no device found");
+ break;
+
+ case acc_device_host:
+ break;
+
+ default:
+ if (d > _ACC_device_hwm)
+ gomp_fatal ("device %u out of range", (unsigned)d);
+ break;
+ }
+ found:
+
+ assert (d != acc_device_none
+ && d != acc_device_default
+ && d != acc_device_not_host);
+
+ return dispatchers[d];
+}
+
+/* This is called when plugins have been initialized, and serves to call
+ (indirectly) the target's device_init hook. Calling multiple times without
+ an intervening acc_shutdown_1 call is an error. */
+
+static struct gomp_device_descr const *
+acc_init_1 (acc_device_t d)
+{
+ struct gomp_device_descr const *acc_dev;
+
+ acc_dev = resolve_device (d);
+
+ if (!acc_dev || acc_dev->get_num_devices_func () <= 0)
+ gomp_fatal ("device %u not supported", (unsigned)d);
+
+ if (acc_dev->is_initialized)
+ gomp_fatal ("device already active");
+
+ /* We need to remember what we were intialized as, to check shutdown etc. */
+ init_key = d;
+
+ gomp_init_device ((struct gomp_device_descr *) acc_dev);
+
+ return acc_dev;
+}
+
+static struct goacc_thread *
+goacc_new_thread (void)
+{
+ struct goacc_thread *thr = gomp_malloc (sizeof (struct gomp_thread));
+
+#ifdef HAVE_TLS
+ goacc_tls_data = thr;
+#else
+ pthread_setspecific (goacc_tls_key, thr);
+#endif
+
+ pthread_setspecific (goacc_cleanup_key, thr);
+
+ gomp_mutex_lock (&goacc_thread_lock);
+ thr->next = goacc_threads;
+ goacc_threads = thr;
+ gomp_mutex_unlock (&goacc_thread_lock);
+
+ return thr;
+}
+
+static void
+goacc_destroy_thread (void *data)
+{
+ struct goacc_thread *thr = data, *walk, *prev;
+
+ gomp_mutex_lock (&goacc_thread_lock);
+
+ if (thr)
+ {
+ if (base_dev && thr->target_tls)
+ {
+ base_dev->openacc.destroy_thread_data_func (thr->target_tls);
+ thr->target_tls = NULL;
+ }
+
+ assert (!thr->mapped_data);
+
+ /* Remove from thread list. */
+ for (prev = NULL, walk = goacc_threads; walk;
+ prev = walk, walk = walk->next)
+ if (walk == thr)
+ {
+ if (prev == NULL)
+ goacc_threads = walk->next;
+ else
+ prev->next = walk->next;
+
+ free (thr);
+
+ break;
+ }
+
+ assert (walk);
+ }
+
+ gomp_mutex_unlock (&goacc_thread_lock);
+}
+
+/* Open the ORD'th device of the currently-active type (base_dev must be
+ initialised before calling). If ORD is < 0, open the default-numbered
+ device (set by the ACC_DEVICE_NUM environment variable or a call to
+ acc_set_device_num), or leave any currently-opened device as is. "Opening"
+ consists of calling the device's open_device_func hook, and setting up
+ thread-local data (maybe allocating, then initializing with information
+ pertaining to the newly-opened or previously-opened device). */
+
+static void
+lazy_open (int ord)
+{
+ struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev;
+
+ if (thr && thr->dev)
+ {
+ assert (ord < 0 || ord == thr->dev->target_id);
+ return;
+ }
+
+ assert (base_dev);
+
+ if (ord < 0)
+ ord = goacc_device_num;
+
+ if (ord >= base_dev->get_num_devices_func ())
+ gomp_fatal ("device %u does not exist", ord);
+
+ if (!thr)
+ thr = goacc_new_thread ();
+
+ acc_dev = thr->dev = (struct gomp_device_descr *) &base_dev[ord];
+
+ assert (acc_dev->target_id == ord);
+
+ thr->saved_bound_dev = NULL;
+ thr->mapped_data = NULL;
+
+ if (!acc_dev->target_data)
+ acc_dev->target_data = acc_dev->openacc.open_device_func (ord);
+
+ thr->target_tls
+ = acc_dev->openacc.create_thread_data_func (acc_dev->target_data);
+
+ acc_dev->openacc.async_set_async_func (acc_async_sync);
+
+ if (!acc_dev->mem_map.is_initialized)
+ gomp_init_tables (acc_dev, &acc_dev->mem_map);
+}
+
+/* OpenACC 2.0a (3.2.12, 3.2.13) doesn't specify whether the serialization of
+ init/shutdown is per-process or per-thread. We choose per-process. */
+
+void
+acc_init (acc_device_t d)
+{
+ if (!base_dev)
+ gomp_init_targets_once ();
+
+ gomp_mutex_lock (&acc_device_lock);
+
+ base_dev = acc_init_1 (d);
+
+ lazy_open (-1);
+
+ gomp_mutex_unlock (&acc_device_lock);
+}
+
+ialias (acc_init)
+
+void
+acc_shutdown_1 (acc_device_t d)
+{
+ struct goacc_thread *walk;
+
+ /* We don't check whether d matches the actual device found, because
+ OpenACC 2.0 (3.2.12) says the parameters to the init and this
+ call must match (for the shutdown call anyway, it's silent on
+ others). */
+
+ if (!base_dev)
+ gomp_fatal ("no device initialized");
+ if (d != init_key)
+ gomp_fatal ("device %u(%u) is initialized",
+ (unsigned) init_key, (unsigned) base_dev->type);
+
+ gomp_mutex_lock (&goacc_thread_lock);
+
+ /* Free target-specific TLS data and close all devices. */
+ for (walk = goacc_threads; walk != NULL; walk = walk->next)
+ {
+ if (walk->target_tls)
+ base_dev->openacc.destroy_thread_data_func (walk->target_tls);
+
+ walk->target_tls = NULL;
+
+ /* This would mean the user is shutting down OpenACC in the middle of an
+ "acc data" pragma. Likely not intentional. */
+ if (walk->mapped_data)
+ gomp_fatal ("shutdown in 'acc data' region");
+
+ if (walk->dev)
+ {
+ if (walk->dev->openacc.close_device_func (walk->dev->target_data) < 0)
+ gomp_fatal ("failed to close device");
+
+ walk->dev->target_data = NULL;
+
+ gomp_free_memmap (walk->dev);
+
+ walk->dev = NULL;
+ }
+ }
+
+ gomp_mutex_unlock (&goacc_thread_lock);
+
+ gomp_fini_device ((struct gomp_device_descr *) base_dev);
+
+ base_dev = NULL;
+}
+
+void
+acc_shutdown (acc_device_t d)
+{
+ gomp_mutex_lock (&acc_device_lock);
+
+ acc_shutdown_1 (d);
+
+ gomp_mutex_unlock (&acc_device_lock);
+}
+
+ialias (acc_shutdown)
+
+/* This function is called after plugins have been initialized. It deals with
+ the "base" device, and is used to prepare the runtime for dealing with a
+ number of such devices (as implemented by some particular plugin). If the
+ argument device type D matches a previous call to the function, return the
+ current base device, else shut the old device down and re-initialize with
+ the new device type. */
+
+static struct gomp_device_descr const *
+lazy_init (acc_device_t d)
+{
+ if (base_dev)
+ {
+ /* Re-initializing the same device, do nothing. */
+ if (d == init_key)
+ return base_dev;
+
+ acc_shutdown_1 (init_key);
+ }
+
+ assert (!base_dev);
+
+ return acc_init_1 (d);
+}
+
+/* Ensure that plugins are loaded, initialize and open the (default-numbered)
+ device. */
+
+static void
+lazy_init_and_open (acc_device_t d)
+{
+ if (!base_dev)
+ gomp_init_targets_once ();
+
+ gomp_mutex_lock (&acc_device_lock);
+
+ base_dev = lazy_init (d);
+
+ lazy_open (-1);
+
+ gomp_mutex_unlock (&acc_device_lock);
+}
+
+int
+acc_get_num_devices (acc_device_t d)
+{
+ int n = 0;
+ struct gomp_device_descr const *acc_dev;
+
+ if (d == acc_device_none)
+ return 0;
+
+ if (!base_dev)
+ gomp_init_targets_once ();
+
+ acc_dev = resolve_device (d);
+ if (!acc_dev)
+ return 0;
+
+ n = acc_dev->get_num_devices_func ();
+ if (n < 0)
+ n = 0;
+
+ return n;
+}
+
+ialias (acc_get_num_devices)
+
+void
+acc_set_device_type (acc_device_t d)
+{
+ lazy_init_and_open (d);
+}
+
+ialias (acc_set_device_type)
+
+acc_device_t
+acc_get_device_type (void)
+{
+ acc_device_t res = acc_device_none;
+ const struct gomp_device_descr *dev;
+
+ if (base_dev)
+ res = acc_device_type (base_dev->type);
+ else
+ {
+ gomp_init_targets_once ();
+
+ dev = resolve_device (acc_device_default);
+ res = acc_device_type (dev->type);
+ }
+
+ assert (res != acc_device_default
+ && res != acc_device_not_host);
+
+ return res;
+}
+
+ialias (acc_get_device_type)
+
+int
+acc_get_device_num (acc_device_t d)
+{
+ const struct gomp_device_descr *dev;
+ int num;
+
+ if (d >= _ACC_device_hwm)
+ gomp_fatal ("device %u out of range", (unsigned)d);
+
+ if (!base_dev)
+ gomp_init_targets_once ();
+
+ dev = resolve_device (d);
+ if (!dev)
+ gomp_fatal ("no devices of type %u", d);
+
+ /* We might not have called lazy_open for this host thread yet, in which case
+ the get_device_num_func hook will return -1. */
+ num = dev->openacc.get_device_num_func ();
+ if (num < 0)
+ num = goacc_device_num;
+
+ return num;
+}
+
+ialias (acc_get_device_num)
+
+void
+acc_set_device_num (int n, acc_device_t d)
+{
+ const struct gomp_device_descr *dev;
+ int num_devices;
+
+ if (!base_dev)
+ gomp_init_targets_once ();
+
+ if ((int) d == 0)
+ {
+ int i;
+
+ /* A device setting of zero sets all device types on the system to use
+ the Nth instance of that device type. Only attempt it for initialized
+ devices though. */
+ for (i = acc_device_not_host + 1; i < _ACC_device_hwm; i++)
+ {
+ dev = resolve_device (d);
+ if (dev && dev->is_initialized)
+ dev->openacc.set_device_num_func (n);
+ }
+
+ /* ...and for future calls to acc_init/acc_set_device_type, etc. */
+ goacc_device_num = n;
+ }
+ else
+ {
+ struct goacc_thread *thr = goacc_thread ();
+
+ gomp_mutex_lock (&acc_device_lock);
+
+ base_dev = lazy_init (d);
+
+ num_devices = base_dev->get_num_devices_func ();
+
+ if (n >= num_devices)
+ gomp_fatal ("device %u out of range", n);
+
+ /* If we're changing the device number, de-associate this thread with
+ the device (but don't close the device, since it may be in use by
+ other threads). */
+ if (thr && thr->dev && n != thr->dev->target_id)
+ thr->dev = NULL;
+
+ lazy_open (n);
+
+ gomp_mutex_unlock (&acc_device_lock);
+ }
+}
+
+ialias (acc_set_device_num)
+
+int
+acc_on_device (acc_device_t dev)
+{
+ struct goacc_thread *thr = goacc_thread ();
+
+ if (thr && thr->dev
+ && acc_device_type (thr->dev->type) == acc_device_host_nonshm)
+ return dev == acc_device_host_nonshm || dev == acc_device_not_host;
+
+ /* Just rely on the compiler builtin. */
+ return __builtin_acc_on_device (dev);
+}
+ialias (acc_on_device)
+
+attribute_hidden void
+goacc_runtime_initialize (void)
+{
+ gomp_mutex_init (&acc_device_lock);
+
+#ifndef HAVE_TLS
+ pthread_key_create (&goacc_tls_key, NULL);
+#endif
+
+ pthread_key_create (&goacc_cleanup_key, goacc_destroy_thread);
+
+ base_dev = NULL;
+
+ goacc_threads = NULL;
+ gomp_mutex_init (&goacc_thread_lock);
+}
+
+/* Compiler helper functions */
+
+attribute_hidden void
+goacc_save_and_set_bind (acc_device_t d)
+{
+ struct goacc_thread *thr = goacc_thread ();
+
+ assert (!thr->saved_bound_dev);
+
+ thr->saved_bound_dev = thr->dev;
+ thr->dev = (struct gomp_device_descr *) dispatchers[d];
+}
+
+attribute_hidden void
+goacc_restore_bind (void)
+{
+ struct goacc_thread *thr = goacc_thread ();
+
+ thr->dev = thr->saved_bound_dev;
+ thr->saved_bound_dev = NULL;
+}
+
+/* This is called from any OpenACC support function that may need to implicitly
+ initialize the libgomp runtime. On exit all such initialization will have
+ been done, and both the global ACC_dev and the per-host-thread ACC_memmap
+ pointers will be valid. */
+
+attribute_hidden void
+goacc_lazy_initialize (void)
+{
+ struct goacc_thread *thr = goacc_thread ();
+
+ if (thr && thr->dev)
+ return;
+
+ if (!base_dev)
+ lazy_init_and_open (acc_device_default);
+ else
+ {
+ gomp_mutex_lock (&acc_device_lock);
+ lazy_open (-1);
+ gomp_mutex_unlock (&acc_device_lock);
+ }
+}
new file mode 100644
@@ -0,0 +1,106 @@
+/* OpenACC Runtime - internal declarations
+
+ Copyright (C) 2005-2014 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file contains data types and function declarations that are not
+ part of the official OpenACC user interface. There are declarations
+ in here that are part of the GNU OpenACC ABI, in that the compiler is
+ required to know about them and use them.
+
+ The convention is that the all caps prefix "GOACC" is used group items
+ that are part of the external ABI, and the lower case prefix "goacc"
+ is used group items that are completely private to the library. */
+
+#ifndef _OACC_INT_H
+#define _OACC_INT_H 1
+
+#include "openacc.h"
+#include "config.h"
+#include <stddef.h>
+#include <stdbool.h>
+#include <stdarg.h>
+
+#ifdef HAVE_ATTRIBUTE_VISIBILITY
+# pragma GCC visibility push(hidden)
+#endif
+
+static inline enum acc_device_t
+acc_device_type (enum offload_target_type type)
+{
+ return (enum acc_device_t) type;
+}
+
+struct goacc_thread
+{
+ /* The device for the current thread. */
+ struct gomp_device_descr *dev;
+
+ struct gomp_device_descr *saved_bound_dev;
+
+ /* This is a linked list of data mapped by the "acc data" pragma, following
+ strictly push/pop semantics according to lexical scope. */
+ struct target_mem_desc *mapped_data;
+
+ /* These structures form a list: this is the next thread in that list. */
+ struct goacc_thread *next;
+
+ /* Target-specific data (used by plugin). */
+ void *target_tls;
+};
+
+#ifdef HAVE_TLS
+extern __thread struct goacc_thread *goacc_tls_data;
+static inline struct goacc_thread *
+goacc_thread (void)
+{
+ return goacc_tls_data;
+}
+#else
+extern pthread_key_t goacc_tls_key;
+static inline struct goacc_thread *
+goacc_thread (void)
+{
+ return pthread_getspecific (goacc_tls_key);
+}
+#endif
+
+struct gomp_device_descr;
+
+void goacc_register (struct gomp_device_descr const *) __GOACC_NOTHROW;
+
+/* Current dispatcher. */
+extern struct gomp_device_descr const *base_dev;
+
+void goacc_runtime_initialize (void);
+void goacc_save_and_set_bind (acc_device_t);
+void goacc_restore_bind (void);
+void goacc_lazy_initialize (void);
+
+#ifdef HAVE_ATTRIBUTE_VISIBILITY
+# pragma GCC visibility pop
+#endif
+
+#endif /* _OACC_INT_H */
new file mode 100644
@@ -0,0 +1,510 @@
+/* OpenACC Runtime initialization routines
+
+ Copyright (C) 2013 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "openacc.h"
+#include "config.h"
+#include "libgomp.h"
+#include "gomp-constants.h"
+#include "libgomp_target.h"
+#include "oacc-int.h"
+#include <stdio.h>
+#include <stdint.h>
+#include <assert.h>
+
+#include "splay-tree.h"
+
+/* Return block containing [H->S), or NULL if not contained. */
+
+attribute_hidden splay_tree_key
+lookup_host (struct gomp_memory_mapping *mem_map, void *h, size_t s)
+{
+ struct splay_tree_key_s node;
+ splay_tree_key key;
+
+ node.host_start = (uintptr_t) h;
+ node.host_end = (uintptr_t) h + s;
+
+ gomp_mutex_lock (&mem_map->lock);
+
+ key = splay_tree_lookup (&mem_map->splay_tree, &node);
+
+ gomp_mutex_unlock (&mem_map->lock);
+
+ return key;
+}
+
+/* Return block containing [D->S), or NULL if not contained.
+ The list isn't ordered by device address, so we have to iterate
+ over the whole array. This is not expected to be a common
+ operation. */
+
+static splay_tree_key
+lookup_dev (struct target_mem_desc *tgt, void *d, size_t s)
+{
+ int i;
+ struct target_mem_desc *t;
+ struct gomp_memory_mapping *mem_map;
+
+ if (!tgt)
+ return NULL;
+
+ mem_map = tgt->mem_map;
+
+ gomp_mutex_lock (&mem_map->lock);
+
+ for (t = tgt; t != NULL; t = t->prev)
+ {
+ if (t->tgt_start <= (uintptr_t) d && t->tgt_end >= (uintptr_t) d + s)
+ break;
+ }
+
+ gomp_mutex_unlock (&mem_map->lock);
+
+ if (!t)
+ return NULL;
+
+ for (i = 0; i < t->list_count; i++)
+ {
+ void * offset;
+
+ splay_tree_key k = &t->array[i].key;
+ offset = d - t->tgt_start + k->tgt_offset;
+
+ if (k->host_start + offset <= (void *) k->host_end)
+ return k;
+ }
+
+ return NULL;
+}
+
+/* OpenACC is silent on how memory exhaustion is indicated. We return
+ NULL. */
+
+void *
+acc_malloc (size_t s)
+{
+ if (!s)
+ return NULL;
+
+ goacc_lazy_initialize ();
+
+ struct goacc_thread *thr = goacc_thread ();
+
+ return base_dev->alloc_func (thr->dev->target_id, s);
+}
+
+/* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event
+ the device address is mapped. We choose to check if it mapped,
+ and if it is, to unmap it. */
+void
+acc_free (void *d)
+{
+ splay_tree_key k;
+ struct goacc_thread *thr = goacc_thread ();
+
+ if (!d)
+ return;
+
+ /* We don't have to call lazy open here, as the ptr value must have
+ been returned by acc_malloc. It's not permitted to pass NULL in
+ (unless you got that null from acc_malloc). */
+ if ((k = lookup_dev (thr->dev->openacc.data_environ, d, 1)))
+ {
+ void *offset;
+
+ offset = d - k->tgt->tgt_start + k->tgt_offset;
+
+ acc_unmap_data ((void *)(k->host_start + offset));
+ }
+
+ base_dev->free_func (thr->dev->target_id, d);
+}
+
+void
+acc_memcpy_to_device (void *d, void *h, size_t s)
+{
+ /* No need to call lazy open here, as the device pointer must have
+ been obtained from a routine that did that. */
+ struct goacc_thread *thr = goacc_thread ();
+
+ base_dev->host2dev_func (thr->dev->target_id, d, h, s);
+}
+
+void
+acc_memcpy_from_device (void *h, void *d, size_t s)
+{
+ /* No need to call lazy open here, as the device pointer must have
+ been obtained from a routine that did that. */
+ struct goacc_thread *thr = goacc_thread ();
+
+ base_dev->dev2host_func (thr->dev->target_id, h, d, s);
+}
+
+/* Return the device pointer that corresponds to host data H. Or NULL
+ if no mapping. */
+
+void *
+acc_deviceptr (void *h)
+{
+ splay_tree_key n;
+ void *d;
+ void *offset;
+
+ goacc_lazy_initialize ();
+
+ struct goacc_thread *thr = goacc_thread ();
+
+ n = lookup_host (&thr->dev->mem_map, h, 1);
+
+ if (!n)
+ return NULL;
+
+ offset = h - n->host_start;
+
+ d = n->tgt->tgt_start + n->tgt_offset + offset;
+
+ return d;
+}
+
+/* Return the host pointer that corresponds to device data D. Or NULL
+ if no mapping. */
+
+void *
+acc_hostptr (void *d)
+{
+ splay_tree_key n;
+ void *h;
+ void *offset;
+
+ goacc_lazy_initialize ();
+
+ struct goacc_thread *thr = goacc_thread ();
+
+ n = lookup_dev (thr->dev->openacc.data_environ, d, 1);
+
+ if (!n)
+ return NULL;
+
+ offset = d - n->tgt->tgt_start + n->tgt_offset;
+
+ h = n->host_start + offset;
+
+ return h;
+}
+
+/* Return 1 if host data [H,+S] is present on the device. */
+
+int
+acc_is_present (void *h, size_t s)
+{
+ splay_tree_key n;
+
+ if (!s || !h)
+ return 0;
+
+ goacc_lazy_initialize ();
+
+ struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev = thr->dev;
+
+ n = lookup_host (&acc_dev->mem_map, h, s);
+
+ if (n && ((uintptr_t)h < n->host_start
+ || (uintptr_t)h + s > n->host_end
+ || s > n->host_end - n->host_start))
+ n = NULL;
+
+ return n != NULL;
+}
+
+/* Create a mapping for host [H,+S] -> device [D,+S] */
+
+void
+acc_map_data (void *h, void *d, size_t s)
+{
+ struct target_mem_desc *tgt;
+ size_t mapnum = 1;
+ void *hostaddrs = h;
+ void *devaddrs = d;
+ size_t sizes = s;
+ unsigned short kinds = GOMP_MAP_ALLOC;
+
+ goacc_lazy_initialize ();
+
+ struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev = thr->dev;
+
+ if (acc_dev->capabilities & TARGET_CAP_SHARED_MEM)
+ {
+ if (d != h)
+ gomp_fatal ("cannot map data on shared-memory system");
+
+ tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, true, false);
+ }
+ else
+ {
+ struct goacc_thread *thr = goacc_thread ();
+
+ if (!d || !h || !s)
+ gomp_fatal ("[%p,+%d]->[%p,+%d] is a bad map",
+ (void *)h, (int)s, (void *)d, (int)s);
+
+ if (lookup_host (&acc_dev->mem_map, h, s))
+ gomp_fatal ("host address [%p, +%d] is already mapped", (void *)h,
+ (int)s);
+
+ if (lookup_dev (thr->dev->openacc.data_environ, d, s))
+ gomp_fatal ("device address [%p, +%d] is already mapped", (void *)d,
+ (int)s);
+
+ tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, &devaddrs, &sizes,
+ &kinds, true, false);
+ }
+
+ tgt->prev = acc_dev->openacc.data_environ;
+ acc_dev->openacc.data_environ = tgt;
+}
+
+void
+acc_unmap_data (void *h)
+{
+ struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev = thr->dev;
+
+ /* No need to call lazy open, as the address must have been mapped. */
+
+ size_t host_size;
+ splay_tree_key n = lookup_host (&acc_dev->mem_map, h, 1);
+ struct target_mem_desc *t;
+
+ if (!n)
+ gomp_fatal ("%p is not a mapped block", (void *)h);
+
+ host_size = n->host_end - n->host_start;
+
+ if (n->host_start != (uintptr_t) h)
+ gomp_fatal ("[%p,%d] surrounds1 %p",
+ (void *) n->host_start, (int) host_size, (void *) h);
+
+ t = n->tgt;
+
+ if (t->refcount == 2)
+ {
+ struct target_mem_desc *tp;
+
+ /* This is the last reference, so pull the descriptor off the
+ chain. This avoids gomp_unmap_vars via gomp_unmap_tgt from
+ freeing the device memory. */
+ t->tgt_end = 0;
+ t->to_free = 0;
+
+ gomp_mutex_lock (&acc_dev->mem_map.lock);
+
+ for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
+ tp = t, t = t->prev)
+ if (n->tgt == t)
+ {
+ if (tp)
+ tp->prev = t->prev;
+ else
+ acc_dev->openacc.data_environ = t->prev;
+
+ break;
+ }
+
+ gomp_mutex_unlock (&acc_dev->mem_map.lock);
+ }
+
+ gomp_unmap_vars (t, true);
+}
+
+#define PCC_Present (1 << 0)
+#define PCC_Create (1 << 1)
+#define PCC_Copy (1 << 2)
+
+attribute_hidden void *
+present_create_copy (unsigned f, void *h, size_t s)
+{
+ void *d;
+ splay_tree_key n;
+
+ if (!h || !s)
+ gomp_fatal ("[%p,+%d] is a bad range", (void *)h, (int)s);
+
+ goacc_lazy_initialize ();
+
+ struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev = thr->dev;
+
+ n = lookup_host (&acc_dev->mem_map, h, s);
+ if (n)
+ {
+ /* Present. */
+ d = (void *) (n->tgt->tgt_start + n->tgt_offset);
+
+ if (!(f & PCC_Present))
+ gomp_fatal ("[%p,+%d] already mapped to [%p,+%d]",
+ (void *)h, (int)s, (void *)d, (int)s);
+ if ((h + s) > (void *)n->host_end)
+ gomp_fatal ("[%p,+%d] not mapped", (void *)h, (int)s);
+ }
+ else if (!(f & PCC_Create))
+ {
+ gomp_fatal ("[%p,+%d] not mapped", (void *)h, (int)s);
+ }
+ else
+ {
+ struct target_mem_desc *tgt;
+ size_t mapnum = 1;
+ unsigned short kinds;
+ void *hostaddrs = h;
+
+ if (f & PCC_Copy)
+ kinds = GOMP_MAP_ALLOC_TO;
+ else
+ kinds = GOMP_MAP_ALLOC;
+
+ tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true,
+ false);
+
+ gomp_mutex_lock (&acc_dev->mem_map.lock);
+
+ d = tgt->to_free;
+ tgt->prev = acc_dev->openacc.data_environ;
+ acc_dev->openacc.data_environ = tgt;
+
+ gomp_mutex_unlock (&acc_dev->mem_map.lock);
+ }
+
+ return d;
+}
+
+void *
+acc_create (void *h, size_t s)
+{
+ return present_create_copy (PCC_Create, h, s);
+}
+
+void *
+acc_copyin (void *h, size_t s)
+{
+ return present_create_copy (PCC_Create | PCC_Copy, h, s);
+}
+
+void *
+acc_present_or_create (void *h, size_t s)
+{
+ return present_create_copy (PCC_Present | PCC_Create, h, s);
+}
+
+void *
+acc_present_or_copyin (void *h, size_t s)
+{
+ return present_create_copy (PCC_Present | PCC_Create | PCC_Copy, h, s);
+}
+
+#define DC_Copyout (1 << 0)
+
+static void
+delete_copyout (unsigned f, void *h, size_t s)
+{
+ size_t host_size;
+ splay_tree_key n;
+ void *d;
+ struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev = thr->dev;
+
+ n = lookup_host (&acc_dev->mem_map, h, s);
+
+ /* No need to call lazy open, as the data must already have been
+ mapped. */
+
+ if (!n)
+ gomp_fatal ("[%p,%d] is not mapped", (void *)h, (int)s);
+
+ d = (void *) (n->tgt->tgt_start + n->tgt_offset);
+
+ host_size = n->host_end - n->host_start;
+
+ if (n->host_start != (uintptr_t) h || host_size != s)
+ gomp_fatal ("[%p,%d] surrounds2 [%p,+%d]",
+ (void *) n->host_start, (int) host_size, (void *) h, (int) s);
+
+ if (f & DC_Copyout)
+ acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
+
+ acc_unmap_data (h);
+
+ acc_dev->free_func (acc_dev->target_id, d);
+}
+
+void
+acc_delete (void *h , size_t s)
+{
+ delete_copyout (0, h, s);
+}
+
+void acc_copyout (void *h, size_t s)
+{
+ delete_copyout (DC_Copyout, h, s);
+}
+
+static void
+update_dev_host (int is_dev, void *h, size_t s)
+{
+ splay_tree_key n;
+ void *d;
+ struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev = thr->dev;
+
+ n = lookup_host (&acc_dev->mem_map, h, s);
+
+ /* No need to call lazy open, as the data must already have been
+ mapped. */
+
+ if (!n)
+ gomp_fatal ("[%p,%d] is not mapped", h, (int)s);
+
+ d = (void *) (n->tgt->tgt_start + n->tgt_offset);
+
+ if (is_dev)
+ acc_dev->host2dev_func (acc_dev->target_id, d, h, s);
+ else
+ acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
+}
+
+void
+acc_update_device (void *h, size_t s)
+{
+ update_dev_host (1, h, s);
+}
+
+void
+acc_update_self (void *h, size_t s)
+{
+ update_dev_host (0, h, s);
+}
new file mode 100644
@@ -0,0 +1,388 @@
+/* Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file handles OpenACC constructs. */
+
+#include "openacc.h"
+#include "libgomp.h"
+#include "libgomp_g.h"
+#include "gomp-constants.h"
+#include "libgomp_target.h"
+#include "oacc-int.h"
+#include <stdio.h>
+#include <string.h>
+#include <stdarg.h>
+#include <assert.h>
+#include <alloca.h>
+
+static void
+dump_var (char *s, size_t idx, void *hostaddr, size_t size, unsigned char kind)
+{
+ gomp_notify (" %2zi: %3s 0x%.2x -", idx, s, kind & 0xff);
+
+ switch (kind & 0xff)
+ {
+ case 0x00: gomp_notify (" ALLOC "); break;
+ case 0x01: gomp_notify (" ALLOC TO "); break;
+ case 0x02: gomp_notify (" ALLOC FROM "); break;
+ case 0x03: gomp_notify (" ALLOC TOFROM "); break;
+ case 0x04: gomp_notify (" POINTER "); break;
+ case 0x05: gomp_notify (" TO_PSET "); break;
+
+ case 0x08: gomp_notify (" FORCE_ALLOC "); break;
+ case 0x09: gomp_notify (" FORCE_TO "); break;
+ case 0x0a: gomp_notify (" FORCE_FROM "); break;
+ case 0x0b: gomp_notify (" FORCE_TOFROM "); break;
+ case 0x0c: gomp_notify (" FORCE_PRESENT "); break;
+ case 0x0d: gomp_notify (" FORCE_DEALLOC "); break;
+ case 0x0e: gomp_notify (" FORCE_DEVICEPTR "); break;
+
+ case 0x18: gomp_notify (" FORCE_PRIVATE "); break;
+ case 0x19: gomp_notify (" FORCE_FIRSTPRIVATE "); break;
+
+ case (unsigned char) -1: gomp_notify (" DUMMY "); break;
+ default: gomp_notify ("UGH! 0x%x\n", kind);
+ }
+
+ gomp_notify ("- %d - %4d/0x%04x ", 1 << (kind >> 8), (int) size, (int) size);
+ gomp_notify ("- %p\n", hostaddr);
+}
+
+/* Ensure that the target device for DEVICE_TYPE is initialised (and that
+ plugins have been loaded if appropriate). The ACC_dev variable for the
+ current thread will be set appropriately for the given device type on
+ return. */
+
+attribute_hidden void
+select_acc_device (int device_type)
+{
+ goacc_lazy_initialize ();
+
+ if (device_type == GOMP_IF_CLAUSE_FALSE)
+ return;
+
+ if (device_type == acc_device_none)
+ device_type = acc_device_host;
+
+ if (device_type >= 0)
+ {
+ /* NOTE: this will go badly if the surrounding data environment is set up
+ to use a different device type. We'll just have to trust that users
+ know what they're doing... */
+ acc_set_device_type (device_type);
+ }
+}
+
+void goacc_wait (int async, int num_waits, va_list ap);
+
+void
+GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target,
+ size_t mapnum, void **hostaddrs, size_t *sizes,
+ unsigned short *kinds,
+ int num_gangs, int num_workers, int vector_length,
+ int async, int num_waits, ...)
+{
+ bool if_clause_condition_value = device != GOMP_IF_CLAUSE_FALSE;
+ va_list ap;
+ struct goacc_thread *thr;
+ struct gomp_device_descr *acc_dev;
+ struct target_mem_desc *tgt;
+ void **devaddrs;
+ unsigned int i;
+ struct splay_tree_key_s k;
+ splay_tree_key tgt_fn_key;
+ void (*tgt_fn);
+
+ if (num_gangs != 1)
+ gomp_fatal ("num_gangs (%d) different from one is not yet supported",
+ num_gangs);
+ if (num_workers != 1)
+ gomp_fatal ("num_workers (%d) different from one is not yet supported",
+ num_workers);
+
+ gomp_notify ("%s: mapnum=%zd, hostaddrs=%p, sizes=%p, kinds=%p, async=%d\n",
+ __FUNCTION__, mapnum, hostaddrs, sizes, kinds, async);
+
+ select_acc_device (device);
+
+ thr = goacc_thread ();
+ acc_dev = thr->dev;
+
+ /* Host fallback if "if" clause is false or if the current device is set to
+ the host. */
+ if (!if_clause_condition_value)
+ {
+ goacc_save_and_set_bind (acc_device_host);
+ fn (hostaddrs);
+ goacc_restore_bind ();
+ return;
+ }
+ else if (acc_device_type (acc_dev->type) == acc_device_host)
+ {
+ fn (hostaddrs);
+ return;
+ }
+
+ va_start (ap, num_waits);
+
+ if (num_waits > 0)
+ goacc_wait (async, num_waits, ap);
+
+ va_end (ap);
+
+ acc_dev->openacc.async_set_async_func (async);
+
+ if (!(acc_dev->capabilities & TARGET_CAP_NATIVE_EXEC))
+ {
+ k.host_start = (uintptr_t) fn;
+ k.host_end = k.host_start + 1;
+ gomp_mutex_lock (&acc_dev->mem_map.lock);
+ tgt_fn_key = splay_tree_lookup (&acc_dev->mem_map.splay_tree, &k);
+ gomp_mutex_unlock (&acc_dev->mem_map.lock);
+
+ if (tgt_fn_key == NULL)
+ gomp_fatal ("target function wasn't mapped: perhaps -fopenacc was "
+ "used without -flto?");
+
+ tgt_fn = (void (*)) tgt_fn_key->tgt->tgt_start;
+ }
+ else
+ tgt_fn = (void (*)) fn;
+
+ tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, NULL, sizes, kinds, true,
+ false);
+
+ devaddrs = alloca (sizeof (void *) * mapnum);
+ for (i = 0; i < mapnum; i++)
+ devaddrs[i] = (void *) (tgt->list[i]->tgt->tgt_start
+ + tgt->list[i]->tgt_offset);
+
+ acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, sizes, kinds,
+ num_gangs, num_workers, vector_length, async,
+ tgt);
+
+ /* If running synchronously, unmap immediately. */
+ if (async < acc_async_noval)
+ gomp_unmap_vars (tgt, true);
+ else
+ {
+ gomp_copy_from_async (tgt);
+ acc_dev->openacc.register_async_cleanup_func (tgt);
+ }
+
+ acc_dev->openacc.async_set_async_func (acc_async_sync);
+}
+
+void
+GOACC_data_start (int device, const void *openmp_target, size_t mapnum,
+ void **hostaddrs, size_t *sizes, unsigned short *kinds)
+{
+ bool if_clause_condition_value = device != GOMP_IF_CLAUSE_FALSE;
+ struct target_mem_desc *tgt;
+
+ gomp_notify ("%s: mapnum=%zd, hostaddrs=%p, sizes=%p, kinds=%p\n",
+ __FUNCTION__, mapnum, hostaddrs, sizes, kinds);
+
+ select_acc_device (device);
+
+ struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev = thr->dev;
+
+ /* Host fallback or 'do nothing'. */
+ if ((acc_dev->capabilities & TARGET_CAP_SHARED_MEM)
+ || !if_clause_condition_value)
+ {
+ tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, true, false);
+ tgt->prev = thr->mapped_data;
+ thr->mapped_data = tgt;
+
+ return;
+ }
+
+ gomp_notify (" %s: prepare mappings\n", __FUNCTION__);
+ tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, NULL, sizes, kinds, true,
+ false);
+ gomp_notify (" %s: mappings prepared\n", __FUNCTION__);
+ tgt->prev = thr->mapped_data;
+ thr->mapped_data = tgt;
+}
+
+void
+GOACC_data_end (void)
+{
+ struct goacc_thread *thr = goacc_thread ();
+ struct target_mem_desc *tgt = thr->mapped_data;
+
+ gomp_notify (" %s: restore mappings\n", __FUNCTION__);
+ thr->mapped_data = tgt->prev;
+ gomp_unmap_vars (tgt, true);
+ gomp_notify (" %s: mappings restored\n", __FUNCTION__);
+}
+
+
+void
+GOACC_kernels (int device, void (*fn) (void *), const void *openmp_target,
+ size_t mapnum, void **hostaddrs, size_t *sizes,
+ unsigned short *kinds,
+ int num_gangs, int num_workers, int vector_length,
+ int async, int num_waits, ...)
+{
+ gomp_notify ("%s: mapnum=%zd, hostaddrs=%p, sizes=%p, kinds=%p\n",
+ __FUNCTION__, mapnum, hostaddrs, sizes, kinds);
+
+ va_list ap;
+
+ select_acc_device (device);
+
+ va_start (ap, num_waits);
+
+ if (num_waits > 0)
+ goacc_wait (async, num_waits, ap);
+
+ va_end (ap);
+
+ GOACC_parallel (device, fn, openmp_target, mapnum, hostaddrs, sizes, kinds,
+ num_gangs, num_workers, vector_length, async, 0);
+}
+
+void
+goacc_wait (int async, int num_waits, va_list ap)
+{
+ struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev = thr->dev;
+ int i;
+
+ assert (num_waits >= 0);
+
+ if (async == acc_async_sync && num_waits == 0)
+ {
+ acc_wait_all ();
+ return;
+ }
+
+ if (async == acc_async_sync && num_waits)
+ {
+ for (i = 0; i < num_waits; i++)
+ {
+ int qid = va_arg (ap, int);
+
+ if (acc_async_test (qid))
+ continue;
+
+ acc_wait (qid);
+ }
+ return;
+ }
+
+ if (async == acc_async_noval && num_waits == 0)
+ {
+ acc_dev->openacc.async_wait_all_async_func (acc_async_noval);
+ return;
+ }
+
+ for (i = 0; i < num_waits; i++)
+ {
+ int qid = va_arg (ap, int);
+
+ if (acc_async_test (qid))
+ continue;
+
+ /* If we're waiting on the same asynchronous queue as we're launching on,
+ the queue itself will order work as required, so there's no need to
+ wait explicitly. */
+ if (qid != async)
+ acc_dev->openacc.async_wait_async_func (qid, async);
+ }
+}
+
+void
+GOACC_update (int device, const void *openmp_target, size_t mapnum,
+ void **hostaddrs, size_t *sizes, unsigned short *kinds,
+ int async, int num_waits, ...)
+{
+ bool if_clause_condition_value = device != GOMP_IF_CLAUSE_FALSE;
+ size_t i;
+
+ select_acc_device (device);
+
+ struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev = thr->dev;
+
+ if ((acc_dev->capabilities & TARGET_CAP_SHARED_MEM)
+ || !if_clause_condition_value)
+ return;
+
+ if (num_waits > 0)
+ {
+ va_list ap;
+
+ va_start (ap, num_waits);
+
+ goacc_wait (async, num_waits, ap);
+
+ va_end (ap);
+ }
+
+ acc_dev->openacc.async_set_async_func (async);
+
+ for (i = 0; i < mapnum; ++i)
+ {
+ unsigned char kind = kinds[i] & 0xff;
+
+ dump_var ("UPD", i, hostaddrs[i], sizes[i], kinds[i]);
+
+ switch (kind)
+ {
+ case GOMP_MAP_POINTER:
+ break;
+
+ case GOMP_MAP_FORCE_TO:
+ acc_update_device (hostaddrs[i], sizes[i]);
+ break;
+
+ case GOMP_MAP_FORCE_FROM:
+ acc_update_self (hostaddrs[i], sizes[i]);
+ break;
+
+ default:
+ gomp_fatal (">>>> GOACC_update UNHANDLED kind 0x%.2x", kind);
+ break;
+ }
+ }
+
+ acc_dev->openacc.async_set_async_func (acc_async_sync);
+}
+
+void
+GOACC_wait (int async, int num_waits, ...)
+{
+ va_list ap;
+
+ va_start (ap, num_waits);
+
+ goacc_wait (async, num_waits, ap);
+
+ va_end (ap);
+}
new file mode 100644
@@ -0,0 +1,48 @@
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Initialize and register OpenACC dispatch table from libgomp plugin. */
+
+#include "libgomp.h"
+#include "oacc-plugin.h"
+#include "libgomp_target.h"
+#include "oacc-int.h"
+
+void
+GOMP_PLUGIN_async_unmap_vars (void *ptr)
+{
+ struct target_mem_desc *tgt = ptr;
+
+ gomp_unmap_vars (tgt, false);
+}
+
+/* Return the target-specific part of the TLS data for the current thread. */
+
+void *
+GOMP_PLUGIN_acc_thread (void)
+{
+ struct goacc_thread *thr = goacc_thread ();
+ return thr ? thr->target_tls : NULL;
+}
new file mode 100644
@@ -0,0 +1,32 @@
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _OACC_PLUGIN_H
+#define _OACC_PLUGIN_H 1
+
+extern void GOMP_PLUGIN_async_unmap_vars (void *ptr);
+extern void *GOMP_PLUGIN_acc_thread (void);
+
+#endif
new file mode 100644
@@ -0,0 +1,803 @@
+! OpenACC Runtime Library Definitions.
+
+! Copyright (C) 2014 Free Software Foundation, Inc.
+
+! Contributed by Tobias Burnus <burnus@net-b.de>
+! and Mentor Embedded.
+
+! This file is part of the GNU OpenMP Library (libgomp).
+
+! Libgomp is free software; you can redistribute it and/or modify it
+! under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 3, or (at your option)
+! any later version.
+
+! Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+! FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+! more details.
+
+! Under Section 7 of GPL version 3, you are granted additional
+! permissions described in the GCC Runtime Library Exception, version
+! 3.1, as published by the Free Software Foundation.
+
+! You should have received a copy of the GNU General Public License and
+! a copy of the GCC Runtime Library Exception along with this program;
+! see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+! <http://www.gnu.org/licenses/>.
+
+module openacc_kinds
+ use iso_fortran_env, only: int32
+ implicit none
+
+ private :: int32
+ public :: acc_device_kind
+
+ integer, parameter :: acc_device_kind = int32
+
+ public :: acc_device_none, acc_device_default, acc_device_host
+ public :: acc_device_not_host, acc_device_nvidia
+
+ integer (acc_device_kind), parameter :: acc_device_none = 0
+ integer (acc_device_kind), parameter :: acc_device_default = 1
+ integer (acc_device_kind), parameter :: acc_device_host = 2
+ integer (acc_device_kind), parameter :: acc_device_host_nonshm = 3
+ integer (acc_device_kind), parameter :: acc_device_not_host = 4
+ integer (acc_device_kind), parameter :: acc_device_nvidia = 5
+
+ public :: acc_handle_kind
+
+ integer, parameter :: acc_handle_kind = int32
+
+ public :: acc_async_noval, acc_async_sync
+
+ integer (acc_handle_kind), parameter :: acc_async_noval = -1
+ integer (acc_handle_kind), parameter :: acc_async_sync = -2
+
+end module
+
+module openacc_internal
+ use openacc_kinds
+ implicit none
+
+ interface
+ function acc_async_test_h (a)
+ logical acc_async_test_h
+ integer a
+ end function
+
+ function acc_async_test_all_h ()
+ logical acc_async_test_all_h
+ end function
+
+ function acc_on_device_h (d)
+ import
+ integer (acc_device_kind) d
+ logical acc_on_device_h
+ end function
+
+ subroutine acc_copyin_32_h (a, len)
+ use iso_c_binding, only: c_int32_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ end subroutine
+
+ subroutine acc_copyin_64_h (a, len)
+ use iso_c_binding, only: c_int64_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ end subroutine
+
+ subroutine acc_copyin_array_h (a)
+ type (*), dimension (..), contiguous :: a
+ end subroutine
+
+ subroutine acc_present_or_copyin_32_h (a, len)
+ use iso_c_binding, only: c_int32_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ end subroutine
+
+ subroutine acc_present_or_copyin_64_h (a, len)
+ use iso_c_binding, only: c_int64_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ end subroutine
+
+ subroutine acc_present_or_copyin_array_h (a)
+ type (*), dimension (..), contiguous :: a
+ end subroutine
+
+ subroutine acc_create_32_h (a, len)
+ use iso_c_binding, only: c_int32_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ end subroutine
+
+ subroutine acc_create_64_h (a, len)
+ use iso_c_binding, only: c_int64_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ end subroutine
+
+ subroutine acc_create_array_h (a)
+ type (*), dimension (..), contiguous :: a
+ end subroutine
+
+ subroutine acc_present_or_create_32_h (a, len)
+ use iso_c_binding, only: c_int32_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ end subroutine
+
+ subroutine acc_present_or_create_64_h (a, len)
+ use iso_c_binding, only: c_int64_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ end subroutine
+
+ subroutine acc_present_or_create_array_h (a)
+ type (*), dimension (..), contiguous :: a
+ end subroutine
+
+ subroutine acc_copyout_32_h (a, len)
+ use iso_c_binding, only: c_int32_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ end subroutine
+
+ subroutine acc_copyout_64_h (a, len)
+ use iso_c_binding, only: c_int64_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ end subroutine
+
+ subroutine acc_copyout_array_h (a)
+ type (*), dimension (..), contiguous :: a
+ end subroutine
+
+ subroutine acc_delete_32_h (a, len)
+ use iso_c_binding, only: c_int32_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ end subroutine
+
+ subroutine acc_delete_64_h (a, len)
+ use iso_c_binding, only: c_int64_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ end subroutine
+
+ subroutine acc_delete_array_h (a)
+ type (*), dimension (..), contiguous :: a
+ end subroutine
+
+ subroutine acc_update_device_32_h (a, len)
+ use iso_c_binding, only: c_int32_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ end subroutine
+
+ subroutine acc_update_device_64_h (a, len)
+ use iso_c_binding, only: c_int64_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ end subroutine
+
+ subroutine acc_update_device_array_h (a)
+ type (*), dimension (..), contiguous :: a
+ end subroutine
+
+ subroutine acc_update_self_32_h (a, len)
+ use iso_c_binding, only: c_int32_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ end subroutine
+
+ subroutine acc_update_self_64_h (a, len)
+ use iso_c_binding, only: c_int64_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ end subroutine
+
+ subroutine acc_update_self_array_h (a)
+ type (*), dimension (..), contiguous :: a
+ end subroutine
+
+ function acc_is_present_32_h (a, len)
+ use iso_c_binding, only: c_int32_t
+ logical acc_is_present_32_h
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ end function
+
+ function acc_is_present_64_h (a, len)
+ use iso_c_binding, only: c_int64_t
+ logical acc_is_present_64_h
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ end function
+
+ function acc_is_present_array_h (a)
+ logical acc_is_present_array_h
+ type (*), dimension (..), contiguous :: a
+ end function
+ end interface
+
+ interface
+ function acc_async_test_l (a) &
+ bind (C, name = "acc_async_test")
+ use iso_c_binding, only: c_int
+ integer (c_int) :: acc_async_test_l
+ integer (c_int), value :: a
+ end function
+
+ function acc_async_test_all_l () &
+ bind (C, name = "acc_async_test_all")
+ use iso_c_binding, only: c_int
+ integer (c_int) :: acc_async_test_all_l
+ end function
+
+ function acc_on_device_l (d) &
+ bind (C, name = "acc_on_device")
+ use iso_c_binding, only: c_int
+ integer (c_int) :: acc_on_device_l
+ integer (c_int), value :: d
+ end function
+
+ subroutine acc_copyin_l (a, len) &
+ bind (C, name = "acc_copyin")
+ use iso_c_binding, only: c_size_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_size_t), value :: len
+ end subroutine
+
+ subroutine acc_present_or_copyin_l (a, len) &
+ bind (C, name = "acc_present_or_copyin")
+ use iso_c_binding, only: c_size_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_size_t), value :: len
+ end subroutine
+
+ subroutine acc_create_l (a, len) &
+ bind (C, name = "acc_create")
+ use iso_c_binding, only: c_size_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_size_t), value :: len
+ end subroutine
+
+ subroutine acc_present_or_create_l (a, len) &
+ bind (C, name = "acc_present_or_create")
+ use iso_c_binding, only: c_size_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_size_t), value :: len
+ end subroutine
+
+ subroutine acc_copyout_l (a, len) &
+ bind (C, name = "acc_copyout")
+ use iso_c_binding, only: c_size_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_size_t), value :: len
+ end subroutine
+
+ subroutine acc_delete_l (a, len) &
+ bind (C, name = "acc_delete")
+ use iso_c_binding, only: c_size_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_size_t), value :: len
+ end subroutine
+
+ subroutine acc_update_device_l (a, len) &
+ bind (C, name = "acc_update_device")
+ use iso_c_binding, only: c_size_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_size_t), value :: len
+ end subroutine
+
+ subroutine acc_update_self_l (a, len) &
+ bind (C, name = "acc_update_self")
+ use iso_c_binding, only: c_size_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_size_t), value :: len
+ end subroutine
+
+ function acc_is_present_l (a, len) &
+ bind (C, name = "acc_is_present")
+ use iso_c_binding, only: c_int32_t, c_size_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ integer (c_int32_t) :: acc_is_present_l
+ type (*), dimension (*) :: a
+ integer (c_size_t), value :: len
+ end function
+ end interface
+end module
+
+module openacc
+ use openacc_kinds
+ use openacc_internal
+ implicit none
+
+ public :: openacc_version
+
+ public :: acc_get_num_devices, acc_set_device_type, acc_get_device_type
+ public :: acc_set_device_num, acc_get_device_num, acc_async_test
+ public :: acc_async_test_all, acc_wait, acc_wait_async, acc_wait_all
+ public :: acc_wait_all_async, acc_init, acc_shutdown, acc_on_device
+ public :: acc_copyin, acc_present_or_copyin, acc_pcopyin, acc_create
+ public :: acc_present_or_create, acc_pcreate, acc_copyout, acc_delete
+ public :: acc_update_device, acc_update_self, acc_is_present
+
+ integer, parameter :: openacc_version = 201306
+
+ interface acc_get_num_devices
+ function acc_get_num_devices (d) &
+ bind (C, name = "acc_get_num_devices")
+ use iso_c_binding, only: c_int
+ integer (c_int) :: acc_get_num_devices
+ integer (c_int), value :: d
+ end function
+ end interface
+
+ interface acc_set_device_type
+ subroutine acc_set_device_type (d) &
+ bind (C, name = "acc_set_device_type")
+ use iso_c_binding, only: c_int
+ integer (c_int), value :: d
+ end subroutine
+ end interface
+
+ interface acc_get_device_type
+ function acc_get_device_type () &
+ bind (C, name = "acc_get_device_type")
+ use iso_c_binding, only: c_int
+ integer (c_int) :: acc_get_device_type
+ end function
+ end interface
+
+ interface acc_set_device_num
+ subroutine acc_set_device_num (n, d) &
+ bind (C, name = "acc_set_device_num")
+ use iso_c_binding, only: c_int
+ integer (c_int), value :: n, d
+ end subroutine
+ end interface
+
+ interface acc_get_device_num
+ function acc_get_device_num (d) &
+ bind (C, name = "acc_get_device_num")
+ use iso_c_binding, only: c_int
+ integer (c_int) :: acc_get_device_num
+ integer (c_int), value :: d
+ end function
+ end interface
+
+ interface acc_async_test
+ procedure :: acc_async_test_h
+ end interface
+
+ interface acc_async_test_all
+ procedure :: acc_async_test_all_h
+ end interface
+
+ interface acc_wait
+ subroutine acc_wait (a) &
+ bind (C, name = "acc_wait")
+ use iso_c_binding, only: c_int
+ integer (c_int), value :: a
+ end subroutine
+ end interface
+
+ interface acc_wait_async
+ subroutine acc_wait_async (a1, a2) &
+ bind (C, name = "acc_wait_async")
+ use iso_c_binding, only: c_int
+ integer (c_int), value :: a1, a2
+ end subroutine
+ end interface
+
+ interface acc_wait_all
+ subroutine acc_wait_all () &
+ bind (C, name = "acc_wait_all")
+ use iso_c_binding, only: c_int
+ end subroutine
+ end interface
+
+ interface acc_wait_all_async
+ subroutine acc_wait_all_async (a) &
+ bind (C, name = "acc_wait_all_async")
+ use iso_c_binding, only: c_int
+ integer (c_int), value :: a
+ end subroutine
+ end interface
+
+ interface acc_init
+ subroutine acc_init (d) &
+ bind (C, name = "acc_init")
+ use iso_c_binding, only: c_int
+ integer (c_int), value :: d
+ end subroutine
+ end interface
+
+ interface acc_shutdown
+ subroutine acc_shutdown (d) &
+ bind (C, name = "acc_shutdown")
+ use iso_c_binding, only: c_int
+ integer (c_int), value :: d
+ end subroutine
+ end interface
+
+ interface acc_on_device
+ procedure :: acc_on_device_h
+ end interface
+
+ ! acc_malloc: Only available in C/C++
+ ! acc_free: Only available in C/C++
+
+ ! As vendor extension, the following code supports both 32bit and 64bit
+ ! arguments for "size"; the OpenACC standard only permits default-kind
+ ! integers, which are of kind 4 (i.e. 32 bits).
+ ! Additionally, the two-argument version also takes arrays as argument.
+ ! and the one argument version also scalars. Note that the code assumes
+ ! that the arrays are contiguous.
+
+ interface acc_copyin
+ procedure :: acc_copyin_32_h
+ procedure :: acc_copyin_64_h
+ procedure :: acc_copyin_array_h
+ end interface
+
+ interface acc_present_or_copyin
+ procedure :: acc_present_or_copyin_32_h
+ procedure :: acc_present_or_copyin_64_h
+ procedure :: acc_present_or_copyin_array_h
+ end interface
+
+ interface acc_pcopyin
+ procedure :: acc_present_or_copyin_32_h
+ procedure :: acc_present_or_copyin_64_h
+ procedure :: acc_present_or_copyin_array_h
+ end interface
+
+ interface acc_create
+ procedure :: acc_create_32_h
+ procedure :: acc_create_64_h
+ procedure :: acc_create_array_h
+ end interface
+
+ interface acc_present_or_create
+ procedure :: acc_present_or_create_32_h
+ procedure :: acc_present_or_create_64_h
+ procedure :: acc_present_or_create_array_h
+ end interface
+
+ interface acc_pcreate
+ procedure :: acc_present_or_create_32_h
+ procedure :: acc_present_or_create_64_h
+ procedure :: acc_present_or_create_array_h
+ end interface
+
+ interface acc_copyout
+ procedure :: acc_copyout_32_h
+ procedure :: acc_copyout_64_h
+ procedure :: acc_copyout_array_h
+ end interface
+
+ interface acc_delete
+ procedure :: acc_delete_32_h
+ procedure :: acc_delete_64_h
+ procedure :: acc_delete_array_h
+ end interface
+
+ interface acc_update_device
+ procedure :: acc_update_device_32_h
+ procedure :: acc_update_device_64_h
+ procedure :: acc_update_device_array_h
+ end interface
+
+ interface acc_update_self
+ procedure :: acc_update_self_32_h
+ procedure :: acc_update_self_64_h
+ procedure :: acc_update_self_array_h
+ end interface
+
+ ! acc_map_data: Only available in C/C++
+ ! acc_unmap_data: Only available in C/C++
+ ! acc_deviceptr: Only available in C/C++
+ ! acc_hostptr: Only available in C/C++
+
+ interface acc_is_present
+ procedure :: acc_is_present_32_h
+ procedure :: acc_is_present_64_h
+ procedure :: acc_is_present_array_h
+ end interface
+
+ ! acc_memcpy_to_device: Only available in C/C++
+ ! acc_memcpy_from_device: Only available in C/C++
+
+end module
+
+function acc_async_test_h (a)
+ use openacc_internal, only: acc_async_test_l
+ logical acc_async_test_h
+ integer a
+ if (acc_async_test_l (a) .eq. 1) then
+ acc_async_test_h = .TRUE.
+ else
+ acc_async_test_h = .FALSE.
+ end if
+end function
+
+function acc_async_test_all_h ()
+ use openacc_internal, only: acc_async_test_all_l
+ logical acc_async_test_all_h
+ if (acc_async_test_all_l () .eq. 1) then
+ acc_async_test_all_h = .TRUE.
+ else
+ acc_async_test_all_h = .FALSE.
+ end if
+end function
+
+function acc_on_device_h (d)
+ use openacc_internal, only: acc_on_device_l
+ use openacc_kinds
+ integer (acc_device_kind) d
+ logical acc_on_device_h
+ if (acc_on_device_l (d) .eq. 1) then
+ acc_on_device_h = .TRUE.
+ else
+ acc_on_device_h = .FALSE.
+ end if
+end function
+
+subroutine acc_copyin_32_h (a, len)
+ use iso_c_binding, only: c_int32_t, c_size_t
+ use openacc_internal, only: acc_copyin_l
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ call acc_copyin_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_copyin_64_h (a, len)
+ use iso_c_binding, only: c_int64_t, c_size_t
+ use openacc_internal, only: acc_copyin_l
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ call acc_copyin_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_copyin_array_h (a)
+ use openacc_internal, only: acc_copyin_l
+ type (*), dimension (..), contiguous :: a
+ call acc_copyin_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_present_or_copyin_32_h (a, len)
+ use iso_c_binding, only: c_int32_t, c_size_t
+ use openacc_internal, only: acc_present_or_copyin_l
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ call acc_present_or_copyin_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_present_or_copyin_64_h (a, len)
+ use iso_c_binding, only: c_int64_t, c_size_t
+ use openacc_internal, only: acc_present_or_copyin_l
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ call acc_present_or_copyin_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_present_or_copyin_array_h (a)
+ use openacc_internal, only: acc_present_or_copyin_l
+ type (*), dimension (..), contiguous :: a
+ call acc_present_or_copyin_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_create_32_h (a, len)
+ use iso_c_binding, only: c_int32_t, c_size_t
+ use openacc_internal, only: acc_create_l
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ call acc_create_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_create_64_h (a, len)
+ use iso_c_binding, only: c_int64_t, c_size_t
+ use openacc_internal, only: acc_create_l
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ call acc_create_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_create_array_h (a)
+ use openacc_internal, only: acc_create_l
+ type (*), dimension (..), contiguous :: a
+ call acc_create_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_present_or_create_32_h (a, len)
+ use iso_c_binding, only: c_int32_t, c_size_t
+ use openacc_internal, only: acc_present_or_create_l
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ call acc_present_or_create_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_present_or_create_64_h (a, len)
+ use iso_c_binding, only: c_int64_t, c_size_t
+ use openacc_internal, only: acc_present_or_create_l
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ call acc_present_or_create_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_present_or_create_array_h (a)
+ use openacc_internal, only: acc_present_or_create_l
+ type (*), dimension (..), contiguous :: a
+ call acc_present_or_create_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_copyout_32_h (a, len)
+ use iso_c_binding, only: c_int32_t, c_size_t
+ use openacc_internal, only: acc_copyout_l
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ call acc_copyout_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_copyout_64_h (a, len)
+ use iso_c_binding, only: c_int64_t, c_size_t
+ use openacc_internal, only: acc_copyout_l
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ call acc_copyout_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_copyout_array_h (a)
+ use openacc_internal, only: acc_copyout_l
+ type (*), dimension (..), contiguous :: a
+ call acc_copyout_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_delete_32_h (a, len)
+ use iso_c_binding, only: c_int32_t, c_size_t
+ use openacc_internal, only: acc_delete_l
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ call acc_delete_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_delete_64_h (a, len)
+ use iso_c_binding, only: c_int64_t, c_size_t
+ use openacc_internal, only: acc_delete_l
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ call acc_delete_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_delete_array_h (a)
+ use openacc_internal, only: acc_delete_l
+ type (*), dimension (..), contiguous :: a
+ call acc_delete_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_update_device_32_h (a, len)
+ use iso_c_binding, only: c_int32_t, c_size_t
+ use openacc_internal, only: acc_update_device_l
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ call acc_update_device_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_update_device_64_h (a, len)
+ use iso_c_binding, only: c_int64_t, c_size_t
+ use openacc_internal, only: acc_update_device_l
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ call acc_update_device_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_update_device_array_h (a)
+ use openacc_internal, only: acc_update_device_l
+ type (*), dimension (..), contiguous :: a
+ call acc_update_device_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_update_self_32_h (a, len)
+ use iso_c_binding, only: c_int32_t, c_size_t
+ use openacc_internal, only: acc_update_self_l
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ call acc_update_self_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_update_self_64_h (a, len)
+ use iso_c_binding, only: c_int64_t, c_size_t
+ use openacc_internal, only: acc_update_self_l
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ call acc_update_self_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_update_self_array_h (a)
+ use openacc_internal, only: acc_update_self_l
+ type (*), dimension (..), contiguous :: a
+ call acc_update_self_l (a, sizeof (a))
+end subroutine
+
+function acc_is_present_32_h (a, len)
+ use iso_c_binding, only: c_int32_t, c_size_t
+ use openacc_internal, only: acc_is_present_l
+ logical acc_is_present_32_h
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ if (acc_is_present_l (a, int (len, kind = c_size_t)) .eq. 1) then
+ acc_is_present_32_h = .TRUE.
+ else
+ acc_is_present_32_h = .FALSE.
+ end if
+end function
+
+function acc_is_present_64_h (a, len)
+ use iso_c_binding, only: c_int64_t, c_size_t
+ use openacc_internal, only: acc_is_present_l
+ logical acc_is_present_64_h
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ if (acc_is_present_l (a, int (len, kind = c_size_t)) .eq. 1) then
+ acc_is_present_64_h = .TRUE.
+ else
+ acc_is_present_64_h = .FALSE.
+ end if
+end function
+
+function acc_is_present_array_h (a)
+ use openacc_internal, only: acc_is_present_l
+ logical acc_is_present_array_h
+ type (*), dimension (..), contiguous :: a
+ acc_is_present_array_h = acc_is_present_l (a, sizeof (a)) == 1
+end function
new file mode 100644
@@ -0,0 +1,127 @@
+/* OpenACC Runtime Library User-facing Declarations
+
+ Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _OPENACC_H
+#define _OPENACC_H 1
+
+#include "gomp-constants.h"
+
+/* The OpenACC std is silent on whether or not including openacc.h
+ might or must not include other header files. We chose to include
+ some. */
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if __cplusplus >= 201103
+# define __GOACC_NOTHROW noexcept ()
+#elif __cplusplus
+# define __GOACC_NOTHROW throw ()
+#else /* Not C++ */
+# define __GOACC_NOTHROW __attribute__ ((__nothrow__))
+#endif
+
+ /* Types */
+ typedef enum acc_device_t
+ {
+ acc_device_none = 0,
+ acc_device_default, /* This has to be a distinct value, as no
+ return value can match it. */
+ acc_device_host = GOMP_TARGET_HOST,
+ acc_device_host_nonshm = GOMP_TARGET_HOST_NONSHM,
+ acc_device_not_host,
+ acc_device_nvidia = GOMP_TARGET_NVIDIA_PTX,
+ _ACC_device_hwm
+ } acc_device_t;
+
+ typedef enum acc_async_t
+ {
+ acc_async_noval = -1,
+ acc_async_sync = -2
+ } acc_async_t;
+
+ int acc_get_num_devices (acc_device_t __dev) __GOACC_NOTHROW;
+ void acc_set_device_type (acc_device_t __dev) __GOACC_NOTHROW;
+ acc_device_t acc_get_device_type (void) __GOACC_NOTHROW;
+ void acc_set_device_num (int __num, acc_device_t __dev) __GOACC_NOTHROW;
+ int acc_get_device_num (acc_device_t __dev) __GOACC_NOTHROW;
+ int acc_async_test (int __async) __GOACC_NOTHROW;
+ int acc_async_test_all (void) __GOACC_NOTHROW;
+ void acc_wait (int __async) __GOACC_NOTHROW;
+ void acc_wait_async (int __async1, int __async2) __GOACC_NOTHROW;
+ void acc_wait_all (void) __GOACC_NOTHROW;
+ void acc_wait_all_async (int __async) __GOACC_NOTHROW;
+ void acc_init (acc_device_t __dev) __GOACC_NOTHROW;
+ void acc_shutdown (acc_device_t __dev) __GOACC_NOTHROW;
+ int acc_on_device (acc_device_t __dev) __GOACC_NOTHROW;
+ void *acc_malloc (size_t) __GOACC_NOTHROW;
+ void acc_free (void *) __GOACC_NOTHROW;
+ /* Some of these would be more correct with const qualifiers, but
+ the standard specifies otherwise. */
+ void *acc_copyin (void *, size_t) __GOACC_NOTHROW;
+ void *acc_present_or_copyin (void *, size_t) __GOACC_NOTHROW;
+ void *acc_create (void *, size_t) __GOACC_NOTHROW;
+ void *acc_present_or_create (void *, size_t) __GOACC_NOTHROW;
+ void acc_copyout (void *, size_t) __GOACC_NOTHROW;
+ void acc_delete (void *, size_t) __GOACC_NOTHROW;
+ void acc_update_device (void *, size_t) __GOACC_NOTHROW;
+ void acc_update_self (void *, size_t) __GOACC_NOTHROW;
+ void acc_map_data (void *, void *, size_t) __GOACC_NOTHROW;
+ void acc_unmap_data (void *) __GOACC_NOTHROW;
+ void *acc_deviceptr (void *) __GOACC_NOTHROW;
+ void *acc_hostptr (void *) __GOACC_NOTHROW;
+ int acc_is_present (void *, size_t) __GOACC_NOTHROW;
+ void acc_memcpy_to_device (void *, void *, size_t) __GOACC_NOTHROW;
+ void acc_memcpy_from_device (void *, void *, size_t) __GOACC_NOTHROW;
+
+ void ACC_target (int, void (*) (void *), const void *,
+ size_t, void **, size_t *, unsigned char *, int *) __GOACC_NOTHROW;
+ void ACC_parallel (int, void (*) (void *), const void *,
+ size_t, void **, size_t *, unsigned char *) __GOACC_NOTHROW;
+ void ACC_add_device_code (void const *, char const *) __GOACC_NOTHROW;
+
+ void ACC_async_copy (int) __GOACC_NOTHROW;
+ void ACC_async_kern (int) __GOACC_NOTHROW;
+
+ /* Old names. OpenACC does not specify whether these can or must
+ not be macros, inlines or aliases for the new names. */
+ #define acc_pcreate acc_present_or_create
+ #define acc_pcopyin acc_present_or_copyin
+
+ /* CUDA-specific routines. */
+ void *acc_get_current_cuda_device (void) __GOACC_NOTHROW;
+ void *acc_get_current_cuda_context (void) __GOACC_NOTHROW;
+ void *acc_get_cuda_stream (int __async) __GOACC_NOTHROW;
+ int acc_set_cuda_stream (int __async, void *__stream) __GOACC_NOTHROW;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _OPENACC_H */
new file mode 100644
@@ -0,0 +1,390 @@
+! OpenACC Runtime Library Definitions. -*- mode: fortran -*-
+
+! Copyright (C) 2014 Free Software Foundation, Inc.
+
+! Contributed by Tobias Burnus <burnus@net-b.de>
+! and Mentor Embedded.
+
+! This file is part of the GNU OpenMP Library (libgomp).
+
+! Libgomp is free software; you can redistribute it and/or modify it
+! under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 3, or (at your option)
+! any later version.
+
+! Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+! FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+! more details.
+
+! Under Section 7 of GPL version 3, you are granted additional
+! permissions described in the GCC Runtime Library Exception, version
+! 3.1, as published by the Free Software Foundation.
+
+! You should have received a copy of the GNU General Public License and
+! a copy of the GCC Runtime Library Exception along with this program;
+! see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+! <http://www.gnu.org/licenses/>.
+
+! NOTE: Due to the use of dimension (..), the code only works when compiled
+! with -std=f2008ts/gnu/legacy but not with other standard settings.
+! Alternatively, the user can use the module version, which permits
+! compilation with -std=f95.
+
+ integer, parameter :: acc_device_kind = 4
+
+ integer (acc_device_kind), parameter :: acc_device_none = 0
+ integer (acc_device_kind), parameter :: acc_device_default = 1
+ integer (acc_device_kind), parameter :: acc_device_host = 2
+ integer (acc_device_kind), parameter :: acc_device_host_nonshm = 3
+ integer (acc_device_kind), parameter :: acc_device_not_host = 4
+ integer (acc_device_kind), parameter :: acc_device_nvidia = 5
+
+ integer, parameter :: acc_handle_kind = 4
+
+ integer (acc_handle_kind), parameter :: acc_async_noval = -1
+ integer (acc_handle_kind), parameter :: acc_async_sync = -2
+
+ integer, parameter :: openacc_version = 201306
+
+ interface
+ function acc_get_num_devices (d)
+ & bind (C, name = "acc_get_num_devices")
+ use iso_c_binding, only: c_int
+ integer (c_int) :: acc_get_num_devices
+ integer (c_int), value :: d
+ end function
+ end interface
+
+ interface acc_set_device_type
+ subroutine acc_set_device_type (d)
+ & bind (C, name = "acc_set_device_type")
+ use iso_c_binding, only: c_int
+ integer (c_int), value :: d
+ end subroutine
+ end interface
+
+ interface acc_get_device_type
+ function acc_get_device_type ()
+ & bind (C, name = "acc_get_device_type")
+ use iso_c_binding, only: c_int
+ integer (c_int) :: acc_get_device_type
+ end function
+ end interface
+
+ interface acc_set_device_num
+ subroutine acc_set_device_num (n, d)
+ & bind (C, name = "acc_set_device_num")
+ use iso_c_binding, only: c_int
+ integer (c_int), value :: n, d
+ end subroutine
+ end interface
+
+ interface acc_get_device_num
+ function acc_get_device_num (d)
+ & bind (C, name = "acc_get_device_num")
+ use iso_c_binding, only: c_int
+ integer (c_int) :: acc_get_device_num
+ integer (c_int), value :: d
+ end function
+ end interface
+
+ interface acc_async_test
+ function acc_async_test_h (a)
+ logical acc_async_test_h
+ integer a
+ end function
+ end interface
+
+ interface acc_async_test_all
+ function acc_async_test_all_h ()
+ logical acc_async_test_all_h
+ end function
+ end interface
+
+ interface acc_wait
+ subroutine acc_wait (a)
+ & bind (C, name = "acc_wait")
+ use iso_c_binding, only: c_int
+ integer (c_int), value :: a
+ end subroutine
+ end interface
+
+ interface acc_wait_async
+ subroutine acc_wait_async (a1, a2)
+ & bind (C, name = "acc_wait_async")
+ end subroutine
+ end interface
+
+ interface acc_wait_all
+ subroutine acc_wait_all ()
+ & bind (C, name = "acc_wait_all")
+ use iso_c_binding, only: c_int
+ end subroutine
+ end interface
+
+ interface acc_wait_all_async
+ subroutine acc_wait_all_async (a)
+ & bind (C, name = "acc_wait_all_async")
+ use iso_c_binding, only: c_int
+ integer (c_int), value :: a
+ end subroutine
+ end interface
+
+ interface acc_init
+ subroutine acc_init (d)
+ & bind (C, name = "acc_init")
+ use iso_c_binding, only: c_int
+ integer (c_int), value :: d
+ end subroutine
+ end interface
+
+ interface acc_shutdown
+ subroutine acc_shutdown (d)
+ & bind (C, name = "acc_shutdown")
+ use iso_c_binding, only: c_int
+ integer (c_int), value :: d
+ end subroutine
+ end interface
+
+ interface acc_on_device
+ function acc_on_device_h (devicetype)
+ import acc_device_kind
+ logical acc_on_device_h
+ integer (acc_device_kind) devicetype
+ end function
+ end interface
+
+ ! acc_malloc: Only available in C/C++
+ ! acc_free: Only available in C/C++
+
+ interface acc_copyin
+ subroutine acc_copyin_32_h (a, len)
+ use iso_c_binding, only: c_int32_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ end subroutine
+
+ subroutine acc_copyin_64_h (a, len)
+ use iso_c_binding, only: c_int64_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ end subroutine
+
+ subroutine acc_copyin_array_h (a)
+ type (*), dimension (..), contiguous :: a
+ end subroutine
+ end interface
+
+ interface acc_present_or_copyin
+ subroutine acc_present_or_copyin_32_h (a, len)
+ use iso_c_binding, only: c_int32_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ end subroutine
+
+ subroutine acc_present_or_copyin_64_h (a, len)
+ use iso_c_binding, only: c_int64_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ end subroutine
+
+ subroutine acc_present_or_copyin_array_h (a)
+ type (*), dimension (..), contiguous :: a
+ end subroutine
+ end interface
+
+ interface acc_pcopyin
+ subroutine acc_pcopyin_32_h (a, len)
+ use iso_c_binding, only: c_int32_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ end subroutine
+
+ subroutine acc_pcopyin_64_h (a, len)
+ use iso_c_binding, only: c_int64_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ end subroutine
+
+ subroutine acc_pcopyin_array_h (a)
+ type (*), dimension (..), contiguous :: a
+ end subroutine
+ end interface
+
+ interface acc_create
+ subroutine acc_create_32_h (a, len)
+ use iso_c_binding, only: c_int32_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ end subroutine
+
+ subroutine acc_create_64_h (a, len)
+ use iso_c_binding, only: c_int64_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ end subroutine
+
+ subroutine acc_create_array_h (a)
+ type (*), dimension (..), contiguous :: a
+ end subroutine
+ end interface
+
+ interface acc_present_or_create
+ subroutine acc_present_or_create_32_h (a, len)
+ use iso_c_binding, only: c_int32_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ end subroutine
+
+ subroutine acc_present_or_create_64_h (a, len)
+ use iso_c_binding, only: c_int64_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ end subroutine
+
+ subroutine acc_present_or_create_array_h (a)
+ type (*), dimension (..), contiguous :: a
+ end subroutine
+ end interface
+
+ interface acc_pcreate
+ subroutine acc_pcreate_32_h (a, len)
+ use iso_c_binding, only: c_int32_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ end subroutine
+
+ subroutine acc_pcreate_64_h (a, len)
+ use iso_c_binding, only: c_int64_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ end subroutine
+
+ subroutine acc_pcreate_array_h (a)
+ type (*), dimension (..), contiguous :: a
+ end subroutine
+ end interface
+
+ interface acc_copyout
+ subroutine acc_copyout_32_h (a, len)
+ use iso_c_binding, only: c_int32_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ end subroutine
+
+ subroutine acc_copyout_64_h (a, len)
+ use iso_c_binding, only: c_int64_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ end subroutine
+
+ subroutine acc_copyout_array_h (a)
+ type (*), dimension (..), contiguous :: a
+ end subroutine
+ end interface
+
+ interface acc_delete
+ subroutine acc_delete_32_h (a, len)
+ use iso_c_binding, only: c_int32_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ end subroutine
+
+ subroutine acc_delete_64_h (a, len)
+ use iso_c_binding, only: c_int64_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ end subroutine
+
+ subroutine acc_delete_array_h (a)
+ type (*), dimension (..), contiguous :: a
+ end subroutine
+ end interface
+
+ interface acc_update_device
+ subroutine acc_update_device_32_h (a, len)
+ use iso_c_binding, only: c_int32_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ end subroutine
+
+ subroutine acc_update_device_64_h (a, len)
+ use iso_c_binding, only: c_int64_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ end subroutine
+
+ subroutine acc_update_device_array_h (a)
+ type (*), dimension (..), contiguous :: a
+ end subroutine
+ end interface
+
+ interface acc_update_self
+ subroutine acc_update_self_32_h (a, len)
+ use iso_c_binding, only: c_int32_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ end subroutine
+
+ subroutine acc_update_self_64_h (a, len)
+ use iso_c_binding, only: c_int64_t
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ end subroutine
+
+ subroutine acc_update_self_array_h (a)
+ type (*), dimension (..), contiguous :: a
+ end subroutine
+ end interface
+
+ ! acc_map_data: Only available in C/C++
+ ! acc_unmap_data: Only available in C/C++
+ ! acc_deviceptr: Only available in C/C++
+ ! acc_ostptr: Only available in C/C++
+
+ interface acc_is_present
+ function acc_is_present_32_h (a, len)
+ use iso_c_binding, only: c_int32_t
+ logical acc_is_present_32_h
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int32_t) len
+ end function
+
+ function acc_is_present_64_h (a, len)
+ use iso_c_binding, only: c_int64_t
+ logical acc_is_present_64_h
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+ type (*), dimension (*) :: a
+ integer (c_int64_t) len
+ end function
+
+ function acc_is_present_array_h (a)
+ logical acc_is_present_array_h
+ type (*), dimension (..), contiguous :: a
+ end function
+ end interface
+
+ ! acc_memcpy_to_device: Only available in C/C++
+ ! acc_memcpy_from_device: Only available in C/C++
new file mode 100644
@@ -0,0 +1,47 @@
+# Plugins for offload execution, Makefile.am fragment.
+#
+# Copyright (C) 2014 Free Software Foundation, Inc.
+#
+# Contributed by Mentor Embedded.
+#
+# This file is part of the GNU OpenMP Library (libgomp).
+#
+# Libgomp is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+# more details.
+#
+# Under Section 7 of GPL version 3, you are granted additional
+# permissions described in the GCC Runtime Library Exception, version
+# 3.1, as published by the Free Software Foundation.
+#
+# You should have received a copy of the GNU General Public License and
+# a copy of the GCC Runtime Library Exception along with this program;
+# see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+# <http://www.gnu.org/licenses/>.
+
+if PLUGIN_NVPTX
+# Nvidia PTX OpenACC plugin.
+libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION)
+toolexeclib_LTLIBRARIES += libgomp-plugin-nvptx.la
+libgomp_plugin_nvptx_la_SOURCES = plugin/plugin-nvptx.c
+libgomp_plugin_nvptx_la_CPPFLAGS = $(AM_CPPFLAGS) $(PLUGIN_NVPTX_CPPFLAGS)
+libgomp_plugin_nvptx_la_LDFLAGS = $(libgomp_plugin_nvptx_version_info) \
+ $(lt_host_flags)
+libgomp_plugin_nvptx_la_LDFLAGS += $(PLUGIN_NVPTX_LDFLAGS)
+libgomp_plugin_nvptx_la_LIBADD = $(PLUGIN_NVPTX_LIBS)
+libgomp_plugin_nvptx_la_LIBTOOLFLAGS = --tag=disable-static
+endif
+
+libgomp_plugin_host_nonshm_version_info = -version-info $(libtool_VERSION)
+toolexeclib_LTLIBRARIES += libgomp-plugin-host_nonshm.la
+libgomp_plugin_host_nonshm_la_SOURCES = plugin/plugin-host.c
+libgomp_plugin_host_nonshm_la_CPPFLAGS = $(AM_CPPFLAGS) -DHOST_NONSHM_PLUGIN
+libgomp_plugin_host_nonshm_la_LDFLAGS = \
+ $(libgomp_plugin_host_nonshm_version_info) $(lt_host_flags)
+libgomp_plugin_host_nonshm_la_LIBTOOLFLAGS = --tag=disable-static
new file mode 100644
@@ -0,0 +1,107 @@
+# Plugins for offload execution, configure.ac fragment.
+#
+# Copyright (C) 2014 Free Software Foundation, Inc.
+#
+# Contributed by Mentor Embedded.
+#
+# This file is part of the GNU OpenMP Library (libgomp).
+#
+# Libgomp is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+# more details.
+#
+# Under Section 7 of GPL version 3, you are granted additional
+# permissions described in the GCC Runtime Library Exception, version
+# 3.1, as published by the Free Software Foundation.
+#
+# You should have received a copy of the GNU General Public License and
+# a copy of the GCC Runtime Library Exception along with this program;
+# see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+# <http://www.gnu.org/licenses/>.
+
+# Look for the CUDA driver package.
+CUDA_DRIVER_INCLUDE=
+CUDA_DRIVER_LIB=
+AC_SUBST(CUDA_DRIVER_INCLUDE)
+AC_SUBST(CUDA_DRIVER_LIB)
+CUDA_DRIVER_CPPFLAGS=
+CUDA_DRIVER_LDFLAGS=
+AC_ARG_WITH(cuda-driver,
+ [AS_HELP_STRING([--with-cuda-driver=PATH],
+ [specify prefix directory for installed CUDA driver package.
+ Equivalent to --with-cuda-driver-include=PATH/include
+ plus --with-cuda-driver-lib=PATH/lib])])
+AC_ARG_WITH(cuda-driver-include,
+ [AS_HELP_STRING([--with-cuda-driver-include=PATH],
+ [specify directory for installed CUDA driver include files])])
+AC_ARG_WITH(cuda-driver-lib,
+ [AS_HELP_STRING([--with-cuda-driver-lib=PATH],
+ [specify directory for the installed CUDA driver library])])
+if test "x$with_cuda_driver" != x; then
+ CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
+ CUDA_DRIVER_LIB=$with_cuda_driver/lib
+fi
+if test "x$with_cuda_driver_include" != x; then
+ CUDA_DRIVER_INCLUDE=$with_cuda_driver_include
+fi
+if test "x$with_cuda_driver_lib" != x; then
+ CUDA_DRIVER_LIB=$with_cuda_driver_lib
+fi
+if test "x$CUDA_DRIVER_INCLUDE" != x; then
+ CUDA_DRIVER_CPPFLAGS=-I$CUDA_DRIVER_INCLUDE
+fi
+if test "x$CUDA_DRIVER_LIB" != x; then
+ CUDA_DRIVER_LDFLAGS=-L$CUDA_DRIVER_LIB
+fi
+
+PLUGIN_NVPTX=0
+PLUGIN_NVPTX_CPPFLAGS=
+PLUGIN_NVPTX_LDFLAGS=
+PLUGIN_NVPTX_LIBS=
+AC_SUBST(PLUGIN_NVPTX)
+AC_SUBST(PLUGIN_NVPTX_CPPFLAGS)
+AC_SUBST(PLUGIN_NVPTX_LDFLAGS)
+AC_SUBST(PLUGIN_NVPTX_LIBS)
+
+for accel in `echo $enable_offload_targets | sed -e 's#,# #g'`; do
+ case "$accel" in
+ nvptx*)
+ PLUGIN_NVPTX=$accel
+ PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS
+ PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS
+ PLUGIN_NVPTX_LIBS='-lcuda'
+
+ PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS
+ CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS"
+ PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS
+ LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS"
+ PLUGIN_NVPTX_save_LIBS=$LIBS
+ LIBS="$PLUGIN_NVPTX_LIBS $LIBS"
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM(
+ [#include "cuda.h"],
+ [CUresult r = cuCtxPushCurrent (NULL);])],
+ [PLUGIN_NVPTX=1])
+ CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS
+ LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS
+ LIBS=$PLUGIN_NVPTX_save_LIBS
+ case $PLUGIN_NVPTX in
+ nvptx*)
+ PLUGIN_NVPTX=0
+ AC_MSG_ERROR([CUDA driver package required for nvptx support])
+ ;;
+ esac
+ ;;
+ esac
+done
+AM_CONDITIONAL([PLUGIN_NVPTX], [test $PLUGIN_NVPTX = 1])
+AC_DEFINE_UNQUOTED([PLUGIN_NVPTX], [$PLUGIN_NVPTX],
+ [Define to 1 if the NVIDIA plugin is built, 0 if not.])
+
+AC_OUTPUT
new file mode 100644
@@ -0,0 +1,269 @@
+/* OpenACC Runtime Library: acc_device_host, acc_device_host_nonshm.
+
+ Copyright (C) 2013 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Simple implementation of support routines for a shared-memory
+ acc_device_host, and a non-shared memory acc_device_host_nonshm, with the
+ latter built as a plugin. */
+
+#include "openacc.h"
+#include "config.h"
+#include "libgomp.h"
+#include "libgomp_target.h"
+#ifdef HOST_NONSHM_PLUGIN
+#include "libgomp-plugin.h"
+#include "oacc-plugin.h"
+#else
+#include "oacc-int.h"
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#ifdef HOST_NONSHM_PLUGIN
+#define STATIC
+#define GOMP(X) GOMP_PLUGIN_##X
+#define SELF "host_nonshm plugin: "
+#else
+#define STATIC static
+#define GOMP(X) gomp_##X
+#define SELF "host: "
+#endif
+
+#ifndef HOST_NONSHM_PLUGIN
+static struct gomp_device_descr host_dispatch;
+#endif
+
+STATIC const char *
+GOMP_OFFLOAD_get_name (void)
+{
+#ifdef HOST_NONSHM_PLUGIN
+ return "host_nonshm";
+#else
+ return "host";
+#endif
+}
+
+STATIC int
+GOMP_OFFLOAD_get_type (void)
+{
+#ifdef HOST_NONSHM_PLUGIN
+ return OFFLOAD_TARGET_TYPE_HOST_NONSHM;
+#else
+ return OFFLOAD_TARGET_TYPE_HOST;
+#endif
+}
+
+STATIC unsigned int
+GOMP_OFFLOAD_get_caps (void)
+{
+ unsigned int caps = TARGET_CAP_OPENACC_200 | TARGET_CAP_NATIVE_EXEC;
+
+#ifndef HOST_NONSHM_PLUGIN
+ caps |= TARGET_CAP_SHARED_MEM;
+#endif
+
+ return caps;
+}
+
+STATIC int
+GOMP_OFFLOAD_get_num_devices (void)
+{
+ return 1;
+}
+
+STATIC void
+GOMP_OFFLOAD_register_image (void *host_table __attribute__((unused)),
+ void *target_data __attribute__((unused)))
+{
+}
+
+STATIC void
+GOMP_OFFLOAD_init_device (int n __attribute__((unused)))
+{
+}
+
+STATIC void
+GOMP_OFFLOAD_fini_device (int n __attribute__((unused)))
+{
+}
+
+STATIC int
+GOMP_OFFLOAD_get_table (int n __attribute__((unused)),
+ struct mapping_table **table __attribute__((unused)))
+{
+ return 0;
+}
+
+STATIC void *
+GOMP_OFFLOAD_openacc_open_device (int n)
+{
+ return (void *) (intptr_t) n;
+}
+
+STATIC int
+GOMP_OFFLOAD_openacc_close_device (void *hnd)
+{
+ return 0;
+}
+
+STATIC int
+GOMP_OFFLOAD_openacc_get_device_num (void)
+{
+ return 0;
+}
+
+STATIC void
+GOMP_OFFLOAD_openacc_set_device_num (int n)
+{
+ if (n > 0)
+ GOMP(fatal) ("device number %u out of range for host execution", n);
+}
+
+STATIC void *
+GOMP_OFFLOAD_alloc (int n __attribute__((unused)), size_t s)
+{
+ return GOMP(malloc) (s);
+}
+
+STATIC void
+GOMP_OFFLOAD_free (int n __attribute__((unused)), void *p)
+{
+ free (p);
+}
+
+STATIC void *
+GOMP_OFFLOAD_host2dev (int n __attribute__((unused)), void *d, const void *h,
+ size_t s)
+{
+#ifdef HOST_NONSHM_PLUGIN
+ memcpy (d, h, s);
+#endif
+
+ return 0;
+}
+
+STATIC void *
+GOMP_OFFLOAD_dev2host (int n __attribute__((unused)), void *h, const void *d,
+ size_t s)
+{
+#ifdef HOST_NONSHM_PLUGIN
+ memcpy (h, d, s);
+#endif
+
+ return 0;
+}
+
+STATIC void
+GOMP_OFFLOAD_run (int n __attribute__((unused)), void *fn_ptr, void *vars)
+{
+ void (*fn)(void *) = (void (*)(void *)) fn_ptr;
+
+ fn (vars);
+}
+
+STATIC void
+GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *),
+ size_t mapnum __attribute__((unused)),
+ void **hostaddrs,
+ void **devaddrs __attribute__((unused)),
+ size_t *sizes __attribute__((unused)),
+ unsigned short *kinds __attribute__((unused)),
+ int num_gangs __attribute__((unused)),
+ int num_workers __attribute__((unused)),
+ int vector_length __attribute__((unused)),
+ int async __attribute__((unused)),
+ void *targ_mem_desc __attribute__((unused)))
+{
+#ifdef HOST_NONSHM_PLUGIN
+ fn (devaddrs);
+#else
+ fn (hostaddrs);
+#endif
+}
+
+STATIC void
+GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
+{
+#ifdef HOST_NONSHM_PLUGIN
+ /* "Asynchronous" launches are executed synchronously on the (non-SHM) host,
+ so there's no point in delaying host-side cleanup -- just do it now. */
+ GOMP_PLUGIN_async_unmap_vars (targ_mem_desc);
+#endif
+}
+
+STATIC void
+GOMP_OFFLOAD_openacc_async_set_async (int async __attribute__((unused)))
+{
+}
+
+STATIC int
+GOMP_OFFLOAD_openacc_async_test (int async __attribute__((unused)))
+{
+ return 1;
+}
+
+STATIC int
+GOMP_OFFLOAD_openacc_async_test_all (void)
+{
+ return 1;
+}
+
+STATIC void
+GOMP_OFFLOAD_openacc_async_wait (int async __attribute__((unused)))
+{
+}
+
+STATIC void
+GOMP_OFFLOAD_openacc_async_wait_all (void)
+{
+}
+
+STATIC void
+GOMP_OFFLOAD_openacc_async_wait_async (int async1 __attribute__((unused)),
+ int async2 __attribute__((unused)))
+{
+}
+
+STATIC void
+GOMP_OFFLOAD_openacc_async_wait_all_async (int async __attribute__((unused)))
+{
+}
+
+STATIC void *
+GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data
+ __attribute__((unused)))
+{
+ return NULL;
+}
+
+STATIC void
+GOMP_OFFLOAD_openacc_destroy_thread_data (void *tls_data
+ __attribute__((unused)))
+{
+}
new file mode 100644
@@ -0,0 +1,1852 @@
+/* Plugin for NVPTX execution.
+
+ Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Nvidia PTX-specific parts of OpenACC support. The cuda driver
+ library appears to hold some implicit state, but the documentation
+ is not clear as to what that state might be. Or how one might
+ propagate it from one thread to another. */
+
+#include "openacc.h"
+#include "config.h"
+#include "libgomp.h"
+#include "libgomp_target.h"
+#include "libgomp-plugin.h"
+#include "oacc-plugin.h"
+
+#include <cuda.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <dlfcn.h>
+#include <unistd.h>
+#include <assert.h>
+
+#define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0]))
+
+static struct
+{
+ CUresult r;
+ char *m;
+} cuda_errlist[]=
+{
+ { CUDA_ERROR_INVALID_VALUE, "invalid value" },
+ { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" },
+ { CUDA_ERROR_NOT_INITIALIZED, "not initialized" },
+ { CUDA_ERROR_DEINITIALIZED, "deinitialized" },
+ { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" },
+ { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" },
+ { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" },
+ { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" },
+ { CUDA_ERROR_NO_DEVICE, "no device" },
+ { CUDA_ERROR_INVALID_DEVICE, "invalid device" },
+ { CUDA_ERROR_INVALID_IMAGE, "invalid image" },
+ { CUDA_ERROR_INVALID_CONTEXT, "invalid context" },
+ { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" },
+ { CUDA_ERROR_MAP_FAILED, "map error" },
+ { CUDA_ERROR_UNMAP_FAILED, "unmap error" },
+ { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" },
+ { CUDA_ERROR_ALREADY_MAPPED, "already mapped" },
+ { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" },
+ { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" },
+ { CUDA_ERROR_NOT_MAPPED, "not mapped" },
+ { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" },
+ { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" },
+ { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" },
+ { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" },
+ { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" },
+ { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" },
+ { CUDA_ERROR_INVALID_SOURCE, "invalid source" },
+ { CUDA_ERROR_FILE_NOT_FOUND, "file not found" },
+ { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ "shared object symbol not found" },
+ { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" },
+ { CUDA_ERROR_OPERATING_SYSTEM, "operating system" },
+ { CUDA_ERROR_INVALID_HANDLE, "invalid handle" },
+ { CUDA_ERROR_NOT_FOUND, "not found" },
+ { CUDA_ERROR_NOT_READY, "not ready" },
+ { CUDA_ERROR_LAUNCH_FAILED, "launch error" },
+ { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" },
+ { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" },
+ { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ "launch incompatibe texturing" },
+ { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" },
+ { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled " },
+ { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary cotext active" },
+ { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" },
+ { CUDA_ERROR_ASSERT, "assert" },
+ { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" },
+ { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
+ "host memory already registered" },
+ { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" },
+ { CUDA_ERROR_NOT_PERMITTED, "not permitted" },
+ { CUDA_ERROR_NOT_SUPPORTED, "not supported" },
+ { CUDA_ERROR_UNKNOWN, "unknown" }
+};
+
+static char errmsg[128];
+
+static char *
+cuda_error (CUresult r)
+{
+ int i;
+
+ for (i = 0; i < ARRAYSIZE (cuda_errlist); i++)
+ {
+ if (cuda_errlist[i].r == r)
+ return &cuda_errlist[i].m[0];
+ }
+
+ sprintf (&errmsg[0], "unknown result code: %5d", r);
+
+ return &errmsg[0];
+}
+
+struct targ_fn_descriptor
+{
+ CUfunction fn;
+ const char *name;
+};
+
+static bool ptx_inited = false;
+
+struct ptx_stream
+{
+ CUstream stream;
+ pthread_t host_thread;
+ bool multithreaded;
+
+ CUdeviceptr d;
+ void *h;
+ void *h_begin;
+ void *h_end;
+ void *h_next;
+ void *h_prev;
+ void *h_tail;
+
+ struct ptx_stream *next;
+};
+
+/* Thread-specific data for PTX. */
+
+struct nvptx_thread
+{
+ struct ptx_stream *current_stream;
+ struct ptx_device *ptx_dev;
+};
+
+struct map
+{
+ int async;
+ size_t size;
+ char mappings[0];
+};
+
+static void
+map_init (struct ptx_stream *s)
+{
+ CUresult r;
+
+ int size = getpagesize ();
+
+ assert (s);
+ assert (!s->d);
+ assert (!s->h);
+
+ r = cuMemAllocHost (&s->h, size);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));
+
+ r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));
+
+ assert (s->h);
+
+ s->h_begin = s->h;
+ s->h_end = s->h_begin + size;
+ s->h_next = s->h_prev = s->h_tail = s->h_begin;
+
+ assert (s->h_next);
+ assert (s->h_end);
+}
+
+static void
+map_fini (struct ptx_stream *s)
+{
+ CUresult r;
+
+ r = cuMemFreeHost (s->h);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
+}
+
+static void
+map_pop (struct ptx_stream *s)
+{
+ struct map *m;
+
+ assert (s != NULL);
+ assert (s->h_next);
+ assert (s->h_prev);
+ assert (s->h_tail);
+
+ m = s->h_tail;
+
+ s->h_tail += m->size;
+
+ if (s->h_tail >= s->h_end)
+ s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
+
+ if (s->h_next == s->h_tail)
+ s->h_prev = s->h_next;
+
+ assert (s->h_next >= s->h_begin);
+ assert (s->h_tail >= s->h_begin);
+ assert (s->h_prev >= s->h_begin);
+
+ assert (s->h_next <= s->h_end);
+ assert (s->h_tail <= s->h_end);
+ assert (s->h_prev <= s->h_end);
+}
+
+static void
+map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
+{
+ int left;
+ int offset;
+ struct map *m;
+
+ assert (s != NULL);
+
+ left = s->h_end - s->h_next;
+ size += sizeof (struct map);
+
+ assert (s->h_prev);
+ assert (s->h_next);
+
+ if (size >= left)
+ {
+ m = s->h_prev;
+ m->size += left;
+ s->h_next = s->h_begin;
+
+ if (s->h_next + size > s->h_end)
+ GOMP_PLUGIN_fatal ("unable to push map");
+ }
+
+ assert (s->h_next);
+
+ m = s->h_next;
+ m->async = async;
+ m->size = size;
+
+ offset = (void *)&m->mappings[0] - s->h;
+
+ *d = (void *)(s->d + offset);
+ *h = (void *)(s->h + offset);
+
+ s->h_prev = s->h_next;
+ s->h_next += size;
+
+ assert (s->h_prev);
+ assert (s->h_next);
+
+ assert (s->h_next >= s->h_begin);
+ assert (s->h_tail >= s->h_begin);
+ assert (s->h_prev >= s->h_begin);
+ assert (s->h_next <= s->h_end);
+ assert (s->h_tail <= s->h_end);
+ assert (s->h_prev <= s->h_end);
+
+ return;
+}
+
+struct ptx_device
+{
+ CUcontext ctx;
+ bool ctx_shared;
+ CUdevice dev;
+ struct ptx_stream *null_stream;
+ /* All non-null streams associated with this device (actually context),
+ either created implicitly or passed in from the user (via
+ acc_set_cuda_stream). */
+ struct ptx_stream *active_streams;
+ struct {
+ struct ptx_stream **arr;
+ int size;
+ } async_streams;
+ /* A lock for use when manipulating the above stream list and array. */
+ gomp_mutex_t stream_lock;
+ int ord;
+ bool overlap;
+ bool map;
+ bool concur;
+ int mode;
+ bool mkern;
+
+ struct ptx_device *next;
+};
+
+enum PTX_event_type
+{
+ PTX_EVT_MEM,
+ PTX_EVT_KNL,
+ PTX_EVT_SYNC,
+ PTX_EVT_ASYNC_CLEANUP
+};
+
+struct PTX_event
+{
+ CUevent *evt;
+ int type;
+ void *addr;
+ int ord;
+
+ struct PTX_event *next;
+};
+
+static gomp_mutex_t PTX_event_lock;
+static struct PTX_event *PTX_events;
+
+#define _XSTR(s) _STR(s)
+#define _STR(s) #s
+
+static struct _synames
+{
+ char *n;
+} cuSymNames[] =
+{
+ { _XSTR (cuCtxCreate) },
+ { _XSTR (cuCtxDestroy) },
+ { _XSTR (cuCtxGetCurrent) },
+ { _XSTR (cuCtxPushCurrent) },
+ { _XSTR (cuCtxSynchronize) },
+ { _XSTR (cuDeviceGet) },
+ { _XSTR (cuDeviceGetAttribute) },
+ { _XSTR (cuDeviceGetCount) },
+ { _XSTR (cuEventCreate) },
+ { _XSTR (cuEventDestroy) },
+ { _XSTR (cuEventQuery) },
+ { _XSTR (cuEventRecord) },
+ { _XSTR (cuInit) },
+ { _XSTR (cuLaunchKernel) },
+ { _XSTR (cuLinkAddData) },
+ { _XSTR (cuLinkComplete) },
+ { _XSTR (cuLinkCreate) },
+ { _XSTR (cuMemAlloc) },
+ { _XSTR (cuMemAllocHost) },
+ { _XSTR (cuMemcpy) },
+ { _XSTR (cuMemcpyDtoH) },
+ { _XSTR (cuMemcpyDtoHAsync) },
+ { _XSTR (cuMemcpyHtoD) },
+ { _XSTR (cuMemcpyHtoDAsync) },
+ { _XSTR (cuMemFree) },
+ { _XSTR (cuMemFreeHost) },
+ { _XSTR (cuMemGetAddressRange) },
+ { _XSTR (cuMemHostGetDevicePointer) },
+ { _XSTR (cuMemHostRegister) },
+ { _XSTR (cuMemHostUnregister) },
+ { _XSTR (cuModuleGetFunction) },
+ { _XSTR (cuModuleLoadData) },
+ { _XSTR (cuStreamDestroy) },
+ { _XSTR (cuStreamQuery) },
+ { _XSTR (cuStreamSynchronize) },
+ { _XSTR (cuStreamWaitEvent) }
+};
+
+static int
+verify_device_library (void)
+{
+ int i;
+ void *dh, *ds;
+
+ dh = dlopen ("libcuda.so", RTLD_LAZY);
+ if (!dh)
+ return -1;
+
+ for (i = 0; i < ARRAYSIZE (cuSymNames); i++)
+ {
+ ds = dlsym (dh, cuSymNames[i].n);
+ if (!ds)
+ return -1;
+ }
+
+ dlclose (dh);
+
+ return 0;
+}
+
+static inline struct nvptx_thread *
+nvptx_thread (void)
+{
+ return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
+}
+
+static void
+init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
+{
+ int i;
+ struct ptx_stream *null_stream
+ = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
+
+ null_stream->stream = NULL;
+ null_stream->host_thread = pthread_self ();
+ null_stream->multithreaded = true;
+ null_stream->d = (CUdeviceptr) NULL;
+ null_stream->h = NULL;
+ map_init (null_stream);
+ ptx_dev->null_stream = null_stream;
+
+ ptx_dev->active_streams = NULL;
+ GOMP_PLUGIN_mutex_init (&ptx_dev->stream_lock);
+
+ if (concurrency < 1)
+ concurrency = 1;
+
+ /* This is just a guess -- make space for as many async streams as the
+ current device is capable of concurrently executing. This can grow
+ later as necessary. No streams are created yet. */
+ ptx_dev->async_streams.arr
+ = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
+ ptx_dev->async_streams.size = concurrency;
+
+ for (i = 0; i < concurrency; i++)
+ ptx_dev->async_streams.arr[i] = NULL;
+}
+
+static void
+fini_streams_for_device (struct ptx_device *ptx_dev)
+{
+ free (ptx_dev->async_streams.arr);
+
+ while (ptx_dev->active_streams != NULL)
+ {
+ struct ptx_stream *s = ptx_dev->active_streams;
+ ptx_dev->active_streams = ptx_dev->active_streams->next;
+
+ cuStreamDestroy (s->stream);
+ map_fini (s);
+ free (s);
+ }
+
+ map_fini (ptx_dev->null_stream);
+ free (ptx_dev->null_stream);
+}
+
+/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
+ thread THREAD (and also current device/context). If CREATE is true, create
+ the stream if it does not exist (or use EXISTING if it is non-NULL), and
+ associate the stream with the same thread argument. Returns stream to use
+ as result. */
+
+static struct ptx_stream *
+select_stream_for_async (int async, pthread_t thread, bool create,
+ CUstream existing)
+{
+ struct nvptx_thread *nvthd = nvptx_thread ();
+ /* Local copy of TLS variable. */
+ struct ptx_device *ptx_dev = nvthd->ptx_dev;
+ struct ptx_stream *stream = NULL;
+ int orig_async = async;
+
+ /* The special value acc_async_noval (-1) maps (for now) to an
+ implicitly-created stream, which is then handled the same as any other
+ numbered async stream. Other options are available, e.g. using the null
+ stream for anonymous async operations, or choosing an idle stream from an
+ active set. But, stick with this for now. */
+ if (async > acc_async_sync)
+ async++;
+
+ if (create)
+ GOMP_PLUGIN_mutex_lock (&ptx_dev->stream_lock);
+
+ /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
+ null stream, and in fact better performance may be obtainable if it doesn't
+ (because the null stream enforces overly-strict synchronisation with
+ respect to other streams for legacy reasons, and that's probably not
+ needed with OpenACC). Maybe investigate later. */
+ if (async == acc_async_sync)
+ stream = ptx_dev->null_stream;
+ else if (async >= 0 && async < ptx_dev->async_streams.size
+ && ptx_dev->async_streams.arr[async] && !(create && existing))
+ stream = ptx_dev->async_streams.arr[async];
+ else if (async >= 0 && create)
+ {
+ if (async >= ptx_dev->async_streams.size)
+ {
+ int i, newsize = ptx_dev->async_streams.size * 2;
+
+ if (async >= newsize)
+ newsize = async + 1;
+
+ ptx_dev->async_streams.arr
+ = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
+ newsize * sizeof (struct ptx_stream *));
+
+ for (i = ptx_dev->async_streams.size; i < newsize; i++)
+ ptx_dev->async_streams.arr[i] = NULL;
+
+ ptx_dev->async_streams.size = newsize;
+ }
+
+ /* Create a new stream on-demand if there isn't one already, or if we're
+ setting a particular async value to an existing (externally-provided)
+ stream. */
+ if (!ptx_dev->async_streams.arr[async] || existing)
+ {
+ CUresult r;
+ struct ptx_stream *s
+ = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
+
+ if (existing)
+ s->stream = existing;
+ else
+ {
+ r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
+ }
+
+ /* If CREATE is true, we're going to be queueing some work on this
+ stream. Associate it with the current host thread. */
+ s->host_thread = thread;
+ s->multithreaded = false;
+
+ s->d = (CUdeviceptr) NULL;
+ s->h = NULL;
+ map_init (s);
+
+ s->next = ptx_dev->active_streams;
+ ptx_dev->active_streams = s;
+ ptx_dev->async_streams.arr[async] = s;
+ }
+
+ stream = ptx_dev->async_streams.arr[async];
+ }
+ else if (async < 0)
+ GOMP_PLUGIN_fatal ("bad async %d", async);
+
+ if (create)
+ {
+ assert (stream != NULL);
+
+ /* If we're trying to use the same stream from different threads
+ simultaneously, set stream->multithreaded to true. This affects the
+ behaviour of acc_async_test_all and acc_wait_all, which are supposed to
+ only wait for asynchronous launches from the same host thread they are
+ invoked on. If multiple threads use the same async value, we make note
+ of that here and fall back to testing/waiting for all threads in those
+ functions. */
+ if (thread != stream->host_thread)
+ stream->multithreaded = true;
+
+ GOMP_PLUGIN_mutex_unlock (&ptx_dev->stream_lock);
+ }
+ else if (stream && !stream->multithreaded
+ && !pthread_equal (stream->host_thread, thread))
+ GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
+
+ return stream;
+}
+
+static int PTX_get_num_devices (void);
+
+/* Initialize the device. */
+static int
+PTX_init (void)
+{
+ CUresult r;
+ int rc;
+
+ if (ptx_inited)
+ return PTX_get_num_devices ();
+
+ rc = verify_device_library ();
+ if (rc < 0)
+ return -1;
+
+ r = cuInit (0);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));
+
+ PTX_events = NULL;
+
+ GOMP_PLUGIN_mutex_init (&PTX_event_lock);
+
+ ptx_inited = true;
+
+ return PTX_get_num_devices ();
+}
+
+static void
+PTX_fini (void)
+{
+ ptx_inited = false;
+}
+
+static void *
+PTX_open_device (int n)
+{
+ struct ptx_device *ptx_dev;
+ CUdevice dev;
+ CUresult r;
+ int async_engines, pi;
+
+ r = cuDeviceGet (&dev, n);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));
+
+ ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
+
+ ptx_dev->ord = n;
+ ptx_dev->dev = dev;
+ ptx_dev->ctx_shared = false;
+
+ r = cuCtxGetCurrent (&ptx_dev->ctx);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
+
+ if (!ptx_dev->ctx)
+ {
+ r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
+ }
+ else
+ ptx_dev->ctx_shared = true;
+
+ r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
+
+ ptx_dev->overlap = pi;
+
+ r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
+
+ ptx_dev->map = pi;
+
+ r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
+
+ ptx_dev->concur = pi;
+
+ r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
+
+ ptx_dev->mode = pi;
+
+ r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
+
+ ptx_dev->mkern = pi;
+
+ r = cuDeviceGetAttribute (&async_engines,
+ CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
+ if (r != CUDA_SUCCESS)
+ async_engines = 1;
+
+ init_streams_for_device (ptx_dev, async_engines);
+
+ return (void *) ptx_dev;
+}
+
+static int
+PTX_close_device (void *targ_data)
+{
+ CUresult r;
+ struct ptx_device *ptx_dev = targ_data;
+
+ if (!ptx_dev)
+ return 0;
+
+ fini_streams_for_device (ptx_dev);
+
+ if (!ptx_dev->ctx_shared)
+ {
+ r = cuCtxDestroy (ptx_dev->ctx);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
+ }
+
+ free (ptx_dev);
+
+ return 0;
+}
+
+static int
+PTX_get_num_devices (void)
+{
+ int n;
+ CUresult r;
+
+ /* This function will be called before the plugin has been initialized in
+ order to enumerate available devices, but CUDA API routines can't be used
+ until cuInit has been called. Just call it now (but don't yet do any
+ further initialization). */
+ if (!ptx_inited)
+ cuInit (0);
+
+ r = cuDeviceGetCount (&n);
+ if (r!= CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));
+
+ return n;
+}
+
+#define ABORT_PTX \
+ ".version 3.1\n" \
+ ".target sm_30\n" \
+ ".address_size 64\n" \
+ ".visible .func abort;\n" \
+ ".visible .func abort\n" \
+ "{\n" \
+ "trap;\n" \
+ "ret;\n" \
+ "}\n" \
+ ".visible .func _gfortran_abort;\n" \
+ ".visible .func _gfortran_abort\n" \
+ "{\n" \
+ "trap;\n" \
+ "ret;\n" \
+ "}\n" \
+
+/* Generated with:
+
+ $ echo 'int acc_on_device(int d) { return __builtin_acc_on_device(d); } int acc_on_device_(int *d) { return acc_on_device(*d); }' | accel-gcc/xgcc -Baccel-gcc -x c - -o - -S -m64 -O3 -fno-builtin-acc_on_device -fno-inline
+*/
+#define ACC_ON_DEVICE_PTX \
+ " .version 3.1\n" \
+ " .target sm_30\n" \
+ " .address_size 64\n" \
+ ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1);\n" \
+ ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1)\n" \
+ "{\n" \
+ " .reg.u32 %ar1;\n" \
+ ".reg.u32 %retval;\n" \
+ " .reg.u64 %hr10;\n" \
+ " .reg.u32 %r24;\n" \
+ " .reg.u32 %r25;\n" \
+ " .reg.pred %r27;\n" \
+ " .reg.u32 %r30;\n" \
+ " ld.param.u32 %ar1, [%in_ar1];\n" \
+ " mov.u32 %r24, %ar1;\n" \
+ " setp.ne.u32 %r27,%r24,4;\n" \
+ " set.u32.eq.u32 %r30,%r24,5;\n" \
+ " neg.s32 %r25, %r30;\n" \
+ " @%r27 bra $L3;\n" \
+ " mov.u32 %r25, 1;\n" \
+ "$L3:\n" \
+ " mov.u32 %retval, %r25;\n" \
+ " st.param.u32 [%out_retval], %retval;\n" \
+ " ret;\n" \
+ " }\n" \
+ ".visible .func (.param.u32 %out_retval)acc_on_device_(.param.u64 %in_ar1);\n" \
+ ".visible .func (.param.u32 %out_retval)acc_on_device_(.param.u64 %in_ar1)\n" \
+ "{\n" \
+ " .reg.u64 %ar1;\n" \
+ ".reg.u32 %retval;\n" \
+ " .reg.u64 %hr10;\n" \
+ " .reg.u64 %r25;\n" \
+ " .reg.u32 %r26;\n" \
+ " .reg.u32 %r27;\n" \
+ " ld.param.u64 %ar1, [%in_ar1];\n" \
+ " mov.u64 %r25, %ar1;\n" \
+ " ld.u32 %r26, [%r25];\n" \
+ " {\n" \
+ " .param.u32 %retval_in;\n" \
+ " {\n" \
+ " .param.u32 %out_arg0;\n" \
+ " st.param.u32 [%out_arg0], %r26;\n" \
+ " call (%retval_in), acc_on_device, (%out_arg0);\n" \
+ " }\n" \
+ " ld.param.u32 %r27, [%retval_in];\n" \
+ "}\n" \
+ " mov.u32 %retval, %r27;\n" \
+ " st.param.u32 [%out_retval], %retval;\n" \
+ " ret;\n" \
+ " }"
+
+static void
+link_ptx (CUmodule *module, char *ptx_code)
+{
+ CUjit_option opts[7];
+ void *optvals[7];
+ float elapsed = 0.0;
+#define LOGSIZE 8192
+ char elog[LOGSIZE];
+ char ilog[LOGSIZE];
+ unsigned long logsize = LOGSIZE;
+ CUlinkState linkstate;
+ CUresult r;
+ void *linkout;
+ size_t linkoutsize __attribute__((unused));
+
+ GOMP_PLUGIN_notify ("attempting to load:\n---\n%s\n---\n", ptx_code);
+
+ opts[0] = CU_JIT_WALL_TIME;
+ optvals[0] = &elapsed;
+
+ opts[1] = CU_JIT_INFO_LOG_BUFFER;
+ optvals[1] = &ilog[0];
+
+ opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
+ optvals[2] = (void *) logsize;
+
+ opts[3] = CU_JIT_ERROR_LOG_BUFFER;
+ optvals[3] = &elog[0];
+
+ opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
+ optvals[4] = (void *) logsize;
+
+ opts[5] = CU_JIT_LOG_VERBOSE;
+ optvals[5] = (void *) 1;
+
+ opts[6] = CU_JIT_TARGET;
+ optvals[6] = (void *) CU_TARGET_COMPUTE_30;
+
+ r = cuLinkCreate (7, opts, optvals, &linkstate);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));
+
+ char *abort_ptx = ABORT_PTX;
+ r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
+ strlen (abort_ptx) + 1, 0, 0, 0, 0);
+ if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
+ GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r));
+ }
+
+ char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
+ r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
+ strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
+ if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
+ GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
+ cuda_error (r));
+ }
+
+ r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code,
+ strlen (ptx_code) + 1, 0, 0, 0, 0);
+ if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
+ GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
+ }
+
+ r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));
+
+ GOMP_PLUGIN_notify ("Link complete: %fms\n", elapsed);
+ GOMP_PLUGIN_notify ("Link log %s\n", &ilog[0]);
+
+ r = cuModuleLoadData (module, linkout);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
+}
+
+static void
+event_gc (bool memmap_lockable)
+{
+ struct PTX_event *e = PTX_events;
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ GOMP_PLUGIN_mutex_lock (&PTX_event_lock);
+
+ while (e != NULL)
+ {
+ CUresult r;
+
+ if (e->ord != nvthd->ptx_dev->ord)
+ {
+ e = e->next;
+ continue;
+ }
+
+ r = cuEventQuery (*e->evt);
+ if (r == CUDA_SUCCESS)
+ {
+ CUevent *te;
+
+ te = e->evt;
+
+ switch (e->type)
+ {
+ case PTX_EVT_MEM:
+ case PTX_EVT_SYNC:
+ break;
+
+ case PTX_EVT_KNL:
+ map_pop (e->addr);
+ break;
+
+ case PTX_EVT_ASYNC_CLEANUP:
+ {
+ /* The function gomp_plugin_async_unmap_vars needs to claim the
+ memory-map splay tree lock for the current device, so we
+ can't call it when one of our callers has already claimed
+ the lock. In that case, just delay the GC for this event
+ until later. */
+ if (!memmap_lockable)
+ {
+ e = e->next;
+ continue;
+ }
+
+ GOMP_PLUGIN_async_unmap_vars (e->addr);
+ }
+ break;
+ }
+
+ cuEventDestroy (*te);
+ free ((void *)te);
+
+ struct PTX_event *next = e->next;
+
+ if (PTX_events == e)
+ PTX_events = PTX_events->next;
+ else
+ {
+ struct PTX_event *e_ = PTX_events;
+ while (e_->next != e)
+ e_ = e_->next;
+ e_->next = e_->next->next;
+ }
+
+ free (e);
+ e = next;
+ }
+ else
+ e = e->next;
+ }
+
+ GOMP_PLUGIN_mutex_unlock (&PTX_event_lock);
+}
+
+static void
+event_add (enum PTX_event_type type, CUevent *e, void *h)
+{
+ struct PTX_event *ptx_event;
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
+ || type == PTX_EVT_ASYNC_CLEANUP);
+
+ ptx_event = GOMP_PLUGIN_malloc (sizeof (struct PTX_event));
+ ptx_event->type = type;
+ ptx_event->evt = e;
+ ptx_event->addr = h;
+ ptx_event->ord = nvthd->ptx_dev->ord;
+
+ GOMP_PLUGIN_mutex_lock (&PTX_event_lock);
+
+ ptx_event->next = PTX_events;
+ PTX_events = ptx_event;
+
+ GOMP_PLUGIN_mutex_unlock (&PTX_event_lock);
+}
+
+void
+PTX_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
+ size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers,
+ int vector_length, int async, void *targ_mem_desc)
+{
+ struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
+ CUfunction function;
+ CUresult r;
+ int i;
+ struct ptx_stream *dev_str;
+ void *kargs[1];
+ void *hp, *dp;
+ unsigned int nthreads_in_block;
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ function = targ_fn->fn;
+
+ dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
+ assert (dev_str == nvthd->current_stream);
+
+ /* This reserves a chunk of a pre-allocated page of memory mapped on both
+ the host and the device. HP is a host pointer to the new chunk, and DP is
+ the corresponding device pointer. */
+ map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
+
+ GOMP_PLUGIN_notify (" %s: prepare mappings\n", __FUNCTION__);
+
+ /* Copy the array of arguments to the mapped page. */
+ for (i = 0; i < mapnum; i++)
+ ((void **) hp)[i] = devaddrs[i];
+
+ /* Copy the (device) pointers to arguments to the device (dp and hp might in
+ fact have the same value on a unified-memory system). */
+ r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));
+
+ GOMP_PLUGIN_notify (" %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name);
+
+ // XXX: possible geometry mappings??
+ //
+ // OpenACC CUDA
+ //
+ // num_gangs blocks
+ // num_workers warps (where a warp is equivalent to 32 threads)
+ // vector length threads
+ //
+
+ /* The openacc vector_length clause 'determines the vector length to use for
+ vector or SIMD operations'. The question is how to map this to CUDA.
+
+ In CUDA, the warp size is the vector length of a CUDA device. However, the
+ CUDA interface abstracts away from that, and only shows us warp size
+ indirectly in maximum number of threads per block, which is a product of
+ warp size and the number of hyperthreads of a multiprocessor.
+
+ We choose to map openacc vector_length directly onto the number of threads
+ in a block, in the x dimension. This is reflected in gcc code generation
+ that uses ThreadIdx.x to access vector elements.
+
+ Attempting to use an openacc vector_length of more than the maximum number
+ of threads per block will result in a cuda error. */
+ nthreads_in_block = vector_length;
+
+ kargs[0] = &dp;
+ r = cuLaunchKernel (function,
+ 1, 1, 1,
+ nthreads_in_block, 1, 1,
+ 0, dev_str->stream, kargs, 0);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
+
+#ifndef DISABLE_ASYNC
+ if (async < acc_async_noval)
+ {
+ r = cuStreamSynchronize (dev_str->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
+ }
+ else
+ {
+ CUevent *e;
+
+ e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
+
+ r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
+
+ event_gc (true);
+
+ r = cuEventRecord (*e, dev_str->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
+
+ event_add (PTX_EVT_KNL, e, (void *)dev_str);
+ }
+#else
+ r = cuCtxSynchronize ();
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
+#endif
+
+ GOMP_PLUGIN_notify (" %s: kernel %s: finished\n", __FUNCTION__,
+ targ_fn->name);
+
+#ifndef DISABLE_ASYNC
+ if (async < acc_async_noval)
+#endif
+ map_pop (dev_str);
+}
+
+void * openacc_get_current_cuda_context (void);
+
+static void *
+PTX_alloc (size_t s)
+{
+ CUdeviceptr d;
+ CUresult r;
+
+ r = cuMemAlloc (&d, s);
+ if (r == CUDA_ERROR_OUT_OF_MEMORY)
+ return 0;
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
+ return (void *)d;
+}
+
+static void
+PTX_free (void *p)
+{
+ CUresult r;
+ CUdeviceptr pb;
+ size_t ps;
+
+ r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
+
+ if ((CUdeviceptr)p != pb)
+ GOMP_PLUGIN_fatal ("invalid device address");
+
+ r = cuMemFree ((CUdeviceptr)p);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+}
+
+static void *
+PTX_host2dev (void *d, const void *h, size_t s)
+{
+ CUresult r;
+ CUdeviceptr pb;
+ size_t ps;
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ if (!s)
+ return 0;
+
+ if (!d)
+ GOMP_PLUGIN_fatal ("invalid device address");
+
+ r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
+
+ if (!pb)
+ GOMP_PLUGIN_fatal ("invalid device address");
+
+ if (!h)
+ GOMP_PLUGIN_fatal ("invalid host address");
+
+ if (d == h)
+ GOMP_PLUGIN_fatal ("invalid host or device address");
+
+ if ((void *)(d + s) > (void *)(pb + ps))
+ GOMP_PLUGIN_fatal ("invalid size");
+
+#ifndef DISABLE_ASYNC
+ if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
+ {
+ CUevent *e;
+
+ e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
+
+ r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
+
+ event_gc (false);
+
+ r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
+ nvthd->current_stream->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));
+
+ r = cuEventRecord (*e, nvthd->current_stream->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
+
+ event_add (PTX_EVT_MEM, e, (void *)h);
+ }
+ else
+#endif
+ {
+ r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
+ }
+
+ return 0;
+}
+
+static void *
+PTX_dev2host (void *h, const void *d, size_t s)
+{
+ CUresult r;
+ CUdeviceptr pb;
+ size_t ps;
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ if (!s)
+ return 0;
+
+ if (!d)
+ GOMP_PLUGIN_fatal ("invalid device address");
+
+ r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
+
+ if (!pb)
+ GOMP_PLUGIN_fatal ("invalid device address");
+
+ if (!h)
+ GOMP_PLUGIN_fatal ("invalid host address");
+
+ if (d == h)
+ GOMP_PLUGIN_fatal ("invalid host or device address");
+
+ if ((void *)(d + s) > (void *)(pb + ps))
+ GOMP_PLUGIN_fatal ("invalid size");
+
+#ifndef DISABLE_ASYNC
+ if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
+ {
+ CUevent *e;
+
+ e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
+
+ r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));
+
+ event_gc (false);
+
+ r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
+ nvthd->current_stream->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));
+
+ r = cuEventRecord (*e, nvthd->current_stream->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
+
+ event_add (PTX_EVT_MEM, e, (void *)h);
+ }
+ else
+#endif
+ {
+ r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
+ }
+
+ return 0;
+}
+
+static void
+PTX_set_async (int async)
+{
+ struct nvptx_thread *nvthd = nvptx_thread ();
+ nvthd->current_stream
+ = select_stream_for_async (async, pthread_self (), true, NULL);
+}
+
+static int
+PTX_async_test (int async)
+{
+ CUresult r;
+ struct ptx_stream *s;
+
+ s = select_stream_for_async (async, pthread_self (), false, NULL);
+
+ if (!s)
+ GOMP_PLUGIN_fatal ("unknown async %d", async);
+
+ r = cuStreamQuery (s->stream);
+ if (r == CUDA_SUCCESS)
+ {
+ /* The oacc-parallel.c:goacc_wait function calls this hook to determine
+ whether all work has completed on this stream, and if so omits the call
+ to the wait hook. If that happens, event_gc might not get called
+ (which prevents variables from getting unmapped and their associated
+ device storage freed), so call it here. */
+ event_gc (true);
+ return 1;
+ }
+ else if (r == CUDA_ERROR_NOT_READY)
+ return 0;
+
+ GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
+
+ return 0;
+}
+
+static int
+PTX_async_test_all (void)
+{
+ struct ptx_stream *s;
+ pthread_t self = pthread_self ();
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock);
+
+ for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
+ {
+ if ((s->multithreaded || pthread_equal (s->host_thread, self))
+ && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
+ {
+ GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
+ return 0;
+ }
+ }
+
+ GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
+
+ event_gc (true);
+
+ return 1;
+}
+
+static void
+PTX_wait (int async)
+{
+ CUresult r;
+ struct ptx_stream *s;
+
+ s = select_stream_for_async (async, pthread_self (), false, NULL);
+
+ if (!s)
+ GOMP_PLUGIN_fatal ("unknown async %d", async);
+
+ r = cuStreamSynchronize (s->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
+
+ event_gc (true);
+}
+
+static void
+PTX_wait_async (int async1, int async2)
+{
+ CUresult r;
+ CUevent *e;
+ struct ptx_stream *s1, *s2;
+ pthread_t self = pthread_self ();
+
+ /* The stream that is waiting (rather than being waited for) doesn't
+ necessarily have to exist already. */
+ s2 = select_stream_for_async (async2, self, true, NULL);
+
+ s1 = select_stream_for_async (async1, self, false, NULL);
+ if (!s1)
+ GOMP_PLUGIN_fatal ("invalid async 1\n");
+
+ if (s1 == s2)
+ GOMP_PLUGIN_fatal ("identical parameters");
+
+ e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
+
+ r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
+
+ event_gc (true);
+
+ r = cuEventRecord (*e, s1->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
+
+ event_add (PTX_EVT_SYNC, e, NULL);
+
+ r = cuStreamWaitEvent (s2->stream, *e, 0);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
+}
+
+static void
+PTX_wait_all (void)
+{
+ CUresult r;
+ struct ptx_stream *s;
+ pthread_t self = pthread_self ();
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock);
+
+ /* Wait for active streams initiated by this thread (or by multiple threads)
+ to complete. */
+ for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
+ {
+ if (s->multithreaded || pthread_equal (s->host_thread, self))
+ {
+ r = cuStreamQuery (s->stream);
+ if (r == CUDA_SUCCESS)
+ continue;
+ else if (r != CUDA_ERROR_NOT_READY)
+ GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
+
+ r = cuStreamSynchronize (s->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
+ }
+ }
+
+ GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
+
+ event_gc (true);
+}
+
+static void
+PTX_wait_all_async (int async)
+{
+ CUresult r;
+ struct ptx_stream *waiting_stream, *other_stream;
+ CUevent *e;
+ struct nvptx_thread *nvthd = nvptx_thread ();
+ pthread_t self = pthread_self ();
+
+ /* The stream doing the waiting. This could be the first mention of the
+ stream, so create it if necessary. */
+ waiting_stream
+ = select_stream_for_async (async, pthread_self (), true, NULL);
+
+ /* Launches on the null stream already block on other streams in the
+ context. */
+ if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
+ return;
+
+ event_gc (true);
+
+ GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock);
+
+ for (other_stream = nvthd->ptx_dev->active_streams;
+ other_stream != NULL;
+ other_stream = other_stream->next)
+ {
+ if (!other_stream->multithreaded
+ && !pthread_equal (other_stream->host_thread, self))
+ continue;
+
+ e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
+
+ r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
+
+ /* Record an event on the waited-for stream. */
+ r = cuEventRecord (*e, other_stream->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
+
+ event_add (PTX_EVT_SYNC, e, NULL);
+
+ r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
+ }
+
+ GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
+}
+
+static void *
+PTX_get_current_cuda_device (void)
+{
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ if (!nvthd || !nvthd->ptx_dev)
+ return NULL;
+
+ return &nvthd->ptx_dev->dev;
+}
+
+static void *
+PTX_get_current_cuda_context (void)
+{
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ if (!nvthd || !nvthd->ptx_dev)
+ return NULL;
+
+ return nvthd->ptx_dev->ctx;
+}
+
+static void *
+PTX_get_cuda_stream (int async)
+{
+ struct ptx_stream *s;
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ if (!nvthd || !nvthd->ptx_dev)
+ return NULL;
+
+ s = select_stream_for_async (async, pthread_self (), false, NULL);
+
+ return s ? s->stream : NULL;
+}
+
+static int
+PTX_set_cuda_stream (int async, void *stream)
+{
+ struct ptx_stream *oldstream;
+ pthread_t self = pthread_self ();
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock);
+
+ if (async < 0)
+ GOMP_PLUGIN_fatal ("bad async %d", async);
+
+ /* We have a list of active streams and an array mapping async values to
+ entries of that list. We need to take "ownership" of the passed-in stream,
+ and add it to our list, removing the previous entry also (if there was one)
+ in order to prevent resource leaks. Note the potential for surprise
+ here: maybe we should keep track of passed-in streams and leave it up to
+ the user to tidy those up, but that doesn't work for stream handles
+ returned from acc_get_cuda_stream above... */
+
+ oldstream = select_stream_for_async (async, self, false, NULL);
+
+ if (oldstream)
+ {
+ if (nvthd->ptx_dev->active_streams == oldstream)
+ nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
+ else
+ {
+ struct ptx_stream *s = nvthd->ptx_dev->active_streams;
+ while (s->next != oldstream)
+ s = s->next;
+ s->next = s->next->next;
+ }
+
+ cuStreamDestroy (oldstream->stream);
+ map_fini (oldstream);
+ free (oldstream);
+ }
+
+ GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
+
+ (void) select_stream_for_async (async, self, true, (CUstream) stream);
+
+ return 1;
+}
+
+/* Plugin entry points. */
+
+
+int
+GOMP_OFFLOAD_get_type (void)
+{
+ return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
+}
+
+unsigned int
+GOMP_OFFLOAD_get_caps (void)
+{
+ return TARGET_CAP_OPENACC_200;
+}
+
+const char *
+GOMP_OFFLOAD_get_name (void)
+{
+ return "nvidia";
+}
+
+int
+GOMP_OFFLOAD_get_num_devices (void)
+{
+ return PTX_get_num_devices ();
+}
+
+static void **kernel_target_data;
+static void **kernel_host_table;
+
+void
+GOMP_OFFLOAD_register_image (void *host_table, void *target_data)
+{
+ kernel_target_data = target_data;
+ kernel_host_table = host_table;
+}
+
+void
+GOMP_OFFLOAD_init_device (int n __attribute__((unused)))
+{
+ (void) PTX_init ();
+}
+
+void
+GOMP_OFFLOAD_fini_device (int n __attribute__((unused)))
+{
+ PTX_fini ();
+}
+
+int
+GOMP_OFFLOAD_get_table (int n __attribute__((unused)),
+ struct mapping_table **tablep)
+{
+ CUmodule module;
+ void **fn_table;
+ char **fn_names;
+ int fn_entries, i;
+ CUresult r;
+ struct targ_fn_descriptor *targ_fns;
+
+ if (PTX_init () <= 0)
+ return 0;
+
+ /* This isn't an error, because an image may legitimately have no offloaded
+ regions and so will not call GOMP_offload_register. */
+ if (kernel_target_data == NULL)
+ return 0;
+
+ link_ptx (&module, kernel_target_data[0]);
+
+ /* kernel_target_data[0] -> ptx code
+ kernel_target_data[1] -> variable mappings
+ kernel_target_data[2] -> array of kernel names in ascii
+
+ kernel_host_table[0] -> start of function addresses (_omp_func_table)
+ kernel_host_table[1] -> end of function addresses (_omp_funcs_end)
+
+ The array of kernel names and the functions addresses form a
+ one-to-one correspondence. */
+
+ fn_table = kernel_host_table[0];
+ fn_names = (char **) kernel_target_data[2];
+ fn_entries = (kernel_host_table[1] - kernel_host_table[0]) / sizeof (void *);
+
+ *tablep = GOMP_PLUGIN_malloc (sizeof (struct mapping_table) * fn_entries);
+ targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
+ * fn_entries);
+
+ for (i = 0; i < fn_entries; i++)
+ {
+ CUfunction function;
+
+ r = cuModuleGetFunction (&function, module, fn_names[i]);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));
+
+ targ_fns[i].fn = function;
+ targ_fns[i].name = (const char *) fn_names[i];
+
+ (*tablep)[i].host_start = (uintptr_t) fn_table[i];
+ (*tablep)[i].host_end = (*tablep)[i].host_start + 1;
+ (*tablep)[i].tgt_start = (uintptr_t) &targ_fns[i];
+ (*tablep)[i].tgt_end = (*tablep)[i].tgt_start + 1;
+ }
+
+ return fn_entries;
+}
+
+void *
+GOMP_OFFLOAD_alloc (int n __attribute__((unused)), size_t size)
+{
+ return PTX_alloc (size);
+}
+
+void
+GOMP_OFFLOAD_free (int n __attribute__((unused)), void *ptr)
+{
+ PTX_free (ptr);
+}
+
+void *
+GOMP_OFFLOAD_dev2host (int ord __attribute__((unused)), void *dst,
+ const void *src, size_t n)
+{
+ return PTX_dev2host (dst, src, n);
+}
+
+void *
+GOMP_OFFLOAD_host2dev (int ord __attribute__((unused)), void *dst,
+ const void *src, size_t n)
+{
+ return PTX_host2dev (dst, src, n);
+}
+
+void (*device_run) (void *fn_ptr, void *vars) = NULL;
+
+void
+GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
+ void **hostaddrs, void **devaddrs, size_t *sizes,
+ unsigned short *kinds, int num_gangs,
+ int num_workers, int vector_length, int async,
+ void *targ_mem_desc)
+{
+ PTX_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
+ num_workers, vector_length, async, targ_mem_desc);
+}
+
+void *
+GOMP_OFFLOAD_openacc_open_device (int n)
+{
+ return PTX_open_device (n);
+}
+
+int
+GOMP_OFFLOAD_openacc_close_device (void *h)
+{
+ return PTX_close_device (h);
+}
+
+void
+GOMP_OFFLOAD_openacc_set_device_num (int n)
+{
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ assert (n >= 0);
+
+ if (!nvthd->ptx_dev || nvthd->ptx_dev->ord != n)
+ (void) PTX_open_device (n);
+}
+
+/* This can be called before the device is "opened" for the current thread, in
+ which case we can't tell which device number should be returned. We don't
+ actually want to open the device here, so just return -1 and let the caller
+ (oacc-init.c:acc_get_device_num) handle it. */
+
+int
+GOMP_OFFLOAD_openacc_get_device_num (void)
+{
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ if (nvthd && nvthd->ptx_dev)
+ return nvthd->ptx_dev->ord;
+ else
+ return -1;
+}
+
+void
+GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
+{
+ CUevent *e;
+ CUresult r;
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
+
+ r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
+
+ r = cuEventRecord (*e, nvthd->current_stream->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
+
+ event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
+}
+
+int
+GOMP_OFFLOAD_openacc_async_test (int async)
+{
+ return PTX_async_test (async);
+}
+
+int
+GOMP_OFFLOAD_openacc_async_test_all (void)
+{
+ return PTX_async_test_all ();
+}
+
+void
+GOMP_OFFLOAD_openacc_async_wait (int async)
+{
+ PTX_wait (async);
+}
+
+void
+GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
+{
+ PTX_wait_async (async1, async2);
+}
+
+void
+GOMP_OFFLOAD_openacc_async_wait_all (void)
+{
+ PTX_wait_all ();
+}
+
+void
+GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
+{
+ PTX_wait_all_async (async);
+}
+
+void
+GOMP_OFFLOAD_openacc_async_set_async (int async)
+{
+ PTX_set_async (async);
+}
+
+void *
+GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data)
+{
+ struct ptx_device *ptx_dev = (struct ptx_device *) targ_data;
+ struct nvptx_thread *nvthd
+ = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
+ CUresult r;
+ CUcontext thd_ctx;
+
+ r = cuCtxGetCurrent (&thd_ctx);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
+
+ assert (ptx_dev->ctx);
+
+ if (!thd_ctx)
+ {
+ r = cuCtxPushCurrent (ptx_dev->ctx);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
+ }
+
+ nvthd->current_stream = ptx_dev->null_stream;
+ nvthd->ptx_dev = ptx_dev;
+
+ return (void *) nvthd;
+}
+
+void
+GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
+{
+ free (data);
+}
+
+void *
+GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
+{
+ return PTX_get_current_cuda_device ();
+}
+
+void *
+GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
+{
+ return PTX_get_current_cuda_context ();
+}
+
+/* NOTE: This returns a CUstream, not a ptx_stream pointer. */
+
+void *
+GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
+{
+ return PTX_get_cuda_stream (async);
+}
+
+/* NOTE: This takes a CUstream, not a ptx_stream pointer. */
+
+int
+GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
+{
+ return PTX_set_cuda_stream (async, stream);
+}
new file mode 100644
@@ -0,0 +1,224 @@
+/* A splay-tree datatype.
+ Copyright 1998-2013
+ Free Software Foundation, Inc.
+ Contributed by Mark Mitchell (mark@markmitchell.com).
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The splay tree code copied from include/splay-tree.h and adjusted,
+ so that all the data lives directly in splay_tree_node_s structure
+ and no extra allocations are needed.
+
+ Files including this header should before including it add:
+typedef struct splay_tree_node_s *splay_tree_node;
+typedef struct splay_tree_s *splay_tree;
+typedef struct splay_tree_key_s *splay_tree_key;
+ define splay_tree_key_s structure, and define
+ splay_compare inline function. */
+
+/* For an easily readable description of splay-trees, see:
+
+ Lewis, Harry R. and Denenberg, Larry. Data Structures and Their
+ Algorithms. Harper-Collins, Inc. 1991.
+
+ The major feature of splay trees is that all basic tree operations
+ are amortized O(log n) time for a tree with n nodes. */
+
+#include "libgomp.h"
+#include "splay-tree.h"
+
+extern int splay_compare (splay_tree_key, splay_tree_key);
+
+/* Rotate the edge joining the left child N with its parent P. PP is the
+ grandparents' pointer to P. */
+
+static inline void
+rotate_left (splay_tree_node *pp, splay_tree_node p, splay_tree_node n)
+{
+ splay_tree_node tmp;
+ tmp = n->right;
+ n->right = p;
+ p->left = tmp;
+ *pp = n;
+}
+
+/* Rotate the edge joining the right child N with its parent P. PP is the
+ grandparents' pointer to P. */
+
+static inline void
+rotate_right (splay_tree_node *pp, splay_tree_node p, splay_tree_node n)
+{
+ splay_tree_node tmp;
+ tmp = n->left;
+ n->left = p;
+ p->right = tmp;
+ *pp = n;
+}
+
+/* Bottom up splay of KEY. */
+
+static void
+splay_tree_splay (splay_tree sp, splay_tree_key key)
+{
+ if (sp->root == NULL)
+ return;
+
+ do {
+ int cmp1, cmp2;
+ splay_tree_node n, c;
+
+ n = sp->root;
+ cmp1 = splay_compare (key, &n->key);
+
+ /* Found. */
+ if (cmp1 == 0)
+ return;
+
+ /* Left or right? If no child, then we're done. */
+ if (cmp1 < 0)
+ c = n->left;
+ else
+ c = n->right;
+ if (!c)
+ return;
+
+ /* Next one left or right? If found or no child, we're done
+ after one rotation. */
+ cmp2 = splay_compare (key, &c->key);
+ if (cmp2 == 0
+ || (cmp2 < 0 && !c->left)
+ || (cmp2 > 0 && !c->right))
+ {
+ if (cmp1 < 0)
+ rotate_left (&sp->root, n, c);
+ else
+ rotate_right (&sp->root, n, c);
+ return;
+ }
+
+ /* Now we have the four cases of double-rotation. */
+ if (cmp1 < 0 && cmp2 < 0)
+ {
+ rotate_left (&n->left, c, c->left);
+ rotate_left (&sp->root, n, n->left);
+ }
+ else if (cmp1 > 0 && cmp2 > 0)
+ {
+ rotate_right (&n->right, c, c->right);
+ rotate_right (&sp->root, n, n->right);
+ }
+ else if (cmp1 < 0 && cmp2 > 0)
+ {
+ rotate_right (&n->left, c, c->right);
+ rotate_left (&sp->root, n, n->left);
+ }
+ else if (cmp1 > 0 && cmp2 < 0)
+ {
+ rotate_left (&n->right, c, c->left);
+ rotate_right (&sp->root, n, n->right);
+ }
+ } while (1);
+}
+
+/* Insert a new NODE into SP. The NODE shouldn't exist in the tree. */
+
+attribute_hidden void
+splay_tree_insert (splay_tree sp, splay_tree_node node)
+{
+ int comparison = 0;
+
+ splay_tree_splay (sp, &node->key);
+
+ if (sp->root)
+ comparison = splay_compare (&sp->root->key, &node->key);
+
+ if (sp->root && comparison == 0)
+ gomp_fatal ("Duplicate node");
+ else
+ {
+ /* Insert it at the root. */
+ if (sp->root == NULL)
+ node->left = node->right = NULL;
+ else if (comparison < 0)
+ {
+ node->left = sp->root;
+ node->right = node->left->right;
+ node->left->right = NULL;
+ }
+ else
+ {
+ node->right = sp->root;
+ node->left = node->right->left;
+ node->right->left = NULL;
+ }
+
+ sp->root = node;
+ }
+}
+
+/* Remove node with KEY from SP. It is not an error if it did not exist. */
+
+attribute_hidden void
+splay_tree_remove (splay_tree sp, splay_tree_key key)
+{
+ splay_tree_splay (sp, key);
+
+ if (sp->root && splay_compare (&sp->root->key, key) == 0)
+ {
+ splay_tree_node left, right;
+
+ left = sp->root->left;
+ right = sp->root->right;
+
+ /* One of the children is now the root. Doesn't matter much
+ which, so long as we preserve the properties of the tree. */
+ if (left)
+ {
+ sp->root = left;
+
+ /* If there was a right child as well, hang it off the
+ right-most leaf of the left child. */
+ if (right)
+ {
+ while (left->right)
+ left = left->right;
+ left->right = right;
+ }
+ }
+ else
+ sp->root = right;
+ }
+}
+
+/* Lookup KEY in SP, returning NODE if present, and NULL
+ otherwise. */
+
+attribute_hidden splay_tree_key
+splay_tree_lookup (splay_tree sp, splay_tree_key key)
+{
+ splay_tree_splay (sp, key);
+
+ if (sp->root && splay_compare (&sp->root->key, key) == 0)
+ return &sp->root->key;
+ else
+ return NULL;
+}
@@ -43,6 +43,30 @@ typedef struct splay_tree_key_s *splay_tree_key;
The major feature of splay trees is that all basic tree operations
are amortized O(log n) time for a tree with n nodes. */
+#ifndef _SPLAY_TREE_H
+#define _SPLAY_TREE_H 1
+
+typedef struct splay_tree_node_s *splay_tree_node;
+typedef struct splay_tree_s *splay_tree;
+typedef struct splay_tree_key_s *splay_tree_key;
+
+struct splay_tree_key_s {
+ /* Address of the host object. */
+ uintptr_t host_start;
+ /* Address immediately after the host object. */
+ uintptr_t host_end;
+ /* Descriptor of the target memory. */
+ struct target_mem_desc *tgt;
+ /* Offset from tgt->tgt_start to the start of the target object. */
+ uintptr_t tgt_offset;
+ /* Reference count. */
+ uintptr_t refcount;
+ /* Asynchronous reference count. */
+ uintptr_t async_refcount;
+ /* True if data should be copied from device to host at the end. */
+ bool copy_from;
+};
+
/* The nodes in the splay tree. */
struct splay_tree_node_s {
struct splay_tree_key_s key;
@@ -56,177 +80,8 @@ struct splay_tree_s {
splay_tree_node root;
};
-/* Rotate the edge joining the left child N with its parent P. PP is the
- grandparents' pointer to P. */
-
-static inline void
-rotate_left (splay_tree_node *pp, splay_tree_node p, splay_tree_node n)
-{
- splay_tree_node tmp;
- tmp = n->right;
- n->right = p;
- p->left = tmp;
- *pp = n;
-}
-
-/* Rotate the edge joining the right child N with its parent P. PP is the
- grandparents' pointer to P. */
-
-static inline void
-rotate_right (splay_tree_node *pp, splay_tree_node p, splay_tree_node n)
-{
- splay_tree_node tmp;
- tmp = n->left;
- n->left = p;
- p->right = tmp;
- *pp = n;
-}
-
-/* Bottom up splay of KEY. */
-
-static void
-splay_tree_splay (splay_tree sp, splay_tree_key key)
-{
- if (sp->root == NULL)
- return;
-
- do {
- int cmp1, cmp2;
- splay_tree_node n, c;
-
- n = sp->root;
- cmp1 = splay_compare (key, &n->key);
-
- /* Found. */
- if (cmp1 == 0)
- return;
-
- /* Left or right? If no child, then we're done. */
- if (cmp1 < 0)
- c = n->left;
- else
- c = n->right;
- if (!c)
- return;
-
- /* Next one left or right? If found or no child, we're done
- after one rotation. */
- cmp2 = splay_compare (key, &c->key);
- if (cmp2 == 0
- || (cmp2 < 0 && !c->left)
- || (cmp2 > 0 && !c->right))
- {
- if (cmp1 < 0)
- rotate_left (&sp->root, n, c);
- else
- rotate_right (&sp->root, n, c);
- return;
- }
-
- /* Now we have the four cases of double-rotation. */
- if (cmp1 < 0 && cmp2 < 0)
- {
- rotate_left (&n->left, c, c->left);
- rotate_left (&sp->root, n, n->left);
- }
- else if (cmp1 > 0 && cmp2 > 0)
- {
- rotate_right (&n->right, c, c->right);
- rotate_right (&sp->root, n, n->right);
- }
- else if (cmp1 < 0 && cmp2 > 0)
- {
- rotate_right (&n->left, c, c->right);
- rotate_left (&sp->root, n, n->left);
- }
- else if (cmp1 > 0 && cmp2 < 0)
- {
- rotate_left (&n->right, c, c->left);
- rotate_right (&sp->root, n, n->right);
- }
- } while (1);
-}
-
-/* Insert a new NODE into SP. The NODE shouldn't exist in the tree. */
-
-static void
-splay_tree_insert (splay_tree sp, splay_tree_node node)
-{
- int comparison = 0;
-
- splay_tree_splay (sp, &node->key);
-
- if (sp->root)
- comparison = splay_compare (&sp->root->key, &node->key);
-
- if (sp->root && comparison == 0)
- abort ();
- else
- {
- /* Insert it at the root. */
- if (sp->root == NULL)
- node->left = node->right = NULL;
- else if (comparison < 0)
- {
- node->left = sp->root;
- node->right = node->left->right;
- node->left->right = NULL;
- }
- else
- {
- node->right = sp->root;
- node->left = node->right->left;
- node->right->left = NULL;
- }
-
- sp->root = node;
- }
-}
-
-/* Remove node with KEY from SP. It is not an error if it did not exist. */
-
-static void
-splay_tree_remove (splay_tree sp, splay_tree_key key)
-{
- splay_tree_splay (sp, key);
-
- if (sp->root && splay_compare (&sp->root->key, key) == 0)
- {
- splay_tree_node left, right;
-
- left = sp->root->left;
- right = sp->root->right;
-
- /* One of the children is now the root. Doesn't matter much
- which, so long as we preserve the properties of the tree. */
- if (left)
- {
- sp->root = left;
-
- /* If there was a right child as well, hang it off the
- right-most leaf of the left child. */
- if (right)
- {
- while (left->right)
- left = left->right;
- left->right = right;
- }
- }
- else
- sp->root = right;
- }
-}
-
-/* Lookup KEY in SP, returning NODE if present, and NULL
- otherwise. */
-
-static splay_tree_key
-splay_tree_lookup (splay_tree sp, splay_tree_key key)
-{
- splay_tree_splay (sp, key);
-
- if (sp->root && splay_compare (&sp->root->key, key) == 0)
- return &sp->root->key;
- else
- return NULL;
-}
+attribute_hidden splay_tree_key splay_tree_lookup (splay_tree, splay_tree_key);
+attribute_hidden void splay_tree_insert (splay_tree, splay_tree_node);
+attribute_hidden void splay_tree_remove (splay_tree, splay_tree_key);
+
+#endif /* _SPLAY_TREE_H */
@@ -30,7 +30,12 @@
#include <limits.h>
#include <stdbool.h>
#include <stdlib.h>
+#include "oacc-plugin.h"
+#include "gomp-constants.h"
+#include "oacc-int.h"
#include <string.h>
+#include <stdio.h>
+#include <assert.h>
#ifdef PLUGIN_SUPPORT
#include <dlfcn.h>
@@ -40,50 +45,6 @@ static void gomp_target_init (void);
static pthread_once_t gomp_is_initialized = PTHREAD_ONCE_INIT;
-/* Forward declaration for a node in the tree. */
-typedef struct splay_tree_node_s *splay_tree_node;
-typedef struct splay_tree_s *splay_tree;
-typedef struct splay_tree_key_s *splay_tree_key;
-
-struct target_mem_desc {
- /* Reference count. */
- uintptr_t refcount;
- /* All the splay nodes allocated together. */
- splay_tree_node array;
- /* Start of the target region. */
- uintptr_t tgt_start;
- /* End of the targer region. */
- uintptr_t tgt_end;
- /* Handle to free. */
- void *to_free;
- /* Previous target_mem_desc. */
- struct target_mem_desc *prev;
- /* Number of items in following list. */
- size_t list_count;
-
- /* Corresponding target device descriptor. */
- struct gomp_device_descr *device_descr;
-
- /* List of splay keys to remove (or decrease refcount)
- at the end of region. */
- splay_tree_key list[];
-};
-
-struct splay_tree_key_s {
- /* Address of the host object. */
- uintptr_t host_start;
- /* Address immediately after the host object. */
- uintptr_t host_end;
- /* Descriptor of the target memory. */
- struct target_mem_desc *tgt;
- /* Offset from tgt->tgt_start to the start of the target object. */
- uintptr_t tgt_offset;
- /* Reference count. */
- uintptr_t refcount;
- /* True if data should be copied from device to host at the end. */
- bool copy_from;
-};
-
/* This structure describes an offload image.
It contains type of the target device, pointer to host table descriptor, and
pointer to target data. */
@@ -107,7 +68,7 @@ static int num_devices;
/* The comparison function. */
-static int
+attribute_hidden int
splay_compare (splay_tree_key x, splay_tree_key y)
{
if (x->host_start == x->host_end
@@ -122,47 +83,16 @@ splay_compare (splay_tree_key x, splay_tree_key y)
#include "splay-tree.h"
-/* This structure describes accelerator device.
- It contains ID-number of the device, its type, function handlers for
- interaction with the device, and information about mapped memory. */
-struct gomp_device_descr
+attribute_hidden void
+gomp_init_targets_once (void)
{
- /* This is the ID number of device. It could be specified in DEVICE-clause of
- TARGET construct. */
- int id;
-
- /* This is the ID number of device among devices of the same type. */
- int target_id;
-
- /* This is the TYPE of device. */
- enum offload_target_type type;
-
- /* Set to true when device is initialized. */
- bool is_initialized;
-
- /* Function handlers. */
- int (*get_type_func) (void);
- int (*get_num_devices_func) (void);
- void (*register_image_func) (void *, void *);
- void (*init_device_func) (int);
- int (*get_table_func) (int, void *);
- void *(*alloc_func) (int, size_t);
- void (*free_func) (int, void *);
- void *(*host2dev_func) (int, void *, const void *, size_t);
- void *(*dev2host_func) (int, void *, const void *, size_t);
- void (*run_func) (int, void *, void *);
-
- /* Splay tree containing information about mapped memory regions. */
- struct splay_tree_s dev_splay_tree;
-
- /* Mutex for operating with the splay tree and other shared structures. */
- gomp_mutex_t dev_env_lock;
-};
+ (void) pthread_once (&gomp_is_initialized, gomp_target_init);
+}
attribute_hidden int
gomp_get_num_devices (void)
{
- (void) pthread_once (&gomp_is_initialized, gomp_target_init);
+ gomp_init_targets_once ();
return num_devices;
}
@@ -198,18 +128,29 @@ gomp_map_vars_existing (splay_tree_key oldn, splay_tree_key newn,
oldn->refcount++;
}
-static struct target_mem_desc *
+static int
+get_kind (bool is_openacc, void *kinds, int idx)
+{
+ return is_openacc ? ((unsigned short *) kinds)[idx]
+ : ((unsigned char *) kinds)[idx];
+}
+
+attribute_hidden struct target_mem_desc *
gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
- void **hostaddrs, size_t *sizes, unsigned char *kinds,
- bool is_target)
+ void **hostaddrs, void **devaddrs, size_t *sizes, void *kinds,
+ bool is_openacc, bool is_target)
{
size_t i, tgt_align, tgt_size, not_found_cnt = 0;
+ const int rshift = is_openacc ? 8 : 3;
+ const int typemask = is_openacc ? 0xff : 0x7;
+ struct gomp_memory_mapping *mm = &devicep->mem_map;
struct splay_tree_key_s cur_node;
struct target_mem_desc *tgt
= gomp_malloc (sizeof (*tgt) + sizeof (tgt->list[0]) * mapnum);
tgt->list_count = mapnum;
tgt->refcount = 1;
tgt->device_descr = devicep;
+ tgt->mem_map = mm;
if (mapnum == 0)
return tgt;
@@ -222,41 +163,41 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
tgt_align = align;
tgt_size = mapnum * sizeof (void *);
}
-
- gomp_mutex_lock (&devicep->dev_env_lock);
+ gomp_mutex_lock (&mm->lock);
for (i = 0; i < mapnum; i++)
{
+ int kind = get_kind (is_openacc, kinds, i);
if (hostaddrs[i] == NULL)
{
tgt->list[i] = NULL;
continue;
}
cur_node.host_start = (uintptr_t) hostaddrs[i];
- if ((kinds[i] & 7) != 4)
+ if (!GOMP_MAP_POINTER_P (kind & typemask))
cur_node.host_end = cur_node.host_start + sizes[i];
else
cur_node.host_end = cur_node.host_start + sizeof (void *);
- splay_tree_key n = splay_tree_lookup (&devicep->dev_splay_tree,
- &cur_node);
+ splay_tree_key n = splay_tree_lookup (&mm->splay_tree, &cur_node);
if (n)
{
tgt->list[i] = n;
- gomp_map_vars_existing (n, &cur_node, kinds[i]);
+ gomp_map_vars_existing (n, &cur_node, kind);
}
else
{
- size_t align = (size_t) 1 << (kinds[i] >> 3);
+ size_t align = (size_t) 1 << (kind >> rshift);
tgt->list[i] = NULL;
not_found_cnt++;
if (tgt_align < align)
tgt_align = align;
tgt_size = (tgt_size + align - 1) & ~(align - 1);
tgt_size += cur_node.host_end - cur_node.host_start;
- if ((kinds[i] & 7) == 5)
+ if ((kind & typemask) == GOMP_MAP_TO_PSET)
{
size_t j;
for (j = i + 1; j < mapnum; j++)
- if ((kinds[j] & 7) != 4)
+ if (!GOMP_MAP_POINTER_P (get_kind (is_openacc, kinds, j)
+ & typemask))
break;
else if ((uintptr_t) hostaddrs[j] < cur_node.host_start
|| ((uintptr_t) hostaddrs[j] + sizeof (void *)
@@ -271,7 +212,15 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
}
}
- if (not_found_cnt || is_target)
+ if (devaddrs)
+ {
+ if (mapnum != 1)
+ gomp_fatal ("unexpected aggregation");
+ tgt->to_free = devaddrs[0];
+ tgt->tgt_start = (uintptr_t) tgt->to_free;
+ tgt->tgt_end = tgt->tgt_start + sizes[0];
+ }
+ else if (not_found_cnt || is_target)
{
/* Allocate tgt_align aligned tgt_size block of memory. */
/* FIXME: Perhaps change interface to allocate properly aligned
@@ -303,44 +252,52 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
for (i = 0; i < mapnum; i++)
if (tgt->list[i] == NULL)
{
+ int kind = get_kind (is_openacc, kinds, i);
if (hostaddrs[i] == NULL)
continue;
splay_tree_key k = &array->key;
k->host_start = (uintptr_t) hostaddrs[i];
- if ((kinds[i] & 7) != 4)
+ if (!GOMP_MAP_POINTER_P (kind & typemask))
k->host_end = k->host_start + sizes[i];
else
k->host_end = k->host_start + sizeof (void *);
- splay_tree_key n
- = splay_tree_lookup (&devicep->dev_splay_tree, k);
+ splay_tree_key n = splay_tree_lookup (&mm->splay_tree, k);
if (n)
{
tgt->list[i] = n;
- gomp_map_vars_existing (n, k, kinds[i]);
+ gomp_map_vars_existing (n, k, kind);
}
else
{
- size_t align = (size_t) 1 << (kinds[i] >> 3);
+ size_t align = (size_t) 1 << (kind >> rshift);
tgt->list[i] = k;
tgt_size = (tgt_size + align - 1) & ~(align - 1);
k->tgt = tgt;
k->tgt_offset = tgt_size;
tgt_size += k->host_end - k->host_start;
- k->copy_from = false;
- if ((kinds[i] & 7) == 2 || (kinds[i] & 7) == 3)
- k->copy_from = true;
+ k->copy_from = GOMP_MAP_COPYFROM_P (kind & typemask)
+ || GOMP_MAP_TOFROM_P (kind & typemask);
k->refcount = 1;
+ k->async_refcount = 0;
tgt->refcount++;
array->left = NULL;
array->right = NULL;
- splay_tree_insert (&devicep->dev_splay_tree, array);
- switch (kinds[i] & 7)
+ splay_tree_insert (&mm->splay_tree, array);
+ switch (kind & typemask)
{
- case 0: /* ALLOC */
- case 2: /* FROM */
+ case GOMP_MAP_FORCE_ALLOC:
+ case GOMP_MAP_FORCE_FROM:
+ /* FIXME: No special handling (see comment in
+ oacc-parallel.c). */
+ case GOMP_MAP_ALLOC:
+ case GOMP_MAP_ALLOC_FROM:
break;
- case 1: /* TO */
- case 3: /* TOFROM */
+ case GOMP_MAP_FORCE_TO:
+ case GOMP_MAP_FORCE_TOFROM:
+ /* FIXME: No special handling, as above. */
+ case GOMP_MAP_ALLOC_TO:
+ case GOMP_MAP_ALLOC_TOFROM:
+ /* Copy from host to device memory. */
/* FIXME: Perhaps add some smarts, like if copying
several adjacent fields from host to target, use some
host buffer to avoid sending each var individually. */
@@ -350,7 +307,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
(void *) k->host_start,
k->host_end - k->host_start);
break;
- case 4: /* POINTER */
+ case GOMP_MAP_POINTER:
cur_node.host_start
= (uintptr_t) *(void **) k->host_start;
if (cur_node.host_start == (uintptr_t) NULL)
@@ -366,19 +323,16 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
/* Add bias to the pointer value. */
cur_node.host_start += sizes[i];
cur_node.host_end = cur_node.host_start + 1;
- n = splay_tree_lookup (&devicep->dev_splay_tree,
- &cur_node);
+ n = splay_tree_lookup (&mm->splay_tree, &cur_node);
if (n == NULL)
{
/* Could be possibly zero size array section. */
cur_node.host_end--;
- n = splay_tree_lookup (&devicep->dev_splay_tree,
- &cur_node);
+ n = splay_tree_lookup (&mm->splay_tree, &cur_node);
if (n == NULL)
{
cur_node.host_start--;
- n = splay_tree_lookup (&devicep->dev_splay_tree,
- &cur_node);
+ n = splay_tree_lookup (&mm->splay_tree, &cur_node);
cur_node.host_start++;
}
}
@@ -398,14 +352,17 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
(void *) &cur_node.tgt_offset,
sizeof (void *));
break;
- case 5: /* TO_PSET */
- devicep->host2dev_func (devicep->target_id,
- (void *) (tgt->tgt_start
- + k->tgt_offset),
- (void *) k->host_start,
- k->host_end - k->host_start);
+ case GOMP_MAP_TO_PSET:
+ /* Copy from host to device memory. */
+ /* FIXME: see above FIXME comment. */
+ devicep->host2dev_func
+ (devicep->target_id,
+ (void *) (tgt->tgt_start + k->tgt_offset),
+ (void *) k->host_start,
+ (k->host_end - k->host_start));
for (j = i + 1; j < mapnum; j++)
- if ((kinds[j] & 7) != 4)
+ if (!GOMP_MAP_POINTER_P (get_kind (is_openacc, kinds, j)
+ & typemask))
break;
else if ((uintptr_t) hostaddrs[j] < k->host_start
|| ((uintptr_t) hostaddrs[j] + sizeof (void *)
@@ -432,19 +389,18 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
/* Add bias to the pointer value. */
cur_node.host_start += sizes[j];
cur_node.host_end = cur_node.host_start + 1;
- n = splay_tree_lookup (&devicep->dev_splay_tree,
- &cur_node);
+ n = splay_tree_lookup (&mm->splay_tree, &cur_node);
if (n == NULL)
{
/* Could be possibly zero size array section. */
cur_node.host_end--;
- n = splay_tree_lookup (&devicep->dev_splay_tree,
+ n = splay_tree_lookup (&mm->splay_tree,
&cur_node);
if (n == NULL)
{
cur_node.host_start--;
- n = splay_tree_lookup
- (&devicep->dev_splay_tree, &cur_node);
+ n = splay_tree_lookup (&mm->splay_tree,
+ &cur_node);
cur_node.host_start++;
}
}
@@ -468,6 +424,32 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
i++;
}
break;
+ case GOMP_MAP_FORCE_PRESENT:
+ {
+ /* We already looked up the memory region above and it
+ was missing. */
+ size_t size = k->host_end - k->host_start;
+ gomp_fatal ("present clause: !acc_is_present (%p, "
+ "%zd (0x%zx))", (void *) k->host_start,
+ size, size);
+ }
+ break;
+ case GOMP_MAP_FORCE_DEVICEPTR:
+ assert (k->host_end - k->host_start == sizeof (void *));
+
+ devicep->host2dev_func
+ (devicep->target_id,
+ (void *) (tgt->tgt_start + k->tgt_offset),
+ (void *) k->host_start,
+ sizeof (void *));
+ break;
+ case GOMP_MAP_FORCE_PRIVATE:
+ abort ();
+ case GOMP_MAP_FORCE_FIRSTPRIVATE:
+ abort ();
+ default:
+ gomp_fatal ("%s: unhandled kind 0x%.2x", __FUNCTION__,
+ kind);
}
array++;
}
@@ -490,7 +472,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
}
}
- gomp_mutex_unlock (&devicep->dev_env_lock);
+ gomp_mutex_unlock (&mm->lock);
return tgt;
}
@@ -505,10 +487,51 @@ gomp_unmap_tgt (struct target_mem_desc *tgt)
free (tgt);
}
-static void
-gomp_unmap_vars (struct target_mem_desc *tgt)
+/* Decrease the refcount for a set of mapped variables, and queue asychronous
+ copies from the device back to the host after any work that has been issued.
+ Because the regions are still "live", increment an asynchronous reference
+ count to indicate that they should not be unmapped from host-side data
+ structures until the asynchronous copy has completed. */
+
+attribute_hidden void
+gomp_copy_from_async (struct target_mem_desc *tgt)
+{
+ struct gomp_device_descr *devicep = tgt->device_descr;
+ struct gomp_memory_mapping *mm = tgt->mem_map;
+ size_t i;
+
+ gomp_mutex_lock (&mm->lock);
+
+ for (i = 0; i < tgt->list_count; i++)
+ if (tgt->list[i] == NULL)
+ ;
+ else if (tgt->list[i]->refcount > 1)
+ {
+ tgt->list[i]->refcount--;
+ tgt->list[i]->async_refcount++;
+ }
+ else
+ {
+ splay_tree_key k = tgt->list[i];
+ if (k->copy_from)
+ /* Copy from device to host memory. */
+ devicep->dev2host_func (devicep->target_id, (void *) k->host_start,
+ (void *) (k->tgt->tgt_start + k->tgt_offset),
+ k->host_end - k->host_start);
+ }
+
+ gomp_mutex_unlock (&mm->lock);
+}
+
+/* Unmap variables described by TGT. If DO_COPYFROM is true, copy relevant
+ variables back from device to host: if it is false, it is assumed that this
+ has been done already, i.e. by gomp_copy_from_async above. */
+
+attribute_hidden void
+gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
{
struct gomp_device_descr *devicep = tgt->device_descr;
+ struct gomp_memory_mapping *mm = tgt->mem_map;
if (tgt->list_count == 0)
{
@@ -517,20 +540,23 @@ gomp_unmap_vars (struct target_mem_desc *tgt)
}
size_t i;
- gomp_mutex_lock (&devicep->dev_env_lock);
+ gomp_mutex_lock (&mm->lock);
for (i = 0; i < tgt->list_count; i++)
if (tgt->list[i] == NULL)
;
else if (tgt->list[i]->refcount > 1)
tgt->list[i]->refcount--;
+ else if (tgt->list[i]->async_refcount > 0)
+ tgt->list[i]->async_refcount--;
else
{
splay_tree_key k = tgt->list[i];
- if (k->copy_from)
+ if (k->copy_from && do_copyfrom)
+ /* Copy from device to host memory. */
devicep->dev2host_func (devicep->target_id, (void *) k->host_start,
(void *) (k->tgt->tgt_start + k->tgt_offset),
k->host_end - k->host_start);
- splay_tree_remove (&devicep->dev_splay_tree, k);
+ splay_tree_remove (&mm->splay_tree, k);
if (k->tgt->refcount > 1)
k->tgt->refcount--;
else
@@ -541,15 +567,17 @@ gomp_unmap_vars (struct target_mem_desc *tgt)
tgt->refcount--;
else
gomp_unmap_tgt (tgt);
- gomp_mutex_unlock (&devicep->dev_env_lock);
+ gomp_mutex_unlock (&mm->lock);
}
static void
-gomp_update (struct gomp_device_descr *devicep, size_t mapnum,
- void **hostaddrs, size_t *sizes, unsigned char *kinds)
+gomp_update (struct gomp_device_descr *devicep, struct gomp_memory_mapping *mm,
+ size_t mapnum, void **hostaddrs, size_t *sizes, void *kinds,
+ bool is_openacc)
{
size_t i;
struct splay_tree_key_s cur_node;
+ const int typemask = is_openacc ? 0xff : 0x7;
if (!devicep)
return;
@@ -557,16 +585,17 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum,
if (mapnum == 0)
return;
- gomp_mutex_lock (&devicep->dev_env_lock);
+ gomp_mutex_lock (&mm->lock);
for (i = 0; i < mapnum; i++)
if (sizes[i])
{
cur_node.host_start = (uintptr_t) hostaddrs[i];
cur_node.host_end = cur_node.host_start + sizes[i];
- splay_tree_key n = splay_tree_lookup (&devicep->dev_splay_tree,
+ splay_tree_key n = splay_tree_lookup (&mm->splay_tree,
&cur_node);
if (n)
{
+ int kind = get_kind (is_openacc, kinds, i);
if (n->host_start > cur_node.host_start
|| n->host_end < cur_node.host_end)
gomp_fatal ("Trying to update [%p..%p) object when"
@@ -575,31 +604,38 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum,
(void *) cur_node.host_end,
(void *) n->host_start,
(void *) n->host_end);
- if ((kinds[i] & 7) == 1)
- devicep->host2dev_func (devicep->target_id,
- (void *) (n->tgt->tgt_start
- + n->tgt_offset
- + cur_node.host_start
- - n->host_start),
- (void *) cur_node.host_start,
- cur_node.host_end - cur_node.host_start);
- else if ((kinds[i] & 7) == 2)
- devicep->dev2host_func (devicep->target_id,
- (void *) cur_node.host_start,
- (void *) (n->tgt->tgt_start
- + n->tgt_offset
- + cur_node.host_start
- - n->host_start),
- cur_node.host_end - cur_node.host_start);
+ if (GOMP_MAP_COPYTO_P (kind & typemask))
+ /* Copy from host to device memory. */
+ devicep->host2dev_func
+ (devicep->target_id,
+ (void *) (n->tgt->tgt_start
+ + n->tgt_offset
+ + cur_node.host_start
+ - n->host_start),
+ (void *) cur_node.host_start,
+ cur_node.host_end - cur_node.host_start);
+ else if (GOMP_MAP_COPYFROM_P (kind & typemask))
+ /* Copy from device to host memory. */
+ devicep->dev2host_func
+ (devicep->target_id,
+ (void *) cur_node.host_start,
+ (void *) (n->tgt->tgt_start
+ + n->tgt_offset
+ + cur_node.host_start
+ - n->host_start),
+ cur_node.host_end - cur_node.host_start);
}
else
gomp_fatal ("Trying to update [%p..%p) object that is not mapped",
(void *) cur_node.host_start,
(void *) cur_node.host_end);
}
- gomp_mutex_unlock (&devicep->dev_env_lock);
+ gomp_mutex_unlock (&mm->lock);
}
+static void gomp_register_image_for_device (struct gomp_device_descr *device,
+ struct offload_image_descr *image);
+
/* This function should be called from every offload image.
It gets the descriptor of the host func and var tables HOST_TABLE, TYPE of
the target, and TARGET_DATA needed by target plugin. */
@@ -612,6 +648,9 @@ GOMP_offload_register (void *host_table, enum offload_target_type target_type,
(num_offload_images + 1)
* sizeof (struct offload_image_descr));
+ if (offload_images == NULL)
+ return;
+
offload_images[num_offload_images].type = target_type;
offload_images[num_offload_images].host_table = host_table;
offload_images[num_offload_images].target_data = target_data;
@@ -621,17 +660,24 @@ GOMP_offload_register (void *host_table, enum offload_target_type target_type,
/* This function initializes the target device, specified by DEVICEP. */
-static void
+attribute_hidden void
gomp_init_device (struct gomp_device_descr *devicep)
{
+ /* Initialize the target device. */
devicep->init_device_func (devicep->target_id);
+
+ devicep->is_initialized = true;
+}
+attribute_hidden void
+gomp_init_tables (const struct gomp_device_descr *devicep,
+ struct gomp_memory_mapping *mm)
+{
/* Get address mapping table for device. */
struct mapping_table *table = NULL;
- int num_entries = devicep->get_table_func (devicep->target_id, &table);
+ int i, num_entries = devicep->get_table_func (devicep->target_id, &table);
/* Insert host-target address mapping into dev_splay_tree. */
- int i;
for (i = 0; i < num_entries; i++)
{
struct target_mem_desc *tgt = gomp_malloc (sizeof (*tgt));
@@ -641,7 +687,7 @@ gomp_init_device (struct gomp_device_descr *devicep)
tgt->tgt_end = table[i].tgt_end;
tgt->to_free = NULL;
tgt->list_count = 0;
- tgt->device_descr = devicep;
+ tgt->device_descr = (struct gomp_device_descr *) devicep;
splay_tree_node node = tgt->array;
splay_tree_key k = &node->key;
k->host_start = table[i].host_start;
@@ -652,11 +698,45 @@ gomp_init_device (struct gomp_device_descr *devicep)
k->tgt = tgt;
node->left = NULL;
node->right = NULL;
- splay_tree_insert (&devicep->dev_splay_tree, node);
+ splay_tree_insert (&mm->splay_tree, node);
}
free (table);
- devicep->is_initialized = true;
+ mm->is_initialized = true;
+}
+
+static void
+gomp_init_dev_tables (struct gomp_device_descr *devicep)
+{
+ gomp_init_device (devicep);
+ gomp_init_tables (devicep, &devicep->mem_map);
+}
+
+
+attribute_hidden void
+gomp_free_memmap (struct gomp_device_descr *devicep)
+{
+ struct gomp_memory_mapping *mm = &devicep->mem_map;
+
+ while (mm->splay_tree.root)
+ {
+ struct target_mem_desc *tgt = mm->splay_tree.root->key.tgt;
+
+ splay_tree_remove (&mm->splay_tree, &mm->splay_tree.root->key);
+ free (tgt->array);
+ free (tgt);
+ }
+
+ mm->is_initialized = false;
+}
+
+attribute_hidden void
+gomp_fini_device (struct gomp_device_descr *devicep)
+{
+ if (devicep->is_initialized)
+ devicep->fini_device_func (devicep->target_id);
+
+ devicep->is_initialized = false;
}
/* Called when encountering a target directive. If DEVICE
@@ -675,7 +755,12 @@ GOMP_target (int device, void (*fn) (void *), const void *openmp_target,
unsigned char *kinds)
{
struct gomp_device_descr *devicep = resolve_device (device);
- if (devicep == NULL)
+ struct gomp_memory_mapping *mm = &devicep->mem_map;
+
+ if (devicep != NULL && !devicep->is_initialized)
+ gomp_init_dev_tables (devicep);
+
+ if (devicep == NULL || !(devicep->capabilities & TARGET_CAP_OPENMP_400))
{
/* Host fallback. */
struct gomp_thread old_thr, *thr = gomp_thread ();
@@ -692,20 +777,30 @@ GOMP_target (int device, void (*fn) (void *), const void *openmp_target,
return;
}
- gomp_mutex_lock (&devicep->dev_env_lock);
- if (!devicep->is_initialized)
- gomp_init_device (devicep);
+ void *fn_addr;
- struct splay_tree_key_s k;
- k.host_start = (uintptr_t) fn;
- k.host_end = k.host_start + 1;
- splay_tree_key tgt_fn = splay_tree_lookup (&devicep->dev_splay_tree, &k);
- if (tgt_fn == NULL)
- gomp_fatal ("Target function wasn't mapped");
- gomp_mutex_unlock (&devicep->dev_env_lock);
+ if (devicep->capabilities & TARGET_CAP_NATIVE_EXEC)
+ fn_addr = (void *) fn;
+ else
+ {
+ gomp_mutex_lock (&mm->lock);
+ if (!devicep->is_initialized)
+ gomp_init_dev_tables (devicep);
+ struct splay_tree_key_s k;
+ k.host_start = (uintptr_t) fn;
+ k.host_end = k.host_start + 1;
+ splay_tree_key tgt_fn = splay_tree_lookup (&devicep->mem_map.splay_tree,
+ &k);
+ if (tgt_fn == NULL)
+ gomp_fatal ("Target function wasn't mapped");
+ gomp_mutex_unlock (&mm->lock);
+
+ fn_addr = (void *) tgt_fn->tgt->tgt_start;
+ }
struct target_mem_desc *tgt_vars
- = gomp_map_vars (devicep, mapnum, hostaddrs, sizes, kinds, true);
+ = gomp_map_vars (devicep, mapnum, hostaddrs, NULL, sizes, kinds, false,
+ true);
struct gomp_thread old_thr, *thr = gomp_thread ();
old_thr = *thr;
memset (thr, '\0', sizeof (*thr));
@@ -714,11 +809,10 @@ GOMP_target (int device, void (*fn) (void *), const void *openmp_target,
thr->place = old_thr.place;
thr->ts.place_partition_len = gomp_places_list_len;
}
- devicep->run_func (devicep->target_id, (void *) tgt_fn->tgt->tgt_start,
- (void *) tgt_vars->tgt_start);
+ devicep->run_func (devicep->target_id, fn_addr, (void *) tgt_vars->tgt_start);
gomp_free_thread (thr);
*thr = old_thr;
- gomp_unmap_vars (tgt_vars);
+ gomp_unmap_vars (tgt_vars, true);
}
void
@@ -726,7 +820,12 @@ GOMP_target_data (int device, const void *openmp_target, size_t mapnum,
void **hostaddrs, size_t *sizes, unsigned char *kinds)
{
struct gomp_device_descr *devicep = resolve_device (device);
- if (devicep == NULL)
+ struct gomp_memory_mapping *mm = &devicep->mem_map;
+
+ if (devicep != NULL && !devicep->is_initialized)
+ gomp_init_dev_tables (devicep);
+
+ if (devicep == NULL || !(devicep->capabilities & TARGET_CAP_OPENMP_400))
{
/* Host fallback. */
struct gomp_task_icv *icv = gomp_icv (false);
@@ -737,20 +836,21 @@ GOMP_target_data (int device, const void *openmp_target, size_t mapnum,
new #pragma omp target data, otherwise GOMP_target_end_data
would get out of sync. */
struct target_mem_desc *tgt
- = gomp_map_vars (NULL, 0, NULL, NULL, NULL, false);
+ = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, false, false);
tgt->prev = icv->target_data;
icv->target_data = tgt;
}
return;
}
- gomp_mutex_lock (&devicep->dev_env_lock);
+ gomp_mutex_lock (&mm->lock);
if (!devicep->is_initialized)
- gomp_init_device (devicep);
- gomp_mutex_unlock (&devicep->dev_env_lock);
+ gomp_init_dev_tables (devicep);
+ gomp_mutex_unlock (&mm->lock);
struct target_mem_desc *tgt
- = gomp_map_vars (devicep, mapnum, hostaddrs, sizes, kinds, false);
+ = gomp_map_vars (devicep, mapnum, hostaddrs, NULL, sizes, kinds, false,
+ false);
struct gomp_task_icv *icv = gomp_icv (true);
tgt->prev = icv->target_data;
icv->target_data = tgt;
@@ -764,7 +864,7 @@ GOMP_target_end_data (void)
{
struct target_mem_desc *tgt = icv->target_data;
icv->target_data = tgt->prev;
- gomp_unmap_vars (tgt);
+ gomp_unmap_vars (tgt, true);
}
}
@@ -773,15 +873,18 @@ GOMP_target_update (int device, const void *openmp_target, size_t mapnum,
void **hostaddrs, size_t *sizes, unsigned char *kinds)
{
struct gomp_device_descr *devicep = resolve_device (device);
- if (devicep == NULL)
- return;
+ struct gomp_memory_mapping *mm = &devicep->mem_map;
- gomp_mutex_lock (&devicep->dev_env_lock);
- if (!devicep->is_initialized)
+ gomp_mutex_lock (&mm->lock);
+ if (devicep != NULL && !devicep->is_initialized)
gomp_init_device (devicep);
- gomp_mutex_unlock (&devicep->dev_env_lock);
+ gomp_mutex_unlock (&mm->lock);
- gomp_update (devicep, mapnum, hostaddrs, sizes, kinds);
+ if (devicep == NULL || !(devicep->capabilities & TARGET_CAP_OPENMP_400))
+ return;
+
+ gomp_update (devicep, &devicep->mem_map, mapnum, hostaddrs, sizes, kinds,
+ false);
}
void
@@ -808,9 +911,22 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
const char *plugin_name)
{
void *plugin_handle = dlopen (plugin_name, RTLD_LAZY);
+ char *err = NULL, *last_missing = NULL;
+ int optional_present, optional_total;
+
if (!plugin_handle)
return false;
+ /* Clear any existing error. */
+ dlerror ();
+
+ device->plugin_handle = dlopen (plugin_name, RTLD_LAZY);
+ if (!device->plugin_handle)
+ {
+ err = dlerror ();
+ goto out;
+ }
+
/* Check if all required functions are available in the plugin and store
their handlers. */
#define DLSYM(f) \
@@ -821,33 +937,104 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
return false; \
} \
while (0)
+ /* Similar, but missing functions are not an error. */
+#define DLSYM_OPT(f,n) \
+ do \
+ { \
+ char *tmp_err; \
+ device->f##_func = dlsym (device->plugin_handle, \
+ "GOMP_OFFLOAD_" #n); \
+ tmp_err = dlerror (); \
+ if (tmp_err == NULL) \
+ optional_present++; \
+ else \
+ last_missing = #n; \
+ optional_total++; \
+ } \
+ while (0)
+
+ DLSYM (get_name);
+ DLSYM (get_caps);
DLSYM (get_type);
DLSYM (get_num_devices);
DLSYM (register_image);
DLSYM (init_device);
+ DLSYM (fini_device);
DLSYM (get_table);
DLSYM (alloc);
DLSYM (free);
DLSYM (dev2host);
DLSYM (host2dev);
- DLSYM (run);
+ device->capabilities = device->get_caps_func ();
+ if (device->capabilities & TARGET_CAP_OPENMP_400)
+ DLSYM (run);
+ if (device->capabilities & TARGET_CAP_OPENACC_200)
+ {
+ optional_present = optional_total = 0;
+ DLSYM_OPT (openacc.exec, openacc_parallel);
+ DLSYM_OPT (openacc.open_device, openacc_open_device);
+ DLSYM_OPT (openacc.close_device, openacc_close_device);
+ DLSYM_OPT (openacc.get_device_num, openacc_get_device_num);
+ DLSYM_OPT (openacc.set_device_num, openacc_set_device_num);
+ DLSYM_OPT (openacc.register_async_cleanup,
+ openacc_register_async_cleanup);
+ DLSYM_OPT (openacc.async_test, openacc_async_test);
+ DLSYM_OPT (openacc.async_test_all, openacc_async_test_all);
+ DLSYM_OPT (openacc.async_wait, openacc_async_wait);
+ DLSYM_OPT (openacc.async_wait_async, openacc_async_wait_async);
+ DLSYM_OPT (openacc.async_wait_all, openacc_async_wait_all);
+ DLSYM_OPT (openacc.async_wait_all_async, openacc_async_wait_all_async);
+ DLSYM_OPT (openacc.async_set_async, openacc_async_set_async);
+ DLSYM_OPT (openacc.create_thread_data, openacc_create_thread_data);
+ DLSYM_OPT (openacc.destroy_thread_data, openacc_destroy_thread_data);
+ /* Require all the OpenACC handlers if we have TARGET_CAP_OPENACC_200. */
+ if (optional_present != optional_total)
+ {
+ err = "plugin missing OpenACC handler function";
+ goto out;
+ }
+ optional_present = optional_total = 0;
+ DLSYM_OPT (openacc.cuda.get_current_device,
+ openacc_get_current_cuda_device);
+ DLSYM_OPT (openacc.cuda.get_current_context,
+ openacc_get_current_cuda_context);
+ DLSYM_OPT (openacc.cuda.get_stream, openacc_get_cuda_stream);
+ DLSYM_OPT (openacc.cuda.set_stream, openacc_set_cuda_stream);
+ /* Make sure all the CUDA functions are there if any of them are. */
+ if (optional_present && optional_present != optional_total)
+ {
+ err = "plugin missing OpenACC CUDA handler function";
+ goto out;
+ }
+ }
#undef DLSYM
+#undef DLSYM_OPT
- return true;
+ out:
+ if (err != NULL)
+ {
+ gomp_error ("while loading %s: %s", plugin_name, err);
+ if (last_missing)
+ gomp_error ("missing function was %s", last_missing);
+ if (device->plugin_handle)
+ dlclose (device->plugin_handle);
+ }
+ return err == NULL;
}
-/* This function finds OFFLOAD_IMAGES corresponding to DEVICE type, and
- registers them in the plugin. */
+/* This function adds a compatible offload image IMAGE to an accelerator device
+ DEVICE. */
static void
-gomp_register_images_for_device (struct gomp_device_descr *device)
+gomp_register_image_for_device (struct gomp_device_descr *device,
+ struct offload_image_descr *image)
{
- int i;
- for (i = 0; i < num_offload_images; i++)
+ if (!device->offload_regions_registered
+ && (device->type == image->type
+ || device->type == OFFLOAD_TARGET_TYPE_HOST))
{
- struct offload_image_descr *image = &offload_images[i];
- if (image->type == device->type)
- device->register_image_func (image->host_table, image->target_data);
+ device->register_image_func (image->host_table, image->target_data);
+ device->offload_regions_registered = true;
}
}
@@ -903,15 +1090,19 @@ gomp_target_init (void)
}
current_device.type = current_device.get_type_func ();
+ current_device.name = current_device.get_name_func ();
current_device.is_initialized = false;
- current_device.dev_splay_tree.root = NULL;
- gomp_register_images_for_device (¤t_device);
+ current_device.offload_regions_registered = false;
+ current_device.mem_map.splay_tree.root = NULL;
+ current_device.mem_map.is_initialized = false;
+ current_device.target_data = NULL;
+ current_device.openacc.data_environ = NULL;
for (i = 0; i < new_num_devices; i++)
{
current_device.id = num_devices + 1;
current_device.target_id = i;
devices[num_devices] = current_device;
- gomp_mutex_init (&devices[num_devices].dev_env_lock);
+ gomp_mutex_init (&devices[num_devices].mem_map.lock);
num_devices++;
}
}
@@ -922,6 +1113,43 @@ gomp_target_init (void)
}
while (next);
+ /* Prefer a device with TARGET_CAP_OPENMP_400 for ICV default-device-var. */
+ if (num_devices > 1)
+ {
+ int d = gomp_icv (false)->default_device_var;
+
+ if (!(devices[d].capabilities & TARGET_CAP_OPENMP_400))
+ {
+ for (i = 0; i < num_devices; i++)
+ {
+ if (devices[i].capabilities & TARGET_CAP_OPENMP_400)
+ {
+ struct gomp_device_descr device_tmp = devices[d];
+ devices[d] = devices[i];
+ devices[d].id = d + 1;
+ devices[i] = device_tmp;
+ devices[i].id = i + 1;
+
+ break;
+ }
+ }
+ }
+ }
+
+ for (i = 0; i < num_devices; i++)
+ {
+ int j;
+
+ for (j = 0; j < num_offload_images; j++)
+ gomp_register_image_for_device (&devices[i], &offload_images[j]);
+
+ /* The 'devices' array can be moved (by the realloc call) until we have
+ found all the plugins, so registering with the OpenACC runtime (which
+ takes a copy of the pointer argument) must be delayed until now. */
+ if (devices[i].capabilities & TARGET_CAP_OPENACC_200)
+ goacc_register (&devices[i]);
+ }
+
free (offload_images);
offload_images = NULL;
num_offload_images = 0;
new file mode 100644
new file mode 100644
@@ -0,0 +1,2 @@
+set cuda_driver_include "@CUDA_DRIVER_INCLUDE@"
+set cuda_driver_lib "@CUDA_DRIVER_LIB@"