commit 8994fb8c1b9d52cb9c82a61227a450df29e61806
Author: Julian Brown <julian@codesourcery.com>
Date: Wed Nov 5 02:54:30 2014 -0800
Move libgomp plugins into their own directory.
@@ -1,7 +1,9 @@
## Process this file with automake to produce Makefile.in
ACLOCAL_AMFLAGS = -I .. -I ../config
-SUBDIRS = testsuite
+SUBDIRS = testsuite plugin
+## Keep DIST_SUBDIRS a superset of SUBDIRS so dist/distclean still recurse into testsuite.
+DIST_SUBDIRS = testsuite plugin
## May be used by toolexeclibdir.
gcc_version := $(shell cat $(top_srcdir)/../gcc/BASE-VER)
@@ -21,27 +22,6 @@ AM_LDFLAGS = $(XLDFLAGS) $(SECTION_LDFLAGS) $(OPT_LDFLAGS)
toolexeclib_LTLIBRARIES = libgomp.la
nodist_toolexeclib_HEADERS = libgomp.spec
-if PLUGIN_NVPTX
-# Nvidia PTX OpenACC plugin.
-libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION)
-toolexeclib_LTLIBRARIES += libgomp-plugin-nvptx.la
-libgomp_plugin_nvptx_la_SOURCES = plugin-nvptx.c
-libgomp_plugin_nvptx_la_CPPFLAGS = $(AM_CPPFLAGS) $(PLUGIN_NVPTX_CPPFLAGS)
-libgomp_plugin_nvptx_la_LDFLAGS = $(libgomp_plugin_nvptx_version_info) \
- $(lt_host_flags)
-libgomp_plugin_nvptx_la_LDFLAGS += $(PLUGIN_NVPTX_LDFLAGS)
-libgomp_plugin_nvptx_la_LIBADD = $(PLUGIN_NVPTX_LIBS)
-libgomp_plugin_nvptx_la_LIBTOOLFLAGS = --tag=disable-static
-endif
-
-libgomp_plugin_host_nonshm_version_info = -version-info $(libtool_VERSION)
-toolexeclib_LTLIBRARIES += libgomp-plugin-host_nonshm.la
-libgomp_plugin_host_nonshm_la_SOURCES = oacc-host.c
-libgomp_plugin_host_nonshm_la_CPPFLAGS = $(AM_CPPFLAGS) -DHOST_NONSHM_PLUGIN
-libgomp_plugin_host_nonshm_la_LDFLAGS = \
- $(libgomp_plugin_host_nonshm_version_info) $(lt_host_flags)
-libgomp_plugin_host_nonshm_la_LIBTOOLFLAGS = --tag=disable-static
-
if LIBGOMP_BUILD_VERSIONED_SHLIB
# -Wc is only a libtool option.
comma = ,
@@ -30,42 +30,6 @@ LIBGOMP_ENABLE(generated-files-in-srcdir, no, ,
AC_MSG_RESULT($enable_generated_files_in_srcdir)
AM_CONDITIONAL(GENINSRC, test "$enable_generated_files_in_srcdir" = yes)
-# Look for the CUDA driver package.
-CUDA_DRIVER_INCLUDE=
-CUDA_DRIVER_LIB=
-AC_SUBST(CUDA_DRIVER_INCLUDE)
-AC_SUBST(CUDA_DRIVER_LIB)
-CUDA_DRIVER_CPPFLAGS=
-CUDA_DRIVER_LDFLAGS=
-AC_ARG_WITH(cuda-driver,
- [AS_HELP_STRING([--with-cuda-driver=PATH],
- [specify prefix directory for installed CUDA driver package.
- Equivalent to --with-cuda-driver-include=PATH/include
- plus --with-cuda-driver-lib=PATH/lib])])
-AC_ARG_WITH(cuda-driver-include,
- [AS_HELP_STRING([--with-cuda-driver-include=PATH],
- [specify directory for installed CUDA driver include files])])
-AC_ARG_WITH(cuda-driver-lib,
- [AS_HELP_STRING([--with-cuda-driver-lib=PATH],
- [specify directory for the installed CUDA driver library])])
-if test "x$with_cuda_driver" != x; then
- CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
- CUDA_DRIVER_LIB=$with_cuda_driver/lib
-fi
-if test "x$with_cuda_driver_include" != x; then
- CUDA_DRIVER_INCLUDE=$with_cuda_driver_include
-fi
-if test "x$with_cuda_driver_lib" != x; then
- CUDA_DRIVER_LIB=$with_cuda_driver_lib
-fi
-if test "x$CUDA_DRIVER_INCLUDE" != x; then
- CUDA_DRIVER_CPPFLAGS=-I$CUDA_DRIVER_INCLUDE
-fi
-if test "x$CUDA_DRIVER_LIB" != x; then
- CUDA_DRIVER_LDFLAGS=-L$CUDA_DRIVER_LIB
-fi
-
-
# -------
# -------
@@ -241,52 +205,7 @@ elif test "x$enable_accelerator" != xno; then
AC_MSG_ERROR([Can't have support for accelerators without support for plugins])
fi
-PLUGIN_NVPTX=0
-PLUGIN_NVPTX_CPPFLAGS=
-PLUGIN_NVPTX_LDFLAGS=
-PLUGIN_NVPTX_LIBS=
-AC_SUBST(PLUGIN_NVPTX)
-AC_SUBST(PLUGIN_NVPTX_CPPFLAGS)
-AC_SUBST(PLUGIN_NVPTX_LDFLAGS)
-AC_SUBST(PLUGIN_NVPTX_LIBS)
-# enable_accelerator has already been validated at top level.
-# No need to do it again.
-case $enable_offload_targets in
- auto-nvptx*|nvptx*)
- PLUGIN_NVPTX=yes
- PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS
- PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS
- PLUGIN_NVPTX_LIBS='-lcuda'
-
- PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS
- CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS"
- PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS
- LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS"
- PLUGIN_NVPTX_save_LIBS=$LIBS
- LIBS="$PLUGIN_NVPTX_LIBS $LIBS"
- AC_LINK_IFELSE(
- [AC_LANG_PROGRAM(
- [#include "cuda.h"],
- [CUresult r = cuCtxPushCurrent (NULL);])],
- [PLUGIN_NVPTX=1])
- CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS
- LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS
- LIBS=$PLUGIN_NVPTX_save_LIBS
- case $PLUGIN_NVPTX in
- auto-nvptx*)
- PLUGIN_NVPTX=0
- AC_MSG_WARN([CUDA driver package required for nvptx support; disabling])
- ;;
- nvptx*)
- PLUGIN_NVPTX=0
- AC_MSG_ERROR([CUDA driver package required for nvptx support])
- ;;
- esac
- ;;
-esac
-AM_CONDITIONAL([PLUGIN_NVPTX], [test $PLUGIN_NVPTX = 1])
-AC_DEFINE_UNQUOTED([PLUGIN_NVPTX], [$PLUGIN_NVPTX],
- [Define to 1 if the NVIDIA plugin is built, 0 if not.])
+AC_CONFIG_SUBDIRS([plugin])
# Check for functions needed.
AC_CHECK_FUNCS(getloadavg clock_gettime strtoull)
@@ -1,4 +1,4 @@
-/* OpenACC Runtime Library: acc_device_host, acc_device_host_nonshm.
+/* OpenACC Runtime Library: acc_device_host.
Copyright (C) 2013-2014 Free Software Foundation, Inc.
@@ -25,430 +25,6 @@
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
-/* Simple implementation of support routines for a shared-memory
- acc_device_host, and a non-shared memory acc_device_host_nonshm, with the
- latter built as a plugin. */
-
-#include "openacc.h"
-#include "config.h"
-#include "libgomp.h"
-#include "target.h"
-#ifdef HOST_NONSHM_PLUGIN
-#include "libgomp-plugin.h"
-#include "oacc-plugin.h"
-#else
-#include "oacc-int.h"
-#endif
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-
-#ifdef HOST_NONSHM_PLUGIN
-#define STATIC
-#define GOMP(X) GOMP_PLUGIN_##X
-#define SELF "host_nonshm plugin: "
-#else
-#define STATIC static
-#define GOMP(X) gomp_##X
-#define SELF "host: "
-#endif
-
-#ifndef HOST_NONSHM_PLUGIN
-static struct gomp_device_descr host_dispatch;
-#endif
-
-STATIC const char *
-GOMP_OFFLOAD_get_name (void)
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
-#endif
-
-#ifdef HOST_NONSHM_PLUGIN
- return "host_nonshm";
-#else
- return "host";
-#endif
-}
-
-STATIC int
-GOMP_OFFLOAD_get_type (void)
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
-#endif
-
-#ifdef HOST_NONSHM_PLUGIN
- return TARGET_TYPE_HOST_NONSHM;
-#else
- return TARGET_TYPE_HOST;
-#endif
-}
-
-STATIC unsigned int
-GOMP_OFFLOAD_get_caps (void)
-{
- unsigned int caps = TARGET_CAP_OPENACC_200 | TARGET_CAP_OPENMP_400
- | TARGET_CAP_NATIVE_EXEC;
-
-#ifndef HOST_NONSHM_PLUGIN
- caps |= TARGET_CAP_SHARED_MEM;
-#endif
-
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s: 0x%x\n", __FILE__, __FUNCTION__, caps);
-#endif
-
- return caps;
-}
-
-STATIC int
-GOMP_OFFLOAD_get_num_devices (void)
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
-#endif
-
- return 1;
-}
-
-STATIC void
-GOMP_OFFLOAD_register_image (void *host_table, void *target_data)
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s (%p, %p)\n", __FILE__, __FUNCTION__, host_table,
- target_data);
-#endif
-}
-
-STATIC int
-GOMP_OFFLOAD_init_device (void)
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
-#endif
-
- return GOMP_OFFLOAD_get_num_devices ();
-}
-
-STATIC int
-GOMP_OFFLOAD_fini_device (void)
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
-#endif
-
- return 0;
-}
-
-STATIC int
-GOMP_OFFLOAD_get_table (struct mapping_table **table)
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s (%p)\n", __FILE__, __FUNCTION__, table);
-#endif
-
- return 0;
-}
-
-STATIC bool
-GOMP_OFFLOAD_openacc_avail (void)
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
-#endif
-
- return 1;
-}
-
-STATIC void *
-GOMP_OFFLOAD_openacc_open_device (int n)
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s (%u)\n", __FILE__, __FUNCTION__, n);
-#endif
-
- return (void *) (intptr_t) n;
-}
-
-STATIC int
-GOMP_OFFLOAD_openacc_close_device (void *hnd)
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s (%p)\n", __FILE__, __FUNCTION__, hnd);
-#endif
-
- return 0;
-}
-
-STATIC int
-GOMP_OFFLOAD_openacc_get_device_num (void)
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
-#endif
-
- return 0;
-}
-
-STATIC void
-GOMP_OFFLOAD_openacc_set_device_num (int n)
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s (%u)\n", __FILE__, __FUNCTION__, n);
-#endif
-
- if (n > 0)
- GOMP(fatal) ("device number %u out of range for host execution", n);
-}
-
-STATIC void *
-GOMP_OFFLOAD_alloc (size_t s)
-{
- void *ptr = GOMP(malloc) (s);
-
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s (%zd): %p\n", __FILE__, __FUNCTION__, s, ptr);
-#endif
-
- return ptr;
-}
-
-STATIC void
-GOMP_OFFLOAD_free (void *p)
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s (%p)\n", __FILE__, __FUNCTION__, p);
-#endif
-
- free (p);
-}
-
-STATIC void *
-GOMP_OFFLOAD_host2dev (void *d, const void *h, size_t s)
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s (%p, %p, %zd)\n", __FILE__, __FUNCTION__, d, h,
- s);
-#endif
-
-#ifdef HOST_NONSHM_PLUGIN
- memcpy (d, h, s);
-#endif
-
- return 0;
-}
-
-STATIC void *
-GOMP_OFFLOAD_dev2host (void *h, const void *d, size_t s)
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s (%p, %p, %zd)\n", __FILE__, __FUNCTION__, h, d,
- s);
-#endif
-
-#ifdef HOST_NONSHM_PLUGIN
- memcpy (h, d, s);
-#endif
-
- return 0;
-}
-
-STATIC void
-GOMP_OFFLOAD_run (void *fn_ptr, void *vars)
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s (%p, %p)\n", __FILE__, __FUNCTION__, fn_ptr,
- vars);
-#endif
-
- void (*fn)(void *) = (void (*)(void *)) fn_ptr;
-
- fn (vars);
-}
-
-STATIC void
-GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *),
- size_t mapnum __attribute__((unused)),
- void **hostaddrs __attribute__((unused)),
- void **devaddrs __attribute__((unused)),
- size_t *sizes __attribute__((unused)),
- unsigned short *kinds __attribute__((unused)),
- int num_gangs __attribute__((unused)),
- int num_workers __attribute__((unused)),
- int vector_length __attribute__((unused)),
- int async __attribute__((unused)),
- void *targ_mem_desc __attribute__((unused)))
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s (%p, %zu, %p, %p, %p, %d, %d, %d, %d, %p)\n",
- __FILE__, __FUNCTION__, fn, mapnum, hostaddrs, sizes, kinds,
- num_gangs, num_workers, vector_length, async, targ_mem_desc);
-#endif
-
-#ifdef HOST_NONSHM_PLUGIN
- fn (devaddrs);
-#else
- fn (hostaddrs);
-#endif
-}
-
-STATIC void
-GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
-{
-#ifdef HOST_NONSHM_PLUGIN
- /* "Asynchronous" launches are executed synchronously on the (non-SHM) host,
- so there's no point in delaying host-side cleanup -- just do it now. */
- GOMP_PLUGIN_async_unmap_vars (targ_mem_desc);
-#endif
-}
-
-STATIC void
-GOMP_OFFLOAD_openacc_async_set_async (int async __attribute__((unused)))
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s (%d)\n", __FILE__, __FUNCTION__, async);
-#endif
-}
-
-STATIC int
-GOMP_OFFLOAD_openacc_async_test (int async __attribute__((unused)))
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s (%d)\n", __FILE__, __FUNCTION__, async);
-#endif
-
- return 1;
-}
-
-STATIC int
-GOMP_OFFLOAD_openacc_async_test_all (void)
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
-#endif
-
- return 1;
-}
-
-STATIC void
-GOMP_OFFLOAD_openacc_async_wait (int async __attribute__((unused)))
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s (%d)\n", __FILE__, __FUNCTION__, async);
-#endif
-}
-
-STATIC void
-GOMP_OFFLOAD_openacc_async_wait_all (void)
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
-#endif
-}
-
-STATIC void
-GOMP_OFFLOAD_openacc_async_wait_async (int async1 __attribute__((unused)),
- int async2 __attribute__((unused)))
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s (%d, %d)\n", __FILE__, __FUNCTION__, async1,
- async2);
-#endif
-}
-
-STATIC void
-GOMP_OFFLOAD_openacc_async_wait_all_async (int async __attribute__((unused)))
-{
-#ifdef DEBUG
- fprintf (stderr, SELF "%s:%s (%d)\n", __FILE__, __FUNCTION__, async);
-#endif
-}
-
-STATIC void *
-GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data __attribute__((unused)))
-{
- return NULL;
-}
-
-STATIC void
-GOMP_OFFLOAD_openacc_destroy_thread_data (void *tls_data __attribute__((unused)))
-{
-}
-
-#ifndef HOST_NONSHM_PLUGIN
-static struct gomp_device_descr host_dispatch =
- {
- .name = "host",
-
- .type = TARGET_TYPE_HOST,
- .capabilities = TARGET_CAP_OPENACC_200 | TARGET_CAP_NATIVE_EXEC
- | TARGET_CAP_SHARED_MEM,
- .id = 0,
-
- .is_initialized = false,
- .offload_regions_registered = false,
-
- .get_name_func = GOMP_OFFLOAD_get_name,
- .get_type_func = GOMP_OFFLOAD_get_type,
- .get_caps_func = GOMP_OFFLOAD_get_caps,
-
- .init_device_func = GOMP_OFFLOAD_init_device,
- .fini_device_func = GOMP_OFFLOAD_fini_device,
- .get_num_devices_func = GOMP_OFFLOAD_get_num_devices,
- .register_image_func = GOMP_OFFLOAD_register_image,
- .get_table_func = GOMP_OFFLOAD_get_table,
-
- .alloc_func = GOMP_OFFLOAD_alloc,
- .free_func = GOMP_OFFLOAD_free,
- .host2dev_func = GOMP_OFFLOAD_host2dev,
- .dev2host_func = GOMP_OFFLOAD_dev2host,
-
- .run_func = GOMP_OFFLOAD_run,
-
- .openacc = {
- .open_device_func = GOMP_OFFLOAD_openacc_open_device,
- .close_device_func = GOMP_OFFLOAD_openacc_close_device,
-
- .get_device_num_func = GOMP_OFFLOAD_openacc_get_device_num,
- .set_device_num_func = GOMP_OFFLOAD_openacc_set_device_num,
-
- /* Device available. */
- .avail_func = GOMP_OFFLOAD_openacc_avail,
-
- .exec_func = GOMP_OFFLOAD_openacc_parallel,
-
- .register_async_cleanup_func
- = GOMP_OFFLOAD_openacc_register_async_cleanup,
-
- .async_set_async_func = GOMP_OFFLOAD_openacc_async_set_async,
- .async_test_func = GOMP_OFFLOAD_openacc_async_test,
- .async_test_all_func = GOMP_OFFLOAD_openacc_async_test_all,
- .async_wait_func = GOMP_OFFLOAD_openacc_async_wait,
- .async_wait_async_func = GOMP_OFFLOAD_openacc_async_wait_async,
- .async_wait_all_func = GOMP_OFFLOAD_openacc_async_wait_all,
- .async_wait_all_async_func = GOMP_OFFLOAD_openacc_async_wait_all_async,
-
- .create_thread_data_func = GOMP_OFFLOAD_openacc_create_thread_data,
- .destroy_thread_data_func = GOMP_OFFLOAD_openacc_destroy_thread_data,
-
- .cuda = {
- .get_current_device_func = NULL,
- .get_current_context_func = NULL,
- .get_stream_func = NULL,
- .set_stream_func = NULL,
- }
- }
- };
-
-/* Register this device type. */
-static __attribute__ ((constructor))
-void ACC_host_init (void)
-{
- gomp_mutex_init (&host_dispatch.mem_map.lock);
- ACC_register (&host_dispatch);
-}
-#endif
-
+/* The in-library "host" device is built from the same source file as the
+   "host_nonshm" plugin (presumably with HOST_NONSHM_PLUGIN left undefined).  */
+#include "plugin/plugin-host.c"
deleted file mode 100644
@@ -1,1909 +0,0 @@
-/* Plugin for NVPTX execution.
-
- Copyright (C) 2013-2014 Free Software Foundation, Inc.
-
- Contributed by Mentor Embedded.
-
- This file is part of the GNU OpenMP Library (libgomp).
-
- Libgomp is free software; you can redistribute it and/or modify it
- under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 3, or (at your option)
- any later version.
-
- Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- more details.
-
- Under Section 7 of GPL version 3, you are granted additional
- permissions described in the GCC Runtime Library Exception, version
- 3.1, as published by the Free Software Foundation.
-
- You should have received a copy of the GNU General Public License and
- a copy of the GCC Runtime Library Exception along with this program;
- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
- <http://www.gnu.org/licenses/>. */
-
-/* Nvidia PTX-specific parts of OpenACC support. The cuda driver
- library appears to hold some implicit state, but the documentation
- is not clear as to what that state might be. Or how one might
- propagate it from one thread to another. */
-
-#include "openacc.h"
-#include "config.h"
-#include "libgomp.h"
-#include "target.h"
-#include "libgomp-plugin.h"
-#include "oacc-ptx.h"
-#include "oacc-plugin.h"
-
-#include <cuda.h>
-#include <stdint.h>
-#include <string.h>
-#include <stdio.h>
-#include <dlfcn.h>
-#include <unistd.h>
-#include <assert.h>
-
-#define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0]))
-
-static struct _errlist
-{
- CUresult r;
- char *m;
-} cuErrorList[] = {
- { CUDA_ERROR_INVALID_VALUE, "invalid value" },
- { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" },
- { CUDA_ERROR_NOT_INITIALIZED, "not initialized" },
- { CUDA_ERROR_DEINITIALIZED, "deinitialized" },
- { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" },
- { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" },
- { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" },
- { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" },
- { CUDA_ERROR_NO_DEVICE, "no device" },
- { CUDA_ERROR_INVALID_DEVICE, "invalid device" },
- { CUDA_ERROR_INVALID_IMAGE, "invalid image" },
- { CUDA_ERROR_INVALID_CONTEXT, "invalid context" },
- { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" },
- { CUDA_ERROR_MAP_FAILED, "map error" },
- { CUDA_ERROR_UNMAP_FAILED, "unmap error" },
- { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" },
- { CUDA_ERROR_ALREADY_MAPPED, "already mapped" },
- { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" },
- { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" },
- { CUDA_ERROR_NOT_MAPPED, "not mapped" },
- { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" },
- { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" },
- { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" },
- { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" },
- { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" },
- { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" },
- { CUDA_ERROR_INVALID_SOURCE, "invalid source" },
- { CUDA_ERROR_FILE_NOT_FOUND, "file not found" },
- { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
- "shared object symbol not found" },
- { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" },
- { CUDA_ERROR_OPERATING_SYSTEM, "operating system" },
- { CUDA_ERROR_INVALID_HANDLE, "invalid handle" },
- { CUDA_ERROR_NOT_FOUND, "not found" },
- { CUDA_ERROR_NOT_READY, "not ready" },
- { CUDA_ERROR_LAUNCH_FAILED, "launch error" },
- { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" },
- { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" },
- { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
- "launch incompatibe texturing" },
- { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" },
- { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled " },
- { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary cotext active" },
- { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" },
- { CUDA_ERROR_ASSERT, "assert" },
- { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" },
- { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
- "host memory already registered" },
- { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" },
- { CUDA_ERROR_NOT_PERMITTED, "no permitted" },
- { CUDA_ERROR_NOT_SUPPORTED, "not supported" },
- { CUDA_ERROR_UNKNOWN, "unknown" }
-};
-
-static char errmsg[128];
-
-static char *
-cuErrorMsg (CUresult r)
-{
- int i;
-
- for (i = 0; i < ARRAYSIZE (cuErrorList); i++)
- {
- if (cuErrorList[i].r == r)
- return &cuErrorList[i].m[0];
- }
-
- sprintf (&errmsg[0], "unknown result code: %5d", r);
-
- return &errmsg[0];
-}
-
-struct targ_fn_descriptor
-{
- CUfunction fn;
- const char *name;
-};
-
-static bool PTX_inited = false;
-
-struct PTX_stream
-{
- CUstream stream;
- pthread_t host_thread;
- bool multithreaded;
-
- CUdeviceptr d;
- void *h;
- void *h_begin;
- void *h_end;
- void *h_next;
- void *h_prev;
- void *h_tail;
-
- struct PTX_stream *next;
-};
-
-/* Thread-specific data for PTX. */
-
-struct nvptx_thread
-{
- struct PTX_stream *current_stream;
- struct PTX_device *ptx_dev;
-};
-
-struct map
-{
- int async;
- size_t size;
- char mappings[0];
-};
-
-static void
-map_init (struct PTX_stream *s)
-{
- CUresult r;
-
- int size = getpagesize ();
-
- assert (s);
- assert (!s->d);
- assert (!s->h);
-
- r = cuMemAllocHost (&s->h, size);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuErrorMsg (r));
-
- r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuErrorMsg (r));
-
- assert (s->h);
-
- s->h_begin = s->h;
- s->h_end = s->h_begin + size;
- s->h_next = s->h_prev = s->h_tail = s->h_begin;
-
- assert (s->h_next);
- assert (s->h_end);
-}
-
-static void
-map_fini (struct PTX_stream *s)
-{
- CUresult r;
-
- r = cuMemFreeHost (s->h);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuErrorMsg (r));
-}
-
-static void
-map_pop (struct PTX_stream *s)
-{
- struct map *m;
-
- assert (s != NULL);
- assert (s->h_next);
- assert (s->h_prev);
- assert (s->h_tail);
-
- m = s->h_tail;
-
- s->h_tail += m->size;
-
- if (s->h_tail >= s->h_end)
- s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
-
- if (s->h_next == s->h_tail)
- s->h_prev = s->h_next;
-
- assert (s->h_next >= s->h_begin);
- assert (s->h_tail >= s->h_begin);
- assert (s->h_prev >= s->h_begin);
-
- assert (s->h_next <= s->h_end);
- assert (s->h_tail <= s->h_end);
- assert (s->h_prev <= s->h_end);
-}
-
-static void
-map_push (struct PTX_stream *s, int async, size_t size, void **h, void **d)
-{
- int left;
- int offset;
- struct map *m;
-
- assert (s != NULL);
-
- left = s->h_end - s->h_next;
- size += sizeof (struct map);
-
- assert (s->h_prev);
- assert (s->h_next);
-
- if (size >= left)
- {
- m = s->h_prev;
- m->size += left;
- s->h_next = s->h_begin;
-
- if (s->h_next + size > s->h_end)
- GOMP_PLUGIN_fatal ("unable to push map");
- }
-
- assert (s->h_next);
-
- m = s->h_next;
- m->async = async;
- m->size = size;
-
- offset = (void *)&m->mappings[0] - s->h;
-
- *d = (void *)(s->d + offset);
- *h = (void *)(s->h + offset);
-
- s->h_prev = s->h_next;
- s->h_next += size;
-
- assert (s->h_prev);
- assert (s->h_next);
-
- assert (s->h_next >= s->h_begin);
- assert (s->h_tail >= s->h_begin);
- assert (s->h_prev >= s->h_begin);
- assert (s->h_next <= s->h_end);
- assert (s->h_tail <= s->h_end);
- assert (s->h_prev <= s->h_end);
-
- return;
-}
-
-struct PTX_device
-{
- CUcontext ctx;
- bool ctx_shared;
- CUdevice dev;
- struct PTX_stream *null_stream;
- /* All non-null streams associated with this device (actually context),
- either created implicitly or passed in from the user (via
- acc_set_cuda_stream). */
- struct PTX_stream *active_streams;
- struct {
- struct PTX_stream **arr;
- int size;
- } async_streams;
- /* A lock for use when manipulating the above stream list and array. */
- gomp_mutex_t stream_lock;
- int ord;
- bool overlap;
- bool map;
- bool concur;
- int mode;
- bool mkern;
-
- struct PTX_device *next;
-};
-
-enum PTX_event_type
-{
- PTX_EVT_MEM,
- PTX_EVT_KNL,
- PTX_EVT_SYNC,
- PTX_EVT_ASYNC_CLEANUP
-};
-
-struct PTX_event
-{
- CUevent *evt;
- int type;
- void *addr;
- int ord;
-
- struct PTX_event *next;
-};
-
-static gomp_mutex_t PTX_event_lock;
-static struct PTX_event *PTX_events;
-
-#define _XSTR(s) _STR(s)
-#define _STR(s) #s
-
-static struct _synames
-{
- char *n;
-} cuSymNames[] =
-{
- { _XSTR(cuCtxCreate) },
- { _XSTR(cuCtxDestroy) },
- { _XSTR(cuCtxGetCurrent) },
- { _XSTR(cuCtxPushCurrent) },
- { _XSTR(cuCtxSynchronize) },
- { _XSTR(cuDeviceGet) },
- { _XSTR(cuDeviceGetAttribute) },
- { _XSTR(cuDeviceGetCount) },
- { _XSTR(cuEventCreate) },
- { _XSTR(cuEventDestroy) },
- { _XSTR(cuEventQuery) },
- { _XSTR(cuEventRecord) },
- { _XSTR(cuInit) },
- { _XSTR(cuLaunchKernel) },
- { _XSTR(cuLinkAddData) },
- { _XSTR(cuLinkComplete) },
- { _XSTR(cuLinkCreate) },
- { _XSTR(cuMemAlloc) },
- { _XSTR(cuMemAllocHost) },
- { _XSTR(cuMemcpy) },
- { _XSTR(cuMemcpyDtoH) },
- { _XSTR(cuMemcpyDtoHAsync) },
- { _XSTR(cuMemcpyHtoD) },
- { _XSTR(cuMemcpyHtoDAsync) },
- { _XSTR(cuMemFree) },
- { _XSTR(cuMemFreeHost) },
- { _XSTR(cuMemGetAddressRange) },
- { _XSTR(cuMemHostGetDevicePointer) },
- { _XSTR(cuMemHostRegister) },
- { _XSTR(cuMemHostUnregister) },
- { _XSTR(cuModuleGetFunction) },
- { _XSTR(cuModuleLoadData) },
- { _XSTR(cuStreamDestroy) },
- { _XSTR(cuStreamQuery) },
- { _XSTR(cuStreamSynchronize) },
- { _XSTR(cuStreamWaitEvent) }
-};
-
-static int
-verify_device_library (void)
-{
- int i;
- void *dh, *ds;
-
- dh = dlopen ("libcuda.so", RTLD_LAZY);
- if (!dh)
- return -1;
-
- for (i = 0; i < ARRAYSIZE (cuSymNames); i++)
- {
- ds = dlsym (dh, cuSymNames[i].n);
- if (!ds)
- return -1;
- }
-
- dlclose (dh);
-
- return 0;
-}
-
-static inline struct nvptx_thread *
-nvptx_thread (void)
-{
- return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
-}
-
-static void
-init_streams_for_device (struct PTX_device *ptx_dev, int concurrency)
-{
- int i;
- struct PTX_stream *null_stream
- = GOMP_PLUGIN_malloc (sizeof (struct PTX_stream));
-
- null_stream->stream = NULL;
- null_stream->host_thread = pthread_self ();
- null_stream->multithreaded = true;
- null_stream->d = (CUdeviceptr) NULL;
- null_stream->h = NULL;
- map_init (null_stream);
- ptx_dev->null_stream = null_stream;
-
- ptx_dev->active_streams = NULL;
- GOMP_PLUGIN_mutex_init (&ptx_dev->stream_lock);
-
- if (concurrency < 1)
- concurrency = 1;
-
- /* This is just a guess -- make space for as many async streams as the
- current device is capable of concurrently executing. This can grow
- later as necessary. No streams are created yet. */
- ptx_dev->async_streams.arr
- = GOMP_PLUGIN_malloc (concurrency * sizeof (struct PTX_stream *));
- ptx_dev->async_streams.size = concurrency;
-
- for (i = 0; i < concurrency; i++)
- ptx_dev->async_streams.arr[i] = NULL;
-}
-
-static void
-fini_streams_for_device (struct PTX_device *ptx_dev)
-{
- free (ptx_dev->async_streams.arr);
-
- while (ptx_dev->active_streams != NULL)
- {
- struct PTX_stream *s = ptx_dev->active_streams;
- ptx_dev->active_streams = ptx_dev->active_streams->next;
-
- cuStreamDestroy (s->stream);
- map_fini (s);
- free (s);
- }
-
- map_fini (ptx_dev->null_stream);
- free (ptx_dev->null_stream);
-}
-
-/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
- thread THREAD (and also current device/context). If CREATE is true, create
- the stream if it does not exist (or use EXISTING if it is non-NULL), and
- associate the stream with the same thread argument. Returns stream to use
- as result. */
-
-static struct PTX_stream *
-select_stream_for_async (int async, pthread_t thread, bool create,
- CUstream existing)
-{
- struct nvptx_thread *nvthd = nvptx_thread ();
- /* Local copy of TLS variable. */
- struct PTX_device *ptx_dev = nvthd->ptx_dev;
- struct PTX_stream *stream = NULL;
- int orig_async = async;
-
- /* The special value acc_async_noval (-1) maps (for now) to an
- implicitly-created stream, which is then handled the same as any other
- numbered async stream. Other options are available, e.g. using the null
- stream for anonymous async operations, or choosing an idle stream from an
- active set. But, stick with this for now. */
- if (async > acc_async_sync)
- async++;
-
- if (create)
- GOMP_PLUGIN_mutex_lock (&ptx_dev->stream_lock);
-
- /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
- null stream, and in fact better performance may be obtainable if it doesn't
- (because the null stream enforces overly-strict synchronisation with
- respect to other streams for legacy reasons, and that's probably not
- needed with OpenACC). Maybe investigate later. */
- if (async == acc_async_sync)
- stream = ptx_dev->null_stream;
- else if (async >= 0 && async < ptx_dev->async_streams.size
- && ptx_dev->async_streams.arr[async] && !(create && existing))
- stream = ptx_dev->async_streams.arr[async];
- else if (async >= 0 && create)
- {
- if (async >= ptx_dev->async_streams.size)
- {
- int i, newsize = ptx_dev->async_streams.size * 2;
-
- if (async >= newsize)
- newsize = async + 1;
-
- ptx_dev->async_streams.arr
- = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
- newsize * sizeof (struct PTX_stream *));
-
- for (i = ptx_dev->async_streams.size; i < newsize; i++)
- ptx_dev->async_streams.arr[i] = NULL;
-
- ptx_dev->async_streams.size = newsize;
- }
-
- /* Create a new stream on-demand if there isn't one already, or if we're
- setting a particular async value to an existing (externally-provided)
- stream. */
- if (!ptx_dev->async_streams.arr[async] || existing)
- {
- CUresult r;
- struct PTX_stream *s
- = GOMP_PLUGIN_malloc (sizeof (struct PTX_stream));
-
- if (existing)
- s->stream = existing;
- else
- {
- r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuErrorMsg (r));
- }
-
- /* If CREATE is true, we're going to be queueing some work on this
- stream. Associate it with the current host thread. */
- s->host_thread = thread;
- s->multithreaded = false;
-
- s->d = (CUdeviceptr) NULL;
- s->h = NULL;
- map_init (s);
-
- s->next = ptx_dev->active_streams;
- ptx_dev->active_streams = s;
- ptx_dev->async_streams.arr[async] = s;
- }
-
- stream = ptx_dev->async_streams.arr[async];
- }
- else if (async < 0)
- GOMP_PLUGIN_fatal ("bad async %d", async);
-
- if (create)
- {
- assert (stream != NULL);
-
- /* If we're trying to use the same stream from different threads
- simultaneously, set stream->multithreaded to true. This affects the
- behaviour of acc_async_test_all and acc_wait_all, which are supposed to
- only wait for asynchronous launches from the same host thread they are
- invoked on. If multiple threads use the same async value, we make note
- of that here and fall back to testing/waiting for all threads in those
- functions. */
- if (thread != stream->host_thread)
- stream->multithreaded = true;
-
- GOMP_PLUGIN_mutex_unlock (&ptx_dev->stream_lock);
- }
- else if (stream && !stream->multithreaded
- && !pthread_equal (stream->host_thread, thread))
- GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
-
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s using stream %p (CUDA stream %p) "
- "for async %d\n", __FILE__, __FUNCTION__, stream,
- stream ? stream->stream : NULL, orig_async);
-#endif
-
- return stream;
-}
-
-static int PTX_get_num_devices (void);
-
-/* Initialize the device. */
-static int
-PTX_init (void)
-{
- CUresult r;
- int rc;
-
- if (PTX_inited)
- return PTX_get_num_devices ();
-
- rc = verify_device_library ();
- if (rc < 0)
- return -1;
-
- r = cuInit (0);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuInit error: %s", cuErrorMsg (r));
-
- PTX_events = NULL;
-
- GOMP_PLUGIN_mutex_init (&PTX_event_lock);
-
- PTX_inited = true;
-
- return PTX_get_num_devices ();
-}
-
-static int
-PTX_fini (void)
-{
- PTX_inited = false;
-
- return 0;
-}
-
-static void *
-PTX_open_device (int n)
-{
- struct PTX_device *ptx_dev;
- CUdevice dev;
- CUresult r;
- int async_engines, pi;
-
- r = cuDeviceGet (&dev, n);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuErrorMsg (r));
-
- ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct PTX_device));
-
- ptx_dev->ord = n;
- ptx_dev->dev = dev;
- ptx_dev->ctx_shared = false;
-
- r = cuCtxGetCurrent (&ptx_dev->ctx);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuErrorMsg (r));
-
- if (!ptx_dev->ctx)
- {
- r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuErrorMsg (r));
- }
- else
- ptx_dev->ctx_shared = true;
-
- r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));
-
- ptx_dev->overlap = pi;
-
- r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));
-
- ptx_dev->map = pi;
-
- r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));
-
- ptx_dev->concur = pi;
-
- r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));
-
- ptx_dev->mode = pi;
-
- r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));
-
- ptx_dev->mkern = pi;
-
- r = cuDeviceGetAttribute (&async_engines,
- CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
- if (r != CUDA_SUCCESS)
- async_engines = 1;
-
- init_streams_for_device (ptx_dev, async_engines);
-
- return (void *) ptx_dev;
-}
-
-static int
-PTX_close_device (void *targ_data)
-{
- CUresult r;
- struct PTX_device *ptx_dev = targ_data;
-
- if (!ptx_dev)
- return 0;
-
- fini_streams_for_device (ptx_dev);
-
- if (!ptx_dev->ctx_shared)
- {
- r = cuCtxDestroy (ptx_dev->ctx);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuErrorMsg (r));
- }
-
- free (ptx_dev);
-
- return 0;
-}
-
-static int
-PTX_get_num_devices (void)
-{
- int n;
- CUresult r;
-
- /* This function will be called before the plugin has been initialized in
- order to enumerate available devices, but CUDA API routines can't be used
- until cuInit has been called. Just call it now (but don't yet do any
- further initialization). */
- if (!PTX_inited)
- cuInit (0);
-
- r = cuDeviceGetCount (&n);
- if (r!= CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuErrorMsg (r));
-
- return n;
-}
-
-static bool
-PTX_avail(void)
-{
- bool avail = false;
-
- if (PTX_init () > 0)
- avail = true;
-
- return avail;
-}
-
-static void
-link_ptx (CUmodule *module, char *ptx_code)
-{
- CUjit_option opts[7];
- void *optvals[7];
- float elapsed = 0.0;
-#define LOGSIZE 8192
- char elog[LOGSIZE];
- char ilog[LOGSIZE];
- unsigned long logsize = LOGSIZE;
- CUlinkState linkstate;
- CUresult r;
- void *linkout;
- size_t linkoutsize __attribute__((unused));
-
- GOMP_PLUGIN_notify ("attempting to load:\n---\n%s\n---\n", ptx_code);
-
- opts[0] = CU_JIT_WALL_TIME;
- optvals[0] = &elapsed;
-
- opts[1] = CU_JIT_INFO_LOG_BUFFER;
- optvals[1] = &ilog[0];
-
- opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
- optvals[2] = (void *) logsize;
-
- opts[3] = CU_JIT_ERROR_LOG_BUFFER;
- optvals[3] = &elog[0];
-
- opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
- optvals[4] = (void *) logsize;
-
- opts[5] = CU_JIT_LOG_VERBOSE;
- optvals[5] = (void *) 1;
-
- opts[6] = CU_JIT_TARGET;
- optvals[6] = (void *) CU_TARGET_COMPUTE_30;
-
- r = cuLinkCreate (7, opts, optvals, &linkstate);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuErrorMsg (r));
-
- char *abort_ptx = ABORT_PTX;
- r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
- strlen (abort_ptx) + 1, 0, 0, 0, 0);
- if (r != CUDA_SUCCESS)
- {
- GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
- GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuErrorMsg (r));
- }
-
- char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
- r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
- strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
- if (r != CUDA_SUCCESS)
- {
- GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
- GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
- cuErrorMsg (r));
- }
-
- char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
- r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
- strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
- if (r != CUDA_SUCCESS)
- {
- GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
- GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
- cuErrorMsg (r));
- }
-
- r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code,
- strlen (ptx_code) + 1, 0, 0, 0, 0);
- if (r != CUDA_SUCCESS)
- {
- GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
- GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuErrorMsg (r));
- }
-
- r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuErrorMsg (r));
-
- GOMP_PLUGIN_notify ("Link complete: %fms\n", elapsed);
- GOMP_PLUGIN_notify ("Link log %s\n", &ilog[0]);
-
- r = cuModuleLoadData (module, linkout);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuErrorMsg (r));
-}
-
-static void
-event_gc (bool memmap_lockable)
-{
- struct PTX_event *ptx_event = PTX_events;
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- GOMP_PLUGIN_mutex_lock (&PTX_event_lock);
-
- while (ptx_event != NULL)
- {
- CUresult r;
- struct PTX_event *e = ptx_event;
-
- ptx_event = ptx_event->next;
-
- if (e->ord != nvthd->ptx_dev->ord)
- continue;
-
- r = cuEventQuery (*e->evt);
- if (r == CUDA_SUCCESS)
- {
- CUevent *te;
-
- te = e->evt;
-
- switch (e->type)
- {
- case PTX_EVT_MEM:
- case PTX_EVT_SYNC:
- break;
-
- case PTX_EVT_KNL:
- map_pop (e->addr);
- break;
-
- case PTX_EVT_ASYNC_CLEANUP:
- {
- /* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
- memory-map splay tree lock for the current device, so we
- can't call it when one of our callers has already claimed
- the lock. In that case, just delay the GC for this event
- until later. */
- if (!memmap_lockable)
- continue;
-
- GOMP_PLUGIN_async_unmap_vars (e->addr);
- }
- break;
- }
-
- cuEventDestroy (*te);
- free ((void *)te);
-
- if (PTX_events == e)
- PTX_events = PTX_events->next;
- else
- {
- struct PTX_event *e_ = PTX_events;
- while (e_->next != e)
- e_ = e_->next;
- e_->next = e_->next->next;
- }
-
- free (e);
- }
- }
-
- GOMP_PLUGIN_mutex_unlock (&PTX_event_lock);
-}
-
-static void
-event_add (enum PTX_event_type type, CUevent *e, void *h)
-{
- struct PTX_event *ptx_event;
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
- || type == PTX_EVT_ASYNC_CLEANUP);
-
- ptx_event = GOMP_PLUGIN_malloc (sizeof (struct PTX_event));
- ptx_event->type = type;
- ptx_event->evt = e;
- ptx_event->addr = h;
- ptx_event->ord = nvthd->ptx_dev->ord;
-
- GOMP_PLUGIN_mutex_lock (&PTX_event_lock);
-
- ptx_event->next = PTX_events;
- PTX_events = ptx_event;
-
- GOMP_PLUGIN_mutex_unlock (&PTX_event_lock);
-}
-
-void
-PTX_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
- size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers,
- int vector_length, int async, void *targ_mem_desc)
-{
- struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
- CUfunction function;
- CUresult r;
- int i;
- struct PTX_stream *dev_str;
- void *kargs[1];
- void *hp, *dp;
- unsigned int nthreads_in_block;
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- function = targ_fn->fn;
-
- dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
- assert (dev_str == nvthd->current_stream);
-
- /* This reserves a chunk of a pre-allocated page of memory mapped on both
- the host and the device. HP is a host pointer to the new chunk, and DP is
- the corresponding device pointer. */
- map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
-
- GOMP_PLUGIN_notify (" %s: prepare mappings\n", __FUNCTION__);
-
- /* Copy the array of arguments to the mapped page. */
- for (i = 0; i < mapnum; i++)
- ((void **) hp)[i] = devaddrs[i];
-
- /* Copy the (device) pointers to arguments to the device (dp and hp might in
- fact have the same value on a unified-memory system). */
- r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuErrorMsg (r));
-
- GOMP_PLUGIN_notify (" %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name);
-
- // XXX: possible geometry mappings??
- //
- // OpenACC CUDA
- //
- // num_gangs blocks
- // num_workers warps (where a warp is equivalent to 32 threads)
- // vector length threads
- //
-
- /* The openacc vector_length clause 'determines the vector length to use for
- vector or SIMD operations'. The question is how to map this to CUDA.
-
- In CUDA, the warp size is the vector length of a CUDA device. However, the
- CUDA interface abstracts away from that, and only shows us warp size
- indirectly in maximum number of threads per block, which is a product of
- warp size and the number of hyperthreads of a multiprocessor.
-
- We choose to map openacc vector_length directly onto the number of threads
- in a block, in the x dimension. This is reflected in gcc code generation
- that uses ThreadIdx.x to access vector elements.
-
- Attempting to use an openacc vector_length of more than the maximum number
- of threads per block will result in a cuda error. */
- nthreads_in_block = vector_length;
-
- kargs[0] = &dp;
- r = cuLaunchKernel (function,
- num_gangs, 1, 1,
- nthreads_in_block, 1, 1,
- 0, dev_str->stream, kargs, 0);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuErrorMsg (r));
-
-#ifndef DISABLE_ASYNC
- if (async < acc_async_noval)
- {
- r = cuStreamSynchronize (dev_str->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r));
- }
- else
- {
- CUevent *e;
-
- e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
-
- r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
-
- event_gc (true);
-
- r = cuEventRecord (*e, dev_str->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
-
- event_add (PTX_EVT_KNL, e, (void *)dev_str);
- }
-#else
- r = cuCtxSynchronize ();
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuErrorMsg (r));
-#endif
-
- GOMP_PLUGIN_notify (" %s: kernel %s: finished\n", __FUNCTION__,
- targ_fn->name);
-
-#ifndef DISABLE_ASYNC
- if (async < acc_async_noval)
-#endif
- map_pop (dev_str);
-}
-
-void * openacc_get_current_cuda_context (void);
-
-static void *
-PTX_alloc (size_t s)
-{
- CUdeviceptr d;
- CUresult r;
-
- r = cuMemAlloc (&d, s);
- if (r == CUDA_ERROR_OUT_OF_MEMORY)
- return 0;
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuErrorMsg (r));
- return (void *)d;
-}
-
-static void
-PTX_free (void *p)
-{
- CUresult r;
- CUdeviceptr pb;
- size_t ps;
-
- r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r));
-
- if ((CUdeviceptr)p != pb)
- GOMP_PLUGIN_fatal ("invalid device address");
-
- r = cuMemFree ((CUdeviceptr)p);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuErrorMsg (r));
-}
-
-static void *
-PTX_host2dev (void *d, const void *h, size_t s)
-{
- CUresult r;
- CUdeviceptr pb;
- size_t ps;
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- if (!s)
- return 0;
-
- if (!d)
- GOMP_PLUGIN_fatal ("invalid device address");
-
- r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r));
-
- if (!pb)
- GOMP_PLUGIN_fatal ("invalid device address");
-
- if (!h)
- GOMP_PLUGIN_fatal ("invalid host address");
-
- if (d == h)
- GOMP_PLUGIN_fatal ("invalid host or device address");
-
- if ((void *)(d + s) > (void *)(pb + ps))
- GOMP_PLUGIN_fatal ("invalid size");
-
-#ifndef DISABLE_ASYNC
- if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
- {
- CUevent *e;
-
- e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
-
- r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
-
- event_gc (false);
-
- r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
- nvthd->current_stream->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuErrorMsg (r));
-
- r = cuEventRecord (*e, nvthd->current_stream->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
-
- event_add (PTX_EVT_MEM, e, (void *)h);
- }
- else
-#endif
- {
- r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuErrorMsg (r));
- }
-
- return 0;
-}
-
-static void *
-PTX_dev2host (void *h, const void *d, size_t s)
-{
- CUresult r;
- CUdeviceptr pb;
- size_t ps;
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- if (!s)
- return 0;
-
- if (!d)
- GOMP_PLUGIN_fatal ("invalid device address");
-
- r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r));
-
- if (!pb)
- GOMP_PLUGIN_fatal ("invalid device address");
-
- if (!h)
- GOMP_PLUGIN_fatal ("invalid host address");
-
- if (d == h)
- GOMP_PLUGIN_fatal ("invalid host or device address");
-
- if ((void *)(d + s) > (void *)(pb + ps))
- GOMP_PLUGIN_fatal ("invalid size");
-
-#ifndef DISABLE_ASYNC
- if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
- {
- CUevent *e;
-
- e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
-
- r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuErrorMsg (r));
-
- event_gc (false);
-
- r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
- nvthd->current_stream->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuErrorMsg (r));
-
- r = cuEventRecord (*e, nvthd->current_stream->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
-
- event_add (PTX_EVT_MEM, e, (void *)h);
- }
- else
-#endif
- {
- r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuErrorMsg (r));
- }
-
- return 0;
-}
-
-static void
-PTX_set_async (int async)
-{
- struct nvptx_thread *nvthd = nvptx_thread ();
- nvthd->current_stream
- = select_stream_for_async (async, pthread_self (), true, NULL);
-}
-
-static int
-PTX_async_test (int async)
-{
- CUresult r;
- struct PTX_stream *s;
-
- s = select_stream_for_async (async, pthread_self (), false, NULL);
-
- if (!s)
- GOMP_PLUGIN_fatal ("unknown async %d", async);
-
- r = cuStreamQuery (s->stream);
- if (r == CUDA_SUCCESS)
- {
- /* The oacc-parallel.c:goacc_wait function calls this hook to determine
- whether all work has completed on this stream, and if so omits the call
- to the wait hook. If that happens, event_gc might not get called
- (which prevents variables from getting unmapped and their associated
- device storage freed), so call it here. */
- event_gc (true);
- return 1;
- }
- else if (r == CUDA_ERROR_NOT_READY)
- return 0;
-
- GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuErrorMsg (r));
-
- return 0;
-}
-
-static int
-PTX_async_test_all (void)
-{
- struct PTX_stream *s;
- pthread_t self = pthread_self ();
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
- for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
- {
- if ((s->multithreaded || pthread_equal (s->host_thread, self))
- && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
- {
- GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
- return 0;
- }
- }
-
- GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-
- event_gc (true);
-
- return 1;
-}
-
-static void
-PTX_wait (int async)
-{
- CUresult r;
- struct PTX_stream *s;
-
- s = select_stream_for_async (async, pthread_self (), false, NULL);
-
- if (!s)
- GOMP_PLUGIN_fatal ("unknown async %d", async);
-
- r = cuStreamSynchronize (s->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r));
-
- event_gc (true);
-}
-
-static void
-PTX_wait_async (int async1, int async2)
-{
- CUresult r;
- CUevent *e;
- struct PTX_stream *s1, *s2;
- pthread_t self = pthread_self ();
-
- /* The stream that is waiting (rather than being waited for) doesn't
- necessarily have to exist already. */
- s2 = select_stream_for_async (async2, self, true, NULL);
-
- s1 = select_stream_for_async (async1, self, false, NULL);
- if (!s1)
- GOMP_PLUGIN_fatal ("invalid async 1\n");
-
- if (s1 == s2)
- GOMP_PLUGIN_fatal ("identical parameters");
-
- e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
-
- r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
-
- event_gc (true);
-
- r = cuEventRecord (*e, s1->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
-
- event_add (PTX_EVT_SYNC, e, NULL);
-
- r = cuStreamWaitEvent (s2->stream, *e, 0);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuErrorMsg (r));
-}
-
-static void
-PTX_wait_all (void)
-{
- CUresult r;
- struct PTX_stream *s;
- pthread_t self = pthread_self ();
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
- /* Wait for active streams initiated by this thread (or by multiple threads)
- to complete. */
- for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
- {
- if (s->multithreaded || pthread_equal (s->host_thread, self))
- {
- r = cuStreamQuery (s->stream);
- if (r == CUDA_SUCCESS)
- continue;
- else if (r != CUDA_ERROR_NOT_READY)
- GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuErrorMsg (r));
-
- r = cuStreamSynchronize (s->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r));
- }
- }
-
- GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-
- event_gc (true);
-}
-
-static void
-PTX_wait_all_async (int async)
-{
- CUresult r;
- struct PTX_stream *waiting_stream, *other_stream;
- CUevent *e;
- struct nvptx_thread *nvthd = nvptx_thread ();
- pthread_t self = pthread_self ();
-
- /* The stream doing the waiting. This could be the first mention of the
- stream, so create it if necessary. */
- waiting_stream
- = select_stream_for_async (async, pthread_self (), true, NULL);
-
- /* Launches on the null stream already block on other streams in the
- context. */
- if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
- return;
-
- event_gc (true);
-
- GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
- for (other_stream = nvthd->ptx_dev->active_streams;
- other_stream != NULL;
- other_stream = other_stream->next)
- {
- if (!other_stream->multithreaded
- && !pthread_equal (other_stream->host_thread, self))
- continue;
-
- e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
-
- r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
-
- /* Record an event on the waited-for stream. */
- r = cuEventRecord (*e, other_stream->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
-
- event_add (PTX_EVT_SYNC, e, NULL);
-
- r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuErrorMsg (r));
- }
-
- GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-}
-
-static void *
-PTX_get_current_cuda_device (void)
-{
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- if (!nvthd || !nvthd->ptx_dev)
- return NULL;
-
- return &nvthd->ptx_dev->dev;
-}
-
-static void *
-PTX_get_current_cuda_context (void)
-{
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- if (!nvthd || !nvthd->ptx_dev)
- return NULL;
-
- return nvthd->ptx_dev->ctx;
-}
-
-static void *
-PTX_get_cuda_stream (int async)
-{
- struct PTX_stream *s;
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- if (!nvthd || !nvthd->ptx_dev)
- return NULL;
-
- s = select_stream_for_async (async, pthread_self (), false, NULL);
-
- return s ? s->stream : NULL;
-}
-
-static int
-PTX_set_cuda_stream (int async, void *stream)
-{
- struct PTX_stream *oldstream;
- pthread_t self = pthread_self ();
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
- if (async < 0)
- GOMP_PLUGIN_fatal ("bad async %d", async);
-
- /* We have a list of active streams and an array mapping async values to
- entries of that list. We need to take "ownership" of the passed-in stream,
- and add it to our list, removing the previous entry also (if there was one)
- in order to prevent resource leaks. Note the potential for surprise
- here: maybe we should keep track of passed-in streams and leave it up to
- the user to tidy those up, but that doesn't work for stream handles
- returned from acc_get_cuda_stream above... */
-
- oldstream = select_stream_for_async (async, self, false, NULL);
-
- if (oldstream)
- {
- if (nvthd->ptx_dev->active_streams == oldstream)
- nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
- else
- {
- struct PTX_stream *s = nvthd->ptx_dev->active_streams;
- while (s->next != oldstream)
- s = s->next;
- s->next = s->next->next;
- }
-
- cuStreamDestroy (oldstream->stream);
- map_fini (oldstream);
- free (oldstream);
- }
-
- GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-
- (void) select_stream_for_async (async, self, true, (CUstream) stream);
-
- return 1;
-}
-
-/* Plugin entry points. */
-
-
-int
-GOMP_OFFLOAD_get_type (void)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
-#endif
-
- return TARGET_TYPE_NVIDIA_PTX;
-}
-
-unsigned int
-GOMP_OFFLOAD_get_caps (void)
-{
- return TARGET_CAP_OPENACC_200;
-}
-
-const char *
-GOMP_OFFLOAD_get_name (void)
-{
- return "nvidia";
-}
-
-int
-GOMP_OFFLOAD_get_num_devices (void)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
-#endif
-
- return PTX_get_num_devices ();
-}
-
-static void **kernel_target_data;
-static void **kernel_host_table;
-
-void
-GOMP_OFFLOAD_register_image (void *host_table, void *target_data)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s (%p, %p)\n", __FILE__, __FUNCTION__,
- host_table, target_data);
-#endif
-
- kernel_target_data = target_data;
- kernel_host_table = host_table;
-}
-
-int
-GOMP_OFFLOAD_init_device (void)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
-#endif
-
- return PTX_init ();
-}
-
-int
-GOMP_OFFLOAD_fini_device (void)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
-#endif
-
- return PTX_fini ();
-}
-
-int
-GOMP_OFFLOAD_get_table (struct mapping_table **tablep)
-{
- CUmodule module;
- void **fn_table;
- char **fn_names;
- int fn_entries, i;
- CUresult r;
- struct targ_fn_descriptor *targ_fns;
-
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__,
- tablep);
-#endif
-
- if (PTX_init () <= 0)
- return 0;
-
- /* This isn't an error, because an image may legitimately have no offloaded
- regions and so will not call GOMP_offload_register. */
- if (kernel_target_data == NULL)
- return 0;
-
- link_ptx (&module, kernel_target_data[0]);
-
- /* kernel_target_data[0] -> ptx code
- kernel_target_data[1] -> variable mappings
- kernel_target_data[2] -> array of kernel names in ascii
-
- kernel_host_table[0] -> start of function addresses (_omp_func_table)
- kernel_host_table[1] -> end of function addresses (_omp_funcs_end)
-
- The array of kernel names and the functions addresses form a
- one-to-one correspondence. */
-
- fn_table = kernel_host_table[0];
- fn_names = (char **) kernel_target_data[2];
- fn_entries = (kernel_host_table[1] - kernel_host_table[0]) / sizeof (void *);
-
- *tablep = GOMP_PLUGIN_malloc (sizeof (struct mapping_table) * fn_entries);
- targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
- * fn_entries);
-
- for (i = 0; i < fn_entries; i++)
- {
- CUfunction function;
-
- r = cuModuleGetFunction (&function, module, fn_names[i]);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuErrorMsg (r));
-
- targ_fns[i].fn = function;
- targ_fns[i].name = (const char *) fn_names[i];
-
- (*tablep)[i].host_start = (uintptr_t) fn_table[i];
- (*tablep)[i].host_end = (*tablep)[i].host_start + 1;
- (*tablep)[i].tgt_start = (uintptr_t) &targ_fns[i];
- (*tablep)[i].tgt_end = (*tablep)[i].tgt_start + 1;
- }
-
- return fn_entries;
-}
-
-void *
-GOMP_OFFLOAD_alloc (size_t size)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s (%zu)\n", __FILE__, __FUNCTION__,
- size);
-#endif
-
- return PTX_alloc (size);
-}
-
-void
-GOMP_OFFLOAD_free (void *ptr)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__, ptr);
-#endif
-
- PTX_free (ptr);
-}
-
-void *
-GOMP_OFFLOAD_dev2host (void *dst, const void *src, size_t n)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s (%p, %p, %zu)\n", __FILE__,
- __FUNCTION__, dst,
- src, n);
-#endif
-
- return PTX_dev2host (dst, src, n);
-}
-
-void *
-GOMP_OFFLOAD_host2dev (void *dst, const void *src, size_t n)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s (%p, %p, %zu)\n", __FILE__,
- __FUNCTION__, dst, src, n);
-#endif
-
- return PTX_host2dev (dst, src, n);
-}
-
-void (*device_run) (void *fn_ptr, void *vars) = NULL;
-
-void
-GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
- void **hostaddrs, void **devaddrs, size_t *sizes,
- unsigned short *kinds, int num_gangs,
- int num_workers, int vector_length, int async,
- void *targ_mem_desc)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s (%p, %zu, %p, %p, %p, %d, %d, %d, "
- "%d, %p)\n", __FILE__, __FUNCTION__, fn, mapnum, hostaddrs, sizes,
- kinds, num_gangs, num_workers, vector_length, async, targ_mem_desc);
-#endif
-
- PTX_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
- num_workers, vector_length, async, targ_mem_desc);
-}
-
-void *
-GOMP_OFFLOAD_openacc_open_device (int n)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__, n);
-#endif
- return PTX_open_device (n);
-}
-
-int
-GOMP_OFFLOAD_openacc_close_device (void *h)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__, h);
-#endif
- return PTX_close_device (h);
-}
-
-void
-GOMP_OFFLOAD_openacc_set_device_num (int n)
-{
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- assert (n >= 0);
-
- if (!nvthd->ptx_dev || nvthd->ptx_dev->ord != n)
- (void) PTX_open_device (n);
-}
-
-/* This can be called before the device is "opened" for the current thread, in
- which case we can't tell which device number should be returned. We don't
- actually want to open the device here, so just return -1 and let the caller
- (oacc-init.c:acc_get_device_num) handle it. */
-
-int
-GOMP_OFFLOAD_openacc_get_device_num (void)
-{
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- if (nvthd && nvthd->ptx_dev)
- return nvthd->ptx_dev->ord;
- else
- return -1;
-}
-
-bool
-GOMP_OFFLOAD_openacc_avail (void)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
-#endif
- return PTX_avail ();
-}
-
-void
-GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
-{
- CUevent *e;
- CUresult r;
- struct nvptx_thread *nvthd = nvptx_thread ();
-
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__,
- targ_mem_desc);
-#endif
-
- e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
-
- r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
-
- r = cuEventRecord (*e, nvthd->current_stream->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
-
- event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
-}
-
-int
-GOMP_OFFLOAD_openacc_async_test (int async)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
- async);
-#endif
- return PTX_async_test (async);
-}
-
-int
-GOMP_OFFLOAD_openacc_async_test_all (void)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
-#endif
- return PTX_async_test_all ();
-}
-
-void
-GOMP_OFFLOAD_openacc_async_wait (int async)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
- async);
-#endif
- PTX_wait (async);
-}
-
-void
-GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s (%d, %d)\n", __FILE__, __FUNCTION__,
- async1, async2);
-#endif
- PTX_wait_async (async1, async2);
-}
-
-void
-GOMP_OFFLOAD_openacc_async_wait_all (void)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
-#endif
- PTX_wait_all ();
-}
-
-void
-GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
- async);
-#endif
- PTX_wait_all_async (async);
-}
-
-void
-GOMP_OFFLOAD_openacc_async_set_async (int async)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
- async);
-#endif
- PTX_set_async (async);
-}
-
-void *
-GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data)
-{
- struct PTX_device *ptx_dev = (struct PTX_device *) targ_data;
- struct nvptx_thread *nvthd
- = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
- CUresult r;
- CUcontext thd_ctx;
-
- r = cuCtxGetCurrent (&thd_ctx);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuErrorMsg (r));
-
- assert (ptx_dev->ctx);
-
- if (!thd_ctx)
- {
- r = cuCtxPushCurrent (ptx_dev->ctx);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuErrorMsg (r));
- }
-
- nvthd->current_stream = ptx_dev->null_stream;
- nvthd->ptx_dev = ptx_dev;
-
- return (void *) nvthd;
-}
-
-void
-GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
-{
- free (data);
-}
-
-void *
-GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
-#endif
- return PTX_get_current_cuda_device ();
-}
-
-void *
-GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
-#endif
- return PTX_get_current_cuda_context ();
-}
-
-/* NOTE: This returns a CUstream, not a PTX_stream pointer. */
-
-void *
-GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
- async);
-#endif
- return PTX_get_cuda_stream (async);
-}
-
-/* NOTE: This takes a CUstream, not a PTX_stream pointer. */
-
-int
-GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
-{
-#ifdef DEBUG
- fprintf (stderr, "libgomp plugin: %s:%s (%d, %p)\n", __FILE__, __FUNCTION__,
- async, stream);
-#endif
- return PTX_set_cuda_stream (async, stream);
-}
new file mode 100644
@@ -0,0 +1,64 @@
+# Plugins for offload execution.
+#
+# Copyright (C) 2014 Free Software Foundation, Inc.
+#
+# Contributed by Mentor Embedded.
+#
+# This file is part of the GNU OpenMP Library (libgomp).
+#
+# Libgomp is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+# more details.
+#
+# Under Section 7 of GPL version 3, you are granted additional
+# permissions described in the GCC Runtime Library Exception, version
+# 3.1, as published by the Free Software Foundation.
+#
+# You should have received a copy of the GNU General Public License and
+# a copy of the GCC Runtime Library Exception along with this program;
+# see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+# <http://www.gnu.org/licenses/>.
+
+ACLOCAL_AMFLAGS = -I ../.. -I ../../config
+
+## May be used by toolexeclibdir: configure substitutes a value that refers
+## to $(gcc_version) when --enable-version-specific-runtime-libs is given.
+gcc_version := $(shell cat $(top_srcdir)/../../gcc/BASE-VER)
+
+# Header search path: the build directory of the parent library, the
+# target-specific config directories, then the source directories.
+config_path = @config_path@
+search_path = .. $(addprefix $(top_srcdir)/../config/, $(config_path)) \
+        $(top_srcdir) $(top_srcdir)/../../include $(top_srcdir)/..
+
+AM_CPPFLAGS = $(addprefix -I, $(search_path))
+AM_CFLAGS = $(XCFLAGS)
+AM_LDFLAGS = $(XLDFLAGS) $(SECTION_LDFLAGS) $(OPT_LDFLAGS)
+
+# Plugins are appended to this list below.
+toolexeclib_LTLIBRARIES =
+
+if PLUGIN_NVPTX
+# Nvidia PTX OpenACC plugin.
+libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION)
+toolexeclib_LTLIBRARIES += libgomp-plugin-nvptx.la
+libgomp_plugin_nvptx_la_SOURCES = plugin-nvptx.c
+libgomp_plugin_nvptx_la_CPPFLAGS = $(AM_CPPFLAGS) $(PLUGIN_NVPTX_CPPFLAGS)
+libgomp_plugin_nvptx_la_LDFLAGS = $(libgomp_plugin_nvptx_version_info) \
+        $(lt_host_flags)
+libgomp_plugin_nvptx_la_LDFLAGS += $(PLUGIN_NVPTX_LDFLAGS)
+libgomp_plugin_nvptx_la_LIBADD = $(PLUGIN_NVPTX_LIBS)
+libgomp_plugin_nvptx_la_LIBTOOLFLAGS = --tag=disable-static
+endif
+
+# Non-shared-memory host plugin; always built.  -DHOST_NONSHM_PLUGIN selects
+# the plugin flavour of plugin-host.c.
+libgomp_plugin_host_nonshm_version_info = -version-info $(libtool_VERSION)
+toolexeclib_LTLIBRARIES += libgomp-plugin-host_nonshm.la
+libgomp_plugin_host_nonshm_la_SOURCES = plugin-host.c
+libgomp_plugin_host_nonshm_la_CPPFLAGS = $(AM_CPPFLAGS) -DHOST_NONSHM_PLUGIN
+libgomp_plugin_host_nonshm_la_LDFLAGS = \
+        $(libgomp_plugin_host_nonshm_version_info) $(lt_host_flags)
+libgomp_plugin_host_nonshm_la_LIBTOOLFLAGS = --tag=disable-static
+
+# Run the user's LDFLAGS through the libtool-ldflags helper script before
+# handing them to libtool.
+LTLDFLAGS = $(shell $(SHELL) $(top_srcdir)/../../libtool-ldflags $(LDFLAGS))
+
+LINK = $(LIBTOOL) --tag CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \
+        $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LTLDFLAGS) -o $@
new file mode 100644
new file mode 100644
new file mode 100644
new file mode 100644
new file mode 100644
@@ -0,0 +1,178 @@
+# Plugins for offload execution.
+#
+# Copyright (C) 2014 Free Software Foundation, Inc.
+#
+# Contributed by Mentor Embedded.
+#
+# This file is part of the GNU OpenMP Library (libgomp).
+#
+# Libgomp is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+# more details.
+#
+# Under Section 7 of GPL version 3, you are granted additional
+# permissions described in the GCC Runtime Library Exception, version
+# 3.1, as published by the Free Software Foundation.
+#
+# You should have received a copy of the GNU General Public License and
+# a copy of the GCC Runtime Library Exception along with this program;
+# see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+# <http://www.gnu.org/licenses/>.
+
+# Basic Autoconf/Automake initialization for the plugin sub-package.
+AC_PREREQ([2.64])
+AC_INIT([Plugins for GNU OpenMP Runtime Library], [1.0], ,[libgomp-plugins])
+# Results of AC_DEFINE below are written to config.h.
+AC_CONFIG_HEADER([config.h])
+
+AC_CANONICAL_SYSTEM
+# Fall back to the host alias when no target alias was given.
+target_alias=${target_alias-$host_alias}
+AC_SUBST(target_alias)
+
+AM_INIT_AUTOMAKE([1.9.0 foreign no-dist])
+
+AM_MAINTAINER_MODE
+
+AC_PROG_CC
+AC_PROG_CXX
+
+# --enable-version-specific-runtime-libs: accept only "yes" or "no" and
+# default to "no".  The result selects the install directory layout below.
+AC_MSG_CHECKING([for --enable-version-specific-runtime-libs])
+AC_ARG_ENABLE([version-specific-runtime-libs],
+  [AS_HELP_STRING([--enable-version-specific-runtime-libs],
+                  [Specify that runtime libraries should be installed in a compiler-specific directory])],
+  [case "$enableval" in
+    yes) enable_version_specific_runtime_libs=yes ;;
+    no)  enable_version_specific_runtime_libs=no ;;
+    *)   AC_MSG_ERROR([Unknown argument to enable/disable version-specific libs]);;
+   esac],
+  [enable_version_specific_runtime_libs=no])
+AC_MSG_RESULT($enable_version_specific_runtime_libs)
+
+# Calculate toolexeclibdir
+# Also toolexecdir, though it's only used in toolexeclibdir
+# The values are single-quoted so that the $(...) references are expanded
+# by make, not by the shell running configure.
+case ${enable_version_specific_runtime_libs} in
+ yes)
+ # Need the gcc compiler version to know where to install libraries
+ # and header files if --enable-version-specific-runtime-libs option
+ # is selected.
+ toolexecdir='$(libdir)/gcc/$(target_alias)'
+ toolexeclibdir='$(toolexecdir)/$(gcc_version)$(MULTISUBDIR)'
+ ;;
+ no)
+ if test -n "$with_cross_host" &&
+ test x"$with_cross_host" != x"no"; then
+ # Install a library built with a cross compiler in tooldir, not libdir.
+ toolexecdir='$(exec_prefix)/$(target_alias)'
+ toolexeclibdir='$(toolexecdir)/lib'
+ else
+ toolexecdir='$(libdir)/gcc-lib/$(target_alias)'
+ toolexeclibdir='$(libdir)'
+ fi
+ # Append the multilib OS directory reported by the compiler, if any.
+ multi_os_directory=`$CC -print-multi-os-directory`
+ case $multi_os_directory in
+ .) ;; # Avoid trailing /.
+ *) toolexeclibdir=$toolexeclibdir/$multi_os_directory ;;
+ esac
+ ;;
+esac
+
+# Libtool setup.
+AC_LIBTOOL_DLOPEN
+AM_PROG_LIBTOOL
+# Forbid libtool to hardcode RPATH, because we want to be able to specify
+# library search directory using LD_LIBRARY_PATH
+hardcode_into_libs=no
+AC_SUBST(toolexecdir)
+AC_SUBST(toolexeclibdir)
+
+# Passed to libtool as -version-info by Makefile.am.
+libtool_VERSION=1:0:0
+AC_SUBST(libtool_VERSION)
+
+# Look for the CUDA driver package.
+CUDA_DRIVER_INCLUDE=
+CUDA_DRIVER_LIB=
+AC_SUBST(CUDA_DRIVER_INCLUDE)
+AC_SUBST(CUDA_DRIVER_LIB)
+CUDA_DRIVER_CPPFLAGS=
+CUDA_DRIVER_LDFLAGS=
+AC_ARG_WITH(cuda-driver,
+  [AS_HELP_STRING([--with-cuda-driver=PATH],
+                  [specify prefix directory for installed CUDA driver package.
+                   Equivalent to --with-cuda-driver-include=PATH/include
+                   plus --with-cuda-driver-lib=PATH/lib])])
+AC_ARG_WITH(cuda-driver-include,
+  [AS_HELP_STRING([--with-cuda-driver-include=PATH],
+                  [specify directory for installed CUDA driver include files])])
+AC_ARG_WITH(cuda-driver-lib,
+  [AS_HELP_STRING([--with-cuda-driver-lib=PATH],
+                  [specify directory for the installed CUDA driver library])])
+# A bare --with-FOO sets the value "yes" and --without-FOO sets "no".
+# Neither is a usable directory, so both are treated like an unset option
+# and the compiler's default search paths are used instead.
+case "x$with_cuda_driver" in
+  x | xyes | xno) ;;
+  *)
+    CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
+    CUDA_DRIVER_LIB=$with_cuda_driver/lib
+    ;;
+esac
+# The more specific options override the prefix given above.
+case "x$with_cuda_driver_include" in
+  x | xyes | xno) ;;
+  *) CUDA_DRIVER_INCLUDE=$with_cuda_driver_include ;;
+esac
+case "x$with_cuda_driver_lib" in
+  x | xyes | xno) ;;
+  *) CUDA_DRIVER_LIB=$with_cuda_driver_lib ;;
+esac
+if test "x$CUDA_DRIVER_INCLUDE" != x; then
+  CUDA_DRIVER_CPPFLAGS=-I$CUDA_DRIVER_INCLUDE
+fi
+if test "x$CUDA_DRIVER_LIB" != x; then
+  CUDA_DRIVER_LDFLAGS=-L$CUDA_DRIVER_LIB
+fi
+
+# Decide whether to build the nvptx plugin.  PLUGIN_NVPTX ends up as 1 when
+# an nvptx offload target was requested and a CUDA test program links,
+# and stays 0 when no nvptx offload target was requested.
+PLUGIN_NVPTX=0
+PLUGIN_NVPTX_CPPFLAGS=
+PLUGIN_NVPTX_LDFLAGS=
+PLUGIN_NVPTX_LIBS=
+AC_SUBST(PLUGIN_NVPTX)
+AC_SUBST(PLUGIN_NVPTX_CPPFLAGS)
+AC_SUBST(PLUGIN_NVPTX_LDFLAGS)
+AC_SUBST(PLUGIN_NVPTX_LIBS)
+
+# enable_offload_targets is a comma-separated list; it is not declared in
+# this file -- presumably inherited from the enclosing configure run.
+for accel in `echo $enable_offload_targets | sed -e 's#,# #g'`; do
+ case "$accel" in
+ nvptx*)
+ # Temporarily hold the target name; replaced by 1 if the link test passes.
+ PLUGIN_NVPTX=$accel
+ PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS
+ PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS
+ PLUGIN_NVPTX_LIBS='-lcuda'
+
+ # Run the link test with the CUDA flags, then restore the saved values.
+ PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS
+ CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS"
+ PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS
+ LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS"
+ PLUGIN_NVPTX_save_LIBS=$LIBS
+ LIBS="$PLUGIN_NVPTX_LIBS $LIBS"
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM(
+ [#include "cuda.h"],
+ [CUresult r = cuCtxPushCurrent (NULL);])],
+ [PLUGIN_NVPTX=1])
+ CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS
+ LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS
+ LIBS=$PLUGIN_NVPTX_save_LIBS
+ # Still holding the target name means the link test failed.
+ case $PLUGIN_NVPTX in
+ nvptx*)
+ PLUGIN_NVPTX=0
+ AC_MSG_ERROR([CUDA driver package required for nvptx support])
+ ;;
+ esac
+ ;;
+ esac
+done
+AM_CONDITIONAL([PLUGIN_NVPTX], [test $PLUGIN_NVPTX = 1])
+AC_DEFINE_UNQUOTED([PLUGIN_NVPTX], [$PLUGIN_NVPTX],
+ [Define to 1 if the NVIDIA plugin is built, 0 if not.])
+
+# Source the parent directory's configure.tgt -- presumably this is what
+# sets config_path, substituted below; confirm against that file.
+. ${srcdir}/../configure.tgt
+
+AC_SUBST(config_path)
+
+AC_CONFIG_FILES([Makefile])
+AC_OUTPUT
new file mode 100644
@@ -0,0 +1,454 @@
+/* OpenACC Runtime Library: acc_device_host, acc_device_host_nonshm.
+
+ Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Simple implementation of support routines for a shared-memory
+ acc_device_host, and a non-shared memory acc_device_host_nonshm, with the
+ latter built as a plugin. */
+
+#include "openacc.h"
+#include "config.h"
+#include "libgomp.h"
+#include "target.h"
+#ifdef HOST_NONSHM_PLUGIN
+#include "libgomp-plugin.h"
+#include "oacc-plugin.h"
+#else
+#include "oacc-int.h"
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#ifdef HOST_NONSHM_PLUGIN
+#define STATIC
+#define GOMP(X) GOMP_PLUGIN_##X
+#define SELF "host_nonshm plugin: "
+#else
+#define STATIC static
+#define GOMP(X) gomp_##X
+#define SELF "host: "
+#endif
+
+#ifndef HOST_NONSHM_PLUGIN
+static struct gomp_device_descr host_dispatch;
+#endif
+
+/* Return the name of this device type: "host_nonshm" when built as the
+   plugin, "host" otherwise.  */
+
+STATIC const char *
+GOMP_OFFLOAD_get_name (void)
+{
+#ifdef HOST_NONSHM_PLUGIN
+  const char *device_name = "host_nonshm";
+#else
+  const char *device_name = "host";
+#endif
+
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+  return device_name;
+}
+
+/* Return the target type constant for this build flavour.  */
+
+STATIC int
+GOMP_OFFLOAD_get_type (void)
+{
+#ifdef HOST_NONSHM_PLUGIN
+  int device_type = TARGET_TYPE_HOST_NONSHM;
+#else
+  int device_type = TARGET_TYPE_HOST;
+#endif
+
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+  return device_type;
+}
+
+/* Return the capability bit mask of the device.  The shared-memory bit is
+   only reported by the non-plugin (true host) build.  */
+
+STATIC unsigned int
+GOMP_OFFLOAD_get_caps (void)
+{
+#ifdef HOST_NONSHM_PLUGIN
+  unsigned int caps = (TARGET_CAP_OPENACC_200
+                       | TARGET_CAP_OPENMP_400
+                       | TARGET_CAP_NATIVE_EXEC);
+#else
+  unsigned int caps = (TARGET_CAP_OPENACC_200
+                       | TARGET_CAP_OPENMP_400
+                       | TARGET_CAP_NATIVE_EXEC
+                       | TARGET_CAP_SHARED_MEM);
+#endif
+
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s: 0x%x\n", __FILE__, __FUNCTION__, caps);
+#endif
+
+  return caps;
+}
+
+/* Return the number of devices of this type, which is always one.  */
+
+STATIC int
+GOMP_OFFLOAD_get_num_devices (void)
+{
+  const int host_device_count = 1;
+
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+  return host_device_count;
+}
+
+/* Image registration hook.  Nothing is registered here; the arguments are
+   only used by the optional debug trace.  */
+
+STATIC void
+GOMP_OFFLOAD_register_image (void *host_table, void *target_data)
+{
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s (%p, %p)\n",
+           __FILE__, __FUNCTION__,
+           host_table, target_data);
+#endif
+}
+
+/* Initialize the device.  There is nothing to set up; the result is the
+   value of GOMP_OFFLOAD_get_num_devices.  */
+
+STATIC int
+GOMP_OFFLOAD_init_device (void)
+{
+  int device_count;
+
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+  device_count = GOMP_OFFLOAD_get_num_devices ();
+  return device_count;
+}
+
+/* Finalize the device.  Nothing to tear down; always returns 0.  */
+
+STATIC int
+GOMP_OFFLOAD_fini_device (void)
+{
+  const int status = 0;
+
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+  return status;
+}
+
+/* Mapping-table query.  *TABLE is left untouched and 0 is returned.  */
+
+STATIC int
+GOMP_OFFLOAD_get_table (struct mapping_table **table)
+{
+  const int entry_count = 0;
+
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s (%p)\n", __FILE__, __FUNCTION__, table);
+#endif
+
+  return entry_count;
+}
+
+/* Report whether the device is available; this implementation always
+   answers yes.  */
+
+STATIC bool
+GOMP_OFFLOAD_openacc_avail (void)
+{
+  const bool available = 1;
+
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+  return available;
+}
+
+/* Open device N.  The returned handle is simply N converted to a
+   pointer.  */
+
+STATIC void *
+GOMP_OFFLOAD_openacc_open_device (int n)
+{
+  intptr_t handle = (intptr_t) n;
+
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s (%u)\n", __FILE__, __FUNCTION__, n);
+#endif
+
+  return (void *) handle;
+}
+
+/* Close the device identified by HND.  Nothing to release; returns 0.  */
+
+STATIC int
+GOMP_OFFLOAD_openacc_close_device (void *hnd)
+{
+  const int status = 0;
+
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s (%p)\n", __FILE__, __FUNCTION__, hnd);
+#endif
+
+  return status;
+}
+
+/* Return the current device number, which is always 0.  */
+
+STATIC int
+GOMP_OFFLOAD_openacc_get_device_num (void)
+{
+  const int current_device = 0;
+
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+  return current_device;
+}
+
+STATIC void
+GOMP_OFFLOAD_openacc_set_device_num (int n)
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s (%u)\n", __FILE__, __FUNCTION__, n);
+#endif
+
+ if (n > 0)
+ GOMP(fatal) ("device number %u out of range for host execution", n);
+}
+
+/* Allocate S bytes of "device" memory using the library's malloc wrapper
+   and return the new block.  */
+
+STATIC void *
+GOMP_OFFLOAD_alloc (size_t s)
+{
+  void *block;
+
+  block = GOMP(malloc) (s);
+
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s (%zd): %p\n", __FILE__, __FUNCTION__, s,
+           block);
+#endif
+
+  return block;
+}
+
+/* Release memory P with free.  */
+
+STATIC void
+GOMP_OFFLOAD_free (void *p)
+{
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s (%p)\n",
+           __FILE__, __FUNCTION__, p);
+#endif
+
+  free (p);
+}
+
+/* Copy S bytes from host address H to device address D.  The copy is only
+   performed in the non-shared-memory plugin build.  Always returns a null
+   pointer.  */
+
+STATIC void *
+GOMP_OFFLOAD_host2dev (void *d, const void *h, size_t s)
+{
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s (%p, %p, %zd)\n",
+           __FILE__, __FUNCTION__, d, h, s);
+#endif
+
+#if defined (HOST_NONSHM_PLUGIN)
+  memcpy (d, h, s);
+#endif
+
+  return NULL;
+}
+
+/* Copy S bytes from device address D to host address H.  The copy is only
+   performed in the non-shared-memory plugin build.  Always returns a null
+   pointer.  */
+
+STATIC void *
+GOMP_OFFLOAD_dev2host (void *h, const void *d, size_t s)
+{
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s (%p, %p, %zd)\n",
+           __FILE__, __FUNCTION__, h, d, s);
+#endif
+
+#if defined (HOST_NONSHM_PLUGIN)
+  memcpy (h, d, s);
+#endif
+
+  return NULL;
+}
+
+/* Run the function at FN_PTR, passing VARS as its only argument.  */
+
+STATIC void
+GOMP_OFFLOAD_run (void *fn_ptr, void *vars)
+{
+  typedef void (*region_fn) (void *);
+
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s (%p, %p)\n",
+           __FILE__, __FUNCTION__, fn_ptr, vars);
+#endif
+
+  ((region_fn) fn_ptr) (vars);
+}
+
+/* Execute the OpenACC parallel region FN directly on the host.  The plugin
+   build passes DEVADDRS to FN, the shared-memory build passes HOSTADDRS.
+   The remaining arguments are only used by the optional debug trace.  */
+
+STATIC void
+GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *),
+                               size_t mapnum __attribute__((unused)),
+                               void **hostaddrs __attribute__((unused)),
+                               void **devaddrs __attribute__((unused)),
+                               size_t *sizes __attribute__((unused)),
+                               unsigned short *kinds __attribute__((unused)),
+                               int num_gangs __attribute__((unused)),
+                               int num_workers __attribute__((unused)),
+                               int vector_length __attribute__((unused)),
+                               int async __attribute__((unused)),
+                               void *targ_mem_desc __attribute__((unused)))
+{
+#ifdef HOST_NONSHM_PLUGIN
+  void **region_args = devaddrs;
+#else
+  void **region_args = hostaddrs;
+#endif
+
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s (%p, %zu, %p, %p, %p, %d, %d, %d, %d, %p)\n",
+           __FILE__, __FUNCTION__, fn, mapnum, hostaddrs, sizes, kinds,
+           num_gangs, num_workers, vector_length, async, targ_mem_desc);
+#endif
+
+  fn (region_args);
+}
+
+/* Register cleanup of TARG_MEM_DESC for an asynchronous launch.  Only the
+   plugin build does anything here.  */
+
+STATIC void
+GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
+{
+#if defined (HOST_NONSHM_PLUGIN)
+  /* The (non-SHM) host runs "asynchronous" launches synchronously, so
+     deferring the host-side cleanup gains nothing: unmap right away.  */
+  GOMP_PLUGIN_async_unmap_vars (targ_mem_desc);
+#endif
+}
+
+/* Select async queue ASYNC.  Nothing to do apart from the optional debug
+   trace.  */
+
+STATIC void
+GOMP_OFFLOAD_openacc_async_set_async (int async __attribute__((unused)))
+{
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s (%d)\n",
+           __FILE__, __FUNCTION__, async);
+#endif
+}
+
+/* Test async queue ASYNC; this implementation always returns 1.  */
+
+STATIC int
+GOMP_OFFLOAD_openacc_async_test (int async __attribute__((unused)))
+{
+  const int completed = 1;
+
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s (%d)\n", __FILE__, __FUNCTION__, async);
+#endif
+
+  return completed;
+}
+
+/* Test all async queues; this implementation always returns 1.  */
+
+STATIC int
+GOMP_OFFLOAD_openacc_async_test_all (void)
+{
+  const int completed = 1;
+
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+  return completed;
+}
+
+/* Wait for async queue ASYNC.  Nothing to do apart from the optional debug
+   trace.  */
+
+STATIC void
+GOMP_OFFLOAD_openacc_async_wait (int async __attribute__((unused)))
+{
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s (%d)\n",
+           __FILE__, __FUNCTION__, async);
+#endif
+}
+
+/* Wait for all async queues.  Nothing to do apart from the optional debug
+   trace.  */
+
+STATIC void
+GOMP_OFFLOAD_openacc_async_wait_all (void)
+{
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s\n",
+           __FILE__, __FUNCTION__);
+#endif
+}
+
+/* Make queue ASYNC2 wait for queue ASYNC1.  Nothing to do apart from the
+   optional debug trace.  */
+
+STATIC void
+GOMP_OFFLOAD_openacc_async_wait_async (int async1 __attribute__((unused)),
+                                       int async2 __attribute__((unused)))
+{
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s (%d, %d)\n",
+           __FILE__, __FUNCTION__, async1, async2);
+#endif
+}
+
+/* Make queue ASYNC wait for all other queues.  Nothing to do apart from
+   the optional debug trace.  */
+
+STATIC void
+GOMP_OFFLOAD_openacc_async_wait_all_async (int async __attribute__((unused)))
+{
+#if defined (DEBUG)
+  fprintf (stderr, SELF "%s:%s (%d)\n",
+           __FILE__, __FUNCTION__, async);
+#endif
+}
+
+/* Create per-thread data for the device described by TARG_DATA.  No such
+   data is kept here, so the result is a null pointer.  */
+
+STATIC void *
+GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data
+                                         __attribute__((unused)))
+{
+  void *thread_data = NULL;
+
+  return thread_data;
+}
+
+/* Destroy per-thread data TLS_DATA.  Nothing was allocated, so there is
+   nothing to release.  */
+
+STATIC void
+GOMP_OFFLOAD_openacc_destroy_thread_data (void *tls_data
+                                          __attribute__((unused)))
+{
+  return;
+}
+
+#ifndef HOST_NONSHM_PLUGIN
+/* Device descriptor for the shared-memory host device.  It wires the
+   GOMP_OFFLOAD_* functions defined above into the dispatch table.  */
+static struct gomp_device_descr host_dispatch =
+ {
+ .name = "host",
+
+ .type = TARGET_TYPE_HOST,
+ /* NOTE(review): GOMP_OFFLOAD_get_caps additionally reports
+ TARGET_CAP_OPENMP_400; confirm whether the omission here is intended. */
+ .capabilities = TARGET_CAP_OPENACC_200 | TARGET_CAP_NATIVE_EXEC
+ | TARGET_CAP_SHARED_MEM,
+ .id = 0,
+
+ .is_initialized = false,
+ .offload_regions_registered = false,
+
+ .get_name_func = GOMP_OFFLOAD_get_name,
+ .get_type_func = GOMP_OFFLOAD_get_type,
+ .get_caps_func = GOMP_OFFLOAD_get_caps,
+
+ .init_device_func = GOMP_OFFLOAD_init_device,
+ .fini_device_func = GOMP_OFFLOAD_fini_device,
+ .get_num_devices_func = GOMP_OFFLOAD_get_num_devices,
+ .register_image_func = GOMP_OFFLOAD_register_image,
+ .get_table_func = GOMP_OFFLOAD_get_table,
+
+ .alloc_func = GOMP_OFFLOAD_alloc,
+ .free_func = GOMP_OFFLOAD_free,
+ .host2dev_func = GOMP_OFFLOAD_host2dev,
+ .dev2host_func = GOMP_OFFLOAD_dev2host,
+
+ .run_func = GOMP_OFFLOAD_run,
+
+ /* OpenACC-specific entry points. */
+ .openacc = {
+ .open_device_func = GOMP_OFFLOAD_openacc_open_device,
+ .close_device_func = GOMP_OFFLOAD_openacc_close_device,
+
+ .get_device_num_func = GOMP_OFFLOAD_openacc_get_device_num,
+ .set_device_num_func = GOMP_OFFLOAD_openacc_set_device_num,
+
+ /* Device available. */
+ .avail_func = GOMP_OFFLOAD_openacc_avail,
+
+ .exec_func = GOMP_OFFLOAD_openacc_parallel,
+
+ .register_async_cleanup_func
+ = GOMP_OFFLOAD_openacc_register_async_cleanup,
+
+ .async_set_async_func = GOMP_OFFLOAD_openacc_async_set_async,
+ .async_test_func = GOMP_OFFLOAD_openacc_async_test,
+ .async_test_all_func = GOMP_OFFLOAD_openacc_async_test_all,
+ .async_wait_func = GOMP_OFFLOAD_openacc_async_wait,
+ .async_wait_async_func = GOMP_OFFLOAD_openacc_async_wait_async,
+ .async_wait_all_func = GOMP_OFFLOAD_openacc_async_wait_all,
+ .async_wait_all_async_func = GOMP_OFFLOAD_openacc_async_wait_all_async,
+
+ .create_thread_data_func = GOMP_OFFLOAD_openacc_create_thread_data,
+ .destroy_thread_data_func = GOMP_OFFLOAD_openacc_destroy_thread_data,
+
+ /* No CUDA interoperability hooks are provided for the host. */
+ .cuda = {
+ .get_current_device_func = NULL,
+ .get_current_context_func = NULL,
+ .get_stream_func = NULL,
+ .set_stream_func = NULL,
+ }
+ }
+ };
+
+/* Constructor, run at load time: set up the lock of the host descriptor's
+   memory map and register the descriptor as a device type.  */
+static void __attribute__ ((constructor))
+ACC_host_init (void)
+{
+  struct gomp_device_descr *host = &host_dispatch;
+
+  gomp_mutex_init (&host->mem_map.lock);
+  ACC_register (host);
+}
+#endif
+
new file mode 100644
@@ -0,0 +1,1909 @@
+/* Plugin for NVPTX execution.
+
+ Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Nvidia PTX-specific parts of OpenACC support. The cuda driver
+ library appears to hold some implicit state, but the documentation
+ is not clear as to what that state might be. Or how one might
+ propagate it from one thread to another. */
+
+#include "openacc.h"
+#include "config.h"
+#include "libgomp.h"
+#include "target.h"
+#include "libgomp-plugin.h"
+#include "oacc-ptx.h"
+#include "oacc-plugin.h"
+
+#include <cuda.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <dlfcn.h>
+#include <unistd.h>
+#include <assert.h>
+
+#define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0]))
+
+/* Table mapping CUDA driver result codes to short human-readable messages.
+   Searched linearly by cuErrorMsg.  */
+
+static struct _errlist
+{
+  CUresult r;	/* Driver result code.  */
+  char *m;	/* Message text for R.  */
+} cuErrorList[] = {
+  { CUDA_ERROR_INVALID_VALUE, "invalid value" },
+  { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" },
+  { CUDA_ERROR_NOT_INITIALIZED, "not initialized" },
+  { CUDA_ERROR_DEINITIALIZED, "deinitialized" },
+  { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" },
+  { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" },
+  { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" },
+  { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" },
+  { CUDA_ERROR_NO_DEVICE, "no device" },
+  { CUDA_ERROR_INVALID_DEVICE, "invalid device" },
+  { CUDA_ERROR_INVALID_IMAGE, "invalid image" },
+  { CUDA_ERROR_INVALID_CONTEXT, "invalid context" },
+  { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" },
+  { CUDA_ERROR_MAP_FAILED, "map error" },
+  { CUDA_ERROR_UNMAP_FAILED, "unmap error" },
+  { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" },
+  { CUDA_ERROR_ALREADY_MAPPED, "already mapped" },
+  { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" },
+  { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" },
+  { CUDA_ERROR_NOT_MAPPED, "not mapped" },
+  { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" },
+  { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" },
+  { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" },
+  { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" },
+  { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" },
+  { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" },
+  { CUDA_ERROR_INVALID_SOURCE, "invalid source" },
+  { CUDA_ERROR_FILE_NOT_FOUND, "file not found" },
+  { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+    "shared object symbol not found" },
+  { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" },
+  { CUDA_ERROR_OPERATING_SYSTEM, "operating system" },
+  { CUDA_ERROR_INVALID_HANDLE, "invalid handle" },
+  { CUDA_ERROR_NOT_FOUND, "not found" },
+  { CUDA_ERROR_NOT_READY, "not ready" },
+  { CUDA_ERROR_LAUNCH_FAILED, "launch error" },
+  { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" },
+  { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" },
+  { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+    "launch incompatible texturing" },
+  { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" },
+  { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled" },
+  { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary context active" },
+  { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" },
+  { CUDA_ERROR_ASSERT, "assert" },
+  { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" },
+  { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
+    "host memory already registered" },
+  { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" },
+  { CUDA_ERROR_NOT_PERMITTED, "not permitted" },
+  { CUDA_ERROR_NOT_SUPPORTED, "not supported" },
+  { CUDA_ERROR_UNKNOWN, "unknown" }
+};
+
+/* Scratch buffer for messages about result codes missing from
+   cuErrorList.  Shared by all callers of cuErrorMsg.  */
+static char errmsg[128];
+
+/* Return a message describing CUDA driver result R.  Known codes yield the
+   string from cuErrorList; any other code is formatted into the static
+   buffer ERRMSG, which the next such call overwrites.  */
+
+static char *
+cuErrorMsg (CUresult r)
+{
+  size_t idx = 0;
+
+  while (idx < ARRAYSIZE (cuErrorList))
+    {
+      if (cuErrorList[idx].r == r)
+        return cuErrorList[idx].m;
+      idx++;
+    }
+
+  snprintf (errmsg, sizeof errmsg, "unknown result code: %5d", r);
+
+  return errmsg;
+}
+
+/* Pairs a CUDA function handle with a name -- presumably the kernel's
+   name; confirm against the code that fills this in. */
+struct targ_fn_descriptor
+{
+ CUfunction fn;
+ const char *name;
+};
+
+static bool PTX_inited = false;
+
+/* A CUDA stream together with its page of host-allocated mapping memory.
+ The h_* pointers track positions inside that page; see map_init,
+ map_push and map_pop. */
+struct PTX_stream
+{
+ CUstream stream;
+ /* Host thread the stream is associated with. */
+ pthread_t host_thread;
+ bool multithreaded;
+
+ /* Device address corresponding to H (from cuMemHostGetDevicePointer). */
+ CUdeviceptr d;
+ /* Host buffer obtained from cuMemAllocHost. */
+ void *h;
+ /* First byte of the buffer and one past its last byte. */
+ void *h_begin;
+ void *h_end;
+ /* Where the next entry is pushed, the most recently pushed entry, and
+ the oldest entry not yet popped. */
+ void *h_next;
+ void *h_prev;
+ void *h_tail;
+
+ /* Link in the owning device's list of active streams. */
+ struct PTX_stream *next;
+};
+
+/* Thread-specific data for PTX. */
+
+struct nvptx_thread
+{
+ /* Stream currently selected for this thread. */
+ struct PTX_stream *current_stream;
+ /* Device this thread is using. */
+ struct PTX_device *ptx_dev;
+};
+
+/* Header of one entry in a stream's mapping buffer. The payload follows
+ the header directly. */
+struct map
+{
+ /* Async value recorded by map_push. */
+ int async;
+ /* Size of the entry in bytes, including this header. */
+ size_t size;
+ /* Start of the payload (zero-length array, a GNU C extension). */
+ char mappings[0];
+};
+
+/* Set up the mapping buffer of stream S: allocate one page of host memory
+   through the CUDA driver, look up the matching device address, and point
+   all buffer cursors at the start of the page.  S must not already own a
+   buffer.  Any driver failure is fatal.  */
+
+static void
+map_init (struct PTX_stream *s)
+{
+  int page_bytes = getpagesize ();
+  CUresult status;
+
+  assert (s);
+  assert (!s->d);
+  assert (!s->h);
+
+  status = cuMemAllocHost (&s->h, page_bytes);
+  if (status != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuErrorMsg (status));
+
+  status = cuMemHostGetDevicePointer (&s->d, s->h, 0);
+  if (status != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s",
+                       cuErrorMsg (status));
+
+  assert (s->h);
+
+  /* The buffer starts out empty: every cursor sits at its first byte.  */
+  s->h_begin = s->h;
+  s->h_end = s->h_begin + page_bytes;
+  s->h_next = s->h_begin;
+  s->h_prev = s->h_begin;
+  s->h_tail = s->h_begin;
+
+  assert (s->h_next);
+  assert (s->h_end);
+}
+
+/* Release the host mapping buffer of stream S.  A driver failure is
+   fatal.  */
+
+static void
+map_fini (struct PTX_stream *s)
+{
+  CUresult status = cuMemFreeHost (s->h);
+
+  if (status != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuErrorMsg (status));
+}
+
+/* Retire the oldest entry of stream S's mapping buffer by advancing h_tail
+ past it. */
+static void
+map_pop (struct PTX_stream *s)
+{
+ struct map *m;
+
+ assert (s != NULL);
+ assert (s->h_next);
+ assert (s->h_prev);
+ assert (s->h_tail);
+
+ /* The entry header at the tail records how many bytes to skip. */
+ m = s->h_tail;
+
+ s->h_tail += m->size;
+
+ /* Wrap around to the start of the buffer when the end is reached. */
+ if (s->h_tail >= s->h_end)
+ s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
+
+ /* Tail caught up with the push position: reset h_prev to match. */
+ if (s->h_next == s->h_tail)
+ s->h_prev = s->h_next;
+
+ /* Sanity checks: every cursor stays within [h_begin, h_end]. */
+ assert (s->h_next >= s->h_begin);
+ assert (s->h_tail >= s->h_begin);
+ assert (s->h_prev >= s->h_begin);
+
+ assert (s->h_next <= s->h_end);
+ assert (s->h_tail <= s->h_end);
+ assert (s->h_prev <= s->h_end);
+}
+
+/* Reserve an entry with SIZE payload bytes in stream S's mapping buffer and
+ record ASYNC in its header. On return *H is the host address and *D the
+ device address of the payload. Fatal if the entry does not fit even at
+ the start of the buffer. */
+static void
+map_push (struct PTX_stream *s, int async, size_t size, void **h, void **d)
+{
+ int left;
+ int offset;
+ struct map *m;
+
+ assert (s != NULL);
+
+ /* Bytes remaining before the end of the buffer; SIZE now includes the
+ entry header. */
+ left = s->h_end - s->h_next;
+ size += sizeof (struct map);
+
+ assert (s->h_prev);
+ assert (s->h_next);
+
+ if (size >= left)
+ {
+ /* Not enough room: fold the unused remainder into the previous entry
+ and restart from the beginning of the buffer. */
+ m = s->h_prev;
+ m->size += left;
+ s->h_next = s->h_begin;
+
+ /* NOTE(review): no check against h_tail is visible here, so entries
+ not yet popped could be overwritten -- confirm callers prevent it. */
+ if (s->h_next + size > s->h_end)
+ GOMP_PLUGIN_fatal ("unable to push map");
+ }
+
+ assert (s->h_next);
+
+ m = s->h_next;
+ m->async = async;
+ m->size = size;
+
+ /* Offset of the payload within the buffer; the same offset is applied to
+ the host and the device base address. */
+ offset = (void *)&m->mappings[0] - s->h;
+
+ *d = (void *)(s->d + offset);
+ *h = (void *)(s->h + offset);
+
+ s->h_prev = s->h_next;
+ s->h_next += size;
+
+ assert (s->h_prev);
+ assert (s->h_next);
+
+ /* Sanity checks: every cursor stays within [h_begin, h_end]. */
+ assert (s->h_next >= s->h_begin);
+ assert (s->h_tail >= s->h_begin);
+ assert (s->h_prev >= s->h_begin);
+ assert (s->h_next <= s->h_end);
+ assert (s->h_tail <= s->h_end);
+ assert (s->h_prev <= s->h_end);
+
+ return;
+}
+
+/* Per-device state: the CUDA context and device handle, the streams that
+ belong to the device, and a set of flags. */
+struct PTX_device
+{
+ CUcontext ctx;
+ bool ctx_shared;
+ CUdevice dev;
+ /* Stream used for synchronous (acc_async_sync) operations. */
+ struct PTX_stream *null_stream;
+ /* All non-null streams associated with this device (actually context),
+ either created implicitly or passed in from the user (via
+ acc_set_cuda_stream). */
+ struct PTX_stream *active_streams;
+ /* Streams indexed by async value; SIZE is the array length. Unused
+ slots are NULL. */
+ struct {
+ struct PTX_stream **arr;
+ int size;
+ } async_streams;
+ /* A lock for use when manipulating the above stream list and array. */
+ gomp_mutex_t stream_lock;
+ /* NOTE(review): the fields below are not used in the visible code;
+ presumably the device ordinal and attributes queried from the driver --
+ confirm where they are set. */
+ int ord;
+ bool overlap;
+ bool map;
+ bool concur;
+ int mode;
+ bool mkern;
+
+ struct PTX_device *next;
+};
+
+/* Kinds of event stored in the TYPE field of struct PTX_event -- presumably
+ memory, kernel, synchronization and async-cleanup events; confirm against
+ the code that creates them. */
+enum PTX_event_type
+{
+ PTX_EVT_MEM,
+ PTX_EVT_KNL,
+ PTX_EVT_SYNC,
+ PTX_EVT_ASYNC_CLEANUP
+};
+
+/* One entry in a singly linked list of events. */
+struct PTX_event
+{
+ CUevent *evt;
+ /* Presumably one of enum PTX_event_type -- confirm. */
+ int type;
+ void *addr;
+ int ord;
+
+ struct PTX_event *next;
+};
+
+static gomp_mutex_t PTX_event_lock;
+static struct PTX_event *PTX_events;
+
+#define _XSTR(s) _STR(s)
+#define _STR(s) #s
+
+/* Names of the CUDA driver entry points this plugin relies on; checked by
+   verify_device_library.  _XSTR expands its argument first, so any macro
+   renaming done by <cuda.h> ends up in the string.  */
+
+static struct _synames
+{
+  char *n;
+} cuSymNames[] =
+{
+  { _XSTR(cuCtxCreate) },
+  { _XSTR(cuCtxDestroy) },
+  { _XSTR(cuCtxGetCurrent) },
+  { _XSTR(cuCtxPushCurrent) },
+  { _XSTR(cuCtxSynchronize) },
+  { _XSTR(cuDeviceGet) },
+  { _XSTR(cuDeviceGetAttribute) },
+  { _XSTR(cuDeviceGetCount) },
+  { _XSTR(cuEventCreate) },
+  { _XSTR(cuEventDestroy) },
+  { _XSTR(cuEventQuery) },
+  { _XSTR(cuEventRecord) },
+  { _XSTR(cuInit) },
+  { _XSTR(cuLaunchKernel) },
+  { _XSTR(cuLinkAddData) },
+  { _XSTR(cuLinkComplete) },
+  { _XSTR(cuLinkCreate) },
+  { _XSTR(cuMemAlloc) },
+  { _XSTR(cuMemAllocHost) },
+  { _XSTR(cuMemcpy) },
+  { _XSTR(cuMemcpyDtoH) },
+  { _XSTR(cuMemcpyDtoHAsync) },
+  { _XSTR(cuMemcpyHtoD) },
+  { _XSTR(cuMemcpyHtoDAsync) },
+  { _XSTR(cuMemFree) },
+  { _XSTR(cuMemFreeHost) },
+  { _XSTR(cuMemGetAddressRange) },
+  { _XSTR(cuMemHostGetDevicePointer) },
+  { _XSTR(cuMemHostRegister) },
+  { _XSTR(cuMemHostUnregister) },
+  { _XSTR(cuModuleGetFunction) },
+  { _XSTR(cuModuleLoadData) },
+  /* Called by select_stream_for_async when a stream is created on
+     demand.  */
+  { _XSTR(cuStreamCreate) },
+  { _XSTR(cuStreamDestroy) },
+  { _XSTR(cuStreamQuery) },
+  { _XSTR(cuStreamSynchronize) },
+  { _XSTR(cuStreamWaitEvent) }
+};
+
+/* Check that libcuda.so can be loaded and exports every symbol listed in
+   cuSymNames.  Returns 0 on success and -1 if the library or any symbol is
+   missing.  The library handle is closed again on every path.  */
+
+static int
+verify_device_library (void)
+{
+  size_t i;
+  int status = 0;
+  void *dh;
+
+  dh = dlopen ("libcuda.so", RTLD_LAZY);
+  if (!dh)
+    return -1;
+
+  for (i = 0; i < ARRAYSIZE (cuSymNames); i++)
+    if (!dlsym (dh, cuSymNames[i].n))
+      {
+        /* Stop at the first missing symbol, but fall through to dlclose
+           so the handle is not leaked.  */
+        status = -1;
+        break;
+      }
+
+  dlclose (dh);
+
+  return status;
+}
+
+/* Return the calling thread's PTX-specific data, as handed out by
+   GOMP_PLUGIN_acc_thread.  */
+
+static inline struct nvptx_thread *
+nvptx_thread (void)
+{
+  void *thread_data = GOMP_PLUGIN_acc_thread ();
+
+  return (struct nvptx_thread *) thread_data;
+}
+
+/* Set up the stream bookkeeping of PTX_DEV: create the null stream with its
+   mapping buffer, start with an empty active-stream list, and allocate an
+   array of CONCURRENCY (at least one) async stream slots, all empty.  */
+
+static void
+init_streams_for_device (struct PTX_device *ptx_dev, int concurrency)
+{
+  int i;
+  struct PTX_stream *null_stream
+    = GOMP_PLUGIN_malloc (sizeof (struct PTX_stream));
+
+  null_stream->stream = NULL;
+  null_stream->host_thread = pthread_self ();
+  null_stream->multithreaded = true;
+  null_stream->d = (CUdeviceptr) NULL;
+  null_stream->h = NULL;
+  /* The null stream is not on the active list; keep its link well defined
+     rather than leaving it as whatever malloc returned.  */
+  null_stream->next = NULL;
+  map_init (null_stream);
+  ptx_dev->null_stream = null_stream;
+
+  ptx_dev->active_streams = NULL;
+  GOMP_PLUGIN_mutex_init (&ptx_dev->stream_lock);
+
+  if (concurrency < 1)
+    concurrency = 1;
+
+  /* This is just a guess -- make space for as many async streams as the
+     current device is capable of concurrently executing.  This can grow
+     later as necessary.  No streams are created yet.  */
+  ptx_dev->async_streams.arr
+    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct PTX_stream *));
+  ptx_dev->async_streams.size = concurrency;
+
+  for (i = 0; i < concurrency; i++)
+    ptx_dev->async_streams.arr[i] = NULL;
+}
+
+/* Tear down the stream bookkeeping of PTX_DEV: free the async slot array,
+   destroy and free every stream on the active list together with its
+   mapping buffer, then release the null stream.  */
+
+static void
+fini_streams_for_device (struct PTX_device *ptx_dev)
+{
+  struct PTX_stream *cur, *following;
+
+  free (ptx_dev->async_streams.arr);
+
+  for (cur = ptx_dev->active_streams; cur != NULL; cur = following)
+    {
+      /* Unlink the stream before it is destroyed.  */
+      following = cur->next;
+      ptx_dev->active_streams = following;
+
+      cuStreamDestroy (cur->stream);
+      map_fini (cur);
+      free (cur);
+    }
+
+  map_fini (ptx_dev->null_stream);
+  free (ptx_dev->null_stream);
+}
+
+/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
+   thread THREAD (and also current device/context).  If CREATE is true, create
+   the stream if it does not exist (or use EXISTING if it is non-NULL), and
+   associate the stream with the same thread argument.  Returns stream to use
+   as result.
+
+   When CREATE is true the device's stream_lock is held while the stream
+   table is consulted or modified, and the result is never NULL.  When CREATE
+   is false no lock is taken, the result is NULL if no stream exists for
+   ASYNC, and it is a fatal error to name a stream that belongs to another
+   host thread unless the stream is already marked multithreaded.  An ASYNC
+   value below acc_async_sync is always a fatal error.  */
+
+static struct PTX_stream *
+select_stream_for_async (int async, pthread_t thread, bool create,
+                         CUstream existing)
+{
+  struct nvptx_thread *nvthd = nvptx_thread ();
+  /* Local copy of TLS variable.  */
+  struct PTX_device *ptx_dev = nvthd->ptx_dev;
+  struct PTX_stream *stream = NULL;
+  /* ASYNC is re-based below; keep the caller's value for diagnostics.  */
+  int orig_async = async;
+
+  /* The special value acc_async_noval (-1) maps (for now) to an
+     implicitly-created stream, which is then handled the same as any other
+     numbered async stream.  Other options are available, e.g. using the null
+     stream for anonymous async operations, or choosing an idle stream from an
+     active set.  But, stick with this for now.  */
+  if (async > acc_async_sync)
+    async++;
+
+  if (create)
+    GOMP_PLUGIN_mutex_lock (&ptx_dev->stream_lock);
+
+  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
+     null stream, and in fact better performance may be obtainable if it doesn't
+     (because the null stream enforces overly-strict synchronisation with
+     respect to other streams for legacy reasons, and that's probably not
+     needed with OpenACC).  Maybe investigate later.  */
+  if (async == acc_async_sync)
+    stream = ptx_dev->null_stream;
+  else if (async >= 0 && async < ptx_dev->async_streams.size
+           && ptx_dev->async_streams.arr[async] && !(create && existing))
+    stream = ptx_dev->async_streams.arr[async];
+  else if (async >= 0 && create)
+    {
+      /* Grow the lookup array (doubling, or more if ASYNC demands it) so
+         that index ASYNC is valid; new slots start out empty.  */
+      if (async >= ptx_dev->async_streams.size)
+        {
+          int i, newsize = ptx_dev->async_streams.size * 2;
+
+          if (async >= newsize)
+            newsize = async + 1;
+
+          ptx_dev->async_streams.arr
+            = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
+                                   newsize * sizeof (struct PTX_stream *));
+
+          for (i = ptx_dev->async_streams.size; i < newsize; i++)
+            ptx_dev->async_streams.arr[i] = NULL;
+
+          ptx_dev->async_streams.size = newsize;
+        }
+
+      /* Create a new stream on-demand if there isn't one already, or if we're
+         setting a particular async value to an existing (externally-provided)
+         stream.  */
+      if (!ptx_dev->async_streams.arr[async] || existing)
+        {
+          CUresult r;
+          struct PTX_stream *s
+            = GOMP_PLUGIN_malloc (sizeof (struct PTX_stream));
+
+          if (existing)
+            s->stream = existing;
+          else
+            {
+              r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
+              if (r != CUDA_SUCCESS)
+                GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuErrorMsg (r));
+            }
+
+          /* If CREATE is true, we're going to be queueing some work on this
+             stream.  Associate it with the current host thread.  */
+          s->host_thread = thread;
+          s->multithreaded = false;
+
+          s->d = (CUdeviceptr) NULL;
+          s->h = NULL;
+          map_init (s);
+
+          /* Publish the stream: head of the active list and table slot.  */
+          s->next = ptx_dev->active_streams;
+          ptx_dev->active_streams = s;
+          ptx_dev->async_streams.arr[async] = s;
+        }
+
+      stream = ptx_dev->async_streams.arr[async];
+    }
+  else if (async < 0)
+    GOMP_PLUGIN_fatal ("bad async %d", async);
+
+  if (create)
+    {
+      assert (stream != NULL);
+
+      /* If we're trying to use the same stream from different threads
+         simultaneously, set stream->multithreaded to true.  This affects the
+         behaviour of acc_async_test_all and acc_wait_all, which are supposed to
+         only wait for asynchronous launches from the same host thread they are
+         invoked on.  If multiple threads use the same async value, we make note
+         of that here and fall back to testing/waiting for all threads in those
+         functions.
+
+         pthread_t is an opaque type, so the comparison must go through
+         pthread_equal rather than the != operator.  */
+      if (!pthread_equal (thread, stream->host_thread))
+        stream->multithreaded = true;
+
+      GOMP_PLUGIN_mutex_unlock (&ptx_dev->stream_lock);
+    }
+  else if (stream && !stream->multithreaded
+           && !pthread_equal (stream->host_thread, thread))
+    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
+
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s using stream %p (CUDA stream %p) "
+           "for async %d\n", __FILE__, __FUNCTION__, stream,
+           stream ? stream->stream : NULL, orig_async);
+#endif
+
+  return stream;
+}
+
+static int PTX_get_num_devices (void);
+
+/* Initialize the device.  The first call checks the device library, calls
+   cuInit, and sets up the (empty) event list together with its lock; later
+   calls skip that work.  Returns -1 if verify_device_library reports
+   failure, otherwise the result of PTX_get_num_devices.  */
+static int
+PTX_init (void)
+{
+  if (!PTX_inited)
+    {
+      CUresult status;
+
+      if (verify_device_library () < 0)
+        return -1;
+
+      status = cuInit (0);
+      if (status != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuInit error: %s", cuErrorMsg (status));
+
+      PTX_events = NULL;
+      GOMP_PLUGIN_mutex_init (&PTX_event_lock);
+
+      PTX_inited = true;
+    }
+
+  return PTX_get_num_devices ();
+}
+
+/* Mark the plugin as uninitialized by clearing PTX_inited.  Always returns
+   0.  */
+static int
+PTX_fini (void)
+{
+ PTX_inited = false;
+
+ return 0;
+}
+
+/* Open CUDA device number N.  Allocates a PTX_device record, attaches it to
+   the calling thread's current CUDA context (creating a context when there
+   is none), caches a handful of device attributes and prepares the device's
+   stream bookkeeping.  Returns the new record; any CUDA failure other than
+   the async-engine-count query is fatal.  */
+static void *
+PTX_open_device (int n)
+{
+  struct PTX_device *dev_state;
+  CUdevice cu_dev;
+  CUresult status;
+  int attr, copy_engines;
+
+  status = cuDeviceGet (&cu_dev, n);
+  if (status != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuErrorMsg (status));
+
+  dev_state = GOMP_PLUGIN_malloc (sizeof (struct PTX_device));
+  dev_state->ord = n;
+  dev_state->dev = cu_dev;
+
+  status = cuCtxGetCurrent (&dev_state->ctx);
+  if (status != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuErrorMsg (status));
+
+  /* A context that already exists is only borrowed (and so must not be
+     destroyed on close); otherwise make one of our own.  */
+  if (dev_state->ctx)
+    dev_state->ctx_shared = true;
+  else
+    {
+      dev_state->ctx_shared = false;
+
+      status = cuCtxCreate (&dev_state->ctx, CU_CTX_SCHED_AUTO, cu_dev);
+      if (status != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuErrorMsg (status));
+    }
+
+  status = cuDeviceGetAttribute (&attr, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP,
+                                 cu_dev);
+  if (status != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (status));
+  dev_state->overlap = attr;
+
+  status = cuDeviceGetAttribute (&attr,
+                                 CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY,
+                                 cu_dev);
+  if (status != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (status));
+  dev_state->map = attr;
+
+  status = cuDeviceGetAttribute (&attr,
+                                 CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS,
+                                 cu_dev);
+  if (status != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (status));
+  dev_state->concur = attr;
+
+  status = cuDeviceGetAttribute (&attr, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,
+                                 cu_dev);
+  if (status != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (status));
+  dev_state->mode = attr;
+
+  /* NOTE(review): the INTEGRATED attribute is what ends up in the mkern
+     field -- confirm that this pairing is intended.  */
+  status = cuDeviceGetAttribute (&attr, CU_DEVICE_ATTRIBUTE_INTEGRATED,
+                                 cu_dev);
+  if (status != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (status));
+  dev_state->mkern = attr;
+
+  /* The engine count only sizes the initial stream table, so a failed
+     query is not fatal: fall back to a single engine.  */
+  status = cuDeviceGetAttribute (&copy_engines,
+                                 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT,
+                                 cu_dev);
+  if (status != CUDA_SUCCESS)
+    copy_engines = 1;
+
+  init_streams_for_device (dev_state, copy_engines);
+
+  return (void *) dev_state;
+}
+
+/* Close the device described by TARG_DATA (a struct PTX_device *, possibly
+   NULL).  Tears down the device's streams, destroys the CUDA context unless
+   it was borrowed from the caller, and frees the record.  Always returns
+   0.  */
+static int
+PTX_close_device (void *targ_data)
+{
+  struct PTX_device *dev_state = targ_data;
+
+  if (dev_state != NULL)
+    {
+      fini_streams_for_device (dev_state);
+
+      /* Only a context created by PTX_open_device is ours to destroy.  */
+      if (!dev_state->ctx_shared)
+        {
+          CUresult status = cuCtxDestroy (dev_state->ctx);
+
+          if (status != CUDA_SUCCESS)
+            GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuErrorMsg (status));
+        }
+
+      free (dev_state);
+    }
+
+  return 0;
+}
+
+/* Return the number of CUDA devices, or 0 when the CUDA driver cannot be
+   initialized (for instance because no device or driver is present).  A
+   failure of cuDeviceGetCount itself is fatal.  */
+static int
+PTX_get_num_devices (void)
+{
+  int n;
+  CUresult r;
+
+  /* This function will be called before the plugin has been initialized in
+     order to enumerate available devices, but CUDA API routines can't be used
+     until cuInit has been called.  Just call it now (but don't yet do any
+     further initialization).  */
+  if (!PTX_inited)
+    {
+      r = cuInit (0);
+      /* Merely asking how many devices there are must not abort the
+         program: if the driver cannot be brought up, report that no
+         devices are usable instead of falling through to a misleading
+         cuDeviceGetCount failure.  */
+      if (r != CUDA_SUCCESS)
+        return 0;
+    }
+
+  r = cuDeviceGetCount (&n);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuErrorMsg (r));
+
+  return n;
+}
+
+/* Return true when initializing the plugin reports at least one device.  */
+static bool
+PTX_avail(void)
+{
+  return PTX_init () > 0;
+}
+
+/* JIT-link PTX_CODE together with the plugin's built-in PTX support code
+   (ABORT_PTX, ACC_ON_DEVICE_PTX and GOACC_INTERNAL_PTX) and load the result
+   into *MODULE.  Any CUDA failure is fatal; for link failures the linker's
+   error log is reported first.  */
+static void
+link_ptx (CUmodule *module, char *ptx_code)
+{
+  CUjit_option opts[7];
+  void *optvals[7];
+  float elapsed = 0.0;
+#define LOGSIZE 8192
+  char elog[LOGSIZE];
+  char ilog[LOGSIZE];
+  unsigned long logsize = LOGSIZE;
+  CUlinkState linkstate;
+  CUresult r;
+  void *linkout;
+  size_t linkoutsize __attribute__((unused));
+
+  /* The logs are printed with %s below; make sure they are valid (empty)
+     strings even if the linker never writes to them.  */
+  elog[0] = '\0';
+  ilog[0] = '\0';
+
+  GOMP_PLUGIN_notify ("attempting to load:\n---\n%s\n---\n", ptx_code);
+
+  opts[0] = CU_JIT_WALL_TIME;
+  optvals[0] = &elapsed;
+
+  opts[1] = CU_JIT_INFO_LOG_BUFFER;
+  optvals[1] = &ilog[0];
+
+  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
+  optvals[2] = (void *) logsize;
+
+  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
+  optvals[3] = &elog[0];
+
+  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
+  optvals[4] = (void *) logsize;
+
+  opts[5] = CU_JIT_LOG_VERBOSE;
+  optvals[5] = (void *) 1;
+
+  opts[6] = CU_JIT_TARGET;
+  optvals[6] = (void *) CU_TARGET_COMPUTE_30;
+
+  r = cuLinkCreate (7, opts, optvals, &linkstate);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuErrorMsg (r));
+
+  char *abort_ptx = ABORT_PTX;
+  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
+                     strlen (abort_ptx) + 1, 0, 0, 0, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
+      GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuErrorMsg (r));
+    }
+
+  char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
+  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
+                     strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
+      GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
+                         cuErrorMsg (r));
+    }
+
+  char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
+  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
+                     strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
+      GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
+                         cuErrorMsg (r));
+    }
+
+  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code,
+                     strlen (ptx_code) + 1, 0, 0, 0, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
+      GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuErrorMsg (r));
+    }
+
+  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
+  if (r != CUDA_SUCCESS)
+    {
+      /* Report the linker's own diagnostics here too, as for the
+         cuLinkAddData failures above.  */
+      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
+      GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuErrorMsg (r));
+    }
+
+  GOMP_PLUGIN_notify ("Link complete: %fms\n", elapsed);
+  GOMP_PLUGIN_notify ("Link log %s\n", &ilog[0]);
+
+  r = cuModuleLoadData (module, linkout);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuErrorMsg (r));
+
+  /* LINKOUT is owned by the link state, so the state can only be released
+     once the image has been loaded.  Without this the linker state leaks
+     on every call.  */
+  r = cuLinkDestroy (linkstate);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuLinkDestroy error: %s", cuErrorMsg (r));
+}
+
+/* Garbage-collect completed events recorded for the current thread's
+   device.  For every event on the global list that belongs to this device
+   and that cuEventQuery reports as complete, run the type-specific cleanup,
+   destroy the CUDA event and unlink and free the list node.
+
+   MEMMAP_LOCKABLE says whether it is safe to call
+   GOMP_PLUGIN_async_unmap_vars from here; when it is false,
+   PTX_EVT_ASYNC_CLEANUP events are left on the list for a later call.  */
+static void
+event_gc (bool memmap_lockable)
+{
+  struct PTX_event *ptx_event;
+  struct nvptx_thread *nvthd = nvptx_thread ();
+
+  GOMP_PLUGIN_mutex_lock (&PTX_event_lock);
+
+  /* Only read the list head once the lock is held: otherwise another
+     thread running event_gc could unlink and free that node before we get
+     to look at it.  */
+  ptx_event = PTX_events;
+
+  while (ptx_event != NULL)
+    {
+      CUresult r;
+      struct PTX_event *e = ptx_event;
+
+      /* Advance first, since E may be freed below.  */
+      ptx_event = ptx_event->next;
+
+      if (e->ord != nvthd->ptx_dev->ord)
+        continue;
+
+      r = cuEventQuery (*e->evt);
+      if (r == CUDA_SUCCESS)
+        {
+          CUevent *te;
+
+          te = e->evt;
+
+          switch (e->type)
+            {
+            case PTX_EVT_MEM:
+            case PTX_EVT_SYNC:
+              break;
+
+            case PTX_EVT_KNL:
+              map_pop (e->addr);
+              break;
+
+            case PTX_EVT_ASYNC_CLEANUP:
+              {
+                /* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
+                   memory-map splay tree lock for the current device, so we
+                   can't call it when one of our callers has already claimed
+                   the lock.  In that case, just delay the GC for this event
+                   until later.  (This `continue' resumes the enclosing while
+                   loop, leaving the event on the list.)  */
+                if (!memmap_lockable)
+                  continue;
+
+                GOMP_PLUGIN_async_unmap_vars (e->addr);
+              }
+              break;
+            }
+
+          cuEventDestroy (*te);
+          free ((void *)te);
+
+          /* Unlink E from the singly-linked list.  */
+          if (PTX_events == e)
+            PTX_events = PTX_events->next;
+          else
+            {
+              struct PTX_event *e_ = PTX_events;
+              while (e_->next != e)
+                e_ = e_->next;
+              e_->next = e_->next->next;
+            }
+
+          free (e);
+        }
+    }
+
+  GOMP_PLUGIN_mutex_unlock (&PTX_event_lock);
+}
+
+/* Record a new event of kind TYPE on the global event list, tagged with the
+   current thread's device ordinal.  E is the CUDA event to poll and H is
+   the type-specific payload stored in the node's addr field.  */
+static void
+event_add (enum PTX_event_type type, CUevent *e, void *h)
+{
+  struct nvptx_thread *nvthd = nvptx_thread ();
+  struct PTX_event *node;
+
+  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
+          || type == PTX_EVT_ASYNC_CLEANUP);
+
+  /* Fill in the node completely before making it visible.  */
+  node = GOMP_PLUGIN_malloc (sizeof (struct PTX_event));
+  node->type = type;
+  node->evt = e;
+  node->addr = h;
+  node->ord = nvthd->ptx_dev->ord;
+
+  /* Push onto the head of the list under the event lock.  */
+  GOMP_PLUGIN_mutex_lock (&PTX_event_lock);
+  node->next = PTX_events;
+  PTX_events = node;
+  GOMP_PLUGIN_mutex_unlock (&PTX_event_lock);
+}
+
+/* Launch the offloaded kernel described by FN (a struct targ_fn_descriptor)
+   on the stream selected for ASYNC.  The MAPNUM device addresses in
+   DEVADDRS are copied into a mapped argument block whose device address is
+   passed as the kernel's single argument.  The launch uses NUM_GANGS blocks
+   of VECTOR_LENGTH threads.  HOSTADDRS, SIZES, KINDS, NUM_WORKERS and
+   TARG_MEM_DESC are not used in this function.  Every CUDA failure is
+   fatal.  */
+void
+PTX_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
+ size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers,
+ int vector_length, int async, void *targ_mem_desc)
+{
+ struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
+ CUfunction function;
+ CUresult r;
+ int i;
+ struct PTX_stream *dev_str;
+ void *kargs[1];
+ void *hp, *dp;
+ unsigned int nthreads_in_block;
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ function = targ_fn->fn;
+
+ /* Look the stream up without creating it; it must be the one the thread
+ already has current.  */
+ dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
+ assert (dev_str == nvthd->current_stream);
+
+ /* This reserves a chunk of a pre-allocated page of memory mapped on both
+ the host and the device. HP is a host pointer to the new chunk, and DP is
+ the corresponding device pointer. */
+ map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
+
+ GOMP_PLUGIN_notify (" %s: prepare mappings\n", __FUNCTION__);
+
+ /* Copy the array of arguments to the mapped page. */
+ for (i = 0; i < mapnum; i++)
+ ((void **) hp)[i] = devaddrs[i];
+
+ /* Copy the (device) pointers to arguments to the device (dp and hp might in
+ fact have the same value on a unified-memory system). */
+ r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuErrorMsg (r));
+
+ GOMP_PLUGIN_notify (" %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name);
+
+ // XXX: possible geometry mappings??
+ //
+ // OpenACC CUDA
+ //
+ // num_gangs blocks
+ // num_workers warps (where a warp is equivalent to 32 threads)
+ // vector length threads
+ //
+
+ /* The openacc vector_length clause 'determines the vector length to use for
+ vector or SIMD operations'. The question is how to map this to CUDA.
+
+ In CUDA, the warp size is the vector length of a CUDA device. However, the
+ CUDA interface abstracts away from that, and only shows us warp size
+ indirectly in maximum number of threads per block, which is a product of
+ warp size and the number of hyperthreads of a multiprocessor.
+
+ We choose to map openacc vector_length directly onto the number of threads
+ in a block, in the x dimension. This is reflected in gcc code generation
+ that uses ThreadIdx.x to access vector elements.
+
+ Attempting to use an openacc vector_length of more than the maximum number
+ of threads per block will result in a cuda error. */
+ nthreads_in_block = vector_length;
+
+ /* The kernel receives one argument: the device address of the block.  */
+ kargs[0] = &dp;
+ r = cuLaunchKernel (function,
+ num_gangs, 1, 1,
+ nthreads_in_block, 1, 1,
+ 0, dev_str->stream, kargs, 0);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuErrorMsg (r));
+
+#ifndef DISABLE_ASYNC
+ /* Values below acc_async_noval are waited for right here; anything else
+ records a PTX_EVT_KNL event so that event_gc can pop the argument block
+ once the kernel has finished.  */
+ if (async < acc_async_noval)
+ {
+ r = cuStreamSynchronize (dev_str->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r));
+ }
+ else
+ {
+ CUevent *e;
+
+ e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
+
+ r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
+
+ event_gc (true);
+
+ r = cuEventRecord (*e, dev_str->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
+
+ event_add (PTX_EVT_KNL, e, (void *)dev_str);
+ }
+#else
+ r = cuCtxSynchronize ();
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuErrorMsg (r));
+#endif
+
+ GOMP_PLUGIN_notify (" %s: kernel %s: finished\n", __FUNCTION__,
+ targ_fn->name);
+
+ /* With DISABLE_ASYNC defined the pop below is unconditional; otherwise it
+ only happens on the path that synchronized above.  */
+#ifndef DISABLE_ASYNC
+ if (async < acc_async_noval)
+#endif
+ map_pop (dev_str);
+}
+
+void * openacc_get_current_cuda_context (void);
+
+/* Allocate S bytes of device memory.  Returns the device address, or 0 if
+   the device is out of memory; any other CUDA failure is fatal.  */
+static void *
+PTX_alloc (size_t s)
+{
+  CUdeviceptr devptr;
+  CUresult status = cuMemAlloc (&devptr, s);
+
+  /* Running out of device memory is reported to the caller rather than
+     treated as fatal.  */
+  if (status == CUDA_ERROR_OUT_OF_MEMORY)
+    return 0;
+  else if (status != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuErrorMsg (status));
+
+  return (void *)devptr;
+}
+
+static void
+PTX_free (void *p)
+{
+ CUresult r;
+ CUdeviceptr pb;
+ size_t ps;
+
+ r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r));
+
+ if ((CUdeviceptr)p != pb)
+ GOMP_PLUGIN_fatal ("invalid device address");
+
+ r = cuMemFree ((CUdeviceptr)p);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuErrorMsg (r));
+}
+
+/* Copy S bytes from host address H to device address D.  A zero-length
+   copy is a no-op.  The arguments are validated against the device
+   allocation containing D, and any violation or CUDA failure is fatal.
+   Unless DISABLE_ASYNC is defined, the copy is queued asynchronously (and
+   tracked with a PTX_EVT_MEM event) when the thread's current stream is not
+   the null stream; otherwise it is performed synchronously.  Always
+   returns 0.  */
+static void *
+PTX_host2dev (void *d, const void *h, size_t s)
+{
+  CUresult status;
+  CUdeviceptr base;
+  size_t extent;
+  struct nvptx_thread *nvthd = nvptx_thread ();
+
+  if (s == 0)
+    return 0;
+
+  if (d == NULL)
+    GOMP_PLUGIN_fatal ("invalid device address");
+
+  status = cuMemGetAddressRange (&base, &extent, (CUdeviceptr)d);
+  if (status != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (status));
+
+  if (!base)
+    GOMP_PLUGIN_fatal ("invalid device address");
+
+  if (h == NULL)
+    GOMP_PLUGIN_fatal ("invalid host address");
+
+  if (d == h)
+    GOMP_PLUGIN_fatal ("invalid host or device address");
+
+  /* The destination range must end inside the device allocation.  */
+  if ((void *)(d + s) > (void *)(base + extent))
+    GOMP_PLUGIN_fatal ("invalid size");
+
+#ifndef DISABLE_ASYNC
+  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
+    {
+      CUevent *evt = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
+
+      status = cuEventCreate (evt, CU_EVENT_DISABLE_TIMING);
+      if (status != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (status));
+
+      event_gc (false);
+
+      status = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
+                                  nvthd->current_stream->stream);
+      if (status != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuErrorMsg (status));
+
+      /* The event completes once the copy queued just above has.  */
+      status = cuEventRecord (*evt, nvthd->current_stream->stream);
+      if (status != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (status));
+
+      event_add (PTX_EVT_MEM, evt, (void *)h);
+    }
+  else
+#endif
+    {
+      status = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
+      if (status != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuErrorMsg (status));
+    }
+
+  return 0;
+}
+
+/* Copy S bytes from device address D to host address H.  A zero-length
+   copy is a no-op.  The arguments are validated against the device
+   allocation containing D, and any violation or CUDA failure is fatal.
+   Unless DISABLE_ASYNC is defined, the copy is queued asynchronously (and
+   tracked with a PTX_EVT_MEM event) when the thread's current stream is not
+   the null stream; otherwise it is performed synchronously.  Always
+   returns 0.  */
+static void *
+PTX_dev2host (void *h, const void *d, size_t s)
+{
+  CUresult r;
+  CUdeviceptr pb;
+  size_t ps;
+  struct nvptx_thread *nvthd = nvptx_thread ();
+
+  if (!s)
+    return 0;
+
+  if (!d)
+    GOMP_PLUGIN_fatal ("invalid device address");
+
+  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r));
+
+  if (!pb)
+    GOMP_PLUGIN_fatal ("invalid device address");
+
+  if (!h)
+    GOMP_PLUGIN_fatal ("invalid host address");
+
+  if (d == h)
+    GOMP_PLUGIN_fatal ("invalid host or device address");
+
+  /* The source range must end inside the device allocation.  */
+  if ((void *)(d + s) > (void *)(pb + ps))
+    GOMP_PLUGIN_fatal ("invalid size");
+
+#ifndef DISABLE_ASYNC
+  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
+    {
+      CUevent *e;
+
+      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
+
+      /* The message carries no trailing newline, matching PTX_host2dev and
+         every other fatal diagnostic in this file.  */
+      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+      if (r != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
+
+      event_gc (false);
+
+      r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
+                             nvthd->current_stream->stream);
+      if (r != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuErrorMsg (r));
+
+      /* The event completes once the copy queued just above has.  */
+      r = cuEventRecord (*e, nvthd->current_stream->stream);
+      if (r != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
+
+      event_add (PTX_EVT_MEM, e, (void *)h);
+    }
+  else
+#endif
+    {
+      r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
+      if (r != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuErrorMsg (r));
+    }
+
+  return 0;
+}
+
+/* Make the stream selected for ASYNC the calling thread's current stream.
+   The lookup is done with CREATE true, so the stream is created if it does
+   not exist yet.  */
+static void
+PTX_set_async (int async)
+{
+ struct nvptx_thread *nvthd = nvptx_thread ();
+ nvthd->current_stream
+ = select_stream_for_async (async, pthread_self (), true, NULL);
+}
+
+/* Test whether all work queued on the stream for ASYNC has completed.
+   Returns 1 if so (after garbage-collecting finished events) and 0 if work
+   is still pending.  An unknown ASYNC or any other cuStreamQuery result is
+   fatal.  */
+static int
+PTX_async_test (int async)
+{
+  struct PTX_stream *target
+    = select_stream_for_async (async, pthread_self (), false, NULL);
+  CUresult status;
+
+  if (target == NULL)
+    GOMP_PLUGIN_fatal ("unknown async %d", async);
+
+  status = cuStreamQuery (target->stream);
+  switch (status)
+    {
+    case CUDA_SUCCESS:
+      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
+         whether all work has completed on this stream, and if so omits the
+         call to the wait hook.  Were event_gc not called here, it might not
+         get called at all in that case (which would prevent variables from
+         getting unmapped and their associated device storage freed).  */
+      event_gc (true);
+      return 1;
+
+    case CUDA_ERROR_NOT_READY:
+      return 0;
+
+    default:
+      break;
+    }
+
+  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuErrorMsg (status));
+
+  return 0;
+}
+
+/* Test whether every active stream that belongs to the calling thread (or
+   that is shared between threads) has completed all of its work.  Returns 0
+   as soon as one such stream is still busy; otherwise garbage-collects
+   finished events and returns 1.  A cuStreamQuery failure other than
+   CUDA_ERROR_NOT_READY is fatal, as in PTX_async_test and PTX_wait_all.  */
+static int
+PTX_async_test_all (void)
+{
+  struct PTX_stream *s;
+  pthread_t self = pthread_self ();
+  struct nvptx_thread *nvthd = nvptx_thread ();
+
+  GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock);
+
+  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
+    {
+      CUresult r;
+
+      /* Streams owned exclusively by another thread are not our concern.  */
+      if (!s->multithreaded && !pthread_equal (s->host_thread, self))
+        continue;
+
+      r = cuStreamQuery (s->stream);
+      if (r == CUDA_ERROR_NOT_READY)
+        {
+          GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
+          return 0;
+        }
+      else if (r != CUDA_SUCCESS)
+        /* Do not mistake a failed query for "all work complete".  */
+        GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuErrorMsg (r));
+    }
+
+  GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
+
+  event_gc (true);
+
+  return 1;
+}
+
+/* Block until all work queued on the stream for ASYNC has completed, then
+   garbage-collect finished events.  An unknown ASYNC or a failed
+   synchronization is fatal.  */
+static void
+PTX_wait (int async)
+{
+  struct PTX_stream *target
+    = select_stream_for_async (async, pthread_self (), false, NULL);
+  CUresult status;
+
+  if (target == NULL)
+    GOMP_PLUGIN_fatal ("unknown async %d", async);
+
+  status = cuStreamSynchronize (target->stream);
+  if (status != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (status));
+
+  event_gc (true);
+}
+
+/* Make the stream for ASYNC2 wait for the work currently queued on the
+   stream for ASYNC1.  An event is recorded on the ASYNC1 stream and the
+   ASYNC2 stream is told to wait for it; the event is registered as
+   PTX_EVT_SYNC so that event_gc can reclaim it later.  It is fatal if
+   ASYNC1 names no existing stream, if both values select the same stream,
+   or if any CUDA call fails.  */
+static void
+PTX_wait_async (int async1, int async2)
+{
+  CUresult r;
+  CUevent *e;
+  struct PTX_stream *s1, *s2;
+  pthread_t self = pthread_self ();
+
+  /* The stream that is waiting (rather than being waited for) doesn't
+     necessarily have to exist already.  */
+  s2 = select_stream_for_async (async2, self, true, NULL);
+
+  /* No trailing newline in the message, matching the other fatal
+     diagnostics in this file.  */
+  s1 = select_stream_for_async (async1, self, false, NULL);
+  if (!s1)
+    GOMP_PLUGIN_fatal ("invalid async 1");
+
+  if (s1 == s2)
+    GOMP_PLUGIN_fatal ("identical parameters");
+
+  e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
+
+  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
+
+  event_gc (true);
+
+  r = cuEventRecord (*e, s1->stream);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
+
+  event_add (PTX_EVT_SYNC, e, NULL);
+
+  r = cuStreamWaitEvent (s2->stream, *e, 0);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuErrorMsg (r));
+}
+
+/* Block until every active stream that belongs to the calling thread (or
+   that is shared between threads) has finished its work, then
+   garbage-collect finished events.  CUDA failures are fatal.  */
+static void
+PTX_wait_all (void)
+{
+  struct nvptx_thread *nvthd = nvptx_thread ();
+  pthread_t self = pthread_self ();
+  struct PTX_stream *cur;
+
+  GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock);
+
+  for (cur = nvthd->ptx_dev->active_streams; cur != NULL; cur = cur->next)
+    {
+      CUresult status;
+
+      /* Skip streams owned exclusively by some other host thread.  */
+      if (!cur->multithreaded && !pthread_equal (cur->host_thread, self))
+        continue;
+
+      /* Nothing to wait for if the stream is already idle.  */
+      status = cuStreamQuery (cur->stream);
+      if (status == CUDA_SUCCESS)
+        continue;
+
+      if (status != CUDA_ERROR_NOT_READY)
+        GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuErrorMsg (status));
+
+      status = cuStreamSynchronize (cur->stream);
+      if (status != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s",
+                           cuErrorMsg (status));
+    }
+
+  GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
+
+  event_gc (true);
+}
+
+/* Make the stream for ASYNC wait for the work currently queued on every
+   active stream that belongs to the calling thread (or that is shared
+   between threads).  Nothing is done when ASYNC selects the null stream.
+   For each waited-for stream an event is recorded on it, registered as
+   PTX_EVT_SYNC and waited for by the ASYNC stream.  CUDA failures are
+   fatal.  */
+static void
+PTX_wait_all_async (int async)
+{
+  struct nvptx_thread *nvthd = nvptx_thread ();
+  pthread_t self = pthread_self ();
+  struct PTX_stream *waiter, *src;
+
+  /* The stream doing the waiting.  This could be the first mention of the
+     stream, so create it if necessary.  */
+  waiter = select_stream_for_async (async, self, true, NULL);
+
+  /* Launches on the null stream already block on other streams in the
+     context.  */
+  if (waiter == NULL || waiter == nvthd->ptx_dev->null_stream)
+    return;
+
+  event_gc (true);
+
+  GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock);
+
+  for (src = nvthd->ptx_dev->active_streams; src != NULL; src = src->next)
+    if (src->multithreaded || pthread_equal (src->host_thread, self))
+      {
+        CUresult status;
+        CUevent *evt = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
+
+        status = cuEventCreate (evt, CU_EVENT_DISABLE_TIMING);
+        if (status != CUDA_SUCCESS)
+          GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (status));
+
+        /* Record an event on the waited-for stream.  */
+        status = cuEventRecord (*evt, src->stream);
+        if (status != CUDA_SUCCESS)
+          GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (status));
+
+        event_add (PTX_EVT_SYNC, evt, NULL);
+
+        status = cuStreamWaitEvent (waiter->stream, *evt, 0);
+        if (status != CUDA_SUCCESS)
+          GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s",
+                             cuErrorMsg (status));
+      }
+
+  GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
+}
+
+/* Return a pointer to the CUdevice handle of the calling thread's device,
+   or NULL if the thread has no plugin data or no device yet.  */
+static void *
+PTX_get_current_cuda_device (void)
+{
+  struct nvptx_thread *nvthd = nvptx_thread ();
+
+  if (nvthd && nvthd->ptx_dev)
+    return &nvthd->ptx_dev->dev;
+
+  return NULL;
+}
+
+/* Return the CUDA context of the calling thread's device, or NULL if the
+   thread has no plugin data or no device yet.  */
+static void *
+PTX_get_current_cuda_context (void)
+{
+  struct nvptx_thread *nvthd = nvptx_thread ();
+
+  if (nvthd && nvthd->ptx_dev)
+    return nvthd->ptx_dev->ctx;
+
+  return NULL;
+}
+
+/* Return the CUDA stream handle associated with ASYNC for the calling
+   thread, or NULL if the thread has no device yet or no stream exists for
+   ASYNC.  The stream is looked up but never created.  */
+static void *
+PTX_get_cuda_stream (int async)
+{
+  struct nvptx_thread *nvthd = nvptx_thread ();
+  struct PTX_stream *found;
+
+  if (!nvthd || !nvthd->ptx_dev)
+    return NULL;
+
+  found = select_stream_for_async (async, pthread_self (), false, NULL);
+  if (found == NULL)
+    return NULL;
+
+  return found->stream;
+}
+
+/* Associate the externally-provided CUDA stream STREAM with the async value
+   ASYNC.  Any stream previously associated with ASYNC is unlinked from the
+   device's active list, destroyed and freed; STREAM is then registered via
+   select_stream_for_async with CREATE true.  A negative ASYNC is fatal.
+   Always returns 1.  */
+static int
+PTX_set_cuda_stream (int async, void *stream)
+{
+ struct PTX_stream *oldstream;
+ pthread_t self = pthread_self ();
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock);
+
+ if (async < 0)
+ GOMP_PLUGIN_fatal ("bad async %d", async);
+
+ /* We have a list of active streams and an array mapping async values to
+ entries of that list. We need to take "ownership" of the passed-in stream,
+ and add it to our list, removing the previous entry also (if there was one)
+ in order to prevent resource leaks. Note the potential for surprise
+ here: maybe we should keep track of passed-in streams and leave it up to
+ the user to tidy those up, but that doesn't work for stream handles
+ returned from acc_get_cuda_stream above... */
+
+ /* Looked up with CREATE false, so this call does not itself try to take
+ the stream_lock that is already held here.  */
+ oldstream = select_stream_for_async (async, self, false, NULL);
+
+ if (oldstream)
+ {
+ /* Unlink OLDSTREAM from the singly-linked active list.  */
+ if (nvthd->ptx_dev->active_streams == oldstream)
+ nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
+ else
+ {
+ struct PTX_stream *s = nvthd->ptx_dev->active_streams;
+ while (s->next != oldstream)
+ s = s->next;
+ s->next = s->next->next;
+ }
+
+ cuStreamDestroy (oldstream->stream);
+ map_fini (oldstream);
+ free (oldstream);
+ }
+
+ /* NOTE(review): between this unlock and the re-registration below, the
+ async_streams.arr slot for ASYNC appears to still refer to the freed
+ OLDSTREAM, and a thread whose current_stream is OLDSTREAM would keep a
+ stale pointer -- confirm that no caller can observe either.  */
+ GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
+
+ (void) select_stream_for_async (async, self, true, (CUstream) stream);
+
+ return 1;
+}
+
+/* Plugin entry points. */
+
+
+/* Plugin hook: return this plugin's target type identifier,
+   TARGET_TYPE_NVIDIA_PTX.  */
+int
+GOMP_OFFLOAD_get_type (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+ return TARGET_TYPE_NVIDIA_PTX;
+}
+
+/* Plugin hook: return this plugin's capability flags; only
+   TARGET_CAP_OPENACC_200 is reported.  */
+unsigned int
+GOMP_OFFLOAD_get_caps (void)
+{
+ return TARGET_CAP_OPENACC_200;
+}
+
+/* Plugin hook: return the name of this plugin's device type, the constant
+   string "nvidia".  */
+const char *
+GOMP_OFFLOAD_get_name (void)
+{
+ return "nvidia";
+}
+
+/* Plugin hook: return the device count reported by PTX_get_num_devices.  */
+int
+GOMP_OFFLOAD_get_num_devices (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+ return PTX_get_num_devices ();
+}
+
+/* The most recently registered offload image: the target-side data and the
+   host-side table handed to GOMP_OFFLOAD_register_image.  Both are read
+   back by GOMP_OFFLOAD_get_table.  */
+static void **kernel_target_data;
+static void **kernel_host_table;
+
+/* Plugin hook: remember HOST_TABLE and TARGET_DATA for later use.  Only the
+   pointers are stored (nothing is copied), and a later call overwrites the
+   values stored by an earlier one.  */
+void
+GOMP_OFFLOAD_register_image (void *host_table, void *target_data)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%p, %p)\n", __FILE__, __FUNCTION__,
+ host_table, target_data);
+#endif
+
+ kernel_target_data = target_data;
+ kernel_host_table = host_table;
+}
+
+/* Plugin hook: initialize the plugin; returns the result of PTX_init.  */
+int
+GOMP_OFFLOAD_init_device (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+ return PTX_init ();
+}
+
+/* Plugin hook: finalize the plugin; returns the result of PTX_fini.  */
+int
+GOMP_OFFLOAD_fini_device (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+ return PTX_fini ();
+}
+
+/* Plugin hook: build the host-to-target function mapping table for the
+   registered image.  The image's PTX is linked and loaded, each kernel is
+   looked up by name, and *TABLEP is set to a newly allocated array with one
+   entry per kernel.  Returns the number of entries, or 0 if the plugin
+   could not be initialized or no image has been registered (in which case
+   *TABLEP is left untouched).  */
+int
+GOMP_OFFLOAD_get_table (struct mapping_table **tablep)
+{
+  CUmodule module;
+  void **host_fns;
+  char **kernel_names;
+  int count, idx;
+  struct mapping_table *table;
+  struct targ_fn_descriptor *descs;
+
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__,
+           tablep);
+#endif
+
+  if (PTX_init () <= 0)
+    return 0;
+
+  /* This isn't an error, because an image may legitimately have no offloaded
+     regions and so will not call GOMP_offload_register.  */
+  if (kernel_target_data == NULL)
+    return 0;
+
+  link_ptx (&module, kernel_target_data[0]);
+
+  /* kernel_target_data[0] -> ptx code
+     kernel_target_data[1] -> variable mappings
+     kernel_target_data[2] -> array of kernel names in ascii
+
+     kernel_host_table[0] -> start of function addresses (_omp_func_table)
+     kernel_host_table[1] -> end of function addresses (_omp_funcs_end)
+
+     The array of kernel names and the functions addresses form a
+     one-to-one correspondence.  */
+
+  host_fns = kernel_host_table[0];
+  kernel_names = (char **) kernel_target_data[2];
+  count = (kernel_host_table[1] - kernel_host_table[0]) / sizeof (void *);
+
+  table = GOMP_PLUGIN_malloc (sizeof (struct mapping_table) * count);
+  *tablep = table;
+  descs = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor) * count);
+
+  for (idx = 0; idx < count; idx++)
+    {
+      CUfunction function;
+      CUresult status
+        = cuModuleGetFunction (&function, module, kernel_names[idx]);
+
+      if (status != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s",
+                           cuErrorMsg (status));
+
+      descs[idx].fn = function;
+      descs[idx].name = (const char *) kernel_names[idx];
+
+      /* Each side of a mapping is recorded as a one-unit-long range.  */
+      table[idx].host_start = (uintptr_t) host_fns[idx];
+      table[idx].host_end = table[idx].host_start + 1;
+      table[idx].tgt_start = (uintptr_t) &descs[idx];
+      table[idx].tgt_end = table[idx].tgt_start + 1;
+    }
+
+  return count;
+}
+
+/* Plugin hook: allocate SIZE bytes of device memory via PTX_alloc and
+   return its result.  */
+void *
+GOMP_OFFLOAD_alloc (size_t size)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%zu)\n", __FILE__, __FUNCTION__,
+ size);
+#endif
+
+ return PTX_alloc (size);
+}
+
+/* Plugin hook: release the device memory at PTR via PTX_free.  */
+void
+GOMP_OFFLOAD_free (void *ptr)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__, ptr);
+#endif
+
+ PTX_free (ptr);
+}
+
+/* Plugin hook: copy N bytes from device address SRC to host address DST
+   via PTX_dev2host and return its result.  */
+void *
+GOMP_OFFLOAD_dev2host (void *dst, const void *src, size_t n)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%p, %p, %zu)\n", __FILE__,
+ __FUNCTION__, dst,
+ src, n);
+#endif
+
+ return PTX_dev2host (dst, src, n);
+}
+
+/* Plugin hook: copy N bytes from host address SRC to device address DST
+   via PTX_host2dev and return its result.  */
+void *
+GOMP_OFFLOAD_host2dev (void *dst, const void *src, size_t n)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%p, %p, %zu)\n", __FILE__,
+ __FUNCTION__, dst, src, n);
+#endif
+
+ return PTX_host2dev (dst, src, n);
+}
+
+void (*device_run) (void *fn_ptr, void *vars) = NULL;
+
+/* Plugin hook: run an OpenACC parallel region.  All arguments are forwarded
+   unchanged to PTX_exec.  Note that the DEBUG trace below prints every
+   argument except DEVADDRS.  */
+void
+GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
+ void **hostaddrs, void **devaddrs, size_t *sizes,
+ unsigned short *kinds, int num_gangs,
+ int num_workers, int vector_length, int async,
+ void *targ_mem_desc)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%p, %zu, %p, %p, %p, %d, %d, %d, "
+ "%d, %p)\n", __FILE__, __FUNCTION__, fn, mapnum, hostaddrs, sizes,
+ kinds, num_gangs, num_workers, vector_length, async, targ_mem_desc);
+#endif
+
+ PTX_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
+ num_workers, vector_length, async, targ_mem_desc);
+}
+
+/* Plugin hook: open device number N and return the handle produced by
+   PTX_open_device.  */
+void *
+GOMP_OFFLOAD_openacc_open_device (int n)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__, n);
+#endif
+ return PTX_open_device (n);
+}
+
+/* Plugin hook: close the device whose handle is H and return the result of
+   PTX_close_device.  */
+int
+GOMP_OFFLOAD_openacc_close_device (void *h)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__, h);
+#endif
+ return PTX_close_device (h);
+}
+
+/* Plugin hook: select device number N (which must be non-negative) for the
+   calling thread.  The device is opened unless the thread already has
+   device N.  */
+void
+GOMP_OFFLOAD_openacc_set_device_num (int n)
+{
+  struct nvptx_thread *nvthd = nvptx_thread ();
+
+  assert (n >= 0);
+
+  /* The thread may not have any plugin data yet; treat that like "no
+     device open" rather than dereferencing a null pointer, mirroring the
+     check in GOMP_OFFLOAD_openacc_get_device_num.
+     NOTE(review): the handle returned by PTX_open_device is discarded
+     here -- confirm that this is intended.  */
+  if (!nvthd || !nvthd->ptx_dev || nvthd->ptx_dev->ord != n)
+    (void) PTX_open_device (n);
+}
+
+/* Plugin hook: return the ordinal of the calling thread's device.
+
+   This may be invoked before the device has been "opened" for the current
+   thread, when there is no way of telling which device number to report.
+   Opening the device here is not wanted, so -1 is returned instead and the
+   caller (oacc-init.c:acc_get_device_num) is left to deal with it.  */
+
+int
+GOMP_OFFLOAD_openacc_get_device_num (void)
+{
+  struct nvptx_thread *nvthd = nvptx_thread ();
+
+  if (!nvthd || !nvthd->ptx_dev)
+    return -1;
+
+  return nvthd->ptx_dev->ord;
+}
+
+/* Plugin hook: report whether the device type is available, as determined
+   by PTX_avail.  */
+bool
+GOMP_OFFLOAD_openacc_avail (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+ return PTX_avail ();
+}
+
+/* Plugin hook: arrange for TARG_MEM_DESC to be cleaned up once the work
+   currently queued on the calling thread's current stream has completed.
+   An event is recorded on that stream and registered as
+   PTX_EVT_ASYNC_CLEANUP with TARG_MEM_DESC as its payload.  CUDA failures
+   are fatal.  */
+void
+GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
+{
+  struct nvptx_thread *nvthd = nvptx_thread ();
+  CUevent *marker;
+  CUresult status;
+
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__,
+           targ_mem_desc);
+#endif
+
+  marker = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
+
+  status = cuEventCreate (marker, CU_EVENT_DISABLE_TIMING);
+  if (status != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (status));
+
+  status = cuEventRecord (*marker, nvthd->current_stream->stream);
+  if (status != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (status));
+
+  event_add (PTX_EVT_ASYNC_CLEANUP, marker, targ_mem_desc);
+}
+
+/* Plugin hook: test the asynchronous queue ASYNC.  The result of
+   PTX_async_test (ASYNC) is returned unchanged.  When DEBUG is defined the
+   call is traced on stderr first.  */
+
+int
+GOMP_OFFLOAD_openacc_async_test (int async)
+{
+  int result;
+
+#if defined (DEBUG)
+  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n",
+           __FILE__, __FUNCTION__, async);
+#endif
+
+  result = PTX_async_test (async);
+  return result;
+}
+
+/* Plugin hook: returns the result of PTX_async_test_all unchanged.  When
+   DEBUG is defined the call is traced on stderr first.  */
+
+int
+GOMP_OFFLOAD_openacc_async_test_all (void)
+{
+  int result;
+
+#if defined (DEBUG)
+  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+  result = PTX_async_test_all ();
+  return result;
+}
+
+/* Plugin hook: wait on the asynchronous queue ASYNC by calling PTX_wait.
+   When DEBUG is defined the call is traced on stderr first.  */
+
+void
+GOMP_OFFLOAD_openacc_async_wait (int async)
+{
+#if defined (DEBUG)
+  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n",
+           __FILE__, __FUNCTION__, async);
+#endif
+
+  PTX_wait (async);
+}
+
+/* Plugin hook: forwards ASYNC1 and ASYNC2, in that order, to PTX_wait_async.
+   When DEBUG is defined the call is traced on stderr first.  */
+
+void
+GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
+{
+#if defined (DEBUG)
+  fprintf (stderr, "libgomp plugin: %s:%s (%d, %d)\n",
+           __FILE__, __FUNCTION__, async1, async2);
+#endif
+
+  PTX_wait_async (async1, async2);
+}
+
+/* Plugin hook: calls PTX_wait_all.  When DEBUG is defined the call is
+   traced on stderr first.  */
+
+void
+GOMP_OFFLOAD_openacc_async_wait_all (void)
+{
+#if defined (DEBUG)
+  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+  PTX_wait_all ();
+}
+
+/* Plugin hook: forwards ASYNC to PTX_wait_all_async.  When DEBUG is defined
+   the call is traced on stderr first.  */
+
+void
+GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
+{
+#if defined (DEBUG)
+  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n",
+           __FILE__, __FUNCTION__, async);
+#endif
+
+  PTX_wait_all_async (async);
+}
+
+/* Plugin hook: forwards ASYNC to PTX_set_async.  When DEBUG is defined the
+   call is traced on stderr first.  */
+
+void
+GOMP_OFFLOAD_openacc_async_set_async (int async)
+{
+#if defined (DEBUG)
+  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n",
+           __FILE__, __FUNCTION__, async);
+#endif
+
+  PTX_set_async (async);
+}
+
+/* Plugin hook: build the per-thread plugin data for a thread that will use
+   the device described by TARG_DATA (a struct PTX_device).
+
+   If the calling thread has no current CUDA context, the device's context
+   is pushed onto the thread.  The device is expected to have a context
+   already (checked with assert).  CUDA failures are reported through
+   GOMP_PLUGIN_fatal.
+
+   Only the current_stream and ptx_dev members of the new nvptx_thread are
+   set here: the thread starts on the device's null stream.  The structure
+   is returned as an opaque pointer; presumably it is later released via
+   GOMP_OFFLOAD_openacc_destroy_thread_data -- TODO confirm.  */
+
+void *
+GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data)
+{
+  struct PTX_device *ptx_dev = targ_data;
+  struct nvptx_thread *thread_data;
+  CUcontext current_ctx;
+  CUresult status;
+
+  thread_data = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
+
+  status = cuCtxGetCurrent (&current_ctx);
+  if (status != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuErrorMsg (status));
+
+  assert (ptx_dev->ctx);
+
+  /* A thread that already has a current context is left alone.  */
+  if (!current_ctx)
+    {
+      status = cuCtxPushCurrent (ptx_dev->ctx);
+      if (status != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s",
+                           cuErrorMsg (status));
+    }
+
+  thread_data->current_stream = ptx_dev->null_stream;
+  thread_data->ptx_dev = ptx_dev;
+
+  return thread_data;
+}
+
+/* Plugin hook: release per-thread plugin data.  DATA is passed straight to
+   free, so it must be a pointer that free accepts (NULL included);
+   presumably it is the value returned by
+   GOMP_OFFLOAD_openacc_create_thread_data -- TODO confirm against the
+   caller.  Nothing else is torn down here.  */
+
+void
+GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
+{
+ free (data);
+}
+
+/* Plugin hook: returns the result of PTX_get_current_cuda_device unchanged.
+   When DEBUG is defined the call is traced on stderr first.  */
+
+void *
+GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
+{
+  void *cuda_dev;
+
+#if defined (DEBUG)
+  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+  cuda_dev = PTX_get_current_cuda_device ();
+  return cuda_dev;
+}
+
+/* Plugin hook: returns the result of PTX_get_current_cuda_context
+   unchanged.  When DEBUG is defined the call is traced on stderr first.  */
+
+void *
+GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
+{
+  void *cuda_ctx;
+
+#if defined (DEBUG)
+  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+  cuda_ctx = PTX_get_current_cuda_context ();
+  return cuda_ctx;
+}
+
+/* Plugin hook: returns the result of PTX_get_cuda_stream (ASYNC) unchanged.
+   When DEBUG is defined the call is traced on stderr first.
+
+   NOTE: The value returned is a CUstream, not a PTX_stream pointer.  */
+
+void *
+GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
+{
+  void *cuda_stream;
+
+#if defined (DEBUG)
+  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n",
+           __FILE__, __FUNCTION__, async);
+#endif
+
+  cuda_stream = PTX_get_cuda_stream (async);
+  return cuda_stream;
+}
+
+/* Plugin hook: forwards ASYNC and STREAM to PTX_set_cuda_stream and returns
+   its result unchanged.  When DEBUG is defined the call is traced on stderr
+   first.
+
+   NOTE: STREAM is a CUstream, not a PTX_stream pointer.  */
+
+int
+GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
+{
+  int status;
+
+#if defined (DEBUG)
+  fprintf (stderr, "libgomp plugin: %s:%s (%d, %p)\n",
+           __FILE__, __FUNCTION__, async, stream);
+#endif
+
+  status = PTX_set_cuda_stream (async, stream);
+  return status;
+}