[gomp4] Move libgomp plugins into subdirectory

Message ID	20141105175710.41c2f552@octopus
State	New
Headers	show Return-Path: <gcc-patches-return-382970-incoming=patchwork.ozlabs.org@gcc.gnu.org> DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender:date :from:to:cc:subject:message-id:mime-version:content-type; q=dns; s=default; b=TnzaAgizAEOfrZHhT9yV1akJAdWXCJjUuyw/eoxMo7AIYE/c8K 1CbTJO7nWGezYMuYBeCpQUfzsI/rydS12Snyh0GMS86BPmG+sTpo/xeYUasrZGr5 MvK9nTIO7yzzxStjixD5HI5MCxUSxIAuAC0ZTqKve8+AjDtYgdcVkBy+U= Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk Sender: gcc-patches-owner@gcc.gnu.org Date: Wed, 5 Nov 2014 17:57:10 +0000 From: Julian Brown <julian@codesourcery.com> To: <gcc-patches@gcc.gnu.org> CC: Thomas Schwinge <thomas_schwinge@mentor.com>, Jakub Jelinek <jakub@redhat.com>, Ilya Verbin <iverbin@gmail.com> Subject: [gomp4] Move libgomp plugins into subdirectory Message-ID: <20141105175710.41c2f552@octopus> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="MP_/iDcWLPzB6amxFDZMIun5QBp"

commit 8994fb8c1b9d52cb9c82a61227a450df29e61806 Author: Julian Brown <julian@codesourcery.com> Date: Wed Nov 5 02:54:30 2014 -0800 Move libgomp plugins into their own directory. diff --git a/libgomp/Makefile.am b/libgomp/Makefile.am index e0ab763..f265c5d 100644 --- a/libgomp/Makefile.am +++ b/libgomp/Makefile.am @@ -1,7 +1,8 @@ ## Process this file with automake to produce Makefile.in ACLOCAL_AMFLAGS = -I .. -I ../config -SUBDIRS = testsuite +SUBDIRS = testsuite plugin +DIST_SUBDIRS = plugin ## May be used by toolexeclibdir. gcc_version := $(shell cat $(top_srcdir)/../gcc/BASE-VER) @@ -21,27 +22,6 @@ AM_LDFLAGS = $(XLDFLAGS) $(SECTION_LDFLAGS) $(OPT_LDFLAGS) toolexeclib_LTLIBRARIES = libgomp.la nodist_toolexeclib_HEADERS = libgomp.spec -if PLUGIN_NVPTX -# Nvidia PTX OpenACC plugin. -libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION) -toolexeclib_LTLIBRARIES += libgomp-plugin-nvptx.la -libgomp_plugin_nvptx_la_SOURCES = plugin-nvptx.c -libgomp_plugin_nvptx_la_CPPFLAGS = $(AM_CPPFLAGS) $(PLUGIN_NVPTX_CPPFLAGS) -libgomp_plugin_nvptx_la_LDFLAGS = $(libgomp_plugin_nvptx_version_info) \ - $(lt_host_flags) -libgomp_plugin_nvptx_la_LDFLAGS += $(PLUGIN_NVPTX_LDFLAGS) -libgomp_plugin_nvptx_la_LIBADD = $(PLUGIN_NVPTX_LIBS) -libgomp_plugin_nvptx_la_LIBTOOLFLAGS = --tag=disable-static -endif - -libgomp_plugin_host_nonshm_version_info = -version-info $(libtool_VERSION) -toolexeclib_LTLIBRARIES += libgomp-plugin-host_nonshm.la -libgomp_plugin_host_nonshm_la_SOURCES = oacc-host.c -libgomp_plugin_host_nonshm_la_CPPFLAGS = $(AM_CPPFLAGS) -DHOST_NONSHM_PLUGIN -libgomp_plugin_host_nonshm_la_LDFLAGS = \ - $(libgomp_plugin_host_nonshm_version_info) $(lt_host_flags) -libgomp_plugin_host_nonshm_la_LIBTOOLFLAGS = --tag=disable-static - if LIBGOMP_BUILD_VERSIONED_SHLIB # -Wc is only a libtool option. 
comma = , diff --git a/libgomp/Makefile.in b/libgomp/Makefile.in index d12376e..ea3e1ca 100644 diff --git a/libgomp/configure b/libgomp/configure index 7daccd9..11a7ae0 100755 diff --git a/libgomp/configure.ac b/libgomp/configure.ac index 89c6b31..e883945 100644 --- a/libgomp/configure.ac +++ b/libgomp/configure.ac @@ -30,42 +30,6 @@ LIBGOMP_ENABLE(generated-files-in-srcdir, no, , AC_MSG_RESULT($enable_generated_files_in_srcdir) AM_CONDITIONAL(GENINSRC, test "$enable_generated_files_in_srcdir" = yes) -# Look for the CUDA driver package. -CUDA_DRIVER_INCLUDE= -CUDA_DRIVER_LIB= -AC_SUBST(CUDA_DRIVER_INCLUDE) -AC_SUBST(CUDA_DRIVER_LIB) -CUDA_DRIVER_CPPFLAGS= -CUDA_DRIVER_LDFLAGS= -AC_ARG_WITH(cuda-driver, - [AS_HELP_STRING([--with-cuda-driver=PATH], - [specify prefix directory for installed CUDA driver package. - Equivalent to --with-cuda-driver-include=PATH/include - plus --with-cuda-driver-lib=PATH/lib])]) -AC_ARG_WITH(cuda-driver-include, - [AS_HELP_STRING([--with-cuda-driver-include=PATH], - [specify directory for installed CUDA driver include files])]) -AC_ARG_WITH(cuda-driver-lib, - [AS_HELP_STRING([--with-cuda-driver-lib=PATH], - [specify directory for the installed CUDA driver library])]) -if test "x$with_cuda_driver" != x; then - CUDA_DRIVER_INCLUDE=$with_cuda_driver/include - CUDA_DRIVER_LIB=$with_cuda_driver/lib -fi -if test "x$with_cuda_driver_include" != x; then - CUDA_DRIVER_INCLUDE=$with_cuda_driver_include -fi -if test "x$with_cuda_driver_lib" != x; then - CUDA_DRIVER_LIB=$with_cuda_driver_lib -fi -if test "x$CUDA_DRIVER_INCLUDE" != x; then - CUDA_DRIVER_CPPFLAGS=-I$CUDA_DRIVER_INCLUDE -fi -if test "x$CUDA_DRIVER_LIB" != x; then - CUDA_DRIVER_LDFLAGS=-L$CUDA_DRIVER_LIB -fi - - # ------- # ------- @@ -241,52 +205,7 @@ elif test "x$enable_accelerator" != xno; then AC_MSG_ERROR([Can't have support for accelerators without support for plugins]) fi -PLUGIN_NVPTX=0 -PLUGIN_NVPTX_CPPFLAGS= -PLUGIN_NVPTX_LDFLAGS= -PLUGIN_NVPTX_LIBS= -AC_SUBST(PLUGIN_NVPTX) 
-AC_SUBST(PLUGIN_NVPTX_CPPFLAGS) -AC_SUBST(PLUGIN_NVPTX_LDFLAGS) -AC_SUBST(PLUGIN_NVPTX_LIBS) -# enable_accelerator has already been validated at top level. -# No need to do it again. -case $enable_offload_targets in - auto-nvptx*|nvptx*) - PLUGIN_NVPTX=yes - PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS - PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS - PLUGIN_NVPTX_LIBS='-lcuda' - - PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS - CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS" - PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS - LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS" - PLUGIN_NVPTX_save_LIBS=$LIBS - LIBS="$PLUGIN_NVPTX_LIBS $LIBS" - AC_LINK_IFELSE( - [AC_LANG_PROGRAM( - [#include "cuda.h"], - [CUresult r = cuCtxPushCurrent (NULL);])], - [PLUGIN_NVPTX=1]) - CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS - LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS - LIBS=$PLUGIN_NVPTX_save_LIBS - case $PLUGIN_NVPTX in - auto-nvptx*) - PLUGIN_NVPTX=0 - AC_MSG_WARN([CUDA driver package required for nvptx support; disabling]) - ;; - nvptx*) - PLUGIN_NVPTX=0 - AC_MSG_ERROR([CUDA driver package required for nvptx support]) - ;; - esac - ;; -esac -AM_CONDITIONAL([PLUGIN_NVPTX], [test $PLUGIN_NVPTX = 1]) -AC_DEFINE_UNQUOTED([PLUGIN_NVPTX], [$PLUGIN_NVPTX], - [Define to 1 if the NVIDIA plugin is built, 0 if not.]) +AC_CONFIG_SUBDIRS([plugin]) # Check for functions needed. AC_CHECK_FUNCS(getloadavg clock_gettime strtoull) diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c index 02794bb..624024b 100644 --- a/libgomp/oacc-host.c +++ b/libgomp/oacc-host.c @@ -1,4 +1,4 @@ -/* OpenACC Runtime Library: acc_device_host, acc_device_host_nonshm. +/* OpenACC Runtime Library: acc_device_host. Copyright (C) 2013-2014 Free Software Foundation, Inc. @@ -25,430 +25,6 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -/* Simple implementation of support routines for a shared-memory - acc_device_host, and a non-shared memory acc_device_host_nonshm, with the - latter built as a plugin. 
*/ - -#include "openacc.h" -#include "config.h" -#include "libgomp.h" -#include "target.h" -#ifdef HOST_NONSHM_PLUGIN -#include "libgomp-plugin.h" -#include "oacc-plugin.h" -#else -#include "oacc-int.h" -#endif - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include <stdio.h> - -#ifdef HOST_NONSHM_PLUGIN -#define STATIC -#define GOMP(X) GOMP_PLUGIN_##X -#define SELF "host_nonshm plugin: " -#else -#define STATIC static -#define GOMP(X) gomp_##X -#define SELF "host: " -#endif - -#ifndef HOST_NONSHM_PLUGIN -static struct gomp_device_descr host_dispatch; -#endif - -STATIC const char * -GOMP_OFFLOAD_get_name (void) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__); -#endif - -#ifdef HOST_NONSHM_PLUGIN - return "host_nonshm"; -#else - return "host"; -#endif -} - -STATIC int -GOMP_OFFLOAD_get_type (void) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__); -#endif - -#ifdef HOST_NONSHM_PLUGIN - return TARGET_TYPE_HOST_NONSHM; -#else - return TARGET_TYPE_HOST; -#endif -} - -STATIC unsigned int -GOMP_OFFLOAD_get_caps (void) -{ - unsigned int caps = TARGET_CAP_OPENACC_200 | TARGET_CAP_OPENMP_400 - | TARGET_CAP_NATIVE_EXEC; - -#ifndef HOST_NONSHM_PLUGIN - caps |= TARGET_CAP_SHARED_MEM; -#endif - -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s: 0x%x\n", __FILE__, __FUNCTION__, caps); -#endif - - return caps; -} - -STATIC int -GOMP_OFFLOAD_get_num_devices (void) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__); -#endif - - return 1; -} - -STATIC void -GOMP_OFFLOAD_register_image (void *host_table, void *target_data) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s (%p, %p)\n", __FILE__, __FUNCTION__, host_table, - target_data); -#endif -} - -STATIC int -GOMP_OFFLOAD_init_device (void) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__); -#endif - - return GOMP_OFFLOAD_get_num_devices (); -} - -STATIC int -GOMP_OFFLOAD_fini_device (void) -{ -#ifdef DEBUG - fprintf 
(stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__); -#endif - - return 0; -} - -STATIC int -GOMP_OFFLOAD_get_table (struct mapping_table **table) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s (%p)\n", __FILE__, __FUNCTION__, table); -#endif - - return 0; -} - -STATIC bool -GOMP_OFFLOAD_openacc_avail (void) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__); -#endif - - return 1; -} - -STATIC void * -GOMP_OFFLOAD_openacc_open_device (int n) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s (%u)\n", __FILE__, __FUNCTION__, n); -#endif - - return (void *) (intptr_t) n; -} - -STATIC int -GOMP_OFFLOAD_openacc_close_device (void *hnd) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s (%p)\n", __FILE__, __FUNCTION__, hnd); -#endif - - return 0; -} - -STATIC int -GOMP_OFFLOAD_openacc_get_device_num (void) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__); -#endif - - return 0; -} - -STATIC void -GOMP_OFFLOAD_openacc_set_device_num (int n) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s (%u)\n", __FILE__, __FUNCTION__, n); -#endif - - if (n > 0) - GOMP(fatal) ("device number %u out of range for host execution", n); -} - -STATIC void * -GOMP_OFFLOAD_alloc (size_t s) -{ - void *ptr = GOMP(malloc) (s); - -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s (%zd): %p\n", __FILE__, __FUNCTION__, s, ptr); -#endif - - return ptr; -} - -STATIC void -GOMP_OFFLOAD_free (void *p) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s (%p)\n", __FILE__, __FUNCTION__, p); -#endif - - free (p); -} - -STATIC void * -GOMP_OFFLOAD_host2dev (void *d, const void *h, size_t s) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s (%p, %p, %zd)\n", __FILE__, __FUNCTION__, d, h, - s); -#endif - -#ifdef HOST_NONSHM_PLUGIN - memcpy (d, h, s); -#endif - - return 0; -} - -STATIC void * -GOMP_OFFLOAD_dev2host (void *h, const void *d, size_t s) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s (%p, %p, %zd)\n", __FILE__, __FUNCTION__, h, d, - s); -#endif - -#ifdef 
HOST_NONSHM_PLUGIN - memcpy (h, d, s); -#endif - - return 0; -} - -STATIC void -GOMP_OFFLOAD_run (void *fn_ptr, void *vars) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s (%p, %p)\n", __FILE__, __FUNCTION__, fn_ptr, - vars); -#endif - - void (*fn)(void *) = (void (*)(void *)) fn_ptr; - - fn (vars); -} - -STATIC void -GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), - size_t mapnum __attribute__((unused)), - void **hostaddrs __attribute__((unused)), - void **devaddrs __attribute__((unused)), - size_t *sizes __attribute__((unused)), - unsigned short *kinds __attribute__((unused)), - int num_gangs __attribute__((unused)), - int num_workers __attribute__((unused)), - int vector_length __attribute__((unused)), - int async __attribute__((unused)), - void *targ_mem_desc __attribute__((unused))) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s (%p, %zu, %p, %p, %p, %d, %d, %d, %d, %p)\n", - __FILE__, __FUNCTION__, fn, mapnum, hostaddrs, sizes, kinds, - num_gangs, num_workers, vector_length, async, targ_mem_desc); -#endif - -#ifdef HOST_NONSHM_PLUGIN - fn (devaddrs); -#else - fn (hostaddrs); -#endif -} - -STATIC void -GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc) -{ -#ifdef HOST_NONSHM_PLUGIN - /* "Asynchronous" launches are executed synchronously on the (non-SHM) host, - so there's no point in delaying host-side cleanup -- just do it now. 
*/ - GOMP_PLUGIN_async_unmap_vars (targ_mem_desc); -#endif -} - -STATIC void -GOMP_OFFLOAD_openacc_async_set_async (int async __attribute__((unused))) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s (%d)\n", __FILE__, __FUNCTION__, async); -#endif -} - -STATIC int -GOMP_OFFLOAD_openacc_async_test (int async __attribute__((unused))) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s (%d)\n", __FILE__, __FUNCTION__, async); -#endif - - return 1; -} - -STATIC int -GOMP_OFFLOAD_openacc_async_test_all (void) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__); -#endif - - return 1; -} - -STATIC void -GOMP_OFFLOAD_openacc_async_wait (int async __attribute__((unused))) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s (%d)\n", __FILE__, __FUNCTION__, async); -#endif -} - -STATIC void -GOMP_OFFLOAD_openacc_async_wait_all (void) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__); -#endif -} - -STATIC void -GOMP_OFFLOAD_openacc_async_wait_async (int async1 __attribute__((unused)), - int async2 __attribute__((unused))) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s (%d, %d)\n", __FILE__, __FUNCTION__, async1, - async2); -#endif -} - -STATIC void -GOMP_OFFLOAD_openacc_async_wait_all_async (int async __attribute__((unused))) -{ -#ifdef DEBUG - fprintf (stderr, SELF "%s:%s (%d)\n", __FILE__, __FUNCTION__, async); -#endif -} - -STATIC void * -GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data __attribute__((unused))) -{ - return NULL; -} - -STATIC void -GOMP_OFFLOAD_openacc_destroy_thread_data (void *tls_data __attribute__((unused))) -{ -} - -#ifndef HOST_NONSHM_PLUGIN -static struct gomp_device_descr host_dispatch = - { - .name = "host", - - .type = TARGET_TYPE_HOST, - .capabilities = TARGET_CAP_OPENACC_200 | TARGET_CAP_NATIVE_EXEC - | TARGET_CAP_SHARED_MEM, - .id = 0, - - .is_initialized = false, - .offload_regions_registered = false, - - .get_name_func = GOMP_OFFLOAD_get_name, - .get_type_func = GOMP_OFFLOAD_get_type, - 
.get_caps_func = GOMP_OFFLOAD_get_caps, - - .init_device_func = GOMP_OFFLOAD_init_device, - .fini_device_func = GOMP_OFFLOAD_fini_device, - .get_num_devices_func = GOMP_OFFLOAD_get_num_devices, - .register_image_func = GOMP_OFFLOAD_register_image, - .get_table_func = GOMP_OFFLOAD_get_table, - - .alloc_func = GOMP_OFFLOAD_alloc, - .free_func = GOMP_OFFLOAD_free, - .host2dev_func = GOMP_OFFLOAD_host2dev, - .dev2host_func = GOMP_OFFLOAD_dev2host, - - .run_func = GOMP_OFFLOAD_run, - - .openacc = { - .open_device_func = GOMP_OFFLOAD_openacc_open_device, - .close_device_func = GOMP_OFFLOAD_openacc_close_device, - - .get_device_num_func = GOMP_OFFLOAD_openacc_get_device_num, - .set_device_num_func = GOMP_OFFLOAD_openacc_set_device_num, - - /* Device available. */ - .avail_func = GOMP_OFFLOAD_openacc_avail, - - .exec_func = GOMP_OFFLOAD_openacc_parallel, - - .register_async_cleanup_func - = GOMP_OFFLOAD_openacc_register_async_cleanup, - - .async_set_async_func = GOMP_OFFLOAD_openacc_async_set_async, - .async_test_func = GOMP_OFFLOAD_openacc_async_test, - .async_test_all_func = GOMP_OFFLOAD_openacc_async_test_all, - .async_wait_func = GOMP_OFFLOAD_openacc_async_wait, - .async_wait_async_func = GOMP_OFFLOAD_openacc_async_wait_async, - .async_wait_all_func = GOMP_OFFLOAD_openacc_async_wait_all, - .async_wait_all_async_func = GOMP_OFFLOAD_openacc_async_wait_all_async, - - .create_thread_data_func = GOMP_OFFLOAD_openacc_create_thread_data, - .destroy_thread_data_func = GOMP_OFFLOAD_openacc_destroy_thread_data, - - .cuda = { - .get_current_device_func = NULL, - .get_current_context_func = NULL, - .get_stream_func = NULL, - .set_stream_func = NULL, - } - } - }; - -/* Register this device type. */ -static __attribute__ ((constructor)) -void ACC_host_init (void) -{ - gomp_mutex_init (&host_dispatch.mem_map.lock); - ACC_register (&host_dispatch); -} -#endif - +/* This shares much of the implementation of the plugin-host.c "host_nonshm" + plugin. 
*/ +#include "plugin/plugin-host.c" diff --git a/libgomp/plugin-nvptx.c b/libgomp/plugin-nvptx.c deleted file mode 100644 index 4271c69..0000000 --- a/libgomp/plugin-nvptx.c +++ /dev/null @@ -1,1909 +0,0 @@ -/* Plugin for NVPTX execution. - - Copyright (C) 2013-2014 Free Software Foundation, Inc. - - Contributed by Mentor Embedded. - - This file is part of the GNU OpenMP Library (libgomp). - - Libgomp is free software; you can redistribute it and/or modify it - under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3, or (at your option) - any later version. - - Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY - WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - FOR A PARTICULAR PURPOSE. See the GNU General Public License for - more details. - - Under Section 7 of GPL version 3, you are granted additional - permissions described in the GCC Runtime Library Exception, version - 3.1, as published by the Free Software Foundation. - - You should have received a copy of the GNU General Public License and - a copy of the GCC Runtime Library Exception along with this program; - see the files COPYING3 and COPYING.RUNTIME respectively. If not, see - <http://www.gnu.org/licenses/>. */ - -/* Nvidia PTX-specific parts of OpenACC support. The cuda driver - library appears to hold some implicit state, but the documentation - is not clear as to what that state might be. Or how one might - propagate it from one thread to another. 
*/ - -#include "openacc.h" -#include "config.h" -#include "libgomp.h" -#include "target.h" -#include "libgomp-plugin.h" -#include "oacc-ptx.h" -#include "oacc-plugin.h" - -#include <cuda.h> -#include <stdint.h> -#include <string.h> -#include <stdio.h> -#include <dlfcn.h> -#include <unistd.h> -#include <assert.h> - -#define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0])) - -static struct _errlist -{ - CUresult r; - char *m; -} cuErrorList[] = { - { CUDA_ERROR_INVALID_VALUE, "invalid value" }, - { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" }, - { CUDA_ERROR_NOT_INITIALIZED, "not initialized" }, - { CUDA_ERROR_DEINITIALIZED, "deinitialized" }, - { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" }, - { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" }, - { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" }, - { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" }, - { CUDA_ERROR_NO_DEVICE, "no device" }, - { CUDA_ERROR_INVALID_DEVICE, "invalid device" }, - { CUDA_ERROR_INVALID_IMAGE, "invalid image" }, - { CUDA_ERROR_INVALID_CONTEXT, "invalid context" }, - { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" }, - { CUDA_ERROR_MAP_FAILED, "map error" }, - { CUDA_ERROR_UNMAP_FAILED, "unmap error" }, - { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" }, - { CUDA_ERROR_ALREADY_MAPPED, "already mapped" }, - { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" }, - { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" }, - { CUDA_ERROR_NOT_MAPPED, "not mapped" }, - { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" }, - { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" }, - { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" }, - { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" }, - { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" }, - { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" }, - { CUDA_ERROR_INVALID_SOURCE, "invalid source" }, - { CUDA_ERROR_FILE_NOT_FOUND, "file not found" }, - { 
CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, - "shared object symbol not found" }, - { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" }, - { CUDA_ERROR_OPERATING_SYSTEM, "operating system" }, - { CUDA_ERROR_INVALID_HANDLE, "invalid handle" }, - { CUDA_ERROR_NOT_FOUND, "not found" }, - { CUDA_ERROR_NOT_READY, "not ready" }, - { CUDA_ERROR_LAUNCH_FAILED, "launch error" }, - { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" }, - { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" }, - { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - "launch incompatibe texturing" }, - { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" }, - { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled " }, - { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary cotext active" }, - { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" }, - { CUDA_ERROR_ASSERT, "assert" }, - { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" }, - { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED, - "host memory already registered" }, - { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" }, - { CUDA_ERROR_NOT_PERMITTED, "no permitted" }, - { CUDA_ERROR_NOT_SUPPORTED, "not supported" }, - { CUDA_ERROR_UNKNOWN, "unknown" } -}; - -static char errmsg[128]; - -static char * -cuErrorMsg (CUresult r) -{ - int i; - - for (i = 0; i < ARRAYSIZE (cuErrorList); i++) - { - if (cuErrorList[i].r == r) - return &cuErrorList[i].m[0]; - } - - sprintf (&errmsg[0], "unknown result code: %5d", r); - - return &errmsg[0]; -} - -struct targ_fn_descriptor -{ - CUfunction fn; - const char *name; -}; - -static bool PTX_inited = false; - -struct PTX_stream -{ - CUstream stream; - pthread_t host_thread; - bool multithreaded; - - CUdeviceptr d; - void *h; - void *h_begin; - void *h_end; - void *h_next; - void *h_prev; - void *h_tail; - - struct PTX_stream *next; -}; - -/* Thread-specific data for PTX. 
*/ - -struct nvptx_thread -{ - struct PTX_stream *current_stream; - struct PTX_device *ptx_dev; -}; - -struct map -{ - int async; - size_t size; - char mappings[0]; -}; - -static void -map_init (struct PTX_stream *s) -{ - CUresult r; - - int size = getpagesize (); - - assert (s); - assert (!s->d); - assert (!s->h); - - r = cuMemAllocHost (&s->h, size); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuErrorMsg (r)); - - r = cuMemHostGetDevicePointer (&s->d, s->h, 0); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuErrorMsg (r)); - - assert (s->h); - - s->h_begin = s->h; - s->h_end = s->h_begin + size; - s->h_next = s->h_prev = s->h_tail = s->h_begin; - - assert (s->h_next); - assert (s->h_end); -} - -static void -map_fini (struct PTX_stream *s) -{ - CUresult r; - - r = cuMemFreeHost (s->h); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuErrorMsg (r)); -} - -static void -map_pop (struct PTX_stream *s) -{ - struct map *m; - - assert (s != NULL); - assert (s->h_next); - assert (s->h_prev); - assert (s->h_tail); - - m = s->h_tail; - - s->h_tail += m->size; - - if (s->h_tail >= s->h_end) - s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end); - - if (s->h_next == s->h_tail) - s->h_prev = s->h_next; - - assert (s->h_next >= s->h_begin); - assert (s->h_tail >= s->h_begin); - assert (s->h_prev >= s->h_begin); - - assert (s->h_next <= s->h_end); - assert (s->h_tail <= s->h_end); - assert (s->h_prev <= s->h_end); -} - -static void -map_push (struct PTX_stream *s, int async, size_t size, void **h, void **d) -{ - int left; - int offset; - struct map *m; - - assert (s != NULL); - - left = s->h_end - s->h_next; - size += sizeof (struct map); - - assert (s->h_prev); - assert (s->h_next); - - if (size >= left) - { - m = s->h_prev; - m->size += left; - s->h_next = s->h_begin; - - if (s->h_next + size > s->h_end) - GOMP_PLUGIN_fatal ("unable to push map"); - } - - assert (s->h_next); - - m 
= s->h_next; - m->async = async; - m->size = size; - - offset = (void *)&m->mappings[0] - s->h; - - *d = (void *)(s->d + offset); - *h = (void *)(s->h + offset); - - s->h_prev = s->h_next; - s->h_next += size; - - assert (s->h_prev); - assert (s->h_next); - - assert (s->h_next >= s->h_begin); - assert (s->h_tail >= s->h_begin); - assert (s->h_prev >= s->h_begin); - assert (s->h_next <= s->h_end); - assert (s->h_tail <= s->h_end); - assert (s->h_prev <= s->h_end); - - return; -} - -struct PTX_device -{ - CUcontext ctx; - bool ctx_shared; - CUdevice dev; - struct PTX_stream *null_stream; - /* All non-null streams associated with this device (actually context), - either created implicitly or passed in from the user (via - acc_set_cuda_stream). */ - struct PTX_stream *active_streams; - struct { - struct PTX_stream **arr; - int size; - } async_streams; - /* A lock for use when manipulating the above stream list and array. */ - gomp_mutex_t stream_lock; - int ord; - bool overlap; - bool map; - bool concur; - int mode; - bool mkern; - - struct PTX_device *next; -}; - -enum PTX_event_type -{ - PTX_EVT_MEM, - PTX_EVT_KNL, - PTX_EVT_SYNC, - PTX_EVT_ASYNC_CLEANUP -}; - -struct PTX_event -{ - CUevent *evt; - int type; - void *addr; - int ord; - - struct PTX_event *next; -}; - -static gomp_mutex_t PTX_event_lock; -static struct PTX_event *PTX_events; - -#define _XSTR(s) _STR(s) -#define _STR(s) #s - -static struct _synames -{ - char *n; -} cuSymNames[] = -{ - { _XSTR(cuCtxCreate) }, - { _XSTR(cuCtxDestroy) }, - { _XSTR(cuCtxGetCurrent) }, - { _XSTR(cuCtxPushCurrent) }, - { _XSTR(cuCtxSynchronize) }, - { _XSTR(cuDeviceGet) }, - { _XSTR(cuDeviceGetAttribute) }, - { _XSTR(cuDeviceGetCount) }, - { _XSTR(cuEventCreate) }, - { _XSTR(cuEventDestroy) }, - { _XSTR(cuEventQuery) }, - { _XSTR(cuEventRecord) }, - { _XSTR(cuInit) }, - { _XSTR(cuLaunchKernel) }, - { _XSTR(cuLinkAddData) }, - { _XSTR(cuLinkComplete) }, - { _XSTR(cuLinkCreate) }, - { _XSTR(cuMemAlloc) }, - { 
_XSTR(cuMemAllocHost) }, - { _XSTR(cuMemcpy) }, - { _XSTR(cuMemcpyDtoH) }, - { _XSTR(cuMemcpyDtoHAsync) }, - { _XSTR(cuMemcpyHtoD) }, - { _XSTR(cuMemcpyHtoDAsync) }, - { _XSTR(cuMemFree) }, - { _XSTR(cuMemFreeHost) }, - { _XSTR(cuMemGetAddressRange) }, - { _XSTR(cuMemHostGetDevicePointer) }, - { _XSTR(cuMemHostRegister) }, - { _XSTR(cuMemHostUnregister) }, - { _XSTR(cuModuleGetFunction) }, - { _XSTR(cuModuleLoadData) }, - { _XSTR(cuStreamDestroy) }, - { _XSTR(cuStreamQuery) }, - { _XSTR(cuStreamSynchronize) }, - { _XSTR(cuStreamWaitEvent) } -}; - -static int -verify_device_library (void) -{ - int i; - void *dh, *ds; - - dh = dlopen ("libcuda.so", RTLD_LAZY); - if (!dh) - return -1; - - for (i = 0; i < ARRAYSIZE (cuSymNames); i++) - { - ds = dlsym (dh, cuSymNames[i].n); - if (!ds) - return -1; - } - - dlclose (dh); - - return 0; -} - -static inline struct nvptx_thread * -nvptx_thread (void) -{ - return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread (); -} - -static void -init_streams_for_device (struct PTX_device *ptx_dev, int concurrency) -{ - int i; - struct PTX_stream *null_stream - = GOMP_PLUGIN_malloc (sizeof (struct PTX_stream)); - - null_stream->stream = NULL; - null_stream->host_thread = pthread_self (); - null_stream->multithreaded = true; - null_stream->d = (CUdeviceptr) NULL; - null_stream->h = NULL; - map_init (null_stream); - ptx_dev->null_stream = null_stream; - - ptx_dev->active_streams = NULL; - GOMP_PLUGIN_mutex_init (&ptx_dev->stream_lock); - - if (concurrency < 1) - concurrency = 1; - - /* This is just a guess -- make space for as many async streams as the - current device is capable of concurrently executing. This can grow - later as necessary. No streams are created yet. 
*/ - ptx_dev->async_streams.arr - = GOMP_PLUGIN_malloc (concurrency * sizeof (struct PTX_stream *)); - ptx_dev->async_streams.size = concurrency; - - for (i = 0; i < concurrency; i++) - ptx_dev->async_streams.arr[i] = NULL; -} - -static void -fini_streams_for_device (struct PTX_device *ptx_dev) -{ - free (ptx_dev->async_streams.arr); - - while (ptx_dev->active_streams != NULL) - { - struct PTX_stream *s = ptx_dev->active_streams; - ptx_dev->active_streams = ptx_dev->active_streams->next; - - cuStreamDestroy (s->stream); - map_fini (s); - free (s); - } - - map_fini (ptx_dev->null_stream); - free (ptx_dev->null_stream); -} - -/* Select a stream for (OpenACC-semantics) ASYNC argument for the current - thread THREAD (and also current device/context). If CREATE is true, create - the stream if it does not exist (or use EXISTING if it is non-NULL), and - associate the stream with the same thread argument. Returns stream to use - as result. */ - -static struct PTX_stream * -select_stream_for_async (int async, pthread_t thread, bool create, - CUstream existing) -{ - struct nvptx_thread *nvthd = nvptx_thread (); - /* Local copy of TLS variable. */ - struct PTX_device *ptx_dev = nvthd->ptx_dev; - struct PTX_stream *stream = NULL; - int orig_async = async; - - /* The special value acc_async_noval (-1) maps (for now) to an - implicitly-created stream, which is then handled the same as any other - numbered async stream. Other options are available, e.g. using the null - stream for anonymous async operations, or choosing an idle stream from an - active set. But, stick with this for now. 
*/ - if (async > acc_async_sync) - async++; - - if (create) - GOMP_PLUGIN_mutex_lock (&ptx_dev->stream_lock); - - /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the - null stream, and in fact better performance may be obtainable if it doesn't - (because the null stream enforces overly-strict synchronisation with - respect to other streams for legacy reasons, and that's probably not - needed with OpenACC). Maybe investigate later. */ - if (async == acc_async_sync) - stream = ptx_dev->null_stream; - else if (async >= 0 && async < ptx_dev->async_streams.size - && ptx_dev->async_streams.arr[async] && !(create && existing)) - stream = ptx_dev->async_streams.arr[async]; - else if (async >= 0 && create) - { - if (async >= ptx_dev->async_streams.size) - { - int i, newsize = ptx_dev->async_streams.size * 2; - - if (async >= newsize) - newsize = async + 1; - - ptx_dev->async_streams.arr - = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr, - newsize * sizeof (struct PTX_stream *)); - - for (i = ptx_dev->async_streams.size; i < newsize; i++) - ptx_dev->async_streams.arr[i] = NULL; - - ptx_dev->async_streams.size = newsize; - } - - /* Create a new stream on-demand if there isn't one already, or if we're - setting a particular async value to an existing (externally-provided) - stream. */ - if (!ptx_dev->async_streams.arr[async] || existing) - { - CUresult r; - struct PTX_stream *s - = GOMP_PLUGIN_malloc (sizeof (struct PTX_stream)); - - if (existing) - s->stream = existing; - else - { - r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuErrorMsg (r)); - } - - /* If CREATE is true, we're going to be queueing some work on this - stream. Associate it with the current host thread. 
*/ - s->host_thread = thread; - s->multithreaded = false; - - s->d = (CUdeviceptr) NULL; - s->h = NULL; - map_init (s); - - s->next = ptx_dev->active_streams; - ptx_dev->active_streams = s; - ptx_dev->async_streams.arr[async] = s; - } - - stream = ptx_dev->async_streams.arr[async]; - } - else if (async < 0) - GOMP_PLUGIN_fatal ("bad async %d", async); - - if (create) - { - assert (stream != NULL); - - /* If we're trying to use the same stream from different threads - simultaneously, set stream->multithreaded to true. This affects the - behaviour of acc_async_test_all and acc_wait_all, which are supposed to - only wait for asynchronous launches from the same host thread they are - invoked on. If multiple threads use the same async value, we make note - of that here and fall back to testing/waiting for all threads in those - functions. */ - if (thread != stream->host_thread) - stream->multithreaded = true; - - GOMP_PLUGIN_mutex_unlock (&ptx_dev->stream_lock); - } - else if (stream && !stream->multithreaded - && !pthread_equal (stream->host_thread, thread)) - GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async); - -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s using stream %p (CUDA stream %p) " - "for async %d\n", __FILE__, __FUNCTION__, stream, - stream ? stream->stream : NULL, orig_async); -#endif - - return stream; -} - -static int PTX_get_num_devices (void); - -/* Initialize the device. 
*/ -static int -PTX_init (void) -{ - CUresult r; - int rc; - - if (PTX_inited) - return PTX_get_num_devices (); - - rc = verify_device_library (); - if (rc < 0) - return -1; - - r = cuInit (0); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuInit error: %s", cuErrorMsg (r)); - - PTX_events = NULL; - - GOMP_PLUGIN_mutex_init (&PTX_event_lock); - - PTX_inited = true; - - return PTX_get_num_devices (); -} - -static int -PTX_fini (void) -{ - PTX_inited = false; - - return 0; -} - -static void * -PTX_open_device (int n) -{ - struct PTX_device *ptx_dev; - CUdevice dev; - CUresult r; - int async_engines, pi; - - r = cuDeviceGet (&dev, n); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuErrorMsg (r)); - - ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct PTX_device)); - - ptx_dev->ord = n; - ptx_dev->dev = dev; - ptx_dev->ctx_shared = false; - - r = cuCtxGetCurrent (&ptx_dev->ctx); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuErrorMsg (r)); - - if (!ptx_dev->ctx) - { - r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuErrorMsg (r)); - } - else - ptx_dev->ctx_shared = true; - - r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r)); - - ptx_dev->overlap = pi; - - r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r)); - - ptx_dev->map = pi; - - r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r)); - - ptx_dev->concur = pi; - - r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r)); - - 
ptx_dev->mode = pi; - - r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r)); - - ptx_dev->mkern = pi; - - r = cuDeviceGetAttribute (&async_engines, - CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev); - if (r != CUDA_SUCCESS) - async_engines = 1; - - init_streams_for_device (ptx_dev, async_engines); - - return (void *) ptx_dev; -} - -static int -PTX_close_device (void *targ_data) -{ - CUresult r; - struct PTX_device *ptx_dev = targ_data; - - if (!ptx_dev) - return 0; - - fini_streams_for_device (ptx_dev); - - if (!ptx_dev->ctx_shared) - { - r = cuCtxDestroy (ptx_dev->ctx); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuErrorMsg (r)); - } - - free (ptx_dev); - - return 0; -} - -static int -PTX_get_num_devices (void) -{ - int n; - CUresult r; - - /* This function will be called before the plugin has been initialized in - order to enumerate available devices, but CUDA API routines can't be used - until cuInit has been called. Just call it now (but don't yet do any - further initialization). 
*/ - if (!PTX_inited) - cuInit (0); - - r = cuDeviceGetCount (&n); - if (r!= CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuErrorMsg (r)); - - return n; -} - -static bool -PTX_avail(void) -{ - bool avail = false; - - if (PTX_init () > 0) - avail = true; - - return avail; -} - -static void -link_ptx (CUmodule *module, char *ptx_code) -{ - CUjit_option opts[7]; - void *optvals[7]; - float elapsed = 0.0; -#define LOGSIZE 8192 - char elog[LOGSIZE]; - char ilog[LOGSIZE]; - unsigned long logsize = LOGSIZE; - CUlinkState linkstate; - CUresult r; - void *linkout; - size_t linkoutsize __attribute__((unused)); - - GOMP_PLUGIN_notify ("attempting to load:\n---\n%s\n---\n", ptx_code); - - opts[0] = CU_JIT_WALL_TIME; - optvals[0] = &elapsed; - - opts[1] = CU_JIT_INFO_LOG_BUFFER; - optvals[1] = &ilog[0]; - - opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; - optvals[2] = (void *) logsize; - - opts[3] = CU_JIT_ERROR_LOG_BUFFER; - optvals[3] = &elog[0]; - - opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; - optvals[4] = (void *) logsize; - - opts[5] = CU_JIT_LOG_VERBOSE; - optvals[5] = (void *) 1; - - opts[6] = CU_JIT_TARGET; - optvals[6] = (void *) CU_TARGET_COMPUTE_30; - - r = cuLinkCreate (7, opts, optvals, &linkstate); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuErrorMsg (r)); - - char *abort_ptx = ABORT_PTX; - r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx, - strlen (abort_ptx) + 1, 0, 0, 0, 0); - if (r != CUDA_SUCCESS) - { - GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]); - GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuErrorMsg (r)); - } - - char *acc_on_device_ptx = ACC_ON_DEVICE_PTX; - r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx, - strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0); - if (r != CUDA_SUCCESS) - { - GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]); - GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s", - cuErrorMsg (r)); - } - - char *goacc_internal_ptx = 
GOACC_INTERNAL_PTX; - r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx, - strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0); - if (r != CUDA_SUCCESS) - { - GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]); - GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s", - cuErrorMsg (r)); - } - - r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code, - strlen (ptx_code) + 1, 0, 0, 0, 0); - if (r != CUDA_SUCCESS) - { - GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]); - GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuErrorMsg (r)); - } - - r = cuLinkComplete (linkstate, &linkout, &linkoutsize); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuErrorMsg (r)); - - GOMP_PLUGIN_notify ("Link complete: %fms\n", elapsed); - GOMP_PLUGIN_notify ("Link log %s\n", &ilog[0]); - - r = cuModuleLoadData (module, linkout); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuErrorMsg (r)); -} - -static void -event_gc (bool memmap_lockable) -{ - struct PTX_event *ptx_event = PTX_events; - struct nvptx_thread *nvthd = nvptx_thread (); - - GOMP_PLUGIN_mutex_lock (&PTX_event_lock); - - while (ptx_event != NULL) - { - CUresult r; - struct PTX_event *e = ptx_event; - - ptx_event = ptx_event->next; - - if (e->ord != nvthd->ptx_dev->ord) - continue; - - r = cuEventQuery (*e->evt); - if (r == CUDA_SUCCESS) - { - CUevent *te; - - te = e->evt; - - switch (e->type) - { - case PTX_EVT_MEM: - case PTX_EVT_SYNC: - break; - - case PTX_EVT_KNL: - map_pop (e->addr); - break; - - case PTX_EVT_ASYNC_CLEANUP: - { - /* The function GOMP_PLUGIN_async_unmap_vars needs to claim the - memory-map splay tree lock for the current device, so we - can't call it when one of our callers has already claimed - the lock. In that case, just delay the GC for this event - until later. 
*/ - if (!memmap_lockable) - continue; - - GOMP_PLUGIN_async_unmap_vars (e->addr); - } - break; - } - - cuEventDestroy (*te); - free ((void *)te); - - if (PTX_events == e) - PTX_events = PTX_events->next; - else - { - struct PTX_event *e_ = PTX_events; - while (e_->next != e) - e_ = e_->next; - e_->next = e_->next->next; - } - - free (e); - } - } - - GOMP_PLUGIN_mutex_unlock (&PTX_event_lock); -} - -static void -event_add (enum PTX_event_type type, CUevent *e, void *h) -{ - struct PTX_event *ptx_event; - struct nvptx_thread *nvthd = nvptx_thread (); - - assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC - || type == PTX_EVT_ASYNC_CLEANUP); - - ptx_event = GOMP_PLUGIN_malloc (sizeof (struct PTX_event)); - ptx_event->type = type; - ptx_event->evt = e; - ptx_event->addr = h; - ptx_event->ord = nvthd->ptx_dev->ord; - - GOMP_PLUGIN_mutex_lock (&PTX_event_lock); - - ptx_event->next = PTX_events; - PTX_events = ptx_event; - - GOMP_PLUGIN_mutex_unlock (&PTX_event_lock); -} - -void -PTX_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, - size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers, - int vector_length, int async, void *targ_mem_desc) -{ - struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn; - CUfunction function; - CUresult r; - int i; - struct PTX_stream *dev_str; - void *kargs[1]; - void *hp, *dp; - unsigned int nthreads_in_block; - struct nvptx_thread *nvthd = nvptx_thread (); - - function = targ_fn->fn; - - dev_str = select_stream_for_async (async, pthread_self (), false, NULL); - assert (dev_str == nvthd->current_stream); - - /* This reserves a chunk of a pre-allocated page of memory mapped on both - the host and the device. HP is a host pointer to the new chunk, and DP is - the corresponding device pointer. 
*/ - map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp); - - GOMP_PLUGIN_notify (" %s: prepare mappings\n", __FUNCTION__); - - /* Copy the array of arguments to the mapped page. */ - for (i = 0; i < mapnum; i++) - ((void **) hp)[i] = devaddrs[i]; - - /* Copy the (device) pointers to arguments to the device (dp and hp might in - fact have the same value on a unified-memory system). */ - r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *)); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuErrorMsg (r)); - - GOMP_PLUGIN_notify (" %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name); - - // XXX: possible geometry mappings?? - // - // OpenACC CUDA - // - // num_gangs blocks - // num_workers warps (where a warp is equivalent to 32 threads) - // vector length threads - // - - /* The openacc vector_length clause 'determines the vector length to use for - vector or SIMD operations'. The question is how to map this to CUDA. - - In CUDA, the warp size is the vector length of a CUDA device. However, the - CUDA interface abstracts away from that, and only shows us warp size - indirectly in maximum number of threads per block, which is a product of - warp size and the number of hyperthreads of a multiprocessor. - - We choose to map openacc vector_length directly onto the number of threads - in a block, in the x dimension. This is reflected in gcc code generation - that uses ThreadIdx.x to access vector elements. - - Attempting to use an openacc vector_length of more than the maximum number - of threads per block will result in a cuda error. 
*/ - nthreads_in_block = vector_length; - - kargs[0] = &dp; - r = cuLaunchKernel (function, - num_gangs, 1, 1, - nthreads_in_block, 1, 1, - 0, dev_str->stream, kargs, 0); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuErrorMsg (r)); - -#ifndef DISABLE_ASYNC - if (async < acc_async_noval) - { - r = cuStreamSynchronize (dev_str->stream); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r)); - } - else - { - CUevent *e; - - e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); - - r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r)); - - event_gc (true); - - r = cuEventRecord (*e, dev_str->stream); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r)); - - event_add (PTX_EVT_KNL, e, (void *)dev_str); - } -#else - r = cuCtxSynchronize (); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuErrorMsg (r)); -#endif - - GOMP_PLUGIN_notify (" %s: kernel %s: finished\n", __FUNCTION__, - targ_fn->name); - -#ifndef DISABLE_ASYNC - if (async < acc_async_noval) -#endif - map_pop (dev_str); -} - -void * openacc_get_current_cuda_context (void); - -static void * -PTX_alloc (size_t s) -{ - CUdeviceptr d; - CUresult r; - - r = cuMemAlloc (&d, s); - if (r == CUDA_ERROR_OUT_OF_MEMORY) - return 0; - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuErrorMsg (r)); - return (void *)d; -} - -static void -PTX_free (void *p) -{ - CUresult r; - CUdeviceptr pb; - size_t ps; - - r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r)); - - if ((CUdeviceptr)p != pb) - GOMP_PLUGIN_fatal ("invalid device address"); - - r = cuMemFree ((CUdeviceptr)p); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuErrorMsg (r)); -} - -static void * -PTX_host2dev (void 
*d, const void *h, size_t s) -{ - CUresult r; - CUdeviceptr pb; - size_t ps; - struct nvptx_thread *nvthd = nvptx_thread (); - - if (!s) - return 0; - - if (!d) - GOMP_PLUGIN_fatal ("invalid device address"); - - r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r)); - - if (!pb) - GOMP_PLUGIN_fatal ("invalid device address"); - - if (!h) - GOMP_PLUGIN_fatal ("invalid host address"); - - if (d == h) - GOMP_PLUGIN_fatal ("invalid host or device address"); - - if ((void *)(d + s) > (void *)(pb + ps)) - GOMP_PLUGIN_fatal ("invalid size"); - -#ifndef DISABLE_ASYNC - if (nvthd->current_stream != nvthd->ptx_dev->null_stream) - { - CUevent *e; - - e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); - - r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r)); - - event_gc (false); - - r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s, - nvthd->current_stream->stream); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuErrorMsg (r)); - - r = cuEventRecord (*e, nvthd->current_stream->stream); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r)); - - event_add (PTX_EVT_MEM, e, (void *)h); - } - else -#endif - { - r = cuMemcpyHtoD ((CUdeviceptr)d, h, s); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuErrorMsg (r)); - } - - return 0; -} - -static void * -PTX_dev2host (void *h, const void *d, size_t s) -{ - CUresult r; - CUdeviceptr pb; - size_t ps; - struct nvptx_thread *nvthd = nvptx_thread (); - - if (!s) - return 0; - - if (!d) - GOMP_PLUGIN_fatal ("invalid device address"); - - r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r)); - - if (!pb) - GOMP_PLUGIN_fatal ("invalid device address"); - - if (!h) - GOMP_PLUGIN_fatal 
("invalid host address"); - - if (d == h) - GOMP_PLUGIN_fatal ("invalid host or device address"); - - if ((void *)(d + s) > (void *)(pb + ps)) - GOMP_PLUGIN_fatal ("invalid size"); - -#ifndef DISABLE_ASYNC - if (nvthd->current_stream != nvthd->ptx_dev->null_stream) - { - CUevent *e; - - e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); - - r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuErrorMsg (r)); - - event_gc (false); - - r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s, - nvthd->current_stream->stream); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuErrorMsg (r)); - - r = cuEventRecord (*e, nvthd->current_stream->stream); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r)); - - event_add (PTX_EVT_MEM, e, (void *)h); - } - else -#endif - { - r = cuMemcpyDtoH (h, (CUdeviceptr)d, s); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuErrorMsg (r)); - } - - return 0; -} - -static void -PTX_set_async (int async) -{ - struct nvptx_thread *nvthd = nvptx_thread (); - nvthd->current_stream - = select_stream_for_async (async, pthread_self (), true, NULL); -} - -static int -PTX_async_test (int async) -{ - CUresult r; - struct PTX_stream *s; - - s = select_stream_for_async (async, pthread_self (), false, NULL); - - if (!s) - GOMP_PLUGIN_fatal ("unknown async %d", async); - - r = cuStreamQuery (s->stream); - if (r == CUDA_SUCCESS) - { - /* The oacc-parallel.c:goacc_wait function calls this hook to determine - whether all work has completed on this stream, and if so omits the call - to the wait hook. If that happens, event_gc might not get called - (which prevents variables from getting unmapped and their associated - device storage freed), so call it here. 
*/ - event_gc (true); - return 1; - } - else if (r == CUDA_ERROR_NOT_READY) - return 0; - - GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuErrorMsg (r)); - - return 0; -} - -static int -PTX_async_test_all (void) -{ - struct PTX_stream *s; - pthread_t self = pthread_self (); - struct nvptx_thread *nvthd = nvptx_thread (); - - GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock); - - for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next) - { - if ((s->multithreaded || pthread_equal (s->host_thread, self)) - && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY) - { - GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock); - return 0; - } - } - - GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock); - - event_gc (true); - - return 1; -} - -static void -PTX_wait (int async) -{ - CUresult r; - struct PTX_stream *s; - - s = select_stream_for_async (async, pthread_self (), false, NULL); - - if (!s) - GOMP_PLUGIN_fatal ("unknown async %d", async); - - r = cuStreamSynchronize (s->stream); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r)); - - event_gc (true); -} - -static void -PTX_wait_async (int async1, int async2) -{ - CUresult r; - CUevent *e; - struct PTX_stream *s1, *s2; - pthread_t self = pthread_self (); - - /* The stream that is waiting (rather than being waited for) doesn't - necessarily have to exist already. 
*/ - s2 = select_stream_for_async (async2, self, true, NULL); - - s1 = select_stream_for_async (async1, self, false, NULL); - if (!s1) - GOMP_PLUGIN_fatal ("invalid async 1\n"); - - if (s1 == s2) - GOMP_PLUGIN_fatal ("identical parameters"); - - e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); - - r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r)); - - event_gc (true); - - r = cuEventRecord (*e, s1->stream); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r)); - - event_add (PTX_EVT_SYNC, e, NULL); - - r = cuStreamWaitEvent (s2->stream, *e, 0); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuErrorMsg (r)); -} - -static void -PTX_wait_all (void) -{ - CUresult r; - struct PTX_stream *s; - pthread_t self = pthread_self (); - struct nvptx_thread *nvthd = nvptx_thread (); - - GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock); - - /* Wait for active streams initiated by this thread (or by multiple threads) - to complete. */ - for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next) - { - if (s->multithreaded || pthread_equal (s->host_thread, self)) - { - r = cuStreamQuery (s->stream); - if (r == CUDA_SUCCESS) - continue; - else if (r != CUDA_ERROR_NOT_READY) - GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuErrorMsg (r)); - - r = cuStreamSynchronize (s->stream); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r)); - } - } - - GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock); - - event_gc (true); -} - -static void -PTX_wait_all_async (int async) -{ - CUresult r; - struct PTX_stream *waiting_stream, *other_stream; - CUevent *e; - struct nvptx_thread *nvthd = nvptx_thread (); - pthread_t self = pthread_self (); - - /* The stream doing the waiting. This could be the first mention of the - stream, so create it if necessary. 
*/ - waiting_stream - = select_stream_for_async (async, pthread_self (), true, NULL); - - /* Launches on the null stream already block on other streams in the - context. */ - if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream) - return; - - event_gc (true); - - GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock); - - for (other_stream = nvthd->ptx_dev->active_streams; - other_stream != NULL; - other_stream = other_stream->next) - { - if (!other_stream->multithreaded - && !pthread_equal (other_stream->host_thread, self)) - continue; - - e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); - - r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r)); - - /* Record an event on the waited-for stream. */ - r = cuEventRecord (*e, other_stream->stream); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r)); - - event_add (PTX_EVT_SYNC, e, NULL); - - r = cuStreamWaitEvent (waiting_stream->stream, *e, 0); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuErrorMsg (r)); - } - - GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock); -} - -static void * -PTX_get_current_cuda_device (void) -{ - struct nvptx_thread *nvthd = nvptx_thread (); - - if (!nvthd || !nvthd->ptx_dev) - return NULL; - - return &nvthd->ptx_dev->dev; -} - -static void * -PTX_get_current_cuda_context (void) -{ - struct nvptx_thread *nvthd = nvptx_thread (); - - if (!nvthd || !nvthd->ptx_dev) - return NULL; - - return nvthd->ptx_dev->ctx; -} - -static void * -PTX_get_cuda_stream (int async) -{ - struct PTX_stream *s; - struct nvptx_thread *nvthd = nvptx_thread (); - - if (!nvthd || !nvthd->ptx_dev) - return NULL; - - s = select_stream_for_async (async, pthread_self (), false, NULL); - - return s ? 
s->stream : NULL; -} - -static int -PTX_set_cuda_stream (int async, void *stream) -{ - struct PTX_stream *oldstream; - pthread_t self = pthread_self (); - struct nvptx_thread *nvthd = nvptx_thread (); - - GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock); - - if (async < 0) - GOMP_PLUGIN_fatal ("bad async %d", async); - - /* We have a list of active streams and an array mapping async values to - entries of that list. We need to take "ownership" of the passed-in stream, - and add it to our list, removing the previous entry also (if there was one) - in order to prevent resource leaks. Note the potential for surprise - here: maybe we should keep track of passed-in streams and leave it up to - the user to tidy those up, but that doesn't work for stream handles - returned from acc_get_cuda_stream above... */ - - oldstream = select_stream_for_async (async, self, false, NULL); - - if (oldstream) - { - if (nvthd->ptx_dev->active_streams == oldstream) - nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next; - else - { - struct PTX_stream *s = nvthd->ptx_dev->active_streams; - while (s->next != oldstream) - s = s->next; - s->next = s->next->next; - } - - cuStreamDestroy (oldstream->stream); - map_fini (oldstream); - free (oldstream); - } - - GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock); - - (void) select_stream_for_async (async, self, true, (CUstream) stream); - - return 1; -} - -/* Plugin entry points. 
*/ - - -int -GOMP_OFFLOAD_get_type (void) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__); -#endif - - return TARGET_TYPE_NVIDIA_PTX; -} - -unsigned int -GOMP_OFFLOAD_get_caps (void) -{ - return TARGET_CAP_OPENACC_200; -} - -const char * -GOMP_OFFLOAD_get_name (void) -{ - return "nvidia"; -} - -int -GOMP_OFFLOAD_get_num_devices (void) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__); -#endif - - return PTX_get_num_devices (); -} - -static void **kernel_target_data; -static void **kernel_host_table; - -void -GOMP_OFFLOAD_register_image (void *host_table, void *target_data) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s (%p, %p)\n", __FILE__, __FUNCTION__, - host_table, target_data); -#endif - - kernel_target_data = target_data; - kernel_host_table = host_table; -} - -int -GOMP_OFFLOAD_init_device (void) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__); -#endif - - return PTX_init (); -} - -int -GOMP_OFFLOAD_fini_device (void) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__); -#endif - - return PTX_fini (); -} - -int -GOMP_OFFLOAD_get_table (struct mapping_table **tablep) -{ - CUmodule module; - void **fn_table; - char **fn_names; - int fn_entries, i; - CUresult r; - struct targ_fn_descriptor *targ_fns; - -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__, - tablep); -#endif - - if (PTX_init () <= 0) - return 0; - - /* This isn't an error, because an image may legitimately have no offloaded - regions and so will not call GOMP_offload_register. 
*/ - if (kernel_target_data == NULL) - return 0; - - link_ptx (&module, kernel_target_data[0]); - - /* kernel_target_data[0] -> ptx code - kernel_target_data[1] -> variable mappings - kernel_target_data[2] -> array of kernel names in ascii - - kernel_host_table[0] -> start of function addresses (_omp_func_table) - kernel_host_table[1] -> end of function addresses (_omp_funcs_end) - - The array of kernel names and the functions addresses form a - one-to-one correspondence. */ - - fn_table = kernel_host_table[0]; - fn_names = (char **) kernel_target_data[2]; - fn_entries = (kernel_host_table[1] - kernel_host_table[0]) / sizeof (void *); - - *tablep = GOMP_PLUGIN_malloc (sizeof (struct mapping_table) * fn_entries); - targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor) - * fn_entries); - - for (i = 0; i < fn_entries; i++) - { - CUfunction function; - - r = cuModuleGetFunction (&function, module, fn_names[i]); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuErrorMsg (r)); - - targ_fns[i].fn = function; - targ_fns[i].name = (const char *) fn_names[i]; - - (*tablep)[i].host_start = (uintptr_t) fn_table[i]; - (*tablep)[i].host_end = (*tablep)[i].host_start + 1; - (*tablep)[i].tgt_start = (uintptr_t) &targ_fns[i]; - (*tablep)[i].tgt_end = (*tablep)[i].tgt_start + 1; - } - - return fn_entries; -} - -void * -GOMP_OFFLOAD_alloc (size_t size) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s (%zu)\n", __FILE__, __FUNCTION__, - size); -#endif - - return PTX_alloc (size); -} - -void -GOMP_OFFLOAD_free (void *ptr) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__, ptr); -#endif - - PTX_free (ptr); -} - -void * -GOMP_OFFLOAD_dev2host (void *dst, const void *src, size_t n) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s (%p, %p, %zu)\n", __FILE__, - __FUNCTION__, dst, - src, n); -#endif - - return PTX_dev2host (dst, src, n); -} - -void * -GOMP_OFFLOAD_host2dev (void *dst, 
const void *src, size_t n) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s (%p, %p, %zu)\n", __FILE__, - __FUNCTION__, dst, src, n); -#endif - - return PTX_host2dev (dst, src, n); -} - -void (*device_run) (void *fn_ptr, void *vars) = NULL; - -void -GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum, - void **hostaddrs, void **devaddrs, size_t *sizes, - unsigned short *kinds, int num_gangs, - int num_workers, int vector_length, int async, - void *targ_mem_desc) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s (%p, %zu, %p, %p, %p, %d, %d, %d, " - "%d, %p)\n", __FILE__, __FUNCTION__, fn, mapnum, hostaddrs, sizes, - kinds, num_gangs, num_workers, vector_length, async, targ_mem_desc); -#endif - - PTX_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs, - num_workers, vector_length, async, targ_mem_desc); -} - -void * -GOMP_OFFLOAD_openacc_open_device (int n) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__, n); -#endif - return PTX_open_device (n); -} - -int -GOMP_OFFLOAD_openacc_close_device (void *h) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__, h); -#endif - return PTX_close_device (h); -} - -void -GOMP_OFFLOAD_openacc_set_device_num (int n) -{ - struct nvptx_thread *nvthd = nvptx_thread (); - - assert (n >= 0); - - if (!nvthd->ptx_dev || nvthd->ptx_dev->ord != n) - (void) PTX_open_device (n); -} - -/* This can be called before the device is "opened" for the current thread, in - which case we can't tell which device number should be returned. We don't - actually want to open the device here, so just return -1 and let the caller - (oacc-init.c:acc_get_device_num) handle it. 
*/ - -int -GOMP_OFFLOAD_openacc_get_device_num (void) -{ - struct nvptx_thread *nvthd = nvptx_thread (); - - if (nvthd && nvthd->ptx_dev) - return nvthd->ptx_dev->ord; - else - return -1; -} - -bool -GOMP_OFFLOAD_openacc_avail (void) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__); -#endif - return PTX_avail (); -} - -void -GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc) -{ - CUevent *e; - CUresult r; - struct nvptx_thread *nvthd = nvptx_thread (); - -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__, - targ_mem_desc); -#endif - - e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); - - r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r)); - - r = cuEventRecord (*e, nvthd->current_stream->stream); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r)); - - event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc); -} - -int -GOMP_OFFLOAD_openacc_async_test (int async) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__, - async); -#endif - return PTX_async_test (async); -} - -int -GOMP_OFFLOAD_openacc_async_test_all (void) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__); -#endif - return PTX_async_test_all (); -} - -void -GOMP_OFFLOAD_openacc_async_wait (int async) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__, - async); -#endif - PTX_wait (async); -} - -void -GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s (%d, %d)\n", __FILE__, __FUNCTION__, - async1, async2); -#endif - PTX_wait_async (async1, async2); -} - -void -GOMP_OFFLOAD_openacc_async_wait_all (void) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__); -#endif - PTX_wait_all 
(); -} - -void -GOMP_OFFLOAD_openacc_async_wait_all_async (int async) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__, - async); -#endif - PTX_wait_all_async (async); -} - -void -GOMP_OFFLOAD_openacc_async_set_async (int async) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__, - async); -#endif - PTX_set_async (async); -} - -void * -GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data) -{ - struct PTX_device *ptx_dev = (struct PTX_device *) targ_data; - struct nvptx_thread *nvthd - = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread)); - CUresult r; - CUcontext thd_ctx; - - r = cuCtxGetCurrent (&thd_ctx); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuErrorMsg (r)); - - assert (ptx_dev->ctx); - - if (!thd_ctx) - { - r = cuCtxPushCurrent (ptx_dev->ctx); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuErrorMsg (r)); - } - - nvthd->current_stream = ptx_dev->null_stream; - nvthd->ptx_dev = ptx_dev; - - return (void *) nvthd; -} - -void -GOMP_OFFLOAD_openacc_destroy_thread_data (void *data) -{ - free (data); -} - -void * -GOMP_OFFLOAD_openacc_get_current_cuda_device (void) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__); -#endif - return PTX_get_current_cuda_device (); -} - -void * -GOMP_OFFLOAD_openacc_get_current_cuda_context (void) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__); -#endif - return PTX_get_current_cuda_context (); -} - -/* NOTE: This returns a CUstream, not a PTX_stream pointer. */ - -void * -GOMP_OFFLOAD_openacc_get_cuda_stream (int async) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__, - async); -#endif - return PTX_get_cuda_stream (async); -} - -/* NOTE: This takes a CUstream, not a PTX_stream pointer. 
*/ - -int -GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream) -{ -#ifdef DEBUG - fprintf (stderr, "libgomp plugin: %s:%s (%d, %p)\n", __FILE__, __FUNCTION__, - async, stream); -#endif - return PTX_set_cuda_stream (async, stream); -} diff --git a/libgomp/plugin/Makefile.am b/libgomp/plugin/Makefile.am new file mode 100644 index 0000000..59a5b95 --- /dev/null +++ b/libgomp/plugin/Makefile.am @@ -0,0 +1,64 @@ +# Plugins for offload execution. +# +# Copyright (C) 2014 Free Software Foundation, Inc. +# +# Contributed by Mentor Embedded. +# +# This file is part of the GNU OpenMP Library (libgomp). +# +# Libgomp is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. +# +# Under Section 7 of GPL version 3, you are granted additional +# permissions described in the GCC Runtime Library Exception, version +# 3.1, as published by the Free Software Foundation. +# +# You should have received a copy of the GNU General Public License and +# a copy of the GCC Runtime Library Exception along with this program; +# see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +# <http://www.gnu.org/licenses/>. + +ACLOCAL_AMFLAGS = -I ../.. -I ../../config + +config_path = @config_path@ +search_path = .. $(addprefix $(top_srcdir)/../config/, $(config_path)) \ + $(top_srcdir) $(top_srcdir)/../../include $(top_srcdir)/.. + +AM_CPPFLAGS = $(addprefix -I, $(search_path)) +AM_CFLAGS = $(XCFLAGS) +AM_LDFLAGS = $(XLDFLAGS) $(SECTION_LDFLAGS) $(OPT_LDFLAGS) + +toolexeclib_LTLIBRARIES = + +if PLUGIN_NVPTX +# Nvidia PTX OpenACC plugin. 
+libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION) +toolexeclib_LTLIBRARIES += libgomp-plugin-nvptx.la +libgomp_plugin_nvptx_la_SOURCES = plugin-nvptx.c +libgomp_plugin_nvptx_la_CPPFLAGS = $(AM_CPPFLAGS) $(PLUGIN_NVPTX_CPPFLAGS) +libgomp_plugin_nvptx_la_LDFLAGS = $(libgomp_plugin_nvptx_version_info) \ + $(lt_host_flags) +libgomp_plugin_nvptx_la_LDFLAGS += $(PLUGIN_NVPTX_LDFLAGS) +libgomp_plugin_nvptx_la_LIBADD = $(PLUGIN_NVPTX_LIBS) +libgomp_plugin_nvptx_la_LIBTOOLFLAGS = --tag=disable-static +endif + +libgomp_plugin_host_nonshm_version_info = -version-info $(libtool_VERSION) +toolexeclib_LTLIBRARIES += libgomp-plugin-host_nonshm.la +libgomp_plugin_host_nonshm_la_SOURCES = plugin-host.c +libgomp_plugin_host_nonshm_la_CPPFLAGS = $(AM_CPPFLAGS) -DHOST_NONSHM_PLUGIN +libgomp_plugin_host_nonshm_la_LDFLAGS = \ + $(libgomp_plugin_host_nonshm_version_info) $(lt_host_flags) +libgomp_plugin_host_nonshm_la_LIBTOOLFLAGS = --tag=disable-static + +LTLDFLAGS = $(shell $(SHELL) $(top_srcdir)/../../libtool-ldflags $(LDFLAGS)) + +LINK = $(LIBTOOL) --tag CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \ + $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LTLDFLAGS) -o $@ diff --git a/libgomp/plugin/Makefile.in b/libgomp/plugin/Makefile.in new file mode 100644 index 0000000..8a0853b diff --git a/libgomp/plugin/aclocal.m4 b/libgomp/plugin/aclocal.m4 new file mode 100644 index 0000000..06820b7 diff --git a/libgomp/plugin/config.h.in b/libgomp/plugin/config.h.in new file mode 100644 index 0000000..d044b92 diff --git a/libgomp/plugin/configure b/libgomp/plugin/configure new file mode 100644 index 0000000..7acb216 diff --git a/libgomp/plugin/configure.ac b/libgomp/plugin/configure.ac new file mode 100644 index 0000000..550696c --- /dev/null +++ b/libgomp/plugin/configure.ac @@ -0,0 +1,178 @@ +# Plugins for offload execution. +# +# Copyright (C) 2014 Free Software Foundation, Inc. +# +# Contributed by Mentor Embedded. 
+#
+# This file is part of the GNU OpenMP Library (libgomp).
+#
+# Libgomp is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+# more details.
+#
+# Under Section 7 of GPL version 3, you are granted additional
+# permissions described in the GCC Runtime Library Exception, version
+# 3.1, as published by the Free Software Foundation.
+#
+# You should have received a copy of the GNU General Public License and
+# a copy of the GCC Runtime Library Exception along with this program;
+# see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+# <http://www.gnu.org/licenses/>.
+
+AC_PREREQ([2.64])
+AC_INIT([Plugins for GNU OpenMP Runtime Library], [1.0], ,[libgomp-plugins])
+AC_CONFIG_HEADER([config.h])
+
+AC_CANONICAL_SYSTEM
+target_alias=${target_alias-$host_alias}
+AC_SUBST(target_alias)
+
+AM_INIT_AUTOMAKE([1.9.0 foreign no-dist])
+
+AM_MAINTAINER_MODE
+
+AC_PROG_CC
+AC_PROG_CXX
+
+# Let the user choose whether the plugins are installed in a
+# compiler-version-specific directory.  Only "yes" and "no" are accepted;
+# the default is "no".
+AC_MSG_CHECKING([for --enable-version-specific-runtime-libs])
+AC_ARG_ENABLE([version-specific-runtime-libs],
+  [AS_HELP_STRING([--enable-version-specific-runtime-libs],
+    [Specify that runtime libraries should be installed in a compiler-specific directory])],
+  [case "$enableval" in
+    yes) enable_version_specific_runtime_libs=yes ;;
+    no) enable_version_specific_runtime_libs=no ;;
+    *) AC_MSG_ERROR([Unknown argument to enable/disable version-specific libs]);;
+  esac],
+  [enable_version_specific_runtime_libs=no])
+AC_MSG_RESULT($enable_version_specific_runtime_libs)
+
+# Calculate toolexeclibdir
+# Also toolexecdir, though it's only used in toolexeclibdir
+case
${enable_version_specific_runtime_libs} in + yes) + # Need the gcc compiler version to know where to install libraries + # and header files if --enable-version-specific-runtime-libs option + # is selected. + toolexecdir='$(libdir)/gcc/$(target_alias)' + toolexeclibdir='$(toolexecdir)/$(gcc_version)$(MULTISUBDIR)' + ;; + no) + if test -n "$with_cross_host" && + test x"$with_cross_host" != x"no"; then + # Install a library built with a cross compiler in tooldir, not libdir. + toolexecdir='$(exec_prefix)/$(target_alias)' + toolexeclibdir='$(toolexecdir)/lib' + else + toolexecdir='$(libdir)/gcc-lib/$(target_alias)' + toolexeclibdir='$(libdir)' + fi + multi_os_directory=`$CC -print-multi-os-directory` + case $multi_os_directory in + .) ;; # Avoid trailing /. + *) toolexeclibdir=$toolexeclibdir/$multi_os_directory ;; + esac + ;; +esac + +AC_LIBTOOL_DLOPEN +AM_PROG_LIBTOOL +# Forbid libtool to hardcode RPATH, because we want to be able to specify +# library search directory using LD_LIBRARY_PATH +hardcode_into_libs=no +AC_SUBST(toolexecdir) +AC_SUBST(toolexeclibdir) + +libtool_VERSION=1:0:0 +AC_SUBST(libtool_VERSION) + +# Look for the CUDA driver package. +CUDA_DRIVER_INCLUDE= +CUDA_DRIVER_LIB= +AC_SUBST(CUDA_DRIVER_INCLUDE) +AC_SUBST(CUDA_DRIVER_LIB) +CUDA_DRIVER_CPPFLAGS= +CUDA_DRIVER_LDFLAGS= +AC_ARG_WITH(cuda-driver, + [AS_HELP_STRING([--with-cuda-driver=PATH], + [specify prefix directory for installed CUDA driver package. 
+ Equivalent to --with-cuda-driver-include=PATH/include + plus --with-cuda-driver-lib=PATH/lib])]) +AC_ARG_WITH(cuda-driver-include, + [AS_HELP_STRING([--with-cuda-driver-include=PATH], + [specify directory for installed CUDA driver include files])]) +AC_ARG_WITH(cuda-driver-lib, + [AS_HELP_STRING([--with-cuda-driver-lib=PATH], + [specify directory for the installed CUDA driver library])]) +if test "x$with_cuda_driver" != x; then + CUDA_DRIVER_INCLUDE=$with_cuda_driver/include + CUDA_DRIVER_LIB=$with_cuda_driver/lib +fi +if test "x$with_cuda_driver_include" != x; then + CUDA_DRIVER_INCLUDE=$with_cuda_driver_include +fi +if test "x$with_cuda_driver_lib" != x; then + CUDA_DRIVER_LIB=$with_cuda_driver_lib +fi +if test "x$CUDA_DRIVER_INCLUDE" != x; then + CUDA_DRIVER_CPPFLAGS=-I$CUDA_DRIVER_INCLUDE +fi +if test "x$CUDA_DRIVER_LIB" != x; then + CUDA_DRIVER_LDFLAGS=-L$CUDA_DRIVER_LIB +fi + +PLUGIN_NVPTX=0 +PLUGIN_NVPTX_CPPFLAGS= +PLUGIN_NVPTX_LDFLAGS= +PLUGIN_NVPTX_LIBS= +AC_SUBST(PLUGIN_NVPTX) +AC_SUBST(PLUGIN_NVPTX_CPPFLAGS) +AC_SUBST(PLUGIN_NVPTX_LDFLAGS) +AC_SUBST(PLUGIN_NVPTX_LIBS) + +for accel in `echo $enable_offload_targets | sed -e 's#,# #g'`; do + case "$accel" in + nvptx*) + PLUGIN_NVPTX=$accel + PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS + PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS + PLUGIN_NVPTX_LIBS='-lcuda' + + PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS + CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS" + PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS + LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS" + PLUGIN_NVPTX_save_LIBS=$LIBS + LIBS="$PLUGIN_NVPTX_LIBS $LIBS" + AC_LINK_IFELSE( + [AC_LANG_PROGRAM( + [#include "cuda.h"], + [CUresult r = cuCtxPushCurrent (NULL);])], + [PLUGIN_NVPTX=1]) + CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS + LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS + LIBS=$PLUGIN_NVPTX_save_LIBS + case $PLUGIN_NVPTX in + nvptx*) + PLUGIN_NVPTX=0 + AC_MSG_ERROR([CUDA driver package required for nvptx support]) + ;; + esac + ;; + esac +done +AM_CONDITIONAL([PLUGIN_NVPTX], [test 
$PLUGIN_NVPTX = 1]) +AC_DEFINE_UNQUOTED([PLUGIN_NVPTX], [$PLUGIN_NVPTX], + [Define to 1 if the NVIDIA plugin is built, 0 if not.]) + +. ${srcdir}/../configure.tgt + +AC_SUBST(config_path) + +AC_CONFIG_FILES([Makefile]) +AC_OUTPUT diff --git a/libgomp/plugin/plugin-host.c b/libgomp/plugin/plugin-host.c new file mode 100644 index 0000000..4fbccc2 --- /dev/null +++ b/libgomp/plugin/plugin-host.c @@ -0,0 +1,454 @@ +/* OpenACC Runtime Library: acc_device_host, acc_device_host_nonshm. + + Copyright (C) 2013-2014 Free Software Foundation, Inc. + + Contributed by Mentor Embedded. + + This file is part of the GNU OpenMP Library (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* Simple implementation of support routines for a shared-memory + acc_device_host, and a non-shared memory acc_device_host_nonshm, with the + latter built as a plugin. 
*/ + +#include "openacc.h" +#include "config.h" +#include "libgomp.h" +#include "target.h" +#ifdef HOST_NONSHM_PLUGIN +#include "libgomp-plugin.h" +#include "oacc-plugin.h" +#else +#include "oacc-int.h" +#endif + +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> + +#ifdef HOST_NONSHM_PLUGIN +#define STATIC +#define GOMP(X) GOMP_PLUGIN_##X +#define SELF "host_nonshm plugin: " +#else +#define STATIC static +#define GOMP(X) gomp_##X +#define SELF "host: " +#endif + +#ifndef HOST_NONSHM_PLUGIN +static struct gomp_device_descr host_dispatch; +#endif + +STATIC const char * +GOMP_OFFLOAD_get_name (void) +{ +#ifdef DEBUG + fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__); +#endif + +#ifdef HOST_NONSHM_PLUGIN + return "host_nonshm"; +#else + return "host"; +#endif +} + +STATIC int +GOMP_OFFLOAD_get_type (void) +{ +#ifdef DEBUG + fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__); +#endif + +#ifdef HOST_NONSHM_PLUGIN + return TARGET_TYPE_HOST_NONSHM; +#else + return TARGET_TYPE_HOST; +#endif +} + +STATIC unsigned int +GOMP_OFFLOAD_get_caps (void) +{ + unsigned int caps = TARGET_CAP_OPENACC_200 | TARGET_CAP_OPENMP_400 + | TARGET_CAP_NATIVE_EXEC; + +#ifndef HOST_NONSHM_PLUGIN + caps |= TARGET_CAP_SHARED_MEM; +#endif + +#ifdef DEBUG + fprintf (stderr, SELF "%s:%s: 0x%x\n", __FILE__, __FUNCTION__, caps); +#endif + + return caps; +} + +STATIC int +GOMP_OFFLOAD_get_num_devices (void) +{ +#ifdef DEBUG + fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__); +#endif + + return 1; +} + +STATIC void +GOMP_OFFLOAD_register_image (void *host_table, void *target_data) +{ +#ifdef DEBUG + fprintf (stderr, SELF "%s:%s (%p, %p)\n", __FILE__, __FUNCTION__, host_table, + target_data); +#endif +} + +STATIC int +GOMP_OFFLOAD_init_device (void) +{ +#ifdef DEBUG + fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__); +#endif + + return GOMP_OFFLOAD_get_num_devices (); +} + +STATIC int +GOMP_OFFLOAD_fini_device (void) +{ +#ifdef DEBUG + fprintf 
(stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+  return 0;
+}
+
+/* The host device has no offload function table: leave *TABLE untouched
+   and report zero entries.  */
+STATIC int
+GOMP_OFFLOAD_get_table (struct mapping_table **table)
+{
+#ifdef DEBUG
+  fprintf (stderr, SELF "%s:%s (%p)\n", __FILE__, __FUNCTION__, table);
+#endif
+
+  return 0;
+}
+
+/* Host execution is always reported as available.  */
+STATIC bool
+GOMP_OFFLOAD_openacc_avail (void)
+{
+#ifdef DEBUG
+  fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+  return 1;
+}
+
+/* "Open" host device N.  The returned handle is simply N converted to a
+   pointer; nothing is allocated.  */
+STATIC void *
+GOMP_OFFLOAD_openacc_open_device (int n)
+{
+#ifdef DEBUG
+  /* N is a signed int, so print it with %d.  */
+  fprintf (stderr, SELF "%s:%s (%d)\n", __FILE__, __FUNCTION__, n);
+#endif
+
+  return (void *) (intptr_t) n;
+}
+
+/* Close the handle HND returned by the open hook.  Nothing to release;
+   always returns 0.  */
+STATIC int
+GOMP_OFFLOAD_openacc_close_device (void *hnd)
+{
+#ifdef DEBUG
+  fprintf (stderr, SELF "%s:%s (%p)\n", __FILE__, __FUNCTION__, hnd);
+#endif
+
+  return 0;
+}
+
+/* The current device number for host execution is always 0.  */
+STATIC int
+GOMP_OFFLOAD_openacc_get_device_num (void)
+{
+#ifdef DEBUG
+  fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+  return 0;
+}
+
+/* Select host device N.  Any N greater than zero is a fatal error; other
+   values are accepted without further action.  */
+STATIC void
+GOMP_OFFLOAD_openacc_set_device_num (int n)
+{
+#ifdef DEBUG
+  fprintf (stderr, SELF "%s:%s (%d)\n", __FILE__, __FUNCTION__, n);
+#endif
+
+  if (n > 0)
+    GOMP(fatal) ("device number %d out of range for host execution", n);
+}
+
+/* Allocate S bytes of "device" memory, which for the host is ordinary
+   memory obtained through the GOMP malloc wrapper.  */
+STATIC void *
+GOMP_OFFLOAD_alloc (size_t s)
+{
+  void *ptr = GOMP(malloc) (s);
+
+#ifdef DEBUG
+  /* S is a size_t, so print it with %zu.  */
+  fprintf (stderr, SELF "%s:%s (%zu): %p\n", __FILE__, __FUNCTION__, s, ptr);
+#endif
+
+  return ptr;
+}
+
+/* Release memory P previously returned by GOMP_OFFLOAD_alloc.  */
+STATIC void
+GOMP_OFFLOAD_free (void *p)
+{
+#ifdef DEBUG
+  fprintf (stderr, SELF "%s:%s (%p)\n", __FILE__, __FUNCTION__, p);
+#endif
+
+  free (p);
+}
+
+/* Copy S bytes from host address H to "device" address D.  The copy is
+   only performed when built as the non-shared-memory plugin; otherwise
+   this is a no-op.  Always returns a null pointer.  */
+STATIC void *
+GOMP_OFFLOAD_host2dev (void *d, const void *h, size_t s)
+{
+#ifdef DEBUG
+  fprintf (stderr, SELF "%s:%s (%p, %p, %zu)\n", __FILE__, __FUNCTION__, d, h,
+           s);
+#endif
+
+#ifdef HOST_NONSHM_PLUGIN
+  memcpy (d, h, s);
+#endif
+
+  return 0;
+}
+
+STATIC void *
+GOMP_OFFLOAD_dev2host (void *h, const void *d, size_t s)
+{
+#ifdef DEBUG
+  fprintf (stderr, SELF "%s:%s (%p, %p, %zu)\n", __FILE__, __FUNCTION__, h, d,
+           s);
+#endif
+
+#ifdef
HOST_NONSHM_PLUGIN + memcpy (h, d, s); +#endif + + return 0; +} + +STATIC void +GOMP_OFFLOAD_run (void *fn_ptr, void *vars) +{ +#ifdef DEBUG + fprintf (stderr, SELF "%s:%s (%p, %p)\n", __FILE__, __FUNCTION__, fn_ptr, + vars); +#endif + + void (*fn)(void *) = (void (*)(void *)) fn_ptr; + + fn (vars); +} + +STATIC void +GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), + size_t mapnum __attribute__((unused)), + void **hostaddrs __attribute__((unused)), + void **devaddrs __attribute__((unused)), + size_t *sizes __attribute__((unused)), + unsigned short *kinds __attribute__((unused)), + int num_gangs __attribute__((unused)), + int num_workers __attribute__((unused)), + int vector_length __attribute__((unused)), + int async __attribute__((unused)), + void *targ_mem_desc __attribute__((unused))) +{ +#ifdef DEBUG + fprintf (stderr, SELF "%s:%s (%p, %zu, %p, %p, %p, %d, %d, %d, %d, %p)\n", + __FILE__, __FUNCTION__, fn, mapnum, hostaddrs, sizes, kinds, + num_gangs, num_workers, vector_length, async, targ_mem_desc); +#endif + +#ifdef HOST_NONSHM_PLUGIN + fn (devaddrs); +#else + fn (hostaddrs); +#endif +} + +STATIC void +GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc) +{ +#ifdef HOST_NONSHM_PLUGIN + /* "Asynchronous" launches are executed synchronously on the (non-SHM) host, + so there's no point in delaying host-side cleanup -- just do it now. 
*/ + GOMP_PLUGIN_async_unmap_vars (targ_mem_desc); +#endif +} + +STATIC void +GOMP_OFFLOAD_openacc_async_set_async (int async __attribute__((unused))) +{ +#ifdef DEBUG + fprintf (stderr, SELF "%s:%s (%d)\n", __FILE__, __FUNCTION__, async); +#endif +} + +STATIC int +GOMP_OFFLOAD_openacc_async_test (int async __attribute__((unused))) +{ +#ifdef DEBUG + fprintf (stderr, SELF "%s:%s (%d)\n", __FILE__, __FUNCTION__, async); +#endif + + return 1; +} + +STATIC int +GOMP_OFFLOAD_openacc_async_test_all (void) +{ +#ifdef DEBUG + fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__); +#endif + + return 1; +} + +STATIC void +GOMP_OFFLOAD_openacc_async_wait (int async __attribute__((unused))) +{ +#ifdef DEBUG + fprintf (stderr, SELF "%s:%s (%d)\n", __FILE__, __FUNCTION__, async); +#endif +} + +STATIC void +GOMP_OFFLOAD_openacc_async_wait_all (void) +{ +#ifdef DEBUG + fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__); +#endif +} + +STATIC void +GOMP_OFFLOAD_openacc_async_wait_async (int async1 __attribute__((unused)), + int async2 __attribute__((unused))) +{ +#ifdef DEBUG + fprintf (stderr, SELF "%s:%s (%d, %d)\n", __FILE__, __FUNCTION__, async1, + async2); +#endif +} + +STATIC void +GOMP_OFFLOAD_openacc_async_wait_all_async (int async __attribute__((unused))) +{ +#ifdef DEBUG + fprintf (stderr, SELF "%s:%s (%d)\n", __FILE__, __FUNCTION__, async); +#endif +} + +STATIC void * +GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data __attribute__((unused))) +{ + return NULL; +} + +STATIC void +GOMP_OFFLOAD_openacc_destroy_thread_data (void *tls_data __attribute__((unused))) +{ +} + +#ifndef HOST_NONSHM_PLUGIN +static struct gomp_device_descr host_dispatch = + { + .name = "host", + + .type = TARGET_TYPE_HOST, + .capabilities = TARGET_CAP_OPENACC_200 | TARGET_CAP_NATIVE_EXEC + | TARGET_CAP_SHARED_MEM, + .id = 0, + + .is_initialized = false, + .offload_regions_registered = false, + + .get_name_func = GOMP_OFFLOAD_get_name, + .get_type_func = GOMP_OFFLOAD_get_type, + 
.get_caps_func = GOMP_OFFLOAD_get_caps, + + .init_device_func = GOMP_OFFLOAD_init_device, + .fini_device_func = GOMP_OFFLOAD_fini_device, + .get_num_devices_func = GOMP_OFFLOAD_get_num_devices, + .register_image_func = GOMP_OFFLOAD_register_image, + .get_table_func = GOMP_OFFLOAD_get_table, + + .alloc_func = GOMP_OFFLOAD_alloc, + .free_func = GOMP_OFFLOAD_free, + .host2dev_func = GOMP_OFFLOAD_host2dev, + .dev2host_func = GOMP_OFFLOAD_dev2host, + + .run_func = GOMP_OFFLOAD_run, + + .openacc = { + .open_device_func = GOMP_OFFLOAD_openacc_open_device, + .close_device_func = GOMP_OFFLOAD_openacc_close_device, + + .get_device_num_func = GOMP_OFFLOAD_openacc_get_device_num, + .set_device_num_func = GOMP_OFFLOAD_openacc_set_device_num, + + /* Device available. */ + .avail_func = GOMP_OFFLOAD_openacc_avail, + + .exec_func = GOMP_OFFLOAD_openacc_parallel, + + .register_async_cleanup_func + = GOMP_OFFLOAD_openacc_register_async_cleanup, + + .async_set_async_func = GOMP_OFFLOAD_openacc_async_set_async, + .async_test_func = GOMP_OFFLOAD_openacc_async_test, + .async_test_all_func = GOMP_OFFLOAD_openacc_async_test_all, + .async_wait_func = GOMP_OFFLOAD_openacc_async_wait, + .async_wait_async_func = GOMP_OFFLOAD_openacc_async_wait_async, + .async_wait_all_func = GOMP_OFFLOAD_openacc_async_wait_all, + .async_wait_all_async_func = GOMP_OFFLOAD_openacc_async_wait_all_async, + + .create_thread_data_func = GOMP_OFFLOAD_openacc_create_thread_data, + .destroy_thread_data_func = GOMP_OFFLOAD_openacc_destroy_thread_data, + + .cuda = { + .get_current_device_func = NULL, + .get_current_context_func = NULL, + .get_stream_func = NULL, + .set_stream_func = NULL, + } + } + }; + +/* Register this device type. 
*/ +static __attribute__ ((constructor)) +void ACC_host_init (void) +{ + gomp_mutex_init (&host_dispatch.mem_map.lock); + ACC_register (&host_dispatch); +} +#endif + diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c new file mode 100644 index 0000000..4271c69 --- /dev/null +++ b/libgomp/plugin/plugin-nvptx.c @@ -0,0 +1,1909 @@ +/* Plugin for NVPTX execution. + + Copyright (C) 2013-2014 Free Software Foundation, Inc. + + Contributed by Mentor Embedded. + + This file is part of the GNU OpenMP Library (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* Nvidia PTX-specific parts of OpenACC support. The cuda driver + library appears to hold some implicit state, but the documentation + is not clear as to what that state might be. Or how one might + propagate it from one thread to another. 
*/ + +#include "openacc.h" +#include "config.h" +#include "libgomp.h" +#include "target.h" +#include "libgomp-plugin.h" +#include "oacc-ptx.h" +#include "oacc-plugin.h" + +#include <cuda.h> +#include <stdint.h> +#include <string.h> +#include <stdio.h> +#include <dlfcn.h> +#include <unistd.h> +#include <assert.h> + +#define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0])) + +static struct _errlist +{ + CUresult r; + char *m; +} cuErrorList[] = { + { CUDA_ERROR_INVALID_VALUE, "invalid value" }, + { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" }, + { CUDA_ERROR_NOT_INITIALIZED, "not initialized" }, + { CUDA_ERROR_DEINITIALIZED, "deinitialized" }, + { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" }, + { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" }, + { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" }, + { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" }, + { CUDA_ERROR_NO_DEVICE, "no device" }, + { CUDA_ERROR_INVALID_DEVICE, "invalid device" }, + { CUDA_ERROR_INVALID_IMAGE, "invalid image" }, + { CUDA_ERROR_INVALID_CONTEXT, "invalid context" }, + { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" }, + { CUDA_ERROR_MAP_FAILED, "map error" }, + { CUDA_ERROR_UNMAP_FAILED, "unmap error" }, + { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" }, + { CUDA_ERROR_ALREADY_MAPPED, "already mapped" }, + { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" }, + { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" }, + { CUDA_ERROR_NOT_MAPPED, "not mapped" }, + { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" }, + { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" }, + { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" }, + { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" }, + { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" }, + { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" }, + { CUDA_ERROR_INVALID_SOURCE, "invalid source" }, + { CUDA_ERROR_FILE_NOT_FOUND, "file not found" }, + { 
CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + "shared object symbol not found" }, + { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" }, + { CUDA_ERROR_OPERATING_SYSTEM, "operating system" }, + { CUDA_ERROR_INVALID_HANDLE, "invalid handle" }, + { CUDA_ERROR_NOT_FOUND, "not found" }, + { CUDA_ERROR_NOT_READY, "not ready" }, + { CUDA_ERROR_LAUNCH_FAILED, "launch error" }, + { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" }, + { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" }, + { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + "launch incompatibe texturing" }, + { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" }, + { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled " }, + { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary cotext active" }, + { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" }, + { CUDA_ERROR_ASSERT, "assert" }, + { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" }, + { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED, + "host memory already registered" }, + { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" }, + { CUDA_ERROR_NOT_PERMITTED, "no permitted" }, + { CUDA_ERROR_NOT_SUPPORTED, "not supported" }, + { CUDA_ERROR_UNKNOWN, "unknown" } +}; + +static char errmsg[128]; + +static char * +cuErrorMsg (CUresult r) +{ + int i; + + for (i = 0; i < ARRAYSIZE (cuErrorList); i++) + { + if (cuErrorList[i].r == r) + return &cuErrorList[i].m[0]; + } + + sprintf (&errmsg[0], "unknown result code: %5d", r); + + return &errmsg[0]; +} + +struct targ_fn_descriptor +{ + CUfunction fn; + const char *name; +}; + +static bool PTX_inited = false; + +struct PTX_stream +{ + CUstream stream; + pthread_t host_thread; + bool multithreaded; + + CUdeviceptr d; + void *h; + void *h_begin; + void *h_end; + void *h_next; + void *h_prev; + void *h_tail; + + struct PTX_stream *next; +}; + +/* Thread-specific data for PTX. 
*/ + +struct nvptx_thread +{ + struct PTX_stream *current_stream; + struct PTX_device *ptx_dev; +}; + +struct map +{ + int async; + size_t size; + char mappings[0]; +}; + +static void +map_init (struct PTX_stream *s) +{ + CUresult r; + + int size = getpagesize (); + + assert (s); + assert (!s->d); + assert (!s->h); + + r = cuMemAllocHost (&s->h, size); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuErrorMsg (r)); + + r = cuMemHostGetDevicePointer (&s->d, s->h, 0); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuErrorMsg (r)); + + assert (s->h); + + s->h_begin = s->h; + s->h_end = s->h_begin + size; + s->h_next = s->h_prev = s->h_tail = s->h_begin; + + assert (s->h_next); + assert (s->h_end); +} + +static void +map_fini (struct PTX_stream *s) +{ + CUresult r; + + r = cuMemFreeHost (s->h); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuErrorMsg (r)); +} + +static void +map_pop (struct PTX_stream *s) +{ + struct map *m; + + assert (s != NULL); + assert (s->h_next); + assert (s->h_prev); + assert (s->h_tail); + + m = s->h_tail; + + s->h_tail += m->size; + + if (s->h_tail >= s->h_end) + s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end); + + if (s->h_next == s->h_tail) + s->h_prev = s->h_next; + + assert (s->h_next >= s->h_begin); + assert (s->h_tail >= s->h_begin); + assert (s->h_prev >= s->h_begin); + + assert (s->h_next <= s->h_end); + assert (s->h_tail <= s->h_end); + assert (s->h_prev <= s->h_end); +} + +static void +map_push (struct PTX_stream *s, int async, size_t size, void **h, void **d) +{ + int left; + int offset; + struct map *m; + + assert (s != NULL); + + left = s->h_end - s->h_next; + size += sizeof (struct map); + + assert (s->h_prev); + assert (s->h_next); + + if (size >= left) + { + m = s->h_prev; + m->size += left; + s->h_next = s->h_begin; + + if (s->h_next + size > s->h_end) + GOMP_PLUGIN_fatal ("unable to push map"); + } + + assert (s->h_next); + + m 
= s->h_next; + m->async = async; + m->size = size; + + offset = (void *)&m->mappings[0] - s->h; + + *d = (void *)(s->d + offset); + *h = (void *)(s->h + offset); + + s->h_prev = s->h_next; + s->h_next += size; + + assert (s->h_prev); + assert (s->h_next); + + assert (s->h_next >= s->h_begin); + assert (s->h_tail >= s->h_begin); + assert (s->h_prev >= s->h_begin); + assert (s->h_next <= s->h_end); + assert (s->h_tail <= s->h_end); + assert (s->h_prev <= s->h_end); + + return; +} + +struct PTX_device +{ + CUcontext ctx; + bool ctx_shared; + CUdevice dev; + struct PTX_stream *null_stream; + /* All non-null streams associated with this device (actually context), + either created implicitly or passed in from the user (via + acc_set_cuda_stream). */ + struct PTX_stream *active_streams; + struct { + struct PTX_stream **arr; + int size; + } async_streams; + /* A lock for use when manipulating the above stream list and array. */ + gomp_mutex_t stream_lock; + int ord; + bool overlap; + bool map; + bool concur; + int mode; + bool mkern; + + struct PTX_device *next; +}; + +enum PTX_event_type +{ + PTX_EVT_MEM, + PTX_EVT_KNL, + PTX_EVT_SYNC, + PTX_EVT_ASYNC_CLEANUP +}; + +struct PTX_event +{ + CUevent *evt; + int type; + void *addr; + int ord; + + struct PTX_event *next; +}; + +static gomp_mutex_t PTX_event_lock; +static struct PTX_event *PTX_events; + +#define _XSTR(s) _STR(s) +#define _STR(s) #s + +static struct _synames +{ + char *n; +} cuSymNames[] = +{ + { _XSTR(cuCtxCreate) }, + { _XSTR(cuCtxDestroy) }, + { _XSTR(cuCtxGetCurrent) }, + { _XSTR(cuCtxPushCurrent) }, + { _XSTR(cuCtxSynchronize) }, + { _XSTR(cuDeviceGet) }, + { _XSTR(cuDeviceGetAttribute) }, + { _XSTR(cuDeviceGetCount) }, + { _XSTR(cuEventCreate) }, + { _XSTR(cuEventDestroy) }, + { _XSTR(cuEventQuery) }, + { _XSTR(cuEventRecord) }, + { _XSTR(cuInit) }, + { _XSTR(cuLaunchKernel) }, + { _XSTR(cuLinkAddData) }, + { _XSTR(cuLinkComplete) }, + { _XSTR(cuLinkCreate) }, + { _XSTR(cuMemAlloc) }, + { 
_XSTR(cuMemAllocHost) },
+  { _XSTR(cuMemcpy) },
+  { _XSTR(cuMemcpyDtoH) },
+  { _XSTR(cuMemcpyDtoHAsync) },
+  { _XSTR(cuMemcpyHtoD) },
+  { _XSTR(cuMemcpyHtoDAsync) },
+  { _XSTR(cuMemFree) },
+  { _XSTR(cuMemFreeHost) },
+  { _XSTR(cuMemGetAddressRange) },
+  { _XSTR(cuMemHostGetDevicePointer) },
+  { _XSTR(cuMemHostRegister) },
+  { _XSTR(cuMemHostUnregister) },
+  { _XSTR(cuModuleGetFunction) },
+  { _XSTR(cuModuleLoadData) },
+  { _XSTR(cuStreamDestroy) },
+  { _XSTR(cuStreamQuery) },
+  { _XSTR(cuStreamSynchronize) },
+  { _XSTR(cuStreamWaitEvent) }
+};
+
+/* Check that "libcuda.so" can be opened and that every symbol named in
+   cuSymNames can be looked up in it.  Return 0 if so, -1 otherwise.  The
+   library handle is only used for this probe and is closed again on every
+   path once it has been opened.  */
+
+static int
+verify_device_library (void)
+{
+  int i;
+  void *dh, *ds;
+
+  dh = dlopen ("libcuda.so", RTLD_LAZY);
+  if (!dh)
+    return -1;
+
+  for (i = 0; i < ARRAYSIZE (cuSymNames); i++)
+    {
+      ds = dlsym (dh, cuSymNames[i].n);
+      if (!ds)
+        {
+          /* Do not leak the handle when a symbol is missing.  */
+          dlclose (dh);
+          return -1;
+        }
+    }
+
+  dlclose (dh);
+
+  return 0;
+}
+
+/* Return the result of GOMP_PLUGIN_acc_thread, viewed as this plugin's
+   per-thread data.  */
+
+static inline struct nvptx_thread *
+nvptx_thread (void)
+{
+  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
+}
+
+/* Set up the stream bookkeeping of PTX_DEV: allocate and map_init its null
+   stream, clear the active stream list, initialize the stream lock, and
+   allocate an async stream array of CONCURRENCY entries (at least one).  */
+
+static void
+init_streams_for_device (struct PTX_device *ptx_dev, int concurrency)
+{
+  int i;
+  struct PTX_stream *null_stream
+    = GOMP_PLUGIN_malloc (sizeof (struct PTX_stream));
+
+  null_stream->stream = NULL;
+  null_stream->host_thread = pthread_self ();
+  null_stream->multithreaded = true;
+  null_stream->d = (CUdeviceptr) NULL;
+  null_stream->h = NULL;
+  map_init (null_stream);
+  ptx_dev->null_stream = null_stream;
+
+  ptx_dev->active_streams = NULL;
+  GOMP_PLUGIN_mutex_init (&ptx_dev->stream_lock);
+
+  if (concurrency < 1)
+    concurrency = 1;
+
+  /* This is just a guess -- make space for as many async streams as the
+     current device is capable of concurrently executing.  This can grow
+     later as necessary.  No streams are created yet.  
*/ + ptx_dev->async_streams.arr + = GOMP_PLUGIN_malloc (concurrency * sizeof (struct PTX_stream *)); + ptx_dev->async_streams.size = concurrency; + + for (i = 0; i < concurrency; i++) + ptx_dev->async_streams.arr[i] = NULL; +} + +static void +fini_streams_for_device (struct PTX_device *ptx_dev) +{ + free (ptx_dev->async_streams.arr); + + while (ptx_dev->active_streams != NULL) + { + struct PTX_stream *s = ptx_dev->active_streams; + ptx_dev->active_streams = ptx_dev->active_streams->next; + + cuStreamDestroy (s->stream); + map_fini (s); + free (s); + } + + map_fini (ptx_dev->null_stream); + free (ptx_dev->null_stream); +} + +/* Select a stream for (OpenACC-semantics) ASYNC argument for the current + thread THREAD (and also current device/context). If CREATE is true, create + the stream if it does not exist (or use EXISTING if it is non-NULL), and + associate the stream with the same thread argument. Returns stream to use + as result. */ + +static struct PTX_stream * +select_stream_for_async (int async, pthread_t thread, bool create, + CUstream existing) +{ + struct nvptx_thread *nvthd = nvptx_thread (); + /* Local copy of TLS variable. */ + struct PTX_device *ptx_dev = nvthd->ptx_dev; + struct PTX_stream *stream = NULL; + int orig_async = async; + + /* The special value acc_async_noval (-1) maps (for now) to an + implicitly-created stream, which is then handled the same as any other + numbered async stream. Other options are available, e.g. using the null + stream for anonymous async operations, or choosing an idle stream from an + active set. But, stick with this for now. 
*/ + if (async > acc_async_sync) + async++; + + if (create) + GOMP_PLUGIN_mutex_lock (&ptx_dev->stream_lock); + + /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the + null stream, and in fact better performance may be obtainable if it doesn't + (because the null stream enforces overly-strict synchronisation with + respect to other streams for legacy reasons, and that's probably not + needed with OpenACC). Maybe investigate later. */ + if (async == acc_async_sync) + stream = ptx_dev->null_stream; + else if (async >= 0 && async < ptx_dev->async_streams.size + && ptx_dev->async_streams.arr[async] && !(create && existing)) + stream = ptx_dev->async_streams.arr[async]; + else if (async >= 0 && create) + { + if (async >= ptx_dev->async_streams.size) + { + int i, newsize = ptx_dev->async_streams.size * 2; + + if (async >= newsize) + newsize = async + 1; + + ptx_dev->async_streams.arr + = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr, + newsize * sizeof (struct PTX_stream *)); + + for (i = ptx_dev->async_streams.size; i < newsize; i++) + ptx_dev->async_streams.arr[i] = NULL; + + ptx_dev->async_streams.size = newsize; + } + + /* Create a new stream on-demand if there isn't one already, or if we're + setting a particular async value to an existing (externally-provided) + stream. */ + if (!ptx_dev->async_streams.arr[async] || existing) + { + CUresult r; + struct PTX_stream *s + = GOMP_PLUGIN_malloc (sizeof (struct PTX_stream)); + + if (existing) + s->stream = existing; + else + { + r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuErrorMsg (r)); + } + + /* If CREATE is true, we're going to be queueing some work on this + stream. Associate it with the current host thread. 
*/
+	  s->host_thread = thread;
+	  s->multithreaded = false;
+
+	  s->d = (CUdeviceptr) NULL;
+	  s->h = NULL;
+	  map_init (s);
+
+	  /* Publish the new stream: push it on the device's active list and
+	     record it in the slot for this async value.  */
+	  s->next = ptx_dev->active_streams;
+	  ptx_dev->active_streams = s;
+	  ptx_dev->async_streams.arr[async] = s;
+	}
+
+      stream = ptx_dev->async_streams.arr[async];
+    }
+  else if (async < 0)
+    GOMP_PLUGIN_fatal ("bad async %d", async);
+
+  if (create)
+    {
+      assert (stream != NULL);
+
+      /* If we're trying to use the same stream from different threads
+	 simultaneously, set stream->multithreaded to true.  This affects the
+	 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
+	 only wait for asynchronous launches from the same host thread they are
+	 invoked on.  If multiple threads use the same async value, we make note
+	 of that here and fall back to testing/waiting for all threads in those
+	 functions.  Thread IDs are opaque, so compare them with pthread_equal
+	 (as the non-CREATE path below does) rather than with !=.  */
+      if (!pthread_equal (thread, stream->host_thread))
+	stream->multithreaded = true;
+
+      GOMP_PLUGIN_mutex_unlock (&ptx_dev->stream_lock);
+    }
+  else if (stream && !stream->multithreaded
+	   && !pthread_equal (stream->host_thread, thread))
+    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
+
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s using stream %p (CUDA stream %p) "
+	   "for async %d\n", __FILE__, __FUNCTION__, stream,
+	   stream ? stream->stream : NULL, orig_async);
+#endif
+
+  return stream;
+}
+
+static int PTX_get_num_devices (void);
+
+/* Initialize the device.  
*/ +static int +PTX_init (void) +{ + CUresult r; + int rc; + + if (PTX_inited) + return PTX_get_num_devices (); + + rc = verify_device_library (); + if (rc < 0) + return -1; + + r = cuInit (0); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuInit error: %s", cuErrorMsg (r)); + + PTX_events = NULL; + + GOMP_PLUGIN_mutex_init (&PTX_event_lock); + + PTX_inited = true; + + return PTX_get_num_devices (); +} + +static int +PTX_fini (void) +{ + PTX_inited = false; + + return 0; +} + +static void * +PTX_open_device (int n) +{ + struct PTX_device *ptx_dev; + CUdevice dev; + CUresult r; + int async_engines, pi; + + r = cuDeviceGet (&dev, n); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuErrorMsg (r)); + + ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct PTX_device)); + + ptx_dev->ord = n; + ptx_dev->dev = dev; + ptx_dev->ctx_shared = false; + + r = cuCtxGetCurrent (&ptx_dev->ctx); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuErrorMsg (r)); + + if (!ptx_dev->ctx) + { + r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuErrorMsg (r)); + } + else + ptx_dev->ctx_shared = true; + + r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r)); + + ptx_dev->overlap = pi; + + r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r)); + + ptx_dev->map = pi; + + r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r)); + + ptx_dev->concur = pi; + + r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r)); + + 
ptx_dev->mode = pi; + + r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r)); + + ptx_dev->mkern = pi; + + r = cuDeviceGetAttribute (&async_engines, + CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev); + if (r != CUDA_SUCCESS) + async_engines = 1; + + init_streams_for_device (ptx_dev, async_engines); + + return (void *) ptx_dev; +} + +static int +PTX_close_device (void *targ_data) +{ + CUresult r; + struct PTX_device *ptx_dev = targ_data; + + if (!ptx_dev) + return 0; + + fini_streams_for_device (ptx_dev); + + if (!ptx_dev->ctx_shared) + { + r = cuCtxDestroy (ptx_dev->ctx); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuErrorMsg (r)); + } + + free (ptx_dev); + + return 0; +} + +static int +PTX_get_num_devices (void) +{ + int n; + CUresult r; + + /* This function will be called before the plugin has been initialized in + order to enumerate available devices, but CUDA API routines can't be used + until cuInit has been called. Just call it now (but don't yet do any + further initialization). 
*/
+  if (!PTX_inited)
+    {
+      r = cuInit (0);
+      /* Failing to initialize the driver while merely enumerating devices
+	 is not fatal: report that no devices are available instead of
+	 aborting in the cuDeviceGetCount call below.  */
+      if (r != CUDA_SUCCESS)
+	return 0;
+    }
+
+  r = cuDeviceGetCount (&n);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuErrorMsg (r));
+
+  return n;
+}
+
+/* Return true if PTX_init reports at least one device.  */
+
+static bool
+PTX_avail (void)
+{
+  bool avail = false;
+
+  if (PTX_init () > 0)
+    avail = true;
+
+  return avail;
+}
+
+/* Link PTX_CODE together with the ABORT_PTX, ACC_ON_DEVICE_PTX and
+   GOACC_INTERNAL_PTX sources using the CUDA linker, and load the linked
+   image into *MODULE.  Any CUDA failure is reported through
+   GOMP_PLUGIN_fatal.  */
+
+static void
+link_ptx (CUmodule *module, char *ptx_code)
+{
+  CUjit_option opts[7];
+  void *optvals[7];
+  float elapsed = 0.0;
+#define LOGSIZE 8192
+  char elog[LOGSIZE];
+  char ilog[LOGSIZE];
+  unsigned long logsize = LOGSIZE;
+  CUlinkState linkstate;
+  CUresult r;
+  void *linkout;
+  size_t linkoutsize __attribute__((unused));
+
+  GOMP_PLUGIN_notify ("attempting to load:\n---\n%s\n---\n", ptx_code);
+
+  /* JIT options: OPTS[i] names the option and OPTVALS[i] carries its value
+     (or the address the linker writes its result to).  */
+  opts[0] = CU_JIT_WALL_TIME;
+  optvals[0] = &elapsed;
+
+  opts[1] = CU_JIT_INFO_LOG_BUFFER;
+  optvals[1] = &ilog[0];
+
+  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
+  optvals[2] = (void *) logsize;
+
+  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
+  optvals[3] = &elog[0];
+
+  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
+  optvals[4] = (void *) logsize;
+
+  opts[5] = CU_JIT_LOG_VERBOSE;
+  optvals[5] = (void *) 1;
+
+  opts[6] = CU_JIT_TARGET;
+  optvals[6] = (void *) CU_TARGET_COMPUTE_30;
+
+  r = cuLinkCreate (7, opts, optvals, &linkstate);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuErrorMsg (r));
+
+  char *abort_ptx = ABORT_PTX;
+  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
+		     strlen (abort_ptx) + 1, 0, 0, 0, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
+      GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuErrorMsg (r));
+    }
+
+  char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
+  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
+		     strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
+      GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
+			 cuErrorMsg (r));
+    }
+
+  char *goacc_internal_ptx = 
GOACC_INTERNAL_PTX; + r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx, + strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0); + if (r != CUDA_SUCCESS) + { + GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]); + GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s", + cuErrorMsg (r)); + } + + r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code, + strlen (ptx_code) + 1, 0, 0, 0, 0); + if (r != CUDA_SUCCESS) + { + GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]); + GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuErrorMsg (r)); + } + + r = cuLinkComplete (linkstate, &linkout, &linkoutsize); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuErrorMsg (r)); + + GOMP_PLUGIN_notify ("Link complete: %fms\n", elapsed); + GOMP_PLUGIN_notify ("Link log %s\n", &ilog[0]); + + r = cuModuleLoadData (module, linkout); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuErrorMsg (r)); +} + +static void +event_gc (bool memmap_lockable) +{ + struct PTX_event *ptx_event = PTX_events; + struct nvptx_thread *nvthd = nvptx_thread (); + + GOMP_PLUGIN_mutex_lock (&PTX_event_lock); + + while (ptx_event != NULL) + { + CUresult r; + struct PTX_event *e = ptx_event; + + ptx_event = ptx_event->next; + + if (e->ord != nvthd->ptx_dev->ord) + continue; + + r = cuEventQuery (*e->evt); + if (r == CUDA_SUCCESS) + { + CUevent *te; + + te = e->evt; + + switch (e->type) + { + case PTX_EVT_MEM: + case PTX_EVT_SYNC: + break; + + case PTX_EVT_KNL: + map_pop (e->addr); + break; + + case PTX_EVT_ASYNC_CLEANUP: + { + /* The function GOMP_PLUGIN_async_unmap_vars needs to claim the + memory-map splay tree lock for the current device, so we + can't call it when one of our callers has already claimed + the lock. In that case, just delay the GC for this event + until later. 
*/ + if (!memmap_lockable) + continue; + + GOMP_PLUGIN_async_unmap_vars (e->addr); + } + break; + } + + cuEventDestroy (*te); + free ((void *)te); + + if (PTX_events == e) + PTX_events = PTX_events->next; + else + { + struct PTX_event *e_ = PTX_events; + while (e_->next != e) + e_ = e_->next; + e_->next = e_->next->next; + } + + free (e); + } + } + + GOMP_PLUGIN_mutex_unlock (&PTX_event_lock); +} + +static void +event_add (enum PTX_event_type type, CUevent *e, void *h) +{ + struct PTX_event *ptx_event; + struct nvptx_thread *nvthd = nvptx_thread (); + + assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC + || type == PTX_EVT_ASYNC_CLEANUP); + + ptx_event = GOMP_PLUGIN_malloc (sizeof (struct PTX_event)); + ptx_event->type = type; + ptx_event->evt = e; + ptx_event->addr = h; + ptx_event->ord = nvthd->ptx_dev->ord; + + GOMP_PLUGIN_mutex_lock (&PTX_event_lock); + + ptx_event->next = PTX_events; + PTX_events = ptx_event; + + GOMP_PLUGIN_mutex_unlock (&PTX_event_lock); +} + +void +PTX_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, + size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers, + int vector_length, int async, void *targ_mem_desc) +{ + struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn; + CUfunction function; + CUresult r; + int i; + struct PTX_stream *dev_str; + void *kargs[1]; + void *hp, *dp; + unsigned int nthreads_in_block; + struct nvptx_thread *nvthd = nvptx_thread (); + + function = targ_fn->fn; + + dev_str = select_stream_for_async (async, pthread_self (), false, NULL); + assert (dev_str == nvthd->current_stream); + + /* This reserves a chunk of a pre-allocated page of memory mapped on both + the host and the device. HP is a host pointer to the new chunk, and DP is + the corresponding device pointer. 
*/ + map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp); + + GOMP_PLUGIN_notify (" %s: prepare mappings\n", __FUNCTION__); + + /* Copy the array of arguments to the mapped page. */ + for (i = 0; i < mapnum; i++) + ((void **) hp)[i] = devaddrs[i]; + + /* Copy the (device) pointers to arguments to the device (dp and hp might in + fact have the same value on a unified-memory system). */ + r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *)); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuErrorMsg (r)); + + GOMP_PLUGIN_notify (" %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name); + + // XXX: possible geometry mappings?? + // + // OpenACC CUDA + // + // num_gangs blocks + // num_workers warps (where a warp is equivalent to 32 threads) + // vector length threads + // + + /* The openacc vector_length clause 'determines the vector length to use for + vector or SIMD operations'. The question is how to map this to CUDA. + + In CUDA, the warp size is the vector length of a CUDA device. However, the + CUDA interface abstracts away from that, and only shows us warp size + indirectly in maximum number of threads per block, which is a product of + warp size and the number of hyperthreads of a multiprocessor. + + We choose to map openacc vector_length directly onto the number of threads + in a block, in the x dimension. This is reflected in gcc code generation + that uses ThreadIdx.x to access vector elements. + + Attempting to use an openacc vector_length of more than the maximum number + of threads per block will result in a cuda error. 
*/ + nthreads_in_block = vector_length; + + kargs[0] = &dp; + r = cuLaunchKernel (function, + num_gangs, 1, 1, + nthreads_in_block, 1, 1, + 0, dev_str->stream, kargs, 0); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuErrorMsg (r)); + +#ifndef DISABLE_ASYNC + if (async < acc_async_noval) + { + r = cuStreamSynchronize (dev_str->stream); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r)); + } + else + { + CUevent *e; + + e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); + + r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r)); + + event_gc (true); + + r = cuEventRecord (*e, dev_str->stream); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r)); + + event_add (PTX_EVT_KNL, e, (void *)dev_str); + } +#else + r = cuCtxSynchronize (); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuErrorMsg (r)); +#endif + + GOMP_PLUGIN_notify (" %s: kernel %s: finished\n", __FUNCTION__, + targ_fn->name); + +#ifndef DISABLE_ASYNC + if (async < acc_async_noval) +#endif + map_pop (dev_str); +} + +void * openacc_get_current_cuda_context (void); + +static void * +PTX_alloc (size_t s) +{ + CUdeviceptr d; + CUresult r; + + r = cuMemAlloc (&d, s); + if (r == CUDA_ERROR_OUT_OF_MEMORY) + return 0; + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuErrorMsg (r)); + return (void *)d; +} + +static void +PTX_free (void *p) +{ + CUresult r; + CUdeviceptr pb; + size_t ps; + + r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r)); + + if ((CUdeviceptr)p != pb) + GOMP_PLUGIN_fatal ("invalid device address"); + + r = cuMemFree ((CUdeviceptr)p); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuErrorMsg (r)); +} + +static void * +PTX_host2dev (void 
*d, const void *h, size_t s) +{ + CUresult r; + CUdeviceptr pb; + size_t ps; + struct nvptx_thread *nvthd = nvptx_thread (); + + if (!s) + return 0; + + if (!d) + GOMP_PLUGIN_fatal ("invalid device address"); + + r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r)); + + if (!pb) + GOMP_PLUGIN_fatal ("invalid device address"); + + if (!h) + GOMP_PLUGIN_fatal ("invalid host address"); + + if (d == h) + GOMP_PLUGIN_fatal ("invalid host or device address"); + + if ((void *)(d + s) > (void *)(pb + ps)) + GOMP_PLUGIN_fatal ("invalid size"); + +#ifndef DISABLE_ASYNC + if (nvthd->current_stream != nvthd->ptx_dev->null_stream) + { + CUevent *e; + + e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); + + r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r)); + + event_gc (false); + + r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s, + nvthd->current_stream->stream); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuErrorMsg (r)); + + r = cuEventRecord (*e, nvthd->current_stream->stream); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r)); + + event_add (PTX_EVT_MEM, e, (void *)h); + } + else +#endif + { + r = cuMemcpyHtoD ((CUdeviceptr)d, h, s); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuErrorMsg (r)); + } + + return 0; +} + +static void * +PTX_dev2host (void *h, const void *d, size_t s) +{ + CUresult r; + CUdeviceptr pb; + size_t ps; + struct nvptx_thread *nvthd = nvptx_thread (); + + if (!s) + return 0; + + if (!d) + GOMP_PLUGIN_fatal ("invalid device address"); + + r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r)); + + if (!pb) + GOMP_PLUGIN_fatal ("invalid device address"); + + if (!h) + GOMP_PLUGIN_fatal 
("invalid host address"); + + if (d == h) + GOMP_PLUGIN_fatal ("invalid host or device address"); + + if ((void *)(d + s) > (void *)(pb + ps)) + GOMP_PLUGIN_fatal ("invalid size"); + +#ifndef DISABLE_ASYNC + if (nvthd->current_stream != nvthd->ptx_dev->null_stream) + { + CUevent *e; + + e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); + + r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuErrorMsg (r)); + + event_gc (false); + + r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s, + nvthd->current_stream->stream); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuErrorMsg (r)); + + r = cuEventRecord (*e, nvthd->current_stream->stream); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r)); + + event_add (PTX_EVT_MEM, e, (void *)h); + } + else +#endif + { + r = cuMemcpyDtoH (h, (CUdeviceptr)d, s); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuErrorMsg (r)); + } + + return 0; +} + +static void +PTX_set_async (int async) +{ + struct nvptx_thread *nvthd = nvptx_thread (); + nvthd->current_stream + = select_stream_for_async (async, pthread_self (), true, NULL); +} + +static int +PTX_async_test (int async) +{ + CUresult r; + struct PTX_stream *s; + + s = select_stream_for_async (async, pthread_self (), false, NULL); + + if (!s) + GOMP_PLUGIN_fatal ("unknown async %d", async); + + r = cuStreamQuery (s->stream); + if (r == CUDA_SUCCESS) + { + /* The oacc-parallel.c:goacc_wait function calls this hook to determine + whether all work has completed on this stream, and if so omits the call + to the wait hook. If that happens, event_gc might not get called + (which prevents variables from getting unmapped and their associated + device storage freed), so call it here. 
*/ + event_gc (true); + return 1; + } + else if (r == CUDA_ERROR_NOT_READY) + return 0; + + GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuErrorMsg (r)); + + return 0; +} + +static int +PTX_async_test_all (void) +{ + struct PTX_stream *s; + pthread_t self = pthread_self (); + struct nvptx_thread *nvthd = nvptx_thread (); + + GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock); + + for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next) + { + if ((s->multithreaded || pthread_equal (s->host_thread, self)) + && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY) + { + GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock); + return 0; + } + } + + GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock); + + event_gc (true); + + return 1; +} + +static void +PTX_wait (int async) +{ + CUresult r; + struct PTX_stream *s; + + s = select_stream_for_async (async, pthread_self (), false, NULL); + + if (!s) + GOMP_PLUGIN_fatal ("unknown async %d", async); + + r = cuStreamSynchronize (s->stream); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r)); + + event_gc (true); +} + +static void +PTX_wait_async (int async1, int async2) +{ + CUresult r; + CUevent *e; + struct PTX_stream *s1, *s2; + pthread_t self = pthread_self (); + + /* The stream that is waiting (rather than being waited for) doesn't + necessarily have to exist already. 
*/ + s2 = select_stream_for_async (async2, self, true, NULL); + + s1 = select_stream_for_async (async1, self, false, NULL); + if (!s1) + GOMP_PLUGIN_fatal ("invalid async 1\n"); + + if (s1 == s2) + GOMP_PLUGIN_fatal ("identical parameters"); + + e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); + + r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r)); + + event_gc (true); + + r = cuEventRecord (*e, s1->stream); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r)); + + event_add (PTX_EVT_SYNC, e, NULL); + + r = cuStreamWaitEvent (s2->stream, *e, 0); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuErrorMsg (r)); +} + +static void +PTX_wait_all (void) +{ + CUresult r; + struct PTX_stream *s; + pthread_t self = pthread_self (); + struct nvptx_thread *nvthd = nvptx_thread (); + + GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock); + + /* Wait for active streams initiated by this thread (or by multiple threads) + to complete. */ + for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next) + { + if (s->multithreaded || pthread_equal (s->host_thread, self)) + { + r = cuStreamQuery (s->stream); + if (r == CUDA_SUCCESS) + continue; + else if (r != CUDA_ERROR_NOT_READY) + GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuErrorMsg (r)); + + r = cuStreamSynchronize (s->stream); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r)); + } + } + + GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock); + + event_gc (true); +} + +static void +PTX_wait_all_async (int async) +{ + CUresult r; + struct PTX_stream *waiting_stream, *other_stream; + CUevent *e; + struct nvptx_thread *nvthd = nvptx_thread (); + pthread_t self = pthread_self (); + + /* The stream doing the waiting. This could be the first mention of the + stream, so create it if necessary. 
*/ + waiting_stream + = select_stream_for_async (async, pthread_self (), true, NULL); + + /* Launches on the null stream already block on other streams in the + context. */ + if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream) + return; + + event_gc (true); + + GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock); + + for (other_stream = nvthd->ptx_dev->active_streams; + other_stream != NULL; + other_stream = other_stream->next) + { + if (!other_stream->multithreaded + && !pthread_equal (other_stream->host_thread, self)) + continue; + + e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); + + r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r)); + + /* Record an event on the waited-for stream. */ + r = cuEventRecord (*e, other_stream->stream); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r)); + + event_add (PTX_EVT_SYNC, e, NULL); + + r = cuStreamWaitEvent (waiting_stream->stream, *e, 0); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuErrorMsg (r)); + } + + GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock); +} + +static void * +PTX_get_current_cuda_device (void) +{ + struct nvptx_thread *nvthd = nvptx_thread (); + + if (!nvthd || !nvthd->ptx_dev) + return NULL; + + return &nvthd->ptx_dev->dev; +} + +static void * +PTX_get_current_cuda_context (void) +{ + struct nvptx_thread *nvthd = nvptx_thread (); + + if (!nvthd || !nvthd->ptx_dev) + return NULL; + + return nvthd->ptx_dev->ctx; +} + +static void * +PTX_get_cuda_stream (int async) +{ + struct PTX_stream *s; + struct nvptx_thread *nvthd = nvptx_thread (); + + if (!nvthd || !nvthd->ptx_dev) + return NULL; + + s = select_stream_for_async (async, pthread_self (), false, NULL); + + return s ? 
s->stream : NULL; +} + +static int +PTX_set_cuda_stream (int async, void *stream) +{ + struct PTX_stream *oldstream; + pthread_t self = pthread_self (); + struct nvptx_thread *nvthd = nvptx_thread (); + + GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock); + + if (async < 0) + GOMP_PLUGIN_fatal ("bad async %d", async); + + /* We have a list of active streams and an array mapping async values to + entries of that list. We need to take "ownership" of the passed-in stream, + and add it to our list, removing the previous entry also (if there was one) + in order to prevent resource leaks. Note the potential for surprise + here: maybe we should keep track of passed-in streams and leave it up to + the user to tidy those up, but that doesn't work for stream handles + returned from acc_get_cuda_stream above... */ + + oldstream = select_stream_for_async (async, self, false, NULL); + + if (oldstream) + { + if (nvthd->ptx_dev->active_streams == oldstream) + nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next; + else + { + struct PTX_stream *s = nvthd->ptx_dev->active_streams; + while (s->next != oldstream) + s = s->next; + s->next = s->next->next; + } + + cuStreamDestroy (oldstream->stream); + map_fini (oldstream); + free (oldstream); + } + + GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock); + + (void) select_stream_for_async (async, self, true, (CUstream) stream); + + return 1; +} + +/* Plugin entry points. 
*/
+
+
+/* Return the target type constant of this plugin,
+   TARGET_TYPE_NVIDIA_PTX.  */
+
+int
+GOMP_OFFLOAD_get_type (void)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+  return TARGET_TYPE_NVIDIA_PTX;
+}
+
+/* Return the capability flags of this plugin; only
+   TARGET_CAP_OPENACC_200 is reported.  */
+
+unsigned int
+GOMP_OFFLOAD_get_caps (void)
+{
+  return TARGET_CAP_OPENACC_200;
+}
+
+/* Return the name string of this plugin, "nvidia".  */
+
+const char *
+GOMP_OFFLOAD_get_name (void)
+{
+  return "nvidia";
+}
+
+/* Return the device count as reported by PTX_get_num_devices.  */
+
+int
+GOMP_OFFLOAD_get_num_devices (void)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+  return PTX_get_num_devices ();
+}
+
+/* Pointers recorded by GOMP_OFFLOAD_register_image and consumed by
+   GOMP_OFFLOAD_get_table.  */
+static void **kernel_target_data;
+static void **kernel_host_table;
+
+/* Record HOST_TABLE and TARGET_DATA for later use.  Each call overwrites
+   the previously recorded pair.  */
+
+void
+GOMP_OFFLOAD_register_image (void *host_table, void *target_data)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s (%p, %p)\n", __FILE__, __FUNCTION__,
+           host_table, target_data);
+#endif
+
+  kernel_target_data = target_data;
+  kernel_host_table = host_table;
+}
+
+/* Call PTX_init and return its result.  */
+
+int
+GOMP_OFFLOAD_init_device (void)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+  return PTX_init ();
+}
+
+/* Call PTX_fini and return its result.  */
+
+int
+GOMP_OFFLOAD_fini_device (void)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+  return PTX_fini ();
+}
+
+/* Build the host-to-target function mapping table for the image recorded
+   by GOMP_OFFLOAD_register_image.  On success a newly allocated array is
+   stored in *TABLEP and the number of entries is returned.  Returns 0, and
+   leaves *TABLEP untouched, if PTX_init does not return a positive value
+   or if no image has been registered.  A failing cuModuleGetFunction is
+   reported through GOMP_PLUGIN_fatal.  */
+
+int
+GOMP_OFFLOAD_get_table (struct mapping_table **tablep)
+{
+  CUmodule module;
+  void **fn_table;
+  char **fn_names;
+  int fn_entries, i;
+  CUresult r;
+  struct targ_fn_descriptor *targ_fns;
+
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__,
+           tablep);
+#endif
+
+  if (PTX_init () <= 0)
+    return 0;
+
+  /* This isn't an error, because an image may legitimately have no offloaded
+     regions and so will not call GOMP_offload_register.  */
+  if (kernel_target_data == NULL)
+    return 0;
+
+  link_ptx (&module, kernel_target_data[0]);
+
+  /* kernel_target_data[0] -> ptx code
+     kernel_target_data[1] -> variable mappings
+     kernel_target_data[2] -> array of kernel names in ascii
+
+     kernel_host_table[0] -> start of function addresses (_omp_func_table)
+     kernel_host_table[1] -> end of function addresses (_omp_funcs_end)
+
+     The array of kernel names and the functions addresses form a
+     one-to-one correspondence.  */
+
+  fn_table = kernel_host_table[0];
+  fn_names = (char **) kernel_target_data[2];
+  fn_entries = (kernel_host_table[1] - kernel_host_table[0]) / sizeof (void *);
+
+  *tablep = GOMP_PLUGIN_malloc (sizeof (struct mapping_table) * fn_entries);
+  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
+                                 * fn_entries);
+
+  for (i = 0; i < fn_entries; i++)
+    {
+      CUfunction function;
+
+      r = cuModuleGetFunction (&function, module, fn_names[i]);
+      if (r != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuErrorMsg (r));
+
+      targ_fns[i].fn = function;
+      targ_fns[i].name = (const char *) fn_names[i];
+
+      /* Both the host and the target side of an entry are given a nominal
+         extent of one, starting at the host function address and at the
+         address of the matching descriptor respectively.  */
+      (*tablep)[i].host_start = (uintptr_t) fn_table[i];
+      (*tablep)[i].host_end = (*tablep)[i].host_start + 1;
+      (*tablep)[i].tgt_start = (uintptr_t) &targ_fns[i];
+      (*tablep)[i].tgt_end = (*tablep)[i].tgt_start + 1;
+    }
+
+  return fn_entries;
+}
+
+/* Forward an allocation request of SIZE to PTX_alloc and return its
+   result.  */
+
+void *
+GOMP_OFFLOAD_alloc (size_t size)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s (%zu)\n", __FILE__, __FUNCTION__,
+           size);
+#endif
+
+  return PTX_alloc (size);
+}
+
+/* Forward PTR to PTX_free.  */
+
+void
+GOMP_OFFLOAD_free (void *ptr)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__, ptr);
+#endif
+
+  PTX_free (ptr);
+}
+
+/* Forward a (DST, SRC, N) transfer request to PTX_dev2host and return its
+   result.  */
+
+void *
+GOMP_OFFLOAD_dev2host (void *dst, const void *src, size_t n)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s (%p, %p, %zu)\n", __FILE__,
+           __FUNCTION__, dst, src, n);
+#endif
+
+  return PTX_dev2host (dst, src, n);
+}
+
+/* Forward a (DST, SRC, N) transfer request to PTX_host2dev and return its
+   result.  */
+
+void *
+GOMP_OFFLOAD_host2dev (void *dst, const void *src, size_t n)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s (%p, %p, %zu)\n", __FILE__,
+           __FUNCTION__, dst, src, n);
+#endif
+
+  return PTX_host2dev (dst, src, n);
+}
+
+/* NOTE(review): this pointer is initialized to NULL and is not referenced
+   by any of the entry points in this part of the file; confirm whether it
+   is still needed.  */
+void (*device_run) (void *fn_ptr, void *vars) = NULL;
+
+/* Forward all arguments unchanged to PTX_exec.  The DEBUG trace does not
+   print DEVADDRS.  */
+
+void
+GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
+                               void **hostaddrs, void **devaddrs, size_t *sizes,
+                               unsigned short *kinds, int num_gangs,
+                               int num_workers, int vector_length, int async,
+                               void *targ_mem_desc)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s (%p, %zu, %p, %p, %p, %d, %d, %d, "
+           "%d, %p)\n", __FILE__, __FUNCTION__, fn, mapnum, hostaddrs, sizes,
+           kinds, num_gangs, num_workers, vector_length, async, targ_mem_desc);
+#endif
+
+  PTX_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
+            num_workers, vector_length, async, targ_mem_desc);
+}
+
+/* Forward N to PTX_open_device and return its result.  */
+
+void *
+GOMP_OFFLOAD_openacc_open_device (int n)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__, n);
+#endif
+  return PTX_open_device (n);
+}
+
+/* Forward H to PTX_close_device and return its result.  */
+
+int
+GOMP_OFFLOAD_openacc_close_device (void *h)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__, h);
+#endif
+  return PTX_close_device (h);
+}
+
+/* Call PTX_open_device (N) unless the calling thread's data already refers
+   to a device whose ordinal is N.  N must be non-negative (asserted).  */
+
+void
+GOMP_OFFLOAD_openacc_set_device_num (int n)
+{
+  struct nvptx_thread *nvthd = nvptx_thread ();
+
+  assert (n >= 0);
+
+  /* GOMP_OFFLOAD_openacc_get_device_num below treats a NULL result from
+     nvptx_thread () as possible, so guard against it here as well instead
+     of dereferencing it unconditionally.  */
+  if (!nvthd || !nvthd->ptx_dev || nvthd->ptx_dev->ord != n)
+    (void) PTX_open_device (n);
+}
+
+/* This can be called before the device is "opened" for the current thread, in
+   which case we can't tell which device number should be returned.  We don't
+   actually want to open the device here, so just return -1 and let the caller
+   (oacc-init.c:acc_get_device_num) handle it.  */
+
+int
+GOMP_OFFLOAD_openacc_get_device_num (void)
+{
+  struct nvptx_thread *nvthd = nvptx_thread ();
+
+  if (nvthd && nvthd->ptx_dev)
+    return nvthd->ptx_dev->ord;
+  else
+    return -1;
+}
+
+/* Return the result of PTX_avail.  */
+
+bool
+GOMP_OFFLOAD_openacc_avail (void)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+  return PTX_avail ();
+}
+
+/* Create a CUDA event with timing disabled, record it on the calling
+   thread's current stream, and hand it to event_add as a
+   PTX_EVT_ASYNC_CLEANUP event together with TARG_MEM_DESC.  CUDA failures
+   are reported through GOMP_PLUGIN_fatal.  */
+
+void
+GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
+{
+  CUevent *e;
+  CUresult r;
+  struct nvptx_thread *nvthd = nvptx_thread ();
+
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__,
+           targ_mem_desc);
+#endif
+
+  /* No cast, matching the other GOMP_PLUGIN_malloc call sites.  */
+  e = GOMP_PLUGIN_malloc (sizeof *e);
+
+  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
+
+  r = cuEventRecord (*e, nvthd->current_stream->stream);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
+
+  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
+}
+
+/* Forward ASYNC to PTX_async_test and return its result.  */
+
+int
+GOMP_OFFLOAD_openacc_async_test (int async)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
+           async);
+#endif
+  return PTX_async_test (async);
+}
+
+/* Return the result of PTX_async_test_all.  */
+
+int
+GOMP_OFFLOAD_openacc_async_test_all (void)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+  return PTX_async_test_all ();
+}
+
+/* Forward ASYNC to PTX_wait.  */
+
+void
+GOMP_OFFLOAD_openacc_async_wait (int async)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
+           async);
+#endif
+  PTX_wait (async);
+}
+
+/* Forward ASYNC1 and ASYNC2 to PTX_wait_async.  */
+
+void
+GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s (%d, %d)\n", __FILE__, __FUNCTION__,
+           async1, async2);
+#endif
+  PTX_wait_async (async1, async2);
+}
+
+/* Call PTX_wait_all.  */
+
+void
+GOMP_OFFLOAD_openacc_async_wait_all (void)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+  PTX_wait_all ();
+}
+
+/* Forward ASYNC to PTX_wait_all_async.  */
+
+void
+GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
+           async);
+#endif
+  PTX_wait_all_async (async);
+}
+
+/* Forward ASYNC to PTX_set_async.  */
+
+void
+GOMP_OFFLOAD_openacc_async_set_async (int async)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
+           async);
+#endif
+  PTX_set_async (async);
+}
+
+/* Allocate and return thread data for the device described by TARG_DATA,
+   which is used as a struct PTX_device pointer.  If the calling thread has
+   no current CUDA context, the device's context is pushed.  The new thread
+   data starts out on the device's null stream.  CUDA failures are reported
+   through GOMP_PLUGIN_fatal.  */
+
+void *
+GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data)
+{
+  struct PTX_device *ptx_dev = (struct PTX_device *) targ_data;
+  struct nvptx_thread *nvthd
+    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
+  CUresult r;
+  CUcontext thd_ctx;
+
+  r = cuCtxGetCurrent (&thd_ctx);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuErrorMsg (r));
+
+  assert (ptx_dev->ctx);
+
+  if (!thd_ctx)
+    {
+      r = cuCtxPushCurrent (ptx_dev->ctx);
+      if (r != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuErrorMsg (r));
+    }
+
+  nvthd->current_stream = ptx_dev->null_stream;
+  nvthd->ptx_dev = ptx_dev;
+
+  return (void *) nvthd;
+}
+
+/* Release DATA with free.  */
+
+void
+GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
+{
+  free (data);
+}
+
+/* Return the result of PTX_get_current_cuda_device.  */
+
+void *
+GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+  return PTX_get_current_cuda_device ();
+}
+
+/* Return the result of PTX_get_current_cuda_context.  */
+
+void *
+GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+  return PTX_get_current_cuda_context ();
+}
+
+/* NOTE: This returns a CUstream, not a PTX_stream pointer.  */
+
+void *
+GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
+           async);
+#endif
+  return PTX_get_cuda_stream (async);
+}
+
+/* NOTE: This takes a CUstream, not a PTX_stream pointer.  */
+
+int
+GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
+{
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s (%d, %p)\n", __FILE__, __FUNCTION__,
+           async, stream);
+#endif
+  return PTX_set_cuda_stream (async, stream);
+}
diff --git a/libgomp/testsuite/Makefile.in b/libgomp/testsuite/Makefile.in
index 17ee96b..7cf04a7 100644

[gomp4] Move libgomp plugins into subdirectory

Commit Message

Comments

Patch