commit da6810c7f2a3e56f77fa589ba4777b68b5751fd4
Author: Julian Brown <julian@codesourcery.com>
Date: Mon Sep 22 02:55:12 2014 -0700
OpenACC support for libgomp.
new file mode 100644
@@ -0,0 +1,45 @@
+#ifndef GOMP_CONSTANTS_H
+#define GOMP_CONSTANTS_H 1
+
+/* Enumerated variable mapping types used to communicate between GCC and
+ libgomp. These values are used for both OpenMP and OpenACC. */
+
+#define GOMP_MAP_ALLOC 0x00
+#define GOMP_MAP_ALLOC_TO 0x01
+#define GOMP_MAP_ALLOC_FROM 0x02
+#define GOMP_MAP_ALLOC_TOFROM 0x03
+#define GOMP_MAP_POINTER 0x04
+#define GOMP_MAP_TO_PSET 0x05
+#define GOMP_MAP_FORCE_ALLOC 0x08
+#define GOMP_MAP_FORCE_TO 0x09
+#define GOMP_MAP_FORCE_FROM 0x0a
+#define GOMP_MAP_FORCE_TOFROM 0x0b
+#define GOMP_MAP_FORCE_PRESENT 0x0c
+#define GOMP_MAP_FORCE_DEALLOC 0x0d
+#define GOMP_MAP_FORCE_DEVICEPTR 0x0e
+#define GOMP_MAP_FORCE_PRIVATE 0x18
+#define GOMP_MAP_FORCE_FIRSTPRIVATE 0x19
+
+#define GOMP_MAP_COPYTO_P(X) \
+ ((X) == GOMP_MAP_ALLOC_TO || (X) == GOMP_MAP_FORCE_TO)
+
+#define GOMP_MAP_COPYFROM_P(X) \
+ ((X) == GOMP_MAP_ALLOC_FROM || (X) == GOMP_MAP_FORCE_FROM)
+
+#define GOMP_MAP_TOFROM_P(X) \
+ ((X) == GOMP_MAP_ALLOC_TOFROM || (X) == GOMP_MAP_FORCE_TOFROM)
+
+#define GOMP_MAP_POINTER_P(X) \
+ ((X) == GOMP_MAP_POINTER)
+
+#define GOMP_IF_CLAUSE_FALSE -2
+
+/* Canonical list of target type codes for OpenMP/OpenACC. */
+#define GOMP_TARGET_NONE 0
+#define GOMP_TARGET_HOST 2
+#define GOMP_TARGET_NONSHM_HOST 3
+#define GOMP_TARGET_NOT_HOST 4
+#define GOMP_TARGET_NVIDIA_PTX 5
+#define GOMP_TARGET_INTEL_MIC 6
+
+#endif
@@ -14,13 +14,35 @@ libsubincludedir = $(libdir)/gcc/$(target_alias)/$(gcc_version)/include
vpath % $(strip $(search_path))
-AM_CPPFLAGS = $(addprefix -I, $(search_path))
+AM_CPPFLAGS = $(addprefix -I, $(search_path)) \
+ $(addprefix -I, $(search_path)/../include)
AM_CFLAGS = $(XCFLAGS)
AM_LDFLAGS = $(XLDFLAGS) $(SECTION_LDFLAGS) $(OPT_LDFLAGS)
toolexeclib_LTLIBRARIES = libgomp.la
nodist_toolexeclib_HEADERS = libgomp.spec
+if PLUGIN_NVPTX
+# Nvidia PTX OpenACC plugin.
+libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION)
+toolexeclib_LTLIBRARIES += libgomp-plugin-nvptx.la
+libgomp_plugin_nvptx_la_SOURCES = plugin-nvptx.c
+libgomp_plugin_nvptx_la_CPPFLAGS = $(AM_CPPFLAGS) $(PLUGIN_NVPTX_CPPFLAGS)
+libgomp_plugin_nvptx_la_LDFLAGS = $(libgomp_plugin_nvptx_version_info) \
+ $(lt_host_flags)
+libgomp_plugin_nvptx_la_LDFLAGS += $(PLUGIN_NVPTX_LDFLAGS)
+libgomp_plugin_nvptx_la_LIBADD = $(PLUGIN_NVPTX_LIBS)
+libgomp_plugin_nvptx_la_LIBTOOLFLAGS = --tag=disable-static
+endif
+
+libgomp_plugin_nonshm_host_version_info = -version-info $(libtool_VERSION)
+toolexeclib_LTLIBRARIES += libgomp-plugin-nonshm-host.la
+libgomp_plugin_nonshm_host_la_SOURCES = oacc-host.c
+libgomp_plugin_nonshm_host_la_CPPFLAGS = $(AM_CPPFLAGS) -DNONSHM_HOST_PLUGIN
+libgomp_plugin_nonshm_host_la_LDFLAGS = \
+ $(libgomp_plugin_nonshm_host_version_info) $(lt_host_flags)
+libgomp_plugin_nonshm_host_la_LIBTOOLFLAGS = --tag=disable-static
+
if LIBGOMP_BUILD_VERSIONED_SHLIB
# -Wc is only a libtool option.
comma = ,
@@ -60,10 +82,12 @@ libgomp_la_LINK = $(LINK) $(libgomp_la_LDFLAGS)
libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \
iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c single.c \
task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \
- time.c fortran.c affinity.c target.c
+ time.c fortran.c affinity.c target.c oacc-parallel.c splay-tree.c \
+ oacc-fortran.c oacc-host.c oacc-init.c oacc-mem.c oacc-async.c \
+ oacc-plugin.c oacc-cuda.c libgomp-plugin.c
nodist_noinst_HEADERS = libgomp_f.h
-nodist_libsubinclude_HEADERS = omp.h
+nodist_libsubinclude_HEADERS = omp.h openacc.h ../include/gomp-constants.h
if USE_FORTRAN
nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod
endif
@@ -36,6 +36,7 @@ POST_UNINSTALL = :
build_triplet = @build@
host_triplet = @host@
target_triplet = @target@
+@PLUGIN_NVPTX_TRUE@am__append_1 = libgomp-plugin-nvptx.la
subdir = .
DIST_COMMON = ChangeLog $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
$(top_srcdir)/configure $(am__configure_deps) \
@@ -91,12 +92,37 @@ am__installdirs = "$(DESTDIR)$(toolexeclibdir)" "$(DESTDIR)$(infodir)" \
"$(DESTDIR)$(fincludedir)" "$(DESTDIR)$(libsubincludedir)" \
"$(DESTDIR)$(toolexeclibdir)"
LTLIBRARIES = $(toolexeclib_LTLIBRARIES)
+libgomp_plugin_nonshm_host_la_LIBADD =
+am_libgomp_plugin_nonshm_host_la_OBJECTS = \
+ libgomp_plugin_nonshm_host_la-oacc-host.lo
+libgomp_plugin_nonshm_host_la_OBJECTS = \
+ $(am_libgomp_plugin_nonshm_host_la_OBJECTS)
+libgomp_plugin_nonshm_host_la_LINK = $(LIBTOOL) --tag=CC \
+ $(libgomp_plugin_nonshm_host_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+ --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+ $(libgomp_plugin_nonshm_host_la_LDFLAGS) $(LDFLAGS) -o $@
+am__DEPENDENCIES_1 =
+@PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_la_DEPENDENCIES = \
+@PLUGIN_NVPTX_TRUE@ $(am__DEPENDENCIES_1)
+@PLUGIN_NVPTX_TRUE@am_libgomp_plugin_nvptx_la_OBJECTS = \
+@PLUGIN_NVPTX_TRUE@ libgomp_plugin_nvptx_la-plugin-nvptx.lo
+libgomp_plugin_nvptx_la_OBJECTS = \
+ $(am_libgomp_plugin_nvptx_la_OBJECTS)
+libgomp_plugin_nvptx_la_LINK = $(LIBTOOL) --tag=CC \
+ $(libgomp_plugin_nvptx_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+ --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+ $(libgomp_plugin_nvptx_la_LDFLAGS) $(LDFLAGS) -o $@
+@PLUGIN_NVPTX_TRUE@am_libgomp_plugin_nvptx_la_rpath = -rpath \
+@PLUGIN_NVPTX_TRUE@ $(toolexeclibdir)
libgomp_la_LIBADD =
am_libgomp_la_OBJECTS = alloc.lo barrier.lo critical.lo env.lo \
error.lo iter.lo iter_ull.lo loop.lo loop_ull.lo ordered.lo \
parallel.lo sections.lo single.lo task.lo team.lo work.lo \
lock.lo mutex.lo proc.lo sem.lo bar.lo ptrlock.lo time.lo \
- fortran.lo affinity.lo target.lo
+ fortran.lo affinity.lo target.lo oacc-parallel.lo \
+ splay-tree.lo oacc-fortran.lo oacc-host.lo oacc-init.lo \
+ oacc-mem.lo oacc-async.lo oacc-plugin.lo oacc-cuda.lo \
+ libgomp-plugin.lo
libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
DEFAULT_INCLUDES = -I.@am__isrc@
depcomp = $(SHELL) $(top_srcdir)/../depcomp
@@ -108,7 +134,8 @@ LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
CCLD = $(CC)
-SOURCES = $(libgomp_la_SOURCES)
+SOURCES = $(libgomp_plugin_nonshm_host_la_SOURCES) \
+ $(libgomp_plugin_nvptx_la_SOURCES) $(libgomp_la_SOURCES)
MULTISRCTOP =
MULTIBUILDTOP =
MULTIDIRS =
@@ -213,6 +240,10 @@ PACKAGE_URL = @PACKAGE_URL@
PACKAGE_VERSION = @PACKAGE_VERSION@
PATH_SEPARATOR = @PATH_SEPARATOR@
PERL = @PERL@
+PLUGIN_NVPTX = @PLUGIN_NVPTX@
+PLUGIN_NVPTX_CPPFLAGS = @PLUGIN_NVPTX_CPPFLAGS@
+PLUGIN_NVPTX_LDFLAGS = @PLUGIN_NVPTX_LDFLAGS@
+PLUGIN_NVPTX_LIBS = @PLUGIN_NVPTX_LIBS@
RANLIB = @RANLIB@
SECTION_LDFLAGS = @SECTION_LDFLAGS@
SED = @SED@
@@ -293,12 +324,32 @@ gcc_version := $(shell cat $(top_srcdir)/../gcc/BASE-VER)
search_path = $(addprefix $(top_srcdir)/config/, $(config_path)) $(top_srcdir)
fincludedir = $(libdir)/gcc/$(target_alias)/$(gcc_version)/finclude
libsubincludedir = $(libdir)/gcc/$(target_alias)/$(gcc_version)/include
-AM_CPPFLAGS = $(addprefix -I, $(search_path))
+AM_CPPFLAGS = $(addprefix -I, $(search_path)) \
+ $(addprefix -I, $(search_path)/../include)
+
AM_CFLAGS = $(XCFLAGS)
AM_LDFLAGS = $(XLDFLAGS) $(SECTION_LDFLAGS) $(OPT_LDFLAGS)
-toolexeclib_LTLIBRARIES = libgomp.la
+toolexeclib_LTLIBRARIES = libgomp.la $(am__append_1) \
+ libgomp-plugin-nonshm-host.la
nodist_toolexeclib_HEADERS = libgomp.spec
+# Nvidia PTX OpenACC plugin.
+@PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION)
+@PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_la_SOURCES = plugin-nvptx.c
+@PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_la_CPPFLAGS = $(AM_CPPFLAGS) $(PLUGIN_NVPTX_CPPFLAGS)
+@PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_la_LDFLAGS = \
+@PLUGIN_NVPTX_TRUE@ $(libgomp_plugin_nvptx_version_info) \
+@PLUGIN_NVPTX_TRUE@ $(lt_host_flags) $(PLUGIN_NVPTX_LDFLAGS)
+@PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_la_LIBADD = $(PLUGIN_NVPTX_LIBS)
+@PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_la_LIBTOOLFLAGS = --tag=disable-static
+libgomp_plugin_nonshm_host_version_info = -version-info $(libtool_VERSION)
+libgomp_plugin_nonshm_host_la_SOURCES = oacc-host.c
+libgomp_plugin_nonshm_host_la_CPPFLAGS = $(AM_CPPFLAGS) -DNONSHM_HOST_PLUGIN
+libgomp_plugin_nonshm_host_la_LDFLAGS = \
+ $(libgomp_plugin_nonshm_host_version_info) $(lt_host_flags)
+
+libgomp_plugin_nonshm_host_la_LIBTOOLFLAGS = --tag=disable-static
+
# -Wc is only a libtool option.
@LIBGOMP_BUILD_VERSIONED_SHLIB_TRUE@comma = ,
@LIBGOMP_BUILD_VERSIONED_SHLIB_TRUE@PREPROCESS = $(subst -Wc$(comma), , $(COMPILE)) -E
@@ -317,10 +368,12 @@ libgomp_la_LINK = $(LINK) $(libgomp_la_LDFLAGS)
libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \
iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c single.c \
task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \
- time.c fortran.c affinity.c target.c
+ time.c fortran.c affinity.c target.c oacc-parallel.c splay-tree.c \
+ oacc-fortran.c oacc-host.c oacc-init.c oacc-mem.c oacc-async.c \
+ oacc-plugin.c oacc-cuda.c libgomp-plugin.c
nodist_noinst_HEADERS = libgomp_f.h
-nodist_libsubinclude_HEADERS = omp.h
+nodist_libsubinclude_HEADERS = omp.h openacc.h ../include/gomp-constants.h
@USE_FORTRAN_TRUE@nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod
LTLDFLAGS = $(shell $(SHELL) $(top_srcdir)/../libtool-ldflags $(LDFLAGS))
LINK = $(LIBTOOL) --tag CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \
@@ -444,6 +497,10 @@ clean-toolexeclibLTLIBRARIES:
echo "rm -f \"$${dir}/so_locations\""; \
rm -f "$${dir}/so_locations"; \
done
+libgomp-plugin-nonshm-host.la: $(libgomp_plugin_nonshm_host_la_OBJECTS) $(libgomp_plugin_nonshm_host_la_DEPENDENCIES)
+ $(libgomp_plugin_nonshm_host_la_LINK) -rpath $(toolexeclibdir) $(libgomp_plugin_nonshm_host_la_OBJECTS) $(libgomp_plugin_nonshm_host_la_LIBADD) $(LIBS)
+libgomp-plugin-nvptx.la: $(libgomp_plugin_nvptx_la_OBJECTS) $(libgomp_plugin_nvptx_la_DEPENDENCIES)
+ $(libgomp_plugin_nvptx_la_LINK) $(am_libgomp_plugin_nvptx_la_rpath) $(libgomp_plugin_nvptx_la_OBJECTS) $(libgomp_plugin_nvptx_la_LIBADD) $(LIBS)
libgomp.la: $(libgomp_la_OBJECTS) $(libgomp_la_DEPENDENCIES)
$(libgomp_la_LINK) -rpath $(toolexeclibdir) $(libgomp_la_OBJECTS) $(libgomp_la_LIBADD) $(LIBS)
@@ -463,10 +520,21 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fortran.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter_ull.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp-plugin.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_nonshm_host_la-oacc-host.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lock.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/loop.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/loop_ull.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mutex.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-async.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-cuda.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-fortran.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-host.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-init.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-mem.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-parallel.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-plugin.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ordered.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/parallel.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/proc.Plo@am__quote@
@@ -474,6 +542,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sections.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sem.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/single.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/splay-tree.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/target.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/task.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/team.Plo@am__quote@
@@ -501,6 +570,20 @@ distclean-compile:
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $<
+libgomp_plugin_nonshm_host_la-oacc-host.lo: oacc-host.c
+@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(libgomp_plugin_nonshm_host_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_nonshm_host_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libgomp_plugin_nonshm_host_la-oacc-host.lo -MD -MP -MF $(DEPDIR)/libgomp_plugin_nonshm_host_la-oacc-host.Tpo -c -o libgomp_plugin_nonshm_host_la-oacc-host.lo `test -f 'oacc-host.c' || echo '$(srcdir)/'`oacc-host.c
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/libgomp_plugin_nonshm_host_la-oacc-host.Tpo $(DEPDIR)/libgomp_plugin_nonshm_host_la-oacc-host.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='oacc-host.c' object='libgomp_plugin_nonshm_host_la-oacc-host.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(libgomp_plugin_nonshm_host_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_nonshm_host_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libgomp_plugin_nonshm_host_la-oacc-host.lo `test -f 'oacc-host.c' || echo '$(srcdir)/'`oacc-host.c
+
+libgomp_plugin_nvptx_la-plugin-nvptx.lo: plugin-nvptx.c
+@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(libgomp_plugin_nvptx_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_nvptx_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libgomp_plugin_nvptx_la-plugin-nvptx.lo -MD -MP -MF $(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Tpo -c -o libgomp_plugin_nvptx_la-plugin-nvptx.lo `test -f 'plugin-nvptx.c' || echo '$(srcdir)/'`plugin-nvptx.c
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Tpo $(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='plugin-nvptx.c' object='libgomp_plugin_nvptx_la-plugin-nvptx.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(libgomp_plugin_nvptx_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_nvptx_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libgomp_plugin_nvptx_la-plugin-nvptx.lo `test -f 'plugin-nvptx.c' || echo '$(srcdir)/'`plugin-nvptx.c
+
mostlyclean-libtool:
-rm -f *.lo
@@ -82,7 +82,7 @@
/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H
-/* Define to 1 if GNU symbol versioning is used for libgomp. */
+/* Define to 1 if GNU symbol versioning is used. */
#undef LIBGOMP_GNU_SYMBOL_VERSIONING
/* Define to the sub-directory in which libtool stores uninstalled libraries.
@@ -110,6 +110,9 @@
/* Define to the version of this package. */
#undef PACKAGE_VERSION
+/* Define to 1 if the NVIDIA plugin is built, 0 if not. */
+#undef PLUGIN_NVPTX
+
/* Define if all infrastructure, needed for plugins, is supported. */
#undef PLUGIN_SUPPORT
@@ -627,6 +627,12 @@ LIBGOMP_BUILD_VERSIONED_SHLIB_FALSE
LIBGOMP_BUILD_VERSIONED_SHLIB_TRUE
OPT_LDFLAGS
SECTION_LDFLAGS
+PLUGIN_NVPTX_FALSE
+PLUGIN_NVPTX_TRUE
+PLUGIN_NVPTX_LIBS
+PLUGIN_NVPTX_LDFLAGS
+PLUGIN_NVPTX_CPPFLAGS
+PLUGIN_NVPTX
libtool_VERSION
ac_ct_FC
FCFLAGS
@@ -758,6 +764,9 @@ ac_user_opts='
enable_option_checking
enable_version_specific_runtime_libs
enable_generated_files_in_srcdir
+with_cuda_driver
+with_cuda_driver_include
+with_cuda_driver_lib
enable_multilib
enable_dependency_tracking
enable_shared
@@ -1425,6 +1434,16 @@ Optional Features:
Optional Packages:
--with-PACKAGE[=ARG] use PACKAGE [ARG=yes]
--without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no)
+ --with-cuda-driver=PATH specify prefix directory for installed CUDA driver
+ package. Equivalent to
+ --with-cuda-driver-include=PATH/include plus
+ --with-cuda-driver-lib=PATH/lib
+ --with-cuda-driver-include=PATH
+ specify directory for installed CUDA driver include
+ files
+ --with-cuda-driver-lib=PATH
+ specify directory for the installed CUDA driver
+ library
--with-pic try to use only PIC/non-PIC objects [default=use
both]
--with-gnu-ld assume the C compiler uses GNU ld [default=no]
@@ -2596,6 +2615,38 @@ else
fi
+# Look for the CUDA driver package.
+CUDA_DRIVER_CPPFLAGS=
+CUDA_DRIVER_LDFLAGS=
+
+# Check whether --with-cuda-driver was given.
+if test "${with_cuda_driver+set}" = set; then :
+ withval=$with_cuda_driver;
+fi
+
+
+# Check whether --with-cuda-driver-include was given.
+if test "${with_cuda_driver_include+set}" = set; then :
+ withval=$with_cuda_driver_include;
+fi
+
+
+# Check whether --with-cuda-driver-lib was given.
+if test "${with_cuda_driver_lib+set}" = set; then :
+ withval=$with_cuda_driver_lib;
+fi
+
+if test "x$with_cuda_driver" != x; then
+ CUDA_DRIVER_CPPFLAGS=-I$with_cuda_driver/include
+ CUDA_DRIVER_LDFLAGS=-L$with_cuda_driver/lib
+fi
+if test "x$with_cuda_driver_include" != x; then
+ CUDA_DRIVER_CPPFLAGS=-I$with_cuda_driver_include
+fi
+if test "x$with_cuda_driver_lib" != x; then
+ CUDA_DRIVER_LDFLAGS=-L$with_cuda_driver_lib
+fi
+
# -------
# -------
@@ -11094,7 +11145,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<_LT_EOF
-#line 11097 "configure"
+#line 11148 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
@@ -11200,7 +11251,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<_LT_EOF
-#line 11203 "configure"
+#line 11254 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
@@ -15009,6 +15060,7 @@ ac_config_commands="$ac_config_commands gstdint.h"
+# TODO: not for OpenACC?
# Check to see if -pthread or -lpthread is needed. Prefer the former.
# In case the pthread.h system header is not found, this test will fail.
XPCFLAGS=""
@@ -15113,7 +15165,78 @@ if test x$plugin_support = xyes; then
$as_echo "#define PLUGIN_SUPPORT 1" >>confdefs.h
+elif test "x$enable_accelerator" != xno; then
+ as_fn_error "Can't have support for accelerators without support for plugins" "$LINENO" 5
+fi
+
+PLUGIN_NVPTX=0
+PLUGIN_NVPTX_CPPFLAGS=
+PLUGIN_NVPTX_LDFLAGS=
+PLUGIN_NVPTX_LIBS=
+
+
+
+
+# enable_accelerator has already been validated at top level.
+# No need to do it again.
+case $enable_accelerator in
+ auto-nvptx*|nvptx*)
+ PLUGIN_NVPTX=$enable_accelerator
+ PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS
+ PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS
+ PLUGIN_NVPTX_LIBS='-lcuda'
+
+ PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS
+ CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS"
+ PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS
+ LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS"
+ PLUGIN_NVPTX_save_LIBS=$LIBS
+ LIBS="$PLUGIN_NVPTX_LIBS $LIBS"
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include "cuda.h"
+int
+main ()
+{
+CUresult r = cuCtxPushCurrent (NULL);
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ PLUGIN_NVPTX=1
fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+ CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS
+ LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS
+ LIBS=$PLUGIN_NVPTX_save_LIBS
+ case $PLUGIN_NVPTX in
+ auto-nvptx*)
+ PLUGIN_NVPTX=0
+ { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: CUDA driver package required for nvptx support; disabling" >&5
+$as_echo "$as_me: WARNING: CUDA driver package required for nvptx support; disabling" >&2;}
+ ;;
+ nvptx*)
+ PLUGIN_NVPTX=0
+ as_fn_error "CUDA driver package required for nvptx support" "$LINENO" 5
+ ;;
+ esac
+ ;;
+esac
+ if test $PLUGIN_NVPTX = 1; then
+ PLUGIN_NVPTX_TRUE=
+ PLUGIN_NVPTX_FALSE='#'
+else
+ PLUGIN_NVPTX_TRUE='#'
+ PLUGIN_NVPTX_FALSE=
+fi
+
+
+cat >>confdefs.h <<_ACEOF
+#define PLUGIN_NVPTX $PLUGIN_NVPTX
+_ACEOF
+
# Check for functions needed.
for ac_func in getloadavg clock_gettime strtoull
@@ -15278,6 +15401,7 @@ fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
+# TODO: not for OpenACC?
# At least for glibc, clock_gettime is in librt. But don't pull that
# in if it still doesn't give us the function we want.
if test $ac_cv_func_clock_gettime = no; then
@@ -16127,6 +16251,7 @@ $as_echo "#define HAVE_SYNC_BUILTINS 1" >>confdefs.h
fi
+# TODO: not for OpenACC?
XCFLAGS="$XCFLAGS$XPCFLAGS"
@@ -16241,6 +16366,7 @@ fi
# the underscore here and update the PREREQ. If it doesn't, then we'll
# need to copy this macro to our acinclude.m4.
save_CFLAGS="$CFLAGS"
+# TODO: not for OpenACC?
for i in $config_path; do
if test -f $srcdir/config/$i/omp-lock.h; then
CFLAGS="$CFLAGS -include confdefs.h -include $srcdir/config/$i/omp-lock.h"
@@ -16458,6 +16584,10 @@ if test -z "${MAINTAINER_MODE_TRUE}" && test -z "${MAINTAINER_MODE_FALSE}"; then
as_fn_error "conditional \"MAINTAINER_MODE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
+if test -z "${PLUGIN_NVPTX_TRUE}" && test -z "${PLUGIN_NVPTX_FALSE}"; then
+ as_fn_error "conditional \"PLUGIN_NVPTX\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
if test -z "${LIBGOMP_BUILD_VERSIONED_SHLIB_TRUE}" && test -z "${LIBGOMP_BUILD_VERSIONED_SHLIB_FALSE}"; then
as_fn_error "conditional \"LIBGOMP_BUILD_VERSIONED_SHLIB\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -2,6 +2,8 @@
# aclocal -I ../config && autoconf && autoheader && automake
AC_PREREQ(2.64)
+#TODO: Update for OpenACC? But then also have to update copyright notices in
+#all source files...
AC_INIT([GNU OpenMP Runtime Library], 1.0,,[libgomp])
AC_CONFIG_HEADER(config.h)
@@ -28,6 +30,31 @@ LIBGOMP_ENABLE(generated-files-in-srcdir, no, ,
AC_MSG_RESULT($enable_generated_files_in_srcdir)
AM_CONDITIONAL(GENINSRC, test "$enable_generated_files_in_srcdir" = yes)
+# Look for the CUDA driver package.
+CUDA_DRIVER_CPPFLAGS=
+CUDA_DRIVER_LDFLAGS=
+AC_ARG_WITH(cuda-driver,
+ [AS_HELP_STRING([--with-cuda-driver=PATH],
+ [specify prefix directory for installed CUDA driver package.
+ Equivalent to --with-cuda-driver-include=PATH/include
+ plus --with-cuda-driver-lib=PATH/lib])])
+AC_ARG_WITH(cuda-driver-include,
+ [AS_HELP_STRING([--with-cuda-driver-include=PATH],
+ [specify directory for installed CUDA driver include files])])
+AC_ARG_WITH(cuda-driver-lib,
+ [AS_HELP_STRING([--with-cuda-driver-lib=PATH],
+ [specify directory for the installed CUDA driver library])])
+if test "x$with_cuda_driver" != x; then
+ CUDA_DRIVER_CPPFLAGS=-I$with_cuda_driver/include
+ CUDA_DRIVER_LDFLAGS=-L$with_cuda_driver/lib
+fi
+if test "x$with_cuda_driver_include" != x; then
+ CUDA_DRIVER_CPPFLAGS=-I$with_cuda_driver_include
+fi
+if test "x$with_cuda_driver_lib" != x; then
+ CUDA_DRIVER_LDFLAGS=-L$with_cuda_driver_lib
+fi
+
# -------
# -------
@@ -174,6 +201,7 @@ AC_CHECK_HEADERS(unistd.h semaphore.h sys/loadavg.h sys/time.h sys/time.h)
GCC_HEADER_STDINT(gstdint.h)
+# TODO: not for OpenACC?
# Check to see if -pthread or -lpthread is needed. Prefer the former.
# In case the pthread.h system header is not found, this test will fail.
XPCFLAGS=""
@@ -200,8 +228,57 @@ AC_CHECK_HEADER(dirent.h, , [plugin_support=no])
if test x$plugin_support = xyes; then
AC_DEFINE(PLUGIN_SUPPORT, 1,
[Define if all infrastructure, needed for plugins, is supported.])
+elif test "x$enable_accelerator" != xno; then
+ AC_MSG_ERROR([Can't have support for accelerators without support for plugins])
fi
+PLUGIN_NVPTX=0
+PLUGIN_NVPTX_CPPFLAGS=
+PLUGIN_NVPTX_LDFLAGS=
+PLUGIN_NVPTX_LIBS=
+AC_SUBST(PLUGIN_NVPTX)
+AC_SUBST(PLUGIN_NVPTX_CPPFLAGS)
+AC_SUBST(PLUGIN_NVPTX_LDFLAGS)
+AC_SUBST(PLUGIN_NVPTX_LIBS)
+# enable_accelerator has already been validated at top level.
+# No need to do it again.
+case $enable_accelerator in
+ auto-nvptx*|nvptx*)
+ PLUGIN_NVPTX=$enable_accelerator
+ PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS
+ PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS
+ PLUGIN_NVPTX_LIBS='-lcuda'
+
+ PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS
+ CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS"
+ PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS
+ LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS"
+ PLUGIN_NVPTX_save_LIBS=$LIBS
+ LIBS="$PLUGIN_NVPTX_LIBS $LIBS"
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM(
+ [#include "cuda.h"],
+ [CUresult r = cuCtxPushCurrent (NULL);])],
+ [PLUGIN_NVPTX=1])
+ CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS
+ LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS
+ LIBS=$PLUGIN_NVPTX_save_LIBS
+ case $PLUGIN_NVPTX in
+ auto-nvptx*)
+ PLUGIN_NVPTX=0
+ AC_MSG_WARN([CUDA driver package required for nvptx support; disabling])
+ ;;
+ nvptx*)
+ PLUGIN_NVPTX=0
+ AC_MSG_ERROR([CUDA driver package required for nvptx support])
+ ;;
+ esac
+ ;;
+esac
+AM_CONDITIONAL([PLUGIN_NVPTX], [test $PLUGIN_NVPTX = 1])
+AC_DEFINE_UNQUOTED([PLUGIN_NVPTX], [$PLUGIN_NVPTX],
+ [Define to 1 if the NVIDIA plugin is built, 0 if not.])
+
# Check for functions needed.
AC_CHECK_FUNCS(getloadavg clock_gettime strtoull)
@@ -235,6 +312,7 @@ AC_LINK_IFELSE(
AC_DEFINE(HAVE_PTHREAD_AFFINITY_NP, 1,
[ Define if pthread_{,attr_}{g,s}etaffinity_np is supported.]))
+# TODO: not for OpenACC?
# At least for glibc, clock_gettime is in librt. But don't pull that
# in if it still doesn't give us the function we want.
if test $ac_cv_func_clock_gettime = no; then
@@ -255,7 +333,7 @@ LIBGOMP_ENABLE_SYMVERS
if test $enable_symvers = gnu; then
AC_DEFINE(LIBGOMP_GNU_SYMBOL_VERSIONING, 1,
- [Define to 1 if GNU symbol versioning is used for libgomp.])
+ [Define to 1 if GNU symbol versioning is used.])
fi
# Get target configury.
@@ -266,6 +344,7 @@ CFLAGS="$save_CFLAGS $XCFLAGS"
# had a chance to set XCFLAGS.
LIBGOMP_CHECK_SYNC_BUILTINS
+# TODO: not for OpenACC?
XCFLAGS="$XCFLAGS$XPCFLAGS"
AC_SUBST(config_path)
@@ -300,6 +379,7 @@ AM_CONDITIONAL([USE_FORTRAN], [test "$ac_cv_fc_compiler_gnu" = yes])
# the underscore here and update the PREREQ. If it doesn't, then we'll
# need to copy this macro to our acinclude.m4.
save_CFLAGS="$CFLAGS"
+# TODO: not for OpenACC?
for i in $config_path; do
if test -f $srcdir/config/$i/omp-lock.h; then
CFLAGS="$CFLAGS -include confdefs.h -include $srcdir/config/$i/omp-lock.h"
@@ -116,12 +116,14 @@ case "${target}" in
case "${target}" in
*-*-hpux11*)
# HPUX v11.x requires -lrt to resolve sem_init in libgomp.la
+ # TODO: not for OpenACC?
XLDFLAGS="${XLDFLAGS} -lrt"
;;
esac
case "${target}" in
hppa[12]*-*-hpux*)
# PA 32 HP-UX needs -frandom-seed for bootstrap compare.
+ # TODO: not for OpenACC?
XCFLAGS="${XCFLAGS} -frandom-seed=fixed-seed"
;;
esac
@@ -137,6 +139,7 @@ case "${target}" in
*-*-freebsd*)
# Need to link with -lpthread so libgomp.so is self-contained.
+ # TODO: not for OpenACC?
XLDFLAGS="${XLDFLAGS} -lpthread"
;;
@@ -27,6 +27,7 @@
#include "libgomp.h"
#include "libgomp_f.h"
+#include "target.h"
#include <ctype.h>
#include <stdlib.h>
#include <stdio.h>
@@ -77,6 +78,9 @@ unsigned long gomp_bind_var_list_len;
void **gomp_places_list;
unsigned long gomp_places_list_len;
+int goacc_device_num;
+char* goacc_device_type;
+
/* Parse the OMP_SCHEDULE environment variable. */
static void
@@ -1013,6 +1017,37 @@ parse_affinity (bool ignore)
static void
+goacc_parse_device_num (void)
+{
+ const char *env = getenv ("ACC_DEVICE_NUM");
+ int default_num = -1;
+
+ if (env && *env != '\0')
+ {
+ char *end;
+ default_num = strtol (env, &end, 0);
+
+ if (*end || default_num < 0)
+ default_num = 0;
+ }
+ else
+ default_num = 0;
+
+ goacc_device_num = default_num;
+}
+
+static void
+goacc_parse_device_type (void)
+{
+ const char *env = getenv ("ACC_DEVICE_TYPE");
+
+ if (env && *env != '\0')
+ goacc_device_type = strdup (env);
+ else
+ goacc_device_type = NULL;
+}
+
+static void
handle_omp_display_env (unsigned long stacksize, int wait_policy)
{
const char *env;
@@ -1181,6 +1216,7 @@ initialize_env (void)
gomp_global_icv.thread_limit_var
= thread_limit_var > INT_MAX ? UINT_MAX : thread_limit_var;
}
+ parse_int ("GCC_ACC_NOTIFY", &gomp_global_icv.acc_notify_var, true);
#ifndef HAVE_SYNC_BUILTINS
gomp_mutex_init (&gomp_managed_threads_lock);
#endif
@@ -1271,6 +1307,13 @@ initialize_env (void)
}
handle_omp_display_env (stacksize, wait_policy);
+
+ /* Look for OpenACC-specific environment variables. */
+ goacc_parse_device_num ();
+ goacc_parse_device_type ();
+
+ /* Initialize OpenACC-specific internal state. */
+ ACC_runtime_initialize ();
}
@@ -35,7 +35,7 @@
#include <stdlib.h>
-static void
+void
gomp_verror (const char *fmt, va_list list)
{
fputs ("\nlibgomp: ", stderr);
@@ -54,13 +54,40 @@ gomp_error (const char *fmt, ...)
}
void
+gomp_vfatal (const char *fmt, va_list list)
+{
+ gomp_verror (fmt, list);
+ exit (EXIT_FAILURE);
+}
+
+void
gomp_fatal (const char *fmt, ...)
{
va_list list;
va_start (list, fmt);
- gomp_verror (fmt, list);
+ gomp_vfatal (fmt, list);
va_end (list);
- exit (EXIT_FAILURE);
+ /* Unreachable. */
+ abort ();
+}
+
+void
+gomp_vnotify (const char *msg, va_list list)
+{
+ struct gomp_task_icv *icv = gomp_icv (false);
+ if (icv->acc_notify_var)
+ vfprintf (stderr, msg, list);
+}
+
+void
+gomp_notify(const char *msg, ...)
+{
+ va_list list;
+
+ va_start (list, msg);
+ gomp_vnotify (msg, list);
+ va_end (list);
}
+
new file mode 100644
@@ -0,0 +1,106 @@
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+ Contributed by CodeSourcery.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Exported (non-hidden) functions exposing libgomp interface for plugins. */
+
+#include <stdlib.h>
+
+#include "libgomp.h"
+#include "libgomp-plugin.h"
+#include "target.h"
+
+void *
+gomp_plugin_malloc (size_t size)
+{
+ return gomp_malloc (size);
+}
+
+void *
+gomp_plugin_malloc_cleared (size_t size)
+{
+ return gomp_malloc_cleared (size);
+}
+
+void *
+gomp_plugin_realloc (void *ptr, size_t size)
+{
+ return gomp_realloc (ptr, size);
+}
+
+void
+gomp_plugin_error (const char *msg, ...)
+{
+ va_list ap;
+
+ va_start (ap, msg);
+ gomp_verror (msg, ap);
+ va_end (ap);
+}
+
+void
+gomp_plugin_notify (const char *msg, ...)
+{
+ va_list ap;
+
+ va_start (ap, msg);
+ gomp_vnotify (msg, ap);
+ va_end (ap);
+}
+
+void
+gomp_plugin_fatal (const char *msg, ...)
+{
+ va_list ap;
+
+ va_start (ap, msg);
+ gomp_vfatal (msg, ap);
+ va_end (ap);
+
+ /* Unreachable. */
+ abort ();
+}
+
+void
+gomp_plugin_mutex_init (gomp_mutex_t *mutex)
+{
+ gomp_mutex_init (mutex);
+}
+
+void
+gomp_plugin_mutex_destroy (gomp_mutex_t *mutex)
+{
+ gomp_mutex_destroy (mutex);
+}
+
+void
+gomp_plugin_mutex_lock (gomp_mutex_t *mutex)
+{
+ gomp_mutex_lock (mutex);
+}
+
+void
+gomp_plugin_mutex_unlock (gomp_mutex_t *mutex)
+{
+ gomp_mutex_unlock (mutex);
+}
new file mode 100644
@@ -0,0 +1,57 @@
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+ Contributed by CodeSourcery.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* An interface to various libgomp-internal functions for use by plugins. */
+
+#ifndef LIBGOMP_PLUGIN_H
+#define LIBGOMP_PLUGIN_H 1
+
+#include "mutex.h"
+
+/* alloc.c */
+
+extern void *gomp_plugin_malloc (size_t) __attribute__((malloc));
+extern void *gomp_plugin_malloc_cleared (size_t) __attribute__((malloc));
+extern void *gomp_plugin_realloc (void *, size_t);
+
+/* error.c */
+
+extern void gomp_plugin_notify(const char *msg, ...);
+extern void gomp_plugin_error (const char *, ...)
+ __attribute__((format (printf, 1, 2)));
+extern void gomp_plugin_fatal (const char *, ...)
+ __attribute__((noreturn, format (printf, 1, 2)));
+
+/* mutex.c */
+
+extern void gomp_plugin_mutex_init (gomp_mutex_t *mutex);
+extern void gomp_plugin_mutex_destroy (gomp_mutex_t *mutex);
+extern void gomp_plugin_mutex_lock (gomp_mutex_t *mutex);
+extern void gomp_plugin_mutex_unlock (gomp_mutex_t *mutex);
+
+/* target.c */
+
+extern void gomp_plugin_async_unmap_vars (void *ptr);
+
+#endif
@@ -40,6 +40,7 @@
#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>
+#include <stdarg.h>
#ifdef HAVE_ATTRIBUTE_VISIBILITY
# pragma GCC visibility push(hidden)
@@ -220,6 +221,7 @@ struct gomp_team_state
};
struct target_mem_desc;
+struct gomp_memory_mapping;
/* These are the OpenMP 4.0 Internal Control Variables described in
section 2.3.1. Those described as having one copy per task are
@@ -236,6 +238,7 @@ struct gomp_task_icv
bool dyn_var;
bool nest_var;
char bind_var;
+ int acc_notify_var;
/* Internal ICV. */
struct target_mem_desc *target_data;
};
@@ -254,6 +257,9 @@ extern unsigned long gomp_bind_var_list_len;
extern void **gomp_places_list;
extern unsigned long gomp_places_list_len;
+extern int goacc_device_num;
+extern char* goacc_device_type;
+
enum gomp_task_kind
{
GOMP_TASK_IMPLICIT,
@@ -532,8 +538,12 @@ extern void *gomp_realloc (void *, size_t);
/* error.c */
+extern void gomp_vnotify (const char *, va_list);
+extern void gomp_notify(const char *msg, ...);
+extern void gomp_verror (const char *, va_list);
extern void gomp_error (const char *, ...)
__attribute__((format (printf, 1, 2)));
+extern void gomp_vfatal (const char *, va_list);
extern void gomp_fatal (const char *, ...)
__attribute__((noreturn, format (printf, 1, 2)));
@@ -610,6 +620,7 @@ extern int gomp_get_num_devices (void);
/* target.c */
+extern void gomp_init_targets_once (void);
extern int gomp_get_num_devices (void);
/* work.c */
@@ -232,3 +232,66 @@ GOMP_4.0.1 {
global:
GOMP_offload_register;
} GOMP_4.0;
+
+OACC_2.0 {
+ global:
+ acc_get_num_devices;
+ acc_get_num_devices_;
+ acc_set_device_type;
+ acc_set_device_type_;
+ acc_get_device_type;
+ acc_get_device_type_;
+ acc_set_device_num;
+ acc_set_device_num_;
+ acc_get_device_num;
+ acc_get_device_num_;
+ acc_init;
+ acc_init_;
+ acc_shutdown;
+ acc_shutdown_;
+ acc_on_device;
+ acc_on_device_;
+ acc_malloc;
+ acc_free;
+ acc_copyin;
+ acc_present_or_copyin;
+ acc_create;
+ acc_present_or_create;
+ acc_copyout;
+ acc_delete;
+ acc_update_device;
+ acc_update_self;
+ acc_map_data;
+ acc_unmap_data;
+ acc_deviceptr;
+ acc_hostptr;
+ acc_is_present;
+ acc_memcpy_to_device;
+ acc_memcpy_from_device;
+ acc_async_test;
+ acc_async_test_all;
+ acc_wait;
+ acc_wait_async;
+ acc_wait_all;
+ acc_wait_all_async;
+ acc_get_current_cuda_device;
+ acc_get_current_cuda_context;
+ acc_get_cuda_stream;
+ acc_set_cuda_stream;
+};
+
+# FIXME: Hygiene/grouping/naming?
+PLUGIN_1.0 {
+ global:
+ gomp_plugin_malloc;
+ gomp_plugin_malloc_cleared;
+ gomp_plugin_realloc;
+ gomp_plugin_error;
+ gomp_plugin_notify;
+ gomp_plugin_fatal;
+ gomp_plugin_mutex_init;
+ gomp_plugin_mutex_destroy;
+ gomp_plugin_mutex_lock;
+ gomp_plugin_mutex_unlock;
+ gomp_plugin_async_unmap_vars;
+};
@@ -214,4 +214,17 @@ extern void GOMP_target_update (int, const void *,
size_t, void **, size_t *, unsigned char *);
extern void GOMP_teams (unsigned int, unsigned int);
+/* oacc-parallel.c */
+
+extern void GOACC_data_start (int, const void *,
+ size_t, void **, size_t *, unsigned short *);
+extern void GOACC_data_end (void);
+extern void GOACC_kernels (int, void (*) (void *), const void *,
+ size_t, void **, size_t *, unsigned short *,
+ int, int, int, int, int, ...);
+extern void GOACC_parallel (int, void (*) (void *), const void *,
+ size_t, void **, size_t *, unsigned short *,
+ int, int, int, int, int, ...);
+extern void GOACC_wait (int, int, ...);
+
#endif /* LIBGOMP_G_H */
new file mode 100644
@@ -0,0 +1,80 @@
+/* OpenACC Runtime Library Definitions.
+
+ Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+ Contributed by Nathan Sidwell <nathan@codesourcery.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#include "openacc.h"
+#include "libgomp.h"
+#include "target.h"
+
+int
+acc_async_test (int async)
+{
+ if (async < acc_async_sync)
+ gomp_fatal ("invalid async argument: %d", async);
+
+ return ACC_dev->openacc.async_test_func (async);
+}
+
+int
+acc_async_test_all (void)
+{
+ return ACC_dev->openacc.async_test_all_func ();
+}
+
+void
+acc_wait (int async)
+{
+ if (async < acc_async_sync)
+ gomp_fatal ("invalid async argument: %d", async);
+
+ ACC_dev->openacc.async_wait_func (async);
+ return;
+}
+
+void
+acc_wait_async (int async1, int async2)
+{
+ ACC_dev->openacc.async_wait_async_func (async1, async2);
+ return;
+}
+
+void
+acc_wait_all (void)
+{
+ ACC_dev->openacc.async_wait_all_func ();
+ return;
+}
+
+void
+acc_wait_all_async (int async)
+{
+ if (async < acc_async_sync)
+ gomp_fatal ("invalid async argument: %d", async);
+
+ ACC_dev->openacc.async_wait_all_async_func (async);
+ return;
+}
new file mode 100644
@@ -0,0 +1,81 @@
+/* OpenACC Runtime Library: CUDA support glue.
+
+ Copyright (C) 2014 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "openacc.h"
+#include "config.h"
+#include "libgomp.h"
+#include "target.h"
+
+void *
+acc_get_current_cuda_device (void)
+{
+ void *p = NULL;
+
+ if (ACC_dev && ACC_dev->openacc.cuda.get_current_device_func)
+ p = ACC_dev->openacc.cuda.get_current_device_func ();
+
+ return p;
+}
+
+void *
+acc_get_current_cuda_context (void)
+{
+ void *p = NULL;
+
+ if (ACC_dev && ACC_dev->openacc.cuda.get_current_context_func)
+ p = ACC_dev->openacc.cuda.get_current_context_func ();
+
+ return p;
+}
+
+void *
+acc_get_cuda_stream (int async)
+{
+ void *p = NULL;
+
+ if (async < 0)
+ return p;
+
+ if (ACC_dev && ACC_dev->openacc.cuda.get_stream_func)
+ p = ACC_dev->openacc.cuda.get_stream_func (async);
+
+ return p;
+}
+
+int
+acc_set_cuda_stream (int async, void *stream)
+{
+ int s = -1;
+
+ if (async < 0 || stream == NULL)
+ return 0;
+
+ if (ACC_dev && ACC_dev->openacc.cuda.set_stream_func)
+ s = ACC_dev->openacc.cuda.set_stream_func (async, stream);
+
+ return s;
+}
new file mode 100644
@@ -0,0 +1,89 @@
+/* Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+ Contributed by Thomas Schwinge <thomas@codesourcery.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file contains Fortran wrapper routines. */
+
+#include "openacc.h"
+#include <stdint.h>
+
+#ifdef HAVE_ATTRIBUTE_ALIAS
+/* Use internal aliases if possible. */
+ialias_redirect (acc_get_num_devices)
+ialias_redirect (acc_set_device_type)
+ialias_redirect (acc_get_device_type)
+ialias_redirect (acc_set_device_num)
+ialias_redirect (acc_get_device_num)
+ialias_redirect (acc_init)
+ialias_redirect (acc_shutdown)
+ialias_redirect (acc_on_device)
+#endif
+
+int32_t
+acc_get_num_devices_ (const int32_t *dev)
+{
+ return acc_get_num_devices (*dev);
+}
+
+void
+acc_set_device_type_ (const int32_t *dev)
+{
+ acc_set_device_type (*dev);
+}
+
+int32_t
+acc_get_device_type_ (void)
+{
+ return acc_get_device_type ();
+}
+
+void
+acc_set_device_num_ (const int32_t *num, const int32_t *dev)
+{
+ acc_set_device_num (*num, *dev);
+}
+
+int32_t
+acc_get_device_num_ (const int32_t *dev)
+{
+ return acc_get_device_num (*dev);
+}
+
+void
+acc_init_ (const int32_t *dev)
+{
+ acc_init (*dev);
+}
+
+void
+acc_shutdown_ (const int32_t *dev)
+{
+ acc_shutdown (*dev);
+}
+
+int32_t
+acc_on_device_ (const acc_device_t *dev)
+{
+ return acc_on_device (*dev);
+}
new file mode 100644
@@ -0,0 +1,416 @@
+/* OpenACC Runtime Library: acc_device_host.
+
+ Copyright (C) 2013 Free Software Foundation, Inc.
+
+ Contributed by Thomas Schwinge <thomas@codesourcery.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Simple implementation of support routines for a shared-memory
+ acc_device_host, and a non-shared memory acc_device_nonshm_host, with the
+ latter built as a plugin. */
+
+#include "openacc.h"
+#include "config.h"
+#include "libgomp.h"
+#include "target.h"
+#ifdef NONSHM_HOST_PLUGIN
+#include "libgomp-plugin.h"
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#undef DEBUG
+
+#ifdef NONSHM_HOST_PLUGIN
+#define STATIC
+#define GOMP(X) gomp_plugin_##X
+#define SELF "non-SHM host plugin: "
+#else
+#define STATIC static
+#define GOMP(X) gomp_##X
+#define SELF "host: "
+#endif
+
+#ifndef NONSHM_HOST_PLUGIN
+static struct gomp_device_descr host_dispatch;
+#endif
+
+STATIC const char *
+get_name (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+#ifdef NONSHM_HOST_PLUGIN
+ return "nonshm-host";
+#else
+ return "host";
+#endif
+}
+
+STATIC int
+get_type (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+#ifdef NONSHM_HOST_PLUGIN
+ return TARGET_TYPE_NONSHM_HOST;
+#else
+ return TARGET_TYPE_HOST;
+#endif
+}
+
+STATIC unsigned int
+get_caps (void)
+{
+ unsigned int caps = TARGET_CAP_OPENACC_200 | TARGET_CAP_OPENMP_400
+ | TARGET_CAP_NATIVE_EXEC;
+
+#ifndef NONSHM_HOST_PLUGIN
+ caps |= TARGET_CAP_SHARED_MEM;
+#endif
+
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s: 0x%x\n", __FILE__, __FUNCTION__, caps);
+#endif
+
+ return caps;
+}
+
+STATIC int
+get_num_devices (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+ return 1;
+}
+
+STATIC void
+offload_register (void *host_table, void *target_data)
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s (%p, %p)\n", __FILE__, __FUNCTION__, host_table,
+ target_data);
+#endif
+}
+
+STATIC int
+device_init (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+ return get_num_devices ();
+}
+
+STATIC int
+device_fini (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+ return 0;
+}
+
+STATIC int
+device_get_table (void *table)
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s (%p)\n", __FILE__, __FUNCTION__, table);
+#endif
+
+ return 0;
+}
+
+STATIC bool
+openacc_avail (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+ return 1;
+}
+
+STATIC void *
+openacc_open_device (int n)
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s (%u)\n", __FILE__, __FUNCTION__, n);
+#endif
+
+ return (void *) (intptr_t) n;
+}
+
+STATIC int
+openacc_close_device (void *hnd)
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s (%p)\n", __FILE__, __FUNCTION__, hnd);
+#endif
+
+ return 0;
+}
+
+STATIC int
+openacc_get_device_num (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+ return 0;
+}
+
+STATIC void
+openacc_set_device_num (int n)
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s (%u)\n", __FILE__, __FUNCTION__, n);
+#endif
+
+ if (n > 0)
+ GOMP(fatal) ("device number %u out of range for host execution", n);
+}
+
+STATIC void *
+device_alloc (size_t s)
+{
+ void *ptr = GOMP(malloc) (s);
+
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s (%zd): %p\n", __FILE__, __FUNCTION__, s, ptr);
+#endif
+
+ return ptr;
+}
+
+STATIC void
+device_free (void *p)
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s (%p)\n", __FILE__, __FUNCTION__, p);
+#endif
+
+ free (p);
+}
+
+STATIC void *
+device_host2dev (void *d, const void *h, size_t s)
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s (%p, %p, %zd)\n", __FILE__, __FUNCTION__, d, h,
+ s);
+#endif
+
+ memcpy (d, h, s);
+
+ return 0;
+}
+
+STATIC void *
+device_dev2host (void *h, const void *d, size_t s)
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s (%p, %p, %zd)\n", __FILE__, __FUNCTION__, h, d,
+ s);
+#endif
+
+ memcpy (h, d, s);
+
+ return 0;
+}
+
+STATIC void
+device_run (void *fn_ptr, void *vars)
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s (%p, %p)\n", __FILE__, __FUNCTION__, fn_ptr,
+ vars);
+#endif
+
+ void (*fn)(void *) = (void (*)(void *)) fn_ptr;
+
+ fn (vars);
+}
+
+STATIC void
+openacc_parallel (void (*fn) (void *), size_t mapnum __attribute__((unused)),
+ void **hostaddrs, void **devaddrs __attribute__((unused)),
+ size_t *sizes __attribute__((unused)),
+ unsigned short *kinds __attribute__((unused)),
+ int num_gangs __attribute__((unused)),
+ int num_workers __attribute__((unused)),
+ int vector_length __attribute__((unused)),
+ int async __attribute__((unused)),
+ void *targ_mem_desc __attribute__((unused)))
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s (%p, %zu, %p, %p, %p, %d, %d, %d, %d, %p)\n",
+ __FILE__, __FUNCTION__, fn, mapnum, hostaddrs, sizes, kinds,
+ num_gangs, num_workers, vector_length, async, targ_mem_desc);
+#endif
+
+ fn (hostaddrs);
+}
+
+STATIC void
+openacc_async_set_async (int async __attribute__((unused)))
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s (%d)\n", __FILE__, __FUNCTION__, async);
+#endif
+}
+
+STATIC int
+openacc_async_test (int async __attribute__((unused)))
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s (%d)\n", __FILE__, __FUNCTION__, async);
+#endif
+
+ return 1;
+}
+
+STATIC int
+openacc_async_test_all (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+ return 1;
+}
+
+STATIC void
+openacc_async_wait (int async __attribute__((unused)))
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s (%d)\n", __FILE__, __FUNCTION__, async);
+#endif
+}
+
+STATIC void
+openacc_async_wait_all (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s\n", __FILE__, __FUNCTION__);
+#endif
+}
+
+STATIC void
+openacc_async_wait_async (int async1 __attribute__((unused)),
+ int async2 __attribute__((unused)))
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s (%d, %d)\n", __FILE__, __FUNCTION__, async1,
+ async2);
+#endif
+}
+
+STATIC void
+openacc_async_wait_all_async (int async __attribute__((unused)))
+{
+#ifdef DEBUG
+ fprintf (stderr, SELF "%s:%s (%d)\n", __FILE__, __FUNCTION__, async);
+#endif
+}
+
+#ifndef NONSHM_HOST_PLUGIN
+static struct gomp_device_descr host_dispatch =
+ {
+ .name = "host",
+
+ .type = TARGET_TYPE_HOST,
+ .capabilities = TARGET_CAP_OPENACC_200 | TARGET_CAP_NATIVE_EXEC
+ | TARGET_CAP_SHARED_MEM,
+ .id = 0,
+
+ .is_initialized = false,
+ .offload_regions_registered = false,
+
+ .get_name_func = get_name,
+ .get_type_func = get_type,
+ .get_caps_func = get_caps,
+
+ .device_init_func = device_init,
+ .device_fini_func = device_fini,
+ .get_num_devices_func = get_num_devices,
+ .offload_register_func = offload_register,
+ .device_get_table_func = device_get_table,
+
+ .device_alloc_func = device_alloc,
+ .device_free_func = device_free,
+ .device_host2dev_func = device_host2dev,
+ .device_dev2host_func = device_dev2host,
+
+ .device_run_func = device_run,
+
+ .openacc = {
+ .open_device_func = openacc_open_device,
+ .close_device_func = openacc_close_device,
+
+ .get_device_num_func = openacc_get_device_num,
+ .set_device_num_func = openacc_set_device_num,
+
+ /* Device available. */
+ .avail_func = openacc_avail,
+
+ .exec_func = openacc_parallel,
+
+ .async_set_async_func = openacc_async_set_async,
+ .async_test_func = openacc_async_test,
+ .async_test_all_func = openacc_async_test_all,
+ .async_wait_func = openacc_async_wait,
+ .async_wait_async_func = openacc_async_wait_async,
+ .async_wait_all_func = openacc_async_wait_all,
+ .async_wait_all_async_func = openacc_async_wait_all_async,
+
+ .cuda = {
+ .get_current_device_func = NULL,
+ .get_current_context_func = NULL,
+ .get_stream_func = NULL,
+ .set_stream_func = NULL,
+ }
+ }
+ };
+
+/* Register this device type. */
+static __attribute__ ((constructor))
+void ACC_host_init (void)
+{
+ gomp_mutex_init (&host_dispatch.mem_map.lock);
+ ACC_register (&host_dispatch);
+}
+#endif
+
new file mode 100644
@@ -0,0 +1,507 @@
+/* OpenACC Runtime initialization routines
+
+ Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+ Contributed by Nathan Sidwell <nathan@codesourcery.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "libgomp.h"
+#include "target.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <stdbool.h>
+#include <sys/queue.h>
+#include <stdio.h>
+
+gomp_mutex_t acc_device_lock;
+
+/* Current dispatcher, and how it was initialized */
+static acc_device_t init_key = _ACC_device_hwm;
+
+/* The dispatch table for the current accelerator device. This is currently
+ global, so you can only have one type of device open at any given time in a
+ program. */
+struct gomp_device_descr const *ACC_dev;
+
+/* Handle for current thread. */
+__thread void *ACC_handle;
+static __thread int handle_num = -1;
+
+/* This context structure associates the handle for a physical device with
+ memory-mapping information for that device, and is used to associate new
+ host threads with previously-opened devices. Note that it's not directly
+ connected with the CUDA "context" concept as used by the NVidia plugin. */
+struct ACC_context {
+ struct memmap_t *ACC_memmap;
+ void *ACC_handle;
+ SLIST_ENTRY(ACC_context) next;
+};
+
+static SLIST_HEAD(_ACC_contexts, ACC_context) _ACC_contexts;
+static struct _ACC_contexts *ACC_contexts;
+
+static struct gomp_device_descr const *dispatchers[_ACC_device_hwm] = { 0 };
+
+void
+ACC_register (struct gomp_device_descr const *disp)
+{
+ gomp_mutex_lock (&acc_device_lock);
+
+ assert (acc_device_type (disp->type) != acc_device_none
+ && acc_device_type (disp->type) != acc_device_default
+ && acc_device_type (disp->type) != acc_device_not_host);
+ assert (!dispatchers[disp->type]);
+ dispatchers[disp->type] = disp;
+
+ gomp_mutex_unlock (&acc_device_lock);
+}
+
+static void
+close_handle (void)
+{
+ if (ACC_memmap)
+ {
+ if (ACC_mem_close (ACC_handle, ACC_memmap))
+ {
+ if (ACC_dev->openacc.close_device_func (ACC_handle) < 0)
+ gomp_fatal ("failed to close device");
+ }
+
+ ACC_memmap = 0;
+ }
+}
+
+static struct gomp_device_descr const *
+resolve_device (acc_device_t d)
+{
+ acc_device_t d_arg = d;
+
+ switch (d)
+ {
+ case acc_device_default:
+ {
+ if (goacc_device_type)
+ {
+ /* Lookup the named device. */
+ while (++d != _ACC_device_hwm)
+ if (dispatchers[d]
+ && !strcasecmp (goacc_device_type, dispatchers[d]->name)
+ && dispatchers[d]->openacc.avail_func ())
+ goto found;
+
+ gomp_fatal ("device type %s not supported", goacc_device_type);
+ }
+
+ /* No default device specified, so start scanning for any non-host
+ device that is available. */
+ d = acc_device_not_host;
+ }
+ /* FALLTHROUGH */
+
+ case acc_device_not_host:
+ /* Find the first available device after acc_device_not_host. */
+ while (++d != _ACC_device_hwm)
+ if (dispatchers[d] && dispatchers[d]->openacc.avail_func ())
+ goto found;
+ if (d_arg == acc_device_default)
+ {
+ d = acc_device_host;
+ goto found;
+ }
+ gomp_fatal ("no device found");
+ break;
+
+ case acc_device_host:
+ break;
+
+ default:
+ if (d > _ACC_device_hwm)
+ gomp_fatal ("device %u out of range", (unsigned)d);
+ break;
+ }
+ found:
+
+ assert (d != acc_device_none
+ && d != acc_device_default
+ && d != acc_device_not_host);
+
+ return dispatchers[d];
+}
+
+static struct gomp_device_descr const *
+_acc_init (acc_device_t d)
+{
+ struct gomp_device_descr const *acc_dev;
+
+ if (ACC_dev)
+ gomp_fatal ("device already active");
+
+ init_key = d; /* We need to remember what we were intialized as, to
+ check shutdown etc. */
+
+ acc_dev = resolve_device (d);
+ if (!acc_dev || !acc_dev->openacc.avail_func ())
+ gomp_fatal ("device %u not supported", (unsigned)d);
+
+ if (!acc_dev->is_initialized)
+ gomp_init_device ((struct gomp_device_descr *) acc_dev);
+
+ return acc_dev;
+}
+
+/* Open the ORD'th device of the currently-active type (ACC_dev must be
+ initialised before calling). If ORD is < 0, open the default-numbered
+ device (set by the ACC_DEVICE_NUM environment variable or a call to
+ acc_set_device_num), or leave any currently-opened device as is. "Opening"
+ consists of calling the device's open_device_func hook, and either creating
+ a new memory mapping or associating a new thread with an existing such
+ mapping (that matches ACC_handle, i.e. which corresponds to the same
+ physical device). */
+
+static void
+lazy_open (int ord)
+{
+ struct ACC_context *acc_ctx;
+
+ if (ACC_memmap)
+ {
+ assert (ord < 0 || ord == handle_num);
+ return;
+ }
+
+ assert (ACC_dev);
+
+ if (ord < 0)
+ ord = goacc_device_num;
+
+ ACC_handle = ACC_dev->openacc.open_device_func (ord);
+ handle_num = ord;
+
+ SLIST_FOREACH(acc_ctx, ACC_contexts, next)
+ {
+ if (acc_ctx->ACC_handle == ACC_handle)
+ {
+ ACC_memmap = acc_ctx->ACC_memmap;
+ ACC_dev->openacc.async_set_async_func (acc_async_sync);
+
+ return;
+ }
+ }
+
+ ACC_memmap = ACC_mem_open (ACC_handle, NULL, handle_num);
+
+ ACC_dev->openacc.async_set_async_func (acc_async_sync);
+
+ acc_ctx = gomp_malloc (sizeof (struct ACC_context));
+ acc_ctx->ACC_handle = ACC_handle;
+ acc_ctx->ACC_memmap = ACC_memmap;
+
+ SLIST_INSERT_HEAD(ACC_contexts, acc_ctx, next);
+}
+
+/* OpenACC 2.0a (3.2.12, 3.2.13) doesn't specify whether the serialization of
+ init/shutdown is per-process or per-thread. We choose per-process. */
+
+void
+acc_init (acc_device_t d)
+{
+ if (!ACC_dev)
+ gomp_init_targets_once ();
+
+ gomp_mutex_lock (&acc_device_lock);
+
+ ACC_dev = _acc_init (d);
+
+ lazy_open (-1);
+
+ gomp_mutex_unlock (&acc_device_lock);
+}
+
+ialias (acc_init)
+
+void
+_acc_shutdown (acc_device_t d)
+{
+ /* We don't check whether d matches the actual device found, because
+ OpenACC 2.0 (3.2.12) says the parameters to the init and this
+ call must match (for the shutdown call anyway, it's silent on
+ others). */
+
+ if (!ACC_dev)
+ gomp_fatal ("no device initialized");
+ if (init_key != d)
+ gomp_fatal ("device %u(%u) is initialized",
+ (unsigned)init_key, (unsigned)ACC_dev->type);
+
+ close_handle ();
+
+ while (SLIST_FIRST(ACC_contexts) != NULL)
+ {
+ struct ACC_context *c;
+
+ c = SLIST_FIRST(ACC_contexts);
+ SLIST_REMOVE_HEAD(ACC_contexts, next);
+ free (c);
+ }
+
+ gomp_fini_device ((struct gomp_device_descr *) ACC_dev);
+
+ ACC_dev = 0;
+ ACC_handle = 0;
+ handle_num = -1;
+}
+
+void
+acc_shutdown (acc_device_t d)
+{
+ gomp_mutex_lock (&acc_device_lock);
+
+ _acc_shutdown (d);
+
+ gomp_mutex_unlock (&acc_device_lock);
+}
+
+ialias (acc_shutdown)
+
+static struct gomp_device_descr const *
+lazy_init (acc_device_t d)
+{
+ if (ACC_dev)
+ {
+ /* Re-initializing the same device, do nothing. */
+ if (d == init_key)
+ return ACC_dev;
+
+ _acc_shutdown (init_key);
+ }
+
+ assert (!ACC_dev);
+
+ return _acc_init (d);
+}
+
+static void
+lazy_init_and_open (acc_device_t d)
+{
+ if (!ACC_dev)
+ gomp_init_targets_once ();
+
+ gomp_mutex_lock (&acc_device_lock);
+
+ ACC_dev = lazy_init (d);
+
+ lazy_open (-1);
+
+ gomp_mutex_unlock (&acc_device_lock);
+}
+
+int
+acc_get_num_devices (acc_device_t d)
+{
+ int n = 0;
+ struct gomp_device_descr const *acc_dev;
+
+ if (d == acc_device_none)
+ return 0;
+
+ if (!ACC_dev)
+ gomp_init_targets_once ();
+
+ acc_dev = resolve_device (d);
+ if (!acc_dev)
+ return 0;
+
+ n = acc_dev->device_init_func ();
+ if (n < 0)
+ n = 0;
+
+ return n;
+}
+
+ialias (acc_get_num_devices)
+
+void
+acc_set_device_type (acc_device_t d)
+{
+ lazy_init_and_open (d);
+}
+
+ialias (acc_set_device_type)
+
+acc_device_t
+acc_get_device_type (void)
+{
+ acc_device_t res = acc_device_none;
+ const struct gomp_device_descr *dev;
+
+ if (ACC_dev)
+ res = acc_device_type (ACC_dev->type);
+ else
+ {
+ gomp_init_targets_once ();
+
+ dev = resolve_device (acc_device_default);
+ res = acc_device_type (dev->type);
+ }
+
+ assert (res != acc_device_default
+ && res != acc_device_not_host);
+
+ return res;
+}
+
+ialias (acc_get_device_type)
+
+int
+acc_get_device_num (acc_device_t d)
+{
+ const struct gomp_device_descr *dev;
+ int num;
+
+ if (d >= _ACC_device_hwm)
+ gomp_fatal ("device %u out of range", (unsigned)d);
+
+ if (!ACC_dev)
+ gomp_init_targets_once ();
+
+ dev = resolve_device (d);
+ if (!dev)
+ gomp_fatal ("no devices of type %u", d);
+
+ /* We might not have called lazy_open for this host thread yet, in which case
+ the get_device_num_func hook will return -1. */
+ num = dev->openacc.get_device_num_func ();
+ if (num < 0)
+ num = goacc_device_num;
+
+ return num;
+}
+
+ialias (acc_get_device_num)
+
+void
+acc_set_device_num (int n, acc_device_t d)
+{
+ const struct gomp_device_descr *dev;
+ int num_devices;
+
+ if (!ACC_dev)
+ gomp_init_targets_once ();
+
+ if ((int) d == 0)
+ {
+ int i;
+
+ /* A device setting of zero sets all device types on the system to use
+ the Nth instance of that device type. Only attempt it for initialized
+ devices though. */
+ for (i = acc_device_not_host + 1; i < _ACC_device_hwm; i++)
+ {
+ dev = resolve_device (d);
+ if (dev && dev->is_initialized)
+ dev->openacc.set_device_num_func (n);
+ }
+
+ /* ...and for future calls to acc_init/acc_set_device_type, etc. */
+ goacc_device_num = n;
+ }
+ else
+ {
+ gomp_mutex_lock (&acc_device_lock);
+
+ ACC_dev = lazy_init (d);
+
+ num_devices = ACC_dev->get_num_devices_func ();
+
+ if (n >= num_devices)
+ gomp_fatal ("device %u out of range", n);
+
+ if (n != handle_num)
+ close_handle ();
+
+ lazy_open (n);
+
+ gomp_mutex_unlock (&acc_device_lock);
+ }
+}
+
+ialias (acc_set_device_num)
+
+int
+acc_on_device (acc_device_t dev)
+{
+ /* Just rely on the compiler builtin. */
+ return __builtin_acc_on_device (dev);
+}
+ialias (acc_on_device)
+
+attribute_hidden void
+ACC_runtime_initialize (void)
+{
+ gomp_mutex_init (&acc_device_lock);
+
+ ACC_contexts = &_ACC_contexts;
+ SLIST_INIT (ACC_contexts);
+}
+
+/* Compiler helper functions */
+
+static __thread struct gomp_device_descr const *saved_bound_dev;
+
+void
+ACC_save_and_set_bind (acc_device_t d)
+{
+ assert (!saved_bound_dev);
+
+ saved_bound_dev = ACC_dev;
+ ACC_dev = dispatchers[d];
+}
+
+void
+ACC_restore_bind (void)
+{
+ ACC_dev = saved_bound_dev;
+ saved_bound_dev = NULL;
+}
+
+/* This is called from any OpenACC support function that may need to implicitly
+ initialize the libgomp runtime. On exit all such initialization will have
+ been done, and both the global ACC_dev and the per-host-thread ACC_memmap
+ pointers will be valid. */
+
+void
+ACC_lazy_initialize (void)
+{
+ if (ACC_dev && ACC_memmap)
+ return;
+
+ if (!ACC_dev)
+ lazy_init_and_open (acc_device_default);
+ else
+ {
+ gomp_mutex_lock (&acc_device_lock);
+ lazy_open (-1);
+ gomp_mutex_unlock (&acc_device_lock);
+ }
+}
new file mode 100644
@@ -0,0 +1,127 @@
+/* OpenACC Runtime - internal declarations
+
+ Copyright (C) 2005-2014 Free Software Foundation, Inc.
+
+ Contributed by Nathan Sidwell <nathan@codesourcery.com> and Thomas Schwinge
+ <thomas@codesourcery.com>. In parts based on libgomp.h contributed by
+ Richard Henderson <rth@redhat.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file contains data types and function declarations that are not
+ part of the official OpenACC user interface. There are declarations
+ in here that are part of the GNU OpenACC ABI, in that the compiler is
+ required to know about them and use them.
+
+ The convention is that the all caps prefix "GOACC" is used group items
+ that are part of the external ABI, and the lower case prefix "goacc"
+ is used group items that are completely private to the library. */
+
+#ifndef _OACC_INT_H
+#define _OACC_INT_H 1
+
+#include "openacc.h"
+#include "config.h"
+#include <stddef.h>
+#include <stdbool.h>
+#include <stdarg.h>
+
+#ifdef HAVE_ATTRIBUTE_VISIBILITY
+# pragma GCC visibility push(hidden)
+#endif
+
+typedef struct ACC_dispatch_t
+{
+ /* open or close a device instance. */
+ void *(*open_device_func) (int n);
+ int (*close_device_func) (void *h);
+
+ /* set or get the device number. */
+ int (*get_device_num_func) (void);
+ void (*set_device_num_func) (int);
+
+ /* availability */
+ bool (*avail_func) (void);
+
+ /* execute */
+ void (*exec_func) (void (*) (void *), size_t, void **, void **, size_t *,
+ unsigned short *, int, int, int, int, void *);
+
+ /* asynchronous routines */
+ int (*async_test_func) (int);
+ int (*async_test_all_func) (void);
+ void (*async_wait_func) (int);
+ void (*async_wait_async_func) (int, int);
+ void (*async_wait_all_func) (void);
+ void (*async_wait_all_async_func) (int);
+ void (*async_set_async_func) (int);
+
+ /* NVIDIA target specific routines */
+ struct {
+ void *(*get_current_device_func) (void);
+ void *(*get_current_context_func) (void);
+ void *(*get_stream_func) (int);
+ int (*set_stream_func) (int, void *);
+ } cuda;
+} ACC_dispatch_t;
+
+typedef enum ACC_dispatch_f
+ {
+ ACC_unified_mem_f = 1 << 0,
+ }
+ACC_dispatch_f;
+
+struct gomp_device_descr;
+
+void ACC_register (struct gomp_device_descr const *) __GOACC_NOTHROW;
+
+/* Memory routines. */
+struct memmap_t *ACC_mem_open (void *, struct memmap_t *, int) __GOACC_NOTHROW;
+bool ACC_mem_close (void *, struct memmap_t *) __GOACC_NOTHROW;
+struct gomp_device_descr *ACC_resolve_device(int) __GOACC_NOTHROW;
+
+/* Current dispatcher */
+extern struct gomp_device_descr const *ACC_dev;
+
+/* Device handle for current thread. */
+extern __thread void *ACC_handle;
+
+typedef struct memmap_t
+{
+ unsigned live;
+ struct target_mem_desc *tlist;
+ struct gomp_memory_mapping mem_map;
+} memmap_t;
+
+/* Memory mapping */
+extern __thread struct memmap_t *ACC_memmap;
+
+void ACC_runtime_initialize (void);
+void ACC_save_and_set_bind (acc_device_t);
+void ACC_restore_bind (void);
+void ACC_lazy_initialize (void);
+
+#ifdef HAVE_ATTRIBUTE_VISIBILITY
+# pragma GCC visibility pop
+#endif
+
+#endif /* _OACC_INT_H */
new file mode 100644
@@ -0,0 +1,515 @@
+/* OpenACC Runtime initialization routines
+
+ Copyright (C) 2013 Free Software Foundation, Inc.
+
+ Contributed by Nathan Sidwell <nathan@codesourcery.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "openacc.h"
+#include "config.h"
+#include "libgomp.h"
+#include "gomp-constants.h"
+#include "target.h"
+#include <stdio.h>
+#include <stdint.h>
+
+#include "splay-tree.h"
+
+/* Although this pointer is local to each host thread, it points to a memmap_t
+ that is stored per-context (different host threads may be associated with
+ different contexts, and each context is associated with a physical
+ device). */
+__thread struct memmap_t *ACC_memmap;
+
+memmap_t *
+ACC_mem_open (void *handle, memmap_t *src, int handle_num)
+{
+ if (!src)
+ {
+ src = gomp_malloc (sizeof (*src));
+ src->live = 0;
+ src->mem_map.splay_tree.root = NULL;
+ src->tlist = NULL;
+ gomp_mutex_init (&src->mem_map.lock);
+ }
+
+ src->live++;
+
+ return src;
+}
+
+bool
+ACC_mem_close (void *handle, memmap_t *mm)
+{
+ bool closed = 0;
+
+ if (!--mm->live)
+ {
+ struct target_mem_desc *t;
+
+ for (t = mm->tlist; t != NULL; t = t->prev)
+ {
+ ACC_dev->device_free_func (t->to_free);
+
+ t->tgt_end = 0;
+ t->to_free = 0;
+
+ gomp_unmap_vars (t, true);
+ }
+
+ closed = 1;
+ }
+
+ gomp_mutex_destroy (&mm->mem_map.lock);
+
+ return closed;
+}
+
+/* Return block containing [H->S), or NULL if not contained. */
+
+attribute_hidden splay_tree_key
+lookup_host (memmap_t *mm, void *h, size_t s)
+{
+ struct splay_tree_key_s node;
+ splay_tree_key key;
+ struct gomp_memory_mapping *mem_map = &mm->mem_map;
+
+ node.host_start = (uintptr_t) h;
+ node.host_end = (uintptr_t) h + s;
+
+ gomp_mutex_lock (&mem_map->lock);
+
+ key = splay_tree_lookup (&mem_map->splay_tree, &node);
+
+ gomp_mutex_unlock (&mem_map->lock);
+
+ return key;
+}
+
+/* Return block containing [D->S), or NULL if not contained.
+ The list isn't ordered by device address, so we have to iterate
+ over the whole array. This is not expected to be a common
+ operation. */
+
+static splay_tree_key
+lookup_dev (memmap_t *b, void *d, size_t s)
+{
+ int i;
+ struct target_mem_desc *t;
+
+ gomp_mutex_lock (&b->mem_map.lock);
+
+ for (t = b->tlist; t != NULL; t = t->prev)
+ {
+ if (t->tgt_start <= (uintptr_t) d && t->tgt_end >= (uintptr_t) d + s)
+ break;
+ }
+
+ gomp_mutex_unlock (&b->mem_map.lock);
+
+ if (!t)
+ return NULL;
+
+ for (i = 0; i < t->refcount; i++)
+ {
+ void * offset;
+
+ splay_tree_key k = &t->array[i].key;
+ offset = d - t->tgt_start + k->tgt_offset;
+
+ if (k->host_start + offset <= (void *) k->host_end)
+ return k;
+ }
+
+ return NULL;
+}
+
+/* OpenACC is silent on how memory exhaustion is indicated. We return
+ NULL. */
+
+void *
+acc_malloc (size_t s)
+{
+ if (!s)
+ return NULL;
+
+ ACC_lazy_initialize ();
+
+ return ACC_dev->device_alloc_func (s);
+}
+
+/* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event
+ the device address is mapped. We choose to check if it mapped,
+ and if it is, to unmap it. */
+void
+acc_free (void *d)
+{
+ splay_tree_key k;
+
+ if (!d)
+ return;
+
+ /* We don't have to call lazy open here, as the ptr value must have
+ been returned by acc_malloc. It's not permitted to pass NULL in
+ (unless you got that null from acc_malloc). */
+ if ((k = lookup_dev (ACC_memmap, d, 1)))
+ {
+ void *offset;
+
+ offset = d - k->tgt->tgt_start + k->tgt_offset;
+
+ acc_unmap_data((void *)(k->host_start + offset));
+ }
+
+ ACC_dev->device_free_func (d);
+}
+
+void
+acc_memcpy_to_device (void *d, void *h, size_t s)
+{
+ /* No need to call lazy open here, as the device pointer must have
+ been obtained from a routine that did that. */
+ ACC_dev->device_host2dev_func (d, h, s);
+}
+
+void
+acc_memcpy_from_device (void *h, void *d, size_t s)
+{
+ /* No need to call lazy open here, as the device pointer must have
+ been obtained from a routine that did that. */
+ ACC_dev->device_dev2host_func (h, d, s);
+}
+
+/* Return the device pointer that corresponds to host data H. Or NULL
+ if no mapping. */
+
+void *
+acc_deviceptr (void *h)
+{
+ splay_tree_key n;
+ void *d;
+ void *offset;
+
+ ACC_lazy_initialize ();
+
+ n = lookup_host (ACC_memmap, h, 1);
+
+ if (!n)
+ return NULL;
+
+ offset = h - n->host_start;
+
+ d = n->tgt->tgt_start + n->tgt_offset + offset;
+
+ return d;
+}
+
+/* Return the host pointer that corresponds to device data D. Or NULL
+ if no mapping. */
+
+void *
+acc_hostptr (void *d)
+{
+ splay_tree_key n;
+ void *h;
+ void *offset;
+
+ ACC_lazy_initialize ();
+
+ n = lookup_dev (ACC_memmap, d, 1);
+
+ if (!n)
+ return NULL;
+
+ offset = d - n->tgt->tgt_start + n->tgt_offset;
+
+ h = n->host_start + offset;
+
+ return h;
+}
+
+/* Return 1 if host data [H,+S] is present on the device. */
+
+int
+acc_is_present (void *h, size_t s)
+{
+ splay_tree_key n;
+
+ if (!s || !h)
+ return 0;
+
+ ACC_lazy_initialize ();
+
+ n = lookup_host (ACC_memmap, h, s);
+
+ if (n && (((uintptr_t)h < n->host_start) ||
+ ((uintptr_t)h + s > n->host_end) || (s > n->host_end - n->host_start)))
+ n = NULL;
+
+ return n != NULL;
+}
+
+/* Create a mapping for host [H,+S] -> device [D,+S] */
+
+void
+acc_map_data (void *h, void *d, size_t s)
+{
+ struct target_mem_desc *tgt;
+ size_t mapnum = 1;
+ void *hostaddrs = h;
+ void *devaddrs = d;
+ size_t sizes = s;
+ unsigned short kinds = GOMP_MAP_ALLOC;
+
+ ACC_lazy_initialize ();
+
+ if (!d || !h || !s)
+ gomp_fatal ("[%p,+%d]->[%p,+%d] is a bad map",
+ (void *)h, (int)s, (void *)d, (int)s);
+
+ if (lookup_host (ACC_memmap, h, s))
+ gomp_fatal ("host address [%p, +%d] is already mapped", (void *)h, (int)s);
+
+ if (lookup_dev (ACC_memmap, d, s))
+ gomp_fatal ("device address [%p, +%d] is already mapped", (void *)d, (int)s);
+
+ tgt = gomp_map_vars ((struct gomp_device_descr *) ACC_dev,
+ &ACC_memmap->mem_map, mapnum, &hostaddrs,
+ &devaddrs, &sizes, &kinds, true, false);
+
+ tgt->prev = ACC_memmap->tlist;
+ ACC_memmap->tlist = tgt;
+}
+
+void
+acc_unmap_data (void *h)
+{
+ /* No need to call lazy open, as the address must have been mapped.
+ */
+
+ size_t host_size;
+ splay_tree_key n = lookup_host (ACC_memmap, h, 1);
+ struct target_mem_desc *t;
+
+ if (!n)
+ gomp_fatal ("%p is not a mapped block", (void *)h);
+
+ host_size = n->host_end - n->host_start;
+
+ if (n->host_start != (uintptr_t) h)
+ gomp_fatal ("[%p,%d] surrounds1 %p",
+ (void *)n->host_start, (int)host_size, (void *)h);
+
+ t = n->tgt;
+
+ if (t->refcount == 2)
+ {
+ struct target_mem_desc *tp;
+
+ /* This is the last reference, so pull the descriptor off the
+ chain. This avoids gomp_unmap_vars via gomp_unmap_tgt from
+ freeing the device memory. */
+ t->tgt_end = 0;
+ t->to_free = 0;
+
+ gomp_mutex_lock (&ACC_memmap->mem_map.lock);
+
+ for (tp = NULL, t = ACC_memmap->tlist; t != NULL; tp = t, t = t->prev)
+ {
+ if (n->tgt == t)
+ {
+ if (tp)
+ tp->prev = t->prev;
+ else
+ ACC_memmap->tlist = t->prev;
+
+ break;
+ }
+ }
+
+ gomp_mutex_unlock (&ACC_memmap->mem_map.lock);
+ }
+
+ gomp_unmap_vars (t, true);
+}
+
+#define PCC_Present (1 << 0)
+#define PCC_Create (1 << 1)
+#define PCC_Copy (1 << 2)
+
+attribute_hidden void *
+present_create_copy (unsigned f, void *h, size_t s)
+{
+ void *d;
+ splay_tree_key n;
+
+ if (!h || !s)
+ gomp_fatal ("[%p,+%d] is a bad range", (void *)h, (int)s);
+
+ ACC_lazy_initialize ();
+
+ n = lookup_host (ACC_memmap, h, s);
+ if (n)
+ {
+ /* Present. */
+ d = (void *) (n->tgt->tgt_start + n->tgt_offset);
+
+ if (!(f & PCC_Present))
+ gomp_fatal ("[%p,+%d] already mapped to [%p,+%d]",
+ (void *)h, (int)s, (void *)d, (int)s);
+ if ((h + s) > (void *)n->host_end)
+ gomp_fatal ("[%p,+%d] not mapped", (void *)h, (int)s);
+ }
+ else if (!(f & PCC_Create))
+ {
+ gomp_fatal ("[%p,+%d] not mapped", (void *)h, (int)s);
+ }
+ else
+ {
+ struct target_mem_desc *tgt;
+ size_t mapnum = 1;
+ unsigned short kinds;
+ void *hostaddrs = h;
+
+ if (f & PCC_Copy)
+ kinds = GOMP_MAP_ALLOC_TO;
+ else
+ kinds = GOMP_MAP_ALLOC;
+
+ tgt = gomp_map_vars ((struct gomp_device_descr *) ACC_dev,
+ &ACC_memmap->mem_map, mapnum, &hostaddrs,
+ NULL, &s, &kinds, true, false);
+
+ d = tgt->to_free;
+ tgt->prev = ACC_memmap->tlist;
+ ACC_memmap->tlist = tgt;
+ }
+
+ return d;
+}
+
+void *
+acc_create (void *h, size_t s)
+{
+ return present_create_copy (PCC_Create, h, s);
+}
+
+void *
+acc_copyin (void *h, size_t s)
+{
+ return present_create_copy (PCC_Create | PCC_Copy, h, s);
+}
+
+void *
+acc_present_or_create (void *h, size_t s)
+{
+ return present_create_copy (PCC_Present | PCC_Create, h, s);
+}
+
+void *
+acc_present_or_copyin (void *h, size_t s)
+{
+ return present_create_copy (PCC_Present | PCC_Create | PCC_Copy, h, s);
+}
+
+#define DC_Copyout (1 << 0)
+
+static void
+delete_copyout (unsigned f, void *h, size_t s)
+{
+ size_t host_size;
+ splay_tree_key n;
+ void *d;
+
+ n = lookup_host (ACC_memmap, h, s);
+
+ /* No need to call lazy open, as the data must already have been
+ mapped. */
+
+ if (!n)
+ gomp_fatal ("[%p,%d] is not mapped", (void *)h, (int)s);
+
+ d = (void *) (n->tgt->tgt_start + n->tgt_offset);
+
+ host_size = n->host_end - n->host_start;
+
+ if (n->host_start != (uintptr_t) h || host_size != s)
+ gomp_fatal ("[%p,%d] surrounds2 [%p,+%d]",
+ (void *)n->host_start, (int)host_size, (void *)h, (int)s);
+
+ if (f & DC_Copyout)
+ ACC_dev->device_dev2host_func (h, d, s);
+
+ acc_unmap_data(h);
+
+ ACC_dev->device_free_func (d);
+}
+
+void
+acc_delete (void *h , size_t s)
+{
+ delete_copyout (0, h, s);
+}
+
+void acc_copyout (void *h, size_t s)
+{
+ delete_copyout (DC_Copyout, h, s);
+}
+
+static void
+update_dev_host (int is_dev, void *h, size_t s)
+{
+ splay_tree_key n;
+ void *d;
+
+ if (!ACC_memmap)
+ gomp_fatal ("[%p,%d] is not mapped", h, (int)s);
+
+ n = lookup_host (ACC_memmap, h, s);
+
+ /* No need to call lazy open, as the data must already have been
+ mapped. */
+
+ if (!n)
+ gomp_fatal ("[%p,%d] is not mapped", h, (int)s);
+
+ d = (void *) (n->tgt->tgt_start + n->tgt_offset);
+
+ if (is_dev)
+ ACC_dev->device_host2dev_func (d, h, s);
+ else
+ ACC_dev->device_dev2host_func (h, d, s);
+
+}
+
+void
+acc_update_device (void *h, size_t s)
+{
+ update_dev_host (1, h, s);
+}
+
+void
+acc_update_self (void *h, size_t s)
+{
+ update_dev_host (0, h, s);
+}
new file mode 100644
@@ -0,0 +1,386 @@
+/* Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+ Contributed by Thomas Schwinge <thomas@codesourcery.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file handles OpenACC constructs. */
+
+#include "openacc.h"
+#include "libgomp.h"
+#include "libgomp_g.h"
+#include "gomp-constants.h"
+#include "target.h"
+#include <stdio.h>
+#include <string.h>
+#include <stdarg.h>
+#include <assert.h>
+#include <alloca.h>
+
+#ifdef FUTURE
+// device geometry per device type
+struct devgeom
+{
+ int gangs;
+ int workers;
+ int vectors;
+};
+
+
+// XXX: acceptable defaults?
+static __thread struct devgeom devgeom = { 1, 1, 1 };
+#endif
+
+#ifdef LATER
+static void
+dump_devaddrs(void)
+{
+ int i;
+ struct devaddr *dp;
+
+ gomp_notify("++++ num_devaddrs %d\n", num_devaddrs);
+ for (dp = devaddrs, i = 1; dp != 0; dp = dp->next, i++)
+ {
+ gomp_notify("++++ %.02d) %p\n", i, dp->d);
+ }
+}
+#endif
+
+static void
+dump_var(char *s, size_t idx, void *hostaddr, size_t size, unsigned char kind)
+{
+ gomp_notify(" %2zi: %3s 0x%.2x -", idx, s, kind & 0xff);
+
+ switch (kind & 0xff)
+ {
+ case 0x00: gomp_notify(" ALLOC "); break;
+ case 0x01: gomp_notify(" ALLOC TO "); break;
+ case 0x02: gomp_notify(" ALLOC FROM "); break;
+ case 0x03: gomp_notify(" ALLOC TOFROM "); break;
+ case 0x04: gomp_notify(" POINTER "); break;
+ case 0x05: gomp_notify(" TO_PSET "); break;
+
+ case 0x08: gomp_notify(" FORCE_ALLOC "); break;
+ case 0x09: gomp_notify(" FORCE_TO "); break;
+ case 0x0a: gomp_notify(" FORCE_FROM "); break;
+ case 0x0b: gomp_notify(" FORCE_TOFROM "); break;
+ case 0x0c: gomp_notify(" FORCE_PRESENT "); break;
+ case 0x0d: gomp_notify(" FORCE_DEALLOC "); break;
+ case 0x0e: gomp_notify(" FORCE_DEVICEPTR "); break;
+
+ case 0x18: gomp_notify(" FORCE_PRIVATE "); break;
+ case 0x19: gomp_notify(" FORCE_FIRSTPRIVATE "); break;
+
+ case (unsigned char) -1: gomp_notify(" DUMMY "); break;
+ default: gomp_notify("UGH! 0x%x\n", kind);
+ }
+
+ gomp_notify("- %d - %4d/0x%04x ", 1 << (kind >> 8), (int)size, (int)size);
+ gomp_notify("- %p\n", hostaddr);
+
+ return;
+}
+
+/* Ensure that the target device for DEVICE_TYPE is initialised (and that
+ plugins have been loaded if appropriate). The ACC_dev variable for the
+ current thread will be set appropriately for the given device type on
+ return. */
+
+attribute_hidden void
+select_acc_device (int device_type)
+{
+ if (device_type == GOMP_IF_CLAUSE_FALSE)
+ return;
+
+ if (device_type == acc_device_none)
+ device_type = acc_device_host;
+
+ if (device_type >= 0)
+ {
+ /* NOTE: this will go badly if the surrounding data environment is set up
+ to use a different device type. We'll just have to trust that users
+ know what they're doing... */
+ acc_set_device_type (device_type);
+ }
+
+ ACC_lazy_initialize ();
+}
+
+void goacc_wait (int async, int num_waits, va_list ap);
+
+void
+GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target,
+ size_t mapnum, void **hostaddrs, size_t *sizes,
+ unsigned short *kinds,
+ int num_gangs, int num_workers, int vector_length,
+ int async, int num_waits, ...)
+{
+ bool if_clause_condition_value = device != GOMP_IF_CLAUSE_FALSE;
+ va_list ap;
+ struct target_mem_desc *tgt;
+ void **devaddrs;
+ unsigned int i;
+
+ if (num_gangs != 1)
+ gomp_fatal ("num_gangs (%d) different from one is not yet supported",
+ num_gangs);
+ if (num_workers != 1)
+ gomp_fatal ("num_workers (%d) different from one is not yet supported",
+ num_workers);
+
+ gomp_notify ("%s: mapnum=%zd, hostaddrs=%p, sizes=%p, kinds=%p, async=%d\n",
+ __FUNCTION__, mapnum, hostaddrs, sizes, kinds, async);
+
+ select_acc_device (device);
+
+ /* Host fallback if "if" clause is false or if the current device is set to
+ the host. */
+ if (!if_clause_condition_value)
+ {
+ ACC_save_and_set_bind (acc_device_host);
+ fn (hostaddrs);
+ ACC_restore_bind ();
+ return;
+ }
+ else if (acc_device_type (ACC_dev->type) == acc_device_host)
+ {
+ fn (hostaddrs);
+ return;
+ }
+
+ va_start (ap, num_waits);
+
+ if (num_waits > 0)
+ goacc_wait (async, num_waits, ap);
+
+ va_end (ap);
+
+ ACC_dev->openacc.async_set_async_func (async);
+
+ tgt = gomp_map_vars ((struct gomp_device_descr *) ACC_dev,
+ &ACC_memmap->mem_map, mapnum, hostaddrs,
+ NULL, sizes, kinds, true, false);
+
+ devaddrs = alloca (sizeof (void *) * mapnum);
+ for (i = 0; i < mapnum; i++)
+ devaddrs[i] = (void *) (tgt->list[i]->tgt->tgt_start
+ + tgt->list[i]->tgt_offset);
+
+ ACC_dev->openacc.exec_func (fn, mapnum, hostaddrs, devaddrs, sizes, kinds,
+ num_gangs, num_workers, vector_length, async,
+ tgt);
+
+ /* If running synchronously, unmap immediately. */
+ if (async < acc_async_noval)
+ gomp_unmap_vars (tgt, true);
+ else
+ gomp_copy_from_async (tgt);
+
+ ACC_dev->openacc.async_set_async_func (acc_async_sync);
+}
+
+static __thread struct target_mem_desc *mapped_data = NULL;
+
+void
+GOACC_data_start (int device, const void *openmp_target, size_t mapnum,
+ void **hostaddrs, size_t *sizes, unsigned short *kinds)
+{
+ bool if_clause_condition_value = device != GOMP_IF_CLAUSE_FALSE;
+ struct target_mem_desc *tgt;
+
+ gomp_notify ("%s: mapnum=%zd, hostaddrs=%p, sizes=%p, kinds=%p\n",
+ __FUNCTION__, mapnum, hostaddrs, sizes, kinds);
+
+ select_acc_device (device);
+
+ /* Host fallback or 'do nothing'. */
+ if ((ACC_dev->capabilities & TARGET_CAP_SHARED_MEM)
+ || !if_clause_condition_value)
+ {
+ tgt = gomp_map_vars (NULL, NULL, 0, NULL, NULL, NULL, NULL, true, false);
+ tgt->prev = mapped_data;
+ mapped_data = tgt;
+
+ return;
+ }
+
+ gomp_notify (" %s: prepare mappings\n", __FUNCTION__);
+ tgt = gomp_map_vars ((struct gomp_device_descr *) ACC_dev,
+ &ACC_memmap->mem_map, mapnum, hostaddrs,
+ NULL, sizes, kinds, true, false);
+ gomp_notify (" %s: mappings prepared\n", __FUNCTION__);
+ tgt->prev = mapped_data;
+ mapped_data = tgt;
+}
+
+void
+GOACC_data_end (void)
+{
+ struct target_mem_desc *tgt = mapped_data;
+
+ gomp_notify (" %s: restore mappings\n", __FUNCTION__);
+ mapped_data = tgt->prev;
+ gomp_unmap_vars (tgt, true);
+ gomp_notify (" %s: mappings restored\n", __FUNCTION__);
+}
+
+
+void
+GOACC_kernels (int device, void (*fn) (void *), const void *openmp_target,
+ size_t mapnum, void **hostaddrs, size_t *sizes,
+ unsigned short *kinds,
+ int num_gangs, int num_workers, int vector_length,
+ int async, int num_waits, ...)
+{
+ gomp_notify ("%s: mapnum=%zd, hostaddrs=%p, sizes=%p, kinds=%p\n", __FUNCTION__,
+ mapnum, hostaddrs, sizes, kinds);
+
+ va_list ap;
+
+ select_acc_device (device);
+
+ va_start (ap, num_waits);
+
+ if (num_waits > 0)
+ goacc_wait (async, num_waits, ap);
+
+ va_end (ap);
+
+ /* TODO. */
+ GOACC_parallel (device, fn, openmp_target, mapnum, hostaddrs, sizes, kinds,
+ num_gangs, num_workers, vector_length, async, num_waits);
+}
+
+void
+goacc_wait (int async, int num_waits, va_list ap)
+{
+ int i;
+
+ assert (num_waits >= 0);
+
+ if (async == acc_async_sync && num_waits == 0)
+ {
+ acc_wait_all ();
+ return;
+ }
+
+ if (async == acc_async_sync && num_waits)
+ {
+ for (i = 0; i < num_waits; i++)
+ {
+ int qid = va_arg (ap, int);
+
+ if (acc_async_test (qid))
+ continue;
+
+ acc_wait (qid);
+ }
+ return;
+ }
+
+ if (async == acc_async_noval && num_waits == 0)
+ {
+ ACC_dev->openacc.async_wait_all_async_func (acc_async_noval);
+ return;
+ }
+
+ for (i = 0; i < num_waits; i++)
+ {
+ int qid = va_arg (ap, int);
+
+ if (acc_async_test (qid))
+ continue;
+
+ /* If we're waiting on the same asynchronous queue as we're launching on,
+ the queue itself will order work as required, so there's no need to
+ wait explicitly. */
+ if (qid != async)
+ ACC_dev->openacc.async_wait_async_func (qid, async);
+ }
+}
+
+void
+GOACC_update (int device, const void *openmp_target, size_t mapnum,
+ void **hostaddrs, size_t *sizes, unsigned short *kinds,
+ int async, int num_waits, ...)
+{
+ bool if_clause_condition_value = device != GOMP_IF_CLAUSE_FALSE;
+ size_t i;
+
+ select_acc_device (device);
+
+ if ((ACC_dev->capabilities & TARGET_CAP_SHARED_MEM)
+ || !if_clause_condition_value)
+ return;
+
+ if (num_waits > 0)
+ {
+ va_list ap;
+
+ va_start (ap, num_waits);
+
+ goacc_wait (async, num_waits, ap);
+
+ va_end (ap);
+ }
+
+ ACC_dev->openacc.async_set_async_func (async);
+
+ for (i = 0; i < mapnum; ++i)
+ {
+ unsigned char kind = kinds[i] & 0xff;
+
+ dump_var("UPD", i, hostaddrs[i], sizes[i], kinds[i]);
+
+ switch (kind)
+ {
+ case GOMP_MAP_POINTER:
+ break;
+
+ case GOMP_MAP_FORCE_TO:
+ acc_update_device (hostaddrs[i], sizes[i]);
+ break;
+
+ case GOMP_MAP_FORCE_FROM:
+ acc_update_self (hostaddrs[i], sizes[i]);
+ break;
+
+ default:
+ gomp_fatal (">>>> GOACC_update UNHANDLED kind 0x%.2x", kind);
+ break;
+ }
+ }
+
+ ACC_dev->openacc.async_set_async_func (acc_async_sync);
+}
+
+void
+GOACC_wait (int async, int num_waits, ...)
+{
+ va_list ap;
+
+ va_start (ap, num_waits);
+
+ goacc_wait (async, num_waits, ap);
+
+ va_end (ap);
+}
new file mode 100644
@@ -0,0 +1,44 @@
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+ Contributed by CodeSourcery.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Initialize and register OpenACC dispatch table from libgomp plugin. */
+
+#include "libgomp.h"
+#include "oacc-plugin.h"
+#include "target.h"
+
+void
+ACC_plugin_register (struct gomp_device_descr *device)
+{
+ ACC_register (device);
+}
+
+
+void
+gomp_plugin_async_unmap_vars (void *ptr)
+{
+ struct target_mem_desc *tgt = ptr;
+
+ gomp_unmap_vars (tgt, false);
+}
new file mode 100644
@@ -0,0 +1,32 @@
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+ Contributed by CodeSourcery.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _OACC_PLUGIN_H
+#define _OACC_PLUGIN_H 1
+
+#include "target.h"
+
+extern void ACC_plugin_register (struct gomp_device_descr *dev);
+
+#endif
new file mode 100644
@@ -0,0 +1,108 @@
+! OpenACC Runtime Library Definitions.
+
+! Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+! Contributed by Thomas Schwinge <thomas@codesourcery.com>.
+
+! This file is part of the GNU OpenMP Library (libgomp).
+
+! Libgomp is free software; you can redistribute it and/or modify it
+! under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 3, or (at your option)
+! any later version.
+
+! Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+! FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+! more details.
+
+! Under Section 7 of GPL version 3, you are granted additional
+! permissions described in the GCC Runtime Library Exception, version
+! 3.1, as published by the Free Software Foundation.
+
+! You should have received a copy of the GNU General Public License and
+! a copy of the GCC Runtime Library Exception along with this program;
+! see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+! <http://www.gnu.org/licenses/>.
+
+module openacc_kinds
+ implicit none
+
+ integer, parameter :: acc_device_kind = 4
+
+end module openacc_kinds
+
+module openacc
+ use openacc_kinds
+ implicit none
+
+ integer, parameter :: openacc_version = 201306
+
+ integer (acc_device_kind), parameter :: acc_device_none = 0
+ integer (acc_device_kind), parameter :: acc_device_default = 1
+ integer (acc_device_kind), parameter :: acc_device_host = 2
+ integer (acc_device_kind), parameter :: acc_device_nonshm_host = 3
+ integer (acc_device_kind), parameter :: acc_device_not_host = 4
+ integer (acc_device_kind), parameter :: acc_device_nvidia = 5
+
+ interface
+ function acc_get_num_devices (dev)
+ use openacc_kinds
+ integer (4) :: acc_get_num_devices
+ integer (acc_device_kind), intent (in) :: dev
+ end function acc_get_num_devices
+ end interface
+
+ interface
+ subroutine acc_set_devices_type (dev)
+ use openacc_kinds
+ integer (acc_device_kind), intent (in) :: dev
+ end subroutine acc_set_devices_type
+ end interface
+
+ interface
+ function acc_get_device_type ()
+ use openacc_kinds
+ integer (acc_device_kind) :: acc_get_device_type
+ end function acc_get_device_type
+ end interface
+
+ interface acc_set_device_num
+ subroutine acc_set_device_num (num, dev)
+ use openacc_kinds
+ integer (4), intent (in) :: num
+ integer (acc_device_kind), intent (in) :: dev
+ end subroutine acc_set_device_num
+ end interface acc_set_device_num
+
+ interface
+ function acc_get_device_num (dev)
+ use openacc_kinds
+ integer (4) :: acc_get_device_num
+ integer (acc_device_kind), intent (in) :: dev
+ end function acc_get_device_num
+ end interface
+
+ interface
+ subroutine acc_init (dev)
+ use openacc_kinds
+ integer (acc_device_kind), intent (in) :: dev
+ end subroutine acc_init
+ end interface
+
+ interface
+ subroutine acc_shutdown (dev)
+ use openacc_kinds
+ integer (acc_device_kind), intent (in) :: dev
+ end subroutine acc_shutdown
+ end interface
+
+ interface
+ function acc_on_device (dev)
+ use openacc_kinds
+ logical (4) :: acc_on_device
+ integer (acc_device_kind), intent (in) :: dev
+ end function acc_on_device
+ end interface
+
+end module openacc
new file mode 100644
@@ -0,0 +1,127 @@
+/* OpenACC Runtime Library User-facing Declarations
+
+ Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+ Contributed by Thomas Schwinge <thomas@codesourcery.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _OPENACC_H
+#define _OPENACC_H 1
+
+#include "gomp-constants.h"
+
+/* The OpenACC std is silent on whether or not including openacc.h
+ might or must not include other header files. We chose to include
+ some. */
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if __cplusplus >= 201103
+# define __GOACC_NOTHROW noexcept ()
+#elif __cplusplus
+# define __GOACC_NOTHROW throw ()
+#else /* Not C++ */
+# define __GOACC_NOTHROW __attribute__ ((__nothrow__))
+#endif
+
+ /* Types */
+ typedef enum acc_device_t
+ {
+ acc_device_none = 0,
+ acc_device_default, /* This has to be a distinct value, as no
+ return value can match it. */
+ acc_device_host = GOMP_TARGET_HOST,
+ acc_device_nonshm_host = GOMP_TARGET_NONSHM_HOST,
+ acc_device_not_host,
+ acc_device_nvidia = GOMP_TARGET_NVIDIA_PTX,
+ _ACC_device_hwm
+ } acc_device_t;
+
+ typedef enum acc_async_t
+ {
+ acc_async_noval = -1,
+ acc_async_sync = -2
+ } acc_async_t;
+
+ int acc_get_num_devices (acc_device_t __dev) __GOACC_NOTHROW;
+ void acc_set_device_type (acc_device_t __dev) __GOACC_NOTHROW;
+ acc_device_t acc_get_device_type (void) __GOACC_NOTHROW;
+ void acc_set_device_num (int __num, acc_device_t __dev) __GOACC_NOTHROW;
+ int acc_get_device_num (acc_device_t __dev) __GOACC_NOTHROW;
+ int acc_async_test (int __async) __GOACC_NOTHROW;
+ int acc_async_test_all (void) __GOACC_NOTHROW;
+ void acc_wait (int __async) __GOACC_NOTHROW;
+ void acc_wait_async (int __async1, int __async2) __GOACC_NOTHROW;
+ void acc_wait_all (void) __GOACC_NOTHROW;
+ void acc_wait_all_async (int __async) __GOACC_NOTHROW;
+ void acc_init (acc_device_t __dev) __GOACC_NOTHROW;
+ void acc_shutdown (acc_device_t __dev) __GOACC_NOTHROW;
+ int acc_on_device (acc_device_t __dev) __GOACC_NOTHROW;
+ void *acc_malloc (size_t) __GOACC_NOTHROW;
+ void acc_free (void *) __GOACC_NOTHROW;
+ /* Some of these would be more correct with const qualifiers, but
+ the standard specifies otherwise. */
+ void *acc_copyin (void *, size_t) __GOACC_NOTHROW;
+ void *acc_present_or_copyin (void *, size_t) __GOACC_NOTHROW;
+ void *acc_create (void *, size_t) __GOACC_NOTHROW;
+ void *acc_present_or_create (void *, size_t) __GOACC_NOTHROW;
+ void acc_copyout (void *, size_t) __GOACC_NOTHROW;
+ void acc_delete (void *, size_t) __GOACC_NOTHROW;
+ void acc_update_device (void *, size_t) __GOACC_NOTHROW;
+ void acc_update_self (void *, size_t) __GOACC_NOTHROW;
+ void acc_map_data (void *, void *, size_t) __GOACC_NOTHROW;
+ void acc_unmap_data (void *) __GOACC_NOTHROW;
+ void *acc_deviceptr (void *) __GOACC_NOTHROW;
+ void *acc_hostptr (void *) __GOACC_NOTHROW;
+ int acc_is_present (void *, size_t) __GOACC_NOTHROW;
+ void acc_memcpy_to_device (void *, void *, size_t) __GOACC_NOTHROW;
+ void acc_memcpy_from_device (void *, void *, size_t) __GOACC_NOTHROW;
+
+ void ACC_target (int, void (*) (void *), const void *,
+ size_t, void **, size_t *, unsigned char *, int *) __GOACC_NOTHROW;
+ void ACC_parallel (int, void (*) (void *), const void *,
+ size_t, void **, size_t *, unsigned char *) __GOACC_NOTHROW;
+ void ACC_add_device_code (void const *, char const *) __GOACC_NOTHROW;
+
+ void ACC_async_copy(int) __GOACC_NOTHROW;
+ void ACC_async_kern(int) __GOACC_NOTHROW;
+
+ /* Old names. OpenACC does not specify whether these can or must
+ not be macros, inlines or aliases for the new names. */
+ #define acc_pcreate acc_present_or_create
+ #define acc_pcopyin acc_present_or_copyin
+
+ /* CUDA-specific routines. */
+ void *acc_get_current_cuda_device (void) __GOACC_NOTHROW;
+ void *acc_get_current_cuda_context (void) __GOACC_NOTHROW;
+ void *acc_get_cuda_stream (int __async) __GOACC_NOTHROW;
+ int acc_set_cuda_stream (int __async, void *__stream) __GOACC_NOTHROW;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _OPENACC_H */
new file mode 100644
@@ -0,0 +1,64 @@
+! OpenACC Runtime Library Definitions. -*- mode: fortran -*-
+
+! Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+! Contributed by Thomas Schwinge <thomas@codesourcery.com>.
+
+! This file is part of the GNU OpenMP Library (libgomp).
+
+! Libgomp is free software; you can redistribute it and/or modify it
+! under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 3, or (at your option)
+! any later version.
+
+! Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+! FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+! more details.
+
+! Under Section 7 of GPL version 3, you are granted additional
+! permissions described in the GCC Runtime Library Exception, version
+! 3.1, as published by the Free Software Foundation.
+
+! You should have received a copy of the GNU General Public License and
+! a copy of the GCC Runtime Library Exception along with this program;
+! see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+! <http://www.gnu.org/licenses/>.
+
+ integer openacc_version
+ parameter (openacc_version = 201306)
+
+ integer acc_device_kind
+ parameter (acc_device_kind = 4)
+ integer (acc_device_kind) acc_device_none
+ parameter (acc_device_none = 0)
+ integer (acc_device_kind) acc_device_default
+ parameter (acc_device_default = 1)
+ integer (acc_device_kind) acc_device_host
+ parameter (acc_device_host = 2)
+ integer (acc_device_kind) acc_device_nonshm_host
+ parameter (acc_device_nonshm_host = 3)
+ integer (acc_device_kind) acc_device_not_host
+ parameter (acc_device_not_host = 4)
+ integer (acc_device_kind) acc_device_nvidia
+ parameter (acc_device_nvidia = 5)
+
+ external acc_get_num_devices
+ integer (4) acc_get_num_devices
+
+ external acc_set_device_type
+
+ external acc_get_device_type
+ integer (acc_device_kind) acc_get_device_type
+
+ external acc_set_device_num
+
+ external acc_get_device_num
+ integer (4) acc_get_device_num
+
+ external acc_init
+
+ external acc_shutdown
+
+ external acc_on_device
+ logical (4) acc_on_device
new file mode 100644
@@ -0,0 +1,1854 @@
+/* Plugin for NVPTX execution.
+
+ Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+ Contributed by CodeSourcery.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Nvidia PTX-specific parts of OpenACC support. The cuda driver
+ library appears to hold some implicit state, but the documentation
+ is not clear as to what that state might be. Or how one might
+ propagate it from one thread to another. */
+
+//#define DEBUG
+//#define DISABLE_ASYNC
+
+#include "openacc.h"
+#include "config.h"
+#include "libgomp.h"
+#include "target.h"
+#include "libgomp-plugin.h"
+
+#include <cuda.h>
+#include <sys/queue.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <dlfcn.h>
+#include <unistd.h>
+#include <assert.h>
+
+#define CUERRORS 50
+static struct _errlist
+{
+ CUresult r;
+ char *m;
+} cuErrorList[CUERRORS] = {
+ { CUDA_ERROR_INVALID_VALUE, "invalid value" },
+ { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" },
+ { CUDA_ERROR_NOT_INITIALIZED, "not initialized" },
+ { CUDA_ERROR_DEINITIALIZED, "deinitialized" },
+ { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" },
+ { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" },
+ { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" },
+ { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" },
+ { CUDA_ERROR_NO_DEVICE, "no device" },
+ { CUDA_ERROR_INVALID_DEVICE, "invalid device" },
+ { CUDA_ERROR_INVALID_IMAGE, "invalid image" },
+ { CUDA_ERROR_INVALID_CONTEXT, "invalid context" },
+ { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" },
+ { CUDA_ERROR_MAP_FAILED, "map error" },
+ { CUDA_ERROR_UNMAP_FAILED, "unmap error" },
+ { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" },
+ { CUDA_ERROR_ALREADY_MAPPED, "already mapped" },
+ { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" },
+ { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" },
+ { CUDA_ERROR_NOT_MAPPED, "not mapped" },
+ { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" },
+ { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" },
+ { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" },
+ { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" },
+ { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" },
+ { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" },
+ { CUDA_ERROR_INVALID_SOURCE, "invalid source" },
+ { CUDA_ERROR_FILE_NOT_FOUND, "file not found" },
+ { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ "shared object symbol not found" },
+ { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" },
+ { CUDA_ERROR_OPERATING_SYSTEM, "operating system" },
+ { CUDA_ERROR_INVALID_HANDLE, "invalid handle" },
+ { CUDA_ERROR_NOT_FOUND, "not found" },
+ { CUDA_ERROR_NOT_READY, "not ready" },
+ { CUDA_ERROR_LAUNCH_FAILED, "launch error" },
+ { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" },
+ { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" },
+ { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ "launch incompatibe texturing" },
+ { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" },
+ { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled " },
+ { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary cotext active" },
+ { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" },
+ { CUDA_ERROR_ASSERT, "assert" },
+ { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" },
+ { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
+ "host memory already registered" },
+ { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" },
+ { CUDA_ERROR_NOT_PERMITTED, "no permitted" },
+ { CUDA_ERROR_NOT_SUPPORTED, "not supported" },
+ { CUDA_ERROR_UNKNOWN, "unknown" }
+};
+
+static char errmsg[128];
+
+static char *
+cuErrorMsg (CUresult r)
+{
+ int i;
+
+ for (i = 0; i < CUERRORS; i++)
+ {
+ if (cuErrorList[i].r == r)
+ return &cuErrorList[i].m[0];
+ }
+
+ sprintf (&errmsg[0], "unknown result code: %5d", r);
+
+ return &errmsg[0];
+}
+
+static bool PTX_inited = false;
+
+struct PTX_stream
+{
+ CUstream stream;
+ pthread_t host_thread;
+ bool multithreaded;
+
+ CUdeviceptr d;
+ void *h;
+ void *h_begin;
+ void *h_end;
+ void *h_next;
+ void *h_prev;
+ void *h_tail;
+
+ SLIST_ENTRY(PTX_stream) next;
+};
+
+SLIST_HEAD(PTX_streams, PTX_stream);
+
+/* Each thread may select a stream (also specific to a device/context). */
+static __thread struct PTX_stream *current_stream;
+
+struct map
+{
+ int async;
+ size_t size;
+ char mappings[0];
+};
+
+static void
+map_init (struct PTX_stream *s)
+{
+ CUresult r;
+
+ int size = getpagesize ();
+
+ assert (s);
+ assert (!s->d);
+ assert (!s->h);
+
+ r = cuMemAllocHost (&s->h, size);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuMemAllocHost error: %s", cuErrorMsg (r));
+
+ r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuMemHostGetDevicePointer error: %s", cuErrorMsg (r));
+
+ assert (s->h);
+
+ s->h_begin = s->h;
+ s->h_end = s->h_begin + size;
+ s->h_next = s->h_prev = s->h_tail = s->h_begin;
+
+ assert (s->h_next);
+ assert (s->h_end);
+}
+
+static void
+map_fini (struct PTX_stream *s)
+{
+ CUresult r;
+
+ r = cuMemFreeHost (s->h);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuMemFreeHost error: %s", cuErrorMsg (r));
+}
+
+static void
+map_pop (struct PTX_stream *s)
+{
+ struct map *m;
+
+ assert (s != NULL);
+ assert (s->h_next);
+ assert (s->h_prev);
+ assert (s->h_tail);
+
+ m = s->h_tail;
+
+ s->h_tail += m->size;
+
+ if (s->h_tail >= s->h_end)
+ s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
+
+ if (s->h_next == s->h_tail)
+ s->h_prev = s->h_next;
+
+ assert (s->h_next >= s->h_begin);
+ assert (s->h_tail >= s->h_begin);
+ assert (s->h_prev >= s->h_begin);
+
+ assert (s->h_next <= s->h_end);
+ assert (s->h_tail <= s->h_end);
+ assert (s->h_prev <= s->h_end);
+}
+
+static void
+map_push (struct PTX_stream *s, int async, size_t size, void **h, void **d)
+{
+ int left;
+ int offset;
+ struct map *m;
+
+ assert (s != NULL);
+
+ left = s->h_end - s->h_next;
+ size += sizeof (struct map);
+
+ assert (s->h_prev);
+ assert (s->h_next);
+
+ if (size >= left)
+ {
+ m = s->h_prev;
+ m->size += left;
+ s->h_next = s->h_begin;
+
+ if (s->h_next + size > s->h_end)
+ gomp_plugin_fatal ("unable to push map");
+ }
+
+ assert (s->h_next);
+
+ m = s->h_next;
+ m->async = async;
+ m->size = size;
+
+ offset = (void *)&m->mappings[0] - s->h;
+
+ *d = (void *)(s->d + offset);
+ *h = (void *)(s->h + offset);
+
+ s->h_prev = s->h_next;
+ s->h_next += size;
+
+ assert (s->h_prev);
+ assert (s->h_next);
+
+ assert (s->h_next >= s->h_begin);
+ assert (s->h_tail >= s->h_begin);
+ assert (s->h_prev >= s->h_begin);
+ assert (s->h_next <= s->h_end);
+ assert (s->h_tail <= s->h_end);
+ assert (s->h_prev <= s->h_end);
+
+ return;
+}
+
+struct PTX_device
+{
+ CUcontext ctx;
+ bool ctx_shared;
+ CUdevice dev;
+ struct PTX_stream *null_stream;
+ /* All non-null streams associated with this device (actually context),
+ either created implicitly or passed in from the user (via
+ acc_set_cuda_stream). */
+ struct PTX_streams active_streams;
+ struct {
+ struct PTX_stream **arr;
+ int size;
+ } async_streams;
+ /* A lock for use when manipulating the above stream list and array. */
+ gomp_mutex_t stream_lock;
+ int ord;
+ bool overlap;
+ bool map;
+ bool concur;
+ int mode;
+ bool mkern;
+ SLIST_ENTRY(PTX_device) next;
+};
+
+static __thread struct PTX_device *PTX_dev;
+static SLIST_HEAD(_PTX_devices, PTX_device) _PTX_devices;
+static struct _PTX_devices *PTX_devices;
+
+enum PTX_event_type
+{
+ PTX_EVT_MEM,
+ PTX_EVT_KNL,
+ PTX_EVT_SYNC
+};
+
+struct PTX_event
+{
+ CUevent *evt;
+ int type;
+ void *addr;
+ void *tgt;
+ int ord;
+ SLIST_ENTRY(PTX_event) next;
+};
+
+static gomp_mutex_t PTX_event_lock;
+static SLIST_HEAD(_PTX_events, PTX_event) _PTX_events;
+static struct _PTX_events *PTX_events;
+
+#define _XSTR(s) _STR(s)
+#define _STR(s) #s
+
+#define CUSYMS 36
+static struct _synames
+{
+ char *n;
+} cuSymNames[CUSYMS] =
+{
+ { _XSTR(cuCtxCreate) },
+ { _XSTR(cuCtxDestroy) },
+ { _XSTR(cuCtxGetCurrent) },
+ { _XSTR(cuCtxPushCurrent) },
+ { _XSTR(cuCtxSynchronize) },
+ { _XSTR(cuDeviceGet) },
+ { _XSTR(cuDeviceGetAttribute) },
+ { _XSTR(cuDeviceGetCount) },
+ { _XSTR(cuEventCreate) },
+ { _XSTR(cuEventDestroy) },
+ { _XSTR(cuEventQuery) },
+ { _XSTR(cuEventRecord) },
+ { _XSTR(cuInit) },
+ { _XSTR(cuLaunchKernel) },
+ { _XSTR(cuLinkAddData) },
+ { _XSTR(cuLinkComplete) },
+ { _XSTR(cuLinkCreate) },
+ { _XSTR(cuMemAlloc) },
+ { _XSTR(cuMemAllocHost) },
+ { _XSTR(cuMemcpy) },
+ { _XSTR(cuMemcpyDtoH) },
+ { _XSTR(cuMemcpyDtoHAsync) },
+ { _XSTR(cuMemcpyHtoD) },
+ { _XSTR(cuMemcpyHtoDAsync) },
+ { _XSTR(cuMemFree) },
+ { _XSTR(cuMemFreeHost) },
+ { _XSTR(cuMemGetAddressRange) },
+ { _XSTR(cuMemHostGetDevicePointer) },
+ { _XSTR(cuMemHostRegister) },
+ { _XSTR(cuMemHostUnregister) },
+ { _XSTR(cuModuleGetFunction) },
+ { _XSTR(cuModuleLoadData) },
+ { _XSTR(cuStreamDestroy) },
+ { _XSTR(cuStreamQuery) },
+ { _XSTR(cuStreamSynchronize) },
+ { _XSTR(cuStreamWaitEvent) }
+};
+
+static int
+verify_device_library (void)
+{
+ int i;
+ void *dh, *ds;
+
+ dh = dlopen ("libcuda.so", RTLD_LAZY);
+ if (!dh)
+ return -1;
+
+ for (i = 0; i < CUSYMS; i++)
+ {
+ ds = dlsym (dh, cuSymNames[i].n);
+ if (!ds)
+ return -1;
+ }
+
+ dlclose (dh);
+
+ return 0;
+}
+
+static void
+init_streams_for_device (struct PTX_device *ptx_dev, int concurrency)
+{
+ int i;
+ struct PTX_stream *null_stream
+ = gomp_plugin_malloc (sizeof (struct PTX_stream));
+
+ null_stream->stream = NULL;
+ null_stream->host_thread = pthread_self ();
+ null_stream->multithreaded = true;
+ null_stream->d = (CUdeviceptr) NULL;
+ null_stream->h = NULL;
+ map_init (null_stream);
+ ptx_dev->null_stream = null_stream;
+
+ SLIST_INIT (&ptx_dev->active_streams);
+ gomp_plugin_mutex_init (&ptx_dev->stream_lock);
+
+ if (concurrency < 1)
+ concurrency = 1;
+
+ /* This is just a guess -- make space for as many async streams as the
+ current device is capable of concurrently executing. This can grow
+ later as necessary. No streams are created yet. */
+ ptx_dev->async_streams.arr
+ = gomp_plugin_malloc (concurrency * sizeof (struct PTX_stream *));
+ ptx_dev->async_streams.size = concurrency;
+
+ for (i = 0; i < concurrency; i++)
+ ptx_dev->async_streams.arr[i] = NULL;
+}
+
+static void
+fini_streams_for_device (struct PTX_device *ptx_dev)
+{
+ struct PTX_stream *s;
+ free (ptx_dev->async_streams.arr);
+
+ while (!SLIST_EMPTY (&ptx_dev->active_streams))
+ {
+ s = SLIST_FIRST (&ptx_dev->active_streams);
+ SLIST_REMOVE_HEAD (&ptx_dev->active_streams, next);
+ cuStreamDestroy (s->stream);
+ map_fini (s);
+ free (s);
+ }
+
+ map_fini (ptx_dev->null_stream);
+ free (ptx_dev->null_stream);
+}
+
+/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
+ thread THREAD (and also current device/context). If CREATE is true, create
+ the stream if it does not exist (or use EXISTING if it is non-NULL), and
+ associate the stream with the same thread argument. Returns stream to use
+ as result. */
+
+static struct PTX_stream *
+select_stream_for_async (int async, pthread_t thread, bool create,
+ CUstream existing)
+{
+ /* Local copy of TLS variable. */
+ struct PTX_device *ptx_dev = PTX_dev;
+ struct PTX_stream *stream = NULL;
+ int orig_async = async;
+
+ /* The special value acc_async_noval (-1) maps (for now) to an
+ implicitly-created stream, which is then handled the same as any other
+ numbered async stream. Other options are available, e.g. using the null
+ stream for anonymous async operations, or choosing an idle stream from an
+ active set. But, stick with this for now. */
+ if (async > acc_async_sync)
+ async++;
+
+ if (create)
+ gomp_plugin_mutex_lock (&ptx_dev->stream_lock);
+
+ /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
+ null stream, and in fact better performance may be obtainable if it doesn't
+ (because the null stream enforces overly-strict synchronisation with
+ respect to other streams for legacy reasons, and that's probably not
+ needed with OpenACC). Maybe investigate later. */
+ if (async == acc_async_sync)
+ stream = ptx_dev->null_stream;
+ else if (async >= 0 && async < ptx_dev->async_streams.size
+ && ptx_dev->async_streams.arr[async] && !(create && existing))
+ stream = ptx_dev->async_streams.arr[async];
+ else if (async >= 0 && create)
+ {
+ if (async >= ptx_dev->async_streams.size)
+ {
+ int i, newsize = ptx_dev->async_streams.size * 2;
+
+ if (async >= newsize)
+ newsize = async + 1;
+
+ ptx_dev->async_streams.arr
+ = gomp_plugin_realloc (ptx_dev->async_streams.arr,
+ newsize * sizeof (struct PTX_stream *));
+
+ for (i = ptx_dev->async_streams.size; i < newsize; i++)
+ ptx_dev->async_streams.arr[i] = NULL;
+
+ ptx_dev->async_streams.size = newsize;
+ }
+
+ /* Create a new stream on-demand if there isn't one already, or if we're
+ setting a particular async value to an existing (externally-provided)
+ stream. */
+ if (!ptx_dev->async_streams.arr[async] || existing)
+ {
+ CUresult r;
+ struct PTX_stream *s
+ = gomp_plugin_malloc (sizeof (struct PTX_stream));
+
+ if (existing)
+ s->stream = existing;
+ else
+ {
+ r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuStreamCreate error: %s", cuErrorMsg (r));
+ }
+
+ /* If CREATE is true, we're going to be queueing some work on this
+ stream. Associate it with the current host thread. */
+ s->host_thread = thread;
+ s->multithreaded = false;
+
+ s->d = (CUdeviceptr) NULL;
+ s->h = NULL;
+ map_init (s);
+
+ SLIST_INSERT_HEAD (&ptx_dev->active_streams, s, next);
+ ptx_dev->async_streams.arr[async] = s;
+ }
+
+ stream = ptx_dev->async_streams.arr[async];
+ }
+ else if (async < 0)
+ gomp_plugin_fatal ("bad async %d", async);
+
+ if (create)
+ {
+ assert (stream != NULL);
+
+ /* If we're trying to use the same stream from different threads
+ simultaneously, set stream->multithreaded to true. This affects the
+ behaviour of acc_async_test_all and acc_wait_all, which are supposed to
+ only wait for asynchronous launches from the same host thread they are
+ invoked on. If multiple threads use the same async value, we make note
+ of that here and fall back to testing/waiting for all threads in those
+ functions. */
+ if (thread != stream->host_thread)
+ stream->multithreaded = true;
+
+ gomp_plugin_mutex_unlock (&ptx_dev->stream_lock);
+ }
+ else if (stream && !stream->multithreaded
+ && !pthread_equal (stream->host_thread, thread))
+ gomp_plugin_fatal ("async %d used on wrong thread", orig_async);
+
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s using stream %p (CUDA stream %p) "
+ "for async %d\n", __FILE__, __FUNCTION__, stream,
+ stream ? stream->stream : NULL, orig_async);
+#endif
+
+ return stream;
+}
+
+static int PTX_get_num_devices (void);
+
+/* Initialize the device. */
+static int
+PTX_init (void)
+{
+ CUresult r;
+ int rc;
+
+ if (PTX_inited)
+ return PTX_get_num_devices ();
+
+ rc = verify_device_library ();
+ if (rc < 0)
+ return -1;
+
+ r = cuInit (0);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuInit error: %s", cuErrorMsg (r));
+
+ PTX_devices = &_PTX_devices;
+ PTX_events = &_PTX_events;
+
+ SLIST_INIT(PTX_devices);
+ SLIST_INIT(PTX_events);
+
+ gomp_plugin_mutex_init (&PTX_event_lock);
+
+ PTX_inited = true;
+
+ return PTX_get_num_devices ();
+}
+
+static int
+PTX_fini (void)
+{
+ PTX_inited = false;
+
+ return 0;
+}
+
+static void *
+PTX_open_device (int n)
+{
+ CUdevice dev;
+ CUresult r;
+ int async_engines, pi;
+
+ if (PTX_devices)
+ {
+ struct PTX_device *ptx_device;
+
+ SLIST_FOREACH(ptx_device, PTX_devices, next)
+ {
+ if (ptx_device->ord == n)
+ {
+ PTX_dev = ptx_device;
+
+ if (PTX_dev->ctx)
+ {
+ r = cuCtxPushCurrent (PTX_dev->ctx);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuCtxPushCurrent error: %s",
+ cuErrorMsg (r));
+ }
+
+ return (void *)PTX_dev;
+ }
+ }
+ }
+
+ r = cuDeviceGet (&dev, n);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuDeviceGet error: %s", cuErrorMsg (r));
+
+ PTX_dev = gomp_plugin_malloc (sizeof (struct PTX_device));
+ PTX_dev->ord = n;
+ PTX_dev->dev = dev;
+ PTX_dev->ctx_shared = false;
+
+ SLIST_INSERT_HEAD(PTX_devices, PTX_dev, next);
+
+ r = cuCtxGetCurrent (&PTX_dev->ctx);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuCtxGetCurrent error: %s", cuErrorMsg (r));
+
+ if (!PTX_dev->ctx)
+ {
+ r = cuCtxCreate (&PTX_dev->ctx, CU_CTX_SCHED_AUTO, dev);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuCtxCreate error: %s", cuErrorMsg (r));
+ }
+ else
+ {
+ PTX_dev->ctx_shared = true;
+ }
+
+ r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));
+
+ PTX_dev->overlap = pi;
+
+ r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));
+
+ PTX_dev->map = pi;
+
+ r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));
+
+ PTX_dev->concur = pi;
+
+ r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));
+
+ PTX_dev->mode = pi;
+
+ r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));
+
+ PTX_dev->mkern = pi;
+
+ r = cuDeviceGetAttribute (&async_engines,
+ CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
+ if (r != CUDA_SUCCESS)
+ async_engines = 1;
+
+ init_streams_for_device (PTX_dev, async_engines);
+
+ current_stream = PTX_dev->null_stream;
+
+ return (void *)PTX_dev;
+}
+
+static int
+PTX_close_device (void *h __attribute__((unused)))
+{
+ CUresult r;
+
+ if (!PTX_dev)
+ return 0;
+
+ fini_streams_for_device (PTX_dev);
+
+ if (!PTX_dev->ctx_shared)
+ {
+ r = cuCtxDestroy (PTX_dev->ctx);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuCtxDestroy error: %s", cuErrorMsg (r));
+ }
+
+ SLIST_REMOVE(PTX_devices, PTX_dev, PTX_device, next);
+ free (PTX_dev);
+
+ PTX_dev = NULL;
+
+ return 0;
+}
+
+static int
+PTX_get_num_devices (void)
+{
+ int n;
+ CUresult r;
+
+ assert (PTX_inited);
+
+ r = cuDeviceGetCount (&n);
+ if (r!= CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuDeviceGetCount error: %s", cuErrorMsg (r));
+
+ return n;
+}
+
+static bool
+PTX_avail(void)
+{
+ bool avail = false;
+
+ if (PTX_init () > 0)
+ avail = true;
+
+ return avail;
+}
+
+#define ABORT_PTX \
+ ".version 3.1\n" \
+ ".target sm_30\n" \
+ ".address_size 64\n" \
+ ".visible .func abort;\n" \
+ ".visible .func abort\n" \
+ "{\n" \
+ "trap;\n" \
+ "ret;\n" \
+ "}\n" \
+ ".visible .func _gfortran_abort;\n" \
+ ".visible .func _gfortran_abort\n" \
+ "{\n" \
+ "trap;\n" \
+ "ret;\n" \
+ "}\n" \
+
+/* Generated with:
+
+ $ echo 'int acc_on_device(int d) { return __builtin_acc_on_device(d); } int acc_on_device_(int *d) { return acc_on_device(*d); }' | accel-gcc/xgcc -Baccel-gcc -x c - -o - -S -m64 -O3 -fno-builtin-acc_on_device -fno-inline
+*/
+#define ACC_ON_DEVICE_PTX \
+ " .version 3.1\n" \
+ " .target sm_30\n" \
+ " .address_size 64\n" \
+ ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1);\n" \
+ ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1)\n" \
+ "{\n" \
+ " .reg.u32 %ar1;\n" \
+ ".reg.u32 %retval;\n" \
+ " .reg.u64 %hr10;\n" \
+ " .reg.u32 %r24;\n" \
+ " .reg.u32 %r25;\n" \
+ " .reg.pred %r27;\n" \
+ " .reg.u32 %r30;\n" \
+ " ld.param.u32 %ar1, [%in_ar1];\n" \
+ " mov.u32 %r24, %ar1;\n" \
+ " setp.ne.u32 %r27,%r24,4;\n" \
+ " set.u32.eq.u32 %r30,%r24,5;\n" \
+ " neg.s32 %r25, %r30;\n" \
+ " @%r27 bra $L3;\n" \
+ " mov.u32 %r25, 1;\n" \
+ "$L3:\n" \
+ " mov.u32 %retval, %r25;\n" \
+ " st.param.u32 [%out_retval], %retval;\n" \
+ " ret;\n" \
+ " }\n" \
+ ".visible .func (.param.u32 %out_retval)acc_on_device_(.param.u64 %in_ar1);\n" \
+ ".visible .func (.param.u32 %out_retval)acc_on_device_(.param.u64 %in_ar1)\n" \
+ "{\n" \
+ " .reg.u64 %ar1;\n" \
+ ".reg.u32 %retval;\n" \
+ " .reg.u64 %hr10;\n" \
+ " .reg.u64 %r25;\n" \
+ " .reg.u32 %r26;\n" \
+ " .reg.u32 %r27;\n" \
+ " ld.param.u64 %ar1, [%in_ar1];\n" \
+ " mov.u64 %r25, %ar1;\n" \
+ " ld.u32 %r26, [%r25];\n" \
+ " {\n" \
+ " .param.u32 %retval_in;\n" \
+ " {\n" \
+ " .param.u32 %out_arg0;\n" \
+ " st.param.u32 [%out_arg0], %r26;\n" \
+ " call (%retval_in), acc_on_device, (%out_arg0);\n" \
+ " }\n" \
+ " ld.param.u32 %r27, [%retval_in];\n" \
+ "}\n" \
+ " mov.u32 %retval, %r27;\n" \
+ " st.param.u32 [%out_retval], %retval;\n" \
+ " ret;\n" \
+ " }"
+
+static void
+link_ptx (CUmodule *module, char *ptx_code)
+{
+ CUjit_option opts[7];
+ void *optvals[7];
+ float elapsed = 0.0;
+#define LOGSIZE 8192
+ char elog[LOGSIZE];
+ char ilog[LOGSIZE];
+ unsigned long logsize = LOGSIZE;
+ CUlinkState linkstate;
+ CUresult r;
+ void *linkout;
+ size_t linkoutsize __attribute__((unused));
+
+ gomp_plugin_notify ("attempting to load:\n---\n%s\n---\n", ptx_code);
+
+ opts[0] = CU_JIT_WALL_TIME;
+ optvals[0] = &elapsed;
+
+ opts[1] = CU_JIT_INFO_LOG_BUFFER;
+ optvals[1] = &ilog[0];
+
+ opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
+ optvals[2] = (void *) logsize;
+
+ opts[3] = CU_JIT_ERROR_LOG_BUFFER;
+ optvals[3] = &elog[0];
+
+ opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
+ optvals[4] = (void *) logsize;
+
+ opts[5] = CU_JIT_LOG_VERBOSE;
+ optvals[5] = (void *) 1;
+
+ opts[6] = CU_JIT_TARGET;
+ optvals[6] = (void *) CU_TARGET_COMPUTE_30;
+
+ r = cuLinkCreate (7, opts, optvals, &linkstate);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuLinkCreate error: %s", cuErrorMsg (r));
+
+ char *abort_ptx = ABORT_PTX;
+ r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
+ strlen (abort_ptx) + 1, 0, 0, 0, 0);
+ if (r != CUDA_SUCCESS)
+ {
+ gomp_plugin_error ("Link error log %s\n", &elog[0]);
+ gomp_plugin_fatal ("cuLinkAddData (abort) error: %s", cuErrorMsg (r));
+ }
+
+ char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
+ r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
+ strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
+ if (r != CUDA_SUCCESS)
+ {
+ gomp_plugin_error ("Link error log %s\n", &elog[0]);
+ gomp_plugin_fatal ("cuLinkAddData (acc_on_device) error: %s",
+ cuErrorMsg (r));
+ }
+
+ r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code,
+ strlen (ptx_code) + 1, 0, 0, 0, 0);
+ if (r != CUDA_SUCCESS)
+ {
+ gomp_plugin_error ("Link error log %s\n", &elog[0]);
+ gomp_plugin_fatal ("cuLinkAddData (ptx_code) error: %s", cuErrorMsg (r));
+ }
+
+ r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuLinkComplete error: %s", cuErrorMsg (r));
+
+ gomp_plugin_notify ("Link complete: %fms\n", elapsed);
+ gomp_plugin_notify ("Link log %s\n", &ilog[0]);
+
+ r = cuModuleLoadData (module, linkout);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuModuleLoadData error: %s", cuErrorMsg (r));
+}
+
+static void
+event_gc (void)
+{
+ struct PTX_event *ptx_event;
+
+ gomp_plugin_mutex_lock (&PTX_event_lock);
+
+ for (ptx_event = SLIST_FIRST (PTX_events); ptx_event;)
+ {
+ CUresult r;
+ struct PTX_event *next = SLIST_NEXT (ptx_event, next);
+
+ if (ptx_event->ord != PTX_dev->ord)
+ continue;
+
+ r = cuEventQuery (*ptx_event->evt);
+ if (r == CUDA_SUCCESS)
+ {
+ CUevent *te;
+
+ te = ptx_event->evt;
+
+ switch (ptx_event->type)
+ {
+ case PTX_EVT_MEM:
+ case PTX_EVT_SYNC:
+ break;
+
+ case PTX_EVT_KNL:
+ {
+ map_pop (ptx_event->addr);
+ if (ptx_event->tgt)
+ gomp_plugin_async_unmap_vars (ptx_event->tgt);
+ }
+ break;
+ }
+
+ cuEventDestroy (*te);
+ free ((void *)te);
+
+ SLIST_REMOVE (PTX_events, ptx_event, PTX_event, next);
+
+ free (ptx_event);
+
+ ptx_event = next;
+ }
+ }
+
+ gomp_plugin_mutex_unlock (&PTX_event_lock);
+}
+
+static void
+event_add (enum PTX_event_type type, CUevent *e, void *h, void *tgt)
+{
+ struct PTX_event *ptx_event;
+
+ assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC);
+
+ ptx_event = gomp_plugin_malloc (sizeof (struct PTX_event));
+ ptx_event->type = type;
+ ptx_event->evt = e;
+ ptx_event->addr = h;
+ ptx_event->tgt = tgt;
+ ptx_event->ord = PTX_dev->ord;
+
+ gomp_plugin_mutex_lock (&PTX_event_lock);
+
+ SLIST_INSERT_HEAD(PTX_events, ptx_event, next);
+
+ gomp_plugin_mutex_unlock (&PTX_event_lock);
+}
+
+static void **kernel_target_data;
+static void **kernel_host_table;
+
+void
+PTX_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
+ size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers,
+ int vector_length, int async, void *targ_mem_desc)
+{
+ CUfunction function;
+ CUmodule module;
+ CUresult r;
+ char *kernel_name = NULL;
+ char **fn_names;
+ int i;
+ int fn_entries;
+ struct PTX_stream *dev_str;
+ void **fn_table;
+ void *kargs[1];
+ void *hp, *dp;
+ unsigned int nthreads_in_block;
+
+ if (kernel_target_data == NULL)
+ gomp_plugin_fatal ("Could not find module with kernel functions\n"
+ "Perhaps -fopenacc was used without -flto ?");
+
+ link_ptx (&module, kernel_target_data[0]);
+
+ /* kernel_target_data[0] -> ptx code
+ kernel_target_data[1] -> variable mappings
+ kernel_target_data[2] -> array of kernel names in ascii
+
+ kernel_host_table[0] -> start of function addresses (_omp_func_table)
+ kernel_host_table[1] -> end of function addresses (_omp_funcs_end)
+
+ The array of kernel names and the functions addresses form a
+ one-to-one correspondence. */
+
+ fn_table = kernel_host_table[0];
+ fn_names = (char **) kernel_target_data[2];
+ fn_entries = (kernel_host_table[1] - kernel_host_table[0]) / sizeof (void *);
+
+ for (i = 0; i < fn_entries; i++)
+ {
+ if (fn_table[i] == fn)
+ {
+ kernel_name = fn_names[i];
+ break;
+ }
+ }
+
+ if (!kernel_name)
+ gomp_plugin_fatal ("Could not find kernel name matching function %p", fn);
+
+ r = cuModuleGetFunction (&function, module, kernel_name);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuModuleGetFunction error: %s", cuErrorMsg (r));
+
+ dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
+ assert (dev_str == current_stream);
+
+ /* This reserves a chunk of a pre-allocated page of memory mapped on both
+ the host and the device. HP is a host pointer to the new chunk, and DP is
+ the corresponding device pointer. */
+ map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
+
+ gomp_plugin_notify (" %s: prepare mappings\n", __FUNCTION__);
+
+ /* Copy the array of arguments to the mapped page. */
+ for (i = 0; i < mapnum; i++)
+ ((void **) hp)[i] = devaddrs[i];
+
+ /* Copy the (device) pointers to arguments to the device (dp and hp might in
+ fact have the same value on a unified-memory system). */
+ r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuMemcpy failed: %s", cuErrorMsg (r));
+
+ gomp_plugin_notify (" %s: kernel %s: launch\n", __FUNCTION__, kernel_name);
+
+ // XXX: possible geometry mappings??
+ //
+ // OpenACC CUDA
+ //
+ // num_gangs blocks
+ // num_workers warps (where a warp is equivalent to 32 threads)
+ // vector length threads
+ //
+
+ /* The openacc vector_length clause 'determines the vector length to use for
+ vector or SIMD operations'. The question is how to map this to CUDA.
+
+ In CUDA, the warp size is the vector length of a CUDA device. However, the
+ CUDA interface abstracts away from that, and only shows us warp size
+ indirectly in maximum number of threads per block, which is a product of
+ warp size and the number of hyperthreads of a multiprocessor.
+
+ We choose to map openacc vector_length directly onto the number of threads
+ in a block, in the x dimension. This is reflected in gcc code generation
+ that uses ThreadIdx.x to access vector elements.
+
+ Attempting to use an openacc vector_length of more than the maximum number
+ of threads per block will result in a cuda error. */
+ nthreads_in_block = vector_length;
+
+ kargs[0] = &dp;
+ r = cuLaunchKernel (function,
+ 1, 1, 1,
+ nthreads_in_block, 1, 1,
+ 0, dev_str->stream, kargs, 0);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuLaunchKernel error: %s", cuErrorMsg (r));
+
+#ifndef DISABLE_ASYNC
+ if (async < acc_async_noval)
+ {
+ r = cuStreamSynchronize (dev_str->stream);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r));
+ }
+ else
+ {
+ CUevent *e;
+
+ e = (CUevent *)gomp_plugin_malloc (sizeof (CUevent));
+
+ r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
+
+ event_gc ();
+
+ r = cuEventRecord (*e, dev_str->stream);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
+
+ event_add (PTX_EVT_KNL, e, (void *)dev_str, targ_mem_desc);
+ }
+#else
+ r = cuCtxSynchronize ();
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuCtxSynchronize error: %s", cuErrorMsg (r));
+#endif
+
+ gomp_plugin_notify (" %s: kernel %s: finished\n", __FUNCTION__,
+ kernel_name);
+
+#ifndef DISABLE_ASYNC
+ if (async < acc_async_noval)
+#endif
+ map_pop (dev_str);
+}
+
+void * openacc_get_current_cuda_context (void);
+
+static void *
+PTX_alloc (size_t s)
+{
+ CUdeviceptr d;
+ CUresult r;
+
+ r = cuMemAlloc (&d, s);
+ if (r == CUDA_ERROR_OUT_OF_MEMORY)
+ return 0;
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuMemAlloc error: %s", cuErrorMsg (r));
+ return (void *)d;
+}
+
+static void
+PTX_free (void *p)
+{
+ CUresult r;
+ CUdeviceptr pb;
+ size_t ps;
+
+ r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r));
+
+ if ((CUdeviceptr)p != pb)
+ gomp_plugin_fatal ("invalid device address");
+
+ r = cuMemFree ((CUdeviceptr)p);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuMemFree error: %s", cuErrorMsg (r));
+}
+
+static void *
+PTX_host2dev (void *d, const void *h, size_t s)
+{
+ CUresult r;
+ CUdeviceptr pb;
+ size_t ps;
+
+ if (!s)
+ return 0;
+
+ if (!d)
+ gomp_plugin_fatal ("invalid device address");
+
+ r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r));
+
+ if (!pb)
+ gomp_plugin_fatal ("invalid device address");
+
+ if (!h)
+ gomp_plugin_fatal ("invalid host address");
+
+ if (d == h)
+ gomp_plugin_fatal ("invalid host or device address");
+
+ if ((void *)(d + s) > (void *)(pb + ps))
+ gomp_plugin_fatal ("invalid size");
+
+#ifndef DISABLE_ASYNC
+ if (current_stream != PTX_dev->null_stream)
+ {
+ CUevent *e;
+
+ e = (CUevent *)gomp_plugin_malloc (sizeof (CUevent));
+
+ r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
+
+ event_gc ();
+
+ r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s, current_stream->stream);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuMemcpyHtoDAsync error: %s", cuErrorMsg (r));
+
+ r = cuEventRecord (*e, current_stream->stream);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
+
+ event_add (PTX_EVT_MEM, e, (void *)h, NULL);
+ }
+ else
+#endif
+ {
+ r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuMemcpyHtoD error: %s", cuErrorMsg (r));
+ }
+
+ return 0;
+}
+
+static void *
+PTX_dev2host (void *h, const void *d, size_t s)
+{
+ CUresult r;
+ CUdeviceptr pb;
+ size_t ps;
+
+ if (!s)
+ return 0;
+
+ if (!d)
+ gomp_plugin_fatal ("invalid device address");
+
+ r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r));
+
+ if (!pb)
+ gomp_plugin_fatal ("invalid device address");
+
+ if (!h)
+ gomp_plugin_fatal ("invalid host address");
+
+ if (d == h)
+ gomp_plugin_fatal ("invalid host or device address");
+
+ if ((void *)(d + s) > (void *)(pb + ps))
+ gomp_plugin_fatal ("invalid size");
+
+#ifndef DISABLE_ASYNC
+ if (current_stream != PTX_dev->null_stream)
+ {
+ CUevent *e;
+
+ e = (CUevent *)gomp_plugin_malloc (sizeof (CUevent));
+
+ r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuEventCreate error: %s\n", cuErrorMsg (r));
+
+ event_gc ();
+
+ r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s, current_stream->stream);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuMemcpyDtoHAsync error: %s", cuErrorMsg (r));
+
+ r = cuEventRecord (*e, current_stream->stream);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
+
+ event_add (PTX_EVT_MEM, e, (void *)h, NULL);
+ }
+ else
+#endif
+ {
+ r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuMemcpyDtoH error: %s", cuErrorMsg (r));
+ }
+
+ return 0;
+}
+
+static void
+PTX_set_async (int async)
+{
+ current_stream = select_stream_for_async (async, pthread_self (), true, NULL);
+}
+
+static int
+PTX_async_test (int async)
+{
+ CUresult r;
+ struct PTX_stream *s;
+
+ s = select_stream_for_async (async, pthread_self (), false, NULL);
+
+ if (!s)
+ gomp_plugin_fatal ("unknown async %d", async);
+
+ r = cuStreamQuery (s->stream);
+ if (r == CUDA_SUCCESS)
+ return 1;
+ else if (r == CUDA_ERROR_NOT_READY)
+ return 0;
+
+ gomp_plugin_fatal ("cuStreamQuery error: %s", cuErrorMsg (r));
+
+ return 0;
+}
+
+static int
+PTX_async_test_all (void)
+{
+ struct PTX_stream *s;
+ pthread_t self = pthread_self ();
+
+ gomp_plugin_mutex_lock (&PTX_dev->stream_lock);
+
+ SLIST_FOREACH (s, &PTX_dev->active_streams, next)
+ {
+ if ((s->multithreaded || pthread_equal (s->host_thread, self))
+ && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
+ {
+ gomp_plugin_mutex_unlock (&PTX_dev->stream_lock);
+ return 0;
+ }
+ }
+
+ gomp_plugin_mutex_unlock (&PTX_dev->stream_lock);
+
+ return 1;
+}
+
+static void
+PTX_wait (int async)
+{
+ CUresult r;
+ struct PTX_stream *s;
+
+ s = select_stream_for_async (async, pthread_self (), false, NULL);
+
+ if (!s)
+ gomp_plugin_fatal ("unknown async %d", async);
+
+ r = cuStreamSynchronize (s->stream);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r));
+
+ event_gc ();
+}
+
+static void
+PTX_wait_async (int async1, int async2)
+{
+ CUresult r;
+ CUevent *e;
+ struct PTX_stream *s1, *s2;
+ pthread_t self = pthread_self ();
+
+ /* The stream that is waiting (rather than being waited for) doesn't
+ necessarily have to exist already. */
+ s2 = select_stream_for_async (async2, self, true, NULL);
+
+ s1 = select_stream_for_async (async1, self, false, NULL);
+ if (!s1)
+ gomp_plugin_fatal ("invalid async 1\n");
+
+ if (s1 == s2)
+ gomp_plugin_fatal ("identical parameters");
+
+ e = (CUevent *)gomp_plugin_malloc (sizeof (CUevent));
+
+ r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
+
+ event_gc ();
+
+ r = cuEventRecord (*e, s1->stream);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
+
+ event_add (PTX_EVT_SYNC, e, NULL, NULL);
+
+ r = cuStreamWaitEvent (s2->stream, *e, 0);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuStreamWaitEvent error: %s", cuErrorMsg (r));
+}
+
+static void
+PTX_wait_all (void)
+{
+ CUresult r;
+ struct PTX_stream *s;
+ pthread_t self = pthread_self ();
+
+ gomp_plugin_mutex_lock (&PTX_dev->stream_lock);
+
+ /* Wait for active streams initiated by this thread (or by multiple threads)
+ to complete. */
+ SLIST_FOREACH (s, &PTX_dev->active_streams, next)
+ {
+ if (s->multithreaded || pthread_equal (s->host_thread, self))
+ {
+ r = cuStreamQuery (s->stream);
+ if (r == CUDA_SUCCESS)
+ continue;
+ else if (r != CUDA_ERROR_NOT_READY)
+ gomp_plugin_fatal ("cuStreamQuery error: %s", cuErrorMsg (r));
+
+ r = cuStreamSynchronize (s->stream);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r));
+ }
+ }
+
+ gomp_plugin_mutex_unlock (&PTX_dev->stream_lock);
+
+ event_gc ();
+}
+
+static void
+PTX_wait_all_async (int async)
+{
+ CUresult r;
+ struct PTX_stream *waiting_stream, *other_stream;
+ CUevent *e;
+ pthread_t self = pthread_self ();
+
+ /* The stream doing the waiting. This could be the first mention of the
+ stream, so create it if necessary. */
+ waiting_stream
+ = select_stream_for_async (async, pthread_self (), true, NULL);
+
+ /* Launches on the null stream already block on other streams in the
+ context. */
+ if (!waiting_stream || waiting_stream == PTX_dev->null_stream)
+ return;
+
+ event_gc ();
+
+ gomp_plugin_mutex_lock (&PTX_dev->stream_lock);
+
+ SLIST_FOREACH (other_stream, &PTX_dev->active_streams, next)
+ {
+ if (!other_stream->multithreaded
+ && !pthread_equal (other_stream->host_thread, self))
+ continue;
+
+ e = (CUevent *) gomp_plugin_malloc (sizeof (CUevent));
+
+ r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
+
+ /* Record an event on the waited-for stream. */
+ r = cuEventRecord (*e, other_stream->stream);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
+
+ event_add (PTX_EVT_SYNC, e, NULL, NULL);
+
+ r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
+ if (r != CUDA_SUCCESS)
+ gomp_plugin_fatal ("cuStreamWaitEvent error: %s", cuErrorMsg (r));
+ }
+
+ gomp_plugin_mutex_unlock (&PTX_dev->stream_lock);
+}
+
+static void *
+PTX_get_current_cuda_device (void)
+{
+ if (!PTX_dev)
+ return NULL;
+
+ return &PTX_dev->dev;
+}
+
+static void *
+PTX_get_current_cuda_context (void)
+{
+ if (!PTX_dev)
+ return NULL;
+
+ return PTX_dev->ctx;
+}
+
+static void *
+PTX_get_cuda_stream (int async)
+{
+ struct PTX_stream *s;
+
+ if (!PTX_dev)
+ return NULL;
+
+ s = select_stream_for_async (async, pthread_self (), false, NULL);
+
+ return s ? s->stream : NULL;
+}
+
+static int
+PTX_set_cuda_stream (int async, void *stream)
+{
+ struct PTX_stream *oldstream;
+ pthread_t self = pthread_self ();
+
+ gomp_plugin_mutex_lock (&PTX_dev->stream_lock);
+
+ if (async < 0)
+ gomp_plugin_fatal ("bad async %d", async);
+
+ /* We have a list of active streams and an array mapping async values to
+ entries of that list. We need to take "ownership" of the passed-in stream,
+ and add it to our list, removing the previous entry also (if there was one)
+ in order to prevent resource leaks. Note the potential for surprise
+ here: maybe we should keep track of passed-in streams and leave it up to
+ the user to tidy those up, but that doesn't work for stream handles
+ returned from acc_get_cuda_stream above... */
+
+ oldstream = select_stream_for_async (async, self, false, NULL);
+
+ if (oldstream)
+ {
+ SLIST_REMOVE (&PTX_dev->active_streams, oldstream, PTX_stream, next);
+
+ cuStreamDestroy (oldstream->stream);
+ map_fini (oldstream);
+ free (oldstream);
+ }
+
+ gomp_plugin_mutex_unlock (&PTX_dev->stream_lock);
+
+ (void) select_stream_for_async (async, self, true, (CUstream) stream);
+
+ return 1;
+}
+
+/* Plugin entry points. */
+
+
+int
+get_type (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+ return TARGET_TYPE_NVIDIA_PTX;
+}
+
+unsigned int
+get_caps (void)
+{
+ return TARGET_CAP_OPENACC_200;
+}
+
+const char *
+get_name (void)
+{
+ return "nvidia";
+}
+
+int
+get_num_devices (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+ return PTX_get_num_devices ();
+}
+
+void
+offload_register (void *host_table, void *target_data)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%p, %p)\n", __FILE__, __FUNCTION__,
+ host_table, target_data);
+#endif
+
+ kernel_target_data = target_data;
+ kernel_host_table = host_table;
+}
+
+int
+device_init (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+ return PTX_init ();
+}
+
+int
+device_fini (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+
+ return PTX_fini ();
+}
+
+int
+device_get_table (void *table)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__,
+ table);
+#endif
+
+ /* There are no fixed host-target address mappings for NVPTX. */
+ return 0;
+}
+
+void *
+device_alloc (size_t size)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%zu)\n", __FILE__, __FUNCTION__,
+ size);
+#endif
+
+ return PTX_alloc (size);
+}
+
+void
+device_free (void *ptr)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__, ptr);
+#endif
+
+ PTX_free (ptr);
+}
+
+void *
+device_dev2host (void *dst, const void *src, size_t n)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%p, %p, %zu)\n", __FILE__,
+ __FUNCTION__, dst,
+ src, n);
+#endif
+
+ return PTX_dev2host (dst, src, n);
+}
+
+void *
+device_host2dev (void *dst, const void *src, size_t n)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%p, %p, %zu)\n", __FILE__,
+ __FUNCTION__, dst, src, n);
+#endif
+
+ return PTX_host2dev (dst, src, n);
+}
+
+void (*device_run) (void *fn_ptr, void *vars) = NULL;
+
+void
+openacc_parallel (void (*fn) (void *), size_t mapnum, void **hostaddrs,
+ void **devaddrs, size_t *sizes, unsigned short *kinds,
+ int num_gangs, int num_workers, int vector_length,
+ int async, void *targ_mem_desc)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%p, %zu, %p, %p, %p, %d, %d, %d, "
+ "%d, %p)\n", __FILE__, __FUNCTION__, fn, mapnum, hostaddrs, sizes,
+ kinds, num_gangs, num_workers, vector_length, async, targ_mem_desc);
+#endif
+
+ PTX_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
+ num_workers, vector_length, async, targ_mem_desc);
+}
+
+void *
+openacc_open_device (int n)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__, n);
+#endif
+ return PTX_open_device (n);
+}
+
+int
+openacc_close_device (void *h)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__, h);
+#endif
+ return PTX_close_device (h);
+}
+
+void
+openacc_set_device_num (int n)
+{
+ assert (n >= 0);
+
+ if (!PTX_dev || PTX_dev->ord != n)
+ (void) PTX_open_device (n);
+}
+
+/* This can be called before the device is "opened" for the current thread, in
+ which case we can't tell which device number should be returned. We don't
+ actually want to open the device here, so just return -1 and let the caller
+ (oacc-init.c:acc_get_device_num) handle it. */
+
+int
+openacc_get_device_num (void)
+{
+ if (PTX_dev)
+ return PTX_dev->ord;
+ else
+ return -1;
+}
+
+bool
+openacc_avail (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+ return PTX_avail ();
+}
+
+int
+openacc_async_test (int async)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
+ async);
+#endif
+ return PTX_async_test (async);
+}
+
+int
+openacc_async_test_all (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+ return PTX_async_test_all ();
+}
+
+void
+openacc_async_wait (int async)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
+ async);
+#endif
+ PTX_wait (async);
+}
+
+void
+openacc_async_wait_async (int async1, int async2)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%d, %d)\n", __FILE__, __FUNCTION__,
+ async1, async2);
+#endif
+ PTX_wait_async (async1, async2);
+}
+
+void
+openacc_async_wait_all (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+ PTX_wait_all ();
+}
+
+void
+openacc_async_wait_all_async (int async)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
+ async);
+#endif
+ PTX_wait_all_async (async);
+}
+
+void
+openacc_async_set_async (int async)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
+ async);
+#endif
+ PTX_set_async (async);
+}
+
+void *
+openacc_get_current_cuda_device (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+ return PTX_get_current_cuda_device ();
+}
+
+void *
+openacc_get_current_cuda_context (void)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
+#endif
+ return PTX_get_current_cuda_context ();
+}
+
+/* NOTE: This returns a CUstream, not a PTX_stream pointer. */
+
+void *
+openacc_get_cuda_stream (int async)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
+ async);
+#endif
+ return PTX_get_cuda_stream (async);
+}
+
+/* NOTE: This takes a CUstream, not a PTX_stream pointer. */
+
+int
+openacc_set_cuda_stream (int async, void *stream)
+{
+#ifdef DEBUG
+ fprintf (stderr, "libgomp plugin: %s:%s (%d, %p)\n", __FILE__, __FUNCTION__,
+ async, stream);
+#endif
+ return PTX_set_cuda_stream (async, stream);
+}
new file mode 100644
@@ -0,0 +1,224 @@
+/* A splay-tree datatype.
+ Copyright 1998-2013
+ Free Software Foundation, Inc.
+ Contributed by Mark Mitchell (mark@markmitchell.com).
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The splay tree code copied from include/splay-tree.h and adjusted,
+ so that all the data lives directly in splay_tree_node_s structure
+ and no extra allocations are needed.
+
+ Files including this header should before including it add:
+typedef struct splay_tree_node_s *splay_tree_node;
+typedef struct splay_tree_s *splay_tree;
+typedef struct splay_tree_key_s *splay_tree_key;
+ define splay_tree_key_s structure, and define
+ splay_compare inline function. */
+
+/* For an easily readable description of splay-trees, see:
+
+ Lewis, Harry R. and Denenberg, Larry. Data Structures and Their
+ Algorithms. Harper-Collins, Inc. 1991.
+
+ The major feature of splay trees is that all basic tree operations
+ are amortized O(log n) time for a tree with n nodes. */
+
+#include "libgomp.h"
+#include "splay-tree.h"
+
+extern int splay_compare (splay_tree_key, splay_tree_key);
+
+/* Rotate the edge joining the left child N with its parent P. PP is the
+ grandparents' pointer to P. */
+
+static inline void
+rotate_left (splay_tree_node *pp, splay_tree_node p, splay_tree_node n)
+{
+ splay_tree_node tmp;
+ tmp = n->right;
+ n->right = p;
+ p->left = tmp;
+ *pp = n;
+}
+
+/* Rotate the edge joining the right child N with its parent P. PP is the
+ grandparents' pointer to P. */
+
+static inline void
+rotate_right (splay_tree_node *pp, splay_tree_node p, splay_tree_node n)
+{
+ splay_tree_node tmp;
+ tmp = n->left;
+ n->left = p;
+ p->right = tmp;
+ *pp = n;
+}
+
+/* Bottom up splay of KEY. */
+
+static void
+splay_tree_splay (splay_tree sp, splay_tree_key key)
+{
+ if (sp->root == NULL)
+ return;
+
+ do {
+ int cmp1, cmp2;
+ splay_tree_node n, c;
+
+ n = sp->root;
+ cmp1 = splay_compare (key, &n->key);
+
+ /* Found. */
+ if (cmp1 == 0)
+ return;
+
+ /* Left or right? If no child, then we're done. */
+ if (cmp1 < 0)
+ c = n->left;
+ else
+ c = n->right;
+ if (!c)
+ return;
+
+ /* Next one left or right? If found or no child, we're done
+ after one rotation. */
+ cmp2 = splay_compare (key, &c->key);
+ if (cmp2 == 0
+ || (cmp2 < 0 && !c->left)
+ || (cmp2 > 0 && !c->right))
+ {
+ if (cmp1 < 0)
+ rotate_left (&sp->root, n, c);
+ else
+ rotate_right (&sp->root, n, c);
+ return;
+ }
+
+ /* Now we have the four cases of double-rotation. */
+ if (cmp1 < 0 && cmp2 < 0)
+ {
+ rotate_left (&n->left, c, c->left);
+ rotate_left (&sp->root, n, n->left);
+ }
+ else if (cmp1 > 0 && cmp2 > 0)
+ {
+ rotate_right (&n->right, c, c->right);
+ rotate_right (&sp->root, n, n->right);
+ }
+ else if (cmp1 < 0 && cmp2 > 0)
+ {
+ rotate_right (&n->left, c, c->right);
+ rotate_left (&sp->root, n, n->left);
+ }
+ else if (cmp1 > 0 && cmp2 < 0)
+ {
+ rotate_left (&n->right, c, c->left);
+ rotate_right (&sp->root, n, n->right);
+ }
+ } while (1);
+}
+
+/* Insert a new NODE into SP. The NODE shouldn't exist in the tree. */
+
+attribute_hidden void
+splay_tree_insert (splay_tree sp, splay_tree_node node)
+{
+ int comparison = 0;
+
+ splay_tree_splay (sp, &node->key);
+
+ if (sp->root)
+ comparison = splay_compare (&sp->root->key, &node->key);
+
+ if (sp->root && comparison == 0)
+ gomp_fatal ("Duplicate node");
+ else
+ {
+ /* Insert it at the root. */
+ if (sp->root == NULL)
+ node->left = node->right = NULL;
+ else if (comparison < 0)
+ {
+ node->left = sp->root;
+ node->right = node->left->right;
+ node->left->right = NULL;
+ }
+ else
+ {
+ node->right = sp->root;
+ node->left = node->right->left;
+ node->right->left = NULL;
+ }
+
+ sp->root = node;
+ }
+}
+
+/* Remove node with KEY from SP. It is not an error if it did not exist. */
+
+attribute_hidden void
+splay_tree_remove (splay_tree sp, splay_tree_key key)
+{
+ splay_tree_splay (sp, key);
+
+ if (sp->root && splay_compare (&sp->root->key, key) == 0)
+ {
+ splay_tree_node left, right;
+
+ left = sp->root->left;
+ right = sp->root->right;
+
+ /* One of the children is now the root. Doesn't matter much
+ which, so long as we preserve the properties of the tree. */
+ if (left)
+ {
+ sp->root = left;
+
+ /* If there was a right child as well, hang it off the
+ right-most leaf of the left child. */
+ if (right)
+ {
+ while (left->right)
+ left = left->right;
+ left->right = right;
+ }
+ }
+ else
+ sp->root = right;
+ }
+}
+
+/* Lookup KEY in SP, returning NODE if present, and NULL
+ otherwise. */
+
+attribute_hidden splay_tree_key
+splay_tree_lookup (splay_tree sp, splay_tree_key key)
+{
+ splay_tree_splay (sp, key);
+
+ if (sp->root && splay_compare (&sp->root->key, key) == 0)
+ return &sp->root->key;
+ else
+ return NULL;
+}
@@ -43,6 +43,30 @@ typedef struct splay_tree_key_s *splay_tree_key;
The major feature of splay trees is that all basic tree operations
are amortized O(log n) time for a tree with n nodes. */
+#ifndef _SPLAY_TREE_H
+#define _SPLAY_TREE_H 1
+
+typedef struct splay_tree_node_s *splay_tree_node;
+typedef struct splay_tree_s *splay_tree;
+typedef struct splay_tree_key_s *splay_tree_key;
+
+struct splay_tree_key_s {
+ /* Address of the host object. */
+ uintptr_t host_start;
+ /* Address immediately after the host object. */
+ uintptr_t host_end;
+ /* Descriptor of the target memory. */
+ struct target_mem_desc *tgt;
+ /* Offset from tgt->tgt_start to the start of the target object. */
+ uintptr_t tgt_offset;
+ /* Reference count. */
+ uintptr_t refcount;
+ /* Asynchronous reference count. */
+ uintptr_t async_refcount;
+ /* True if data should be copied from device to host at the end. */
+ bool copy_from;
+};
+
/* The nodes in the splay tree. */
struct splay_tree_node_s {
struct splay_tree_key_s key;
@@ -56,177 +80,8 @@ struct splay_tree_s {
splay_tree_node root;
};
-/* Rotate the edge joining the left child N with its parent P. PP is the
- grandparents' pointer to P. */
-
-static inline void
-rotate_left (splay_tree_node *pp, splay_tree_node p, splay_tree_node n)
-{
- splay_tree_node tmp;
- tmp = n->right;
- n->right = p;
- p->left = tmp;
- *pp = n;
-}
-
-/* Rotate the edge joining the right child N with its parent P. PP is the
- grandparents' pointer to P. */
-
-static inline void
-rotate_right (splay_tree_node *pp, splay_tree_node p, splay_tree_node n)
-{
- splay_tree_node tmp;
- tmp = n->left;
- n->left = p;
- p->right = tmp;
- *pp = n;
-}
-
-/* Bottom up splay of KEY. */
-
-static void
-splay_tree_splay (splay_tree sp, splay_tree_key key)
-{
- if (sp->root == NULL)
- return;
-
- do {
- int cmp1, cmp2;
- splay_tree_node n, c;
-
- n = sp->root;
- cmp1 = splay_compare (key, &n->key);
-
- /* Found. */
- if (cmp1 == 0)
- return;
-
- /* Left or right? If no child, then we're done. */
- if (cmp1 < 0)
- c = n->left;
- else
- c = n->right;
- if (!c)
- return;
-
- /* Next one left or right? If found or no child, we're done
- after one rotation. */
- cmp2 = splay_compare (key, &c->key);
- if (cmp2 == 0
- || (cmp2 < 0 && !c->left)
- || (cmp2 > 0 && !c->right))
- {
- if (cmp1 < 0)
- rotate_left (&sp->root, n, c);
- else
- rotate_right (&sp->root, n, c);
- return;
- }
-
- /* Now we have the four cases of double-rotation. */
- if (cmp1 < 0 && cmp2 < 0)
- {
- rotate_left (&n->left, c, c->left);
- rotate_left (&sp->root, n, n->left);
- }
- else if (cmp1 > 0 && cmp2 > 0)
- {
- rotate_right (&n->right, c, c->right);
- rotate_right (&sp->root, n, n->right);
- }
- else if (cmp1 < 0 && cmp2 > 0)
- {
- rotate_right (&n->left, c, c->right);
- rotate_left (&sp->root, n, n->left);
- }
- else if (cmp1 > 0 && cmp2 < 0)
- {
- rotate_left (&n->right, c, c->left);
- rotate_right (&sp->root, n, n->right);
- }
- } while (1);
-}
-
-/* Insert a new NODE into SP. The NODE shouldn't exist in the tree. */
-
-static void
-splay_tree_insert (splay_tree sp, splay_tree_node node)
-{
- int comparison = 0;
-
- splay_tree_splay (sp, &node->key);
-
- if (sp->root)
- comparison = splay_compare (&sp->root->key, &node->key);
-
- if (sp->root && comparison == 0)
- abort ();
- else
- {
- /* Insert it at the root. */
- if (sp->root == NULL)
- node->left = node->right = NULL;
- else if (comparison < 0)
- {
- node->left = sp->root;
- node->right = node->left->right;
- node->left->right = NULL;
- }
- else
- {
- node->right = sp->root;
- node->left = node->right->left;
- node->right->left = NULL;
- }
-
- sp->root = node;
- }
-}
-
-/* Remove node with KEY from SP. It is not an error if it did not exist. */
-
-static void
-splay_tree_remove (splay_tree sp, splay_tree_key key)
-{
- splay_tree_splay (sp, key);
-
- if (sp->root && splay_compare (&sp->root->key, key) == 0)
- {
- splay_tree_node left, right;
-
- left = sp->root->left;
- right = sp->root->right;
-
- /* One of the children is now the root. Doesn't matter much
- which, so long as we preserve the properties of the tree. */
- if (left)
- {
- sp->root = left;
-
- /* If there was a right child as well, hang it off the
- right-most leaf of the left child. */
- if (right)
- {
- while (left->right)
- left = left->right;
- left->right = right;
- }
- }
- else
- sp->root = right;
- }
-}
-
-/* Lookup KEY in SP, returning NODE if present, and NULL
- otherwise. */
-
-static splay_tree_key
-splay_tree_lookup (splay_tree sp, splay_tree_key key)
-{
- splay_tree_splay (sp, key);
-
- if (sp->root && splay_compare (&sp->root->key, key) == 0)
- return &sp->root->key;
- else
- return NULL;
-}
+attribute_hidden splay_tree_key splay_tree_lookup (splay_tree, splay_tree_key);
+attribute_hidden void splay_tree_insert (splay_tree, splay_tree_node);
+attribute_hidden void splay_tree_remove (splay_tree, splay_tree_key);
+
+#endif /* _SPLAY_TREE_H */
@@ -26,10 +26,11 @@
creation and termination. */
#include "libgomp.h"
-#include <limits.h>
-#include <stdbool.h>
-#include <stdlib.h>
+#include "oacc-plugin.h"
+#include "gomp-constants.h"
#include <string.h>
+#include <stdio.h>
+#include <assert.h>
#ifdef PLUGIN_SUPPORT
# include <dlfcn.h>
@@ -40,54 +41,7 @@ static void gomp_target_init (void);
static pthread_once_t gomp_is_initialized = PTHREAD_ONCE_INIT;
-/* Forward declaration for a node in the tree. */
-typedef struct splay_tree_node_s *splay_tree_node;
-typedef struct splay_tree_s *splay_tree;
-typedef struct splay_tree_key_s *splay_tree_key;
-
-struct target_mem_desc {
- /* Reference count. */
- uintptr_t refcount;
- /* All the splay nodes allocated together. */
- splay_tree_node array;
- /* Start of the target region. */
- uintptr_t tgt_start;
- /* End of the targer region. */
- uintptr_t tgt_end;
- /* Handle to free. */
- void *to_free;
- /* Previous target_mem_desc. */
- struct target_mem_desc *prev;
- /* Number of items in following list. */
- size_t list_count;
-
- /* Corresponding target device descriptor. */
- struct gomp_device_descr *device_descr;
-
- /* List of splay keys to remove (or decrease refcount)
- at the end of region. */
- splay_tree_key list[];
-};
-
-struct splay_tree_key_s {
- /* Address of the host object. */
- uintptr_t host_start;
- /* Address immediately after the host object. */
- uintptr_t host_end;
- /* Descriptor of the target memory. */
- struct target_mem_desc *tgt;
- /* Offset from tgt->tgt_start to the start of the target object. */
- uintptr_t tgt_offset;
- /* Reference count. */
- uintptr_t refcount;
- /* True if data should be copied from device to host at the end. */
- bool copy_from;
-};
-
-enum target_type {
- TARGET_TYPE_HOST,
- TARGET_TYPE_INTEL_MIC
-};
+#include "splay-tree.h"
/* This structure describes an offload image.
It contains type of the target, pointer to host table descriptor, and pointer
@@ -112,7 +66,7 @@ static int num_devices;
/* The comparison function. */
-static int
+attribute_hidden int
splay_compare (splay_tree_key x, splay_tree_key y)
{
if (x->host_start == x->host_end
@@ -125,45 +79,7 @@ splay_compare (splay_tree_key x, splay_tree_key y)
return 0;
}
-#include "splay-tree.h"
-
-/* This structure describes accelerator device.
- It contains name of the corresponding libgomp plugin, function handlers for
- interaction with the device, ID-number of the device, and information about
- mapped memory. */
-struct gomp_device_descr
-{
- /* This is the ID number of device. It could be specified in DEVICE-clause of
- TARGET construct. */
- int id;
-
- /* This is the TYPE of device. */
- enum target_type type;
-
- /* Set to true when device is initialized. */
- bool is_initialized;
-
- /* Plugin file handler. */
- void *plugin_handle;
-
- /* Function handlers. */
- int (*get_type_func) (void);
- int (*get_num_devices_func) (void);
- void (*offload_register_func) (void *, void *);
- void (*device_init_func) (void);
- int (*device_get_table_func) (void *);
- void *(*device_alloc_func) (size_t);
- void (*device_free_func) (void *);
- void *(*device_dev2host_func) (void *, const void *, size_t);
- void *(*device_host2dev_func) (void *, const void *, size_t);
- void (*device_run_func) (void *, void *);
-
- /* Splay tree containing information about mapped memory regions. */
- struct splay_tree_s dev_splay_tree;
-
- /* Mutex for operating with the splay tree and other shared structures. */
- gomp_mutex_t dev_env_lock;
-};
+#include "target.h"
struct mapping_table {
uintptr_t host_start;
@@ -172,10 +88,16 @@ struct mapping_table {
uintptr_t tgt_end;
};
+attribute_hidden void
+gomp_init_targets_once (void)
+{
+ (void) pthread_once (&gomp_is_initialized, gomp_target_init);
+}
+
attribute_hidden int
gomp_get_num_devices (void)
{
- (void) pthread_once (&gomp_is_initialized, gomp_target_init);
+ gomp_init_targets_once ();
return num_devices;
}
@@ -194,6 +116,33 @@ resolve_device (int device_id)
return &devices[device_id];
}
+__attribute__((used)) static void
+dump_mappings (FILE *f, splay_tree_node node)
+{
+ int i;
+
+ splay_tree_key k = &node->key;
+
+ if (!k)
+ return;
+
+ fprintf (f, "key %p: host_start %p, host_end %p, tgt_offset %p, refcount %d, "
+ "copy_from %s\n", k, (void *) k->host_start,
+ (void *) k->host_end, (void *) k->tgt_offset, (int) k->refcount,
+ k->copy_from ? "true" : "false");
+ fprintf (f, "tgt->refcount %d, tgt->tgt_start %p, tgt->tgt_end %p, "
+ "tgt->to_free %p, tgt->prev %p, tgt->list_count %d, "
+ "tgt->device_descr %p\n", (int) k->tgt->refcount,
+ (void *) k->tgt->tgt_start, (void *) k->tgt->tgt_end,
+ k->tgt->to_free, k->tgt->prev, (int) k->tgt->list_count,
+ k->tgt->device_descr);
+
+ for (i = 0; i < k->tgt->list_count; i++)
+ fprintf (f, "item %d: %p\n", i, k->tgt->list[i]);
+
+ dump_mappings (f, node->left);
+ dump_mappings (f, node->right);
+}
/* Handle the case where splay_tree_lookup found oldn for newn.
Helper function of gomp_map_vars. */
@@ -211,18 +160,29 @@ gomp_map_vars_existing (splay_tree_key oldn, splay_tree_key newn,
oldn->refcount++;
}
-static struct target_mem_desc *
-gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
- void **hostaddrs, size_t *sizes, unsigned char *kinds,
- bool is_target)
+static int
+get_kind (bool is_openacc, void *kinds, int idx)
+{
+ return is_openacc ? ((unsigned short *) kinds)[idx]
+ : ((unsigned char *) kinds)[idx];
+}
+
+attribute_hidden struct target_mem_desc *
+gomp_map_vars (struct gomp_device_descr *devicep,
+ struct gomp_memory_mapping *mm, size_t mapnum,
+ void **hostaddrs, void **devaddrs, size_t *sizes,
+ void *kinds, bool is_openacc, bool is_target)
{
size_t i, tgt_align, tgt_size, not_found_cnt = 0;
+ const int rshift = is_openacc ? 8 : 3;
+ const int typemask = is_openacc ? 0xff : 0x7;
struct splay_tree_key_s cur_node;
struct target_mem_desc *tgt
= gomp_malloc (sizeof (*tgt) + sizeof (tgt->list[0]) * mapnum);
tgt->list_count = mapnum;
tgt->refcount = 1;
tgt->device_descr = devicep;
+ tgt->mem_map = mm;
if (mapnum == 0)
return tgt;
@@ -235,40 +195,41 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
tgt_align = align;
tgt_size = mapnum * sizeof (void *);
}
- gomp_mutex_lock (&devicep->dev_env_lock);
+ gomp_mutex_lock (&mm->lock);
for (i = 0; i < mapnum; i++)
{
+ int kind = get_kind (is_openacc, kinds, i);
if (hostaddrs[i] == NULL)
{
tgt->list[i] = NULL;
continue;
}
cur_node.host_start = (uintptr_t) hostaddrs[i];
- if ((kinds[i] & 7) != 4)
+ if (!GOMP_MAP_POINTER_P (kind & typemask))
cur_node.host_end = cur_node.host_start + sizes[i];
else
cur_node.host_end = cur_node.host_start + sizeof (void *);
- splay_tree_key n = splay_tree_lookup (&devicep->dev_splay_tree,
- &cur_node);
+ splay_tree_key n = splay_tree_lookup (&mm->splay_tree, &cur_node);
if (n)
{
tgt->list[i] = n;
- gomp_map_vars_existing (n, &cur_node, kinds[i]);
+ gomp_map_vars_existing (n, &cur_node, kind);
}
else
{
- size_t align = (size_t) 1 << (kinds[i] >> 3);
+ size_t align = (size_t) 1 << (kind >> rshift);
tgt->list[i] = NULL;
not_found_cnt++;
if (tgt_align < align)
tgt_align = align;
tgt_size = (tgt_size + align - 1) & ~(align - 1);
tgt_size += cur_node.host_end - cur_node.host_start;
- if ((kinds[i] & 7) == 5)
+ if ((kind & typemask) == GOMP_MAP_TO_PSET)
{
size_t j;
for (j = i + 1; j < mapnum; j++)
- if ((kinds[j] & 7) != 4)
+ if (!GOMP_MAP_POINTER_P (get_kind (is_openacc, kinds, j)
+ & typemask))
break;
else if ((uintptr_t) hostaddrs[j] < cur_node.host_start
|| ((uintptr_t) hostaddrs[j] + sizeof (void *)
@@ -283,7 +244,15 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
}
}
- if (not_found_cnt || is_target)
+ if (devaddrs)
+ {
+ if (mapnum != 1)
+ gomp_fatal ("unexpected aggregation");
+ tgt->to_free = devaddrs[0];
+ tgt->tgt_start = (uintptr_t) tgt->to_free;
+ tgt->tgt_end = tgt->tgt_start + sizes[0];
+ }
+ else if (not_found_cnt || is_target)
{
/* Allocate tgt_align aligned tgt_size block of memory. */
/* FIXME: Perhaps change interface to allocate properly aligned
@@ -293,11 +262,18 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
tgt->tgt_start = (tgt->tgt_start + tgt_align - 1) & ~(tgt_align - 1);
tgt->tgt_end = tgt->tgt_start + tgt_size;
}
+ else
+ {
+ tgt->to_free = NULL;
+ tgt->tgt_start = 0;
+ tgt->tgt_end = 0;
+ }
tgt_size = 0;
if (is_target)
tgt_size = mapnum * sizeof (void *);
+ tgt->array = NULL;
if (not_found_cnt)
{
tgt->array = gomp_malloc (not_found_cnt * sizeof (*tgt->array));
@@ -307,43 +283,51 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
for (i = 0; i < mapnum; i++)
if (tgt->list[i] == NULL)
{
+ int kind = get_kind (is_openacc, kinds, i);
if (hostaddrs[i] == NULL)
continue;
splay_tree_key k = &array->key;
k->host_start = (uintptr_t) hostaddrs[i];
- if ((kinds[i] & 7) != 4)
+ if (!GOMP_MAP_POINTER_P (kind & typemask))
k->host_end = k->host_start + sizes[i];
else
k->host_end = k->host_start + sizeof (void *);
- splay_tree_key n
- = splay_tree_lookup (&devicep->dev_splay_tree, k);
+ splay_tree_key n = splay_tree_lookup (&mm->splay_tree, k);
if (n)
{
tgt->list[i] = n;
- gomp_map_vars_existing (n, k, kinds[i]);
+ gomp_map_vars_existing (n, k, kind);
}
else
{
- size_t align = (size_t) 1 << (kinds[i] >> 3);
+ size_t align = (size_t) 1 << (kind >> rshift);
tgt->list[i] = k;
tgt_size = (tgt_size + align - 1) & ~(align - 1);
k->tgt = tgt;
k->tgt_offset = tgt_size;
tgt_size += k->host_end - k->host_start;
- if ((kinds[i] & 7) == 2 || (kinds[i] & 7) == 3)
- k->copy_from = true;
+ k->copy_from = GOMP_MAP_COPYFROM_P (kind & typemask)
+ || GOMP_MAP_TOFROM_P (kind & typemask);
k->refcount = 1;
+ k->async_refcount = 0;
tgt->refcount++;
array->left = NULL;
array->right = NULL;
- splay_tree_insert (&devicep->dev_splay_tree, array);
- switch (kinds[i] & 7)
+ splay_tree_insert (&mm->splay_tree, array);
+ switch (kind & typemask)
{
- case 0: /* ALLOC */
- case 2: /* FROM */
+ case GOMP_MAP_FORCE_ALLOC:
+ case GOMP_MAP_FORCE_FROM:
+ /* FIXME: No special handling (see comment in
+ oacc-parallel.c). */
+ case GOMP_MAP_ALLOC:
+ case GOMP_MAP_ALLOC_FROM:
break;
- case 1: /* TO */
- case 3: /* TOFROM */
+ case GOMP_MAP_FORCE_TO:
+ case GOMP_MAP_FORCE_TOFROM:
+ /* FIXME: No special handling, as above. */
+ case GOMP_MAP_ALLOC_TO:
+ case GOMP_MAP_ALLOC_TOFROM:
/* Copy from host to device memory. */
/* FIXME: Perhaps add some smarts, like if copying
several adjacent fields from host to target, use some
@@ -353,7 +337,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
(void *) k->host_start,
k->host_end - k->host_start);
break;
- case 4: /* POINTER */
+ case GOMP_MAP_POINTER:
cur_node.host_start
= (uintptr_t) *(void **) k->host_start;
if (cur_node.host_start == (uintptr_t) NULL)
@@ -370,19 +354,16 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
/* Add bias to the pointer value. */
cur_node.host_start += sizes[i];
cur_node.host_end = cur_node.host_start + 1;
- n = splay_tree_lookup (&devicep->dev_splay_tree,
- &cur_node);
+ n = splay_tree_lookup (&mm->splay_tree, &cur_node);
if (n == NULL)
{
/* Could be possibly zero size array section. */
cur_node.host_end--;
- n = splay_tree_lookup (&devicep->dev_splay_tree,
- &cur_node);
+ n = splay_tree_lookup (&mm->splay_tree, &cur_node);
if (n == NULL)
{
cur_node.host_start--;
- n = splay_tree_lookup (&devicep->dev_splay_tree,
- &cur_node);
+ n = splay_tree_lookup (&mm->splay_tree, &cur_node);
cur_node.host_start++;
}
}
@@ -403,7 +384,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
(void *) &cur_node.tgt_offset,
sizeof (void *));
break;
- case 5: /* TO_PSET */
+ case GOMP_MAP_TO_PSET:
/* Copy from host to device memory. */
/* FIXME: see above FIXME comment. */
devicep->device_host2dev_func
@@ -411,7 +392,8 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
(void *) k->host_start,
(k->host_end - k->host_start));
for (j = i + 1; j < mapnum; j++)
- if ((kinds[j] & 7) != 4)
+ if (!GOMP_MAP_POINTER_P (get_kind (is_openacc, kinds, j)
+ & typemask))
break;
else if ((uintptr_t) hostaddrs[j] < k->host_start
|| ((uintptr_t) hostaddrs[j] + sizeof (void *)
@@ -440,19 +422,18 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
/* Add bias to the pointer value. */
cur_node.host_start += sizes[j];
cur_node.host_end = cur_node.host_start + 1;
- n = splay_tree_lookup (&devicep->dev_splay_tree,
- &cur_node);
+ n = splay_tree_lookup (&mm->splay_tree, &cur_node);
if (n == NULL)
{
/* Could be possibly zero size array section. */
cur_node.host_end--;
- n = splay_tree_lookup (&devicep->dev_splay_tree,
+ n = splay_tree_lookup (&mm->splay_tree,
&cur_node);
if (n == NULL)
{
cur_node.host_start--;
- n = splay_tree_lookup
- (&devicep->dev_splay_tree, &cur_node);
+ n = splay_tree_lookup (&mm->splay_tree,
+ &cur_node);
cur_node.host_start++;
}
}
@@ -478,6 +459,31 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
i++;
}
break;
+ case GOMP_MAP_FORCE_PRESENT:
+ {
+ /* We already looked up the memory region above and it
+ was missing. */
+ size_t size = k->host_end - k->host_start;
+ gomp_fatal ("present clause: !acc_is_present (%p, "
+ "%zd (0x%zx))", (void *) k->host_start,
+ size, size);
+ }
+ break;
+ case GOMP_MAP_FORCE_DEVICEPTR:
+ assert (k->host_end - k->host_start == sizeof (void *));
+
+ devicep->device_host2dev_func
+ ((void *) (tgt->tgt_start + k->tgt_offset),
+ (void *) k->host_start,
+ sizeof (void *));
+ break;
+ case GOMP_MAP_FORCE_PRIVATE:
+ abort ();
+ case GOMP_MAP_FORCE_FIRSTPRIVATE:
+ abort ();
+ default:
+ gomp_fatal ("%s: unhandled kind 0x%.2x", __FUNCTION__,
+ kind);
}
array++;
}
@@ -501,7 +507,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
}
}
- gomp_mutex_unlock (&devicep->dev_env_lock);
+ gomp_mutex_unlock (&mm->lock);
return tgt;
}
@@ -516,10 +522,52 @@ gomp_unmap_tgt (struct target_mem_desc *tgt)
free (tgt);
}
-static void
-gomp_unmap_vars (struct target_mem_desc *tgt)
+/* Decrease the refcount for a set of mapped variables, and queue asychronous
+ copies from the device back to the host after any work that has been issued.
+ Because the regions are still "live", increment an asynchronous reference
+ count to indicate that they should not be unmapped from host-side data
+ structures until the asynchronous copy has completed. */
+
+attribute_hidden void
+gomp_copy_from_async (struct target_mem_desc *tgt)
{
struct gomp_device_descr *devicep = tgt->device_descr;
+ struct gomp_memory_mapping *mm = tgt->mem_map;
+ size_t i;
+
+ gomp_mutex_lock (&mm->lock);
+
+ for (i = 0; i < tgt->list_count; i++)
+ if (tgt->list[i] == NULL)
+ ;
+ else if (tgt->list[i]->refcount > 1)
+ {
+ tgt->list[i]->refcount--;
+ tgt->list[i]->async_refcount++;
+ }
+ else
+ {
+ splay_tree_key k = tgt->list[i];
+ if (k->copy_from)
+ /* Copy from device to host memory. */
+ devicep->device_dev2host_func
+ ((void *) k->host_start,
+ (void *) (k->tgt->tgt_start + k->tgt_offset),
+ k->host_end - k->host_start);
+ }
+
+ gomp_mutex_unlock (&mm->lock);
+}
+
+/* Unmap variables described by TGT. If DO_COPYFROM is true, copy relevant
+ variables back from device to host: if it is false, it is assumed that this
+ has been done already, i.e. by gomp_copy_from_async above. */
+
+attribute_hidden void
+gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
+{
+ struct gomp_device_descr *devicep = tgt->device_descr;
+ struct gomp_memory_mapping *mm = tgt->mem_map;
if (tgt->list_count == 0)
{
@@ -528,22 +576,24 @@ gomp_unmap_vars (struct target_mem_desc *tgt)
}
size_t i;
- gomp_mutex_lock (&devicep->dev_env_lock);
+ gomp_mutex_lock (&mm->lock);
for (i = 0; i < tgt->list_count; i++)
if (tgt->list[i] == NULL)
;
else if (tgt->list[i]->refcount > 1)
tgt->list[i]->refcount--;
+ else if (tgt->list[i]->async_refcount > 0)
+ tgt->list[i]->async_refcount--;
else
{
splay_tree_key k = tgt->list[i];
- if (k->copy_from)
+ if (k->copy_from && do_copyfrom)
/* Copy from device to host memory. */
devicep->device_dev2host_func
((void *) k->host_start,
(void *) (k->tgt->tgt_start + k->tgt_offset),
k->host_end - k->host_start);
- splay_tree_remove (&devicep->dev_splay_tree, k);
+ splay_tree_remove (&mm->splay_tree, k);
if (k->tgt->refcount > 1)
k->tgt->refcount--;
else
@@ -554,15 +604,17 @@ gomp_unmap_vars (struct target_mem_desc *tgt)
tgt->refcount--;
else
gomp_unmap_tgt (tgt);
- gomp_mutex_unlock (&devicep->dev_env_lock);
+ gomp_mutex_unlock (&mm->lock);
}
static void
-gomp_update (struct gomp_device_descr *devicep, size_t mapnum,
- void **hostaddrs, size_t *sizes, unsigned char *kinds)
+gomp_update (struct gomp_device_descr *devicep, struct gomp_memory_mapping *mm,
+ size_t mapnum, void **hostaddrs, size_t *sizes, void *kinds,
+ bool is_openacc)
{
size_t i;
struct splay_tree_key_s cur_node;
+ const int typemask = is_openacc ? 0xff : 0x7;
if (!devicep)
return;
@@ -570,16 +622,17 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum,
if (mapnum == 0)
return;
- gomp_mutex_lock (&devicep->dev_env_lock);
+ gomp_mutex_lock (&mm->lock);
for (i = 0; i < mapnum; i++)
if (sizes[i])
{
cur_node.host_start = (uintptr_t) hostaddrs[i];
cur_node.host_end = cur_node.host_start + sizes[i];
- splay_tree_key n = splay_tree_lookup (&devicep->dev_splay_tree,
+ splay_tree_key n = splay_tree_lookup (&mm->splay_tree,
&cur_node);
if (n)
{
+ int kind = get_kind (is_openacc, kinds, i);
if (n->host_start > cur_node.host_start
|| n->host_end < cur_node.host_end)
gomp_fatal ("Trying to update [%p..%p) object when"
@@ -588,7 +641,7 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum,
(void *) cur_node.host_end,
(void *) n->host_start,
(void *) n->host_end);
- if ((kinds[i] & 7) == 1)
+ if (GOMP_MAP_COPYTO_P (kind & typemask))
/* Copy from host to device memory. */
devicep->device_host2dev_func
((void *) (n->tgt->tgt_start
@@ -597,7 +650,7 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum,
- n->host_start),
(void *) cur_node.host_start,
cur_node.host_end - cur_node.host_start);
- else if ((kinds[i] & 7) == 2)
+ else if (GOMP_MAP_COPYFROM_P (kind & typemask))
/* Copy from device to host memory. */
devicep->device_dev2host_func
((void *) cur_node.host_start,
@@ -612,20 +665,25 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum,
(void *) cur_node.host_start,
(void *) cur_node.host_end);
}
- gomp_mutex_unlock (&devicep->dev_env_lock);
+ gomp_mutex_unlock (&mm->lock);
}
+static void gomp_register_image_for_device (struct gomp_device_descr *device,
+ struct offload_image_descr *image);
/* This function should be called from every offload image. It gets the
descriptor of the host func and var tables HOST_TABLE, TYPE of the target,
and TARGET_DATA needed by target plugin (target tables, etc.) */
void
-GOMP_offload_register (void *host_table, int type, void *target_data)
+GOMP_offload_register (void *host_table, int type, void **target_data)
{
offload_images = gomp_realloc (offload_images,
(num_offload_images + 1)
* sizeof (struct offload_image_descr));
+ if (offload_images == NULL)
+ return;
+
offload_images[num_offload_images].type = type;
offload_images[num_offload_images].host_table = host_table;
offload_images[num_offload_images].target_data = target_data;
@@ -633,18 +691,20 @@ GOMP_offload_register (void *host_table, int type, void *target_data)
num_offload_images++;
}
-static void
+attribute_hidden void
gomp_init_device (struct gomp_device_descr *devicep)
{
+ int i;
+
/* Initialize the target device. */
devicep->device_init_func ();
/* Get address mapping table for device. */
struct mapping_table *table = NULL;
int num_entries = devicep->device_get_table_func (&table);
+ struct gomp_memory_mapping *mm = &devicep->mem_map;
/* Insert host-target address mapping into dev_splay_tree. */
- int i;
for (i = 0; i < num_entries; i++)
{
struct target_mem_desc *tgt = gomp_malloc (sizeof (*tgt));
@@ -663,13 +723,32 @@ gomp_init_device (struct gomp_device_descr *devicep)
k->tgt = tgt;
node->left = NULL;
node->right = NULL;
- splay_tree_insert (&devicep->dev_splay_tree, node);
+ splay_tree_insert (&mm->splay_tree, node);
}
free (table);
devicep->is_initialized = true;
}
+attribute_hidden void
+gomp_fini_device (struct gomp_device_descr *devicep)
+{
+ struct gomp_memory_mapping *mm = &devicep->mem_map;
+
+ if (devicep->is_initialized)
+ devicep->device_fini_func ();
+
+ while (mm->splay_tree.root)
+ {
+ struct target_mem_desc *tgt = mm->splay_tree.root->key.tgt;
+ free (tgt->array);
+ free (tgt);
+ splay_tree_remove (&mm->splay_tree, &mm->splay_tree.root->key);
+ }
+
+ devicep->is_initialized = false;
+}
+
/* Called when encountering a target directive. If DEVICE
is -1, it means use device-var ICV. If it is -2 (or any other value
larger than last available hw device, use host fallback.
@@ -686,7 +765,12 @@ GOMP_target (int device, void (*fn) (void *), const void *openmp_target,
unsigned char *kinds)
{
struct gomp_device_descr *devicep = resolve_device (device);
- if (devicep == NULL)
+ struct gomp_memory_mapping *mm = &devicep->mem_map;
+
+ if (devicep != NULL && !devicep->is_initialized)
+ gomp_init_device (devicep);
+
+ if (devicep == NULL || !(devicep->capabilities & TARGET_CAP_OPENMP_400))
{
/* Host fallback. */
struct gomp_thread old_thr, *thr = gomp_thread ();
@@ -703,24 +787,24 @@ GOMP_target (int device, void (*fn) (void *), const void *openmp_target,
return;
}
- if (!devicep->is_initialized)
- gomp_init_device (devicep);
-
struct splay_tree_key_s k;
k.host_start = (uintptr_t) fn;
k.host_end = k.host_start + 1;
- splay_tree_key tgt_fn = splay_tree_lookup (&devicep->dev_splay_tree, &k);
- if (tgt_fn == NULL && devicep->type != TARGET_TYPE_HOST)
+ gomp_mutex_lock (&mm->lock);
+ splay_tree_key tgt_fn = splay_tree_lookup (&devicep->mem_map.splay_tree, &k);
+ if (tgt_fn == NULL && !(devicep->capabilities & TARGET_CAP_NATIVE_EXEC))
gomp_fatal ("Target function wasn't mapped");
+ gomp_mutex_unlock (&mm->lock);
struct target_mem_desc *tgt_vars
- = gomp_map_vars (devicep, mapnum, hostaddrs, sizes, kinds, true);
- if (devicep->type == TARGET_TYPE_HOST)
+ = gomp_map_vars (devicep, &devicep->mem_map, mapnum, hostaddrs, NULL,
+ sizes, kinds, false, true);
+ if (devicep->capabilities & TARGET_CAP_NATIVE_EXEC)
devicep->device_run_func (fn, (void *) tgt_vars->tgt_start);
else
devicep->device_run_func ((void *) tgt_fn->tgt->tgt_start,
(void *) tgt_vars->tgt_start);
- gomp_unmap_vars (tgt_vars);
+ gomp_unmap_vars (tgt_vars, true);
}
void
@@ -728,7 +812,11 @@ GOMP_target_data (int device, const void *openmp_target, size_t mapnum,
void **hostaddrs, size_t *sizes, unsigned char *kinds)
{
struct gomp_device_descr *devicep = resolve_device (device);
- if (devicep == NULL)
+
+ if (devicep != NULL && !devicep->is_initialized)
+ gomp_init_device (devicep);
+
+ if (devicep == NULL || !(devicep->capabilities & TARGET_CAP_OPENMP_400))
{
/* Host fallback. */
struct gomp_task_icv *icv = gomp_icv (false);
@@ -739,18 +827,17 @@ GOMP_target_data (int device, const void *openmp_target, size_t mapnum,
new #pragma omp target data, otherwise GOMP_target_end_data
would get out of sync. */
struct target_mem_desc *tgt
- = gomp_map_vars (NULL, 0, NULL, NULL, NULL, false);
+ = gomp_map_vars (NULL, NULL, 0, NULL, NULL, NULL, NULL, false,
+ false);
tgt->prev = icv->target_data;
icv->target_data = tgt;
}
return;
}
- if (!devicep->is_initialized)
- gomp_init_device (devicep);
-
struct target_mem_desc *tgt
- = gomp_map_vars (devicep, mapnum, hostaddrs, sizes, kinds, false);
+ = gomp_map_vars (devicep, &devicep->mem_map, mapnum, hostaddrs, NULL, sizes,
+ kinds, false, false);
struct gomp_task_icv *icv = gomp_icv (true);
tgt->prev = icv->target_data;
icv->target_data = tgt;
@@ -764,7 +851,7 @@ GOMP_target_end_data (void)
{
struct target_mem_desc *tgt = icv->target_data;
icv->target_data = tgt->prev;
- gomp_unmap_vars (tgt);
+ gomp_unmap_vars (tgt, true);
}
}
@@ -773,13 +860,15 @@ GOMP_target_update (int device, const void *openmp_target, size_t mapnum,
void **hostaddrs, size_t *sizes, unsigned char *kinds)
{
struct gomp_device_descr *devicep = resolve_device (device);
- if (devicep == NULL)
- return;
- if (!devicep->is_initialized)
+ if (devicep != NULL && !devicep->is_initialized)
gomp_init_device (devicep);
- gomp_update (devicep, mapnum, hostaddrs, sizes, kinds);
+ if (devicep == NULL || !(devicep->capabilities & TARGET_CAP_OPENMP_400))
+ return;
+
+ gomp_update (devicep, &devicep->mem_map, mapnum, hostaddrs, sizes, kinds,
+ false);
}
void
@@ -822,7 +911,8 @@ static bool
gomp_load_plugin_for_device (struct gomp_device_descr *device,
const char *plugin_name)
{
- char *err = NULL;
+ char *err = NULL, *last_missing = NULL;
+ int optional_present, optional_total;
/* Clear any existing error. */
dlerror ();
@@ -845,40 +935,98 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
goto out; \
} \
while (0)
+ /* Similar, but missing functions are not an error. */
+#define DLSYM_OPT(f,n) \
+ do \
+ { \
+ char *tmp_err; \
+ device->f##_func = dlsym (device->plugin_handle, #n); \
+ tmp_err = dlerror (); \
+ if (tmp_err == NULL) \
+ optional_present++; \
+ else \
+ last_missing = #n; \
+ optional_total++; \
+ } \
+ while (0)
+
+ DLSYM (get_name);
+ DLSYM (get_caps);
DLSYM (get_type);
DLSYM (get_num_devices);
DLSYM (offload_register);
DLSYM (device_init);
+ DLSYM (device_fini);
DLSYM (device_get_table);
DLSYM (device_alloc);
DLSYM (device_free);
DLSYM (device_dev2host);
DLSYM (device_host2dev);
- DLSYM (device_run);
+ if (device->get_caps_func () & TARGET_CAP_OPENMP_400)
+ DLSYM (device_run);
+ if (device->get_caps_func () & TARGET_CAP_OPENACC_200)
+ {
+ optional_present = optional_total = 0;
+ DLSYM_OPT (openacc.exec, openacc_parallel);
+ DLSYM_OPT (openacc.open_device, openacc_open_device);
+ DLSYM_OPT (openacc.close_device, openacc_close_device);
+ DLSYM_OPT (openacc.get_device_num, openacc_get_device_num);
+ DLSYM_OPT (openacc.set_device_num, openacc_set_device_num);
+ DLSYM_OPT (openacc.avail, openacc_avail);
+ DLSYM_OPT (openacc.async_test, openacc_async_test);
+ DLSYM_OPT (openacc.async_test_all, openacc_async_test_all);
+ DLSYM_OPT (openacc.async_wait, openacc_async_wait);
+ DLSYM_OPT (openacc.async_wait_async, openacc_async_wait_async);
+ DLSYM_OPT (openacc.async_wait_all, openacc_async_wait_all);
+ DLSYM_OPT (openacc.async_wait_all_async, openacc_async_wait_all_async);
+ DLSYM_OPT (openacc.async_set_async, openacc_async_set_async);
+ /* Require all the OpenACC handlers if we have TARGET_CAP_OPENACC_200. */
+ if (optional_present != optional_total)
+ {
+ err = "plugin missing OpenACC handler function";
+ goto out;
+ }
+ optional_present = optional_total = 0;
+ DLSYM_OPT (openacc.cuda.get_current_device,
+ openacc_get_current_cuda_device);
+ DLSYM_OPT (openacc.cuda.get_current_context,
+ openacc_get_current_cuda_context);
+ DLSYM_OPT (openacc.cuda.get_stream, openacc_get_cuda_stream);
+ DLSYM_OPT (openacc.cuda.set_stream, openacc_set_cuda_stream);
+ /* Make sure all the CUDA functions are there if any of them are. */
+ if (optional_present && optional_present != optional_total)
+ {
+ err = "plugin missing OpenACC CUDA handler function";
+ goto out;
+ }
+ }
#undef DLSYM
+#undef DLSYM_OPT
out:
if (err != NULL)
{
gomp_error ("while loading %s: %s", plugin_name, err);
+ if (last_missing)
+ gomp_error ("missing function was %s", last_missing);
if (device->plugin_handle)
dlclose (device->plugin_handle);
}
return err == NULL;
}
-/* This function finds OFFLOAD_IMAGES corresponding to DEVICE type, and
- registers them in the plugin. */
+/* This function adds a compatible offload image IMAGE to an accelerator device
+ DEVICE. */
+
static void
-gomp_register_images_for_device (struct gomp_device_descr *device)
+gomp_register_image_for_device (struct gomp_device_descr *device,
+ struct offload_image_descr *image)
{
- int i;
- for (i = 0; i < num_offload_images; i++)
+ if (!device->offload_regions_registered
+ && (device->type == image->type || device->type == TARGET_TYPE_HOST))
{
- struct offload_image_descr *image = &offload_images[i];
-
- if (device->type == image->type || device->type == TARGET_TYPE_HOST)
- device->offload_register_func (image->host_table, image->target_data);
+ device->offload_register_func (image->host_table, image->target_data);
+ device->offload_regions_registered = true;
}
}
@@ -895,6 +1043,7 @@ gomp_find_available_plugins (void)
DIR *dir = NULL;
struct dirent *ent;
char plugin_name[PATH_MAX];
+ int i;
num_devices = 0;
devices = NULL;
@@ -909,7 +1058,7 @@ gomp_find_available_plugins (void)
while ((ent = readdir (dir)) != NULL)
{
- struct gomp_device_descr current_device;
+ struct gomp_device_descr current_device, *devicep;
if (!gomp_check_plugin_file_name (ent->d_name))
continue;
if (strlen (plugin_path) + 1 + strlen (ent->d_name) >= PATH_MAX)
@@ -919,7 +1068,7 @@ gomp_find_available_plugins (void)
strcat (plugin_name, ent->d_name);
if (!gomp_load_plugin_for_device (¤t_device, plugin_name))
continue;
- devices = realloc (devices, (num_devices + 1)
+ devices = gomp_realloc (devices, (num_devices + 1)
* sizeof (struct gomp_device_descr));
if (devices == NULL)
{
@@ -927,18 +1076,31 @@ gomp_find_available_plugins (void)
goto out;
}
- /* FIXME: Properly handle multiple devices of the same type. */
- if (current_device.get_num_devices_func () >= 1)
- {
- current_device.id = num_devices + 1;
- current_device.type = current_device.get_type_func ();
- current_device.is_initialized = false;
- current_device.dev_splay_tree.root = NULL;
- gomp_register_images_for_device (¤t_device);
- devices[num_devices] = current_device;
- gomp_mutex_init (&devices[num_devices].dev_env_lock);
- num_devices++;
- }
+ devices[num_devices] = current_device;
+ devicep = &devices[num_devices];
+
+ devicep->is_initialized = false;
+ devicep->offload_regions_registered = false;
+ devicep->mem_map.splay_tree.root = NULL;
+ devicep->type = devicep->get_type_func ();
+ devicep->name = devicep->get_name_func ();
+ devicep->capabilities = devicep->get_caps_func ();
+ gomp_mutex_init (&devicep->mem_map.lock);
+ devicep->id = ++num_devices;
+ }
+
+ for (i = 0; i < num_devices; i++)
+ {
+ int j;
+
+ for (j = 0; j < num_offload_images; j++)
+ gomp_register_image_for_device (&devices[i], &offload_images[j]);
+
+ /* The 'devices' array can be moved (by the realloc call) until we have
+ found all the plugins, so registering with the OpenACC runtime (which
+ takes a copy of the pointer argument) must be delayed until now. */
+ if (devices[i].capabilities & TARGET_CAP_OPENACC_200)
+ ACC_plugin_register (&devices[i]);
}
out:
new file mode 100644
@@ -0,0 +1,164 @@
+/* Copyright (C) 2013-2014 Free Software Foundation, Inc.
+ Contributed by Jakub Jelinek <jakub@redhat.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file handles the maintainence of threads in response to team
+ creation and termination. */
+
+#ifndef _TARGET_H
+#define _TARGET_H 1
+
+#include <stdarg.h>
+#include "splay-tree.h"
+#include "gomp-constants.h"
+
+struct target_mem_desc {
+ /* Reference count. */
+ uintptr_t refcount;
+ /* All the splay nodes allocated together. */
+ splay_tree_node array;
+ /* Start of the target region. */
+ uintptr_t tgt_start;
+ /* End of the targer region. */
+ uintptr_t tgt_end;
+ /* Handle to free. */
+ void *to_free;
+ /* Previous target_mem_desc. */
+ struct target_mem_desc *prev;
+ /* Number of items in following list. */
+ size_t list_count;
+
+ /* Corresponding target device descriptor. */
+ struct gomp_device_descr *device_descr;
+
+ /* Memory mapping info for the thread that created this descriptor. */
+ struct gomp_memory_mapping *mem_map;
+
+ /* List of splay keys to remove (or decrease refcount)
+ at the end of region. */
+ splay_tree_key list[];
+};
+
+/* Keep in sync with openacc.h:acc_device_t. */
+
+enum target_type {
+ TARGET_TYPE_HOST = GOMP_TARGET_HOST,
+ TARGET_TYPE_NONSHM_HOST = GOMP_TARGET_NONSHM_HOST,
+ TARGET_TYPE_NVIDIA_PTX = GOMP_TARGET_NVIDIA_PTX,
+ TARGET_TYPE_INTEL_MIC = GOMP_TARGET_INTEL_MIC,
+};
+
+#define TARGET_CAP_SHARED_MEM 1
+#define TARGET_CAP_NATIVE_EXEC 2
+#define TARGET_CAP_OPENMP_400 4
+#define TARGET_CAP_OPENACC_200 8
+
+/* Information about mapped memory regions (per device/context). */
+
+struct gomp_memory_mapping
+{
+ /* Splay tree containing information about mapped memory regions. */
+ struct splay_tree_s splay_tree;
+
+ /* Mutex for operating with the splay tree and other shared structures. */
+ gomp_mutex_t lock;
+};
+
+#include "oacc-int.h"
+
+static inline enum acc_device_t
+acc_device_type (enum target_type type)
+{
+ return (enum acc_device_t) type;
+}
+
+/* This structure describes accelerator device.
+ It contains name of the corresponding libgomp plugin, function handlers for
+ interaction with the device, ID-number of the device, and information about
+ mapped memory. */
+struct gomp_device_descr
+{
+ /* The name of the device. */
+ const char *name;
+
+ /* Capabilities of device (supports OpenACC, OpenMP). */
+ unsigned int capabilities;
+
+ /* This is the ID number of device. It could be specified in DEVICE-clause of
+ TARGET construct. */
+ int id;
+
+ /* This is the TYPE of device. */
+ enum target_type type;
+
+ /* Set to true when device is initialized. */
+ bool is_initialized;
+
+ /* True when offload regions have been registered with this device. */
+ bool offload_regions_registered;
+
+ /* Plugin file handler. */
+ void *plugin_handle;
+
+ /* Function handlers. */
+ const char *(*get_name_func) (void);
+ unsigned int (*get_caps_func) (void);
+ int (*get_type_func) (void);
+ int (*get_num_devices_func) (void);
+ void (*offload_register_func) (void *, void *);
+ int (*device_init_func) (void);
+ int (*device_fini_func) (void);
+ int (*device_get_table_func) (void *);
+ void *(*device_alloc_func) (size_t);
+ void (*device_free_func) (void *);
+ void *(*device_dev2host_func) (void *, const void *, size_t);
+ void *(*device_host2dev_func) (void *, const void *, size_t);
+ void (*device_run_func) (void *, void *);
+
+ /* OpenACC-specific functions. */
+ ACC_dispatch_t openacc;
+
+ /* Memory-mapping info (only for OpenMP -- mappings are stored per-thread
+ for OpenACC. It's not clear if that's a useful distinction). */
+ struct gomp_memory_mapping mem_map;
+};
+
+extern struct target_mem_desc *
+gomp_map_vars (struct gomp_device_descr *devicep,
+ struct gomp_memory_mapping *mm, size_t mapnum,
+ void **hostaddrs, void **devaddrs, size_t *sizes,
+ void *kinds, bool is_openacc, bool is_target);
+
+extern void
+gomp_copy_from_async (struct target_mem_desc *tgt);
+
+extern void
+gomp_unmap_vars (struct target_mem_desc *tgt, bool);
+
+extern attribute_hidden void
+gomp_init_device (struct gomp_device_descr *devicep);
+
+extern attribute_hidden void
+gomp_fini_device (struct gomp_device_descr *devicep);
+
+#endif /* _TARGET_H */
@@ -129,6 +129,10 @@ PACKAGE_URL = @PACKAGE_URL@
PACKAGE_VERSION = @PACKAGE_VERSION@
PATH_SEPARATOR = @PATH_SEPARATOR@
PERL = @PERL@
+PLUGIN_NVPTX = @PLUGIN_NVPTX@
+PLUGIN_NVPTX_CPPFLAGS = @PLUGIN_NVPTX_CPPFLAGS@
+PLUGIN_NVPTX_LDFLAGS = @PLUGIN_NVPTX_LDFLAGS@
+PLUGIN_NVPTX_LIBS = @PLUGIN_NVPTX_LIBS@
RANLIB = @RANLIB@
SECTION_LDFLAGS = @SECTION_LDFLAGS@
SED = @SED@