commit 898dba8e56827d7dde964e63f53c804c59674e9b
Author: Julian Brown <julian@codesourcery.com>
Date: Mon Oct 27 08:43:07 2014 -0700
TLS rework
@@ -28,6 +28,7 @@
#include "libgomp.h"
#include "libgomp_f.h"
#include "target.h"
+#include "oacc-int.h"
#include <ctype.h>
#include <stdlib.h>
#include <stdio.h>
@@ -50,8 +50,4 @@ extern void GOMP_PLUGIN_mutex_destroy (gomp_mutex_t *mutex);
extern void GOMP_PLUGIN_mutex_lock (gomp_mutex_t *mutex);
extern void GOMP_PLUGIN_mutex_unlock (gomp_mutex_t *mutex);
-/* target.c */
-
-extern void GOMP_PLUGIN_async_unmap_vars (void *ptr);
-
#endif
@@ -337,4 +337,5 @@ PLUGIN_1.0 {
GOMP_PLUGIN_mutex_lock;
GOMP_PLUGIN_mutex_unlock;
GOMP_PLUGIN_async_unmap_vars;
+ GOMP_PLUGIN_acc_thread;
};
@@ -29,6 +29,7 @@
#include "openacc.h"
#include "libgomp.h"
#include "target.h"
+#include "oacc-int.h"
int
acc_async_test (int async)
@@ -36,13 +37,13 @@ acc_async_test (int async)
if (async < acc_async_sync)
gomp_fatal ("invalid async argument: %d", async);
- return ACC_dev->openacc.async_test_func (async);
+ return base_dev->openacc.async_test_func (async);
}
int
acc_async_test_all (void)
{
- return ACC_dev->openacc.async_test_all_func ();
+ return base_dev->openacc.async_test_all_func ();
}
void
@@ -51,22 +52,19 @@ acc_wait (int async)
if (async < acc_async_sync)
gomp_fatal ("invalid async argument: %d", async);
- ACC_dev->openacc.async_wait_func (async);
- return;
+ base_dev->openacc.async_wait_func (async);
}
void
acc_wait_async (int async1, int async2)
{
- ACC_dev->openacc.async_wait_async_func (async1, async2);
- return;
+ base_dev->openacc.async_wait_async_func (async1, async2);
}
void
acc_wait_all (void)
{
- ACC_dev->openacc.async_wait_all_func ();
- return;
+ base_dev->openacc.async_wait_all_func ();
}
void
@@ -75,6 +73,5 @@ acc_wait_all_async (int async)
if (async < acc_async_sync)
gomp_fatal ("invalid async argument: %d", async);
- ACC_dev->openacc.async_wait_all_async_func (async);
- return;
+ base_dev->openacc.async_wait_all_async_func (async);
}
@@ -29,14 +29,15 @@
#include "config.h"
#include "libgomp.h"
#include "target.h"
+#include "oacc-int.h"
void *
acc_get_current_cuda_device (void)
{
void *p = NULL;
- if (ACC_dev && ACC_dev->openacc.cuda.get_current_device_func)
- p = ACC_dev->openacc.cuda.get_current_device_func ();
+ if (base_dev && base_dev->openacc.cuda.get_current_device_func)
+ p = base_dev->openacc.cuda.get_current_device_func ();
return p;
}
@@ -46,8 +47,8 @@ acc_get_current_cuda_context (void)
{
void *p = NULL;
- if (ACC_dev && ACC_dev->openacc.cuda.get_current_context_func)
- p = ACC_dev->openacc.cuda.get_current_context_func ();
+ if (base_dev && base_dev->openacc.cuda.get_current_context_func)
+ p = base_dev->openacc.cuda.get_current_context_func ();
return p;
}
@@ -60,8 +61,8 @@ acc_get_cuda_stream (int async)
if (async < 0)
return p;
- if (ACC_dev && ACC_dev->openacc.cuda.get_stream_func)
- p = ACC_dev->openacc.cuda.get_stream_func (async);
+ if (base_dev && base_dev->openacc.cuda.get_stream_func)
+ p = base_dev->openacc.cuda.get_stream_func (async);
return p;
}
@@ -73,9 +74,11 @@ acc_set_cuda_stream (int async, void *stream)
if (async < 0 || stream == NULL)
return 0;
+
+ ACC_lazy_initialize ();
- if (ACC_dev && ACC_dev->openacc.cuda.set_stream_func)
- s = ACC_dev->openacc.cuda.set_stream_func (async, stream);
+ if (base_dev && base_dev->openacc.cuda.set_stream_func)
+ s = base_dev->openacc.cuda.set_stream_func (async, stream);
return s;
}
@@ -35,6 +35,9 @@
#include "target.h"
#ifdef HOST_NONSHM_PLUGIN
#include "libgomp-plugin.h"
+#include "oacc-plugin.h"
+#else
+#include "oacc-int.h"
#endif
#include <stdint.h>
@@ -365,6 +368,17 @@ openacc_async_wait_all_async (int async __attribute__((unused)))
#endif
}
+STATIC void *
+openacc_create_thread_data (void *targ_data __attribute__((unused)))
+{
+ return NULL;
+}
+
+STATIC void
+openacc_destroy_thread_data (void *tls_data __attribute__((unused)))
+{
+}
+
#ifndef HOST_NONSHM_PLUGIN
static struct gomp_device_descr host_dispatch =
{
@@ -416,7 +430,10 @@ static struct gomp_device_descr host_dispatch =
.async_wait_async_func = openacc_async_wait_async,
.async_wait_all_func = openacc_async_wait_all,
.async_wait_all_async_func = openacc_async_wait_all_async,
-
+
+ .create_thread_data_func = openacc_create_thread_data,
+ .destroy_thread_data_func = openacc_destroy_thread_data,
+
.cuda = {
.get_current_device_func = NULL,
.get_current_context_func = NULL,
@@ -27,6 +27,7 @@
#include "libgomp.h"
#include "target.h"
+#include "oacc-int.h"
#include <assert.h>
#include <stdlib.h>
#include <strings.h>
@@ -34,38 +35,43 @@
#include <sys/queue.h>
#include <stdio.h>
-gomp_mutex_t acc_device_lock;
+static gomp_mutex_t acc_device_lock;
-/* Current dispatcher, and how it was initialized */
-static acc_device_t init_key = _ACC_device_hwm;
-
-/* The dispatch table for the current accelerator device. This is currently
- global, so you can only have one type of device open at any given time in a
- program. */
-struct gomp_device_descr const *ACC_dev;
+/* The dispatch table for the current accelerator device. This is global, so
+ you can only have one type of device open at any given time in a program.
+ This is the "base" device in that several devices that use the same
+ dispatch table may be active concurrently: this one (the "zeroth") is used
+ for overall initialisation/shutdown, and other instances -- not necessarily
+ including this one -- may be opened and closed once the base device has
+ been initialized. */
+struct gomp_device_descr const *base_dev;
-/* Handle for current thread. */
-__thread void *ACC_handle;
-static __thread int handle_num = -1;
+#ifdef HAVE_TLS
+__thread struct goacc_thread *goacc_tls_data;
+#else
+pthread_key_t goacc_tls_key;
+#endif
+static pthread_key_t goacc_cleanup_key;
-/* This context structure associates the handle for a physical device with
- memory-mapping information for that device, and is used to associate new
- host threads with previously-opened devices. Note that it's not directly
- connected with the CUDA "context" concept as used by the NVidia plugin. */
-struct ACC_context {
- struct memmap_t *ACC_memmap;
- void *ACC_handle;
- SLIST_ENTRY(ACC_context) next;
-};
+/* Current dispatcher, and how it was initialized */
+static acc_device_t init_key = _ACC_device_hwm;
-static SLIST_HEAD(_ACC_contexts, ACC_context) _ACC_contexts;
-static struct _ACC_contexts *ACC_contexts;
+static struct goacc_thread *goacc_threads;
+static gomp_mutex_t goacc_thread_lock;
+/* An array of dispatchers for device types, indexed by the type. This array
+ only references "base" devices, and other instances of the same type are
+ found by simply indexing from each such device (which are stored linearly,
+ grouped by device in target.c:devices). */
static struct gomp_device_descr const *dispatchers[_ACC_device_hwm] = { 0 };
void
ACC_register (struct gomp_device_descr const *disp)
{
+ /* Only register the 0th device here. */
+ if (disp->ord != 0)
+ return;
+
gomp_mutex_lock (&acc_device_lock);
assert (acc_device_type (disp->type) != acc_device_none
@@ -77,21 +83,6 @@ ACC_register (struct gomp_device_descr const *disp)
gomp_mutex_unlock (&acc_device_lock);
}
-static void
-close_handle (void)
-{
- if (ACC_memmap)
- {
- if (ACC_mem_close (ACC_handle, ACC_memmap))
- {
- if (ACC_dev->openacc.close_device_func (ACC_handle) < 0)
- gomp_fatal ("failed to close device");
- }
-
- ACC_memmap = 0;
- }
-}
-
static struct gomp_device_descr const *
resolve_device (acc_device_t d)
{
@@ -149,78 +140,135 @@ resolve_device (acc_device_t d)
return dispatchers[d];
}
+/* This is called when plugins have been initialized, and serves to call
+ (indirectly) the target's device_init hook. Calling multiple times without
+ an intervening _acc_shutdown call is an error. */
+
static struct gomp_device_descr const *
_acc_init (acc_device_t d)
{
struct gomp_device_descr const *acc_dev;
- if (ACC_dev)
- gomp_fatal ("device already active");
-
- init_key = d; /* We need to remember what we were intialized as, to
- check shutdown etc. */
-
acc_dev = resolve_device (d);
+
if (!acc_dev || !acc_dev->openacc.avail_func ())
gomp_fatal ("device %u not supported", (unsigned)d);
- if (!acc_dev->is_initialized)
- gomp_init_device ((struct gomp_device_descr *) acc_dev);
+ if (acc_dev->is_initialized)
+ gomp_fatal ("device already active");
+
+ /* We need to remember what we were initialized as, to check shutdown etc. */
+ init_key = d;
+
+ gomp_init_device ((struct gomp_device_descr *) acc_dev);
return acc_dev;
}
-/* Open the ORD'th device of the currently-active type (ACC_dev must be
+static struct goacc_thread *
+goacc_new_thread (void)
+{
+ struct goacc_thread *thr = gomp_malloc (sizeof (*thr));
+
+#ifdef HAVE_TLS
+ goacc_tls_data = thr;
+#else
+ pthread_setspecific (goacc_tls_key, thr);
+#endif
+ /* goacc_destroy_thread is this key's destructor; it unlinks and frees THR. */
+ pthread_setspecific (goacc_cleanup_key, thr);
+ /* Push THR onto the head of the global thread list. */
+ gomp_mutex_lock (&goacc_thread_lock);
+ thr->next = goacc_threads;
+ goacc_threads = thr;
+ gomp_mutex_unlock (&goacc_thread_lock);
+
+ return thr;
+}
+
+static void
+goacc_destroy_thread (void *data)
+{
+ struct goacc_thread *thr = data, *walk, *prev;
+
+ gomp_mutex_lock (&goacc_thread_lock);
+ /* The list lock is held for the whole teardown, including the plugin hook. */
+ if (thr)
+ {
+ if (base_dev && thr->target_tls)
+ {
+ base_dev->openacc.destroy_thread_data_func (thr->target_tls);
+ thr->target_tls = NULL;
+ }
+ /* All 'acc data' mappings must have been popped by now. */
+ assert (!thr->mapped_data);
+
+ /* Unlink THR from goacc_threads and free it. */
+ for (prev = NULL, walk = goacc_threads; walk;
+ prev = walk, walk = walk->next)
+ if (walk == thr)
+ {
+ if (prev == NULL)
+ goacc_threads = walk->next;
+ else
+ prev->next = walk->next;
+
+ free (thr);
+
+ break;
+ }
+ /* WALK is only NULL here if THR was never on the list. */
+ assert (walk);
+ }
+
+ gomp_mutex_unlock (&goacc_thread_lock);
+}
+
+/* Open the ORD'th device of the currently-active type (base_dev must be
initialised before calling). If ORD is < 0, open the default-numbered
device (set by the ACC_DEVICE_NUM environment variable or a call to
acc_set_device_num), or leave any currently-opened device as is. "Opening"
- consists of calling the device's open_device_func hook, and either creating
- a new memory mapping or associating a new thread with an existing such
- mapping (that matches ACC_handle, i.e. which corresponds to the same
- physical device). */
+ consists of calling the device's open_device_func hook, and setting up
+ thread-local data (maybe allocating, then initializing with information
+ pertaining to the newly-opened or previously-opened device). */
static void
lazy_open (int ord)
{
- struct ACC_context *acc_ctx;
+ struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev;
- if (ACC_memmap)
+ if (thr && thr->dev)
{
- assert (ord < 0 || ord == handle_num);
+ assert (ord < 0 || ord == thr->dev->ord);
return;
}
- assert (ACC_dev);
+ assert (base_dev);
if (ord < 0)
ord = goacc_device_num;
- ACC_handle = ACC_dev->openacc.open_device_func (ord);
- handle_num = ord;
-
- SLIST_FOREACH(acc_ctx, ACC_contexts, next)
- {
- if (acc_ctx->ACC_handle == ACC_handle)
- {
- ACC_memmap = acc_ctx->ACC_memmap;
- ACC_dev->openacc.async_set_async_func (acc_async_sync);
+ if (!thr)
+ thr = goacc_new_thread ();
+
+ acc_dev = thr->dev = (struct gomp_device_descr *) &base_dev[ord];
- return;
- }
- }
+ assert (acc_dev->ord == ord);
- ACC_memmap = ACC_mem_open (ACC_handle, NULL, handle_num);
+ thr->saved_bound_dev = NULL;
+ thr->mapped_data = NULL;
- ACC_dev->openacc.async_set_async_func (acc_async_sync);
+ if (!acc_dev->target_data)
+ acc_dev->target_data = acc_dev->openacc.open_device_func (ord);
- acc_ctx = gomp_malloc (sizeof (struct ACC_context));
- acc_ctx->ACC_handle = ACC_handle;
- acc_ctx->ACC_memmap = ACC_memmap;
+ thr->target_tls
+ = acc_dev->openacc.create_thread_data_func (acc_dev->target_data);
- if (!ACC_memmap->mem_map.is_initialized)
- gomp_init_tables (ACC_dev, &ACC_memmap->mem_map);
+ acc_dev->openacc.async_set_async_func (acc_async_sync);
- SLIST_INSERT_HEAD(ACC_contexts, acc_ctx, next);
+ if (!acc_dev->mem_map.is_initialized)
+ gomp_init_tables (acc_dev, &acc_dev->mem_map);
}
/* OpenACC 2.0a (3.2.12, 3.2.13) doesn't specify whether the serialization of
@@ -229,12 +277,12 @@ lazy_open (int ord)
void
acc_init (acc_device_t d)
{
- if (!ACC_dev)
+ if (!base_dev)
gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock);
- ACC_dev = _acc_init (d);
+ base_dev = _acc_init (d);
lazy_open (-1);
@@ -246,33 +294,52 @@ ialias (acc_init)
void
_acc_shutdown (acc_device_t d)
{
+ struct goacc_thread *walk;
+
/* We don't check whether d matches the actual device found, because
OpenACC 2.0 (3.2.12) says the parameters to the init and this
call must match (for the shutdown call anyway, it's silent on
others). */
- if (!ACC_dev)
+ if (!base_dev)
gomp_fatal ("no device initialized");
- if (init_key != d)
+ if (d != init_key)
gomp_fatal ("device %u(%u) is initialized",
- (unsigned)init_key, (unsigned)ACC_dev->type);
+ (unsigned) init_key, (unsigned) base_dev->type);
- close_handle ();
+ gomp_mutex_lock (&goacc_thread_lock);
- while (SLIST_FIRST(ACC_contexts) != NULL)
+ /* Free target-specific TLS data and close all devices. */
+ for (walk = goacc_threads; walk != NULL; walk = walk->next)
{
- struct ACC_context *c;
+ if (walk->target_tls)
+ base_dev->openacc.destroy_thread_data_func (walk->target_tls);
+
+ walk->target_tls = NULL;
- c = SLIST_FIRST(ACC_contexts);
- SLIST_REMOVE_HEAD(ACC_contexts, next);
- free (c);
+ /* This would mean the user is shutting down OpenACC in the middle of an
+ "acc data" pragma. Likely not intentional. */
+ if (walk->mapped_data)
+ gomp_fatal ("shutdown in 'acc data' region");
+
+ if (walk->dev)
+ {
+ if (walk->dev->openacc.close_device_func (walk->dev->target_data) < 0)
+ gomp_fatal ("failed to close device");
+
+ walk->dev->target_data = NULL;
+
+ gomp_free_memmap (walk->dev);
+
+ walk->dev = NULL;
+ }
}
- gomp_fini_device ((struct gomp_device_descr *) ACC_dev);
+ gomp_mutex_unlock (&goacc_thread_lock);
- ACC_dev = 0;
- ACC_handle = 0;
- handle_num = -1;
+ gomp_fini_device ((struct gomp_device_descr *) base_dev);
+
+ base_dev = NULL;
}
void
@@ -287,32 +354,42 @@ acc_shutdown (acc_device_t d)
ialias (acc_shutdown)
+/* This function is called after plugins have been initialized. It deals with
+ the "base" device, and is used to prepare the runtime for dealing with a
+ number of such devices (as implemented by some particular plugin). If the
+ argument device type D matches a previous call to the function, return the
+ current base device, else shut the old device down and re-initialize with
+ the new device type. */
+
static struct gomp_device_descr const *
lazy_init (acc_device_t d)
{
- if (ACC_dev)
+ if (base_dev)
{
/* Re-initializing the same device, do nothing. */
if (d == init_key)
- return ACC_dev;
+ return base_dev;
_acc_shutdown (init_key);
}
- assert (!ACC_dev);
+ assert (!base_dev);
return _acc_init (d);
}
+/* Ensure that plugins are loaded, initialize and open the (default-numbered)
+ device. */
+
static void
lazy_init_and_open (acc_device_t d)
{
- if (!ACC_dev)
+ if (!base_dev)
gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock);
- ACC_dev = lazy_init (d);
+ base_dev = lazy_init (d);
lazy_open (-1);
@@ -328,7 +405,7 @@ acc_get_num_devices (acc_device_t d)
if (d == acc_device_none)
return 0;
- if (!ACC_dev)
+ if (!base_dev)
gomp_init_targets_once ();
acc_dev = resolve_device (d);
@@ -358,8 +435,8 @@ acc_get_device_type (void)
acc_device_t res = acc_device_none;
const struct gomp_device_descr *dev;
- if (ACC_dev)
- res = acc_device_type (ACC_dev->type);
+ if (base_dev)
+ res = acc_device_type (base_dev->type);
else
{
gomp_init_targets_once ();
@@ -385,7 +462,7 @@ acc_get_device_num (acc_device_t d)
if (d >= _ACC_device_hwm)
gomp_fatal ("device %u out of range", (unsigned)d);
- if (!ACC_dev)
+ if (!base_dev)
gomp_init_targets_once ();
dev = resolve_device (d);
@@ -409,7 +486,7 @@ acc_set_device_num (int n, acc_device_t d)
const struct gomp_device_descr *dev;
int num_devices;
- if (!ACC_dev)
+ if (!base_dev)
gomp_init_targets_once ();
if ((int) d == 0)
@@ -431,17 +508,22 @@ acc_set_device_num (int n, acc_device_t d)
}
else
{
+ struct goacc_thread *thr = goacc_thread ();
+
gomp_mutex_lock (&acc_device_lock);
- ACC_dev = lazy_init (d);
+ base_dev = lazy_init (d);
- num_devices = ACC_dev->get_num_devices_func ();
+ num_devices = base_dev->get_num_devices_func ();
if (n >= num_devices)
gomp_fatal ("device %u out of range", n);
- if (n != handle_num)
- close_handle ();
+ /* If we're changing the device number, de-associate this thread from
+ the device (but don't close the device, since it may be in use by
+ other threads). */
+ if (thr && thr->dev && n != thr->dev->ord)
+ thr->dev = NULL;
lazy_open (n);
@@ -454,7 +536,10 @@ ialias (acc_set_device_num)
int
acc_on_device (acc_device_t dev)
{
- if (ACC_dev && acc_device_type (ACC_dev->type) == acc_device_host_nonshm)
+ struct goacc_thread *thr = goacc_thread ();
+
+ if (thr && thr->dev
+ && acc_device_type (thr->dev->type) == acc_device_host_nonshm)
return dev == acc_device_host_nonshm || dev == acc_device_not_host;
/* Just rely on the compiler builtin. */
@@ -467,28 +552,38 @@ ACC_runtime_initialize (void)
{
gomp_mutex_init (&acc_device_lock);
- ACC_contexts = &_ACC_contexts;
- SLIST_INIT (ACC_contexts);
+#ifndef HAVE_TLS
+ pthread_key_create (&goacc_tls_key, NULL);
+#endif
+
+ pthread_key_create (&goacc_cleanup_key, goacc_destroy_thread);
+
+ base_dev = NULL;
+
+ goacc_threads = NULL;
+ gomp_mutex_init (&goacc_thread_lock);
}
/* Compiler helper functions */
-static __thread struct gomp_device_descr const *saved_bound_dev;
-
void
ACC_save_and_set_bind (acc_device_t d)
{
- assert (!saved_bound_dev);
+ struct goacc_thread *thr = goacc_thread ();
+
+ assert (!thr->saved_bound_dev);
- saved_bound_dev = ACC_dev;
- ACC_dev = dispatchers[d];
+ thr->saved_bound_dev = thr->dev;
+ thr->dev = (struct gomp_device_descr *) dispatchers[d];
}
void
ACC_restore_bind (void)
{
- ACC_dev = saved_bound_dev;
- saved_bound_dev = NULL;
+ struct goacc_thread *thr = goacc_thread ();
+
+ thr->dev = thr->saved_bound_dev;
+ thr->saved_bound_dev = NULL;
}
/* This is called from any OpenACC support function that may need to implicitly
@@ -499,10 +594,12 @@ ACC_restore_bind (void)
void
ACC_lazy_initialize (void)
{
- if (ACC_dev && ACC_memmap)
+ struct goacc_thread *thr = goacc_thread ();
+
+ if (thr && thr->dev)
return;
- if (!ACC_dev)
+ if (!base_dev)
lazy_init_and_open (acc_device_default);
else
{
@@ -47,74 +47,52 @@
# pragma GCC visibility push(hidden)
#endif
-typedef struct ACC_dispatch_t
+static inline enum acc_device_t
+acc_device_type (enum target_type type)
{
- /* open or close a device instance. */
- void *(*open_device_func) (int n);
- int (*close_device_func) (void *h);
-
- /* set or get the device number. */
- int (*get_device_num_func) (void);
- void (*set_device_num_func) (int);
-
- /* availability */
- bool (*avail_func) (void);
-
- /* execute */
- void (*exec_func) (void (*) (void *), size_t, void **, void **, size_t *,
- unsigned short *, int, int, int, int, void *);
-
- /* async cleanup callback registration */
- void (*register_async_cleanup_func) (void *);
-
- /* asynchronous routines */
- int (*async_test_func) (int);
- int (*async_test_all_func) (void);
- void (*async_wait_func) (int);
- void (*async_wait_async_func) (int, int);
- void (*async_wait_all_func) (void);
- void (*async_wait_all_async_func) (int);
- void (*async_set_async_func) (int);
-
- /* NVIDIA target specific routines */
- struct {
- void *(*get_current_device_func) (void);
- void *(*get_current_context_func) (void);
- void *(*get_stream_func) (int);
- int (*set_stream_func) (int, void *);
- } cuda;
-} ACC_dispatch_t;
-
-typedef enum ACC_dispatch_f
- {
- ACC_unified_mem_f = 1 << 0,
- }
-ACC_dispatch_f;
+ return (enum acc_device_t) type;
+}
+
+struct goacc_thread
+{
+ /* The device for the current thread. */
+ struct gomp_device_descr *dev;
+ /* Device saved by ACC_save_and_set_bind; ACC_restore_bind puts it back. */
+ struct gomp_device_descr *saved_bound_dev;
+
+ /* This is a linked list of data mapped by the "acc data" pragma, following
+ strictly push/pop semantics according to lexical scope. */
+ struct target_mem_desc *mapped_data;
+
+ /* These structures form a list: this is the next thread in that list. */
+ struct goacc_thread *next;
+
+ /* Target-specific data (used by plugin). */
+ void *target_tls;
+};
+
+#ifdef HAVE_TLS
+extern __thread struct goacc_thread *goacc_tls_data;
+static inline struct goacc_thread *
+goacc_thread (void)
+{
+ return goacc_tls_data;
+}
+#else
+extern pthread_key_t goacc_tls_key;
+static inline struct goacc_thread *
+goacc_thread (void)
+{
+ return pthread_getspecific (goacc_tls_key);
+}
+#endif
struct gomp_device_descr;
void ACC_register (struct gomp_device_descr const *) __GOACC_NOTHROW;
-/* Memory routines. */
-struct memmap_t *ACC_mem_open (void *, struct memmap_t *, int) __GOACC_NOTHROW;
-bool ACC_mem_close (void *, struct memmap_t *) __GOACC_NOTHROW;
-struct gomp_device_descr *ACC_resolve_device(int) __GOACC_NOTHROW;
-
-/* Current dispatcher */
-extern struct gomp_device_descr const *ACC_dev;
-
-/* Device handle for current thread. */
-extern __thread void *ACC_handle;
-
-typedef struct memmap_t
-{
- unsigned live;
- struct target_mem_desc *tlist;
- struct gomp_memory_mapping mem_map;
-} memmap_t;
-
-/* Memory mapping */
-extern __thread struct memmap_t *ACC_memmap;
+/* Current dispatcher. */
+extern struct gomp_device_descr const *base_dev;
void ACC_runtime_initialize (void);
void ACC_save_and_set_bind (acc_device_t);
@@ -30,70 +30,20 @@
#include "libgomp.h"
#include "gomp-constants.h"
#include "target.h"
+#include "oacc-int.h"
#include <stdio.h>
#include <stdint.h>
+#include <assert.h>
#include "splay-tree.h"
-/* Although this pointer is local to each host thread, it points to a memmap_t
- that is stored per-context (different host threads may be associated with
- different contexts, and each context is associated with a physical
- device). */
-__thread struct memmap_t *ACC_memmap;
-
-memmap_t *
-ACC_mem_open (void *handle, memmap_t *src, int handle_num)
-{
- if (!src)
- {
- src = gomp_malloc (sizeof (*src));
- src->live = 0;
- src->mem_map.splay_tree.root = NULL;
- src->tlist = NULL;
- gomp_mutex_init (&src->mem_map.lock);
- src->mem_map.is_initialized = false;
- }
-
- src->live++;
-
- return src;
-}
-
-bool
-ACC_mem_close (void *handle, memmap_t *mm)
-{
- bool closed = 0;
-
- if (!--mm->live)
- {
- struct target_mem_desc *t;
-
- for (t = mm->tlist; t != NULL; t = t->prev)
- {
- ACC_dev->device_free_func (t->to_free);
-
- t->tgt_end = 0;
- t->to_free = 0;
-
- gomp_unmap_vars (t, true);
- }
-
- closed = 1;
- }
-
- gomp_mutex_destroy (&mm->mem_map.lock);
-
- return closed;
-}
-
/* Return block containing [H->S), or NULL if not contained. */
attribute_hidden splay_tree_key
-lookup_host (memmap_t *mm, void *h, size_t s)
+lookup_host (struct gomp_memory_mapping *mem_map, void *h, size_t s)
{
struct splay_tree_key_s node;
splay_tree_key key;
- struct gomp_memory_mapping *mem_map = &mm->mem_map;
node.host_start = (uintptr_t) h;
node.host_end = (uintptr_t) h + s;
@@ -113,25 +63,31 @@ lookup_host (memmap_t *mm, void *h, size_t s)
operation. */
static splay_tree_key
-lookup_dev (memmap_t *b, void *d, size_t s)
+lookup_dev (struct target_mem_desc *tgt, void *d, size_t s)
{
int i;
struct target_mem_desc *t;
+ struct gomp_memory_mapping *mem_map;
+
+ if (!tgt)
+ return NULL;
+
+ mem_map = tgt->mem_map;
- gomp_mutex_lock (&b->mem_map.lock);
+ gomp_mutex_lock (&mem_map->lock);
- for (t = b->tlist; t != NULL; t = t->prev)
+ for (t = tgt; t != NULL; t = t->prev)
{
if (t->tgt_start <= (uintptr_t) d && t->tgt_end >= (uintptr_t) d + s)
break;
}
- gomp_mutex_unlock (&b->mem_map.lock);
+ gomp_mutex_unlock (&mem_map->lock);
if (!t)
return NULL;
- for (i = 0; i < t->refcount; i++)
+ for (i = 0; i < t->list_count; i++)
{
void * offset;
@@ -156,7 +112,7 @@ acc_malloc (size_t s)
ACC_lazy_initialize ();
- return ACC_dev->device_alloc_func (s);
+ return base_dev->device_alloc_func (s);
}
/* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event
@@ -166,6 +122,7 @@ void
acc_free (void *d)
{
splay_tree_key k;
+ struct goacc_thread *thr = goacc_thread ();
if (!d)
return;
@@ -173,16 +130,16 @@ acc_free (void *d)
/* We don't have to call lazy open here, as the ptr value must have
been returned by acc_malloc. It's not permitted to pass NULL in
(unless you got that null from acc_malloc). */
- if ((k = lookup_dev (ACC_memmap, d, 1)))
+ if ((k = lookup_dev (thr->dev->openacc.data_environ, d, 1)))
{
void *offset;
offset = d - k->tgt->tgt_start + k->tgt_offset;
- acc_unmap_data((void *)(k->host_start + offset));
+ acc_unmap_data ((void *)(k->host_start + offset));
}
- ACC_dev->device_free_func (d);
+ base_dev->device_free_func (d);
}
void
@@ -190,7 +147,7 @@ acc_memcpy_to_device (void *d, void *h, size_t s)
{
/* No need to call lazy open here, as the device pointer must have
been obtained from a routine that did that. */
- ACC_dev->device_host2dev_func (d, h, s);
+ base_dev->device_host2dev_func (d, h, s);
}
void
@@ -198,7 +155,7 @@ acc_memcpy_from_device (void *h, void *d, size_t s)
{
/* No need to call lazy open here, as the device pointer must have
been obtained from a routine that did that. */
- ACC_dev->device_dev2host_func (h, d, s);
+ base_dev->device_dev2host_func (h, d, s);
}
/* Return the device pointer that corresponds to host data H. Or NULL
@@ -213,7 +170,9 @@ acc_deviceptr (void *h)
ACC_lazy_initialize ();
- n = lookup_host (ACC_memmap, h, 1);
+ struct goacc_thread *thr = goacc_thread ();
+
+ n = lookup_host (&thr->dev->mem_map, h, 1);
if (!n)
return NULL;
@@ -237,7 +196,9 @@ acc_hostptr (void *d)
ACC_lazy_initialize ();
- n = lookup_dev (ACC_memmap, d, 1);
+ struct goacc_thread *thr = goacc_thread ();
+
+ n = lookup_dev (thr->dev->openacc.data_environ, d, 1);
if (!n)
return NULL;
@@ -261,10 +222,14 @@ acc_is_present (void *h, size_t s)
ACC_lazy_initialize ();
- n = lookup_host (ACC_memmap, h, s);
+ struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev = thr->dev;
+
+ n = lookup_host (&acc_dev->mem_map, h, s);
- if (n && (((uintptr_t)h < n->host_start) ||
- ((uintptr_t)h + s > n->host_end) || (s > n->host_end - n->host_start)))
+ if (n && ((uintptr_t)h < n->host_start
+ || (uintptr_t)h + s > n->host_end
+ || s > n->host_end - n->host_start))
n = NULL;
return n != NULL;
@@ -284,7 +249,10 @@ acc_map_data (void *h, void *d, size_t s)
ACC_lazy_initialize ();
- if (ACC_dev->capabilities & TARGET_CAP_SHARED_MEM)
+ struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev = thr->dev;
+
+ if (acc_dev->capabilities & TARGET_CAP_SHARED_MEM)
{
if (d != h)
gomp_fatal ("cannot map data on shared-memory system");
@@ -293,35 +261,39 @@ acc_map_data (void *h, void *d, size_t s)
}
else
{
+ struct goacc_thread *thr = goacc_thread ();
+
if (!d || !h || !s)
gomp_fatal ("[%p,+%d]->[%p,+%d] is a bad map",
(void *)h, (int)s, (void *)d, (int)s);
- if (lookup_host (ACC_memmap, h, s))
+ if (lookup_host (&acc_dev->mem_map, h, s))
gomp_fatal ("host address [%p, +%d] is already mapped", (void *)h,
(int)s);
- if (lookup_dev (ACC_memmap, d, s))
+ if (lookup_dev (thr->dev->openacc.data_environ, d, s))
gomp_fatal ("device address [%p, +%d] is already mapped", (void *)d,
(int)s);
- tgt = gomp_map_vars ((struct gomp_device_descr *) ACC_dev,
- &ACC_memmap->mem_map, mapnum, &hostaddrs,
+ tgt = gomp_map_vars ((struct gomp_device_descr *) acc_dev,
+ &acc_dev->mem_map, mapnum, &hostaddrs,
&devaddrs, &sizes, &kinds, true, false);
}
- tgt->prev = ACC_memmap->tlist;
- ACC_memmap->tlist = tgt;
+ tgt->prev = acc_dev->openacc.data_environ;
+ acc_dev->openacc.data_environ = tgt;
}
void
acc_unmap_data (void *h)
{
- /* No need to call lazy open, as the address must have been mapped.
- */
+ struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev = thr->dev;
+
+ /* No need to call lazy open, as the address must have been mapped. */
size_t host_size;
- splay_tree_key n = lookup_host (ACC_memmap, h, 1);
+ splay_tree_key n = lookup_host (&acc_dev->mem_map, h, 1);
struct target_mem_desc *t;
if (!n)
@@ -331,7 +303,7 @@ acc_unmap_data (void *h)
if (n->host_start != (uintptr_t) h)
gomp_fatal ("[%p,%d] surrounds1 %p",
- (void *)n->host_start, (int)host_size, (void *)h);
+ (void *) n->host_start, (int) host_size, (void *) h);
t = n->tgt;
@@ -345,24 +317,23 @@ acc_unmap_data (void *h)
t->tgt_end = 0;
t->to_free = 0;
- gomp_mutex_lock (&ACC_memmap->mem_map.lock);
+ gomp_mutex_lock (&acc_dev->mem_map.lock);
- for (tp = NULL, t = ACC_memmap->tlist; t != NULL; tp = t, t = t->prev)
- {
- if (n->tgt == t)
- {
- if (tp)
- tp->prev = t->prev;
- else
- ACC_memmap->tlist = t->prev;
+ for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
+ tp = t, t = t->prev)
+ if (n->tgt == t)
+ {
+ if (tp)
+ tp->prev = t->prev;
+ else
+ acc_dev->openacc.data_environ = t->prev;
- break;
- }
- }
+ break;
+ }
- gomp_mutex_unlock (&ACC_memmap->mem_map.lock);
+ gomp_mutex_unlock (&acc_dev->mem_map.lock);
}
-
+
gomp_unmap_vars (t, true);
}
@@ -381,7 +352,10 @@ present_create_copy (unsigned f, void *h, size_t s)
ACC_lazy_initialize ();
- n = lookup_host (ACC_memmap, h, s);
+ struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev = thr->dev;
+
+ n = lookup_host (&acc_dev->mem_map, h, s);
if (n)
{
/* Present. */
@@ -409,13 +383,17 @@ present_create_copy (unsigned f, void *h, size_t s)
else
kinds = GOMP_MAP_ALLOC;
- tgt = gomp_map_vars ((struct gomp_device_descr *) ACC_dev,
- &ACC_memmap->mem_map, mapnum, &hostaddrs,
+ tgt = gomp_map_vars ((struct gomp_device_descr *) acc_dev,
+ &acc_dev->mem_map, mapnum, &hostaddrs,
NULL, &s, &kinds, true, false);
+ gomp_mutex_lock (&acc_dev->mem_map.lock);
+
d = tgt->to_free;
- tgt->prev = ACC_memmap->tlist;
- ACC_memmap->tlist = tgt;
+ tgt->prev = acc_dev->openacc.data_environ;
+ acc_dev->openacc.data_environ = tgt;
+
+ gomp_mutex_unlock (&acc_dev->mem_map.lock);
}
return d;
@@ -453,8 +431,10 @@ delete_copyout (unsigned f, void *h, size_t s)
size_t host_size;
splay_tree_key n;
void *d;
+ struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev = thr->dev;
- n = lookup_host (ACC_memmap, h, s);
+ n = lookup_host (&acc_dev->mem_map, h, s);
/* No need to call lazy open, as the data must already have been
mapped. */
@@ -468,14 +448,14 @@ delete_copyout (unsigned f, void *h, size_t s)
if (n->host_start != (uintptr_t) h || host_size != s)
gomp_fatal ("[%p,%d] surrounds2 [%p,+%d]",
- (void *)n->host_start, (int)host_size, (void *)h, (int)s);
+ (void *) n->host_start, (int) host_size, (void *) h, (int) s);
if (f & DC_Copyout)
- ACC_dev->device_dev2host_func (h, d, s);
+ acc_dev->device_dev2host_func (h, d, s);
- acc_unmap_data(h);
+ acc_unmap_data (h);
- ACC_dev->device_free_func (d);
+ acc_dev->device_free_func (d);
}
void
@@ -494,11 +474,10 @@ update_dev_host (int is_dev, void *h, size_t s)
{
splay_tree_key n;
void *d;
+ struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev = thr->dev;
- if (!ACC_memmap)
- gomp_fatal ("[%p,%d] is not mapped", h, (int)s);
-
- n = lookup_host (ACC_memmap, h, s);
+ n = lookup_host (&acc_dev->mem_map, h, s);
/* No need to call lazy open, as the data must already have been
mapped. */
@@ -509,10 +488,9 @@ update_dev_host (int is_dev, void *h, size_t s)
d = (void *) (n->tgt->tgt_start + n->tgt_offset);
if (is_dev)
- ACC_dev->device_host2dev_func (d, h, s);
+ acc_dev->device_host2dev_func (d, h, s);
else
- ACC_dev->device_dev2host_func (h, d, s);
-
+ acc_dev->device_dev2host_func (h, d, s);
}
void
@@ -30,43 +30,15 @@
#include "libgomp_g.h"
#include "gomp-constants.h"
#include "target.h"
+#include "oacc-int.h"
#include <stdio.h>
#include <string.h>
#include <stdarg.h>
#include <assert.h>
#include <alloca.h>
-#ifdef FUTURE
-// device geometry per device type
-struct devgeom
-{
- int gangs;
- int workers;
- int vectors;
-};
-
-
-// XXX: acceptable defaults?
-static __thread struct devgeom devgeom = { 1, 1, 1 };
-#endif
-
-#ifdef LATER
-static void
-dump_devaddrs(void)
-{
- int i;
- struct devaddr *dp;
-
- gomp_notify("++++ num_devaddrs %d\n", num_devaddrs);
- for (dp = devaddrs, i = 1; dp != 0; dp = dp->next, i++)
- {
- gomp_notify("++++ %.02d) %p\n", i, dp->d);
- }
-}
-#endif
-
static void
-dump_var(char *s, size_t idx, void *hostaddr, size_t size, unsigned char kind)
+dump_var (char *s, size_t idx, void *hostaddr, size_t size, unsigned char kind)
{
gomp_notify(" %2zi: %3s 0x%.2x -", idx, s, kind & 0xff);
@@ -108,6 +80,8 @@ dump_var(char *s, size_t idx, void *hostaddr, size_t size, unsigned char kind)
attribute_hidden void
select_acc_device (int device_type)
{
+ ACC_lazy_initialize ();
+
if (device_type == GOMP_IF_CLAUSE_FALSE)
return;
@@ -121,8 +95,6 @@ select_acc_device (int device_type)
know what they're doing... */
acc_set_device_type (device_type);
}
-
- ACC_lazy_initialize ();
}
void goacc_wait (int async, int num_waits, va_list ap);
@@ -136,6 +108,8 @@ GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target,
{
bool if_clause_condition_value = device != GOMP_IF_CLAUSE_FALSE;
va_list ap;
+ struct goacc_thread *thr;
+ struct gomp_device_descr *acc_dev;
struct target_mem_desc *tgt;
void **devaddrs;
unsigned int i;
@@ -155,6 +129,9 @@ GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target,
select_acc_device (device);
+ thr = goacc_thread ();
+ acc_dev = thr->dev;
+
/* Host fallback if "if" clause is false or if the current device is set to
the host. */
if (!if_clause_condition_value)
@@ -164,7 +141,7 @@ GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target,
ACC_restore_bind ();
return;
}
- else if (acc_device_type (ACC_dev->type) == acc_device_host)
+ else if (acc_device_type (acc_dev->type) == acc_device_host)
{
fn (hostaddrs);
return;
@@ -177,15 +154,15 @@ GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target,
va_end (ap);
- ACC_dev->openacc.async_set_async_func (async);
+ acc_dev->openacc.async_set_async_func (async);
- if (!(ACC_dev->capabilities & TARGET_CAP_NATIVE_EXEC))
+ if (!(acc_dev->capabilities & TARGET_CAP_NATIVE_EXEC))
{
k.host_start = (uintptr_t) fn;
k.host_end = k.host_start + 1;
- gomp_mutex_lock (&ACC_memmap->mem_map.lock);
- tgt_fn_key = splay_tree_lookup (&ACC_memmap->mem_map.splay_tree, &k);
- gomp_mutex_unlock (&ACC_memmap->mem_map.lock);
+ gomp_mutex_lock (&acc_dev->mem_map.lock);
+ tgt_fn_key = splay_tree_lookup (&acc_dev->mem_map.splay_tree, &k);
+ gomp_mutex_unlock (&acc_dev->mem_map.lock);
if (tgt_fn_key == NULL)
gomp_fatal ("target function wasn't mapped: perhaps -fopenacc was "
@@ -196,8 +173,8 @@ GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target,
else
tgt_fn = (void (*)) fn;
- tgt = gomp_map_vars ((struct gomp_device_descr *) ACC_dev,
- &ACC_memmap->mem_map, mapnum, hostaddrs,
+ tgt = gomp_map_vars ((struct gomp_device_descr *) acc_dev,
+ &acc_dev->mem_map, mapnum, hostaddrs,
NULL, sizes, kinds, true, false);
devaddrs = alloca (sizeof (void *) * mapnum);
@@ -205,7 +182,7 @@ GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target,
devaddrs[i] = (void *) (tgt->list[i]->tgt->tgt_start
+ tgt->list[i]->tgt_offset);
- ACC_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, sizes, kinds,
+ acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, sizes, kinds,
num_gangs, num_workers, vector_length, async,
tgt);
@@ -215,14 +192,12 @@ GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target,
else
{
gomp_copy_from_async (tgt);
- ACC_dev->openacc.register_async_cleanup_func (tgt);
+ acc_dev->openacc.register_async_cleanup_func (tgt);
}
- ACC_dev->openacc.async_set_async_func (acc_async_sync);
+ acc_dev->openacc.async_set_async_func (acc_async_sync);
}
-static __thread struct target_mem_desc *mapped_data = NULL;
-
void
GOACC_data_start (int device, const void *openmp_target, size_t mapnum,
void **hostaddrs, size_t *sizes, unsigned short *kinds)
@@ -235,33 +210,37 @@ GOACC_data_start (int device, const void *openmp_target, size_t mapnum,
select_acc_device (device);
+ struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev = thr->dev;
+
/* Host fallback or 'do nothing'. */
- if ((ACC_dev->capabilities & TARGET_CAP_SHARED_MEM)
+ if ((acc_dev->capabilities & TARGET_CAP_SHARED_MEM)
|| !if_clause_condition_value)
{
tgt = gomp_map_vars (NULL, NULL, 0, NULL, NULL, NULL, NULL, true, false);
- tgt->prev = mapped_data;
- mapped_data = tgt;
+ tgt->prev = thr->mapped_data;
+ thr->mapped_data = tgt;
return;
}
gomp_notify (" %s: prepare mappings\n", __FUNCTION__);
- tgt = gomp_map_vars ((struct gomp_device_descr *) ACC_dev,
- &ACC_memmap->mem_map, mapnum, hostaddrs,
+ tgt = gomp_map_vars ((struct gomp_device_descr *) acc_dev,
+ &acc_dev->mem_map, mapnum, hostaddrs,
NULL, sizes, kinds, true, false);
gomp_notify (" %s: mappings prepared\n", __FUNCTION__);
- tgt->prev = mapped_data;
- mapped_data = tgt;
+ tgt->prev = thr->mapped_data;
+ thr->mapped_data = tgt;
}
+/* End the innermost OpenACC data region of the calling thread: take the head
+ of the thread's mapped_data list (the list GOACC_data_start pushes onto
+ through the ->prev link), make its predecessor the new head, and unmap it
+ with gomp_unmap_vars. */
void
GOACC_data_end (void)
{
- struct target_mem_desc *tgt = mapped_data;
+ /* NOTE(review): neither THR nor TGT is checked for NULL here -- presumably
+ a matching GOACC_data_start always ran first on this thread; confirm. */
+ struct goacc_thread *thr = goacc_thread ();
+ struct target_mem_desc *tgt = thr->mapped_data;
gomp_notify (" %s: restore mappings\n", __FUNCTION__);
- mapped_data = tgt->prev;
+ thr->mapped_data = tgt->prev;
gomp_unmap_vars (tgt, true);
gomp_notify (" %s: mappings restored\n", __FUNCTION__);
}
@@ -296,6 +275,8 @@ GOACC_kernels (int device, void (*fn) (void *), const void *openmp_target,
void
goacc_wait (int async, int num_waits, va_list ap)
{
+ struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev = thr->dev;
int i;
assert (num_waits >= 0);
@@ -322,7 +303,7 @@ goacc_wait (int async, int num_waits, va_list ap)
if (async == acc_async_noval && num_waits == 0)
{
- ACC_dev->openacc.async_wait_all_async_func (acc_async_noval);
+ acc_dev->openacc.async_wait_all_async_func (acc_async_noval);
return;
}
@@ -337,7 +318,7 @@ goacc_wait (int async, int num_waits, va_list ap)
the queue itself will order work as required, so there's no need to
wait explicitly. */
if (qid != async)
- ACC_dev->openacc.async_wait_async_func (qid, async);
+ acc_dev->openacc.async_wait_async_func (qid, async);
}
}
@@ -351,7 +332,10 @@ GOACC_update (int device, const void *openmp_target, size_t mapnum,
select_acc_device (device);
- if ((ACC_dev->capabilities & TARGET_CAP_SHARED_MEM)
+ struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev = thr->dev;
+
+ if ((acc_dev->capabilities & TARGET_CAP_SHARED_MEM)
|| !if_clause_condition_value)
return;
@@ -366,34 +350,34 @@ GOACC_update (int device, const void *openmp_target, size_t mapnum,
va_end (ap);
}
- ACC_dev->openacc.async_set_async_func (async);
+ acc_dev->openacc.async_set_async_func (async);
for (i = 0; i < mapnum; ++i)
{
unsigned char kind = kinds[i] & 0xff;
- dump_var("UPD", i, hostaddrs[i], sizes[i], kinds[i]);
+ dump_var ("UPD", i, hostaddrs[i], sizes[i], kinds[i]);
switch (kind)
{
- case GOMP_MAP_POINTER:
- break;
+ case GOMP_MAP_POINTER:
+ break;
- case GOMP_MAP_FORCE_TO:
- acc_update_device (hostaddrs[i], sizes[i]);
- break;
+ case GOMP_MAP_FORCE_TO:
+ acc_update_device (hostaddrs[i], sizes[i]);
+ break;
- case GOMP_MAP_FORCE_FROM:
- acc_update_self (hostaddrs[i], sizes[i]);
- break;
+ case GOMP_MAP_FORCE_FROM:
+ acc_update_self (hostaddrs[i], sizes[i]);
+ break;
- default:
- gomp_fatal (">>>> GOACC_update UNHANDLED kind 0x%.2x", kind);
- break;
+ default:
+ gomp_fatal (">>>> GOACC_update UNHANDLED kind 0x%.2x", kind);
+ break;
}
}
- ACC_dev->openacc.async_set_async_func (acc_async_sync);
+ acc_dev->openacc.async_set_async_func (acc_async_sync);
}
void
@@ -28,13 +28,7 @@
#include "libgomp.h"
#include "oacc-plugin.h"
#include "target.h"
-
-void
-ACC_plugin_register (struct gomp_device_descr *device)
-{
- ACC_register (device);
-}
-
+#include "oacc-int.h"
void
GOMP_PLUGIN_async_unmap_vars (void *ptr)
@@ -43,3 +37,12 @@ GOMP_PLUGIN_async_unmap_vars (void *ptr)
gomp_unmap_vars (tgt, false);
}
+
+/* Return the target-specific part of the TLS data for the current thread,
+ i.e. the target_tls member of the thread's goacc_thread structure.
+ Returns NULL when goacc_thread () yields NULL for this thread, so callers
+ must be prepared for a NULL result. */
+
+void *
+GOMP_PLUGIN_acc_thread (void)
+{
+ struct goacc_thread *thr = goacc_thread ();
+ return thr ? thr->target_tls : NULL;
+}
@@ -26,8 +26,7 @@
#ifndef _OACC_PLUGIN_H
#define _OACC_PLUGIN_H 1
-#include "target.h"
-
-extern void ACC_plugin_register (struct gomp_device_descr *dev);
+extern void GOMP_PLUGIN_async_unmap_vars (void *ptr);
+extern void *GOMP_PLUGIN_acc_thread (void);
#endif
@@ -38,6 +38,7 @@
#include "libgomp.h"
#include "target.h"
#include "libgomp-plugin.h"
+#include "oacc-plugin.h"
#include <cuda.h>
#include <sys/queue.h>
@@ -154,8 +155,13 @@ struct PTX_stream
SLIST_HEAD(PTX_streams, PTX_stream);
-/* Each thread may select a stream (also specific to a device/context). */
-static __thread struct PTX_stream *current_stream;
+/* Thread-specific data for PTX. This replaces the former file-scope
+ __thread variables current_stream and PTX_dev. */
+
+struct nvptx_thread
+{
+ /* The stream this thread has selected (also specific to a
+ device/context). */
+ struct PTX_stream *current_stream;
+ /* The device instance this thread is attached to. */
+ struct PTX_device *ptx_dev;
+};
struct map
{
@@ -309,10 +315,6 @@ struct PTX_device
SLIST_ENTRY(PTX_device) next;
};
-static __thread struct PTX_device *PTX_dev;
-static SLIST_HEAD(_PTX_devices, PTX_device) _PTX_devices;
-static struct _PTX_devices *PTX_devices;
-
enum PTX_event_type
{
PTX_EVT_MEM,
@@ -402,6 +404,12 @@ verify_device_library (void)
return 0;
}
+/* Return the calling thread's PTX-specific data: the pointer handed back by
+ GOMP_PLUGIN_acc_thread (), viewed as a struct nvptx_thread. The result is
+ whatever that call returns, so it may be NULL. */
+static inline struct nvptx_thread *
+nvptx_thread (void)
+{
+ return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
+}
+
static void
init_streams_for_device (struct PTX_device *ptx_dev, int concurrency)
{
@@ -463,8 +471,9 @@ static struct PTX_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
CUstream existing)
{
+ struct nvptx_thread *nvthd = nvptx_thread ();
/* Local copy of TLS variable. */
- struct PTX_device *ptx_dev = PTX_dev;
+ struct PTX_device *ptx_dev = nvthd->ptx_dev;
struct PTX_stream *stream = NULL;
int orig_async = async;
@@ -593,10 +602,8 @@ PTX_init (void)
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuInit error: %s", cuErrorMsg (r));
- PTX_devices = &_PTX_devices;
PTX_events = &_PTX_events;
- SLIST_INIT(PTX_devices);
SLIST_INIT(PTX_events);
GOMP_PLUGIN_mutex_init (&PTX_event_lock);
@@ -617,122 +624,93 @@ PTX_fini (void)
+/* Open CUDA device number N and return a newly allocated struct PTX_device
+ describing it, as an opaque pointer.
+
+ The structure records the ordinal and CUdevice handle, then either adopts
+ the CUDA context that is already current on the calling thread (ctx_shared
+ is set to true) or creates a fresh one (ctx_shared stays false). It then
+ caches several device attributes and initializes the device's streams.
+
+ Every CUDA failure is fatal except the async-engine-count query, which
+ falls back to 1. Each call allocates a new structure; nothing here looks
+ up a previously opened device or stores the result in thread-local
+ state. */
static void *
PTX_open_device (int n)
{
+ struct PTX_device *ptx_dev;
CUdevice dev;
CUresult r;
int async_engines, pi;
- if (PTX_devices)
- {
- struct PTX_device *ptx_device;
-
- SLIST_FOREACH(ptx_device, PTX_devices, next)
- {
- if (ptx_device->ord == n)
- {
- PTX_dev = ptx_device;
-
- if (PTX_dev->ctx)
- {
- r = cuCtxPushCurrent (PTX_dev->ctx);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s",
- cuErrorMsg (r));
- }
-
- return (void *)PTX_dev;
- }
- }
- }
-
r = cuDeviceGet (&dev, n);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuErrorMsg (r));
- PTX_dev = GOMP_PLUGIN_malloc (sizeof (struct PTX_device));
- PTX_dev->ord = n;
- PTX_dev->dev = dev;
- PTX_dev->ctx_shared = false;
+ ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct PTX_device));
- SLIST_INSERT_HEAD(PTX_devices, PTX_dev, next);
+ ptx_dev->ord = n;
+ ptx_dev->dev = dev;
+ ptx_dev->ctx_shared = false;
- r = cuCtxGetCurrent (&PTX_dev->ctx);
+ /* Reuse the context already current on this thread, if there is one. */
+ r = cuCtxGetCurrent (&ptx_dev->ctx);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuErrorMsg (r));
- if (!PTX_dev->ctx)
+ if (!ptx_dev->ctx)
{
- r = cuCtxCreate (&PTX_dev->ctx, CU_CTX_SCHED_AUTO, dev);
+ r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuErrorMsg (r));
}
else
- {
- PTX_dev->ctx_shared = true;
- }
+ ptx_dev->ctx_shared = true;
r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));
- PTX_dev->overlap = pi;
+ ptx_dev->overlap = pi;
r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));
- PTX_dev->map = pi;
+ ptx_dev->map = pi;
r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));
- PTX_dev->concur = pi;
+ ptx_dev->concur = pi;
r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));
- PTX_dev->mode = pi;
+ ptx_dev->mode = pi;
r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));
- PTX_dev->mkern = pi;
+ ptx_dev->mkern = pi;
r = cuDeviceGetAttribute (&async_engines,
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
+ /* Unlike the queries above, a failure here is not fatal: assume a single
+ async engine. */
if (r != CUDA_SUCCESS)
async_engines = 1;
- init_streams_for_device (PTX_dev, async_engines);
-
- current_stream = PTX_dev->null_stream;
+ init_streams_for_device (ptx_dev, async_engines);
- return (void *)PTX_dev;
+ return (void *) ptx_dev;
}
+/* Close the device instance TARG_DATA, a struct PTX_device pointer such as
+ the one PTX_open_device returns. Tears down the device's streams,
+ destroys its CUDA context unless ctx_shared is set (i.e. unless the
+ context was adopted rather than created by PTX_open_device), and frees
+ the structure. A NULL TARG_DATA is a no-op. Always returns 0; a
+ cuCtxDestroy failure is fatal. */
static int
-PTX_close_device (void *h __attribute__((unused)))
+PTX_close_device (void *targ_data)
{
CUresult r;
+ struct PTX_device *ptx_dev = targ_data;
- if (!PTX_dev)
+ if (!ptx_dev)
return 0;
- fini_streams_for_device (PTX_dev);
+ fini_streams_for_device (ptx_dev);
- if (!PTX_dev->ctx_shared)
+ if (!ptx_dev->ctx_shared)
{
- r = cuCtxDestroy (PTX_dev->ctx);
+ r = cuCtxDestroy (ptx_dev->ctx);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuErrorMsg (r));
}
- SLIST_REMOVE(PTX_devices, PTX_dev, PTX_device, next);
- free (PTX_dev);
-
- PTX_dev = NULL;
+ free (ptx_dev);
return 0;
}
@@ -743,7 +721,12 @@ PTX_get_num_devices (void)
int n;
CUresult r;
- assert (PTX_inited);
+ /* This function will be called before the plugin has been initialized in
+ order to enumerate available devices, but CUDA API routines can't be used
+ until cuInit has been called. Just call it now (but don't yet do any
+ further initialization). */
+ if (!PTX_inited)
+ cuInit (0);
r = cuDeviceGetCount (&n);
if (r!= CUDA_SUCCESS)
@@ -921,6 +904,7 @@ static void
event_gc (bool memmap_lockable)
{
struct PTX_event *ptx_event;
+ struct nvptx_thread *nvthd = nvptx_thread ();
GOMP_PLUGIN_mutex_lock (&PTX_event_lock);
@@ -929,7 +913,7 @@ event_gc (bool memmap_lockable)
CUresult r;
struct PTX_event *next = SLIST_NEXT (ptx_event, next);
- if (ptx_event->ord != PTX_dev->ord)
+ if (ptx_event->ord != nvthd->ptx_dev->ord)
goto next_event;
r = cuEventQuery (*ptx_event->evt);
@@ -983,6 +967,7 @@ static void
event_add (enum PTX_event_type type, CUevent *e, void *h)
{
struct PTX_event *ptx_event;
+ struct nvptx_thread *nvthd = nvptx_thread ();
assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
|| type == PTX_EVT_ASYNC_CLEANUP);
@@ -991,7 +976,7 @@ event_add (enum PTX_event_type type, CUevent *e, void *h)
ptx_event->type = type;
ptx_event->evt = e;
ptx_event->addr = h;
- ptx_event->ord = PTX_dev->ord;
+ ptx_event->ord = nvthd->ptx_dev->ord;
GOMP_PLUGIN_mutex_lock (&PTX_event_lock);
@@ -1013,11 +998,12 @@ PTX_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
void *kargs[1];
void *hp, *dp;
unsigned int nthreads_in_block;
+ struct nvptx_thread *nvthd = nvptx_thread ();
function = targ_fn->fn;
dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
- assert (dev_str == current_stream);
+ assert (dev_str == nvthd->current_stream);
/* This reserves a chunk of a pre-allocated page of memory mapped on both
the host and the device. HP is a host pointer to the new chunk, and DP is
@@ -1152,6 +1138,7 @@ PTX_host2dev (void *d, const void *h, size_t s)
CUresult r;
CUdeviceptr pb;
size_t ps;
+ struct nvptx_thread *nvthd = nvptx_thread ();
if (!s)
return 0;
@@ -1176,7 +1163,7 @@ PTX_host2dev (void *d, const void *h, size_t s)
GOMP_PLUGIN_fatal ("invalid size");
#ifndef DISABLE_ASYNC
- if (current_stream != PTX_dev->null_stream)
+ if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
{
CUevent *e;
@@ -1188,11 +1175,12 @@ PTX_host2dev (void *d, const void *h, size_t s)
event_gc (false);
- r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s, current_stream->stream);
+ r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
+ nvthd->current_stream->stream);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuErrorMsg (r));
- r = cuEventRecord (*e, current_stream->stream);
+ r = cuEventRecord (*e, nvthd->current_stream->stream);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
@@ -1215,6 +1203,7 @@ PTX_dev2host (void *h, const void *d, size_t s)
CUresult r;
CUdeviceptr pb;
size_t ps;
+ struct nvptx_thread *nvthd = nvptx_thread ();
if (!s)
return 0;
@@ -1239,7 +1228,7 @@ PTX_dev2host (void *h, const void *d, size_t s)
GOMP_PLUGIN_fatal ("invalid size");
#ifndef DISABLE_ASYNC
- if (current_stream != PTX_dev->null_stream)
+ if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
{
CUevent *e;
@@ -1251,11 +1240,12 @@ PTX_dev2host (void *h, const void *d, size_t s)
event_gc (false);
- r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s, current_stream->stream);
+ r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
+ nvthd->current_stream->stream);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuErrorMsg (r));
- r = cuEventRecord (*e, current_stream->stream);
+ r = cuEventRecord (*e, nvthd->current_stream->stream);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
@@ -1275,7 +1265,9 @@ PTX_dev2host (void *h, const void *d, size_t s)
+/* Make the stream associated with ASYNC the calling thread's current stream.
+ select_stream_for_async is called with its CREATE argument true and no
+ existing CUstream. */
static void
PTX_set_async (int async)
{
- current_stream = select_stream_for_async (async, pthread_self (), true, NULL);
+ /* NOTE(review): NVTHD is dereferenced without a NULL check although
+ nvptx_thread () can return NULL -- presumably thread data always exists
+ by the time this is called; confirm. */
+ struct nvptx_thread *nvthd = nvptx_thread ();
+ nvthd->current_stream
+ = select_stream_for_async (async, pthread_self (), true, NULL);
}
static int
@@ -1313,20 +1305,21 @@ PTX_async_test_all (void)
{
struct PTX_stream *s;
pthread_t self = pthread_self ();
+ struct nvptx_thread *nvthd = nvptx_thread ();
- GOMP_PLUGIN_mutex_lock (&PTX_dev->stream_lock);
+ GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock);
- SLIST_FOREACH (s, &PTX_dev->active_streams, next)
+ SLIST_FOREACH (s, &nvthd->ptx_dev->active_streams, next)
{
if ((s->multithreaded || pthread_equal (s->host_thread, self))
&& cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
{
- GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
+ GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
return 0;
}
}
- GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
+ GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
event_gc (true);
@@ -1395,12 +1388,13 @@ PTX_wait_all (void)
CUresult r;
struct PTX_stream *s;
pthread_t self = pthread_self ();
+ struct nvptx_thread *nvthd = nvptx_thread ();
- GOMP_PLUGIN_mutex_lock (&PTX_dev->stream_lock);
+ GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock);
/* Wait for active streams initiated by this thread (or by multiple threads)
to complete. */
- SLIST_FOREACH (s, &PTX_dev->active_streams, next)
+ SLIST_FOREACH (s, &nvthd->ptx_dev->active_streams, next)
{
if (s->multithreaded || pthread_equal (s->host_thread, self))
{
@@ -1416,7 +1410,7 @@ PTX_wait_all (void)
}
}
- GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
+ GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
event_gc (true);
}
@@ -1427,6 +1421,7 @@ PTX_wait_all_async (int async)
CUresult r;
struct PTX_stream *waiting_stream, *other_stream;
CUevent *e;
+ struct nvptx_thread *nvthd = nvptx_thread ();
pthread_t self = pthread_self ();
/* The stream doing the waiting. This could be the first mention of the
@@ -1436,14 +1431,14 @@ PTX_wait_all_async (int async)
/* Launches on the null stream already block on other streams in the
context. */
- if (!waiting_stream || waiting_stream == PTX_dev->null_stream)
+ if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
return;
event_gc (true);
- GOMP_PLUGIN_mutex_lock (&PTX_dev->stream_lock);
+ GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock);
- SLIST_FOREACH (other_stream, &PTX_dev->active_streams, next)
+ SLIST_FOREACH (other_stream, &nvthd->ptx_dev->active_streams, next)
{
if (!other_stream->multithreaded
&& !pthread_equal (other_stream->host_thread, self))
@@ -1467,33 +1462,38 @@ PTX_wait_all_async (int async)
GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuErrorMsg (r));
}
- GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
+ GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}
+/* Return a pointer to the CUdevice handle of the calling thread's device,
+ or NULL if the thread has no PTX thread data or no device attached. */
static void *
PTX_get_current_cuda_device (void)
{
- if (!PTX_dev)
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ if (!nvthd || !nvthd->ptx_dev)
return NULL;
- return &PTX_dev->dev;
+ return &nvthd->ptx_dev->dev;
}
+/* Return the CUDA context of the calling thread's device, or NULL if the
+ thread has no PTX thread data or no device attached. Note that, unlike
+ PTX_get_current_cuda_device, this returns the context value itself rather
+ than a pointer to the field. */
static void *
PTX_get_current_cuda_context (void)
{
- if (!PTX_dev)
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ if (!nvthd || !nvthd->ptx_dev)
return NULL;
- return PTX_dev->ctx;
+ return nvthd->ptx_dev->ctx;
}
static void *
PTX_get_cuda_stream (int async)
{
struct PTX_stream *s;
+ struct nvptx_thread *nvthd = nvptx_thread ();
- if (!PTX_dev)
+ if (!nvthd || !nvthd->ptx_dev)
return NULL;
s = select_stream_for_async (async, pthread_self (), false, NULL);
@@ -1506,8 +1506,9 @@ PTX_set_cuda_stream (int async, void *stream)
{
struct PTX_stream *oldstream;
pthread_t self = pthread_self ();
+ struct nvptx_thread *nvthd = nvptx_thread ();
- GOMP_PLUGIN_mutex_lock (&PTX_dev->stream_lock);
+ GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock);
if (async < 0)
GOMP_PLUGIN_fatal ("bad async %d", async);
@@ -1524,14 +1525,15 @@ PTX_set_cuda_stream (int async, void *stream)
if (oldstream)
{
- SLIST_REMOVE (&PTX_dev->active_streams, oldstream, PTX_stream, next);
+ SLIST_REMOVE (&nvthd->ptx_dev->active_streams, oldstream, PTX_stream,
+ next);
cuStreamDestroy (oldstream->stream);
map_fini (oldstream);
free (oldstream);
}
- GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
+ GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
(void) select_stream_for_async (async, self, true, (CUstream) stream);
@@ -1754,9 +1756,11 @@ openacc_close_device (void *h)
+/* Request device number N (which must be non-negative) for the calling
+ thread: PTX_open_device (N) is called unless the thread's current device
+ already has ordinal N. */
void
openacc_set_device_num (int n)
{
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
assert (n >= 0);
- if (!PTX_dev || PTX_dev->ord != n)
+ /* nvptx_thread () may return NULL (GOMP_PLUGIN_acc_thread does so when the
+ thread has no goacc_thread), so guard it the same way
+ openacc_get_device_num and the PTX_get_current_cuda_* accessors do. */
+ /* NOTE(review): the handle returned by PTX_open_device is discarded, and
+ PTX_open_device no longer records the device in thread-local state, so
+ this call appears to allocate a PTX_device that nothing references --
+ confirm whether the result should be stored. */
+ if (!nvthd || !nvthd->ptx_dev || nvthd->ptx_dev->ord != n)
(void) PTX_open_device (n);
}
@@ -1768,8 +1772,10 @@ openacc_set_device_num (int n)
+/* Return the ordinal of the calling thread's device, or -1 if the thread has
+ no PTX thread data or no device attached. */
int
openacc_get_device_num (void)
{
- if (PTX_dev)
- return PTX_dev->ord;
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ if (nvthd && nvthd->ptx_dev)
+ return nvthd->ptx_dev->ord;
else
return -1;
}
@@ -1788,6 +1794,7 @@ openacc_register_async_cleanup (void *targ_mem_desc)
{
CUevent *e;
CUresult r;
+ struct nvptx_thread *nvthd = nvptx_thread ();
#ifdef DEBUG
fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__,
@@ -1800,7 +1807,7 @@ openacc_register_async_cleanup (void *targ_mem_desc)
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
- r = cuEventRecord (*e, current_stream->stream);
+ r = cuEventRecord (*e, nvthd->current_stream->stream);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
@@ -1876,6 +1883,40 @@ openacc_async_set_async (int async)
}
+/* Allocate and return the PTX thread-specific data for the calling thread,
+ attached to the device instance TARG_DATA (a struct PTX_device pointer).
+ If the thread has no current CUDA context, the device's context is pushed
+ onto it. The new data starts out on the device's null stream. The
+ result is released with openacc_destroy_thread_data. */
void *
+openacc_create_thread_data (void *targ_data)
+{
+ struct PTX_device *ptx_dev = (struct PTX_device *) targ_data;
+ struct nvptx_thread *nvthd
+ = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
+ CUresult r;
+ CUcontext thd_ctx;
+
+ r = cuCtxGetCurrent (&thd_ctx);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuErrorMsg (r));
+
+ assert (ptx_dev->ctx);
+
+ /* NOTE(review): when the thread already has a current context it is left
+ untouched, even if it differs from ptx_dev->ctx -- confirm that this
+ case cannot arise or is intended. */
+ if (!thd_ctx)
+ {
+ r = cuCtxPushCurrent (ptx_dev->ctx);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuErrorMsg (r));
+ }
+
+ nvthd->current_stream = ptx_dev->null_stream;
+ nvthd->ptx_dev = ptx_dev;
+
+ return (void *) nvthd;
+}
+
+/* Release thread-specific data DATA, as allocated by
+ openacc_create_thread_data. Only the structure itself is freed; the
+ stream and device it points to are not touched. */
+void
+openacc_destroy_thread_data (void *data)
+{
+ free (data);
+}
+
+void *
openacc_get_current_cuda_device (void)
{
#ifdef DEBUG
@@ -80,6 +80,7 @@ splay_compare (splay_tree_key x, splay_tree_key y)
}
#include "target.h"
+#include "oacc-int.h"
attribute_hidden void
gomp_init_targets_once (void)
@@ -815,21 +816,28 @@ gomp_init_dev_tables (struct gomp_device_descr *devicep)
}
+/* Empty DEVICEP's memory map: repeatedly remove the root of the splay tree
+ and free the target_mem_desc it refers to (and that descriptor's array),
+ then mark the map as uninitialized. The device itself is not finalized
+ here; see gomp_fini_device. */
attribute_hidden void
-gomp_fini_device (struct gomp_device_descr *devicep)
+gomp_free_memmap (struct gomp_device_descr *devicep)
{
struct gomp_memory_mapping *mm = &devicep->mem_map;
- if (devicep->is_initialized)
- devicep->device_fini_func ();
-
while (mm->splay_tree.root)
{
struct target_mem_desc *tgt = mm->splay_tree.root->key.tgt;
+
+ /* Unlink the node before freeing TGT. NOTE(review): presumably the
+ node's storage lives inside tgt->array, which would make the old
+ free-then-remove order a use-after-free -- confirm. */
+ splay_tree_remove (&mm->splay_tree, &mm->splay_tree.root->key);
free (tgt->array);
free (tgt);
- splay_tree_remove (&mm->splay_tree, &mm->splay_tree.root->key);
}
+ mm->is_initialized = false;
+}
+
+/* Finalize DEVICEP: call the plugin's device_fini_func if the device is
+ marked initialized, then clear the is_initialized flag. The device's
+ memory map is not freed here. */
+attribute_hidden void
+gomp_fini_device (struct gomp_device_descr *devicep)
+{
+ if (devicep->is_initialized)
+ devicep->device_fini_func ();
+
devicep->is_initialized = false;
}
@@ -1076,6 +1084,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
DLSYM_OPT (openacc.async_wait_all, openacc_async_wait_all);
DLSYM_OPT (openacc.async_wait_all_async, openacc_async_wait_all_async);
DLSYM_OPT (openacc.async_set_async, openacc_async_set_async);
+ DLSYM_OPT (openacc.create_thread_data, openacc_create_thread_data);
+ DLSYM_OPT (openacc.destroy_thread_data, openacc_destroy_thread_data);
/* Require all the OpenACC handlers if we have TARGET_CAP_OPENACC_200. */
if (optional_present != optional_total)
{
@@ -1155,6 +1165,8 @@ gomp_find_available_plugins (void)
while ((ent = readdir (dir)) != NULL)
{
struct gomp_device_descr current_device, *devicep;
+ unsigned int i;
+
if (!gomp_check_plugin_file_name (ent->d_name))
continue;
if (strlen (plugin_path) + 1 + strlen (ent->d_name) >= PATH_MAX)
@@ -1172,18 +1184,24 @@ gomp_find_available_plugins (void)
goto out;
}
- devices[num_devices] = current_device;
- devicep = &devices[num_devices];
-
- devicep->is_initialized = false;
- devicep->offload_regions_registered = false;
- devicep->mem_map.splay_tree.root = NULL;
- devicep->mem_map.is_initialized = false;
- devicep->type = devicep->get_type_func ();
- devicep->name = devicep->get_name_func ();
- devicep->capabilities = devicep->get_caps_func ();
- gomp_mutex_init (&devicep->mem_map.lock);
- devicep->id = ++num_devices;
+ for (i = 0; i < current_device.get_num_devices_func (); i++)
+ {
+ devices[num_devices] = current_device;
+ devicep = &devices[num_devices];
+
+ devicep->is_initialized = false;
+ devicep->offload_regions_registered = false;
+ devicep->mem_map.splay_tree.root = NULL;
+ devicep->mem_map.is_initialized = false;
+ devicep->type = devicep->get_type_func ();
+ devicep->name = devicep->get_name_func ();
+ devicep->capabilities = devicep->get_caps_func ();
+ gomp_mutex_init (&devicep->mem_map.lock);
+ devicep->ord = i;
+ devicep->target_data = NULL;
+ devicep->openacc.data_environ = NULL;
+ devicep->id = ++num_devices;
+ }
}
/* Prefer a device with TARGET_CAP_OPENMP_400 for ICV default-device-var. */
if (num_devices > 1)
@@ -1219,7 +1237,7 @@ gomp_find_available_plugins (void)
found all the plugins, so registering with the OpenACC runtime (which
takes a copy of the pointer argument) must be delayed until now. */
if (devices[i].capabilities & TARGET_CAP_OPENACC_200)
- ACC_plugin_register (&devices[i]);
+ ACC_register (&devices[i]);
}
out:
@@ -87,13 +87,53 @@ struct gomp_memory_mapping
bool is_initialized;
};
-#include "oacc-int.h"
-
-static inline enum acc_device_t
-acc_device_type (enum target_type type)
+/* OpenACC-specific state and entry points of a device. One instance is
+ embedded in struct gomp_device_descr as its "openacc" member. */
+typedef struct ACC_dispatch_t
{
- return (enum acc_device_t) type;
-}
+ /* This is a linked list of data mapped using the
+ acc_map_data/acc_unmap_data or "acc enter data"/"acc exit data" pragmas
+ (TODO). Unlike mapped_data in the goacc_thread struct, unmapping can
+ happen out-of-order with respect to mapping. */
+ struct target_mem_desc *data_environ;
+
+ /* Open or close a device instance. */
+ void *(*open_device_func) (int n);
+ int (*close_device_func) (void *h);
+
+ /* Set or get the device number. */
+ int (*get_device_num_func) (void);
+ void (*set_device_num_func) (int);
+
+ /* Availability. */
+ bool (*avail_func) (void);
+
+ /* Execute. */
+ void (*exec_func) (void (*) (void *), size_t, void **, void **, size_t *,
+ unsigned short *, int, int, int, int, void *);
+
+ /* Async cleanup callback registration. */
+ void (*register_async_cleanup_func) (void *);
+
+ /* Asynchronous routines. */
+ int (*async_test_func) (int);
+ int (*async_test_all_func) (void);
+ void (*async_wait_func) (int);
+ void (*async_wait_async_func) (int, int);
+ void (*async_wait_all_func) (void);
+ void (*async_wait_all_async_func) (int);
+ void (*async_set_async_func) (int);
+
+ /* Create/destroy TLS data. The nvptx plugin's create routine takes the
+ device instance's target data and returns the per-thread structure; the
+ destroy routine takes that per-thread structure back. */
+ void *(*create_thread_data_func) (void *);
+ void (*destroy_thread_data_func) (void *);
+
+ /* NVIDIA target specific routines. */
+ struct {
+ void *(*get_current_device_func) (void);
+ void *(*get_current_context_func) (void);
+ void *(*get_stream_func) (int);
+ int (*set_stream_func) (int, void *);
+ } cuda;
+} ACC_dispatch_t;
struct mapping_table {
uintptr_t host_start;
@@ -118,6 +158,9 @@ struct gomp_device_descr
TARGET construct. */
int id;
+ /* The number of the device for this particular device type. */
+ int ord;
+
/* This is the TYPE of device. */
enum target_type type;
@@ -148,9 +191,11 @@ struct gomp_device_descr
/* OpenACC-specific functions. */
ACC_dispatch_t openacc;
- /* Memory-mapping info (only for OpenMP -- mappings are stored per-thread
- for OpenACC. It's not clear if that's a useful distinction). */
+ /* Memory-mapping info for this device instance. */
struct gomp_memory_mapping mem_map;
+
+ /* Extra information required for a device instance by a given target. */
+ void *target_data;
};
extern struct target_mem_desc *
@@ -175,4 +220,7 @@ gomp_init_tables (const struct gomp_device_descr *devicep,
extern attribute_hidden void
gomp_fini_device (struct gomp_device_descr *devicep);
+extern attribute_hidden void
+gomp_free_memmap (struct gomp_device_descr *devicep);
+
#endif /* _TARGET_H */