Fix hang when running oacc exec with CUDA 9.0 nvprof
2018-02-15 Tom de Vries <tom@codesourcery.com>
* oacc-init.c (acc_init_state_lock, acc_init_state, acc_init_thread):
New variables.
(acc_init_1): Set acc_init_thread to pthread_self (). Set
acc_init_state to initializing at the start, and to initialized at the
end.
(self_initializing_p): New function.
(acc_get_device_type): Return acc_device_none if called by thread that
is currently executing acc_init_1.
---
libgomp/oacc-init.c | 33 +++++++++++++++++++++++++++++++++
2 files changed, 44 insertions(+)
@@ -40,6 +40,11 @@
static gomp_mutex_t acc_device_lock;
+static gomp_mutex_t acc_init_state_lock; /* Protects acc_init_state and acc_init_thread.  */
+static enum { uninitialized, initializing, initialized } acc_init_state
+ = uninitialized;
+static pthread_t acc_init_thread; /* Set by acc_init_1 to its calling thread.  */
+
/* A cached version of the dispatcher for the global "current" accelerator type,
e.g. used as the default when creating new host threads. This is the
device-type equivalent of goacc_device_num (which specifies which device to
@@ -220,6 +225,11 @@ acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs)
static struct gomp_device_descr *
acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit)
{
+ gomp_mutex_lock (&acc_init_state_lock);
+ acc_init_state = initializing;
+ acc_init_thread = pthread_self ();
+ gomp_mutex_unlock (&acc_init_state_lock);
+
bool check_not_nested_p;
if (implicit)
{
@@ -312,6 +322,9 @@ acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit)
&api_info);
}
+ gomp_mutex_lock (&acc_init_state_lock);
+ acc_init_state = initialized;
+ gomp_mutex_unlock (&acc_init_state_lock);
return base_dev;
}
@@ -644,6 +657,17 @@ acc_set_device_type (acc_device_t d)
ialias (acc_set_device_type)
+static bool
+self_initializing_p (void)
+{ /* True iff acc_init_state is initializing and acc_init_thread is the caller.  */
+ bool res;
+ gomp_mutex_lock (&acc_init_state_lock); /* Read both variables under their lock.  */
+ res = (acc_init_state == initializing
+ && pthread_equal (acc_init_thread, pthread_self ()));
+ gomp_mutex_unlock (&acc_init_state_lock);
+ return res;
+}
+
acc_device_t
acc_get_device_type (void)
{
@@ -653,6 +677,15 @@ acc_get_device_type (void)
if (thr && thr->base_dev)
res = acc_device_type (thr->base_dev->type);
+ else if (self_initializing_p ())
+ /* The CUDA libaccinj64.so version 9.0+ calls acc_get_device_type during the
+ acc_ev_device_init_start event callback, which is dispatched during
+ acc_init_1. Trying to lock acc_device_lock during such a call (as we do
+ in the else clause below) will result in deadlock, since the lock has
+ already been taken by the acc_init_1 caller. We work around this problem
+ by using the acc_get_device_type property "If the device type has not yet
+ been selected, the value acc_device_none may be returned". */
+ ;
else
{
acc_prof_info prof_info;