@@ -1,3 +1,29 @@
+2019-09-17 Julian Brown <julian@codesourcery.com>
+
+ * oacc-host.c (host_openacc_async_queue_callback): Invoke callback
+ function immediately.
+ * oacc-parallel.c (struct async_prof_callback_info, async_prof_dispatch,
+ queue_async_prof_dispatch): New.
+ (GOACC_parallel_keyed): Call queue_async_prof_dispatch for asynchronous
+ profile-event dispatches.
+ (GOACC_enter_exit_data): Likewise.
+ (GOACC_update): Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
+ (cb_compute_construct_start): Remove/fix TODO.
+ * testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
+ (cb_exit_data_start): Tweak expected state values.
+ (cb_exit_data_end): Likewise.
+ (cb_compute_construct_start): Remove/fix TODO.
+ (cb_compute_construct_end): Don't do adjustments for
+ acc_ev_enqueue_launch_start/acc_ev_enqueue_launch_end callbacks.
+ (cb_compute_construct_end): Tweak expected state values.
+ (cb_enqueue_launch_start, cb_enqueue_launch_end): Don't expect
+ launch-enqueue operations to happen synchronously with respect to
+ profiling events on async streams.
+ (main): Tweak expected state values.
+ * testsuite/libgomp.oacc-c-c++-common/lib-94.c (main): Reorder
+ operations for async-safety.
+
2019-09-17 Julian Brown <julian@codesourcery.com>
* target.c (gomp_map_vars_internal): Remove read of uninitialised
@@ -250,10 +250,9 @@ host_openacc_async_dev2host (int ord __attribute__ ((unused)),
static void
host_openacc_async_queue_callback (struct goacc_asyncqueue *aq
__attribute__ ((unused)),
- void (*callback_fn)(void *)
- __attribute__ ((unused)),
- void *userptr __attribute__ ((unused)))
+ void (*callback_fn)(void *), void *userptr)
{
+ callback_fn (userptr);
}
static struct goacc_asyncqueue *
@@ -169,6 +169,62 @@ goacc_call_host_fn (void (*fn) (void *), size_t mapnum, void **hostaddrs,
fn (hostaddrs);
}
+struct async_prof_callback_info {
+ acc_prof_info prof_info;
+ acc_event_info event_info;
+ acc_api_info api_info;
+ struct async_prof_callback_info *start_info;
+};
+
+static void
+async_prof_dispatch (void *ptr)
+{
+ struct async_prof_callback_info *info
+ = (struct async_prof_callback_info *) ptr;
+
+ if (info->start_info)
+ {
+ /* The TOOL_INFO must be preserved from a start event to the
+ corresponding end event. Copy that here. */
+ void *tool_info = info->start_info->event_info.other_event.tool_info;
+ info->event_info.other_event.tool_info = tool_info;
+ }
+
+ goacc_profiling_dispatch (&info->prof_info, &info->event_info,
+ &info->api_info);
+
+ /* The async_prof_dispatch function is (so far) always used for start/end
+ profiling event pairs: the start and end parts are queued, then each is
+ dispatched (or the dispatches might be interleaved before the end part is
+ queued).
+ In any case, it's not safe to delete either info structure before the
+ whole bracketed event is complete. */
+
+ if (info->start_info)
+ {
+ free (info->start_info);
+ free (info);
+ }
+}
+
+static struct async_prof_callback_info *
+queue_async_prof_dispatch (struct gomp_device_descr *devicep, goacc_aq aq,
+ acc_prof_info *prof_info, acc_event_info *event_info,
+ acc_api_info *api_info,
+ struct async_prof_callback_info *prev_info)
+{
+ struct async_prof_callback_info *info = malloc (sizeof (*info));
+
+ info->prof_info = *prof_info;
+ info->event_info = *event_info;
+ info->api_info = *api_info;
+ info->start_info = prev_info;
+
+ devicep->openacc.async.queue_callback_func (aq, async_prof_dispatch,
+ (void *) info);
+ return info;
+}
+
/* Launch a possibly offloaded function with FLAGS. FN is the host fn
address. MAPNUM, HOSTADDRS, SIZES & KINDS describe the memory
blocks to be copied to/from the device. Varadic arguments are
@@ -194,6 +250,8 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *), size_t mapnum,
unsigned dims[GOMP_DIM_MAX];
unsigned tag;
bool args_exploded = false;
+ struct async_prof_callback_info *comp_start_info = NULL,
+ *data_start_info = NULL;
#ifdef HAVE_INTTYPES_H
gomp_debug (0, "%s: mapnum=%"PRIu64", hostaddrs=%p, size=%p, kinds=%p\n",
@@ -255,10 +313,6 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *), size_t mapnum,
api_info.async_handle = NULL;
}
- if (profiling_p)
- goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
- &api_info);
-
handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds);
/* Default: let the runtime choose. */
@@ -294,11 +348,12 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *), size_t mapnum,
if (async == GOMP_LAUNCH_OP_MAX)
async = va_arg (ap, unsigned);
- if (profiling_p)
- {
- prof_info.async = async;
- prof_info.async_queue = prof_info.async;
- }
+ /* Set async number in profiling data, unless the device is the
+ host or we're doing host fallback. */
+ if (profiling_p
+ && !(flags & GOACC_FLAG_HOST_FALLBACK)
+ && acc_device_type (acc_dev->type) != acc_device_host)
+ prof_info.async = prof_info.async_queue = async;
break;
}
@@ -321,6 +376,20 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *), size_t mapnum,
}
va_end (ap);
+ goacc_aq aq = get_goacc_asyncqueue (async);
+
+ if (profiling_p)
+ {
+ if (aq)
+ comp_start_info
+ = queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+ &compute_construct_event_info,
+ &api_info, NULL);
+ else
+ goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
+ &api_info);
+ }
+
/* Host fallback if "if" clause is false or if the current device is set to
the host. */
if (flags & GOACC_FLAG_HOST_FALLBACK)
@@ -368,12 +437,16 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *), size_t mapnum,
= compute_construct_event_info.other_event.parent_construct;
enter_exit_data_event_info.other_event.implicit = 1;
enter_exit_data_event_info.other_event.tool_info = NULL;
- goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
- &api_info);
+ if (aq)
+ data_start_info
+ = queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+ &enter_exit_data_event_info, &api_info,
+ NULL);
+ else
+ goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+ &api_info);
}
- goacc_aq aq = get_goacc_asyncqueue (async);
-
tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs, NULL, sizes, kinds,
true, GOMP_MAP_VARS_OPENACC);
@@ -391,8 +464,13 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *), size_t mapnum,
prof_info.event_type = acc_ev_enter_data_end;
enter_exit_data_event_info.other_event.event_type
= prof_info.event_type;
- goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
- &api_info);
+ if (aq)
+ queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+ &enter_exit_data_event_info, &api_info,
+ data_start_info);
+ else
+ goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+ &api_info);
}
devaddrs = gomp_alloca (sizeof (void *) * mapnum);
@@ -423,8 +501,14 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *), size_t mapnum,
prof_info.event_type = acc_ev_exit_data_start;
enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
enter_exit_data_event_info.other_event.tool_info = NULL;
- goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
- &api_info);
+ if (aq)
+ data_start_info
+ = queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+ &enter_exit_data_event_info, &api_info,
+ NULL);
+ else
+ goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+ &api_info);
}
/* If running synchronously, unmap immediately. */
@@ -437,8 +521,13 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *), size_t mapnum,
{
prof_info.event_type = acc_ev_exit_data_end;
enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
- goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
- &api_info);
+ if (aq)
+ queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+ &enter_exit_data_event_info, &api_info,
+ data_start_info);
+ else
+ goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+ &api_info);
}
#ifdef RC_CHECKING
@@ -453,8 +542,13 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *), size_t mapnum,
prof_info.event_type = acc_ev_compute_construct_end;
compute_construct_event_info.other_event.event_type
= prof_info.event_type;
- goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
- &api_info);
+ if (aq)
+ queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+ &compute_construct_event_info, &api_info,
+ comp_start_info);
+ else
+ goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
+ &api_info);
thr->prof_info = NULL;
thr->api_info = NULL;
@@ -697,6 +791,7 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
struct gomp_device_descr *acc_dev;
bool data_enter = false;
size_t i;
+ struct async_prof_callback_info *data_start_info = NULL;
goacc_lazy_initialize ();
@@ -806,9 +901,19 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
api_info.async_handle = NULL;
}
+ goacc_aq aq = get_goacc_asyncqueue (async);
+
if (profiling_p)
- goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
- &api_info);
+ {
+ if (aq)
+ data_start_info
+ = queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+ &enter_exit_data_event_info, &api_info,
+ NULL);
+ else
+ goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+ &api_info);
+ }
if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|| (flags & GOACC_FLAG_HOST_FALLBACK))
@@ -867,7 +972,6 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
case GOMP_MAP_STRUCT:
{
int elems = sizes[i];
- goacc_aq aq = get_goacc_asyncqueue (async);
gomp_map_vars_async (acc_dev, aq, elems + 1, &hostaddrs[i],
NULL, &sizes[i], &kinds[i], true,
GOMP_MAP_VARS_OPENACC_ENTER_DATA);
@@ -890,7 +994,6 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
&sizes[i], &kinds[i]);
else
{
- goacc_aq aq = get_goacc_asyncqueue (async);
for (int j = 0; j < 2; j++)
gomp_map_vars_async (acc_dev, aq,
(j == 0 || pointer == 2) ? 1 : 2,
@@ -1003,7 +1106,6 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
case GOMP_MAP_STRUCT:
{
int elems = sizes[i];
- goacc_aq aq = get_goacc_asyncqueue (async);
for (int j = 1; j <= elems; j++)
{
struct splay_tree_key_s k;
@@ -1067,8 +1169,13 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
prof_info.event_type
= data_enter ? acc_ev_enter_data_end : acc_ev_exit_data_end;
enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
- goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
- &api_info);
+ if (aq)
+ queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+ &enter_exit_data_event_info, &api_info,
+ data_start_info);
+ else
+ goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+ &api_info);
thr->prof_info = NULL;
thr->api_info = NULL;
@@ -1120,6 +1227,8 @@ GOACC_update (int flags_m, size_t mapnum,
struct goacc_thread *thr = goacc_thread ();
struct gomp_device_descr *acc_dev = thr->dev;
+ goacc_aq aq = NULL;
+ struct async_prof_callback_info *update_start_info = NULL;
bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
@@ -1169,7 +1278,15 @@ GOACC_update (int flags_m, size_t mapnum,
}
if (profiling_p)
- goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
+ {
+ aq = get_goacc_asyncqueue (async);
+ if (aq)
+ update_start_info
+ = queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+ &update_event_info, &api_info, NULL);
+ else
+ goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
+ }
if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|| (flags & GOACC_FLAG_HOST_FALLBACK))
@@ -1257,7 +1374,11 @@ GOACC_update (int flags_m, size_t mapnum,
{
prof_info.event_type = acc_ev_update_end;
update_event_info.other_event.event_type = prof_info.event_type;
- goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
+ if (aq)
+ queue_async_prof_dispatch (acc_dev, aq, &prof_info, &update_event_info,
+ &api_info, update_start_info);
+ else
+ goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
thr->prof_info = NULL;
thr->api_info = NULL;
@@ -159,7 +159,10 @@ static void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info
assert (prof_info->device_type == acc_device_type);
assert (prof_info->device_number == acc_device_num);
assert (prof_info->thread_id == -1);
- assert (prof_info->async == /* TODO acc_async */ acc_async_sync);
+ if (acc_device_type == acc_device_host)
+ assert (prof_info->async == acc_async_sync);
+ else
+ assert (prof_info->async == acc_async);
assert (prof_info->async_queue == prof_info->async);
assert (prof_info->src_file == NULL);
assert (prof_info->func_name == NULL);
@@ -284,9 +284,9 @@ static void cb_exit_data_start (acc_prof_info *prof_info, acc_event_info *event_
{
DEBUG_printf ("%s\n", __FUNCTION__);
- assert (state == 7
+ assert (state == 5
#if ASYNC_EXIT_DATA
- || state == 107
+ || state == 105
#endif
);
STATE_OP (state, ++);
@@ -340,9 +340,9 @@ static void cb_exit_data_end (acc_prof_info *prof_info, acc_event_info *event_in
{
DEBUG_printf ("%s\n", __FUNCTION__);
- assert (state == 8
+ assert (state == 6
#if ASYNC_EXIT_DATA
- || state == 108
+ || state == 106
#endif
);
STATE_OP (state, ++);
@@ -426,7 +426,10 @@ static void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info
assert (prof_info->device_type == acc_device_type);
assert (prof_info->device_number == acc_device_num);
assert (prof_info->thread_id == -1);
- assert (prof_info->async == /* TODO acc_async */ acc_async_sync);
+ if (acc_device_type == acc_device_host)
+ assert (prof_info->async == acc_async_sync);
+ else
+ assert (prof_info->async == acc_async);
assert (prof_info->async_queue == prof_info->async);
assert (prof_info->src_file == NULL);
assert (prof_info->func_name == NULL);
@@ -467,9 +470,6 @@ static void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *
{
/* Compensate for the missing 'acc_ev_enter_data_end'. */
state += 1;
- /* Compensate for the missing 'acc_ev_enqueue_launch_start' and
- 'acc_ev_enqueue_launch_end'. */
- state += 2;
/* Compensate for the missing 'acc_ev_exit_data_start' and
'acc_ev_exit_data_end'. */
state += 2;
@@ -482,8 +482,8 @@ static void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *
state += 2;
}
#endif
- assert (state == 9
- || state == 109);
+ assert (state == 7
+ || state == 107);
STATE_OP (state, ++);
assert (tool_info != NULL);
@@ -537,17 +537,6 @@ static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *e
assert (acc_device_type != acc_device_host);
- assert (state == 5
- || state == 105);
- STATE_OP (state, ++);
-
- assert (tool_info != NULL);
- assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
- assert (tool_info->nested == NULL);
- tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
- assert (tool_info->nested != NULL);
- tool_info->nested->nested = NULL;
-
assert (prof_info->event_type == acc_ev_enqueue_launch_start);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
@@ -591,13 +580,6 @@ static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *e
assert (api_info->device_handle == NULL);
assert (api_info->context_handle == NULL);
assert (api_info->async_handle == NULL);
-
- tool_info->nested->event_info.launch_event.event_type = event_info->launch_event.event_type;
- tool_info->nested->event_info.launch_event.kernel_name = strdup (event_info->launch_event.kernel_name);
- tool_info->nested->event_info.launch_event.num_gangs = event_info->launch_event.num_gangs;
- tool_info->nested->event_info.launch_event.num_workers = event_info->launch_event.num_workers;
- tool_info->nested->event_info.launch_event.vector_length = event_info->launch_event.vector_length;
- event_info->other_event.tool_info = tool_info->nested;
}
static void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
@@ -606,19 +588,6 @@ static void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info *eve
assert (acc_device_type != acc_device_host);
- assert (state == 6
- || state == 106);
- STATE_OP (state, ++);
-
- assert (tool_info != NULL);
- assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
- assert (tool_info->nested != NULL);
- assert (tool_info->nested->event_info.launch_event.event_type == acc_ev_enqueue_launch_start);
- assert (tool_info->nested->event_info.launch_event.kernel_name != NULL);
- assert (tool_info->nested->event_info.launch_event.num_gangs >= 1);
- assert (tool_info->nested->event_info.launch_event.num_workers >= 1);
- assert (tool_info->nested->event_info.launch_event.vector_length >= 1);
-
assert (prof_info->event_type == acc_ev_enqueue_launch_end);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
@@ -638,12 +607,7 @@ static void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info *eve
assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
assert (event_info->launch_event.parent_construct == acc_construct_parallel);
assert (event_info->launch_event.implicit == 1);
- assert (event_info->launch_event.tool_info == tool_info->nested);
assert (event_info->launch_event.kernel_name != NULL);
- assert (strcmp (event_info->launch_event.kernel_name, tool_info->nested->event_info.launch_event.kernel_name) == 0);
- assert (event_info->launch_event.num_gangs == tool_info->nested->event_info.launch_event.num_gangs);
- assert (event_info->launch_event.num_workers == tool_info->nested->event_info.launch_event.num_workers);
- assert (event_info->launch_event.vector_length == tool_info->nested->event_info.launch_event.vector_length);
if (acc_device_type == acc_device_host)
assert (api_info->device_api == acc_device_api_none);
@@ -657,10 +621,6 @@ static void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info *eve
assert (api_info->device_handle == NULL);
assert (api_info->context_handle == NULL);
assert (api_info->async_handle == NULL);
-
- free ((void *) tool_info->nested->event_info.launch_event.kernel_name);
- free (tool_info->nested);
- tool_info->nested = NULL;
}
@@ -707,7 +667,7 @@ int main()
}
assert (state_init == 4);
}
- assert (state == 10);
+ assert (state == 8);
STATE_OP (state, = 100);
@@ -723,7 +683,7 @@ int main()
#pragma acc wait
assert (state_init == 104);
}
- assert (state == 110);
+ assert (state == 108);
return 0;
}
@@ -22,10 +22,10 @@ main (int argc, char **argv)
acc_copyin_async (h, N, async);
- memset (h, 0, N);
-
acc_wait (async);
+ memset (h, 0, N);
+
acc_copyout_async (h, N, async + 1);
acc_wait (async + 1);