@@ -1,3 +1,36 @@
+2019-09-17 Julian Brown <julian@codesourcery.com>
+
+ * libgomp-plugin.h (GOMP_OFFLOAD_openacc_async_host2dev): Update
+ prototype.
+ * libgomp.h (gomp_copy_host2dev): Update prototype.
+ * oacc-host.c (host_openacc_async_host2dev): Add ephemeral parameter.
+ * oacc-mem.c (memcpy_tofrom_device): Update call to gomp_copy_host2dev.
+ (update_dev_host): Likewise.
+ * oacc-parallel.c (GOACC_enter_exit_data): Call async versions of
+ acc_attach/acc_detach/acc_detach_finalize functions.
+ * plugin/plugin-gcn.c (wait_for_queue_nonfull): Don't lock/unlock
+ aq->mutex here.
+ (queue_push_launch): Lock aq->mutex before calling
+ wait_for_queue_nonfull.
+ (queue_push_callback): Likewise.
+ (queue_push_asyncwait): Likewise.
+ (queue_push_placeholder): Likewise.
+ (GOMP_OFFLOAD_openacc_async_host2dev): Add ephemeral parameter. Copy
+ source data to temporary space immediately if true, and pass to
+ queue_push_copy.
+ (goacc_device_copy_async): Remove.
+ (gomp_copy_host2dev): Add ephemeral parameter. Update function comment.
+ Call async host2dev plugin hook directly.
+ (gomp_copy_dev2host): Call async dev2host plugin hook directly.
+ (gomp_map_vars_existing, gomp_map_pointer, gomp_attach_pointer,
+ gomp_detach_pointer): Update calls to gomp_copy_host2dev.
+ (gomp_map_vars_internal): Don't use coalescing buffer for asynchronous
+ copies. Update calls to gomp_copy_host2dev.
+ (gomp_update): Update calls to gomp_copy_host2dev.
+ * testsuite/libgomp.oacc-c-c++-common/deep-copy-10.c (main): Fix
+ async-safety issue. Increase number of iterations.
+ * testsuite/libgomp.oacc-fortran/lib-16-2.f90: Fix async-safety issue.
+
2019-09-17 Julian Brown <julian@codesourcery.com>
* oacc-host.c (host_openacc_async_queue_callback): Invoke callback
@@ -141,7 +141,8 @@ extern void GOMP_OFFLOAD_openacc_async_exec_params (void (*) (void *), size_t,
struct goacc_asyncqueue *);
extern bool GOMP_OFFLOAD_openacc_async_dev2host (int, void *, const void *, size_t,
struct goacc_asyncqueue *);
-extern bool GOMP_OFFLOAD_openacc_async_host2dev (int, void *, const void *, size_t,
+extern bool GOMP_OFFLOAD_openacc_async_host2dev (int, void *, const void *,
+ size_t, bool,
struct goacc_asyncqueue *);
extern void *GOMP_OFFLOAD_openacc_cuda_get_current_device (void);
extern void *GOMP_OFFLOAD_openacc_cuda_get_current_context (void);
@@ -1120,7 +1120,7 @@ extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *,
struct gomp_coalesce_buf;
extern void gomp_copy_host2dev (struct gomp_device_descr *,
struct goacc_asyncqueue *, void *, const void *,
- size_t, struct gomp_coalesce_buf *);
+ size_t, bool, struct gomp_coalesce_buf *);
extern void gomp_copy_dev2host (struct gomp_device_descr *,
struct goacc_asyncqueue *, void *, const void *,
size_t);
@@ -230,6 +230,7 @@ host_openacc_async_host2dev (int ord __attribute__ ((unused)),
void *dst __attribute__ ((unused)),
const void *src __attribute__ ((unused)),
size_t n __attribute__ ((unused)),
+ bool eph __attribute__ ((unused)),
struct goacc_asyncqueue *aq
__attribute__ ((unused)))
{
@@ -203,7 +203,7 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
if (from)
gomp_copy_dev2host (thr->dev, aq, h, d, s);
else
- gomp_copy_host2dev (thr->dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
+ gomp_copy_host2dev (thr->dev, aq, d, h, s, false, /* TODO: cbuf? */ NULL);
if (profiling_p)
{
@@ -819,7 +819,7 @@ update_dev_host (int is_dev, void *h, size_t s, int async)
goacc_aq aq = get_goacc_asyncqueue (async);
if (is_dev)
- gomp_copy_host2dev (acc_dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
+ gomp_copy_host2dev (acc_dev, aq, d, h, s, false, /* TODO: cbuf? */ NULL);
else
gomp_copy_dev2host (acc_dev, aq, h, d, s);
@@ -1022,7 +1022,7 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
if (!pointer)
{
if (kind == GOMP_MAP_ATTACH)
- acc_attach (hostaddrs[i]);
+ acc_attach_async (hostaddrs[i], async);
else if (kind == GOMP_MAP_STRUCT)
i += sizes[i];
}
@@ -1042,9 +1042,9 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
if (!pointer)
{
if (kind == GOMP_MAP_DETACH)
- acc_detach (hostaddrs[i]);
+ acc_detach_async (hostaddrs[i], async);
else if (kind == GOMP_MAP_FORCE_DETACH)
- acc_detach_finalize (hostaddrs[i]);
+ acc_detach_finalize_async (hostaddrs[i], async);
else if (kind == GOMP_MAP_STRUCT)
i += sizes[i];
}
@@ -1053,9 +1053,9 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
unsigned char kind2 = kinds[i + pointer - 1] & 0xff;
if (kind2 == GOMP_MAP_DETACH)
- acc_detach (hostaddrs[i + pointer - 1]);
+ acc_detach_async (hostaddrs[i + pointer - 1], async);
else if (kind2 == GOMP_MAP_FORCE_DETACH)
- acc_detach_finalize (hostaddrs[i + pointer - 1]);
+ acc_detach_finalize_async (hostaddrs[i + pointer - 1], async);
i += pointer - 1;
}
@@ -1408,13 +1408,9 @@ wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
{
if (aq->queue_n == ASYNC_QUEUE_SIZE)
{
- pthread_mutex_lock (&aq->mutex);
-
/* Queue is full. Wait for it to not be full. */
while (aq->queue_n == ASYNC_QUEUE_SIZE)
pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
-
- pthread_mutex_unlock (&aq->mutex);
}
}
@@ -1424,10 +1420,10 @@ queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
{
assert (aq->agent == kernel->agent);
- wait_for_queue_nonfull (aq);
-
pthread_mutex_lock (&aq->mutex);
+ wait_for_queue_nonfull (aq);
+
int queue_last = ((aq->queue_first + aq->queue_n)
% ASYNC_QUEUE_SIZE);
if (DEBUG_QUEUES)
@@ -1453,10 +1449,10 @@ static void
queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
void *data)
{
- wait_for_queue_nonfull (aq);
-
pthread_mutex_lock (&aq->mutex);
+ wait_for_queue_nonfull (aq);
+
int queue_last = ((aq->queue_first + aq->queue_n)
% ASYNC_QUEUE_SIZE);
if (DEBUG_QUEUES)
@@ -1484,10 +1480,10 @@ static void
queue_push_asyncwait (struct goacc_asyncqueue *aq,
struct placeholder *placeholderp)
{
- wait_for_queue_nonfull (aq);
-
pthread_mutex_lock (&aq->mutex);
+ wait_for_queue_nonfull (aq);
+
int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
if (DEBUG_QUEUES)
HSA_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
@@ -1511,10 +1507,10 @@ queue_push_placeholder (struct goacc_asyncqueue *aq)
{
struct placeholder *placeholderp;
- wait_for_queue_nonfull (aq);
-
pthread_mutex_lock (&aq->mutex);
+ wait_for_queue_nonfull (aq);
+
int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
if (DEBUG_QUEUES)
HSA_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
@@ -3683,19 +3679,22 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
bool
GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
- size_t n, struct goacc_asyncqueue *aq)
+ size_t n, bool ephemeral,
+ struct goacc_asyncqueue *aq)
{
struct agent_info *agent = get_agent_info (device);
assert (agent == aq->agent);
- /* The source data does not necessarily remain live until the deferred
- copy happens. Taking a snapshot of the data here avoids reading
- uninitialised data later, but means that (a) data is copied twice and
- (b) modifications to the copied data between the "spawning" point of
- the asynchronous kernel and when it is executed will not be seen.
- But, that is probably correct. */
- void *src_copy = GOMP_PLUGIN_malloc (n);
- memcpy (src_copy, src, n);
- queue_push_copy (aq, dst, src_copy, n, true);
+
+ if (ephemeral)
+ {
+ /* The source data is on the stack or otherwise may be deallocated
+ before the asynchronous copy takes place. Take a copy of the source
+ data. */
+ void *src_copy = GOMP_PLUGIN_malloc (n);
+ memcpy (src_copy, src, n);
+ src = src_copy;
+ }
+ queue_push_copy (aq, dst, src, n, ephemeral);
return true;
}
@@ -194,22 +194,6 @@ gomp_device_copy (struct gomp_device_descr *devicep,
}
}
-static inline void
-goacc_device_copy_async (struct gomp_device_descr *devicep,
- bool (*copy_func) (int, void *, const void *, size_t,
- struct goacc_asyncqueue *),
- const char *dst, void *dstaddr,
- const char *src, const void *srcaddr,
- size_t size, struct goacc_asyncqueue *aq)
-{
- if (!copy_func (devicep->target_id, dstaddr, srcaddr, size, aq))
- {
- gomp_mutex_unlock (&devicep->lock);
- gomp_fatal ("Copying of %s object [%p..%p) to %s object [%p..%p) failed",
- src, srcaddr, srcaddr + size, dst, dstaddr, dstaddr + size);
- }
-}
-
/* Infrastructure for coalescing adjacent or nearly adjacent (in device
addresses) host to device memory transfers. */
@@ -303,15 +287,17 @@ gomp_to_device_kind_p (int kind)
}
/* Copy host memory to an offload device. In asynchronous mode (if AQ is
- non-NULL), H may point to a stack location. It is up to the underlying
- plugin to ensure that this data is read immediately, rather than at some
- later point when the stack frame will likely have been destroyed. */
+ non-NULL), when the source data is stack or may otherwise be deallocated
+ before the asynchronous copy takes place, EPHEMERAL must be passed as
+ TRUE. The CBUF isn't used for non-ephemeral asynchronous copies, because
+ the host data might not be computed yet (by an earlier asynchronous compute
+ region). */
attribute_hidden void
gomp_copy_host2dev (struct gomp_device_descr *devicep,
struct goacc_asyncqueue *aq,
void *d, const void *h, size_t sz,
- struct gomp_coalesce_buf *cbuf)
+ bool ephemeral, struct gomp_coalesce_buf *cbuf)
{
if (cbuf)
{
@@ -339,8 +325,15 @@ gomp_copy_host2dev (struct gomp_device_descr *devicep,
}
}
if (__builtin_expect (aq != NULL, 0))
- goacc_device_copy_async (devicep, devicep->openacc.async.host2dev_func,
- "dev", d, "host", h, sz, aq);
+ {
+ if (!devicep->openacc.async.host2dev_func (devicep->target_id, d, h, sz,
+ ephemeral, aq))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ gomp_fatal ("Copying of host object [%p..%p) to dev object [%p..%p) "
+ "failed", h, h + sz, d, d + sz);
+ }
+ }
else
gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
}
@@ -351,8 +344,15 @@ gomp_copy_dev2host (struct gomp_device_descr *devicep,
void *h, const void *d, size_t sz)
{
if (__builtin_expect (aq != NULL, 0))
- goacc_device_copy_async (devicep, devicep->openacc.async.dev2host_func,
- "host", h, "dev", d, sz, aq);
+ {
+ if (!devicep->openacc.async.dev2host_func (devicep->target_id, h, d, sz,
+ aq))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ gomp_fatal ("Copying of dev object [%p..%p) to host object [%p..%p) "
+ "failed", d, d + sz, h, h + sz);
+ }
+ }
else
gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz);
}
@@ -579,7 +579,7 @@ gomp_map_vars_existing (struct gomp_device_descr *devicep,
(void *) (oldn->tgt->tgt_start + oldn->tgt_offset
+ newn->host_start - oldn->host_start),
(void *) newn->host_start,
- newn->host_end - newn->host_start, cbuf);
+ newn->host_end - newn->host_start, false, cbuf);
if (oldn->refcount != REFCOUNT_INFINITY)
oldn->refcount++;
@@ -607,8 +607,8 @@ gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
cur_node.tgt_offset = (uintptr_t) NULL;
gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start + target_offset),
- (void *) &cur_node.tgt_offset, sizeof (void *),
- cbuf);
+ (void *) &cur_node.tgt_offset,
+ sizeof (void *), true, cbuf);
return;
}
/* Add bias to the pointer value. */
@@ -628,7 +628,8 @@ gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
to initialize the pointer with. */
cur_node.tgt_offset -= bias;
gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + target_offset),
- (void *) &cur_node.tgt_offset, sizeof (void *), cbuf);
+ (void *) &cur_node.tgt_offset, sizeof (void *), true,
+ cbuf);
}
static void
@@ -760,7 +761,7 @@ gomp_attach_pointer (struct gomp_device_descr *devicep,
(void *) (n->tgt->tgt_start + n->tgt_offset), (void *) data);
gomp_copy_host2dev (devicep, aq, (void *) devptr, (void *) &data,
- sizeof (void *), cbufp);
+ sizeof (void *), true, cbufp);
}
else
gomp_debug (1, "%s: attach count for %p -> %u\n", __FUNCTION__,
@@ -815,7 +816,7 @@ gomp_detach_pointer (struct gomp_device_descr *devicep,
(void *) target);
gomp_copy_host2dev (devicep, aq, (void *) devptr, (void *) &target,
- sizeof (void *), cbufp);
+ sizeof (void *), true, cbufp);
}
else
gomp_debug (1, "%s: attach count for %p -> %u\n", __FUNCTION__,
@@ -1147,8 +1148,9 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
for (i = first; i <= last; i++)
{
tgt->list[i].key = NULL;
- if (gomp_to_device_kind_p (get_kind (short_mapkind, kinds, i)
- & typemask))
+ if (!aq
+ && gomp_to_device_kind_p (get_kind (short_mapkind, kinds,
+ i) & typemask))
gomp_coalesce_buf_add (&cbuf,
tgt_size - cur_node.host_end
+ (uintptr_t) hostaddrs[i],
@@ -1209,8 +1211,9 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
if (tgt_align < align)
tgt_align = align;
tgt_size = (tgt_size + align - 1) & ~(align - 1);
- gomp_coalesce_buf_add (&cbuf, tgt_size,
- cur_node.host_end - cur_node.host_start);
+ if (!aq)
+ gomp_coalesce_buf_add (&cbuf, tgt_size,
+ cur_node.host_end - cur_node.host_start);
tgt_size += cur_node.host_end - cur_node.host_start;
has_firstprivate = true;
continue;
@@ -1240,7 +1243,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
if (tgt_align < align)
tgt_align = align;
tgt_size = (tgt_size + align - 1) & ~(align - 1);
- if (gomp_to_device_kind_p (kind & typemask))
+ if (!aq && gomp_to_device_kind_p (kind & typemask))
gomp_coalesce_buf_add (&cbuf, tgt_size,
cur_node.host_end - cur_node.host_start);
tgt_size += cur_node.host_end - cur_node.host_start;
@@ -1395,7 +1398,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
len = sizes[i];
gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start + tgt_size),
- (void *) hostaddrs[i], len, cbufp);
+ (void *) hostaddrs[i], len, false, cbufp);
tgt_size += len;
continue;
case GOMP_MAP_FIRSTPRIVATE_INT:
@@ -1448,12 +1451,11 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
if (cur_node.tgt_offset)
cur_node.tgt_offset -= sizes[i];
gomp_copy_host2dev (devicep, aq,
- (void *) (n->tgt->tgt_start
- + n->tgt_offset
+ (void *) (n->tgt->tgt_start + n->tgt_offset
+ cur_node.host_start
- n->host_start),
(void *) &cur_node.tgt_offset,
- sizeof (void *), cbufp);
+ sizeof (void *), true, cbufp);
cur_node.tgt_offset = n->tgt->tgt_start + n->tgt_offset
+ cur_node.host_start - n->host_start;
continue;
@@ -1612,7 +1614,8 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
(void *) (tgt->tgt_start
+ k->tgt_offset),
(void *) k->host_start,
- k->host_end - k->host_start, cbufp);
+ k->host_end - k->host_start, false,
+ cbufp);
break;
case GOMP_MAP_POINTER:
gomp_map_pointer (tgt, aq,
@@ -1624,7 +1627,8 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
(void *) (tgt->tgt_start
+ k->tgt_offset),
(void *) k->host_start,
- k->host_end - k->host_start, cbufp);
+ k->host_end - k->host_start, false,
+ cbufp);
for (j = i + 1; j < mapnum; j++)
if (!GOMP_MAP_POINTER_P (get_kind (short_mapkind, kinds,
@@ -1676,7 +1680,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
(void *) (tgt->tgt_start
+ k->tgt_offset),
(void *) k->host_start,
- sizeof (void *), cbufp);
+ sizeof (void *), false, cbufp);
break;
default:
gomp_mutex_unlock (&devicep->lock);
@@ -1692,7 +1696,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
/* We intentionally do not use coalescing here, as it's not
data allocated by the current call to this function. */
gomp_copy_host2dev (devicep, aq, (void *) n->tgt_offset,
- &tgt_addr, sizeof (void *), NULL);
+ &tgt_addr, sizeof (void *), true, NULL);
}
array++;
}
@@ -1779,7 +1783,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
(void *) tgt->tgt_start
+ k->tgt_offset,
(void *) k->host_start,
- da->data_row_size, cbufp);
+ da->data_row_size, false, cbufp);
array++;
}
target_data_rows[row_start + j] = (void *) target_row_addr;
@@ -1793,7 +1797,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
void *ptrblock = gomp_dynamic_array_create_ptrblock
(da, target_ptrblock, target_data_rows + row_start);
gomp_copy_host2dev (devicep, aq, target_ptrblock, ptrblock,
- da->ptrblock_size, cbufp);
+ da->ptrblock_size, true, cbufp);
free (ptrblock);
}
@@ -1817,7 +1821,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start + i * sizeof (void *)),
(void *) &cur_node.tgt_offset, sizeof (void *),
- cbufp);
+ true, cbufp);
}
}
@@ -1829,7 +1833,8 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
(void *) (tgt->tgt_start + cbuf.chunks[c].start),
(char *) cbuf.buf + (cbuf.chunks[c].start
- cbuf.chunks[0].start),
- cbuf.chunks[c].end - cbuf.chunks[c].start, NULL);
+ cbuf.chunks[c].end - cbuf.chunks[c].start, true,
+ NULL);
free (cbuf.buf);
cbuf.buf = NULL;
cbufp = NULL;
@@ -2099,7 +2104,7 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
if (GOMP_MAP_COPY_TO_P (kind & typemask))
gomp_copy_host2dev (devicep, NULL, devaddr, hostaddr, size,
- NULL);
+ false, NULL);
if (GOMP_MAP_COPY_FROM_P (kind & typemask))
gomp_copy_dev2host (devicep, NULL, hostaddr, devaddr, size);
}
@@ -1,6 +1,10 @@
#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
-/* Test asyncronous attach and detach operation. */
+#define ITERATIONS 1023
+
+/* Test asynchronous attach and detach operation. */
typedef struct {
int *a;
@@ -25,13 +29,13 @@ main (int argc, char* argv[])
#pragma acc enter data copyin(m)
- for (int i = 0; i < 99; i++)
+ for (int i = 0; i < ITERATIONS; i++)
{
int j;
-#pragma acc parallel loop copy(m.a[0:N]) async(i % 2)
+#pragma acc parallel loop copy(m.a[0:N]) async(0)
for (j = 0; j < N; j++)
m.a[j]++;
-#pragma acc parallel loop copy(m.b[0:N]) async((i + 1) % 2)
+#pragma acc parallel loop copy(m.b[0:N]) async(1)
for (j = 0; j < N; j++)
m.b[j]++;
}
@@ -40,10 +44,10 @@ main (int argc, char* argv[])
for (i = 0; i < N; i++)
{
- if (m.a[i] != 99)
- abort ();
- if (m.b[i] != 99)
- abort ();
+ if (m.a[i] != ITERATIONS)
+ abort ();
+ if (m.b[i] != ITERATIONS)
+ abort ();
}
free (m.a);
@@ -27,6 +27,9 @@ program main
if (acc_is_present (h) .neqv. .TRUE.) call abort
+ ! We must wait for the update to be done.
+ call acc_wait (async)
+
h(:) = 0
call acc_copyout_async (h, sizeof (h), async)
@@ -45,6 +48,8 @@ program main
if (acc_is_present (h) .neqv. .TRUE.) call abort
+ call acc_wait (async)
+
do i = 1, N
if (h(i) /= i + i) call abort
end do