[gomp4,1/3] OpenACC 2.0 support for libgomp - OpenACC runtime, NVidia PTX/CUDA plugin

Message ID	87bnowgoue.fsf@kepler.schwinge.homeip.net
State	New
Headers	show
Return-Path: <gcc-patches-return-382193-incoming=patchwork.ozlabs.org@gcc.gnu.org>
DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id
 :list-unsubscribe:list-archive:list-post:list-help:sender:from
 :to:cc:subject:in-reply-to:references:date:message-id
 :mime-version:content-type; q=dns; s=default; b=I7bPXlf0tcbZdLsm
 wjRJvOQsXeemkNm5bJ2smf4h3Hd+Ky5wKtHrg4XAJ82aWzf/iXijeP65uTbCOlVf
 /osqXv/qKBR0ItAb4KSxAhvEiihFdUMJ0s8Yq4WDGXnY0tm2m/Na+FFR5itLPzC2
 888cQL5Tqk9FmQh0eAgPo0wsgnc=
Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
Sender: gcc-patches-owner@gcc.gnu.org
From: Thomas Schwinge <thomas@codesourcery.com>
To: <gcc-patches@gcc.gnu.org>
CC: Julian Brown <julian@codesourcery.com>
Subject: Re: [gomp4] [1/3] OpenACC 2.0 support for libgomp - OpenACC runtime, NVidia PTX/CUDA plugin
In-Reply-To: <20141014171118.6cec6fb4@octopus>
References: <20141014171118.6cec6fb4@octopus>
User-Agent: Notmuch/0.9-101-g81dad07 (http://notmuchmail.org) Emacs/24.3.1 (i586-pc-linux-gnu)
Date: Tue, 28 Oct 2014 17:07:21 +0100
Message-ID: <87bnowgoue.fsf@kepler.schwinge.homeip.net>
MIME-Version: 1.0
Content-Type: multipart/signed; boundary="=-=-="; micalg=pgp-sha1; protocol="application/pgp-signature"

diff --git libgomp/ChangeLog.gomp libgomp/ChangeLog.gomp index 5363068..fda1cbc 100644 --- libgomp/ChangeLog.gomp +++ libgomp/ChangeLog.gomp @@ -1,3 +1,8 @@ +2014-10-28 Thomas Schwinge <thomas@codesourcery.com> + + * oacc-init.c: Don't use <sys/queue.h>'s SLIST_*. + * plugin-nvptx.c: Likewise. + 2014-10-23 Thomas Schwinge <thomas@codesourcery.com> * testsuite/libgomp.oacc-c/reduction-initial-1.c: New file. diff --git libgomp/oacc-init.c libgomp/oacc-init.c index f797f89..ffa9ad8 100644 --- libgomp/oacc-init.c +++ libgomp/oacc-init.c @@ -31,7 +31,6 @@ #include <stdlib.h> #include <strings.h> #include <stdbool.h> -#include <sys/queue.h> #include <stdio.h> gomp_mutex_t acc_device_lock; @@ -55,11 +54,11 @@ static __thread int handle_num = -1; struct ACC_context { struct memmap_t *ACC_memmap; void *ACC_handle; - SLIST_ENTRY(ACC_context) next; + + struct ACC_context *next; }; -static SLIST_HEAD(_ACC_contexts, ACC_context) _ACC_contexts; -static struct _ACC_contexts *ACC_contexts; +static struct ACC_context *ACC_contexts; static struct gomp_device_descr const *dispatchers[_ACC_device_hwm] = { 0 }; @@ -198,7 +197,7 @@ lazy_open (int ord) ACC_handle = ACC_dev->openacc.open_device_func (ord); handle_num = ord; - SLIST_FOREACH(acc_ctx, ACC_contexts, next) + for (acc_ctx = ACC_contexts; acc_ctx != NULL; acc_ctx = acc_ctx->next) { if (acc_ctx->ACC_handle == ACC_handle) { @@ -220,7 +219,8 @@ lazy_open (int ord) if (!ACC_memmap->mem_map.is_initialized) gomp_init_tables (ACC_dev, &ACC_memmap->mem_map); - SLIST_INSERT_HEAD(ACC_contexts, acc_ctx, next); + acc_ctx->next = ACC_contexts; + ACC_contexts = acc_ctx; } /* OpenACC 2.0a (3.2.12, 3.2.13) doesn't specify whether the serialization of @@ -259,12 +259,10 @@ _acc_shutdown (acc_device_t d) close_handle (); - while (SLIST_FIRST(ACC_contexts) != NULL) + while (ACC_contexts != NULL) { - struct ACC_context *c; - - c = SLIST_FIRST(ACC_contexts); - SLIST_REMOVE_HEAD(ACC_contexts, next); + struct ACC_context *c = ACC_contexts; + 
ACC_contexts = ACC_contexts->next; free (c); } @@ -467,8 +465,7 @@ ACC_runtime_initialize (void) { gomp_mutex_init (&acc_device_lock); - ACC_contexts = &_ACC_contexts; - SLIST_INIT (ACC_contexts); + ACC_contexts = NULL; } /* Compiler helper functions */ diff --git libgomp/plugin-nvptx.c libgomp/plugin-nvptx.c index f193229..33f868a 100644 --- libgomp/plugin-nvptx.c +++ libgomp/plugin-nvptx.c @@ -40,7 +40,6 @@ #include "libgomp-plugin.h" #include <cuda.h> -#include <sys/queue.h> #include <stdint.h> #include <string.h> #include <stdio.h> @@ -149,11 +148,9 @@ struct PTX_stream void *h_prev; void *h_tail; - SLIST_ENTRY(PTX_stream) next; + struct PTX_stream *next; }; -SLIST_HEAD(PTX_streams, PTX_stream); - /* Each thread may select a stream (also specific to a device/context). */ static __thread struct PTX_stream *current_stream; @@ -293,7 +290,7 @@ struct PTX_device /* All non-null streams associated with this device (actually context), either created implicitly or passed in from the user (via acc_set_cuda_stream). 
*/ - struct PTX_streams active_streams; + struct PTX_stream *active_streams; struct { struct PTX_stream **arr; int size; @@ -306,12 +303,12 @@ struct PTX_device bool concur; int mode; bool mkern; - SLIST_ENTRY(PTX_device) next; + + struct PTX_device *next; }; static __thread struct PTX_device *PTX_dev; -static SLIST_HEAD(_PTX_devices, PTX_device) _PTX_devices; -static struct _PTX_devices *PTX_devices; +static struct PTX_device *PTX_devices; enum PTX_event_type { @@ -327,12 +324,12 @@ struct PTX_event int type; void *addr; int ord; - SLIST_ENTRY(PTX_event) next; + + struct PTX_event *next; }; static gomp_mutex_t PTX_event_lock; -static SLIST_HEAD(_PTX_events, PTX_event) _PTX_events; -static struct _PTX_events *PTX_events; +static struct PTX_event *PTX_events; #define _XSTR(s) _STR(s) #define _STR(s) #s @@ -417,7 +414,7 @@ init_streams_for_device (struct PTX_device *ptx_dev, int concurrency) map_init (null_stream); ptx_dev->null_stream = null_stream; - SLIST_INIT (&ptx_dev->active_streams); + ptx_dev->active_streams = NULL; GOMP_PLUGIN_mutex_init (&ptx_dev->stream_lock); if (concurrency < 1) @@ -437,13 +434,13 @@ init_streams_for_device (struct PTX_device *ptx_dev, int concurrency) static void fini_streams_for_device (struct PTX_device *ptx_dev) { - struct PTX_stream *s; free (ptx_dev->async_streams.arr); - while (!SLIST_EMPTY (&ptx_dev->active_streams)) + while (ptx_dev->active_streams != NULL) { - s = SLIST_FIRST (&ptx_dev->active_streams); - SLIST_REMOVE_HEAD (&ptx_dev->active_streams, next); + struct PTX_stream *s = ptx_dev->active_streams; + ptx_dev->active_streams = ptx_dev->active_streams->next; + cuStreamDestroy (s->stream); map_fini (s); free (s); @@ -535,7 +532,8 @@ select_stream_for_async (int async, pthread_t thread, bool create, s->h = NULL; map_init (s); - SLIST_INSERT_HEAD (&ptx_dev->active_streams, s, next); + s->next = ptx_dev->active_streams; + ptx_dev->active_streams = s; ptx_dev->async_streams.arr[async] = s; } @@ -593,11 +591,8 @@ PTX_init (void) 
if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuInit error: %s", cuErrorMsg (r)); - PTX_devices = &_PTX_devices; - PTX_events = &_PTX_events; - - SLIST_INIT(PTX_devices); - SLIST_INIT(PTX_events); + PTX_devices = NULL; + PTX_events = NULL; GOMP_PLUGIN_mutex_init (&PTX_event_lock); @@ -625,7 +620,9 @@ PTX_open_device (int n) { struct PTX_device *ptx_device; - SLIST_FOREACH(ptx_device, PTX_devices, next) + for (ptx_device = PTX_devices; + ptx_device != NULL; + ptx_device = ptx_device->next) { if (ptx_device->ord == n) { @@ -653,7 +650,8 @@ PTX_open_device (int n) PTX_dev->dev = dev; PTX_dev->ctx_shared = false; - SLIST_INSERT_HEAD(PTX_devices, PTX_dev, next); + PTX_dev->next = PTX_devices; + PTX_devices = PTX_dev; r = cuCtxGetCurrent (&PTX_dev->ctx); if (r != CUDA_SUCCESS) @@ -729,7 +727,15 @@ PTX_close_device (void *h __attribute__((unused))) GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuErrorMsg (r)); } - SLIST_REMOVE(PTX_devices, PTX_dev, PTX_device, next); + if (PTX_devices == PTX_dev) + PTX_devices = PTX_devices->next; + else + { + struct PTX_device* d = PTX_devices; + while (d->next != PTX_dev) + d = d->next; + d->next = d->next->next; + } free (PTX_dev); PTX_dev = NULL; @@ -920,60 +926,67 @@ link_ptx (CUmodule *module, char *ptx_code) static void event_gc (bool memmap_lockable) { - struct PTX_event *ptx_event; + struct PTX_event *ptx_event = PTX_events; GOMP_PLUGIN_mutex_lock (&PTX_event_lock); - for (ptx_event = SLIST_FIRST (PTX_events); ptx_event;) + while (ptx_event != NULL) { CUresult r; - struct PTX_event *next = SLIST_NEXT (ptx_event, next); + struct PTX_event *e = ptx_event; - if (ptx_event->ord != PTX_dev->ord) - goto next_event; + ptx_event = ptx_event->next; - r = cuEventQuery (*ptx_event->evt); + if (e->ord != PTX_dev->ord) + continue; + + r = cuEventQuery (*e->evt); if (r == CUDA_SUCCESS) - { - CUevent *te; + { + CUevent *te; - te = ptx_event->evt; + te = e->evt; - switch (ptx_event->type) + switch (e->type) { case PTX_EVT_MEM: case PTX_EVT_SYNC: 
break; case PTX_EVT_KNL: - map_pop (ptx_event->addr); + map_pop (e->addr); break; case PTX_EVT_ASYNC_CLEANUP: - { - /* The function GOMP_PLUGIN_async_unmap_vars needs to claim the + { + /* The function GOMP_PLUGIN_async_unmap_vars needs to claim the memory-map splay tree lock for the current device, so we can't call it when one of our callers has already claimed the lock. In that case, just delay the GC for this event - until later. */ - if (!memmap_lockable) - goto next_event; + until later. */ + if (!memmap_lockable) + continue; - GOMP_PLUGIN_async_unmap_vars (ptx_event->addr); - } + GOMP_PLUGIN_async_unmap_vars (e->addr); + } break; } - cuEventDestroy (*te); - free ((void *)te); + cuEventDestroy (*te); + free ((void *)te); - SLIST_REMOVE (PTX_events, ptx_event, PTX_event, next); + if (PTX_events == e) + PTX_events = PTX_events->next; + else + { + struct PTX_event *e_ = PTX_events; + while (e_->next != e) + e_ = e_->next; + e_->next = e_->next->next; + } - free (ptx_event); - } - - next_event: - ptx_event = next; + free (e); + } } GOMP_PLUGIN_mutex_unlock (&PTX_event_lock); @@ -995,7 +1008,8 @@ event_add (enum PTX_event_type type, CUevent *e, void *h) GOMP_PLUGIN_mutex_lock (&PTX_event_lock); - SLIST_INSERT_HEAD(PTX_events, ptx_event, next); + ptx_event->next = PTX_events; + PTX_events = ptx_event; GOMP_PLUGIN_mutex_unlock (&PTX_event_lock); } @@ -1316,7 +1330,7 @@ PTX_async_test_all (void) GOMP_PLUGIN_mutex_lock (&PTX_dev->stream_lock); - SLIST_FOREACH (s, &PTX_dev->active_streams, next) + for (s = PTX_dev->active_streams; s != NULL; s = s->next) { if ((s->multithreaded || pthread_equal (s->host_thread, self)) && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY) @@ -1400,7 +1414,7 @@ PTX_wait_all (void) /* Wait for active streams initiated by this thread (or by multiple threads) to complete. 
*/ - SLIST_FOREACH (s, &PTX_dev->active_streams, next) + for (s = PTX_dev->active_streams; s != NULL; s = s->next) { if (s->multithreaded || pthread_equal (s->host_thread, self)) { @@ -1443,7 +1457,9 @@ PTX_wait_all_async (int async) GOMP_PLUGIN_mutex_lock (&PTX_dev->stream_lock); - SLIST_FOREACH (other_stream, &PTX_dev->active_streams, next) + for (other_stream = PTX_dev->active_streams; + other_stream != NULL; + other_stream = other_stream->next) { if (!other_stream->multithreaded && !pthread_equal (other_stream->host_thread, self)) @@ -1524,8 +1540,16 @@ PTX_set_cuda_stream (int async, void *stream) if (oldstream) { - SLIST_REMOVE (&PTX_dev->active_streams, oldstream, PTX_stream, next); - + if (PTX_dev->active_streams == oldstream) + PTX_dev->active_streams = PTX_dev->active_streams->next; + else + { + struct PTX_stream *s = PTX_dev->active_streams; + while (s->next != oldstream) + s = s->next; + s->next = s->next->next; + } + cuStreamDestroy (oldstream->stream); map_fini (oldstream); free (oldstream);

[gomp4,1/3] OpenACC 2.0 support for libgomp - OpenACC runtime, NVidia PTX/CUDA plugin

Commit Message

Patch