From 4bd844f3e0202b3d083f0784f4343570c88bb86c Mon Sep 17 00:00:00 2001
From: Thomas Schwinge <thomas@codesourcery.com>
Date: Mon, 20 Feb 2023 14:44:43 +0100
Subject: [PATCH] Attempt to not just register but allocate OpenMP pinned
memory using a device
... instead of 'mmap' plus attempting to register using a device.
Implemented for nvptx offloading via 'cuMemHostAlloc'.
This re-works og12 commit a5a4800e92773da7126c00a9c79b172494d58ab5
"Attempt to register OpenMP pinned memory using a device instead of 'mlock'".
include/
* cuda/cuda.h (cuMemHostRegister, cuMemHostUnregister): Remove.
libgomp/
* config/linux/allocator.c (linux_memspace_alloc): Add 'init0'
formal parameter. Adjust all users.
(linux_memspace_alloc, linux_memspace_free): Attempt to allocate
OpenMP pinned memory using a device instead of 'mmap' plus
attempting to register using a device.
* libgomp-plugin.h (GOMP_OFFLOAD_register_page_locked)
(GOMP_OFFLOAD_unregister_page_locked): Remove.
(GOMP_OFFLOAD_page_locked_host_alloc)
(GOMP_OFFLOAD_page_locked_host_free): New.
* libgomp.h (gomp_register_page_locked)
(gomp_unregister_page_locked): Remove.
(gomp_page_locked_host_alloc, gomp_page_locked_host_free): New.
(struct gomp_device_descr): Remove 'register_page_locked_func',
'unregister_page_locked_func'. Add 'page_locked_host_alloc_func',
'page_locked_host_free_func'.
* plugin/cuda-lib.def (cuMemHostRegister_v2, cuMemHostRegister)
(cuMemHostUnregister): Remove.
* plugin/plugin-nvptx.c (GOMP_OFFLOAD_register_page_locked)
(GOMP_OFFLOAD_unregister_page_locked): Remove.
(GOMP_OFFLOAD_page_locked_host_alloc)
(GOMP_OFFLOAD_page_locked_host_free): New.
* target.c (gomp_register_page_locked)
(gomp_unregister_page_locked): Remove.
(gomp_page_locked_host_alloc, gomp_page_locked_host_free): New.
(gomp_load_plugin_for_device): Don't handle
'register_page_locked', 'unregister_page_locked'. Handle
'page_locked_host_alloc', 'page_locked_host_free'.
Suggested-by: Andrew Stubbs <ams@codesourcery.com>
---
include/cuda/cuda.h | 3 --
libgomp/config/linux/allocator.c | 85 ++++++++++++++++++--------------
libgomp/libgomp-plugin.h | 4 +-
libgomp/libgomp.h | 8 +--
libgomp/plugin/cuda-lib.def | 3 --
libgomp/plugin/plugin-nvptx.c | 33 +++++++------
libgomp/target.c | 49 +++++++++---------
7 files changed, 98 insertions(+), 87 deletions(-)
@@ -183,9 +183,6 @@ CUresult cuMemAlloc (CUdeviceptr *, size_t);
CUresult cuMemAllocHost (void **, size_t);
CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
CUresult cuMemHostAlloc (void **, size_t, unsigned int);
-#define cuMemHostRegister cuMemHostRegister_v2
-CUresult cuMemHostRegister(void *, size_t, unsigned int);
-CUresult cuMemHostUnregister(void *);
CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t);
#define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
CUresult cuMemcpyDtoDAsync (CUdeviceptr, CUdeviceptr, size_t, CUstream);
@@ -25,8 +25,9 @@
/* Implement malloc routines that can handle pinned memory on Linux.
Given that pinned memory is typically used to help host <-> device memory
- transfers, we attempt to register such using a device (really: libgomp
- plugin), but fall back to mlock if no suitable device is available.
+ transfers, we attempt to allocate such memory using a device (really:
+ libgomp plugin), but fall back to mmap plus mlock if no suitable device is
+ available.
It's possible to use mlock on any heap memory, but using munlock is
problematic if there are multiple pinned allocations on the same page.
@@ -58,40 +59,36 @@ GOMP_enable_pinned_mode ()
always_pinned_mode = true;
}
-static int using_device_for_register_page_locked
+static int using_device_for_page_locked
= /* uninitialized */ -1;
static void *
-linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
+linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin,
+ bool init0)
{
- gomp_debug (0, "%s: memspace=%llu, size=%llu, pin=%d\n",
- __FUNCTION__, (unsigned long long) memspace, (unsigned long long) size, pin);
+ gomp_debug (0, "%s: memspace=%llu, size=%llu, pin=%d, init0=%d\n",
+ __FUNCTION__, (unsigned long long) memspace,
+ (unsigned long long) size, pin, init0);
/* Explicit pinning may not be required. */
pin = pin && !always_pinned_mode;
+ void *addr;
+
if (memspace == ompx_unified_shared_mem_space)
- {
- return gomp_usm_alloc (size, GOMP_DEVICE_ICV);
- }
+ addr = gomp_usm_alloc (size, GOMP_DEVICE_ICV);
else if (pin)
{
- /* 'mmap' zero-initializes, which 'linux_memspace_calloc' relies on. */
- void *addr = mmap (NULL, size, PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
- if (addr == MAP_FAILED)
- return NULL;
-
int using_device
- = __atomic_load_n (&using_device_for_register_page_locked,
+ = __atomic_load_n (&using_device_for_page_locked,
MEMMODEL_RELAXED);
gomp_debug (0, " using_device=%d\n",
using_device);
if (using_device != 0)
{
- using_device = gomp_register_page_locked (addr, size);
+ using_device = gomp_page_locked_host_alloc (&addr, size);
int using_device_old
- = __atomic_exchange_n (&using_device_for_register_page_locked,
+ = __atomic_exchange_n (&using_device_for_page_locked,
using_device, MEMMODEL_RELAXED);
gomp_debug (0, " using_device=%d, using_device_old=%d\n",
using_device, using_device_old);
@@ -101,19 +98,37 @@ linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
}
if (using_device == 0)
{
- gomp_debug (0, " mlock\n");
- if (mlock (addr, size))
+ gomp_debug (0, " mmap\n");
+ addr = mmap (NULL, size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (addr == MAP_FAILED)
+ addr = NULL;
+ else
{
- gomp_debug (0, "libgomp: failed to pin memory (ulimit too low?)\n");
- munmap (addr, size);
- return NULL;
+ /* 'mmap' zero-initializes. */
+ init0 = false;
+
+ gomp_debug (0, " mlock\n");
+ if (mlock (addr, size))
+ {
+ gomp_debug (0, "libgomp: failed to pin memory"
+ " (ulimit too low?)\n");
+ munmap (addr, size);
+ addr = NULL;
+ }
}
}
-
- return addr;
}
else
- return malloc (size);
+ addr = malloc (size);
+
+ if (addr && init0)
+ {
+ gomp_debug (0, " init0\n");
+ memset (addr, 0, size);
+ }
+
+ return addr;
}
static void *
@@ -132,8 +147,7 @@ linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
return ret;
}
else if (pin)
- /* If PINned, 'linux_memspace_alloc' 'mmap's, which zero-initializes. */
- return linux_memspace_alloc (memspace, size, pin);
+ return linux_memspace_alloc (memspace, size, pin, true);
else
return calloc (1, size);
}
@@ -153,16 +167,15 @@ linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size,
else if (pin)
{
int using_device
- = __atomic_load_n (&using_device_for_register_page_locked,
+ = __atomic_load_n (&using_device_for_page_locked,
MEMMODEL_RELAXED);
gomp_debug (0, " using_device=%d\n",
using_device);
if (using_device == 1)
- gomp_unregister_page_locked (addr, size);
+ gomp_page_locked_host_free (addr);
else
/* 'munlock'ing is implicit with following 'munmap'. */
- ;
- munmap (addr, size);
+ munmap (addr, size);
}
else
free (addr);
@@ -183,9 +196,9 @@ linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
else if (oldpin && pin)
{
/* We can only expect to be able to just 'mremap' if not using a device
- for registering page-locked memory. */
+ for page-locked memory. */
int using_device
- = __atomic_load_n (&using_device_for_register_page_locked,
+ = __atomic_load_n (&using_device_for_page_locked,
MEMMODEL_RELAXED);
gomp_debug (0, " using_device=%d\n",
using_device);
@@ -205,7 +218,7 @@ linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
return realloc (addr, size);
manual_realloc:
- void *newaddr = linux_memspace_alloc (memspace, size, pin);
+ void *newaddr = linux_memspace_alloc (memspace, size, pin, false);
if (newaddr)
{
memcpy (newaddr, addr, oldsize < size ? oldsize : size);
@@ -216,7 +229,7 @@ manual_realloc:
}
#define MEMSPACE_ALLOC(MEMSPACE, SIZE, PIN) \
- linux_memspace_alloc (MEMSPACE, SIZE, PIN)
+ linux_memspace_alloc (MEMSPACE, SIZE, PIN, false)
#define MEMSPACE_CALLOC(MEMSPACE, SIZE, PIN) \
linux_memspace_calloc (MEMSPACE, SIZE, PIN)
#define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE, OLDPIN, PIN) \
@@ -144,8 +144,8 @@ extern bool GOMP_OFFLOAD_free (int, void *);
extern void *GOMP_OFFLOAD_usm_alloc (int, size_t);
extern bool GOMP_OFFLOAD_usm_free (int, void *);
extern bool GOMP_OFFLOAD_is_usm_ptr (void *);
-extern bool GOMP_OFFLOAD_register_page_locked (void *, size_t);
-extern bool GOMP_OFFLOAD_unregister_page_locked (void *, size_t);
+extern bool GOMP_OFFLOAD_page_locked_host_alloc (void **, size_t);
+extern bool GOMP_OFFLOAD_page_locked_host_free (void *);
extern bool GOMP_OFFLOAD_dev2host (int, void *, const void *, size_t);
extern bool GOMP_OFFLOAD_host2dev (int, void *, const void *, size_t);
extern bool GOMP_OFFLOAD_dev2dev (int, void *, const void *, size_t);
@@ -1133,8 +1133,8 @@ extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
void *);
extern void * gomp_usm_alloc (size_t size, int device_num);
extern void gomp_usm_free (void *device_ptr, int device_num);
-extern bool gomp_register_page_locked (void *, size_t);
-extern void gomp_unregister_page_locked (void *, size_t);
+extern bool gomp_page_locked_host_alloc (void **, size_t);
+extern void gomp_page_locked_host_free (void *);
/* Splay tree definitions. */
typedef struct splay_tree_node_s *splay_tree_node;
@@ -1394,8 +1394,8 @@ struct gomp_device_descr
__typeof (GOMP_OFFLOAD_usm_alloc) *usm_alloc_func;
__typeof (GOMP_OFFLOAD_usm_free) *usm_free_func;
__typeof (GOMP_OFFLOAD_is_usm_ptr) *is_usm_ptr_func;
- __typeof (GOMP_OFFLOAD_register_page_locked) *register_page_locked_func;
- __typeof (GOMP_OFFLOAD_unregister_page_locked) *unregister_page_locked_func;
+ __typeof (GOMP_OFFLOAD_page_locked_host_alloc) *page_locked_host_alloc_func;
+ __typeof (GOMP_OFFLOAD_page_locked_host_free) *page_locked_host_free_func;
__typeof (GOMP_OFFLOAD_dev2host) *dev2host_func;
__typeof (GOMP_OFFLOAD_host2dev) *host2dev_func;
__typeof (GOMP_OFFLOAD_dev2dev) *dev2dev_func;
@@ -31,9 +31,6 @@ CUDA_ONE_CALL (cuMemAlloc)
CUDA_ONE_CALL (cuMemAllocHost)
CUDA_ONE_CALL (cuMemAllocManaged)
CUDA_ONE_CALL (cuMemHostAlloc)
-CUDA_ONE_CALL_MAYBE_NULL (cuMemHostRegister_v2)
-CUDA_ONE_CALL (cuMemHostRegister)
-CUDA_ONE_CALL (cuMemHostUnregister)
CUDA_ONE_CALL (cuMemcpy)
CUDA_ONE_CALL (cuMemcpyDtoDAsync)
CUDA_ONE_CALL (cuMemcpyDtoH)
@@ -77,14 +77,11 @@ extern CUresult cuGetErrorString (CUresult, const char **);
CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
-#undef cuMemHostRegister
-CUresult cuMemHostRegister (void *, size_t, unsigned int);
#else
typedef size_t (*CUoccupancyB2DSize)(int);
CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
-CUresult cuMemHostRegister_v2 (void *, size_t, unsigned int);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
CUoccupancyB2DSize, size_t, int);
#endif
@@ -1709,30 +1706,36 @@ GOMP_OFFLOAD_is_usm_ptr (void *ptr)
bool
-GOMP_OFFLOAD_register_page_locked (void *ptr, size_t size)
+GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size)
{
GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu\n",
__FUNCTION__, ptr, (unsigned long long) size);
+ CUresult r;
+
unsigned int flags = 0;
/* Given 'CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING', we don't need
- 'flags |= CU_MEMHOSTREGISTER_PORTABLE;' here. */
- if (CUDA_CALL_EXISTS (cuMemHostRegister_v2))
- CUDA_CALL (cuMemHostRegister_v2, ptr, size, flags);
- else
- CUDA_CALL (cuMemHostRegister, ptr, size, flags);
-
+ 'flags |= CU_MEMHOSTALLOC_PORTABLE;' here. */
+ r = CUDA_CALL_NOCHECK (cuMemHostAlloc, ptr, size, flags);
+ if (r == CUDA_ERROR_OUT_OF_MEMORY)
+ *ptr = NULL;
+ else if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("cuMemHostAlloc error: %s", cuda_error (r));
+ return false;
+ }
+ GOMP_PLUGIN_debug (0, " -> *ptr=%p\n",
+ *ptr);
return true;
}
bool
-GOMP_OFFLOAD_unregister_page_locked (void *ptr, size_t size)
+GOMP_OFFLOAD_page_locked_host_free (void *ptr)
{
- GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu\n",
- __FUNCTION__, ptr, (unsigned long long) size);
-
- CUDA_CALL (cuMemHostUnregister, ptr);
+ GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p\n",
+ __FUNCTION__, ptr);
+ CUDA_CALL (cuMemFreeHost, ptr);
return true;
}
@@ -4585,15 +4585,15 @@ gomp_usm_free (void *device_ptr, int device_num)
}
-/* Device (really: libgomp plugin) for registering paged-locked memory. We
+/* Device (really: libgomp plugin) to use for page-locked memory. We
assume there is either none or exactly one such device for the lifetime of
the process. */
-static struct gomp_device_descr *device_for_register_page_locked
+static struct gomp_device_descr *device_for_page_locked
= /* uninitialized */ (void *) -1;
static struct gomp_device_descr *
-get_device_for_register_page_locked (void)
+get_device_for_page_locked (void)
{
gomp_debug (0, "%s\n",
__FUNCTION__);
@@ -4601,7 +4601,7 @@ get_device_for_register_page_locked (void)
struct gomp_device_descr *device;
#ifdef HAVE_SYNC_BUILTINS
device
- = __atomic_load_n (&device_for_register_page_locked, MEMMODEL_RELAXED);
+ = __atomic_load_n (&device_for_page_locked, MEMMODEL_RELAXED);
if (device == (void *) -1)
{
gomp_debug (0, " init\n");
@@ -4621,7 +4621,7 @@ get_device_for_register_page_locked (void)
if (devices[i].target_id != 0)
continue;
- if (!devices[i].register_page_locked_func)
+ if (!devices[i].page_locked_host_alloc_func)
continue;
gomp_debug (0, " found device: %p (%s)\n",
@@ -4629,16 +4629,16 @@ get_device_for_register_page_locked (void)
if (device)
gomp_fatal ("Unclear how %s and %s libgomp plugins may"
" simultaneously provide functionality"
- " to register page-locked memory",
+ " for page-locked memory",
device->name, devices[i].name);
else
device = &devices[i];
}
struct gomp_device_descr *device_old
- = __atomic_exchange_n (&device_for_register_page_locked, device,
+ = __atomic_exchange_n (&device_for_page_locked, device,
MEMMODEL_RELAXED);
- gomp_debug (0, " old device_for_register_page_locked: %p\n",
+ gomp_debug (0, " old device_for_page_locked: %p\n",
device_old);
assert (device_old == (void *) -1
/* We shouldn't have concurrently found a different or no
@@ -4647,7 +4647,7 @@ get_device_for_register_page_locked (void)
}
#else /* !HAVE_SYNC_BUILTINS */
gomp_debug (0, " not implemented for '!HAVE_SYNC_BUILTINS'\n");
- (void) &device_for_register_page_locked;
+ (void) &device_for_page_locked;
device = NULL;
#endif /* HAVE_SYNC_BUILTINS */
@@ -4656,16 +4656,16 @@ get_device_for_register_page_locked (void)
return device;
}
-/* Register page-locked memory region.
+/* Allocate page-locked host memory.
Returns whether we have a device capable of that. */
attribute_hidden bool
-gomp_register_page_locked (void *ptr, size_t size)
+gomp_page_locked_host_alloc (void **ptr, size_t size)
{
gomp_debug (0, "%s: ptr=%p, size=%llu\n",
__FUNCTION__, ptr, (unsigned long long) size);
- struct gomp_device_descr *device = get_device_for_register_page_locked ();
+ struct gomp_device_descr *device = get_device_for_page_locked ();
gomp_debug (0, " device=%p (%s)\n",
device, device ? device->name : "[none]");
if (device)
@@ -4676,29 +4676,30 @@ gomp_register_page_locked (void *ptr, size_t size)
else if (device->state == GOMP_DEVICE_FINALIZED)
{
gomp_mutex_unlock (&device->lock);
- gomp_fatal ("Device %s for registering page-locked memory"
- " is finalized", device->name);
+ gomp_fatal ("Device %s used for for page-locked memory is finalized",
+ device->name);
}
gomp_mutex_unlock (&device->lock);
- if (!device->register_page_locked_func (ptr, size))
- gomp_fatal ("Failed to register page-locked memory"
+ if (!device->page_locked_host_alloc_func (ptr, size))
+ gomp_fatal ("Failed to allocate page-locked host memory"
" via %s libgomp plugin",
device->name);
}
return device != NULL;
}
-/* Unregister page-locked memory region.
- This must only be called if 'gomp_register_page_locked' returned 'true'. */
+/* Free page-locked host memory.
+ This must only be called if 'gomp_page_locked_host_alloc' returned
+ 'true'. */
attribute_hidden void
-gomp_unregister_page_locked (void *ptr, size_t size)
+gomp_page_locked_host_free (void *ptr)
{
gomp_debug (0, "%s: ptr=%p\n",
__FUNCTION__, ptr);
- struct gomp_device_descr *device = get_device_for_register_page_locked ();
+ struct gomp_device_descr *device = get_device_for_page_locked ();
gomp_debug (0, " device=%p (%s)\n",
device, device ? device->name : "[none]");
assert (device);
@@ -4712,8 +4713,8 @@ gomp_unregister_page_locked (void *ptr, size_t size)
}
gomp_mutex_unlock (&device->lock);
- if (!device->unregister_page_locked_func (ptr, size))
- gomp_fatal ("Failed to unregister page-locked memory"
+ if (!device->page_locked_host_free_func (ptr))
+ gomp_fatal ("Failed to free page-locked host memory"
" via %s libgomp plugin",
device->name);
}
@@ -5403,8 +5404,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
DLSYM_OPT (usm_alloc, usm_alloc);
DLSYM_OPT (usm_free, usm_free);
DLSYM_OPT (is_usm_ptr, is_usm_ptr);
- DLSYM_OPT (register_page_locked, register_page_locked);
- DLSYM_OPT (unregister_page_locked, unregister_page_locked);
+ DLSYM_OPT (page_locked_host_alloc, page_locked_host_alloc);
+ DLSYM_OPT (page_locked_host_free, page_locked_host_free);
DLSYM (dev2host);
DLSYM (host2dev);
DLSYM (evaluate_device);
--
2.25.1