@@ -70,7 +70,7 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c error.c \
target.c splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c \
oacc-init.c oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c \
priority_queue.c affinity-fmt.c teams.c allocator.c oacc-profiling.c \
- oacc-target.c target-indirect.c
+ oacc-target.c target-indirect.c usmpin-allocator.c
include $(top_srcdir)/plugin/Makefrag.am
@@ -219,7 +219,8 @@ am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \
oacc-parallel.lo oacc-host.lo oacc-init.lo oacc-mem.lo \
oacc-async.lo oacc-plugin.lo oacc-cuda.lo priority_queue.lo \
affinity-fmt.lo teams.lo allocator.lo oacc-profiling.lo \
- oacc-target.lo target-indirect.lo $(am__objects_1)
+ oacc-target.lo target-indirect.lo usmpin-allocator.lo \
+ $(am__objects_1)
libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
AM_V_P = $(am__v_P_@AM_V@)
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
@@ -552,7 +553,8 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \
oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \
oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
affinity-fmt.c teams.c allocator.c oacc-profiling.c \
- oacc-target.c target-indirect.c $(am__append_3)
+ oacc-target.c target-indirect.c usmpin-allocator.c \
+ $(am__append_3)
# Nvidia PTX OpenACC plugin.
@PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION)
@@ -786,6 +788,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/team.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/teams.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/time.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/usmpin-allocator.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/work.Plo@am__quote@
.c.o:
@@ -53,6 +53,7 @@
#define _GNU_SOURCE
#include <sys/mman.h>
+#include <unistd.h>
#include <string.h>
#include <assert.h>
#include "libgomp.h"
@@ -77,6 +78,16 @@ GOMP_enable_pinned_mode ()
static int using_device_for_page_locked
= /* uninitialized */ -1;
+
+static usmpin_ctx_p pin_ctx = NULL;
+static pthread_once_t ctxlock = PTHREAD_ONCE_INIT;
+
+/* One-time initializer for 'pin_ctx'. It is passed to 'pthread_once'
+ together with 'ctxlock', so the usmpin context is created at most once. */
+static void
+linux_init_pin_ctx ()
+{
+ pin_ctx = usmpin_init_context ();
+}
+
static void *
linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin,
bool init0)
@@ -85,7 +96,7 @@ linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin,
__FUNCTION__, (unsigned long long) memspace,
(unsigned long long) size, pin, init0);
- void *addr;
+ void *addr = NULL;
/* Explicit pinning may not be required. */
pin = pin && !always_pinned_mode;
@@ -111,28 +122,51 @@ linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin,
}
if (using_device == 0)
{
- gomp_debug (0, " mmap\n");
- addr = mmap (NULL, size, PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
- if (addr == MAP_FAILED)
- addr = NULL;
- else
- {
- /* 'mmap' zero-initializes. */
- init0 = false;
+ static int pagesize = 0;
+ static void *addrhint = NULL;
- gomp_debug (0, " mlock\n");
- if (mlock (addr, size))
+ if (!pagesize)
+ pagesize = sysconf(_SC_PAGE_SIZE);
+
+ while (1)
+ {
+ addr = usmpin_alloc (pin_ctx, size);
+ if (addr)
+ break;
+
+ gomp_debug (0, " mmap\n");
+
+ /* Round up to a whole page. */
+ size_t misalignment = size % pagesize;
+ size_t mmap_size = (misalignment > 0
+ ? size + pagesize - misalignment
+ : size);
+ void *newpage = mmap (addrhint, mmap_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (newpage == MAP_FAILED)
+ break;
+ else
{
+ gomp_debug (0, " mlock\n");
+ if (mlock (newpage, size))
+ {
#ifdef HAVE_INTTYPES_H
- gomp_debug (0, "libgomp: failed to pin %"PRIu64" bytes of"
- " memory (ulimit too low?)\n", (uint64_t) size);
+ gomp_debug (0, "libgomp: failed to pin %"PRIu64" bytes"
+ " of memory (ulimit too low?)\n",
+ (uint64_t) size);
#else
- gomp_debug (0, "libgomp: failed to pin %lu bytes of memory"
- " (ulimit too low?)\n", (unsigned long) size);
+ gomp_debug (0, "libgomp: failed to pin %lu bytes of"
+ " memory (ulimit too low?)\n",
+ (unsigned long) size);
#endif
- munmap (addr, size);
- addr = NULL;
+ munmap (newpage, size);
+ break;
+ }
+
+ addrhint = newpage + mmap_size;
+
+ pthread_once (&ctxlock, linux_init_pin_ctx);
+ usmpin_register_memory (pin_ctx, newpage, mmap_size);
}
}
}
@@ -184,8 +218,7 @@ linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size,
if (using_device == 1)
gomp_page_locked_host_free (addr);
else
- /* 'munlock'ing is implicit with following 'munmap'. */
- munmap (addr, size);
+ usmpin_free (pin_ctx, addr);
}
else
free (addr);
@@ -203,29 +236,29 @@ linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
if (oldpin && pin)
{
- /* We can only expect to be able to just 'mremap' if not using a device
- for page-locked memory. */
int using_device
= __atomic_load_n (&using_device_for_page_locked,
MEMMODEL_RELAXED);
gomp_debug (0, " using_device=%d\n",
using_device);
- if (using_device != 0)
- goto manual_realloc;
-
- gomp_debug (0, " mremap\n");
- void *newaddr = mremap (addr, oldsize, size, MREMAP_MAYMOVE);
- if (newaddr == MAP_FAILED)
- return NULL;
- return newaddr;
+ /* The device plugin API does not support realloc,
+ but the usmpin allocator does. */
+ if (using_device == 0)
+ {
+ /* This can fail if there is insufficient pinned memory free. */
+ void *newaddr = usmpin_realloc (pin_ctx, addr, size);
+ if (newaddr)
+ return newaddr;
+ }
}
else if (oldpin || pin)
- goto manual_realloc;
+ /* Moving from pinned to unpinned memory cannot be done in-place. */
+ ;
else
return realloc (addr, size);
-manual_realloc:;
+ /* In-place reallocation failed. Fall back to copy. */
void *newaddr = linux_memspace_alloc (memspace, size, pin, false);
if (newaddr)
{
@@ -1658,4 +1658,14 @@ gomp_thread_to_pthread_t (struct gomp_thread *thr)
}
#endif
+/* usmpin-allocator.c
+
+ Simple sub-allocator that hands out pieces of memory regions registered
+ by the caller.
+
+ usmpin_init_context: returns a new zero-filled context, or NULL if the
+ context itself cannot be allocated.
+ usmpin_register_memory: adds SIZE bytes starting at BASE to the free
+ space of CTX; does nothing if CTX or BASE is NULL.
+ usmpin_alloc: returns a block of at least SIZE bytes (rounded up to a
+ multiple of 128) taken from the free space, or NULL if CTX is NULL or
+ no free chunk is large enough.
+ usmpin_free: returns the block starting at ADDR to the free space; does
+ nothing if CTX is NULL and reports a fatal error if ADDR is not a
+ live allocation.
+ usmpin_realloc: resizes the block at ADDR in place where possible,
+ otherwise allocates a new block, copies the data and frees the old
+ one; returns NULL if CTX is NULL or the new block cannot be
+ allocated. */
+
+typedef struct usmpin_context *usmpin_ctx_p;
+
+usmpin_ctx_p usmpin_init_context ();
+void usmpin_register_memory (usmpin_ctx_p ctx, char *base, size_t size);
+void *usmpin_alloc (usmpin_ctx_p ctx, size_t size);
+void usmpin_free (usmpin_ctx_p ctx, void *addr);
+void *usmpin_realloc (usmpin_ctx_p ctx, void *addr, size_t newsize);
+
#endif /* LIBGOMP_H */
new file mode 100644
@@ -0,0 +1,122 @@
+/* { dg-do run } */
+
+/* { dg-skip-if "Pinning not implemented on this host" { ! *-*-linux-gnu* } } */
+
+/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
+
+/* Test that pinned memory works for small allocations. */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef __linux__
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <sys/mman.h>
+#include <sys/resource.h>
+
+/* System page size, queried at run time through sysconf. */
+#define PAGE_SIZE sysconf(_SC_PAGESIZE)
+/* Print a warning on stderr if RLIMIT_MEMLOCK cannot be read or its soft
+ limit is not greater than SIZE. This is a diagnostic only: execution
+ continues afterwards. */
+#define CHECK_SIZE(SIZE) { \
+ struct rlimit limit; \
+ if (getrlimit (RLIMIT_MEMLOCK, &limit) \
+ || limit.rlim_cur <= SIZE) \
+ fprintf (stderr, "insufficient lockable memory; please increase ulimit\n"); \
+ }
+
+/* Return the number that follows "VmLck:" in /proc/<pid>/status for the
+   calling process.  Aborts if the file cannot be opened or if no such line
+   is found.  */
+int
+get_pinned_mem ()
+{
+  char buf[100];
+  snprintf (buf, sizeof (buf), "/proc/%d/status", (int) getpid ());
+
+  FILE *proc = fopen (buf, "r");
+  if (!proc)
+    abort ();
+  while (fgets (buf, sizeof (buf), proc))
+    {
+      int val;
+      /* Require exactly one successful conversion: sscanf returns EOF
+         (which is non-zero) on an input failure, and that must not be
+         mistaken for a match that filled in 'val'.  */
+      if (sscanf (buf, "VmLck: %d", &val) == 1)
+        {
+          fclose (proc);
+          return val;
+        }
+    }
+  /* No "VmLck:" line was found.  */
+  fclose (proc);
+  abort ();
+}
+#else
+#error "OS unsupported"
+#endif
+
+/* Abort unless each of the first S bytes starting at P is zero.  */
+static void
+verify0 (char *p, size_t s)
+{
+  const char *cursor = p;
+  size_t remaining = s;
+
+  while (remaining > 0)
+    {
+      if (*cursor != 0)
+        abort ();
+      ++cursor;
+      --remaining;
+    }
+}
+
+#include <omp.h>
+
+/* Allocate, reallocate and calloc a few bytes from a pinned allocator and
+   check, through the process's VmLck figure, that the small follow-up
+   requests do not lock any additional memory.  Every allocation result is
+   checked so that a failed request aborts the test instead of passing
+   silently or being dereferenced.  */
+int
+main ()
+{
+  /* Choose a small size where all our allocations fit on one page.  */
+  const int SIZE = 10;
+#ifndef OFFLOAD_DEVICE_NVPTX
+  CHECK_SIZE (SIZE*4);
+#endif
+
+  const omp_alloctrait_t traits[] = {
+      { omp_atk_pinned, 1 }
+  };
+  omp_allocator_handle_t allocator = omp_init_allocator (omp_default_mem_space, 1, traits);
+
+  // Sanity check
+  if (get_pinned_mem () != 0)
+    abort ();
+
+  void *p = omp_alloc (SIZE, allocator);
+  if (!p)
+    abort ();
+
+  int amount = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount != 0)
+    abort ();
+#else
+  if (amount == 0)
+    abort ();
+#endif
+
+  p = omp_realloc (p, SIZE * 2, allocator, allocator);
+  /* A failed reallocation must fail the test.  */
+  if (!p)
+    abort ();
+
+  int amount2 = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount2 != 0)
+    abort ();
+#else
+  /* A small allocation should not allocate another page.  */
+  if (amount2 != amount)
+    abort ();
+#endif
+
+  p = omp_calloc (1, SIZE, allocator);
+  /* 'verify0' below dereferences the result, so reject NULL first.  */
+  if (!p)
+    abort ();
+
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (get_pinned_mem () != 0)
+    abort ();
+#else
+  /* A small allocation should not allocate another page.  */
+  if (get_pinned_mem () != amount2)
+    abort ();
+#endif
+
+  verify0 (p, SIZE);
+
+  return 0;
+}
new file mode 100644
@@ -0,0 +1,319 @@
+/* Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This is a simple "malloc" implementation intended for use with Unified
+ Shared Memory and Pinned Memory. It allocates memory from a pool allocated
+ and configured by the device plugin (for USM), or the OS-specific allocator
+ (for pinned).
+
+ This implementation keeps the allocated/free chain in a side-table (splay
+ tree) to ensure that the allocation routine does not migrate all the USM
+ pages back into host memory. Keeping the meta-data elsewhere is also useful
+ for pinned memory, which is typically an extremely limited resource. */
+
+#include <string.h>
+#include "libgomp.h"
+
+/* Use a splay tree to track allocations. */
+
+typedef struct usmpin_splay_tree_node_s *usmpin_splay_tree_node;
+typedef struct usmpin_splay_tree_s *usmpin_splay_tree;
+typedef struct usmpin_splay_tree_key_s *usmpin_splay_tree_key;
+
+/* Key describing one contiguous address range.  */
+struct usmpin_splay_tree_key_s {
+  void *base;   /* Start address of the range.  */
+  size_t size;  /* Length of the range.  */
+};
+
+/* Three-way comparison of two keys by base address only; the size field
+   takes no part in the ordering.  Returns 0 when the bases are equal, 1 when
+   X starts above Y and -1 otherwise.  */
+static inline int
+usmpin_splay_compare (usmpin_splay_tree_key x, usmpin_splay_tree_key y)
+{
+  if (x->base == y->base)
+    return 0;
+  if (x->base > y->base)
+    return 1;
+  return -1;
+}
+
+#define splay_tree_prefix usmpin
+#include "splay-tree.h"
+
+/* 128-byte granularity means GPU cache-line aligned. */
+#define ALIGN(VAR) (((VAR) + 127) & ~127)
+
+/* The context data prevents the need for global state. */
+struct usmpin_context {
+ int lock; /* Spinlock word: set to 1 while held, 0 when released. */
+ struct usmpin_splay_tree_s allocations; /* Ranges handed out by usmpin_alloc. */
+ struct usmpin_splay_tree_s free_space; /* Registered ranges not currently allocated. */
+};
+
+/* Create an allocator context with no memory registered.  The object is
+   zero-filled by calloc; NULL is returned if that allocation fails.  */
+usmpin_ctx_p
+usmpin_init_context ()
+{
+  usmpin_ctx_p fresh = calloc (1, sizeof (struct usmpin_context));
+  return fresh;
+}
+
+/* Coalesce contiguous free space into one entry. This considers the entries
+ either side of the root node only, so it should be called each time a new
+ entry is inserted into the root. The root of ctx->free_space is
+ dereferenced unconditionally, so the tree must not be empty. The caller
+ is expected to hold ctx->lock (every caller in this file does). */
+
+static void
+usmpin_coalesce_free_space (usmpin_ctx_p ctx)
+{
+ usmpin_splay_tree_node prev, next, node = ctx->free_space.root;
+
+ /* PREV: rightmost node of the root's left subtree, or NULL. */
+ for (prev = node->left; prev && prev->right; prev = prev->right)
+ ;
+ /* NEXT: leftmost node of the root's right subtree, or NULL. */
+ for (next = node->right; next && next->left; next = next->left)
+ ;
+
+ /* Coalesce adjacent free chunks. When both neighbours are adjacent, NODE
+ first absorbs NEXT and is then itself absorbed into PREV. */
+ if (next
+ && node->key.base + node->key.size == next->key.base)
+ {
+ /* Free chunk follows. */
+ node->key.size += next->key.size;
+ usmpin_splay_tree_remove (&ctx->free_space, &next->key);
+ free (next);
+ }
+ if (prev
+ && prev->key.base + prev->key.size == node->key.base)
+ {
+ /* Free chunk precedes. */
+ prev->key.size += node->key.size;
+ usmpin_splay_tree_remove (&ctx->free_space, &node->key);
+ free (node);
+ }
+}
+
+/* Add a new memory region into the free chain.  This is how the USM heap is
+   initialized and extended.  If the new region is contiguous with an existing
+   region then any free space will be coalesced.
+
+   CTX is the allocator context; BASE and SIZE describe the region to add.
+   The call does nothing when CTX or BASE is NULL.  Failure to allocate the
+   bookkeeping node is reported as a fatal error rather than dereferencing a
+   NULL pointer.  */
+
+void
+usmpin_register_memory (usmpin_ctx_p ctx, char *base, size_t size)
+{
+  if (base == NULL || ctx == NULL)
+    return;
+
+  /* Build the tree node before taking the lock, so that the spinlock is
+     held only for the tree update itself.  */
+  usmpin_splay_tree_node node = malloc (sizeof (*node));
+  if (node == NULL)
+    GOMP_PLUGIN_fatal ("usmpin: out of memory while registering memory");
+  node->key.base = base;
+  node->key.size = size;
+  node->left = NULL;
+  node->right = NULL;
+
+  /* Acquire the lock.  */
+  while (__atomic_exchange_n (&ctx->lock, 1, MEMMODEL_ACQUIRE) == 1)
+    ;
+
+  usmpin_splay_tree_insert (&ctx->free_space, node);
+  usmpin_coalesce_free_space (ctx);
+
+  /* Release the lock.  */
+  __atomic_store_n (&ctx->lock, 0, MEMMODEL_RELEASE);
+}
+
+/* Tree-walk callback used by usmpin_alloc: it picks the first free chunk
+   visited that is big enough for the request.  Because the walk may begin
+   in the middle of the tree, "first" need not mean "leftmost".  */
+
+struct usmpin_callback_data {
+  size_t size;                   /* In: size required.  */
+  usmpin_splay_tree_node found;  /* Out: the chunk selected, if any.  */
+};
+
+/* Returns 1 after recording KEY's node in DATA when the chunk is large
+   enough, and 0 otherwise.  The key pointer is converted straight to a node
+   pointer, which assumes the key is the first member of the node structure
+   (defined in splay-tree.h -- TODO confirm).  */
+static int
+usmpin_alloc_callback (usmpin_splay_tree_key key, void *data)
+{
+  struct usmpin_callback_data *request = (struct usmpin_callback_data *)data;
+
+  if (key->size < request->size)
+    return 0;
+
+  request->found = (usmpin_splay_tree_node)key;
+  return 1;
+}
+
+/* USM "malloc".  Selects and moves an address range from ctx->free_space to
+   ctx->allocations, while leaving any excess in ctx->free_space.
+
+   SIZE is rounded up to the allocation granularity (see ALIGN).  A request
+   for zero bytes is treated as one granule, so that every successful call
+   consumes free space and returns a distinct address.  Returns NULL if CTX
+   is NULL, if rounding SIZE up overflows, if no free chunk is large enough,
+   or if the bookkeeping node cannot be allocated.  */
+
+void *
+usmpin_alloc (usmpin_ctx_p ctx, size_t size)
+{
+  if (ctx == NULL)
+    return NULL;
+
+  /* Memory is allocated in N-byte granularity.  */
+  size_t request = size ? size : 1;
+  size = ALIGN (request);
+  if (size < request)
+    /* Rounding up wrapped around; the request cannot be satisfied.  */
+    return NULL;
+
+  /* Acquire the lock.  */
+  while (__atomic_exchange_n (&ctx->lock, 1, MEMMODEL_ACQUIRE) == 1)
+    ;
+
+  if (!ctx->free_space.root)
+    {
+      /* No memory registered, or no free space.  */
+      __atomic_store_n (&ctx->lock, 0, MEMMODEL_RELEASE);
+      return NULL;
+    }
+
+  /* Find a suitable free block.  */
+  struct usmpin_callback_data cbd = {size, NULL};
+  usmpin_splay_tree_foreach_lazy (&ctx->free_space, usmpin_alloc_callback,
+                                  &cbd);
+  usmpin_splay_tree_node freenode = cbd.found;
+
+  /* Only allocate the bookkeeping node once a chunk has been found.  If
+     that allocation fails the free chain is left untouched and NULL is
+     returned.  */
+  void *result = NULL;
+  usmpin_splay_tree_node allocnode = NULL;
+  if (freenode)
+    allocnode = malloc (sizeof (*allocnode));
+
+  if (allocnode)
+    {
+      /* Allocation successful.  */
+      result = freenode->key.base;
+      allocnode->key.base = result;
+      allocnode->key.size = size;
+      allocnode->left = NULL;
+      allocnode->right = NULL;
+      usmpin_splay_tree_insert (&ctx->allocations, allocnode);
+
+      /* Update the free chain.  */
+      size_t stillfree_size = freenode->key.size - size;
+      if (stillfree_size > 0)
+        {
+          freenode->key.base = (char *) freenode->key.base + size;
+          freenode->key.size = stillfree_size;
+        }
+      else
+        {
+          usmpin_splay_tree_remove (&ctx->free_space, &freenode->key);
+          free (freenode);
+        }
+    }
+
+  /* Release the lock.  */
+  __atomic_store_n (&ctx->lock, 0, MEMMODEL_RELEASE);
+
+  return result;
+}
+
+/* USM "free". Moves an address range from ctx->allocations to
+ ctx->free_space and merges that record with any contiguous free memory.
+ Does nothing if CTX is NULL. Reports a fatal error if ADDR is not the
+ base address of an entry in ctx->allocations. */
+
+void
+usmpin_free (usmpin_ctx_p ctx, void *addr)
+{
+ if (ctx == NULL)
+ return;
+
+ /* Acquire the lock. */
+ while (__atomic_exchange_n (&ctx->lock, 1, MEMMODEL_ACQUIRE) == 1)
+ ;
+
+ /* Convert the memory map to free. Only the base address is set in the
+ lookup key; the size is left zero and is not used by the comparison. */
+ struct usmpin_splay_tree_key_s key = {addr};
+ usmpin_splay_tree_key found = usmpin_splay_tree_lookup (&ctx->allocations,
+ &key);
+ if (!found)
+ GOMP_PLUGIN_fatal ("invalid free");
+ /* The allocation's node is reused as the free-space node: the key pointer
+ is cast back to a node pointer, which assumes the key is the first
+ member of the node structure (see splay-tree.h -- TODO confirm). */
+ usmpin_splay_tree_remove (&ctx->allocations, &key);
+ usmpin_splay_tree_insert (&ctx->free_space, (usmpin_splay_tree_node)found);
+ usmpin_coalesce_free_space (ctx);
+
+ /* Release the lock. */
+ __atomic_store_n (&ctx->lock, 0, MEMMODEL_RELEASE);
+}
+
+/* USM "realloc".  Works in-place, if possible; reallocates otherwise.
+
+   ADDR must be the base address of a live allocation in CTX; anything else
+   is a fatal error.  NEWSIZE is rounded up to the allocation granularity
+   (see ALIGN), with zero treated as one granule.  Returns the address of the
+   resized block, which differs from ADDR only when the data had to be moved.
+   Returns NULL if CTX is NULL, if rounding NEWSIZE up overflows, or if the
+   block had to be moved and no space was available; in all of these cases
+   the original allocation is left untouched.  */
+
+void *
+usmpin_realloc (usmpin_ctx_p ctx, void *addr, size_t newsize)
+{
+  if (ctx == NULL)
+    return NULL;
+
+  size_t request = newsize ? newsize : 1;
+  newsize = ALIGN (request);
+  if (newsize < request)
+    /* Rounding up wrapped around; the request cannot be satisfied.  */
+    return NULL;
+
+  /* Acquire the lock.  */
+  while (__atomic_exchange_n (&ctx->lock, 1, MEMMODEL_ACQUIRE) == 1)
+    ;
+
+  /* Look up the record of the existing allocation.  */
+  struct usmpin_splay_tree_key_s key = {addr};
+  usmpin_splay_tree_key found = usmpin_splay_tree_lookup (&ctx->allocations,
+                                                          &key);
+  if (!found)
+    GOMP_PLUGIN_fatal ("invalid realloc");
+
+  /* Snapshot the size while the lock is held; it is needed again after the
+     lock has been dropped on the relocation path.  */
+  size_t oldsize = found->size;
+
+  if (newsize == oldsize)
+    ; /* Nothing to do.  */
+  else if (newsize < oldsize)
+    {
+      /* We're reducing the allocation size: the tail becomes a new free
+         chunk.  If the bookkeeping node cannot be allocated, the block
+         simply keeps its current size, which still satisfies the caller.  */
+      usmpin_splay_tree_node newfree = malloc (sizeof (*newfree));
+      if (newfree)
+        {
+          newfree->key.base = (char *) found->base + newsize;
+          newfree->key.size = oldsize - newsize;
+          newfree->left = NULL;
+          newfree->right = NULL;
+
+          /* Record the reduced size, so that a later free or realloc does
+             not hand back the tail that is released here.  */
+          found->size = newsize;
+
+          usmpin_splay_tree_insert (&ctx->free_space, newfree);
+          usmpin_coalesce_free_space (ctx);
+        }
+    }
+  else
+    {
+      /* We're extending the allocation.  Look for a free chunk that starts
+         exactly where the current allocation ends.  */
+      size_t growth = newsize - oldsize;
+      struct usmpin_splay_tree_key_s freekey = {(char *) addr + oldsize};
+      usmpin_splay_tree_key foundfree;
+      foundfree = usmpin_splay_tree_lookup (&ctx->free_space, &freekey);
+      if (foundfree && foundfree->size >= growth)
+        {
+          /* Allocation can be expanded in place.  */
+          if (foundfree->size == growth)
+            {
+              /* The free chunk is consumed entirely.  Remove it while its
+                 key still matches FREEKEY, then release the node.  */
+              usmpin_splay_tree_remove (&ctx->free_space, &freekey);
+              free ((usmpin_splay_tree_node) foundfree);
+            }
+          else
+            {
+              /* Take only the bytes needed from the front of the chunk.  */
+              foundfree->base = (char *) foundfree->base + growth;
+              foundfree->size -= growth;
+            }
+          found->size = newsize;
+        }
+      else
+        {
+          /* Allocation must be relocated.
+             Release the lock and use alloc/free.  */
+          __atomic_store_n (&ctx->lock, 0, MEMMODEL_RELEASE);
+
+          void *newaddr = usmpin_alloc (ctx, newsize);
+          if (!newaddr)
+            return NULL;
+
+          memcpy (newaddr, addr, oldsize);
+          usmpin_free (ctx, addr);
+          return newaddr;
+        }
+    }
+
+  /* Release the lock.  */
+  __atomic_store_n (&ctx->lock, 0, MEMMODEL_RELEASE);
+  return addr;
+}
+
+/* Include the splay tree code inline, with the prefixes added. */
+#define splay_tree_prefix usmpin
+#define splay_tree_c
+#include "splay-tree.h"