@@ -7352,9 +7352,9 @@ resolve_omp_udr_clause (gfc_omp_namelist *n, gfc_namespace *ns,
}
/* Assume that a constant expression in the range 1 (omp_default_mem_alloc)
- to 8 (omp_thread_mem_alloc) range, or 200 (ompx_gnu_pinned_mem_alloc) is
- fine. The original symbol name is already lost during matching via
- gfc_match_expr. */
+   to 8 (omp_thread_mem_alloc), or 200 (ompx_gnu_pinned_mem_alloc) to
+ 202 (ompx_gnu_host_mem_alloc) is fine. The original symbol name is already
+ lost during matching via gfc_match_expr. */
static bool
is_predefined_allocator (gfc_expr *expr)
{
@@ -7366,7 +7366,7 @@ is_predefined_allocator (gfc_expr *expr)
&& ((mpz_sgn (expr->value.integer) > 0
&& mpz_cmp_si (expr->value.integer, 8) <= 0)
|| (mpz_cmp_si (expr->value.integer, 200) >= 0
- && mpz_cmp_si (expr->value.integer, 200) <= 0)));
+ && mpz_cmp_si (expr->value.integer, 202) <= 0)));
}
/* Resolve declarative ALLOCATE statement. Note: Common block vars only appear
@@ -83,10 +83,21 @@ typedef enum {
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,
+ CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
+ CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82,
+ CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83,
CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88
} CUdevice_attribute;
+typedef enum {
+ CU_MEM_ATTACH_GLOBAL = 0x1
+} CUmemAttach_flags;
+
+typedef enum {
+ CU_POINTER_ATTRIBUTE_IS_MANAGED = 8
+} CUpointer_attribute;
+
enum {
CU_EVENT_DEFAULT = 0,
CU_EVENT_DISABLE_TIMING = 2
@@ -247,6 +258,7 @@ CUresult cuMemAlloc (CUdeviceptr *, size_t);
#define cuMemAllocHost cuMemAllocHost_v2
CUresult cuMemAllocHost (void **, size_t);
CUresult cuMemHostAlloc (void **, size_t, unsigned int);
+CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t);
CUresult cuMemcpyPeer (CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t);
CUresult cuMemcpyPeerAsync (CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t, CUstream);
@@ -287,6 +299,7 @@ CUresult cuModuleLoadData (CUmodule *, const void *);
CUresult cuModuleUnload (CUmodule);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
CUoccupancyB2DSize, size_t, int);
+CUresult cuPointerGetAttribute(void *, CUpointer_attribute, CUdeviceptr);
typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
CUresult cuStreamAddCallback(CUstream, CUstreamCallback, void *, unsigned int);
CUresult cuStreamCreate (CUstream *, unsigned);
@@ -100,7 +100,7 @@ GOMP_is_alloc (void *ptr)
#define omp_max_predefined_alloc omp_thread_mem_alloc
#define ompx_gnu_min_predefined_alloc ompx_gnu_pinned_mem_alloc
-#define ompx_gnu_max_predefined_alloc ompx_gnu_pinned_mem_alloc
+#define ompx_gnu_max_predefined_alloc ompx_gnu_host_mem_alloc
/* These macros may be overridden in config/<target>/allocator.c.
The defaults (no override) are to return NULL for pinned memory requests
@@ -146,6 +146,8 @@ static const omp_memspace_handle_t predefined_omp_alloc_mapping[] = {
};
static const omp_memspace_handle_t predefined_ompx_gnu_alloc_mapping[] = {
omp_default_mem_space, /* ompx_gnu_pinned_mem_alloc. */
+ ompx_gnu_unified_shared_mem_space, /* ompx_gnu_unified_shared_mem_alloc. */
+ ompx_gnu_host_mem_space, /* ompx_gnu_host_mem_alloc. */
};
#define ARRAY_SIZE(A) (sizeof (A) / sizeof ((A)[0]))
@@ -380,7 +382,9 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits,
struct omp_allocator_data *ret;
int i;
- if (memspace > omp_low_lat_mem_space)
+ if (memspace > omp_low_lat_mem_space
+ && (memspace < ompx_gnu_unified_shared_mem_space
+ || memspace > ompx_gnu_host_mem_space))
return omp_null_allocator;
for (i = 0; i < ntraits; i++)
switch (traits[i].key)
@@ -743,7 +747,8 @@ fail:;
int fallback = (allocator_data
? allocator_data->fallback
: (allocator == omp_default_mem_alloc
- || allocator == ompx_gnu_pinned_mem_alloc)
+ || allocator == ompx_gnu_pinned_mem_alloc
+ || allocator == ompx_gnu_host_mem_alloc)
? omp_atv_null_fb
: omp_atv_default_mem_fb);
switch (fallback)
@@ -1057,7 +1062,8 @@ fail:;
int fallback = (allocator_data
? allocator_data->fallback
: (allocator == omp_default_mem_alloc
- || allocator == ompx_gnu_pinned_mem_alloc)
+ || allocator == ompx_gnu_pinned_mem_alloc
+ || allocator == ompx_gnu_host_mem_alloc)
? omp_atv_null_fb
: omp_atv_default_mem_fb);
switch (fallback)
@@ -1440,7 +1446,8 @@ fail:;
int fallback = (allocator_data
? allocator_data->fallback
: (allocator == omp_default_mem_alloc
- || allocator == ompx_gnu_pinned_mem_alloc)
+ || allocator == ompx_gnu_pinned_mem_alloc
+ || allocator == ompx_gnu_host_mem_alloc)
? omp_atv_null_fb
: omp_atv_default_mem_fb);
switch (fallback)
@@ -101,7 +101,9 @@ linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin,
/* Explicit pinning may not be required. */
pin = pin && !always_pinned_mode;
- if (pin)
+ if (memspace == ompx_gnu_unified_shared_mem_space)
+ addr = gomp_usm_alloc (size);
+ else if (pin)
{
int using_device
= __atomic_load_n (&using_device_for_page_locked,
@@ -192,7 +194,13 @@ linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
/* Explicit pinning may not be required. */
pin = pin && !always_pinned_mode;
- if (pin)
+ if (memspace == ompx_gnu_unified_shared_mem_space)
+ {
+ void *ret = gomp_usm_alloc (size);
+ memset (ret, 0, size);
+ return ret;
+ }
+ else if (pin)
return linux_memspace_alloc (memspace, size, pin, true);
else
return calloc (1, size);
@@ -208,7 +216,9 @@ linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size,
/* Explicit pinning may not be required. */
pin = pin && !always_pinned_mode;
- if (pin)
+ if (memspace == ompx_gnu_unified_shared_mem_space)
+ gomp_usm_free (addr);
+ else if (pin)
{
int using_device
= __atomic_load_n (&using_device_for_page_locked,
@@ -234,7 +244,10 @@ linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
/* Explicit pinning may not be required. */
pin = pin && !always_pinned_mode;
- if (oldpin && pin)
+ if (memspace == ompx_gnu_unified_shared_mem_space)
+ /* Realloc is not implemented for USM. */
+ ;
+ else if (oldpin && pin)
{
int using_device
= __atomic_load_n (&using_device_for_page_locked,
@@ -42,6 +42,7 @@
chunks. */
#include "libgomp.h"
+#include <assert.h>
#include <stdlib.h>
#define BASIC_ALLOC_PREFIX __nvptx_lowlat
@@ -61,6 +62,8 @@ nvptx_memspace_alloc (omp_memspace_handle_t memspace, size_t size)
return __nvptx_lowlat_alloc (shared_pool, size);
}
+ else if (memspace == ompx_gnu_host_mem_space)
+ return NULL;
else
return malloc (size);
}
@@ -75,6 +78,8 @@ nvptx_memspace_calloc (omp_memspace_handle_t memspace, size_t size)
return __nvptx_lowlat_calloc (shared_pool, size);
}
+ else if (memspace == ompx_gnu_host_mem_space)
+ return NULL;
else
return calloc (1, size);
}
@@ -89,6 +94,9 @@ nvptx_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size)
__nvptx_lowlat_free (shared_pool, addr, size);
}
+ else if (memspace == ompx_gnu_host_mem_space)
+    /* The allocation functions all return NULL for this memspace. */
+ assert (addr == NULL);
else
free (addr);
}
@@ -104,6 +112,8 @@ nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
return __nvptx_lowlat_realloc (shared_pool, addr, oldsize, size);
}
+ else if (memspace == ompx_gnu_host_mem_space)
+ return NULL;
else
return realloc (addr, size);
}
@@ -140,6 +140,9 @@ extern int GOMP_OFFLOAD_load_image (int, unsigned, const void *,
extern bool GOMP_OFFLOAD_unload_image (int, unsigned, const void *);
extern void *GOMP_OFFLOAD_alloc (int, size_t);
extern bool GOMP_OFFLOAD_free (int, void *);
+extern void *GOMP_OFFLOAD_usm_alloc (int, size_t);
+extern bool GOMP_OFFLOAD_usm_free (int, void *);
+extern bool GOMP_OFFLOAD_is_usm_ptr (void *);
extern bool GOMP_OFFLOAD_page_locked_host_alloc (void **, size_t);
extern bool GOMP_OFFLOAD_page_locked_host_free (void *);
extern bool GOMP_OFFLOAD_dev2host (int, void *, const void *, size_t);
@@ -1128,6 +1128,8 @@ extern int gomp_get_num_devices (void);
extern bool gomp_target_task_fn (void *);
extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
int, struct goacc_asyncqueue *);
+extern void *gomp_usm_alloc (size_t size);
+extern void gomp_usm_free (void *device_ptr);
extern bool gomp_page_locked_host_alloc (void **, size_t);
extern void gomp_page_locked_host_free (void *);
@@ -1192,6 +1194,7 @@ struct target_mem_desc;
#define OFFSET_INLINED (~(uintptr_t) 0)
#define OFFSET_POINTER (~(uintptr_t) 1)
#define OFFSET_STRUCT (~(uintptr_t) 2)
+#define OFFSET_USM (~(uintptr_t) 3)
/* Auxiliary structure for infrequently-used or API-specific data. */
@@ -1412,6 +1415,9 @@ struct gomp_device_descr
__typeof (GOMP_OFFLOAD_unload_image) *unload_image_func;
__typeof (GOMP_OFFLOAD_alloc) *alloc_func;
__typeof (GOMP_OFFLOAD_free) *free_func;
+ __typeof (GOMP_OFFLOAD_usm_alloc) *usm_alloc_func;
+ __typeof (GOMP_OFFLOAD_usm_free) *usm_free_func;
+ __typeof (GOMP_OFFLOAD_is_usm_ptr) *is_usm_ptr_func;
__typeof (GOMP_OFFLOAD_page_locked_host_alloc) *page_locked_host_alloc_func;
__typeof (GOMP_OFFLOAD_page_locked_host_free) *page_locked_host_free_func;
__typeof (GOMP_OFFLOAD_dev2host) *dev2host_func;
@@ -120,6 +120,8 @@ typedef enum omp_memspace_handle_t __GOMP_UINTPTR_T_ENUM
omp_const_mem_space = 2,
omp_high_bw_mem_space = 3,
omp_low_lat_mem_space = 4,
+ ompx_gnu_unified_shared_mem_space = 201,
+ ompx_gnu_host_mem_space = 202,
__omp_memspace_handle_t_max__ = __UINTPTR_MAX__
} omp_memspace_handle_t;
@@ -135,6 +137,8 @@ typedef enum omp_allocator_handle_t __GOMP_UINTPTR_T_ENUM
omp_pteam_mem_alloc = 7,
omp_thread_mem_alloc = 8,
ompx_gnu_pinned_mem_alloc = 200,
+ ompx_gnu_unified_shared_mem_alloc = 201,
+ ompx_gnu_host_mem_alloc = 202,
__omp_allocator_handle_t_max__ = __UINTPTR_MAX__
} omp_allocator_handle_t;
@@ -160,6 +160,10 @@
parameter :: omp_thread_mem_alloc = 8
integer (kind=omp_allocator_handle_kind), &
parameter :: ompx_gnu_pinned_mem_alloc = 200
+ integer (kind=omp_allocator_handle_kind), &
+ parameter :: ompx_gnu_unified_shared_mem_alloc = 201
+ integer (kind=omp_allocator_handle_kind), &
+ parameter :: ompx_gnu_host_mem_alloc = 202
integer (omp_memspace_handle_kind), &
parameter :: omp_default_mem_space = 0
integer (omp_memspace_handle_kind), &
@@ -170,6 +174,10 @@
parameter :: omp_high_bw_mem_space = 3
integer (omp_memspace_handle_kind), &
parameter :: omp_low_lat_mem_space = 4
+ integer (omp_memspace_handle_kind), &
+ parameter :: ompx_gnu_unified_shared_mem_space = 201
+ integer (omp_memspace_handle_kind), &
+ parameter :: ompx_gnu_host_mem_space = 202
integer, parameter :: omp_initial_device = -1
integer, parameter :: omp_invalid_device = -4
@@ -156,6 +156,9 @@
integer (omp_allocator_handle_kind) omp_pteam_mem_alloc
integer (omp_allocator_handle_kind) omp_thread_mem_alloc
integer (omp_allocator_handle_kind) ompx_gnu_pinned_mem_alloc
+ integer (omp_allocator_handle_kind) &
+ & ompx_gnu_unified_shared_mem_alloc
+ integer (omp_allocator_handle_kind) ompx_gnu_host_mem_alloc
parameter (omp_null_allocator = 0)
parameter (omp_default_mem_alloc = 1)
parameter (omp_large_cap_mem_alloc = 2)
@@ -166,16 +169,23 @@
parameter (omp_pteam_mem_alloc = 7)
parameter (omp_thread_mem_alloc = 8)
parameter (ompx_gnu_pinned_mem_alloc = 200)
+ parameter (ompx_gnu_unified_shared_mem_alloc = 201)
+ parameter (ompx_gnu_host_mem_alloc = 202)
integer (omp_memspace_handle_kind) omp_default_mem_space
integer (omp_memspace_handle_kind) omp_large_cap_mem_space
integer (omp_memspace_handle_kind) omp_const_mem_space
integer (omp_memspace_handle_kind) omp_high_bw_mem_space
integer (omp_memspace_handle_kind) omp_low_lat_mem_space
+ integer (omp_memspace_handle_kind) &
+ & ompx_gnu_unified_shared_mem_space
+ integer (omp_memspace_handle_kind) ompx_gnu_host_mem_space
parameter (omp_default_mem_space = 0)
parameter (omp_large_cap_mem_space = 1)
parameter (omp_const_mem_space = 2)
parameter (omp_high_bw_mem_space = 3)
parameter (omp_low_lat_mem_space = 4)
+ parameter (ompx_gnu_unified_shared_mem_space = 201)
+ parameter (ompx_gnu_host_mem_space = 202)
integer omp_initial_device, omp_invalid_device
parameter (omp_initial_device = -1)
parameter (omp_invalid_device = -4)
@@ -30,6 +30,7 @@ CUDA_ONE_CALL (cuLinkDestroy)
CUDA_ONE_CALL (cuMemAlloc)
CUDA_ONE_CALL (cuMemAllocHost)
CUDA_ONE_CALL (cuMemHostAlloc)
+CUDA_ONE_CALL (cuMemAllocManaged)
CUDA_ONE_CALL (cuMemcpy)
CUDA_ONE_CALL (cuMemcpyDtoDAsync)
CUDA_ONE_CALL (cuMemcpyDtoH)
@@ -50,6 +51,7 @@ CUDA_ONE_CALL (cuModuleLoad)
CUDA_ONE_CALL (cuModuleLoadData)
CUDA_ONE_CALL (cuModuleUnload)
CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
+CUDA_ONE_CALL (cuPointerGetAttribute)
CUDA_ONE_CALL (cuStreamAddCallback)
CUDA_ONE_CALL (cuStreamCreate)
CUDA_ONE_CALL (cuStreamDestroy)
@@ -1058,11 +1058,13 @@ nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
}
static void *
-nvptx_alloc (size_t s, bool suppress_errors)
+nvptx_alloc (size_t s, bool suppress_errors, bool usm)
{
CUdeviceptr d;
- CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
+ CUresult r = (usm ? CUDA_CALL_NOCHECK (cuMemAllocManaged, &d, s,
+ CU_MEM_ATTACH_GLOBAL)
+ : CUDA_CALL_NOCHECK (cuMemAlloc, &d, s));
if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
return NULL;
else if (r != CUDA_SUCCESS)
@@ -1229,8 +1231,13 @@ GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
{
int pi;
CUresult r;
+ /* Check access via migration. */
r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
- CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS, dev);
+ CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, dev);
+ if (r != CUDA_SUCCESS || pi == 0)
+ /* Check direct access. */
+ r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
+ CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
if (r != CUDA_SUCCESS || pi == 0)
return -1;
}
@@ -1598,8 +1605,8 @@ GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
return ret;
}
-void *
-GOMP_OFFLOAD_alloc (int ord, size_t size)
+static void *
+GOMP_OFFLOAD_alloc_1 (int ord, size_t size, bool usm)
{
if (!nvptx_attach_host_thread_to_device (ord))
return NULL;
@@ -1622,7 +1629,7 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
blocks = tmp;
}
- void *d = nvptx_alloc (size, true);
+ void *d = nvptx_alloc (size, true, usm);
if (d)
return d;
else
@@ -1630,10 +1637,22 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
/* Memory allocation failed. Try freeing the stacks block, and
retrying. */
nvptx_stacks_free (ptx_dev, true);
- return nvptx_alloc (size, false);
+ return nvptx_alloc (size, false, usm);
}
}
+void *
+GOMP_OFFLOAD_alloc (int ord, size_t size)
+{
+ return GOMP_OFFLOAD_alloc_1 (ord, size, false);
+}
+
+void *
+GOMP_OFFLOAD_usm_alloc (int ord, size_t size)
+{
+ return GOMP_OFFLOAD_alloc_1 (ord, size, true);
+}
+
bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
@@ -1641,6 +1660,25 @@ GOMP_OFFLOAD_free (int ord, void *ptr)
&& nvptx_free (ptr, ptx_devices[ord]));
}
+bool
+GOMP_OFFLOAD_usm_free (int ord, void *ptr)
+{
+ return GOMP_OFFLOAD_free (ord, ptr);
+}
+
+bool
+GOMP_OFFLOAD_is_usm_ptr (void *ptr)
+{
+ bool managed = false;
+ /* This returns 3 outcomes ...
+ CUDA_ERROR_INVALID_VALUE - Not a Cuda allocated pointer.
+ CUDA_SUCCESS, managed:false - Cuda allocated, but not USM.
+ CUDA_SUCCESS, managed:true - USM. */
+ CUDA_CALL_NOCHECK (cuPointerGetAttribute, &managed,
+ CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr);
+ return managed;
+}
+
bool
GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size)
{
@@ -706,7 +706,9 @@ gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
splay_tree_key n = gomp_map_lookup (mem_map, &cur_node);
if (n == NULL)
{
- if (allow_zero_length_array_sections)
+ if (allow_zero_length_array_sections
+ || (devicep->is_usm_ptr_func
+ && devicep->is_usm_ptr_func ((void*)cur_node.host_start)))
cur_node.tgt_offset = cur_node.host_start;
else
{
@@ -859,6 +861,11 @@ gomp_attach_pointer (struct gomp_device_descr *devicep,
return;
}
+ if (devicep->is_usm_ptr_func
+ && devicep->is_usm_ptr_func ((void*)(target + bias)))
+ /* Nothing to do here. */
+ return;
+
s.host_start = target + bias;
s.host_end = s.host_start + 1;
tn = splay_tree_lookup (mem_map, &s);
@@ -955,6 +962,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **hostaddrs, size_t i)
switch (tgt->list[i].offset)
{
case OFFSET_INLINED:
+ case OFFSET_USM:
return (uintptr_t) hostaddrs[i];
case OFFSET_POINTER:
@@ -1038,6 +1046,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
{
int kind = get_kind (short_mapkind, kinds, i);
bool implicit = get_implicit (short_mapkind, kinds, i);
+ tgt->list[i].offset = 0;
if (hostaddrs[i] == NULL
|| (kind & typemask) == GOMP_MAP_FIRSTPRIVATE_INT)
{
@@ -1045,6 +1054,15 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
tgt->list[i].offset = OFFSET_INLINED;
continue;
}
+ else if (devicep->is_usm_ptr_func
+ && devicep->is_usm_ptr_func (hostaddrs[i]))
+ {
+ /* The memory is visible from both host and target
+ so nothing needs to be moved. */
+ tgt->list[i].key = NULL;
+ tgt->list[i].offset = OFFSET_USM;
+ continue;
+ }
else if ((kind & typemask) == GOMP_MAP_USE_DEVICE_PTR
|| (kind & typemask) == GOMP_MAP_USE_DEVICE_PTR_IF_PRESENT)
{
@@ -1398,6 +1416,8 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
bool implicit = get_implicit (short_mapkind, kinds, i);
if (hostaddrs[i] == NULL)
continue;
+ if (tgt->list[i].offset == OFFSET_USM)
+ continue;
switch (kind & typemask)
{
size_t align, len, first, last;
@@ -1595,6 +1615,8 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
continue;
}
default:
+ if (tgt->list[i].offset == OFFSET_INLINED)
+ continue;
break;
}
splay_tree_key k = &array->key;
@@ -4437,6 +4459,56 @@ omp_target_free (void *device_ptr, int device_num)
gomp_mutex_unlock (&devicep->lock);
}
+void *
+gomp_usm_alloc (size_t size)
+{
+ struct gomp_task_icv *icv = gomp_icv (false);
+ struct gomp_device_descr *devicep = resolve_device (icv->default_device_var,
+ false);
+ if (devicep == NULL)
+ return NULL;
+
+ if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+ || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+ return malloc (size);
+
+ void *ret = NULL;
+ gomp_mutex_lock (&devicep->lock);
+ if (devicep->usm_alloc_func)
+ ret = devicep->usm_alloc_func (devicep->target_id, size);
+ gomp_mutex_unlock (&devicep->lock);
+ return ret;
+}
+
+void
+gomp_usm_free (void *device_ptr)
+{
+ if (device_ptr == NULL)
+ return;
+
+ struct gomp_task_icv *icv = gomp_icv (false);
+ struct gomp_device_descr *devicep = resolve_device (icv->default_device_var,
+ false);
+ if (devicep == NULL)
+ return;
+
+ if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+ || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+ {
+ free (device_ptr);
+ return;
+ }
+
+ gomp_mutex_lock (&devicep->lock);
+ if (devicep->usm_free_func
+ && !devicep->usm_free_func (devicep->target_id, device_ptr))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ gomp_fatal ("error in freeing device memory block at %p", device_ptr);
+ }
+ gomp_mutex_unlock (&devicep->lock);
+}
+
/* Device (really: libgomp plugin) to use for paged-locked memory. We
assume there is either none or exactly one such device for the lifetime of
the process. */
@@ -5294,6 +5366,9 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
DLSYM (unload_image);
DLSYM (alloc);
DLSYM (free);
+ DLSYM_OPT (usm_alloc, usm_alloc);
+ DLSYM_OPT (usm_free, usm_free);
+ DLSYM_OPT (is_usm_ptr, is_usm_ptr);
DLSYM_OPT (page_locked_host_alloc, page_locked_host_alloc);
DLSYM_OPT (page_locked_host_free, page_locked_host_free);
DLSYM (dev2host);
@@ -594,3 +594,13 @@ int main() {
return 0;
} } "-lcuda -lcudart" ]
}
+
+# return 1 if OpenMP Unified Shared Memory is supported
+
+proc check_effective_target_omp_usm { } {
+ if { [libgomp_check_effective_target_offload_target "nvptx"] } {
+ return 1
+ }
+ return 0
+}
+
@@ -11,12 +11,13 @@
#pragma omp requires unified_shared_memory, unified_address, reverse_offload
-int a[10] = { 0 };
extern void foo (void);
int
main (void)
{
+ int *a = (int*)__builtin_calloc(10, sizeof (int));
+
#pragma omp target map(to: a)
for (int i = 0; i < 10; i++)
a[i] = i;
new file mode 100644
@@ -0,0 +1,77 @@
+/* Verify that on the host we can but on a device we cannot allocate 'ompx_gnu_host_mem_alloc' memory. */
+
+/* { dg-additional-options -DOFFLOAD_DEVICE { target offload_device } } */
+
+#include <omp.h>
+
+#pragma omp requires dynamic_allocators
+
+int main()
+{
+#pragma omp target
+ {
+ char *c, *c_;
+
+ c = omp_alloc(1, ompx_gnu_host_mem_alloc);
+#ifdef OFFLOAD_DEVICE
+ if (c)
+ __builtin_abort ();
+#else
+ if (!c)
+ __builtin_abort ();
+#endif
+ omp_free(c, ompx_gnu_host_mem_alloc);
+
+ c = omp_aligned_alloc(128, 256, ompx_gnu_host_mem_alloc);
+#ifdef OFFLOAD_DEVICE
+ if (c)
+ __builtin_abort ();
+#else
+ if (!c)
+ __builtin_abort ();
+#endif
+ omp_free(c, omp_null_allocator);
+
+ c = omp_calloc(1, 1, ompx_gnu_host_mem_alloc);
+#ifdef OFFLOAD_DEVICE
+ if (c)
+ __builtin_abort ();
+#else
+ if (!c)
+ __builtin_abort ();
+#endif
+ c_ = omp_realloc(c, 2, ompx_gnu_host_mem_alloc, ompx_gnu_host_mem_alloc);
+#ifdef OFFLOAD_DEVICE
+ if (c_)
+ __builtin_abort ();
+#else
+ if (!c_)
+ __builtin_abort ();
+#endif
+ c = omp_realloc(c_, 0, ompx_gnu_host_mem_alloc, ompx_gnu_host_mem_alloc);
+ if (c)
+ __builtin_abort ();
+
+ c = omp_aligned_calloc(64, 1, 512, ompx_gnu_host_mem_alloc);
+#ifdef OFFLOAD_DEVICE
+ if (c)
+ __builtin_abort ();
+#else
+ if (!c)
+ __builtin_abort ();
+#endif
+ c_ = omp_realloc(c, 2, c ? omp_null_allocator : ompx_gnu_host_mem_alloc, omp_null_allocator);
+#ifdef OFFLOAD_DEVICE
+ if (c_)
+ __builtin_abort ();
+#else
+ if (!c_)
+ __builtin_abort ();
+#endif
+ c = omp_realloc(c_, 0, omp_null_allocator, omp_null_allocator);
+ if (c)
+ __builtin_abort ();
+ }
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-require-effective-target omp_usm } */
+
+#include <omp.h>
+#include <stdint.h>
+
+int
+main ()
+{
+ int *a = (int *) omp_alloc(sizeof(int), ompx_gnu_unified_shared_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ *a = 42;
+ uintptr_t a_p = (uintptr_t)a;
+
+ #pragma omp target is_device_ptr(a)
+ {
+ if (*a != 42 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+ omp_free(a, ompx_gnu_unified_shared_mem_alloc);
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,33 @@
+/* { dg-do run } */
+/* { dg-require-effective-target omp_usm } */
+
+#include <omp.h>
+#include <stdint.h>
+
+int
+main ()
+{
+ int *a = (int *) omp_alloc(sizeof(int)*2, ompx_gnu_unified_shared_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ a[0] = 42;
+ a[1] = 43;
+
+ uintptr_t a_p = (uintptr_t)a;
+
+ #pragma omp target map(a[0])
+ {
+ if (a[0] != 42 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+ #pragma omp target map(a[1])
+ {
+ if (a[1] != 43 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+ omp_free(a, ompx_gnu_unified_shared_mem_alloc);
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-require-effective-target omp_usm } */
+
+#include <omp.h>
+#include <stdint.h>
+
+int
+main ()
+{
+ int *a = (int *) omp_alloc(sizeof(int)*2, ompx_gnu_unified_shared_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ a[0] = 42;
+ a[1] = 43;
+
+ uintptr_t a_p = (uintptr_t)a;
+
+#pragma omp target data map(a[0:2])
+ {
+#pragma omp target
+ {
+ if (a[0] != 42 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+#pragma omp target
+ {
+ if (a[1] != 43 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+ }
+
+ omp_free(a, ompx_gnu_unified_shared_mem_alloc);
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,37 @@
+/* { dg-do run } */
+/* { dg-require-effective-target omp_usm } */
+
+#include <omp.h>
+#include <stdint.h>
+
+int
+main ()
+{
+ int *a = (int *) omp_alloc(sizeof(int)*2, ompx_gnu_unified_shared_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ a[0] = 42;
+ a[1] = 43;
+
+ uintptr_t a_p = (uintptr_t)a;
+
+#pragma omp target enter data map(to:a[0:2])
+
+#pragma omp target
+ {
+ if (a[0] != 42 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+#pragma omp target
+ {
+ if (a[1] != 43 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+#pragma omp target exit data map(delete:a[0:2])
+
+ omp_free(a, ompx_gnu_unified_shared_mem_alloc);
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,28 @@
+/* { dg-do run } */
+/* { dg-require-effective-target omp_usm } */
+
+#include <omp.h>
+#include <stdint.h>
+
+#pragma omp requires unified_shared_memory
+
+int
+main ()
+{
+ int *a = (int *) omp_alloc(sizeof(int), ompx_gnu_host_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ a[0] = 42;
+
+ uintptr_t a_p = (uintptr_t)a;
+
+#pragma omp target map(a[0:1])
+ {
+ if (a[0] != 42 || a_p == (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+ omp_free(a, ompx_gnu_host_mem_alloc);
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,33 @@
+! { dg-do run }
+! { dg-require-effective-target omp_usm }
+
+! Ensure that derived types containing allocated values work
+! with Unified Shared Memory.
+
+program usm
+!$omp requires unified_shared_memory
+ use iso_fortran_env
+ implicit none
+
+ type :: struct
+ real(real64), allocatable :: v(:)
+ end type struct
+
+ integer :: index
+ type(struct) :: s
+
+ real(real64), allocatable :: expected(:)
+
+ allocate(s%v(100))
+ do index = 1, size(s%v)
+ s%v(index) = index
+ end do
+ allocate(expected, mold=s%v)
+ expected = s%v - 1._real64
+
+ !$omp target
+ s%v = s%v - 1._real64
+ !$omp end target
+
+ if (any(s%v /= expected)) STOP 1
+end program usm
From: Andrew Stubbs <ams@codesourcery.com>

This adds support for using Cuda Managed Memory with omp_alloc.  It will be
used as the underpinnings for "requires unified_shared_memory" in a later
patch.

There are two new predefined allocators, ompx_gnu_unified_shared_mem_alloc
and ompx_gnu_host_mem_alloc, plus corresponding memory spaces, which can be
used to allocate memory in the "managed" space and explicitly on the host
(it is intended that "malloc" will be intercepted by the compiler).

The nvptx plugin is modified to make the necessary Cuda calls, and libgomp is
modified to switch to shared-memory mode for USM-allocated mappings.

gcc/fortran/ChangeLog:

	* openmp.cc (is_predefined_allocator): Recognise new allocators.

include/ChangeLog:

	* cuda/cuda.h (CUdevice_attribute): Add definitions for
	CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
	CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR and
	CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY.
	(CUmemAttach_flags): New.
	(CUpointer_attribute): New.
	(cuMemAllocManaged): New prototype.
	(cuPointerGetAttribute): New prototype.

libgomp/ChangeLog:

	* allocator.c (ompx_gnu_max_predefined_alloc): Update.
	(predefined_ompx_gnu_alloc_mapping): Add
	ompx_gnu_unified_shared_mem_space and ompx_gnu_host_mem_space.
	(omp_init_allocator): Recognise ompx_gnu_unified_shared_mem_space
	and ompx_gnu_host_mem_space.
	* config/linux/allocator.c (linux_memspace_alloc): Support USM.
	(linux_memspace_calloc): Likewise.
	(linux_memspace_free): Likewise.
	(linux_memspace_realloc): Likewise.
	* config/nvptx/allocator.c: Include <assert.h>.
	(nvptx_memspace_alloc): Disallow host memory.
	(nvptx_memspace_calloc): Likewise.
	(nvptx_memspace_free): Likewise.
	(nvptx_memspace_realloc): Likewise.
	* libgomp-plugin.h (GOMP_OFFLOAD_usm_alloc): New prototype.
	(GOMP_OFFLOAD_usm_free): New prototype.
	(GOMP_OFFLOAD_is_usm_ptr): New prototype.
	* libgomp.h (gomp_usm_alloc): New prototype.
	(gomp_usm_free): New prototype.
	(OFFSET_USM): New define.
	(struct gomp_device_descr): Add USM functions.
	* omp.h.in (omp_memspace_handle_t): Add
	ompx_gnu_unified_shared_mem_space and ompx_gnu_host_mem_space.
	(omp_allocator_handle_t): Add ompx_gnu_unified_shared_mem_alloc
	and ompx_gnu_host_mem_alloc.
	* omp_lib.f90.in: Likewise.
	* omp_lib.h.in: Likewise.
	* plugin/cuda-lib.def (cuMemAllocManaged): Add new call.
	(cuPointerGetAttribute): Likewise.
	* plugin/plugin-nvptx.c (nvptx_alloc): Add "usm" parameter.
	Call cuMemAllocManaged as appropriate.
	(GOMP_OFFLOAD_get_num_devices): Allow
	GOMP_REQUIRES_UNIFIED_SHARED_MEMORY if the device supports
	managed memory or integrated memory.
	(GOMP_OFFLOAD_alloc): Move internals to ...
	(GOMP_OFFLOAD_alloc_1): ... this, and add usm parameter.
	(GOMP_OFFLOAD_usm_alloc): New function.
	(GOMP_OFFLOAD_usm_free): New function.
	(GOMP_OFFLOAD_is_usm_ptr): New function.
	* target.c (gomp_map_pointer): Add USM support.
	(gomp_attach_pointer): Likewise.
	(gomp_map_val): Likewise.
	(gomp_map_vars_internal): Likewise.
	(gomp_usm_alloc): New function.
	(gomp_usm_free): New function.
	(gomp_load_plugin_for_device): Add usm_alloc, usm_free, and
	is_usm_ptr.
	* testsuite/lib/libgomp.exp (check_effective_target_omp_usm): New.
	* testsuite/libgomp.c-c++-common/requires-5.c: Fix static data
	failure.
	* testsuite/libgomp.c/alloc-ompx_gnu_host_mem_alloc-1.c: New test.
	* testsuite/libgomp.c/usm-1.c: New test.
	* testsuite/libgomp.c/usm-2.c: New test.
	* testsuite/libgomp.c/usm-3.c: New test.
	* testsuite/libgomp.c/usm-4.c: New test.
	* testsuite/libgomp.c/usm-5.c: New test.
	* testsuite/libgomp.fortran/usm-3.f90: New test.
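For context, a minimal usage sketch (illustrative only, not part of the
patch): it allocates managed memory both through the new predefined
ompx_gnu_unified_shared_mem_alloc allocator and through a user-defined
allocator created from ompx_gnu_unified_shared_mem_space, then touches the
memory on the device without mapping it, much as the new usm-1.c test does.
Everything used here is either the standard OpenMP allocator API or an
identifier added by this patch.

/* Illustrative sketch only -- not part of the patch.  */
#include <omp.h>
#include <stddef.h>

int
main (void)
{
  /* Managed memory via the new predefined allocator.  */
  int *a = (int *) omp_alloc (16 * sizeof (int),
			      ompx_gnu_unified_shared_mem_alloc);

  /* Managed memory via a user-defined allocator over the new memory
     space; no additional traits.  */
  omp_allocator_handle_t al
    = omp_init_allocator (ompx_gnu_unified_shared_mem_space, 0, NULL);
  int *b = (int *) omp_alloc (16 * sizeof (int), al);

  if (!a || al == omp_null_allocator || !b)
    return 1;

  /* The pointers are valid on both host and device, so no mapping is
     needed; is_device_ptr is used as in the usm-1.c test.  */
  #pragma omp target is_device_ptr (a, b)
  for (int i = 0; i < 16; i++)
    {
      a[i] = i;
      b[i] = 2 * i;
    }

  int bad = 0;
  for (int i = 0; i < 16; i++)
    if (a[i] != i || b[i] != 2 * i)
      bad = 1;

  omp_free (a, ompx_gnu_unified_shared_mem_alloc);
  omp_free (b, al);
  omp_destroy_allocator (al);
  return bad;
}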
Co-authored-by: Kwok Cheung Yeung <kcyeung@baylibre.com>
Co-authored-by: Thomas Schwinge <tschwinge@baylibre.com>
---
 gcc/fortran/openmp.cc                         |  8 +-
 include/cuda/cuda.h                           | 13 ++++
 libgomp/allocator.c                           | 17 ++--
 libgomp/config/linux/allocator.c              | 21 ++++-
 libgomp/config/nvptx/allocator.c              | 10 +++
 libgomp/libgomp-plugin.h                      |  3 +
 libgomp/libgomp.h                             |  6 ++
 libgomp/omp.h.in                              |  4 +
 libgomp/omp_lib.f90.in                        |  8 ++
 libgomp/omp_lib.h.in                          | 10 +++
 libgomp/plugin/cuda-lib.def                   |  2 +
 libgomp/plugin/plugin-nvptx.c                 | 52 +++++++++++--
 libgomp/target.c                              | 77 ++++++++++++++++++-
 libgomp/testsuite/lib/libgomp.exp             | 10 +++
 .../libgomp.c-c++-common/requires-5.c         |  3 +-
 .../alloc-ompx_gnu_host_mem_alloc-1.c         | 77 +++++++++++++++++++
 libgomp/testsuite/libgomp.c/usm-1.c           | 25 ++++++
 libgomp/testsuite/libgomp.c/usm-2.c           | 33 ++++++++
 libgomp/testsuite/libgomp.c/usm-3.c           | 36 +++++++++
 libgomp/testsuite/libgomp.c/usm-4.c           | 37 +++++++++
 libgomp/testsuite/libgomp.c/usm-5.c           | 28 +++++++
 libgomp/testsuite/libgomp.fortran/usm-3.f90   | 33 ++++++++
 22 files changed, 491 insertions(+), 22 deletions(-)
 create mode 100644 libgomp/testsuite/libgomp.c/alloc-ompx_gnu_host_mem_alloc-1.c
 create mode 100644 libgomp/testsuite/libgomp.c/usm-1.c
 create mode 100644 libgomp/testsuite/libgomp.c/usm-2.c
 create mode 100644 libgomp/testsuite/libgomp.c/usm-3.c
 create mode 100644 libgomp/testsuite/libgomp.c/usm-4.c
 create mode 100644 libgomp/testsuite/libgomp.c/usm-5.c
 create mode 100644 libgomp/testsuite/libgomp.fortran/usm-3.f90