@@ -7352,9 +7352,9 @@ resolve_omp_udr_clause (gfc_omp_namelist *n, gfc_namespace *ns,
}
/* Assume that a constant expression in the range 1 (omp_default_mem_alloc)
- to 8 (omp_thread_mem_alloc) range, or 200 (ompx_gnu_pinned_mem_alloc) is
- fine. The original symbol name is already lost during matching via
- gfc_match_expr. */
+   to 8 (omp_thread_mem_alloc), or 200 (ompx_gnu_pinned_mem_alloc) to
+ 202 (ompx_gnu_host_mem_alloc) is fine. The original symbol name is already
+ lost during matching via gfc_match_expr. */
static bool
is_predefined_allocator (gfc_expr *expr)
{
@@ -7366,7 +7366,7 @@ is_predefined_allocator (gfc_expr *expr)
&& ((mpz_sgn (expr->value.integer) > 0
&& mpz_cmp_si (expr->value.integer, 8) <= 0)
|| (mpz_cmp_si (expr->value.integer, 200) >= 0
- && mpz_cmp_si (expr->value.integer, 200) <= 0)));
+ && mpz_cmp_si (expr->value.integer, 202) <= 0)));
}
/* Resolve declarative ALLOCATE statement. Note: Common block vars only appear
@@ -83,10 +83,21 @@ typedef enum {
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,
+ CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
+ CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82,
+ CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83,
CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88
} CUdevice_attribute;
+typedef enum {
+ CU_MEM_ATTACH_GLOBAL = 0x1
+} CUmemAttach_flags;
+
+typedef enum {
+ CU_POINTER_ATTRIBUTE_IS_MANAGED = 8
+} CUpointer_attribute;
+
enum {
CU_EVENT_DEFAULT = 0,
CU_EVENT_DISABLE_TIMING = 2
@@ -247,6 +258,7 @@ CUresult cuMemAlloc (CUdeviceptr *, size_t);
#define cuMemAllocHost cuMemAllocHost_v2
CUresult cuMemAllocHost (void **, size_t);
CUresult cuMemHostAlloc (void **, size_t, unsigned int);
+CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t);
CUresult cuMemcpyPeer (CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t);
CUresult cuMemcpyPeerAsync (CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t, CUstream);
@@ -287,6 +299,7 @@ CUresult cuModuleLoadData (CUmodule *, const void *);
CUresult cuModuleUnload (CUmodule);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
CUoccupancyB2DSize, size_t, int);
+CUresult cuPointerGetAttribute(void *, CUpointer_attribute, CUdeviceptr);
typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
CUresult cuStreamAddCallback(CUstream, CUstreamCallback, void *, unsigned int);
CUresult cuStreamCreate (CUstream *, unsigned);
@@ -100,7 +100,7 @@ GOMP_is_alloc (void *ptr)
#define omp_max_predefined_alloc omp_thread_mem_alloc
#define ompx_gnu_min_predefined_alloc ompx_gnu_pinned_mem_alloc
-#define ompx_gnu_max_predefined_alloc ompx_gnu_pinned_mem_alloc
+#define ompx_gnu_max_predefined_alloc ompx_gnu_host_mem_alloc
/* These macros may be overridden in config/<target>/allocator.c.
The defaults (no override) are to return NULL for pinned memory requests
@@ -146,6 +146,8 @@ static const omp_memspace_handle_t predefined_omp_alloc_mapping[] = {
};
static const omp_memspace_handle_t predefined_ompx_gnu_alloc_mapping[] = {
omp_default_mem_space, /* ompx_gnu_pinned_mem_alloc. */
+ ompx_gnu_unified_shared_mem_space, /* ompx_gnu_unified_shared_mem_alloc. */
+ ompx_gnu_host_mem_space, /* ompx_gnu_host_mem_alloc. */
};
#define ARRAY_SIZE(A) (sizeof (A) / sizeof ((A)[0]))
@@ -380,7 +382,9 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits,
struct omp_allocator_data *ret;
int i;
- if (memspace > omp_low_lat_mem_space)
+ if (memspace > omp_low_lat_mem_space
+ && (memspace < ompx_gnu_unified_shared_mem_space
+ || memspace > ompx_gnu_host_mem_space))
return omp_null_allocator;
for (i = 0; i < ntraits; i++)
switch (traits[i].key)
@@ -743,7 +747,8 @@ fail:;
int fallback = (allocator_data
? allocator_data->fallback
: (allocator == omp_default_mem_alloc
- || allocator == ompx_gnu_pinned_mem_alloc)
+ || allocator == ompx_gnu_pinned_mem_alloc
+ || allocator == ompx_gnu_host_mem_alloc)
? omp_atv_null_fb
: omp_atv_default_mem_fb);
switch (fallback)
@@ -1057,7 +1062,8 @@ fail:;
int fallback = (allocator_data
? allocator_data->fallback
: (allocator == omp_default_mem_alloc
- || allocator == ompx_gnu_pinned_mem_alloc)
+ || allocator == ompx_gnu_pinned_mem_alloc
+ || allocator == ompx_gnu_host_mem_alloc)
? omp_atv_null_fb
: omp_atv_default_mem_fb);
switch (fallback)
@@ -1440,7 +1446,8 @@ fail:;
int fallback = (allocator_data
? allocator_data->fallback
: (allocator == omp_default_mem_alloc
- || allocator == ompx_gnu_pinned_mem_alloc)
+ || allocator == ompx_gnu_pinned_mem_alloc
+ || allocator == ompx_gnu_host_mem_alloc)
? omp_atv_null_fb
: omp_atv_default_mem_fb);
switch (fallback)
@@ -101,7 +101,9 @@ linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin,
/* Explicit pinning may not be required. */
pin = pin && !always_pinned_mode;
- if (pin)
+ if (memspace == ompx_gnu_unified_shared_mem_space)
+ addr = gomp_usm_alloc (size);
+ else if (pin)
{
int using_device
= __atomic_load_n (&using_device_for_page_locked,
@@ -192,7 +194,13 @@ linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
/* Explicit pinning may not be required. */
pin = pin && !always_pinned_mode;
- if (pin)
+ if (memspace == ompx_gnu_unified_shared_mem_space)
+ {
+ void *ret = gomp_usm_alloc (size);
+ memset (ret, 0, size);
+ return ret;
+ }
+ else if (pin)
return linux_memspace_alloc (memspace, size, pin, true);
else
return calloc (1, size);
@@ -208,7 +216,9 @@ linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size,
/* Explicit pinning may not be required. */
pin = pin && !always_pinned_mode;
- if (pin)
+ if (memspace == ompx_gnu_unified_shared_mem_space)
+ gomp_usm_free (addr);
+ else if (pin)
{
int using_device
= __atomic_load_n (&using_device_for_page_locked,
@@ -234,7 +244,10 @@ linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
/* Explicit pinning may not be required. */
pin = pin && !always_pinned_mode;
- if (oldpin && pin)
+ if (memspace == ompx_gnu_unified_shared_mem_space)
+ /* Realloc is not implemented for USM. */
+ ;
+ else if (oldpin && pin)
{
int using_device
= __atomic_load_n (&using_device_for_page_locked,
@@ -42,6 +42,7 @@
chunks. */
#include "libgomp.h"
+#include <assert.h>
#include <stdlib.h>
#define BASIC_ALLOC_PREFIX __nvptx_lowlat
@@ -61,6 +62,8 @@ nvptx_memspace_alloc (omp_memspace_handle_t memspace, size_t size)
return __nvptx_lowlat_alloc (shared_pool, size);
}
+ else if (memspace == ompx_gnu_host_mem_space)
+ return NULL;
else
return malloc (size);
}
@@ -75,6 +78,8 @@ nvptx_memspace_calloc (omp_memspace_handle_t memspace, size_t size)
return __nvptx_lowlat_calloc (shared_pool, size);
}
+ else if (memspace == ompx_gnu_host_mem_space)
+ return NULL;
else
return calloc (1, size);
}
@@ -89,6 +94,9 @@ nvptx_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size)
__nvptx_lowlat_free (shared_pool, addr, size);
}
+ else if (memspace == ompx_gnu_host_mem_space)
+    /* The allocation functions all return NULL for this memspace. */
+ assert (addr == NULL);
else
free (addr);
}
@@ -104,6 +112,8 @@ nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
return __nvptx_lowlat_realloc (shared_pool, addr, oldsize, size);
}
+ else if (memspace == ompx_gnu_host_mem_space)
+ return NULL;
else
return realloc (addr, size);
}
@@ -140,6 +140,9 @@ extern int GOMP_OFFLOAD_load_image (int, unsigned, const void *,
extern bool GOMP_OFFLOAD_unload_image (int, unsigned, const void *);
extern void *GOMP_OFFLOAD_alloc (int, size_t);
extern bool GOMP_OFFLOAD_free (int, void *);
+extern void *GOMP_OFFLOAD_usm_alloc (int, size_t);
+extern bool GOMP_OFFLOAD_usm_free (int, void *);
+extern bool GOMP_OFFLOAD_is_usm_ptr (void *);
extern bool GOMP_OFFLOAD_page_locked_host_alloc (void **, size_t);
extern bool GOMP_OFFLOAD_page_locked_host_free (void *);
extern bool GOMP_OFFLOAD_dev2host (int, void *, const void *, size_t);
@@ -1128,6 +1128,8 @@ extern int gomp_get_num_devices (void);
extern bool gomp_target_task_fn (void *);
extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
int, struct goacc_asyncqueue *);
+extern void *gomp_usm_alloc (size_t size);
+extern void gomp_usm_free (void *device_ptr);
extern bool gomp_page_locked_host_alloc (void **, size_t);
extern void gomp_page_locked_host_free (void *);
@@ -1192,6 +1194,7 @@ struct target_mem_desc;
#define OFFSET_INLINED (~(uintptr_t) 0)
#define OFFSET_POINTER (~(uintptr_t) 1)
#define OFFSET_STRUCT (~(uintptr_t) 2)
+#define OFFSET_USM (~(uintptr_t) 3)
/* Auxiliary structure for infrequently-used or API-specific data. */
@@ -1412,6 +1415,9 @@ struct gomp_device_descr
__typeof (GOMP_OFFLOAD_unload_image) *unload_image_func;
__typeof (GOMP_OFFLOAD_alloc) *alloc_func;
__typeof (GOMP_OFFLOAD_free) *free_func;
+ __typeof (GOMP_OFFLOAD_usm_alloc) *usm_alloc_func;
+ __typeof (GOMP_OFFLOAD_usm_free) *usm_free_func;
+ __typeof (GOMP_OFFLOAD_is_usm_ptr) *is_usm_ptr_func;
__typeof (GOMP_OFFLOAD_page_locked_host_alloc) *page_locked_host_alloc_func;
__typeof (GOMP_OFFLOAD_page_locked_host_free) *page_locked_host_free_func;
__typeof (GOMP_OFFLOAD_dev2host) *dev2host_func;
@@ -120,6 +120,8 @@ typedef enum omp_memspace_handle_t __GOMP_UINTPTR_T_ENUM
omp_const_mem_space = 2,
omp_high_bw_mem_space = 3,
omp_low_lat_mem_space = 4,
+ ompx_gnu_unified_shared_mem_space = 201,
+ ompx_gnu_host_mem_space = 202,
__omp_memspace_handle_t_max__ = __UINTPTR_MAX__
} omp_memspace_handle_t;
@@ -135,6 +137,8 @@ typedef enum omp_allocator_handle_t __GOMP_UINTPTR_T_ENUM
omp_pteam_mem_alloc = 7,
omp_thread_mem_alloc = 8,
ompx_gnu_pinned_mem_alloc = 200,
+ ompx_gnu_unified_shared_mem_alloc = 201,
+ ompx_gnu_host_mem_alloc = 202,
__omp_allocator_handle_t_max__ = __UINTPTR_MAX__
} omp_allocator_handle_t;
@@ -160,6 +160,10 @@
parameter :: omp_thread_mem_alloc = 8
integer (kind=omp_allocator_handle_kind), &
parameter :: ompx_gnu_pinned_mem_alloc = 200
+ integer (kind=omp_allocator_handle_kind), &
+ parameter :: ompx_gnu_unified_shared_mem_alloc = 201
+ integer (kind=omp_allocator_handle_kind), &
+ parameter :: ompx_gnu_host_mem_alloc = 202
integer (omp_memspace_handle_kind), &
parameter :: omp_default_mem_space = 0
integer (omp_memspace_handle_kind), &
@@ -170,6 +174,10 @@
parameter :: omp_high_bw_mem_space = 3
integer (omp_memspace_handle_kind), &
parameter :: omp_low_lat_mem_space = 4
+ integer (omp_memspace_handle_kind), &
+ parameter :: ompx_gnu_unified_shared_mem_space = 201
+ integer (omp_memspace_handle_kind), &
+ parameter :: ompx_gnu_host_mem_space = 202
integer, parameter :: omp_initial_device = -1
integer, parameter :: omp_invalid_device = -4
@@ -156,6 +156,9 @@
integer (omp_allocator_handle_kind) omp_pteam_mem_alloc
integer (omp_allocator_handle_kind) omp_thread_mem_alloc
integer (omp_allocator_handle_kind) ompx_gnu_pinned_mem_alloc
+ integer (omp_allocator_handle_kind) &
+ & ompx_gnu_unified_shared_mem_alloc
+ integer (omp_allocator_handle_kind) ompx_gnu_host_mem_alloc
parameter (omp_null_allocator = 0)
parameter (omp_default_mem_alloc = 1)
parameter (omp_large_cap_mem_alloc = 2)
@@ -166,16 +169,23 @@
parameter (omp_pteam_mem_alloc = 7)
parameter (omp_thread_mem_alloc = 8)
parameter (ompx_gnu_pinned_mem_alloc = 200)
+ parameter (ompx_gnu_unified_shared_mem_alloc = 201)
+ parameter (ompx_gnu_host_mem_alloc = 202)
integer (omp_memspace_handle_kind) omp_default_mem_space
integer (omp_memspace_handle_kind) omp_large_cap_mem_space
integer (omp_memspace_handle_kind) omp_const_mem_space
integer (omp_memspace_handle_kind) omp_high_bw_mem_space
integer (omp_memspace_handle_kind) omp_low_lat_mem_space
+ integer (omp_memspace_handle_kind) &
+ & ompx_gnu_unified_shared_mem_space
+ integer (omp_memspace_handle_kind) ompx_gnu_host_mem_space
parameter (omp_default_mem_space = 0)
parameter (omp_large_cap_mem_space = 1)
parameter (omp_const_mem_space = 2)
parameter (omp_high_bw_mem_space = 3)
parameter (omp_low_lat_mem_space = 4)
+ parameter (ompx_gnu_unified_shared_mem_space = 201)
+ parameter (ompx_gnu_host_mem_space = 202)
integer omp_initial_device, omp_invalid_device
parameter (omp_initial_device = -1)
parameter (omp_invalid_device = -4)
@@ -30,6 +30,7 @@ CUDA_ONE_CALL (cuLinkDestroy)
CUDA_ONE_CALL (cuMemAlloc)
CUDA_ONE_CALL (cuMemAllocHost)
CUDA_ONE_CALL (cuMemHostAlloc)
+CUDA_ONE_CALL (cuMemAllocManaged)
CUDA_ONE_CALL (cuMemcpy)
CUDA_ONE_CALL (cuMemcpyDtoDAsync)
CUDA_ONE_CALL (cuMemcpyDtoH)
@@ -50,6 +51,7 @@ CUDA_ONE_CALL (cuModuleLoad)
CUDA_ONE_CALL (cuModuleLoadData)
CUDA_ONE_CALL (cuModuleUnload)
CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
+CUDA_ONE_CALL (cuPointerGetAttribute)
CUDA_ONE_CALL (cuStreamAddCallback)
CUDA_ONE_CALL (cuStreamCreate)
CUDA_ONE_CALL (cuStreamDestroy)
@@ -1058,11 +1058,13 @@ nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
}
static void *
-nvptx_alloc (size_t s, bool suppress_errors)
+nvptx_alloc (size_t s, bool suppress_errors, bool usm)
{
CUdeviceptr d;
- CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
+ CUresult r = (usm ? CUDA_CALL_NOCHECK (cuMemAllocManaged, &d, s,
+ CU_MEM_ATTACH_GLOBAL)
+ : CUDA_CALL_NOCHECK (cuMemAlloc, &d, s));
if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
return NULL;
else if (r != CUDA_SUCCESS)
@@ -1229,8 +1231,13 @@ GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
{
int pi;
CUresult r;
+ /* Check access via migration. */
r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
- CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS, dev);
+ CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, dev);
+ if (r != CUDA_SUCCESS || pi == 0)
+ /* Check direct access. */
+ r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
+ CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
if (r != CUDA_SUCCESS || pi == 0)
return -1;
}
@@ -1598,8 +1605,8 @@ GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
return ret;
}
-void *
-GOMP_OFFLOAD_alloc (int ord, size_t size)
+static void *
+GOMP_OFFLOAD_alloc_1 (int ord, size_t size, bool usm)
{
if (!nvptx_attach_host_thread_to_device (ord))
return NULL;
@@ -1622,7 +1629,7 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
blocks = tmp;
}
- void *d = nvptx_alloc (size, true);
+ void *d = nvptx_alloc (size, true, usm);
if (d)
return d;
else
@@ -1630,10 +1637,22 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
/* Memory allocation failed. Try freeing the stacks block, and
retrying. */
nvptx_stacks_free (ptx_dev, true);
- return nvptx_alloc (size, false);
+ return nvptx_alloc (size, false, usm);
}
}
+void *
+GOMP_OFFLOAD_alloc (int ord, size_t size)
+{
+ return GOMP_OFFLOAD_alloc_1 (ord, size, false);
+}
+
+void *
+GOMP_OFFLOAD_usm_alloc (int ord, size_t size)
+{
+ return GOMP_OFFLOAD_alloc_1 (ord, size, true);
+}
+
bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
@@ -1641,6 +1660,25 @@ GOMP_OFFLOAD_free (int ord, void *ptr)
&& nvptx_free (ptr, ptx_devices[ord]));
}
+bool
+GOMP_OFFLOAD_usm_free (int ord, void *ptr)
+{
+ return GOMP_OFFLOAD_free (ord, ptr);
+}
+
+bool
+GOMP_OFFLOAD_is_usm_ptr (void *ptr)
+{
+ bool managed = false;
+ /* This returns 3 outcomes ...
+ CUDA_ERROR_INVALID_VALUE - Not a Cuda allocated pointer.
+ CUDA_SUCCESS, managed:false - Cuda allocated, but not USM.
+ CUDA_SUCCESS, managed:true - USM. */
+ CUDA_CALL_NOCHECK (cuPointerGetAttribute, &managed,
+ CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr);
+ return managed;
+}
+
bool
GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size)
{
@@ -706,7 +706,9 @@ gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
splay_tree_key n = gomp_map_lookup (mem_map, &cur_node);
if (n == NULL)
{
- if (allow_zero_length_array_sections)
+ if (allow_zero_length_array_sections
+ || (devicep->is_usm_ptr_func
+ && devicep->is_usm_ptr_func ((void*)cur_node.host_start)))
cur_node.tgt_offset = cur_node.host_start;
else
{
@@ -859,6 +861,11 @@ gomp_attach_pointer (struct gomp_device_descr *devicep,
return;
}
+ if (devicep->is_usm_ptr_func
+ && devicep->is_usm_ptr_func ((void*)(target + bias)))
+ /* Nothing to do here. */
+ return;
+
s.host_start = target + bias;
s.host_end = s.host_start + 1;
tn = splay_tree_lookup (mem_map, &s);
@@ -955,6 +962,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **hostaddrs, size_t i)
switch (tgt->list[i].offset)
{
case OFFSET_INLINED:
+ case OFFSET_USM:
return (uintptr_t) hostaddrs[i];
case OFFSET_POINTER:
@@ -1038,6 +1046,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
{
int kind = get_kind (short_mapkind, kinds, i);
bool implicit = get_implicit (short_mapkind, kinds, i);
+ tgt->list[i].offset = 0;
if (hostaddrs[i] == NULL
|| (kind & typemask) == GOMP_MAP_FIRSTPRIVATE_INT)
{
@@ -1045,6 +1054,15 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
tgt->list[i].offset = OFFSET_INLINED;
continue;
}
+ else if (devicep->is_usm_ptr_func
+ && devicep->is_usm_ptr_func (hostaddrs[i]))
+ {
+ /* The memory is visible from both host and target
+ so nothing needs to be moved. */
+ tgt->list[i].key = NULL;
+ tgt->list[i].offset = OFFSET_USM;
+ continue;
+ }
else if ((kind & typemask) == GOMP_MAP_USE_DEVICE_PTR
|| (kind & typemask) == GOMP_MAP_USE_DEVICE_PTR_IF_PRESENT)
{
@@ -1398,6 +1416,8 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
bool implicit = get_implicit (short_mapkind, kinds, i);
if (hostaddrs[i] == NULL)
continue;
+ if (tgt->list[i].offset == OFFSET_USM)
+ continue;
switch (kind & typemask)
{
size_t align, len, first, last;
@@ -1595,6 +1615,8 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
continue;
}
default:
+ if (tgt->list[i].offset == OFFSET_INLINED)
+ continue;
break;
}
splay_tree_key k = &array->key;
@@ -4437,6 +4459,56 @@ omp_target_free (void *device_ptr, int device_num)
gomp_mutex_unlock (&devicep->lock);
}
+void *
+gomp_usm_alloc (size_t size)
+{
+ struct gomp_task_icv *icv = gomp_icv (false);
+ struct gomp_device_descr *devicep = resolve_device (icv->default_device_var,
+ false);
+ if (devicep == NULL)
+ return NULL;
+
+ if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+ || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+ return malloc (size);
+
+ void *ret = NULL;
+ gomp_mutex_lock (&devicep->lock);
+ if (devicep->usm_alloc_func)
+ ret = devicep->usm_alloc_func (devicep->target_id, size);
+ gomp_mutex_unlock (&devicep->lock);
+ return ret;
+}
+
+void
+gomp_usm_free (void *device_ptr)
+{
+ if (device_ptr == NULL)
+ return;
+
+ struct gomp_task_icv *icv = gomp_icv (false);
+ struct gomp_device_descr *devicep = resolve_device (icv->default_device_var,
+ false);
+ if (devicep == NULL)
+ return;
+
+ if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+ || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+ {
+ free (device_ptr);
+ return;
+ }
+
+ gomp_mutex_lock (&devicep->lock);
+ if (devicep->usm_free_func
+ && !devicep->usm_free_func (devicep->target_id, device_ptr))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ gomp_fatal ("error in freeing device memory block at %p", device_ptr);
+ }
+ gomp_mutex_unlock (&devicep->lock);
+}
+
/* Device (really: libgomp plugin) to use for paged-locked memory. We
assume there is either none or exactly one such device for the lifetime of
the process. */
@@ -5294,6 +5366,9 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
DLSYM (unload_image);
DLSYM (alloc);
DLSYM (free);
+ DLSYM_OPT (usm_alloc, usm_alloc);
+ DLSYM_OPT (usm_free, usm_free);
+ DLSYM_OPT (is_usm_ptr, is_usm_ptr);
DLSYM_OPT (page_locked_host_alloc, page_locked_host_alloc);
DLSYM_OPT (page_locked_host_free, page_locked_host_free);
DLSYM (dev2host);
@@ -594,3 +594,13 @@ int main() {
return 0;
} } "-lcuda -lcudart" ]
}
+
+# return 1 if OpenMP Unified Shared Memory is supported
+
+proc check_effective_target_omp_usm { } {
+ if { [libgomp_check_effective_target_offload_target "nvptx"] } {
+ return 1
+ }
+ return 0
+}
+
@@ -11,12 +11,13 @@
#pragma omp requires unified_shared_memory, unified_address, reverse_offload
-int a[10] = { 0 };
extern void foo (void);
int
main (void)
{
+ int *a = (int*)__builtin_calloc(10, sizeof (int));
+
#pragma omp target map(to: a)
for (int i = 0; i < 10; i++)
a[i] = i;
new file mode 100644
@@ -0,0 +1,77 @@
+/* Verify that on the host we can but on a device we cannot allocate 'ompx_gnu_host_mem_alloc' memory. */
+
+/* { dg-additional-options -DOFFLOAD_DEVICE { target offload_device } } */
+
+#include <omp.h>
+
+#pragma omp requires dynamic_allocators
+
+int main()
+{
+#pragma omp target
+ {
+ char *c, *c_;
+
+ c = omp_alloc(1, ompx_gnu_host_mem_alloc);
+#ifdef OFFLOAD_DEVICE
+ if (c)
+ __builtin_abort ();
+#else
+ if (!c)
+ __builtin_abort ();
+#endif
+ omp_free(c, ompx_gnu_host_mem_alloc);
+
+ c = omp_aligned_alloc(128, 256, ompx_gnu_host_mem_alloc);
+#ifdef OFFLOAD_DEVICE
+ if (c)
+ __builtin_abort ();
+#else
+ if (!c)
+ __builtin_abort ();
+#endif
+ omp_free(c, omp_null_allocator);
+
+ c = omp_calloc(1, 1, ompx_gnu_host_mem_alloc);
+#ifdef OFFLOAD_DEVICE
+ if (c)
+ __builtin_abort ();
+#else
+ if (!c)
+ __builtin_abort ();
+#endif
+ c_ = omp_realloc(c, 2, ompx_gnu_host_mem_alloc, ompx_gnu_host_mem_alloc);
+#ifdef OFFLOAD_DEVICE
+ if (c_)
+ __builtin_abort ();
+#else
+ if (!c_)
+ __builtin_abort ();
+#endif
+ c = omp_realloc(c_, 0, ompx_gnu_host_mem_alloc, ompx_gnu_host_mem_alloc);
+ if (c)
+ __builtin_abort ();
+
+ c = omp_aligned_calloc(64, 1, 512, ompx_gnu_host_mem_alloc);
+#ifdef OFFLOAD_DEVICE
+ if (c)
+ __builtin_abort ();
+#else
+ if (!c)
+ __builtin_abort ();
+#endif
+ c_ = omp_realloc(c, 2, c ? omp_null_allocator : ompx_gnu_host_mem_alloc, omp_null_allocator);
+#ifdef OFFLOAD_DEVICE
+ if (c_)
+ __builtin_abort ();
+#else
+ if (!c_)
+ __builtin_abort ();
+#endif
+ c = omp_realloc(c_, 0, omp_null_allocator, omp_null_allocator);
+ if (c)
+ __builtin_abort ();
+ }
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-require-effective-target omp_usm } */
+
+#include <omp.h>
+#include <stdint.h>
+
+int
+main ()
+{
+ int *a = (int *) omp_alloc(sizeof(int), ompx_gnu_unified_shared_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ *a = 42;
+ uintptr_t a_p = (uintptr_t)a;
+
+ #pragma omp target is_device_ptr(a)
+ {
+ if (*a != 42 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+ omp_free(a, ompx_gnu_unified_shared_mem_alloc);
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,33 @@
+/* { dg-do run } */
+/* { dg-require-effective-target omp_usm } */
+
+#include <omp.h>
+#include <stdint.h>
+
+int
+main ()
+{
+ int *a = (int *) omp_alloc(sizeof(int)*2, ompx_gnu_unified_shared_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ a[0] = 42;
+ a[1] = 43;
+
+ uintptr_t a_p = (uintptr_t)a;
+
+ #pragma omp target map(a[0])
+ {
+ if (a[0] != 42 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+ #pragma omp target map(a[1])
+ {
+ if (a[1] != 43 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+ omp_free(a, ompx_gnu_unified_shared_mem_alloc);
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-require-effective-target omp_usm } */
+
+#include <omp.h>
+#include <stdint.h>
+
+int
+main ()
+{
+ int *a = (int *) omp_alloc(sizeof(int)*2, ompx_gnu_unified_shared_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ a[0] = 42;
+ a[1] = 43;
+
+ uintptr_t a_p = (uintptr_t)a;
+
+#pragma omp target data map(a[0:2])
+ {
+#pragma omp target
+ {
+ if (a[0] != 42 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+#pragma omp target
+ {
+ if (a[1] != 43 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+ }
+
+ omp_free(a, ompx_gnu_unified_shared_mem_alloc);
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,37 @@
+/* { dg-do run } */
+/* { dg-require-effective-target omp_usm } */
+
+#include <omp.h>
+#include <stdint.h>
+
+int
+main ()
+{
+ int *a = (int *) omp_alloc(sizeof(int)*2, ompx_gnu_unified_shared_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ a[0] = 42;
+ a[1] = 43;
+
+ uintptr_t a_p = (uintptr_t)a;
+
+#pragma omp target enter data map(to:a[0:2])
+
+#pragma omp target
+ {
+ if (a[0] != 42 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+#pragma omp target
+ {
+ if (a[1] != 43 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+#pragma omp target exit data map(delete:a[0:2])
+
+ omp_free(a, ompx_gnu_unified_shared_mem_alloc);
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,28 @@
+/* { dg-do run } */
+/* { dg-require-effective-target omp_usm } */
+
+#include <omp.h>
+#include <stdint.h>
+
+#pragma omp requires unified_shared_memory
+
+int
+main ()
+{
+ int *a = (int *) omp_alloc(sizeof(int), ompx_gnu_host_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ a[0] = 42;
+
+ uintptr_t a_p = (uintptr_t)a;
+
+#pragma omp target map(a[0:1])
+ {
+ if (a[0] != 42 || a_p == (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+ omp_free(a, ompx_gnu_host_mem_alloc);
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,33 @@
+! { dg-do run }
+! { dg-require-effective-target omp_usm }
+
+! Ensure that derived types containing allocated values work
+! with Unified Shared Memory.
+
+program usm
+!$omp requires unified_shared_memory
+ use iso_fortran_env
+ implicit none
+
+ type :: struct
+ real(real64), allocatable :: v(:)
+ end type struct
+
+ integer :: index
+ type(struct) :: s
+
+ real(real64), allocatable :: expected(:)
+
+ allocate(s%v(100))
+ do index = 1, size(s%v)
+ s%v(index) = index
+ end do
+ allocate(expected, mold=s%v)
+ expected = s%v - 1._real64
+
+ !$omp target
+ s%v = s%v - 1._real64
+ !$omp end target
+
+ if (any(s%v /= expected)) STOP 1
+end program usm
From: Andrew Stubbs <ams@codesourcery.com>

This adds support for using Cuda Managed Memory with omp_alloc.  It will be
used as the underpinnings for "requires unified_shared_memory" in a later
patch.

There are two new predefined allocators, ompx_gnu_unified_shared_mem_alloc
and ompx_gnu_host_mem_alloc, plus corresponding memory spaces, which can be
used to allocate memory in the "managed" space and explicitly on the host
(it is intended that "malloc" will be intercepted by the compiler).

The nvptx plugin is modified to make the necessary Cuda calls, and libgomp is
modified to switch to shared-memory mode for USM-allocated mappings.

gcc/fortran/ChangeLog:

	* openmp.cc (is_predefined_allocator): Recognise new allocators.

include/ChangeLog:

	* cuda/cuda.h (CUdevice_attribute): Add definitions for
	CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
	CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR and
	CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY.
	(CUmemAttach_flags): New.
	(CUpointer_attribute): New.
	(cuMemAllocManaged): New prototype.
	(cuPointerGetAttribute): New prototype.

libgomp/ChangeLog:

	* allocator.c (ompx_gnu_max_predefined_alloc): Update.
	(predefined_ompx_gnu_alloc_mapping): Add
	ompx_gnu_unified_shared_mem_space and ompx_gnu_host_mem_space.
	(omp_init_allocator): Recognise ompx_gnu_unified_shared_mem_space
	and ompx_gnu_host_mem_space.
	* config/linux/allocator.c (linux_memspace_alloc): Support USM.
	(linux_memspace_calloc): Likewise.
	(linux_memspace_free): Likewise.
	(linux_memspace_realloc): Likewise.
	* config/nvptx/allocator.c: Include <assert.h>.
	(nvptx_memspace_alloc): Disallow host memory.
	(nvptx_memspace_calloc): Likewise.
	(nvptx_memspace_free): Likewise.
	(nvptx_memspace_realloc): Likewise.
	* libgomp-plugin.h (GOMP_OFFLOAD_usm_alloc): New prototype.
	(GOMP_OFFLOAD_usm_free): New prototype.
	(GOMP_OFFLOAD_is_usm_ptr): New prototype.
	* libgomp.h (gomp_usm_alloc): New prototype.
	(gomp_usm_free): New prototype.
	(OFFSET_USM): New define.
	(struct gomp_device_descr): Add USM functions.
	* omp.h.in (omp_memspace_handle_t): Add
	ompx_gnu_unified_shared_mem_space and ompx_gnu_host_mem_space.
	(omp_allocator_handle_t): Add ompx_gnu_unified_shared_mem_alloc
	and ompx_gnu_host_mem_alloc.
	* omp_lib.f90.in: Likewise.
	* omp_lib.h.in: Likewise.
	* plugin/cuda-lib.def (cuMemAllocManaged): Add new call.
	(cuPointerGetAttribute): Likewise.
	* plugin/plugin-nvptx.c (nvptx_alloc): Add "usm" parameter.
	Call cuMemAllocManaged as appropriate.
	(GOMP_OFFLOAD_get_num_devices): Allow
	GOMP_REQUIRES_UNIFIED_SHARED_MEMORY if the device supports
	managed memory or integrated memory.
	(GOMP_OFFLOAD_alloc): Move internals to ...
	(GOMP_OFFLOAD_alloc_1): ... this, and add usm parameter.
	(GOMP_OFFLOAD_usm_alloc): New function.
	(GOMP_OFFLOAD_usm_free): New function.
	(GOMP_OFFLOAD_is_usm_ptr): New function.
	* target.c (gomp_map_pointer): Add USM support.
	(gomp_attach_pointer): Likewise.
	(gomp_map_val): Likewise.
	(gomp_map_vars_internal): Likewise.
	(gomp_usm_alloc): New function.
	(gomp_usm_free): New function.
	(gomp_load_plugin_for_device): Add usm_alloc, usm_free, and
	is_usm_ptr.
	* testsuite/lib/libgomp.exp (check_effective_target_omp_usm): New.
	* testsuite/libgomp.c-c++-common/requires-5.c: Fix static data
	failure.
	* testsuite/libgomp.c/alloc-ompx_gnu_host_mem_alloc-1.c: New test.
	* testsuite/libgomp.c/usm-1.c: New test.
	* testsuite/libgomp.c/usm-2.c: New test.
	* testsuite/libgomp.c/usm-3.c: New test.
	* testsuite/libgomp.c/usm-4.c: New test.
	* testsuite/libgomp.c/usm-5.c: New test.
	* testsuite/libgomp.fortran/usm-3.f90: New test.
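For context, a minimal usage sketch (illustrative only, not part of the
patch): it allocates managed memory both through the new predefined
ompx_gnu_unified_shared_mem_alloc allocator and through a user-defined
allocator created from ompx_gnu_unified_shared_mem_space, then touches the
memory on the device without mapping it, much as the new usm-1.c test does.
Everything used here is either the standard OpenMP allocator API or an
identifier added by this patch.

/* Illustrative sketch only -- not part of the patch.  */
#include <omp.h>
#include <stddef.h>

int
main (void)
{
  /* Managed memory via the new predefined allocator.  */
  int *a = (int *) omp_alloc (16 * sizeof (int),
			      ompx_gnu_unified_shared_mem_alloc);

  /* Managed memory via a user-defined allocator over the new memory
     space; no additional traits.  */
  omp_allocator_handle_t al
    = omp_init_allocator (ompx_gnu_unified_shared_mem_space, 0, NULL);
  int *b = (int *) omp_alloc (16 * sizeof (int), al);

  if (!a || al == omp_null_allocator || !b)
    return 1;

  /* The pointers are valid on both host and device, so no mapping is
     needed; is_device_ptr is used as in the usm-1.c test.  */
  #pragma omp target is_device_ptr (a, b)
  for (int i = 0; i < 16; i++)
    {
      a[i] = i;
      b[i] = 2 * i;
    }

  int bad = 0;
  for (int i = 0; i < 16; i++)
    if (a[i] != i || b[i] != 2 * i)
      bad = 1;

  omp_free (a, ompx_gnu_unified_shared_mem_alloc);
  omp_free (b, al);
  omp_destroy_allocator (al);
  return bad;
}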
Co-authored-by: Kwok Cheung Yeung <kcyeung@baylibre.com>
Co-authored-by: Thomas Schwinge <tschwinge@baylibre.com>
---
 gcc/fortran/openmp.cc                         |  8 +-
 include/cuda/cuda.h                           | 13 ++++
 libgomp/allocator.c                           | 17 ++--
 libgomp/config/linux/allocator.c              | 21 ++++-
 libgomp/config/nvptx/allocator.c              | 10 +++
 libgomp/libgomp-plugin.h                      |  3 +
 libgomp/libgomp.h                             |  6 ++
 libgomp/omp.h.in                              |  4 +
 libgomp/omp_lib.f90.in                        |  8 ++
 libgomp/omp_lib.h.in                          | 10 +++
 libgomp/plugin/cuda-lib.def                   |  2 +
 libgomp/plugin/plugin-nvptx.c                 | 52 +++++++++++--
 libgomp/target.c                              | 77 ++++++++++++++++++-
 libgomp/testsuite/lib/libgomp.exp             | 10 +++
 .../libgomp.c-c++-common/requires-5.c         |  3 +-
 .../alloc-ompx_gnu_host_mem_alloc-1.c         | 77 +++++++++++++++++++
 libgomp/testsuite/libgomp.c/usm-1.c           | 25 ++++++
 libgomp/testsuite/libgomp.c/usm-2.c           | 33 ++++++++
 libgomp/testsuite/libgomp.c/usm-3.c           | 36 +++++++++
 libgomp/testsuite/libgomp.c/usm-4.c           | 37 +++++++++
 libgomp/testsuite/libgomp.c/usm-5.c           | 28 +++++++
 libgomp/testsuite/libgomp.fortran/usm-3.f90   | 33 ++++++++
 22 files changed, 491 insertions(+), 22 deletions(-)
 create mode 100644 libgomp/testsuite/libgomp.c/alloc-ompx_gnu_host_mem_alloc-1.c
 create mode 100644 libgomp/testsuite/libgomp.c/usm-1.c
 create mode 100644 libgomp/testsuite/libgomp.c/usm-2.c
 create mode 100644 libgomp/testsuite/libgomp.c/usm-3.c
 create mode 100644 libgomp/testsuite/libgomp.c/usm-4.c
 create mode 100644 libgomp/testsuite/libgomp.c/usm-5.c
 create mode 100644 libgomp/testsuite/libgomp.fortran/usm-3.f90