diff mbox series

[v2,6/8] amdgcn: libgomp plugin USM implementation

Message ID 20240628102449.562467-7-ams@baylibre.com
State New
Headers show
Series OpenMP: Unified Shared Memory via Managed Memory | expand

Commit Message

Andrew Stubbs June 28, 2024, 10:24 a.m. UTC
From: Andrew Stubbs <ams@codesourcery.com>

Implement the Unified Shared Memory API calls in the GCN plugin.

The AMD equivalent of "Managed Memory" means registering previously allocated
host memory as "coarse-grained" (whereas allocating coarse-grained memory via
hsa_allocate_memory allocates device-side memory, initially).  It's possible to
do this to ordinary host heap memory (i.e. from "malloc"), but a) this caused
mysterious crashes inside the HSA runtime (presumably an unfortunate
page-sharing situation), and b) it's unlikely that the malloc/free
implementation is optimized for avoiding page migrations (in general).

This implementation reuses the "usmpin" allocator (introduced in my previous
patch-set to optimize pinned memory allocation) to solve these issues.
Firstly, all USM memory is allocated from specially memmap'd pages to ensure
that as few pages as possible get migrated.  Secondly, the free chain is stored
in a side-table so that we can be sure that walking the chain doesn't migrate
all the pages back to the host, for no reason.

The HSA header files update included here were relicenced by AMD and sent to me
explicitly to enable this project. AMD retain the copyright (Q4 2022), as they
do for the headers already in-tree.  This is *not* just a random copy from the
other project with the incompatible license.  (The small change made recently
by Tobias has not been erased, however.)

include/ChangeLog:

	* hsa.h: Import a new version from AMD.
	* hsa_ext_amd.h: Likewise.
	* hsa_ext_image.h: Likewise.

libgomp/ChangeLog:

	* Makefile.in: Regenerate.
	* config/gcn/allocator.c (gcn_memspace_alloc): Disallow
	ompx_gnu_host_mem_space.
	(gcn_memspace_calloc): Likewise.
	(gcn_memspace_free): Likewise.
	(gcn_memspace_realloc): Likewise.
	* plugin/Makefrag.am
	(libgomp_plugin_gcn_la_SOURCES): Add usmpin-allocator.c.
	* plugin/plugin-gcn.c: Include libgomp.h, sys/mman.h, and unistd.h.
	(struct hsa_runtime_fn_info): Add hsa_amd_svm_attributes_set_fn.
	(dump_hsa_system_info): Dump HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED and
	HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT data.
	(init_hsa_runtime_functions): Load hsa_amd_svm_attributes_set.
	(usm_ctx): New variable.
	(usm_heap_pages): New.
	(usm_heap_create): New function.
	(GOMP_OFFLOAD_get_num_devices): Update comment only.
	(GOMP_OFFLOAD_usm_alloc): New function.
	(GOMP_OFFLOAD_usm_free): New function.
	(GOMP_OFFLOAD_is_usm_ptr): New function.
	* testsuite/lib/libgomp.exp (check_effective_target_omp_usm): Add
	amdgcn test.
	* testsuite/libgomp.c++/usm-1.C: Switch to omp_usm effective target.
	* testsuite/libgomp.c-c++-common/requires-1.c: Require omp_usm.
	* testsuite/libgomp.c-c++-common/requires-4.c: Skip AMD devices that
	don't support USM.
	* testsuite/libgomp.c-c++-common/requires-4a.c: Likewise.
	* testsuite/libgomp.c-c++-common/requires-5.c: Likewise.
	* testsuite/libgomp.c-c++-common/target-implicit-map-4.c: Likewise.
	* testsuite/libgomp.c/usm-1.c: Set amdgcn options.
	* testsuite/libgomp.c/usm-2.c: Likewise.
	* testsuite/libgomp.c/usm-3.c: Likewise.
	* testsuite/libgomp.c/usm-4.c: Likewise.
	* testsuite/libgomp.c/usm-5.c: Clarify host-fallback behaviour.
	* testsuite/libgomp.c/usm-6.c: Require omp_usm.
	* usmpin-allocator.c (gomp_fatal): Define.
	* usm-allocator.c: New file.
---
 include/hsa.h                                 |  28 +-
 include/hsa_ext_amd.h                         | 459 +++++++++++++++++-
 include/hsa_ext_image.h                       |   2 +-
 libgomp/Makefile.in                           |  13 +-
 libgomp/config/gcn/allocator.c                |  10 +
 libgomp/plugin/Makefrag.am                    |   2 +-
 libgomp/plugin/plugin-gcn.c                   | 169 ++++++-
 libgomp/testsuite/lib/libgomp.exp             |  12 +
 libgomp/testsuite/libgomp.c++/usm-1.C         |   2 +-
 .../libgomp.c-c++-common/requires-1.c         |   1 +
 .../libgomp.c-c++-common/requires-4.c         |   5 +-
 .../libgomp.c-c++-common/requires-4a.c        |   2 +
 .../libgomp.c-c++-common/requires-5.c         |   2 +
 .../target-implicit-map-4.c                   |   2 +
 libgomp/testsuite/libgomp.c/usm-1.c           |   1 +
 libgomp/testsuite/libgomp.c/usm-2.c           |   1 +
 libgomp/testsuite/libgomp.c/usm-3.c           |   1 +
 libgomp/testsuite/libgomp.c/usm-4.c           |   1 +
 libgomp/testsuite/libgomp.c/usm-5.c           |   2 +
 libgomp/testsuite/libgomp.c/usm-6.c           |   2 +-
 libgomp/usm-allocator.c                       | 232 +++++++++
 libgomp/usmpin-allocator.c                    |   3 +
 22 files changed, 922 insertions(+), 30 deletions(-)
 mode change 100644 => 100755 include/hsa.h
 mode change 100644 => 100755 include/hsa_ext_amd.h
 mode change 100644 => 100755 include/hsa_ext_image.h
 create mode 100644 libgomp/usm-allocator.c
diff mbox series

Patch

diff --git a/include/hsa.h b/include/hsa.h
old mode 100644
new mode 100755
index 3c7be95d7fd..28867a91a7c
--- a/include/hsa.h
+++ b/include/hsa.h
@@ -1,6 +1,6 @@ 
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Copyright (C) 2014-2020 Advanced Micro Devices Inc.  All rights reserved.
+// Copyright (C) 2014-2022 Advanced Micro Devices Inc.  All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person or organization
 // obtaining a copy of the software and accompanying documentation covered by
@@ -467,7 +467,19 @@  typedef enum {
   * String containing the ROCr build identifier.
   */
   HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200,
-
+  /**
+  * Returns true if hsa_amd_svm_* APIs are supported by the driver.  The type of
+  * this attribute is bool.
+  */
+  HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED = 0x201,
+  // TODO: Should this be per Agent?
+  /**
+  * Returns true if all Agents have access to system allocated memory (such as
+  * that allocated by mmap, malloc, or new) by default.
+  * If false then system allocated memory may only be made SVM accessible to
+  * an Agent by declaration of accessibility with hsa_amd_svm_set_attributes.
+  * The type of this attribute is bool.
+  */
   HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT = 0x202
 } hsa_system_info_t;
 
@@ -986,8 +998,16 @@  typedef enum {
    * Minor version of the HSA runtime specification supported by the
    * agent. The type of this attribute is uint16_t.
    */
-  HSA_AGENT_INFO_VERSION_MINOR = 22
-
+  HSA_AGENT_INFO_VERSION_MINOR = 22,
+  /**
+   * This enum does not have a fixed underlying type, thus in C++ post D2338:
+   * If the enumeration type does not have a fixed underlying type, the value is
+   * unchanged if the original value is within the range of the enumeration
+   * values (9.7.1 [dcl.enum]), and otherwise, the behavior is
+   * undefined.
+   * Thus increase the range of this enum to encompass vendor extensions.
+   */
+  HSA_AGENT_INFO_LAST = INT32_MAX
 } hsa_agent_info_t;
 
 /**
diff --git a/include/hsa_ext_amd.h b/include/hsa_ext_amd.h
old mode 100644
new mode 100755
index 1f93e0d96cc..6aac7392287
--- a/include/hsa_ext_amd.h
+++ b/include/hsa_ext_amd.h
@@ -1,6 +1,6 @@ 
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Copyright (C) 2014-2020 Advanced Micro Devices Inc.  All rights reserved.
+// Copyright (C) 2014-2022 Advanced Micro Devices Inc.  All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person or organization
 // obtaining a copy of the software and accompanying documentation covered by
@@ -41,6 +41,115 @@ 
 extern "C" {
 #endif
 
+/** \addtogroup aql Architected Queuing Language
+ *  @{
+ */
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_signal_condition_t constants.
+ */
+typedef uint32_t hsa_signal_condition32_t;
+
+/**
+ * @brief AMD vendor specific packet type.
+ */
+typedef enum {
+  /**
+   * Packet used by agents to delay processing of subsequent packets until a
+   * configurable condition is satisfied by an HSA signal.  Only kernel dispatch
+   * queues created from AMD GPU Agents support this packet.
+   */
+  HSA_AMD_PACKET_TYPE_BARRIER_VALUE = 2,
+} hsa_amd_packet_type_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_amd_packet_type_t constants.
+ */
+typedef uint8_t hsa_amd_packet_type8_t;
+
+/**
+ * @brief AMD vendor specific AQL packet header
+ */
+typedef struct hsa_amd_packet_header_s {
+  /**
+   * Packet header. Used to configure multiple packet parameters such as the
+   * packet type. The parameters are described by ::hsa_packet_header_t.
+   */
+  uint16_t header;
+
+  /**
+   *Format of the vendor specific packet.
+   */
+  hsa_amd_packet_type8_t AmdFormat;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint8_t reserved;
+} hsa_amd_vendor_packet_header_t;
+
+/**
+ * @brief AMD barrier value packet.  Halts packet processing and waits for
+ * (signal_value & ::mask) ::cond ::value to be satisfied, where signal_value
+ * is the value of the signal ::signal.
+ */
+typedef struct hsa_amd_barrier_value_packet_s {
+  /**
+   * AMD vendor specific packet header.
+   */
+  hsa_amd_vendor_packet_header_t header;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved0;
+
+  /**
+   * Dependent signal object. A signal with a handle value of 0 is
+   * allowed and is interpreted by the packet processor a satisfied
+   * dependency.
+   */
+  hsa_signal_t signal;
+
+  /**
+   * Value to compare against.
+   */
+  hsa_signal_value_t value;
+
+  /**
+   * Bit mask to be combined by bitwise AND with ::signal's value.
+   */
+  hsa_signal_value_t mask;
+
+  /**
+   * Comparison operation.  See ::hsa_signal_condition_t.
+   */
+  hsa_signal_condition32_t cond;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved1;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved2;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved3;
+
+  /**
+   * Signal used to indicate completion of the job. The application can use the
+   * special signal handle 0 to indicate that no signal is used.
+   */
+  hsa_signal_t completion_signal;
+} hsa_amd_barrier_value_packet_t;
+
+/** @} */
+
 /**
  * @brief Enumeration constants added to ::hsa_status_t.
  *
@@ -61,6 +170,20 @@  enum {
    * Agent executed an invalid shader instruction.
    */
   HSA_STATUS_ERROR_ILLEGAL_INSTRUCTION = 42,
+
+  /**
+   * Agent attempted to access an inaccessible address.
+   * See hsa_amd_register_system_event_handler and
+   * HSA_AMD_GPU_MEMORY_FAULT_EVENT for more information on illegal accesses.
+   */
+  HSA_STATUS_ERROR_MEMORY_FAULT = 43,
+
+  /**
+   * The CU mask was successfully set but the mask attempted to enable a CU
+   * which was disabled for the process.  CUs disabled for the process remain
+   * disabled.
+   */
+  HSA_STATUS_CU_MASK_REDUCED = 44,
 };
 
 /**
@@ -168,7 +291,38 @@  typedef enum hsa_amd_agent_info_s {
    * selective workarounds for hardware errata.
    * The type of this attribute is uint32_t.
    */
-  HSA_AMD_AGENT_INFO_ASIC_REVISION = 0xA012
+  HSA_AMD_AGENT_INFO_ASIC_REVISION = 0xA012,
+  /**
+   * Queries whether or not the host can directly access SVM memory that is
+   * physically resident in the agent's local memory.
+   * The type of this attribute is bool.
+   */
+  HSA_AMD_AGENT_INFO_SVM_DIRECT_HOST_ACCESS = 0xA013,
+  /**
+   * Some processors support more CUs than can reliably be used in a cooperative
+   * dispatch.  This queries the count of CUs which are fully enabled for
+   * cooperative dispatch.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_COOPERATIVE_COMPUTE_UNIT_COUNT = 0xA014,
+  /**
+   * Queries the amount of memory available in bytes accross all global pools
+   * owned by the agent.
+   * The type of this attribute is uint64_t.
+   */
+  HSA_AMD_AGENT_INFO_MEMORY_AVAIL = 0xA015,
+  /**
+   * Timestamp value increase rate, in Hz. The timestamp (clock) frequency is
+   * in the range 1-400MHz.
+   * The type of this attribute is uint64_t.
+   */
+  HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY = 0xA016,
+  /**
+   * Queries for the ASIC family ID of an agent.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_ASIC_FAMILY_ID = 0xA107
+
 } hsa_amd_agent_info_t;
 
 typedef struct hsa_amd_hdp_flush_s {
@@ -471,6 +625,37 @@  hsa_status_t HSA_API hsa_amd_signal_create(hsa_signal_value_t initial_value, uin
                                            const hsa_agent_t* consumers, uint64_t attributes,
                                            hsa_signal_t* signal);
 
+/**
+ * @brief Returns a pointer to the value of a signal.
+ *
+ * Use of this API does not modify the lifetime of ::signal and any
+ * hsa_signal_value_t retrieved by this API has lifetime equal to that of
+ * ::signal.
+ *
+ * This API is intended for partial interoperability with non-HSA compatible
+ * devices and should not be used where HSA interfaces are available.
+ *
+ * Use of the signal value must comply with use restritions of ::signal.
+ * Use may result in data races if the operations performed are not platform
+ * atomic.  Use with HSA_AMD_SIGNAL_AMD_GPU_ONLY or HSA_AMD_SIGNAL_IPC
+ * attributed signals is required.
+ *
+ * @param[in] Signal handle to extract the signal value pointer from.
+ *
+ * @param[out] Location where the extracted signal value pointer will be placed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL signal is not a valid hsa_signal_t
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT value_ptr is NULL.
+ */
+hsa_status_t hsa_amd_signal_value_pointer(hsa_signal_t signal,
+                                          volatile hsa_signal_value_t** value_ptr);
+
 /**
  * @brief Asyncronous signal handler function type.
  *
@@ -613,32 +798,69 @@  hsa_status_t HSA_API hsa_amd_image_get_info_max_dim(hsa_agent_t agent,
                                                     void* value);
 
 /**
- * @brief Set a CU affinity to specific queues within the process, this function
- * call is "atomic".
+ * @brief Set a queue's CU affinity mask.
+ *
+ * @details Enables the queue to run on only selected CUs.  The given mask is
+ * combined by bitwise AND with any device wide mask in HSA_CU_MASK before
+ * being applied.
+ * If num_cu_mask_count is 0 then the request is interpreted as a request to
+ * enable all CUs and no cu_mask array need be given.
  *
  * @param[in] queue A pointer to HSA queue.
  *
- * @param[in] num_cu_mask_count Size of CUMask bit array passed in.
+ * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits.
  *
  * @param[in] cu_mask Bit-vector representing the CU mask.
  *
  * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
  *
+ * @retval ::HSA_STATUS_CU_MASK_REDUCED The function was successfully executed
+ * but the given mask attempted to enable a CU which was disabled by
+ * HSA_CU_MASK.  CUs disabled by HSA_CU_MASK remain disabled.
+ *
  * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
  * initialized.
  *
  * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid.
  *
  * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is not
- * multiple of 32 or @p cu_mask is NULL.
- *
- * @retval ::HSA_STATUS_ERROR failed to call thunk api
+ * a multiple of 32 or @p num_cu_mask_count is not 0 and cu_mask is NULL.
+ * Devices with work group processors must even-index contiguous pairwise
+ * CU enable e.g. 0x33(b'110011) is valid while 0x5(0x101) and 0x6(b'0110)
+ * are invalid.
  *
  */
 hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
                                                uint32_t num_cu_mask_count,
                                                const uint32_t* cu_mask);
 
+/**
+ * @brief Retrieve a queue's CU affinity mask.
+ *
+ * @details Returns the first num_cu_mask_count bits of a queue's CU mask.
+ * Ensure that num_cu_mask_count is at least as large as
+ * HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT to retrieve the entire mask.
+ *
+ * @param[in] queue A pointer to HSA queue.
+ *
+ * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits.
+ *
+ * @param[out] cu_mask Bit-vector representing the CU mask.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is 0, not
+ * a multiple of 32 or @p cu_mask is NULL.
+ *
+ */
+hsa_status_t HSA_API hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count,
+                                               uint32_t* cu_mask);
+
 /**
  * @brief Memory segments associated with a memory pool.
  */
@@ -770,6 +992,24 @@  typedef enum {
   HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16,
 } hsa_amd_memory_pool_info_t;
 
+/**
+ * @brief Memory pool flag used to specify allocation directives
+ *
+ */
+typedef enum hsa_amd_memory_pool_flag_s {
+  /**
+   * Allocates memory that conforms to standard HSA memory consistency model
+   */
+  HSA_AMD_MEMORY_POOL_STANDARD_FLAG = 0,
+  /**
+   * Allocates fine grain memory type where memory ordering is per point to point
+   * connection. Atomic memory operations on these memory buffers are not
+   * guaranteed to be visible at system scope.
+   */
+  HSA_AMD_MEMORY_POOL_PCIE_FLAG = 1,
+
+} hsa_amd_memory_pool_flag_t;
+
 /**
  * @brief Get the current value of an attribute of a memory pool.
  *
@@ -838,7 +1078,7 @@  hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools(
  * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE in @p memory_pool.
  *
  * @param[in] flags A bit-field that is used to specify allocation
- * directives. Reserved parameter, must be 0.
+ * directives.
  *
  * @param[out] ptr Pointer to the location where to store the base virtual
  * address of
@@ -895,6 +1135,8 @@  hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr);
  *
  * @param[in] dst_agent Agent associated with the @p dst. The agent must be able to directly
  * access both the source and destination buffers in their current locations.
+ * May be zero in which case the runtime will attempt to discover the destination agent.
+ * Discovery may have variable and/or high latency.
  *
  * @param[in] src A valid pointer to the source of data to be copied. The source
  * buffer must not overlap with the destination buffer, otherwise the copy will succeed
@@ -902,6 +1144,8 @@  hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr);
  *
  * @param[in] src_agent Agent associated with the @p src. The agent must be able to directly
  * access both the source and destination buffers in their current locations.
+ * May be zero in which case the runtime will attempt to discover the destination agent.
+ * Discovery may have variable and/or high latency.
  *
  * @param[in] size Number of bytes to copy. If @p size is 0, no copy is
  * performed and the function returns success. Copying a number of bytes larger
@@ -912,9 +1156,9 @@  hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr);
  *
  * @param[in] dep_signals List of signals that must be waited on before the copy
  * operation starts. The copy will start after every signal has been observed with
- * the value 0. The dependent signal should not include completion signal from hsa_amd_memory_async_copy
- * operation to be issued in future as that can result in a deadlock. If @p num_dep_signals is 0, this
- * argument is ignored.
+ * the value 0. The dependent signal should not include completion signal from
+ * hsa_amd_memory_async_copy operation to be issued in future as that can result
+ * in a deadlock. If @p num_dep_signals is 0, this argument is ignored.
  *
  * @param[in] completion_signal Signal used to indicate completion of the copy
  * operation. When the copy operation is finished, the value of the signal is
@@ -929,7 +1173,7 @@  hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr);
  * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
  * initialized.
  *
- * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT An agent is invalid or no discovered agent has access.
  *
  * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p completion_signal is invalid.
  *
@@ -1568,6 +1812,12 @@  typedef struct hsa_amd_pointer_info_s {
   GPU boards) any such agent may be returned.
   */
   hsa_agent_t agentOwner;
+  /*
+  Contains a bitfield of hsa_amd_memory_pool_global_flag_t values.
+  Reports the effective global flags bitmask for the allocation.  This field is not meaningful if
+  the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN.
+  */
+  uint32_t global_flags;
 } hsa_amd_pointer_info_t;
 
 /**
@@ -1603,7 +1853,7 @@  typedef struct hsa_amd_pointer_info_s {
  *
  * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT NULL in @p ptr or @p info.
  */
-hsa_status_t HSA_API hsa_amd_pointer_info(void* ptr,
+hsa_status_t HSA_API hsa_amd_pointer_info(const void* ptr,
                                           hsa_amd_pointer_info_t* info,
                                           void* (*alloc)(size_t),
                                           uint32_t* num_agents_accessible,
@@ -1627,7 +1877,7 @@  hsa_status_t HSA_API hsa_amd_pointer_info(void* ptr,
  *
  * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is not known to ROCr.
  */
-hsa_status_t HSA_API hsa_amd_pointer_info_set_userdata(void* ptr,
+hsa_status_t HSA_API hsa_amd_pointer_info_set_userdata(const void* ptr,
                                                        void* userdata);
 
 /**
@@ -1801,11 +2051,11 @@  typedef enum {
   // GPU attempted access to a host only page.
   HSA_AMD_MEMORY_FAULT_HOST_ONLY = 1 << 3,
   // DRAM ECC failure.
-  HSA_AMD_MEMORY_FAULT_DRAM_ECC = 1 << 4,
+  HSA_AMD_MEMORY_FAULT_DRAMECC = 1 << 4,
   // Can't determine the exact fault address.
   HSA_AMD_MEMORY_FAULT_IMPRECISE = 1 << 5,
   // SRAM ECC failure (ie registers, no fault address).
-  HSA_AMD_MEMORY_FAULT_SRAM_ECC = 1 << 6,
+  HSA_AMD_MEMORY_FAULT_SRAMECC = 1 << 6,
   // GPU reset following unspecified hang.
   HSA_AMD_MEMORY_FAULT_HANG = 1 << 31
 } hsa_amd_memory_fault_reason_t;
@@ -1962,6 +2212,181 @@  hsa_status_t HSA_API hsa_amd_register_deallocation_callback(void* ptr,
 hsa_status_t HSA_API hsa_amd_deregister_deallocation_callback(void* ptr,
                                                       hsa_amd_deallocation_callback_t callback);
 
+typedef enum hsa_amd_svm_model_s {
+  /**
+   * Updates to memory with this attribute conform to HSA memory consistency
+   * model.
+   */
+  HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED = 0,
+  /**
+   * Writes to memory with this attribute can be performed by a single agent
+   * at a time.
+   */
+  HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED = 1,
+  /**
+   * Memory region queried contains subregions with both
+   * HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED and
+   * HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED attributes.
+   *
+   * This attribute can not be used in hsa_amd_svm_attributes_set.  It is a
+   * possible return from hsa_amd_svm_attributes_get indicating that the query
+   * region contains both coarse and fine grained memory.
+   */
+  HSA_AMD_SVM_GLOBAL_FLAG_INDETERMINATE = 2
+} hsa_amd_svm_model_t;
+
+typedef enum hsa_amd_svm_attribute_s {
+  // Memory model attribute.
+  // Type of this attribute is hsa_amd_svm_model_t.
+  HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG = 0,
+  // Marks the range read only.  This allows multiple physical copies to be
+  // placed local to each accessing device.
+  // Type of this attribute is bool.
+  HSA_AMD_SVM_ATTRIB_READ_ONLY = 1,
+  // Automatic migrations should attempt to keep the memory within the xgmi hive
+  // containing accessible agents.
+  // Type of this attribute is bool.
+  HSA_AMD_SVM_ATTRIB_HIVE_LOCAL = 2,
+  // Page granularity to migrate at once.  Page granularity is specified as
+  // log2(page_count).
+  // Type of this attribute is uint64_t.
+  HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY = 3,
+  // Physical location to prefer when automatic migration occurs.
+  // Set to the null agent handle (handle == 0) to indicate there
+  // is no preferred location.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION = 4,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_set (see
+  // ::hsa_amd_svm_prefetch_async).
+  // Queries the physical location of most recent prefetch command.
+  // If the prefetch location has not been set or is not uniform across the
+  // address range then returned hsa_agent_t::handle will be 0.
+  // Querying this attribute will return the destination agent of the most
+  // recent ::hsa_amd_svm_prefetch_async targeting the address range.  If
+  // multiple async prefetches have been issued targeting the region and the
+  // most recently issued prefetch has completed then the query will return
+  // the location of the most recently completed prefetch.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION = 5,
+  // Optimizes with the anticipation that the majority of operations to the
+  // range will be read operations.
+  // Type of this attribute is bool.
+  HSA_AMD_SVM_ATTRIB_READ_MOSTLY = 6,
+  // Allows the execution on GPU.
+  // Type of this attribute is bool.
+  HSA_AMD_SVM_ATTRIB_GPU_EXEC = 7,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_get.
+  // Enables an agent for access to the range.  Access may incur a page fault
+  // and associated memory migration.  Either this or
+  // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE is required prior to SVM
+  // access if HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE = 0x200,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_get.
+  // Enables an agent for access to the range without page faults.  Access
+  // will not incur a page fault and will not cause access based migration.
+  // and associated memory migration.  Either this or
+  // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE is required prior to SVM access if
+  // HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE = 0x201,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_get.
+  // Denies an agent access to the memory range.  Access will cause a terminal
+  // segfault.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS = 0x202,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_set.
+  // Returns the access attribute associated with the agent.
+  // The agent to query must be set in the attribute value field.
+  // The attribute enum will be replaced with the agent's current access
+  // attribute for the address range.
+  // TODO: Clarify KFD return value for non-uniform access attribute.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_ACCESS_QUERY = 0x203,
+} hsa_amd_svm_attribute_t;
+
+// List type for hsa_amd_svm_attributes_set/get. 
+typedef struct hsa_amd_svm_attribute_pair_s {
+  // hsa_amd_svm_attribute_t value.
+  uint64_t attribute;
+  // Attribute value.  Bit values should be interpreted according to the type
+  // given in the associated attribute description.
+  uint64_t value;
+} hsa_amd_svm_attribute_pair_t;
+
+/**
+ * @brief Sets SVM memory attributes.
+ *
+ * If HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT returns false then enabling
+ * access to an Agent via this API (setting HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE
+ * or HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE) is required prior to SVM
+ * memory access by that Agent.
+ *
+ * Attributes HSA_AMD_SVM_ATTRIB_ACCESS_QUERY and HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION
+ * may not be used with this API.
+ *
+ * @param[in] ptr Will be aligned down to nearest page boundary.
+ *
+ * @param[in] size Will be aligned up to nearest page boundary.
+ *
+ * @param[in] attribute_list List of attributes to set for the address range.
+ *
+ * @param[in] attribute_count Length of @p attribute_list.
+ */
+hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size,
+                                        hsa_amd_svm_attribute_pair_t* attribute_list,
+                                        size_t attribute_count);
+
+/**
+ * @brief Gets SVM memory attributes.
+ *
+ * Attributes HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE,
+ * HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE and
+ * HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION may not be used with this API.
+ *
+ * Note that attribute HSA_AMD_SVM_ATTRIB_ACCESS_QUERY takes as input an
+ * hsa_agent_t and returns the current access type through its attribute field.
+ *
+ * @param[in] ptr Will be aligned down to nearest page boundary.
+ *
+ * @param[in] size Will be aligned up to nearest page boundary.
+ *
+ * @param[in] attribute_list List of attributes to set for the address range.
+ *
+ * @param[in] attribute_count Length of @p attribute_list.
+ */
+hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size,
+                                        hsa_amd_svm_attribute_pair_t* attribute_list,
+                                        size_t attribute_count);
+
+/**
+ * @brief Asynchronously migrates memory to an agent.
+ *
+ * Schedules memory migration to @p agent when @p dep_signals have been observed equal to zero.
+ * @p completion_signal will decrement when the migration is complete.
+ *
+ * @param[in] ptr Will be aligned down to nearest page boundary.
+ *
+ * @param[in] size Will be aligned up to nearest page boundary.
+ *
+ * @param[in] agent Agent to migrate to.
+ *
+ * @param[in] num_dep_signals Number of dependent signals. Can be 0.
+ *
+ * @param[in] dep_signals List of signals that must be waited on before the migration
+ * operation starts. The migration will start after every signal has been observed with
+ * the value 0. If @p num_dep_signals is 0, this argument is ignored.
+ *
+ * @param[in] completion_signal Signal used to indicate completion of the migration
+ * operation. When the migration operation is finished, the value of the signal is
+ * decremented. The runtime indicates that an error has occurred during the copy
+ * operation by setting the value of the completion signal to a negative
+ * number. If no completion signal is required this handle may be null.
+ */
+hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent,
+                                        uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
+                                        hsa_signal_t completion_signal);
+
 #ifdef __cplusplus
 }  // end extern "C" block
 #endif
diff --git a/include/hsa_ext_image.h b/include/hsa_ext_image.h
old mode 100644
new mode 100755
index 52b695bc9fa..eee1f807f5b
--- a/include/hsa_ext_image.h
+++ b/include/hsa_ext_image.h
@@ -1,6 +1,6 @@ 
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Copyright (C) 2014-2020 Advanced Micro Devices Inc.  All rights reserved.
+// Copyright (C) 2014-2022 Advanced Micro Devices Inc.  All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person or organization
 // obtaining a copy of the software and accompanying documentation covered by
diff --git a/libgomp/Makefile.in b/libgomp/Makefile.in
index b74e39a1c2a..e0b26d9e7f1 100644
--- a/libgomp/Makefile.in
+++ b/libgomp/Makefile.in
@@ -185,7 +185,8 @@  am__DEPENDENCIES_1 =
 @PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_DEPENDENCIES = libgomp.la \
 @PLUGIN_GCN_TRUE@	$(am__DEPENDENCIES_1)
 @PLUGIN_GCN_TRUE@am_libgomp_plugin_gcn_la_OBJECTS =  \
-@PLUGIN_GCN_TRUE@	libgomp_plugin_gcn_la-plugin-gcn.lo
+@PLUGIN_GCN_TRUE@	libgomp_plugin_gcn_la-plugin-gcn.lo \
+@PLUGIN_GCN_TRUE@	libgomp_plugin_gcn_la-usmpin-allocator.lo
 libgomp_plugin_gcn_la_OBJECTS = $(am_libgomp_plugin_gcn_la_OBJECTS)
 AM_V_lt = $(am__v_lt_@AM_V@)
 am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
@@ -581,7 +582,7 @@  libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \
 
 # AMD GCN plugin
 @PLUGIN_GCN_TRUE@libgomp_plugin_gcn_version_info = -version-info $(libtool_VERSION)
-@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c usmpin-allocator.c
 @PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_CPPFLAGS = $(AM_CPPFLAGS) \
 @PLUGIN_GCN_TRUE@	-D_GNU_SOURCE
 
@@ -758,6 +759,7 @@  distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter_ull.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp-plugin.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_gcn_la-plugin-gcn.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_gcn_la-usmpin-allocator.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lock.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/loop.Plo@am__quote@
@@ -819,6 +821,13 @@  libgomp_plugin_gcn_la-plugin-gcn.lo: plugin/plugin-gcn.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libgomp_plugin_gcn_la-plugin-gcn.lo `test -f 'plugin/plugin-gcn.c' || echo '$(srcdir)/'`plugin/plugin-gcn.c
 
+libgomp_plugin_gcn_la-usmpin-allocator.lo: usmpin-allocator.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libgomp_plugin_gcn_la-usmpin-allocator.lo -MD -MP -MF $(DEPDIR)/libgomp_plugin_gcn_la-usmpin-allocator.Tpo -c -o libgomp_plugin_gcn_la-usmpin-allocator.lo `test -f 'usmpin-allocator.c' || echo '$(srcdir)/'`usmpin-allocator.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libgomp_plugin_gcn_la-usmpin-allocator.Tpo $(DEPDIR)/libgomp_plugin_gcn_la-usmpin-allocator.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='usmpin-allocator.c' object='libgomp_plugin_gcn_la-usmpin-allocator.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libgomp_plugin_gcn_la-usmpin-allocator.lo `test -f 'usmpin-allocator.c' || echo '$(srcdir)/'`usmpin-allocator.c
+
 libgomp_plugin_nvptx_la-plugin-nvptx.lo: plugin/plugin-nvptx.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_nvptx_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_nvptx_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libgomp_plugin_nvptx_la-plugin-nvptx.lo -MD -MP -MF $(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Tpo -c -o libgomp_plugin_nvptx_la-plugin-nvptx.lo `test -f 'plugin/plugin-nvptx.c' || echo '$(srcdir)/'`plugin/plugin-nvptx.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Tpo $(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Plo
diff --git a/libgomp/config/gcn/allocator.c b/libgomp/config/gcn/allocator.c
index 99cba6c9265..20405618caf 100644
--- a/libgomp/config/gcn/allocator.c
+++ b/libgomp/config/gcn/allocator.c
@@ -36,6 +36,7 @@ 
    when the memspace access trait is set accordingly.  */
 
 #include "libgomp.h"
+#include <assert.h>
 #include <stdlib.h>
 
 #define BASIC_ALLOC_PREFIX __gcn_lowlat
@@ -56,6 +57,8 @@  gcn_memspace_alloc (omp_memspace_handle_t memspace, size_t size)
 
       return __gcn_lowlat_alloc (shared_pool, size);
     }
+  else if (memspace == ompx_gnu_host_mem_space)
+    return NULL;
   else
     return malloc (size);
 }
@@ -69,6 +72,8 @@  gcn_memspace_calloc (omp_memspace_handle_t memspace, size_t size)
 
       return __gcn_lowlat_calloc (shared_pool, size);
     }
+  else if (memspace == ompx_gnu_host_mem_space)
+    return NULL;
   else
     return calloc (1, size);
 }
@@ -82,6 +87,9 @@  gcn_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size)
 
       __gcn_lowlat_free (shared_pool, addr, size);
     }
+  else if (memspace == ompx_gnu_host_mem_space)
+    /* Just verify what all allocator functions return.  */
+    assert (addr == NULL);
   else
     free (addr);
 }
@@ -96,6 +104,8 @@  gcn_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
 
       return __gcn_lowlat_realloc (shared_pool, addr, oldsize, size);
     }
+  else if (memspace == ompx_gnu_host_mem_space)
+    return NULL;
   else
     return realloc (addr, size);
 }
diff --git a/libgomp/plugin/Makefrag.am b/libgomp/plugin/Makefrag.am
index c9058709ad0..56dfbe3538a 100644
--- a/libgomp/plugin/Makefrag.am
+++ b/libgomp/plugin/Makefrag.am
@@ -57,7 +57,7 @@  if PLUGIN_GCN
 # AMD GCN plugin
 libgomp_plugin_gcn_version_info = -version-info $(libtool_VERSION)
 toolexeclib_LTLIBRARIES += libgomp-plugin-gcn.la
-libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c
+libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c usmpin-allocator.c
 libgomp_plugin_gcn_la_CPPFLAGS = $(AM_CPPFLAGS) \
 	-D_GNU_SOURCE
 libgomp_plugin_gcn_la_LDFLAGS = $(libgomp_plugin_gcn_version_info) \
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index c8c588e8efa..dd9c42c0b8c 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -43,11 +43,14 @@ 
 #include <signal.h>
 #include "libgomp-plugin.h"
 #include "config/gcn/libgomp-gcn.h"  /* For struct output.  */
+#include "libgomp.h" /* For usmpin API.  */
 #include "gomp-constants.h"
 #include <elf.h>
 #include "oacc-plugin.h"
 #include "oacc-int.h"
 #include <assert.h>
+#include <sys/mman.h>
+#include <unistd.h>
 
 /* These probably won't be in elf.h for a while.  */
 #ifndef R_AMDGPU_NONE
@@ -206,6 +209,9 @@  struct hsa_runtime_fn_info
      const hsa_dim3_t *range, hsa_agent_t copy_agent,
      hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
      const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
+  hsa_status_t (*hsa_amd_svm_attributes_set_fn)
+    (void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list,
+     size_t attribute_count);
 };
 
 /* Structure describing the run-time and grid properties of an HSA kernel
@@ -709,6 +715,24 @@  dump_hsa_system_info (void)
     }
   else
     GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
+
+  bool svm_supported;
+  status = hsa_fns.hsa_system_get_info_fn
+    (HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED, &svm_supported);
+  if (status == HSA_STATUS_SUCCESS)
+    GCN_DEBUG ("HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED: %s\n",
+	       (svm_supported ? "TRUE" : "FALSE"));
+  else
+    GCN_WARNING ("HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED: FAILED\n");
+
+  bool svm_accessible;
+  status = hsa_fns.hsa_system_get_info_fn
+    (HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT, &svm_accessible);
+  if (status == HSA_STATUS_SUCCESS)
+    GCN_DEBUG ("HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT: %s\n",
+	       (svm_accessible ? "TRUE" : "FALSE"));
+  else
+    GCN_WARNING ("HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT: FAILED\n");
 }
 
 /* Dump information about the available hardware.  */
@@ -1426,6 +1450,7 @@  init_hsa_runtime_functions (void)
   DLSYM_OPT_FN (hsa_amd_memory_lock)
   DLSYM_OPT_FN (hsa_amd_memory_unlock)
   DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
+  DLSYM_OPT_FN (hsa_amd_svm_attributes_set)
   return true;
 #undef DLSYM_OPT_FN
 #undef DLSYM_FN
@@ -3165,6 +3190,107 @@  wait_queue (struct goacc_asyncqueue *aq)
 }
 
 /* }}}  */
+/* {{{ Unified Shared Memory
+
+   Normal heap memory is already enabled for USM, but by default it is "fine-
+   grained" memory, meaning that the GPU must access it via the system bus,
+   slowly.  Changing the page to "coarse-grained" mode means that the page
+   is migrated on-demand and can therefore be accessed quickly by both CPU and
+   GPU (although care should be taken to prevent thrashing the page back and
+   forth).
+
+   GOMP_OFFLOAD_alloc also allocates coarse-grained memory, but in that case
+   the initial location is GPU memory; GOMP_OFFLOAD_usm_alloc returns system
+   memory configure coarse-grained.
+
+   The USM memory space is allocated as a largish block and then subdivided
+   via a custom allocator.  (It would be possible to reconfigure regular
+   "malloc'd" memory, but if it ends up on the same page as memory used by
+   the HSA driver then bad things happen.)  */
+
+usmpin_ctx_p usm_ctx = NULL;
+
+/* Record a list of the memory blocks configured for USM.  */
+static struct usm_heap_pages {
+  void *start;
+  void *end;
+  struct usm_heap_pages *next;
+} *usm_heap_pages = NULL;
+
+/* Initialize or extend the USM memory space.  This is called whenever
+   allocation fails.  SIZE is the minimum size required for the failed
+   allocation to succeed; the function may choose a larger size.
+   Note that Linux lazy allocation means that the memory returned isn't
+   guarenteed to acually exist.  */
+
+static bool
+usm_heap_create (size_t size)
+{
+  static int lock = 0;
+  while (__atomic_exchange_n (&lock, 1, MEMMODEL_ACQUIRE) != 0)
+    ;
+
+  size_t default_size = 1L * 1024 * 1024 * 1024; /* 1GB */
+  if (size < default_size)
+    size = default_size;
+
+  /* Round up to a whole page.  */
+  int pagesize = getpagesize ();
+  int misalignment = size % pagesize;
+  if (misalignment > 0)
+    size += pagesize - misalignment;
+
+  /* Try to get contiguous memory, but it might not be possible.
+     The most recent previous allocation is at the head of the list.  */
+  static void *addrhint = NULL;
+  void *new_pages = mmap (addrhint, size, PROT_READ | PROT_WRITE,
+			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if (!new_pages)
+    {
+      GCN_DEBUG ("Could not allocate Unified Shared Memory heap.");
+      __atomic_store_n (&lock, 0, MEMMODEL_RELEASE);
+      return false;
+    }
+
+  /* Register the heap allocation as coarse grained, which implies USM.  */
+  struct hsa_amd_svm_attribute_pair_s attr = {
+    HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG,
+    HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED
+  };
+  hsa_status_t status = hsa_fns.hsa_amd_svm_attributes_set_fn (new_pages, size,
+							       &attr, 1);
+  if (status != HSA_STATUS_SUCCESS)
+    GOMP_PLUGIN_fatal ("Failed to allocate Unified Shared Memory;"
+		       " please update your drivers and/or kernel");
+
+  /* Record the page addresses for use in GOMP_OFFLOAD_is_usm_ptr.  */
+  if (new_pages == addrhint)
+    {
+      /* We got contiguous pages, so we don't need extra list entries.  */
+      usm_heap_pages->end += size;
+    }
+  else
+    {
+      /* We need a new list entry to record a discontiguous range.  */
+      struct usm_heap_pages *page = malloc (sizeof (*page));
+      page->start = new_pages;
+      page->end = new_pages + size;
+      page->next = usm_heap_pages;
+      usm_heap_pages = page;
+    }
+  addrhint = new_pages + size;
+
+  /* Initialize a new USM heap, or add the new memory into an existing USM
+     heap.  */
+  if (!usm_ctx)
+    usm_ctx = usmpin_init_context (new_pages, size);
+  usmpin_register_memory (usm_ctx, new_pages, size);
+
+  __atomic_store_n (&lock, 0, MEMMODEL_RELEASE);
+  return true;
+}
+
+/* }}} */
 /* {{{ OpenACC support  */
 
 /* Execute an OpenACC kernel, synchronously or asynchronously.  */
@@ -3385,7 +3511,9 @@  GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
   if (!init_hsa_context (true))
     exit (EXIT_FAILURE);
   /* Return -1 if no omp_requires_mask cannot be fulfilled but
-     devices were present.  */
+     devices were present.
+     Note: not all devices support USM, but the compiler refuses to create
+     binaries for those that don't anyway.  */
   if (hsa_context.agent_count > 0
       && ((omp_requires_mask
 	   & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
@@ -4446,6 +4574,45 @@  GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
 		       GOMP_PLUGIN_target_task_completion, async_data);
 }
 
+/* Allocate memory suitable for Unified Shared Memory.  */
+
+void *
+GOMP_OFFLOAD_usm_alloc (int device, size_t size)
+{
+  while (1)
+    {
+      void *result = usmpin_alloc (usm_ctx, size);
+      if (result)
+	return result;
+
+      /* Allocation failed.  Try again if we can create a new heap block.
+	 Note: it's possible another thread could get to the new memory
+	 first, so the while loop is necessary. */
+      if (!usm_heap_create (size))
+	return NULL;
+    }
+}
+
+/* Free memory allocated via GOMP_OFFLOAD_usm_alloc.  */
+
+bool
+GOMP_OFFLOAD_usm_free (int device, void *ptr)
+{
+  usmpin_free (usm_ctx, ptr);
+  return true;
+}
+
+/* True if the memory was allocated via GOMP_OFFLOAD_usm_alloc.  */
+
+bool
+GOMP_OFFLOAD_is_usm_ptr (void *ptr)
+{
+  for (struct usm_heap_pages *heap = usm_heap_pages; heap; heap = heap->next)
+    if (ptr >= (void*)heap && ptr < heap->end)
+      return true;
+  return false;
+}
+
 /* }}} */
 /* {{{ OpenACC Plugin API  */
 
diff --git a/libgomp/testsuite/lib/libgomp.exp b/libgomp/testsuite/lib/libgomp.exp
index 007bdf2d5c4..f9667cf0494 100644
--- a/libgomp/testsuite/lib/libgomp.exp
+++ b/libgomp/testsuite/lib/libgomp.exp
@@ -601,6 +601,18 @@  proc check_effective_target_omp_usm { } {
     if { [libgomp_check_effective_target_offload_target "nvptx"] } {
 	return 1
     }
+
+    if { [libgomp_check_effective_target_offload_target "amdgcn"] } {
+	return [check_no_compiler_messages omp_usm executable {
+           #pragma omp requires unified_shared_memory
+	   int main () {
+	     #pragma omp target
+	       ;
+	     return 0;
+	   }
+	}]
+    }
+
     return 0
 }
 
diff --git a/libgomp/testsuite/libgomp.c++/usm-1.C b/libgomp/testsuite/libgomp.c++/usm-1.C
index fea25e5f10b..6e88f90d61f 100644
--- a/libgomp/testsuite/libgomp.c++/usm-1.C
+++ b/libgomp/testsuite/libgomp.c++/usm-1.C
@@ -1,5 +1,5 @@ 
 /* { dg-do run } */
-/* { dg-skip-if "Only valid for nvptx" { ! offload_target_nvptx } } */
+/* { dg-require-effective-target omp_usm } */
 #include <stdint.h>
 
 #pragma omp requires unified_shared_memory
diff --git a/libgomp/testsuite/libgomp.c-c++-common/requires-1.c b/libgomp/testsuite/libgomp.c-c++-common/requires-1.c
index 31996f1ecf6..d8b8119d4fa 100644
--- a/libgomp/testsuite/libgomp.c-c++-common/requires-1.c
+++ b/libgomp/testsuite/libgomp.c-c++-common/requires-1.c
@@ -1,3 +1,4 @@ 
+/* { dg-require-effective-target omp_usm } */
 /* { dg-do link { target offload_target_any } } */
 /* { dg-additional-sources requires-1-aux.c } */
 
diff --git a/libgomp/testsuite/libgomp.c-c++-common/requires-4.c b/libgomp/testsuite/libgomp.c-c++-common/requires-4.c
index b19d9bbc20d..a459b4524b8 100644
--- a/libgomp/testsuite/libgomp.c-c++-common/requires-4.c
+++ b/libgomp/testsuite/libgomp.c-c++-common/requires-4.c
@@ -1,10 +1,11 @@ 
+/* { dg-skip-if "Not all devices allow USM" { offload_device_gcn && { ! omp_usm } } } */
+/* { dg-xfail-run-if "USM via -foffload-memory=... does not support static variables" { offload_device_nvptx || offload_device_gcn } } */
+
 /* { dg-require-effective-target lto } */
 /* { dg-additional-options "-flto" } */
 /* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */
 /* { dg-additional-sources requires-4-aux.c } */
 
-/* { dg-xfail-run-if "USM via -foffload-memory=... does not support static variables" { offload_device_nvptx || offload_device_gcn } } */
-
 /* Check no diagnostic by device-compiler's or host compiler's lto1.
    Other file uses: 'requires reverse_offload', but that's inactive as
    there are no declare target directives, device constructs nor device routines  */
diff --git a/libgomp/testsuite/libgomp.c-c++-common/requires-4a.c b/libgomp/testsuite/libgomp.c-c++-common/requires-4a.c
index 0e0db927c2c..38dba827b2a 100644
--- a/libgomp/testsuite/libgomp.c-c++-common/requires-4a.c
+++ b/libgomp/testsuite/libgomp.c-c++-common/requires-4a.c
@@ -1,3 +1,5 @@ 
+/* { dg-skip-if "Not all devices allow USM" { offload_device_gcn && { ! omp_usm } } } */
+
 /* { dg-require-effective-target lto } */
 /* { dg-additional-options "-flto" } */
 /* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */
diff --git a/libgomp/testsuite/libgomp.c-c++-common/requires-5.c b/libgomp/testsuite/libgomp.c-c++-common/requires-5.c
index 0f839ef2957..f41d7006aba 100644
--- a/libgomp/testsuite/libgomp.c-c++-common/requires-5.c
+++ b/libgomp/testsuite/libgomp.c-c++-common/requires-5.c
@@ -1,3 +1,5 @@ 
+/* { dg-skip-if "Not all devices allow USM" { offload_device_gcn && { ! omp_usm } } } */
+
 /* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */
 /* { dg-additional-sources requires-5-aux.c } */
 
diff --git a/libgomp/testsuite/libgomp.c-c++-common/target-implicit-map-4.c b/libgomp/testsuite/libgomp.c-c++-common/target-implicit-map-4.c
index d0b0cd178c0..2766312292b 100644
--- a/libgomp/testsuite/libgomp.c-c++-common/target-implicit-map-4.c
+++ b/libgomp/testsuite/libgomp.c-c++-common/target-implicit-map-4.c
@@ -4,6 +4,8 @@ 
    and for not mapping the stack variables 'A' and 'B' (not mapped
    but accessible -> USM makes this tested feature even more important.)  */
 
+/* { dg-skip-if "Not all devices allow USM" { offload_device_gcn && { ! omp_usm } } } */
+
 #pragma omp requires unified_shared_memory
 
 /* Ensure that defaultmap(default : pointer) uses correct OpenMP 5.2
diff --git a/libgomp/testsuite/libgomp.c/usm-1.c b/libgomp/testsuite/libgomp.c/usm-1.c
index c8e8a9328ee..8d0e0e05198 100644
--- a/libgomp/testsuite/libgomp.c/usm-1.c
+++ b/libgomp/testsuite/libgomp.c/usm-1.c
@@ -1,5 +1,6 @@ 
 /* { dg-do run } */
 /* { dg-require-effective-target omp_usm } */
+/* { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn } } */
 
 #include <omp.h>
 #include <stdint.h>
diff --git a/libgomp/testsuite/libgomp.c/usm-2.c b/libgomp/testsuite/libgomp.c/usm-2.c
index 9f414b16319..4609f89ed0c 100644
--- a/libgomp/testsuite/libgomp.c/usm-2.c
+++ b/libgomp/testsuite/libgomp.c/usm-2.c
@@ -1,5 +1,6 @@ 
 /* { dg-do run } */
 /* { dg-require-effective-target omp_usm } */
+/* { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn } } */
 
 #include <omp.h>
 #include <stdint.h>
diff --git a/libgomp/testsuite/libgomp.c/usm-3.c b/libgomp/testsuite/libgomp.c/usm-3.c
index d7a77a5c2ee..c81ff20e1ff 100644
--- a/libgomp/testsuite/libgomp.c/usm-3.c
+++ b/libgomp/testsuite/libgomp.c/usm-3.c
@@ -1,5 +1,6 @@ 
 /* { dg-do run } */
 /* { dg-require-effective-target omp_usm } */
+/* { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn } } */
 
 #include <omp.h>
 #include <stdint.h>
diff --git a/libgomp/testsuite/libgomp.c/usm-4.c b/libgomp/testsuite/libgomp.c/usm-4.c
index 825bb4e8b3e..8565018ce52 100644
--- a/libgomp/testsuite/libgomp.c/usm-4.c
+++ b/libgomp/testsuite/libgomp.c/usm-4.c
@@ -1,5 +1,6 @@ 
 /* { dg-do run } */
 /* { dg-require-effective-target omp_usm } */
+/* { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn } } */
 
 #include <omp.h>
 #include <stdint.h>
diff --git a/libgomp/testsuite/libgomp.c/usm-5.c b/libgomp/testsuite/libgomp.c/usm-5.c
index 00332050591..865737b9417 100644
--- a/libgomp/testsuite/libgomp.c/usm-5.c
+++ b/libgomp/testsuite/libgomp.c/usm-5.c
@@ -19,6 +19,8 @@  main ()
 
 #pragma omp target map(a[0:1])
     {
+      if (omp_is_initial_device())
+	__builtin_abort ();  // The test will not work in fallback mode
       if (a[0] != 42 || a_p == (uintptr_t)a)
 	__builtin_abort ();
     }
diff --git a/libgomp/testsuite/libgomp.c/usm-6.c b/libgomp/testsuite/libgomp.c/usm-6.c
index 1598117f2b2..1ee63641bb8 100644
--- a/libgomp/testsuite/libgomp.c/usm-6.c
+++ b/libgomp/testsuite/libgomp.c/usm-6.c
@@ -1,5 +1,5 @@ 
 /* { dg-do run } */
-/* { dg-skip-if "Only valid for nvptx" { ! offload_target_nvptx } } */
+/* { dg-require-effective-target omp_usm } */
 
 #include <stdint.h>
 #include <stdlib.h>
diff --git a/libgomp/usm-allocator.c b/libgomp/usm-allocator.c
new file mode 100644
index 00000000000..68c1ebafec2
--- /dev/null
+++ b/libgomp/usm-allocator.c
@@ -0,0 +1,232 @@ 
+/* Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is a simple "malloc" implementation intended for use with Unified
+   Shared Memory.  It allocates memory from a pool allocated and configured
+   by the device plugin, and is intended to be #included from there.  It
+   keeps the allocated/free chain in a side-table (splay tree) to ensure that
+   the allocation routine does not migrate all the USM pages back into host
+   memory.  */
+
+
+#include "libgomp.h"
+
+/* Use a splay tree to track USM allocations.  */
+
+typedef struct usm_splay_tree_node_s *usm_splay_tree_node;
+typedef struct usm_splay_tree_s *usm_splay_tree;
+typedef struct usm_splay_tree_key_s *usm_splay_tree_key;
+
+struct usm_splay_tree_key_s {
+  void *base;
+  size_t size;
+};
+
+static inline int
+usm_splay_compare (usm_splay_tree_key x, usm_splay_tree_key y)
+{
+  return (x->base == y->base ? 0
+	  : x->base > y->base ? 1
+	  : -1);
+}
+#define splay_tree_prefix usm
+#include "splay-tree.h"
+
+static int usm_lock = 0;
+static struct usm_splay_tree_s usm_allocations = { NULL };
+static struct usm_splay_tree_s usm_free_space = { NULL };
+
+/* 128-byte granularity means GPU cache-line aligned.  */
+#define ALIGN(VAR) (((VAR) + 127) & ~127)
+
+/* Coalesce contiguous free space into one entry.  This considers the entries
+   either side of the root node only, so it should be called each time a new
+   entry in inserted into the root.  */
+
+static void
+usm_coalesce_free_space ()
+{
+  usm_splay_tree_node prev, next, node = usm_free_space.root;
+
+  for (prev = node->left; prev && prev->right; prev = prev->right)
+    ;
+  for (next = node->right; next && next->left; next = next->left)
+    ;
+
+  /* Coalesce adjacent free chunks.  */
+  if (next
+      && node->key.base + node->key.size == next->key.base)
+    {
+      /* Free chunk follows.  */
+      node->key.size += next->key.size;
+      usm_splay_tree_remove (&usm_free_space, &next->key);
+      free (next);
+    }
+  if (prev
+      && prev->key.base + prev->key.size == node->key.base)
+    {
+      /* Free chunk precedes.  */
+      prev->key.size += node->key.size;
+      usm_splay_tree_remove (&usm_free_space, &node->key);
+      free (node);
+    }
+}
+
+/* Add a new memory region into the free chain.  This is how the USM heap is
+   initialized and extended.  If the new region is contiguous with an existing
+   region then any free space will be coalesced.  */
+
+static void
+usm_register_memory (char *base, size_t size)
+{
+  if (base == NULL)
+    return;
+
+  while (__atomic_exchange_n (&usm_lock, 1, MEMMODEL_ACQUIRE) == 1)
+    ;
+
+  usm_splay_tree_node node = malloc (sizeof (struct usm_splay_tree_node_s));
+  node->key.base = base;
+  node->key.size = size;
+  node->left = NULL;
+  node->right = NULL;
+  usm_splay_tree_insert (&usm_free_space, node);
+  usm_coalesce_free_space (node);
+
+  __atomic_store_n (&usm_lock, 0, MEMMODEL_RELEASE);
+}
+
+/* This splay_tree_foreach callback selects the first free space large enough
+   to hold the allocation needed.  Since the splay_tree walk may start in the
+   middle the "first" isn't necessarily the "leftmost" entry.  */
+
+struct usm_callback_data {
+  size_t size;
+  usm_splay_tree_node found;
+};
+
+static int
+usm_alloc_callback (usm_splay_tree_key key, void *data)
+{
+  struct usm_callback_data *cbd = (struct usm_callback_data *)data;
+
+  if (key->size >= cbd->size)
+    {
+      cbd->found = (usm_splay_tree_node)key;
+      return 1;
+    }
+
+  return 0;
+}
+
+/* USM "malloc".  Selects and moves and address range from usm_free_space to
+   usm_allocations, while leaving any excess in usm_free_space.  */
+
+static void *
+usm_alloc (size_t size)
+{
+  /* Memory is allocated in N-byte granularity.  */
+  size = ALIGN (size);
+
+  /* Acquire the lock.  */
+  while (__atomic_exchange_n (&usm_lock, 1, MEMMODEL_ACQUIRE) == 1)
+    ;
+
+  if (!usm_free_space.root)
+    {
+      /* No memory registered, or no free space.  */
+      __atomic_store_n (&usm_lock, 0, MEMMODEL_RELEASE);
+      return NULL;
+    }
+
+  /* Find a suitable free block.  */
+  struct usm_callback_data cbd = {size, NULL};
+  usm_splay_tree_foreach_lazy (&usm_free_space, usm_alloc_callback, &cbd);
+  usm_splay_tree_node freenode = cbd.found;
+
+  void *result = NULL;
+  if (freenode)
+    {
+      /* Allocation successful.  */
+      result = freenode->key.base;
+      usm_splay_tree_node allocnode = malloc (sizeof (*allocnode));
+      allocnode->key.base = result;
+      allocnode->key.size = size;
+      allocnode->left = NULL;
+      allocnode->right = NULL;
+      usm_splay_tree_insert (&usm_allocations, allocnode);
+
+      /* Update the free chain.  */
+      size_t stillfree_size = freenode->key.size - size;
+      if (stillfree_size > 0)
+	{
+	  freenode->key.base = freenode->key.base + size;
+	  freenode->key.size = stillfree_size;
+	}
+      else
+	{
+	  usm_splay_tree_remove (&usm_free_space, &freenode->key);
+	  free (freenode);
+	}
+    }
+
+  /* Release the lock.  */
+  __atomic_store_n (&usm_lock, 0, MEMMODEL_RELEASE);
+
+  return result;
+}
+
+/* USM "free".  Moves an address range from usm_allocations to usm_free_space
+   and merges that record with any contiguous free memory.  */
+
+static void
+usm_free (void *addr)
+{
+  /* Acquire the lock.  */
+  while (__atomic_exchange_n (&usm_lock, 1, MEMMODEL_ACQUIRE) == 1)
+    ;
+
+  /* Convert the memory map to free.  */
+  struct usm_splay_tree_key_s key = {addr};
+  usm_splay_tree_key found = usm_splay_tree_lookup (&usm_allocations, &key);
+  if (!found)
+    GOMP_PLUGIN_fatal ("invalid free");
+  usm_splay_tree_remove (&usm_allocations, &key);
+  usm_splay_tree_insert (&usm_free_space, (usm_splay_tree_node)found);
+  usm_coalesce_free_space ();
+
+  /* Release the lock.  */
+  __atomic_store_n (&usm_lock, 0, MEMMODEL_RELEASE);
+}
+
+#undef ALIGN
+
+/* This allows splay-tree.c to call gomp_fatal in this context.  The splay
+   tree code doesn't use the variadic arguments right now.  */
+#define gomp_fatal(MSG, ...) GOMP_PLUGIN_fatal (MSG)
+
+/* Include the splay tree code inline, with the prefixes added.  */
+#define splay_tree_prefix usm
+#define splay_tree_c
+#include "splay-tree.h"
diff --git a/libgomp/usmpin-allocator.c b/libgomp/usmpin-allocator.c
index 311bda5054e..e1df6197a88 100644
--- a/libgomp/usmpin-allocator.c
+++ b/libgomp/usmpin-allocator.c
@@ -313,6 +313,9 @@  usmpin_realloc (usmpin_ctx_p ctx, void *addr, size_t newsize)
   return addr;
 }
 
+/* Ensure that the splay tree will link into the plugin.  */
+#define gomp_fatal GOMP_PLUGIN_fatal
+
 /* Include the splay tree code inline, with the prefixes added.  */
 #define splay_tree_prefix usmpin
 #define splay_tree_c