[v2,6/8] amdgcn: libgomp plugin USM implementation

Message ID	20240628102449.562467-7-ams@baylibre.com
State	New
Headers	show Return-Path: <gcc-patches-bounces+incoming=patchwork.ozlabs.org@gcc.gnu.org> DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 29D08388461C From: Andrew Stubbs <ams@baylibre.com> To: gcc-patches@gcc.gnu.org Cc: tburnus@baylibre.com, jakub@redhat.com Subject: [PATCH v2 6/8] amdgcn: libgomp plugin USM implementation Date: Fri, 28 Jun 2024 10:24:47 +0000 Message-ID: <20240628102449.562467-7-ams@baylibre.com> In-Reply-To: <20240628102449.562467-1-ams@baylibre.com> References: <20240628102449.562467-1-ams@baylibre.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Precedence: list Errors-To: gcc-patches-bounces+incoming=patchwork.ozlabs.org@gcc.gnu.org
Series	OpenMP: Unified Shared Memory via Managed Memory \| expand [v2,0/8] OpenMP: Unified Shared Memory via Managed Memory [v2,1/8] libgomp: Disentangle shared memory from managed [v2,2/8] openmp, nvptx: ompx_gnu_unified_shared_mem_alloc [v2,3/8] openmp: Enable -foffload-memory=unified [v2,4/8] openmp: Use libgomp memory allocation functions with unified shared memory. [v2,5/8] amdgcn, openmp: Auto-detect USM mode and set HSA_XNACK [v2,6/8] amdgcn: libgomp plugin USM implementation [v2,7/8] openmp, libgomp: Handle unified shared memory in omp_target_is_accessible [v2,8/8] libgomp: Map omp_default_mem_space to USM

diff --git a/include/hsa.h b/include/hsa.h old mode 100644 new mode 100755 index 3c7be95d7fd..28867a91a7c --- a/include/hsa.h +++ b/include/hsa.h @@ -1,6 +1,6 @@ //////////////////////////////////////////////////////////////////////////////// // -// Copyright (C) 2014-2020 Advanced Micro Devices Inc. All rights reserved. +// Copyright (C) 2014-2022 Advanced Micro Devices Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person or organization // obtaining a copy of the software and accompanying documentation covered by @@ -467,7 +467,19 @@ typedef enum { * String containing the ROCr build identifier. */ HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200, - + /** + * Returns true if hsa_amd_svm_* APIs are supported by the driver. The type of + * this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED = 0x201, + // TODO: Should this be per Agent? + /** + * Returns true if all Agents have access to system allocated memory (such as + * that allocated by mmap, malloc, or new) by default. + * If false then system allocated memory may only be made SVM accessible to + * an Agent by declaration of accessibility with hsa_amd_svm_set_attributes. + * The type of this attribute is bool. + */ HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT = 0x202 } hsa_system_info_t; @@ -986,8 +998,16 @@ typedef enum { * Minor version of the HSA runtime specification supported by the * agent. The type of this attribute is uint16_t. */ - HSA_AGENT_INFO_VERSION_MINOR = 22 - + HSA_AGENT_INFO_VERSION_MINOR = 22, + /** + * This enum does not have a fixed underlying type, thus in C++ post D2338: + * If the enumeration type does not have a fixed underlying type, the value is + * unchanged if the original value is within the range of the enumeration + * values (9.7.1 [dcl.enum]), and otherwise, the behavior is + * undefined. + * Thus increase the range of this enum to encompass vendor extensions. + */ + HSA_AGENT_INFO_LAST = INT32_MAX } hsa_agent_info_t; /** diff --git a/include/hsa_ext_amd.h b/include/hsa_ext_amd.h old mode 100644 new mode 100755 index 1f93e0d96cc..6aac7392287 --- a/include/hsa_ext_amd.h +++ b/include/hsa_ext_amd.h @@ -1,6 +1,6 @@ //////////////////////////////////////////////////////////////////////////////// // -// Copyright (C) 2014-2020 Advanced Micro Devices Inc. All rights reserved. +// Copyright (C) 2014-2022 Advanced Micro Devices Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person or organization // obtaining a copy of the software and accompanying documentation covered by @@ -41,6 +41,115 @@ extern "C" { #endif +/** \addtogroup aql Architected Queuing Language + * @{ + */ + +/** + * @brief A fixed-size type used to represent ::hsa_signal_condition_t constants. + */ +typedef uint32_t hsa_signal_condition32_t; + +/** + * @brief AMD vendor specific packet type. + */ +typedef enum { + /** + * Packet used by agents to delay processing of subsequent packets until a + * configurable condition is satisfied by an HSA signal. Only kernel dispatch + * queues created from AMD GPU Agents support this packet. + */ + HSA_AMD_PACKET_TYPE_BARRIER_VALUE = 2, +} hsa_amd_packet_type_t; + +/** + * @brief A fixed-size type used to represent ::hsa_amd_packet_type_t constants. + */ +typedef uint8_t hsa_amd_packet_type8_t; + +/** + * @brief AMD vendor specific AQL packet header + */ +typedef struct hsa_amd_packet_header_s { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + *Format of the vendor specific packet. + */ + hsa_amd_packet_type8_t AmdFormat; + + /** + * Reserved. Must be 0. + */ + uint8_t reserved; +} hsa_amd_vendor_packet_header_t; + +/** + * @brief AMD barrier value packet. Halts packet processing and waits for + * (signal_value & ::mask) ::cond ::value to be satisfied, where signal_value + * is the value of the signal ::signal. + */ +typedef struct hsa_amd_barrier_value_packet_s { + /** + * AMD vendor specific packet header. + */ + hsa_amd_vendor_packet_header_t header; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved0; + + /** + * Dependent signal object. A signal with a handle value of 0 is + * allowed and is interpreted by the packet processor a satisfied + * dependency. + */ + hsa_signal_t signal; + + /** + * Value to compare against. + */ + hsa_signal_value_t value; + + /** + * Bit mask to be combined by bitwise AND with ::signal's value. + */ + hsa_signal_value_t mask; + + /** + * Comparison operation. See ::hsa_signal_condition_t. + */ + hsa_signal_condition32_t cond; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved3; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; +} hsa_amd_barrier_value_packet_t; + +/** @} */ + /** * @brief Enumeration constants added to ::hsa_status_t. * @@ -61,6 +170,20 @@ enum { * Agent executed an invalid shader instruction. */ HSA_STATUS_ERROR_ILLEGAL_INSTRUCTION = 42, + + /** + * Agent attempted to access an inaccessible address. + * See hsa_amd_register_system_event_handler and + * HSA_AMD_GPU_MEMORY_FAULT_EVENT for more information on illegal accesses. + */ + HSA_STATUS_ERROR_MEMORY_FAULT = 43, + + /** + * The CU mask was successfully set but the mask attempted to enable a CU + * which was disabled for the process. CUs disabled for the process remain + * disabled. + */ + HSA_STATUS_CU_MASK_REDUCED = 44, }; /** @@ -168,7 +291,38 @@ typedef enum hsa_amd_agent_info_s { * selective workarounds for hardware errata. * The type of this attribute is uint32_t. */ - HSA_AMD_AGENT_INFO_ASIC_REVISION = 0xA012 + HSA_AMD_AGENT_INFO_ASIC_REVISION = 0xA012, + /** + * Queries whether or not the host can directly access SVM memory that is + * physically resident in the agent's local memory. + * The type of this attribute is bool. + */ + HSA_AMD_AGENT_INFO_SVM_DIRECT_HOST_ACCESS = 0xA013, + /** + * Some processors support more CUs than can reliably be used in a cooperative + * dispatch. This queries the count of CUs which are fully enabled for + * cooperative dispatch. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_COOPERATIVE_COMPUTE_UNIT_COUNT = 0xA014, + /** + * Queries the amount of memory available in bytes accross all global pools + * owned by the agent. + * The type of this attribute is uint64_t. + */ + HSA_AMD_AGENT_INFO_MEMORY_AVAIL = 0xA015, + /** + * Timestamp value increase rate, in Hz. The timestamp (clock) frequency is + * in the range 1-400MHz. + * The type of this attribute is uint64_t. + */ + HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY = 0xA016, + /** + * Queries for the ASIC family ID of an agent. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_ASIC_FAMILY_ID = 0xA107 + } hsa_amd_agent_info_t; typedef struct hsa_amd_hdp_flush_s { @@ -471,6 +625,37 @@ hsa_status_t HSA_API hsa_amd_signal_create(hsa_signal_value_t initial_value, uin const hsa_agent_t* consumers, uint64_t attributes, hsa_signal_t* signal); +/** + * @brief Returns a pointer to the value of a signal. + * + * Use of this API does not modify the lifetime of ::signal and any + * hsa_signal_value_t retrieved by this API has lifetime equal to that of + * ::signal. + * + * This API is intended for partial interoperability with non-HSA compatible + * devices and should not be used where HSA interfaces are available. + * + * Use of the signal value must comply with use restritions of ::signal. + * Use may result in data races if the operations performed are not platform + * atomic. Use with HSA_AMD_SIGNAL_AMD_GPU_ONLY or HSA_AMD_SIGNAL_IPC + * attributed signals is required. + * + * @param[in] Signal handle to extract the signal value pointer from. + * + * @param[out] Location where the extracted signal value pointer will be placed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL signal is not a valid hsa_signal_t + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT value_ptr is NULL. + */ +hsa_status_t hsa_amd_signal_value_pointer(hsa_signal_t signal, + volatile hsa_signal_value_t** value_ptr); + /** * @brief Asyncronous signal handler function type. * @@ -613,32 +798,69 @@ hsa_status_t HSA_API hsa_amd_image_get_info_max_dim(hsa_agent_t agent, void* value); /** - * @brief Set a CU affinity to specific queues within the process, this function - * call is "atomic". + * @brief Set a queue's CU affinity mask. + * + * @details Enables the queue to run on only selected CUs. The given mask is + * combined by bitwise AND with any device wide mask in HSA_CU_MASK before + * being applied. + * If num_cu_mask_count is 0 then the request is interpreted as a request to + * enable all CUs and no cu_mask array need be given. * * @param[in] queue A pointer to HSA queue. * - * @param[in] num_cu_mask_count Size of CUMask bit array passed in. + * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits. * * @param[in] cu_mask Bit-vector representing the CU mask. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * + * @retval ::HSA_STATUS_CU_MASK_REDUCED The function was successfully executed + * but the given mask attempted to enable a CU which was disabled by + * HSA_CU_MASK. CUs disabled by HSA_CU_MASK remain disabled. + * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is not - * multiple of 32 or @p cu_mask is NULL. - * - * @retval ::HSA_STATUS_ERROR failed to call thunk api + * a multiple of 32 or @p num_cu_mask_count is not 0 and cu_mask is NULL. + * Devices with work group processors must even-index contiguous pairwise + * CU enable e.g. 0x33(b'110011) is valid while 0x5(0x101) and 0x6(b'0110) + * are invalid. * */ hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, const uint32_t* cu_mask); +/** + * @brief Retrieve a queue's CU affinity mask. + * + * @details Returns the first num_cu_mask_count bits of a queue's CU mask. + * Ensure that num_cu_mask_count is at least as large as + * HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT to retrieve the entire mask. + * + * @param[in] queue A pointer to HSA queue. + * + * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits. + * + * @param[out] cu_mask Bit-vector representing the CU mask. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is 0, not + * a multiple of 32 or @p cu_mask is NULL. + * + */ +hsa_status_t HSA_API hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, + uint32_t* cu_mask); + /** * @brief Memory segments associated with a memory pool. */ @@ -770,6 +992,24 @@ typedef enum { HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16, } hsa_amd_memory_pool_info_t; +/** + * @brief Memory pool flag used to specify allocation directives + * + */ +typedef enum hsa_amd_memory_pool_flag_s { + /** + * Allocates memory that conforms to standard HSA memory consistency model + */ + HSA_AMD_MEMORY_POOL_STANDARD_FLAG = 0, + /** + * Allocates fine grain memory type where memory ordering is per point to point + * connection. Atomic memory operations on these memory buffers are not + * guaranteed to be visible at system scope. + */ + HSA_AMD_MEMORY_POOL_PCIE_FLAG = 1, + +} hsa_amd_memory_pool_flag_t; + /** * @brief Get the current value of an attribute of a memory pool. * @@ -838,7 +1078,7 @@ hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools( * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE in @p memory_pool. * * @param[in] flags A bit-field that is used to specify allocation - * directives. Reserved parameter, must be 0. + * directives. * * @param[out] ptr Pointer to the location where to store the base virtual * address of @@ -895,6 +1135,8 @@ hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr); * * @param[in] dst_agent Agent associated with the @p dst. The agent must be able to directly * access both the source and destination buffers in their current locations. + * May be zero in which case the runtime will attempt to discover the destination agent. + * Discovery may have variable and/or high latency. * * @param[in] src A valid pointer to the source of data to be copied. The source * buffer must not overlap with the destination buffer, otherwise the copy will succeed @@ -902,6 +1144,8 @@ hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr); * * @param[in] src_agent Agent associated with the @p src. The agent must be able to directly * access both the source and destination buffers in their current locations. + * May be zero in which case the runtime will attempt to discover the destination agent. + * Discovery may have variable and/or high latency. * * @param[in] size Number of bytes to copy. If @p size is 0, no copy is * performed and the function returns success. Copying a number of bytes larger @@ -912,9 +1156,9 @@ hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr); * * @param[in] dep_signals List of signals that must be waited on before the copy * operation starts. The copy will start after every signal has been observed with - * the value 0. The dependent signal should not include completion signal from hsa_amd_memory_async_copy - * operation to be issued in future as that can result in a deadlock. If @p num_dep_signals is 0, this - * argument is ignored. + * the value 0. The dependent signal should not include completion signal from + * hsa_amd_memory_async_copy operation to be issued in future as that can result + * in a deadlock. If @p num_dep_signals is 0, this argument is ignored. * * @param[in] completion_signal Signal used to indicate completion of the copy * operation. When the copy operation is finished, the value of the signal is @@ -929,7 +1173,7 @@ hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr); * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * - * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT An agent is invalid or no discovered agent has access. * * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p completion_signal is invalid. * @@ -1568,6 +1812,12 @@ typedef struct hsa_amd_pointer_info_s { GPU boards) any such agent may be returned. */ hsa_agent_t agentOwner; + /* + Contains a bitfield of hsa_amd_memory_pool_global_flag_t values. + Reports the effective global flags bitmask for the allocation. This field is not meaningful if + the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. + */ + uint32_t global_flags; } hsa_amd_pointer_info_t; /** @@ -1603,7 +1853,7 @@ typedef struct hsa_amd_pointer_info_s { * * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT NULL in @p ptr or @p info. */ -hsa_status_t HSA_API hsa_amd_pointer_info(void* ptr, +hsa_status_t HSA_API hsa_amd_pointer_info(const void* ptr, hsa_amd_pointer_info_t* info, void* (*alloc)(size_t), uint32_t* num_agents_accessible, @@ -1627,7 +1877,7 @@ hsa_status_t HSA_API hsa_amd_pointer_info(void* ptr, * * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is not known to ROCr. */ -hsa_status_t HSA_API hsa_amd_pointer_info_set_userdata(void* ptr, +hsa_status_t HSA_API hsa_amd_pointer_info_set_userdata(const void* ptr, void* userdata); /** @@ -1801,11 +2051,11 @@ typedef enum { // GPU attempted access to a host only page. HSA_AMD_MEMORY_FAULT_HOST_ONLY = 1 << 3, // DRAM ECC failure. - HSA_AMD_MEMORY_FAULT_DRAM_ECC = 1 << 4, + HSA_AMD_MEMORY_FAULT_DRAMECC = 1 << 4, // Can't determine the exact fault address. HSA_AMD_MEMORY_FAULT_IMPRECISE = 1 << 5, // SRAM ECC failure (ie registers, no fault address). - HSA_AMD_MEMORY_FAULT_SRAM_ECC = 1 << 6, + HSA_AMD_MEMORY_FAULT_SRAMECC = 1 << 6, // GPU reset following unspecified hang. HSA_AMD_MEMORY_FAULT_HANG = 1 << 31 } hsa_amd_memory_fault_reason_t; @@ -1962,6 +2212,181 @@ hsa_status_t HSA_API hsa_amd_register_deallocation_callback(void* ptr, hsa_status_t HSA_API hsa_amd_deregister_deallocation_callback(void* ptr, hsa_amd_deallocation_callback_t callback); +typedef enum hsa_amd_svm_model_s { + /** + * Updates to memory with this attribute conform to HSA memory consistency + * model. + */ + HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED = 0, + /** + * Writes to memory with this attribute can be performed by a single agent + * at a time. + */ + HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED = 1, + /** + * Memory region queried contains subregions with both + * HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED and + * HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED attributes. + * + * This attribute can not be used in hsa_amd_svm_attributes_set. It is a + * possible return from hsa_amd_svm_attributes_get indicating that the query + * region contains both coarse and fine grained memory. + */ + HSA_AMD_SVM_GLOBAL_FLAG_INDETERMINATE = 2 +} hsa_amd_svm_model_t; + +typedef enum hsa_amd_svm_attribute_s { + // Memory model attribute. + // Type of this attribute is hsa_amd_svm_model_t. + HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG = 0, + // Marks the range read only. This allows multiple physical copies to be + // placed local to each accessing device. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_READ_ONLY = 1, + // Automatic migrations should attempt to keep the memory within the xgmi hive + // containing accessible agents. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_HIVE_LOCAL = 2, + // Page granularity to migrate at once. Page granularity is specified as + // log2(page_count). + // Type of this attribute is uint64_t. + HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY = 3, + // Physical location to prefer when automatic migration occurs. + // Set to the null agent handle (handle == 0) to indicate there + // is no preferred location. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION = 4, + // This attribute can not be used in ::hsa_amd_svm_attributes_set (see + // ::hsa_amd_svm_prefetch_async). + // Queries the physical location of most recent prefetch command. + // If the prefetch location has not been set or is not uniform across the + // address range then returned hsa_agent_t::handle will be 0. + // Querying this attribute will return the destination agent of the most + // recent ::hsa_amd_svm_prefetch_async targeting the address range. If + // multiple async prefetches have been issued targeting the region and the + // most recently issued prefetch has completed then the query will return + // the location of the most recently completed prefetch. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION = 5, + // Optimizes with the anticipation that the majority of operations to the + // range will be read operations. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_READ_MOSTLY = 6, + // Allows the execution on GPU. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_GPU_EXEC = 7, + // This attribute can not be used in ::hsa_amd_svm_attributes_get. + // Enables an agent for access to the range. Access may incur a page fault + // and associated memory migration. Either this or + // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE is required prior to SVM + // access if HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE = 0x200, + // This attribute can not be used in ::hsa_amd_svm_attributes_get. + // Enables an agent for access to the range without page faults. Access + // will not incur a page fault and will not cause access based migration. + // and associated memory migration. Either this or + // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE is required prior to SVM access if + // HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE = 0x201, + // This attribute can not be used in ::hsa_amd_svm_attributes_get. + // Denies an agent access to the memory range. Access will cause a terminal + // segfault. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS = 0x202, + // This attribute can not be used in ::hsa_amd_svm_attributes_set. + // Returns the access attribute associated with the agent. + // The agent to query must be set in the attribute value field. + // The attribute enum will be replaced with the agent's current access + // attribute for the address range. + // TODO: Clarify KFD return value for non-uniform access attribute. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_ACCESS_QUERY = 0x203, +} hsa_amd_svm_attribute_t; + +// List type for hsa_amd_svm_attributes_set/get. +typedef struct hsa_amd_svm_attribute_pair_s { + // hsa_amd_svm_attribute_t value. + uint64_t attribute; + // Attribute value. Bit values should be interpreted according to the type + // given in the associated attribute description. + uint64_t value; +} hsa_amd_svm_attribute_pair_t; + +/** + * @brief Sets SVM memory attributes. + * + * If HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT returns false then enabling + * access to an Agent via this API (setting HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE + * or HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE) is required prior to SVM + * memory access by that Agent. + * + * Attributes HSA_AMD_SVM_ATTRIB_ACCESS_QUERY and HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION + * may not be used with this API. + * + * @param[in] ptr Will be aligned down to nearest page boundary. + * + * @param[in] size Will be aligned up to nearest page boundary. + * + * @param[in] attribute_list List of attributes to set for the address range. + * + * @param[in] attribute_count Length of @p attribute_list. + */ +hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + +/** + * @brief Gets SVM memory attributes. + * + * Attributes HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE, + * HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE and + * HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION may not be used with this API. + * + * Note that attribute HSA_AMD_SVM_ATTRIB_ACCESS_QUERY takes as input an + * hsa_agent_t and returns the current access type through its attribute field. + * + * @param[in] ptr Will be aligned down to nearest page boundary. + * + * @param[in] size Will be aligned up to nearest page boundary. + * + * @param[in] attribute_list List of attributes to set for the address range. + * + * @param[in] attribute_count Length of @p attribute_list. + */ +hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + +/** + * @brief Asynchronously migrates memory to an agent. + * + * Schedules memory migration to @p agent when @p dep_signals have been observed equal to zero. + * @p completion_signal will decrement when the migration is complete. + * + * @param[in] ptr Will be aligned down to nearest page boundary. + * + * @param[in] size Will be aligned up to nearest page boundary. + * + * @param[in] agent Agent to migrate to. + * + * @param[in] num_dep_signals Number of dependent signals. Can be 0. + * + * @param[in] dep_signals List of signals that must be waited on before the migration + * operation starts. The migration will start after every signal has been observed with + * the value 0. If @p num_dep_signals is 0, this argument is ignored. + * + * @param[in] completion_signal Signal used to indicate completion of the migration + * operation. When the migration operation is finished, the value of the signal is + * decremented. The runtime indicates that an error has occurred during the copy + * operation by setting the value of the completion signal to a negative + * number. If no completion signal is required this handle may be null. + */ +hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal); + #ifdef __cplusplus } // end extern "C" block #endif diff --git a/include/hsa_ext_image.h b/include/hsa_ext_image.h old mode 100644 new mode 100755 index 52b695bc9fa..eee1f807f5b --- a/include/hsa_ext_image.h +++ b/include/hsa_ext_image.h @@ -1,6 +1,6 @@ //////////////////////////////////////////////////////////////////////////////// // -// Copyright (C) 2014-2020 Advanced Micro Devices Inc. All rights reserved. +// Copyright (C) 2014-2022 Advanced Micro Devices Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person or organization // obtaining a copy of the software and accompanying documentation covered by diff --git a/libgomp/Makefile.in b/libgomp/Makefile.in index b74e39a1c2a..e0b26d9e7f1 100644 --- a/libgomp/Makefile.in +++ b/libgomp/Makefile.in @@ -185,7 +185,8 @@ am__DEPENDENCIES_1 = @PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_DEPENDENCIES = libgomp.la \ @PLUGIN_GCN_TRUE@ $(am__DEPENDENCIES_1) @PLUGIN_GCN_TRUE@am_libgomp_plugin_gcn_la_OBJECTS = \ -@PLUGIN_GCN_TRUE@ libgomp_plugin_gcn_la-plugin-gcn.lo +@PLUGIN_GCN_TRUE@ libgomp_plugin_gcn_la-plugin-gcn.lo \ +@PLUGIN_GCN_TRUE@ libgomp_plugin_gcn_la-usmpin-allocator.lo libgomp_plugin_gcn_la_OBJECTS = $(am_libgomp_plugin_gcn_la_OBJECTS) AM_V_lt = $(am__v_lt_@AM_V@) am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) @@ -581,7 +582,7 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \ # AMD GCN plugin @PLUGIN_GCN_TRUE@libgomp_plugin_gcn_version_info = -version-info $(libtool_VERSION) -@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c +@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c usmpin-allocator.c @PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_CPPFLAGS = $(AM_CPPFLAGS) \ @PLUGIN_GCN_TRUE@ -D_GNU_SOURCE @@ -758,6 +759,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter_ull.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp-plugin.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_gcn_la-plugin-gcn.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_gcn_la-usmpin-allocator.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lock.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/loop.Plo@am__quote@ @@ -819,6 +821,13 @@ libgomp_plugin_gcn_la-plugin-gcn.lo: plugin/plugin-gcn.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libgomp_plugin_gcn_la-plugin-gcn.lo `test -f 'plugin/plugin-gcn.c' || echo '$(srcdir)/'`plugin/plugin-gcn.c +libgomp_plugin_gcn_la-usmpin-allocator.lo: usmpin-allocator.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libgomp_plugin_gcn_la-usmpin-allocator.lo -MD -MP -MF $(DEPDIR)/libgomp_plugin_gcn_la-usmpin-allocator.Tpo -c -o libgomp_plugin_gcn_la-usmpin-allocator.lo `test -f 'usmpin-allocator.c' || echo '$(srcdir)/'`usmpin-allocator.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libgomp_plugin_gcn_la-usmpin-allocator.Tpo $(DEPDIR)/libgomp_plugin_gcn_la-usmpin-allocator.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='usmpin-allocator.c' object='libgomp_plugin_gcn_la-usmpin-allocator.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libgomp_plugin_gcn_la-usmpin-allocator.lo `test -f 'usmpin-allocator.c' || echo '$(srcdir)/'`usmpin-allocator.c + libgomp_plugin_nvptx_la-plugin-nvptx.lo: plugin/plugin-nvptx.c @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_nvptx_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_nvptx_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libgomp_plugin_nvptx_la-plugin-nvptx.lo -MD -MP -MF $(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Tpo -c -o libgomp_plugin_nvptx_la-plugin-nvptx.lo `test -f 'plugin/plugin-nvptx.c' || echo '$(srcdir)/'`plugin/plugin-nvptx.c @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Tpo $(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Plo diff --git a/libgomp/config/gcn/allocator.c b/libgomp/config/gcn/allocator.c index 99cba6c9265..20405618caf 100644 --- a/libgomp/config/gcn/allocator.c +++ b/libgomp/config/gcn/allocator.c @@ -36,6 +36,7 @@ when the memspace access trait is set accordingly. */ #include "libgomp.h" +#include <assert.h> #include <stdlib.h> #define BASIC_ALLOC_PREFIX __gcn_lowlat @@ -56,6 +57,8 @@ gcn_memspace_alloc (omp_memspace_handle_t memspace, size_t size) return __gcn_lowlat_alloc (shared_pool, size); } + else if (memspace == ompx_gnu_host_mem_space) + return NULL; else return malloc (size); } @@ -69,6 +72,8 @@ gcn_memspace_calloc (omp_memspace_handle_t memspace, size_t size) return __gcn_lowlat_calloc (shared_pool, size); } + else if (memspace == ompx_gnu_host_mem_space) + return NULL; else return calloc (1, size); } @@ -82,6 +87,9 @@ gcn_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size) __gcn_lowlat_free (shared_pool, addr, size); } + else if (memspace == ompx_gnu_host_mem_space) + /* Just verify what all allocator functions return. */ + assert (addr == NULL); else free (addr); } @@ -96,6 +104,8 @@ gcn_memspace_realloc (omp_memspace_handle_t memspace, void *addr, return __gcn_lowlat_realloc (shared_pool, addr, oldsize, size); } + else if (memspace == ompx_gnu_host_mem_space) + return NULL; else return realloc (addr, size); } diff --git a/libgomp/plugin/Makefrag.am b/libgomp/plugin/Makefrag.am index c9058709ad0..56dfbe3538a 100644 --- a/libgomp/plugin/Makefrag.am +++ b/libgomp/plugin/Makefrag.am @@ -57,7 +57,7 @@ if PLUGIN_GCN # AMD GCN plugin libgomp_plugin_gcn_version_info = -version-info $(libtool_VERSION) toolexeclib_LTLIBRARIES += libgomp-plugin-gcn.la -libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c +libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c usmpin-allocator.c libgomp_plugin_gcn_la_CPPFLAGS = $(AM_CPPFLAGS) \ -D_GNU_SOURCE libgomp_plugin_gcn_la_LDFLAGS = $(libgomp_plugin_gcn_version_info) \ diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c index c8c588e8efa..dd9c42c0b8c 100644 --- a/libgomp/plugin/plugin-gcn.c +++ b/libgomp/plugin/plugin-gcn.c @@ -43,11 +43,14 @@ #include <signal.h> #include "libgomp-plugin.h" #include "config/gcn/libgomp-gcn.h" /* For struct output. */ +#include "libgomp.h" /* For usmpin API. */ #include "gomp-constants.h" #include <elf.h> #include "oacc-plugin.h" #include "oacc-int.h" #include <assert.h> +#include <sys/mman.h> +#include <unistd.h> /* These probably won't be in elf.h for a while. */ #ifndef R_AMDGPU_NONE @@ -206,6 +209,9 @@ struct hsa_runtime_fn_info const hsa_dim3_t *range, hsa_agent_t copy_agent, hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t *dep_signals, hsa_signal_t completion_signal); + hsa_status_t (*hsa_amd_svm_attributes_set_fn) + (void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); }; /* Structure describing the run-time and grid properties of an HSA kernel @@ -709,6 +715,24 @@ dump_hsa_system_info (void) } else GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n"); + + bool svm_supported; + status = hsa_fns.hsa_system_get_info_fn + (HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED, &svm_supported); + if (status == HSA_STATUS_SUCCESS) + GCN_DEBUG ("HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED: %s\n", + (svm_supported ? "TRUE" : "FALSE")); + else + GCN_WARNING ("HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED: FAILED\n"); + + bool svm_accessible; + status = hsa_fns.hsa_system_get_info_fn + (HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT, &svm_accessible); + if (status == HSA_STATUS_SUCCESS) + GCN_DEBUG ("HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT: %s\n", + (svm_accessible ? "TRUE" : "FALSE")); + else + GCN_WARNING ("HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT: FAILED\n"); } /* Dump information about the available hardware. */ @@ -1426,6 +1450,7 @@ init_hsa_runtime_functions (void) DLSYM_OPT_FN (hsa_amd_memory_lock) DLSYM_OPT_FN (hsa_amd_memory_unlock) DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect) + DLSYM_OPT_FN (hsa_amd_svm_attributes_set) return true; #undef DLSYM_OPT_FN #undef DLSYM_FN @@ -3165,6 +3190,107 @@ wait_queue (struct goacc_asyncqueue *aq) } /* }}} */ +/* {{{ Unified Shared Memory + + Normal heap memory is already enabled for USM, but by default it is "fine- + grained" memory, meaning that the GPU must access it via the system bus, + slowly. Changing the page to "coarse-grained" mode means that the page + is migrated on-demand and can therefore be accessed quickly by both CPU and + GPU (although care should be taken to prevent thrashing the page back and + forth). + + GOMP_OFFLOAD_alloc also allocates coarse-grained memory, but in that case + the initial location is GPU memory; GOMP_OFFLOAD_usm_alloc returns system + memory configure coarse-grained. + + The USM memory space is allocated as a largish block and then subdivided + via a custom allocator. (It would be possible to reconfigure regular + "malloc'd" memory, but if it ends up on the same page as memory used by + the HSA driver then bad things happen.) */ + +usmpin_ctx_p usm_ctx = NULL; + +/* Record a list of the memory blocks configured for USM. */ +static struct usm_heap_pages { + void *start; + void *end; + struct usm_heap_pages *next; +} *usm_heap_pages = NULL; + +/* Initialize or extend the USM memory space. This is called whenever + allocation fails. SIZE is the minimum size required for the failed + allocation to succeed; the function may choose a larger size. + Note that Linux lazy allocation means that the memory returned isn't + guarenteed to acually exist. */ + +static bool +usm_heap_create (size_t size) +{ + static int lock = 0; + while (__atomic_exchange_n (&lock, 1, MEMMODEL_ACQUIRE) != 0) + ; + + size_t default_size = 1L * 1024 * 1024 * 1024; /* 1GB */ + if (size < default_size) + size = default_size; + + /* Round up to a whole page. */ + int pagesize = getpagesize (); + int misalignment = size % pagesize; + if (misalignment > 0) + size += pagesize - misalignment; + + /* Try to get contiguous memory, but it might not be possible. + The most recent previous allocation is at the head of the list. */ + static void *addrhint = NULL; + void *new_pages = mmap (addrhint, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (!new_pages) + { + GCN_DEBUG ("Could not allocate Unified Shared Memory heap."); + __atomic_store_n (&lock, 0, MEMMODEL_RELEASE); + return false; + } + + /* Register the heap allocation as coarse grained, which implies USM. */ + struct hsa_amd_svm_attribute_pair_s attr = { + HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG, + HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED + }; + hsa_status_t status = hsa_fns.hsa_amd_svm_attributes_set_fn (new_pages, size, + &attr, 1); + if (status != HSA_STATUS_SUCCESS) + GOMP_PLUGIN_fatal ("Failed to allocate Unified Shared Memory;" + " please update your drivers and/or kernel"); + + /* Record the page addresses for use in GOMP_OFFLOAD_is_usm_ptr. */ + if (new_pages == addrhint) + { + /* We got contiguous pages, so we don't need extra list entries. */ + usm_heap_pages->end += size; + } + else + { + /* We need a new list entry to record a discontiguous range. */ + struct usm_heap_pages *page = malloc (sizeof (*page)); + page->start = new_pages; + page->end = new_pages + size; + page->next = usm_heap_pages; + usm_heap_pages = page; + } + addrhint = new_pages + size; + + /* Initialize a new USM heap, or add the new memory into an existing USM + heap. */ + if (!usm_ctx) + usm_ctx = usmpin_init_context (new_pages, size); + usmpin_register_memory (usm_ctx, new_pages, size); + + __atomic_store_n (&lock, 0, MEMMODEL_RELEASE); + return true; +} + +/* }}} */ /* {{{ OpenACC support */ /* Execute an OpenACC kernel, synchronously or asynchronously. */ @@ -3385,7 +3511,9 @@ GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask) if (!init_hsa_context (true)) exit (EXIT_FAILURE); /* Return -1 if no omp_requires_mask cannot be fulfilled but - devices were present. */ + devices were present. + Note: not all devices support USM, but the compiler refuses to create + binaries for those that don't anyway. */ if (hsa_context.agent_count > 0 && ((omp_requires_mask & ~(GOMP_REQUIRES_UNIFIED_ADDRESS @@ -4446,6 +4574,45 @@ GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars, GOMP_PLUGIN_target_task_completion, async_data); } +/* Allocate memory suitable for Unified Shared Memory. */ + +void * +GOMP_OFFLOAD_usm_alloc (int device, size_t size) +{ + while (1) + { + void *result = usmpin_alloc (usm_ctx, size); + if (result) + return result; + + /* Allocation failed. Try again if we can create a new heap block. + Note: it's possible another thread could get to the new memory + first, so the while loop is necessary. */ + if (!usm_heap_create (size)) + return NULL; + } +} + +/* Free memory allocated via GOMP_OFFLOAD_usm_alloc. */ + +bool +GOMP_OFFLOAD_usm_free (int device, void *ptr) +{ + usmpin_free (usm_ctx, ptr); + return true; +} + +/* True if the memory was allocated via GOMP_OFFLOAD_usm_alloc. */ + +bool +GOMP_OFFLOAD_is_usm_ptr (void *ptr) +{ + for (struct usm_heap_pages *heap = usm_heap_pages; heap; heap = heap->next) + if (ptr >= (void*)heap && ptr < heap->end) + return true; + return false; +} + /* }}} */ /* {{{ OpenACC Plugin API */ diff --git a/libgomp/testsuite/lib/libgomp.exp b/libgomp/testsuite/lib/libgomp.exp index 007bdf2d5c4..f9667cf0494 100644 --- a/libgomp/testsuite/lib/libgomp.exp +++ b/libgomp/testsuite/lib/libgomp.exp @@ -601,6 +601,18 @@ proc check_effective_target_omp_usm { } { if { [libgomp_check_effective_target_offload_target "nvptx"] } { return 1 } + + if { [libgomp_check_effective_target_offload_target "amdgcn"] } { + return [check_no_compiler_messages omp_usm executable { + #pragma omp requires unified_shared_memory + int main () { + #pragma omp target + ; + return 0; + } + }] + } + return 0 } diff --git a/libgomp/testsuite/libgomp.c++/usm-1.C b/libgomp/testsuite/libgomp.c++/usm-1.C index fea25e5f10b..6e88f90d61f 100644 --- a/libgomp/testsuite/libgomp.c++/usm-1.C +++ b/libgomp/testsuite/libgomp.c++/usm-1.C @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-skip-if "Only valid for nvptx" { ! offload_target_nvptx } } */ +/* { dg-require-effective-target omp_usm } */ #include <stdint.h> #pragma omp requires unified_shared_memory diff --git a/libgomp/testsuite/libgomp.c-c++-common/requires-1.c b/libgomp/testsuite/libgomp.c-c++-common/requires-1.c index 31996f1ecf6..d8b8119d4fa 100644 --- a/libgomp/testsuite/libgomp.c-c++-common/requires-1.c +++ b/libgomp/testsuite/libgomp.c-c++-common/requires-1.c @@ -1,3 +1,4 @@ +/* { dg-require-effective-target omp_usm } */ /* { dg-do link { target offload_target_any } } */ /* { dg-additional-sources requires-1-aux.c } */ diff --git a/libgomp/testsuite/libgomp.c-c++-common/requires-4.c b/libgomp/testsuite/libgomp.c-c++-common/requires-4.c index b19d9bbc20d..a459b4524b8 100644 --- a/libgomp/testsuite/libgomp.c-c++-common/requires-4.c +++ b/libgomp/testsuite/libgomp.c-c++-common/requires-4.c @@ -1,10 +1,11 @@ +/* { dg-skip-if "Not all devices allow USM" { offload_device_gcn && { ! omp_usm } } } */ +/* { dg-xfail-run-if "USM via -foffload-memory=... does not support static variables" { offload_device_nvptx || offload_device_gcn } } */ + /* { dg-require-effective-target lto } */ /* { dg-additional-options "-flto" } */ /* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */ /* { dg-additional-sources requires-4-aux.c } */ -/* { dg-xfail-run-if "USM via -foffload-memory=... does not support static variables" { offload_device_nvptx || offload_device_gcn } } */ - /* Check no diagnostic by device-compiler's or host compiler's lto1. Other file uses: 'requires reverse_offload', but that's inactive as there are no declare target directives, device constructs nor device routines */ diff --git a/libgomp/testsuite/libgomp.c-c++-common/requires-4a.c b/libgomp/testsuite/libgomp.c-c++-common/requires-4a.c index 0e0db927c2c..38dba827b2a 100644 --- a/libgomp/testsuite/libgomp.c-c++-common/requires-4a.c +++ b/libgomp/testsuite/libgomp.c-c++-common/requires-4a.c @@ -1,3 +1,5 @@ +/* { dg-skip-if "Not all devices allow USM" { offload_device_gcn && { ! omp_usm } } } */ + /* { dg-require-effective-target lto } */ /* { dg-additional-options "-flto" } */ /* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */ diff --git a/libgomp/testsuite/libgomp.c-c++-common/requires-5.c b/libgomp/testsuite/libgomp.c-c++-common/requires-5.c index 0f839ef2957..f41d7006aba 100644 --- a/libgomp/testsuite/libgomp.c-c++-common/requires-5.c +++ b/libgomp/testsuite/libgomp.c-c++-common/requires-5.c @@ -1,3 +1,5 @@ +/* { dg-skip-if "Not all devices allow USM" { offload_device_gcn && { ! omp_usm } } } */ + /* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */ /* { dg-additional-sources requires-5-aux.c } */ diff --git a/libgomp/testsuite/libgomp.c-c++-common/target-implicit-map-4.c b/libgomp/testsuite/libgomp.c-c++-common/target-implicit-map-4.c index d0b0cd178c0..2766312292b 100644 --- a/libgomp/testsuite/libgomp.c-c++-common/target-implicit-map-4.c +++ b/libgomp/testsuite/libgomp.c-c++-common/target-implicit-map-4.c @@ -4,6 +4,8 @@ and for not mapping the stack variables 'A' and 'B' (not mapped but accessible -> USM makes this tested feature even more important.) */ +/* { dg-skip-if "Not all devices allow USM" { offload_device_gcn && { ! omp_usm } } } */ + #pragma omp requires unified_shared_memory /* Ensure that defaultmap(default : pointer) uses correct OpenMP 5.2 diff --git a/libgomp/testsuite/libgomp.c/usm-1.c b/libgomp/testsuite/libgomp.c/usm-1.c index c8e8a9328ee..8d0e0e05198 100644 --- a/libgomp/testsuite/libgomp.c/usm-1.c +++ b/libgomp/testsuite/libgomp.c/usm-1.c @@ -1,5 +1,6 @@ /* { dg-do run } */ /* { dg-require-effective-target omp_usm } */ +/* { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn } } */ #include <omp.h> #include <stdint.h> diff --git a/libgomp/testsuite/libgomp.c/usm-2.c b/libgomp/testsuite/libgomp.c/usm-2.c index 9f414b16319..4609f89ed0c 100644 --- a/libgomp/testsuite/libgomp.c/usm-2.c +++ b/libgomp/testsuite/libgomp.c/usm-2.c @@ -1,5 +1,6 @@ /* { dg-do run } */ /* { dg-require-effective-target omp_usm } */ +/* { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn } } */ #include <omp.h> #include <stdint.h> diff --git a/libgomp/testsuite/libgomp.c/usm-3.c b/libgomp/testsuite/libgomp.c/usm-3.c index d7a77a5c2ee..c81ff20e1ff 100644 --- a/libgomp/testsuite/libgomp.c/usm-3.c +++ b/libgomp/testsuite/libgomp.c/usm-3.c @@ -1,5 +1,6 @@ /* { dg-do run } */ /* { dg-require-effective-target omp_usm } */ +/* { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn } } */ #include <omp.h> #include <stdint.h> diff --git a/libgomp/testsuite/libgomp.c/usm-4.c b/libgomp/testsuite/libgomp.c/usm-4.c index 825bb4e8b3e..8565018ce52 100644 --- a/libgomp/testsuite/libgomp.c/usm-4.c +++ b/libgomp/testsuite/libgomp.c/usm-4.c @@ -1,5 +1,6 @@ /* { dg-do run } */ /* { dg-require-effective-target omp_usm } */ +/* { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn } } */ #include <omp.h> #include <stdint.h> diff --git a/libgomp/testsuite/libgomp.c/usm-5.c b/libgomp/testsuite/libgomp.c/usm-5.c index 00332050591..865737b9417 100644 --- a/libgomp/testsuite/libgomp.c/usm-5.c +++ b/libgomp/testsuite/libgomp.c/usm-5.c @@ -19,6 +19,8 @@ main () #pragma omp target map(a[0:1]) { + if (omp_is_initial_device()) + __builtin_abort (); // The test will not work in fallback mode if (a[0] != 42 || a_p == (uintptr_t)a) __builtin_abort (); } diff --git a/libgomp/testsuite/libgomp.c/usm-6.c b/libgomp/testsuite/libgomp.c/usm-6.c index 1598117f2b2..1ee63641bb8 100644 --- a/libgomp/testsuite/libgomp.c/usm-6.c +++ b/libgomp/testsuite/libgomp.c/usm-6.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-skip-if "Only valid for nvptx" { ! offload_target_nvptx } } */ +/* { dg-require-effective-target omp_usm } */ #include <stdint.h> #include <stdlib.h> diff --git a/libgomp/usm-allocator.c b/libgomp/usm-allocator.c new file mode 100644 index 00000000000..68c1ebafec2 --- /dev/null +++ b/libgomp/usm-allocator.c @@ -0,0 +1,232 @@ +/* Copyright (C) 2023 Free Software Foundation, Inc. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This is a simple "malloc" implementation intended for use with Unified + Shared Memory. It allocates memory from a pool allocated and configured + by the device plugin, and is intended to be #included from there. It + keeps the allocated/free chain in a side-table (splay tree) to ensure that + the allocation routine does not migrate all the USM pages back into host + memory. */ + + +#include "libgomp.h" + +/* Use a splay tree to track USM allocations. */ + +typedef struct usm_splay_tree_node_s *usm_splay_tree_node; +typedef struct usm_splay_tree_s *usm_splay_tree; +typedef struct usm_splay_tree_key_s *usm_splay_tree_key; + +struct usm_splay_tree_key_s { + void *base; + size_t size; +}; + +static inline int +usm_splay_compare (usm_splay_tree_key x, usm_splay_tree_key y) +{ + return (x->base == y->base ? 0 + : x->base > y->base ? 1 + : -1); +} +#define splay_tree_prefix usm +#include "splay-tree.h" + +static int usm_lock = 0; +static struct usm_splay_tree_s usm_allocations = { NULL }; +static struct usm_splay_tree_s usm_free_space = { NULL }; + +/* 128-byte granularity means GPU cache-line aligned. */ +#define ALIGN(VAR) (((VAR) + 127) & ~127) + +/* Coalesce contiguous free space into one entry. This considers the entries + either side of the root node only, so it should be called each time a new + entry in inserted into the root. */ + +static void +usm_coalesce_free_space () +{ + usm_splay_tree_node prev, next, node = usm_free_space.root; + + for (prev = node->left; prev && prev->right; prev = prev->right) + ; + for (next = node->right; next && next->left; next = next->left) + ; + + /* Coalesce adjacent free chunks. */ + if (next + && node->key.base + node->key.size == next->key.base) + { + /* Free chunk follows. */ + node->key.size += next->key.size; + usm_splay_tree_remove (&usm_free_space, &next->key); + free (next); + } + if (prev + && prev->key.base + prev->key.size == node->key.base) + { + /* Free chunk precedes. */ + prev->key.size += node->key.size; + usm_splay_tree_remove (&usm_free_space, &node->key); + free (node); + } +} + +/* Add a new memory region into the free chain. This is how the USM heap is + initialized and extended. If the new region is contiguous with an existing + region then any free space will be coalesced. */ + +static void +usm_register_memory (char *base, size_t size) +{ + if (base == NULL) + return; + + while (__atomic_exchange_n (&usm_lock, 1, MEMMODEL_ACQUIRE) == 1) + ; + + usm_splay_tree_node node = malloc (sizeof (struct usm_splay_tree_node_s)); + node->key.base = base; + node->key.size = size; + node->left = NULL; + node->right = NULL; + usm_splay_tree_insert (&usm_free_space, node); + usm_coalesce_free_space (node); + + __atomic_store_n (&usm_lock, 0, MEMMODEL_RELEASE); +} + +/* This splay_tree_foreach callback selects the first free space large enough + to hold the allocation needed. Since the splay_tree walk may start in the + middle the "first" isn't necessarily the "leftmost" entry. */ + +struct usm_callback_data { + size_t size; + usm_splay_tree_node found; +}; + +static int +usm_alloc_callback (usm_splay_tree_key key, void *data) +{ + struct usm_callback_data *cbd = (struct usm_callback_data *)data; + + if (key->size >= cbd->size) + { + cbd->found = (usm_splay_tree_node)key; + return 1; + } + + return 0; +} + +/* USM "malloc". Selects and moves and address range from usm_free_space to + usm_allocations, while leaving any excess in usm_free_space. */ + +static void * +usm_alloc (size_t size) +{ + /* Memory is allocated in N-byte granularity. */ + size = ALIGN (size); + + /* Acquire the lock. */ + while (__atomic_exchange_n (&usm_lock, 1, MEMMODEL_ACQUIRE) == 1) + ; + + if (!usm_free_space.root) + { + /* No memory registered, or no free space. */ + __atomic_store_n (&usm_lock, 0, MEMMODEL_RELEASE); + return NULL; + } + + /* Find a suitable free block. */ + struct usm_callback_data cbd = {size, NULL}; + usm_splay_tree_foreach_lazy (&usm_free_space, usm_alloc_callback, &cbd); + usm_splay_tree_node freenode = cbd.found; + + void *result = NULL; + if (freenode) + { + /* Allocation successful. */ + result = freenode->key.base; + usm_splay_tree_node allocnode = malloc (sizeof (*allocnode)); + allocnode->key.base = result; + allocnode->key.size = size; + allocnode->left = NULL; + allocnode->right = NULL; + usm_splay_tree_insert (&usm_allocations, allocnode); + + /* Update the free chain. */ + size_t stillfree_size = freenode->key.size - size; + if (stillfree_size > 0) + { + freenode->key.base = freenode->key.base + size; + freenode->key.size = stillfree_size; + } + else + { + usm_splay_tree_remove (&usm_free_space, &freenode->key); + free (freenode); + } + } + + /* Release the lock. */ + __atomic_store_n (&usm_lock, 0, MEMMODEL_RELEASE); + + return result; +} + +/* USM "free". Moves an address range from usm_allocations to usm_free_space + and merges that record with any contiguous free memory. */ + +static void +usm_free (void *addr) +{ + /* Acquire the lock. */ + while (__atomic_exchange_n (&usm_lock, 1, MEMMODEL_ACQUIRE) == 1) + ; + + /* Convert the memory map to free. */ + struct usm_splay_tree_key_s key = {addr}; + usm_splay_tree_key found = usm_splay_tree_lookup (&usm_allocations, &key); + if (!found) + GOMP_PLUGIN_fatal ("invalid free"); + usm_splay_tree_remove (&usm_allocations, &key); + usm_splay_tree_insert (&usm_free_space, (usm_splay_tree_node)found); + usm_coalesce_free_space (); + + /* Release the lock. */ + __atomic_store_n (&usm_lock, 0, MEMMODEL_RELEASE); +} + +#undef ALIGN + +/* This allows splay-tree.c to call gomp_fatal in this context. The splay + tree code doesn't use the variadic arguments right now. */ +#define gomp_fatal(MSG, ...) GOMP_PLUGIN_fatal (MSG) + +/* Include the splay tree code inline, with the prefixes added. */ +#define splay_tree_prefix usm +#define splay_tree_c +#include "splay-tree.h" diff --git a/libgomp/usmpin-allocator.c b/libgomp/usmpin-allocator.c index 311bda5054e..e1df6197a88 100644 --- a/libgomp/usmpin-allocator.c +++ b/libgomp/usmpin-allocator.c @@ -313,6 +313,9 @@ usmpin_realloc (usmpin_ctx_p ctx, void *addr, size_t newsize) return addr; } +/* Ensure that the splay tree will link into the plugin. */ +#define gomp_fatal GOMP_PLUGIN_fatal + /* Include the splay tree code inline, with the prefixes added. */ #define splay_tree_prefix usmpin #define splay_tree_c

[v2,6/8] amdgcn: libgomp plugin USM implementation

Commit Message

Patch