From patchwork Wed Nov 13 12:42:10 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Andrew Stubbs X-Patchwork-Id: 1194243 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (sender SPF authorized) smtp.mailfrom=gcc.gnu.org (client-ip=209.132.180.131; helo=sourceware.org; envelope-from=gcc-patches-return-513234-incoming=patchwork.ozlabs.org@gcc.gnu.org; receiver=) Authentication-Results: ozlabs.org; dmarc=none (p=none dis=none) header.from=codesourcery.com Authentication-Results: ozlabs.org; dkim=pass (1024-bit key; unprotected) header.d=gcc.gnu.org header.i=@gcc.gnu.org header.b="V88NzZET"; dkim-atps=neutral Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 47Cklp1tBJz9sP6 for ; Wed, 13 Nov 2019 23:42:28 +1100 (AEDT) DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender:to :from:subject:message-id:date:mime-version:content-type; q=dns; s=default; b=ASlD72hP/UuEc4CIwj0jWjo9uxDll1DEH8+TcQhJsrz5mdyVVd /URQTi+HhyP1e3XCBtI6sk7kSzV5AxRn2vF27Di14sNyAQHxA8lSkZB3qTGB+ouI hfsPUwee/du5vaZSTE/IIUFWgoWiP0Z/jCYwjEWDy0qZplAwowVasLVhM= DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender:to :from:subject:message-id:date:mime-version:content-type; s= default; bh=RWRNpYh0NE4XDrPk5QrdwrB6RAc=; b=V88NzZETbQl/Lv4v51WF 3caelvGjrRzDUjrAzNSuAVyMFd1zQh5WaZqJDXp3K9884/8AEsVirlYaow+/W8pY fwb2N7fTKNdL2Pj02RtMPXoZj7mUcvMSOz3I3QcK0ql9Ru+5O5AurDPHmYteBgeS Z0cxtvui9wF7ADFNRW68yCs= Received: (qmail 77717 invoked by alias); 13 Nov 2019 12:42:20 -0000 Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm 
Precedence: bulk List-Id: List-Unsubscribe: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Delivered-To: mailing list gcc-patches@gcc.gnu.org Received: (qmail 77709 invoked by uid 89); 13 Nov 2019 12:42:20 -0000 Authentication-Results: sourceware.org; auth=none X-Spam-SWARE-Status: No, score=-18.4 required=5.0 tests=AWL, BAYES_00, GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, SPF_PASS autolearn=ham version=3.3.1 spammy= X-HELO: esa3.mentor.iphmx.com Received: from esa3.mentor.iphmx.com (HELO esa3.mentor.iphmx.com) (68.232.137.180) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with ESMTP; Wed, 13 Nov 2019 12:42:18 +0000 IronPort-SDR: RvqOe1vkdqyg1g5dX5Q2Osb7JQwYziaTNHb4zdRIN9HikEoKXogF8zfApZTm0poSXlJcrdR8cS kO4mmlwfG9c/K+0Rre7/oMysP7At8ImG1q4ZRjX5LDS1Dq4U7oBoKhCDMVL8K4mb+v5B9gYWqB 1YhSqxgRe2bc1kf7CPPE7KCrhFxjdEPvZYDmPep1g75bpvoEPqwYCNOZ9lnf/pHytRFVy7GqxH o7umeBFwNqcQfmGG9hRgjQbzUzQnlx6KrnYh3HdFXIu7TobAyrQGfdES6l2NLW0H4CFexebUC1 vno= Received: from orw-gwy-01-in.mentorg.com ([192.94.38.165]) by esa3.mentor.iphmx.com with ESMTP; 13 Nov 2019 04:42:16 -0800 IronPort-SDR: HZc+NQvtzVKQa5xZY65JNxGtrnIAiNSeR3NEvtSaPbL9gOQK8en2lymjTsY8hKZRsSUVA4BAOL IfdMYoB/uhe/EZYw5WzCHobT59CmboVjitNRDG23PaM4oviF3J/pfL4Z51lFbV8g4ZonFqmAvL kUDWI+jXC9zDfGs8+kwjpT4GCAUUQwIIwK4J6zaIjlk7EqRMyGXwXctXzeRuWvy6+JjP3ftOpD AWBwX107EDdM+YzvNVJIyreG8m3jK8OACI648COLd71PZLSb0+Ruj3GY14Hguibxaxwxh9F5OT w7k= To: "gcc-patches@gcc.gnu.org" From: Andrew Stubbs Subject: [committed, amdgcn] Move gcn-run heap into GPU memory Message-ID: Date: Wed, 13 Nov 2019 12:42:10 +0000 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Thunderbird/68.1.2 MIME-Version: 1.0 I've just committed the attached patch. The patch adjusts the amdgcn-specific gcn-run tool such that it places the heap memory in actual GPU memory (previously it was accessing host memory via PCI). 
This is a performance optimization, of course, but also matches the conditions that offload kernels will experience, and therefore makes standalone testing more meaningful. Andrew Move gcn-run heap into GPU memory. 2019-11-13 Andrew Stubbs gcc/ * config/gcn/gcn-run.c (heap_region): New global variable. (struct hsa_runtime_fn_info): Add hsa_memory_assign_agent_fn. (init_hsa_runtime_functions): Initialize hsa_memory_assign_agent. (get_kernarg_region): Move contents to .... (get_memory_region): .... here. (get_heap_region): New function. (init_device): Initialize the heap_region. (device_malloc): Add region parameter. (struct kernargs): Move heap .... (heap): ... to global scope. (main): Allocate heap separate to kernargs. diff --git a/gcc/config/gcn/gcn-run.c b/gcc/config/gcn/gcn-run.c index 21852d78bc5..cf4870ff4be 100644 --- a/gcc/config/gcn/gcn-run.c +++ b/gcc/config/gcn/gcn-run.c @@ -72,6 +72,7 @@ uint64_t main_kernel = 0; hsa_executable_t executable = { 0 }; hsa_region_t kernargs_region = { 0 }; +hsa_region_t heap_region = { 0 }; uint32_t kernarg_segment_size = 0; uint32_t group_segment_size = 0; uint32_t private_segment_size = 0; @@ -135,6 +136,8 @@ struct hsa_runtime_fn_info hsa_signal_t *signal); hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size, void **ptr); + hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent, + hsa_access_permission_t access); hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src, size_t size); hsa_status_t (*hsa_memory_free_fn) (void *ptr); @@ -204,6 +207,7 @@ init_hsa_runtime_functions (void) DLSYM_FN (hsa_executable_freeze) DLSYM_FN (hsa_signal_create) DLSYM_FN (hsa_memory_allocate) + DLSYM_FN (hsa_memory_assign_agent) DLSYM_FN (hsa_memory_copy) DLSYM_FN (hsa_memory_free) DLSYM_FN (hsa_signal_destroy) @@ -282,7 +286,8 @@ get_gpu_agent (hsa_agent_t agent, void *data __attribute__ ((unused))) suitable one has been found. 
*/ static hsa_status_t -get_kernarg_region (hsa_region_t region, void *data __attribute__ ((unused))) +get_memory_region (hsa_region_t region, hsa_region_t *retval, + hsa_region_global_flag_t kind) { /* Reject non-global regions. */ hsa_region_segment_t segment; @@ -294,9 +299,9 @@ get_kernarg_region (hsa_region_t region, void *data __attribute__ ((unused))) hsa_region_global_flag_t flags; hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags); - if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) + if (flags & kind) { - kernargs_region = region; + *retval = region; return HSA_STATUS_INFO_BREAK; } @@ -304,6 +309,20 @@ get_kernarg_region (hsa_region_t region, void *data __attribute__ ((unused))) return HSA_STATUS_SUCCESS; } +static hsa_status_t +get_kernarg_region (hsa_region_t region, void *data __attribute__((unused))) +{ + return get_memory_region (region, &kernargs_region, + HSA_REGION_GLOBAL_FLAG_KERNARG); +} + +static hsa_status_t +get_heap_region (hsa_region_t region, void *data __attribute__((unused))) +{ + return get_memory_region (region, &heap_region, + HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED); +} + /* Initialize the HSA Runtime library and GPU device. */ static void @@ -338,6 +357,13 @@ init_device () NULL), status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK, "Locate kernargs memory"); + + /* Select a memory region for the kernel heap. + The call-back function, get_heap_region, does the selection. */ + XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_heap_region, + NULL), + status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK, + "Locate device memory"); } @@ -593,10 +619,10 @@ found_main:; __flat_scalar GCN address space). 
*/ static void * -device_malloc (size_t size) +device_malloc (size_t size, hsa_region_t region) { void *result; - XHSA (hsa_fns.hsa_memory_allocate_fn (kernargs_region, size, &result), + XHSA (hsa_fns.hsa_memory_allocate_fn (region, size, &result), "Allocate device memory"); return result; } @@ -634,14 +660,14 @@ struct kernargs } queue[1024]; unsigned int consumed; } output_data; - - struct heap - { - int64_t size; - char data[0]; - } heap; }; +struct heap +{ + int64_t size; + char data[0]; +} heap; + /* Print any console output from the kernel. We print all entries from "consumed" to the next entry without a "written" flag, or "next_output" is reached. The buffer is circular, but the @@ -811,13 +837,19 @@ main (int argc, char *argv[]) /* Allocate device memory for both function parameters and the argv data. */ - size_t heap_size = 10 * 1024 * 1024; /* 10MB. */ - struct kernargs *kernargs = device_malloc (sizeof (*kernargs) + heap_size); + struct kernargs *kernargs = device_malloc (sizeof (*kernargs), + kernargs_region); struct argdata { int64_t argv_data[kernel_argc]; char strings[args_size]; - } *args = device_malloc (sizeof (struct argdata)); + } *args = device_malloc (sizeof (struct argdata), kernargs_region); + + size_t heap_size = 10 * 1024 * 1024; /* 10MB. */ + struct heap *heap = device_malloc (heap_size, heap_region); + XHSA (hsa_fns.hsa_memory_assign_agent_fn (heap, device, + HSA_ACCESS_PERMISSION_RW), + "Assign heap to device agent"); /* Write the data to the target. */ kernargs->argc = kernel_argc; @@ -837,8 +869,8 @@ main (int argc, char *argv[]) memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1); offset += arg_len; } - kernargs->heap_ptr = (int64_t) &kernargs->heap; - kernargs->heap.size = heap_size; + kernargs->heap_ptr = (int64_t) heap; + hsa_fns.hsa_memory_copy_fn (&heap->size, &heap_size, sizeof (heap_size)); /* Run constructors on the GPU. */ run (init_array_kernel, kernargs);