From patchwork Wed Nov 19 19:44:48 2014 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Bernd Schmidt X-Patchwork-Id: 412493 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 7E856140142 for ; Thu, 20 Nov 2014 06:45:45 +1100 (AEDT) DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender :message-id:date:from:mime-version:to:cc:subject:references :in-reply-to:content-type; q=dns; s=default; b=qMm5R3a2Z4VM1XeeF xYmi5IOnGJpPBf00k/46EdkyQ1CUpiSPvvs2Ah6a0fDNfOy9pCzLorruEokzJrbC qumeKASL9dd/e973bb9DBALSFCiGSSky/OztldblZqw6zV0kCuJfbSzqPlhBGWbm 0N8/+lYgL71Dcnr7bDG4/wNYyk= DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender :message-id:date:from:mime-version:to:cc:subject:references :in-reply-to:content-type; s=default; bh=pwawFmTWPt6zch/a+6SYx6/ ATx4=; b=bI70nup0gMyPtKA2A2h6esePuaZybrjrDQHzObbU+KwaO/23ZGB3X7W q/DdOsdpj8RaLnxw4Mk4G6hoNmN6d4fKAnzqT/M+kF3RnxHc4k1p8hv0dh5rET3a CGYVNdTDkdf1VKpvtfBqlOOZLinlxAM7yvSJeM5j7NF+3amQ+9wo= Received: (qmail 30524 invoked by alias); 19 Nov 2014 19:45:22 -0000 Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Delivered-To: mailing list gcc-patches@gcc.gnu.org Received: (qmail 30505 invoked by uid 89); 19 Nov 2014 19:45:21 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=1.3 required=5.0 tests=AWL, BAYES_50, KAM_STOCKGEN, RCVD_IN_DNSWL_NONE, UNSUBSCRIBE_BODY autolearn=no version=3.3.2 X-HELO: relay1.mentorg.com Received: from relay1.mentorg.com (HELO relay1.mentorg.com) (192.94.38.131) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with ESMTP; Wed, 19 Nov 2014 19:44:53 +0000 Received: from nat-ies.mentorg.com ([192.94.31.2] helo=SVR-IES-FEM-01.mgc.mentorg.com) by relay1.mentorg.com with esmtp id 1XrBBF-00011m-Ph from Bernd_Schmidt@mentor.com ; Wed, 19 Nov 2014 11:44:49 -0800 Received: from [127.0.0.1] (137.202.0.76) by SVR-IES-FEM-01.mgc.mentorg.com (137.202.0.104) with Microsoft SMTP Server id 14.3.181.6; Wed, 19 Nov 2014 19:44:42 +0000 Message-ID: <546CF330.1020701@codesourcery.com> Date: Wed, 19 Nov 2014 20:44:48 +0100 From: Bernd Schmidt User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Thunderbird/24.8.0 MIME-Version: 1.0 To: Julian Brown , Jakub Jelinek CC: , Thomas Schwinge , Ilya Verbin , Cesar Philippidis Subject: Re: [PATCH 1/5] OpenACC 2.0 support for libgomp - OpenACC runtime, NVidia PTX/CUDA plugin (repost) References: <20140923191931.2177e60f@octopus> <20141111135323.29e0f27b@octopus> <20141112100626.GP5026@tucnak.redhat.com> <20141115004904.55b16737@octopus> In-Reply-To: <20141115004904.55b16737@octopus> I've had some trouble with this patch as well - parts of it appear malformed, and in one instance a file still references the nonexistent target.h rather than libgomp_target.h. That was fixed relatively easily, but it is also missing some changes that Cesar made to our local sources recently, and which are required by Thomas' middle-end submission. I'm attaching the patch in the form in which I've made it work locally, plus Cesar's patch which is needed on top of it. Julian, you'll probably want to look for that patch since it also included testsuite changes. Cesar - have a look over this please and maybe explain for review purposes what your patch does. On the bright side, I now have a local tree based on gcc trunk with all posted patches plus several additional fixes, and it appears to be offloading stuff to ptx. Bernd commit 4a5e8ad6d5c5fa2e944d1318dbcba28f234abffe Author: Bernd Schmidt Date: Wed Nov 19 18:35:41 2014 +0100 Cesar's latest patch diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map index f6e70e9..0fa62ff 100644 --- a/libgomp/libgomp.map +++ b/libgomp/libgomp.map @@ -310,6 +310,8 @@ GOACC_2.0 { GOACC_parallel; GOACC_update; GOACC_wait; + GOACC_get_thread_num; + GOACC_get_num_threads; }; GOMP_PLUGIN_1.0 { diff --git a/libgomp/libgomp_g.h b/libgomp/libgomp_g.h index 44f200c..3db5676 100644 --- a/libgomp/libgomp_g.h +++ b/libgomp/libgomp_g.h @@ -226,5 +226,7 @@ extern void GOACC_parallel (int, void (*) (void *), const void *, size_t, void **, size_t *, unsigned short *, int, int, int, int, int, ...); extern void GOACC_wait (int, int, ...); +extern int GOACC_get_num_threads (void); +extern int GOACC_get_thread_num (void); #endif /* LIBGOMP_G_H */ diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c index 0ff44bf..e142384 100644 --- a/libgomp/oacc-parallel.c +++ b/libgomp/oacc-parallel.c @@ -115,9 +115,6 @@ GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target, splay_tree_key tgt_fn_key; void (*tgt_fn); - if (num_gangs != 1) - gomp_fatal ("num_gangs (%d) different from one is not yet supported", - num_gangs); if (num_workers != 1) gomp_fatal ("num_workers (%d) different from one is not yet supported", num_workers); @@ -386,3 +383,15 @@ GOACC_wait (int async, int num_waits, ...) va_end (ap); } + +int +GOACC_get_num_threads (void) +{ + return 1; +} + +int +GOACC_get_thread_num (void) +{ + return 0; +} diff --git a/libgomp/oacc-ptx.h b/libgomp/oacc-ptx.h new file mode 100644 index 0000000..1af81b2 --- /dev/null +++ b/libgomp/oacc-ptx.h @@ -0,0 +1,400 @@ +#define ABORT_PTX \ + ".version 3.1\n" \ + ".target sm_30\n" \ + ".address_size 64\n" \ + ".visible .func abort;\n" \ + ".visible .func abort\n" \ + "{\n" \ + "trap;\n" \ + "ret;\n" \ + "}\n" \ + ".visible .func _gfortran_abort;\n" \ + ".visible .func _gfortran_abort\n" \ + "{\n" \ + "trap;\n" \ + "ret;\n" \ + "}\n" \ + +/* Generated with: + + $ echo 'int acc_on_device(int d) { return __builtin_acc_on_device(d); } int acc_on_device_h_(int *d) { return acc_on_device(*d); }' | accel-gcc/xgcc -Baccel-gcc -x c - -o - -S -m64 -O3 -fno-builtin-acc_on_device -fno-inline +*/ +#define ACC_ON_DEVICE_PTX \ + " .version 3.1\n" \ + " .target sm_30\n" \ + " .address_size 64\n" \ + ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1);\n" \ + ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1)\n" \ + "{\n" \ + " .reg.u32 %ar1;\n" \ + ".reg.u32 %retval;\n" \ + " .reg.u64 %hr10;\n" \ + " .reg.u32 %r24;\n" \ + " .reg.u32 %r25;\n" \ + " .reg.pred %r27;\n" \ + " .reg.u32 %r30;\n" \ + " ld.param.u32 %ar1, [%in_ar1];\n" \ + " mov.u32 %r24, %ar1;\n" \ + " setp.ne.u32 %r27,%r24,4;\n" \ + " set.u32.eq.u32 %r30,%r24,5;\n" \ + " neg.s32 %r25, %r30;\n" \ + " @%r27 bra $L3;\n" \ + " mov.u32 %r25, 1;\n" \ + "$L3:\n" \ + " mov.u32 %retval, %r25;\n" \ + " st.param.u32 [%out_retval], %retval;\n" \ + " ret;\n" \ + " }\n" \ + ".visible .func (.param.u32 %out_retval)acc_on_device_h_(.param.u64 %in_ar1);\n" \ + ".visible .func (.param.u32 %out_retval)acc_on_device_h_(.param.u64 %in_ar1)\n" \ + "{\n" \ + " .reg.u64 %ar1;\n" \ + ".reg.u32 %retval;\n" \ + " .reg.u64 %hr10;\n" \ + " .reg.u64 %r25;\n" \ + " .reg.u32 %r26;\n" \ + " .reg.u32 %r27;\n" \ + " ld.param.u64 %ar1, [%in_ar1];\n" \ + " mov.u64 %r25, %ar1;\n" \ + " ld.u32 %r26, [%r25];\n" \ + " {\n" \ + " .param.u32 %retval_in;\n" \ + " {\n" \ + " .param.u32 %out_arg0;\n" \ + " st.param.u32 [%out_arg0], %r26;\n" \ + " call (%retval_in), acc_on_device, (%out_arg0);\n" \ + " }\n" \ + " ld.param.u32 %r27, [%retval_in];\n" \ + "}\n" \ + " mov.u32 %retval, %r27;\n" \ + " st.param.u32 [%out_retval], %retval;\n" \ + " ret;\n" \ + " }" + + #define GOACC_INTERNAL_PTX \ + ".version 3.1\n" \ + ".target sm_30\n" \ + ".address_size 64\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_tid (.param .u32 %in_ar1);\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_ntid (.param .u32 %in_ar1);\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_ctaid (.param .u32 %in_ar1);\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_nctaid (.param .u32 %in_ar1);\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads;\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num;\n" \ + ".extern .func abort;\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_tid (.param .u32 %in_ar1)\n" \ + "{\n" \ + ".reg .u32 %ar1;\n" \ + ".reg .u32 %retval;\n" \ + ".reg .u64 %hr10;\n" \ + ".reg .u32 %r22;\n" \ + ".reg .u32 %r23;\n" \ + ".reg .u32 %r24;\n" \ + ".reg .u32 %r25;\n" \ + ".reg .u32 %r26;\n" \ + ".reg .u32 %r27;\n" \ + ".reg .u32 %r28;\n" \ + ".reg .u32 %r29;\n" \ + ".reg .pred %r30;\n" \ + ".reg .u32 %r31;\n" \ + ".reg .pred %r32;\n" \ + ".reg .u32 %r33;\n" \ + ".reg .pred %r34;\n" \ + ".local .align 8 .b8 %frame[4];\n" \ + "ld.param.u32 %ar1,[%in_ar1];\n" \ + "mov.u32 %r27,%ar1;\n" \ + "st.local.u32 [%frame],%r27;\n" \ + "ld.local.u32 %r28,[%frame];\n" \ + "mov.u32 %r29,1;\n" \ + "setp.eq.u32 %r30,%r28,%r29;\n" \ + "@%r30 bra $L4;\n" \ + "mov.u32 %r31,2;\n" \ + "setp.eq.u32 %r32,%r28,%r31;\n" \ + "@%r32 bra $L5;\n" \ + "mov.u32 %r33,0;\n" \ + "setp.eq.u32 %r34,%r28,%r33;\n" \ + "@!%r34 bra $L8;\n" \ + "mov.u32 %r23,%tid.x;\n" \ + "mov.u32 %r22,%r23;\n" \ + "bra $L7;\n" \ + "$L4:\n" \ + "mov.u32 %r24,%tid.y;\n" \ + "mov.u32 %r22,%r24;\n" \ + "bra $L7;\n" \ + "$L5:\n" \ + "mov.u32 %r25,%tid.z;\n" \ + "mov.u32 %r22,%r25;\n" \ + "bra $L7;\n" \ + "$L8:\n" \ + "{\n" \ + "{\n" \ + "call abort;\n" \ + "}\n" \ + "}\n" \ + "$L7:\n" \ + "mov.u32 %r26,%r22;\n" \ + "mov.u32 %retval,%r26;\n" \ + "st.param.u32 [%out_retval],%retval;\n" \ + "ret;\n" \ + "}\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_ntid (.param .u32 %in_ar1)\n" \ + "{\n" \ + ".reg .u32 %ar1;\n" \ + ".reg .u32 %retval;\n" \ + ".reg .u64 %hr10;\n" \ + ".reg .u32 %r22;\n" \ + ".reg .u32 %r23;\n" \ + ".reg .u32 %r24;\n" \ + ".reg .u32 %r25;\n" \ + ".reg .u32 %r26;\n" \ + ".reg .u32 %r27;\n" \ + ".reg .u32 %r28;\n" \ + ".reg .u32 %r29;\n" \ + ".reg .pred %r30;\n" \ + ".reg .u32 %r31;\n" \ + ".reg .pred %r32;\n" \ + ".reg .u32 %r33;\n" \ + ".reg .pred %r34;\n" \ + ".local .align 8 .b8 %frame[4];\n" \ + "ld.param.u32 %ar1,[%in_ar1];\n" \ + "mov.u32 %r27,%ar1;\n" \ + "st.local.u32 [%frame],%r27;\n" \ + "ld.local.u32 %r28,[%frame];\n" \ + "mov.u32 %r29,1;\n" \ + "setp.eq.u32 %r30,%r28,%r29;\n" \ + "@%r30 bra $L11;\n" \ + "mov.u32 %r31,2;\n" \ + "setp.eq.u32 %r32,%r28,%r31;\n" \ + "@%r32 bra $L12;\n" \ + "mov.u32 %r33,0;\n" \ + "setp.eq.u32 %r34,%r28,%r33;\n" \ + "@!%r34 bra $L15;\n" \ + "mov.u32 %r23,%ntid.x;\n" \ + "mov.u32 %r22,%r23;\n" \ + "bra $L14;\n" \ + "$L11:\n" \ + "mov.u32 %r24,%ntid.y;\n" \ + "mov.u32 %r22,%r24;\n" \ + "bra $L14;\n" \ + "$L12:\n" \ + "mov.u32 %r25,%ntid.z;\n" \ + "mov.u32 %r22,%r25;\n" \ + "bra $L14;\n" \ + "$L15:\n" \ + "{\n" \ + "{\n" \ + "call abort;\n" \ + "}\n" \ + "}\n" \ + "$L14:\n" \ + "mov.u32 %r26,%r22;\n" \ + "mov.u32 %retval,%r26;\n" \ + "st.param.u32 [%out_retval],%retval;\n" \ + "ret;\n" \ + "}\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_ctaid (.param .u32 %in_ar1)\n" \ + "{\n" \ + ".reg .u32 %ar1;\n" \ + ".reg .u32 %retval;\n" \ + ".reg .u64 %hr10;\n" \ + ".reg .u32 %r22;\n" \ + ".reg .u32 %r23;\n" \ + ".reg .u32 %r24;\n" \ + ".reg .u32 %r25;\n" \ + ".reg .u32 %r26;\n" \ + ".reg .u32 %r27;\n" \ + ".reg .u32 %r28;\n" \ + ".reg .u32 %r29;\n" \ + ".reg .pred %r30;\n" \ + ".reg .u32 %r31;\n" \ + ".reg .pred %r32;\n" \ + ".reg .u32 %r33;\n" \ + ".reg .pred %r34;\n" \ + ".local .align 8 .b8 %frame[4];\n" \ + "ld.param.u32 %ar1,[%in_ar1];\n" \ + "mov.u32 %r27,%ar1;\n" \ + "st.local.u32 [%frame],%r27;\n" \ + "ld.local.u32 %r28,[%frame];\n" \ + "mov.u32 %r29,1;\n" \ + "setp.eq.u32 %r30,%r28,%r29;\n" \ + "@%r30 bra $L18;\n" \ + "mov.u32 %r31,2;\n" \ + "setp.eq.u32 %r32,%r28,%r31;\n" \ + "@%r32 bra $L19;\n" \ + "mov.u32 %r33,0;\n" \ + "setp.eq.u32 %r34,%r28,%r33;\n" \ + "@!%r34 bra $L22;\n" \ + "mov.u32 %r23,%ctaid.x;\n" \ + "mov.u32 %r22,%r23;\n" \ + "bra $L21;\n" \ + "$L18:\n" \ + "mov.u32 %r24,%ctaid.y;\n" \ + "mov.u32 %r22,%r24;\n" \ + "bra $L21;\n" \ + "$L19:\n" \ + "mov.u32 %r25,%ctaid.z;\n" \ + "mov.u32 %r22,%r25;\n" \ + "bra $L21;\n" \ + "$L22:\n" \ + "{\n" \ + "{\n" \ + "call abort;\n" \ + "}\n" \ + "}\n" \ + "$L21:\n" \ + "mov.u32 %r26,%r22;\n" \ + "mov.u32 %retval,%r26;\n" \ + "st.param.u32 [%out_retval],%retval;\n" \ + "ret;\n" \ + "}\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_nctaid (.param .u32 %in_ar1)\n" \ + "{\n" \ + ".reg .u32 %ar1;\n" \ + ".reg .u32 %retval;\n" \ + ".reg .u64 %hr10;\n" \ + ".reg .u32 %r22;\n" \ + ".reg .u32 %r23;\n" \ + ".reg .u32 %r24;\n" \ + ".reg .u32 %r25;\n" \ + ".reg .u32 %r26;\n" \ + ".reg .u32 %r27;\n" \ + ".reg .u32 %r28;\n" \ + ".reg .u32 %r29;\n" \ + ".reg .pred %r30;\n" \ + ".reg .u32 %r31;\n" \ + ".reg .pred %r32;\n" \ + ".reg .u32 %r33;\n" \ + ".reg .pred %r34;\n" \ + ".local .align 8 .b8 %frame[4];\n" \ + "ld.param.u32 %ar1,[%in_ar1];\n" \ + "mov.u32 %r27,%ar1;\n" \ + "st.local.u32 [%frame],%r27;\n" \ + "ld.local.u32 %r28,[%frame];\n" \ + "mov.u32 %r29,1;\n" \ + "setp.eq.u32 %r30,%r28,%r29;\n" \ + "@%r30 bra $L25;\n" \ + "mov.u32 %r31,2;\n" \ + "setp.eq.u32 %r32,%r28,%r31;\n" \ + "@%r32 bra $L26;\n" \ + "mov.u32 %r33,0;\n" \ + "setp.eq.u32 %r34,%r28,%r33;\n" \ + "@!%r34 bra $L29;\n" \ + "mov.u32 %r23,%nctaid.x;\n" \ + "mov.u32 %r22,%r23;\n" \ + "bra $L28;\n" \ + "$L25:\n" \ + "mov.u32 %r24,%nctaid.y;\n" \ + "mov.u32 %r22,%r24;\n" \ + "bra $L28;\n" \ + "$L26:\n" \ + "mov.u32 %r25,%nctaid.z;\n" \ + "mov.u32 %r22,%r25;\n" \ + "bra $L28;\n" \ + "$L29:\n" \ + "{\n" \ + "{\n" \ + "call abort;\n" \ + "}\n" \ + "}\n" \ + "$L28:\n" \ + "mov.u32 %r26,%r22;\n" \ + "mov.u32 %retval,%r26;\n" \ + "st.param.u32 [%out_retval],%retval;\n" \ + "ret;\n" \ + "}\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads\n" \ + "{\n" \ + ".reg .u32 %retval;\n" \ + ".reg .u64 %hr10;\n" \ + ".reg .u32 %r22;\n" \ + ".reg .u32 %r23;\n" \ + ".reg .u32 %r24;\n" \ + ".reg .u32 %r25;\n" \ + ".reg .u32 %r26;\n" \ + ".reg .u32 %r27;\n" \ + ".reg .u32 %r28;\n" \ + ".reg .u32 %r29;\n" \ + "mov.u32 %r26,0;\n" \ + "{\n" \ + ".param .u32 %retval_in;\n" \ + "{\n" \ + ".param .u32 %out_arg0;\n" \ + "st.param.u32 [%out_arg0],%r26;\n" \ + "call (%retval_in),GOACC_ntid,(%out_arg0);\n" \ + "}\n" \ + "ld.param.u32 %r27,[%retval_in];\n" \ + "}\n" \ + "mov.u32 %r22,%r27;\n" \ + "mov.u32 %r28,0;\n" \ + "{\n" \ + ".param .u32 %retval_in;\n" \ + "{\n" \ + ".param .u32 %out_arg0;\n" \ + "st.param.u32 [%out_arg0],%r28;\n" \ + "call (%retval_in),GOACC_nctaid,(%out_arg0);\n" \ + "}\n" \ + "ld.param.u32 %r29,[%retval_in];\n" \ + "}\n" \ + "mov.u32 %r23,%r29;\n" \ + "mul.lo.u32 %r24,%r22,%r23;\n" \ + "mov.u32 %r25,%r24;\n" \ + "mov.u32 %retval,%r25;\n" \ + "st.param.u32 [%out_retval],%retval;\n" \ + "ret;\n" \ + "}\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num\n" \ + "{\n" \ + ".reg .u32 %retval;\n" \ + ".reg .u64 %hr10;\n" \ + ".reg .u32 %r22;\n" \ + ".reg .u32 %r23;\n" \ + ".reg .u32 %r24;\n" \ + ".reg .u32 %r25;\n" \ + ".reg .u32 %r26;\n" \ + ".reg .u32 %r27;\n" \ + ".reg .u32 %r28;\n" \ + ".reg .u32 %r29;\n" \ + ".reg .u32 %r30;\n" \ + ".reg .u32 %r31;\n" \ + ".reg .u32 %r32;\n" \ + ".reg .u32 %r33;\n" \ + "mov.u32 %r28,0;\n" \ + "{\n" \ + ".param .u32 %retval_in;\n" \ + "{\n" \ + ".param .u32 %out_arg0;\n" \ + "st.param.u32 [%out_arg0],%r28;\n" \ + "call (%retval_in),GOACC_ntid,(%out_arg0);\n" \ + "}\n" \ + "ld.param.u32 %r29,[%retval_in];\n" \ + "}\n" \ + "mov.u32 %r22,%r29;\n" \ + "mov.u32 %r30,0;\n" \ + "{\n" \ + ".param .u32 %retval_in;\n" \ + "{\n" \ + ".param .u32 %out_arg0;\n" \ + "st.param.u32 [%out_arg0],%r30;\n" \ + "call (%retval_in),GOACC_ctaid,(%out_arg0);\n" \ + "}\n" \ + "ld.param.u32 %r31,[%retval_in];\n" \ + "}\n" \ + "mov.u32 %r23,%r31;\n" \ + "mul.lo.u32 %r24,%r22,%r23;\n" \ + "mov.u32 %r32,0;\n" \ + "{\n" \ + ".param .u32 %retval_in;\n" \ + "{\n" \ + ".param .u32 %out_arg0;\n" \ + "st.param.u32 [%out_arg0],%r32;\n" \ + "call (%retval_in),GOACC_tid,(%out_arg0);\n" \ + "}\n" \ + "ld.param.u32 %r33,[%retval_in];\n" \ + "}\n" \ + "mov.u32 %r25,%r33;\n" \ + "add.u32 %r26,%r24,%r25;\n" \ + "mov.u32 %r27,%r26;\n" \ + "mov.u32 %retval,%r27;\n" \ + "st.param.u32 [%out_retval],%retval;\n" \ + "ret;\n" \ + "}\n" diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 3d1b81b..7fedd2d 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -35,6 +35,7 @@ #include "libgomp.h" #include "libgomp_target.h" #include "libgomp-plugin.h" +#include "oacc-ptx.h" #include "oacc-plugin.h" #include @@ -722,78 +723,6 @@ PTX_get_num_devices (void) return n; } -#define ABORT_PTX \ - ".version 3.1\n" \ - ".target sm_30\n" \ - ".address_size 64\n" \ - ".visible .func abort;\n" \ - ".visible .func abort\n" \ - "{\n" \ - "trap;\n" \ - "ret;\n" \ - "}\n" \ - ".visible .func _gfortran_abort;\n" \ - ".visible .func _gfortran_abort\n" \ - "{\n" \ - "trap;\n" \ - "ret;\n" \ - "}\n" \ - -/* Generated with: - - $ echo 'int acc_on_device(int d) { return __builtin_acc_on_device(d); } int acc_on_device_(int *d) { return acc_on_device(*d); }' | accel-gcc/xgcc -Baccel-gcc -x c - -o - -S -m64 -O3 -fno-builtin-acc_on_device -fno-inline -*/ -#define ACC_ON_DEVICE_PTX \ - " .version 3.1\n" \ - " .target sm_30\n" \ - " .address_size 64\n" \ - ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1);\n" \ - ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1)\n" \ - "{\n" \ - " .reg.u32 %ar1;\n" \ - ".reg.u32 %retval;\n" \ - " .reg.u64 %hr10;\n" \ - " .reg.u32 %r24;\n" \ - " .reg.u32 %r25;\n" \ - " .reg.pred %r27;\n" \ - " .reg.u32 %r30;\n" \ - " ld.param.u32 %ar1, [%in_ar1];\n" \ - " mov.u32 %r24, %ar1;\n" \ - " setp.ne.u32 %r27,%r24,4;\n" \ - " set.u32.eq.u32 %r30,%r24,5;\n" \ - " neg.s32 %r25, %r30;\n" \ - " @%r27 bra $L3;\n" \ - " mov.u32 %r25, 1;\n" \ - "$L3:\n" \ - " mov.u32 %retval, %r25;\n" \ - " st.param.u32 [%out_retval], %retval;\n" \ - " ret;\n" \ - " }\n" \ - ".visible .func (.param.u32 %out_retval)acc_on_device_(.param.u64 %in_ar1);\n" \ - ".visible .func (.param.u32 %out_retval)acc_on_device_(.param.u64 %in_ar1)\n" \ - "{\n" \ - " .reg.u64 %ar1;\n" \ - ".reg.u32 %retval;\n" \ - " .reg.u64 %hr10;\n" \ - " .reg.u64 %r25;\n" \ - " .reg.u32 %r26;\n" \ - " .reg.u32 %r27;\n" \ - " ld.param.u64 %ar1, [%in_ar1];\n" \ - " mov.u64 %r25, %ar1;\n" \ - " ld.u32 %r26, [%r25];\n" \ - " {\n" \ - " .param.u32 %retval_in;\n" \ - " {\n" \ - " .param.u32 %out_arg0;\n" \ - " st.param.u32 [%out_arg0], %r26;\n" \ - " call (%retval_in), acc_on_device, (%out_arg0);\n" \ - " }\n" \ - " ld.param.u32 %r27, [%retval_in];\n" \ - "}\n" \ - " mov.u32 %retval, %r27;\n" \ - " st.param.u32 [%out_retval], %retval;\n" \ - " ret;\n" \ - " }" static void link_ptx (CUmodule *module, char *ptx_code) @@ -856,6 +785,16 @@ link_ptx (CUmodule *module, char *ptx_code) cuda_error (r)); } + char *goacc_internal_ptx = GOACC_INTERNAL_PTX; + r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx, + strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0); + if (r != CUDA_SUCCESS) + { + GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]); + GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s", + cuda_error (r)); + } + r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code, strlen (ptx_code) + 1, 0, 0, 0, 0); if (r != CUDA_SUCCESS) @@ -1043,7 +982,7 @@ PTX_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, kargs[0] = &dp; r = cuLaunchKernel (function, - 1, 1, 1, + num_gangs, 1, 1, nthreads_in_block, 1, 1, 0, dev_str->stream, kargs, 0); if (r != CUDA_SUCCESS) diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reduction-6.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reduction-6.f90 new file mode 100644 index 0000000..6325431 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-fortran/reduction-6.f90 @@ -0,0 +1,30 @@ +! { dg-do run } + +program reduction + implicit none + + integer, parameter :: n = 100 + integer :: i, s1, s2, vs1, vs2 + + s1 = 0 + s2 = 0 + vs1 = 0 + vs2 = 0 + + !$acc parallel vector_length (1000) + !$acc loop reduction(+:s1, s2) + do i = 1, n + s1 = s1 + 1 + s2 = s2 + 2 + end do + !$acc end parallel + + ! Verify the results + do i = 1, n + vs1 = vs1 + 1 + vs2 = vs2 + 2 + end do + + if (s1.ne.vs1) call abort () + if (s2.ne.vs2) call abort () +end program reduction