diff mbox

[1/5] OpenACC 2.0 support for libgomp - OpenACC runtime, NVidia PTX/CUDA plugin (repost)

Message ID 546CF330.1020701@codesourcery.com
State New
Headers show

Commit Message

Bernd Schmidt Nov. 19, 2014, 7:44 p.m. UTC
I've had some trouble with this patch as well - parts of it appear 
malformed, and in one instance a file still references the nonexistent 
target.h rather than libgomp_target.h. That was fixed relatively easily, 
but it is also missing some changes that Cesar made to our local sources 
recently, and which are required by Thomas' middle-end submission.

I'm attaching the patch in the form in which I've made it work locally, 
plus Cesar's patch which is needed on top of it. Julian, you'll probably 
want to look for that patch since it also included testsuite changes. 
Cesar - have a look over this please and maybe explain for review 
purposes what your patch does.

On the bright side, I now have a local tree based on gcc trunk with all 
posted patches plus several additional fixes, and it appears to be 
offloading stuff to ptx.


Bernd

Comments

Cesar Philippidis Nov. 19, 2014, 8:13 p.m. UTC | #1
On 11/19/2014 11:44 AM, Bernd Schmidt wrote:

> I'm attaching the patch in the form in which I've made it work locally,
> plus Cesar's patch which is needed on top of it. Julian, you'll probably
> want to look for that patch since it also included testsuite changes.
> Cesar - have a look over this please and maybe explain for review
> purposes what your patch does.

Julian's initial libgomp patch set somewhat diverged both from our
internal tree and gomp-4_0-branch. I think he was trying to get an
earlier snapshot of gomp-4_0-branch to play nicely with gomp4-offload
branch, and my patch went in kind of late.

Anyway, here's the link to my original patch:

https://gcc.gnu.org/ml/gcc-patches/2014-10/msg03392.html

The patch introduces two new libgomp-internal functions,
GOACC_get_thread_num and GOACC_get_num_threads. There are some more details
in the link.

Cesar
diff mbox

Patch

commit 4a5e8ad6d5c5fa2e944d1318dbcba28f234abffe
Author: Bernd Schmidt <bernds@codesourcery.com>
Date:   Wed Nov 19 18:35:41 2014 +0100

    Cesar's latest patch

diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map
index f6e70e9..0fa62ff 100644
--- a/libgomp/libgomp.map
+++ b/libgomp/libgomp.map
@@ -310,6 +310,8 @@  GOACC_2.0 {
 	GOACC_parallel;
 	GOACC_update;
 	GOACC_wait;
+	GOACC_get_thread_num;
+	GOACC_get_num_threads;
 };
 
 GOMP_PLUGIN_1.0 {
diff --git a/libgomp/libgomp_g.h b/libgomp/libgomp_g.h
index 44f200c..3db5676 100644
--- a/libgomp/libgomp_g.h
+++ b/libgomp/libgomp_g.h
@@ -226,5 +226,7 @@  extern void GOACC_parallel (int, void (*) (void *), const void *,
 			    size_t, void **, size_t *, unsigned short *,
 			    int, int, int, int, int, ...);
 extern void GOACC_wait (int, int, ...);
+extern int GOACC_get_num_threads (void);
+extern int GOACC_get_thread_num (void);
 
 #endif /* LIBGOMP_G_H */
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index 0ff44bf..e142384 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -115,9 +115,6 @@  GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target,
   splay_tree_key tgt_fn_key;
   void (*tgt_fn);
 
-  if (num_gangs != 1)
-    gomp_fatal ("num_gangs (%d) different from one is not yet supported",
-		num_gangs);
   if (num_workers != 1)
     gomp_fatal ("num_workers (%d) different from one is not yet supported",
 		num_workers);
@@ -386,3 +383,15 @@  GOACC_wait (int async, int num_waits, ...)
 
   va_end (ap);
 }
+
+int
+GOACC_get_num_threads (void)
+{
+  return 1;  /* Always 1 here; GOACC_INTERNAL_PTX has a separate PTX version.  */
+}
+
+int
+GOACC_get_thread_num (void)
+{
+  return 0;  /* Always 0 here; GOACC_INTERNAL_PTX has a separate PTX version.  */
+}
diff --git a/libgomp/oacc-ptx.h b/libgomp/oacc-ptx.h
new file mode 100644
index 0000000..1af81b2
--- /dev/null
+++ b/libgomp/oacc-ptx.h
@@ -0,0 +1,400 @@ 
+/* PTX module defining abort and _gfortran_abort; each one just traps.  */
+#define ABORT_PTX				\
+  ".version 3.1\n"				\
+  ".target sm_30\n"				\
+  ".address_size 64\n"				\
+  ".visible .func abort;\n"			\
+  ".visible .func abort\n"			\
+  "{\n"						\
+  "trap;\n"					\
+  "ret;\n"					\
+  "}\n"						\
+  ".visible .func _gfortran_abort;\n"		\
+  ".visible .func _gfortran_abort\n"		\
+  "{\n"						\
+  "trap;\n"					\
+  "ret;\n"					\
+  "}\n"
+/* Generated with:
+
+   $ echo 'int acc_on_device(int d) { return __builtin_acc_on_device(d); } int acc_on_device_h_(int *d) { return acc_on_device(*d); }' | accel-gcc/xgcc -Baccel-gcc -x c - -o - -S -m64 -O3 -fno-builtin-acc_on_device -fno-inline
+*/
+#define ACC_ON_DEVICE_PTX						\
+  "        .version        3.1\n"					\
+  "        .target sm_30\n"						\
+  "        .address_size 64\n"						\
+  ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1);\n" \
+  ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1)\n" \
+  "{\n"									\
+  "        .reg.u32 %ar1;\n"						\
+  ".reg.u32 %retval;\n"							\
+  "        .reg.u64 %hr10;\n"						\
+  "        .reg.u32 %r24;\n"						\
+  "        .reg.u32 %r25;\n"						\
+  "        .reg.pred %r27;\n"						\
+  "        .reg.u32 %r30;\n"						\
+  "        ld.param.u32 %ar1, [%in_ar1];\n"				\
+  "                mov.u32 %r24, %ar1;\n"				\
+  "                setp.ne.u32 %r27,%r24,4;\n"				\
+  "                set.u32.eq.u32 %r30,%r24,5;\n"			\
+  "                neg.s32 %r25, %r30;\n"				\
+  "        @%r27   bra     $L3;\n"					\
+  "                mov.u32 %r25, 1;\n"					\
+  "$L3:\n"								\
+  "                mov.u32 %retval, %r25;\n"				\
+  "        st.param.u32    [%out_retval], %retval;\n"			\
+  "        ret;\n"							\
+  "        }\n"								\
+  ".visible .func (.param.u32 %out_retval)acc_on_device_h_(.param.u64 %in_ar1);\n" \
+  ".visible .func (.param.u32 %out_retval)acc_on_device_h_(.param.u64 %in_ar1)\n" \
+  "{\n"									\
+  "        .reg.u64 %ar1;\n"						\
+  ".reg.u32 %retval;\n"							\
+  "        .reg.u64 %hr10;\n"						\
+  "        .reg.u64 %r25;\n"						\
+  "        .reg.u32 %r26;\n"						\
+  "        .reg.u32 %r27;\n"						\
+  "        ld.param.u64 %ar1, [%in_ar1];\n"				\
+  "                mov.u64 %r25, %ar1;\n"				\
+  "                ld.u32  %r26, [%r25];\n"				\
+  "        {\n"								\
+  "                .param.u32 %retval_in;\n"				\
+  "        {\n"								\
+  "                .param.u32 %out_arg0;\n"				\
+  "                st.param.u32 [%out_arg0], %r26;\n"			\
+  "                call (%retval_in), acc_on_device, (%out_arg0);\n"	\
+  "        }\n"								\
+  "                ld.param.u32    %r27, [%retval_in];\n"		\
+  "}\n"									\
+  "                mov.u32 %retval, %r27;\n"				\
+  "        st.param.u32    [%out_retval], %retval;\n"			\
+  "        ret;\n"							\
+  "        }"
+
+ #define GOACC_INTERNAL_PTX						\
+  ".version 3.1\n" \
+  ".target sm_30\n" \
+  ".address_size 64\n" \
+  ".visible .func (.param .u32 %out_retval) GOACC_tid (.param .u32 %in_ar1);\n" \
+  ".visible .func (.param .u32 %out_retval) GOACC_ntid (.param .u32 %in_ar1);\n" \
+  ".visible .func (.param .u32 %out_retval) GOACC_ctaid (.param .u32 %in_ar1);\n" \
+  ".visible .func (.param .u32 %out_retval) GOACC_nctaid (.param .u32 %in_ar1);\n" \
+  ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads;\n" \
+  ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num;\n" \
+  ".extern .func abort;\n" \
+  ".visible .func (.param .u32 %out_retval) GOACC_tid (.param .u32 %in_ar1)\n" \
+  "{\n" \
+  ".reg .u32 %ar1;\n" \
+  ".reg .u32 %retval;\n" \
+  ".reg .u64 %hr10;\n" \
+  ".reg .u32 %r22;\n" \
+  ".reg .u32 %r23;\n" \
+  ".reg .u32 %r24;\n" \
+  ".reg .u32 %r25;\n" \
+  ".reg .u32 %r26;\n" \
+  ".reg .u32 %r27;\n" \
+  ".reg .u32 %r28;\n" \
+  ".reg .u32 %r29;\n" \
+  ".reg .pred %r30;\n" \
+  ".reg .u32 %r31;\n" \
+  ".reg .pred %r32;\n" \
+  ".reg .u32 %r33;\n" \
+  ".reg .pred %r34;\n" \
+  ".local .align 8 .b8 %frame[4];\n" \
+  "ld.param.u32 %ar1,[%in_ar1];\n" \
+  "mov.u32 %r27,%ar1;\n" \
+  "st.local.u32 [%frame],%r27;\n" \
+  "ld.local.u32 %r28,[%frame];\n" \
+  "mov.u32 %r29,1;\n"							\
+  "setp.eq.u32 %r30,%r28,%r29;\n"					\
+  "@%r30 bra $L4;\n"							\
+  "mov.u32 %r31,2;\n"							\
+  "setp.eq.u32 %r32,%r28,%r31;\n"					\
+  "@%r32 bra $L5;\n"							\
+  "mov.u32 %r33,0;\n"							\
+  "setp.eq.u32 %r34,%r28,%r33;\n"					\
+  "@!%r34 bra $L8;\n"							\
+  "mov.u32 %r23,%tid.x;\n"						\
+  "mov.u32 %r22,%r23;\n"						\
+  "bra $L7;\n"								\
+  "$L4:\n"								\
+  "mov.u32 %r24,%tid.y;\n"						\
+  "mov.u32 %r22,%r24;\n"						\
+  "bra $L7;\n"								\
+  "$L5:\n"								\
+  "mov.u32 %r25,%tid.z;\n"						\
+  "mov.u32 %r22,%r25;\n"						\
+  "bra $L7;\n"								\
+  "$L8:\n"								\
+  "{\n"									\
+  "{\n"									\
+  "call abort;\n"							\
+  "}\n"									\
+  "}\n"									\
+  "$L7:\n"								\
+  "mov.u32 %r26,%r22;\n"						\
+  "mov.u32 %retval,%r26;\n"						\
+  "st.param.u32 [%out_retval],%retval;\n"				\
+  "ret;\n"								\
+  "}\n"									\
+  ".visible .func (.param .u32 %out_retval) GOACC_ntid (.param .u32 %in_ar1)\n" \
+  "{\n"									\
+  ".reg .u32 %ar1;\n"							\
+  ".reg .u32 %retval;\n"						\
+  ".reg .u64 %hr10;\n"							\
+  ".reg .u32 %r22;\n"							\
+  ".reg .u32 %r23;\n"							\
+  ".reg .u32 %r24;\n"							\
+  ".reg .u32 %r25;\n"							\
+  ".reg .u32 %r26;\n"							\
+  ".reg .u32 %r27;\n"							\
+  ".reg .u32 %r28;\n"							\
+  ".reg .u32 %r29;\n"							\
+  ".reg .pred %r30;\n"							\
+  ".reg .u32 %r31;\n"							\
+  ".reg .pred %r32;\n"							\
+  ".reg .u32 %r33;\n"							\
+  ".reg .pred %r34;\n"							\
+  ".local .align 8 .b8 %frame[4];\n"					\
+  "ld.param.u32 %ar1,[%in_ar1];\n"					\
+  "mov.u32 %r27,%ar1;\n"						\
+  "st.local.u32 [%frame],%r27;\n"					\
+  "ld.local.u32 %r28,[%frame];\n"					\
+  "mov.u32 %r29,1;\n"							\
+  "setp.eq.u32 %r30,%r28,%r29;\n"					\
+  "@%r30 bra $L11;\n"							\
+  "mov.u32 %r31,2;\n"							\
+  "setp.eq.u32 %r32,%r28,%r31;\n"					\
+  "@%r32 bra $L12;\n"							\
+  "mov.u32 %r33,0;\n"							\
+  "setp.eq.u32 %r34,%r28,%r33;\n"					\
+  "@!%r34 bra $L15;\n"							\
+  "mov.u32 %r23,%ntid.x;\n"						\
+  "mov.u32 %r22,%r23;\n"						\
+  "bra $L14;\n"								\
+  "$L11:\n"								\
+  "mov.u32 %r24,%ntid.y;\n"						\
+  "mov.u32 %r22,%r24;\n"						\
+  "bra $L14;\n"								\
+  "$L12:\n"								\
+  "mov.u32 %r25,%ntid.z;\n"						\
+  "mov.u32 %r22,%r25;\n"						\
+  "bra $L14;\n"								\
+  "$L15:\n"								\
+  "{\n"									\
+  "{\n"									\
+  "call abort;\n"							\
+  "}\n"									\
+  "}\n"									\
+  "$L14:\n"								\
+  "mov.u32 %r26,%r22;\n"						\
+  "mov.u32 %retval,%r26;\n"						\
+  "st.param.u32 [%out_retval],%retval;\n"				\
+  "ret;\n"								\
+  "}\n"									\
+  ".visible .func (.param .u32 %out_retval) GOACC_ctaid (.param .u32 %in_ar1)\n" \
+  "{\n"									\
+  ".reg .u32 %ar1;\n"							\
+  ".reg .u32 %retval;\n"						\
+  ".reg .u64 %hr10;\n"							\
+  ".reg .u32 %r22;\n"							\
+  ".reg .u32 %r23;\n"							\
+  ".reg .u32 %r24;\n"							\
+  ".reg .u32 %r25;\n"							\
+  ".reg .u32 %r26;\n"							\
+  ".reg .u32 %r27;\n"							\
+  ".reg .u32 %r28;\n"							\
+  ".reg .u32 %r29;\n"							\
+  ".reg .pred %r30;\n"							\
+  ".reg .u32 %r31;\n"							\
+  ".reg .pred %r32;\n"							\
+  ".reg .u32 %r33;\n"							\
+  ".reg .pred %r34;\n"							\
+  ".local .align 8 .b8 %frame[4];\n"					\
+  "ld.param.u32 %ar1,[%in_ar1];\n"					\
+  "mov.u32 %r27,%ar1;\n"						\
+  "st.local.u32 [%frame],%r27;\n"					\
+  "ld.local.u32 %r28,[%frame];\n"					\
+  "mov.u32 %r29,1;\n"							\
+  "setp.eq.u32 %r30,%r28,%r29;\n"					\
+  "@%r30 bra $L18;\n"							\
+  "mov.u32 %r31,2;\n"							\
+  "setp.eq.u32 %r32,%r28,%r31;\n"					\
+  "@%r32 bra $L19;\n"							\
+  "mov.u32 %r33,0;\n"							\
+  "setp.eq.u32 %r34,%r28,%r33;\n"					\
+  "@!%r34 bra $L22;\n"							\
+  "mov.u32 %r23,%ctaid.x;\n"						\
+  "mov.u32 %r22,%r23;\n"						\
+  "bra $L21;\n"								\
+  "$L18:\n"								\
+  "mov.u32 %r24,%ctaid.y;\n"						\
+  "mov.u32 %r22,%r24;\n"						\
+  "bra $L21;\n"								\
+  "$L19:\n"								\
+  "mov.u32 %r25,%ctaid.z;\n"						\
+  "mov.u32 %r22,%r25;\n"						\
+  "bra $L21;\n"								\
+  "$L22:\n"								\
+  "{\n"									\
+  "{\n"									\
+  "call abort;\n"							\
+  "}\n"									\
+  "}\n"									\
+  "$L21:\n"								\
+  "mov.u32 %r26,%r22;\n"						\
+  "mov.u32 %retval,%r26;\n"						\
+  "st.param.u32 [%out_retval],%retval;\n"				\
+  "ret;\n"								\
+  "}\n"									\
+  ".visible .func (.param .u32 %out_retval) GOACC_nctaid (.param .u32 %in_ar1)\n" \
+  "{\n"									\
+  ".reg .u32 %ar1;\n"							\
+  ".reg .u32 %retval;\n"						\
+  ".reg .u64 %hr10;\n"							\
+  ".reg .u32 %r22;\n"							\
+  ".reg .u32 %r23;\n"							\
+  ".reg .u32 %r24;\n"							\
+  ".reg .u32 %r25;\n"							\
+  ".reg .u32 %r26;\n"							\
+  ".reg .u32 %r27;\n"							\
+  ".reg .u32 %r28;\n"							\
+  ".reg .u32 %r29;\n"							\
+  ".reg .pred %r30;\n"							\
+  ".reg .u32 %r31;\n"							\
+  ".reg .pred %r32;\n"							\
+  ".reg .u32 %r33;\n"							\
+  ".reg .pred %r34;\n"							\
+  ".local .align 8 .b8 %frame[4];\n"					\
+  "ld.param.u32 %ar1,[%in_ar1];\n"					\
+  "mov.u32 %r27,%ar1;\n"						\
+  "st.local.u32 [%frame],%r27;\n"					\
+  "ld.local.u32 %r28,[%frame];\n"					\
+  "mov.u32 %r29,1;\n"							\
+  "setp.eq.u32 %r30,%r28,%r29;\n"					\
+  "@%r30 bra $L25;\n"							\
+  "mov.u32 %r31,2;\n"							\
+  "setp.eq.u32 %r32,%r28,%r31;\n"					\
+  "@%r32 bra $L26;\n"							\
+  "mov.u32 %r33,0;\n"							\
+  "setp.eq.u32 %r34,%r28,%r33;\n"					\
+  "@!%r34 bra $L29;\n"							\
+  "mov.u32 %r23,%nctaid.x;\n"						\
+  "mov.u32 %r22,%r23;\n"						\
+  "bra $L28;\n"								\
+  "$L25:\n"								\
+  "mov.u32 %r24,%nctaid.y;\n"						\
+  "mov.u32 %r22,%r24;\n"						\
+  "bra $L28;\n"								\
+  "$L26:\n"								\
+  "mov.u32 %r25,%nctaid.z;\n"						\
+  "mov.u32 %r22,%r25;\n"						\
+  "bra $L28;\n"								\
+  "$L29:\n"								\
+  "{\n"									\
+  "{\n"									\
+  "call abort;\n"							\
+  "}\n"									\
+  "}\n"									\
+  "$L28:\n"								\
+  "mov.u32 %r26,%r22;\n"						\
+  "mov.u32 %retval,%r26;\n"						\
+  "st.param.u32 [%out_retval],%retval;\n"				\
+  "ret;\n"								\
+  "}\n"									\
+  ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads\n"	\
+  "{\n"									\
+  ".reg .u32 %retval;\n"						\
+  ".reg .u64 %hr10;\n"							\
+  ".reg .u32 %r22;\n"							\
+  ".reg .u32 %r23;\n"							\
+  ".reg .u32 %r24;\n"							\
+  ".reg .u32 %r25;\n"							\
+  ".reg .u32 %r26;\n"							\
+  ".reg .u32 %r27;\n"							\
+  ".reg .u32 %r28;\n"							\
+  ".reg .u32 %r29;\n"							\
+  "mov.u32 %r26,0;\n"							\
+  "{\n"									\
+  ".param .u32 %retval_in;\n"						\
+  "{\n"									\
+  ".param .u32 %out_arg0;\n"						\
+  "st.param.u32 [%out_arg0],%r26;\n"					\
+  "call (%retval_in),GOACC_ntid,(%out_arg0);\n"				\
+  "}\n"									\
+  "ld.param.u32 %r27,[%retval_in];\n"					\
+  "}\n"									\
+  "mov.u32 %r22,%r27;\n"						\
+  "mov.u32 %r28,0;\n"							\
+  "{\n"									\
+  ".param .u32 %retval_in;\n"						\
+  "{\n"									\
+  ".param .u32 %out_arg0;\n"						\
+  "st.param.u32 [%out_arg0],%r28;\n"					\
+  "call (%retval_in),GOACC_nctaid,(%out_arg0);\n"			\
+  "}\n"									\
+  "ld.param.u32 %r29,[%retval_in];\n"					\
+  "}\n"									\
+  "mov.u32 %r23,%r29;\n"						\
+  "mul.lo.u32 %r24,%r22,%r23;\n"					\
+  "mov.u32 %r25,%r24;\n"						\
+  "mov.u32 %retval,%r25;\n"						\
+  "st.param.u32 [%out_retval],%retval;\n"				\
+  "ret;\n"								\
+  "}\n"									\
+  ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num\n"	\
+  "{\n"									\
+  ".reg .u32 %retval;\n"						\
+  ".reg .u64 %hr10;\n"							\
+  ".reg .u32 %r22;\n"							\
+  ".reg .u32 %r23;\n"							\
+  ".reg .u32 %r24;\n"							\
+  ".reg .u32 %r25;\n"							\
+  ".reg .u32 %r26;\n"							\
+  ".reg .u32 %r27;\n"							\
+  ".reg .u32 %r28;\n"							\
+  ".reg .u32 %r29;\n"							\
+  ".reg .u32 %r30;\n"							\
+  ".reg .u32 %r31;\n"							\
+  ".reg .u32 %r32;\n"							\
+  ".reg .u32 %r33;\n"							\
+  "mov.u32 %r28,0;\n"							\
+  "{\n"									\
+  ".param .u32 %retval_in;\n"						\
+  "{\n"									\
+  ".param .u32 %out_arg0;\n"						\
+  "st.param.u32 [%out_arg0],%r28;\n"					\
+  "call (%retval_in),GOACC_ntid,(%out_arg0);\n"				\
+  "}\n"									\
+  "ld.param.u32 %r29,[%retval_in];\n"					\
+  "}\n"									\
+  "mov.u32 %r22,%r29;\n"						\
+  "mov.u32 %r30,0;\n"							\
+  "{\n"									\
+  ".param .u32 %retval_in;\n"						\
+  "{\n"									\
+  ".param .u32 %out_arg0;\n"						\
+  "st.param.u32 [%out_arg0],%r30;\n"					\
+  "call (%retval_in),GOACC_ctaid,(%out_arg0);\n"			\
+  "}\n"									\
+  "ld.param.u32 %r31,[%retval_in];\n"					\
+  "}\n"									\
+  "mov.u32 %r23,%r31;\n"						\
+  "mul.lo.u32 %r24,%r22,%r23;\n"					\
+  "mov.u32 %r32,0;\n"							\
+  "{\n"									\
+  ".param .u32 %retval_in;\n"						\
+  "{\n"									\
+  ".param .u32 %out_arg0;\n"						\
+  "st.param.u32 [%out_arg0],%r32;\n"					\
+  "call (%retval_in),GOACC_tid,(%out_arg0);\n"				\
+  "}\n"									\
+  "ld.param.u32 %r33,[%retval_in];\n"					\
+  "}\n"									\
+  "mov.u32 %r25,%r33;\n"						\
+  "add.u32 %r26,%r24,%r25;\n"						\
+  "mov.u32 %r27,%r26;\n"						\
+  "mov.u32 %retval,%r27;\n"						\
+  "st.param.u32 [%out_retval],%retval;\n"				\
+  "ret;\n"								\
+  "}\n"
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 3d1b81b..7fedd2d 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -35,6 +35,7 @@ 
 #include "libgomp.h"
 #include "libgomp_target.h"
 #include "libgomp-plugin.h"
+#include "oacc-ptx.h"
 #include "oacc-plugin.h"
 
 #include <cuda.h>
@@ -722,78 +723,6 @@  PTX_get_num_devices (void)
   return n;
 }
 
-#define ABORT_PTX				\
-  ".version 3.1\n"				\
-  ".target sm_30\n"				\
-  ".address_size 64\n"				\
-  ".visible .func abort;\n"			\
-  ".visible .func abort\n"			\
-  "{\n"						\
-  "trap;\n"					\
-  "ret;\n"					\
-  "}\n"						\
-  ".visible .func _gfortran_abort;\n"		\
-  ".visible .func _gfortran_abort\n"		\
-  "{\n"						\
-  "trap;\n"					\
-  "ret;\n"					\
-  "}\n" \
-
-/* Generated with:
-
-   $ echo 'int acc_on_device(int d) { return __builtin_acc_on_device(d); } int acc_on_device_(int *d) { return acc_on_device(*d); }' | accel-gcc/xgcc -Baccel-gcc -x c - -o - -S -m64 -O3 -fno-builtin-acc_on_device -fno-inline
-*/
-#define ACC_ON_DEVICE_PTX						\
-  "        .version        3.1\n"					\
-  "        .target sm_30\n"						\
-  "        .address_size 64\n"						\
-  ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1);\n" \
-  ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1)\n" \
-  "{\n"									\
-  "        .reg.u32 %ar1;\n"						\
-  ".reg.u32 %retval;\n"							\
-  "        .reg.u64 %hr10;\n"						\
-  "        .reg.u32 %r24;\n"						\
-  "        .reg.u32 %r25;\n"						\
-  "        .reg.pred %r27;\n"						\
-  "        .reg.u32 %r30;\n"						\
-  "        ld.param.u32 %ar1, [%in_ar1];\n"				\
-  "                mov.u32 %r24, %ar1;\n"				\
-  "                setp.ne.u32 %r27,%r24,4;\n"				\
-  "                set.u32.eq.u32 %r30,%r24,5;\n"			\
-  "                neg.s32 %r25, %r30;\n"				\
-  "        @%r27   bra     $L3;\n"					\
-  "                mov.u32 %r25, 1;\n"					\
-  "$L3:\n"								\
-  "                mov.u32 %retval, %r25;\n"				\
-  "        st.param.u32    [%out_retval], %retval;\n"			\
-  "        ret;\n"							\
-  "        }\n"								\
-  ".visible .func (.param.u32 %out_retval)acc_on_device_(.param.u64 %in_ar1);\n" \
-  ".visible .func (.param.u32 %out_retval)acc_on_device_(.param.u64 %in_ar1)\n" \
-  "{\n"									\
-  "        .reg.u64 %ar1;\n"						\
-  ".reg.u32 %retval;\n"							\
-  "        .reg.u64 %hr10;\n"						\
-  "        .reg.u64 %r25;\n"						\
-  "        .reg.u32 %r26;\n"						\
-  "        .reg.u32 %r27;\n"						\
-  "        ld.param.u64 %ar1, [%in_ar1];\n"				\
-  "                mov.u64 %r25, %ar1;\n"				\
-  "                ld.u32  %r26, [%r25];\n"				\
-  "        {\n"								\
-  "                .param.u32 %retval_in;\n"				\
-  "        {\n"								\
-  "                .param.u32 %out_arg0;\n"				\
-  "                st.param.u32 [%out_arg0], %r26;\n"			\
-  "                call (%retval_in), acc_on_device, (%out_arg0);\n"	\
-  "        }\n"								\
-  "                ld.param.u32    %r27, [%retval_in];\n"		\
-  "}\n"									\
-  "                mov.u32 %retval, %r27;\n"				\
-  "        st.param.u32    [%out_retval], %retval;\n"			\
-  "        ret;\n"							\
-  "        }"
 
 static void
 link_ptx (CUmodule *module, char *ptx_code)
@@ -856,6 +785,16 @@  link_ptx (CUmodule *module, char *ptx_code)
 			 cuda_error (r));
     }
 
+  char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
+  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
+		     strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
+      GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
+			 cuda_error (r));
+    }
+
   r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code,
               strlen (ptx_code) + 1, 0, 0, 0, 0);
   if (r != CUDA_SUCCESS)
@@ -1043,7 +982,7 @@  PTX_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 
   kargs[0] = &dp;
   r = cuLaunchKernel (function,
-			1, 1, 1,
+			num_gangs, 1, 1,
 			nthreads_in_block, 1, 1,
 			0, dev_str->stream, kargs, 0);
   if (r != CUDA_SUCCESS)
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reduction-6.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reduction-6.f90
new file mode 100644
index 0000000..6325431
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/reduction-6.f90
@@ -0,0 +1,30 @@ 
+! { dg-do run }
+
+program reduction
+  implicit none
+
+  integer, parameter    :: n = 100                 ! loop trip count
+  integer               :: i, s1, s2, vs1, vs2     ! s*: reduction sums, vs*: reference sums
+
+  s1 = 0
+  s2 = 0
+  vs1 = 0
+  vs2 = 0
+  ! Accumulate s1 and s2 in an OpenACC loop carrying a '+' reduction on both.
+  !$acc parallel vector_length (1000)
+  !$acc loop reduction(+:s1, s2)
+  do i = 1, n
+     s1 = s1 + 1
+     s2 = s2 + 2
+  end do
+  !$acc end parallel
+
+  ! Verify the results: recompute the same sums in a plain loop.
+  do i = 1, n
+     vs1 = vs1 + 1
+     vs2 = vs2 + 2
+  end do
+  ! Abort if either reduction result differs from its reference sum.
+  if (s1.ne.vs1) call abort ()
+  if (s2.ne.vs2) call abort ()
+end program reduction