diff mbox series

[committed] Update OpenACC testcases

Message ID 87po0jxhfx.fsf@euler.schwinge.homeip.net
State New
Headers show
Series [committed] Update OpenACC testcases | expand

Commit Message

Thomas Schwinge June 22, 2018, 10:07 a.m. UTC
Hi!

On our development branch(es) we had accumulated a bunch of testcases
(updates) that should have been part of earlier patch submissions, or
were not yet pushed for unknown reasons.  ... until now; in r261884, I
just committed the following to trunk:

commit e342f300e74ee68bc48ccfdb6ee202da6ca99e9e
Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4>
Date:   Fri Jun 22 10:04:14 2018 +0000

    Update OpenACC testcases
    
            gcc/testsuite/
            * c-c++-common/goacc/deviceptr-4.c: New file.
            * c-c++-common/goacc/kernels-counter-var-redundant-load.c:
            Likewise.
            * c-c++-common/goacc/kernels-loop-data-2.c: Likewise.
            * c-c++-common/goacc/kernels-loop-data-enter-exit-2.c: Likewise.
            * c-c++-common/goacc/kernels-loop-data-enter-exit.c: Likewise.
            * c-c++-common/goacc/kernels-loop-data-update.c: Likewise.
            * c-c++-common/goacc/kernels-loop-data.c: Likewise.
            * c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c:
            Likewise.
            * c-c++-common/goacc/parallel-reduction.c: Likewise.
            * c-c++-common/goacc/private-reduction-1.c: Likewise.
            * gfortran.dg/goacc/kernels-parallel-loop-data-enter-exit.f95:
            Likewise.
            * gfortran.dg/goacc/modules.f95: Likewise.
            * gfortran.dg/goacc/routine-8.f90: Likewise.
            * gfortran.dg/goacc/routine-level-of-parallelism-1.f90: Likewise.
            libgomp/
            * testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c: Don't force "-O2".
            * testsuite/libgomp.oacc-c-c++-common/data-2.c: Update.
            * testsuite/libgomp.oacc-c-c++-common/host_data-1.c: Likewise.
            * testsuite/libgomp.oacc-c-c++-common/mode-transitions.c: Likewise.
            * testsuite/libgomp.oacc-fortran/data-1.f90: Likewise.
            * testsuite/libgomp.oacc-fortran/data-2.f90: Likewise.
            * testsuite/libgomp.oacc-c++/non-scalar-data.C: New file.
            * testsuite/libgomp.oacc-c-c++-common/declare-3.c: Likewise.
            * testsuite/libgomp.oacc-c-c++-common/enter-data.c: Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-2.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit-2.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-update.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-loop-data.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-parallel-loop-data-enter-exit.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-1.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-2.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-3.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-4.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-5.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-1.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-2.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-3.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-4.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-5.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-6.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-vector-1.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-vector-2.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-1.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-2.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-3.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-4.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-5.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-6.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-7.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/kernels-reduction-1.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/parallel-loop-1.c: Likewise.
            * testsuite/libgomp.oacc-c-c++-common/parallel-loop-1.h: Likewise.
            * testsuite/libgomp.oacc-c-c++-common/parallel-loop-2.h: Likewise.
            * testsuite/libgomp.oacc-fortran/cublas-fixed.h: Likewise.
            * testsuite/libgomp.oacc-fortran/dummy-array.f90: Likewise.
            * testsuite/libgomp.oacc-fortran/host_data-2.f90: Likewise.
            * testsuite/libgomp.oacc-fortran/host_data-3.f: Likewise.
            * testsuite/libgomp.oacc-fortran/host_data-4.f90: Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-acc-loop-reduction-2.f90:
            Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-acc-loop-reduction.f90:
            Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-collapse-3.f90: Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-collapse-4.f90: Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-independent.f90:
            Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-loop-1.f90: Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-map-1.f90: Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-parallel-loop-data-enter-exit.f95:
            Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-1.f90:
            Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-2.f90:
            Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-3.f90:
            Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-6.f90:
            Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-vector-1.f90:
            Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-vector-2.f90:
            Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-1.f90:
            Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-2.f90:
            Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-3.f90:
            Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-4.f90:
            Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-5.f90:
            Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-6.f90:
            Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-7.f90:
            Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-reduction-1.f90:
            Likewise.
            * testsuite/libgomp.oacc-fortran/lib-12.f90: Likewise.
            * testsuite/libgomp.oacc-fortran/lib-13.f90: Likewise.
            * testsuite/libgomp.oacc-fortran/lib-14.f90: Likewise.
            * testsuite/libgomp.oacc-fortran/lib-15.f90: Likewise.
            * testsuite/libgomp.oacc-fortran/parallel-loop-1.f90: Likewise.
            * testsuite/libgomp.oacc-fortran/reference-reductions.f90: Likewise.
            * testsuite/libgomp.oacc-fortran/vector-routine.f90: Likewise.
    
    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@261884 138bc75d-0d04-0410-961f-82ee72b054a4
---
 gcc/testsuite/ChangeLog                            |  23 ++
 gcc/testsuite/c-c++-common/goacc/deviceptr-4.c     |  11 +
 .../goacc/kernels-counter-var-redundant-load.c     |  34 +++
 .../c-c++-common/goacc/kernels-loop-data-2.c       |  68 +++++
 .../goacc/kernels-loop-data-enter-exit-2.c         |  66 +++++
 .../goacc/kernels-loop-data-enter-exit.c           |  63 +++++
 .../c-c++-common/goacc/kernels-loop-data-update.c  |  63 +++++
 .../c-c++-common/goacc/kernels-loop-data.c         |  62 +++++
 .../goacc/kernels-parallel-loop-data-enter-exit.c  |  66 +++++
 .../c-c++-common/goacc/parallel-reduction.c        |  17 ++
 .../c-c++-common/goacc/private-reduction-1.c       |  12 +
 .../kernels-parallel-loop-data-enter-exit.f95      |  48 ++++
 gcc/testsuite/gfortran.dg/goacc/modules.f95        |  55 ++++
 gcc/testsuite/gfortran.dg/goacc/routine-8.f90      |  32 +++
 .../goacc/routine-level-of-parallelism-1.f90       |  72 ++++++
 libgomp/ChangeLog                                  | 125 +++++++++
 .../testsuite/libgomp.oacc-c++/non-scalar-data.C   | 110 ++++++++
 .../testsuite/libgomp.oacc-c-c++-common/data-2.c   |  27 ++
 .../libgomp.oacc-c-c++-common/declare-3.c          |  61 +++++
 .../libgomp.oacc-c-c++-common/enter-data.c         |  23 ++
 .../libgomp.oacc-c-c++-common/host_data-1.c        |  60 ++---
 .../kernels-loop-data-2.c                          |  53 ++++
 .../kernels-loop-data-enter-exit-2.c               |  51 ++++
 .../kernels-loop-data-enter-exit.c                 |  48 ++++
 .../kernels-loop-data-update.c                     |  50 ++++
 .../libgomp.oacc-c-c++-common/kernels-loop-data.c  |  47 ++++
 .../kernels-parallel-loop-data-enter-exit.c        |  49 ++++
 .../kernels-private-vars-local-worker-1.c          |  54 ++++
 .../kernels-private-vars-local-worker-2.c          |  49 ++++
 .../kernels-private-vars-local-worker-3.c          |  55 ++++
 .../kernels-private-vars-local-worker-4.c          |  58 +++++
 .../kernels-private-vars-local-worker-5.c          |  51 ++++
 .../kernels-private-vars-loop-gang-1.c             |  27 ++
 .../kernels-private-vars-loop-gang-2.c             |  31 +++
 .../kernels-private-vars-loop-gang-3.c             |  31 +++
 .../kernels-private-vars-loop-gang-4.c             |  35 +++
 .../kernels-private-vars-loop-gang-5.c             |  32 +++
 .../kernels-private-vars-loop-gang-6.c             |  40 +++
 .../kernels-private-vars-loop-vector-1.c           |  51 ++++
 .../kernels-private-vars-loop-vector-2.c           |  46 ++++
 .../kernels-private-vars-loop-worker-1.c           |  36 +++
 .../kernels-private-vars-loop-worker-2.c           |  43 ++++
 .../kernels-private-vars-loop-worker-3.c           |  54 ++++
 .../kernels-private-vars-loop-worker-4.c           |  49 ++++
 .../kernels-private-vars-loop-worker-5.c           |  51 ++++
 .../kernels-private-vars-loop-worker-6.c           |  55 ++++
 .../kernels-private-vars-loop-worker-7.c           |  54 ++++
 .../kernels-reduction-1.c                          |  24 ++
 .../libgomp.oacc-c-c++-common/loop-red-wv-1.c      |   3 -
 .../libgomp.oacc-c-c++-common/mode-transitions.c   | 270 ++++++++++++++++++++
 .../libgomp.oacc-c-c++-common/parallel-loop-1.c    |  38 +++
 .../libgomp.oacc-c-c++-common/parallel-loop-1.h    |  20 ++
 .../libgomp.oacc-c-c++-common/parallel-loop-2.h    | 280 +++++++++++++++++++++
 .../testsuite/libgomp.oacc-fortran/cublas-fixed.h  |  16 ++
 libgomp/testsuite/libgomp.oacc-fortran/data-1.f90  | 231 ++++++++++++++---
 libgomp/testsuite/libgomp.oacc-fortran/data-2.f90  |  50 ++++
 .../testsuite/libgomp.oacc-fortran/dummy-array.f90 |  28 +++
 .../testsuite/libgomp.oacc-fortran/host_data-2.f90 |  98 ++++++++
 .../testsuite/libgomp.oacc-fortran/host_data-3.f   |  85 +++++++
 .../testsuite/libgomp.oacc-fortran/host_data-4.f90 | 101 ++++++++
 .../kernels-acc-loop-reduction-2.f90               |  26 ++
 .../kernels-acc-loop-reduction.f90                 |  21 ++
 .../libgomp.oacc-fortran/kernels-collapse-3.f90    |  30 +++
 .../libgomp.oacc-fortran/kernels-collapse-4.f90    |  41 +++
 .../libgomp.oacc-fortran/kernels-independent.f90   |  42 ++++
 .../libgomp.oacc-fortran/kernels-loop-1.f90        |  66 +++++
 .../libgomp.oacc-fortran/kernels-map-1.f90         | 116 +++++++++
 .../kernels-parallel-loop-data-enter-exit.f95      |  36 +++
 .../kernels-private-vars-loop-gang-1.f90           |  23 ++
 .../kernels-private-vars-loop-gang-2.f90           |  28 +++
 .../kernels-private-vars-loop-gang-3.f90           |  28 +++
 .../kernels-private-vars-loop-gang-6.f90           |  36 +++
 .../kernels-private-vars-loop-vector-1.f90         |  41 +++
 .../kernels-private-vars-loop-vector-2.f90         |  38 +++
 .../kernels-private-vars-loop-worker-1.f90         |  27 ++
 .../kernels-private-vars-loop-worker-2.f90         |  36 +++
 .../kernels-private-vars-loop-worker-3.f90         |  48 ++++
 .../kernels-private-vars-loop-worker-4.f90         |  45 ++++
 .../kernels-private-vars-loop-worker-5.f90         |  48 ++++
 .../kernels-private-vars-loop-worker-6.f90         |  49 ++++
 .../kernels-private-vars-loop-worker-7.f90         |  44 ++++
 .../libgomp.oacc-fortran/kernels-reduction-1.f90   |  19 ++
 libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90  |  27 ++
 libgomp/testsuite/libgomp.oacc-fortran/lib-13.f90  |  34 +++
 libgomp/testsuite/libgomp.oacc-fortran/lib-14.f90  |  82 ++++++
 libgomp/testsuite/libgomp.oacc-fortran/lib-15.f90  |  52 ++++
 .../libgomp.oacc-fortran/parallel-loop-1.f90       |  77 ++++++
 .../libgomp.oacc-fortran/reference-reductions.f90  |  38 +++
 .../libgomp.oacc-fortran/vector-routine.f90        |  41 +++
 89 files changed, 4718 insertions(+), 63 deletions(-)



Grüße
 Thomas

Comments

Rainer Orth June 25, 2018, 10:11 a.m. UTC | #1
Hi Thomas,

> On our development branch(es) we had accumulated a bunch of testcases
> (updates) that should have been part of earlier patch submissions, or
> were not yet pushed for unknown reasons.  ... until now; in r261884, I
> just committed the following to trunk:
>
> commit e342f300e74ee68bc48ccfdb6ee202da6ca99e9e
> Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4>
> Date:   Fri Jun 22 10:04:14 2018 +0000
>
>     Update OpenACC testcases
[...]
>             libgomp/
[...]
>             * testsuite/libgomp.oacc-c++/non-scalar-data.C: New file.

this test ...

> diff --git libgomp/testsuite/libgomp.oacc-c++/non-scalar-data.C libgomp/testsuite/libgomp.oacc-c++/non-scalar-data.C
> new file mode 100644
> index 0000000..8e4b296
> --- /dev/null
> +++ libgomp/testsuite/libgomp.oacc-c++/non-scalar-data.C
> @@ -0,0 +1,110 @@
> +// Ensure that non-scalar dummy arguments which are implicitly used inside
> +// offloaded regions are properly mapped using present_or_copy semantics.
> +
> +// { dg-xfail-if "TODO" { *-*-* } }
> +// { dg-excess-errors "ICE" }

comes up as UNRESOLVED everywhere:

UNRESOLVED: libgomp.oacc-c++/non-scalar-data.C -DACC_DEVICE_TYPE_host=1 -DACC_MEM_SHARED=1  -O2  compilation failed to produce executable

Unless you plan to fix the ICE soon, please either remove the test or
dg-skip-if it to avoid unnecessary testsuite noise.

Thanks.
        Rainer
diff mbox series

Patch

diff --git gcc/testsuite/ChangeLog gcc/testsuite/ChangeLog
index bb7aa60..655a440 100644
--- gcc/testsuite/ChangeLog
+++ gcc/testsuite/ChangeLog
@@ -1,3 +1,26 @@ 
+2018-06-22  Cesar Philippidis  <cesar@codesourcery.com>
+	    James Norris  <jnorris@codesourcery.com>
+	    Thomas Schwinge  <thomas@codesourcery.com>
+	    Tom de Vries  <tom@codesourcery.com>
+
+	* c-c++-common/goacc/deviceptr-4.c: New file.
+	* c-c++-common/goacc/kernels-counter-var-redundant-load.c:
+	Likewise.
+	* c-c++-common/goacc/kernels-loop-data-2.c: Likewise.
+	* c-c++-common/goacc/kernels-loop-data-enter-exit-2.c: Likewise.
+	* c-c++-common/goacc/kernels-loop-data-enter-exit.c: Likewise.
+	* c-c++-common/goacc/kernels-loop-data-update.c: Likewise.
+	* c-c++-common/goacc/kernels-loop-data.c: Likewise.
+	* c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c:
+	Likewise.
+	* c-c++-common/goacc/parallel-reduction.c: Likewise.
+	* c-c++-common/goacc/private-reduction-1.c: Likewise.
+	* gfortran.dg/goacc/kernels-parallel-loop-data-enter-exit.f95:
+	Likewise.
+	* gfortran.dg/goacc/modules.f95: Likewise.
+	* gfortran.dg/goacc/routine-8.f90: Likewise.
+	* gfortran.dg/goacc/routine-level-of-parallelism-1.f90: Likewise.
+
 2018-06-21  Michael Meissner  <meissner@linux.ibm.com>
 
 	* gcc.target/powerpc/pack02.c: Use __ibm128 instead of long double
diff --git gcc/testsuite/c-c++-common/goacc/deviceptr-4.c gcc/testsuite/c-c++-common/goacc/deviceptr-4.c
new file mode 100644
index 0000000..db1b916
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/deviceptr-4.c
@@ -0,0 +1,11 @@ 
+/* { dg-additional-options "-fdump-tree-gimple" } */
+
+void
+subr (int *a)
+{
+#pragma acc data deviceptr (a)
+#pragma acc parallel
+  a[0] += 1.0;
+}
+
+/* { dg-final { scan-tree-dump-times "#pragma omp target oacc_parallel.*map\\(tofrom:a" 1 "gimple" } } */
diff --git gcc/testsuite/c-c++-common/goacc/kernels-counter-var-redundant-load.c gcc/testsuite/c-c++-common/goacc/kernels-counter-var-redundant-load.c
new file mode 100644
index 0000000..0304254
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/kernels-counter-var-redundant-load.c
@@ -0,0 +1,34 @@ 
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-dom3" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+COUNTERTYPE
+foo (unsigned int *c)
+{
+  COUNTERTYPE ii;
+
+#pragma acc kernels copyout (c[0:N])
+  {
+    for (ii = 0; ii < N; ii++)
+      c[ii] = 1;
+  }
+
+  return ii;
+}
+
+/* We're expecting:
+
+   .omp_data_i_10 = &.omp_data_arr.3;
+   _11 = .omp_data_i_10->ii;
+   *_11 = 0;
+   _15 = .omp_data_i_10->c;
+   c.1_16 = *_15;
+
+   Check that there's only one load from an anonymous ssa-name (which we assume
+   to be the one to read c), and that there's no such load for ii.  */
+
+/* { dg-final { scan-tree-dump-times "(?n)\\*_\[0-9\]\[0-9\]*;$" 1 "dom3" } } */
diff --git gcc/testsuite/c-c++-common/goacc/kernels-loop-data-2.c gcc/testsuite/c-c++-common/goacc/kernels-loop-data-2.c
new file mode 100644
index 0000000..7180021
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/kernels-loop-data-2.c
@@ -0,0 +1,68 @@ 
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc data copyout (a[0:N])
+  {
+#pragma acc kernels present (a[0:N])
+    {
+      for (COUNTERTYPE i = 0; i < N; i++)
+	a[i] = i * 2;
+    }
+  }
+
+#pragma acc data copyout (b[0:N])
+  {
+#pragma acc kernels present (b[0:N])
+    {
+      for (COUNTERTYPE i = 0; i < N; i++)
+	b[i] = i * 4;
+    }
+  }
+
+#pragma acc data copyin (a[0:N], b[0:N]) copyout (c[0:N])
+  {
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+    {
+      for (COUNTERTYPE ii = 0; ii < N; ii++)
+	c[ii] = a[ii] + b[ii];
+    }
+  }
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
+
+/* Check that only three loops are analyzed, and that all can be
+   parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 3 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
diff --git gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit-2.c gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit-2.c
new file mode 100644
index 0000000..0c9f833
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit-2.c
@@ -0,0 +1,66 @@ 
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc enter data create (a[0:N])
+#pragma acc kernels present (a[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      a[i] = i * 2;
+  }
+#pragma acc exit data copyout (a[0:N])
+
+#pragma acc enter data create (b[0:N])
+#pragma acc kernels present (b[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      b[i] = i * 4;
+  }
+#pragma acc exit data copyout (b[0:N])
+
+
+#pragma acc enter data copyin (a[0:N], b[0:N]) create (c[0:N])
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+#pragma acc exit data copyout (c[0:N])
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
+
+/* Check that only three loops are analyzed, and that all can be
+   parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 3 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
diff --git gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit.c gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit.c
new file mode 100644
index 0000000..0bd21b6
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit.c
@@ -0,0 +1,63 @@ 
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc enter data create (a[0:N], b[0:N], c[0:N])
+
+#pragma acc kernels present (a[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      a[i] = i * 2;
+  }
+
+#pragma acc kernels present (b[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      b[i] = i * 4;
+  }
+
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+#pragma acc exit data copyout (a[0:N], c[0:N])
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
+
+/* Check that only three loops are analyzed, and that all can be
+   parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 3 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
diff --git gcc/testsuite/c-c++-common/goacc/kernels-loop-data-update.c gcc/testsuite/c-c++-common/goacc/kernels-loop-data-update.c
new file mode 100644
index 0000000..dd5a841
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/kernels-loop-data-update.c
@@ -0,0 +1,63 @@ 
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc enter data create (a[0:N], b[0:N], c[0:N])
+
+#pragma acc kernels present (a[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      a[i] = i * 2;
+  }
+
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      b[i] = i * 4;
+  }
+
+#pragma acc update device (b[0:N])
+
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+#pragma acc exit data copyout (a[0:N], c[0:N])
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
+
+/* Check that only two loops are analyzed, and that both can be
+   parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 2 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 2 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
diff --git gcc/testsuite/c-c++-common/goacc/kernels-loop-data.c gcc/testsuite/c-c++-common/goacc/kernels-loop-data.c
new file mode 100644
index 0000000..a658182
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/kernels-loop-data.c
@@ -0,0 +1,62 @@ 
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc data copyout (a[0:N], b[0:N], c[0:N])
+  {
+#pragma acc kernels present (a[0:N])
+    {
+      for (COUNTERTYPE i = 0; i < N; i++)
+	a[i] = i * 2;
+    }
+
+#pragma acc kernels present (b[0:N])
+    {
+      for (COUNTERTYPE i = 0; i < N; i++)
+	b[i] = i * 4;
+    }
+
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+    {
+      for (COUNTERTYPE ii = 0; ii < N; ii++)
+	c[ii] = a[ii] + b[ii];
+    }
+  }
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
+
+/* Check that only three loops are analyzed, and that all can be
+   parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 3 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
diff --git gcc/testsuite/c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c gcc/testsuite/c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c
new file mode 100644
index 0000000..81b0fee
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c
@@ -0,0 +1,66 @@ 
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc enter data create (a[0:N], b[0:N], c[0:N])
+
+#pragma acc kernels present (a[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      a[i] = i * 2;
+  }
+
+#pragma acc parallel present (b[0:N])
+  {
+#pragma acc loop
+    for (COUNTERTYPE i = 0; i < N; i++)
+      b[i] = i * 4;
+  }
+
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+#pragma acc exit data copyout (a[0:N], b[0:N], c[0:N])
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
+
+/* Check that only two loops are analyzed, and that both can be
+   parallelized.  */
+// FIXME: OpenACC kernels stopped working with the firstprivate subarray
+// changes.
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 2 "parloops1" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 2 "parloops1" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" { xfail *-*-* } } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
diff --git gcc/testsuite/c-c++-common/goacc/parallel-reduction.c gcc/testsuite/c-c++-common/goacc/parallel-reduction.c
new file mode 100644
index 0000000..d7cc947
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/parallel-reduction.c
@@ -0,0 +1,17 @@ 
+int
+main ()  /* Reduction on a parallel construct nested inside a data region.  */
+{
+  int sum = 0;    /* Named in both the copy and the reduction clause below.  */
+  int dummy = 0;  /* Only mapped by the enclosing data region; not otherwise used.  */
+
+#pragma acc data copy (dummy)
+  {
+#pragma acc parallel num_gangs (10) copy (sum) reduction (+:sum)
+    {
+      int v = 5;  /* Declared inside the parallel region.  */
+      sum += 10 + v;
+    }
+  }
+
+  return sum;
+}
diff --git gcc/testsuite/c-c++-common/goacc/private-reduction-1.c gcc/testsuite/c-c++-common/goacc/private-reduction-1.c
new file mode 100644
index 0000000..d4e3995
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/private-reduction-1.c
@@ -0,0 +1,12 @@ 
+int
+reduction ()  /* Loop that names r in both a private and a reduction clause.  */
+{
+  int i, r;  /* r is not initialized before the loop accumulates into it.  */
+
+  #pragma acc parallel
+  #pragma acc loop private (r) reduction (+:r)
+  for (i = 0; i < 100; i++)
+    r += 10;
+
+  return r;  /* NOTE(review): presumably only the clause combination matters here -- confirm.  */
+}
diff --git gcc/testsuite/gfortran.dg/goacc/kernels-parallel-loop-data-enter-exit.f95 gcc/testsuite/gfortran.dg/goacc/kernels-parallel-loop-data-enter-exit.f95
new file mode 100644
index 0000000..48c20b9
--- /dev/null
+++ gcc/testsuite/gfortran.dg/goacc/kernels-parallel-loop-data-enter-exit.f95
@@ -0,0 +1,48 @@ 
+! { dg-additional-options "-O2" }
+! { dg-additional-options "-fdump-tree-parloops1-all" }
+! { dg-additional-options "-fdump-tree-optimized" }
+
+program main
+  implicit none
+  integer, parameter         :: n = 1024
+  integer, dimension (0:n-1) :: a, b, c
+  integer                    :: i, ii
+
+  !$acc enter data create (a(0:n-1), b(0:n-1), c(0:n-1))
+
+  !$acc kernels present (a(0:n-1))
+  do i = 0, n - 1
+     a(i) = i * 2
+  end do
+  !$acc end kernels
+
+  !$acc parallel present (b(0:n-1))
+  !$acc loop
+  do i = 0, n -1
+     b(i) = i * 4
+  end do
+  !$acc end parallel
+
+  !$acc kernels present (a(0:n-1), b(0:n-1), c(0:n-1))
+  do ii = 0, n - 1
+     c(ii) = a(ii) + b(ii)
+  end do
+  !$acc end kernels
+
+  !$acc exit data copyout (a(0:n-1), b(0:n-1), c(0:n-1))
+
+  do i = 0, n - 1
+     if (c(i) .ne. a(i) + b(i)) call abort
+  end do
+
+end program main
+
+! Check that only two loops are analyzed, and that both can be parallelized.
+! { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 2 "parloops1" { xfail *-*-* } } }
+! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 2 "parloops1" { xfail *-*-* } } }
+! { dg-final { scan-tree-dump-not "FAILED:" "parloops1" { xfail *-*-* } } }
+
+! Check that each of the three offloaded regions has been split off into its own function.
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.0 " 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.1 " 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.2 " 1 "optimized" } }
diff --git gcc/testsuite/gfortran.dg/goacc/modules.f95 gcc/testsuite/gfortran.dg/goacc/modules.f95
new file mode 100644
index 0000000..19a2abe
--- /dev/null
+++ gcc/testsuite/gfortran.dg/goacc/modules.f95
@@ -0,0 +1,55 @@ 
+! { dg-do compile } 
+
+MODULE reduction_test
+
+CONTAINS
+! Accumulate arr over (x_min:x_max, y_min:y_max) into sum using an OpenACC "+" reduction.
+SUBROUTINE reduction_kernel(x_min,x_max,y_min,y_max,arr,sum)
+
+  IMPLICIT NONE
+
+  INTEGER      :: x_min,x_max,y_min,y_max
+  REAL(KIND=8), DIMENSION(x_min-2:x_max+2,y_min-2:y_max+2) :: arr
+  REAL(KIND=8) :: sum
+
+  INTEGER      :: j,k
+
+  sum=0.0
+
+!$ACC DATA PRESENT(arr) COPY(sum)
+!$ACC PARALLEL LOOP REDUCTION(+ : sum)
+  DO k=y_min,y_max
+    DO j=x_min,x_max
+      sum=sum+arr(j,k)   ! operator matches the REDUCTION(+ : sum) clause
+    ENDDO
+  ENDDO
+!$ACC END PARALLEL LOOP
+!$ACC END DATA
+
+END SUBROUTINE reduction_kernel
+
+END MODULE reduction_test
+
+program main
+    use reduction_test
+    ! Driver: maps arr in a data region and invokes the reduction kernel from reduction_test.
+    integer :: x_min,x_max,y_min,y_max
+    real(kind=8), dimension(1:10,1:10) :: arr
+    real(kind=8) :: sum
+
+    x_min = 5
+    x_max = 6
+    y_min = 5
+    y_max = 6
+
+    arr(:,:) = 1.0
+
+    sum = 1.0
+
+    !$acc data copy(arr)
+    ! Call the module procedure (not an undefined external) so its explicit interface is used.
+    call reduction_kernel(x_min,x_max,y_min,y_max,arr,sum)
+
+    !$acc end data
+
+end program
diff --git gcc/testsuite/gfortran.dg/goacc/routine-8.f90 gcc/testsuite/gfortran.dg/goacc/routine-8.f90
new file mode 100644
index 0000000..c903915
--- /dev/null
+++ gcc/testsuite/gfortran.dg/goacc/routine-8.f90
@@ -0,0 +1,32 @@ 
+! Test ACC ROUTINE inside an interface block.
+
+program main
+  interface
+     function s_1 (a)
+       integer a
+       !$acc routine
+     end function s_1
+  end interface
+
+  interface
+     function s_2 (a)
+       integer a
+       !$acc routine seq
+     end function s_2
+  end interface
+
+  interface
+     function s_3 (a)
+       integer a
+       !$acc routine (s_3) ! { dg-error "Only the ..ACC ROUTINE form without list is allowed in interface block" }
+     end function s_3
+  end interface
+
+  interface
+     function s_4 (a)
+       integer a
+         !$acc routine (s_4) seq ! { dg-error "Only the ..ACC ROUTINE form without list is allowed in interface block" }
+     end function s_4
+  end interface
+end program main
+
diff --git gcc/testsuite/gfortran.dg/goacc/routine-level-of-parallelism-1.f90 gcc/testsuite/gfortran.dg/goacc/routine-level-of-parallelism-1.f90
new file mode 100644
index 0000000..75dd1b0
--- /dev/null
+++ gcc/testsuite/gfortran.dg/goacc/routine-level-of-parallelism-1.f90
@@ -0,0 +1,72 @@ 
+! Test various aspects of clauses specifying compatible levels of
+! parallelism with the OpenACC routine directive.  The C/C++ counterpart is
+! c-c++-common/goacc/routine-level-of-parallelism-2.c.
+
+subroutine g_1
+  !$acc routine gang
+end subroutine g_1
+
+subroutine s_1_2a
+  !$acc routine
+end subroutine s_1_2a
+
+subroutine s_1_2b
+  !$acc routine seq
+end subroutine s_1_2b
+
+subroutine s_1_2c
+  !$acc routine (s_1_2c)
+end subroutine s_1_2c
+
+subroutine s_1_2d
+  !$acc routine (s_1_2d) seq
+end subroutine s_1_2d
+
+module s_2
+contains
+  subroutine s_2_1a
+    !$acc routine
+  end subroutine s_2_1a
+
+  subroutine s_2_1b
+    !$acc routine seq
+  end subroutine s_2_1b
+
+  subroutine s_2_1c
+    !$acc routine (s_2_1c)
+  end subroutine s_2_1c
+
+  subroutine s_2_1d
+    !$acc routine (s_2_1d) seq
+  end subroutine s_2_1d
+end module s_2
+
+subroutine test
+  external g_1, w_1, v_1
+  external s_1_1, s_1_2
+  ! Routine directives inside interface bodies (unnamed form, with and without seq).
+  interface
+     function s_3_1a (a)
+       integer a
+       !$acc routine
+     end function s_3_1a
+  end interface
+
+  interface
+     function s_3_1b (a)
+       integer a
+       !$acc routine seq
+     end function s_3_1b
+  end interface
+  ! Named routine directives, one per level of parallelism (gang, worker, vector).
+  !$acc routine(g_1) gang
+
+  !$acc routine(w_1) worker
+
+  !$acc routine(v_1) vector
+
+  ! Also test the implicit seq clause.
+
+  !$acc routine (s_1_1) seq
+
+end subroutine test
diff --git libgomp/ChangeLog libgomp/ChangeLog
index c4ba406..d827739 100644
--- libgomp/ChangeLog
+++ libgomp/ChangeLog
@@ -1,3 +1,128 @@ 
+2018-06-22  Cesar Philippidis  <cesar@codesourcery.com>
+	    James Norris  <jnorris@codesourcery.com>
+	    Julian Brown  <julian@codesourcery.com>
+	    Thomas Schwinge  <thomas@codesourcery.com>
+	    Tom de Vries  <tom@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c: Don't force "-O2".
+	* testsuite/libgomp.oacc-c-c++-common/data-2.c: Update.
+	* testsuite/libgomp.oacc-c-c++-common/host_data-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/mode-transitions.c: Likewise.
+	* testsuite/libgomp.oacc-fortran/data-1.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/data-2.f90: Likewise.
+	* testsuite/libgomp.oacc-c++/non-scalar-data.C: New file.
+	* testsuite/libgomp.oacc-c-c++-common/declare-3.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/enter-data.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-2.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit-2.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-update.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-parallel-loop-data-enter-exit.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-1.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-2.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-3.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-4.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-5.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-1.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-2.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-3.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-4.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-5.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-6.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-vector-1.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-vector-2.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-1.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-2.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-3.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-4.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-5.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-6.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-7.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-reduction-1.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/parallel-loop-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/parallel-loop-1.h: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/parallel-loop-2.h: Likewise.
+	* testsuite/libgomp.oacc-fortran/cublas-fixed.h: Likewise.
+	* testsuite/libgomp.oacc-fortran/dummy-array.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/host_data-2.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/host_data-3.f: Likewise.
+	* testsuite/libgomp.oacc-fortran/host_data-4.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-acc-loop-reduction-2.f90:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-acc-loop-reduction.f90:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-collapse-3.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-collapse-4.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-independent.f90:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-loop-1.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-map-1.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-parallel-loop-data-enter-exit.f95:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-1.f90:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-2.f90:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-3.f90:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-6.f90:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-vector-1.f90:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-vector-2.f90:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-1.f90:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-2.f90:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-3.f90:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-4.f90:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-5.f90:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-6.f90:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-7.f90:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-reduction-1.f90:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/lib-12.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/lib-13.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/lib-14.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/lib-15.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/parallel-loop-1.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/reference-reductions.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/vector-routine.f90: Likewise.
+
 2018-06-20  Chung-Lin Tang <cltang@codesourcery.com>
 	    Thomas Schwinge <thomas@codesourcery.com>
 	    Cesar Philippidis  <cesar@codesourcery.com>
diff --git libgomp/testsuite/libgomp.oacc-c++/non-scalar-data.C libgomp/testsuite/libgomp.oacc-c++/non-scalar-data.C
new file mode 100644
index 0000000..8e4b296
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c++/non-scalar-data.C
@@ -0,0 +1,110 @@ 
+// Ensure that non-scalar dummy arguments which are implicitly used inside
+// offloaded regions are properly mapped using present_or_copy semantics.
+
+// { dg-xfail-if "TODO" { *-*-* } }
+// { dg-excess-errors "ICE" }
+
+#include <cassert>
+
+const int n = 100;
+
+struct data {
+  int v;
+};
+
+void
+kernels_present (data &d, int &x)
+{
+#pragma acc kernels present (d, x) default (none)
+  {
+    d.v = x;
+  }
+}
+
+void
+parallel_present (data &d, int &x)
+{
+#pragma acc parallel present (d, x) default (none)
+  {
+    d.v = x;
+  }
+}
+
+void
+kernels_implicit (data &d, int &x)
+{
+#pragma acc kernels
+  {
+    d.v = x;
+  }
+}
+
+void
+parallel_implicit (data &d, int &x)
+{
+#pragma acc parallel
+  {
+    d.v = x;
+  }
+}
+
+void
+reference_data (data &d, int &x)
+{
+#pragma acc data copy(d, x)
+  {
+    kernels_present (d, x);
+
+#pragma acc update host(d)
+    assert (d.v == x);
+
+    x = 200;
+#pragma acc update device(x)
+    
+    parallel_present (d, x);
+  }
+
+  assert (d.v == x);
+
+  x = 300;
+  kernels_implicit (d, x);
+  assert (d.v == x);
+
+  x = 400;
+  parallel_implicit (d, x);
+  assert (d.v == x);
+}
+
+int
+main ()
+{
+  data d;
+  int x = 100;
+
+#pragma acc data copy(d, x)
+  {
+    kernels_present (d, x);
+
+#pragma acc update host(d)
+    assert (d.v == x);
+
+    x = 200;
+#pragma acc update device(x)
+    
+    parallel_present (d, x);
+  }
+
+  assert (d.v == x);
+
+  x = 300;
+  kernels_implicit (d, x);
+  assert (d.v == x);
+
+  x = 400;
+  parallel_implicit (d, x);
+  assert (d.v == x);
+
+  reference_data (d, x);
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
index c1c0825..0c6abe6 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
@@ -1,6 +1,7 @@ 
 /* Test 'acc enter/exit data' regions.  */
 
 /* { dg-do run } */
+/* { dg-xfail-run-if "TODO" { openacc_nvidia_accel_selected } } */
 
 #include <stdlib.h>
 
@@ -46,6 +47,32 @@  main (int argc, char **argv)
 
   for (i = 0; i < N; i++)
     {
+      a[i] = 3.0;
+      b[i] = 0.0;
+    }
+
+#pragma acc enter data copyin (a[0:N]) async 
+#pragma acc enter data copyin (b[0:N]) async wait
+#pragma acc enter data copyin (N) async wait
+#pragma acc parallel async wait
+#pragma acc loop
+  for (i = 0; i < N; i++)
+    b[i] = a[i];
+
+#pragma acc exit data copyout (a[0:N]) copyout (b[0:N]) delete (N) wait async
+#pragma acc wait
+
+  for (i = 0; i < N; i++)
+    {
+      if (a[i] != 3.0)
+	abort ();
+
+      if (b[i] != 3.0)
+	abort ();
+    }
+
+  for (i = 0; i < N; i++)
+    {
       a[i] = 2.0;
       b[i] = 0.0;
     }
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/declare-3.c libgomp/testsuite/libgomp.oacc-c-c++-common/declare-3.c
new file mode 100644
index 0000000..c3a2187
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/declare-3.c
@@ -0,0 +1,61 @@ 
+/* { dg-do run { target openacc_nvidia_accel_selected } } */
+
+#include <stdlib.h>
+#include <openacc.h>
+
+float *b;
+#pragma acc declare deviceptr (b)
+
+#pragma acc routine
+float *
+subr2 (void)
+{
+  return b;
+}
+
+float
+subr1 (float a)
+{
+  float b;
+#pragma acc declare present_or_copy (b)
+  float c;
+#pragma acc declare present_or_copyin (c)
+  float d;
+#pragma acc declare present_or_create (d)
+  float e;
+#pragma acc declare present_or_copyout (e)
+
+#pragma acc parallel copy (a)
+  {
+    b = a;
+    c = b;
+    d = c;
+    e = d;
+    a = e;
+  }
+
+  return a;
+}
+
+int
+main (int argc, char **argv)
+{
+  float a;
+  float *c;
+
+  a = 2.0;
+
+  a = subr1 (a);
+
+  if (a != 2.0)
+    abort ();
+
+  b = (float *) acc_malloc (sizeof (float));
+
+  c = subr2 ();
+
+  if (b != c)
+    abort ();
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/enter-data.c libgomp/testsuite/libgomp.oacc-c-c++-common/enter-data.c
new file mode 100644
index 0000000..0f566c9
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/enter-data.c
@@ -0,0 +1,23 @@ 
+/* This test verifies that the present_or_* data clauses (pcopyin, pcreate)
+   on acc enter data don't cause duplicate mapping failures at runtime.  */
+
+/* { dg-do run } */
+
+#include <stdlib.h>
+
+int
+main (void)
+{
+  int a;
+
+#pragma acc enter data copyin (a)
+#pragma acc enter data pcopyin (a)
+#pragma acc enter data pcreate (a)
+#pragma acc exit data delete (a)
+
+#pragma acc enter data create (a)
+#pragma acc enter data pcreate (a)
+#pragma acc exit data delete (a)
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c
index 51745ba..21d2139 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c
@@ -1,24 +1,16 @@ 
 /* { dg-do run { target openacc_nvidia_accel_selected } } */
-/* { dg-additional-options "-lcuda -lcublas -lcudart" } */
+/* { dg-additional-options "-lm -lcuda -lcublas -lcudart -Wall -Wextra" } */
 
 #include <stdlib.h>
+#include <math.h>
 #include <openacc.h>
 #include <cuda.h>
 #include <cuda_runtime_api.h>
 #include <cublas_v2.h>
 
-void
-saxpy_host (int n, float a, float *x, float *y)
-{
-  int i;
-
-  for (i = 0; i < n; i++)
-    y[i] = y[i] + a * x[i];
-}
-
 #pragma acc routine
 void
-saxpy_target (int n, float a, float *x, float *y)
+saxpy (int n, float a, float *x, float *y)
 {
   int i;
 
@@ -26,8 +18,18 @@  saxpy_target (int n, float a, float *x, float *y)
     y[i] = y[i] + a * x[i];
 }
 
+void
+validate_results (int n, float *a, float *b)
+{
+  int i;
+
+  for (i = 0; i < n; i++)
+    if (fabs (a[i] - b[i]) > .00001)
+      abort ();
+}
+
 int
-main(int argc, char **argv)
+main()
 {
 #define N 8
   int i;
@@ -42,7 +44,7 @@  main(int argc, char **argv)
       y[i] = y_ref[i] = 3.0;
     }
 
-  saxpy_host (N, a, x_ref, y_ref);
+  saxpy (N, a, x_ref, y_ref);
 
   cublasCreate (&h);
 
@@ -54,11 +56,7 @@  main(int argc, char **argv)
     }
   }
 
-  for (i = 0; i < N; i++)
-    {
-      if (y[i] != y_ref[i])
-        abort ();
-    }
+  validate_results (N, y, y_ref);
 
 #pragma acc data create (x[0:N]) copyout (y[0:N])
   {
@@ -74,11 +72,7 @@  main(int argc, char **argv)
 
   cublasDestroy (h);
 
-  for (i = 0; i < N; i++)
-    {
-      if (y[i] != y_ref[i])
-        abort ();
-    }
+  validate_results (N, y, y_ref);
 
   for (i = 0; i < N; i++)
     y[i] = 3.0;
@@ -87,14 +81,24 @@  main(int argc, char **argv)
 #pragma acc data copyin (x[0:N]) copyin (a) copy (y[0:N])
   {
 #pragma acc parallel present (x[0:N]) pcopy (y[0:N]) present (a)
-    saxpy_target (N, a, x, y);
+    saxpy (N, a, x, y);
   }
 
+  validate_results (N, y, y_ref);
+
+  /* Exercise host_data with data transferred with acc enter data.  */
+
   for (i = 0; i < N; i++)
-    {
-      if (y[i] != y_ref[i])
-        abort ();
-    }
+    y[i] = 3.0;
+
+#pragma acc enter data copyin (x, a, y)
+#pragma acc parallel present (x[0:N]) pcopy (y[0:N]) present (a)
+  {
+    saxpy (N, a, x, y);
+  }
+#pragma acc exit data delete (x, a) copyout (y)
+
+  validate_results (N, y, y_ref);
 
   return 0;
 }
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-2.c
new file mode 100644
index 0000000..607c350
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-2.c
@@ -0,0 +1,53 @@ 
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+#pragma acc data copyout (a[0:N])
+  {
+#pragma acc kernels present (a[0:N])
+    {
+      for (COUNTERTYPE i = 0; i < N; i++)
+	a[i] = i * 2;
+    }
+  }
+
+#pragma acc data copyout (b[0:N])
+  {
+#pragma acc kernels present (b[0:N])
+    {
+      for (COUNTERTYPE i = 0; i < N; i++)
+	b[i] = i * 4;
+    }
+  }
+
+#pragma acc data copyin (a[0:N], b[0:N]) copyout (c[0:N])
+  {
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+    {
+      for (COUNTERTYPE ii = 0; ii < N; ii++)
+	c[ii] = a[ii] + b[ii];
+    }
+  }
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit-2.c
new file mode 100644
index 0000000..8b9dd5f
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit-2.c
@@ -0,0 +1,51 @@ 
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+#pragma acc enter data create (a[0:N])
+#pragma acc kernels present (a[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      a[i] = i * 2;
+  }
+#pragma acc exit data copyout (a[0:N])
+
+#pragma acc enter data create (b[0:N])
+#pragma acc kernels present (b[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      b[i] = i * 4;
+  }
+#pragma acc exit data copyout (b[0:N])
+
+
+#pragma acc enter data copyin (a[0:N], b[0:N]) create (c[0:N])
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+#pragma acc exit data copyout (c[0:N])
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit.c
new file mode 100644
index 0000000..5d5da6f
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit.c
@@ -0,0 +1,48 @@ 
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+#pragma acc enter data create (a[0:N], b[0:N], c[0:N])
+
+#pragma acc kernels present (a[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      a[i] = i * 2;
+  }
+
+#pragma acc kernels present (b[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      b[i] = i * 4;
+  }
+
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+#pragma acc exit data copyout (a[0:N], b[0:N], c[0:N])
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-update.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-update.c
new file mode 100644
index 0000000..c111c8f
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-update.c
@@ -0,0 +1,50 @@ 
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+#pragma acc enter data create (a[0:N], b[0:N], c[0:N])
+
+#pragma acc kernels present (a[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      a[i] = i * 2;
+  }
+
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      b[i] = i * 4;
+  }
+
+#pragma acc update device (b[0:N])
+
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+#pragma acc exit data copyout (a[0:N], c[0:N])
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
+
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data.c
new file mode 100644
index 0000000..947bcda
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data.c
@@ -0,0 +1,47 @@ 
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+#pragma acc data copyout (a[0:N], b[0:N], c[0:N])
+  {
+#pragma acc kernels present (a[0:N])
+    {
+      for (COUNTERTYPE i = 0; i < N; i++)
+	a[i] = i * 2;
+    }
+
+#pragma acc kernels present (b[0:N])
+    {
+      for (COUNTERTYPE i = 0; i < N; i++)
+	b[i] = i * 4;
+    }
+
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+    {
+      for (COUNTERTYPE ii = 0; ii < N; ii++)
+	c[ii] = a[ii] + b[ii];
+    }
+  }
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-parallel-loop-data-enter-exit.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-parallel-loop-data-enter-exit.c
new file mode 100644
index 0000000..ebcc6e1
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-parallel-loop-data-enter-exit.c
@@ -0,0 +1,49 @@ 
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+#pragma acc enter data create (a[0:N], b[0:N], c[0:N])
+
+#pragma acc kernels present (a[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      a[i] = i * 2;
+  }
+
+#pragma acc parallel present (b[0:N])
+  {
+#pragma acc loop
+    for (COUNTERTYPE i = 0; i < N; i++)
+      b[i] = i * 4;
+  }
+
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+#pragma acc exit data copyout (a[0:N], b[0:N], c[0:N])
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-1.c
new file mode 100644
index 0000000..bcbe28a
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-1.c
@@ -0,0 +1,54 @@ 
+#include <assert.h>
+
+/* Test of worker-private variables declared in a local scope, broadcasting
+   to vector-partitioned mode.  Back-to-back worker loops.  */
+
+int
+main (int argc, char* argv[])
+{
+  int i, arr[32 * 32 * 32];
+
+  for (i = 0; i < 32 * 32 * 32; i++)
+    arr[i] = i;
+
+  #pragma acc kernels copy(arr)
+  {
+    int j;
+
+    #pragma acc loop gang(num:32)
+    for (i = 0; i < 32; i++)
+      {
+	#pragma acc loop worker(num:32)
+	for (j = 0; j < 32; j++)
+	  {
+	    int k;
+	    int x = i ^ j * 3;
+
+	    #pragma acc loop vector(length:32)
+	    for (k = 0; k < 32; k++)
+	      arr[i * 1024 + j * 32 + k] += x * k;
+	  }
+
+	#pragma acc loop worker(num:32)
+	for (j = 0; j < 32; j++)
+	  {
+	    int k;
+	    int x = i | j * 5;
+	    
+	    #pragma acc loop vector(length:32)
+	    for (k = 0; k < 32; k++)
+	      arr[i * 1024 + j * 32 + k] += x * k;
+	  }
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++)
+      for (int k = 0; k < 32; k++)
+        {
+	  int idx = i * 1024 + j * 32 + k;
+          assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
+	}
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-2.c
new file mode 100644
index 0000000..a944486
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-2.c
@@ -0,0 +1,49 @@ 
+#include <assert.h>
+
+/* Test of worker-private variables declared in a local scope, broadcasting
+   to vector-partitioned mode.  Successive vector loops.  */
+
+int
+main (int argc, char* argv[])
+{
+  int x = 5, i, arr[32 * 32 * 32];
+
+  for (i = 0; i < 32 * 32 * 32; i++)
+    arr[i] = i;
+
+  #pragma acc kernels copy(arr)
+  {
+    int j;
+
+    #pragma acc loop gang(num:32)
+    for (i = 0; i < 32; i++)
+      {
+        #pragma acc loop worker(num:32)
+	for (j = 0; j < 32; j++)
+	  {
+	    int k;
+	    int x = i ^ j * 3;
+
+	    #pragma acc loop vector(length:32)
+	    for (k = 0; k < 32; k++)
+	      arr[i * 1024 + j * 32 + k] += x * k;
+	    
+	    x = i | j * 5;
+	    
+	    #pragma acc loop vector(length:32)
+	    for (k = 0; k < 32; k++)
+	      arr[i * 1024 + j * 32 + k] += x * k;
+	  }
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++)
+      for (int k = 0; k < 32; k++)
+        {
+	  int idx = i * 1024 + j * 32 + k;
+          assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
+	}
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-3.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-3.c
new file mode 100644
index 0000000..ba0b44d
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-3.c
@@ -0,0 +1,55 @@ 
+#include <assert.h>
+
+/* Test of worker-private variables declared in a local scope, broadcasting
+   to vector-partitioned mode.  Aggregate worker variable.  */
+
+typedef struct
+{
+  int x, y;  /* Multipliers used by the first and second vector loop.  */
+} vec2;
+
+int
+main (int argc, char* argv[])
+{  /* Update arr in a kernels region, then check every element.  */
+  int i, arr[32 * 32 * 32];
+
+  for (i = 0; i < 32 * 32 * 32; i++)
+    arr[i] = i;  /* Each element starts out equal to its index.  */
+
+  #pragma acc kernels copy(arr)
+  {
+    int j;
+
+    #pragma acc loop gang(num:32)
+    for (i = 0; i < 32; i++)
+      {
+        #pragma acc loop worker(num:32)
+	for (j = 0; j < 32; j++)
+	  {
+	    int k;
+	    vec2 pt;  /* Aggregate declared in the worker loop body.  */
+	    
+	    pt.x = i ^ j * 3;  /* Both fields are set before either loop.  */
+	    pt.y = i | j * 5;
+
+	    #pragma acc loop vector(length:32)
+	    for (k = 0; k < 32; k++)
+	      arr[i * 1024 + j * 32 + k] += pt.x * k;
+	    
+	    #pragma acc loop vector(length:32)
+	    for (k = 0; k < 32; k++)
+	      arr[i * 1024 + j * 32 + k] += pt.y * k;
+	  }
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++)
+      for (int k = 0; k < 32; k++)
+        {  /* Expect the initial value plus both increments.  */
+	  int idx = i * 1024 + j * 32 + k;  /* Flat index of (i, j, k).  */
+          assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
+	}
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-4.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-4.c
new file mode 100644
index 0000000..7189d2a
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-4.c
@@ -0,0 +1,58 @@ 
+#include <assert.h>
+
+/* Test of worker-private variables declared in a local scope, broadcasting
+   to vector-partitioned mode.  Addressable worker variable.  */
+
+typedef struct
+{
+  int x, y;  /* Multipliers used by the first and second vector loop.  */
+} vec2;
+
+int
+main (int argc, char* argv[])
+{  /* Update arr in a kernels region, then check every element.  */
+  int i, arr[32 * 32 * 32];
+
+  for (i = 0; i < 32 * 32 * 32; i++)
+    arr[i] = i;  /* Each element starts out equal to its index.  */
+
+  #pragma acc kernels copy(arr)
+  {
+    int j;
+
+    #pragma acc loop gang(num:32)
+    for (i = 0; i < 32; i++)
+      {
+        #pragma acc loop worker(num:32)
+	for (j = 0; j < 32; j++)
+	  {
+	    int k;
+	    vec2 pt, *ptp;
+	    
+	    ptp = &pt;  /* 'pt' has its address taken here.  */
+	    
+	    pt.x = i ^ j * 3;  /* Written directly, read through ptp.  */
+
+	    #pragma acc loop vector(length:32)
+	    for (k = 0; k < 32; k++)
+	      arr[i * 1024 + j * 32 + k] += ptp->x * k;
+
+	    ptp->y = i | j * 5;  /* Written through ptp, read directly.  */
+	    
+	    #pragma acc loop vector(length:32)
+	    for (k = 0; k < 32; k++)
+	      arr[i * 1024 + j * 32 + k] += pt.y * k;
+	  }
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++)
+      for (int k = 0; k < 32; k++)
+        {  /* Expect the initial value plus both increments.  */
+	  int idx = i * 1024 + j * 32 + k;  /* Flat index of (i, j, k).  */
+          assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
+	}
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-5.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-5.c
new file mode 100644
index 0000000..854ad7e
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-5.c
@@ -0,0 +1,51 @@ 
+#include <assert.h>
+
+/* Test of worker-private variables declared in a local scope, broadcasting
+   to vector-partitioned mode.  Array worker variable.  */
+
+int
+main (int argc, char* argv[])
+{  /* Update arr in a kernels region, then check every element.  */
+  int i, arr[32 * 32 * 32];
+
+  for (i = 0; i < 32 * 32 * 32; i++)
+    arr[i] = i;  /* Each element starts out equal to its index.  */
+
+  #pragma acc kernels copy(arr)
+  {
+    int j;
+
+    #pragma acc loop gang(num:32)
+    for (i = 0; i < 32; i++)
+      {
+        #pragma acc loop worker(num:32)
+	for (j = 0; j < 32; j++)
+	  {
+	    int k;
+	    int pt[2];  /* Array declared in the worker loop body.  */
+	    
+	    pt[0] = i ^ j * 3;  /* Multiplier for the first vector loop.  */
+
+	    #pragma acc loop vector(length:32)
+	    for (k = 0; k < 32; k++)
+	      arr[i * 1024 + j * 32 + k] += pt[0] * k;
+
+	    pt[1] = i | j * 5;  /* Set only after the first vector loop.  */
+	    
+	    #pragma acc loop vector(length:32)
+	    for (k = 0; k < 32; k++)
+	      arr[i * 1024 + j * 32 + k] += pt[1] * k;
+	  }
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++)
+      for (int k = 0; k < 32; k++)
+        {  /* Expect the initial value plus both increments.  */
+	  int idx = i * 1024 + j * 32 + k;  /* Flat index of (i, j, k).  */
+          assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
+	}
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-1.c
new file mode 100644
index 0000000..5bc90c2
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-1.c
@@ -0,0 +1,27 @@ 
+#include <assert.h>
+
+/* Test of gang-private variables declared on loop directive.  */
+
+int
+main (int argc, char* argv[])
+{  /* Update arr in a kernels region, then check every element.  */
+  int x = 5, i, arr[32];  /* 'x' is listed in private() on the loop.  */
+
+  for (i = 0; i < 32; i++)
+    arr[i] = i;  /* Each element starts out equal to its index.  */
+
+  #pragma acc kernels copy(arr)
+  {
+    #pragma acc loop gang(num:32) private(x)
+    for (i = 0; i < 32; i++)
+      {
+	x = i * 2;  /* Assigned before use in every iteration.  */
+	arr[i] += x;
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    assert (arr[i] == i * 3);  /* i + 2 * i.  */
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-2.c
new file mode 100644
index 0000000..3eb1167
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-2.c
@@ -0,0 +1,31 @@ 
+#include <assert.h>
+
+/* Test of gang-private variables declared on loop directive, with broadcasting
+   to partitioned workers.  */
+
+int
+main (int argc, char* argv[])
+{  /* Update arr in a kernels region, then check every element.  */
+  int x = 5, i, arr[32 * 32];  /* 'x' is listed in private() on the loop.  */
+
+  for (i = 0; i < 32 * 32; i++)
+    arr[i] = i;  /* Each element starts out equal to its index.  */
+
+  #pragma acc kernels copy(arr)
+  {
+    #pragma acc loop gang(num:32) private(x)
+    for (i = 0; i < 32; i++)
+      {
+	x = i * 2;  /* Set in the gang loop, read in the worker loop.  */
+
+	#pragma acc loop worker(num:32)
+	for (int j = 0; j < 32; j++)
+	  arr[i * 32 + j] += x;
+      }
+  }
+
+  for (i = 0; i < 32 * 32; i++)
+    assert (arr[i] == i + (i / 32) * 2);  /* i / 32 is the gang index.  */
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-3.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-3.c
new file mode 100644
index 0000000..86b9a71
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-3.c
@@ -0,0 +1,31 @@ 
+#include <assert.h>
+
+/* Test of gang-private variables declared on loop directive, with broadcasting
+   to partitioned vectors.  */
+
+int
+main (int argc, char* argv[])
+{  /* Update arr in a kernels region, then check every element.  */
+  int x = 5, i, arr[32 * 32];  /* 'x' is listed in private() on the loop.  */
+
+  for (i = 0; i < 32 * 32; i++)
+    arr[i] = i;  /* Each element starts out equal to its index.  */
+
+  #pragma acc kernels copy(arr)
+  {
+    #pragma acc loop gang(num:32) private(x)
+    for (i = 0; i < 32; i++)
+      {
+	x = i * 2;  /* Set in the gang loop, read in the vector loop.  */
+
+	#pragma acc loop vector(length:32)
+	for (int j = 0; j < 32; j++)
+	  arr[i * 32 + j] += x;
+      }
+  }
+
+  for (i = 0; i < 32 * 32; i++)
+    assert (arr[i] == i + (i / 32) * 2);  /* i / 32 is the gang index.  */
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-4.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-4.c
new file mode 100644
index 0000000..4174248
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-4.c
@@ -0,0 +1,35 @@ 
+#include <assert.h>
+
+/* Test of gang-private addressable variable declared on loop directive, with
+   broadcasting to partitioned workers.  */
+
+int
+main (int argc, char* argv[])
+{  /* Update arr in a kernels region, then check every element.  */
+  int x = 5, i, arr[32 * 32];  /* 'x' is listed in private() on the loop.  */
+
+  for (i = 0; i < 32 * 32; i++)
+    arr[i] = i;  /* Each element starts out equal to its index.  */
+
+  #pragma acc kernels copy(arr)
+  {
+    #pragma acc loop gang(num:32) private(x)
+    for (i = 0; i < 32; i++)
+      {
+        int *p = &x;  /* 'x' has its address taken here.  */
+
+	x = i * 2;  /* Set in the gang loop, read in the worker loop.  */
+
+	#pragma acc loop worker(num:32)
+	for (int j = 0; j < 32; j++)
+	  arr[i * 32 + j] += x;
+
+	(*p)--;  /* After the worker loop; 'x' is reset next iteration.  */
+      }
+  }
+
+  for (i = 0; i < 32 * 32; i++)
+    assert (arr[i] == i + (i / 32) * 2);  /* i / 32 is the gang index.  */
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-5.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-5.c
new file mode 100644
index 0000000..b160eaa
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-5.c
@@ -0,0 +1,32 @@ 
+#include <assert.h>
+
+/* Test of gang-private array variable declared on loop directive, with
+   broadcasting to partitioned workers.  */
+
+int
+main (int argc, char* argv[])
+{  /* Update arr in a kernels region, then check every element.  */
+  int x[8], i, arr[32 * 32];  /* 'x' is listed in private() on the loop.  */
+
+  for (i = 0; i < 32 * 32; i++)
+    arr[i] = i;  /* Each element starts out equal to its index.  */
+
+  #pragma acc kernels copy(arr)
+  {
+    #pragma acc loop gang(num:32) private(x)
+    for (i = 0; i < 32; i++)
+      {
+        for (int j = 0; j < 8; j++)
+	  x[j] = j * 2;  /* Refilled in every gang iteration.  */
+
+	#pragma acc loop worker(num:32)
+	for (int j = 0; j < 32; j++)
+	  arr[i * 32 + j] += x[j % 8];
+      }
+  }
+
+  for (i = 0; i < 32 * 32; i++)
+    assert (arr[i] == i + (i % 8) * 2);  /* i % 8 == j % 8: 8 divides 32.  */
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-6.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-6.c
new file mode 100644
index 0000000..88ab245
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-6.c
@@ -0,0 +1,40 @@ 
+#include <assert.h>
+
+/* Test of gang-private aggregate variable declared on loop directive, with
+   broadcasting to partitioned workers.  */
+
+typedef struct {
+  int x, y, z;
+  int attr[13];  /* Only attr[5] is accessed by main.  */
+} vec3;
+
+int
+main (int argc, char* argv[])
+{  /* Update arr in a kernels region, then check every element.  */
+  int i, arr[32 * 32];
+  vec3 pt;  /* Listed in private() on the gang loop.  */
+
+  for (i = 0; i < 32 * 32; i++)
+    arr[i] = i;  /* Each element starts out equal to its index.  */
+
+  #pragma acc kernels copy(arr)
+  {
+    #pragma acc loop gang private(pt)
+    for (i = 0; i < 32; i++)
+      {
+        pt.x = i;  /* The four values written here sum to 13 * i.  */
+	pt.y = i * 2;
+	pt.z = i * 4;
+	pt.attr[5] = i * 6;
+
+	#pragma acc loop worker
+	for (int j = 0; j < 32; j++)
+	  arr[i * 32 + j] += pt.x + pt.y + pt.z + pt.attr[5];
+      }
+  }
+
+  for (i = 0; i < 32 * 32; i++)
+    assert (arr[i] == i + (i / 32) * 13);  /* i / 32 is the gang index.  */
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-vector-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-vector-1.c
new file mode 100644
index 0000000..df4add1
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-vector-1.c
@@ -0,0 +1,51 @@ 
+#include <assert.h>
+
+/* Test of vector-private variables declared on loop directive.  */
+
+int
+main (int argc, char* argv[])
+{  /* Update arr in a kernels region, then check every element.  */
+  int x, i, arr[32 * 32 * 32];  /* 'x' is private() on both vector loops.  */
+
+  for (i = 0; i < 32 * 32 * 32; i++)
+    arr[i] = i;  /* Each element starts out equal to its index.  */
+
+  #pragma acc kernels copy(arr)
+  {
+    int j;
+
+    #pragma acc loop gang(num:32)
+    for (i = 0; i < 32; i++)
+      {
+        #pragma acc loop worker(num:32)
+	for (j = 0; j < 32; j++)
+	  {
+	    int k;
+
+	    #pragma acc loop vector(length:32) private(x)
+	    for (k = 0; k < 32; k++)
+	      {
+		x = i ^ j * 3;  /* Assigned inside the vector loop body.  */
+		arr[i * 1024 + j * 32 + k] += x * k;
+	      }
+
+	    #pragma acc loop vector(length:32) private(x)
+	    for (k = 0; k < 32; k++)
+	      {
+		x = i | j * 5;  /* Assigned inside the vector loop body.  */
+		arr[i * 1024 + j * 32 + k] += x * k;
+	      }
+	  }
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++)
+      for (int k = 0; k < 32; k++)
+        {  /* Expect the initial value plus both increments.  */
+	  int idx = i * 1024 + j * 32 + k;  /* Flat index of (i, j, k).  */
+          assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
+	}
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-vector-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-vector-2.c
new file mode 100644
index 0000000..53c56b2
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-vector-2.c
@@ -0,0 +1,46 @@ 
+#include <assert.h>
+
+/* Test of vector-private variables declared on loop directive. Array type.  */
+
+int
+main (int argc, char* argv[])
+{  /* Update arr in a kernels region, then check every element.  */
+  int pt[2], i, arr[32 * 32 * 32];  /* 'pt' is private() on the vector loop.  */
+
+  for (i = 0; i < 32 * 32 * 32; i++)
+    arr[i] = i;  /* Each element starts out equal to its index.  */
+
+  #pragma acc kernels copy(arr)
+  {
+    int j;
+
+    #pragma acc loop gang(num:32)
+    for (i = 0; i < 32; i++)
+      {
+        #pragma acc loop worker(num:32)
+	for (j = 0; j < 32; j++)
+	  {
+	    int k;
+
+	    #pragma acc loop vector(length:32) private(pt)
+	    for (k = 0; k < 32; k++)
+	      {
+	        pt[0] = i ^ j * 3;  /* Both elements are set per iteration.  */
+		pt[1] = i | j * 5;
+		arr[i * 1024 + j * 32 + k] += pt[0] * k;
+		arr[i * 1024 + j * 32 + k] += pt[1] * k;
+	      }
+	  }
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++)
+      for (int k = 0; k < 32; k++)
+        {  /* Expect the initial value plus both increments.  */
+	  int idx = i * 1024 + j * 32 + k;  /* Flat index of (i, j, k).  */
+          assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
+	}
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-1.c
new file mode 100644
index 0000000..95db2f8
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-1.c
@@ -0,0 +1,36 @@ 
+#include <assert.h>
+
+/* Test of worker-private variables declared on a loop directive.  */
+
+int
+main (int argc, char* argv[])
+{  /* Update arr in a kernels region, then check every element.  */
+  int x = 5, i, arr[32 * 32];  /* 'x' is private() on the worker loop.  */
+
+  for (i = 0; i < 32 * 32; i++)
+    arr[i] = i;  /* Each element starts out equal to its index.  */
+
+  #pragma acc kernels copy(arr)
+  {
+    int j;
+
+    #pragma acc loop gang(num:32)
+    for (i = 0; i < 32; i++)
+      {
+        #pragma acc loop worker(num:32) private(x)
+	for (j = 0; j < 32; j++)
+	  {
+	    x = i ^ j * 3;
+	    /* Try to ensure 'x' accesses don't get optimized into a
+	       temporary.  */
+	    __asm__ __volatile__ ("");
+	    arr[i * 32 + j] += x;
+	  }
+      }
+  }
+
+  for (i = 0; i < 32 * 32; i++)
+    assert (arr[i] == i + ((i / 32) ^ (i % 32) * 3));  /* Gang i / 32, worker i % 32.  */
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-2.c
new file mode 100644
index 0000000..ceaa3ee
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-2.c
@@ -0,0 +1,43 @@ 
+#include <assert.h>
+
+/* Test of worker-private variables declared on a loop directive, broadcasting
+   to vector-partitioned mode.  */
+
+int
+main (int argc, char* argv[])
+{  /* Update arr in a kernels region, then check every element.  */
+  int x = 5, i, arr[32 * 32 * 32];  /* 'x' is private() on the worker loop.  */
+
+  for (i = 0; i < 32 * 32 * 32; i++)
+    arr[i] = i;  /* Each element starts out equal to its index.  */
+
+  #pragma acc kernels copy(arr)
+  {
+    int j;
+
+    #pragma acc loop gang(num:32)
+    for (i = 0; i < 32; i++)
+      {
+        #pragma acc loop worker(num:32) private(x)
+	for (j = 0; j < 32; j++)
+	  {
+	    int k;
+	    x = i ^ j * 3;  /* Set in the worker loop, read in the vector loop.  */
+
+	    #pragma acc loop vector(length:32)
+	    for (k = 0; k < 32; k++)
+	      arr[i * 1024 + j * 32 + k] += x * k;
+	  }
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++)
+      for (int k = 0; k < 32; k++)
+        {  /* Expect the initial value plus the single increment.  */
+	  int idx = i * 1024 + j * 32 + k;  /* Flat index of (i, j, k).  */
+          assert (arr[idx] == idx + (i ^ j * 3) * k);
+	}
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-3.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-3.c
new file mode 100644
index 0000000..193a1d1
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-3.c
@@ -0,0 +1,54 @@ 
+#include <assert.h>
+
+/* Test of worker-private variables declared on a loop directive, broadcasting
+   to vector-partitioned mode.  Back-to-back worker loops.  */
+
+int
+main (int argc, char* argv[])
+{  /* Update arr in a kernels region, then check every element.  */
+  int x = 5, i, arr[32 * 32 * 32];  /* 'x' is private() on both worker loops.  */
+
+  for (i = 0; i < 32 * 32 * 32; i++)
+    arr[i] = i;  /* Each element starts out equal to its index.  */
+
+  #pragma acc kernels copy(arr)
+  {
+    int j;
+
+    #pragma acc loop gang(num:32)
+    for (i = 0; i < 32; i++)
+      {
+        #pragma acc loop worker(num:32) private(x)
+	for (j = 0; j < 32; j++)
+	  {
+	    int k;
+	    x = i ^ j * 3;  /* Multiplier for the first worker loop.  */
+
+	    #pragma acc loop vector(length:32)
+	    for (k = 0; k < 32; k++)
+	      arr[i * 1024 + j * 32 + k] += x * k;
+	  }
+
+	#pragma acc loop worker(num:32) private(x)
+	for (j = 0; j < 32; j++)
+	  {
+	    int k;
+	    x = i | j * 5;  /* Multiplier for the second worker loop.  */
+	    
+	    #pragma acc loop vector(length:32)
+	    for (k = 0; k < 32; k++)
+	      arr[i * 1024 + j * 32 + k] += x * k;
+	  }
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++)
+      for (int k = 0; k < 32; k++)
+        {  /* Expect the initial value plus both increments.  */
+	  int idx = i * 1024 + j * 32 + k;  /* Flat index of (i, j, k).  */
+          assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
+	}
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-4.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-4.c
new file mode 100644
index 0000000..4320cd8
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-4.c
@@ -0,0 +1,49 @@ 
+#include <assert.h>
+
+/* Test of worker-private variables declared on a loop directive, broadcasting
+   to vector-partitioned mode.  Successive vector loops.  */
+
+int
+main (int argc, char* argv[])
+{  /* Update arr in a kernels region, then check every element.  */
+  int x = 5, i, arr[32 * 32 * 32];  /* 'x' is private() on the worker loop.  */
+
+  for (i = 0; i < 32 * 32 * 32; i++)
+    arr[i] = i;  /* Each element starts out equal to its index.  */
+
+  #pragma acc kernels copy(arr)
+  {
+    int j;
+
+    #pragma acc loop gang(num:32)
+    for (i = 0; i < 32; i++)
+      {
+        #pragma acc loop worker(num:32) private(x)
+	for (j = 0; j < 32; j++)
+	  {
+	    int k;
+	    x = i ^ j * 3;  /* Multiplier for the first vector loop.  */
+
+	    #pragma acc loop vector(length:32)
+	    for (k = 0; k < 32; k++)
+	      arr[i * 1024 + j * 32 + k] += x * k;
+	    
+	    x = i | j * 5;  /* Updated between the two vector loops.  */
+	    
+	    #pragma acc loop vector(length:32)
+	    for (k = 0; k < 32; k++)
+	      arr[i * 1024 + j * 32 + k] += x * k;
+	  }
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++)
+      for (int k = 0; k < 32; k++)
+        {  /* Expect the initial value plus both increments.  */
+	  int idx = i * 1024 + j * 32 + k;  /* Flat index of (i, j, k).  */
+          assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
+	}
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-5.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-5.c
new file mode 100644
index 0000000..80992ee
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-5.c
@@ -0,0 +1,51 @@ 
+#include <assert.h>
+
+/* Test of worker-private variables declared on a loop directive, broadcasting
+   to vector-partitioned mode.  Addressable worker variable.  */
+
+int
+main (int argc, char* argv[])
+{  /* Update arr in a kernels region, then check every element.  */
+  int x = 5, i, arr[32 * 32 * 32];  /* 'x' is private() on the worker loop.  */
+
+  for (i = 0; i < 32 * 32 * 32; i++)
+    arr[i] = i;  /* Each element starts out equal to its index.  */
+
+  #pragma acc kernels copy(arr)
+  {
+    int j;
+
+    #pragma acc loop gang(num:32)
+    for (i = 0; i < 32; i++)
+      {
+        #pragma acc loop worker(num:32) private(x)
+	for (j = 0; j < 32; j++)
+	  {
+	    int k;
+	    int *p = &x;  /* 'x' has its address taken here.  */
+	    
+	    x = i ^ j * 3;  /* Multiplier for the first vector loop.  */
+
+	    #pragma acc loop vector(length:32)
+	    for (k = 0; k < 32; k++)
+	      arr[i * 1024 + j * 32 + k] += x * k;
+	    
+	    *p = i | j * 5;  /* Updates 'x' through the pointer.  */
+	    
+	    #pragma acc loop vector(length:32)
+	    for (k = 0; k < 32; k++)
+	      arr[i * 1024 + j * 32 + k] += x * k;
+	  }
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++)
+      for (int k = 0; k < 32; k++)
+        {  /* Expect the initial value plus both increments.  */
+	  int idx = i * 1024 + j * 32 + k;  /* Flat index of (i, j, k).  */
+          assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
+	}
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-6.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-6.c
new file mode 100644
index 0000000..005ba60
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-6.c
@@ -0,0 +1,55 @@ 
+#include <assert.h>
+
+/* Test of worker-private variables declared on a loop directive, broadcasting
+   to vector-partitioned mode.  Aggregate worker variable.  */
+
+typedef struct
+{
+  int x, y;  /* Multipliers used by the first and second vector loop.  */
+} vec2;
+
+int
+main (int argc, char* argv[])
+{  /* Update arr in a kernels region, then check every element.  */
+  int i, arr[32 * 32 * 32];
+  vec2 pt;  /* Listed in private() on the worker loop.  */
+
+  for (i = 0; i < 32 * 32 * 32; i++)
+    arr[i] = i;  /* Each element starts out equal to its index.  */
+
+  #pragma acc kernels copy(arr)
+  {
+    int j;
+
+    #pragma acc loop gang(num:32)
+    for (i = 0; i < 32; i++)
+      {
+        #pragma acc loop worker(num:32) private(pt)
+	for (j = 0; j < 32; j++)
+	  {
+	    int k;
+	    
+	    pt.x = i ^ j * 3;  /* Both fields are set before either loop.  */
+	    pt.y = i | j * 5;
+
+	    #pragma acc loop vector(length:32)
+	    for (k = 0; k < 32; k++)
+	      arr[i * 1024 + j * 32 + k] += pt.x * k;
+	    
+	    #pragma acc loop vector(length:32)
+	    for (k = 0; k < 32; k++)
+	      arr[i * 1024 + j * 32 + k] += pt.y * k;
+	  }
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++)
+      for (int k = 0; k < 32; k++)
+        {  /* Expect the initial value plus both increments.  */
+	  int idx = i * 1024 + j * 32 + k;  /* Flat index of (i, j, k).  */
+          assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
+	}
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-7.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-7.c
new file mode 100644
index 0000000..8d367fb
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-7.c
@@ -0,0 +1,54 @@ 
+#include <assert.h>
+
+/* Test of worker-private variables declared on loop directive, broadcasting
+   to vector-partitioned mode.  Array worker variable.  */
+
+int
+main (int argc, char* argv[])
+{  /* Update arr in a kernels region, then check every element.  */
+  int i, arr[32 * 32 * 32];
+  int pt[2];  /* Listed in private() on the worker loop.  */
+
+  for (i = 0; i < 32 * 32 * 32; i++)
+    arr[i] = i;  /* Each element starts out equal to its index.  */
+
+  /* "pt" is treated as "present_or_copy" on the kernels directive because it
+     is an array variable.  */
+  #pragma acc kernels copy(arr)
+  {
+    int j;
+
+    #pragma acc loop gang(num:32)
+    for (i = 0; i < 32; i++)
+      {
+        /* But here, it is made private per-worker.  */
+        #pragma acc loop worker(num:32) private(pt)
+	for (j = 0; j < 32; j++)
+	  {
+	    int k;
+	    
+	    pt[0] = i ^ j * 3;  /* Multiplier for the first vector loop.  */
+
+	    #pragma acc loop vector(length:32)
+	    for (k = 0; k < 32; k++)
+	      arr[i * 1024 + j * 32 + k] += pt[0] * k;
+
+	    pt[1] = i | j * 5;  /* Set only after the first vector loop.  */
+	    
+	    #pragma acc loop vector(length:32)
+	    for (k = 0; k < 32; k++)
+	      arr[i * 1024 + j * 32 + k] += pt[1] * k;
+	  }
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++)
+      for (int k = 0; k < 32; k++)
+        {  /* Expect the initial value plus both increments.  */
+	  int idx = i * 1024 + j * 32 + k;  /* Flat index of (i, j, k).  */
+          assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
+	}
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-reduction-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-reduction-1.c
new file mode 100644
index 0000000..95f1b77
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-reduction-1.c
@@ -0,0 +1,24 @@ 
+/* Verify that a simple, explicit acc loop reduction works inside
+ a kernels region.  */
+
+#include <stdlib.h>
+
+#define N 100
+
+int
+main ()
+{  /* Count N loop iterations through a '+' reduction and check the sum.  */
+  int i, red = 0;
+
+#pragma acc kernels
+  {
+#pragma acc loop reduction (+:red)
+  for (i = 0; i < N; i++)
+    red++;  /* One increment per iteration, so the total must be N.  */
+  }
+
+  if (red != N)
+    abort ();  /* Any other total means the reduction went wrong.  */
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
index 6743afa..71d3969 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
@@ -1,6 +1,3 @@ 
-/* { dg-do run } */
-/* { dg-additional-options "-O2" } */
-
 #include <stdio.h>
 #include <openacc.h>
 #include <gomp-constants.h>
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/mode-transitions.c libgomp/testsuite/libgomp.oacc-c-c++-common/mode-transitions.c
index 2394ac8..4474c12 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/mode-transitions.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/mode-transitions.c
@@ -74,6 +74,57 @@  void t2()
 }
 
 
+/* Test conditional vector-partitioned loops.  */
+
+void t3()
+{  /* Even gang iterations increment a 32-element row; odd ones decrement.  */
+  int n[32], arr[1024], i;
+
+  for (i = 0; i < 1024; i++)
+    arr[i] = 0;  /* Rows start at zero, so they end up as +1 or -1.  */
+
+  for (i = 0; i < 32; i++)
+    n[i] = 0;  /* Counts how often the two bracketing gang loops ran.  */
+
+  #pragma acc parallel copy(n, arr) \
+		       num_gangs(32) num_workers(1) vector_length(32)
+  {
+    int j, k;
+
+    #pragma acc loop gang(static:*)
+    for (j = 0; j < 32; j++)
+      n[j]++;
+
+    #pragma acc loop gang
+    for (j = 0; j < 32; j++)
+      {
+	if ((j % 2) == 0)  /* The branch is chosen per gang iteration.  */
+	  {
+	    #pragma acc loop vector
+	    for (k = 0; k < 32; k++)
+	      arr[j * 32 + k]++;
+	  }
+	else
+	  {
+	    #pragma acc loop vector
+	    for (k = 0; k < 32; k++)
+	      arr[j * 32 + k]--;
+	  }
+      }
+
+    #pragma acc loop gang(static:*)
+    for (j = 0; j < 32; j++)
+      n[j]++;
+  }
+
+  for (i = 0; i < 32; i++)
+    assert (n[i] == 2);  /* One increment from each bracketing loop.  */
+
+  for (i = 0; i < 1024; i++)  /* (i % 64) < 32 holds exactly for even rows.  */
+    assert (arr[i] == (((i % 64) < 32) ? 1 : -1));  /* '==' binds tighter than '?:'.  */
+}
+
+
 /* Test conditions inside vector-partitioned loops.  */
 
 void t4()
@@ -156,6 +207,79 @@  void t5()
 }
 
 
+/* Test switch containing vector-partitioned loops inside gang-partitioned
+   loops.  */
+
+void t6()
+{  /* Each gang iteration adds n[j] to its row via the matching switch case.  */
+  int n[32], arr[1024], i;
+
+  for (i = 0; i < 1024; i++)
+    arr[i] = 0;
+
+  for (i = 0; i < 32; i++)
+    n[i] = i % 5;  /* Values 0..4 before the parallel region.  */
+
+  #pragma acc parallel copy(n, arr) \
+		       num_gangs(32) num_workers(1) vector_length(32)
+  {
+    int j, k;
+
+    #pragma acc loop gang(static:*)
+    for (j = 0; j < 32; j++)
+      n[j]++;  /* Now (j % 5) + 1, i.e. 1..5.  */
+
+    #pragma acc loop gang(static:*)
+    for (j = 0; j < 32; j++)
+      switch (n[j])  /* Case N adds N to every element of row j.  */
+	{
+	case 1:
+	  #pragma acc loop vector
+	  for (k = 0; k < 32; k++)
+	    arr[j * 32 + k] += 1;
+	  break;
+
+	case 2:
+	  #pragma acc loop vector
+	  for (k = 0; k < 32; k++)
+	    arr[j * 32 + k] += 2;
+	  break;
+
+	case 3:
+	  #pragma acc loop vector
+	  for (k = 0; k < 32; k++)
+	    arr[j * 32 + k] += 3;
+	  break;
+
+	case 4:
+	  #pragma acc loop vector
+	  for (k = 0; k < 32; k++)
+	    arr[j * 32 + k] += 4;
+	  break;
+
+	case 5:
+	  #pragma acc loop vector
+	  for (k = 0; k < 32; k++)
+	    arr[j * 32 + k] += 5;
+	  break;
+
+	default:
+	  abort ();  /* Not expected: n[j] should be 1..5 at this point.  */
+	}
+
+    #pragma acc loop gang(static:*)
+    for (j = 0; j < 32; j++)
+      n[j]++;  /* Second increment, giving (j % 5) + 2.  */
+  }
+
+  for (i = 0; i < 32; i++)
+    assert (n[i] == (i % 5) + 2);
+
+  for (i = 0; i < 1024; i++)
+    assert (arr[i] == ((i / 32) % 5) + 1);  /* i / 32 is the row index j.  */
+}
+
+
 /* Test trivial operation of vector-single mode.  */
 
 void t7()
@@ -381,6 +505,100 @@  void t13()
 }
 
 
+/* Test condition in worker-partitioned mode.  */
+
+void t14()
+{  /* Even worker iterations add 1 to their 32-element row, odd ones add 2.  */
+  int arr[32 * 32 * 8], i;
+
+  for (i = 0; i < 32 * 32 * 8; i++)
+    arr[i] = i;  /* Each element starts out equal to its index.  */
+
+  #pragma acc parallel copy(arr) \
+		       num_gangs(8) num_workers(8) vector_length(32)
+  {
+    int j;
+    #pragma acc loop gang
+    for (j = 0; j < 32; j++)
+      {
+	int k;
+	#pragma acc loop worker
+	for (k = 0; k < 8; k++)
+	  {
+	    int m;
+	    if ((k % 2) == 0)  /* The branch is chosen per worker iteration.  */
+	      {
+		#pragma acc loop vector
+		for (m = 0; m < 32; m++)
+		  arr[j * 32 * 8 + k * 32 + m]++;
+	      }
+	    else
+	      {
+		#pragma acc loop vector
+		for (m = 0; m < 32; m++)
+		  arr[j * 32 * 8 + k * 32 + m] += 2;
+	      }
+	  }
+      }
+  }
+
+  for (i = 0; i < 32 * 32 * 8; i++)
+    assert (arr[i] == i + ((i / 32) % 2) + 1);  /* i / 32 == j * 8 + k, same parity as k.  */
+}
+
+
+/* Test switch in worker-partitioned mode.  */
+
+void t15()
+{  /* Each worker iteration adds 1, 2 or 3 to its row, chosen by a switch.  */
+  int arr[32 * 32 * 8], i;
+
+  for (i = 0; i < 32 * 32 * 8; i++)
+    arr[i] = i;  /* Each element starts out equal to its index.  */
+
+  #pragma acc parallel copy(arr) \
+		       num_gangs(8) num_workers(8) vector_length(32)
+  {
+    int j;
+    #pragma acc loop gang
+    for (j = 0; j < 32; j++)
+      {
+	int k;
+	#pragma acc loop worker
+	for (k = 0; k < 8; k++)
+	  {
+	    int m;
+	    switch ((j * 32 + k) % 3)  /* Case N adds N + 1 to the row.  */
+	    {
+	    case 0:
+	      #pragma acc loop vector
+	      for (m = 0; m < 32; m++)
+		arr[j * 32 * 8 + k * 32 + m]++;
+	      break;
+
+	    case 1:
+	      #pragma acc loop vector
+	      for (m = 0; m < 32; m++)
+		arr[j * 32 * 8 + k * 32 + m] += 2;
+	      break;
+
+	    case 2:
+	      #pragma acc loop vector
+	      for (m = 0; m < 32; m++)
+		arr[j * 32 * 8 + k * 32 + m] += 3;
+	      break;
+
+	    default: ;  /* Empty: no action for any other value.  */
+	    }
+	  }
+      }
+  }
+
+  for (i = 0; i < 32 * 32 * 8; i++)  /* i / 32 == j * 8 + k; 32 and 8 are congruent mod 3.  */
+    assert (arr[i] == i + ((i / 32) % 3) + 1);
+}
+
+
 /* Test worker-single/worker-partitioned transitions.  */
 
 void t16()
@@ -790,6 +1008,53 @@  void t25()
 }
 
 
+/* Test multiple conditional vector-partitioned loops in worker-single
+   mode.  */
+
+void t26()
+{  /* Rows with j % 3 == 0 gain 3, j % 3 == 1 gain 7, others are untouched.  */
+  int arr[32 * 32], i;
+
+  for (i = 0; i < 32 * 32; i++)
+    arr[i] = i;  /* Each element starts out equal to its index.  */
+
+  #pragma acc parallel copy(arr) \
+		       num_gangs(8) num_workers(8) vector_length(32)
+  {
+    int j;
+    #pragma acc loop gang
+    for (j = 0; j < 32; j++)
+      {
+	int k;
+	if ((j % 3) == 0)  /* No worker loop here, only gang and vector.  */
+	  {
+	    #pragma acc loop vector
+	    for (k = 0; k < 32; k++)
+	      {
+		#pragma acc atomic
+		arr[j * 32 + k] += 3;
+	      }
+	  }
+	else if ((j % 3) == 1)  /* j % 3 == 2 falls through with no update.  */
+	  {
+	    #pragma acc loop vector
+	    for (k = 0; k < 32; k++)
+	      {
+		#pragma acc atomic
+		arr[j * 32 + k] += 7;
+	      }
+	  }
+      }
+  }
+
+  for (i = 0; i < 32 * 32; i++)
+    {
+      int j = (i / 32) % 3;  /* Row index modulo 3, as tested in the region.  */
+      assert (arr[i] == i + ((j == 0) ? 3 : (j == 1) ? 7 : 0));
+    }
+}
+
+
 /* Test worker-single, vector-partitioned, gang-redundant mode.  */
 
 #define ACTUAL_GANGS 8
@@ -869,8 +1134,10 @@  int main()
 {
   t1();
   t2();
+  t3();
   t4();
   t5();
+  t6();
   t7();
   t8();
   t9();
@@ -878,6 +1145,8 @@  int main()
   t11();
   t12();
   t13();
+  t14();
+  t15();
   t16();
   t17();
   t18();
@@ -888,6 +1157,7 @@  int main()
   t23();
   t24();
   t25();
+  t26();
   t27();
   t28();
 
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-loop-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-loop-1.c
new file mode 100644
index 0000000..4bc7141
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-loop-1.c
@@ -0,0 +1,38 @@ 
+/* { dg-do run } */
+
+#include <stdlib.h>
+
+#define PK parallel
+#define M(x, y, z) O(x, y, z)
+#define O(x, y, z) x ## _ ## y ## _ ## z
+
+#define F
+#define G none
+#define L
+#include "parallel-loop-1.h"
+#undef L
+#undef F
+#undef G
+
+#define F num_gangs (10)
+#define G gangs
+#define L gang
+#include "parallel-loop-1.h"
+#undef L
+#undef F
+#undef G
+
+int
+main ()
+{  /* Abort if any of the eight test functions returns nonzero.  */
+  if (test_none_none ()  /* NOTE(review): test_* definitions are not visible here.  */
+      || test_none_auto ()  /* Presumably generated via the N()/M() macros.  */
+      || test_none_independent ()
+      || test_none_seq ()
+      || test_gangs_none ()
+      || test_gangs_auto ()
+      || test_gangs_independent ()
+      || test_gangs_seq ())  /* '||' stops at the first nonzero result.  */
+    abort ();
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-loop-1.h libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-loop-1.h
new file mode 100644
index 0000000..fd83dd4
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-loop-1.h
@@ -0,0 +1,20 @@ 
+#define S
+#define N(x) M(x, G, none)
+#include "parallel-loop-2.h"
+#undef S
+#undef N
+#define S auto
+#define N(x) M(x, G, auto)
+#include "parallel-loop-2.h"
+#undef S
+#undef N
+#define S independent
+#define N(x) M(x, G, independent)
+#include "parallel-loop-2.h"
+#undef S
+#undef N
+#define S seq
+#define N(x) M(x, G, seq)
+#include "parallel-loop-2.h"
+#undef S
+#undef N
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-loop-2.h libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-loop-2.h
new file mode 100644
index 0000000..5691b7e
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-loop-2.h
@@ -0,0 +1,280 @@ 
+#ifndef VARS
+#define VARS
+int a[1500];
+float b[10][15][10];
+#pragma acc routine
+__attribute__((noreturn)) void
+noreturn (void)
+{
+  for (;;);
+}
+#endif
+#ifndef SC
+#define SC
+#endif
+
+__attribute__((noinline, noclone)) void
+N(f0) (void)
+{
+  int i;
+#pragma acc PK loop L F
+  for (i = 0; i < 1500; i++)
+    a[i] += 2;
+}
+
+__attribute__((noinline, noclone)) void
+N(f1) (void)
+{
+#pragma acc PK loop L F
+  for (unsigned int i = __INT_MAX__; i < 3000U + __INT_MAX__; i += 2)
+    a[(i - __INT_MAX__) >> 1] -= 2;
+}
+
+__attribute__((noinline, noclone)) void
+N(f2) (void)
+{
+  unsigned long long i;
+#pragma acc PK loop L F
+  for (i = __LONG_LONG_MAX__ + 4500ULL - 27;
+       i > __LONG_LONG_MAX__ - 27ULL; i -= 3)
+    a[(i + 26LL - __LONG_LONG_MAX__) / 3] -= 4;
+}
+
+__attribute__((noinline, noclone)) void
+N(f3) (long long n1, long long n2, long long s3)
+{
+#pragma acc PK loop L F
+  for (long long i = n1 + 23; i > n2 - 25; i -= s3)
+    a[i + 48] += 7;
+}
+
+__attribute__((noinline, noclone)) void
+N(f4) (void)
+{
+  unsigned int i;
+#pragma acc PK loop L F
+  for (i = 30; i < 20; i += 2)
+    a[i] += 10;
+}
+
+__attribute__((noinline, noclone)) void
+N(f5) (int n11, int n12, int n21, int n22, int n31, int n32,
+       int s1, int s2, int s3)
+{
+  SC int v1, v2, v3;
+#pragma acc PK loop L F
+  for (v1 = n11; v1 < n12; v1 += s1)
+#pragma acc loop S
+    for (v2 = n21; v2 < n22; v2 += s2)
+      for (v3 = n31; v3 < n32; v3 += s3)
+	b[v1][v2][v3] += 2.5;
+}
+
+__attribute__((noinline, noclone)) void
+N(f6) (int n11, int n12, int n21, int n22, long long n31, long long n32,
+       int s1, int s2, long long int s3)
+{
+  SC int v1, v2;
+  SC long long v3;
+#pragma acc PK loop L F
+  for (v1 = n11; v1 > n12; v1 += s1)
+#pragma acc loop S
+    for (v2 = n21; v2 > n22; v2 += s2)
+      for (v3 = n31; v3 > n32; v3 += s3)
+	b[v1][v2 / 2][v3] -= 4.5;
+}
+
+__attribute__((noinline, noclone)) void
+N(f7) (void)
+{
+  SC unsigned int v1, v3;
+  SC unsigned long long v2;
+#pragma acc PK loop L F
+  for (v1 = 0; v1 < 20; v1 += 2)
+#pragma acc loop S
+    for (v2 = __LONG_LONG_MAX__ + 16ULL;
+	 v2 > __LONG_LONG_MAX__ - 29ULL; v2 -= 3)
+      for (v3 = 10; v3 > 0; v3--)
+	b[v1 >> 1][(v2 - __LONG_LONG_MAX__ + 64) / 3 - 12][v3 - 1] += 5.5;
+}
+
+__attribute__((noinline, noclone)) void
+N(f8) (void)
+{
+  SC long long v1, v2, v3;
+#pragma acc PK loop L F
+  for (v1 = 0; v1 < 20; v1 += 2)
+#pragma acc loop S
+    for (v2 = 30; v2 < 20; v2++)
+      for (v3 = 10; v3 < 0; v3--)
+	b[v1][v2][v3] += 5.5;
+}
+
+__attribute__((noinline, noclone)) void
+N(f9) (void)
+{
+  int i;
+#pragma acc PK loop L F
+  for (i = 20; i < 10; i++)
+    {
+      a[i] += 2;
+      noreturn ();
+      a[i] -= 4;
+    }
+}
+
+__attribute__((noinline, noclone)) void
+N(f10) (void)
+{
+  SC int i;
+#pragma acc PK loop L F
+  for (i = 0; i < 10; i++)
+#pragma acc loop S
+    for (int j = 10; j < 8; j++)
+      for (long k = -10; k < 10; k++)
+	{
+	  b[i][j][k] += 4;
+	  noreturn ();
+	  b[i][j][k] -= 8;
+	}
+}
+
+__attribute__((noinline, noclone)) void
+N(f11) (int n)
+{
+  int i;
+#pragma acc PK loop L F
+  for (i = 20; i < n; i++)
+    {
+      a[i] += 8;
+      noreturn ();
+      a[i] -= 16;
+    }
+}
+
+__attribute__((noinline, noclone)) void
+N(f12) (int n)
+{
+  SC int i;
+#pragma acc PK loop L F
+  for (i = 0; i < 10; i++)
+#pragma acc loop S
+    for (int j = n; j < 8; j++)
+      for (long k = -10; k < 10; k++)
+	{
+	  b[i][j][k] += 16;
+	  noreturn ();
+	  b[i][j][k] -= 32;
+	}
+}
+
+__attribute__((noinline, noclone)) void
+N(f13) (void)
+{
+  int *i;
+#pragma acc PK loop L F
+  for (i = a; i < &a[1500]; i++)
+    i[0] += 2;
+}
+
+__attribute__((noinline, noclone)) void
+N(f14) (void)
+{
+  SC float *i;
+#pragma acc PK loop L F
+  for (i = &b[0][0][0]; i < &b[0][0][10]; i++)
+#pragma acc loop S
+    for (float *j = &b[0][15][0]; j > &b[0][0][0]; j -= 10)
+      for (float *k = &b[0][0][10]; k > &b[0][0][0]; --k)
+	b[i - &b[0][0][0]][(j - &b[0][0][0]) / 10 - 1][(k - &b[0][0][0]) - 1]
+	  -= 3.5;
+}
+
+__attribute__((noinline, noclone)) int
+N(test) (void)
+{
+  int i, j, k;
+  for (i = 0; i < 1500; i++)
+    a[i] = i - 25;
+  N(f0) ();
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 23)
+      return 1;
+  N(f1) ();
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 25)
+      return 1;
+  N(f2) ();
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 29)
+      return 1;
+  N(f3) (1500LL - 1 - 23 - 48, -1LL + 25 - 48, 1LL);
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 22)
+      return 1;
+  N(f3) (1500LL - 1 - 23 - 48, 1500LL - 1, 7LL);
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 22)
+      return 1;
+  N(f4) ();
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 22)
+      return 1;
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	b[i][j][k] = i - 2.5 + 1.5 * j - 1.5 * k;
+  N(f5) (0, 10, 0, 15, 0, 10, 1, 1, 1);
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	if (b[i][j][k] != i + 1.5 * j - 1.5 * k)
+	  return 1;
+  N(f5) (0, 10, 30, 15, 0, 10, 4, 5, 6);
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	if (b[i][j][k] != i + 1.5 * j - 1.5 * k)
+	  return 1;
+  N(f6) (9, -1, 29, 0, 9, -1, -1, -2, -1);
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	if (b[i][j][k] != i - 4.5 + 1.5 * j - 1.5 * k)
+	  return 1;
+  N(f7) ();
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	if (b[i][j][k] != i + 1.0 + 1.5 * j - 1.5 * k)
+	  return 1;
+  N(f8) ();
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	if (b[i][j][k] != i + 1.0 + 1.5 * j - 1.5 * k)
+	  return 1;
+  N(f9) ();
+  N(f10) ();
+  N(f11) (10);
+  N(f12) (12);
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 22)
+      return 1;
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	if (b[i][j][k] != i + 1.0 + 1.5 * j - 1.5 * k)
+	  return 1;
+  N(f13) ();
+  N(f14) ();
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 20)
+      return 1;
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	if (b[i][j][k] != i - 2.5 + 1.5 * j - 1.5 * k)
+	  return 1;
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-fortran/cublas-fixed.h libgomp/testsuite/libgomp.oacc-fortran/cublas-fixed.h
new file mode 100644
index 0000000..4a5f61a
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/cublas-fixed.h
@@ -0,0 +1,16 @@ 
+! CUDA BLAS interface binding for SAXPY.
+      
+      use iso_c_binding
+      interface
+        subroutine cublassaxpy(N, alpha, x, incx, y, incy)
+     1    bind(c, name="cublasSaxpy")
+          use iso_c_binding
+          integer(kind=c_int), value :: N
+          real(kind=c_float), value :: alpha
+          type(*), dimension(*) :: x
+          integer(kind=c_int), value :: incx
+          type(*), dimension(*) :: y
+          integer(kind=c_int), value :: incy
+        end subroutine cublassaxpy
+      end interface
+
diff --git libgomp/testsuite/libgomp.oacc-fortran/data-1.f90 libgomp/testsuite/libgomp.oacc-fortran/data-1.f90
index f4e9053..bf323b3 100644
--- libgomp/testsuite/libgomp.oacc-fortran/data-1.f90
+++ libgomp/testsuite/libgomp.oacc-fortran/data-1.f90
@@ -1,45 +1,212 @@ 
 ! { dg-do run }
+! { dg-additional-options "-cpp" }
 
-program test
-  integer, parameter :: N = 8
-  real, allocatable :: a(:), b(:)
+function is_mapped (n) result (rc)
+  use openacc
 
-  allocate (a(N))
-  allocate (b(N))
+  integer, intent (in) :: n
+  logical rc
 
-  a(:) = 3.0
-  b(:) = 0.0
+#if ACC_MEM_SHARED
+  integer i
 
-  !$acc enter data copyin (a(1:N), b(1:N))
+  rc = .TRUE.
+  i = n
+#else
+  rc = acc_is_present (n, sizeof (n))
+#endif
 
-  !$acc parallel
-  do i = 1, n
-    b(i) = a (i)
-  end do
-  !$acc end parallel
+end function is_mapped
 
-  !$acc exit data copyout (a(1:N), b(1:N))
+program main
+  integer i, j
+  logical is_mapped
 
-  do i = 1, n
-    if (a(i) .ne. 3.0) STOP 1
-    if (b(i) .ne. 3.0) STOP 2
-  end do
+  i = -1
+  j = -2
 
-  a(:) = 5.0
-  b(:) = 1.0
+  !$acc data copyin (i, j)
+    if (is_mapped (i) .eqv. .FALSE.) call abort
+    if (is_mapped (j) .eqv. .FALSE.) call abort
 
-  !$acc enter data copyin (a(1:N), b(1:N))
+    if (i .ne. -1 .or. j .ne. -2) call abort
 
-  !$acc parallel
-  do i = 1, n
-    b(i) = a (i)
-  end do
-  !$acc end parallel
+    i = 2
+    j = 1
 
-  !$acc exit data copyout (a(1:N), b(1:N))
+    if (i .ne. 2 .or. j .ne. 1) call abort
+  !$acc end data
 
-  do i = 1, n
-    if (a(i) .ne. 5.0) STOP 3
-    if (b(i) .ne. 5.0) STOP 4
-  end do
-end program test
+  if (i .ne. 2 .or. j .ne. 1) call abort
+
+  i = -1
+  j = -2
+
+  !$acc data copyout (i, j)
+    if (is_mapped (i) .eqv. .FALSE.) call abort
+    if (is_mapped (j) .eqv. .FALSE.) call abort
+
+    if (i .ne. -1 .or. j .ne. -2) call abort
+
+    i = 2
+    j = 1
+
+    if (i .ne. 2 .or. j .ne. 1) call abort
+
+    !$acc parallel present (i, j)
+      i = 4
+      j = 2
+    !$acc end parallel
+  !$acc end data
+
+  if (i .ne. 4 .or. j .ne. 2) call abort
+
+  i = -1
+  j = -2
+
+  !$acc data create (i, j)
+    if (is_mapped (i) .eqv. .FALSE.) call abort
+    if (is_mapped (j) .eqv. .FALSE.) call abort
+
+    if (i .ne. -1 .or. j .ne. -2) call abort
+
+    i = 2
+    j = 1
+
+    if (i .ne. 2 .or. j .ne. 1) call abort
+  !$acc end data
+
+  if (i .ne. 2 .or. j .ne. 1) call abort
+
+  i = -1
+  j = -2
+
+  !$acc data present_or_copyin (i, j)
+    if (is_mapped (i) .eqv. .FALSE.) call abort
+    if (is_mapped (j) .eqv. .FALSE.) call abort
+
+    if (i .ne. -1 .or. j .ne. -2) call abort
+
+    i = 2
+    j = 1
+
+    if (i .ne. 2 .or. j .ne. 1) call abort
+  !$acc end data
+
+  if (i .ne. 2 .or. j .ne. 1) call abort
+
+  i = -1
+  j = -2
+
+  !$acc data present_or_copyout (i, j)
+    if (is_mapped (i) .eqv. .FALSE.) call abort
+    if (is_mapped (j) .eqv. .FALSE.) call abort
+
+    if (i .ne. -1 .or. j .ne. -2) call abort
+
+    i = 2
+    j = 1
+
+    if (i .ne. 2 .or. j .ne. 1) call abort
+
+    !$acc parallel present (i, j)
+      i = 4
+      j = 2
+    !$acc end parallel
+  !$acc end data
+
+  if (i .ne. 4 .or. j .ne. 2) call abort
+
+  i = -1
+  j = -2
+
+  !$acc data present_or_copy (i, j)
+    if (is_mapped (i) .eqv. .FALSE.) call abort
+    if (is_mapped (j) .eqv. .FALSE.) call abort
+
+    if (i .ne. -1 .or. j .ne. -2) call abort
+
+    i = 2
+    j = 1
+
+    if (i .ne. 2 .or. j .ne. 1) call abort
+  !$acc end data
+
+#if ACC_MEM_SHARED
+  if (i .ne. 2 .or. j .ne. 1) call abort
+#else
+  if (i .ne. -1 .or. j .ne. -2) call abort
+#endif
+
+  i = -1
+  j = -2
+
+  !$acc data present_or_create (i, j)
+    if (is_mapped (i) .eqv. .FALSE.) call abort
+    if (is_mapped (j) .eqv. .FALSE.) call abort
+
+    i = 2
+    j = 1
+
+    if (i .ne. 2 .or. j .ne. 1) call abort
+  !$acc end data
+
+  if (i .ne. 2 .or. j .ne. 1) call abort
+
+  i = -1
+  j = -2
+
+  !$acc data copyin (i, j)
+    !$acc data present (i, j)
+      if (is_mapped (i) .eqv. .FALSE.) call abort
+      if (is_mapped (j) .eqv. .FALSE.) call abort
+
+      if (i .ne. -1 .or. j .ne. -2) call abort
+
+      i = 2
+      j = 1
+
+      if (i .ne. 2 .or. j .ne. 1) call abort
+    !$acc end data
+  !$acc end data
+
+  if (i .ne. 2 .or. j .ne. 1) call abort
+
+  i = -1
+  j = -2
+
+  !$acc data copyin (i, j)
+    !$acc data present (i, j)
+      if (is_mapped (i) .eqv. .FALSE.) call abort
+      if (is_mapped (j) .eqv. .FALSE.) call abort
+
+      if (i .ne. -1 .or. j .ne. -2) call abort
+
+      i = 2
+      j = 1
+
+      if (i .ne. 2 .or. j .ne. 1) call abort
+    !$acc end data
+  !$acc end data
+
+  if (i .ne. 2 .or. j .ne. 1) call abort
+
+  i = -1
+  j = -2
+
+  !$acc data
+#if !ACC_MEM_SHARED
+    if (is_mapped (i) .eqv. .TRUE.) call abort
+    if (is_mapped (j) .eqv. .TRUE.) call abort
+#endif
+    if (i .ne. -1 .or. j .ne. -2) call abort
+
+    i = 2
+    j = 1
+
+    if (i .ne. 2 .or. j .ne. 1) call abort
+  !$acc end data
+
+  if (i .ne. 2 .or. j .ne. 1) call abort
+
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/data-2.f90 libgomp/testsuite/libgomp.oacc-fortran/data-2.f90
index 22525b8..83a5400 100644
--- libgomp/testsuite/libgomp.oacc-fortran/data-2.f90
+++ libgomp/testsuite/libgomp.oacc-fortran/data-2.f90
@@ -1,8 +1,14 @@ 
 ! { dg-do run }
 
 program test
+  use openacc
   integer, parameter :: N = 8
   real, allocatable :: a(:,:), b(:,:)
+  real, allocatable :: c(:), d(:)
+  integer i, j
+
+  i = 0
+  j = 0
 
   allocate (a(N,N))
   allocate (b(N,N))
@@ -28,4 +34,48 @@  program test
       if (b(j,i) .ne. 3.0) STOP 2
     end do
   end do
+
+  allocate (c(N))
+  allocate (d(N))
+
+  c(:) = 3.0
+  d(:) = 0.0
+
+  !$acc enter data copyin (c(1:N)) create (d(1:N)) async
+  !$acc wait
+  
+  !$acc parallel 
+    do i = 1, N
+      d(i) = c(i) + 1
+    end do
+  !$acc end parallel
+
+  !$acc exit data copyout (c(1:N), d(1:N)) async
+  !$acc wait
+
+  do i = 1, N
+    if (d(i) .ne. 4.0) STOP 3  ! standard STOP with a unique code, like the earlier checks in this file
+  end do
+
+  c(:) = 3.0
+  d(:) = 0.0
+
+  !$acc enter data copyin (c(1:N)) async
+  !$acc enter data create (d(1:N)) wait
+  !$acc wait
+
+  !$acc parallel 
+    do i = 1, N
+      d(i) = c(i) + 1
+    end do
+  !$acc end parallel
+  
+  !$acc exit data copyout (d(1:N)) async
+  !$acc exit data async
+  !$acc wait
+
+  do i = 1, N
+    if (d(i) .ne. 4.0) STOP 4  ! standard STOP with a unique code, like the earlier checks in this file
+  end do
+
 end program test
diff --git libgomp/testsuite/libgomp.oacc-fortran/dummy-array.f90 libgomp/testsuite/libgomp.oacc-fortran/dummy-array.f90
new file mode 100644
index 0000000..e95563c
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/dummy-array.f90
@@ -0,0 +1,28 @@ 
+! Ensure that dummy arrays are transferred to the accelerator
+! via an implicit pcopy.
+
+! { dg-do run } 
+
+program main
+  integer, parameter :: n = 1000
+  integer :: a(n)
+  integer :: i
+
+  a(:) = -1
+
+  call dummy_array (a, n)
+  
+  do i = 1, n
+     if (a(i) .ne. i) call abort
+  end do
+end program main
+
+subroutine dummy_array (a, n)
+  integer a(n)
+
+  !$acc parallel loop num_gangs (100) gang
+  do i = 1, n
+     a(i) = i
+  end do
+  !$acc end parallel loop
+end subroutine
diff --git libgomp/testsuite/libgomp.oacc-fortran/host_data-2.f90 libgomp/testsuite/libgomp.oacc-fortran/host_data-2.f90
new file mode 100644
index 0000000..ff09218
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/host_data-2.f90
@@ -0,0 +1,98 @@ 
+! Test host_data interoperability with CUDA blas.  This test was
+! derived from libgomp.oacc-c-c++-common/host_data-1.c.
+
+! { dg-do run { target openacc_nvidia_accel_selected } }
+! { dg-additional-options "-lcublas -Wall -Wextra" }
+
+program test
+  implicit none
+
+  integer, parameter :: N = 10
+  integer :: i
+  real*4 :: x_ref(N), y_ref(N), x(N), y(N), a
+  
+  interface
+     subroutine cublassaxpy(N, alpha, x, incx, y, incy) bind(c, name="cublasSaxpy")
+       use iso_c_binding
+       integer(kind=c_int), value :: N
+       real(kind=c_float), value :: alpha
+       type(*), dimension(*) :: x
+       integer(kind=c_int), value :: incx
+       type(*), dimension(*) :: y
+       integer(kind=c_int), value :: incy
+     end subroutine cublassaxpy
+  end interface
+
+  a = 2.0
+
+  do i = 1, N
+     x(i) = 4.0 * i
+     y(i) = 3.0
+     x_ref(i) = x(i)
+     y_ref(i) = y(i)
+  end do
+
+  call saxpy (N, a, x_ref, y_ref)
+
+  !$acc data copyin (x) copy (y)
+  !$acc host_data use_device (x, y)
+  call cublassaxpy(N, a, x, 1, y, 1)
+  !$acc end host_data
+  !$acc end data
+
+  call validate_results (N, y, y_ref)
+
+  !$acc data create (x) copyout (y)
+  !$acc parallel loop
+  do i = 1, N
+     x(i) = 4.0 * i; y(i) = 3.0  ! create (x) allocates without copying, so fill x on the device too
+  end do
+  !$acc end parallel loop
+
+  !$acc host_data use_device (x, y)
+  call cublassaxpy(N, a, x, 1, y, 1)
+  !$acc end host_data
+  !$acc end data
+
+  call validate_results (N, y, y_ref)
+
+  y(:) = 3.0
+
+  !$acc data copyin (x) copyin (a) copy (y)
+  !$acc parallel present (x) pcopy (y) present (a)
+  call saxpy (N, a, x, y)
+  !$acc end parallel
+  !$acc end data
+
+  call validate_results (N, y, y_ref)
+
+  y(:) = 3.0
+
+  !$acc enter data copyin (x, a, y)
+  !$acc parallel present (x) pcopy (y) present (a)
+  call saxpy (N, a, x, y)
+  !$acc end parallel
+  !$acc exit data delete (x, a) copyout (y)
+
+  call validate_results (N, y, y_ref)
+end program test
+
+subroutine saxpy (nn, aa, xx, yy)
+  integer :: nn
+  real*4 :: aa, xx(nn), yy(nn)
+  integer i
+  !$acc routine
+
+  do i = 1, nn
+    yy(i) = yy(i) + aa * xx(i)
+  end do
+end subroutine saxpy
+
+subroutine validate_results (n, a, b)
+  integer :: n
+  real*4 :: a(n), b(n)
+
+  do i = 1, N
+     if (abs(a(i) - b(i)) > 0.0001) call abort
+  end do
+end subroutine validate_results
diff --git libgomp/testsuite/libgomp.oacc-fortran/host_data-3.f libgomp/testsuite/libgomp.oacc-fortran/host_data-3.f
new file mode 100644
index 0000000..05ed949
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/host_data-3.f
@@ -0,0 +1,85 @@ 
+! Fixed-mode host_data interaction with CUDA BLAS.
+
+! { dg-do run { target openacc_nvidia_accel_selected } }
+! { dg-additional-options "-lcublas -Wall -Wextra" }
+
+      include "cublas-fixed.h"
+
+      integer, parameter :: N = 10
+      integer :: i
+      real*4 :: x_ref(N), y_ref(N), x(N), y(N), a
+
+      a = 2.0
+
+      do i = 1, N
+         x(i) = 4.0 * i
+         y(i) = 3.0
+         x_ref(i) = x(i)
+         y_ref(i) = y(i)
+      end do
+
+      call saxpy (N, a, x_ref, y_ref)
+  
+!$acc data copyin (x) copy (y)
+!$acc host_data use_device (x, y)
+      call cublassaxpy(N, a, x, 1, y, 1)
+!$acc end host_data
+!$acc end data
+
+      call validate_results (N, y, y_ref)
+
+!$acc data create (x) copyout (y)
+!$acc parallel loop
+      do i = 1, N
+         x(i) = 4.0 * i; y(i) = 3.0 ! create (x) does not copy data
+      end do
+!$acc end parallel loop
+
+!$acc host_data use_device (x, y)
+      call cublassaxpy(N, a, x, 1, y, 1)
+!$acc end host_data
+!$acc end data
+
+      call validate_results (N, y, y_ref)
+
+      y(:) = 3.0
+  
+!$acc data copyin (x) copyin (a) copy (y)
+!$acc parallel present (x) pcopy (y) present (a)
+      call saxpy (N, a, x, y)
+!$acc end parallel
+!$acc end data
+
+      call validate_results (N, y, y_ref)
+
+      y(:) = 3.0
+  
+!$acc enter data copyin (x, a, y)
+!$acc parallel present (x) pcopy (y) present (a)
+      call saxpy (N, a, x, y)
+!$acc end parallel
+!$acc exit data delete (x, a) copyout (y)
+
+      call validate_results (N, y, y_ref)
+      end
+
+      subroutine saxpy (nn, aa, xx, yy)
+      integer :: nn
+      real*4 :: aa, xx(nn), yy(nn)
+      integer i
+!$acc routine
+
+      do i = 1, nn
+         yy(i) = yy(i) + aa * xx(i)
+      end do
+      end subroutine saxpy
+
+      subroutine validate_results (n, a, b)
+      integer :: n
+      real*4 :: a(n), b(n)
+
+      do i = 1, N
+         if (abs(a(i) - b(i)) > 0.0001) call abort
+      end do
+      end subroutine validate_results
+
diff --git libgomp/testsuite/libgomp.oacc-fortran/host_data-4.f90 libgomp/testsuite/libgomp.oacc-fortran/host_data-4.f90
new file mode 100644
index 0000000..6e379b5
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/host_data-4.f90
@@ -0,0 +1,101 @@ 
+! Test host_data interoperability with CUDA blas using modules.
+
+! { dg-do run { target openacc_nvidia_accel_selected } }
+! { dg-additional-options "-lcublas -Wall -Wextra" }
+
+module cublas
+  interface
+     subroutine cublassaxpy(N, alpha, x, incx, y, incy) bind(c, name="cublasSaxpy")
+       use iso_c_binding
+       integer(kind=c_int), value :: N
+       real(kind=c_float), value :: alpha
+       type(*), dimension(*) :: x
+       integer(kind=c_int), value :: incx
+       type(*), dimension(*) :: y
+       integer(kind=c_int), value :: incy
+     end subroutine cublassaxpy
+  end interface
+
+contains
+  subroutine saxpy (nn, aa, xx, yy)
+    integer :: nn
+    real*4 :: aa, xx(nn), yy(nn)
+    integer i
+    !$acc routine
+
+    do i = 1, nn
+       yy(i) = yy(i) + aa * xx(i)
+    end do
+  end subroutine saxpy
+
+  subroutine validate_results (n, a, b)
+    integer :: n
+    real*4 :: a(n), b(n)
+
+    do i = 1, N
+       if (abs(a(i) - b(i)) > 0.0001) call abort
+    end do
+  end subroutine validate_results
+end module cublas
+
+program test
+  use cublas
+  implicit none
+
+  integer, parameter :: N = 10
+  integer :: i
+  real*4 :: x_ref(N), y_ref(N), x(N), y(N), a
+
+  a = 2.0
+
+  do i = 1, N
+     x(i) = 4.0 * i
+     y(i) = 3.0
+     x_ref(i) = x(i)
+     y_ref(i) = y(i)
+  end do
+
+  call saxpy (N, a, x_ref, y_ref)
+
+  !$acc data copyin (x) copy (y)
+  !$acc host_data use_device (x, y)
+  call cublassaxpy(N, a, x, 1, y, 1)
+  !$acc end host_data
+  !$acc end data
+
+  call validate_results (N, y, y_ref)
+
+  !$acc data create (x) copyout (y)
+  !$acc parallel loop
+  do i = 1, N
+     x(i) = 4.0 * i; y(i) = 3.0  ! create (x) allocates without copying, so fill x on the device too
+  end do
+  !$acc end parallel loop
+
+  !$acc host_data use_device (x, y)
+  call cublassaxpy(N, a, x, 1, y, 1)
+  !$acc end host_data
+  !$acc end data
+
+  call validate_results (N, y, y_ref)
+
+  y(:) = 3.0
+
+  !$acc data copyin (x) copyin (a) copy (y)
+  !$acc parallel present (x) pcopy (y) present (a)
+  call saxpy (N, a, x, y)
+  !$acc end parallel
+  !$acc end data
+
+  call validate_results (N, y, y_ref)
+
+  y(:) = 3.0
+
+  !$acc enter data copyin (x, a, y)
+  !$acc parallel present (x) pcopy (y) present (a)
+  call saxpy (N, a, x, y)
+  !$acc end parallel
+  !$acc exit data delete (x, a) copyout (y)
+
+  call validate_results (N, y, y_ref)
+end program test
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-acc-loop-reduction-2.f90 libgomp/testsuite/libgomp.oacc-fortran/kernels-acc-loop-reduction-2.f90
new file mode 100644
index 0000000..fdf9409
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-acc-loop-reduction-2.f90
@@ -0,0 +1,26 @@ 
+program foo
+
+  IMPLICIT NONE
+  INTEGER :: vol = 0
+
+  call bar (vol)
+
+  if (vol .ne. 4) call abort
+end program foo
+
+subroutine bar(vol)
+  IMPLICIT NONE
+
+  INTEGER :: vol  ! counter updated in place; incremented once per (k, j) iteration
+  INTEGER :: j,k
+
+  !$ACC KERNELS
+  !$ACC LOOP REDUCTION(+:vol)
+  DO k=1,2
+     !$ACC LOOP REDUCTION(+:vol)
+     DO j=1,2
+        vol = vol + 1
+     ENDDO
+  ENDDO
+  !$ACC END KERNELS
+end subroutine bar
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-acc-loop-reduction.f90 libgomp/testsuite/libgomp.oacc-fortran/kernels-acc-loop-reduction.f90
new file mode 100644
index 0000000..912a22b
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-acc-loop-reduction.f90
@@ -0,0 +1,21 @@ 
+program foo
+  IMPLICIT NONE
+  INTEGER :: vol = 0
+
+  call bar (vol)
+
+  if (vol .ne. 2) call abort
+end program foo
+
+subroutine bar(vol)
+  IMPLICIT NONE
+  INTEGER :: vol
+  INTEGER :: j
+
+  !$ACC KERNELS
+  !$ACC LOOP REDUCTION(+:vol)
+  DO j=1,2
+     vol = vol + 1
+  ENDDO
+  !$ACC END KERNELS
+end subroutine bar
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-collapse-3.f90 libgomp/testsuite/libgomp.oacc-fortran/kernels-collapse-3.f90
new file mode 100644
index 0000000..4ef99cd
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-collapse-3.f90
@@ -0,0 +1,30 @@ 
+! Test the collapse clause inside a kernels region.
+
+! { dg-do run }
+
+program collapse3
+  integer :: a(3,3,3), k, kk, kkk, l, ll, lll
+  !$acc kernels
+  !$acc loop collapse(3)
+    do 115 k=1,3
+dokk: do kk=1,3
+        do kkk=1,3
+          a(k,kk,kkk) = 1
+        enddo
+      enddo dokk
+115   continue
+  !$acc end kernels
+  if (any(a(1:3,1:3,1:3).ne.1)) call abort
+
+  !$acc kernels
+  !$acc loop collapse(3)
+dol: do 120 l=1,3
+doll: do ll=1,3
+        do lll=1,3
+          a(l,ll,lll) = 2
+        enddo
+      enddo doll
+120 end do dol
+  !$acc end kernels
+  if (any(a(1:3,1:3,1:3).ne.2)) call abort
+end program collapse3
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-collapse-4.f90 libgomp/testsuite/libgomp.oacc-fortran/kernels-collapse-4.f90
new file mode 100644
index 0000000..db382a7
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-collapse-4.f90
@@ -0,0 +1,41 @@ 
+! Test the collapse and reduction loop clauses inside a kernels region.
+
+! { dg-do run }
+
+program collapse4
+  integer :: i, j, k, a(1:7, -3:5, 12:19), b(1:7, -3:5, 12:19)
+  logical :: l, r
+  l = .false.
+  r = .false.
+  a(:, :, :) = 0
+  b(:, :, :) = 0
+  !$acc kernels
+  !$acc loop collapse (3) reduction (.or.:l)
+    do i = 2, 6
+      do j = -2, 4
+        do k = 13, 18
+          l = l.or.i.lt.2.or.i.gt.6.or.j.lt.-2.or.j.gt.4
+          l = l.or.k.lt.13.or.k.gt.18
+          if (.not.l) a(i, j, k) = a(i, j, k) + 1
+        end do
+      end do
+    end do
+  !$acc end kernels
+  do i = 2, 6
+    do j = -2, 4
+      do k = 13, 18
+        r = r.or.i.lt.2.or.i.gt.6.or.j.lt.-2.or.j.gt.4
+        r = r.or.k.lt.13.or.k.gt.18
+        if (.not.r) b(i, j, k) = b(i, j, k) + 1  ! host reference is guarded by r, not by the device result l
+      end do
+    end do
+  end do
+  if (l .neqv. r) call abort
+  do i = 2, 6
+    do j = -2, 4
+      do k = 13, 18
+         if (a(i, j, k) .ne. b(i, j, k)) call abort
+      end do
+    end do
+  end do
+end program collapse4
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-independent.f90 libgomp/testsuite/libgomp.oacc-fortran/kernels-independent.f90
new file mode 100644
index 0000000..a881fbb
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-independent.f90
@@ -0,0 +1,42 @@ 
+! { dg-do run }
+! { dg-additional-options "-cpp" }
+
+#define N (1024 * 512)
+
+subroutine foo (a,  b,  c)
+  integer, parameter :: n = N
+  integer, dimension (n) :: a
+  integer, dimension (n) :: b
+  integer, dimension (n) :: c
+  integer i, ii
+
+  do i = 1, n
+    a(i) = i * 2;
+  end do
+
+  do i = 1, n
+    b(i) = i * 4;
+  end do
+
+  !$acc kernels copyin (a(1:n), b(1:n)) copyout (c(1:n))
+    !$acc loop independent
+    do ii = 1, n
+      c(ii) = a(ii) + b(ii)
+    end do
+  !$acc end kernels
+
+  do i = 1, n
+    if (c(i) .ne. a(i) + b(i)) call abort
+  end do
+
+end subroutine
+
+program main
+  integer, parameter :: n = N
+  integer :: a(n)
+  integer :: b(n)
+  integer :: c(n)
+
+  call foo (a, b, c)
+
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-1.f90 libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-1.f90
new file mode 100644
index 0000000..edcdc56
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-1.f90
@@ -0,0 +1,66 @@ 
+! Exercise the auto, independent and seq loop clauses inside
+! kernels regions.
+
+! { dg-do run }
+
+program loops
+  integer, parameter     :: n = 20
+  integer                :: i, a(n), b(n)
+
+  a(:) = 0
+  b(:) = 0
+
+  ! COPY
+
+  !$acc kernels copy (a)
+  !$acc loop auto
+  do i = 1, n
+     a(i) = i
+  end do
+  !$acc end kernels
+
+  do i = 1, n
+     b(i) = i
+  end do
+
+  call check (a, b, n)
+
+  ! COPYOUT
+
+  a(:) = 0
+
+  !$acc kernels copyout (a)
+  !$acc loop independent
+  do i = 1, n
+     a(i) = i
+  end do
+  !$acc end kernels
+
+  do i = 1, n
+     if (a(i) .ne. b(i)) call abort
+  end do
+  call check (a, b, n)
+
+  ! COPYIN
+
+  a(:) = 0
+
+  !$acc kernels copyout (a) copyin (b)
+  !$acc loop seq
+  do i = 1, n
+     a(i) = b(i)
+  end do
+  !$acc end kernels
+
+  call check (a, b, n)
+
+end program loops
+
+subroutine check (a, b, n)
+  integer :: n, a(n), b(n)
+  integer :: i
+
+  do i = 1, n
+     if (a(i) .ne. b(i)) call abort
+  end do
+end subroutine check
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-map-1.f90 libgomp/testsuite/libgomp.oacc-fortran/kernels-map-1.f90
new file mode 100644
index 0000000..704ff62
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-map-1.f90
@@ -0,0 +1,116 @@ 
+! Test the copy, copyin, copyout, pcopy, pcopyin, pcopyout, and pcreate
+! clauses on kernels constructs.
+
+! { dg-do run }
+
+program map
+  integer, parameter     :: n = 20, c = 10
+  integer                :: i, a(n), b(n), d(n)
+
+  a(:) = 0
+  b(:) = 0
+
+  ! COPY
+
+  !$acc kernels copy (a)
+  !$acc loop
+  do i = 1, n
+     a(i) = i
+  end do
+  !$acc end kernels
+
+  do i = 1, n
+     b(i) = i
+  end do
+
+  call check (a, b, n)
+
+  ! COPYOUT
+
+  a(:) = 0
+
+  !$acc kernels copyout (a)
+  !$acc loop
+  do i = 1, n
+     a(i) = i
+  end do
+  !$acc end kernels
+
+  do i = 1, n
+     if (a(i) .ne. b(i)) call abort
+  end do
+  call check (a, b, n)
+
+  ! COPYIN
+
+  a(:) = 0
+
+  !$acc kernels copyout (a) copyin (b)
+  !$acc loop
+  do i = 1, n
+     a(i) = b(i)  ! read b on the device so that copyin (b) is actually exercised
+  end do
+  !$acc end kernels
+
+  call check (a, b, n)
+
+  ! PRESENT_OR_COPY
+
+  !$acc kernels pcopy (a)
+  !$acc loop
+  do i = 1, n
+     a(i) = i
+  end do
+  !$acc end kernels
+
+  call check (a, b, n)
+
+  ! PRESENT_OR_COPYOUT
+
+  a(:) = 0
+
+  !$acc kernels pcopyout (a)
+  !$acc loop
+  do i = 1, n
+     a(i) = i
+  end do
+  !$acc end kernels
+
+  call check (a, b, n)
+
+  ! PRESENT_OR_COPYIN
+
+  a(:) = 0
+
+  !$acc kernels pcopyout (a) pcopyin (b)
+  !$acc loop
+  do i = 1, n
+     a(i) = b(i)  ! read b on the device so that pcopyin (b) is actually exercised
+  end do
+  !$acc end kernels
+
+  call check (a, b, n)
+
+  ! PRESENT_OR_CREATE
+
+  a(:) = 0
+
+  !$acc kernels pcopyout (a) pcreate (d)
+  !$acc loop
+  do i = 1, n
+     d(i) = i
+     a(i) = d(i)
+  end do
+  !$acc end kernels
+
+  call check (a, b, n)
+end program map
+
+subroutine check (a, b, n)
+  integer :: n, a(n), b(n)
+  integer :: i
+
+  do i = 1, n
+     if (a(i) .ne. b(i)) call abort
+  end do
+end subroutine check
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-parallel-loop-data-enter-exit.f95 libgomp/testsuite/libgomp.oacc-fortran/kernels-parallel-loop-data-enter-exit.f95
new file mode 100644
index 0000000..fe1088c
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-parallel-loop-data-enter-exit.f95
@@ -0,0 +1,36 @@ 
+! { dg-do run }
+
+program main
+  implicit none
+  integer, parameter         :: n = 1024
+  integer, dimension (0:n-1) :: a, b, c
+  integer                    :: i, ii
+
+  !$acc enter data create (a(0:n-1), b(0:n-1), c(0:n-1))
+
+  !$acc kernels present (a(0:n-1))
+  do i = 0, n - 1
+     a(i) = i * 2
+  end do
+  !$acc end kernels
+
+  !$acc parallel present (b(0:n-1))
+  !$acc loop
+  do i = 0, n -1
+     b(i) = i * 4
+  end do
+  !$acc end parallel
+
+  !$acc kernels present (a(0:n-1), b(0:n-1), c(0:n-1))
+  do ii = 0, n - 1
+     c(ii) = a(ii) + b(ii)
+  end do
+  !$acc end kernels
+
+  !$acc exit data copyout (a(0:n-1), b(0:n-1), c(0:n-1))
+
+  do i = 0, n - 1
+     if (c(i) .ne. a(i) + b(i)) call abort
+  end do
+
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-1.f90 libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-1.f90
new file mode 100644
index 0000000..5119fab
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-1.f90
@@ -0,0 +1,23 @@ 
+! Test of gang-private variables declared on loop directive.
+
+! { dg-do run }
+
+program main
+  integer :: x, i, arr(32)
+
+  do i = 1, 32
+     arr(i) = i
+  end do
+
+  !$acc kernels copy(arr)
+  !$acc loop gang(num:32) private(x)
+  do i = 1, 32
+     x = i * 2;
+     arr(i) = arr(i) + x;
+  end do
+  !$acc end kernels
+
+  do i = 1, 32
+     if (arr(i) .ne. i * 3) call abort
+  end do
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-2.f90 libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-2.f90
new file mode 100644
index 0000000..5e46287
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-2.f90
@@ -0,0 +1,28 @@ 
+! Test of gang-private variables declared on loop directive, with broadcasting
+! to partitioned workers.
+
+! { dg-do run }
+
+program main
+  integer :: x, i, j, arr(0:32*32)
+
+  do i = 0, 32*32 -1
+     arr(i) = i
+  end do
+
+  !$acc kernels copy(arr)
+  !$acc loop gang(num:32) private(x)
+  do i = 0, 31
+     x = i * 2;
+
+     !$acc loop worker(num:32)
+     do j = 0, 31
+        arr(i * 32 + j) = arr(i * 32 + j) + x;
+     end do
+  end do
+  !$acc end kernels
+
+  do i = 0, 32 * 32 - 1
+     if (arr(i) .ne. i + (i / 32) * 2) call abort
+  end do
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-3.f90 libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-3.f90
new file mode 100644
index 0000000..5cc3378
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-3.f90
@@ -0,0 +1,28 @@ 
+! Test of gang-private variables declared on loop directive, with broadcasting
+! to partitioned vectors.
+
+! { dg-do run }
+
+program main
+  integer :: x, i, j, arr(0:32*32)  ! only elements 0 .. 32*32-1 are touched below
+
+  do i = 0, 32*32-1
+     arr(i) = i  ! host-side seed: arr(i) == i
+  end do
+
+  !$acc kernels copy(arr)
+  !$acc loop gang(num:32) private(x)
+  do i = 0, 31
+     x = i * 2;
+
+     !$acc loop vector(length:32)
+     do j = 0, 31
+        arr(i * 32 + j) = arr(i * 32 + j) + x;
+     end do
+  end do
+  !$acc end kernels
+
+  do i = 0, 32 * 32 - 1
+     if (arr(i) .ne. i + (i / 32) * 2) call abort  ! i / 32 is the outer index that set x
+  end do
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-6.f90 libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-6.f90
new file mode 100644
index 0000000..1e41555
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-6.f90
@@ -0,0 +1,36 @@ 
+! Test of gang-private addressable variable declared on loop directive, with
+! broadcasting to partitioned vectors.
+
+! { dg-do run }
+
+program main
+  type vec3
+     integer x, y, z, attr(13)  ! three scalar components plus an array component
+  end type vec3
+
+  integer x, i, j, arr(0:32*32)  ! the scalar x is never referenced in this program
+  type(vec3) pt
+  
+  do i = 0, 32*32-1
+     arr(i) = i  ! host-side seed: arr(i) == i
+  end do
+
+  !$acc kernels copy(arr)
+  !$acc loop gang(num:32) private(pt)
+  do i = 0, 31
+     pt%x = i
+     pt%y = i * 2
+     pt%z = i * 4
+     pt%attr(5) = i * 6  ! only element 5 of the array component is used
+
+     !$acc loop vector(length:32)
+     do j = 0, 31
+        arr(i * 32 + j) = arr(i * 32 + j) + pt%x + pt%y + pt%z + pt%attr(5);
+     end do
+  end do
+  !$acc end kernels
+
+  do i = 0, 32 * 32 - 1
+     if (arr(i) .ne. i + (i / 32) * 13) call abort  ! 1 + 2 + 4 + 6 = 13 times the outer index
+  end do
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-vector-1.f90 libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-vector-1.f90
new file mode 100644
index 0000000..3efd9fe
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-vector-1.f90
@@ -0,0 +1,41 @@ 
+! Test of vector-private variables declared on loop directive.
+
+! { dg-do run }
+
+program main
+  integer :: x, i, j, k, idx, arr(0:32*32*32)  ! only elements 0 .. 32*32*32-1 are touched
+
+  do i = 0, 32*32*32-1
+     arr(i) = i  ! host-side seed: arr(i) == i
+  end do
+
+  !$acc kernels copy(arr)
+  !$acc loop gang(num:32)
+  do i = 0, 31
+     !$acc loop worker(num:8)
+     do j = 0, 31
+        !$acc loop vector(length:32) private(x)
+        do k = 0, 31
+           x = ieor(i, j * 3)  ! first contribution: xor-based value
+           arr(i * 1024 + j * 32 + k) = arr(i * 1024 + j * 32 + k) + x * k
+        end do
+        !$acc loop vector(length:32) private(x)
+        do k = 0, 31
+           x = ior(i, j * 5)  ! second contribution: or-based value, same x reused
+           arr(i * 1024 + j * 32 + k) = arr(i * 1024 + j * 32 + k) + x * k
+        end do
+     end do
+  end do
+  !$acc end kernels
+
+  do i = 0, 32 - 1
+     do j = 0, 32 -1
+        do k = 0, 32 - 1
+           idx = i * 1024 + j * 32 + k  ! same flattening as used in the kernels region
+           if (arr(idx) .ne. idx + ieor(i, j * 3) * k + ior(i, j * 5) * k) then
+              call abort
+           end if
+        end do
+     end do
+  end do
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-vector-2.f90 libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-vector-2.f90
new file mode 100644
index 0000000..1cf3b98
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-vector-2.f90
@@ -0,0 +1,38 @@ 
+! Test of vector-private variables declared on loop directive. Array type.
+
+! { dg-do run }
+
+program main
+  integer :: i, j, k, idx, arr(0:32*32*32), pt(2)  ! pt is the array made vector-private
+
+  do i = 0, 32*32*32-1
+     arr(i) = i  ! host-side seed: arr(i) == i
+  end do
+
+  !$acc kernels copy(arr)
+  !$acc loop gang(num:32)
+  do i = 0, 31
+     !$acc loop worker(num:8)
+     do j = 0, 31
+        !$acc loop vector(length:32) private(pt)
+        do k = 0, 31
+           pt(1) = ieor(i, j * 3)  ! first contribution: xor-based value
+           pt(2) = ior(i, j * 5)  ! second contribution: or-based value
+           arr(i * 1024 + j * 32 + k) = arr(i * 1024 + j * 32 + k) + pt(1) * k
+           arr(i * 1024 + j * 32 + k) = arr(i * 1024 + j * 32 + k) + pt(2) * k
+        end do
+     end do
+  end do
+  !$acc end kernels
+
+  do i = 0, 32 - 1
+     do j = 0, 32 -1
+        do k = 0, 32 - 1
+           idx = i * 1024 + j * 32 + k  ! same flattening as used in the kernels region
+           if (arr(idx) .ne. idx + ieor(i, j * 3) * k + ior(i, j * 5) * k) then
+              call abort
+           end if
+        end do
+     end do
+  end do
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-1.f90 libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-1.f90
new file mode 100644
index 0000000..55e98e0
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-1.f90
@@ -0,0 +1,27 @@ 
+! Test of worker-private variables declared on a loop directive.
+
+! { dg-do run }
+
+program main
+  integer :: x, i, j, arr(0:32*32)  ! only elements 0 .. 32*32-1 are touched below
+  common x  ! x lives in blank common and is still named in the private clauses
+
+  do i = 0, 32*32-1
+     arr(i) = i  ! host-side seed: arr(i) == i
+  end do
+
+  !$acc kernels copy(arr)
+  !$acc loop gang(num:32) private(x)
+  do i = 0, 31
+     !$acc loop worker(num:8) private(x)
+     do j = 0, 31
+        x = ieor(i, j * 3)
+        arr(i * 32 + j) = arr(i * 32 + j) + x
+     end do
+  end do
+  !$acc end kernels
+
+  do i = 0, 32 * 32 - 1
+     if (arr(i) .ne. i + ieor(i / 32, mod(i, 32) * 3)) call abort  ! i / 32 and mod(i, 32) recover the loop indices
+  end do
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-2.f90 libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-2.f90
new file mode 100644
index 0000000..7924e7f
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-2.f90
@@ -0,0 +1,36 @@ 
+! Test of worker-private variables declared on a loop directive, broadcasting
+! to vector-partitioned mode.
+
+! { dg-do run }
+
+program main
+  integer :: x, i, j, k, idx, arr(0:32*32*32)  ! only elements 0 .. 32*32*32-1 are touched
+
+  do i = 0, 32*32*32-1
+     arr(i) = i  ! host-side seed: arr(i) == i
+  end do
+
+  !$acc kernels copy(arr)
+  !$acc loop gang(num:32)
+  do i = 0, 31
+     !$acc loop worker(num:8) private(x)
+     do j = 0, 31
+        x = ieor(i, j * 3)  ! written in the worker loop, read inside the vector loop below
+
+        !$acc loop vector(length:32)
+        do k = 0, 31
+           arr(i * 1024 + j * 32 + k) = arr(i * 1024 + j * 32 + k) + x * k
+        end do
+     end do
+  end do
+  !$acc end kernels
+
+  do i = 0, 32 - 1
+     do j = 0, 32 -1
+        do k = 0, 32 - 1
+           idx = i * 1024 + j * 32 + k  ! same flattening as used in the kernels region
+           if (arr(idx) .ne. idx + ieor(i, j * 3) * k) call abort
+        end do
+     end do
+  end do
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-3.f90 libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-3.f90
new file mode 100644
index 0000000..598c6fd
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-3.f90
@@ -0,0 +1,48 @@ 
+! Test of worker-private variables declared on a loop directive, broadcasting
+! to vector-partitioned mode.  Back-to-back worker loops.
+
+! { dg-do run }
+
+program main
+  integer :: x, i, j, k, idx, arr(0:32*32*32)  ! only elements 0 .. 32*32*32-1 are touched
+
+  do i = 0, 32*32*32-1
+     arr(i) = i  ! host-side seed: arr(i) == i
+  end do
+
+  !$acc kernels copy(arr)
+  !$acc loop gang(num:32)
+  do i = 0, 31
+     !$acc loop worker(num:8) private(x)
+     do j = 0, 31
+        x = ieor(i, j * 3)  ! first worker loop: xor-based value
+
+        !$acc loop vector(length:32)
+        do k = 0, 31
+           arr(i * 1024 + j * 32 + k) = arr(i * 1024 + j * 32 + k) + x * k
+        end do
+     end do
+
+     !$acc loop worker(num:8) private(x)
+     do j = 0, 31
+        x = ior(i, j * 5)  ! second worker loop: or-based value
+
+        !$acc loop vector(length:32)
+        do k = 0, 31
+           arr(i * 1024 + j * 32 + k) = arr(i * 1024 + j * 32 + k) + x * k
+        end do
+     end do
+  end do
+  !$acc end kernels
+
+  do i = 0, 32 - 1
+     do j = 0, 32 -1
+        do k = 0, 32 - 1
+           idx = i * 1024 + j * 32 + k  ! same flattening as used in the kernels region
+           if (arr(idx) .ne. idx + ieor(i, j * 3) * k + ior(i, j * 5) * k) then
+              call abort
+           end if
+        end do
+     end do
+  end do
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-4.f90 libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-4.f90
new file mode 100644
index 0000000..8512d7c
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-4.f90
@@ -0,0 +1,45 @@ 
+! Test of worker-private variables declared on a loop directive, broadcasting
+! to vector-partitioned mode.  Successive vector loops.
+
+! { dg-do run }
+
+program main
+  integer :: x, i, j, k, idx, arr(0:32*32*32)  ! only elements 0 .. 32*32*32-1 are touched
+
+  do i = 0, 32*32*32-1
+     arr(i) = i  ! host-side seed: arr(i) == i
+  end do
+
+  !$acc kernels copy(arr)
+  !$acc loop gang(num:32)
+  do i = 0, 31
+     !$acc loop worker(num:8) private(x)
+     do j = 0, 31
+        x = ieor(i, j * 3)  ! value read by the first vector loop
+
+        !$acc loop vector(length:32)
+        do k = 0, 31
+           arr(i * 1024 + j * 32 + k) = arr(i * 1024 + j * 32 + k) + x * k
+        end do
+
+        x = ior(i, j * 5)  ! x is overwritten between the two vector loops
+
+        !$acc loop vector(length:32)
+        do k = 0, 31
+           arr(i * 1024 + j * 32 + k) = arr(i * 1024 + j * 32 + k) + x * k
+        end do
+     end do
+  end do
+  !$acc end kernels
+
+  do i = 0, 32 - 1
+     do j = 0, 32 -1
+        do k = 0, 32 - 1
+           idx = i * 1024 + j * 32 + k  ! same flattening as used in the kernels region
+           if (arr(idx) .ne. idx + ieor(i, j * 3) * k + ior(i, j * 5) * k) then
+              call abort
+           end if
+        end do
+     end do
+  end do
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-5.f90 libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-5.f90
new file mode 100644
index 0000000..c3ebf74
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-5.f90
@@ -0,0 +1,48 @@ 
+! Test of worker-private variables declared on a loop directive, broadcasting
+! to vector-partitioned mode.  Addressable worker variable.
+
+! { dg-do run }
+
+program main
+  integer :: i, j, k, idx, arr(0:32*32*32)  ! only elements 0 .. 32*32*32-1 are touched
+  integer, target :: x  ! target so that p can be associated with it
+  integer, pointer :: p
+
+  do i = 0, 32*32*32-1
+     arr(i) = i  ! host-side seed: arr(i) == i
+  end do
+
+  !$acc kernels copy(arr)
+  !$acc loop gang(num:32)
+  do i = 0, 31
+     !$acc loop worker(num:8) private(x, p)
+     do j = 0, 31
+        p => x  ! from here on p aliases x
+        x = ieor(i, j * 3)  ! direct store, read by the first vector loop
+
+        !$acc loop vector(length:32)
+        do k = 0, 31
+           arr(i * 1024 + j * 32 + k) = arr(i * 1024 + j * 32 + k) + x * k
+        end do
+
+        p = ior(i, j * 5)  ! store through the pointer; the loop below reads it back as x
+
+        !$acc loop vector(length:32)
+        do k = 0, 31
+           arr(i * 1024 + j * 32 + k) = arr(i * 1024 + j * 32 + k) + x * k
+        end do
+     end do
+  end do
+  !$acc end kernels
+
+  do i = 0, 32 - 1
+     do j = 0, 32 -1
+        do k = 0, 32 - 1
+           idx = i * 1024 + j * 32 + k  ! same flattening as used in the kernels region
+           if (arr(idx) .ne. idx + ieor(i, j * 3) * k + ior(i, j * 5) * k) then
+              call abort
+           end if
+        end do
+     end do
+  end do
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-6.f90 libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-6.f90
new file mode 100644
index 0000000..2a8a590
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-6.f90
@@ -0,0 +1,49 @@ 
+! Test of worker-private variables declared on a loop directive, broadcasting
+! to vector-partitioned mode.  Aggregate worker variable.
+
+! { dg-do run }
+
+program main
+  type vec2
+     integer x, y  ! one component per vector loop below
+  end type vec2
+  
+  integer :: i, j, k, idx, arr(0:32*32*32)  ! only elements 0 .. 32*32*32-1 are touched
+  type(vec2) :: pt
+  
+  do i = 0, 32*32*32-1
+     arr(i) = i  ! host-side seed: arr(i) == i
+  end do
+
+  !$acc kernels copy(arr)
+  !$acc loop gang(num:32)
+  do i = 0, 31
+     !$acc loop worker(num:8) private(pt)
+     do j = 0, 31
+        pt%x = ieor(i, j * 3)  ! read by the first vector loop
+        pt%y = ior(i, j * 5)  ! read by the second vector loop
+        
+        !$acc loop vector(length:32)
+        do k = 0, 31
+           arr(i * 1024 + j * 32 + k) = arr(i * 1024 + j * 32 + k) + pt%x * k
+        end do
+
+        !$acc loop vector(length:32)
+        do k = 0, 31
+           arr(i * 1024 + j * 32 + k) = arr(i * 1024 + j * 32 + k) + pt%y * k
+        end do
+     end do
+  end do
+  !$acc end kernels
+
+  do i = 0, 32 - 1
+     do j = 0, 32 -1
+        do k = 0, 32 - 1
+           idx = i * 1024 + j * 32 + k  ! same flattening as used in the kernels region
+           if (arr(idx) .ne. idx + ieor(i, j * 3) * k + ior(i, j * 5) * k) then
+              call abort
+           end if
+        end do
+     end do
+  end do
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-7.f90 libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-7.f90
new file mode 100644
index 0000000..7dd1d3d
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-7.f90
@@ -0,0 +1,44 @@ 
+! Test of worker-private variables declared on loop directive, broadcasting
+! to vector-partitioned mode.  Array worker variable.
+
+! { dg-do run }
+
+program main
+  integer :: i, j, k, idx, arr(0:32*32*32), pt(2)  ! pt is the array made worker-private
+
+  do i = 0, 32*32*32-1
+     arr(i) = i  ! host-side seed: arr(i) == i
+  end do
+
+  !$acc kernels copy(arr)
+  !$acc loop gang(num:32)
+  do i = 0, 31
+     !$acc loop worker(num:8) private(pt)
+     do j = 0, 31
+        pt(1) = ieor(i, j * 3)  ! read by the first vector loop
+        pt(2) = ior(i, j * 5)  ! read by the second vector loop
+        
+        !$acc loop vector(length:32)
+        do k = 0, 31
+           arr(i * 1024 + j * 32 + k) = arr(i * 1024 + j * 32 + k) + pt(1) * k
+        end do
+
+        !$acc loop vector(length:32)
+        do k = 0, 31
+           arr(i * 1024 + j * 32 + k) = arr(i * 1024 + j * 32 + k) + pt(2) * k
+        end do
+     end do
+  end do
+  !$acc end kernels
+
+  do i = 0, 32 - 1
+     do j = 0, 32 -1
+        do k = 0, 32 - 1
+           idx = i * 1024 + j * 32 + k  ! same flattening as used in the kernels region
+           if (arr(idx) .ne. idx + ieor(i, j * 3) * k + ior(i, j * 5) * k) then
+              call abort
+           end if
+        end do
+     end do
+  end do
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-reduction-1.f90 libgomp/testsuite/libgomp.oacc-fortran/kernels-reduction-1.f90
new file mode 100644
index 0000000..c7a52ed
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-reduction-1.f90
@@ -0,0 +1,19 @@ 
+! Test a simple acc loop reduction inside a kernels region. 
+
+! { dg-do run }
+
+program reduction
+  integer, parameter     :: n = 20  ! trip count and expected reduction result
+  integer                :: i, red
+
+  red = 0  ! reduction variable starts from zero on the host
+
+  !$acc kernels
+  !$acc loop reduction (+:red)
+  do i = 1, n
+     red = red + 1  ! one increment per iteration
+  end do
+  !$acc end kernels
+
+  if (red .ne. n) call abort  ! n iterations of +1 must yield n
+end program reduction
diff --git libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90 libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90
new file mode 100644
index 0000000..e307dfd
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90
@@ -0,0 +1,27 @@ 
+! { dg-do run }
+! { dg-xfail-run-if "TODO" { openacc_nvidia_accel_selected } { "-O0" "-O1" } { "" } }
+
+program main
+  use openacc
+  implicit none
+
+  integer :: i, j, n
+
+  j = 0
+  n = 1000000  ! NOTE(review): n is assigned but never read; the loop uses the literal bound
+
+  !$acc parallel async (0) copy (j)
+    do i = 1, 1000000
+      j = j + 1
+    end do
+  !$acc end parallel
+
+  call acc_wait_async (0, 1)  ! NOTE(review): presumably makes queue 1 wait on queue 0 -- confirm
+
+  if (acc_async_test (0) .neqv. .TRUE.) call abort  ! queue 0 must be reported as complete
+
+  if (acc_async_test (1) .neqv. .TRUE.) call abort  ! queue 1 must be reported as complete
+
+  call acc_wait (1)
+
+end program
diff --git libgomp/testsuite/libgomp.oacc-fortran/lib-13.f90 libgomp/testsuite/libgomp.oacc-fortran/lib-13.f90
new file mode 100644
index 0000000..6d713b1
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/lib-13.f90
@@ -0,0 +1,34 @@ 
+! { dg-do run }
+! { dg-xfail-run-if "TODO" { openacc_nvidia_accel_selected } { "-O0" "-O1" } { "" } }
+
+program main
+  use openacc
+  implicit none
+
+  integer :: i, j
+  integer, parameter :: N = 1000000  ! iterations executed by each parallel region
+  integer, parameter :: nprocs = 2  ! number of regions, one async queue each
+  integer :: k(nprocs)
+
+  k(:) = 0
+
+  !$acc data copy (k(1:nprocs))
+    do j = 1, nprocs
+      !$acc parallel async (j)
+        do i = 1, N
+          k(j) = k(j) + 1  ! region j only updates its own element
+        end do
+      !$acc end parallel
+    end do
+  !$acc end data
+
+  if (acc_async_test (1) .neqv. .TRUE.) call abort  ! queue 1 must be reported as complete
+  if (acc_async_test (2) .neqv. .TRUE.) call abort  ! queue 2 must be reported as complete
+
+  call acc_wait_all_async (nprocs + 1)  ! uses a queue number distinct from those launched above
+
+  if (acc_async_test (nprocs + 1) .neqv. .TRUE.) call abort
+
+  call acc_wait_all ()
+
+end program
diff --git libgomp/testsuite/libgomp.oacc-fortran/lib-14.f90 libgomp/testsuite/libgomp.oacc-fortran/lib-14.f90
new file mode 100644
index 0000000..eb0206c
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/lib-14.f90
@@ -0,0 +1,82 @@ 
+! Exercise the data movement runtime library functions on non-shared memory
+! targets.
+
+! { dg-do run { target openacc_nvidia_accel_selected } }
+
+program main
+  use openacc
+  implicit none
+
+  integer, parameter :: N = 256
+  integer, allocatable :: h(:)  ! host array handed to every runtime routine below
+  integer :: i
+
+  allocate (h(N))
+
+  do i = 1, N
+    h(i) = i
+  end do
+
+  call acc_present_or_copyin (h)  ! phase 1: present_or_copyin / copyout round trip
+
+  if (acc_is_present (h) .neqv. .TRUE.) call abort
+
+  call acc_copyout (h)
+
+  if (acc_is_present (h) .neqv. .FALSE.) call abort
+
+  do i = 1, N
+    if (h(i) /= i) call abort  ! data must survive the round trip unchanged
+  end do
+
+  do i = 1, N
+    h(i) = i + i
+  end do
+
+  call acc_pcopyin (h, sizeof (h))  ! phase 2: short name, explicit byte length
+
+  if (acc_is_present (h) .neqv. .TRUE.) call abort
+
+  call acc_copyout (h)
+
+  if (acc_is_present (h) .neqv. .FALSE.) call abort
+
+  do i = 1, N
+    if (h(i) /= i + i) call abort
+  end do
+
+  call acc_create (h)  ! phase 3: create, fill inside a compute region, copy out
+
+  if (acc_is_present (h) .neqv. .TRUE.) call abort
+
+  !$acc parallel loop
+    do i = 1, N
+      h(i) = i
+    end do
+  !$acc end parallel loop
+
+  call acc_copyout (h)
+
+  if (acc_is_present (h) .neqv. .FALSE.) call abort
+
+  do i = 1, N
+    if (h(i) /= i) call abort  ! values written in the parallel loop must arrive on the host
+  end do
+
+  call acc_present_or_create (h, sizeof (h))  ! phase 4: create / delete presence checks
+
+  if (acc_is_present (h) .neqv. .TRUE.) call abort
+
+  call acc_delete (h)
+
+  if (acc_is_present (h) .neqv. .FALSE.) call abort
+
+  call acc_pcreate (h)  ! same as phase 4 via the short name, without a length
+
+  if (acc_is_present (h) .neqv. .TRUE.) call abort
+
+  call acc_delete (h)
+
+  if (acc_is_present (h) .neqv. .FALSE.) call abort
+
+end program
diff --git libgomp/testsuite/libgomp.oacc-fortran/lib-15.f90 libgomp/testsuite/libgomp.oacc-fortran/lib-15.f90
new file mode 100644
index 0000000..3a834db
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/lib-15.f90
@@ -0,0 +1,52 @@ 
+! { dg-do run }
+! { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } }
+
+program main
+  use openacc
+  implicit none
+
+  integer, parameter :: N = 256
+  integer, allocatable :: h(:)  ! host array handed to every runtime routine below
+  integer :: i
+
+  allocate (h(N))
+
+  do i = 1, N
+    h(i) = i
+  end do 
+
+  call acc_copyin (h)
+
+  do i = 1, N
+    h(i) = i + i  ! host copy now differs from what was copied in
+  end do 
+
+  call acc_update_device (h, sizeof (h))
+
+  if (acc_is_present (h) .neqv. .TRUE.) call abort
+
+  h(:) = 0  ! wipe the host copy so the check below only passes on copied-back data
+
+  call acc_copyout (h, sizeof (h))
+
+  do i = 1, N
+    if (h(i) /= i + i) call abort  ! expects the values pushed by acc_update_device
+  end do 
+
+  call acc_copyin (h, sizeof (h))
+
+  h(:) = 0  ! wipe the host copy again before pulling it back
+
+  call acc_update_self (h, sizeof (h))
+  
+  if (acc_is_present (h) .neqv. .TRUE.) call abort
+
+  do i = 1, N
+    if (h(i) /= i + i) call abort  ! expects the values that were copied in before the wipe
+  end do 
+
+  call acc_delete (h)
+
+  if (acc_is_present (h) .neqv. .FALSE.) call abort
+  
+end program
diff --git libgomp/testsuite/libgomp.oacc-fortran/parallel-loop-1.f90 libgomp/testsuite/libgomp.oacc-fortran/parallel-loop-1.f90
new file mode 100644
index 0000000..754b833
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/parallel-loop-1.f90
@@ -0,0 +1,77 @@ 
+! Exercise the auto, independent, seq and tile loop clauses inside
+! parallel regions. 
+
+! { dg-do run }
+
+program loops
+  integer, parameter     :: n = 20
+  integer                :: i, a(n), b(n)  ! a is filled on the device, b is the host reference
+
+  a(:) = 0
+  b(:) = 0
+
+  ! COPY
+
+  !$acc parallel copy (a)
+  !$acc loop auto
+  do i = 1, n
+     a(i) = i
+  end do
+  !$acc end parallel
+
+  do i = 1, n
+     b(i) = i  ! reference values, built once and reused by every section
+  end do
+
+  call check (a, b, n)
+
+  ! COPYOUT
+
+  a(:) = 0
+
+  !$acc parallel copyout (a)
+  !$acc loop independent
+  do i = 1, n
+     a(i) = i
+  end do
+  !$acc end parallel
+
+  call check (a, b, n)
+
+  ! COPYIN
+
+  a(:) = 0
+
+  !$acc parallel copyout (a) copyin (b)
+  !$acc loop seq
+  do i = 1, n
+     a(i) = i
+  end do
+  !$acc end parallel
+
+  call check (a, b, n)
+
+  ! PRESENT_OR_COPY
+
+  ! Reset a so the previous section's result cannot satisfy the check below.
+  a(:) = 0
+
+  !$acc parallel pcopy (a)
+  !$acc loop tile (*)
+  do i = 1, n
+     a(i) = i
+  end do
+  !$acc end parallel
+
+  call check (a, b, n)
+
+end program loops
+
+subroutine check (a, b, n)
+  ! Abort the test unless a(1:n) and b(1:n) agree element for element.
+  integer :: n
+  integer :: a(n), b(n)
+
+  ! any () folds the element-wise comparison into a single verdict.
+  if (any (a .ne. b)) call abort
+end subroutine check
diff --git libgomp/testsuite/libgomp.oacc-fortran/reference-reductions.f90 libgomp/testsuite/libgomp.oacc-fortran/reference-reductions.f90
new file mode 100644
index 0000000..a684d07
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/reference-reductions.f90
@@ -0,0 +1,38 @@ 
+! Test reductions on dummy arguments inside modules.
+
+! { dg-do run }
+
+module prm
+  implicit none
+
+contains
+
+subroutine param_reduction(var)  ! adds 10 * 100 to the dummy argument var via nested reductions
+  implicit none
+  integer(kind=8) :: var  ! dummy argument used directly as the reduction variable
+  integer      :: j,k
+
+!$acc parallel copy(var)
+!$acc loop reduction(+ : var) gang
+ do k=1,10
+!$acc loop vector reduction(+ : var)
+    do j=1,100
+     var = var + 1  ! integer literal: keeps the integer(8) update free of real conversions
+    enddo
+ enddo
+!$acc end parallel
+end subroutine param_reduction
+
+end module prm
+
+program test
+  use prm
+  implicit none
+
+  integer(8) :: r  ! same kind as the dummy argument of param_reduction
+
+  r = 10  ! integer literal for an integer variable
+  call param_reduction (r)
+
+  if (r .ne. 1010) call abort ()  ! 10 initial + 10 * 100 increments
+end program test
diff --git libgomp/testsuite/libgomp.oacc-fortran/vector-routine.f90 libgomp/testsuite/libgomp.oacc-fortran/vector-routine.f90
new file mode 100644
index 0000000..1edcee4
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/vector-routine.f90
@@ -0,0 +1,41 @@ 
+! { dg-do run }
+
+module param
+  integer, parameter :: N = 32  ! array extent shared by main and its internal routine
+end module param
+
+program main
+  use param
+  integer :: i
+  integer :: a(N)
+
+  do i = 1, N
+    a(i) = i  ! non-zero seed so the final all-zero check is meaningful
+  end do
+
+  !$acc parallel copy (a)
+  !$acc loop worker
+    do i = 1, N
+      call vector (a)  ! every iteration passes the whole array
+    end do
+  !$acc end parallel
+
+  do i = 1, N
+    if (a(i) .ne. 0) call abort  ! the routine zeroes every element
+  end do
+
+contains
+
+  subroutine vector (a)  ! internal routine: sets every element of a to zero
+  !$acc routine vector
+  integer, intent (inout) :: a(N)
+  integer :: i
+
+  !$acc loop vector
+  do i = 1, N
+    a(i) = a(i) - a(i)  ! always zero, however many times the routine runs
+  end do
+
+end subroutine vector
+
+end program main