new file mode 100644
@@ -0,0 +1,41 @@
+/* Check offloaded function's attributes and classification for unparallelized
+ OpenACC 'kernels' with Graphite kernels handling (default). */
+
+/* { dg-additional-options "-O2" }
+ { dg-additional-options "-fno-openacc-kernels-annotate-loops" }
+ { dg-additional-options "-fopt-info-optimized-omp" }
+ { dg-additional-options "-fopt-info-note-omp" }
+ { dg-additional-options "-fdump-tree-ompexp" }
+ { dg-additional-options "-fdump-tree-graphite-details" }
+ { dg-additional-options "-fdump-tree-oaccloops1" }
+ { dg-additional-options "-fdump-tree-omp_oacc_kernels_decompose-details" } */
+
+#define N 1024
+
+extern unsigned int *__restrict a;
+extern unsigned int *__restrict b;
+extern unsigned int *__restrict c;
+
+extern unsigned int f (unsigned int);
+#pragma acc routine (f) seq
+
+void KERNELS ()
+{
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+ for (unsigned int i = 0; i < N; i++) /* { dg-message "note: beginning .Graphite. part in OpenACC .kernels. region" } */
+ /* An "extern"al mapping of loop iterations/array indices makes the loop
+ unparallelizable. */
+ c[i] = a[f (i)] + b[f (i)]; /* { dg-optimized "assigned OpenACC seq loop parallelism" } */
+}
+
+/* Check the offloaded function's attributes.
+ { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc parallel_kernels_graphite, omp target entrypoint\\)\\)" 1 "ompexp" } } */
+
+/* Check that Graphite can handle neither the original nor the offloaded region.
+ { dg-final { scan-tree-dump-times "number of SCoPs: 0" 2 "graphite" } } */
+
+/* Check the offloaded function's classification and compute dimensions (will
+ always be 1 x 1 x 1 for non-offloading compilation).
+ { dg-final { scan-tree-dump-times "(?n)Function is parallel_kernels_graphite OpenACC kernels offload" 1 "oaccloops1" } }
+ { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccloops1" } }
+ { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), oacc parallel_kernels_graphite, omp target entrypoint\\)\\)" 1 "oaccloops1" } } */
new file mode 100644
@@ -0,0 +1,47 @@
+/* Check offloaded function's attributes and classification for unparallelized
+ OpenACC 'kernels' with "parloops" handling. */
+
+/* { dg-additional-options "-O2" }
+ { dg-additional-options "--param openacc-kernels=decompose-parloops" }
+ { dg-additional-options "-fno-openacc-kernels-annotate-loops" }
+ { dg-additional-options "-fopt-info-note-optimized-omp" }
+ { dg-additional-options "-fdump-tree-ompexp" }
+ { dg-additional-options "-fdump-tree-parloops1-all" }
+ { dg-additional-options "-fdump-tree-oaccloops1" }
+ { dg-additional-options "-fdump-tree-omp_oacc_kernels_decompose-details" } */
+
+/* { dg-additional-options "-Wopenacc-parallelism" } for testing/documenting
+ aspects of that functionality. */
+
+#define N 1024
+
+extern unsigned int *__restrict a;
+extern unsigned int *__restrict b;
+extern unsigned int *__restrict c;
+
+extern unsigned int f (unsigned int);
+#pragma acc routine (f) seq
+
+void KERNELS ()
+{
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N]) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ for (unsigned int i = 0; i < N; i++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+ /* An "extern"al mapping of loop iterations/array indices makes the loop
+ unparallelizable. */
+ c[i] = a[f (i)] + b[f (i)];
+}
+
+/* Check the offloaded function's attributes.
+ { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels, omp target entrypoint\\)\\)" 1 "ompexp" } } */
+
+/* Check that exactly one OpenACC kernels construct is analyzed, and that it
+ can't be parallelized.
+ { dg-final { scan-tree-dump-times "FAILED:" 1 "parloops1" } }
+ { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } }
+ { dg-final { scan-tree-dump-not "SUCCESS: may be parallelized" "parloops1" } } */
+
+/* Check the offloaded function's classification and compute dimensions (will
+ always be 1 x 1 x 1 for non-offloading compilation).
+ { dg-final { scan-tree-dump-times "(?n)Function is unparallelized OpenACC kernels offload" 1 "oaccloops1" } }
+ { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccloops1" } }
+ { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), oacc kernels, omp target entrypoint\\)\\)" 1 "oaccloops1" } } */
new file mode 100644
@@ -0,0 +1,125 @@
+/* Test OpenACC .kernels. region decomposition with
+ "decompose-parloops" handling. */
+/* { dg-additional-options "--param openacc-kernels=decompose-parloops" } */
+/* { dg-additional-options "-fno-openacc-kernels-annotate-loops" } */
+/* { dg-additional-options "-fopt-info-omp-all" } */
+/* { dg-additional-options "-Wopenacc-parallelism" } */
+/* { dg-additional-options "-O2" } for "parloops". */
+
+/* See also "../../gfortran.dg/goacc/kernels-decompose-1.f95". */
+
+#pragma acc routine gang
+extern int
+f_g (int);
+
+#pragma acc routine worker
+extern int
+f_w (int);
+
+#pragma acc routine vector
+extern int
+f_v (int);
+
+#pragma acc routine seq
+extern int
+f_s (int);
+
+int
+main ()
+{
+ int x, y, z;
+#define N 10
+ int a[N], b[N], c[N];
+
+#pragma acc kernels
+ {
+ x = 0; /* { dg-message "note: beginning .gang-single. part in OpenACC .kernels. region" } */
+ y = x < 10;
+ z = x++;
+ ;
+ }
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+ for (int i = 0; i < N; i++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+ a[i] = 0;
+
+#pragma acc kernels loop /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (int i = 0; i < N; i++)
+ b[i] = a[N - i - 1];
+
+#pragma acc kernels
+ {
+#pragma acc loop /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (int i = 0; i < N; i++)
+ b[i] = a[N - i - 1];
+
+#pragma acc loop /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (int i = 0; i < N; i++)
+ c[i] = a[i] * b[i];
+
+ a[z] = 0; /* { dg-message "note: beginning .gang-single. part in OpenACC .kernels. region" } */
+
+#pragma acc loop /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (int i = 0; i < N; i++)
+ c[i] += a[i];
+
+#pragma acc loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* { dg-message "note: parallelized loop nest in OpenACC .kernels. region" "" { target *-*-* } .-1 } */
+ for (int i = 0 + 1; i < N; i++)
+ c[i] += c[i - 1];
+ }
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC worker vector loop parallelism" } */
+ {
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+ /* { dg-message "note: parallelized loop nest in OpenACC .kernels. region" "" { target *-*-* } .-1 } */
+ for (int i = 0; i < N; ++i)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC worker loop parallelism" } */
+ for (int j = 0; j < N; ++j)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* { dg-warning "insufficient partitioning available to parallelize loop" "" { target *-*-* } .-1 } */
+ for (int k = 0; k < N; ++k)
+ a[(i + j + k) % N]
+ = b[j]
+ + f_v (c[k]); /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+
+ //TODO Should the following turn into "gang-single" instead of "parloops"?
+ //TODO The problem is that the first STMT is "if (y <= 4) goto <D.2547>; else goto <D.2548>;", thus "parloops".
+ if (y < 5)
+#pragma acc loop independent /* { dg-missed "unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" } */
+ for (int j = 0; j < N; ++j)
+ b[j] = f_w (c[j]);
+ }
+
+#pragma acc kernels /* { dg-warning "region contains gang partitioned code but is not gang partitioned" } */
+ {
+ /* { dg-message "note: beginning .gang-single. part in OpenACC .kernels. region" "" { target *-*-* } .+1 } */
+ y = f_g (a[5]); /* { dg-message "optimized: assigned OpenACC gang worker vector loop parallelism" } */
+
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+ /* { dg-message "note: parallelized loop nest in OpenACC .kernels. region" "" { target *-*-* } .-1 } */
+ for (int j = 0; j < N; ++j)
+ b[j] = y + f_w (c[j]); /* { dg-message "optimized: assigned OpenACC worker vector loop parallelism" } */
+ }
+
+#pragma acc kernels
+ {
+ y = 3; /* { dg-message "note: beginning .gang-single. part in OpenACC .kernels. region" } */
+
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang worker loop parallelism" } */
+ /* { dg-message "note: parallelized loop nest in OpenACC .kernels. region" "" { target *-*-* } .-1 } */
+ for (int j = 0; j < N; ++j)
+ b[j] = y + f_v (c[j]); /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+
+ z = 2; /* { dg-message "note: beginning .gang-single. part in OpenACC .kernels. region" } */
+ }
+
+#pragma acc kernels /* { dg-message "note: beginning .gang-single. part in OpenACC .kernels. region" } */
+ ;
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,36 @@
+/* { dg-additional-options "--param=openacc-kernels=parloops" } as this is
+ specifically testing "parloops" handling. */
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fno-openacc-kernels-annotate-loops" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define n 10000
+
+unsigned int a[n];
+
+void __attribute__((noinline,noclone))
+foo (void)
+{
+ int i;
+ unsigned int sum = 1;
+
+#pragma acc kernels copyin (a[0:n]) copy (sum)
+ {
+ for (i = 0; i < n; ++i)
+ sum += a[i];
+ }
+
+ if (sum != 5001)
+ abort ();
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint, noclone, noinline\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*._omp_fn.0" 1 "optimized" } } */
new file mode 100644
@@ -0,0 +1,22 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-graphite-details" } */
+
+#include <stdlib.h>
+
+#define n 10000
+
+unsigned int a[n];
+
+void __attribute__((noinline,noclone))
+foo (void)
+{
+ int i;
+ unsigned int sum = 1;
+
+#pragma acc parallel copyin (a[0:n])
+ {
+#pragma acc loop auto reduction(+:sum) /* { dg-message "optimized: assigned OpenACC gang vector loop parallelism"} */
+ for (i = 0; i < n; ++i)
+ sum += a[i];
+ }
+}
new file mode 100644
@@ -0,0 +1,128 @@
+/* Test the output of "-fopt-info-optimized-omp" for an OpenACC 'kernels'
+ construct containing 'loop' constructs with explicit or implicit 'auto'
+ clause that are handled by "parloops". */
+
+/* { dg-additional-options "--param openacc-kernels=decompose-parloops" } */
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+/* { dg-additional-options "-fopt-info-note-omp" } */
+
+//TODO update accordingly
+/* See also "../../gfortran.dg/goacc/note-parallelism.f90". */
+
+int
+main ()
+{
+ int x, y, z;
+
+#pragma acc kernels
+ /* Strangely indented to keep this similar to other test cases. */
+ {
+#pragma acc loop
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (x = 0; x < 10; x++)
+ ;
+
+#pragma acc loop auto gang /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (x = 0; x < 10; x++)
+ ;
+
+#pragma acc loop auto worker /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (x = 0; x < 10; x++)
+ ;
+
+#pragma acc loop auto vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (x = 0; x < 10; x++)
+ ;
+
+#pragma acc loop auto gang vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (x = 0; x < 10; x++)
+ ;
+
+#pragma acc loop auto gang worker /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (x = 0; x < 10; x++)
+ ;
+
+#pragma acc loop auto worker vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (x = 0; x < 10; x++)
+ ;
+
+#pragma acc loop auto gang worker vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (x = 0; x < 10; x++)
+ ;
+
+#pragma acc loop auto gang /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (x = 0; x < 10; x++)
+#pragma acc loop auto worker /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+ for (y = 0; y < 10; y++)
+#pragma acc loop auto vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+ for (z = 0; z < 10; z++)
+ ;
+
+#pragma acc loop auto
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (x = 0; x < 10; x++)
+ ;
+
+#pragma acc loop auto
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (x = 0; x < 10; x++)
+#pragma acc loop auto
+ for (y = 0; y < 10; y++)
+ ;
+
+#pragma acc loop auto
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (x = 0; x < 10; x++)
+#pragma acc loop auto
+ for (y = 0; y < 10; y++)
+#pragma acc loop auto
+ for (z = 0; z < 10; z++)
+ ;
+
+#pragma acc loop
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (x = 0; x < 10; x++)
+#pragma acc loop auto
+ for (y = 0; y < 10; y++)
+#pragma acc loop auto
+ for (z = 0; z < 10; z++)
+ ;
+
+#pragma acc loop auto
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (x = 0; x < 10; x++)
+#pragma acc loop
+ for (y = 0; y < 10; y++)
+#pragma acc loop auto
+ for (z = 0; z < 10; z++)
+ ;
+
+#pragma acc loop auto
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (x = 0; x < 10; x++)
+#pragma acc loop auto
+ for (y = 0; y < 10; y++)
+#pragma acc loop
+ for (z = 0; z < 10; z++)
+ ;
+
+#pragma acc loop
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (x = 0; x < 10; x++)
+#pragma acc loop auto
+ for (y = 0; y < 10; y++)
+#pragma acc loop
+ for (z = 0; z < 10; z++)
+ ;
+ }
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,61 @@
+/* Test the output of "-fopt-info-optimized-omp" for an OpenACC "kernels"
+ construct containing loops. */
+
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+/* { dg-additional-options "-fopt-info-note-omp" } */
+/* { dg-additional-options "-O2" } */
+
+//TODO update accordingly
+/* See also "../../gfortran.dg/goacc/note-parallelism.f90". */
+
+int
+main ()
+{
+ int x, y, z;
+
+#pragma acc kernels
+ for (x = 0; x < 10; x++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .Graphite. for analysis" "" { target *-*-* } .-1 } */
+ ;
+
+#pragma acc kernels
+ for (x = 0; x < 10; x++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .Graphite. for analysis" "" { target *-*-* } .-1 } */
+ ;
+
+#pragma acc kernels
+ for (x = 0; x < 10; x++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .Graphite. for analysis" "" { target *-*-* } .-1 } */
+ for (y = 0; y < 10; y++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ for (z = 0; z < 10; z++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ ;
+
+#pragma acc kernels
+ for (x = 0; x < 10; x++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .Graphite. for analysis" "" { target *-*-* } .-1 } */
+ ;
+
+#pragma acc kernels
+ for (x = 0; x < 10; x++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .Graphite. for analysis" "" { target *-*-* } .-1 } */
+ for (y = 0; y < 10; y++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ ;
+
+#pragma acc kernels
+ for (x = 0; x < 10; x++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .Graphite. for analysis" "" { target *-*-* } .-1 } */
+ for (y = 0; y < 10; y++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ for (z = 0; z < 10; z++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ ;
+
+#pragma acc kernels
+ for (x = 0; x < 10; x++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .Graphite. for analysis" "" { target *-*-* } .-1 } */
+ for (y = 0; y < 10; y++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ for (z = 0; z < 10; z++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ ;
+
+ return 0;
+}
+
+/* { dg-prune-output ".auto. loop cannot be parallel" } */
new file mode 100644
@@ -0,0 +1,53 @@
+/* Test the output of "-fopt-info-optimized-omp" for an OpenACC 'kernels'
+ construct containing loops. */
+
+/* { dg-additional-options "--param openacc-kernels=decompose-parloops" } */
+/* { dg-additional-options "-fno-openacc-kernels-annotate-loops" } */
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+/* { dg-additional-options "-fopt-info-note-omp" } */
+/* { dg-additional-options "-fdump-tree-omp_oacc_kernels_decompose-details" } */
+// TODO update accordingly
+/* See also "../../gfortran.dg/goacc/note-parallelism.f90". */
+
+int
+main ()
+{
+ int x, y, z;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+ ;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+ ;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+ for (y = 0; y < 10; y++)
+ for (z = 0; z < 10; z++)
+ ;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+ ;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+ for (y = 0; y < 10; y++)
+ ;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+ for (y = 0; y < 10; y++)
+ for (z = 0; z < 10; z++)
+ ;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+ for (y = 0; y < 10; y++)
+ for (z = 0; z < 10; z++)
+ ;
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,44 @@
+! Check offloaded function's attributes and classification for unparallelized
+! OpenACC kernels that are handled by "parloops".
+
+! { dg-additional-options "--param openacc-kernels=decompose-parloops" }
+! { dg-additional-options "-O2" }
+! { dg-additional-options "-fno-openacc-kernels-annotate-loops" }
+! { dg-additional-options "-fopt-info-optimized-note-omp" }
+! { dg-additional-options "-fdump-tree-ompexp" }
+! { dg-additional-options "-fdump-tree-parloops1-all" }
+! { dg-additional-options "-fdump-tree-oaccloops1" }
+
+program main
+ implicit none
+ integer, parameter :: n = 1024
+ integer, dimension (0:n-1) :: a, b, c
+ integer :: i
+
+ ! A function call in a data-reference makes the loop unparallelizable.
+ integer, external :: f
+
+ call setup(a, b)
+
+ !$acc kernels copyin (a(0:n-1), b(0:n-1)) copyout (c(0:n-1)) ! { dg-message "optimized: assigned OpenACC seq loop parallelism" }
+ do i = 0, n - 1
+ ! { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" "" { target *-*-* } .-1 }
+ c(i) = a(f (i)) + b(f (i))
+ end do
+ !$acc end kernels
+end program main
+
+! Check the offloaded function's attributes.
+! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels, omp target entrypoint\\)\\)" 1 "ompexp" } }
+
+! Check that exactly one OpenACC kernels construct is analyzed, and that it
+! can't be parallelized.
+! { dg-final { scan-tree-dump-times "FAILED:" 1 "parloops1" } }
+! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } }
+! { dg-final { scan-tree-dump-not "SUCCESS: may be parallelized" "parloops1" } }
+
+! Check the offloaded function's classification and compute dimensions (will
+! always be 1 x 1 x 1 for non-offloading compilation).
+! { dg-final { scan-tree-dump-times "(?n)Function is unparallelized OpenACC kernels offload" 1 "oaccloops1" } }
+! { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccloops1" } }
+! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), oacc kernels, omp target entrypoint\\)\\)" 1 "oaccloops1" } }
new file mode 100644
@@ -0,0 +1,52 @@
+! { dg-additional-options "-fdump-tree-omp_oacc_kernels_decompose" }
+
+program main
+ implicit none
+ integer, parameter :: N = 1024
+ integer, dimension (1:N) :: a
+ integer :: i, sum
+
+ !$acc kernels copyin(a(1:N)) copy(sum)
+
+ ! converted to "oacc_kernels"
+ !$acc loop
+ do i = 1, N
+ sum = sum + a(i)
+ end do
+
+ ! converted to "oacc_parallel_kernels_gang_single"
+ sum = sum + 1
+ a(1) = a(1) + 1
+
+ ! converted to "oacc_parallel_kernels_parallelized"
+ !$acc loop independent
+ do i = 1, N
+ sum = sum + a(i)
+ end do
+
+ ! converted to "oacc_kernels"
+ if (sum .gt. 10) then
+ !$acc loop
+ do i = 1, N
+ sum = sum + a(i)
+ end do
+ end if
+
+ ! converted to "oacc_kernels"
+ !$acc loop auto
+ do i = 1, N
+ sum = sum + a(i)
+ end do
+
+ !$acc end kernels
+end program main
+
+! Check that the kernels region is split into a data region and enclosed
+! parallel regions.
+! { dg-final { scan-tree-dump-times "oacc_data_kernels" 1 "omp_oacc_kernels_decompose" } }
+! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_graphite " 5 "omp_oacc_kernels_decompose" } }
+
+! Each of the parallel regions is async, and there is a final call to
+! __builtin_GOACC_wait.
+! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_graphite async\\(-1\\)" 5 "omp_oacc_kernels_decompose" } }
+! { dg-final { scan-tree-dump-times "__builtin_GOACC_wait" 1 "omp_oacc_kernels_decompose" } }
new file mode 100644
@@ -0,0 +1,121 @@
+! Test OpenACC 'kernels' construct decomposition with "decompose-parloops"
+! handling.
+
+! { dg-additional-options "--param openacc-kernels=decompose-parloops" }
+! { dg-additional-options "-fopt-info-optimized-omp" }
+! { dg-additional-options "-Wopenacc-parallelism" }
+! { dg-additional-options "-O2" } for "parloops".
+
+! See also "../../c-c++-common/goacc/kernels-decompose-1.c".
+
+program main
+ implicit none
+
+ integer, external :: f_g
+ !$acc routine (f_g) gang
+ integer, external :: f_w
+ !$acc routine (f_w) worker
+ integer, external :: f_v
+ !$acc routine (f_v) vector
+ integer, external :: f_s
+ !$acc routine (f_s) seq
+
+ integer :: i, j, k
+ integer :: x, y, z
+ logical :: y_l
+ integer, parameter :: N = 10
+ integer :: a(N), b(N), c(N)
+
+ !$acc kernels
+ x = 0
+ y = 0
+ y_l = x < 10
+ z = x
+ x = x + 1
+ !$acc end kernels
+
+ !$acc kernels
+ do i = 1, N
+ ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } .-1 }
+ a(i) = 0
+ end do
+ !$acc end kernels
+
+ !$acc kernels loop ! { dg-optimized "assigned OpenACC gang loop parallelism" }
+ do i = 1, N
+ b(i) = a(N - i + 1)
+ end do
+
+ !$acc kernels
+ !$acc loop ! { dg-optimized "assigned OpenACC gang loop parallelism" }
+ do i = 1, N
+ b(i) = a(N - i + 1)
+ end do
+
+ !$acc loop ! { dg-optimized "assigned OpenACC gang loop parallelism" }
+ do i = 1, N
+ c(i) = a(i) * b(i)
+ end do
+
+ a(z) = 0
+
+ !$acc loop ! { dg-optimized "assigned OpenACC gang loop parallelism" }
+ do i = 1, N
+ c(i) = c(i) + a(i)
+ end do
+
+ !$acc loop seq ! { dg-optimized "assigned OpenACC seq loop parallelism" }
+ do i = 1 + 1, N
+ c(i) = c(i) + c(i - 1)
+ end do
+ !$acc end kernels
+
+ !$acc kernels ! { dg-optimized "assigned OpenACC worker vector loop parallelism" }
+ !$acc loop independent ! { dg-optimized "assigned OpenACC gang loop parallelism" }
+ do i = 1, N
+ !$acc loop independent ! { dg-optimized "assigned OpenACC worker loop parallelism" }
+ do j = 1, N
+ !$acc loop independent ! { dg-optimized "assigned OpenACC seq loop parallelism" }
+ ! { dg-warning "insufficient partitioning available to parallelize loop" "" { target *-*-* } .-1 }
+ ! { dg-bogus "optimized: assigned OpenACC vector loop parallelism" "" { target *-*-* } .-2 }
+ do k = 1, N
+ a(1 + mod(i + j + k, N)) &
+ = b(j) &
+ + f_v (c(k)) ! { dg-optimized "assigned OpenACC vector loop parallelism" }
+ end do
+ end do
+ end do
+
+ !TODO Should the following turn into "gang-single" instead of "parloops"?
+ !TODO The problem is that the first STMT is "if (y <= 4) goto <D.2547>; else goto <D.2548>;", thus "parloops".
+ if (y < 5) then
+ !$acc loop independent
+ do j = 1, N
+ b(j) = f_w (c(j))
+ end do
+ end if
+ !$acc end kernels
+
+ !$acc kernels ! { dg-warning "region contains gang partitioned code but is not gang partitioned" }
+ y = f_g (a(5)) ! { dg-optimized "assigned OpenACC gang worker vector loop parallelism" }
+
+ !$acc loop independent ! { dg-optimized "assigned OpenACC gang loop parallelism" }
+ do j = 1, N
+ b(j) = y + f_w (c(j)) ! { dg-optimized "assigned OpenACC worker vector loop parallelism" }
+ end do
+ !$acc end kernels
+
+ !$acc kernels
+ y = 3
+
+ !$acc loop independent ! { dg-optimized "assigned OpenACC gang worker loop parallelism" }
+ do j = 1, N
+ b(j) = y + f_v (c(j)) ! { dg-optimized "assigned OpenACC vector loop parallelism" }
+ end do
+
+ z = 2
+ !$acc end kernels
+
+ !$acc kernels
+ !$acc end kernels
+end program main
new file mode 100644
@@ -0,0 +1,154 @@
+! Test OpenACC 'kernels' construct decomposition.
+
+! { dg-additional-options "-fopenacc-kernels-annotate-loops" }
+! { dg-additional-options "-fopt-info-omp-all" }
+! { dg-additional-options "--param=openacc-kernels=decompose-parloops" }
+! { dg-additional-options "-O2" } for 'parloops'.
+
+! { dg-additional-options "-Wopenacc-parallelism" } for testing/documenting
+! aspects of that functionality.
+
+! See also '../../c-c++-common/goacc/kernels-decompose-2.c'.
+
+! It's only with Tcl 8.5 (released in 2007) that "the variable 'varName'
+! passed to 'incr' may be unset, and in that case, it will be set to [...]",
+! so to maintain compatibility with earlier Tcl releases, we manually
+! initialize counter variables:
+! { dg-line l_dummy[variable c_loop_i 0 c_loop_j 0 c_loop_k 0 c_part 0] }
+! { dg-message "dummy" "" { target iN-VAl-Id } l_dummy } to avoid
+! "WARNING: dg-line var l_dummy defined, but not used".
+
+program main
+ implicit none
+
+ integer, external :: f_g
+ !$acc routine (f_g) gang
+ integer, external :: f_w
+ !$acc routine (f_w) worker
+ integer, external :: f_v
+ !$acc routine (f_v) vector
+ integer, external :: f_s
+ !$acc routine (f_s) seq
+
+ integer :: i, j, k
+ integer :: x, y, z
+ logical :: y_l
+ integer, parameter :: N = 10
+ integer :: a(N), b(N), c(N)
+
+ !$acc kernels
+ x = 0 ! { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" }
+ y = 0
+ y_l = x < 10
+ z = x
+ x = x + 1
+ ;
+ !$acc end kernels
+
+ !$acc kernels
+ do i = 1, N ! { dg-line l_loop_i[incr c_loop_i] }
+ ! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
+ ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+ a(i) = 0
+ end do
+ !$acc end kernels
+
+ !$acc kernels loop ! { dg-line l_loop_i[incr c_loop_i] }
+ ! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
+ ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+ do i = 1, N
+ b(i) = a(N - i + 1)
+ end do
+
+ !$acc kernels
+ !$acc loop ! { dg-line l_loop_i[incr c_loop_i] }
+ ! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
+ ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+ do i = 1, N
+ b(i) = a(N - i + 1)
+ end do
+
+ !$acc loop ! { dg-line l_loop_i[incr c_loop_i] }
+ ! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
+ ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+ do i = 1, N
+ c(i) = a(i) * b(i)
+ end do
+
+ a(z) = 0
+
+ !$acc loop ! { dg-line l_loop_i[incr c_loop_i] }
+ ! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
+ ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+ do i = 1, N
+ c(i) = c(i) + a(i)
+ end do
+
+ !$acc loop seq ! { dg-line l_loop_i[incr c_loop_i] }
+ ! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i }
+ ! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+ do i = 1 + 1, N
+ c(i) = c(i) + c(i - 1)
+ end do
+ !$acc end kernels
+
+ !$acc kernels ! { dg-optimized "assigned OpenACC worker vector loop parallelism" }
+ !$acc loop independent ! { dg-line l_loop_i[incr c_loop_i] }
+ ! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i }
+ ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+ do i = 1, N
+ !$acc loop independent ! { dg-line l_loop_j[incr c_loop_j] }
+ ! { dg-optimized "assigned OpenACC worker loop parallelism" "" { target *-*-* } l_loop_j$c_loop_j }
+ do j = 1, N
+ !$acc loop independent ! { dg-line l_loop_k[incr c_loop_k] }
+ ! { dg-warning "insufficient partitioning available to parallelize loop" "" { target *-*-* } l_loop_k$c_loop_k }
+ ! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_k$c_loop_k }
+ do k = 1, N
+ a(1 + mod(i + j + k, N)) &
+ = b(j) &
+ + f_v (c(k)) ! { dg-optimized "assigned OpenACC vector loop parallelism" }
+ end do
+ end do
+ end do
+
+ !TODO Should the following turn into "gang-single" instead of "parloops"?
+ !TODO The problem is that the first STMT is 'if (y <= 4) goto <D.2547>; else goto <D.2548>;', thus "parloops".
+ if (y < 5) then ! { dg-message "note: beginning 'parloops' part in OpenACC 'kernels' region" }
+ !$acc loop independent ! { dg-line l_loop_j[incr c_loop_j] }
+ ! { dg-missed "unparallelized loop nest in OpenACC 'kernels' region: it's executed conditionally" "" { target *-*-* } l_loop_j$c_loop_j }
+ do j = 1, N
+ b(j) = f_w (c(j))
+ end do
+ end if
+ !$acc end kernels
+
+ !$acc kernels
+ ! { dg-bogus "\[Ww\]arning: region contains gang partitioned code but is not gang partitioned" "TODO 'kernels'" { xfail *-*-* } .-1 }
+ y = f_g (a(5)) ! { dg-line l_part[incr c_part] }
+ !TODO If such a construct is placed in its own part (like it is, here), can't this actually use gang paralelism, instead of "gang-single"?
+ ! { dg-optimized "assigned OpenACC gang worker vector loop parallelism" "" { target *-*-* } l_part$c_part }
+
+ !$acc loop independent ! { dg-line l_loop_j[incr c_loop_j] }
+ ! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_j$c_loop_j }
+ ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_j$c_loop_j }
+ do j = 1, N
+ b(j) = y + f_w (c(j)) ! { dg-optimized "assigned OpenACC worker vector loop parallelism" }
+ end do
+ !$acc end kernels
+
+ !$acc kernels
+ y = 3
+
+ !$acc loop independent ! { dg-line l_loop_j[incr c_loop_j] }
+ ! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_j$c_loop_j }
+ ! { dg-optimized "assigned OpenACC gang worker loop parallelism" "" { target *-*-* } l_loop_j$c_loop_j }
+ do j = 1, N
+ b(j) = y + f_v (c(j)) ! { dg-optimized "assigned OpenACC vector loop parallelism" }
+ end do
+
+ z = 2
+ !$acc end kernels
+
+ !$acc kernels
+ !$acc end kernels
+end program main
new file mode 100644
@@ -0,0 +1,52 @@
+! { dg-additional-options "--param=openacc-kernels=decompose-parloops" } as this is
+! specifically testing "parloops" handling.
+! { dg-additional-options "-O2" }
+! { dg-additional-options "-fopenacc-kernels-annotate-loops" }
+! { dg-additional-options "-fdump-tree-parloops1-all" }
+! { dg-additional-options "-fdump-tree-optimized" }
+
+program main
+ implicit none
+ integer, parameter :: n = 1024 ! problem size
+ integer, dimension (0:n-1) :: a, b, c ! a, b: summands; c: result
+ integer :: i, ii
+
+ !$acc data copyout (a(0:n-1))
+ !$acc kernels present (a(0:n-1))
+ do i = 0, n - 1 ! kernels loop 1: initialize a on the device
+ a(i) = i * 2
+ end do
+ !$acc end kernels
+ !$acc end data
+
+ !$acc data copyout (b(0:n-1))
+ !$acc kernels present (b(0:n-1))
+ do i = 0, n -1 ! kernels loop 2: initialize b on the device
+ b(i) = i * 4
+ end do
+ !$acc end kernels
+ !$acc end data
+
+ !$acc data copyin (a(0:n-1), b(0:n-1)) copyout (c(0:n-1))
+ !$acc kernels present (a(0:n-1), b(0:n-1), c(0:n-1))
+ do ii = 0, n - 1 ! kernels loop 3: c = a + b on the device
+ c(ii) = a(ii) + b(ii)
+ end do
+ !$acc end kernels
+ !$acc end data
+
+ do i = 0, n - 1 ! host-side verification of the device results
+ if (c(i) .ne. a(i) + b(i)) STOP 1
+ end do
+
+end program main
+
+! Check that only three loops are analyzed, and that all can be parallelized.
+! { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops1" } }
+! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 3 "parloops1" } }
+! { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } }
+
+! Check that each of the three loops has been split off into its own function.
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.0 " 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.1 " 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.2 " 1 "optimized" } }
new file mode 100644
@@ -0,0 +1,45 @@
+! { dg-additional-options "--param openacc-kernels=decompose-parloops" } as this is
+! specifically testing "parloops" handling.
+! { dg-additional-options "-O2" }
+! { dg-additional-options "-fdump-tree-parloops1-all" }
+! { dg-additional-options "-fdump-tree-optimized" }
+
+program main
+ implicit none
+ integer, parameter :: n = 1024 ! problem size
+ integer, dimension (0:n-1) :: a, b, c ! a, b: summands; c: result
+ integer :: i, ii
+
+ !$acc kernels copyout (a(0:n-1))
+ do i = 0, n - 1 ! kernels loop 1: initialize a on the device
+ a(i) = i * 2
+ end do
+ !$acc end kernels
+
+ !$acc kernels copyout (b(0:n-1))
+ do i = 0, n -1 ! kernels loop 2: initialize b on the device
+ b(i) = i * 4
+ end do
+ !$acc end kernels
+
+ !$acc kernels copyin (a(0:n-1), b(0:n-1)) copyout (c(0:n-1))
+ do ii = 0, n - 1 ! kernels loop 3: c = a + b on the device
+ c(ii) = a(ii) + b(ii)
+ end do
+ !$acc end kernels
+
+ do i = 0, n - 1 ! host-side verification of the device results
+ if (c(i) .ne. a(i) + b(i)) STOP 1
+ end do
+
+end program main
+
+! Check that only three loops are analyzed, and that all can be parallelized.
+! { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops1" } }
+! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 3 "parloops1" } }
+! { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } }
+
+! Check that each of the three loops has been split off into its own function.
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.0 " 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.1 " 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.2 " 1 "optimized" } }
new file mode 100644
@@ -0,0 +1,39 @@
+! { dg-additional-options "--param openacc-kernels=decompose-parloops" } as this is
+! specifically testing "parloops" handling.
+! { dg-additional-options "-O2" }
+! { dg-additional-options "-fdump-tree-parloops1-all" }
+! { dg-additional-options "-fdump-tree-optimized" }
+
+program main
+ implicit none
+ integer, parameter :: n = 1024 ! problem size
+ integer, dimension (0:n-1) :: a, b, c ! a, b: summands; c: result
+ integer :: i, ii
+
+ do i = 0, n - 1 ! host loop: initialize a (not inside any kernels region)
+ a(i) = i * 2
+ end do
+
+ do i = 0, n -1 ! host loop: initialize b (not inside any kernels region)
+ b(i) = i * 4
+ end do
+
+ !$acc kernels copyin (a(0:n-1), b(0:n-1)) copyout (c(0:n-1))
+ do ii = 0, n - 1 ! the only kernels loop: c = a + b on the device
+ c(ii) = a(ii) + b(ii)
+ end do
+ !$acc end kernels
+
+ do i = 0, n - 1 ! host-side verification of the device result
+ if (c(i) .ne. a(i) + b(i)) STOP 1
+ end do
+
+end program main
+
+! Check that only one loop is analyzed, and that it can be parallelized.
+! { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } }
+! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } }
+! { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } }
+
+! Check that the loop has been split off into a function.
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.0 " 1 "optimized" } }
new file mode 100644
@@ -0,0 +1,37 @@
+! { dg-additional-options "--param openacc-kernels=decompose" }
+
+! A regression test checking that the reduction clause lowering does
+! not fail if a subroutine argument is used as a reduction variable in
+! a kernels region.
+
+! This was fine ...
+subroutine reduction_var_not_argument(res)
+ real res ! result: receives the reduction value after the region
+ real tmp ! local (non-dummy) reduction variable — the "fine" case
+ integer i
+
+ !$acc kernels
+ !$acc loop reduction(+:tmp)
+ do i=0,n-1 ! NOTE(review): n is implicitly typed and never defined — presumably OK for a compile-only lowering test; confirm
+ tmp = tmp + 1
+ end do
+ !$acc end kernels
+
+ res = tmp
+end subroutine reduction_var_not_argument
+
+! ... but this led to problems because ARG
+! was a pointer type that did not get dereferenced.
+subroutine reduction_var_as_argument(arg)
+ real arg ! dummy argument used directly as the reduction variable — the case that used to fail
+ integer i
+
+ !$acc kernels
+ !$acc loop reduction(+:arg)
+ do i=0,n-1 ! NOTE(review): n is implicitly typed and never defined — presumably OK for a compile-only lowering test; confirm
+ arg = arg + 1
+ end do
+ !$acc end kernels
+end subroutine reduction_var_as_argument
+
+
new file mode 100644
@@ -0,0 +1,98 @@
+! Check that the Graphite-based "auto" loop and "kernels" handling
+! is able to assign the parallelism dimensions correctly for a simple
+! loop-nest with reductions. All loops should be parallelized.
+
+! { dg-additional-options "-O2 -g" }
+! { dg-additional-options "-foffload=-fdump-tree-oaccloops1-details" }
+! { dg-additional-options "-foffload=-fopt-info-optimized" }
+! { dg-additional-options "-fdump-tree-oaccloops1-details" }
+! { dg-additional-options "-fopt-info-optimized" }
+
+module test
+ implicit none
+
+ integer, parameter :: n = 10000 ! matrix dimension
+ integer :: a(n,n)
+ integer :: sums(n,n) ! NOTE(review): appears unreferenced in this file — confirm it is needed
+
+contains
+ function sum_loop_auto() result(sum)
+ integer :: i, j
+ integer :: sum, max_val
+
+ sum = 0
+ max_val = 0
+
+ !$acc parallel copyin (a) reduction(+:sum)
+ !$acc loop auto reduction(+:sum) reduction(max:max_val) ! { dg-optimized "assigned OpenACC gang worker loop parallelism" }
+ ! { dg-optimized ".auto. loop can be parallel" "" { target *-*-* } .-1 }
+ do i = 1,size (a, 1)
+ !$acc loop auto reduction(max:max_val) ! { dg-optimized "assigned OpenACC vector loop parallelism" }
+ ! { dg-optimized ".auto. loop can be parallel" "" { target *-*-* } .-1 }
+ do j = 1,size(a, 2)
+ max_val = a(i,j) ! plain overwrite; lanes' values are combined by the max reduction
+ end do
+ sum = sum + max_val ! accumulate the per-row maximum
+ end do
+ !$acc end parallel
+ end function sum_loop_auto
+
+ function sum_kernels() result(sum)
+ integer :: i, j
+ integer :: sum, max_val
+
+ sum = 0
+ max_val = 0
+
+ !$acc kernels
+ ! { dg-optimized {'map\(force_tofrom:max_val [^)]+\)' optimized to 'map\(to:max_val [^)]+\)'} "" { target *-*-* } .-1 }
+ !$acc loop reduction(+:sum) reduction(max:max_val) ! { dg-optimized "assigned OpenACC gang worker loop parallelism" }
+ ! { dg-optimized ".auto. loop can be parallel" "" { target *-*-* } .-1 }
+ ! { dg-optimized "forwarded loop nest in OpenACC .kernels. construct to .Graphite." "" { target *-*-* } .-2 }
+ do i = 1,size (a, 1)
+ !$acc loop reduction(max:max_val) ! { dg-optimized "assigned OpenACC vector loop parallelism" }
+ ! { dg-optimized ".auto. loop can be parallel" "" { target *-*-* } .-1 }
+ do j = 1,size(a, 2)
+ max_val = a(i,j) ! plain overwrite; lanes' values are combined by the max reduction
+ end do
+ sum = sum + max_val ! accumulate the per-row maximum
+ end do
+ !$acc end kernels
+ end function sum_kernels
+end module test
+
+program main
+ use test
+
+ implicit none
+
+ integer :: result, i, j
+
+ ! We sum the maxima of n rows, each containing numbers
+ ! 1..n
+ integer, parameter :: expected_sum = n * n
+
+ do i = 1, size (a, 1) ! { dg-optimized "loop nest optimized" }
+ do j = 1, size (a, 2)
+ a(i, j) = j ! each row is 1..n, so every row maximum is n
+ end do
+ end do
+
+
+ result = sum_loop_auto()
+ if (result /= expected_sum) then
+ write (*, *) "Wrong result:", result
+ call abort() ! GNU extension
+ endif
+
+ result = sum_kernels()
+ if (result /= expected_sum) then
+ write (*, *) "Wrong result:", result
+ call abort() ! GNU extension
+ endif
+end program main
+
+! This ensures that the dg-optimized assertions above hold for both the host
+! and the offload compiler: the output goes to stderr as well as to the dump files.
+! { dg-final { scan-offload-tree-dump-times "optimized: assigned OpenACC .*? parallelism" 4 "oaccloops1" } }
+! { dg-final { scan-tree-dump-times "optimized: assigned OpenACC .*? parallelism" 4 "oaccloops1" } }