diff mbox

[gomp4,committed] Use marked_independent in oacc kernels region

Message ID 55A4C48C.8000002@mentor.com
State New
Headers show

Commit Message

Tom de Vries July 14, 2015, 8:13 a.m. UTC
Hi,

This patch uses the marked_independent field to skip the dependence
analysis in parloops for loops marked independent in oacc kernels regions.

Bootstrapped and reg-tested on x86_64.

Committed to gomp-4_0-branch.

Thanks,
- Tom
diff mbox

Patch

Use marked_independent in oacc kernels region

2015-07-14  Tom de Vries  <tom@codesourcery.com>

	* tree-parloops.c (parallelize_loops): Use marked_independent flag in
	oacc kernels region.

	* c-c++-common/goacc/kernels-independent.c: New test.

	* testsuite/libgomp.oacc-c-c++-common/kernels-independent.c: New test.
---
 .../c-c++-common/goacc/kernels-independent.c       | 40 +++++++++++++++++++
 gcc/tree-parloops.c                                | 21 ++++++++--
 .../kernels-independent.c                          | 45 ++++++++++++++++++++++
 3 files changed, 103 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-independent.c
 create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-independent.c

diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-independent.c b/gcc/testsuite/c-c++-common/goacc/kernels-independent.c
new file mode 100644
index 0000000..2f086b6
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-independent.c
@@ -0,0 +1,40 @@ 
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+void
+foo (unsigned int *a,  unsigned int *b,  unsigned int *c)
+{
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    a[i] = i * 2;
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    b[i] = i * 4;
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+  {
+    #pragma acc loop independent
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+}
+
+/* Check that exactly one loop is reported as parallelizable because it is marked independent, and that no loop fails.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized, marked independent" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*\\._omp_fn\\.0" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)pragma omp target oacc_parallel.*num_gangs\\(32\\)" 1 "parloops_oacc_kernels" } } */
diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c
index f27dfa9..149c336 100644
--- a/gcc/tree-parloops.c
+++ b/gcc/tree-parloops.c
@@ -2797,9 +2797,24 @@  parallelize_loops (bool oacc_kernels_p)
       if (!try_create_reduction_list (loop, &reduction_list, oacc_kernels_p))
 	continue;
 
-      if (!flag_loop_parallelize_all
-	  && !loop_parallel_p (loop, &parloop_obstack))
-	continue;
+      if (!flag_loop_parallelize_all)
+	{
+	  bool independent = (oacc_kernels_p
+			      && loop->marked_independent);
+
+	  if (independent)
+	    {
+	      if (dump_file
+		  && (dump_flags & TDF_DETAILS))
+		fprintf (dump_file,
+			 "  SUCCESS: may be parallelized, marked independent\n");
+	    }
+	  else
+	    independent = loop_parallel_p (loop, &parloop_obstack);
+
+	  if (!independent)
+	    continue;
+	}
 
       changed = true;
       if (dump_file && (dump_flags & TDF_DETAILS))
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-independent.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-independent.c
new file mode 100644
index 0000000..d169a5f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-independent.c
@@ -0,0 +1,45 @@ 
+/* { dg-do run } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+void __attribute__((noinline,noclone))
+foo (unsigned int *a,  unsigned int *b,  unsigned int *c)
+{
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    a[i] = i * 2;
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    b[i] = i * 4;
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+  {
+    #pragma acc loop independent
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+}
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+  foo (a, b, c);
+
+  return 0;
+}
-- 
1.9.1