commit c4edb6e748c86c2bc5251707f61d4d37679194cf
Author: Julian Brown <julian@codesourcery.com>
Date: Thu Jun 4 07:16:56 2015 -0700
Add a set of OpenACC worker-single/worker-partitioned mode tests.
new file mode 100644
@@ -0,0 +1,30 @@
+#include <assert.h>
+
+/* Test worker-partitioned/vector-single mode. */
+
+int
+main (int argc, char *argv[])
+{
+ int arr[32 * 8], i; /* One slot per (j, k) pair: 32 gang-loop by 8 worker-loop iterations. */
+ /* Start from zero so that adding the flat index once leaves arr[i] == i. */
+ for (i = 0; i < 32 * 8; i++)
+ arr[i] = 0;
+ /* Gang loop over j with a worker loop over k nested directly inside it. */
+ #pragma acc parallel copy(arr) num_gangs(8) num_workers(8) vector_length(32)
+ {
+ int j;
+ #pragma acc loop gang
+ for (j = 0; j < 32; j++)
+ {
+ int k;
+ #pragma acc loop worker
+ for (k = 0; k < 8; k++)
+ arr[j * 8 + k] += j * 8 + k; /* j * 8 + k is distinct for every (j, k). */
+ }
+ }
+ /* Every element must now hold its own index. */
+ for (i = 0; i < 32 * 8; i++)
+ assert (arr[i] == i);
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,44 @@
+#include <assert.h>
+
+/* Test condition in worker-partitioned mode. */
+
+int
+main (int argc, char *argv[])
+{
+ int arr[32 * 32 * 8], i; /* Indexed below as j * 32 * 8 + k * 32 + m. */
+ /* Seed each element with its own index. */
+ for (i = 0; i < 32 * 32 * 8; i++)
+ arr[i] = i;
+
+ #pragma acc parallel copy(arr) num_gangs(8) num_workers(8) vector_length(32)
+ {
+ int j;
+ #pragma acc loop gang
+ for (j = 0; j < 32; j++)
+ {
+ int k;
+ #pragma acc loop worker
+ for (k = 0; k < 8; k++)
+ {
+ int m;
+ if ((k % 2) == 0) /* Depends only on the worker-loop index k. */
+ {
+ #pragma acc loop vector
+ for (m = 0; m < 32; m++)
+ arr[j * 32 * 8 + k * 32 + m]++;
+ }
+ else
+ {
+ #pragma acc loop vector
+ for (m = 0; m < 32; m++)
+ arr[j * 32 * 8 + k * 32 + m] += 2;
+ }
+ }
+ }
+ }
+ /* i / 32 is j * 8 + k, so (i / 32) % 2 is k % 2: even k added 1, odd k added 2. */
+ for (i = 0; i < 32 * 32 * 8; i++)
+ assert (arr[i] == i + ((i / 32) % 2) + 1);
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,54 @@
+#include <assert.h>
+
+/* Test switch in worker-partitioned mode. */
+
+int
+main (int argc, char *argv[])
+{
+ int arr[32 * 32 * 8], i; /* Indexed below as j * 32 * 8 + k * 32 + m. */
+ /* Seed each element with its own index. */
+ for (i = 0; i < 32 * 32 * 8; i++)
+ arr[i] = i;
+
+ #pragma acc parallel copy(arr) num_gangs(8) num_workers(8) vector_length(32)
+ {
+ int j;
+ #pragma acc loop gang
+ for (j = 0; j < 32; j++)
+ {
+ int k;
+ #pragma acc loop worker
+ for (k = 0; k < 8; k++)
+ {
+ int m;
+ switch ((j * 32 + k) % 3) /* Selector does not depend on the vector-loop index m. */
+ {
+ case 0:
+ #pragma acc loop vector
+ for (m = 0; m < 32; m++)
+ arr[j * 32 * 8 + k * 32 + m]++;
+ break;
+
+ case 1:
+ #pragma acc loop vector
+ for (m = 0; m < 32; m++)
+ arr[j * 32 * 8 + k * 32 + m] += 2;
+ break;
+
+ case 2:
+ #pragma acc loop vector
+ for (m = 0; m < 32; m++)
+ arr[j * 32 * 8 + k * 32 + m] += 3;
+ break;
+
+ default: ; /* Not reachable: j and k are non-negative, so the selector is 0, 1 or 2. */
+ }
+ }
+ }
+ }
+ /* i / 32 is j * 8 + k, which is congruent to the selector j * 32 + k modulo 3 (they differ by 24 * j). */
+ for (i = 0; i < 32 * 32 * 8; i++)
+ assert (arr[i] == i + ((i / 32) % 3) + 1);
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,54 @@
+#include <assert.h>
+
+/* Test worker-single/worker-partitioned transitions. */
+
+int
+main (int argc, char *argv[])
+{
+ int n[32], arr[32 * 32], i; /* n: one counter per j; arr: one slot per (j, k). */
+ /* Both arrays start at zero so the final values count the increments. */
+ for (i = 0; i < 32 * 32; i++)
+ arr[i] = 0;
+
+ for (i = 0; i < 32; i++)
+ n[i] = 0;
+
+ #pragma acc parallel copy(n, arr) num_gangs(8) num_workers(16) \
+ vector_length(32)
+ {
+ int j;
+ #pragma acc loop gang
+ for (j = 0; j < 32; j++)
+ {
+ int k;
+ /* The n[j]++ statements sit between the worker loops, not inside them. */
+ n[j]++;
+
+ #pragma acc loop worker
+ for (k = 0; k < 32; k++)
+ arr[j * 32 + k]++;
+
+ n[j]++;
+
+ #pragma acc loop worker
+ for (k = 0; k < 32; k++)
+ arr[j * 32 + k]++;
+
+ n[j]++;
+
+ #pragma acc loop worker
+ for (k = 0; k < 32; k++)
+ arr[j * 32 + k]++;
+
+ n[j]++;
+ }
+ }
+ /* Four n[j]++ statements per j, each expected to take effect exactly once. */
+ for (i = 0; i < 32; i++)
+ assert (n[i] == 4);
+ /* Three worker loops, each expected to increment every element once. */
+ for (i = 0; i < 32 * 32; i++)
+ assert (arr[i] == 3);
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,47 @@
+/* { dg-xfail-run-if "TODO" { openacc_nvidia_accel_selected } { "*" } { "" } } */
+
+#include <assert.h>
+
+/* Test correct synchronisation between worker-partitioned loops. */
+
+int
+main (int argc, char *argv[])
+{
+ int arr_a[32 * 32], arr_b[32 * 32], i;
+ int num_workers, num_gangs;
+ /* Repeat the test for every power-of-two worker and gang count from 1 to 32. */
+ for (num_workers = 1; num_workers <= 32; num_workers <<= 1)
+ for (num_gangs = 1; num_gangs <= 32; num_gangs <<= 1)
+ {
+ for (i = 0; i < 32 * 32; i++)
+ arr_a[i] = i;
+
+ #pragma acc parallel copyin(arr_a) copyout(arr_b) num_gangs(num_gangs) \
+ num_workers(num_workers) vector_length(32)
+ {
+ int j;
+ #pragma acc loop gang
+ for (j = 0; j < 32; j++)
+ {
+ int k;
+ /* Each loop reads, at index k, what the previous loop stored from iteration 31 - k of the same row j. */
+ #pragma acc loop worker
+ for (k = 0; k < 32; k++)
+ arr_b[j * 32 + (31 - k)] = arr_a[j * 32 + k] * 2;
+
+ #pragma acc loop worker
+ for (k = 0; k < 32; k++)
+ arr_a[j * 32 + (31 - k)] = arr_b[j * 32 + k] * 2;
+
+ #pragma acc loop worker
+ for (k = 0; k < 32; k++)
+ arr_b[j * 32 + (31 - k)] = arr_a[j * 32 + k] * 2;
+ }
+ }
+ /* Three mirrored copies, each doubling: i ^ 31 flips the low five bits (the mirrored position in the row), times 8. */
+ for (i = 0; i < 32 * 32; i++)
+ assert (arr_b[i] == (i ^ 31) * 8);
+ }
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,47 @@
+/* { dg-xfail-run-if "TODO" { openacc_nvidia_accel_selected } { "*" } { "" } } */
+
+#include <assert.h>
+
+/* Test correct synchronisation between worker+vector-partitioned loops. */
+
+int
+main (int argc, char *argv[])
+{
+ int arr_a[32 * 32 * 32], arr_b[32 * 32 * 32], i;
+ int num_workers, num_gangs;
+ /* Repeat the test for every power-of-two worker and gang count from 1 to 32. */
+ for (num_workers = 1; num_workers <= 32; num_workers <<= 1)
+ for (num_gangs = 1; num_gangs <= 32; num_gangs <<= 1)
+ {
+ for (i = 0; i < 32 * 32 * 32; i++)
+ arr_a[i] = i;
+
+ #pragma acc parallel copyin(arr_a) copyout(arr_b) num_gangs(num_gangs) \
+ num_workers(num_workers) vector_length(32)
+ {
+ int j;
+ #pragma acc loop gang
+ for (j = 0; j < 32; j++)
+ {
+ int k;
+ /* Each loop reads, at index k, what the previous loop stored from iteration 1023 - k of the same block j. */
+ #pragma acc loop worker vector
+ for (k = 0; k < 32 * 32; k++)
+ arr_b[j * 32 * 32 + (1023 - k)] = arr_a[j * 32 * 32 + k] * 2;
+
+ #pragma acc loop worker vector
+ for (k = 0; k < 32 * 32; k++)
+ arr_a[j * 32 * 32 + (1023 - k)] = arr_b[j * 32 * 32 + k] * 2;
+
+ #pragma acc loop worker vector
+ for (k = 0; k < 32 * 32; k++)
+ arr_b[j * 32 * 32 + (1023 - k)] = arr_a[j * 32 * 32 + k] * 2;
+ }
+ }
+ /* Three mirrored copies, each doubling: i ^ 1023 flips the low ten bits (the mirrored position in the block), times 8. */
+ for (i = 0; i < 32 * 32 * 32; i++)
+ assert (arr_b[i] == (i ^ 1023) * 8);
+ }
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,90 @@
+#include <assert.h>
+
+/* Test correct synchronisation between vector-partitioned loops in
+ worker-partitioned mode. */
+
+int
+main (int argc, char *argv[])
+{
+ int n[32 * 32], arr_a[32 * 32 * 32], arr_b[32 * 32 * 32], i;
+ int num_workers, num_gangs;
+ /* Repeat the test for every power-of-two worker and gang count from 1 to 32. */
+ for (num_workers = 1; num_workers <= 32; num_workers <<= 1)
+ for (num_gangs = 1; num_gangs <= 32; num_gangs <<= 1)
+ {
+ for (i = 0; i < 32 * 32 * 32; i++)
+ arr_a[i] = i;
+
+ for (i = 0; i < 32 * 32; i++)
+ n[i] = 0;
+
+ #pragma acc parallel copy (n) copyin(arr_a) copyout(arr_b) \
+ num_gangs(num_gangs) num_workers(num_workers) vector_length(32)
+ {
+ int j;
+ #pragma acc loop gang
+ for (j = 0; j < 32; j++)
+ {
+ int k;
+
+ #pragma acc loop worker
+ for (k = 0; k < 32; k++)
+ {
+ int m;
+ /* First of two increments placed outside the vector loops. */
+ n[j * 32 + k]++;
+
+ #pragma acc loop vector
+ for (m = 0; m < 32; m++)
+ {
+ if (((j * 1024 + k * 32 + m) % 2) == 0)
+ arr_b[j * 1024 + k * 32 + (31 - m)]
+ = arr_a[j * 1024 + k * 32 + m] * 2;
+ else
+ arr_b[j * 1024 + k * 32 + (31 - m)]
+ = arr_a[j * 1024 + k * 32 + m] * 3;
+ }
+
+ /* Test returning to vector-single mode... */
+ n[j * 32 + k]++;
+
+ #pragma acc loop vector
+ for (m = 0; m < 32; m++)
+ {
+ if (((j * 1024 + k * 32 + m) % 3) == 0)
+ arr_a[j * 1024 + k * 32 + (31 - m)]
+ = arr_b[j * 1024 + k * 32 + m] * 5;
+ else
+ arr_a[j * 1024 + k * 32 + (31 - m)]
+ = arr_b[j * 1024 + k * 32 + m] * 7;
+ }
+
+ /* ...and back-to-back vector loops. */
+
+ #pragma acc loop vector
+ for (m = 0; m < 32; m++)
+ {
+ if (((j * 1024 + k * 32 + m) % 2) == 0)
+ arr_b[j * 1024 + k * 32 + (31 - m)]
+ = arr_a[j * 1024 + k * 32 + m] * 3;
+ else
+ arr_b[j * 1024 + k * 32 + (31 - m)]
+ = arr_a[j * 1024 + k * 32 + m] * 2;
+ }
+ }
+ }
+ }
+ /* Two n[...]++ statements per (j, k), each expected to take effect exactly once. */
+ for (i = 0; i < 32 * 32; i++)
+ assert (n[i] == 2);
+ /* The first and third loops apply 2 and 3 in opposite parity order, so together always 6; the second loop's 5-or-7 ends up keyed on i % 3; i ^ 31 is the mirrored source index. */
+ for (i = 0; i < 32 * 32 * 32; i++)
+ {
+ int m = 6 * ((i % 3) == 0 ? 5 : 7);
+ assert (arr_b[i] == (i ^ 31) * m);
+ }
+ }
+
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,25 @@
+#include <assert.h>
+
+/* Test worker-single/vector-single mode. */
+
+int
+main (int argc, char *argv[])
+{
+ int arr[32], i; /* One slot per gang-loop iteration. */
+ /* Start from zero so the final value counts the increments. */
+ for (i = 0; i < 32; i++)
+ arr[i] = 0;
+ /* Only a gang loop: the body contains no worker or vector loop directive. */
+ #pragma acc parallel copy(arr) num_gangs(8) num_workers(8) vector_length(32)
+ {
+ int j;
+ #pragma acc loop gang
+ for (j = 0; j < 32; j++)
+ arr[j]++;
+ }
+ /* Each element is expected to have been incremented exactly once. */
+ for (i = 0; i < 32; i++)
+ assert (arr[i] == 1);
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,28 @@
+#include <assert.h>
+
+/* Test worker-single/vector-single mode with an atomic update. */
+
+int
+main (int argc, char *argv[])
+{
+ int arr[32], i; /* One slot per gang-loop iteration. */
+ /* Start from zero so the final value counts the increments. */
+ for (i = 0; i < 32; i++)
+ arr[i] = 0;
+ /* Only a gang loop; the increment inside it is wrapped in an atomic construct. */
+ #pragma acc parallel copy(arr) num_gangs(8) num_workers(8) vector_length(32)
+ {
+ int j;
+ #pragma acc loop gang
+ for (j = 0; j < 32; j++)
+ {
+ #pragma acc atomic
+ arr[j]++;
+ }
+ }
+ /* Each element is expected to have been incremented exactly once. */
+ for (i = 0; i < 32; i++)
+ assert (arr[i] == 1);
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,28 @@
+#include <assert.h>
+
+/* Test condition in worker-single/vector-single mode. */
+
+int
+main (int argc, char *argv[])
+{
+ int arr[32], i; /* One slot per gang-loop iteration. */
+ /* Seed each element with its own index; the branch below tests its parity. */
+ for (i = 0; i < 32; i++)
+ arr[i] = i;
+ /* Only a gang loop: odd elements gain 1, even elements gain 2. */
+ #pragma acc parallel copy(arr) num_gangs(8) num_workers(8) vector_length(32)
+ {
+ int j;
+ #pragma acc loop gang
+ for (j = 0; j < 32; j++)
+ if ((arr[j] % 2) != 0)
+ arr[j]++;
+ else
+ arr[j] += 2;
+ }
+ /* The whole ?: is parenthesised: == binds tighter than ?:, so otherwise the assert would test the always-nonzero i + 1 or i + 2. */
+ for (i = 0; i < 32; i++)
+ assert (arr[i] == (((i % 2) != 0) ? i + 1 : i + 2));
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,33 @@
+#include <assert.h>
+
+/* Test switch in worker-single/vector-single mode. */
+
+int
+main (int argc, char *argv[])
+{
+ int arr[32], i; /* One slot per gang-loop iteration. */
+ /* Seed each element with its own index; the switch below selects on it. */
+ for (i = 0; i < 32; i++)
+ arr[i] = i;
+ /* Only a gang loop: each element gains (its value % 5) + 1. */
+ #pragma acc parallel copy(arr) num_gangs(8) num_workers(8) vector_length(32)
+ {
+ int j;
+ #pragma acc loop gang
+ for (j = 0; j < 32; j++)
+ switch (arr[j] % 5)
+ {
+ case 0: arr[j] += 1; break;
+ case 1: arr[j] += 2; break;
+ case 2: arr[j] += 3; break;
+ case 3: arr[j] += 4; break;
+ case 4: arr[j] += 5; break;
+ default: arr[j] += 99; /* Not reachable for the non-negative seed values; would fail the check below. */
+ }
+ }
+ /* arr[i] was i on entry, so the selected case added (i % 5) + 1. */
+ for (i = 0; i < 32; i++)
+ assert (arr[i] == i + (i % 5) + 1);
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,33 @@
+#include <assert.h>
+
+/* Test worker-single/vector-partitioned mode. */
+
+int
+main (int argc, char *argv[])
+{
+ int arr[32 * 32], i; /* One slot per (j, k) pair, indexed as j * 32 + k. */
+ /* Seed each element with its own index. */
+ for (i = 0; i < 32 * 32; i++)
+ arr[i] = i;
+ /* Gang loop over j with a vector loop over k directly inside it (no worker loop). */
+ #pragma acc parallel copy(arr) num_gangs(8) num_workers(8) vector_length(32)
+ {
+ int j;
+ #pragma acc loop gang
+ for (j = 0; j < 32; j++)
+ {
+ int k;
+ #pragma acc loop vector
+ for (k = 0; k < 32; k++)
+ {
+ #pragma acc atomic
+ arr[j * 32 + k]++;
+ }
+ }
+ }
+ /* Each element is expected to have been incremented exactly once. */
+ for (i = 0; i < 32 * 32; i++)
+ assert (arr[i] == i + 1);
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,49 @@
+#include <assert.h>
+
+/* Test multiple conditional vector-partitioned loops in worker-single
+ mode. */
+
+int
+main (int argc, char *argv[])
+{
+ int arr[32 * 32], i; /* One slot per (j, k) pair, indexed as j * 32 + k. */
+ /* Seed each element with its own index. */
+ for (i = 0; i < 32 * 32; i++)
+ arr[i] = i;
+
+ #pragma acc parallel copy(arr) num_gangs(8) num_workers(8) vector_length(32)
+ {
+ int j;
+ #pragma acc loop gang
+ for (j = 0; j < 32; j++)
+ {
+ int k;
+ if ((j % 3) == 0) /* The condition depends only on the gang-loop index j. */
+ {
+ #pragma acc loop vector
+ for (k = 0; k < 32; k++)
+ {
+ #pragma acc atomic
+ arr[j * 32 + k] += 3;
+ }
+ }
+ else if ((j % 3) == 1) /* No final else: rows with j % 3 == 2 are left untouched. */
+ {
+ #pragma acc loop vector
+ for (k = 0; k < 32; k++)
+ {
+ #pragma acc atomic
+ arr[j * 32 + k] += 7;
+ }
+ }
+ }
+ }
+ /* i / 32 recovers the row j, so the expected delta is 3, 7 or 0 according to j % 3. */
+ for (i = 0; i < 32 * 32; i++)
+ {
+ int j = (i / 32) % 3;
+ assert (arr[i] == i + ((j == 0) ? 3 : (j == 1) ? 7 : 0));
+ }
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,46 @@
+#include <assert.h>
+
+#if defined(ACC_DEVICE_TYPE_host) || defined(ACC_DEVICE_TYPE_host_nonshm)
+#define ACTUAL_GANGS 1 /* NOTE(review): presumably host execution runs the region as a single gang -- confirm. */
+#else
+#define ACTUAL_GANGS 8 /* Used below both as the num_gangs argument and in the expected counts. */
+#endif
+
+/* Test worker-single, vector-partitioned, gang-redundant mode. */
+
+int
+main (int argc, char *argv[])
+{
+ int n, arr[32], i; /* n counts the statements outside the loop; arr counts loop-body executions. */
+ /* Both start at zero so the final values count the increments. */
+ for (i = 0; i < 32; i++)
+ arr[i] = 0;
+
+ n = 0;
+
+ #pragma acc parallel copy(n, arr) num_gangs(ACTUAL_GANGS) num_workers(8) \
+ vector_length(32)
+ {
+ int j;
+ /* Not inside any loop directive; the final check expects each n++ to take effect ACTUAL_GANGS times. */
+ #pragma acc atomic
+ n++;
+ /* The loop has a vector clause only, with no gang clause. */
+ #pragma acc loop vector
+ for (j = 0; j < 32; j++)
+ {
+ #pragma acc atomic
+ arr[j] += 1;
+ }
+
+ #pragma acc atomic
+ n++;
+ }
+ /* Two n++ statements, each expected once per gang. */
+ assert (n == ACTUAL_GANGS * 2);
+ /* Every element is expected to be incremented once per gang. */
+ for (i = 0; i < 32; i++)
+ assert (arr[i] == ACTUAL_GANGS);
+
+ return 0;
+}